8 Most Popular Python HTML Web Scraping Packages with Benchmarks
Intro
This blog post compares Python web scraping packages in terms of their speed and ease of use, and shares my personal observations. It won't cover what web scraping is or how parsers work.
By the end of this blog post, you'll understand or have a better understanding of which parser/browser automation to use for your purpose.
This blog post is split into two parts:
- Python HTML parsers and their benchmarks.
- Python browser automation packages and their benchmarks.
The first section will be about speed comparison and ease of use. The second section will be specifically about Python's browser automation packages, memory consumption, and speed.
📌Please note that I'm not very advanced in benchmarking, feel free to comment on what feels wrong or confusing.
Used Hardware
Component | Specs |
RAM | 16GB |
CPU | Intel 12th Gen i7-12700H, 14 cores, 4.7 GHz |
Python Parsers Benchmarks
This section will compare 5 Python HTML parsers:
html.parser
html5lib
lxml
parsel
selectolax
(HTML, Lexbor backends)
There are four tests, each run against a locally saved Google organic search results HTML file:
- 100 iterations with 10 repeats.
- 500 iterations with 10 repeats.
- 1000 iterations with 10 repeats.
- 3000 iterations with 5 repeats.
Combined Chart Results
100 Iteration Results
Raw Data
json
[
{
"bs4_html_parser":{
"raw":[
4.3710737,
3.8684199,
3.8580279,
3.8806571,
3.7997756,
3.8502996,
3.7788926,
3.840815,
3.8426696,
3.8693173
],
"min":3.78,
"max":4.37,
"mean":3.9,
"standard_deviation":0.17
},
"bs4_lxml_parser":{
"raw":[
3.4638336,
3.4571164,
3.4321829,
3.3861049,
3.3953615,
3.3972832,
3.4319847,
3.4485809,
3.4614765,
3.4190135
],
"min":3.39,
"max":3.46,
"mean":3.43,
"standard_deviation":0.03
},
"bs4_html5lib_parser":{
"raw":[
9.2249861,
9.203607,
9.1739324,
9.1611666,
9.2419121,
9.2673369,
9.3951571,
9.346496,
9.3287497,
9.3132658
],
"min":9.16,
"max":9.4,
"mean":9.27,
"standard_deviation":0.08
},
"parsel_parser":{
"raw":[
2.679202,
2.6082995,
2.6354441,
2.6149692,
2.5950211,
2.5903689,
2.6174196,
2.6494348,
2.5917189,
2.6739687
],
"min":2.59,
"max":2.68,
"mean":2.63,
"standard_deviation":0.03
},
"lxml_parser":{
"raw":[
0.8436189,
0.8283767,
0.8420748,
0.8255317,
0.827338,
0.8243309,
0.8134264,
0.8111648,
0.8125786,
0.8133089
],
"min":0.81,
"max":0.84,
"mean":0.82,
"standard_deviation":0.01
},
"selectolax_html_parser":{
"raw":[
0.18461,
0.1845551,
0.18227,
0.1875291,
0.1873954,
0.1860093,
0.19331,
0.1936247,
0.1898201,
0.1888448
],
"min":0.18,
"max":0.19,
"mean":0.19,
"standard_deviation":0.0
},
"selectolax_lexbor_parser":{
"raw":[
0.1461264,
0.1471886,
0.1477886,
0.148007,
0.1501829,
0.144999,
0.1466642,
0.1459928,
0.148036,
0.1476213
],
"min":0.14,
"max":0.15,
"mean":0.15,
"standard_deviation":0.0
}
}
]
500 Iteration Results
Raw Data
json
[
{
"bs4_html_parser":{
"raw":[
20.2197304,
19.8741745,
19.2251858,
19.3863994,
19.5261737,
19.3143987,
19.295948,
19.5619137,
19.3965105,
19.5783358
],
"min":19.23,
"max":20.22,
"mean":19.54,
"standard_deviation":0.3
},
"bs4_lxml_parser":{
"raw":[
17.4766226,
17.6523627,
17.3481226,
17.5603945,
17.6367459,
17.5745645,
17.5120047,
17.7037503,
17.0720811,
17.1215897
],
"min":17.07,
"max":17.7,
"mean":17.47,
"standard_deviation":0.22
},
"bs4_html5lib_parser":{
"raw":[
46.0190557,
46.1886938,
46.9830729,
46.2576209,
46.2123261,
46.0697103,
46.0612127,
46.1187136,
48.670253,
47.7965513
],
"min":46.02,
"max":48.67,
"mean":46.64,
"standard_deviation":0.91
},
"parsel_parser":{
"raw":[
13.4378014,
13.3847556,
13.5100824,
13.3942492,
13.4126319,
13.3633435,
13.4016935,
13.4611343,
13.3859031,
13.37833
],
"min":13.36,
"max":13.51,
"mean":13.41,
"standard_deviation":0.04
},
"lxml_parser":{
"raw":[
4.3305542,
4.3557282,
4.2665298,
4.3775716,
4.379404,
4.3783643,
4.3159682,
4.3932252,
4.43205,
4.391286
],
"min":4.27,
"max":4.43,
"mean":4.36,
"standard_deviation":0.05
},
"selectolax_html_parser":{
"raw":[
0.9595209,
0.9527344,
0.9630006,
0.9670192,
0.9511656,
0.9588291,
0.9580593,
0.9678788,
0.940953,
0.9192121
],
"min":0.92,
"max":0.97,
"mean":0.95,
"standard_deviation":0.01
},
"selectolax_lexbor_parser":{
"raw":[
0.7347563,
0.7880385,
0.7712946,
0.803693,
0.7808561,
0.7661754,
0.7927354,
0.8141454,
0.7798704,
0.7893399
],
"min":0.73,
"max":0.81,
"mean":0.78,
"standard_deviation":0.02
}
}
]
1000 Iteration Results
Raw Data
json
[
{
"bs4_html_parser":{
"raw":[
39.1850317,
38.8625512,
38.9220413,
38.52645,
39.5012472,
39.0812714,
40.2874018,
39.1607705,
38.9987914,
38.9538794
],
"min":38.53,
"max":40.29,
"mean":39.15,
"standard_deviation":0.47
},
"bs4_lxml_parser":{
"raw":[
35.340024,
35.5665838,
35.4370601,
35.4310116,
35.1316697,
35.1764227,
35.5666647,
35.2332891,
35.0854779,
35.7218053
],
"min":35.09,
"max":35.72,
"mean":35.37,
"standard_deviation":0.21
},
"bs4_html5lib_parser":{
"raw":[
96.2746173,
95.1716108,
94.9976969,
95.376245,
95.5432712,
96.4209762,
94.7473696,
95.6874917,
95.34009,
95.1788754
],
"min":94.75,
"max":96.42,
"mean":95.47,
"standard_deviation":0.53
},
"parsel_parser":{
"raw":[
27.1312845,
26.8943964,
32.5753347,
47.7647626,
40.4320701,
27.203478,
27.4574984,
26.7016569,
26.305777,
28.7976675
],
"min":26.31,
"max":47.76,
"mean":31.13,
"standard_deviation":7.28
},
"lxml_parser":{
"raw":[
8.8578888,
8.8009302,
9.0597153,
8.7252489,
8.7669332,
8.7292014,
8.8213478,
8.7004821,
8.6868534,
8.7972479
],
"min":8.69,
"max":9.06,
"mean":8.79,
"standard_deviation":0.11
},
"selectolax_html_parser":{
"raw":[
1.9501049,
2.0294157,
2.0303961,
2.0856484,
2.0366627,
2.0423066,
2.0580174,
2.0388072,
2.0790543,
2.0126756
],
"min":1.95,
"max":2.09,
"mean":2.04,
"standard_deviation":0.04
},
"selectolax_lexbor_parser":{
"raw":[
1.6442625,
1.6344478,
1.6930229,
1.6476543,
1.6561128,
1.6467175,
1.6648112,
1.6683134,
1.6984335,
1.6524308
],
"min":1.63,
"max":1.7,
"mean":1.66,
"standard_deviation":0.02
}
}
]
3000 Iteration Results
Raw Data
json
[
{
"bs4_html_parser":{
"raw":[
136.3143269,
113.9956494,
114.0446546,
121.5711856,
171.5756146
],
"min":114.0,
"max":171.58,
"mean":131.5,
"standard_deviation":24.18
},
"bs4_lxml_parser":{
"raw":[
217.9561803,
216.2607809,
192.2252402,
143.3660566,
123.4686671
],
"min":123.47,
"max":217.96,
"mean":178.66,
"standard_deviation":43.11
},
"bs4_html5lib_parser":{
"raw":[
316.4003634,
285.8371232,
284.3877244,
287.4475911,
285.5628814
],
"min":284.39,
"max":316.4,
"mean":291.93,
"standard_deviation":13.72
},
"parsel_parser":{
"raw":[
77.7046641,
78.2331405,
78.0829456,
78.017651,
78.879304
],
"min":77.7,
"max":78.88,
"mean":78.18,
"standard_deviation":0.43
},
"lxml_parser":{
"raw":[
24.8936843,
24.9198819,
24.8717849,
24.8783482,
25.0041617
],
"min":24.87,
"max":25.0,
"mean":24.91,
"standard_deviation":0.05
},
"selectolax_html_parser":{
"raw":[
5.466414,
5.4846724,
5.4738816,
5.4023584,
5.4395063
],
"min":5.4,
"max":5.48,
"mean":5.45,
"standard_deviation":0.03
},
"selectolax_lexbor_parser":{
"raw":[
4.6869728,
4.7313916,
4.9446686,
4.7700584,
4.8052036
],
"min":4.69,
"max":4.94,
"mean":4.79,
"standard_deviation":0.1
}
}
]
Personal Investigations
- html5lib is the slowest of all the mentioned parsers.
- BeautifulSoup's html.parser and lxml parsers differ only slightly in their results. Interestingly, in the 3000-iteration test the bs4 lxml backend was slower. I didn't understand why.
- It's interesting that the lxml parser is used by both bs4 and parsel, yet parsel outperforms bs4 with the lxml backend.
- The selectolax HTML and Lexbor parsers are incredibly fast. Besides speed, I've noticed (not measured in this blog post) that the Lexbor parser used almost half the memory of the other parsers.
- lxml was the hardest to use, as it requires XPath knowledge. This point is subjective but worth noting: writing XPath takes some time and some additional knowledge. If you have written XPath before, there will be no issues.
- selectolax is, in terms of syntax, very similar to both bs4 and parsel, which is handy. One thing to keep in mind is that selectolax currently doesn't support XPath.
Python Parsers Benchmark Code
import timeit
import lxml # installing from wheel if on Python 3.11 https://stackoverflow.com/a/74546098/15164646
import json
from lxml import html
from bs4 import BeautifulSoup
from parsel import Selector
from selectolax.parser import HTMLParser
from selectolax.lexbor import LexborHTMLParser
import pandas as pd
import numpy as np
def get_html():
    """Return the text of the locally saved Google search results page.

    Reads ``google-search-test.html`` (relative to the current working
    directory) as UTF-8 and returns its full contents as a string.
    """
    with open('google-search-test.html', mode='r', encoding='utf-8') as saved_page:
        return saved_page.read()
def bs4_html_parser(html_page):
    """Benchmark workload: BeautifulSoup with the ``html.parser`` backend.

    Parses ``html_page`` and runs CSS selectors over the organic results,
    related searches, videos and knowledge-graph rows. The extracted values
    are only bound to locals; nothing is returned.
    """
    document = BeautifulSoup(html_page, 'html.parser')

    # Organic results.
    for organic in document.select('.tF2Cxc'):
        title = organic.select_one('.DKV0Md').text
        snippet = organic.select_one('.lyLwlc').text
        link = organic.select_one('.yuRUbf a')['href']
        displayed_link = organic.select_one('.tjvcx').text

    # Related searches.
    for related in document.select('.s8bAkb'):
        related_title = related.select_one('.AB4Wff').text
        related_link = related['href']

    # Videos.
    for video_node in document.select('.dFd2Tb'):
        video_title = video_node.select_one('.DKV0Md').text
        video_length = video_node.select_one('.R4Cuhd .J1mWY div')['aria-label']

    # Knowledge graph rows; rows without a value element are skipped.
    for row in document.select('.rVusze'):
        if not row.select_one('.kno-fv'):
            continue
        row_key = row.select_one('.w8qArf .fl').text
        row_value = row.select_one('.kno-fv').text
        row_value_link = row.select_one('.kno-fv a')['href']
def bs4_lxml_parser(html_page):
    """Benchmark workload: BeautifulSoup with the ``lxml`` backend.

    Parses ``html_page`` and runs CSS selectors over the organic results,
    related searches, videos and knowledge-graph rows. The extracted values
    are only bound to locals; nothing is returned.
    """
    document = BeautifulSoup(html_page, 'lxml')

    # Organic results.
    for organic in document.select('.tF2Cxc'):
        title = organic.select_one('.DKV0Md').text
        snippet = organic.select_one('.lyLwlc').text
        link = organic.select_one('.yuRUbf a')['href']
        displayed_link = organic.select_one('.tjvcx').text

    # Related searches.
    for related in document.select('.s8bAkb'):
        related_title = related.select_one('.AB4Wff').text
        related_link = related['href']

    # Videos.
    for video_node in document.select('.dFd2Tb'):
        video_title = video_node.select_one('.DKV0Md').text
        video_length = video_node.select_one('.R4Cuhd .J1mWY div')['aria-label']

    # Knowledge graph rows; rows without a value element are skipped.
    for row in document.select('.rVusze'):
        if not row.select_one('.kno-fv'):
            continue
        row_key = row.select_one('.w8qArf .fl').text
        row_value = row.select_one('.kno-fv').text
        row_value_link = row.select_one('.kno-fv a')['href']
def bs4_html5lib_parser(html_page):
    """Benchmark workload: BeautifulSoup with the ``html5lib`` backend.

    Parses ``html_page`` and runs CSS selectors over the organic results,
    related searches, videos and knowledge-graph rows. The extracted values
    are only bound to locals; nothing is returned.
    """
    document = BeautifulSoup(html_page, 'html5lib')

    # Organic results.
    for organic in document.select('.tF2Cxc'):
        title = organic.select_one('.DKV0Md').text
        snippet = organic.select_one('.lyLwlc').text
        link = organic.select_one('.yuRUbf a')['href']
        displayed_link = organic.select_one('.tjvcx').text

    # Related searches.
    for related in document.select('.s8bAkb'):
        related_title = related.select_one('.AB4Wff').text
        related_link = related['href']

    # Videos.
    for video_node in document.select('.dFd2Tb'):
        video_title = video_node.select_one('.DKV0Md').text
        video_length = video_node.select_one('.R4Cuhd .J1mWY div')['aria-label']

    # Knowledge graph rows; rows without a value element are skipped.
    for row in document.select('.rVusze'):
        if not row.select_one('.kno-fv'):
            continue
        row_key = row.select_one('.w8qArf .fl').text
        row_value = row.select_one('.kno-fv').text
        row_value_link = row.select_one('.kno-fv a')['href']
def parsel_parser(html_page):
    """Benchmark workload: parsel ``Selector`` with CSS queries.

    Parses ``html_page`` and extracts the same fields as the other parser
    workloads (organic results, related searches, videos, knowledge-graph
    rows). The extracted values are only bound to locals; nothing is
    returned.
    """
    selector = Selector(text=html_page)

    # Organic results.
    for result in selector.css('.tF2Cxc'):
        title = result.css('.DKV0Md::text').get()
        snippet = result.css('.lyLwlc::text').get()
        link = result.css('.yuRUbf a::attr(href)').get()
        displayed_link = result.css('.tjvcx::text').get()

    # Related searches.
    for related_search in selector.css('.s8bAkb'):
        related_title = related_search.css('.AB4Wff::text').get()
        related_link = related_search.attrib['href']

    # Videos.
    for video in selector.css('.dFd2Tb'):
        video_title = video.css('.DKV0Md::text').get()
        # `.get()` was missing here, which left a SelectorList instead of the
        # attribute value and skipped the extraction step that every other
        # field (and every other parser workload) performs.
        video_length = video.css('.R4Cuhd .J1mWY div::attr(aria-label)').get()

    # Knowledge graph rows; rows without a value element are skipped.
    for knowledge_graph in selector.css(".rVusze"):
        if knowledge_graph.css(".kno-fv").get():
            knowledge_graph_key = knowledge_graph.css('.w8qArf .fl::text').get()
            knowledge_graph_value = knowledge_graph.css('.kno-fv::text').get()
            knowledge_graph_value_link = knowledge_graph.css('.kno-fv a::attr(href)').get()
def lxml_parser(html_page):
    """Benchmark workload: plain ``lxml.html`` with XPath queries.

    Parses ``html_page`` and evaluates XPath expressions for the organic
    results, related searches, videos and knowledge-graph data. The extracted
    values are only bound to locals; nothing is returned.
    """
    root = lxml.html.fromstring(html_page)

    # Organic results. The snippet is kept as the raw XPath result list,
    # while the other fields take the first match.
    for organic in root.xpath('//div[contains(@class, "tF2Cxc")]'):
        title_nodes = organic.xpath('.//h3[contains(@class, "DKV0Md")]/text()')
        title = title_nodes[0]
        snippet = organic.xpath('.//div[contains(@class, "lyLwlc")]/text()')
        link_nodes = organic.xpath('.//div[contains(@class, "yuRUbf")]/a/@href')
        link = link_nodes[0]
        cite_nodes = organic.xpath('.//cite[contains(normalize-space(@class), "tjvcx")]')
        displayed_link = cite_nodes[0].text_content()

    # Related searches.
    for related in root.xpath('//a[contains(@class, "s8bAkb")]'):
        label_nodes = related.xpath('.//div[contains(@class, "OhScic")]')
        related_title = label_nodes[0].text_content()
        related_link = related.xpath('.//@href')[0]

    # Videos.
    for video_node in root.xpath('//div[contains(@class, "dFd2Tb")]'):
        video_title = video_node.xpath('.//h3[contains(@class, "DKV0Md")]/text()')[0]
        video_length = video_node.xpath('.//div[contains(@class, "J1mWY")]/div/@aria-label')[0]

    # Knowledge graph: each query yields a list covering the whole document.
    knowledge_graph_key = root.xpath('//span[contains(@class, "w8qArf")]/a[contains(@class, "fl")]/text()')
    knowledge_graph_value = root.xpath('//span[contains(@class, "kno-fv")]/a/text()')
    knowledge_graph_value_link = root.xpath('//span[contains(@class, "kno-fv")]/a/@href')
def selectolax_html_parser(html_page):
    """Benchmark workload: selectolax ``HTMLParser`` with CSS queries.

    Parses ``html_page`` and runs CSS selectors over the organic results,
    related searches, videos and knowledge-graph rows. The extracted values
    are only bound to locals; nothing is returned.
    """
    document = HTMLParser(html_page)

    # Organic results.
    for organic in document.css('.tF2Cxc'):
        title = organic.css_first('.DKV0Md').text()
        snippet = organic.css_first('.lyLwlc').text()
        link = organic.css_first('.yuRUbf a').attrs['href']
        displayed_link = organic.css_first('.tjvcx').text()

    # Related searches.
    for related in document.css('.s8bAkb'):
        related_title = related.css_first('.AB4Wff').text()
        related_link = related.attrs['href']

    # Videos.
    for video_node in document.css('.dFd2Tb'):
        video_title = video_node.css_first('.DKV0Md').text()
        video_length = video_node.css_first('.R4Cuhd .J1mWY div').attrs['aria-label']

    # Knowledge graph rows; rows without a value element are skipped.
    for row in document.css('.rVusze'):
        if not row.css_first('.kno-fv'):
            continue
        row_key = row.css_first('.w8qArf .fl').text()
        row_value = row.css_first('.kno-fv').text()
        row_value_link = row.css_first('.kno-fv a').attrs['href']
def selectolax_lexbor_parser(html_page):
    """Benchmark workload: selectolax ``LexborHTMLParser`` with CSS queries.

    Parses ``html_page`` and runs CSS selectors over the organic results,
    related searches, videos and knowledge-graph rows. The extracted values
    are only bound to locals; nothing is returned.
    """
    document = LexborHTMLParser(html_page)

    # Organic results.
    for organic in document.css('.tF2Cxc'):
        title = organic.css_first('.DKV0Md').text()
        snippet = organic.css_first('.lyLwlc').text()
        link = organic.css_first('.yuRUbf a').attrs['href']
        displayed_link = organic.css_first('.tjvcx').text()

    # Related searches.
    for related in document.css('.s8bAkb'):
        related_title = related.css_first('.AB4Wff').text()
        related_link = related.attrs['href']

    # Videos.
    for video_node in document.css('.dFd2Tb'):
        video_title = video_node.css_first('.DKV0Md').text()
        video_length = video_node.css_first('.R4Cuhd .J1mWY div').attrs['aria-label']

    # Knowledge graph rows; rows without a value element are skipped.
    for row in document.css('.rVusze'):
        if not row.css_first('.kno-fv'):
            continue
        row_key = row.css_first('.w8qArf .fl').text()
        row_value = row.css_first('.kno-fv').text()
        row_value_link = row.css_first('.kno-fv a').attrs['href']
if __name__ == '__main__':
    # Load the test page once, outside the timed statement, so file I/O is
    # not part of the measurement.
    html_page = get_html()

    # One result list per parser workload. The keys are the function names
    # that timeit imports from __main__ below.
    parsers_benchmark_data = {
        'bs4_html_parser': [],
        'bs4_lxml_parser': [],
        'bs4_html5lib_parser': [],
        'parsel_parser': [],
        'lxml_parser': [],
        'selectolax_html_parser': [],
        'selectolax_lexbor_parser': []
    }

    iterations = 1000  # calls per timing sample
    repeat = 10        # timing samples per parser

    for parser_name in parsers_benchmark_data:
        # Each element of `benchmark_data` is the total time in seconds for
        # `iterations` calls of the parser function.
        benchmark_data = timeit.repeat(
            stmt=f'{parser_name}(html_page)',
            setup=f'from __main__ import {parser_name}, html_page',
            number=iterations,
            repeat=repeat
        )

        parsers_benchmark_data[parser_name].append({
            'raw': benchmark_data,
            'min': round(min(benchmark_data), 2),
            'max': round(max(benchmark_data), 2),
            'mean': round(sum(benchmark_data) / len(benchmark_data), 2),
            # ddof=1 gives the sample standard deviation.
            'standard_deviation': round(np.std(benchmark_data, ddof=1), 2),
        })

    print(json.dumps(parsers_benchmark_data, indent=2))

    # The file name is derived from `iterations` so that runs with different
    # iteration counts do not overwrite each other. The previous hard-coded
    # name said "browser-automation" and "10-iterations" regardless of the
    # configured values.
    output_file = f'parsers-benchmark-data-{iterations}-iterations.json'
    pd.DataFrame(data=parsers_benchmark_data).to_json(output_file, orient='records')
Python Browser Automation
The test was fairly basic and was meant to measure memory consumption and speed. Each script makes a request to Google Search and paginates through 10 pages.
Browser Automation Profiling Results
Tests were run with the following command:
$ mprof run <file_name>.py
$ mprof plot # to plot the results (matplotlib is required)
Personal Investigations
- Surprisingly, selenium was a little faster than requests-html. Or maybe not surprisingly :)
- playwright was the slowest.
- selenium had a hard time with complex CSS selectors, for example .d6cvqb a[id=pnnext] (.get_attribute() also didn't work). That's why for in range() was used instead. I know it can be done with XPath, but I wanted to use CSS selectors only, although requests-html uses only XPath.
- Memory consumption was pretty much identical between the 3 packages.
Browser Automation Code
from playwright.sync_api import sync_playwright
from selenium import webdriver
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.common.keys import Keys
from selenium.webdriver.common.by import By
from requests_html import HTMLSession
from memory_profiler import memory_usage
from memory_profiler import profile
@profile
def playwright_parser():
    """Scrape paginated Google Search results with Playwright (Chromium).

    Loads result pages in a headless browser and runs CSS queries for the
    organic results, related searches, videos and knowledge-graph rows until
    no "next page" link is found. The extracted values are only bound to
    locals; nothing is returned. The browser and the Playwright driver are
    always shut down, even if scraping raises.
    """
    # https://stackoverflow.com/questions/73043207/how-to-start-playwright-outside-with-without-context-managers
    playwright = sync_playwright().start()
    browser = None
    try:
        chromium = playwright.chromium
        browser = chromium.launch(headless=True, timeout=0)
        page = browser.new_page()

        # NOTE(review): `start` grows by 1 per iteration; confirm whether
        # Google expects a result offset in steps of 10 here.
        page_num = 0

        while True:
            page.goto(f'https://www.google.com/search?q=minecraft&gl=us&hl=en&start={page_num}')

            for result in page.query_selector_all('.tF2Cxc'):
                title = result.query_selector('.DKV0Md').inner_text()
                # The snippet is optional: check for a missing element
                # explicitly instead of swallowing every exception.
                snippet_node = result.query_selector('.lyLwlc')
                snippet = snippet_node.inner_text() if snippet_node is not None else None
                link = result.query_selector('.yuRUbf a').get_attribute('href')
                displayed_link = result.query_selector('.tjvcx').inner_text()

            for related_search in page.query_selector_all('.s8bAkb'):
                related_title = related_search.query_selector('.AB4Wff').inner_text()
                related_link = related_search.get_attribute('href')

            for video in page.query_selector_all('.dFd2Tb'):
                video_title = video.query_selector('.DKV0Md').inner_text()
                video_length = video.query_selector('.R4Cuhd .J1mWY div').get_attribute('aria-label')

            for knowledge_graph in page.query_selector_all(".rVusze"):
                if knowledge_graph.query_selector(".kno-fv"):
                    knowledge_graph_key = knowledge_graph.query_selector('.w8qArf .fl').inner_text()
                    knowledge_graph_value = knowledge_graph.query_selector('.kno-fv').inner_text()
                    knowledge_graph_value_link = knowledge_graph.query_selector('.kno-fv a').get_attribute('href')

            # Stop once the page no longer has a "next" pagination link.
            if page.query_selector('.d6cvqb a[id=pnnext]'):
                page_num += 1
            else:
                break
    finally:
        # Release the browser process and the driver on error paths as well.
        if browser is not None:
            browser.close()
        playwright.stop()
@profile
def selenium_parser():
    """Scrape paginated Google Search results with Selenium (headless Chrome).

    Requests the search URL for ``start`` values 0 through 10 and runs CSS
    queries for the organic results, related searches, videos and
    knowledge-graph rows. The extracted values are only bound to locals;
    nothing is returned. The WebDriver session is always ended, even if
    scraping raises.
    """
    # Raw string: '\c' is not a valid escape sequence in a normal literal.
    service = Service(executable_path=r'<full_path_to>\chromedriver.exe')

    options = webdriver.ChromeOptions()
    options.add_argument('--headless')
    options.add_experimental_option('excludeSwitches', ['enable-logging'])

    driver = webdriver.Chrome(service=service, options=options)
    try:
        # NOTE(review): range(0, 11) issues 11 requests with start=0..10;
        # confirm against the intended "10 pages" and Google's offset step.
        for page_num in range(0, 11):
            driver.get(f'https://www.google.com/search?q=minecraft&gl=us&hl=en&start={page_num}')

            for result in driver.find_elements(By.CSS_SELECTOR, '.tF2Cxc'):
                title = result.find_element(By.CSS_SELECTOR, '.DKV0Md').text
                # The snippet is optional: find_elements returns an empty
                # list when absent, so no exception needs to be swallowed.
                snippet_nodes = result.find_elements(By.CSS_SELECTOR, '.lyLwlc')
                snippet = snippet_nodes[0].text if snippet_nodes else None
                link = result.find_element(By.CSS_SELECTOR, '.yuRUbf a').get_attribute('href')
                displayed_link = result.find_element(By.CSS_SELECTOR, '.tjvcx').text

            for related_search in driver.find_elements(By.CSS_SELECTOR, '.s8bAkb'):
                related_title = related_search.find_element(By.CSS_SELECTOR, '.AB4Wff').text
                related_link = related_search.get_attribute('href')

            for video in driver.find_elements(By.CSS_SELECTOR, '.dFd2Tb'):
                video_title = video.find_element(By.CSS_SELECTOR, '.DKV0Md').text
                video_length = video.find_element(By.CSS_SELECTOR, '.R4Cuhd .J1mWY div').get_attribute('aria-label')

            for knowledge_graph in driver.find_elements(By.CSS_SELECTOR, '.rVusze'):
                # find_element raises when nothing matches, so it cannot act
                # as an existence test; find_elements gives an empty list.
                if not knowledge_graph.find_elements(By.CSS_SELECTOR, '.kno-fv'):
                    continue
                knowledge_graph_key = knowledge_graph.find_element(By.CSS_SELECTOR, '.w8qArf .fl').text
                knowledge_graph_value = knowledge_graph.find_element(By.CSS_SELECTOR, '.kno-fv').text
                knowledge_graph_value_link = knowledge_graph.find_element(By.CSS_SELECTOR, '.kno-fv a').get_attribute('href')
    finally:
        # quit() ends the whole session including the chromedriver process;
        # close() only closes the current window.
        driver.quit()
@profile
def requests_html_parser():
    """Scrape paginated Google Search results with requests-html and XPath.

    Fetches result pages over one HTTP session and evaluates XPath
    expressions for the organic results, related searches, videos and
    knowledge-graph data until the pagination check fails. The extracted
    values are only bound to locals; nothing is returned. The session is
    closed when the function exits, including on error.
    """
    # Context manager so the session is closed on every exit path.
    with HTMLSession() as session:
        # NOTE(review): `start` grows by 1 per iteration; confirm whether
        # Google expects a result offset in steps of 10 here.
        page_num = 0

        while True:
            soup = session.get(f'https://www.google.com/search?q=minecraft&gl=us&hl=en&start={page_num}')

            for result in soup.html.xpath('//div[contains(@class, "tF2Cxc")]', first=False):
                title = result.xpath('.//h3[contains(@class, "DKV0Md")]/text()', first=True)
                snippet = result.xpath('.//div[contains(@class, "lyLwlc")]/text()', first=True)
                link = result.xpath('.//div[contains(@class, "yuRUbf")]/a/@href', first=True)
                displayed_link = result.xpath('.//cite[contains(normalize-space(@class), "tjvcx")]', first=True).text

            for related_search in soup.html.xpath('//a[contains(@class, "s8bAkb")]', first=False):
                related_title = related_search.xpath('.//div[contains(@class, "OhScic")]', first=True).text
                related_link = related_search.xpath('.//@href', first=True)

            for video in soup.html.xpath('//div[contains(@class, "dFd2Tb")]', first=False):
                video_title = video.xpath('.//h3[contains(@class, "DKV0Md")]/text()', first=True)
                video_length = video.xpath('.//div[contains(@class, "J1mWY")]/div/@aria-label', first=True)

            # Knowledge graph: each query yields a list for the whole page.
            knowledge_graph_key = soup.html.xpath('//span[contains(@class, "w8qArf")]/a[contains(@class, "fl")]/text()', first=False)
            knowledge_graph_value = soup.html.xpath('//span[contains(@class, "kno-fv")]/a/text()', first=False)
            knowledge_graph_value_link = soup.html.xpath('//span[contains(@class, "kno-fv")]/a/@href', first=False)

            # NOTE(review): this matches any link id inside the pagination
            # cell, not specifically the "next" link; verify it terminates
            # on the last page.
            if soup.html.xpath('//td[contains(@class, "d6cvqb")]/a/@id', first=True):
                page_num += 1
            else:
                break
if __name__ == '__main__':
    # Run the three profiled scrapers one after another, in this order.
    for run_scraper in (playwright_parser, requests_html_parser, selenium_parser):
        run_scraper()
Conclusions
- If you need to scrape data from a dynamic page that doesn't require clicking, scrolling and similar things but still requires rendering JavaScript, try requests-html. It uses pure XPath, as lxml does, and should be faster than the other two browser automations.
- If you need to do complex page manipulation on a dynamic page, try playwright or selenium.
- If you're scraping non-dynamic pages (not rendered via JavaScript), try selectolax over bs4, lxml or parsel. It's a lot faster, uses less memory, and has almost identical syntax to parsel or bs4. A hidden gem, I would say.
- If you need to use XPath in your parser, try either lxml or parsel. parsel is built on top of lxml, translates every CSS query to XPath, and can combine (chain) CSS and XPath queries. However, lxml is faster.