Karim shoair commited on
Commit ·
fcedcce
1
Parent(s): f300870
chore: migrating to ruff and updating pre-commit hooks
Browse files- .flake8 +0 -3
- .pre-commit-config.yaml +9 -8
- benchmarks.py +36 -24
- cleanup.py +8 -8
- ruff.toml +22 -0
- scrapling/__init__.py +19 -11
- scrapling/cli.py +27 -7
- scrapling/core/_types.py +16 -3
- scrapling/core/custom_types.py +122 -55
- scrapling/core/mixins.py +20 -16
- scrapling/core/storage_adaptors.py +11 -7
- scrapling/core/translator.py +1 -2
- scrapling/core/utils.py +44 -25
- scrapling/defaults.py +20 -8
- scrapling/engines/__init__.py +1 -1
- scrapling/engines/camo.py +125 -59
- scrapling/engines/constants.py +84 -87
- scrapling/engines/pw.py +169 -100
- scrapling/engines/static.py +57 -25
- scrapling/engines/toolbelt/__init__.py +16 -6
- scrapling/engines/toolbelt/custom.py +167 -95
- scrapling/engines/toolbelt/fingerprints.py +13 -13
- scrapling/engines/toolbelt/navigation.py +29 -14
- scrapling/fetchers.py +329 -83
- scrapling/parser.py +453 -180
- setup.py +10 -11
- tests/fetchers/async/test_camoufox.py +58 -46
- tests/fetchers/async/test_httpx.py +92 -51
- tests/fetchers/async/test_playwright.py +37 -27
- tests/fetchers/sync/test_camoufox.py +33 -13
- tests/fetchers/sync/test_httpx.py +79 -42
- tests/fetchers/sync/test_playwright.py +33 -21
- tests/fetchers/test_utils.py +105 -64
- tests/parser/test_automatch.py +22 -22
- tests/parser/test_general.py +61 -49
.flake8
DELETED
|
@@ -1,3 +0,0 @@
|
|
| 1 |
-
[flake8]
|
| 2 |
-
ignore = E501, F401
|
| 3 |
-
exclude = .git,.venv,__pycache__,docs,.github,build,dist,tests,benchmarks.py
|
|
|
|
|
|
|
|
|
|
|
|
.pre-commit-config.yaml
CHANGED
|
@@ -1,17 +1,18 @@
|
|
| 1 |
repos:
|
| 2 |
- repo: https://github.com/PyCQA/bandit
|
| 3 |
-
rev: 1.8.
|
| 4 |
hooks:
|
| 5 |
- id: bandit
|
| 6 |
args: [-r, -c, .bandit.yml]
|
| 7 |
-
- repo: https://github.com/
|
| 8 |
-
|
|
|
|
| 9 |
hooks:
|
| 10 |
-
|
| 11 |
-
-
|
| 12 |
-
|
| 13 |
-
|
| 14 |
-
|
| 15 |
- repo: https://github.com/netromdk/vermin
|
| 16 |
rev: v1.6.0
|
| 17 |
hooks:
|
|
|
|
| 1 |
repos:
|
| 2 |
- repo: https://github.com/PyCQA/bandit
|
| 3 |
+
rev: 1.8.3
|
| 4 |
hooks:
|
| 5 |
- id: bandit
|
| 6 |
args: [-r, -c, .bandit.yml]
|
| 7 |
+
- repo: https://github.com/astral-sh/ruff-pre-commit
|
| 8 |
+
# Ruff version.
|
| 9 |
+
rev: v0.11.5
|
| 10 |
hooks:
|
| 11 |
+
# Run the linter.
|
| 12 |
+
- id: ruff
|
| 13 |
+
args: [ --fix ]
|
| 14 |
+
# Run the formatter.
|
| 15 |
+
- id: ruff-format
|
| 16 |
- repo: https://github.com/netromdk/vermin
|
| 17 |
rev: v1.6.0
|
| 18 |
hooks:
|
benchmarks.py
CHANGED
|
@@ -14,19 +14,27 @@ from selectolax.parser import HTMLParser
|
|
| 14 |
|
| 15 |
from scrapling import Adaptor
|
| 16 |
|
| 17 |
-
large_html =
|
|
|
|
|
|
|
| 18 |
|
| 19 |
|
| 20 |
def benchmark(func):
|
| 21 |
@functools.wraps(func)
|
| 22 |
def wrapper(*args, **kwargs):
|
| 23 |
-
benchmark_name = func.__name__.replace(
|
| 24 |
print(f"-> {benchmark_name}", end=" ", flush=True)
|
| 25 |
# Warm-up phase
|
| 26 |
-
timeit.repeat(
|
|
|
|
|
|
|
| 27 |
# Measure time (1 run, repeat 100 times, take average)
|
| 28 |
times = timeit.repeat(
|
| 29 |
-
lambda: func(*args, **kwargs),
|
|
|
|
|
|
|
|
|
|
|
|
|
| 30 |
)
|
| 31 |
min_time = round(mean(times) * 1000, 2) # Convert to milliseconds
|
| 32 |
print(f"average execution time: {min_time} ms")
|
|
@@ -42,23 +50,24 @@ def test_lxml():
|
|
| 42 |
for e in etree.fromstring(
|
| 43 |
large_html,
|
| 44 |
# Scrapling and Parsel use the same parser inside so this is just to make it fair
|
| 45 |
-
parser=html.HTMLParser(recover=True, huge_tree=True)
|
| 46 |
-
).cssselect(
|
|
|
|
| 47 |
|
| 48 |
|
| 49 |
@benchmark
|
| 50 |
def test_bs4_lxml():
|
| 51 |
-
return [e.text for e in BeautifulSoup(large_html,
|
| 52 |
|
| 53 |
|
| 54 |
@benchmark
|
| 55 |
def test_bs4_html5lib():
|
| 56 |
-
return [e.text for e in BeautifulSoup(large_html,
|
| 57 |
|
| 58 |
|
| 59 |
@benchmark
|
| 60 |
def test_pyquery():
|
| 61 |
-
return [e.text() for e in pq(large_html)(
|
| 62 |
|
| 63 |
|
| 64 |
@benchmark
|
|
@@ -66,33 +75,33 @@ def test_scrapling():
|
|
| 66 |
# No need to do `.extract()` like parsel to extract text
|
| 67 |
# Also, this is faster than `[t.text for t in Adaptor(large_html, auto_match=False).css('.item')]`
|
| 68 |
# for obvious reasons, of course.
|
| 69 |
-
return Adaptor(large_html, auto_match=False).css(
|
| 70 |
|
| 71 |
|
| 72 |
@benchmark
|
| 73 |
def test_parsel():
|
| 74 |
-
return Selector(text=large_html).css(
|
| 75 |
|
| 76 |
|
| 77 |
@benchmark
|
| 78 |
def test_mechanicalsoup():
|
| 79 |
browser = StatefulBrowser()
|
| 80 |
browser.open_fake_page(large_html)
|
| 81 |
-
return [e.text for e in browser.page.select(
|
| 82 |
|
| 83 |
|
| 84 |
@benchmark
|
| 85 |
def test_selectolax():
|
| 86 |
-
return [node.text() for node in HTMLParser(large_html).css(
|
| 87 |
|
| 88 |
|
| 89 |
def display(results):
|
| 90 |
# Sort and display results
|
| 91 |
sorted_results = sorted(results.items(), key=lambda x: x[1]) # Sort by time
|
| 92 |
-
scrapling_time = results[
|
| 93 |
print("\nRanked Results (fastest to slowest):")
|
| 94 |
print(f" i. {'Library tested':<18} | {'avg. time (ms)':<15} | vs Scrapling")
|
| 95 |
-
print(
|
| 96 |
for i, (test_name, test_time) in enumerate(sorted_results, 1):
|
| 97 |
compare = round(test_time / scrapling_time, 3)
|
| 98 |
print(f" {i}. {test_name:<18} | {str(test_time):<15} | {compare}")
|
|
@@ -102,25 +111,28 @@ def display(results):
|
|
| 102 |
def test_scrapling_text(request_html):
|
| 103 |
# Will loop over resulted elements to get text too to make comparison even more fair otherwise Scrapling will be even faster
|
| 104 |
return [
|
| 105 |
-
element.text
|
| 106 |
-
|
| 107 |
-
|
|
|
|
| 108 |
]
|
| 109 |
|
| 110 |
|
| 111 |
@benchmark
|
| 112 |
def test_autoscraper(request_html):
|
| 113 |
# autoscraper by default returns elements text
|
| 114 |
-
return AutoScraper().build(html=request_html, wanted_list=[
|
| 115 |
|
| 116 |
|
| 117 |
if __name__ == "__main__":
|
| 118 |
-
print(
|
|
|
|
|
|
|
| 119 |
results1 = {
|
| 120 |
"Raw Lxml": test_lxml(),
|
| 121 |
"Parsel/Scrapy": test_parsel(),
|
| 122 |
"Scrapling": test_scrapling(),
|
| 123 |
-
|
| 124 |
"PyQuery": test_pyquery(),
|
| 125 |
"BS4 with Lxml": test_bs4_lxml(),
|
| 126 |
"MechanicalSoup": test_mechanicalsoup(),
|
|
@@ -128,10 +140,10 @@ if __name__ == "__main__":
|
|
| 128 |
}
|
| 129 |
|
| 130 |
display(results1)
|
| 131 |
-
print(
|
| 132 |
-
req = requests.get(
|
| 133 |
print(
|
| 134 |
-
|
| 135 |
)
|
| 136 |
results2 = {
|
| 137 |
"Scrapling": test_scrapling_text(req.text),
|
|
|
|
| 14 |
|
| 15 |
from scrapling import Adaptor
|
| 16 |
|
| 17 |
+
large_html = (
|
| 18 |
+
"<html><body>" + '<div class="item">' * 5000 + "</div>" * 5000 + "</body></html>"
|
| 19 |
+
)
|
| 20 |
|
| 21 |
|
| 22 |
def benchmark(func):
|
| 23 |
@functools.wraps(func)
|
| 24 |
def wrapper(*args, **kwargs):
|
| 25 |
+
benchmark_name = func.__name__.replace("test_", "").replace("_", " ")
|
| 26 |
print(f"-> {benchmark_name}", end=" ", flush=True)
|
| 27 |
# Warm-up phase
|
| 28 |
+
timeit.repeat(
|
| 29 |
+
lambda: func(*args, **kwargs), number=2, repeat=2, globals=globals()
|
| 30 |
+
)
|
| 31 |
# Measure time (1 run, repeat 100 times, take average)
|
| 32 |
times = timeit.repeat(
|
| 33 |
+
lambda: func(*args, **kwargs),
|
| 34 |
+
number=1,
|
| 35 |
+
repeat=100,
|
| 36 |
+
globals=globals(),
|
| 37 |
+
timer=time.process_time,
|
| 38 |
)
|
| 39 |
min_time = round(mean(times) * 1000, 2) # Convert to milliseconds
|
| 40 |
print(f"average execution time: {min_time} ms")
|
|
|
|
| 50 |
for e in etree.fromstring(
|
| 51 |
large_html,
|
| 52 |
# Scrapling and Parsel use the same parser inside so this is just to make it fair
|
| 53 |
+
parser=html.HTMLParser(recover=True, huge_tree=True),
|
| 54 |
+
).cssselect(".item")
|
| 55 |
+
]
|
| 56 |
|
| 57 |
|
| 58 |
@benchmark
|
| 59 |
def test_bs4_lxml():
|
| 60 |
+
return [e.text for e in BeautifulSoup(large_html, "lxml").select(".item")]
|
| 61 |
|
| 62 |
|
| 63 |
@benchmark
|
| 64 |
def test_bs4_html5lib():
|
| 65 |
+
return [e.text for e in BeautifulSoup(large_html, "html5lib").select(".item")]
|
| 66 |
|
| 67 |
|
| 68 |
@benchmark
|
| 69 |
def test_pyquery():
|
| 70 |
+
return [e.text() for e in pq(large_html)(".item").items()]
|
| 71 |
|
| 72 |
|
| 73 |
@benchmark
|
|
|
|
| 75 |
# No need to do `.extract()` like parsel to extract text
|
| 76 |
# Also, this is faster than `[t.text for t in Adaptor(large_html, auto_match=False).css('.item')]`
|
| 77 |
# for obvious reasons, of course.
|
| 78 |
+
return Adaptor(large_html, auto_match=False).css(".item::text")
|
| 79 |
|
| 80 |
|
| 81 |
@benchmark
|
| 82 |
def test_parsel():
|
| 83 |
+
return Selector(text=large_html).css(".item::text").extract()
|
| 84 |
|
| 85 |
|
| 86 |
@benchmark
|
| 87 |
def test_mechanicalsoup():
|
| 88 |
browser = StatefulBrowser()
|
| 89 |
browser.open_fake_page(large_html)
|
| 90 |
+
return [e.text for e in browser.page.select(".item")]
|
| 91 |
|
| 92 |
|
| 93 |
@benchmark
|
| 94 |
def test_selectolax():
|
| 95 |
+
return [node.text() for node in HTMLParser(large_html).css(".item")]
|
| 96 |
|
| 97 |
|
| 98 |
def display(results):
|
| 99 |
# Sort and display results
|
| 100 |
sorted_results = sorted(results.items(), key=lambda x: x[1]) # Sort by time
|
| 101 |
+
scrapling_time = results["Scrapling"]
|
| 102 |
print("\nRanked Results (fastest to slowest):")
|
| 103 |
print(f" i. {'Library tested':<18} | {'avg. time (ms)':<15} | vs Scrapling")
|
| 104 |
+
print("-" * 50)
|
| 105 |
for i, (test_name, test_time) in enumerate(sorted_results, 1):
|
| 106 |
compare = round(test_time / scrapling_time, 3)
|
| 107 |
print(f" {i}. {test_name:<18} | {str(test_time):<15} | {compare}")
|
|
|
|
| 111 |
def test_scrapling_text(request_html):
|
| 112 |
# Will loop over resulted elements to get text too to make comparison even more fair otherwise Scrapling will be even faster
|
| 113 |
return [
|
| 114 |
+
element.text
|
| 115 |
+
for element in Adaptor(request_html, auto_match=False)
|
| 116 |
+
.find_by_text("Tipping the Velvet", first_match=True)
|
| 117 |
+
.find_similar(ignore_attributes=["title"])
|
| 118 |
]
|
| 119 |
|
| 120 |
|
| 121 |
@benchmark
|
| 122 |
def test_autoscraper(request_html):
|
| 123 |
# autoscraper by default returns elements text
|
| 124 |
+
return AutoScraper().build(html=request_html, wanted_list=["Tipping the Velvet"])
|
| 125 |
|
| 126 |
|
| 127 |
if __name__ == "__main__":
|
| 128 |
+
print(
|
| 129 |
+
" Benchmark: Speed of parsing and retrieving the text content of 5000 nested elements \n"
|
| 130 |
+
)
|
| 131 |
results1 = {
|
| 132 |
"Raw Lxml": test_lxml(),
|
| 133 |
"Parsel/Scrapy": test_parsel(),
|
| 134 |
"Scrapling": test_scrapling(),
|
| 135 |
+
"Selectolax": test_selectolax(),
|
| 136 |
"PyQuery": test_pyquery(),
|
| 137 |
"BS4 with Lxml": test_bs4_lxml(),
|
| 138 |
"MechanicalSoup": test_mechanicalsoup(),
|
|
|
|
| 140 |
}
|
| 141 |
|
| 142 |
display(results1)
|
| 143 |
+
print("\n" + "=" * 25)
|
| 144 |
+
req = requests.get("https://books.toscrape.com/index.html")
|
| 145 |
print(
|
| 146 |
+
" Benchmark: Speed of searching for an element by text content, and retrieving the text of similar elements\n"
|
| 147 |
)
|
| 148 |
results2 = {
|
| 149 |
"Scrapling": test_scrapling_text(req.text),
|
cleanup.py
CHANGED
|
@@ -9,12 +9,12 @@ def clean():
|
|
| 9 |
|
| 10 |
# Directories and patterns to clean
|
| 11 |
cleanup_patterns = [
|
| 12 |
-
|
| 13 |
-
|
| 14 |
-
|
| 15 |
-
|
| 16 |
-
|
| 17 |
-
|
| 18 |
]
|
| 19 |
|
| 20 |
# Clean directories
|
|
@@ -30,7 +30,7 @@ def clean():
|
|
| 30 |
print(f"Could not remove {path}: {e}")
|
| 31 |
|
| 32 |
# Remove compiled Python files
|
| 33 |
-
for path in base_dir.rglob(
|
| 34 |
try:
|
| 35 |
path.unlink()
|
| 36 |
print(f"Removed compiled file: {path}")
|
|
@@ -38,5 +38,5 @@ def clean():
|
|
| 38 |
print(f"Could not remove {path}: {e}")
|
| 39 |
|
| 40 |
|
| 41 |
-
if __name__ ==
|
| 42 |
clean()
|
|
|
|
| 9 |
|
| 10 |
# Directories and patterns to clean
|
| 11 |
cleanup_patterns = [
|
| 12 |
+
"build",
|
| 13 |
+
"dist",
|
| 14 |
+
"*.egg-info",
|
| 15 |
+
"__pycache__",
|
| 16 |
+
".eggs",
|
| 17 |
+
".pytest_cache",
|
| 18 |
]
|
| 19 |
|
| 20 |
# Clean directories
|
|
|
|
| 30 |
print(f"Could not remove {path}: {e}")
|
| 31 |
|
| 32 |
# Remove compiled Python files
|
| 33 |
+
for path in base_dir.rglob("*.py[co]"):
|
| 34 |
try:
|
| 35 |
path.unlink()
|
| 36 |
print(f"Removed compiled file: {path}")
|
|
|
|
| 38 |
print(f"Could not remove {path}: {e}")
|
| 39 |
|
| 40 |
|
| 41 |
+
if __name__ == "__main__":
|
| 42 |
clean()
|
ruff.toml
ADDED
|
@@ -0,0 +1,22 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
exclude = [
|
| 2 |
+
".git",
|
| 3 |
+
".venv",
|
| 4 |
+
"__pycache__",
|
| 5 |
+
"docs",
|
| 6 |
+
".github",
|
| 7 |
+
"build",
|
| 8 |
+
"dist",
|
| 9 |
+
"tests",
|
| 10 |
+
"benchmarks.py",
|
| 11 |
+
]
|
| 12 |
+
|
| 13 |
+
# Assume Python 3.9
|
| 14 |
+
target-version = "py39"
|
| 15 |
+
|
| 16 |
+
[lint]
|
| 17 |
+
select = ["E", "F", "W"]
|
| 18 |
+
ignore = ["E501", "F401"]
|
| 19 |
+
|
| 20 |
+
[format]
|
| 21 |
+
# Like Black, use double quotes for strings.
|
| 22 |
+
quote-style = "double"
|
scrapling/__init__.py
CHANGED
|
@@ -1,4 +1,3 @@
|
|
| 1 |
-
|
| 2 |
__author__ = "Karim Shoair (karim.shoair@pm.me)"
|
| 3 |
__version__ = "0.2.99"
|
| 4 |
__copyright__ = "Copyright (c) 2024 Karim Shoair"
|
|
@@ -7,35 +6,44 @@ __copyright__ = "Copyright (c) 2024 Karim Shoair"
|
|
| 7 |
# A lightweight approach to create lazy loader for each import for backward compatibility
|
| 8 |
# This will reduces initial memory footprint significantly (only loads what's used)
|
| 9 |
def __getattr__(name):
|
| 10 |
-
if name ==
|
| 11 |
from scrapling.fetchers import Fetcher as cls
|
|
|
|
| 12 |
return cls
|
| 13 |
-
elif name ==
|
| 14 |
from scrapling.parser import Adaptor as cls
|
|
|
|
| 15 |
return cls
|
| 16 |
-
elif name ==
|
| 17 |
from scrapling.parser import Adaptors as cls
|
|
|
|
| 18 |
return cls
|
| 19 |
-
elif name ==
|
| 20 |
from scrapling.core.custom_types import AttributesHandler as cls
|
|
|
|
| 21 |
return cls
|
| 22 |
-
elif name ==
|
| 23 |
from scrapling.core.custom_types import TextHandler as cls
|
|
|
|
| 24 |
return cls
|
| 25 |
-
elif name ==
|
| 26 |
from scrapling.fetchers import AsyncFetcher as cls
|
|
|
|
| 27 |
return cls
|
| 28 |
-
elif name ==
|
| 29 |
from scrapling.fetchers import StealthyFetcher as cls
|
|
|
|
| 30 |
return cls
|
| 31 |
-
elif name ==
|
| 32 |
from scrapling.fetchers import PlayWrightFetcher as cls
|
|
|
|
| 33 |
return cls
|
| 34 |
-
elif name ==
|
| 35 |
from scrapling.fetchers import CustomFetcher as cls
|
|
|
|
| 36 |
return cls
|
| 37 |
else:
|
| 38 |
raise AttributeError(f"module 'scrapling' has no attribute '{name}'")
|
| 39 |
|
| 40 |
|
| 41 |
-
__all__ = [
|
|
|
|
|
|
|
| 1 |
__author__ = "Karim Shoair (karim.shoair@pm.me)"
|
| 2 |
__version__ = "0.2.99"
|
| 3 |
__copyright__ = "Copyright (c) 2024 Karim Shoair"
|
|
|
|
| 6 |
# A lightweight approach to create lazy loader for each import for backward compatibility
|
| 7 |
# This will reduces initial memory footprint significantly (only loads what's used)
|
| 8 |
def __getattr__(name):
|
| 9 |
+
if name == "Fetcher":
|
| 10 |
from scrapling.fetchers import Fetcher as cls
|
| 11 |
+
|
| 12 |
return cls
|
| 13 |
+
elif name == "Adaptor":
|
| 14 |
from scrapling.parser import Adaptor as cls
|
| 15 |
+
|
| 16 |
return cls
|
| 17 |
+
elif name == "Adaptors":
|
| 18 |
from scrapling.parser import Adaptors as cls
|
| 19 |
+
|
| 20 |
return cls
|
| 21 |
+
elif name == "AttributesHandler":
|
| 22 |
from scrapling.core.custom_types import AttributesHandler as cls
|
| 23 |
+
|
| 24 |
return cls
|
| 25 |
+
elif name == "TextHandler":
|
| 26 |
from scrapling.core.custom_types import TextHandler as cls
|
| 27 |
+
|
| 28 |
return cls
|
| 29 |
+
elif name == "AsyncFetcher":
|
| 30 |
from scrapling.fetchers import AsyncFetcher as cls
|
| 31 |
+
|
| 32 |
return cls
|
| 33 |
+
elif name == "StealthyFetcher":
|
| 34 |
from scrapling.fetchers import StealthyFetcher as cls
|
| 35 |
+
|
| 36 |
return cls
|
| 37 |
+
elif name == "PlayWrightFetcher":
|
| 38 |
from scrapling.fetchers import PlayWrightFetcher as cls
|
| 39 |
+
|
| 40 |
return cls
|
| 41 |
+
elif name == "CustomFetcher":
|
| 42 |
from scrapling.fetchers import CustomFetcher as cls
|
| 43 |
+
|
| 44 |
return cls
|
| 45 |
else:
|
| 46 |
raise AttributeError(f"module 'scrapling' has no attribute '{name}'")
|
| 47 |
|
| 48 |
|
| 49 |
+
__all__ = ["Adaptor", "Fetcher", "AsyncFetcher", "StealthyFetcher", "PlayWrightFetcher"]
|
scrapling/cli.py
CHANGED
|
@@ -12,21 +12,41 @@ def get_package_dir():
|
|
| 12 |
|
| 13 |
def run_command(command, line):
|
| 14 |
print(f"Installing {line}...")
|
| 15 |
-
_ = subprocess.check_call(
|
| 16 |
# I meant to not use try except here
|
| 17 |
|
| 18 |
|
| 19 |
@click.command(help="Install all Scrapling's Fetchers dependencies")
|
| 20 |
-
@click.option(
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 21 |
def install(force):
|
| 22 |
-
if
|
| 23 |
-
|
| 24 |
-
|
| 25 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 26 |
# if no errors raised by above commands, then we add below file
|
| 27 |
get_package_dir().joinpath(".scrapling_dependencies_installed").touch()
|
| 28 |
else:
|
| 29 |
-
print(
|
| 30 |
|
| 31 |
|
| 32 |
@click.group()
|
|
|
|
| 12 |
|
| 13 |
def run_command(command, line):
|
| 14 |
print(f"Installing {line}...")
|
| 15 |
+
_ = subprocess.check_call(" ".join(command), shell=True)
|
| 16 |
# I meant to not use try except here
|
| 17 |
|
| 18 |
|
| 19 |
@click.command(help="Install all Scrapling's Fetchers dependencies")
|
| 20 |
+
@click.option(
|
| 21 |
+
"-f",
|
| 22 |
+
"--force",
|
| 23 |
+
"force",
|
| 24 |
+
is_flag=True,
|
| 25 |
+
default=False,
|
| 26 |
+
type=bool,
|
| 27 |
+
help="Force Scrapling to reinstall all Fetchers dependencies",
|
| 28 |
+
)
|
| 29 |
def install(force):
|
| 30 |
+
if (
|
| 31 |
+
force
|
| 32 |
+
or not get_package_dir().joinpath(".scrapling_dependencies_installed").exists()
|
| 33 |
+
):
|
| 34 |
+
run_command(
|
| 35 |
+
[sys.executable, "-m", "playwright", "install", "chromium"],
|
| 36 |
+
"Playwright browsers",
|
| 37 |
+
)
|
| 38 |
+
run_command(
|
| 39 |
+
[sys.executable, "-m", "playwright", "install-deps", "chromium", "firefox"],
|
| 40 |
+
"Playwright dependencies",
|
| 41 |
+
)
|
| 42 |
+
run_command(
|
| 43 |
+
[sys.executable, "-m", "camoufox", "fetch", "--browserforge"],
|
| 44 |
+
"Camoufox browser and databases",
|
| 45 |
+
)
|
| 46 |
# if no errors raised by above commands, then we add below file
|
| 47 |
get_package_dir().joinpath(".scrapling_dependencies_installed").touch()
|
| 48 |
else:
|
| 49 |
+
print("The dependencies are already installed")
|
| 50 |
|
| 51 |
|
| 52 |
@click.group()
|
scrapling/core/_types.py
CHANGED
|
@@ -2,9 +2,22 @@
|
|
| 2 |
Type definitions for type checking purposes.
|
| 3 |
"""
|
| 4 |
|
| 5 |
-
from typing import (
|
| 6 |
-
|
| 7 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 8 |
|
| 9 |
SelectorWaitStates = Literal["attached", "detached", "hidden", "visible"]
|
| 10 |
|
|
|
|
| 2 |
Type definitions for type checking purposes.
|
| 3 |
"""
|
| 4 |
|
| 5 |
+
from typing import (
|
| 6 |
+
TYPE_CHECKING,
|
| 7 |
+
Any,
|
| 8 |
+
Callable,
|
| 9 |
+
Dict,
|
| 10 |
+
Generator,
|
| 11 |
+
Iterable,
|
| 12 |
+
List,
|
| 13 |
+
Literal,
|
| 14 |
+
Optional,
|
| 15 |
+
Pattern,
|
| 16 |
+
Tuple,
|
| 17 |
+
Type,
|
| 18 |
+
TypeVar,
|
| 19 |
+
Union,
|
| 20 |
+
)
|
| 21 |
|
| 22 |
SelectorWaitStates = Literal["attached", "detached", "hidden", "visible"]
|
| 23 |
|
scrapling/core/custom_types.py
CHANGED
|
@@ -6,16 +6,26 @@ from types import MappingProxyType
|
|
| 6 |
from orjson import dumps, loads
|
| 7 |
from w3lib.html import replace_entities as _replace_entities
|
| 8 |
|
| 9 |
-
from scrapling.core._types import (
|
| 10 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 11 |
from scrapling.core.utils import _is_iterable, flatten
|
| 12 |
|
| 13 |
# Define type variable for AttributeHandler value type
|
| 14 |
-
_TextHandlerType = TypeVar(
|
| 15 |
|
| 16 |
|
| 17 |
class TextHandler(str):
|
| 18 |
"""Extends standard Python string by adding more functionality"""
|
|
|
|
| 19 |
__slots__ = ()
|
| 20 |
|
| 21 |
def __new__(cls, string):
|
|
@@ -25,77 +35,89 @@ class TextHandler(str):
|
|
| 25 |
lst = super().__getitem__(key)
|
| 26 |
return typing.cast(_TextHandlerType, TextHandler(lst))
|
| 27 |
|
| 28 |
-
def split(self, sep: str = None, maxsplit: SupportsIndex = -1) ->
|
| 29 |
return TextHandlers(
|
| 30 |
-
typing.cast(
|
|
|
|
|
|
|
|
|
|
| 31 |
)
|
| 32 |
|
| 33 |
-
def strip(self, chars: str = None) -> Union[str,
|
| 34 |
return TextHandler(super().strip(chars))
|
| 35 |
|
| 36 |
-
def lstrip(self, chars: str = None) -> Union[str,
|
| 37 |
return TextHandler(super().lstrip(chars))
|
| 38 |
|
| 39 |
-
def rstrip(self, chars: str = None) -> Union[str,
|
| 40 |
return TextHandler(super().rstrip(chars))
|
| 41 |
|
| 42 |
-
def capitalize(self) -> Union[str,
|
| 43 |
return TextHandler(super().capitalize())
|
| 44 |
|
| 45 |
-
def casefold(self) -> Union[str,
|
| 46 |
return TextHandler(super().casefold())
|
| 47 |
|
| 48 |
-
def center(
|
|
|
|
|
|
|
| 49 |
return TextHandler(super().center(width, fillchar))
|
| 50 |
|
| 51 |
-
def expandtabs(self, tabsize: SupportsIndex = 8) -> Union[str,
|
| 52 |
return TextHandler(super().expandtabs(tabsize))
|
| 53 |
|
| 54 |
-
def format(self, *args: str, **kwargs: str) -> Union[str,
|
| 55 |
return TextHandler(super().format(*args, **kwargs))
|
| 56 |
|
| 57 |
-
def format_map(self, mapping) -> Union[str,
|
| 58 |
return TextHandler(super().format_map(mapping))
|
| 59 |
|
| 60 |
-
def join(self, iterable: Iterable[str]) -> Union[str,
|
| 61 |
return TextHandler(super().join(iterable))
|
| 62 |
|
| 63 |
-
def ljust(
|
|
|
|
|
|
|
| 64 |
return TextHandler(super().ljust(width, fillchar))
|
| 65 |
|
| 66 |
-
def rjust(
|
|
|
|
|
|
|
| 67 |
return TextHandler(super().rjust(width, fillchar))
|
| 68 |
|
| 69 |
-
def swapcase(self) -> Union[str,
|
| 70 |
return TextHandler(super().swapcase())
|
| 71 |
|
| 72 |
-
def title(self) -> Union[str,
|
| 73 |
return TextHandler(super().title())
|
| 74 |
|
| 75 |
-
def translate(self, table) -> Union[str,
|
| 76 |
return TextHandler(super().translate(table))
|
| 77 |
|
| 78 |
-
def zfill(self, width: SupportsIndex) -> Union[str,
|
| 79 |
return TextHandler(super().zfill(width))
|
| 80 |
|
| 81 |
-
def replace(
|
|
|
|
|
|
|
| 82 |
return TextHandler(super().replace(old, new, count))
|
| 83 |
|
| 84 |
-
def upper(self) -> Union[str,
|
| 85 |
return TextHandler(super().upper())
|
| 86 |
|
| 87 |
-
def lower(self) -> Union[str,
|
| 88 |
return TextHandler(super().lower())
|
|
|
|
| 89 |
##############
|
| 90 |
|
| 91 |
-
def sort(self, reverse: bool = False) -> Union[str,
|
| 92 |
"""Return a sorted version of the string"""
|
| 93 |
return self.__class__("".join(sorted(self, reverse=reverse)))
|
| 94 |
|
| 95 |
-
def clean(self) -> Union[str,
|
| 96 |
"""Return a new version of the string after removing all white spaces and consecutive spaces"""
|
| 97 |
-
data = re.sub(r
|
| 98 |
-
data = re.sub(
|
| 99 |
return self.__class__(data.strip())
|
| 100 |
|
| 101 |
# For easy copy-paste from Scrapy/parsel code when needed :)
|
|
@@ -122,8 +144,7 @@ class TextHandler(str):
|
|
| 122 |
replace_entities: bool = True,
|
| 123 |
clean_match: bool = False,
|
| 124 |
case_sensitive: bool = True,
|
| 125 |
-
) -> bool:
|
| 126 |
-
...
|
| 127 |
|
| 128 |
@typing.overload
|
| 129 |
def re(
|
|
@@ -133,12 +154,15 @@ class TextHandler(str):
|
|
| 133 |
clean_match: bool = False,
|
| 134 |
case_sensitive: bool = True,
|
| 135 |
check_match: Literal[False] = False,
|
| 136 |
-
) -> "TextHandlers[TextHandler]":
|
| 137 |
-
...
|
| 138 |
|
| 139 |
def re(
|
| 140 |
-
|
| 141 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
| 142 |
) -> Union["TextHandlers[TextHandler]", bool]:
|
| 143 |
"""Apply the given regex to the current text and return a list of strings with the matches.
|
| 144 |
|
|
@@ -164,12 +188,27 @@ class TextHandler(str):
|
|
| 164 |
results = flatten(results)
|
| 165 |
|
| 166 |
if not replace_entities:
|
| 167 |
-
return TextHandlers(
|
|
|
|
|
|
|
|
|
|
|
|
|
| 168 |
|
| 169 |
-
return TextHandlers(
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 170 |
|
| 171 |
-
def re_first(
|
| 172 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 173 |
"""Apply the given regex to text and return the first match if found, otherwise return the default value.
|
| 174 |
|
| 175 |
:param regex: Can be either a compiled regular expression or a string.
|
|
@@ -179,7 +218,12 @@ class TextHandler(str):
|
|
| 179 |
:param case_sensitive: if disabled, function will set the regex to ignore letters case while compiling it
|
| 180 |
|
| 181 |
"""
|
| 182 |
-
result = self.re(
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 183 |
return result[0] if result else default
|
| 184 |
|
| 185 |
|
|
@@ -187,6 +231,7 @@ class TextHandlers(List[TextHandler]):
|
|
| 187 |
"""
|
| 188 |
The :class:`TextHandlers` class is a subclass of the builtin ``List`` class, which provides a few additional methods.
|
| 189 |
"""
|
|
|
|
| 190 |
__slots__ = ()
|
| 191 |
|
| 192 |
@typing.overload
|
|
@@ -197,15 +242,22 @@ class TextHandlers(List[TextHandler]):
|
|
| 197 |
def __getitem__(self, pos: slice) -> "TextHandlers":
|
| 198 |
pass
|
| 199 |
|
| 200 |
-
def __getitem__(
|
|
|
|
|
|
|
| 201 |
lst = super().__getitem__(pos)
|
| 202 |
if isinstance(pos, slice):
|
| 203 |
lst = [TextHandler(s) for s in lst]
|
| 204 |
return TextHandlers(typing.cast(List[_TextHandlerType], lst))
|
| 205 |
return typing.cast(_TextHandlerType, TextHandler(lst))
|
| 206 |
|
| 207 |
-
def re(
|
| 208 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 209 |
"""Call the ``.re()`` method for each element in this list and return
|
| 210 |
their results flattened as TextHandlers.
|
| 211 |
|
|
@@ -219,8 +271,14 @@ class TextHandlers(List[TextHandler]):
|
|
| 219 |
]
|
| 220 |
return TextHandlers(flatten(results))
|
| 221 |
|
| 222 |
-
def re_first(
|
| 223 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 224 |
"""Call the ``.re_first()`` method for each element in this list and return
|
| 225 |
the first result or the default value otherwise.
|
| 226 |
|
|
@@ -251,26 +309,35 @@ class TextHandlers(List[TextHandler]):
|
|
| 251 |
|
| 252 |
class AttributesHandler(Mapping[str, _TextHandlerType]):
|
| 253 |
"""A read-only mapping to use instead of the standard dictionary for the speed boost but at the same time I use it to add more functionalities.
|
| 254 |
-
|
| 255 |
"""
|
| 256 |
-
|
|
|
|
| 257 |
|
| 258 |
def __init__(self, mapping=None, **kwargs):
|
| 259 |
-
mapping =
|
| 260 |
-
|
| 261 |
-
|
| 262 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
| 263 |
|
| 264 |
if kwargs:
|
| 265 |
-
mapping.update(
|
| 266 |
-
|
| 267 |
-
|
| 268 |
-
|
|
|
|
|
|
|
| 269 |
|
| 270 |
# Fastest read-only mapping type
|
| 271 |
self._data = MappingProxyType(mapping)
|
| 272 |
|
| 273 |
-
def get(
|
|
|
|
|
|
|
| 274 |
"""Acts like standard dictionary `.get()` method"""
|
| 275 |
return self._data.get(key, default)
|
| 276 |
|
|
|
|
| 6 |
from orjson import dumps, loads
|
| 7 |
from w3lib.html import replace_entities as _replace_entities
|
| 8 |
|
| 9 |
+
from scrapling.core._types import (
|
| 10 |
+
Dict,
|
| 11 |
+
Iterable,
|
| 12 |
+
List,
|
| 13 |
+
Literal,
|
| 14 |
+
Optional,
|
| 15 |
+
Pattern,
|
| 16 |
+
SupportsIndex,
|
| 17 |
+
TypeVar,
|
| 18 |
+
Union,
|
| 19 |
+
)
|
| 20 |
from scrapling.core.utils import _is_iterable, flatten
|
| 21 |
|
| 22 |
# Define type variable for AttributeHandler value type
|
| 23 |
+
_TextHandlerType = TypeVar("_TextHandlerType", bound="TextHandler")
|
| 24 |
|
| 25 |
|
| 26 |
class TextHandler(str):
|
| 27 |
"""Extends standard Python string by adding more functionality"""
|
| 28 |
+
|
| 29 |
__slots__ = ()
|
| 30 |
|
| 31 |
def __new__(cls, string):
|
|
|
|
| 35 |
lst = super().__getitem__(key)
|
| 36 |
return typing.cast(_TextHandlerType, TextHandler(lst))
|
| 37 |
|
| 38 |
+
def split(self, sep: str = None, maxsplit: SupportsIndex = -1) -> "TextHandlers":
|
| 39 |
return TextHandlers(
|
| 40 |
+
typing.cast(
|
| 41 |
+
List[_TextHandlerType],
|
| 42 |
+
[TextHandler(s) for s in super().split(sep, maxsplit)],
|
| 43 |
+
)
|
| 44 |
)
|
| 45 |
|
| 46 |
+
def strip(self, chars: str = None) -> Union[str, "TextHandler"]:
|
| 47 |
return TextHandler(super().strip(chars))
|
| 48 |
|
| 49 |
+
def lstrip(self, chars: str = None) -> Union[str, "TextHandler"]:
|
| 50 |
return TextHandler(super().lstrip(chars))
|
| 51 |
|
| 52 |
+
def rstrip(self, chars: str = None) -> Union[str, "TextHandler"]:
|
| 53 |
return TextHandler(super().rstrip(chars))
|
| 54 |
|
| 55 |
+
def capitalize(self) -> Union[str, "TextHandler"]:
|
| 56 |
return TextHandler(super().capitalize())
|
| 57 |
|
| 58 |
+
def casefold(self) -> Union[str, "TextHandler"]:
|
| 59 |
return TextHandler(super().casefold())
|
| 60 |
|
| 61 |
+
def center(
|
| 62 |
+
self, width: SupportsIndex, fillchar: str = " "
|
| 63 |
+
) -> Union[str, "TextHandler"]:
|
| 64 |
return TextHandler(super().center(width, fillchar))
|
| 65 |
|
| 66 |
+
def expandtabs(self, tabsize: SupportsIndex = 8) -> Union[str, "TextHandler"]:
|
| 67 |
return TextHandler(super().expandtabs(tabsize))
|
| 68 |
|
| 69 |
+
def format(self, *args: str, **kwargs: str) -> Union[str, "TextHandler"]:
|
| 70 |
return TextHandler(super().format(*args, **kwargs))
|
| 71 |
|
| 72 |
+
def format_map(self, mapping) -> Union[str, "TextHandler"]:
|
| 73 |
return TextHandler(super().format_map(mapping))
|
| 74 |
|
| 75 |
+
def join(self, iterable: Iterable[str]) -> Union[str, "TextHandler"]:
|
| 76 |
return TextHandler(super().join(iterable))
|
| 77 |
|
| 78 |
+
def ljust(
|
| 79 |
+
self, width: SupportsIndex, fillchar: str = " "
|
| 80 |
+
) -> Union[str, "TextHandler"]:
|
| 81 |
return TextHandler(super().ljust(width, fillchar))
|
| 82 |
|
| 83 |
+
def rjust(
|
| 84 |
+
self, width: SupportsIndex, fillchar: str = " "
|
| 85 |
+
) -> Union[str, "TextHandler"]:
|
| 86 |
return TextHandler(super().rjust(width, fillchar))
|
| 87 |
|
| 88 |
+
def swapcase(self) -> Union[str, "TextHandler"]:
|
| 89 |
return TextHandler(super().swapcase())
|
| 90 |
|
| 91 |
+
def title(self) -> Union[str, "TextHandler"]:
|
| 92 |
return TextHandler(super().title())
|
| 93 |
|
| 94 |
+
def translate(self, table) -> Union[str, "TextHandler"]:
|
| 95 |
return TextHandler(super().translate(table))
|
| 96 |
|
| 97 |
+
def zfill(self, width: SupportsIndex) -> Union[str, "TextHandler"]:
|
| 98 |
return TextHandler(super().zfill(width))
|
| 99 |
|
| 100 |
+
def replace(
|
| 101 |
+
self, old: str, new: str, count: SupportsIndex = -1
|
| 102 |
+
) -> Union[str, "TextHandler"]:
|
| 103 |
return TextHandler(super().replace(old, new, count))
|
| 104 |
|
| 105 |
+
def upper(self) -> Union[str, "TextHandler"]:
|
| 106 |
return TextHandler(super().upper())
|
| 107 |
|
| 108 |
+
def lower(self) -> Union[str, "TextHandler"]:
|
| 109 |
return TextHandler(super().lower())
|
| 110 |
+
|
| 111 |
##############
|
| 112 |
|
| 113 |
+
def sort(self, reverse: bool = False) -> Union[str, "TextHandler"]:
|
| 114 |
"""Return a sorted version of the string"""
|
| 115 |
return self.__class__("".join(sorted(self, reverse=reverse)))
|
| 116 |
|
| 117 |
+
def clean(self) -> Union[str, "TextHandler"]:
|
| 118 |
"""Return a new version of the string after removing all white spaces and consecutive spaces"""
|
| 119 |
+
data = re.sub(r"[\t|\r|\n]", "", self)
|
| 120 |
+
data = re.sub(" +", " ", data)
|
| 121 |
return self.__class__(data.strip())
|
| 122 |
|
| 123 |
# For easy copy-paste from Scrapy/parsel code when needed :)
|
|
|
|
| 144 |
replace_entities: bool = True,
|
| 145 |
clean_match: bool = False,
|
| 146 |
case_sensitive: bool = True,
|
| 147 |
+
) -> bool: ...
|
|
|
|
| 148 |
|
| 149 |
@typing.overload
|
| 150 |
def re(
|
|
|
|
| 154 |
clean_match: bool = False,
|
| 155 |
case_sensitive: bool = True,
|
| 156 |
check_match: Literal[False] = False,
|
| 157 |
+
) -> "TextHandlers[TextHandler]": ...
|
|
|
|
| 158 |
|
| 159 |
def re(
|
| 160 |
+
self,
|
| 161 |
+
regex: Union[str, Pattern[str]],
|
| 162 |
+
replace_entities: bool = True,
|
| 163 |
+
clean_match: bool = False,
|
| 164 |
+
case_sensitive: bool = True,
|
| 165 |
+
check_match: bool = False,
|
| 166 |
) -> Union["TextHandlers[TextHandler]", bool]:
|
| 167 |
"""Apply the given regex to the current text and return a list of strings with the matches.
|
| 168 |
|
|
|
|
| 188 |
results = flatten(results)
|
| 189 |
|
| 190 |
if not replace_entities:
|
| 191 |
+
return TextHandlers(
|
| 192 |
+
typing.cast(
|
| 193 |
+
List[_TextHandlerType], [TextHandler(string) for string in results]
|
| 194 |
+
)
|
| 195 |
+
)
|
| 196 |
|
| 197 |
+
return TextHandlers(
|
| 198 |
+
typing.cast(
|
| 199 |
+
List[_TextHandlerType],
|
| 200 |
+
[TextHandler(_replace_entities(s)) for s in results],
|
| 201 |
+
)
|
| 202 |
+
)
|
| 203 |
|
| 204 |
+
def re_first(
    self,
    regex: Union[str, Pattern[str]],
    default=None,
    replace_entities: bool = True,
    clean_match: bool = False,
    case_sensitive: bool = True,
) -> "TextHandler":
    """Run the given regex against the text and return only the first match,
    or `default` when nothing matches.

    :param regex: Can be either a compiled regular expression or a string.
    :param default: Value returned when the pattern does not match anything.
    :param replace_entities: When enabled, HTML entity references are replaced
        by their corresponding characters in the matches.
    :param clean_match: When enabled, matching runs against a cleaned version of the text.
    :param case_sensitive: If disabled, the regex is compiled to ignore letter case.
    """
    matches = self.re(
        regex,
        replace_entities,
        clean_match=clean_match,
        case_sensitive=case_sensitive,
    )
    if matches:
        return matches[0]
    return default
|
| 228 |
|
| 229 |
|
|
|
|
| 231 |
"""
|
| 232 |
The :class:`TextHandlers` class is a subclass of the builtin ``List`` class, which provides a few additional methods.
|
| 233 |
"""
|
| 234 |
+
|
| 235 |
__slots__ = ()
|
| 236 |
|
| 237 |
@typing.overload
|
|
|
|
| 242 |
def __getitem__(self, pos: slice) -> "TextHandlers":
|
| 243 |
pass
|
| 244 |
|
| 245 |
+
def __getitem__(
|
| 246 |
+
self, pos: Union[SupportsIndex, slice]
|
| 247 |
+
) -> Union[TextHandler, "TextHandlers"]:
|
| 248 |
lst = super().__getitem__(pos)
|
| 249 |
if isinstance(pos, slice):
|
| 250 |
lst = [TextHandler(s) for s in lst]
|
| 251 |
return TextHandlers(typing.cast(List[_TextHandlerType], lst))
|
| 252 |
return typing.cast(_TextHandlerType, TextHandler(lst))
|
| 253 |
|
| 254 |
+
def re(
|
| 255 |
+
self,
|
| 256 |
+
regex: Union[str, Pattern[str]],
|
| 257 |
+
replace_entities: bool = True,
|
| 258 |
+
clean_match: bool = False,
|
| 259 |
+
case_sensitive: bool = True,
|
| 260 |
+
) -> "TextHandlers[TextHandler]":
|
| 261 |
"""Call the ``.re()`` method for each element in this list and return
|
| 262 |
their results flattened as TextHandlers.
|
| 263 |
|
|
|
|
| 271 |
]
|
| 272 |
return TextHandlers(flatten(results))
|
| 273 |
|
| 274 |
+
def re_first(
|
| 275 |
+
self,
|
| 276 |
+
regex: Union[str, Pattern[str]],
|
| 277 |
+
default=None,
|
| 278 |
+
replace_entities: bool = True,
|
| 279 |
+
clean_match: bool = False,
|
| 280 |
+
case_sensitive: bool = True,
|
| 281 |
+
) -> TextHandler:
|
| 282 |
"""Call the ``.re_first()`` method for each element in this list and return
|
| 283 |
the first result or the default value otherwise.
|
| 284 |
|
|
|
|
| 309 |
|
| 310 |
class AttributesHandler(Mapping[str, _TextHandlerType]):
|
| 311 |
"""A read-only mapping to use instead of the standard dictionary for the speed boost but at the same time I use it to add more functionalities.
|
| 312 |
+
If standard dictionary is needed, just convert this class to dictionary with `dict` function
|
| 313 |
"""
|
| 314 |
+
|
| 315 |
+
__slots__ = ("_data",)
|
| 316 |
|
| 317 |
def __init__(self, mapping=None, **kwargs):
    """Build the read-only attribute mapping.

    String values are wrapped in `TextHandler`; keyword arguments override/extend
    the initial mapping.

    :param mapping: Optional mapping of attribute names to values.
    """
    data = {}
    if mapping is not None:
        for key, value in mapping.items():
            data[key] = TextHandler(value) if type(value) is str else value

    for key, value in kwargs.items():
        data[key] = TextHandler(value) if type(value) is str else value

    # Fastest read-only mapping type
    self._data = MappingProxyType(data)
|
| 337 |
|
| 338 |
+
def get(self, key: str, default: Optional[str] = None) -> Union[_TextHandlerType, None]:
    """Look up `key` like the standard dictionary `.get()` method, returning `default` on a miss."""
    try:
        return self._data[key]
    except KeyError:
        return default
|
| 343 |
|
scrapling/core/mixins.py
CHANGED
|
@@ -1,32 +1,33 @@
|
|
| 1 |
-
|
| 2 |
class SelectorsGeneration:
|
| 3 |
"""Selectors generation functions
|
| 4 |
Trying to generate selectors like Firefox or maybe cleaner ones!? Ehm
|
| 5 |
Inspiration: https://searchfox.org/mozilla-central/source/devtools/shared/inspector/css-logic.js#591"""
|
| 6 |
|
| 7 |
-
def __general_selection(self, selection: str =
|
| 8 |
"""Generate a selector for the current element.
|
| 9 |
:return: A string of the generated selector.
|
| 10 |
"""
|
| 11 |
selectorPath = []
|
| 12 |
target = self
|
| 13 |
-
css = selection.lower() ==
|
| 14 |
while target is not None:
|
| 15 |
if target.parent:
|
| 16 |
-
if target.attrib.get(
|
| 17 |
# id is enough
|
| 18 |
part = (
|
| 19 |
-
f
|
|
|
|
| 20 |
else f"[@id='{target.attrib['id']}']"
|
| 21 |
)
|
| 22 |
selectorPath.append(part)
|
| 23 |
if not full_path:
|
| 24 |
return (
|
| 25 |
-
" > ".join(reversed(selectorPath))
|
| 26 |
-
|
|
|
|
| 27 |
)
|
| 28 |
else:
|
| 29 |
-
part = f
|
| 30 |
# We won't use classes anymore because some websites share the exact same classes between elements
|
| 31 |
# classes = target.attrib.get('class', '').split()
|
| 32 |
# if classes and css:
|
|
@@ -41,23 +42,26 @@ class SelectorsGeneration:
|
|
| 41 |
|
| 42 |
if counter[target.tag] > 1:
|
| 43 |
part += (
|
| 44 |
-
f":nth-of-type({counter[target.tag]})"
|
|
|
|
| 45 |
else f"[{counter[target.tag]}]"
|
| 46 |
)
|
| 47 |
|
| 48 |
selectorPath.append(part)
|
| 49 |
target = target.parent
|
| 50 |
-
if target is None or target.tag ==
|
| 51 |
return (
|
| 52 |
-
" > ".join(reversed(selectorPath))
|
| 53 |
-
|
|
|
|
| 54 |
)
|
| 55 |
else:
|
| 56 |
break
|
| 57 |
|
| 58 |
return (
|
| 59 |
-
" > ".join(reversed(selectorPath))
|
| 60 |
-
|
|
|
|
| 61 |
)
|
| 62 |
|
| 63 |
@property
|
|
@@ -79,11 +83,11 @@ class SelectorsGeneration:
|
|
| 79 |
"""Generate a XPath selector for the current element
|
| 80 |
:return: A string of the generated selector.
|
| 81 |
"""
|
| 82 |
-
return self.__general_selection(
|
| 83 |
|
| 84 |
@property
|
| 85 |
def generate_full_xpath_selector(self) -> str:
|
| 86 |
"""Generate a complete XPath selector for the current element
|
| 87 |
:return: A string of the generated selector.
|
| 88 |
"""
|
| 89 |
-
return self.__general_selection(
|
|
|
|
|
|
|
| 1 |
class SelectorsGeneration:
|
| 2 |
"""Selectors generation functions
|
| 3 |
Trying to generate selectors like Firefox or maybe cleaner ones!? Ehm
|
| 4 |
Inspiration: https://searchfox.org/mozilla-central/source/devtools/shared/inspector/css-logic.js#591"""
|
| 5 |
|
| 6 |
+
def __general_selection(self, selection: str = "css", full_path=False) -> str:
|
| 7 |
"""Generate a selector for the current element.
|
| 8 |
:return: A string of the generated selector.
|
| 9 |
"""
|
| 10 |
selectorPath = []
|
| 11 |
target = self
|
| 12 |
+
css = selection.lower() == "css"
|
| 13 |
while target is not None:
|
| 14 |
if target.parent:
|
| 15 |
+
if target.attrib.get("id"):
|
| 16 |
# id is enough
|
| 17 |
part = (
|
| 18 |
+
f"#{target.attrib['id']}"
|
| 19 |
+
if css
|
| 20 |
else f"[@id='{target.attrib['id']}']"
|
| 21 |
)
|
| 22 |
selectorPath.append(part)
|
| 23 |
if not full_path:
|
| 24 |
return (
|
| 25 |
+
" > ".join(reversed(selectorPath))
|
| 26 |
+
if css
|
| 27 |
+
else "//*" + "/".join(reversed(selectorPath))
|
| 28 |
)
|
| 29 |
else:
|
| 30 |
+
part = f"{target.tag}"
|
| 31 |
# We won't use classes anymore because some websites share the exact same classes between elements
|
| 32 |
# classes = target.attrib.get('class', '').split()
|
| 33 |
# if classes and css:
|
|
|
|
| 42 |
|
| 43 |
if counter[target.tag] > 1:
|
| 44 |
part += (
|
| 45 |
+
f":nth-of-type({counter[target.tag]})"
|
| 46 |
+
if css
|
| 47 |
else f"[{counter[target.tag]}]"
|
| 48 |
)
|
| 49 |
|
| 50 |
selectorPath.append(part)
|
| 51 |
target = target.parent
|
| 52 |
+
if target is None or target.tag == "html":
|
| 53 |
return (
|
| 54 |
+
" > ".join(reversed(selectorPath))
|
| 55 |
+
if css
|
| 56 |
+
else "//" + "/".join(reversed(selectorPath))
|
| 57 |
)
|
| 58 |
else:
|
| 59 |
break
|
| 60 |
|
| 61 |
return (
|
| 62 |
+
" > ".join(reversed(selectorPath))
|
| 63 |
+
if css
|
| 64 |
+
else "//" + "/".join(reversed(selectorPath))
|
| 65 |
)
|
| 66 |
|
| 67 |
@property
|
|
|
|
| 83 |
"""Generate a XPath selector for the current element
|
| 84 |
:return: A string of the generated selector.
|
| 85 |
"""
|
| 86 |
+
return self.__general_selection("xpath")
|
| 87 |
|
| 88 |
@property
def generate_full_xpath_selector(self) -> str:
    """Generate a complete (full-path) XPath selector for the current element.

    :return: A string of the generated selector.
    """
    # Delegates to the shared generator, requesting the full path from the root.
    return self.__general_selection("xpath", full_path=True)
|
scrapling/core/storage_adaptors.py
CHANGED
|
@@ -20,7 +20,7 @@ class StorageSystemMixin(ABC):
|
|
| 20 |
self.url = url
|
| 21 |
|
| 22 |
@lru_cache(64, typed=True)
|
| 23 |
-
def _get_base_url(self, default_value: str =
|
| 24 |
if not self.url or type(self.url) is not str:
|
| 25 |
return default_value
|
| 26 |
|
|
@@ -38,7 +38,7 @@ class StorageSystemMixin(ABC):
|
|
| 38 |
:param identifier: This is the identifier that will be used to retrieve the element later from the storage. See
|
| 39 |
the docs for more info.
|
| 40 |
"""
|
| 41 |
-
raise NotImplementedError(
|
| 42 |
|
| 43 |
@abstractmethod
|
| 44 |
def retrieve(self, identifier: str) -> Optional[Dict]:
|
|
@@ -48,7 +48,7 @@ class StorageSystemMixin(ABC):
|
|
| 48 |
the docs for more info.
|
| 49 |
:return: A dictionary of the unique properties
|
| 50 |
"""
|
| 51 |
-
raise NotImplementedError(
|
| 52 |
|
| 53 |
@staticmethod
|
| 54 |
@lru_cache(128, typed=True)
|
|
@@ -57,7 +57,7 @@ class StorageSystemMixin(ABC):
|
|
| 57 |
identifier = identifier.lower().strip()
|
| 58 |
if isinstance(identifier, str):
|
| 59 |
# Hash functions have to take bytes
|
| 60 |
-
identifier = identifier.encode(
|
| 61 |
|
| 62 |
hash_value = sha256(identifier).hexdigest()
|
| 63 |
return f"{hash_value}_{len(identifier)}" # Length to reduce collision chance
|
|
@@ -68,6 +68,7 @@ class SQLiteStorageSystem(StorageSystemMixin):
|
|
| 68 |
"""The recommended system to use, it's race condition safe and thread safe.
|
| 69 |
Mainly built so the library can run in threaded frameworks like scrapy or threaded tools
|
| 70 |
> It's optimized for threaded applications but running it without threads shouldn't make it slow."""
|
|
|
|
| 71 |
def __init__(self, storage_file: str, url: Union[str, None] = None):
|
| 72 |
"""
|
| 73 |
:param storage_file: File to be used to store elements
|
|
@@ -111,10 +112,13 @@ class SQLiteStorageSystem(StorageSystemMixin):
|
|
| 111 |
url = self._get_base_url()
|
| 112 |
element_data = _StorageTools.element_to_dict(element)
|
| 113 |
with self.lock:
|
| 114 |
-
self.cursor.execute(
|
|
|
|
| 115 |
INSERT OR REPLACE INTO storage (url, identifier, element_data)
|
| 116 |
VALUES (?, ?, ?)
|
| 117 |
-
""",
|
|
|
|
|
|
|
| 118 |
self.cursor.fetchall()
|
| 119 |
self.connection.commit()
|
| 120 |
|
|
@@ -129,7 +133,7 @@ class SQLiteStorageSystem(StorageSystemMixin):
|
|
| 129 |
with self.lock:
|
| 130 |
self.cursor.execute(
|
| 131 |
"SELECT element_data FROM storage WHERE url = ? AND identifier = ?",
|
| 132 |
-
(url, identifier)
|
| 133 |
)
|
| 134 |
result = self.cursor.fetchone()
|
| 135 |
if result:
|
|
|
|
| 20 |
self.url = url
|
| 21 |
|
| 22 |
@lru_cache(64, typed=True)
|
| 23 |
+
def _get_base_url(self, default_value: str = "default") -> str:
|
| 24 |
if not self.url or type(self.url) is not str:
|
| 25 |
return default_value
|
| 26 |
|
|
|
|
| 38 |
:param identifier: This is the identifier that will be used to retrieve the element later from the storage. See
|
| 39 |
the docs for more info.
|
| 40 |
"""
|
| 41 |
+
raise NotImplementedError("Storage system must implement `save` method")
|
| 42 |
|
| 43 |
@abstractmethod
|
| 44 |
def retrieve(self, identifier: str) -> Optional[Dict]:
|
|
|
|
| 48 |
the docs for more info.
|
| 49 |
:return: A dictionary of the unique properties
|
| 50 |
"""
|
| 51 |
+
raise NotImplementedError("Storage system must implement `save` method")
|
| 52 |
|
| 53 |
@staticmethod
|
| 54 |
@lru_cache(128, typed=True)
|
|
|
|
| 57 |
identifier = identifier.lower().strip()
|
| 58 |
if isinstance(identifier, str):
|
| 59 |
# Hash functions have to take bytes
|
| 60 |
+
identifier = identifier.encode("utf-8")
|
| 61 |
|
| 62 |
hash_value = sha256(identifier).hexdigest()
|
| 63 |
return f"{hash_value}_{len(identifier)}" # Length to reduce collision chance
|
|
|
|
| 68 |
"""The recommended system to use, it's race condition safe and thread safe.
|
| 69 |
Mainly built so the library can run in threaded frameworks like scrapy or threaded tools
|
| 70 |
> It's optimized for threaded applications but running it without threads shouldn't make it slow."""
|
| 71 |
+
|
| 72 |
def __init__(self, storage_file: str, url: Union[str, None] = None):
|
| 73 |
"""
|
| 74 |
:param storage_file: File to be used to store elements
|
|
|
|
| 112 |
url = self._get_base_url()
|
| 113 |
element_data = _StorageTools.element_to_dict(element)
|
| 114 |
with self.lock:
|
| 115 |
+
self.cursor.execute(
|
| 116 |
+
"""
|
| 117 |
INSERT OR REPLACE INTO storage (url, identifier, element_data)
|
| 118 |
VALUES (?, ?, ?)
|
| 119 |
+
""",
|
| 120 |
+
(url, identifier, orjson.dumps(element_data)),
|
| 121 |
+
)
|
| 122 |
self.cursor.fetchall()
|
| 123 |
self.connection.commit()
|
| 124 |
|
|
|
|
| 133 |
with self.lock:
|
| 134 |
self.cursor.execute(
|
| 135 |
"SELECT element_data FROM storage WHERE url = ? AND identifier = ?",
|
| 136 |
+
(url, identifier),
|
| 137 |
)
|
| 138 |
result = self.cursor.fetchone()
|
| 139 |
if result:
|
scrapling/core/translator.py
CHANGED
|
@@ -24,7 +24,6 @@ replace_html5_whitespaces = re.compile(regex).sub
|
|
| 24 |
|
| 25 |
|
| 26 |
class XPathExpr(OriginalXPathExpr):
|
| 27 |
-
|
| 28 |
textnode: bool = False
|
| 29 |
attribute: Optional[str] = None
|
| 30 |
|
|
@@ -123,7 +122,7 @@ class TranslatorMixin:
|
|
| 123 |
|
| 124 |
@staticmethod
|
| 125 |
def xpath_attr_functional_pseudo_element(
|
| 126 |
-
|
| 127 |
) -> XPathExpr:
|
| 128 |
"""Support selecting attribute values using ::attr() pseudo-element"""
|
| 129 |
if function.argument_types() not in (["STRING"], ["IDENT"]):
|
|
|
|
| 24 |
|
| 25 |
|
| 26 |
class XPathExpr(OriginalXPathExpr):
|
|
|
|
| 27 |
textnode: bool = False
|
| 28 |
attribute: Optional[str] = None
|
| 29 |
|
|
|
|
| 122 |
|
| 123 |
@staticmethod
|
| 124 |
def xpath_attr_functional_pseudo_element(
|
| 125 |
+
xpath: OriginalXPathExpr, function: FunctionalPseudoElement
|
| 126 |
) -> XPathExpr:
|
| 127 |
"""Support selecting attribute values using ::attr() pseudo-element"""
|
| 128 |
if function.argument_types() not in (["STRING"], ["IDENT"]):
|
scrapling/core/utils.py
CHANGED
|
@@ -11,7 +11,9 @@ from scrapling.core._types import Any, Dict, Iterable, Union
|
|
| 11 |
# functools.cache is available on Python 3.9+ only so let's keep lru_cache
|
| 12 |
from functools import lru_cache # isort:skip
|
| 13 |
|
| 14 |
-
html_forbidden = {
|
|
|
|
|
|
|
| 15 |
|
| 16 |
|
| 17 |
@lru_cache(1, typed=True)
|
|
@@ -20,12 +22,11 @@ def setup_logger():
|
|
| 20 |
|
| 21 |
:returns: logging.Logger: Configured logger instance
|
| 22 |
"""
|
| 23 |
-
logger = logging.getLogger(
|
| 24 |
logger.setLevel(logging.INFO)
|
| 25 |
|
| 26 |
formatter = logging.Formatter(
|
| 27 |
-
fmt="[%(asctime)s] %(levelname)s: %(message)s",
|
| 28 |
-
datefmt="%Y-%m-%d %H:%M:%S"
|
| 29 |
)
|
| 30 |
|
| 31 |
console_handler = logging.StreamHandler()
|
|
@@ -58,7 +59,13 @@ def flatten(lst: Iterable):
|
|
| 58 |
|
| 59 |
def _is_iterable(s: Any):
|
| 60 |
# This will be used only in regex functions to make sure it's iterable but not string/bytes
|
| 61 |
-
return isinstance(
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 62 |
|
| 63 |
|
| 64 |
class _StorageTools:
|
|
@@ -66,31 +73,43 @@ class _StorageTools:
|
|
| 66 |
def __clean_attributes(element: html.HtmlElement, forbidden: tuple = ()) -> Dict:
|
| 67 |
if not element.attrib:
|
| 68 |
return {}
|
| 69 |
-
return {
|
|
|
|
|
|
|
|
|
|
|
|
|
| 70 |
|
| 71 |
@classmethod
|
| 72 |
def element_to_dict(cls, element: html.HtmlElement) -> Dict:
|
| 73 |
parent = element.getparent()
|
| 74 |
result = {
|
| 75 |
-
|
| 76 |
-
|
| 77 |
-
|
| 78 |
-
|
| 79 |
}
|
| 80 |
if parent is not None:
|
| 81 |
-
result.update(
|
| 82 |
-
|
| 83 |
-
|
| 84 |
-
|
| 85 |
-
|
|
|
|
|
|
|
| 86 |
|
| 87 |
-
siblings = [
|
|
|
|
|
|
|
| 88 |
if siblings:
|
| 89 |
-
result.update({
|
| 90 |
|
| 91 |
-
children = [
|
|
|
|
|
|
|
|
|
|
|
|
|
| 92 |
if children:
|
| 93 |
-
result.update({
|
| 94 |
|
| 95 |
return result
|
| 96 |
|
|
@@ -98,9 +117,9 @@ class _StorageTools:
|
|
| 98 |
def _get_element_path(cls, element: html.HtmlElement):
|
| 99 |
parent = element.getparent()
|
| 100 |
return tuple(
|
| 101 |
-
(element.tag,)
|
| 102 |
-
|
| 103 |
-
)
|
| 104 |
)
|
| 105 |
|
| 106 |
|
|
@@ -117,6 +136,6 @@ class _StorageTools:
|
|
| 117 |
|
| 118 |
@lru_cache(128, typed=True)
|
| 119 |
def clean_spaces(string):
|
| 120 |
-
string = string.replace(
|
| 121 |
-
string = re.sub(
|
| 122 |
-
return re.sub(
|
|
|
|
| 11 |
# functools.cache is available on Python 3.9+ only so let's keep lru_cache
|
| 12 |
from functools import lru_cache # isort:skip
|
| 13 |
|
| 14 |
+
html_forbidden = {
|
| 15 |
+
html.HtmlComment,
|
| 16 |
+
}
|
| 17 |
|
| 18 |
|
| 19 |
@lru_cache(1, typed=True)
|
|
|
|
| 22 |
|
| 23 |
:returns: logging.Logger: Configured logger instance
|
| 24 |
"""
|
| 25 |
+
logger = logging.getLogger("scrapling")
|
| 26 |
logger.setLevel(logging.INFO)
|
| 27 |
|
| 28 |
formatter = logging.Formatter(
|
| 29 |
+
fmt="[%(asctime)s] %(levelname)s: %(message)s", datefmt="%Y-%m-%d %H:%M:%S"
|
|
|
|
| 30 |
)
|
| 31 |
|
| 32 |
console_handler = logging.StreamHandler()
|
|
|
|
| 59 |
|
| 60 |
def _is_iterable(s: Any):
|
| 61 |
# This will be used only in regex functions to make sure it's iterable but not string/bytes
|
| 62 |
+
return isinstance(
|
| 63 |
+
s,
|
| 64 |
+
(
|
| 65 |
+
list,
|
| 66 |
+
tuple,
|
| 67 |
+
),
|
| 68 |
+
)
|
| 69 |
|
| 70 |
|
| 71 |
class _StorageTools:
|
|
|
|
| 73 |
def __clean_attributes(element: html.HtmlElement, forbidden: tuple = ()) -> Dict:
    """Return the element's attributes as a dict, dropping forbidden keys and
    attributes whose values are empty or whitespace-only; kept values are stripped."""
    attributes = element.attrib
    if not attributes:
        return {}
    cleaned = {}
    for name, value in attributes.items():
        if name in forbidden or not value or not value.strip():
            continue
        cleaned[name] = value.strip()
    return cleaned
|
| 81 |
|
| 82 |
@classmethod
|
| 83 |
def element_to_dict(cls, element: html.HtmlElement) -> Dict:
|
| 84 |
parent = element.getparent()
|
| 85 |
result = {
|
| 86 |
+
"tag": str(element.tag),
|
| 87 |
+
"attributes": cls.__clean_attributes(element),
|
| 88 |
+
"text": element.text.strip() if element.text else None,
|
| 89 |
+
"path": cls._get_element_path(element),
|
| 90 |
}
|
| 91 |
if parent is not None:
|
| 92 |
+
result.update(
|
| 93 |
+
{
|
| 94 |
+
"parent_name": parent.tag,
|
| 95 |
+
"parent_attribs": dict(parent.attrib),
|
| 96 |
+
"parent_text": parent.text.strip() if parent.text else None,
|
| 97 |
+
}
|
| 98 |
+
)
|
| 99 |
|
| 100 |
+
siblings = [
|
| 101 |
+
child.tag for child in parent.iterchildren() if child != element
|
| 102 |
+
]
|
| 103 |
if siblings:
|
| 104 |
+
result.update({"siblings": tuple(siblings)})
|
| 105 |
|
| 106 |
+
children = [
|
| 107 |
+
child.tag
|
| 108 |
+
for child in element.iterchildren()
|
| 109 |
+
if type(child) not in html_forbidden
|
| 110 |
+
]
|
| 111 |
if children:
|
| 112 |
+
result.update({"children": tuple(children)})
|
| 113 |
|
| 114 |
return result
|
| 115 |
|
|
|
|
| 117 |
def _get_element_path(cls, element: html.HtmlElement):
    """Walk up the tree and return a tuple of tag names from the root down to `element`."""
    parent = element.getparent()
    if parent is None:
        return (element.tag,)
    # Prepend the ancestors' path recursively, then append this element's tag.
    return cls._get_element_path(parent) + (element.tag,)
|
| 124 |
|
| 125 |
|
|
|
|
| 136 |
|
| 137 |
@lru_cache(128, typed=True)
def clean_spaces(string):
    """Normalize whitespace: tabs become single spaces, newlines and carriage
    returns are removed, and runs of spaces collapse into one space.

    Fix: the previous pattern put a literal pipe inside the regex character
    class (alternation has no meaning there), so `|` characters were silently
    deleted from the text as well.
    """
    string = string.replace("\t", " ")
    # Remove only the real targets: newline and carriage return.
    string = re.sub(r"[\n\r]", "", string)
    return re.sub(r" +", " ", string)
|
scrapling/defaults.py
CHANGED
|
@@ -5,21 +5,33 @@ from scrapling.core.utils import log
|
|
| 5 |
# A lightweight approach to create lazy loader for each import for backward compatibility
|
| 6 |
# This will reduce the initial memory footprint significantly (only loads what's used)
|
| 7 |
def __getattr__(name):
|
| 8 |
-
if name ==
|
| 9 |
from scrapling.fetchers import Fetcher as cls
|
| 10 |
-
|
|
|
|
|
|
|
|
|
|
| 11 |
return cls
|
| 12 |
-
elif name ==
|
| 13 |
from scrapling.fetchers import AsyncFetcher as cls
|
| 14 |
-
|
|
|
|
|
|
|
|
|
|
| 15 |
return cls
|
| 16 |
-
elif name ==
|
| 17 |
from scrapling.fetchers import StealthyFetcher as cls
|
| 18 |
-
|
|
|
|
|
|
|
|
|
|
| 19 |
return cls
|
| 20 |
-
elif name ==
|
| 21 |
from scrapling.fetchers import PlayWrightFetcher as cls
|
| 22 |
-
|
|
|
|
|
|
|
|
|
|
| 23 |
return cls
|
| 24 |
else:
|
| 25 |
raise AttributeError(f"module 'scrapling' has no attribute '{name}'")
|
|
|
|
| 5 |
# A lightweight approach to create lazy loader for each import for backward compatibility
|
| 6 |
# This will reduces initial memory footprint significantly (only loads what's used)
|
| 7 |
def __getattr__(name):
|
| 8 |
+
if name == "Fetcher":
|
| 9 |
from scrapling.fetchers import Fetcher as cls
|
| 10 |
+
|
| 11 |
+
log.warning(
|
| 12 |
+
"This import is deprecated now and it will be removed with v0.3. Use `from scrapling.fetchers import Fetcher` instead"
|
| 13 |
+
)
|
| 14 |
return cls
|
| 15 |
+
elif name == "AsyncFetcher":
|
| 16 |
from scrapling.fetchers import AsyncFetcher as cls
|
| 17 |
+
|
| 18 |
+
log.warning(
|
| 19 |
+
"This import is deprecated now and it will be removed with v0.3. Use `from scrapling.fetchers import AsyncFetcher` instead"
|
| 20 |
+
)
|
| 21 |
return cls
|
| 22 |
+
elif name == "StealthyFetcher":
|
| 23 |
from scrapling.fetchers import StealthyFetcher as cls
|
| 24 |
+
|
| 25 |
+
log.warning(
|
| 26 |
+
"This import is deprecated now and it will be removed with v0.3. Use `from scrapling.fetchers import StealthyFetcher` instead"
|
| 27 |
+
)
|
| 28 |
return cls
|
| 29 |
+
elif name == "PlayWrightFetcher":
|
| 30 |
from scrapling.fetchers import PlayWrightFetcher as cls
|
| 31 |
+
|
| 32 |
+
log.warning(
|
| 33 |
+
"This import is deprecated now and it will be removed with v0.3. Use `from scrapling.fetchers import PlayWrightFetcher` instead"
|
| 34 |
+
)
|
| 35 |
return cls
|
| 36 |
else:
|
| 37 |
raise AttributeError(f"module 'scrapling' has no attribute '{name}'")
|
scrapling/engines/__init__.py
CHANGED
|
@@ -4,4 +4,4 @@ from .pw import PlaywrightEngine
|
|
| 4 |
from .static import StaticEngine
|
| 5 |
from .toolbelt import check_if_engine_usable
|
| 6 |
|
| 7 |
-
__all__ = [
|
|
|
|
| 4 |
from .static import StaticEngine
|
| 5 |
from .toolbelt import check_if_engine_usable
|
| 6 |
|
| 7 |
+
__all__ = ["CamoufoxEngine", "PlaywrightEngine"]
|
scrapling/engines/camo.py
CHANGED
|
@@ -2,27 +2,52 @@ from camoufox import DefaultAddons
|
|
| 2 |
from camoufox.async_api import AsyncCamoufox
|
| 3 |
from camoufox.sync_api import Camoufox
|
| 4 |
|
| 5 |
-
from scrapling.core._types import (
|
| 6 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 7 |
from scrapling.core.utils import log
|
| 8 |
-
from scrapling.engines.toolbelt import (
|
| 9 |
-
|
| 10 |
-
|
| 11 |
-
|
| 12 |
-
|
| 13 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
| 14 |
|
| 15 |
|
| 16 |
class CamoufoxEngine:
|
| 17 |
def __init__(
|
| 18 |
-
|
| 19 |
-
|
| 20 |
-
|
| 21 |
-
|
| 22 |
-
|
| 23 |
-
|
| 24 |
-
|
| 25 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 26 |
):
|
| 27 |
"""An engine that utilizes Camoufox library, check the `StealthyFetcher` class for more documentation.
|
| 28 |
|
|
@@ -97,7 +122,7 @@ class CamoufoxEngine:
|
|
| 97 |
"block_webrtc": self.block_webrtc,
|
| 98 |
"block_images": self.block_images, # Careful! it makes some websites doesn't finish loading at all like stackoverflow even in headful
|
| 99 |
"os": None if self.os_randomize else get_os_name(),
|
| 100 |
-
**self.additional_arguments
|
| 101 |
}
|
| 102 |
|
| 103 |
def _process_response_history(self, first_response):
|
|
@@ -109,19 +134,30 @@ class CamoufoxEngine:
|
|
| 109 |
while current_request:
|
| 110 |
try:
|
| 111 |
current_response = current_request.response()
|
| 112 |
-
history.insert(
|
| 113 |
-
|
| 114 |
-
|
| 115 |
-
|
| 116 |
-
|
| 117 |
-
|
| 118 |
-
|
| 119 |
-
|
| 120 |
-
|
| 121 |
-
|
| 122 |
-
|
| 123 |
-
|
| 124 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 125 |
except Exception as e:
|
| 126 |
log.error(f"Error processing redirect: {e}")
|
| 127 |
break
|
|
@@ -141,19 +177,30 @@ class CamoufoxEngine:
|
|
| 141 |
while current_request:
|
| 142 |
try:
|
| 143 |
current_response = await current_request.response()
|
| 144 |
-
history.insert(
|
| 145 |
-
|
| 146 |
-
|
| 147 |
-
|
| 148 |
-
|
| 149 |
-
|
| 150 |
-
|
| 151 |
-
|
| 152 |
-
|
| 153 |
-
|
| 154 |
-
|
| 155 |
-
|
| 156 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 157 |
except Exception as e:
|
| 158 |
log.error(f"Error processing redirect: {e}")
|
| 159 |
break
|
|
@@ -175,7 +222,10 @@ class CamoufoxEngine:
|
|
| 175 |
|
| 176 |
def handle_response(finished_response):
|
| 177 |
nonlocal final_response
|
| 178 |
-
if
|
|
|
|
|
|
|
|
|
|
| 179 |
final_response = finished_response
|
| 180 |
|
| 181 |
with Camoufox(**self._get_camoufox_options()) as browser:
|
|
@@ -195,7 +245,7 @@ class CamoufoxEngine:
|
|
| 195 |
page.wait_for_load_state(state="domcontentloaded")
|
| 196 |
|
| 197 |
if self.network_idle:
|
| 198 |
-
page.wait_for_load_state(
|
| 199 |
|
| 200 |
if self.page_action is not None:
|
| 201 |
try:
|
|
@@ -211,7 +261,7 @@ class CamoufoxEngine:
|
|
| 211 |
page.wait_for_load_state(state="load")
|
| 212 |
page.wait_for_load_state(state="domcontentloaded")
|
| 213 |
if self.network_idle:
|
| 214 |
-
page.wait_for_load_state(
|
| 215 |
except Exception as e:
|
| 216 |
log.error(f"Error waiting for selector {self.wait_selector}: {e}")
|
| 217 |
|
|
@@ -222,9 +272,13 @@ class CamoufoxEngine:
|
|
| 222 |
raise ValueError("Failed to get a response from the page")
|
| 223 |
|
| 224 |
# This will be parsed inside `Response`
|
| 225 |
-
encoding =
|
|
|
|
|
|
|
| 226 |
# PlayWright API sometimes give empty status text for some reason!
|
| 227 |
-
status_text = final_response.status_text or StatusText.get(
|
|
|
|
|
|
|
| 228 |
|
| 229 |
history = self._process_response_history(first_response)
|
| 230 |
try:
|
|
@@ -236,15 +290,17 @@ class CamoufoxEngine:
|
|
| 236 |
response = Response(
|
| 237 |
url=page.url,
|
| 238 |
text=page_content,
|
| 239 |
-
body=page_content.encode(
|
| 240 |
status=final_response.status,
|
| 241 |
reason=status_text,
|
| 242 |
encoding=encoding,
|
| 243 |
-
cookies={
|
|
|
|
|
|
|
| 244 |
headers=first_response.all_headers(),
|
| 245 |
request_headers=first_response.request.all_headers(),
|
| 246 |
history=history,
|
| 247 |
-
**self.adaptor_arguments
|
| 248 |
)
|
| 249 |
page.close()
|
| 250 |
context.close()
|
|
@@ -262,7 +318,10 @@ class CamoufoxEngine:
|
|
| 262 |
|
| 263 |
async def handle_response(finished_response):
|
| 264 |
nonlocal final_response
|
| 265 |
-
if
|
|
|
|
|
|
|
|
|
|
| 266 |
final_response = finished_response
|
| 267 |
|
| 268 |
async with AsyncCamoufox(**self._get_camoufox_options()) as browser:
|
|
@@ -282,7 +341,7 @@ class CamoufoxEngine:
|
|
| 282 |
await page.wait_for_load_state(state="domcontentloaded")
|
| 283 |
|
| 284 |
if self.network_idle:
|
| 285 |
-
await page.wait_for_load_state(
|
| 286 |
|
| 287 |
if self.page_action is not None:
|
| 288 |
try:
|
|
@@ -298,7 +357,7 @@ class CamoufoxEngine:
|
|
| 298 |
await page.wait_for_load_state(state="load")
|
| 299 |
await page.wait_for_load_state(state="domcontentloaded")
|
| 300 |
if self.network_idle:
|
| 301 |
-
await page.wait_for_load_state(
|
| 302 |
except Exception as e:
|
| 303 |
log.error(f"Error waiting for selector {self.wait_selector}: {e}")
|
| 304 |
|
|
@@ -309,9 +368,13 @@ class CamoufoxEngine:
|
|
| 309 |
raise ValueError("Failed to get a response from the page")
|
| 310 |
|
| 311 |
# This will be parsed inside `Response`
|
| 312 |
-
encoding =
|
|
|
|
|
|
|
| 313 |
# PlayWright API sometimes give empty status text for some reason!
|
| 314 |
-
status_text = final_response.status_text or StatusText.get(
|
|
|
|
|
|
|
| 315 |
|
| 316 |
history = await self._async_process_response_history(first_response)
|
| 317 |
try:
|
|
@@ -323,15 +386,18 @@ class CamoufoxEngine:
|
|
| 323 |
response = Response(
|
| 324 |
url=page.url,
|
| 325 |
text=page_content,
|
| 326 |
-
body=page_content.encode(
|
| 327 |
status=final_response.status,
|
| 328 |
reason=status_text,
|
| 329 |
encoding=encoding,
|
| 330 |
-
cookies={
|
|
|
|
|
|
|
|
|
|
| 331 |
headers=await first_response.all_headers(),
|
| 332 |
request_headers=await first_response.request.all_headers(),
|
| 333 |
history=history,
|
| 334 |
-
**self.adaptor_arguments
|
| 335 |
)
|
| 336 |
await page.close()
|
| 337 |
await context.close()
|
|
|
|
| 2 |
from camoufox.async_api import AsyncCamoufox
|
| 3 |
from camoufox.sync_api import Camoufox
|
| 4 |
|
| 5 |
+
from scrapling.core._types import (
|
| 6 |
+
Callable,
|
| 7 |
+
Dict,
|
| 8 |
+
List,
|
| 9 |
+
Literal,
|
| 10 |
+
Optional,
|
| 11 |
+
SelectorWaitStates,
|
| 12 |
+
Union,
|
| 13 |
+
)
|
| 14 |
from scrapling.core.utils import log
|
| 15 |
+
from scrapling.engines.toolbelt import (
|
| 16 |
+
Response,
|
| 17 |
+
StatusText,
|
| 18 |
+
async_intercept_route,
|
| 19 |
+
check_type_validity,
|
| 20 |
+
construct_proxy_dict,
|
| 21 |
+
generate_convincing_referer,
|
| 22 |
+
get_os_name,
|
| 23 |
+
intercept_route,
|
| 24 |
+
)
|
| 25 |
|
| 26 |
|
| 27 |
class CamoufoxEngine:
|
| 28 |
def __init__(
|
| 29 |
+
self,
|
| 30 |
+
headless: Union[bool, Literal["virtual"]] = True, # noqa: F821
|
| 31 |
+
block_images: bool = False,
|
| 32 |
+
disable_resources: bool = False,
|
| 33 |
+
block_webrtc: bool = False,
|
| 34 |
+
allow_webgl: bool = True,
|
| 35 |
+
network_idle: bool = False,
|
| 36 |
+
humanize: Union[bool, float] = True,
|
| 37 |
+
wait: Optional[int] = 0,
|
| 38 |
+
timeout: Optional[float] = 30000,
|
| 39 |
+
page_action: Callable = None,
|
| 40 |
+
wait_selector: Optional[str] = None,
|
| 41 |
+
addons: Optional[List[str]] = None,
|
| 42 |
+
wait_selector_state: SelectorWaitStates = "attached",
|
| 43 |
+
google_search: bool = True,
|
| 44 |
+
extra_headers: Optional[Dict[str, str]] = None,
|
| 45 |
+
proxy: Optional[Union[str, Dict[str, str]]] = None,
|
| 46 |
+
os_randomize: bool = False,
|
| 47 |
+
disable_ads: bool = False,
|
| 48 |
+
geoip: bool = False,
|
| 49 |
+
adaptor_arguments: Dict = None,
|
| 50 |
+
additional_arguments: Dict = None,
|
| 51 |
):
|
| 52 |
"""An engine that utilizes Camoufox library, check the `StealthyFetcher` class for more documentation.
|
| 53 |
|
|
|
|
| 122 |
"block_webrtc": self.block_webrtc,
|
| 123 |
"block_images": self.block_images, # Careful! it makes some websites doesn't finish loading at all like stackoverflow even in headful
|
| 124 |
"os": None if self.os_randomize else get_os_name(),
|
| 125 |
+
**self.additional_arguments,
|
| 126 |
}
|
| 127 |
|
| 128 |
def _process_response_history(self, first_response):
|
|
|
|
| 134 |
while current_request:
|
| 135 |
try:
|
| 136 |
current_response = current_request.response()
|
| 137 |
+
history.insert(
|
| 138 |
+
0,
|
| 139 |
+
Response(
|
| 140 |
+
url=current_request.url,
|
| 141 |
+
# using current_response.text() will trigger "Error: Response.text: Response body is unavailable for redirect responses"
|
| 142 |
+
text="",
|
| 143 |
+
body=b"",
|
| 144 |
+
status=current_response.status if current_response else 301,
|
| 145 |
+
reason=(
|
| 146 |
+
current_response.status_text
|
| 147 |
+
or StatusText.get(current_response.status)
|
| 148 |
+
)
|
| 149 |
+
if current_response
|
| 150 |
+
else StatusText.get(301),
|
| 151 |
+
encoding=current_response.headers.get("content-type", "")
|
| 152 |
+
or "utf-8",
|
| 153 |
+
cookies={},
|
| 154 |
+
headers=current_response.all_headers()
|
| 155 |
+
if current_response
|
| 156 |
+
else {},
|
| 157 |
+
request_headers=current_request.all_headers(),
|
| 158 |
+
**self.adaptor_arguments,
|
| 159 |
+
),
|
| 160 |
+
)
|
| 161 |
except Exception as e:
|
| 162 |
log.error(f"Error processing redirect: {e}")
|
| 163 |
break
|
|
|
|
| 177 |
while current_request:
|
| 178 |
try:
|
| 179 |
current_response = await current_request.response()
|
| 180 |
+
history.insert(
|
| 181 |
+
0,
|
| 182 |
+
Response(
|
| 183 |
+
url=current_request.url,
|
| 184 |
+
# using current_response.text() will trigger "Error: Response.text: Response body is unavailable for redirect responses"
|
| 185 |
+
text="",
|
| 186 |
+
body=b"",
|
| 187 |
+
status=current_response.status if current_response else 301,
|
| 188 |
+
reason=(
|
| 189 |
+
current_response.status_text
|
| 190 |
+
or StatusText.get(current_response.status)
|
| 191 |
+
)
|
| 192 |
+
if current_response
|
| 193 |
+
else StatusText.get(301),
|
| 194 |
+
encoding=current_response.headers.get("content-type", "")
|
| 195 |
+
or "utf-8",
|
| 196 |
+
cookies={},
|
| 197 |
+
headers=await current_response.all_headers()
|
| 198 |
+
if current_response
|
| 199 |
+
else {},
|
| 200 |
+
request_headers=await current_request.all_headers(),
|
| 201 |
+
**self.adaptor_arguments,
|
| 202 |
+
),
|
| 203 |
+
)
|
| 204 |
except Exception as e:
|
| 205 |
log.error(f"Error processing redirect: {e}")
|
| 206 |
break
|
|
|
|
| 222 |
|
| 223 |
def handle_response(finished_response):
|
| 224 |
nonlocal final_response
|
| 225 |
+
if (
|
| 226 |
+
finished_response.request.resource_type == "document"
|
| 227 |
+
and finished_response.request.is_navigation_request()
|
| 228 |
+
):
|
| 229 |
final_response = finished_response
|
| 230 |
|
| 231 |
with Camoufox(**self._get_camoufox_options()) as browser:
|
|
|
|
| 245 |
page.wait_for_load_state(state="domcontentloaded")
|
| 246 |
|
| 247 |
if self.network_idle:
|
| 248 |
+
page.wait_for_load_state("networkidle")
|
| 249 |
|
| 250 |
if self.page_action is not None:
|
| 251 |
try:
|
|
|
|
| 261 |
page.wait_for_load_state(state="load")
|
| 262 |
page.wait_for_load_state(state="domcontentloaded")
|
| 263 |
if self.network_idle:
|
| 264 |
+
page.wait_for_load_state("networkidle")
|
| 265 |
except Exception as e:
|
| 266 |
log.error(f"Error waiting for selector {self.wait_selector}: {e}")
|
| 267 |
|
|
|
|
| 272 |
raise ValueError("Failed to get a response from the page")
|
| 273 |
|
| 274 |
# This will be parsed inside `Response`
|
| 275 |
+
encoding = (
|
| 276 |
+
final_response.headers.get("content-type", "") or "utf-8"
|
| 277 |
+
) # default encoding
|
| 278 |
# PlayWright API sometimes give empty status text for some reason!
|
| 279 |
+
status_text = final_response.status_text or StatusText.get(
|
| 280 |
+
final_response.status
|
| 281 |
+
)
|
| 282 |
|
| 283 |
history = self._process_response_history(first_response)
|
| 284 |
try:
|
|
|
|
| 290 |
response = Response(
|
| 291 |
url=page.url,
|
| 292 |
text=page_content,
|
| 293 |
+
body=page_content.encode("utf-8"),
|
| 294 |
status=final_response.status,
|
| 295 |
reason=status_text,
|
| 296 |
encoding=encoding,
|
| 297 |
+
cookies={
|
| 298 |
+
cookie["name"]: cookie["value"] for cookie in page.context.cookies()
|
| 299 |
+
},
|
| 300 |
headers=first_response.all_headers(),
|
| 301 |
request_headers=first_response.request.all_headers(),
|
| 302 |
history=history,
|
| 303 |
+
**self.adaptor_arguments,
|
| 304 |
)
|
| 305 |
page.close()
|
| 306 |
context.close()
|
|
|
|
| 318 |
|
| 319 |
async def handle_response(finished_response):
|
| 320 |
nonlocal final_response
|
| 321 |
+
if (
|
| 322 |
+
finished_response.request.resource_type == "document"
|
| 323 |
+
and finished_response.request.is_navigation_request()
|
| 324 |
+
):
|
| 325 |
final_response = finished_response
|
| 326 |
|
| 327 |
async with AsyncCamoufox(**self._get_camoufox_options()) as browser:
|
|
|
|
| 341 |
await page.wait_for_load_state(state="domcontentloaded")
|
| 342 |
|
| 343 |
if self.network_idle:
|
| 344 |
+
await page.wait_for_load_state("networkidle")
|
| 345 |
|
| 346 |
if self.page_action is not None:
|
| 347 |
try:
|
|
|
|
| 357 |
await page.wait_for_load_state(state="load")
|
| 358 |
await page.wait_for_load_state(state="domcontentloaded")
|
| 359 |
if self.network_idle:
|
| 360 |
+
await page.wait_for_load_state("networkidle")
|
| 361 |
except Exception as e:
|
| 362 |
log.error(f"Error waiting for selector {self.wait_selector}: {e}")
|
| 363 |
|
|
|
|
| 368 |
raise ValueError("Failed to get a response from the page")
|
| 369 |
|
| 370 |
# This will be parsed inside `Response`
|
| 371 |
+
encoding = (
|
| 372 |
+
final_response.headers.get("content-type", "") or "utf-8"
|
| 373 |
+
) # default encoding
|
| 374 |
# PlayWright API sometimes give empty status text for some reason!
|
| 375 |
+
status_text = final_response.status_text or StatusText.get(
|
| 376 |
+
final_response.status
|
| 377 |
+
)
|
| 378 |
|
| 379 |
history = await self._async_process_response_history(first_response)
|
| 380 |
try:
|
|
|
|
| 386 |
response = Response(
|
| 387 |
url=page.url,
|
| 388 |
text=page_content,
|
| 389 |
+
body=page_content.encode("utf-8"),
|
| 390 |
status=final_response.status,
|
| 391 |
reason=status_text,
|
| 392 |
encoding=encoding,
|
| 393 |
+
cookies={
|
| 394 |
+
cookie["name"]: cookie["value"]
|
| 395 |
+
for cookie in await page.context.cookies()
|
| 396 |
+
},
|
| 397 |
headers=await first_response.all_headers(),
|
| 398 |
request_headers=await first_response.request.all_headers(),
|
| 399 |
history=history,
|
| 400 |
+
**self.adaptor_arguments,
|
| 401 |
)
|
| 402 |
await page.close()
|
| 403 |
await context.close()
|
scrapling/engines/constants.py
CHANGED
|
@@ -1,92 +1,92 @@
|
|
| 1 |
# Disable loading these resources for speed
|
| 2 |
DEFAULT_DISABLED_RESOURCES = {
|
| 3 |
-
|
| 4 |
-
|
| 5 |
-
|
| 6 |
-
|
| 7 |
-
|
| 8 |
-
|
| 9 |
-
|
| 10 |
-
|
| 11 |
-
|
| 12 |
-
|
| 13 |
}
|
| 14 |
|
| 15 |
DEFAULT_STEALTH_FLAGS = (
|
| 16 |
# Explanation: https://peter.sh/experiments/chromium-command-line-switches/
|
| 17 |
# Generally this will make the browser faster and less detectable
|
| 18 |
-
|
| 19 |
-
|
| 20 |
-
|
| 21 |
-
|
| 22 |
-
|
| 23 |
-
|
| 24 |
-
|
| 25 |
-
|
| 26 |
-
|
| 27 |
-
|
| 28 |
-
|
| 29 |
-
|
| 30 |
-
|
| 31 |
-
|
| 32 |
-
|
| 33 |
-
|
| 34 |
-
|
| 35 |
-
|
| 36 |
-
|
| 37 |
-
|
| 38 |
-
|
| 39 |
-
|
| 40 |
-
|
| 41 |
-
|
| 42 |
-
|
| 43 |
-
|
| 44 |
-
|
| 45 |
-
|
| 46 |
-
|
| 47 |
# '--disable-popup-blocking',
|
| 48 |
-
|
| 49 |
-
|
| 50 |
-
|
| 51 |
-
|
| 52 |
-
|
| 53 |
-
|
| 54 |
-
|
| 55 |
-
|
| 56 |
-
|
| 57 |
-
|
| 58 |
-
|
| 59 |
-
|
| 60 |
-
|
| 61 |
-
|
| 62 |
-
|
| 63 |
# '--disable-reading-from-canvas', # For Firefox
|
| 64 |
-
|
| 65 |
-
|
| 66 |
-
|
| 67 |
-
|
| 68 |
-
|
| 69 |
-
|
| 70 |
-
|
| 71 |
-
|
| 72 |
-
|
| 73 |
-
|
| 74 |
-
|
| 75 |
-
|
| 76 |
-
|
| 77 |
-
|
| 78 |
-
|
| 79 |
-
|
| 80 |
-
|
| 81 |
-
|
| 82 |
-
|
| 83 |
-
|
| 84 |
-
|
| 85 |
-
|
| 86 |
-
|
| 87 |
-
|
| 88 |
-
|
| 89 |
-
|
| 90 |
)
|
| 91 |
|
| 92 |
# Defaulting to the docker mode, token doesn't matter in it as it's passed for the container
|
|
@@ -95,13 +95,10 @@ NSTBROWSER_DEFAULT_QUERY = {
|
|
| 95 |
"headless": True,
|
| 96 |
"autoClose": True,
|
| 97 |
"fingerprint": {
|
| 98 |
-
"flags": {
|
| 99 |
-
|
| 100 |
-
|
| 101 |
-
|
| 102 |
-
"platform": 'linux', # support: windows, mac, linux
|
| 103 |
-
"kernel": 'chromium', # only support: chromium
|
| 104 |
-
"kernelMilestone": '128',
|
| 105 |
"hardwareConcurrency": 8,
|
| 106 |
"deviceMemory": 8,
|
| 107 |
},
|
|
|
|
| 1 |
# Disable loading these resources for speed
|
| 2 |
DEFAULT_DISABLED_RESOURCES = {
|
| 3 |
+
"font",
|
| 4 |
+
"image",
|
| 5 |
+
"media",
|
| 6 |
+
"beacon",
|
| 7 |
+
"object",
|
| 8 |
+
"imageset",
|
| 9 |
+
"texttrack",
|
| 10 |
+
"websocket",
|
| 11 |
+
"csp_report",
|
| 12 |
+
"stylesheet",
|
| 13 |
}
|
| 14 |
|
| 15 |
DEFAULT_STEALTH_FLAGS = (
|
| 16 |
# Explanation: https://peter.sh/experiments/chromium-command-line-switches/
|
| 17 |
# Generally this will make the browser faster and less detectable
|
| 18 |
+
"--no-pings",
|
| 19 |
+
"--incognito",
|
| 20 |
+
"--test-type",
|
| 21 |
+
"--lang=en-US",
|
| 22 |
+
"--mute-audio",
|
| 23 |
+
"--no-first-run",
|
| 24 |
+
"--disable-sync",
|
| 25 |
+
"--hide-scrollbars",
|
| 26 |
+
"--disable-logging",
|
| 27 |
+
"--start-maximized", # For headless check bypass
|
| 28 |
+
"--enable-async-dns",
|
| 29 |
+
"--disable-breakpad",
|
| 30 |
+
"--disable-infobars",
|
| 31 |
+
"--accept-lang=en-US",
|
| 32 |
+
"--use-mock-keychain",
|
| 33 |
+
"--disable-translate",
|
| 34 |
+
"--disable-extensions",
|
| 35 |
+
"--disable-voice-input",
|
| 36 |
+
"--window-position=0,0",
|
| 37 |
+
"--disable-wake-on-wifi",
|
| 38 |
+
"--ignore-gpu-blocklist",
|
| 39 |
+
"--enable-tcp-fast-open",
|
| 40 |
+
"--enable-web-bluetooth",
|
| 41 |
+
"--disable-hang-monitor",
|
| 42 |
+
"--password-store=basic",
|
| 43 |
+
"--disable-cloud-import",
|
| 44 |
+
"--disable-default-apps",
|
| 45 |
+
"--disable-print-preview",
|
| 46 |
+
"--disable-dev-shm-usage",
|
| 47 |
# '--disable-popup-blocking',
|
| 48 |
+
"--metrics-recording-only",
|
| 49 |
+
"--disable-crash-reporter",
|
| 50 |
+
"--disable-partial-raster",
|
| 51 |
+
"--disable-gesture-typing",
|
| 52 |
+
"--disable-checker-imaging",
|
| 53 |
+
"--disable-prompt-on-repost",
|
| 54 |
+
"--force-color-profile=srgb",
|
| 55 |
+
"--font-render-hinting=none",
|
| 56 |
+
"--no-default-browser-check",
|
| 57 |
+
"--aggressive-cache-discard",
|
| 58 |
+
"--disable-component-update",
|
| 59 |
+
"--disable-cookie-encryption",
|
| 60 |
+
"--disable-domain-reliability",
|
| 61 |
+
"--disable-threaded-animation",
|
| 62 |
+
"--disable-threaded-scrolling",
|
| 63 |
# '--disable-reading-from-canvas', # For Firefox
|
| 64 |
+
"--enable-simple-cache-backend",
|
| 65 |
+
"--disable-background-networking",
|
| 66 |
+
"--disable-session-crashed-bubble",
|
| 67 |
+
"--enable-surface-synchronization",
|
| 68 |
+
"--disable-image-animation-resync",
|
| 69 |
+
"--disable-renderer-backgrounding",
|
| 70 |
+
"--disable-ipc-flooding-protection",
|
| 71 |
+
"--prerender-from-omnibox=disabled",
|
| 72 |
+
"--safebrowsing-disable-auto-update",
|
| 73 |
+
"--disable-offer-upload-credit-cards",
|
| 74 |
+
"--disable-features=site-per-process",
|
| 75 |
+
"--disable-background-timer-throttling",
|
| 76 |
+
"--disable-new-content-rendering-timeout",
|
| 77 |
+
"--run-all-compositor-stages-before-draw",
|
| 78 |
+
"--disable-client-side-phishing-detection",
|
| 79 |
+
"--disable-backgrounding-occluded-windows",
|
| 80 |
+
"--disable-layer-tree-host-memory-pressure",
|
| 81 |
+
"--autoplay-policy=no-user-gesture-required",
|
| 82 |
+
"--disable-offer-store-unmasked-wallet-cards",
|
| 83 |
+
"--disable-blink-features=AutomationControlled",
|
| 84 |
+
"--webrtc-ip-handling-policy=disable_non_proxied_udp",
|
| 85 |
+
"--disable-component-extensions-with-background-pages",
|
| 86 |
+
"--force-webrtc-ip-handling-policy=disable_non_proxied_udp",
|
| 87 |
+
"--enable-features=NetworkService,NetworkServiceInProcess,TrustTokens,TrustTokensAlwaysAllowIssuance",
|
| 88 |
+
"--blink-settings=primaryHoverType=2,availableHoverTypes=2,primaryPointerType=4,availablePointerTypes=4",
|
| 89 |
+
"--disable-features=AudioServiceOutOfProcess,IsolateOrigins,site-per-process,TranslateUI,BlinkGenPropertyTrees",
|
| 90 |
)
|
| 91 |
|
| 92 |
# Defaulting to the docker mode, token doesn't matter in it as it's passed for the container
|
|
|
|
| 95 |
"headless": True,
|
| 96 |
"autoClose": True,
|
| 97 |
"fingerprint": {
|
| 98 |
+
"flags": {"timezone": "BasedOnIp", "screen": "Custom"},
|
| 99 |
+
"platform": "linux", # support: windows, mac, linux
|
| 100 |
+
"kernel": "chromium", # only support: chromium
|
| 101 |
+
"kernelMilestone": "128",
|
|
|
|
|
|
|
|
|
|
| 102 |
"hardwareConcurrency": 8,
|
| 103 |
"deviceMemory": 8,
|
| 104 |
},
|
scrapling/engines/pw.py
CHANGED
|
@@ -1,42 +1,46 @@
|
|
| 1 |
import json
|
| 2 |
|
| 3 |
-
from scrapling.core._types import
|
| 4 |
-
SelectorWaitStates, Union)
|
| 5 |
from scrapling.core.utils import log, lru_cache
|
| 6 |
-
from scrapling.engines.constants import
|
| 7 |
-
|
| 8 |
-
|
| 9 |
-
|
| 10 |
-
|
| 11 |
-
|
| 12 |
-
|
| 13 |
-
|
| 14 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
| 15 |
|
| 16 |
|
| 17 |
class PlaywrightEngine:
|
| 18 |
def __init__(
|
| 19 |
-
|
| 20 |
-
|
| 21 |
-
|
| 22 |
-
|
| 23 |
-
|
| 24 |
-
|
| 25 |
-
|
| 26 |
-
|
| 27 |
-
|
| 28 |
-
|
| 29 |
-
|
| 30 |
-
|
| 31 |
-
|
| 32 |
-
|
| 33 |
-
|
| 34 |
-
|
| 35 |
-
|
| 36 |
-
|
| 37 |
-
|
| 38 |
-
|
| 39 |
-
|
|
|
|
| 40 |
):
|
| 41 |
"""An engine that utilizes PlayWright library, check the `PlayWrightFetcher` class for more documentation.
|
| 42 |
|
|
@@ -65,7 +69,7 @@ class PlaywrightEngine:
|
|
| 65 |
:param adaptor_arguments: The arguments that will be passed in the end while creating the final Adaptor's class.
|
| 66 |
"""
|
| 67 |
self.headless = headless
|
| 68 |
-
self.locale = check_type_validity(locale, [str],
|
| 69 |
self.disable_resources = disable_resources
|
| 70 |
self.network_idle = bool(network_idle)
|
| 71 |
self.stealth = bool(stealth)
|
|
@@ -95,8 +99,8 @@ class PlaywrightEngine:
|
|
| 95 |
self.adaptor_arguments = adaptor_arguments if adaptor_arguments else {}
|
| 96 |
self.harmful_default_args = [
|
| 97 |
# This will be ignored to avoid detection more and possibly avoid the popup crashing bug abuse: https://issues.chromium.org/issues/340836884
|
| 98 |
-
|
| 99 |
-
|
| 100 |
# '--disable-component-update',
|
| 101 |
# '--disable-default-apps',
|
| 102 |
# '--disable-extensions',
|
|
@@ -114,12 +118,16 @@ class PlaywrightEngine:
|
|
| 114 |
query = NSTBROWSER_DEFAULT_QUERY.copy()
|
| 115 |
if self.stealth:
|
| 116 |
flags = self.__set_flags()
|
| 117 |
-
query.update(
|
| 118 |
-
|
| 119 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
| 120 |
|
| 121 |
config = {
|
| 122 |
-
|
| 123 |
# 'token': ''
|
| 124 |
}
|
| 125 |
cdp_url = construct_cdp_url(cdp_url, config)
|
|
@@ -134,17 +142,25 @@ class PlaywrightEngine:
|
|
| 134 |
"""Returns the flags that will be used while launching the browser if stealth mode is enabled"""
|
| 135 |
flags = DEFAULT_STEALTH_FLAGS
|
| 136 |
if self.hide_canvas:
|
| 137 |
-
flags += (
|
| 138 |
if self.disable_webgl:
|
| 139 |
-
flags += (
|
|
|
|
|
|
|
|
|
|
|
|
|
| 140 |
|
| 141 |
return flags
|
| 142 |
|
| 143 |
def __launch_kwargs(self):
|
| 144 |
"""Creates the arguments we will use while launching playwright's browser"""
|
| 145 |
-
launch_kwargs = {
|
|
|
|
|
|
|
|
|
|
|
|
|
| 146 |
if self.stealth:
|
| 147 |
-
launch_kwargs.update({
|
| 148 |
|
| 149 |
return launch_kwargs
|
| 150 |
|
|
@@ -153,22 +169,26 @@ class PlaywrightEngine:
|
|
| 153 |
context_kwargs = {
|
| 154 |
"proxy": self.proxy,
|
| 155 |
"locale": self.locale,
|
| 156 |
-
"color_scheme":
|
| 157 |
"device_scale_factor": 2,
|
| 158 |
"extra_http_headers": self.extra_headers if self.extra_headers else {},
|
| 159 |
-
"user_agent": self.useragent
|
|
|
|
|
|
|
| 160 |
}
|
| 161 |
if self.stealth:
|
| 162 |
-
context_kwargs.update(
|
| 163 |
-
|
| 164 |
-
|
| 165 |
-
|
| 166 |
-
|
| 167 |
-
|
| 168 |
-
|
| 169 |
-
|
| 170 |
-
|
| 171 |
-
|
|
|
|
|
|
|
| 172 |
|
| 173 |
return context_kwargs
|
| 174 |
|
|
@@ -184,10 +204,16 @@ class PlaywrightEngine:
|
|
| 184 |
# https://arh.antoinevastel.com/bots/areyouheadless/
|
| 185 |
# https://prescience-data.github.io/execution-monitor.html
|
| 186 |
return tuple(
|
| 187 |
-
js_bypass_path(script)
|
|
|
|
| 188 |
# Order is important
|
| 189 |
-
|
| 190 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 191 |
)
|
| 192 |
)
|
| 193 |
|
|
@@ -200,19 +226,30 @@ class PlaywrightEngine:
|
|
| 200 |
while current_request:
|
| 201 |
try:
|
| 202 |
current_response = current_request.response()
|
| 203 |
-
history.insert(
|
| 204 |
-
|
| 205 |
-
|
| 206 |
-
|
| 207 |
-
|
| 208 |
-
|
| 209 |
-
|
| 210 |
-
|
| 211 |
-
|
| 212 |
-
|
| 213 |
-
|
| 214 |
-
|
| 215 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 216 |
except Exception as e:
|
| 217 |
log.error(f"Error processing redirect: {e}")
|
| 218 |
break
|
|
@@ -232,19 +269,30 @@ class PlaywrightEngine:
|
|
| 232 |
while current_request:
|
| 233 |
try:
|
| 234 |
current_response = await current_request.response()
|
| 235 |
-
history.insert(
|
| 236 |
-
|
| 237 |
-
|
| 238 |
-
|
| 239 |
-
|
| 240 |
-
|
| 241 |
-
|
| 242 |
-
|
| 243 |
-
|
| 244 |
-
|
| 245 |
-
|
| 246 |
-
|
| 247 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 248 |
except Exception as e:
|
| 249 |
log.error(f"Error processing redirect: {e}")
|
| 250 |
break
|
|
@@ -262,6 +310,7 @@ class PlaywrightEngine:
|
|
| 262 |
:return: A `Response` object that is the same as `Adaptor` object except it has these added attributes: `status`, `reason`, `cookies`, `headers`, and `request_headers`
|
| 263 |
"""
|
| 264 |
from playwright.sync_api import Response as PlaywrightResponse
|
|
|
|
| 265 |
if not self.stealth or self.real_chrome:
|
| 266 |
# Because rebrowser_playwright doesn't play well with real browsers
|
| 267 |
from playwright.sync_api import sync_playwright
|
|
@@ -273,7 +322,10 @@ class PlaywrightEngine:
|
|
| 273 |
|
| 274 |
def handle_response(finished_response: PlaywrightResponse):
|
| 275 |
nonlocal final_response
|
| 276 |
-
if
|
|
|
|
|
|
|
|
|
|
| 277 |
final_response = finished_response
|
| 278 |
|
| 279 |
with sync_playwright() as p:
|
|
@@ -304,7 +356,7 @@ class PlaywrightEngine:
|
|
| 304 |
page.wait_for_load_state(state="domcontentloaded")
|
| 305 |
|
| 306 |
if self.network_idle:
|
| 307 |
-
page.wait_for_load_state(
|
| 308 |
|
| 309 |
if self.page_action is not None:
|
| 310 |
try:
|
|
@@ -320,7 +372,7 @@ class PlaywrightEngine:
|
|
| 320 |
page.wait_for_load_state(state="load")
|
| 321 |
page.wait_for_load_state(state="domcontentloaded")
|
| 322 |
if self.network_idle:
|
| 323 |
-
page.wait_for_load_state(
|
| 324 |
except Exception as e:
|
| 325 |
log.error(f"Error waiting for selector {self.wait_selector}: {e}")
|
| 326 |
|
|
@@ -331,9 +383,13 @@ class PlaywrightEngine:
|
|
| 331 |
raise ValueError("Failed to get a response from the page")
|
| 332 |
|
| 333 |
# This will be parsed inside `Response`
|
| 334 |
-
encoding =
|
|
|
|
|
|
|
| 335 |
# PlayWright API sometimes give empty status text for some reason!
|
| 336 |
-
status_text = final_response.status_text or StatusText.get(
|
|
|
|
|
|
|
| 337 |
|
| 338 |
history = self._process_response_history(first_response)
|
| 339 |
try:
|
|
@@ -345,15 +401,17 @@ class PlaywrightEngine:
|
|
| 345 |
response = Response(
|
| 346 |
url=page.url,
|
| 347 |
text=page_content,
|
| 348 |
-
body=page_content.encode(
|
| 349 |
status=final_response.status,
|
| 350 |
reason=status_text,
|
| 351 |
encoding=encoding,
|
| 352 |
-
cookies={
|
|
|
|
|
|
|
| 353 |
headers=first_response.all_headers(),
|
| 354 |
request_headers=first_response.request.all_headers(),
|
| 355 |
history=history,
|
| 356 |
-
**self.adaptor_arguments
|
| 357 |
)
|
| 358 |
page.close()
|
| 359 |
context.close()
|
|
@@ -366,6 +424,7 @@ class PlaywrightEngine:
|
|
| 366 |
:return: A `Response` object that is the same as `Adaptor` object except it has these added attributes: `status`, `reason`, `cookies`, `headers`, and `request_headers`
|
| 367 |
"""
|
| 368 |
from playwright.async_api import Response as PlaywrightResponse
|
|
|
|
| 369 |
if not self.stealth or self.real_chrome:
|
| 370 |
# Because rebrowser_playwright doesn't play well with real browsers
|
| 371 |
from playwright.async_api import async_playwright
|
|
@@ -377,7 +436,10 @@ class PlaywrightEngine:
|
|
| 377 |
|
| 378 |
async def handle_response(finished_response: PlaywrightResponse):
|
| 379 |
nonlocal final_response
|
| 380 |
-
if
|
|
|
|
|
|
|
|
|
|
| 381 |
final_response = finished_response
|
| 382 |
|
| 383 |
async with async_playwright() as p:
|
|
@@ -408,7 +470,7 @@ class PlaywrightEngine:
|
|
| 408 |
await page.wait_for_load_state(state="domcontentloaded")
|
| 409 |
|
| 410 |
if self.network_idle:
|
| 411 |
-
await page.wait_for_load_state(
|
| 412 |
|
| 413 |
if self.page_action is not None:
|
| 414 |
try:
|
|
@@ -424,7 +486,7 @@ class PlaywrightEngine:
|
|
| 424 |
await page.wait_for_load_state(state="load")
|
| 425 |
await page.wait_for_load_state(state="domcontentloaded")
|
| 426 |
if self.network_idle:
|
| 427 |
-
await page.wait_for_load_state(
|
| 428 |
except Exception as e:
|
| 429 |
log.error(f"Error waiting for selector {self.wait_selector}: {e}")
|
| 430 |
|
|
@@ -435,9 +497,13 @@ class PlaywrightEngine:
|
|
| 435 |
raise ValueError("Failed to get a response from the page")
|
| 436 |
|
| 437 |
# This will be parsed inside `Response`
|
| 438 |
-
encoding =
|
|
|
|
|
|
|
| 439 |
# PlayWright API sometimes give empty status text for some reason!
|
| 440 |
-
status_text = final_response.status_text or StatusText.get(
|
|
|
|
|
|
|
| 441 |
|
| 442 |
history = await self._async_process_response_history(first_response)
|
| 443 |
try:
|
|
@@ -449,15 +515,18 @@ class PlaywrightEngine:
|
|
| 449 |
response = Response(
|
| 450 |
url=page.url,
|
| 451 |
text=page_content,
|
| 452 |
-
body=page_content.encode(
|
| 453 |
status=final_response.status,
|
| 454 |
reason=status_text,
|
| 455 |
encoding=encoding,
|
| 456 |
-
cookies={
|
|
|
|
|
|
|
|
|
|
| 457 |
headers=await first_response.all_headers(),
|
| 458 |
request_headers=await first_response.request.all_headers(),
|
| 459 |
history=history,
|
| 460 |
-
**self.adaptor_arguments
|
| 461 |
)
|
| 462 |
await page.close()
|
| 463 |
await context.close()
|
|
|
|
| 1 |
import json
|
| 2 |
|
| 3 |
+
from scrapling.core._types import Callable, Dict, Optional, SelectorWaitStates, Union
|
|
|
|
| 4 |
from scrapling.core.utils import log, lru_cache
|
| 5 |
+
from scrapling.engines.constants import DEFAULT_STEALTH_FLAGS, NSTBROWSER_DEFAULT_QUERY
|
| 6 |
+
from scrapling.engines.toolbelt import (
|
| 7 |
+
Response,
|
| 8 |
+
StatusText,
|
| 9 |
+
async_intercept_route,
|
| 10 |
+
check_type_validity,
|
| 11 |
+
construct_cdp_url,
|
| 12 |
+
construct_proxy_dict,
|
| 13 |
+
generate_convincing_referer,
|
| 14 |
+
generate_headers,
|
| 15 |
+
intercept_route,
|
| 16 |
+
js_bypass_path,
|
| 17 |
+
)
|
| 18 |
|
| 19 |
|
| 20 |
class PlaywrightEngine:
|
| 21 |
def __init__(
|
| 22 |
+
self,
|
| 23 |
+
headless: Union[bool, str] = True,
|
| 24 |
+
disable_resources: bool = False,
|
| 25 |
+
useragent: Optional[str] = None,
|
| 26 |
+
network_idle: bool = False,
|
| 27 |
+
timeout: Optional[float] = 30000,
|
| 28 |
+
wait: Optional[int] = 0,
|
| 29 |
+
page_action: Callable = None,
|
| 30 |
+
wait_selector: Optional[str] = None,
|
| 31 |
+
locale: Optional[str] = "en-US",
|
| 32 |
+
wait_selector_state: SelectorWaitStates = "attached",
|
| 33 |
+
stealth: bool = False,
|
| 34 |
+
real_chrome: bool = False,
|
| 35 |
+
hide_canvas: bool = False,
|
| 36 |
+
disable_webgl: bool = False,
|
| 37 |
+
cdp_url: Optional[str] = None,
|
| 38 |
+
nstbrowser_mode: bool = False,
|
| 39 |
+
nstbrowser_config: Optional[Dict] = None,
|
| 40 |
+
google_search: bool = True,
|
| 41 |
+
extra_headers: Optional[Dict[str, str]] = None,
|
| 42 |
+
proxy: Optional[Union[str, Dict[str, str]]] = None,
|
| 43 |
+
adaptor_arguments: Dict = None,
|
| 44 |
):
|
| 45 |
"""An engine that utilizes PlayWright library, check the `PlayWrightFetcher` class for more documentation.
|
| 46 |
|
|
|
|
| 69 |
:param adaptor_arguments: The arguments that will be passed in the end while creating the final Adaptor's class.
|
| 70 |
"""
|
| 71 |
self.headless = headless
|
| 72 |
+
self.locale = check_type_validity(locale, [str], "en-US", param_name="locale")
|
| 73 |
self.disable_resources = disable_resources
|
| 74 |
self.network_idle = bool(network_idle)
|
| 75 |
self.stealth = bool(stealth)
|
|
|
|
| 99 |
self.adaptor_arguments = adaptor_arguments if adaptor_arguments else {}
|
| 100 |
self.harmful_default_args = [
|
| 101 |
# This will be ignored to avoid detection more and possibly avoid the popup crashing bug abuse: https://issues.chromium.org/issues/340836884
|
| 102 |
+
"--enable-automation",
|
| 103 |
+
"--disable-popup-blocking",
|
| 104 |
# '--disable-component-update',
|
| 105 |
# '--disable-default-apps',
|
| 106 |
# '--disable-extensions',
|
|
|
|
| 118 |
query = NSTBROWSER_DEFAULT_QUERY.copy()
|
| 119 |
if self.stealth:
|
| 120 |
flags = self.__set_flags()
|
| 121 |
+
query.update(
|
| 122 |
+
{
|
| 123 |
+
"args": dict(
|
| 124 |
+
zip(flags, [""] * len(flags))
|
| 125 |
+
), # browser args should be a dictionary
|
| 126 |
+
}
|
| 127 |
+
)
|
| 128 |
|
| 129 |
config = {
|
| 130 |
+
"config": json.dumps(query),
|
| 131 |
# 'token': ''
|
| 132 |
}
|
| 133 |
cdp_url = construct_cdp_url(cdp_url, config)
|
|
|
|
| 142 |
"""Returns the flags that will be used while launching the browser if stealth mode is enabled"""
|
| 143 |
flags = DEFAULT_STEALTH_FLAGS
|
| 144 |
if self.hide_canvas:
|
| 145 |
+
flags += ("--fingerprinting-canvas-image-data-noise",)
|
| 146 |
if self.disable_webgl:
|
| 147 |
+
flags += (
|
| 148 |
+
"--disable-webgl",
|
| 149 |
+
"--disable-webgl-image-chromium",
|
| 150 |
+
"--disable-webgl2",
|
| 151 |
+
)
|
| 152 |
|
| 153 |
return flags
|
| 154 |
|
| 155 |
def __launch_kwargs(self):
|
| 156 |
"""Creates the arguments we will use while launching playwright's browser"""
|
| 157 |
+
launch_kwargs = {
|
| 158 |
+
"headless": self.headless,
|
| 159 |
+
"ignore_default_args": self.harmful_default_args,
|
| 160 |
+
"channel": "chrome" if self.real_chrome else "chromium",
|
| 161 |
+
}
|
| 162 |
if self.stealth:
|
| 163 |
+
launch_kwargs.update({"args": self.__set_flags(), "chromium_sandbox": True})
|
| 164 |
|
| 165 |
return launch_kwargs
|
| 166 |
|
|
|
|
| 169 |
context_kwargs = {
|
| 170 |
"proxy": self.proxy,
|
| 171 |
"locale": self.locale,
|
| 172 |
+
"color_scheme": "dark", # Bypasses the 'prefersLightColor' check in creepjs
|
| 173 |
"device_scale_factor": 2,
|
| 174 |
"extra_http_headers": self.extra_headers if self.extra_headers else {},
|
| 175 |
+
"user_agent": self.useragent
|
| 176 |
+
if self.useragent
|
| 177 |
+
else generate_headers(browser_mode=True).get("User-Agent"),
|
| 178 |
}
|
| 179 |
if self.stealth:
|
| 180 |
+
context_kwargs.update(
|
| 181 |
+
{
|
| 182 |
+
"is_mobile": False,
|
| 183 |
+
"has_touch": False,
|
| 184 |
+
# I'm thinking about disabling it to rest from all Service Workers headache but let's keep it as it is for now
|
| 185 |
+
"service_workers": "allow",
|
| 186 |
+
"ignore_https_errors": True,
|
| 187 |
+
"screen": {"width": 1920, "height": 1080},
|
| 188 |
+
"viewport": {"width": 1920, "height": 1080},
|
| 189 |
+
"permissions": ["geolocation", "notifications"],
|
| 190 |
+
}
|
| 191 |
+
)
|
| 192 |
|
| 193 |
return context_kwargs
|
| 194 |
|
|
|
|
| 204 |
# https://arh.antoinevastel.com/bots/areyouheadless/
|
| 205 |
# https://prescience-data.github.io/execution-monitor.html
|
| 206 |
return tuple(
|
| 207 |
+
js_bypass_path(script)
|
| 208 |
+
for script in (
|
| 209 |
# Order is important
|
| 210 |
+
"webdriver_fully.js",
|
| 211 |
+
"window_chrome.js",
|
| 212 |
+
"navigator_plugins.js",
|
| 213 |
+
"pdf_viewer.js",
|
| 214 |
+
"notification_permission.js",
|
| 215 |
+
"screen_props.js",
|
| 216 |
+
"playwright_fingerprint.js",
|
| 217 |
)
|
| 218 |
)
|
| 219 |
|
|
|
|
| 226 |
while current_request:
|
| 227 |
try:
|
| 228 |
current_response = current_request.response()
|
| 229 |
+
history.insert(
|
| 230 |
+
0,
|
| 231 |
+
Response(
|
| 232 |
+
url=current_request.url,
|
| 233 |
+
# using current_response.text() will trigger "Error: Response.text: Response body is unavailable for redirect responses"
|
| 234 |
+
text="",
|
| 235 |
+
body=b"",
|
| 236 |
+
status=current_response.status if current_response else 301,
|
| 237 |
+
reason=(
|
| 238 |
+
current_response.status_text
|
| 239 |
+
or StatusText.get(current_response.status)
|
| 240 |
+
)
|
| 241 |
+
if current_response
|
| 242 |
+
else StatusText.get(301),
|
| 243 |
+
encoding=current_response.headers.get("content-type", "")
|
| 244 |
+
or "utf-8",
|
| 245 |
+
cookies={},
|
| 246 |
+
headers=current_response.all_headers()
|
| 247 |
+
if current_response
|
| 248 |
+
else {},
|
| 249 |
+
request_headers=current_request.all_headers(),
|
| 250 |
+
**self.adaptor_arguments,
|
| 251 |
+
),
|
| 252 |
+
)
|
| 253 |
except Exception as e:
|
| 254 |
log.error(f"Error processing redirect: {e}")
|
| 255 |
break
|
|
|
|
| 269 |
while current_request:
|
| 270 |
try:
|
| 271 |
current_response = await current_request.response()
|
| 272 |
+
history.insert(
|
| 273 |
+
0,
|
| 274 |
+
Response(
|
| 275 |
+
url=current_request.url,
|
| 276 |
+
# using current_response.text() will trigger "Error: Response.text: Response body is unavailable for redirect responses"
|
| 277 |
+
text="",
|
| 278 |
+
body=b"",
|
| 279 |
+
status=current_response.status if current_response else 301,
|
| 280 |
+
reason=(
|
| 281 |
+
current_response.status_text
|
| 282 |
+
or StatusText.get(current_response.status)
|
| 283 |
+
)
|
| 284 |
+
if current_response
|
| 285 |
+
else StatusText.get(301),
|
| 286 |
+
encoding=current_response.headers.get("content-type", "")
|
| 287 |
+
or "utf-8",
|
| 288 |
+
cookies={},
|
| 289 |
+
headers=await current_response.all_headers()
|
| 290 |
+
if current_response
|
| 291 |
+
else {},
|
| 292 |
+
request_headers=await current_request.all_headers(),
|
| 293 |
+
**self.adaptor_arguments,
|
| 294 |
+
),
|
| 295 |
+
)
|
| 296 |
except Exception as e:
|
| 297 |
log.error(f"Error processing redirect: {e}")
|
| 298 |
break
|
|
|
|
| 310 |
:return: A `Response` object that is the same as `Adaptor` object except it has these added attributes: `status`, `reason`, `cookies`, `headers`, and `request_headers`
|
| 311 |
"""
|
| 312 |
from playwright.sync_api import Response as PlaywrightResponse
|
| 313 |
+
|
| 314 |
if not self.stealth or self.real_chrome:
|
| 315 |
# Because rebrowser_playwright doesn't play well with real browsers
|
| 316 |
from playwright.sync_api import sync_playwright
|
|
|
|
| 322 |
|
| 323 |
def handle_response(finished_response: PlaywrightResponse):
|
| 324 |
nonlocal final_response
|
| 325 |
+
if (
|
| 326 |
+
finished_response.request.resource_type == "document"
|
| 327 |
+
and finished_response.request.is_navigation_request()
|
| 328 |
+
):
|
| 329 |
final_response = finished_response
|
| 330 |
|
| 331 |
with sync_playwright() as p:
|
|
|
|
| 356 |
page.wait_for_load_state(state="domcontentloaded")
|
| 357 |
|
| 358 |
if self.network_idle:
|
| 359 |
+
page.wait_for_load_state("networkidle")
|
| 360 |
|
| 361 |
if self.page_action is not None:
|
| 362 |
try:
|
|
|
|
| 372 |
page.wait_for_load_state(state="load")
|
| 373 |
page.wait_for_load_state(state="domcontentloaded")
|
| 374 |
if self.network_idle:
|
| 375 |
+
page.wait_for_load_state("networkidle")
|
| 376 |
except Exception as e:
|
| 377 |
log.error(f"Error waiting for selector {self.wait_selector}: {e}")
|
| 378 |
|
|
|
|
| 383 |
raise ValueError("Failed to get a response from the page")
|
| 384 |
|
| 385 |
# This will be parsed inside `Response`
|
| 386 |
+
encoding = (
|
| 387 |
+
final_response.headers.get("content-type", "") or "utf-8"
|
| 388 |
+
) # default encoding
|
| 389 |
# PlayWright API sometimes give empty status text for some reason!
|
| 390 |
+
status_text = final_response.status_text or StatusText.get(
|
| 391 |
+
final_response.status
|
| 392 |
+
)
|
| 393 |
|
| 394 |
history = self._process_response_history(first_response)
|
| 395 |
try:
|
|
|
|
| 401 |
response = Response(
|
| 402 |
url=page.url,
|
| 403 |
text=page_content,
|
| 404 |
+
body=page_content.encode("utf-8"),
|
| 405 |
status=final_response.status,
|
| 406 |
reason=status_text,
|
| 407 |
encoding=encoding,
|
| 408 |
+
cookies={
|
| 409 |
+
cookie["name"]: cookie["value"] for cookie in page.context.cookies()
|
| 410 |
+
},
|
| 411 |
headers=first_response.all_headers(),
|
| 412 |
request_headers=first_response.request.all_headers(),
|
| 413 |
history=history,
|
| 414 |
+
**self.adaptor_arguments,
|
| 415 |
)
|
| 416 |
page.close()
|
| 417 |
context.close()
|
|
|
|
| 424 |
:return: A `Response` object that is the same as `Adaptor` object except it has these added attributes: `status`, `reason`, `cookies`, `headers`, and `request_headers`
|
| 425 |
"""
|
| 426 |
from playwright.async_api import Response as PlaywrightResponse
|
| 427 |
+
|
| 428 |
if not self.stealth or self.real_chrome:
|
| 429 |
# Because rebrowser_playwright doesn't play well with real browsers
|
| 430 |
from playwright.async_api import async_playwright
|
|
|
|
| 436 |
|
| 437 |
async def handle_response(finished_response: PlaywrightResponse):
|
| 438 |
nonlocal final_response
|
| 439 |
+
if (
|
| 440 |
+
finished_response.request.resource_type == "document"
|
| 441 |
+
and finished_response.request.is_navigation_request()
|
| 442 |
+
):
|
| 443 |
final_response = finished_response
|
| 444 |
|
| 445 |
async with async_playwright() as p:
|
|
|
|
| 470 |
await page.wait_for_load_state(state="domcontentloaded")
|
| 471 |
|
| 472 |
if self.network_idle:
|
| 473 |
+
await page.wait_for_load_state("networkidle")
|
| 474 |
|
| 475 |
if self.page_action is not None:
|
| 476 |
try:
|
|
|
|
| 486 |
await page.wait_for_load_state(state="load")
|
| 487 |
await page.wait_for_load_state(state="domcontentloaded")
|
| 488 |
if self.network_idle:
|
| 489 |
+
await page.wait_for_load_state("networkidle")
|
| 490 |
except Exception as e:
|
| 491 |
log.error(f"Error waiting for selector {self.wait_selector}: {e}")
|
| 492 |
|
|
|
|
| 497 |
raise ValueError("Failed to get a response from the page")
|
| 498 |
|
| 499 |
# This will be parsed inside `Response`
|
| 500 |
+
encoding = (
|
| 501 |
+
final_response.headers.get("content-type", "") or "utf-8"
|
| 502 |
+
) # default encoding
|
| 503 |
# PlayWright API sometimes give empty status text for some reason!
|
| 504 |
+
status_text = final_response.status_text or StatusText.get(
|
| 505 |
+
final_response.status
|
| 506 |
+
)
|
| 507 |
|
| 508 |
history = await self._async_process_response_history(first_response)
|
| 509 |
try:
|
|
|
|
| 515 |
response = Response(
|
| 516 |
url=page.url,
|
| 517 |
text=page_content,
|
| 518 |
+
body=page_content.encode("utf-8"),
|
| 519 |
status=final_response.status,
|
| 520 |
reason=status_text,
|
| 521 |
encoding=encoding,
|
| 522 |
+
cookies={
|
| 523 |
+
cookie["name"]: cookie["value"]
|
| 524 |
+
for cookie in await page.context.cookies()
|
| 525 |
+
},
|
| 526 |
headers=await first_response.all_headers(),
|
| 527 |
request_headers=await first_response.request.all_headers(),
|
| 528 |
history=history,
|
| 529 |
+
**self.adaptor_arguments,
|
| 530 |
)
|
| 531 |
await page.close()
|
| 532 |
await context.close()
|
scrapling/engines/static.py
CHANGED
|
@@ -10,8 +10,14 @@ from .toolbelt import Response, generate_convincing_referer, generate_headers
|
|
| 10 |
@lru_cache(2, typed=True) # Singleton easily
|
| 11 |
class StaticEngine:
|
| 12 |
def __init__(
|
| 13 |
-
|
| 14 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 15 |
):
|
| 16 |
"""An engine that utilizes httpx library, check the `Fetcher` class for more documentation.
|
| 17 |
|
|
@@ -47,14 +53,22 @@ class StaticEngine:
|
|
| 47 |
if self.stealth:
|
| 48 |
extra_headers = generate_headers(browser_mode=False)
|
| 49 |
# Don't overwrite user supplied headers
|
| 50 |
-
extra_headers = {
|
|
|
|
|
|
|
|
|
|
|
|
|
| 51 |
headers.update(extra_headers)
|
| 52 |
-
if
|
| 53 |
-
headers.update({
|
| 54 |
|
| 55 |
-
elif
|
| 56 |
-
headers[
|
| 57 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
| 58 |
|
| 59 |
return headers
|
| 60 |
|
|
@@ -70,25 +84,43 @@ class StaticEngine:
|
|
| 70 |
body=response.content,
|
| 71 |
status=response.status_code,
|
| 72 |
reason=response.reason_phrase,
|
| 73 |
-
encoding=response.encoding or
|
| 74 |
cookies=dict(response.cookies),
|
| 75 |
headers=dict(response.headers),
|
| 76 |
request_headers=dict(response.request.headers),
|
| 77 |
method=response.request.method,
|
| 78 |
-
history=[
|
| 79 |
-
|
|
|
|
|
|
|
| 80 |
)
|
| 81 |
|
| 82 |
def _make_request(self, method: str, **kwargs) -> Response:
|
| 83 |
-
headers = self._headers_job(kwargs.pop(
|
| 84 |
-
with httpx.Client(
|
| 85 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 86 |
return self._prepare_response(request)
|
| 87 |
|
| 88 |
async def _async_make_request(self, method: str, **kwargs) -> Response:
|
| 89 |
-
headers = self._headers_job(kwargs.pop(
|
| 90 |
-
async with httpx.AsyncClient(
|
| 91 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 92 |
return self._prepare_response(request)
|
| 93 |
|
| 94 |
def get(self, **kwargs: Dict) -> Response:
|
|
@@ -97,7 +129,7 @@ class StaticEngine:
|
|
| 97 |
:param kwargs: Any keyword arguments are passed directly to `httpx.get()` function so check httpx documentation for details.
|
| 98 |
:return: A `Response` object that is the same as `Adaptor` object except it has these added attributes: `status`, `reason`, `cookies`, `headers`, and `request_headers`
|
| 99 |
"""
|
| 100 |
-
return self._make_request(
|
| 101 |
|
| 102 |
async def async_get(self, **kwargs: Dict) -> Response:
|
| 103 |
"""Make basic async HTTP GET request for you but with some added flavors.
|
|
@@ -105,7 +137,7 @@ class StaticEngine:
|
|
| 105 |
:param kwargs: Any keyword arguments are passed directly to `httpx.get()` function so check httpx documentation for details.
|
| 106 |
:return: A `Response` object that is the same as `Adaptor` object except it has these added attributes: `status`, `reason`, `cookies`, `headers`, and `request_headers`
|
| 107 |
"""
|
| 108 |
-
return await self._async_make_request(
|
| 109 |
|
| 110 |
def post(self, **kwargs: Dict) -> Response:
|
| 111 |
"""Make basic HTTP POST request for you but with some added flavors.
|
|
@@ -113,7 +145,7 @@ class StaticEngine:
|
|
| 113 |
:param kwargs: Any keyword arguments are passed directly to `httpx.post()` function so check httpx documentation for details.
|
| 114 |
:return: A `Response` object that is the same as `Adaptor` object except it has these added attributes: `status`, `reason`, `cookies`, `headers`, and `request_headers`
|
| 115 |
"""
|
| 116 |
-
return self._make_request(
|
| 117 |
|
| 118 |
async def async_post(self, **kwargs: Dict) -> Response:
|
| 119 |
"""Make basic async HTTP POST request for you but with some added flavors.
|
|
@@ -121,7 +153,7 @@ class StaticEngine:
|
|
| 121 |
:param kwargs: Any keyword arguments are passed directly to `httpx.post()` function so check httpx documentation for details.
|
| 122 |
:return: A `Response` object that is the same as `Adaptor` object except it has these added attributes: `status`, `reason`, `cookies`, `headers`, and `request_headers`
|
| 123 |
"""
|
| 124 |
-
return await self._async_make_request(
|
| 125 |
|
| 126 |
def delete(self, **kwargs: Dict) -> Response:
|
| 127 |
"""Make basic HTTP DELETE request for you but with some added flavors.
|
|
@@ -129,7 +161,7 @@ class StaticEngine:
|
|
| 129 |
:param kwargs: Any keyword arguments are passed directly to `httpx.delete()` function so check httpx documentation for details.
|
| 130 |
:return: A `Response` object that is the same as `Adaptor` object except it has these added attributes: `status`, `reason`, `cookies`, `headers`, and `request_headers`
|
| 131 |
"""
|
| 132 |
-
return self._make_request(
|
| 133 |
|
| 134 |
async def async_delete(self, **kwargs: Dict) -> Response:
|
| 135 |
"""Make basic async HTTP DELETE request for you but with some added flavors.
|
|
@@ -137,7 +169,7 @@ class StaticEngine:
|
|
| 137 |
:param kwargs: Any keyword arguments are passed directly to `httpx.delete()` function so check httpx documentation for details.
|
| 138 |
:return: A `Response` object that is the same as `Adaptor` object except it has these added attributes: `status`, `reason`, `cookies`, `headers`, and `request_headers`
|
| 139 |
"""
|
| 140 |
-
return await self._async_make_request(
|
| 141 |
|
| 142 |
def put(self, **kwargs: Dict) -> Response:
|
| 143 |
"""Make basic HTTP PUT request for you but with some added flavors.
|
|
@@ -145,7 +177,7 @@ class StaticEngine:
|
|
| 145 |
:param kwargs: Any keyword arguments are passed directly to `httpx.put()` function so check httpx documentation for details.
|
| 146 |
:return: A `Response` object that is the same as `Adaptor` object except it has these added attributes: `status`, `reason`, `cookies`, `headers`, and `request_headers`
|
| 147 |
"""
|
| 148 |
-
return self._make_request(
|
| 149 |
|
| 150 |
async def async_put(self, **kwargs: Dict) -> Response:
|
| 151 |
"""Make basic async HTTP PUT request for you but with some added flavors.
|
|
@@ -153,4 +185,4 @@ class StaticEngine:
|
|
| 153 |
:param kwargs: Any keyword arguments are passed directly to `httpx.put()` function so check httpx documentation for details.
|
| 154 |
:return: A `Response` object that is the same as `Adaptor` object except it has these added attributes: `status`, `reason`, `cookies`, `headers`, and `request_headers`
|
| 155 |
"""
|
| 156 |
-
return await self._async_make_request(
|
|
|
|
| 10 |
@lru_cache(2, typed=True) # Singleton easily
|
| 11 |
class StaticEngine:
|
| 12 |
def __init__(
|
| 13 |
+
self,
|
| 14 |
+
url: str,
|
| 15 |
+
proxy: Optional[str] = None,
|
| 16 |
+
stealthy_headers: bool = True,
|
| 17 |
+
follow_redirects: bool = True,
|
| 18 |
+
timeout: Optional[Union[int, float]] = None,
|
| 19 |
+
retries: Optional[int] = 3,
|
| 20 |
+
adaptor_arguments: Tuple = None,
|
| 21 |
):
|
| 22 |
"""An engine that utilizes httpx library, check the `Fetcher` class for more documentation.
|
| 23 |
|
|
|
|
| 53 |
if self.stealth:
|
| 54 |
extra_headers = generate_headers(browser_mode=False)
|
| 55 |
# Don't overwrite user supplied headers
|
| 56 |
+
extra_headers = {
|
| 57 |
+
key: value
|
| 58 |
+
for key, value in extra_headers.items()
|
| 59 |
+
if key.lower() not in headers_keys
|
| 60 |
+
}
|
| 61 |
headers.update(extra_headers)
|
| 62 |
+
if "referer" not in headers_keys:
|
| 63 |
+
headers.update({"referer": generate_convincing_referer(self.url)})
|
| 64 |
|
| 65 |
+
elif "user-agent" not in headers_keys:
|
| 66 |
+
headers["User-Agent"] = generate_headers(browser_mode=False).get(
|
| 67 |
+
"User-Agent"
|
| 68 |
+
)
|
| 69 |
+
log.debug(
|
| 70 |
+
f"Can't find useragent in headers so '{headers['User-Agent']}' was used."
|
| 71 |
+
)
|
| 72 |
|
| 73 |
return headers
|
| 74 |
|
|
|
|
| 84 |
body=response.content,
|
| 85 |
status=response.status_code,
|
| 86 |
reason=response.reason_phrase,
|
| 87 |
+
encoding=response.encoding or "utf-8",
|
| 88 |
cookies=dict(response.cookies),
|
| 89 |
headers=dict(response.headers),
|
| 90 |
request_headers=dict(response.request.headers),
|
| 91 |
method=response.request.method,
|
| 92 |
+
history=[
|
| 93 |
+
self._prepare_response(redirection) for redirection in response.history
|
| 94 |
+
],
|
| 95 |
+
**self.adaptor_arguments,
|
| 96 |
)
|
| 97 |
|
| 98 |
def _make_request(self, method: str, **kwargs) -> Response:
|
| 99 |
+
headers = self._headers_job(kwargs.pop("headers", {}))
|
| 100 |
+
with httpx.Client(
|
| 101 |
+
proxy=self.proxy, transport=httpx.HTTPTransport(retries=self.retries)
|
| 102 |
+
) as client:
|
| 103 |
+
request = getattr(client, method)(
|
| 104 |
+
url=self.url,
|
| 105 |
+
headers=headers,
|
| 106 |
+
follow_redirects=self.follow_redirects,
|
| 107 |
+
timeout=self.timeout,
|
| 108 |
+
**kwargs,
|
| 109 |
+
)
|
| 110 |
return self._prepare_response(request)
|
| 111 |
|
| 112 |
async def _async_make_request(self, method: str, **kwargs) -> Response:
|
| 113 |
+
headers = self._headers_job(kwargs.pop("headers", {}))
|
| 114 |
+
async with httpx.AsyncClient(
|
| 115 |
+
proxy=self.proxy, transport=httpx.AsyncHTTPTransport(retries=self.retries)
|
| 116 |
+
) as client:
|
| 117 |
+
request = await getattr(client, method)(
|
| 118 |
+
url=self.url,
|
| 119 |
+
headers=headers,
|
| 120 |
+
follow_redirects=self.follow_redirects,
|
| 121 |
+
timeout=self.timeout,
|
| 122 |
+
**kwargs,
|
| 123 |
+
)
|
| 124 |
return self._prepare_response(request)
|
| 125 |
|
| 126 |
def get(self, **kwargs: Dict) -> Response:
|
|
|
|
| 129 |
:param kwargs: Any keyword arguments are passed directly to `httpx.get()` function so check httpx documentation for details.
|
| 130 |
:return: A `Response` object that is the same as `Adaptor` object except it has these added attributes: `status`, `reason`, `cookies`, `headers`, and `request_headers`
|
| 131 |
"""
|
| 132 |
+
return self._make_request("get", **kwargs)
|
| 133 |
|
| 134 |
async def async_get(self, **kwargs: Dict) -> Response:
|
| 135 |
"""Make basic async HTTP GET request for you but with some added flavors.
|
|
|
|
| 137 |
:param kwargs: Any keyword arguments are passed directly to `httpx.get()` function so check httpx documentation for details.
|
| 138 |
:return: A `Response` object that is the same as `Adaptor` object except it has these added attributes: `status`, `reason`, `cookies`, `headers`, and `request_headers`
|
| 139 |
"""
|
| 140 |
+
return await self._async_make_request("get", **kwargs)
|
| 141 |
|
| 142 |
def post(self, **kwargs: Dict) -> Response:
|
| 143 |
"""Make basic HTTP POST request for you but with some added flavors.
|
|
|
|
| 145 |
:param kwargs: Any keyword arguments are passed directly to `httpx.post()` function so check httpx documentation for details.
|
| 146 |
:return: A `Response` object that is the same as `Adaptor` object except it has these added attributes: `status`, `reason`, `cookies`, `headers`, and `request_headers`
|
| 147 |
"""
|
| 148 |
+
return self._make_request("post", **kwargs)
|
| 149 |
|
| 150 |
async def async_post(self, **kwargs: Dict) -> Response:
|
| 151 |
"""Make basic async HTTP POST request for you but with some added flavors.
|
|
|
|
| 153 |
:param kwargs: Any keyword arguments are passed directly to `httpx.post()` function so check httpx documentation for details.
|
| 154 |
:return: A `Response` object that is the same as `Adaptor` object except it has these added attributes: `status`, `reason`, `cookies`, `headers`, and `request_headers`
|
| 155 |
"""
|
| 156 |
+
return await self._async_make_request("post", **kwargs)
|
| 157 |
|
| 158 |
def delete(self, **kwargs: Dict) -> Response:
|
| 159 |
"""Make basic HTTP DELETE request for you but with some added flavors.
|
|
|
|
| 161 |
:param kwargs: Any keyword arguments are passed directly to `httpx.delete()` function so check httpx documentation for details.
|
| 162 |
:return: A `Response` object that is the same as `Adaptor` object except it has these added attributes: `status`, `reason`, `cookies`, `headers`, and `request_headers`
|
| 163 |
"""
|
| 164 |
+
return self._make_request("delete", **kwargs)
|
| 165 |
|
| 166 |
async def async_delete(self, **kwargs: Dict) -> Response:
|
| 167 |
"""Make basic async HTTP DELETE request for you but with some added flavors.
|
|
|
|
| 169 |
:param kwargs: Any keyword arguments are passed directly to `httpx.delete()` function so check httpx documentation for details.
|
| 170 |
:return: A `Response` object that is the same as `Adaptor` object except it has these added attributes: `status`, `reason`, `cookies`, `headers`, and `request_headers`
|
| 171 |
"""
|
| 172 |
+
return await self._async_make_request("delete", **kwargs)
|
| 173 |
|
| 174 |
def put(self, **kwargs: Dict) -> Response:
|
| 175 |
"""Make basic HTTP PUT request for you but with some added flavors.
|
|
|
|
| 177 |
:param kwargs: Any keyword arguments are passed directly to `httpx.put()` function so check httpx documentation for details.
|
| 178 |
:return: A `Response` object that is the same as `Adaptor` object except it has these added attributes: `status`, `reason`, `cookies`, `headers`, and `request_headers`
|
| 179 |
"""
|
| 180 |
+
return self._make_request("put", **kwargs)
|
| 181 |
|
| 182 |
async def async_put(self, **kwargs: Dict) -> Response:
|
| 183 |
"""Make basic async HTTP PUT request for you but with some added flavors.
|
|
|
|
| 185 |
:param kwargs: Any keyword arguments are passed directly to `httpx.put()` function so check httpx documentation for details.
|
| 186 |
:return: A `Response` object that is the same as `Adaptor` object except it has these added attributes: `status`, `reason`, `cookies`, `headers`, and `request_headers`
|
| 187 |
"""
|
| 188 |
+
return await self._async_make_request("put", **kwargs)
|
scrapling/engines/toolbelt/__init__.py
CHANGED
|
@@ -1,6 +1,16 @@
|
|
| 1 |
-
from .custom import (
|
| 2 |
-
|
| 3 |
-
|
| 4 |
-
|
| 5 |
-
|
| 6 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
from .custom import (
|
| 2 |
+
BaseFetcher,
|
| 3 |
+
Response,
|
| 4 |
+
StatusText,
|
| 5 |
+
check_if_engine_usable,
|
| 6 |
+
check_type_validity,
|
| 7 |
+
get_variable_name,
|
| 8 |
+
)
|
| 9 |
+
from .fingerprints import generate_convincing_referer, generate_headers, get_os_name
|
| 10 |
+
from .navigation import (
|
| 11 |
+
async_intercept_route,
|
| 12 |
+
construct_cdp_url,
|
| 13 |
+
construct_proxy_dict,
|
| 14 |
+
intercept_route,
|
| 15 |
+
js_bypass_path,
|
| 16 |
+
)
|
scrapling/engines/toolbelt/custom.py
CHANGED
|
@@ -1,11 +1,20 @@
|
|
| 1 |
"""
|
| 2 |
Functions related to custom types or type checking
|
| 3 |
"""
|
|
|
|
| 4 |
import inspect
|
| 5 |
from email.message import Message
|
| 6 |
|
| 7 |
-
from scrapling.core._types import (
|
| 8 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 9 |
from scrapling.core.custom_types import MappingProxyType
|
| 10 |
from scrapling.core.utils import log, lru_cache
|
| 11 |
from scrapling.parser import Adaptor, SQLiteStorageSystem
|
|
@@ -13,7 +22,12 @@ from scrapling.parser import Adaptor, SQLiteStorageSystem
|
|
| 13 |
|
| 14 |
class ResponseEncoding:
|
| 15 |
__DEFAULT_ENCODING = "utf-8"
|
| 16 |
-
__ISO_8859_1_CONTENT_TYPES = {
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 17 |
|
| 18 |
@classmethod
|
| 19 |
@lru_cache(maxsize=128)
|
|
@@ -27,19 +41,21 @@ class ResponseEncoding:
|
|
| 27 |
"""
|
| 28 |
# Create a Message object and set the Content-Type header then get the content type and parameters
|
| 29 |
msg = Message()
|
| 30 |
-
msg[
|
| 31 |
|
| 32 |
content_type = msg.get_content_type()
|
| 33 |
params = dict(msg.get_params(failobj=[]))
|
| 34 |
|
| 35 |
# Remove the content-type from params if present somehow
|
| 36 |
-
params.pop(
|
| 37 |
|
| 38 |
return content_type, params
|
| 39 |
|
| 40 |
@classmethod
|
| 41 |
@lru_cache(maxsize=128)
|
| 42 |
-
def get_value(
|
|
|
|
|
|
|
| 43 |
"""Determine the appropriate character encoding from a content-type header.
|
| 44 |
|
| 45 |
The encoding is determined by these rules in order:
|
|
@@ -72,7 +88,9 @@ class ResponseEncoding:
|
|
| 72 |
encoding = cls.__DEFAULT_ENCODING
|
| 73 |
|
| 74 |
if encoding:
|
| 75 |
-
_ = text.encode(
|
|
|
|
|
|
|
| 76 |
return encoding
|
| 77 |
|
| 78 |
return cls.__DEFAULT_ENCODING
|
|
@@ -84,9 +102,22 @@ class ResponseEncoding:
|
|
| 84 |
class Response(Adaptor):
|
| 85 |
"""This class is returned by all engines as a way to unify response type between different libraries."""
|
| 86 |
|
| 87 |
-
def __init__(
|
| 88 |
-
|
| 89 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 90 |
self.status = status
|
| 91 |
self.reason = reason
|
| 92 |
self.cookies = cookies
|
|
@@ -94,11 +125,19 @@ class Response(Adaptor):
|
|
| 94 |
self.request_headers = request_headers
|
| 95 |
self.history = history or []
|
| 96 |
encoding = ResponseEncoding.get_value(encoding, text)
|
| 97 |
-
super().__init__(
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 98 |
# For back-ward compatibility
|
| 99 |
self.adaptor = self
|
| 100 |
# For easier debugging while working from a Python shell
|
| 101 |
-
log.info(
|
|
|
|
|
|
|
| 102 |
|
| 103 |
# def __repr__(self):
|
| 104 |
# return f'<{self.__class__.__name__} [{self.status} {self.reason}]>'
|
|
@@ -113,16 +152,26 @@ class BaseFetcher:
|
|
| 113 |
storage_args: Optional[Dict] = None
|
| 114 |
keep_comments: Optional[bool] = False
|
| 115 |
automatch_domain: Optional[str] = None
|
| 116 |
-
parser_keywords: Tuple = (
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 117 |
|
| 118 |
def __init__(self, *args, **kwargs):
|
| 119 |
# For backward-compatibility before 0.2.99
|
| 120 |
-
args_str = ", ".join(args) or
|
| 121 |
-
kwargs_str = ", ".join(f
|
| 122 |
if args_str:
|
| 123 |
-
args_str +=
|
| 124 |
|
| 125 |
-
log.warning(
|
|
|
|
|
|
|
| 126 |
pass
|
| 127 |
|
| 128 |
@classmethod
|
|
@@ -150,12 +199,18 @@ class BaseFetcher:
|
|
| 150 |
setattr(cls, key, value)
|
| 151 |
else:
|
| 152 |
# Yup, no fun allowed LOL
|
| 153 |
-
raise AttributeError(
|
|
|
|
|
|
|
| 154 |
else:
|
| 155 |
-
raise ValueError(
|
|
|
|
|
|
|
| 156 |
|
| 157 |
if not kwargs:
|
| 158 |
-
raise AttributeError(
|
|
|
|
|
|
|
| 159 |
|
| 160 |
@classmethod
|
| 161 |
def _generate_parser_arguments(cls) -> Dict:
|
|
@@ -167,13 +222,15 @@ class BaseFetcher:
|
|
| 167 |
keep_cdata=cls.keep_cdata,
|
| 168 |
auto_match=cls.auto_match,
|
| 169 |
storage=cls.storage,
|
| 170 |
-
storage_args=cls.storage_args
|
| 171 |
)
|
| 172 |
if cls.automatch_domain:
|
| 173 |
if type(cls.automatch_domain) is not str:
|
| 174 |
-
log.warning(
|
|
|
|
|
|
|
| 175 |
else:
|
| 176 |
-
parser_arguments.update({
|
| 177 |
|
| 178 |
return parser_arguments
|
| 179 |
|
|
@@ -181,72 +238,75 @@ class BaseFetcher:
|
|
| 181 |
class StatusText:
|
| 182 |
"""A class that gets the status text of response status code.
|
| 183 |
|
| 184 |
-
|
| 185 |
"""
|
| 186 |
-
|
| 187 |
-
|
| 188 |
-
|
| 189 |
-
|
| 190 |
-
|
| 191 |
-
|
| 192 |
-
|
| 193 |
-
|
| 194 |
-
|
| 195 |
-
|
| 196 |
-
|
| 197 |
-
|
| 198 |
-
|
| 199 |
-
|
| 200 |
-
|
| 201 |
-
|
| 202 |
-
|
| 203 |
-
|
| 204 |
-
|
| 205 |
-
|
| 206 |
-
|
| 207 |
-
|
| 208 |
-
|
| 209 |
-
|
| 210 |
-
|
| 211 |
-
|
| 212 |
-
|
| 213 |
-
|
| 214 |
-
|
| 215 |
-
|
| 216 |
-
|
| 217 |
-
|
| 218 |
-
|
| 219 |
-
|
| 220 |
-
|
| 221 |
-
|
| 222 |
-
|
| 223 |
-
|
| 224 |
-
|
| 225 |
-
|
| 226 |
-
|
| 227 |
-
|
| 228 |
-
|
| 229 |
-
|
| 230 |
-
|
| 231 |
-
|
| 232 |
-
|
| 233 |
-
|
| 234 |
-
|
| 235 |
-
|
| 236 |
-
|
| 237 |
-
|
| 238 |
-
|
| 239 |
-
|
| 240 |
-
|
| 241 |
-
|
| 242 |
-
|
| 243 |
-
|
| 244 |
-
|
| 245 |
-
|
| 246 |
-
|
| 247 |
-
|
| 248 |
-
|
| 249 |
-
|
|
|
|
|
|
|
|
|
|
| 250 |
|
| 251 |
@classmethod
|
| 252 |
@lru_cache(maxsize=128)
|
|
@@ -265,20 +325,26 @@ def check_if_engine_usable(engine: Callable) -> Union[Callable, None]:
|
|
| 265 |
# if isinstance(engine, type):
|
| 266 |
# raise TypeError("Expected an engine instance, not a class definition of the engine")
|
| 267 |
|
| 268 |
-
if hasattr(engine,
|
| 269 |
fetch_function = getattr(engine, "fetch")
|
| 270 |
if callable(fetch_function):
|
| 271 |
if len(inspect.signature(fetch_function).parameters) > 0:
|
| 272 |
return engine
|
| 273 |
else:
|
| 274 |
# raise TypeError("Engine class instance must have a callable method 'fetch' with the first argument used for the url.")
|
| 275 |
-
raise TypeError(
|
|
|
|
|
|
|
| 276 |
else:
|
| 277 |
# raise TypeError("Invalid engine instance! Engine class must have a callable method 'fetch'")
|
| 278 |
-
raise TypeError(
|
|
|
|
|
|
|
| 279 |
else:
|
| 280 |
# raise TypeError("Invalid engine instance! Engine class must have the method 'fetch'")
|
| 281 |
-
raise TypeError(
|
|
|
|
|
|
|
| 282 |
|
| 283 |
|
| 284 |
def get_variable_name(var: Any) -> Optional[str]:
|
|
@@ -293,7 +359,13 @@ def get_variable_name(var: Any) -> Optional[str]:
|
|
| 293 |
return None
|
| 294 |
|
| 295 |
|
| 296 |
-
def check_type_validity(
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 297 |
"""Check if a variable matches the specified type constraints.
|
| 298 |
:param variable: The variable to check
|
| 299 |
:param valid_types: List of valid types for the variable
|
|
@@ -316,7 +388,7 @@ def check_type_validity(variable: Any, valid_types: Union[List[Type], None], def
|
|
| 316 |
error_msg = f'Argument "{var_name}" cannot be None'
|
| 317 |
if critical:
|
| 318 |
raise TypeError(error_msg)
|
| 319 |
-
log.error(f
|
| 320 |
return default_value
|
| 321 |
|
| 322 |
# If no valid_types specified and variable has a value, return it
|
|
@@ -329,7 +401,7 @@ def check_type_validity(variable: Any, valid_types: Union[List[Type], None], def
|
|
| 329 |
error_msg = f'Argument "{var_name}" must be of type {" or ".join(type_names)}'
|
| 330 |
if critical:
|
| 331 |
raise TypeError(error_msg)
|
| 332 |
-
log.error(f
|
| 333 |
return default_value
|
| 334 |
|
| 335 |
return variable
|
|
|
|
| 1 |
"""
|
| 2 |
Functions related to custom types or type checking
|
| 3 |
"""
|
| 4 |
+
|
| 5 |
import inspect
|
| 6 |
from email.message import Message
|
| 7 |
|
| 8 |
+
from scrapling.core._types import (
|
| 9 |
+
Any,
|
| 10 |
+
Callable,
|
| 11 |
+
Dict,
|
| 12 |
+
List,
|
| 13 |
+
Optional,
|
| 14 |
+
Tuple,
|
| 15 |
+
Type,
|
| 16 |
+
Union,
|
| 17 |
+
)
|
| 18 |
from scrapling.core.custom_types import MappingProxyType
|
| 19 |
from scrapling.core.utils import log, lru_cache
|
| 20 |
from scrapling.parser import Adaptor, SQLiteStorageSystem
|
|
|
|
| 22 |
|
| 23 |
class ResponseEncoding:
|
| 24 |
__DEFAULT_ENCODING = "utf-8"
|
| 25 |
+
__ISO_8859_1_CONTENT_TYPES = {
|
| 26 |
+
"text/plain",
|
| 27 |
+
"text/html",
|
| 28 |
+
"text/css",
|
| 29 |
+
"text/javascript",
|
| 30 |
+
}
|
| 31 |
|
| 32 |
@classmethod
|
| 33 |
@lru_cache(maxsize=128)
|
|
|
|
| 41 |
"""
|
| 42 |
# Create a Message object and set the Content-Type header then get the content type and parameters
|
| 43 |
msg = Message()
|
| 44 |
+
msg["content-type"] = header_value
|
| 45 |
|
| 46 |
content_type = msg.get_content_type()
|
| 47 |
params = dict(msg.get_params(failobj=[]))
|
| 48 |
|
| 49 |
# Remove the content-type from params if present somehow
|
| 50 |
+
params.pop("content-type", None)
|
| 51 |
|
| 52 |
return content_type, params
|
| 53 |
|
| 54 |
@classmethod
|
| 55 |
@lru_cache(maxsize=128)
|
| 56 |
+
def get_value(
|
| 57 |
+
cls, content_type: Optional[str], text: Optional[str] = "test"
|
| 58 |
+
) -> str:
|
| 59 |
"""Determine the appropriate character encoding from a content-type header.
|
| 60 |
|
| 61 |
The encoding is determined by these rules in order:
|
|
|
|
| 88 |
encoding = cls.__DEFAULT_ENCODING
|
| 89 |
|
| 90 |
if encoding:
|
| 91 |
+
_ = text.encode(
|
| 92 |
+
encoding
|
| 93 |
+
) # Validate encoding and validate it can encode the given text
|
| 94 |
return encoding
|
| 95 |
|
| 96 |
return cls.__DEFAULT_ENCODING
|
|
|
|
| 102 |
class Response(Adaptor):
|
| 103 |
"""This class is returned by all engines as a way to unify response type between different libraries."""
|
| 104 |
|
| 105 |
+
def __init__(
|
| 106 |
+
self,
|
| 107 |
+
url: str,
|
| 108 |
+
text: str,
|
| 109 |
+
body: bytes,
|
| 110 |
+
status: int,
|
| 111 |
+
reason: str,
|
| 112 |
+
cookies: Dict,
|
| 113 |
+
headers: Dict,
|
| 114 |
+
request_headers: Dict,
|
| 115 |
+
encoding: str = "utf-8",
|
| 116 |
+
method: str = "GET",
|
| 117 |
+
history: List = None,
|
| 118 |
+
**adaptor_arguments: Dict,
|
| 119 |
+
):
|
| 120 |
+
automatch_domain = adaptor_arguments.pop("automatch_domain", None)
|
| 121 |
self.status = status
|
| 122 |
self.reason = reason
|
| 123 |
self.cookies = cookies
|
|
|
|
| 125 |
self.request_headers = request_headers
|
| 126 |
self.history = history or []
|
| 127 |
encoding = ResponseEncoding.get_value(encoding, text)
|
| 128 |
+
super().__init__(
|
| 129 |
+
text=text,
|
| 130 |
+
body=body,
|
| 131 |
+
url=automatch_domain or url,
|
| 132 |
+
encoding=encoding,
|
| 133 |
+
**adaptor_arguments,
|
| 134 |
+
)
|
| 135 |
# For back-ward compatibility
|
| 136 |
self.adaptor = self
|
| 137 |
# For easier debugging while working from a Python shell
|
| 138 |
+
log.info(
|
| 139 |
+
f"Fetched ({status}) <{method} {url}> (referer: {request_headers.get('referer')})"
|
| 140 |
+
)
|
| 141 |
|
| 142 |
# def __repr__(self):
|
| 143 |
# return f'<{self.__class__.__name__} [{self.status} {self.reason}]>'
|
|
|
|
| 152 |
storage_args: Optional[Dict] = None
|
| 153 |
keep_comments: Optional[bool] = False
|
| 154 |
automatch_domain: Optional[str] = None
|
| 155 |
+
parser_keywords: Tuple = (
|
| 156 |
+
"huge_tree",
|
| 157 |
+
"auto_match",
|
| 158 |
+
"storage",
|
| 159 |
+
"keep_cdata",
|
| 160 |
+
"storage_args",
|
| 161 |
+
"keep_comments",
|
| 162 |
+
"automatch_domain",
|
| 163 |
+
) # Left open for the user
|
| 164 |
|
| 165 |
def __init__(self, *args, **kwargs):
|
| 166 |
# For backward-compatibility before 0.2.99
|
| 167 |
+
args_str = ", ".join(args) or ""
|
| 168 |
+
kwargs_str = ", ".join(f"{k}={v}" for k, v in kwargs.items()) or ""
|
| 169 |
if args_str:
|
| 170 |
+
args_str += ", "
|
| 171 |
|
| 172 |
+
log.warning(
|
| 173 |
+
f"This logic is deprecated now, and have no effect; It will be removed with v0.3. Use `{self.__class__.__name__}.configure({args_str}{kwargs_str})` instead before fetching"
|
| 174 |
+
)
|
| 175 |
pass
|
| 176 |
|
| 177 |
@classmethod
|
|
|
|
| 199 |
setattr(cls, key, value)
|
| 200 |
else:
|
| 201 |
# Yup, no fun allowed LOL
|
| 202 |
+
raise AttributeError(
|
| 203 |
+
f'Unknown parser argument: "{key}"; maybe you meant {cls.parser_keywords}?'
|
| 204 |
+
)
|
| 205 |
else:
|
| 206 |
+
raise ValueError(
|
| 207 |
+
f'Unknown parser argument: "{key}"; maybe you meant {cls.parser_keywords}?'
|
| 208 |
+
)
|
| 209 |
|
| 210 |
if not kwargs:
|
| 211 |
+
raise AttributeError(
|
| 212 |
+
f"You must pass a keyword to configure, current keywords: {cls.parser_keywords}?"
|
| 213 |
+
)
|
| 214 |
|
| 215 |
@classmethod
|
| 216 |
def _generate_parser_arguments(cls) -> Dict:
|
|
|
|
| 222 |
keep_cdata=cls.keep_cdata,
|
| 223 |
auto_match=cls.auto_match,
|
| 224 |
storage=cls.storage,
|
| 225 |
+
storage_args=cls.storage_args,
|
| 226 |
)
|
| 227 |
if cls.automatch_domain:
|
| 228 |
if type(cls.automatch_domain) is not str:
|
| 229 |
+
log.warning(
|
| 230 |
+
'[Ignored] The argument "automatch_domain" must be of string type'
|
| 231 |
+
)
|
| 232 |
else:
|
| 233 |
+
parser_arguments.update({"automatch_domain": cls.automatch_domain})
|
| 234 |
|
| 235 |
return parser_arguments
|
| 236 |
|
|
|
|
| 238 |
class StatusText:
|
| 239 |
"""A class that gets the status text of response status code.
|
| 240 |
|
| 241 |
+
Reference: https://developer.mozilla.org/en-US/docs/Web/HTTP/Status
|
| 242 |
"""
|
| 243 |
+
|
| 244 |
+
_phrases = MappingProxyType(
|
| 245 |
+
{
|
| 246 |
+
100: "Continue",
|
| 247 |
+
101: "Switching Protocols",
|
| 248 |
+
102: "Processing",
|
| 249 |
+
103: "Early Hints",
|
| 250 |
+
200: "OK",
|
| 251 |
+
201: "Created",
|
| 252 |
+
202: "Accepted",
|
| 253 |
+
203: "Non-Authoritative Information",
|
| 254 |
+
204: "No Content",
|
| 255 |
+
205: "Reset Content",
|
| 256 |
+
206: "Partial Content",
|
| 257 |
+
207: "Multi-Status",
|
| 258 |
+
208: "Already Reported",
|
| 259 |
+
226: "IM Used",
|
| 260 |
+
300: "Multiple Choices",
|
| 261 |
+
301: "Moved Permanently",
|
| 262 |
+
302: "Found",
|
| 263 |
+
303: "See Other",
|
| 264 |
+
304: "Not Modified",
|
| 265 |
+
305: "Use Proxy",
|
| 266 |
+
307: "Temporary Redirect",
|
| 267 |
+
308: "Permanent Redirect",
|
| 268 |
+
400: "Bad Request",
|
| 269 |
+
401: "Unauthorized",
|
| 270 |
+
402: "Payment Required",
|
| 271 |
+
403: "Forbidden",
|
| 272 |
+
404: "Not Found",
|
| 273 |
+
405: "Method Not Allowed",
|
| 274 |
+
406: "Not Acceptable",
|
| 275 |
+
407: "Proxy Authentication Required",
|
| 276 |
+
408: "Request Timeout",
|
| 277 |
+
409: "Conflict",
|
| 278 |
+
410: "Gone",
|
| 279 |
+
411: "Length Required",
|
| 280 |
+
412: "Precondition Failed",
|
| 281 |
+
413: "Payload Too Large",
|
| 282 |
+
414: "URI Too Long",
|
| 283 |
+
415: "Unsupported Media Type",
|
| 284 |
+
416: "Range Not Satisfiable",
|
| 285 |
+
417: "Expectation Failed",
|
| 286 |
+
418: "I'm a teapot",
|
| 287 |
+
421: "Misdirected Request",
|
| 288 |
+
422: "Unprocessable Entity",
|
| 289 |
+
423: "Locked",
|
| 290 |
+
424: "Failed Dependency",
|
| 291 |
+
425: "Too Early",
|
| 292 |
+
426: "Upgrade Required",
|
| 293 |
+
428: "Precondition Required",
|
| 294 |
+
429: "Too Many Requests",
|
| 295 |
+
431: "Request Header Fields Too Large",
|
| 296 |
+
451: "Unavailable For Legal Reasons",
|
| 297 |
+
500: "Internal Server Error",
|
| 298 |
+
501: "Not Implemented",
|
| 299 |
+
502: "Bad Gateway",
|
| 300 |
+
503: "Service Unavailable",
|
| 301 |
+
504: "Gateway Timeout",
|
| 302 |
+
505: "HTTP Version Not Supported",
|
| 303 |
+
506: "Variant Also Negotiates",
|
| 304 |
+
507: "Insufficient Storage",
|
| 305 |
+
508: "Loop Detected",
|
| 306 |
+
510: "Not Extended",
|
| 307 |
+
511: "Network Authentication Required",
|
| 308 |
+
}
|
| 309 |
+
)
|
| 310 |
|
| 311 |
@classmethod
|
| 312 |
@lru_cache(maxsize=128)
|
|
|
|
| 325 |
# if isinstance(engine, type):
|
| 326 |
# raise TypeError("Expected an engine instance, not a class definition of the engine")
|
| 327 |
|
| 328 |
+
if hasattr(engine, "fetch"):
|
| 329 |
fetch_function = getattr(engine, "fetch")
|
| 330 |
if callable(fetch_function):
|
| 331 |
if len(inspect.signature(fetch_function).parameters) > 0:
|
| 332 |
return engine
|
| 333 |
else:
|
| 334 |
# raise TypeError("Engine class instance must have a callable method 'fetch' with the first argument used for the url.")
|
| 335 |
+
raise TypeError(
|
| 336 |
+
"Engine class must have a callable method 'fetch' with the first argument used for the url."
|
| 337 |
+
)
|
| 338 |
else:
|
| 339 |
# raise TypeError("Invalid engine instance! Engine class must have a callable method 'fetch'")
|
| 340 |
+
raise TypeError(
|
| 341 |
+
"Invalid engine class! Engine class must have a callable method 'fetch'"
|
| 342 |
+
)
|
| 343 |
else:
|
| 344 |
# raise TypeError("Invalid engine instance! Engine class must have the method 'fetch'")
|
| 345 |
+
raise TypeError(
|
| 346 |
+
"Invalid engine class! Engine class must have the method 'fetch'"
|
| 347 |
+
)
|
| 348 |
|
| 349 |
|
| 350 |
def get_variable_name(var: Any) -> Optional[str]:
|
|
|
|
| 359 |
return None
|
| 360 |
|
| 361 |
|
| 362 |
+
def check_type_validity(
|
| 363 |
+
variable: Any,
|
| 364 |
+
valid_types: Union[List[Type], None],
|
| 365 |
+
default_value: Any = None,
|
| 366 |
+
critical: bool = False,
|
| 367 |
+
param_name: Optional[str] = None,
|
| 368 |
+
) -> Any:
|
| 369 |
"""Check if a variable matches the specified type constraints.
|
| 370 |
:param variable: The variable to check
|
| 371 |
:param valid_types: List of valid types for the variable
|
|
|
|
| 388 |
error_msg = f'Argument "{var_name}" cannot be None'
|
| 389 |
if critical:
|
| 390 |
raise TypeError(error_msg)
|
| 391 |
+
log.error(f"[Ignored] {error_msg}")
|
| 392 |
return default_value
|
| 393 |
|
| 394 |
# If no valid_types specified and variable has a value, return it
|
|
|
|
| 401 |
error_msg = f'Argument "{var_name}" must be of type {" or ".join(type_names)}'
|
| 402 |
if critical:
|
| 403 |
raise TypeError(error_msg)
|
| 404 |
+
log.error(f"[Ignored] {error_msg}")
|
| 405 |
return default_value
|
| 406 |
|
| 407 |
return variable
|
scrapling/engines/toolbelt/fingerprints.py
CHANGED
|
@@ -23,7 +23,7 @@ def generate_convincing_referer(url: str) -> str:
|
|
| 23 |
:return: Google's search URL of the domain name
|
| 24 |
"""
|
| 25 |
website_name = extract(url).domain
|
| 26 |
-
return f
|
| 27 |
|
| 28 |
|
| 29 |
@lru_cache(1, typed=True)
|
|
@@ -35,11 +35,11 @@ def get_os_name() -> Union[str, None]:
|
|
| 35 |
#
|
| 36 |
os_name = platform.system()
|
| 37 |
return {
|
| 38 |
-
|
| 39 |
-
|
| 40 |
-
|
| 41 |
# For the future? because why not
|
| 42 |
-
|
| 43 |
}.get(os_name)
|
| 44 |
|
| 45 |
|
|
@@ -50,9 +50,9 @@ def generate_suitable_fingerprint() -> Fingerprint:
|
|
| 50 |
:return: `Fingerprint` object
|
| 51 |
"""
|
| 52 |
return FingerprintGenerator(
|
| 53 |
-
browser=[Browser(name=
|
| 54 |
os=get_os_name(), # None is ignored
|
| 55 |
-
device=
|
| 56 |
).generate()
|
| 57 |
|
| 58 |
|
|
@@ -67,15 +67,15 @@ def generate_headers(browser_mode: bool = False) -> Dict:
|
|
| 67 |
# So we don't raise any inconsistency red flags while websites fingerprinting us
|
| 68 |
os_name = get_os_name()
|
| 69 |
return HeaderGenerator(
|
| 70 |
-
browser=[Browser(name=
|
| 71 |
os=os_name, # None is ignored
|
| 72 |
-
device=
|
| 73 |
).generate()
|
| 74 |
else:
|
| 75 |
# Here it's used for normal requests that aren't done through browsers so we can take it lightly
|
| 76 |
browsers = [
|
| 77 |
-
Browser(name=
|
| 78 |
-
Browser(name=
|
| 79 |
-
Browser(name=
|
| 80 |
]
|
| 81 |
-
return HeaderGenerator(browser=browsers, device=
|
|
|
|
| 23 |
:return: Google's search URL of the domain name
|
| 24 |
"""
|
| 25 |
website_name = extract(url).domain
|
| 26 |
+
return f"https://www.google.com/search?q={website_name}"
|
| 27 |
|
| 28 |
|
| 29 |
@lru_cache(1, typed=True)
|
|
|
|
| 35 |
#
|
| 36 |
os_name = platform.system()
|
| 37 |
return {
|
| 38 |
+
"Linux": "linux",
|
| 39 |
+
"Darwin": "macos",
|
| 40 |
+
"Windows": "windows",
|
| 41 |
# For the future? because why not
|
| 42 |
+
"iOS": "ios",
|
| 43 |
}.get(os_name)
|
| 44 |
|
| 45 |
|
|
|
|
| 50 |
:return: `Fingerprint` object
|
| 51 |
"""
|
| 52 |
return FingerprintGenerator(
|
| 53 |
+
browser=[Browser(name="chrome", min_version=128)],
|
| 54 |
os=get_os_name(), # None is ignored
|
| 55 |
+
device="desktop",
|
| 56 |
).generate()
|
| 57 |
|
| 58 |
|
|
|
|
| 67 |
# So we don't raise any inconsistency red flags while websites fingerprinting us
|
| 68 |
os_name = get_os_name()
|
| 69 |
return HeaderGenerator(
|
| 70 |
+
browser=[Browser(name="chrome", min_version=130)],
|
| 71 |
os=os_name, # None is ignored
|
| 72 |
+
device="desktop",
|
| 73 |
).generate()
|
| 74 |
else:
|
| 75 |
# Here it's used for normal requests that aren't done through browsers so we can take it lightly
|
| 76 |
browsers = [
|
| 77 |
+
Browser(name="chrome", min_version=120),
|
| 78 |
+
Browser(name="firefox", min_version=120),
|
| 79 |
+
Browser(name="edge", min_version=120),
|
| 80 |
]
|
| 81 |
+
return HeaderGenerator(browser=browsers, device="desktop").generate()
|
scrapling/engines/toolbelt/navigation.py
CHANGED
|
@@ -1,6 +1,7 @@
|
|
| 1 |
"""
|
| 2 |
Functions related to files and URLs
|
| 3 |
"""
|
|
|
|
| 4 |
import os
|
| 5 |
from urllib.parse import urlencode, urlparse
|
| 6 |
|
|
@@ -19,7 +20,9 @@ def intercept_route(route: Route):
|
|
| 19 |
:return: PlayWright `Route` object
|
| 20 |
"""
|
| 21 |
if route.request.resource_type in DEFAULT_DISABLED_RESOURCES:
|
| 22 |
-
log.debug(
|
|
|
|
|
|
|
| 23 |
route.abort()
|
| 24 |
else:
|
| 25 |
route.continue_()
|
|
@@ -32,7 +35,9 @@ async def async_intercept_route(route: async_Route):
|
|
| 32 |
:return: PlayWright `Route` object
|
| 33 |
"""
|
| 34 |
if route.request.resource_type in DEFAULT_DISABLED_RESOURCES:
|
| 35 |
-
log.debug(
|
|
|
|
|
|
|
| 36 |
await route.abort()
|
| 37 |
else:
|
| 38 |
await route.continue_()
|
|
@@ -50,23 +55,33 @@ def construct_proxy_dict(proxy_string: Union[str, Dict[str, str]]) -> Union[Dict
|
|
| 50 |
proxy = urlparse(proxy_string)
|
| 51 |
try:
|
| 52 |
return {
|
| 53 |
-
|
| 54 |
-
|
| 55 |
-
|
| 56 |
}
|
| 57 |
except ValueError:
|
| 58 |
# Urllib will say that one of the parameters above can't be casted to the correct type like `int` for port etc...
|
| 59 |
-
raise TypeError(
|
| 60 |
|
| 61 |
elif isinstance(proxy_string, dict):
|
| 62 |
-
valid_keys = (
|
| 63 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 64 |
return proxy_string
|
| 65 |
else:
|
| 66 |
-
raise TypeError(
|
|
|
|
|
|
|
| 67 |
|
| 68 |
else:
|
| 69 |
-
raise TypeError(
|
|
|
|
|
|
|
| 70 |
|
| 71 |
# The default value for proxy in Playwright's source is `None`
|
| 72 |
return None
|
|
@@ -84,7 +99,7 @@ def construct_cdp_url(cdp_url: str, query_params: Optional[Dict] = None) -> str:
|
|
| 84 |
parsed = urlparse(cdp_url)
|
| 85 |
|
| 86 |
# Check scheme
|
| 87 |
-
if parsed.scheme not in (
|
| 88 |
raise ValueError("CDP URL must use 'ws://' or 'wss://' scheme")
|
| 89 |
|
| 90 |
# Validate hostname and port
|
|
@@ -93,8 +108,8 @@ def construct_cdp_url(cdp_url: str, query_params: Optional[Dict] = None) -> str:
|
|
| 93 |
|
| 94 |
# Ensure path starts with /
|
| 95 |
path = parsed.path
|
| 96 |
-
if not path.startswith(
|
| 97 |
-
path =
|
| 98 |
|
| 99 |
# Reconstruct the base URL with validated parts
|
| 100 |
validated_base = f"{parsed.scheme}://{parsed.netloc}{path}"
|
|
@@ -118,4 +133,4 @@ def js_bypass_path(filename: str) -> str:
|
|
| 118 |
:return: The full path of the JS file.
|
| 119 |
"""
|
| 120 |
current_directory = os.path.dirname(__file__)
|
| 121 |
-
return os.path.join(current_directory,
|
|
|
|
| 1 |
"""
|
| 2 |
Functions related to files and URLs
|
| 3 |
"""
|
| 4 |
+
|
| 5 |
import os
|
| 6 |
from urllib.parse import urlencode, urlparse
|
| 7 |
|
|
|
|
| 20 |
:return: PlayWright `Route` object
|
| 21 |
"""
|
| 22 |
if route.request.resource_type in DEFAULT_DISABLED_RESOURCES:
|
| 23 |
+
log.debug(
|
| 24 |
+
f'Blocking background resource "{route.request.url}" of type "{route.request.resource_type}"'
|
| 25 |
+
)
|
| 26 |
route.abort()
|
| 27 |
else:
|
| 28 |
route.continue_()
|
|
|
|
| 35 |
:return: PlayWright `Route` object
|
| 36 |
"""
|
| 37 |
if route.request.resource_type in DEFAULT_DISABLED_RESOURCES:
|
| 38 |
+
log.debug(
|
| 39 |
+
f'Blocking background resource "{route.request.url}" of type "{route.request.resource_type}"'
|
| 40 |
+
)
|
| 41 |
await route.abort()
|
| 42 |
else:
|
| 43 |
await route.continue_()
|
|
|
|
| 55 |
proxy = urlparse(proxy_string)
|
| 56 |
try:
|
| 57 |
return {
|
| 58 |
+
"server": f"{proxy.scheme}://{proxy.hostname}:{proxy.port}",
|
| 59 |
+
"username": proxy.username or "",
|
| 60 |
+
"password": proxy.password or "",
|
| 61 |
}
|
| 62 |
except ValueError:
|
| 63 |
# Urllib will say that one of the parameters above can't be casted to the correct type like `int` for port etc...
|
| 64 |
+
raise TypeError("The proxy argument's string is in invalid format!")
|
| 65 |
|
| 66 |
elif isinstance(proxy_string, dict):
|
| 67 |
+
valid_keys = (
|
| 68 |
+
"server",
|
| 69 |
+
"username",
|
| 70 |
+
"password",
|
| 71 |
+
)
|
| 72 |
+
if all(key in valid_keys for key in proxy_string.keys()) and not any(
|
| 73 |
+
key not in valid_keys for key in proxy_string.keys()
|
| 74 |
+
):
|
| 75 |
return proxy_string
|
| 76 |
else:
|
| 77 |
+
raise TypeError(
|
| 78 |
+
f"A proxy dictionary must have only these keys: {valid_keys}"
|
| 79 |
+
)
|
| 80 |
|
| 81 |
else:
|
| 82 |
+
raise TypeError(
|
| 83 |
+
f"Invalid type of proxy ({type(proxy_string)}), the proxy argument must be a string or a dictionary!"
|
| 84 |
+
)
|
| 85 |
|
| 86 |
# The default value for proxy in Playwright's source is `None`
|
| 87 |
return None
|
|
|
|
| 99 |
parsed = urlparse(cdp_url)
|
| 100 |
|
| 101 |
# Check scheme
|
| 102 |
+
if parsed.scheme not in ("ws", "wss"):
|
| 103 |
raise ValueError("CDP URL must use 'ws://' or 'wss://' scheme")
|
| 104 |
|
| 105 |
# Validate hostname and port
|
|
|
|
| 108 |
|
| 109 |
# Ensure path starts with /
|
| 110 |
path = parsed.path
|
| 111 |
+
if not path.startswith("/"):
|
| 112 |
+
path = "/" + path
|
| 113 |
|
| 114 |
# Reconstruct the base URL with validated parts
|
| 115 |
validated_base = f"{parsed.scheme}://{parsed.netloc}{path}"
|
|
|
|
| 133 |
:return: The full path of the JS file.
|
| 134 |
"""
|
| 135 |
current_directory = os.path.dirname(__file__)
|
| 136 |
+
return os.path.join(current_directory, "bypasses", filename)
|
scrapling/fetchers.py
CHANGED
|
@@ -1,7 +1,18 @@
|
|
| 1 |
-
from scrapling.core._types import (
|
| 2 |
-
|
| 3 |
-
|
| 4 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 5 |
from scrapling.engines.toolbelt import BaseFetcher, Response
|
| 6 |
|
| 7 |
|
|
@@ -10,10 +21,19 @@ class Fetcher(BaseFetcher):
|
|
| 10 |
|
| 11 |
Any additional keyword arguments passed to the methods below are passed to the respective httpx's method directly.
|
| 12 |
"""
|
|
|
|
| 13 |
@classmethod
|
| 14 |
def get(
|
| 15 |
-
|
| 16 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 17 |
"""Make basic HTTP GET request for you but with some added flavors.
|
| 18 |
|
| 19 |
:param url: Target url.
|
|
@@ -30,16 +50,36 @@ class Fetcher(BaseFetcher):
|
|
| 30 |
if not custom_config:
|
| 31 |
custom_config = {}
|
| 32 |
elif not isinstance(custom_config, dict):
|
| 33 |
-
ValueError(
|
|
|
|
|
|
|
| 34 |
|
| 35 |
-
adaptor_arguments = tuple(
|
| 36 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 37 |
return response_object
|
| 38 |
|
| 39 |
@classmethod
|
| 40 |
def post(
|
| 41 |
-
|
| 42 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 43 |
"""Make basic HTTP POST request for you but with some added flavors.
|
| 44 |
|
| 45 |
:param url: Target url.
|
|
@@ -56,16 +96,36 @@ class Fetcher(BaseFetcher):
|
|
| 56 |
if not custom_config:
|
| 57 |
custom_config = {}
|
| 58 |
elif not isinstance(custom_config, dict):
|
| 59 |
-
ValueError(
|
|
|
|
|
|
|
| 60 |
|
| 61 |
-
adaptor_arguments = tuple(
|
| 62 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 63 |
return response_object
|
| 64 |
|
| 65 |
@classmethod
|
| 66 |
def put(
|
| 67 |
-
|
| 68 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 69 |
"""Make basic HTTP PUT request for you but with some added flavors.
|
| 70 |
|
| 71 |
:param url: Target url
|
|
@@ -83,16 +143,36 @@ class Fetcher(BaseFetcher):
|
|
| 83 |
if not custom_config:
|
| 84 |
custom_config = {}
|
| 85 |
elif not isinstance(custom_config, dict):
|
| 86 |
-
ValueError(
|
|
|
|
|
|
|
| 87 |
|
| 88 |
-
adaptor_arguments = tuple(
|
| 89 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 90 |
return response_object
|
| 91 |
|
| 92 |
@classmethod
|
| 93 |
def delete(
|
| 94 |
-
|
| 95 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 96 |
"""Make basic HTTP DELETE request for you but with some added flavors.
|
| 97 |
|
| 98 |
:param url: Target url
|
|
@@ -109,18 +189,38 @@ class Fetcher(BaseFetcher):
|
|
| 109 |
if not custom_config:
|
| 110 |
custom_config = {}
|
| 111 |
elif not isinstance(custom_config, dict):
|
| 112 |
-
ValueError(
|
|
|
|
|
|
|
| 113 |
|
| 114 |
-
adaptor_arguments = tuple(
|
| 115 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 116 |
return response_object
|
| 117 |
|
| 118 |
|
| 119 |
class AsyncFetcher(Fetcher):
|
| 120 |
@classmethod
|
| 121 |
async def get(
|
| 122 |
-
|
| 123 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 124 |
"""Make basic HTTP GET request for you but with some added flavors.
|
| 125 |
|
| 126 |
:param url: Target url.
|
|
@@ -137,16 +237,36 @@ class AsyncFetcher(Fetcher):
|
|
| 137 |
if not custom_config:
|
| 138 |
custom_config = {}
|
| 139 |
elif not isinstance(custom_config, dict):
|
| 140 |
-
ValueError(
|
|
|
|
|
|
|
| 141 |
|
| 142 |
-
adaptor_arguments = tuple(
|
| 143 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 144 |
return response_object
|
| 145 |
|
| 146 |
@classmethod
|
| 147 |
async def post(
|
| 148 |
-
|
| 149 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 150 |
"""Make basic HTTP POST request for you but with some added flavors.
|
| 151 |
|
| 152 |
:param url: Target url.
|
|
@@ -163,16 +283,36 @@ class AsyncFetcher(Fetcher):
|
|
| 163 |
if not custom_config:
|
| 164 |
custom_config = {}
|
| 165 |
elif not isinstance(custom_config, dict):
|
| 166 |
-
ValueError(
|
|
|
|
|
|
|
| 167 |
|
| 168 |
-
adaptor_arguments = tuple(
|
| 169 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 170 |
return response_object
|
| 171 |
|
| 172 |
@classmethod
|
| 173 |
async def put(
|
| 174 |
-
|
| 175 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 176 |
"""Make basic HTTP PUT request for you but with some added flavors.
|
| 177 |
|
| 178 |
:param url: Target url
|
|
@@ -189,16 +329,36 @@ class AsyncFetcher(Fetcher):
|
|
| 189 |
if not custom_config:
|
| 190 |
custom_config = {}
|
| 191 |
elif not isinstance(custom_config, dict):
|
| 192 |
-
ValueError(
|
|
|
|
|
|
|
| 193 |
|
| 194 |
-
adaptor_arguments = tuple(
|
| 195 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 196 |
return response_object
|
| 197 |
|
| 198 |
@classmethod
|
| 199 |
async def delete(
|
| 200 |
-
|
| 201 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 202 |
"""Make basic HTTP DELETE request for you but with some added flavors.
|
| 203 |
|
| 204 |
:param url: Target url
|
|
@@ -215,27 +375,57 @@ class AsyncFetcher(Fetcher):
|
|
| 215 |
if not custom_config:
|
| 216 |
custom_config = {}
|
| 217 |
elif not isinstance(custom_config, dict):
|
| 218 |
-
ValueError(
|
|
|
|
|
|
|
| 219 |
|
| 220 |
-
adaptor_arguments = tuple(
|
| 221 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 222 |
return response_object
|
| 223 |
|
| 224 |
|
| 225 |
class StealthyFetcher(BaseFetcher):
|
| 226 |
"""A `Fetcher` class type that is completely stealthy fetcher that uses a modified version of Firefox.
|
| 227 |
|
| 228 |
-
|
| 229 |
-
|
| 230 |
"""
|
|
|
|
| 231 |
@classmethod
|
| 232 |
def fetch(
|
| 233 |
-
|
| 234 |
-
|
| 235 |
-
|
| 236 |
-
|
| 237 |
-
|
| 238 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 239 |
) -> Response:
|
| 240 |
"""
|
| 241 |
Opens up a browser and do your request based on your chosen options below.
|
|
@@ -271,7 +461,9 @@ class StealthyFetcher(BaseFetcher):
|
|
| 271 |
if not custom_config:
|
| 272 |
custom_config = {}
|
| 273 |
elif not isinstance(custom_config, dict):
|
| 274 |
-
ValueError(
|
|
|
|
|
|
|
| 275 |
|
| 276 |
engine = CamoufoxEngine(
|
| 277 |
wait=wait,
|
|
@@ -294,18 +486,35 @@ class StealthyFetcher(BaseFetcher):
|
|
| 294 |
disable_resources=disable_resources,
|
| 295 |
wait_selector_state=wait_selector_state,
|
| 296 |
adaptor_arguments={**cls._generate_parser_arguments(), **custom_config},
|
| 297 |
-
additional_arguments=additional_arguments or {}
|
| 298 |
)
|
| 299 |
return engine.fetch(url)
|
| 300 |
|
| 301 |
@classmethod
|
| 302 |
async def async_fetch(
|
| 303 |
-
|
| 304 |
-
|
| 305 |
-
|
| 306 |
-
|
| 307 |
-
|
| 308 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 309 |
) -> Response:
|
| 310 |
"""
|
| 311 |
Opens up a browser and do your request based on your chosen options below.
|
|
@@ -341,7 +550,9 @@ class StealthyFetcher(BaseFetcher):
|
|
| 341 |
if not custom_config:
|
| 342 |
custom_config = {}
|
| 343 |
elif not isinstance(custom_config, dict):
|
| 344 |
-
ValueError(
|
|
|
|
|
|
|
| 345 |
|
| 346 |
engine = CamoufoxEngine(
|
| 347 |
wait=wait,
|
|
@@ -364,7 +575,7 @@ class StealthyFetcher(BaseFetcher):
|
|
| 364 |
disable_resources=disable_resources,
|
| 365 |
wait_selector_state=wait_selector_state,
|
| 366 |
adaptor_arguments={**cls._generate_parser_arguments(), **custom_config},
|
| 367 |
-
additional_arguments=additional_arguments or {}
|
| 368 |
)
|
| 369 |
return await engine.async_fetch(url)
|
| 370 |
|
|
@@ -385,17 +596,32 @@ class PlayWrightFetcher(BaseFetcher):
|
|
| 385 |
|
| 386 |
> Note that these are the main options with PlayWright but it can be mixed together.
|
| 387 |
"""
|
|
|
|
| 388 |
@classmethod
|
| 389 |
def fetch(
|
| 390 |
-
|
| 391 |
-
|
| 392 |
-
|
| 393 |
-
|
| 394 |
-
|
| 395 |
-
|
| 396 |
-
|
| 397 |
-
|
| 398 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 399 |
) -> Response:
|
| 400 |
"""Opens up a browser and do your request based on your chosen options below.
|
| 401 |
|
|
@@ -428,7 +654,9 @@ class PlayWrightFetcher(BaseFetcher):
|
|
| 428 |
if not custom_config:
|
| 429 |
custom_config = {}
|
| 430 |
elif not isinstance(custom_config, dict):
|
| 431 |
-
ValueError(
|
|
|
|
|
|
|
| 432 |
|
| 433 |
engine = PlaywrightEngine(
|
| 434 |
wait=wait,
|
|
@@ -457,15 +685,29 @@ class PlayWrightFetcher(BaseFetcher):
|
|
| 457 |
|
| 458 |
@classmethod
|
| 459 |
async def async_fetch(
|
| 460 |
-
|
| 461 |
-
|
| 462 |
-
|
| 463 |
-
|
| 464 |
-
|
| 465 |
-
|
| 466 |
-
|
| 467 |
-
|
| 468 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 469 |
) -> Response:
|
| 470 |
"""Opens up a browser and do your request based on your chosen options below.
|
| 471 |
|
|
@@ -498,7 +740,9 @@ class PlayWrightFetcher(BaseFetcher):
|
|
| 498 |
if not custom_config:
|
| 499 |
custom_config = {}
|
| 500 |
elif not isinstance(custom_config, dict):
|
| 501 |
-
ValueError(
|
|
|
|
|
|
|
| 502 |
|
| 503 |
engine = PlaywrightEngine(
|
| 504 |
wait=wait,
|
|
@@ -529,5 +773,7 @@ class PlayWrightFetcher(BaseFetcher):
|
|
| 529 |
class CustomFetcher(BaseFetcher):
|
| 530 |
@classmethod
|
| 531 |
def fetch(cls, url: str, browser_engine, **kwargs) -> Response:
|
| 532 |
-
engine = check_if_engine_usable(browser_engine)(
|
|
|
|
|
|
|
| 533 |
return engine.fetch(url)
|
|
|
|
| 1 |
+
from scrapling.core._types import (
|
| 2 |
+
Callable,
|
| 3 |
+
Dict,
|
| 4 |
+
List,
|
| 5 |
+
Literal,
|
| 6 |
+
Optional,
|
| 7 |
+
SelectorWaitStates,
|
| 8 |
+
Union,
|
| 9 |
+
)
|
| 10 |
+
from scrapling.engines import (
|
| 11 |
+
CamoufoxEngine,
|
| 12 |
+
PlaywrightEngine,
|
| 13 |
+
StaticEngine,
|
| 14 |
+
check_if_engine_usable,
|
| 15 |
+
)
|
| 16 |
from scrapling.engines.toolbelt import BaseFetcher, Response
|
| 17 |
|
| 18 |
|
|
|
|
| 21 |
|
| 22 |
Any additional keyword arguments passed to the methods below are passed to the respective httpx's method directly.
|
| 23 |
"""
|
| 24 |
+
|
| 25 |
@classmethod
|
| 26 |
def get(
|
| 27 |
+
cls,
|
| 28 |
+
url: str,
|
| 29 |
+
follow_redirects: bool = True,
|
| 30 |
+
timeout: Optional[Union[int, float]] = 10,
|
| 31 |
+
stealthy_headers: bool = True,
|
| 32 |
+
proxy: Optional[str] = None,
|
| 33 |
+
retries: Optional[int] = 3,
|
| 34 |
+
custom_config: Dict = None,
|
| 35 |
+
**kwargs: Dict,
|
| 36 |
+
) -> Response:
|
| 37 |
"""Make basic HTTP GET request for you but with some added flavors.
|
| 38 |
|
| 39 |
:param url: Target url.
|
|
|
|
| 50 |
if not custom_config:
|
| 51 |
custom_config = {}
|
| 52 |
elif not isinstance(custom_config, dict):
|
| 53 |
+
ValueError(
|
| 54 |
+
f"The custom parser config must be of type dictionary, got {cls.__class__}"
|
| 55 |
+
)
|
| 56 |
|
| 57 |
+
adaptor_arguments = tuple(
|
| 58 |
+
{**cls._generate_parser_arguments(), **custom_config}.items()
|
| 59 |
+
)
|
| 60 |
+
response_object = StaticEngine(
|
| 61 |
+
url,
|
| 62 |
+
proxy,
|
| 63 |
+
stealthy_headers,
|
| 64 |
+
follow_redirects,
|
| 65 |
+
timeout,
|
| 66 |
+
retries,
|
| 67 |
+
adaptor_arguments=adaptor_arguments,
|
| 68 |
+
).get(**kwargs)
|
| 69 |
return response_object
|
| 70 |
|
| 71 |
@classmethod
|
| 72 |
def post(
|
| 73 |
+
cls,
|
| 74 |
+
url: str,
|
| 75 |
+
follow_redirects: bool = True,
|
| 76 |
+
timeout: Optional[Union[int, float]] = 10,
|
| 77 |
+
stealthy_headers: bool = True,
|
| 78 |
+
proxy: Optional[str] = None,
|
| 79 |
+
retries: Optional[int] = 3,
|
| 80 |
+
custom_config: Dict = None,
|
| 81 |
+
**kwargs: Dict,
|
| 82 |
+
) -> Response:
|
| 83 |
"""Make basic HTTP POST request for you but with some added flavors.
|
| 84 |
|
| 85 |
:param url: Target url.
|
|
|
|
| 96 |
if not custom_config:
|
| 97 |
custom_config = {}
|
| 98 |
elif not isinstance(custom_config, dict):
|
| 99 |
+
ValueError(
|
| 100 |
+
f"The custom parser config must be of type dictionary, got {cls.__class__}"
|
| 101 |
+
)
|
| 102 |
|
| 103 |
+
adaptor_arguments = tuple(
|
| 104 |
+
{**cls._generate_parser_arguments(), **custom_config}.items()
|
| 105 |
+
)
|
| 106 |
+
response_object = StaticEngine(
|
| 107 |
+
url,
|
| 108 |
+
proxy,
|
| 109 |
+
stealthy_headers,
|
| 110 |
+
follow_redirects,
|
| 111 |
+
timeout,
|
| 112 |
+
retries,
|
| 113 |
+
adaptor_arguments=adaptor_arguments,
|
| 114 |
+
).post(**kwargs)
|
| 115 |
return response_object
|
| 116 |
|
| 117 |
@classmethod
|
| 118 |
def put(
|
| 119 |
+
cls,
|
| 120 |
+
url: str,
|
| 121 |
+
follow_redirects: bool = True,
|
| 122 |
+
timeout: Optional[Union[int, float]] = 10,
|
| 123 |
+
stealthy_headers: bool = True,
|
| 124 |
+
proxy: Optional[str] = None,
|
| 125 |
+
retries: Optional[int] = 3,
|
| 126 |
+
custom_config: Dict = None,
|
| 127 |
+
**kwargs: Dict,
|
| 128 |
+
) -> Response:
|
| 129 |
"""Make basic HTTP PUT request for you but with some added flavors.
|
| 130 |
|
| 131 |
:param url: Target url
|
|
|
|
| 143 |
if not custom_config:
|
| 144 |
custom_config = {}
|
| 145 |
elif not isinstance(custom_config, dict):
|
| 146 |
+
ValueError(
|
| 147 |
+
f"The custom parser config must be of type dictionary, got {cls.__class__}"
|
| 148 |
+
)
|
| 149 |
|
| 150 |
+
adaptor_arguments = tuple(
|
| 151 |
+
{**cls._generate_parser_arguments(), **custom_config}.items()
|
| 152 |
+
)
|
| 153 |
+
response_object = StaticEngine(
|
| 154 |
+
url,
|
| 155 |
+
proxy,
|
| 156 |
+
stealthy_headers,
|
| 157 |
+
follow_redirects,
|
| 158 |
+
timeout,
|
| 159 |
+
retries,
|
| 160 |
+
adaptor_arguments=adaptor_arguments,
|
| 161 |
+
).put(**kwargs)
|
| 162 |
return response_object
|
| 163 |
|
| 164 |
@classmethod
|
| 165 |
def delete(
|
| 166 |
+
cls,
|
| 167 |
+
url: str,
|
| 168 |
+
follow_redirects: bool = True,
|
| 169 |
+
timeout: Optional[Union[int, float]] = 10,
|
| 170 |
+
stealthy_headers: bool = True,
|
| 171 |
+
proxy: Optional[str] = None,
|
| 172 |
+
retries: Optional[int] = 3,
|
| 173 |
+
custom_config: Dict = None,
|
| 174 |
+
**kwargs: Dict,
|
| 175 |
+
) -> Response:
|
| 176 |
"""Make basic HTTP DELETE request for you but with some added flavors.
|
| 177 |
|
| 178 |
:param url: Target url
|
|
|
|
| 189 |
if not custom_config:
|
| 190 |
custom_config = {}
|
| 191 |
elif not isinstance(custom_config, dict):
|
| 192 |
+
ValueError(
|
| 193 |
+
f"The custom parser config must be of type dictionary, got {cls.__class__}"
|
| 194 |
+
)
|
| 195 |
|
| 196 |
+
adaptor_arguments = tuple(
|
| 197 |
+
{**cls._generate_parser_arguments(), **custom_config}.items()
|
| 198 |
+
)
|
| 199 |
+
response_object = StaticEngine(
|
| 200 |
+
url,
|
| 201 |
+
proxy,
|
| 202 |
+
stealthy_headers,
|
| 203 |
+
follow_redirects,
|
| 204 |
+
timeout,
|
| 205 |
+
retries,
|
| 206 |
+
adaptor_arguments=adaptor_arguments,
|
| 207 |
+
).delete(**kwargs)
|
| 208 |
return response_object
|
| 209 |
|
| 210 |
|
| 211 |
class AsyncFetcher(Fetcher):
|
| 212 |
@classmethod
|
| 213 |
async def get(
|
| 214 |
+
cls,
|
| 215 |
+
url: str,
|
| 216 |
+
follow_redirects: bool = True,
|
| 217 |
+
timeout: Optional[Union[int, float]] = 10,
|
| 218 |
+
stealthy_headers: bool = True,
|
| 219 |
+
proxy: Optional[str] = None,
|
| 220 |
+
retries: Optional[int] = 3,
|
| 221 |
+
custom_config: Dict = None,
|
| 222 |
+
**kwargs: Dict,
|
| 223 |
+
) -> Response:
|
| 224 |
"""Make basic HTTP GET request for you but with some added flavors.
|
| 225 |
|
| 226 |
:param url: Target url.
|
|
|
|
| 237 |
if not custom_config:
|
| 238 |
custom_config = {}
|
| 239 |
elif not isinstance(custom_config, dict):
|
| 240 |
+
ValueError(
|
| 241 |
+
f"The custom parser config must be of type dictionary, got {cls.__class__}"
|
| 242 |
+
)
|
| 243 |
|
| 244 |
+
adaptor_arguments = tuple(
|
| 245 |
+
{**cls._generate_parser_arguments(), **custom_config}.items()
|
| 246 |
+
)
|
| 247 |
+
response_object = await StaticEngine(
|
| 248 |
+
url,
|
| 249 |
+
proxy,
|
| 250 |
+
stealthy_headers,
|
| 251 |
+
follow_redirects,
|
| 252 |
+
timeout,
|
| 253 |
+
retries=retries,
|
| 254 |
+
adaptor_arguments=adaptor_arguments,
|
| 255 |
+
).async_get(**kwargs)
|
| 256 |
return response_object
|
| 257 |
|
| 258 |
@classmethod
|
| 259 |
async def post(
|
| 260 |
+
cls,
|
| 261 |
+
url: str,
|
| 262 |
+
follow_redirects: bool = True,
|
| 263 |
+
timeout: Optional[Union[int, float]] = 10,
|
| 264 |
+
stealthy_headers: bool = True,
|
| 265 |
+
proxy: Optional[str] = None,
|
| 266 |
+
retries: Optional[int] = 3,
|
| 267 |
+
custom_config: Dict = None,
|
| 268 |
+
**kwargs: Dict,
|
| 269 |
+
) -> Response:
|
| 270 |
"""Make basic HTTP POST request for you but with some added flavors.
|
| 271 |
|
| 272 |
:param url: Target url.
|
|
|
|
| 283 |
if not custom_config:
|
| 284 |
custom_config = {}
|
| 285 |
elif not isinstance(custom_config, dict):
|
| 286 |
+
ValueError(
|
| 287 |
+
f"The custom parser config must be of type dictionary, got {cls.__class__}"
|
| 288 |
+
)
|
| 289 |
|
| 290 |
+
adaptor_arguments = tuple(
|
| 291 |
+
{**cls._generate_parser_arguments(), **custom_config}.items()
|
| 292 |
+
)
|
| 293 |
+
response_object = await StaticEngine(
|
| 294 |
+
url,
|
| 295 |
+
proxy,
|
| 296 |
+
stealthy_headers,
|
| 297 |
+
follow_redirects,
|
| 298 |
+
timeout,
|
| 299 |
+
retries=retries,
|
| 300 |
+
adaptor_arguments=adaptor_arguments,
|
| 301 |
+
).async_post(**kwargs)
|
| 302 |
return response_object
|
| 303 |
|
| 304 |
@classmethod
|
| 305 |
async def put(
|
| 306 |
+
cls,
|
| 307 |
+
url: str,
|
| 308 |
+
follow_redirects: bool = True,
|
| 309 |
+
timeout: Optional[Union[int, float]] = 10,
|
| 310 |
+
stealthy_headers: bool = True,
|
| 311 |
+
proxy: Optional[str] = None,
|
| 312 |
+
retries: Optional[int] = 3,
|
| 313 |
+
custom_config: Dict = None,
|
| 314 |
+
**kwargs: Dict,
|
| 315 |
+
) -> Response:
|
| 316 |
"""Make basic HTTP PUT request for you but with some added flavors.
|
| 317 |
|
| 318 |
:param url: Target url
|
|
|
|
| 329 |
if not custom_config:
|
| 330 |
custom_config = {}
|
| 331 |
elif not isinstance(custom_config, dict):
|
| 332 |
+
ValueError(
|
| 333 |
+
f"The custom parser config must be of type dictionary, got {cls.__class__}"
|
| 334 |
+
)
|
| 335 |
|
| 336 |
+
adaptor_arguments = tuple(
|
| 337 |
+
{**cls._generate_parser_arguments(), **custom_config}.items()
|
| 338 |
+
)
|
| 339 |
+
response_object = await StaticEngine(
|
| 340 |
+
url,
|
| 341 |
+
proxy,
|
| 342 |
+
stealthy_headers,
|
| 343 |
+
follow_redirects,
|
| 344 |
+
timeout,
|
| 345 |
+
retries=retries,
|
| 346 |
+
adaptor_arguments=adaptor_arguments,
|
| 347 |
+
).async_put(**kwargs)
|
| 348 |
return response_object
|
| 349 |
|
| 350 |
@classmethod
|
| 351 |
async def delete(
|
| 352 |
+
cls,
|
| 353 |
+
url: str,
|
| 354 |
+
follow_redirects: bool = True,
|
| 355 |
+
timeout: Optional[Union[int, float]] = 10,
|
| 356 |
+
stealthy_headers: bool = True,
|
| 357 |
+
proxy: Optional[str] = None,
|
| 358 |
+
retries: Optional[int] = 3,
|
| 359 |
+
custom_config: Dict = None,
|
| 360 |
+
**kwargs: Dict,
|
| 361 |
+
) -> Response:
|
| 362 |
"""Make basic HTTP DELETE request for you but with some added flavors.
|
| 363 |
|
| 364 |
:param url: Target url
|
|
|
|
| 375 |
if not custom_config:
|
| 376 |
custom_config = {}
|
| 377 |
elif not isinstance(custom_config, dict):
|
| 378 |
+
ValueError(
|
| 379 |
+
f"The custom parser config must be of type dictionary, got {cls.__class__}"
|
| 380 |
+
)
|
| 381 |
|
| 382 |
+
adaptor_arguments = tuple(
|
| 383 |
+
{**cls._generate_parser_arguments(), **custom_config}.items()
|
| 384 |
+
)
|
| 385 |
+
response_object = await StaticEngine(
|
| 386 |
+
url,
|
| 387 |
+
proxy,
|
| 388 |
+
stealthy_headers,
|
| 389 |
+
follow_redirects,
|
| 390 |
+
timeout,
|
| 391 |
+
retries=retries,
|
| 392 |
+
adaptor_arguments=adaptor_arguments,
|
| 393 |
+
).async_delete(**kwargs)
|
| 394 |
return response_object
|
| 395 |
|
| 396 |
|
| 397 |
class StealthyFetcher(BaseFetcher):
|
| 398 |
"""A `Fetcher` class type that is completely stealthy fetcher that uses a modified version of Firefox.
|
| 399 |
|
| 400 |
+
It works as real browsers passing almost all online tests/protections based on Camoufox.
|
| 401 |
+
Other added flavors include setting the faked OS fingerprints to match the user's OS and the referer of every request is set as if this request came from Google's search of this URL's domain.
|
| 402 |
"""
|
| 403 |
+
|
| 404 |
@classmethod
|
| 405 |
def fetch(
|
| 406 |
+
cls,
|
| 407 |
+
url: str,
|
| 408 |
+
headless: Union[bool, Literal["virtual"]] = True, # noqa: F821
|
| 409 |
+
block_images: bool = False,
|
| 410 |
+
disable_resources: bool = False,
|
| 411 |
+
block_webrtc: bool = False,
|
| 412 |
+
allow_webgl: bool = True,
|
| 413 |
+
network_idle: bool = False,
|
| 414 |
+
addons: Optional[List[str]] = None,
|
| 415 |
+
wait: Optional[int] = 0,
|
| 416 |
+
timeout: Optional[float] = 30000,
|
| 417 |
+
page_action: Callable = None,
|
| 418 |
+
wait_selector: Optional[str] = None,
|
| 419 |
+
humanize: Optional[Union[bool, float]] = True,
|
| 420 |
+
wait_selector_state: SelectorWaitStates = "attached",
|
| 421 |
+
google_search: bool = True,
|
| 422 |
+
extra_headers: Optional[Dict[str, str]] = None,
|
| 423 |
+
proxy: Optional[Union[str, Dict[str, str]]] = None,
|
| 424 |
+
os_randomize: bool = False,
|
| 425 |
+
disable_ads: bool = False,
|
| 426 |
+
geoip: bool = False,
|
| 427 |
+
custom_config: Dict = None,
|
| 428 |
+
additional_arguments: Dict = None,
|
| 429 |
) -> Response:
|
| 430 |
"""
|
| 431 |
Opens up a browser and do your request based on your chosen options below.
|
|
|
|
| 461 |
if not custom_config:
|
| 462 |
custom_config = {}
|
| 463 |
elif not isinstance(custom_config, dict):
|
| 464 |
+
ValueError(
|
| 465 |
+
f"The custom parser config must be of type dictionary, got {cls.__class__}"
|
| 466 |
+
)
|
| 467 |
|
| 468 |
engine = CamoufoxEngine(
|
| 469 |
wait=wait,
|
|
|
|
| 486 |
disable_resources=disable_resources,
|
| 487 |
wait_selector_state=wait_selector_state,
|
| 488 |
adaptor_arguments={**cls._generate_parser_arguments(), **custom_config},
|
| 489 |
+
additional_arguments=additional_arguments or {},
|
| 490 |
)
|
| 491 |
return engine.fetch(url)
|
| 492 |
|
| 493 |
@classmethod
|
| 494 |
async def async_fetch(
|
| 495 |
+
cls,
|
| 496 |
+
url: str,
|
| 497 |
+
headless: Union[bool, Literal["virtual"]] = True, # noqa: F821
|
| 498 |
+
block_images: bool = False,
|
| 499 |
+
disable_resources: bool = False,
|
| 500 |
+
block_webrtc: bool = False,
|
| 501 |
+
allow_webgl: bool = True,
|
| 502 |
+
network_idle: bool = False,
|
| 503 |
+
addons: Optional[List[str]] = None,
|
| 504 |
+
wait: Optional[int] = 0,
|
| 505 |
+
timeout: Optional[float] = 30000,
|
| 506 |
+
page_action: Callable = None,
|
| 507 |
+
wait_selector: Optional[str] = None,
|
| 508 |
+
humanize: Optional[Union[bool, float]] = True,
|
| 509 |
+
wait_selector_state: SelectorWaitStates = "attached",
|
| 510 |
+
google_search: bool = True,
|
| 511 |
+
extra_headers: Optional[Dict[str, str]] = None,
|
| 512 |
+
proxy: Optional[Union[str, Dict[str, str]]] = None,
|
| 513 |
+
os_randomize: bool = False,
|
| 514 |
+
disable_ads: bool = False,
|
| 515 |
+
geoip: bool = False,
|
| 516 |
+
custom_config: Dict = None,
|
| 517 |
+
additional_arguments: Dict = None,
|
| 518 |
) -> Response:
|
| 519 |
"""
|
| 520 |
Opens up a browser and do your request based on your chosen options below.
|
|
|
|
| 550 |
if not custom_config:
|
| 551 |
custom_config = {}
|
| 552 |
elif not isinstance(custom_config, dict):
|
| 553 |
+
ValueError(
|
| 554 |
+
f"The custom parser config must be of type dictionary, got {cls.__class__}"
|
| 555 |
+
)
|
| 556 |
|
| 557 |
engine = CamoufoxEngine(
|
| 558 |
wait=wait,
|
|
|
|
| 575 |
disable_resources=disable_resources,
|
| 576 |
wait_selector_state=wait_selector_state,
|
| 577 |
adaptor_arguments={**cls._generate_parser_arguments(), **custom_config},
|
| 578 |
+
additional_arguments=additional_arguments or {},
|
| 579 |
)
|
| 580 |
return await engine.async_fetch(url)
|
| 581 |
|
|
|
|
| 596 |
|
| 597 |
> Note that these are the main options with PlayWright but it can be mixed together.
|
| 598 |
"""
|
| 599 |
+
|
| 600 |
@classmethod
|
| 601 |
def fetch(
|
| 602 |
+
cls,
|
| 603 |
+
url: str,
|
| 604 |
+
headless: Union[bool, str] = True,
|
| 605 |
+
disable_resources: bool = None,
|
| 606 |
+
useragent: Optional[str] = None,
|
| 607 |
+
network_idle: bool = False,
|
| 608 |
+
timeout: Optional[float] = 30000,
|
| 609 |
+
wait: Optional[int] = 0,
|
| 610 |
+
page_action: Optional[Callable] = None,
|
| 611 |
+
wait_selector: Optional[str] = None,
|
| 612 |
+
wait_selector_state: SelectorWaitStates = "attached",
|
| 613 |
+
hide_canvas: bool = False,
|
| 614 |
+
disable_webgl: bool = False,
|
| 615 |
+
extra_headers: Optional[Dict[str, str]] = None,
|
| 616 |
+
google_search: bool = True,
|
| 617 |
+
proxy: Optional[Union[str, Dict[str, str]]] = None,
|
| 618 |
+
locale: Optional[str] = "en-US",
|
| 619 |
+
stealth: bool = False,
|
| 620 |
+
real_chrome: bool = False,
|
| 621 |
+
cdp_url: Optional[str] = None,
|
| 622 |
+
nstbrowser_mode: bool = False,
|
| 623 |
+
nstbrowser_config: Optional[Dict] = None,
|
| 624 |
+
custom_config: Dict = None,
|
| 625 |
) -> Response:
|
| 626 |
"""Opens up a browser and do your request based on your chosen options below.
|
| 627 |
|
|
|
|
| 654 |
if not custom_config:
|
| 655 |
custom_config = {}
|
| 656 |
elif not isinstance(custom_config, dict):
|
| 657 |
+
ValueError(
|
| 658 |
+
f"The custom parser config must be of type dictionary, got {cls.__class__}"
|
| 659 |
+
)
|
| 660 |
|
| 661 |
engine = PlaywrightEngine(
|
| 662 |
wait=wait,
|
|
|
|
| 685 |
|
| 686 |
@classmethod
|
| 687 |
async def async_fetch(
|
| 688 |
+
cls,
|
| 689 |
+
url: str,
|
| 690 |
+
headless: Union[bool, str] = True,
|
| 691 |
+
disable_resources: bool = None,
|
| 692 |
+
useragent: Optional[str] = None,
|
| 693 |
+
network_idle: bool = False,
|
| 694 |
+
timeout: Optional[float] = 30000,
|
| 695 |
+
wait: Optional[int] = 0,
|
| 696 |
+
page_action: Optional[Callable] = None,
|
| 697 |
+
wait_selector: Optional[str] = None,
|
| 698 |
+
wait_selector_state: SelectorWaitStates = "attached",
|
| 699 |
+
hide_canvas: bool = False,
|
| 700 |
+
disable_webgl: bool = False,
|
| 701 |
+
extra_headers: Optional[Dict[str, str]] = None,
|
| 702 |
+
google_search: bool = True,
|
| 703 |
+
proxy: Optional[Union[str, Dict[str, str]]] = None,
|
| 704 |
+
locale: Optional[str] = "en-US",
|
| 705 |
+
stealth: bool = False,
|
| 706 |
+
real_chrome: bool = False,
|
| 707 |
+
cdp_url: Optional[str] = None,
|
| 708 |
+
nstbrowser_mode: bool = False,
|
| 709 |
+
nstbrowser_config: Optional[Dict] = None,
|
| 710 |
+
custom_config: Dict = None,
|
| 711 |
) -> Response:
|
| 712 |
"""Opens up a browser and do your request based on your chosen options below.
|
| 713 |
|
|
|
|
| 740 |
if not custom_config:
|
| 741 |
custom_config = {}
|
| 742 |
elif not isinstance(custom_config, dict):
|
| 743 |
+
ValueError(
|
| 744 |
+
f"The custom parser config must be of type dictionary, got {cls.__class__}"
|
| 745 |
+
)
|
| 746 |
|
| 747 |
engine = PlaywrightEngine(
|
| 748 |
wait=wait,
|
|
|
|
| 773 |
class CustomFetcher(BaseFetcher):
|
| 774 |
@classmethod
|
| 775 |
def fetch(cls, url: str, browser_engine, **kwargs) -> Response:
|
| 776 |
+
engine = check_if_engine_usable(browser_engine)(
|
| 777 |
+
adaptor_arguments=cls._generate_parser_arguments(), **kwargs
|
| 778 |
+
)
|
| 779 |
return engine.fetch(url)
|
scrapling/parser.py
CHANGED
|
@@ -9,40 +9,59 @@ from cssselect import SelectorError, SelectorSyntaxError
|
|
| 9 |
from cssselect import parse as split_selectors
|
| 10 |
from lxml import etree, html
|
| 11 |
|
| 12 |
-
from scrapling.core._types import (
|
| 13 |
-
|
| 14 |
-
|
| 15 |
-
|
| 16 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 17 |
from scrapling.core.mixins import SelectorsGeneration
|
| 18 |
-
from scrapling.core.storage_adaptors import (
|
| 19 |
-
|
|
|
|
|
|
|
|
|
|
| 20 |
from scrapling.core.translator import translator_instance
|
| 21 |
-
from scrapling.core.utils import
|
| 22 |
-
is_jsonable, log)
|
| 23 |
|
| 24 |
|
| 25 |
class Adaptor(SelectorsGeneration):
|
| 26 |
__slots__ = (
|
| 27 |
-
|
| 28 |
-
|
| 29 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 30 |
)
|
| 31 |
|
| 32 |
def __init__(
|
| 33 |
-
|
| 34 |
-
|
| 35 |
-
|
| 36 |
-
|
| 37 |
-
|
| 38 |
-
|
| 39 |
-
|
| 40 |
-
|
| 41 |
-
|
| 42 |
-
|
| 43 |
-
|
| 44 |
-
|
| 45 |
-
|
| 46 |
):
|
| 47 |
"""The main class that works as a wrapper for the HTML input data. Using this class, you can search for elements
|
| 48 |
with expressions in CSS, XPath, or with simply text. Check the docs for more info.
|
|
@@ -69,25 +88,37 @@ class Adaptor(SelectorsGeneration):
|
|
| 69 |
If empty, default values will be used.
|
| 70 |
"""
|
| 71 |
if root is None and not body and text is None:
|
| 72 |
-
raise ValueError(
|
|
|
|
|
|
|
| 73 |
|
| 74 |
-
self.__text =
|
| 75 |
if root is None:
|
| 76 |
if text is None:
|
| 77 |
if not body or not isinstance(body, bytes):
|
| 78 |
-
raise TypeError(
|
|
|
|
|
|
|
| 79 |
|
| 80 |
body = body.replace(b"\x00", b"").strip()
|
| 81 |
else:
|
| 82 |
if not isinstance(text, str):
|
| 83 |
-
raise TypeError(
|
|
|
|
|
|
|
| 84 |
|
| 85 |
body = text.strip().replace("\x00", "").encode(encoding) or b"<html/>"
|
| 86 |
|
| 87 |
# https://lxml.de/api/lxml.etree.HTMLParser-class.html
|
| 88 |
parser = html.HTMLParser(
|
| 89 |
-
recover=True,
|
| 90 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 91 |
)
|
| 92 |
self._root = etree.fromstring(body, parser=parser, base_url=url)
|
| 93 |
if is_jsonable(text or body.decode()):
|
|
@@ -107,15 +138,21 @@ class Adaptor(SelectorsGeneration):
|
|
| 107 |
if self.__auto_match_enabled:
|
| 108 |
if not storage_args:
|
| 109 |
storage_args = {
|
| 110 |
-
|
| 111 |
-
|
|
|
|
|
|
|
| 112 |
}
|
| 113 |
|
| 114 |
-
if not hasattr(storage,
|
| 115 |
-
raise ValueError(
|
|
|
|
|
|
|
| 116 |
|
| 117 |
if not issubclass(storage.__wrapped__, StorageSystemMixin):
|
| 118 |
-
raise ValueError(
|
|
|
|
|
|
|
| 119 |
|
| 120 |
self._storage = storage(**storage_args)
|
| 121 |
|
|
@@ -128,13 +165,27 @@ class Adaptor(SelectorsGeneration):
|
|
| 128 |
self.__attributes = None
|
| 129 |
self.__tag = None
|
| 130 |
# No need to check if all response attributes exist or not because if `status` exist, then the rest exist (Save some CPU cycles for speed)
|
| 131 |
-
self.__response_data =
|
| 132 |
-
|
| 133 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 134 |
|
| 135 |
# Node functionalities, I wanted to move to separate Mixin class but it had slight impact on performance
|
| 136 |
@staticmethod
|
| 137 |
-
def _is_text_node(
|
|
|
|
|
|
|
| 138 |
"""Return True if given element is a result of a string expression
|
| 139 |
Examples:
|
| 140 |
XPath -> '/text()', '/@attribute' etc...
|
|
@@ -144,25 +195,33 @@ class Adaptor(SelectorsGeneration):
|
|
| 144 |
return issubclass(type(element), etree._ElementUnicodeResult)
|
| 145 |
|
| 146 |
@staticmethod
|
| 147 |
-
def __content_convertor(
|
|
|
|
|
|
|
| 148 |
"""Used internally to convert a single element's text content to TextHandler directly without checks
|
| 149 |
|
| 150 |
This single line has been isolated like this so when it's used with map we get that slight performance boost vs list comprehension
|
| 151 |
"""
|
| 152 |
return TextHandler(str(element))
|
| 153 |
|
| 154 |
-
def __element_convertor(self, element: html.HtmlElement) ->
|
| 155 |
"""Used internally to convert a single HtmlElement to Adaptor directly without checks"""
|
| 156 |
return Adaptor(
|
| 157 |
root=element,
|
| 158 |
-
text=
|
| 159 |
-
|
| 160 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
| 161 |
huge_tree=self.__huge_tree_enabled,
|
| 162 |
-
**self.__response_data
|
| 163 |
)
|
| 164 |
|
| 165 |
-
def __handle_element(
|
|
|
|
|
|
|
| 166 |
"""Used internally in all functions to convert a single element to type (Adaptor|TextHandler) when possible"""
|
| 167 |
if element is None:
|
| 168 |
return None
|
|
@@ -172,9 +231,13 @@ class Adaptor(SelectorsGeneration):
|
|
| 172 |
else:
|
| 173 |
return self.__element_convertor(element)
|
| 174 |
|
| 175 |
-
def __handle_elements(
|
|
|
|
|
|
|
| 176 |
"""Used internally in all functions to convert results to type (Adaptors|TextHandlers) in bulk when possible"""
|
| 177 |
-
if not len(
|
|
|
|
|
|
|
| 178 |
return Adaptors([])
|
| 179 |
|
| 180 |
# From within the code, this method will always get a list of the same type
|
|
@@ -209,7 +272,16 @@ class Adaptor(SelectorsGeneration):
|
|
| 209 |
self.__text = TextHandler(self._root.text)
|
| 210 |
return self.__text
|
| 211 |
|
| 212 |
-
def get_all_text(
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 213 |
"""Get all child strings of this element, concatenated using the given separator.
|
| 214 |
|
| 215 |
:param separator: Strings will be concatenated using this separator.
|
|
@@ -220,7 +292,7 @@ class Adaptor(SelectorsGeneration):
|
|
| 220 |
:return: A TextHandler
|
| 221 |
"""
|
| 222 |
_all_strings = []
|
| 223 |
-
for node in self._root.xpath(
|
| 224 |
if node.tag not in ignore_tags:
|
| 225 |
text = node.text
|
| 226 |
if text and type(text) is str:
|
|
@@ -245,13 +317,25 @@ class Adaptor(SelectorsGeneration):
|
|
| 245 |
@property
|
| 246 |
def html_content(self) -> TextHandler:
|
| 247 |
"""Return the inner html code of the element"""
|
| 248 |
-
return TextHandler(
|
|
|
|
|
|
|
|
|
|
|
|
|
| 249 |
|
| 250 |
body = html_content
|
| 251 |
|
| 252 |
def prettify(self) -> TextHandler:
|
| 253 |
"""Return a prettified version of the element's inner html-code"""
|
| 254 |
-
return TextHandler(
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 255 |
|
| 256 |
def has_class(self, class_name: str) -> bool:
|
| 257 |
"""Check if element has a specific class
|
|
@@ -261,36 +345,44 @@ class Adaptor(SelectorsGeneration):
|
|
| 261 |
return class_name in self._root.classes
|
| 262 |
|
| 263 |
@property
|
| 264 |
-
def parent(self) -> Union[
|
| 265 |
"""Return the direct parent of the element or ``None`` otherwise"""
|
| 266 |
return self.__handle_element(self._root.getparent())
|
| 267 |
|
| 268 |
@property
|
| 269 |
-
def below_elements(self) ->
|
| 270 |
"""Return all elements under the current element in the DOM tree"""
|
| 271 |
-
below = self._root.xpath(
|
| 272 |
return self.__handle_elements(below)
|
| 273 |
|
| 274 |
@property
|
| 275 |
-
def children(self) ->
|
| 276 |
"""Return the children elements of the current element or empty list otherwise"""
|
| 277 |
-
return Adaptors(
|
| 278 |
-
|
| 279 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
| 280 |
|
| 281 |
@property
|
| 282 |
-
def siblings(self) ->
|
| 283 |
"""Return other children of the current element's parent or empty list otherwise"""
|
| 284 |
if self.parent:
|
| 285 |
-
return Adaptors(
|
|
|
|
|
|
|
| 286 |
return Adaptors([])
|
| 287 |
|
| 288 |
-
def iterancestors(self) -> Generator[
|
| 289 |
"""Return a generator that loops over all ancestors of the element, starting with element's parent."""
|
| 290 |
for ancestor in self._root.iterancestors():
|
| 291 |
yield self.__element_convertor(ancestor)
|
| 292 |
|
| 293 |
-
def find_ancestor(
|
|
|
|
|
|
|
| 294 |
"""Loop over all ancestors of the element till one match the passed function
|
| 295 |
:param func: A function that takes each ancestor as an argument and returns True/False
|
| 296 |
:return: The first ancestor that match the function or ``None`` otherwise.
|
|
@@ -301,13 +393,13 @@ class Adaptor(SelectorsGeneration):
|
|
| 301 |
return None
|
| 302 |
|
| 303 |
@property
|
| 304 |
-
def path(self) ->
|
| 305 |
"""Returns list of type :class:`Adaptors` that contains the path leading to the current element from the root."""
|
| 306 |
lst = list(self.iterancestors())
|
| 307 |
return Adaptors(lst)
|
| 308 |
|
| 309 |
@property
|
| 310 |
-
def next(self) -> Union[
|
| 311 |
"""Returns the next element of the current element in the children of the parent or ``None`` otherwise."""
|
| 312 |
next_element = self._root.getnext()
|
| 313 |
if next_element is not None:
|
|
@@ -318,7 +410,7 @@ class Adaptor(SelectorsGeneration):
|
|
| 318 |
return self.__handle_element(next_element)
|
| 319 |
|
| 320 |
@property
|
| 321 |
-
def previous(self) -> Union[
|
| 322 |
"""Returns the previous element of the current element in the children of the parent or ``None`` otherwise."""
|
| 323 |
prev_element = self._root.getprevious()
|
| 324 |
if prev_element is not None:
|
|
@@ -346,13 +438,13 @@ class Adaptor(SelectorsGeneration):
|
|
| 346 |
data = "<"
|
| 347 |
content = clean_spaces(self.html_content)
|
| 348 |
if len(content) > length_limit:
|
| 349 |
-
content = content[:length_limit].strip() +
|
| 350 |
data += f"data='{content}'"
|
| 351 |
|
| 352 |
if self.parent:
|
| 353 |
parent_content = clean_spaces(self.parent.html_content)
|
| 354 |
if len(parent_content) > length_limit:
|
| 355 |
-
parent_content = parent_content[:length_limit].strip() +
|
| 356 |
|
| 357 |
data += f" parent='{parent_content}'"
|
| 358 |
|
|
@@ -360,8 +452,11 @@ class Adaptor(SelectorsGeneration):
|
|
| 360 |
|
| 361 |
# From here we start the selecting functions
|
| 362 |
def relocate(
|
| 363 |
-
|
| 364 |
-
|
|
|
|
|
|
|
|
|
|
| 365 |
"""This function will search again for the element in the page tree, used automatically on page structure change
|
| 366 |
|
| 367 |
:param element: The element we want to relocate in the tree
|
|
@@ -379,7 +474,7 @@ class Adaptor(SelectorsGeneration):
|
|
| 379 |
if issubclass(type(element), html.HtmlElement):
|
| 380 |
element = _StorageTools.element_to_dict(element)
|
| 381 |
|
| 382 |
-
for node in self._root.xpath(
|
| 383 |
# Collect all elements in the page then for each element get the matching score of it against the node.
|
| 384 |
# Hence: the code doesn't stop even if the score was 100%
|
| 385 |
# because there might be another element(s) left in page with the same score
|
|
@@ -391,19 +486,26 @@ class Adaptor(SelectorsGeneration):
|
|
| 391 |
if score_table[highest_probability] and highest_probability >= percentage:
|
| 392 |
if log.getEffectiveLevel() < 20:
|
| 393 |
# No need to execute this part if logging level is not debugging
|
| 394 |
-
log.debug(f
|
| 395 |
-
log.debug(
|
| 396 |
for percent in tuple(sorted(score_table.keys(), reverse=True))[:5]:
|
| 397 |
-
log.debug(
|
|
|
|
|
|
|
| 398 |
|
| 399 |
if not adaptor_type:
|
| 400 |
return score_table[highest_probability]
|
| 401 |
return self.__handle_elements(score_table[highest_probability])
|
| 402 |
return []
|
| 403 |
|
| 404 |
-
def css_first(
|
| 405 |
-
|
| 406 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 407 |
"""Search current tree with CSS3 selectors and return the first result if possible, otherwise return `None`
|
| 408 |
|
| 409 |
**Important:
|
|
@@ -419,13 +521,21 @@ class Adaptor(SelectorsGeneration):
|
|
| 419 |
Be aware that the percentage calculation depends solely on the page structure so don't play with this
|
| 420 |
number unless you must know what you are doing!
|
| 421 |
"""
|
| 422 |
-
for element in self.css(
|
|
|
|
|
|
|
| 423 |
return element
|
| 424 |
return None
|
| 425 |
|
| 426 |
-
def xpath_first(
|
| 427 |
-
|
| 428 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 429 |
"""Search current tree with XPath selectors and return the first result if possible, otherwise return `None`
|
| 430 |
|
| 431 |
**Important:
|
|
@@ -443,13 +553,20 @@ class Adaptor(SelectorsGeneration):
|
|
| 443 |
Be aware that the percentage calculation depends solely on the page structure so don't play with this
|
| 444 |
number unless you must know what you are doing!
|
| 445 |
"""
|
| 446 |
-
for element in self.xpath(
|
|
|
|
|
|
|
| 447 |
return element
|
| 448 |
return None
|
| 449 |
|
| 450 |
-
def css(
|
| 451 |
-
|
| 452 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 453 |
"""Search current tree with CSS3 selectors
|
| 454 |
|
| 455 |
**Important:
|
|
@@ -468,28 +585,49 @@ class Adaptor(SelectorsGeneration):
|
|
| 468 |
:return: List as :class:`Adaptors`
|
| 469 |
"""
|
| 470 |
try:
|
| 471 |
-
if not self.__auto_match_enabled or
|
| 472 |
# No need to split selectors in this case, let's save some CPU cycles :)
|
| 473 |
xpath_selector = translator_instance.css_to_xpath(selector)
|
| 474 |
-
return self.xpath(
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 475 |
|
| 476 |
results = []
|
| 477 |
-
if
|
| 478 |
for single_selector in split_selectors(selector):
|
| 479 |
# I'm doing this only so the `save` function save data correctly for combined selectors
|
| 480 |
# Like using the ',' to combine two different selectors that point to different elements.
|
| 481 |
-
xpath_selector = translator_instance.css_to_xpath(
|
|
|
|
|
|
|
| 482 |
results += self.xpath(
|
| 483 |
-
xpath_selector,
|
|
|
|
|
|
|
|
|
|
|
|
|
| 484 |
)
|
| 485 |
|
| 486 |
return results
|
| 487 |
-
except (
|
|
|
|
|
|
|
|
|
|
| 488 |
raise SelectorSyntaxError(f"Invalid CSS selector: {selector}")
|
| 489 |
|
| 490 |
-
def xpath(
|
| 491 |
-
|
| 492 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 493 |
"""Search current tree with XPath selectors
|
| 494 |
|
| 495 |
**Important:
|
|
@@ -515,7 +653,9 @@ class Adaptor(SelectorsGeneration):
|
|
| 515 |
if elements:
|
| 516 |
if auto_save:
|
| 517 |
if not self.__auto_match_enabled:
|
| 518 |
-
log.warning(
|
|
|
|
|
|
|
| 519 |
else:
|
| 520 |
self.save(elements[0], identifier or selector)
|
| 521 |
|
|
@@ -531,16 +671,29 @@ class Adaptor(SelectorsGeneration):
|
|
| 531 |
return self.__handle_elements(elements)
|
| 532 |
else:
|
| 533 |
if auto_match:
|
| 534 |
-
log.warning(
|
|
|
|
|
|
|
| 535 |
elif auto_save:
|
| 536 |
-
log.warning(
|
|
|
|
|
|
|
| 537 |
|
| 538 |
return self.__handle_elements(elements)
|
| 539 |
|
| 540 |
-
except (
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 541 |
raise SelectorSyntaxError(f"Invalid XPath selector: {selector}")
|
| 542 |
|
| 543 |
-
def find_all(
|
|
|
|
|
|
|
|
|
|
|
|
|
| 544 |
"""Find elements by filters of your creations for ease..
|
| 545 |
|
| 546 |
:param args: Tag name(s), an iterable of tag names, regex patterns, function, or a dictionary of elements' attributes. Leave empty for selecting all.
|
|
@@ -551,12 +704,14 @@ class Adaptor(SelectorsGeneration):
|
|
| 551 |
# Ex: find_all('a', class="blah") -> find_all('a', class_="blah")
|
| 552 |
# https://www.w3schools.com/python/python_ref_keywords.asp
|
| 553 |
whitelisted = {
|
| 554 |
-
|
| 555 |
-
|
| 556 |
}
|
| 557 |
|
| 558 |
if not args and not kwargs:
|
| 559 |
-
raise TypeError(
|
|
|
|
|
|
|
| 560 |
|
| 561 |
attributes = dict()
|
| 562 |
tags, patterns = set(), set()
|
|
@@ -569,12 +724,18 @@ class Adaptor(SelectorsGeneration):
|
|
| 569 |
|
| 570 |
elif type(arg) in [list, tuple, set]:
|
| 571 |
if not all(map(lambda x: type(x) is str, arg)):
|
| 572 |
-
raise TypeError(
|
|
|
|
|
|
|
| 573 |
tags.update(set(arg))
|
| 574 |
|
| 575 |
elif isinstance(arg, dict):
|
| 576 |
-
if not all(
|
| 577 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
| 578 |
attributes.update(arg)
|
| 579 |
|
| 580 |
elif isinstance(arg, re.Pattern):
|
|
@@ -584,13 +745,17 @@ class Adaptor(SelectorsGeneration):
|
|
| 584 |
if len(inspect.signature(arg).parameters) > 0:
|
| 585 |
functions.append(arg)
|
| 586 |
else:
|
| 587 |
-
raise TypeError(
|
|
|
|
|
|
|
| 588 |
|
| 589 |
else:
|
| 590 |
-
raise TypeError(
|
|
|
|
|
|
|
| 591 |
|
| 592 |
if not all([(type(k) is str and type(v) is str) for k, v in kwargs.items()]):
|
| 593 |
-
raise TypeError(
|
| 594 |
|
| 595 |
for attribute_name, value in kwargs.items():
|
| 596 |
# Only replace names for kwargs, replacing them in dictionaries doesn't make sense
|
|
@@ -598,22 +763,24 @@ class Adaptor(SelectorsGeneration):
|
|
| 598 |
attributes[attribute_name] = value
|
| 599 |
|
| 600 |
# It's easier and faster to build a selector than traversing the tree
|
| 601 |
-
tags = tags or [
|
| 602 |
for tag in tags:
|
| 603 |
selector = tag
|
| 604 |
for key, value in attributes.items():
|
| 605 |
-
value = value.replace('"', r
|
| 606 |
# Not escaping anything with the key so the user can pass patterns like {'href*': '/p/'} or get errors :)
|
| 607 |
selector += '[{}="{}"]'.format(key, value)
|
| 608 |
-
if selector !=
|
| 609 |
selectors.append(selector)
|
| 610 |
|
| 611 |
if selectors:
|
| 612 |
-
results = self.css(
|
| 613 |
if results:
|
| 614 |
# From the results, get the ones that fulfill passed regex patterns
|
| 615 |
for pattern in patterns:
|
| 616 |
-
results = results.filter(
|
|
|
|
|
|
|
| 617 |
|
| 618 |
# From the results, get the ones that fulfill passed functions
|
| 619 |
for function in functions:
|
|
@@ -629,7 +796,11 @@ class Adaptor(SelectorsGeneration):
|
|
| 629 |
|
| 630 |
return results
|
| 631 |
|
| 632 |
-
def find(
|
|
|
|
|
|
|
|
|
|
|
|
|
| 633 |
"""Find elements by filters of your creations for ease then return the first result. Otherwise return `None`.
|
| 634 |
|
| 635 |
:param args: Tag name(s), an iterable of tag names, regex patterns, function, or a dictionary of elements' attributes. Leave empty for selecting all.
|
|
@@ -640,7 +811,9 @@ class Adaptor(SelectorsGeneration):
|
|
| 640 |
return element
|
| 641 |
return None
|
| 642 |
|
| 643 |
-
def __calculate_similarity_score(
|
|
|
|
|
|
|
| 644 |
"""Used internally to calculate a score that shows how candidate element similar to the original one
|
| 645 |
|
| 646 |
:param original: The original element in the form of the dictionary generated from `element_to_dict` function
|
|
@@ -653,53 +826,68 @@ class Adaptor(SelectorsGeneration):
|
|
| 653 |
# Possible TODO:
|
| 654 |
# Study the idea of giving weight to each test below so some are more important than others
|
| 655 |
# Current results: With weights some websites had better score while it was worse for others
|
| 656 |
-
score += 1 if original[
|
| 657 |
checks += 1
|
| 658 |
|
| 659 |
-
if original[
|
| 660 |
-
score += SequenceMatcher(
|
|
|
|
|
|
|
| 661 |
checks += 1
|
| 662 |
|
| 663 |
# if both doesn't have attributes, it still count for something!
|
| 664 |
-
score += self.__calculate_dict_diff(
|
|
|
|
|
|
|
| 665 |
checks += 1
|
| 666 |
|
| 667 |
# Separate similarity test for class, id, href,... this will help in full structural changes
|
| 668 |
-
for attrib in (
|
| 669 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 670 |
score += SequenceMatcher(
|
| 671 |
-
None,
|
|
|
|
|
|
|
| 672 |
).ratio() # * 0.3 # 30%
|
| 673 |
checks += 1
|
| 674 |
|
| 675 |
-
score += SequenceMatcher(
|
|
|
|
|
|
|
| 676 |
checks += 1
|
| 677 |
|
| 678 |
-
if original.get(
|
| 679 |
# Then we start comparing parents' data
|
| 680 |
-
if candidate.get(
|
| 681 |
score += SequenceMatcher(
|
| 682 |
-
None, original[
|
| 683 |
).ratio() # * 0.2 # 20%
|
| 684 |
checks += 1
|
| 685 |
|
| 686 |
score += self.__calculate_dict_diff(
|
| 687 |
-
original[
|
| 688 |
) # * 0.2 # 20%
|
| 689 |
checks += 1
|
| 690 |
|
| 691 |
-
if original[
|
| 692 |
score += SequenceMatcher(
|
| 693 |
-
None,
|
|
|
|
|
|
|
| 694 |
).ratio() # * 0.1 # 10%
|
| 695 |
checks += 1
|
| 696 |
# else:
|
| 697 |
# # The original element have a parent and this one not, this is not a good sign
|
| 698 |
# score -= 0.1
|
| 699 |
|
| 700 |
-
if original.get(
|
| 701 |
score += SequenceMatcher(
|
| 702 |
-
None, original[
|
| 703 |
).ratio() # * 0.1 # 10%
|
| 704 |
checks += 1
|
| 705 |
|
|
@@ -708,13 +896,20 @@ class Adaptor(SelectorsGeneration):
|
|
| 708 |
|
| 709 |
@staticmethod
|
| 710 |
def __calculate_dict_diff(dict1: dict, dict2: dict) -> float:
|
| 711 |
-
"""Used internally calculate similarity between two dictionaries as SequenceMatcher doesn't accept dictionaries
|
| 712 |
-
|
| 713 |
-
|
| 714 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 715 |
return score
|
| 716 |
|
| 717 |
-
def save(
|
|
|
|
|
|
|
| 718 |
"""Saves the element's unique properties to the storage for retrieval and relocation later
|
| 719 |
|
| 720 |
:param element: The element itself that we want to save to storage, it can be a `Adaptor` or pure `HtmlElement`
|
|
@@ -756,8 +951,13 @@ class Adaptor(SelectorsGeneration):
|
|
| 756 |
else:
|
| 757 |
return self.get_all_text(strip=True).json()
|
| 758 |
|
| 759 |
-
def re(
|
| 760 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 761 |
"""Apply the given regex to the current text and return a list of strings with the matches.
|
| 762 |
|
| 763 |
:param regex: Can be either a compiled regular expression or a string.
|
|
@@ -767,8 +967,14 @@ class Adaptor(SelectorsGeneration):
|
|
| 767 |
"""
|
| 768 |
return self.text.re(regex, replace_entities, clean_match, case_sensitive)
|
| 769 |
|
| 770 |
-
def re_first(
|
| 771 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 772 |
"""Apply the given regex to text and return the first match if found, otherwise return the default value.
|
| 773 |
|
| 774 |
:param regex: Can be either a compiled regular expression or a string.
|
|
@@ -777,14 +983,19 @@ class Adaptor(SelectorsGeneration):
|
|
| 777 |
:param clean_match: if enabled, this will ignore all whitespaces and consecutive spaces while matching
|
| 778 |
:param case_sensitive: if disabled, function will set the regex to ignore letters case while compiling it
|
| 779 |
"""
|
| 780 |
-
return self.text.re_first(
|
|
|
|
|
|
|
| 781 |
|
| 782 |
def find_similar(
|
| 783 |
-
|
| 784 |
-
|
| 785 |
-
|
| 786 |
-
|
| 787 |
-
|
|
|
|
|
|
|
|
|
|
| 788 |
"""Find elements that are in the same tree depth in the page with the same tag name and same parent tag etc...
|
| 789 |
then return the ones that match the current element attributes with percentage higher than the input threshold.
|
| 790 |
|
|
@@ -805,19 +1016,28 @@ class Adaptor(SelectorsGeneration):
|
|
| 805 |
|
| 806 |
:return: A ``Adaptors`` container of ``Adaptor`` objects or empty list
|
| 807 |
"""
|
|
|
|
| 808 |
def get_attributes(element: html.HtmlElement) -> Dict:
|
| 809 |
"""Return attributes dictionary without the ignored list"""
|
| 810 |
-
return {
|
| 811 |
-
|
| 812 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 813 |
"""Calculate a score of how much these elements are alike and return True
|
| 814 |
-
|
| 815 |
-
candidate_attributes =
|
|
|
|
|
|
|
| 816 |
score, checks = 0, 0
|
| 817 |
|
| 818 |
if original_attributes:
|
| 819 |
score += sum(
|
| 820 |
-
SequenceMatcher(None, v, candidate_attributes.get(k,
|
| 821 |
for k, v in original_attributes.items()
|
| 822 |
)
|
| 823 |
checks += len(candidate_attributes)
|
|
@@ -829,7 +1049,9 @@ class Adaptor(SelectorsGeneration):
|
|
| 829 |
|
| 830 |
if match_text:
|
| 831 |
score += SequenceMatcher(
|
| 832 |
-
None,
|
|
|
|
|
|
|
| 833 |
).ratio()
|
| 834 |
checks += 1
|
| 835 |
|
|
@@ -851,20 +1073,30 @@ class Adaptor(SelectorsGeneration):
|
|
| 851 |
f"//{grandparent.tag}/{parent.tag}/{self.tag}[count(ancestor::*) = {current_depth}]"
|
| 852 |
)
|
| 853 |
else:
|
| 854 |
-
potential_matches = root.xpath(
|
|
|
|
|
|
|
| 855 |
else:
|
| 856 |
-
potential_matches = root.xpath(
|
|
|
|
|
|
|
| 857 |
|
| 858 |
for potential_match in potential_matches:
|
| 859 |
-
if potential_match != root and are_alike(
|
|
|
|
|
|
|
| 860 |
similar_elements.append(potential_match)
|
| 861 |
|
| 862 |
return self.__handle_elements(similar_elements)
|
| 863 |
|
| 864 |
def find_by_text(
|
| 865 |
-
|
| 866 |
-
|
| 867 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
| 868 |
"""Find elements that its text content fully/partially matches input.
|
| 869 |
:param text: Text query to match
|
| 870 |
:param first_match: Return first element that matches conditions, enabled by default
|
|
@@ -878,7 +1110,9 @@ class Adaptor(SelectorsGeneration):
|
|
| 878 |
text = text.lower()
|
| 879 |
|
| 880 |
# This selector gets all elements with text content
|
| 881 |
-
for node in self.__handle_elements(
|
|
|
|
|
|
|
| 882 |
"""Check if element matches given text otherwise, traverse the children tree and iterate"""
|
| 883 |
node_text = node.text
|
| 884 |
if clean_match:
|
|
@@ -903,8 +1137,12 @@ class Adaptor(SelectorsGeneration):
|
|
| 903 |
return results
|
| 904 |
|
| 905 |
def find_by_regex(
|
| 906 |
-
|
| 907 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
| 908 |
"""Find elements that its text content matches the input regex pattern.
|
| 909 |
:param query: Regex query/pattern to match
|
| 910 |
:param first_match: Return first element that matches conditions, enabled by default
|
|
@@ -914,10 +1152,17 @@ class Adaptor(SelectorsGeneration):
|
|
| 914 |
results = Adaptors([])
|
| 915 |
|
| 916 |
# This selector gets all elements with text content
|
| 917 |
-
for node in self.__handle_elements(
|
|
|
|
|
|
|
| 918 |
"""Check if element matches given regex otherwise, traverse the children tree and iterate"""
|
| 919 |
node_text = node.text
|
| 920 |
-
if node_text.re(
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 921 |
results.append(node)
|
| 922 |
|
| 923 |
if first_match and results:
|
|
@@ -933,6 +1178,7 @@ class Adaptors(List[Adaptor]):
|
|
| 933 |
"""
|
| 934 |
The :class:`Adaptors` class is a subclass of the builtin ``List`` class, which provides a few additional methods.
|
| 935 |
"""
|
|
|
|
| 936 |
__slots__ = ()
|
| 937 |
|
| 938 |
@typing.overload
|
|
@@ -943,7 +1189,9 @@ class Adaptors(List[Adaptor]):
|
|
| 943 |
def __getitem__(self, pos: slice) -> "Adaptors":
|
| 944 |
pass
|
| 945 |
|
| 946 |
-
def __getitem__(
|
|
|
|
|
|
|
| 947 |
lst = super().__getitem__(pos)
|
| 948 |
if isinstance(pos, slice):
|
| 949 |
return self.__class__(lst)
|
|
@@ -951,7 +1199,12 @@ class Adaptors(List[Adaptor]):
|
|
| 951 |
return lst
|
| 952 |
|
| 953 |
def xpath(
|
| 954 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 955 |
) -> "Adaptors[Adaptor]":
|
| 956 |
"""
|
| 957 |
Call the ``.xpath()`` method for each element in this list and return
|
|
@@ -974,11 +1227,20 @@ class Adaptors(List[Adaptor]):
|
|
| 974 |
:return: List as :class:`Adaptors`
|
| 975 |
"""
|
| 976 |
results = [
|
| 977 |
-
n.xpath(
|
|
|
|
|
|
|
|
|
|
| 978 |
]
|
| 979 |
return self.__class__(flatten(results))
|
| 980 |
|
| 981 |
-
def css(
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 982 |
"""
|
| 983 |
Call the ``.css()`` method for each element in this list and return
|
| 984 |
their results flattened as another :class:`Adaptors`.
|
|
@@ -998,12 +1260,18 @@ class Adaptors(List[Adaptor]):
|
|
| 998 |
:return: List as :class:`Adaptors`
|
| 999 |
"""
|
| 1000 |
results = [
|
| 1001 |
-
n.css(selector, identifier or selector, False, auto_save, percentage)
|
|
|
|
| 1002 |
]
|
| 1003 |
return self.__class__(flatten(results))
|
| 1004 |
|
| 1005 |
-
def re(
|
| 1006 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1007 |
"""Call the ``.re()`` method for each element in this list and return
|
| 1008 |
their results flattened as List of TextHandler.
|
| 1009 |
|
|
@@ -1013,12 +1281,19 @@ class Adaptors(List[Adaptor]):
|
|
| 1013 |
:param case_sensitive: if disabled, function will set the regex to ignore letters case while compiling it
|
| 1014 |
"""
|
| 1015 |
results = [
|
| 1016 |
-
n.text.re(regex, replace_entities, clean_match, case_sensitive)
|
|
|
|
| 1017 |
]
|
| 1018 |
return TextHandlers(flatten(results))
|
| 1019 |
|
| 1020 |
-
def re_first(
|
| 1021 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1022 |
"""Call the ``.re_first()`` method for each element in this list and return
|
| 1023 |
the first result or the default value otherwise.
|
| 1024 |
|
|
@@ -1033,7 +1308,7 @@ class Adaptors(List[Adaptor]):
|
|
| 1033 |
return result
|
| 1034 |
return default
|
| 1035 |
|
| 1036 |
-
def search(self, func: Callable[[
|
| 1037 |
"""Loop over all current elements and return the first element that matches the passed function
|
| 1038 |
:param func: A function that takes each element as an argument and returns True/False
|
| 1039 |
:return: The first element that match the function or ``None`` otherwise.
|
|
@@ -1043,14 +1318,12 @@ class Adaptors(List[Adaptor]):
|
|
| 1043 |
return element
|
| 1044 |
return None
|
| 1045 |
|
| 1046 |
-
def filter(self, func: Callable[[
|
| 1047 |
"""Filter current elements based on the passed function
|
| 1048 |
:param func: A function that takes each element as an argument and returns True/False
|
| 1049 |
:return: The new `Adaptors` object or empty list otherwise.
|
| 1050 |
"""
|
| 1051 |
-
return self.__class__([
|
| 1052 |
-
element for element in self if func(element)
|
| 1053 |
-
])
|
| 1054 |
|
| 1055 |
# For easy copy-paste from Scrapy/parsel code when needed :)
|
| 1056 |
def get(self, default=None):
|
|
|
|
| 9 |
from cssselect import parse as split_selectors
|
| 10 |
from lxml import etree, html
|
| 11 |
|
| 12 |
+
from scrapling.core._types import (
|
| 13 |
+
Any,
|
| 14 |
+
Callable,
|
| 15 |
+
Dict,
|
| 16 |
+
Generator,
|
| 17 |
+
Iterable,
|
| 18 |
+
List,
|
| 19 |
+
Optional,
|
| 20 |
+
Pattern,
|
| 21 |
+
SupportsIndex,
|
| 22 |
+
Tuple,
|
| 23 |
+
Union,
|
| 24 |
+
)
|
| 25 |
+
from scrapling.core.custom_types import AttributesHandler, TextHandler, TextHandlers
|
| 26 |
from scrapling.core.mixins import SelectorsGeneration
|
| 27 |
+
from scrapling.core.storage_adaptors import (
|
| 28 |
+
SQLiteStorageSystem,
|
| 29 |
+
StorageSystemMixin,
|
| 30 |
+
_StorageTools,
|
| 31 |
+
)
|
| 32 |
from scrapling.core.translator import translator_instance
|
| 33 |
+
from scrapling.core.utils import clean_spaces, flatten, html_forbidden, is_jsonable, log
|
|
|
|
| 34 |
|
| 35 |
|
| 36 |
class Adaptor(SelectorsGeneration):
|
| 37 |
__slots__ = (
|
| 38 |
+
"url",
|
| 39 |
+
"encoding",
|
| 40 |
+
"__auto_match_enabled",
|
| 41 |
+
"_root",
|
| 42 |
+
"_storage",
|
| 43 |
+
"__keep_comments",
|
| 44 |
+
"__huge_tree_enabled",
|
| 45 |
+
"__attributes",
|
| 46 |
+
"__text",
|
| 47 |
+
"__tag",
|
| 48 |
+
"__keep_cdata",
|
| 49 |
)
|
| 50 |
|
| 51 |
def __init__(
|
| 52 |
+
self,
|
| 53 |
+
text: Optional[str] = None,
|
| 54 |
+
url: Optional[str] = None,
|
| 55 |
+
body: bytes = b"",
|
| 56 |
+
encoding: str = "utf8",
|
| 57 |
+
huge_tree: bool = True,
|
| 58 |
+
root: Optional[html.HtmlElement] = None,
|
| 59 |
+
keep_comments: Optional[bool] = False,
|
| 60 |
+
keep_cdata: Optional[bool] = False,
|
| 61 |
+
auto_match: Optional[bool] = False,
|
| 62 |
+
storage: Any = SQLiteStorageSystem,
|
| 63 |
+
storage_args: Optional[Dict] = None,
|
| 64 |
+
**kwargs,
|
| 65 |
):
|
| 66 |
"""The main class that works as a wrapper for the HTML input data. Using this class, you can search for elements
|
| 67 |
with expressions in CSS, XPath, or with simply text. Check the docs for more info.
|
|
|
|
| 88 |
If empty, default values will be used.
|
| 89 |
"""
|
| 90 |
if root is None and not body and text is None:
|
| 91 |
+
raise ValueError(
|
| 92 |
+
"Adaptor class needs text, body, or root arguments to work"
|
| 93 |
+
)
|
| 94 |
|
| 95 |
+
self.__text = ""
|
| 96 |
if root is None:
|
| 97 |
if text is None:
|
| 98 |
if not body or not isinstance(body, bytes):
|
| 99 |
+
raise TypeError(
|
| 100 |
+
f"body argument must be valid and of type bytes, got {body.__class__}"
|
| 101 |
+
)
|
| 102 |
|
| 103 |
body = body.replace(b"\x00", b"").strip()
|
| 104 |
else:
|
| 105 |
if not isinstance(text, str):
|
| 106 |
+
raise TypeError(
|
| 107 |
+
f"text argument must be of type str, got {text.__class__}"
|
| 108 |
+
)
|
| 109 |
|
| 110 |
body = text.strip().replace("\x00", "").encode(encoding) or b"<html/>"
|
| 111 |
|
| 112 |
# https://lxml.de/api/lxml.etree.HTMLParser-class.html
|
| 113 |
parser = html.HTMLParser(
|
| 114 |
+
recover=True,
|
| 115 |
+
remove_blank_text=True,
|
| 116 |
+
remove_comments=(not keep_comments),
|
| 117 |
+
encoding=encoding,
|
| 118 |
+
compact=True,
|
| 119 |
+
huge_tree=huge_tree,
|
| 120 |
+
default_doctype=True,
|
| 121 |
+
strip_cdata=(not keep_cdata),
|
| 122 |
)
|
| 123 |
self._root = etree.fromstring(body, parser=parser, base_url=url)
|
| 124 |
if is_jsonable(text or body.decode()):
|
|
|
|
| 138 |
if self.__auto_match_enabled:
|
| 139 |
if not storage_args:
|
| 140 |
storage_args = {
|
| 141 |
+
"storage_file": os.path.join(
|
| 142 |
+
os.path.dirname(__file__), "elements_storage.db"
|
| 143 |
+
),
|
| 144 |
+
"url": url,
|
| 145 |
}
|
| 146 |
|
| 147 |
+
if not hasattr(storage, "__wrapped__"):
|
| 148 |
+
raise ValueError(
|
| 149 |
+
"Storage class must be wrapped with lru_cache decorator, see docs for info"
|
| 150 |
+
)
|
| 151 |
|
| 152 |
if not issubclass(storage.__wrapped__, StorageSystemMixin):
|
| 153 |
+
raise ValueError(
|
| 154 |
+
"Storage system must be inherited from class `StorageSystemMixin`"
|
| 155 |
+
)
|
| 156 |
|
| 157 |
self._storage = storage(**storage_args)
|
| 158 |
|
|
|
|
| 165 |
self.__attributes = None
|
| 166 |
self.__tag = None
|
| 167 |
# No need to check if all response attributes exist or not because if `status` exist, then the rest exist (Save some CPU cycles for speed)
|
| 168 |
+
self.__response_data = (
|
| 169 |
+
{
|
| 170 |
+
key: getattr(self, key)
|
| 171 |
+
for key in (
|
| 172 |
+
"status",
|
| 173 |
+
"reason",
|
| 174 |
+
"cookies",
|
| 175 |
+
"history",
|
| 176 |
+
"headers",
|
| 177 |
+
"request_headers",
|
| 178 |
+
)
|
| 179 |
+
}
|
| 180 |
+
if hasattr(self, "status")
|
| 181 |
+
else {}
|
| 182 |
+
)
|
| 183 |
|
| 184 |
# Node functionalities, I wanted to move to separate Mixin class but it had slight impact on performance
|
| 185 |
@staticmethod
|
| 186 |
+
def _is_text_node(
|
| 187 |
+
element: Union[html.HtmlElement, etree._ElementUnicodeResult],
|
| 188 |
+
) -> bool:
|
| 189 |
"""Return True if given element is a result of a string expression
|
| 190 |
Examples:
|
| 191 |
XPath -> '/text()', '/@attribute' etc...
|
|
|
|
| 195 |
return issubclass(type(element), etree._ElementUnicodeResult)
|
| 196 |
|
| 197 |
@staticmethod
|
| 198 |
+
def __content_convertor(
|
| 199 |
+
element: Union[html.HtmlElement, etree._ElementUnicodeResult],
|
| 200 |
+
) -> TextHandler:
|
| 201 |
"""Used internally to convert a single element's text content to TextHandler directly without checks
|
| 202 |
|
| 203 |
This single line has been isolated like this so when it's used with map we get that slight performance boost vs list comprehension
|
| 204 |
"""
|
| 205 |
return TextHandler(str(element))
|
| 206 |
|
| 207 |
+
def __element_convertor(self, element: html.HtmlElement) -> "Adaptor":
|
| 208 |
"""Used internally to convert a single HtmlElement to Adaptor directly without checks"""
|
| 209 |
return Adaptor(
|
| 210 |
root=element,
|
| 211 |
+
text="",
|
| 212 |
+
body=b"", # Since root argument is provided, both `text` and `body` will be ignored so this is just a filler
|
| 213 |
+
url=self.url,
|
| 214 |
+
encoding=self.encoding,
|
| 215 |
+
auto_match=self.__auto_match_enabled,
|
| 216 |
+
keep_comments=self.__keep_comments,
|
| 217 |
+
keep_cdata=self.__keep_cdata,
|
| 218 |
huge_tree=self.__huge_tree_enabled,
|
| 219 |
+
**self.__response_data,
|
| 220 |
)
|
| 221 |
|
| 222 |
+
def __handle_element(
|
| 223 |
+
self, element: Union[html.HtmlElement, etree._ElementUnicodeResult]
|
| 224 |
+
) -> Union[TextHandler, "Adaptor", None]:
|
| 225 |
"""Used internally in all functions to convert a single element to type (Adaptor|TextHandler) when possible"""
|
| 226 |
if element is None:
|
| 227 |
return None
|
|
|
|
| 231 |
else:
|
| 232 |
return self.__element_convertor(element)
|
| 233 |
|
| 234 |
+
def __handle_elements(
|
| 235 |
+
self, result: List[Union[html.HtmlElement, etree._ElementUnicodeResult]]
|
| 236 |
+
) -> Union["Adaptors", "TextHandlers", List]:
|
| 237 |
"""Used internally in all functions to convert results to type (Adaptors|TextHandlers) in bulk when possible"""
|
| 238 |
+
if not len(
|
| 239 |
+
result
|
| 240 |
+
): # Lxml will give a warning if I used something like `not result`
|
| 241 |
return Adaptors([])
|
| 242 |
|
| 243 |
# From within the code, this method will always get a list of the same type
|
|
|
|
| 272 |
self.__text = TextHandler(self._root.text)
|
| 273 |
return self.__text
|
| 274 |
|
| 275 |
+
def get_all_text(
|
| 276 |
+
self,
|
| 277 |
+
separator: str = "\n",
|
| 278 |
+
strip: bool = False,
|
| 279 |
+
ignore_tags: Tuple = (
|
| 280 |
+
"script",
|
| 281 |
+
"style",
|
| 282 |
+
),
|
| 283 |
+
valid_values: bool = True,
|
| 284 |
+
) -> TextHandler:
|
| 285 |
"""Get all child strings of this element, concatenated using the given separator.
|
| 286 |
|
| 287 |
:param separator: Strings will be concatenated using this separator.
|
|
|
|
| 292 |
:return: A TextHandler
|
| 293 |
"""
|
| 294 |
_all_strings = []
|
| 295 |
+
for node in self._root.xpath(".//*"):
|
| 296 |
if node.tag not in ignore_tags:
|
| 297 |
text = node.text
|
| 298 |
if text and type(text) is str:
|
|
|
|
| 317 |
@property
|
| 318 |
def html_content(self) -> TextHandler:
|
| 319 |
"""Return the inner html code of the element"""
|
| 320 |
+
return TextHandler(
|
| 321 |
+
etree.tostring(
|
| 322 |
+
self._root, encoding="unicode", method="html", with_tail=False
|
| 323 |
+
)
|
| 324 |
+
)
|
| 325 |
|
| 326 |
body = html_content
|
| 327 |
|
| 328 |
def prettify(self) -> TextHandler:
|
| 329 |
"""Return a prettified version of the element's inner html-code"""
|
| 330 |
+
return TextHandler(
|
| 331 |
+
etree.tostring(
|
| 332 |
+
self._root,
|
| 333 |
+
encoding="unicode",
|
| 334 |
+
pretty_print=True,
|
| 335 |
+
method="html",
|
| 336 |
+
with_tail=False,
|
| 337 |
+
)
|
| 338 |
+
)
|
| 339 |
|
| 340 |
def has_class(self, class_name: str) -> bool:
|
| 341 |
"""Check if element has a specific class
|
|
|
|
| 345 |
return class_name in self._root.classes
|
| 346 |
|
| 347 |
@property
|
| 348 |
+
def parent(self) -> Union["Adaptor", None]:
|
| 349 |
"""Return the direct parent of the element or ``None`` otherwise"""
|
| 350 |
return self.__handle_element(self._root.getparent())
|
| 351 |
|
| 352 |
@property
|
| 353 |
+
def below_elements(self) -> "Adaptors[Adaptor]":
|
| 354 |
"""Return all elements under the current element in the DOM tree"""
|
| 355 |
+
below = self._root.xpath(".//*")
|
| 356 |
return self.__handle_elements(below)
|
| 357 |
|
| 358 |
@property
|
| 359 |
+
def children(self) -> "Adaptors[Adaptor]":
|
| 360 |
"""Return the children elements of the current element or empty list otherwise"""
|
| 361 |
+
return Adaptors(
|
| 362 |
+
[
|
| 363 |
+
self.__element_convertor(child)
|
| 364 |
+
for child in self._root.iterchildren()
|
| 365 |
+
if type(child) not in html_forbidden
|
| 366 |
+
]
|
| 367 |
+
)
|
| 368 |
|
| 369 |
@property
|
| 370 |
+
def siblings(self) -> "Adaptors[Adaptor]":
|
| 371 |
"""Return other children of the current element's parent or empty list otherwise"""
|
| 372 |
if self.parent:
|
| 373 |
+
return Adaptors(
|
| 374 |
+
[child for child in self.parent.children if child._root != self._root]
|
| 375 |
+
)
|
| 376 |
return Adaptors([])
|
| 377 |
|
| 378 |
+
def iterancestors(self) -> Generator["Adaptor", None, None]:
|
| 379 |
"""Return a generator that loops over all ancestors of the element, starting with element's parent."""
|
| 380 |
for ancestor in self._root.iterancestors():
|
| 381 |
yield self.__element_convertor(ancestor)
|
| 382 |
|
| 383 |
+
def find_ancestor(
|
| 384 |
+
self, func: Callable[["Adaptor"], bool]
|
| 385 |
+
) -> Union["Adaptor", None]:
|
| 386 |
"""Loop over all ancestors of the element till one match the passed function
|
| 387 |
:param func: A function that takes each ancestor as an argument and returns True/False
|
| 388 |
:return: The first ancestor that match the function or ``None`` otherwise.
|
|
|
|
| 393 |
return None
|
| 394 |
|
| 395 |
@property
|
| 396 |
+
def path(self) -> "Adaptors[Adaptor]":
|
| 397 |
"""Returns list of type :class:`Adaptors` that contains the path leading to the current element from the root."""
|
| 398 |
lst = list(self.iterancestors())
|
| 399 |
return Adaptors(lst)
|
| 400 |
|
| 401 |
@property
|
| 402 |
+
def next(self) -> Union["Adaptor", None]:
|
| 403 |
"""Returns the next element of the current element in the children of the parent or ``None`` otherwise."""
|
| 404 |
next_element = self._root.getnext()
|
| 405 |
if next_element is not None:
|
|
|
|
| 410 |
return self.__handle_element(next_element)
|
| 411 |
|
| 412 |
@property
|
| 413 |
+
def previous(self) -> Union["Adaptor", None]:
|
| 414 |
"""Returns the previous element of the current element in the children of the parent or ``None`` otherwise."""
|
| 415 |
prev_element = self._root.getprevious()
|
| 416 |
if prev_element is not None:
|
|
|
|
| 438 |
data = "<"
|
| 439 |
content = clean_spaces(self.html_content)
|
| 440 |
if len(content) > length_limit:
|
| 441 |
+
content = content[:length_limit].strip() + "..."
|
| 442 |
data += f"data='{content}'"
|
| 443 |
|
| 444 |
if self.parent:
|
| 445 |
parent_content = clean_spaces(self.parent.html_content)
|
| 446 |
if len(parent_content) > length_limit:
|
| 447 |
+
parent_content = parent_content[:length_limit].strip() + "..."
|
| 448 |
|
| 449 |
data += f" parent='{parent_content}'"
|
| 450 |
|
|
|
|
| 452 |
|
| 453 |
# From here we start the selecting functions
|
| 454 |
def relocate(
|
| 455 |
+
self,
|
| 456 |
+
element: Union[Dict, html.HtmlElement, "Adaptor"],
|
| 457 |
+
percentage: int = 0,
|
| 458 |
+
adaptor_type: bool = False,
|
| 459 |
+
) -> Union[List[Union[html.HtmlElement, None]], "Adaptors"]:
|
| 460 |
"""This function will search again for the element in the page tree, used automatically on page structure change
|
| 461 |
|
| 462 |
:param element: The element we want to relocate in the tree
|
|
|
|
| 474 |
if issubclass(type(element), html.HtmlElement):
|
| 475 |
element = _StorageTools.element_to_dict(element)
|
| 476 |
|
| 477 |
+
for node in self._root.xpath(".//*"):
|
| 478 |
# Collect all elements in the page then for each element get the matching score of it against the node.
|
| 479 |
# Hence: the code doesn't stop even if the score was 100%
|
| 480 |
# because there might be another element(s) left in page with the same score
|
|
|
|
| 486 |
if score_table[highest_probability] and highest_probability >= percentage:
|
| 487 |
if log.getEffectiveLevel() < 20:
|
| 488 |
# No need to execute this part if logging level is not debugging
|
| 489 |
+
log.debug(f"Highest probability was {highest_probability}%")
|
| 490 |
+
log.debug("Top 5 best matching elements are: ")
|
| 491 |
for percent in tuple(sorted(score_table.keys(), reverse=True))[:5]:
|
| 492 |
+
log.debug(
|
| 493 |
+
f"{percent} -> {self.__handle_elements(score_table[percent])}"
|
| 494 |
+
)
|
| 495 |
|
| 496 |
if not adaptor_type:
|
| 497 |
return score_table[highest_probability]
|
| 498 |
return self.__handle_elements(score_table[highest_probability])
|
| 499 |
return []
|
| 500 |
|
| 501 |
+
def css_first(
|
| 502 |
+
self,
|
| 503 |
+
selector: str,
|
| 504 |
+
identifier: str = "",
|
| 505 |
+
auto_match: bool = False,
|
| 506 |
+
auto_save: bool = False,
|
| 507 |
+
percentage: int = 0,
|
| 508 |
+
) -> Union["Adaptor", "TextHandler", None]:
|
| 509 |
"""Search current tree with CSS3 selectors and return the first result if possible, otherwise return `None`
|
| 510 |
|
| 511 |
**Important:
|
|
|
|
| 521 |
Be aware that the percentage calculation depends solely on the page structure so don't play with this
|
| 522 |
number unless you must know what you are doing!
|
| 523 |
"""
|
| 524 |
+
for element in self.css(
|
| 525 |
+
selector, identifier, auto_match, auto_save, percentage
|
| 526 |
+
):
|
| 527 |
return element
|
| 528 |
return None
|
| 529 |
|
| 530 |
+
def xpath_first(
|
| 531 |
+
self,
|
| 532 |
+
selector: str,
|
| 533 |
+
identifier: str = "",
|
| 534 |
+
auto_match: bool = False,
|
| 535 |
+
auto_save: bool = False,
|
| 536 |
+
percentage: int = 0,
|
| 537 |
+
**kwargs: Any,
|
| 538 |
+
) -> Union["Adaptor", "TextHandler", None]:
|
| 539 |
"""Search current tree with XPath selectors and return the first result if possible, otherwise return `None`
|
| 540 |
|
| 541 |
**Important:
|
|
|
|
| 553 |
Be aware that the percentage calculation depends solely on the page structure so don't play with this
|
| 554 |
number unless you must know what you are doing!
|
| 555 |
"""
|
| 556 |
+
for element in self.xpath(
|
| 557 |
+
selector, identifier, auto_match, auto_save, percentage, **kwargs
|
| 558 |
+
):
|
| 559 |
return element
|
| 560 |
return None
|
| 561 |
|
| 562 |
+
def css(
|
| 563 |
+
self,
|
| 564 |
+
selector: str,
|
| 565 |
+
identifier: str = "",
|
| 566 |
+
auto_match: bool = False,
|
| 567 |
+
auto_save: bool = False,
|
| 568 |
+
percentage: int = 0,
|
| 569 |
+
) -> Union["Adaptors[Adaptor]", List, "TextHandlers[TextHandler]"]:
|
| 570 |
"""Search current tree with CSS3 selectors
|
| 571 |
|
| 572 |
**Important:
|
|
|
|
| 585 |
:return: List as :class:`Adaptors`
|
| 586 |
"""
|
| 587 |
try:
|
| 588 |
+
if not self.__auto_match_enabled or "," not in selector:
|
| 589 |
# No need to split selectors in this case, let's save some CPU cycles :)
|
| 590 |
xpath_selector = translator_instance.css_to_xpath(selector)
|
| 591 |
+
return self.xpath(
|
| 592 |
+
xpath_selector,
|
| 593 |
+
identifier or selector,
|
| 594 |
+
auto_match,
|
| 595 |
+
auto_save,
|
| 596 |
+
percentage,
|
| 597 |
+
)
|
| 598 |
|
| 599 |
results = []
|
| 600 |
+
if "," in selector:
|
| 601 |
for single_selector in split_selectors(selector):
|
| 602 |
# I'm doing this only so the `save` function save data correctly for combined selectors
|
| 603 |
# Like using the ',' to combine two different selectors that point to different elements.
|
| 604 |
+
xpath_selector = translator_instance.css_to_xpath(
|
| 605 |
+
single_selector.canonical()
|
| 606 |
+
)
|
| 607 |
results += self.xpath(
|
| 608 |
+
xpath_selector,
|
| 609 |
+
identifier or single_selector.canonical(),
|
| 610 |
+
auto_match,
|
| 611 |
+
auto_save,
|
| 612 |
+
percentage,
|
| 613 |
)
|
| 614 |
|
| 615 |
return results
|
| 616 |
+
except (
|
| 617 |
+
SelectorError,
|
| 618 |
+
SelectorSyntaxError,
|
| 619 |
+
):
|
| 620 |
raise SelectorSyntaxError(f"Invalid CSS selector: {selector}")
|
| 621 |
|
| 622 |
+
def xpath(
|
| 623 |
+
self,
|
| 624 |
+
selector: str,
|
| 625 |
+
identifier: str = "",
|
| 626 |
+
auto_match: bool = False,
|
| 627 |
+
auto_save: bool = False,
|
| 628 |
+
percentage: int = 0,
|
| 629 |
+
**kwargs: Any,
|
| 630 |
+
) -> Union["Adaptors[Adaptor]", List, "TextHandlers[TextHandler]"]:
|
| 631 |
"""Search current tree with XPath selectors
|
| 632 |
|
| 633 |
**Important:
|
|
|
|
| 653 |
if elements:
|
| 654 |
if auto_save:
|
| 655 |
if not self.__auto_match_enabled:
|
| 656 |
+
log.warning(
|
| 657 |
+
"Argument `auto_save` will be ignored because `auto_match` wasn't enabled on initialization. Check docs for more info."
|
| 658 |
+
)
|
| 659 |
else:
|
| 660 |
self.save(elements[0], identifier or selector)
|
| 661 |
|
|
|
|
| 671 |
return self.__handle_elements(elements)
|
| 672 |
else:
|
| 673 |
if auto_match:
|
| 674 |
+
log.warning(
|
| 675 |
+
"Argument `auto_match` will be ignored because `auto_match` wasn't enabled on initialization. Check docs for more info."
|
| 676 |
+
)
|
| 677 |
elif auto_save:
|
| 678 |
+
log.warning(
|
| 679 |
+
"Argument `auto_save` will be ignored because `auto_match` wasn't enabled on initialization. Check docs for more info."
|
| 680 |
+
)
|
| 681 |
|
| 682 |
return self.__handle_elements(elements)
|
| 683 |
|
| 684 |
+
except (
|
| 685 |
+
SelectorError,
|
| 686 |
+
SelectorSyntaxError,
|
| 687 |
+
etree.XPathError,
|
| 688 |
+
etree.XPathEvalError,
|
| 689 |
+
):
|
| 690 |
raise SelectorSyntaxError(f"Invalid XPath selector: {selector}")
|
| 691 |
|
| 692 |
+
def find_all(
|
| 693 |
+
self,
|
| 694 |
+
*args: Union[str, Iterable[str], Pattern, Callable, Dict[str, str]],
|
| 695 |
+
**kwargs: str,
|
| 696 |
+
) -> "Adaptors":
|
| 697 |
"""Find elements by filters of your creations for ease..
|
| 698 |
|
| 699 |
:param args: Tag name(s), an iterable of tag names, regex patterns, function, or a dictionary of elements' attributes. Leave empty for selecting all.
|
|
|
|
| 704 |
# Ex: find_all('a', class="blah") -> find_all('a', class_="blah")
|
| 705 |
# https://www.w3schools.com/python/python_ref_keywords.asp
|
| 706 |
whitelisted = {
|
| 707 |
+
"class_": "class",
|
| 708 |
+
"for_": "for",
|
| 709 |
}
|
| 710 |
|
| 711 |
if not args and not kwargs:
|
| 712 |
+
raise TypeError(
|
| 713 |
+
"You have to pass something to search with, like tag name(s), tag attributes, or both."
|
| 714 |
+
)
|
| 715 |
|
| 716 |
attributes = dict()
|
| 717 |
tags, patterns = set(), set()
|
|
|
|
| 724 |
|
| 725 |
elif type(arg) in [list, tuple, set]:
|
| 726 |
if not all(map(lambda x: type(x) is str, arg)):
|
| 727 |
+
raise TypeError(
|
| 728 |
+
"Nested Iterables are not accepted, only iterables of tag names are accepted"
|
| 729 |
+
)
|
| 730 |
tags.update(set(arg))
|
| 731 |
|
| 732 |
elif isinstance(arg, dict):
|
| 733 |
+
if not all(
|
| 734 |
+
[(type(k) is str and type(v) is str) for k, v in arg.items()]
|
| 735 |
+
):
|
| 736 |
+
raise TypeError(
|
| 737 |
+
"Nested dictionaries are not accepted, only string keys and string values are accepted"
|
| 738 |
+
)
|
| 739 |
attributes.update(arg)
|
| 740 |
|
| 741 |
elif isinstance(arg, re.Pattern):
|
|
|
|
| 745 |
if len(inspect.signature(arg).parameters) > 0:
|
| 746 |
functions.append(arg)
|
| 747 |
else:
|
| 748 |
+
raise TypeError(
|
| 749 |
+
"Callable filter function must have at least one argument to take `Adaptor` objects."
|
| 750 |
+
)
|
| 751 |
|
| 752 |
else:
|
| 753 |
+
raise TypeError(
|
| 754 |
+
f'Argument with type "{type(arg)}" is not accepted, please read the docs.'
|
| 755 |
+
)
|
| 756 |
|
| 757 |
if not all([(type(k) is str and type(v) is str) for k, v in kwargs.items()]):
|
| 758 |
+
raise TypeError("Only string values are accepted for arguments")
|
| 759 |
|
| 760 |
for attribute_name, value in kwargs.items():
|
| 761 |
# Only replace names for kwargs, replacing them in dictionaries doesn't make sense
|
|
|
|
| 763 |
attributes[attribute_name] = value
|
| 764 |
|
| 765 |
# It's easier and faster to build a selector than traversing the tree
|
| 766 |
+
tags = tags or ["*"]
|
| 767 |
for tag in tags:
|
| 768 |
selector = tag
|
| 769 |
for key, value in attributes.items():
|
| 770 |
+
value = value.replace('"', r"\"") # Escape double quotes in user input
|
| 771 |
# Not escaping anything with the key so the user can pass patterns like {'href*': '/p/'} or get errors :)
|
| 772 |
selector += '[{}="{}"]'.format(key, value)
|
| 773 |
+
if selector != "*":
|
| 774 |
selectors.append(selector)
|
| 775 |
|
| 776 |
if selectors:
|
| 777 |
+
results = self.css(", ".join(selectors))
|
| 778 |
if results:
|
| 779 |
# From the results, get the ones that fulfill passed regex patterns
|
| 780 |
for pattern in patterns:
|
| 781 |
+
results = results.filter(
|
| 782 |
+
lambda e: e.text.re(pattern, check_match=True)
|
| 783 |
+
)
|
| 784 |
|
| 785 |
# From the results, get the ones that fulfill passed functions
|
| 786 |
for function in functions:
|
|
|
|
| 796 |
|
| 797 |
return results
|
| 798 |
|
| 799 |
+
def find(
|
| 800 |
+
self,
|
| 801 |
+
*args: Union[str, Iterable[str], Pattern, Callable, Dict[str, str]],
|
| 802 |
+
**kwargs: str,
|
| 803 |
+
) -> Union["Adaptor", None]:
|
| 804 |
"""Find elements by filters of your creations for ease then return the first result. Otherwise return `None`.
|
| 805 |
|
| 806 |
:param args: Tag name(s), an iterable of tag names, regex patterns, function, or a dictionary of elements' attributes. Leave empty for selecting all.
|
|
|
|
| 811 |
return element
|
| 812 |
return None
|
| 813 |
|
| 814 |
+
def __calculate_similarity_score(
|
| 815 |
+
self, original: Dict, candidate: html.HtmlElement
|
| 816 |
+
) -> float:
|
| 817 |
"""Used internally to calculate a score that shows how candidate element similar to the original one
|
| 818 |
|
| 819 |
:param original: The original element in the form of the dictionary generated from `element_to_dict` function
|
|
|
|
| 826 |
# Possible TODO:
|
| 827 |
# Study the idea of giving weight to each test below so some are more important than others
|
| 828 |
# Current results: With weights some websites had better score while it was worse for others
|
| 829 |
+
score += 1 if original["tag"] == candidate["tag"] else 0 # * 0.3 # 30%
|
| 830 |
checks += 1
|
| 831 |
|
| 832 |
+
if original["text"]:
|
| 833 |
+
score += SequenceMatcher(
|
| 834 |
+
None, original["text"], candidate.get("text") or ""
|
| 835 |
+
).ratio() # * 0.3 # 30%
|
| 836 |
checks += 1
|
| 837 |
|
| 838 |
# if both doesn't have attributes, it still count for something!
|
| 839 |
+
score += self.__calculate_dict_diff(
|
| 840 |
+
original["attributes"], candidate["attributes"]
|
| 841 |
+
) # * 0.3 # 30%
|
| 842 |
checks += 1
|
| 843 |
|
| 844 |
# Separate similarity test for class, id, href,... this will help in full structural changes
|
| 845 |
+
for attrib in (
|
| 846 |
+
"class",
|
| 847 |
+
"id",
|
| 848 |
+
"href",
|
| 849 |
+
"src",
|
| 850 |
+
):
|
| 851 |
+
if original["attributes"].get(attrib):
|
| 852 |
score += SequenceMatcher(
|
| 853 |
+
None,
|
| 854 |
+
original["attributes"][attrib],
|
| 855 |
+
candidate["attributes"].get(attrib) or "",
|
| 856 |
).ratio() # * 0.3 # 30%
|
| 857 |
checks += 1
|
| 858 |
|
| 859 |
+
score += SequenceMatcher(
|
| 860 |
+
None, original["path"], candidate["path"]
|
| 861 |
+
).ratio() # * 0.1 # 10%
|
| 862 |
checks += 1
|
| 863 |
|
| 864 |
+
if original.get("parent_name"):
|
| 865 |
# Then we start comparing parents' data
|
| 866 |
+
if candidate.get("parent_name"):
|
| 867 |
score += SequenceMatcher(
|
| 868 |
+
None, original["parent_name"], candidate.get("parent_name") or ""
|
| 869 |
).ratio() # * 0.2 # 20%
|
| 870 |
checks += 1
|
| 871 |
|
| 872 |
score += self.__calculate_dict_diff(
|
| 873 |
+
original["parent_attribs"], candidate.get("parent_attribs") or {}
|
| 874 |
) # * 0.2 # 20%
|
| 875 |
checks += 1
|
| 876 |
|
| 877 |
+
if original["parent_text"]:
|
| 878 |
score += SequenceMatcher(
|
| 879 |
+
None,
|
| 880 |
+
original["parent_text"],
|
| 881 |
+
candidate.get("parent_text") or "",
|
| 882 |
).ratio() # * 0.1 # 10%
|
| 883 |
checks += 1
|
| 884 |
# else:
|
| 885 |
# # The original element have a parent and this one not, this is not a good sign
|
| 886 |
# score -= 0.1
|
| 887 |
|
| 888 |
+
if original.get("siblings"):
|
| 889 |
score += SequenceMatcher(
|
| 890 |
+
None, original["siblings"], candidate.get("siblings") or []
|
| 891 |
).ratio() # * 0.1 # 10%
|
| 892 |
checks += 1
|
| 893 |
|
|
|
|
| 896 |
|
| 897 |
@staticmethod
|
| 898 |
def __calculate_dict_diff(dict1: dict, dict2: dict) -> float:
|
| 899 |
+
"""Used internally calculate similarity between two dictionaries as SequenceMatcher doesn't accept dictionaries"""
|
| 900 |
+
score = (
|
| 901 |
+
SequenceMatcher(None, tuple(dict1.keys()), tuple(dict2.keys())).ratio()
|
| 902 |
+
* 0.5
|
| 903 |
+
)
|
| 904 |
+
score += (
|
| 905 |
+
SequenceMatcher(None, tuple(dict1.values()), tuple(dict2.values())).ratio()
|
| 906 |
+
* 0.5
|
| 907 |
+
)
|
| 908 |
return score
|
| 909 |
|
| 910 |
+
def save(
|
| 911 |
+
self, element: Union["Adaptor", html.HtmlElement], identifier: str
|
| 912 |
+
) -> None:
|
| 913 |
"""Saves the element's unique properties to the storage for retrieval and relocation later
|
| 914 |
|
| 915 |
:param element: The element itself that we want to save to storage, it can be a `Adaptor` or pure `HtmlElement`
|
|
|
|
| 951 |
else:
|
| 952 |
return self.get_all_text(strip=True).json()
|
| 953 |
|
| 954 |
+
def re(
|
| 955 |
+
self,
|
| 956 |
+
regex: Union[str, Pattern[str]],
|
| 957 |
+
replace_entities: bool = True,
|
| 958 |
+
clean_match: bool = False,
|
| 959 |
+
case_sensitive: bool = True,
|
| 960 |
+
) -> TextHandlers:
|
| 961 |
"""Apply the given regex to the current text and return a list of strings with the matches.
|
| 962 |
|
| 963 |
:param regex: Can be either a compiled regular expression or a string.
|
|
|
|
| 967 |
"""
|
| 968 |
return self.text.re(regex, replace_entities, clean_match, case_sensitive)
|
| 969 |
|
| 970 |
+
def re_first(
|
| 971 |
+
self,
|
| 972 |
+
regex: Union[str, Pattern[str]],
|
| 973 |
+
default=None,
|
| 974 |
+
replace_entities: bool = True,
|
| 975 |
+
clean_match: bool = False,
|
| 976 |
+
case_sensitive: bool = True,
|
| 977 |
+
) -> TextHandler:
|
| 978 |
"""Apply the given regex to text and return the first match if found, otherwise return the default value.
|
| 979 |
|
| 980 |
:param regex: Can be either a compiled regular expression or a string.
|
|
|
|
| 983 |
:param clean_match: if enabled, this will ignore all whitespaces and consecutive spaces while matching
|
| 984 |
:param case_sensitive: if disabled, function will set the regex to ignore letters case while compiling it
|
| 985 |
"""
|
| 986 |
+
return self.text.re_first(
|
| 987 |
+
regex, default, replace_entities, clean_match, case_sensitive
|
| 988 |
+
)
|
| 989 |
|
| 990 |
def find_similar(
|
| 991 |
+
self,
|
| 992 |
+
similarity_threshold: float = 0.2,
|
| 993 |
+
ignore_attributes: Union[List, Tuple] = (
|
| 994 |
+
"href",
|
| 995 |
+
"src",
|
| 996 |
+
),
|
| 997 |
+
match_text: bool = False,
|
| 998 |
+
) -> Union["Adaptors[Adaptor]", List]:
|
| 999 |
"""Find elements that are in the same tree depth in the page with the same tag name and same parent tag etc...
|
| 1000 |
then return the ones that match the current element attributes with percentage higher than the input threshold.
|
| 1001 |
|
|
|
|
| 1016 |
|
| 1017 |
:return: A ``Adaptors`` container of ``Adaptor`` objects or empty list
|
| 1018 |
"""
|
| 1019 |
+
|
| 1020 |
def get_attributes(element: html.HtmlElement) -> Dict:
|
| 1021 |
"""Return attributes dictionary without the ignored list"""
|
| 1022 |
+
return {
|
| 1023 |
+
k: v for k, v in element.attrib.items() if k not in ignore_attributes
|
| 1024 |
+
}
|
| 1025 |
+
|
| 1026 |
+
def are_alike(
|
| 1027 |
+
original: html.HtmlElement,
|
| 1028 |
+
original_attributes: Dict,
|
| 1029 |
+
candidate: html.HtmlElement,
|
| 1030 |
+
) -> bool:
|
| 1031 |
"""Calculate a score of how much these elements are alike and return True
|
| 1032 |
+
if score is higher or equal the threshold"""
|
| 1033 |
+
candidate_attributes = (
|
| 1034 |
+
get_attributes(candidate) if ignore_attributes else candidate.attrib
|
| 1035 |
+
)
|
| 1036 |
score, checks = 0, 0
|
| 1037 |
|
| 1038 |
if original_attributes:
|
| 1039 |
score += sum(
|
| 1040 |
+
SequenceMatcher(None, v, candidate_attributes.get(k, "")).ratio()
|
| 1041 |
for k, v in original_attributes.items()
|
| 1042 |
)
|
| 1043 |
checks += len(candidate_attributes)
|
|
|
|
| 1049 |
|
| 1050 |
if match_text:
|
| 1051 |
score += SequenceMatcher(
|
| 1052 |
+
None,
|
| 1053 |
+
clean_spaces(original.text or ""),
|
| 1054 |
+
clean_spaces(candidate.text or ""),
|
| 1055 |
).ratio()
|
| 1056 |
checks += 1
|
| 1057 |
|
|
|
|
| 1073 |
f"//{grandparent.tag}/{parent.tag}/{self.tag}[count(ancestor::*) = {current_depth}]"
|
| 1074 |
)
|
| 1075 |
else:
|
| 1076 |
+
potential_matches = root.xpath(
|
| 1077 |
+
f"//{parent.tag}/{self.tag}[count(ancestor::*) = {current_depth}]"
|
| 1078 |
+
)
|
| 1079 |
else:
|
| 1080 |
+
potential_matches = root.xpath(
|
| 1081 |
+
f"//{self.tag}[count(ancestor::*) = {current_depth}]"
|
| 1082 |
+
)
|
| 1083 |
|
| 1084 |
for potential_match in potential_matches:
|
| 1085 |
+
if potential_match != root and are_alike(
|
| 1086 |
+
root, target_attrs, potential_match
|
| 1087 |
+
):
|
| 1088 |
similar_elements.append(potential_match)
|
| 1089 |
|
| 1090 |
return self.__handle_elements(similar_elements)
|
| 1091 |
|
| 1092 |
def find_by_text(
|
| 1093 |
+
self,
|
| 1094 |
+
text: str,
|
| 1095 |
+
first_match: bool = True,
|
| 1096 |
+
partial: bool = False,
|
| 1097 |
+
case_sensitive: bool = False,
|
| 1098 |
+
clean_match: bool = True,
|
| 1099 |
+
) -> Union["Adaptors[Adaptor]", "Adaptor"]:
|
| 1100 |
"""Find elements that its text content fully/partially matches input.
|
| 1101 |
:param text: Text query to match
|
| 1102 |
:param first_match: Return first element that matches conditions, enabled by default
|
|
|
|
| 1110 |
text = text.lower()
|
| 1111 |
|
| 1112 |
# This selector gets all elements with text content
|
| 1113 |
+
for node in self.__handle_elements(
|
| 1114 |
+
self._root.xpath(".//*[normalize-space(text())]")
|
| 1115 |
+
):
|
| 1116 |
"""Check if element matches given text otherwise, traverse the children tree and iterate"""
|
| 1117 |
node_text = node.text
|
| 1118 |
if clean_match:
|
|
|
|
| 1137 |
return results
|
| 1138 |
|
| 1139 |
def find_by_regex(
|
| 1140 |
+
self,
|
| 1141 |
+
query: Union[str, Pattern[str]],
|
| 1142 |
+
first_match: bool = True,
|
| 1143 |
+
case_sensitive: bool = False,
|
| 1144 |
+
clean_match: bool = True,
|
| 1145 |
+
) -> Union["Adaptors[Adaptor]", "Adaptor"]:
|
| 1146 |
"""Find elements that its text content matches the input regex pattern.
|
| 1147 |
:param query: Regex query/pattern to match
|
| 1148 |
:param first_match: Return first element that matches conditions, enabled by default
|
|
|
|
| 1152 |
results = Adaptors([])
|
| 1153 |
|
| 1154 |
# This selector gets all elements with text content
|
| 1155 |
+
for node in self.__handle_elements(
|
| 1156 |
+
self._root.xpath(".//*[normalize-space(text())]")
|
| 1157 |
+
):
|
| 1158 |
"""Check if element matches given regex otherwise, traverse the children tree and iterate"""
|
| 1159 |
node_text = node.text
|
| 1160 |
+
if node_text.re(
|
| 1161 |
+
query,
|
| 1162 |
+
check_match=True,
|
| 1163 |
+
clean_match=clean_match,
|
| 1164 |
+
case_sensitive=case_sensitive,
|
| 1165 |
+
):
|
| 1166 |
results.append(node)
|
| 1167 |
|
| 1168 |
if first_match and results:
|
|
|
|
| 1178 |
"""
|
| 1179 |
The :class:`Adaptors` class is a subclass of the builtin ``List`` class, which provides a few additional methods.
|
| 1180 |
"""
|
| 1181 |
+
|
| 1182 |
__slots__ = ()
|
| 1183 |
|
| 1184 |
@typing.overload
|
|
|
|
| 1189 |
def __getitem__(self, pos: slice) -> "Adaptors":
|
| 1190 |
pass
|
| 1191 |
|
| 1192 |
+
def __getitem__(
|
| 1193 |
+
self, pos: Union[SupportsIndex, slice]
|
| 1194 |
+
) -> Union[Adaptor, "Adaptors"]:
|
| 1195 |
lst = super().__getitem__(pos)
|
| 1196 |
if isinstance(pos, slice):
|
| 1197 |
return self.__class__(lst)
|
|
|
|
| 1199 |
return lst
|
| 1200 |
|
| 1201 |
def xpath(
|
| 1202 |
+
self,
|
| 1203 |
+
selector: str,
|
| 1204 |
+
identifier: str = "",
|
| 1205 |
+
auto_save: bool = False,
|
| 1206 |
+
percentage: int = 0,
|
| 1207 |
+
**kwargs: Any,
|
| 1208 |
) -> "Adaptors[Adaptor]":
|
| 1209 |
"""
|
| 1210 |
Call the ``.xpath()`` method for each element in this list and return
|
|
|
|
| 1227 |
:return: List as :class:`Adaptors`
|
| 1228 |
"""
|
| 1229 |
results = [
|
| 1230 |
+
n.xpath(
|
| 1231 |
+
selector, identifier or selector, False, auto_save, percentage, **kwargs
|
| 1232 |
+
)
|
| 1233 |
+
for n in self
|
| 1234 |
]
|
| 1235 |
return self.__class__(flatten(results))
|
| 1236 |
|
| 1237 |
+
def css(
|
| 1238 |
+
self,
|
| 1239 |
+
selector: str,
|
| 1240 |
+
identifier: str = "",
|
| 1241 |
+
auto_save: bool = False,
|
| 1242 |
+
percentage: int = 0,
|
| 1243 |
+
) -> "Adaptors[Adaptor]":
|
| 1244 |
"""
|
| 1245 |
Call the ``.css()`` method for each element in this list and return
|
| 1246 |
their results flattened as another :class:`Adaptors`.
|
|
|
|
| 1260 |
:return: List as :class:`Adaptors`
|
| 1261 |
"""
|
| 1262 |
results = [
|
| 1263 |
+
n.css(selector, identifier or selector, False, auto_save, percentage)
|
| 1264 |
+
for n in self
|
| 1265 |
]
|
| 1266 |
return self.__class__(flatten(results))
|
| 1267 |
|
| 1268 |
+
def re(
|
| 1269 |
+
self,
|
| 1270 |
+
regex: Union[str, Pattern[str]],
|
| 1271 |
+
replace_entities: bool = True,
|
| 1272 |
+
clean_match: bool = False,
|
| 1273 |
+
case_sensitive: bool = True,
|
| 1274 |
+
) -> TextHandlers[TextHandler]:
|
| 1275 |
"""Call the ``.re()`` method for each element in this list and return
|
| 1276 |
their results flattened as List of TextHandler.
|
| 1277 |
|
|
|
|
| 1281 |
:param case_sensitive: if disabled, function will set the regex to ignore letters case while compiling it
|
| 1282 |
"""
|
| 1283 |
results = [
|
| 1284 |
+
n.text.re(regex, replace_entities, clean_match, case_sensitive)
|
| 1285 |
+
for n in self
|
| 1286 |
]
|
| 1287 |
return TextHandlers(flatten(results))
|
| 1288 |
|
| 1289 |
+
def re_first(
|
| 1290 |
+
self,
|
| 1291 |
+
regex: Union[str, Pattern[str]],
|
| 1292 |
+
default=None,
|
| 1293 |
+
replace_entities: bool = True,
|
| 1294 |
+
clean_match: bool = False,
|
| 1295 |
+
case_sensitive: bool = True,
|
| 1296 |
+
) -> TextHandler:
|
| 1297 |
"""Call the ``.re_first()`` method for each element in this list and return
|
| 1298 |
the first result or the default value otherwise.
|
| 1299 |
|
|
|
|
| 1308 |
return result
|
| 1309 |
return default
|
| 1310 |
|
| 1311 |
+
def search(self, func: Callable[["Adaptor"], bool]) -> Union["Adaptor", None]:
|
| 1312 |
"""Loop over all current elements and return the first element that matches the passed function
|
| 1313 |
:param func: A function that takes each element as an argument and returns True/False
|
| 1314 |
:return: The first element that match the function or ``None`` otherwise.
|
|
|
|
| 1318 |
return element
|
| 1319 |
return None
|
| 1320 |
|
| 1321 |
+
def filter(self, func: Callable[["Adaptor"], bool]) -> "Adaptors[Adaptor]":
|
| 1322 |
"""Filter current elements based on the passed function
|
| 1323 |
:param func: A function that takes each element as an argument and returns True/False
|
| 1324 |
:return: The new `Adaptors` object or empty list otherwise.
|
| 1325 |
"""
|
| 1326 |
+
return self.__class__([element for element in self if func(element)])
|
|
|
|
|
|
|
| 1327 |
|
| 1328 |
# For easy copy-paste from Scrapy/parsel code when needed :)
|
| 1329 |
def get(self, default=None):
|
setup.py
CHANGED
|
@@ -1,7 +1,8 @@
|
|
|
|
|
|
|
|
| 1 |
from setuptools import find_packages, setup
|
| 2 |
|
| 3 |
-
|
| 4 |
-
long_description = fh.read()
|
| 5 |
|
| 6 |
|
| 7 |
setup(
|
|
@@ -20,9 +21,7 @@ setup(
|
|
| 20 |
"scrapling": "scrapling",
|
| 21 |
},
|
| 22 |
entry_points={
|
| 23 |
-
|
| 24 |
-
'scrapling=scrapling.cli:main'
|
| 25 |
-
],
|
| 26 |
},
|
| 27 |
include_package_data=True,
|
| 28 |
classifiers=[
|
|
@@ -53,14 +52,14 @@ setup(
|
|
| 53 |
install_requires=[
|
| 54 |
"lxml>=5.0",
|
| 55 |
"cssselect>=1.2",
|
| 56 |
-
|
| 57 |
"w3lib",
|
| 58 |
"orjson>=3",
|
| 59 |
"tldextract",
|
| 60 |
-
|
| 61 |
-
|
| 62 |
-
|
| 63 |
-
|
| 64 |
],
|
| 65 |
python_requires=">=3.9",
|
| 66 |
url="https://github.com/D4Vinci/Scrapling",
|
|
@@ -68,5 +67,5 @@ setup(
|
|
| 68 |
"Documentation": "https://scrapling.readthedocs.io/en/latest/",
|
| 69 |
"Source": "https://github.com/D4Vinci/Scrapling",
|
| 70 |
"Tracker": "https://github.com/D4Vinci/Scrapling/issues",
|
| 71 |
-
}
|
| 72 |
)
|
|
|
|
| 1 |
+
from pathlib import Path
|
| 2 |
+
|
| 3 |
from setuptools import find_packages, setup
|
| 4 |
|
| 5 |
+
long_description = Path("README.md").read_text(encoding="utf-8")
|
|
|
|
| 6 |
|
| 7 |
|
| 8 |
setup(
|
|
|
|
| 21 |
"scrapling": "scrapling",
|
| 22 |
},
|
| 23 |
entry_points={
|
| 24 |
+
"console_scripts": ["scrapling=scrapling.cli:main"],
|
|
|
|
|
|
|
| 25 |
},
|
| 26 |
include_package_data=True,
|
| 27 |
classifiers=[
|
|
|
|
| 52 |
install_requires=[
|
| 53 |
"lxml>=5.0",
|
| 54 |
"cssselect>=1.2",
|
| 55 |
+
"click",
|
| 56 |
"w3lib",
|
| 57 |
"orjson>=3",
|
| 58 |
"tldextract",
|
| 59 |
+
"httpx[brotli,zstd, socks]",
|
| 60 |
+
"playwright>=1.49.1",
|
| 61 |
+
"rebrowser-playwright>=1.49.1",
|
| 62 |
+
"camoufox[geoip]>=0.4.11",
|
| 63 |
],
|
| 64 |
python_requires=">=3.9",
|
| 65 |
url="https://github.com/D4Vinci/Scrapling",
|
|
|
|
| 67 |
"Documentation": "https://scrapling.readthedocs.io/en/latest/",
|
| 68 |
"Source": "https://github.com/D4Vinci/Scrapling",
|
| 69 |
"Tracker": "https://github.com/D4Vinci/Scrapling/issues",
|
| 70 |
+
},
|
| 71 |
)
|
tests/fetchers/async/test_camoufox.py
CHANGED
|
@@ -17,43 +17,51 @@ class TestStealthyFetcher:
|
|
| 17 |
def urls(self, httpbin):
|
| 18 |
url = httpbin.url
|
| 19 |
return {
|
| 20 |
-
|
| 21 |
-
|
| 22 |
-
|
| 23 |
-
|
| 24 |
-
|
| 25 |
-
|
| 26 |
-
|
| 27 |
}
|
| 28 |
|
| 29 |
async def test_basic_fetch(self, fetcher, urls):
|
| 30 |
"""Test doing basic fetch request with multiple statuses"""
|
| 31 |
-
assert (await fetcher.async_fetch(urls[
|
| 32 |
-
assert (await fetcher.async_fetch(urls[
|
| 33 |
-
assert (await fetcher.async_fetch(urls[
|
| 34 |
|
| 35 |
async def test_networkidle(self, fetcher, urls):
|
| 36 |
"""Test if waiting for `networkidle` make page does not finish loading or not"""
|
| 37 |
-
assert (
|
|
|
|
|
|
|
| 38 |
|
| 39 |
async def test_blocking_resources(self, fetcher, urls):
|
| 40 |
"""Test if blocking resources make page does not finish loading or not"""
|
| 41 |
-
assert (
|
| 42 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
| 43 |
|
| 44 |
async def test_waiting_selector(self, fetcher, urls):
|
| 45 |
"""Test if waiting for a selector make page does not finish loading or not"""
|
| 46 |
-
assert (
|
| 47 |
-
|
| 48 |
-
|
| 49 |
-
|
| 50 |
-
|
| 51 |
-
|
|
|
|
|
|
|
| 52 |
|
| 53 |
async def test_cookies_loading(self, fetcher, urls):
|
| 54 |
"""Test if cookies are set after the request"""
|
| 55 |
-
response = await fetcher.async_fetch(urls[
|
| 56 |
-
assert response.cookies == {
|
| 57 |
|
| 58 |
async def test_automation(self, fetcher, urls):
|
| 59 |
"""Test if automation break the code or not"""
|
|
@@ -64,34 +72,38 @@ class TestStealthyFetcher:
|
|
| 64 |
await page.mouse.up()
|
| 65 |
return page
|
| 66 |
|
| 67 |
-
assert (
|
|
|
|
|
|
|
| 68 |
|
| 69 |
async def test_properties(self, fetcher, urls):
|
| 70 |
"""Test if different arguments breaks the code or not"""
|
| 71 |
-
assert (
|
| 72 |
-
|
| 73 |
-
|
| 74 |
-
|
| 75 |
-
)
|
| 76 |
-
|
| 77 |
-
assert (
|
| 78 |
-
|
| 79 |
-
|
| 80 |
-
|
| 81 |
-
)
|
| 82 |
-
|
| 83 |
-
assert (
|
| 84 |
-
|
| 85 |
-
|
| 86 |
-
|
| 87 |
-
)
|
| 88 |
-
|
| 89 |
-
assert (
|
| 90 |
-
|
| 91 |
-
|
| 92 |
-
|
| 93 |
-
)
|
| 94 |
|
| 95 |
async def test_infinite_timeout(self, fetcher, urls):
|
| 96 |
"""Test if infinite timeout breaks the code or not"""
|
| 97 |
-
assert (
|
|
|
|
|
|
|
|
|
| 17 |
def urls(self, httpbin):
|
| 18 |
url = httpbin.url
|
| 19 |
return {
|
| 20 |
+
"status_200": f"{url}/status/200",
|
| 21 |
+
"status_404": f"{url}/status/404",
|
| 22 |
+
"status_501": f"{url}/status/501",
|
| 23 |
+
"basic_url": f"{url}/get",
|
| 24 |
+
"html_url": f"{url}/html",
|
| 25 |
+
"delayed_url": f"{url}/delay/10", # 10 Seconds delay response
|
| 26 |
+
"cookies_url": f"{url}/cookies/set/test/value",
|
| 27 |
}
|
| 28 |
|
| 29 |
async def test_basic_fetch(self, fetcher, urls):
|
| 30 |
"""Test doing basic fetch request with multiple statuses"""
|
| 31 |
+
assert (await fetcher.async_fetch(urls["status_200"])).status == 200
|
| 32 |
+
assert (await fetcher.async_fetch(urls["status_404"])).status == 404
|
| 33 |
+
assert (await fetcher.async_fetch(urls["status_501"])).status == 501
|
| 34 |
|
| 35 |
async def test_networkidle(self, fetcher, urls):
|
| 36 |
"""Test if waiting for `networkidle` make page does not finish loading or not"""
|
| 37 |
+
assert (
|
| 38 |
+
await fetcher.async_fetch(urls["basic_url"], network_idle=True)
|
| 39 |
+
).status == 200
|
| 40 |
|
| 41 |
async def test_blocking_resources(self, fetcher, urls):
|
| 42 |
"""Test if blocking resources make page does not finish loading or not"""
|
| 43 |
+
assert (
|
| 44 |
+
await fetcher.async_fetch(urls["basic_url"], block_images=True)
|
| 45 |
+
).status == 200
|
| 46 |
+
assert (
|
| 47 |
+
await fetcher.async_fetch(urls["basic_url"], disable_resources=True)
|
| 48 |
+
).status == 200
|
| 49 |
|
| 50 |
async def test_waiting_selector(self, fetcher, urls):
|
| 51 |
"""Test if waiting for a selector make page does not finish loading or not"""
|
| 52 |
+
assert (
|
| 53 |
+
await fetcher.async_fetch(urls["html_url"], wait_selector="h1")
|
| 54 |
+
).status == 200
|
| 55 |
+
assert (
|
| 56 |
+
await fetcher.async_fetch(
|
| 57 |
+
urls["html_url"], wait_selector="h1", wait_selector_state="visible"
|
| 58 |
+
)
|
| 59 |
+
).status == 200
|
| 60 |
|
| 61 |
async def test_cookies_loading(self, fetcher, urls):
|
| 62 |
"""Test if cookies are set after the request"""
|
| 63 |
+
response = await fetcher.async_fetch(urls["cookies_url"])
|
| 64 |
+
assert response.cookies == {"test": "value"}
|
| 65 |
|
| 66 |
async def test_automation(self, fetcher, urls):
|
| 67 |
"""Test if automation break the code or not"""
|
|
|
|
| 72 |
await page.mouse.up()
|
| 73 |
return page
|
| 74 |
|
| 75 |
+
assert (
|
| 76 |
+
await fetcher.async_fetch(urls["html_url"], page_action=scroll_page)
|
| 77 |
+
).status == 200
|
| 78 |
|
| 79 |
async def test_properties(self, fetcher, urls):
|
| 80 |
"""Test if different arguments breaks the code or not"""
|
| 81 |
+
assert (
|
| 82 |
+
await fetcher.async_fetch(
|
| 83 |
+
urls["html_url"], block_webrtc=True, allow_webgl=True
|
| 84 |
+
)
|
| 85 |
+
).status == 200
|
| 86 |
+
|
| 87 |
+
assert (
|
| 88 |
+
await fetcher.async_fetch(
|
| 89 |
+
urls["html_url"], block_webrtc=False, allow_webgl=True
|
| 90 |
+
)
|
| 91 |
+
).status == 200
|
| 92 |
+
|
| 93 |
+
assert (
|
| 94 |
+
await fetcher.async_fetch(
|
| 95 |
+
urls["html_url"], block_webrtc=True, allow_webgl=False
|
| 96 |
+
)
|
| 97 |
+
).status == 200
|
| 98 |
+
|
| 99 |
+
assert (
|
| 100 |
+
await fetcher.async_fetch(
|
| 101 |
+
urls["html_url"], extra_headers={"ayo": ""}, os_randomize=True
|
| 102 |
+
)
|
| 103 |
+
).status == 200
|
| 104 |
|
| 105 |
async def test_infinite_timeout(self, fetcher, urls):
|
| 106 |
"""Test if infinite timeout breaks the code or not"""
|
| 107 |
+
assert (
|
| 108 |
+
await fetcher.async_fetch(urls["delayed_url"], timeout=None)
|
| 109 |
+
).status == 200
|
tests/fetchers/async/test_httpx.py
CHANGED
|
@@ -16,70 +16,111 @@ class TestAsyncFetcher:
|
|
| 16 |
@pytest.fixture(scope="class")
|
| 17 |
def urls(self, httpbin):
|
| 18 |
return {
|
| 19 |
-
|
| 20 |
-
|
| 21 |
-
|
| 22 |
-
|
| 23 |
-
|
| 24 |
-
|
| 25 |
-
|
| 26 |
-
|
| 27 |
}
|
| 28 |
|
| 29 |
async def test_basic_get(self, fetcher, urls):
|
| 30 |
"""Test doing basic get request with multiple statuses"""
|
| 31 |
-
assert (await fetcher.get(urls[
|
| 32 |
-
assert (await fetcher.get(urls[
|
| 33 |
-
assert (await fetcher.get(urls[
|
| 34 |
|
| 35 |
async def test_get_properties(self, fetcher, urls):
|
| 36 |
"""Test if different arguments with GET request breaks the code or not"""
|
| 37 |
-
assert (
|
| 38 |
-
|
| 39 |
-
|
| 40 |
-
assert (
|
| 41 |
-
urls[
|
| 42 |
-
|
| 43 |
-
|
| 44 |
-
|
| 45 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 46 |
|
| 47 |
async def test_post_properties(self, fetcher, urls):
|
| 48 |
"""Test if different arguments with POST request breaks the code or not"""
|
| 49 |
-
assert (
|
| 50 |
-
|
| 51 |
-
|
| 52 |
-
assert (
|
| 53 |
-
|
| 54 |
-
|
| 55 |
-
|
| 56 |
-
|
| 57 |
-
|
| 58 |
-
|
| 59 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 60 |
|
| 61 |
async def test_put_properties(self, fetcher, urls):
|
| 62 |
"""Test if different arguments with PUT request breaks the code or not"""
|
| 63 |
-
assert (await fetcher.put(urls[
|
| 64 |
-
|
| 65 |
-
|
| 66 |
-
|
| 67 |
-
assert (
|
| 68 |
-
|
| 69 |
-
|
| 70 |
-
|
| 71 |
-
|
| 72 |
-
|
| 73 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 74 |
|
| 75 |
async def test_delete_properties(self, fetcher, urls):
|
| 76 |
"""Test if different arguments with DELETE request breaks the code or not"""
|
| 77 |
-
assert (
|
| 78 |
-
|
| 79 |
-
|
| 80 |
-
assert (
|
| 81 |
-
urls[
|
| 82 |
-
|
| 83 |
-
|
| 84 |
-
|
| 85 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 16 |
@pytest.fixture(scope="class")
|
| 17 |
def urls(self, httpbin):
|
| 18 |
return {
|
| 19 |
+
"status_200": f"{httpbin.url}/status/200",
|
| 20 |
+
"status_404": f"{httpbin.url}/status/404",
|
| 21 |
+
"status_501": f"{httpbin.url}/status/501",
|
| 22 |
+
"basic_url": f"{httpbin.url}/get",
|
| 23 |
+
"post_url": f"{httpbin.url}/post",
|
| 24 |
+
"put_url": f"{httpbin.url}/put",
|
| 25 |
+
"delete_url": f"{httpbin.url}/delete",
|
| 26 |
+
"html_url": f"{httpbin.url}/html",
|
| 27 |
}
|
| 28 |
|
| 29 |
async def test_basic_get(self, fetcher, urls):
|
| 30 |
"""Test doing basic get request with multiple statuses"""
|
| 31 |
+
assert (await fetcher.get(urls["status_200"])).status == 200
|
| 32 |
+
assert (await fetcher.get(urls["status_404"])).status == 404
|
| 33 |
+
assert (await fetcher.get(urls["status_501"])).status == 501
|
| 34 |
|
| 35 |
async def test_get_properties(self, fetcher, urls):
|
| 36 |
"""Test if different arguments with GET request breaks the code or not"""
|
| 37 |
+
assert (
|
| 38 |
+
await fetcher.get(urls["status_200"], stealthy_headers=True)
|
| 39 |
+
).status == 200
|
| 40 |
+
assert (
|
| 41 |
+
await fetcher.get(urls["status_200"], follow_redirects=True)
|
| 42 |
+
).status == 200
|
| 43 |
+
assert (await fetcher.get(urls["status_200"], timeout=None)).status == 200
|
| 44 |
+
assert (
|
| 45 |
+
await fetcher.get(
|
| 46 |
+
urls["status_200"],
|
| 47 |
+
stealthy_headers=True,
|
| 48 |
+
follow_redirects=True,
|
| 49 |
+
timeout=None,
|
| 50 |
+
)
|
| 51 |
+
).status == 200
|
| 52 |
|
| 53 |
async def test_post_properties(self, fetcher, urls):
|
| 54 |
"""Test if different arguments with POST request breaks the code or not"""
|
| 55 |
+
assert (
|
| 56 |
+
await fetcher.post(urls["post_url"], data={"key": "value"})
|
| 57 |
+
).status == 200
|
| 58 |
+
assert (
|
| 59 |
+
await fetcher.post(
|
| 60 |
+
urls["post_url"], data={"key": "value"}, stealthy_headers=True
|
| 61 |
+
)
|
| 62 |
+
).status == 200
|
| 63 |
+
assert (
|
| 64 |
+
await fetcher.post(
|
| 65 |
+
urls["post_url"], data={"key": "value"}, follow_redirects=True
|
| 66 |
+
)
|
| 67 |
+
).status == 200
|
| 68 |
+
assert (
|
| 69 |
+
await fetcher.post(urls["post_url"], data={"key": "value"}, timeout=None)
|
| 70 |
+
).status == 200
|
| 71 |
+
assert (
|
| 72 |
+
await fetcher.post(
|
| 73 |
+
urls["post_url"],
|
| 74 |
+
data={"key": "value"},
|
| 75 |
+
stealthy_headers=True,
|
| 76 |
+
follow_redirects=True,
|
| 77 |
+
timeout=None,
|
| 78 |
+
)
|
| 79 |
+
).status == 200
|
| 80 |
|
| 81 |
async def test_put_properties(self, fetcher, urls):
|
| 82 |
"""Test if different arguments with PUT request breaks the code or not"""
|
| 83 |
+
assert (await fetcher.put(urls["put_url"], data={"key": "value"})).status in [
|
| 84 |
+
200,
|
| 85 |
+
405,
|
| 86 |
+
]
|
| 87 |
+
assert (
|
| 88 |
+
await fetcher.put(
|
| 89 |
+
urls["put_url"], data={"key": "value"}, stealthy_headers=True
|
| 90 |
+
)
|
| 91 |
+
).status in [200, 405]
|
| 92 |
+
assert (
|
| 93 |
+
await fetcher.put(
|
| 94 |
+
urls["put_url"], data={"key": "value"}, follow_redirects=True
|
| 95 |
+
)
|
| 96 |
+
).status in [200, 405]
|
| 97 |
+
assert (
|
| 98 |
+
await fetcher.put(urls["put_url"], data={"key": "value"}, timeout=None)
|
| 99 |
+
).status in [200, 405]
|
| 100 |
+
assert (
|
| 101 |
+
await fetcher.put(
|
| 102 |
+
urls["put_url"],
|
| 103 |
+
data={"key": "value"},
|
| 104 |
+
stealthy_headers=True,
|
| 105 |
+
follow_redirects=True,
|
| 106 |
+
timeout=None,
|
| 107 |
+
)
|
| 108 |
+
).status in [200, 405]
|
| 109 |
|
| 110 |
async def test_delete_properties(self, fetcher, urls):
|
| 111 |
"""Test if different arguments with DELETE request breaks the code or not"""
|
| 112 |
+
assert (
|
| 113 |
+
await fetcher.delete(urls["delete_url"], stealthy_headers=True)
|
| 114 |
+
).status == 200
|
| 115 |
+
assert (
|
| 116 |
+
await fetcher.delete(urls["delete_url"], follow_redirects=True)
|
| 117 |
+
).status == 200
|
| 118 |
+
assert (await fetcher.delete(urls["delete_url"], timeout=None)).status == 200
|
| 119 |
+
assert (
|
| 120 |
+
await fetcher.delete(
|
| 121 |
+
urls["delete_url"],
|
| 122 |
+
stealthy_headers=True,
|
| 123 |
+
follow_redirects=True,
|
| 124 |
+
timeout=None,
|
| 125 |
+
)
|
| 126 |
+
).status == 200
|
tests/fetchers/async/test_playwright.py
CHANGED
|
@@ -15,87 +15,97 @@ class TestPlayWrightFetcherAsync:
|
|
| 15 |
@pytest.fixture
|
| 16 |
def urls(self, httpbin):
|
| 17 |
return {
|
| 18 |
-
|
| 19 |
-
|
| 20 |
-
|
| 21 |
-
|
| 22 |
-
|
| 23 |
-
|
| 24 |
-
|
| 25 |
}
|
| 26 |
|
| 27 |
@pytest.mark.asyncio
|
| 28 |
async def test_basic_fetch(self, fetcher, urls):
|
| 29 |
"""Test doing basic fetch request with multiple statuses"""
|
| 30 |
-
response = await fetcher.async_fetch(urls[
|
| 31 |
assert response.status == 200
|
| 32 |
|
| 33 |
@pytest.mark.asyncio
|
| 34 |
async def test_networkidle(self, fetcher, urls):
|
| 35 |
"""Test if waiting for `networkidle` make page does not finish loading or not"""
|
| 36 |
-
response = await fetcher.async_fetch(urls[
|
| 37 |
assert response.status == 200
|
| 38 |
|
| 39 |
@pytest.mark.asyncio
|
| 40 |
async def test_blocking_resources(self, fetcher, urls):
|
| 41 |
"""Test if blocking resources make page does not finish loading or not"""
|
| 42 |
-
response = await fetcher.async_fetch(urls[
|
| 43 |
assert response.status == 200
|
| 44 |
|
| 45 |
@pytest.mark.asyncio
|
| 46 |
async def test_waiting_selector(self, fetcher, urls):
|
| 47 |
"""Test if waiting for a selector make page does not finish loading or not"""
|
| 48 |
-
response1 = await fetcher.async_fetch(urls[
|
| 49 |
assert response1.status == 200
|
| 50 |
|
| 51 |
-
response2 = await fetcher.async_fetch(
|
|
|
|
|
|
|
| 52 |
assert response2.status == 200
|
| 53 |
|
| 54 |
@pytest.mark.asyncio
|
| 55 |
async def test_cookies_loading(self, fetcher, urls):
|
| 56 |
"""Test if cookies are set after the request"""
|
| 57 |
-
response = await fetcher.async_fetch(urls[
|
| 58 |
-
assert response.cookies == {
|
| 59 |
|
| 60 |
@pytest.mark.asyncio
|
| 61 |
async def test_automation(self, fetcher, urls):
|
| 62 |
"""Test if automation break the code or not"""
|
|
|
|
| 63 |
async def scroll_page(page):
|
| 64 |
await page.mouse.wheel(10, 0)
|
| 65 |
await page.mouse.move(100, 400)
|
| 66 |
await page.mouse.up()
|
| 67 |
return page
|
| 68 |
|
| 69 |
-
response = await fetcher.async_fetch(urls[
|
| 70 |
assert response.status == 200
|
| 71 |
|
| 72 |
-
@pytest.mark.parametrize(
|
| 73 |
-
|
| 74 |
-
|
| 75 |
-
|
| 76 |
-
|
| 77 |
-
|
| 78 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 79 |
@pytest.mark.asyncio
|
| 80 |
async def test_properties(self, fetcher, urls, kwargs):
|
| 81 |
"""Test if different arguments breaks the code or not"""
|
| 82 |
-
response = await fetcher.async_fetch(urls[
|
| 83 |
assert response.status == 200
|
| 84 |
|
| 85 |
@pytest.mark.asyncio
|
| 86 |
async def test_cdp_url_invalid(self, fetcher, urls):
|
| 87 |
"""Test if invalid CDP URLs raise appropriate exceptions"""
|
| 88 |
with pytest.raises(ValueError):
|
| 89 |
-
await fetcher.async_fetch(urls[
|
| 90 |
|
| 91 |
with pytest.raises(ValueError):
|
| 92 |
-
await fetcher.async_fetch(
|
|
|
|
|
|
|
| 93 |
|
| 94 |
with pytest.raises(Exception):
|
| 95 |
-
await fetcher.async_fetch(urls[
|
| 96 |
|
| 97 |
@pytest.mark.asyncio
|
| 98 |
async def test_infinite_timeout(self, fetcher, urls):
|
| 99 |
"""Test if infinite timeout breaks the code or not"""
|
| 100 |
-
response = await fetcher.async_fetch(urls[
|
| 101 |
assert response.status == 200
|
|
|
|
| 15 |
@pytest.fixture
|
| 16 |
def urls(self, httpbin):
|
| 17 |
return {
|
| 18 |
+
"status_200": f"{httpbin.url}/status/200",
|
| 19 |
+
"status_404": f"{httpbin.url}/status/404",
|
| 20 |
+
"status_501": f"{httpbin.url}/status/501",
|
| 21 |
+
"basic_url": f"{httpbin.url}/get",
|
| 22 |
+
"html_url": f"{httpbin.url}/html",
|
| 23 |
+
"delayed_url": f"{httpbin.url}/delay/10",
|
| 24 |
+
"cookies_url": f"{httpbin.url}/cookies/set/test/value",
|
| 25 |
}
|
| 26 |
|
| 27 |
@pytest.mark.asyncio
|
| 28 |
async def test_basic_fetch(self, fetcher, urls):
|
| 29 |
"""Test doing basic fetch request with multiple statuses"""
|
| 30 |
+
response = await fetcher.async_fetch(urls["status_200"])
|
| 31 |
assert response.status == 200
|
| 32 |
|
| 33 |
@pytest.mark.asyncio
|
| 34 |
async def test_networkidle(self, fetcher, urls):
|
| 35 |
"""Test if waiting for `networkidle` make page does not finish loading or not"""
|
| 36 |
+
response = await fetcher.async_fetch(urls["basic_url"], network_idle=True)
|
| 37 |
assert response.status == 200
|
| 38 |
|
| 39 |
@pytest.mark.asyncio
|
| 40 |
async def test_blocking_resources(self, fetcher, urls):
|
| 41 |
"""Test if blocking resources make page does not finish loading or not"""
|
| 42 |
+
response = await fetcher.async_fetch(urls["basic_url"], disable_resources=True)
|
| 43 |
assert response.status == 200
|
| 44 |
|
| 45 |
@pytest.mark.asyncio
|
| 46 |
async def test_waiting_selector(self, fetcher, urls):
|
| 47 |
"""Test if waiting for a selector make page does not finish loading or not"""
|
| 48 |
+
response1 = await fetcher.async_fetch(urls["html_url"], wait_selector="h1")
|
| 49 |
assert response1.status == 200
|
| 50 |
|
| 51 |
+
response2 = await fetcher.async_fetch(
|
| 52 |
+
urls["html_url"], wait_selector="h1", wait_selector_state="visible"
|
| 53 |
+
)
|
| 54 |
assert response2.status == 200
|
| 55 |
|
| 56 |
@pytest.mark.asyncio
|
| 57 |
async def test_cookies_loading(self, fetcher, urls):
|
| 58 |
"""Test if cookies are set after the request"""
|
| 59 |
+
response = await fetcher.async_fetch(urls["cookies_url"])
|
| 60 |
+
assert response.cookies == {"test": "value"}
|
| 61 |
|
| 62 |
@pytest.mark.asyncio
|
| 63 |
async def test_automation(self, fetcher, urls):
|
| 64 |
"""Test if automation break the code or not"""
|
| 65 |
+
|
| 66 |
async def scroll_page(page):
|
| 67 |
await page.mouse.wheel(10, 0)
|
| 68 |
await page.mouse.move(100, 400)
|
| 69 |
await page.mouse.up()
|
| 70 |
return page
|
| 71 |
|
| 72 |
+
response = await fetcher.async_fetch(urls["html_url"], page_action=scroll_page)
|
| 73 |
assert response.status == 200
|
| 74 |
|
| 75 |
+
@pytest.mark.parametrize(
|
| 76 |
+
"kwargs",
|
| 77 |
+
[
|
| 78 |
+
{"disable_webgl": True, "hide_canvas": False},
|
| 79 |
+
{"disable_webgl": False, "hide_canvas": True},
|
| 80 |
+
# {"stealth": True}, # causes issues with Github Actions
|
| 81 |
+
{
|
| 82 |
+
"useragent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:131.0) Gecko/20100101 Firefox/131.0"
|
| 83 |
+
},
|
| 84 |
+
{"extra_headers": {"ayo": ""}},
|
| 85 |
+
],
|
| 86 |
+
)
|
| 87 |
@pytest.mark.asyncio
|
| 88 |
async def test_properties(self, fetcher, urls, kwargs):
|
| 89 |
"""Test if different arguments breaks the code or not"""
|
| 90 |
+
response = await fetcher.async_fetch(urls["html_url"], **kwargs)
|
| 91 |
assert response.status == 200
|
| 92 |
|
| 93 |
@pytest.mark.asyncio
|
| 94 |
async def test_cdp_url_invalid(self, fetcher, urls):
|
| 95 |
"""Test if invalid CDP URLs raise appropriate exceptions"""
|
| 96 |
with pytest.raises(ValueError):
|
| 97 |
+
await fetcher.async_fetch(urls["html_url"], cdp_url="blahblah")
|
| 98 |
|
| 99 |
with pytest.raises(ValueError):
|
| 100 |
+
await fetcher.async_fetch(
|
| 101 |
+
urls["html_url"], cdp_url="blahblah", nstbrowser_mode=True
|
| 102 |
+
)
|
| 103 |
|
| 104 |
with pytest.raises(Exception):
|
| 105 |
+
await fetcher.async_fetch(urls["html_url"], cdp_url="ws://blahblah")
|
| 106 |
|
| 107 |
@pytest.mark.asyncio
|
| 108 |
async def test_infinite_timeout(self, fetcher, urls):
|
| 109 |
"""Test if infinite timeout breaks the code or not"""
|
| 110 |
+
response = await fetcher.async_fetch(urls["delayed_url"], timeout=None)
|
| 111 |
assert response.status == 200
|
tests/fetchers/sync/test_camoufox.py
CHANGED
|
@@ -16,12 +16,12 @@ class TestStealthyFetcher:
|
|
| 16 |
@pytest.fixture(autouse=True)
|
| 17 |
def setup_urls(self, httpbin):
|
| 18 |
"""Fixture to set up URLs for testing"""
|
| 19 |
-
self.status_200 = f
|
| 20 |
-
self.status_404 = f
|
| 21 |
-
self.status_501 = f
|
| 22 |
-
self.basic_url = f
|
| 23 |
-
self.html_url = f
|
| 24 |
-
self.delayed_url = f
|
| 25 |
self.cookies_url = f"{httpbin.url}/cookies/set/test/value"
|
| 26 |
|
| 27 |
def test_basic_fetch(self, fetcher):
|
|
@@ -41,15 +41,21 @@ class TestStealthyFetcher:
|
|
| 41 |
|
| 42 |
def test_waiting_selector(self, fetcher):
|
| 43 |
"""Test if waiting for a selector make page does not finish loading or not"""
|
| 44 |
-
assert fetcher.fetch(self.html_url, wait_selector=
|
| 45 |
-
assert
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 46 |
|
| 47 |
def test_cookies_loading(self, fetcher):
|
| 48 |
"""Test if cookies are set after the request"""
|
| 49 |
-
assert fetcher.fetch(self.cookies_url).cookies == {
|
| 50 |
|
| 51 |
def test_automation(self, fetcher):
|
| 52 |
"""Test if automation break the code or not"""
|
|
|
|
| 53 |
def scroll_page(page):
|
| 54 |
page.mouse.wheel(10, 0)
|
| 55 |
page.mouse.move(100, 400)
|
|
@@ -60,10 +66,24 @@ class TestStealthyFetcher:
|
|
| 60 |
|
| 61 |
def test_properties(self, fetcher):
|
| 62 |
"""Test if different arguments breaks the code or not"""
|
| 63 |
-
assert
|
| 64 |
-
|
| 65 |
-
|
| 66 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 67 |
|
| 68 |
def test_infinite_timeout(self, fetcher):
|
| 69 |
"""Test if infinite timeout breaks the code or not"""
|
|
|
|
| 16 |
@pytest.fixture(autouse=True)
|
| 17 |
def setup_urls(self, httpbin):
|
| 18 |
"""Fixture to set up URLs for testing"""
|
| 19 |
+
self.status_200 = f"{httpbin.url}/status/200"
|
| 20 |
+
self.status_404 = f"{httpbin.url}/status/404"
|
| 21 |
+
self.status_501 = f"{httpbin.url}/status/501"
|
| 22 |
+
self.basic_url = f"{httpbin.url}/get"
|
| 23 |
+
self.html_url = f"{httpbin.url}/html"
|
| 24 |
+
self.delayed_url = f"{httpbin.url}/delay/10" # 10 Seconds delay response
|
| 25 |
self.cookies_url = f"{httpbin.url}/cookies/set/test/value"
|
| 26 |
|
| 27 |
def test_basic_fetch(self, fetcher):
|
|
|
|
| 41 |
|
| 42 |
def test_waiting_selector(self, fetcher):
|
| 43 |
"""Test if waiting for a selector make page does not finish loading or not"""
|
| 44 |
+
assert fetcher.fetch(self.html_url, wait_selector="h1").status == 200
|
| 45 |
+
assert (
|
| 46 |
+
fetcher.fetch(
|
| 47 |
+
self.html_url, wait_selector="h1", wait_selector_state="visible"
|
| 48 |
+
).status
|
| 49 |
+
== 200
|
| 50 |
+
)
|
| 51 |
|
| 52 |
def test_cookies_loading(self, fetcher):
|
| 53 |
"""Test if cookies are set after the request"""
|
| 54 |
+
assert fetcher.fetch(self.cookies_url).cookies == {"test": "value"}
|
| 55 |
|
| 56 |
def test_automation(self, fetcher):
|
| 57 |
"""Test if automation break the code or not"""
|
| 58 |
+
|
| 59 |
def scroll_page(page):
|
| 60 |
page.mouse.wheel(10, 0)
|
| 61 |
page.mouse.move(100, 400)
|
|
|
|
| 66 |
|
| 67 |
def test_properties(self, fetcher):
|
| 68 |
"""Test if different arguments breaks the code or not"""
|
| 69 |
+
assert (
|
| 70 |
+
fetcher.fetch(self.html_url, block_webrtc=True, allow_webgl=True).status
|
| 71 |
+
== 200
|
| 72 |
+
)
|
| 73 |
+
assert (
|
| 74 |
+
fetcher.fetch(self.html_url, block_webrtc=False, allow_webgl=True).status
|
| 75 |
+
== 200
|
| 76 |
+
)
|
| 77 |
+
assert (
|
| 78 |
+
fetcher.fetch(self.html_url, block_webrtc=True, allow_webgl=False).status
|
| 79 |
+
== 200
|
| 80 |
+
)
|
| 81 |
+
assert (
|
| 82 |
+
fetcher.fetch(
|
| 83 |
+
self.html_url, extra_headers={"ayo": ""}, os_randomize=True
|
| 84 |
+
).status
|
| 85 |
+
== 200
|
| 86 |
+
)
|
| 87 |
|
| 88 |
def test_infinite_timeout(self, fetcher):
|
| 89 |
"""Test if infinite timeout breaks the code or not"""
|
tests/fetchers/sync/test_httpx.py
CHANGED
|
@@ -16,14 +16,14 @@ class TestFetcher:
|
|
| 16 |
@pytest.fixture(autouse=True)
|
| 17 |
def setup_urls(self, httpbin):
|
| 18 |
"""Fixture to set up URLs for testing"""
|
| 19 |
-
self.status_200 = f
|
| 20 |
-
self.status_404 = f
|
| 21 |
-
self.status_501 = f
|
| 22 |
-
self.basic_url = f
|
| 23 |
-
self.post_url = f
|
| 24 |
-
self.put_url = f
|
| 25 |
-
self.delete_url = f
|
| 26 |
-
self.html_url = f
|
| 27 |
|
| 28 |
def test_basic_get(self, fetcher):
|
| 29 |
"""Test doing basic get request with multiple statuses"""
|
|
@@ -36,49 +36,86 @@ class TestFetcher:
|
|
| 36 |
assert fetcher.get(self.status_200, stealthy_headers=True).status == 200
|
| 37 |
assert fetcher.get(self.status_200, follow_redirects=True).status == 200
|
| 38 |
assert fetcher.get(self.status_200, timeout=None).status == 200
|
| 39 |
-
assert
|
| 40 |
-
|
| 41 |
-
|
| 42 |
-
|
| 43 |
-
|
| 44 |
-
|
|
|
|
|
|
|
|
|
|
| 45 |
|
| 46 |
def test_post_properties(self, fetcher):
|
| 47 |
"""Test if different arguments with POST request breaks the code or not"""
|
| 48 |
-
assert fetcher.post(self.post_url, data={
|
| 49 |
-
assert
|
| 50 |
-
|
| 51 |
-
|
| 52 |
-
|
| 53 |
-
|
| 54 |
-
|
| 55 |
-
|
| 56 |
-
|
| 57 |
-
|
| 58 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 59 |
|
| 60 |
def test_put_properties(self, fetcher):
|
| 61 |
"""Test if different arguments with PUT request breaks the code or not"""
|
| 62 |
-
assert fetcher.put(self.put_url, data={
|
| 63 |
-
assert
|
| 64 |
-
|
| 65 |
-
|
| 66 |
-
|
| 67 |
-
|
| 68 |
-
|
| 69 |
-
|
| 70 |
-
|
| 71 |
-
|
| 72 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 73 |
|
| 74 |
def test_delete_properties(self, fetcher):
|
| 75 |
"""Test if different arguments with DELETE request breaks the code or not"""
|
| 76 |
assert fetcher.delete(self.delete_url, stealthy_headers=True).status == 200
|
| 77 |
assert fetcher.delete(self.delete_url, follow_redirects=True).status == 200
|
| 78 |
assert fetcher.delete(self.delete_url, timeout=None).status == 200
|
| 79 |
-
assert
|
| 80 |
-
|
| 81 |
-
|
| 82 |
-
|
| 83 |
-
|
| 84 |
-
|
|
|
|
|
|
|
|
|
|
|
|
| 16 |
@pytest.fixture(autouse=True)
|
| 17 |
def setup_urls(self, httpbin):
|
| 18 |
"""Fixture to set up URLs for testing"""
|
| 19 |
+
self.status_200 = f"{httpbin.url}/status/200"
|
| 20 |
+
self.status_404 = f"{httpbin.url}/status/404"
|
| 21 |
+
self.status_501 = f"{httpbin.url}/status/501"
|
| 22 |
+
self.basic_url = f"{httpbin.url}/get"
|
| 23 |
+
self.post_url = f"{httpbin.url}/post"
|
| 24 |
+
self.put_url = f"{httpbin.url}/put"
|
| 25 |
+
self.delete_url = f"{httpbin.url}/delete"
|
| 26 |
+
self.html_url = f"{httpbin.url}/html"
|
| 27 |
|
| 28 |
def test_basic_get(self, fetcher):
|
| 29 |
"""Test doing basic get request with multiple statuses"""
|
|
|
|
| 36 |
assert fetcher.get(self.status_200, stealthy_headers=True).status == 200
|
| 37 |
assert fetcher.get(self.status_200, follow_redirects=True).status == 200
|
| 38 |
assert fetcher.get(self.status_200, timeout=None).status == 200
|
| 39 |
+
assert (
|
| 40 |
+
fetcher.get(
|
| 41 |
+
self.status_200,
|
| 42 |
+
stealthy_headers=True,
|
| 43 |
+
follow_redirects=True,
|
| 44 |
+
timeout=None,
|
| 45 |
+
).status
|
| 46 |
+
== 200
|
| 47 |
+
)
|
| 48 |
|
| 49 |
def test_post_properties(self, fetcher):
|
| 50 |
"""Test if different arguments with POST request breaks the code or not"""
|
| 51 |
+
assert fetcher.post(self.post_url, data={"key": "value"}).status == 200
|
| 52 |
+
assert (
|
| 53 |
+
fetcher.post(
|
| 54 |
+
self.post_url, data={"key": "value"}, stealthy_headers=True
|
| 55 |
+
).status
|
| 56 |
+
== 200
|
| 57 |
+
)
|
| 58 |
+
assert (
|
| 59 |
+
fetcher.post(
|
| 60 |
+
self.post_url, data={"key": "value"}, follow_redirects=True
|
| 61 |
+
).status
|
| 62 |
+
== 200
|
| 63 |
+
)
|
| 64 |
+
assert (
|
| 65 |
+
fetcher.post(self.post_url, data={"key": "value"}, timeout=None).status
|
| 66 |
+
== 200
|
| 67 |
+
)
|
| 68 |
+
assert (
|
| 69 |
+
fetcher.post(
|
| 70 |
+
self.post_url,
|
| 71 |
+
data={"key": "value"},
|
| 72 |
+
stealthy_headers=True,
|
| 73 |
+
follow_redirects=True,
|
| 74 |
+
timeout=None,
|
| 75 |
+
).status
|
| 76 |
+
== 200
|
| 77 |
+
)
|
| 78 |
|
| 79 |
def test_put_properties(self, fetcher):
|
| 80 |
"""Test if different arguments with PUT request breaks the code or not"""
|
| 81 |
+
assert fetcher.put(self.put_url, data={"key": "value"}).status == 200
|
| 82 |
+
assert (
|
| 83 |
+
fetcher.put(
|
| 84 |
+
self.put_url, data={"key": "value"}, stealthy_headers=True
|
| 85 |
+
).status
|
| 86 |
+
== 200
|
| 87 |
+
)
|
| 88 |
+
assert (
|
| 89 |
+
fetcher.put(
|
| 90 |
+
self.put_url, data={"key": "value"}, follow_redirects=True
|
| 91 |
+
).status
|
| 92 |
+
== 200
|
| 93 |
+
)
|
| 94 |
+
assert (
|
| 95 |
+
fetcher.put(self.put_url, data={"key": "value"}, timeout=None).status == 200
|
| 96 |
+
)
|
| 97 |
+
assert (
|
| 98 |
+
fetcher.put(
|
| 99 |
+
self.put_url,
|
| 100 |
+
data={"key": "value"},
|
| 101 |
+
stealthy_headers=True,
|
| 102 |
+
follow_redirects=True,
|
| 103 |
+
timeout=None,
|
| 104 |
+
).status
|
| 105 |
+
== 200
|
| 106 |
+
)
|
| 107 |
|
| 108 |
def test_delete_properties(self, fetcher):
|
| 109 |
"""Test if different arguments with DELETE request breaks the code or not"""
|
| 110 |
assert fetcher.delete(self.delete_url, stealthy_headers=True).status == 200
|
| 111 |
assert fetcher.delete(self.delete_url, follow_redirects=True).status == 200
|
| 112 |
assert fetcher.delete(self.delete_url, timeout=None).status == 200
|
| 113 |
+
assert (
|
| 114 |
+
fetcher.delete(
|
| 115 |
+
self.delete_url,
|
| 116 |
+
stealthy_headers=True,
|
| 117 |
+
follow_redirects=True,
|
| 118 |
+
timeout=None,
|
| 119 |
+
).status
|
| 120 |
+
== 200
|
| 121 |
+
)
|
tests/fetchers/sync/test_playwright.py
CHANGED
|
@@ -8,7 +8,6 @@ PlayWrightFetcher.auto_match = True
|
|
| 8 |
|
| 9 |
@pytest_httpbin.use_class_based_httpbin
|
| 10 |
class TestPlayWrightFetcher:
|
| 11 |
-
|
| 12 |
@pytest.fixture(scope="class")
|
| 13 |
def fetcher(self):
|
| 14 |
"""Fixture to create a StealthyFetcher instance for the entire test class"""
|
|
@@ -17,12 +16,12 @@ class TestPlayWrightFetcher:
|
|
| 17 |
@pytest.fixture(autouse=True)
|
| 18 |
def setup_urls(self, httpbin):
|
| 19 |
"""Fixture to set up URLs for testing"""
|
| 20 |
-
self.status_200 = f
|
| 21 |
-
self.status_404 = f
|
| 22 |
-
self.status_501 = f
|
| 23 |
-
self.basic_url = f
|
| 24 |
-
self.html_url = f
|
| 25 |
-
self.delayed_url = f
|
| 26 |
self.cookies_url = f"{httpbin.url}/cookies/set/test/value"
|
| 27 |
|
| 28 |
def test_basic_fetch(self, fetcher):
|
|
@@ -42,12 +41,17 @@ class TestPlayWrightFetcher:
|
|
| 42 |
|
| 43 |
def test_waiting_selector(self, fetcher):
|
| 44 |
"""Test if waiting for a selector make page does not finish loading or not"""
|
| 45 |
-
assert fetcher.fetch(self.html_url, wait_selector=
|
| 46 |
-
assert
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 47 |
|
| 48 |
def test_cookies_loading(self, fetcher):
|
| 49 |
"""Test if cookies are set after the request"""
|
| 50 |
-
assert fetcher.fetch(self.cookies_url).cookies == {
|
| 51 |
|
| 52 |
def test_automation(self, fetcher):
|
| 53 |
"""Test if automation break the code or not"""
|
|
@@ -60,13 +64,18 @@ class TestPlayWrightFetcher:
|
|
| 60 |
|
| 61 |
assert fetcher.fetch(self.html_url, page_action=scroll_page).status == 200
|
| 62 |
|
| 63 |
-
@pytest.mark.parametrize(
|
| 64 |
-
|
| 65 |
-
|
| 66 |
-
|
| 67 |
-
|
| 68 |
-
|
| 69 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 70 |
def test_properties(self, fetcher, kwargs):
|
| 71 |
"""Test if different arguments breaks the code or not"""
|
| 72 |
response = fetcher.fetch(self.html_url, **kwargs)
|
|
@@ -75,15 +84,18 @@ class TestPlayWrightFetcher:
|
|
| 75 |
def test_cdp_url_invalid(self, fetcher):
|
| 76 |
"""Test if invalid CDP URLs raise appropriate exceptions"""
|
| 77 |
with pytest.raises(ValueError):
|
| 78 |
-
fetcher.fetch(self.html_url, cdp_url=
|
| 79 |
|
| 80 |
with pytest.raises(ValueError):
|
| 81 |
-
fetcher.fetch(self.html_url, cdp_url=
|
| 82 |
|
| 83 |
with pytest.raises(Exception):
|
| 84 |
-
fetcher.fetch(self.html_url, cdp_url=
|
| 85 |
|
| 86 |
-
def test_infinite_timeout(
|
|
|
|
|
|
|
|
|
|
| 87 |
"""Test if infinite timeout breaks the code or not"""
|
| 88 |
response = fetcher.fetch(self.delayed_url, timeout=None)
|
| 89 |
assert response.status == 200
|
|
|
|
| 8 |
|
| 9 |
@pytest_httpbin.use_class_based_httpbin
|
| 10 |
class TestPlayWrightFetcher:
|
|
|
|
| 11 |
@pytest.fixture(scope="class")
|
| 12 |
def fetcher(self):
|
| 13 |
"""Fixture to create a StealthyFetcher instance for the entire test class"""
|
|
|
|
| 16 |
@pytest.fixture(autouse=True)
|
| 17 |
def setup_urls(self, httpbin):
|
| 18 |
"""Fixture to set up URLs for testing"""
|
| 19 |
+
self.status_200 = f"{httpbin.url}/status/200"
|
| 20 |
+
self.status_404 = f"{httpbin.url}/status/404"
|
| 21 |
+
self.status_501 = f"{httpbin.url}/status/501"
|
| 22 |
+
self.basic_url = f"{httpbin.url}/get"
|
| 23 |
+
self.html_url = f"{httpbin.url}/html"
|
| 24 |
+
self.delayed_url = f"{httpbin.url}/delay/10" # 10 Seconds delay response
|
| 25 |
self.cookies_url = f"{httpbin.url}/cookies/set/test/value"
|
| 26 |
|
| 27 |
def test_basic_fetch(self, fetcher):
|
|
|
|
| 41 |
|
| 42 |
def test_waiting_selector(self, fetcher):
|
| 43 |
"""Test if waiting for a selector make page does not finish loading or not"""
|
| 44 |
+
assert fetcher.fetch(self.html_url, wait_selector="h1").status == 200
|
| 45 |
+
assert (
|
| 46 |
+
fetcher.fetch(
|
| 47 |
+
self.html_url, wait_selector="h1", wait_selector_state="visible"
|
| 48 |
+
).status
|
| 49 |
+
== 200
|
| 50 |
+
)
|
| 51 |
|
| 52 |
def test_cookies_loading(self, fetcher):
|
| 53 |
"""Test if cookies are set after the request"""
|
| 54 |
+
assert fetcher.fetch(self.cookies_url).cookies == {"test": "value"}
|
| 55 |
|
| 56 |
def test_automation(self, fetcher):
|
| 57 |
"""Test if automation break the code or not"""
|
|
|
|
| 64 |
|
| 65 |
assert fetcher.fetch(self.html_url, page_action=scroll_page).status == 200
|
| 66 |
|
| 67 |
+
@pytest.mark.parametrize(
|
| 68 |
+
"kwargs",
|
| 69 |
+
[
|
| 70 |
+
{"disable_webgl": True, "hide_canvas": False},
|
| 71 |
+
{"disable_webgl": False, "hide_canvas": True},
|
| 72 |
+
# {"stealth": True}, # causes issues with Github Actions
|
| 73 |
+
{
|
| 74 |
+
"useragent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:131.0) Gecko/20100101 Firefox/131.0"
|
| 75 |
+
},
|
| 76 |
+
{"extra_headers": {"ayo": ""}},
|
| 77 |
+
],
|
| 78 |
+
)
|
| 79 |
def test_properties(self, fetcher, kwargs):
|
| 80 |
"""Test if different arguments breaks the code or not"""
|
| 81 |
response = fetcher.fetch(self.html_url, **kwargs)
|
|
|
|
| 84 |
def test_cdp_url_invalid(self, fetcher):
|
| 85 |
"""Test if invalid CDP URLs raise appropriate exceptions"""
|
| 86 |
with pytest.raises(ValueError):
|
| 87 |
+
fetcher.fetch(self.html_url, cdp_url="blahblah")
|
| 88 |
|
| 89 |
with pytest.raises(ValueError):
|
| 90 |
+
fetcher.fetch(self.html_url, cdp_url="blahblah", nstbrowser_mode=True)
|
| 91 |
|
| 92 |
with pytest.raises(Exception):
|
| 93 |
+
fetcher.fetch(self.html_url, cdp_url="ws://blahblah")
|
| 94 |
|
| 95 |
+
def test_infinite_timeout(
|
| 96 |
+
self,
|
| 97 |
+
fetcher,
|
| 98 |
+
):
|
| 99 |
"""Test if infinite timeout breaks the code or not"""
|
| 100 |
response = fetcher.fetch(self.delayed_url, timeout=None)
|
| 101 |
assert response.status == 200
|
tests/fetchers/test_utils.py
CHANGED
|
@@ -7,76 +7,117 @@ from scrapling.engines.toolbelt.custom import ResponseEncoding, StatusText
|
|
| 7 |
def content_type_map():
|
| 8 |
return {
|
| 9 |
# A map generated by ChatGPT for most possible `content_type` values and the expected outcome
|
| 10 |
-
|
| 11 |
-
|
| 12 |
-
|
| 13 |
-
|
| 14 |
-
|
| 15 |
-
|
| 16 |
-
|
| 17 |
-
|
| 18 |
-
|
| 19 |
-
|
| 20 |
-
|
| 21 |
-
|
| 22 |
-
|
| 23 |
-
|
| 24 |
-
|
| 25 |
-
|
| 26 |
-
'text/html; charset="UTF-8"':
|
| 27 |
-
'text/html; charset="ISO-8859-1"':
|
| 28 |
-
'text/html; charset="windows-1252"':
|
| 29 |
-
'application/json; charset="UTF-8"':
|
| 30 |
-
'application/json; charset="ISO-8859-1"':
|
| 31 |
-
'application/json; charset="windows-1252"':
|
| 32 |
-
'text/json; charset="UTF-8"':
|
| 33 |
-
'application/javascript; charset="UTF-8"':
|
| 34 |
-
'application/javascript; charset="ISO-8859-1"':
|
| 35 |
-
'text/plain; charset="UTF-8"':
|
| 36 |
-
'text/plain; charset="ISO-8859-1"':
|
| 37 |
-
'text/plain; charset="windows-1252"':
|
| 38 |
-
'application/xhtml+xml; charset="UTF-8"':
|
| 39 |
-
'application/xhtml+xml; charset="ISO-8859-1"':
|
| 40 |
-
'application/xhtml+xml; charset="windows-1252"':
|
| 41 |
-
'text/html; charset="US-ASCII"':
|
| 42 |
-
'application/json; charset="US-ASCII"':
|
| 43 |
-
'text/plain; charset="US-ASCII"':
|
| 44 |
-
'text/html; charset="Shift_JIS"':
|
| 45 |
-
'application/json; charset="Shift_JIS"':
|
| 46 |
-
'text/plain; charset="Shift_JIS"':
|
| 47 |
-
'application/xml; charset="UTF-8"':
|
| 48 |
-
'application/xml; charset="ISO-8859-1"':
|
| 49 |
-
|
| 50 |
-
'text/xml; charset="UTF-8"':
|
| 51 |
-
'text/xml; charset="ISO-8859-1"':
|
| 52 |
-
|
| 53 |
}
|
| 54 |
|
| 55 |
|
| 56 |
@pytest.fixture
|
| 57 |
def status_map():
|
| 58 |
return {
|
| 59 |
-
100: "Continue",
|
| 60 |
-
|
| 61 |
-
|
| 62 |
-
|
| 63 |
-
|
| 64 |
-
|
| 65 |
-
|
| 66 |
-
|
| 67 |
-
|
| 68 |
-
|
| 69 |
-
|
| 70 |
-
|
| 71 |
-
|
| 72 |
-
|
| 73 |
-
|
| 74 |
-
|
| 75 |
-
|
| 76 |
-
|
| 77 |
-
|
| 78 |
-
|
| 79 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 80 |
}
|
| 81 |
|
| 82 |
|
|
|
|
| 7 |
def content_type_map():
|
| 8 |
return {
|
| 9 |
# A map generated by ChatGPT for most possible `content_type` values and the expected outcome
|
| 10 |
+
"text/html; charset=UTF-8": "UTF-8",
|
| 11 |
+
"text/html; charset=ISO-8859-1": "ISO-8859-1",
|
| 12 |
+
"text/html": "ISO-8859-1",
|
| 13 |
+
"application/json; charset=UTF-8": "UTF-8",
|
| 14 |
+
"application/json": "utf-8",
|
| 15 |
+
"text/json": "utf-8",
|
| 16 |
+
"application/javascript; charset=UTF-8": "UTF-8",
|
| 17 |
+
"application/javascript": "utf-8",
|
| 18 |
+
"text/plain; charset=UTF-8": "UTF-8",
|
| 19 |
+
"text/plain; charset=ISO-8859-1": "ISO-8859-1",
|
| 20 |
+
"text/plain": "ISO-8859-1",
|
| 21 |
+
"application/xhtml+xml; charset=UTF-8": "UTF-8",
|
| 22 |
+
"application/xhtml+xml": "utf-8",
|
| 23 |
+
"text/html; charset=windows-1252": "windows-1252",
|
| 24 |
+
"application/json; charset=windows-1252": "windows-1252",
|
| 25 |
+
"text/plain; charset=windows-1252": "windows-1252",
|
| 26 |
+
'text/html; charset="UTF-8"': "UTF-8",
|
| 27 |
+
'text/html; charset="ISO-8859-1"': "ISO-8859-1",
|
| 28 |
+
'text/html; charset="windows-1252"': "windows-1252",
|
| 29 |
+
'application/json; charset="UTF-8"': "UTF-8",
|
| 30 |
+
'application/json; charset="ISO-8859-1"': "ISO-8859-1",
|
| 31 |
+
'application/json; charset="windows-1252"': "windows-1252",
|
| 32 |
+
'text/json; charset="UTF-8"': "UTF-8",
|
| 33 |
+
'application/javascript; charset="UTF-8"': "UTF-8",
|
| 34 |
+
'application/javascript; charset="ISO-8859-1"': "ISO-8859-1",
|
| 35 |
+
'text/plain; charset="UTF-8"': "UTF-8",
|
| 36 |
+
'text/plain; charset="ISO-8859-1"': "ISO-8859-1",
|
| 37 |
+
'text/plain; charset="windows-1252"': "windows-1252",
|
| 38 |
+
'application/xhtml+xml; charset="UTF-8"': "UTF-8",
|
| 39 |
+
'application/xhtml+xml; charset="ISO-8859-1"': "ISO-8859-1",
|
| 40 |
+
'application/xhtml+xml; charset="windows-1252"': "windows-1252",
|
| 41 |
+
'text/html; charset="US-ASCII"': "US-ASCII",
|
| 42 |
+
'application/json; charset="US-ASCII"': "US-ASCII",
|
| 43 |
+
'text/plain; charset="US-ASCII"': "US-ASCII",
|
| 44 |
+
'text/html; charset="Shift_JIS"': "Shift_JIS",
|
| 45 |
+
'application/json; charset="Shift_JIS"': "Shift_JIS",
|
| 46 |
+
'text/plain; charset="Shift_JIS"': "Shift_JIS",
|
| 47 |
+
'application/xml; charset="UTF-8"': "UTF-8",
|
| 48 |
+
'application/xml; charset="ISO-8859-1"': "ISO-8859-1",
|
| 49 |
+
"application/xml": "utf-8",
|
| 50 |
+
'text/xml; charset="UTF-8"': "UTF-8",
|
| 51 |
+
'text/xml; charset="ISO-8859-1"': "ISO-8859-1",
|
| 52 |
+
"text/xml": "utf-8",
|
| 53 |
}
|
| 54 |
|
| 55 |
|
| 56 |
@pytest.fixture
|
| 57 |
def status_map():
|
| 58 |
return {
|
| 59 |
+
100: "Continue",
|
| 60 |
+
101: "Switching Protocols",
|
| 61 |
+
102: "Processing",
|
| 62 |
+
103: "Early Hints",
|
| 63 |
+
200: "OK",
|
| 64 |
+
201: "Created",
|
| 65 |
+
202: "Accepted",
|
| 66 |
+
203: "Non-Authoritative Information",
|
| 67 |
+
204: "No Content",
|
| 68 |
+
205: "Reset Content",
|
| 69 |
+
206: "Partial Content",
|
| 70 |
+
207: "Multi-Status",
|
| 71 |
+
208: "Already Reported",
|
| 72 |
+
226: "IM Used",
|
| 73 |
+
300: "Multiple Choices",
|
| 74 |
+
301: "Moved Permanently",
|
| 75 |
+
302: "Found",
|
| 76 |
+
303: "See Other",
|
| 77 |
+
304: "Not Modified",
|
| 78 |
+
305: "Use Proxy",
|
| 79 |
+
307: "Temporary Redirect",
|
| 80 |
+
308: "Permanent Redirect",
|
| 81 |
+
400: "Bad Request",
|
| 82 |
+
401: "Unauthorized",
|
| 83 |
+
402: "Payment Required",
|
| 84 |
+
403: "Forbidden",
|
| 85 |
+
404: "Not Found",
|
| 86 |
+
405: "Method Not Allowed",
|
| 87 |
+
406: "Not Acceptable",
|
| 88 |
+
407: "Proxy Authentication Required",
|
| 89 |
+
408: "Request Timeout",
|
| 90 |
+
409: "Conflict",
|
| 91 |
+
410: "Gone",
|
| 92 |
+
411: "Length Required",
|
| 93 |
+
412: "Precondition Failed",
|
| 94 |
+
413: "Payload Too Large",
|
| 95 |
+
414: "URI Too Long",
|
| 96 |
+
415: "Unsupported Media Type",
|
| 97 |
+
416: "Range Not Satisfiable",
|
| 98 |
+
417: "Expectation Failed",
|
| 99 |
+
418: "I'm a teapot",
|
| 100 |
+
421: "Misdirected Request",
|
| 101 |
+
422: "Unprocessable Entity",
|
| 102 |
+
423: "Locked",
|
| 103 |
+
424: "Failed Dependency",
|
| 104 |
+
425: "Too Early",
|
| 105 |
+
426: "Upgrade Required",
|
| 106 |
+
428: "Precondition Required",
|
| 107 |
+
429: "Too Many Requests",
|
| 108 |
+
431: "Request Header Fields Too Large",
|
| 109 |
+
451: "Unavailable For Legal Reasons",
|
| 110 |
+
500: "Internal Server Error",
|
| 111 |
+
501: "Not Implemented",
|
| 112 |
+
502: "Bad Gateway",
|
| 113 |
+
503: "Service Unavailable",
|
| 114 |
+
504: "Gateway Timeout",
|
| 115 |
+
505: "HTTP Version Not Supported",
|
| 116 |
+
506: "Variant Also Negotiates",
|
| 117 |
+
507: "Insufficient Storage",
|
| 118 |
+
508: "Loop Detected",
|
| 119 |
+
510: "Not Extended",
|
| 120 |
+
511: "Network Authentication Required",
|
| 121 |
}
|
| 122 |
|
| 123 |
|
tests/parser/test_automatch.py
CHANGED
|
@@ -8,7 +8,7 @@ from scrapling import Adaptor
|
|
| 8 |
class TestParserAutoMatch:
|
| 9 |
def test_element_relocation(self):
|
| 10 |
"""Test relocating element after structure change"""
|
| 11 |
-
original_html =
|
| 12 |
<div class="container">
|
| 13 |
<section class="products">
|
| 14 |
<article class="product" id="p1">
|
|
@@ -21,8 +21,8 @@ class TestParserAutoMatch:
|
|
| 21 |
</article>
|
| 22 |
</section>
|
| 23 |
</div>
|
| 24 |
-
|
| 25 |
-
changed_html =
|
| 26 |
<div class="new-container">
|
| 27 |
<div class="product-wrapper">
|
| 28 |
<section class="products">
|
|
@@ -41,25 +41,25 @@ class TestParserAutoMatch:
|
|
| 41 |
</section>
|
| 42 |
</div>
|
| 43 |
</div>
|
| 44 |
-
|
| 45 |
|
| 46 |
-
old_page = Adaptor(original_html, url=
|
| 47 |
-
new_page = Adaptor(changed_html, url=
|
| 48 |
|
| 49 |
# 'p1' was used as ID and now it's not and all the path elements have changes
|
| 50 |
# Also at the same time testing auto-match vs combined selectors
|
| 51 |
-
_ = old_page.css(
|
| 52 |
-
relocated = new_page.css(
|
| 53 |
|
| 54 |
assert relocated is not None
|
| 55 |
-
assert relocated[0].attrib[
|
| 56 |
-
assert relocated[0].has_class(
|
| 57 |
-
assert relocated[0].css(
|
| 58 |
|
| 59 |
@pytest.mark.asyncio
|
| 60 |
async def test_element_relocation_async(self):
|
| 61 |
"""Test relocating element after structure change in async mode"""
|
| 62 |
-
original_html =
|
| 63 |
<div class="container">
|
| 64 |
<section class="products">
|
| 65 |
<article class="product" id="p1">
|
|
@@ -72,8 +72,8 @@ class TestParserAutoMatch:
|
|
| 72 |
</article>
|
| 73 |
</section>
|
| 74 |
</div>
|
| 75 |
-
|
| 76 |
-
changed_html =
|
| 77 |
<div class="new-container">
|
| 78 |
<div class="product-wrapper">
|
| 79 |
<section class="products">
|
|
@@ -92,20 +92,20 @@ class TestParserAutoMatch:
|
|
| 92 |
</section>
|
| 93 |
</div>
|
| 94 |
</div>
|
| 95 |
-
|
| 96 |
|
| 97 |
# Simulate async operation
|
| 98 |
await asyncio.sleep(0.1) # Minimal async operation
|
| 99 |
|
| 100 |
-
old_page = Adaptor(original_html, url=
|
| 101 |
-
new_page = Adaptor(changed_html, url=
|
| 102 |
|
| 103 |
# 'p1' was used as ID and now it's not and all the path elements have changes
|
| 104 |
# Also at the same time testing auto-match vs combined selectors
|
| 105 |
-
_ = old_page.css(
|
| 106 |
-
relocated = new_page.css(
|
| 107 |
|
| 108 |
assert relocated is not None
|
| 109 |
-
assert relocated[0].attrib[
|
| 110 |
-
assert relocated[0].has_class(
|
| 111 |
-
assert relocated[0].css(
|
|
|
|
| 8 |
class TestParserAutoMatch:
|
| 9 |
def test_element_relocation(self):
|
| 10 |
"""Test relocating element after structure change"""
|
| 11 |
+
original_html = """
|
| 12 |
<div class="container">
|
| 13 |
<section class="products">
|
| 14 |
<article class="product" id="p1">
|
|
|
|
| 21 |
</article>
|
| 22 |
</section>
|
| 23 |
</div>
|
| 24 |
+
"""
|
| 25 |
+
changed_html = """
|
| 26 |
<div class="new-container">
|
| 27 |
<div class="product-wrapper">
|
| 28 |
<section class="products">
|
|
|
|
| 41 |
</section>
|
| 42 |
</div>
|
| 43 |
</div>
|
| 44 |
+
"""
|
| 45 |
|
| 46 |
+
old_page = Adaptor(original_html, url="example.com", auto_match=True)
|
| 47 |
+
new_page = Adaptor(changed_html, url="example.com", auto_match=True)
|
| 48 |
|
| 49 |
# 'p1' was used as ID and now it's not and all the path elements have changes
|
| 50 |
# Also at the same time testing auto-match vs combined selectors
|
| 51 |
+
_ = old_page.css("#p1, #p2", auto_save=True)[0]
|
| 52 |
+
relocated = new_page.css("#p1", auto_match=True)
|
| 53 |
|
| 54 |
assert relocated is not None
|
| 55 |
+
assert relocated[0].attrib["data-id"] == "p1"
|
| 56 |
+
assert relocated[0].has_class("new-class")
|
| 57 |
+
assert relocated[0].css(".new-description")[0].text == "Description 1"
|
| 58 |
|
| 59 |
@pytest.mark.asyncio
|
| 60 |
async def test_element_relocation_async(self):
|
| 61 |
"""Test relocating element after structure change in async mode"""
|
| 62 |
+
original_html = """
|
| 63 |
<div class="container">
|
| 64 |
<section class="products">
|
| 65 |
<article class="product" id="p1">
|
|
|
|
| 72 |
</article>
|
| 73 |
</section>
|
| 74 |
</div>
|
| 75 |
+
"""
|
| 76 |
+
changed_html = """
|
| 77 |
<div class="new-container">
|
| 78 |
<div class="product-wrapper">
|
| 79 |
<section class="products">
|
|
|
|
| 92 |
</section>
|
| 93 |
</div>
|
| 94 |
</div>
|
| 95 |
+
"""
|
| 96 |
|
| 97 |
# Simulate async operation
|
| 98 |
await asyncio.sleep(0.1) # Minimal async operation
|
| 99 |
|
| 100 |
+
old_page = Adaptor(original_html, url="example.com", auto_match=True)
|
| 101 |
+
new_page = Adaptor(changed_html, url="example.com", auto_match=True)
|
| 102 |
|
| 103 |
# 'p1' was used as ID and now it's not and all the path elements have changes
|
| 104 |
# Also at the same time testing auto-match vs combined selectors
|
| 105 |
+
_ = old_page.css("#p1, #p2", auto_save=True)[0]
|
| 106 |
+
relocated = new_page.css("#p1", auto_match=True)
|
| 107 |
|
| 108 |
assert relocated is not None
|
| 109 |
+
assert relocated[0].attrib["data-id"] == "p1"
|
| 110 |
+
assert relocated[0].has_class("new-class")
|
| 111 |
+
assert relocated[0].css(".new-description")[0].text == "Description 1"
|
tests/parser/test_general.py
CHANGED
|
@@ -9,7 +9,7 @@ from scrapling import Adaptor
|
|
| 9 |
|
| 10 |
@pytest.fixture
|
| 11 |
def html_content():
|
| 12 |
-
return
|
| 13 |
<html>
|
| 14 |
<head>
|
| 15 |
<title>Complex Web Page</title>
|
|
@@ -73,7 +73,7 @@ def html_content():
|
|
| 73 |
</script>
|
| 74 |
</body>
|
| 75 |
</html>
|
| 76 |
-
|
| 77 |
|
| 78 |
|
| 79 |
@pytest.fixture
|
|
@@ -85,13 +85,14 @@ def page(html_content):
|
|
| 85 |
class TestCSSSelectors:
|
| 86 |
def test_basic_product_selection(self, page):
|
| 87 |
"""Test selecting all product elements"""
|
| 88 |
-
elements = page.css(
|
| 89 |
assert len(elements) == 3
|
| 90 |
|
| 91 |
def test_in_stock_product_selection(self, page):
|
| 92 |
"""Test selecting in-stock products"""
|
| 93 |
in_stock_products = page.css(
|
| 94 |
-
'main #products .product-list article.product:not(:contains("Out of stock"))'
|
|
|
|
| 95 |
assert len(in_stock_products) == 2
|
| 96 |
|
| 97 |
|
|
@@ -117,22 +118,26 @@ class TestXPathSelectors:
|
|
| 117 |
class TestTextMatching:
|
| 118 |
def test_regex_multiple_matches(self, page):
|
| 119 |
"""Test finding multiple matches with regex"""
|
| 120 |
-
stock_info = page.find_by_regex(r
|
| 121 |
assert len(stock_info) == 2
|
| 122 |
|
| 123 |
def test_regex_first_match(self, page):
|
| 124 |
"""Test finding the first match with regex"""
|
| 125 |
-
stock_info = page.find_by_regex(
|
| 126 |
-
|
|
|
|
|
|
|
| 127 |
|
| 128 |
def test_partial_text_match(self, page):
|
| 129 |
"""Test finding elements with partial text match"""
|
| 130 |
-
stock_info = page.find_by_text(r
|
| 131 |
assert len(stock_info) == 2
|
| 132 |
|
| 133 |
def test_exact_text_match(self, page):
|
| 134 |
"""Test finding elements with exact text match"""
|
| 135 |
-
out_of_stock = page.find_by_text(
|
|
|
|
|
|
|
| 136 |
assert len(out_of_stock) == 1
|
| 137 |
|
| 138 |
|
|
@@ -140,17 +145,17 @@ class TestTextMatching:
|
|
| 140 |
class TestSimilarElements:
|
| 141 |
def test_finding_similar_products(self, page):
|
| 142 |
"""Test finding similar product elements"""
|
| 143 |
-
first_product = page.css_first(
|
| 144 |
similar_products = first_product.find_similar()
|
| 145 |
assert len(similar_products) == 2
|
| 146 |
|
| 147 |
def test_finding_similar_reviews(self, page):
|
| 148 |
"""Test finding similar review elements with additional filtering"""
|
| 149 |
-
first_review = page.find(
|
| 150 |
similar_high_rated_reviews = [
|
| 151 |
review
|
| 152 |
for review in first_review.find_similar()
|
| 153 |
-
if int(review.attrib.get(
|
| 154 |
]
|
| 155 |
assert len(similar_high_rated_reviews) == 1
|
| 156 |
|
|
@@ -181,17 +186,17 @@ class TestErrorHandling:
|
|
| 181 |
def test_bad_selectors(self, page):
|
| 182 |
"""Test handling of invalid selectors"""
|
| 183 |
with pytest.raises((SelectorError, SelectorSyntaxError)):
|
| 184 |
-
page.css(
|
| 185 |
|
| 186 |
with pytest.raises((SelectorError, SelectorSyntaxError)):
|
| 187 |
-
page.xpath(
|
| 188 |
|
| 189 |
|
| 190 |
# Pickling and Object Representation Tests
|
| 191 |
class TestPicklingAndRepresentation:
|
| 192 |
def test_unpickleable_objects(self, page):
|
| 193 |
"""Test that Adaptor objects cannot be pickled"""
|
| 194 |
-
table = page.css(
|
| 195 |
with pytest.raises(TypeError):
|
| 196 |
pickle.dumps(table)
|
| 197 |
|
|
@@ -200,7 +205,7 @@ class TestPicklingAndRepresentation:
|
|
| 200 |
|
| 201 |
def test_string_representations(self, page):
|
| 202 |
"""Test custom string representations of objects"""
|
| 203 |
-
table = page.css(
|
| 204 |
assert issubclass(type(table.__str__()), str)
|
| 205 |
assert issubclass(type(table.__repr__()), str)
|
| 206 |
assert issubclass(type(table.attrib.__str__()), str)
|
|
@@ -211,40 +216,40 @@ class TestPicklingAndRepresentation:
|
|
| 211 |
class TestElementNavigation:
|
| 212 |
def test_basic_navigation_properties(self, page):
|
| 213 |
"""Test basic navigation properties of elements"""
|
| 214 |
-
table = page.css(
|
| 215 |
assert table.path is not None
|
| 216 |
-
assert table.html_content !=
|
| 217 |
-
assert table.prettify() !=
|
| 218 |
|
| 219 |
def test_parent_and_sibling_navigation(self, page):
|
| 220 |
"""Test parent and sibling navigation"""
|
| 221 |
-
table = page.css(
|
| 222 |
parent = table.parent
|
| 223 |
-
assert parent.attrib[
|
| 224 |
|
| 225 |
parent_siblings = parent.siblings
|
| 226 |
assert len(parent_siblings) == 1
|
| 227 |
|
| 228 |
def test_child_navigation(self, page):
|
| 229 |
"""Test child navigation"""
|
| 230 |
-
table = page.css(
|
| 231 |
children = table.children
|
| 232 |
assert len(children) == 3
|
| 233 |
|
| 234 |
def test_next_and_previous_navigation(self, page):
|
| 235 |
"""Test next and previous element navigation"""
|
| 236 |
-
child = page.css(
|
| 237 |
next_element = child.next
|
| 238 |
-
assert next_element.attrib[
|
| 239 |
|
| 240 |
prev_element = next_element.previous
|
| 241 |
assert prev_element.tag == child.tag
|
| 242 |
|
| 243 |
def test_ancestor_finding(self, page):
|
| 244 |
"""Test finding ancestors of elements"""
|
| 245 |
-
all_prices = page.css(
|
| 246 |
products_with_prices = [
|
| 247 |
-
price.find_ancestor(lambda p: p.has_class(
|
| 248 |
for price in all_prices
|
| 249 |
]
|
| 250 |
assert len(products_with_prices) == 3
|
|
@@ -254,52 +259,59 @@ class TestElementNavigation:
|
|
| 254 |
class TestJSONAndAttributes:
|
| 255 |
def test_json_conversion(self, page):
|
| 256 |
"""Test converting content to JSON"""
|
| 257 |
-
script_content = page.css(
|
| 258 |
assert issubclass(type(script_content.sort()), str)
|
| 259 |
page_data = script_content.json()
|
| 260 |
-
assert page_data[
|
| 261 |
-
assert
|
| 262 |
|
| 263 |
def test_attribute_operations(self, page):
|
| 264 |
"""Test various attribute-related operations"""
|
| 265 |
# Product ID extraction
|
| 266 |
-
products = page.css(
|
| 267 |
-
product_ids = [product.attrib[
|
| 268 |
-
assert product_ids == [
|
| 269 |
-
assert
|
| 270 |
|
| 271 |
# Review rating calculations
|
| 272 |
-
reviews = page.css(
|
| 273 |
-
review_ratings = [int(review.attrib[
|
| 274 |
assert sum(review_ratings) / len(review_ratings) == 4.5
|
| 275 |
|
| 276 |
# Attribute searching
|
| 277 |
-
key_value = list(products[0].attrib.search_values(
|
| 278 |
-
assert list(key_value[0].keys()) == [
|
| 279 |
|
| 280 |
-
key_value = list(products[0].attrib.search_values(
|
| 281 |
-
assert list(key_value[0].keys()) == [
|
| 282 |
|
| 283 |
# JSON attribute conversion
|
| 284 |
-
attr_json = page.css_first(
|
| 285 |
-
assert attr_json == {
|
| 286 |
-
assert isinstance(page.css(
|
| 287 |
|
| 288 |
|
| 289 |
# Performance Test
|
| 290 |
def test_large_html_parsing_performance():
|
| 291 |
"""Test parsing and selecting performance on large HTML"""
|
| 292 |
-
large_html =
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 293 |
|
| 294 |
start_time = time.time()
|
| 295 |
parsed = Adaptor(large_html, auto_match=False)
|
| 296 |
-
elements = parsed.css(
|
| 297 |
end_time = time.time()
|
| 298 |
|
| 299 |
assert len(elements) == 5000
|
| 300 |
# Converting 5000 elements to a class and doing operations on them will take time
|
| 301 |
# Based on my tests with 100 runs, 1 loop each Scrapling (given the extra work/features) takes 10.4ms on average
|
| 302 |
-
assert
|
|
|
|
|
|
|
| 303 |
|
| 304 |
|
| 305 |
# Selector Generation Test
|
|
@@ -318,13 +330,13 @@ def test_selectors_generation(page):
|
|
| 318 |
# Miscellaneous Tests
|
| 319 |
def test_getting_all_text(page):
|
| 320 |
"""Test getting all text from the page"""
|
| 321 |
-
assert page.get_all_text() !=
|
| 322 |
|
| 323 |
|
| 324 |
def test_regex_on_text(page):
|
| 325 |
"""Test regex operations on text"""
|
| 326 |
element = page.css('[data-id="1"] .price')[0]
|
| 327 |
-
match = element.re_first(r
|
| 328 |
-
assert match ==
|
| 329 |
-
match = element.text.re(r
|
| 330 |
assert len(match) == 2
|
|
|
|
| 9 |
|
| 10 |
@pytest.fixture
|
| 11 |
def html_content():
|
| 12 |
+
return """
|
| 13 |
<html>
|
| 14 |
<head>
|
| 15 |
<title>Complex Web Page</title>
|
|
|
|
| 73 |
</script>
|
| 74 |
</body>
|
| 75 |
</html>
|
| 76 |
+
"""
|
| 77 |
|
| 78 |
|
| 79 |
@pytest.fixture
|
|
|
|
| 85 |
class TestCSSSelectors:
|
| 86 |
def test_basic_product_selection(self, page):
|
| 87 |
"""Test selecting all product elements"""
|
| 88 |
+
elements = page.css("main #products .product-list article.product")
|
| 89 |
assert len(elements) == 3
|
| 90 |
|
| 91 |
def test_in_stock_product_selection(self, page):
|
| 92 |
"""Test selecting in-stock products"""
|
| 93 |
in_stock_products = page.css(
|
| 94 |
+
'main #products .product-list article.product:not(:contains("Out of stock"))'
|
| 95 |
+
)
|
| 96 |
assert len(in_stock_products) == 2
|
| 97 |
|
| 98 |
|
|
|
|
| 118 |
class TestTextMatching:
|
| 119 |
def test_regex_multiple_matches(self, page):
|
| 120 |
"""Test finding multiple matches with regex"""
|
| 121 |
+
stock_info = page.find_by_regex(r"In stock: \d+", first_match=False)
|
| 122 |
assert len(stock_info) == 2
|
| 123 |
|
| 124 |
def test_regex_first_match(self, page):
|
| 125 |
"""Test finding the first match with regex"""
|
| 126 |
+
stock_info = page.find_by_regex(
|
| 127 |
+
r"In stock: \d+", first_match=True, case_sensitive=True
|
| 128 |
+
)
|
| 129 |
+
assert stock_info.text == "In stock: 5"
|
| 130 |
|
| 131 |
def test_partial_text_match(self, page):
|
| 132 |
"""Test finding elements with partial text match"""
|
| 133 |
+
stock_info = page.find_by_text(r"In stock:", partial=True, first_match=False)
|
| 134 |
assert len(stock_info) == 2
|
| 135 |
|
| 136 |
def test_exact_text_match(self, page):
|
| 137 |
"""Test finding elements with exact text match"""
|
| 138 |
+
out_of_stock = page.find_by_text(
|
| 139 |
+
"Out of stock", partial=False, first_match=False
|
| 140 |
+
)
|
| 141 |
assert len(out_of_stock) == 1
|
| 142 |
|
| 143 |
|
|
|
|
| 145 |
class TestSimilarElements:
|
| 146 |
def test_finding_similar_products(self, page):
|
| 147 |
"""Test finding similar product elements"""
|
| 148 |
+
first_product = page.css_first(".product")
|
| 149 |
similar_products = first_product.find_similar()
|
| 150 |
assert len(similar_products) == 2
|
| 151 |
|
| 152 |
def test_finding_similar_reviews(self, page):
|
| 153 |
"""Test finding similar review elements with additional filtering"""
|
| 154 |
+
first_review = page.find("div", class_="review")
|
| 155 |
similar_high_rated_reviews = [
|
| 156 |
review
|
| 157 |
for review in first_review.find_similar()
|
| 158 |
+
if int(review.attrib.get("data-rating", 0)) >= 4
|
| 159 |
]
|
| 160 |
assert len(similar_high_rated_reviews) == 1
|
| 161 |
|
|
|
|
| 186 |
def test_bad_selectors(self, page):
|
| 187 |
"""Test handling of invalid selectors"""
|
| 188 |
with pytest.raises((SelectorError, SelectorSyntaxError)):
|
| 189 |
+
page.css("4 ayo")
|
| 190 |
|
| 191 |
with pytest.raises((SelectorError, SelectorSyntaxError)):
|
| 192 |
+
page.xpath("4 ayo")
|
| 193 |
|
| 194 |
|
| 195 |
# Pickling and Object Representation Tests
|
| 196 |
class TestPicklingAndRepresentation:
|
| 197 |
def test_unpickleable_objects(self, page):
|
| 198 |
"""Test that Adaptor objects cannot be pickled"""
|
| 199 |
+
table = page.css(".product-list")[0]
|
| 200 |
with pytest.raises(TypeError):
|
| 201 |
pickle.dumps(table)
|
| 202 |
|
|
|
|
| 205 |
|
| 206 |
def test_string_representations(self, page):
|
| 207 |
"""Test custom string representations of objects"""
|
| 208 |
+
table = page.css(".product-list")[0]
|
| 209 |
assert issubclass(type(table.__str__()), str)
|
| 210 |
assert issubclass(type(table.__repr__()), str)
|
| 211 |
assert issubclass(type(table.attrib.__str__()), str)
|
|
|
|
| 216 |
class TestElementNavigation:
|
| 217 |
def test_basic_navigation_properties(self, page):
|
| 218 |
"""Test basic navigation properties of elements"""
|
| 219 |
+
table = page.css(".product-list")[0]
|
| 220 |
assert table.path is not None
|
| 221 |
+
assert table.html_content != ""
|
| 222 |
+
assert table.prettify() != ""
|
| 223 |
|
| 224 |
def test_parent_and_sibling_navigation(self, page):
|
| 225 |
"""Test parent and sibling navigation"""
|
| 226 |
+
table = page.css(".product-list")[0]
|
| 227 |
parent = table.parent
|
| 228 |
+
assert parent.attrib["id"] == "products"
|
| 229 |
|
| 230 |
parent_siblings = parent.siblings
|
| 231 |
assert len(parent_siblings) == 1
|
| 232 |
|
| 233 |
def test_child_navigation(self, page):
|
| 234 |
"""Test child navigation"""
|
| 235 |
+
table = page.css(".product-list")[0]
|
| 236 |
children = table.children
|
| 237 |
assert len(children) == 3
|
| 238 |
|
| 239 |
def test_next_and_previous_navigation(self, page):
|
| 240 |
"""Test next and previous element navigation"""
|
| 241 |
+
child = page.css(".product-list")[0].find({"data-id": "1"})
|
| 242 |
next_element = child.next
|
| 243 |
+
assert next_element.attrib["data-id"] == "2"
|
| 244 |
|
| 245 |
prev_element = next_element.previous
|
| 246 |
assert prev_element.tag == child.tag
|
| 247 |
|
| 248 |
def test_ancestor_finding(self, page):
|
| 249 |
"""Test finding ancestors of elements"""
|
| 250 |
+
all_prices = page.css(".price")
|
| 251 |
products_with_prices = [
|
| 252 |
+
price.find_ancestor(lambda p: p.has_class("product"))
|
| 253 |
for price in all_prices
|
| 254 |
]
|
| 255 |
assert len(products_with_prices) == 3
|
|
|
|
| 259 |
class TestJSONAndAttributes:
|
| 260 |
def test_json_conversion(self, page):
|
| 261 |
"""Test converting content to JSON"""
|
| 262 |
+
script_content = page.css("#page-data::text")[0]
|
| 263 |
assert issubclass(type(script_content.sort()), str)
|
| 264 |
page_data = script_content.json()
|
| 265 |
+
assert page_data["totalProducts"] == 3
|
| 266 |
+
assert "lastUpdated" in page_data
|
| 267 |
|
| 268 |
def test_attribute_operations(self, page):
|
| 269 |
"""Test various attribute-related operations"""
|
| 270 |
# Product ID extraction
|
| 271 |
+
products = page.css(".product")
|
| 272 |
+
product_ids = [product.attrib["data-id"] for product in products]
|
| 273 |
+
assert product_ids == ["1", "2", "3"]
|
| 274 |
+
assert "data-id" in products[0].attrib
|
| 275 |
|
| 276 |
# Review rating calculations
|
| 277 |
+
reviews = page.css(".review")
|
| 278 |
+
review_ratings = [int(review.attrib["data-rating"]) for review in reviews]
|
| 279 |
assert sum(review_ratings) / len(review_ratings) == 4.5
|
| 280 |
|
| 281 |
# Attribute searching
|
| 282 |
+
key_value = list(products[0].attrib.search_values("1", partial=False))
|
| 283 |
+
assert list(key_value[0].keys()) == ["data-id"]
|
| 284 |
|
| 285 |
+
key_value = list(products[0].attrib.search_values("1", partial=True))
|
| 286 |
+
assert list(key_value[0].keys()) == ["data-id"]
|
| 287 |
|
| 288 |
# JSON attribute conversion
|
| 289 |
+
attr_json = page.css_first("#products").attrib["schema"].json()
|
| 290 |
+
assert attr_json == {"jsonable": "data"}
|
| 291 |
+
assert isinstance(page.css("#products")[0].attrib.json_string, bytes)
|
| 292 |
|
| 293 |
|
| 294 |
# Performance Test
|
| 295 |
def test_large_html_parsing_performance():
|
| 296 |
"""Test parsing and selecting performance on large HTML"""
|
| 297 |
+
large_html = (
|
| 298 |
+
"<html><body>"
|
| 299 |
+
+ '<div class="item">' * 5000
|
| 300 |
+
+ "</div>" * 5000
|
| 301 |
+
+ "</body></html>"
|
| 302 |
+
)
|
| 303 |
|
| 304 |
start_time = time.time()
|
| 305 |
parsed = Adaptor(large_html, auto_match=False)
|
| 306 |
+
elements = parsed.css(".item")
|
| 307 |
end_time = time.time()
|
| 308 |
|
| 309 |
assert len(elements) == 5000
|
| 310 |
# Converting 5000 elements to a class and doing operations on them will take time
|
| 311 |
# Based on my tests with 100 runs, 1 loop each Scrapling (given the extra work/features) takes 10.4ms on average
|
| 312 |
+
assert (
|
| 313 |
+
end_time - start_time < 0.5
|
| 314 |
+
) # Locally I test on 0.1 but on GitHub actions with browsers and threading sometimes closing adds fractions of seconds
|
| 315 |
|
| 316 |
|
| 317 |
# Selector Generation Test
|
|
|
|
| 330 |
# Miscellaneous Tests
|
| 331 |
def test_getting_all_text(page):
|
| 332 |
"""Test getting all text from the page"""
|
| 333 |
+
assert page.get_all_text() != ""
|
| 334 |
|
| 335 |
|
| 336 |
def test_regex_on_text(page):
|
| 337 |
"""Test regex operations on text"""
|
| 338 |
element = page.css('[data-id="1"] .price')[0]
|
| 339 |
+
match = element.re_first(r"[\.\d]+")
|
| 340 |
+
assert match == "10.99"
|
| 341 |
+
match = element.text.re(r"(\d+)", replace_entities=False)
|
| 342 |
assert len(match) == 2
|