Karim shoair commited on
Commit
fcedcce
·
1 Parent(s): f300870

chore: migrating to ruff and updating pre-commit hooks

Browse files
.flake8 DELETED
@@ -1,3 +0,0 @@
1
- [flake8]
2
- ignore = E501, F401
3
- exclude = .git,.venv,__pycache__,docs,.github,build,dist,tests,benchmarks.py
 
 
 
 
.pre-commit-config.yaml CHANGED
@@ -1,17 +1,18 @@
1
  repos:
2
  - repo: https://github.com/PyCQA/bandit
3
- rev: 1.8.0
4
  hooks:
5
  - id: bandit
6
  args: [-r, -c, .bandit.yml]
7
- - repo: https://github.com/PyCQA/flake8
8
- rev: 7.1.1
 
9
  hooks:
10
- - id: flake8
11
- - repo: https://github.com/pycqa/isort
12
- rev: 5.13.2
13
- hooks:
14
- - id: isort
15
  - repo: https://github.com/netromdk/vermin
16
  rev: v1.6.0
17
  hooks:
 
1
  repos:
2
  - repo: https://github.com/PyCQA/bandit
3
+ rev: 1.8.3
4
  hooks:
5
  - id: bandit
6
  args: [-r, -c, .bandit.yml]
7
+ - repo: https://github.com/astral-sh/ruff-pre-commit
8
+ # Ruff version.
9
+ rev: v0.11.5
10
  hooks:
11
+ # Run the linter.
12
+ - id: ruff
13
+ args: [ --fix ]
14
+ # Run the formatter.
15
+ - id: ruff-format
16
  - repo: https://github.com/netromdk/vermin
17
  rev: v1.6.0
18
  hooks:
benchmarks.py CHANGED
@@ -14,19 +14,27 @@ from selectolax.parser import HTMLParser
14
 
15
  from scrapling import Adaptor
16
 
17
- large_html = '<html><body>' + '<div class="item">' * 5000 + '</div>' * 5000 + '</body></html>'
 
 
18
 
19
 
20
  def benchmark(func):
21
  @functools.wraps(func)
22
  def wrapper(*args, **kwargs):
23
- benchmark_name = func.__name__.replace('test_', '').replace('_', ' ')
24
  print(f"-> {benchmark_name}", end=" ", flush=True)
25
  # Warm-up phase
26
- timeit.repeat(lambda: func(*args, **kwargs), number=2, repeat=2, globals=globals())
 
 
27
  # Measure time (1 run, repeat 100 times, take average)
28
  times = timeit.repeat(
29
- lambda: func(*args, **kwargs), number=1, repeat=100, globals=globals(), timer=time.process_time
 
 
 
 
30
  )
31
  min_time = round(mean(times) * 1000, 2) # Convert to milliseconds
32
  print(f"average execution time: {min_time} ms")
@@ -42,23 +50,24 @@ def test_lxml():
42
  for e in etree.fromstring(
43
  large_html,
44
  # Scrapling and Parsel use the same parser inside so this is just to make it fair
45
- parser=html.HTMLParser(recover=True, huge_tree=True)
46
- ).cssselect('.item')]
 
47
 
48
 
49
  @benchmark
50
  def test_bs4_lxml():
51
- return [e.text for e in BeautifulSoup(large_html, 'lxml').select('.item')]
52
 
53
 
54
  @benchmark
55
  def test_bs4_html5lib():
56
- return [e.text for e in BeautifulSoup(large_html, 'html5lib').select('.item')]
57
 
58
 
59
  @benchmark
60
  def test_pyquery():
61
- return [e.text() for e in pq(large_html)('.item').items()]
62
 
63
 
64
  @benchmark
@@ -66,33 +75,33 @@ def test_scrapling():
66
  # No need to do `.extract()` like parsel to extract text
67
  # Also, this is faster than `[t.text for t in Adaptor(large_html, auto_match=False).css('.item')]`
68
  # for obvious reasons, of course.
69
- return Adaptor(large_html, auto_match=False).css('.item::text')
70
 
71
 
72
  @benchmark
73
  def test_parsel():
74
- return Selector(text=large_html).css('.item::text').extract()
75
 
76
 
77
  @benchmark
78
  def test_mechanicalsoup():
79
  browser = StatefulBrowser()
80
  browser.open_fake_page(large_html)
81
- return [e.text for e in browser.page.select('.item')]
82
 
83
 
84
  @benchmark
85
  def test_selectolax():
86
- return [node.text() for node in HTMLParser(large_html).css('.item')]
87
 
88
 
89
  def display(results):
90
  # Sort and display results
91
  sorted_results = sorted(results.items(), key=lambda x: x[1]) # Sort by time
92
- scrapling_time = results['Scrapling']
93
  print("\nRanked Results (fastest to slowest):")
94
  print(f" i. {'Library tested':<18} | {'avg. time (ms)':<15} | vs Scrapling")
95
- print('-' * 50)
96
  for i, (test_name, test_time) in enumerate(sorted_results, 1):
97
  compare = round(test_time / scrapling_time, 3)
98
  print(f" {i}. {test_name:<18} | {str(test_time):<15} | {compare}")
@@ -102,25 +111,28 @@ def display(results):
102
  def test_scrapling_text(request_html):
103
  # Will loop over resulted elements to get text too to make comparison even more fair otherwise Scrapling will be even faster
104
  return [
105
- element.text for element in Adaptor(
106
- request_html, auto_match=False
107
- ).find_by_text('Tipping the Velvet', first_match=True).find_similar(ignore_attributes=['title'])
 
108
  ]
109
 
110
 
111
  @benchmark
112
  def test_autoscraper(request_html):
113
  # autoscraper by default returns elements text
114
- return AutoScraper().build(html=request_html, wanted_list=['Tipping the Velvet'])
115
 
116
 
117
  if __name__ == "__main__":
118
- print(' Benchmark: Speed of parsing and retrieving the text content of 5000 nested elements \n')
 
 
119
  results1 = {
120
  "Raw Lxml": test_lxml(),
121
  "Parsel/Scrapy": test_parsel(),
122
  "Scrapling": test_scrapling(),
123
- 'Selectolax': test_selectolax(),
124
  "PyQuery": test_pyquery(),
125
  "BS4 with Lxml": test_bs4_lxml(),
126
  "MechanicalSoup": test_mechanicalsoup(),
@@ -128,10 +140,10 @@ if __name__ == "__main__":
128
  }
129
 
130
  display(results1)
131
- print('\n' + "="*25)
132
- req = requests.get('https://books.toscrape.com/index.html')
133
  print(
134
- ' Benchmark: Speed of searching for an element by text content, and retrieving the text of similar elements\n'
135
  )
136
  results2 = {
137
  "Scrapling": test_scrapling_text(req.text),
 
14
 
15
  from scrapling import Adaptor
16
 
17
+ large_html = (
18
+ "<html><body>" + '<div class="item">' * 5000 + "</div>" * 5000 + "</body></html>"
19
+ )
20
 
21
 
22
  def benchmark(func):
23
  @functools.wraps(func)
24
  def wrapper(*args, **kwargs):
25
+ benchmark_name = func.__name__.replace("test_", "").replace("_", " ")
26
  print(f"-> {benchmark_name}", end=" ", flush=True)
27
  # Warm-up phase
28
+ timeit.repeat(
29
+ lambda: func(*args, **kwargs), number=2, repeat=2, globals=globals()
30
+ )
31
  # Measure time (1 run, repeat 100 times, take average)
32
  times = timeit.repeat(
33
+ lambda: func(*args, **kwargs),
34
+ number=1,
35
+ repeat=100,
36
+ globals=globals(),
37
+ timer=time.process_time,
38
  )
39
  min_time = round(mean(times) * 1000, 2) # Convert to milliseconds
40
  print(f"average execution time: {min_time} ms")
 
50
  for e in etree.fromstring(
51
  large_html,
52
  # Scrapling and Parsel use the same parser inside so this is just to make it fair
53
+ parser=html.HTMLParser(recover=True, huge_tree=True),
54
+ ).cssselect(".item")
55
+ ]
56
 
57
 
58
  @benchmark
59
  def test_bs4_lxml():
60
+ return [e.text for e in BeautifulSoup(large_html, "lxml").select(".item")]
61
 
62
 
63
  @benchmark
64
  def test_bs4_html5lib():
65
+ return [e.text for e in BeautifulSoup(large_html, "html5lib").select(".item")]
66
 
67
 
68
  @benchmark
69
  def test_pyquery():
70
+ return [e.text() for e in pq(large_html)(".item").items()]
71
 
72
 
73
  @benchmark
 
75
  # No need to do `.extract()` like parsel to extract text
76
  # Also, this is faster than `[t.text for t in Adaptor(large_html, auto_match=False).css('.item')]`
77
  # for obvious reasons, of course.
78
+ return Adaptor(large_html, auto_match=False).css(".item::text")
79
 
80
 
81
  @benchmark
82
  def test_parsel():
83
+ return Selector(text=large_html).css(".item::text").extract()
84
 
85
 
86
  @benchmark
87
  def test_mechanicalsoup():
88
  browser = StatefulBrowser()
89
  browser.open_fake_page(large_html)
90
+ return [e.text for e in browser.page.select(".item")]
91
 
92
 
93
  @benchmark
94
  def test_selectolax():
95
+ return [node.text() for node in HTMLParser(large_html).css(".item")]
96
 
97
 
98
  def display(results):
99
  # Sort and display results
100
  sorted_results = sorted(results.items(), key=lambda x: x[1]) # Sort by time
101
+ scrapling_time = results["Scrapling"]
102
  print("\nRanked Results (fastest to slowest):")
103
  print(f" i. {'Library tested':<18} | {'avg. time (ms)':<15} | vs Scrapling")
104
+ print("-" * 50)
105
  for i, (test_name, test_time) in enumerate(sorted_results, 1):
106
  compare = round(test_time / scrapling_time, 3)
107
  print(f" {i}. {test_name:<18} | {str(test_time):<15} | {compare}")
 
111
  def test_scrapling_text(request_html):
112
  # Will loop over resulted elements to get text too to make comparison even more fair otherwise Scrapling will be even faster
113
  return [
114
+ element.text
115
+ for element in Adaptor(request_html, auto_match=False)
116
+ .find_by_text("Tipping the Velvet", first_match=True)
117
+ .find_similar(ignore_attributes=["title"])
118
  ]
119
 
120
 
121
  @benchmark
122
  def test_autoscraper(request_html):
123
  # autoscraper by default returns elements text
124
+ return AutoScraper().build(html=request_html, wanted_list=["Tipping the Velvet"])
125
 
126
 
127
  if __name__ == "__main__":
128
+ print(
129
+ " Benchmark: Speed of parsing and retrieving the text content of 5000 nested elements \n"
130
+ )
131
  results1 = {
132
  "Raw Lxml": test_lxml(),
133
  "Parsel/Scrapy": test_parsel(),
134
  "Scrapling": test_scrapling(),
135
+ "Selectolax": test_selectolax(),
136
  "PyQuery": test_pyquery(),
137
  "BS4 with Lxml": test_bs4_lxml(),
138
  "MechanicalSoup": test_mechanicalsoup(),
 
140
  }
141
 
142
  display(results1)
143
+ print("\n" + "=" * 25)
144
+ req = requests.get("https://books.toscrape.com/index.html")
145
  print(
146
+ " Benchmark: Speed of searching for an element by text content, and retrieving the text of similar elements\n"
147
  )
148
  results2 = {
149
  "Scrapling": test_scrapling_text(req.text),
cleanup.py CHANGED
@@ -9,12 +9,12 @@ def clean():
9
 
10
  # Directories and patterns to clean
11
  cleanup_patterns = [
12
- 'build',
13
- 'dist',
14
- '*.egg-info',
15
- '__pycache__',
16
- '.eggs',
17
- '.pytest_cache'
18
  ]
19
 
20
  # Clean directories
@@ -30,7 +30,7 @@ def clean():
30
  print(f"Could not remove {path}: {e}")
31
 
32
  # Remove compiled Python files
33
- for path in base_dir.rglob('*.py[co]'):
34
  try:
35
  path.unlink()
36
  print(f"Removed compiled file: {path}")
@@ -38,5 +38,5 @@ def clean():
38
  print(f"Could not remove {path}: {e}")
39
 
40
 
41
- if __name__ == '__main__':
42
  clean()
 
9
 
10
  # Directories and patterns to clean
11
  cleanup_patterns = [
12
+ "build",
13
+ "dist",
14
+ "*.egg-info",
15
+ "__pycache__",
16
+ ".eggs",
17
+ ".pytest_cache",
18
  ]
19
 
20
  # Clean directories
 
30
  print(f"Could not remove {path}: {e}")
31
 
32
  # Remove compiled Python files
33
+ for path in base_dir.rglob("*.py[co]"):
34
  try:
35
  path.unlink()
36
  print(f"Removed compiled file: {path}")
 
38
  print(f"Could not remove {path}: {e}")
39
 
40
 
41
+ if __name__ == "__main__":
42
  clean()
ruff.toml ADDED
@@ -0,0 +1,22 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ exclude = [
2
+ ".git",
3
+ ".venv",
4
+ "__pycache__",
5
+ "docs",
6
+ ".github",
7
+ "build",
8
+ "dist",
9
+ "tests",
10
+ "benchmarks.py",
11
+ ]
12
+
13
+ # Assume Python 3.9
14
+ target-version = "py39"
15
+
16
+ [lint]
17
+ select = ["E", "F", "W"]
18
+ ignore = ["E501", "F401"]
19
+
20
+ [format]
21
+ # Like Black, use double quotes for strings.
22
+ quote-style = "double"
scrapling/__init__.py CHANGED
@@ -1,4 +1,3 @@
1
-
2
  __author__ = "Karim Shoair (karim.shoair@pm.me)"
3
  __version__ = "0.2.99"
4
  __copyright__ = "Copyright (c) 2024 Karim Shoair"
@@ -7,35 +6,44 @@ __copyright__ = "Copyright (c) 2024 Karim Shoair"
7
  # A lightweight approach to create lazy loader for each import for backward compatibility
8
  # This will reduces initial memory footprint significantly (only loads what's used)
9
  def __getattr__(name):
10
- if name == 'Fetcher':
11
  from scrapling.fetchers import Fetcher as cls
 
12
  return cls
13
- elif name == 'Adaptor':
14
  from scrapling.parser import Adaptor as cls
 
15
  return cls
16
- elif name == 'Adaptors':
17
  from scrapling.parser import Adaptors as cls
 
18
  return cls
19
- elif name == 'AttributesHandler':
20
  from scrapling.core.custom_types import AttributesHandler as cls
 
21
  return cls
22
- elif name == 'TextHandler':
23
  from scrapling.core.custom_types import TextHandler as cls
 
24
  return cls
25
- elif name == 'AsyncFetcher':
26
  from scrapling.fetchers import AsyncFetcher as cls
 
27
  return cls
28
- elif name == 'StealthyFetcher':
29
  from scrapling.fetchers import StealthyFetcher as cls
 
30
  return cls
31
- elif name == 'PlayWrightFetcher':
32
  from scrapling.fetchers import PlayWrightFetcher as cls
 
33
  return cls
34
- elif name == 'CustomFetcher':
35
  from scrapling.fetchers import CustomFetcher as cls
 
36
  return cls
37
  else:
38
  raise AttributeError(f"module 'scrapling' has no attribute '{name}'")
39
 
40
 
41
- __all__ = ['Adaptor', 'Fetcher', 'AsyncFetcher', 'StealthyFetcher', 'PlayWrightFetcher']
 
 
1
  __author__ = "Karim Shoair (karim.shoair@pm.me)"
2
  __version__ = "0.2.99"
3
  __copyright__ = "Copyright (c) 2024 Karim Shoair"
 
6
  # A lightweight approach to create lazy loader for each import for backward compatibility
7
  # This will reduces initial memory footprint significantly (only loads what's used)
8
  def __getattr__(name):
9
+ if name == "Fetcher":
10
  from scrapling.fetchers import Fetcher as cls
11
+
12
  return cls
13
+ elif name == "Adaptor":
14
  from scrapling.parser import Adaptor as cls
15
+
16
  return cls
17
+ elif name == "Adaptors":
18
  from scrapling.parser import Adaptors as cls
19
+
20
  return cls
21
+ elif name == "AttributesHandler":
22
  from scrapling.core.custom_types import AttributesHandler as cls
23
+
24
  return cls
25
+ elif name == "TextHandler":
26
  from scrapling.core.custom_types import TextHandler as cls
27
+
28
  return cls
29
+ elif name == "AsyncFetcher":
30
  from scrapling.fetchers import AsyncFetcher as cls
31
+
32
  return cls
33
+ elif name == "StealthyFetcher":
34
  from scrapling.fetchers import StealthyFetcher as cls
35
+
36
  return cls
37
+ elif name == "PlayWrightFetcher":
38
  from scrapling.fetchers import PlayWrightFetcher as cls
39
+
40
  return cls
41
+ elif name == "CustomFetcher":
42
  from scrapling.fetchers import CustomFetcher as cls
43
+
44
  return cls
45
  else:
46
  raise AttributeError(f"module 'scrapling' has no attribute '{name}'")
47
 
48
 
49
+ __all__ = ["Adaptor", "Fetcher", "AsyncFetcher", "StealthyFetcher", "PlayWrightFetcher"]
scrapling/cli.py CHANGED
@@ -12,21 +12,41 @@ def get_package_dir():
12
 
13
  def run_command(command, line):
14
  print(f"Installing {line}...")
15
- _ = subprocess.check_call(' '.join(command), shell=True)
16
  # I meant to not use try except here
17
 
18
 
19
  @click.command(help="Install all Scrapling's Fetchers dependencies")
20
- @click.option('-f', '--force', 'force', is_flag=True, default=False, type=bool, help="Force Scrapling to reinstall all Fetchers dependencies")
 
 
 
 
 
 
 
 
21
  def install(force):
22
- if force or not get_package_dir().joinpath(".scrapling_dependencies_installed").exists():
23
- run_command([sys.executable, "-m", "playwright", "install", 'chromium'], 'Playwright browsers')
24
- run_command([sys.executable, "-m", "playwright", "install-deps", 'chromium', 'firefox'], 'Playwright dependencies')
25
- run_command([sys.executable, "-m", "camoufox", "fetch", '--browserforge'], 'Camoufox browser and databases')
 
 
 
 
 
 
 
 
 
 
 
 
26
  # if no errors raised by above commands, then we add below file
27
  get_package_dir().joinpath(".scrapling_dependencies_installed").touch()
28
  else:
29
- print('The dependencies are already installed')
30
 
31
 
32
  @click.group()
 
12
 
13
  def run_command(command, line):
14
  print(f"Installing {line}...")
15
+ _ = subprocess.check_call(" ".join(command), shell=True)
16
  # I meant to not use try except here
17
 
18
 
19
  @click.command(help="Install all Scrapling's Fetchers dependencies")
20
+ @click.option(
21
+ "-f",
22
+ "--force",
23
+ "force",
24
+ is_flag=True,
25
+ default=False,
26
+ type=bool,
27
+ help="Force Scrapling to reinstall all Fetchers dependencies",
28
+ )
29
  def install(force):
30
+ if (
31
+ force
32
+ or not get_package_dir().joinpath(".scrapling_dependencies_installed").exists()
33
+ ):
34
+ run_command(
35
+ [sys.executable, "-m", "playwright", "install", "chromium"],
36
+ "Playwright browsers",
37
+ )
38
+ run_command(
39
+ [sys.executable, "-m", "playwright", "install-deps", "chromium", "firefox"],
40
+ "Playwright dependencies",
41
+ )
42
+ run_command(
43
+ [sys.executable, "-m", "camoufox", "fetch", "--browserforge"],
44
+ "Camoufox browser and databases",
45
+ )
46
  # if no errors raised by above commands, then we add below file
47
  get_package_dir().joinpath(".scrapling_dependencies_installed").touch()
48
  else:
49
+ print("The dependencies are already installed")
50
 
51
 
52
  @click.group()
scrapling/core/_types.py CHANGED
@@ -2,9 +2,22 @@
2
  Type definitions for type checking purposes.
3
  """
4
 
5
- from typing import (TYPE_CHECKING, Any, Callable, Dict, Generator, Iterable,
6
- List, Literal, Optional, Pattern, Tuple, Type, TypeVar,
7
- Union)
 
 
 
 
 
 
 
 
 
 
 
 
 
8
 
9
  SelectorWaitStates = Literal["attached", "detached", "hidden", "visible"]
10
 
 
2
  Type definitions for type checking purposes.
3
  """
4
 
5
+ from typing import (
6
+ TYPE_CHECKING,
7
+ Any,
8
+ Callable,
9
+ Dict,
10
+ Generator,
11
+ Iterable,
12
+ List,
13
+ Literal,
14
+ Optional,
15
+ Pattern,
16
+ Tuple,
17
+ Type,
18
+ TypeVar,
19
+ Union,
20
+ )
21
 
22
  SelectorWaitStates = Literal["attached", "detached", "hidden", "visible"]
23
 
scrapling/core/custom_types.py CHANGED
@@ -6,16 +6,26 @@ from types import MappingProxyType
6
  from orjson import dumps, loads
7
  from w3lib.html import replace_entities as _replace_entities
8
 
9
- from scrapling.core._types import (Dict, Iterable, List, Literal, Optional,
10
- Pattern, SupportsIndex, TypeVar, Union)
 
 
 
 
 
 
 
 
 
11
  from scrapling.core.utils import _is_iterable, flatten
12
 
13
  # Define type variable for AttributeHandler value type
14
- _TextHandlerType = TypeVar('_TextHandlerType', bound='TextHandler')
15
 
16
 
17
  class TextHandler(str):
18
  """Extends standard Python string by adding more functionality"""
 
19
  __slots__ = ()
20
 
21
  def __new__(cls, string):
@@ -25,77 +35,89 @@ class TextHandler(str):
25
  lst = super().__getitem__(key)
26
  return typing.cast(_TextHandlerType, TextHandler(lst))
27
 
28
- def split(self, sep: str = None, maxsplit: SupportsIndex = -1) -> 'TextHandlers':
29
  return TextHandlers(
30
- typing.cast(List[_TextHandlerType], [TextHandler(s) for s in super().split(sep, maxsplit)])
 
 
 
31
  )
32
 
33
- def strip(self, chars: str = None) -> Union[str, 'TextHandler']:
34
  return TextHandler(super().strip(chars))
35
 
36
- def lstrip(self, chars: str = None) -> Union[str, 'TextHandler']:
37
  return TextHandler(super().lstrip(chars))
38
 
39
- def rstrip(self, chars: str = None) -> Union[str, 'TextHandler']:
40
  return TextHandler(super().rstrip(chars))
41
 
42
- def capitalize(self) -> Union[str, 'TextHandler']:
43
  return TextHandler(super().capitalize())
44
 
45
- def casefold(self) -> Union[str, 'TextHandler']:
46
  return TextHandler(super().casefold())
47
 
48
- def center(self, width: SupportsIndex, fillchar: str = ' ') -> Union[str, 'TextHandler']:
 
 
49
  return TextHandler(super().center(width, fillchar))
50
 
51
- def expandtabs(self, tabsize: SupportsIndex = 8) -> Union[str, 'TextHandler']:
52
  return TextHandler(super().expandtabs(tabsize))
53
 
54
- def format(self, *args: str, **kwargs: str) -> Union[str, 'TextHandler']:
55
  return TextHandler(super().format(*args, **kwargs))
56
 
57
- def format_map(self, mapping) -> Union[str, 'TextHandler']:
58
  return TextHandler(super().format_map(mapping))
59
 
60
- def join(self, iterable: Iterable[str]) -> Union[str, 'TextHandler']:
61
  return TextHandler(super().join(iterable))
62
 
63
- def ljust(self, width: SupportsIndex, fillchar: str = ' ') -> Union[str, 'TextHandler']:
 
 
64
  return TextHandler(super().ljust(width, fillchar))
65
 
66
- def rjust(self, width: SupportsIndex, fillchar: str = ' ') -> Union[str, 'TextHandler']:
 
 
67
  return TextHandler(super().rjust(width, fillchar))
68
 
69
- def swapcase(self) -> Union[str, 'TextHandler']:
70
  return TextHandler(super().swapcase())
71
 
72
- def title(self) -> Union[str, 'TextHandler']:
73
  return TextHandler(super().title())
74
 
75
- def translate(self, table) -> Union[str, 'TextHandler']:
76
  return TextHandler(super().translate(table))
77
 
78
- def zfill(self, width: SupportsIndex) -> Union[str, 'TextHandler']:
79
  return TextHandler(super().zfill(width))
80
 
81
- def replace(self, old: str, new: str, count: SupportsIndex = -1) -> Union[str, 'TextHandler']:
 
 
82
  return TextHandler(super().replace(old, new, count))
83
 
84
- def upper(self) -> Union[str, 'TextHandler']:
85
  return TextHandler(super().upper())
86
 
87
- def lower(self) -> Union[str, 'TextHandler']:
88
  return TextHandler(super().lower())
 
89
  ##############
90
 
91
- def sort(self, reverse: bool = False) -> Union[str, 'TextHandler']:
92
  """Return a sorted version of the string"""
93
  return self.__class__("".join(sorted(self, reverse=reverse)))
94
 
95
- def clean(self) -> Union[str, 'TextHandler']:
96
  """Return a new version of the string after removing all white spaces and consecutive spaces"""
97
- data = re.sub(r'[\t|\r|\n]', '', self)
98
- data = re.sub(' +', ' ', data)
99
  return self.__class__(data.strip())
100
 
101
  # For easy copy-paste from Scrapy/parsel code when needed :)
@@ -122,8 +144,7 @@ class TextHandler(str):
122
  replace_entities: bool = True,
123
  clean_match: bool = False,
124
  case_sensitive: bool = True,
125
- ) -> bool:
126
- ...
127
 
128
  @typing.overload
129
  def re(
@@ -133,12 +154,15 @@ class TextHandler(str):
133
  clean_match: bool = False,
134
  case_sensitive: bool = True,
135
  check_match: Literal[False] = False,
136
- ) -> "TextHandlers[TextHandler]":
137
- ...
138
 
139
  def re(
140
- self, regex: Union[str, Pattern[str]], replace_entities: bool = True, clean_match: bool = False,
141
- case_sensitive: bool = True, check_match: bool = False
 
 
 
 
142
  ) -> Union["TextHandlers[TextHandler]", bool]:
143
  """Apply the given regex to the current text and return a list of strings with the matches.
144
 
@@ -164,12 +188,27 @@ class TextHandler(str):
164
  results = flatten(results)
165
 
166
  if not replace_entities:
167
- return TextHandlers(typing.cast(List[_TextHandlerType], [TextHandler(string) for string in results]))
 
 
 
 
168
 
169
- return TextHandlers(typing.cast(List[_TextHandlerType], [TextHandler(_replace_entities(s)) for s in results]))
 
 
 
 
 
170
 
171
- def re_first(self, regex: Union[str, Pattern[str]], default=None, replace_entities: bool = True,
172
- clean_match: bool = False, case_sensitive: bool = True) -> "TextHandler":
 
 
 
 
 
 
173
  """Apply the given regex to text and return the first match if found, otherwise return the default value.
174
 
175
  :param regex: Can be either a compiled regular expression or a string.
@@ -179,7 +218,12 @@ class TextHandler(str):
179
  :param case_sensitive: if disabled, function will set the regex to ignore letters case while compiling it
180
 
181
  """
182
- result = self.re(regex, replace_entities, clean_match=clean_match, case_sensitive=case_sensitive)
 
 
 
 
 
183
  return result[0] if result else default
184
 
185
 
@@ -187,6 +231,7 @@ class TextHandlers(List[TextHandler]):
187
  """
188
  The :class:`TextHandlers` class is a subclass of the builtin ``List`` class, which provides a few additional methods.
189
  """
 
190
  __slots__ = ()
191
 
192
  @typing.overload
@@ -197,15 +242,22 @@ class TextHandlers(List[TextHandler]):
197
  def __getitem__(self, pos: slice) -> "TextHandlers":
198
  pass
199
 
200
- def __getitem__(self, pos: Union[SupportsIndex, slice]) -> Union[TextHandler, "TextHandlers"]:
 
 
201
  lst = super().__getitem__(pos)
202
  if isinstance(pos, slice):
203
  lst = [TextHandler(s) for s in lst]
204
  return TextHandlers(typing.cast(List[_TextHandlerType], lst))
205
  return typing.cast(_TextHandlerType, TextHandler(lst))
206
 
207
- def re(self, regex: Union[str, Pattern[str]], replace_entities: bool = True, clean_match: bool = False,
208
- case_sensitive: bool = True) -> 'TextHandlers[TextHandler]':
 
 
 
 
 
209
  """Call the ``.re()`` method for each element in this list and return
210
  their results flattened as TextHandlers.
211
 
@@ -219,8 +271,14 @@ class TextHandlers(List[TextHandler]):
219
  ]
220
  return TextHandlers(flatten(results))
221
 
222
- def re_first(self, regex: Union[str, Pattern[str]], default=None, replace_entities: bool = True,
223
- clean_match: bool = False, case_sensitive: bool = True) -> TextHandler:
 
 
 
 
 
 
224
  """Call the ``.re_first()`` method for each element in this list and return
225
  the first result or the default value otherwise.
226
 
@@ -251,26 +309,35 @@ class TextHandlers(List[TextHandler]):
251
 
252
  class AttributesHandler(Mapping[str, _TextHandlerType]):
253
  """A read-only mapping to use instead of the standard dictionary for the speed boost but at the same time I use it to add more functionalities.
254
- If standard dictionary is needed, just convert this class to dictionary with `dict` function
255
  """
256
- __slots__ = ('_data',)
 
257
 
258
  def __init__(self, mapping=None, **kwargs):
259
- mapping = {
260
- key: TextHandler(value) if type(value) is str else value
261
- for key, value in mapping.items()
262
- } if mapping is not None else {}
 
 
 
 
263
 
264
  if kwargs:
265
- mapping.update({
266
- key: TextHandler(value) if type(value) is str else value
267
- for key, value in kwargs.items()
268
- })
 
 
269
 
270
  # Fastest read-only mapping type
271
  self._data = MappingProxyType(mapping)
272
 
273
- def get(self, key: str, default: Optional[str] = None) -> Union[_TextHandlerType, None]:
 
 
274
  """Acts like standard dictionary `.get()` method"""
275
  return self._data.get(key, default)
276
 
 
6
  from orjson import dumps, loads
7
  from w3lib.html import replace_entities as _replace_entities
8
 
9
+ from scrapling.core._types import (
10
+ Dict,
11
+ Iterable,
12
+ List,
13
+ Literal,
14
+ Optional,
15
+ Pattern,
16
+ SupportsIndex,
17
+ TypeVar,
18
+ Union,
19
+ )
20
  from scrapling.core.utils import _is_iterable, flatten
21
 
22
  # Define type variable for AttributeHandler value type
23
+ _TextHandlerType = TypeVar("_TextHandlerType", bound="TextHandler")
24
 
25
 
26
  class TextHandler(str):
27
  """Extends standard Python string by adding more functionality"""
28
+
29
  __slots__ = ()
30
 
31
  def __new__(cls, string):
 
35
  lst = super().__getitem__(key)
36
  return typing.cast(_TextHandlerType, TextHandler(lst))
37
 
38
+ def split(self, sep: str = None, maxsplit: SupportsIndex = -1) -> "TextHandlers":
39
  return TextHandlers(
40
+ typing.cast(
41
+ List[_TextHandlerType],
42
+ [TextHandler(s) for s in super().split(sep, maxsplit)],
43
+ )
44
  )
45
 
46
+ def strip(self, chars: str = None) -> Union[str, "TextHandler"]:
47
  return TextHandler(super().strip(chars))
48
 
49
+ def lstrip(self, chars: str = None) -> Union[str, "TextHandler"]:
50
  return TextHandler(super().lstrip(chars))
51
 
52
+ def rstrip(self, chars: str = None) -> Union[str, "TextHandler"]:
53
  return TextHandler(super().rstrip(chars))
54
 
55
+ def capitalize(self) -> Union[str, "TextHandler"]:
56
  return TextHandler(super().capitalize())
57
 
58
+ def casefold(self) -> Union[str, "TextHandler"]:
59
  return TextHandler(super().casefold())
60
 
61
+ def center(
62
+ self, width: SupportsIndex, fillchar: str = " "
63
+ ) -> Union[str, "TextHandler"]:
64
  return TextHandler(super().center(width, fillchar))
65
 
66
+ def expandtabs(self, tabsize: SupportsIndex = 8) -> Union[str, "TextHandler"]:
67
  return TextHandler(super().expandtabs(tabsize))
68
 
69
+ def format(self, *args: str, **kwargs: str) -> Union[str, "TextHandler"]:
70
  return TextHandler(super().format(*args, **kwargs))
71
 
72
+ def format_map(self, mapping) -> Union[str, "TextHandler"]:
73
  return TextHandler(super().format_map(mapping))
74
 
75
+ def join(self, iterable: Iterable[str]) -> Union[str, "TextHandler"]:
76
  return TextHandler(super().join(iterable))
77
 
78
+ def ljust(
79
+ self, width: SupportsIndex, fillchar: str = " "
80
+ ) -> Union[str, "TextHandler"]:
81
  return TextHandler(super().ljust(width, fillchar))
82
 
83
+ def rjust(
84
+ self, width: SupportsIndex, fillchar: str = " "
85
+ ) -> Union[str, "TextHandler"]:
86
  return TextHandler(super().rjust(width, fillchar))
87
 
88
+ def swapcase(self) -> Union[str, "TextHandler"]:
89
  return TextHandler(super().swapcase())
90
 
91
+ def title(self) -> Union[str, "TextHandler"]:
92
  return TextHandler(super().title())
93
 
94
+ def translate(self, table) -> Union[str, "TextHandler"]:
95
  return TextHandler(super().translate(table))
96
 
97
+ def zfill(self, width: SupportsIndex) -> Union[str, "TextHandler"]:
98
  return TextHandler(super().zfill(width))
99
 
100
+ def replace(
101
+ self, old: str, new: str, count: SupportsIndex = -1
102
+ ) -> Union[str, "TextHandler"]:
103
  return TextHandler(super().replace(old, new, count))
104
 
105
+ def upper(self) -> Union[str, "TextHandler"]:
106
  return TextHandler(super().upper())
107
 
108
+ def lower(self) -> Union[str, "TextHandler"]:
109
  return TextHandler(super().lower())
110
+
111
  ##############
112
 
113
+ def sort(self, reverse: bool = False) -> Union[str, "TextHandler"]:
114
  """Return a sorted version of the string"""
115
  return self.__class__("".join(sorted(self, reverse=reverse)))
116
 
117
+ def clean(self) -> Union[str, "TextHandler"]:
118
  """Return a new version of the string after removing all white spaces and consecutive spaces"""
119
+ data = re.sub(r"[\t|\r|\n]", "", self)
120
+ data = re.sub(" +", " ", data)
121
  return self.__class__(data.strip())
122
 
123
  # For easy copy-paste from Scrapy/parsel code when needed :)
 
144
  replace_entities: bool = True,
145
  clean_match: bool = False,
146
  case_sensitive: bool = True,
147
+ ) -> bool: ...
 
148
 
149
  @typing.overload
150
  def re(
 
154
  clean_match: bool = False,
155
  case_sensitive: bool = True,
156
  check_match: Literal[False] = False,
157
+ ) -> "TextHandlers[TextHandler]": ...
 
158
 
159
  def re(
160
+ self,
161
+ regex: Union[str, Pattern[str]],
162
+ replace_entities: bool = True,
163
+ clean_match: bool = False,
164
+ case_sensitive: bool = True,
165
+ check_match: bool = False,
166
  ) -> Union["TextHandlers[TextHandler]", bool]:
167
  """Apply the given regex to the current text and return a list of strings with the matches.
168
 
 
188
  results = flatten(results)
189
 
190
  if not replace_entities:
191
+ return TextHandlers(
192
+ typing.cast(
193
+ List[_TextHandlerType], [TextHandler(string) for string in results]
194
+ )
195
+ )
196
 
197
+ return TextHandlers(
198
+ typing.cast(
199
+ List[_TextHandlerType],
200
+ [TextHandler(_replace_entities(s)) for s in results],
201
+ )
202
+ )
203
 
204
+ def re_first(
205
+ self,
206
+ regex: Union[str, Pattern[str]],
207
+ default=None,
208
+ replace_entities: bool = True,
209
+ clean_match: bool = False,
210
+ case_sensitive: bool = True,
211
+ ) -> "TextHandler":
212
  """Apply the given regex to text and return the first match if found, otherwise return the default value.
213
 
214
  :param regex: Can be either a compiled regular expression or a string.
 
218
  :param case_sensitive: if disabled, function will set the regex to ignore letters case while compiling it
219
 
220
  """
221
+ result = self.re(
222
+ regex,
223
+ replace_entities,
224
+ clean_match=clean_match,
225
+ case_sensitive=case_sensitive,
226
+ )
227
  return result[0] if result else default
228
 
229
 
 
231
  """
232
  The :class:`TextHandlers` class is a subclass of the builtin ``List`` class, which provides a few additional methods.
233
  """
234
+
235
  __slots__ = ()
236
 
237
  @typing.overload
 
242
  def __getitem__(self, pos: slice) -> "TextHandlers":
243
  pass
244
 
245
+ def __getitem__(
246
+ self, pos: Union[SupportsIndex, slice]
247
+ ) -> Union[TextHandler, "TextHandlers"]:
248
  lst = super().__getitem__(pos)
249
  if isinstance(pos, slice):
250
  lst = [TextHandler(s) for s in lst]
251
  return TextHandlers(typing.cast(List[_TextHandlerType], lst))
252
  return typing.cast(_TextHandlerType, TextHandler(lst))
253
 
254
+ def re(
255
+ self,
256
+ regex: Union[str, Pattern[str]],
257
+ replace_entities: bool = True,
258
+ clean_match: bool = False,
259
+ case_sensitive: bool = True,
260
+ ) -> "TextHandlers[TextHandler]":
261
  """Call the ``.re()`` method for each element in this list and return
262
  their results flattened as TextHandlers.
263
 
 
271
  ]
272
  return TextHandlers(flatten(results))
273
 
274
+ def re_first(
275
+ self,
276
+ regex: Union[str, Pattern[str]],
277
+ default=None,
278
+ replace_entities: bool = True,
279
+ clean_match: bool = False,
280
+ case_sensitive: bool = True,
281
+ ) -> TextHandler:
282
  """Call the ``.re_first()`` method for each element in this list and return
283
  the first result or the default value otherwise.
284
 
 
309
 
310
  class AttributesHandler(Mapping[str, _TextHandlerType]):
311
  """A read-only mapping to use instead of the standard dictionary for the speed boost but at the same time I use it to add more functionalities.
312
+ If standard dictionary is needed, just convert this class to dictionary with `dict` function
313
  """
314
+
315
+ __slots__ = ("_data",)
316
 
317
  def __init__(self, mapping=None, **kwargs):
318
+ mapping = (
319
+ {
320
+ key: TextHandler(value) if type(value) is str else value
321
+ for key, value in mapping.items()
322
+ }
323
+ if mapping is not None
324
+ else {}
325
+ )
326
 
327
  if kwargs:
328
+ mapping.update(
329
+ {
330
+ key: TextHandler(value) if type(value) is str else value
331
+ for key, value in kwargs.items()
332
+ }
333
+ )
334
 
335
  # Fastest read-only mapping type
336
  self._data = MappingProxyType(mapping)
337
 
338
+ def get(
339
+ self, key: str, default: Optional[str] = None
340
+ ) -> Union[_TextHandlerType, None]:
341
  """Acts like standard dictionary `.get()` method"""
342
  return self._data.get(key, default)
343
 
scrapling/core/mixins.py CHANGED
@@ -1,32 +1,33 @@
1
-
2
  class SelectorsGeneration:
3
  """Selectors generation functions
4
  Trying to generate selectors like Firefox or maybe cleaner ones!? Ehm
5
  Inspiration: https://searchfox.org/mozilla-central/source/devtools/shared/inspector/css-logic.js#591"""
6
 
7
- def __general_selection(self, selection: str = 'css', full_path=False) -> str:
8
  """Generate a selector for the current element.
9
  :return: A string of the generated selector.
10
  """
11
  selectorPath = []
12
  target = self
13
- css = selection.lower() == 'css'
14
  while target is not None:
15
  if target.parent:
16
- if target.attrib.get('id'):
17
  # id is enough
18
  part = (
19
- f'#{target.attrib["id"]}' if css
 
20
  else f"[@id='{target.attrib['id']}']"
21
  )
22
  selectorPath.append(part)
23
  if not full_path:
24
  return (
25
- " > ".join(reversed(selectorPath)) if css
26
- else '//*' + "/".join(reversed(selectorPath))
 
27
  )
28
  else:
29
- part = f'{target.tag}'
30
  # We won't use classes anymore because some websites share exact classes between elements
31
  # classes = target.attrib.get('class', '').split()
32
  # if classes and css:
@@ -41,23 +42,26 @@ class SelectorsGeneration:
41
 
42
  if counter[target.tag] > 1:
43
  part += (
44
- f":nth-of-type({counter[target.tag]})" if css
 
45
  else f"[{counter[target.tag]}]"
46
  )
47
 
48
  selectorPath.append(part)
49
  target = target.parent
50
- if target is None or target.tag == 'html':
51
  return (
52
- " > ".join(reversed(selectorPath)) if css
53
- else '//' + "/".join(reversed(selectorPath))
 
54
  )
55
  else:
56
  break
57
 
58
  return (
59
- " > ".join(reversed(selectorPath)) if css
60
- else '//' + "/".join(reversed(selectorPath))
 
61
  )
62
 
63
  @property
@@ -79,11 +83,11 @@ class SelectorsGeneration:
79
  """Generate a XPath selector for the current element
80
  :return: A string of the generated selector.
81
  """
82
- return self.__general_selection('xpath')
83
 
84
  @property
85
  def generate_full_xpath_selector(self) -> str:
86
  """Generate a complete XPath selector for the current element
87
  :return: A string of the generated selector.
88
  """
89
- return self.__general_selection('xpath', full_path=True)
 
 
1
  class SelectorsGeneration:
2
  """Selectors generation functions
3
  Trying to generate selectors like Firefox or maybe cleaner ones!? Ehm
4
  Inspiration: https://searchfox.org/mozilla-central/source/devtools/shared/inspector/css-logic.js#591"""
5
 
6
+ def __general_selection(self, selection: str = "css", full_path=False) -> str:
7
  """Generate a selector for the current element.
8
  :return: A string of the generated selector.
9
  """
10
  selectorPath = []
11
  target = self
12
+ css = selection.lower() == "css"
13
  while target is not None:
14
  if target.parent:
15
+ if target.attrib.get("id"):
16
  # id is enough
17
  part = (
18
+ f"#{target.attrib['id']}"
19
+ if css
20
  else f"[@id='{target.attrib['id']}']"
21
  )
22
  selectorPath.append(part)
23
  if not full_path:
24
  return (
25
+ " > ".join(reversed(selectorPath))
26
+ if css
27
+ else "//*" + "/".join(reversed(selectorPath))
28
  )
29
  else:
30
+ part = f"{target.tag}"
31
  # We won't use classes anymore because some websites share exact classes between elements
32
  # classes = target.attrib.get('class', '').split()
33
  # if classes and css:
 
42
 
43
  if counter[target.tag] > 1:
44
  part += (
45
+ f":nth-of-type({counter[target.tag]})"
46
+ if css
47
  else f"[{counter[target.tag]}]"
48
  )
49
 
50
  selectorPath.append(part)
51
  target = target.parent
52
+ if target is None or target.tag == "html":
53
  return (
54
+ " > ".join(reversed(selectorPath))
55
+ if css
56
+ else "//" + "/".join(reversed(selectorPath))
57
  )
58
  else:
59
  break
60
 
61
  return (
62
+ " > ".join(reversed(selectorPath))
63
+ if css
64
+ else "//" + "/".join(reversed(selectorPath))
65
  )
66
 
67
  @property
 
83
  """Generate a XPath selector for the current element
84
  :return: A string of the generated selector.
85
  """
86
+ return self.__general_selection("xpath")
87
 
88
  @property
89
  def generate_full_xpath_selector(self) -> str:
90
  """Generate a complete XPath selector for the current element
91
  :return: A string of the generated selector.
92
  """
93
+ return self.__general_selection("xpath", full_path=True)
scrapling/core/storage_adaptors.py CHANGED
@@ -20,7 +20,7 @@ class StorageSystemMixin(ABC):
20
  self.url = url
21
 
22
  @lru_cache(64, typed=True)
23
- def _get_base_url(self, default_value: str = 'default') -> str:
24
  if not self.url or type(self.url) is not str:
25
  return default_value
26
 
@@ -38,7 +38,7 @@ class StorageSystemMixin(ABC):
38
  :param identifier: This is the identifier that will be used to retrieve the element later from the storage. See
39
  the docs for more info.
40
  """
41
- raise NotImplementedError('Storage system must implement `save` method')
42
 
43
  @abstractmethod
44
  def retrieve(self, identifier: str) -> Optional[Dict]:
@@ -48,7 +48,7 @@ class StorageSystemMixin(ABC):
48
  the docs for more info.
49
  :return: A dictionary of the unique properties
50
  """
51
- raise NotImplementedError('Storage system must implement `save` method')
52
 
53
  @staticmethod
54
  @lru_cache(128, typed=True)
@@ -57,7 +57,7 @@ class StorageSystemMixin(ABC):
57
  identifier = identifier.lower().strip()
58
  if isinstance(identifier, str):
59
  # Hash functions have to take bytes
60
- identifier = identifier.encode('utf-8')
61
 
62
  hash_value = sha256(identifier).hexdigest()
63
  return f"{hash_value}_{len(identifier)}" # Length to reduce collision chance
@@ -68,6 +68,7 @@ class SQLiteStorageSystem(StorageSystemMixin):
68
  """The recommended system to use, it's race condition safe and thread safe.
69
  Mainly built so the library can run in threaded frameworks like scrapy or threaded tools
70
  > It's optimized for threaded applications but running it without threads shouldn't make it slow."""
 
71
  def __init__(self, storage_file: str, url: Union[str, None] = None):
72
  """
73
  :param storage_file: File to be used to store elements
@@ -111,10 +112,13 @@ class SQLiteStorageSystem(StorageSystemMixin):
111
  url = self._get_base_url()
112
  element_data = _StorageTools.element_to_dict(element)
113
  with self.lock:
114
- self.cursor.execute("""
 
115
  INSERT OR REPLACE INTO storage (url, identifier, element_data)
116
  VALUES (?, ?, ?)
117
- """, (url, identifier, orjson.dumps(element_data)))
 
 
118
  self.cursor.fetchall()
119
  self.connection.commit()
120
 
@@ -129,7 +133,7 @@ class SQLiteStorageSystem(StorageSystemMixin):
129
  with self.lock:
130
  self.cursor.execute(
131
  "SELECT element_data FROM storage WHERE url = ? AND identifier = ?",
132
- (url, identifier)
133
  )
134
  result = self.cursor.fetchone()
135
  if result:
 
20
  self.url = url
21
 
22
  @lru_cache(64, typed=True)
23
+ def _get_base_url(self, default_value: str = "default") -> str:
24
  if not self.url or type(self.url) is not str:
25
  return default_value
26
 
 
38
  :param identifier: This is the identifier that will be used to retrieve the element later from the storage. See
39
  the docs for more info.
40
  """
41
+ raise NotImplementedError("Storage system must implement `save` method")
42
 
43
  @abstractmethod
44
  def retrieve(self, identifier: str) -> Optional[Dict]:
 
48
  the docs for more info.
49
  :return: A dictionary of the unique properties
50
  """
51
+ raise NotImplementedError("Storage system must implement `save` method")
52
 
53
  @staticmethod
54
  @lru_cache(128, typed=True)
 
57
  identifier = identifier.lower().strip()
58
  if isinstance(identifier, str):
59
  # Hash functions have to take bytes
60
+ identifier = identifier.encode("utf-8")
61
 
62
  hash_value = sha256(identifier).hexdigest()
63
  return f"{hash_value}_{len(identifier)}" # Length to reduce collision chance
 
68
  """The recommended system to use, it's race condition safe and thread safe.
69
  Mainly built so the library can run in threaded frameworks like scrapy or threaded tools
70
  > It's optimized for threaded applications but running it without threads shouldn't make it slow."""
71
+
72
  def __init__(self, storage_file: str, url: Union[str, None] = None):
73
  """
74
  :param storage_file: File to be used to store elements
 
112
  url = self._get_base_url()
113
  element_data = _StorageTools.element_to_dict(element)
114
  with self.lock:
115
+ self.cursor.execute(
116
+ """
117
  INSERT OR REPLACE INTO storage (url, identifier, element_data)
118
  VALUES (?, ?, ?)
119
+ """,
120
+ (url, identifier, orjson.dumps(element_data)),
121
+ )
122
  self.cursor.fetchall()
123
  self.connection.commit()
124
 
 
133
  with self.lock:
134
  self.cursor.execute(
135
  "SELECT element_data FROM storage WHERE url = ? AND identifier = ?",
136
+ (url, identifier),
137
  )
138
  result = self.cursor.fetchone()
139
  if result:
scrapling/core/translator.py CHANGED
@@ -24,7 +24,6 @@ replace_html5_whitespaces = re.compile(regex).sub
24
 
25
 
26
  class XPathExpr(OriginalXPathExpr):
27
-
28
  textnode: bool = False
29
  attribute: Optional[str] = None
30
 
@@ -123,7 +122,7 @@ class TranslatorMixin:
123
 
124
  @staticmethod
125
  def xpath_attr_functional_pseudo_element(
126
- xpath: OriginalXPathExpr, function: FunctionalPseudoElement
127
  ) -> XPathExpr:
128
  """Support selecting attribute values using ::attr() pseudo-element"""
129
  if function.argument_types() not in (["STRING"], ["IDENT"]):
 
24
 
25
 
26
  class XPathExpr(OriginalXPathExpr):
 
27
  textnode: bool = False
28
  attribute: Optional[str] = None
29
 
 
122
 
123
  @staticmethod
124
  def xpath_attr_functional_pseudo_element(
125
+ xpath: OriginalXPathExpr, function: FunctionalPseudoElement
126
  ) -> XPathExpr:
127
  """Support selecting attribute values using ::attr() pseudo-element"""
128
  if function.argument_types() not in (["STRING"], ["IDENT"]):
scrapling/core/utils.py CHANGED
@@ -11,7 +11,9 @@ from scrapling.core._types import Any, Dict, Iterable, Union
11
  # functools.cache is available on Python 3.9+ only so let's keep lru_cache
12
  from functools import lru_cache # isort:skip
13
 
14
- html_forbidden = {html.HtmlComment, }
 
 
15
 
16
 
17
  @lru_cache(1, typed=True)
@@ -20,12 +22,11 @@ def setup_logger():
20
 
21
  :returns: logging.Logger: Configured logger instance
22
  """
23
- logger = logging.getLogger('scrapling')
24
  logger.setLevel(logging.INFO)
25
 
26
  formatter = logging.Formatter(
27
- fmt="[%(asctime)s] %(levelname)s: %(message)s",
28
- datefmt="%Y-%m-%d %H:%M:%S"
29
  )
30
 
31
  console_handler = logging.StreamHandler()
@@ -58,7 +59,13 @@ def flatten(lst: Iterable):
58
 
59
  def _is_iterable(s: Any):
60
  # This will be used only in regex functions to make sure it's iterable but not string/bytes
61
- return isinstance(s, (list, tuple,))
 
 
 
 
 
 
62
 
63
 
64
  class _StorageTools:
@@ -66,31 +73,43 @@ class _StorageTools:
66
  def __clean_attributes(element: html.HtmlElement, forbidden: tuple = ()) -> Dict:
67
  if not element.attrib:
68
  return {}
69
- return {k: v.strip() for k, v in element.attrib.items() if v and v.strip() and k not in forbidden}
 
 
 
 
70
 
71
  @classmethod
72
  def element_to_dict(cls, element: html.HtmlElement) -> Dict:
73
  parent = element.getparent()
74
  result = {
75
- 'tag': str(element.tag),
76
- 'attributes': cls.__clean_attributes(element),
77
- 'text': element.text.strip() if element.text else None,
78
- 'path': cls._get_element_path(element)
79
  }
80
  if parent is not None:
81
- result.update({
82
- 'parent_name': parent.tag,
83
- 'parent_attribs': dict(parent.attrib),
84
- 'parent_text': parent.text.strip() if parent.text else None
85
- })
 
 
86
 
87
- siblings = [child.tag for child in parent.iterchildren() if child != element]
 
 
88
  if siblings:
89
- result.update({'siblings': tuple(siblings)})
90
 
91
- children = [child.tag for child in element.iterchildren() if type(child) not in html_forbidden]
 
 
 
 
92
  if children:
93
- result.update({'children': tuple(children)})
94
 
95
  return result
96
 
@@ -98,9 +117,9 @@ class _StorageTools:
98
  def _get_element_path(cls, element: html.HtmlElement):
99
  parent = element.getparent()
100
  return tuple(
101
- (element.tag,) if parent is None else (
102
- cls._get_element_path(parent) + (element.tag,)
103
- )
104
  )
105
 
106
 
@@ -117,6 +136,6 @@ class _StorageTools:
117
 
118
  @lru_cache(128, typed=True)
119
  def clean_spaces(string):
120
- string = string.replace('\t', ' ')
121
- string = re.sub('[\n|\r]', '', string)
122
- return re.sub(' +', ' ', string)
 
11
  # functools.cache is available on Python 3.9+ only so let's keep lru_cache
12
  from functools import lru_cache # isort:skip
13
 
14
+ html_forbidden = {
15
+ html.HtmlComment,
16
+ }
17
 
18
 
19
  @lru_cache(1, typed=True)
 
22
 
23
  :returns: logging.Logger: Configured logger instance
24
  """
25
+ logger = logging.getLogger("scrapling")
26
  logger.setLevel(logging.INFO)
27
 
28
  formatter = logging.Formatter(
29
+ fmt="[%(asctime)s] %(levelname)s: %(message)s", datefmt="%Y-%m-%d %H:%M:%S"
 
30
  )
31
 
32
  console_handler = logging.StreamHandler()
 
59
 
60
  def _is_iterable(s: Any):
61
  # This will be used only in regex functions to make sure it's iterable but not string/bytes
62
+ return isinstance(
63
+ s,
64
+ (
65
+ list,
66
+ tuple,
67
+ ),
68
+ )
69
 
70
 
71
  class _StorageTools:
 
73
  def __clean_attributes(element: html.HtmlElement, forbidden: tuple = ()) -> Dict:
74
  if not element.attrib:
75
  return {}
76
+ return {
77
+ k: v.strip()
78
+ for k, v in element.attrib.items()
79
+ if v and v.strip() and k not in forbidden
80
+ }
81
 
82
  @classmethod
83
  def element_to_dict(cls, element: html.HtmlElement) -> Dict:
84
  parent = element.getparent()
85
  result = {
86
+ "tag": str(element.tag),
87
+ "attributes": cls.__clean_attributes(element),
88
+ "text": element.text.strip() if element.text else None,
89
+ "path": cls._get_element_path(element),
90
  }
91
  if parent is not None:
92
+ result.update(
93
+ {
94
+ "parent_name": parent.tag,
95
+ "parent_attribs": dict(parent.attrib),
96
+ "parent_text": parent.text.strip() if parent.text else None,
97
+ }
98
+ )
99
 
100
+ siblings = [
101
+ child.tag for child in parent.iterchildren() if child != element
102
+ ]
103
  if siblings:
104
+ result.update({"siblings": tuple(siblings)})
105
 
106
+ children = [
107
+ child.tag
108
+ for child in element.iterchildren()
109
+ if type(child) not in html_forbidden
110
+ ]
111
  if children:
112
+ result.update({"children": tuple(children)})
113
 
114
  return result
115
 
 
117
  def _get_element_path(cls, element: html.HtmlElement):
118
  parent = element.getparent()
119
  return tuple(
120
+ (element.tag,)
121
+ if parent is None
122
+ else (cls._get_element_path(parent) + (element.tag,))
123
  )
124
 
125
 
 
136
 
137
  @lru_cache(128, typed=True)
138
  def clean_spaces(string):
139
+ string = string.replace("\t", " ")
140
+ string = re.sub("[\n|\r]", "", string)
141
+ return re.sub(" +", " ", string)
scrapling/defaults.py CHANGED
@@ -5,21 +5,33 @@ from scrapling.core.utils import log
5
  # A lightweight approach to create lazy loader for each import for backward compatibility
6
  # This will reduce the initial memory footprint significantly (only loads what's used)
7
  def __getattr__(name):
8
- if name == 'Fetcher':
9
  from scrapling.fetchers import Fetcher as cls
10
- log.warning('This import is deprecated now and it will be removed with v0.3. Use `from scrapling.fetchers import Fetcher` instead')
 
 
 
11
  return cls
12
- elif name == 'AsyncFetcher':
13
  from scrapling.fetchers import AsyncFetcher as cls
14
- log.warning('This import is deprecated now and it will be removed with v0.3. Use `from scrapling.fetchers import AsyncFetcher` instead')
 
 
 
15
  return cls
16
- elif name == 'StealthyFetcher':
17
  from scrapling.fetchers import StealthyFetcher as cls
18
- log.warning('This import is deprecated now and it will be removed with v0.3. Use `from scrapling.fetchers import StealthyFetcher` instead')
 
 
 
19
  return cls
20
- elif name == 'PlayWrightFetcher':
21
  from scrapling.fetchers import PlayWrightFetcher as cls
22
- log.warning('This import is deprecated now and it will be removed with v0.3. Use `from scrapling.fetchers import PlayWrightFetcher` instead')
 
 
 
23
  return cls
24
  else:
25
  raise AttributeError(f"module 'scrapling' has no attribute '{name}'")
 
5
  # A lightweight approach to create lazy loader for each import for backward compatibility
6
  # This will reduce the initial memory footprint significantly (only loads what's used)
7
  def __getattr__(name):
8
+ if name == "Fetcher":
9
  from scrapling.fetchers import Fetcher as cls
10
+
11
+ log.warning(
12
+ "This import is deprecated now and it will be removed with v0.3. Use `from scrapling.fetchers import Fetcher` instead"
13
+ )
14
  return cls
15
+ elif name == "AsyncFetcher":
16
  from scrapling.fetchers import AsyncFetcher as cls
17
+
18
+ log.warning(
19
+ "This import is deprecated now and it will be removed with v0.3. Use `from scrapling.fetchers import AsyncFetcher` instead"
20
+ )
21
  return cls
22
+ elif name == "StealthyFetcher":
23
  from scrapling.fetchers import StealthyFetcher as cls
24
+
25
+ log.warning(
26
+ "This import is deprecated now and it will be removed with v0.3. Use `from scrapling.fetchers import StealthyFetcher` instead"
27
+ )
28
  return cls
29
+ elif name == "PlayWrightFetcher":
30
  from scrapling.fetchers import PlayWrightFetcher as cls
31
+
32
+ log.warning(
33
+ "This import is deprecated now and it will be removed with v0.3. Use `from scrapling.fetchers import PlayWrightFetcher` instead"
34
+ )
35
  return cls
36
  else:
37
  raise AttributeError(f"module 'scrapling' has no attribute '{name}'")
scrapling/engines/__init__.py CHANGED
@@ -4,4 +4,4 @@ from .pw import PlaywrightEngine
4
  from .static import StaticEngine
5
  from .toolbelt import check_if_engine_usable
6
 
7
- __all__ = ['CamoufoxEngine', 'PlaywrightEngine']
 
4
  from .static import StaticEngine
5
  from .toolbelt import check_if_engine_usable
6
 
7
+ __all__ = ["CamoufoxEngine", "PlaywrightEngine"]
scrapling/engines/camo.py CHANGED
@@ -2,27 +2,52 @@ from camoufox import DefaultAddons
2
  from camoufox.async_api import AsyncCamoufox
3
  from camoufox.sync_api import Camoufox
4
 
5
- from scrapling.core._types import (Callable, Dict, List, Literal, Optional,
6
- SelectorWaitStates, Union)
 
 
 
 
 
 
 
7
  from scrapling.core.utils import log
8
- from scrapling.engines.toolbelt import (Response, StatusText,
9
- async_intercept_route,
10
- check_type_validity,
11
- construct_proxy_dict,
12
- generate_convincing_referer,
13
- get_os_name, intercept_route)
 
 
 
 
14
 
15
 
16
  class CamoufoxEngine:
17
  def __init__(
18
- self, headless: Union[bool, Literal['virtual']] = True, block_images: bool = False, disable_resources: bool = False,
19
- block_webrtc: bool = False, allow_webgl: bool = True, network_idle: bool = False, humanize: Union[bool, float] = True, wait: Optional[int] = 0,
20
- timeout: Optional[float] = 30000, page_action: Callable = None, wait_selector: Optional[str] = None, addons: Optional[List[str]] = None,
21
- wait_selector_state: SelectorWaitStates = 'attached', google_search: bool = True, extra_headers: Optional[Dict[str, str]] = None,
22
- proxy: Optional[Union[str, Dict[str, str]]] = None, os_randomize: bool = False, disable_ads: bool = False,
23
- geoip: bool = False,
24
- adaptor_arguments: Dict = None,
25
- additional_arguments: Dict = None
 
 
 
 
 
 
 
 
 
 
 
 
 
 
26
  ):
27
  """An engine that utilizes Camoufox library, check the `StealthyFetcher` class for more documentation.
28
 
@@ -97,7 +122,7 @@ class CamoufoxEngine:
97
  "block_webrtc": self.block_webrtc,
98
  "block_images": self.block_images, # Careful! It can make some websites not finish loading at all (e.g., StackOverflow), even in headful mode
99
  "os": None if self.os_randomize else get_os_name(),
100
- **self.additional_arguments
101
  }
102
 
103
  def _process_response_history(self, first_response):
@@ -109,19 +134,30 @@ class CamoufoxEngine:
109
  while current_request:
110
  try:
111
  current_response = current_request.response()
112
- history.insert(0, Response(
113
- url=current_request.url,
114
- # using current_response.text() will trigger "Error: Response.text: Response body is unavailable for redirect responses"
115
- text='',
116
- body=b'',
117
- status=current_response.status if current_response else 301,
118
- reason=(current_response.status_text or StatusText.get(current_response.status)) if current_response else StatusText.get(301),
119
- encoding=current_response.headers.get('content-type', '') or 'utf-8',
120
- cookies={},
121
- headers=current_response.all_headers() if current_response else {},
122
- request_headers=current_request.all_headers(),
123
- **self.adaptor_arguments
124
- ))
 
 
 
 
 
 
 
 
 
 
 
125
  except Exception as e:
126
  log.error(f"Error processing redirect: {e}")
127
  break
@@ -141,19 +177,30 @@ class CamoufoxEngine:
141
  while current_request:
142
  try:
143
  current_response = await current_request.response()
144
- history.insert(0, Response(
145
- url=current_request.url,
146
- # using current_response.text() will trigger "Error: Response.text: Response body is unavailable for redirect responses"
147
- text='',
148
- body=b'',
149
- status=current_response.status if current_response else 301,
150
- reason=(current_response.status_text or StatusText.get(current_response.status)) if current_response else StatusText.get(301),
151
- encoding=current_response.headers.get('content-type', '') or 'utf-8',
152
- cookies={},
153
- headers=await current_response.all_headers() if current_response else {},
154
- request_headers=await current_request.all_headers(),
155
- **self.adaptor_arguments
156
- ))
 
 
 
 
 
 
 
 
 
 
 
157
  except Exception as e:
158
  log.error(f"Error processing redirect: {e}")
159
  break
@@ -175,7 +222,10 @@ class CamoufoxEngine:
175
 
176
  def handle_response(finished_response):
177
  nonlocal final_response
178
- if finished_response.request.resource_type == "document" and finished_response.request.is_navigation_request():
 
 
 
179
  final_response = finished_response
180
 
181
  with Camoufox(**self._get_camoufox_options()) as browser:
@@ -195,7 +245,7 @@ class CamoufoxEngine:
195
  page.wait_for_load_state(state="domcontentloaded")
196
 
197
  if self.network_idle:
198
- page.wait_for_load_state('networkidle')
199
 
200
  if self.page_action is not None:
201
  try:
@@ -211,7 +261,7 @@ class CamoufoxEngine:
211
  page.wait_for_load_state(state="load")
212
  page.wait_for_load_state(state="domcontentloaded")
213
  if self.network_idle:
214
- page.wait_for_load_state('networkidle')
215
  except Exception as e:
216
  log.error(f"Error waiting for selector {self.wait_selector}: {e}")
217
 
@@ -222,9 +272,13 @@ class CamoufoxEngine:
222
  raise ValueError("Failed to get a response from the page")
223
 
224
  # This will be parsed inside `Response`
225
- encoding = final_response.headers.get('content-type', '') or 'utf-8' # default encoding
 
 
226
  # PlayWright API sometimes gives empty status text for some reason!
227
- status_text = final_response.status_text or StatusText.get(final_response.status)
 
 
228
 
229
  history = self._process_response_history(first_response)
230
  try:
@@ -236,15 +290,17 @@ class CamoufoxEngine:
236
  response = Response(
237
  url=page.url,
238
  text=page_content,
239
- body=page_content.encode('utf-8'),
240
  status=final_response.status,
241
  reason=status_text,
242
  encoding=encoding,
243
- cookies={cookie['name']: cookie['value'] for cookie in page.context.cookies()},
 
 
244
  headers=first_response.all_headers(),
245
  request_headers=first_response.request.all_headers(),
246
  history=history,
247
- **self.adaptor_arguments
248
  )
249
  page.close()
250
  context.close()
@@ -262,7 +318,10 @@ class CamoufoxEngine:
262
 
263
  async def handle_response(finished_response):
264
  nonlocal final_response
265
- if finished_response.request.resource_type == "document" and finished_response.request.is_navigation_request():
 
 
 
266
  final_response = finished_response
267
 
268
  async with AsyncCamoufox(**self._get_camoufox_options()) as browser:
@@ -282,7 +341,7 @@ class CamoufoxEngine:
282
  await page.wait_for_load_state(state="domcontentloaded")
283
 
284
  if self.network_idle:
285
- await page.wait_for_load_state('networkidle')
286
 
287
  if self.page_action is not None:
288
  try:
@@ -298,7 +357,7 @@ class CamoufoxEngine:
298
  await page.wait_for_load_state(state="load")
299
  await page.wait_for_load_state(state="domcontentloaded")
300
  if self.network_idle:
301
- await page.wait_for_load_state('networkidle')
302
  except Exception as e:
303
  log.error(f"Error waiting for selector {self.wait_selector}: {e}")
304
 
@@ -309,9 +368,13 @@ class CamoufoxEngine:
309
  raise ValueError("Failed to get a response from the page")
310
 
311
  # This will be parsed inside `Response`
312
- encoding = final_response.headers.get('content-type', '') or 'utf-8' # default encoding
 
 
313
  # PlayWright API sometimes gives empty status text for some reason!
314
- status_text = final_response.status_text or StatusText.get(final_response.status)
 
 
315
 
316
  history = await self._async_process_response_history(first_response)
317
  try:
@@ -323,15 +386,18 @@ class CamoufoxEngine:
323
  response = Response(
324
  url=page.url,
325
  text=page_content,
326
- body=page_content.encode('utf-8'),
327
  status=final_response.status,
328
  reason=status_text,
329
  encoding=encoding,
330
- cookies={cookie['name']: cookie['value'] for cookie in await page.context.cookies()},
 
 
 
331
  headers=await first_response.all_headers(),
332
  request_headers=await first_response.request.all_headers(),
333
  history=history,
334
- **self.adaptor_arguments
335
  )
336
  await page.close()
337
  await context.close()
 
2
  from camoufox.async_api import AsyncCamoufox
3
  from camoufox.sync_api import Camoufox
4
 
5
+ from scrapling.core._types import (
6
+ Callable,
7
+ Dict,
8
+ List,
9
+ Literal,
10
+ Optional,
11
+ SelectorWaitStates,
12
+ Union,
13
+ )
14
  from scrapling.core.utils import log
15
+ from scrapling.engines.toolbelt import (
16
+ Response,
17
+ StatusText,
18
+ async_intercept_route,
19
+ check_type_validity,
20
+ construct_proxy_dict,
21
+ generate_convincing_referer,
22
+ get_os_name,
23
+ intercept_route,
24
+ )
25
 
26
 
27
  class CamoufoxEngine:
28
  def __init__(
29
+ self,
30
+ headless: Union[bool, Literal["virtual"]] = True, # noqa: F821
31
+ block_images: bool = False,
32
+ disable_resources: bool = False,
33
+ block_webrtc: bool = False,
34
+ allow_webgl: bool = True,
35
+ network_idle: bool = False,
36
+ humanize: Union[bool, float] = True,
37
+ wait: Optional[int] = 0,
38
+ timeout: Optional[float] = 30000,
39
+ page_action: Callable = None,
40
+ wait_selector: Optional[str] = None,
41
+ addons: Optional[List[str]] = None,
42
+ wait_selector_state: SelectorWaitStates = "attached",
43
+ google_search: bool = True,
44
+ extra_headers: Optional[Dict[str, str]] = None,
45
+ proxy: Optional[Union[str, Dict[str, str]]] = None,
46
+ os_randomize: bool = False,
47
+ disable_ads: bool = False,
48
+ geoip: bool = False,
49
+ adaptor_arguments: Dict = None,
50
+ additional_arguments: Dict = None,
51
  ):
52
  """An engine that utilizes Camoufox library, check the `StealthyFetcher` class for more documentation.
53
 
 
122
  "block_webrtc": self.block_webrtc,
123
  "block_images": self.block_images, # Careful! It can make some websites not finish loading at all (e.g., StackOverflow), even in headful mode
124
  "os": None if self.os_randomize else get_os_name(),
125
+ **self.additional_arguments,
126
  }
127
 
128
  def _process_response_history(self, first_response):
 
134
  while current_request:
135
  try:
136
  current_response = current_request.response()
137
+ history.insert(
138
+ 0,
139
+ Response(
140
+ url=current_request.url,
141
+ # using current_response.text() will trigger "Error: Response.text: Response body is unavailable for redirect responses"
142
+ text="",
143
+ body=b"",
144
+ status=current_response.status if current_response else 301,
145
+ reason=(
146
+ current_response.status_text
147
+ or StatusText.get(current_response.status)
148
+ )
149
+ if current_response
150
+ else StatusText.get(301),
151
+ encoding=current_response.headers.get("content-type", "")
152
+ or "utf-8",
153
+ cookies={},
154
+ headers=current_response.all_headers()
155
+ if current_response
156
+ else {},
157
+ request_headers=current_request.all_headers(),
158
+ **self.adaptor_arguments,
159
+ ),
160
+ )
161
  except Exception as e:
162
  log.error(f"Error processing redirect: {e}")
163
  break
 
177
  while current_request:
178
  try:
179
  current_response = await current_request.response()
180
+ history.insert(
181
+ 0,
182
+ Response(
183
+ url=current_request.url,
184
+ # using current_response.text() will trigger "Error: Response.text: Response body is unavailable for redirect responses"
185
+ text="",
186
+ body=b"",
187
+ status=current_response.status if current_response else 301,
188
+ reason=(
189
+ current_response.status_text
190
+ or StatusText.get(current_response.status)
191
+ )
192
+ if current_response
193
+ else StatusText.get(301),
194
+ encoding=current_response.headers.get("content-type", "")
195
+ or "utf-8",
196
+ cookies={},
197
+ headers=await current_response.all_headers()
198
+ if current_response
199
+ else {},
200
+ request_headers=await current_request.all_headers(),
201
+ **self.adaptor_arguments,
202
+ ),
203
+ )
204
  except Exception as e:
205
  log.error(f"Error processing redirect: {e}")
206
  break
 
222
 
223
  def handle_response(finished_response):
224
  nonlocal final_response
225
+ if (
226
+ finished_response.request.resource_type == "document"
227
+ and finished_response.request.is_navigation_request()
228
+ ):
229
  final_response = finished_response
230
 
231
  with Camoufox(**self._get_camoufox_options()) as browser:
 
245
  page.wait_for_load_state(state="domcontentloaded")
246
 
247
  if self.network_idle:
248
+ page.wait_for_load_state("networkidle")
249
 
250
  if self.page_action is not None:
251
  try:
 
261
  page.wait_for_load_state(state="load")
262
  page.wait_for_load_state(state="domcontentloaded")
263
  if self.network_idle:
264
+ page.wait_for_load_state("networkidle")
265
  except Exception as e:
266
  log.error(f"Error waiting for selector {self.wait_selector}: {e}")
267
 
 
272
  raise ValueError("Failed to get a response from the page")
273
 
274
  # This will be parsed inside `Response`
275
+ encoding = (
276
+ final_response.headers.get("content-type", "") or "utf-8"
277
+ ) # default encoding
278
  # PlayWright API sometimes gives empty status text for some reason!
279
+ status_text = final_response.status_text or StatusText.get(
280
+ final_response.status
281
+ )
282
 
283
  history = self._process_response_history(first_response)
284
  try:
 
290
  response = Response(
291
  url=page.url,
292
  text=page_content,
293
+ body=page_content.encode("utf-8"),
294
  status=final_response.status,
295
  reason=status_text,
296
  encoding=encoding,
297
+ cookies={
298
+ cookie["name"]: cookie["value"] for cookie in page.context.cookies()
299
+ },
300
  headers=first_response.all_headers(),
301
  request_headers=first_response.request.all_headers(),
302
  history=history,
303
+ **self.adaptor_arguments,
304
  )
305
  page.close()
306
  context.close()
 
318
 
319
  async def handle_response(finished_response):
320
  nonlocal final_response
321
+ if (
322
+ finished_response.request.resource_type == "document"
323
+ and finished_response.request.is_navigation_request()
324
+ ):
325
  final_response = finished_response
326
 
327
  async with AsyncCamoufox(**self._get_camoufox_options()) as browser:
 
341
  await page.wait_for_load_state(state="domcontentloaded")
342
 
343
  if self.network_idle:
344
+ await page.wait_for_load_state("networkidle")
345
 
346
  if self.page_action is not None:
347
  try:
 
357
  await page.wait_for_load_state(state="load")
358
  await page.wait_for_load_state(state="domcontentloaded")
359
  if self.network_idle:
360
+ await page.wait_for_load_state("networkidle")
361
  except Exception as e:
362
  log.error(f"Error waiting for selector {self.wait_selector}: {e}")
363
 
 
368
  raise ValueError("Failed to get a response from the page")
369
 
370
  # This will be parsed inside `Response`
371
+ encoding = (
372
+ final_response.headers.get("content-type", "") or "utf-8"
373
+ ) # default encoding
374
  # PlayWright API sometimes gives empty status text for some reason!
375
+ status_text = final_response.status_text or StatusText.get(
376
+ final_response.status
377
+ )
378
 
379
  history = await self._async_process_response_history(first_response)
380
  try:
 
386
  response = Response(
387
  url=page.url,
388
  text=page_content,
389
+ body=page_content.encode("utf-8"),
390
  status=final_response.status,
391
  reason=status_text,
392
  encoding=encoding,
393
+ cookies={
394
+ cookie["name"]: cookie["value"]
395
+ for cookie in await page.context.cookies()
396
+ },
397
  headers=await first_response.all_headers(),
398
  request_headers=await first_response.request.all_headers(),
399
  history=history,
400
+ **self.adaptor_arguments,
401
  )
402
  await page.close()
403
  await context.close()
scrapling/engines/constants.py CHANGED
@@ -1,92 +1,92 @@
1
  # Disable loading these resources for speed
2
  DEFAULT_DISABLED_RESOURCES = {
3
- 'font',
4
- 'image',
5
- 'media',
6
- 'beacon',
7
- 'object',
8
- 'imageset',
9
- 'texttrack',
10
- 'websocket',
11
- 'csp_report',
12
- 'stylesheet',
13
  }
14
 
15
  DEFAULT_STEALTH_FLAGS = (
16
  # Explanation: https://peter.sh/experiments/chromium-command-line-switches/
17
  # Generally this will make the browser faster and less detectable
18
- '--no-pings',
19
- '--incognito',
20
- '--test-type',
21
- '--lang=en-US',
22
- '--mute-audio',
23
- '--no-first-run',
24
- '--disable-sync',
25
- '--hide-scrollbars',
26
- '--disable-logging',
27
- '--start-maximized', # For headless check bypass
28
- '--enable-async-dns',
29
- '--disable-breakpad',
30
- '--disable-infobars',
31
- '--accept-lang=en-US',
32
- '--use-mock-keychain',
33
- '--disable-translate',
34
- '--disable-extensions',
35
- '--disable-voice-input',
36
- '--window-position=0,0',
37
- '--disable-wake-on-wifi',
38
- '--ignore-gpu-blocklist',
39
- '--enable-tcp-fast-open',
40
- '--enable-web-bluetooth',
41
- '--disable-hang-monitor',
42
- '--password-store=basic',
43
- '--disable-cloud-import',
44
- '--disable-default-apps',
45
- '--disable-print-preview',
46
- '--disable-dev-shm-usage',
47
  # '--disable-popup-blocking',
48
- '--metrics-recording-only',
49
- '--disable-crash-reporter',
50
- '--disable-partial-raster',
51
- '--disable-gesture-typing',
52
- '--disable-checker-imaging',
53
- '--disable-prompt-on-repost',
54
- '--force-color-profile=srgb',
55
- '--font-render-hinting=none',
56
- '--no-default-browser-check',
57
- '--aggressive-cache-discard',
58
- '--disable-component-update',
59
- '--disable-cookie-encryption',
60
- '--disable-domain-reliability',
61
- '--disable-threaded-animation',
62
- '--disable-threaded-scrolling',
63
  # '--disable-reading-from-canvas', # For Firefox
64
- '--enable-simple-cache-backend',
65
- '--disable-background-networking',
66
- '--disable-session-crashed-bubble',
67
- '--enable-surface-synchronization',
68
- '--disable-image-animation-resync',
69
- '--disable-renderer-backgrounding',
70
- '--disable-ipc-flooding-protection',
71
- '--prerender-from-omnibox=disabled',
72
- '--safebrowsing-disable-auto-update',
73
- '--disable-offer-upload-credit-cards',
74
- '--disable-features=site-per-process',
75
- '--disable-background-timer-throttling',
76
- '--disable-new-content-rendering-timeout',
77
- '--run-all-compositor-stages-before-draw',
78
- '--disable-client-side-phishing-detection',
79
- '--disable-backgrounding-occluded-windows',
80
- '--disable-layer-tree-host-memory-pressure',
81
- '--autoplay-policy=no-user-gesture-required',
82
- '--disable-offer-store-unmasked-wallet-cards',
83
- '--disable-blink-features=AutomationControlled',
84
- '--webrtc-ip-handling-policy=disable_non_proxied_udp',
85
- '--disable-component-extensions-with-background-pages',
86
- '--force-webrtc-ip-handling-policy=disable_non_proxied_udp',
87
- '--enable-features=NetworkService,NetworkServiceInProcess,TrustTokens,TrustTokensAlwaysAllowIssuance',
88
- '--blink-settings=primaryHoverType=2,availableHoverTypes=2,primaryPointerType=4,availablePointerTypes=4',
89
- '--disable-features=AudioServiceOutOfProcess,IsolateOrigins,site-per-process,TranslateUI,BlinkGenPropertyTrees',
90
  )
91
 
92
  # Defaulting to the docker mode, token doesn't matter in it as it's passed for the container
@@ -95,13 +95,10 @@ NSTBROWSER_DEFAULT_QUERY = {
95
  "headless": True,
96
  "autoClose": True,
97
  "fingerprint": {
98
- "flags": {
99
- "timezone": "BasedOnIp",
100
- "screen": "Custom"
101
- },
102
- "platform": 'linux', # support: windows, mac, linux
103
- "kernel": 'chromium', # only support: chromium
104
- "kernelMilestone": '128',
105
  "hardwareConcurrency": 8,
106
  "deviceMemory": 8,
107
  },
 
1
  # Disable loading these resources for speed
2
  DEFAULT_DISABLED_RESOURCES = {
3
+ "font",
4
+ "image",
5
+ "media",
6
+ "beacon",
7
+ "object",
8
+ "imageset",
9
+ "texttrack",
10
+ "websocket",
11
+ "csp_report",
12
+ "stylesheet",
13
  }
14
 
15
  DEFAULT_STEALTH_FLAGS = (
16
  # Explanation: https://peter.sh/experiments/chromium-command-line-switches/
17
  # Generally this will make the browser faster and less detectable
18
+ "--no-pings",
19
+ "--incognito",
20
+ "--test-type",
21
+ "--lang=en-US",
22
+ "--mute-audio",
23
+ "--no-first-run",
24
+ "--disable-sync",
25
+ "--hide-scrollbars",
26
+ "--disable-logging",
27
+ "--start-maximized", # For headless check bypass
28
+ "--enable-async-dns",
29
+ "--disable-breakpad",
30
+ "--disable-infobars",
31
+ "--accept-lang=en-US",
32
+ "--use-mock-keychain",
33
+ "--disable-translate",
34
+ "--disable-extensions",
35
+ "--disable-voice-input",
36
+ "--window-position=0,0",
37
+ "--disable-wake-on-wifi",
38
+ "--ignore-gpu-blocklist",
39
+ "--enable-tcp-fast-open",
40
+ "--enable-web-bluetooth",
41
+ "--disable-hang-monitor",
42
+ "--password-store=basic",
43
+ "--disable-cloud-import",
44
+ "--disable-default-apps",
45
+ "--disable-print-preview",
46
+ "--disable-dev-shm-usage",
47
  # '--disable-popup-blocking',
48
+ "--metrics-recording-only",
49
+ "--disable-crash-reporter",
50
+ "--disable-partial-raster",
51
+ "--disable-gesture-typing",
52
+ "--disable-checker-imaging",
53
+ "--disable-prompt-on-repost",
54
+ "--force-color-profile=srgb",
55
+ "--font-render-hinting=none",
56
+ "--no-default-browser-check",
57
+ "--aggressive-cache-discard",
58
+ "--disable-component-update",
59
+ "--disable-cookie-encryption",
60
+ "--disable-domain-reliability",
61
+ "--disable-threaded-animation",
62
+ "--disable-threaded-scrolling",
63
  # '--disable-reading-from-canvas', # For Firefox
64
+ "--enable-simple-cache-backend",
65
+ "--disable-background-networking",
66
+ "--disable-session-crashed-bubble",
67
+ "--enable-surface-synchronization",
68
+ "--disable-image-animation-resync",
69
+ "--disable-renderer-backgrounding",
70
+ "--disable-ipc-flooding-protection",
71
+ "--prerender-from-omnibox=disabled",
72
+ "--safebrowsing-disable-auto-update",
73
+ "--disable-offer-upload-credit-cards",
74
+ "--disable-features=site-per-process",
75
+ "--disable-background-timer-throttling",
76
+ "--disable-new-content-rendering-timeout",
77
+ "--run-all-compositor-stages-before-draw",
78
+ "--disable-client-side-phishing-detection",
79
+ "--disable-backgrounding-occluded-windows",
80
+ "--disable-layer-tree-host-memory-pressure",
81
+ "--autoplay-policy=no-user-gesture-required",
82
+ "--disable-offer-store-unmasked-wallet-cards",
83
+ "--disable-blink-features=AutomationControlled",
84
+ "--webrtc-ip-handling-policy=disable_non_proxied_udp",
85
+ "--disable-component-extensions-with-background-pages",
86
+ "--force-webrtc-ip-handling-policy=disable_non_proxied_udp",
87
+ "--enable-features=NetworkService,NetworkServiceInProcess,TrustTokens,TrustTokensAlwaysAllowIssuance",
88
+ "--blink-settings=primaryHoverType=2,availableHoverTypes=2,primaryPointerType=4,availablePointerTypes=4",
89
+ "--disable-features=AudioServiceOutOfProcess,IsolateOrigins,site-per-process,TranslateUI,BlinkGenPropertyTrees",
90
  )
91
 
92
  # Defaulting to the docker mode, token doesn't matter in it as it's passed for the container
 
95
  "headless": True,
96
  "autoClose": True,
97
  "fingerprint": {
98
+ "flags": {"timezone": "BasedOnIp", "screen": "Custom"},
99
+ "platform": "linux", # support: windows, mac, linux
100
+ "kernel": "chromium", # only support: chromium
101
+ "kernelMilestone": "128",
 
 
 
102
  "hardwareConcurrency": 8,
103
  "deviceMemory": 8,
104
  },
scrapling/engines/pw.py CHANGED
@@ -1,42 +1,46 @@
1
  import json
2
 
3
- from scrapling.core._types import (Callable, Dict, Optional,
4
- SelectorWaitStates, Union)
5
  from scrapling.core.utils import log, lru_cache
6
- from scrapling.engines.constants import (DEFAULT_STEALTH_FLAGS,
7
- NSTBROWSER_DEFAULT_QUERY)
8
- from scrapling.engines.toolbelt import (Response, StatusText,
9
- async_intercept_route,
10
- check_type_validity, construct_cdp_url,
11
- construct_proxy_dict,
12
- generate_convincing_referer,
13
- generate_headers, intercept_route,
14
- js_bypass_path)
 
 
 
 
15
 
16
 
17
  class PlaywrightEngine:
18
  def __init__(
19
- self, headless: Union[bool, str] = True,
20
- disable_resources: bool = False,
21
- useragent: Optional[str] = None,
22
- network_idle: bool = False,
23
- timeout: Optional[float] = 30000,
24
- wait: Optional[int] = 0,
25
- page_action: Callable = None,
26
- wait_selector: Optional[str] = None,
27
- locale: Optional[str] = 'en-US',
28
- wait_selector_state: SelectorWaitStates = 'attached',
29
- stealth: bool = False,
30
- real_chrome: bool = False,
31
- hide_canvas: bool = False,
32
- disable_webgl: bool = False,
33
- cdp_url: Optional[str] = None,
34
- nstbrowser_mode: bool = False,
35
- nstbrowser_config: Optional[Dict] = None,
36
- google_search: bool = True,
37
- extra_headers: Optional[Dict[str, str]] = None,
38
- proxy: Optional[Union[str, Dict[str, str]]] = None,
39
- adaptor_arguments: Dict = None
 
40
  ):
41
  """An engine that utilizes PlayWright library, check the `PlayWrightFetcher` class for more documentation.
42
 
@@ -65,7 +69,7 @@ class PlaywrightEngine:
65
  :param adaptor_arguments: The arguments that will be passed in the end while creating the final Adaptor's class.
66
  """
67
  self.headless = headless
68
- self.locale = check_type_validity(locale, [str], 'en-US', param_name='locale')
69
  self.disable_resources = disable_resources
70
  self.network_idle = bool(network_idle)
71
  self.stealth = bool(stealth)
@@ -95,8 +99,8 @@ class PlaywrightEngine:
95
  self.adaptor_arguments = adaptor_arguments if adaptor_arguments else {}
96
  self.harmful_default_args = [
97
  # This will be ignored to avoid detection more and possibly avoid the popup crashing bug abuse: https://issues.chromium.org/issues/340836884
98
- '--enable-automation',
99
- '--disable-popup-blocking',
100
  # '--disable-component-update',
101
  # '--disable-default-apps',
102
  # '--disable-extensions',
@@ -114,12 +118,16 @@ class PlaywrightEngine:
114
  query = NSTBROWSER_DEFAULT_QUERY.copy()
115
  if self.stealth:
116
  flags = self.__set_flags()
117
- query.update({
118
- "args": dict(zip(flags, [''] * len(flags))), # browser args should be a dictionary
119
- })
 
 
 
 
120
 
121
  config = {
122
- 'config': json.dumps(query),
123
  # 'token': ''
124
  }
125
  cdp_url = construct_cdp_url(cdp_url, config)
@@ -134,17 +142,25 @@ class PlaywrightEngine:
134
  """Returns the flags that will be used while launching the browser if stealth mode is enabled"""
135
  flags = DEFAULT_STEALTH_FLAGS
136
  if self.hide_canvas:
137
- flags += ('--fingerprinting-canvas-image-data-noise',)
138
  if self.disable_webgl:
139
- flags += ('--disable-webgl', '--disable-webgl-image-chromium', '--disable-webgl2',)
 
 
 
 
140
 
141
  return flags
142
 
143
  def __launch_kwargs(self):
144
  """Creates the arguments we will use while launching playwright's browser"""
145
- launch_kwargs = {'headless': self.headless, 'ignore_default_args': self.harmful_default_args, 'channel': 'chrome' if self.real_chrome else 'chromium'}
 
 
 
 
146
  if self.stealth:
147
- launch_kwargs.update({'args': self.__set_flags(), 'chromium_sandbox': True})
148
 
149
  return launch_kwargs
150
 
@@ -153,22 +169,26 @@ class PlaywrightEngine:
153
  context_kwargs = {
154
  "proxy": self.proxy,
155
  "locale": self.locale,
156
- "color_scheme": 'dark', # Bypasses the 'prefersLightColor' check in creepjs
157
  "device_scale_factor": 2,
158
  "extra_http_headers": self.extra_headers if self.extra_headers else {},
159
- "user_agent": self.useragent if self.useragent else generate_headers(browser_mode=True).get('User-Agent'),
 
 
160
  }
161
  if self.stealth:
162
- context_kwargs.update({
163
- 'is_mobile': False,
164
- 'has_touch': False,
165
- # I'm thinking about disabling it to rest from all Service Workers headache but let's keep it as it is for now
166
- 'service_workers': 'allow',
167
- 'ignore_https_errors': True,
168
- 'screen': {'width': 1920, 'height': 1080},
169
- 'viewport': {'width': 1920, 'height': 1080},
170
- 'permissions': ['geolocation', 'notifications']
171
- })
 
 
172
 
173
  return context_kwargs
174
 
@@ -184,10 +204,16 @@ class PlaywrightEngine:
184
  # https://arh.antoinevastel.com/bots/areyouheadless/
185
  # https://prescience-data.github.io/execution-monitor.html
186
  return tuple(
187
- js_bypass_path(script) for script in (
 
188
  # Order is important
189
- 'webdriver_fully.js', 'window_chrome.js', 'navigator_plugins.js', 'pdf_viewer.js',
190
- 'notification_permission.js', 'screen_props.js', 'playwright_fingerprint.js'
 
 
 
 
 
191
  )
192
  )
193
 
@@ -200,19 +226,30 @@ class PlaywrightEngine:
200
  while current_request:
201
  try:
202
  current_response = current_request.response()
203
- history.insert(0, Response(
204
- url=current_request.url,
205
- # using current_response.text() will trigger "Error: Response.text: Response body is unavailable for redirect responses"
206
- text='',
207
- body=b'',
208
- status=current_response.status if current_response else 301,
209
- reason=(current_response.status_text or StatusText.get(current_response.status)) if current_response else StatusText.get(301),
210
- encoding=current_response.headers.get('content-type', '') or 'utf-8',
211
- cookies={},
212
- headers=current_response.all_headers() if current_response else {},
213
- request_headers=current_request.all_headers(),
214
- **self.adaptor_arguments
215
- ))
 
 
 
 
 
 
 
 
 
 
 
216
  except Exception as e:
217
  log.error(f"Error processing redirect: {e}")
218
  break
@@ -232,19 +269,30 @@ class PlaywrightEngine:
232
  while current_request:
233
  try:
234
  current_response = await current_request.response()
235
- history.insert(0, Response(
236
- url=current_request.url,
237
- # using current_response.text() will trigger "Error: Response.text: Response body is unavailable for redirect responses"
238
- text='',
239
- body=b'',
240
- status=current_response.status if current_response else 301,
241
- reason=(current_response.status_text or StatusText.get(current_response.status)) if current_response else StatusText.get(301),
242
- encoding=current_response.headers.get('content-type', '') or 'utf-8',
243
- cookies={},
244
- headers=await current_response.all_headers() if current_response else {},
245
- request_headers=await current_request.all_headers(),
246
- **self.adaptor_arguments
247
- ))
 
 
 
 
 
 
 
 
 
 
 
248
  except Exception as e:
249
  log.error(f"Error processing redirect: {e}")
250
  break
@@ -262,6 +310,7 @@ class PlaywrightEngine:
262
  :return: A `Response` object that is the same as `Adaptor` object except it has these added attributes: `status`, `reason`, `cookies`, `headers`, and `request_headers`
263
  """
264
  from playwright.sync_api import Response as PlaywrightResponse
 
265
  if not self.stealth or self.real_chrome:
266
  # Because rebrowser_playwright doesn't play well with real browsers
267
  from playwright.sync_api import sync_playwright
@@ -273,7 +322,10 @@ class PlaywrightEngine:
273
 
274
  def handle_response(finished_response: PlaywrightResponse):
275
  nonlocal final_response
276
- if finished_response.request.resource_type == "document" and finished_response.request.is_navigation_request():
 
 
 
277
  final_response = finished_response
278
 
279
  with sync_playwright() as p:
@@ -304,7 +356,7 @@ class PlaywrightEngine:
304
  page.wait_for_load_state(state="domcontentloaded")
305
 
306
  if self.network_idle:
307
- page.wait_for_load_state('networkidle')
308
 
309
  if self.page_action is not None:
310
  try:
@@ -320,7 +372,7 @@ class PlaywrightEngine:
320
  page.wait_for_load_state(state="load")
321
  page.wait_for_load_state(state="domcontentloaded")
322
  if self.network_idle:
323
- page.wait_for_load_state('networkidle')
324
  except Exception as e:
325
  log.error(f"Error waiting for selector {self.wait_selector}: {e}")
326
 
@@ -331,9 +383,13 @@ class PlaywrightEngine:
331
  raise ValueError("Failed to get a response from the page")
332
 
333
  # This will be parsed inside `Response`
334
- encoding = final_response.headers.get('content-type', '') or 'utf-8' # default encoding
 
 
335
  # PlayWright API sometimes give empty status text for some reason!
336
- status_text = final_response.status_text or StatusText.get(final_response.status)
 
 
337
 
338
  history = self._process_response_history(first_response)
339
  try:
@@ -345,15 +401,17 @@ class PlaywrightEngine:
345
  response = Response(
346
  url=page.url,
347
  text=page_content,
348
- body=page_content.encode('utf-8'),
349
  status=final_response.status,
350
  reason=status_text,
351
  encoding=encoding,
352
- cookies={cookie['name']: cookie['value'] for cookie in page.context.cookies()},
 
 
353
  headers=first_response.all_headers(),
354
  request_headers=first_response.request.all_headers(),
355
  history=history,
356
- **self.adaptor_arguments
357
  )
358
  page.close()
359
  context.close()
@@ -366,6 +424,7 @@ class PlaywrightEngine:
366
  :return: A `Response` object that is the same as `Adaptor` object except it has these added attributes: `status`, `reason`, `cookies`, `headers`, and `request_headers`
367
  """
368
  from playwright.async_api import Response as PlaywrightResponse
 
369
  if not self.stealth or self.real_chrome:
370
  # Because rebrowser_playwright doesn't play well with real browsers
371
  from playwright.async_api import async_playwright
@@ -377,7 +436,10 @@ class PlaywrightEngine:
377
 
378
  async def handle_response(finished_response: PlaywrightResponse):
379
  nonlocal final_response
380
- if finished_response.request.resource_type == "document" and finished_response.request.is_navigation_request():
 
 
 
381
  final_response = finished_response
382
 
383
  async with async_playwright() as p:
@@ -408,7 +470,7 @@ class PlaywrightEngine:
408
  await page.wait_for_load_state(state="domcontentloaded")
409
 
410
  if self.network_idle:
411
- await page.wait_for_load_state('networkidle')
412
 
413
  if self.page_action is not None:
414
  try:
@@ -424,7 +486,7 @@ class PlaywrightEngine:
424
  await page.wait_for_load_state(state="load")
425
  await page.wait_for_load_state(state="domcontentloaded")
426
  if self.network_idle:
427
- await page.wait_for_load_state('networkidle')
428
  except Exception as e:
429
  log.error(f"Error waiting for selector {self.wait_selector}: {e}")
430
 
@@ -435,9 +497,13 @@ class PlaywrightEngine:
435
  raise ValueError("Failed to get a response from the page")
436
 
437
  # This will be parsed inside `Response`
438
- encoding = final_response.headers.get('content-type', '') or 'utf-8' # default encoding
 
 
439
  # PlayWright API sometimes give empty status text for some reason!
440
- status_text = final_response.status_text or StatusText.get(final_response.status)
 
 
441
 
442
  history = await self._async_process_response_history(first_response)
443
  try:
@@ -449,15 +515,18 @@ class PlaywrightEngine:
449
  response = Response(
450
  url=page.url,
451
  text=page_content,
452
- body=page_content.encode('utf-8'),
453
  status=final_response.status,
454
  reason=status_text,
455
  encoding=encoding,
456
- cookies={cookie['name']: cookie['value'] for cookie in await page.context.cookies()},
 
 
 
457
  headers=await first_response.all_headers(),
458
  request_headers=await first_response.request.all_headers(),
459
  history=history,
460
- **self.adaptor_arguments
461
  )
462
  await page.close()
463
  await context.close()
 
1
  import json
2
 
3
+ from scrapling.core._types import Callable, Dict, Optional, SelectorWaitStates, Union
 
4
  from scrapling.core.utils import log, lru_cache
5
+ from scrapling.engines.constants import DEFAULT_STEALTH_FLAGS, NSTBROWSER_DEFAULT_QUERY
6
+ from scrapling.engines.toolbelt import (
7
+ Response,
8
+ StatusText,
9
+ async_intercept_route,
10
+ check_type_validity,
11
+ construct_cdp_url,
12
+ construct_proxy_dict,
13
+ generate_convincing_referer,
14
+ generate_headers,
15
+ intercept_route,
16
+ js_bypass_path,
17
+ )
18
 
19
 
20
  class PlaywrightEngine:
21
  def __init__(
22
+ self,
23
+ headless: Union[bool, str] = True,
24
+ disable_resources: bool = False,
25
+ useragent: Optional[str] = None,
26
+ network_idle: bool = False,
27
+ timeout: Optional[float] = 30000,
28
+ wait: Optional[int] = 0,
29
+ page_action: Callable = None,
30
+ wait_selector: Optional[str] = None,
31
+ locale: Optional[str] = "en-US",
32
+ wait_selector_state: SelectorWaitStates = "attached",
33
+ stealth: bool = False,
34
+ real_chrome: bool = False,
35
+ hide_canvas: bool = False,
36
+ disable_webgl: bool = False,
37
+ cdp_url: Optional[str] = None,
38
+ nstbrowser_mode: bool = False,
39
+ nstbrowser_config: Optional[Dict] = None,
40
+ google_search: bool = True,
41
+ extra_headers: Optional[Dict[str, str]] = None,
42
+ proxy: Optional[Union[str, Dict[str, str]]] = None,
43
+ adaptor_arguments: Dict = None,
44
  ):
45
  """An engine that utilizes PlayWright library, check the `PlayWrightFetcher` class for more documentation.
46
 
 
69
  :param adaptor_arguments: The arguments that will be passed in the end while creating the final Adaptor's class.
70
  """
71
  self.headless = headless
72
+ self.locale = check_type_validity(locale, [str], "en-US", param_name="locale")
73
  self.disable_resources = disable_resources
74
  self.network_idle = bool(network_idle)
75
  self.stealth = bool(stealth)
 
99
  self.adaptor_arguments = adaptor_arguments if adaptor_arguments else {}
100
  self.harmful_default_args = [
101
  # This will be ignored to avoid detection more and possibly avoid the popup crashing bug abuse: https://issues.chromium.org/issues/340836884
102
+ "--enable-automation",
103
+ "--disable-popup-blocking",
104
  # '--disable-component-update',
105
  # '--disable-default-apps',
106
  # '--disable-extensions',
 
118
  query = NSTBROWSER_DEFAULT_QUERY.copy()
119
  if self.stealth:
120
  flags = self.__set_flags()
121
+ query.update(
122
+ {
123
+ "args": dict(
124
+ zip(flags, [""] * len(flags))
125
+ ), # browser args should be a dictionary
126
+ }
127
+ )
128
 
129
  config = {
130
+ "config": json.dumps(query),
131
  # 'token': ''
132
  }
133
  cdp_url = construct_cdp_url(cdp_url, config)
 
142
  """Returns the flags that will be used while launching the browser if stealth mode is enabled"""
143
  flags = DEFAULT_STEALTH_FLAGS
144
  if self.hide_canvas:
145
+ flags += ("--fingerprinting-canvas-image-data-noise",)
146
  if self.disable_webgl:
147
+ flags += (
148
+ "--disable-webgl",
149
+ "--disable-webgl-image-chromium",
150
+ "--disable-webgl2",
151
+ )
152
 
153
  return flags
154
 
155
  def __launch_kwargs(self):
156
  """Creates the arguments we will use while launching playwright's browser"""
157
+ launch_kwargs = {
158
+ "headless": self.headless,
159
+ "ignore_default_args": self.harmful_default_args,
160
+ "channel": "chrome" if self.real_chrome else "chromium",
161
+ }
162
  if self.stealth:
163
+ launch_kwargs.update({"args": self.__set_flags(), "chromium_sandbox": True})
164
 
165
  return launch_kwargs
166
 
 
169
  context_kwargs = {
170
  "proxy": self.proxy,
171
  "locale": self.locale,
172
+ "color_scheme": "dark", # Bypasses the 'prefersLightColor' check in creepjs
173
  "device_scale_factor": 2,
174
  "extra_http_headers": self.extra_headers if self.extra_headers else {},
175
+ "user_agent": self.useragent
176
+ if self.useragent
177
+ else generate_headers(browser_mode=True).get("User-Agent"),
178
  }
179
  if self.stealth:
180
+ context_kwargs.update(
181
+ {
182
+ "is_mobile": False,
183
+ "has_touch": False,
184
+ # I'm thinking about disabling it to rest from all Service Workers headache but let's keep it as it is for now
185
+ "service_workers": "allow",
186
+ "ignore_https_errors": True,
187
+ "screen": {"width": 1920, "height": 1080},
188
+ "viewport": {"width": 1920, "height": 1080},
189
+ "permissions": ["geolocation", "notifications"],
190
+ }
191
+ )
192
 
193
  return context_kwargs
194
 
 
204
  # https://arh.antoinevastel.com/bots/areyouheadless/
205
  # https://prescience-data.github.io/execution-monitor.html
206
  return tuple(
207
+ js_bypass_path(script)
208
+ for script in (
209
  # Order is important
210
+ "webdriver_fully.js",
211
+ "window_chrome.js",
212
+ "navigator_plugins.js",
213
+ "pdf_viewer.js",
214
+ "notification_permission.js",
215
+ "screen_props.js",
216
+ "playwright_fingerprint.js",
217
  )
218
  )
219
 
 
226
  while current_request:
227
  try:
228
  current_response = current_request.response()
229
+ history.insert(
230
+ 0,
231
+ Response(
232
+ url=current_request.url,
233
+ # using current_response.text() will trigger "Error: Response.text: Response body is unavailable for redirect responses"
234
+ text="",
235
+ body=b"",
236
+ status=current_response.status if current_response else 301,
237
+ reason=(
238
+ current_response.status_text
239
+ or StatusText.get(current_response.status)
240
+ )
241
+ if current_response
242
+ else StatusText.get(301),
243
+ encoding=current_response.headers.get("content-type", "")
244
+ or "utf-8",
245
+ cookies={},
246
+ headers=current_response.all_headers()
247
+ if current_response
248
+ else {},
249
+ request_headers=current_request.all_headers(),
250
+ **self.adaptor_arguments,
251
+ ),
252
+ )
253
  except Exception as e:
254
  log.error(f"Error processing redirect: {e}")
255
  break
 
269
  while current_request:
270
  try:
271
  current_response = await current_request.response()
272
+ history.insert(
273
+ 0,
274
+ Response(
275
+ url=current_request.url,
276
+ # using current_response.text() will trigger "Error: Response.text: Response body is unavailable for redirect responses"
277
+ text="",
278
+ body=b"",
279
+ status=current_response.status if current_response else 301,
280
+ reason=(
281
+ current_response.status_text
282
+ or StatusText.get(current_response.status)
283
+ )
284
+ if current_response
285
+ else StatusText.get(301),
286
+ encoding=current_response.headers.get("content-type", "")
287
+ or "utf-8",
288
+ cookies={},
289
+ headers=await current_response.all_headers()
290
+ if current_response
291
+ else {},
292
+ request_headers=await current_request.all_headers(),
293
+ **self.adaptor_arguments,
294
+ ),
295
+ )
296
  except Exception as e:
297
  log.error(f"Error processing redirect: {e}")
298
  break
 
310
  :return: A `Response` object that is the same as `Adaptor` object except it has these added attributes: `status`, `reason`, `cookies`, `headers`, and `request_headers`
311
  """
312
  from playwright.sync_api import Response as PlaywrightResponse
313
+
314
  if not self.stealth or self.real_chrome:
315
  # Because rebrowser_playwright doesn't play well with real browsers
316
  from playwright.sync_api import sync_playwright
 
322
 
323
  def handle_response(finished_response: PlaywrightResponse):
324
  nonlocal final_response
325
+ if (
326
+ finished_response.request.resource_type == "document"
327
+ and finished_response.request.is_navigation_request()
328
+ ):
329
  final_response = finished_response
330
 
331
  with sync_playwright() as p:
 
356
  page.wait_for_load_state(state="domcontentloaded")
357
 
358
  if self.network_idle:
359
+ page.wait_for_load_state("networkidle")
360
 
361
  if self.page_action is not None:
362
  try:
 
372
  page.wait_for_load_state(state="load")
373
  page.wait_for_load_state(state="domcontentloaded")
374
  if self.network_idle:
375
+ page.wait_for_load_state("networkidle")
376
  except Exception as e:
377
  log.error(f"Error waiting for selector {self.wait_selector}: {e}")
378
 
 
383
  raise ValueError("Failed to get a response from the page")
384
 
385
  # This will be parsed inside `Response`
386
+ encoding = (
387
+ final_response.headers.get("content-type", "") or "utf-8"
388
+ ) # default encoding
389
  # PlayWright API sometimes give empty status text for some reason!
390
+ status_text = final_response.status_text or StatusText.get(
391
+ final_response.status
392
+ )
393
 
394
  history = self._process_response_history(first_response)
395
  try:
 
401
  response = Response(
402
  url=page.url,
403
  text=page_content,
404
+ body=page_content.encode("utf-8"),
405
  status=final_response.status,
406
  reason=status_text,
407
  encoding=encoding,
408
+ cookies={
409
+ cookie["name"]: cookie["value"] for cookie in page.context.cookies()
410
+ },
411
  headers=first_response.all_headers(),
412
  request_headers=first_response.request.all_headers(),
413
  history=history,
414
+ **self.adaptor_arguments,
415
  )
416
  page.close()
417
  context.close()
 
424
  :return: A `Response` object that is the same as `Adaptor` object except it has these added attributes: `status`, `reason`, `cookies`, `headers`, and `request_headers`
425
  """
426
  from playwright.async_api import Response as PlaywrightResponse
427
+
428
  if not self.stealth or self.real_chrome:
429
  # Because rebrowser_playwright doesn't play well with real browsers
430
  from playwright.async_api import async_playwright
 
436
 
437
  async def handle_response(finished_response: PlaywrightResponse):
438
  nonlocal final_response
439
+ if (
440
+ finished_response.request.resource_type == "document"
441
+ and finished_response.request.is_navigation_request()
442
+ ):
443
  final_response = finished_response
444
 
445
  async with async_playwright() as p:
 
470
  await page.wait_for_load_state(state="domcontentloaded")
471
 
472
  if self.network_idle:
473
+ await page.wait_for_load_state("networkidle")
474
 
475
  if self.page_action is not None:
476
  try:
 
486
  await page.wait_for_load_state(state="load")
487
  await page.wait_for_load_state(state="domcontentloaded")
488
  if self.network_idle:
489
+ await page.wait_for_load_state("networkidle")
490
  except Exception as e:
491
  log.error(f"Error waiting for selector {self.wait_selector}: {e}")
492
 
 
497
  raise ValueError("Failed to get a response from the page")
498
 
499
  # This will be parsed inside `Response`
500
+ encoding = (
501
+ final_response.headers.get("content-type", "") or "utf-8"
502
+ ) # default encoding
503
  # PlayWright API sometimes give empty status text for some reason!
504
+ status_text = final_response.status_text or StatusText.get(
505
+ final_response.status
506
+ )
507
 
508
  history = await self._async_process_response_history(first_response)
509
  try:
 
515
  response = Response(
516
  url=page.url,
517
  text=page_content,
518
+ body=page_content.encode("utf-8"),
519
  status=final_response.status,
520
  reason=status_text,
521
  encoding=encoding,
522
+ cookies={
523
+ cookie["name"]: cookie["value"]
524
+ for cookie in await page.context.cookies()
525
+ },
526
  headers=await first_response.all_headers(),
527
  request_headers=await first_response.request.all_headers(),
528
  history=history,
529
+ **self.adaptor_arguments,
530
  )
531
  await page.close()
532
  await context.close()
scrapling/engines/static.py CHANGED
@@ -10,8 +10,14 @@ from .toolbelt import Response, generate_convincing_referer, generate_headers
10
  @lru_cache(2, typed=True) # Singleton easily
11
  class StaticEngine:
12
  def __init__(
13
- self, url: str, proxy: Optional[str] = None, stealthy_headers: bool = True, follow_redirects: bool = True,
14
- timeout: Optional[Union[int, float]] = None, retries: Optional[int] = 3, adaptor_arguments: Tuple = None
 
 
 
 
 
 
15
  ):
16
  """An engine that utilizes httpx library, check the `Fetcher` class for more documentation.
17
 
@@ -47,14 +53,22 @@ class StaticEngine:
47
  if self.stealth:
48
  extra_headers = generate_headers(browser_mode=False)
49
  # Don't overwrite user supplied headers
50
- extra_headers = {key: value for key, value in extra_headers.items() if key.lower() not in headers_keys}
 
 
 
 
51
  headers.update(extra_headers)
52
- if 'referer' not in headers_keys:
53
- headers.update({'referer': generate_convincing_referer(self.url)})
54
 
55
- elif 'user-agent' not in headers_keys:
56
- headers['User-Agent'] = generate_headers(browser_mode=False).get('User-Agent')
57
- log.debug(f"Can't find useragent in headers so '{headers['User-Agent']}' was used.")
 
 
 
 
58
 
59
  return headers
60
 
@@ -70,25 +84,43 @@ class StaticEngine:
70
  body=response.content,
71
  status=response.status_code,
72
  reason=response.reason_phrase,
73
- encoding=response.encoding or 'utf-8',
74
  cookies=dict(response.cookies),
75
  headers=dict(response.headers),
76
  request_headers=dict(response.request.headers),
77
  method=response.request.method,
78
- history=[self._prepare_response(redirection) for redirection in response.history],
79
- **self.adaptor_arguments
 
 
80
  )
81
 
82
  def _make_request(self, method: str, **kwargs) -> Response:
83
- headers = self._headers_job(kwargs.pop('headers', {}))
84
- with httpx.Client(proxy=self.proxy, transport=httpx.HTTPTransport(retries=self.retries)) as client:
85
- request = getattr(client, method)(url=self.url, headers=headers, follow_redirects=self.follow_redirects, timeout=self.timeout, **kwargs)
 
 
 
 
 
 
 
 
86
  return self._prepare_response(request)
87
 
88
  async def _async_make_request(self, method: str, **kwargs) -> Response:
89
- headers = self._headers_job(kwargs.pop('headers', {}))
90
- async with httpx.AsyncClient(proxy=self.proxy, transport=httpx.AsyncHTTPTransport(retries=self.retries)) as client:
91
- request = await getattr(client, method)(url=self.url, headers=headers, follow_redirects=self.follow_redirects, timeout=self.timeout, **kwargs)
 
 
 
 
 
 
 
 
92
  return self._prepare_response(request)
93
 
94
  def get(self, **kwargs: Dict) -> Response:
@@ -97,7 +129,7 @@ class StaticEngine:
97
  :param kwargs: Any keyword arguments are passed directly to `httpx.get()` function so check httpx documentation for details.
98
  :return: A `Response` object that is the same as `Adaptor` object except it has these added attributes: `status`, `reason`, `cookies`, `headers`, and `request_headers`
99
  """
100
- return self._make_request('get', **kwargs)
101
 
102
  async def async_get(self, **kwargs: Dict) -> Response:
103
  """Make basic async HTTP GET request for you but with some added flavors.
@@ -105,7 +137,7 @@ class StaticEngine:
105
  :param kwargs: Any keyword arguments are passed directly to `httpx.get()` function so check httpx documentation for details.
106
  :return: A `Response` object that is the same as `Adaptor` object except it has these added attributes: `status`, `reason`, `cookies`, `headers`, and `request_headers`
107
  """
108
- return await self._async_make_request('get', **kwargs)
109
 
110
  def post(self, **kwargs: Dict) -> Response:
111
  """Make basic HTTP POST request for you but with some added flavors.
@@ -113,7 +145,7 @@ class StaticEngine:
113
  :param kwargs: Any keyword arguments are passed directly to `httpx.post()` function so check httpx documentation for details.
114
  :return: A `Response` object that is the same as `Adaptor` object except it has these added attributes: `status`, `reason`, `cookies`, `headers`, and `request_headers`
115
  """
116
- return self._make_request('post', **kwargs)
117
 
118
  async def async_post(self, **kwargs: Dict) -> Response:
119
  """Make basic async HTTP POST request for you but with some added flavors.
@@ -121,7 +153,7 @@ class StaticEngine:
121
  :param kwargs: Any keyword arguments are passed directly to `httpx.post()` function so check httpx documentation for details.
122
  :return: A `Response` object that is the same as `Adaptor` object except it has these added attributes: `status`, `reason`, `cookies`, `headers`, and `request_headers`
123
  """
124
- return await self._async_make_request('post', **kwargs)
125
 
126
  def delete(self, **kwargs: Dict) -> Response:
127
  """Make basic HTTP DELETE request for you but with some added flavors.
@@ -129,7 +161,7 @@ class StaticEngine:
129
  :param kwargs: Any keyword arguments are passed directly to `httpx.delete()` function so check httpx documentation for details.
130
  :return: A `Response` object that is the same as `Adaptor` object except it has these added attributes: `status`, `reason`, `cookies`, `headers`, and `request_headers`
131
  """
132
- return self._make_request('delete', **kwargs)
133
 
134
  async def async_delete(self, **kwargs: Dict) -> Response:
135
  """Make basic async HTTP DELETE request for you but with some added flavors.
@@ -137,7 +169,7 @@ class StaticEngine:
137
  :param kwargs: Any keyword arguments are passed directly to `httpx.delete()` function so check httpx documentation for details.
138
  :return: A `Response` object that is the same as `Adaptor` object except it has these added attributes: `status`, `reason`, `cookies`, `headers`, and `request_headers`
139
  """
140
- return await self._async_make_request('delete', **kwargs)
141
 
142
  def put(self, **kwargs: Dict) -> Response:
143
  """Make basic HTTP PUT request for you but with some added flavors.
@@ -145,7 +177,7 @@ class StaticEngine:
145
  :param kwargs: Any keyword arguments are passed directly to `httpx.put()` function so check httpx documentation for details.
146
  :return: A `Response` object that is the same as `Adaptor` object except it has these added attributes: `status`, `reason`, `cookies`, `headers`, and `request_headers`
147
  """
148
- return self._make_request('put', **kwargs)
149
 
150
  async def async_put(self, **kwargs: Dict) -> Response:
151
  """Make basic async HTTP PUT request for you but with some added flavors.
@@ -153,4 +185,4 @@ class StaticEngine:
153
  :param kwargs: Any keyword arguments are passed directly to `httpx.put()` function so check httpx documentation for details.
154
  :return: A `Response` object that is the same as `Adaptor` object except it has these added attributes: `status`, `reason`, `cookies`, `headers`, and `request_headers`
155
  """
156
- return await self._async_make_request('put', **kwargs)
 
10
  @lru_cache(2, typed=True) # Singleton easily
11
  class StaticEngine:
12
  def __init__(
13
+ self,
14
+ url: str,
15
+ proxy: Optional[str] = None,
16
+ stealthy_headers: bool = True,
17
+ follow_redirects: bool = True,
18
+ timeout: Optional[Union[int, float]] = None,
19
+ retries: Optional[int] = 3,
20
+ adaptor_arguments: Tuple = None,
21
  ):
22
  """An engine that utilizes httpx library, check the `Fetcher` class for more documentation.
23
 
 
53
  if self.stealth:
54
  extra_headers = generate_headers(browser_mode=False)
55
  # Don't overwrite user supplied headers
56
+ extra_headers = {
57
+ key: value
58
+ for key, value in extra_headers.items()
59
+ if key.lower() not in headers_keys
60
+ }
61
  headers.update(extra_headers)
62
+ if "referer" not in headers_keys:
63
+ headers.update({"referer": generate_convincing_referer(self.url)})
64
 
65
+ elif "user-agent" not in headers_keys:
66
+ headers["User-Agent"] = generate_headers(browser_mode=False).get(
67
+ "User-Agent"
68
+ )
69
+ log.debug(
70
+ f"Can't find useragent in headers so '{headers['User-Agent']}' was used."
71
+ )
72
 
73
  return headers
74
 
 
84
  body=response.content,
85
  status=response.status_code,
86
  reason=response.reason_phrase,
87
+ encoding=response.encoding or "utf-8",
88
  cookies=dict(response.cookies),
89
  headers=dict(response.headers),
90
  request_headers=dict(response.request.headers),
91
  method=response.request.method,
92
+ history=[
93
+ self._prepare_response(redirection) for redirection in response.history
94
+ ],
95
+ **self.adaptor_arguments,
96
  )
97
 
98
  def _make_request(self, method: str, **kwargs) -> Response:
99
+ headers = self._headers_job(kwargs.pop("headers", {}))
100
+ with httpx.Client(
101
+ proxy=self.proxy, transport=httpx.HTTPTransport(retries=self.retries)
102
+ ) as client:
103
+ request = getattr(client, method)(
104
+ url=self.url,
105
+ headers=headers,
106
+ follow_redirects=self.follow_redirects,
107
+ timeout=self.timeout,
108
+ **kwargs,
109
+ )
110
  return self._prepare_response(request)
111
 
112
  async def _async_make_request(self, method: str, **kwargs) -> Response:
113
+ headers = self._headers_job(kwargs.pop("headers", {}))
114
+ async with httpx.AsyncClient(
115
+ proxy=self.proxy, transport=httpx.AsyncHTTPTransport(retries=self.retries)
116
+ ) as client:
117
+ request = await getattr(client, method)(
118
+ url=self.url,
119
+ headers=headers,
120
+ follow_redirects=self.follow_redirects,
121
+ timeout=self.timeout,
122
+ **kwargs,
123
+ )
124
  return self._prepare_response(request)
125
 
126
  def get(self, **kwargs: Dict) -> Response:
 
129
  :param kwargs: Any keyword arguments are passed directly to `httpx.get()` function so check httpx documentation for details.
130
  :return: A `Response` object that is the same as `Adaptor` object except it has these added attributes: `status`, `reason`, `cookies`, `headers`, and `request_headers`
131
  """
132
+ return self._make_request("get", **kwargs)
133
 
134
  async def async_get(self, **kwargs: Dict) -> Response:
135
  """Make basic async HTTP GET request for you but with some added flavors.
 
137
  :param kwargs: Any keyword arguments are passed directly to `httpx.get()` function so check httpx documentation for details.
138
  :return: A `Response` object that is the same as `Adaptor` object except it has these added attributes: `status`, `reason`, `cookies`, `headers`, and `request_headers`
139
  """
140
+ return await self._async_make_request("get", **kwargs)
141
 
142
  def post(self, **kwargs: Dict) -> Response:
143
  """Make basic HTTP POST request for you but with some added flavors.
 
145
  :param kwargs: Any keyword arguments are passed directly to `httpx.post()` function so check httpx documentation for details.
146
  :return: A `Response` object that is the same as `Adaptor` object except it has these added attributes: `status`, `reason`, `cookies`, `headers`, and `request_headers`
147
  """
148
+ return self._make_request("post", **kwargs)
149
 
150
  async def async_post(self, **kwargs: Dict) -> Response:
151
  """Make basic async HTTP POST request for you but with some added flavors.
 
153
  :param kwargs: Any keyword arguments are passed directly to `httpx.post()` function so check httpx documentation for details.
154
  :return: A `Response` object that is the same as `Adaptor` object except it has these added attributes: `status`, `reason`, `cookies`, `headers`, and `request_headers`
155
  """
156
+ return await self._async_make_request("post", **kwargs)
157
 
158
  def delete(self, **kwargs: Dict) -> Response:
159
  """Make basic HTTP DELETE request for you but with some added flavors.
 
161
  :param kwargs: Any keyword arguments are passed directly to `httpx.delete()` function so check httpx documentation for details.
162
  :return: A `Response` object that is the same as `Adaptor` object except it has these added attributes: `status`, `reason`, `cookies`, `headers`, and `request_headers`
163
  """
164
+ return self._make_request("delete", **kwargs)
165
 
166
  async def async_delete(self, **kwargs: Dict) -> Response:
167
  """Make basic async HTTP DELETE request for you but with some added flavors.
 
169
  :param kwargs: Any keyword arguments are passed directly to `httpx.delete()` function so check httpx documentation for details.
170
  :return: A `Response` object that is the same as `Adaptor` object except it has these added attributes: `status`, `reason`, `cookies`, `headers`, and `request_headers`
171
  """
172
+ return await self._async_make_request("delete", **kwargs)
173
 
174
  def put(self, **kwargs: Dict) -> Response:
175
  """Make basic HTTP PUT request for you but with some added flavors.
 
177
  :param kwargs: Any keyword arguments are passed directly to `httpx.put()` function so check httpx documentation for details.
178
  :return: A `Response` object that is the same as `Adaptor` object except it has these added attributes: `status`, `reason`, `cookies`, `headers`, and `request_headers`
179
  """
180
+ return self._make_request("put", **kwargs)
181
 
182
  async def async_put(self, **kwargs: Dict) -> Response:
183
  """Make basic async HTTP PUT request for you but with some added flavors.
 
185
  :param kwargs: Any keyword arguments are passed directly to `httpx.put()` function so check httpx documentation for details.
186
  :return: A `Response` object that is the same as `Adaptor` object except it has these added attributes: `status`, `reason`, `cookies`, `headers`, and `request_headers`
187
  """
188
+ return await self._async_make_request("put", **kwargs)
scrapling/engines/toolbelt/__init__.py CHANGED
@@ -1,6 +1,16 @@
1
- from .custom import (BaseFetcher, Response, StatusText, check_if_engine_usable,
2
- check_type_validity, get_variable_name)
3
- from .fingerprints import (generate_convincing_referer, generate_headers,
4
- get_os_name)
5
- from .navigation import (async_intercept_route, construct_cdp_url,
6
- construct_proxy_dict, intercept_route, js_bypass_path)
 
 
 
 
 
 
 
 
 
 
 
1
+ from .custom import (
2
+ BaseFetcher,
3
+ Response,
4
+ StatusText,
5
+ check_if_engine_usable,
6
+ check_type_validity,
7
+ get_variable_name,
8
+ )
9
+ from .fingerprints import generate_convincing_referer, generate_headers, get_os_name
10
+ from .navigation import (
11
+ async_intercept_route,
12
+ construct_cdp_url,
13
+ construct_proxy_dict,
14
+ intercept_route,
15
+ js_bypass_path,
16
+ )
scrapling/engines/toolbelt/custom.py CHANGED
@@ -1,11 +1,20 @@
1
  """
2
  Functions related to custom types or type checking
3
  """
 
4
  import inspect
5
  from email.message import Message
6
 
7
- from scrapling.core._types import (Any, Callable, Dict, List, Optional, Tuple,
8
- Type, Union)
 
 
 
 
 
 
 
 
9
  from scrapling.core.custom_types import MappingProxyType
10
  from scrapling.core.utils import log, lru_cache
11
  from scrapling.parser import Adaptor, SQLiteStorageSystem
@@ -13,7 +22,12 @@ from scrapling.parser import Adaptor, SQLiteStorageSystem
13
 
14
  class ResponseEncoding:
15
  __DEFAULT_ENCODING = "utf-8"
16
- __ISO_8859_1_CONTENT_TYPES = {"text/plain", "text/html", "text/css", "text/javascript"}
 
 
 
 
 
17
 
18
  @classmethod
19
  @lru_cache(maxsize=128)
@@ -27,19 +41,21 @@ class ResponseEncoding:
27
  """
28
  # Create a Message object and set the Content-Type header then get the content type and parameters
29
  msg = Message()
30
- msg['content-type'] = header_value
31
 
32
  content_type = msg.get_content_type()
33
  params = dict(msg.get_params(failobj=[]))
34
 
35
  # Remove the content-type from params if present somehow
36
- params.pop('content-type', None)
37
 
38
  return content_type, params
39
 
40
  @classmethod
41
  @lru_cache(maxsize=128)
42
- def get_value(cls, content_type: Optional[str], text: Optional[str] = 'test') -> str:
 
 
43
  """Determine the appropriate character encoding from a content-type header.
44
 
45
  The encoding is determined by these rules in order:
@@ -72,7 +88,9 @@ class ResponseEncoding:
72
  encoding = cls.__DEFAULT_ENCODING
73
 
74
  if encoding:
75
- _ = text.encode(encoding) # Validate encoding and validate it can encode the given text
 
 
76
  return encoding
77
 
78
  return cls.__DEFAULT_ENCODING
@@ -84,9 +102,22 @@ class ResponseEncoding:
84
  class Response(Adaptor):
85
  """This class is returned by all engines as a way to unify response type between different libraries."""
86
 
87
- def __init__(self, url: str, text: str, body: bytes, status: int, reason: str, cookies: Dict, headers: Dict, request_headers: Dict,
88
- encoding: str = 'utf-8', method: str = 'GET', history: List = None, **adaptor_arguments: Dict):
89
- automatch_domain = adaptor_arguments.pop('automatch_domain', None)
 
 
 
 
 
 
 
 
 
 
 
 
 
90
  self.status = status
91
  self.reason = reason
92
  self.cookies = cookies
@@ -94,11 +125,19 @@ class Response(Adaptor):
94
  self.request_headers = request_headers
95
  self.history = history or []
96
  encoding = ResponseEncoding.get_value(encoding, text)
97
- super().__init__(text=text, body=body, url=automatch_domain or url, encoding=encoding, **adaptor_arguments)
 
 
 
 
 
 
98
  # For back-ward compatibility
99
  self.adaptor = self
100
  # For easier debugging while working from a Python shell
101
- log.info(f'Fetched ({status}) <{method} {url}> (referer: {request_headers.get("referer")})')
 
 
102
 
103
  # def __repr__(self):
104
  # return f'<{self.__class__.__name__} [{self.status} {self.reason}]>'
@@ -113,16 +152,26 @@ class BaseFetcher:
113
  storage_args: Optional[Dict] = None
114
  keep_comments: Optional[bool] = False
115
  automatch_domain: Optional[str] = None
116
- parser_keywords: Tuple = ('huge_tree', 'auto_match', 'storage', 'keep_cdata', 'storage_args', 'keep_comments', 'automatch_domain',) # Left open for the user
 
 
 
 
 
 
 
 
117
 
118
  def __init__(self, *args, **kwargs):
119
  # For backward-compatibility before 0.2.99
120
- args_str = ", ".join(args) or ''
121
- kwargs_str = ", ".join(f'{k}={v}' for k, v in kwargs.items()) or ''
122
  if args_str:
123
- args_str += ', '
124
 
125
- log.warning(f'This logic is deprecated now, and have no effect; It will be removed with v0.3. Use `{self.__class__.__name__}.configure({args_str}{kwargs_str})` instead before fetching')
 
 
126
  pass
127
 
128
  @classmethod
@@ -150,12 +199,18 @@ class BaseFetcher:
150
  setattr(cls, key, value)
151
  else:
152
  # Yup, no fun allowed LOL
153
- raise AttributeError(f'Unknown parser argument: "{key}"; maybe you meant {cls.parser_keywords}?')
 
 
154
  else:
155
- raise ValueError(f'Unknown parser argument: "{key}"; maybe you meant {cls.parser_keywords}?')
 
 
156
 
157
  if not kwargs:
158
- raise AttributeError(f'You must pass a keyword to configure, current keywords: {cls.parser_keywords}?')
 
 
159
 
160
  @classmethod
161
  def _generate_parser_arguments(cls) -> Dict:
@@ -167,13 +222,15 @@ class BaseFetcher:
167
  keep_cdata=cls.keep_cdata,
168
  auto_match=cls.auto_match,
169
  storage=cls.storage,
170
- storage_args=cls.storage_args
171
  )
172
  if cls.automatch_domain:
173
  if type(cls.automatch_domain) is not str:
174
- log.warning('[Ignored] The argument "automatch_domain" must be of string type')
 
 
175
  else:
176
- parser_arguments.update({'automatch_domain': cls.automatch_domain})
177
 
178
  return parser_arguments
179
 
@@ -181,72 +238,75 @@ class BaseFetcher:
181
  class StatusText:
182
  """A class that gets the status text of response status code.
183
 
184
- Reference: https://developer.mozilla.org/en-US/docs/Web/HTTP/Status
185
  """
186
- _phrases = MappingProxyType({
187
- 100: "Continue",
188
- 101: "Switching Protocols",
189
- 102: "Processing",
190
- 103: "Early Hints",
191
- 200: "OK",
192
- 201: "Created",
193
- 202: "Accepted",
194
- 203: "Non-Authoritative Information",
195
- 204: "No Content",
196
- 205: "Reset Content",
197
- 206: "Partial Content",
198
- 207: "Multi-Status",
199
- 208: "Already Reported",
200
- 226: "IM Used",
201
- 300: "Multiple Choices",
202
- 301: "Moved Permanently",
203
- 302: "Found",
204
- 303: "See Other",
205
- 304: "Not Modified",
206
- 305: "Use Proxy",
207
- 307: "Temporary Redirect",
208
- 308: "Permanent Redirect",
209
- 400: "Bad Request",
210
- 401: "Unauthorized",
211
- 402: "Payment Required",
212
- 403: "Forbidden",
213
- 404: "Not Found",
214
- 405: "Method Not Allowed",
215
- 406: "Not Acceptable",
216
- 407: "Proxy Authentication Required",
217
- 408: "Request Timeout",
218
- 409: "Conflict",
219
- 410: "Gone",
220
- 411: "Length Required",
221
- 412: "Precondition Failed",
222
- 413: "Payload Too Large",
223
- 414: "URI Too Long",
224
- 415: "Unsupported Media Type",
225
- 416: "Range Not Satisfiable",
226
- 417: "Expectation Failed",
227
- 418: "I'm a teapot",
228
- 421: "Misdirected Request",
229
- 422: "Unprocessable Entity",
230
- 423: "Locked",
231
- 424: "Failed Dependency",
232
- 425: "Too Early",
233
- 426: "Upgrade Required",
234
- 428: "Precondition Required",
235
- 429: "Too Many Requests",
236
- 431: "Request Header Fields Too Large",
237
- 451: "Unavailable For Legal Reasons",
238
- 500: "Internal Server Error",
239
- 501: "Not Implemented",
240
- 502: "Bad Gateway",
241
- 503: "Service Unavailable",
242
- 504: "Gateway Timeout",
243
- 505: "HTTP Version Not Supported",
244
- 506: "Variant Also Negotiates",
245
- 507: "Insufficient Storage",
246
- 508: "Loop Detected",
247
- 510: "Not Extended",
248
- 511: "Network Authentication Required"
249
- })
 
 
 
250
 
251
  @classmethod
252
  @lru_cache(maxsize=128)
@@ -265,20 +325,26 @@ def check_if_engine_usable(engine: Callable) -> Union[Callable, None]:
265
  # if isinstance(engine, type):
266
  # raise TypeError("Expected an engine instance, not a class definition of the engine")
267
 
268
- if hasattr(engine, 'fetch'):
269
  fetch_function = getattr(engine, "fetch")
270
  if callable(fetch_function):
271
  if len(inspect.signature(fetch_function).parameters) > 0:
272
  return engine
273
  else:
274
  # raise TypeError("Engine class instance must have a callable method 'fetch' with the first argument used for the url.")
275
- raise TypeError("Engine class must have a callable method 'fetch' with the first argument used for the url.")
 
 
276
  else:
277
  # raise TypeError("Invalid engine instance! Engine class must have a callable method 'fetch'")
278
- raise TypeError("Invalid engine class! Engine class must have a callable method 'fetch'")
 
 
279
  else:
280
  # raise TypeError("Invalid engine instance! Engine class must have the method 'fetch'")
281
- raise TypeError("Invalid engine class! Engine class must have the method 'fetch'")
 
 
282
 
283
 
284
  def get_variable_name(var: Any) -> Optional[str]:
@@ -293,7 +359,13 @@ def get_variable_name(var: Any) -> Optional[str]:
293
  return None
294
 
295
 
296
- def check_type_validity(variable: Any, valid_types: Union[List[Type], None], default_value: Any = None, critical: bool = False, param_name: Optional[str] = None) -> Any:
 
 
 
 
 
 
297
  """Check if a variable matches the specified type constraints.
298
  :param variable: The variable to check
299
  :param valid_types: List of valid types for the variable
@@ -316,7 +388,7 @@ def check_type_validity(variable: Any, valid_types: Union[List[Type], None], def
316
  error_msg = f'Argument "{var_name}" cannot be None'
317
  if critical:
318
  raise TypeError(error_msg)
319
- log.error(f'[Ignored] {error_msg}')
320
  return default_value
321
 
322
  # If no valid_types specified and variable has a value, return it
@@ -329,7 +401,7 @@ def check_type_validity(variable: Any, valid_types: Union[List[Type], None], def
329
  error_msg = f'Argument "{var_name}" must be of type {" or ".join(type_names)}'
330
  if critical:
331
  raise TypeError(error_msg)
332
- log.error(f'[Ignored] {error_msg}')
333
  return default_value
334
 
335
  return variable
 
1
  """
2
  Functions related to custom types or type checking
3
  """
4
+
5
  import inspect
6
  from email.message import Message
7
 
8
+ from scrapling.core._types import (
9
+ Any,
10
+ Callable,
11
+ Dict,
12
+ List,
13
+ Optional,
14
+ Tuple,
15
+ Type,
16
+ Union,
17
+ )
18
  from scrapling.core.custom_types import MappingProxyType
19
  from scrapling.core.utils import log, lru_cache
20
  from scrapling.parser import Adaptor, SQLiteStorageSystem
 
22
 
23
  class ResponseEncoding:
24
  __DEFAULT_ENCODING = "utf-8"
25
+ __ISO_8859_1_CONTENT_TYPES = {
26
+ "text/plain",
27
+ "text/html",
28
+ "text/css",
29
+ "text/javascript",
30
+ }
31
 
32
  @classmethod
33
  @lru_cache(maxsize=128)
 
41
  """
42
  # Create a Message object and set the Content-Type header then get the content type and parameters
43
  msg = Message()
44
+ msg["content-type"] = header_value
45
 
46
  content_type = msg.get_content_type()
47
  params = dict(msg.get_params(failobj=[]))
48
 
49
  # Remove the content-type from params if present somehow
50
+ params.pop("content-type", None)
51
 
52
  return content_type, params
53
 
54
  @classmethod
55
  @lru_cache(maxsize=128)
56
+ def get_value(
57
+ cls, content_type: Optional[str], text: Optional[str] = "test"
58
+ ) -> str:
59
  """Determine the appropriate character encoding from a content-type header.
60
 
61
  The encoding is determined by these rules in order:
 
88
  encoding = cls.__DEFAULT_ENCODING
89
 
90
  if encoding:
91
+ _ = text.encode(
92
+ encoding
93
+ ) # Validate encoding and validate it can encode the given text
94
  return encoding
95
 
96
  return cls.__DEFAULT_ENCODING
 
102
  class Response(Adaptor):
103
  """This class is returned by all engines as a way to unify response type between different libraries."""
104
 
105
+ def __init__(
106
+ self,
107
+ url: str,
108
+ text: str,
109
+ body: bytes,
110
+ status: int,
111
+ reason: str,
112
+ cookies: Dict,
113
+ headers: Dict,
114
+ request_headers: Dict,
115
+ encoding: str = "utf-8",
116
+ method: str = "GET",
117
+ history: List = None,
118
+ **adaptor_arguments: Dict,
119
+ ):
120
+ automatch_domain = adaptor_arguments.pop("automatch_domain", None)
121
  self.status = status
122
  self.reason = reason
123
  self.cookies = cookies
 
125
  self.request_headers = request_headers
126
  self.history = history or []
127
  encoding = ResponseEncoding.get_value(encoding, text)
128
+ super().__init__(
129
+ text=text,
130
+ body=body,
131
+ url=automatch_domain or url,
132
+ encoding=encoding,
133
+ **adaptor_arguments,
134
+ )
135
  # For back-ward compatibility
136
  self.adaptor = self
137
  # For easier debugging while working from a Python shell
138
+ log.info(
139
+ f"Fetched ({status}) <{method} {url}> (referer: {request_headers.get('referer')})"
140
+ )
141
 
142
  # def __repr__(self):
143
  # return f'<{self.__class__.__name__} [{self.status} {self.reason}]>'
 
152
  storage_args: Optional[Dict] = None
153
  keep_comments: Optional[bool] = False
154
  automatch_domain: Optional[str] = None
155
+ parser_keywords: Tuple = (
156
+ "huge_tree",
157
+ "auto_match",
158
+ "storage",
159
+ "keep_cdata",
160
+ "storage_args",
161
+ "keep_comments",
162
+ "automatch_domain",
163
+ ) # Left open for the user
164
 
165
  def __init__(self, *args, **kwargs):
166
  # For backward-compatibility before 0.2.99
167
+ args_str = ", ".join(args) or ""
168
+ kwargs_str = ", ".join(f"{k}={v}" for k, v in kwargs.items()) or ""
169
  if args_str:
170
+ args_str += ", "
171
 
172
+ log.warning(
173
+ f"This logic is deprecated now, and have no effect; It will be removed with v0.3. Use `{self.__class__.__name__}.configure({args_str}{kwargs_str})` instead before fetching"
174
+ )
175
  pass
176
 
177
  @classmethod
 
199
  setattr(cls, key, value)
200
  else:
201
  # Yup, no fun allowed LOL
202
+ raise AttributeError(
203
+ f'Unknown parser argument: "{key}"; maybe you meant {cls.parser_keywords}?'
204
+ )
205
  else:
206
+ raise ValueError(
207
+ f'Unknown parser argument: "{key}"; maybe you meant {cls.parser_keywords}?'
208
+ )
209
 
210
  if not kwargs:
211
+ raise AttributeError(
212
+ f"You must pass a keyword to configure, current keywords: {cls.parser_keywords}?"
213
+ )
214
 
215
  @classmethod
216
  def _generate_parser_arguments(cls) -> Dict:
 
222
  keep_cdata=cls.keep_cdata,
223
  auto_match=cls.auto_match,
224
  storage=cls.storage,
225
+ storage_args=cls.storage_args,
226
  )
227
  if cls.automatch_domain:
228
  if type(cls.automatch_domain) is not str:
229
+ log.warning(
230
+ '[Ignored] The argument "automatch_domain" must be of string type'
231
+ )
232
  else:
233
+ parser_arguments.update({"automatch_domain": cls.automatch_domain})
234
 
235
  return parser_arguments
236
 
 
238
  class StatusText:
239
  """A class that gets the status text of response status code.
240
 
241
+ Reference: https://developer.mozilla.org/en-US/docs/Web/HTTP/Status
242
  """
243
+
244
+ _phrases = MappingProxyType(
245
+ {
246
+ 100: "Continue",
247
+ 101: "Switching Protocols",
248
+ 102: "Processing",
249
+ 103: "Early Hints",
250
+ 200: "OK",
251
+ 201: "Created",
252
+ 202: "Accepted",
253
+ 203: "Non-Authoritative Information",
254
+ 204: "No Content",
255
+ 205: "Reset Content",
256
+ 206: "Partial Content",
257
+ 207: "Multi-Status",
258
+ 208: "Already Reported",
259
+ 226: "IM Used",
260
+ 300: "Multiple Choices",
261
+ 301: "Moved Permanently",
262
+ 302: "Found",
263
+ 303: "See Other",
264
+ 304: "Not Modified",
265
+ 305: "Use Proxy",
266
+ 307: "Temporary Redirect",
267
+ 308: "Permanent Redirect",
268
+ 400: "Bad Request",
269
+ 401: "Unauthorized",
270
+ 402: "Payment Required",
271
+ 403: "Forbidden",
272
+ 404: "Not Found",
273
+ 405: "Method Not Allowed",
274
+ 406: "Not Acceptable",
275
+ 407: "Proxy Authentication Required",
276
+ 408: "Request Timeout",
277
+ 409: "Conflict",
278
+ 410: "Gone",
279
+ 411: "Length Required",
280
+ 412: "Precondition Failed",
281
+ 413: "Payload Too Large",
282
+ 414: "URI Too Long",
283
+ 415: "Unsupported Media Type",
284
+ 416: "Range Not Satisfiable",
285
+ 417: "Expectation Failed",
286
+ 418: "I'm a teapot",
287
+ 421: "Misdirected Request",
288
+ 422: "Unprocessable Entity",
289
+ 423: "Locked",
290
+ 424: "Failed Dependency",
291
+ 425: "Too Early",
292
+ 426: "Upgrade Required",
293
+ 428: "Precondition Required",
294
+ 429: "Too Many Requests",
295
+ 431: "Request Header Fields Too Large",
296
+ 451: "Unavailable For Legal Reasons",
297
+ 500: "Internal Server Error",
298
+ 501: "Not Implemented",
299
+ 502: "Bad Gateway",
300
+ 503: "Service Unavailable",
301
+ 504: "Gateway Timeout",
302
+ 505: "HTTP Version Not Supported",
303
+ 506: "Variant Also Negotiates",
304
+ 507: "Insufficient Storage",
305
+ 508: "Loop Detected",
306
+ 510: "Not Extended",
307
+ 511: "Network Authentication Required",
308
+ }
309
+ )
310
 
311
  @classmethod
312
  @lru_cache(maxsize=128)
 
325
  # if isinstance(engine, type):
326
  # raise TypeError("Expected an engine instance, not a class definition of the engine")
327
 
328
+ if hasattr(engine, "fetch"):
329
  fetch_function = getattr(engine, "fetch")
330
  if callable(fetch_function):
331
  if len(inspect.signature(fetch_function).parameters) > 0:
332
  return engine
333
  else:
334
  # raise TypeError("Engine class instance must have a callable method 'fetch' with the first argument used for the url.")
335
+ raise TypeError(
336
+ "Engine class must have a callable method 'fetch' with the first argument used for the url."
337
+ )
338
  else:
339
  # raise TypeError("Invalid engine instance! Engine class must have a callable method 'fetch'")
340
+ raise TypeError(
341
+ "Invalid engine class! Engine class must have a callable method 'fetch'"
342
+ )
343
  else:
344
  # raise TypeError("Invalid engine instance! Engine class must have the method 'fetch'")
345
+ raise TypeError(
346
+ "Invalid engine class! Engine class must have the method 'fetch'"
347
+ )
348
 
349
 
350
  def get_variable_name(var: Any) -> Optional[str]:
 
359
  return None
360
 
361
 
362
+ def check_type_validity(
363
+ variable: Any,
364
+ valid_types: Union[List[Type], None],
365
+ default_value: Any = None,
366
+ critical: bool = False,
367
+ param_name: Optional[str] = None,
368
+ ) -> Any:
369
  """Check if a variable matches the specified type constraints.
370
  :param variable: The variable to check
371
  :param valid_types: List of valid types for the variable
 
388
  error_msg = f'Argument "{var_name}" cannot be None'
389
  if critical:
390
  raise TypeError(error_msg)
391
+ log.error(f"[Ignored] {error_msg}")
392
  return default_value
393
 
394
  # If no valid_types specified and variable has a value, return it
 
401
  error_msg = f'Argument "{var_name}" must be of type {" or ".join(type_names)}'
402
  if critical:
403
  raise TypeError(error_msg)
404
+ log.error(f"[Ignored] {error_msg}")
405
  return default_value
406
 
407
  return variable
scrapling/engines/toolbelt/fingerprints.py CHANGED
@@ -23,7 +23,7 @@ def generate_convincing_referer(url: str) -> str:
23
  :return: Google's search URL of the domain name
24
  """
25
  website_name = extract(url).domain
26
- return f'https://www.google.com/search?q={website_name}'
27
 
28
 
29
  @lru_cache(1, typed=True)
@@ -35,11 +35,11 @@ def get_os_name() -> Union[str, None]:
35
  #
36
  os_name = platform.system()
37
  return {
38
- 'Linux': 'linux',
39
- 'Darwin': 'macos',
40
- 'Windows': 'windows',
41
  # For the future? because why not
42
- 'iOS': 'ios',
43
  }.get(os_name)
44
 
45
 
@@ -50,9 +50,9 @@ def generate_suitable_fingerprint() -> Fingerprint:
50
  :return: `Fingerprint` object
51
  """
52
  return FingerprintGenerator(
53
- browser=[Browser(name='chrome', min_version=128)],
54
  os=get_os_name(), # None is ignored
55
- device='desktop'
56
  ).generate()
57
 
58
 
@@ -67,15 +67,15 @@ def generate_headers(browser_mode: bool = False) -> Dict:
67
  # So we don't raise any inconsistency red flags while websites fingerprinting us
68
  os_name = get_os_name()
69
  return HeaderGenerator(
70
- browser=[Browser(name='chrome', min_version=130)],
71
  os=os_name, # None is ignored
72
- device='desktop'
73
  ).generate()
74
  else:
75
  # Here it's used for normal requests that aren't done through browsers so we can take it lightly
76
  browsers = [
77
- Browser(name='chrome', min_version=120),
78
- Browser(name='firefox', min_version=120),
79
- Browser(name='edge', min_version=120),
80
  ]
81
- return HeaderGenerator(browser=browsers, device='desktop').generate()
 
23
  :return: Google's search URL of the domain name
24
  """
25
  website_name = extract(url).domain
26
+ return f"https://www.google.com/search?q={website_name}"
27
 
28
 
29
  @lru_cache(1, typed=True)
 
35
  #
36
  os_name = platform.system()
37
  return {
38
+ "Linux": "linux",
39
+ "Darwin": "macos",
40
+ "Windows": "windows",
41
  # For the future? because why not
42
+ "iOS": "ios",
43
  }.get(os_name)
44
 
45
 
 
50
  :return: `Fingerprint` object
51
  """
52
  return FingerprintGenerator(
53
+ browser=[Browser(name="chrome", min_version=128)],
54
  os=get_os_name(), # None is ignored
55
+ device="desktop",
56
  ).generate()
57
 
58
 
 
67
  # So we don't raise any inconsistency red flags while websites fingerprinting us
68
  os_name = get_os_name()
69
  return HeaderGenerator(
70
+ browser=[Browser(name="chrome", min_version=130)],
71
  os=os_name, # None is ignored
72
+ device="desktop",
73
  ).generate()
74
  else:
75
  # Here it's used for normal requests that aren't done through browsers so we can take it lightly
76
  browsers = [
77
+ Browser(name="chrome", min_version=120),
78
+ Browser(name="firefox", min_version=120),
79
+ Browser(name="edge", min_version=120),
80
  ]
81
+ return HeaderGenerator(browser=browsers, device="desktop").generate()
scrapling/engines/toolbelt/navigation.py CHANGED
@@ -1,6 +1,7 @@
1
  """
2
  Functions related to files and URLs
3
  """
 
4
  import os
5
  from urllib.parse import urlencode, urlparse
6
 
@@ -19,7 +20,9 @@ def intercept_route(route: Route):
19
  :return: PlayWright `Route` object
20
  """
21
  if route.request.resource_type in DEFAULT_DISABLED_RESOURCES:
22
- log.debug(f'Blocking background resource "{route.request.url}" of type "{route.request.resource_type}"')
 
 
23
  route.abort()
24
  else:
25
  route.continue_()
@@ -32,7 +35,9 @@ async def async_intercept_route(route: async_Route):
32
  :return: PlayWright `Route` object
33
  """
34
  if route.request.resource_type in DEFAULT_DISABLED_RESOURCES:
35
- log.debug(f'Blocking background resource "{route.request.url}" of type "{route.request.resource_type}"')
 
 
36
  await route.abort()
37
  else:
38
  await route.continue_()
@@ -50,23 +55,33 @@ def construct_proxy_dict(proxy_string: Union[str, Dict[str, str]]) -> Union[Dict
50
  proxy = urlparse(proxy_string)
51
  try:
52
  return {
53
- 'server': f'{proxy.scheme}://{proxy.hostname}:{proxy.port}',
54
- 'username': proxy.username or '',
55
- 'password': proxy.password or '',
56
  }
57
  except ValueError:
58
  # Urllib will say that one of the parameters above can't be casted to the correct type like `int` for port etc...
59
- raise TypeError('The proxy argument\'s string is in invalid format!')
60
 
61
  elif isinstance(proxy_string, dict):
62
- valid_keys = ('server', 'username', 'password', )
63
- if all(key in valid_keys for key in proxy_string.keys()) and not any(key not in valid_keys for key in proxy_string.keys()):
 
 
 
 
 
 
64
  return proxy_string
65
  else:
66
- raise TypeError(f'A proxy dictionary must have only these keys: {valid_keys}')
 
 
67
 
68
  else:
69
- raise TypeError(f'Invalid type of proxy ({type(proxy_string)}), the proxy argument must be a string or a dictionary!')
 
 
70
 
71
  # The default value for proxy in Playwright's source is `None`
72
  return None
@@ -84,7 +99,7 @@ def construct_cdp_url(cdp_url: str, query_params: Optional[Dict] = None) -> str:
84
  parsed = urlparse(cdp_url)
85
 
86
  # Check scheme
87
- if parsed.scheme not in ('ws', 'wss'):
88
  raise ValueError("CDP URL must use 'ws://' or 'wss://' scheme")
89
 
90
  # Validate hostname and port
@@ -93,8 +108,8 @@ def construct_cdp_url(cdp_url: str, query_params: Optional[Dict] = None) -> str:
93
 
94
  # Ensure path starts with /
95
  path = parsed.path
96
- if not path.startswith('/'):
97
- path = '/' + path
98
 
99
  # Reconstruct the base URL with validated parts
100
  validated_base = f"{parsed.scheme}://{parsed.netloc}{path}"
@@ -118,4 +133,4 @@ def js_bypass_path(filename: str) -> str:
118
  :return: The full path of the JS file.
119
  """
120
  current_directory = os.path.dirname(__file__)
121
- return os.path.join(current_directory, 'bypasses', filename)
 
1
  """
2
  Functions related to files and URLs
3
  """
4
+
5
  import os
6
  from urllib.parse import urlencode, urlparse
7
 
 
20
  :return: PlayWright `Route` object
21
  """
22
  if route.request.resource_type in DEFAULT_DISABLED_RESOURCES:
23
+ log.debug(
24
+ f'Blocking background resource "{route.request.url}" of type "{route.request.resource_type}"'
25
+ )
26
  route.abort()
27
  else:
28
  route.continue_()
 
35
  :return: PlayWright `Route` object
36
  """
37
  if route.request.resource_type in DEFAULT_DISABLED_RESOURCES:
38
+ log.debug(
39
+ f'Blocking background resource "{route.request.url}" of type "{route.request.resource_type}"'
40
+ )
41
  await route.abort()
42
  else:
43
  await route.continue_()
 
55
  proxy = urlparse(proxy_string)
56
  try:
57
  return {
58
+ "server": f"{proxy.scheme}://{proxy.hostname}:{proxy.port}",
59
+ "username": proxy.username or "",
60
+ "password": proxy.password or "",
61
  }
62
  except ValueError:
63
  # Urllib will say that one of the parameters above can't be casted to the correct type like `int` for port etc...
64
+ raise TypeError("The proxy argument's string is in invalid format!")
65
 
66
  elif isinstance(proxy_string, dict):
67
+ valid_keys = (
68
+ "server",
69
+ "username",
70
+ "password",
71
+ )
72
+ if all(key in valid_keys for key in proxy_string.keys()) and not any(
73
+ key not in valid_keys for key in proxy_string.keys()
74
+ ):
75
  return proxy_string
76
  else:
77
+ raise TypeError(
78
+ f"A proxy dictionary must have only these keys: {valid_keys}"
79
+ )
80
 
81
  else:
82
+ raise TypeError(
83
+ f"Invalid type of proxy ({type(proxy_string)}), the proxy argument must be a string or a dictionary!"
84
+ )
85
 
86
  # The default value for proxy in Playwright's source is `None`
87
  return None
 
99
  parsed = urlparse(cdp_url)
100
 
101
  # Check scheme
102
+ if parsed.scheme not in ("ws", "wss"):
103
  raise ValueError("CDP URL must use 'ws://' or 'wss://' scheme")
104
 
105
  # Validate hostname and port
 
108
 
109
  # Ensure path starts with /
110
  path = parsed.path
111
+ if not path.startswith("/"):
112
+ path = "/" + path
113
 
114
  # Reconstruct the base URL with validated parts
115
  validated_base = f"{parsed.scheme}://{parsed.netloc}{path}"
 
133
  :return: The full path of the JS file.
134
  """
135
  current_directory = os.path.dirname(__file__)
136
+ return os.path.join(current_directory, "bypasses", filename)
scrapling/fetchers.py CHANGED
@@ -1,7 +1,18 @@
1
- from scrapling.core._types import (Callable, Dict, List, Literal, Optional,
2
- SelectorWaitStates, Union)
3
- from scrapling.engines import (CamoufoxEngine, PlaywrightEngine, StaticEngine,
4
- check_if_engine_usable)
 
 
 
 
 
 
 
 
 
 
 
5
  from scrapling.engines.toolbelt import BaseFetcher, Response
6
 
7
 
@@ -10,10 +21,19 @@ class Fetcher(BaseFetcher):
10
 
11
  Any additional keyword arguments passed to the methods below are passed to the respective httpx's method directly.
12
  """
 
13
  @classmethod
14
  def get(
15
- cls, url: str, follow_redirects: bool = True, timeout: Optional[Union[int, float]] = 10, stealthy_headers: bool = True,
16
- proxy: Optional[str] = None, retries: Optional[int] = 3, custom_config: Dict = None, **kwargs: Dict) -> Response:
 
 
 
 
 
 
 
 
17
  """Make basic HTTP GET request for you but with some added flavors.
18
 
19
  :param url: Target url.
@@ -30,16 +50,36 @@ class Fetcher(BaseFetcher):
30
  if not custom_config:
31
  custom_config = {}
32
  elif not isinstance(custom_config, dict):
33
- ValueError(f"The custom parser config must be of type dictionary, got {cls.__class__}")
 
 
34
 
35
- adaptor_arguments = tuple({**cls._generate_parser_arguments(), **custom_config}.items())
36
- response_object = StaticEngine(url, proxy, stealthy_headers, follow_redirects, timeout, retries, adaptor_arguments=adaptor_arguments).get(**kwargs)
 
 
 
 
 
 
 
 
 
 
37
  return response_object
38
 
39
  @classmethod
40
  def post(
41
- cls, url: str, follow_redirects: bool = True, timeout: Optional[Union[int, float]] = 10, stealthy_headers: bool = True,
42
- proxy: Optional[str] = None, retries: Optional[int] = 3, custom_config: Dict = None, **kwargs: Dict) -> Response:
 
 
 
 
 
 
 
 
43
  """Make basic HTTP POST request for you but with some added flavors.
44
 
45
  :param url: Target url.
@@ -56,16 +96,36 @@ class Fetcher(BaseFetcher):
56
  if not custom_config:
57
  custom_config = {}
58
  elif not isinstance(custom_config, dict):
59
- ValueError(f"The custom parser config must be of type dictionary, got {cls.__class__}")
 
 
60
 
61
- adaptor_arguments = tuple({**cls._generate_parser_arguments(), **custom_config}.items())
62
- response_object = StaticEngine(url, proxy, stealthy_headers, follow_redirects, timeout, retries, adaptor_arguments=adaptor_arguments).post(**kwargs)
 
 
 
 
 
 
 
 
 
 
63
  return response_object
64
 
65
  @classmethod
66
  def put(
67
- cls, url: str, follow_redirects: bool = True, timeout: Optional[Union[int, float]] = 10, stealthy_headers: bool = True,
68
- proxy: Optional[str] = None, retries: Optional[int] = 3, custom_config: Dict = None, **kwargs: Dict) -> Response:
 
 
 
 
 
 
 
 
69
  """Make basic HTTP PUT request for you but with some added flavors.
70
 
71
  :param url: Target url
@@ -83,16 +143,36 @@ class Fetcher(BaseFetcher):
83
  if not custom_config:
84
  custom_config = {}
85
  elif not isinstance(custom_config, dict):
86
- ValueError(f"The custom parser config must be of type dictionary, got {cls.__class__}")
 
 
87
 
88
- adaptor_arguments = tuple({**cls._generate_parser_arguments(), **custom_config}.items())
89
- response_object = StaticEngine(url, proxy, stealthy_headers, follow_redirects, timeout, retries, adaptor_arguments=adaptor_arguments).put(**kwargs)
 
 
 
 
 
 
 
 
 
 
90
  return response_object
91
 
92
  @classmethod
93
  def delete(
94
- cls, url: str, follow_redirects: bool = True, timeout: Optional[Union[int, float]] = 10, stealthy_headers: bool = True,
95
- proxy: Optional[str] = None, retries: Optional[int] = 3, custom_config: Dict = None, **kwargs: Dict) -> Response:
 
 
 
 
 
 
 
 
96
  """Make basic HTTP DELETE request for you but with some added flavors.
97
 
98
  :param url: Target url
@@ -109,18 +189,38 @@ class Fetcher(BaseFetcher):
109
  if not custom_config:
110
  custom_config = {}
111
  elif not isinstance(custom_config, dict):
112
- ValueError(f"The custom parser config must be of type dictionary, got {cls.__class__}")
 
 
113
 
114
- adaptor_arguments = tuple({**cls._generate_parser_arguments(), **custom_config}.items())
115
- response_object = StaticEngine(url, proxy, stealthy_headers, follow_redirects, timeout, retries, adaptor_arguments=adaptor_arguments).delete(**kwargs)
 
 
 
 
 
 
 
 
 
 
116
  return response_object
117
 
118
 
119
  class AsyncFetcher(Fetcher):
120
  @classmethod
121
  async def get(
122
- cls, url: str, follow_redirects: bool = True, timeout: Optional[Union[int, float]] = 10, stealthy_headers: bool = True,
123
- proxy: Optional[str] = None, retries: Optional[int] = 3, custom_config: Dict = None, **kwargs: Dict) -> Response:
 
 
 
 
 
 
 
 
124
  """Make basic HTTP GET request for you but with some added flavors.
125
 
126
  :param url: Target url.
@@ -137,16 +237,36 @@ class AsyncFetcher(Fetcher):
137
  if not custom_config:
138
  custom_config = {}
139
  elif not isinstance(custom_config, dict):
140
- ValueError(f"The custom parser config must be of type dictionary, got {cls.__class__}")
 
 
141
 
142
- adaptor_arguments = tuple({**cls._generate_parser_arguments(), **custom_config}.items())
143
- response_object = await StaticEngine(url, proxy, stealthy_headers, follow_redirects, timeout, retries=retries, adaptor_arguments=adaptor_arguments).async_get(**kwargs)
 
 
 
 
 
 
 
 
 
 
144
  return response_object
145
 
146
  @classmethod
147
  async def post(
148
- cls, url: str, follow_redirects: bool = True, timeout: Optional[Union[int, float]] = 10, stealthy_headers: bool = True,
149
- proxy: Optional[str] = None, retries: Optional[int] = 3, custom_config: Dict = None, **kwargs: Dict) -> Response:
 
 
 
 
 
 
 
 
150
  """Make basic HTTP POST request for you but with some added flavors.
151
 
152
  :param url: Target url.
@@ -163,16 +283,36 @@ class AsyncFetcher(Fetcher):
163
  if not custom_config:
164
  custom_config = {}
165
  elif not isinstance(custom_config, dict):
166
- ValueError(f"The custom parser config must be of type dictionary, got {cls.__class__}")
 
 
167
 
168
- adaptor_arguments = tuple({**cls._generate_parser_arguments(), **custom_config}.items())
169
- response_object = await StaticEngine(url, proxy, stealthy_headers, follow_redirects, timeout, retries=retries, adaptor_arguments=adaptor_arguments).async_post(**kwargs)
 
 
 
 
 
 
 
 
 
 
170
  return response_object
171
 
172
  @classmethod
173
  async def put(
174
- cls, url: str, follow_redirects: bool = True, timeout: Optional[Union[int, float]] = 10, stealthy_headers: bool = True,
175
- proxy: Optional[str] = None, retries: Optional[int] = 3, custom_config: Dict = None, **kwargs: Dict) -> Response:
 
 
 
 
 
 
 
 
176
  """Make basic HTTP PUT request for you but with some added flavors.
177
 
178
  :param url: Target url
@@ -189,16 +329,36 @@ class AsyncFetcher(Fetcher):
189
  if not custom_config:
190
  custom_config = {}
191
  elif not isinstance(custom_config, dict):
192
- ValueError(f"The custom parser config must be of type dictionary, got {cls.__class__}")
 
 
193
 
194
- adaptor_arguments = tuple({**cls._generate_parser_arguments(), **custom_config}.items())
195
- response_object = await StaticEngine(url, proxy, stealthy_headers, follow_redirects, timeout, retries=retries, adaptor_arguments=adaptor_arguments).async_put(**kwargs)
 
 
 
 
 
 
 
 
 
 
196
  return response_object
197
 
198
  @classmethod
199
  async def delete(
200
- cls, url: str, follow_redirects: bool = True, timeout: Optional[Union[int, float]] = 10, stealthy_headers: bool = True,
201
- proxy: Optional[str] = None, retries: Optional[int] = 3, custom_config: Dict = None, **kwargs: Dict) -> Response:
 
 
 
 
 
 
 
 
202
  """Make basic HTTP DELETE request for you but with some added flavors.
203
 
204
  :param url: Target url
@@ -215,27 +375,57 @@ class AsyncFetcher(Fetcher):
215
  if not custom_config:
216
  custom_config = {}
217
  elif not isinstance(custom_config, dict):
218
- ValueError(f"The custom parser config must be of type dictionary, got {cls.__class__}")
 
 
219
 
220
- adaptor_arguments = tuple({**cls._generate_parser_arguments(), **custom_config}.items())
221
- response_object = await StaticEngine(url, proxy, stealthy_headers, follow_redirects, timeout, retries=retries, adaptor_arguments=adaptor_arguments).async_delete(**kwargs)
 
 
 
 
 
 
 
 
 
 
222
  return response_object
223
 
224
 
225
  class StealthyFetcher(BaseFetcher):
226
  """A `Fetcher` class type that is completely stealthy fetcher that uses a modified version of Firefox.
227
 
228
- It works as real browsers passing almost all online tests/protections based on Camoufox.
229
- Other added flavors include setting the faked OS fingerprints to match the user's OS and the referer of every request is set as if this request came from Google's search of this URL's domain.
230
  """
 
231
  @classmethod
232
  def fetch(
233
- cls, url: str, headless: Union[bool, Literal['virtual']] = True, block_images: bool = False, disable_resources: bool = False,
234
- block_webrtc: bool = False, allow_webgl: bool = True, network_idle: bool = False, addons: Optional[List[str]] = None, wait: Optional[int] = 0,
235
- timeout: Optional[float] = 30000, page_action: Callable = None, wait_selector: Optional[str] = None, humanize: Optional[Union[bool, float]] = True,
236
- wait_selector_state: SelectorWaitStates = 'attached', google_search: bool = True, extra_headers: Optional[Dict[str, str]] = None,
237
- proxy: Optional[Union[str, Dict[str, str]]] = None, os_randomize: bool = False, disable_ads: bool = False, geoip: bool = False,
238
- custom_config: Dict = None, additional_arguments: Dict = None
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
239
  ) -> Response:
240
  """
241
  Opens up a browser and do your request based on your chosen options below.
@@ -271,7 +461,9 @@ class StealthyFetcher(BaseFetcher):
271
  if not custom_config:
272
  custom_config = {}
273
  elif not isinstance(custom_config, dict):
274
- ValueError(f"The custom parser config must be of type dictionary, got {cls.__class__}")
 
 
275
 
276
  engine = CamoufoxEngine(
277
  wait=wait,
@@ -294,18 +486,35 @@ class StealthyFetcher(BaseFetcher):
294
  disable_resources=disable_resources,
295
  wait_selector_state=wait_selector_state,
296
  adaptor_arguments={**cls._generate_parser_arguments(), **custom_config},
297
- additional_arguments=additional_arguments or {}
298
  )
299
  return engine.fetch(url)
300
 
301
  @classmethod
302
  async def async_fetch(
303
- cls, url: str, headless: Union[bool, Literal['virtual']] = True, block_images: bool = False, disable_resources: bool = False,
304
- block_webrtc: bool = False, allow_webgl: bool = True, network_idle: bool = False, addons: Optional[List[str]] = None, wait: Optional[int] = 0,
305
- timeout: Optional[float] = 30000, page_action: Callable = None, wait_selector: Optional[str] = None, humanize: Optional[Union[bool, float]] = True,
306
- wait_selector_state: SelectorWaitStates = 'attached', google_search: bool = True, extra_headers: Optional[Dict[str, str]] = None,
307
- proxy: Optional[Union[str, Dict[str, str]]] = None, os_randomize: bool = False, disable_ads: bool = False, geoip: bool = False,
308
- custom_config: Dict = None, additional_arguments: Dict = None
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
309
  ) -> Response:
310
  """
311
  Opens up a browser and do your request based on your chosen options below.
@@ -341,7 +550,9 @@ class StealthyFetcher(BaseFetcher):
341
  if not custom_config:
342
  custom_config = {}
343
  elif not isinstance(custom_config, dict):
344
- ValueError(f"The custom parser config must be of type dictionary, got {cls.__class__}")
 
 
345
 
346
  engine = CamoufoxEngine(
347
  wait=wait,
@@ -364,7 +575,7 @@ class StealthyFetcher(BaseFetcher):
364
  disable_resources=disable_resources,
365
  wait_selector_state=wait_selector_state,
366
  adaptor_arguments={**cls._generate_parser_arguments(), **custom_config},
367
- additional_arguments=additional_arguments or {}
368
  )
369
  return await engine.async_fetch(url)
370
 
@@ -385,17 +596,32 @@ class PlayWrightFetcher(BaseFetcher):
385
 
386
  > Note that these are the main options with PlayWright but it can be mixed together.
387
  """
 
388
  @classmethod
389
  def fetch(
390
- cls, url: str, headless: Union[bool, str] = True, disable_resources: bool = None,
391
- useragent: Optional[str] = None, network_idle: bool = False, timeout: Optional[float] = 30000, wait: Optional[int] = 0,
392
- page_action: Optional[Callable] = None, wait_selector: Optional[str] = None, wait_selector_state: SelectorWaitStates = 'attached',
393
- hide_canvas: bool = False, disable_webgl: bool = False, extra_headers: Optional[Dict[str, str]] = None, google_search: bool = True,
394
- proxy: Optional[Union[str, Dict[str, str]]] = None, locale: Optional[str] = 'en-US',
395
- stealth: bool = False, real_chrome: bool = False,
396
- cdp_url: Optional[str] = None,
397
- nstbrowser_mode: bool = False, nstbrowser_config: Optional[Dict] = None,
398
- custom_config: Dict = None
 
 
 
 
 
 
 
 
 
 
 
 
 
 
399
  ) -> Response:
400
  """Opens up a browser and do your request based on your chosen options below.
401
 
@@ -428,7 +654,9 @@ class PlayWrightFetcher(BaseFetcher):
428
  if not custom_config:
429
  custom_config = {}
430
  elif not isinstance(custom_config, dict):
431
- ValueError(f"The custom parser config must be of type dictionary, got {cls.__class__}")
 
 
432
 
433
  engine = PlaywrightEngine(
434
  wait=wait,
@@ -457,15 +685,29 @@ class PlayWrightFetcher(BaseFetcher):
457
 
458
  @classmethod
459
  async def async_fetch(
460
- cls, url: str, headless: Union[bool, str] = True, disable_resources: bool = None,
461
- useragent: Optional[str] = None, network_idle: bool = False, timeout: Optional[float] = 30000, wait: Optional[int] = 0,
462
- page_action: Optional[Callable] = None, wait_selector: Optional[str] = None, wait_selector_state: SelectorWaitStates = 'attached',
463
- hide_canvas: bool = False, disable_webgl: bool = False, extra_headers: Optional[Dict[str, str]] = None, google_search: bool = True,
464
- proxy: Optional[Union[str, Dict[str, str]]] = None, locale: Optional[str] = 'en-US',
465
- stealth: bool = False, real_chrome: bool = False,
466
- cdp_url: Optional[str] = None,
467
- nstbrowser_mode: bool = False, nstbrowser_config: Optional[Dict] = None,
468
- custom_config: Dict = None
 
 
 
 
 
 
 
 
 
 
 
 
 
 
469
  ) -> Response:
470
  """Opens up a browser and do your request based on your chosen options below.
471
 
@@ -498,7 +740,9 @@ class PlayWrightFetcher(BaseFetcher):
498
  if not custom_config:
499
  custom_config = {}
500
  elif not isinstance(custom_config, dict):
501
- ValueError(f"The custom parser config must be of type dictionary, got {cls.__class__}")
 
 
502
 
503
  engine = PlaywrightEngine(
504
  wait=wait,
@@ -529,5 +773,7 @@ class PlayWrightFetcher(BaseFetcher):
529
  class CustomFetcher(BaseFetcher):
530
  @classmethod
531
  def fetch(cls, url: str, browser_engine, **kwargs) -> Response:
532
- engine = check_if_engine_usable(browser_engine)(adaptor_arguments=cls._generate_parser_arguments(), **kwargs)
 
 
533
  return engine.fetch(url)
 
1
+ from scrapling.core._types import (
2
+ Callable,
3
+ Dict,
4
+ List,
5
+ Literal,
6
+ Optional,
7
+ SelectorWaitStates,
8
+ Union,
9
+ )
10
+ from scrapling.engines import (
11
+ CamoufoxEngine,
12
+ PlaywrightEngine,
13
+ StaticEngine,
14
+ check_if_engine_usable,
15
+ )
16
  from scrapling.engines.toolbelt import BaseFetcher, Response
17
 
18
 
 
21
 
22
  Any additional keyword arguments passed to the methods below are passed to the respective httpx's method directly.
23
  """
24
+
25
  @classmethod
26
  def get(
27
+ cls,
28
+ url: str,
29
+ follow_redirects: bool = True,
30
+ timeout: Optional[Union[int, float]] = 10,
31
+ stealthy_headers: bool = True,
32
+ proxy: Optional[str] = None,
33
+ retries: Optional[int] = 3,
34
+ custom_config: Dict = None,
35
+ **kwargs: Dict,
36
+ ) -> Response:
37
  """Make basic HTTP GET request for you but with some added flavors.
38
 
39
  :param url: Target url.
 
50
  if not custom_config:
51
  custom_config = {}
52
  elif not isinstance(custom_config, dict):
53
+ ValueError(
54
+ f"The custom parser config must be of type dictionary, got {cls.__class__}"
55
+ )
56
 
57
+ adaptor_arguments = tuple(
58
+ {**cls._generate_parser_arguments(), **custom_config}.items()
59
+ )
60
+ response_object = StaticEngine(
61
+ url,
62
+ proxy,
63
+ stealthy_headers,
64
+ follow_redirects,
65
+ timeout,
66
+ retries,
67
+ adaptor_arguments=adaptor_arguments,
68
+ ).get(**kwargs)
69
  return response_object
70
 
71
  @classmethod
72
  def post(
73
+ cls,
74
+ url: str,
75
+ follow_redirects: bool = True,
76
+ timeout: Optional[Union[int, float]] = 10,
77
+ stealthy_headers: bool = True,
78
+ proxy: Optional[str] = None,
79
+ retries: Optional[int] = 3,
80
+ custom_config: Dict = None,
81
+ **kwargs: Dict,
82
+ ) -> Response:
83
  """Make basic HTTP POST request for you but with some added flavors.
84
 
85
  :param url: Target url.
 
96
  if not custom_config:
97
  custom_config = {}
98
  elif not isinstance(custom_config, dict):
99
+ ValueError(
100
+ f"The custom parser config must be of type dictionary, got {cls.__class__}"
101
+ )
102
 
103
+ adaptor_arguments = tuple(
104
+ {**cls._generate_parser_arguments(), **custom_config}.items()
105
+ )
106
+ response_object = StaticEngine(
107
+ url,
108
+ proxy,
109
+ stealthy_headers,
110
+ follow_redirects,
111
+ timeout,
112
+ retries,
113
+ adaptor_arguments=adaptor_arguments,
114
+ ).post(**kwargs)
115
  return response_object
116
 
117
  @classmethod
118
  def put(
119
+ cls,
120
+ url: str,
121
+ follow_redirects: bool = True,
122
+ timeout: Optional[Union[int, float]] = 10,
123
+ stealthy_headers: bool = True,
124
+ proxy: Optional[str] = None,
125
+ retries: Optional[int] = 3,
126
+ custom_config: Dict = None,
127
+ **kwargs: Dict,
128
+ ) -> Response:
129
  """Make basic HTTP PUT request for you but with some added flavors.
130
 
131
  :param url: Target url
 
143
  if not custom_config:
144
  custom_config = {}
145
  elif not isinstance(custom_config, dict):
146
+ ValueError(
147
+ f"The custom parser config must be of type dictionary, got {cls.__class__}"
148
+ )
149
 
150
+ adaptor_arguments = tuple(
151
+ {**cls._generate_parser_arguments(), **custom_config}.items()
152
+ )
153
+ response_object = StaticEngine(
154
+ url,
155
+ proxy,
156
+ stealthy_headers,
157
+ follow_redirects,
158
+ timeout,
159
+ retries,
160
+ adaptor_arguments=adaptor_arguments,
161
+ ).put(**kwargs)
162
  return response_object
163
 
164
  @classmethod
165
  def delete(
166
+ cls,
167
+ url: str,
168
+ follow_redirects: bool = True,
169
+ timeout: Optional[Union[int, float]] = 10,
170
+ stealthy_headers: bool = True,
171
+ proxy: Optional[str] = None,
172
+ retries: Optional[int] = 3,
173
+ custom_config: Dict = None,
174
+ **kwargs: Dict,
175
+ ) -> Response:
176
  """Make basic HTTP DELETE request for you but with some added flavors.
177
 
178
  :param url: Target url
 
189
  if not custom_config:
190
  custom_config = {}
191
  elif not isinstance(custom_config, dict):
192
+ ValueError(
193
+ f"The custom parser config must be of type dictionary, got {cls.__class__}"
194
+ )
195
 
196
+ adaptor_arguments = tuple(
197
+ {**cls._generate_parser_arguments(), **custom_config}.items()
198
+ )
199
+ response_object = StaticEngine(
200
+ url,
201
+ proxy,
202
+ stealthy_headers,
203
+ follow_redirects,
204
+ timeout,
205
+ retries,
206
+ adaptor_arguments=adaptor_arguments,
207
+ ).delete(**kwargs)
208
  return response_object
209
 
210
 
211
  class AsyncFetcher(Fetcher):
212
  @classmethod
213
  async def get(
214
+ cls,
215
+ url: str,
216
+ follow_redirects: bool = True,
217
+ timeout: Optional[Union[int, float]] = 10,
218
+ stealthy_headers: bool = True,
219
+ proxy: Optional[str] = None,
220
+ retries: Optional[int] = 3,
221
+ custom_config: Dict = None,
222
+ **kwargs: Dict,
223
+ ) -> Response:
224
  """Make basic HTTP GET request for you but with some added flavors.
225
 
226
  :param url: Target url.
 
237
  if not custom_config:
238
  custom_config = {}
239
  elif not isinstance(custom_config, dict):
240
+ ValueError(
241
+ f"The custom parser config must be of type dictionary, got {cls.__class__}"
242
+ )
243
 
244
+ adaptor_arguments = tuple(
245
+ {**cls._generate_parser_arguments(), **custom_config}.items()
246
+ )
247
+ response_object = await StaticEngine(
248
+ url,
249
+ proxy,
250
+ stealthy_headers,
251
+ follow_redirects,
252
+ timeout,
253
+ retries=retries,
254
+ adaptor_arguments=adaptor_arguments,
255
+ ).async_get(**kwargs)
256
  return response_object
257
 
258
  @classmethod
259
  async def post(
260
+ cls,
261
+ url: str,
262
+ follow_redirects: bool = True,
263
+ timeout: Optional[Union[int, float]] = 10,
264
+ stealthy_headers: bool = True,
265
+ proxy: Optional[str] = None,
266
+ retries: Optional[int] = 3,
267
+ custom_config: Dict = None,
268
+ **kwargs: Dict,
269
+ ) -> Response:
270
  """Make basic HTTP POST request for you but with some added flavors.
271
 
272
  :param url: Target url.
 
283
  if not custom_config:
284
  custom_config = {}
285
  elif not isinstance(custom_config, dict):
286
+ ValueError(
287
+ f"The custom parser config must be of type dictionary, got {cls.__class__}"
288
+ )
289
 
290
+ adaptor_arguments = tuple(
291
+ {**cls._generate_parser_arguments(), **custom_config}.items()
292
+ )
293
+ response_object = await StaticEngine(
294
+ url,
295
+ proxy,
296
+ stealthy_headers,
297
+ follow_redirects,
298
+ timeout,
299
+ retries=retries,
300
+ adaptor_arguments=adaptor_arguments,
301
+ ).async_post(**kwargs)
302
  return response_object
303
 
304
  @classmethod
305
  async def put(
306
+ cls,
307
+ url: str,
308
+ follow_redirects: bool = True,
309
+ timeout: Optional[Union[int, float]] = 10,
310
+ stealthy_headers: bool = True,
311
+ proxy: Optional[str] = None,
312
+ retries: Optional[int] = 3,
313
+ custom_config: Dict = None,
314
+ **kwargs: Dict,
315
+ ) -> Response:
316
  """Make basic HTTP PUT request for you but with some added flavors.
317
 
318
  :param url: Target url
 
329
  if not custom_config:
330
  custom_config = {}
331
  elif not isinstance(custom_config, dict):
332
+ ValueError(
333
+ f"The custom parser config must be of type dictionary, got {cls.__class__}"
334
+ )
335
 
336
+ adaptor_arguments = tuple(
337
+ {**cls._generate_parser_arguments(), **custom_config}.items()
338
+ )
339
+ response_object = await StaticEngine(
340
+ url,
341
+ proxy,
342
+ stealthy_headers,
343
+ follow_redirects,
344
+ timeout,
345
+ retries=retries,
346
+ adaptor_arguments=adaptor_arguments,
347
+ ).async_put(**kwargs)
348
  return response_object
349
 
350
  @classmethod
351
  async def delete(
352
+ cls,
353
+ url: str,
354
+ follow_redirects: bool = True,
355
+ timeout: Optional[Union[int, float]] = 10,
356
+ stealthy_headers: bool = True,
357
+ proxy: Optional[str] = None,
358
+ retries: Optional[int] = 3,
359
+ custom_config: Dict = None,
360
+ **kwargs: Dict,
361
+ ) -> Response:
362
  """Make basic HTTP DELETE request for you but with some added flavors.
363
 
364
  :param url: Target url
 
375
  if not custom_config:
376
  custom_config = {}
377
  elif not isinstance(custom_config, dict):
378
+ ValueError(
379
+ f"The custom parser config must be of type dictionary, got {cls.__class__}"
380
+ )
381
 
382
+ adaptor_arguments = tuple(
383
+ {**cls._generate_parser_arguments(), **custom_config}.items()
384
+ )
385
+ response_object = await StaticEngine(
386
+ url,
387
+ proxy,
388
+ stealthy_headers,
389
+ follow_redirects,
390
+ timeout,
391
+ retries=retries,
392
+ adaptor_arguments=adaptor_arguments,
393
+ ).async_delete(**kwargs)
394
  return response_object
395
 
396
 
397
  class StealthyFetcher(BaseFetcher):
398
  """A `Fetcher` class type that is completely stealthy fetcher that uses a modified version of Firefox.
399
 
400
+ It works as real browsers passing almost all online tests/protections based on Camoufox.
401
+ Other added flavors include setting the faked OS fingerprints to match the user's OS and the referer of every request is set as if this request came from Google's search of this URL's domain.
402
  """
403
+
404
  @classmethod
405
  def fetch(
406
+ cls,
407
+ url: str,
408
+ headless: Union[bool, Literal["virtual"]] = True, # noqa: F821
409
+ block_images: bool = False,
410
+ disable_resources: bool = False,
411
+ block_webrtc: bool = False,
412
+ allow_webgl: bool = True,
413
+ network_idle: bool = False,
414
+ addons: Optional[List[str]] = None,
415
+ wait: Optional[int] = 0,
416
+ timeout: Optional[float] = 30000,
417
+ page_action: Callable = None,
418
+ wait_selector: Optional[str] = None,
419
+ humanize: Optional[Union[bool, float]] = True,
420
+ wait_selector_state: SelectorWaitStates = "attached",
421
+ google_search: bool = True,
422
+ extra_headers: Optional[Dict[str, str]] = None,
423
+ proxy: Optional[Union[str, Dict[str, str]]] = None,
424
+ os_randomize: bool = False,
425
+ disable_ads: bool = False,
426
+ geoip: bool = False,
427
+ custom_config: Dict = None,
428
+ additional_arguments: Dict = None,
429
  ) -> Response:
430
  """
431
  Opens up a browser and do your request based on your chosen options below.
 
461
  if not custom_config:
462
  custom_config = {}
463
  elif not isinstance(custom_config, dict):
464
+ ValueError(
465
+ f"The custom parser config must be of type dictionary, got {cls.__class__}"
466
+ )
467
 
468
  engine = CamoufoxEngine(
469
  wait=wait,
 
486
  disable_resources=disable_resources,
487
  wait_selector_state=wait_selector_state,
488
  adaptor_arguments={**cls._generate_parser_arguments(), **custom_config},
489
+ additional_arguments=additional_arguments or {},
490
  )
491
  return engine.fetch(url)
492
 
493
  @classmethod
494
  async def async_fetch(
495
+ cls,
496
+ url: str,
497
+ headless: Union[bool, Literal["virtual"]] = True, # noqa: F821
498
+ block_images: bool = False,
499
+ disable_resources: bool = False,
500
+ block_webrtc: bool = False,
501
+ allow_webgl: bool = True,
502
+ network_idle: bool = False,
503
+ addons: Optional[List[str]] = None,
504
+ wait: Optional[int] = 0,
505
+ timeout: Optional[float] = 30000,
506
+ page_action: Callable = None,
507
+ wait_selector: Optional[str] = None,
508
+ humanize: Optional[Union[bool, float]] = True,
509
+ wait_selector_state: SelectorWaitStates = "attached",
510
+ google_search: bool = True,
511
+ extra_headers: Optional[Dict[str, str]] = None,
512
+ proxy: Optional[Union[str, Dict[str, str]]] = None,
513
+ os_randomize: bool = False,
514
+ disable_ads: bool = False,
515
+ geoip: bool = False,
516
+ custom_config: Dict = None,
517
+ additional_arguments: Dict = None,
518
  ) -> Response:
519
  """
520
  Opens up a browser and do your request based on your chosen options below.
 
550
  if not custom_config:
551
  custom_config = {}
552
  elif not isinstance(custom_config, dict):
553
+ ValueError(
554
+ f"The custom parser config must be of type dictionary, got {cls.__class__}"
555
+ )
556
 
557
  engine = CamoufoxEngine(
558
  wait=wait,
 
575
  disable_resources=disable_resources,
576
  wait_selector_state=wait_selector_state,
577
  adaptor_arguments={**cls._generate_parser_arguments(), **custom_config},
578
+ additional_arguments=additional_arguments or {},
579
  )
580
  return await engine.async_fetch(url)
581
 
 
596
 
597
  > Note that these are the main options with PlayWright but it can be mixed together.
598
  """
599
+
600
  @classmethod
601
  def fetch(
602
+ cls,
603
+ url: str,
604
+ headless: Union[bool, str] = True,
605
+ disable_resources: bool = None,
606
+ useragent: Optional[str] = None,
607
+ network_idle: bool = False,
608
+ timeout: Optional[float] = 30000,
609
+ wait: Optional[int] = 0,
610
+ page_action: Optional[Callable] = None,
611
+ wait_selector: Optional[str] = None,
612
+ wait_selector_state: SelectorWaitStates = "attached",
613
+ hide_canvas: bool = False,
614
+ disable_webgl: bool = False,
615
+ extra_headers: Optional[Dict[str, str]] = None,
616
+ google_search: bool = True,
617
+ proxy: Optional[Union[str, Dict[str, str]]] = None,
618
+ locale: Optional[str] = "en-US",
619
+ stealth: bool = False,
620
+ real_chrome: bool = False,
621
+ cdp_url: Optional[str] = None,
622
+ nstbrowser_mode: bool = False,
623
+ nstbrowser_config: Optional[Dict] = None,
624
+ custom_config: Dict = None,
625
  ) -> Response:
626
  """Opens up a browser and do your request based on your chosen options below.
627
 
 
654
  if not custom_config:
655
  custom_config = {}
656
  elif not isinstance(custom_config, dict):
657
+ ValueError(
658
+ f"The custom parser config must be of type dictionary, got {cls.__class__}"
659
+ )
660
 
661
  engine = PlaywrightEngine(
662
  wait=wait,
 
685
 
686
  @classmethod
687
  async def async_fetch(
688
+ cls,
689
+ url: str,
690
+ headless: Union[bool, str] = True,
691
+ disable_resources: bool = None,
692
+ useragent: Optional[str] = None,
693
+ network_idle: bool = False,
694
+ timeout: Optional[float] = 30000,
695
+ wait: Optional[int] = 0,
696
+ page_action: Optional[Callable] = None,
697
+ wait_selector: Optional[str] = None,
698
+ wait_selector_state: SelectorWaitStates = "attached",
699
+ hide_canvas: bool = False,
700
+ disable_webgl: bool = False,
701
+ extra_headers: Optional[Dict[str, str]] = None,
702
+ google_search: bool = True,
703
+ proxy: Optional[Union[str, Dict[str, str]]] = None,
704
+ locale: Optional[str] = "en-US",
705
+ stealth: bool = False,
706
+ real_chrome: bool = False,
707
+ cdp_url: Optional[str] = None,
708
+ nstbrowser_mode: bool = False,
709
+ nstbrowser_config: Optional[Dict] = None,
710
+ custom_config: Dict = None,
711
  ) -> Response:
712
  """Opens up a browser and do your request based on your chosen options below.
713
 
 
740
  if not custom_config:
741
  custom_config = {}
742
  elif not isinstance(custom_config, dict):
743
+ ValueError(
744
+ f"The custom parser config must be of type dictionary, got {cls.__class__}"
745
+ )
746
 
747
  engine = PlaywrightEngine(
748
  wait=wait,
 
773
  class CustomFetcher(BaseFetcher):
774
  @classmethod
775
  def fetch(cls, url: str, browser_engine, **kwargs) -> Response:
776
+ engine = check_if_engine_usable(browser_engine)(
777
+ adaptor_arguments=cls._generate_parser_arguments(), **kwargs
778
+ )
779
  return engine.fetch(url)
scrapling/parser.py CHANGED
@@ -9,40 +9,59 @@ from cssselect import SelectorError, SelectorSyntaxError
9
  from cssselect import parse as split_selectors
10
  from lxml import etree, html
11
 
12
- from scrapling.core._types import (Any, Callable, Dict, Generator, Iterable,
13
- List, Optional, Pattern, SupportsIndex,
14
- Tuple, Union)
15
- from scrapling.core.custom_types import (AttributesHandler, TextHandler,
16
- TextHandlers)
 
 
 
 
 
 
 
 
 
17
  from scrapling.core.mixins import SelectorsGeneration
18
- from scrapling.core.storage_adaptors import (SQLiteStorageSystem,
19
- StorageSystemMixin, _StorageTools)
 
 
 
20
  from scrapling.core.translator import translator_instance
21
- from scrapling.core.utils import (clean_spaces, flatten, html_forbidden,
22
- is_jsonable, log)
23
 
24
 
25
  class Adaptor(SelectorsGeneration):
26
  __slots__ = (
27
- 'url', 'encoding', '__auto_match_enabled', '_root', '_storage',
28
- '__keep_comments', '__huge_tree_enabled', '__attributes', '__text', '__tag',
29
- '__keep_cdata'
 
 
 
 
 
 
 
 
30
  )
31
 
32
  def __init__(
33
- self,
34
- text: Optional[str] = None,
35
- url: Optional[str] = None,
36
- body: bytes = b"",
37
- encoding: str = "utf8",
38
- huge_tree: bool = True,
39
- root: Optional[html.HtmlElement] = None,
40
- keep_comments: Optional[bool] = False,
41
- keep_cdata: Optional[bool] = False,
42
- auto_match: Optional[bool] = False,
43
- storage: Any = SQLiteStorageSystem,
44
- storage_args: Optional[Dict] = None,
45
- **kwargs
46
  ):
47
  """The main class that works as a wrapper for the HTML input data. Using this class, you can search for elements
48
  with expressions in CSS, XPath, or with simply text. Check the docs for more info.
@@ -69,25 +88,37 @@ class Adaptor(SelectorsGeneration):
69
  If empty, default values will be used.
70
  """
71
  if root is None and not body and text is None:
72
- raise ValueError("Adaptor class needs text, body, or root arguments to work")
 
 
73
 
74
- self.__text = ''
75
  if root is None:
76
  if text is None:
77
  if not body or not isinstance(body, bytes):
78
- raise TypeError(f"body argument must be valid and of type bytes, got {body.__class__}")
 
 
79
 
80
  body = body.replace(b"\x00", b"").strip()
81
  else:
82
  if not isinstance(text, str):
83
- raise TypeError(f"text argument must be of type str, got {text.__class__}")
 
 
84
 
85
  body = text.strip().replace("\x00", "").encode(encoding) or b"<html/>"
86
 
87
  # https://lxml.de/api/lxml.etree.HTMLParser-class.html
88
  parser = html.HTMLParser(
89
- recover=True, remove_blank_text=True, remove_comments=(not keep_comments), encoding=encoding,
90
- compact=True, huge_tree=huge_tree, default_doctype=True, strip_cdata=(not keep_cdata),
 
 
 
 
 
 
91
  )
92
  self._root = etree.fromstring(body, parser=parser, base_url=url)
93
  if is_jsonable(text or body.decode()):
@@ -107,15 +138,21 @@ class Adaptor(SelectorsGeneration):
107
  if self.__auto_match_enabled:
108
  if not storage_args:
109
  storage_args = {
110
- 'storage_file': os.path.join(os.path.dirname(__file__), 'elements_storage.db'),
111
- 'url': url
 
 
112
  }
113
 
114
- if not hasattr(storage, '__wrapped__'):
115
- raise ValueError("Storage class must be wrapped with lru_cache decorator, see docs for info")
 
 
116
 
117
  if not issubclass(storage.__wrapped__, StorageSystemMixin):
118
- raise ValueError("Storage system must be inherited from class `StorageSystemMixin`")
 
 
119
 
120
  self._storage = storage(**storage_args)
121
 
@@ -128,13 +165,27 @@ class Adaptor(SelectorsGeneration):
128
  self.__attributes = None
129
  self.__tag = None
130
  # No need to check if all response attributes exist or not because if `status` exist, then the rest exist (Save some CPU cycles for speed)
131
- self.__response_data = {
132
- key: getattr(self, key) for key in ('status', 'reason', 'cookies', 'history', 'headers', 'request_headers',)
133
- } if hasattr(self, 'status') else {}
 
 
 
 
 
 
 
 
 
 
 
 
134
 
135
  # Node functionalities, I wanted to move to separate Mixin class but it had slight impact on performance
136
  @staticmethod
137
- def _is_text_node(element: Union[html.HtmlElement, etree._ElementUnicodeResult]) -> bool:
 
 
138
  """Return True if given element is a result of a string expression
139
  Examples:
140
  XPath -> '/text()', '/@attribute' etc...
@@ -144,25 +195,33 @@ class Adaptor(SelectorsGeneration):
144
  return issubclass(type(element), etree._ElementUnicodeResult)
145
 
146
  @staticmethod
147
- def __content_convertor(element: Union[html.HtmlElement, etree._ElementUnicodeResult]) -> TextHandler:
 
 
148
  """Used internally to convert a single element's text content to TextHandler directly without checks
149
 
150
  This single line has been isolated like this so when it's used with map we get that slight performance boost vs list comprehension
151
  """
152
  return TextHandler(str(element))
153
 
154
- def __element_convertor(self, element: html.HtmlElement) -> 'Adaptor':
155
  """Used internally to convert a single HtmlElement to Adaptor directly without checks"""
156
  return Adaptor(
157
  root=element,
158
- text='', body=b'', # Since root argument is provided, both `text` and `body` will be ignored so this is just a filler
159
- url=self.url, encoding=self.encoding, auto_match=self.__auto_match_enabled,
160
- keep_comments=self.__keep_comments, keep_cdata=self.__keep_cdata,
 
 
 
 
161
  huge_tree=self.__huge_tree_enabled,
162
- **self.__response_data
163
  )
164
 
165
- def __handle_element(self, element: Union[html.HtmlElement, etree._ElementUnicodeResult]) -> Union[TextHandler, 'Adaptor', None]:
 
 
166
  """Used internally in all functions to convert a single element to type (Adaptor|TextHandler) when possible"""
167
  if element is None:
168
  return None
@@ -172,9 +231,13 @@ class Adaptor(SelectorsGeneration):
172
  else:
173
  return self.__element_convertor(element)
174
 
175
- def __handle_elements(self, result: List[Union[html.HtmlElement, etree._ElementUnicodeResult]]) -> Union['Adaptors', 'TextHandlers', List]:
 
 
176
  """Used internally in all functions to convert results to type (Adaptors|TextHandlers) in bulk when possible"""
177
- if not len(result): # Lxml will give a warning if I used something like `not result`
 
 
178
  return Adaptors([])
179
 
180
  # From within the code, this method will always get a list of the same type
@@ -209,7 +272,16 @@ class Adaptor(SelectorsGeneration):
209
  self.__text = TextHandler(self._root.text)
210
  return self.__text
211
 
212
- def get_all_text(self, separator: str = "\n", strip: bool = False, ignore_tags: Tuple = ('script', 'style',), valid_values: bool = True) -> TextHandler:
 
 
 
 
 
 
 
 
 
213
  """Get all child strings of this element, concatenated using the given separator.
214
 
215
  :param separator: Strings will be concatenated using this separator.
@@ -220,7 +292,7 @@ class Adaptor(SelectorsGeneration):
220
  :return: A TextHandler
221
  """
222
  _all_strings = []
223
- for node in self._root.xpath('.//*'):
224
  if node.tag not in ignore_tags:
225
  text = node.text
226
  if text and type(text) is str:
@@ -245,13 +317,25 @@ class Adaptor(SelectorsGeneration):
245
  @property
246
  def html_content(self) -> TextHandler:
247
  """Return the inner html code of the element"""
248
- return TextHandler(etree.tostring(self._root, encoding='unicode', method='html', with_tail=False))
 
 
 
 
249
 
250
  body = html_content
251
 
252
  def prettify(self) -> TextHandler:
253
  """Return a prettified version of the element's inner html-code"""
254
- return TextHandler(etree.tostring(self._root, encoding='unicode', pretty_print=True, method='html', with_tail=False))
 
 
 
 
 
 
 
 
255
 
256
  def has_class(self, class_name: str) -> bool:
257
  """Check if element has a specific class
@@ -261,36 +345,44 @@ class Adaptor(SelectorsGeneration):
261
  return class_name in self._root.classes
262
 
263
  @property
264
- def parent(self) -> Union['Adaptor', None]:
265
  """Return the direct parent of the element or ``None`` otherwise"""
266
  return self.__handle_element(self._root.getparent())
267
 
268
  @property
269
- def below_elements(self) -> 'Adaptors[Adaptor]':
270
  """Return all elements under the current element in the DOM tree"""
271
- below = self._root.xpath('.//*')
272
  return self.__handle_elements(below)
273
 
274
  @property
275
- def children(self) -> 'Adaptors[Adaptor]':
276
  """Return the children elements of the current element or empty list otherwise"""
277
- return Adaptors([
278
- self.__element_convertor(child) for child in self._root.iterchildren() if type(child) not in html_forbidden
279
- ])
 
 
 
 
280
 
281
  @property
282
- def siblings(self) -> 'Adaptors[Adaptor]':
283
  """Return other children of the current element's parent or empty list otherwise"""
284
  if self.parent:
285
- return Adaptors([child for child in self.parent.children if child._root != self._root])
 
 
286
  return Adaptors([])
287
 
288
- def iterancestors(self) -> Generator['Adaptor', None, None]:
289
  """Return a generator that loops over all ancestors of the element, starting with element's parent."""
290
  for ancestor in self._root.iterancestors():
291
  yield self.__element_convertor(ancestor)
292
 
293
- def find_ancestor(self, func: Callable[['Adaptor'], bool]) -> Union['Adaptor', None]:
 
 
294
  """Loop over all ancestors of the element till one match the passed function
295
  :param func: A function that takes each ancestor as an argument and returns True/False
296
  :return: The first ancestor that match the function or ``None`` otherwise.
@@ -301,13 +393,13 @@ class Adaptor(SelectorsGeneration):
301
  return None
302
 
303
  @property
304
- def path(self) -> 'Adaptors[Adaptor]':
305
  """Returns list of type :class:`Adaptors` that contains the path leading to the current element from the root."""
306
  lst = list(self.iterancestors())
307
  return Adaptors(lst)
308
 
309
  @property
310
- def next(self) -> Union['Adaptor', None]:
311
  """Returns the next element of the current element in the children of the parent or ``None`` otherwise."""
312
  next_element = self._root.getnext()
313
  if next_element is not None:
@@ -318,7 +410,7 @@ class Adaptor(SelectorsGeneration):
318
  return self.__handle_element(next_element)
319
 
320
  @property
321
- def previous(self) -> Union['Adaptor', None]:
322
  """Returns the previous element of the current element in the children of the parent or ``None`` otherwise."""
323
  prev_element = self._root.getprevious()
324
  if prev_element is not None:
@@ -346,13 +438,13 @@ class Adaptor(SelectorsGeneration):
346
  data = "<"
347
  content = clean_spaces(self.html_content)
348
  if len(content) > length_limit:
349
- content = content[:length_limit].strip() + '...'
350
  data += f"data='{content}'"
351
 
352
  if self.parent:
353
  parent_content = clean_spaces(self.parent.html_content)
354
  if len(parent_content) > length_limit:
355
- parent_content = parent_content[:length_limit].strip() + '...'
356
 
357
  data += f" parent='{parent_content}'"
358
 
@@ -360,8 +452,11 @@ class Adaptor(SelectorsGeneration):
360
 
361
  # From here we start the selecting functions
362
  def relocate(
363
- self, element: Union[Dict, html.HtmlElement, 'Adaptor'], percentage: int = 0, adaptor_type: bool = False
364
- ) -> Union[List[Union[html.HtmlElement, None]], 'Adaptors']:
 
 
 
365
  """This function will search again for the element in the page tree, used automatically on page structure change
366
 
367
  :param element: The element we want to relocate in the tree
@@ -379,7 +474,7 @@ class Adaptor(SelectorsGeneration):
379
  if issubclass(type(element), html.HtmlElement):
380
  element = _StorageTools.element_to_dict(element)
381
 
382
- for node in self._root.xpath('.//*'):
383
  # Collect all elements in the page then for each element get the matching score of it against the node.
384
  # Hence: the code doesn't stop even if the score was 100%
385
  # because there might be another element(s) left in page with the same score
@@ -391,19 +486,26 @@ class Adaptor(SelectorsGeneration):
391
  if score_table[highest_probability] and highest_probability >= percentage:
392
  if log.getEffectiveLevel() < 20:
393
  # No need to execute this part if logging level is not debugging
394
- log.debug(f'Highest probability was {highest_probability}%')
395
- log.debug('Top 5 best matching elements are: ')
396
  for percent in tuple(sorted(score_table.keys(), reverse=True))[:5]:
397
- log.debug(f'{percent} -> {self.__handle_elements(score_table[percent])}')
 
 
398
 
399
  if not adaptor_type:
400
  return score_table[highest_probability]
401
  return self.__handle_elements(score_table[highest_probability])
402
  return []
403
 
404
- def css_first(self, selector: str, identifier: str = '',
405
- auto_match: bool = False, auto_save: bool = False, percentage: int = 0
406
- ) -> Union['Adaptor', 'TextHandler', None]:
 
 
 
 
 
407
  """Search current tree with CSS3 selectors and return the first result if possible, otherwise return `None`
408
 
409
  **Important:
@@ -419,13 +521,21 @@ class Adaptor(SelectorsGeneration):
419
  Be aware that the percentage calculation depends solely on the page structure so don't play with this
420
  number unless you must know what you are doing!
421
  """
422
- for element in self.css(selector, identifier, auto_match, auto_save, percentage):
 
 
423
  return element
424
  return None
425
 
426
- def xpath_first(self, selector: str, identifier: str = '',
427
- auto_match: bool = False, auto_save: bool = False, percentage: int = 0, **kwargs: Any
428
- ) -> Union['Adaptor', 'TextHandler', None]:
 
 
 
 
 
 
429
  """Search current tree with XPath selectors and return the first result if possible, otherwise return `None`
430
 
431
  **Important:
@@ -443,13 +553,20 @@ class Adaptor(SelectorsGeneration):
443
  Be aware that the percentage calculation depends solely on the page structure so don't play with this
444
  number unless you must know what you are doing!
445
  """
446
- for element in self.xpath(selector, identifier, auto_match, auto_save, percentage, **kwargs):
 
 
447
  return element
448
  return None
449
 
450
- def css(self, selector: str, identifier: str = '',
451
- auto_match: bool = False, auto_save: bool = False, percentage: int = 0
452
- ) -> Union['Adaptors[Adaptor]', List, 'TextHandlers[TextHandler]']:
 
 
 
 
 
453
  """Search current tree with CSS3 selectors
454
 
455
  **Important:
@@ -468,28 +585,49 @@ class Adaptor(SelectorsGeneration):
468
  :return: List as :class:`Adaptors`
469
  """
470
  try:
471
- if not self.__auto_match_enabled or ',' not in selector:
472
  # No need to split selectors in this case, let's save some CPU cycles :)
473
  xpath_selector = translator_instance.css_to_xpath(selector)
474
- return self.xpath(xpath_selector, identifier or selector, auto_match, auto_save, percentage)
 
 
 
 
 
 
475
 
476
  results = []
477
- if ',' in selector:
478
  for single_selector in split_selectors(selector):
479
  # I'm doing this only so the `save` function save data correctly for combined selectors
480
  # Like using the ',' to combine two different selectors that point to different elements.
481
- xpath_selector = translator_instance.css_to_xpath(single_selector.canonical())
 
 
482
  results += self.xpath(
483
- xpath_selector, identifier or single_selector.canonical(), auto_match, auto_save, percentage
 
 
 
 
484
  )
485
 
486
  return results
487
- except (SelectorError, SelectorSyntaxError,):
 
 
 
488
  raise SelectorSyntaxError(f"Invalid CSS selector: {selector}")
489
 
490
- def xpath(self, selector: str, identifier: str = '',
491
- auto_match: bool = False, auto_save: bool = False, percentage: int = 0, **kwargs: Any
492
- ) -> Union['Adaptors[Adaptor]', List, 'TextHandlers[TextHandler]']:
 
 
 
 
 
 
493
  """Search current tree with XPath selectors
494
 
495
  **Important:
@@ -515,7 +653,9 @@ class Adaptor(SelectorsGeneration):
515
  if elements:
516
  if auto_save:
517
  if not self.__auto_match_enabled:
518
- log.warning("Argument `auto_save` will be ignored because `auto_match` wasn't enabled on initialization. Check docs for more info.")
 
 
519
  else:
520
  self.save(elements[0], identifier or selector)
521
 
@@ -531,16 +671,29 @@ class Adaptor(SelectorsGeneration):
531
  return self.__handle_elements(elements)
532
  else:
533
  if auto_match:
534
- log.warning("Argument `auto_match` will be ignored because `auto_match` wasn't enabled on initialization. Check docs for more info.")
 
 
535
  elif auto_save:
536
- log.warning("Argument `auto_save` will be ignored because `auto_match` wasn't enabled on initialization. Check docs for more info.")
 
 
537
 
538
  return self.__handle_elements(elements)
539
 
540
- except (SelectorError, SelectorSyntaxError, etree.XPathError, etree.XPathEvalError):
 
 
 
 
 
541
  raise SelectorSyntaxError(f"Invalid XPath selector: {selector}")
542
 
543
- def find_all(self, *args: Union[str, Iterable[str], Pattern, Callable, Dict[str, str]], **kwargs: str) -> 'Adaptors':
 
 
 
 
544
  """Find elements by filters of your creations for ease..
545
 
546
  :param args: Tag name(s), an iterable of tag names, regex patterns, function, or a dictionary of elements' attributes. Leave empty for selecting all.
@@ -551,12 +704,14 @@ class Adaptor(SelectorsGeneration):
551
  # Ex: find_all('a', class="blah") -> find_all('a', class_="blah")
552
  # https://www.w3schools.com/python/python_ref_keywords.asp
553
  whitelisted = {
554
- 'class_': 'class',
555
- 'for_': 'for',
556
  }
557
 
558
  if not args and not kwargs:
559
- raise TypeError('You have to pass something to search with, like tag name(s), tag attributes, or both.')
 
 
560
 
561
  attributes = dict()
562
  tags, patterns = set(), set()
@@ -569,12 +724,18 @@ class Adaptor(SelectorsGeneration):
569
 
570
  elif type(arg) in [list, tuple, set]:
571
  if not all(map(lambda x: type(x) is str, arg)):
572
- raise TypeError('Nested Iterables are not accepted, only iterables of tag names are accepted')
 
 
573
  tags.update(set(arg))
574
 
575
  elif isinstance(arg, dict):
576
- if not all([(type(k) is str and type(v) is str) for k, v in arg.items()]):
577
- raise TypeError('Nested dictionaries are not accepted, only string keys and string values are accepted')
 
 
 
 
578
  attributes.update(arg)
579
 
580
  elif isinstance(arg, re.Pattern):
@@ -584,13 +745,17 @@ class Adaptor(SelectorsGeneration):
584
  if len(inspect.signature(arg).parameters) > 0:
585
  functions.append(arg)
586
  else:
587
- raise TypeError("Callable filter function must have at least one argument to take `Adaptor` objects.")
 
 
588
 
589
  else:
590
- raise TypeError(f'Argument with type "{type(arg)}" is not accepted, please read the docs.')
 
 
591
 
592
  if not all([(type(k) is str and type(v) is str) for k, v in kwargs.items()]):
593
- raise TypeError('Only string values are accepted for arguments')
594
 
595
  for attribute_name, value in kwargs.items():
596
  # Only replace names for kwargs, replacing them in dictionaries doesn't make sense
@@ -598,22 +763,24 @@ class Adaptor(SelectorsGeneration):
598
  attributes[attribute_name] = value
599
 
600
  # It's easier and faster to build a selector than traversing the tree
601
- tags = tags or ['*']
602
  for tag in tags:
603
  selector = tag
604
  for key, value in attributes.items():
605
- value = value.replace('"', r'\"') # Escape double quotes in user input
606
  # Not escaping anything with the key so the user can pass patterns like {'href*': '/p/'} or get errors :)
607
  selector += '[{}="{}"]'.format(key, value)
608
- if selector != '*':
609
  selectors.append(selector)
610
 
611
  if selectors:
612
- results = self.css(', '.join(selectors))
613
  if results:
614
  # From the results, get the ones that fulfill passed regex patterns
615
  for pattern in patterns:
616
- results = results.filter(lambda e: e.text.re(pattern, check_match=True))
 
 
617
 
618
  # From the results, get the ones that fulfill passed functions
619
  for function in functions:
@@ -629,7 +796,11 @@ class Adaptor(SelectorsGeneration):
629
 
630
  return results
631
 
632
- def find(self, *args: Union[str, Iterable[str], Pattern, Callable, Dict[str, str]], **kwargs: str) -> Union['Adaptor', None]:
 
 
 
 
633
  """Find elements by filters of your creations for ease then return the first result. Otherwise return `None`.
634
 
635
  :param args: Tag name(s), an iterable of tag names, regex patterns, function, or a dictionary of elements' attributes. Leave empty for selecting all.
@@ -640,7 +811,9 @@ class Adaptor(SelectorsGeneration):
640
  return element
641
  return None
642
 
643
- def __calculate_similarity_score(self, original: Dict, candidate: html.HtmlElement) -> float:
 
 
644
  """Used internally to calculate a score that shows how candidate element similar to the original one
645
 
646
  :param original: The original element in the form of the dictionary generated from `element_to_dict` function
@@ -653,53 +826,68 @@ class Adaptor(SelectorsGeneration):
653
  # Possible TODO:
654
  # Study the idea of giving weight to each test below so some are more important than others
655
  # Current results: With weights some websites had better score while it was worse for others
656
- score += 1 if original['tag'] == candidate['tag'] else 0 # * 0.3 # 30%
657
  checks += 1
658
 
659
- if original['text']:
660
- score += SequenceMatcher(None, original['text'], candidate.get('text') or '').ratio() # * 0.3 # 30%
 
 
661
  checks += 1
662
 
663
  # if both doesn't have attributes, it still count for something!
664
- score += self.__calculate_dict_diff(original['attributes'], candidate['attributes']) # * 0.3 # 30%
 
 
665
  checks += 1
666
 
667
  # Separate similarity test for class, id, href,... this will help in full structural changes
668
- for attrib in ('class', 'id', 'href', 'src',):
669
- if original['attributes'].get(attrib):
 
 
 
 
 
670
  score += SequenceMatcher(
671
- None, original['attributes'][attrib], candidate['attributes'].get(attrib) or ''
 
 
672
  ).ratio() # * 0.3 # 30%
673
  checks += 1
674
 
675
- score += SequenceMatcher(None, original['path'], candidate['path']).ratio() # * 0.1 # 10%
 
 
676
  checks += 1
677
 
678
- if original.get('parent_name'):
679
  # Then we start comparing parents' data
680
- if candidate.get('parent_name'):
681
  score += SequenceMatcher(
682
- None, original['parent_name'], candidate.get('parent_name') or ''
683
  ).ratio() # * 0.2 # 20%
684
  checks += 1
685
 
686
  score += self.__calculate_dict_diff(
687
- original['parent_attribs'], candidate.get('parent_attribs') or {}
688
  ) # * 0.2 # 20%
689
  checks += 1
690
 
691
- if original['parent_text']:
692
  score += SequenceMatcher(
693
- None, original['parent_text'], candidate.get('parent_text') or ''
 
 
694
  ).ratio() # * 0.1 # 10%
695
  checks += 1
696
  # else:
697
  # # The original element have a parent and this one not, this is not a good sign
698
  # score -= 0.1
699
 
700
- if original.get('siblings'):
701
  score += SequenceMatcher(
702
- None, original['siblings'], candidate.get('siblings') or []
703
  ).ratio() # * 0.1 # 10%
704
  checks += 1
705
 
@@ -708,13 +896,20 @@ class Adaptor(SelectorsGeneration):
708
 
709
  @staticmethod
710
  def __calculate_dict_diff(dict1: dict, dict2: dict) -> float:
711
- """Used internally calculate similarity between two dictionaries as SequenceMatcher doesn't accept dictionaries
712
- """
713
- score = SequenceMatcher(None, tuple(dict1.keys()), tuple(dict2.keys())).ratio() * 0.5
714
- score += SequenceMatcher(None, tuple(dict1.values()), tuple(dict2.values())).ratio() * 0.5
 
 
 
 
 
715
  return score
716
 
717
- def save(self, element: Union['Adaptor', html.HtmlElement], identifier: str) -> None:
 
 
718
  """Saves the element's unique properties to the storage for retrieval and relocation later
719
 
720
  :param element: The element itself that we want to save to storage, it can be a `Adaptor` or pure `HtmlElement`
@@ -756,8 +951,13 @@ class Adaptor(SelectorsGeneration):
756
  else:
757
  return self.get_all_text(strip=True).json()
758
 
759
- def re(self, regex: Union[str, Pattern[str]], replace_entities: bool = True,
760
- clean_match: bool = False, case_sensitive: bool = True) -> TextHandlers:
 
 
 
 
 
761
  """Apply the given regex to the current text and return a list of strings with the matches.
762
 
763
  :param regex: Can be either a compiled regular expression or a string.
@@ -767,8 +967,14 @@ class Adaptor(SelectorsGeneration):
767
  """
768
  return self.text.re(regex, replace_entities, clean_match, case_sensitive)
769
 
770
- def re_first(self, regex: Union[str, Pattern[str]], default=None, replace_entities: bool = True,
771
- clean_match: bool = False, case_sensitive: bool = True) -> TextHandler:
 
 
 
 
 
 
772
  """Apply the given regex to text and return the first match if found, otherwise return the default value.
773
 
774
  :param regex: Can be either a compiled regular expression or a string.
@@ -777,14 +983,19 @@ class Adaptor(SelectorsGeneration):
777
  :param clean_match: if enabled, this will ignore all whitespaces and consecutive spaces while matching
778
  :param case_sensitive: if disabled, function will set the regex to ignore letters case while compiling it
779
  """
780
- return self.text.re_first(regex, default, replace_entities, clean_match, case_sensitive)
 
 
781
 
782
  def find_similar(
783
- self,
784
- similarity_threshold: float = 0.2,
785
- ignore_attributes: Union[List, Tuple] = ('href', 'src',),
786
- match_text: bool = False
787
- ) -> Union['Adaptors[Adaptor]', List]:
 
 
 
788
  """Find elements that are in the same tree depth in the page with the same tag name and same parent tag etc...
789
  then return the ones that match the current element attributes with percentage higher than the input threshold.
790
 
@@ -805,19 +1016,28 @@ class Adaptor(SelectorsGeneration):
805
 
806
  :return: A ``Adaptors`` container of ``Adaptor`` objects or empty list
807
  """
 
808
  def get_attributes(element: html.HtmlElement) -> Dict:
809
  """Return attributes dictionary without the ignored list"""
810
- return {k: v for k, v in element.attrib.items() if k not in ignore_attributes}
811
-
812
- def are_alike(original: html.HtmlElement, original_attributes: Dict, candidate: html.HtmlElement) -> bool:
 
 
 
 
 
 
813
  """Calculate a score of how much these elements are alike and return True
814
- if score is higher or equal the threshold"""
815
- candidate_attributes = get_attributes(candidate) if ignore_attributes else candidate.attrib
 
 
816
  score, checks = 0, 0
817
 
818
  if original_attributes:
819
  score += sum(
820
- SequenceMatcher(None, v, candidate_attributes.get(k, '')).ratio()
821
  for k, v in original_attributes.items()
822
  )
823
  checks += len(candidate_attributes)
@@ -829,7 +1049,9 @@ class Adaptor(SelectorsGeneration):
829
 
830
  if match_text:
831
  score += SequenceMatcher(
832
- None, clean_spaces(original.text or ''), clean_spaces(candidate.text or '')
 
 
833
  ).ratio()
834
  checks += 1
835
 
@@ -851,20 +1073,30 @@ class Adaptor(SelectorsGeneration):
851
  f"//{grandparent.tag}/{parent.tag}/{self.tag}[count(ancestor::*) = {current_depth}]"
852
  )
853
  else:
854
- potential_matches = root.xpath(f"//{parent.tag}/{self.tag}[count(ancestor::*) = {current_depth}]")
 
 
855
  else:
856
- potential_matches = root.xpath(f"//{self.tag}[count(ancestor::*) = {current_depth}]")
 
 
857
 
858
  for potential_match in potential_matches:
859
- if potential_match != root and are_alike(root, target_attrs, potential_match):
 
 
860
  similar_elements.append(potential_match)
861
 
862
  return self.__handle_elements(similar_elements)
863
 
864
  def find_by_text(
865
- self, text: str, first_match: bool = True, partial: bool = False,
866
- case_sensitive: bool = False, clean_match: bool = True
867
- ) -> Union['Adaptors[Adaptor]', 'Adaptor']:
 
 
 
 
868
  """Find elements that its text content fully/partially matches input.
869
  :param text: Text query to match
870
  :param first_match: Return first element that matches conditions, enabled by default
@@ -878,7 +1110,9 @@ class Adaptor(SelectorsGeneration):
878
  text = text.lower()
879
 
880
  # This selector gets all elements with text content
881
- for node in self.__handle_elements(self._root.xpath('.//*[normalize-space(text())]')):
 
 
882
  """Check if element matches given text otherwise, traverse the children tree and iterate"""
883
  node_text = node.text
884
  if clean_match:
@@ -903,8 +1137,12 @@ class Adaptor(SelectorsGeneration):
903
  return results
904
 
905
  def find_by_regex(
906
- self, query: Union[str, Pattern[str]], first_match: bool = True, case_sensitive: bool = False, clean_match: bool = True
907
- ) -> Union['Adaptors[Adaptor]', 'Adaptor']:
 
 
 
 
908
  """Find elements that its text content matches the input regex pattern.
909
  :param query: Regex query/pattern to match
910
  :param first_match: Return first element that matches conditions, enabled by default
@@ -914,10 +1152,17 @@ class Adaptor(SelectorsGeneration):
914
  results = Adaptors([])
915
 
916
  # This selector gets all elements with text content
917
- for node in self.__handle_elements(self._root.xpath('.//*[normalize-space(text())]')):
 
 
918
  """Check if element matches given regex otherwise, traverse the children tree and iterate"""
919
  node_text = node.text
920
- if node_text.re(query, check_match=True, clean_match=clean_match, case_sensitive=case_sensitive):
 
 
 
 
 
921
  results.append(node)
922
 
923
  if first_match and results:
@@ -933,6 +1178,7 @@ class Adaptors(List[Adaptor]):
933
  """
934
  The :class:`Adaptors` class is a subclass of the builtin ``List`` class, which provides a few additional methods.
935
  """
 
936
  __slots__ = ()
937
 
938
  @typing.overload
@@ -943,7 +1189,9 @@ class Adaptors(List[Adaptor]):
943
  def __getitem__(self, pos: slice) -> "Adaptors":
944
  pass
945
 
946
- def __getitem__(self, pos: Union[SupportsIndex, slice]) -> Union[Adaptor, "Adaptors"]:
 
 
947
  lst = super().__getitem__(pos)
948
  if isinstance(pos, slice):
949
  return self.__class__(lst)
@@ -951,7 +1199,12 @@ class Adaptors(List[Adaptor]):
951
  return lst
952
 
953
  def xpath(
954
- self, selector: str, identifier: str = '', auto_save: bool = False, percentage: int = 0, **kwargs: Any
 
 
 
 
 
955
  ) -> "Adaptors[Adaptor]":
956
  """
957
  Call the ``.xpath()`` method for each element in this list and return
@@ -974,11 +1227,20 @@ class Adaptors(List[Adaptor]):
974
  :return: List as :class:`Adaptors`
975
  """
976
  results = [
977
- n.xpath(selector, identifier or selector, False, auto_save, percentage, **kwargs) for n in self
 
 
 
978
  ]
979
  return self.__class__(flatten(results))
980
 
981
- def css(self, selector: str, identifier: str = '', auto_save: bool = False, percentage: int = 0) -> "Adaptors[Adaptor]":
 
 
 
 
 
 
982
  """
983
  Call the ``.css()`` method for each element in this list and return
984
  their results flattened as another :class:`Adaptors`.
@@ -998,12 +1260,18 @@ class Adaptors(List[Adaptor]):
998
  :return: List as :class:`Adaptors`
999
  """
1000
  results = [
1001
- n.css(selector, identifier or selector, False, auto_save, percentage) for n in self
 
1002
  ]
1003
  return self.__class__(flatten(results))
1004
 
1005
- def re(self, regex: Union[str, Pattern[str]], replace_entities: bool = True,
1006
- clean_match: bool = False, case_sensitive: bool = True) -> TextHandlers[TextHandler]:
 
 
 
 
 
1007
  """Call the ``.re()`` method for each element in this list and return
1008
  their results flattened as List of TextHandler.
1009
 
@@ -1013,12 +1281,19 @@ class Adaptors(List[Adaptor]):
1013
  :param case_sensitive: if disabled, function will set the regex to ignore letters case while compiling it
1014
  """
1015
  results = [
1016
- n.text.re(regex, replace_entities, clean_match, case_sensitive) for n in self
 
1017
  ]
1018
  return TextHandlers(flatten(results))
1019
 
1020
- def re_first(self, regex: Union[str, Pattern[str]], default=None, replace_entities: bool = True,
1021
- clean_match: bool = False, case_sensitive: bool = True) -> TextHandler:
 
 
 
 
 
 
1022
  """Call the ``.re_first()`` method for each element in this list and return
1023
  the first result or the default value otherwise.
1024
 
@@ -1033,7 +1308,7 @@ class Adaptors(List[Adaptor]):
1033
  return result
1034
  return default
1035
 
1036
- def search(self, func: Callable[['Adaptor'], bool]) -> Union['Adaptor', None]:
1037
  """Loop over all current elements and return the first element that matches the passed function
1038
  :param func: A function that takes each element as an argument and returns True/False
1039
  :return: The first element that match the function or ``None`` otherwise.
@@ -1043,14 +1318,12 @@ class Adaptors(List[Adaptor]):
1043
  return element
1044
  return None
1045
 
1046
- def filter(self, func: Callable[['Adaptor'], bool]) -> 'Adaptors[Adaptor]':
1047
  """Filter current elements based on the passed function
1048
  :param func: A function that takes each element as an argument and returns True/False
1049
  :return: The new `Adaptors` object or empty list otherwise.
1050
  """
1051
- return self.__class__([
1052
- element for element in self if func(element)
1053
- ])
1054
 
1055
  # For easy copy-paste from Scrapy/parsel code when needed :)
1056
  def get(self, default=None):
 
9
  from cssselect import parse as split_selectors
10
  from lxml import etree, html
11
 
12
+ from scrapling.core._types import (
13
+ Any,
14
+ Callable,
15
+ Dict,
16
+ Generator,
17
+ Iterable,
18
+ List,
19
+ Optional,
20
+ Pattern,
21
+ SupportsIndex,
22
+ Tuple,
23
+ Union,
24
+ )
25
+ from scrapling.core.custom_types import AttributesHandler, TextHandler, TextHandlers
26
  from scrapling.core.mixins import SelectorsGeneration
27
+ from scrapling.core.storage_adaptors import (
28
+ SQLiteStorageSystem,
29
+ StorageSystemMixin,
30
+ _StorageTools,
31
+ )
32
  from scrapling.core.translator import translator_instance
33
+ from scrapling.core.utils import clean_spaces, flatten, html_forbidden, is_jsonable, log
 
34
 
35
 
36
  class Adaptor(SelectorsGeneration):
37
  __slots__ = (
38
+ "url",
39
+ "encoding",
40
+ "__auto_match_enabled",
41
+ "_root",
42
+ "_storage",
43
+ "__keep_comments",
44
+ "__huge_tree_enabled",
45
+ "__attributes",
46
+ "__text",
47
+ "__tag",
48
+ "__keep_cdata",
49
  )
50
 
51
  def __init__(
52
+ self,
53
+ text: Optional[str] = None,
54
+ url: Optional[str] = None,
55
+ body: bytes = b"",
56
+ encoding: str = "utf8",
57
+ huge_tree: bool = True,
58
+ root: Optional[html.HtmlElement] = None,
59
+ keep_comments: Optional[bool] = False,
60
+ keep_cdata: Optional[bool] = False,
61
+ auto_match: Optional[bool] = False,
62
+ storage: Any = SQLiteStorageSystem,
63
+ storage_args: Optional[Dict] = None,
64
+ **kwargs,
65
  ):
66
  """The main class that works as a wrapper for the HTML input data. Using this class, you can search for elements
67
  with expressions in CSS, XPath, or with simply text. Check the docs for more info.
 
88
  If empty, default values will be used.
89
  """
90
  if root is None and not body and text is None:
91
+ raise ValueError(
92
+ "Adaptor class needs text, body, or root arguments to work"
93
+ )
94
 
95
+ self.__text = ""
96
  if root is None:
97
  if text is None:
98
  if not body or not isinstance(body, bytes):
99
+ raise TypeError(
100
+ f"body argument must be valid and of type bytes, got {body.__class__}"
101
+ )
102
 
103
  body = body.replace(b"\x00", b"").strip()
104
  else:
105
  if not isinstance(text, str):
106
+ raise TypeError(
107
+ f"text argument must be of type str, got {text.__class__}"
108
+ )
109
 
110
  body = text.strip().replace("\x00", "").encode(encoding) or b"<html/>"
111
 
112
  # https://lxml.de/api/lxml.etree.HTMLParser-class.html
113
  parser = html.HTMLParser(
114
+ recover=True,
115
+ remove_blank_text=True,
116
+ remove_comments=(not keep_comments),
117
+ encoding=encoding,
118
+ compact=True,
119
+ huge_tree=huge_tree,
120
+ default_doctype=True,
121
+ strip_cdata=(not keep_cdata),
122
  )
123
  self._root = etree.fromstring(body, parser=parser, base_url=url)
124
  if is_jsonable(text or body.decode()):
 
138
  if self.__auto_match_enabled:
139
  if not storage_args:
140
  storage_args = {
141
+ "storage_file": os.path.join(
142
+ os.path.dirname(__file__), "elements_storage.db"
143
+ ),
144
+ "url": url,
145
  }
146
 
147
+ if not hasattr(storage, "__wrapped__"):
148
+ raise ValueError(
149
+ "Storage class must be wrapped with lru_cache decorator, see docs for info"
150
+ )
151
 
152
  if not issubclass(storage.__wrapped__, StorageSystemMixin):
153
+ raise ValueError(
154
+ "Storage system must be inherited from class `StorageSystemMixin`"
155
+ )
156
 
157
  self._storage = storage(**storage_args)
158
 
 
165
  self.__attributes = None
166
  self.__tag = None
167
  # No need to check if all response attributes exist or not because if `status` exist, then the rest exist (Save some CPU cycles for speed)
168
+ self.__response_data = (
169
+ {
170
+ key: getattr(self, key)
171
+ for key in (
172
+ "status",
173
+ "reason",
174
+ "cookies",
175
+ "history",
176
+ "headers",
177
+ "request_headers",
178
+ )
179
+ }
180
+ if hasattr(self, "status")
181
+ else {}
182
+ )
183
 
184
  # Node functionalities, I wanted to move to separate Mixin class but it had slight impact on performance
185
  @staticmethod
186
+ def _is_text_node(
187
+ element: Union[html.HtmlElement, etree._ElementUnicodeResult],
188
+ ) -> bool:
189
  """Return True if given element is a result of a string expression
190
  Examples:
191
  XPath -> '/text()', '/@attribute' etc...
 
195
  return issubclass(type(element), etree._ElementUnicodeResult)
196
 
197
  @staticmethod
198
+ def __content_convertor(
199
+ element: Union[html.HtmlElement, etree._ElementUnicodeResult],
200
+ ) -> TextHandler:
201
  """Used internally to convert a single element's text content to TextHandler directly without checks
202
 
203
  This single line has been isolated like this so when it's used with map we get that slight performance boost vs list comprehension
204
  """
205
  return TextHandler(str(element))
206
 
207
+ def __element_convertor(self, element: html.HtmlElement) -> "Adaptor":
208
  """Used internally to convert a single HtmlElement to Adaptor directly without checks"""
209
  return Adaptor(
210
  root=element,
211
+ text="",
212
+ body=b"", # Since root argument is provided, both `text` and `body` will be ignored so this is just a filler
213
+ url=self.url,
214
+ encoding=self.encoding,
215
+ auto_match=self.__auto_match_enabled,
216
+ keep_comments=self.__keep_comments,
217
+ keep_cdata=self.__keep_cdata,
218
  huge_tree=self.__huge_tree_enabled,
219
+ **self.__response_data,
220
  )
221
 
222
+ def __handle_element(
223
+ self, element: Union[html.HtmlElement, etree._ElementUnicodeResult]
224
+ ) -> Union[TextHandler, "Adaptor", None]:
225
  """Used internally in all functions to convert a single element to type (Adaptor|TextHandler) when possible"""
226
  if element is None:
227
  return None
 
231
  else:
232
  return self.__element_convertor(element)
233
 
234
+ def __handle_elements(
235
+ self, result: List[Union[html.HtmlElement, etree._ElementUnicodeResult]]
236
+ ) -> Union["Adaptors", "TextHandlers", List]:
237
  """Used internally in all functions to convert results to type (Adaptors|TextHandlers) in bulk when possible"""
238
+ if not len(
239
+ result
240
+ ): # Lxml will give a warning if I used something like `not result`
241
  return Adaptors([])
242
 
243
  # From within the code, this method will always get a list of the same type
 
272
  self.__text = TextHandler(self._root.text)
273
  return self.__text
274
 
275
+ def get_all_text(
276
+ self,
277
+ separator: str = "\n",
278
+ strip: bool = False,
279
+ ignore_tags: Tuple = (
280
+ "script",
281
+ "style",
282
+ ),
283
+ valid_values: bool = True,
284
+ ) -> TextHandler:
285
  """Get all child strings of this element, concatenated using the given separator.
286
 
287
  :param separator: Strings will be concatenated using this separator.
 
292
  :return: A TextHandler
293
  """
294
  _all_strings = []
295
+ for node in self._root.xpath(".//*"):
296
  if node.tag not in ignore_tags:
297
  text = node.text
298
  if text and type(text) is str:
 
317
  @property
318
  def html_content(self) -> TextHandler:
319
  """Return the inner html code of the element"""
320
+ return TextHandler(
321
+ etree.tostring(
322
+ self._root, encoding="unicode", method="html", with_tail=False
323
+ )
324
+ )
325
 
326
  body = html_content
327
 
328
  def prettify(self) -> TextHandler:
329
  """Return a prettified version of the element's inner html-code"""
330
+ return TextHandler(
331
+ etree.tostring(
332
+ self._root,
333
+ encoding="unicode",
334
+ pretty_print=True,
335
+ method="html",
336
+ with_tail=False,
337
+ )
338
+ )
339
 
340
  def has_class(self, class_name: str) -> bool:
341
  """Check if element has a specific class
 
345
  return class_name in self._root.classes
346
 
347
  @property
348
+ def parent(self) -> Union["Adaptor", None]:
349
  """Return the direct parent of the element or ``None`` otherwise"""
350
  return self.__handle_element(self._root.getparent())
351
 
352
  @property
353
+ def below_elements(self) -> "Adaptors[Adaptor]":
354
  """Return all elements under the current element in the DOM tree"""
355
+ below = self._root.xpath(".//*")
356
  return self.__handle_elements(below)
357
 
358
  @property
359
+ def children(self) -> "Adaptors[Adaptor]":
360
  """Return the children elements of the current element or empty list otherwise"""
361
+ return Adaptors(
362
+ [
363
+ self.__element_convertor(child)
364
+ for child in self._root.iterchildren()
365
+ if type(child) not in html_forbidden
366
+ ]
367
+ )
368
 
369
  @property
370
+ def siblings(self) -> "Adaptors[Adaptor]":
371
  """Return other children of the current element's parent or empty list otherwise"""
372
  if self.parent:
373
+ return Adaptors(
374
+ [child for child in self.parent.children if child._root != self._root]
375
+ )
376
  return Adaptors([])
377
 
378
+ def iterancestors(self) -> Generator["Adaptor", None, None]:
379
  """Return a generator that loops over all ancestors of the element, starting with element's parent."""
380
  for ancestor in self._root.iterancestors():
381
  yield self.__element_convertor(ancestor)
382
 
383
+ def find_ancestor(
384
+ self, func: Callable[["Adaptor"], bool]
385
+ ) -> Union["Adaptor", None]:
386
  """Loop over all ancestors of the element till one match the passed function
387
  :param func: A function that takes each ancestor as an argument and returns True/False
388
  :return: The first ancestor that match the function or ``None`` otherwise.
 
393
  return None
394
 
395
  @property
396
+ def path(self) -> "Adaptors[Adaptor]":
397
  """Returns list of type :class:`Adaptors` that contains the path leading to the current element from the root."""
398
  lst = list(self.iterancestors())
399
  return Adaptors(lst)
400
 
401
  @property
402
+ def next(self) -> Union["Adaptor", None]:
403
  """Returns the next element of the current element in the children of the parent or ``None`` otherwise."""
404
  next_element = self._root.getnext()
405
  if next_element is not None:
 
410
  return self.__handle_element(next_element)
411
 
412
  @property
413
+ def previous(self) -> Union["Adaptor", None]:
414
  """Returns the previous element of the current element in the children of the parent or ``None`` otherwise."""
415
  prev_element = self._root.getprevious()
416
  if prev_element is not None:
 
438
  data = "<"
439
  content = clean_spaces(self.html_content)
440
  if len(content) > length_limit:
441
+ content = content[:length_limit].strip() + "..."
442
  data += f"data='{content}'"
443
 
444
  if self.parent:
445
  parent_content = clean_spaces(self.parent.html_content)
446
  if len(parent_content) > length_limit:
447
+ parent_content = parent_content[:length_limit].strip() + "..."
448
 
449
  data += f" parent='{parent_content}'"
450
 
 
452
 
453
  # From here we start the selecting functions
454
  def relocate(
455
+ self,
456
+ element: Union[Dict, html.HtmlElement, "Adaptor"],
457
+ percentage: int = 0,
458
+ adaptor_type: bool = False,
459
+ ) -> Union[List[Union[html.HtmlElement, None]], "Adaptors"]:
460
  """This function will search again for the element in the page tree, used automatically on page structure change
461
 
462
  :param element: The element we want to relocate in the tree
 
474
  if issubclass(type(element), html.HtmlElement):
475
  element = _StorageTools.element_to_dict(element)
476
 
477
+ for node in self._root.xpath(".//*"):
478
  # Collect all elements in the page then for each element get the matching score of it against the node.
479
  # Hence: the code doesn't stop even if the score was 100%
480
  # because there might be another element(s) left in page with the same score
 
486
  if score_table[highest_probability] and highest_probability >= percentage:
487
  if log.getEffectiveLevel() < 20:
488
  # No need to execute this part if logging level is not debugging
489
+ log.debug(f"Highest probability was {highest_probability}%")
490
+ log.debug("Top 5 best matching elements are: ")
491
  for percent in tuple(sorted(score_table.keys(), reverse=True))[:5]:
492
+ log.debug(
493
+ f"{percent} -> {self.__handle_elements(score_table[percent])}"
494
+ )
495
 
496
  if not adaptor_type:
497
  return score_table[highest_probability]
498
  return self.__handle_elements(score_table[highest_probability])
499
  return []
500
 
501
+ def css_first(
502
+ self,
503
+ selector: str,
504
+ identifier: str = "",
505
+ auto_match: bool = False,
506
+ auto_save: bool = False,
507
+ percentage: int = 0,
508
+ ) -> Union["Adaptor", "TextHandler", None]:
509
  """Search current tree with CSS3 selectors and return the first result if possible, otherwise return `None`
510
 
511
  **Important:
 
521
  Be aware that the percentage calculation depends solely on the page structure so don't play with this
522
  number unless you must know what you are doing!
523
  """
524
+ for element in self.css(
525
+ selector, identifier, auto_match, auto_save, percentage
526
+ ):
527
  return element
528
  return None
529
 
530
+ def xpath_first(
531
+ self,
532
+ selector: str,
533
+ identifier: str = "",
534
+ auto_match: bool = False,
535
+ auto_save: bool = False,
536
+ percentage: int = 0,
537
+ **kwargs: Any,
538
+ ) -> Union["Adaptor", "TextHandler", None]:
539
  """Search current tree with XPath selectors and return the first result if possible, otherwise return `None`
540
 
541
  **Important:
 
553
  Be aware that the percentage calculation depends solely on the page structure so don't play with this
554
  number unless you must know what you are doing!
555
  """
556
+ for element in self.xpath(
557
+ selector, identifier, auto_match, auto_save, percentage, **kwargs
558
+ ):
559
  return element
560
  return None
561
 
562
+ def css(
563
+ self,
564
+ selector: str,
565
+ identifier: str = "",
566
+ auto_match: bool = False,
567
+ auto_save: bool = False,
568
+ percentage: int = 0,
569
+ ) -> Union["Adaptors[Adaptor]", List, "TextHandlers[TextHandler]"]:
570
  """Search current tree with CSS3 selectors
571
 
572
  **Important:
 
585
  :return: List as :class:`Adaptors`
586
  """
587
  try:
588
+ if not self.__auto_match_enabled or "," not in selector:
589
  # No need to split selectors in this case, let's save some CPU cycles :)
590
  xpath_selector = translator_instance.css_to_xpath(selector)
591
+ return self.xpath(
592
+ xpath_selector,
593
+ identifier or selector,
594
+ auto_match,
595
+ auto_save,
596
+ percentage,
597
+ )
598
 
599
  results = []
600
+ if "," in selector:
601
  for single_selector in split_selectors(selector):
602
  # I'm doing this only so the `save` function save data correctly for combined selectors
603
  # Like using the ',' to combine two different selectors that point to different elements.
604
+ xpath_selector = translator_instance.css_to_xpath(
605
+ single_selector.canonical()
606
+ )
607
  results += self.xpath(
608
+ xpath_selector,
609
+ identifier or single_selector.canonical(),
610
+ auto_match,
611
+ auto_save,
612
+ percentage,
613
  )
614
 
615
  return results
616
+ except (
617
+ SelectorError,
618
+ SelectorSyntaxError,
619
+ ):
620
  raise SelectorSyntaxError(f"Invalid CSS selector: {selector}")
621
 
622
+ def xpath(
623
+ self,
624
+ selector: str,
625
+ identifier: str = "",
626
+ auto_match: bool = False,
627
+ auto_save: bool = False,
628
+ percentage: int = 0,
629
+ **kwargs: Any,
630
+ ) -> Union["Adaptors[Adaptor]", List, "TextHandlers[TextHandler]"]:
631
  """Search current tree with XPath selectors
632
 
633
  **Important:
 
653
  if elements:
654
  if auto_save:
655
  if not self.__auto_match_enabled:
656
+ log.warning(
657
+ "Argument `auto_save` will be ignored because `auto_match` wasn't enabled on initialization. Check docs for more info."
658
+ )
659
  else:
660
  self.save(elements[0], identifier or selector)
661
 
 
671
  return self.__handle_elements(elements)
672
  else:
673
  if auto_match:
674
+ log.warning(
675
+ "Argument `auto_match` will be ignored because `auto_match` wasn't enabled on initialization. Check docs for more info."
676
+ )
677
  elif auto_save:
678
+ log.warning(
679
+ "Argument `auto_save` will be ignored because `auto_match` wasn't enabled on initialization. Check docs for more info."
680
+ )
681
 
682
  return self.__handle_elements(elements)
683
 
684
+ except (
685
+ SelectorError,
686
+ SelectorSyntaxError,
687
+ etree.XPathError,
688
+ etree.XPathEvalError,
689
+ ):
690
  raise SelectorSyntaxError(f"Invalid XPath selector: {selector}")
691
 
692
+ def find_all(
693
+ self,
694
+ *args: Union[str, Iterable[str], Pattern, Callable, Dict[str, str]],
695
+ **kwargs: str,
696
+ ) -> "Adaptors":
697
  """Find elements by filters of your creations for ease..
698
 
699
  :param args: Tag name(s), an iterable of tag names, regex patterns, function, or a dictionary of elements' attributes. Leave empty for selecting all.
 
704
  # Ex: find_all('a', class="blah") -> find_all('a', class_="blah")
705
  # https://www.w3schools.com/python/python_ref_keywords.asp
706
  whitelisted = {
707
+ "class_": "class",
708
+ "for_": "for",
709
  }
710
 
711
  if not args and not kwargs:
712
+ raise TypeError(
713
+ "You have to pass something to search with, like tag name(s), tag attributes, or both."
714
+ )
715
 
716
  attributes = dict()
717
  tags, patterns = set(), set()
 
724
 
725
  elif type(arg) in [list, tuple, set]:
726
  if not all(map(lambda x: type(x) is str, arg)):
727
+ raise TypeError(
728
+ "Nested Iterables are not accepted, only iterables of tag names are accepted"
729
+ )
730
  tags.update(set(arg))
731
 
732
  elif isinstance(arg, dict):
733
+ if not all(
734
+ [(type(k) is str and type(v) is str) for k, v in arg.items()]
735
+ ):
736
+ raise TypeError(
737
+ "Nested dictionaries are not accepted, only string keys and string values are accepted"
738
+ )
739
  attributes.update(arg)
740
 
741
  elif isinstance(arg, re.Pattern):
 
745
  if len(inspect.signature(arg).parameters) > 0:
746
  functions.append(arg)
747
  else:
748
+ raise TypeError(
749
+ "Callable filter function must have at least one argument to take `Adaptor` objects."
750
+ )
751
 
752
  else:
753
+ raise TypeError(
754
+ f'Argument with type "{type(arg)}" is not accepted, please read the docs.'
755
+ )
756
 
757
  if not all([(type(k) is str and type(v) is str) for k, v in kwargs.items()]):
758
+ raise TypeError("Only string values are accepted for arguments")
759
 
760
  for attribute_name, value in kwargs.items():
761
  # Only replace names for kwargs, replacing them in dictionaries doesn't make sense
 
763
  attributes[attribute_name] = value
764
 
765
  # It's easier and faster to build a selector than traversing the tree
766
+ tags = tags or ["*"]
767
  for tag in tags:
768
  selector = tag
769
  for key, value in attributes.items():
770
+ value = value.replace('"', r"\"") # Escape double quotes in user input
771
  # Not escaping anything with the key so the user can pass patterns like {'href*': '/p/'} or get errors :)
772
  selector += '[{}="{}"]'.format(key, value)
773
+ if selector != "*":
774
  selectors.append(selector)
775
 
776
  if selectors:
777
+ results = self.css(", ".join(selectors))
778
  if results:
779
  # From the results, get the ones that fulfill passed regex patterns
780
  for pattern in patterns:
781
+ results = results.filter(
782
+ lambda e: e.text.re(pattern, check_match=True)
783
+ )
784
 
785
  # From the results, get the ones that fulfill passed functions
786
  for function in functions:
 
796
 
797
  return results
798
 
799
+ def find(
800
+ self,
801
+ *args: Union[str, Iterable[str], Pattern, Callable, Dict[str, str]],
802
+ **kwargs: str,
803
+ ) -> Union["Adaptor", None]:
804
  """Find elements by filters of your creations for ease then return the first result. Otherwise return `None`.
805
 
806
  :param args: Tag name(s), an iterable of tag names, regex patterns, function, or a dictionary of elements' attributes. Leave empty for selecting all.
 
811
  return element
812
  return None
813
 
814
+ def __calculate_similarity_score(
815
+ self, original: Dict, candidate: html.HtmlElement
816
+ ) -> float:
817
  """Used internally to calculate a score that shows how candidate element similar to the original one
818
 
819
  :param original: The original element in the form of the dictionary generated from `element_to_dict` function
 
826
  # Possible TODO:
827
  # Study the idea of giving weight to each test below so some are more important than others
828
  # Current results: With weights some websites had better score while it was worse for others
829
+ score += 1 if original["tag"] == candidate["tag"] else 0 # * 0.3 # 30%
830
  checks += 1
831
 
832
+ if original["text"]:
833
+ score += SequenceMatcher(
834
+ None, original["text"], candidate.get("text") or ""
835
+ ).ratio() # * 0.3 # 30%
836
  checks += 1
837
 
838
  # if both doesn't have attributes, it still count for something!
839
+ score += self.__calculate_dict_diff(
840
+ original["attributes"], candidate["attributes"]
841
+ ) # * 0.3 # 30%
842
  checks += 1
843
 
844
  # Separate similarity test for class, id, href,... this will help in full structural changes
845
+ for attrib in (
846
+ "class",
847
+ "id",
848
+ "href",
849
+ "src",
850
+ ):
851
+ if original["attributes"].get(attrib):
852
  score += SequenceMatcher(
853
+ None,
854
+ original["attributes"][attrib],
855
+ candidate["attributes"].get(attrib) or "",
856
  ).ratio() # * 0.3 # 30%
857
  checks += 1
858
 
859
+ score += SequenceMatcher(
860
+ None, original["path"], candidate["path"]
861
+ ).ratio() # * 0.1 # 10%
862
  checks += 1
863
 
864
+ if original.get("parent_name"):
865
  # Then we start comparing parents' data
866
+ if candidate.get("parent_name"):
867
  score += SequenceMatcher(
868
+ None, original["parent_name"], candidate.get("parent_name") or ""
869
  ).ratio() # * 0.2 # 20%
870
  checks += 1
871
 
872
  score += self.__calculate_dict_diff(
873
+ original["parent_attribs"], candidate.get("parent_attribs") or {}
874
  ) # * 0.2 # 20%
875
  checks += 1
876
 
877
+ if original["parent_text"]:
878
  score += SequenceMatcher(
879
+ None,
880
+ original["parent_text"],
881
+ candidate.get("parent_text") or "",
882
  ).ratio() # * 0.1 # 10%
883
  checks += 1
884
  # else:
885
  # # The original element have a parent and this one not, this is not a good sign
886
  # score -= 0.1
887
 
888
+ if original.get("siblings"):
889
  score += SequenceMatcher(
890
+ None, original["siblings"], candidate.get("siblings") or []
891
  ).ratio() # * 0.1 # 10%
892
  checks += 1
893
 
 
896
 
897
  @staticmethod
898
  def __calculate_dict_diff(dict1: dict, dict2: dict) -> float:
899
+ """Used internally calculate similarity between two dictionaries as SequenceMatcher doesn't accept dictionaries"""
900
+ score = (
901
+ SequenceMatcher(None, tuple(dict1.keys()), tuple(dict2.keys())).ratio()
902
+ * 0.5
903
+ )
904
+ score += (
905
+ SequenceMatcher(None, tuple(dict1.values()), tuple(dict2.values())).ratio()
906
+ * 0.5
907
+ )
908
  return score
909
 
910
+ def save(
911
+ self, element: Union["Adaptor", html.HtmlElement], identifier: str
912
+ ) -> None:
913
  """Saves the element's unique properties to the storage for retrieval and relocation later
914
 
915
  :param element: The element itself that we want to save to storage, it can be a `Adaptor` or pure `HtmlElement`
 
951
  else:
952
  return self.get_all_text(strip=True).json()
953
 
954
+ def re(
955
+ self,
956
+ regex: Union[str, Pattern[str]],
957
+ replace_entities: bool = True,
958
+ clean_match: bool = False,
959
+ case_sensitive: bool = True,
960
+ ) -> TextHandlers:
961
  """Apply the given regex to the current text and return a list of strings with the matches.
962
 
963
  :param regex: Can be either a compiled regular expression or a string.
 
967
  """
968
  return self.text.re(regex, replace_entities, clean_match, case_sensitive)
969
 
970
+ def re_first(
971
+ self,
972
+ regex: Union[str, Pattern[str]],
973
+ default=None,
974
+ replace_entities: bool = True,
975
+ clean_match: bool = False,
976
+ case_sensitive: bool = True,
977
+ ) -> TextHandler:
978
  """Apply the given regex to text and return the first match if found, otherwise return the default value.
979
 
980
  :param regex: Can be either a compiled regular expression or a string.
 
983
  :param clean_match: if enabled, this will ignore all whitespaces and consecutive spaces while matching
984
  :param case_sensitive: if disabled, function will set the regex to ignore letters case while compiling it
985
  """
986
+ return self.text.re_first(
987
+ regex, default, replace_entities, clean_match, case_sensitive
988
+ )
989
 
990
  def find_similar(
991
+ self,
992
+ similarity_threshold: float = 0.2,
993
+ ignore_attributes: Union[List, Tuple] = (
994
+ "href",
995
+ "src",
996
+ ),
997
+ match_text: bool = False,
998
+ ) -> Union["Adaptors[Adaptor]", List]:
999
  """Find elements that are in the same tree depth in the page with the same tag name and same parent tag etc...
1000
  then return the ones that match the current element attributes with percentage higher than the input threshold.
1001
 
 
1016
 
1017
  :return: A ``Adaptors`` container of ``Adaptor`` objects or empty list
1018
  """
1019
+
1020
  def get_attributes(element: html.HtmlElement) -> Dict:
1021
  """Return attributes dictionary without the ignored list"""
1022
+ return {
1023
+ k: v for k, v in element.attrib.items() if k not in ignore_attributes
1024
+ }
1025
+
1026
+ def are_alike(
1027
+ original: html.HtmlElement,
1028
+ original_attributes: Dict,
1029
+ candidate: html.HtmlElement,
1030
+ ) -> bool:
1031
  """Calculate a score of how much these elements are alike and return True
1032
+ if score is higher or equal the threshold"""
1033
+ candidate_attributes = (
1034
+ get_attributes(candidate) if ignore_attributes else candidate.attrib
1035
+ )
1036
  score, checks = 0, 0
1037
 
1038
  if original_attributes:
1039
  score += sum(
1040
+ SequenceMatcher(None, v, candidate_attributes.get(k, "")).ratio()
1041
  for k, v in original_attributes.items()
1042
  )
1043
  checks += len(candidate_attributes)
 
1049
 
1050
  if match_text:
1051
  score += SequenceMatcher(
1052
+ None,
1053
+ clean_spaces(original.text or ""),
1054
+ clean_spaces(candidate.text or ""),
1055
  ).ratio()
1056
  checks += 1
1057
 
 
1073
  f"//{grandparent.tag}/{parent.tag}/{self.tag}[count(ancestor::*) = {current_depth}]"
1074
  )
1075
  else:
1076
+ potential_matches = root.xpath(
1077
+ f"//{parent.tag}/{self.tag}[count(ancestor::*) = {current_depth}]"
1078
+ )
1079
  else:
1080
+ potential_matches = root.xpath(
1081
+ f"//{self.tag}[count(ancestor::*) = {current_depth}]"
1082
+ )
1083
 
1084
  for potential_match in potential_matches:
1085
+ if potential_match != root and are_alike(
1086
+ root, target_attrs, potential_match
1087
+ ):
1088
  similar_elements.append(potential_match)
1089
 
1090
  return self.__handle_elements(similar_elements)
1091
 
1092
  def find_by_text(
1093
+ self,
1094
+ text: str,
1095
+ first_match: bool = True,
1096
+ partial: bool = False,
1097
+ case_sensitive: bool = False,
1098
+ clean_match: bool = True,
1099
+ ) -> Union["Adaptors[Adaptor]", "Adaptor"]:
1100
  """Find elements that its text content fully/partially matches input.
1101
  :param text: Text query to match
1102
  :param first_match: Return first element that matches conditions, enabled by default
 
1110
  text = text.lower()
1111
 
1112
  # This selector gets all elements with text content
1113
+ for node in self.__handle_elements(
1114
+ self._root.xpath(".//*[normalize-space(text())]")
1115
+ ):
1116
  """Check if element matches given text otherwise, traverse the children tree and iterate"""
1117
  node_text = node.text
1118
  if clean_match:
 
1137
  return results
1138
 
1139
  def find_by_regex(
1140
+ self,
1141
+ query: Union[str, Pattern[str]],
1142
+ first_match: bool = True,
1143
+ case_sensitive: bool = False,
1144
+ clean_match: bool = True,
1145
+ ) -> Union["Adaptors[Adaptor]", "Adaptor"]:
1146
  """Find elements that its text content matches the input regex pattern.
1147
  :param query: Regex query/pattern to match
1148
  :param first_match: Return first element that matches conditions, enabled by default
 
1152
  results = Adaptors([])
1153
 
1154
  # This selector gets all elements with text content
1155
+ for node in self.__handle_elements(
1156
+ self._root.xpath(".//*[normalize-space(text())]")
1157
+ ):
1158
  """Check if element matches given regex otherwise, traverse the children tree and iterate"""
1159
  node_text = node.text
1160
+ if node_text.re(
1161
+ query,
1162
+ check_match=True,
1163
+ clean_match=clean_match,
1164
+ case_sensitive=case_sensitive,
1165
+ ):
1166
  results.append(node)
1167
 
1168
  if first_match and results:
 
1178
  """
1179
  The :class:`Adaptors` class is a subclass of the builtin ``List`` class, which provides a few additional methods.
1180
  """
1181
+
1182
  __slots__ = ()
1183
 
1184
  @typing.overload
 
1189
  def __getitem__(self, pos: slice) -> "Adaptors":
1190
  pass
1191
 
1192
+ def __getitem__(
1193
+ self, pos: Union[SupportsIndex, slice]
1194
+ ) -> Union[Adaptor, "Adaptors"]:
1195
  lst = super().__getitem__(pos)
1196
  if isinstance(pos, slice):
1197
  return self.__class__(lst)
 
1199
  return lst
1200
 
1201
  def xpath(
1202
+ self,
1203
+ selector: str,
1204
+ identifier: str = "",
1205
+ auto_save: bool = False,
1206
+ percentage: int = 0,
1207
+ **kwargs: Any,
1208
  ) -> "Adaptors[Adaptor]":
1209
  """
1210
  Call the ``.xpath()`` method for each element in this list and return
 
1227
  :return: List as :class:`Adaptors`
1228
  """
1229
  results = [
1230
+ n.xpath(
1231
+ selector, identifier or selector, False, auto_save, percentage, **kwargs
1232
+ )
1233
+ for n in self
1234
  ]
1235
  return self.__class__(flatten(results))
1236
 
1237
+ def css(
1238
+ self,
1239
+ selector: str,
1240
+ identifier: str = "",
1241
+ auto_save: bool = False,
1242
+ percentage: int = 0,
1243
+ ) -> "Adaptors[Adaptor]":
1244
  """
1245
  Call the ``.css()`` method for each element in this list and return
1246
  their results flattened as another :class:`Adaptors`.
 
1260
  :return: List as :class:`Adaptors`
1261
  """
1262
  results = [
1263
+ n.css(selector, identifier or selector, False, auto_save, percentage)
1264
+ for n in self
1265
  ]
1266
  return self.__class__(flatten(results))
1267
 
1268
+ def re(
1269
+ self,
1270
+ regex: Union[str, Pattern[str]],
1271
+ replace_entities: bool = True,
1272
+ clean_match: bool = False,
1273
+ case_sensitive: bool = True,
1274
+ ) -> TextHandlers[TextHandler]:
1275
  """Call the ``.re()`` method for each element in this list and return
1276
  their results flattened as List of TextHandler.
1277
 
 
1281
  :param case_sensitive: if disabled, function will set the regex to ignore letters case while compiling it
1282
  """
1283
  results = [
1284
+ n.text.re(regex, replace_entities, clean_match, case_sensitive)
1285
+ for n in self
1286
  ]
1287
  return TextHandlers(flatten(results))
1288
 
1289
+ def re_first(
1290
+ self,
1291
+ regex: Union[str, Pattern[str]],
1292
+ default=None,
1293
+ replace_entities: bool = True,
1294
+ clean_match: bool = False,
1295
+ case_sensitive: bool = True,
1296
+ ) -> TextHandler:
1297
  """Call the ``.re_first()`` method for each element in this list and return
1298
  the first result or the default value otherwise.
1299
 
 
1308
  return result
1309
  return default
1310
 
1311
+ def search(self, func: Callable[["Adaptor"], bool]) -> Union["Adaptor", None]:
1312
  """Loop over all current elements and return the first element that matches the passed function
1313
  :param func: A function that takes each element as an argument and returns True/False
1314
  :return: The first element that match the function or ``None`` otherwise.
 
1318
  return element
1319
  return None
1320
 
1321
+ def filter(self, func: Callable[["Adaptor"], bool]) -> "Adaptors[Adaptor]":
1322
  """Filter current elements based on the passed function
1323
  :param func: A function that takes each element as an argument and returns True/False
1324
  :return: The new `Adaptors` object or empty list otherwise.
1325
  """
1326
+ return self.__class__([element for element in self if func(element)])
 
 
1327
 
1328
  # For easy copy-paste from Scrapy/parsel code when needed :)
1329
  def get(self, default=None):
setup.py CHANGED
@@ -1,7 +1,8 @@
 
 
1
  from setuptools import find_packages, setup
2
 
3
- with open("README.md", "r", encoding="utf-8") as fh:
4
- long_description = fh.read()
5
 
6
 
7
  setup(
@@ -20,9 +21,7 @@ setup(
20
  "scrapling": "scrapling",
21
  },
22
  entry_points={
23
- 'console_scripts': [
24
- 'scrapling=scrapling.cli:main'
25
- ],
26
  },
27
  include_package_data=True,
28
  classifiers=[
@@ -53,14 +52,14 @@ setup(
53
  install_requires=[
54
  "lxml>=5.0",
55
  "cssselect>=1.2",
56
- 'click',
57
  "w3lib",
58
  "orjson>=3",
59
  "tldextract",
60
- 'httpx[brotli,zstd, socks]',
61
- 'playwright>=1.49.1',
62
- 'rebrowser-playwright>=1.49.1',
63
- 'camoufox[geoip]>=0.4.11'
64
  ],
65
  python_requires=">=3.9",
66
  url="https://github.com/D4Vinci/Scrapling",
@@ -68,5 +67,5 @@ setup(
68
  "Documentation": "https://scrapling.readthedocs.io/en/latest/",
69
  "Source": "https://github.com/D4Vinci/Scrapling",
70
  "Tracker": "https://github.com/D4Vinci/Scrapling/issues",
71
- }
72
  )
 
1
+ from pathlib import Path
2
+
3
  from setuptools import find_packages, setup
4
 
5
+ long_description = Path("README.md").read_text(encoding="utf-8")
 
6
 
7
 
8
  setup(
 
21
  "scrapling": "scrapling",
22
  },
23
  entry_points={
24
+ "console_scripts": ["scrapling=scrapling.cli:main"],
 
 
25
  },
26
  include_package_data=True,
27
  classifiers=[
 
52
  install_requires=[
53
  "lxml>=5.0",
54
  "cssselect>=1.2",
55
+ "click",
56
  "w3lib",
57
  "orjson>=3",
58
  "tldextract",
59
+ "httpx[brotli,zstd, socks]",
60
+ "playwright>=1.49.1",
61
+ "rebrowser-playwright>=1.49.1",
62
+ "camoufox[geoip]>=0.4.11",
63
  ],
64
  python_requires=">=3.9",
65
  url="https://github.com/D4Vinci/Scrapling",
 
67
  "Documentation": "https://scrapling.readthedocs.io/en/latest/",
68
  "Source": "https://github.com/D4Vinci/Scrapling",
69
  "Tracker": "https://github.com/D4Vinci/Scrapling/issues",
70
+ },
71
  )
tests/fetchers/async/test_camoufox.py CHANGED
@@ -17,43 +17,51 @@ class TestStealthyFetcher:
17
  def urls(self, httpbin):
18
  url = httpbin.url
19
  return {
20
- 'status_200': f'{url}/status/200',
21
- 'status_404': f'{url}/status/404',
22
- 'status_501': f'{url}/status/501',
23
- 'basic_url': f'{url}/get',
24
- 'html_url': f'{url}/html',
25
- 'delayed_url': f'{url}/delay/10', # 10 Seconds delay response
26
- 'cookies_url': f"{url}/cookies/set/test/value"
27
  }
28
 
29
  async def test_basic_fetch(self, fetcher, urls):
30
  """Test doing basic fetch request with multiple statuses"""
31
- assert (await fetcher.async_fetch(urls['status_200'])).status == 200
32
- assert (await fetcher.async_fetch(urls['status_404'])).status == 404
33
- assert (await fetcher.async_fetch(urls['status_501'])).status == 501
34
 
35
  async def test_networkidle(self, fetcher, urls):
36
  """Test if waiting for `networkidle` make page does not finish loading or not"""
37
- assert (await fetcher.async_fetch(urls['basic_url'], network_idle=True)).status == 200
 
 
38
 
39
  async def test_blocking_resources(self, fetcher, urls):
40
  """Test if blocking resources make page does not finish loading or not"""
41
- assert (await fetcher.async_fetch(urls['basic_url'], block_images=True)).status == 200
42
- assert (await fetcher.async_fetch(urls['basic_url'], disable_resources=True)).status == 200
 
 
 
 
43
 
44
  async def test_waiting_selector(self, fetcher, urls):
45
  """Test if waiting for a selector make page does not finish loading or not"""
46
- assert (await fetcher.async_fetch(urls['html_url'], wait_selector='h1')).status == 200
47
- assert (await fetcher.async_fetch(
48
- urls['html_url'],
49
- wait_selector='h1',
50
- wait_selector_state='visible'
51
- )).status == 200
 
 
52
 
53
  async def test_cookies_loading(self, fetcher, urls):
54
  """Test if cookies are set after the request"""
55
- response = await fetcher.async_fetch(urls['cookies_url'])
56
- assert response.cookies == {'test': 'value'}
57
 
58
  async def test_automation(self, fetcher, urls):
59
  """Test if automation break the code or not"""
@@ -64,34 +72,38 @@ class TestStealthyFetcher:
64
  await page.mouse.up()
65
  return page
66
 
67
- assert (await fetcher.async_fetch(urls['html_url'], page_action=scroll_page)).status == 200
 
 
68
 
69
  async def test_properties(self, fetcher, urls):
70
  """Test if different arguments breaks the code or not"""
71
- assert (await fetcher.async_fetch(
72
- urls['html_url'],
73
- block_webrtc=True,
74
- allow_webgl=True
75
- )).status == 200
76
-
77
- assert (await fetcher.async_fetch(
78
- urls['html_url'],
79
- block_webrtc=False,
80
- allow_webgl=True
81
- )).status == 200
82
-
83
- assert (await fetcher.async_fetch(
84
- urls['html_url'],
85
- block_webrtc=True,
86
- allow_webgl=False
87
- )).status == 200
88
-
89
- assert (await fetcher.async_fetch(
90
- urls['html_url'],
91
- extra_headers={'ayo': ''},
92
- os_randomize=True
93
- )).status == 200
94
 
95
  async def test_infinite_timeout(self, fetcher, urls):
96
  """Test if infinite timeout breaks the code or not"""
97
- assert (await fetcher.async_fetch(urls['delayed_url'], timeout=None)).status == 200
 
 
 
17
  def urls(self, httpbin):
18
  url = httpbin.url
19
  return {
20
+ "status_200": f"{url}/status/200",
21
+ "status_404": f"{url}/status/404",
22
+ "status_501": f"{url}/status/501",
23
+ "basic_url": f"{url}/get",
24
+ "html_url": f"{url}/html",
25
+ "delayed_url": f"{url}/delay/10", # 10 Seconds delay response
26
+ "cookies_url": f"{url}/cookies/set/test/value",
27
  }
28
 
29
  async def test_basic_fetch(self, fetcher, urls):
30
  """Test doing basic fetch request with multiple statuses"""
31
+ assert (await fetcher.async_fetch(urls["status_200"])).status == 200
32
+ assert (await fetcher.async_fetch(urls["status_404"])).status == 404
33
+ assert (await fetcher.async_fetch(urls["status_501"])).status == 501
34
 
35
  async def test_networkidle(self, fetcher, urls):
36
  """Test if waiting for `networkidle` make page does not finish loading or not"""
37
+ assert (
38
+ await fetcher.async_fetch(urls["basic_url"], network_idle=True)
39
+ ).status == 200
40
 
41
  async def test_blocking_resources(self, fetcher, urls):
42
  """Test if blocking resources make page does not finish loading or not"""
43
+ assert (
44
+ await fetcher.async_fetch(urls["basic_url"], block_images=True)
45
+ ).status == 200
46
+ assert (
47
+ await fetcher.async_fetch(urls["basic_url"], disable_resources=True)
48
+ ).status == 200
49
 
50
  async def test_waiting_selector(self, fetcher, urls):
51
  """Test if waiting for a selector make page does not finish loading or not"""
52
+ assert (
53
+ await fetcher.async_fetch(urls["html_url"], wait_selector="h1")
54
+ ).status == 200
55
+ assert (
56
+ await fetcher.async_fetch(
57
+ urls["html_url"], wait_selector="h1", wait_selector_state="visible"
58
+ )
59
+ ).status == 200
60
 
61
  async def test_cookies_loading(self, fetcher, urls):
62
  """Test if cookies are set after the request"""
63
+ response = await fetcher.async_fetch(urls["cookies_url"])
64
+ assert response.cookies == {"test": "value"}
65
 
66
  async def test_automation(self, fetcher, urls):
67
  """Test if automation break the code or not"""
 
72
  await page.mouse.up()
73
  return page
74
 
75
+ assert (
76
+ await fetcher.async_fetch(urls["html_url"], page_action=scroll_page)
77
+ ).status == 200
78
 
79
  async def test_properties(self, fetcher, urls):
80
  """Test if different arguments breaks the code or not"""
81
+ assert (
82
+ await fetcher.async_fetch(
83
+ urls["html_url"], block_webrtc=True, allow_webgl=True
84
+ )
85
+ ).status == 200
86
+
87
+ assert (
88
+ await fetcher.async_fetch(
89
+ urls["html_url"], block_webrtc=False, allow_webgl=True
90
+ )
91
+ ).status == 200
92
+
93
+ assert (
94
+ await fetcher.async_fetch(
95
+ urls["html_url"], block_webrtc=True, allow_webgl=False
96
+ )
97
+ ).status == 200
98
+
99
+ assert (
100
+ await fetcher.async_fetch(
101
+ urls["html_url"], extra_headers={"ayo": ""}, os_randomize=True
102
+ )
103
+ ).status == 200
104
 
105
  async def test_infinite_timeout(self, fetcher, urls):
106
  """Test if infinite timeout breaks the code or not"""
107
+ assert (
108
+ await fetcher.async_fetch(urls["delayed_url"], timeout=None)
109
+ ).status == 200
tests/fetchers/async/test_httpx.py CHANGED
@@ -16,70 +16,111 @@ class TestAsyncFetcher:
16
  @pytest.fixture(scope="class")
17
  def urls(self, httpbin):
18
  return {
19
- 'status_200': f'{httpbin.url}/status/200',
20
- 'status_404': f'{httpbin.url}/status/404',
21
- 'status_501': f'{httpbin.url}/status/501',
22
- 'basic_url': f'{httpbin.url}/get',
23
- 'post_url': f'{httpbin.url}/post',
24
- 'put_url': f'{httpbin.url}/put',
25
- 'delete_url': f'{httpbin.url}/delete',
26
- 'html_url': f'{httpbin.url}/html'
27
  }
28
 
29
  async def test_basic_get(self, fetcher, urls):
30
  """Test doing basic get request with multiple statuses"""
31
- assert (await fetcher.get(urls['status_200'])).status == 200
32
- assert (await fetcher.get(urls['status_404'])).status == 404
33
- assert (await fetcher.get(urls['status_501'])).status == 501
34
 
35
  async def test_get_properties(self, fetcher, urls):
36
  """Test if different arguments with GET request breaks the code or not"""
37
- assert (await fetcher.get(urls['status_200'], stealthy_headers=True)).status == 200
38
- assert (await fetcher.get(urls['status_200'], follow_redirects=True)).status == 200
39
- assert (await fetcher.get(urls['status_200'], timeout=None)).status == 200
40
- assert (await fetcher.get(
41
- urls['status_200'],
42
- stealthy_headers=True,
43
- follow_redirects=True,
44
- timeout=None
45
- )).status == 200
 
 
 
 
 
 
46
 
47
  async def test_post_properties(self, fetcher, urls):
48
  """Test if different arguments with POST request breaks the code or not"""
49
- assert (await fetcher.post(urls['post_url'], data={'key': 'value'})).status == 200
50
- assert (await fetcher.post(urls['post_url'], data={'key': 'value'}, stealthy_headers=True)).status == 200
51
- assert (await fetcher.post(urls['post_url'], data={'key': 'value'}, follow_redirects=True)).status == 200
52
- assert (await fetcher.post(urls['post_url'], data={'key': 'value'}, timeout=None)).status == 200
53
- assert (await fetcher.post(
54
- urls['post_url'],
55
- data={'key': 'value'},
56
- stealthy_headers=True,
57
- follow_redirects=True,
58
- timeout=None
59
- )).status == 200
 
 
 
 
 
 
 
 
 
 
 
 
 
 
60
 
61
  async def test_put_properties(self, fetcher, urls):
62
  """Test if different arguments with PUT request breaks the code or not"""
63
- assert (await fetcher.put(urls['put_url'], data={'key': 'value'})).status in [200, 405]
64
- assert (await fetcher.put(urls['put_url'], data={'key': 'value'}, stealthy_headers=True)).status in [200, 405]
65
- assert (await fetcher.put(urls['put_url'], data={'key': 'value'}, follow_redirects=True)).status in [200, 405]
66
- assert (await fetcher.put(urls['put_url'], data={'key': 'value'}, timeout=None)).status in [200, 405]
67
- assert (await fetcher.put(
68
- urls['put_url'],
69
- data={'key': 'value'},
70
- stealthy_headers=True,
71
- follow_redirects=True,
72
- timeout=None
73
- )).status in [200, 405]
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
74
 
75
  async def test_delete_properties(self, fetcher, urls):
76
  """Test if different arguments with DELETE request breaks the code or not"""
77
- assert (await fetcher.delete(urls['delete_url'], stealthy_headers=True)).status == 200
78
- assert (await fetcher.delete(urls['delete_url'], follow_redirects=True)).status == 200
79
- assert (await fetcher.delete(urls['delete_url'], timeout=None)).status == 200
80
- assert (await fetcher.delete(
81
- urls['delete_url'],
82
- stealthy_headers=True,
83
- follow_redirects=True,
84
- timeout=None
85
- )).status == 200
 
 
 
 
 
 
 
16
  @pytest.fixture(scope="class")
17
  def urls(self, httpbin):
18
  return {
19
+ "status_200": f"{httpbin.url}/status/200",
20
+ "status_404": f"{httpbin.url}/status/404",
21
+ "status_501": f"{httpbin.url}/status/501",
22
+ "basic_url": f"{httpbin.url}/get",
23
+ "post_url": f"{httpbin.url}/post",
24
+ "put_url": f"{httpbin.url}/put",
25
+ "delete_url": f"{httpbin.url}/delete",
26
+ "html_url": f"{httpbin.url}/html",
27
  }
28
 
29
  async def test_basic_get(self, fetcher, urls):
30
  """Test doing basic get request with multiple statuses"""
31
+ assert (await fetcher.get(urls["status_200"])).status == 200
32
+ assert (await fetcher.get(urls["status_404"])).status == 404
33
+ assert (await fetcher.get(urls["status_501"])).status == 501
34
 
35
  async def test_get_properties(self, fetcher, urls):
36
  """Test if different arguments with GET request breaks the code or not"""
37
+ assert (
38
+ await fetcher.get(urls["status_200"], stealthy_headers=True)
39
+ ).status == 200
40
+ assert (
41
+ await fetcher.get(urls["status_200"], follow_redirects=True)
42
+ ).status == 200
43
+ assert (await fetcher.get(urls["status_200"], timeout=None)).status == 200
44
+ assert (
45
+ await fetcher.get(
46
+ urls["status_200"],
47
+ stealthy_headers=True,
48
+ follow_redirects=True,
49
+ timeout=None,
50
+ )
51
+ ).status == 200
52
 
53
  async def test_post_properties(self, fetcher, urls):
54
  """Test if different arguments with POST request breaks the code or not"""
55
+ assert (
56
+ await fetcher.post(urls["post_url"], data={"key": "value"})
57
+ ).status == 200
58
+ assert (
59
+ await fetcher.post(
60
+ urls["post_url"], data={"key": "value"}, stealthy_headers=True
61
+ )
62
+ ).status == 200
63
+ assert (
64
+ await fetcher.post(
65
+ urls["post_url"], data={"key": "value"}, follow_redirects=True
66
+ )
67
+ ).status == 200
68
+ assert (
69
+ await fetcher.post(urls["post_url"], data={"key": "value"}, timeout=None)
70
+ ).status == 200
71
+ assert (
72
+ await fetcher.post(
73
+ urls["post_url"],
74
+ data={"key": "value"},
75
+ stealthy_headers=True,
76
+ follow_redirects=True,
77
+ timeout=None,
78
+ )
79
+ ).status == 200
80
 
81
  async def test_put_properties(self, fetcher, urls):
82
  """Test if different arguments with PUT request breaks the code or not"""
83
+ assert (await fetcher.put(urls["put_url"], data={"key": "value"})).status in [
84
+ 200,
85
+ 405,
86
+ ]
87
+ assert (
88
+ await fetcher.put(
89
+ urls["put_url"], data={"key": "value"}, stealthy_headers=True
90
+ )
91
+ ).status in [200, 405]
92
+ assert (
93
+ await fetcher.put(
94
+ urls["put_url"], data={"key": "value"}, follow_redirects=True
95
+ )
96
+ ).status in [200, 405]
97
+ assert (
98
+ await fetcher.put(urls["put_url"], data={"key": "value"}, timeout=None)
99
+ ).status in [200, 405]
100
+ assert (
101
+ await fetcher.put(
102
+ urls["put_url"],
103
+ data={"key": "value"},
104
+ stealthy_headers=True,
105
+ follow_redirects=True,
106
+ timeout=None,
107
+ )
108
+ ).status in [200, 405]
109
 
110
  async def test_delete_properties(self, fetcher, urls):
111
  """Test if different arguments with DELETE request breaks the code or not"""
112
+ assert (
113
+ await fetcher.delete(urls["delete_url"], stealthy_headers=True)
114
+ ).status == 200
115
+ assert (
116
+ await fetcher.delete(urls["delete_url"], follow_redirects=True)
117
+ ).status == 200
118
+ assert (await fetcher.delete(urls["delete_url"], timeout=None)).status == 200
119
+ assert (
120
+ await fetcher.delete(
121
+ urls["delete_url"],
122
+ stealthy_headers=True,
123
+ follow_redirects=True,
124
+ timeout=None,
125
+ )
126
+ ).status == 200
tests/fetchers/async/test_playwright.py CHANGED
@@ -15,87 +15,97 @@ class TestPlayWrightFetcherAsync:
15
  @pytest.fixture
16
  def urls(self, httpbin):
17
  return {
18
- 'status_200': f'{httpbin.url}/status/200',
19
- 'status_404': f'{httpbin.url}/status/404',
20
- 'status_501': f'{httpbin.url}/status/501',
21
- 'basic_url': f'{httpbin.url}/get',
22
- 'html_url': f'{httpbin.url}/html',
23
- 'delayed_url': f'{httpbin.url}/delay/10',
24
- 'cookies_url': f"{httpbin.url}/cookies/set/test/value"
25
  }
26
 
27
  @pytest.mark.asyncio
28
  async def test_basic_fetch(self, fetcher, urls):
29
  """Test doing basic fetch request with multiple statuses"""
30
- response = await fetcher.async_fetch(urls['status_200'])
31
  assert response.status == 200
32
 
33
  @pytest.mark.asyncio
34
  async def test_networkidle(self, fetcher, urls):
35
  """Test if waiting for `networkidle` make page does not finish loading or not"""
36
- response = await fetcher.async_fetch(urls['basic_url'], network_idle=True)
37
  assert response.status == 200
38
 
39
  @pytest.mark.asyncio
40
  async def test_blocking_resources(self, fetcher, urls):
41
  """Test if blocking resources make page does not finish loading or not"""
42
- response = await fetcher.async_fetch(urls['basic_url'], disable_resources=True)
43
  assert response.status == 200
44
 
45
  @pytest.mark.asyncio
46
  async def test_waiting_selector(self, fetcher, urls):
47
  """Test if waiting for a selector make page does not finish loading or not"""
48
- response1 = await fetcher.async_fetch(urls['html_url'], wait_selector='h1')
49
  assert response1.status == 200
50
 
51
- response2 = await fetcher.async_fetch(urls['html_url'], wait_selector='h1', wait_selector_state='visible')
 
 
52
  assert response2.status == 200
53
 
54
  @pytest.mark.asyncio
55
  async def test_cookies_loading(self, fetcher, urls):
56
  """Test if cookies are set after the request"""
57
- response = await fetcher.async_fetch(urls['cookies_url'])
58
- assert response.cookies == {'test': 'value'}
59
 
60
  @pytest.mark.asyncio
61
  async def test_automation(self, fetcher, urls):
62
  """Test if automation break the code or not"""
 
63
  async def scroll_page(page):
64
  await page.mouse.wheel(10, 0)
65
  await page.mouse.move(100, 400)
66
  await page.mouse.up()
67
  return page
68
 
69
- response = await fetcher.async_fetch(urls['html_url'], page_action=scroll_page)
70
  assert response.status == 200
71
 
72
- @pytest.mark.parametrize("kwargs", [
73
- {"disable_webgl": True, "hide_canvas": False},
74
- {"disable_webgl": False, "hide_canvas": True},
75
- # {"stealth": True}, # causes issues with Github Actions
76
- {"useragent": 'Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:131.0) Gecko/20100101 Firefox/131.0'},
77
- {"extra_headers": {'ayo': ''}}
78
- ])
 
 
 
 
 
79
  @pytest.mark.asyncio
80
  async def test_properties(self, fetcher, urls, kwargs):
81
  """Test if different arguments breaks the code or not"""
82
- response = await fetcher.async_fetch(urls['html_url'], **kwargs)
83
  assert response.status == 200
84
 
85
  @pytest.mark.asyncio
86
  async def test_cdp_url_invalid(self, fetcher, urls):
87
  """Test if invalid CDP URLs raise appropriate exceptions"""
88
  with pytest.raises(ValueError):
89
- await fetcher.async_fetch(urls['html_url'], cdp_url='blahblah')
90
 
91
  with pytest.raises(ValueError):
92
- await fetcher.async_fetch(urls['html_url'], cdp_url='blahblah', nstbrowser_mode=True)
 
 
93
 
94
  with pytest.raises(Exception):
95
- await fetcher.async_fetch(urls['html_url'], cdp_url='ws://blahblah')
96
 
97
  @pytest.mark.asyncio
98
  async def test_infinite_timeout(self, fetcher, urls):
99
  """Test if infinite timeout breaks the code or not"""
100
- response = await fetcher.async_fetch(urls['delayed_url'], timeout=None)
101
  assert response.status == 200
 
15
  @pytest.fixture
16
  def urls(self, httpbin):
17
  return {
18
+ "status_200": f"{httpbin.url}/status/200",
19
+ "status_404": f"{httpbin.url}/status/404",
20
+ "status_501": f"{httpbin.url}/status/501",
21
+ "basic_url": f"{httpbin.url}/get",
22
+ "html_url": f"{httpbin.url}/html",
23
+ "delayed_url": f"{httpbin.url}/delay/10",
24
+ "cookies_url": f"{httpbin.url}/cookies/set/test/value",
25
  }
26
 
27
  @pytest.mark.asyncio
28
  async def test_basic_fetch(self, fetcher, urls):
29
  """Test doing basic fetch request with multiple statuses"""
30
+ response = await fetcher.async_fetch(urls["status_200"])
31
  assert response.status == 200
32
 
33
  @pytest.mark.asyncio
34
  async def test_networkidle(self, fetcher, urls):
35
  """Test if waiting for `networkidle` make page does not finish loading or not"""
36
+ response = await fetcher.async_fetch(urls["basic_url"], network_idle=True)
37
  assert response.status == 200
38
 
39
  @pytest.mark.asyncio
40
  async def test_blocking_resources(self, fetcher, urls):
41
  """Test if blocking resources make page does not finish loading or not"""
42
+ response = await fetcher.async_fetch(urls["basic_url"], disable_resources=True)
43
  assert response.status == 200
44
 
45
  @pytest.mark.asyncio
46
  async def test_waiting_selector(self, fetcher, urls):
47
  """Test if waiting for a selector make page does not finish loading or not"""
48
+ response1 = await fetcher.async_fetch(urls["html_url"], wait_selector="h1")
49
  assert response1.status == 200
50
 
51
+ response2 = await fetcher.async_fetch(
52
+ urls["html_url"], wait_selector="h1", wait_selector_state="visible"
53
+ )
54
  assert response2.status == 200
55
 
56
  @pytest.mark.asyncio
57
  async def test_cookies_loading(self, fetcher, urls):
58
  """Test if cookies are set after the request"""
59
+ response = await fetcher.async_fetch(urls["cookies_url"])
60
+ assert response.cookies == {"test": "value"}
61
 
62
  @pytest.mark.asyncio
63
  async def test_automation(self, fetcher, urls):
64
  """Test if automation break the code or not"""
65
+
66
  async def scroll_page(page):
67
  await page.mouse.wheel(10, 0)
68
  await page.mouse.move(100, 400)
69
  await page.mouse.up()
70
  return page
71
 
72
+ response = await fetcher.async_fetch(urls["html_url"], page_action=scroll_page)
73
  assert response.status == 200
74
 
75
+ @pytest.mark.parametrize(
76
+ "kwargs",
77
+ [
78
+ {"disable_webgl": True, "hide_canvas": False},
79
+ {"disable_webgl": False, "hide_canvas": True},
80
+ # {"stealth": True}, # causes issues with Github Actions
81
+ {
82
+ "useragent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:131.0) Gecko/20100101 Firefox/131.0"
83
+ },
84
+ {"extra_headers": {"ayo": ""}},
85
+ ],
86
+ )
87
  @pytest.mark.asyncio
88
  async def test_properties(self, fetcher, urls, kwargs):
89
  """Test if different arguments breaks the code or not"""
90
+ response = await fetcher.async_fetch(urls["html_url"], **kwargs)
91
  assert response.status == 200
92
 
93
  @pytest.mark.asyncio
94
  async def test_cdp_url_invalid(self, fetcher, urls):
95
  """Test if invalid CDP URLs raise appropriate exceptions"""
96
  with pytest.raises(ValueError):
97
+ await fetcher.async_fetch(urls["html_url"], cdp_url="blahblah")
98
 
99
  with pytest.raises(ValueError):
100
+ await fetcher.async_fetch(
101
+ urls["html_url"], cdp_url="blahblah", nstbrowser_mode=True
102
+ )
103
 
104
  with pytest.raises(Exception):
105
+ await fetcher.async_fetch(urls["html_url"], cdp_url="ws://blahblah")
106
 
107
  @pytest.mark.asyncio
108
  async def test_infinite_timeout(self, fetcher, urls):
109
  """Test if infinite timeout breaks the code or not"""
110
+ response = await fetcher.async_fetch(urls["delayed_url"], timeout=None)
111
  assert response.status == 200
tests/fetchers/sync/test_camoufox.py CHANGED
@@ -16,12 +16,12 @@ class TestStealthyFetcher:
16
  @pytest.fixture(autouse=True)
17
  def setup_urls(self, httpbin):
18
  """Fixture to set up URLs for testing"""
19
- self.status_200 = f'{httpbin.url}/status/200'
20
- self.status_404 = f'{httpbin.url}/status/404'
21
- self.status_501 = f'{httpbin.url}/status/501'
22
- self.basic_url = f'{httpbin.url}/get'
23
- self.html_url = f'{httpbin.url}/html'
24
- self.delayed_url = f'{httpbin.url}/delay/10' # 10 Seconds delay response
25
  self.cookies_url = f"{httpbin.url}/cookies/set/test/value"
26
 
27
  def test_basic_fetch(self, fetcher):
@@ -41,15 +41,21 @@ class TestStealthyFetcher:
41
 
42
  def test_waiting_selector(self, fetcher):
43
  """Test if waiting for a selector make page does not finish loading or not"""
44
- assert fetcher.fetch(self.html_url, wait_selector='h1').status == 200
45
- assert fetcher.fetch(self.html_url, wait_selector='h1', wait_selector_state='visible').status == 200
 
 
 
 
 
46
 
47
  def test_cookies_loading(self, fetcher):
48
  """Test if cookies are set after the request"""
49
- assert fetcher.fetch(self.cookies_url).cookies == {'test': 'value'}
50
 
51
  def test_automation(self, fetcher):
52
  """Test if automation break the code or not"""
 
53
  def scroll_page(page):
54
  page.mouse.wheel(10, 0)
55
  page.mouse.move(100, 400)
@@ -60,10 +66,24 @@ class TestStealthyFetcher:
60
 
61
  def test_properties(self, fetcher):
62
  """Test if different arguments breaks the code or not"""
63
- assert fetcher.fetch(self.html_url, block_webrtc=True, allow_webgl=True).status == 200
64
- assert fetcher.fetch(self.html_url, block_webrtc=False, allow_webgl=True).status == 200
65
- assert fetcher.fetch(self.html_url, block_webrtc=True, allow_webgl=False).status == 200
66
- assert fetcher.fetch(self.html_url, extra_headers={'ayo': ''}, os_randomize=True).status == 200
 
 
 
 
 
 
 
 
 
 
 
 
 
 
67
 
68
  def test_infinite_timeout(self, fetcher):
69
  """Test if infinite timeout breaks the code or not"""
 
16
  @pytest.fixture(autouse=True)
17
  def setup_urls(self, httpbin):
18
  """Fixture to set up URLs for testing"""
19
+ self.status_200 = f"{httpbin.url}/status/200"
20
+ self.status_404 = f"{httpbin.url}/status/404"
21
+ self.status_501 = f"{httpbin.url}/status/501"
22
+ self.basic_url = f"{httpbin.url}/get"
23
+ self.html_url = f"{httpbin.url}/html"
24
+ self.delayed_url = f"{httpbin.url}/delay/10" # 10 Seconds delay response
25
  self.cookies_url = f"{httpbin.url}/cookies/set/test/value"
26
 
27
  def test_basic_fetch(self, fetcher):
 
41
 
42
  def test_waiting_selector(self, fetcher):
43
  """Test if waiting for a selector make page does not finish loading or not"""
44
+ assert fetcher.fetch(self.html_url, wait_selector="h1").status == 200
45
+ assert (
46
+ fetcher.fetch(
47
+ self.html_url, wait_selector="h1", wait_selector_state="visible"
48
+ ).status
49
+ == 200
50
+ )
51
 
52
  def test_cookies_loading(self, fetcher):
53
  """Test if cookies are set after the request"""
54
+ assert fetcher.fetch(self.cookies_url).cookies == {"test": "value"}
55
 
56
  def test_automation(self, fetcher):
57
  """Test if automation break the code or not"""
58
+
59
  def scroll_page(page):
60
  page.mouse.wheel(10, 0)
61
  page.mouse.move(100, 400)
 
66
 
67
  def test_properties(self, fetcher):
68
  """Test if different arguments breaks the code or not"""
69
+ assert (
70
+ fetcher.fetch(self.html_url, block_webrtc=True, allow_webgl=True).status
71
+ == 200
72
+ )
73
+ assert (
74
+ fetcher.fetch(self.html_url, block_webrtc=False, allow_webgl=True).status
75
+ == 200
76
+ )
77
+ assert (
78
+ fetcher.fetch(self.html_url, block_webrtc=True, allow_webgl=False).status
79
+ == 200
80
+ )
81
+ assert (
82
+ fetcher.fetch(
83
+ self.html_url, extra_headers={"ayo": ""}, os_randomize=True
84
+ ).status
85
+ == 200
86
+ )
87
 
88
  def test_infinite_timeout(self, fetcher):
89
  """Test if infinite timeout breaks the code or not"""
tests/fetchers/sync/test_httpx.py CHANGED
@@ -16,14 +16,14 @@ class TestFetcher:
16
  @pytest.fixture(autouse=True)
17
  def setup_urls(self, httpbin):
18
  """Fixture to set up URLs for testing"""
19
- self.status_200 = f'{httpbin.url}/status/200'
20
- self.status_404 = f'{httpbin.url}/status/404'
21
- self.status_501 = f'{httpbin.url}/status/501'
22
- self.basic_url = f'{httpbin.url}/get'
23
- self.post_url = f'{httpbin.url}/post'
24
- self.put_url = f'{httpbin.url}/put'
25
- self.delete_url = f'{httpbin.url}/delete'
26
- self.html_url = f'{httpbin.url}/html'
27
 
28
  def test_basic_get(self, fetcher):
29
  """Test doing basic get request with multiple statuses"""
@@ -36,49 +36,86 @@ class TestFetcher:
36
  assert fetcher.get(self.status_200, stealthy_headers=True).status == 200
37
  assert fetcher.get(self.status_200, follow_redirects=True).status == 200
38
  assert fetcher.get(self.status_200, timeout=None).status == 200
39
- assert fetcher.get(
40
- self.status_200,
41
- stealthy_headers=True,
42
- follow_redirects=True,
43
- timeout=None
44
- ).status == 200
 
 
 
45
 
46
  def test_post_properties(self, fetcher):
47
  """Test if different arguments with POST request breaks the code or not"""
48
- assert fetcher.post(self.post_url, data={'key': 'value'}).status == 200
49
- assert fetcher.post(self.post_url, data={'key': 'value'}, stealthy_headers=True).status == 200
50
- assert fetcher.post(self.post_url, data={'key': 'value'}, follow_redirects=True).status == 200
51
- assert fetcher.post(self.post_url, data={'key': 'value'}, timeout=None).status == 200
52
- assert fetcher.post(
53
- self.post_url,
54
- data={'key': 'value'},
55
- stealthy_headers=True,
56
- follow_redirects=True,
57
- timeout=None
58
- ).status == 200
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
59
 
60
  def test_put_properties(self, fetcher):
61
  """Test if different arguments with PUT request breaks the code or not"""
62
- assert fetcher.put(self.put_url, data={'key': 'value'}).status == 200
63
- assert fetcher.put(self.put_url, data={'key': 'value'}, stealthy_headers=True).status == 200
64
- assert fetcher.put(self.put_url, data={'key': 'value'}, follow_redirects=True).status == 200
65
- assert fetcher.put(self.put_url, data={'key': 'value'}, timeout=None).status == 200
66
- assert fetcher.put(
67
- self.put_url,
68
- data={'key': 'value'},
69
- stealthy_headers=True,
70
- follow_redirects=True,
71
- timeout=None
72
- ).status == 200
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
73
 
74
  def test_delete_properties(self, fetcher):
75
  """Test if different arguments with DELETE request breaks the code or not"""
76
  assert fetcher.delete(self.delete_url, stealthy_headers=True).status == 200
77
  assert fetcher.delete(self.delete_url, follow_redirects=True).status == 200
78
  assert fetcher.delete(self.delete_url, timeout=None).status == 200
79
- assert fetcher.delete(
80
- self.delete_url,
81
- stealthy_headers=True,
82
- follow_redirects=True,
83
- timeout=None
84
- ).status == 200
 
 
 
 
16
  @pytest.fixture(autouse=True)
17
  def setup_urls(self, httpbin):
18
  """Fixture to set up URLs for testing"""
19
+ self.status_200 = f"{httpbin.url}/status/200"
20
+ self.status_404 = f"{httpbin.url}/status/404"
21
+ self.status_501 = f"{httpbin.url}/status/501"
22
+ self.basic_url = f"{httpbin.url}/get"
23
+ self.post_url = f"{httpbin.url}/post"
24
+ self.put_url = f"{httpbin.url}/put"
25
+ self.delete_url = f"{httpbin.url}/delete"
26
+ self.html_url = f"{httpbin.url}/html"
27
 
28
  def test_basic_get(self, fetcher):
29
  """Test doing basic get request with multiple statuses"""
 
36
  assert fetcher.get(self.status_200, stealthy_headers=True).status == 200
37
  assert fetcher.get(self.status_200, follow_redirects=True).status == 200
38
  assert fetcher.get(self.status_200, timeout=None).status == 200
39
+ assert (
40
+ fetcher.get(
41
+ self.status_200,
42
+ stealthy_headers=True,
43
+ follow_redirects=True,
44
+ timeout=None,
45
+ ).status
46
+ == 200
47
+ )
48
 
49
  def test_post_properties(self, fetcher):
50
  """Test if different arguments with POST request breaks the code or not"""
51
+ assert fetcher.post(self.post_url, data={"key": "value"}).status == 200
52
+ assert (
53
+ fetcher.post(
54
+ self.post_url, data={"key": "value"}, stealthy_headers=True
55
+ ).status
56
+ == 200
57
+ )
58
+ assert (
59
+ fetcher.post(
60
+ self.post_url, data={"key": "value"}, follow_redirects=True
61
+ ).status
62
+ == 200
63
+ )
64
+ assert (
65
+ fetcher.post(self.post_url, data={"key": "value"}, timeout=None).status
66
+ == 200
67
+ )
68
+ assert (
69
+ fetcher.post(
70
+ self.post_url,
71
+ data={"key": "value"},
72
+ stealthy_headers=True,
73
+ follow_redirects=True,
74
+ timeout=None,
75
+ ).status
76
+ == 200
77
+ )
78
 
79
  def test_put_properties(self, fetcher):
80
  """Test if different arguments with PUT request breaks the code or not"""
81
+ assert fetcher.put(self.put_url, data={"key": "value"}).status == 200
82
+ assert (
83
+ fetcher.put(
84
+ self.put_url, data={"key": "value"}, stealthy_headers=True
85
+ ).status
86
+ == 200
87
+ )
88
+ assert (
89
+ fetcher.put(
90
+ self.put_url, data={"key": "value"}, follow_redirects=True
91
+ ).status
92
+ == 200
93
+ )
94
+ assert (
95
+ fetcher.put(self.put_url, data={"key": "value"}, timeout=None).status == 200
96
+ )
97
+ assert (
98
+ fetcher.put(
99
+ self.put_url,
100
+ data={"key": "value"},
101
+ stealthy_headers=True,
102
+ follow_redirects=True,
103
+ timeout=None,
104
+ ).status
105
+ == 200
106
+ )
107
 
108
  def test_delete_properties(self, fetcher):
109
  """Test if different arguments with DELETE request breaks the code or not"""
110
  assert fetcher.delete(self.delete_url, stealthy_headers=True).status == 200
111
  assert fetcher.delete(self.delete_url, follow_redirects=True).status == 200
112
  assert fetcher.delete(self.delete_url, timeout=None).status == 200
113
+ assert (
114
+ fetcher.delete(
115
+ self.delete_url,
116
+ stealthy_headers=True,
117
+ follow_redirects=True,
118
+ timeout=None,
119
+ ).status
120
+ == 200
121
+ )
tests/fetchers/sync/test_playwright.py CHANGED
@@ -8,7 +8,6 @@ PlayWrightFetcher.auto_match = True
8
 
9
  @pytest_httpbin.use_class_based_httpbin
10
  class TestPlayWrightFetcher:
11
-
12
  @pytest.fixture(scope="class")
13
  def fetcher(self):
14
  """Fixture to create a StealthyFetcher instance for the entire test class"""
@@ -17,12 +16,12 @@ class TestPlayWrightFetcher:
17
  @pytest.fixture(autouse=True)
18
  def setup_urls(self, httpbin):
19
  """Fixture to set up URLs for testing"""
20
- self.status_200 = f'{httpbin.url}/status/200'
21
- self.status_404 = f'{httpbin.url}/status/404'
22
- self.status_501 = f'{httpbin.url}/status/501'
23
- self.basic_url = f'{httpbin.url}/get'
24
- self.html_url = f'{httpbin.url}/html'
25
- self.delayed_url = f'{httpbin.url}/delay/10' # 10 Seconds delay response
26
  self.cookies_url = f"{httpbin.url}/cookies/set/test/value"
27
 
28
  def test_basic_fetch(self, fetcher):
@@ -42,12 +41,17 @@ class TestPlayWrightFetcher:
42
 
43
  def test_waiting_selector(self, fetcher):
44
  """Test if waiting for a selector make page does not finish loading or not"""
45
- assert fetcher.fetch(self.html_url, wait_selector='h1').status == 200
46
- assert fetcher.fetch(self.html_url, wait_selector='h1', wait_selector_state='visible').status == 200
 
 
 
 
 
47
 
48
  def test_cookies_loading(self, fetcher):
49
  """Test if cookies are set after the request"""
50
- assert fetcher.fetch(self.cookies_url).cookies == {'test': 'value'}
51
 
52
  def test_automation(self, fetcher):
53
  """Test if automation break the code or not"""
@@ -60,13 +64,18 @@ class TestPlayWrightFetcher:
60
 
61
  assert fetcher.fetch(self.html_url, page_action=scroll_page).status == 200
62
 
63
- @pytest.mark.parametrize("kwargs", [
64
- {"disable_webgl": True, "hide_canvas": False},
65
- {"disable_webgl": False, "hide_canvas": True},
66
- # {"stealth": True}, # causes issues with Github Actions
67
- {"useragent": 'Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:131.0) Gecko/20100101 Firefox/131.0'},
68
- {"extra_headers": {'ayo': ''}}
69
- ])
 
 
 
 
 
70
  def test_properties(self, fetcher, kwargs):
71
  """Test if different arguments breaks the code or not"""
72
  response = fetcher.fetch(self.html_url, **kwargs)
@@ -75,15 +84,18 @@ class TestPlayWrightFetcher:
75
  def test_cdp_url_invalid(self, fetcher):
76
  """Test if invalid CDP URLs raise appropriate exceptions"""
77
  with pytest.raises(ValueError):
78
- fetcher.fetch(self.html_url, cdp_url='blahblah')
79
 
80
  with pytest.raises(ValueError):
81
- fetcher.fetch(self.html_url, cdp_url='blahblah', nstbrowser_mode=True)
82
 
83
  with pytest.raises(Exception):
84
- fetcher.fetch(self.html_url, cdp_url='ws://blahblah')
85
 
86
- def test_infinite_timeout(self, fetcher, ):
 
 
 
87
  """Test if infinite timeout breaks the code or not"""
88
  response = fetcher.fetch(self.delayed_url, timeout=None)
89
  assert response.status == 200
 
8
 
9
  @pytest_httpbin.use_class_based_httpbin
10
  class TestPlayWrightFetcher:
 
11
  @pytest.fixture(scope="class")
12
  def fetcher(self):
13
  """Fixture to create a StealthyFetcher instance for the entire test class"""
 
16
  @pytest.fixture(autouse=True)
17
  def setup_urls(self, httpbin):
18
  """Fixture to set up URLs for testing"""
19
+ self.status_200 = f"{httpbin.url}/status/200"
20
+ self.status_404 = f"{httpbin.url}/status/404"
21
+ self.status_501 = f"{httpbin.url}/status/501"
22
+ self.basic_url = f"{httpbin.url}/get"
23
+ self.html_url = f"{httpbin.url}/html"
24
+ self.delayed_url = f"{httpbin.url}/delay/10" # 10 Seconds delay response
25
  self.cookies_url = f"{httpbin.url}/cookies/set/test/value"
26
 
27
  def test_basic_fetch(self, fetcher):
 
41
 
42
  def test_waiting_selector(self, fetcher):
43
  """Test if waiting for a selector make page does not finish loading or not"""
44
+ assert fetcher.fetch(self.html_url, wait_selector="h1").status == 200
45
+ assert (
46
+ fetcher.fetch(
47
+ self.html_url, wait_selector="h1", wait_selector_state="visible"
48
+ ).status
49
+ == 200
50
+ )
51
 
52
  def test_cookies_loading(self, fetcher):
53
  """Test if cookies are set after the request"""
54
+ assert fetcher.fetch(self.cookies_url).cookies == {"test": "value"}
55
 
56
  def test_automation(self, fetcher):
57
  """Test if automation break the code or not"""
 
64
 
65
  assert fetcher.fetch(self.html_url, page_action=scroll_page).status == 200
66
 
67
+ @pytest.mark.parametrize(
68
+ "kwargs",
69
+ [
70
+ {"disable_webgl": True, "hide_canvas": False},
71
+ {"disable_webgl": False, "hide_canvas": True},
72
+ # {"stealth": True}, # causes issues with Github Actions
73
+ {
74
+ "useragent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:131.0) Gecko/20100101 Firefox/131.0"
75
+ },
76
+ {"extra_headers": {"ayo": ""}},
77
+ ],
78
+ )
79
  def test_properties(self, fetcher, kwargs):
80
  """Test if different arguments breaks the code or not"""
81
  response = fetcher.fetch(self.html_url, **kwargs)
 
84
  def test_cdp_url_invalid(self, fetcher):
85
  """Test if invalid CDP URLs raise appropriate exceptions"""
86
  with pytest.raises(ValueError):
87
+ fetcher.fetch(self.html_url, cdp_url="blahblah")
88
 
89
  with pytest.raises(ValueError):
90
+ fetcher.fetch(self.html_url, cdp_url="blahblah", nstbrowser_mode=True)
91
 
92
  with pytest.raises(Exception):
93
+ fetcher.fetch(self.html_url, cdp_url="ws://blahblah")
94
 
95
+ def test_infinite_timeout(
96
+ self,
97
+ fetcher,
98
+ ):
99
  """Test if infinite timeout breaks the code or not"""
100
  response = fetcher.fetch(self.delayed_url, timeout=None)
101
  assert response.status == 200
tests/fetchers/test_utils.py CHANGED
@@ -7,76 +7,117 @@ from scrapling.engines.toolbelt.custom import ResponseEncoding, StatusText
7
  def content_type_map():
8
  return {
9
  # A map generated by ChatGPT for most possible `content_type` values and the expected outcome
10
- 'text/html; charset=UTF-8': 'UTF-8',
11
- 'text/html; charset=ISO-8859-1': 'ISO-8859-1',
12
- 'text/html': 'ISO-8859-1',
13
- 'application/json; charset=UTF-8': 'UTF-8',
14
- 'application/json': 'utf-8',
15
- 'text/json': 'utf-8',
16
- 'application/javascript; charset=UTF-8': 'UTF-8',
17
- 'application/javascript': 'utf-8',
18
- 'text/plain; charset=UTF-8': 'UTF-8',
19
- 'text/plain; charset=ISO-8859-1': 'ISO-8859-1',
20
- 'text/plain': 'ISO-8859-1',
21
- 'application/xhtml+xml; charset=UTF-8': 'UTF-8',
22
- 'application/xhtml+xml': 'utf-8',
23
- 'text/html; charset=windows-1252': 'windows-1252',
24
- 'application/json; charset=windows-1252': 'windows-1252',
25
- 'text/plain; charset=windows-1252': 'windows-1252',
26
- 'text/html; charset="UTF-8"': 'UTF-8',
27
- 'text/html; charset="ISO-8859-1"': 'ISO-8859-1',
28
- 'text/html; charset="windows-1252"': 'windows-1252',
29
- 'application/json; charset="UTF-8"': 'UTF-8',
30
- 'application/json; charset="ISO-8859-1"': 'ISO-8859-1',
31
- 'application/json; charset="windows-1252"': 'windows-1252',
32
- 'text/json; charset="UTF-8"': 'UTF-8',
33
- 'application/javascript; charset="UTF-8"': 'UTF-8',
34
- 'application/javascript; charset="ISO-8859-1"': 'ISO-8859-1',
35
- 'text/plain; charset="UTF-8"': 'UTF-8',
36
- 'text/plain; charset="ISO-8859-1"': 'ISO-8859-1',
37
- 'text/plain; charset="windows-1252"': 'windows-1252',
38
- 'application/xhtml+xml; charset="UTF-8"': 'UTF-8',
39
- 'application/xhtml+xml; charset="ISO-8859-1"': 'ISO-8859-1',
40
- 'application/xhtml+xml; charset="windows-1252"': 'windows-1252',
41
- 'text/html; charset="US-ASCII"': 'US-ASCII',
42
- 'application/json; charset="US-ASCII"': 'US-ASCII',
43
- 'text/plain; charset="US-ASCII"': 'US-ASCII',
44
- 'text/html; charset="Shift_JIS"': 'Shift_JIS',
45
- 'application/json; charset="Shift_JIS"': 'Shift_JIS',
46
- 'text/plain; charset="Shift_JIS"': 'Shift_JIS',
47
- 'application/xml; charset="UTF-8"': 'UTF-8',
48
- 'application/xml; charset="ISO-8859-1"': 'ISO-8859-1',
49
- 'application/xml': 'utf-8',
50
- 'text/xml; charset="UTF-8"': 'UTF-8',
51
- 'text/xml; charset="ISO-8859-1"': 'ISO-8859-1',
52
- 'text/xml': 'utf-8'
53
  }
54
 
55
 
56
  @pytest.fixture
57
  def status_map():
58
  return {
59
- 100: "Continue", 101: "Switching Protocols", 102: "Processing", 103: "Early Hints",
60
- 200: "OK", 201: "Created", 202: "Accepted", 203: "Non-Authoritative Information",
61
- 204: "No Content", 205: "Reset Content", 206: "Partial Content", 207: "Multi-Status",
62
- 208: "Already Reported", 226: "IM Used", 300: "Multiple Choices",
63
- 301: "Moved Permanently", 302: "Found", 303: "See Other", 304: "Not Modified",
64
- 305: "Use Proxy", 307: "Temporary Redirect", 308: "Permanent Redirect",
65
- 400: "Bad Request", 401: "Unauthorized", 402: "Payment Required", 403: "Forbidden",
66
- 404: "Not Found", 405: "Method Not Allowed", 406: "Not Acceptable",
67
- 407: "Proxy Authentication Required", 408: "Request Timeout", 409: "Conflict",
68
- 410: "Gone", 411: "Length Required", 412: "Precondition Failed",
69
- 413: "Payload Too Large", 414: "URI Too Long", 415: "Unsupported Media Type",
70
- 416: "Range Not Satisfiable", 417: "Expectation Failed", 418: "I'm a teapot",
71
- 421: "Misdirected Request", 422: "Unprocessable Entity", 423: "Locked",
72
- 424: "Failed Dependency", 425: "Too Early", 426: "Upgrade Required",
73
- 428: "Precondition Required", 429: "Too Many Requests",
74
- 431: "Request Header Fields Too Large", 451: "Unavailable For Legal Reasons",
75
- 500: "Internal Server Error", 501: "Not Implemented", 502: "Bad Gateway",
76
- 503: "Service Unavailable", 504: "Gateway Timeout",
77
- 505: "HTTP Version Not Supported", 506: "Variant Also Negotiates",
78
- 507: "Insufficient Storage", 508: "Loop Detected", 510: "Not Extended",
79
- 511: "Network Authentication Required"
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
80
  }
81
 
82
 
 
7
  def content_type_map():
8
  return {
9
  # A map generated by ChatGPT for most possible `content_type` values and the expected outcome
10
+ "text/html; charset=UTF-8": "UTF-8",
11
+ "text/html; charset=ISO-8859-1": "ISO-8859-1",
12
+ "text/html": "ISO-8859-1",
13
+ "application/json; charset=UTF-8": "UTF-8",
14
+ "application/json": "utf-8",
15
+ "text/json": "utf-8",
16
+ "application/javascript; charset=UTF-8": "UTF-8",
17
+ "application/javascript": "utf-8",
18
+ "text/plain; charset=UTF-8": "UTF-8",
19
+ "text/plain; charset=ISO-8859-1": "ISO-8859-1",
20
+ "text/plain": "ISO-8859-1",
21
+ "application/xhtml+xml; charset=UTF-8": "UTF-8",
22
+ "application/xhtml+xml": "utf-8",
23
+ "text/html; charset=windows-1252": "windows-1252",
24
+ "application/json; charset=windows-1252": "windows-1252",
25
+ "text/plain; charset=windows-1252": "windows-1252",
26
+ 'text/html; charset="UTF-8"': "UTF-8",
27
+ 'text/html; charset="ISO-8859-1"': "ISO-8859-1",
28
+ 'text/html; charset="windows-1252"': "windows-1252",
29
+ 'application/json; charset="UTF-8"': "UTF-8",
30
+ 'application/json; charset="ISO-8859-1"': "ISO-8859-1",
31
+ 'application/json; charset="windows-1252"': "windows-1252",
32
+ 'text/json; charset="UTF-8"': "UTF-8",
33
+ 'application/javascript; charset="UTF-8"': "UTF-8",
34
+ 'application/javascript; charset="ISO-8859-1"': "ISO-8859-1",
35
+ 'text/plain; charset="UTF-8"': "UTF-8",
36
+ 'text/plain; charset="ISO-8859-1"': "ISO-8859-1",
37
+ 'text/plain; charset="windows-1252"': "windows-1252",
38
+ 'application/xhtml+xml; charset="UTF-8"': "UTF-8",
39
+ 'application/xhtml+xml; charset="ISO-8859-1"': "ISO-8859-1",
40
+ 'application/xhtml+xml; charset="windows-1252"': "windows-1252",
41
+ 'text/html; charset="US-ASCII"': "US-ASCII",
42
+ 'application/json; charset="US-ASCII"': "US-ASCII",
43
+ 'text/plain; charset="US-ASCII"': "US-ASCII",
44
+ 'text/html; charset="Shift_JIS"': "Shift_JIS",
45
+ 'application/json; charset="Shift_JIS"': "Shift_JIS",
46
+ 'text/plain; charset="Shift_JIS"': "Shift_JIS",
47
+ 'application/xml; charset="UTF-8"': "UTF-8",
48
+ 'application/xml; charset="ISO-8859-1"': "ISO-8859-1",
49
+ "application/xml": "utf-8",
50
+ 'text/xml; charset="UTF-8"': "UTF-8",
51
+ 'text/xml; charset="ISO-8859-1"': "ISO-8859-1",
52
+ "text/xml": "utf-8",
53
  }
54
 
55
 
56
  @pytest.fixture
57
  def status_map():
58
  return {
59
+ 100: "Continue",
60
+ 101: "Switching Protocols",
61
+ 102: "Processing",
62
+ 103: "Early Hints",
63
+ 200: "OK",
64
+ 201: "Created",
65
+ 202: "Accepted",
66
+ 203: "Non-Authoritative Information",
67
+ 204: "No Content",
68
+ 205: "Reset Content",
69
+ 206: "Partial Content",
70
+ 207: "Multi-Status",
71
+ 208: "Already Reported",
72
+ 226: "IM Used",
73
+ 300: "Multiple Choices",
74
+ 301: "Moved Permanently",
75
+ 302: "Found",
76
+ 303: "See Other",
77
+ 304: "Not Modified",
78
+ 305: "Use Proxy",
79
+ 307: "Temporary Redirect",
80
+ 308: "Permanent Redirect",
81
+ 400: "Bad Request",
82
+ 401: "Unauthorized",
83
+ 402: "Payment Required",
84
+ 403: "Forbidden",
85
+ 404: "Not Found",
86
+ 405: "Method Not Allowed",
87
+ 406: "Not Acceptable",
88
+ 407: "Proxy Authentication Required",
89
+ 408: "Request Timeout",
90
+ 409: "Conflict",
91
+ 410: "Gone",
92
+ 411: "Length Required",
93
+ 412: "Precondition Failed",
94
+ 413: "Payload Too Large",
95
+ 414: "URI Too Long",
96
+ 415: "Unsupported Media Type",
97
+ 416: "Range Not Satisfiable",
98
+ 417: "Expectation Failed",
99
+ 418: "I'm a teapot",
100
+ 421: "Misdirected Request",
101
+ 422: "Unprocessable Entity",
102
+ 423: "Locked",
103
+ 424: "Failed Dependency",
104
+ 425: "Too Early",
105
+ 426: "Upgrade Required",
106
+ 428: "Precondition Required",
107
+ 429: "Too Many Requests",
108
+ 431: "Request Header Fields Too Large",
109
+ 451: "Unavailable For Legal Reasons",
110
+ 500: "Internal Server Error",
111
+ 501: "Not Implemented",
112
+ 502: "Bad Gateway",
113
+ 503: "Service Unavailable",
114
+ 504: "Gateway Timeout",
115
+ 505: "HTTP Version Not Supported",
116
+ 506: "Variant Also Negotiates",
117
+ 507: "Insufficient Storage",
118
+ 508: "Loop Detected",
119
+ 510: "Not Extended",
120
+ 511: "Network Authentication Required",
121
  }
122
 
123
 
tests/parser/test_automatch.py CHANGED
@@ -8,7 +8,7 @@ from scrapling import Adaptor
8
  class TestParserAutoMatch:
9
  def test_element_relocation(self):
10
  """Test relocating element after structure change"""
11
- original_html = '''
12
  <div class="container">
13
  <section class="products">
14
  <article class="product" id="p1">
@@ -21,8 +21,8 @@ class TestParserAutoMatch:
21
  </article>
22
  </section>
23
  </div>
24
- '''
25
- changed_html = '''
26
  <div class="new-container">
27
  <div class="product-wrapper">
28
  <section class="products">
@@ -41,25 +41,25 @@ class TestParserAutoMatch:
41
  </section>
42
  </div>
43
  </div>
44
- '''
45
 
46
- old_page = Adaptor(original_html, url='example.com', auto_match=True)
47
- new_page = Adaptor(changed_html, url='example.com', auto_match=True)
48
 
49
  # 'p1' was used as ID and now it's not and all the path elements have changes
50
  # Also at the same time testing auto-match vs combined selectors
51
- _ = old_page.css('#p1, #p2', auto_save=True)[0]
52
- relocated = new_page.css('#p1', auto_match=True)
53
 
54
  assert relocated is not None
55
- assert relocated[0].attrib['data-id'] == 'p1'
56
- assert relocated[0].has_class('new-class')
57
- assert relocated[0].css('.new-description')[0].text == 'Description 1'
58
 
59
  @pytest.mark.asyncio
60
  async def test_element_relocation_async(self):
61
  """Test relocating element after structure change in async mode"""
62
- original_html = '''
63
  <div class="container">
64
  <section class="products">
65
  <article class="product" id="p1">
@@ -72,8 +72,8 @@ class TestParserAutoMatch:
72
  </article>
73
  </section>
74
  </div>
75
- '''
76
- changed_html = '''
77
  <div class="new-container">
78
  <div class="product-wrapper">
79
  <section class="products">
@@ -92,20 +92,20 @@ class TestParserAutoMatch:
92
  </section>
93
  </div>
94
  </div>
95
- '''
96
 
97
  # Simulate async operation
98
  await asyncio.sleep(0.1) # Minimal async operation
99
 
100
- old_page = Adaptor(original_html, url='example.com', auto_match=True)
101
- new_page = Adaptor(changed_html, url='example.com', auto_match=True)
102
 
103
  # 'p1' was used as ID and now it's not and all the path elements have changes
104
  # Also at the same time testing auto-match vs combined selectors
105
- _ = old_page.css('#p1, #p2', auto_save=True)[0]
106
- relocated = new_page.css('#p1', auto_match=True)
107
 
108
  assert relocated is not None
109
- assert relocated[0].attrib['data-id'] == 'p1'
110
- assert relocated[0].has_class('new-class')
111
- assert relocated[0].css('.new-description')[0].text == 'Description 1'
 
8
  class TestParserAutoMatch:
9
  def test_element_relocation(self):
10
  """Test relocating element after structure change"""
11
+ original_html = """
12
  <div class="container">
13
  <section class="products">
14
  <article class="product" id="p1">
 
21
  </article>
22
  </section>
23
  </div>
24
+ """
25
+ changed_html = """
26
  <div class="new-container">
27
  <div class="product-wrapper">
28
  <section class="products">
 
41
  </section>
42
  </div>
43
  </div>
44
+ """
45
 
46
+ old_page = Adaptor(original_html, url="example.com", auto_match=True)
47
+ new_page = Adaptor(changed_html, url="example.com", auto_match=True)
48
 
49
  # 'p1' was used as ID and now it's not and all the path elements have changes
50
  # Also at the same time testing auto-match vs combined selectors
51
+ _ = old_page.css("#p1, #p2", auto_save=True)[0]
52
+ relocated = new_page.css("#p1", auto_match=True)
53
 
54
  assert relocated is not None
55
+ assert relocated[0].attrib["data-id"] == "p1"
56
+ assert relocated[0].has_class("new-class")
57
+ assert relocated[0].css(".new-description")[0].text == "Description 1"
58
 
59
  @pytest.mark.asyncio
60
  async def test_element_relocation_async(self):
61
  """Test relocating element after structure change in async mode"""
62
+ original_html = """
63
  <div class="container">
64
  <section class="products">
65
  <article class="product" id="p1">
 
72
  </article>
73
  </section>
74
  </div>
75
+ """
76
+ changed_html = """
77
  <div class="new-container">
78
  <div class="product-wrapper">
79
  <section class="products">
 
92
  </section>
93
  </div>
94
  </div>
95
+ """
96
 
97
  # Simulate async operation
98
  await asyncio.sleep(0.1) # Minimal async operation
99
 
100
+ old_page = Adaptor(original_html, url="example.com", auto_match=True)
101
+ new_page = Adaptor(changed_html, url="example.com", auto_match=True)
102
 
103
  # 'p1' was used as ID and now it's not and all the path elements have changes
104
  # Also at the same time testing auto-match vs combined selectors
105
+ _ = old_page.css("#p1, #p2", auto_save=True)[0]
106
+ relocated = new_page.css("#p1", auto_match=True)
107
 
108
  assert relocated is not None
109
+ assert relocated[0].attrib["data-id"] == "p1"
110
+ assert relocated[0].has_class("new-class")
111
+ assert relocated[0].css(".new-description")[0].text == "Description 1"
tests/parser/test_general.py CHANGED
@@ -9,7 +9,7 @@ from scrapling import Adaptor
9
 
10
  @pytest.fixture
11
  def html_content():
12
- return '''
13
  <html>
14
  <head>
15
  <title>Complex Web Page</title>
@@ -73,7 +73,7 @@ def html_content():
73
  </script>
74
  </body>
75
  </html>
76
- '''
77
 
78
 
79
  @pytest.fixture
@@ -85,13 +85,14 @@ def page(html_content):
85
  class TestCSSSelectors:
86
  def test_basic_product_selection(self, page):
87
  """Test selecting all product elements"""
88
- elements = page.css('main #products .product-list article.product')
89
  assert len(elements) == 3
90
 
91
  def test_in_stock_product_selection(self, page):
92
  """Test selecting in-stock products"""
93
  in_stock_products = page.css(
94
- 'main #products .product-list article.product:not(:contains("Out of stock"))')
 
95
  assert len(in_stock_products) == 2
96
 
97
 
@@ -117,22 +118,26 @@ class TestXPathSelectors:
117
  class TestTextMatching:
118
  def test_regex_multiple_matches(self, page):
119
  """Test finding multiple matches with regex"""
120
- stock_info = page.find_by_regex(r'In stock: \d+', first_match=False)
121
  assert len(stock_info) == 2
122
 
123
  def test_regex_first_match(self, page):
124
  """Test finding the first match with regex"""
125
- stock_info = page.find_by_regex(r'In stock: \d+', first_match=True, case_sensitive=True)
126
- assert stock_info.text == 'In stock: 5'
 
 
127
 
128
  def test_partial_text_match(self, page):
129
  """Test finding elements with partial text match"""
130
- stock_info = page.find_by_text(r'In stock:', partial=True, first_match=False)
131
  assert len(stock_info) == 2
132
 
133
  def test_exact_text_match(self, page):
134
  """Test finding elements with exact text match"""
135
- out_of_stock = page.find_by_text('Out of stock', partial=False, first_match=False)
 
 
136
  assert len(out_of_stock) == 1
137
 
138
 
@@ -140,17 +145,17 @@ class TestTextMatching:
140
  class TestSimilarElements:
141
  def test_finding_similar_products(self, page):
142
  """Test finding similar product elements"""
143
- first_product = page.css_first('.product')
144
  similar_products = first_product.find_similar()
145
  assert len(similar_products) == 2
146
 
147
  def test_finding_similar_reviews(self, page):
148
  """Test finding similar review elements with additional filtering"""
149
- first_review = page.find('div', class_='review')
150
  similar_high_rated_reviews = [
151
  review
152
  for review in first_review.find_similar()
153
- if int(review.attrib.get('data-rating', 0)) >= 4
154
  ]
155
  assert len(similar_high_rated_reviews) == 1
156
 
@@ -181,17 +186,17 @@ class TestErrorHandling:
181
  def test_bad_selectors(self, page):
182
  """Test handling of invalid selectors"""
183
  with pytest.raises((SelectorError, SelectorSyntaxError)):
184
- page.css('4 ayo')
185
 
186
  with pytest.raises((SelectorError, SelectorSyntaxError)):
187
- page.xpath('4 ayo')
188
 
189
 
190
  # Pickling and Object Representation Tests
191
  class TestPicklingAndRepresentation:
192
  def test_unpickleable_objects(self, page):
193
  """Test that Adaptor objects cannot be pickled"""
194
- table = page.css('.product-list')[0]
195
  with pytest.raises(TypeError):
196
  pickle.dumps(table)
197
 
@@ -200,7 +205,7 @@ class TestPicklingAndRepresentation:
200
 
201
  def test_string_representations(self, page):
202
  """Test custom string representations of objects"""
203
- table = page.css('.product-list')[0]
204
  assert issubclass(type(table.__str__()), str)
205
  assert issubclass(type(table.__repr__()), str)
206
  assert issubclass(type(table.attrib.__str__()), str)
@@ -211,40 +216,40 @@ class TestPicklingAndRepresentation:
211
  class TestElementNavigation:
212
  def test_basic_navigation_properties(self, page):
213
  """Test basic navigation properties of elements"""
214
- table = page.css('.product-list')[0]
215
  assert table.path is not None
216
- assert table.html_content != ''
217
- assert table.prettify() != ''
218
 
219
  def test_parent_and_sibling_navigation(self, page):
220
  """Test parent and sibling navigation"""
221
- table = page.css('.product-list')[0]
222
  parent = table.parent
223
- assert parent.attrib['id'] == 'products'
224
 
225
  parent_siblings = parent.siblings
226
  assert len(parent_siblings) == 1
227
 
228
  def test_child_navigation(self, page):
229
  """Test child navigation"""
230
- table = page.css('.product-list')[0]
231
  children = table.children
232
  assert len(children) == 3
233
 
234
  def test_next_and_previous_navigation(self, page):
235
  """Test next and previous element navigation"""
236
- child = page.css('.product-list')[0].find({'data-id': "1"})
237
  next_element = child.next
238
- assert next_element.attrib['data-id'] == '2'
239
 
240
  prev_element = next_element.previous
241
  assert prev_element.tag == child.tag
242
 
243
  def test_ancestor_finding(self, page):
244
  """Test finding ancestors of elements"""
245
- all_prices = page.css('.price')
246
  products_with_prices = [
247
- price.find_ancestor(lambda p: p.has_class('product'))
248
  for price in all_prices
249
  ]
250
  assert len(products_with_prices) == 3
@@ -254,52 +259,59 @@ class TestElementNavigation:
254
  class TestJSONAndAttributes:
255
  def test_json_conversion(self, page):
256
  """Test converting content to JSON"""
257
- script_content = page.css('#page-data::text')[0]
258
  assert issubclass(type(script_content.sort()), str)
259
  page_data = script_content.json()
260
- assert page_data['totalProducts'] == 3
261
- assert 'lastUpdated' in page_data
262
 
263
  def test_attribute_operations(self, page):
264
  """Test various attribute-related operations"""
265
  # Product ID extraction
266
- products = page.css('.product')
267
- product_ids = [product.attrib['data-id'] for product in products]
268
- assert product_ids == ['1', '2', '3']
269
- assert 'data-id' in products[0].attrib
270
 
271
  # Review rating calculations
272
- reviews = page.css('.review')
273
- review_ratings = [int(review.attrib['data-rating']) for review in reviews]
274
  assert sum(review_ratings) / len(review_ratings) == 4.5
275
 
276
  # Attribute searching
277
- key_value = list(products[0].attrib.search_values('1', partial=False))
278
- assert list(key_value[0].keys()) == ['data-id']
279
 
280
- key_value = list(products[0].attrib.search_values('1', partial=True))
281
- assert list(key_value[0].keys()) == ['data-id']
282
 
283
  # JSON attribute conversion
284
- attr_json = page.css_first('#products').attrib['schema'].json()
285
- assert attr_json == {'jsonable': 'data'}
286
- assert isinstance(page.css('#products')[0].attrib.json_string, bytes)
287
 
288
 
289
  # Performance Test
290
  def test_large_html_parsing_performance():
291
  """Test parsing and selecting performance on large HTML"""
292
- large_html = '<html><body>' + '<div class="item">' * 5000 + '</div>' * 5000 + '</body></html>'
 
 
 
 
 
293
 
294
  start_time = time.time()
295
  parsed = Adaptor(large_html, auto_match=False)
296
- elements = parsed.css('.item')
297
  end_time = time.time()
298
 
299
  assert len(elements) == 5000
300
  # Converting 5000 elements to a class and doing operations on them will take time
301
  # Based on my tests with 100 runs, 1 loop each Scrapling (given the extra work/features) takes 10.4ms on average
302
- assert end_time - start_time < 0.5 # Locally I test on 0.1 but on GitHub actions with browsers and threading sometimes closing adds fractions of seconds
 
 
303
 
304
 
305
  # Selector Generation Test
@@ -318,13 +330,13 @@ def test_selectors_generation(page):
318
  # Miscellaneous Tests
319
  def test_getting_all_text(page):
320
  """Test getting all text from the page"""
321
- assert page.get_all_text() != ''
322
 
323
 
324
  def test_regex_on_text(page):
325
  """Test regex operations on text"""
326
  element = page.css('[data-id="1"] .price')[0]
327
- match = element.re_first(r'[\.\d]+')
328
- assert match == '10.99'
329
- match = element.text.re(r'(\d+)', replace_entities=False)
330
  assert len(match) == 2
 
9
 
10
  @pytest.fixture
11
  def html_content():
12
+ return """
13
  <html>
14
  <head>
15
  <title>Complex Web Page</title>
 
73
  </script>
74
  </body>
75
  </html>
76
+ """
77
 
78
 
79
  @pytest.fixture
 
85
  class TestCSSSelectors:
86
  def test_basic_product_selection(self, page):
87
  """Test selecting all product elements"""
88
+ elements = page.css("main #products .product-list article.product")
89
  assert len(elements) == 3
90
 
91
  def test_in_stock_product_selection(self, page):
92
  """Test selecting in-stock products"""
93
  in_stock_products = page.css(
94
+ 'main #products .product-list article.product:not(:contains("Out of stock"))'
95
+ )
96
  assert len(in_stock_products) == 2
97
 
98
 
 
118
  class TestTextMatching:
119
  def test_regex_multiple_matches(self, page):
120
  """Test finding multiple matches with regex"""
121
+ stock_info = page.find_by_regex(r"In stock: \d+", first_match=False)
122
  assert len(stock_info) == 2
123
 
124
  def test_regex_first_match(self, page):
125
  """Test finding the first match with regex"""
126
+ stock_info = page.find_by_regex(
127
+ r"In stock: \d+", first_match=True, case_sensitive=True
128
+ )
129
+ assert stock_info.text == "In stock: 5"
130
 
131
  def test_partial_text_match(self, page):
132
  """Test finding elements with partial text match"""
133
+ stock_info = page.find_by_text(r"In stock:", partial=True, first_match=False)
134
  assert len(stock_info) == 2
135
 
136
  def test_exact_text_match(self, page):
137
  """Test finding elements with exact text match"""
138
+ out_of_stock = page.find_by_text(
139
+ "Out of stock", partial=False, first_match=False
140
+ )
141
  assert len(out_of_stock) == 1
142
 
143
 
 
145
  class TestSimilarElements:
146
  def test_finding_similar_products(self, page):
147
  """Test finding similar product elements"""
148
+ first_product = page.css_first(".product")
149
  similar_products = first_product.find_similar()
150
  assert len(similar_products) == 2
151
 
152
  def test_finding_similar_reviews(self, page):
153
  """Test finding similar review elements with additional filtering"""
154
+ first_review = page.find("div", class_="review")
155
  similar_high_rated_reviews = [
156
  review
157
  for review in first_review.find_similar()
158
+ if int(review.attrib.get("data-rating", 0)) >= 4
159
  ]
160
  assert len(similar_high_rated_reviews) == 1
161
 
 
186
  def test_bad_selectors(self, page):
187
  """Test handling of invalid selectors"""
188
  with pytest.raises((SelectorError, SelectorSyntaxError)):
189
+ page.css("4 ayo")
190
 
191
  with pytest.raises((SelectorError, SelectorSyntaxError)):
192
+ page.xpath("4 ayo")
193
 
194
 
195
  # Pickling and Object Representation Tests
196
  class TestPicklingAndRepresentation:
197
  def test_unpickleable_objects(self, page):
198
  """Test that Adaptor objects cannot be pickled"""
199
+ table = page.css(".product-list")[0]
200
  with pytest.raises(TypeError):
201
  pickle.dumps(table)
202
 
 
205
 
206
  def test_string_representations(self, page):
207
  """Test custom string representations of objects"""
208
+ table = page.css(".product-list")[0]
209
  assert issubclass(type(table.__str__()), str)
210
  assert issubclass(type(table.__repr__()), str)
211
  assert issubclass(type(table.attrib.__str__()), str)
 
216
  class TestElementNavigation:
217
  def test_basic_navigation_properties(self, page):
218
  """Test basic navigation properties of elements"""
219
+ table = page.css(".product-list")[0]
220
  assert table.path is not None
221
+ assert table.html_content != ""
222
+ assert table.prettify() != ""
223
 
224
  def test_parent_and_sibling_navigation(self, page):
225
  """Test parent and sibling navigation"""
226
+ table = page.css(".product-list")[0]
227
  parent = table.parent
228
+ assert parent.attrib["id"] == "products"
229
 
230
  parent_siblings = parent.siblings
231
  assert len(parent_siblings) == 1
232
 
233
  def test_child_navigation(self, page):
234
  """Test child navigation"""
235
+ table = page.css(".product-list")[0]
236
  children = table.children
237
  assert len(children) == 3
238
 
239
  def test_next_and_previous_navigation(self, page):
240
  """Test next and previous element navigation"""
241
+ child = page.css(".product-list")[0].find({"data-id": "1"})
242
  next_element = child.next
243
+ assert next_element.attrib["data-id"] == "2"
244
 
245
  prev_element = next_element.previous
246
  assert prev_element.tag == child.tag
247
 
248
  def test_ancestor_finding(self, page):
249
  """Test finding ancestors of elements"""
250
+ all_prices = page.css(".price")
251
  products_with_prices = [
252
+ price.find_ancestor(lambda p: p.has_class("product"))
253
  for price in all_prices
254
  ]
255
  assert len(products_with_prices) == 3
 
259
  class TestJSONAndAttributes:
260
  def test_json_conversion(self, page):
261
  """Test converting content to JSON"""
262
+ script_content = page.css("#page-data::text")[0]
263
  assert issubclass(type(script_content.sort()), str)
264
  page_data = script_content.json()
265
+ assert page_data["totalProducts"] == 3
266
+ assert "lastUpdated" in page_data
267
 
268
  def test_attribute_operations(self, page):
269
  """Test various attribute-related operations"""
270
  # Product ID extraction
271
+ products = page.css(".product")
272
+ product_ids = [product.attrib["data-id"] for product in products]
273
+ assert product_ids == ["1", "2", "3"]
274
+ assert "data-id" in products[0].attrib
275
 
276
  # Review rating calculations
277
+ reviews = page.css(".review")
278
+ review_ratings = [int(review.attrib["data-rating"]) for review in reviews]
279
  assert sum(review_ratings) / len(review_ratings) == 4.5
280
 
281
  # Attribute searching
282
+ key_value = list(products[0].attrib.search_values("1", partial=False))
283
+ assert list(key_value[0].keys()) == ["data-id"]
284
 
285
+ key_value = list(products[0].attrib.search_values("1", partial=True))
286
+ assert list(key_value[0].keys()) == ["data-id"]
287
 
288
  # JSON attribute conversion
289
+ attr_json = page.css_first("#products").attrib["schema"].json()
290
+ assert attr_json == {"jsonable": "data"}
291
+ assert isinstance(page.css("#products")[0].attrib.json_string, bytes)
292
 
293
 
294
  # Performance Test
295
  def test_large_html_parsing_performance():
296
  """Test parsing and selecting performance on large HTML"""
297
+ large_html = (
298
+ "<html><body>"
299
+ + '<div class="item">' * 5000
300
+ + "</div>" * 5000
301
+ + "</body></html>"
302
+ )
303
 
304
  start_time = time.time()
305
  parsed = Adaptor(large_html, auto_match=False)
306
+ elements = parsed.css(".item")
307
  end_time = time.time()
308
 
309
  assert len(elements) == 5000
310
  # Converting 5000 elements to a class and doing operations on them will take time
311
  # Based on my tests with 100 runs, 1 loop each Scrapling (given the extra work/features) takes 10.4ms on average
312
+ assert (
313
+ end_time - start_time < 0.5
314
+ ) # Locally I test on 0.1 but on GitHub actions with browsers and threading sometimes closing adds fractions of seconds
315
 
316
 
317
  # Selector Generation Test
 
330
  # Miscellaneous Tests
331
  def test_getting_all_text(page):
332
  """Test getting all text from the page"""
333
+ assert page.get_all_text() != ""
334
 
335
 
336
  def test_regex_on_text(page):
337
  """Test regex operations on text"""
338
  element = page.css('[data-id="1"] .price')[0]
339
+ match = element.re_first(r"[\.\d]+")
340
+ assert match == "10.99"
341
+ match = element.text.re(r"(\d+)", replace_entities=False)
342
  assert len(match) == 2