Karim shoair commited on
Commit ·
fcedcce
1
Parent(s): f300870
chore: migrating to ruff and updating pre-commit hooks
Browse files- .flake8 +0 -3
- .pre-commit-config.yaml +9 -8
- benchmarks.py +36 -24
- cleanup.py +8 -8
- ruff.toml +22 -0
- scrapling/__init__.py +19 -11
- scrapling/cli.py +27 -7
- scrapling/core/_types.py +16 -3
- scrapling/core/custom_types.py +122 -55
- scrapling/core/mixins.py +20 -16
- scrapling/core/storage_adaptors.py +11 -7
- scrapling/core/translator.py +1 -2
- scrapling/core/utils.py +44 -25
- scrapling/defaults.py +20 -8
- scrapling/engines/__init__.py +1 -1
- scrapling/engines/camo.py +125 -59
- scrapling/engines/constants.py +84 -87
- scrapling/engines/pw.py +169 -100
- scrapling/engines/static.py +57 -25
- scrapling/engines/toolbelt/__init__.py +16 -6
- scrapling/engines/toolbelt/custom.py +167 -95
- scrapling/engines/toolbelt/fingerprints.py +13 -13
- scrapling/engines/toolbelt/navigation.py +29 -14
- scrapling/fetchers.py +329 -83
- scrapling/parser.py +453 -180
- setup.py +10 -11
- tests/fetchers/async/test_camoufox.py +58 -46
- tests/fetchers/async/test_httpx.py +92 -51
- tests/fetchers/async/test_playwright.py +37 -27
- tests/fetchers/sync/test_camoufox.py +33 -13
- tests/fetchers/sync/test_httpx.py +79 -42
- tests/fetchers/sync/test_playwright.py +33 -21
- tests/fetchers/test_utils.py +105 -64
- tests/parser/test_automatch.py +22 -22
- tests/parser/test_general.py +61 -49
.flake8
DELETED
|
@@ -1,3 +0,0 @@
|
|
| 1 |
-
[flake8]
|
| 2 |
-
ignore = E501, F401
|
| 3 |
-
exclude = .git,.venv,__pycache__,docs,.github,build,dist,tests,benchmarks.py
|
|
|
|
|
|
|
|
|
|
|
|
.pre-commit-config.yaml
CHANGED
|
@@ -1,17 +1,18 @@
|
|
| 1 |
repos:
|
| 2 |
- repo: https://github.com/PyCQA/bandit
|
| 3 |
-
rev: 1.8.
|
| 4 |
hooks:
|
| 5 |
- id: bandit
|
| 6 |
args: [-r, -c, .bandit.yml]
|
| 7 |
-
- repo: https://github.com/
|
| 8 |
-
|
|
|
|
| 9 |
hooks:
|
| 10 |
-
|
| 11 |
-
-
|
| 12 |
-
|
| 13 |
-
|
| 14 |
-
|
| 15 |
- repo: https://github.com/netromdk/vermin
|
| 16 |
rev: v1.6.0
|
| 17 |
hooks:
|
|
|
|
| 1 |
repos:
|
| 2 |
- repo: https://github.com/PyCQA/bandit
|
| 3 |
+
rev: 1.8.3
|
| 4 |
hooks:
|
| 5 |
- id: bandit
|
| 6 |
args: [-r, -c, .bandit.yml]
|
| 7 |
+
- repo: https://github.com/astral-sh/ruff-pre-commit
|
| 8 |
+
# Ruff version.
|
| 9 |
+
rev: v0.11.5
|
| 10 |
hooks:
|
| 11 |
+
# Run the linter.
|
| 12 |
+
- id: ruff
|
| 13 |
+
args: [ --fix ]
|
| 14 |
+
# Run the formatter.
|
| 15 |
+
- id: ruff-format
|
| 16 |
- repo: https://github.com/netromdk/vermin
|
| 17 |
rev: v1.6.0
|
| 18 |
hooks:
|
benchmarks.py
CHANGED
|
@@ -14,19 +14,27 @@ from selectolax.parser import HTMLParser
|
|
| 14 |
|
| 15 |
from scrapling import Adaptor
|
| 16 |
|
| 17 |
-
large_html =
|
|
|
|
|
|
|
| 18 |
|
| 19 |
|
| 20 |
def benchmark(func):
|
| 21 |
@functools.wraps(func)
|
| 22 |
def wrapper(*args, **kwargs):
|
| 23 |
-
benchmark_name = func.__name__.replace(
|
| 24 |
print(f"-> {benchmark_name}", end=" ", flush=True)
|
| 25 |
# Warm-up phase
|
| 26 |
-
timeit.repeat(
|
|
|
|
|
|
|
| 27 |
# Measure time (1 run, repeat 100 times, take average)
|
| 28 |
times = timeit.repeat(
|
| 29 |
-
lambda: func(*args, **kwargs),
|
|
|
|
|
|
|
|
|
|
|
|
|
| 30 |
)
|
| 31 |
min_time = round(mean(times) * 1000, 2) # Convert to milliseconds
|
| 32 |
print(f"average execution time: {min_time} ms")
|
|
@@ -42,23 +50,24 @@ def test_lxml():
|
|
| 42 |
for e in etree.fromstring(
|
| 43 |
large_html,
|
| 44 |
# Scrapling and Parsel use the same parser inside so this is just to make it fair
|
| 45 |
-
parser=html.HTMLParser(recover=True, huge_tree=True)
|
| 46 |
-
).cssselect(
|
|
|
|
| 47 |
|
| 48 |
|
| 49 |
@benchmark
|
| 50 |
def test_bs4_lxml():
|
| 51 |
-
return [e.text for e in BeautifulSoup(large_html,
|
| 52 |
|
| 53 |
|
| 54 |
@benchmark
|
| 55 |
def test_bs4_html5lib():
|
| 56 |
-
return [e.text for e in BeautifulSoup(large_html,
|
| 57 |
|
| 58 |
|
| 59 |
@benchmark
|
| 60 |
def test_pyquery():
|
| 61 |
-
return [e.text() for e in pq(large_html)(
|
| 62 |
|
| 63 |
|
| 64 |
@benchmark
|
|
@@ -66,33 +75,33 @@ def test_scrapling():
|
|
| 66 |
# No need to do `.extract()` like parsel to extract text
|
| 67 |
# Also, this is faster than `[t.text for t in Adaptor(large_html, auto_match=False).css('.item')]`
|
| 68 |
# for obvious reasons, of course.
|
| 69 |
-
return Adaptor(large_html, auto_match=False).css(
|
| 70 |
|
| 71 |
|
| 72 |
@benchmark
|
| 73 |
def test_parsel():
|
| 74 |
-
return Selector(text=large_html).css(
|
| 75 |
|
| 76 |
|
| 77 |
@benchmark
|
| 78 |
def test_mechanicalsoup():
|
| 79 |
browser = StatefulBrowser()
|
| 80 |
browser.open_fake_page(large_html)
|
| 81 |
-
return [e.text for e in browser.page.select(
|
| 82 |
|
| 83 |
|
| 84 |
@benchmark
|
| 85 |
def test_selectolax():
|
| 86 |
-
return [node.text() for node in HTMLParser(large_html).css(
|
| 87 |
|
| 88 |
|
| 89 |
def display(results):
|
| 90 |
# Sort and display results
|
| 91 |
sorted_results = sorted(results.items(), key=lambda x: x[1]) # Sort by time
|
| 92 |
-
scrapling_time = results[
|
| 93 |
print("\nRanked Results (fastest to slowest):")
|
| 94 |
print(f" i. {'Library tested':<18} | {'avg. time (ms)':<15} | vs Scrapling")
|
| 95 |
-
print(
|
| 96 |
for i, (test_name, test_time) in enumerate(sorted_results, 1):
|
| 97 |
compare = round(test_time / scrapling_time, 3)
|
| 98 |
print(f" {i}. {test_name:<18} | {str(test_time):<15} | {compare}")
|
|
@@ -102,25 +111,28 @@ def display(results):
|
|
| 102 |
def test_scrapling_text(request_html):
|
| 103 |
# Will loop over resulted elements to get text too to make comparison even more fair otherwise Scrapling will be even faster
|
| 104 |
return [
|
| 105 |
-
element.text
|
| 106 |
-
|
| 107 |
-
|
|
|
|
| 108 |
]
|
| 109 |
|
| 110 |
|
| 111 |
@benchmark
|
| 112 |
def test_autoscraper(request_html):
|
| 113 |
# autoscraper by default returns elements text
|
| 114 |
-
return AutoScraper().build(html=request_html, wanted_list=[
|
| 115 |
|
| 116 |
|
| 117 |
if __name__ == "__main__":
|
| 118 |
-
print(
|
|
|
|
|
|
|
| 119 |
results1 = {
|
| 120 |
"Raw Lxml": test_lxml(),
|
| 121 |
"Parsel/Scrapy": test_parsel(),
|
| 122 |
"Scrapling": test_scrapling(),
|
| 123 |
-
|
| 124 |
"PyQuery": test_pyquery(),
|
| 125 |
"BS4 with Lxml": test_bs4_lxml(),
|
| 126 |
"MechanicalSoup": test_mechanicalsoup(),
|
|
@@ -128,10 +140,10 @@ if __name__ == "__main__":
|
|
| 128 |
}
|
| 129 |
|
| 130 |
display(results1)
|
| 131 |
-
print(
|
| 132 |
-
req = requests.get(
|
| 133 |
print(
|
| 134 |
-
|
| 135 |
)
|
| 136 |
results2 = {
|
| 137 |
"Scrapling": test_scrapling_text(req.text),
|
|
|
|
| 14 |
|
| 15 |
from scrapling import Adaptor
|
| 16 |
|
| 17 |
+
large_html = (
|
| 18 |
+
"<html><body>" + '<div class="item">' * 5000 + "</div>" * 5000 + "</body></html>"
|
| 19 |
+
)
|
| 20 |
|
| 21 |
|
| 22 |
def benchmark(func):
|
| 23 |
@functools.wraps(func)
|
| 24 |
def wrapper(*args, **kwargs):
|
| 25 |
+
benchmark_name = func.__name__.replace("test_", "").replace("_", " ")
|
| 26 |
print(f"-> {benchmark_name}", end=" ", flush=True)
|
| 27 |
# Warm-up phase
|
| 28 |
+
timeit.repeat(
|
| 29 |
+
lambda: func(*args, **kwargs), number=2, repeat=2, globals=globals()
|
| 30 |
+
)
|
| 31 |
# Measure time (1 run, repeat 100 times, take average)
|
| 32 |
times = timeit.repeat(
|
| 33 |
+
lambda: func(*args, **kwargs),
|
| 34 |
+
number=1,
|
| 35 |
+
repeat=100,
|
| 36 |
+
globals=globals(),
|
| 37 |
+
timer=time.process_time,
|
| 38 |
)
|
| 39 |
min_time = round(mean(times) * 1000, 2) # Convert to milliseconds
|
| 40 |
print(f"average execution time: {min_time} ms")
|
|
|
|
| 50 |
for e in etree.fromstring(
|
| 51 |
large_html,
|
| 52 |
# Scrapling and Parsel use the same parser inside so this is just to make it fair
|
| 53 |
+
parser=html.HTMLParser(recover=True, huge_tree=True),
|
| 54 |
+
).cssselect(".item")
|
| 55 |
+
]
|
| 56 |
|
| 57 |
|
| 58 |
@benchmark
|
| 59 |
def test_bs4_lxml():
|
| 60 |
+
return [e.text for e in BeautifulSoup(large_html, "lxml").select(".item")]
|
| 61 |
|
| 62 |
|
| 63 |
@benchmark
|
| 64 |
def test_bs4_html5lib():
|
| 65 |
+
return [e.text for e in BeautifulSoup(large_html, "html5lib").select(".item")]
|
| 66 |
|
| 67 |
|
| 68 |
@benchmark
|
| 69 |
def test_pyquery():
|
| 70 |
+
return [e.text() for e in pq(large_html)(".item").items()]
|
| 71 |
|
| 72 |
|
| 73 |
@benchmark
|
|
|
|
| 75 |
# No need to do `.extract()` like parsel to extract text
|
| 76 |
# Also, this is faster than `[t.text for t in Adaptor(large_html, auto_match=False).css('.item')]`
|
| 77 |
# for obvious reasons, of course.
|
| 78 |
+
return Adaptor(large_html, auto_match=False).css(".item::text")
|
| 79 |
|
| 80 |
|
| 81 |
@benchmark
|
| 82 |
def test_parsel():
|
| 83 |
+
return Selector(text=large_html).css(".item::text").extract()
|
| 84 |
|
| 85 |
|
| 86 |
@benchmark
|
| 87 |
def test_mechanicalsoup():
|
| 88 |
browser = StatefulBrowser()
|
| 89 |
browser.open_fake_page(large_html)
|
| 90 |
+
return [e.text for e in browser.page.select(".item")]
|
| 91 |
|
| 92 |
|
| 93 |
@benchmark
|
| 94 |
def test_selectolax():
|
| 95 |
+
return [node.text() for node in HTMLParser(large_html).css(".item")]
|
| 96 |
|
| 97 |
|
| 98 |
def display(results):
|
| 99 |
# Sort and display results
|
| 100 |
sorted_results = sorted(results.items(), key=lambda x: x[1]) # Sort by time
|
| 101 |
+
scrapling_time = results["Scrapling"]
|
| 102 |
print("\nRanked Results (fastest to slowest):")
|
| 103 |
print(f" i. {'Library tested':<18} | {'avg. time (ms)':<15} | vs Scrapling")
|
| 104 |
+
print("-" * 50)
|
| 105 |
for i, (test_name, test_time) in enumerate(sorted_results, 1):
|
| 106 |
compare = round(test_time / scrapling_time, 3)
|
| 107 |
print(f" {i}. {test_name:<18} | {str(test_time):<15} | {compare}")
|
|
|
|
| 111 |
def test_scrapling_text(request_html):
|
| 112 |
# Will loop over resulted elements to get text too to make comparison even more fair otherwise Scrapling will be even faster
|
| 113 |
return [
|
| 114 |
+
element.text
|
| 115 |
+
for element in Adaptor(request_html, auto_match=False)
|
| 116 |
+
.find_by_text("Tipping the Velvet", first_match=True)
|
| 117 |
+
.find_similar(ignore_attributes=["title"])
|
| 118 |
]
|
| 119 |
|
| 120 |
|
| 121 |
@benchmark
|
| 122 |
def test_autoscraper(request_html):
|
| 123 |
# autoscraper by default returns elements text
|
| 124 |
+
return AutoScraper().build(html=request_html, wanted_list=["Tipping the Velvet"])
|
| 125 |
|
| 126 |
|
| 127 |
if __name__ == "__main__":
|
| 128 |
+
print(
|
| 129 |
+
" Benchmark: Speed of parsing and retrieving the text content of 5000 nested elements \n"
|
| 130 |
+
)
|
| 131 |
results1 = {
|
| 132 |
"Raw Lxml": test_lxml(),
|
| 133 |
"Parsel/Scrapy": test_parsel(),
|
| 134 |
"Scrapling": test_scrapling(),
|
| 135 |
+
"Selectolax": test_selectolax(),
|
| 136 |
"PyQuery": test_pyquery(),
|
| 137 |
"BS4 with Lxml": test_bs4_lxml(),
|
| 138 |
"MechanicalSoup": test_mechanicalsoup(),
|
|
|
|
| 140 |
}
|
| 141 |
|
| 142 |
display(results1)
|
| 143 |
+
print("\n" + "=" * 25)
|
| 144 |
+
req = requests.get("https://books.toscrape.com/index.html")
|
| 145 |
print(
|
| 146 |
+
" Benchmark: Speed of searching for an element by text content, and retrieving the text of similar elements\n"
|
| 147 |
)
|
| 148 |
results2 = {
|
| 149 |
"Scrapling": test_scrapling_text(req.text),
|
cleanup.py
CHANGED
|
@@ -9,12 +9,12 @@ def clean():
|
|
| 9 |
|
| 10 |
# Directories and patterns to clean
|
| 11 |
cleanup_patterns = [
|
| 12 |
-
|
| 13 |
-
|
| 14 |
-
|
| 15 |
-
|
| 16 |
-
|
| 17 |
-
|
| 18 |
]
|
| 19 |
|
| 20 |
# Clean directories
|
|
@@ -30,7 +30,7 @@ def clean():
|
|
| 30 |
print(f"Could not remove {path}: {e}")
|
| 31 |
|
| 32 |
# Remove compiled Python files
|
| 33 |
-
for path in base_dir.rglob(
|
| 34 |
try:
|
| 35 |
path.unlink()
|
| 36 |
print(f"Removed compiled file: {path}")
|
|
@@ -38,5 +38,5 @@ def clean():
|
|
| 38 |
print(f"Could not remove {path}: {e}")
|
| 39 |
|
| 40 |
|
| 41 |
-
if __name__ ==
|
| 42 |
clean()
|
|
|
|
| 9 |
|
| 10 |
# Directories and patterns to clean
|
| 11 |
cleanup_patterns = [
|
| 12 |
+
"build",
|
| 13 |
+
"dist",
|
| 14 |
+
"*.egg-info",
|
| 15 |
+
"__pycache__",
|
| 16 |
+
".eggs",
|
| 17 |
+
".pytest_cache",
|
| 18 |
]
|
| 19 |
|
| 20 |
# Clean directories
|
|
|
|
| 30 |
print(f"Could not remove {path}: {e}")
|
| 31 |
|
| 32 |
# Remove compiled Python files
|
| 33 |
+
for path in base_dir.rglob("*.py[co]"):
|
| 34 |
try:
|
| 35 |
path.unlink()
|
| 36 |
print(f"Removed compiled file: {path}")
|
|
|
|
| 38 |
print(f"Could not remove {path}: {e}")
|
| 39 |
|
| 40 |
|
| 41 |
+
if __name__ == "__main__":
|
| 42 |
clean()
|
ruff.toml
ADDED
|
@@ -0,0 +1,22 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
exclude = [
|
| 2 |
+
".git",
|
| 3 |
+
".venv",
|
| 4 |
+
"__pycache__",
|
| 5 |
+
"docs",
|
| 6 |
+
".github",
|
| 7 |
+
"build",
|
| 8 |
+
"dist",
|
| 9 |
+
"tests",
|
| 10 |
+
"benchmarks.py",
|
| 11 |
+
]
|
| 12 |
+
|
| 13 |
+
# Assume Python 3.9
|
| 14 |
+
target-version = "py39"
|
| 15 |
+
|
| 16 |
+
[lint]
|
| 17 |
+
select = ["E", "F", "W"]
|
| 18 |
+
ignore = ["E501", "F401"]
|
| 19 |
+
|
| 20 |
+
[format]
|
| 21 |
+
# Like Black, use double quotes for strings.
|
| 22 |
+
quote-style = "double"
|
scrapling/__init__.py
CHANGED
|
@@ -1,4 +1,3 @@
|
|
| 1 |
-
|
| 2 |
__author__ = "Karim Shoair (karim.shoair@pm.me)"
|
| 3 |
__version__ = "0.2.99"
|
| 4 |
__copyright__ = "Copyright (c) 2024 Karim Shoair"
|
|
@@ -7,35 +6,44 @@ __copyright__ = "Copyright (c) 2024 Karim Shoair"
|
|
| 7 |
# A lightweight approach to create lazy loader for each import for backward compatibility
|
| 8 |
# This will reduces initial memory footprint significantly (only loads what's used)
|
| 9 |
def __getattr__(name):
|
| 10 |
-
if name ==
|
| 11 |
from scrapling.fetchers import Fetcher as cls
|
|
|
|
| 12 |
return cls
|
| 13 |
-
elif name ==
|
| 14 |
from scrapling.parser import Adaptor as cls
|
|
|
|
| 15 |
return cls
|
| 16 |
-
elif name ==
|
| 17 |
from scrapling.parser import Adaptors as cls
|
|
|
|
| 18 |
return cls
|
| 19 |
-
elif name ==
|
| 20 |
from scrapling.core.custom_types import AttributesHandler as cls
|
|
|
|
| 21 |
return cls
|
| 22 |
-
elif name ==
|
| 23 |
from scrapling.core.custom_types import TextHandler as cls
|
|
|
|
| 24 |
return cls
|
| 25 |
-
elif name ==
|
| 26 |
from scrapling.fetchers import AsyncFetcher as cls
|
|
|
|
| 27 |
return cls
|
| 28 |
-
elif name ==
|
| 29 |
from scrapling.fetchers import StealthyFetcher as cls
|
|
|
|
| 30 |
return cls
|
| 31 |
-
elif name ==
|
| 32 |
from scrapling.fetchers import PlayWrightFetcher as cls
|
|
|
|
| 33 |
return cls
|
| 34 |
-
elif name ==
|
| 35 |
from scrapling.fetchers import CustomFetcher as cls
|
|
|
|
| 36 |
return cls
|
| 37 |
else:
|
| 38 |
raise AttributeError(f"module 'scrapling' has no attribute '{name}'")
|
| 39 |
|
| 40 |
|
| 41 |
-
__all__ = [
|
|
|
|
|
|
|
| 1 |
__author__ = "Karim Shoair (karim.shoair@pm.me)"
|
| 2 |
__version__ = "0.2.99"
|
| 3 |
__copyright__ = "Copyright (c) 2024 Karim Shoair"
|
|
|
|
| 6 |
# A lightweight approach to create lazy loader for each import for backward compatibility
|
| 7 |
# This will reduces initial memory footprint significantly (only loads what's used)
|
| 8 |
def __getattr__(name):
|
| 9 |
+
if name == "Fetcher":
|
| 10 |
from scrapling.fetchers import Fetcher as cls
|
| 11 |
+
|
| 12 |
return cls
|
| 13 |
+
elif name == "Adaptor":
|
| 14 |
from scrapling.parser import Adaptor as cls
|
| 15 |
+
|
| 16 |
return cls
|
| 17 |
+
elif name == "Adaptors":
|
| 18 |
from scrapling.parser import Adaptors as cls
|
| 19 |
+
|
| 20 |
return cls
|
| 21 |
+
elif name == "AttributesHandler":
|
| 22 |
from scrapling.core.custom_types import AttributesHandler as cls
|
| 23 |
+
|
| 24 |
return cls
|
| 25 |
+
elif name == "TextHandler":
|
| 26 |
from scrapling.core.custom_types import TextHandler as cls
|
| 27 |
+
|
| 28 |
return cls
|
| 29 |
+
elif name == "AsyncFetcher":
|
| 30 |
from scrapling.fetchers import AsyncFetcher as cls
|
| 31 |
+
|
| 32 |
return cls
|
| 33 |
+
elif name == "StealthyFetcher":
|
| 34 |
from scrapling.fetchers import StealthyFetcher as cls
|
| 35 |
+
|
| 36 |
return cls
|
| 37 |
+
elif name == "PlayWrightFetcher":
|
| 38 |
from scrapling.fetchers import PlayWrightFetcher as cls
|
| 39 |
+
|
| 40 |
return cls
|
| 41 |
+
elif name == "CustomFetcher":
|
| 42 |
from scrapling.fetchers import CustomFetcher as cls
|
| 43 |
+
|
| 44 |
return cls
|
| 45 |
else:
|
| 46 |
raise AttributeError(f"module 'scrapling' has no attribute '{name}'")
|
| 47 |
|
| 48 |
|
| 49 |
+
__all__ = ["Adaptor", "Fetcher", "AsyncFetcher", "StealthyFetcher", "PlayWrightFetcher"]
|
scrapling/cli.py
CHANGED
|
@@ -12,21 +12,41 @@ def get_package_dir():
|
|
| 12 |
|
| 13 |
def run_command(command, line):
|
| 14 |
print(f"Installing {line}...")
|
| 15 |
-
_ = subprocess.check_call(
|
| 16 |
# I meant to not use try except here
|
| 17 |
|
| 18 |
|
| 19 |
@click.command(help="Install all Scrapling's Fetchers dependencies")
|
| 20 |
-
@click.option(
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 21 |
def install(force):
|
| 22 |
-
if
|
| 23 |
-
|
| 24 |
-
|
| 25 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 26 |
# if no errors raised by above commands, then we add below file
|
| 27 |
get_package_dir().joinpath(".scrapling_dependencies_installed").touch()
|
| 28 |
else:
|
| 29 |
-
print(
|
| 30 |
|
| 31 |
|
| 32 |
@click.group()
|
|
|
|
| 12 |
|
| 13 |
def run_command(command, line):
|
| 14 |
print(f"Installing {line}...")
|
| 15 |
+
_ = subprocess.check_call(" ".join(command), shell=True)
|
| 16 |
# I meant to not use try except here
|
| 17 |
|
| 18 |
|
| 19 |
@click.command(help="Install all Scrapling's Fetchers dependencies")
|
| 20 |
+
@click.option(
|
| 21 |
+
"-f",
|
| 22 |
+
"--force",
|
| 23 |
+
"force",
|
| 24 |
+
is_flag=True,
|
| 25 |
+
default=False,
|
| 26 |
+
type=bool,
|
| 27 |
+
help="Force Scrapling to reinstall all Fetchers dependencies",
|
| 28 |
+
)
|
| 29 |
def install(force):
|
| 30 |
+
if (
|
| 31 |
+
force
|
| 32 |
+
or not get_package_dir().joinpath(".scrapling_dependencies_installed").exists()
|
| 33 |
+
):
|
| 34 |
+
run_command(
|
| 35 |
+
[sys.executable, "-m", "playwright", "install", "chromium"],
|
| 36 |
+
"Playwright browsers",
|
| 37 |
+
)
|
| 38 |
+
run_command(
|
| 39 |
+
[sys.executable, "-m", "playwright", "install-deps", "chromium", "firefox"],
|
| 40 |
+
"Playwright dependencies",
|
| 41 |
+
)
|
| 42 |
+
run_command(
|
| 43 |
+
[sys.executable, "-m", "camoufox", "fetch", "--browserforge"],
|
| 44 |
+
"Camoufox browser and databases",
|
| 45 |
+
)
|
| 46 |
# if no errors raised by above commands, then we add below file
|
| 47 |
get_package_dir().joinpath(".scrapling_dependencies_installed").touch()
|
| 48 |
else:
|
| 49 |
+
print("The dependencies are already installed")
|
| 50 |
|
| 51 |
|
| 52 |
@click.group()
|
scrapling/core/_types.py
CHANGED
|
@@ -2,9 +2,22 @@
|
|
| 2 |
Type definitions for type checking purposes.
|
| 3 |
"""
|
| 4 |
|
| 5 |
-
from typing import (
|
| 6 |
-
|
| 7 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 8 |
|
| 9 |
SelectorWaitStates = Literal["attached", "detached", "hidden", "visible"]
|
| 10 |
|
|
|
|
| 2 |
Type definitions for type checking purposes.
|
| 3 |
"""
|
| 4 |
|
| 5 |
+
from typing import (
|
| 6 |
+
TYPE_CHECKING,
|
| 7 |
+
Any,
|
| 8 |
+
Callable,
|
| 9 |
+
Dict,
|
| 10 |
+
Generator,
|
| 11 |
+
Iterable,
|
| 12 |
+
List,
|
| 13 |
+
Literal,
|
| 14 |
+
Optional,
|
| 15 |
+
Pattern,
|
| 16 |
+
Tuple,
|
| 17 |
+
Type,
|
| 18 |
+
TypeVar,
|
| 19 |
+
Union,
|
| 20 |
+
)
|
| 21 |
|
| 22 |
SelectorWaitStates = Literal["attached", "detached", "hidden", "visible"]
|
| 23 |
|
scrapling/core/custom_types.py
CHANGED
|
@@ -6,16 +6,26 @@ from types import MappingProxyType
|
|
| 6 |
from orjson import dumps, loads
|
| 7 |
from w3lib.html import replace_entities as _replace_entities
|
| 8 |
|
| 9 |
-
from scrapling.core._types import (
|
| 10 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 11 |
from scrapling.core.utils import _is_iterable, flatten
|
| 12 |
|
| 13 |
# Define type variable for AttributeHandler value type
|
| 14 |
-
_TextHandlerType = TypeVar(
|
| 15 |
|
| 16 |
|
| 17 |
class TextHandler(str):
|
| 18 |
"""Extends standard Python string by adding more functionality"""
|
|
|
|
| 19 |
__slots__ = ()
|
| 20 |
|
| 21 |
def __new__(cls, string):
|
|
@@ -25,77 +35,89 @@ class TextHandler(str):
|
|
| 25 |
lst = super().__getitem__(key)
|
| 26 |
return typing.cast(_TextHandlerType, TextHandler(lst))
|
| 27 |
|
| 28 |
-
def split(self, sep: str = None, maxsplit: SupportsIndex = -1) ->
|
| 29 |
return TextHandlers(
|
| 30 |
-
typing.cast(
|
|
|
|
|
|
|
|
|
|
| 31 |
)
|
| 32 |
|
| 33 |
-
def strip(self, chars: str = None) -> Union[str,
|
| 34 |
return TextHandler(super().strip(chars))
|
| 35 |
|
| 36 |
-
def lstrip(self, chars: str = None) -> Union[str,
|
| 37 |
return TextHandler(super().lstrip(chars))
|
| 38 |
|
| 39 |
-
def rstrip(self, chars: str = None) -> Union[str,
|
| 40 |
return TextHandler(super().rstrip(chars))
|
| 41 |
|
| 42 |
-
def capitalize(self) -> Union[str,
|
| 43 |
return TextHandler(super().capitalize())
|
| 44 |
|
| 45 |
-
def casefold(self) -> Union[str,
|
| 46 |
return TextHandler(super().casefold())
|
| 47 |
|
| 48 |
-
def center(
|
|
|
|
|
|
|
| 49 |
return TextHandler(super().center(width, fillchar))
|
| 50 |
|
| 51 |
-
def expandtabs(self, tabsize: SupportsIndex = 8) -> Union[str,
|
| 52 |
return TextHandler(super().expandtabs(tabsize))
|
| 53 |
|
| 54 |
-
def format(self, *args: str, **kwargs: str) -> Union[str,
|
| 55 |
return TextHandler(super().format(*args, **kwargs))
|
| 56 |
|
| 57 |
-
def format_map(self, mapping) -> Union[str,
|
| 58 |
return TextHandler(super().format_map(mapping))
|
| 59 |
|
| 60 |
-
def join(self, iterable: Iterable[str]) -> Union[str,
|
| 61 |
return TextHandler(super().join(iterable))
|
| 62 |
|
| 63 |
-
def ljust(
|
|
|
|
|
|
|
| 64 |
return TextHandler(super().ljust(width, fillchar))
|
| 65 |
|
| 66 |
-
def rjust(
|
|
|
|
|
|
|
| 67 |
return TextHandler(super().rjust(width, fillchar))
|
| 68 |
|
| 69 |
-
def swapcase(self) -> Union[str,
|
| 70 |
return TextHandler(super().swapcase())
|
| 71 |
|
| 72 |
-
def title(self) -> Union[str,
|
| 73 |
return TextHandler(super().title())
|
| 74 |
|
| 75 |
-
def translate(self, table) -> Union[str,
|
| 76 |
return TextHandler(super().translate(table))
|
| 77 |
|
| 78 |
-
def zfill(self, width: SupportsIndex) -> Union[str,
|
| 79 |
return TextHandler(super().zfill(width))
|
| 80 |
|
| 81 |
-
def replace(
|
|
|
|
|
|
|
| 82 |
return TextHandler(super().replace(old, new, count))
|
| 83 |
|
| 84 |
-
def upper(self) -> Union[str,
|
| 85 |
return TextHandler(super().upper())
|
| 86 |
|
| 87 |
-
def lower(self) -> Union[str,
|
| 88 |
return TextHandler(super().lower())
|
|
|
|
| 89 |
##############
|
| 90 |
|
| 91 |
-
def sort(self, reverse: bool = False) -> Union[str,
|
| 92 |
"""Return a sorted version of the string"""
|
| 93 |
return self.__class__("".join(sorted(self, reverse=reverse)))
|
| 94 |
|
| 95 |
-
def clean(self) -> Union[str,
|
| 96 |
"""Return a new version of the string after removing all white spaces and consecutive spaces"""
|
| 97 |
-
data = re.sub(r
|
| 98 |
-
data = re.sub(
|
| 99 |
return self.__class__(data.strip())
|
| 100 |
|
| 101 |
# For easy copy-paste from Scrapy/parsel code when needed :)
|
|
@@ -122,8 +144,7 @@ class TextHandler(str):
|
|
| 122 |
replace_entities: bool = True,
|
| 123 |
clean_match: bool = False,
|
| 124 |
case_sensitive: bool = True,
|
| 125 |
-
) -> bool:
|
| 126 |
-
...
|
| 127 |
|
| 128 |
@typing.overload
|
| 129 |
def re(
|
|
@@ -133,12 +154,15 @@ class TextHandler(str):
|
|
| 133 |
clean_match: bool = False,
|
| 134 |
case_sensitive: bool = True,
|
| 135 |
check_match: Literal[False] = False,
|
| 136 |
-
) -> "TextHandlers[TextHandler]":
|
| 137 |
-
...
|
| 138 |
|
| 139 |
def re(
|
| 140 |
-
|
| 141 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
| 142 |
) -> Union["TextHandlers[TextHandler]", bool]:
|
| 143 |
"""Apply the given regex to the current text and return a list of strings with the matches.
|
| 144 |
|
|
@@ -164,12 +188,27 @@ class TextHandler(str):
|
|
| 164 |
results = flatten(results)
|
| 165 |
|
| 166 |
if not replace_entities:
|
| 167 |
-
return TextHandlers(
|
|
|
|
|
|
|
|
|
|
|
|
|
| 168 |
|
| 169 |
-
return TextHandlers(
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 170 |
|
| 171 |
-
def re_first(
|
| 172 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 173 |
"""Apply the given regex to text and return the first match if found, otherwise return the default value.
|
| 174 |
|
| 175 |
:param regex: Can be either a compiled regular expression or a string.
|
|
@@ -179,7 +218,12 @@ class TextHandler(str):
|
|
| 179 |
:param case_sensitive: if disabled, function will set the regex to ignore letters case while compiling it
|
| 180 |
|
| 181 |
"""
|
| 182 |
-
result = self.re(
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 183 |
return result[0] if result else default
|
| 184 |
|
| 185 |
|
|
@@ -187,6 +231,7 @@ class TextHandlers(List[TextHandler]):
|
|
| 187 |
"""
|
| 188 |
The :class:`TextHandlers` class is a subclass of the builtin ``List`` class, which provides a few additional methods.
|
| 189 |
"""
|
|
|
|
| 190 |
__slots__ = ()
|
| 191 |
|
| 192 |
@typing.overload
|
|
@@ -197,15 +242,22 @@ class TextHandlers(List[TextHandler]):
|
|
| 197 |
def __getitem__(self, pos: slice) -> "TextHandlers":
|
| 198 |
pass
|
| 199 |
|
| 200 |
-
def __getitem__(
|
|
|
|
|
|
|
| 201 |
lst = super().__getitem__(pos)
|
| 202 |
if isinstance(pos, slice):
|
| 203 |
lst = [TextHandler(s) for s in lst]
|
| 204 |
return TextHandlers(typing.cast(List[_TextHandlerType], lst))
|
| 205 |
return typing.cast(_TextHandlerType, TextHandler(lst))
|
| 206 |
|
| 207 |
-
def re(
|
| 208 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 209 |
"""Call the ``.re()`` method for each element in this list and return
|
| 210 |
their results flattened as TextHandlers.
|
| 211 |
|
|
@@ -219,8 +271,14 @@ class TextHandlers(List[TextHandler]):
|
|
| 219 |
]
|
| 220 |
return TextHandlers(flatten(results))
|
| 221 |
|
| 222 |
-
def re_first(
|
| 223 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 224 |
"""Call the ``.re_first()`` method for each element in this list and return
|
| 225 |
the first result or the default value otherwise.
|
| 226 |
|
|
@@ -251,26 +309,35 @@ class TextHandlers(List[TextHandler]):
|
|
| 251 |
|
| 252 |
class AttributesHandler(Mapping[str, _TextHandlerType]):
|
| 253 |
"""A read-only mapping to use instead of the standard dictionary for the speed boost but at the same time I use it to add more functionalities.
|
| 254 |
-
|
| 255 |
"""
|
| 256 |
-
|
|
|
|
| 257 |
|
| 258 |
def __init__(self, mapping=None, **kwargs):
|
| 259 |
-
mapping =
|
| 260 |
-
|
| 261 |
-
|
| 262 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
| 263 |
|
| 264 |
if kwargs:
|
| 265 |
-
mapping.update(
|
| 266 |
-
|
| 267 |
-
|
| 268 |
-
|
|
|
|
|
|
|
| 269 |
|
| 270 |
# Fastest read-only mapping type
|
| 271 |
self._data = MappingProxyType(mapping)
|
| 272 |
|
| 273 |
-
def get(
|
|
|
|
|
|
|
| 274 |
"""Acts like standard dictionary `.get()` method"""
|
| 275 |
return self._data.get(key, default)
|
| 276 |
|
|
|
|
| 6 |
from orjson import dumps, loads
|
| 7 |
from w3lib.html import replace_entities as _replace_entities
|
| 8 |
|
| 9 |
+
from scrapling.core._types import (
|
| 10 |
+
Dict,
|
| 11 |
+
Iterable,
|
| 12 |
+
List,
|
| 13 |
+
Literal,
|
| 14 |
+
Optional,
|
| 15 |
+
Pattern,
|
| 16 |
+
SupportsIndex,
|
| 17 |
+
TypeVar,
|
| 18 |
+
Union,
|
| 19 |
+
)
|
| 20 |
from scrapling.core.utils import _is_iterable, flatten
|
| 21 |
|
| 22 |
# Define type variable for AttributeHandler value type
|
| 23 |
+
_TextHandlerType = TypeVar("_TextHandlerType", bound="TextHandler")
|
| 24 |
|
| 25 |
|
| 26 |
class TextHandler(str):
|
| 27 |
"""Extends standard Python string by adding more functionality"""
|
| 28 |
+
|
| 29 |
__slots__ = ()
|
| 30 |
|
| 31 |
def __new__(cls, string):
|
|
|
|
| 35 |
lst = super().__getitem__(key)
|
| 36 |
return typing.cast(_TextHandlerType, TextHandler(lst))
|
| 37 |
|
| 38 |
+
def split(self, sep: str = None, maxsplit: SupportsIndex = -1) -> "TextHandlers":
|
| 39 |
return TextHandlers(
|
| 40 |
+
typing.cast(
|
| 41 |
+
List[_TextHandlerType],
|
| 42 |
+
[TextHandler(s) for s in super().split(sep, maxsplit)],
|
| 43 |
+
)
|
| 44 |
)
|
| 45 |
|
| 46 |
+
def strip(self, chars: str = None) -> Union[str, "TextHandler"]:
|
| 47 |
return TextHandler(super().strip(chars))
|
| 48 |
|
| 49 |
+
def lstrip(self, chars: str = None) -> Union[str, "TextHandler"]:
|
| 50 |
return TextHandler(super().lstrip(chars))
|
| 51 |
|
| 52 |
+
def rstrip(self, chars: str = None) -> Union[str, "TextHandler"]:
|
| 53 |
return TextHandler(super().rstrip(chars))
|
| 54 |
|
| 55 |
+
def capitalize(self) -> Union[str, "TextHandler"]:
|
| 56 |
return TextHandler(super().capitalize())
|
| 57 |
|
| 58 |
+
def casefold(self) -> Union[str, "TextHandler"]:
|
| 59 |
return TextHandler(super().casefold())
|
| 60 |
|
| 61 |
+
def center(
|
| 62 |
+
self, width: SupportsIndex, fillchar: str = " "
|
| 63 |
+
) -> Union[str, "TextHandler"]:
|
| 64 |
return TextHandler(super().center(width, fillchar))
|
| 65 |
|
| 66 |
+
def expandtabs(self, tabsize: SupportsIndex = 8) -> Union[str, "TextHandler"]:
|
| 67 |
return TextHandler(super().expandtabs(tabsize))
|
| 68 |
|
| 69 |
+
def format(self, *args: str, **kwargs: str) -> Union[str, "TextHandler"]:
|
| 70 |
return TextHandler(super().format(*args, **kwargs))
|
| 71 |
|
| 72 |
+
def format_map(self, mapping) -> Union[str, "TextHandler"]:
|
| 73 |
return TextHandler(super().format_map(mapping))
|
| 74 |
|
| 75 |
+
def join(self, iterable: Iterable[str]) -> Union[str, "TextHandler"]:
|
| 76 |
return TextHandler(super().join(iterable))
|
| 77 |
|
| 78 |
+
def ljust(
|
| 79 |
+
self, width: SupportsIndex, fillchar: str = " "
|
| 80 |
+
) -> Union[str, "TextHandler"]:
|
| 81 |
return TextHandler(super().ljust(width, fillchar))
|
| 82 |
|
| 83 |
+
def rjust(
|
| 84 |
+
self, width: SupportsIndex, fillchar: str = " "
|
| 85 |
+
) -> Union[str, "TextHandler"]:
|
| 86 |
return TextHandler(super().rjust(width, fillchar))
|
| 87 |
|
| 88 |
+
def swapcase(self) -> Union[str, "TextHandler"]:
|
| 89 |
return TextHandler(super().swapcase())
|
| 90 |
|
| 91 |
+
def title(self) -> Union[str, "TextHandler"]:
|
| 92 |
return TextHandler(super().title())
|
| 93 |
|
| 94 |
+
def translate(self, table) -> Union[str, "TextHandler"]:
|
| 95 |
return TextHandler(super().translate(table))
|
| 96 |
|
| 97 |
+
def zfill(self, width: SupportsIndex) -> Union[str, "TextHandler"]:
|
| 98 |
return TextHandler(super().zfill(width))
|
| 99 |
|
| 100 |
+
def replace(
|
| 101 |
+
self, old: str, new: str, count: SupportsIndex = -1
|
| 102 |
+
) -> Union[str, "TextHandler"]:
|
| 103 |
return TextHandler(super().replace(old, new, count))
|
| 104 |
|
| 105 |
+
def upper(self) -> Union[str, "TextHandler"]:
|
| 106 |
return TextHandler(super().upper())
|
| 107 |
|
| 108 |
+
def lower(self) -> Union[str, "TextHandler"]:
|
| 109 |
return TextHandler(super().lower())
|
| 110 |
+
|
| 111 |
##############
|
| 112 |
|
| 113 |
+
def sort(self, reverse: bool = False) -> Union[str, "TextHandler"]:
|
| 114 |
"""Return a sorted version of the string"""
|
| 115 |
return self.__class__("".join(sorted(self, reverse=reverse)))
|
| 116 |
|
| 117 |
+
def clean(self) -> Union[str, "TextHandler"]:
|
| 118 |
"""Return a new version of the string after removing all white spaces and consecutive spaces"""
|
| 119 |
+
data = re.sub(r"[\t|\r|\n]", "", self)
|
| 120 |
+
data = re.sub(" +", " ", data)
|
| 121 |
return self.__class__(data.strip())
|
| 122 |
|
| 123 |
# For easy copy-paste from Scrapy/parsel code when needed :)
|
|
|
|
| 144 |
replace_entities: bool = True,
|
| 145 |
clean_match: bool = False,
|
| 146 |
case_sensitive: bool = True,
|
| 147 |
+
) -> bool: ...
|
|
|
|
| 148 |
|
| 149 |
@typing.overload
|
| 150 |
def re(
|
|
|
|
| 154 |
clean_match: bool = False,
|
| 155 |
case_sensitive: bool = True,
|
| 156 |
check_match: Literal[False] = False,
|
| 157 |
+
) -> "TextHandlers[TextHandler]": ...
|
|
|
|
| 158 |
|
| 159 |
def re(
|
| 160 |
+
self,
|
| 161 |
+
regex: Union[str, Pattern[str]],
|
| 162 |
+
replace_entities: bool = True,
|
| 163 |
+
clean_match: bool = False,
|
| 164 |
+
case_sensitive: bool = True,
|
| 165 |
+
check_match: bool = False,
|
| 166 |
) -> Union["TextHandlers[TextHandler]", bool]:
|
| 167 |
"""Apply the given regex to the current text and return a list of strings with the matches.
|
| 168 |
|
|
|
|
| 188 |
results = flatten(results)
|
| 189 |
|
| 190 |
if not replace_entities:
|
| 191 |
+
return TextHandlers(
|
| 192 |
+
typing.cast(
|
| 193 |
+
List[_TextHandlerType], [TextHandler(string) for string in results]
|
| 194 |
+
)
|
| 195 |
+
)
|
| 196 |
|
| 197 |
+
return TextHandlers(
|
| 198 |
+
typing.cast(
|
| 199 |
+
List[_TextHandlerType],
|
| 200 |
+
[TextHandler(_replace_entities(s)) for s in results],
|
| 201 |
+
)
|
| 202 |
+
)
|
| 203 |
|
| 204 |
+
def re_first(
    self,
    regex: Union[str, Pattern[str]],
    default=None,
    replace_entities: bool = True,
    clean_match: bool = False,
    case_sensitive: bool = True,
) -> "TextHandler":
    """Run the given regex against the text and return only the first match,
    or `default` when nothing matches.

    :param regex: Can be either a compiled regular expression or a string.
    :param default: Value returned when the pattern does not match anything.
    :param replace_entities: When enabled, HTML entity references are replaced
        by their corresponding characters in the matches.
    :param clean_match: When enabled, matching runs against a cleaned version of the text.
    :param case_sensitive: If disabled, the regex is compiled to ignore letter case.
    """
    matches = self.re(
        regex,
        replace_entities,
        clean_match=clean_match,
        case_sensitive=case_sensitive,
    )
    if matches:
        return matches[0]
    return default
|
| 228 |
|
| 229 |
|
|
|
|
| 231 |
"""
|
| 232 |
The :class:`TextHandlers` class is a subclass of the builtin ``List`` class, which provides a few additional methods.
|
| 233 |
"""
|
| 234 |
+
|
| 235 |
__slots__ = ()
|
| 236 |
|
| 237 |
@typing.overload
|
|
|
|
| 242 |
def __getitem__(self, pos: slice) -> "TextHandlers":
|
| 243 |
pass
|
| 244 |
|
| 245 |
+
def __getitem__(
|
| 246 |
+
self, pos: Union[SupportsIndex, slice]
|
| 247 |
+
) -> Union[TextHandler, "TextHandlers"]:
|
| 248 |
lst = super().__getitem__(pos)
|
| 249 |
if isinstance(pos, slice):
|
| 250 |
lst = [TextHandler(s) for s in lst]
|
| 251 |
return TextHandlers(typing.cast(List[_TextHandlerType], lst))
|
| 252 |
return typing.cast(_TextHandlerType, TextHandler(lst))
|
| 253 |
|
| 254 |
+
def re(
|
| 255 |
+
self,
|
| 256 |
+
regex: Union[str, Pattern[str]],
|
| 257 |
+
replace_entities: bool = True,
|
| 258 |
+
clean_match: bool = False,
|
| 259 |
+
case_sensitive: bool = True,
|
| 260 |
+
) -> "TextHandlers[TextHandler]":
|
| 261 |
"""Call the ``.re()`` method for each element in this list and return
|
| 262 |
their results flattened as TextHandlers.
|
| 263 |
|
|
|
|
| 271 |
]
|
| 272 |
return TextHandlers(flatten(results))
|
| 273 |
|
| 274 |
+
def re_first(
|
| 275 |
+
self,
|
| 276 |
+
regex: Union[str, Pattern[str]],
|
| 277 |
+
default=None,
|
| 278 |
+
replace_entities: bool = True,
|
| 279 |
+
clean_match: bool = False,
|
| 280 |
+
case_sensitive: bool = True,
|
| 281 |
+
) -> TextHandler:
|
| 282 |
"""Call the ``.re_first()`` method for each element in this list and return
|
| 283 |
the first result or the default value otherwise.
|
| 284 |
|
|
|
|
| 309 |
|
| 310 |
class AttributesHandler(Mapping[str, _TextHandlerType]):
|
| 311 |
"""A read-only mapping to use instead of the standard dictionary for the speed boost but at the same time I use it to add more functionalities.
|
| 312 |
+
If standard dictionary is needed, just convert this class to dictionary with `dict` function
|
| 313 |
"""
|
| 314 |
+
|
| 315 |
+
__slots__ = ("_data",)
|
| 316 |
|
| 317 |
def __init__(self, mapping=None, **kwargs):
    """Build the read-only attribute mapping.

    String values are wrapped in `TextHandler`; keyword arguments override/extend
    the initial mapping.

    :param mapping: Optional mapping of attribute names to values.
    """
    data = {}
    if mapping is not None:
        for key, value in mapping.items():
            data[key] = TextHandler(value) if type(value) is str else value

    for key, value in kwargs.items():
        data[key] = TextHandler(value) if type(value) is str else value

    # Fastest read-only mapping type
    self._data = MappingProxyType(data)
|
| 337 |
|
| 338 |
+
def get(self, key: str, default: Optional[str] = None) -> Union[_TextHandlerType, None]:
    """Look up `key` like the standard dictionary `.get()` method, returning `default` on a miss."""
    try:
        return self._data[key]
    except KeyError:
        return default
|
| 343 |
|
scrapling/core/mixins.py
CHANGED
|
@@ -1,32 +1,33 @@
|
|
| 1 |
-
|
| 2 |
class SelectorsGeneration:
|
| 3 |
"""Selectors generation functions
|
| 4 |
Trying to generate selectors like Firefox or maybe cleaner ones!? Ehm
|
| 5 |
Inspiration: https://searchfox.org/mozilla-central/source/devtools/shared/inspector/css-logic.js#591"""
|
| 6 |
|
| 7 |
-
def __general_selection(self, selection: str =
|
| 8 |
"""Generate a selector for the current element.
|
| 9 |
:return: A string of the generated selector.
|
| 10 |
"""
|
| 11 |
selectorPath = []
|
| 12 |
target = self
|
| 13 |
-
css = selection.lower() ==
|
| 14 |
while target is not None:
|
| 15 |
if target.parent:
|
| 16 |
-
if target.attrib.get(
|
| 17 |
# id is enough
|
| 18 |
part = (
|
| 19 |
-
f
|
|
|
|
| 20 |
else f"[@id='{target.attrib['id']}']"
|
| 21 |
)
|
| 22 |
selectorPath.append(part)
|
| 23 |
if not full_path:
|
| 24 |
return (
|
| 25 |
-
" > ".join(reversed(selectorPath))
|
| 26 |
-
|
|
|
|
| 27 |
)
|
| 28 |
else:
|
| 29 |
-
part = f
|
| 30 |
# We won't use classes anymore because some websites share the exact same classes between elements
|
| 31 |
# classes = target.attrib.get('class', '').split()
|
| 32 |
# if classes and css:
|
|
@@ -41,23 +42,26 @@ class SelectorsGeneration:
|
|
| 41 |
|
| 42 |
if counter[target.tag] > 1:
|
| 43 |
part += (
|
| 44 |
-
f":nth-of-type({counter[target.tag]})"
|
|
|
|
| 45 |
else f"[{counter[target.tag]}]"
|
| 46 |
)
|
| 47 |
|
| 48 |
selectorPath.append(part)
|
| 49 |
target = target.parent
|
| 50 |
-
if target is None or target.tag ==
|
| 51 |
return (
|
| 52 |
-
" > ".join(reversed(selectorPath))
|
| 53 |
-
|
|
|
|
| 54 |
)
|
| 55 |
else:
|
| 56 |
break
|
| 57 |
|
| 58 |
return (
|
| 59 |
-
" > ".join(reversed(selectorPath))
|
| 60 |
-
|
|
|
|
| 61 |
)
|
| 62 |
|
| 63 |
@property
|
|
@@ -79,11 +83,11 @@ class SelectorsGeneration:
|
|
| 79 |
"""Generate a XPath selector for the current element
|
| 80 |
:return: A string of the generated selector.
|
| 81 |
"""
|
| 82 |
-
return self.__general_selection(
|
| 83 |
|
| 84 |
@property
|
| 85 |
def generate_full_xpath_selector(self) -> str:
|
| 86 |
"""Generate a complete XPath selector for the current element
|
| 87 |
:return: A string of the generated selector.
|
| 88 |
"""
|
| 89 |
-
return self.__general_selection(
|
|
|
|
|
|
|
| 1 |
class SelectorsGeneration:
|
| 2 |
"""Selectors generation functions
|
| 3 |
Trying to generate selectors like Firefox or maybe cleaner ones!? Ehm
|
| 4 |
Inspiration: https://searchfox.org/mozilla-central/source/devtools/shared/inspector/css-logic.js#591"""
|
| 5 |
|
| 6 |
+
def __general_selection(self, selection: str = "css", full_path=False) -> str:
|
| 7 |
"""Generate a selector for the current element.
|
| 8 |
:return: A string of the generated selector.
|
| 9 |
"""
|
| 10 |
selectorPath = []
|
| 11 |
target = self
|
| 12 |
+
css = selection.lower() == "css"
|
| 13 |
while target is not None:
|
| 14 |
if target.parent:
|
| 15 |
+
if target.attrib.get("id"):
|
| 16 |
# id is enough
|
| 17 |
part = (
|
| 18 |
+
f"#{target.attrib['id']}"
|
| 19 |
+
if css
|
| 20 |
else f"[@id='{target.attrib['id']}']"
|
| 21 |
)
|
| 22 |
selectorPath.append(part)
|
| 23 |
if not full_path:
|
| 24 |
return (
|
| 25 |
+
" > ".join(reversed(selectorPath))
|
| 26 |
+
if css
|
| 27 |
+
else "//*" + "/".join(reversed(selectorPath))
|
| 28 |
)
|
| 29 |
else:
|
| 30 |
+
part = f"{target.tag}"
|
| 31 |
# We won't use classes anymore because some websites share the exact same classes between elements
|
| 32 |
# classes = target.attrib.get('class', '').split()
|
| 33 |
# if classes and css:
|
|
|
|
| 42 |
|
| 43 |
if counter[target.tag] > 1:
|
| 44 |
part += (
|
| 45 |
+
f":nth-of-type({counter[target.tag]})"
|
| 46 |
+
if css
|
| 47 |
else f"[{counter[target.tag]}]"
|
| 48 |
)
|
| 49 |
|
| 50 |
selectorPath.append(part)
|
| 51 |
target = target.parent
|
| 52 |
+
if target is None or target.tag == "html":
|
| 53 |
return (
|
| 54 |
+
" > ".join(reversed(selectorPath))
|
| 55 |
+
if css
|
| 56 |
+
else "//" + "/".join(reversed(selectorPath))
|
| 57 |
)
|
| 58 |
else:
|
| 59 |
break
|
| 60 |
|
| 61 |
return (
|
| 62 |
+
" > ".join(reversed(selectorPath))
|
| 63 |
+
if css
|
| 64 |
+
else "//" + "/".join(reversed(selectorPath))
|
| 65 |
)
|
| 66 |
|
| 67 |
@property
|
|
|
|
| 83 |
"""Generate a XPath selector for the current element
|
| 84 |
:return: A string of the generated selector.
|
| 85 |
"""
|
| 86 |
+
return self.__general_selection("xpath")
|
| 87 |
|
| 88 |
@property
def generate_full_xpath_selector(self) -> str:
    """Generate a complete (full-path) XPath selector for the current element.

    :return: A string of the generated selector.
    """
    # Delegates to the shared generator, requesting the full path from the root.
    return self.__general_selection("xpath", full_path=True)
|
scrapling/core/storage_adaptors.py
CHANGED
|
@@ -20,7 +20,7 @@ class StorageSystemMixin(ABC):
|
|
| 20 |
self.url = url
|
| 21 |
|
| 22 |
@lru_cache(64, typed=True)
|
| 23 |
-
def _get_base_url(self, default_value: str =
|
| 24 |
if not self.url or type(self.url) is not str:
|
| 25 |
return default_value
|
| 26 |
|
|
@@ -38,7 +38,7 @@ class StorageSystemMixin(ABC):
|
|
| 38 |
:param identifier: This is the identifier that will be used to retrieve the element later from the storage. See
|
| 39 |
the docs for more info.
|
| 40 |
"""
|
| 41 |
-
raise NotImplementedError(
|
| 42 |
|
| 43 |
@abstractmethod
|
| 44 |
def retrieve(self, identifier: str) -> Optional[Dict]:
|
|
@@ -48,7 +48,7 @@ class StorageSystemMixin(ABC):
|
|
| 48 |
the docs for more info.
|
| 49 |
:return: A dictionary of the unique properties
|
| 50 |
"""
|
| 51 |
-
raise NotImplementedError(
|
| 52 |
|
| 53 |
@staticmethod
|
| 54 |
@lru_cache(128, typed=True)
|
|
@@ -57,7 +57,7 @@ class StorageSystemMixin(ABC):
|
|
| 57 |
identifier = identifier.lower().strip()
|
| 58 |
if isinstance(identifier, str):
|
| 59 |
# Hash functions have to take bytes
|
| 60 |
-
identifier = identifier.encode(
|
| 61 |
|
| 62 |
hash_value = sha256(identifier).hexdigest()
|
| 63 |
return f"{hash_value}_{len(identifier)}" # Length to reduce collision chance
|
|
@@ -68,6 +68,7 @@ class SQLiteStorageSystem(StorageSystemMixin):
|
|
| 68 |
"""The recommended system to use, it's race condition safe and thread safe.
|
| 69 |
Mainly built so the library can run in threaded frameworks like scrapy or threaded tools
|
| 70 |
> It's optimized for threaded applications but running it without threads shouldn't make it slow."""
|
|
|
|
| 71 |
def __init__(self, storage_file: str, url: Union[str, None] = None):
|
| 72 |
"""
|
| 73 |
:param storage_file: File to be used to store elements
|
|
@@ -111,10 +112,13 @@ class SQLiteStorageSystem(StorageSystemMixin):
|
|
| 111 |
url = self._get_base_url()
|
| 112 |
element_data = _StorageTools.element_to_dict(element)
|
| 113 |
with self.lock:
|
| 114 |
-
self.cursor.execute(
|
|
|
|
| 115 |
INSERT OR REPLACE INTO storage (url, identifier, element_data)
|
| 116 |
VALUES (?, ?, ?)
|
| 117 |
-
""",
|
|
|
|
|
|
|
| 118 |
self.cursor.fetchall()
|
| 119 |
self.connection.commit()
|
| 120 |
|
|
@@ -129,7 +133,7 @@ class SQLiteStorageSystem(StorageSystemMixin):
|
|
| 129 |
with self.lock:
|
| 130 |
self.cursor.execute(
|
| 131 |
"SELECT element_data FROM storage WHERE url = ? AND identifier = ?",
|
| 132 |
-
(url, identifier)
|
| 133 |
)
|
| 134 |
result = self.cursor.fetchone()
|
| 135 |
if result:
|
|
|
|
| 20 |
self.url = url
|
| 21 |
|
| 22 |
@lru_cache(64, typed=True)
|
| 23 |
+
def _get_base_url(self, default_value: str = "default") -> str:
|
| 24 |
if not self.url or type(self.url) is not str:
|
| 25 |
return default_value
|
| 26 |
|
|
|
|
| 38 |
:param identifier: This is the identifier that will be used to retrieve the element later from the storage. See
|
| 39 |
the docs for more info.
|
| 40 |
"""
|
| 41 |
+
raise NotImplementedError("Storage system must implement `save` method")
|
| 42 |
|
| 43 |
@abstractmethod
|
| 44 |
def retrieve(self, identifier: str) -> Optional[Dict]:
|
|
|
|
| 48 |
the docs for more info.
|
| 49 |
:return: A dictionary of the unique properties
|
| 50 |
"""
|
| 51 |
+
raise NotImplementedError("Storage system must implement `save` method")
|
| 52 |
|
| 53 |
@staticmethod
|
| 54 |
@lru_cache(128, typed=True)
|
|
|
|
| 57 |
identifier = identifier.lower().strip()
|
| 58 |
if isinstance(identifier, str):
|
| 59 |
# Hash functions have to take bytes
|
| 60 |
+
identifier = identifier.encode("utf-8")
|
| 61 |
|
| 62 |
hash_value = sha256(identifier).hexdigest()
|
| 63 |
return f"{hash_value}_{len(identifier)}" # Length to reduce collision chance
|
|
|
|
| 68 |
"""The recommended system to use, it's race condition safe and thread safe.
|
| 69 |
Mainly built so the library can run in threaded frameworks like scrapy or threaded tools
|
| 70 |
> It's optimized for threaded applications but running it without threads shouldn't make it slow."""
|
| 71 |
+
|
| 72 |
def __init__(self, storage_file: str, url: Union[str, None] = None):
|
| 73 |
"""
|
| 74 |
:param storage_file: File to be used to store elements
|
|
|
|
| 112 |
url = self._get_base_url()
|
| 113 |
element_data = _StorageTools.element_to_dict(element)
|
| 114 |
with self.lock:
|
| 115 |
+
self.cursor.execute(
|
| 116 |
+
"""
|
| 117 |
INSERT OR REPLACE INTO storage (url, identifier, element_data)
|
| 118 |
VALUES (?, ?, ?)
|
| 119 |
+
""",
|
| 120 |
+
(url, identifier, orjson.dumps(element_data)),
|
| 121 |
+
)
|
| 122 |
self.cursor.fetchall()
|
| 123 |
self.connection.commit()
|
| 124 |
|
|
|
|
| 133 |
with self.lock:
|
| 134 |
self.cursor.execute(
|
| 135 |
"SELECT element_data FROM storage WHERE url = ? AND identifier = ?",
|
| 136 |
+
(url, identifier),
|
| 137 |
)
|
| 138 |
result = self.cursor.fetchone()
|
| 139 |
if result:
|
scrapling/core/translator.py
CHANGED
|
@@ -24,7 +24,6 @@ replace_html5_whitespaces = re.compile(regex).sub
|
|
| 24 |
|
| 25 |
|
| 26 |
class XPathExpr(OriginalXPathExpr):
|
| 27 |
-
|
| 28 |
textnode: bool = False
|
| 29 |
attribute: Optional[str] = None
|
| 30 |
|
|
@@ -123,7 +122,7 @@ class TranslatorMixin:
|
|
| 123 |
|
| 124 |
@staticmethod
|
| 125 |
def xpath_attr_functional_pseudo_element(
|
| 126 |
-
|
| 127 |
) -> XPathExpr:
|
| 128 |
"""Support selecting attribute values using ::attr() pseudo-element"""
|
| 129 |
if function.argument_types() not in (["STRING"], ["IDENT"]):
|
|
|
|
| 24 |
|
| 25 |
|
| 26 |
class XPathExpr(OriginalXPathExpr):
|
|
|
|
| 27 |
textnode: bool = False
|
| 28 |
attribute: Optional[str] = None
|
| 29 |
|
|
|
|
| 122 |
|
| 123 |
@staticmethod
|
| 124 |
def xpath_attr_functional_pseudo_element(
|
| 125 |
+
xpath: OriginalXPathExpr, function: FunctionalPseudoElement
|
| 126 |
) -> XPathExpr:
|
| 127 |
"""Support selecting attribute values using ::attr() pseudo-element"""
|
| 128 |
if function.argument_types() not in (["STRING"], ["IDENT"]):
|
scrapling/core/utils.py
CHANGED
|
@@ -11,7 +11,9 @@ from scrapling.core._types import Any, Dict, Iterable, Union
|
|
| 11 |
# functools.cache is available on Python 3.9+ only so let's keep lru_cache
|
| 12 |
from functools import lru_cache # isort:skip
|
| 13 |
|
| 14 |
-
html_forbidden = {
|
|
|
|
|
|
|
| 15 |
|
| 16 |
|
| 17 |
@lru_cache(1, typed=True)
|
|
@@ -20,12 +22,11 @@ def setup_logger():
|
|
| 20 |
|
| 21 |
:returns: logging.Logger: Configured logger instance
|
| 22 |
"""
|
| 23 |
-
logger = logging.getLogger(
|
| 24 |
logger.setLevel(logging.INFO)
|
| 25 |
|
| 26 |
formatter = logging.Formatter(
|
| 27 |
-
fmt="[%(asctime)s] %(levelname)s: %(message)s",
|
| 28 |
-
datefmt="%Y-%m-%d %H:%M:%S"
|
| 29 |
)
|
| 30 |
|
| 31 |
console_handler = logging.StreamHandler()
|
|
@@ -58,7 +59,13 @@ def flatten(lst: Iterable):
|
|
| 58 |
|
| 59 |
def _is_iterable(s: Any):
|
| 60 |
# This will be used only in regex functions to make sure it's iterable but not string/bytes
|
| 61 |
-
return isinstance(
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 62 |
|
| 63 |
|
| 64 |
class _StorageTools:
|
|
@@ -66,31 +73,43 @@ class _StorageTools:
|
|
| 66 |
def __clean_attributes(element: html.HtmlElement, forbidden: tuple = ()) -> Dict:
|
| 67 |
if not element.attrib:
|
| 68 |
return {}
|
| 69 |
-
return {
|
|
|
|
|
|
|
|
|
|
|
|
|
| 70 |
|
| 71 |
@classmethod
|
| 72 |
def element_to_dict(cls, element: html.HtmlElement) -> Dict:
|
| 73 |
parent = element.getparent()
|
| 74 |
result = {
|
| 75 |
-
|
| 76 |
-
|
| 77 |
-
|
| 78 |
-
|
| 79 |
}
|
| 80 |
if parent is not None:
|
| 81 |
-
result.update(
|
| 82 |
-
|
| 83 |
-
|
| 84 |
-
|
| 85 |
-
|
|
|
|
|
|
|
| 86 |
|
| 87 |
-
siblings = [
|
|
|
|
|
|
|
| 88 |
if siblings:
|
| 89 |
-
result.update({
|
| 90 |
|
| 91 |
-
children = [
|
|
|
|
|
|
|
|
|
|
|
|
|
| 92 |
if children:
|
| 93 |
-
result.update({
|
| 94 |
|
| 95 |
return result
|
| 96 |
|
|
@@ -98,9 +117,9 @@ class _StorageTools:
|
|
| 98 |
def _get_element_path(cls, element: html.HtmlElement):
|
| 99 |
parent = element.getparent()
|
| 100 |
return tuple(
|
| 101 |
-
(element.tag,)
|
| 102 |
-
|
| 103 |
-
)
|
| 104 |
)
|
| 105 |
|
| 106 |
|
|
@@ -117,6 +136,6 @@ class _StorageTools:
|
|
| 117 |
|
| 118 |
@lru_cache(128, typed=True)
|
| 119 |
def clean_spaces(string):
|
| 120 |
-
string = string.replace(
|
| 121 |
-
string = re.sub(
|
| 122 |
-
return re.sub(
|
|
|
|
| 11 |
# functools.cache is available on Python 3.9+ only so let's keep lru_cache
|
| 12 |
from functools import lru_cache # isort:skip
|
| 13 |
|
| 14 |
+
html_forbidden = {
|
| 15 |
+
html.HtmlComment,
|
| 16 |
+
}
|
| 17 |
|
| 18 |
|
| 19 |
@lru_cache(1, typed=True)
|
|
|
|
| 22 |
|
| 23 |
:returns: logging.Logger: Configured logger instance
|
| 24 |
"""
|
| 25 |
+
logger = logging.getLogger("scrapling")
|
| 26 |
logger.setLevel(logging.INFO)
|
| 27 |
|
| 28 |
formatter = logging.Formatter(
|
| 29 |
+
fmt="[%(asctime)s] %(levelname)s: %(message)s", datefmt="%Y-%m-%d %H:%M:%S"
|
|
|
|
| 30 |
)
|
| 31 |
|
| 32 |
console_handler = logging.StreamHandler()
|
|
|
|
| 59 |
|
| 60 |
def _is_iterable(s: Any):
|
| 61 |
# This will be used only in regex functions to make sure it's iterable but not string/bytes
|
| 62 |
+
return isinstance(
|
| 63 |
+
s,
|
| 64 |
+
(
|
| 65 |
+
list,
|
| 66 |
+
tuple,
|
| 67 |
+
),
|
| 68 |
+
)
|
| 69 |
|
| 70 |
|
| 71 |
class _StorageTools:
|
|
|
|
| 73 |
def __clean_attributes(element: html.HtmlElement, forbidden: tuple = ()) -> Dict:
    """Return the element's attributes as a dict, dropping forbidden keys and
    attributes whose values are empty or whitespace-only; kept values are stripped."""
    attributes = element.attrib
    if not attributes:
        return {}
    cleaned = {}
    for name, value in attributes.items():
        if name in forbidden or not value or not value.strip():
            continue
        cleaned[name] = value.strip()
    return cleaned
|
| 81 |
|
| 82 |
@classmethod
|
| 83 |
def element_to_dict(cls, element: html.HtmlElement) -> Dict:
|
| 84 |
parent = element.getparent()
|
| 85 |
result = {
|
| 86 |
+
"tag": str(element.tag),
|
| 87 |
+
"attributes": cls.__clean_attributes(element),
|
| 88 |
+
"text": element.text.strip() if element.text else None,
|
| 89 |
+
"path": cls._get_element_path(element),
|
| 90 |
}
|
| 91 |
if parent is not None:
|
| 92 |
+
result.update(
|
| 93 |
+
{
|
| 94 |
+
"parent_name": parent.tag,
|
| 95 |
+
"parent_attribs": dict(parent.attrib),
|
| 96 |
+
"parent_text": parent.text.strip() if parent.text else None,
|
| 97 |
+
}
|
| 98 |
+
)
|
| 99 |
|
| 100 |
+
siblings = [
|
| 101 |
+
child.tag for child in parent.iterchildren() if child != element
|
| 102 |
+
]
|
| 103 |
if siblings:
|
| 104 |
+
result.update({"siblings": tuple(siblings)})
|
| 105 |
|
| 106 |
+
children = [
|
| 107 |
+
child.tag
|
| 108 |
+
for child in element.iterchildren()
|
| 109 |
+
if type(child) not in html_forbidden
|
| 110 |
+
]
|
| 111 |
if children:
|
| 112 |
+
result.update({"children": tuple(children)})
|
| 113 |
|
| 114 |
return result
|
| 115 |
|
|
|
|
| 117 |
def _get_element_path(cls, element: html.HtmlElement):
    """Walk up the tree and return a tuple of tag names from the root down to `element`."""
    parent = element.getparent()
    if parent is None:
        return (element.tag,)
    # Prepend the ancestors' path recursively, then append this element's tag.
    return cls._get_element_path(parent) + (element.tag,)
|
| 124 |
|
| 125 |
|
|
|
|
| 136 |
|
| 137 |
@lru_cache(128, typed=True)
def clean_spaces(string):
    """Normalize whitespace: tabs become single spaces, newlines and carriage
    returns are removed, and runs of spaces collapse into one space.

    Fix: the previous pattern put a literal pipe inside the regex character
    class (alternation has no meaning there), so `|` characters were silently
    deleted from the text as well.
    """
    string = string.replace("\t", " ")
    # Remove only the real targets: newline and carriage return.
    string = re.sub(r"[\n\r]", "", string)
    return re.sub(r" +", " ", string)
|
scrapling/defaults.py
CHANGED
|
@@ -5,21 +5,33 @@ from scrapling.core.utils import log
|
|
| 5 |
# A lightweight approach to create lazy loader for each import for backward compatibility
|
| 6 |
# This will reduce the initial memory footprint significantly (only loads what's used)
|
| 7 |
def __getattr__(name):
|
| 8 |
-
if name ==
|
| 9 |
from scrapling.fetchers import Fetcher as cls
|
| 10 |
-
|
|
|
|
|
|
|
|
|
|
| 11 |
return cls
|
| 12 |
-
elif name ==
|
| 13 |
from scrapling.fetchers import AsyncFetcher as cls
|
| 14 |
-
|
|
|
|
|
|
|
|
|
|
| 15 |
return cls
|
| 16 |
-
elif name ==
|
| 17 |
from scrapling.fetchers import StealthyFetcher as cls
|
| 18 |
-
|
|
|
|
|
|
|
|
|
|
| 19 |
return cls
|
| 20 |
-
elif name ==
|
| 21 |
from scrapling.fetchers import PlayWrightFetcher as cls
|
| 22 |
-
|
|
|
|
|
|
|
|
|
|
| 23 |
return cls
|
| 24 |
else:
|
| 25 |
raise AttributeError(f"module 'scrapling' has no attribute '{name}'")
|
|
|
|
| 5 |
# A lightweight approach to create lazy loader for each import for backward compatibility
|
| 6 |
# This will reduces initial memory footprint significantly (only loads what's used)
|
| 7 |
def __getattr__(name):
|
| 8 |
+
if name == "Fetcher":
|
| 9 |
from scrapling.fetchers import Fetcher as cls
|
| 10 |
+
|
| 11 |
+
log.warning(
|
| 12 |
+
"This import is deprecated now and it will be removed with v0.3. Use `from scrapling.fetchers import Fetcher` instead"
|
| 13 |
+
)
|
| 14 |
return cls
|
| 15 |
+
elif name == "AsyncFetcher":
|
| 16 |
from scrapling.fetchers import AsyncFetcher as cls
|
| 17 |
+
|
| 18 |
+
log.warning(
|
| 19 |
+
"This import is deprecated now and it will be removed with v0.3. Use `from scrapling.fetchers import AsyncFetcher` instead"
|
| 20 |
+
)
|
| 21 |
return cls
|
| 22 |
+
elif name == "StealthyFetcher":
|
| 23 |
from scrapling.fetchers import StealthyFetcher as cls
|
| 24 |
+
|
| 25 |
+
log.warning(
|
| 26 |
+
"This import is deprecated now and it will be removed with v0.3. Use `from scrapling.fetchers import StealthyFetcher` instead"
|
| 27 |
+
)
|
| 28 |
return cls
|
| 29 |
+
elif name == "PlayWrightFetcher":
|
| 30 |
from scrapling.fetchers import PlayWrightFetcher as cls
|
| 31 |
+
|
| 32 |
+
log.warning(
|
| 33 |
+
"This import is deprecated now and it will be removed with v0.3. Use `from scrapling.fetchers import PlayWrightFetcher` instead"
|
| 34 |
+
)
|
| 35 |
return cls
|
| 36 |
else:
|
| 37 |
raise AttributeError(f"module 'scrapling' has no attribute '{name}'")
|
scrapling/engines/__init__.py
CHANGED
|
@@ -4,4 +4,4 @@ from .pw import PlaywrightEngine
|
|
| 4 |
from .static import StaticEngine
|
| 5 |
from .toolbelt import check_if_engine_usable
|
| 6 |
|
| 7 |
-
__all__ = [
|
|
|
|
| 4 |
from .static import StaticEngine
|
| 5 |
from .toolbelt import check_if_engine_usable
|
| 6 |
|
| 7 |
+
__all__ = ["CamoufoxEngine", "PlaywrightEngine"]
|
scrapling/engines/camo.py
CHANGED
|
@@ -2,27 +2,52 @@ from camoufox import DefaultAddons
|
|
| 2 |
from camoufox.async_api import AsyncCamoufox
|
| 3 |
from camoufox.sync_api import Camoufox
|
| 4 |
|
| 5 |
-
from scrapling.core._types import (
|
| 6 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 7 |
from scrapling.core.utils import log
|
| 8 |
-
from scrapling.engines.toolbelt import (
|
| 9 |
-
|
| 10 |
-
|
| 11 |
-
|
| 12 |
-
|
| 13 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
| 14 |
|
| 15 |
|
| 16 |
class CamoufoxEngine:
|
| 17 |
def __init__(
|
| 18 |
-
|
| 19 |
-
|
| 20 |
-
|
| 21 |
-
|
| 22 |
-
|
| 23 |
-
|
| 24 |
-
|
| 25 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 26 |
):
|
| 27 |
"""An engine that utilizes Camoufox library, check the `StealthyFetcher` class for more documentation.
|
| 28 |
|
|
@@ -97,7 +122,7 @@ class CamoufoxEngine:
|
|
| 97 |
"block_webrtc": self.block_webrtc,
|
| 98 |
"block_images": self.block_images, # Careful! it makes some websites doesn't finish loading at all like stackoverflow even in headful
|
| 99 |
"os": None if self.os_randomize else get_os_name(),
|
| 100 |
-
**self.additional_arguments
|
| 101 |
}
|
| 102 |
|
| 103 |
def _process_response_history(self, first_response):
|
|
@@ -109,19 +134,30 @@ class CamoufoxEngine:
|
|
| 109 |
while current_request:
|
| 110 |
try:
|
| 111 |
current_response = current_request.response()
|
| 112 |
-
history.insert(
|
| 113 |
-
|
| 114 |
-
|
| 115 |
-
|
| 116 |
-
|
| 117 |
-
|
| 118 |
-
|
| 119 |
-
|
| 120 |
-
|
| 121 |
-
|
| 122 |
-
|
| 123 |
-
|
| 124 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 125 |
except Exception as e:
|
| 126 |
log.error(f"Error processing redirect: {e}")
|
| 127 |
break
|
|
@@ -141,19 +177,30 @@ class CamoufoxEngine:
|
|
| 141 |
while current_request:
|
| 142 |
try:
|
| 143 |
current_response = await current_request.response()
|
| 144 |
-
history.insert(
|
| 145 |
-
|
| 146 |
-
|
| 147 |
-
|
| 148 |
-
|
| 149 |
-
|
| 150 |
-
|
| 151 |
-
|
| 152 |
-
|
| 153 |
-
|
| 154 |
-
|
| 155 |
-
|
| 156 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 157 |
except Exception as e:
|
| 158 |
log.error(f"Error processing redirect: {e}")
|
| 159 |
break
|
|
@@ -175,7 +222,10 @@ class CamoufoxEngine:
|
|
| 175 |
|
| 176 |
def handle_response(finished_response):
|
| 177 |
nonlocal final_response
|
| 178 |
-
if
|
|
|
|
|
|
|
|
|
|
| 179 |
final_response = finished_response
|
| 180 |
|
| 181 |
with Camoufox(**self._get_camoufox_options()) as browser:
|
|
@@ -195,7 +245,7 @@ class CamoufoxEngine:
|
|
| 195 |
page.wait_for_load_state(state="domcontentloaded")
|
| 196 |
|
| 197 |
if self.network_idle:
|
| 198 |
-
page.wait_for_load_state(
|
| 199 |
|
| 200 |
if self.page_action is not None:
|
| 201 |
try:
|
|
@@ -211,7 +261,7 @@ class CamoufoxEngine:
|
|
| 211 |
page.wait_for_load_state(state="load")
|
| 212 |
page.wait_for_load_state(state="domcontentloaded")
|
| 213 |
if self.network_idle:
|
| 214 |
-
page.wait_for_load_state(
|
| 215 |
except Exception as e:
|
| 216 |
log.error(f"Error waiting for selector {self.wait_selector}: {e}")
|
| 217 |
|
|
@@ -222,9 +272,13 @@ class CamoufoxEngine:
|
|
| 222 |
raise ValueError("Failed to get a response from the page")
|
| 223 |
|
| 224 |
# This will be parsed inside `Response`
|
| 225 |
-
encoding =
|
|
|
|
|
|
|
| 226 |
# PlayWright API sometimes give empty status text for some reason!
|
| 227 |
-
status_text = final_response.status_text or StatusText.get(
|
|
|
|
|
|
|
| 228 |
|
| 229 |
history = self._process_response_history(first_response)
|
| 230 |
try:
|
|
@@ -236,15 +290,17 @@ class CamoufoxEngine:
|
|
| 236 |
response = Response(
|
| 237 |
url=page.url,
|
| 238 |
text=page_content,
|
| 239 |
-
body=page_content.encode(
|
| 240 |
status=final_response.status,
|
| 241 |
reason=status_text,
|
| 242 |
encoding=encoding,
|
| 243 |
-
cookies={
|
|
|
|
|
|
|
| 244 |
headers=first_response.all_headers(),
|
| 245 |
request_headers=first_response.request.all_headers(),
|
| 246 |
history=history,
|
| 247 |
-
**self.adaptor_arguments
|
| 248 |
)
|
| 249 |
page.close()
|
| 250 |
context.close()
|
|
@@ -262,7 +318,10 @@ class CamoufoxEngine:
|
|
| 262 |
|
| 263 |
async def handle_response(finished_response):
|
| 264 |
nonlocal final_response
|
| 265 |
-
if
|
|
|
|
|
|
|
|
|
|
| 266 |
final_response = finished_response
|
| 267 |
|
| 268 |
async with AsyncCamoufox(**self._get_camoufox_options()) as browser:
|
|
@@ -282,7 +341,7 @@ class CamoufoxEngine:
|
|
| 282 |
await page.wait_for_load_state(state="domcontentloaded")
|
| 283 |
|
| 284 |
if self.network_idle:
|
| 285 |
-
await page.wait_for_load_state(
|
| 286 |
|
| 287 |
if self.page_action is not None:
|
| 288 |
try:
|
|
@@ -298,7 +357,7 @@ class CamoufoxEngine:
|
|
| 298 |
await page.wait_for_load_state(state="load")
|
| 299 |
await page.wait_for_load_state(state="domcontentloaded")
|
| 300 |
if self.network_idle:
|
| 301 |
-
await page.wait_for_load_state(
|
| 302 |
except Exception as e:
|
| 303 |
log.error(f"Error waiting for selector {self.wait_selector}: {e}")
|
| 304 |
|
|
@@ -309,9 +368,13 @@ class CamoufoxEngine:
|
|
| 309 |
raise ValueError("Failed to get a response from the page")
|
| 310 |
|
| 311 |
# This will be parsed inside `Response`
|
| 312 |
-
encoding =
|
|
|
|
|
|
|
| 313 |
# PlayWright API sometimes give empty status text for some reason!
|
| 314 |
-
status_text = final_response.status_text or StatusText.get(
|
|
|
|
|
|
|
| 315 |
|
| 316 |
history = await self._async_process_response_history(first_response)
|
| 317 |
try:
|
|
@@ -323,15 +386,18 @@ class CamoufoxEngine:
|
|
| 323 |
response = Response(
|
| 324 |
url=page.url,
|
| 325 |
text=page_content,
|
| 326 |
-
body=page_content.encode(
|
| 327 |
status=final_response.status,
|
| 328 |
reason=status_text,
|
| 329 |
encoding=encoding,
|
| 330 |
-
cookies={
|
|
|
|
|
|
|
|
|
|
| 331 |
headers=await first_response.all_headers(),
|
| 332 |
request_headers=await first_response.request.all_headers(),
|
| 333 |
history=history,
|
| 334 |
-
**self.adaptor_arguments
|
| 335 |
)
|
| 336 |
await page.close()
|
| 337 |
await context.close()
|
|
|
|
| 2 |
from camoufox.async_api import AsyncCamoufox
|
| 3 |
from camoufox.sync_api import Camoufox
|
| 4 |
|
| 5 |
+
from scrapling.core._types import (
|
| 6 |
+
Callable,
|
| 7 |
+
Dict,
|
| 8 |
+
List,
|
| 9 |
+
Literal,
|
| 10 |
+
Optional,
|
| 11 |
+
SelectorWaitStates,
|
| 12 |
+
Union,
|
| 13 |
+
)
|
| 14 |
from scrapling.core.utils import log
|
| 15 |
+
from scrapling.engines.toolbelt import (
|
| 16 |
+
Response,
|
| 17 |
+
StatusText,
|
| 18 |
+
async_intercept_route,
|
| 19 |
+
check_type_validity,
|
| 20 |
+
construct_proxy_dict,
|
| 21 |
+
generate_convincing_referer,
|
| 22 |
+
get_os_name,
|
| 23 |
+
intercept_route,
|
| 24 |
+
)
|
| 25 |
|
| 26 |
|
| 27 |
class CamoufoxEngine:
|
| 28 |
def __init__(
|
| 29 |
+
self,
|
| 30 |
+
headless: Union[bool, Literal["virtual"]] = True, # noqa: F821
|
| 31 |
+
block_images: bool = False,
|
| 32 |
+
disable_resources: bool = False,
|
| 33 |
+
block_webrtc: bool = False,
|
| 34 |
+
allow_webgl: bool = True,
|
| 35 |
+
network_idle: bool = False,
|
| 36 |
+
humanize: Union[bool, float] = True,
|
| 37 |
+
wait: Optional[int] = 0,
|
| 38 |
+
timeout: Optional[float] = 30000,
|
| 39 |
+
page_action: Callable = None,
|
| 40 |
+
wait_selector: Optional[str] = None,
|
| 41 |
+
addons: Optional[List[str]] = None,
|
| 42 |
+
wait_selector_state: SelectorWaitStates = "attached",
|
| 43 |
+
google_search: bool = True,
|
| 44 |
+
extra_headers: Optional[Dict[str, str]] = None,
|
| 45 |
+
proxy: Optional[Union[str, Dict[str, str]]] = None,
|
| 46 |
+
os_randomize: bool = False,
|
| 47 |
+
disable_ads: bool = False,
|
| 48 |
+
geoip: bool = False,
|
| 49 |
+
adaptor_arguments: Dict = None,
|
| 50 |
+
additional_arguments: Dict = None,
|
| 51 |
):
|
| 52 |
"""An engine that utilizes Camoufox library, check the `StealthyFetcher` class for more documentation.
|
| 53 |
|
|
|
|
| 122 |
"block_webrtc": self.block_webrtc,
|
| 123 |
"block_images": self.block_images, # Careful! it makes some websites doesn't finish loading at all like stackoverflow even in headful
|
| 124 |
"os": None if self.os_randomize else get_os_name(),
|
| 125 |
+
**self.additional_arguments,
|
| 126 |
}
|
| 127 |
|
| 128 |
def _process_response_history(self, first_response):
|
|
|
|
| 134 |
while current_request:
|
| 135 |
try:
|
| 136 |
current_response = current_request.response()
|
| 137 |
+
history.insert(
|
| 138 |
+
0,
|
| 139 |
+
Response(
|
| 140 |
+
url=current_request.url,
|
| 141 |
+
# using current_response.text() will trigger "Error: Response.text: Response body is unavailable for redirect responses"
|
| 142 |
+
text="",
|
| 143 |
+
body=b"",
|
| 144 |
+
status=current_response.status if current_response else 301,
|
| 145 |
+
reason=(
|
| 146 |
+
current_response.status_text
|
| 147 |
+
or StatusText.get(current_response.status)
|
| 148 |
+
)
|
| 149 |
+
if current_response
|
| 150 |
+
else StatusText.get(301),
|
| 151 |
+
encoding=current_response.headers.get("content-type", "")
|
| 152 |
+
or "utf-8",
|
| 153 |
+
cookies={},
|
| 154 |
+
headers=current_response.all_headers()
|
| 155 |
+
if current_response
|
| 156 |
+
else {},
|
| 157 |
+
request_headers=current_request.all_headers(),
|
| 158 |
+
**self.adaptor_arguments,
|
| 159 |
+
),
|
| 160 |
+
)
|
| 161 |
except Exception as e:
|
| 162 |
log.error(f"Error processing redirect: {e}")
|
| 163 |
break
|
|
|
|
| 177 |
while current_request:
|
| 178 |
try:
|
| 179 |
current_response = await current_request.response()
|
| 180 |
+
history.insert(
|
| 181 |
+
0,
|
| 182 |
+
Response(
|
| 183 |
+
url=current_request.url,
|
| 184 |
+
# using current_response.text() will trigger "Error: Response.text: Response body is unavailable for redirect responses"
|
| 185 |
+
text="",
|
| 186 |
+
body=b"",
|
| 187 |
+
status=current_response.status if current_response else 301,
|
| 188 |
+
reason=(
|
| 189 |
+
current_response.status_text
|
| 190 |
+
or StatusText.get(current_response.status)
|
| 191 |
+
)
|
| 192 |
+
if current_response
|
| 193 |
+
else StatusText.get(301),
|
| 194 |
+
encoding=current_response.headers.get("content-type", "")
|
| 195 |
+
or "utf-8",
|
| 196 |
+
cookies={},
|
| 197 |
+
headers=await current_response.all_headers()
|
| 198 |
+
if current_response
|
| 199 |
+
else {},
|
| 200 |
+
request_headers=await current_request.all_headers(),
|
| 201 |
+
**self.adaptor_arguments,
|
| 202 |
+
),
|
| 203 |
+
)
|
| 204 |
except Exception as e:
|
| 205 |
log.error(f"Error processing redirect: {e}")
|
| 206 |
break
|
|
|
|
| 222 |
|
| 223 |
def handle_response(finished_response):
|
| 224 |
nonlocal final_response
|
| 225 |
+
if (
|
| 226 |
+
finished_response.request.resource_type == "document"
|
| 227 |
+
and finished_response.request.is_navigation_request()
|
| 228 |
+
):
|
| 229 |
final_response = finished_response
|
| 230 |
|
| 231 |
with Camoufox(**self._get_camoufox_options()) as browser:
|
|
|
|
| 245 |
page.wait_for_load_state(state="domcontentloaded")
|
| 246 |
|
| 247 |
if self.network_idle:
|
| 248 |
+
page.wait_for_load_state("networkidle")
|
| 249 |
|
| 250 |
if self.page_action is not None:
|
| 251 |
try:
|
|
|
|
| 261 |
page.wait_for_load_state(state="load")
|
| 262 |
page.wait_for_load_state(state="domcontentloaded")
|
| 263 |
if self.network_idle:
|
| 264 |
+
page.wait_for_load_state("networkidle")
|
| 265 |
except Exception as e:
|
| 266 |
log.error(f"Error waiting for selector {self.wait_selector}: {e}")
|
| 267 |
|
|
|
|
| 272 |
raise ValueError("Failed to get a response from the page")
|
| 273 |
|
| 274 |
# This will be parsed inside `Response`
|
| 275 |
+
encoding = (
|
| 276 |
+
final_response.headers.get("content-type", "") or "utf-8"
|
| 277 |
+
) # default encoding
|
| 278 |
# PlayWright API sometimes give empty status text for some reason!
|
| 279 |
+
status_text = final_response.status_text or StatusText.get(
|
| 280 |
+
final_response.status
|
| 281 |
+
)
|
| 282 |
|
| 283 |
history = self._process_response_history(first_response)
|
| 284 |
try:
|
|
|
|
| 290 |
response = Response(
|
| 291 |
url=page.url,
|
| 292 |
text=page_content,
|
| 293 |
+
body=page_content.encode("utf-8"),
|
| 294 |
status=final_response.status,
|
| 295 |
reason=status_text,
|
| 296 |
encoding=encoding,
|
| 297 |
+
cookies={
|
| 298 |
+
cookie["name"]: cookie["value"] for cookie in page.context.cookies()
|
| 299 |
+
},
|
| 300 |
headers=first_response.all_headers(),
|
| 301 |
request_headers=first_response.request.all_headers(),
|
| 302 |
history=history,
|
| 303 |
+
**self.adaptor_arguments,
|
| 304 |
)
|
| 305 |
page.close()
|
| 306 |
context.close()
|
|
|
|
| 318 |
|
| 319 |
async def handle_response(finished_response):
|
| 320 |
nonlocal final_response
|
| 321 |
+
if (
|
| 322 |
+
finished_response.request.resource_type == "document"
|
| 323 |
+
and finished_response.request.is_navigation_request()
|
| 324 |
+
):
|
| 325 |
final_response = finished_response
|
| 326 |
|
| 327 |
async with AsyncCamoufox(**self._get_camoufox_options()) as browser:
|
|
|
|
| 341 |
await page.wait_for_load_state(state="domcontentloaded")
|
| 342 |
|
| 343 |
if self.network_idle:
|
| 344 |
+
await page.wait_for_load_state("networkidle")
|
| 345 |
|
| 346 |
if self.page_action is not None:
|
| 347 |
try:
|
|
|
|
| 357 |
await page.wait_for_load_state(state="load")
|
| 358 |
await page.wait_for_load_state(state="domcontentloaded")
|
| 359 |
if self.network_idle:
|
| 360 |
+
await page.wait_for_load_state("networkidle")
|
| 361 |
except Exception as e:
|
| 362 |
log.error(f"Error waiting for selector {self.wait_selector}: {e}")
|
| 363 |
|
|
|
|
| 368 |
raise ValueError("Failed to get a response from the page")
|
| 369 |
|
| 370 |
# This will be parsed inside `Response`
|
| 371 |
+
encoding = (
|
| 372 |
+
final_response.headers.get("content-type", "") or "utf-8"
|
| 373 |
+
) # default encoding
|
| 374 |
# PlayWright API sometimes give empty status text for some reason!
|
| 375 |
+
status_text = final_response.status_text or StatusText.get(
|
| 376 |
+
final_response.status
|
| 377 |
+
)
|
| 378 |
|
| 379 |
history = await self._async_process_response_history(first_response)
|
| 380 |
try:
|
|
|
|
| 386 |
response = Response(
|
| 387 |
url=page.url,
|
| 388 |
text=page_content,
|
| 389 |
+
body=page_content.encode("utf-8"),
|
| 390 |
status=final_response.status,
|
| 391 |
reason=status_text,
|
| 392 |
encoding=encoding,
|
| 393 |
+
cookies={
|
| 394 |
+
cookie["name"]: cookie["value"]
|
| 395 |
+
for cookie in await page.context.cookies()
|
| 396 |
+
},
|
| 397 |
headers=await first_response.all_headers(),
|
| 398 |
request_headers=await first_response.request.all_headers(),
|
| 399 |
history=history,
|
| 400 |
+
**self.adaptor_arguments,
|
| 401 |
)
|
| 402 |
await page.close()
|
| 403 |
await context.close()
|
scrapling/engines/constants.py
CHANGED
|
@@ -1,92 +1,92 @@
|
|
| 1 |
# Disable loading these resources for speed
|
| 2 |
DEFAULT_DISABLED_RESOURCES = {
|
| 3 |
-
|
| 4 |
-
|
| 5 |
-
|
| 6 |
-
|
| 7 |
-
|
| 8 |
-
|
| 9 |
-
|
| 10 |
-
|
| 11 |
-
|
| 12 |
-
|
| 13 |
}
|
| 14 |
|
| 15 |
DEFAULT_STEALTH_FLAGS = (
|
| 16 |
# Explanation: https://peter.sh/experiments/chromium-command-line-switches/
|
| 17 |
# Generally this will make the browser faster and less detectable
|
| 18 |
-
|
| 19 |
-
|
| 20 |
-
|
| 21 |
-
|
| 22 |
-
|
| 23 |
-
|
| 24 |
-
|
| 25 |
-
|
| 26 |
-
|
| 27 |
-
|
| 28 |
-
|
| 29 |
-
|
| 30 |
-
|
| 31 |
-
|
| 32 |
-
|
| 33 |
-
|
| 34 |
-
|
| 35 |
-
|
| 36 |
-
|
| 37 |
-
|
| 38 |
-
|
| 39 |
-
|
| 40 |
-
|
| 41 |
-
|
| 42 |
-
|
| 43 |
-
|
| 44 |
-
|
| 45 |
-
|
| 46 |
-
|
| 47 |
# '--disable-popup-blocking',
|
| 48 |
-
|
| 49 |
-
|
| 50 |
-
|
| 51 |
-
|
| 52 |
-
|
| 53 |
-
|
| 54 |
-
|
| 55 |
-
|
| 56 |
-
|
| 57 |
-
|
| 58 |
-
|
| 59 |
-
|
| 60 |
-
|
| 61 |
-
|
| 62 |
-
|
| 63 |
# '--disable-reading-from-canvas', # For Firefox
|
| 64 |
-
|
| 65 |
-
|
| 66 |
-
|
| 67 |
-
|
| 68 |
-
|
| 69 |
-
|
| 70 |
-
|
| 71 |
-
|
| 72 |
-
|
| 73 |
-
|
| 74 |
-
|
| 75 |
-
|
| 76 |
-
|
| 77 |
-
|
| 78 |
-
|
| 79 |
-
|
| 80 |
-
|
| 81 |
-
|
| 82 |
-
|
| 83 |
-
|
| 84 |
-
|
| 85 |
-
|
| 86 |
-
|
| 87 |
-
|
| 88 |
-
|
| 89 |
-
|
| 90 |
)
|
| 91 |
|
| 92 |
# Defaulting to the docker mode, token doesn't matter in it as it's passed for the container
|
|
@@ -95,13 +95,10 @@ NSTBROWSER_DEFAULT_QUERY = {
|
|
| 95 |
"headless": True,
|
| 96 |
"autoClose": True,
|
| 97 |
"fingerprint": {
|
| 98 |
-
"flags": {
|
| 99 |
-
|
| 100 |
-
|
| 101 |
-
|
| 102 |
-
"platform": 'linux', # support: windows, mac, linux
|
| 103 |
-
"kernel": 'chromium', # only support: chromium
|
| 104 |
-
"kernelMilestone": '128',
|
| 105 |
"hardwareConcurrency": 8,
|
| 106 |
"deviceMemory": 8,
|
| 107 |
},
|
|
|
|
| 1 |
# Disable loading these resources for speed
|
| 2 |
DEFAULT_DISABLED_RESOURCES = {
|
| 3 |
+
"font",
|
| 4 |
+
"image",
|
| 5 |
+
"media",
|
| 6 |
+
"beacon",
|
| 7 |
+
"object",
|
| 8 |
+
"imageset",
|
| 9 |
+
"texttrack",
|
| 10 |
+
"websocket",
|
| 11 |
+
"csp_report",
|
| 12 |
+
"stylesheet",
|
| 13 |
}
|
| 14 |
|
| 15 |
DEFAULT_STEALTH_FLAGS = (
|
| 16 |
# Explanation: https://peter.sh/experiments/chromium-command-line-switches/
|
| 17 |
# Generally this will make the browser faster and less detectable
|
| 18 |
+
"--no-pings",
|
| 19 |
+
"--incognito",
|
| 20 |
+
"--test-type",
|
| 21 |
+
"--lang=en-US",
|
| 22 |
+
"--mute-audio",
|
| 23 |
+
"--no-first-run",
|
| 24 |
+
"--disable-sync",
|
| 25 |
+
"--hide-scrollbars",
|
| 26 |
+
"--disable-logging",
|
| 27 |
+
"--start-maximized", # For headless check bypass
|
| 28 |
+
"--enable-async-dns",
|
| 29 |
+
"--disable-breakpad",
|
| 30 |
+
"--disable-infobars",
|
| 31 |
+
"--accept-lang=en-US",
|
| 32 |
+
"--use-mock-keychain",
|
| 33 |
+
"--disable-translate",
|
| 34 |
+
"--disable-extensions",
|
| 35 |
+
"--disable-voice-input",
|
| 36 |
+
"--window-position=0,0",
|
| 37 |
+
"--disable-wake-on-wifi",
|
| 38 |
+
"--ignore-gpu-blocklist",
|
| 39 |
+
"--enable-tcp-fast-open",
|
| 40 |
+
"--enable-web-bluetooth",
|
| 41 |
+
"--disable-hang-monitor",
|
| 42 |
+
"--password-store=basic",
|
| 43 |
+
"--disable-cloud-import",
|
| 44 |
+
"--disable-default-apps",
|
| 45 |
+
"--disable-print-preview",
|
| 46 |
+
"--disable-dev-shm-usage",
|
| 47 |
# '--disable-popup-blocking',
|
| 48 |
+
"--metrics-recording-only",
|
| 49 |
+
"--disable-crash-reporter",
|
| 50 |
+
"--disable-partial-raster",
|
| 51 |
+
"--disable-gesture-typing",
|
| 52 |
+
"--disable-checker-imaging",
|
| 53 |
+
"--disable-prompt-on-repost",
|
| 54 |
+
"--force-color-profile=srgb",
|
| 55 |
+
"--font-render-hinting=none",
|
| 56 |
+
"--no-default-browser-check",
|
| 57 |
+
"--aggressive-cache-discard",
|
| 58 |
+
"--disable-component-update",
|
| 59 |
+
"--disable-cookie-encryption",
|
| 60 |
+
"--disable-domain-reliability",
|
| 61 |
+
"--disable-threaded-animation",
|
| 62 |
+
"--disable-threaded-scrolling",
|
| 63 |
# '--disable-reading-from-canvas', # For Firefox
|
| 64 |
+
"--enable-simple-cache-backend",
|
| 65 |
+
"--disable-background-networking",
|
| 66 |
+
"--disable-session-crashed-bubble",
|
| 67 |
+
"--enable-surface-synchronization",
|
| 68 |
+
"--disable-image-animation-resync",
|
| 69 |
+
"--disable-renderer-backgrounding",
|
| 70 |
+
"--disable-ipc-flooding-protection",
|
| 71 |
+
"--prerender-from-omnibox=disabled",
|
| 72 |
+
"--safebrowsing-disable-auto-update",
|
| 73 |
+
"--disable-offer-upload-credit-cards",
|
| 74 |
+
"--disable-features=site-per-process",
|
| 75 |
+
"--disable-background-timer-throttling",
|
| 76 |
+
"--disable-new-content-rendering-timeout",
|
| 77 |
+
"--run-all-compositor-stages-before-draw",
|
| 78 |
+
"--disable-client-side-phishing-detection",
|
| 79 |
+
"--disable-backgrounding-occluded-windows",
|
| 80 |
+
"--disable-layer-tree-host-memory-pressure",
|
| 81 |
+
"--autoplay-policy=no-user-gesture-required",
|
| 82 |
+
"--disable-offer-store-unmasked-wallet-cards",
|
| 83 |
+
"--disable-blink-features=AutomationControlled",
|
| 84 |
+
"--webrtc-ip-handling-policy=disable_non_proxied_udp",
|
| 85 |
+
"--disable-component-extensions-with-background-pages",
|
| 86 |
+
"--force-webrtc-ip-handling-policy=disable_non_proxied_udp",
|
| 87 |
+
"--enable-features=NetworkService,NetworkServiceInProcess,TrustTokens,TrustTokensAlwaysAllowIssuance",
|
| 88 |
+
"--blink-settings=primaryHoverType=2,availableHoverTypes=2,primaryPointerType=4,availablePointerTypes=4",
|
| 89 |
+
"--disable-features=AudioServiceOutOfProcess,IsolateOrigins,site-per-process,TranslateUI,BlinkGenPropertyTrees",
|
| 90 |
)
|
| 91 |
|
| 92 |
# Defaulting to the docker mode, token doesn't matter in it as it's passed for the container
|
|
|
|
| 95 |
"headless": True,
|
| 96 |
"autoClose": True,
|
| 97 |
"fingerprint": {
|
| 98 |
+
"flags": {"timezone": "BasedOnIp", "screen": "Custom"},
|
| 99 |
+
"platform": "linux", # support: windows, mac, linux
|
| 100 |
+
"kernel": "chromium", # only support: chromium
|
| 101 |
+
"kernelMilestone": "128",
|
|
|
|
|
|
|
|
|
|
| 102 |
"hardwareConcurrency": 8,
|
| 103 |
"deviceMemory": 8,
|
| 104 |
},
|
scrapling/engines/pw.py
CHANGED
|
@@ -1,42 +1,46 @@
|
|
| 1 |
import json
|
| 2 |
|
| 3 |
-
from scrapling.core._types import
|
| 4 |
-
SelectorWaitStates, Union)
|
| 5 |
from scrapling.core.utils import log, lru_cache
|
| 6 |
-
from scrapling.engines.constants import
|
| 7 |
-
|
| 8 |
-
|
| 9 |
-
|
| 10 |
-
|
| 11 |
-
|
| 12 |
-
|
| 13 |
-
|
| 14 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
| 15 |
|
| 16 |
|
| 17 |
class PlaywrightEngine:
|
| 18 |
def __init__(
|
| 19 |
-
|
| 20 |
-
|
| 21 |
-
|
| 22 |
-
|
| 23 |
-
|
| 24 |
-
|
| 25 |
-
|
| 26 |
-
|
| 27 |
-
|
| 28 |
-
|
| 29 |
-
|
| 30 |
-
|
| 31 |
-
|
| 32 |
-
|
| 33 |
-
|
| 34 |
-
|
| 35 |
-
|
| 36 |
-
|
| 37 |
-
|
| 38 |
-
|
| 39 |
-
|
|
|
|
| 40 |
):
|
| 41 |
"""An engine that utilizes PlayWright library, check the `PlayWrightFetcher` class for more documentation.
|
| 42 |
|
|
@@ -65,7 +69,7 @@ class PlaywrightEngine:
|
|
| 65 |
:param adaptor_arguments: The arguments that will be passed in the end while creating the final Adaptor's class.
|
| 66 |
"""
|
| 67 |
self.headless = headless
|
| 68 |
-
self.locale = check_type_validity(locale, [str],
|
| 69 |
self.disable_resources = disable_resources
|
| 70 |
self.network_idle = bool(network_idle)
|
| 71 |
self.stealth = bool(stealth)
|
|
@@ -95,8 +99,8 @@ class PlaywrightEngine:
|
|
| 95 |
self.adaptor_arguments = adaptor_arguments if adaptor_arguments else {}
|
| 96 |
self.harmful_default_args = [
|
| 97 |
# This will be ignored to avoid detection more and possibly avoid the popup crashing bug abuse: https://issues.chromium.org/issues/340836884
|
| 98 |
-
|
| 99 |
-
|
| 100 |
# '--disable-component-update',
|
| 101 |
# '--disable-default-apps',
|
| 102 |
# '--disable-extensions',
|
|
@@ -114,12 +118,16 @@ class PlaywrightEngine:
|
|
| 114 |
query = NSTBROWSER_DEFAULT_QUERY.copy()
|
| 115 |
if self.stealth:
|
| 116 |
flags = self.__set_flags()
|
| 117 |
-
query.update(
|
| 118 |
-
|
| 119 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
| 120 |
|
| 121 |
config = {
|
| 122 |
-
|
| 123 |
# 'token': ''
|
| 124 |
}
|
| 125 |
cdp_url = construct_cdp_url(cdp_url, config)
|
|
@@ -134,17 +142,25 @@ class PlaywrightEngine:
|
|
| 134 |
"""Returns the flags that will be used while launching the browser if stealth mode is enabled"""
|
| 135 |
flags = DEFAULT_STEALTH_FLAGS
|
| 136 |
if self.hide_canvas:
|
| 137 |
-
flags += (
|
| 138 |
if self.disable_webgl:
|
| 139 |
-
flags += (
|
|
|
|
|
|
|
|
|
|
|
|
|
| 140 |
|
| 141 |
return flags
|
| 142 |
|
| 143 |
def __launch_kwargs(self):
|
| 144 |
"""Creates the arguments we will use while launching playwright's browser"""
|
| 145 |
-
launch_kwargs = {
|
|
|
|
|
|
|
|
|
|
|
|
|
| 146 |
if self.stealth:
|
| 147 |
-
launch_kwargs.update({
|
| 148 |
|
| 149 |
return launch_kwargs
|
| 150 |
|
|
@@ -153,22 +169,26 @@ class PlaywrightEngine:
|
|
| 153 |
context_kwargs = {
|
| 154 |
"proxy": self.proxy,
|
| 155 |
"locale": self.locale,
|
| 156 |
-
"color_scheme":
|
| 157 |
"device_scale_factor": 2,
|
| 158 |
"extra_http_headers": self.extra_headers if self.extra_headers else {},
|
| 159 |
-
"user_agent": self.useragent
|
|
|
|
|
|
|
| 160 |
}
|
| 161 |
if self.stealth:
|
| 162 |
-
context_kwargs.update(
|
| 163 |
-
|
| 164 |
-
|
| 165 |
-
|
| 166 |
-
|
| 167 |
-
|
| 168 |
-
|
| 169 |
-
|
| 170 |
-
|
| 171 |
-
|
|
|
|
|
|
|
| 172 |
|
| 173 |
return context_kwargs
|
| 174 |
|
|
@@ -184,10 +204,16 @@ class PlaywrightEngine:
|
|
| 184 |
# https://arh.antoinevastel.com/bots/areyouheadless/
|
| 185 |
# https://prescience-data.github.io/execution-monitor.html
|
| 186 |
return tuple(
|
| 187 |
-
js_bypass_path(script)
|
|
|
|
| 188 |
# Order is important
|
| 189 |
-
|
| 190 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 191 |
)
|
| 192 |
)
|
| 193 |
|
|
@@ -200,19 +226,30 @@ class PlaywrightEngine:
|
|
| 200 |
while current_request:
|
| 201 |
try:
|
| 202 |
current_response = current_request.response()
|
| 203 |
-
history.insert(
|
| 204 |
-
|
| 205 |
-
|
| 206 |
-
|
| 207 |
-
|
| 208 |
-
|
| 209 |
-
|
| 210 |
-
|
| 211 |
-
|
| 212 |
-
|
| 213 |
-
|
| 214 |
-
|
| 215 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 216 |
except Exception as e:
|
| 217 |
log.error(f"Error processing redirect: {e}")
|
| 218 |
break
|
|
@@ -232,19 +269,30 @@ class PlaywrightEngine:
|
|
| 232 |
while current_request:
|
| 233 |
try:
|
| 234 |
current_response = await current_request.response()
|
| 235 |
-
history.insert(
|
| 236 |
-
|
| 237 |
-
|
| 238 |
-
|
| 239 |
-
|
| 240 |
-
|
| 241 |
-
|
| 242 |
-
|
| 243 |
-
|
| 244 |
-
|
| 245 |
-
|
| 246 |
-
|
| 247 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 248 |
except Exception as e:
|
| 249 |
log.error(f"Error processing redirect: {e}")
|
| 250 |
break
|
|
@@ -262,6 +310,7 @@ class PlaywrightEngine:
|
|
| 262 |
:return: A `Response` object that is the same as `Adaptor` object except it has these added attributes: `status`, `reason`, `cookies`, `headers`, and `request_headers`
|
| 263 |
"""
|
| 264 |
from playwright.sync_api import Response as PlaywrightResponse
|
|
|
|
| 265 |
if not self.stealth or self.real_chrome:
|
| 266 |
# Because rebrowser_playwright doesn't play well with real browsers
|
| 267 |
from playwright.sync_api import sync_playwright
|
|
@@ -273,7 +322,10 @@ class PlaywrightEngine:
|
|
| 273 |
|
| 274 |
def handle_response(finished_response: PlaywrightResponse):
|
| 275 |
nonlocal final_response
|
| 276 |
-
if
|
|
|
|
|
|
|
|
|
|
| 277 |
final_response = finished_response
|
| 278 |
|
| 279 |
with sync_playwright() as p:
|
|
@@ -304,7 +356,7 @@ class PlaywrightEngine:
|
|
| 304 |
page.wait_for_load_state(state="domcontentloaded")
|
| 305 |
|
| 306 |
if self.network_idle:
|
| 307 |
-
page.wait_for_load_state(
|
| 308 |
|
| 309 |
if self.page_action is not None:
|
| 310 |
try:
|
|
@@ -320,7 +372,7 @@ class PlaywrightEngine:
|
|
| 320 |
page.wait_for_load_state(state="load")
|
| 321 |
page.wait_for_load_state(state="domcontentloaded")
|
| 322 |
if self.network_idle:
|
| 323 |
-
page.wait_for_load_state(
|
| 324 |
except Exception as e:
|
| 325 |
log.error(f"Error waiting for selector {self.wait_selector}: {e}")
|
| 326 |
|
|
@@ -331,9 +383,13 @@ class PlaywrightEngine:
|
|
| 331 |
raise ValueError("Failed to get a response from the page")
|
| 332 |
|
| 333 |
# This will be parsed inside `Response`
|
| 334 |
-
encoding =
|
|
|
|
|
|
|
| 335 |
# PlayWright API sometimes give empty status text for some reason!
|
| 336 |
-
status_text = final_response.status_text or StatusText.get(
|
|
|
|
|
|
|
| 337 |
|
| 338 |
history = self._process_response_history(first_response)
|
| 339 |
try:
|
|
@@ -345,15 +401,17 @@ class PlaywrightEngine:
|
|
| 345 |
response = Response(
|
| 346 |
url=page.url,
|
| 347 |
text=page_content,
|
| 348 |
-
body=page_content.encode(
|
| 349 |
status=final_response.status,
|
| 350 |
reason=status_text,
|
| 351 |
encoding=encoding,
|
| 352 |
-
cookies={
|
|
|
|
|
|
|
| 353 |
headers=first_response.all_headers(),
|
| 354 |
request_headers=first_response.request.all_headers(),
|
| 355 |
history=history,
|
| 356 |
-
**self.adaptor_arguments
|
| 357 |
)
|
| 358 |
page.close()
|
| 359 |
context.close()
|
|
@@ -366,6 +424,7 @@ class PlaywrightEngine:
|
|
| 366 |
:return: A `Response` object that is the same as `Adaptor` object except it has these added attributes: `status`, `reason`, `cookies`, `headers`, and `request_headers`
|
| 367 |
"""
|
| 368 |
from playwright.async_api import Response as PlaywrightResponse
|
|
|
|
| 369 |
if not self.stealth or self.real_chrome:
|
| 370 |
# Because rebrowser_playwright doesn't play well with real browsers
|
| 371 |
from playwright.async_api import async_playwright
|
|
@@ -377,7 +436,10 @@ class PlaywrightEngine:
|
|
| 377 |
|
| 378 |
async def handle_response(finished_response: PlaywrightResponse):
|
| 379 |
nonlocal final_response
|
| 380 |
-
if
|
|
|
|
|
|
|
|
|
|
| 381 |
final_response = finished_response
|
| 382 |
|
| 383 |
async with async_playwright() as p:
|
|
@@ -408,7 +470,7 @@ class PlaywrightEngine:
|
|
| 408 |
await page.wait_for_load_state(state="domcontentloaded")
|
| 409 |
|
| 410 |
if self.network_idle:
|
| 411 |
-
await page.wait_for_load_state(
|
| 412 |
|
| 413 |
if self.page_action is not None:
|
| 414 |
try:
|
|
@@ -424,7 +486,7 @@ class PlaywrightEngine:
|
|
| 424 |
await page.wait_for_load_state(state="load")
|
| 425 |
await page.wait_for_load_state(state="domcontentloaded")
|
| 426 |
if self.network_idle:
|
| 427 |
-
await page.wait_for_load_state(
|
| 428 |
except Exception as e:
|
| 429 |
log.error(f"Error waiting for selector {self.wait_selector}: {e}")
|
| 430 |
|
|
@@ -435,9 +497,13 @@ class PlaywrightEngine:
|
|
| 435 |
raise ValueError("Failed to get a response from the page")
|
| 436 |
|
| 437 |
# This will be parsed inside `Response`
|
| 438 |
-
encoding =
|
|
|
|
|
|
|
| 439 |
# PlayWright API sometimes give empty status text for some reason!
|
| 440 |
-
status_text = final_response.status_text or StatusText.get(
|
|
|
|
|
|
|
| 441 |
|
| 442 |
history = await self._async_process_response_history(first_response)
|
| 443 |
try:
|
|
@@ -449,15 +515,18 @@ class PlaywrightEngine:
|
|
| 449 |
response = Response(
|
| 450 |
url=page.url,
|
| 451 |
text=page_content,
|
| 452 |
-
body=page_content.encode(
|
| 453 |
status=final_response.status,
|
| 454 |
reason=status_text,
|
| 455 |
encoding=encoding,
|
| 456 |
-
cookies={
|
|
|
|
|
|
|
|
|
|
| 457 |
headers=await first_response.all_headers(),
|
| 458 |
request_headers=await first_response.request.all_headers(),
|
| 459 |
history=history,
|
| 460 |
-
**self.adaptor_arguments
|
| 461 |
)
|
| 462 |
await page.close()
|
| 463 |
await context.close()
|
|
|
|
| 1 |
import json
|
| 2 |
|
| 3 |
+
from scrapling.core._types import Callable, Dict, Optional, SelectorWaitStates, Union
|
|
|
|
| 4 |
from scrapling.core.utils import log, lru_cache
|
| 5 |
+
from scrapling.engines.constants import DEFAULT_STEALTH_FLAGS, NSTBROWSER_DEFAULT_QUERY
|
| 6 |
+
from scrapling.engines.toolbelt import (
|
| 7 |
+
Response,
|
| 8 |
+
StatusText,
|
| 9 |
+
async_intercept_route,
|
| 10 |
+
check_type_validity,
|
| 11 |
+
construct_cdp_url,
|
| 12 |
+
construct_proxy_dict,
|
| 13 |
+
generate_convincing_referer,
|
| 14 |
+
generate_headers,
|
| 15 |
+
intercept_route,
|
| 16 |
+
js_bypass_path,
|
| 17 |
+
)
|
| 18 |
|
| 19 |
|
| 20 |
class PlaywrightEngine:
|
| 21 |
def __init__(
|
| 22 |
+
self,
|
| 23 |
+
headless: Union[bool, str] = True,
|
| 24 |
+
disable_resources: bool = False,
|
| 25 |
+
useragent: Optional[str] = None,
|
| 26 |
+
network_idle: bool = False,
|
| 27 |
+
timeout: Optional[float] = 30000,
|
| 28 |
+
wait: Optional[int] = 0,
|
| 29 |
+
page_action: Callable = None,
|
| 30 |
+
wait_selector: Optional[str] = None,
|
| 31 |
+
locale: Optional[str] = "en-US",
|
| 32 |
+
wait_selector_state: SelectorWaitStates = "attached",
|
| 33 |
+
stealth: bool = False,
|
| 34 |
+
real_chrome: bool = False,
|
| 35 |
+
hide_canvas: bool = False,
|
| 36 |
+
disable_webgl: bool = False,
|
| 37 |
+
cdp_url: Optional[str] = None,
|
| 38 |
+
nstbrowser_mode: bool = False,
|
| 39 |
+
nstbrowser_config: Optional[Dict] = None,
|
| 40 |
+
google_search: bool = True,
|
| 41 |
+
extra_headers: Optional[Dict[str, str]] = None,
|
| 42 |
+
proxy: Optional[Union[str, Dict[str, str]]] = None,
|
| 43 |
+
adaptor_arguments: Dict = None,
|
| 44 |
):
|
| 45 |
"""An engine that utilizes PlayWright library, check the `PlayWrightFetcher` class for more documentation.
|
| 46 |
|
|
|
|
| 69 |
:param adaptor_arguments: The arguments that will be passed in the end while creating the final Adaptor's class.
|
| 70 |
"""
|
| 71 |
self.headless = headless
|
| 72 |
+
self.locale = check_type_validity(locale, [str], "en-US", param_name="locale")
|
| 73 |
self.disable_resources = disable_resources
|
| 74 |
self.network_idle = bool(network_idle)
|
| 75 |
self.stealth = bool(stealth)
|
|
|
|
| 99 |
self.adaptor_arguments = adaptor_arguments if adaptor_arguments else {}
|
| 100 |
self.harmful_default_args = [
|
| 101 |
# This will be ignored to avoid detection more and possibly avoid the popup crashing bug abuse: https://issues.chromium.org/issues/340836884
|
| 102 |
+
"--enable-automation",
|
| 103 |
+
"--disable-popup-blocking",
|
| 104 |
# '--disable-component-update',
|
| 105 |
# '--disable-default-apps',
|
| 106 |
# '--disable-extensions',
|
|
|
|
| 118 |
query = NSTBROWSER_DEFAULT_QUERY.copy()
|
| 119 |
if self.stealth:
|
| 120 |
flags = self.__set_flags()
|
| 121 |
+
query.update(
|
| 122 |
+
{
|
| 123 |
+
"args": dict(
|
| 124 |
+
zip(flags, [""] * len(flags))
|
| 125 |
+
), # browser args should be a dictionary
|
| 126 |
+
}
|
| 127 |
+
)
|
| 128 |
|
| 129 |
config = {
|
| 130 |
+
"config": json.dumps(query),
|
| 131 |
# 'token': ''
|
| 132 |
}
|
| 133 |
cdp_url = construct_cdp_url(cdp_url, config)
|
|
|
|
| 142 |
"""Returns the flags that will be used while launching the browser if stealth mode is enabled"""
|
| 143 |
flags = DEFAULT_STEALTH_FLAGS
|
| 144 |
if self.hide_canvas:
|
| 145 |
+
flags += ("--fingerprinting-canvas-image-data-noise",)
|
| 146 |
if self.disable_webgl:
|
| 147 |
+
flags += (
|
| 148 |
+
"--disable-webgl",
|
| 149 |
+
"--disable-webgl-image-chromium",
|
| 150 |
+
"--disable-webgl2",
|
| 151 |
+
)
|
| 152 |
|
| 153 |
return flags
|
| 154 |
|
| 155 |
def __launch_kwargs(self):
|
| 156 |
"""Creates the arguments we will use while launching playwright's browser"""
|
| 157 |
+
launch_kwargs = {
|
| 158 |
+
"headless": self.headless,
|
| 159 |
+
"ignore_default_args": self.harmful_default_args,
|
| 160 |
+
"channel": "chrome" if self.real_chrome else "chromium",
|
| 161 |
+
}
|
| 162 |
if self.stealth:
|
| 163 |
+
launch_kwargs.update({"args": self.__set_flags(), "chromium_sandbox": True})
|
| 164 |
|
| 165 |
return launch_kwargs
|
| 166 |
|
|
|
|
| 169 |
context_kwargs = {
|
| 170 |
"proxy": self.proxy,
|
| 171 |
"locale": self.locale,
|
| 172 |
+
"color_scheme": "dark", # Bypasses the 'prefersLightColor' check in creepjs
|
| 173 |
"device_scale_factor": 2,
|
| 174 |
"extra_http_headers": self.extra_headers if self.extra_headers else {},
|
| 175 |
+
"user_agent": self.useragent
|
| 176 |
+
if self.useragent
|
| 177 |
+
else generate_headers(browser_mode=True).get("User-Agent"),
|
| 178 |
}
|
| 179 |
if self.stealth:
|
| 180 |
+
context_kwargs.update(
|
| 181 |
+
{
|
| 182 |
+
"is_mobile": False,
|
| 183 |
+
"has_touch": False,
|
| 184 |
+
# I'm thinking about disabling it to rest from all Service Workers headache but let's keep it as it is for now
|
| 185 |
+
"service_workers": "allow",
|
| 186 |
+
"ignore_https_errors": True,
|
| 187 |
+
"screen": {"width": 1920, "height": 1080},
|
| 188 |
+
"viewport": {"width": 1920, "height": 1080},
|
| 189 |
+
"permissions": ["geolocation", "notifications"],
|
| 190 |
+
}
|
| 191 |
+
)
|
| 192 |
|
| 193 |
return context_kwargs
|
| 194 |
|
|
|
|
| 204 |
# https://arh.antoinevastel.com/bots/areyouheadless/
|
| 205 |
# https://prescience-data.github.io/execution-monitor.html
|
| 206 |
return tuple(
|
| 207 |
+
js_bypass_path(script)
|
| 208 |
+
for script in (
|
| 209 |
# Order is important
|
| 210 |
+
"webdriver_fully.js",
|
| 211 |
+
"window_chrome.js",
|
| 212 |
+
"navigator_plugins.js",
|
| 213 |
+
"pdf_viewer.js",
|
| 214 |
+
"notification_permission.js",
|
| 215 |
+
"screen_props.js",
|
| 216 |
+
"playwright_fingerprint.js",
|
| 217 |
)
|
| 218 |
)
|
| 219 |
|
|
|
|
| 226 |
while current_request:
|
| 227 |
try:
|
| 228 |
current_response = current_request.response()
|
| 229 |
+
history.insert(
|
| 230 |
+
0,
|
| 231 |
+
Response(
|
| 232 |
+
url=current_request.url,
|
| 233 |
+
# using current_response.text() will trigger "Error: Response.text: Response body is unavailable for redirect responses"
|
| 234 |
+
text="",
|
| 235 |
+
body=b"",
|
| 236 |
+
status=current_response.status if current_response else 301,
|
| 237 |
+
reason=(
|
| 238 |
+
current_response.status_text
|
| 239 |
+
or StatusText.get(current_response.status)
|
| 240 |
+
)
|
| 241 |
+
if current_response
|
| 242 |
+
else StatusText.get(301),
|
| 243 |
+
encoding=current_response.headers.get("content-type", "")
|
| 244 |
+
or "utf-8",
|
| 245 |
+
cookies={},
|
| 246 |
+
headers=current_response.all_headers()
|
| 247 |
+
if current_response
|
| 248 |
+
else {},
|
| 249 |
+
request_headers=current_request.all_headers(),
|
| 250 |
+
**self.adaptor_arguments,
|
| 251 |
+
),
|
| 252 |
+
)
|
| 253 |
except Exception as e:
|
| 254 |
log.error(f"Error processing redirect: {e}")
|
| 255 |
break
|
|
|
|
| 269 |
while current_request:
|
| 270 |
try:
|
| 271 |
current_response = await current_request.response()
|
| 272 |
+
history.insert(
|
| 273 |
+
0,
|
| 274 |
+
Response(
|
| 275 |
+
url=current_request.url,
|
| 276 |
+
# using current_response.text() will trigger "Error: Response.text: Response body is unavailable for redirect responses"
|
| 277 |
+
text="",
|
| 278 |
+
body=b"",
|
| 279 |
+
status=current_response.status if current_response else 301,
|
| 280 |
+
reason=(
|
| 281 |
+
current_response.status_text
|
| 282 |
+
or StatusText.get(current_response.status)
|
| 283 |
+
)
|
| 284 |
+
if current_response
|
| 285 |
+
else StatusText.get(301),
|
| 286 |
+
encoding=current_response.headers.get("content-type", "")
|
| 287 |
+
or "utf-8",
|
| 288 |
+
cookies={},
|
| 289 |
+
headers=await current_response.all_headers()
|
| 290 |
+
if current_response
|
| 291 |
+
else {},
|
| 292 |
+
request_headers=await current_request.all_headers(),
|
| 293 |
+
**self.adaptor_arguments,
|
| 294 |
+
),
|
| 295 |
+
)
|
| 296 |
except Exception as e:
|
| 297 |
log.error(f"Error processing redirect: {e}")
|
| 298 |
break
|
|
|
|
| 310 |
:return: A `Response` object that is the same as `Adaptor` object except it has these added attributes: `status`, `reason`, `cookies`, `headers`, and `request_headers`
|
| 311 |
"""
|
| 312 |
from playwright.sync_api import Response as PlaywrightResponse
|
| 313 |
+
|
| 314 |
if not self.stealth or self.real_chrome:
|
| 315 |
# Because rebrowser_playwright doesn't play well with real browsers
|
| 316 |
from playwright.sync_api import sync_playwright
|
|
|
|
| 322 |
|
| 323 |
def handle_response(finished_response: PlaywrightResponse):
|
| 324 |
nonlocal final_response
|
| 325 |
+
if (
|
| 326 |
+
finished_response.request.resource_type == "document"
|
| 327 |
+
and finished_response.request.is_navigation_request()
|
| 328 |
+
):
|
| 329 |
final_response = finished_response
|
| 330 |
|
| 331 |
with sync_playwright() as p:
|
|
|
|
| 356 |
page.wait_for_load_state(state="domcontentloaded")
|
| 357 |
|
| 358 |
if self.network_idle:
|
| 359 |
+
page.wait_for_load_state("networkidle")
|
| 360 |
|
| 361 |
if self.page_action is not None:
|
| 362 |
try:
|
|
|
|
| 372 |
page.wait_for_load_state(state="load")
|
| 373 |
page.wait_for_load_state(state="domcontentloaded")
|
| 374 |
if self.network_idle:
|
| 375 |
+
page.wait_for_load_state("networkidle")
|
| 376 |
except Exception as e:
|
| 377 |
log.error(f"Error waiting for selector {self.wait_selector}: {e}")
|
| 378 |
|
|
|
|
| 383 |
raise ValueError("Failed to get a response from the page")
|
| 384 |
|
| 385 |
# This will be parsed inside `Response`
|
| 386 |
+
encoding = (
|
| 387 |
+
final_response.headers.get("content-type", "") or "utf-8"
|
| 388 |
+
) # default encoding
|
| 389 |
# PlayWright API sometimes give empty status text for some reason!
|
| 390 |
+
status_text = final_response.status_text or StatusText.get(
|
| 391 |
+
final_response.status
|
| 392 |
+
)
|
| 393 |
|
| 394 |
history = self._process_response_history(first_response)
|
| 395 |
try:
|
|
|
|
| 401 |
response = Response(
|
| 402 |
url=page.url,
|
| 403 |
text=page_content,
|
| 404 |
+
body=page_content.encode("utf-8"),
|
| 405 |
status=final_response.status,
|
| 406 |
reason=status_text,
|
| 407 |
encoding=encoding,
|
| 408 |
+
cookies={
|
| 409 |
+
cookie["name"]: cookie["value"] for cookie in page.context.cookies()
|
| 410 |
+
},
|
| 411 |
headers=first_response.all_headers(),
|
| 412 |
request_headers=first_response.request.all_headers(),
|
| 413 |
history=history,
|
| 414 |
+
**self.adaptor_arguments,
|
| 415 |
)
|
| 416 |
page.close()
|
| 417 |
context.close()
|
|
|
|
| 424 |
:return: A `Response` object that is the same as `Adaptor` object except it has these added attributes: `status`, `reason`, `cookies`, `headers`, and `request_headers`
|
| 425 |
"""
|
| 426 |
from playwright.async_api import Response as PlaywrightResponse
|
| 427 |
+
|
| 428 |
if not self.stealth or self.real_chrome:
|
| 429 |
# Because rebrowser_playwright doesn't play well with real browsers
|
| 430 |
from playwright.async_api import async_playwright
|
|
|
|
| 436 |
|
| 437 |
async def handle_response(finished_response: PlaywrightResponse):
|
| 438 |
nonlocal final_response
|
| 439 |
+
if (
|
| 440 |
+
finished_response.request.resource_type == "document"
|
| 441 |
+
and finished_response.request.is_navigation_request()
|
| 442 |
+
):
|
| 443 |
final_response = finished_response
|
| 444 |
|
| 445 |
async with async_playwright() as p:
|
|
|
|
| 470 |
await page.wait_for_load_state(state="domcontentloaded")
|
| 471 |
|
| 472 |
if self.network_idle:
|
| 473 |
+
await page.wait_for_load_state("networkidle")
|
| 474 |
|
| 475 |
if self.page_action is not None:
|
| 476 |
try:
|
|
|
|
| 486 |
await page.wait_for_load_state(state="load")
|
| 487 |
await page.wait_for_load_state(state="domcontentloaded")
|
| 488 |
if self.network_idle:
|
| 489 |
+
await page.wait_for_load_state("networkidle")
|
| 490 |
except Exception as e:
|
| 491 |
log.error(f"Error waiting for selector {self.wait_selector}: {e}")
|
| 492 |
|
|
|
|
| 497 |
raise ValueError("Failed to get a response from the page")
|
| 498 |
|
| 499 |
# This will be parsed inside `Response`
|
| 500 |
+
encoding = (
|
| 501 |
+
final_response.headers.get("content-type", "") or "utf-8"
|
| 502 |
+
) # default encoding
|
| 503 |
# PlayWright API sometimes give empty status text for some reason!
|
| 504 |
+
status_text = final_response.status_text or StatusText.get(
|
| 505 |
+
final_response.status
|
| 506 |
+
)
|
| 507 |
|
| 508 |
history = await self._async_process_response_history(first_response)
|
| 509 |
try:
|
|
|
|
| 515 |
response = Response(
|
| 516 |
url=page.url,
|
| 517 |
text=page_content,
|
| 518 |
+
body=page_content.encode("utf-8"),
|
| 519 |
status=final_response.status,
|
| 520 |
reason=status_text,
|
| 521 |
encoding=encoding,
|
| 522 |
+
cookies={
|
| 523 |
+
cookie["name"]: cookie["value"]
|
| 524 |
+
for cookie in await page.context.cookies()
|
| 525 |
+
},
|
| 526 |
headers=await first_response.all_headers(),
|
| 527 |
request_headers=await first_response.request.all_headers(),
|
| 528 |
history=history,
|
| 529 |
+
**self.adaptor_arguments,
|
| 530 |
)
|
| 531 |
await page.close()
|
| 532 |
await context.close()
|
scrapling/engines/static.py
CHANGED
|
@@ -10,8 +10,14 @@ from .toolbelt import Response, generate_convincing_referer, generate_headers
|
|
| 10 |
@lru_cache(2, typed=True) # Singleton easily
|
| 11 |
class StaticEngine:
|
| 12 |
def __init__(
|
| 13 |
-
|
| 14 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 15 |
):
|
| 16 |
"""An engine that utilizes httpx library, check the `Fetcher` class for more documentation.
|
| 17 |
|
|
@@ -47,14 +53,22 @@ class StaticEngine:
|
|
| 47 |
if self.stealth:
|
| 48 |
extra_headers = generate_headers(browser_mode=False)
|
| 49 |
# Don't overwrite user supplied headers
|
| 50 |
-
extra_headers = {
|
|
|
|
|
|
|
|
|
|
|
|
|
| 51 |
headers.update(extra_headers)
|
| 52 |
-
if
|
| 53 |
-
headers.update({
|
| 54 |
|
| 55 |
-
elif
|
| 56 |
-
headers[
|
| 57 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
| 58 |
|
| 59 |
return headers
|
| 60 |
|
|
@@ -70,25 +84,43 @@ class StaticEngine:
|
|
| 70 |
body=response.content,
|
| 71 |
status=response.status_code,
|
| 72 |
reason=response.reason_phrase,
|
| 73 |
-
encoding=response.encoding or
|
| 74 |
cookies=dict(response.cookies),
|
| 75 |
headers=dict(response.headers),
|
| 76 |
request_headers=dict(response.request.headers),
|
| 77 |
method=response.request.method,
|
| 78 |
-
history=[
|
| 79 |
-
|
|
|
|
|
|
|
| 80 |
)
|
| 81 |
|
| 82 |
def _make_request(self, method: str, **kwargs) -> Response:
|
| 83 |
-
headers = self._headers_job(kwargs.pop(
|
| 84 |
-
with httpx.Client(
|
| 85 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 86 |
return self._prepare_response(request)
|
| 87 |
|
| 88 |
async def _async_make_request(self, method: str, **kwargs) -> Response:
|
| 89 |
-
headers = self._headers_job(kwargs.pop(
|
| 90 |
-
async with httpx.AsyncClient(
|
| 91 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 92 |
return self._prepare_response(request)
|
| 93 |
|
| 94 |
def get(self, **kwargs: Dict) -> Response:
|
|
@@ -97,7 +129,7 @@ class StaticEngine:
|
|
| 97 |
:param kwargs: Any keyword arguments are passed directly to `httpx.get()` function so check httpx documentation for details.
|
| 98 |
:return: A `Response` object that is the same as `Adaptor` object except it has these added attributes: `status`, `reason`, `cookies`, `headers`, and `request_headers`
|
| 99 |
"""
|
| 100 |
-
return self._make_request(
|
| 101 |
|
| 102 |
async def async_get(self, **kwargs: Dict) -> Response:
|
| 103 |
"""Make basic async HTTP GET request for you but with some added flavors.
|
|
@@ -105,7 +137,7 @@ class StaticEngine:
|
|
| 105 |
:param kwargs: Any keyword arguments are passed directly to `httpx.get()` function so check httpx documentation for details.
|
| 106 |
:return: A `Response` object that is the same as `Adaptor` object except it has these added attributes: `status`, `reason`, `cookies`, `headers`, and `request_headers`
|
| 107 |
"""
|
| 108 |
-
return await self._async_make_request(
|
| 109 |
|
| 110 |
def post(self, **kwargs: Dict) -> Response:
|
| 111 |
"""Make basic HTTP POST request for you but with some added flavors.
|
|
@@ -113,7 +145,7 @@ class StaticEngine:
|
|
| 113 |
:param kwargs: Any keyword arguments are passed directly to `httpx.post()` function so check httpx documentation for details.
|
| 114 |
:return: A `Response` object that is the same as `Adaptor` object except it has these added attributes: `status`, `reason`, `cookies`, `headers`, and `request_headers`
|
| 115 |
"""
|
| 116 |
-
return self._make_request(
|
| 117 |
|
| 118 |
async def async_post(self, **kwargs: Dict) -> Response:
|
| 119 |
"""Make basic async HTTP POST request for you but with some added flavors.
|
|
@@ -121,7 +153,7 @@ class StaticEngine:
|
|
| 121 |
:param kwargs: Any keyword arguments are passed directly to `httpx.post()` function so check httpx documentation for details.
|
| 122 |
:return: A `Response` object that is the same as `Adaptor` object except it has these added attributes: `status`, `reason`, `cookies`, `headers`, and `request_headers`
|
| 123 |
"""
|
| 124 |
-
return await self._async_make_request(
|
| 125 |
|
| 126 |
def delete(self, **kwargs: Dict) -> Response:
|
| 127 |
"""Make basic HTTP DELETE request for you but with some added flavors.
|
|
@@ -129,7 +161,7 @@ class StaticEngine:
|
|
| 129 |
:param kwargs: Any keyword arguments are passed directly to `httpx.delete()` function so check httpx documentation for details.
|
| 130 |
:return: A `Response` object that is the same as `Adaptor` object except it has these added attributes: `status`, `reason`, `cookies`, `headers`, and `request_headers`
|
| 131 |
"""
|
| 132 |
-
return self._make_request(
|
| 133 |
|
| 134 |
async def async_delete(self, **kwargs: Dict) -> Response:
|
| 135 |
"""Make basic async HTTP DELETE request for you but with some added flavors.
|
|
@@ -137,7 +169,7 @@ class StaticEngine:
|
|
| 137 |
:param kwargs: Any keyword arguments are passed directly to `httpx.delete()` function so check httpx documentation for details.
|
| 138 |
:return: A `Response` object that is the same as `Adaptor` object except it has these added attributes: `status`, `reason`, `cookies`, `headers`, and `request_headers`
|
| 139 |
"""
|
| 140 |
-
return await self._async_make_request(
|
| 141 |
|
| 142 |
def put(self, **kwargs: Dict) -> Response:
|
| 143 |
"""Make basic HTTP PUT request for you but with some added flavors.
|
|
@@ -145,7 +177,7 @@ class StaticEngine:
|
|
| 145 |
:param kwargs: Any keyword arguments are passed directly to `httpx.put()` function so check httpx documentation for details.
|
| 146 |
:return: A `Response` object that is the same as `Adaptor` object except it has these added attributes: `status`, `reason`, `cookies`, `headers`, and `request_headers`
|
| 147 |
"""
|
| 148 |
-
return self._make_request(
|
| 149 |
|
| 150 |
async def async_put(self, **kwargs: Dict) -> Response:
|
| 151 |
"""Make basic async HTTP PUT request for you but with some added flavors.
|
|
@@ -153,4 +185,4 @@ class StaticEngine:
|
|
| 153 |
:param kwargs: Any keyword arguments are passed directly to `httpx.put()` function so check httpx documentation for details.
|
| 154 |
:return: A `Response` object that is the same as `Adaptor` object except it has these added attributes: `status`, `reason`, `cookies`, `headers`, and `request_headers`
|
| 155 |
"""
|
| 156 |
-
return await self._async_make_request(
|
|
|
|
| 10 |
@lru_cache(2, typed=True) # Singleton easily
|
| 11 |
class StaticEngine:
|
| 12 |
def __init__(
|
| 13 |
+
self,
|
| 14 |
+
url: str,
|
| 15 |
+
proxy: Optional[str] = None,
|
| 16 |
+
stealthy_headers: bool = True,
|
| 17 |
+
follow_redirects: bool = True,
|
| 18 |
+
timeout: Optional[Union[int, float]] = None,
|
| 19 |
+
retries: Optional[int] = 3,
|
| 20 |
+
adaptor_arguments: Tuple = None,
|
| 21 |
):
|
| 22 |
"""An engine that utilizes httpx library, check the `Fetcher` class for more documentation.
|
| 23 |
|
|
|
|
| 53 |
if self.stealth:
|
| 54 |
extra_headers = generate_headers(browser_mode=False)
|
| 55 |
# Don't overwrite user supplied headers
|
| 56 |
+
extra_headers = {
|
| 57 |
+
key: value
|
| 58 |
+
for key, value in extra_headers.items()
|
| 59 |
+
if key.lower() not in headers_keys
|
| 60 |
+
}
|
| 61 |
headers.update(extra_headers)
|
| 62 |
+
if "referer" not in headers_keys:
|
| 63 |
+
headers.update({"referer": generate_convincing_referer(self.url)})
|
| 64 |
|
| 65 |
+
elif "user-agent" not in headers_keys:
|
| 66 |
+
headers["User-Agent"] = generate_headers(browser_mode=False).get(
|
| 67 |
+
"User-Agent"
|
| 68 |
+
)
|
| 69 |
+
log.debug(
|
| 70 |
+
f"Can't find useragent in headers so '{headers['User-Agent']}' was used."
|
| 71 |
+
)
|
| 72 |
|
| 73 |
return headers
|
| 74 |
|
|
|
|
| 84 |
body=response.content,
|
| 85 |
status=response.status_code,
|
| 86 |
reason=response.reason_phrase,
|
| 87 |
+
encoding=response.encoding or "utf-8",
|
| 88 |
cookies=dict(response.cookies),
|
| 89 |
headers=dict(response.headers),
|
| 90 |
request_headers=dict(response.request.headers),
|
| 91 |
method=response.request.method,
|
| 92 |
+
history=[
|
| 93 |
+
self._prepare_response(redirection) for redirection in response.history
|
| 94 |
+
],
|
| 95 |
+
**self.adaptor_arguments,
|
| 96 |
)
|
| 97 |
|
| 98 |
def _make_request(self, method: str, **kwargs) -> Response:
|
| 99 |
+
headers = self._headers_job(kwargs.pop("headers", {}))
|
| 100 |
+
with httpx.Client(
|
| 101 |
+
proxy=self.proxy, transport=httpx.HTTPTransport(retries=self.retries)
|
| 102 |
+
) as client:
|
| 103 |
+
request = getattr(client, method)(
|
| 104 |
+
url=self.url,
|
| 105 |
+
headers=headers,
|
| 106 |
+
follow_redirects=self.follow_redirects,
|
| 107 |
+
timeout=self.timeout,
|
| 108 |
+
**kwargs,
|
| 109 |
+
)
|
| 110 |
return self._prepare_response(request)
|
| 111 |
|
| 112 |
async def _async_make_request(self, method: str, **kwargs) -> Response:
|
| 113 |
+
headers = self._headers_job(kwargs.pop("headers", {}))
|
| 114 |
+
async with httpx.AsyncClient(
|
| 115 |
+
proxy=self.proxy, transport=httpx.AsyncHTTPTransport(retries=self.retries)
|
| 116 |
+
) as client:
|
| 117 |
+
request = await getattr(client, method)(
|
| 118 |
+
url=self.url,
|
| 119 |
+
headers=headers,
|
| 120 |
+
follow_redirects=self.follow_redirects,
|
| 121 |
+
timeout=self.timeout,
|
| 122 |
+
**kwargs,
|
| 123 |
+
)
|
| 124 |
return self._prepare_response(request)
|
| 125 |
|
| 126 |
def get(self, **kwargs: Dict) -> Response:
|
|
|
|
| 129 |
:param kwargs: Any keyword arguments are passed directly to `httpx.get()` function so check httpx documentation for details.
|
| 130 |
:return: A `Response` object that is the same as `Adaptor` object except it has these added attributes: `status`, `reason`, `cookies`, `headers`, and `request_headers`
|
| 131 |
"""
|
| 132 |
+
return self._make_request("get", **kwargs)
|
| 133 |
|
| 134 |
async def async_get(self, **kwargs: Dict) -> Response:
|
| 135 |
"""Make basic async HTTP GET request for you but with some added flavors.
|
|
|
|
| 137 |
:param kwargs: Any keyword arguments are passed directly to `httpx.get()` function so check httpx documentation for details.
|
| 138 |
:return: A `Response` object that is the same as `Adaptor` object except it has these added attributes: `status`, `reason`, `cookies`, `headers`, and `request_headers`
|
| 139 |
"""
|
| 140 |
+
return await self._async_make_request("get", **kwargs)
|
| 141 |
|
| 142 |
def post(self, **kwargs: Dict) -> Response:
|
| 143 |
"""Make basic HTTP POST request for you but with some added flavors.
|
|
|
|
| 145 |
:param kwargs: Any keyword arguments are passed directly to `httpx.post()` function so check httpx documentation for details.
|
| 146 |
:return: A `Response` object that is the same as `Adaptor` object except it has these added attributes: `status`, `reason`, `cookies`, `headers`, and `request_headers`
|
| 147 |
"""
|
| 148 |
+
return self._make_request("post", **kwargs)
|
| 149 |
|
| 150 |
async def async_post(self, **kwargs: Dict) -> Response:
|
| 151 |
"""Make basic async HTTP POST request for you but with some added flavors.
|
|
|
|
| 153 |
:param kwargs: Any keyword arguments are passed directly to `httpx.post()` function so check httpx documentation for details.
|
| 154 |
:return: A `Response` object that is the same as `Adaptor` object except it has these added attributes: `status`, `reason`, `cookies`, `headers`, and `request_headers`
|
| 155 |
"""
|
| 156 |
+
return await self._async_make_request("post", **kwargs)
|
| 157 |
|
| 158 |
def delete(self, **kwargs: Dict) -> Response:
|
| 159 |
"""Make basic HTTP DELETE request for you but with some added flavors.
|
|
|
|
| 161 |
:param kwargs: Any keyword arguments are passed directly to `httpx.delete()` function so check httpx documentation for details.
|
| 162 |
:return: A `Response` object that is the same as `Adaptor` object except it has these added attributes: `status`, `reason`, `cookies`, `headers`, and `request_headers`
|
| 163 |
"""
|
| 164 |
+
return self._make_request("delete", **kwargs)
|
| 165 |
|
| 166 |
async def async_delete(self, **kwargs: Dict) -> Response:
|
| 167 |
"""Make basic async HTTP DELETE request for you but with some added flavors.
|
|
|
|
| 169 |
:param kwargs: Any keyword arguments are passed directly to `httpx.delete()` function so check httpx documentation for details.
|
| 170 |
:return: A `Response` object that is the same as `Adaptor` object except it has these added attributes: `status`, `reason`, `cookies`, `headers`, and `request_headers`
|
| 171 |
"""
|
| 172 |
+
return await self._async_make_request("delete", **kwargs)
|
| 173 |
|
| 174 |
def put(self, **kwargs: Dict) -> Response:
|
| 175 |
"""Make basic HTTP PUT request for you but with some added flavors.
|
|
|
|
| 177 |
:param kwargs: Any keyword arguments are passed directly to `httpx.put()` function so check httpx documentation for details.
|
| 178 |
:return: A `Response` object that is the same as `Adaptor` object except it has these added attributes: `status`, `reason`, `cookies`, `headers`, and `request_headers`
|
| 179 |
"""
|
| 180 |
+
return self._make_request("put", **kwargs)
|
| 181 |
|
| 182 |
async def async_put(self, **kwargs: Dict) -> Response:
|
| 183 |
"""Make basic async HTTP PUT request for you but with some added flavors.
|
|
|
|
| 185 |
:param kwargs: Any keyword arguments are passed directly to `httpx.put()` function so check httpx documentation for details.
|
| 186 |
:return: A `Response` object that is the same as `Adaptor` object except it has these added attributes: `status`, `reason`, `cookies`, `headers`, and `request_headers`
|
| 187 |
"""
|
| 188 |
+
return await self._async_make_request("put", **kwargs)
|
scrapling/engines/toolbelt/__init__.py
CHANGED
|
@@ -1,6 +1,16 @@
|
|
| 1 |
-
from .custom import (
|
| 2 |
-
|
| 3 |
-
|
| 4 |
-
|
| 5 |
-
|
| 6 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
from .custom import (
|
| 2 |
+
BaseFetcher,
|
| 3 |
+
Response,
|
| 4 |
+
StatusText,
|
| 5 |
+
check_if_engine_usable,
|
| 6 |
+
check_type_validity,
|
| 7 |
+
get_variable_name,
|
| 8 |
+
)
|
| 9 |
+
from .fingerprints import generate_convincing_referer, generate_headers, get_os_name
|
| 10 |
+
from .navigation import (
|
| 11 |
+
async_intercept_route,
|
| 12 |
+
construct_cdp_url,
|
| 13 |
+
construct_proxy_dict,
|
| 14 |
+
intercept_route,
|
| 15 |
+
js_bypass_path,
|
| 16 |
+
)
|
scrapling/engines/toolbelt/custom.py
CHANGED
|
@@ -1,11 +1,20 @@
|
|
| 1 |
"""
|
| 2 |
Functions related to custom types or type checking
|
| 3 |
"""
|
|
|
|
| 4 |
import inspect
|
| 5 |
from email.message import Message
|
| 6 |
|
| 7 |
-
from scrapling.core._types import (
|
| 8 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 9 |
from scrapling.core.custom_types import MappingProxyType
|
| 10 |
from scrapling.core.utils import log, lru_cache
|
| 11 |
from scrapling.parser import Adaptor, SQLiteStorageSystem
|
|
@@ -13,7 +22,12 @@ from scrapling.parser import Adaptor, SQLiteStorageSystem
|
|
| 13 |
|
| 14 |
class ResponseEncoding:
|
| 15 |
__DEFAULT_ENCODING = "utf-8"
|
| 16 |
-
__ISO_8859_1_CONTENT_TYPES = {
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 17 |
|
| 18 |
@classmethod
|
| 19 |
@lru_cache(maxsize=128)
|
|
@@ -27,19 +41,21 @@ class ResponseEncoding:
|
|
| 27 |
"""
|
| 28 |
# Create a Message object and set the Content-Type header then get the content type and parameters
|
| 29 |
msg = Message()
|
| 30 |
-
msg[
|
| 31 |
|
| 32 |
content_type = msg.get_content_type()
|
| 33 |
params = dict(msg.get_params(failobj=[]))
|
| 34 |
|
| 35 |
# Remove the content-type from params if present somehow
|
| 36 |
-
params.pop(
|
| 37 |
|
| 38 |
return content_type, params
|
| 39 |
|
| 40 |
@classmethod
|
| 41 |
@lru_cache(maxsize=128)
|
| 42 |
-
def get_value(
|
|
|
|
|
|
|
| 43 |
"""Determine the appropriate character encoding from a content-type header.
|
| 44 |
|
| 45 |
The encoding is determined by these rules in order:
|
|
@@ -72,7 +88,9 @@ class ResponseEncoding:
|
|
| 72 |
encoding = cls.__DEFAULT_ENCODING
|
| 73 |
|
| 74 |
if encoding:
|
| 75 |
-
_ = text.encode(
|
|
|
|
|
|
|
| 76 |
return encoding
|
| 77 |
|
| 78 |
return cls.__DEFAULT_ENCODING
|
|
@@ -84,9 +102,22 @@ class ResponseEncoding:
|
|
| 84 |
class Response(Adaptor):
|
| 85 |
"""This class is returned by all engines as a way to unify response type between different libraries."""
|
| 86 |
|
| 87 |
-
def __init__(
|
| 88 |
-
|
| 89 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 90 |
self.status = status
|
| 91 |
self.reason = reason
|
| 92 |
self.cookies = cookies
|
|
@@ -94,11 +125,19 @@ class Response(Adaptor):
|
|
| 94 |
self.request_headers = request_headers
|
| 95 |
self.history = history or []
|
| 96 |
encoding = ResponseEncoding.get_value(encoding, text)
|
| 97 |
-
super().__init__(
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 98 |
# For back-ward compatibility
|
| 99 |
self.adaptor = self
|
| 100 |
# For easier debugging while working from a Python shell
|
| 101 |
-
log.info(
|
|
|
|
|
|
|
| 102 |
|
| 103 |
# def __repr__(self):
|
| 104 |
# return f'<{self.__class__.__name__} [{self.status} {self.reason}]>'
|
|
@@ -113,16 +152,26 @@ class BaseFetcher:
|
|
| 113 |
storage_args: Optional[Dict] = None
|
| 114 |
keep_comments: Optional[bool] = False
|
| 115 |
automatch_domain: Optional[str] = None
|
| 116 |
-
parser_keywords: Tuple = (
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 117 |
|
| 118 |
def __init__(self, *args, **kwargs):
|
| 119 |
# For backward-compatibility before 0.2.99
|
| 120 |
-
args_str = ", ".join(args) or
|
| 121 |
-
kwargs_str = ", ".join(f
|
| 122 |
if args_str:
|
| 123 |
-
args_str +=
|
| 124 |
|
| 125 |
-
log.warning(
|
|
|
|
|
|
|
| 126 |
pass
|
| 127 |
|
| 128 |
@classmethod
|
|
@@ -150,12 +199,18 @@ class BaseFetcher:
|
|
| 150 |
setattr(cls, key, value)
|
| 151 |
else:
|
| 152 |
# Yup, no fun allowed LOL
|
| 153 |
-
raise AttributeError(
|
|
|
|
|
|
|
| 154 |
else:
|
| 155 |
-
raise ValueError(
|
|
|
|
|
|
|
| 156 |
|
| 157 |
if not kwargs:
|
| 158 |
-
raise AttributeError(
|
|
|
|
|
|
|
| 159 |
|
| 160 |
@classmethod
|
| 161 |
def _generate_parser_arguments(cls) -> Dict:
|
|
@@ -167,13 +222,15 @@ class BaseFetcher:
|
|
| 167 |
keep_cdata=cls.keep_cdata,
|
| 168 |
auto_match=cls.auto_match,
|
| 169 |
storage=cls.storage,
|
| 170 |
-
storage_args=cls.storage_args
|
| 171 |
)
|
| 172 |
if cls.automatch_domain:
|
| 173 |
if type(cls.automatch_domain) is not str:
|
| 174 |
-
log.warning(
|
|
|
|
|
|
|
| 175 |
else:
|
| 176 |
-
parser_arguments.update({
|
| 177 |
|
| 178 |
return parser_arguments
|
| 179 |
|
|
@@ -181,72 +238,75 @@ class BaseFetcher:
|
|
| 181 |
class StatusText:
|
| 182 |
"""A class that gets the status text of response status code.
|
| 183 |
|
| 184 |
-
|
| 185 |
"""
|
| 186 |
-
|
| 187 |
-
|
| 188 |
-
|
| 189 |
-
|
| 190 |
-
|
| 191 |
-
|
| 192 |
-
|
| 193 |
-
|
| 194 |
-
|
| 195 |
-
|
| 196 |
-
|
| 197 |
-
|
| 198 |
-
|
| 199 |
-
|
| 200 |
-
|
| 201 |
-
|
| 202 |
-
|
| 203 |
-
|
| 204 |
-
|
| 205 |
-
|
| 206 |
-
|
| 207 |
-
|
| 208 |
-
|
| 209 |
-
|
| 210 |
-
|
| 211 |
-
|
| 212 |
-
|
| 213 |
-
|
| 214 |
-
|
| 215 |
-
|
| 216 |
-
|
| 217 |
-
|
| 218 |
-
|
| 219 |
-
|
| 220 |
-
|
| 221 |
-
|
| 222 |
-
|
| 223 |
-
|
| 224 |
-
|
| 225 |
-
|
| 226 |
-
|
| 227 |
-
|
| 228 |
-
|
| 229 |
-
|
| 230 |
-
|
| 231 |
-
|
| 232 |
-
|
| 233 |
-
|
| 234 |
-
|
| 235 |
-
|
| 236 |
-
|
| 237 |
-
|
| 238 |
-
|
| 239 |
-
|
| 240 |
-
|
| 241 |
-
|
| 242 |
-
|
| 243 |
-
|
| 244 |
-
|
| 245 |
-
|
| 246 |
-
|
| 247 |
-
|
| 248 |
-
|
| 249 |
-
|
|
|
|
|
|
|
|
|
|
| 250 |
|
| 251 |
@classmethod
|
| 252 |
@lru_cache(maxsize=128)
|
|
@@ -265,20 +325,26 @@ def check_if_engine_usable(engine: Callable) -> Union[Callable, None]:
|
|
| 265 |
# if isinstance(engine, type):
|
| 266 |
# raise TypeError("Expected an engine instance, not a class definition of the engine")
|
| 267 |
|
| 268 |
-
if hasattr(engine,
|
| 269 |
fetch_function = getattr(engine, "fetch")
|
| 270 |
if callable(fetch_function):
|
| 271 |
if len(inspect.signature(fetch_function).parameters) > 0:
|
| 272 |
return engine
|
| 273 |
else:
|
| 274 |
# raise TypeError("Engine class instance must have a callable method 'fetch' with the first argument used for the url.")
|
| 275 |
-
raise TypeError(
|
|
|
|
|
|
|
| 276 |
else:
|
| 277 |
# raise TypeError("Invalid engine instance! Engine class must have a callable method 'fetch'")
|
| 278 |
-
raise TypeError(
|
|
|
|
|
|
|
| 279 |
else:
|
| 280 |
# raise TypeError("Invalid engine instance! Engine class must have the method 'fetch'")
|
| 281 |
-
raise TypeError(
|
|
|
|
|
|
|
| 282 |
|
| 283 |
|
| 284 |
def get_variable_name(var: Any) -> Optional[str]:
|
|
@@ -293,7 +359,13 @@ def get_variable_name(var: Any) -> Optional[str]:
|
|
| 293 |
return None
|
| 294 |
|
| 295 |
|
| 296 |
-
def check_type_validity(
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 297 |
"""Check if a variable matches the specified type constraints.
|
| 298 |
:param variable: The variable to check
|
| 299 |
:param valid_types: List of valid types for the variable
|
|
@@ -316,7 +388,7 @@ def check_type_validity(variable: Any, valid_types: Union[List[Type], None], def
|
|
| 316 |
error_msg = f'Argument "{var_name}" cannot be None'
|
| 317 |
if critical:
|
| 318 |
raise TypeError(error_msg)
|
| 319 |
-
log.error(f
|
| 320 |
return default_value
|
| 321 |
|
| 322 |
# If no valid_types specified and variable has a value, return it
|
|
@@ -329,7 +401,7 @@ def check_type_validity(variable: Any, valid_types: Union[List[Type], None], def
|
|
| 329 |
error_msg = f'Argument "{var_name}" must be of type {" or ".join(type_names)}'
|
| 330 |
if critical:
|
| 331 |
raise TypeError(error_msg)
|
| 332 |
-
log.error(f
|
| 333 |
return default_value
|
| 334 |
|
| 335 |
return variable
|
|
|
|
| 1 |
"""
|
| 2 |
Functions related to custom types or type checking
|
| 3 |
"""
|
| 4 |
+
|
| 5 |
import inspect
|
| 6 |
from email.message import Message
|
| 7 |
|
| 8 |
+
from scrapling.core._types import (
|
| 9 |
+
Any,
|
| 10 |
+
Callable,
|
| 11 |
+
Dict,
|
| 12 |
+
List,
|
| 13 |
+
Optional,
|
| 14 |
+
Tuple,
|
| 15 |
+
Type,
|
| 16 |
+
Union,
|
| 17 |
+
)
|
| 18 |
from scrapling.core.custom_types import MappingProxyType
|
| 19 |
from scrapling.core.utils import log, lru_cache
|
| 20 |
from scrapling.parser import Adaptor, SQLiteStorageSystem
|
|
|
|
| 22 |
|
| 23 |
class ResponseEncoding:
|
| 24 |
__DEFAULT_ENCODING = "utf-8"
|
| 25 |
+
__ISO_8859_1_CONTENT_TYPES = {
|
| 26 |
+
"text/plain",
|
| 27 |
+
"text/html",
|
| 28 |
+
"text/css",
|
| 29 |
+
"text/javascript",
|
| 30 |
+
}
|
| 31 |
|
| 32 |
@classmethod
|
| 33 |
@lru_cache(maxsize=128)
|
|
|
|
| 41 |
"""
|
| 42 |
# Create a Message object and set the Content-Type header then get the content type and parameters
|
| 43 |
msg = Message()
|
| 44 |
+
msg["content-type"] = header_value
|
| 45 |
|
| 46 |
content_type = msg.get_content_type()
|
| 47 |
params = dict(msg.get_params(failobj=[]))
|
| 48 |
|
| 49 |
# Remove the content-type from params if present somehow
|
| 50 |
+
params.pop("content-type", None)
|
| 51 |
|
| 52 |
return content_type, params
|
| 53 |
|
| 54 |
@classmethod
|
| 55 |
@lru_cache(maxsize=128)
|
| 56 |
+
def get_value(
|
| 57 |
+
cls, content_type: Optional[str], text: Optional[str] = "test"
|
| 58 |
+
) -> str:
|
| 59 |
"""Determine the appropriate character encoding from a content-type header.
|
| 60 |
|
| 61 |
The encoding is determined by these rules in order:
|
|
|
|
| 88 |
encoding = cls.__DEFAULT_ENCODING
|
| 89 |
|
| 90 |
if encoding:
|
| 91 |
+
_ = text.encode(
|
| 92 |
+
encoding
|
| 93 |
+
) # Validate encoding and validate it can encode the given text
|
| 94 |
return encoding
|
| 95 |
|
| 96 |
return cls.__DEFAULT_ENCODING
|
|
|
|
| 102 |
class Response(Adaptor):
|
| 103 |
"""This class is returned by all engines as a way to unify response type between different libraries."""
|
| 104 |
|
| 105 |
+
def __init__(
|
| 106 |
+
self,
|
| 107 |
+
url: str,
|
| 108 |
+
text: str,
|
| 109 |
+
body: bytes,
|
| 110 |
+
status: int,
|
| 111 |
+
reason: str,
|
| 112 |
+
cookies: Dict,
|
| 113 |
+
headers: Dict,
|
| 114 |
+
request_headers: Dict,
|
| 115 |
+
encoding: str = "utf-8",
|
| 116 |
+
method: str = "GET",
|
| 117 |
+
history: List = None,
|
| 118 |
+
**adaptor_arguments: Dict,
|
| 119 |
+
):
|
| 120 |
+
automatch_domain = adaptor_arguments.pop("automatch_domain", None)
|
| 121 |
self.status = status
|
| 122 |
self.reason = reason
|
| 123 |
self.cookies = cookies
|
|
|
|
| 125 |
self.request_headers = request_headers
|
| 126 |
self.history = history or []
|
| 127 |
encoding = ResponseEncoding.get_value(encoding, text)
|
| 128 |
+
super().__init__(
|
| 129 |
+
text=text,
|
| 130 |
+
body=body,
|
| 131 |
+
url=automatch_domain or url,
|
| 132 |
+
encoding=encoding,
|
| 133 |
+
**adaptor_arguments,
|
| 134 |
+
)
|
| 135 |
# For back-ward compatibility
|
| 136 |
self.adaptor = self
|
| 137 |
# For easier debugging while working from a Python shell
|
| 138 |
+
log.info(
|
| 139 |
+
f"Fetched ({status}) <{method} {url}> (referer: {request_headers.get('referer')})"
|
| 140 |
+
)
|
| 141 |
|
| 142 |
# def __repr__(self):
|
| 143 |
# return f'<{self.__class__.__name__} [{self.status} {self.reason}]>'
|
|
|
|
| 152 |
storage_args: Optional[Dict] = None
|
| 153 |
keep_comments: Optional[bool] = False
|
| 154 |
automatch_domain: Optional[str] = None
|
| 155 |
+
parser_keywords: Tuple = (
|
| 156 |
+
"huge_tree",
|
| 157 |
+
"auto_match",
|
| 158 |
+
"storage",
|
| 159 |
+
"keep_cdata",
|
| 160 |
+
"storage_args",
|
| 161 |
+
"keep_comments",
|
| 162 |
+
"automatch_domain",
|
| 163 |
+
) # Left open for the user
|
| 164 |
|
| 165 |
def __init__(self, *args, **kwargs):
|
| 166 |
# For backward-compatibility before 0.2.99
|
| 167 |
+
args_str = ", ".join(args) or ""
|
| 168 |
+
kwargs_str = ", ".join(f"{k}={v}" for k, v in kwargs.items()) or ""
|
| 169 |
if args_str:
|
| 170 |
+
args_str += ", "
|
| 171 |
|
| 172 |
+
log.warning(
|
| 173 |
+
f"This logic is deprecated now, and have no effect; It will be removed with v0.3. Use `{self.__class__.__name__}.configure({args_str}{kwargs_str})` instead before fetching"
|
| 174 |
+
)
|
| 175 |
pass
|
| 176 |
|
| 177 |
@classmethod
|
|
|
|
| 199 |
setattr(cls, key, value)
|
| 200 |
else:
|
| 201 |
# Yup, no fun allowed LOL
|
| 202 |
+
raise AttributeError(
|
| 203 |
+
f'Unknown parser argument: "{key}"; maybe you meant {cls.parser_keywords}?'
|
| 204 |
+
)
|
| 205 |
else:
|
| 206 |
+
raise ValueError(
|
| 207 |
+
f'Unknown parser argument: "{key}"; maybe you meant {cls.parser_keywords}?'
|
| 208 |
+
)
|
| 209 |
|
| 210 |
if not kwargs:
|
| 211 |
+
raise AttributeError(
|
| 212 |
+
f"You must pass a keyword to configure, current keywords: {cls.parser_keywords}?"
|
| 213 |
+
)
|
| 214 |
|
| 215 |
@classmethod
|
| 216 |
def _generate_parser_arguments(cls) -> Dict:
|
|
|
|
| 222 |
keep_cdata=cls.keep_cdata,
|
| 223 |
auto_match=cls.auto_match,
|
| 224 |
storage=cls.storage,
|
| 225 |
+
storage_args=cls.storage_args,
|
| 226 |
)
|
| 227 |
if cls.automatch_domain:
|
| 228 |
if type(cls.automatch_domain) is not str:
|
| 229 |
+
log.warning(
|
| 230 |
+
'[Ignored] The argument "automatch_domain" must be of string type'
|
| 231 |
+
)
|
| 232 |
else:
|
| 233 |
+
parser_arguments.update({"automatch_domain": cls.automatch_domain})
|
| 234 |
|
| 235 |
return parser_arguments
|
| 236 |
|
|
|
|
| 238 |
class StatusText:
|
| 239 |
"""A class that gets the status text of response status code.
|
| 240 |
|
| 241 |
+
Reference: https://developer.mozilla.org/en-US/docs/Web/HTTP/Status
|
| 242 |
"""
|
| 243 |
+
|
| 244 |
+
_phrases = MappingProxyType(
|
| 245 |
+
{
|
| 246 |
+
100: "Continue",
|
| 247 |
+
101: "Switching Protocols",
|
| 248 |
+
102: "Processing",
|
| 249 |
+
103: "Early Hints",
|
| 250 |
+
200: "OK",
|
| 251 |
+
201: "Created",
|
| 252 |
+
202: "Accepted",
|
| 253 |
+
203: "Non-Authoritative Information",
|
| 254 |
+
204: "No Content",
|
| 255 |
+
205: "Reset Content",
|
| 256 |
+
206: "Partial Content",
|
| 257 |
+
207: "Multi-Status",
|
| 258 |
+
208: "Already Reported",
|
| 259 |
+
226: "IM Used",
|
| 260 |
+
300: "Multiple Choices",
|
| 261 |
+
301: "Moved Permanently",
|
| 262 |
+
302: "Found",
|
| 263 |
+
303: "See Other",
|
| 264 |
+
304: "Not Modified",
|
| 265 |
+
305: "Use Proxy",
|
| 266 |
+
307: "Temporary Redirect",
|
| 267 |
+
308: "Permanent Redirect",
|
| 268 |
+
400: "Bad Request",
|
| 269 |
+
401: "Unauthorized",
|
| 270 |
+
402: "Payment Required",
|
| 271 |
+
403: "Forbidden",
|
| 272 |
+
404: "Not Found",
|
| 273 |
+
405: "Method Not Allowed",
|
| 274 |
+
406: "Not Acceptable",
|
| 275 |
+
407: "Proxy Authentication Required",
|
| 276 |
+
408: "Request Timeout",
|
| 277 |
+
409: "Conflict",
|
| 278 |
+
410: "Gone",
|
| 279 |
+
411: "Length Required",
|
| 280 |
+
412: "Precondition Failed",
|
| 281 |
+
413: "Payload Too Large",
|
| 282 |
+
414: "URI Too Long",
|
| 283 |
+
415: "Unsupported Media Type",
|
| 284 |
+
416: "Range Not Satisfiable",
|
| 285 |
+
417: "Expectation Failed",
|
| 286 |
+
418: "I'm a teapot",
|
| 287 |
+
421: "Misdirected Request",
|
| 288 |
+
422: "Unprocessable Entity",
|
| 289 |
+
423: "Locked",
|
| 290 |
+
424: "Failed Dependency",
|
| 291 |
+
425: "Too Early",
|
| 292 |
+
426: "Upgrade Required",
|
| 293 |
+
428: "Precondition Required",
|
| 294 |
+
429: "Too Many Requests",
|
| 295 |
+
431: "Request Header Fields Too Large",
|
| 296 |
+
451: "Unavailable For Legal Reasons",
|
| 297 |
+
500: "Internal Server Error",
|
| 298 |
+
501: "Not Implemented",
|
| 299 |
+
502: "Bad Gateway",
|
| 300 |
+
503: "Service Unavailable",
|
| 301 |
+
504: "Gateway Timeout",
|
| 302 |
+
505: "HTTP Version Not Supported",
|
| 303 |
+
506: "Variant Also Negotiates",
|
| 304 |
+
507: "Insufficient Storage",
|
| 305 |
+
508: "Loop Detected",
|
| 306 |
+
510: "Not Extended",
|
| 307 |
+
511: "Network Authentication Required",
|
| 308 |
+
}
|
| 309 |
+
)
|
| 310 |
|
| 311 |
@classmethod
|
| 312 |
@lru_cache(maxsize=128)
|
|
|
|
| 325 |
# if isinstance(engine, type):
|
| 326 |
# raise TypeError("Expected an engine instance, not a class definition of the engine")
|
| 327 |
|
| 328 |
+
if hasattr(engine, "fetch"):
|
| 329 |
fetch_function = getattr(engine, "fetch")
|
| 330 |
if callable(fetch_function):
|
| 331 |
if len(inspect.signature(fetch_function).parameters) > 0:
|
| 332 |
return engine
|
| 333 |
else:
|
| 334 |
# raise TypeError("Engine class instance must have a callable method 'fetch' with the first argument used for the url.")
|
| 335 |
+
raise TypeError(
|
| 336 |
+
"Engine class must have a callable method 'fetch' with the first argument used for the url."
|
| 337 |
+
)
|
| 338 |
else:
|
| 339 |
# raise TypeError("Invalid engine instance! Engine class must have a callable method 'fetch'")
|
| 340 |
+
raise TypeError(
|
| 341 |
+
"Invalid engine class! Engine class must have a callable method 'fetch'"
|
| 342 |
+
)
|
| 343 |
else:
|
| 344 |
# raise TypeError("Invalid engine instance! Engine class must have the method 'fetch'")
|
| 345 |
+
raise TypeError(
|
| 346 |
+
"Invalid engine class! Engine class must have the method 'fetch'"
|
| 347 |
+
)
|
| 348 |
|
| 349 |
|
| 350 |
def get_variable_name(var: Any) -> Optional[str]:
|
|
|
|
| 359 |
return None
|
| 360 |
|
| 361 |
|
| 362 |
+
def check_type_validity(
|
| 363 |
+
variable: Any,
|
| 364 |
+
valid_types: Union[List[Type], None],
|
| 365 |
+
default_value: Any = None,
|
| 366 |
+
critical: bool = False,
|
| 367 |
+
param_name: Optional[str] = None,
|
| 368 |
+
) -> Any:
|
| 369 |
"""Check if a variable matches the specified type constraints.
|
| 370 |
:param variable: The variable to check
|
| 371 |
:param valid_types: List of valid types for the variable
|
|
|
|
| 388 |
error_msg = f'Argument "{var_name}" cannot be None'
|
| 389 |
if critical:
|
| 390 |
raise TypeError(error_msg)
|
| 391 |
+
log.error(f"[Ignored] {error_msg}")
|
| 392 |
return default_value
|
| 393 |
|
| 394 |
# If no valid_types specified and variable has a value, return it
|
|
|
|
| 401 |
error_msg = f'Argument "{var_name}" must be of type {" or ".join(type_names)}'
|
| 402 |
if critical:
|
| 403 |
raise TypeError(error_msg)
|
| 404 |
+
log.error(f"[Ignored] {error_msg}")
|
| 405 |
return default_value
|
| 406 |
|
| 407 |
return variable
|
scrapling/engines/toolbelt/fingerprints.py
CHANGED
|
@@ -23,7 +23,7 @@ def generate_convincing_referer(url: str) -> str:
|
|
| 23 |
:return: Google's search URL of the domain name
|
| 24 |
"""
|
| 25 |
website_name = extract(url).domain
|
| 26 |
-
return f
|
| 27 |
|
| 28 |
|
| 29 |
@lru_cache(1, typed=True)
|
|
@@ -35,11 +35,11 @@ def get_os_name() -> Union[str, None]:
|
|
| 35 |
#
|
| 36 |
os_name = platform.system()
|
| 37 |
return {
|
| 38 |
-
|
| 39 |
-
|
| 40 |
-
|
| 41 |
# For the future? because why not
|
| 42 |
-
|
| 43 |
}.get(os_name)
|
| 44 |
|
| 45 |
|
|
@@ -50,9 +50,9 @@ def generate_suitable_fingerprint() -> Fingerprint:
|
|
| 50 |
:return: `Fingerprint` object
|
| 51 |
"""
|
| 52 |
return FingerprintGenerator(
|
| 53 |
-
browser=[Browser(name=
|
| 54 |
os=get_os_name(), # None is ignored
|
| 55 |
-
device=
|
| 56 |
).generate()
|
| 57 |
|
| 58 |
|
|
@@ -67,15 +67,15 @@ def generate_headers(browser_mode: bool = False) -> Dict:
|
|
| 67 |
# So we don't raise any inconsistency red flags while websites fingerprinting us
|
| 68 |
os_name = get_os_name()
|
| 69 |
return HeaderGenerator(
|
| 70 |
-
browser=[Browser(name=
|
| 71 |
os=os_name, # None is ignored
|
| 72 |
-
device=
|
| 73 |
).generate()
|
| 74 |
else:
|
| 75 |
# Here it's used for normal requests that aren't done through browsers so we can take it lightly
|
| 76 |
browsers = [
|
| 77 |
-
Browser(name=
|
| 78 |
-
Browser(name=
|
| 79 |
-
Browser(name=
|
| 80 |
]
|
| 81 |
-
return HeaderGenerator(browser=browsers, device=
|
|
|
|
| 23 |
:return: Google's search URL of the domain name
|
| 24 |
"""
|
| 25 |
website_name = extract(url).domain
|
| 26 |
+
return f"https://www.google.com/search?q={website_name}"
|
| 27 |
|
| 28 |
|
| 29 |
@lru_cache(1, typed=True)
|
|
|
|
| 35 |
#
|
| 36 |
os_name = platform.system()
|
| 37 |
return {
|
| 38 |
+
"Linux": "linux",
|
| 39 |
+
"Darwin": "macos",
|
| 40 |
+
"Windows": "windows",
|
| 41 |
# For the future? because why not
|
| 42 |
+
"iOS": "ios",
|
| 43 |
}.get(os_name)
|
| 44 |
|
| 45 |
|
|
|
|
| 50 |
:return: `Fingerprint` object
|
| 51 |
"""
|
| 52 |
return FingerprintGenerator(
|
| 53 |
+
browser=[Browser(name="chrome", min_version=128)],
|
| 54 |
os=get_os_name(), # None is ignored
|
| 55 |
+
device="desktop",
|
| 56 |
).generate()
|
| 57 |
|
| 58 |
|
|
|
|
| 67 |
# So we don't raise any inconsistency red flags while websites fingerprinting us
|
| 68 |
os_name = get_os_name()
|
| 69 |
return HeaderGenerator(
|
| 70 |
+
browser=[Browser(name="chrome", min_version=130)],
|
| 71 |
os=os_name, # None is ignored
|
| 72 |
+
device="desktop",
|
| 73 |
).generate()
|
| 74 |
else:
|
| 75 |
# Here it's used for normal requests that aren't done through browsers so we can take it lightly
|
| 76 |
browsers = [
|
| 77 |
+
Browser(name="chrome", min_version=120),
|
| 78 |
+
Browser(name="firefox", min_version=120),
|
| 79 |
+
Browser(name="edge", min_version=120),
|
| 80 |
]
|
| 81 |
+
return HeaderGenerator(browser=browsers, device="desktop").generate()
|
scrapling/engines/toolbelt/navigation.py
CHANGED
|
@@ -1,6 +1,7 @@
|
|
| 1 |
"""
|
| 2 |
Functions related to files and URLs
|
| 3 |
"""
|
|
|
|
| 4 |
import os
|
| 5 |
from urllib.parse import urlencode, urlparse
|
| 6 |
|
|
@@ -19,7 +20,9 @@ def intercept_route(route: Route):
|
|
| 19 |
:return: PlayWright `Route` object
|
| 20 |
"""
|
| 21 |
if route.request.resource_type in DEFAULT_DISABLED_RESOURCES:
|
| 22 |
-
log.debug(
|
|
|
|
|
|
|
| 23 |
route.abort()
|
| 24 |
else:
|
| 25 |
route.continue_()
|
|
@@ -32,7 +35,9 @@ async def async_intercept_route(route: async_Route):
|
|
| 32 |
:return: PlayWright `Route` object
|
| 33 |
"""
|
| 34 |
if route.request.resource_type in DEFAULT_DISABLED_RESOURCES:
|
| 35 |
-
log.debug(
|
|
|
|
|
|
|
| 36 |
await route.abort()
|
| 37 |
else:
|
| 38 |
await route.continue_()
|
|
@@ -50,23 +55,33 @@ def construct_proxy_dict(proxy_string: Union[str, Dict[str, str]]) -> Union[Dict
|
|
| 50 |
proxy = urlparse(proxy_string)
|
| 51 |
try:
|
| 52 |
return {
|
| 53 |
-
|
| 54 |
-
|
| 55 |
-
|
| 56 |
}
|
| 57 |
except ValueError:
|
| 58 |
# Urllib will say that one of the parameters above can't be casted to the correct type like `int` for port etc...
|
| 59 |
-
raise TypeError(
|
| 60 |
|
| 61 |
elif isinstance(proxy_string, dict):
|
| 62 |
-
valid_keys = (
|
| 63 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 64 |
return proxy_string
|
| 65 |
else:
|
| 66 |
-
raise TypeError(
|
|
|
|
|
|
|
| 67 |
|
| 68 |
else:
|
| 69 |
-
raise TypeError(
|
|
|
|
|
|
|
| 70 |
|
| 71 |
# The default value for proxy in Playwright's source is `None`
|
| 72 |
return None
|
|
@@ -84,7 +99,7 @@ def construct_cdp_url(cdp_url: str, query_params: Optional[Dict] = None) -> str:
|
|
| 84 |
parsed = urlparse(cdp_url)
|
| 85 |
|
| 86 |
# Check scheme
|
| 87 |
-
if parsed.scheme not in (
|
| 88 |
raise ValueError("CDP URL must use 'ws://' or 'wss://' scheme")
|
| 89 |
|
| 90 |
# Validate hostname and port
|
|
@@ -93,8 +108,8 @@ def construct_cdp_url(cdp_url: str, query_params: Optional[Dict] = None) -> str:
|
|
| 93 |
|
| 94 |
# Ensure path starts with /
|
| 95 |
path = parsed.path
|
| 96 |
-
if not path.startswith(
|
| 97 |
-
path =
|
| 98 |
|
| 99 |
# Reconstruct the base URL with validated parts
|
| 100 |
validated_base = f"{parsed.scheme}://{parsed.netloc}{path}"
|
|
@@ -118,4 +133,4 @@ def js_bypass_path(filename: str) -> str:
|
|
| 118 |
:return: The full path of the JS file.
|
| 119 |
"""
|
| 120 |
current_directory = os.path.dirname(__file__)
|
| 121 |
-
return os.path.join(current_directory,
|
|
|
|
| 1 |
"""
|
| 2 |
Functions related to files and URLs
|
| 3 |
"""
|
| 4 |
+
|
| 5 |
import os
|
| 6 |
from urllib.parse import urlencode, urlparse
|
| 7 |
|
|
|
|
| 20 |
:return: PlayWright `Route` object
|
| 21 |
"""
|
| 22 |
if route.request.resource_type in DEFAULT_DISABLED_RESOURCES:
|
| 23 |
+
log.debug(
|
| 24 |
+
f'Blocking background resource "{route.request.url}" of type "{route.request.resource_type}"'
|
| 25 |
+
)
|
| 26 |
route.abort()
|
| 27 |
else:
|
| 28 |
route.continue_()
|
|
|
|
| 35 |
:return: PlayWright `Route` object
|
| 36 |
"""
|
| 37 |
if route.request.resource_type in DEFAULT_DISABLED_RESOURCES:
|
| 38 |
+
log.debug(
|
| 39 |
+
f'Blocking background resource "{route.request.url}" of type "{route.request.resource_type}"'
|
| 40 |
+
)
|
| 41 |
await route.abort()
|
| 42 |
else:
|
| 43 |
await route.continue_()
|
|
|
|
| 55 |
proxy = urlparse(proxy_string)
|
| 56 |
try:
|
| 57 |
return {
|
| 58 |
+
"server": f"{proxy.scheme}://{proxy.hostname}:{proxy.port}",
|
| 59 |
+
"username": proxy.username or "",
|
| 60 |
+
"password": proxy.password or "",
|
| 61 |
}
|
| 62 |
except ValueError:
|
| 63 |
# Urllib will say that one of the parameters above can't be casted to the correct type like `int` for port etc...
|
| 64 |
+
raise TypeError("The proxy argument's string is in invalid format!")
|
| 65 |
|
| 66 |
elif isinstance(proxy_string, dict):
|
| 67 |
+
valid_keys = (
|
| 68 |
+
"server",
|
| 69 |
+
"username",
|
| 70 |
+
"password",
|
| 71 |
+
)
|
| 72 |
+
if all(key in valid_keys for key in proxy_string.keys()) and not any(
|
| 73 |
+
key not in valid_keys for key in proxy_string.keys()
|
| 74 |
+
):
|
| 75 |
return proxy_string
|
| 76 |
else:
|
| 77 |
+
raise TypeError(
|
| 78 |
+
f"A proxy dictionary must have only these keys: {valid_keys}"
|
| 79 |
+
)
|
| 80 |
|
| 81 |
else:
|
| 82 |
+
raise TypeError(
|
| 83 |
+
f"Invalid type of proxy ({type(proxy_string)}), the proxy argument must be a string or a dictionary!"
|
| 84 |
+
)
|
| 85 |
|
| 86 |
# The default value for proxy in Playwright's source is `None`
|
| 87 |
return None
|
|
|
|
| 99 |
parsed = urlparse(cdp_url)
|
| 100 |
|
| 101 |
# Check scheme
|
| 102 |
+
if parsed.scheme not in ("ws", "wss"):
|
| 103 |
raise ValueError("CDP URL must use 'ws://' or 'wss://' scheme")
|
| 104 |
|
| 105 |
# Validate hostname and port
|
|
|
|
| 108 |
|
| 109 |
# Ensure path starts with /
|
| 110 |
path = parsed.path
|
| 111 |
+
if not path.startswith("/"):
|
| 112 |
+
path = "/" + path
|
| 113 |
|
| 114 |
# Reconstruct the base URL with validated parts
|
| 115 |
validated_base = f"{parsed.scheme}://{parsed.netloc}{path}"
|
|
|
|
| 133 |
:return: The full path of the JS file.
|
| 134 |
"""
|
| 135 |
current_directory = os.path.dirname(__file__)
|
| 136 |
+
return os.path.join(current_directory, "bypasses", filename)
|
scrapling/fetchers.py
CHANGED
|
@@ -1,7 +1,18 @@
|
|
| 1 |
-
from scrapling.core._types import (
|
| 2 |
-
|
| 3 |
-
|
| 4 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 5 |
from scrapling.engines.toolbelt import BaseFetcher, Response
|
| 6 |
|
| 7 |
|
|
@@ -10,10 +21,19 @@ class Fetcher(BaseFetcher):
|
|
| 10 |
|
| 11 |
Any additional keyword arguments passed to the methods below are passed to the respective httpx's method directly.
|
| 12 |
"""
|
|
|
|
| 13 |
@classmethod
|
| 14 |
def get(
|
| 15 |
-
|
| 16 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 17 |
"""Make basic HTTP GET request for you but with some added flavors.
|
| 18 |
|
| 19 |
:param url: Target url.
|
|
@@ -30,16 +50,36 @@ class Fetcher(BaseFetcher):
|
|
| 30 |
if not custom_config:
|
| 31 |
custom_config = {}
|
| 32 |
elif not isinstance(custom_config, dict):
|
| 33 |
-
ValueError(
|
|
|
|
|
|
|
| 34 |
|
| 35 |
-
adaptor_arguments = tuple(
|
| 36 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 37 |
return response_object
|
| 38 |
|
| 39 |
@classmethod
|
| 40 |
def post(
|
| 41 |
-
|
| 42 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 43 |
"""Make basic HTTP POST request for you but with some added flavors.
|
| 44 |
|
| 45 |
:param url: Target url.
|
|
@@ -56,16 +96,36 @@ class Fetcher(BaseFetcher):
|
|
| 56 |
if not custom_config:
|
| 57 |
custom_config = {}
|
| 58 |
elif not isinstance(custom_config, dict):
|
| 59 |
-
ValueError(
|
|
|
|
|
|
|
| 60 |
|
| 61 |
-
adaptor_arguments = tuple(
|
| 62 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 63 |
return response_object
|
| 64 |
|
| 65 |
@classmethod
|
| 66 |
def put(
|
| 67 |
-
|
| 68 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 69 |
"""Make basic HTTP PUT request for you but with some added flavors.
|
| 70 |
|
| 71 |
:param url: Target url
|
|
@@ -83,16 +143,36 @@ class Fetcher(BaseFetcher):
|
|
| 83 |
if not custom_config:
|
| 84 |
custom_config = {}
|
| 85 |
elif not isinstance(custom_config, dict):
|
| 86 |
-
ValueError(
|
|
|
|
|
|
|
| 87 |
|
| 88 |
-
adaptor_arguments = tuple(
|
| 89 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 90 |
return response_object
|
| 91 |
|
| 92 |
@classmethod
|
| 93 |
def delete(
|
| 94 |
-
|
| 95 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 96 |
"""Make basic HTTP DELETE request for you but with some added flavors.
|
| 97 |
|
| 98 |
:param url: Target url
|
|
@@ -109,18 +189,38 @@ class Fetcher(BaseFetcher):
|
|
| 109 |
if not custom_config:
|
| 110 |
custom_config = {}
|
| 111 |
elif not isinstance(custom_config, dict):
|
| 112 |
-
ValueError(
|
|
|
|
|
|
|
| 113 |
|
| 114 |
-
adaptor_arguments = tuple(
|
| 115 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 116 |
return response_object
|
| 117 |
|
| 118 |
|
| 119 |
class AsyncFetcher(Fetcher):
|
| 120 |
@classmethod
|
| 121 |
async def get(
|
| 122 |
-
|
| 123 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 124 |
"""Make basic HTTP GET request for you but with some added flavors.
|
| 125 |
|
| 126 |
:param url: Target url.
|
|
@@ -137,16 +237,36 @@ class AsyncFetcher(Fetcher):
|
|
| 137 |
if not custom_config:
|
| 138 |
custom_config = {}
|
| 139 |
elif not isinstance(custom_config, dict):
|
| 140 |
-
ValueError(
|
|
|
|
|
|
|
| 141 |
|
| 142 |
-
adaptor_arguments = tuple(
|
| 143 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 144 |
return response_object
|
| 145 |
|
| 146 |
@classmethod
|
| 147 |
async def post(
|
| 148 |
-
|
| 149 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 150 |
"""Make basic HTTP POST request for you but with some added flavors.
|
| 151 |
|
| 152 |
:param url: Target url.
|
|
@@ -163,16 +283,36 @@ class AsyncFetcher(Fetcher):
|
|
| 163 |
if not custom_config:
|
| 164 |
custom_config = {}
|
| 165 |
elif not isinstance(custom_config, dict):
|
| 166 |
-
ValueError(
|
|
|
|
|
|
|
| 167 |
|
| 168 |
-
adaptor_arguments = tuple(
|
| 169 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 170 |
return response_object
|
| 171 |
|
| 172 |
@classmethod
|
| 173 |
async def put(
|
| 174 |
-
|
| 175 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 176 |
"""Make basic HTTP PUT request for you but with some added flavors.
|
| 177 |
|
| 178 |
:param url: Target url
|
|
@@ -189,16 +329,36 @@ class AsyncFetcher(Fetcher):
|
|
| 189 |
if not custom_config:
|
| 190 |
custom_config = {}
|
| 191 |
elif not isinstance(custom_config, dict):
|
| 192 |
-
ValueError(
|
|
|
|
|
|
|
| 193 |
|
| 194 |
-
adaptor_arguments = tuple(
|
| 195 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 196 |
return response_object
|
| 197 |
|
| 198 |
@classmethod
|
| 199 |
async def delete(
|
| 200 |
-
|
| 201 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 202 |
"""Make basic HTTP DELETE request for you but with some added flavors.
|
| 203 |
|
| 204 |
:param url: Target url
|
|
@@ -215,27 +375,57 @@ class AsyncFetcher(Fetcher):
|
|
| 215 |
if not custom_config:
|
| 216 |
custom_config = {}
|
| 217 |
elif not isinstance(custom_config, dict):
|
| 218 |
-
ValueError(
|
|
|
|
|
|
|
| 219 |
|
| 220 |
-
adaptor_arguments = tuple(
|
| 221 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 222 |
return response_object
|
| 223 |
|
| 224 |
|
| 225 |
class StealthyFetcher(BaseFetcher):
|
| 226 |
"""A `Fetcher` class type that is completely stealthy fetcher that uses a modified version of Firefox.
|
| 227 |
|
| 228 |
-
|
| 229 |
-
|
| 230 |
"""
|
|
|
|
| 231 |
@classmethod
|
| 232 |
def fetch(
|
| 233 |
-
|
| 234 |
-
|
| 235 |
-
|
| 236 |
-
|
| 237 |
-
|
| 238 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 239 |
) -> Response:
|
| 240 |
"""
|
| 241 |
Opens up a browser and do your request based on your chosen options below.
|
|
@@ -271,7 +461,9 @@ class StealthyFetcher(BaseFetcher):
|
|
| 271 |
if not custom_config:
|
| 272 |
custom_config = {}
|
| 273 |
elif not isinstance(custom_config, dict):
|
| 274 |
-
ValueError(
|
|
|
|
|
|
|
| 275 |
|
| 276 |
engine = CamoufoxEngine(
|
| 277 |
wait=wait,
|
|
@@ -294,18 +486,35 @@ class StealthyFetcher(BaseFetcher):
|
|
| 294 |
disable_resources=disable_resources,
|
| 295 |
wait_selector_state=wait_selector_state,
|
| 296 |
adaptor_arguments={**cls._generate_parser_arguments(), **custom_config},
|
| 297 |
-
additional_arguments=additional_arguments or {}
|
| 298 |
)
|
| 299 |
return engine.fetch(url)
|
| 300 |
|
| 301 |
@classmethod
|
| 302 |
async def async_fetch(
|
| 303 |
-
|
| 304 |
-
|
| 305 |
-
|
| 306 |
-
|
| 307 |
-
|
| 308 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 309 |
) -> Response:
|
| 310 |
"""
|
| 311 |
Opens up a browser and do your request based on your chosen options below.
|
|
@@ -341,7 +550,9 @@ class StealthyFetcher(BaseFetcher):
|
|
| 341 |
if not custom_config:
|
| 342 |
custom_config = {}
|
| 343 |
elif not isinstance(custom_config, dict):
|
| 344 |
-
ValueError(
|
|
|
|
|
|
|
| 345 |
|
| 346 |
engine = CamoufoxEngine(
|
| 347 |
wait=wait,
|
|
@@ -364,7 +575,7 @@ class StealthyFetcher(BaseFetcher):
|
|
| 364 |
disable_resources=disable_resources,
|
| 365 |
wait_selector_state=wait_selector_state,
|
| 366 |
adaptor_arguments={**cls._generate_parser_arguments(), **custom_config},
|
| 367 |
-
additional_arguments=additional_arguments or {}
|
| 368 |
)
|
| 369 |
return await engine.async_fetch(url)
|
| 370 |
|
|
@@ -385,17 +596,32 @@ class PlayWrightFetcher(BaseFetcher):
|
|
| 385 |
|
| 386 |
> Note that these are the main options with PlayWright but it can be mixed together.
|
| 387 |
"""
|
|
|
|
| 388 |
@classmethod
|
| 389 |
def fetch(
|
| 390 |
-
|
| 391 |
-
|
| 392 |
-
|
| 393 |
-
|
| 394 |
-
|
| 395 |
-
|
| 396 |
-
|
| 397 |
-
|
| 398 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 399 |
) -> Response:
|
| 400 |
"""Opens up a browser and do your request based on your chosen options below.
|
| 401 |
|
|
@@ -428,7 +654,9 @@ class PlayWrightFetcher(BaseFetcher):
|
|
| 428 |
if not custom_config:
|
| 429 |
custom_config = {}
|
| 430 |
elif not isinstance(custom_config, dict):
|
| 431 |
-
ValueError(
|
|
|
|
|
|
|
| 432 |
|
| 433 |
engine = PlaywrightEngine(
|
| 434 |
wait=wait,
|
|
@@ -457,15 +685,29 @@ class PlayWrightFetcher(BaseFetcher):
|
|
| 457 |
|
| 458 |
@classmethod
|
| 459 |
async def async_fetch(
|
| 460 |
-
|
| 461 |
-
|
| 462 |
-
|
| 463 |
-
|
| 464 |
-
|
| 465 |
-
|
| 466 |
-
|
| 467 |
-
|
| 468 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 469 |
) -> Response:
|
| 470 |
"""Opens up a browser and do your request based on your chosen options below.
|
| 471 |
|
|
@@ -498,7 +740,9 @@ class PlayWrightFetcher(BaseFetcher):
|
|
| 498 |
if not custom_config:
|
| 499 |
custom_config = {}
|
| 500 |
elif not isinstance(custom_config, dict):
|
| 501 |
-
ValueError(
|
|
|
|
|
|
|
| 502 |
|
| 503 |
engine = PlaywrightEngine(
|
| 504 |
wait=wait,
|
|
@@ -529,5 +773,7 @@ class PlayWrightFetcher(BaseFetcher):
|
|
| 529 |
class CustomFetcher(BaseFetcher):
|
| 530 |
@classmethod
|
| 531 |
def fetch(cls, url: str, browser_engine, **kwargs) -> Response:
|
| 532 |
-
engine = check_if_engine_usable(browser_engine)(
|
|
|
|
|
|
|
| 533 |
return engine.fetch(url)
|
|
|
|
| 1 |
+
from scrapling.core._types import (
|
| 2 |
+
Callable,
|
| 3 |
+
Dict,
|
| 4 |
+
List,
|
| 5 |
+
Literal,
|
| 6 |
+
Optional,
|
| 7 |
+
SelectorWaitStates,
|
| 8 |
+
Union,
|
| 9 |
+
)
|
| 10 |
+
from scrapling.engines import (
|
| 11 |
+
CamoufoxEngine,
|
| 12 |
+
PlaywrightEngine,
|
| 13 |
+
StaticEngine,
|
| 14 |
+
check_if_engine_usable,
|
| 15 |
+
)
|
| 16 |
from scrapling.engines.toolbelt import BaseFetcher, Response
|
| 17 |
|
| 18 |
|
|
|
|
| 21 |
|
| 22 |
Any additional keyword arguments passed to the methods below are passed to the respective httpx's method directly.
|
| 23 |
"""
|
| 24 |
+
|
| 25 |
@classmethod
|
| 26 |
def get(
|
| 27 |
+
cls,
|
| 28 |
+
url: str,
|
| 29 |
+
follow_redirects: bool = True,
|
| 30 |
+
timeout: Optional[Union[int, float]] = 10,
|
| 31 |
+
stealthy_headers: bool = True,
|
| 32 |
+
proxy: Optional[str] = None,
|
| 33 |
+
retries: Optional[int] = 3,
|
| 34 |
+
custom_config: Dict = None,
|
| 35 |
+
**kwargs: Dict,
|
| 36 |
+
) -> Response:
|
| 37 |
"""Make basic HTTP GET request for you but with some added flavors.
|
| 38 |
|
| 39 |
:param url: Target url.
|
|
|
|
| 50 |
if not custom_config:
|
| 51 |
custom_config = {}
|
| 52 |
elif not isinstance(custom_config, dict):
|
| 53 |
+
ValueError(
|
| 54 |
+
f"The custom parser config must be of type dictionary, got {cls.__class__}"
|
| 55 |
+
)
|
| 56 |
|
| 57 |
+
adaptor_arguments = tuple(
|
| 58 |
+
{**cls._generate_parser_arguments(), **custom_config}.items()
|
| 59 |
+
)
|
| 60 |
+
response_object = StaticEngine(
|
| 61 |
+
url,
|
| 62 |
+
proxy,
|
| 63 |
+
stealthy_headers,
|
| 64 |
+
follow_redirects,
|
| 65 |
+
timeout,
|
| 66 |
+
retries,
|
| 67 |
+
adaptor_arguments=adaptor_arguments,
|
| 68 |
+
).get(**kwargs)
|
| 69 |
return response_object
|
| 70 |
|
| 71 |
@classmethod
|
| 72 |
def post(
|
| 73 |
+
cls,
|
| 74 |
+
url: str,
|
| 75 |
+
follow_redirects: bool = True,
|
| 76 |
+
timeout: Optional[Union[int, float]] = 10,
|
| 77 |
+
stealthy_headers: bool = True,
|
| 78 |
+
proxy: Optional[str] = None,
|
| 79 |
+
retries: Optional[int] = 3,
|
| 80 |
+
custom_config: Dict = None,
|
| 81 |
+
**kwargs: Dict,
|
| 82 |
+
) -> Response:
|
| 83 |
"""Make basic HTTP POST request for you but with some added flavors.
|
| 84 |
|
| 85 |
:param url: Target url.
|
|
|
|
| 96 |
if not custom_config:
|
| 97 |
custom_config = {}
|
| 98 |
elif not isinstance(custom_config, dict):
|
| 99 |
+
ValueError(
|
| 100 |
+
f"The custom parser config must be of type dictionary, got {cls.__class__}"
|
| 101 |
+
)
|
| 102 |
|
| 103 |
+
adaptor_arguments = tuple(
|
| 104 |
+
{**cls._generate_parser_arguments(), **custom_config}.items()
|
| 105 |
+
)
|
| 106 |
+
response_object = StaticEngine(
|
| 107 |
+
url,
|
| 108 |
+
proxy,
|
| 109 |
+
stealthy_headers,
|
| 110 |
+
follow_redirects,
|
| 111 |
+
timeout,
|
| 112 |
+
retries,
|
| 113 |
+
adaptor_arguments=adaptor_arguments,
|
| 114 |
+
).post(**kwargs)
|
| 115 |
return response_object
|
| 116 |
|
| 117 |
@classmethod
|
| 118 |
def put(
|
| 119 |
+
cls,
|
| 120 |
+
url: str,
|
| 121 |
+
follow_redirects: bool = True,
|
| 122 |
+
timeout: Optional[Union[int, float]] = 10,
|
| 123 |
+
stealthy_headers: bool = True,
|
| 124 |
+
proxy: Optional[str] = None,
|
| 125 |
+
retries: Optional[int] = 3,
|
| 126 |
+
custom_config: Dict = None,
|
| 127 |
+
**kwargs: Dict,
|
| 128 |
+
) -> Response:
|
| 129 |
"""Make basic HTTP PUT request for you but with some added flavors.
|
| 130 |
|
| 131 |
:param url: Target url
|
|
|
|
| 143 |
if not custom_config:
|
| 144 |
custom_config = {}
|
| 145 |
elif not isinstance(custom_config, dict):
|
| 146 |
+
ValueError(
|
| 147 |
+
f"The custom parser config must be of type dictionary, got {cls.__class__}"
|
| 148 |
+
)
|
| 149 |
|
| 150 |
+
adaptor_arguments = tuple(
|
| 151 |
+
{**cls._generate_parser_arguments(), **custom_config}.items()
|
| 152 |
+
)
|
| 153 |
+
response_object = StaticEngine(
|
| 154 |
+
url,
|
| 155 |
+
proxy,
|
| 156 |
+
stealthy_headers,
|
| 157 |
+
follow_redirects,
|
| 158 |
+
timeout,
|
| 159 |
+
retries,
|
| 160 |
+
adaptor_arguments=adaptor_arguments,
|
| 161 |
+
).put(**kwargs)
|
| 162 |
return response_object
|
| 163 |
|
| 164 |
@classmethod
|
| 165 |
def delete(
|
| 166 |
+
cls,
|
| 167 |
+
url: str,
|
| 168 |
+
follow_redirects: bool = True,
|
| 169 |
+
timeout: Optional[Union[int, float]] = 10,
|
| 170 |
+
stealthy_headers: bool = True,
|
| 171 |
+
proxy: Optional[str] = None,
|
| 172 |
+
retries: Optional[int] = 3,
|
| 173 |
+
custom_config: Dict = None,
|
| 174 |
+
**kwargs: Dict,
|
| 175 |
+
) -> Response:
|
| 176 |
"""Make basic HTTP DELETE request for you but with some added flavors.
|
| 177 |
|
| 178 |
:param url: Target url
|
|
|
|
| 189 |
if not custom_config:
|
| 190 |
custom_config = {}
|
| 191 |
elif not isinstance(custom_config, dict):
|
| 192 |
+
ValueError(
|
| 193 |
+
f"The custom parser config must be of type dictionary, got {cls.__class__}"
|
| 194 |
+
)
|
| 195 |
|
| 196 |
+
adaptor_arguments = tuple(
|
| 197 |
+
{**cls._generate_parser_arguments(), **custom_config}.items()
|
| 198 |
+
)
|
| 199 |
+
response_object = StaticEngine(
|
| 200 |
+
url,
|
| 201 |
+
proxy,
|
| 202 |
+
stealthy_headers,
|
| 203 |
+
follow_redirects,
|
| 204 |
+
timeout,
|
| 205 |
+
retries,
|
| 206 |
+
adaptor_arguments=adaptor_arguments,
|
| 207 |
+
).delete(**kwargs)
|
| 208 |
return response_object
|
| 209 |
|
| 210 |
|
| 211 |
class AsyncFetcher(Fetcher):
|
| 212 |
@classmethod
|
| 213 |
async def get(
|
| 214 |
+
cls,
|
| 215 |
+
url: str,
|
| 216 |
+
follow_redirects: bool = True,
|
| 217 |
+
timeout: Optional[Union[int, float]] = 10,
|
| 218 |
+
stealthy_headers: bool = True,
|
| 219 |
+
proxy: Optional[str] = None,
|
| 220 |
+
retries: Optional[int] = 3,
|
| 221 |
+
custom_config: Dict = None,
|
| 222 |
+
**kwargs: Dict,
|
| 223 |
+
) -> Response:
|
| 224 |
"""Make basic HTTP GET request for you but with some added flavors.
|
| 225 |
|
| 226 |
:param url: Target url.
|
|
|
|
| 237 |
if not custom_config:
|
| 238 |
custom_config = {}
|
| 239 |
elif not isinstance(custom_config, dict):
|
| 240 |
+
ValueError(
|
| 241 |
+
f"The custom parser config must be of type dictionary, got {cls.__class__}"
|
| 242 |
+
)
|
| 243 |
|
| 244 |
+
adaptor_arguments = tuple(
|
| 245 |
+
{**cls._generate_parser_arguments(), **custom_config}.items()
|
| 246 |
+
)
|
| 247 |
+
response_object = await StaticEngine(
|
| 248 |
+
url,
|
| 249 |
+
proxy,
|
| 250 |
+
stealthy_headers,
|
| 251 |
+
follow_redirects,
|
| 252 |
+
timeout,
|
| 253 |
+
retries=retries,
|
| 254 |
+
adaptor_arguments=adaptor_arguments,
|
| 255 |
+
).async_get(**kwargs)
|
| 256 |
return response_object
|
| 257 |
|
| 258 |
@classmethod
|
| 259 |
async def post(
|
| 260 |
+
cls,
|
| 261 |
+
url: str,
|
| 262 |
+
follow_redirects: bool = True,
|
| 263 |
+
timeout: Optional[Union[int, float]] = 10,
|
| 264 |
+
stealthy_headers: bool = True,
|
| 265 |
+
proxy: Optional[str] = None,
|
| 266 |
+
retries: Optional[int] = 3,
|
| 267 |
+
custom_config: Dict = None,
|
| 268 |
+
**kwargs: Dict,
|
| 269 |
+
) -> Response:
|
| 270 |
"""Make basic HTTP POST request for you but with some added flavors.
|
| 271 |
|
| 272 |
:param url: Target url.
|
|
|
|
| 283 |
if not custom_config:
|
| 284 |
custom_config = {}
|
| 285 |
elif not isinstance(custom_config, dict):
|
| 286 |
+
ValueError(
|
| 287 |
+
f"The custom parser config must be of type dictionary, got {cls.__class__}"
|
| 288 |
+
)
|
| 289 |
|
| 290 |
+
adaptor_arguments = tuple(
|
| 291 |
+
{**cls._generate_parser_arguments(), **custom_config}.items()
|
| 292 |
+
)
|
| 293 |
+
response_object = await StaticEngine(
|
| 294 |
+
url,
|
| 295 |
+
proxy,
|
| 296 |
+
stealthy_headers,
|
| 297 |
+
follow_redirects,
|
| 298 |
+
timeout,
|
| 299 |
+
retries=retries,
|
| 300 |
+
adaptor_arguments=adaptor_arguments,
|
| 301 |
+
).async_post(**kwargs)
|
| 302 |
return response_object
|
| 303 |
|
| 304 |
@classmethod
|
| 305 |
async def put(
|
| 306 |
+
cls,
|
| 307 |
+
url: str,
|
| 308 |
+
follow_redirects: bool = True,
|
| 309 |
+
timeout: Optional[Union[int, float]] = 10,
|
| 310 |
+
stealthy_headers: bool = True,
|
| 311 |
+
proxy: Optional[str] = None,
|
| 312 |
+
retries: Optional[int] = 3,
|
| 313 |
+
custom_config: Dict = None,
|
| 314 |
+
**kwargs: Dict,
|
| 315 |
+
) -> Response:
|
| 316 |
"""Make basic HTTP PUT request for you but with some added flavors.
|
| 317 |
|
| 318 |
:param url: Target url
|
|
|
|
| 329 |
if not custom_config:
|
| 330 |
custom_config = {}
|
| 331 |
elif not isinstance(custom_config, dict):
|
| 332 |
+
ValueError(
|
| 333 |
+
f"The custom parser config must be of type dictionary, got {cls.__class__}"
|
| 334 |
+
)
|
| 335 |
|
| 336 |
+
adaptor_arguments = tuple(
|
| 337 |
+
{**cls._generate_parser_arguments(), **custom_config}.items()
|
| 338 |
+
)
|
| 339 |
+
response_object = await StaticEngine(
|
| 340 |
+
url,
|
| 341 |
+
proxy,
|
| 342 |
+
stealthy_headers,
|
| 343 |
+
follow_redirects,
|
| 344 |
+
timeout,
|
| 345 |
+
retries=retries,
|
| 346 |
+
adaptor_arguments=adaptor_arguments,
|
| 347 |
+
).async_put(**kwargs)
|
| 348 |
return response_object
|
| 349 |
|
| 350 |
@classmethod
|
| 351 |
async def delete(
|
| 352 |
+
cls,
|
| 353 |
+
url: str,
|
| 354 |
+
follow_redirects: bool = True,
|
| 355 |
+
timeout: Optional[Union[int, float]] = 10,
|
| 356 |
+
stealthy_headers: bool = True,
|
| 357 |
+
proxy: Optional[str] = None,
|
| 358 |
+
retries: Optional[int] = 3,
|
| 359 |
+
custom_config: Dict = None,
|
| 360 |
+
**kwargs: Dict,
|
| 361 |
+
) -> Response:
|
| 362 |
"""Make basic HTTP DELETE request for you but with some added flavors.
|
| 363 |
|
| 364 |
:param url: Target url
|
|
|
|
| 375 |
if not custom_config:
|
| 376 |
custom_config = {}
|
| 377 |
elif not isinstance(custom_config, dict):
|
| 378 |
+
ValueError(
|
| 379 |
+
f"The custom parser config must be of type dictionary, got {cls.__class__}"
|
| 380 |
+
)
|
| 381 |
|
| 382 |
+
adaptor_arguments = tuple(
|
| 383 |
+
{**cls._generate_parser_arguments(), **custom_config}.items()
|
| 384 |
+
)
|
| 385 |
+
response_object = await StaticEngine(
|
| 386 |
+
url,
|
| 387 |
+
proxy,
|
| 388 |
+
stealthy_headers,
|
| 389 |
+
follow_redirects,
|
| 390 |
+
timeout,
|
| 391 |
+
retries=retries,
|
| 392 |
+
adaptor_arguments=adaptor_arguments,
|
| 393 |
+
).async_delete(**kwargs)
|
| 394 |
return response_object
|
| 395 |
|
| 396 |
|
| 397 |
class StealthyFetcher(BaseFetcher):
|
| 398 |
"""A `Fetcher` class type that is completely stealthy fetcher that uses a modified version of Firefox.
|
| 399 |
|
| 400 |
+
It works as real browsers passing almost all online tests/protections based on Camoufox.
|
| 401 |
+
Other added flavors include setting the faked OS fingerprints to match the user's OS and the referer of every request is set as if this request came from Google's search of this URL's domain.
|
| 402 |
"""
|
| 403 |
+
|
| 404 |
@classmethod
|
| 405 |
def fetch(
|
| 406 |
+
cls,
|
| 407 |
+
url: str,
|
| 408 |
+
headless: Union[bool, Literal["virtual"]] = True, # noqa: F821
|
| 409 |
+
block_images: bool = False,
|
| 410 |
+
disable_resources: bool = False,
|
| 411 |
+
block_webrtc: bool = False,
|
| 412 |
+
allow_webgl: bool = True,
|
| 413 |
+
network_idle: bool = False,
|
| 414 |
+
addons: Optional[List[str]] = None,
|
| 415 |
+
wait: Optional[int] = 0,
|
| 416 |
+
timeout: Optional[float] = 30000,
|
| 417 |
+
page_action: Callable = None,
|
| 418 |
+
wait_selector: Optional[str] = None,
|
| 419 |
+
humanize: Optional[Union[bool, float]] = True,
|
| 420 |
+
wait_selector_state: SelectorWaitStates = "attached",
|
| 421 |
+
google_search: bool = True,
|
| 422 |
+
extra_headers: Optional[Dict[str, str]] = None,
|
| 423 |
+
proxy: Optional[Union[str, Dict[str, str]]] = None,
|
| 424 |
+
os_randomize: bool = False,
|
| 425 |
+
disable_ads: bool = False,
|
| 426 |
+
geoip: bool = False,
|
| 427 |
+
custom_config: Dict = None,
|
| 428 |
+
additional_arguments: Dict = None,
|
| 429 |
) -> Response:
|
| 430 |
"""
|
| 431 |
Opens up a browser and do your request based on your chosen options below.
|
|
|
|
| 461 |
if not custom_config:
|
| 462 |
custom_config = {}
|
| 463 |
elif not isinstance(custom_config, dict):
|
| 464 |
+
ValueError(
|
| 465 |
+
f"The custom parser config must be of type dictionary, got {cls.__class__}"
|
| 466 |
+
)
|
| 467 |
|
| 468 |
engine = CamoufoxEngine(
|
| 469 |
wait=wait,
|
|
|
|
| 486 |
disable_resources=disable_resources,
|
| 487 |
wait_selector_state=wait_selector_state,
|
| 488 |
adaptor_arguments={**cls._generate_parser_arguments(), **custom_config},
|
| 489 |
+
additional_arguments=additional_arguments or {},
|
| 490 |
)
|
| 491 |
return engine.fetch(url)
|
| 492 |
|
| 493 |
@classmethod
|
| 494 |
async def async_fetch(
|
| 495 |
+
cls,
|
| 496 |
+
url: str,
|
| 497 |
+
headless: Union[bool, Literal["virtual"]] = True, # noqa: F821
|
| 498 |
+
block_images: bool = False,
|
| 499 |
+
disable_resources: bool = False,
|
| 500 |
+
block_webrtc: bool = False,
|
| 501 |
+
allow_webgl: bool = True,
|
| 502 |
+
network_idle: bool = False,
|
| 503 |
+
addons: Optional[List[str]] = None,
|
| 504 |
+
wait: Optional[int] = 0,
|
| 505 |
+
timeout: Optional[float] = 30000,
|
| 506 |
+
page_action: Callable = None,
|
| 507 |
+
wait_selector: Optional[str] = None,
|
| 508 |
+
humanize: Optional[Union[bool, float]] = True,
|
| 509 |
+
wait_selector_state: SelectorWaitStates = "attached",
|
| 510 |
+
google_search: bool = True,
|
| 511 |
+
extra_headers: Optional[Dict[str, str]] = None,
|
| 512 |
+
proxy: Optional[Union[str, Dict[str, str]]] = None,
|
| 513 |
+
os_randomize: bool = False,
|
| 514 |
+
disable_ads: bool = False,
|
| 515 |
+
geoip: bool = False,
|
| 516 |
+
custom_config: Dict = None,
|
| 517 |
+
additional_arguments: Dict = None,
|
| 518 |
) -> Response:
|
| 519 |
"""
|
| 520 |
Opens up a browser and do your request based on your chosen options below.
|
|
|
|
| 550 |
if not custom_config:
|
| 551 |
custom_config = {}
|
| 552 |
elif not isinstance(custom_config, dict):
|
| 553 |
+
ValueError(
|
| 554 |
+
f"The custom parser config must be of type dictionary, got {cls.__class__}"
|
| 555 |
+
)
|
| 556 |
|
| 557 |
engine = CamoufoxEngine(
|
| 558 |
wait=wait,
|
|
|
|
| 575 |
disable_resources=disable_resources,
|
| 576 |
wait_selector_state=wait_selector_state,
|
| 577 |
adaptor_arguments={**cls._generate_parser_arguments(), **custom_config},
|
| 578 |
+
additional_arguments=additional_arguments or {},
|
| 579 |
)
|
| 580 |
return await engine.async_fetch(url)
|
| 581 |
|
|
|
|
| 596 |
|
| 597 |
> Note that these are the main options with PlayWright but it can be mixed together.
|
| 598 |
"""
|
| 599 |
+
|
| 600 |
@classmethod
|
| 601 |
def fetch(
|
| 602 |
+
cls,
|
| 603 |
+
url: str,
|
| 604 |
+
headless: Union[bool, str] = True,
|
| 605 |
+
disable_resources: bool = None,
|
| 606 |
+
useragent: Optional[str] = None,
|
| 607 |
+
network_idle: bool = False,
|
| 608 |
+
timeout: Optional[float] = 30000,
|
| 609 |
+
wait: Optional[int] = 0,
|
| 610 |
+
page_action: Optional[Callable] = None,
|
| 611 |
+
wait_selector: Optional[str] = None,
|
| 612 |
+
wait_selector_state: SelectorWaitStates = "attached",
|
| 613 |
+
hide_canvas: bool = False,
|
| 614 |
+
disable_webgl: bool = False,
|
| 615 |
+
extra_headers: Optional[Dict[str, str]] = None,
|
| 616 |
+
google_search: bool = True,
|
| 617 |
+
proxy: Optional[Union[str, Dict[str, str]]] = None,
|
| 618 |
+
locale: Optional[str] = "en-US",
|
| 619 |
+
stealth: bool = False,
|
| 620 |
+
real_chrome: bool = False,
|
| 621 |
+
cdp_url: Optional[str] = None,
|
| 622 |
+
nstbrowser_mode: bool = False,
|
| 623 |
+
nstbrowser_config: Optional[Dict] = None,
|
| 624 |
+
custom_config: Dict = None,
|
| 625 |
) -> Response:
|
| 626 |
"""Opens up a browser and do your request based on your chosen options below.
|
| 627 |
|
|
|
|
| 654 |
if not custom_config:
|
| 655 |
custom_config = {}
|
| 656 |
elif not isinstance(custom_config, dict):
|
| 657 |
+
ValueError(
|
| 658 |
+
f"The custom parser config must be of type dictionary, got {cls.__class__}"
|
| 659 |
+
)
|
| 660 |
|
| 661 |
engine = PlaywrightEngine(
|
| 662 |
wait=wait,
|
|
|
|
| 685 |
|
| 686 |
@classmethod
|
| 687 |
async def async_fetch(
|
| 688 |
+
cls,
|
| 689 |
+
url: str,
|
| 690 |
+
headless: Union[bool, str] = True,
|
| 691 |
+
disable_resources: bool = None,
|
| 692 |
+
useragent: Optional[str] = None,
|
| 693 |
+
network_idle: bool = False,
|
| 694 |
+
timeout: Optional[float] = 30000,
|
| 695 |
+
wait: Optional[int] = 0,
|
| 696 |
+
page_action: Optional[Callable] = None,
|
| 697 |
+
wait_selector: Optional[str] = None,
|
| 698 |
+
wait_selector_state: SelectorWaitStates = "attached",
|
| 699 |
+
hide_canvas: bool = False,
|
| 700 |
+
disable_webgl: bool = False,
|
| 701 |
+
extra_headers: Optional[Dict[str, str]] = None,
|
| 702 |
+
google_search: bool = True,
|
| 703 |
+
proxy: Optional[Union[str, Dict[str, str]]] = None,
|
| 704 |
+
locale: Optional[str] = "en-US",
|
| 705 |
+
stealth: bool = False,
|
| 706 |
+
real_chrome: bool = False,
|
| 707 |
+
cdp_url: Optional[str] = None,
|
| 708 |
+
nstbrowser_mode: bool = False,
|
| 709 |
+
nstbrowser_config: Optional[Dict] = None,
|
| 710 |
+
custom_config: Dict = None,
|
| 711 |
) -> Response:
|
| 712 |
"""Opens up a browser and do your request based on your chosen options below.
|
| 713 |
|
|
|
|
| 740 |
if not custom_config:
|
| 741 |
custom_config = {}
|
| 742 |
elif not isinstance(custom_config, dict):
|
| 743 |
+
ValueError(
|
| 744 |
+
f"The custom parser config must be of type dictionary, got {cls.__class__}"
|
| 745 |
+
)
|
| 746 |
|
| 747 |
engine = PlaywrightEngine(
|
| 748 |
wait=wait,
|
|
|
|
| 773 |
class CustomFetcher(BaseFetcher):
|
| 774 |
@classmethod
|
| 775 |
def fetch(cls, url: str, browser_engine, **kwargs) -> Response:
|
| 776 |
+
engine = check_if_engine_usable(browser_engine)(
|
| 777 |
+
adaptor_arguments=cls._generate_parser_arguments(), **kwargs
|
| 778 |
+
)
|
| 779 |
return engine.fetch(url)
|
scrapling/parser.py
CHANGED
|
@@ -9,40 +9,59 @@ from cssselect import SelectorError, SelectorSyntaxError
|
|
| 9 |
from cssselect import parse as split_selectors
|
| 10 |
from lxml import etree, html
|
| 11 |
|
| 12 |
-
from scrapling.core._types import (
|
| 13 |
-
|
| 14 |
-
|
| 15 |
-
|
| 16 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 17 |
from scrapling.core.mixins import SelectorsGeneration
|
| 18 |
-
from scrapling.core.storage_adaptors import (
|
| 19 |
-
|
|
|
|
|
|
|
|
|
|
| 20 |
from scrapling.core.translator import translator_instance
|
| 21 |
-
from scrapling.core.utils import
|
| 22 |
-
is_jsonable, log)
|
| 23 |
|
| 24 |
|
| 25 |
class Adaptor(SelectorsGeneration):
|
| 26 |
__slots__ = (
|
| 27 |
-
|
| 28 |
-
|
| 29 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 30 |
)
|
| 31 |
|
| 32 |
def __init__(
|
| 33 |
-
|
| 34 |
-
|
| 35 |
-
|
| 36 |
-
|
| 37 |
-
|
| 38 |
-
|
| 39 |
-
|
| 40 |
-
|
| 41 |
-
|
| 42 |
-
|
| 43 |
-
|
| 44 |
-
|
| 45 |
-
|
| 46 |
):
|
| 47 |
"""The main class that works as a wrapper for the HTML input data. Using this class, you can search for elements
|
| 48 |
with expressions in CSS, XPath, or with simply text. Check the docs for more info.
|
|
@@ -69,25 +88,37 @@ class Adaptor(SelectorsGeneration):
|
|
| 69 |
If empty, default values will be used.
|
| 70 |
"""
|
| 71 |
if root is None and not body and text is None:
|
| 72 |
-
raise ValueError(
|
|
|
|
|
|
|
| 73 |
|
| 74 |
-
self.__text =
|
| 75 |
if root is None:
|
| 76 |
if text is None:
|
| 77 |
if not body or not isinstance(body, bytes):
|
| 78 |
-
raise TypeError(
|
|
|
|
|
|
|
| 79 |
|
| 80 |
body = body.replace(b"\x00", b"").strip()
|
| 81 |
else:
|
| 82 |
if not isinstance(text, str):
|
| 83 |
-
raise TypeError(
|
|
|
|
|
|
|
| 84 |
|
| 85 |
body = text.strip().replace("\x00", "").encode(encoding) or b"<html/>"
|
| 86 |
|
| 87 |
# https://lxml.de/api/lxml.etree.HTMLParser-class.html
|
| 88 |
parser = html.HTMLParser(
|
| 89 |
-
recover=True,
|
| 90 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 91 |
)
|
| 92 |
self._root = etree.fromstring(body, parser=parser, base_url=url)
|
| 93 |
if is_jsonable(text or body.decode()):
|
|
@@ -107,15 +138,21 @@ class Adaptor(SelectorsGeneration):
|
|
| 107 |
if self.__auto_match_enabled:
|
| 108 |
if not storage_args:
|
| 109 |
storage_args = {
|
| 110 |
-
|
| 111 |
-
|
|
|
|
|
|
|
| 112 |
}
|
| 113 |
|
| 114 |
-
if not hasattr(storage,
|
| 115 |
-
raise ValueError(
|
|
|
|
|
|
|
| 116 |
|
| 117 |
if not issubclass(storage.__wrapped__, StorageSystemMixin):
|
| 118 |
-
raise ValueError(
|
|
|
|
|
|
|
| 119 |
|
| 120 |
self._storage = storage(**storage_args)
|
| 121 |
|
|
@@ -128,13 +165,27 @@ class Adaptor(SelectorsGeneration):
|
|
| 128 |
self.__attributes = None
|
| 129 |
self.__tag = None
|
| 130 |
# No need to check if all response attributes exist or not because if `status` exist, then the rest exist (Save some CPU cycles for speed)
|
| 131 |
-
self.__response_data =
|
| 132 |
-
|
| 133 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 134 |
|
| 135 |
# Node functionalities, I wanted to move to separate Mixin class but it had slight impact on performance
|
| 136 |
@staticmethod
|
| 137 |
-
def _is_text_node(
|
|
|
|
|
|
|
| 138 |
"""Return True if given element is a result of a string expression
|
| 139 |
Examples:
|
| 140 |
XPath -> '/text()', '/@attribute' etc...
|
|
@@ -144,25 +195,33 @@ class Adaptor(SelectorsGeneration):
|
|
| 144 |
return issubclass(type(element), etree._ElementUnicodeResult)
|
| 145 |
|
| 146 |
@staticmethod
|
| 147 |
-
def __content_convertor(
|
|
|
|
|
|
|
| 148 |
"""Used internally to convert a single element's text content to TextHandler directly without checks
|
| 149 |
|
| 150 |
This single line has been isolated like this so when it's used with map we get that slight performance boost vs list comprehension
|
| 151 |
"""
|
| 152 |
return TextHandler(str(element))
|
| 153 |
|
| 154 |
-
def __element_convertor(self, element: html.HtmlElement) ->
|
| 155 |
"""Used internally to convert a single HtmlElement to Adaptor directly without checks"""
|
| 156 |
return Adaptor(
|
| 157 |
root=element,
|
| 158 |
-
text=
|
| 159 |
-
|
| 160 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
| 161 |
huge_tree=self.__huge_tree_enabled,
|
| 162 |
-
**self.__response_data
|
| 163 |
)
|
| 164 |
|
| 165 |
-
def __handle_element(
|
|
|
|
|
|
|
| 166 |
"""Used internally in all functions to convert a single element to type (Adaptor|TextHandler) when possible"""
|
| 167 |
if element is None:
|
| 168 |
return None
|
|
@@ -172,9 +231,13 @@ class Adaptor(SelectorsGeneration):
|
|
| 172 |
else:
|
| 173 |
return self.__element_convertor(element)
|
| 174 |
|
| 175 |
-
def __handle_elements(
|
|
|
|
|
|
|
| 176 |
"""Used internally in all functions to convert results to type (Adaptors|TextHandlers) in bulk when possible"""
|
| 177 |
-
if not len(
|
|
|
|
|
|
|
| 178 |
return Adaptors([])
|
| 179 |
|
| 180 |
# From within the code, this method will always get a list of the same type
|
|
@@ -209,7 +272,16 @@ class Adaptor(SelectorsGeneration):
|
|
| 209 |
self.__text = TextHandler(self._root.text)
|
| 210 |
return self.__text
|
| 211 |
|
| 212 |
-
def get_all_text(
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 213 |
"""Get all child strings of this element, concatenated using the given separator.
|
| 214 |
|
| 215 |
:param separator: Strings will be concatenated using this separator.
|
|
@@ -220,7 +292,7 @@ class Adaptor(SelectorsGeneration):
|
|
| 220 |
:return: A TextHandler
|
| 221 |
"""
|
| 222 |
_all_strings = []
|
| 223 |
-
for node in self._root.xpath(
|
| 224 |
if node.tag not in ignore_tags:
|
| 225 |
text = node.text
|
| 226 |
if text and type(text) is str:
|
|
@@ -245,13 +317,25 @@ class Adaptor(SelectorsGeneration):
|
|
| 245 |
@property
|
| 246 |
def html_content(self) -> TextHandler:
|
| 247 |
"""Return the inner html code of the element"""
|
| 248 |
-
return TextHandler(
|
|
|
|
|
|
|
|
|
|
|
|
|
| 249 |
|
| 250 |
body = html_content
|
| 251 |
|
| 252 |
def prettify(self) -> TextHandler:
|
| 253 |
"""Return a prettified version of the element's inner html-code"""
|
| 254 |
-
return TextHandler(
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 255 |
|
| 256 |
def has_class(self, class_name: str) -> bool:
|
| 257 |
"""Check if element has a specific class
|
|
@@ -261,36 +345,44 @@ class Adaptor(SelectorsGeneration):
|
|
| 261 |
return class_name in self._root.classes
|
| 262 |
|
| 263 |
@property
|
| 264 |
-
def parent(self) -> Union[
|
| 265 |
"""Return the direct parent of the element or ``None`` otherwise"""
|
| 266 |
return self.__handle_element(self._root.getparent())
|
| 267 |
|
| 268 |
@property
|
| 269 |
-
def below_elements(self) ->
|
| 270 |
"""Return all elements under the current element in the DOM tree"""
|
| 271 |
-
below = self._root.xpath(
|
| 272 |
return self.__handle_elements(below)
|
| 273 |
|
| 274 |
@property
|
| 275 |
-
def children(self) ->
|
| 276 |
"""Return the children elements of the current element or empty list otherwise"""
|
| 277 |
-
return Adaptors(
|
| 278 |
-
|
| 279 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
| 280 |
|
| 281 |
@property
|
| 282 |
-
def siblings(self) ->
|
| 283 |
"""Return other children of the current element's parent or empty list otherwise"""
|
| 284 |
if self.parent:
|
| 285 |
-
return Adaptors(
|
|
|
|
|
|
|
| 286 |
return Adaptors([])
|
| 287 |
|
| 288 |
-
def iterancestors(self) -> Generator[
|
| 289 |
"""Return a generator that loops over all ancestors of the element, starting with element's parent."""
|
| 290 |
for ancestor in self._root.iterancestors():
|
| 291 |
yield self.__element_convertor(ancestor)
|
| 292 |
|
| 293 |
-
def find_ancestor(
|
|
|
|
|
|
|
| 294 |
"""Loop over all ancestors of the element till one match the passed function
|
| 295 |
:param func: A function that takes each ancestor as an argument and returns True/False
|
| 296 |
:return: The first ancestor that match the function or ``None`` otherwise.
|
|
@@ -301,13 +393,13 @@ class Adaptor(SelectorsGeneration):
|
|
| 301 |
return None
|
| 302 |
|
| 303 |
@property
|
| 304 |
-
def path(self) ->
|
| 305 |
"""Returns list of type :class:`Adaptors` that contains the path leading to the current element from the root."""
|
| 306 |
lst = list(self.iterancestors())
|
| 307 |
return Adaptors(lst)
|
| 308 |
|
| 309 |
@property
|
| 310 |
-
def next(self) -> Union[
|
| 311 |
"""Returns the next element of the current element in the children of the parent or ``None`` otherwise."""
|
| 312 |
next_element = self._root.getnext()
|
| 313 |
if next_element is not None:
|
|
@@ -318,7 +410,7 @@ class Adaptor(SelectorsGeneration):
|
|
| 318 |
return self.__handle_element(next_element)
|
| 319 |
|
| 320 |
@property
|
| 321 |
-
def previous(self) -> Union[
|
| 322 |
"""Returns the previous element of the current element in the children of the parent or ``None`` otherwise."""
|
| 323 |
prev_element = self._root.getprevious()
|
| 324 |
if prev_element is not None:
|
|
@@ -346,13 +438,13 @@ class Adaptor(SelectorsGeneration):
|
|
| 346 |
data = "<"
|
| 347 |
content = clean_spaces(self.html_content)
|
| 348 |
if len(content) > length_limit:
|
| 349 |
-
content = content[:length_limit].strip() +
|
| 350 |
data += f"data='{content}'"
|
| 351 |
|
| 352 |
if self.parent:
|
| 353 |
parent_content = clean_spaces(self.parent.html_content)
|
| 354 |
if len(parent_content) > length_limit:
|
| 355 |
-
parent_content = parent_content[:length_limit].strip() +
|
| 356 |
|
| 357 |
data += f" parent='{parent_content}'"
|
| 358 |
|
|
@@ -360,8 +452,11 @@ class Adaptor(SelectorsGeneration):
|
|
| 360 |
|
| 361 |
# From here we start the selecting functions
|
| 362 |
def relocate(
|
| 363 |
-
|
| 364 |
-
|
|
|
|
|
|
|
|
|
|
| 365 |
"""This function will search again for the element in the page tree, used automatically on page structure change
|
| 366 |
|
| 367 |
:param element: The element we want to relocate in the tree
|
|
@@ -379,7 +474,7 @@ class Adaptor(SelectorsGeneration):
|
|
| 379 |
if issubclass(type(element), html.HtmlElement):
|
| 380 |
element = _StorageTools.element_to_dict(element)
|
| 381 |
|
| 382 |
-
for node in self._root.xpath(
|
| 383 |
# Collect all elements in the page then for each element get the matching score of it against the node.
|
| 384 |
# Hence: the code doesn't stop even if the score was 100%
|
| 385 |
# because there might be another element(s) left in page with the same score
|
|
@@ -391,19 +486,26 @@ class Adaptor(SelectorsGeneration):
|
|
| 391 |
if score_table[highest_probability] and highest_probability >= percentage:
|
| 392 |
if log.getEffectiveLevel() < 20:
|
| 393 |
# No need to execute this part if logging level is not debugging
|
| 394 |
-
log.debug(f
|
| 395 |
-
log.debug(
|
| 396 |
for percent in tuple(sorted(score_table.keys(), reverse=True))[:5]:
|
| 397 |
-
log.debug(
|
|
|
|
|
|
|
| 398 |
|
| 399 |
if not adaptor_type:
|
| 400 |
return score_table[highest_probability]
|
| 401 |
return self.__handle_elements(score_table[highest_probability])
|
| 402 |
return []
|
| 403 |
|
| 404 |
-
def css_first(
|
| 405 |
-
|
| 406 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 407 |
"""Search current tree with CSS3 selectors and return the first result if possible, otherwise return `None`
|
| 408 |
|
| 409 |
**Important:
|
|
@@ -419,13 +521,21 @@ class Adaptor(SelectorsGeneration):
|
|
| 419 |
Be aware that the percentage calculation depends solely on the page structure so don't play with this
|
| 420 |
number unless you must know what you are doing!
|
| 421 |
"""
|
| 422 |
-
for element in self.css(
|
|
|
|
|
|
|
| 423 |
return element
|
| 424 |
return None
|
| 425 |
|
| 426 |
-
def xpath_first(
|
| 427 |
-
|
| 428 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 429 |
"""Search current tree with XPath selectors and return the first result if possible, otherwise return `None`
|
| 430 |
|
| 431 |
**Important:
|
|
@@ -443,13 +553,20 @@ class Adaptor(SelectorsGeneration):
|
|
| 443 |
Be aware that the percentage calculation depends solely on the page structure so don't play with this
|
| 444 |
number unless you must know what you are doing!
|
| 445 |
"""
|
| 446 |
-
for element in self.xpath(
|
|
|
|
|
|
|
| 447 |
return element
|
| 448 |
return None
|
| 449 |
|
| 450 |
-
def css(
|
| 451 |
-
|
| 452 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 453 |
"""Search current tree with CSS3 selectors
|
| 454 |
|
| 455 |
**Important:
|
|
@@ -468,28 +585,49 @@ class Adaptor(SelectorsGeneration):
|
|
| 468 |
:return: List as :class:`Adaptors`
|
| 469 |
"""
|
| 470 |
try:
|
| 471 |
-
if not self.__auto_match_enabled or
|
| 472 |
# No need to split selectors in this case, let's save some CPU cycles :)
|
| 473 |
xpath_selector = translator_instance.css_to_xpath(selector)
|
| 474 |
-
return self.xpath(
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 475 |
|
| 476 |
results = []
|
| 477 |
-
if
|
| 478 |
for single_selector in split_selectors(selector):
|
| 479 |
# I'm doing this only so the `save` function save data correctly for combined selectors
|
| 480 |
# Like using the ',' to combine two different selectors that point to different elements.
|
| 481 |
-
xpath_selector = translator_instance.css_to_xpath(
|
|
|
|
|
|
|
| 482 |
results += self.xpath(
|
| 483 |
-
xpath_selector,
|
|
|
|
|
|
|
|
|
|
|
|
|
| 484 |
)
|
| 485 |
|
| 486 |
return results
|
| 487 |
-
except (
|
|
|
|
|
|
|
|
|
|
| 488 |
raise SelectorSyntaxError(f"Invalid CSS selector: {selector}")
|
| 489 |
|
| 490 |
-
def xpath(
|
| 491 |
-
|
| 492 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 493 |
"""Search current tree with XPath selectors
|
| 494 |
|
| 495 |
**Important:
|
|
@@ -515,7 +653,9 @@ class Adaptor(SelectorsGeneration):
|
|
| 515 |
if elements:
|
| 516 |
if auto_save:
|
| 517 |
if not self.__auto_match_enabled:
|
| 518 |
-
log.warning(
|
|
|
|
|
|
|
| 519 |
else:
|
| 520 |
self.save(elements[0], identifier or selector)
|
| 521 |
|
|
@@ -531,16 +671,29 @@ class Adaptor(SelectorsGeneration):
|
|
| 531 |
return self.__handle_elements(elements)
|
| 532 |
else:
|
| 533 |
if auto_match:
|
| 534 |
-
log.warning(
|
|
|
|
|
|
|
| 535 |
elif auto_save:
|
| 536 |
-
log.warning(
|
|
|
|
|
|
|
| 537 |
|
| 538 |
return self.__handle_elements(elements)
|
| 539 |
|
| 540 |
-
except (
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 541 |
raise SelectorSyntaxError(f"Invalid XPath selector: {selector}")
|
| 542 |
|
| 543 |
-
def find_all(
|
|
|
|
|
|
|
|
|
|
|
|
|
| 544 |
"""Find elements by filters of your creations for ease..
|
| 545 |
|
| 546 |
:param args: Tag name(s), an iterable of tag names, regex patterns, function, or a dictionary of elements' attributes. Leave empty for selecting all.
|
|
@@ -551,12 +704,14 @@ class Adaptor(SelectorsGeneration):
|
|
| 551 |
# Ex: find_all('a', class="blah") -> find_all('a', class_="blah")
|
| 552 |
# https://www.w3schools.com/python/python_ref_keywords.asp
|
| 553 |
whitelisted = {
|
| 554 |
-
|
| 555 |
-
|
| 556 |
}
|
| 557 |
|
| 558 |
if not args and not kwargs:
|
| 559 |
-
raise TypeError(
|
|
|
|
|
|
|
| 560 |
|
| 561 |
attributes = dict()
|
| 562 |
tags, patterns = set(), set()
|
|
@@ -569,12 +724,18 @@ class Adaptor(SelectorsGeneration):
|
|
| 569 |
|
| 570 |
elif type(arg) in [list, tuple, set]:
|
| 571 |
if not all(map(lambda x: type(x) is str, arg)):
|
| 572 |
-
raise TypeError(
|
|
|
|
|
|
|
| 573 |
tags.update(set(arg))
|
| 574 |
|
| 575 |
elif isinstance(arg, dict):
|
| 576 |
-
if not all(
|
| 577 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
| 578 |
attributes.update(arg)
|
| 579 |
|
| 580 |
elif isinstance(arg, re.Pattern):
|
|
@@ -584,13 +745,17 @@ class Adaptor(SelectorsGeneration):
|
|
| 584 |
if len(inspect.signature(arg).parameters) > 0:
|
| 585 |
functions.append(arg)
|
| 586 |
else:
|
| 587 |
-
raise TypeError(
|
|
|
|
|
|
|
| 588 |
|
| 589 |
else:
|
| 590 |
-
raise TypeError(
|
|
|
|
|
|
|
| 591 |
|
| 592 |
if not all([(type(k) is str and type(v) is str) for k, v in kwargs.items()]):
|
| 593 |
-
raise TypeError(
|
| 594 |
|
| 595 |
for attribute_name, value in kwargs.items():
|
| 596 |
# Only replace names for kwargs, replacing them in dictionaries doesn't make sense
|
|
@@ -598,22 +763,24 @@ class Adaptor(SelectorsGeneration):
|
|
| 598 |
attributes[attribute_name] = value
|
| 599 |
|
| 600 |
# It's easier and faster to build a selector than traversing the tree
|
| 601 |
-
tags = tags or [
|
| 602 |
for tag in tags:
|
| 603 |
selector = tag
|
| 604 |
for key, value in attributes.items():
|
| 605 |
-
value = value.replace('"', r
|
| 606 |
# Not escaping anything with the key so the user can pass patterns like {'href*': '/p/'} or get errors :)
|
| 607 |
selector += '[{}="{}"]'.format(key, value)
|
| 608 |
-
if selector !=
|
| 609 |
selectors.append(selector)
|
| 610 |
|
| 611 |
if selectors:
|
| 612 |
-
results = self.css(
|
| 613 |
if results:
|
| 614 |
# From the results, get the ones that fulfill passed regex patterns
|
| 615 |
for pattern in patterns:
|
| 616 |
-
results = results.filter(
|
|
|
|
|
|
|
| 617 |
|
| 618 |
# From the results, get the ones that fulfill passed functions
|
| 619 |
for function in functions:
|
|
@@ -629,7 +796,11 @@ class Adaptor(SelectorsGeneration):
|
|
| 629 |
|
| 630 |
return results
|
| 631 |
|
| 632 |
-
def find(
|
|
|
|
|
|
|
|
|
|
|
|
|
| 633 |
"""Find elements by filters of your creations for ease then return the first result. Otherwise return `None`.
|
| 634 |
|
| 635 |
:param args: Tag name(s), an iterable of tag names, regex patterns, function, or a dictionary of elements' attributes. Leave empty for selecting all.
|
|
@@ -640,7 +811,9 @@ class Adaptor(SelectorsGeneration):
|
|
| 640 |
return element
|
| 641 |
return None
|
| 642 |
|
| 643 |
-
def __calculate_similarity_score(
|
|
|
|
|
|
|
| 644 |
"""Used internally to calculate a score that shows how candidate element similar to the original one
|
| 645 |
|
| 646 |
:param original: The original element in the form of the dictionary generated from `element_to_dict` function
|
|
@@ -653,53 +826,68 @@ class Adaptor(SelectorsGeneration):
|
|
| 653 |
# Possible TODO:
|
| 654 |
# Study the idea of giving weight to each test below so some are more important than others
|
| 655 |
# Current results: With weights some websites had better score while it was worse for others
|
| 656 |
-
score += 1 if original[
|
| 657 |
checks += 1
|
| 658 |
|
| 659 |
-
if original[
|
| 660 |
-
score += SequenceMatcher(
|
|
|
|
|
|
|
| 661 |
checks += 1
|
| 662 |
|
| 663 |
# if both doesn't have attributes, it still count for something!
|
| 664 |
-
score += self.__calculate_dict_diff(
|
|
|
|
|
|
|
| 665 |
checks += 1
|
| 666 |
|
| 667 |
# Separate similarity test for class, id, href,... this will help in full structural changes
|
| 668 |
-
for attrib in (
|
| 669 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 670 |
score += SequenceMatcher(
|
| 671 |
-
None,
|
|
|
|
|
|
|
| 672 |
).ratio() # * 0.3 # 30%
|
| 673 |
checks += 1
|
| 674 |
|
| 675 |
-
score += SequenceMatcher(
|
|
|
|
|
|
|
| 676 |
checks += 1
|
| 677 |
|
| 678 |
-
if original.get(
|
| 679 |
# Then we start comparing parents' data
|
| 680 |
-
if candidate.get(
|
| 681 |
score += SequenceMatcher(
|
| 682 |
-
None, original[
|
| 683 |
).ratio() # * 0.2 # 20%
|
| 684 |
checks += 1
|
| 685 |
|
| 686 |
score += self.__calculate_dict_diff(
|
| 687 |
-
original[
|
| 688 |
) # * 0.2 # 20%
|
| 689 |
checks += 1
|
| 690 |
|
| 691 |
-
if original[
|
| 692 |
score += SequenceMatcher(
|
| 693 |
-
None,
|
|
|
|
|
|
|
| 694 |
).ratio() # * 0.1 # 10%
|
| 695 |
checks += 1
|
| 696 |
# else:
|
| 697 |
# # The original element have a parent and this one not, this is not a good sign
|
| 698 |
# score -= 0.1
|
| 699 |
|
| 700 |
-
if original.get(
|
| 701 |
score += SequenceMatcher(
|
| 702 |
-
None, original[
|
| 703 |
).ratio() # * 0.1 # 10%
|
| 704 |
checks += 1
|
| 705 |
|
|
@@ -708,13 +896,20 @@ class Adaptor(SelectorsGeneration):
|
|
| 708 |
|
| 709 |
@staticmethod
|
| 710 |
def __calculate_dict_diff(dict1: dict, dict2: dict) -> float:
|
| 711 |
-
"""Used internally calculate similarity between two dictionaries as SequenceMatcher doesn't accept dictionaries
|
| 712 |
-
|
| 713 |
-
|
| 714 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 715 |
return score
|
| 716 |
|
| 717 |
-
def save(
|
|
|
|
|
|
|
| 718 |
"""Saves the element's unique properties to the storage for retrieval and relocation later
|
| 719 |
|
| 720 |
:param element: The element itself that we want to save to storage, it can be a `Adaptor` or pure `HtmlElement`
|
|
@@ -756,8 +951,13 @@ class Adaptor(SelectorsGeneration):
|
|
| 756 |
else:
|
| 757 |
return self.get_all_text(strip=True).json()
|
| 758 |
|
| 759 |
-
def re(
|
| 760 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 761 |
"""Apply the given regex to the current text and return a list of strings with the matches.
|
| 762 |
|
| 763 |
:param regex: Can be either a compiled regular expression or a string.
|
|
@@ -767,8 +967,14 @@ class Adaptor(SelectorsGeneration):
|
|
| 767 |
"""
|
| 768 |
return self.text.re(regex, replace_entities, clean_match, case_sensitive)
|
| 769 |
|
| 770 |
-
def re_first(
|
| 771 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 772 |
"""Apply the given regex to text and return the first match if found, otherwise return the default value.
|
| 773 |
|
| 774 |
:param regex: Can be either a compiled regular expression or a string.
|
|
@@ -777,14 +983,19 @@ class Adaptor(SelectorsGeneration):
|
|
| 777 |
:param clean_match: if enabled, this will ignore all whitespaces and consecutive spaces while matching
|
| 778 |
:param case_sensitive: if disabled, function will set the regex to ignore letters case while compiling it
|
| 779 |
"""
|
| 780 |
-
return self.text.re_first(
|
|
|
|
|
|
|
| 781 |
|
| 782 |
def find_similar(
|
| 783 |
-
|
| 784 |
-
|
| 785 |
-
|
| 786 |
-
|
| 787 |
-
|
|
|
|
|
|
|
|
|
|
| 788 |
"""Find elements that are in the same tree depth in the page with the same tag name and same parent tag etc...
|
| 789 |
then return the ones that match the current element attributes with percentage higher than the input threshold.
|
| 790 |
|
|
@@ -805,19 +1016,28 @@ class Adaptor(SelectorsGeneration):
|
|
| 805 |
|
| 806 |
:return: A ``Adaptors`` container of ``Adaptor`` objects or empty list
|
| 807 |
"""
|
|
|
|
| 808 |
def get_attributes(element: html.HtmlElement) -> Dict:
|
| 809 |
"""Return attributes dictionary without the ignored list"""
|
| 810 |
-
return {
|
| 811 |
-
|
| 812 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 813 |
"""Calculate a score of how much these elements are alike and return True
|
| 814 |
-
|
| 815 |
-
candidate_attributes =
|
|
|
|
|
|
|
| 816 |
score, checks = 0, 0
|
| 817 |
|
| 818 |
if original_attributes:
|
| 819 |
score += sum(
|
| 820 |
-
SequenceMatcher(None, v, candidate_attributes.get(k,
|
| 821 |
for k, v in original_attributes.items()
|
| 822 |
)
|
| 823 |
checks += len(candidate_attributes)
|
|
@@ -829,7 +1049,9 @@ class Adaptor(SelectorsGeneration):
|
|
| 829 |
|
| 830 |
if match_text:
|
| 831 |
score += SequenceMatcher(
|
| 832 |
-
None,
|
|
|
|
|
|
|
| 833 |
).ratio()
|
| 834 |
checks += 1
|
| 835 |
|
|
@@ -851,20 +1073,30 @@ class Adaptor(SelectorsGeneration):
|
|
| 851 |
f"//{grandparent.tag}/{parent.tag}/{self.tag}[count(ancestor::*) = {current_depth}]"
|
| 852 |
)
|
| 853 |
else:
|
| 854 |
-
potential_matches = root.xpath(
|
|
|
|
|
|
|
| 855 |
else:
|
| 856 |
-
potential_matches = root.xpath(
|
|
|
|
|
|
|
| 857 |
|
| 858 |
for potential_match in potential_matches:
|
| 859 |
-
if potential_match != root and are_alike(
|
|
|
|
|
|
|
| 860 |
similar_elements.append(potential_match)
|
| 861 |
|
| 862 |
return self.__handle_elements(similar_elements)
|
| 863 |
|
| 864 |
def find_by_text(
|
| 865 |
-
|
| 866 |
-
|
| 867 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
| 868 |
"""Find elements that its text content fully/partially matches input.
|
| 869 |
:param text: Text query to match
|
| 870 |
:param first_match: Return first element that matches conditions, enabled by default
|
|
@@ -878,7 +1110,9 @@ class Adaptor(SelectorsGeneration):
|
|
| 878 |
text = text.lower()
|
| 879 |
|
| 880 |
# This selector gets all elements with text content
|
| 881 |
-
for node in self.__handle_elements(
|
|
|
|
|
|
|
| 882 |
"""Check if element matches given text otherwise, traverse the children tree and iterate"""
|
| 883 |
node_text = node.text
|
| 884 |
if clean_match:
|
|
@@ -903,8 +1137,12 @@ class Adaptor(SelectorsGeneration):
|
|
| 903 |
return results
|
| 904 |
|
| 905 |
def find_by_regex(
|
| 906 |
-
|
| 907 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
| 908 |
"""Find elements that its text content matches the input regex pattern.
|
| 909 |
:param query: Regex query/pattern to match
|
| 910 |
:param first_match: Return first element that matches conditions, enabled by default
|
|
@@ -914,10 +1152,17 @@ class Adaptor(SelectorsGeneration):
|
|
| 914 |
results = Adaptors([])
|
| 915 |
|
| 916 |
# This selector gets all elements with text content
|
| 917 |
-
for node in self.__handle_elements(
|
|
|
|
|
|
|
| 918 |
"""Check if element matches given regex otherwise, traverse the children tree and iterate"""
|
| 919 |
node_text = node.text
|
| 920 |
-
if node_text.re(
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 921 |
results.append(node)
|
| 922 |
|
| 923 |
if first_match and results:
|
|
@@ -933,6 +1178,7 @@ class Adaptors(List[Adaptor]):
|
|
| 933 |
"""
|
| 934 |
The :class:`Adaptors` class is a subclass of the builtin ``List`` class, which provides a few additional methods.
|
| 935 |
"""
|
|
|
|
| 936 |
__slots__ = ()
|
| 937 |
|
| 938 |
@typing.overload
|
|
@@ -943,7 +1189,9 @@ class Adaptors(List[Adaptor]):
|
|
| 943 |
def __getitem__(self, pos: slice) -> "Adaptors":
|
| 944 |
pass
|
| 945 |
|
| 946 |
-
def __getitem__(
|
|
|
|
|
|
|
| 947 |
lst = super().__getitem__(pos)
|
| 948 |
if isinstance(pos, slice):
|
| 949 |
return self.__class__(lst)
|
|
@@ -951,7 +1199,12 @@ class Adaptors(List[Adaptor]):
|
|
| 951 |
return lst
|
| 952 |
|
| 953 |
def xpath(
|
| 954 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 955 |
) -> "Adaptors[Adaptor]":
|
| 956 |
"""
|
| 957 |
Call the ``.xpath()`` method for each element in this list and return
|
|
@@ -974,11 +1227,20 @@ class Adaptors(List[Adaptor]):
|
|
| 974 |
:return: List as :class:`Adaptors`
|
| 975 |
"""
|
| 976 |
results = [
|
| 977 |
-
n.xpath(
|
|
|
|
|
|
|
|
|
|
| 978 |
]
|
| 979 |
return self.__class__(flatten(results))
|
| 980 |
|
| 981 |
-
def css(
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 982 |
"""
|
| 983 |
Call the ``.css()`` method for each element in this list and return
|
| 984 |
their results flattened as another :class:`Adaptors`.
|
|
@@ -998,12 +1260,18 @@ class Adaptors(List[Adaptor]):
|
|
| 998 |
:return: List as :class:`Adaptors`
|
| 999 |
"""
|
| 1000 |
results = [
|
| 1001 |
-
n.css(selector, identifier or selector, False, auto_save, percentage)
|
|
|
|
| 1002 |
]
|
| 1003 |
return self.__class__(flatten(results))
|
| 1004 |
|
| 1005 |
-
def re(
|
| 1006 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1007 |
"""Call the ``.re()`` method for each element in this list and return
|
| 1008 |
their results flattened as List of TextHandler.
|
| 1009 |
|
|
@@ -1013,12 +1281,19 @@ class Adaptors(List[Adaptor]):
|
|
| 1013 |
:param case_sensitive: if disabled, function will set the regex to ignore letters case while compiling it
|
| 1014 |
"""
|
| 1015 |
results = [
|
| 1016 |
-
n.text.re(regex, replace_entities, clean_match, case_sensitive)
|
|
|
|
| 1017 |
]
|
| 1018 |
return TextHandlers(flatten(results))
|
| 1019 |
|
| 1020 |
-
def re_first(
|
| 1021 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1022 |
"""Call the ``.re_first()`` method for each element in this list and return
|
| 1023 |
the first result or the default value otherwise.
|
| 1024 |
|
|
@@ -1033,7 +1308,7 @@ class Adaptors(List[Adaptor]):
|
|
| 1033 |
return result
|
| 1034 |
return default
|
| 1035 |
|
| 1036 |
-
def search(self, func: Callable[[
|
| 1037 |
"""Loop over all current elements and return the first element that matches the passed function
|
| 1038 |
:param func: A function that takes each element as an argument and returns True/False
|
| 1039 |
:return: The first element that match the function or ``None`` otherwise.
|
|
@@ -1043,14 +1318,12 @@ class Adaptors(List[Adaptor]):
|
|
| 1043 |
return element
|
| 1044 |
return None
|
| 1045 |
|
| 1046 |
-
def filter(self, func: Callable[[
|
| 1047 |
"""Filter current elements based on the passed function
|
| 1048 |
:param func: A function that takes each element as an argument and returns True/False
|
| 1049 |
:return: The new `Adaptors` object or empty list otherwise.
|
| 1050 |
"""
|
| 1051 |
-
return self.__class__([
|
| 1052 |
-
element for element in self if func(element)
|
| 1053 |
-
])
|
| 1054 |
|
| 1055 |
# For easy copy-paste from Scrapy/parsel code when needed :)
|
| 1056 |
def get(self, default=None):
|
|
|
|
| 9 |
from cssselect import parse as split_selectors
|
| 10 |
from lxml import etree, html
|
| 11 |
|
| 12 |
+
from scrapling.core._types import (
|
| 13 |
+
Any,
|
| 14 |
+
Callable,
|
| 15 |
+
Dict,
|
| 16 |
+
Generator,
|
| 17 |
+
Iterable,
|
| 18 |
+
List,
|
| 19 |
+
Optional,
|
| 20 |
+
Pattern,
|
| 21 |
+
SupportsIndex,
|
| 22 |
+
Tuple,
|
| 23 |
+
Union,
|
| 24 |
+
)
|
| 25 |
+
from scrapling.core.custom_types import AttributesHandler, TextHandler, TextHandlers
|
| 26 |
from scrapling.core.mixins import SelectorsGeneration
|
| 27 |
+
from scrapling.core.storage_adaptors import (
|
| 28 |
+
SQLiteStorageSystem,
|
| 29 |
+
StorageSystemMixin,
|
| 30 |
+
_StorageTools,
|
| 31 |
+
)
|
| 32 |
from scrapling.core.translator import translator_instance
|
| 33 |
+
from scrapling.core.utils import clean_spaces, flatten, html_forbidden, is_jsonable, log
|
|
|
|
| 34 |
|
| 35 |
|
| 36 |
class Adaptor(SelectorsGeneration):
|
| 37 |
__slots__ = (
|
| 38 |
+
"url",
|
| 39 |
+
"encoding",
|
| 40 |
+
"__auto_match_enabled",
|
| 41 |
+
"_root",
|
| 42 |
+
"_storage",
|
| 43 |
+
"__keep_comments",
|
| 44 |
+
"__huge_tree_enabled",
|
| 45 |
+
"__attributes",
|
| 46 |
+
"__text",
|
| 47 |
+
"__tag",
|
| 48 |
+
"__keep_cdata",
|
| 49 |
)
|
| 50 |
|
| 51 |
def __init__(
|
| 52 |
+
self,
|
| 53 |
+
text: Optional[str] = None,
|
| 54 |
+
url: Optional[str] = None,
|
| 55 |
+
body: bytes = b"",
|
| 56 |
+
encoding: str = "utf8",
|
| 57 |
+
huge_tree: bool = True,
|
| 58 |
+
root: Optional[html.HtmlElement] = None,
|
| 59 |
+
keep_comments: Optional[bool] = False,
|
| 60 |
+
keep_cdata: Optional[bool] = False,
|
| 61 |
+
auto_match: Optional[bool] = False,
|
| 62 |
+
storage: Any = SQLiteStorageSystem,
|
| 63 |
+
storage_args: Optional[Dict] = None,
|
| 64 |
+
**kwargs,
|
| 65 |
):
|
| 66 |
"""The main class that works as a wrapper for the HTML input data. Using this class, you can search for elements
|
| 67 |
with expressions in CSS, XPath, or with simply text. Check the docs for more info.
|
|
|
|
| 88 |
If empty, default values will be used.
|
| 89 |
"""
|
| 90 |
if root is None and not body and text is None:
|
| 91 |
+
raise ValueError(
|
| 92 |
+
"Adaptor class needs text, body, or root arguments to work"
|
| 93 |
+
)
|
| 94 |
|
| 95 |
+
self.__text = ""
|
| 96 |
if root is None:
|
| 97 |
if text is None:
|
| 98 |
if not body or not isinstance(body, bytes):
|
| 99 |
+
raise TypeError(
|
| 100 |
+
f"body argument must be valid and of type bytes, got {body.__class__}"
|
| 101 |
+
)
|
| 102 |
|
| 103 |
body = body.replace(b"\x00", b"").strip()
|
| 104 |
else:
|
| 105 |
if not isinstance(text, str):
|
| 106 |
+
raise TypeError(
|
| 107 |
+
f"text argument must be of type str, got {text.__class__}"
|
| 108 |
+
)
|
| 109 |
|
| 110 |
body = text.strip().replace("\x00", "").encode(encoding) or b"<html/>"
|
| 111 |
|
| 112 |
# https://lxml.de/api/lxml.etree.HTMLParser-class.html
|
| 113 |
parser = html.HTMLParser(
|
| 114 |
+
recover=True,
|
| 115 |
+
remove_blank_text=True,
|
| 116 |
+
remove_comments=(not keep_comments),
|
| 117 |
+
encoding=encoding,
|
| 118 |
+
compact=True,
|
| 119 |
+
huge_tree=huge_tree,
|
| 120 |
+
default_doctype=True,
|
| 121 |
+
strip_cdata=(not keep_cdata),
|
| 122 |
)
|
| 123 |
self._root = etree.fromstring(body, parser=parser, base_url=url)
|
| 124 |
if is_jsonable(text or body.decode()):
|
|
|
|
| 138 |
if self.__auto_match_enabled:
|
| 139 |
if not storage_args:
|
| 140 |
storage_args = {
|
| 141 |
+
"storage_file": os.path.join(
|
| 142 |
+
os.path.dirname(__file__), "elements_storage.db"
|
| 143 |
+
),
|
| 144 |
+
"url": url,
|
| 145 |
}
|
| 146 |
|
| 147 |
+
if not hasattr(storage, "__wrapped__"):
|
| 148 |
+
raise ValueError(
|
| 149 |
+
"Storage class must be wrapped with lru_cache decorator, see docs for info"
|
| 150 |
+
)
|
| 151 |
|
| 152 |
if not issubclass(storage.__wrapped__, StorageSystemMixin):
|
| 153 |
+
raise ValueError(
|
| 154 |
+
"Storage system must be inherited from class `StorageSystemMixin`"
|
| 155 |
+
)
|
| 156 |
|
| 157 |
self._storage = storage(**storage_args)
|
| 158 |
|
|
|
|
| 165 |
self.__attributes = None
|
| 166 |
self.__tag = None
|
| 167 |
# No need to check if all response attributes exist or not because if `status` exist, then the rest exist (Save some CPU cycles for speed)
|
| 168 |
+
self.__response_data = (
|
| 169 |
+
{
|
| 170 |
+
key: getattr(self, key)
|
| 171 |
+
for key in (
|
| 172 |
+
"status",
|
| 173 |
+
"reason",
|
| 174 |
+
"cookies",
|
| 175 |
+
"history",
|
| 176 |
+
"headers",
|
| 177 |
+
"request_headers",
|
| 178 |
+
)
|
| 179 |
+
}
|
| 180 |
+
if hasattr(self, "status")
|
| 181 |
+
else {}
|
| 182 |
+
)
|
| 183 |
|
| 184 |
# Node functionalities, I wanted to move to separate Mixin class but it had slight impact on performance
|
| 185 |
@staticmethod
|
| 186 |
+
def _is_text_node(
|
| 187 |
+
element: Union[html.HtmlElement, etree._ElementUnicodeResult],
|
| 188 |
+
) -> bool:
|
| 189 |
"""Return True if given element is a result of a string expression
|
| 190 |
Examples:
|
| 191 |
XPath -> '/text()', '/@attribute' etc...
|
|
|
|
| 195 |
return issubclass(type(element), etree._ElementUnicodeResult)
|
| 196 |
|
| 197 |
@staticmethod
|
| 198 |
+
def __content_convertor(
|
| 199 |
+
element: Union[html.HtmlElement, etree._ElementUnicodeResult],
|
| 200 |
+
) -> TextHandler:
|
| 201 |
"""Used internally to convert a single element's text content to TextHandler directly without checks
|
| 202 |
|
| 203 |
This single line has been isolated like this so when it's used with map we get that slight performance boost vs list comprehension
|
| 204 |
"""
|
| 205 |
return TextHandler(str(element))
|
| 206 |
|
| 207 |
+
def __element_convertor(self, element: html.HtmlElement) -> "Adaptor":
|
| 208 |
"""Used internally to convert a single HtmlElement to Adaptor directly without checks"""
|
| 209 |
return Adaptor(
|
| 210 |
root=element,
|
| 211 |
+
text="",
|
| 212 |
+
body=b"", # Since root argument is provided, both `text` and `body` will be ignored so this is just a filler
|
| 213 |
+
url=self.url,
|
| 214 |
+
encoding=self.encoding,
|
| 215 |
+
auto_match=self.__auto_match_enabled,
|
| 216 |
+
keep_comments=self.__keep_comments,
|
| 217 |
+
keep_cdata=self.__keep_cdata,
|
| 218 |
huge_tree=self.__huge_tree_enabled,
|
| 219 |
+
**self.__response_data,
|
| 220 |
)
|
| 221 |
|
| 222 |
+
def __handle_element(
|
| 223 |
+
self, element: Union[html.HtmlElement, etree._ElementUnicodeResult]
|
| 224 |
+
) -> Union[TextHandler, "Adaptor", None]:
|
| 225 |
"""Used internally in all functions to convert a single element to type (Adaptor|TextHandler) when possible"""
|
| 226 |
if element is None:
|
| 227 |
return None
|
|
|
|
| 231 |
else:
|
| 232 |
return self.__element_convertor(element)
|
| 233 |
|
| 234 |
+
def __handle_elements(
|
| 235 |
+
self, result: List[Union[html.HtmlElement, etree._ElementUnicodeResult]]
|
| 236 |
+
) -> Union["Adaptors", "TextHandlers", List]:
|
| 237 |
"""Used internally in all functions to convert results to type (Adaptors|TextHandlers) in bulk when possible"""
|
| 238 |
+
if not len(
|
| 239 |
+
result
|
| 240 |
+
): # Lxml will give a warning if I used something like `not result`
|
| 241 |
return Adaptors([])
|
| 242 |
|
| 243 |
# From within the code, this method will always get a list of the same type
|
|
|
|
| 272 |
self.__text = TextHandler(self._root.text)
|
| 273 |
return self.__text
|
| 274 |
|
| 275 |
+
def get_all_text(
|
| 276 |
+
self,
|
| 277 |
+
separator: str = "\n",
|
| 278 |
+
strip: bool = False,
|
| 279 |
+
ignore_tags: Tuple = (
|
| 280 |
+
"script",
|
| 281 |
+
"style",
|
| 282 |
+
),
|
| 283 |
+
valid_values: bool = True,
|
| 284 |
+
) -> TextHandler:
|
| 285 |
"""Get all child strings of this element, concatenated using the given separator.
|
| 286 |
|
| 287 |
:param separator: Strings will be concatenated using this separator.
|
|
|
|
| 292 |
:return: A TextHandler
|
| 293 |
"""
|
| 294 |
_all_strings = []
|
| 295 |
+
for node in self._root.xpath(".//*"):
|
| 296 |
if node.tag not in ignore_tags:
|
| 297 |
text = node.text
|
| 298 |
if text and type(text) is str:
|
|
|
|
| 317 |
@property
|
| 318 |
def html_content(self) -> TextHandler:
|
| 319 |
"""Return the inner html code of the element"""
|
| 320 |
+
return TextHandler(
|
| 321 |
+
etree.tostring(
|
| 322 |
+
self._root, encoding="unicode", method="html", with_tail=False
|
| 323 |
+
)
|
| 324 |
+
)
|
| 325 |
|
| 326 |
body = html_content
|
| 327 |
|
| 328 |
def prettify(self) -> TextHandler:
|
| 329 |
"""Return a prettified version of the element's inner html-code"""
|
| 330 |
+
return TextHandler(
|
| 331 |
+
etree.tostring(
|
| 332 |
+
self._root,
|
| 333 |
+
encoding="unicode",
|
| 334 |
+
pretty_print=True,
|
| 335 |
+
method="html",
|
| 336 |
+
with_tail=False,
|
| 337 |
+
)
|
| 338 |
+
)
|
| 339 |
|
| 340 |
def has_class(self, class_name: str) -> bool:
|
| 341 |
"""Check if element has a specific class
|
|
|
|
| 345 |
return class_name in self._root.classes
|
| 346 |
|
| 347 |
@property
|
| 348 |
+
def parent(self) -> Union["Adaptor", None]:
|
| 349 |
"""Return the direct parent of the element or ``None`` otherwise"""
|
| 350 |
return self.__handle_element(self._root.getparent())
|
| 351 |
|
| 352 |
@property
|
| 353 |
+
def below_elements(self) -> "Adaptors[Adaptor]":
|
| 354 |
"""Return all elements under the current element in the DOM tree"""
|
| 355 |
+
below = self._root.xpath(".//*")
|
| 356 |
return self.__handle_elements(below)
|
| 357 |
|
| 358 |
@property
|
| 359 |
+
def children(self) -> "Adaptors[Adaptor]":
|
| 360 |
"""Return the children elements of the current element or empty list otherwise"""
|
| 361 |
+
return Adaptors(
|
| 362 |
+
[
|
| 363 |
+
self.__element_convertor(child)
|
| 364 |
+
for child in self._root.iterchildren()
|
| 365 |
+
if type(child) not in html_forbidden
|
| 366 |
+
]
|
| 367 |
+
)
|
| 368 |
|
| 369 |
@property
|
| 370 |
+
def siblings(self) -> "Adaptors[Adaptor]":
|
| 371 |
"""Return other children of the current element's parent or empty list otherwise"""
|
| 372 |
if self.parent:
|
| 373 |
+
return Adaptors(
|
| 374 |
+
[child for child in self.parent.children if child._root != self._root]
|
| 375 |
+
)
|
| 376 |
return Adaptors([])
|
| 377 |
|
| 378 |
+
def iterancestors(self) -> Generator["Adaptor", None, None]:
|
| 379 |
"""Return a generator that loops over all ancestors of the element, starting with element's parent."""
|
| 380 |
for ancestor in self._root.iterancestors():
|
| 381 |
yield self.__element_convertor(ancestor)
|
| 382 |
|
| 383 |
+
def find_ancestor(
|
| 384 |
+
self, func: Callable[["Adaptor"], bool]
|
| 385 |
+
) -> Union["Adaptor", None]:
|
| 386 |
"""Loop over all ancestors of the element till one match the passed function
|
| 387 |
:param func: A function that takes each ancestor as an argument and returns True/False
|
| 388 |
:return: The first ancestor that match the function or ``None`` otherwise.
|
|
|
|
| 393 |
return None
|
| 394 |
|
| 395 |
@property
|
| 396 |
+
def path(self) -> "Adaptors[Adaptor]":
|
| 397 |
"""Returns list of type :class:`Adaptors` that contains the path leading to the current element from the root."""
|
| 398 |
lst = list(self.iterancestors())
|
| 399 |
return Adaptors(lst)
|
| 400 |
|
| 401 |
@property
|
| 402 |
+
def next(self) -> Union["Adaptor", None]:
|
| 403 |
"""Returns the next element of the current element in the children of the parent or ``None`` otherwise."""
|
| 404 |
next_element = self._root.getnext()
|
| 405 |
if next_element is not None:
|
|
|
|
| 410 |
return self.__handle_element(next_element)
|
| 411 |
|
| 412 |
@property
|
| 413 |
+
def previous(self) -> Union["Adaptor", None]:
|
| 414 |
"""Returns the previous element of the current element in the children of the parent or ``None`` otherwise."""
|
| 415 |
prev_element = self._root.getprevious()
|
| 416 |
if prev_element is not None:
|
|
|
|
| 438 |
data = "<"
|
| 439 |
content = clean_spaces(self.html_content)
|
| 440 |
if len(content) > length_limit:
|
| 441 |
+
content = content[:length_limit].strip() + "..."
|
| 442 |
data += f"data='{content}'"
|
| 443 |
|
| 444 |
if self.parent:
|
| 445 |
parent_content = clean_spaces(self.parent.html_content)
|
| 446 |
if len(parent_content) > length_limit:
|
| 447 |
+
parent_content = parent_content[:length_limit].strip() + "..."
|
| 448 |
|
| 449 |
data += f" parent='{parent_content}'"
|
| 450 |
|
|
|
|
| 452 |
|
| 453 |
# From here we start the selecting functions
|
| 454 |
def relocate(
|
| 455 |
+
self,
|
| 456 |
+
element: Union[Dict, html.HtmlElement, "Adaptor"],
|
| 457 |
+
percentage: int = 0,
|
| 458 |
+
adaptor_type: bool = False,
|
| 459 |
+
) -> Union[List[Union[html.HtmlElement, None]], "Adaptors"]:
|
| 460 |
"""This function will search again for the element in the page tree, used automatically on page structure change
|
| 461 |
|
| 462 |
:param element: The element we want to relocate in the tree
|
|
|
|
| 474 |
if issubclass(type(element), html.HtmlElement):
|
| 475 |
element = _StorageTools.element_to_dict(element)
|
| 476 |
|
| 477 |
+
for node in self._root.xpath(".//*"):
|
| 478 |
# Collect all elements in the page then for each element get the matching score of it against the node.
|
| 479 |
# Hence: the code doesn't stop even if the score was 100%
|
| 480 |
# because there might be another element(s) left in page with the same score
|
|
|
|
| 486 |
if score_table[highest_probability] and highest_probability >= percentage:
|
| 487 |
if log.getEffectiveLevel() < 20:
|
| 488 |
# No need to execute this part if logging level is not debugging
|
| 489 |
+
log.debug(f"Highest probability was {highest_probability}%")
|
| 490 |
+
log.debug("Top 5 best matching elements are: ")
|
| 491 |
for percent in tuple(sorted(score_table.keys(), reverse=True))[:5]:
|
| 492 |
+
log.debug(
|
| 493 |
+
f"{percent} -> {self.__handle_elements(score_table[percent])}"
|
| 494 |
+
)
|
| 495 |
|
| 496 |
if not adaptor_type:
|
| 497 |
return score_table[highest_probability]
|
| 498 |
return self.__handle_elements(score_table[highest_probability])
|
| 499 |
return []
|
| 500 |
|
| 501 |
+
def css_first(
|
| 502 |
+
self,
|
| 503 |
+
selector: str,
|
| 504 |
+
identifier: str = "",
|
| 505 |
+
auto_match: bool = False,
|
| 506 |
+
auto_save: bool = False,
|
| 507 |
+
percentage: int = 0,
|
| 508 |
+
) -> Union["Adaptor", "TextHandler", None]:
|
| 509 |
"""Search current tree with CSS3 selectors and return the first result if possible, otherwise return `None`
|
| 510 |
|
| 511 |
**Important:
|
|
|
|
| 521 |
Be aware that the percentage calculation depends solely on the page structure so don't play with this
|
| 522 |
number unless you must know what you are doing!
|
| 523 |
"""
|
| 524 |
+
for element in self.css(
|
| 525 |
+
selector, identifier, auto_match, auto_save, percentage
|
| 526 |
+
):
|
| 527 |
return element
|
| 528 |
return None
|
| 529 |
|
| 530 |
+
def xpath_first(
|
| 531 |
+
self,
|
| 532 |
+
selector: str,
|
| 533 |
+
identifier: str = "",
|
| 534 |
+
auto_match: bool = False,
|
| 535 |
+
auto_save: bool = False,
|
| 536 |
+
percentage: int = 0,
|
| 537 |
+
**kwargs: Any,
|
| 538 |
+
) -> Union["Adaptor", "TextHandler", None]:
|
| 539 |
"""Search current tree with XPath selectors and return the first result if possible, otherwise return `None`
|
| 540 |
|
| 541 |
**Important:
|
|
|
|
| 553 |
Be aware that the percentage calculation depends solely on the page structure so don't play with this
|
| 554 |
number unless you must know what you are doing!
|
| 555 |
"""
|
| 556 |
+
for element in self.xpath(
|
| 557 |
+
selector, identifier, auto_match, auto_save, percentage, **kwargs
|
| 558 |
+
):
|
| 559 |
return element
|
| 560 |
return None
|
| 561 |
|
| 562 |
+
def css(
|
| 563 |
+
self,
|
| 564 |
+
selector: str,
|
| 565 |
+
identifier: str = "",
|
| 566 |
+
auto_match: bool = False,
|
| 567 |
+
auto_save: bool = False,
|
| 568 |
+
percentage: int = 0,
|
| 569 |
+
) -> Union["Adaptors[Adaptor]", List, "TextHandlers[TextHandler]"]:
|
| 570 |
"""Search current tree with CSS3 selectors
|
| 571 |
|
| 572 |
**Important:
|
|
|
|
| 585 |
:return: List as :class:`Adaptors`
|
| 586 |
"""
|
| 587 |
try:
|
| 588 |
+
if not self.__auto_match_enabled or "," not in selector:
|
| 589 |
# No need to split selectors in this case, let's save some CPU cycles :)
|
| 590 |
xpath_selector = translator_instance.css_to_xpath(selector)
|
| 591 |
+
return self.xpath(
|
| 592 |
+
xpath_selector,
|
| 593 |
+
identifier or selector,
|
| 594 |
+
auto_match,
|
| 595 |
+
auto_save,
|
| 596 |
+
percentage,
|
| 597 |
+
)
|
| 598 |
|
| 599 |
results = []
|
| 600 |
+
if "," in selector:
|
| 601 |
for single_selector in split_selectors(selector):
|
| 602 |
# I'm doing this only so the `save` function save data correctly for combined selectors
|
| 603 |
# Like using the ',' to combine two different selectors that point to different elements.
|
| 604 |
+
xpath_selector = translator_instance.css_to_xpath(
|
| 605 |
+
single_selector.canonical()
|
| 606 |
+
)
|
| 607 |
results += self.xpath(
|
| 608 |
+
xpath_selector,
|
| 609 |
+
identifier or single_selector.canonical(),
|
| 610 |
+
auto_match,
|
| 611 |
+
auto_save,
|
| 612 |
+
percentage,
|
| 613 |
)
|
| 614 |
|
| 615 |
return results
|
| 616 |
+
except (
|
| 617 |
+
SelectorError,
|
| 618 |
+
SelectorSyntaxError,
|
| 619 |
+
):
|
| 620 |
raise SelectorSyntaxError(f"Invalid CSS selector: {selector}")
|
| 621 |
|
| 622 |
+
def xpath(
|
| 623 |
+
self,
|
| 624 |
+
selector: str,
|
| 625 |
+
identifier: str = "",
|
| 626 |
+
auto_match: bool = False,
|
| 627 |
+
auto_save: bool = False,
|
| 628 |
+
percentage: int = 0,
|
| 629 |
+
**kwargs: Any,
|
| 630 |
+
) -> Union["Adaptors[Adaptor]", List, "TextHandlers[TextHandler]"]:
|
| 631 |
"""Search current tree with XPath selectors
|
| 632 |
|
| 633 |
**Important:
|
|
|
|
| 653 |
if elements:
|
| 654 |
if auto_save:
|
| 655 |
if not self.__auto_match_enabled:
|
| 656 |
+
log.warning(
|
| 657 |
+
"Argument `auto_save` will be ignored because `auto_match` wasn't enabled on initialization. Check docs for more info."
|
| 658 |
+
)
|
| 659 |
else:
|
| 660 |
self.save(elements[0], identifier or selector)
|
| 661 |
|
|
|
|
| 671 |
return self.__handle_elements(elements)
|
| 672 |
else:
|
| 673 |
if auto_match:
|
| 674 |
+
log.warning(
|
| 675 |
+
"Argument `auto_match` will be ignored because `auto_match` wasn't enabled on initialization. Check docs for more info."
|
| 676 |
+
)
|
| 677 |
elif auto_save:
|
| 678 |
+
log.warning(
|
| 679 |
+
"Argument `auto_save` will be ignored because `auto_match` wasn't enabled on initialization. Check docs for more info."
|
| 680 |
+
)
|
| 681 |
|
| 682 |
return self.__handle_elements(elements)
|
| 683 |
|
| 684 |
+
except (
|
| 685 |
+
SelectorError,
|
| 686 |
+
SelectorSyntaxError,
|
| 687 |
+
etree.XPathError,
|
| 688 |
+
etree.XPathEvalError,
|
| 689 |
+
):
|
| 690 |
raise SelectorSyntaxError(f"Invalid XPath selector: {selector}")
|
| 691 |
|
| 692 |
+
def find_all(
|
| 693 |
+
self,
|
| 694 |
+
*args: Union[str, Iterable[str], Pattern, Callable, Dict[str, str]],
|
| 695 |
+
**kwargs: str,
|
| 696 |
+
) -> "Adaptors":
|
| 697 |
"""Find elements by filters of your creations for ease..
|
| 698 |
|
| 699 |
:param args: Tag name(s), an iterable of tag names, regex patterns, function, or a dictionary of elements' attributes. Leave empty for selecting all.
|
|
|
|
| 704 |
# Ex: find_all('a', class="blah") -> find_all('a', class_="blah")
|
| 705 |
# https://www.w3schools.com/python/python_ref_keywords.asp
|
| 706 |
whitelisted = {
|
| 707 |
+
"class_": "class",
|
| 708 |
+
"for_": "for",
|
| 709 |
}
|
| 710 |
|
| 711 |
if not args and not kwargs:
|
| 712 |
+
raise TypeError(
|
| 713 |
+
"You have to pass something to search with, like tag name(s), tag attributes, or both."
|
| 714 |
+
)
|
| 715 |
|
| 716 |
attributes = dict()
|
| 717 |
tags, patterns = set(), set()
|
|
|
|
| 724 |
|
| 725 |
elif type(arg) in [list, tuple, set]:
|
| 726 |
if not all(map(lambda x: type(x) is str, arg)):
|
| 727 |
+
raise TypeError(
|
| 728 |
+
"Nested Iterables are not accepted, only iterables of tag names are accepted"
|
| 729 |
+
)
|
| 730 |
tags.update(set(arg))
|
| 731 |
|
| 732 |
elif isinstance(arg, dict):
|
| 733 |
+
if not all(
|
| 734 |
+
[(type(k) is str and type(v) is str) for k, v in arg.items()]
|
| 735 |
+
):
|
| 736 |
+
raise TypeError(
|
| 737 |
+
"Nested dictionaries are not accepted, only string keys and string values are accepted"
|
| 738 |
+
)
|
| 739 |
attributes.update(arg)
|
| 740 |
|
| 741 |
elif isinstance(arg, re.Pattern):
|
|
|
|
| 745 |
if len(inspect.signature(arg).parameters) > 0:
|
| 746 |
functions.append(arg)
|
| 747 |
else:
|
| 748 |
+
raise TypeError(
|
| 749 |
+
"Callable filter function must have at least one argument to take `Adaptor` objects."
|
| 750 |
+
)
|
| 751 |
|
| 752 |
else:
|
| 753 |
+
raise TypeError(
|
| 754 |
+
f'Argument with type "{type(arg)}" is not accepted, please read the docs.'
|
| 755 |
+
)
|
| 756 |
|
| 757 |
if not all([(type(k) is str and type(v) is str) for k, v in kwargs.items()]):
|
| 758 |
+
raise TypeError("Only string values are accepted for arguments")
|
| 759 |
|
| 760 |
for attribute_name, value in kwargs.items():
|
| 761 |
# Only replace names for kwargs, replacing them in dictionaries doesn't make sense
|
|
|
|
| 763 |
attributes[attribute_name] = value
|
| 764 |
|
| 765 |
# It's easier and faster to build a selector than traversing the tree
|
| 766 |
+
tags = tags or ["*"]
|
| 767 |
for tag in tags:
|
| 768 |
selector = tag
|
| 769 |
for key, value in attributes.items():
|
| 770 |
+
value = value.replace('"', r"\"") # Escape double quotes in user input
|
| 771 |
# Not escaping anything with the key so the user can pass patterns like {'href*': '/p/'} or get errors :)
|
| 772 |
selector += '[{}="{}"]'.format(key, value)
|
| 773 |
+
if selector != "*":
|
| 774 |
selectors.append(selector)
|
| 775 |
|
| 776 |
if selectors:
|
| 777 |
+
results = self.css(", ".join(selectors))
|
| 778 |
if results:
|
| 779 |
# From the results, get the ones that fulfill passed regex patterns
|
| 780 |
for pattern in patterns:
|
| 781 |
+
results = results.filter(
|
| 782 |
+
lambda e: e.text.re(pattern, check_match=True)
|
| 783 |
+
)
|
| 784 |
|
| 785 |
# From the results, get the ones that fulfill passed functions
|
| 786 |
for function in functions:
|
|
|
|
| 796 |
|
| 797 |
return results
|
| 798 |
|
| 799 |
+
def find(
|
| 800 |
+
self,
|
| 801 |
+
*args: Union[str, Iterable[str], Pattern, Callable, Dict[str, str]],
|
| 802 |
+
**kwargs: str,
|
| 803 |
+
) -> Union["Adaptor", None]:
|
| 804 |
"""Find elements by filters of your creations for ease then return the first result. Otherwise return `None`.
|
| 805 |
|
| 806 |
:param args: Tag name(s), an iterable of tag names, regex patterns, function, or a dictionary of elements' attributes. Leave empty for selecting all.
|
|
|
|
| 811 |
return element
|
| 812 |
return None
|
| 813 |
|
| 814 |
+
def __calculate_similarity_score(
|
| 815 |
+
self, original: Dict, candidate: html.HtmlElement
|
| 816 |
+
) -> float:
|
| 817 |
"""Used internally to calculate a score that shows how candidate element similar to the original one
|
| 818 |
|
| 819 |
:param original: The original element in the form of the dictionary generated from `element_to_dict` function
|
|
|
|
| 826 |
# Possible TODO:
|
| 827 |
# Study the idea of giving weight to each test below so some are more important than others
|
| 828 |
# Current results: With weights some websites had better score while it was worse for others
|
| 829 |
+
score += 1 if original["tag"] == candidate["tag"] else 0 # * 0.3 # 30%
|
| 830 |
checks += 1
|
| 831 |
|
| 832 |
+
if original["text"]:
|
| 833 |
+
score += SequenceMatcher(
|
| 834 |
+
None, original["text"], candidate.get("text") or ""
|
| 835 |
+
).ratio() # * 0.3 # 30%
|
| 836 |
checks += 1
|
| 837 |
|
| 838 |
# if both doesn't have attributes, it still count for something!
|
| 839 |
+
score += self.__calculate_dict_diff(
|
| 840 |
+
original["attributes"], candidate["attributes"]
|
| 841 |
+
) # * 0.3 # 30%
|
| 842 |
checks += 1
|
| 843 |
|
| 844 |
# Separate similarity test for class, id, href,... this will help in full structural changes
|
| 845 |
+
for attrib in (
|
| 846 |
+
"class",
|
| 847 |
+
"id",
|
| 848 |
+
"href",
|
| 849 |
+
"src",
|
| 850 |
+
):
|
| 851 |
+
if original["attributes"].get(attrib):
|
| 852 |
score += SequenceMatcher(
|
| 853 |
+
None,
|
| 854 |
+
original["attributes"][attrib],
|
| 855 |
+
candidate["attributes"].get(attrib) or "",
|
| 856 |
).ratio() # * 0.3 # 30%
|
| 857 |
checks += 1
|
| 858 |
|
| 859 |
+
score += SequenceMatcher(
|
| 860 |
+
None, original["path"], candidate["path"]
|
| 861 |
+
).ratio() # * 0.1 # 10%
|
| 862 |
checks += 1
|
| 863 |
|
| 864 |
+
if original.get("parent_name"):
|
| 865 |
# Then we start comparing parents' data
|
| 866 |
+
if candidate.get("parent_name"):
|
| 867 |
score += SequenceMatcher(
|
| 868 |
+
None, original["parent_name"], candidate.get("parent_name") or ""
|
| 869 |
).ratio() # * 0.2 # 20%
|
| 870 |
checks += 1
|
| 871 |
|
| 872 |
score += self.__calculate_dict_diff(
|
| 873 |
+
original["parent_attribs"], candidate.get("parent_attribs") or {}
|
| 874 |
) # * 0.2 # 20%
|
| 875 |
checks += 1
|
| 876 |
|
| 877 |
+
if original["parent_text"]:
|
| 878 |
score += SequenceMatcher(
|
| 879 |
+
None,
|
| 880 |
+
original["parent_text"],
|
| 881 |
+
candidate.get("parent_text") or "",
|
| 882 |
).ratio() # * 0.1 # 10%
|
| 883 |
checks += 1
|
| 884 |
# else:
|
| 885 |
# # The original element have a parent and this one not, this is not a good sign
|
| 886 |
# score -= 0.1
|
| 887 |
|
| 888 |
+
if original.get("siblings"):
|
| 889 |
score += SequenceMatcher(
|
| 890 |
+
None, original["siblings"], candidate.get("siblings") or []
|
| 891 |
).ratio() # * 0.1 # 10%
|
| 892 |
checks += 1
|
| 893 |
|
|
|
|
| 896 |
|
| 897 |
@staticmethod
|
| 898 |
def __calculate_dict_diff(dict1: dict, dict2: dict) -> float:
|
| 899 |
+
"""Used internally calculate similarity between two dictionaries as SequenceMatcher doesn't accept dictionaries"""
|
| 900 |
+
score = (
|
| 901 |
+
SequenceMatcher(None, tuple(dict1.keys()), tuple(dict2.keys())).ratio()
|
| 902 |
+
* 0.5
|
| 903 |
+
)
|
| 904 |
+
score += (
|
| 905 |
+
SequenceMatcher(None, tuple(dict1.values()), tuple(dict2.values())).ratio()
|
| 906 |
+
* 0.5
|
| 907 |
+
)
|
| 908 |
return score
|
| 909 |
|
| 910 |
+
def save(
|
| 911 |
+
self, element: Union["Adaptor", html.HtmlElement], identifier: str
|
| 912 |
+
) -> None:
|
| 913 |
"""Saves the element's unique properties to the storage for retrieval and relocation later
|
| 914 |
|
| 915 |
:param element: The element itself that we want to save to storage, it can be a `Adaptor` or pure `HtmlElement`
|
|
|
|
| 951 |
else:
|
| 952 |
return self.get_all_text(strip=True).json()
|
| 953 |
|
| 954 |
+
def re(
|
| 955 |
+
self,
|
| 956 |
+
regex: Union[str, Pattern[str]],
|
| 957 |
+
replace_entities: bool = True,
|
| 958 |
+
clean_match: bool = False,
|
| 959 |
+
case_sensitive: bool = True,
|
| 960 |
+
) -> TextHandlers:
|
| 961 |
"""Apply the given regex to the current text and return a list of strings with the matches.
|
| 962 |
|
| 963 |
:param regex: Can be either a compiled regular expression or a string.
|
|
|
|
| 967 |
"""
|
| 968 |
return self.text.re(regex, replace_entities, clean_match, case_sensitive)
|
| 969 |
|
| 970 |
+
def re_first(
|
| 971 |
+
self,
|
| 972 |
+
regex: Union[str, Pattern[str]],
|
| 973 |
+
default=None,
|
| 974 |
+
replace_entities: bool = True,
|
| 975 |
+
clean_match: bool = False,
|
| 976 |
+
case_sensitive: bool = True,
|
| 977 |
+
) -> TextHandler:
|
| 978 |
"""Apply the given regex to text and return the first match if found, otherwise return the default value.
|
| 979 |
|
| 980 |
:param regex: Can be either a compiled regular expression or a string.
|
|
|
|
| 983 |
:param clean_match: if enabled, this will ignore all whitespaces and consecutive spaces while matching
|
| 984 |
:param case_sensitive: if disabled, function will set the regex to ignore letters case while compiling it
|
| 985 |
"""
|
| 986 |
+
return self.text.re_first(
|
| 987 |
+
regex, default, replace_entities, clean_match, case_sensitive
|
| 988 |
+
)
|
| 989 |
|
| 990 |
def find_similar(
|
| 991 |
+
self,
|
| 992 |
+
similarity_threshold: float = 0.2,
|
| 993 |
+
ignore_attributes: Union[List, Tuple] = (
|
| 994 |
+
"href",
|
| 995 |
+
"src",
|
| 996 |
+
),
|
| 997 |
+
match_text: bool = False,
|
| 998 |
+
) -> Union["Adaptors[Adaptor]", List]:
|
| 999 |
"""Find elements that are in the same tree depth in the page with the same tag name and same parent tag etc...
|
| 1000 |
then return the ones that match the current element attributes with percentage higher than the input threshold.
|
| 1001 |
|
|
|
|
| 1016 |
|
| 1017 |
:return: A ``Adaptors`` container of ``Adaptor`` objects or empty list
|
| 1018 |
"""
|
| 1019 |
+
|
| 1020 |
def get_attributes(element: html.HtmlElement) -> Dict:
|
| 1021 |
"""Return attributes dictionary without the ignored list"""
|
| 1022 |
+
return {
|
| 1023 |
+
k: v for k, v in element.attrib.items() if k not in ignore_attributes
|
| 1024 |
+
}
|
| 1025 |
+
|
| 1026 |
+
def are_alike(
|
| 1027 |
+
original: html.HtmlElement,
|
| 1028 |
+
original_attributes: Dict,
|
| 1029 |
+
candidate: html.HtmlElement,
|
| 1030 |
+
) -> bool:
|
| 1031 |
"""Calculate a score of how much these elements are alike and return True
|
| 1032 |
+
if score is higher or equal the threshold"""
|
| 1033 |
+
candidate_attributes = (
|
| 1034 |
+
get_attributes(candidate) if ignore_attributes else candidate.attrib
|
| 1035 |
+
)
|
| 1036 |
score, checks = 0, 0
|
| 1037 |
|
| 1038 |
if original_attributes:
|
| 1039 |
score += sum(
|
| 1040 |
+
SequenceMatcher(None, v, candidate_attributes.get(k, "")).ratio()
|
| 1041 |
for k, v in original_attributes.items()
|
| 1042 |
)
|
| 1043 |
checks += len(candidate_attributes)
|
|
|
|
| 1049 |
|
| 1050 |
if match_text:
|
| 1051 |
score += SequenceMatcher(
|
| 1052 |
+
None,
|
| 1053 |
+
clean_spaces(original.text or ""),
|
| 1054 |
+
clean_spaces(candidate.text or ""),
|
| 1055 |
).ratio()
|
| 1056 |
checks += 1
|
| 1057 |
|
|
|
|
| 1073 |
f"//{grandparent.tag}/{parent.tag}/{self.tag}[count(ancestor::*) = {current_depth}]"
|
| 1074 |
)
|
| 1075 |
else:
|
| 1076 |
+
potential_matches = root.xpath(
|
| 1077 |
+
f"//{parent.tag}/{self.tag}[count(ancestor::*) = {current_depth}]"
|
| 1078 |
+
)
|
| 1079 |
else:
|
| 1080 |
+
potential_matches = root.xpath(
|
| 1081 |
+
f"//{self.tag}[count(ancestor::*) = {current_depth}]"
|
| 1082 |
+
)
|
| 1083 |
|
| 1084 |
for potential_match in potential_matches:
|
| 1085 |
+
if potential_match != root and are_alike(
|
| 1086 |
+
root, target_attrs, potential_match
|
| 1087 |
+
):
|
| 1088 |
similar_elements.append(potential_match)
|
| 1089 |
|
| 1090 |
return self.__handle_elements(similar_elements)
|
| 1091 |
|
| 1092 |
def find_by_text(
|
| 1093 |
+
self,
|
| 1094 |
+
text: str,
|
| 1095 |
+
first_match: bool = True,
|
| 1096 |
+
partial: bool = False,
|
| 1097 |
+
case_sensitive: bool = False,
|
| 1098 |
+
clean_match: bool = True,
|
| 1099 |
+
) -> Union["Adaptors[Adaptor]", "Adaptor"]:
|
| 1100 |
"""Find elements that its text content fully/partially matches input.
|
| 1101 |
:param text: Text query to match
|
| 1102 |
:param first_match: Return first element that matches conditions, enabled by default
|
|
|
|
| 1110 |
text = text.lower()
|
| 1111 |
|
| 1112 |
# This selector gets all elements with text content
|
| 1113 |
+
for node in self.__handle_elements(
|
| 1114 |
+
self._root.xpath(".//*[normalize-space(text())]")
|
| 1115 |
+
):
|
| 1116 |
"""Check if element matches given text otherwise, traverse the children tree and iterate"""
|
| 1117 |
node_text = node.text
|
| 1118 |
if clean_match:
|
|
|
|
| 1137 |
return results
|
| 1138 |
|
| 1139 |
def find_by_regex(
|
| 1140 |
+
self,
|
| 1141 |
+
query: Union[str, Pattern[str]],
|
| 1142 |
+
first_match: bool = True,
|
| 1143 |
+
case_sensitive: bool = False,
|
| 1144 |
+
clean_match: bool = True,
|
| 1145 |
+
) -> Union["Adaptors[Adaptor]", "Adaptor"]:
|
| 1146 |
"""Find elements that its text content matches the input regex pattern.
|
| 1147 |
:param query: Regex query/pattern to match
|
| 1148 |
:param first_match: Return first element that matches conditions, enabled by default
|
|
|
|
| 1152 |
results = Adaptors([])
|
| 1153 |
|
| 1154 |
# This selector gets all elements with text content
|
| 1155 |
+
for node in self.__handle_elements(
|
| 1156 |
+
self._root.xpath(".//*[normalize-space(text())]")
|
| 1157 |
+
):
|
| 1158 |
"""Check if element matches given regex otherwise, traverse the children tree and iterate"""
|
| 1159 |
node_text = node.text
|
| 1160 |
+
if node_text.re(
|
| 1161 |
+
query,
|
| 1162 |
+
check_match=True,
|
| 1163 |
+
clean_match=clean_match,
|
| 1164 |
+
case_sensitive=case_sensitive,
|
| 1165 |
+
):
|
| 1166 |
results.append(node)
|
| 1167 |
|
| 1168 |
if first_match and results:
|
|
|
|
| 1178 |
"""
|
| 1179 |
The :class:`Adaptors` class is a subclass of the builtin ``List`` class, which provides a few additional methods.
|
| 1180 |
"""
|
| 1181 |
+
|
| 1182 |
__slots__ = ()
|
| 1183 |
|
| 1184 |
@typing.overload
|
|
|
|
| 1189 |
def __getitem__(self, pos: slice) -> "Adaptors":
|
| 1190 |
pass
|
| 1191 |
|
| 1192 |
+
def __getitem__(
|
| 1193 |
+
self, pos: Union[SupportsIndex, slice]
|
| 1194 |
+
) -> Union[Adaptor, "Adaptors"]:
|
| 1195 |
lst = super().__getitem__(pos)
|
| 1196 |
if isinstance(pos, slice):
|
| 1197 |
return self.__class__(lst)
|
|
|
|
| 1199 |
return lst
|
| 1200 |
|
| 1201 |
def xpath(
|
| 1202 |
+
self,
|
| 1203 |
+
selector: str,
|
| 1204 |
+
identifier: str = "",
|
| 1205 |
+
auto_save: bool = False,
|
| 1206 |
+
percentage: int = 0,
|
| 1207 |
+
**kwargs: Any,
|
| 1208 |
) -> "Adaptors[Adaptor]":
|
| 1209 |
"""
|
| 1210 |
Call the ``.xpath()`` method for each element in this list and return
|
|
|
|
| 1227 |
:return: List as :class:`Adaptors`
|
| 1228 |
"""
|
| 1229 |
results = [
|
| 1230 |
+
n.xpath(
|
| 1231 |
+
selector, identifier or selector, False, auto_save, percentage, **kwargs
|
| 1232 |
+
)
|
| 1233 |
+
for n in self
|
| 1234 |
]
|
| 1235 |
return self.__class__(flatten(results))
|
| 1236 |
|
| 1237 |
+
def css(
|
| 1238 |
+
self,
|
| 1239 |
+
selector: str,
|
| 1240 |
+
identifier: str = "",
|
| 1241 |
+
auto_save: bool = False,
|
| 1242 |
+
percentage: int = 0,
|
| 1243 |
+
) -> "Adaptors[Adaptor]":
|
| 1244 |
"""
|
| 1245 |
Call the ``.css()`` method for each element in this list and return
|
| 1246 |
their results flattened as another :class:`Adaptors`.
|
|
|
|
| 1260 |
:return: List as :class:`Adaptors`
|
| 1261 |
"""
|
| 1262 |
results = [
|
| 1263 |
+
n.css(selector, identifier or selector, False, auto_save, percentage)
|
| 1264 |
+
for n in self
|
| 1265 |
]
|
| 1266 |
return self.__class__(flatten(results))
|
| 1267 |
|
| 1268 |
+
def re(
|
| 1269 |
+
self,
|
| 1270 |
+
regex: Union[str, Pattern[str]],
|
| 1271 |
+
replace_entities: bool = True,
|
| 1272 |
+
clean_match: bool = False,
|
| 1273 |
+
case_sensitive: bool = True,
|
| 1274 |
+
) -> TextHandlers[TextHandler]:
|
| 1275 |
"""Call the ``.re()`` method for each element in this list and return
|
| 1276 |
their results flattened as List of TextHandler.
|
| 1277 |
|
|
|
|
| 1281 |
:param case_sensitive: if disabled, function will set the regex to ignore letters case while compiling it
|
| 1282 |
"""
|
| 1283 |
results = [
|
| 1284 |
+
n.text.re(regex, replace_entities, clean_match, case_sensitive)
|
| 1285 |
+
for n in self
|
| 1286 |
]
|
| 1287 |
return TextHandlers(flatten(results))
|
| 1288 |
|
| 1289 |
+
def re_first(
|
| 1290 |
+
self,
|
| 1291 |
+
regex: Union[str, Pattern[str]],
|
| 1292 |
+
default=None,
|
| 1293 |
+
replace_entities: bool = True,
|
| 1294 |
+
clean_match: bool = False,
|
| 1295 |
+
case_sensitive: bool = True,
|
| 1296 |
+
) -> TextHandler:
|
| 1297 |
"""Call the ``.re_first()`` method for each element in this list and return
|
| 1298 |
the first result or the default value otherwise.
|
| 1299 |
|
|
|
|
| 1308 |
return result
|
| 1309 |
return default
|
| 1310 |
|
| 1311 |
+
def search(self, func: Callable[["Adaptor"], bool]) -> Union["Adaptor", None]:
|
| 1312 |
"""Loop over all current elements and return the first element that matches the passed function
|
| 1313 |
:param func: A function that takes each element as an argument and returns True/False
|
| 1314 |
:return: The first element that match the function or ``None`` otherwise.
|
|
|
|
| 1318 |
return element
|
| 1319 |
return None
|
| 1320 |
|
| 1321 |
+
def filter(self, func: Callable[["Adaptor"], bool]) -> "Adaptors[Adaptor]":
|
| 1322 |
"""Filter current elements based on the passed function
|
| 1323 |
:param func: A function that takes each element as an argument and returns True/False
|
| 1324 |
:return: The new `Adaptors` object or empty list otherwise.
|
| 1325 |
"""
|
| 1326 |
+
return self.__class__([element for element in self if func(element)])
|
|
|
|
|
|
|
| 1327 |
|
| 1328 |
# For easy copy-paste from Scrapy/parsel code when needed :)
|
| 1329 |
def get(self, default=None):
|
setup.py
CHANGED
|
@@ -1,7 +1,8 @@
|
|
|
|
|
|
|
|
| 1 |
from setuptools import find_packages, setup
|
| 2 |
|
| 3 |
-
|
| 4 |
-
long_description = fh.read()
|
| 5 |
|
| 6 |
|
| 7 |
setup(
|
|
@@ -20,9 +21,7 @@ setup(
|
|
| 20 |
"scrapling": "scrapling",
|
| 21 |
},
|
| 22 |
entry_points={
|
| 23 |
-
|
| 24 |
-
'scrapling=scrapling.cli:main'
|
| 25 |
-
],
|
| 26 |
},
|
| 27 |
include_package_data=True,
|
| 28 |
classifiers=[
|
|
@@ -53,14 +52,14 @@ setup(
|
|
| 53 |
install_requires=[
|
| 54 |
"lxml>=5.0",
|
| 55 |
"cssselect>=1.2",
|
| 56 |
-
|
| 57 |
"w3lib",
|
| 58 |
"orjson>=3",
|
| 59 |
"tldextract",
|
| 60 |
-
|
| 61 |
-
|
| 62 |
-
|
| 63 |
-
|
| 64 |
],
|
| 65 |
python_requires=">=3.9",
|
| 66 |
url="https://github.com/D4Vinci/Scrapling",
|
|
@@ -68,5 +67,5 @@ setup(
|
|
| 68 |
"Documentation": "https://scrapling.readthedocs.io/en/latest/",
|
| 69 |
"Source": "https://github.com/D4Vinci/Scrapling",
|
| 70 |
"Tracker": "https://github.com/D4Vinci/Scrapling/issues",
|
| 71 |
-
}
|
| 72 |
)
|
|
|
|
| 1 |
+
from pathlib import Path
|
| 2 |
+
|
| 3 |
from setuptools import find_packages, setup
|
| 4 |
|
| 5 |
+
long_description = Path("README.md").read_text(encoding="utf-8")
|
|
|
|
| 6 |
|
| 7 |
|
| 8 |
setup(
|
|
|
|
| 21 |
"scrapling": "scrapling",
|
| 22 |
},
|
| 23 |
entry_points={
|
| 24 |
+
"console_scripts": ["scrapling=scrapling.cli:main"],
|
|
|
|
|
|
|
| 25 |
},
|
| 26 |
include_package_data=True,
|
| 27 |
classifiers=[
|
|
|
|
| 52 |
install_requires=[
|
| 53 |
"lxml>=5.0",
|
| 54 |
"cssselect>=1.2",
|
| 55 |
+
"click",
|
| 56 |
"w3lib",
|
| 57 |
"orjson>=3",
|
| 58 |
"tldextract",
|
| 59 |
+
"httpx[brotli,zstd, socks]",
|
| 60 |
+
"playwright>=1.49.1",
|
| 61 |
+
"rebrowser-playwright>=1.49.1",
|
| 62 |
+
"camoufox[geoip]>=0.4.11",
|
| 63 |
],
|
| 64 |
python_requires=">=3.9",
|
| 65 |
url="https://github.com/D4Vinci/Scrapling",
|
|
|
|
| 67 |
"Documentation": "https://scrapling.readthedocs.io/en/latest/",
|
| 68 |
"Source": "https://github.com/D4Vinci/Scrapling",
|
| 69 |
"Tracker": "https://github.com/D4Vinci/Scrapling/issues",
|
| 70 |
+
},
|
| 71 |
)
|
tests/fetchers/async/test_camoufox.py
CHANGED
|
@@ -17,43 +17,51 @@ class TestStealthyFetcher:
|
|
| 17 |
def urls(self, httpbin):
|
| 18 |
url = httpbin.url
|
| 19 |
return {
|
| 20 |
-
|
| 21 |
-
|
| 22 |
-
|
| 23 |
-
|
| 24 |
-
|
| 25 |
-
|
| 26 |
-
|
| 27 |
}
|
| 28 |
|
| 29 |
async def test_basic_fetch(self, fetcher, urls):
|
| 30 |
"""Test doing basic fetch request with multiple statuses"""
|
| 31 |
-
assert (await fetcher.async_fetch(urls[
|
| 32 |
-
assert (await fetcher.async_fetch(urls[
|
| 33 |
-
assert (await fetcher.async_fetch(urls[
|
| 34 |
|
| 35 |
async def test_networkidle(self, fetcher, urls):
|
| 36 |
"""Test if waiting for `networkidle` make page does not finish loading or not"""
|
| 37 |
-
assert (
|
|
|
|
|
|
|
| 38 |
|
| 39 |
async def test_blocking_resources(self, fetcher, urls):
|
| 40 |
"""Test if blocking resources make page does not finish loading or not"""
|
| 41 |
-
assert (
|
| 42 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
| 43 |
|
| 44 |
async def test_waiting_selector(self, fetcher, urls):
|
| 45 |
"""Test if waiting for a selector make page does not finish loading or not"""
|
| 46 |
-
assert (
|
| 47 |
-
|
| 48 |
-
|
| 49 |
-
|
| 50 |
-
|
| 51 |
-
|
|
|
|
|
|
|
| 52 |
|
| 53 |
async def test_cookies_loading(self, fetcher, urls):
|
| 54 |
"""Test if cookies are set after the request"""
|
| 55 |
-
response = await fetcher.async_fetch(urls[
|
| 56 |
-
assert response.cookies == {
|
| 57 |
|
| 58 |
async def test_automation(self, fetcher, urls):
|
| 59 |
"""Test if automation break the code or not"""
|
|
@@ -64,34 +72,38 @@ class TestStealthyFetcher:
|
|
| 64 |
await page.mouse.up()
|
| 65 |
return page
|
| 66 |
|
| 67 |
-
assert (
|
|
|
|
|
|
|
| 68 |
|
| 69 |
async def test_properties(self, fetcher, urls):
|
| 70 |
"""Test if different arguments breaks the code or not"""
|
| 71 |
-
assert (
|
| 72 |
-
|
| 73 |
-
|
| 74 |
-
|
| 75 |
-
)
|
| 76 |
-
|
| 77 |
-
assert (
|
| 78 |
-
|
| 79 |
-
|
| 80 |
-
|
| 81 |
-
)
|
| 82 |
-
|
| 83 |
-
assert (
|
| 84 |
-
|
| 85 |
-
|
| 86 |
-
|
| 87 |
-
)
|
| 88 |
-
|
| 89 |
-
assert (
|
| 90 |
-
|
| 91 |
-
|
| 92 |
-
|
| 93 |
-
)
|
| 94 |
|
| 95 |
async def test_infinite_timeout(self, fetcher, urls):
|
| 96 |
"""Test if infinite timeout breaks the code or not"""
|
| 97 |
-
assert (
|
|
|
|
|
|
|
|
|
| 17 |
def urls(self, httpbin):
|
| 18 |
url = httpbin.url
|
| 19 |
return {
|
| 20 |
+
"status_200": f"{url}/status/200",
|
| 21 |
+
"status_404": f"{url}/status/404",
|
| 22 |
+
"status_501": f"{url}/status/501",
|
| 23 |
+
"basic_url": f"{url}/get",
|
| 24 |
+
"html_url": f"{url}/html",
|
| 25 |
+
"delayed_url": f"{url}/delay/10", # 10 Seconds delay response
|
| 26 |
+
"cookies_url": f"{url}/cookies/set/test/value",
|
| 27 |
}
|
| 28 |
|
| 29 |
async def test_basic_fetch(self, fetcher, urls):
|
| 30 |
"""Test doing basic fetch request with multiple statuses"""
|
| 31 |
+
assert (await fetcher.async_fetch(urls["status_200"])).status == 200
|
| 32 |
+
assert (await fetcher.async_fetch(urls["status_404"])).status == 404
|
| 33 |
+
assert (await fetcher.async_fetch(urls["status_501"])).status == 501
|
| 34 |
|
| 35 |
async def test_networkidle(self, fetcher, urls):
|
| 36 |
"""Test if waiting for `networkidle` make page does not finish loading or not"""
|
| 37 |
+
assert (
|
| 38 |
+
await fetcher.async_fetch(urls["basic_url"], network_idle=True)
|
| 39 |
+
).status == 200
|
| 40 |
|
| 41 |
async def test_blocking_resources(self, fetcher, urls):
|
| 42 |
"""Test if blocking resources make page does not finish loading or not"""
|
| 43 |
+
assert (
|
| 44 |
+
await fetcher.async_fetch(urls["basic_url"], block_images=True)
|
| 45 |
+
).status == 200
|
| 46 |
+
assert (
|
| 47 |
+
await fetcher.async_fetch(urls["basic_url"], disable_resources=True)
|
| 48 |
+
).status == 200
|
| 49 |
|
| 50 |
async def test_waiting_selector(self, fetcher, urls):
|
| 51 |
"""Test if waiting for a selector make page does not finish loading or not"""
|
| 52 |
+
assert (
|
| 53 |
+
await fetcher.async_fetch(urls["html_url"], wait_selector="h1")
|
| 54 |
+
).status == 200
|
| 55 |
+
assert (
|
| 56 |
+
await fetcher.async_fetch(
|
| 57 |
+
urls["html_url"], wait_selector="h1", wait_selector_state="visible"
|
| 58 |
+
)
|
| 59 |
+
).status == 200
|
| 60 |
|
| 61 |
async def test_cookies_loading(self, fetcher, urls):
|
| 62 |
"""Test if cookies are set after the request"""
|
| 63 |
+
response = await fetcher.async_fetch(urls["cookies_url"])
|
| 64 |
+
assert response.cookies == {"test": "value"}
|
| 65 |
|
| 66 |
async def test_automation(self, fetcher, urls):
|
| 67 |
"""Test if automation break the code or not"""
|
|
|
|
| 72 |
await page.mouse.up()
|
| 73 |
return page
|
| 74 |
|
| 75 |
+
assert (
|
| 76 |
+
await fetcher.async_fetch(urls["html_url"], page_action=scroll_page)
|
| 77 |
+
).status == 200
|
| 78 |
|
| 79 |
async def test_properties(self, fetcher, urls):
|
| 80 |
"""Test if different arguments breaks the code or not"""
|
| 81 |
+
assert (
|
| 82 |
+
await fetcher.async_fetch(
|
| 83 |
+
urls["html_url"], block_webrtc=True, allow_webgl=True
|
| 84 |
+
)
|
| 85 |
+
).status == 200
|
| 86 |
+
|
| 87 |
+
assert (
|
| 88 |
+
await fetcher.async_fetch(
|
| 89 |
+
urls["html_url"], block_webrtc=False, allow_webgl=True
|
| 90 |
+
)
|
| 91 |
+
).status == 200
|
| 92 |
+
|
| 93 |
+
assert (
|
| 94 |
+
await fetcher.async_fetch(
|
| 95 |
+
urls["html_url"], block_webrtc=True, allow_webgl=False
|
| 96 |
+
)
|
| 97 |
+
).status == 200
|
| 98 |
+
|
| 99 |
+
assert (
|
| 100 |
+
await fetcher.async_fetch(
|
| 101 |
+
urls["html_url"], extra_headers={"ayo": ""}, os_randomize=True
|
| 102 |
+
)
|
| 103 |
+
).status == 200
|
| 104 |
|
| 105 |
async def test_infinite_timeout(self, fetcher, urls):
|
| 106 |
"""Test if infinite timeout breaks the code or not"""
|
| 107 |
+
assert (
|
| 108 |
+
await fetcher.async_fetch(urls["delayed_url"], timeout=None)
|
| 109 |
+
).status == 200
|
tests/fetchers/async/test_httpx.py
CHANGED
|
@@ -16,70 +16,111 @@ class TestAsyncFetcher:
|
|
| 16 |
@pytest.fixture(scope="class")
|
| 17 |
def urls(self, httpbin):
|
| 18 |
return {
|
| 19 |
-
|
| 20 |
-
|
| 21 |
-
|
| 22 |
-
|
| 23 |
-
|
| 24 |
-
|
| 25 |
-
|
| 26 |
-
|
| 27 |
}
|
| 28 |
|
| 29 |
async def test_basic_get(self, fetcher, urls):
|
| 30 |
"""Test doing basic get request with multiple statuses"""
|
| 31 |
-
assert (await fetcher.get(urls[
|
| 32 |
-
assert (await fetcher.get(urls[
|
| 33 |
-
assert (await fetcher.get(urls[
|
| 34 |
|
| 35 |
async def test_get_properties(self, fetcher, urls):
|
| 36 |
"""Test if different arguments with GET request breaks the code or not"""
|
| 37 |
-
assert (
|
| 38 |
-
|
| 39 |
-
|
| 40 |
-
assert (
|
| 41 |
-
urls[
|
| 42 |
-
|
| 43 |
-
|
| 44 |
-
|
| 45 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 46 |
|
| 47 |
async def test_post_properties(self, fetcher, urls):
|
| 48 |
"""Test if different arguments with POST request breaks the code or not"""
|
| 49 |
-
assert (
|
| 50 |
-
|
| 51 |
-
|
| 52 |
-
assert (
|
| 53 |
-
|
| 54 |
-
|
| 55 |
-
|
| 56 |
-
|
| 57 |
-
|
| 58 |
-
|
| 59 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 60 |
|
| 61 |
async def test_put_properties(self, fetcher, urls):
|
| 62 |
"""Test if different arguments with PUT request breaks the code or not"""
|
| 63 |
-
assert (await fetcher.put(urls[
|
| 64 |
-
|
| 65 |
-
|
| 66 |
-
|
| 67 |
-
assert (
|
| 68 |
-
|
| 69 |
-
|
| 70 |
-
|
| 71 |
-
|
| 72 |
-
|
| 73 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 74 |
|
| 75 |
async def test_delete_properties(self, fetcher, urls):
|
| 76 |
"""Test if different arguments with DELETE request breaks the code or not"""
|
| 77 |
-
assert (
|
| 78 |
-
|
| 79 |
-
|
| 80 |
-
assert (
|
| 81 |
-
urls[
|
| 82 |
-
|
| 83 |
-
|
| 84 |
-
|
| 85 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 16 |
@pytest.fixture(scope="class")
|
| 17 |
def urls(self, httpbin):
|
| 18 |
return {
|
| 19 |
+
"status_200": f"{httpbin.url}/status/200",
|
| 20 |
+
"status_404": f"{httpbin.url}/status/404",
|
| 21 |
+
"status_501": f"{httpbin.url}/status/501",
|
| 22 |
+
"basic_url": f"{httpbin.url}/get",
|
| 23 |
+
"post_url": f"{httpbin.url}/post",
|
| 24 |
+
"put_url": f"{httpbin.url}/put",
|
| 25 |
+
"delete_url": f"{httpbin.url}/delete",
|
| 26 |
+
"html_url": f"{httpbin.url}/html",
|
| 27 |
}
|
| 28 |
|
| 29 |
async def test_basic_get(self, fetcher, urls):
|
| 30 |
"""Test doing basic get request with multiple statuses"""
|
| 31 |
+
assert (await fetcher.get(urls["status_200"])).status == 200
|
| 32 |
+
assert (await fetcher.get(urls["status_404"])).status == 404
|
| 33 |
+
assert (await fetcher.get(urls["status_501"])).status == 501
|
| 34 |
|
| 35 |
async def test_get_properties(self, fetcher, urls):
|
| 36 |
"""Test if different arguments with GET request breaks the code or not"""
|
| 37 |
+
assert (
|
| 38 |
+
await fetcher.get(urls["status_200"], stealthy_headers=True)
|
| 39 |
+
).status == 200
|
| 40 |
+
assert (
|
| 41 |
+
await fetcher.get(urls["status_200"], follow_redirects=True)
|
| 42 |
+
).status == 200
|
| 43 |
+
assert (await fetcher.get(urls["status_200"], timeout=None)).status == 200
|
| 44 |
+
assert (
|
| 45 |
+
await fetcher.get(
|
| 46 |
+
urls["status_200"],
|
| 47 |
+
stealthy_headers=True,
|
| 48 |
+
follow_redirects=True,
|
| 49 |
+
timeout=None,
|
| 50 |
+
)
|
| 51 |
+
).status == 200
|
| 52 |
|
| 53 |
async def test_post_properties(self, fetcher, urls):
|
| 54 |
"""Test if different arguments with POST request breaks the code or not"""
|
| 55 |
+
assert (
|
| 56 |
+
await fetcher.post(urls["post_url"], data={"key": "value"})
|
| 57 |
+
).status == 200
|
| 58 |
+
assert (
|
| 59 |
+
await fetcher.post(
|
| 60 |
+
urls["post_url"], data={"key": "value"}, stealthy_headers=True
|
| 61 |
+
)
|
| 62 |
+
).status == 200
|
| 63 |
+
assert (
|
| 64 |
+
await fetcher.post(
|
| 65 |
+
urls["post_url"], data={"key": "value"}, follow_redirects=True
|
| 66 |
+
)
|
| 67 |
+
).status == 200
|
| 68 |
+
assert (
|
| 69 |
+
await fetcher.post(urls["post_url"], data={"key": "value"}, timeout=None)
|
| 70 |
+
).status == 200
|
| 71 |
+
assert (
|
| 72 |
+
await fetcher.post(
|
| 73 |
+
urls["post_url"],
|
| 74 |
+
data={"key": "value"},
|
| 75 |
+
stealthy_headers=True,
|
| 76 |
+
follow_redirects=True,
|
| 77 |
+
timeout=None,
|
| 78 |
+
)
|
| 79 |
+
).status == 200
|
| 80 |
|
| 81 |
async def test_put_properties(self, fetcher, urls):
|
| 82 |
"""Test if different arguments with PUT request breaks the code or not"""
|
| 83 |
+
assert (await fetcher.put(urls["put_url"], data={"key": "value"})).status in [
|
| 84 |
+
200,
|
| 85 |
+
405,
|
| 86 |
+
]
|
| 87 |
+
assert (
|
| 88 |
+
await fetcher.put(
|
| 89 |
+
urls["put_url"], data={"key": "value"}, stealthy_headers=True
|
| 90 |
+
)
|
| 91 |
+
).status in [200, 405]
|
| 92 |
+
assert (
|
| 93 |
+
await fetcher.put(
|
| 94 |
+
urls["put_url"], data={"key": "value"}, follow_redirects=True
|
| 95 |
+
)
|
| 96 |
+
).status in [200, 405]
|
| 97 |
+
assert (
|
| 98 |
+
await fetcher.put(urls["put_url"], data={"key": "value"}, timeout=None)
|
| 99 |
+
).status in [200, 405]
|
| 100 |
+
assert (
|
| 101 |
+
await fetcher.put(
|
| 102 |
+
urls["put_url"],
|
| 103 |
+
data={"key": "value"},
|
| 104 |
+
stealthy_headers=True,
|
| 105 |
+
follow_redirects=True,
|
| 106 |
+
timeout=None,
|
| 107 |
+
)
|
| 108 |
+
).status in [200, 405]
|
| 109 |
|
| 110 |
async def test_delete_properties(self, fetcher, urls):
|
| 111 |
"""Test if different arguments with DELETE request breaks the code or not"""
|
| 112 |
+
assert (
|
| 113 |
+
await fetcher.delete(urls["delete_url"], stealthy_headers=True)
|
| 114 |
+
).status == 200
|
| 115 |
+
assert (
|
| 116 |
+
await fetcher.delete(urls["delete_url"], follow_redirects=True)
|
| 117 |
+
).status == 200
|
| 118 |
+
assert (await fetcher.delete(urls["delete_url"], timeout=None)).status == 200
|
| 119 |
+
assert (
|
| 120 |
+
await fetcher.delete(
|
| 121 |
+
urls["delete_url"],
|
| 122 |
+
stealthy_headers=True,
|
| 123 |
+
follow_redirects=True,
|
| 124 |
+
timeout=None,
|
| 125 |
+
)
|
| 126 |
+
).status == 200
|
tests/fetchers/async/test_playwright.py
CHANGED
|
@@ -15,87 +15,97 @@ class TestPlayWrightFetcherAsync:
|
|
| 15 |
@pytest.fixture
|
| 16 |
def urls(self, httpbin):
|
| 17 |
return {
|
| 18 |
-
|
| 19 |
-
|
| 20 |
-
|
| 21 |
-
|
| 22 |
-
|
| 23 |
-
|
| 24 |
-
|
| 25 |
}
|
| 26 |
|
| 27 |
@pytest.mark.asyncio
|
| 28 |
async def test_basic_fetch(self, fetcher, urls):
|
| 29 |
"""Test doing basic fetch request with multiple statuses"""
|
| 30 |
-
response = await fetcher.async_fetch(urls[
|
| 31 |
assert response.status == 200
|
| 32 |
|
| 33 |
@pytest.mark.asyncio
|
| 34 |
async def test_networkidle(self, fetcher, urls):
|
| 35 |
"""Test if waiting for `networkidle` make page does not finish loading or not"""
|
| 36 |
-
response = await fetcher.async_fetch(urls[
|
| 37 |
assert response.status == 200
|
| 38 |
|
| 39 |
@pytest.mark.asyncio
|
| 40 |
async def test_blocking_resources(self, fetcher, urls):
|
| 41 |
"""Test if blocking resources make page does not finish loading or not"""
|
| 42 |
-
response = await fetcher.async_fetch(urls[
|
| 43 |
assert response.status == 200
|
| 44 |
|
| 45 |
@pytest.mark.asyncio
|
| 46 |
async def test_waiting_selector(self, fetcher, urls):
|
| 47 |
"""Test if waiting for a selector make page does not finish loading or not"""
|
| 48 |
-
response1 = await fetcher.async_fetch(urls[
|
| 49 |
assert response1.status == 200
|
| 50 |
|
| 51 |
-
response2 = await fetcher.async_fetch(
|
|
|
|
|
|
|
| 52 |
assert response2.status == 200
|
| 53 |
|
| 54 |
@pytest.mark.asyncio
|
| 55 |
async def test_cookies_loading(self, fetcher, urls):
|
| 56 |
"""Test if cookies are set after the request"""
|
| 57 |
-
response = await fetcher.async_fetch(urls[
|
| 58 |
-
assert response.cookies == {
|
| 59 |
|
| 60 |
@pytest.mark.asyncio
|
| 61 |
async def test_automation(self, fetcher, urls):
|
| 62 |
"""Test if automation break the code or not"""
|
|
|
|
| 63 |
async def scroll_page(page):
|
| 64 |
await page.mouse.wheel(10, 0)
|
| 65 |
await page.mouse.move(100, 400)
|
| 66 |
await page.mouse.up()
|
| 67 |
return page
|
| 68 |
|
| 69 |
-
response = await fetcher.async_fetch(urls[
|
| 70 |
assert response.status == 200
|
| 71 |
|
| 72 |
-
@pytest.mark.parametrize(
|
| 73 |
-
|
| 74 |
-
|
| 75 |
-
|
| 76 |
-
|
| 77 |
-
|
| 78 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 79 |
@pytest.mark.asyncio
|
| 80 |
async def test_properties(self, fetcher, urls, kwargs):
|
| 81 |
"""Test if different arguments breaks the code or not"""
|
| 82 |
-
response = await fetcher.async_fetch(urls[
|
| 83 |
assert response.status == 200
|
| 84 |
|
| 85 |
@pytest.mark.asyncio
|
| 86 |
async def test_cdp_url_invalid(self, fetcher, urls):
|
| 87 |
"""Test if invalid CDP URLs raise appropriate exceptions"""
|
| 88 |
with pytest.raises(ValueError):
|
| 89 |
-
await fetcher.async_fetch(urls[
|
| 90 |
|
| 91 |
with pytest.raises(ValueError):
|
| 92 |
-
await fetcher.async_fetch(
|
|
|
|
|
|
|
| 93 |
|
| 94 |
with pytest.raises(Exception):
|
| 95 |
-
await fetcher.async_fetch(urls[
|
| 96 |
|
| 97 |
@pytest.mark.asyncio
|
| 98 |
async def test_infinite_timeout(self, fetcher, urls):
|
| 99 |
"""Test if infinite timeout breaks the code or not"""
|
| 100 |
-
response = await fetcher.async_fetch(urls[
|
| 101 |
assert response.status == 200
|
|
|
|
| 15 |
@pytest.fixture
|
| 16 |
def urls(self, httpbin):
|
| 17 |
return {
|
| 18 |
+
"status_200": f"{httpbin.url}/status/200",
|
| 19 |
+
"status_404": f"{httpbin.url}/status/404",
|
| 20 |
+
"status_501": f"{httpbin.url}/status/501",
|
| 21 |
+
"basic_url": f"{httpbin.url}/get",
|
| 22 |
+
"html_url": f"{httpbin.url}/html",
|
| 23 |
+
"delayed_url": f"{httpbin.url}/delay/10",
|
| 24 |
+
"cookies_url": f"{httpbin.url}/cookies/set/test/value",
|
| 25 |
}
|
| 26 |
|
| 27 |
@pytest.mark.asyncio
|
| 28 |
async def test_basic_fetch(self, fetcher, urls):
|
| 29 |
"""Test doing basic fetch request with multiple statuses"""
|
| 30 |
+
response = await fetcher.async_fetch(urls["status_200"])
|
| 31 |
assert response.status == 200
|
| 32 |
|
| 33 |
@pytest.mark.asyncio
|
| 34 |
async def test_networkidle(self, fetcher, urls):
|
| 35 |
"""Test if waiting for `networkidle` make page does not finish loading or not"""
|
| 36 |
+
response = await fetcher.async_fetch(urls["basic_url"], network_idle=True)
|
| 37 |
assert response.status == 200
|
| 38 |
|
| 39 |
@pytest.mark.asyncio
|
| 40 |
async def test_blocking_resources(self, fetcher, urls):
|
| 41 |
"""Test if blocking resources make page does not finish loading or not"""
|
| 42 |
+
response = await fetcher.async_fetch(urls["basic_url"], disable_resources=True)
|
| 43 |
assert response.status == 200
|
| 44 |
|
| 45 |
@pytest.mark.asyncio
|
| 46 |
async def test_waiting_selector(self, fetcher, urls):
|
| 47 |
"""Test if waiting for a selector make page does not finish loading or not"""
|
| 48 |
+
response1 = await fetcher.async_fetch(urls["html_url"], wait_selector="h1")
|
| 49 |
assert response1.status == 200
|
| 50 |
|
| 51 |
+
response2 = await fetcher.async_fetch(
|
| 52 |
+
urls["html_url"], wait_selector="h1", wait_selector_state="visible"
|
| 53 |
+
)
|
| 54 |
assert response2.status == 200
|
| 55 |
|
| 56 |
@pytest.mark.asyncio
|
| 57 |
async def test_cookies_loading(self, fetcher, urls):
|
| 58 |
"""Test if cookies are set after the request"""
|
| 59 |
+
response = await fetcher.async_fetch(urls["cookies_url"])
|
| 60 |
+
assert response.cookies == {"test": "value"}
|
| 61 |
|
| 62 |
@pytest.mark.asyncio
|
| 63 |
async def test_automation(self, fetcher, urls):
|
| 64 |
"""Test if automation break the code or not"""
|
| 65 |
+
|
| 66 |
async def scroll_page(page):
|
| 67 |
await page.mouse.wheel(10, 0)
|
| 68 |
await page.mouse.move(100, 400)
|
| 69 |
await page.mouse.up()
|
| 70 |
return page
|
| 71 |
|
| 72 |
+
response = await fetcher.async_fetch(urls["html_url"], page_action=scroll_page)
|
| 73 |
assert response.status == 200
|
| 74 |
|
| 75 |
+
@pytest.mark.parametrize(
|
| 76 |
+
"kwargs",
|
| 77 |
+
[
|
| 78 |
+
{"disable_webgl": True, "hide_canvas": False},
|
| 79 |
+
{"disable_webgl": False, "hide_canvas": True},
|
| 80 |
+
# {"stealth": True}, # causes issues with Github Actions
|
| 81 |
+
{
|
| 82 |
+
"useragent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:131.0) Gecko/20100101 Firefox/131.0"
|
| 83 |
+
},
|
| 84 |
+
{"extra_headers": {"ayo": ""}},
|
| 85 |
+
],
|
| 86 |
+
)
|
| 87 |
@pytest.mark.asyncio
|
| 88 |
async def test_properties(self, fetcher, urls, kwargs):
|
| 89 |
"""Test if different arguments breaks the code or not"""
|
| 90 |
+
response = await fetcher.async_fetch(urls["html_url"], **kwargs)
|
| 91 |
assert response.status == 200
|
| 92 |
|
| 93 |
@pytest.mark.asyncio
|
| 94 |
async def test_cdp_url_invalid(self, fetcher, urls):
|
| 95 |
"""Test if invalid CDP URLs raise appropriate exceptions"""
|
| 96 |
with pytest.raises(ValueError):
|
| 97 |
+
await fetcher.async_fetch(urls["html_url"], cdp_url="blahblah")
|
| 98 |
|
| 99 |
with pytest.raises(ValueError):
|
| 100 |
+
await fetcher.async_fetch(
|
| 101 |
+
urls["html_url"], cdp_url="blahblah", nstbrowser_mode=True
|
| 102 |
+
)
|
| 103 |
|
| 104 |
with pytest.raises(Exception):
|
| 105 |
+
await fetcher.async_fetch(urls["html_url"], cdp_url="ws://blahblah")
|
| 106 |
|
| 107 |
@pytest.mark.asyncio
|
| 108 |
async def test_infinite_timeout(self, fetcher, urls):
|
| 109 |
"""Test if infinite timeout breaks the code or not"""
|
| 110 |
+
response = await fetcher.async_fetch(urls["delayed_url"], timeout=None)
|
| 111 |
assert response.status == 200
|
tests/fetchers/sync/test_camoufox.py
CHANGED
|
@@ -16,12 +16,12 @@ class TestStealthyFetcher:
|
|
| 16 |
@pytest.fixture(autouse=True)
|
| 17 |
def setup_urls(self, httpbin):
|
| 18 |
"""Fixture to set up URLs for testing"""
|
| 19 |
-
self.status_200 = f
|
| 20 |
-
self.status_404 = f
|
| 21 |
-
self.status_501 = f
|
| 22 |
-
self.basic_url = f
|
| 23 |
-
self.html_url = f
|
| 24 |
-
self.delayed_url = f
|
| 25 |
self.cookies_url = f"{httpbin.url}/cookies/set/test/value"
|
| 26 |
|
| 27 |
def test_basic_fetch(self, fetcher):
|
|
@@ -41,15 +41,21 @@ class TestStealthyFetcher:
|
|
| 41 |
|
| 42 |
def test_waiting_selector(self, fetcher):
|
| 43 |
"""Test if waiting for a selector make page does not finish loading or not"""
|
| 44 |
-
assert fetcher.fetch(self.html_url, wait_selector=
|
| 45 |
-
assert
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 46 |
|
| 47 |
def test_cookies_loading(self, fetcher):
|
| 48 |
"""Test if cookies are set after the request"""
|
| 49 |
-
assert fetcher.fetch(self.cookies_url).cookies == {
|
| 50 |
|
| 51 |
def test_automation(self, fetcher):
|
| 52 |
"""Test if automation break the code or not"""
|
|
|
|
| 53 |
def scroll_page(page):
|
| 54 |
page.mouse.wheel(10, 0)
|
| 55 |
page.mouse.move(100, 400)
|
|
@@ -60,10 +66,24 @@ class TestStealthyFetcher:
|
|
| 60 |
|
| 61 |
def test_properties(self, fetcher):
|
| 62 |
"""Test if different arguments breaks the code or not"""
|
| 63 |
-
assert
|
| 64 |
-
|
| 65 |
-
|
| 66 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 67 |
|
| 68 |
def test_infinite_timeout(self, fetcher):
|
| 69 |
"""Test if infinite timeout breaks the code or not"""
|
|
|
|
| 16 |
@pytest.fixture(autouse=True)
|
| 17 |
def setup_urls(self, httpbin):
|
| 18 |
"""Fixture to set up URLs for testing"""
|
| 19 |
+
self.status_200 = f"{httpbin.url}/status/200"
|
| 20 |
+
self.status_404 = f"{httpbin.url}/status/404"
|
| 21 |
+
self.status_501 = f"{httpbin.url}/status/501"
|
| 22 |
+
self.basic_url = f"{httpbin.url}/get"
|
| 23 |
+
self.html_url = f"{httpbin.url}/html"
|
| 24 |
+
self.delayed_url = f"{httpbin.url}/delay/10" # 10 Seconds delay response
|
| 25 |
self.cookies_url = f"{httpbin.url}/cookies/set/test/value"
|
| 26 |
|
| 27 |
def test_basic_fetch(self, fetcher):
|
|
|
|
| 41 |
|
| 42 |
def test_waiting_selector(self, fetcher):
|
| 43 |
"""Test if waiting for a selector make page does not finish loading or not"""
|
| 44 |
+
assert fetcher.fetch(self.html_url, wait_selector="h1").status == 200
|
| 45 |
+
assert (
|
| 46 |
+
fetcher.fetch(
|
| 47 |
+
self.html_url, wait_selector="h1", wait_selector_state="visible"
|
| 48 |
+
).status
|
| 49 |
+
== 200
|
| 50 |
+
)
|
| 51 |
|
| 52 |
def test_cookies_loading(self, fetcher):
|
| 53 |
"""Test if cookies are set after the request"""
|
| 54 |
+
assert fetcher.fetch(self.cookies_url).cookies == {"test": "value"}
|
| 55 |
|
| 56 |
def test_automation(self, fetcher):
|
| 57 |
"""Test if automation break the code or not"""
|
| 58 |
+
|
| 59 |
def scroll_page(page):
|
| 60 |
page.mouse.wheel(10, 0)
|
| 61 |
page.mouse.move(100, 400)
|
|
|
|
| 66 |
|
| 67 |
def test_properties(self, fetcher):
|
| 68 |
"""Test if different arguments breaks the code or not"""
|
| 69 |
+
assert (
|
| 70 |
+
fetcher.fetch(self.html_url, block_webrtc=True, allow_webgl=True).status
|
| 71 |
+
== 200
|
| 72 |
+
)
|
| 73 |
+
assert (
|
| 74 |
+
fetcher.fetch(self.html_url, block_webrtc=False, allow_webgl=True).status
|
| 75 |
+
== 200
|
| 76 |
+
)
|
| 77 |
+
assert (
|
| 78 |
+
fetcher.fetch(self.html_url, block_webrtc=True, allow_webgl=False).status
|
| 79 |
+
== 200
|
| 80 |
+
)
|
| 81 |
+
assert (
|
| 82 |
+
fetcher.fetch(
|
| 83 |
+
self.html_url, extra_headers={"ayo": ""}, os_randomize=True
|
| 84 |
+
).status
|
| 85 |
+
== 200
|
| 86 |
+
)
|
| 87 |
|
| 88 |
def test_infinite_timeout(self, fetcher):
|
| 89 |
"""Test if infinite timeout breaks the code or not"""
|
tests/fetchers/sync/test_httpx.py
CHANGED
|
@@ -16,14 +16,14 @@ class TestFetcher:
|
|
| 16 |
@pytest.fixture(autouse=True)
|
| 17 |
def setup_urls(self, httpbin):
|
| 18 |
"""Fixture to set up URLs for testing"""
|
| 19 |
-
self.status_200 = f
|
| 20 |
-
self.status_404 = f
|
| 21 |
-
self.status_501 = f
|
| 22 |
-
self.basic_url = f
|
| 23 |
-
self.post_url = f
|
| 24 |
-
self.put_url = f
|
| 25 |
-
self.delete_url = f
|
| 26 |
-
self.html_url = f
|
| 27 |
|
| 28 |
def test_basic_get(self, fetcher):
|
| 29 |
"""Test doing basic get request with multiple statuses"""
|
|
@@ -36,49 +36,86 @@ class TestFetcher:
|
|
| 36 |
assert fetcher.get(self.status_200, stealthy_headers=True).status == 200
|
| 37 |
assert fetcher.get(self.status_200, follow_redirects=True).status == 200
|
| 38 |
assert fetcher.get(self.status_200, timeout=None).status == 200
|
| 39 |
-
assert
|
| 40 |
-
|
| 41 |
-
|
| 42 |
-
|
| 43 |
-
|
| 44 |
-
|
|
|
|
|
|
|
|
|
|
| 45 |
|
| 46 |
def test_post_properties(self, fetcher):
|
| 47 |
"""Test if different arguments with POST request breaks the code or not"""
|
| 48 |
-
assert fetcher.post(self.post_url, data={
|
| 49 |
-
assert
|
| 50 |
-
|
| 51 |
-
|
| 52 |
-
|
| 53 |
-
|
| 54 |
-
|
| 55 |
-
|
| 56 |
-
|
| 57 |
-
|
| 58 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 59 |
|
| 60 |
def test_put_properties(self, fetcher):
|
| 61 |
"""Test if different arguments with PUT request breaks the code or not"""
|
| 62 |
-
assert fetcher.put(self.put_url, data={
|
| 63 |
-
assert
|
| 64 |
-
|
| 65 |
-
|
| 66 |
-
|
| 67 |
-
|
| 68 |
-
|
| 69 |
-
|
| 70 |
-
|
| 71 |
-
|
| 72 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 73 |
|
| 74 |
def test_delete_properties(self, fetcher):
|
| 75 |
"""Test if different arguments with DELETE request breaks the code or not"""
|
| 76 |
assert fetcher.delete(self.delete_url, stealthy_headers=True).status == 200
|
| 77 |
assert fetcher.delete(self.delete_url, follow_redirects=True).status == 200
|
| 78 |
assert fetcher.delete(self.delete_url, timeout=None).status == 200
|
| 79 |
-
assert
|
| 80 |
-
|
| 81 |
-
|
| 82 |
-
|
| 83 |
-
|
| 84 |
-
|
|
|
|
|
|
|
|
|
|
|
|
| 16 |
@pytest.fixture(autouse=True)
|
| 17 |
def setup_urls(self, httpbin):
|
| 18 |
"""Fixture to set up URLs for testing"""
|
| 19 |
+
self.status_200 = f"{httpbin.url}/status/200"
|
| 20 |
+
self.status_404 = f"{httpbin.url}/status/404"
|
| 21 |
+
self.status_501 = f"{httpbin.url}/status/501"
|
| 22 |
+
self.basic_url = f"{httpbin.url}/get"
|
| 23 |
+
self.post_url = f"{httpbin.url}/post"
|
| 24 |
+
self.put_url = f"{httpbin.url}/put"
|
| 25 |
+
self.delete_url = f"{httpbin.url}/delete"
|
| 26 |
+
self.html_url = f"{httpbin.url}/html"
|
| 27 |
|
| 28 |
def test_basic_get(self, fetcher):
|
| 29 |
"""Test doing basic get request with multiple statuses"""
|
|
|
|
| 36 |
assert fetcher.get(self.status_200, stealthy_headers=True).status == 200
|
| 37 |
assert fetcher.get(self.status_200, follow_redirects=True).status == 200
|
| 38 |
assert fetcher.get(self.status_200, timeout=None).status == 200
|
| 39 |
+
assert (
|
| 40 |
+
fetcher.get(
|
| 41 |
+
self.status_200,
|
| 42 |
+
stealthy_headers=True,
|
| 43 |
+
follow_redirects=True,
|
| 44 |
+
timeout=None,
|
| 45 |
+
).status
|
| 46 |
+
== 200
|
| 47 |
+
)
|
| 48 |
|
| 49 |
def test_post_properties(self, fetcher):
|
| 50 |
"""Test if different arguments with POST request breaks the code or not"""
|
| 51 |
+
assert fetcher.post(self.post_url, data={"key": "value"}).status == 200
|
| 52 |
+
assert (
|
| 53 |
+
fetcher.post(
|
| 54 |
+
self.post_url, data={"key": "value"}, stealthy_headers=True
|
| 55 |
+
).status
|
| 56 |
+
== 200
|
| 57 |
+
)
|
| 58 |
+
assert (
|
| 59 |
+
fetcher.post(
|
| 60 |
+
self.post_url, data={"key": "value"}, follow_redirects=True
|
| 61 |
+
).status
|
| 62 |
+
== 200
|
| 63 |
+
)
|
| 64 |
+
assert (
|
| 65 |
+
fetcher.post(self.post_url, data={"key": "value"}, timeout=None).status
|
| 66 |
+
== 200
|
| 67 |
+
)
|
| 68 |
+
assert (
|
| 69 |
+
fetcher.post(
|
| 70 |
+
self.post_url,
|
| 71 |
+
data={"key": "value"},
|
| 72 |
+
stealthy_headers=True,
|
| 73 |
+
follow_redirects=True,
|
| 74 |
+
timeout=None,
|
| 75 |
+
).status
|
| 76 |
+
== 200
|
| 77 |
+
)
|
| 78 |
|
| 79 |
def test_put_properties(self, fetcher):
|
| 80 |
"""Test if different arguments with PUT request breaks the code or not"""
|
| 81 |
+
assert fetcher.put(self.put_url, data={"key": "value"}).status == 200
|
| 82 |
+
assert (
|
| 83 |
+
fetcher.put(
|
| 84 |
+
self.put_url, data={"key": "value"}, stealthy_headers=True
|
| 85 |
+
).status
|
| 86 |
+
== 200
|
| 87 |
+
)
|
| 88 |
+
assert (
|
| 89 |
+
fetcher.put(
|
| 90 |
+
self.put_url, data={"key": "value"}, follow_redirects=True
|
| 91 |
+
).status
|
| 92 |
+
== 200
|
| 93 |
+
)
|
| 94 |
+
assert (
|
| 95 |
+
fetcher.put(self.put_url, data={"key": "value"}, timeout=None).status == 200
|
| 96 |
+
)
|
| 97 |
+
assert (
|
| 98 |
+
fetcher.put(
|
| 99 |
+
self.put_url,
|
| 100 |
+
data={"key": "value"},
|
| 101 |
+
stealthy_headers=True,
|
| 102 |
+
follow_redirects=True,
|
| 103 |
+
timeout=None,
|
| 104 |
+
).status
|
| 105 |
+
== 200
|
| 106 |
+
)
|
| 107 |
|
| 108 |
def test_delete_properties(self, fetcher):
|
| 109 |
"""Test if different arguments with DELETE request breaks the code or not"""
|
| 110 |
assert fetcher.delete(self.delete_url, stealthy_headers=True).status == 200
|
| 111 |
assert fetcher.delete(self.delete_url, follow_redirects=True).status == 200
|
| 112 |
assert fetcher.delete(self.delete_url, timeout=None).status == 200
|
| 113 |
+
assert (
|
| 114 |
+
fetcher.delete(
|
| 115 |
+
self.delete_url,
|
| 116 |
+
stealthy_headers=True,
|
| 117 |
+
follow_redirects=True,
|
| 118 |
+
timeout=None,
|
| 119 |
+
).status
|
| 120 |
+
== 200
|
| 121 |
+
)
|
tests/fetchers/sync/test_playwright.py
CHANGED
|
@@ -8,7 +8,6 @@ PlayWrightFetcher.auto_match = True
|
|
| 8 |
|
| 9 |
@pytest_httpbin.use_class_based_httpbin
|
| 10 |
class TestPlayWrightFetcher:
|
| 11 |
-
|
| 12 |
@pytest.fixture(scope="class")
|
| 13 |
def fetcher(self):
|
| 14 |
"""Fixture to create a StealthyFetcher instance for the entire test class"""
|
|
@@ -17,12 +16,12 @@ class TestPlayWrightFetcher:
|
|
| 17 |
@pytest.fixture(autouse=True)
|
| 18 |
def setup_urls(self, httpbin):
|
| 19 |
"""Fixture to set up URLs for testing"""
|
| 20 |
-
self.status_200 = f
|
| 21 |
-
self.status_404 = f
|
| 22 |
-
self.status_501 = f
|
| 23 |
-
self.basic_url = f
|
| 24 |
-
self.html_url = f
|
| 25 |
-
self.delayed_url = f
|
| 26 |
self.cookies_url = f"{httpbin.url}/cookies/set/test/value"
|
| 27 |
|
| 28 |
def test_basic_fetch(self, fetcher):
|
|
@@ -42,12 +41,17 @@ class TestPlayWrightFetcher:
|
|
| 42 |
|
| 43 |
def test_waiting_selector(self, fetcher):
|
| 44 |
"""Test if waiting for a selector make page does not finish loading or not"""
|
| 45 |
-
assert fetcher.fetch(self.html_url, wait_selector=
|
| 46 |
-
assert
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 47 |
|
| 48 |
def test_cookies_loading(self, fetcher):
|
| 49 |
"""Test if cookies are set after the request"""
|
| 50 |
-
assert fetcher.fetch(self.cookies_url).cookies == {
|
| 51 |
|
| 52 |
def test_automation(self, fetcher):
|
| 53 |
"""Test if automation break the code or not"""
|
|
@@ -60,13 +64,18 @@ class TestPlayWrightFetcher:
|
|
| 60 |
|
| 61 |
assert fetcher.fetch(self.html_url, page_action=scroll_page).status == 200
|
| 62 |
|
| 63 |
-
@pytest.mark.parametrize(
|
| 64 |
-
|
| 65 |
-
|
| 66 |
-
|
| 67 |
-
|
| 68 |
-
|
| 69 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 70 |
def test_properties(self, fetcher, kwargs):
|
| 71 |
"""Test if different arguments breaks the code or not"""
|
| 72 |
response = fetcher.fetch(self.html_url, **kwargs)
|
|
@@ -75,15 +84,18 @@ class TestPlayWrightFetcher:
|
|
| 75 |
def test_cdp_url_invalid(self, fetcher):
|
| 76 |
"""Test if invalid CDP URLs raise appropriate exceptions"""
|
| 77 |
with pytest.raises(ValueError):
|
| 78 |
-
fetcher.fetch(self.html_url, cdp_url=
|
| 79 |
|
| 80 |
with pytest.raises(ValueError):
|
| 81 |
-
fetcher.fetch(self.html_url, cdp_url=
|
| 82 |
|
| 83 |
with pytest.raises(Exception):
|
| 84 |
-
fetcher.fetch(self.html_url, cdp_url=
|
| 85 |
|
| 86 |
-
def test_infinite_timeout(
|
|
|
|
|
|
|
|
|
|
| 87 |
"""Test if infinite timeout breaks the code or not"""
|
| 88 |
response = fetcher.fetch(self.delayed_url, timeout=None)
|
| 89 |
assert response.status == 200
|
|
|
|
| 8 |
|
| 9 |
@pytest_httpbin.use_class_based_httpbin
|
| 10 |
class TestPlayWrightFetcher:
|
|
|
|
| 11 |
@pytest.fixture(scope="class")
|
| 12 |
def fetcher(self):
|
| 13 |
"""Fixture to create a StealthyFetcher instance for the entire test class"""
|
|
|
|
| 16 |
@pytest.fixture(autouse=True)
|
| 17 |
def setup_urls(self, httpbin):
|
| 18 |
"""Fixture to set up URLs for testing"""
|
| 19 |
+
self.status_200 = f"{httpbin.url}/status/200"
|
| 20 |
+
self.status_404 = f"{httpbin.url}/status/404"
|
| 21 |
+
self.status_501 = f"{httpbin.url}/status/501"
|
| 22 |
+
self.basic_url = f"{httpbin.url}/get"
|
| 23 |
+
self.html_url = f"{httpbin.url}/html"
|
| 24 |
+
self.delayed_url = f"{httpbin.url}/delay/10" # 10 Seconds delay response
|
| 25 |
self.cookies_url = f"{httpbin.url}/cookies/set/test/value"
|
| 26 |
|
| 27 |
def test_basic_fetch(self, fetcher):
|
|
|
|
| 41 |
|
| 42 |
def test_waiting_selector(self, fetcher):
|
| 43 |
"""Test if waiting for a selector make page does not finish loading or not"""
|
| 44 |
+
assert fetcher.fetch(self.html_url, wait_selector="h1").status == 200
|
| 45 |
+
assert (
|
| 46 |
+
fetcher.fetch(
|
| 47 |
+
self.html_url, wait_selector="h1", wait_selector_state="visible"
|
| 48 |
+
).status
|
| 49 |
+
== 200
|
| 50 |
+
)
|
| 51 |
|
| 52 |
def test_cookies_loading(self, fetcher):
|
| 53 |
"""Test if cookies are set after the request"""
|
| 54 |
+
assert fetcher.fetch(self.cookies_url).cookies == {"test": "value"}
|
| 55 |
|
| 56 |
def test_automation(self, fetcher):
|
| 57 |
"""Test if automation break the code or not"""
|
|
|
|
| 64 |
|
| 65 |
assert fetcher.fetch(self.html_url, page_action=scroll_page).status == 200
|
| 66 |
|
| 67 |
+
@pytest.mark.parametrize(
|
| 68 |
+
"kwargs",
|
| 69 |
+
[
|
| 70 |
+
{"disable_webgl": True, "hide_canvas": False},
|
| 71 |
+
{"disable_webgl": False, "hide_canvas": True},
|
| 72 |
+
# {"stealth": True}, # causes issues with Github Actions
|
| 73 |
+
{
|
| 74 |
+
"useragent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:131.0) Gecko/20100101 Firefox/131.0"
|
| 75 |
+
},
|
| 76 |
+
{"extra_headers": {"ayo": ""}},
|
| 77 |
+
],
|
| 78 |
+
)
|
| 79 |
def test_properties(self, fetcher, kwargs):
|
| 80 |
"""Test if different arguments breaks the code or not"""
|
| 81 |
response = fetcher.fetch(self.html_url, **kwargs)
|
|
|
|
| 84 |
def test_cdp_url_invalid(self, fetcher):
|
| 85 |
"""Test if invalid CDP URLs raise appropriate exceptions"""
|
| 86 |
with pytest.raises(ValueError):
|
| 87 |
+
fetcher.fetch(self.html_url, cdp_url="blahblah")
|
| 88 |
|
| 89 |
with pytest.raises(ValueError):
|
| 90 |
+
fetcher.fetch(self.html_url, cdp_url="blahblah", nstbrowser_mode=True)
|
| 91 |
|
| 92 |
with pytest.raises(Exception):
|
| 93 |
+
fetcher.fetch(self.html_url, cdp_url="ws://blahblah")
|
| 94 |
|
| 95 |
+
def test_infinite_timeout(
|
| 96 |
+
self,
|
| 97 |
+
fetcher,
|
| 98 |
+
):
|
| 99 |
"""Test if infinite timeout breaks the code or not"""
|
| 100 |
response = fetcher.fetch(self.delayed_url, timeout=None)
|
| 101 |
assert response.status == 200
|
tests/fetchers/test_utils.py
CHANGED
|
@@ -7,76 +7,117 @@ from scrapling.engines.toolbelt.custom import ResponseEncoding, StatusText
|
|
| 7 |
def content_type_map():
|
| 8 |
return {
|
| 9 |
# A map generated by ChatGPT for most possible `content_type` values and the expected outcome
|
| 10 |
-
|
| 11 |
-
|
| 12 |
-
|
| 13 |
-
|
| 14 |
-
|
| 15 |
-
|
| 16 |
-
|
| 17 |
-
|
| 18 |
-
|
| 19 |
-
|
| 20 |
-
|
| 21 |
-
|
| 22 |
-
|
| 23 |
-
|
| 24 |
-
|
| 25 |
-
|
| 26 |
-
'text/html; charset="UTF-8"':
|
| 27 |
-
'text/html; charset="ISO-8859-1"':
|
| 28 |
-
'text/html; charset="windows-1252"':
|
| 29 |
-
'application/json; charset="UTF-8"':
|
| 30 |
-
'application/json; charset="ISO-8859-1"':
|
| 31 |
-
'application/json; charset="windows-1252"':
|
| 32 |
-
'text/json; charset="UTF-8"':
|
| 33 |
-
'application/javascript; charset="UTF-8"':
|
| 34 |
-
'application/javascript; charset="ISO-8859-1"':
|
| 35 |
-
'text/plain; charset="UTF-8"':
|
| 36 |
-
'text/plain; charset="ISO-8859-1"':
|
| 37 |
-
'text/plain; charset="windows-1252"':
|
| 38 |
-
'application/xhtml+xml; charset="UTF-8"':
|
| 39 |
-
'application/xhtml+xml; charset="ISO-8859-1"':
|
| 40 |
-
'application/xhtml+xml; charset="windows-1252"':
|
| 41 |
-
'text/html; charset="US-ASCII"':
|
| 42 |
-
'application/json; charset="US-ASCII"':
|
| 43 |
-
'text/plain; charset="US-ASCII"':
|
| 44 |
-
'text/html; charset="Shift_JIS"':
|
| 45 |
-
'application/json; charset="Shift_JIS"':
|
| 46 |
-
'text/plain; charset="Shift_JIS"':
|
| 47 |
-
'application/xml; charset="UTF-8"':
|
| 48 |
-
'application/xml; charset="ISO-8859-1"':
|
| 49 |
-
|
| 50 |
-
'text/xml; charset="UTF-8"':
|
| 51 |
-
'text/xml; charset="ISO-8859-1"':
|
| 52 |
-
|
| 53 |
}
|
| 54 |
|
| 55 |
|
| 56 |
@pytest.fixture
|
| 57 |
def status_map():
|
| 58 |
return {
|
| 59 |
-
100: "Continue",
|
| 60 |
-
|
| 61 |
-
|
| 62 |
-
|
| 63 |
-
|
| 64 |
-
|
| 65 |
-
|
| 66 |
-
|
| 67 |
-
|
| 68 |
-
|
| 69 |
-
|
| 70 |
-
|
| 71 |
-
|
| 72 |
-
|
| 73 |
-
|
| 74 |
-
|
| 75 |
-
|
| 76 |
-
|
| 77 |
-
|
| 78 |
-
|
| 79 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 80 |
}
|
| 81 |
|
| 82 |
|
|
|
|
| 7 |
def content_type_map():
|
| 8 |
return {
|
| 9 |
# A map generated by ChatGPT for most possible `content_type` values and the expected outcome
|
| 10 |
+
"text/html; charset=UTF-8": "UTF-8",
|
| 11 |
+
"text/html; charset=ISO-8859-1": "ISO-8859-1",
|
| 12 |
+
"text/html": "ISO-8859-1",
|
| 13 |
+
"application/json; charset=UTF-8": "UTF-8",
|
| 14 |
+
"application/json": "utf-8",
|
| 15 |
+
"text/json": "utf-8",
|
| 16 |
+
"application/javascript; charset=UTF-8": "UTF-8",
|
| 17 |
+
"application/javascript": "utf-8",
|
| 18 |
+
"text/plain; charset=UTF-8": "UTF-8",
|
| 19 |
+
"text/plain; charset=ISO-8859-1": "ISO-8859-1",
|
| 20 |
+
"text/plain": "ISO-8859-1",
|
| 21 |
+
"application/xhtml+xml; charset=UTF-8": "UTF-8",
|
| 22 |
+
"application/xhtml+xml": "utf-8",
|
| 23 |
+
"text/html; charset=windows-1252": "windows-1252",
|
| 24 |
+
"application/json; charset=windows-1252": "windows-1252",
|
| 25 |
+
"text/plain; charset=windows-1252": "windows-1252",
|
| 26 |
+
'text/html; charset="UTF-8"': "UTF-8",
|
| 27 |
+
'text/html; charset="ISO-8859-1"': "ISO-8859-1",
|
| 28 |
+
'text/html; charset="windows-1252"': "windows-1252",
|
| 29 |
+
'application/json; charset="UTF-8"': "UTF-8",
|
| 30 |
+
'application/json; charset="ISO-8859-1"': "ISO-8859-1",
|
| 31 |
+
'application/json; charset="windows-1252"': "windows-1252",
|
| 32 |
+
'text/json; charset="UTF-8"': "UTF-8",
|
| 33 |
+
'application/javascript; charset="UTF-8"': "UTF-8",
|
| 34 |
+
'application/javascript; charset="ISO-8859-1"': "ISO-8859-1",
|
| 35 |
+
'text/plain; charset="UTF-8"': "UTF-8",
|
| 36 |
+
'text/plain; charset="ISO-8859-1"': "ISO-8859-1",
|
| 37 |
+
'text/plain; charset="windows-1252"': "windows-1252",
|
| 38 |
+
'application/xhtml+xml; charset="UTF-8"': "UTF-8",
|
| 39 |
+
'application/xhtml+xml; charset="ISO-8859-1"': "ISO-8859-1",
|
| 40 |
+
'application/xhtml+xml; charset="windows-1252"': "windows-1252",
|
| 41 |
+
'text/html; charset="US-ASCII"': "US-ASCII",
|
| 42 |
+
'application/json; charset="US-ASCII"': "US-ASCII",
|
| 43 |
+
'text/plain; charset="US-ASCII"': "US-ASCII",
|
| 44 |
+
'text/html; charset="Shift_JIS"': "Shift_JIS",
|
| 45 |
+
'application/json; charset="Shift_JIS"': "Shift_JIS",
|
| 46 |
+
'text/plain; charset="Shift_JIS"': "Shift_JIS",
|
| 47 |
+
'application/xml; charset="UTF-8"': "UTF-8",
|
| 48 |
+
'application/xml; charset="ISO-8859-1"': "ISO-8859-1",
|
| 49 |
+
"application/xml": "utf-8",
|
| 50 |
+
'text/xml; charset="UTF-8"': "UTF-8",
|
| 51 |
+
'text/xml; charset="ISO-8859-1"': "ISO-8859-1",
|
| 52 |
+
"text/xml": "utf-8",
|
| 53 |
}
|
| 54 |
|
| 55 |
|
| 56 |
@pytest.fixture
|
| 57 |
def status_map():
|
| 58 |
return {
|
| 59 |
+
100: "Continue",
|
| 60 |
+
101: "Switching Protocols",
|
| 61 |
+
102: "Processing",
|
| 62 |
+
103: "Early Hints",
|
| 63 |
+
200: "OK",
|
| 64 |
+
201: "Created",
|
| 65 |
+
202: "Accepted",
|
| 66 |
+
203: "Non-Authoritative Information",
|
| 67 |
+
204: "No Content",
|
| 68 |
+
205: "Reset Content",
|
| 69 |
+
206: "Partial Content",
|
| 70 |
+
207: "Multi-Status",
|
| 71 |
+
208: "Already Reported",
|
| 72 |
+
226: "IM Used",
|
| 73 |
+
300: "Multiple Choices",
|
| 74 |
+
301: "Moved Permanently",
|
| 75 |
+
302: "Found",
|
| 76 |
+
303: "See Other",
|
| 77 |
+
304: "Not Modified",
|
| 78 |
+
305: "Use Proxy",
|
| 79 |
+
307: "Temporary Redirect",
|
| 80 |
+
308: "Permanent Redirect",
|
| 81 |
+
400: "Bad Request",
|
| 82 |
+
401: "Unauthorized",
|
| 83 |
+
402: "Payment Required",
|
| 84 |
+
403: "Forbidden",
|
| 85 |
+
404: "Not Found",
|
| 86 |
+
405: "Method Not Allowed",
|
| 87 |
+
406: "Not Acceptable",
|
| 88 |
+
407: "Proxy Authentication Required",
|
| 89 |
+
408: "Request Timeout",
|
| 90 |
+
409: "Conflict",
|
| 91 |
+
410: "Gone",
|
| 92 |
+
411: "Length Required",
|
| 93 |
+
412: "Precondition Failed",
|
| 94 |
+
413: "Payload Too Large",
|
| 95 |
+
414: "URI Too Long",
|
| 96 |
+
415: "Unsupported Media Type",
|
| 97 |
+
416: "Range Not Satisfiable",
|
| 98 |
+
417: "Expectation Failed",
|
| 99 |
+
418: "I'm a teapot",
|
| 100 |
+
421: "Misdirected Request",
|
| 101 |
+
422: "Unprocessable Entity",
|
| 102 |
+
423: "Locked",
|
| 103 |
+
424: "Failed Dependency",
|
| 104 |
+
425: "Too Early",
|
| 105 |
+
426: "Upgrade Required",
|
| 106 |
+
428: "Precondition Required",
|
| 107 |
+
429: "Too Many Requests",
|
| 108 |
+
431: "Request Header Fields Too Large",
|
| 109 |
+
451: "Unavailable For Legal Reasons",
|
| 110 |
+
500: "Internal Server Error",
|
| 111 |
+
501: "Not Implemented",
|
| 112 |
+
502: "Bad Gateway",
|
| 113 |
+
503: "Service Unavailable",
|
| 114 |
+
504: "Gateway Timeout",
|
| 115 |
+
505: "HTTP Version Not Supported",
|
| 116 |
+
506: "Variant Also Negotiates",
|
| 117 |
+
507: "Insufficient Storage",
|
| 118 |
+
508: "Loop Detected",
|
| 119 |
+
510: "Not Extended",
|
| 120 |
+
511: "Network Authentication Required",
|
| 121 |
}
|
| 122 |
|
| 123 |
|
tests/parser/test_automatch.py
CHANGED
|
@@ -8,7 +8,7 @@ from scrapling import Adaptor
|
|
| 8 |
class TestParserAutoMatch:
|
| 9 |
def test_element_relocation(self):
|
| 10 |
"""Test relocating element after structure change"""
|
| 11 |
-
original_html =
|
| 12 |
<div class="container">
|
| 13 |
<section class="products">
|
| 14 |
<article class="product" id="p1">
|
|
@@ -21,8 +21,8 @@ class TestParserAutoMatch:
|
|
| 21 |
</article>
|
| 22 |
</section>
|
| 23 |
</div>
|
| 24 |
-
|
| 25 |
-
changed_html =
|
| 26 |
<div class="new-container">
|
| 27 |
<div class="product-wrapper">
|
| 28 |
<section class="products">
|
|
@@ -41,25 +41,25 @@ class TestParserAutoMatch:
|
|
| 41 |
</section>
|
| 42 |
</div>
|
| 43 |
</div>
|
| 44 |
-
|
| 45 |
|
| 46 |
-
old_page = Adaptor(original_html, url=
|
| 47 |
-
new_page = Adaptor(changed_html, url=
|
| 48 |
|
| 49 |
# 'p1' was used as ID and now it's not and all the path elements have changes
|
| 50 |
# Also at the same time testing auto-match vs combined selectors
|
| 51 |
-
_ = old_page.css(
|
| 52 |
-
relocated = new_page.css(
|
| 53 |
|
| 54 |
assert relocated is not None
|
| 55 |
-
assert relocated[0].attrib[
|
| 56 |
-
assert relocated[0].has_class(
|
| 57 |
-
assert relocated[0].css(
|
| 58 |
|
| 59 |
@pytest.mark.asyncio
|
| 60 |
async def test_element_relocation_async(self):
|
| 61 |
"""Test relocating element after structure change in async mode"""
|
| 62 |
-
original_html =
|
| 63 |
<div class="container">
|
| 64 |
<section class="products">
|
| 65 |
<article class="product" id="p1">
|
|
@@ -72,8 +72,8 @@ class TestParserAutoMatch:
|
|
| 72 |
</article>
|
| 73 |
</section>
|
| 74 |
</div>
|
| 75 |
-
|
| 76 |
-
changed_html =
|
| 77 |
<div class="new-container">
|
| 78 |
<div class="product-wrapper">
|
| 79 |
<section class="products">
|
|
@@ -92,20 +92,20 @@ class TestParserAutoMatch:
|
|
| 92 |
</section>
|
| 93 |
</div>
|
| 94 |
</div>
|
| 95 |
-
|
| 96 |
|
| 97 |
# Simulate async operation
|
| 98 |
await asyncio.sleep(0.1) # Minimal async operation
|
| 99 |
|
| 100 |
-
old_page = Adaptor(original_html, url=
|
| 101 |
-
new_page = Adaptor(changed_html, url=
|
| 102 |
|
| 103 |
# 'p1' was used as ID and now it's not and all the path elements have changes
|
| 104 |
# Also at the same time testing auto-match vs combined selectors
|
| 105 |
-
_ = old_page.css(
|
| 106 |
-
relocated = new_page.css(
|
| 107 |
|
| 108 |
assert relocated is not None
|
| 109 |
-
assert relocated[0].attrib[
|
| 110 |
-
assert relocated[0].has_class(
|
| 111 |
-
assert relocated[0].css(
|
|
|
|
| 8 |
class TestParserAutoMatch:
|
| 9 |
def test_element_relocation(self):
|
| 10 |
"""Test relocating element after structure change"""
|
| 11 |
+
original_html = """
|
| 12 |
<div class="container">
|
| 13 |
<section class="products">
|
| 14 |
<article class="product" id="p1">
|
|
|
|
| 21 |
</article>
|
| 22 |
</section>
|
| 23 |
</div>
|
| 24 |
+
"""
|
| 25 |
+
changed_html = """
|
| 26 |
<div class="new-container">
|
| 27 |
<div class="product-wrapper">
|
| 28 |
<section class="products">
|
|
|
|
| 41 |
</section>
|
| 42 |
</div>
|
| 43 |
</div>
|
| 44 |
+
"""
|
| 45 |
|
| 46 |
+
old_page = Adaptor(original_html, url="example.com", auto_match=True)
|
| 47 |
+
new_page = Adaptor(changed_html, url="example.com", auto_match=True)
|
| 48 |
|
| 49 |
# 'p1' was used as ID and now it's not and all the path elements have changes
|
| 50 |
# Also at the same time testing auto-match vs combined selectors
|
| 51 |
+
_ = old_page.css("#p1, #p2", auto_save=True)[0]
|
| 52 |
+
relocated = new_page.css("#p1", auto_match=True)
|
| 53 |
|
| 54 |
assert relocated is not None
|
| 55 |
+
assert relocated[0].attrib["data-id"] == "p1"
|
| 56 |
+
assert relocated[0].has_class("new-class")
|
| 57 |
+
assert relocated[0].css(".new-description")[0].text == "Description 1"
|
| 58 |
|
| 59 |
@pytest.mark.asyncio
|
| 60 |
async def test_element_relocation_async(self):
|
| 61 |
"""Test relocating element after structure change in async mode"""
|
| 62 |
+
original_html = """
|
| 63 |
<div class="container">
|
| 64 |
<section class="products">
|
| 65 |
<article class="product" id="p1">
|
|
|
|
| 72 |
</article>
|
| 73 |
</section>
|
| 74 |
</div>
|
| 75 |
+
"""
|
| 76 |
+
changed_html = """
|
| 77 |
<div class="new-container">
|
| 78 |
<div class="product-wrapper">
|
| 79 |
<section class="products">
|
|
|
|
| 92 |
</section>
|
| 93 |
</div>
|
| 94 |
</div>
|
| 95 |
+
"""
|
| 96 |
|
| 97 |
# Simulate async operation
|
| 98 |
await asyncio.sleep(0.1) # Minimal async operation
|
| 99 |
|
| 100 |
+
old_page = Adaptor(original_html, url="example.com", auto_match=True)
|
| 101 |
+
new_page = Adaptor(changed_html, url="example.com", auto_match=True)
|
| 102 |
|
| 103 |
# 'p1' was used as ID and now it's not and all the path elements have changes
|
| 104 |
# Also at the same time testing auto-match vs combined selectors
|
| 105 |
+
_ = old_page.css("#p1, #p2", auto_save=True)[0]
|
| 106 |
+
relocated = new_page.css("#p1", auto_match=True)
|
| 107 |
|
| 108 |
assert relocated is not None
|
| 109 |
+
assert relocated[0].attrib["data-id"] == "p1"
|
| 110 |
+
assert relocated[0].has_class("new-class")
|
| 111 |
+
assert relocated[0].css(".new-description")[0].text == "Description 1"
|
tests/parser/test_general.py
CHANGED
|
@@ -9,7 +9,7 @@ from scrapling import Adaptor
|
|
| 9 |
|
| 10 |
@pytest.fixture
|
| 11 |
def html_content():
|
| 12 |
-
return
|
| 13 |
<html>
|
| 14 |
<head>
|
| 15 |
<title>Complex Web Page</title>
|
|
@@ -73,7 +73,7 @@ def html_content():
|
|
| 73 |
</script>
|
| 74 |
</body>
|
| 75 |
</html>
|
| 76 |
-
|
| 77 |
|
| 78 |
|
| 79 |
@pytest.fixture
|
|
@@ -85,13 +85,14 @@ def page(html_content):
|
|
| 85 |
class TestCSSSelectors:
|
| 86 |
def test_basic_product_selection(self, page):
|
| 87 |
"""Test selecting all product elements"""
|
| 88 |
-
elements = page.css(
|
| 89 |
assert len(elements) == 3
|
| 90 |
|
| 91 |
def test_in_stock_product_selection(self, page):
|
| 92 |
"""Test selecting in-stock products"""
|
| 93 |
in_stock_products = page.css(
|
| 94 |
-
'main #products .product-list article.product:not(:contains("Out of stock"))'
|
|
|
|
| 95 |
assert len(in_stock_products) == 2
|
| 96 |
|
| 97 |
|
|
@@ -117,22 +118,26 @@ class TestXPathSelectors:
|
|
| 117 |
class TestTextMatching:
|
| 118 |
def test_regex_multiple_matches(self, page):
|
| 119 |
"""Test finding multiple matches with regex"""
|
| 120 |
-
stock_info = page.find_by_regex(r
|
| 121 |
assert len(stock_info) == 2
|
| 122 |
|
| 123 |
def test_regex_first_match(self, page):
|
| 124 |
"""Test finding the first match with regex"""
|
| 125 |
-
stock_info = page.find_by_regex(
|
| 126 |
-
|
|
|
|
|
|
|
| 127 |
|
| 128 |
def test_partial_text_match(self, page):
|
| 129 |
"""Test finding elements with partial text match"""
|
| 130 |
-
stock_info = page.find_by_text(r
|
| 131 |
assert len(stock_info) == 2
|
| 132 |
|
| 133 |
def test_exact_text_match(self, page):
|
| 134 |
"""Test finding elements with exact text match"""
|
| 135 |
-
out_of_stock = page.find_by_text(
|
|
|
|
|
|
|
| 136 |
assert len(out_of_stock) == 1
|
| 137 |
|
| 138 |
|
|
@@ -140,17 +145,17 @@ class TestTextMatching:
|
|
| 140 |
class TestSimilarElements:
|
| 141 |
def test_finding_similar_products(self, page):
|
| 142 |
"""Test finding similar product elements"""
|
| 143 |
-
first_product = page.css_first(
|
| 144 |
similar_products = first_product.find_similar()
|
| 145 |
assert len(similar_products) == 2
|
| 146 |
|
| 147 |
def test_finding_similar_reviews(self, page):
|
| 148 |
"""Test finding similar review elements with additional filtering"""
|
| 149 |
-
first_review = page.find(
|
| 150 |
similar_high_rated_reviews = [
|
| 151 |
review
|
| 152 |
for review in first_review.find_similar()
|
| 153 |
-
if int(review.attrib.get(
|
| 154 |
]
|
| 155 |
assert len(similar_high_rated_reviews) == 1
|
| 156 |
|
|
@@ -181,17 +186,17 @@ class TestErrorHandling:
|
|
| 181 |
def test_bad_selectors(self, page):
|
| 182 |
"""Test handling of invalid selectors"""
|
| 183 |
with pytest.raises((SelectorError, SelectorSyntaxError)):
|
| 184 |
-
page.css(
|
| 185 |
|
| 186 |
with pytest.raises((SelectorError, SelectorSyntaxError)):
|
| 187 |
-
page.xpath(
|
| 188 |
|
| 189 |
|
| 190 |
# Pickling and Object Representation Tests
|
| 191 |
class TestPicklingAndRepresentation:
|
| 192 |
def test_unpickleable_objects(self, page):
|
| 193 |
"""Test that Adaptor objects cannot be pickled"""
|
| 194 |
-
table = page.css(
|
| 195 |
with pytest.raises(TypeError):
|
| 196 |
pickle.dumps(table)
|
| 197 |
|
|
@@ -200,7 +205,7 @@ class TestPicklingAndRepresentation:
|
|
| 200 |
|
| 201 |
def test_string_representations(self, page):
|
| 202 |
"""Test custom string representations of objects"""
|
| 203 |
-
table = page.css(
|
| 204 |
assert issubclass(type(table.__str__()), str)
|
| 205 |
assert issubclass(type(table.__repr__()), str)
|
| 206 |
assert issubclass(type(table.attrib.__str__()), str)
|
|
@@ -211,40 +216,40 @@ class TestPicklingAndRepresentation:
|
|
| 211 |
class TestElementNavigation:
|
| 212 |
def test_basic_navigation_properties(self, page):
|
| 213 |
"""Test basic navigation properties of elements"""
|
| 214 |
-
table = page.css(
|
| 215 |
assert table.path is not None
|
| 216 |
-
assert table.html_content !=
|
| 217 |
-
assert table.prettify() !=
|
| 218 |
|
| 219 |
def test_parent_and_sibling_navigation(self, page):
|
| 220 |
"""Test parent and sibling navigation"""
|
| 221 |
-
table = page.css(
|
| 222 |
parent = table.parent
|
| 223 |
-
assert parent.attrib[
|
| 224 |
|
| 225 |
parent_siblings = parent.siblings
|
| 226 |
assert len(parent_siblings) == 1
|
| 227 |
|
| 228 |
def test_child_navigation(self, page):
|
| 229 |
"""Test child navigation"""
|
| 230 |
-
table = page.css(
|
| 231 |
children = table.children
|
| 232 |
assert len(children) == 3
|
| 233 |
|
| 234 |
def test_next_and_previous_navigation(self, page):
|
| 235 |
"""Test next and previous element navigation"""
|
| 236 |
-
child = page.css(
|
| 237 |
next_element = child.next
|
| 238 |
-
assert next_element.attrib[
|
| 239 |
|
| 240 |
prev_element = next_element.previous
|
| 241 |
assert prev_element.tag == child.tag
|
| 242 |
|
| 243 |
def test_ancestor_finding(self, page):
|
| 244 |
"""Test finding ancestors of elements"""
|
| 245 |
-
all_prices = page.css(
|
| 246 |
products_with_prices = [
|
| 247 |
-
price.find_ancestor(lambda p: p.has_class(
|
| 248 |
for price in all_prices
|
| 249 |
]
|
| 250 |
assert len(products_with_prices) == 3
|
|
@@ -254,52 +259,59 @@ class TestElementNavigation:
|
|
| 254 |
class TestJSONAndAttributes:
|
| 255 |
def test_json_conversion(self, page):
|
| 256 |
"""Test converting content to JSON"""
|
| 257 |
-
script_content = page.css(
|
| 258 |
assert issubclass(type(script_content.sort()), str)
|
| 259 |
page_data = script_content.json()
|
| 260 |
-
assert page_data[
|
| 261 |
-
assert
|
| 262 |
|
| 263 |
def test_attribute_operations(self, page):
|
| 264 |
"""Test various attribute-related operations"""
|
| 265 |
# Product ID extraction
|
| 266 |
-
products = page.css(
|
| 267 |
-
product_ids = [product.attrib[
|
| 268 |
-
assert product_ids == [
|
| 269 |
-
assert
|
| 270 |
|
| 271 |
# Review rating calculations
|
| 272 |
-
reviews = page.css(
|
| 273 |
-
review_ratings = [int(review.attrib[
|
| 274 |
assert sum(review_ratings) / len(review_ratings) == 4.5
|
| 275 |
|
| 276 |
# Attribute searching
|
| 277 |
-
key_value = list(products[0].attrib.search_values(
|
| 278 |
-
assert list(key_value[0].keys()) == [
|
| 279 |
|
| 280 |
-
key_value = list(products[0].attrib.search_values(
|
| 281 |
-
assert list(key_value[0].keys()) == [
|
| 282 |
|
| 283 |
# JSON attribute conversion
|
| 284 |
-
attr_json = page.css_first(
|
| 285 |
-
assert attr_json == {
|
| 286 |
-
assert isinstance(page.css(
|
| 287 |
|
| 288 |
|
| 289 |
# Performance Test
|
| 290 |
def test_large_html_parsing_performance():
|
| 291 |
"""Test parsing and selecting performance on large HTML"""
|
| 292 |
-
large_html =
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 293 |
|
| 294 |
start_time = time.time()
|
| 295 |
parsed = Adaptor(large_html, auto_match=False)
|
| 296 |
-
elements = parsed.css(
|
| 297 |
end_time = time.time()
|
| 298 |
|
| 299 |
assert len(elements) == 5000
|
| 300 |
# Converting 5000 elements to a class and doing operations on them will take time
|
| 301 |
# Based on my tests with 100 runs, 1 loop each Scrapling (given the extra work/features) takes 10.4ms on average
|
| 302 |
-
assert
|
|
|
|
|
|
|
| 303 |
|
| 304 |
|
| 305 |
# Selector Generation Test
|
|
@@ -318,13 +330,13 @@ def test_selectors_generation(page):
|
|
| 318 |
# Miscellaneous Tests
|
| 319 |
def test_getting_all_text(page):
|
| 320 |
"""Test getting all text from the page"""
|
| 321 |
-
assert page.get_all_text() !=
|
| 322 |
|
| 323 |
|
| 324 |
def test_regex_on_text(page):
|
| 325 |
"""Test regex operations on text"""
|
| 326 |
element = page.css('[data-id="1"] .price')[0]
|
| 327 |
-
match = element.re_first(r
|
| 328 |
-
assert match ==
|
| 329 |
-
match = element.text.re(r
|
| 330 |
assert len(match) == 2
|
|
|
|
| 9 |
|
| 10 |
@pytest.fixture
|
| 11 |
def html_content():
|
| 12 |
+
return """
|
| 13 |
<html>
|
| 14 |
<head>
|
| 15 |
<title>Complex Web Page</title>
|
|
|
|
| 73 |
</script>
|
| 74 |
</body>
|
| 75 |
</html>
|
| 76 |
+
"""
|
| 77 |
|
| 78 |
|
| 79 |
@pytest.fixture
|
|
|
|
| 85 |
class TestCSSSelectors:
|
| 86 |
def test_basic_product_selection(self, page):
|
| 87 |
"""Test selecting all product elements"""
|
| 88 |
+
elements = page.css("main #products .product-list article.product")
|
| 89 |
assert len(elements) == 3
|
| 90 |
|
| 91 |
def test_in_stock_product_selection(self, page):
|
| 92 |
"""Test selecting in-stock products"""
|
| 93 |
in_stock_products = page.css(
|
| 94 |
+
'main #products .product-list article.product:not(:contains("Out of stock"))'
|
| 95 |
+
)
|
| 96 |
assert len(in_stock_products) == 2
|
| 97 |
|
| 98 |
|
|
|
|
| 118 |
class TestTextMatching:
|
| 119 |
def test_regex_multiple_matches(self, page):
|
| 120 |
"""Test finding multiple matches with regex"""
|
| 121 |
+
stock_info = page.find_by_regex(r"In stock: \d+", first_match=False)
|
| 122 |
assert len(stock_info) == 2
|
| 123 |
|
| 124 |
def test_regex_first_match(self, page):
|
| 125 |
"""Test finding the first match with regex"""
|
| 126 |
+
stock_info = page.find_by_regex(
|
| 127 |
+
r"In stock: \d+", first_match=True, case_sensitive=True
|
| 128 |
+
)
|
| 129 |
+
assert stock_info.text == "In stock: 5"
|
| 130 |
|
| 131 |
def test_partial_text_match(self, page):
|
| 132 |
"""Test finding elements with partial text match"""
|
| 133 |
+
stock_info = page.find_by_text(r"In stock:", partial=True, first_match=False)
|
| 134 |
assert len(stock_info) == 2
|
| 135 |
|
| 136 |
def test_exact_text_match(self, page):
|
| 137 |
"""Test finding elements with exact text match"""
|
| 138 |
+
out_of_stock = page.find_by_text(
|
| 139 |
+
"Out of stock", partial=False, first_match=False
|
| 140 |
+
)
|
| 141 |
assert len(out_of_stock) == 1
|
| 142 |
|
| 143 |
|
|
|
|
| 145 |
class TestSimilarElements:
|
| 146 |
def test_finding_similar_products(self, page):
|
| 147 |
"""Test finding similar product elements"""
|
| 148 |
+
first_product = page.css_first(".product")
|
| 149 |
similar_products = first_product.find_similar()
|
| 150 |
assert len(similar_products) == 2
|
| 151 |
|
| 152 |
def test_finding_similar_reviews(self, page):
|
| 153 |
"""Test finding similar review elements with additional filtering"""
|
| 154 |
+
first_review = page.find("div", class_="review")
|
| 155 |
similar_high_rated_reviews = [
|
| 156 |
review
|
| 157 |
for review in first_review.find_similar()
|
| 158 |
+
if int(review.attrib.get("data-rating", 0)) >= 4
|
| 159 |
]
|
| 160 |
assert len(similar_high_rated_reviews) == 1
|
| 161 |
|
|
|
|
| 186 |
def test_bad_selectors(self, page):
|
| 187 |
"""Test handling of invalid selectors"""
|
| 188 |
with pytest.raises((SelectorError, SelectorSyntaxError)):
|
| 189 |
+
page.css("4 ayo")
|
| 190 |
|
| 191 |
with pytest.raises((SelectorError, SelectorSyntaxError)):
|
| 192 |
+
page.xpath("4 ayo")
|
| 193 |
|
| 194 |
|
| 195 |
# Pickling and Object Representation Tests
|
| 196 |
class TestPicklingAndRepresentation:
|
| 197 |
def test_unpickleable_objects(self, page):
|
| 198 |
"""Test that Adaptor objects cannot be pickled"""
|
| 199 |
+
table = page.css(".product-list")[0]
|
| 200 |
with pytest.raises(TypeError):
|
| 201 |
pickle.dumps(table)
|
| 202 |
|
|
|
|
| 205 |
|
| 206 |
def test_string_representations(self, page):
|
| 207 |
"""Test custom string representations of objects"""
|
| 208 |
+
table = page.css(".product-list")[0]
|
| 209 |
assert issubclass(type(table.__str__()), str)
|
| 210 |
assert issubclass(type(table.__repr__()), str)
|
| 211 |
assert issubclass(type(table.attrib.__str__()), str)
|
|
|
|
| 216 |
class TestElementNavigation:
|
| 217 |
def test_basic_navigation_properties(self, page):
|
| 218 |
"""Test basic navigation properties of elements"""
|
| 219 |
+
table = page.css(".product-list")[0]
|
| 220 |
assert table.path is not None
|
| 221 |
+
assert table.html_content != ""
|
| 222 |
+
assert table.prettify() != ""
|
| 223 |
|
| 224 |
def test_parent_and_sibling_navigation(self, page):
|
| 225 |
"""Test parent and sibling navigation"""
|
| 226 |
+
table = page.css(".product-list")[0]
|
| 227 |
parent = table.parent
|
| 228 |
+
assert parent.attrib["id"] == "products"
|
| 229 |
|
| 230 |
parent_siblings = parent.siblings
|
| 231 |
assert len(parent_siblings) == 1
|
| 232 |
|
| 233 |
def test_child_navigation(self, page):
|
| 234 |
"""Test child navigation"""
|
| 235 |
+
table = page.css(".product-list")[0]
|
| 236 |
children = table.children
|
| 237 |
assert len(children) == 3
|
| 238 |
|
| 239 |
def test_next_and_previous_navigation(self, page):
|
| 240 |
"""Test next and previous element navigation"""
|
| 241 |
+
child = page.css(".product-list")[0].find({"data-id": "1"})
|
| 242 |
next_element = child.next
|
| 243 |
+
assert next_element.attrib["data-id"] == "2"
|
| 244 |
|
| 245 |
prev_element = next_element.previous
|
| 246 |
assert prev_element.tag == child.tag
|
| 247 |
|
| 248 |
def test_ancestor_finding(self, page):
|
| 249 |
"""Test finding ancestors of elements"""
|
| 250 |
+
all_prices = page.css(".price")
|
| 251 |
products_with_prices = [
|
| 252 |
+
price.find_ancestor(lambda p: p.has_class("product"))
|
| 253 |
for price in all_prices
|
| 254 |
]
|
| 255 |
assert len(products_with_prices) == 3
|
|
|
|
| 259 |
class TestJSONAndAttributes:
|
| 260 |
def test_json_conversion(self, page):
|
| 261 |
"""Test converting content to JSON"""
|
| 262 |
+
script_content = page.css("#page-data::text")[0]
|
| 263 |
assert issubclass(type(script_content.sort()), str)
|
| 264 |
page_data = script_content.json()
|
| 265 |
+
assert page_data["totalProducts"] == 3
|
| 266 |
+
assert "lastUpdated" in page_data
|
| 267 |
|
| 268 |
def test_attribute_operations(self, page):
|
| 269 |
"""Test various attribute-related operations"""
|
| 270 |
# Product ID extraction
|
| 271 |
+
products = page.css(".product")
|
| 272 |
+
product_ids = [product.attrib["data-id"] for product in products]
|
| 273 |
+
assert product_ids == ["1", "2", "3"]
|
| 274 |
+
assert "data-id" in products[0].attrib
|
| 275 |
|
| 276 |
# Review rating calculations
|
| 277 |
+
reviews = page.css(".review")
|
| 278 |
+
review_ratings = [int(review.attrib["data-rating"]) for review in reviews]
|
| 279 |
assert sum(review_ratings) / len(review_ratings) == 4.5
|
| 280 |
|
| 281 |
# Attribute searching
|
| 282 |
+
key_value = list(products[0].attrib.search_values("1", partial=False))
|
| 283 |
+
assert list(key_value[0].keys()) == ["data-id"]
|
| 284 |
|
| 285 |
+
key_value = list(products[0].attrib.search_values("1", partial=True))
|
| 286 |
+
assert list(key_value[0].keys()) == ["data-id"]
|
| 287 |
|
| 288 |
# JSON attribute conversion
|
| 289 |
+
attr_json = page.css_first("#products").attrib["schema"].json()
|
| 290 |
+
assert attr_json == {"jsonable": "data"}
|
| 291 |
+
assert isinstance(page.css("#products")[0].attrib.json_string, bytes)
|
| 292 |
|
| 293 |
|
| 294 |
# Performance Test
|
| 295 |
def test_large_html_parsing_performance():
|
| 296 |
"""Test parsing and selecting performance on large HTML"""
|
| 297 |
+
large_html = (
|
| 298 |
+
"<html><body>"
|
| 299 |
+
+ '<div class="item">' * 5000
|
| 300 |
+
+ "</div>" * 5000
|
| 301 |
+
+ "</body></html>"
|
| 302 |
+
)
|
| 303 |
|
| 304 |
start_time = time.time()
|
| 305 |
parsed = Adaptor(large_html, auto_match=False)
|
| 306 |
+
elements = parsed.css(".item")
|
| 307 |
end_time = time.time()
|
| 308 |
|
| 309 |
assert len(elements) == 5000
|
| 310 |
# Converting 5000 elements to a class and doing operations on them will take time
|
| 311 |
# Based on my tests with 100 runs, 1 loop each Scrapling (given the extra work/features) takes 10.4ms on average
|
| 312 |
+
assert (
|
| 313 |
+
end_time - start_time < 0.5
|
| 314 |
+
) # Locally I test on 0.1 but on GitHub actions with browsers and threading sometimes closing adds fractions of seconds
|
| 315 |
|
| 316 |
|
| 317 |
# Selector Generation Test
|
|
|
|
| 330 |
# Miscellaneous Tests
|
| 331 |
def test_getting_all_text(page):
|
| 332 |
"""Test getting all text from the page"""
|
| 333 |
+
assert page.get_all_text() != ""
|
| 334 |
|
| 335 |
|
| 336 |
def test_regex_on_text(page):
|
| 337 |
"""Test regex operations on text"""
|
| 338 |
element = page.css('[data-id="1"] .price')[0]
|
| 339 |
+
match = element.re_first(r"[\.\d]+")
|
| 340 |
+
assert match == "10.99"
|
| 341 |
+
match = element.text.re(r"(\d+)", replace_entities=False)
|
| 342 |
assert len(match) == 2
|