Karim shoair committed
Commit e5ecf76 · Parent(s): c4135c8
refactor: Making all the codebase acceptable by PyRight
- scrapling/core/_types.py +3 -0
- scrapling/core/ai.py +2 -1
- scrapling/core/custom_types.py +20 -27
- scrapling/core/mixins.py +15 -9
- scrapling/core/shell.py +4 -3
- scrapling/core/storage.py +5 -5
- scrapling/core/translator.py +13 -8
- scrapling/engines/_browsers/_base.py +28 -11
- scrapling/engines/_browsers/_camoufox.py +12 -5
- scrapling/engines/_browsers/_config_tools.py +1 -1
- scrapling/engines/_browsers/_controllers.py +14 -7
- scrapling/engines/_browsers/_validators.py +28 -11
- scrapling/engines/static.py +4 -4
- scrapling/engines/toolbelt/convertor.py +6 -4
- scrapling/engines/toolbelt/custom.py +6 -9
- scrapling/engines/toolbelt/fingerprints.py +17 -10
- scrapling/engines/toolbelt/navigation.py +11 -3
- scrapling/fetchers/__init__.py +11 -1
- scrapling/fetchers/chrome.py +3 -4
- scrapling/fetchers/firefox.py +0 -4
- scrapling/parser.py +105 -80
scrapling/core/_types.py
CHANGED
@@ -12,9 +12,11 @@ from typing import (
     Generator,
     Iterable,
     List,
+    Set,
     Literal,
     Optional,
     Pattern,
+    Sequence,
     Tuple,
     TypeVar,
     Union,
@@ -22,6 +24,7 @@ from typing import (
     Mapping,
     Awaitable,
     Protocol,
+    Coroutine,
     SupportsIndex,
 )
scrapling/core/ai.py
CHANGED
@@ -20,6 +20,7 @@ from scrapling.core._types import (
     Mapping,
     Dict,
     List,
+    Any,
     SelectorWaitStates,
     Generator,
 )
@@ -171,7 +172,7 @@ class ScraplingMCPServer:
         :param stealthy_headers: If enabled (default), it creates and adds real browser headers. It also sets the referer header as if this request came from a Google search of URL's domain.
         """
         async with FetcherSession() as session:
-            tasks = [
+            tasks: List[Any] = [
                 session.get(
                     url,
                     auth=auth,
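The `tasks: List[Any]` annotation keeps Pyright from pinning the list to one inferred coroutine type and then arguing with whatever consumes it. A minimal, runnable sketch of the same fix — `fetch` and the URLs are stand-ins, not Scrapling's API:

import asyncio
from typing import Any, List


async def fetch(url: str) -> str:
    # Stand-in for session.get(); returns the URL as a fake response body.
    await asyncio.sleep(0)
    return url


async def main() -> None:
    urls = ["https://a.example", "https://b.example"]
    # Without the annotation, Pyright infers List[Coroutine[Any, Any, str]],
    # which must then line up exactly with the consumer's expectations.
    # An explicit List[Any] trades precision for a quiet checker.
    tasks: List[Any] = [fetch(url) for url in urls]
    results = await asyncio.gather(*tasks)
    print(results)


asyncio.run(main())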
scrapling/core/custom_types.py
CHANGED
@@ -5,6 +5,7 @@ from re import compile as re_compile, UNICODE, IGNORECASE
 from orjson import dumps, loads

 from scrapling.core._types import (
+    Any,
     cast,
     Dict,
     List,
@@ -14,7 +15,6 @@ from scrapling.core._types import (
     Literal,
     Pattern,
     Iterable,
-    Optional,
     Generator,
     SupportsIndex,
 )
@@ -33,23 +33,20 @@ class TextHandler(str):

     def __getitem__(self, key: SupportsIndex | slice) -> "TextHandler":  # pragma: no cover
         lst = super().__getitem__(key)
-        return …
+        return TextHandler(lst)

-    def split(
-        …
-            [TextHandler(s) for s in super().split(sep, maxsplit)],
-        )
-    )
+    def split(
+        self, sep: str | None = None, maxsplit: SupportsIndex = -1
+    ) -> Union[List, "TextHandlers"]:  # pragma: no cover
+        return TextHandlers([TextHandler(s) for s in super().split(sep, maxsplit)])

-    def strip(self, chars: str = None) -> Union[str, "TextHandler"]:  # pragma: no cover
+    def strip(self, chars: str | None = None) -> Union[str, "TextHandler"]:  # pragma: no cover
         return TextHandler(super().strip(chars))

-    def lstrip(self, chars: str = None) -> Union[str, "TextHandler"]:  # pragma: no cover
+    def lstrip(self, chars: str | None = None) -> Union[str, "TextHandler"]:  # pragma: no cover
         return TextHandler(super().lstrip(chars))

-    def rstrip(self, chars: str = None) -> Union[str, "TextHandler"]:  # pragma: no cover
+    def rstrip(self, chars: str | None = None) -> Union[str, "TextHandler"]:  # pragma: no cover
         return TextHandler(super().rstrip(chars))

     def capitalize(self) -> Union[str, "TextHandler"]:  # pragma: no cover
@@ -64,7 +61,7 @@ class TextHandler(str):
     def expandtabs(self, tabsize: SupportsIndex = 8) -> Union[str, "TextHandler"]:  # pragma: no cover
         return TextHandler(super().expandtabs(tabsize))

-    def format(self, *args: …
+    def format(self, *args: object, **kwargs: str) -> Union[str, "TextHandler"]:  # pragma: no cover
         return TextHandler(super().format(*args, **kwargs))

     def format_map(self, mapping) -> Union[str, "TextHandler"]:  # pragma: no cover
@@ -131,10 +128,11 @@ class TextHandler(str):
     def re(
         self,
         regex: str | Pattern,
-        check_match: Literal[True],
         replace_entities: bool = True,
         clean_match: bool = False,
         case_sensitive: bool = True,
+        *,
+        check_match: Literal[True],
     ) -> bool: ...

     @overload
@@ -179,19 +177,14 @@ class TextHandler(str):
         results = flatten(results)

         if not replace_entities:
-            return TextHandlers(…
+            return TextHandlers([TextHandler(string) for string in results])

-        return TextHandlers(
-            cast(
-                List[_TextHandlerType],
-                [TextHandler(_replace_entities(s)) for s in results],
-            )
-        )
+        return TextHandlers([TextHandler(_replace_entities(s)) for s in results])

     def re_first(
         self,
         regex: str | Pattern,
-        default=None,
+        default: Any = None,
         replace_entities: bool = True,
         clean_match: bool = False,
         case_sensitive: bool = True,
@@ -232,8 +225,8 @@ class TextHandlers(List[TextHandler]):
     def __getitem__(self, pos: SupportsIndex | slice) -> Union[TextHandler, "TextHandlers"]:
         lst = super().__getitem__(pos)
         if isinstance(pos, slice):
-            return TextHandlers(cast(List[…
-        return cast(…
+            return TextHandlers(cast(List[TextHandler], lst))
+        return TextHandler(cast(TextHandler, lst))

     def re(
         self,
@@ -256,7 +249,7 @@ class TextHandlers(List[TextHandler]):
     def re_first(
         self,
         regex: str | Pattern,
-        default=None,
+        default: Any = None,
         replace_entities: bool = True,
         clean_match: bool = False,
         case_sensitive: bool = True,
@@ -309,9 +302,9 @@ class AttributesHandler(Mapping[str, _TextHandlerType]):
         )

         # Fastest read-only mapping type
-        self._data = MappingProxyType(mapping)
+        self._data: Mapping[str, Any] = MappingProxyType(mapping)

-    def get(self, key: str, default: …
+    def get(self, key: str, default: Any = None) -> _TextHandlerType:
         """Acts like the standard dictionary `.get()` method"""
         return self._data.get(key, default)
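The reordered `re` overload works because `check_match` is now keyword-only (behind `*`), so the `Literal[True]` overload no longer clashes with the positional parameters that carry defaults before it. A reduced, runnable sketch of the pattern with illustrative names:

from typing import Literal, Union, overload


class Text(str):
    @overload
    def re(self, regex: str, clean: bool = False, *, check_match: Literal[True]) -> bool: ...

    @overload
    def re(self, regex: str, clean: bool = False, check_match: bool = False) -> list[str]: ...

    def re(self, regex: str, clean: bool = False, check_match: bool = False) -> Union[bool, list[str]]:
        import re as _re

        matches = _re.findall(regex, self)
        if check_match:
            return bool(matches)  # caller asked only whether anything matched
        return matches


t = Text("abc 123")
flag: bool = t.re(r"\d+", check_match=True)  # Pyright selects the bool overload
hits: list[str] = t.re(r"\d+")               # and the list overload here
print(flag, hits)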
scrapling/core/mixins.py
CHANGED
@@ -1,3 +1,9 @@
+from scrapling.core._types import TYPE_CHECKING
+
+if TYPE_CHECKING:
+    from scrapling.parser import Selector
+
+
 class SelectorsGeneration:
     """
     Functions for generating selectors
@@ -5,7 +11,7 @@ class SelectorsGeneration:
     Inspiration: https://searchfox.org/mozilla-central/source/devtools/shared/inspector/css-logic.js#591
     """

-    def …
+    def _general_selection(self: "Selector", selection: str = "css", full_path: bool = False) -> str:  # type: ignore[name-defined]
         """Generate a selector for the current element.
         :return: A string of the generated selector.
         """
@@ -47,29 +53,29 @@ class SelectorsGeneration:
         return " > ".join(reversed(selectorPath)) if css else "//" + "/".join(reversed(selectorPath))

     @property
-    def generate_css_selector(self) -> str:
+    def generate_css_selector(self: "Selector") -> str:  # type: ignore[name-defined]
         """Generate a CSS selector for the current element
         :return: A string of the generated selector.
         """
-        return self.…
+        return self._general_selection()

     @property
-    def generate_full_css_selector(self) -> str:
+    def generate_full_css_selector(self: "Selector") -> str:  # type: ignore[name-defined]
         """Generate a complete CSS selector for the current element
         :return: A string of the generated selector.
         """
-        return self.…
+        return self._general_selection(full_path=True)

     @property
-    def generate_xpath_selector(self) -> str:
+    def generate_xpath_selector(self: "Selector") -> str:  # type: ignore[name-defined]
         """Generate an XPath selector for the current element
         :return: A string of the generated selector.
         """
-        return self.…
+        return self._general_selection("xpath")

     @property
-    def generate_full_xpath_selector(self) -> str:
+    def generate_full_xpath_selector(self: "Selector") -> str:  # type: ignore[name-defined]
         """Generate a complete XPath selector for the current element
         :return: A string of the generated selector.
         """
-        return self.…
+        return self._general_selection("xpath", full_path=True)
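Annotating `self` with the concrete `Selector` type, imported only under `TYPE_CHECKING`, is what lets the mixin use attributes it never defines without Pyright objecting — and without a circular import at runtime. A generic, single-module sketch of the idea (in the real layout the concrete class lives in another module, hence the guarded import):

class SelectorMixin:
    # Annotating `self` with the concrete class tells Pyright that
    # attributes like `tag` exist even though the mixin never defines them.
    def describe(self: "Node") -> str:
        return f"<{self.tag}>"


class Node(SelectorMixin):
    def __init__(self, tag: str) -> None:
        self.tag = tag


print(Node("div").describe())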
scrapling/core/shell.py
CHANGED
@@ -31,6 +31,7 @@ from scrapling.core._types import (
     Optional,
     Dict,
     Any,
+    cast,
     extraction_types,
     Generator,
 )
@@ -540,15 +541,15 @@ class Convertor:
             raise ValueError(f"Unknown extraction type: {extraction_type}")
         else:
             if main_content_only:
-                page = page.css_first("body") or page
+                page = cast(Selector, page.css_first("body")) or page

-            pages = [page] if not css_selector else page.css(css_selector)
+            pages = [page] if not css_selector else cast(Selectors, page.css(css_selector))
             for page in pages:
                 match extraction_type:
                     case "markdown":
                         yield cls._convert_to_markdown(page.html_content)
                     case "html":
-                        yield page.…
+                        yield page.html_content
                     case "text":
                         txt_content = page.get_all_text(strip=True)
                         for s in (
scrapling/core/storage.py
CHANGED
@@ -56,13 +56,13 @@ class StorageSystemMixin(ABC):  # pragma: no cover
     @lru_cache(128, typed=True)
     def _get_hash(identifier: str) -> str:
         """If you want to hash identifier in your storage system, use this safer"""
-        …
-        if isinstance(…
+        _identifier = identifier.lower().strip()
+        if isinstance(_identifier, str):
             # Hash functions have to take bytes
-            …
+            _identifier = _identifier.encode("utf-8")

-        hash_value = sha256(…
-        return f"{hash_value}_{len(…
+        hash_value = sha256(_identifier).hexdigest()
+        return f"{hash_value}_{len(_identifier)}"  # Length to reduce collision chance
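The `_identifier` rename matters to Pyright: reassigning the `str` parameter itself to `bytes` would give the variable the union type `str | bytes` at the `sha256()` call. A minimal sketch of the same shape (simplified — the isinstance guard from the diff is dropped here):

from hashlib import sha256


def get_hash(identifier: str) -> str:
    # Rebinding `identifier` to bytes would widen its type to str | bytes,
    # which a bytes-only API like sha256() rejects. A fresh name keeps
    # each variable's type stable.
    _identifier: bytes = identifier.lower().strip().encode("utf-8")
    digest = sha256(_identifier).hexdigest()
    return f"{digest}_{len(_identifier)}"  # length suffix reduces collision chance


print(get_hash("Example.com"))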
scrapling/core/translator.py
CHANGED
@@ -10,24 +10,23 @@ So you don't have to learn a new selectors/api method like what bs4 done with so

 from functools import lru_cache

-from cssselect.xpath import ExpressionError
-from cssselect.xpath import XPathExpr as OriginalXPathExpr
 from cssselect import HTMLTranslator as OriginalHTMLTranslator
+from cssselect.xpath import ExpressionError, XPathExpr as OriginalXPathExpr
 from cssselect.parser import Element, FunctionalPseudoElement, PseudoElement

-from scrapling.core._types import Any, …
+from scrapling.core._types import Any, Protocol, Self


 class XPathExpr(OriginalXPathExpr):
     textnode: bool = False
-    attribute: …
+    attribute: str | None = None

     @classmethod
     def from_xpath(
         cls,
         xpath: OriginalXPathExpr,
         textnode: bool = False,
-        attribute: …
+        attribute: str | None = None,
     ) -> Self:
         x = cls(path=xpath.path, element=xpath.element, condition=xpath.condition)
         x.textnode = textnode
@@ -71,10 +70,10 @@ class XPathExpr(OriginalXPathExpr):

 # e.g. cssselect.GenericTranslator, cssselect.HTMLTranslator
 class TranslatorProtocol(Protocol):
-    def xpath_element(self, selector: Element) -> OriginalXPathExpr:  # pragma: no cover
+    def xpath_element(self, selector: Element) -> OriginalXPathExpr:  # pyright: ignore # pragma: no cover
         pass

-    def css_to_xpath(self, css: str, prefix: str = ...) -> str:  # pragma: no cover
+    def css_to_xpath(self, css: str, prefix: str = ...) -> str:  # pyright: ignore # pragma: no cover
         pass

@@ -121,9 +120,15 @@ class TranslatorMixin:


 class HTMLTranslator(TranslatorMixin, OriginalHTMLTranslator):
-    @lru_cache(maxsize=256)
     def css_to_xpath(self, css: str, prefix: str = "descendant-or-self::") -> str:
         return super().css_to_xpath(css, prefix)


 translator = HTMLTranslator()
+# Using a function instead of the translator directly to avoid Pyright override error
+
+
+@lru_cache(maxsize=256)
+def css_to_xpath(query: str) -> str:
+    """Return translated XPath version of a given CSS query"""
+    return translator.css_to_xpath(query)
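Moving `lru_cache` off the method avoids Pyright's override complaint: the decorator turns the method into an `_lru_cache_wrapper`, which no longer matches the base-class signature. A runnable sketch of the shape, with invented class names:

from functools import lru_cache


class Base:
    def convert(self, query: str) -> str:
        return query


class Derived(Base):
    # An @lru_cache here would make `convert` a _lru_cache_wrapper,
    # which Pyright reports as an incompatible override of Base.convert.
    def convert(self, query: str) -> str:
        return f"translated({query})"


_instance = Derived()


@lru_cache(maxsize=256)
def convert(query: str) -> str:
    # The cache lives on a plain module-level function, so no override
    # rule is involved (and no `self` is held alive by the cache).
    return _instance.convert(query)


print(convert("div.item"), convert("div.item"))  # second call is a cache hit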
scrapling/engines/_browsers/_base.py
CHANGED
@@ -7,14 +7,12 @@ from playwright.async_api import (
     BrowserContext as AsyncBrowserContext,
     Playwright as AsyncPlaywright,
 )
-from camoufox.…
-    …
-    installed_verstr as camoufox_version,
-)
+from camoufox.pkgman import installed_verstr as camoufox_version
+from camoufox.utils import launch_options as generate_launch_options

 from ._page import PageInfo, PagePool
 from scrapling.parser import Selector
-from scrapling.core._types import Dict, Optional
+from scrapling.core._types import Any, cast, Dict, Optional, TYPE_CHECKING
 from scrapling.engines.toolbelt.fingerprints import get_os_name
 from ._validators import validate, PlaywrightConfig, CamoufoxConfig
 from ._config_tools import _compiled_stealth_scripts, _launch_kwargs, _context_kwargs
@@ -41,6 +39,7 @@ class SyncSession:
         """Get a new page to use"""

         # No need to check if a page is available or not in sync code because the code blocked before reaching here till the page closed, ofc.
+        assert self.context is not None, "Browser context not initialized"
         page = self.context.new_page()
         page.set_default_navigation_timeout(timeout)
         page.set_default_timeout(timeout)
@@ -65,11 +64,14 @@ class SyncSession:
         }


-class AsyncSession(SyncSession):
+class AsyncSession:
     def __init__(self, max_pages: int = 1):
-        …
+        self.max_pages = max_pages
+        self.page_pool = PagePool(max_pages)
+        self._max_wait_for_page = 60
         self.playwright: Optional[AsyncPlaywright] = None
         self.context: Optional[AsyncBrowserContext] = None
+        self._closed = False
         self._lock = Lock()

     async def _get_page(
@@ -79,6 +81,9 @@ class AsyncSession(SyncSession):
         disable_resources: bool,
     ) -> PageInfo:  # pragma: no cover
         """Get a new page to use"""
+        if TYPE_CHECKING:
+            assert self.context is not None, "Browser context not initialized"
+
         async with self._lock:
             # If we're at max capacity after cleanup, wait for busy pages to finish
             if self.page_pool.pages_count >= self.max_pages:
@@ -92,6 +97,7 @@ class AsyncSession(SyncSession):
                     f"No pages finished to clear place in the pool within the {self._max_wait_for_page}s timeout period"
                 )

+        assert self.context is not None, "Browser context not initialized"
         page = await self.context.new_page()
         page.set_default_navigation_timeout(timeout)
         page.set_default_timeout(timeout)
@@ -107,6 +113,14 @@ class AsyncSession(SyncSession):

         return self.page_pool.add_page(page)

+    def get_pool_stats(self) -> Dict[str, int]:
+        """Get statistics about the current page pool"""
+        return {
+            "total_pages": self.page_pool.pages_count,
+            "busy_pages": self.page_pool.busy_count,
+            "max_pages": self.max_pages,
+        }
+

 class DynamicSessionMixin:
     def __validate__(self, **params):
@@ -139,6 +153,9 @@ class DynamicSessionMixin:
         self.__initiate_browser_options__()

     def __initiate_browser_options__(self):
+        if TYPE_CHECKING:
+            assert isinstance(self.proxy, tuple)
+
         if not self.cdp_url:
             # `launch_options` is used with persistent context
             self.launch_options = dict(
@@ -175,7 +192,7 @@ class DynamicSessionMixin:

 class StealthySessionMixin:
     def __validate__(self, **params):
-        config = validate(params, model=CamoufoxConfig)
+        config: CamoufoxConfig = validate(params, model=CamoufoxConfig)

         self.max_pages = config.max_pages
         self.headless = config.headless
@@ -209,10 +226,10 @@ class StealthySessionMixin:

     def __initiate_browser_options__(self):
         """Initiate browser options."""
-        self.launch_options = generate_launch_options(
+        self.launch_options: Dict[str, Any] = generate_launch_options(
             **{
                 "geoip": self.geoip,
-                "proxy": dict(self.proxy) if self.proxy else self.proxy,
+                "proxy": dict(self.proxy) if self.proxy and isinstance(self.proxy, tuple) else self.proxy,
                 "addons": self.addons,
                 "exclude_addons": [] if self.disable_ads else [DefaultAddons.UBO],
                 "headless": self.headless,
@@ -232,7 +249,7 @@ class StealthySessionMixin:
                 "browser.cache.disk_cache_ssl": True,
                 "browser.cache.disk.smart_size.enabled": True,
             },
-            **self.additional_args,
+            **cast(Dict, self.additional_args),
         }
     )
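The two assert styles used above do the same narrowing job for Pyright on `Optional` attributes; the plain `assert` also guards at runtime, while the `if TYPE_CHECKING:` form costs nothing in production. A runnable sketch:

from typing import Optional, TYPE_CHECKING


class Session:
    def __init__(self) -> None:
        self.context: Optional[str] = None  # set later by a setup step

    def use_runtime_assert(self) -> str:
        # Narrows str | None -> str for Pyright AND raises early if misused.
        assert self.context is not None, "context not initialized"
        return self.context.upper()

    def use_checker_only_assert(self) -> str:
        # Pyright treats TYPE_CHECKING as True, so the same narrowing applies,
        # but the assert never executes at runtime.
        if TYPE_CHECKING:
            assert self.context is not None
        return self.context.upper()


s = Session()
s.context = "ready"
print(s.use_runtime_assert(), s.use_checker_only_assert())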
scrapling/engines/_browsers/_camoufox.py
CHANGED
@@ -26,6 +26,7 @@ from scrapling.core._types import (
     List,
     Optional,
     Callable,
+    TYPE_CHECKING,
     SelectorWaitStates,
 )
 from scrapling.engines.toolbelt.convertor import (
@@ -205,7 +206,7 @@ class StealthySession(StealthySessionMixin, SyncSession):
         self._closed = True

     @staticmethod
-    def _get_page_content(page: Page) -> str…
+    def _get_page_content(page: Page) -> str:
         """
         A workaround for Playwright issue with `page.content()` on Windows. Ref.: https://github.com/microsoft/playwright/issues/16108
         :param page: The page to extract content from.
@@ -217,6 +218,7 @@ class StealthySession(StealthySessionMixin, SyncSession):
         except PlaywrightError:
             page.wait_for_timeout(1000)
             continue
+        return ""  # pyright: ignore

     def _solve_cloudflare(self, page: Page) -> None:  # pragma: no cover
         """Solve the cloudflare challenge displayed on the playwright page passed
@@ -502,8 +504,8 @@ class AsyncStealthySession(StealthySessionMixin, AsyncSession):

     async def __create__(self):
         """Create a browser for this instance and context."""
-        self.playwright: AsyncPlaywright = await async_playwright().start()
-        self.context: AsyncBrowserContext = await self.playwright.firefox.launch_persistent_context(
+        self.playwright: AsyncPlaywright | None = await async_playwright().start()
+        self.context: AsyncBrowserContext | None = await self.playwright.firefox.launch_persistent_context(
             **self.launch_options
         )
@@ -511,7 +513,7 @@ class AsyncStealthySession(StealthySessionMixin, AsyncSession):
         await self.context.add_init_script(path=self.init_script)

         if self.cookies:
-            await self.context.add_cookies(self.cookies)
+            await self.context.add_cookies(self.cookies)  # pyright: ignore [reportArgumentType]

     async def __aenter__(self):
         await self.__create__()
@@ -536,7 +538,7 @@ class AsyncStealthySession(StealthySessionMixin, AsyncSession):
         self._closed = True

     @staticmethod
-    async def _get_page_content(page: async_Page) -> str…
+    async def _get_page_content(page: async_Page) -> str:
         """
         A workaround for Playwright issue with `page.content()` on Windows. Ref.: https://github.com/microsoft/playwright/issues/16108
         :param page: The page to extract content from.
@@ -548,6 +550,7 @@ class AsyncStealthySession(StealthySessionMixin, AsyncSession):
         except PlaywrightError:
             await page.wait_for_timeout(1000)
             continue
+        return ""  # pyright: ignore

     async def _solve_cloudflare(self, page: async_Page):
         """Solve the cloudflare challenge displayed on the playwright page passed. The async version
@@ -679,6 +682,10 @@ class AsyncStealthySession(StealthySessionMixin, AsyncSession):
         page_info = await self._get_page(params.timeout, params.extra_headers, params.disable_resources)
         page_info.mark_busy(url=url)

+        if TYPE_CHECKING:
+            if not isinstance(page_info.page, async_Page):
+                raise TypeError
+
         try:
             # Navigate to URL and wait for a specified state
             page_info.page.on("response", handle_response)
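The trailing `return ""` exists so that every static path out of the retry loop yields a `str`; in the real loop it is effectively unreachable, hence the `# pyright: ignore`. A simplified, runnable version of the shape — the page call is a stand-in, not Playwright's API:

def get_content(attempts: int = 3) -> str:
    for _ in range(attempts):
        try:
            return "page content"  # stand-in for page.content()
        except RuntimeError:
            continue  # transient failure: retry
    # Gives the function an explicit str return on every static path,
    # so the declared `-> str` holds for the checker.
    return ""


print(get_content())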
scrapling/engines/_browsers/_config_tools.py
CHANGED
@@ -62,7 +62,7 @@ def _set_flags(hide_canvas, disable_webgl):  # pragma: no cover
 @lru_cache(2, typed=True)
 def _launch_kwargs(
     headless,
-    proxy,
+    proxy: Tuple,
     locale,
     extra_headers,
     useragent,
scrapling/engines/_browsers/_controllers.py
CHANGED
@@ -10,6 +10,7 @@ from playwright.async_api import (
     BrowserContext as AsyncBrowserContext,
     Playwright as AsyncPlaywright,
     Locator as AsyncLocator,
+    Page as async_Page,
 )
 from patchright.sync_api import sync_playwright as sync_patchright
 from patchright.async_api import async_playwright as async_patchright
@@ -18,10 +19,12 @@ from scrapling.core.utils import log
 from ._base import SyncSession, AsyncSession, DynamicSessionMixin
 from ._validators import validate_fetch as _validate
 from scrapling.core._types import (
+    Any,
     Dict,
     List,
     Optional,
     Callable,
+    TYPE_CHECKING,
     SelectorWaitStates,
 )
 from scrapling.engines.toolbelt.convertor import (
@@ -30,7 +33,7 @@ from scrapling.engines.toolbelt.convertor import (
 )
 from scrapling.engines.toolbelt.fingerprints import generate_convincing_referer

-_UNSET = object()
+_UNSET: Any = object()


 class DynamicSession(DynamicSessionMixin, SyncSession):
@@ -154,7 +157,7 @@ class DynamicSession(DynamicSessionMixin, SyncSession):
         """Create a browser for this instance and context."""
         sync_context = sync_patchright if self.stealth else sync_playwright

-        self.playwright: Playwright = sync_context().start()
+        self.playwright: Playwright = sync_context().start()  # pyright: ignore [reportAttributeAccessIssue]

         if self.cdp_url:  # pragma: no cover
             self.context = self.playwright.chromium.connect_over_cdp(endpoint_url=self.cdp_url).new_context(
@@ -187,7 +190,7 @@ class DynamicSession(DynamicSessionMixin, SyncSession):

         if self.playwright:
             self.playwright.stop()
-            self.playwright = None
+            self.playwright = None  # pyright: ignore

         self._closed = True

@@ -399,7 +402,7 @@ class AsyncDynamicSession(DynamicSessionMixin, AsyncSession):
         """Create a browser for this instance and context."""
         async_context = async_patchright if self.stealth else async_playwright

-        self.playwright: AsyncPlaywright = await async_context().start()
+        self.playwright: AsyncPlaywright = await async_context().start()  # pyright: ignore [reportAttributeAccessIssue]

         if self.cdp_url:
             browser = await self.playwright.chromium.connect_over_cdp(endpoint_url=self.cdp_url)
@@ -413,7 +416,7 @@ class AsyncDynamicSession(DynamicSessionMixin, AsyncSession):
         await self.context.add_init_script(path=self.init_script)

         if self.cookies:
-            await self.context.add_cookies(self.cookies)
+            await self.context.add_cookies(self.cookies)  # pyright: ignore

     async def __aenter__(self):
         await self.__create__()
@@ -429,11 +432,11 @@ class AsyncDynamicSession(DynamicSessionMixin, AsyncSession):

         if self.context:
             await self.context.close()
-            self.context = None
+            self.context = None  # pyright: ignore

         if self.playwright:
             await self.playwright.stop()
-            self.playwright = None
+            self.playwright = None  # pyright: ignore

         self._closed = True

@@ -506,6 +509,10 @@ class AsyncDynamicSession(DynamicSessionMixin, AsyncSession):
         page_info = await self._get_page(params.timeout, params.extra_headers, params.disable_resources)
         page_info.mark_busy(url=url)

+        if TYPE_CHECKING:
+            if not isinstance(page_info.page, async_Page):
+                raise TypeError
+
         try:
             # Navigate to URL and wait for a specified state
             page_info.page.on("response", handle_response)
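Annotating the sentinel as `Any` is what lets `_UNSET` serve as a default for parameters of any declared type; a bare `object()` would be typed as `object` and rejected wherever a narrower type is expected. A runnable sketch of the sentinel pattern:

from typing import Any

_UNSET: Any = object()  # typed as Any so it can stand in for any parameter


def fetch(url: str, timeout: float = _UNSET) -> str:
    # An `is` identity check distinguishes "not passed" from an explicit
    # value, including falsy ones like 0.
    if timeout is _UNSET:
        timeout = 30.0
    return f"GET {url} (timeout={timeout})"


print(fetch("https://example.com"))
print(fetch("https://example.com", timeout=5))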
scrapling/engines/_browsers/_validators.py
CHANGED
@@ -11,7 +11,10 @@ from scrapling.core._types import (
     Tuple,
     Optional,
     Callable,
+    Iterable,
     SelectorWaitStates,
+    cast,
+    overload,
 )
 from scrapling.engines.toolbelt.navigation import construct_proxy_dict

@@ -73,7 +76,7 @@ class PlaywrightConfig(Struct, kw_only=True, frozen=False):
     stealth: bool = False
     wait: Seconds = 0
     page_action: Optional[Callable] = None
-    proxy: Optional[str | Dict[str, str]] = None  # The default value for proxy in Playwright's source is `None`
+    proxy: Optional[str | Dict[str, str] | Tuple] = None  # The default value for proxy in Playwright's source is `None`
     locale: str = "en-US"
     extra_headers: Optional[Dict[str, str]] = None
     useragent: Optional[str] = None
@@ -81,11 +84,11 @@ class PlaywrightConfig(Struct, kw_only=True, frozen=False):
     init_script: Optional[str] = None
     disable_resources: bool = False
     wait_selector: Optional[str] = None
-    cookies: Optional[…
+    cookies: Optional[Iterable[Dict]] = None
     network_idle: bool = False
     load_dom: bool = True
     wait_selector_state: SelectorWaitStates = "attached"
-    selector_config: Optional[Dict] = …
+    selector_config: Optional[Dict] = {}

     def __post_init__(self):
         """Custom validation after msgspec validation"""
@@ -125,15 +128,15 @@ class CamoufoxConfig(Struct, kw_only=True, frozen=False):
     wait_selector: Optional[str] = None
     addons: Optional[List[str]] = None
     wait_selector_state: SelectorWaitStates = "attached"
-    cookies: Optional[…
+    cookies: Optional[Iterable[Dict]] = None
     google_search: bool = True
     extra_headers: Optional[Dict[str, str]] = None
-    proxy: Optional[str | Dict[str, str]] = None  # The default value for proxy in Playwright's source is `None`
+    proxy: Optional[str | Dict[str, str] | Tuple] = None  # The default value for proxy in Playwright's source is `None`
     os_randomize: bool = False
     disable_ads: bool = False
     geoip: bool = False
-    selector_config: Optional[Dict] = …
-    additional_args: Optional[Dict] = …
+    selector_config: Optional[Dict] = {}
+    additional_args: Optional[Dict] = {}

     def __post_init__(self):
         """Custom validation after msgspec validation"""
@@ -177,7 +180,7 @@ class FetchConfig(Struct, kw_only=True):
     network_idle: bool = False
     load_dom: bool = True
     solve_cloudflare: bool = False
-    selector_config: …
+    selector_config: Dict = {}

     def to_dict(self):
         return {f: getattr(self, f) for f in self.__struct_fields__}
@@ -198,7 +201,7 @@ class _fetch_params:
     network_idle: bool
     load_dom: bool
     solve_cloudflare: bool
-    selector_config: …
+    selector_config: Dict


 def validate_fetch(params: List[Tuple], sentinel=None) -> _fetch_params:
@@ -212,7 +215,7 @@ def validate_fetch(params: List[Tuple], sentinel=None) -> _fetch_params:
         result[arg] = session_value

     if overrides:
-        overrides = validate(overrides, FetchConfig).to_dict()
+        overrides = cast(FetchConfig, validate(overrides, FetchConfig)).to_dict()
         overrides.update(result)
         return _fetch_params(**overrides)

@@ -222,7 +225,21 @@ def validate_fetch(params: List[Tuple], sentinel=None) -> _fetch_params:
     return _fetch_params(**result)


-…
+@overload
+def validate(params: Dict, model: type[PlaywrightConfig]) -> PlaywrightConfig: ...
+
+
+@overload
+def validate(params: Dict, model: type[CamoufoxConfig]) -> CamoufoxConfig: ...
+
+
+@overload
+def validate(params: Dict, model: type[FetchConfig]) -> FetchConfig: ...
+
+
+def validate(
+    params: Dict, model: type[PlaywrightConfig] | type[CamoufoxConfig] | type[FetchConfig]
+) -> PlaywrightConfig | CamoufoxConfig | FetchConfig:
     try:
         return convert(params, model)
     except ValidationError as e:
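The `validate` overloads restore the model-in, instance-out relationship that a single union-returning signature erases, so callers get back the concrete config type. A self-contained sketch using dataclasses in place of msgspec Structs:

from dataclasses import dataclass
from typing import Dict, overload


@dataclass
class BrowserConfig:
    headless: bool = True


@dataclass
class FetchConfig:
    timeout: float = 30.0


@overload
def validate(params: Dict, model: type[BrowserConfig]) -> BrowserConfig: ...
@overload
def validate(params: Dict, model: type[FetchConfig]) -> FetchConfig: ...


def validate(
    params: Dict, model: type[BrowserConfig] | type[FetchConfig]
) -> BrowserConfig | FetchConfig:
    # Stand-in for msgspec.convert(params, model) plus its error handling.
    return model(**params)


cfg = validate({"headless": False}, BrowserConfig)
print(cfg.headless)  # Pyright knows cfg is BrowserConfig, not a union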
scrapling/engines/static.py
CHANGED
@@ -182,7 +182,7 @@ class FetcherSession:

         return headers

-    def __enter__(self):
+    def __enter__(self) -> "FetcherClient":
         """Creates and returns a new synchronous Fetcher Session"""
         if self._curl_session:
             raise RuntimeError(
@@ -197,7 +197,7 @@ class FetcherSession:
         )

         self._curl_session = CurlSession()
-        return self
+        return cast("FetcherClient", self)

     def __exit__(self, exc_type, exc_val, exc_tb):
         """Closes the active synchronous session managed by this instance, if any."""
@@ -205,7 +205,7 @@ class FetcherSession:
         self._curl_session.close()
         self._curl_session = None

-    async def __aenter__(self):
+    async def __aenter__(self) -> "AsyncFetcherClient":
         """Creates and returns a new asynchronous Session."""
         if self._async_curl_session:
             raise RuntimeError(
@@ -220,7 +220,7 @@ class FetcherSession:
         )

         self._async_curl_session = AsyncCurlSession()
-        return cast("AsyncFetcherClient", self)

     async def __aexit__(self, exc_type, exc_val, exc_tb):
         """Closes the active asynchronous session managed by this instance, if any."""
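Typing `__enter__` (and `__aenter__`) as returning a narrower client type, with a `cast` on `self`, controls what API the `with` target exposes to the checker even though one class backs both modes. A runnable sketch with invented class names:

from typing import cast


class SyncClient:
    def get(self, url: str) -> str:
        return f"GET {url}"


class Session(SyncClient):
    # The declared return type shapes what Pyright lets callers do with
    # the value bound by `with ... as s`.
    def __enter__(self) -> "SyncClient":
        return cast("SyncClient", self)

    def __exit__(self, exc_type, exc_val, exc_tb) -> None:
        pass


with Session() as s:
    print(s.get("https://example.com"))  # s is typed as SyncClient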
scrapling/engines/toolbelt/convertor.py
CHANGED
@@ -58,7 +58,8 @@ class ResponseFactory:
                 "encoding": cls.__extract_browser_encoding(
                     current_response.headers.get("content-type", "")
                 )
-                …
+                if current_response
+                else "utf-8",
                 "cookies": tuple(),
                 "headers": current_response.all_headers() if current_response else {},
                 "request_headers": current_request.all_headers(),
@@ -161,7 +162,8 @@ class ResponseFactory:
                 "encoding": cls.__extract_browser_encoding(
                     current_response.headers.get("content-type", "")
                 )
-                …
+                if current_response
+                else "utf-8",
                 "cookies": tuple(),
                 "headers": await current_response.all_headers() if current_response else {},
                 "request_headers": await current_request.all_headers(),
@@ -255,8 +257,8 @@ class ResponseFactory:
                 "encoding": response.encoding or "utf-8",
                 "cookies": dict(response.cookies),
                 "headers": dict(response.headers),
-                "request_headers": dict(response.request.headers),
-                "method": response.request.method,
+                "request_headers": dict(response.request.headers) if response.request else {},
+                "method": response.request.method if response.request else "GET",
                 "history": response.history,  # https://github.com/lexiforest/curl_cffi/issues/82
                 **parser_arguments,
             }
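The inline conditionals narrow the `Optional` request object at each access instead of risking an `AttributeError` when it is `None`. The same shape in isolation — the stub classes below are illustrative, not curl_cffi's:

from typing import Optional


class Request:
    def __init__(self, method: str) -> None:
        self.method = method
        self.headers = {"user-agent": "demo"}


class Response:
    def __init__(self, request: Optional[Request]) -> None:
        self.request = request


def summarize(response: Response) -> dict:
    return {
        # Conditional expressions narrow the Optional on the spot,
        # with explicit fallbacks for the None case.
        "request_headers": dict(response.request.headers) if response.request else {},
        "method": response.request.method if response.request else "GET",
    }


print(summarize(Response(Request("POST"))))
print(summarize(Response(None)))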
scrapling/engines/toolbelt/custom.py
CHANGED
@@ -8,6 +8,7 @@ from scrapling.core.utils import log
 from scrapling.core._types import (
     Any,
     Dict,
+    cast,
     List,
     Optional,
     Tuple,
@@ -30,10 +31,10 @@ class Response(Selector):
         request_headers: Dict,
         encoding: str = "utf-8",
         method: str = "GET",
-        history: List = None,
-        **selector_config: …
+        history: List | None = None,
+        **selector_config: Any,
     ):
-        adaptive_domain = selector_config.pop("adaptive_domain", …
+        adaptive_domain: str = cast(str, selector_config.pop("adaptive_domain", ""))
         self.status = status
         self.reason = reason
         self.cookies = cookies
@@ -58,7 +59,7 @@ class BaseFetcher:
     keep_cdata: Optional[bool] = False
     storage_args: Optional[Dict] = None
     keep_comments: Optional[bool] = False
-    adaptive_domain: …
+    adaptive_domain: str = ""
     parser_keywords: Tuple = (
         "huge_tree",
         "adaptive",
@@ -124,12 +125,8 @@ class BaseFetcher:
             adaptive=cls.adaptive,
             storage=cls.storage,
             storage_args=cls.storage_args,
+            adaptive_domain=cls.adaptive_domain,
         )
-        if cls.adaptive_domain:
-            if not isinstance(cls.adaptive_domain, str):
-                log.warning('[Ignored] The argument "adaptive_domain" must be of string type')
-            else:
-                parser_arguments.update({"adaptive_domain": cls.adaptive_domain})

         return parser_arguments
scrapling/engines/toolbelt/fingerprints.py
CHANGED
@@ -8,9 +8,10 @@ from platform import system as platform_system
 from tldextract import extract
 from browserforge.headers import Browser, HeaderGenerator

-from scrapling.core._types import Dict, …
+from scrapling.core._types import Dict, Literal

 __OS_NAME__ = platform_system()
+OSName = Literal["linux", "macos", "windows"]


 @lru_cache(10, typed=True)
@@ -28,16 +29,20 @@ def generate_convincing_referer(url: str) -> str:


 @lru_cache(1, typed=True)
-def get_os_name() -> …
-    """Get the current OS name in the same format needed for browserforge…
+def get_os_name() -> OSName | None:
+    """Get the current OS name in the same format needed for browserforge, if the OS is Unknown, return None so browserforge uses all.

     :return: Current OS name or `None` otherwise
     """
-    …
-        "Linux": …
-    …
+    match __OS_NAME__:
+        case "Linux":
+            return "linux"
+        case "Darwin":
+            return "macos"
+        case "Windows":
+            return "windows"
+        case _:
+            return None


 def generate_headers(browser_mode: bool = False) -> Dict:
@@ -58,8 +63,10 @@ def generate_headers(browser_mode: bool = False) -> Dict:
             Browser(name="edge", min_version=130),
         ]
     )
-    …
+    if os_name:
+        return HeaderGenerator(browser=browsers, os=os_name, device="desktop").generate()
+    else:
+        return HeaderGenerator(browser=browsers, device="desktop").generate()


 __default_useragent__ = generate_headers(browser_mode=False).get("User-Agent")
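Returning the `Literal` union rather than a bare `str` is what lets downstream APIs that accept only those exact strings typecheck; each `match` arm returns a value Pyright can prove is a member of the union. A reduced, runnable sketch:

from platform import system
from typing import Literal, Optional

OSName = Literal["linux", "macos", "windows"]


def get_os_name() -> Optional[OSName]:
    # Each case returns a value provably inside OSName.
    match system():
        case "Linux":
            return "linux"
        case "Darwin":
            return "macos"
        case "Windows":
            return "windows"
        case _:
            return None  # unknown platform: let the caller fall back


print(get_os_name())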
scrapling/engines/toolbelt/navigation.py
CHANGED

@@ -11,7 +11,7 @@ from msgspec import Struct, structs, convert, ValidationError
 from playwright.sync_api import Route

 from scrapling.core.utils import log
-from scrapling.core._types import Dict, …
+from scrapling.core._types import Dict, Tuple, overload, Literal
 from scrapling.engines.constants import DEFAULT_DISABLED_RESOURCES

 __BYPASSES_DIR__ = Path(__file__).parent / "bypasses"
@@ -49,7 +49,15 @@ async def async_intercept_route(route: async_Route):
     await route.continue_()


-def construct_proxy_dict(proxy_string: str | Dict[str, str], as_tuple=False) -> …
+@overload
+def construct_proxy_dict(proxy_string: str | Dict[str, str] | Tuple, as_tuple: Literal[True]) -> Tuple: ...
+
+
+@overload
+def construct_proxy_dict(proxy_string: str | Dict[str, str] | Tuple, as_tuple: Literal[False] = False) -> Dict: ...
+
+
+def construct_proxy_dict(proxy_string: str | Dict[str, str] | Tuple, as_tuple: bool = False) -> Dict | Tuple:
     """Validate a proxy and return it in the acceptable format for Playwright
     Reference: https://playwright.dev/python/docs/network#http-proxy

@@ -83,7 +91,7 @@ def construct_proxy_dict(proxy_string: str | Dict[str, str], as_tuple=False) ->
     except ValidationError as e:
         raise TypeError(f"Invalid proxy dictionary: {e}")

-    …
+    raise TypeError(f"Invalid proxy string: {proxy_string}")


 @lru_cache(10, typed=True)
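The two `@overload` stubs tie the runtime flag `as_tuple` to the return type, so callers see `Tuple` or `Dict` rather than the imprecise union, and the new unconditional `raise` at the end guarantees to Pyright that no code path falls off the function returning `None`. A self-contained sketch of the flag-to-return-type pattern (function and keys are hypothetical):

from typing import Dict, Literal, Tuple, overload

@overload
def parse(value: str, as_tuple: Literal[True]) -> Tuple: ...
@overload
def parse(value: str, as_tuple: Literal[False] = False) -> Dict: ...
def parse(value: str, as_tuple: bool = False) -> Dict | Tuple:
    # The overload stubs exist only for the type checker; this body runs.
    if value.startswith("http"):
        result = {"server": value}
        return tuple(result.items()) if as_tuple else result
    raise TypeError(f"Invalid proxy string: {value}")

pair = parse("http://127.0.0.1:8080", as_tuple=True)  # checker infers Tuple
mapping = parse("http://127.0.0.1:8080")              # checker infers Dict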
scrapling/fetchers/__init__.py
CHANGED

@@ -19,7 +19,17 @@ _LAZY_IMPORTS = {
     "AsyncStealthySession": ("scrapling.fetchers.firefox", "AsyncStealthySession"),
 }

-__all__ = […]
+__all__ = [
+    "Fetcher",
+    "AsyncFetcher",
+    "FetcherSession",
+    "DynamicFetcher",
+    "DynamicSession",
+    "AsyncDynamicSession",
+    "StealthyFetcher",
+    "StealthySession",
+    "AsyncStealthySession",
+]


 def __getattr__(name: str) -> Any:
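An explicit multi-line `__all__` matters here because the fetcher classes are materialized lazily through the module-level `__getattr__` (PEP 562); without the list, static tools cannot see what the package exports. A minimal sketch of the whole pattern (the module path below is a placeholder, not Scrapling's layout):

from importlib import import_module
from typing import Any

_LAZY_IMPORTS = {
    "Fetcher": ("mypackage.fetchers.http", "Fetcher"),  # placeholder path
}
__all__ = ["Fetcher"]

def __getattr__(name: str) -> Any:
    # Invoked only for names not already present in the module's globals,
    # so each class is imported on first attribute access.
    if name in _LAZY_IMPORTS:
        module_path, attribute = _LAZY_IMPORTS[name]
        return getattr(import_module(module_path), attribute)
    raise AttributeError(f"module {__name__!r} has no attribute {name!r}")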
scrapling/fetchers/chrome.py
CHANGED

@@ -1,10 +1,9 @@
 from scrapling.core._types import (
     Callable,
-    Dict,
     List,
+    Dict,
     Optional,
     SelectorWaitStates,
-    Iterable,
 )
 from scrapling.engines.toolbelt.custom import BaseFetcher, Response
 from scrapling.engines._browsers._controllers import DynamicSession, AsyncDynamicSession
@@ -47,7 +46,7 @@ class DynamicFetcher(BaseFetcher):
         disable_resources: bool = False,
         wait_selector: Optional[str] = None,
         init_script: Optional[str] = None,
-        cookies: Optional[…
+        cookies: Optional[List[Dict]] = None,
         network_idle: bool = False,
         load_dom: bool = True,
         wait_selector_state: SelectorWaitStates = "attached",
@@ -134,7 +133,7 @@ class DynamicFetcher(BaseFetcher):
         disable_resources: bool = False,
         wait_selector: Optional[str] = None,
         init_script: Optional[str] = None,
-        cookies: Optional[…
+        cookies: Optional[List[Dict]] = None,
         network_idle: bool = False,
         load_dom: bool = True,
         wait_selector_state: SelectorWaitStates = "attached",
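Annotating `cookies` as `Optional[List[Dict]]` documents the expected shape (a list of cookie dicts) and gives Pyright something concrete to verify at call sites. A hedged sketch of a conforming argument (the `fetch` function below is illustrative only):

from typing import Dict, List, Optional

def fetch(url: str, cookies: Optional[List[Dict]] = None) -> None:
    # `cookies or []` keeps iteration safe when the default None is passed.
    for cookie in cookies or []:
        print(cookie["name"], "=", cookie["value"])

fetch(
    "https://example.com",
    cookies=[{"name": "session", "value": "abc123", "domain": "example.com"}],
)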
scrapling/fetchers/firefox.py
CHANGED

@@ -83,8 +83,6 @@ class StealthyFetcher(BaseFetcher):
         """
         if not custom_config:
             custom_config = {}
-        elif not isinstance(custom_config, dict):
-            ValueError(f"The custom parser config must be of type dictionary, got {cls.__class__}")

         with StealthySession(
             wait=wait,
@@ -182,8 +180,6 @@ class StealthyFetcher(BaseFetcher):
         """
         if not custom_config:
             custom_config = {}
-        elif not isinstance(custom_config, dict):
-            ValueError(f"The custom parser config must be of type dictionary, got {cls.__class__}")

         async with AsyncStealthySession(
             wait=wait,
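The deleted `elif` branch was dead code twice over: it constructed a `ValueError` without `raise`, so the check silently did nothing, and its message formatted `cls.__class__` (the metaclass) rather than the offending value. A tiny reproduction of the missing-`raise` bug:

def check(custom_config) -> None:
    if not isinstance(custom_config, dict):
        # Bug: the exception object is created and immediately discarded;
        # without `raise`, this line is a no-op.
        ValueError("The custom parser config must be of type dictionary")

check("not-a-dict")  # completes silently instead of failing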
scrapling/parser.py
CHANGED

@@ -17,17 +17,21 @@ from lxml.etree import (

 from scrapling.core._types import (
     Any,
+    Set,
     Dict,
+    cast,
     List,
     Tuple,
     Union,
     Pattern,
     Callable,
+    Literal,
     Optional,
     Iterable,
     overload,
     Generator,
     SupportsIndex,
+    TYPE_CHECKING,
 )
 from scrapling.core.custom_types import AttributesHandler, TextHandler, TextHandlers
 from scrapling.core.mixins import SelectorsGeneration
@@ -36,7 +40,7 @@ from scrapling.core.storage import (
     StorageSystemMixin,
     _StorageTools,
 )
-from scrapling.core.translator import …
+from scrapling.core.translator import css_to_xpath as _css_to_xpath
 from scrapling.core.utils import clean_spaces, flatten, html_forbidden, log

 __DEFAULT_DB_FILE__ = str(Path(__file__).parent / "elements_storage.db")
@@ -70,20 +74,23 @@ class Selector(SelectorsGeneration):
         "_raw_body",
     )

+    if TYPE_CHECKING:
+        _storage: StorageSystemMixin
+
     def __init__(
         self,
         content: Optional[str | bytes] = None,
-        url: …
+        url: str = "",
         encoding: str = "utf-8",
         huge_tree: bool = True,
         root: Optional[HtmlElement] = None,
         keep_comments: Optional[bool] = False,
         keep_cdata: Optional[bool] = False,
         adaptive: Optional[bool] = False,
-        _storage: …
+        _storage: Optional[StorageSystemMixin] = None,
         storage: Any = SQLiteStorageSystem,
         storage_args: Optional[Dict] = None,
-        **…
+        **_,
     ):
         """The main class that works as a wrapper for the HTML input data. Using this class, you can search for elements
         with expressions in CSS, XPath, or with simply text. Check the docs for more info.
@@ -131,7 +138,7 @@ class Selector(SelectorsGeneration):
                 default_doctype=True,
                 strip_cdata=(not keep_cdata),
             )
-            self._root = fromstring(body, parser=parser, base_url=url)
+            self._root = cast(HtmlElement, fromstring(body, parser=parser, base_url=url or None))
            self._raw_body = content

         else:
@@ -141,7 +148,7 @@ class Selector(SelectorsGeneration):
                     f"Root have to be a valid element of `html` module types to work, not of type {type(root)}"
                 )

-            self._root = root
+            self._root = cast(HtmlElement, root)
             self._raw_body = ""

         self.__adaptive_enabled = adaptive
@@ -238,6 +245,9 @@ class Selector(SelectorsGeneration):
             **self.__response_data,
         )

+    def __elements_convertor(self, elements: List[HtmlElement]) -> "Selectors":
+        return Selectors(map(self.__element_convertor, elements))
+
     def __handle_element(
         self, element: Optional[HtmlElement | _ElementUnicodeResult]
     ) -> Optional[Union[TextHandler, "Selector"]]:
@@ -262,7 +272,7 @@ class Selector(SelectorsGeneration):
         if self._is_text_node(result[0]):
             return TextHandlers(map(TextHandler, result))

-        return …
+        return self.__elements_convertor(result)

     def __getstate__(self) -> Any:
         # lxml don't like it :)
@@ -323,7 +333,7 @@ class Selector(SelectorsGeneration):
             if not valid_values or processed_text.strip():
                 _all_strings.append(processed_text)

-        return TextHandler(separator).join(_all_strings)
+        return cast(TextHandler, TextHandler(separator).join(_all_strings))

     def urljoin(self, relative_url: str) -> str:
         """Join this Selector's url with a relative url to form an absolute full URL."""
@@ -372,13 +382,14 @@ class Selector(SelectorsGeneration):
     @property
     def parent(self) -> Optional["Selector"]:
         """Return the direct parent of the element or ``None`` otherwise"""
-        …
+        _parent = self._root.getparent()
+        return self.__element_convertor(_parent) if _parent is not None else None

     @property
     def below_elements(self) -> "Selectors":
         """Return all elements under the current element in the DOM tree"""
         below = _find_all_elements(self._root)
-        return self.…
+        return self.__elements_convertor(below) if below is not None else Selectors()

     @property
     def children(self) -> "Selectors":
@@ -425,7 +436,7 @@ class Selector(SelectorsGeneration):
             # Ignore HTML comments and unwanted types
             next_element = next_element.getnext()

-        return self.…
+        return self.__element_convertor(next_element) if next_element is not None else None

     @property
     def previous(self) -> Optional["Selector"]:
@@ -435,10 +446,10 @@ class Selector(SelectorsGeneration):
             # Ignore HTML comments and unwanted types
             prev_element = prev_element.getprevious()

-        return self.…
+        return self.__element_convertor(prev_element) if prev_element is not None else None

     # For easy copy-paste from Scrapy/parsel code when needed :)
-    def get(self, default=None):
+    def get(self, default=None):  # pyright: ignore
         return self

     def get_all(self):
@@ -468,6 +479,16 @@ class Selector(SelectorsGeneration):
         return data + ">"

     # From here we start with the selecting functions
+    @overload
+    def relocate(
+        self, element: Union[Dict, HtmlElement, "Selector"], percentage: int, selector_type: Literal[True]
+    ) -> "Selectors": ...
+
+    @overload
+    def relocate(
+        self, element: Union[Dict, HtmlElement, "Selector"], percentage: int, selector_type: Literal[False] = False
+    ) -> List[HtmlElement]: ...
+
     def relocate(
         self,
         element: Union[Dict, HtmlElement, "Selector"],
@@ -506,11 +527,11 @@ class Selector(SelectorsGeneration):
             log.debug(f"Highest probability was {highest_probability}%")
             log.debug("Top 5 best matching elements are: ")
             for percent in tuple(sorted(score_table.keys(), reverse=True))[:5]:
-                log.debug(f"{percent} -> {self.…
+                log.debug(f"{percent} -> {self.__elements_convertor(score_table[percent])}")

             if not selector_type:
                 return score_table[highest_probability]
-            return self.…
+            return self.__elements_convertor(score_table[highest_probability])
         return []

     def css_first(
@@ -593,7 +614,7 @@ class Selector(SelectorsGeneration):
         auto_save: bool = False,
         percentage: int = 0,
         **kwargs: Any,
-    ) -> Union["Selectors", List, "TextHandlers"]:
+    ) -> Union["Selectors", List[Any], "TextHandlers"]:
         """Search the current tree with CSS3 selectors

         **Important:
@@ -614,7 +635,7 @@ class Selector(SelectorsGeneration):
         try:
             if not self.__adaptive_enabled or "," not in selector:
                 # No need to split selectors in this case, let's save some CPU cycles :)
-                xpath_selector = …
+                xpath_selector = _css_to_xpath(selector)
                 return self.xpath(
                     xpath_selector,
                     identifier or selector,
@@ -628,7 +649,7 @@ class Selector(SelectorsGeneration):
             for single_selector in split_selectors(selector):
                 # I'm doing this only so the `save` function saves data correctly for combined selectors
                 # Like using the ',' to combine two different selectors that point to different elements.
-                xpath_selector = …
+                xpath_selector = _css_to_xpath(single_selector.canonical())
                 results += self.xpath(
                     xpath_selector,
                     identifier or single_selector.canonical(),
@@ -731,7 +752,8 @@ class Selector(SelectorsGeneration):
             raise TypeError("You have to pass something to search with, like tag name(s), tag attributes, or both.")

         attributes = dict()
-        tags…
+        tags: Set[str] = set()
+        patterns: Set[Pattern] = set()
         results, functions, selectors = Selectors(), [], []

         # Brace yourself for a wonderful journey!
@@ -740,6 +762,7 @@ class Selector(SelectorsGeneration):
                 tags.add(arg)

             elif type(arg) in (list, tuple, set):
+                arg = cast(Iterable, arg)  # Type narrowing for type checkers like pyright
                 if not all(map(lambda x: isinstance(x, str), arg)):
                     raise TypeError("Nested Iterables are not accepted, only iterables of tag names are accepted")
                 tags.update(set(arg))
@@ -774,7 +797,7 @@ class Selector(SelectorsGeneration):
                 attributes[attribute_name] = value

         # It's easier and faster to build a selector than traversing the tree
-        tags = tags or …
+        tags = tags or set("*")
         for tag in tags:
             selector = tag
             for key, value in attributes.items():
@@ -785,7 +808,7 @@ class Selector(SelectorsGeneration):
             selectors.append(selector)

         if selectors:
-            results = self.css(", ".join(selectors))
+            results = cast(Selectors, self.css(", ".join(selectors)))
             if results:
                 # From the results, get the ones that fulfill passed regex patterns
                 for pattern in patterns:
@@ -828,20 +851,20 @@ class Selector(SelectorsGeneration):
         :return: A percentage score of how similar is the candidate to the original element
         """
         score, checks = 0, 0
-        …
+        data = _StorageTools.element_to_dict(candidate)

         # Possible TODO:
         # Study the idea of giving weight to each test below so some are more important than others
         # Current results: With weights some websites had better score while it was worse for others
-        score += 1 if original["tag"] == …
+        score += 1 if original["tag"] == data["tag"] else 0  # * 0.3 # 30%
         checks += 1

         if original["text"]:
-            score += SequenceMatcher(None, original["text"], …
+            score += SequenceMatcher(None, original["text"], data.get("text") or "").ratio()  # * 0.3 # 30%
             checks += 1

         # if both don't have attributes, it still counts for something!
-        score += self.__calculate_dict_diff(original["attributes"], …
+        score += self.__calculate_dict_diff(original["attributes"], data["attributes"])  # * 0.3 # 30%
         checks += 1

         # Separate similarity test for class, id, href,... this will help in full structural changes
@@ -855,23 +878,23 @@ class Selector(SelectorsGeneration):
             score += SequenceMatcher(
                 None,
                 original["attributes"][attrib],
-                …
+                data["attributes"].get(attrib) or "",
             ).ratio()  # * 0.3 # 30%
             checks += 1

-        score += SequenceMatcher(None, original["path"], …
+        score += SequenceMatcher(None, original["path"], data["path"]).ratio()  # * 0.1 # 10%
         checks += 1

         if original.get("parent_name"):
             # Then we start comparing parents' data
-            if …
+            if data.get("parent_name"):
                 score += SequenceMatcher(
-                    None, original["parent_name"], …
+                    None, original["parent_name"], data.get("parent_name") or ""
                 ).ratio()  # * 0.2 # 20%
                 checks += 1

                 score += self.__calculate_dict_diff(
-                    original["parent_attribs"], …
+                    original["parent_attribs"], data.get("parent_attribs") or {}
                 )  # * 0.2 # 20%
                 checks += 1
@@ -879,7 +902,7 @@ class Selector(SelectorsGeneration):
                 score += SequenceMatcher(
                     None,
                     original["parent_text"],
-                    …
+                    data.get("parent_text") or "",
                 ).ratio()  # * 0.1 # 10%
                 checks += 1
             # else:
@@ -887,9 +910,7 @@ class Selector(SelectorsGeneration):
             #     score -= 0.1

         if original.get("siblings"):
-            score += SequenceMatcher(
-                None, original["siblings"], candidate.get("siblings") or []
-            ).ratio()  # * 0.1 # 10%
+            score += SequenceMatcher(None, original["siblings"], data.get("siblings") or []).ratio()  # * 0.1 # 10%
             checks += 1

         # How % sure? let's see
@@ -902,7 +923,7 @@ class Selector(SelectorsGeneration):
         score += SequenceMatcher(None, tuple(dict1.values()), tuple(dict2.values())).ratio() * 0.5
         return score

-    def save(self, element: …
+    def save(self, element: HtmlElement, identifier: str) -> None:
         """Saves the element's unique properties to the storage for retrieval and relocation later

         :param element: The element itself that we want to save to storage, it can be a ` Selector ` or pure ` HtmlElement `
@@ -910,15 +931,16 @@ class Selector(SelectorsGeneration):
            the docs for more info.
         """
         if self.__adaptive_enabled:
-            …
-            …
+            target = element
+            if isinstance(target, self.__class__):
+                target: HtmlElement = target._root

-            if self._is_text_node(…
-                …
+            if self._is_text_node(target):
+                target: HtmlElement = target.getparent()

-            self._storage.save(…
+            self._storage.save(target, identifier)
         else:
-            …
+            raise RuntimeError(
                 "Can't use `adaptive` features while it's disabled globally, you have to start a new class instance."
             )
@@ -932,10 +954,9 @@ class Selector(SelectorsGeneration):
         if self.__adaptive_enabled:
             return self._storage.retrieve(identifier)

-        …
+        raise RuntimeError(
             "Can't use `adaptive` features while it's disabled globally, you have to start a new class instance."
         )
-        return None

     # Operations on text functions
     def json(self) -> Dict:
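Two recurring moves in the hunks above deserve a note. Declaring `_storage: StorageSystemMixin` under `if TYPE_CHECKING:` tells Pyright the attribute's final type even though it is assigned dynamically in `__init__`, and replacing the old warn-and-return-`None` paths in `save`/`retrieve` with `raise RuntimeError(...)` removes `None` from the inferred return types. A compact sketch of both ideas together (the classes below are stand-ins, not Scrapling's):

from typing import TYPE_CHECKING, Optional

class Storage:
    def retrieve(self, identifier: str) -> dict:
        return {"identifier": identifier}

class Page:
    if TYPE_CHECKING:
        # Visible to the type checker only; no runtime attribute is created.
        _storage: Storage

    def __init__(self, storage: Optional[Storage] = None, enabled: bool = True):
        self._storage = storage or Storage()
        self._enabled = enabled

    def retrieve(self, identifier: str) -> dict:
        if self._enabled:
            return self._storage.retrieve(identifier)
        # Raising (instead of returning None) keeps the return type `dict`.
        raise RuntimeError("adaptive features are disabled for this instance")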
@@ -1104,28 +1125,30 @@
         if not case_sensitive:
             text = text.lower()

-        …
-            node_text = …
-        …
+        possible_targets = _find_all_elements_with_spaces(self._root)
+        if possible_targets:
+            for node in self.__elements_convertor(possible_targets):
+                """Check if element matches given text otherwise, traverse the children tree and iterate"""
+                node_text = node.text
+                if clean_match:
+                    node_text = node_text.clean()
+
+                if not case_sensitive:
+                    node_text = node_text.lower()
+
+                if partial:
+                    if text in node_text:
+                        results.append(node)
+                elif text == node_text:
                     results.append(node)
-        elif text == node_text:
-            results.append(node)

-        …
+                if first_match and results:
+                    # we got an element so we should stop
+                    break

-        …
+        if first_match:
+            if results:
+                return results[0]
         return results

     def find_by_regex(
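The rewritten `find_by_text` above (and `find_by_regex` below) drop per-node recursion in favor of flattening the tree once and filtering, with an early `break` once `first_match` is satisfied. A rough standalone equivalent using plain lxml, where `iter()` stands in for the module's private `_find_all_elements_with_spaces` helper:

from lxml.html import fromstring

root = fromstring("<div><p>Price</p><span>Price today</span><b>Other</b></div>")

results = []
for node in root.iter():  # flatten the whole subtree once
    node_text = (node.text or "").strip()
    if "Price" in node_text:  # partial=True style matching
        results.append(node)

print([node.tag for node in results])  # ['p', 'span']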
@@ -1143,23 +1166,25 @@
         """
         results = Selectors()

-        …
+        possible_targets = _find_all_elements_with_spaces(self._root)
+        if possible_targets:
+            for node in self.__elements_convertor(possible_targets):
+                """Check if element matches given regex otherwise, traverse the children tree and iterate"""
+                node_text = node.text
+                if node_text.re(
+                    query,
+                    check_match=True,
+                    clean_match=clean_match,
+                    case_sensitive=case_sensitive,
+                ):
+                    results.append(node)

-        …
+                if first_match and results:
+                    # we got an element so we should stop
+                    break

-        …
+        if results and first_match:
+            return results[0]
         return results
@@ -1181,9 +1206,9 @@ class Selectors(List[Selector]):
     def __getitem__(self, pos: SupportsIndex | slice) -> Union[Selector, "Selectors"]:
         lst = super().__getitem__(pos)
         if isinstance(pos, slice):
-            return self.__class__(lst)
+            return self.__class__(cast(List[Selector], lst))
         else:
-            return lst
+            return cast(Selector, lst)

     def xpath(
         self,
@@ -1265,7 +1290,7 @@ class Selectors(List[Selector]):
     def re_first(
         self,
         regex: str | Pattern,
-        default=None,
+        default: Any = None,
         replace_entities: bool = True,
         clean_match: bool = False,
         case_sensitive: bool = True,