Karim Shoair committed
Commit: a85d2c8 · Parent: 085d32d

refactor: Make all fetchers as an optional dependency group
Files changed:

- pyproject.toml +6 -2
- scrapling/cli.py +16 -4
- scrapling/core/shell.py +32 -66
- scrapling/core/translator.py +2 -2
- scrapling/core/utils/__init__.py +10 -0
- scrapling/core/utils/_shell.py +48 -0
- scrapling/core/{utils.py → utils/_utils.py} +0 -0
- scrapling/engines/__init__.py +0 -16
- scrapling/engines/_browsers/_base.py +6 -9
- scrapling/engines/_browsers/_camoufox.py +2 -2
- scrapling/engines/_browsers/_config_tools.py +2 -1
- scrapling/engines/_browsers/_controllers.py +2 -2
- scrapling/engines/_browsers/_validators.py +1 -1
- scrapling/engines/static.py +2 -2
- scrapling/engines/toolbelt/__init__.py +0 -9
- scrapling/engines/toolbelt/custom.py +2 -1
- scrapling/engines/toolbelt/fingerprints.py +1 -3
- scrapling/engines/toolbelt/navigation.py +0 -45
- scrapling/fetchers.py +7 -5
- scrapling/parser.py +3 -4
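
The net effect of this commit: the HTTP and browser automation dependencies (`curl_cffi`, `playwright`, `rebrowser-playwright`, and friends) move out of Scrapling's base install into a new `fetchers` extra, so a plain `pip install scrapling` keeps only the parsing core while `pip install "scrapling[fetchers]"` pulls in the fetching stack. A minimal sketch of what the split means for downstream imports (illustrative, not part of the commit):

```python
# Base install: the parser core is always importable.
from scrapling.parser import Selector

# The fetchers now require the optional extra; degrade gracefully without it.
try:
    from scrapling.fetchers import Fetcher  # needs `pip install "scrapling[fetchers]"`
except ImportError:
    Fetcher = None
```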
pyproject.toml
CHANGED

```diff
@@ -61,6 +61,10 @@ dependencies = [
     "click>=8.2.1",
     "orjson>=3.11.3",
     "tldextract>=5.3.0",
+]
+
+[project.optional-dependencies]
+fetchers = [
     "curl_cffi>=0.13.0",
     "playwright>=1.52.0",
     "rebrowser-playwright>=1.52.0",
@@ -68,15 +72,15 @@ dependencies = [
     "geoip2>=5.1.0",
     "msgspec>=0.19.0",
 ]
-
-[project.optional-dependencies]
 ai = [
     "mcp>=1.14.0",
     "markdownify>=1.2.0",
+    "scrapling[fetchers]",
 ]
 shell = [
     "IPython>=8.37", # The last version that supports Python 3.10
     "markdownify>=1.2.0",
+    "scrapling[fetchers]",
 ]
 all = [
     "scrapling[ai,shell]",
```
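Note the self-referencing extras: `ai` and `shell` each declare `scrapling[fetchers]` as a dependency, which is the standard way for one extra to imply another, so installing those extras still brings the fetching stack along. To verify which extras an installed build declares (illustrative snippet, assumes scrapling is installed in the current environment):

```python
from importlib.metadata import metadata

md = metadata("scrapling")
print(md.get_all("Provides-Extra"))  # expected to include 'fetchers', 'ai', 'shell', 'all'
```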
scrapling/cli.py
CHANGED

```diff
@@ -2,11 +2,9 @@ from pathlib import Path
 from subprocess import check_output
 from sys import executable as python_executable
 
-from scrapling.…
-from scrapling.…
+from scrapling.engines.toolbelt.custom import Response
+from scrapling.core.utils import log, _CookieParser, _ParseHeaders
 from scrapling.core._types import List, Optional, Dict, Tuple, Any, Callable
-from scrapling.fetchers import Fetcher, DynamicFetcher, StealthyFetcher
-from scrapling.core.shell import Convertor, _CookieParser, _ParseHeaders
 
 from orjson import loads as json_loads, JSONDecodeError
 from click import command, option, Choice, group, argument
@@ -40,6 +38,8 @@ def __Request_and_Save(
     **kwargs,
 ) -> None:
     """Make a request using the specified fetcher function and save the result"""
+    from scrapling.core.shell import Convertor
+
     # Handle relative paths - convert to an absolute path based on the current working directory
     output_path = Path(output_file)
     if not output_path.is_absolute():
@@ -251,6 +251,8 @@ def get(
         impersonate=impersonate,
         proxy=proxy,
     )
+    from scrapling.fetchers import Fetcher
+
     __Request_and_Save(Fetcher.get, url, output_file, css_selector, **kwargs)
 
 
@@ -347,6 +349,8 @@ def post(
         proxy=proxy,
         data=data,
     )
+    from scrapling.fetchers import Fetcher
+
     __Request_and_Save(Fetcher.post, url, output_file, css_selector, **kwargs)
 
 
@@ -439,6 +443,8 @@ def put(
         proxy=proxy,
         data=data,
     )
+    from scrapling.fetchers import Fetcher
+
     __Request_and_Save(Fetcher.put, url, output_file, css_selector, **kwargs)
 
 
@@ -524,6 +530,8 @@ def delete(
         impersonate=impersonate,
         proxy=proxy,
     )
+    from scrapling.fetchers import Fetcher
+
     __Request_and_Save(Fetcher.delete, url, output_file, css_selector, **kwargs)
 
 
@@ -643,6 +651,8 @@ def fetch(
     if parsed_headers:
         kwargs["extra_headers"] = parsed_headers
 
+    from scrapling.fetchers import DynamicFetcher
+
     __Request_and_Save(DynamicFetcher.fetch, url, output_file, css_selector, **kwargs)
 
 
@@ -790,6 +800,8 @@ def stealthy_fetch(
     if parsed_headers:
         kwargs["extra_headers"] = parsed_headers
 
+    from scrapling.fetchers import StealthyFetcher
+
     __Request_and_Save(StealthyFetcher.fetch, url, output_file, css_selector, **kwargs)
 
 
```
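The pattern throughout cli.py is deferred imports: each command imports its fetcher inside the function body, so the CLI module (and `scrapling --help`) loads even when the `fetchers` extra is absent. A condensed sketch of the idea, using a hypothetical command rather than the project's exact code:

```python
import click

@click.command()
@click.argument("url")
def get(url):
    # Deferred: the optional dependency is only resolved when the command runs,
    # so a missing `scrapling[fetchers]` install fails here with an ImportError,
    # not at CLI startup.
    from scrapling.fetchers import Fetcher
    page = Fetcher.get(url)
    click.echo(page.status)
```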
scrapling/core/shell.py
CHANGED

```diff
@@ -2,7 +2,6 @@
 from re import sub as re_sub
 from sys import stderr
 from functools import wraps
-from http import cookies as Cookie
 from collections import namedtuple
 from shlex import split as shlex_split
 from tempfile import mkstemp as make_temp_file
@@ -23,25 +22,17 @@ from logging import (
 from orjson import loads as json_loads, JSONDecodeError
 
 from scrapling import __version__
-from scrapling.core.custom_types import TextHandler
-from scrapling.core.utils import log
 from scrapling.parser import Selector, Selectors
+from scrapling.core.custom_types import TextHandler
+from scrapling.engines.toolbelt.custom import Response
+from scrapling.core.utils import log, _ParseHeaders, _CookieParser
 from scrapling.core._types import (
-    List,
     Optional,
     Dict,
-    Tuple,
     Any,
     extraction_types,
    Generator,
 )
-from scrapling.fetchers import (
-    Fetcher,
-    AsyncFetcher,
-    DynamicFetcher,
-    StealthyFetcher,
-    Response,
-)
 
 
 _known_logging_levels = {
@@ -71,46 +62,6 @@ Request = namedtuple(
 )
 
 
-def _CookieParser(cookie_string):
-    # Errors will be handled on call so the log can be specified
-    cookie_parser = Cookie.SimpleCookie()
-    cookie_parser.load(cookie_string)
-    for key, morsel in cookie_parser.items():
-        yield key, morsel.value
-
-
-def _ParseHeaders(header_lines: List[str], parse_cookies: bool = True) -> Tuple[Dict[str, str], Dict[str, str]]:
-    """Parses headers into separate header and cookie dictionaries."""
-    header_dict = dict()
-    cookie_dict = dict()
-
-    for header_line in header_lines:
-        if ":" not in header_line:
-            if header_line.endswith(";"):
-                header_key = header_line[:-1].strip()
-                header_value = ""
-                header_dict[header_key] = header_value
-            else:
-                raise ValueError(f"Could not parse header without colon: '{header_line}'.")
-        else:
-            header_key, header_value = header_line.split(":", 1)
-            header_key = header_key.strip()
-            header_value = header_value.strip()
-
-            if parse_cookies:
-                if header_key.lower() == "cookie":
-                    try:
-                        cookie_dict = {key: value for key, value in _CookieParser(header_value)}
-                    except Exception as e:  # pragma: no cover
-                        raise ValueError(f"Could not parse cookie string from header '{header_value}': {e}")
-                else:
-                    header_dict[header_key] = header_value
-            else:
-                header_dict[header_key] = header_value
-
-    return header_dict, cookie_dict
-
-
 # Suppress exit on error to handle parsing errors gracefully
 class NoExitArgumentParser(ArgumentParser):  # pragma: no cover
     def error(self, message):
@@ -128,6 +79,9 @@ class CurlParser:
     """Builds the argument parser for relevant curl flags from DevTools."""
 
     def __init__(self):
+        from scrapling.fetchers import Fetcher as __Fetcher
+
+        self.__fetcher = __Fetcher
         # We will use argparse parser to parse the curl command directly instead of regex
         # We will focus more on flags that will show up on curl commands copied from DevTools's network tab
         _parser = NoExitArgumentParser(add_help=False)  # Disable default help
@@ -343,7 +297,7 @@ class CurlParser:
             _ = request_args.pop("json", None)
 
         try:
-            return getattr(Fetcher, method)(**request_args)
+            return getattr(self.__Fetcher, method)(**request_args)
         except Exception as e:  # pragma: no cover
             log.error(f"Error calling Fetcher.{method}: {e}")
             return None
@@ -377,6 +331,19 @@ class CustomShell:
     """A custom IPython shell with minimal dependencies"""
 
     def __init__(self, code, log_level="debug"):
+        from IPython.terminal.embed import InteractiveShellEmbed as __InteractiveShellEmbed
+        from scrapling.fetchers import (
+            Fetcher as __Fetcher,
+            AsyncFetcher as __AsyncFetcher,
+            DynamicFetcher as __DynamicFetcher,
+            StealthyFetcher as __StealthyFetcher,
+        )
+
+        self.__InteractiveShellEmbed = __InteractiveShellEmbed
+        self.__Fetcher = __Fetcher
+        self.__AsyncFetcher = __AsyncFetcher
+        self.__DynamicFetcher = __DynamicFetcher
+        self.__StealthyFetcher = __StealthyFetcher
         self.code = code
         self.page = None
         self.pages = Selectors([])
@@ -400,7 +367,7 @@
         if self.log_level:
             getLogger("scrapling").setLevel(self.log_level)
 
-        settings = Fetcher.display_config()
+        settings = self.__Fetcher.display_config()
         settings.pop("storage", None)
         settings.pop("storage_args", None)
         log.info(f"Scrapling {__version__} shell started")
@@ -466,12 +433,12 @@ Type 'exit' or press Ctrl+D to exit.
         """Create a namespace with application-specific objects"""
 
         # Create wrapped versions of fetch functions
-        get = self.create_wrapper(Fetcher.get)
-        post = self.create_wrapper(Fetcher.post)
-        put = self.create_wrapper(Fetcher.put)
-        delete = self.create_wrapper(Fetcher.delete)
-        dynamic_fetch = self.create_wrapper(DynamicFetcher.fetch)
-        stealthy_fetch = self.create_wrapper(StealthyFetcher.fetch)
+        get = self.create_wrapper(self.__Fetcher.get)
+        post = self.create_wrapper(self.__Fetcher.post)
+        put = self.create_wrapper(self.__Fetcher.put)
+        delete = self.create_wrapper(self.__Fetcher.delete)
+        dynamic_fetch = self.create_wrapper(self.__DynamicFetcher.fetch)
+        stealthy_fetch = self.create_wrapper(self.__StealthyFetcher.fetch)
         curl2fetcher = self.create_wrapper(self._curl_parser.convert2fetcher)
 
         # Create the namespace dictionary
@@ -480,12 +447,12 @@ Type 'exit' or press Ctrl+D to exit.
             "post": post,
             "put": put,
             "delete": delete,
-            "Fetcher": Fetcher,
-            "AsyncFetcher": AsyncFetcher,
+            "Fetcher": self.__Fetcher,
+            "AsyncFetcher": self.__AsyncFetcher,
             "fetch": dynamic_fetch,
-            "DynamicFetcher": DynamicFetcher,
+            "DynamicFetcher": self.__DynamicFetcher,
             "stealthy_fetch": stealthy_fetch,
-            "StealthyFetcher": StealthyFetcher,
+            "StealthyFetcher": self.__StealthyFetcher,
             "Selector": Selector,
             "page": self.page,
             "response": self.page,
@@ -502,11 +469,10 @@ Type 'exit' or press Ctrl+D to exit.
 
     def start(self):  # pragma: no cover
         """Start the interactive shell"""
-        from IPython.terminal.embed import InteractiveShellEmbed
 
         # Get our namespace with application objects
         namespace = self.get_namespace()
-        ipython_shell = InteractiveShellEmbed(
+        ipython_shell = self.__InteractiveShellEmbed(
             banner1=self.banner(),
             banner2="",
             enable_tip=False,
```
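Two details of this rewrite are worth calling out. First, the fetcher classes and `InteractiveShellEmbed` are now resolved inside `__init__` and bound to the instance, so importing `scrapling.core.shell` no longer drags in IPython or the browser stack. Second, attributes spelled `self.__Fetcher` rely on Python's name mangling (inside `CustomShell` the attribute is actually stored as `_CustomShell__Fetcher`), so the double-underscore names must be read from methods of the same class. A minimal sketch of the binding pattern, with hypothetical names:

```python
class LazyShell:
    def __init__(self):
        # Resolve the optional dependency once, at construction time.
        from IPython.terminal.embed import InteractiveShellEmbed
        self.__shell_cls = InteractiveShellEmbed  # mangled to _LazyShell__shell_cls

    def start(self):
        shell = self.__shell_cls(banner1="demo", banner2="")
        shell()  # an InteractiveShellEmbed instance is callable and starts the REPL
```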
scrapling/core/translator.py
CHANGED

```diff
@@ -10,10 +10,10 @@ So you don't have to learn a new selectors/api method like what bs4 done with so
 
 from functools import lru_cache
 
-from cssselect import HTMLTranslator as OriginalHTMLTranslator
-from cssselect.parser import Element, FunctionalPseudoElement, PseudoElement
 from cssselect.xpath import ExpressionError
 from cssselect.xpath import XPathExpr as OriginalXPathExpr
+from cssselect import HTMLTranslator as OriginalHTMLTranslator
+from cssselect.parser import Element, FunctionalPseudoElement, PseudoElement
 
 from scrapling.core._types import Any, Optional, Protocol, Self
 
```
scrapling/core/utils/__init__.py
ADDED

```diff
@@ -0,0 +1,10 @@
+from ._utils import (
+    log,
+    __CONSECUTIVE_SPACES_REGEX__,
+    flatten,
+    _is_iterable,
+    _StorageTools,
+    clean_spaces,
+    html_forbidden,
+)
+from ._shell import _CookieParser, _ParseHeaders
```
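Turning `scrapling/core/utils.py` into a package while re-exporting everything from the new `_utils.py` keeps the old import paths stable; both of the following should resolve to the same object (illustrative check):

```python
from scrapling.core.utils import clean_spaces
from scrapling.core.utils._utils import clean_spaces as clean_spaces_direct

assert clean_spaces is clean_spaces_direct
```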
scrapling/core/utils/_shell.py
ADDED

```diff
@@ -0,0 +1,48 @@
+from http import cookies as Cookie
+
+
+from scrapling.core._types import (
+    List,
+    Dict,
+    Tuple,
+)
+
+
+def _CookieParser(cookie_string):
+    # Errors will be handled on call so the log can be specified
+    cookie_parser = Cookie.SimpleCookie()
+    cookie_parser.load(cookie_string)
+    for key, morsel in cookie_parser.items():
+        yield key, morsel.value
+
+
+def _ParseHeaders(header_lines: List[str], parse_cookies: bool = True) -> Tuple[Dict[str, str], Dict[str, str]]:
+    """Parses headers into separate header and cookie dictionaries."""
+    header_dict = dict()
+    cookie_dict = dict()
+
+    for header_line in header_lines:
+        if ":" not in header_line:
+            if header_line.endswith(";"):
+                header_key = header_line[:-1].strip()
+                header_value = ""
+                header_dict[header_key] = header_value
+            else:
+                raise ValueError(f"Could not parse header without colon: '{header_line}'.")
+        else:
+            header_key, header_value = header_line.split(":", 1)
+            header_key = header_key.strip()
+            header_value = header_value.strip()
+
+            if parse_cookies:
+                if header_key.lower() == "cookie":
+                    try:
+                        cookie_dict = {key: value for key, value in _CookieParser(header_value)}
+                    except Exception as e:  # pragma: no cover
+                        raise ValueError(f"Could not parse cookie string from header '{header_value}': {e}")
+                else:
+                    header_dict[header_key] = header_value
+            else:
+                header_dict[header_key] = header_value
+
+    return header_dict, cookie_dict
```
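Based on the function bodies above, a quick illustration of what the relocated helpers return:

```python
headers, cookies = _ParseHeaders(
    ["Accept: text/html", "Cookie: session=abc123; theme=dark"],
    parse_cookies=True,
)
# headers == {"Accept": "text/html"}
# cookies == {"session": "abc123", "theme": "dark"}
```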
scrapling/core/utils.py → scrapling/core/utils/_utils.py
RENAMED
File without changes
scrapling/engines/__init__.py
CHANGED

```diff
@@ -1,16 +0,0 @@
-from .constants import DEFAULT_DISABLED_RESOURCES, DEFAULT_STEALTH_FLAGS, DEFAULT_FLAGS
-from .static import FetcherSession, FetcherClient, AsyncFetcherClient
-from ._browsers import (
-    DynamicSession,
-    AsyncDynamicSession,
-    StealthySession,
-    AsyncStealthySession,
-)
-
-__all__ = [
-    "FetcherSession",
-    "DynamicSession",
-    "AsyncDynamicSession",
-    "StealthySession",
-    "AsyncStealthySession",
-]
```
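Emptying this `__init__.py` is the linchpin of the refactor: previously, importing anything under `scrapling.engines` executed these re-exports and therefore pulled in the playwright/camoufox stack as a side effect. Callers now import from the concrete submodule instead (illustrative):

```python
# Before (removed): from scrapling.engines import FetcherSession
# After:
from scrapling.engines.static import FetcherSession
```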
scrapling/engines/_browsers/_base.py
CHANGED

```diff
@@ -12,20 +12,17 @@ from camoufox.utils import (
     installed_verstr as camoufox_version,
 )
 
-from scrapling.engines.toolbelt import (
-    intercept_route,
-    async_intercept_route,
-    get_os_name,
-)
-from ._page import PageInfo, PagePool
-from ._config_tools import _compiled_stealth_scripts
-from ._validators import validate, PlaywrightConfig, CamoufoxConfig
-from ._config_tools import _launch_kwargs, _context_kwargs
+from scrapling.engines.toolbelt.navigation import intercept_route, async_intercept_route
 from scrapling.core._types import (
     Any,
     Dict,
     Optional,
 )
+from ._page import PageInfo, PagePool
+from ._config_tools import _compiled_stealth_scripts
+from ._config_tools import _launch_kwargs, _context_kwargs
+from scrapling.engines.toolbelt.fingerprints import get_os_name
+from ._validators import validate, PlaywrightConfig, CamoufoxConfig
 
 __ff_version_str__ = camoufox_version().split(".", 1)[0]
 
```
scrapling/engines/_browsers/_camoufox.py
CHANGED

```diff
@@ -25,11 +25,11 @@ from scrapling.core._types import (
     Callable,
     SelectorWaitStates,
 )
-from scrapling.engines.toolbelt import (
+from scrapling.engines.toolbelt.convertor import (
     Response,
     ResponseFactory,
-    generate_convincing_referer,
 )
+from scrapling.engines.toolbelt.fingerprints import generate_convincing_referer
 
 __CF_PATTERN__ = re_compile("challenges.cloudflare.com/cdn-cgi/challenge-platform/.*")
 _UNSET = object()
```
scrapling/engines/_browsers/_config_tools.py
CHANGED

```diff
@@ -6,7 +6,8 @@ from scrapling.engines.constants import (
     HARMFUL_DEFAULT_ARGS,
     DEFAULT_FLAGS,
 )
-from scrapling.engines.toolbelt import js_bypass_path
+from scrapling.engines.toolbelt.navigation import js_bypass_path
+from scrapling.engines.toolbelt.fingerprints import generate_headers
 
 __default_useragent__ = generate_headers(browser_mode=True).get("User-Agent")
 
```
scrapling/engines/_browsers/_controllers.py
CHANGED

```diff
@@ -26,11 +26,11 @@ from scrapling.core._types import (
     Callable,
     SelectorWaitStates,
 )
-from scrapling.engines.toolbelt import (
+from scrapling.engines.toolbelt.convertor import (
     Response,
     ResponseFactory,
-    generate_convincing_referer,
 )
+from scrapling.engines.toolbelt.fingerprints import generate_convincing_referer
 
 _UNSET = object()
 
```
scrapling/engines/_browsers/_validators.py
CHANGED

```diff
@@ -9,7 +9,7 @@ from scrapling.core._types import (
     List,
     SelectorWaitStates,
 )
-from scrapling.engines.toolbelt import construct_proxy_dict
+from scrapling.engines.toolbelt.navigation import construct_proxy_dict
 
 
 class PlaywrightConfig(Struct, kw_only=True, frozen=False):
```
scrapling/engines/static.py
CHANGED

```diff
@@ -26,11 +26,11 @@ from scrapling.core._types import (
 
 from .toolbelt import (
     Response,
-    generate_convincing_referer,
     generate_headers,
-    ResponseFactory,
     __default_useragent__,
 )
+from .toolbelt.convertor import ResponseFactory
+from .toolbelt.fingerprints import generate_convincing_referer
 
 _UNSET = object()
 
```
scrapling/engines/toolbelt/__init__.py
CHANGED

```diff
@@ -5,16 +5,7 @@ from .custom import (
     get_variable_name,
 )
 from .fingerprints import (
-    generate_convincing_referer,
     generate_headers,
     get_os_name,
     __default_useragent__,
 )
-from .navigation import (
-    async_intercept_route,
-    construct_cdp_url,
-    construct_proxy_dict,
-    intercept_route,
-    js_bypass_path,
-)
-from .convertor import ResponseFactory
```
scrapling/engines/toolbelt/custom.py
CHANGED

```diff
@@ -2,8 +2,10 @@
 Functions related to custom types or type checking
 """
 
+from functools import lru_cache
 from email.message import Message
 
+from scrapling.core.utils import log
 from scrapling.core._types import (
     Any,
     Dict,
@@ -12,7 +14,6 @@ from scrapling.core._types import (
     Tuple,
 )
 from scrapling.core.custom_types import MappingProxyType
-from scrapling.core.utils import log, lru_cache
 from scrapling.parser import Selector, SQLiteStorageSystem
 
 
```
scrapling/engines/toolbelt/fingerprints.py
CHANGED

```diff
@@ -2,13 +2,13 @@
 Functions related to generating headers and fingerprints generally
 """
 
+from functools import lru_cache
 from platform import system as platform_system
 
 from tldextract import extract
 from browserforge.headers import Browser, HeaderGenerator
 
 from scrapling.core._types import Dict, Optional
-from scrapling.core.utils import lru_cache
 
 __OS_NAME__ = platform_system()
 
@@ -37,8 +37,6 @@ def get_os_name() -> Optional[str]:
         "Linux": "linux",
         "Darwin": "macos",
         "Windows": "windows",
-        # For the future? because why not?
-        "iOS": "ios",
     }.get(__OS_NAME__)
 
 
```
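Besides the `lru_cache` relocation, `get_os_name()` loses its speculative `"iOS"` entry; per the mapping above it now returns `"linux"`, `"macos"`, or `"windows"`, and `None` on anything else. The equivalent standalone logic (sketch):

```python
from platform import system

os_name = {"Linux": "linux", "Darwin": "macos", "Windows": "windows"}.get(system())
```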
scrapling/engines/toolbelt/navigation.py
CHANGED

```diff
@@ -86,51 +86,6 @@ def construct_proxy_dict(proxy_string: str | Dict[str, str], as_tuple=False) ->
     return None
 
 
-def construct_cdp_url(cdp_url: str, query_params: Optional[Dict] = None) -> str:
-    """Takes a CDP URL, reconstruct it to check it's valid, then adds encoded parameters if exists
-
-    :param cdp_url: The target URL.
-    :param query_params: A dictionary of the parameters to add.
-    :return: The new CDP URL.
-    """
-    try:
-        # Validate the base URL structure
-        parsed = urlparse(cdp_url)
-
-        # Check scheme
-        if parsed.scheme not in ("ws", "wss"):
-            raise ValueError("CDP URL must use 'ws://' or 'wss://' scheme")
-
-        # Validate hostname and port
-        if not parsed.netloc:
-            raise ValueError("Invalid hostname for the CDP URL")
-
-        try:
-            # Checking if the port is valid (if available)
-            _ = parsed.port
-        except ValueError:
-            # urlparse will raise `ValueError` if the port can't be casted to integer
-            raise ValueError("Invalid port for the CDP URL")
-
-        # Ensure the path starts with /
-        path = parsed.path
-        if not path.startswith("/"):
-            path = "/" + path
-
-        # Reconstruct the base URL with validated parts
-        validated_base = f"{parsed.scheme}://{parsed.netloc}{path}"
-
-        # Add query parameters
-        if query_params:
-            query_string = urlencode(query_params)
-            return f"{validated_base}?{query_string}"
-
-        return validated_base
-
-    except Exception as e:
-        raise ValueError(f"Invalid CDP URL: {str(e)}")
-
-
 @lru_cache(10, typed=True)
 def js_bypass_path(filename: str) -> str:
     """Takes the base filename of a JS file inside the `bypasses` folder, then return the full path of it
```
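`construct_cdp_url` is deleted outright (the file stats show no replacement added here), so the ws/wss scheme, host, and port validation it performed goes with it. The core of what it enforced, for reference:

```python
from urllib.parse import urlparse

parsed = urlparse("ws://127.0.0.1:9222/devtools/browser")  # hypothetical CDP endpoint
assert parsed.scheme in ("ws", "wss") and parsed.netloc
_ = parsed.port  # raises ValueError when the port is not a valid integer
```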
scrapling/fetchers.py
CHANGED

```diff
@@ -6,15 +6,17 @@ from scrapling.core._types import (
     SelectorWaitStates,
     Iterable,
 )
-from scrapling.engines import (
+from scrapling.engines.static import (
     FetcherSession,
-    StealthySession,
-    AsyncStealthySession,
-    DynamicSession,
-    AsyncDynamicSession,
     FetcherClient as _FetcherClient,
     AsyncFetcherClient as _AsyncFetcherClient,
 )
+from scrapling.engines._browsers import (
+    DynamicSession,
+    StealthySession,
+    AsyncDynamicSession,
+    AsyncStealthySession,
+)
 from scrapling.engines.toolbelt import BaseFetcher, Response
 
 __FetcherClientInstance__ = _FetcherClient()
```
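The user-facing `scrapling.fetchers` API is untouched by this re-wiring; only where the session classes come from changes. Usage stays as before (assuming the `fetchers` extra is installed):

```python
from scrapling.fetchers import Fetcher

page = Fetcher.get("https://example.com")
print(page.status)
```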
scrapling/parser.py
CHANGED

```diff
@@ -1,12 +1,11 @@
-from pathlib import Path
 import re
+from pathlib import Path
 from inspect import signature
-from difflib import SequenceMatcher
 from urllib.parse import urljoin
+from difflib import SequenceMatcher
 
-from cssselect import SelectorError, SelectorSyntaxError
-from cssselect import parse as split_selectors
 from lxml.html import HtmlElement, HtmlMixin, HTMLParser
+from cssselect import SelectorError, SelectorSyntaxError, parse as split_selectors
 from lxml.etree import (
     XPath,
     tostring,
```