Spaces:

GraziePrego
/

scrapling

Paused

App Files Files Community

GraziePrego committed on Apr 5

Commit

5fabe30

verified ·

1 Parent(s): cff1e76

Upload folder using huggingface_hub

Browse files

Files changed (46) hide show

__init__.py +38 -0
cli.py +826 -0
core/__init__.py +0 -0
core/_shell_signatures.py +100 -0
core/_types.py +57 -0
core/ai.py +653 -0
core/custom_types.py +345 -0
core/mixins.py +85 -0
core/shell.py +643 -0
core/storage.py +156 -0
core/translator.py +134 -0
core/utils/__init__.py +11 -0
core/utils/_shell.py +48 -0
core/utils/_utils.py +120 -0
engines/__init__.py +0 -0
engines/_browsers/__init__.py +0 -0
engines/_browsers/_base.py +534 -0
engines/_browsers/_config_tools.py +4 -0
engines/_browsers/_controllers.py +362 -0
engines/_browsers/_page.py +87 -0
engines/_browsers/_stealth.py +541 -0
engines/_browsers/_types.py +118 -0
engines/_browsers/_validators.py +229 -0
engines/constants.py +99 -0
engines/static.py +770 -0
engines/toolbelt/__init__.py +3 -0
engines/toolbelt/convertor.py +306 -0
engines/toolbelt/custom.py +295 -0
engines/toolbelt/fingerprints.py +88 -0
engines/toolbelt/navigation.py +109 -0
engines/toolbelt/proxy_rotation.py +104 -0
fetchers/__init__.py +48 -0
fetchers/chrome.py +91 -0
fetchers/requests.py +28 -0
fetchers/stealth_chrome.py +109 -0
parser.py +1363 -0
py.typed +1 -0
spiders/__init__.py +18 -0
spiders/checkpoint.py +90 -0
spiders/engine.py +333 -0
spiders/request.py +163 -0
spiders/result.py +125 -0
spiders/scheduler.py +80 -0
spiders/session.py +145 -0
spiders/spider.py +316 -0
ui.py +57 -0

__init__.py ADDED Viewed

	@@ -0,0 +1,38 @@

+__author__ = "Karim Shoair (karim.shoair@pm.me)"
+__version__ = "0.4.1"
+__copyright__ = "Copyright (c) 2024 Karim Shoair"
+from typing import Any, TYPE_CHECKING
+if TYPE_CHECKING:
+    from scrapling.parser import Selector, Selectors
+    from scrapling.core.custom_types import AttributesHandler, TextHandler
+    from scrapling.fetchers import Fetcher, AsyncFetcher, StealthyFetcher, DynamicFetcher
+# Lazy import mapping: public attribute name -> (module path, attribute name inside that module).
+# The runtime imports of these names above are guarded by TYPE_CHECKING, so they are only
+# actually imported when the module-level `__getattr__` resolves one of them.
+_LAZY_IMPORTS = {
+    "Fetcher": ("scrapling.fetchers", "Fetcher"),
+    "Selector": ("scrapling.parser", "Selector"),
+    "Selectors": ("scrapling.parser", "Selectors"),
+    "AttributesHandler": ("scrapling.core.custom_types", "AttributesHandler"),
+    "TextHandler": ("scrapling.core.custom_types", "TextHandler"),
+    "AsyncFetcher": ("scrapling.fetchers", "AsyncFetcher"),
+    "StealthyFetcher": ("scrapling.fetchers", "StealthyFetcher"),
+    "DynamicFetcher": ("scrapling.fetchers", "DynamicFetcher"),
+}
+# Names exported by `from scrapling import *`.
+# NOTE(review): "Selectors", "AttributesHandler" and "TextHandler" are lazily importable but not listed here - confirm this is intentional.
+__all__ = ["Selector", "Fetcher", "AsyncFetcher", "StealthyFetcher", "DynamicFetcher"]
+def __getattr__(name: str) -> Any:
+    """Resolve a lazily-imported public name the first time it is accessed on the package.
+    :param name: The attribute that normal module lookup did not find.
+    :return: The object registered for ``name`` in ``_LAZY_IMPORTS``.
+    :raises AttributeError: If ``name`` is not one of the lazily-importable names.
+    """
+    try:
+        module_path, class_name = _LAZY_IMPORTS[name]
+    except KeyError:
+        raise AttributeError(f"module {__name__!r} has no attribute {name!r}") from None
+    from importlib import import_module
+    value = getattr(import_module(module_path), class_name)
+    # Store the resolved object in the module namespace so later lookups find it directly
+    # and never re-enter this hook.
+    globals()[name] = value
+    return value
+def __dir__() -> list[str]:
+    """Support for dir() and autocomplete.
+    :return: A sorted list of the public names, every lazily-importable name, the listed submodules, and the metadata dunders.
+    """
+    # Include every key of `_LAZY_IMPORTS`, not only the `__all__` subset, so names such as
+    # "Selectors" that `__getattr__` can resolve are also offered by autocomplete.
+    names = set(__all__) | set(_LAZY_IMPORTS)
+    names.update(("fetchers", "parser", "cli", "core", "__author__", "__version__", "__copyright__"))
+    return sorted(names)

cli.py ADDED Viewed

	@@ -0,0 +1,826 @@

+from pathlib import Path
+from subprocess import check_output
+from sys import executable as python_executable
+from scrapling.core.utils import log
+from scrapling.engines.toolbelt.custom import Response
+from scrapling.core.utils._shell import _CookieParser, _ParseHeaders
+from scrapling.core._types import List, Optional, Dict, Tuple, Any, Callable
+from orjson import loads as json_loads, JSONDecodeError
+try:
+    from click import command, option, Choice, group, argument
+except (ImportError, ModuleNotFoundError) as e:
+    raise ModuleNotFoundError(
+        "You need to install scrapling with any of the extras to enable Shell commands. See: https://scrapling.readthedocs.io/en/latest/#installation"
+    ) from e
+# Shared paragraph appended to the help text of every `extract` sub-command.
+__OUTPUT_FILE_HELP__ = "The output file path can be an HTML file, a Markdown file of the HTML content, or the text content itself. Use file extensions (`.html`/`.md`/`.txt`) respectively."
+# Directory containing this module; `install` keeps its "dependencies installed" marker file here.
+__PACKAGE_DIR__ = Path(__file__).parent
+def __Execute(cmd: List[str], help_line: str) -> None:  # pragma: no cover
+    """Announce an installation step, then run its command and wait for it to finish.
+    :param cmd: The program and its arguments as a list; it is run without a shell.
+    :param help_line: Human-readable name of what is being installed, used in the printed message.
+    """
+    print(f"Installing {help_line}...")
+    # Intentionally no try/except here: any failure propagates to the caller.
+    check_output(cmd, shell=False)  # nosec B603
+def __ParseJSONData(json_string: Optional[str] = None) -> Optional[Dict[str, Any]]:
+    """Parse JSON string into a Python object
+    :param json_string: The raw JSON text; ``None`` or an empty string means "no JSON given".
+    :return: The decoded object, or ``None`` when no JSON text was given.
+    :raises ValueError: If the text is not valid JSON; the decoder error is attached as the cause.
+    """
+    if not json_string:
+        return None
+    try:
+        return json_loads(json_string)
+    except JSONDecodeError as err:  # pragma: no cover
+        # Chain explicitly so the traceback shows the decoder error as the direct cause
+        raise ValueError(f"Invalid JSON data '{json_string}': {err}") from err
+def __Request_and_Save(
+    fetcher_func: Callable[..., Response],
+    url: str,
+    output_file: str,
+    css_selector: Optional[str] = None,
+    **kwargs,
+) -> None:
+    """Fetch `url` with the given fetcher function and write the response to `output_file`.
+    :param fetcher_func: Callable invoked as ``fetcher_func(url, **kwargs)`` to obtain the response.
+    :param url: Target URL for the request.
+    :param output_file: Destination path; a relative path is resolved against the current working directory.
+    :param css_selector: Optional CSS selector forwarded to the writer together with the response.
+    :param kwargs: Extra keyword arguments forwarded unchanged to `fetcher_func`.
+    """
+    from scrapling.core.shell import Convertor
+    requested = Path(output_file)
+    # Absolute paths are used as given; anything else is anchored at the current working directory
+    destination = requested if requested.is_absolute() else Path.cwd() / output_file
+    page = fetcher_func(url, **kwargs)
+    Convertor.write_content_to_file(page, str(destination), css_selector)
+    log.info(f"Content successfully saved to '{destination}'")
+def __ParseExtractArguments(
+    headers: List[str], cookies: Optional[str], params: Tuple[str, ...], json: Optional[str] = None
+) -> Tuple[Dict[str, str], Dict[str, str], Dict[str, str], Optional[Dict[str, Any]]]:
+    """Parse arguments for extract command
+    :param headers: Raw header lines, handed to `_ParseHeaders`.
+    :param cookies: Optional cookie string; its pairs are merged over any cookies `_ParseHeaders` returned.
+    :param params: Query parameters, one "key=value" string each. Entries without "=" are ignored.
+    :param json: Optional JSON string for the request body.
+    :return: A tuple of (headers, cookies, query parameters, JSON body or ``None``).
+    :raises ValueError: If the cookie string or the JSON string cannot be parsed.
+    """
+    parsed_headers, parsed_cookies = _ParseHeaders(headers)
+    if cookies:
+        # Any error from `_CookieParser` is raised when it is called or iterated, never by the
+        # dict assignment, so the whole loop has to sit inside the `try`.
+        try:
+            for key, value in _CookieParser(cookies):
+                parsed_cookies[key] = value
+        except Exception as err:
+            raise ValueError(f"Could not parse cookies '{cookies}': {err}") from err
+    parsed_json = __ParseJSONData(json)
+    # Split on the first "=" only, so a value may itself contain "="
+    parsed_params = dict(param.split("=", 1) for param in params if "=" in param)
+    return parsed_headers, parsed_cookies, parsed_params, parsed_json
+def __BuildRequest(
+    headers: List[str], cookies: Optional[str], params: Tuple[str, ...], json: Optional[str] = None, **kwargs
+) -> Dict:
+    """Build the keyword arguments for a fetcher call from raw command-line values.
+    :param headers: Raw header lines.
+    :param cookies: Optional cookie string.
+    :param params: Query parameters, one "key=value" string each.
+    :param json: Optional JSON string for the request body.
+    :param kwargs: Remaining options; they are merged into the result and override parsed entries with the same key.
+    :return: A dict with "headers" and "cookies" always present (``None`` when empty), "json"/"params" only when non-empty, plus everything from `kwargs`.
+    :raises ValueError: If the cookies or the JSON string cannot be parsed.
+    """
+    parsed_headers, parsed_cookies, parsed_params, parsed_json = __ParseExtractArguments(headers, cookies, params, json)
+    # Empty header/cookie mappings are passed as None rather than as empty dicts
+    request_kwargs: Dict[str, Any] = {
+        "headers": parsed_headers or None,
+        "cookies": parsed_cookies or None,
+    }
+    if parsed_json:
+        request_kwargs["json"] = parsed_json
+    if parsed_params:
+        request_kwargs["params"] = parsed_params
+    if "proxy" in kwargs:
+        request_kwargs["proxy"] = kwargs.pop("proxy")
+    # A comma-separated impersonate value becomes a list of browser names (for random selection)
+    impersonate = kwargs.get("impersonate") or ""
+    if "," in impersonate:
+        kwargs["impersonate"] = [browser.strip() for browser in impersonate.split(",")]
+    return {**request_kwargs, **kwargs}
+@command(help="Install all Scrapling's Fetchers dependencies")
+@option("-f", "--force", "force", is_flag=True, default=False, type=bool, help="Force Scrapling to reinstall all Fetchers dependencies")
+def install(force):  # pragma: no cover
+    """Install the Playwright Chromium browser and its dependencies, then update the TLD names.
+    :param force: Run the installation even if the marker file from an earlier run exists.
+    """
+    marker = __PACKAGE_DIR__.joinpath(".scrapling_dependencies_installed")
+    if not force and marker.exists():
+        print("The dependencies are already installed")
+        return
+    playwright = [python_executable, "-m", "playwright"]
+    __Execute([*playwright, "install", "chromium"], "Playwright browsers")
+    __Execute([*playwright, "install-deps", "chromium"], "Playwright dependencies")
+    from tld.utils import update_tld_names
+    update_tld_names(fail_silently=True)
+    # Reached only if nothing above raised, so the marker records a completed installation
+    marker.touch()
+@command(help="Run Scrapling's MCP server (Check the docs for more info).")
+@option("--http", is_flag=True, default=False, help="Whether to run the MCP server in streamable-http transport or leave it as stdio (Default: False)")
+@option("--host", type=str, default="0.0.0.0", help="The host to use if streamable-http transport is enabled (Default: '0.0.0.0')")
+@option("--port", type=int, default=8000, help="The port to use if streamable-http transport is enabled (Default: 8000)")
+def mcp(http, host, port):
+    """Create a `ScraplingMCPServer` and start serving.
+    :param http: Flag forwarded to `serve`; per the option help it selects streamable-http over stdio.
+    :param host: Host forwarded to `serve`.
+    :param port: Port forwarded to `serve`.
+    """
+    # NOTE(review): the default host "0.0.0.0" means all interfaces - confirm that exposure is intended.
+    from scrapling.core.ai import ScraplingMCPServer
+    ScraplingMCPServer().serve(http, host, port)
+@command(help="Interactive scraping console")
+@option("-c", "--code", "code", is_flag=False, default="", type=str, help="Evaluate the code in the shell, print the result and exit")
+@option(
+    "-L",
+    "--loglevel",
+    "level",
+    is_flag=False,
+    default="debug",
+    type=Choice(["debug", "info", "warning", "error", "critical", "fatal"], case_sensitive=False),
+    help="Log level (default: DEBUG)",
+)
+def shell(code, level):
+    """Start the interactive console.
+    :param code: Code string handed to `CustomShell`; empty by default.
+    :param level: Log level name handed to `CustomShell`.
+    """
+    from scrapling.core.shell import CustomShell
+    CustomShell(code=code, log_level=level).start()
+# Command group that holds the get/post/put/delete/fetch/stealthy-fetch sub-commands; it does no work itself.
+@group(help="Fetch web pages using various fetchers and extract full/selected HTML content as HTML, Markdown, or extract text content.")
+def extract():
+    """Extract content from web pages and save to files"""
+def __HTTPRequestOptions(func: Callable) -> Callable:
+    """Attach the options shared by the `get`, `post`, `put` and `delete` extract commands.
+    :param func: The command callback to decorate.
+    :return: The same callback with the shared click options registered on it.
+    """
+    shared_options = (
+        option("--headers", "-H", multiple=True, help='HTTP headers in format "Key: Value" (can be used multiple times)'),
+        option("--cookies", help='Cookies string in format "name1=value1; name2=value2"'),
+        option("--timeout", type=int, default=30, help="Request timeout in seconds (default: 30)"),
+        option("--proxy", help='Proxy URL in format "http://username:password@host:port"'),
+        option("--css-selector", "-s", help="CSS selector to extract specific content from the page. It returns all matches."),
+        option("--params", "-p", multiple=True, help='Query parameters in format "key=value" (can be used multiple times)'),
+        option("--follow-redirects/--no-follow-redirects", default=True, help="Whether to follow redirects (default: True)"),
+        option("--verify/--no-verify", default=True, help="Whether to verify SSL certificates (default: True)"),
+        option(
+            "--impersonate",
+            help="Browser to impersonate. Can be a single browser (e.g., chrome) or comma-separated list for random selection (e.g., chrome,firefox,safari).",
+        ),
+        option("--stealthy-headers/--no-stealthy-headers", default=True, help="Use stealthy browser headers (default: True)"),
+    )
+    # Stacked decorators apply bottom-up, so apply in reverse to register the options in the order listed above
+    for attach in reversed(shared_options):
+        func = attach(func)
+    return func
+@extract.command(help=f"Perform a GET request and save the content to a file.\n\n{__OUTPUT_FILE_HELP__}")
+@argument("url", required=True)
+@argument("output_file", required=True)
+@__HTTPRequestOptions
+def get(
+    url,
+    output_file,
+    headers,
+    cookies,
+    timeout,
+    proxy,
+    css_selector,
+    params,
+    follow_redirects,
+    verify,
+    impersonate,
+    stealthy_headers,
+):
+    """
+    Perform a GET request and save the content to a file.
+    :param url: Target URL for the request.
+    :param output_file: Output file path (.html for HTML, .md for Markdown, .txt for text content).
+    :param headers: HTTP headers to include in the request.
+    :param cookies: Cookies to use in the request.
+    :param timeout: Number of seconds to wait before timing out.
+    :param proxy: Proxy URL to use. (Format: "http://username:password@localhost:8030")
+    :param css_selector: CSS selector to extract specific content.
+    :param params: Query string parameters for the request.
+    :param follow_redirects: Whether to follow redirects.
+    :param verify: Whether to verify HTTPS certificates.
+    :param impersonate: Browser version to impersonate.
+    :param stealthy_headers: If enabled, creates and adds real browser headers.
+    """
+    kwargs = __BuildRequest(
+        headers,
+        cookies,
+        params,
+        None,
+        timeout=timeout,
+        follow_redirects=follow_redirects,
+        verify=verify,
+        stealthy_headers=stealthy_headers,
+        impersonate=impersonate,
+        proxy=proxy,
+    )
+    from scrapling.fetchers import Fetcher
+    __Request_and_Save(Fetcher.get, url, output_file, css_selector, **kwargs)
+@extract.command(help=f"Perform a POST request and save the content to a file.\n\n{__OUTPUT_FILE_HELP__}")
+@argument("url", required=True)
+@argument("output_file", required=True)
+@option(
+    "--data",
+    "-d",
+    help='Form data to include in the request body (as string, ex: "param1=value1&param2=value2")',
+)
+@option("--json", "-j", help="JSON data to include in the request body (as string)")
+@__HTTPRequestOptions
+def post(
+    url,
+    output_file,
+    data,
+    json,
+    headers,
+    cookies,
+    timeout,
+    proxy,
+    css_selector,
+    params,
+    follow_redirects,
+    verify,
+    impersonate,
+    stealthy_headers,
+):
+    """
+    Perform a POST request and save the content to a file.
+    :param url: Target URL for the request.
+    :param output_file: Output file path (.html for HTML, .md for Markdown, .txt for text content).
+    :param data: Form data to include in the request body. (as string, ex: "param1=value1&param2=value2")
+    :param json: JSON text to include in the body of the request.
+    :param headers: Headers to include in the request.
+    :param cookies: Cookies to use in the request.
+    :param timeout: Number of seconds to wait before timing out.
+    :param proxy: Proxy URL to use.
+    :param css_selector: CSS selector to extract specific content.
+    :param params: Query string parameters for the request.
+    :param follow_redirects: Whether to follow redirects.
+    :param verify: Whether to verify HTTPS certificates.
+    :param impersonate: Browser version to impersonate.
+    :param stealthy_headers: If enabled, creates and adds real browser headers.
+    """
+    kwargs = __BuildRequest(
+        headers,
+        cookies,
+        params,
+        json,
+        timeout=timeout,
+        follow_redirects=follow_redirects,
+        verify=verify,
+        stealthy_headers=stealthy_headers,
+        impersonate=impersonate,
+        proxy=proxy,
+        data=data,
+    )
+    from scrapling.fetchers import Fetcher
+    __Request_and_Save(Fetcher.post, url, output_file, css_selector, **kwargs)
+@extract.command(help=f"Perform a PUT request and save the content to a file.\n\n{__OUTPUT_FILE_HELP__}")
+@argument("url", required=True)
+@argument("output_file", required=True)
+@option("--data", "-d", help="Form data to include in the request body")
+@option("--json", "-j", help="JSON data to include in the request body (as string)")
+@__HTTPRequestOptions
+def put(
+    url,
+    output_file,
+    data,
+    json,
+    headers,
+    cookies,
+    timeout,
+    proxy,
+    css_selector,
+    params,
+    follow_redirects,
+    verify,
+    impersonate,
+    stealthy_headers,
+):
+    """
+    Perform a PUT request and save the content to a file.
+    :param url: Target URL for the request.
+    :param output_file: Output file path (.html for HTML, .md for Markdown, .txt for text content).
+    :param data: Form data to include in the request body.
+    :param json: JSON text to include in the body of the request.
+    :param headers: Headers to include in the request.
+    :param cookies: Cookies to use in the request.
+    :param timeout: Number of seconds to wait before timing out.
+    :param proxy: Proxy URL to use.
+    :param css_selector: CSS selector to extract specific content.
+    :param params: Query string parameters for the request.
+    :param follow_redirects: Whether to follow redirects.
+    :param verify: Whether to verify HTTPS certificates.
+    :param impersonate: Browser version to impersonate.
+    :param stealthy_headers: If enabled, creates and adds real browser headers.
+    """
+    kwargs = __BuildRequest(
+        headers,
+        cookies,
+        params,
+        json,
+        timeout=timeout,
+        follow_redirects=follow_redirects,
+        verify=verify,
+        stealthy_headers=stealthy_headers,
+        impersonate=impersonate,
+        proxy=proxy,
+        data=data,
+    )
+    from scrapling.fetchers import Fetcher
+    __Request_and_Save(Fetcher.put, url, output_file, css_selector, **kwargs)
+@extract.command(help=f"Perform a DELETE request and save the content to a file.\n\n{__OUTPUT_FILE_HELP__}")
+@argument("url", required=True)
+@argument("output_file", required=True)
+@__HTTPRequestOptions
+def delete(
+    url,
+    output_file,
+    headers,
+    cookies,
+    timeout,
+    proxy,
+    css_selector,
+    params,
+    follow_redirects,
+    verify,
+    impersonate,
+    stealthy_headers,
+):
+    """
+    Perform a DELETE request and save the content to a file.
+    :param url: Target URL for the request.
+    :param output_file: Output file path (.html for HTML, .md for Markdown, .txt for text content).
+    :param headers: Headers to include in the request.
+    :param cookies: Cookies to use in the request.
+    :param timeout: Number of seconds to wait before timing out.
+    :param proxy: Proxy URL to use.
+    :param css_selector: CSS selector to extract specific content.
+    :param params: Query string parameters for the request.
+    :param follow_redirects: Whether to follow redirects.
+    :param verify: Whether to verify HTTPS certificates.
+    :param impersonate: Browser version to impersonate.
+    :param stealthy_headers: If enabled, creates and adds real browser headers.
+    """
+    kwargs = __BuildRequest(
+        headers,
+        cookies,
+        params,
+        None,
+        timeout=timeout,
+        follow_redirects=follow_redirects,
+        verify=verify,
+        stealthy_headers=stealthy_headers,
+        impersonate=impersonate,
+        proxy=proxy,
+    )
+    from scrapling.fetchers import Fetcher
+    __Request_and_Save(Fetcher.delete, url, output_file, css_selector, **kwargs)
+@extract.command(help=f"Use DynamicFetcher to fetch content with browser automation.\n\n{__OUTPUT_FILE_HELP__}")
+@argument("url", required=True)
+@argument("output_file", required=True)
+@option("--headless/--no-headless", default=True, help="Run browser in headless mode (default: True)")
+@option("--disable-resources/--enable-resources", default=False, help="Drop unnecessary resources for speed boost (default: False)")
+@option("--network-idle/--no-network-idle", default=False, help="Wait for network idle (default: False)")
+@option("--timeout", type=int, default=30000, help="Timeout in milliseconds (default: 30000)")
+@option("--wait", type=int, default=0, help="Additional wait time in milliseconds after page load (default: 0)")
+@option("--css-selector", "-s", help="CSS selector to extract specific content from the page. It returns all matches.")
+@option("--wait-selector", help="CSS selector to wait for before proceeding")
+@option("--locale", default=None, help="Specify user locale. Defaults to the system default locale.")
+@option(
+    "--real-chrome/--no-real-chrome",
+    default=False,
+    help="If you have a Chrome browser installed on your device, enable this, and the Fetcher will launch an instance of your browser and use it. (default: False)",
+)
+@option("--proxy", help='Proxy URL in format "http://username:password@host:port"')
+@option("--extra-headers", "-H", multiple=True, help='Extra headers in format "Key: Value" (can be used multiple times)')
+def fetch(
+    url,
+    output_file,
+    headless,
+    disable_resources,
+    network_idle,
+    timeout,
+    wait,
+    css_selector,
+    wait_selector,
+    locale,
+    real_chrome,
+    proxy,
+    extra_headers,
+):
+    """
+    Opens up a browser and fetch content using DynamicFetcher.
+    :param url: Target url.
+    :param output_file: Output file path (.md for Markdown, .html for HTML).
+    :param headless: Run the browser in headless/hidden or headful/visible mode.
+    :param disable_resources: Drop requests of unnecessary resources for a speed boost.
+    :param network_idle: Wait for the page until there are no network connections for at least 500 ms.
+    :param timeout: The timeout in milliseconds that is used in all operations and waits through the page.
+    :param wait: The time (milliseconds) the fetcher will wait after everything finishes before returning.
+    :param css_selector: CSS selector to extract specific content.
+    :param wait_selector: Wait for a specific CSS selector to be in a specific state.
+    :param locale: Set the locale for the browser.
+    :param real_chrome: If you have a Chrome browser installed on your device, enable this, and the Fetcher will launch an instance of your browser and use it.
+    :param proxy: The proxy to be used with requests.
+    :param extra_headers: Extra headers to add to the request.
+    """
+    # Only the parsed headers are needed; the second returned value is discarded
+    parsed_headers, _ = _ParseHeaders(extra_headers, False)
+    # Settings that are always forwarded to the fetcher
+    kwargs = dict(
+        headless=headless,
+        disable_resources=disable_resources,
+        network_idle=network_idle,
+        timeout=timeout,
+        locale=locale,
+        real_chrome=real_chrome,
+    )
+    # Settings forwarded only when set: (keyword, value, whether to forward it)
+    optional = (
+        ("wait", wait, wait > 0),
+        ("wait_selector", wait_selector, bool(wait_selector)),
+        ("proxy", proxy, bool(proxy)),
+        ("extra_headers", parsed_headers, bool(parsed_headers)),
+    )
+    kwargs.update({keyword: value for keyword, value, wanted in optional if wanted})
+    from scrapling.fetchers import DynamicFetcher
+    __Request_and_Save(DynamicFetcher.fetch, url, output_file, css_selector, **kwargs)
+@extract.command(help=f"Use StealthyFetcher to fetch content with advanced stealth features.\n\n{__OUTPUT_FILE_HELP__}")
+@argument("url", required=True)
+@argument("output_file", required=True)
+@option("--headless/--no-headless", default=True, help="Run browser in headless mode (default: True)")
+@option("--disable-resources/--enable-resources", default=False, help="Drop unnecessary resources for speed boost (default: False)")
+@option("--block-webrtc/--allow-webrtc", default=False, help="Block WebRTC entirely (default: False)")
+@option("--solve-cloudflare/--no-solve-cloudflare", default=False, help="Solve Cloudflare challenges (default: False)")
+@option("--allow-webgl/--block-webgl", default=True, help="Allow WebGL (default: True)")
+@option("--network-idle/--no-network-idle", default=False, help="Wait for network idle (default: False)")
+@option(
+    "--real-chrome/--no-real-chrome",
+    default=False,
+    help="If you have a Chrome browser installed on your device, enable this, and the Fetcher will launch an instance of your browser and use it. (default: False)",
+)
+@option("--hide-canvas/--show-canvas", default=False, help="Add noise to canvas operations (default: False)")
+@option("--timeout", type=int, default=30000, help="Timeout in milliseconds (default: 30000)")
+@option("--wait", type=int, default=0, help="Additional wait time in milliseconds after page load (default: 0)")
+@option("--css-selector", "-s", help="CSS selector to extract specific content from the page. It returns all matches.")
+@option("--wait-selector", help="CSS selector to wait for before proceeding")
+@option("--proxy", help='Proxy URL in format "http://username:password@host:port"')
+@option("--extra-headers", "-H", multiple=True, help='Extra headers in format "Key: Value" (can be used multiple times)')
+def stealthy_fetch(
+    url,
+    output_file,
+    headless,
+    disable_resources,
+    block_webrtc,
+    solve_cloudflare,
+    allow_webgl,
+    network_idle,
+    real_chrome,
+    hide_canvas,
+    timeout,
+    wait,
+    css_selector,
+    wait_selector,
+    proxy,
+    extra_headers,
+):
+    """
+    Opens up a browser with advanced stealth features and fetch content using StealthyFetcher.
+    :param url: Target url.
+    :param output_file: Output file path (.md for Markdown, .html for HTML).
+    :param headless: Run the browser in headless/hidden, or headful/visible mode.
+    :param disable_resources: Drop requests of unnecessary resources for a speed boost.
+    :param block_webrtc: Blocks WebRTC entirely.
+    :param solve_cloudflare: Solves all types of the Cloudflare's Turnstile/Interstitial challenges.
+    :param allow_webgl: Allow WebGL (recommended to keep enabled).
+    :param network_idle: Wait for the page until there are no network connections for at least 500 ms.
+    :param real_chrome: If you have a Chrome browser installed on your device, enable this, and the Fetcher will launch an instance of your browser and use it.
+    :param hide_canvas: Add random noise to canvas operations to prevent fingerprinting.
+    :param timeout: The timeout in milliseconds that is used in all operations and waits through the page.
+    :param wait: The time (milliseconds) the fetcher will wait after everything finishes before returning.
+    :param css_selector: CSS selector to extract specific content.
+    :param wait_selector: Wait for a specific CSS selector to be in a specific state.
+    :param proxy: The proxy to be used with requests.
+    :param extra_headers: Extra headers to add to the request.
+    """
+    # Only the parsed headers are needed; the second returned value is discarded
+    parsed_headers, _ = _ParseHeaders(extra_headers, False)
+    # Settings that are always forwarded to the fetcher
+    kwargs = dict(
+        headless=headless,
+        disable_resources=disable_resources,
+        block_webrtc=block_webrtc,
+        solve_cloudflare=solve_cloudflare,
+        allow_webgl=allow_webgl,
+        network_idle=network_idle,
+        real_chrome=real_chrome,
+        hide_canvas=hide_canvas,
+        timeout=timeout,
+    )
+    # Settings forwarded only when set: (keyword, value, whether to forward it)
+    optional = (
+        ("wait", wait, wait > 0),
+        ("wait_selector", wait_selector, bool(wait_selector)),
+        ("proxy", proxy, bool(proxy)),
+        ("extra_headers", parsed_headers, bool(parsed_headers)),
+    )
+    kwargs.update({keyword: value for keyword, value, wanted in optional if wanted})
+    from scrapling.fetchers import StealthyFetcher
+    __Request_and_Save(StealthyFetcher.fetch, url, output_file, css_selector, **kwargs)
+# Root command group of the CLI. It has no docstring on purpose: click would use a docstring
+# as the group's help text because no explicit `help` is passed to `group()`.
+@group()
+def main():
+    pass
+# Adding commands
+for _subcommand in (install, shell, extract, mcp):
+    main.add_command(_subcommand)
+# Do not leave the loop variable behind in the module namespace
+del _subcommand

core/__init__.py ADDED Viewed

File without changes

core/_shell_signatures.py ADDED Viewed

	@@ -0,0 +1,100 @@

+from scrapling.core._types import (
+    Any,
+    Dict,
+    List,
+    Tuple,
+    Sequence,
+    Callable,
+    Optional,
+    SetCookieParam,
+    SelectorWaitStates,
+)
+# Parameter definitions for shell function signatures (defined once at module level)
+# Mirrors TypedDict definitions from _types.py but runtime-accessible for IPython introspection
+# Each mapping is keyword-argument name -> type annotation.
+# Keyword arguments of the request-style shell functions; `Signatures_map` uses this mapping
+# for "get" and extends it with "data"/"json" for "post" and "put".
+_REQUESTS_PARAMS = {
+    "params": Optional[Dict | List | Tuple],
+    "cookies": Any,
+    "auth": Optional[Tuple[str, str]],
+    "impersonate": Any,
+    "http3": Optional[bool],
+    "stealthy_headers": Optional[bool],
+    "proxies": Any,
+    "proxy": Optional[str],
+    "proxy_auth": Optional[Tuple[str, str]],
+    "timeout": Optional[int | float],
+    "headers": Any,
+    "retries": Optional[int],
+    "retry_delay": Optional[int],
+    "follow_redirects": Optional[bool],
+    "max_redirects": Optional[int],
+    "verify": Optional[bool],
+    "cert": Optional[str | Tuple[str, str]],
+    "selector_config": Optional[Dict],
+}
+_FETCH_PARAMS = {
+    "headless": bool,
+    "disable_resources": bool,
+    "network_idle": bool,
+    "load_dom": bool,
+    "wait_selector": Optional[str],
+    "wait_selector_state": SelectorWaitStates,
+    "cookies": Sequence[SetCookieParam],
+    "google_search": bool,
+    "wait": int | float,
+    "timezone_id": str | None,
+    "page_action": Optional[Callable],
+    "proxy": Optional[str | Dict[str, str] | Tuple],
+    "extra_headers": Optional[Dict[str, str]],
+    "timeout": int | float,
+    "init_script": Optional[str],
+    "user_data_dir": str,
+    "selector_config": Optional[Dict],
+    "additional_args": Optional[Dict],
+    "locale": Optional[str],
+    "real_chrome": bool,
+    "cdp_url": Optional[str],
+    "useragent": Optional[str],
+    "extra_flags": Optional[List[str]],
+}
+_STEALTHY_FETCH_PARAMS = {
+    "headless": bool,
+    "disable_resources": bool,
+    "network_idle": bool,
+    "load_dom": bool,
+    "wait_selector": Optional[str],
+    "wait_selector_state": SelectorWaitStates,
+    "cookies": Sequence[SetCookieParam],
+    "google_search": bool,
+    "wait": int | float,
+    "timezone_id": str | None,
+    "page_action": Optional[Callable],
+    "proxy": Optional[str | Dict[str, str] | Tuple],
+    "extra_headers": Optional[Dict[str, str]],
+    "timeout": int | float,
+    "init_script": Optional[str],
+    "user_data_dir": str,
+    "selector_config": Optional[Dict],
+    "additional_args": Optional[Dict],
+    "locale": Optional[str],
+    "real_chrome": bool,
+    "cdp_url": Optional[str],
+    "useragent": Optional[str],
+    "extra_flags": Optional[List[str]],
+    "allow_webgl": bool,
+    "hide_canvas": bool,
+    "block_webrtc": bool,
+    "solve_cloudflare": bool,
+}
+# Mapping of function names to their parameter definitions
+Signatures_map = {
+    "get": _REQUESTS_PARAMS,
+    "post": {**_REQUESTS_PARAMS, "data": Optional[Dict | str], "json": Optional[Dict | List]},
+    "put": {**_REQUESTS_PARAMS, "data": Optional[Dict | str], "json": Optional[Dict | List]},
+    "delete": _REQUESTS_PARAMS,
+    "fetch": _FETCH_PARAMS,
+    "stealthy_fetch": _STEALTHY_FETCH_PARAMS,
+}

core/_types.py ADDED Viewed

	@@ -0,0 +1,57 @@

+"""
+Type definitions for type checking purposes.
+"""
+from typing import (
+    TYPE_CHECKING,
+    TypedDict,
+    TypeAlias,
+    cast,
+    overload,
+    Any,
+    Callable,
+    Dict,
+    Generator,
+    AsyncGenerator,
+    Generic,
+    Iterable,
+    List,
+    Set,
+    Literal,
+    Optional,
+    Iterator,
+    Pattern,
+    Sequence,
+    Tuple,
+    TypeVar,
+    Union,
+    Match,
+    Mapping,
+    Awaitable,
+    Protocol,
+    Coroutine,
+    SupportsIndex,
+)
+from typing_extensions import Self, Unpack
+# Proxy can be a string URL or a dict (Playwright format: {"server": "...", "username": "...", "password": "..."})
+ProxyType = Union[str, Dict[str, str]]
+# Closed set of HTTP method names, spelled in upper case.
+SUPPORTED_HTTP_METHODS = Literal["GET", "POST", "PUT", "DELETE"]
+# Element states accepted wherever a `wait_selector_state` argument is annotated with this alias.
+SelectorWaitStates = Literal["attached", "detached", "hidden", "visible"]
+# Closed set of page-load state names.
+PageLoadStates = Literal["commit", "domcontentloaded", "load", "networkidle"]
+# Output formats accepted wherever an `extraction_type` argument is annotated with this alias.
+extraction_types = Literal["text", "html", "markdown"]
+# Either text or raw bytes.
+StrOrBytes = Union[str, bytes]
+# Copied from `playwright._impl._api_structures.SetCookieParam`
+# Kept as a local copy so cookie arguments can be annotated with this module's own type.
+class SetCookieParam(TypedDict, total=False):
+    """Shape of one cookie dictionary. `total=False` makes every key optional for type checkers."""
+    name: str
+    value: str
+    url: Optional[str]
+    domain: Optional[str]
+    path: Optional[str]
+    expires: Optional[float]
+    httpOnly: Optional[bool]
+    secure: Optional[bool]
+    sameSite: Optional[Literal["Lax", "None", "Strict"]]
+    partitionKey: Optional[str]

core/ai.py ADDED Viewed

	@@ -0,0 +1,653 @@

+from asyncio import gather
+from mcp.server.fastmcp import FastMCP
+from pydantic import BaseModel, Field
+from starlette.requests import Request
+from starlette.responses import Response, JSONResponse
+from scrapling.core.shell import Convertor
+from scrapling.engines.toolbelt.custom import Response as _ScraplingResponse
+from scrapling.engines.static import ImpersonateType
+from scrapling.fetchers import (
+    Fetcher,
+    FetcherSession,
+    DynamicFetcher,
+    AsyncDynamicSession,
+    StealthyFetcher,
+    AsyncStealthySession,
+)
+from scrapling.core._types import (
+    Optional,
+    Tuple,
+    Mapping,
+    Dict,
+    List,
+    Any,
+    Generator,
+    Sequence,
+    SetCookieParam,
+    extraction_types,
+    SelectorWaitStates,
+)
+# Structured result type of the tools defined on `ScraplingMCPServer`: one instance describes one page.
+# NOTE(review): pydantic presumably exposes the class docstring and the `Field` descriptions in the
+# generated schema, so treat those strings as user-facing text -- confirm before rewording them.
+class ResponseModel(BaseModel):
+    """Request's response information structure."""
+    # Filled from the fetched page's `status` by `_content_translator`.
+    status: int = Field(description="The status code returned by the website.")
+    # The extracted pieces of content, collected from the extraction generator.
+    content: list[str] = Field(description="The content as Markdown/HTML or the text content of the page.")
+    # Filled from the fetched page's `url` by `_content_translator`.
+    url: str = Field(description="The URL given by the user that resulted in this response.")
+def _content_translator(content: Generator[str, None, None], page: _ScraplingResponse) -> ResponseModel:
+    """Convert a content generator to a list of ResponseModel objects."""
+    return ResponseModel(status=page.status, content=[result for result in content], url=page.url)
+def _normalize_credentials(credentials: Optional[Dict[str, str]]) -> Optional[Tuple[str, str]]:
+    """Convert a credentials dictionary to a tuple accepted by fetchers."""
+    if not credentials:
+        return None
+    username = credentials.get("username")
+    password = credentials.get("password")
+    if username is None or password is None:
+        raise ValueError("Credentials dictionary must contain both 'username' and 'password' keys")
+    return username, password
+class ScraplingMCPServer:
+    @staticmethod
+    def get(
+        url: str,
+        impersonate: ImpersonateType = "chrome",
+        extraction_type: extraction_types = "markdown",
+        css_selector: Optional[str] = None,
+        main_content_only: bool = True,
+        params: Optional[Dict] = None,
+        headers: Optional[Mapping[str, Optional[str]]] = None,
+        cookies: Optional[Dict[str, str]] = None,
+        timeout: Optional[int | float] = 30,
+        follow_redirects: bool = True,
+        max_redirects: int = 30,
+        retries: Optional[int] = 3,
+        retry_delay: Optional[int] = 1,
+        proxy: Optional[str] = None,
+        proxy_auth: Optional[Dict[str, str]] = None,
+        auth: Optional[Dict[str, str]] = None,
+        verify: Optional[bool] = True,
+        http3: Optional[bool] = False,
+        stealthy_headers: Optional[bool] = True,
+    ) -> ResponseModel:
+        """Make GET HTTP request to a URL and return a structured output of the result.
+        Note: This is only suitable for low-mid protection levels. For high-protection levels or websites that require JS loading, use the other tools directly.
+        Note: If the `css_selector` resolves to more than one element, all the elements will be returned.
+        :param url: The URL to request.
+        :param impersonate: Browser version to impersonate its fingerprint. It's using the latest chrome version by default.
+        :param extraction_type: The type of content to extract from the page. Defaults to "markdown". Options are:
+            - Markdown will convert the page content to Markdown format.
+            - HTML will return the raw HTML content of the page.
+            - Text will return the text content of the page.
+        :param css_selector: CSS selector to extract the content from the page. If main_content_only is True, then it will be executed on the main content of the page. Defaults to None.
+        :param main_content_only: Whether to extract only the main content of the page. Defaults to True. The main content here is the data inside the `<body>` tag.
+        :param params: Query string parameters for the request.
+        :param headers: Headers to include in the request.
+        :param cookies: Cookies to use in the request.
+        :param timeout: Number of seconds to wait before timing out.
+        :param follow_redirects: Whether to follow redirects. Defaults to True.
+        :param max_redirects: Maximum number of redirects. Default 30, use -1 for unlimited.
+        :param retries: Number of retry attempts. Defaults to 3.
+        :param retry_delay: Number of seconds to wait between retry attempts. Defaults to 1 second.
+        :param proxy: Proxy URL to use. Format: "http://username:password@localhost:8030".
+                     Cannot be used together with the `proxies` parameter.
+        :param proxy_auth: HTTP basic auth for proxy in dictionary format with `username` and `password` keys.
+        :param auth: HTTP basic auth in dictionary format with `username` and `password` keys.
+        :param verify: Whether to verify HTTPS certificates.
+        :param http3: Whether to use HTTP3. Defaults to False. It might be problematic if used it with `impersonate`.
+        :param stealthy_headers: If enabled (default), it creates and adds real browser headers. It also sets the referer header as if this request came from a Google search of URL's domain.
+        """
+        normalized_proxy_auth = _normalize_credentials(proxy_auth)
+        normalized_auth = _normalize_credentials(auth)
+        page = Fetcher.get(
+            url,
+            auth=normalized_auth,
+            proxy=proxy,
+            http3=http3,
+            verify=verify,
+            params=params,
+            proxy_auth=normalized_proxy_auth,
+            retry_delay=retry_delay,
+            stealthy_headers=stealthy_headers,
+            impersonate=impersonate,
+            headers=headers,
+            cookies=cookies,
+            timeout=timeout,
+            retries=retries,
+            max_redirects=max_redirects,
+            follow_redirects=follow_redirects,
+        )
+        return _content_translator(
+            Convertor._extract_content(
+                page,
+                css_selector=css_selector,
+                extraction_type=extraction_type,
+                main_content_only=main_content_only,
+            ),
+            page,
+        )
+    @staticmethod
+    async def bulk_get(
+        urls: List[str],
+        impersonate: ImpersonateType = "chrome",
+        extraction_type: extraction_types = "markdown",
+        css_selector: Optional[str] = None,
+        main_content_only: bool = True,
+        params: Optional[Dict] = None,
+        headers: Optional[Mapping[str, Optional[str]]] = None,
+        cookies: Optional[Dict[str, str]] = None,
+        timeout: Optional[int | float] = 30,
+        follow_redirects: bool = True,
+        max_redirects: int = 30,
+        retries: Optional[int] = 3,
+        retry_delay: Optional[int] = 1,
+        proxy: Optional[str] = None,
+        proxy_auth: Optional[Dict[str, str]] = None,
+        auth: Optional[Dict[str, str]] = None,
+        verify: Optional[bool] = True,
+        http3: Optional[bool] = False,
+        stealthy_headers: Optional[bool] = True,
+    ) -> List[ResponseModel]:
+        """Make GET HTTP request to a group of URLs and for each URL, return a structured output of the result.
+        Note: This is only suitable for low-mid protection levels. For high-protection levels or websites that require JS loading, use the other tools directly.
+        Note: If the `css_selector` resolves to more than one element, all the elements will be returned.
+        :param urls: A list of the URLs to request.
+        :param impersonate: Browser version to impersonate its fingerprint. It's using the latest chrome version by default.
+        :param extraction_type: The type of content to extract from the page. Defaults to "markdown". Options are:
+            - Markdown will convert the page content to Markdown format.
+            - HTML will return the raw HTML content of the page.
+            - Text will return the text content of the page.
+        :param css_selector: CSS selector to extract the content from the page. If main_content_only is True, then it will be executed on the main content of the page. Defaults to None.
+        :param main_content_only: Whether to extract only the main content of the page. Defaults to True. The main content here is the data inside the `<body>` tag.
+        :param params: Query string parameters for the request.
+        :param headers: Headers to include in the request.
+        :param cookies: Cookies to use in the request.
+        :param timeout: Number of seconds to wait before timing out.
+        :param follow_redirects: Whether to follow redirects. Defaults to True.
+        :param max_redirects: Maximum number of redirects. Default 30, use -1 for unlimited.
+        :param retries: Number of retry attempts. Defaults to 3.
+        :param retry_delay: Number of seconds to wait between retry attempts. Defaults to 1 second.
+        :param proxy: Proxy URL to use. Format: "http://username:password@localhost:8030".
+                     Cannot be used together with the `proxies` parameter.
+        :param proxy_auth: HTTP basic auth for proxy in dictionary format with `username` and `password` keys.
+        :param auth: HTTP basic auth in dictionary format with `username` and `password` keys.
+        :param verify: Whether to verify HTTPS certificates.
+        :param http3: Whether to use HTTP3. Defaults to False. It might be problematic if used it with `impersonate`.
+        :param stealthy_headers: If enabled (default), it creates and adds real browser headers. It also sets the referer header as if this request came from a Google search of URL's domain.
+        """
+        normalized_proxy_auth = _normalize_credentials(proxy_auth)
+        normalized_auth = _normalize_credentials(auth)
+        async with FetcherSession() as session:
+            tasks: List[Any] = [
+                session.get(
+                    url,
+                    auth=normalized_auth,
+                    proxy=proxy,
+                    http3=http3,
+                    verify=verify,
+                    params=params,
+                    headers=headers,
+                    cookies=cookies,
+                    timeout=timeout,
+                    retries=retries,
+                    proxy_auth=normalized_proxy_auth,
+                    retry_delay=retry_delay,
+                    impersonate=impersonate,
+                    max_redirects=max_redirects,
+                    follow_redirects=follow_redirects,
+                    stealthy_headers=stealthy_headers,
+                )
+                for url in urls
+            ]
+            responses = await gather(*tasks)
+            return [
+                _content_translator(
+                    Convertor._extract_content(
+                        page,
+                        css_selector=css_selector,
+                        extraction_type=extraction_type,
+                        main_content_only=main_content_only,
+                    ),
+                    page,
+                )
+                for page in responses
+            ]
+    @staticmethod
+    async def fetch(
+        url: str,
+        extraction_type: extraction_types = "markdown",
+        css_selector: Optional[str] = None,
+        main_content_only: bool = True,
+        headless: bool = True,  # noqa: F821
+        google_search: bool = True,
+        real_chrome: bool = False,
+        wait: int | float = 0,
+        proxy: Optional[str | Dict[str, str]] = None,
+        timezone_id: str | None = None,
+        locale: str | None = None,
+        extra_headers: Optional[Dict[str, str]] = None,
+        useragent: Optional[str] = None,
+        cdp_url: Optional[str] = None,
+        timeout: int | float = 30000,
+        disable_resources: bool = False,
+        wait_selector: Optional[str] = None,
+        cookies: Sequence[SetCookieParam] | None = None,
+        network_idle: bool = False,
+        wait_selector_state: SelectorWaitStates = "attached",
+    ) -> ResponseModel:
+        """Use playwright to open a browser to fetch a URL and return a structured output of the result.
+        Note: This is only suitable for low-mid protection levels.
+        Note: If the `css_selector` resolves to more than one element, all the elements will be returned.
+        :param url: The URL to request.
+        :param extraction_type: The type of content to extract from the page. Defaults to "markdown". Options are:
+            - Markdown will convert the page content to Markdown format.
+            - HTML will return the raw HTML content of the page.
+            - Text will return the text content of the page.
+        :param css_selector: CSS selector to extract the content from the page. If main_content_only is True, then it will be executed on the main content of the page. Defaults to None.
+        :param main_content_only: Whether to extract only the main content of the page. Defaults to True. The main content here is the data inside the `<body>` tag.
+        :param headless: Run the browser in headless/hidden (default), or headful/visible mode.
+        :param disable_resources: Drop requests for unnecessary resources for a speed boost.
+            Requests dropped are of type `font`, `image`, `media`, `beacon`, `object`, `imageset`, `texttrack`, `websocket`, `csp_report`, and `stylesheet`.
+        :param useragent: Pass a useragent string to be used. Otherwise the fetcher will generate a real Useragent of the same browser and use it.
+        :param cookies: Set cookies for the next request. It should be in a dictionary format that Playwright accepts.
+        :param network_idle: Wait for the page until there are no network connections for at least 500 ms.
+        :param timeout: The timeout in milliseconds that is used in all operations and waits through the page. The default is 30,000
+        :param wait: The time (milliseconds) the fetcher will wait after everything finishes before closing the page and returning the ` Response ` object.
+        :param wait_selector: Wait for a specific CSS selector to be in a specific state.
+        :param timezone_id: Changes the timezone of the browser. Defaults to the system timezone.
+        :param locale: Specify user locale, for example, `en-GB`, `de-DE`, etc. Locale will affect navigator.language value, Accept-Language request header value as well as number and date formatting
+            rules. Defaults to the system default locale.
+        :param wait_selector_state: The state to wait for the selector given with `wait_selector`. The default state is `attached`.
+        :param real_chrome: If you have a Chrome browser installed on your device, enable this, and the Fetcher will launch an instance of your browser and use it.
+        :param cdp_url: Instead of launching a new browser instance, connect to this CDP URL to control real browsers through CDP.
+        :param google_search: Enabled by default, Scrapling will set the referer header to be as if this request came from a Google search of this website's domain name.
+        :param extra_headers: A dictionary of extra headers to add to the request. _The referer set by the `google_search` argument takes priority over the referer set here if used together._
+        :param proxy: The proxy to be used with requests, it can be a string or a dictionary with the keys 'server', 'username', and 'password' only.
+        """
+        page = await DynamicFetcher.async_fetch(
+            url,
+            wait=wait,
+            proxy=proxy,
+            locale=locale,
+            timeout=timeout,
+            cookies=cookies,
+            cdp_url=cdp_url,
+            headless=headless,
+            useragent=useragent,
+            timezone_id=timezone_id,
+            real_chrome=real_chrome,
+            network_idle=network_idle,
+            wait_selector=wait_selector,
+            extra_headers=extra_headers,
+            google_search=google_search,
+            disable_resources=disable_resources,
+            wait_selector_state=wait_selector_state,
+        )
+        return _content_translator(
+            Convertor._extract_content(
+                page,
+                css_selector=css_selector,
+                extraction_type=extraction_type,
+                main_content_only=main_content_only,
+            ),
+            page,
+        )
+    @staticmethod
+    async def bulk_fetch(
+        urls: List[str],
+        extraction_type: extraction_types = "markdown",
+        css_selector: Optional[str] = None,
+        main_content_only: bool = True,
+        headless: bool = True,  # noqa: F821
+        google_search: bool = True,
+        real_chrome: bool = False,
+        wait: int | float = 0,
+        proxy: Optional[str | Dict[str, str]] = None,
+        timezone_id: str | None = None,
+        locale: str | None = None,
+        extra_headers: Optional[Dict[str, str]] = None,
+        useragent: Optional[str] = None,
+        cdp_url: Optional[str] = None,
+        timeout: int | float = 30000,
+        disable_resources: bool = False,
+        wait_selector: Optional[str] = None,
+        cookies: Sequence[SetCookieParam] | None = None,
+        network_idle: bool = False,
+        wait_selector_state: SelectorWaitStates = "attached",
+    ) -> List[ResponseModel]:
+        """Use playwright to open a browser, then fetch a group of URLs at the same time, and for each page return a structured output of the result.
+        Note: This is only suitable for low-mid protection levels.
+        Note: If the `css_selector` resolves to more than one element, all the elements will be returned.
+        :param urls: A list of the URLs to request.
+        :param extraction_type: The type of content to extract from the page. Defaults to "markdown". Options are:
+            - Markdown will convert the page content to Markdown format.
+            - HTML will return the raw HTML content of the page.
+            - Text will return the text content of the page.
+        :param css_selector: CSS selector to extract the content from the page. If main_content_only is True, then it will be executed on the main content of the page. Defaults to None.
+        :param main_content_only: Whether to extract only the main content of the page. Defaults to True. The main content here is the data inside the `<body>` tag.
+        :param headless: Run the browser in headless/hidden (default), or headful/visible mode.
+        :param disable_resources: Drop requests for unnecessary resources for a speed boost.
+            Requests dropped are of type `font`, `image`, `media`, `beacon`, `object`, `imageset`, `texttrack`, `websocket`, `csp_report`, and `stylesheet`.
+        :param useragent: Pass a useragent string to be used. Otherwise the fetcher will generate a real Useragent of the same browser and use it.
+        :param cookies: Set cookies for the next request. It should be in a dictionary format that Playwright accepts.
+        :param network_idle: Wait for the page until there are no network connections for at least 500 ms.
+        :param timeout: The timeout in milliseconds that is used in all operations and waits through the page. The default is 30,000
+        :param wait: The time (milliseconds) the fetcher will wait after everything finishes before closing the page and returning the ` Response ` object.
+        :param wait_selector: Wait for a specific CSS selector to be in a specific state.
+        :param timezone_id: Changes the timezone of the browser. Defaults to the system timezone.
+        :param locale: Specify user locale, for example, `en-GB`, `de-DE`, etc. Locale will affect navigator.language value, Accept-Language request header value as well as number and date formatting
+            rules. Defaults to the system default locale.
+        :param wait_selector_state: The state to wait for the selector given with `wait_selector`. The default state is `attached`.
+        :param real_chrome: If you have a Chrome browser installed on your device, enable this, and the Fetcher will launch an instance of your browser and use it.
+        :param cdp_url: Instead of launching a new browser instance, connect to this CDP URL to control real browsers through CDP.
+        :param google_search: Enabled by default, Scrapling will set the referer header to be as if this request came from a Google search of this website's domain name.
+        :param extra_headers: A dictionary of extra headers to add to the request. _The referer set by the `google_search` argument takes priority over the referer set here if used together._
+        :param proxy: The proxy to be used with requests, it can be a string or a dictionary with the keys 'server', 'username', and 'password' only.
+        """
+        async with AsyncDynamicSession(
+            wait=wait,
+            proxy=proxy,
+            locale=locale,
+            timeout=timeout,
+            cookies=cookies,
+            cdp_url=cdp_url,
+            headless=headless,
+            max_pages=len(urls),
+            useragent=useragent,
+            timezone_id=timezone_id,
+            real_chrome=real_chrome,
+            network_idle=network_idle,
+            wait_selector=wait_selector,
+            google_search=google_search,
+            extra_headers=extra_headers,
+            disable_resources=disable_resources,
+            wait_selector_state=wait_selector_state,
+        ) as session:
+            tasks = [session.fetch(url) for url in urls]
+            responses = await gather(*tasks)
+            return [
+                _content_translator(
+                    Convertor._extract_content(
+                        page,
+                        css_selector=css_selector,
+                        extraction_type=extraction_type,
+                        main_content_only=main_content_only,
+                    ),
+                    page,
+                )
+                for page in responses
+            ]
+    @staticmethod
+    async def stealthy_fetch(
+        url: str,
+        extraction_type: extraction_types = "markdown",
+        css_selector: Optional[str] = None,
+        main_content_only: bool = True,
+        headless: bool = True,  # noqa: F821
+        google_search: bool = True,
+        real_chrome: bool = False,
+        wait: int | float = 0,
+        proxy: Optional[str | Dict[str, str]] = None,
+        timezone_id: str | None = None,
+        locale: str | None = None,
+        extra_headers: Optional[Dict[str, str]] = None,
+        useragent: Optional[str] = None,
+        hide_canvas: bool = False,
+        cdp_url: Optional[str] = None,
+        timeout: int | float = 30000,
+        disable_resources: bool = False,
+        wait_selector: Optional[str] = None,
+        cookies: Sequence[SetCookieParam] | None = None,
+        network_idle: bool = False,
+        wait_selector_state: SelectorWaitStates = "attached",
+        block_webrtc: bool = False,
+        allow_webgl: bool = True,
+        solve_cloudflare: bool = False,
+        additional_args: Optional[Dict] = None,
+    ) -> ResponseModel:
+        """Use the stealthy fetcher to fetch a URL and return a structured output of the result.
+        Note: This is the only suitable fetcher for high protection levels.
+        Note: If the `css_selector` resolves to more than one element, all the elements will be returned.
+        :param url: The URL to request.
+        :param extraction_type: The type of content to extract from the page. Defaults to "markdown". Options are:
+            - Markdown will convert the page content to Markdown format.
+            - HTML will return the raw HTML content of the page.
+            - Text will return the text content of the page.
+        :param css_selector: CSS selector to extract the content from the page. If main_content_only is True, then it will be executed on the main content of the page. Defaults to None.
+        :param main_content_only: Whether to extract only the main content of the page. Defaults to True. The main content here is the data inside the `<body>` tag.
+        :param headless: Run the browser in headless/hidden (default), or headful/visible mode.
+        :param disable_resources: Drop requests for unnecessary resources for a speed boost.
+            Requests dropped are of type `font`, `image`, `media`, `beacon`, `object`, `imageset`, `texttrack`, `websocket`, `csp_report`, and `stylesheet`.
+        :param useragent: Pass a useragent string to be used. Otherwise the fetcher will generate a real Useragent of the same browser and use it.
+        :param cookies: Set cookies for the next request.
+        :param solve_cloudflare: Solves all types of the Cloudflare's Turnstile/Interstitial challenges before returning the response to you.
+        :param allow_webgl: Enabled by default. Disabling WebGL is not recommended as many WAFs now check if WebGL is enabled.
+        :param network_idle: Wait for the page until there are no network connections for at least 500 ms.
+        :param wait: The time (milliseconds) the fetcher will wait after everything finishes before closing the page and returning the ` Response ` object.
+        :param timeout: The timeout in milliseconds that is used in all operations and waits through the page. The default is 30,000
+        :param wait_selector: Wait for a specific CSS selector to be in a specific state.
+        :param timezone_id: Changes the timezone of the browser. Defaults to the system timezone.
+        :param locale: Specify user locale, for example, `en-GB`, `de-DE`, etc. Locale will affect navigator.language value, Accept-Language request header value as well as number and date formatting
+            rules. Defaults to the system default locale.
+        :param wait_selector_state: The state to wait for the selector given with `wait_selector`. The default state is `attached`.
+        :param real_chrome: If you have a Chrome browser installed on your device, enable this, and the Fetcher will launch an instance of your browser and use it.
+        :param hide_canvas: Add random noise to canvas operations to prevent fingerprinting.
+        :param block_webrtc: Forces WebRTC to respect proxy settings to prevent local IP address leak.
+        :param cdp_url: Instead of launching a new browser instance, connect to this CDP URL to control real browsers through CDP.
+        :param google_search: Enabled by default, Scrapling will set the referer header to be as if this request came from a Google search of this website's domain name.
+        :param extra_headers: A dictionary of extra headers to add to the request. _The referer set by the `google_search` argument takes priority over the referer set here if used together._
+        :param proxy: The proxy to be used with requests, it can be a string or a dictionary with the keys 'server', 'username', and 'password' only.
+        :param additional_args: Additional arguments to be passed to Playwright's context as additional settings, and it takes higher priority than Scrapling's settings.
+        """
+        page = await StealthyFetcher.async_fetch(
+            url,
+            wait=wait,
+            proxy=proxy,
+            locale=locale,
+            cdp_url=cdp_url,
+            timeout=timeout,
+            cookies=cookies,
+            headless=headless,
+            useragent=useragent,
+            timezone_id=timezone_id,
+            real_chrome=real_chrome,
+            hide_canvas=hide_canvas,
+            allow_webgl=allow_webgl,
+            network_idle=network_idle,
+            block_webrtc=block_webrtc,
+            wait_selector=wait_selector,
+            google_search=google_search,
+            extra_headers=extra_headers,
+            additional_args=additional_args,
+            solve_cloudflare=solve_cloudflare,
+            disable_resources=disable_resources,
+            wait_selector_state=wait_selector_state,
+        )
+        return _content_translator(
+            Convertor._extract_content(
+                page,
+                css_selector=css_selector,
+                extraction_type=extraction_type,
+                main_content_only=main_content_only,
+            ),
+            page,
+        )
+    @staticmethod
+    async def bulk_stealthy_fetch(
+        urls: List[str],
+        extraction_type: extraction_types = "markdown",
+        css_selector: Optional[str] = None,
+        main_content_only: bool = True,
+        headless: bool = True,  # noqa: F821
+        google_search: bool = True,
+        real_chrome: bool = False,
+        wait: int | float = 0,
+        proxy: Optional[str | Dict[str, str]] = None,
+        timezone_id: str | None = None,
+        locale: str | None = None,
+        extra_headers: Optional[Dict[str, str]] = None,
+        useragent: Optional[str] = None,
+        hide_canvas: bool = False,
+        cdp_url: Optional[str] = None,
+        timeout: int | float = 30000,
+        disable_resources: bool = False,
+        wait_selector: Optional[str] = None,
+        cookies: Sequence[SetCookieParam] | None = None,
+        network_idle: bool = False,
+        wait_selector_state: SelectorWaitStates = "attached",
+        block_webrtc: bool = False,
+        allow_webgl: bool = True,
+        solve_cloudflare: bool = False,
+        additional_args: Optional[Dict] = None,
+    ) -> List[ResponseModel]:
+        """Use the stealthy fetcher to fetch a group of URLs at the same time, and for each page return a structured output of the result.
+        Note: This is the only suitable fetcher for high protection levels.
+        Note: If the `css_selector` resolves to more than one element, all the elements will be returned.
+        :param urls: A list of the URLs to request.
+        :param extraction_type: The type of content to extract from the page. Defaults to "markdown". Options are:
+            - Markdown will convert the page content to Markdown format.
+            - HTML will return the raw HTML content of the page.
+            - Text will return the text content of the page.
+        :param css_selector: CSS selector to extract the content from the page. If main_content_only is True, then it will be executed on the main content of the page. Defaults to None.
+        :param main_content_only: Whether to extract only the main content of the page. Defaults to True. The main content here is the data inside the `<body>` tag.
+        :param headless: Run the browser in headless/hidden (default), or headful/visible mode.
+        :param disable_resources: Drop requests for unnecessary resources for a speed boost.
+            Requests dropped are of type `font`, `image`, `media`, `beacon`, `object`, `imageset`, `texttrack`, `websocket`, `csp_report`, and `stylesheet`.
+        :param useragent: Pass a useragent string to be used. Otherwise the fetcher will generate a real Useragent of the same browser and use it.
+        :param cookies: Set cookies for the next request.
+        :param solve_cloudflare: Solves all types of the Cloudflare's Turnstile/Interstitial challenges before returning the response to you.
+        :param allow_webgl: Enabled by default. Disabling WebGL is not recommended as many WAFs now check if WebGL is enabled.
+        :param network_idle: Wait for the page until there are no network connections for at least 500 ms.
+        :param wait: The time (milliseconds) the fetcher will wait after everything finishes before closing the page and returning the ` Response ` object.
+        :param timeout: The timeout in milliseconds that is used in all operations and waits through the page. The default is 30,000
+        :param wait_selector: Wait for a specific CSS selector to be in a specific state.
+        :param timezone_id: Changes the timezone of the browser. Defaults to the system timezone.
+        :param locale: Specify user locale, for example, `en-GB`, `de-DE`, etc. Locale will affect navigator.language value, Accept-Language request header value as well as number and date formatting
+            rules. Defaults to the system default locale.
+        :param wait_selector_state: The state to wait for the selector given with `wait_selector`. The default state is `attached`.
+        :param real_chrome: If you have a Chrome browser installed on your device, enable this, and the Fetcher will launch an instance of your browser and use it.
+        :param hide_canvas: Add random noise to canvas operations to prevent fingerprinting.
+        :param block_webrtc: Forces WebRTC to respect proxy settings to prevent local IP address leak.
+        :param cdp_url: Instead of launching a new browser instance, connect to this CDP URL to control real browsers through CDP.
+        :param google_search: Enabled by default, Scrapling will set the referer header to be as if this request came from a Google search of this website's domain name.
+        :param extra_headers: A dictionary of extra headers to add to the request. _The referer set by the `google_search` argument takes priority over the referer set here if used together._
+        :param proxy: The proxy to be used with requests, it can be a string or a dictionary with the keys 'server', 'username', and 'password' only.
+        :param additional_args: Additional arguments to be passed to Playwright's context as additional settings, and it takes higher priority than Scrapling's settings.
+        """
+        async with AsyncStealthySession(
+            wait=wait,
+            proxy=proxy,
+            locale=locale,
+            cdp_url=cdp_url,
+            timeout=timeout,
+            cookies=cookies,
+            headless=headless,
+            useragent=useragent,
+            timezone_id=timezone_id,
+            real_chrome=real_chrome,
+            hide_canvas=hide_canvas,
+            allow_webgl=allow_webgl,
+            network_idle=network_idle,
+            block_webrtc=block_webrtc,
+            wait_selector=wait_selector,
+            google_search=google_search,
+            extra_headers=extra_headers,
+            additional_args=additional_args,
+            solve_cloudflare=solve_cloudflare,
+            disable_resources=disable_resources,
+            wait_selector_state=wait_selector_state,
+        ) as session:
+            tasks = [session.fetch(url) for url in urls]
+            responses = await gather(*tasks)
+            return [
+                _content_translator(
+                    Convertor._extract_content(
+                        page,
+                        css_selector=css_selector,
+                        extraction_type=extraction_type,
+                        main_content_only=main_content_only,
+                    ),
+                    page,
+                )
+                for page in responses
+            ]
+    def serve(self, http: bool, host: str, port: int):
+        """Serve the MCP server."""
+        server = FastMCP(name="Scrapling", host=host, port=port)
+        server.add_tool(self.get, title="get", description=self.get.__doc__, structured_output=True)
+        server.add_tool(self.bulk_get, title="bulk_get", description=self.bulk_get.__doc__, structured_output=True)
+        server.add_tool(self.fetch, title="fetch", description=self.fetch.__doc__, structured_output=True)
+        server.add_tool(
+            self.bulk_fetch, title="bulk_fetch", description=self.bulk_fetch.__doc__, structured_output=True
+        )
+        server.add_tool(
+            self.stealthy_fetch, title="stealthy_fetch", description=self.stealthy_fetch.__doc__, structured_output=True
+        )
+        server.add_tool(
+            self.bulk_stealthy_fetch,
+            title="bulk_stealthy_fetch",
+            description=self.bulk_stealthy_fetch.__doc__,
+            structured_output=True,
+        )
+        @server.custom_route("/health", methods=["GET"])
+        async def health_check(request: Request) -> Response:
+            return JSONResponse({"status": "healthy"})
+        @server.custom_route("/api-docs", methods=["GET"])
+        async def api_docs(request: Request) -> Response:
+            tools = await server.list_tools()
+            return JSONResponse([tool.model_dump() for tool in tools])
+        if http:
+            import uvicorn
+            # Get the Starlette app from FastMCP
+            mcp_app = server.streamable_http_app()
+            try:
+                import gradio as gr
+                from scrapling.ui import create_ui
+                demo = create_ui()
+                # Mount Gradio app onto the MCP app
+                # When path="/", Gradio handles requests not handled by the underlying app (or vice versa depending on implementation)
+                # Actually gr.mount_gradio_app returns a NEW FastAPI app that mounts the input app.
+                # But here we want to mount Gradio ON TOP of MCP app or ALONGSIDE.
+                # mount_gradio_app(app, blocks, path) -> app
+                # It adds routes to `app`.
+                # Since mcp_app is Starlette, we might need to wrap it or cast it.
+                # Gradio supports Starlette.
+                app = gr.mount_gradio_app(mcp_app, demo, path="/")
+            except (ImportError, ModuleNotFoundError):
+                app = mcp_app
+                print("Gradio not installed or failed to load, running MCP server only.")
+            uvicorn.run(app, host=host, port=port)
+        else:
+            server.run(transport="stdio")

core/custom_types.py ADDED Viewed

	@@ -0,0 +1,345 @@

+from collections.abc import Mapping
+from types import MappingProxyType
+from re import compile as re_compile, UNICODE, IGNORECASE
+from orjson import dumps, loads
+from w3lib.html import replace_entities as _replace_entities
+from scrapling.core._types import (
+    Any,
+    cast,
+    Dict,
+    List,
+    Union,
+    overload,
+    TypeVar,
+    Literal,
+    Pattern,
+    Iterable,
+    Generator,
+    SupportsIndex,
+)
+from scrapling.core.utils import _is_iterable, flatten, __CONSECUTIVE_SPACES_REGEX__
+# Type variable for the value type of `AttributesHandler`; bound to `TextHandler`
+_TextHandlerType = TypeVar("_TextHandlerType", bound="TextHandler")
+# Translation table that maps tab, carriage-return and newline characters to a plain space (used by `TextHandler.clean`)
+__CLEANING_TABLE__ = str.maketrans("\t\r\n", "   ")
+class TextHandler(str):
+    """Extends standard Python string by adding more functionality"""
+    __slots__ = ()
+    def __getitem__(self, key: SupportsIndex | slice) -> "TextHandler":  # pragma: no cover
+        lst = super().__getitem__(key)
+        return TextHandler(lst)
+    def split(self, sep: str | None = None, maxsplit: SupportsIndex = -1) -> list[Any]:  # pragma: no cover
+        return TextHandlers([TextHandler(s) for s in super().split(sep, maxsplit)])
+    def strip(self, chars: str | None = None) -> Union[str, "TextHandler"]:  # pragma: no cover
+        return TextHandler(super().strip(chars))
+    def lstrip(self, chars: str | None = None) -> Union[str, "TextHandler"]:  # pragma: no cover
+        return TextHandler(super().lstrip(chars))
+    def rstrip(self, chars: str | None = None) -> Union[str, "TextHandler"]:  # pragma: no cover
+        return TextHandler(super().rstrip(chars))
+    def capitalize(self) -> Union[str, "TextHandler"]:  # pragma: no cover
+        return TextHandler(super().capitalize())
+    def casefold(self) -> Union[str, "TextHandler"]:  # pragma: no cover
+        return TextHandler(super().casefold())
+    def center(self, width: SupportsIndex, fillchar: str = " ") -> Union[str, "TextHandler"]:  # pragma: no cover
+        return TextHandler(super().center(width, fillchar))
+    def expandtabs(self, tabsize: SupportsIndex = 8) -> Union[str, "TextHandler"]:  # pragma: no cover
+        return TextHandler(super().expandtabs(tabsize))
+    def format(self, *args: object, **kwargs: object) -> Union[str, "TextHandler"]:  # pragma: no cover
+        return TextHandler(super().format(*args, **kwargs))
+    def format_map(self, mapping) -> Union[str, "TextHandler"]:  # pragma: no cover
+        return TextHandler(super().format_map(mapping))
+    def join(self, iterable: Iterable[str]) -> Union[str, "TextHandler"]:  # pragma: no cover
+        return TextHandler(super().join(iterable))
+    def ljust(self, width: SupportsIndex, fillchar: str = " ") -> Union[str, "TextHandler"]:  # pragma: no cover
+        return TextHandler(super().ljust(width, fillchar))
+    def rjust(self, width: SupportsIndex, fillchar: str = " ") -> Union[str, "TextHandler"]:  # pragma: no cover
+        return TextHandler(super().rjust(width, fillchar))
+    def swapcase(self) -> Union[str, "TextHandler"]:  # pragma: no cover
+        return TextHandler(super().swapcase())
+    def title(self) -> Union[str, "TextHandler"]:  # pragma: no cover
+        return TextHandler(super().title())
+    def translate(self, table) -> Union[str, "TextHandler"]:  # pragma: no cover
+        return TextHandler(super().translate(table))
+    def zfill(self, width: SupportsIndex) -> Union[str, "TextHandler"]:  # pragma: no cover
+        return TextHandler(super().zfill(width))
+    def replace(self, old: str, new: str, count: SupportsIndex = -1) -> Union[str, "TextHandler"]:
+        return TextHandler(super().replace(old, new, count))
+    def upper(self) -> Union[str, "TextHandler"]:
+        return TextHandler(super().upper())
+    def lower(self) -> Union[str, "TextHandler"]:
+        return TextHandler(super().lower())
+    ##############
+    def sort(self, reverse: bool = False) -> Union[str, "TextHandler"]:
+        """Return a sorted version of the string"""
+        return self.__class__("".join(sorted(self, reverse=reverse)))
+    def clean(self, remove_entities=False) -> Union[str, "TextHandler"]:
+        """Return a new version of the string after removing all white spaces and consecutive spaces"""
+        data = self.translate(__CLEANING_TABLE__)
+        if remove_entities:
+            data = _replace_entities(data)
+        return self.__class__(__CONSECUTIVE_SPACES_REGEX__.sub(" ", data).strip())
+    # For easy copy-paste from Scrapy/parsel code when needed :)
+    def get(self, default=None):  # pragma: no cover
+        return self
+    def get_all(self):  # pragma: no cover
+        return self
+    extract = get_all
+    extract_first = get
+    def json(self) -> Dict:
+        """Return JSON response if the response is jsonable otherwise throw error"""
+        # Using str function as a workaround for orjson issue with subclasses of str.
+        # Check this out: https://github.com/ijl/orjson/issues/445
+        return loads(str(self))
+    @overload
+    def re(
+        self,
+        regex: str | Pattern,
+        replace_entities: bool = True,
+        clean_match: bool = False,
+        case_sensitive: bool = True,
+        *,
+        check_match: Literal[True],
+    ) -> bool: ...
+    @overload
+    def re(
+        self,
+        regex: str | Pattern,
+        replace_entities: bool = True,
+        clean_match: bool = False,
+        case_sensitive: bool = True,
+        check_match: Literal[False] = False,
+    ) -> "TextHandlers": ...
+    def re(
+        self,
+        regex: str | Pattern,
+        replace_entities: bool = True,
+        clean_match: bool = False,
+        case_sensitive: bool = True,
+        check_match: bool = False,
+    ) -> Union["TextHandlers", bool]:
+        """Apply the given regex to the current text and return a list of strings with the matches.
+        :param regex: Can be either a compiled regular expression or a string.
+        :param replace_entities: If enabled character entity references are replaced by their corresponding character
+        :param clean_match: If enabled, this will ignore all whitespaces and consecutive spaces while matching
+        :param case_sensitive: If disabled, function will set the regex to ignore the letters-case while compiling it
+        :param check_match: Used to quickly check if this regex matches or not without any operations on the results
+        """
+        if isinstance(regex, str):
+            if case_sensitive:
+                regex = re_compile(regex, UNICODE)
+            else:
+                regex = re_compile(regex, flags=UNICODE | IGNORECASE)
+        input_text = self.clean() if clean_match else self
+        results = regex.findall(input_text)
+        if check_match:
+            return bool(results)
+        if all(_is_iterable(res) for res in results):
+            results = flatten(results)
+        if not replace_entities:
+            return TextHandlers([TextHandler(string) for string in results])
+        return TextHandlers([TextHandler(_replace_entities(s)) for s in results])
+    def re_first(
+        self,
+        regex: str | Pattern,
+        default: Any = None,
+        replace_entities: bool = True,
+        clean_match: bool = False,
+        case_sensitive: bool = True,
+    ) -> "TextHandler":
+        """Apply the given regex to text and return the first match if found, otherwise return the default value.
+        :param regex: Can be either a compiled regular expression or a string.
+        :param default: The default value to be returned if there is no match
+        :param replace_entities: If enabled character entity references are replaced by their corresponding character
+        :param clean_match: If enabled, this will ignore all whitespaces and consecutive spaces while matching
+        :param case_sensitive: If disabled, function will set the regex to ignore the letters-case while compiling it
+        """
+        result = self.re(
+            regex,
+            replace_entities,
+            clean_match=clean_match,
+            case_sensitive=case_sensitive,
+        )
+        return result[0] if result else default
+class TextHandlers(List[TextHandler]):
+    """
+    The :class:`TextHandlers` class is a subclass of the builtin ``List`` class, which provides a few additional methods.
+    """
+    __slots__ = ()
+    @overload
+    def __getitem__(self, pos: SupportsIndex) -> TextHandler:  # pragma: no cover
+        pass
+    @overload
+    def __getitem__(self, pos: slice) -> "TextHandlers":  # pragma: no cover
+        pass
+    def __getitem__(self, pos: SupportsIndex | slice) -> Union[TextHandler, "TextHandlers"]:
+        lst = super().__getitem__(pos)
+        if isinstance(pos, slice):
+            return TextHandlers(cast(List[TextHandler], lst))
+        return TextHandler(cast(TextHandler, lst))
+    def re(
+        self,
+        regex: str | Pattern,
+        replace_entities: bool = True,
+        clean_match: bool = False,
+        case_sensitive: bool = True,
+    ) -> "TextHandlers":
+        """Call the ``.re()`` method for each element in this list and return
+        their results flattened as TextHandlers.
+        :param regex: Can be either a compiled regular expression or a string.
+        :param replace_entities: If enabled character entity references are replaced by their corresponding character
+        :param clean_match: if enabled, this will ignore all whitespaces and consecutive spaces while matching
+        :param case_sensitive: if disabled, the function will set the regex to ignore the letters-case while compiling it
+        """
+        results = [n.re(regex, replace_entities, clean_match, case_sensitive) for n in self]
+        return TextHandlers(flatten(results))
+    def re_first(
+        self,
+        regex: str | Pattern,
+        default: Any = None,
+        replace_entities: bool = True,
+        clean_match: bool = False,
+        case_sensitive: bool = True,
+    ) -> TextHandler:  # pragma: no cover
+        """Call the ``.re_first()`` method for each element in this list and return
+        the first result or the default value otherwise.
+        :param regex: Can be either a compiled regular expression or a string.
+        :param default: The default value to be returned if there is no match
+        :param replace_entities: If enabled character entity references are replaced by their corresponding character
+        :param clean_match: If enabled, this will ignore all whitespaces and consecutive spaces while matching
+        :param case_sensitive: If disabled, function will set the regex to ignore the letters-case while compiling it
+        """
+        for n in self:
+            for result in n.re(regex, replace_entities, clean_match, case_sensitive):
+                return result
+        return default
+    # For easy copy-paste from Scrapy/parsel code when needed :)
+    def get(self, default=None):
+        """Returns the first item of the current list
+        :param default: the default value to return if the current list is empty
+        """
+        return self[0] if len(self) > 0 else default
+    def extract(self):
+        return self
+    extract_first = get
+    get_all = extract
+class AttributesHandler(Mapping[str, _TextHandlerType]):
+    """A read-only mapping to use instead of the standard dictionary for the speed boost, but at the same time I use it to add more functionalities.
+    If the standard dictionary is needed, convert this class to a dictionary with the `dict` function
+    """
+    __slots__ = ("_data",)
+    def __init__(self, mapping: Any = None, **kwargs: Any) -> None:
+        mapping = (
+            {key: TextHandler(value) if isinstance(value, str) else value for key, value in mapping.items()}
+            if mapping is not None
+            else {}
+        )
+        if kwargs:
+            mapping.update(
+                {key: TextHandler(value) if isinstance(value, str) else value for key, value in kwargs.items()}
+            )
+        # Fastest read-only mapping type
+        self._data: Mapping[str, Any] = MappingProxyType(mapping)
+    def get(self, key: str, default: Any = None) -> _TextHandlerType:
+        """Acts like the standard dictionary `.get()` method"""
+        return self._data.get(key, default)
+    def search_values(self, keyword: str, partial: bool = False) -> Generator["AttributesHandler", None, None]:
+        """Search current attributes by values and return a dictionary of each matching item
+        :param keyword: The keyword to search for in the attribute values
+        :param partial: If True, the function will search if keyword in each value instead of perfect match
+        """
+        for key, value in self._data.items():
+            if partial:
+                if keyword in value:
+                    yield AttributesHandler({key: value})
+            else:
+                if keyword == value:
+                    yield AttributesHandler({key: value})
+    @property
+    def json_string(self) -> bytes:
+        """Convert current attributes to JSON bytes if the attributes are JSON serializable otherwise throws error"""
+        return dumps(dict(self._data))
+    def __getitem__(self, key: str) -> _TextHandlerType:
+        return self._data[key]
+    def __iter__(self):
+        return iter(self._data)
+    def __len__(self):
+        return len(self._data)
+    def __repr__(self):
+        return f"{self.__class__.__name__}({self._data})"
+    def __str__(self):
+        return str(self._data)
+    def __contains__(self, key):
+        return key in self._data

core/mixins.py ADDED Viewed

	@@ -0,0 +1,85 @@

+from scrapling.core._types import Any, Dict
+class SelectorsGeneration:
+    """
+    Functions for generating selectors
+    Trying to generate selectors like Firefox or maybe cleaner ones!? Ehm
+    Inspiration: https://searchfox.org/mozilla-central/source/devtools/shared/inspector/css-logic.js#591
+    """
+    # Note: This is a mixin class meant to be used with Selector.
+    # The methods access Selector attributes (._root, .parent, .attrib, .tag, etc.)
+    # through self, which will be a Selector instance at runtime.
+    def _general_selection(self: Any, selection: str = "css", full_path: bool = False) -> str:
+        """Generate a selector for the current element.
+        :return: A string of the generated selector.
+        """
+        if self._is_text_node(self._root):
+            return ""
+        selectorPath = []
+        target = self
+        css = selection.lower() == "css"
+        while target is not None:
+            if target.parent:
+                if target.attrib.get("id"):
+                    # id is enough
+                    part = f"#{target.attrib['id']}" if css else f"[@id='{target.attrib['id']}']"
+                    selectorPath.append(part)
+                    if not full_path:
+                        return " > ".join(reversed(selectorPath)) if css else "//*" + "/".join(reversed(selectorPath))
+                else:
+                    part = f"{target.tag}"
+                    # We won't use classes anymore because I some websites share exact classes between elements
+                    # classes = target.attrib.get('class', '').split()
+                    # if classes and css:
+                    #     part += f".{'.'.join(classes)}"
+                    # else:
+                    counter: Dict[str, int] = {}
+                    for child in target.parent.children:
+                        counter.setdefault(child.tag, 0)
+                        counter[child.tag] += 1
+                        if child._root == target._root:
+                            break
+                    if counter[target.tag] > 1:
+                        part += f":nth-of-type({counter[target.tag]})" if css else f"[{counter[target.tag]}]"
+                selectorPath.append(part)
+                target = target.parent
+                if target is None or target.tag == "html":
+                    return " > ".join(reversed(selectorPath)) if css else "//" + "/".join(reversed(selectorPath))
+            else:
+                break
+        return " > ".join(reversed(selectorPath)) if css else "//" + "/".join(reversed(selectorPath))
+    @property
+    def generate_css_selector(self: Any) -> str:
+        """Generate a CSS selector for the current element
+        :return: A string of the generated selector.
+        """
+        return self._general_selection()
+    @property
+    def generate_full_css_selector(self: Any) -> str:
+        """Generate a complete CSS selector for the current element
+        :return: A string of the generated selector.
+        """
+        return self._general_selection(full_path=True)
+    @property
+    def generate_xpath_selector(self: Any) -> str:
+        """Generate an XPath selector for the current element
+        :return: A string of the generated selector.
+        """
+        return self._general_selection("xpath")
+    @property
+    def generate_full_xpath_selector(self: Any) -> str:
+        """Generate a complete XPath selector for the current element
+        :return: A string of the generated selector.
+        """
+        return self._general_selection("xpath", full_path=True)

core/shell.py ADDED Viewed

	@@ -0,0 +1,643 @@

+# -*- coding: utf-8 -*-
+from sys import stderr
+from copy import deepcopy
+from functools import wraps
+from re import sub as re_sub
+from collections import namedtuple
+from shlex import split as shlex_split
+from inspect import signature, Parameter
+from tempfile import mkstemp as make_temp_file
+from argparse import ArgumentParser, SUPPRESS
+from webbrowser import open as open_in_browser
+from urllib.parse import urlparse, urlunparse, parse_qsl
+from logging import (
+    DEBUG,
+    INFO,
+    WARNING,
+    ERROR,
+    CRITICAL,
+    FATAL,
+    getLogger,
+    getLevelName,
+)
+from orjson import loads as json_loads, JSONDecodeError
+from ._shell_signatures import Signatures_map
+from scrapling import __version__
+from scrapling.core.utils import log
+from scrapling.parser import Selector, Selectors
+from scrapling.core.custom_types import TextHandler
+from scrapling.engines.toolbelt.custom import Response
+from scrapling.core.utils._shell import _ParseHeaders, _CookieParser
+from scrapling.core._types import (
+    Callable,
+    Dict,
+    Any,
+    cast,
+    Optional,
+    Generator,
+    extraction_types,
+)
+# Lower-case level names mapped to the numeric level constants imported from `logging`
+_known_logging_levels = {
+    "debug": DEBUG,
+    "info": INFO,
+    "warning": WARNING,
+    "error": ERROR,
+    "critical": CRITICAL,
+    "fatal": FATAL,
+}
+# Structure holding the result of parsing a curl command - simplified to the arguments a Fetcher call needs
+Request = namedtuple(
+    "Request",
+    [
+        "method",
+        "url",
+        "params",
+        "data",  # Can be str, bytes, or dict (for urlencoded)
+        "json_data",  # Python object (dict/list) for JSON payload
+        "headers",
+        "cookies",
+        "proxy",
+        "follow_redirects",  # Added for -L flag
+    ],
+)
+# Suppress exit on error to handle parsing errors gracefully
+class NoExitArgumentParser(ArgumentParser):  # pragma: no cover
+    def error(self, message):
+        log.error(f"Curl arguments parsing error: {message}")
+        raise ValueError(f"Curl arguments parsing error: {message}")
+    def exit(self, status=0, message=None):
+        if message:
+            log.error(f"Scrapling shell exited with status {status}: {message}")
+            self._print_message(message, stderr)
+        raise ValueError(f"Scrapling shell exited with status {status}: {message or 'Unknown reason'}")
+class CurlParser:
+    """Builds the argument parser for relevant curl flags from DevTools."""
+    def __init__(self) -> None:
+        from scrapling.fetchers import Fetcher as __Fetcher
+        self.__fetcher = __Fetcher
+        # We will use argparse parser to parse the curl command directly instead of regex
+        # We will focus more on flags that will show up on curl commands copied from DevTools's network tab
+        _parser = NoExitArgumentParser(add_help=False)  # Disable default help
+        # Basic curl arguments
+        _parser.add_argument("curl_command_placeholder", nargs="?", help=SUPPRESS)
+        _parser.add_argument("url")
+        _parser.add_argument("-X", "--request", dest="method", default=None)
+        _parser.add_argument("-H", "--header", action="append", default=[])
+        _parser.add_argument(
+            "-A", "--user-agent", help="Will be parsed from -H if present"
+        )  # Note: DevTools usually includes this in -H
+        # Data arguments (prioritizing types common from DevTools)
+        _parser.add_argument("-d", "--data", default=None)
+        _parser.add_argument("--data-raw", default=None)  # Often used by browsers for JSON body
+        _parser.add_argument("--data-binary", default=None)
+        # Keep urlencode for completeness, though less common from browser copy/paste
+        _parser.add_argument("--data-urlencode", action="append", default=[])
+        _parser.add_argument("-G", "--get", action="store_true")  # Use GET and put data in URL
+        _parser.add_argument(
+            "-b",
+            "--cookie",
+            default=None,
+            help="Send cookies from string/file (string format used by DevTools)",
+        )
+        # Proxy
+        _parser.add_argument("-x", "--proxy", default=None)
+        _parser.add_argument("-U", "--proxy-user", default=None)  # Basic proxy auth
+        # Connection/Security
+        _parser.add_argument("-k", "--insecure", action="store_true")
+        _parser.add_argument("--compressed", action="store_true")  # Very common from browsers
+        # Other flags often included but may not map directly to request args
+        _parser.add_argument("-i", "--include", action="store_true")
+        _parser.add_argument("-s", "--silent", action="store_true")
+        _parser.add_argument("-v", "--verbose", action="store_true")
+        self.parser: NoExitArgumentParser = _parser
+        self._supported_methods = ("get", "post", "put", "delete")
+    # --- Main Parsing Logic ---
+    def parse(self, curl_command: str) -> Optional[Request]:
+        """Parses the curl command string into a structured context for Fetcher."""
+        clean_command = curl_command.strip().lstrip("curl").strip().replace("\\\n", " ")
+        try:
+            tokens = shlex_split(clean_command)  # Split the string using shell-like syntax
+        except ValueError as e:  # pragma: no cover
+            log.error(f"Could not split command line: {e}")
+            return None
+        try:
+            parsed_args, unknown = self.parser.parse_known_args(tokens)
+            if unknown:
+                raise AttributeError(f"Unknown/Unsupported curl arguments: {unknown}")
+        except ValueError:  # pragma: no cover
+            return None
+        except AttributeError:
+            raise
+        except Exception as e:  # pragma: no cover
+            log.error(f"An unexpected error occurred during curl arguments parsing: {e}")
+            return None
+        # --- Determine Method ---
+        method = "get"  # Default
+        if parsed_args.get:  # `-G` forces GET
+            method = "get"
+        elif parsed_args.method:
+            method = parsed_args.method.strip().lower()
+        # Infer POST if data is present (unless overridden by -X or -G)
+        elif any(
+            [
+                parsed_args.data,
+                parsed_args.data_raw,
+                parsed_args.data_binary,
+                parsed_args.data_urlencode,
+            ]
+        ):
+            method = "post"
+        headers, cookies = _ParseHeaders(parsed_args.header)
+        if parsed_args.cookie:
+            # We are focusing on the string format from DevTools.
+            try:
+                for key, value in _CookieParser(parsed_args.cookie):
+                    # Update the cookie dict, potentially overwriting cookies with the same name from -H 'cookie:'
+                    cookies[key] = value
+                log.debug(f"Parsed cookies from -b argument: {list(cookies.keys())}")
+            except Exception as e:  # pragma: no cover
+                log.error(f"Could not parse cookie string from -b '{parsed_args.cookie}': {e}")
+        # --- Process Data Payload ---
+        params = dict()
+        data_payload: Optional[str | bytes | Dict] = None
+        json_payload: Optional[Any] = None
+        # DevTools often uses --data-raw for JSON bodies
+        # Precedence: --data-binary > --data-raw / -d > --data-urlencode
+        if parsed_args.data_binary is not None:  # pragma: no cover
+            try:
+                data_payload = parsed_args.data_binary.encode("utf-8")
+                log.debug("Using data from --data-binary as bytes.")
+            except Exception as e:
+                log.warning(
+                    f"Could not encode binary data '{parsed_args.data_binary}' as bytes: {e}. Using raw string."
+                )
+                data_payload = parsed_args.data_binary  # Fallback to string
+        elif parsed_args.data_raw is not None:
+            data_payload = parsed_args.data_raw.lstrip("$")
+        elif parsed_args.data is not None:
+            data_payload = parsed_args.data
+        elif parsed_args.data_urlencode:  # pragma: no cover
+            # Combine and parse urlencoded data
+            combined_data = "&".join(parsed_args.data_urlencode)
+            try:
+                data_payload = dict(parse_qsl(combined_data, keep_blank_values=True))
+            except Exception as e:
+                log.warning(f"Could not parse urlencoded data '{combined_data}': {e}. Treating as raw string.")
+                data_payload = combined_data
+        # Check if raw data looks like JSON, prefer 'json' param if so
+        if isinstance(data_payload, str):
+            try:
+                maybe_json = json_loads(data_payload)
+                if isinstance(maybe_json, (dict, list)):
+                    json_payload = maybe_json
+                    data_payload = None
+            except JSONDecodeError:
+                pass  # Not JSON, keep it in data_payload
+        # Handle `-G`: Move data to params if the method is GET
+        if method == "get" and data_payload:  # pragma: no cover
+            if isinstance(data_payload, dict):  # From --data-urlencode likely
+                params.update(data_payload)
+            elif isinstance(data_payload, str):
+                try:
+                    params.update(dict(parse_qsl(data_payload, keep_blank_values=True)))
+                except ValueError:
+                    log.warning(f"Could not parse data '{data_payload}' into GET parameters for -G.")
+            if params:
+                data_payload = None  # Clear data as it's moved to params
+                json_payload = None  # Should not have JSON body with -G
+        # --- Process Proxy ---
+        proxies: Optional[Dict[str, str]] = None
+        if parsed_args.proxy:
+            proxy_url = f"http://{parsed_args.proxy}" if "://" not in parsed_args.proxy else parsed_args.proxy
+            if parsed_args.proxy_user:
+                user_pass = parsed_args.proxy_user
+                parts = urlparse(proxy_url)
+                netloc_parts = parts.netloc.split("@")
+                netloc = f"{user_pass}@{netloc_parts[-1]}" if len(netloc_parts) > 1 else f"{user_pass}@{parts.netloc}"
+                proxy_url = urlunparse(
+                    (
+                        parts.scheme,
+                        netloc,
+                        parts.path,
+                        parts.params,
+                        parts.query,
+                        parts.fragment,
+                    )
+                )
+            # Standard proxy dict format
+            proxies = {"http": proxy_url, "https": proxy_url}
+            log.debug(f"Using proxy configuration: {proxies}")
+        # --- Final Context ---
+        return Request(
+            method=method,
+            url=parsed_args.url,
+            params=params,
+            data=data_payload,
+            json_data=json_payload,
+            headers=headers,
+            cookies=cookies,
+            proxy=proxies,
+            follow_redirects=True,  # Scrapling default is True
+        )
+    def convert2fetcher(self, curl_command: Request | str) -> Optional[Response]:
+        if isinstance(curl_command, (Request, str)):
+            request = self.parse(curl_command) if isinstance(curl_command, str) else curl_command
+            # Ensure request parsing was successful before proceeding
+            if request is None:  # pragma: no cover
+                log.error("Failed to parse curl command, cannot convert to fetcher.")
+                return None
+            request_args = request._asdict()
+            method = request_args.pop("method").strip().lower()
+            if method in self._supported_methods:
+                request_args["json"] = request_args.pop("json_data")
+                # Ensure data/json are removed for non-POST/PUT methods
+                if method not in ("post", "put"):
+                    _ = request_args.pop("data", None)
+                    _ = request_args.pop("json", None)
+                try:
+                    return getattr(self.__fetcher, method)(**request_args)
+                except Exception as e:  # pragma: no cover
+                    log.error(f"Error calling Fetcher.{method}: {e}")
+                    return None
+            else:  # pragma: no cover
+                log.error(f'Request method "{method}" isn\'t supported by Scrapling yet')
+                return None
+        else:  # pragma: no cover
+            log.error("Input must be a valid curl command string or a Request object.")
+            return None
+def _unpack_signature(func, signature_name=None):
+    """
+    Unpack TypedDict from Unpack[TypedDict] annotations in **kwargs and reconstruct the signature.
+    This allows the interactive shell to show individual parameters instead of just **kwargs, similar to how IDEs display them.
+    """
+    try:
+        sig = signature(func)
+        func_name = signature_name or getattr(func, "__name__", None)
+        # Check if this function has known parameters
+        if func_name not in Signatures_map:
+            return sig
+        new_params = []
+        for param in sig.parameters.values():
+            if param.kind == Parameter.VAR_KEYWORD:
+                # Replace **kwargs with individual keyword-only parameters
+                for field_name, field_type in Signatures_map[func_name].items():
+                    new_params.append(
+                        Parameter(field_name, Parameter.KEYWORD_ONLY, default=Parameter.empty, annotation=field_type)
+                    )
+            else:
+                new_params.append(param)
+        # Reconstruct signature with unpacked parameters
+        if len(new_params) != len(sig.parameters):
+            return sig.replace(parameters=new_params)
+        return sig
+    except Exception:  # pragma: no cover
+        return signature(func)
+def show_page_in_browser(page: Selector):  # pragma: no cover
+    if not page or not isinstance(page, Selector):
+        log.error("Input must be of type `Selector`")
+        return
+    try:
+        fd, fname = make_temp_file(prefix="scrapling_view_", suffix=".html")
+        with open(fd, "w", encoding=page.encoding) as f:
+            f.write(page.html_content)
+        open_in_browser(f"file://{fname}")
+    except IOError as e:
+        log.error(f"Failed to write temporary file for viewing: {e}")
+    except Exception as e:
+        log.error(f"An unexpected error occurred while viewing the page: {e}")
+class CustomShell:
+    """A custom IPython shell with minimal dependencies"""
+    def __init__(self, code, log_level="debug"):
+        from IPython.terminal.embed import InteractiveShellEmbed as __InteractiveShellEmbed
+        from scrapling.fetchers import (
+            Fetcher as __Fetcher,
+            AsyncFetcher as __AsyncFetcher,
+            FetcherSession as __FetcherSession,
+            DynamicFetcher as __DynamicFetcher,
+            DynamicSession as __DynamicSession,
+            AsyncDynamicSession as __AsyncDynamicSession,
+            StealthyFetcher as __StealthyFetcher,
+            StealthySession as __StealthySession,
+            AsyncStealthySession as __AsyncStealthySession,
+        )
+        self.__InteractiveShellEmbed = __InteractiveShellEmbed
+        self.__Fetcher = __Fetcher
+        self.__AsyncFetcher = __AsyncFetcher
+        self.__FetcherSession = __FetcherSession
+        self.__DynamicFetcher = __DynamicFetcher
+        self.__DynamicSession = __DynamicSession
+        self.__AsyncDynamicSession = __AsyncDynamicSession
+        self.__StealthyFetcher = __StealthyFetcher
+        self.__StealthySession = __StealthySession
+        self.__AsyncStealthySession = __AsyncStealthySession
+        self.code = code
+        self.page = None
+        self.pages = Selectors([])
+        self._curl_parser = CurlParser()
+        log_level = log_level.strip().lower()
+        if _known_logging_levels.get(log_level):
+            self.log_level = _known_logging_levels[log_level]
+        else:  # pragma: no cover
+            log.warning(f'Unknown log level "{log_level}", defaulting to "DEBUG"')
+            self.log_level = DEBUG
+        self.shell = None
+        # Initialize your application components
+        self.init_components()
+    def init_components(self):
+        """Initialize application components"""
+        # This is where you'd set up your application-specific objects
+        if self.log_level:
+            getLogger("scrapling").setLevel(self.log_level)
+        settings = self.__Fetcher.display_config()
+        settings.pop("storage", None)
+        settings.pop("storage_args", None)
+        log.info(f"Scrapling {__version__} shell started")
+        log.info(f"Logging level is set to '{getLevelName(self.log_level)}'")
+        log.info(f"Fetchers' parsing settings: {settings}")
+    @staticmethod
+    def banner():
+        """Create a custom banner for the shell"""
+        return f"""
+-> Available Scrapling objects:
+   - Fetcher/AsyncFetcher/FetcherSession
+   - DynamicFetcher/DynamicSession/AsyncDynamicSession
+   - StealthyFetcher/StealthySession/AsyncStealthySession
+   - Selector
+-> Useful shortcuts:
+   - {"get":<30} Shortcut for `Fetcher.get`
+   - {"post":<30} Shortcut for `Fetcher.post`
+   - {"put":<30} Shortcut for `Fetcher.put`
+   - {"delete":<30} Shortcut for `Fetcher.delete`
+   - {"fetch":<30} Shortcut for `DynamicFetcher.fetch`
+   - {"stealthy_fetch":<30} Shortcut for `StealthyFetcher.fetch`
+-> Useful commands
+   - {"page / response":<30} The response object of the last page you fetched
+   - {"pages":<30} Selectors object of the last 5 response objects you fetched
+   - {"uncurl('curl_command')":<30} Convert curl command to a Request object. (Optimized to handle curl commands copied from DevTools network tab.)
+   - {"curl2fetcher('curl_command')":<30} Convert curl command and make the request with Fetcher. (Optimized to handle curl commands copied from DevTools network tab.)
+   - {"view(page)":<30} View page in a browser
+   - {"help()":<30} Show this help message (Shell help)
+Type 'exit' or press Ctrl+D to exit.
+        """
+    def update_page(self, result):  # pragma: no cover
+        """Update the current page and add to pages history"""
+        self.page = result
+        if isinstance(result, (Response, Selector)):
+            self.pages.append(result)
+            if len(self.pages) > 5:
+                self.pages.pop(0)  # Remove the oldest item
+            # Update in IPython namespace too
+            if self.shell:
+                self.shell.user_ns["page"] = self.page
+                self.shell.user_ns["response"] = self.page
+                self.shell.user_ns["pages"] = self.pages
+        return result
+    def create_wrapper(
+        self, func: Callable, get_signature: bool = True, signature_name: Optional[str] = None
+    ) -> Callable:
+        """Create a wrapper that preserves function signature but updates page"""
+        @wraps(func)
+        def wrapper(*args: Any, **kwargs: Any) -> Any:
+            result = func(*args, **kwargs)
+            return self.update_page(result)
+        if get_signature:
+            # Explicitly preserve and unpack signature for IPython introspection and autocompletion
+            setattr(wrapper, "__signature__", _unpack_signature(func, signature_name))
+        else:
+            setattr(wrapper, "__signature__", signature(func))
+        return wrapper
+    def get_namespace(self):
+        """Create a namespace with application-specific objects"""
+        # Create wrapped versions of fetch functions
+        get = self.create_wrapper(self.__Fetcher.get)
+        post = self.create_wrapper(self.__Fetcher.post)
+        put = self.create_wrapper(self.__Fetcher.put)
+        delete = self.create_wrapper(self.__Fetcher.delete)
+        dynamic_fetch = self.create_wrapper(self.__DynamicFetcher.fetch)
+        stealthy_fetch = self.create_wrapper(self.__StealthyFetcher.fetch, signature_name="stealthy_fetch")
+        curl2fetcher = self.create_wrapper(self._curl_parser.convert2fetcher, get_signature=False)
+        # Create the namespace dictionary
+        return {
+            "get": get,
+            "post": post,
+            "put": put,
+            "delete": delete,
+            "Fetcher": self.__Fetcher,
+            "AsyncFetcher": self.__AsyncFetcher,
+            "FetcherSession": self.__FetcherSession,
+            "DynamicSession": self.__DynamicSession,
+            "AsyncDynamicSession": self.__AsyncDynamicSession,
+            "StealthySession": self.__StealthySession,
+            "AsyncStealthySession": self.__AsyncStealthySession,
+            "fetch": dynamic_fetch,
+            "DynamicFetcher": self.__DynamicFetcher,
+            "stealthy_fetch": stealthy_fetch,
+            "StealthyFetcher": self.__StealthyFetcher,
+            "Selector": Selector,
+            "page": self.page,
+            "response": self.page,
+            "pages": self.pages,
+            "view": show_page_in_browser,
+            "uncurl": self._curl_parser.parse,
+            "curl2fetcher": curl2fetcher,
+            "help": self.show_help,
+        }
+    def show_help(self):  # pragma: no cover
+        """Show help information"""
+        print(self.banner())
+    def start(self):  # pragma: no cover
+        """Start the interactive shell"""
+        # Get our namespace with application objects
+        namespace = self.get_namespace()
+        ipython_shell = self.__InteractiveShellEmbed(
+            banner1=self.banner(),
+            banner2="",
+            enable_tip=False,
+            exit_msg="Bye Bye",
+            user_ns=namespace,
+        )
+        self.shell = ipython_shell
+        # If a command was provided, execute it and exit
+        if self.code:
+            log.info(f"Executing provided code: {self.code}")
+            try:
+                ipython_shell.run_cell(self.code, store_history=False)
+            except Exception as e:
+                log.error(f"Error executing initial code: {e}")
+            return
+        ipython_shell()
+class Convertor:
+    """Utils for the extract shell command"""
+    _extension_map: Dict[str, extraction_types] = {
+        "md": "markdown",
+        "html": "html",
+        "txt": "text",
+    }
+    @classmethod
+    def _convert_to_markdown(cls, body: TextHandler) -> str:
+        """Convert HTML content to Markdown"""
+        from markdownify import markdownify
+        return markdownify(body)
+    @classmethod
+    def _strip_noise_tags(cls, page: Selector) -> Selector:
+        """Return a copy of the Selector with noise tags removed."""
+        clean_root = deepcopy(page._root)
+        for element in clean_root.iter(*{"script", "style", "noscript", "svg"}):
+            element.drop_tree()
+        return Selector(root=clean_root, url=page.url)
+    @classmethod
+    def _extract_content(
+        cls,
+        page: Selector,
+        extraction_type: extraction_types = "markdown",
+        css_selector: Optional[str] = None,
+        main_content_only: bool = False,
+    ) -> Generator[str, None, None]:
+        """Extract the content of a Selector"""
+        if not page or not isinstance(page, Selector):  # pragma: no cover
+            raise TypeError("Input must be of type `Selector`")
+        elif not extraction_type or extraction_type not in cls._extension_map.values():
+            raise ValueError(f"Unknown extraction type: {extraction_type}")
+        else:
+            if main_content_only:
+                page = cast(Selector, page.css("body").first) or page
+                page = cls._strip_noise_tags(page)
+            pages = [page] if not css_selector else cast(Selectors, page.css(css_selector))
+            for page in pages:
+                match extraction_type:
+                    case "markdown":
+                        yield cls._convert_to_markdown(page.html_content)
+                    case "html":
+                        yield page.html_content
+                    case "text":
+                        txt_content = page.get_all_text(
+                            strip=True, ignore_tags=("script", "style", "noscript", "svg", "iframe")
+                        )
+                        for s in (
+                            "\n",
+                            "\r",
+                            "\t",
+                            " ",
+                        ):
+                            # Remove consecutive white-spaces
+                            txt_content = TextHandler(re_sub(f"[{s}]+", s, txt_content))
+                        yield txt_content
+            yield ""
+    @classmethod
+    def write_content_to_file(cls, page: Selector, filename: str, css_selector: Optional[str] = None) -> None:
+        """Write a Selector's content to a file"""
+        if not page or not isinstance(page, Selector):  # pragma: no cover
+            raise TypeError("Input must be of type `Selector`")
+        elif not filename or not isinstance(filename, str) or not filename.strip():
+            raise ValueError("Filename must be provided")
+        elif not filename.endswith((".md", ".html", ".txt")):
+            raise ValueError("Unknown file type: filename must end with '.md', '.html', or '.txt'")
+        else:
+            with open(filename, "w", encoding=page.encoding) as f:
+                extension = filename.split(".")[-1]
+                f.write(
+                    "".join(
+                        cls._extract_content(
+                            page,
+                            cls._extension_map[extension],
+                            css_selector=css_selector,
+                        )
+                    )
+                )

core/storage.py ADDED Viewed

	@@ -0,0 +1,156 @@

+from hashlib import sha256
+from threading import RLock
+from functools import lru_cache
+from abc import ABC, abstractmethod
+from sqlite3 import connect as db_connect
+from orjson import dumps, loads
+from lxml.html import HtmlElement
+from scrapling.core.utils import _StorageTools, log
+from scrapling.core._types import Dict, Optional, Any, cast
+class StorageSystemMixin(ABC):  # pragma: no cover
+    # If you want to make your own storage system, you have to inherit from this
+    def __init__(self, url: Optional[str] = None):
+        """
+        :param url: URL of the website we are working on to separate it from other websites data
+        """
+        # Make the url in lowercase to handle this edge case until it's updated: https://github.com/barseghyanartur/tld/issues/124
+        self.url = url.lower() if (url and isinstance(url, str)) else None
+    @lru_cache(64, typed=True)
+    def _get_base_url(self, default_value: str = "default") -> str:
+        if not self.url:
+            return default_value
+        try:
+            from tld import get_tld, Result
+            # Fixing the inaccurate return type hint in `get_tld`
+            extracted: Result | None = cast(
+                Result, get_tld(self.url, as_object=True, fail_silently=True, fix_protocol=True)
+            )
+            if not extracted:
+                return default_value
+            return extracted.fld or extracted.domain or default_value
+        except AttributeError:
+            return default_value
+    @abstractmethod
+    def save(self, element: HtmlElement, identifier: str) -> None:
+        """Saves the element's unique properties to the storage for retrieval and relocation later
+        :param element: The element itself which we want to save to storage.
+        :param identifier: This is the identifier that will be used to retrieve the element later from the storage. See
+            the docs for more info.
+        """
+        raise NotImplementedError("Storage system must implement `save` method")
+    @abstractmethod
+    def retrieve(self, identifier: str) -> Optional[Dict]:
+        """Using the identifier, we search the storage and return the unique properties of the element
+        :param identifier: This is the identifier that will be used to retrieve the element from the storage. See
+            the docs for more info.
+        :return: A dictionary of the unique properties
+        """
+        raise NotImplementedError("Storage system must implement `save` method")
+    @staticmethod
+    @lru_cache(128, typed=True)
+    def _get_hash(identifier: str) -> str:
+        """If you want to hash identifier in your storage system, use this safer"""
+        _identifier = identifier.lower().strip()
+        # Hash functions have to take bytes
+        _identifier_bytes = _identifier.encode("utf-8")
+        hash_value = sha256(_identifier_bytes).hexdigest()
+        return f"{hash_value}_{len(_identifier_bytes)}"  # Length to reduce collision chance
+@lru_cache(1, typed=True)
+class SQLiteStorageSystem(StorageSystemMixin):
+    """The recommended system to use, it's race condition safe and thread safe.
+    Mainly built, so the library can run in threaded frameworks like scrapy or threaded tools
+    > It's optimized for threaded applications, but running it without threads shouldn't make it slow."""
+    def __init__(self, storage_file: str, url: Optional[str] = None):
+        """
+        :param storage_file: File to be used to store elements' data.
+        :param url: URL of the website we are working on to separate it from other websites data
+        """
+        super().__init__(url)
+        self.storage_file = storage_file
+        self.lock = RLock()  # Better than Lock for reentrancy
+        # >SQLite default mode in the earlier version is 1 not 2 (1=thread-safe 2=serialized)
+        # `check_same_thread=False` to allow it to be used across different threads.
+        self.connection = db_connect(self.storage_file, check_same_thread=False)
+        # WAL (Write-Ahead Logging) allows for better concurrency.
+        self.connection.execute("PRAGMA journal_mode=WAL")
+        self.cursor = self.connection.cursor()
+        self._setup_database()
+        log.debug(f'Storage system loaded with arguments (storage_file="{storage_file}", url="{url}")')
+    def _setup_database(self) -> None:
+        self.cursor.execute("""
+            CREATE TABLE IF NOT EXISTS storage (
+                id INTEGER PRIMARY KEY,
+                url TEXT,
+                identifier TEXT,
+                element_data TEXT,
+                UNIQUE (url, identifier)
+            )
+        """)
+        self.connection.commit()
+    def save(self, element: HtmlElement, identifier: str) -> None:
+        """Saves the elements unique properties to the storage for retrieval and relocation later
+        :param element: The element itself which we want to save to storage.
+        :param identifier: This is the identifier that will be used to retrieve the element later from the storage. See
+            the docs for more info.
+        """
+        url = self._get_base_url()
+        element_data = _StorageTools.element_to_dict(element)
+        with self.lock:
+            self.cursor.execute(
+                """
+                INSERT OR REPLACE INTO storage (url, identifier, element_data)
+                VALUES (?, ?, ?)
+            """,
+                (url, identifier, dumps(element_data)),
+            )
+            self.cursor.fetchall()
+            self.connection.commit()
+    def retrieve(self, identifier: str) -> Optional[Dict[str, Any]]:
+        """Using the identifier, we search the storage and return the unique properties of the element
+        :param identifier: This is the identifier that will be used to retrieve the element from the storage. See
+            the docs for more info.
+        :return: A dictionary of the unique properties
+        """
+        url = self._get_base_url()
+        with self.lock:
+            self.cursor.execute(
+                "SELECT element_data FROM storage WHERE url = ? AND identifier = ?",
+                (url, identifier),
+            )
+            result = self.cursor.fetchone()
+            if result:
+                return loads(result[0])
+            return None
+    def close(self):
+        """Close all connections. It will be useful when with some things like scrapy Spider.closed() function/signal"""
+        with self.lock:
+            self.connection.commit()
+            self.cursor.close()
+            self.connection.close()
+    def __del__(self):
+        """To ensure all connections are closed when the object is destroyed."""
+        self.close()

core/translator.py ADDED Viewed

	@@ -0,0 +1,134 @@

+"""
+Most of this file is an adapted version of the parsel library's translator, modified for one important reason:
+to add the pseudo-elements ``::text`` and ``::attr(ATTR_NAME)`` so that we match the Parsel/Scrapy selectors format. This will be important in future releases, but most importantly...
+you don't have to learn a new selectors/API method, as happened when bs4 adopted soupsieve :)
+    If you want to learn more about this, head to https://cssselect.readthedocs.io/en/latest/#cssselect.FunctionalPseudoElement
+"""
+from functools import lru_cache
+from cssselect import HTMLTranslator as OriginalHTMLTranslator
+from cssselect.xpath import ExpressionError, XPathExpr as OriginalXPathExpr
+from cssselect.parser import Element, FunctionalPseudoElement, PseudoElement
+from scrapling.core._types import Any, Protocol, Self
+class XPathExpr(OriginalXPathExpr):
+    textnode: bool = False
+    attribute: str | None = None
+    @classmethod
+    def from_xpath(
+        cls,
+        xpath: OriginalXPathExpr,
+        textnode: bool = False,
+        attribute: str | None = None,
+    ) -> Self:
+        x = cls(path=xpath.path, element=xpath.element, condition=xpath.condition)
+        x.textnode = textnode
+        x.attribute = attribute
+        return x
+    def __str__(self) -> str:
+        path = super().__str__()
+        if self.textnode:
+            if path == "*":  # pragma: no cover
+                path = "text()"
+            elif path.endswith("::*/*"):  # pragma: no cover
+                path = path[:-3] + "text()"
+            else:
+                path += "/text()"
+        if self.attribute is not None:
+            if path.endswith("::*/*"):  # pragma: no cover
+                path = path[:-2]
+            path += f"/@{self.attribute}"
+        return path
+    def join(
+        self: Self,
+        combiner: str,
+        other: OriginalXPathExpr,
+        *args: Any,
+        **kwargs: Any,
+    ) -> Self:
+        if not isinstance(other, XPathExpr):
+            raise ValueError(  # pragma: no cover
+                f"Expressions of type {__name__}.XPathExpr can ony join expressions"
+                f" of the same type (or its descendants), got {type(other)}"
+            )
+        super().join(combiner, other, *args, **kwargs)
+        self.textnode = other.textnode
+        self.attribute = other.attribute
+        return self
+# e.g. cssselect.GenericTranslator, cssselect.HTMLTranslator
+class TranslatorProtocol(Protocol):
+    def xpath_element(self, selector: Element) -> OriginalXPathExpr:  # pyright: ignore # pragma: no cover
+        pass
+    def css_to_xpath(self, css: str, prefix: str = ...) -> str:  # pyright: ignore # pragma: no cover
+        pass
+class TranslatorMixin:
+    """This mixin adds support to CSS pseudo elements via dynamic dispatch.
+    Currently supported pseudo-elements are ``::text`` and ``::attr(ATTR_NAME)``.
+    """
+    def xpath_element(self: TranslatorProtocol, selector: Element) -> XPathExpr:
+        # https://github.com/python/mypy/issues/14757
+        xpath = super().xpath_element(selector)  # type: ignore[safe-super]
+        return XPathExpr.from_xpath(xpath)
+    def xpath_pseudo_element(self, xpath: OriginalXPathExpr, pseudo_element: PseudoElement) -> OriginalXPathExpr:
+        """
+        Dispatch method that transforms XPath to support the pseudo-element.
+        """
+        if isinstance(pseudo_element, FunctionalPseudoElement):
+            method_name = f"xpath_{pseudo_element.name.replace('-', '_')}_functional_pseudo_element"
+            method = getattr(self, method_name, None)
+            if not method:  # pragma: no cover
+                raise ExpressionError(f"The functional pseudo-element ::{pseudo_element.name}() is unknown")
+            xpath = method(xpath, pseudo_element)
+        else:
+            method_name = f"xpath_{pseudo_element.replace('-', '_')}_simple_pseudo_element"
+            method = getattr(self, method_name, None)
+            if not method:  # pragma: no cover
+                raise ExpressionError(f"The pseudo-element ::{pseudo_element} is unknown")
+            xpath = method(xpath)
+        return xpath
+    @staticmethod
+    def xpath_attr_functional_pseudo_element(xpath: OriginalXPathExpr, function: FunctionalPseudoElement) -> XPathExpr:
+        """Support selecting attribute values using ::attr() pseudo-element"""
+        if function.argument_types() not in (["STRING"], ["IDENT"]):  # pragma: no cover
+            raise ExpressionError(f"Expected a single string or ident for ::attr(), got {function.arguments!r}")
+        return XPathExpr.from_xpath(xpath, attribute=function.arguments[0].value)
+    @staticmethod
+    def xpath_text_simple_pseudo_element(xpath: OriginalXPathExpr) -> XPathExpr:
+        """Support selecting text nodes using ::text pseudo-element"""
+        return XPathExpr.from_xpath(xpath, textnode=True)
+class HTMLTranslator(TranslatorMixin, OriginalHTMLTranslator):
+    def css_to_xpath(self, css: str, prefix: str = "descendant-or-self::") -> str:
+        return super().css_to_xpath(css, prefix)
+translator = HTMLTranslator()
+# Using a function instead of the translator directly to avoid Pyright override error
+@lru_cache(maxsize=256)
+def css_to_xpath(query: str) -> str:
+    """Return the translated XPath version of a given CSS query"""
+    return translator.css_to_xpath(query)

core/utils/__init__.py ADDED Viewed

	@@ -0,0 +1,11 @@

+from ._utils import (
+    log,
+    set_logger,
+    reset_logger,
+    __CONSECUTIVE_SPACES_REGEX__,
+    flatten,
+    _is_iterable,
+    _StorageTools,
+    clean_spaces,
+    html_forbidden,
+)

core/utils/_shell.py ADDED Viewed

	@@ -0,0 +1,48 @@

+from http import cookies as Cookie
+from scrapling.core._types import (
+    List,
+    Dict,
+    Tuple,
+)
+def _CookieParser(cookie_string):
+    # Errors will be handled on call so the log can be specified
+    cookie_parser = Cookie.SimpleCookie()
+    cookie_parser.load(cookie_string)
+    for key, morsel in cookie_parser.items():
+        yield key, morsel.value
+def _ParseHeaders(header_lines: List[str], parse_cookies: bool = True) -> Tuple[Dict[str, str], Dict[str, str]]:
+    """Parses headers into separate header and cookie dictionaries."""
+    header_dict = dict()
+    cookie_dict = dict()
+    for header_line in header_lines:
+        if ":" not in header_line:
+            if header_line.endswith(";"):
+                header_key = header_line[:-1].strip()
+                header_value = ""
+                header_dict[header_key] = header_value
+            else:
+                raise ValueError(f"Could not parse header without colon: '{header_line}'.")
+        else:
+            header_key, header_value = header_line.split(":", 1)
+            header_key = header_key.strip()
+            header_value = header_value.strip()
+            if parse_cookies:
+                if header_key.lower() == "cookie":
+                    try:
+                        cookie_dict = {key: value for key, value in _CookieParser(header_value)}
+                    except Exception as e:  # pragma: no cover
+                        raise ValueError(f"Could not parse cookie string from header '{header_value}': {e}")
+                else:
+                    header_dict[header_key] = header_value
+            else:
+                header_dict[header_key] = header_value
+    return header_dict, cookie_dict

core/utils/_utils.py ADDED Viewed

	@@ -0,0 +1,120 @@

+import logging
+from itertools import chain
+from re import compile as re_compile
+from contextvars import ContextVar, Token
+from lxml import html
+from scrapling.core._types import Any, Dict, Iterable, List
+# Using cache on top of a class is a brilliant way to achieve a Singleton design pattern without much code
+from functools import lru_cache  # isort:skip
+# Node types excluded when `_StorageTools.element_to_dict` lists an element's children
+html_forbidden = (html.HtmlComment,)
+# Translation table used by `clean_spaces`: tabs become a space, newlines and carriage returns are removed
+__CLEANING_TABLE__ = str.maketrans({"\t": " ", "\n": None, "\r": None})
+# Matches a run of one or more space characters
+__CONSECUTIVE_SPACES_REGEX__ = re_compile(r" +")
+@lru_cache(1, typed=True)
+def setup_logger():
+    """Create and configure a logger with a standard format.
+    :returns: logging.Logger: Configured logger instance
+    """
+    logger = logging.getLogger("scrapling")
+    logger.setLevel(logging.INFO)
+    formatter = logging.Formatter(fmt="[%(asctime)s] %(levelname)s: %(message)s", datefmt="%Y-%m-%d %H:%M:%S")
+    console_handler = logging.StreamHandler()
+    console_handler.setFormatter(formatter)
+    # Add handler to logger (if not already added)
+    if not logger.handlers:
+        logger.addHandler(console_handler)
+    return logger
+# Context-local logger used by `log`; defaults to the logger built by `setup_logger()` until `set_logger` overrides it
+_current_logger: ContextVar[logging.Logger] = ContextVar("scrapling_logger", default=setup_logger())
+class LoggerProxy:
+    def __getattr__(self, name: str):
+        return getattr(_current_logger.get(), name)
+log = LoggerProxy()
+def set_logger(logger: logging.Logger) -> Token:
+    """Set the current context logger. Returns token for reset."""
+    return _current_logger.set(logger)
+def reset_logger(token: Token) -> None:
+    """Reset logger to previous state using token."""
+    _current_logger.reset(token)
+def flatten(lst: Iterable[Any]) -> List[Any]:
+    return list(chain.from_iterable(lst))
+def _is_iterable(obj: Any) -> bool:
+    # This will be used only in regex functions to make sure it's iterable but not string/bytes
+    return isinstance(
+        obj,
+        (
+            list,
+            tuple,
+        ),
+    )
+class _StorageTools:
+    @staticmethod
+    def __clean_attributes(element: html.HtmlElement, forbidden: tuple = ()) -> Dict:
+        if not element.attrib:
+            return {}
+        return {k: v.strip() for k, v in element.attrib.items() if v and v.strip() and k not in forbidden}
+    @classmethod
+    def element_to_dict(cls, element: html.HtmlElement) -> Dict:
+        parent = element.getparent()
+        result = {
+            "tag": str(element.tag),
+            "attributes": cls.__clean_attributes(element),
+            "text": element.text.strip() if element.text else None,
+            "path": cls._get_element_path(element),
+        }
+        if parent is not None:
+            result.update(
+                {
+                    "parent_name": parent.tag,
+                    "parent_attribs": dict(parent.attrib),
+                    "parent_text": parent.text.strip() if parent.text else None,
+                }
+            )
+            siblings = [child.tag for child in parent.iterchildren() if child != element]
+            if siblings:
+                result.update({"siblings": tuple(siblings)})
+        children = [child.tag for child in element.iterchildren() if not isinstance(child, html_forbidden)]
+        if children:
+            result.update({"children": tuple(children)})
+        return result
+    @classmethod
+    def _get_element_path(cls, element: html.HtmlElement):
+        parent = element.getparent()
+        return tuple((element.tag,) if parent is None else (cls._get_element_path(parent) + (element.tag,)))
+@lru_cache(128, typed=True)
+def clean_spaces(string):
+    string = string.translate(__CLEANING_TABLE__)
+    return __CONSECUTIVE_SPACES_REGEX__.sub(" ", string)

engines/__init__.py ADDED Viewed

File without changes

engines/_browsers/__init__.py ADDED Viewed

File without changes

engines/_browsers/_base.py ADDED Viewed

	@@ -0,0 +1,534 @@

+from time import time
+from asyncio import sleep as asyncio_sleep, Lock
+from contextlib import contextmanager, asynccontextmanager
+from playwright.sync_api._generated import Page
+from playwright.sync_api import (
+    Frame,
+    BrowserContext,
+    Response as SyncPlaywrightResponse,
+)
+from playwright.async_api._generated import Page as AsyncPage
+from playwright.async_api import (
+    Frame as AsyncFrame,
+    Response as AsyncPlaywrightResponse,
+    BrowserContext as AsyncBrowserContext,
+)
+from playwright._impl._errors import Error as PlaywrightError
+from scrapling.parser import Selector
+from scrapling.engines._browsers._page import PageInfo, PagePool
+from scrapling.engines._browsers._validators import validate, PlaywrightConfig, StealthConfig
+from scrapling.engines._browsers._config_tools import __default_chrome_useragent__, __default_useragent__
+from scrapling.engines.toolbelt.navigation import (
+    construct_proxy_dict,
+    create_intercept_handler,
+    create_async_intercept_handler,
+)
+from scrapling.core._types import (
+    Any,
+    Dict,
+    List,
+    Set,
+    Optional,
+    Callable,
+    TYPE_CHECKING,
+    cast,
+    overload,
+    Tuple,
+    ProxyType,
+    Generator,
+    AsyncGenerator,
+)
+from scrapling.engines.constants import STEALTH_ARGS, HARMFUL_ARGS, DEFAULT_ARGS
+class SyncSession:
+    _config: "PlaywrightConfig | StealthConfig"
+    _context_options: Dict[str, Any]
+    def _build_context_with_proxy(self, proxy: Optional[ProxyType] = None) -> Dict[str, Any]:
+        raise NotImplementedError  # pragma: no cover
+    def __init__(self, max_pages: int = 1):
+        self.max_pages = max_pages
+        self.page_pool = PagePool(max_pages)
+        self._max_wait_for_page = 60
+        self.playwright: Any = None
+        self.context: Any = None
+        self.browser: Any = None
+        self._is_alive = False
+    def start(self) -> None:
+        pass
+    def close(self):  # pragma: no cover
+        """Close all resources"""
+        if not self._is_alive:
+            return
+        if self.context:
+            self.context.close()
+            self.context = None
+        if self.browser:
+            self.browser.close()
+            self.browser = None
+        if self.playwright:
+            self.playwright.stop()
+            self.playwright = None  # pyright: ignore
+        self._is_alive = False
+    def __enter__(self):
+        self.start()
+        return self
+    def __exit__(self, exc_type, exc_val, exc_tb):
+        self.close()
+    def _initialize_context(self, config: PlaywrightConfig | StealthConfig, ctx: BrowserContext) -> BrowserContext:
+        """Initialize the browser context."""
+        if config.init_script:
+            ctx.add_init_script(path=config.init_script)
+        if config.cookies:  # pragma: no cover
+            ctx.add_cookies(config.cookies)
+        return ctx
+    def _get_page(
+        self,
+        timeout: int | float,
+        extra_headers: Optional[Dict[str, str]],
+        disable_resources: bool,
+        blocked_domains: Optional[Set[str]] = None,
+        context: Optional[BrowserContext] = None,
+    ) -> PageInfo[Page]:  # pragma: no cover
+        """Get a new page to use"""
+        # No need to check if a page is available or not in sync code because the code blocked before reaching here till the page closed, ofc.
+        ctx = context if context is not None else self.context
+        assert ctx is not None, "Browser context not initialized"
+        page = ctx.new_page()
+        page.set_default_navigation_timeout(timeout)
+        page.set_default_timeout(timeout)
+        if extra_headers:
+            page.set_extra_http_headers(extra_headers)
+        if disable_resources or blocked_domains:
+            page.route("**/*", create_intercept_handler(disable_resources, blocked_domains))
+        page_info = self.page_pool.add_page(page)
+        page_info.mark_busy()
+        return page_info
+    def get_pool_stats(self) -> Dict[str, int]:
+        """Get statistics about the current page pool"""
+        return {
+            "total_pages": self.page_pool.pages_count,
+            "busy_pages": self.page_pool.busy_count,
+            "max_pages": self.max_pages,
+        }
+    @staticmethod
+    def _wait_for_networkidle(page: Page | Frame, timeout: Optional[int] = None):
+        """Wait for the page to become idle (no network activity) even if there are never-ending requests."""
+        try:
+            page.wait_for_load_state("networkidle", timeout=timeout)
+        except (PlaywrightError, Exception):
+            pass
+    def _wait_for_page_stability(self, page: Page | Frame, load_dom: bool, network_idle: bool):
+        page.wait_for_load_state(state="load")
+        if load_dom:
+            page.wait_for_load_state(state="domcontentloaded")
+        if network_idle:
+            self._wait_for_networkidle(page)
+    @staticmethod
+    def _create_response_handler(page_info: PageInfo[Page], response_container: List) -> Callable:
+        """Create a response handler that captures the final navigation response.
+        :param page_info: The PageInfo object containing the page
+        :param response_container: A list to store the final response (mutable container)
+        :return: A callback function for page.on("response", ...)
+        """
+        def handle_response(finished_response: SyncPlaywrightResponse):
+            if (
+                finished_response.request.resource_type == "document"
+                and finished_response.request.is_navigation_request()
+                and finished_response.request.frame == page_info.page.main_frame
+            ):
+                response_container[0] = finished_response
+        return handle_response
+    @contextmanager
+    def _page_generator(
+        self,
+        timeout: int | float,
+        extra_headers: Optional[Dict[str, str]],
+        disable_resources: bool,
+        proxy: Optional[ProxyType] = None,
+        blocked_domains: Optional[Set[str]] = None,
+    ) -> Generator["PageInfo[Page]", None, None]:
+        """Acquire a page - either from persistent context or fresh context with proxy."""
+        if proxy:
+            # Rotation mode: create fresh context with the provided proxy
+            if not self.browser:  # pragma: no cover
+                raise RuntimeError("Browser not initialized for proxy rotation mode")
+            context_options = self._build_context_with_proxy(proxy)
+            context: BrowserContext = self.browser.new_context(**context_options)
+            try:
+                context = self._initialize_context(self._config, context)
+                page_info = self._get_page(timeout, extra_headers, disable_resources, blocked_domains, context=context)
+                yield page_info
+            finally:
+                context.close()
+        else:
+            # Standard mode: use PagePool with persistent context
+            page_info = self._get_page(timeout, extra_headers, disable_resources, blocked_domains)
+            try:
+                yield page_info
+            finally:
+                page_info.page.close()
+                self.page_pool.pages.remove(page_info)
+class AsyncSession:
+    _config: "PlaywrightConfig | StealthConfig"
+    _context_options: Dict[str, Any]
+    def _build_context_with_proxy(self, proxy: Optional[ProxyType] = None) -> Dict[str, Any]:
+        raise NotImplementedError  # pragma: no cover
+    def __init__(self, max_pages: int = 1):
+        self.max_pages = max_pages
+        self.page_pool = PagePool(max_pages)
+        self._max_wait_for_page = 60
+        self.playwright: Any = None
+        self.context: Any = None
+        self.browser: Any = None
+        self._is_alive = False
+        self._lock = Lock()
+    async def start(self) -> None:
+        pass
+    async def close(self):
+        """Close all resources"""
+        if not self._is_alive:  # pragma: no cover
+            return
+        if self.context:
+            await self.context.close()
+            self.context = None  # pyright: ignore
+        if self.browser:
+            await self.browser.close()
+            self.browser = None
+        if self.playwright:
+            await self.playwright.stop()
+            self.playwright = None  # pyright: ignore
+        self._is_alive = False
+    async def __aenter__(self):
+        await self.start()
+        return self
+    async def __aexit__(self, exc_type, exc_val, exc_tb):
+        await self.close()
+    async def _initialize_context(
+        self, config: PlaywrightConfig | StealthConfig, ctx: AsyncBrowserContext
+    ) -> AsyncBrowserContext:
+        """Initialize the browser context."""
+        if config.init_script:  # pragma: no cover
+            await ctx.add_init_script(path=config.init_script)
+        if config.cookies:  # pragma: no cover
+            await ctx.add_cookies(config.cookies)
+        return ctx
+    async def _get_page(
+        self,
+        timeout: int | float,
+        extra_headers: Optional[Dict[str, str]],
+        disable_resources: bool,
+        blocked_domains: Optional[Set[str]] = None,
+        context: Optional[AsyncBrowserContext] = None,
+    ) -> PageInfo[AsyncPage]:  # pragma: no cover
+        """Get a new page to use"""
+        ctx = context if context is not None else self.context
+        if TYPE_CHECKING:
+            assert ctx is not None, "Browser context not initialized"
+        async with self._lock:
+            # If we're at max capacity after cleanup, wait for busy pages to finish
+            if context is None and self.page_pool.pages_count >= self.max_pages:
+                # Only applies when using persistent context
+                start_time = time()
+                while time() - start_time < self._max_wait_for_page:
+                    await asyncio_sleep(0.05)
+                    if self.page_pool.pages_count < self.max_pages:
+                        break
+                else:
+                    raise TimeoutError(
+                        f"No pages finished to clear place in the pool within the {self._max_wait_for_page}s timeout period"
+                    )
+            page = await ctx.new_page()
+            page.set_default_navigation_timeout(timeout)
+            page.set_default_timeout(timeout)
+            if extra_headers:
+                await page.set_extra_http_headers(extra_headers)
+            if disable_resources or blocked_domains:
+                await page.route("**/*", create_async_intercept_handler(disable_resources, blocked_domains))
+            return self.page_pool.add_page(page)
+    def get_pool_stats(self) -> Dict[str, int]:
+        """Get statistics about the current page pool"""
+        return {
+            "total_pages": self.page_pool.pages_count,
+            "busy_pages": self.page_pool.busy_count,
+            "max_pages": self.max_pages,
+        }
+    @staticmethod
+    async def _wait_for_networkidle(page: AsyncPage | AsyncFrame, timeout: Optional[int] = None):
+        """Wait for the page to become idle (no network activity) even if there are never-ending requests."""
+        try:
+            await page.wait_for_load_state("networkidle", timeout=timeout)
+        except (PlaywrightError, Exception):
+            pass
+    async def _wait_for_page_stability(self, page: AsyncPage | AsyncFrame, load_dom: bool, network_idle: bool):
+        await page.wait_for_load_state(state="load")
+        if load_dom:
+            await page.wait_for_load_state(state="domcontentloaded")
+        if network_idle:
+            await self._wait_for_networkidle(page)
+    @staticmethod
+    def _create_response_handler(page_info: PageInfo[AsyncPage], response_container: List) -> Callable:
+        """Create an async response handler that captures the final navigation response.
+        :param page_info: The PageInfo object containing the page
+        :param response_container: A list to store the final response (mutable container)
+        :return: A callback function for page.on("response", ...)
+        """
+        async def handle_response(finished_response: AsyncPlaywrightResponse):
+            if (
+                finished_response.request.resource_type == "document"
+                and finished_response.request.is_navigation_request()
+                and finished_response.request.frame == page_info.page.main_frame
+            ):
+                response_container[0] = finished_response
+        return handle_response
+    @asynccontextmanager
+    async def _page_generator(
+        self,
+        timeout: int | float,
+        extra_headers: Optional[Dict[str, str]],
+        disable_resources: bool,
+        proxy: Optional[ProxyType] = None,
+        blocked_domains: Optional[Set[str]] = None,
+    ) -> AsyncGenerator["PageInfo[AsyncPage]", None]:
+        """Acquire a page - either from persistent context or fresh context with proxy."""
+        if proxy:
+            # Rotation mode: create fresh context with the provided proxy
+            if not self.browser:  # pragma: no cover
+                raise RuntimeError("Browser not initialized for proxy rotation mode")
+            context_options = self._build_context_with_proxy(proxy)
+            context: AsyncBrowserContext = await self.browser.new_context(**context_options)
+            try:
+                context = await self._initialize_context(self._config, context)
+                page_info = await self._get_page(
+                    timeout, extra_headers, disable_resources, blocked_domains, context=context
+                )
+                yield page_info
+            finally:
+                await context.close()
+        else:
+            # Standard mode: use PagePool with persistent context
+            page_info = await self._get_page(timeout, extra_headers, disable_resources, blocked_domains)
+            try:
+                yield page_info
+            finally:
+                await page_info.page.close()
+                self.page_pool.pages.remove(page_info)
+class BaseSessionMixin:
+    _config: "PlaywrightConfig | StealthConfig"
+    @overload
+    def __validate_routine__(self, params: Dict, model: type[StealthConfig]) -> StealthConfig: ...
+    @overload
+    def __validate_routine__(self, params: Dict, model: type[PlaywrightConfig]) -> PlaywrightConfig: ...
+    def __validate_routine__(
+        self, params: Dict, model: type[PlaywrightConfig] | type[StealthConfig]
+    ) -> PlaywrightConfig | StealthConfig:
+        # Dark color scheme bypasses the 'prefersLightColor' check in creepjs
+        self._context_options: Dict[str, Any] = {"color_scheme": "dark", "device_scale_factor": 2}
+        self._browser_options: Dict[str, Any] = {
+            "args": DEFAULT_ARGS,
+            "ignore_default_args": HARMFUL_ARGS,
+        }
+        if "__max_pages" in params:
+            params["max_pages"] = params.pop("__max_pages")
+        config = validate(params, model=model)
+        self._headers_keys = (
+            {header.lower() for header in config.extra_headers.keys()} if config.extra_headers else set()
+        )
+        return config
+    def __generate_options__(self, extra_flags: Tuple | None = None) -> None:
+        config: PlaywrightConfig | StealthConfig = self._config
+        self._context_options.update(
+            {
+                "proxy": config.proxy,
+                "locale": config.locale,
+                "timezone_id": config.timezone_id,
+                "extra_http_headers": config.extra_headers,
+            }
+        )
+        # The default useragent in the headful is always correct now in the current versions of Playwright
+        if config.useragent:
+            self._context_options["user_agent"] = config.useragent
+        elif not config.useragent and config.headless:
+            self._context_options["user_agent"] = (
+                __default_chrome_useragent__ if config.real_chrome else __default_useragent__
+            )
+        if not config.cdp_url:
+            flags = self._browser_options["args"]
+            if config.extra_flags or extra_flags:
+                flags = list(set(flags + (config.extra_flags or extra_flags)))
+            self._browser_options.update(
+                {
+                    "args": flags,
+                    "headless": config.headless,
+                    "channel": "chrome" if config.real_chrome else "chromium",
+                }
+            )
+            self._user_data_dir = config.user_data_dir
+        else:
+            self._browser_options = {}
+        if config.additional_args:
+            self._context_options.update(config.additional_args)
+    def _build_context_with_proxy(self, proxy: Optional[ProxyType] = None) -> Dict[str, Any]:
+        """
+        Build context options with a specific proxy for rotation mode.
+        :param proxy: Proxy URL string or Playwright-style proxy dict to use for this context.
+        :return: Dictionary of context options for browser.new_context().
+        """
+        context_options = self._context_options.copy()
+        # Override proxy if provided
+        if proxy:
+            context_options["proxy"] = construct_proxy_dict(proxy)
+        return context_options
+class DynamicSessionMixin(BaseSessionMixin):
+    def __validate__(self, **params):
+        self._config = self.__validate_routine__(params, model=PlaywrightConfig)
+        self.__generate_options__()
+class StealthySessionMixin(BaseSessionMixin):
+    def __validate__(self, **params):
+        self._config = self.__validate_routine__(params, model=StealthConfig)
+        self._context_options.update(
+            {
+                "is_mobile": False,
+                "has_touch": False,
+                # I'm thinking about disabling it to rest from all Service Workers' headache, but let's keep it as it is for now
+                "service_workers": "allow",
+                "ignore_https_errors": True,
+                "screen": {"width": 1920, "height": 1080},
+                "viewport": {"width": 1920, "height": 1080},
+                "permissions": ["geolocation", "notifications"],
+            }
+        )
+        self.__generate_stealth_options()
+    def __generate_stealth_options(self) -> None:
+        config = cast(StealthConfig, self._config)
+        flags: Tuple[str, ...] = tuple()
+        if not config.cdp_url:
+            flags = DEFAULT_ARGS + STEALTH_ARGS
+            if config.block_webrtc:
+                flags += (
+                    "--webrtc-ip-handling-policy=disable_non_proxied_udp",
+                    "--force-webrtc-ip-handling-policy",  # Ensures the policy is enforced
+                )
+            if not config.allow_webgl:
+                flags += (
+                    "--disable-webgl",
+                    "--disable-webgl-image-chromium",
+                    "--disable-webgl2",
+                )
+            if config.hide_canvas:
+                flags += ("--fingerprinting-canvas-image-data-noise",)
+        super(StealthySessionMixin, self).__generate_options__(flags)
+    @staticmethod
+    def _detect_cloudflare(page_content: str) -> str | None:
+        """
+        Detect the type of Cloudflare challenge present in the provided page content.
+        This function analyzes the given page content to identify whether a specific
+        type of Cloudflare challenge is present. It checks for three predefined
+        challenge types: non-interactive, managed, and interactive. If a challenge
+        type is detected, it returns the corresponding type as a string. If no
+        challenge type is detected, it returns None.
+        Args:
+            page_content (str): The content of the page to analyze for Cloudflare
+                challenge types.
+        Returns:
+            str: A string representing the detected Cloudflare challenge type, if
+                found. Returns None if no challenge matches.
+        """
+        challenge_types = (
+            "non-interactive",
+            "managed",
+            "interactive",
+        )
+        for ctype in challenge_types:
+            if f"cType: '{ctype}'" in page_content:
+                return ctype
+        # Check if turnstile captcha is embedded inside the page (Usually inside a closed Shadow iframe)
+        selector = Selector(content=page_content)
+        if selector.css('script[src*="challenges.cloudflare.com/turnstile/v"]'):
+            return "embedded"
+        return None

engines/_browsers/_config_tools.py ADDED Viewed

	@@ -0,0 +1,4 @@

+from scrapling.engines.toolbelt.fingerprints import generate_headers
+# Default User-Agent strings, computed once when this module is imported.
+# Each is the "User-Agent" entry of a generated header set; `.get` yields None if that entry is missing.
+__default_useragent__ = generate_headers(browser_mode=True).get("User-Agent")
+# Same, but generated with browser_mode="chrome"
+__default_chrome_useragent__ = generate_headers(browser_mode="chrome").get("User-Agent")

engines/_browsers/_controllers.py ADDED Viewed

	@@ -0,0 +1,362 @@

+from time import sleep as time_sleep
+from asyncio import sleep as asyncio_sleep
+from playwright.sync_api import (
+    Locator,
+    sync_playwright,
+)
+from playwright.async_api import (
+    async_playwright,
+    Locator as AsyncLocator,
+)
+from scrapling.core.utils import log
+from scrapling.core._types import Optional, ProxyType, Unpack
+from scrapling.engines.toolbelt.proxy_rotation import is_proxy_error
+from scrapling.engines.toolbelt.convertor import Response, ResponseFactory
+from scrapling.engines.toolbelt.fingerprints import generate_convincing_referer
+from scrapling.engines._browsers._types import PlaywrightSession, PlaywrightFetchParams
+from scrapling.engines._browsers._base import SyncSession, AsyncSession, DynamicSessionMixin
+from scrapling.engines._browsers._validators import validate_fetch as _validate, PlaywrightConfig
+class DynamicSession(SyncSession, DynamicSessionMixin):
+    """A Browser session manager with page pooling."""
+    __slots__ = (
+        "_config",
+        "_context_options",
+        "_browser_options",
+        "_user_data_dir",
+        "_headers_keys",
+        "max_pages",
+        "page_pool",
+        "_max_wait_for_page",
+        "playwright",
+        "context",
+    )
+    def __init__(self, **kwargs: Unpack[PlaywrightSession]):
+        """A Browser session manager with page pooling, it's using a persistent browser Context by default with a temporary user profile directory.
+        :param headless: Run the browser in headless/hidden (default), or headful/visible mode.
+        :param disable_resources: Drop requests for unnecessary resources for a speed boost.
+            Requests dropped are of type `font`, `image`, `media`, `beacon`, `object`, `imageset`, `texttrack`, `websocket`, `csp_report`, and `stylesheet`.
+        :param blocked_domains: A set of domain names to block requests to. Subdomains are also matched (e.g., ``"example.com"`` blocks ``"sub.example.com"`` too).
+        :param useragent: Pass a useragent string to be used. Otherwise the fetcher will generate a real Useragent of the same browser and use it.
+        :param cookies: Set cookies for the next request.
+        :param network_idle: Wait for the page until there are no network connections for at least 500 ms.
+        :param timeout: The timeout in milliseconds that is used in all operations and waits through the page. The default is 30,000
+        :param wait: The time (milliseconds) the fetcher will wait after everything finishes before closing the page and returning the ` Response ` object.
+        :param page_action: Added for automation. A function that takes the `page` object and does the automation you need.
+        :param wait_selector: Wait for a specific CSS selector to be in a specific state.
+        :param init_script: An absolute path to a JavaScript file to be executed on page creation for all pages in this session.
+        :param locale: Specify user locale, for example, `en-GB`, `de-DE`, etc. Locale will affect navigator.language value, Accept-Language request header value as well as number and date formatting
+            rules. Defaults to the system default locale.
+        :param timezone_id: Changes the timezone of the browser. Defaults to the system timezone.
+        :param wait_selector_state: The state to wait for the selector given with `wait_selector`. The default state is `attached`.
+        :param real_chrome: If you have a Chrome browser installed on your device, enable this, and the Fetcher will launch an instance of your browser and use it.
+        :param load_dom: Enabled by default, wait for all JavaScript on page(s) to fully load and execute.
+        :param cdp_url: Instead of launching a new browser instance, connect to this CDP URL to control real browsers through CDP.
+        :param google_search: Enabled by default, Scrapling will set the referer header to be as if this request came from a Google search of this website's domain name.
+        :param extra_headers: A dictionary of extra headers to add to the request. _The referer set by the `google_search` argument takes priority over the referer set here if used together._
+        :param proxy: The proxy to be used with requests, it can be a string or a dictionary with the keys 'server', 'username', and 'password' only.
+        :param user_data_dir: Path to a User Data Directory, which stores browser session data like cookies and local storage. The default is to create a temporary directory.
+        :param extra_flags: A list of additional browser flags to pass to the browser on launch.
+        :param selector_config: The arguments that will be passed in the end while creating the final Selector's class.
+        :param additional_args: Additional arguments to be passed to Playwright's context as additional settings, and it takes higher priority than Scrapling's settings.
+        """
+        self.__validate__(**kwargs)
+        super().__init__()
+    def start(self):
+        """Create a browser for this instance and context."""
+        if not self.playwright:
+            self.playwright = sync_playwright().start()
+            try:
+                if self._config.cdp_url:  # pragma: no cover
+                    self.browser = self.playwright.chromium.connect_over_cdp(endpoint_url=self._config.cdp_url)
+                    if not self._config.proxy_rotator and self.browser:
+                        self.context = self.browser.new_context(**self._context_options)
+                elif self._config.proxy_rotator:
+                    self.browser = self.playwright.chromium.launch(**self._browser_options)
+                else:
+                    persistent_options = (
+                        self._browser_options | self._context_options | {"user_data_dir": self._user_data_dir}
+                    )
+                    self.context = self.playwright.chromium.launch_persistent_context(**persistent_options)
+                if self.context:
+                    self.context = self._initialize_context(self._config, self.context)
+                self._is_alive = True
+            except Exception:
+                # Clean up playwright if browser setup fails
+                self.playwright.stop()
+                self.playwright = None
+                raise
+        else:
+            raise RuntimeError("Session has been already started")
+    def fetch(self, url: str, **kwargs: Unpack[PlaywrightFetchParams]) -> Response:
+        """Opens up the browser and do your request based on your chosen options.
+        :param url: The Target url.
+        :param google_search: Enabled by default, Scrapling will set the referer header to be as if this request came from a Google search of this website's domain name.
+        :param timeout: The timeout in milliseconds that is used in all operations and waits through the page. The default is 30,000
+        :param wait: The time (milliseconds) the fetcher will wait after everything finishes before closing the page and returning the ` Response ` object.
+        :param page_action: Added for automation. A function that takes the `page` object and does the automation you need.
+        :param extra_headers: A dictionary of extra headers to add to the request. _The referer set by the `google_search` argument takes priority over the referer set here if used together._
+        :param disable_resources: Drop requests for unnecessary resources for a speed boost.
+            Requests dropped are of type `font`, `image`, `media`, `beacon`, `object`, `imageset`, `texttrack`, `websocket`, `csp_report`, and `stylesheet`.
+        :param blocked_domains: A set of domain names to block requests to. Subdomains are also matched (e.g., ``"example.com"`` blocks ``"sub.example.com"`` too).
+        :param wait_selector: Wait for a specific CSS selector to be in a specific state.
+        :param wait_selector_state: The state to wait for the selector given with `wait_selector`. The default state is `attached`.
+        :param network_idle: Wait for the page until there are no network connections for at least 500 ms.
+        :param load_dom: Enabled by default, wait for all JavaScript on page(s) to fully load and execute.
+        :param selector_config: The arguments that will be passed in the end while creating the final Selector's class.
+        :param proxy: Static proxy to override rotator and session proxy. A new browser context will be created and used with it.
+        :return: A `Response` object.
+        """
+        static_proxy = kwargs.pop("proxy", None)
+        params = _validate(kwargs, self, PlaywrightConfig)
+        if not self._is_alive:  # pragma: no cover
+            raise RuntimeError("Context manager has been closed")
+        request_headers_keys = {h.lower() for h in params.extra_headers.keys()} if params.extra_headers else set()
+        referer = (
+            generate_convincing_referer(url)
+            if (params.google_search and "referer" not in request_headers_keys)
+            else None
+        )
+        for attempt in range(self._config.retries):
+            proxy: Optional[ProxyType] = None
+            if self._config.proxy_rotator and static_proxy is None:
+                proxy = self._config.proxy_rotator.get_proxy()
+            else:
+                proxy = static_proxy
+            with self._page_generator(
+                params.timeout, params.extra_headers, params.disable_resources, proxy, params.blocked_domains
+            ) as page_info:
+                final_response = [None]
+                page = page_info.page
+                page.on("response", self._create_response_handler(page_info, final_response))
+                try:
+                    first_response = page.goto(url, referer=referer)
+                    self._wait_for_page_stability(page, params.load_dom, params.network_idle)
+                    if not first_response:
+                        raise RuntimeError(f"Failed to get response for {url}")
+                    if params.page_action:
+                        try:
+                            _ = params.page_action(page)
+                        except Exception as e:  # pragma: no cover
+                            log.error(f"Error executing page_action: {e}")
+                    if params.wait_selector:
+                        try:
+                            waiter: Locator = page.locator(params.wait_selector)
+                            waiter.first.wait_for(state=params.wait_selector_state)
+                            self._wait_for_page_stability(page, params.load_dom, params.network_idle)
+                        except Exception as e:  # pragma: no cover
+                            log.error(f"Error waiting for selector {params.wait_selector}: {e}")
+                    page.wait_for_timeout(params.wait)
+                    response = ResponseFactory.from_playwright_response(
+                        page, first_response, final_response[0], params.selector_config, meta={"proxy": proxy}
+                    )
+                    return response
+                except Exception as e:
+                    page_info.mark_error()
+                    if attempt < self._config.retries - 1:
+                        if is_proxy_error(e):
+                            log.warning(
+                                f"Proxy '{proxy}' failed (attempt {attempt + 1}) | Retrying in {self._config.retry_delay}s..."
+                            )
+                        else:
+                            log.warning(
+                                f"Attempt {attempt + 1} failed: {e}. Retrying in {self._config.retry_delay}s..."
+                            )
+                        time_sleep(self._config.retry_delay)
+                    else:
+                        log.error(f"Failed after {self._config.retries} attempts: {e}")
+                        raise
+        raise RuntimeError("Request failed")  # pragma: no cover
+class AsyncDynamicSession(AsyncSession, DynamicSessionMixin):
+    """An async Browser session manager with page pooling, it's using a persistent browser Context by default with a temporary user profile directory."""
+    __slots__ = (
+        "_config",
+        "_context_options",
+        "_browser_options",
+        "_user_data_dir",
+        "_headers_keys",
+    )
+    def __init__(self, **kwargs: Unpack[PlaywrightSession]):
+        """A Browser session manager with page pooling
+        :param headless: Run the browser in headless/hidden (default), or headful/visible mode.
+        :param disable_resources: Drop requests for unnecessary resources for a speed boost.
+            Requests dropped are of type `font`, `image`, `media`, `beacon`, `object`, `imageset`, `texttrack`, `websocket`, `csp_report`, and `stylesheet`.
+        :param blocked_domains: A set of domain names to block requests to. Subdomains are also matched (e.g., ``"example.com"`` blocks ``"sub.example.com"`` too).
+        :param useragent: Pass a useragent string to be used. Otherwise the fetcher will generate a real Useragent of the same browser and use it.
+        :param cookies: Set cookies for the next request.
+        :param network_idle: Wait for the page until there are no network connections for at least 500 ms.
+        :param load_dom: Enabled by default, wait for all JavaScript on page(s) to fully load and execute.
+        :param timeout: The timeout in milliseconds that is used in all operations and waits through the page. The default is 30,000
+        :param wait: The time (milliseconds) the fetcher will wait after everything finishes before closing the page and returning the ` Response ` object.
+        :param page_action: Added for automation. A function that takes the `page` object and does the automation you need.
+        :param wait_selector: Wait for a specific CSS selector to be in a specific state.
+        :param init_script: An absolute path to a JavaScript file to be executed on page creation for all pages in this session.
+        :param locale: Specify user locale, for example, `en-GB`, `de-DE`, etc. Locale will affect navigator.language value, Accept-Language request header value as well as number and date formatting
+            rules. Defaults to the system default locale.
+        :param timezone_id: Changes the timezone of the browser. Defaults to the system timezone.
+        :param wait_selector_state: The state to wait for the selector given with `wait_selector`. The default state is `attached`.
+        :param real_chrome: If you have a Chrome browser installed on your device, enable this, and the Fetcher will launch an instance of your browser and use it.
+        :param cdp_url: Instead of launching a new browser instance, connect to this CDP URL to control real browsers through CDP.
+        :param google_search: Enabled by default, Scrapling will set the referer header to be as if this request came from a Google search of this website's domain name.
+        :param extra_headers: A dictionary of extra headers to add to the request. _The referer set by the `google_search` argument takes priority over the referer set here if used together._
+        :param proxy: The proxy to be used with requests, it can be a string or a dictionary with the keys 'server', 'username', and 'password' only.
+        :param max_pages: The maximum number of tabs to be opened at the same time. It will be used in rotation through a PagePool.
+        :param user_data_dir: Path to a User Data Directory, which stores browser session data like cookies and local storage. The default is to create a temporary directory.
+        :param extra_flags: A list of additional browser flags to pass to the browser on launch.
+        :param selector_config: The arguments that will be passed in the end while creating the final Selector's class.
+        :param additional_args: Additional arguments to be passed to Playwright's context as additional settings, and it takes higher priority than Scrapling's settings.
+        """
+        self.__validate__(**kwargs)
+        super().__init__(max_pages=self._config.max_pages)
+    async def start(self) -> None:
+        """Create a browser for this instance and context."""
+        if not self.playwright:
+            self.playwright = await async_playwright().start()
+            try:
+                if self._config.cdp_url:
+                    self.browser = await self.playwright.chromium.connect_over_cdp(endpoint_url=self._config.cdp_url)
+                    if not self._config.proxy_rotator and self.browser:
+                        self.context = await self.browser.new_context(**self._context_options)
+                elif self._config.proxy_rotator:
+                    self.browser = await self.playwright.chromium.launch(**self._browser_options)
+                else:
+                    persistent_options = (
+                        self._browser_options | self._context_options | {"user_data_dir": self._user_data_dir}
+                    )
+                    self.context = await self.playwright.chromium.launch_persistent_context(**persistent_options)
+                if self.context:
+                    self.context = await self._initialize_context(self._config, self.context)
+                self._is_alive = True
+            except Exception:
+                # Clean up playwright if browser setup fails
+                await self.playwright.stop()
+                self.playwright = None
+                raise
+        else:
+            raise RuntimeError("Session has been already started")
+    async def fetch(self, url: str, **kwargs: Unpack[PlaywrightFetchParams]) -> Response:
+        """Opens up the browser and do your request based on your chosen options.
+        :param url: The Target url.
+        :param google_search: Enabled by default, Scrapling will set the referer header to be as if this request came from a Google search of this website's domain name.
+        :param timeout: The timeout in milliseconds that is used in all operations and waits through the page. The default is 30,000
+        :param wait: The time (milliseconds) the fetcher will wait after everything finishes before closing the page and returning the ` Response ` object.
+        :param page_action: Added for automation. A function that takes the `page` object and does the automation you need.
+        :param extra_headers: A dictionary of extra headers to add to the request. _The referer set by the `google_search` argument takes priority over the referer set here if used together._
+        :param disable_resources: Drop requests for unnecessary resources for a speed boost.
+            Requests dropped are of type `font`, `image`, `media`, `beacon`, `object`, `imageset`, `texttrack`, `websocket`, `csp_report`, and `stylesheet`.
+        :param blocked_domains: A set of domain names to block requests to. Subdomains are also matched (e.g., ``"example.com"`` blocks ``"sub.example.com"`` too).
+        :param wait_selector: Wait for a specific CSS selector to be in a specific state.
+        :param wait_selector_state: The state to wait for the selector given with `wait_selector`. The default state is `attached`.
+        :param network_idle: Wait for the page until there are no network connections for at least 500 ms.
+        :param load_dom: Enabled by default, wait for all JavaScript on page(s) to fully load and execute.
+        :param selector_config: The arguments that will be passed in the end while creating the final Selector's class.
+        :param proxy: Static proxy to override rotator and session proxy. A new browser context will be created and used with it.
+        :return: A `Response` object.
+        """
+        static_proxy = kwargs.pop("proxy", None)
+        params = _validate(kwargs, self, PlaywrightConfig)
+        if not self._is_alive:  # pragma: no cover
+            raise RuntimeError("Context manager has been closed")
+        request_headers_keys = {h.lower() for h in params.extra_headers.keys()} if params.extra_headers else set()
+        referer = (
+            generate_convincing_referer(url)
+            if (params.google_search and "referer" not in request_headers_keys)
+            else None
+        )
+        for attempt in range(self._config.retries):
+            proxy: Optional[ProxyType] = None
+            if self._config.proxy_rotator and static_proxy is None:
+                proxy = self._config.proxy_rotator.get_proxy()
+            else:
+                proxy = static_proxy
+            async with self._page_generator(
+                params.timeout, params.extra_headers, params.disable_resources, proxy, params.blocked_domains
+            ) as page_info:
+                final_response = [None]
+                page = page_info.page
+                page.on("response", self._create_response_handler(page_info, final_response))
+                try:
+                    first_response = await page.goto(url, referer=referer)
+                    await self._wait_for_page_stability(page, params.load_dom, params.network_idle)
+                    if not first_response:
+                        raise RuntimeError(f"Failed to get response for {url}")
+                    if params.page_action:
+                        try:
+                            _ = await params.page_action(page)
+                        except Exception as e:  # pragma: no cover
+                            log.error(f"Error executing page_action: {e}")
+                    if params.wait_selector:
+                        try:
+                            waiter: AsyncLocator = page.locator(params.wait_selector)
+                            await waiter.first.wait_for(state=params.wait_selector_state)
+                            await self._wait_for_page_stability(page, params.load_dom, params.network_idle)
+                        except Exception as e:  # pragma: no cover
+                            log.error(f"Error waiting for selector {params.wait_selector}: {e}")
+                    await page.wait_for_timeout(params.wait)
+                    response = await ResponseFactory.from_async_playwright_response(
+                        page, first_response, final_response[0], params.selector_config, meta={"proxy": proxy}
+                    )
+                    return response
+                except Exception as e:
+                    page_info.mark_error()
+                    if attempt < self._config.retries - 1:
+                        if is_proxy_error(e):
+                            log.warning(
+                                f"Proxy '{proxy}' failed (attempt {attempt + 1}) | Retrying in {self._config.retry_delay}s..."
+                            )
+                        else:
+                            log.warning(
+                                f"Attempt {attempt + 1} failed: {e}. Retrying in {self._config.retry_delay}s..."
+                            )
+                        await asyncio_sleep(self._config.retry_delay)
+                    else:
+                        log.error(f"Failed after {self._config.retries} attempts: {e}")
+                        raise
+        raise RuntimeError("Request failed")  # pragma: no cover

engines/_browsers/_page.py ADDED Viewed

	@@ -0,0 +1,87 @@

+from threading import RLock
+from dataclasses import dataclass
+from playwright.sync_api._generated import Page as SyncPage
+from playwright.async_api._generated import Page as AsyncPage
+from scrapling.core._types import Optional, List, Literal, overload, TypeVar, Generic, cast
+# Closed set of state labels stored in `PageInfo.state`.
+PageState = Literal["ready", "busy", "error"]  # States that a page can be in
+# Type variable constrained to Playwright's sync or async `Page` class; `PageInfo` is generic over it.
+PageType = TypeVar("PageType", SyncPage, AsyncPage)
+@dataclass
+class PageInfo(Generic[PageType]):
+    """Information about the page and its current state"""
+    __slots__ = ("page", "state", "url")
+    page: PageType
+    state: PageState
+    url: Optional[str]
+    def mark_busy(self, url: str = ""):
+        """Mark the page as busy"""
+        self.state = "busy"
+        self.url = url
+    def mark_error(self):
+        """Mark the page as having an error"""
+        self.state = "error"
+    def __repr__(self):
+        return f'Page(URL="{self.url!r}", state={self.state!r})'
+    def __eq__(self, other_page):
+        """Comparing this page to another page object."""
+        if other_page.__class__ is not self.__class__:
+            return NotImplemented
+        return self.page == other_page.page
+class PagePool:
+    """Manages a pool of browser pages/tabs with state tracking"""
+    __slots__ = ("max_pages", "pages", "_lock")
+    def __init__(self, max_pages: int = 5):
+        self.max_pages = max_pages
+        self.pages: List[PageInfo[SyncPage] | PageInfo[AsyncPage]] = []
+        self._lock = RLock()
+    @overload
+    def add_page(self, page: SyncPage) -> PageInfo[SyncPage]: ...
+    @overload
+    def add_page(self, page: AsyncPage) -> PageInfo[AsyncPage]: ...
+    def add_page(self, page: SyncPage | AsyncPage) -> PageInfo[SyncPage] | PageInfo[AsyncPage]:
+        """Add a new page to the pool"""
+        with self._lock:
+            if len(self.pages) >= self.max_pages:
+                raise RuntimeError(f"Maximum page limit ({self.max_pages}) reached")
+            if isinstance(page, AsyncPage):
+                page_info: PageInfo[SyncPage] | PageInfo[AsyncPage] = cast(
+                    PageInfo[AsyncPage], PageInfo(page, "ready", "")
+                )
+            else:
+                page_info = cast(PageInfo[SyncPage], PageInfo(page, "ready", ""))
+            self.pages.append(page_info)
+            return page_info
+    @property
+    def pages_count(self) -> int:
+        """Get the total number of pages"""
+        return len(self.pages)
+    @property
+    def busy_count(self) -> int:
+        """Get the number of busy pages"""
+        with self._lock:
+            return sum(1 for p in self.pages if p.state == "busy")
+    def cleanup_error_pages(self):
+        """Remove pages in error state"""
+        with self._lock:
+            self.pages = [p for p in self.pages if p.state != "error"]

engines/_browsers/_stealth.py ADDED Viewed

	@@ -0,0 +1,541 @@

+from random import randint
+from re import compile as re_compile
+from time import sleep as time_sleep
+from asyncio import sleep as asyncio_sleep
+from playwright.sync_api import Locator, Page, BrowserContext
+from playwright.async_api import (
+    Page as async_Page,
+    Locator as AsyncLocator,
+    BrowserContext as AsyncBrowserContext,
+)
+from patchright.sync_api import sync_playwright
+from patchright.async_api import async_playwright
+from scrapling.core.utils import log
+from scrapling.core._types import Any, Optional, ProxyType, Unpack
+from scrapling.engines.toolbelt.proxy_rotation import is_proxy_error
+from scrapling.engines.toolbelt.convertor import Response, ResponseFactory
+from scrapling.engines.toolbelt.fingerprints import generate_convincing_referer
+from scrapling.engines._browsers._types import StealthSession, StealthFetchParams
+from scrapling.engines._browsers._base import SyncSession, AsyncSession, StealthySessionMixin
+from scrapling.engines._browsers._validators import validate_fetch as _validate, StealthConfig
+# URL pattern of frames served from Cloudflare's challenge platform; `_cloudflare_solver` uses it to find the challenge iframe.
+__CF_PATTERN__ = re_compile(r"^https?://challenges\.cloudflare\.com/cdn-cgi/challenge-platform/.*")
+class StealthySession(SyncSession, StealthySessionMixin):
+    """A Stealthy Browser session manager with page pooling."""
+    __slots__ = (
+        "_config",
+        "_context_options",
+        "_browser_options",
+        "_user_data_dir",
+        "_headers_keys",
+        "max_pages",
+        "page_pool",
+        "_max_wait_for_page",
+        "playwright",
+        "context",
+    )
+    def __init__(self, **kwargs: Unpack[StealthSession]):
+        """Create a stealthy browser session manager with page pooling. It uses a persistent browser context by default, with a temporary user profile directory.
+        The keyword arguments are validated first, then the base session is initialized; the browser itself is only created by `start()`.
+        :param headless: Run the browser in headless/hidden (default), or headful/visible mode.
+        :param disable_resources: Drop requests for unnecessary resources for a speed boost.
+            Requests dropped are of type `font`, `image`, `media`, `beacon`, `object`, `imageset`, `texttrack`, `websocket`, `csp_report`, and `stylesheet`.
+        :param blocked_domains: A set of domain names to block requests to. Subdomains are also matched (e.g., ``"example.com"`` blocks ``"sub.example.com"`` too).
+        :param useragent: Pass a useragent string to be used. Otherwise the fetcher will generate a real Useragent of the same browser and use it.
+        :param cookies: Set cookies for the next request.
+        :param network_idle: Wait for the page until there are no network connections for at least 500 ms.
+        :param timeout: The timeout in milliseconds that is used in all operations and waits through the page. The default is 30,000
+        :param wait: The time (milliseconds) the fetcher will wait after everything finishes before closing the page and returning the `Response` object.
+        :param page_action: Added for automation. A function that takes the `page` object and does the automation you need.
+        :param wait_selector: Wait for a specific CSS selector to be in a specific state.
+        :param init_script: An absolute path to a JavaScript file to be executed on page creation for all pages in this session.
+        :param locale: Specify user locale, for example, `en-GB`, `de-DE`, etc. Locale will affect navigator.language value, Accept-Language request header value as well as number and date formatting
+            rules. Defaults to the system default locale.
+        :param timezone_id: Changes the timezone of the browser. Defaults to the system timezone.
+        :param wait_selector_state: The state to wait for the selector given with `wait_selector`. The default state is `attached`.
+        :param solve_cloudflare: Solves all types of Cloudflare's Turnstile/Interstitial challenges before returning the response to you.
+        :param real_chrome: If you have a Chrome browser installed on your device, enable this, and the Fetcher will launch an instance of your browser and use it.
+        :param hide_canvas: Add random noise to canvas operations to prevent fingerprinting.
+        :param block_webrtc: Forces WebRTC to respect proxy settings to prevent local IP address leak.
+        :param allow_webgl: Enabled by default. Disabling it disables WebGL and WebGL 2.0 support entirely. Disabling WebGL is not recommended as many WAFs now check if WebGL is enabled.
+        :param load_dom: Enabled by default, wait for all JavaScript on page(s) to fully load and execute.
+        :param cdp_url: Instead of launching a new browser instance, connect to this CDP URL to control real browsers through CDP.
+        :param google_search: Enabled by default, Scrapling will set the referer header to be as if this request came from a Google search of this website's domain name.
+        :param extra_headers: A dictionary of extra headers to add to the request. _The referer set by the `google_search` argument takes priority over the referer set here if used together._
+        :param proxy: The proxy to be used with requests, it can be a string or a dictionary with the keys 'server', 'username', and 'password' only.
+        :param user_data_dir: Path to a User Data Directory, which stores browser session data like cookies and local storage. The default is to create a temporary directory.
+        :param extra_flags: A list of additional browser flags to pass to the browser on launch.
+        :param selector_config: The arguments that will be passed in the end while creating the final Selector's class.
+        :param additional_args: Additional arguments to be passed to Playwright's context as additional settings, and it takes higher priority than Scrapling's settings.
+        """
+        # NOTE(review): presumably `__validate__` (from the mixin) builds `self._config` and the option
+        # dicts that `start()` reads — confirm in `StealthySessionMixin`.
+        self.__validate__(**kwargs)
+        super().__init__()
+    def start(self) -> None:
+        """Create a browser for this instance and context."""
+        if not self.playwright:
+            self.playwright = sync_playwright().start()
+            try:
+                if self._config.cdp_url:  # pragma: no cover
+                    self.browser = self.playwright.chromium.connect_over_cdp(endpoint_url=self._config.cdp_url)
+                    if not self._config.proxy_rotator:
+                        assert self.browser is not None
+                        self.context = self.browser.new_context(**self._context_options)
+                elif self._config.proxy_rotator:
+                    self.browser = self.playwright.chromium.launch(**self._browser_options)
+                else:
+                    persistent_options = (
+                        self._browser_options | self._context_options | {"user_data_dir": self._user_data_dir}
+                    )
+                    self.context = self.playwright.chromium.launch_persistent_context(**persistent_options)
+                if self.context:
+                    self.context = self._initialize_context(self._config, self.context)
+                self._is_alive = True
+            except Exception:
+                # Clean up playwright if browser setup fails
+                self.playwright.stop()
+                self.playwright = None
+                raise
+        else:
+            raise RuntimeError("Session has been already started")
+    def _cloudflare_solver(self, page: Page) -> None:  # pragma: no cover
+        """Detect and try to solve the Cloudflare challenge displayed on the Playwright page passed.
+        "non-interactive" challenges are simply waited out. For every other detected type, the challenge box is located
+        (through the Cloudflare iframe, or a CSS selector as a fallback) and clicked; if the "Just a moment..." title is
+        still present afterwards, the method calls itself again.
+        :param page: The targeted page
+        :return: None
+        """
+        self._wait_for_networkidle(page, timeout=5000)
+        challenge_type = self._detect_cloudflare(ResponseFactory._get_page_content(page))
+        if not challenge_type:
+            log.error("No Cloudflare challenge found.")
+            return None
+        else:
+            log.info(f'The turnstile version discovered is "{challenge_type}"')
+            if challenge_type == "non-interactive":
+                # Nothing to click: poll once per second until the interstitial title is gone.
+                # NOTE(review): this loop has no upper bound on the number of iterations.
+                while "<title>Just a moment...</title>" in (ResponseFactory._get_page_content(page)):
+                    log.info("Waiting for Cloudflare wait page to disappear.")
+                    page.wait_for_timeout(1000)
+                    page.wait_for_load_state()
+                log.info("Cloudflare captcha is solved")
+                return None
+            else:
+                # Fallback selector for the "embedded" type; replaced below for the other types.
+                box_selector = "#cf_turnstile div, #cf-turnstile div, .turnstile>div>div"
+                if challenge_type != "embedded":
+                    box_selector = ".main-content p+div>div>div"
+                    while "Verifying you are human." in ResponseFactory._get_page_content(page):
+                        # Waiting for the verify spinner to disappear, checking every 500 ms if it disappeared
+                        page.wait_for_timeout(500)
+                # An empty dict is falsy, so the selector fallback below runs when no iframe is found.
+                outer_box: Any = {}
+                iframe = page.frame(url=__CF_PATTERN__)
+                if iframe is not None:
+                    # Same helper `fetch` uses for pages; here with load_dom=True and network_idle=False.
+                    self._wait_for_page_stability(iframe, True, False)
+                    if challenge_type != "embedded":
+                        while not iframe.frame_element().is_visible():
+                            # Double-checking that the iframe is loaded
+                            page.wait_for_timeout(500)
+                    outer_box = iframe.frame_element().bounding_box()
+                if not iframe or not outer_box:
+                    # No usable iframe box: either the challenge is already gone, or fall back to the CSS selector.
+                    if "<title>Just a moment...</title>" not in (ResponseFactory._get_page_content(page)):
+                        log.info("Cloudflare captcha is solved")
+                        return None
+                    # NOTE(review): Playwright's `bounding_box()` may return None (element not visible),
+                    # in which case the subscripting below raises TypeError — confirm this is acceptable.
+                    outer_box = page.locator(box_selector).last.bounding_box()
+                # Calculate the Captcha coordinates for any viewport: a small random offset from the box's x/y origin
+                captcha_x, captcha_y = outer_box["x"] + randint(26, 28), outer_box["y"] + randint(25, 27)
+                # Left-click at the computed coordinates, with a random 100-200 ms delay passed to the click
+                page.mouse.click(captcha_x, captcha_y, delay=randint(100, 200), button="left")
+                self._wait_for_networkidle(page)
+                if challenge_type != "embedded":
+                    attempts = 0
+                    while "<title>Just a moment...</title>" in ResponseFactory._get_page_content(page):
+                        # Wait for the page, giving up after 100 polls of 100 ms each
+                        if attempts >= 100:
+                            log.info("Cloudflare page didn't disappear after 10s, continuing...")
+                            break
+                        page.wait_for_timeout(100)
+                        attempts += 1
+                    # page.locator(box_selector).last.wait_for(state="detached")
+                    # page.locator(".zone-name-title").wait_for(state="hidden")
+                self._wait_for_page_stability(page, True, False)
+                if "<title>Just a moment...</title>" not in (ResponseFactory._get_page_content(page)):
+                    log.info("Cloudflare captcha is solved")
+                    return None
+                else:
+                    # NOTE(review): the retry is recursive and has no attempt limit.
+                    log.info("Looks like Cloudflare captcha is still present, solving again")
+                    return self._cloudflare_solver(page)
+    def fetch(self, url: str, **kwargs: Unpack[StealthFetchParams]) -> Response:
+        """Opens up the browser and do your request based on your chosen options.
+        All keyword arguments are optional. `proxy` is removed from `kwargs` first; the remaining ones are passed to `_validate` together with this session and `StealthConfig`.
+        The request is attempted up to `self._config.retries` times, sleeping `self._config.retry_delay` between failed attempts.
+        :param url: The Target url.
+        :param google_search: Enabled by default, Scrapling will set the referer header to be as if this request came from a Google search of this website's domain name.
+        :param timeout: The timeout in milliseconds that is used in all operations and waits through the page. The default is 30,000
+        :param wait: The time (milliseconds) the fetcher will wait after everything finishes before closing the page and returning the ` Response ` object.
+        :param page_action: Added for automation. A function that takes the `page` object and does the automation you need. Exceptions it raises are logged and do not abort the fetch.
+        :param extra_headers: A dictionary of extra headers to add to the request. _If it contains a `referer` header (in any letter case), the `google_search` referer is not generated and no referer is passed to `page.goto`._
+        :param disable_resources: Drop requests for unnecessary resources for a speed boost.
+            Requests dropped are of type `font`, `image`, `media`, `beacon`, `object`, `imageset`, `texttrack`, `websocket`, `csp_report`, and `stylesheet`.
+        :param blocked_domains: A set of domain names to block requests to. Subdomains are also matched (e.g., ``"example.com"`` blocks ``"sub.example.com"`` too).
+        :param wait_selector: Wait for a specific CSS selector to be in a specific state. Errors while waiting are logged and do not abort the fetch.
+        :param wait_selector_state: The state to wait for the selector given with `wait_selector`. The default state is `attached`.
+        :param network_idle: Wait for the page until there are no network connections for at least 500 ms.
+        :param load_dom: Enabled by default, wait for all JavaScript on page(s) to fully load and execute.
+        :param solve_cloudflare: Solves all types of the Cloudflare's Turnstile/Interstitial challenges before returning the response to you.
+        :param selector_config: The arguments that will be passed in the end while creating the final Selector's class.
+        :param proxy: Static proxy to override rotator and session proxy. A new browser context will be created and used with it.
+        :return: A `Response` object.
+        :raises RuntimeError: If the session is not alive.
+        :raises Exception: The error raised by the last attempt is re-raised after being logged.
+        """
+        # `proxy` is handled here and is not part of the validated parameters
+        static_proxy = kwargs.pop("proxy", None)
+        params = _validate(kwargs, self, StealthConfig)
+        if not self._is_alive:  # pragma: no cover
+            raise RuntimeError("Context manager has been closed")
+        # Lower-case the header names so the `referer` lookup below is case-insensitive
+        request_headers_keys = {h.lower() for h in params.extra_headers.keys()} if params.extra_headers else set()
+        # A referer is generated only when google_search is on AND the caller did not supply one
+        referer = (
+            generate_convincing_referer(url)
+            if (params.google_search and "referer" not in request_headers_keys)
+            else None
+        )
+        # Each iteration is one attempt; a successful attempt returns from inside the loop
+        for attempt in range(self._config.retries):
+            proxy: Optional[ProxyType] = None
+            # A per-call proxy wins; otherwise a configured rotator picks the proxy for this attempt
+            if self._config.proxy_rotator and static_proxy is None:
+                proxy = self._config.proxy_rotator.get_proxy()
+            else:
+                proxy = static_proxy
+            with self._page_generator(
+                params.timeout, params.extra_headers, params.disable_resources, proxy, params.blocked_domains
+            ) as page_info:
+                # One-element list shared with the response handler; slot 0 is read back when building
+                # the Response (presumably the handler stores the latest matching response there - confirm)
+                final_response = [None]
+                page = page_info.page
+                page.on("response", self._create_response_handler(page_info, final_response))
+                try:
+                    first_response = page.goto(url, referer=referer)
+                    self._wait_for_page_stability(page, params.load_dom, params.network_idle)
+                    # Raised inside the `try`, so a missing response is retried like any other failure
+                    if not first_response:
+                        raise RuntimeError(f"Failed to get response for {url}")
+                    if params.solve_cloudflare:
+                        self._cloudflare_solver(page)
+                        # Make sure the page is fully loaded after the captcha
+                        self._wait_for_page_stability(page, params.load_dom, params.network_idle)
+                    if params.page_action:
+                        # Best effort: a failing user callback is logged, the fetch continues
+                        try:
+                            _ = params.page_action(page)
+                        except Exception as e:  # pragma: no cover
+                            log.error(f"Error executing page_action: {e}")
+                    if params.wait_selector:
+                        # Best effort as well: a selector that never reaches the state is only logged
+                        try:
+                            waiter: Locator = page.locator(params.wait_selector)
+                            waiter.first.wait_for(state=params.wait_selector_state)
+                            self._wait_for_page_stability(page, params.load_dom, params.network_idle)
+                        except Exception as e:  # pragma: no cover
+                            log.error(f"Error waiting for selector {params.wait_selector}: {e}")
+                    # Final user-requested pause before the page content is captured
+                    page.wait_for_timeout(params.wait)
+                    response = ResponseFactory.from_playwright_response(
+                        page, first_response, final_response[0], params.selector_config, meta={"proxy": proxy}
+                    )
+                    return response
+                except Exception as e:
+                    # Flag the page as failed before deciding whether to retry
+                    page_info.mark_error()
+                    if attempt < self._config.retries - 1:
+                        if is_proxy_error(e):
+                            log.warning(
+                                f"Proxy '{proxy}' failed (attempt {attempt + 1}) | Retrying in {self._config.retry_delay}s..."
+                            )
+                        else:
+                            log.warning(
+                                f"Attempt {attempt + 1} failed: {e}. Retrying in {self._config.retry_delay}s..."
+                            )
+                        # NOTE: the sleep happens while still inside the page context manager
+                        time_sleep(self._config.retry_delay)
+                    else:
+                        # Last attempt: log and propagate the original exception
+                        log.error(f"Failed after {self._config.retries} attempts: {e}")
+                        raise
+        # Fallback for the case where the loop ends without returning or raising
+        raise RuntimeError("Request failed")  # pragma: no cover
+class AsyncStealthySession(AsyncSession, StealthySessionMixin):
+    """An async Stealthy Browser session manager with page pooling.
+    Call `start()` once to create the browser/context, then `fetch()` for each request.
+    """
+    # Instance attributes declared by this class
+    __slots__ = (
+        "_config",
+        "_context_options",
+        "_browser_options",
+        "_user_data_dir",
+        "_headers_keys",
+    )
+    def __init__(self, **kwargs: Unpack[StealthSession]):
+        """A Browser session manager with page pooling, it's using a persistent browser Context by default with a temporary user profile directory.
+        :param headless: Run the browser in headless/hidden (default), or headful/visible mode.
+        :param disable_resources: Drop requests for unnecessary resources for a speed boost.
+            Requests dropped are of type `font`, `image`, `media`, `beacon`, `object`, `imageset`, `texttrack`, `websocket`, `csp_report`, and `stylesheet`.
+        :param blocked_domains: A set of domain names to block requests to. Subdomains are also matched (e.g., ``"example.com"`` blocks ``"sub.example.com"`` too).
+        :param useragent: Pass a useragent string to be used. Otherwise the fetcher will generate a real Useragent of the same browser and use it.
+        :param cookies: Set cookies for the next request.
+        :param network_idle: Wait for the page until there are no network connections for at least 500 ms.
+        :param timeout: The timeout in milliseconds that is used in all operations and waits through the page. The default is 30,000
+        :param wait: The time (milliseconds) the fetcher will wait after everything finishes before closing the page and returning the ` Response ` object.
+        :param page_action: Added for automation. A function that takes the `page` object and does the automation you need. In this async session its result is awaited.
+        :param wait_selector: Wait for a specific CSS selector to be in a specific state.
+        :param init_script: An absolute path to a JavaScript file to be executed on page creation for all pages in this session.
+        :param locale: Specify user locale, for example, `en-GB`, `de-DE`, etc. Locale will affect navigator.language value, Accept-Language request header value as well as number and date formatting
+            rules. Defaults to the system default locale.
+        :param timezone_id: Changes the timezone of the browser. Defaults to the system timezone.
+        :param wait_selector_state: The state to wait for the selector given with `wait_selector`. The default state is `attached`.
+        :param solve_cloudflare: Solves all types of the Cloudflare's Turnstile/Interstitial challenges before returning the response to you.
+        :param real_chrome: If you have a Chrome browser installed on your device, enable this, and the Fetcher will launch an instance of your browser and use it.
+        :param hide_canvas: Add random noise to canvas operations to prevent fingerprinting.
+        :param block_webrtc: Forces WebRTC to respect proxy settings to prevent local IP address leak.
+        :param allow_webgl: Enabled by default. Disabling it disables WebGL and WebGL 2.0 support entirely. Disabling WebGL is not recommended as many WAFs now check if WebGL is enabled.
+        :param load_dom: Enabled by default, wait for all JavaScript on page(s) to fully load and execute.
+        :param cdp_url: Instead of launching a new browser instance, connect to this CDP URL to control real browsers through CDP.
+        :param google_search: Enabled by default, Scrapling will set the referer header to be as if this request came from a Google search of this website's domain name.
+        :param extra_headers: A dictionary of extra headers to add to the request. _In `fetch`, a `referer` header given here (in any letter case) suppresses the referer generated by `google_search`._
+        :param proxy: The proxy to be used with requests, it can be a string or a dictionary with the keys 'server', 'username', and 'password' only.
+        :param user_data_dir: Path to a User Data Directory, which stores browser session data like cookies and local storage. The default is to create a temporary directory.
+        :param extra_flags: A list of additional browser flags to pass to the browser on launch.
+        :param selector_config: The arguments that will be passed in the end while creating the final Selector's class.
+        :param additional_args: Additional arguments to be passed to Playwright's context as additional settings, and it takes higher priority than Scrapling's settings.
+        """
+        # Validation runs first because the next line reads `self._config`
+        # (presumably populated by `__validate__` - defined outside this view)
+        self.__validate__(**kwargs)
+        super().__init__(max_pages=self._config.max_pages)
+    async def start(self) -> None:
+        """Create a browser for this instance and context.
+        :raises RuntimeError: If the session has already been started (`self.playwright` is set).
+        """
+        if not self.playwright:
+            self.playwright = await async_playwright().start()
+            try:
+                if self._config.cdp_url:
+                    # Attach to an already running browser through CDP
+                    self.browser = await self.playwright.chromium.connect_over_cdp(endpoint_url=self._config.cdp_url)
+                    # With a proxy rotator no shared context is created here
+                    if not self._config.proxy_rotator:
+                        assert self.browser is not None
+                        self.context = await self.browser.new_context(**self._context_options)
+                elif self._config.proxy_rotator:
+                    # Proxy rotation: launch only the browser, without a context at this point
+                    self.browser = await self.playwright.chromium.launch(**self._browser_options)
+                else:
+                    # Default: one persistent context built from browser + context options and the profile dir
+                    persistent_options = (
+                        self._browser_options | self._context_options | {"user_data_dir": self._user_data_dir}
+                    )
+                    self.context = await self.playwright.chromium.launch_persistent_context(**persistent_options)
+                if self.context:
+                    self.context = await self._initialize_context(self._config, self.context)
+                self._is_alive = True
+            except Exception:
+                # Clean up playwright if browser setup fails
+                await self.playwright.stop()
+                self.playwright = None
+                raise
+        else:
+            raise RuntimeError("Session has been already started")
+    async def _cloudflare_solver(self, page: async_Page) -> None:  # pragma: no cover
+        """Solve the cloudflare challenge displayed on the playwright page passed
+        Detects the challenge type from the page content, waits for non-interactive challenges to clear,
+        and clicks the challenge box for the other types. If the "Just a moment..." title is still present
+        afterwards, the method calls itself again.
+        :param page: The targeted page
+        :return:
+        """
+        await self._wait_for_networkidle(page, timeout=5000)
+        challenge_type = self._detect_cloudflare(await ResponseFactory._get_async_page_content(page))
+        if not challenge_type:
+            log.error("No Cloudflare challenge found.")
+            return None
+        else:
+            log.info(f'The turnstile version discovered is "{challenge_type}"')
+            if challenge_type == "non-interactive":
+                # Nothing to click: poll once per second until the interstitial title is gone
+                # NOTE(review): this loop has no upper bound of its own
+                while "<title>Just a moment...</title>" in (await ResponseFactory._get_async_page_content(page)):
+                    log.info("Waiting for Cloudflare wait page to disappear.")
+                    await page.wait_for_timeout(1000)
+                    await page.wait_for_load_state()
+                log.info("Cloudflare captcha is solved")
+                return None
+            else:
+                # Fallback selector for the challenge box, used only when the iframe gives no box below
+                box_selector = "#cf_turnstile div, #cf-turnstile div, .turnstile>div>div"
+                if challenge_type != "embedded":
+                    box_selector = ".main-content p+div>div>div"
+                    while "Verifying you are human." in (await ResponseFactory._get_async_page_content(page)):
+                        # Waiting for the verify spinner to disappear, checking every 500 ms if it disappeared
+                        await page.wait_for_timeout(500)
+                outer_box: Any = {}
+                iframe = page.frame(url=__CF_PATTERN__)
+                if iframe is not None:
+                    await self._wait_for_page_stability(iframe, True, False)
+                    if challenge_type != "embedded":
+                        while not await (await iframe.frame_element()).is_visible():
+                            # Double-checking that the iframe is loaded
+                            await page.wait_for_timeout(500)
+                    outer_box = await (await iframe.frame_element()).bounding_box()
+                if not iframe or not outer_box:
+                    # No usable iframe box: either the challenge is already gone, or fall back to the selector
+                    if "<title>Just a moment...</title>" not in (await ResponseFactory._get_async_page_content(page)):
+                        log.info("Cloudflare captcha is solved")
+                        return None
+                    # NOTE(review): Playwright documents bounding_box() as returning None for elements
+                    # that are not visible - confirm that cannot reach the subscripting below
+                    outer_box = await page.locator(box_selector).last.bounding_box()
+                # Calculate the Captcha coordinates for any viewport
+                captcha_x, captcha_y = outer_box["x"] + randint(26, 28), outer_box["y"] + randint(25, 27)
+                # Left-click at that point (a small random offset from the box's top-left corner),
+                # passing a random 100-200 `delay` value to the click
+                await page.mouse.click(captcha_x, captcha_y, delay=randint(100, 200), button="left")
+                await self._wait_for_networkidle(page)
+                if challenge_type != "embedded":
+                    attempts = 0
+                    while "<title>Just a moment...</title>" in (await ResponseFactory._get_async_page_content(page)):
+                        # Wait for the page: at most 100 polls of 100 ms each
+                        if attempts >= 100:
+                            log.info("Cloudflare page didn't disappear after 10s, continuing...")
+                            break
+                        await page.wait_for_timeout(100)
+                        attempts += 1
+                    # await page.locator(box_selector).last.wait_for(state="detached")
+                    # await page.locator(".zone-name-title").wait_for(state="hidden")
+                await self._wait_for_page_stability(page, True, False)
+                if "<title>Just a moment...</title>" not in (await ResponseFactory._get_async_page_content(page)):
+                    log.info("Cloudflare captcha is solved")
+                    return None
+                else:
+                    log.info("Looks like Cloudflare captcha is still present, solving again")
+                    # NOTE(review): recursion without a depth limit - it ends only when the title disappears,
+                    # no challenge is detected, or an exception is raised
+                    return await self._cloudflare_solver(page)
+    async def fetch(self, url: str, **kwargs: Unpack[StealthFetchParams]) -> Response:
+        """Opens up the browser and do your request based on your chosen options.
+        All keyword arguments are optional. `proxy` is removed from `kwargs` first; the remaining ones are passed to `_validate` together with this session and `StealthConfig`.
+        The request is attempted up to `self._config.retries` times, sleeping `self._config.retry_delay` between failed attempts.
+        :param url: The Target url.
+        :param google_search: Enabled by default, Scrapling will set the referer header to be as if this request came from a Google search of this website's domain name.
+        :param timeout: The timeout in milliseconds that is used in all operations and waits through the page. The default is 30,000
+        :param wait: The time (milliseconds) the fetcher will wait after everything finishes before closing the page and returning the ` Response ` object.
+        :param page_action: Added for automation. A function that takes the `page` object and does the automation you need. Its result is awaited; exceptions it raises are logged and do not abort the fetch.
+        :param extra_headers: A dictionary of extra headers to add to the request. _If it contains a `referer` header (in any letter case), the `google_search` referer is not generated and no referer is passed to `page.goto`._
+        :param disable_resources: Drop requests for unnecessary resources for a speed boost.
+            Requests dropped are of type `font`, `image`, `media`, `beacon`, `object`, `imageset`, `texttrack`, `websocket`, `csp_report`, and `stylesheet`.
+        :param blocked_domains: A set of domain names to block requests to. Subdomains are also matched (e.g., ``"example.com"`` blocks ``"sub.example.com"`` too).
+        :param wait_selector: Wait for a specific CSS selector to be in a specific state. Errors while waiting are logged and do not abort the fetch.
+        :param wait_selector_state: The state to wait for the selector given with `wait_selector`. The default state is `attached`.
+        :param network_idle: Wait for the page until there are no network connections for at least 500 ms.
+        :param load_dom: Enabled by default, wait for all JavaScript on page(s) to fully load and execute.
+        :param solve_cloudflare: Solves all types of the Cloudflare's Turnstile/Interstitial challenges before returning the response to you.
+        :param selector_config: The arguments that will be passed in the end while creating the final Selector's class.
+        :param proxy: Static proxy to override rotator and session proxy. A new browser context will be created and used with it.
+        :return: A `Response` object.
+        :raises RuntimeError: If the session is not alive.
+        :raises Exception: The error raised by the last attempt is re-raised after being logged.
+        """
+        # `proxy` is handled here and is not part of the validated parameters
+        static_proxy = kwargs.pop("proxy", None)
+        params = _validate(kwargs, self, StealthConfig)
+        if not self._is_alive:  # pragma: no cover
+            raise RuntimeError("Context manager has been closed")
+        # Lower-case the header names so the `referer` lookup below is case-insensitive
+        request_headers_keys = {h.lower() for h in params.extra_headers.keys()} if params.extra_headers else set()
+        # A referer is generated only when google_search is on AND the caller did not supply one
+        referer = (
+            generate_convincing_referer(url)
+            if (params.google_search and "referer" not in request_headers_keys)
+            else None
+        )
+        # Each iteration is one attempt; a successful attempt returns from inside the loop
+        for attempt in range(self._config.retries):
+            proxy: Optional[ProxyType] = None
+            # A per-call proxy wins; otherwise a configured rotator picks the proxy for this attempt
+            if self._config.proxy_rotator and static_proxy is None:
+                proxy = self._config.proxy_rotator.get_proxy()
+            else:
+                proxy = static_proxy
+            async with self._page_generator(
+                params.timeout, params.extra_headers, params.disable_resources, proxy, params.blocked_domains
+            ) as page_info:
+                # One-element list shared with the response handler; slot 0 is read back when building
+                # the Response (presumably the handler stores the latest matching response there - confirm)
+                final_response = [None]
+                page = page_info.page
+                page.on("response", self._create_response_handler(page_info, final_response))
+                try:
+                    first_response = await page.goto(url, referer=referer)
+                    await self._wait_for_page_stability(page, params.load_dom, params.network_idle)
+                    # Raised inside the `try`, so a missing response is retried like any other failure
+                    if not first_response:
+                        raise RuntimeError(f"Failed to get response for {url}")
+                    if params.solve_cloudflare:
+                        await self._cloudflare_solver(page)
+                        # Make sure the page is fully loaded after the captcha
+                        await self._wait_for_page_stability(page, params.load_dom, params.network_idle)
+                    if params.page_action:
+                        # Best effort: a failing user callback is logged, the fetch continues.
+                        # The call result is awaited, so the callback must return an awaitable.
+                        try:
+                            _ = await params.page_action(page)
+                        except Exception as e:  # pragma: no cover
+                            log.error(f"Error executing page_action: {e}")
+                    if params.wait_selector:
+                        # Best effort as well: a selector that never reaches the state is only logged
+                        try:
+                            waiter: AsyncLocator = page.locator(params.wait_selector)
+                            await waiter.first.wait_for(state=params.wait_selector_state)
+                            await self._wait_for_page_stability(page, params.load_dom, params.network_idle)
+                        except Exception as e:  # pragma: no cover
+                            log.error(f"Error waiting for selector {params.wait_selector}: {e}")
+                    # Final user-requested pause before the page content is captured
+                    await page.wait_for_timeout(params.wait)
+                    response = await ResponseFactory.from_async_playwright_response(
+                        page, first_response, final_response[0], params.selector_config, meta={"proxy": proxy}
+                    )
+                    return response
+                except Exception as e:
+                    # Flag the page as failed before deciding whether to retry
+                    page_info.mark_error()
+                    if attempt < self._config.retries - 1:
+                        if is_proxy_error(e):
+                            log.warning(
+                                f"Proxy '{proxy}' failed (attempt {attempt + 1}) | Retrying in {self._config.retry_delay}s..."
+                            )
+                        else:
+                            log.warning(
+                                f"Attempt {attempt + 1} failed: {e}. Retrying in {self._config.retry_delay}s..."
+                            )
+                        # NOTE: the sleep happens while still inside the page context manager
+                        await asyncio_sleep(self._config.retry_delay)
+                    else:
+                        # Last attempt: log and propagate the original exception
+                        log.error(f"Failed after {self._config.retries} attempts: {e}")
+                        raise
+        # Fallback for the case where the loop ends without returning or raising
+        raise RuntimeError("Request failed")  # pragma: no cover

engines/_browsers/_types.py ADDED Viewed

	@@ -0,0 +1,118 @@

+from io import BytesIO
+from curl_cffi.requests import (
+    ProxySpec,
+    CookieTypes,
+    BrowserTypeLiteral,
+)
+from scrapling.core._types import (
+    Dict,
+    List,
+    Set,
+    Tuple,
+    Mapping,
+    Optional,
+    Callable,
+    Sequence,
+    TypedDict,
+    TypeAlias,
+    SetCookieParam,
+    SelectorWaitStates,
+)
+from scrapling.engines.toolbelt.proxy_rotation import ProxyRotator
+# Type alias for `impersonate` parameter - accepts a single browser or list of browsers
+ImpersonateType: TypeAlias = BrowserTypeLiteral | List[BrowserTypeLiteral] | None
+# Types for session initialization
+class RequestsSession(TypedDict, total=False):
+    """Keyword arguments accepted when initializing a requests-based session.
+    Declared with `total=False`, so every key may be omitted.
+    """
+    # One browser name, a list of names, or None (see `ImpersonateType`)
+    impersonate: ImpersonateType
+    http3: Optional[bool]
+    stealthy_headers: Optional[bool]
+    # Proxy-related options
+    proxies: Optional[ProxySpec]
+    proxy: Optional[str]
+    proxy_auth: Optional[Tuple[str, str]]
+    proxy_rotator: Optional[ProxyRotator]
+    timeout: Optional[int | float]
+    headers: Optional[Mapping[str, Optional[str]]]
+    retries: Optional[int]
+    # NOTE(review): integer-only here, while `PlaywrightSession.retry_delay` also allows float
+    retry_delay: Optional[int]
+    follow_redirects: Optional[bool]
+    max_redirects: Optional[int]
+    verify: Optional[bool]
+    # Either a single string or a pair of strings
+    cert: Optional[str | Tuple[str, str]]
+    selector_config: Optional[Dict]
+# Types for GET request method parameters
+class GetRequestParams(RequestsSession, total=False):
+    """Every `RequestsSession` key plus `params`, `cookies`, and `auth`; all keys are optional."""
+    params: Optional[Dict | List | Tuple]
+    cookies: Optional[CookieTypes]
+    auth: Optional[Tuple[str, str]]
+# Types for POST/PUT/DELETE request method parameters
+class DataRequestParams(GetRequestParams, total=False):
+    """Every `GetRequestParams` key plus the request payload keys `data` and `json`; all keys are optional."""
+    data: Optional[Dict[str, str] | List[Tuple] | str | BytesIO | bytes]
+    json: Optional[Dict | List]
+# Types for browser session
+class PlaywrightSession(TypedDict, total=False):
+    """Keyword arguments accepted when creating a browser session.
+    Declared with `total=False`, so every key may be omitted.
+    """
+    # Passed to the session base class as `max_pages` (page pooling)
+    max_pages: int
+    # Headless/hidden vs. headful/visible browser
+    headless: bool
+    # Drop requests for unnecessary resource types for a speed boost
+    disable_resources: bool
+    # Wait until there are no network connections for at least 500 ms
+    network_idle: bool
+    # Wait for all JavaScript on the page to fully load and execute
+    load_dom: bool
+    # CSS selector to wait for, and the state it must reach
+    wait_selector: Optional[str]
+    wait_selector_state: SelectorWaitStates
+    cookies: Sequence[SetCookieParam] | None
+    # Set a referer as if the request came from a Google search
+    google_search: bool
+    # Milliseconds to wait after everything finishes, before the response is built
+    wait: int | float
+    timezone_id: str | None
+    # Callable that receives the `page` object for custom automation
+    page_action: Optional[Callable]
+    # Static proxy (string, dict, or tuple) or a rotator supplying one per attempt
+    proxy: Optional[str | Dict[str, str] | Tuple]
+    proxy_rotator: Optional[ProxyRotator]
+    extra_headers: Optional[Dict[str, str]]
+    # Milliseconds, used in all operations and waits through the page
+    timeout: int | float
+    # Absolute path to a JavaScript file executed on page creation
+    init_script: Optional[str]
+    user_data_dir: str
+    # Arguments forwarded when creating the final Selector
+    selector_config: Optional[Dict]
+    # Extra settings passed to Playwright's context
+    additional_args: Optional[Dict]
+    locale: Optional[str]
+    real_chrome: bool
+    # Connect to this CDP URL instead of launching a new browser
+    cdp_url: Optional[str]
+    useragent: Optional[str]
+    # Additional browser launch flags
+    extra_flags: Optional[List[str]]
+    # Domain names (subdomains included) whose requests are blocked
+    blocked_domains: Optional[Set[str]]
+    # Number of attempts per fetch, and the pause between failed attempts
+    retries: int
+    retry_delay: int | float
+class PlaywrightFetchParams(TypedDict, total=False):
+    """Keyword arguments accepted by a single `fetch` call; every key is optional (`total=False`)."""
+    load_dom: bool
+    # Milliseconds to wait after everything finishes, before the response is built
+    wait: int | float
+    network_idle: bool
+    google_search: bool
+    # Milliseconds, used in all operations and waits through the page
+    timeout: int | float
+    disable_resources: bool
+    wait_selector: Optional[str]
+    page_action: Optional[Callable]
+    selector_config: Optional[Dict]
+    extra_headers: Optional[Dict[str, str]]
+    wait_selector_state: SelectorWaitStates
+    blocked_domains: Optional[Set[str]]
+    # Per-call static proxy; unlike `PlaywrightSession.proxy`, no tuple form is declared here
+    proxy: Optional[str | Dict[str, str]]
+class StealthSession(PlaywrightSession, total=False):
+    """Every `PlaywrightSession` key plus the stealth-specific switches below; all keys are optional."""
+    # Disabling turns WebGL and WebGL 2.0 support off entirely
+    allow_webgl: bool
+    # Add random noise to canvas operations
+    hide_canvas: bool
+    # Force WebRTC to respect proxy settings
+    block_webrtc: bool
+    # Solve Cloudflare Turnstile/Interstitial challenges before returning the response
+    solve_cloudflare: bool
+class StealthFetchParams(PlaywrightFetchParams, total=False):
+    """Every `PlaywrightFetchParams` key plus `solve_cloudflare`; all keys are optional."""
+    # Solve Cloudflare Turnstile/Interstitial challenges before returning the response
+    solve_cloudflare: bool

engines/_browsers/_validators.py ADDED Viewed

	@@ -0,0 +1,229 @@

+from pathlib import Path
+from typing import Annotated
+from functools import lru_cache
+from urllib.parse import urlparse
+from dataclasses import dataclass, fields
+from msgspec import Struct, Meta, convert, ValidationError
+from scrapling.core._types import (
+    Any,
+    Dict,
+    List,
+    Set,
+    Tuple,
+    Optional,
+    Callable,
+    Sequence,
+    overload,
+    SetCookieParam,
+    SelectorWaitStates,
+)
+from scrapling.engines.toolbelt.proxy_rotation import ProxyRotator
+from scrapling.engines.toolbelt.navigation import construct_proxy_dict
+from scrapling.engines._browsers._types import PlaywrightFetchParams, StealthFetchParams
+# Custom validators for msgspec
+@lru_cache(8)
+def _is_invalid_file_path(value: str) -> bool | str:  # pragma: no cover
+    """Fast file path validation"""
+    path = Path(value)
+    if not path.exists():
+        return f"Init script path not found: {value}"
+    if not path.is_file():
+        return f"Init script is not a file: {value}"
+    if not path.is_absolute():
+        return f"Init script is not a absolute path: {value}"
+    return False
+@lru_cache(2)
+def _is_invalid_cdp_url(cdp_url: str) -> bool | str:
+    """Fast CDP URL validation"""
+    if not cdp_url.startswith(("ws://", "wss://")):
+        return "CDP URL must use 'ws://' or 'wss://' scheme"
+    netloc = urlparse(cdp_url).netloc
+    if not netloc:  # pragma: no cover
+        return "Invalid hostname for the CDP URL"
+    return False
+# Type aliases for cleaner annotations
+PagesCount = Annotated[int, Meta(ge=1, le=50)]
+RetriesCount = Annotated[int, Meta(ge=1, le=10)]
+Seconds = Annotated[int, float, Meta(ge=0)]
+class PlaywrightConfig(Struct, kw_only=True, frozen=False, weakref=True):
+    """Configuration struct for validation"""
+    max_pages: PagesCount = 1
+    headless: bool = True
+    disable_resources: bool = False
+    network_idle: bool = False
+    load_dom: bool = True
+    wait_selector: Optional[str] = None
+    wait_selector_state: SelectorWaitStates = "attached"
+    cookies: Sequence[SetCookieParam] | None = []
+    google_search: bool = True
+    wait: Seconds = 0
+    timezone_id: str | None = ""
+    page_action: Optional[Callable] = None
+    proxy: Optional[str | Dict[str, str] | Tuple] = None  # The default value for proxy in Playwright's source is `None`
+    proxy_rotator: Optional[ProxyRotator] = None
+    extra_headers: Optional[Dict[str, str]] = None
+    timeout: Seconds = 30000
+    init_script: Optional[str] = None
+    user_data_dir: str = ""
+    selector_config: Optional[Dict] = {}
+    additional_args: Optional[Dict] = {}
+    locale: str | None = None
+    real_chrome: bool = False
+    cdp_url: Optional[str] = None
+    useragent: Optional[str] = None
+    extra_flags: Optional[List[str]] = None
+    blocked_domains: Optional[Set[str]] = None
+    retries: RetriesCount = 3
+    retry_delay: Seconds = 1
+    def __post_init__(self):  # pragma: no cover
+        """Custom validation after msgspec validation"""
+        if self.page_action and not callable(self.page_action):
+            raise TypeError(f"page_action must be callable, got {type(self.page_action).__name__}")
+        if self.proxy and self.proxy_rotator:
+            raise ValueError(
+                "Cannot use 'proxy_rotator' together with 'proxy'. "
+                "Use either a static proxy or proxy rotation, not both."
+            )
+        if self.proxy:
+            self.proxy = construct_proxy_dict(self.proxy)
+        if self.cdp_url:
+            cdp_msg = _is_invalid_cdp_url(self.cdp_url)
+            if cdp_msg:
+                raise ValueError(cdp_msg)
+        if not self.cookies:
+            self.cookies = []
+        if not self.extra_flags:
+            self.extra_flags = []
+        if not self.selector_config:
+            self.selector_config = {}
+        if not self.additional_args:
+            self.additional_args = {}
+        if self.init_script is not None:
+            validation_msg = _is_invalid_file_path(self.init_script)
+            if validation_msg:
+                raise ValueError(validation_msg)
+class StealthConfig(PlaywrightConfig, kw_only=True, frozen=False, weakref=True):
+    allow_webgl: bool = True
+    hide_canvas: bool = False
+    block_webrtc: bool = False
+    solve_cloudflare: bool = False
+    def __post_init__(self):
+        """Custom validation after msgspec validation"""
+        super(StealthConfig, self).__post_init__()
+        # Cloudflare timeout adjustment
+        if self.solve_cloudflare and self.timeout < 60_000:
+            self.timeout = 60_000
+@dataclass
+class _fetch_params:
+    """A dataclass of all parameters used by `fetch` calls
+    All fields are required (no defaults); their order is the positional order of the generated `__init__`.
+    """
+    # Generate a Google-search-style referer for the request
+    google_search: bool
+    # Page timeout and the final pause before the response is built
+    timeout: Seconds
+    wait: Seconds
+    # Optional callable that receives the page for custom automation
+    page_action: Optional[Callable]
+    extra_headers: Optional[Dict[str, str]]
+    disable_resources: bool
+    # Selector to wait for and the state it must reach
+    wait_selector: Optional[str]
+    wait_selector_state: SelectorWaitStates
+    network_idle: bool
+    load_dom: bool
+    blocked_domains: Optional[Set[str]]
+    solve_cloudflare: bool
+    # Arguments forwarded when creating the final Selector
+    selector_config: Dict
+def validate_fetch(
+    method_kwargs: Dict | PlaywrightFetchParams | StealthFetchParams,
+    session: Any,
+    model: type[PlaywrightConfig] | type[StealthConfig],
+) -> _fetch_params:  # pragma: no cover
+    result: Dict[str, Any] = {}
+    overrides: Dict[str, Any] = {}
+    kwargs_dict: Dict[str, Any] = dict(method_kwargs)
+    # Get all field names that _fetch_params needs
+    fetch_param_fields = {f.name for f in fields(_fetch_params)}
+    for key in fetch_param_fields:
+        if key in kwargs_dict:
+            overrides[key] = kwargs_dict[key]
+        elif hasattr(session, "_config") and hasattr(session._config, key):
+            result[key] = getattr(session._config, key)
+    if overrides:
+        validated_config = validate(overrides, model)
+        # Extract ONLY the fields that were actually overridden (not all fields)
+        # This prevents validated defaults from overwriting session config values
+        validated_dict = {
+            field: getattr(validated_config, field) for field in overrides.keys() if hasattr(validated_config, field)
+        }
+        # Preserve solve_cloudflare if the user explicitly provided it, even if the model doesn't have it
+        if "solve_cloudflare" in overrides:
+            validated_dict["solve_cloudflare"] = overrides["solve_cloudflare"]
+        # Start with session defaults, then overwrite with validated overrides
+        result.update(validated_dict)
+    # solve_cloudflare defaults to False for models that don't have it (PlaywrightConfig)
+    result.setdefault("solve_cloudflare", False)
+    result.setdefault("blocked_domains", None)
+    return _fetch_params(**result)
+# Cache default values for each model to reduce validation overhead
+models_default_values = {}
+for _model in (StealthConfig, PlaywrightConfig):
+    _defaults = {}
+    if hasattr(_model, "__struct_defaults__") and hasattr(_model, "__struct_fields__"):
+        for field_name, default_value in zip(_model.__struct_fields__, _model.__struct_defaults__):  # type: ignore
+            # Skip factory defaults - these are msgspec._core.Factory instances
+            if type(default_value).__name__ != "Factory":
+                _defaults[field_name] = default_value
+    models_default_values[_model.__name__] = _defaults.copy()
+def _filter_defaults(params: Dict, model: str) -> Dict:
+    """Filter out parameters that match their default values to reduce validation overhead."""
+    defaults = models_default_values[model]
+    return {k: v for k, v in params.items() if k not in defaults or v != defaults[k]}
+@overload
+def validate(params: Dict, model: type[StealthConfig]) -> StealthConfig: ...
+@overload
+def validate(params: Dict, model: type[PlaywrightConfig]) -> PlaywrightConfig: ...
+def validate(params: Dict, model: type[PlaywrightConfig] | type[StealthConfig]) -> PlaywrightConfig | StealthConfig:
+    try:
+        # Filter out params with the default values (no need to validate them) to speed up validation
+        filtered = _filter_defaults(params, model.__name__)
+        return convert(filtered, model)
+    except ValidationError as e:
+        raise TypeError(f"Invalid argument type: {e}") from e

engines/constants.py ADDED Viewed

	@@ -0,0 +1,99 @@

+# Resource types whose loading is disabled for speed.
+# NOTE(review): the code that applies this set is not part of this module - confirm where it is consumed.
+EXTRA_RESOURCES = {
+    "font",
+    "image",
+    "media",
+    "beacon",
+    "object",
+    "imageset",
+    "texttrack",
+    "websocket",
+    "csp_report",
+    "stylesheet",
+}
+# Chromium command-line switches considered harmful.
+# NOTE(review): how these are stripped from the launch arguments is not visible in this module - confirm against the launcher.
+HARMFUL_ARGS = (
+    # These flags are ignored to further reduce detection and possibly avoid abuse of the popup crashing bug: https://issues.chromium.org/issues/340836884
+    "--enable-automation",
+    "--disable-popup-blocking",
+    "--disable-component-update",
+    "--disable-default-apps",
+    "--disable-extensions",
+)
+# Chromium command-line switches used by default.
+DEFAULT_ARGS = (
+    # Speed up Chromium browsers by default
+    "--no-pings",
+    "--no-first-run",
+    "--disable-infobars",
+    "--disable-breakpad",
+    "--no-service-autorun",
+    "--homepage=about:blank",
+    "--password-store=basic",
+    "--disable-hang-monitor",
+    "--no-default-browser-check",
+    "--disable-session-crashed-bubble",
+    "--disable-search-engine-choice-screen",
+)
+# Additional Chromium command-line switches for stealth usage.
+# Entries that are commented out inside the tuple are currently not passed.
+STEALTH_ARGS = (
+    # Explanation of each switch: https://peter.sh/experiments/chromium-command-line-switches/
+    # Generally, these make the browser faster and less detectable
+    # "--incognito",
+    "--test-type",
+    "--lang=en-US",
+    "--mute-audio",
+    "--disable-sync",
+    "--hide-scrollbars",
+    "--disable-logging",
+    "--start-maximized",  # For headless check bypass
+    "--enable-async-dns",
+    "--accept-lang=en-US",
+    "--use-mock-keychain",
+    "--disable-translate",
+    "--disable-voice-input",
+    "--window-position=0,0",
+    "--disable-wake-on-wifi",
+    "--ignore-gpu-blocklist",
+    "--enable-tcp-fast-open",
+    "--enable-web-bluetooth",
+    "--disable-cloud-import",
+    "--disable-print-preview",
+    "--disable-dev-shm-usage",
+    # '--disable-popup-blocking',
+    "--metrics-recording-only",
+    "--disable-crash-reporter",
+    "--disable-partial-raster",
+    "--disable-gesture-typing",
+    "--disable-checker-imaging",
+    "--disable-prompt-on-repost",
+    "--force-color-profile=srgb",
+    "--font-render-hinting=none",
+    "--aggressive-cache-discard",
+    "--disable-cookie-encryption",
+    "--disable-domain-reliability",
+    "--disable-threaded-animation",
+    "--disable-threaded-scrolling",
+    "--enable-simple-cache-backend",
+    "--disable-background-networking",
+    "--enable-surface-synchronization",
+    "--disable-image-animation-resync",
+    "--disable-renderer-backgrounding",
+    "--disable-ipc-flooding-protection",
+    "--prerender-from-omnibox=disabled",
+    "--safebrowsing-disable-auto-update",
+    "--disable-offer-upload-credit-cards",
+    "--disable-background-timer-throttling",
+    "--disable-new-content-rendering-timeout",
+    "--run-all-compositor-stages-before-draw",
+    "--disable-client-side-phishing-detection",
+    "--disable-backgrounding-occluded-windows",
+    "--disable-layer-tree-host-memory-pressure",
+    "--autoplay-policy=user-gesture-required",
+    "--disable-offer-store-unmasked-wallet-cards",
+    "--disable-blink-features=AutomationControlled",
+    "--disable-component-extensions-with-background-pages",
+    "--enable-features=NetworkService,NetworkServiceInProcess,TrustTokens,TrustTokensAlwaysAllowIssuance",
+    "--blink-settings=primaryHoverType=2,availableHoverTypes=2,primaryPointerType=4,availablePointerTypes=4",
+    "--disable-features=AudioServiceOutOfProcess,TranslateUI,BlinkGenPropertyTrees",
+)

engines/static.py ADDED Viewed

	@@ -0,0 +1,770 @@

+from abc import ABC
+from random import choice
+from time import sleep as time_sleep
+from asyncio import sleep as asyncio_sleep
+from curl_cffi.curl import CurlError
+from curl_cffi import CurlHttpVersion
+from curl_cffi.requests import (
+    BrowserTypeLiteral,
+    Session as CurlSession,
+    AsyncSession as AsyncCurlSession,
+)
+from scrapling.core.utils import log
+from scrapling.core._types import (
+    Any,
+    Dict,
+    Tuple,
+    Unpack,
+    Optional,
+    Awaitable,
+    SUPPORTED_HTTP_METHODS,
+)
+from .toolbelt.custom import Response
+from .toolbelt.convertor import ResponseFactory
+from .toolbelt.proxy_rotation import ProxyRotator, is_proxy_error
+from ._browsers._types import RequestsSession, GetRequestParams, DataRequestParams, ImpersonateType
+from .toolbelt.fingerprints import generate_convincing_referer, generate_headers, __default_useragent__
+# Sentinel compared by identity in `_make_request`: when the session attribute holds this object (and the instance's
+# `__enter__`/`__aenter__` is None), the request runs on a temporary curl session that is closed afterwards.
+_NO_SESSION: Any = object()
+def _select_random_browser(impersonate: ImpersonateType) -> Optional[BrowserTypeLiteral]:
+    """
+    Handle browser selection logic for the ` impersonate ` parameter.
+    If impersonate is a list, randomly select one browser from it.
+    If it's a string or None, return as is.
+    """
+    if isinstance(impersonate, list):
+        if not impersonate:
+            return None
+        return choice(impersonate)
+    return impersonate
+class _ConfigurationLogic(ABC):
+    # Core Logic Handler (Internal Engine)
+    __slots__ = (
+        "_default_impersonate",
+        "_stealth",
+        "_default_proxies",
+        "_default_proxy",
+        "_default_proxy_auth",
+        "_default_timeout",
+        "_default_headers",
+        "_default_retries",
+        "_default_retry_delay",
+        "_default_follow_redirects",
+        "_default_max_redirects",
+        "_default_verify",
+        "_default_cert",
+        "_default_http3",
+        "selector_config",
+        "_is_alive",
+        "_proxy_rotator",
+    )
+    def __init__(self, **kwargs: Unpack[RequestsSession]):
+        self._default_impersonate = kwargs.get("impersonate", "chrome")
+        self._stealth = kwargs.get("stealthy_headers", True)
+        self._default_proxies = kwargs.get("proxies") or {}
+        self._default_proxy = kwargs.get("proxy") or None
+        self._default_proxy_auth = kwargs.get("proxy_auth") or None
+        self._default_timeout = kwargs.get("timeout", 30)
+        self._default_headers = kwargs.get("headers") or {}
+        self._default_retries = kwargs.get("retries", 3)
+        self._default_retry_delay = kwargs.get("retry_delay", 1)
+        self._default_follow_redirects = kwargs.get("follow_redirects", True)
+        self._default_max_redirects = kwargs.get("max_redirects", 30)
+        self._default_verify = kwargs.get("verify", True)
+        self._default_cert = kwargs.get("cert") or None
+        self._default_http3 = kwargs.get("http3", False)
+        self.selector_config = kwargs.get("selector_config") or {}
+        self._is_alive = False
+        self._proxy_rotator: Optional[ProxyRotator] = kwargs.get("proxy_rotator")
+        if self._proxy_rotator and (self._default_proxy or self._default_proxies):
+            raise ValueError(
+                "Cannot use 'proxy_rotator' together with 'proxy' or 'proxies'. "
+                "Use either a static proxy or proxy rotation, not both."
+            )
+    @staticmethod
+    def _get_param(kwargs: Dict, key: str, default: Any) -> Any:
+        """Get parameter from kwargs if present, otherwise return default."""
+        return kwargs[key] if key in kwargs else default
+    def _merge_request_args(self, **method_kwargs) -> Dict[str, Any]:
+        """Merge request-specific arguments with default session arguments."""
+        url = method_kwargs.pop("url")
+        # Get parameters from kwargs or use defaults
+        impersonate = self._get_param(method_kwargs, "impersonate", self._default_impersonate)
+        impersonate = _select_random_browser(impersonate)
+        http3_enabled = self._get_param(method_kwargs, "http3", self._default_http3)
+        stealth = self._get_param(method_kwargs, "stealth", self._stealth)
+        final_args = {
+            "url": url,
+            # Curl automatically generates the suitable browser headers when you use `impersonate`
+            "headers": self._headers_job(
+                url,
+                self._get_param(method_kwargs, "headers", self._default_headers),
+                stealth,
+                bool(impersonate),
+            ),
+            "proxies": self._get_param(method_kwargs, "proxies", self._default_proxies),
+            "proxy": self._get_param(method_kwargs, "proxy", self._default_proxy),
+            "proxy_auth": self._get_param(method_kwargs, "proxy_auth", self._default_proxy_auth),
+            "timeout": self._get_param(method_kwargs, "timeout", self._default_timeout),
+            "allow_redirects": self._get_param(method_kwargs, "follow_redirects", self._default_follow_redirects),
+            "max_redirects": self._get_param(method_kwargs, "max_redirects", self._default_max_redirects),
+            "verify": self._get_param(method_kwargs, "verify", self._default_verify),
+            "cert": self._get_param(method_kwargs, "cert", self._default_cert),
+            "impersonate": impersonate,
+        }
+        # Add any remaining parameters that weren't explicitly handled above
+        # Skip the ones we already processed plus internal params
+        skip_keys = {
+            "impersonate",
+            "http3",
+            "stealth",
+            "headers",
+            "proxies",
+            "proxy",
+            "proxy_auth",
+            "timeout",
+            "follow_redirects",
+            "max_redirects",
+            "verify",
+            "cert",
+            "retries",
+            "retry_delay",
+            "selector_config",
+            # Browser session params (ignored by HTTP sessions)
+            "extra_headers",
+            "google_search",
+        }
+        for k, v in method_kwargs.items():
+            if k not in skip_keys and v is not None:
+                final_args[k] = v
+        if http3_enabled:  # pragma: no cover
+            final_args["http_version"] = CurlHttpVersion.V3ONLY
+            if impersonate:
+                log.warning(
+                    "The argument `http3` might cause errors if used with `impersonate` argument, try switching it off if you encounter any curl errors."
+                )
+        return final_args
+    def _headers_job(self, url, headers: Dict, stealth: bool, impersonate_enabled: bool) -> Dict:
+        """
+        1. Adds a useragent to the headers if it doesn't have one
+        2. Generates real headers and append them to current headers
+        3. Generates a referer header that looks like as if this request came from a Google's search of the current URL's domain.
+        """
+        # Merge session headers with request headers, request takes precedence (if it was set)
+        final_headers = {**self._default_headers, **(headers if headers else {})}
+        headers_keys = {k.lower() for k in final_headers}
+        if stealth:
+            if "referer" not in headers_keys:
+                final_headers["referer"] = generate_convincing_referer(url)
+            if not impersonate_enabled:  # Curl will generate the suitable headers
+                extra_headers = generate_headers(browser_mode=False)
+                final_headers.update(
+                    {k: v for k, v in extra_headers.items() if k.lower() not in headers_keys}
+                )  # Don't overwrite user-supplied headers
+        elif "user-agent" not in headers_keys and not impersonate_enabled:  # pragma: no cover
+            final_headers["User-Agent"] = __default_useragent__
+            log.debug(f"Can't find useragent in headers so '{final_headers['User-Agent']}' was used.")
+        return final_headers
+class _SyncSessionLogic(_ConfigurationLogic):
+    __slots__ = ("_curl_session",)
+    def __init__(self, **kwargs: Unpack[RequestsSession]):
+        super().__init__(**kwargs)
+        self._curl_session: Optional[CurlSession] = None
+    def __enter__(self):
+        """Creates and returns a new synchronous Fetcher Session"""
+        if self._is_alive:
+            raise RuntimeError("This FetcherSession instance already has an active synchronous session.")
+        self._curl_session = CurlSession()
+        self._is_alive = True
+        return self
+    def __exit__(self, exc_type, exc_val, exc_tb):
+        """Closes the active synchronous session managed by this instance, if any."""
+        # For type checking (not accessed error)
+        _ = (
+            exc_type,
+            exc_val,
+            exc_tb,
+        )
+        if self._curl_session:
+            self._curl_session.close()
+            self._curl_session = None
+        self._is_alive = False
+    def _make_request(self, method: SUPPORTED_HTTP_METHODS, stealth: Optional[bool] = None, **kwargs) -> Response:
+        """
+        Perform an HTTP request using the configured session.
+        """
+        stealth = self._stealth if stealth is None else stealth
+        selector_config = self._get_param(kwargs, "selector_config", self.selector_config) or self.selector_config
+        max_retries = self._get_param(kwargs, "retries", self._default_retries)
+        retry_delay = self._get_param(kwargs, "retry_delay", self._default_retry_delay)
+        static_proxy = kwargs.pop("proxy", None)
+        session = self._curl_session
+        one_off_request = False
+        if session is _NO_SESSION and self.__enter__ is None:
+            # For usage inside FetcherClient
+            # It turns out `curl_cffi` caches impersonation state, so if you turned it off, then on then off, it won't be off on the last time.
+            session = CurlSession()
+            one_off_request = True
+        if not session:
+            raise RuntimeError("No active session available.")  # pragma: no cover
+        try:
+            for attempt in range(max_retries):
+                if self._proxy_rotator and static_proxy is None:
+                    proxy = self._proxy_rotator.get_proxy()
+                else:
+                    proxy = static_proxy
+                request_args = self._merge_request_args(stealth=stealth, proxy=proxy, **kwargs)
+                try:
+                    response = session.request(method, **request_args)
+                    result = ResponseFactory.from_http_request(response, selector_config, meta={"proxy": proxy})
+                    return result
+                except CurlError as e:  # pragma: no cover
+                    if attempt < max_retries - 1:
+                        # Now if the rotator is enabled, we will try again with the new proxy
+                        # If it's not enabled, then we will try again with the same proxy
+                        if is_proxy_error(e):
+                            log.warning(
+                                f"Proxy '{proxy}' failed (attempt {attempt + 1}) | Retrying in {retry_delay} seconds..."
+                            )
+                        else:
+                            log.warning(f"Attempt {attempt + 1} failed: {e}. Retrying in {retry_delay} seconds...")
+                        time_sleep(retry_delay)
+                    else:
+                        log.error(f"Failed after {max_retries} attempts: {e}")
+                        raise  # Raise the exception if all retries fail
+        finally:
+            if session and one_off_request:
+                session.close()
+        raise RuntimeError("No active session available.")  # pragma: no cover
+    def get(self, url: str, **kwargs: Unpack[GetRequestParams]) -> Response:
+        """
+        Perform a GET request.
+        Any additional keyword arguments are passed to the `curl_cffi.requests.Session().request()` method.
+        :param url: Target URL for the request.
+        :param kwargs: Additional keyword arguments including:
+            - params: Query string parameters for the request.
+            - headers: Headers to include in the request.
+            - cookies: Cookies to use in the request.
+            - timeout: Number of seconds to wait before timing out.
+            - follow_redirects: Whether to follow redirects. Defaults to True.
+            - max_redirects: Maximum number of redirects. Default 30, use -1 for unlimited.
+            - retries: Number of retry attempts. Defaults to 3.
+            - retry_delay: Number of seconds to wait between retry attempts. Defaults to 1 second.
+            - proxies: Dict of proxies to use.
+            - proxy: Proxy URL to use. Format: "http://username:password@localhost:8030".
+            - proxy_auth: HTTP basic auth for proxy, tuple of (username, password).
+            - auth: HTTP basic auth tuple of (username, password). Only basic auth is supported.
+            - verify: Whether to verify HTTPS certificates.
+            - cert: Tuple of (cert, key) filenames for the client certificate.
+            - impersonate: Browser version to impersonate. Automatically defaults to the latest available Chrome version.
+            - http3: Whether to use HTTP3. Defaults to False. It might be problematic if used it with `impersonate`.
+            - stealthy_headers: If enabled (default), it creates and adds real browser headers.
+        :return: A `Response` object.
+        """
+        stealthy_headers = kwargs.pop("stealthy_headers", None)
+        return self._make_request("GET", stealth=stealthy_headers, url=url, **kwargs)
+    def post(self, url: str, **kwargs: Unpack[DataRequestParams]) -> Response:
+        """
+        Perform a POST request.
+        Any additional keyword arguments are passed to the `curl_cffi.requests.Session().request()` method.
+        :param url: Target URL for the request.
+        :param kwargs: Additional keyword arguments including:
+            - data: Form data to include in the request body.
+            - json: A JSON serializable object to include in the body of the request.
+            - params: Query string parameters for the request.
+            - headers: Headers to include in the request.
+            - cookies: Cookies to use in the request.
+            - timeout: Number of seconds to wait before timing out.
+            - follow_redirects: Whether to follow redirects. Defaults to True.
+            - max_redirects: Maximum number of redirects. Default 30, use -1 for unlimited.
+            - retries: Number of retry attempts. Defaults to 3.
+            - retry_delay: Number of seconds to wait between retry attempts. Defaults to 1 second.
+            - proxies: Dict of proxies to use.
+            - proxy: Proxy URL to use. Format: "http://username:password@localhost:8030".
+            - proxy_auth: HTTP basic auth for proxy, tuple of (username, password).
+            - auth: HTTP basic auth tuple of (username, password). Only basic auth is supported.
+            - verify: Whether to verify HTTPS certificates.
+            - cert: Tuple of (cert, key) filenames for the client certificate.
+            - impersonate: Browser version to impersonate. Automatically defaults to the latest available Chrome version.
+            - http3: Whether to use HTTP3. Defaults to False. It might be problematic if used it with `impersonate`.
+            - stealthy_headers: If enabled (default), it creates and adds real browser headers.
+        :return: A `Response` object.
+        """
+        stealthy_headers = kwargs.pop("stealthy_headers", None)
+        return self._make_request("POST", stealth=stealthy_headers, url=url, **kwargs)
+    def put(self, url: str, **kwargs: Unpack[DataRequestParams]) -> Response:
+        """
+        Perform a PUT request.
+        Any additional keyword arguments are passed to the `curl_cffi.requests.Session().request()` method.
+        :param url: Target URL for the request.
+        :param kwargs: Additional keyword arguments including:
+            - data: Form data to include in the request body.
+            - json: A JSON serializable object to include in the body of the request.
+            - params: Query string parameters for the request.
+            - headers: Headers to include in the request.
+            - cookies: Cookies to use in the request.
+            - timeout: Number of seconds to wait before timing out.
+            - follow_redirects: Whether to follow redirects. Defaults to True.
+            - max_redirects: Maximum number of redirects. Default 30, use -1 for unlimited.
+            - retries: Number of retry attempts. Defaults to 3.
+            - retry_delay: Number of seconds to wait between retry attempts. Defaults to 1 second.
+            - proxies: Dict of proxies to use.
+            - proxy: Proxy URL to use. Format: "http://username:password@localhost:8030".
+            - proxy_auth: HTTP basic auth for proxy, tuple of (username, password).
+            - auth: HTTP basic auth tuple of (username, password). Only basic auth is supported.
+            - verify: Whether to verify HTTPS certificates.
+            - cert: Tuple of (cert, key) filenames for the client certificate.
+            - impersonate: Browser version to impersonate. Automatically defaults to the latest available Chrome version.
+            - http3: Whether to use HTTP3. Defaults to False. It might be problematic if used it with `impersonate`.
+            - stealthy_headers: If enabled (default), it creates and adds real browser headers.
+        :return: A `Response` object.
+        """
+        stealthy_headers = kwargs.pop("stealthy_headers", None)
+        return self._make_request("PUT", stealth=stealthy_headers, url=url, **kwargs)
+    def delete(self, url: str, **kwargs: Unpack[DataRequestParams]) -> Response:
+        """
+        Perform a DELETE request.
+        Any additional keyword arguments are passed to the `curl_cffi.requests.Session().request()` method.
+        :param url: Target URL for the request.
+        :param kwargs: Additional keyword arguments including:
+            - data: Form data to include in the request body.
+            - json: A JSON serializable object to include in the body of the request.
+            - params: Query string parameters for the request.
+            - headers: Headers to include in the request.
+            - cookies: Cookies to use in the request.
+            - timeout: Number of seconds to wait before timing out.
+            - follow_redirects: Whether to follow redirects. Defaults to True.
+            - max_redirects: Maximum number of redirects. Default 30, use -1 for unlimited.
+            - retries: Number of retry attempts. Defaults to 3.
+            - retry_delay: Number of seconds to wait between retry attempts. Defaults to 1 second.
+            - proxies: Dict of proxies to use.
+            - proxy: Proxy URL to use. Format: "http://username:password@localhost:8030".
+            - proxy_auth: HTTP basic auth for proxy, tuple of (username, password).
+            - auth: HTTP basic auth tuple of (username, password). Only basic auth is supported.
+            - verify: Whether to verify HTTPS certificates.
+            - cert: Tuple of (cert, key) filenames for the client certificate.
+            - impersonate: Browser version to impersonate. Automatically defaults to the latest available Chrome version.
+            - http3: Whether to use HTTP3. Defaults to False. It might be problematic if used it with `impersonate`.
+            - stealthy_headers: If enabled (default), it creates and adds real browser headers.
+        :return: A `Response` object.
+        """
+        # Careful of sending a body in a DELETE request, it might cause some websites to reject the request as per https://www.rfc-editor.org/rfc/rfc7231#section-4.3.5,
+        # But some websites accept it, it depends on the implementation used.
+        stealthy_headers = kwargs.pop("stealthy_headers", None)
+        return self._make_request("DELETE", stealth=stealthy_headers, url=url, **kwargs)
+class _ASyncSessionLogic(_ConfigurationLogic):
+    __slots__ = ("_async_curl_session",)
+    def __init__(self, **kwargs: Unpack[RequestsSession]):
+        """Initialise the shared configuration and start without an open asynchronous curl session."""
+        super().__init__(**kwargs)
+        # Set by `__aenter__` and reset to None by `__aexit__`
+        self._async_curl_session: Optional[AsyncCurlSession] = None
+    async def __aenter__(self):  # pragma: no cover
+        """Creates and returns a new asynchronous Session."""
+        if self._is_alive:
+            raise RuntimeError("This FetcherSession instance already has an active asynchronous session.")
+        self._async_curl_session = AsyncCurlSession()
+        self._is_alive = True
+        return self
+    async def __aexit__(self, exc_type, exc_val, exc_tb):
+        """Closes the active asynchronous session managed by this instance, if any."""
+        # For type checking (not accessed error)
+        _ = (
+            exc_type,
+            exc_val,
+            exc_tb,
+        )
+        if self._async_curl_session:
+            await self._async_curl_session.close()
+            self._async_curl_session = None
+        self._is_alive = False
+    async def _make_request(self, method: SUPPORTED_HTTP_METHODS, stealth: Optional[bool] = None, **kwargs) -> Response:
+        """
+        Perform an HTTP request using the configured session.
+        """
+        stealth = self._stealth if stealth is None else stealth
+        selector_config = self._get_param(kwargs, "selector_config", self.selector_config) or self.selector_config
+        max_retries = self._get_param(kwargs, "retries", self._default_retries)
+        retry_delay = self._get_param(kwargs, "retry_delay", self._default_retry_delay)
+        static_proxy = kwargs.pop("proxy", None)
+        session = self._async_curl_session
+        one_off_request = False
+        if session is _NO_SESSION and self.__aenter__ is None:
+            # For usage inside the ` AsyncFetcherClient ` class, and that's for several reasons
+            # 1. It turns out `curl_cffi` caches impersonation state, so if you turned it off, then on then off, it won't be off on the last time.
+            # 2. `curl_cffi` doesn't support making async requests without sessions
+            # 3. Using a single session for many requests at the same time in async doesn't sit well with curl_cffi.
+            session = AsyncCurlSession()
+            one_off_request = True
+        if not session:
+            raise RuntimeError("No active session available.")  # pragma: no cover
+        try:
+            # Determine if we should use proxy rotation
+            for attempt in range(max_retries):
+                if self._proxy_rotator and static_proxy is None:
+                    proxy = self._proxy_rotator.get_proxy()
+                else:
+                    proxy = static_proxy
+                request_args = self._merge_request_args(stealth=stealth, proxy=proxy, **kwargs)
+                try:
+                    response = await session.request(method, **request_args)
+                    result = ResponseFactory.from_http_request(response, selector_config, meta={"proxy": proxy})
+                    return result
+                except CurlError as e:  # pragma: no cover
+                    if attempt < max_retries - 1:
+                        # Now if the rotator is enabled, we will try again with the new proxy
+                        # If it's not enabled, then we will try again with the same proxy
+                        if is_proxy_error(e):
+                            log.warning(
+                                f"Proxy '{proxy}' failed (attempt {attempt + 1}) | Retrying in {retry_delay} seconds..."
+                            )
+                        else:
+                            log.warning(f"Attempt {attempt + 1} failed: {e}. Retrying in {retry_delay} seconds...")
+                        await asyncio_sleep(retry_delay)
+                    else:
+                        log.error(f"Failed after {max_retries} attempts: {e}")
+                        raise  # Raise the exception if all retries fail
+        finally:
+            if session and one_off_request:
+                await session.close()
+        raise RuntimeError("No active session available.")  # pragma: no cover
+    def get(self, url: str, **kwargs: Unpack[GetRequestParams]) -> Awaitable[Response]:
+        """
+        Perform a GET request.
+        Any additional keyword arguments are passed to the `curl_cffi.requests.AsyncSession().request()` method.
+        :param url: Target URL for the request.
+        :param kwargs: Additional keyword arguments including:
+            - params: Query string parameters for the request.
+            - headers: Headers to include in the request.
+            - cookies: Cookies to use in the request.
+            - timeout: Number of seconds to wait before timing out.
+            - follow_redirects: Whether to follow redirects. Defaults to True.
+            - max_redirects: Maximum number of redirects. Default 30, use -1 for unlimited.
+            - retries: Number of retry attempts. Defaults to 3.
+            - retry_delay: Number of seconds to wait between retry attempts. Defaults to 1 second.
+            - proxies: Dict of proxies to use.
+            - proxy: Proxy URL to use. Format: "http://username:password@localhost:8030".
+            - proxy_auth: HTTP basic auth for proxy, tuple of (username, password).
+            - auth: HTTP basic auth tuple of (username, password). Only basic auth is supported.
+            - verify: Whether to verify HTTPS certificates.
+            - cert: Tuple of (cert, key) filenames for the client certificate.
+            - impersonate: Browser version to impersonate. Automatically defaults to the latest available Chrome version.
+            - http3: Whether to use HTTP3. Defaults to False. It might be problematic if used it with `impersonate`.
+            - stealthy_headers: If enabled (default), it creates and adds real browser headers.
+        :return: A `Response` object.
+        """
+        stealthy_headers = kwargs.pop("stealthy_headers", None)
+        return self._make_request("GET", stealth=stealthy_headers, url=url, **kwargs)
+    def post(self, url: str, **kwargs: Unpack[DataRequestParams]) -> Awaitable[Response]:
+        """
+        Perform a POST request.
+        Any additional keyword arguments are passed to the `curl_cffi.requests.AsyncSession().request()` method.
+        :param url: Target URL for the request.
+        :param kwargs: Additional keyword arguments including:
+            - data: Form data to include in the request body.
+            - json: A JSON serializable object to include in the body of the request.
+            - params: Query string parameters for the request.
+            - headers: Headers to include in the request.
+            - cookies: Cookies to use in the request.
+            - timeout: Number of seconds to wait before timing out.
+            - follow_redirects: Whether to follow redirects. Defaults to True.
+            - max_redirects: Maximum number of redirects. Default 30, use -1 for unlimited.
+            - retries: Number of retry attempts. Defaults to 3.
+            - retry_delay: Number of seconds to wait between retry attempts. Defaults to 1 second.
+            - proxies: Dict of proxies to use.
+            - proxy: Proxy URL to use. Format: "http://username:password@localhost:8030".
+            - proxy_auth: HTTP basic auth for proxy, tuple of (username, password).
+            - auth: HTTP basic auth tuple of (username, password). Only basic auth is supported.
+            - verify: Whether to verify HTTPS certificates.
+            - cert: Tuple of (cert, key) filenames for the client certificate.
+            - impersonate: Browser version to impersonate. Automatically defaults to the latest available Chrome version.
+            - http3: Whether to use HTTP3. Defaults to False. It might be problematic if used it with `impersonate`.
+            - stealthy_headers: If enabled (default), it creates and adds real browser headers.
+        :return: A `Response` object.
+        """
+        stealthy_headers = kwargs.pop("stealthy_headers", None)
+        return self._make_request("POST", stealth=stealthy_headers, url=url, **kwargs)
+    def put(self, url: str, **kwargs: Unpack[DataRequestParams]) -> Awaitable[Response]:
+        """
+        Perform a PUT request.
+        Any additional keyword arguments are passed to the `curl_cffi.requests.AsyncSession().request()` method.
+        :param url: Target URL for the request.
+        :param kwargs: Additional keyword arguments including:
+            - data: Form data to include in the request body.
+            - json: A JSON serializable object to include in the body of the request.
+            - params: Query string parameters for the request.
+            - headers: Headers to include in the request.
+            - cookies: Cookies to use in the request.
+            - timeout: Number of seconds to wait before timing out.
+            - follow_redirects: Whether to follow redirects. Defaults to True.
+            - max_redirects: Maximum number of redirects. Default 30, use -1 for unlimited.
+            - retries: Number of retry attempts. Defaults to 3.
+            - retry_delay: Number of seconds to wait between retry attempts. Defaults to 1 second.
+            - proxies: Dict of proxies to use.
+            - proxy: Proxy URL to use. Format: "http://username:password@localhost:8030".
+            - proxy_auth: HTTP basic auth for proxy, tuple of (username, password).
+            - auth: HTTP basic auth tuple of (username, password). Only basic auth is supported.
+            - verify: Whether to verify HTTPS certificates.
+            - cert: Tuple of (cert, key) filenames for the client certificate.
+            - impersonate: Browser version to impersonate. Automatically defaults to the latest available Chrome version.
+            - http3: Whether to use HTTP3. Defaults to False. It might be problematic if used it with `impersonate`.
+            - stealthy_headers: If enabled (default), it creates and adds real browser headers.
+        :return: A `Response` object.
+        """
+        stealthy_headers = kwargs.pop("stealthy_headers", None)
+        return self._make_request("PUT", stealth=stealthy_headers, url=url, **kwargs)
+    def delete(self, url: str, **kwargs: Unpack[DataRequestParams]) -> Awaitable[Response]:
+        """
+        Perform a DELETE request.
+        Any additional keyword arguments are passed to the `curl_cffi.requests.AsyncSession().request()` method.
+        :param url: Target URL for the request.
+        :param kwargs: Additional keyword arguments including:
+            - data: Form data to include in the request body.
+            - json: A JSON serializable object to include in the body of the request.
+            - params: Query string parameters for the request.
+            - headers: Headers to include in the request.
+            - cookies: Cookies to use in the request.
+            - timeout: Number of seconds to wait before timing out.
+            - follow_redirects: Whether to follow redirects. Defaults to True.
+            - max_redirects: Maximum number of redirects. Default 30, use -1 for unlimited.
+            - retries: Number of retry attempts. Defaults to 3.
+            - retry_delay: Number of seconds to wait between retry attempts. Defaults to 1 second.
+            - proxies: Dict of proxies to use.
+            - proxy: Proxy URL to use. Format: "http://username:password@localhost:8030".
+            - proxy_auth: HTTP basic auth for proxy, tuple of (username, password).
+            - auth: HTTP basic auth tuple of (username, password). Only basic auth is supported.
+            - verify: Whether to verify HTTPS certificates.
+            - cert: Tuple of (cert, key) filenames for the client certificate.
+            - impersonate: Browser version to impersonate. Automatically defaults to the latest available Chrome version.
+            - http3: Whether to use HTTP3. Defaults to False. It might be problematic if used it with `impersonate`.
+            - stealthy_headers: If enabled (default), it creates and adds real browser headers.
+        :return: A `Response` object.
+        """
+        # Careful of sending a body in a DELETE request, it might cause some websites to reject the request as per https://www.rfc-editor.org/rfc/rfc7231#section-4.3.5,
+        # But some websites accept it, it depends on the implementation used.
+        stealthy_headers = kwargs.pop("stealthy_headers", None)
+        return self._make_request("DELETE", stealth=stealthy_headers, url=url, **kwargs)
+class FetcherSession:
+    """
+    A factory context manager that provides configured Fetcher sessions.
+    When this manager is used in a 'with' or 'async with' block,
+    it yields a new session configured with the manager's defaults.
+    A single instance of this manager should ideally be used for one active
+    session at a time (or sequentially). Re-entering a context with the
+    same manager instance while a session is already active is disallowed.
+    """
+    __slots__ = (
+        "_default_impersonate",
+        "_stealth",
+        "_default_proxies",
+        "_default_proxy",
+        "_default_proxy_auth",
+        "_default_timeout",
+        "_default_headers",
+        "_default_retries",
+        "_default_retry_delay",
+        "_default_follow_redirects",
+        "_default_max_redirects",
+        "_default_verify",
+        "_default_cert",
+        "_default_http3",
+        "selector_config",
+        "_client",
+        "_is_alive",
+        "_proxy_rotator",
+    )
+    def __init__(
+        self,
+        impersonate: ImpersonateType = "chrome",
+        http3: Optional[bool] = False,
+        stealthy_headers: Optional[bool] = True,
+        proxies: Optional[Dict[str, str]] = None,
+        proxy: Optional[str] = None,
+        proxy_auth: Optional[Tuple[str, str]] = None,
+        timeout: Optional[int | float] = 30,
+        headers: Optional[Dict[str, str]] = None,
+        retries: Optional[int] = 3,
+        retry_delay: Optional[int] = 1,
+        follow_redirects: bool = True,
+        max_redirects: int = 30,
+        verify: bool = True,
+        cert: Optional[str | Tuple[str, str]] = None,
+        selector_config: Optional[Dict] = None,
+        proxy_rotator: Optional[ProxyRotator] = None,
+    ):
+        """
+        :param impersonate: Browser version to impersonate. Can be a single browser string or a list of browser strings for random selection. (Default: latest available Chrome version)
+        :param http3: Whether to use HTTP3. Defaults to False. It might be problematic if used it with `impersonate`.
+        :param stealthy_headers: If enabled (default), it creates and adds real browser headers. It also sets the referer header as if this request came from a Google search of URL's domain.
+        :param proxies: Dict of proxies to use. Format: {"http": proxy_url, "https": proxy_url}.
+        :param proxy: Proxy URL to use. Format: "http://username:password@localhost:8030".
+                     Cannot be used together with the `proxies` parameter.
+        :param proxy_auth: HTTP basic auth for proxy, tuple of (username, password).
+        :param timeout: Number of seconds to wait before timing out.
+        :param headers: Headers to include in the session with every request.
+        :param retries: Number of retry attempts. Defaults to 3.
+        :param retry_delay: Number of seconds to wait between retry attempts. Defaults to 1 second.
+        :param follow_redirects: Whether to follow redirects. Defaults to True.
+        :param max_redirects: Maximum number of redirects. Default 30, use -1 for unlimited.
+        :param verify: Whether to verify HTTPS certificates. Defaults to True.
+        :param cert: Tuple of (cert, key) filenames for the client certificate.
+        :param selector_config: Arguments passed when creating the final Selector class.
+        :param proxy_rotator: A ProxyRotator instance for automatic proxy rotation.
+        """
+        self._default_impersonate: ImpersonateType = impersonate
+        self._stealth = stealthy_headers
+        self._default_proxies = proxies or {}
+        self._default_proxy = proxy or None
+        self._default_proxy_auth = proxy_auth or None
+        self._default_timeout = timeout
+        self._default_headers = headers or {}
+        self._default_retries = retries
+        self._default_retry_delay = retry_delay
+        self._default_follow_redirects = follow_redirects
+        self._default_max_redirects = max_redirects
+        self._default_verify = verify
+        self._default_cert = cert
+        self._default_http3 = http3
+        self.selector_config = selector_config or {}
+        self._is_alive = False
+        self._client: _SyncSessionLogic | _ASyncSessionLogic | None = None
+        self._proxy_rotator = proxy_rotator
+    def __enter__(self) -> _SyncSessionLogic:
+        """Creates and returns a new synchronous Fetcher Session"""
+        if self._client is None:
+            # Use **vars(self) to avoid repeating all parameters
+            config = {k.replace("_default_", ""): getattr(self, k) for k in self.__slots__ if k.startswith("_default")}
+            config["stealthy_headers"] = self._stealth
+            config["selector_config"] = self.selector_config
+            config["proxy_rotator"] = self._proxy_rotator
+            self._client = _SyncSessionLogic(**config)
+            self._is_alive = True
+            return self._client.__enter__()
+        raise RuntimeError("This FetcherSession instance already has an active synchronous session.")
+    def __exit__(self, exc_type, exc_val, exc_tb):
+        if self._client is not None and isinstance(self._client, _SyncSessionLogic):
+            self._client.__exit__(exc_type, exc_val, exc_tb)
+            self._client = None
+            self._is_alive = False
+            return
+        raise RuntimeError("Cannot exit invalid session")
+    async def __aenter__(self) -> _ASyncSessionLogic:
+        """Creates and returns a new asynchronous Session."""
+        if self._client is None:
+            # Use **vars(self) to avoid repeating all parameters
+            config = {k.replace("_default_", ""): getattr(self, k) for k in self.__slots__ if k.startswith("_default")}
+            config["stealthy_headers"] = self._stealth
+            config["selector_config"] = self.selector_config
+            config["proxy_rotator"] = self._proxy_rotator
+            self._client = _ASyncSessionLogic(**config)
+            self._is_alive = True
+            return await self._client.__aenter__()
+        raise RuntimeError("This FetcherSession instance already has an active asynchronous session.")
+    async def __aexit__(self, exc_type, exc_val, exc_tb):
+        if self._client is not None and isinstance(self._client, _ASyncSessionLogic):
+            await self._client.__aexit__(exc_type, exc_val, exc_tb)
+            self._client = None
+            self._is_alive = False
+            return
+        raise RuntimeError("Cannot exit invalid session")
+class FetcherClient(_SyncSessionLogic):
+    """Synchronous client built on `_SyncSessionLogic` whose context-manager hooks are blanked out per instance."""
+    # Slots named after the context-manager hooks give each instance its own storage for them,
+    # which `__init__` below fills with None.
+    __slots__ = ("__enter__", "__exit__")
+    def __init__(self, **kwargs: Any) -> None:
+        """
+        :param kwargs: Forwarded unchanged to `_SyncSessionLogic.__init__`.
+        """
+        super().__init__(**kwargs)
+        # NOTE(review): presumably this prevents the client from being used in a `with` block — confirm.
+        self.__enter__: Any = None
+        self.__exit__: Any = None
+        # `_NO_SESSION` is defined outside this block; presumably it marks that no persistent curl session is held — confirm.
+        self._curl_session: Any = _NO_SESSION
+class AsyncFetcherClient(_ASyncSessionLogic):
+    """Asynchronous client built on `_ASyncSessionLogic` whose async context-manager hooks are blanked out per instance."""
+    # Slots named after the async context-manager hooks give each instance its own storage for them,
+    # which `__init__` below fills with None.
+    __slots__ = ("__aenter__", "__aexit__")
+    def __init__(self, **kwargs: Any) -> None:
+        """
+        :param kwargs: Forwarded unchanged to `_ASyncSessionLogic.__init__`.
+        """
+        super().__init__(**kwargs)
+        # NOTE(review): presumably this prevents the client from being used in an `async with` block — confirm.
+        self.__aenter__: Any = None
+        self.__aexit__: Any = None
+        # `_NO_SESSION` is defined outside this block; presumably it marks that no persistent curl session is held — confirm.
+        self._async_curl_session: Any = _NO_SESSION

engines/toolbelt/__init__.py ADDED Viewed

	@@ -0,0 +1,3 @@


1	+ from .proxy_rotation import ProxyRotator, is_proxy_error, cyclic_rotation
2	+
3	+ __all__ = ["ProxyRotator", "is_proxy_error", "cyclic_rotation"]

engines/toolbelt/convertor.py ADDED Viewed

	@@ -0,0 +1,306 @@

+from functools import lru_cache
+from re import compile as re_compile
+from curl_cffi.requests import Response as CurlResponse
+from playwright._impl._errors import Error as PlaywrightError
+from playwright.sync_api import Page as SyncPage, Response as SyncResponse
+from playwright.async_api import Page as AsyncPage, Response as AsyncResponse
+from scrapling.core.utils import log
+from .custom import Response, StatusText
+from scrapling.core._types import Dict, Optional
+# Captures the charset token (word characters and hyphens) that follows "charset=" in a header value.
+__CHARSET_RE__ = re_compile(r"charset=([\w-]+)")
+class ResponseFactory:
+    """
+    Factory class for creating `Response` objects from various sources.
+    It exposes class methods and one static method that build standardized `Response` objects
+    from synchronous Playwright responses, asynchronous Playwright responses, and `curl_cffi`
+    HTTP responses. It takes care of redirect history, encoding detection, headers, cookies,
+    and the other attributes a `Response` needs.
+    """
+    @classmethod
+    @lru_cache(maxsize=16)
+    def __extract_browser_encoding(cls, content_type: str | None, default: str = "utf-8") -> str:
+        """Extract the charset from a content-type header value.
+        Ex: from header "content-type: text/html; charset=utf-8" -> "utf-8"
+        Results are memoized by `lru_cache` (at most 16 entries).
+        :param content_type: The content-type header value, or None/empty when the header is missing.
+        :param default: Encoding returned when no charset can be found.
+        :return: The charset named in the header, otherwise `default`.
+        """
+        if content_type:
+            # Because Playwright can't do that by themselves like all libraries for some reason :3
+            match = __CHARSET_RE__.search(content_type)
+            return match.group(1) if match else default
+        return default
+    @classmethod
+    def _process_response_history(cls, first_response: SyncResponse, parser_arguments: Dict) -> list[Response]:
+        """Build the redirect history that led to `first_response`.
+        Walks the `redirected_from` chain backwards from the first response's request and creates one
+        `Response` per hop. Errors are logged instead of raised, so the returned history may be partial.
+        :param first_response: The Playwright response whose request chain is inspected.
+        :param parser_arguments: Extra keyword arguments unpacked into every `Response` created here.
+        :return: The redirect responses ordered from the earliest hop to the latest.
+        """
+        history: list[Response] = []
+        current_request = first_response.request.redirected_from
+        try:
+            while current_request:
+                try:
+                    current_response = current_request.response()
+                    # The chain is walked newest-to-oldest, so inserting at index 0 keeps the list oldest-first.
+                    history.insert(
+                        0,
+                        Response(
+                            **{
+                                "url": current_request.url,
+                                # using current_response.text() will trigger "Error: Response.text: Response body is unavailable for redirect responses"
+                                "content": "",
+                                # When Playwright has no response object for this hop, fall back to status 301.
+                                "status": current_response.status if current_response else 301,
+                                "reason": (current_response.status_text or StatusText.get(current_response.status))
+                                if current_response
+                                else StatusText.get(301),
+                                "encoding": cls.__extract_browser_encoding(
+                                    current_response.headers.get("content-type", "")
+                                )
+                                if current_response
+                                else "utf-8",
+                                "cookies": tuple(),
+                                "headers": current_response.all_headers() if current_response else {},
+                                "request_headers": current_request.all_headers(),
+                                **parser_arguments,
+                            }
+                        ),
+                    )
+                except Exception as e:  # pragma: no cover
+                    # Stop at the first hop that fails and keep whatever was collected so far.
+                    log.error(f"Error processing redirect: {e}")
+                    break
+                current_request = current_request.redirected_from
+        except Exception as e:  # pragma: no cover
+            log.error(f"Error processing response history: {e}")
+        return history
+    @classmethod
+    def from_playwright_response(
+        cls,
+        page: SyncPage,
+        first_response: SyncResponse,
+        final_response: Optional[SyncResponse],
+        parser_arguments: Dict,
+        meta: Optional[Dict] = None,
+    ) -> Response:
+        """
+        Transforms a synchronous Playwright response into an internal `Response` object.
+        Status, reason and encoding come from `final_response` (or from `first_response` when the final
+        one is missing), while the response headers and request headers are read from `first_response`.
+        For content types containing "html" the content is the current page content encoded as UTF-8;
+        otherwise it is the raw body of the final response. If reading the content fails, the error is
+        logged and the content is left empty.
+        :param page: A synchronous Playwright `Page` instance; supplies the URL, cookies, and page content.
+        :param first_response: The initial Playwright `Response`; supplies headers, request headers, and the redirect history, and acts as the fallback for `final_response`.
+        :param final_response: The last response received for the request, if any; supplies status, status text, and encoding.
+        :param parser_arguments: A dictionary of additional arguments that are unpacked into the `Response` object.
+        :param meta: Additional meta data to be saved with the response.
+        :return: A populated `Response` object containing the page's URL, content, status, headers, cookies, history, and meta.
+        :rtype: Response
+        :raises ValueError: If neither `final_response` nor `first_response` is available.
+        """
+        # In case we didn't catch a document type somehow
+        final_response = final_response if final_response else first_response
+        if not final_response:
+            raise ValueError("Failed to get a response from the page")
+        encoding = cls.__extract_browser_encoding(final_response.headers.get("content-type", ""))
+        # PlayWright API sometimes give empty status text for some reason!
+        status_text = final_response.status_text or StatusText.get(final_response.status)
+        history = cls._process_response_history(first_response, parser_arguments)
+        try:
+            if "html" in final_response.all_headers().get("content-type", ""):
+                page_content = cls._get_page_content(page).encode("utf-8")
+            else:
+                page_content = final_response.body()
+        except Exception as e:  # pragma: no cover
+            # Best effort: an unreadable body yields an empty response rather than an exception.
+            log.error(f"Error getting page content: {e}")
+            page_content = b""
+        return Response(
+            **{
+                "url": page.url,
+                "content": page_content,
+                "status": final_response.status,
+                "reason": status_text,
+                "encoding": encoding,
+                "cookies": tuple(dict(cookie) for cookie in page.context.cookies()),
+                "headers": first_response.all_headers(),
+                "request_headers": first_response.request.all_headers(),
+                "history": history,
+                "meta": meta,
+                **parser_arguments,
+            }
+        )
+    @classmethod
+    async def _async_process_response_history(
+        cls, first_response: AsyncResponse, parser_arguments: Dict
+    ) -> list[Response]:
+        """Build the redirect history that led to `first_response` (async Playwright API).
+        Walks the `redirected_from` chain backwards from the first response's request and creates one
+        `Response` per hop. Errors are logged instead of raised, so the returned history may be partial.
+        :param first_response: The Playwright response whose request chain is inspected.
+        :param parser_arguments: Extra keyword arguments unpacked into every `Response` created here.
+        :return: The redirect responses ordered from the earliest hop to the latest.
+        """
+        history: list[Response] = []
+        current_request = first_response.request.redirected_from
+        try:
+            while current_request:
+                try:
+                    current_response = await current_request.response()
+                    # The chain is walked newest-to-oldest, so inserting at index 0 keeps the list oldest-first.
+                    history.insert(
+                        0,
+                        Response(
+                            **{
+                                "url": current_request.url,
+                                # using current_response.text() will trigger "Error: Response.text: Response body is unavailable for redirect responses"
+                                "content": "",
+                                # When Playwright has no response object for this hop, fall back to status 301.
+                                "status": current_response.status if current_response else 301,
+                                "reason": (current_response.status_text or StatusText.get(current_response.status))
+                                if current_response
+                                else StatusText.get(301),
+                                "encoding": cls.__extract_browser_encoding(
+                                    current_response.headers.get("content-type", "")
+                                )
+                                if current_response
+                                else "utf-8",
+                                "cookies": tuple(),
+                                "headers": await current_response.all_headers() if current_response else {},
+                                "request_headers": await current_request.all_headers(),
+                                **parser_arguments,
+                            }
+                        ),
+                    )
+                except Exception as e:  # pragma: no cover
+                    # Stop at the first hop that fails and keep whatever was collected so far.
+                    log.error(f"Error processing redirect: {e}")
+                    break
+                current_request = current_request.redirected_from
+        except Exception as e:  # pragma: no cover
+            log.error(f"Error processing response history: {e}")
+        return history
+    @classmethod
+    def _get_page_content(cls, page: SyncPage) -> str:
+        """
+        A workaround for the Playwright issue with `page.content()` on Windows. Ref.: https://github.com/microsoft/playwright/issues/16108
+        Keeps retrying, with a 500 ms wait in between, for as long as `page.content()` raises `PlaywrightError`.
+        :param page: The page to extract content from.
+        :return: The page content, or an empty string when Playwright returns a falsy value.
+        """
+        while True:
+            try:
+                return page.content() or ""
+            except PlaywrightError:
+                page.wait_for_timeout(500)
+                continue
+        # Never reached: the loop above only exits through `return`.
+        return ""  # pyright: ignore
+    @classmethod
+    async def _get_async_page_content(cls, page: AsyncPage) -> str:
+        """
+        A workaround for the Playwright issue with `page.content()` on Windows. Ref.: https://github.com/microsoft/playwright/issues/16108
+        Keeps retrying, with a 500 ms wait in between, for as long as `page.content()` raises `PlaywrightError`.
+        :param page: The page to extract content from.
+        :return: The page content, or an empty string when Playwright returns a falsy value.
+        """
+        while True:
+            try:
+                return (await page.content()) or ""
+            except PlaywrightError:
+                await page.wait_for_timeout(500)
+                continue
+        # Never reached: the loop above only exits through `return`.
+        return ""  # pyright: ignore
+    @classmethod
+    async def from_async_playwright_response(
+        cls,
+        page: AsyncPage,
+        first_response: AsyncResponse,
+        final_response: Optional[AsyncResponse],
+        parser_arguments: Dict,
+        meta: Optional[Dict] = None,
+    ) -> Response:
+        """
+        Transforms an asynchronous Playwright response into an internal `Response` object.
+        Status, reason and encoding come from `final_response` (or from `first_response` when the final
+        one is missing), while the response headers and request headers are read from `first_response`.
+        For content types containing "html" the content is the current page content encoded as UTF-8;
+        otherwise it is the raw body of the final response. If reading the content fails, the error is
+        logged and the content is left empty.
+        :param page: An asynchronous Playwright `Page` instance; supplies the URL, cookies, and page content.
+        :param first_response: The initial Playwright `Response`; supplies headers, request headers, and the redirect history, and acts as the fallback for `final_response`.
+        :param final_response: The last response received for the request, if any; supplies status, status text, and encoding.
+        :param parser_arguments: A dictionary of additional arguments that are unpacked into the `Response` object.
+        :param meta: Additional meta data to be saved with the response.
+        :return: A populated `Response` object containing the page's URL, content, status, headers, cookies, history, and meta.
+        :rtype: Response
+        :raises ValueError: If neither `final_response` nor `first_response` is available.
+        """
+        # In case we didn't catch a document type somehow
+        final_response = final_response if final_response else first_response
+        if not final_response:
+            raise ValueError("Failed to get a response from the page")
+        encoding = cls.__extract_browser_encoding(final_response.headers.get("content-type", ""))
+        # PlayWright API sometimes give empty status text for some reason!
+        status_text = final_response.status_text or StatusText.get(final_response.status)
+        history = await cls._async_process_response_history(first_response, parser_arguments)
+        try:
+            if "html" in (await final_response.all_headers()).get("content-type", ""):
+                page_content = (await cls._get_async_page_content(page)).encode("utf-8")
+            else:
+                page_content = await final_response.body()
+        except Exception as e:  # pragma: no cover
+            # Best effort: an unreadable body yields an empty response rather than an exception.
+            log.error(f"Error getting page content in async: {e}")
+            page_content = b""
+        return Response(
+            **{
+                "url": page.url,
+                "content": page_content,
+                "status": final_response.status,
+                "reason": status_text,
+                "encoding": encoding,
+                "cookies": tuple(dict(cookie) for cookie in await page.context.cookies()),
+                "headers": await first_response.all_headers(),
+                "request_headers": await first_response.request.all_headers(),
+                "history": history,
+                "meta": meta,
+                **parser_arguments,
+            }
+        )
+    @staticmethod
+    def from_http_request(response: CurlResponse, parser_arguments: Dict, meta: Optional[Dict] = None) -> Response:
+        """Takes `curl_cffi` response and generates `Response` object from it.
+        The encoding falls back to "utf-8" when `curl_cffi` reports none, and the request headers and
+        method fall back to `{}` and "GET" when the response carries no request object.
+        :param response: `curl_cffi` response object
+        :param parser_arguments: Additional arguments to be passed to the `Response` object constructor.
+        :param meta: Optional metadata dictionary to attach to the Response.
+        :return: A `Response` object that is the same as `Selector` object except it has these added attributes: `status`, `reason`, `cookies`, `headers`, and `request_headers`
+        """
+        return Response(
+            **{
+                "url": response.url,
+                "content": response.content,
+                "status": response.status_code,
+                "reason": response.reason,
+                "encoding": response.encoding or "utf-8",
+                "cookies": dict(response.cookies),
+                "headers": dict(response.headers),
+                "request_headers": dict(response.request.headers) if response.request else {},
+                "method": response.request.method if response.request else "GET",
+                # The history is passed through exactly as `curl_cffi` provides it.
+                "history": response.history,  # https://github.com/lexiforest/curl_cffi/issues/82
+                "meta": meta,
+                **parser_arguments,
+            }
+        )

engines/toolbelt/custom.py ADDED Viewed

	@@ -0,0 +1,295 @@

+"""
+Functions related to custom types or type checking
+"""
+from functools import lru_cache
+from scrapling.core.utils import log
+from scrapling.core._types import (
+    Any,
+    Dict,
+    cast,
+    List,
+    Tuple,
+    Union,
+    Optional,
+    Callable,
+    Sequence,
+    TYPE_CHECKING,
+    AsyncGenerator,
+)
+from scrapling.core.custom_types import MappingProxyType
+from scrapling.parser import Selector, SQLiteStorageSystem
+if TYPE_CHECKING:
+    from scrapling.spiders import Request
class Response(Selector):
    """Unified response object returned by every engine.

    It is a `Selector` built from the response body, extended with HTTP attributes
    (`status`, `reason`, `cookies`, `headers`, `request_headers`, `history`, `meta`)
    and a `request` attribute that starts as `None` and is filled in by the crawler.
    """

    def __init__(
        self,
        url: str,
        content: str | bytes,
        status: int,
        reason: str,
        cookies: Tuple[Dict[str, str], ...] | Dict[str, str],
        headers: Dict,
        request_headers: Dict,
        encoding: str = "utf-8",
        method: str = "GET",
        history: List | None = None,
        meta: Dict[str, Any] | None = None,
        **selector_config: Any,
    ):
        """Build the response and the underlying `Selector`.

        :param url: The URL of the response. Passed to `Selector` unless `adaptive_domain` is given in `selector_config`.
        :param content: The response body. A `str` body is encoded to UTF-8 bytes first.
        :param status: The HTTP status code.
        :param reason: The HTTP status text.
        :param cookies: The response cookies.
        :param headers: The response headers.
        :param request_headers: The headers sent with the request. Its `referer` entry is shown in the log line.
        :param encoding: The encoding handed to `Selector`.
        :param method: The HTTP method; only used in the log line.
        :param history: The redirect history; stored as an empty list when omitted.
        :param meta: Extra metadata dictionary attached to the response.
        :param selector_config: Remaining keyword arguments forwarded to `Selector`. A non-empty
            `adaptive_domain` entry is removed from it and used as the selector's URL instead of `url`.
        :raises TypeError: If `meta` is truthy but not a dictionary.
        """
        if isinstance(content, str):
            content = content.encode("utf-8")

        adaptive_domain: str = cast(str, selector_config.pop("adaptive_domain", ""))
        self.status = status
        self.reason = reason
        self.cookies = cookies
        self.headers = headers
        self.request_headers = request_headers
        self.history = history or []
        super().__init__(
            content=content,
            url=adaptive_domain or url,
            encoding=encoding,
            **selector_config,
        )
        # For easier debugging while working from a Python shell
        log.info(f"Fetched ({status}) <{method} {url}> (referer: {request_headers.get('referer')})")

        if meta and not isinstance(meta, dict):
            raise TypeError(f"Response meta should be dictionary but got {type(meta).__name__} instead!")
        self.meta: Dict[str, Any] = meta or {}
        self.request: Optional["Request"] = None  # Will be set by crawler

    @property
    def body(self) -> bytes:
        """Return the raw body of the response as bytes."""
        return cast(bytes, cast(Sequence, self._raw_body))

    def follow(
        self,
        url: str,
        sid: str = "",
        callback: Callable[["Response"], AsyncGenerator[Union[Dict[str, Any], "Request", None], None]] | None = None,
        priority: int | None = None,
        dont_filter: bool = False,
        meta: dict[str, Any] | None = None,
        referer_flow: bool = True,
        **kwargs: Any,
    ) -> Any:
        """Create a Request to follow a URL.

        This is a helper method for spiders to easily follow links found in pages.

        **IMPORTANT**: If the arguments below are left empty, the corresponding value from the previous request
        is used. The only exception is `dont_filter`.

        :param url: The URL to follow (can be relative, will be joined with current URL)
        :param sid: The session id to use
        :param callback: Spider callback method to use
        :param priority: The priority number to use, the higher the number, the higher priority to be processed first.
        :param dont_filter: If this request has been done before, disable the filter to allow it again.
        :param meta: Additional metadata to include in the request; merged over the current response's meta.
        :param referer_flow: Enabled by default, set the current response url as referer for the new request url.
        :param kwargs: Additional Request arguments
        :return: Request object ready to be yielded
        :raises TypeError: If this response has no `Request` attached yet.
        """
        from scrapling.spiders import Request

        if not self.request or not isinstance(self.request, Request):
            raise TypeError("This response has no request set yet.")

        # Merge original session kwargs with new kwargs (new takes precedence)
        session_kwargs = {**self.request._session_kwargs, **kwargs}

        if referer_flow:
            # The merge above is shallow, so the header dicts may be the very objects stored on the
            # previous request (or owned by the caller). Copy them before writing the referer so the
            # earlier request is not modified; `or {}` also tolerates an explicit `None`.
            # For requests
            headers = dict(session_kwargs.get("headers") or {})
            headers["referer"] = self.url
            session_kwargs["headers"] = headers

            # For browsers
            extra_headers = dict(session_kwargs.get("extra_headers") or {})
            extra_headers["referer"] = self.url
            session_kwargs["extra_headers"] = extra_headers
            # A real referer is being sent, so the generated Google-search referer is switched off
            session_kwargs["google_search"] = False

        return Request(
            url=self.urljoin(url),
            sid=sid or self.request.sid,
            callback=callback or self.request.callback,
            priority=priority if priority is not None else self.request.priority,
            dont_filter=dont_filter,
            meta={**(self.meta or {}), **(meta or {})},
            **session_kwargs,
        )

    def __str__(self) -> str:
        return f"<{self.status} {self.url}>"
class BaseFetcher:
    """Base class of all fetchers; holds the parser settings shared by every fetch as class attributes.

    The settings are changed globally through `configure()` and handed to the parser through
    `_generate_parser_arguments()`.
    """

    __slots__ = ()
    huge_tree: bool = True
    adaptive: Optional[bool] = False
    storage: Any = SQLiteStorageSystem
    keep_cdata: Optional[bool] = False
    storage_args: Optional[Dict] = None
    keep_comments: Optional[bool] = False
    adaptive_domain: str = ""
    parser_keywords: Tuple = (
        "huge_tree",
        "adaptive",
        "storage",
        "keep_cdata",
        "storage_args",
        "keep_comments",
        "adaptive_domain",
    )  # Left open for the user

    def __init__(self, *args, **kwargs):
        """Deprecated constructor: only logs a warning that points the caller to `configure()`."""
        # For backward-compatibility before 0.2.99
        # Positional arguments can be of any type, so stringify them first; joining them raw
        # raised a TypeError for anything that wasn't a string.
        args_str = ", ".join(str(arg) for arg in args)
        kwargs_str = ", ".join(f"{k}={v}" for k, v in kwargs.items())
        if args_str:
            args_str += ", "

        log.warning(
            f"This logic is deprecated now, and have no effect; It will be removed with v0.3. Use `{self.__class__.__name__}.configure({args_str}{kwargs_str})` instead before fetching"
        )

    @classmethod
    def display_config(cls):
        """Return the current parser settings of this class as a dictionary."""
        return dict(
            huge_tree=cls.huge_tree,
            keep_comments=cls.keep_comments,
            keep_cdata=cls.keep_cdata,
            adaptive=cls.adaptive,
            storage=cls.storage,
            storage_args=cls.storage_args,
            adaptive_domain=cls.adaptive_domain,
        )

    @classmethod
    def configure(cls, **kwargs):
        """Set multiple arguments for the parser at once globally

        :param kwargs: The keywords can be any arguments of the following: huge_tree, keep_comments, keep_cdata, adaptive, storage, storage_args, adaptive_domain
        :raises AttributeError: If no keyword is passed, or the keyword names an existing attribute that is not a parser keyword.
        :raises ValueError: If the keyword does not name any attribute of the class.
        """
        for key, value in kwargs.items():
            key = key.strip().lower()
            if hasattr(cls, key):
                if key in cls.parser_keywords:
                    setattr(cls, key, value)
                else:
                    # Yup, no fun allowed LOL
                    raise AttributeError(f'Unknown parser argument: "{key}"; maybe you meant {cls.parser_keywords}?')
            else:
                raise ValueError(f'Unknown parser argument: "{key}"; maybe you meant {cls.parser_keywords}?')

        if not kwargs:
            raise AttributeError(f"You must pass a keyword to configure, current keywords: {cls.parser_keywords}?")

    @classmethod
    def _generate_parser_arguments(cls) -> Dict:
        """Collect the class-level parser settings into the keyword arguments given to the parser."""
        # Selector class parameters
        # I won't validate Selector's class parameters here again, I will leave it to be validated later
        parser_arguments = dict(
            huge_tree=cls.huge_tree,
            keep_comments=cls.keep_comments,
            keep_cdata=cls.keep_cdata,
            adaptive=cls.adaptive,
            storage=cls.storage,
            storage_args=cls.storage_args,
            adaptive_domain=cls.adaptive_domain,
        )
        return parser_arguments
+class StatusText:
+    """A class that gets the status text of the response status code.
+    Reference: https://developer.mozilla.org/en-US/docs/Web/HTTP/Status
+    """
+    _phrases = MappingProxyType(
+        {
+            100: "Continue",
+            101: "Switching Protocols",
+            102: "Processing",
+            103: "Early Hints",
+            200: "OK",
+            201: "Created",
+            202: "Accepted",
+            203: "Non-Authoritative Information",
+            204: "No Content",
+            205: "Reset Content",
+            206: "Partial Content",
+            207: "Multi-Status",
+            208: "Already Reported",
+            226: "IM Used",
+            300: "Multiple Choices",
+            301: "Moved Permanently",
+            302: "Found",
+            303: "See Other",
+            304: "Not Modified",
+            305: "Use Proxy",
+            307: "Temporary Redirect",
+            308: "Permanent Redirect",
+            400: "Bad Request",
+            401: "Unauthorized",
+            402: "Payment Required",
+            403: "Forbidden",
+            404: "Not Found",
+            405: "Method Not Allowed",
+            406: "Not Acceptable",
+            407: "Proxy Authentication Required",
+            408: "Request Timeout",
+            409: "Conflict",
+            410: "Gone",
+            411: "Length Required",
+            412: "Precondition Failed",
+            413: "Payload Too Large",
+            414: "URI Too Long",
+            415: "Unsupported Media Type",
+            416: "Range Not Satisfiable",
+            417: "Expectation Failed",
+            418: "I'm a teapot",
+            421: "Misdirected Request",
+            422: "Unprocessable Entity",
+            423: "Locked",
+            424: "Failed Dependency",
+            425: "Too Early",
+            426: "Upgrade Required",
+            428: "Precondition Required",
+            429: "Too Many Requests",
+            431: "Request Header Fields Too Large",
+            451: "Unavailable For Legal Reasons",
+            500: "Internal Server Error",
+            501: "Not Implemented",
+            502: "Bad Gateway",
+            503: "Service Unavailable",
+            504: "Gateway Timeout",
+            505: "HTTP Version Not Supported",
+            506: "Variant Also Negotiates",
+            507: "Insufficient Storage",
+            508: "Loop Detected",
+            510: "Not Extended",
+            511: "Network Authentication Required",
+        }
+    )
+    @classmethod
+    @lru_cache(maxsize=128)
+    def get(cls, status_code: int) -> str:
+        """Get the phrase for a given HTTP status code."""
+        return cls._phrases.get(status_code, "Unknown Status Code")

engines/toolbelt/fingerprints.py ADDED Viewed

	@@ -0,0 +1,88 @@

+"""
+Functions related to generating headers and fingerprints generally
+"""
+from functools import lru_cache
+from platform import system as platform_system
+from tld import get_tld, Result
+from browserforge.headers import Browser, HeaderGenerator
+from browserforge.headers.generator import SUPPORTED_OPERATING_SYSTEMS
+from scrapling.core._types import Dict, Literal, Tuple, cast
+__OS_NAME__ = platform_system()
+OSName = Literal["linux", "macos", "windows"]
+# Current versions hardcoded for now (Playwright doesn't allow to know the version of a browser without launching it)
+chromium_version = 141
+chrome_version = 143
@lru_cache(10, typed=True)
def generate_convincing_referer(url: str) -> str | None:
    """Build a Google search URL for the domain name of `url`, to be used as a believable referer.

    Only the domain part is used, without the subdomain and the suffix.

    >>> generate_convincing_referer('https://www.somewebsite.com/blah')
    'https://www.google.com/search?q=somewebsite'

    :param url: The URL you are about to fetch.
    :return: Google's search URL of the domain name, or None for localhost/IP addresses
    """
    # `get_tld` has an inaccurate return type hint, hence the cast
    parsed: Result | None = cast(Result, get_tld(url, as_object=True, fail_silently=True))
    if not parsed:
        return None

    name = parsed.domain
    # Nothing usable without both a domain and a TLD
    if not name or not parsed.tld:
        return None
    if name in ("localhost", "127.0.0.1", "::1"):
        return None

    # A name made only of numeric labels is treated as an IP address (simple IPv4 check)
    labels = [label for label in name.split(".") if label]
    if all(label.isdigit() for label in labels):
        return None

    return f"https://www.google.com/search?q={name}"
@lru_cache(1, typed=True)
def get_os_name() -> OSName | Tuple:
    """Translate the current platform name into the OS name format browserforge expects.

    :return: "linux", "macos", or "windows"; for any other platform, browserforge's
        `SUPPORTED_OPERATING_SYSTEMS` is returned so that all of them are used.
    """
    known_platforms = {  # pragma: no cover
        "Linux": "linux",
        "Darwin": "macos",
        "Windows": "windows",
    }
    return known_platforms.get(__OS_NAME__, SUPPORTED_OPERATING_SYSTEMS)
def generate_headers(browser_mode: bool | str = False) -> Dict:
    """Generate real browser-like headers using browserforge's generator

    :param browser_mode: If enabled, the headers are meant for playwright, so they are pinned to the current
        OS and to the hardcoded browser version ("chrome" selects the Chrome version, anything else Chromium's).
    :return: A dictionary of the generated headers
    """
    # In the browser mode, the only goal is matching the OS and the browser type of the browser in use,
    # so websites fingerprinting us don't see any inconsistency.
    if browser_mode and browser_mode == "chrome":
        pinned_version = chrome_version
    else:
        pinned_version = chromium_version

    candidates = [Browser(name="chrome", min_version=pinned_version, max_version=pinned_version)]
    target_os = get_os_name()
    if not browser_mode:
        # Plain HTTP requests are free to look like any desktop OS/browser combination
        target_os = ("windows", "macos", "linux")
        candidates.append(Browser(name="firefox", min_version=142))
        candidates.append(Browser(name="edge", min_version=140))

    generator = HeaderGenerator(browser=candidates, os=target_os, device="desktop")
    return generator.generate()


# Generated once at import time
__default_useragent__ = generate_headers(browser_mode=False).get("User-Agent")

engines/toolbelt/navigation.py ADDED Viewed

	@@ -0,0 +1,109 @@

+"""
+Functions related to files and URLs
+"""
+from urllib.parse import urlparse
+from playwright.async_api import Route as async_Route
+from msgspec import Struct, structs, convert, ValidationError
+from playwright.sync_api import Route
+from scrapling.core.utils import log
+from scrapling.core._types import Dict, Set, Tuple, Optional, Callable
+from scrapling.engines.constants import EXTRA_RESOURCES
class ProxyDict(Struct):
    """Schema used to validate a proxy given as a dictionary."""

    # The only required field
    server: str
    # Credentials default to empty strings
    username: str = ""
    password: str = ""
def create_intercept_handler(disable_resources: bool, blocked_domains: Optional[Set[str]] = None) -> Callable:
    """Build a sync route handler that aborts unwanted requests and lets the rest through.

    A request is aborted when its resource type is one of the disabled types, or when its
    hostname equals a blocked domain or is a subdomain of one.

    :param disable_resources: Whether to block default resource types.
    :param blocked_domains: Set of domain names to block requests to.
    :return: A sync route handler function.
    """
    domains = blocked_domains or set()
    disabled_resources = EXTRA_RESOURCES if disable_resources else set()

    def handler(route: Route):
        # Resource types are checked first; domains only matter for requests that pass that check
        if route.request.resource_type in disabled_resources:
            log.debug(f'Blocking background resource "{route.request.url}" of type "{route.request.resource_type}"')
            route.abort()
            return

        if domains:
            hostname = urlparse(route.request.url).hostname or ""
            for d in domains:
                if hostname == d or hostname.endswith("." + d):
                    log.debug(f'Blocking request to blocked domain "{hostname}" ({route.request.url})')
                    route.abort()
                    return

        route.continue_()

    return handler
def create_async_intercept_handler(disable_resources: bool, blocked_domains: Optional[Set[str]] = None) -> Callable:
    """Build an async route handler that aborts unwanted requests and lets the rest through.

    A request is aborted when its resource type is one of the disabled types, or when its
    hostname equals a blocked domain or is a subdomain of one.

    :param disable_resources: Whether to block default resource types.
    :param blocked_domains: Set of domain names to block requests to.
    :return: An async route handler function.
    """
    domains = blocked_domains or set()
    disabled_resources = EXTRA_RESOURCES if disable_resources else set()

    async def handler(route: async_Route):
        # Resource types are checked first; domains only matter for requests that pass that check
        if route.request.resource_type in disabled_resources:
            log.debug(f'Blocking background resource "{route.request.url}" of type "{route.request.resource_type}"')
            await route.abort()
            return

        if domains:
            hostname = urlparse(route.request.url).hostname or ""
            for d in domains:
                if hostname == d or hostname.endswith("." + d):
                    log.debug(f'Blocking request to blocked domain "{hostname}" ({route.request.url})')
                    await route.abort()
                    return

        await route.continue_()

    return handler
+def construct_proxy_dict(proxy_string: str | Dict[str, str] | Tuple) -> Dict:
+    """Validate a proxy and return it in the acceptable format for Playwright
+    Reference: https://playwright.dev/python/docs/network#http-proxy
+    :param proxy_string: A string or a dictionary representation of the proxy.
+    :return:
+    """
+    if isinstance(proxy_string, str):
+        proxy = urlparse(proxy_string)
+        if proxy.scheme not in ("http", "https", "socks4", "socks5") or not proxy.hostname:
+            raise ValueError("Invalid proxy string!")
+        try:
+            result = {
+                "server": f"{proxy.scheme}://{proxy.hostname}",
+                "username": proxy.username or "",
+                "password": proxy.password or "",
+            }
+            if proxy.port:
+                result["server"] += f":{proxy.port}"
+            return result
+        except ValueError:
+            # Urllib will say that one of the parameters above can't be casted to the correct type like `int` for port etc...
+            raise ValueError("The proxy argument's string is in invalid format!")
+    elif isinstance(proxy_string, dict):
+        try:
+            validated = convert(proxy_string, ProxyDict)
+            result_dict = structs.asdict(validated)
+            return result_dict
+        except ValidationError as e:
+            raise TypeError(f"Invalid proxy dictionary: {e}")
+    raise TypeError(f"Invalid proxy string: {proxy_string}")

engines/toolbelt/proxy_rotation.py ADDED Viewed

	@@ -0,0 +1,104 @@

+from threading import Lock
+from scrapling.core._types import Callable, Dict, List, Tuple, ProxyType
+RotationStrategy = Callable[[List[ProxyType], int], Tuple[ProxyType, int]]
+_PROXY_ERROR_INDICATORS = {
+    "net::err_proxy",
+    "net::err_tunnel",
+    "connection refused",
+    "connection reset",
+    "connection timed out",
+    "failed to connect",
+    "could not resolve proxy",
+}
def _get_proxy_key(proxy: ProxyType) -> str:
    """Return the key identifying a proxy: the string itself, or "server|username" for a dict."""
    if not isinstance(proxy, str):
        return f'{proxy.get("server", "")}|{proxy.get("username", "")}'
    return proxy
def is_proxy_error(error: Exception) -> bool:
    """Tell whether the lowercased message of `error` contains any of the known proxy error indicators."""
    message = str(error).lower()
    for indicator in _PROXY_ERROR_INDICATORS:
        if indicator in message:
            return True
    return False
def cyclic_rotation(proxies: List[ProxyType], current_index: int) -> Tuple[ProxyType, int]:
    """Default rotation strategy: walk the proxies in order and wrap around after the last one.

    :return: The proxy at `current_index` (modulo the list length) and the index to use next.
    """
    total = len(proxies)
    position = current_index % total
    return proxies[position], (position + 1) % total
class ProxyRotator:
    """
    Proxy rotator that picks the next proxy through a pluggable strategy, guarded by a lock.

    Supports:
    - Cyclic rotation (default)
    - Custom rotation strategies via callable
    - Both string URLs and Playwright-style dict proxies
    """

    __slots__ = ("_proxies", "_proxy_to_index", "_strategy", "_current_index", "_lock")

    def __init__(
        self,
        proxies: List[ProxyType],
        strategy: RotationStrategy = cyclic_rotation,
    ):
        """
        Validate the proxies and set up the rotation state.

        :param proxies: List of proxy URLs or Playwright-style proxy dicts.
            - String format: "http://proxy1:8080" or "http://user:pass@proxy:8080"
            - Dict format: {"server": "http://proxy:8080", "username": "user", "password": "pass"}
        :param strategy: Rotation strategy function. Takes (proxies, current_index) and returns (proxy, next_index). Defaults to cyclic_rotation.
        :raises ValueError: If no proxies are given, or a proxy dict has no 'server' key.
        :raises TypeError: If the strategy is not callable, or a proxy is neither a string nor a dict.
        """
        if not proxies:
            raise ValueError("At least one proxy must be provided")
        if not callable(strategy):
            raise TypeError(f"strategy must be callable, got {type(strategy).__name__}")

        self._strategy = strategy
        self._lock = Lock()

        # Validate and store proxies
        self._proxies: List[ProxyType] = []
        self._proxy_to_index: Dict[str, int] = {}  # O(1) lookup by unique key (server + username)
        for position, candidate in enumerate(proxies):
            if not isinstance(candidate, (str, dict)):
                raise TypeError(f"Invalid proxy type: {type(candidate)}. Expected str or dict.")
            if isinstance(candidate, dict) and "server" not in candidate:
                raise ValueError("Proxy dict must have a 'server' key")

            self._proxy_to_index[_get_proxy_key(candidate)] = position
            self._proxies.append(candidate)

        self._current_index = 0

    def get_proxy(self) -> ProxyType:
        """Get the next proxy according to the rotation strategy."""
        # The strategy call and the index update happen under the lock
        with self._lock:
            chosen, next_index = self._strategy(self._proxies, self._current_index)
            self._current_index = next_index
            return chosen

    @property
    def proxies(self) -> List[ProxyType]:
        """Get a copy of all configured proxies."""
        return self._proxies.copy()

    def __len__(self) -> int:
        """Return the total number of configured proxies."""
        return len(self._proxies)

    def __repr__(self) -> str:
        return f"ProxyRotator(proxies={len(self._proxies)})"

fetchers/__init__.py ADDED Viewed

	@@ -0,0 +1,48 @@

+from typing import TYPE_CHECKING, Any
+from scrapling.engines.toolbelt import ProxyRotator
+if TYPE_CHECKING:
+    from scrapling.fetchers.requests import Fetcher, AsyncFetcher, FetcherSession
+    from scrapling.fetchers.chrome import DynamicFetcher, DynamicSession, AsyncDynamicSession
+    from scrapling.fetchers.stealth_chrome import StealthyFetcher, StealthySession, AsyncStealthySession
+# Lazy import mapping
+_LAZY_IMPORTS = {
+    "Fetcher": ("scrapling.fetchers.requests", "Fetcher"),
+    "AsyncFetcher": ("scrapling.fetchers.requests", "AsyncFetcher"),
+    "FetcherSession": ("scrapling.fetchers.requests", "FetcherSession"),
+    "DynamicFetcher": ("scrapling.fetchers.chrome", "DynamicFetcher"),
+    "DynamicSession": ("scrapling.fetchers.chrome", "DynamicSession"),
+    "AsyncDynamicSession": ("scrapling.fetchers.chrome", "AsyncDynamicSession"),
+    "StealthyFetcher": ("scrapling.fetchers.stealth_chrome", "StealthyFetcher"),
+    "StealthySession": ("scrapling.fetchers.stealth_chrome", "StealthySession"),
+    "AsyncStealthySession": ("scrapling.fetchers.stealth_chrome", "AsyncStealthySession"),
+}
+__all__ = [
+    "Fetcher",
+    "AsyncFetcher",
+    "ProxyRotator",
+    "FetcherSession",
+    "DynamicFetcher",
+    "DynamicSession",
+    "AsyncDynamicSession",
+    "StealthyFetcher",
+    "StealthySession",
+    "AsyncStealthySession",
+]
def __getattr__(name: str) -> Any:
    """Import a fetcher class on first attribute access instead of at package import time."""
    target = _LAZY_IMPORTS.get(name)
    if target is None:
        raise AttributeError(f"module {__name__!r} has no attribute {name!r}")

    module_path, class_name = target
    # A non-empty `fromlist` makes `__import__` return the submodule itself rather than the top-level package
    loaded_module = __import__(module_path, fromlist=[class_name])
    return getattr(loaded_module, class_name)
def __dir__() -> list[str]:
    """Support for dir() and autocomplete.

    Lists every public name of the package: the lazily imported classes plus the names in
    `__all__` that are imported eagerly (such as `ProxyRotator`), which were missing before.
    """
    return sorted(set(__all__) | set(_LAZY_IMPORTS))

fetchers/chrome.py ADDED Viewed

	@@ -0,0 +1,91 @@

+from scrapling.core._types import Unpack
+from scrapling.engines._browsers._types import PlaywrightSession
+from scrapling.engines.toolbelt.custom import BaseFetcher, Response
+from scrapling.engines._browsers._controllers import DynamicSession, AsyncDynamicSession
class DynamicFetcher(BaseFetcher):
    """A `Fetcher` that provides many options to fetch/load websites' pages through chromium-based browsers."""

    @classmethod
    def _with_selector_config(cls, options):
        """Merge the user's selector config over the class-level parser settings and store it in `options`."""
        # `custom_config` is still checked for backward compatibility
        chosen = options.get("selector_config", {}) or options.get("custom_config", {})
        if not isinstance(chosen, dict):
            raise TypeError("Argument `selector_config` must be a dictionary.")

        # Values given by the caller win over the class-level defaults
        options["selector_config"] = {**cls._generate_parser_arguments(), **chosen}
        return options

    @classmethod
    def fetch(cls, url: str, **kwargs: Unpack[PlaywrightSession]) -> Response:
        """Open a browser session, fetch `url` with the options below, and return the response.

        :param url: Target url.
        :param headless: Run the browser in headless/hidden (default), or headful/visible mode.
        :param disable_resources: Drop requests for unnecessary resources for a speed boost.
        :param blocked_domains: A set of domain names to block requests to. Subdomains are also matched (e.g., ``"example.com"`` blocks ``"sub.example.com"`` too).
        :param useragent: Pass a useragent string to be used. Otherwise the fetcher will generate a real Useragent of the same browser and use it.
        :param cookies: Set cookies for the next request.
        :param network_idle: Wait for the page until there are no network connections for at least 500 ms.
        :param load_dom: Enabled by default, wait for all JavaScript on page(s) to fully load and execute.
        :param timeout: The timeout in milliseconds that is used in all operations and waits through the page. The default is 30,000
        :param wait: The time (milliseconds) the fetcher will wait after everything finishes before closing the page and returning the Response object.
        :param page_action: Added for automation. A function that takes the `page` object and does the automation you need.
        :param wait_selector: Wait for a specific CSS selector to be in a specific state.
        :param init_script: An absolute path to a JavaScript file to be executed on page creation with this request.
        :param locale: Set the locale for the browser if wanted. Defaults to the system default locale.
        :param wait_selector_state: The state to wait for the selector given with `wait_selector`. The default state is `attached`.
        :param real_chrome: If you have a Chrome browser installed on your device, enable this, and the Fetcher will launch an instance of your browser and use it.
        :param cdp_url: Instead of launching a new browser instance, connect to this CDP URL to control real browsers through CDP.
        :param google_search: Enabled by default, Scrapling will set the referer header to be as if this request came from a Google search of this website's domain name.
        :param extra_headers: A dictionary of extra headers to add to the request.
        :param proxy: The proxy to be used with requests, it can be a string or a dictionary with the keys 'server', 'username', and 'password' only.
        :param extra_flags: A list of additional browser flags to pass to the browser on launch.
        :param selector_config: The arguments that will be passed in the end while creating the final Selector's class.
        :param additional_args: Additional arguments to be passed to Playwright's context as additional settings.
        :return: A `Response` object.
        :raises TypeError: If `selector_config` is not a dictionary.
        """
        session_options = cls._with_selector_config(kwargs)
        with DynamicSession(**session_options) as session:
            return session.fetch(url)

    @classmethod
    async def async_fetch(cls, url: str, **kwargs: Unpack[PlaywrightSession]) -> Response:
        """Async version of `fetch`: open an async browser session, fetch `url`, and return the response.

        :param url: Target url.
        :param headless: Run the browser in headless/hidden (default), or headful/visible mode.
        :param disable_resources: Drop requests for unnecessary resources for a speed boost.
        :param blocked_domains: A set of domain names to block requests to. Subdomains are also matched (e.g., ``"example.com"`` blocks ``"sub.example.com"`` too).
        :param useragent: Pass a useragent string to be used. Otherwise the fetcher will generate a real Useragent of the same browser and use it.
        :param cookies: Set cookies for the next request.
        :param network_idle: Wait for the page until there are no network connections for at least 500 ms.
        :param load_dom: Enabled by default, wait for all JavaScript on page(s) to fully load and execute.
        :param timeout: The timeout in milliseconds that is used in all operations and waits through the page. The default is 30,000
        :param wait: The time (milliseconds) the fetcher will wait after everything finishes before closing the page and returning the Response object.
        :param page_action: Added for automation. A function that takes the `page` object and does the automation you need.
        :param wait_selector: Wait for a specific CSS selector to be in a specific state.
        :param init_script: An absolute path to a JavaScript file to be executed on page creation with this request.
        :param locale: Set the locale for the browser if wanted. Defaults to the system default locale.
        :param wait_selector_state: The state to wait for the selector given with `wait_selector`. The default state is `attached`.
        :param real_chrome: If you have a Chrome browser installed on your device, enable this, and the Fetcher will launch an instance of your browser and use it.
        :param cdp_url: Instead of launching a new browser instance, connect to this CDP URL to control real browsers through CDP.
        :param google_search: Enabled by default, Scrapling will set the referer header to be as if this request came from a Google search of this website's domain name.
        :param extra_headers: A dictionary of extra headers to add to the request.
        :param proxy: The proxy to be used with requests, it can be a string or a dictionary with the keys 'server', 'username', and 'password' only.
        :param extra_flags: A list of additional browser flags to pass to the browser on launch.
        :param selector_config: The arguments that will be passed in the end while creating the final Selector's class.
        :param additional_args: Additional arguments to be passed to Playwright's context as additional settings.
        :return: A `Response` object.
        :raises TypeError: If `selector_config` is not a dictionary.
        """
        session_options = cls._with_selector_config(kwargs)
        async with AsyncDynamicSession(**session_options) as session:
            return await session.fetch(url)


PlayWrightFetcher = DynamicFetcher  # For backward-compatibility

fetchers/requests.py ADDED Viewed

	@@ -0,0 +1,28 @@

+from scrapling.engines.static import (
+    FetcherSession,
+    FetcherClient as _FetcherClient,
+    AsyncFetcherClient as _AsyncFetcherClient,
+)
+from scrapling.engines.toolbelt.custom import BaseFetcher
+__FetcherClientInstance__ = _FetcherClient()
+__AsyncFetcherClientInstance__ = _AsyncFetcherClient()
class Fetcher(BaseFetcher):
    """A basic `Fetcher` class type that can only do basic GET, POST, PUT, and DELETE HTTP requests based on `curl_cffi`."""

    # Each verb is the bound method of the one module-level sync client instance
    get = __FetcherClientInstance__.get
    post = __FetcherClientInstance__.post
    put = __FetcherClientInstance__.put
    delete = __FetcherClientInstance__.delete
class AsyncFetcher(BaseFetcher):
    """The async counterpart of `Fetcher`: basic GET, POST, PUT, and DELETE HTTP requests based on `curl_cffi`."""

    # Each verb is the bound method of the one module-level async client instance
    get = __AsyncFetcherClientInstance__.get
    post = __AsyncFetcherClientInstance__.post
    put = __AsyncFetcherClientInstance__.put
    delete = __AsyncFetcherClientInstance__.delete

fetchers/stealth_chrome.py ADDED Viewed

	@@ -0,0 +1,109 @@

+from scrapling.core._types import Unpack
+from scrapling.engines._browsers._types import StealthSession
+from scrapling.engines.toolbelt.custom import BaseFetcher, Response
+from scrapling.engines._browsers._stealth import StealthySession, AsyncStealthySession
+class StealthyFetcher(BaseFetcher):
+    """A `Fetcher` class type which is a completely stealthy built on top of Chromium.
+    It works as real browsers passing almost all online tests/protections with many customization options.
+    """
+    @classmethod
+    def fetch(cls, url: str, **kwargs: Unpack[StealthSession]) -> Response:
+        """
+        Opens up a browser and do your request based on your chosen options below.
+        :param url: Target url.
+        :param headless: Run the browser in headless/hidden (default), or headful/visible mode.
+        :param disable_resources: Drop requests for unnecessary resources for a speed boost.
+            Requests dropped are of type `font`, `image`, `media`, `beacon`, `object`, `imageset`, `texttrack`, `websocket`, `csp_report`, and `stylesheet`.
+        :param blocked_domains: A set of domain names to block requests to. Subdomains are also matched (e.g., ``"example.com"`` blocks ``"sub.example.com"`` too).
+        :param useragent: Pass a useragent string to be used. Otherwise the fetcher will generate a real Useragent of the same browser and use it.
+        :param cookies: Set cookies for the next request.
+        :param network_idle: Wait for the page until there are no network connections for at least 500 ms.
+        :param timeout: The timeout in milliseconds that is used in all operations and waits through the page. The default is 30,000
+        :param wait: The time (milliseconds) the fetcher will wait after everything finishes before closing the page and returning the ` Response ` object.
+        :param page_action: Added for automation. A function that takes the `page` object and does the automation you need.
+        :param wait_selector: Wait for a specific CSS selector to be in a specific state.
+        :param init_script: An absolute path to a JavaScript file to be executed on page creation for all pages in this session.
+        :param locale: Specify user locale, for example, `en-GB`, `de-DE`, etc. Locale will affect navigator.language value, Accept-Language request header value as well as number and date formatting
+            rules. Defaults to the system default locale.
+        :param timezone_id: Changes the timezone of the browser. Defaults to the system timezone.
+        :param wait_selector_state: The state to wait for the selector given with `wait_selector`. The default state is `attached`.
+        :param solve_cloudflare: Solves all types of the Cloudflare's Turnstile/Interstitial challenges before returning the response to you.
+        :param real_chrome: If you have a Chrome browser installed on your device, enable this, and the Fetcher will launch an instance of your browser and use it.
+        :param hide_canvas: Add random noise to canvas operations to prevent fingerprinting.
+        :param block_webrtc: Forces WebRTC to respect proxy settings to prevent local IP address leak.
+        :param allow_webgl: Enabled by default. Disabling it disables WebGL and WebGL 2.0 support entirely. Disabling WebGL is not recommended as many WAFs now check if WebGL is enabled.
+        :param load_dom: Enabled by default, wait for all JavaScript on page(s) to fully load and execute.
+        :param cdp_url: Instead of launching a new browser instance, connect to this CDP URL to control real browsers through CDP.
+        :param google_search: Enabled by default, Scrapling will set the referer header to be as if this request came from a Google search of this website's domain name.
+        :param extra_headers: A dictionary of extra headers to add to the request. _The referer set by the `google_search` argument takes priority over the referer set here if used together._
+        :param proxy: The proxy to be used with requests, it can be a string or a dictionary with the keys 'server', 'username', and 'password' only.
+        :param user_data_dir: Path to a User Data Directory, which stores browser session data like cookies and local storage. The default is to create a temporary directory.
+        :param extra_flags: A list of additional browser flags to pass to the browser on launch.
+        :param selector_config: The arguments that will be passed in the end while creating the final Selector's class.
+        :param additional_args: Additional arguments to be passed to Playwright's context as additional settings, and it takes higher priority than Scrapling's settings.
+        :return: A `Response` object.
+        """
+        selector_config = kwargs.get("selector_config", {}) or kwargs.get(
+            "custom_config", {}
+        )  # Checking `custom_config` for backward compatibility
+        if not isinstance(selector_config, dict):
+            raise TypeError("Argument `selector_config` must be a dictionary.")
+        kwargs["selector_config"] = {**cls._generate_parser_arguments(), **selector_config}
+        with StealthySession(**kwargs) as engine:
+            return engine.fetch(url)
+    @classmethod
+    async def async_fetch(cls, url: str, **kwargs: Unpack[StealthSession]) -> Response:
+        """
+        Opens up a browser and do your request based on your chosen options below.
+        :param url: Target url.
+        :param headless: Run the browser in headless/hidden (default), or headful/visible mode.
+        :param disable_resources: Drop requests for unnecessary resources for a speed boost.
+            Requests dropped are of type `font`, `image`, `media`, `beacon`, `object`, `imageset`, `texttrack`, `websocket`, `csp_report`, and `stylesheet`.
+        :param blocked_domains: A set of domain names to block requests to. Subdomains are also matched (e.g., ``"example.com"`` blocks ``"sub.example.com"`` too).
+        :param useragent: Pass a useragent string to be used. Otherwise the fetcher will generate a real Useragent of the same browser and use it.
+        :param cookies: Set cookies for the next request.
+        :param network_idle: Wait for the page until there are no network connections for at least 500 ms.
+        :param timeout: The timeout in milliseconds that is used in all operations and waits through the page. The default is 30,000
+        :param wait: The time (milliseconds) the fetcher will wait after everything finishes before closing the page and returning the ` Response ` object.
+        :param page_action: Added for automation. A function that takes the `page` object and does the automation you need.
+        :param wait_selector: Wait for a specific CSS selector to be in a specific state.
+        :param init_script: An absolute path to a JavaScript file to be executed on page creation for all pages in this session.
+        :param locale: Specify user locale, for example, `en-GB`, `de-DE`, etc. Locale will affect navigator.language value, Accept-Language request header value as well as number and date formatting
+            rules. Defaults to the system default locale.
+        :param timezone_id: Changes the timezone of the browser. Defaults to the system timezone.
+        :param wait_selector_state: The state to wait for the selector given with `wait_selector`. The default state is `attached`.
+        :param solve_cloudflare: Solves all types of the Cloudflare's Turnstile/Interstitial challenges before returning the response to you.
+        :param real_chrome: If you have a Chrome browser installed on your device, enable this, and the Fetcher will launch an instance of your browser and use it.
+        :param hide_canvas: Add random noise to canvas operations to prevent fingerprinting.
+        :param block_webrtc: Forces WebRTC to respect proxy settings to prevent local IP address leak.
+        :param allow_webgl: Enabled by default. Disabling it disables WebGL and WebGL 2.0 support entirely. Disabling WebGL is not recommended as many WAFs now check if WebGL is enabled.
+        :param load_dom: Enabled by default, wait for all JavaScript on page(s) to fully load and execute.
+        :param cdp_url: Instead of launching a new browser instance, connect to this CDP URL to control real browsers through CDP.
+        :param google_search: Enabled by default, Scrapling will set the referer header to be as if this request came from a Google search of this website's domain name.
+        :param extra_headers: A dictionary of extra headers to add to the request. _The referer set by the `google_search` argument takes priority over the referer set here if used together._
+        :param proxy: The proxy to be used with requests, it can be a string or a dictionary with the keys 'server', 'username', and 'password' only.
+        :param user_data_dir: Path to a User Data Directory, which stores browser session data like cookies and local storage. The default is to create a temporary directory.
+        :param extra_flags: A list of additional browser flags to pass to the browser on launch.
+        :param selector_config: The arguments that will be passed in the end while creating the final Selector's class.
+        :param additional_args: Additional arguments to be passed to Playwright's context as additional settings, and it takes higher priority than Scrapling's settings.
+        :return: A `Response` object.
+        """
+        selector_config = kwargs.get("selector_config", {}) or kwargs.get(
+            "custom_config", {}
+        )  # Checking `custom_config` for backward compatibility
+        if not isinstance(selector_config, dict):
+            raise TypeError("Argument `selector_config` must be a dictionary.")
+        kwargs["selector_config"] = {**cls._generate_parser_arguments(), **selector_config}
+        async with AsyncStealthySession(**kwargs) as engine:
+            return await engine.fetch(url)

parser.py ADDED Viewed

	@@ -0,0 +1,1363 @@

+from pathlib import Path
+from inspect import signature
+from urllib.parse import urljoin
+from difflib import SequenceMatcher
+from re import Pattern as re_Pattern
+from lxml.html import HtmlElement, HTMLParser
+from cssselect import SelectorError, SelectorSyntaxError, parse as split_selectors
+from lxml.etree import (
+    XPath,
+    tostring,
+    fromstring,
+    XPathError,
+    XPathEvalError,
+    _ElementUnicodeResult,
+)
+from scrapling.core._types import (
+    Any,
+    Set,
+    Dict,
+    cast,
+    List,
+    Tuple,
+    Union,
+    TypeVar,
+    Pattern,
+    Callable,
+    Literal,
+    Optional,
+    Iterable,
+    overload,
+    Generator,
+    SupportsIndex,
+    TYPE_CHECKING,
+)
+from scrapling.core.custom_types import AttributesHandler, TextHandler, TextHandlers
+from scrapling.core.mixins import SelectorsGeneration
+from scrapling.core.storage import (
+    SQLiteStorageSystem,
+    StorageSystemMixin,
+    _StorageTools,
+)
+from scrapling.core.translator import css_to_xpath as _css_to_xpath
+from scrapling.core.utils import clean_spaces, flatten, html_forbidden, log
+# Path of the default database file, placed next to this module. It is used as the `storage_file`
+# argument when adaptive mode is enabled and no `storage_args` are given to `Selector`.
+__DEFAULT_DB_FILE__ = str(Path(__file__).parent / "elements_storage.db")
+# Attributes that are Python reserved words and can't be used directly
+# Ex: find_all('a', class="blah") -> find_all('a', class_="blah")
+# https://www.w3schools.com/python/python_ref_keywords.asp
+# Maps the keyword-argument spelling (key) to the real HTML attribute name (value).
+_whitelisted = {
+    "class_": "class",
+    "for_": "for",
+}
+_T = TypeVar("_T")
+# Pre-compiled selectors for efficiency
+# Matches every descendant element of the context node
+_find_all_elements = XPath(".//*")
+# `normalize-space(text())` is evaluated on the first text node child only, so this matches the
+# descendant elements whose first text node contains non-whitespace characters
+_find_all_elements_with_spaces = XPath(
+    ".//*[normalize-space(text())]"
+)  # This selector gets all elements with text content
+class Selector(SelectorsGeneration):
+    # Fixed set of instance attributes (no per-instance `__dict__` is created by this class).
+    # Names starting with two underscores are name-mangled by Python, so e.g. `__text` is stored
+    # as `_Selector__text` and is accessed as `self.__text` only from inside this class.
+    __slots__ = (
+        "url",
+        "encoding",
+        "__adaptive_enabled",
+        "_root",
+        "_storage",
+        "__keep_comments",
+        "__huge_tree_enabled",
+        "__attributes",
+        "__text",
+        "__tag",
+        "__keep_cdata",
+        "_raw_body",
+    )
+    def __init__(
+        self,
+        content: Optional[str | bytes] = None,
+        url: str = "",
+        encoding: str = "utf-8",
+        huge_tree: bool = True,
+        root: Optional[HtmlElement] = None,
+        keep_comments: Optional[bool] = False,
+        keep_cdata: Optional[bool] = False,
+        adaptive: Optional[bool] = False,
+        _storage: Optional[StorageSystemMixin] = None,
+        storage: Any = SQLiteStorageSystem,
+        storage_args: Optional[Dict] = None,
+        **_,
+    ):
+        """The main class that works as a wrapper for the HTML input data. Using this class, you can search for elements
+        with expressions in CSS, XPath, or with simply text. Check the docs for more info.
+        Here we try to extend module ``lxml.html.HtmlElement`` while maintaining a simpler interface, We are not
+        inheriting from the ``lxml.html.HtmlElement`` because it's not pickleable, which makes a lot of reference jobs
+        not possible. You can test it here and see code explodes with `AssertionError: invalid Element proxy at...`.
+        It's an old issue with lxml, see `this entry <https://bugs.launchpad.net/lxml/+bug/736708>`
+        :param content: HTML content as either string or bytes.
+        :param url: It allows storing a URL with the HTML data for retrieving later.
+        :param encoding: The encoding type that will be used in HTML parsing, default is `UTF-8`
+        :param huge_tree: Enabled by default, should always be enabled when parsing large HTML documents. This controls
+             the libxml2 feature that forbids parsing certain large documents to protect from possible memory exhaustion.
+        :param root: Used internally to pass etree objects instead of text/body arguments, it takes the highest priority.
+            Don't use it unless you know what you are doing!
+        :param keep_comments: While parsing the HTML body, drop comments or not. Disabled by default for obvious reasons
+        :param keep_cdata: While parsing the HTML body, drop cdata or not. Disabled by default for cleaner HTML.
+        :param adaptive: Enable the adaptive feature (disabled by default). While it's disabled, no storage is
+            initialized for this instance; this argument takes higher priority over all adaptive related
+            arguments/functions in the class.
+        :param _storage: Used internally to reuse an already created storage instance as-is (for example, when
+            creating selectors for elements found in this one) instead of creating one from `storage`.
+        :param storage: The storage class to be passed for adaptive functionalities, see ``Docs`` for more info.
+        :param storage_args: A dictionary of ``argument->value`` pairs to be passed for the storage class.
+            If empty, default values will be used.
+        :raises ValueError: If neither `content` nor `root` is given, or if adaptive is enabled and the `storage`
+            class is not wrapped with `lru_cache` or is not a subclass of `StorageSystemMixin`.
+        :raises TypeError: If `content` is neither `str` nor `bytes` (and `root` is not given).
+        """
+        if root is None and content is None:
+            raise ValueError("Selector class needs HTML content, or root arguments to work")
+        self.url = url
+        self._raw_body: str | bytes = ""
+        self.encoding = encoding
+        self.__keep_cdata = keep_cdata
+        self.__huge_tree_enabled = huge_tree
+        self.__keep_comments = keep_comments
+        # For selector stuff
+        # These three are caches filled lazily by the `text`, `attrib`, and `tag` properties
+        self.__text: Optional[TextHandler] = None
+        self.__attributes: Optional[AttributesHandler] = None
+        self.__tag: Optional[str] = None
+        self._storage: Optional[StorageSystemMixin] = None
+        if root is None:
+            body: str | bytes
+            # Null characters are removed from the input before it's passed to the parser
+            if isinstance(content, str):
+                body = content.strip().replace("\x00", "") or "<html/>"
+            elif isinstance(content, bytes):
+                body = content.replace(b"\x00", b"")
+            else:
+                raise TypeError(f"content argument must be str or bytes, got {type(content)}")
+            # https://lxml.de/api/lxml.etree.HTMLParser-class.html
+            _parser_kwargs: Dict[str, Any] = dict(
+                recover=True,
+                remove_blank_text=True,
+                remove_comments=(not keep_comments),
+                encoding=encoding,
+                compact=True,
+                huge_tree=huge_tree,
+                default_doctype=True,  # Supported by lxml but missing from stubs
+                strip_cdata=(not keep_cdata),
+            )
+            parser = HTMLParser(**_parser_kwargs)
+            # An empty body (possible with bytes input) is replaced with a minimal document before parsing
+            self._root = cast(HtmlElement, fromstring(body or "<html/>", parser=parser, base_url=url or ""))
+            # Keep the content exactly as it was received; it's what the `body` property returns
+            self._raw_body = content
+        else:
+            self._root = cast(HtmlElement, root)
+            if self._is_text_node(root):
+                # Text nodes never use the adaptive feature, so the storage setup below is skipped
+                self.__adaptive_enabled = False
+                return
+        self.__adaptive_enabled = bool(adaptive)
+        if self.__adaptive_enabled:
+            if _storage is not None:
+                self._storage = _storage
+            else:
+                if not storage_args:
+                    storage_args = {
+                        "storage_file": __DEFAULT_DB_FILE__,
+                        "url": url,
+                    }
+                # The original class is expected to be reachable through the `__wrapped__` attribute
+                # that the `lru_cache` decorator adds
+                if not hasattr(storage, "__wrapped__"):
+                    raise ValueError("Storage class must be wrapped with lru_cache decorator, see docs for info")
+                if not issubclass(storage.__wrapped__, StorageSystemMixin):  # pragma: no cover
+                    raise ValueError("Storage system must be inherited from class `StorageSystemMixin`")
+                self._storage = storage(**storage_args)
+    def __getitem__(self, key: str) -> TextHandler:
+        if self._is_text_node(self._root):
+            raise TypeError("Text nodes do not have attributes")
+        return self.attrib[key]
+    def __contains__(self, key: str) -> bool:
+        if self._is_text_node(self._root):
+            return False
+        return key in self.attrib
+    # Node functionalities, I wanted to move to a separate Mixin class, but it had a slight impact on performance
+    @staticmethod
+    def _is_text_node(
+        element: HtmlElement | _ElementUnicodeResult,
+    ) -> bool:
+        """Return True if the given element is a result of a string expression
+        Examples:
+            XPath -> '/text()', '/@attribute', etc...
+            CSS3 -> '::text', '::attr(attrib)'...
+        """
+        # Faster than checking `element.is_attribute or element.is_text or element.is_tail`
+        return issubclass(type(element), _ElementUnicodeResult)
+    def __element_convertor(self, element: HtmlElement | _ElementUnicodeResult) -> "Selector":
+        """Used internally to convert a single HtmlElement or text node to Selector directly without checks"""
+        return Selector(
+            root=element,
+            url=self.url,
+            encoding=self.encoding,
+            adaptive=self.__adaptive_enabled,
+            _storage=self._storage,
+            keep_comments=self.__keep_comments,
+            keep_cdata=self.__keep_cdata,
+            huge_tree=self.__huge_tree_enabled,
+        )
+    def __elements_convertor(self, elements: List[HtmlElement | _ElementUnicodeResult]) -> "Selectors":
+        # Store them for non-repeated call-ups
+        url = self.url
+        encoding = self.encoding
+        adaptive = self.__adaptive_enabled
+        storage = self._storage
+        comments = self.__keep_comments
+        cdata = self.__keep_cdata
+        huge_tree = self.__huge_tree_enabled
+        return Selectors(
+            Selector(
+                root=el,
+                url=url,
+                encoding=encoding,
+                adaptive=adaptive,
+                _storage=storage,
+                keep_comments=comments,
+                keep_cdata=cdata,
+                huge_tree=huge_tree,
+            )
+            for el in elements
+        )
+    def __handle_elements(self, result: List[HtmlElement | _ElementUnicodeResult]) -> "Selectors":
+        """Used internally in all functions to convert results to Selectors in bulk"""
+        if not result:
+            return Selectors()
+        return self.__elements_convertor(result)
+    def __getstate__(self) -> Any:
+        # lxml don't like it :)
+        raise TypeError("Can't pickle Selector objects")
+    # The following properties are implemented as lazily evaluated properties instead of plain attributes
+    # set in `__init__`, so they don't slow down the initialization of many instances of the class: each one
+    # is computed only when the user needs it for the first time on that specific element, then cached for later.
+    # Doing that alone made the library's performance tests skyrocket to multiple times faster than before,
+    # because these values used to be computed on initialization :))
+    @property
+    def tag(self) -> str:
+        """Get the tag name of the element"""
+        if self._is_text_node(self._root):
+            return "#text"
+        if not self.__tag:
+            self.__tag = str(self._root.tag)
+        return self.__tag or ""
+    @property
+    def text(self) -> TextHandler:
+        """Get text content of the element"""
+        if self._is_text_node(self._root):
+            return TextHandler(str(self._root))
+        if self.__text is None:
+            # If you want to escape lxml default behavior and remove comments like this `<span>CONDITION: <!-- -->Excellent</span>`
+            # before extracting text, then keep `keep_comments` set to False while initializing the first class
+            self.__text = TextHandler(self._root.text or "")
+        return self.__text
+    def get_all_text(
+        self,
+        separator: str = "\n",
+        strip: bool = False,
+        ignore_tags: Tuple = (
+            "script",
+            "style",
+        ),
+        valid_values: bool = True,
+    ) -> TextHandler:
+        """Get all child strings of this element, concatenated using the given separator.
+        :param separator: Strings will be concatenated using this separator.
+        :param strip: If True, strings will be stripped before being concatenated.
+        :param ignore_tags: A tuple of all tag names you want to ignore
+        :param valid_values: If enabled, elements with text-content that is empty or only whitespaces will be ignored
+        :return: A TextHandler
+        """
+        if self._is_text_node(self._root):
+            return TextHandler(str(self._root))
+        ignored_elements: set[Any] = set()
+        if ignore_tags:
+            for element in self._root.iter(*ignore_tags):
+                ignored_elements.add(element)
+                ignored_elements.update(cast(list, _find_all_elements(element)))
+        _all_strings = []
+        for node in self._root.iter():
+            if node not in ignored_elements:
+                text = node.text
+                if text and isinstance(text, str):
+                    processed_text = text.strip() if strip else text
+                    if not valid_values or processed_text.strip():
+                        _all_strings.append(processed_text)
+        return cast(TextHandler, TextHandler(separator).join(_all_strings))
+    def urljoin(self, relative_url: str) -> str:
+        """Join this Selector's url with a relative url to form an absolute full URL."""
+        return urljoin(self.url, relative_url)
+    @property
+    def attrib(self) -> AttributesHandler:
+        """Get attributes of the element"""
+        if self._is_text_node(self._root):
+            return AttributesHandler({})
+        if not self.__attributes:
+            self.__attributes = AttributesHandler(self._root.attrib)
+        return self.__attributes
+    @property
+    def html_content(self) -> TextHandler:
+        """Return the inner HTML code of the element"""
+        if self._is_text_node(self._root):
+            return TextHandler(str(self._root))
+        content = tostring(self._root, encoding=self.encoding, method="html", with_tail=False)
+        if isinstance(content, bytes):
+            content = content.strip().decode(self.encoding)
+        return TextHandler(content)
+    @property
+    def body(self) -> str | bytes:
+        """Return the raw body of the current `Selector` without any processing. Useful for binary and non-HTML requests."""
+        if self._is_text_node(self._root):
+            return ""
+        return self._raw_body
+    def prettify(self) -> TextHandler:
+        """Return a prettified version of the element's inner html-code"""
+        if self._is_text_node(self._root):
+            return TextHandler(str(self._root))
+        content = tostring(
+            self._root,
+            encoding=self.encoding,
+            pretty_print=True,
+            method="html",
+            with_tail=False,
+        )
+        if isinstance(content, bytes):
+            content = content.strip().decode(self.encoding)
+        return TextHandler(content)
+    def has_class(self, class_name: str) -> bool:
+        """Check if the element has a specific class
+        :param class_name: The class name to check for
+        :return: True if element has class with that name otherwise False
+        """
+        if self._is_text_node(self._root):
+            return False
+        return class_name in self._root.classes
+    @property
+    def parent(self) -> Optional["Selector"]:
+        """Return the direct parent of the element or ``None`` otherwise"""
+        _parent = self._root.getparent()
+        return self.__element_convertor(_parent) if _parent is not None else None
+    @property
+    def below_elements(self) -> "Selectors":
+        """Return all elements under the current element in the DOM tree"""
+        if self._is_text_node(self._root):
+            return Selectors()
+        below = cast(List, _find_all_elements(self._root))
+        return self.__elements_convertor(below) if below is not None else Selectors()
+    @property
+    def children(self) -> "Selectors":
+        """Return the children elements of the current element or empty list otherwise"""
+        if self._is_text_node(self._root):
+            return Selectors()
+        return Selectors(
+            self.__element_convertor(child)
+            for child in self._root.iterchildren()
+            if not isinstance(child, html_forbidden)
+        )
+    @property
+    def siblings(self) -> "Selectors":
+        """Return other children of the current element's parent or empty list otherwise"""
+        if self.parent:
+            return Selectors(child for child in self.parent.children if child._root != self._root)
+        return Selectors()
+    def iterancestors(self) -> Generator["Selector", None, None]:
+        """Return a generator that loops over all ancestors of the element, starting with the element's parent."""
+        if self._is_text_node(self._root):
+            return
+        for ancestor in self._root.iterancestors():
+            yield self.__element_convertor(ancestor)
+    def find_ancestor(self, func: Callable[["Selector"], bool]) -> Optional["Selector"]:
+        """Loop over all ancestors of the element till one match the passed function
+        :param func: A function that takes each ancestor as an argument and returns True/False
+        :return: The first ancestor that match the function or ``None`` otherwise.
+        """
+        for ancestor in self.iterancestors():
+            if func(ancestor):
+                return ancestor
+        return None
+    @property
+    def path(self) -> "Selectors":
+        """Returns a list of type `Selectors` that contains the path leading to the current element from the root."""
+        lst = list(self.iterancestors())
+        return Selectors(lst)
+    @property
+    def next(self) -> Optional["Selector"]:
+        """Returns the next element of the current element in the children of the parent or ``None`` otherwise."""
+        if self._is_text_node(self._root):
+            return None
+        next_element = self._root.getnext()
+        while next_element is not None and isinstance(next_element, html_forbidden):
+            # Ignore HTML comments and unwanted types
+            next_element = next_element.getnext()
+        return self.__element_convertor(next_element) if next_element is not None else None
+    @property
+    def previous(self) -> Optional["Selector"]:
+        """Returns the previous element of the current element in the children of the parent or ``None`` otherwise."""
+        if self._is_text_node(self._root):
+            return None
+        prev_element = self._root.getprevious()
+        while prev_element is not None and isinstance(prev_element, html_forbidden):
+            # Ignore HTML comments and unwanted types
+            prev_element = prev_element.getprevious()
+        return self.__element_convertor(prev_element) if prev_element is not None else None
+    def get(self) -> TextHandler:
+        """
+        Serialize this element to a string.
+        For text nodes, returns the text value. For HTML elements, returns the outer HTML.
+        """
+        if self._is_text_node(self._root):
+            return TextHandler(str(self._root))
+        return self.html_content
+    def getall(self) -> TextHandlers:
+        """Return a single-element list containing this element's serialized string."""
+        return TextHandlers([self.get()])
+    # Alternative names for `getall` and `get` (presumably the older Scrapy/parsel-style names; verify against the docs)
+    extract = getall
+    extract_first = get
+    def __str__(self) -> str:
+        if self._is_text_node(self._root):
+            return str(self._root)
+        return self.html_content
+    def __repr__(self) -> str:
+        length_limit = 40
+        if self._is_text_node(self._root):
+            text = str(self._root)
+            if len(text) > length_limit:
+                text = text[:length_limit].strip() + "..."
+            return f"<text='{text}'>"
+        content = clean_spaces(self.html_content)
+        if len(content) > length_limit:
+            content = content[:length_limit].strip() + "..."
+        data = f"<data='{content}'"
+        if self.parent:
+            parent_content = clean_spaces(self.parent.html_content)
+            if len(parent_content) > length_limit:
+                parent_content = parent_content[:length_limit].strip() + "..."
+            data += f" parent='{parent_content}'"
+        return data + ">"
+    # From here we start with the selecting functions
+    @overload
+    def relocate(
+        self, element: Union[Dict, HtmlElement, "Selector"], percentage: int, selector_type: Literal[True]
+    ) -> "Selectors": ...
+    @overload
+    def relocate(
+        self, element: Union[Dict, HtmlElement, "Selector"], percentage: int, selector_type: Literal[False] = False
+    ) -> List[HtmlElement]: ...
+    def relocate(
+        self,
+        element: Union[Dict, HtmlElement, "Selector"],
+        percentage: int = 0,
+        selector_type: bool = False,
+    ) -> Union[List[HtmlElement], "Selectors"]:
+        """This function will search again for the element in the page tree, used automatically on page structure change
+        :param element: The element we want to relocate in the tree
+        :param percentage: The minimum percentage to accept and not going lower than that. Be aware that the percentage
+         calculation depends solely on the page structure, so don't play with this number unless you must know
+         what you are doing!
+        :param selector_type: If True, the return result will be converted to `Selectors` object
+        :return: List of pure HTML elements that got the highest matching score or 'Selectors' object
+        """
+        score_table: Dict[float, List[Any]] = {}
+        # Note: `element` will most likely always be a dictionary at this point.
+        if isinstance(element, self.__class__):
+            element = element._root
+        if issubclass(type(element), HtmlElement):
+            element = _StorageTools.element_to_dict(element)
+        for node in cast(List, _find_all_elements(self._root)):
+            # Collect all elements in the page, then for each element get the matching score of it against the node.
+            # Hence: the code doesn't stop even if the score was 100%
+            # because there might be another element(s) left in page with the same score
+            score = self.__calculate_similarity_score(cast(Dict, element), node)
+            score_table.setdefault(score, []).append(node)
+        if score_table:
+            highest_probability = max(score_table.keys())
+            if score_table[highest_probability] and highest_probability >= percentage:
+                if log.getEffectiveLevel() < 20:
+                    # No need to execute this part if the logging level is not debugging
+                    log.debug(f"Highest probability was {highest_probability}%")
+                    log.debug("Top 5 best matching elements are: ")
+                    for percent in tuple(sorted(score_table.keys(), reverse=True))[:5]:
+                        log.debug(f"{percent} -> {self.__elements_convertor(score_table[percent])}")
+                if not selector_type:
+                    return score_table[highest_probability]
+                return self.__elements_convertor(score_table[highest_probability])
+        return []
+    def css(
+        self,
+        selector: str,
+        identifier: str = "",
+        adaptive: bool = False,
+        auto_save: bool = False,
+        percentage: int = 0,
+    ) -> "Selectors":
+        """Search the current tree with CSS3 selectors
+        **Important:
+        It's recommended to use the identifier argument if you plan to use a different selector later
+        and want to relocate the same element(s)**
+        :param selector: The CSS3 selector to be used.
+        :param adaptive: Enabled will make the function try to relocate the element if it was 'saved' before
+        :param identifier: A string that will be used to save/retrieve element's data in adaptive,
+         otherwise the selector will be used.
+        :param auto_save: Automatically save new elements for `adaptive` later
+        :param percentage: The minimum percentage to accept while `adaptive` is working and not going lower than that.
+         Be aware that the percentage calculation depends solely on the page structure, so don't play with this
+         number unless you must know what you are doing!
+        :return: `Selectors` class.
+        """
+        if self._is_text_node(self._root):
+            return Selectors()
+        try:
+            if not self.__adaptive_enabled or "," not in selector:
+                # No need to split selectors in this case, let's save some CPU cycles :)
+                xpath_selector = _css_to_xpath(selector)
+                return self.xpath(
+                    xpath_selector,
+                    identifier or selector,
+                    adaptive,
+                    auto_save,
+                    percentage,
+                )
+            results = Selectors()
+            for single_selector in split_selectors(selector):
+                # I'm doing this only so the `save` function saves data correctly for combined selectors
+                # Like using the ',' to combine two different selectors that point to different elements.
+                xpath_selector = _css_to_xpath(single_selector.canonical())
+                results += self.xpath(
+                    xpath_selector,
+                    identifier or single_selector.canonical(),
+                    adaptive,
+                    auto_save,
+                    percentage,
+                )
+            return Selectors(results)
+        except (
+            SelectorError,
+            SelectorSyntaxError,
+        ) as e:
+            raise SelectorSyntaxError(f"Invalid CSS selector '{selector}': {str(e)}") from e
+    def xpath(
+        self,
+        selector: str,
+        identifier: str = "",
+        adaptive: bool = False,
+        auto_save: bool = False,
+        percentage: int = 0,
+        **kwargs: Any,
+    ) -> "Selectors":
+        """Search the current tree with XPath selectors
+        **Important:
+        It's recommended to use the identifier argument if you plan to use a different selector later
+        and want to relocate the same element(s)**
+         Note: **Additional keyword arguments will be passed as XPath variables in the XPath expression!**
+        :param selector: The XPath selector to be used.
+        :param adaptive: Enabled will make the function try to relocate the element if it was 'saved' before
+        :param identifier: A string that will be used to save/retrieve element's data in adaptive,
+         otherwise the selector will be used.
+        :param auto_save: Automatically save new elements for `adaptive` later
+        :param percentage: The minimum percentage to accept while `adaptive` is working and not going lower than that.
+         Be aware that the percentage calculation depends solely on the page structure, so don't play with this
+         number unless you must know what you are doing!
+        :return: `Selectors` class.
+        """
+        if self._is_text_node(self._root):
+            return Selectors()
+        try:
+            if elements := self._root.xpath(selector, **kwargs):
+                if not self.__adaptive_enabled and auto_save:
+                    log.warning(
+                        "Argument `auto_save` will be ignored because `adaptive` wasn't enabled on initialization. Check docs for more info."
+                    )
+                elif self.__adaptive_enabled and auto_save:
+                    self.save(elements[0], identifier or selector)
+                return self.__handle_elements(elements)
+            elif self.__adaptive_enabled:
+                if adaptive:
+                    element_data = self.retrieve(identifier or selector)
+                    if element_data:
+                        elements = self.relocate(element_data, percentage)
+                        if elements is not None and auto_save:
+                            self.save(elements[0], identifier or selector)
+                return self.__handle_elements(elements)
+            else:
+                if adaptive:
+                    log.warning(
+                        "Argument `adaptive` will be ignored because `adaptive` wasn't enabled on initialization. Check docs for more info."
+                    )
+                elif auto_save:
+                    log.warning(
+                        "Argument `auto_save` will be ignored because `adaptive` wasn't enabled on initialization. Check docs for more info."
+                    )
+                return self.__handle_elements(elements)
+        except (
+            SelectorError,
+            SelectorSyntaxError,
+            XPathError,
+            XPathEvalError,
+        ) as e:
+            raise SelectorSyntaxError(f"Invalid XPath selector: {selector}") from e
+    def find_all(
+        self,
+        *args: str | Iterable[str] | Pattern | Callable | Dict[str, str],
+        **kwargs: str,
+    ) -> "Selectors":
+        """Find elements by filters of your creations for ease.
+        :param args: Tag name(s), iterable of tag names, regex patterns, function, or a dictionary of elements' attributes. Leave empty for selecting all.
+        :param kwargs: The attributes you want to filter elements based on it.
+        :return: The `Selectors` object of the elements or empty list
+        """
+        if self._is_text_node(self._root):
+            return Selectors()
+        if not args and not kwargs:
+            raise TypeError("You have to pass something to search with, like tag name(s), tag attributes, or both.")
+        attributes: Dict[str, Any] = dict()
+        tags: Set[str] = set()
+        patterns: Set[Pattern] = set()
+        results, functions, selectors = Selectors(), [], []
+        # Brace yourself for a wonderful journey!
+        for arg in args:
+            if isinstance(arg, str):
+                tags.add(arg)
+            elif type(arg) in (list, tuple, set):
+                arg = cast(Iterable, arg)  # Type narrowing for type checkers like pyright
+                if not all(map(lambda x: isinstance(x, str), arg)):
+                    raise TypeError("Nested Iterables are not accepted, only iterables of tag names are accepted")
+                tags.update(set(arg))
+            elif isinstance(arg, dict):
+                if not all([(isinstance(k, str) and isinstance(v, str)) for k, v in arg.items()]):
+                    raise TypeError(
+                        "Nested dictionaries are not accepted, only string keys and string values are accepted"
+                    )
+                attributes.update(arg)
+            elif isinstance(arg, re_Pattern):
+                patterns.add(arg)
+            elif callable(arg):
+                if len(signature(arg).parameters) > 0:
+                    functions.append(arg)
+                else:
+                    raise TypeError(
+                        "Callable filter function must have at least one argument to take `Selector` objects."
+                    )
+            else:
+                raise TypeError(f'Argument with type "{type(arg)}" is not accepted, please read the docs.')
+        if not all([(isinstance(k, str) and isinstance(v, str)) for k, v in kwargs.items()]):
+            raise TypeError("Only string values are accepted for arguments")
+        for attribute_name, value in kwargs.items():
+            # Only replace names for kwargs, replacing them in dictionaries doesn't make sense
+            attribute_name = _whitelisted.get(attribute_name, attribute_name)
+            attributes[attribute_name] = value
+        # It's easier and faster to build a selector than traversing the tree
+        tags = tags or set("*")
+        for tag in tags:
+            selector = tag
+            for key, value in attributes.items():
+                value = value.replace('"', r"\"")  # Escape double quotes in user input
+                # Not escaping anything with the key so the user can pass patterns like {'href*': '/p/'} or get errors :)
+                selector += '[{}="{}"]'.format(key, value)
+            if selector != "*":
+                selectors.append(selector)
+        if selectors:
+            results = cast(Selectors, self.css(", ".join(selectors)))
+            if results:
+                # From the results, get the ones that fulfill passed regex patterns
+                for pattern in patterns:
+                    results = results.filter(lambda e: e.text.re(pattern, check_match=True))
+                # From the results, get the ones that fulfill passed functions
+                for function in functions:
+                    results = results.filter(function)
+        else:
+            results = results or self.below_elements
+            for pattern in patterns:
+                results = results.filter(lambda e: e.text.re(pattern, check_match=True))
+            # Collect an element if it fulfills the passed function otherwise
+            for function in functions:
+                results = results.filter(function)
+        return results
+    def find(
+        self,
+        *args: str | Iterable[str] | Pattern | Callable | Dict[str, str],
+        **kwargs: str,
+    ) -> Optional["Selector"]:
+        """Find elements by filters of your creations for ease, then return the first result. Otherwise return `None`.
+        :param args: Tag name(s), iterable of tag names, regex patterns, function, or a dictionary of elements' attributes. Leave empty for selecting all.
+        :param kwargs: The attributes you want to filter elements based on it.
+        :return: The `Selector` object of the element or `None` if the result didn't match
+        """
+        for element in self.find_all(*args, **kwargs):
+            return element
+        return None
+    def __calculate_similarity_score(self, original: Dict, candidate: HtmlElement) -> float:
+        """Used internally to calculate a score that shows how a candidate element similar to the original one
+        :param original: The original element in the form of the dictionary generated from `element_to_dict` function
+        :param candidate: The element to compare with the original element.
+        :return: A percentage score of how similar is the candidate to the original element
+        """
+        score: float = 0
+        checks: int = 0
+        data = _StorageTools.element_to_dict(candidate)
+        score += 1 if original["tag"] == data["tag"] else 0
+        checks += 1
+        if original["text"]:
+            score += SequenceMatcher(None, original["text"], data.get("text") or "").ratio()
+            checks += 1
+        # if both don't have attributes, it still counts for something!
+        score += self.__calculate_dict_diff(original["attributes"], data["attributes"])
+        checks += 1
+        # Separate similarity test for class, id, href,... this will help in full structural changes
+        for attrib in (
+            "class",
+            "id",
+            "href",
+            "src",
+        ):
+            if original["attributes"].get(attrib):
+                score += SequenceMatcher(
+                    None,
+                    original["attributes"][attrib],
+                    data["attributes"].get(attrib) or "",
+                ).ratio()
+                checks += 1
+        score += SequenceMatcher(None, original["path"], data["path"]).ratio()
+        checks += 1
+        if original.get("parent_name"):
+            # Then we start comparing parents' data
+            if data.get("parent_name"):
+                score += SequenceMatcher(None, original["parent_name"], data.get("parent_name") or "").ratio()
+                checks += 1
+                score += self.__calculate_dict_diff(original["parent_attribs"], data.get("parent_attribs") or {})
+                checks += 1
+                if original["parent_text"]:
+                    score += SequenceMatcher(
+                        None,
+                        original["parent_text"],
+                        data.get("parent_text") or "",
+                    ).ratio()
+                    checks += 1
+            # else:
+            #     # The original element has a parent and this one not, this is not a good sign
+            #     score -= 0.1
+        if original.get("siblings"):
+            score += SequenceMatcher(None, original["siblings"], data.get("siblings") or []).ratio()
+            checks += 1
+        # How % sure? let's see
+        return round((score / checks) * 100, 2)
+    @staticmethod
+    def __calculate_dict_diff(dict1: Dict, dict2: Dict) -> float:
+        """Used internally to calculate similarity between two dictionaries as SequenceMatcher doesn't accept dictionaries"""
+        score = SequenceMatcher(None, tuple(dict1.keys()), tuple(dict2.keys())).ratio() * 0.5
+        score += SequenceMatcher(None, tuple(dict1.values()), tuple(dict2.values())).ratio() * 0.5
+        return score
+    def save(self, element: HtmlElement, identifier: str) -> None:
+        """Saves the element's unique properties to the storage for retrieval and relocation later
+        :param element: The element itself that we want to save to storage, it can be a ` Selector ` or pure ` HtmlElement `
+        :param identifier: This is the identifier that will be used to retrieve the element later from the storage. See
+            the docs for more info.
+        """
+        if self.__adaptive_enabled and self._storage:
+            target_element: Any = element
+            if isinstance(target_element, self.__class__):
+                target_element = target_element._root
+            if self._is_text_node(target_element):
+                target_element = target_element.getparent()
+            self._storage.save(target_element, identifier)
+        else:
+            raise RuntimeError(
+                "Can't use `adaptive` features while it's disabled globally, you have to start a new class instance."
+            )
+    def retrieve(self, identifier: str) -> Optional[Dict[str, Any]]:
+        """Using the identifier, we search the storage and return the unique properties of the element
+        :param identifier: This is the identifier that will be used to retrieve the element from the storage. See
+            the docs for more info.
+        :return: A dictionary of the unique properties
+        """
+        if self.__adaptive_enabled and self._storage:
+            return self._storage.retrieve(identifier)
+        raise RuntimeError(
+            "Can't use `adaptive` features while it's disabled globally, you have to start a new class instance."
+        )
+    # Operations on text functions
+    def json(self) -> Dict:
+        """Return JSON response if the response is jsonable otherwise throws error"""
+        if self._is_text_node(self._root):
+            return TextHandler(str(self._root)).json()
+        if self._raw_body and isinstance(self._raw_body, (str, bytes)):
+            if isinstance(self._raw_body, str):
+                return TextHandler(self._raw_body).json()
+            else:
+                if TYPE_CHECKING:
+                    assert isinstance(self._raw_body, bytes)
+                return TextHandler(self._raw_body.decode()).json()
+        elif self.text:
+            return self.text.json()
+        else:
+            return self.get_all_text(strip=True).json()
+    def re(
+        self,
+        regex: str | Pattern[str],
+        replace_entities: bool = True,
+        clean_match: bool = False,
+        case_sensitive: bool = True,
+    ) -> TextHandlers:
+        """Apply the given regex to the current text and return a list of strings with the matches.
+        :param regex: Can be either a compiled regular expression or a string.
+        :param replace_entities: If enabled character entity references are replaced by their corresponding character
+        :param clean_match: if enabled, this will ignore all whitespaces and consecutive spaces while matching
+        :param case_sensitive: if disabled, the function will set the regex to ignore the letters case while compiling it
+        """
+        return self.text.re(regex, replace_entities, clean_match, case_sensitive)
+    def re_first(
+        self,
+        regex: str | Pattern[str],
+        default=None,
+        replace_entities: bool = True,
+        clean_match: bool = False,
+        case_sensitive: bool = True,
+    ) -> TextHandler:
+        """Apply the given regex to text and return the first match if found, otherwise return the default value.
+        :param regex: Can be either a compiled regular expression or a string.
+        :param default: The default value to be returned if there is no match
+        :param replace_entities: if enabled character entity references are replaced by their corresponding character
+        :param clean_match: if enabled, this will ignore all whitespaces and consecutive spaces while matching
+        :param case_sensitive: if disabled, the function will set the regex to ignore the letters case while compiling it
+        """
+        return self.text.re_first(regex, default, replace_entities, clean_match, case_sensitive)
+    @staticmethod
+    def __get_attributes(element: HtmlElement, ignore_attributes: List | Tuple) -> Dict:
+        """Return attributes dictionary without the ignored list"""
+        return {k: v for k, v in element.attrib.items() if k not in ignore_attributes}
+    def __are_alike(
+        self,
+        original: HtmlElement,
+        original_attributes: Dict,
+        candidate: HtmlElement,
+        ignore_attributes: List | Tuple,
+        similarity_threshold: float,
+        match_text: bool = False,
+    ) -> bool:
+        """Calculate a score of how much these elements are alike and return True
+        if the score is higher or equals the threshold"""
+        candidate_attributes = (
+            self.__get_attributes(candidate, ignore_attributes) if ignore_attributes else candidate.attrib
+        )
+        score: float = 0
+        checks: int = 0
+        if original_attributes:
+            score += sum(
+                SequenceMatcher(None, v, candidate_attributes.get(k, "")).ratio()
+                for k, v in original_attributes.items()
+            )
+            checks += len(candidate_attributes)
+        else:
+            if not candidate_attributes:
+                # Both don't have attributes, this must mean something
+                score += 1
+                checks += 1
+        if match_text:
+            score += SequenceMatcher(
+                None,
+                clean_spaces(original.text or ""),
+                clean_spaces(candidate.text or ""),
+            ).ratio()
+            checks += 1
+        if checks:
+            return round(score / checks, 2) >= similarity_threshold
+        return False
+    def find_similar(
+        self,
+        similarity_threshold: float = 0.2,
+        ignore_attributes: List | Tuple = (
+            "href",
+            "src",
+        ),
+        match_text: bool = False,
+    ) -> "Selectors":
+        """Find elements that are in the same tree depth in the page with the same tag name and same parent tag etc...
+        then return the ones that match the current element attributes with a percentage higher than the input threshold.
+        This function is inspired by AutoScraper and made for cases where you, for example, found a product div inside
+        a products-list container and want to find other products using that element as a starting point EXCEPT
+        this function works in any case without depending on the element type.
+        :param similarity_threshold: The percentage to use while comparing element attributes.
+            Note: Elements found before attributes matching/comparison will be sharing the same depth, same tag name,
+            same parent tag name, and same grand parent tag name. So they are 99% likely to be correct unless you are
+            extremely unlucky, then attributes matching comes into play, so don't play with this number unless
+            you are getting the results you don't want.
+            Also, if the current element doesn't have attributes and the similar element as well, then it's a 100% match.
+        :param ignore_attributes: Attribute names passed will be ignored while matching the attributes in the last step.
+            The default value is to ignore `href` and `src` as URLs can change a lot between elements, so it's unreliable
+        :param match_text: If True, element text content will be taken into calculation while matching.
+            Not recommended to use in normal cases, but it depends.
+        :return: A ``Selectors`` container of ``Selector`` objects or empty list
+        """
+        if self._is_text_node(self._root):
+            return Selectors()
+        # We will use the elements' root from now on to get the speed boost of using Lxml directly
+        root = self._root
+        similar_elements = list()
+        current_depth = len(list(root.iterancestors()))
+        target_attrs = self.__get_attributes(root, ignore_attributes) if ignore_attributes else root.attrib
+        path_parts = [self.tag]
+        if (parent := root.getparent()) is not None:
+            path_parts.insert(0, parent.tag)
+            if (grandparent := parent.getparent()) is not None:
+                path_parts.insert(0, grandparent.tag)
+        xpath_path = "//{}".format("/".join(path_parts))
+        potential_matches = root.xpath(f"{xpath_path}[count(ancestor::*) = {current_depth}]")
+        for potential_match in potential_matches:
+            if potential_match != root and self.__are_alike(
+                root,
+                target_attrs,
+                potential_match,
+                ignore_attributes,
+                similarity_threshold,
+                match_text,
+            ):
+                similar_elements.append(potential_match)
+        return Selectors(map(self.__element_convertor, similar_elements))
+    @overload
+    def find_by_text(
+        self,
+        text: str,
+        first_match: Literal[True] = ...,
+        partial: bool = ...,
+        case_sensitive: bool = ...,
+        clean_match: bool = ...,
+    ) -> "Selector": ...
+    @overload
+    def find_by_text(
+        self,
+        text: str,
+        first_match: Literal[False],
+        partial: bool = ...,
+        case_sensitive: bool = ...,
+        clean_match: bool = ...,
+    ) -> "Selectors": ...
+    def find_by_text(
+        self,
+        text: str,
+        first_match: bool = True,
+        partial: bool = False,
+        case_sensitive: bool = False,
+        clean_match: bool = True,
+    ) -> Union["Selectors", "Selector"]:
+        """Find elements that its text content fully/partially matches input.
+        :param text: Text query to match
+        :param first_match: Returns the first element that matches conditions, enabled by default
+        :param partial: If enabled, the function returns elements that contain the input text
+        :param case_sensitive: if enabled, the letters case will be taken into consideration
+        :param clean_match: if enabled, this will ignore all whitespaces and consecutive spaces while matching
+        """
+        if self._is_text_node(self._root):
+            return Selectors()
+        results = Selectors()
+        if not case_sensitive:
+            text = text.lower()
+        possible_targets = cast(List, _find_all_elements_with_spaces(self._root))
+        if possible_targets:
+            for node in self.__elements_convertor(possible_targets):
+                """Check if element matches given text otherwise, traverse the children tree and iterate"""
+                node_text: TextHandler = node.text
+                if clean_match:
+                    node_text = TextHandler(node_text.clean())
+                if not case_sensitive:
+                    node_text = TextHandler(node_text.lower())
+                if partial:
+                    if text in node_text:
+                        results.append(node)
+                elif text == node_text:
+                    results.append(node)
+                if first_match and results:
+                    # we got an element so we should stop
+                    break
+            if first_match:
+                if results:
+                    return results[0]
+        return results
+    @overload
+    def find_by_regex(
+        self,
+        query: str | Pattern[str],
+        first_match: Literal[True] = ...,
+        case_sensitive: bool = ...,
+        clean_match: bool = ...,
+    ) -> "Selector": ...
+    @overload
+    def find_by_regex(
+        self,
+        query: str | Pattern[str],
+        first_match: Literal[False],
+        case_sensitive: bool = ...,
+        clean_match: bool = ...,
+    ) -> "Selectors": ...
+    def find_by_regex(
+        self,
+        query: str | Pattern[str],
+        first_match: bool = True,
+        case_sensitive: bool = False,
+        clean_match: bool = True,
+    ) -> Union["Selectors", "Selector"]:
+        """Find elements that its text content matches the input regex pattern.
+        :param query: Regex query/pattern to match
+        :param first_match: Return the first element that matches conditions; enabled by default.
+        :param case_sensitive: If enabled, the letters case will be taken into consideration in the regex.
+        :param clean_match: If enabled, this will ignore all whitespaces and consecutive spaces while matching.
+        """
+        if self._is_text_node(self._root):
+            return Selectors()
+        results = Selectors()
+        possible_targets = cast(List, _find_all_elements_with_spaces(self._root))
+        if possible_targets:
+            for node in self.__elements_convertor(possible_targets):
+                """Check if element matches given regex otherwise, traverse the children tree and iterate"""
+                node_text = node.text
+                if node_text.re(
+                    query,
+                    check_match=True,
+                    clean_match=clean_match,
+                    case_sensitive=case_sensitive,
+                ):
+                    results.append(node)
+                if first_match and results:
+                    # we got an element so we should stop
+                    break
+            if results and first_match:
+                return results[0]
+        return results
+class Selectors(List[Selector]):
+    """
+    The `Selectors` class is a subclass of the builtin ``List`` class, which provides a few additional methods.
+    """
+    __slots__ = ()
+    @overload
+    def __getitem__(self, pos: SupportsIndex) -> Selector:
+        pass
+    @overload
+    def __getitem__(self, pos: slice) -> "Selectors":
+        pass
+    def __getitem__(self, pos: SupportsIndex | slice) -> Union[Selector, "Selectors"]:
+        lst = super().__getitem__(pos)
+        if isinstance(pos, slice):
+            return self.__class__(cast(List[Selector], lst))
+        else:
+            return cast(Selector, lst)
+    def xpath(
+        self,
+        selector: str,
+        identifier: str = "",
+        auto_save: bool = False,
+        percentage: int = 0,
+        **kwargs: Any,
+    ) -> "Selectors":
+        """
+        Call the ``.xpath()`` method for each element in this list and return
+        their results as another `Selectors` class.
+        **Important:
+        It's recommended to use the identifier argument if you plan to use a different selector later
+        and want to relocate the same element(s)**
+         Note: **Additional keyword arguments will be passed as XPath variables in the XPath expression!**
+        :param selector: The XPath selector to be used.
+        :param identifier: A string that will be used to retrieve element's data in adaptive,
+         otherwise the selector will be used.
+        :param auto_save: Automatically save new elements for `adaptive` later
+        :param percentage: The minimum percentage to accept while `adaptive` is working and not going lower than that.
+         Be aware that the percentage calculation depends solely on the page structure, so don't play with this
+         number unless you must know what you are doing!
+        :return: `Selectors` class.
+        """
+        results = [n.xpath(selector, identifier or selector, False, auto_save, percentage, **kwargs) for n in self]
+        return self.__class__(flatten(results))
+    def css(
+        self,
+        selector: str,
+        identifier: str = "",
+        auto_save: bool = False,
+        percentage: int = 0,
+    ) -> "Selectors":
+        """
+        Call the ``.css()`` method for each element in this list and return
+        their results flattened as another `Selectors` class.
+        **Important:
+        It's recommended to use the identifier argument if you plan to use a different selector later
+        and want to relocate the same element(s)**
+        :param selector: The CSS3 selector to be used.
+        :param identifier: A string that will be used to retrieve element's data in adaptive,
+         otherwise the selector will be used.
+        :param auto_save: Automatically save new elements for `adaptive` later
+        :param percentage: The minimum percentage to accept while `adaptive` is working and not going lower than that.
+         Be aware that the percentage calculation depends solely on the page structure, so don't play with this
+         number unless you must know what you are doing!
+        :return: `Selectors` class.
+        """
+        results = [n.css(selector, identifier or selector, False, auto_save, percentage) for n in self]
+        return self.__class__(flatten(results))
+    def re(
+        self,
+        regex: str | Pattern,
+        replace_entities: bool = True,
+        clean_match: bool = False,
+        case_sensitive: bool = True,
+    ) -> TextHandlers:
+        """Call the ``.re()`` method for each element in this list and return
+        their results flattened as List of TextHandler.
+        :param regex: Can be either a compiled regular expression or a string.
+        :param replace_entities: If enabled character entity references are replaced by their corresponding character
+        :param clean_match: if enabled, this will ignore all whitespaces and consecutive spaces while matching
+        :param case_sensitive: if disabled, the function will set the regex to ignore the letters case while compiling it
+        """
+        results = [n.re(regex, replace_entities, clean_match, case_sensitive) for n in self]
+        return TextHandlers(flatten(results))
+    def re_first(
+        self,
+        regex: str | Pattern,
+        default: Any = None,
+        replace_entities: bool = True,
+        clean_match: bool = False,
+        case_sensitive: bool = True,
+    ) -> TextHandler:
+        """Call the ``.re_first()`` method for each element in this list and return
+        the first result or the default value otherwise.
+        :param regex: Can be either a compiled regular expression or a string.
+        :param default: The default value to be returned if there is no match
+        :param replace_entities: if enabled character entity references are replaced by their corresponding character
+        :param clean_match: if enabled, this will ignore all whitespaces and consecutive spaces while matching
+        :param case_sensitive: if disabled, function will set the regex to ignore the letters case while compiling it
+        """
+        for n in self:
+            for result in n.re(regex, replace_entities, clean_match, case_sensitive):
+                return result
+        return default
+    def search(self, func: Callable[["Selector"], bool]) -> Optional["Selector"]:
+        """Loop over all current elements and return the first element that matches the passed function
+        :param func: A function that takes each element as an argument and returns True/False
+        :return: The first element that match the function or ``None`` otherwise.
+        """
+        for element in self:
+            if func(element):
+                return element
+        return None
+    def filter(self, func: Callable[["Selector"], bool]) -> "Selectors":
+        """Filter current elements based on the passed function
+        :param func: A function that takes each element as an argument and returns True/False
+        :return: The new `Selectors` object or empty list otherwise.
+        """
+        return self.__class__([element for element in self if func(element)])
+    @overload
+    def get(self) -> Optional[TextHandler]: ...
+    @overload
+    def get(self, default: _T) -> Union[TextHandler, _T]: ...
+    def get(self, default=None):
+        """Returns the serialized string of the first element, or ``default`` if empty.
+        :param default: the default value to return if the current list is empty
+        """
+        for x in self:
+            return x.get()
+        return default
+    def getall(self) -> TextHandlers:
+        """Serialize all elements and return as a TextHandlers list."""
+        return TextHandlers([x.get() for x in self])
+    # Alternative names bound to the very same functions: `extract` is `getall`, `extract_first` is `get`.
+    extract = getall
+    extract_first = get
+    @property
+    def first(self) -> Optional[Selector]:
+        """Returns the first Selector item of the current list or `None` if the list is empty"""
+        return self[0] if len(self) > 0 else None
+    @property
+    def last(self) -> Optional[Selector]:
+        """Returns the last Selector item of the current list or `None` if the list is empty"""
+        return self[-1] if len(self) > 0 else None
+    @property
+    def length(self) -> int:
+        """Returns the length of the current list"""
+        return len(self)
+    def __getstate__(self) -> Any:  # pragma: no cover
+        # lxml don't like it :)
+        raise TypeError("Can't pickle Selectors object")
+# For backward compatibility: `Adaptor`/`Adaptors` are plain aliases, i.e. the same class
+# objects as `Selector`/`Selectors` under a second name (not subclasses).
+Adaptor = Selector
+Adaptors = Selectors

py.typed ADDED Viewed

	@@ -0,0 +1 @@


1	+

spiders/__init__.py ADDED Viewed

	@@ -0,0 +1,18 @@

+from .request import Request
+from .result import CrawlResult
+from .scheduler import Scheduler
+from .engine import CrawlerEngine
+from .session import SessionManager
+from .spider import Spider, SessionConfigurationError
+from scrapling.engines.toolbelt.custom import Response
+# Names exported by a star-import of this package; every entry is imported above.
+__all__ = [
+    "Spider",
+    "SessionConfigurationError",
+    "Request",
+    "CrawlerEngine",
+    "CrawlResult",
+    "SessionManager",
+    "Scheduler",
+    "Response",
+]

spiders/checkpoint.py ADDED Viewed

	@@ -0,0 +1,90 @@

+import pickle
+from pathlib import Path
+from dataclasses import dataclass, field
+import anyio
+from anyio import Path as AsyncPath
+from scrapling.core.utils import log
+from scrapling.core._types import Set, List, Optional, TYPE_CHECKING
+if TYPE_CHECKING:
+    from scrapling.spiders.request import Request
+@dataclass
+class CheckpointData:
+    """Container for checkpoint state.
+    This is the object `CheckpointManager.save()` pickles and `CheckpointManager.load()` returns.
+    """
+    # Requests captured in the checkpoint; defaults to a fresh empty list per instance.
+    requests: List["Request"] = field(default_factory=list)
+    # Byte strings captured alongside the requests (the manager logs their count as "seen URLs");
+    # defaults to a fresh empty set per instance.
+    seen: Set[bytes] = field(default_factory=set)
+class CheckpointManager:
+    """Manages saving and loading checkpoint state to/from disk."""
+    CHECKPOINT_FILE = "checkpoint.pkl"
+    def __init__(self, crawldir: str | Path | AsyncPath, interval: float = 300.0):
+        self.crawldir = AsyncPath(crawldir)
+        self._checkpoint_path = self.crawldir / self.CHECKPOINT_FILE
+        self.interval = interval
+        if not isinstance(interval, (int, float)):
+            raise TypeError("Checkpoints interval must be integer or float.")
+        else:
+            if interval < 0:
+                raise ValueError("Checkpoints interval must be equal or greater than 0.")
+    async def has_checkpoint(self) -> bool:
+        """Check if a checkpoint exists."""
+        return await self._checkpoint_path.exists()
+    async def save(self, data: CheckpointData) -> None:
+        """Save checkpoint data to disk atomically."""
+        await self.crawldir.mkdir(parents=True, exist_ok=True)
+        temp_path = self._checkpoint_path.with_suffix(".tmp")
+        try:
+            serialized = pickle.dumps(data, protocol=pickle.HIGHEST_PROTOCOL)
+            async with await anyio.open_file(temp_path, "wb") as f:
+                await f.write(serialized)
+            await temp_path.rename(self._checkpoint_path)
+            log.info(f"Checkpoint saved: {len(data.requests)} requests, {len(data.seen)} seen URLs")
+        except Exception as e:
+            # Clean up temp file if it exists
+            if await temp_path.exists():
+                await temp_path.unlink()
+            log.error(f"Failed to save checkpoint: {e}")
+            raise
+    async def load(self) -> Optional[CheckpointData]:
+        """Load checkpoint data from disk.
+        Returns None if no checkpoint exists or if loading fails.
+        """
+        if not await self.has_checkpoint():
+            return None
+        try:
+            async with await anyio.open_file(self._checkpoint_path, "rb") as f:
+                content = await f.read()
+                data: CheckpointData = pickle.loads(content)
+            log.info(f"Checkpoint loaded: {len(data.requests)} requests, {len(data.seen)} seen URLs")
+            return data
+        except Exception as e:
+            log.error(f"Failed to load checkpoint (starting fresh): {e}")
+            return None
+    async def cleanup(self) -> None:
+        """Delete checkpoint file after successful completion."""
+        try:
+            if await self._checkpoint_path.exists():
+                await self._checkpoint_path.unlink()
+            log.debug("Checkpoint file cleaned up")
+        except Exception as e:
+            log.warning(f"Failed to cleanup checkpoint file: {e}")

spiders/engine.py ADDED Viewed

	@@ -0,0 +1,333 @@

+import json
+import pprint
+from pathlib import Path
+import anyio
+from anyio import Path as AsyncPath
+from anyio import create_task_group, CapacityLimiter, create_memory_object_stream, EndOfStream
+from scrapling.core.utils import log
+from scrapling.spiders.request import Request
+from scrapling.spiders.scheduler import Scheduler
+from scrapling.spiders.session import SessionManager
+from scrapling.spiders.result import CrawlStats, ItemList
+from scrapling.spiders.checkpoint import CheckpointManager, CheckpointData
+from scrapling.core._types import Dict, Union, Optional, TYPE_CHECKING, Any, AsyncGenerator
+if TYPE_CHECKING:
+    from scrapling.spiders.spider import Spider
+def _dump(obj: Dict) -> str:
+    return json.dumps(obj, indent=4)
+class CrawlerEngine:
+    """Orchestrates the crawling process."""
+    def __init__(
+        self,
+        spider: "Spider",
+        session_manager: SessionManager,
+        crawldir: Optional[Union[str, Path, AsyncPath]] = None,
+        interval: float = 300.0,
+    ):
+        self.spider = spider
+        self.session_manager = session_manager
+        self.scheduler = Scheduler(
+            include_kwargs=spider.fp_include_kwargs,
+            include_headers=spider.fp_include_headers,
+            keep_fragments=spider.fp_keep_fragments,
+        )
+        self.stats = CrawlStats()
+        self._global_limiter = CapacityLimiter(spider.concurrent_requests)
+        self._domain_limiters: dict[str, CapacityLimiter] = {}
+        self._allowed_domains: set[str] = spider.allowed_domains or set()
+        self._active_tasks: int = 0
+        self._running: bool = False
+        self._items: ItemList = ItemList()
+        self._item_stream: Any = None
+        self._checkpoint_system_enabled = bool(crawldir)
+        self._checkpoint_manager = CheckpointManager(crawldir or "", interval)
+        self._last_checkpoint_time: float = 0.0
+        self._pause_requested: bool = False
+        self._force_stop: bool = False
+        self.paused: bool = False
+    def _is_domain_allowed(self, request: Request) -> bool:
+        """Check if the request's domain is in allowed_domains."""
+        if not self._allowed_domains:
+            return True
+        domain = request.domain
+        for allowed in self._allowed_domains:
+            if domain == allowed or domain.endswith("." + allowed):
+                return True
+        return False
+    def _rate_limiter(self, domain: str) -> CapacityLimiter:
+        """Get or create a per-domain concurrency limiter if enabled, otherwise use the global limiter."""
+        if self.spider.concurrent_requests_per_domain:
+            if domain not in self._domain_limiters:
+                self._domain_limiters[domain] = CapacityLimiter(self.spider.concurrent_requests_per_domain)
+            return self._domain_limiters[domain]
+        return self._global_limiter
+    def _normalize_request(self, request: Request) -> None:
+        """Normalize request fields before enqueueing.
+        Resolves empty sid to the session manager's default session ID.
+        This ensures consistent fingerprinting for requests using the same session.
+        """
+        if not request.sid:
+            request.sid = self.session_manager.default_session_id
+    async def _process_request(self, request: Request) -> None:
+        """Download and process a single request."""
+        async with self._rate_limiter(request.domain):
+            if self.spider.download_delay:
+                await anyio.sleep(self.spider.download_delay)
+            if request._session_kwargs.get("proxy"):
+                self.stats.proxies.append(request._session_kwargs["proxy"])
+            if request._session_kwargs.get("proxies"):
+                self.stats.proxies.append(dict(request._session_kwargs["proxies"]))
+            try:
+                response = await self.session_manager.fetch(request)
+                self.stats.increment_requests_count(request.sid or self.session_manager.default_session_id)
+                self.stats.increment_response_bytes(request.domain, len(response.body))
+                self.stats.increment_status(response.status)
+            except Exception as e:
+                self.stats.failed_requests_count += 1
+                await self.spider.on_error(request, e)
+                return
+        if await self.spider.is_blocked(response):
+            self.stats.blocked_requests_count += 1
+            if request._retry_count < self.spider.max_blocked_retries:
+                retry_request = request.copy()
+                retry_request._retry_count += 1
+                retry_request.priority -= 1  # Don't retry immediately
+                retry_request.dont_filter = True
+                retry_request._session_kwargs.pop("proxy", None)
+                retry_request._session_kwargs.pop("proxies", None)
+                new_request = await self.spider.retry_blocked_request(retry_request, response)
+                self._normalize_request(new_request)
+                await self.scheduler.enqueue(new_request)
+                log.info(
+                    f"Scheduled blocked request for retry ({retry_request._retry_count}/{self.spider.max_blocked_retries}): {request.url}"
+                )
+            else:
+                log.warning(f"Max retries exceeded for blocked request: {request.url}")
+            return
+        callback = request.callback if request.callback else self.spider.parse
+        try:
+            async for result in callback(response):
+                if isinstance(result, Request):
+                    if self._is_domain_allowed(result):
+                        self._normalize_request(result)
+                        await self.scheduler.enqueue(result)
+                    else:
+                        self.stats.offsite_requests_count += 1
+                        log.debug(f"Filtered offsite request to: {result.url}")
+                elif isinstance(result, dict):
+                    processed_result = await self.spider.on_scraped_item(result)
+                    if processed_result:
+                        self.stats.items_scraped += 1
+                        log.debug(f"Scraped from {str(response)}\n{pprint.pformat(processed_result)}")
+                        if self._item_stream:
+                            await self._item_stream.send(processed_result)
+                        else:
+                            self._items.append(processed_result)
+                    else:
+                        self.stats.items_dropped += 1
+                        log.warning(f"Dropped from {str(response)}\n{processed_result}")
+                elif result is not None:
+                    log.error(f"Spider must return Request, dict or None, got '{type(result)}' in {request}")
+        except Exception as e:
+            msg = f"Spider error processing {request}:\n {e}"
+            log.error(msg, exc_info=e)
+            await self.spider.on_error(request, e)
+    async def _task_wrapper(self, request: Request) -> None:
+        """Wrapper to track active task count."""
+        try:
+            await self._process_request(request)
+        finally:
+            self._active_tasks -= 1
+    def request_pause(self) -> None:
+        """Request a graceful pause of the crawl.
+        First call: requests graceful pause (waits for active tasks).
+        Second call: forces immediate stop.
+        """
+        if self._force_stop:
+            return  # Already forcing stop
+        if self._pause_requested:
+            # Second Ctrl+C - force stop
+            self._force_stop = True
+            log.warning("Force stop requested, cancelling immediately...")
+        else:
+            self._pause_requested = True
+            log.info(
+                "Pause requested, waiting for in-flight requests to complete (press Ctrl+C again to force stop)..."
+            )
+    async def _save_checkpoint(self) -> None:
+        """Save current state to checkpoint files."""
+        requests, seen = self.scheduler.snapshot()
+        data = CheckpointData(requests=requests, seen=seen)
+        await self._checkpoint_manager.save(data)
+        self._last_checkpoint_time = anyio.current_time()
+    def _is_checkpoint_time(self) -> bool:
+        """Check if it's time for the periodic checkpoint."""
+        if not self._checkpoint_system_enabled:
+            return False
+        if self._checkpoint_manager.interval == 0:
+            return False
+        current_time = anyio.current_time()
+        return (current_time - self._last_checkpoint_time) >= self._checkpoint_manager.interval
+    async def _restore_from_checkpoint(self) -> bool:
+        """Attempt to restore state from checkpoint.
+        Returns True if successfully restored, False otherwise.
+        """
+        if not self._checkpoint_system_enabled:
+            raise
+        data = await self._checkpoint_manager.load()
+        if data is None:
+            return False
+        self.scheduler.restore(data)
+        # Restore callbacks from spider after scheduler restore
+        for request in data.requests:
+            request._restore_callback(self.spider)
+        return True
+    async def crawl(self) -> CrawlStats:
+        """Run the spider and return CrawlStats."""
+        self._running = True
+        self._items.clear()
+        self.paused = False
+        self._pause_requested = False
+        self._force_stop = False
+        self.stats = CrawlStats(start_time=anyio.current_time())
+        # Check for existing checkpoint
+        resuming = (await self._restore_from_checkpoint()) if self._checkpoint_system_enabled else False
+        self._last_checkpoint_time = anyio.current_time()
+        async with self.session_manager:
+            self.stats.concurrent_requests = self.spider.concurrent_requests
+            self.stats.concurrent_requests_per_domain = self.spider.concurrent_requests_per_domain
+            self.stats.download_delay = self.spider.download_delay
+            await self.spider.on_start(resuming=resuming)
+            try:
+                if not resuming:
+                    async for request in self.spider.start_requests():
+                        self._normalize_request(request)
+                        await self.scheduler.enqueue(request)
+                else:
+                    log.info("Resuming from checkpoint, skipping start_requests()")
+                # Process queue
+                async with create_task_group() as tg:
+                    while self._running:
+                        if self._pause_requested:
+                            if self._active_tasks == 0 or self._force_stop:
+                                if self._force_stop:
+                                    log.warning(f"Force stopping with {self._active_tasks} active tasks")
+                                    tg.cancel_scope.cancel()
+                                # Only save checkpoint if checkpoint system is enabled
+                                if self._checkpoint_system_enabled:
+                                    await self._save_checkpoint()
+                                    self.paused = True
+                                    log.info("Spider paused, checkpoint saved")
+                                else:
+                                    log.info("Spider stopped gracefully")
+                                self._running = False
+                                break
+                            # Wait briefly and check again
+                            await anyio.sleep(0.05)
+                            continue
+                        if self._checkpoint_system_enabled and self._is_checkpoint_time():
+                            await self._save_checkpoint()
+                        if self.scheduler.is_empty:
+                            # Empty queue + no active tasks = done
+                            if self._active_tasks == 0:
+                                self._running = False
+                                log.debug("Spider idle")
+                                break
+                            # Brief wait for callbacks to enqueue new requests
+                            await anyio.sleep(0.05)
+                            continue
+                        # Only spawn tasks up to concurrent_requests limit
+                        # This prevents spawning thousands of waiting tasks
+                        if self._active_tasks >= self.spider.concurrent_requests:
+                            await anyio.sleep(0.01)
+                            continue
+                        request = await self.scheduler.dequeue()
+                        self._active_tasks += 1
+                        tg.start_soon(self._task_wrapper, request)
+            finally:
+                await self.spider.on_close()
+                # Clean up checkpoint files on successful completion (not paused)
+                if not self.paused and self._checkpoint_system_enabled:
+                    await self._checkpoint_manager.cleanup()
+        self.stats.log_levels_counter = self.spider._log_counter.get_counts()
+        self.stats.end_time = anyio.current_time()
+        log.info(_dump(self.stats.to_dict()))
+        return self.stats
+    @property
+    def items(self) -> ItemList:
+        """Access scraped items."""
+        return self._items
+    def __aiter__(self) -> AsyncGenerator[dict, None]:
+        return self._stream()
+    async def _stream(self) -> AsyncGenerator[dict, None]:
+        """Async generator that runs crawl and yields items."""
+        send, recv = create_memory_object_stream[dict](100)
+        self._item_stream = send
+        async def run():
+            try:
+                await self.crawl()
+            finally:
+                await send.aclose()
+        async with create_task_group() as tg:
+            tg.start_soon(run)
+            try:
+                async for item in recv:
+                    yield item
+            except EndOfStream:
+                pass

spiders/request.py ADDED Viewed

	@@ -0,0 +1,163 @@

+import hashlib
+from io import BytesIO
+from functools import cached_property
+from urllib.parse import urlparse, urlencode
+import orjson
+from w3lib.url import canonicalize_url
+from scrapling.engines.toolbelt.custom import Response
+from scrapling.core._types import Any, AsyncGenerator, Callable, Dict, Optional, Union, Tuple, TYPE_CHECKING
+if TYPE_CHECKING:
+    from scrapling.spiders.spider import Spider
+def _convert_to_bytes(value: str | bytes) -> bytes:
+    if isinstance(value, bytes):
+        return value
+    if not isinstance(value, str):
+        raise TypeError(f"Can't convert {type(value).__name__} to bytes")
+    return value.encode(encoding="utf-8", errors="ignore")
+class Request:
+    def __init__(
+        self,
+        url: str,
+        sid: str = "",
+        callback: Callable[[Response], AsyncGenerator[Union[Dict[str, Any], "Request", None], None]] | None = None,
+        priority: int = 0,
+        dont_filter: bool = False,
+        meta: dict[str, Any] | None = None,
+        _retry_count: int = 0,
+        **kwargs: Any,
+    ) -> None:
+        self.url: str = url
+        self.sid: str = sid
+        self.callback = callback
+        self.priority: int = priority
+        self.dont_filter: bool = dont_filter
+        self.meta: dict[str, Any] = meta if meta else {}
+        self._retry_count: int = _retry_count
+        self._session_kwargs = kwargs if kwargs else {}
+        self._fp: Optional[bytes] = None
+    def copy(self) -> "Request":
+        """Create a copy of this request."""
+        return Request(
+            url=self.url,
+            sid=self.sid,
+            callback=self.callback,
+            priority=self.priority,
+            dont_filter=self.dont_filter,
+            meta=self.meta.copy(),
+            _retry_count=self._retry_count,
+            **self._session_kwargs,
+        )
+    @cached_property
+    def domain(self) -> str:
+        return urlparse(self.url).netloc
+    def update_fingerprint(
+        self,
+        include_kwargs: bool = False,
+        include_headers: bool = False,
+        keep_fragments: bool = False,
+    ) -> bytes:
+        """Generate a unique fingerprint for deduplication.
+        Caches the result in self._fp after first computation.
+        """
+        if self._fp is not None:
+            return self._fp
+        post_data = self._session_kwargs.get("data", {})
+        body = b""
+        if post_data:
+            if isinstance(post_data, dict | list | tuple):
+                body = urlencode(post_data).encode()
+            elif isinstance(post_data, str):
+                body = post_data.encode()
+            elif isinstance(post_data, BytesIO):
+                body = post_data.getvalue()
+            elif isinstance(post_data, bytes):
+                body = post_data
+        else:
+            post_data = self._session_kwargs.get("json", {})
+            body = orjson.dumps(post_data) if post_data else b""
+        data: Dict[str, str | Tuple] = {
+            "sid": self.sid,
+            "body": body.hex(),
+            "method": self._session_kwargs.get("method", "GET"),
+            "url": canonicalize_url(self.url, keep_fragments=keep_fragments),
+        }
+        if include_kwargs:
+            kwargs = (key.lower() for key in self._session_kwargs.keys() if key.lower() not in ("data", "json"))
+            data["kwargs"] = "".join(set(_convert_to_bytes(key).hex() for key in kwargs))
+        if include_headers:
+            headers = self._session_kwargs.get("headers") or self._session_kwargs.get("extra_headers") or {}
+            processed_headers = {}
+            # Some header normalization
+            for key, value in headers.items():
+                processed_headers[_convert_to_bytes(key.lower()).hex()] = _convert_to_bytes(value.lower()).hex()
+            data["headers"] = tuple(processed_headers.items())
+        fp = hashlib.sha1(orjson.dumps(data, option=orjson.OPT_SORT_KEYS), usedforsecurity=False).digest()
+        self._fp = fp
+        return fp
+    def __repr__(self) -> str:
+        callback_name = getattr(self.callback, "__name__", None) or "None"
+        return f"<Request({self.url}) priority={self.priority} callback={callback_name}>"
+    def __str__(self) -> str:
+        return self.url
+    def __lt__(self, other: object) -> bool:
+        """Compare requests by priority"""
+        if not isinstance(other, Request):
+            return NotImplemented
+        return self.priority < other.priority
+    def __gt__(self, other: object) -> bool:
+        """Compare requests by priority"""
+        if not isinstance(other, Request):
+            return NotImplemented
+        return self.priority > other.priority
+    def __eq__(self, other: object) -> bool:
+        """Requests are equal if they have the same fingerprint."""
+        if not isinstance(other, Request):
+            return NotImplemented
+        if self._fp is None or other._fp is None:
+            raise RuntimeError("Cannot compare requests before generating their fingerprints!")
+        return self._fp == other._fp
+    def __getstate__(self) -> dict[str, Any]:
+        """Prepare state for pickling - store callback as name string for pickle compatibility."""
+        state = self.__dict__.copy()
+        state["_callback_name"] = getattr(self.callback, "__name__", None) if self.callback is not None else None
+        state["callback"] = None  # Don't pickle the actual callable
+        return state
+    def __setstate__(self, state: dict[str, Any]) -> None:
+        """Restore state from pickle - callback restored later via _restore_callback()."""
+        self._callback_name: str | None = state.pop("_callback_name", None)
+        self.__dict__.update(state)
+    def _restore_callback(self, spider: "Spider") -> None:
+        """Restore callback from spider after unpickling.
+        :param spider: Spider instance to look up callback method on
+        """
+        if hasattr(self, "_callback_name") and self._callback_name:
+            self.callback = getattr(spider, self._callback_name, None) or spider.parse
+            del self._callback_name
+        elif hasattr(self, "_callback_name"):
+            del self._callback_name

spiders/result.py ADDED Viewed

	@@ -0,0 +1,125 @@

+from pathlib import Path
+from dataclasses import dataclass, field
+import orjson
+from scrapling.core.utils import log
+from scrapling.core._types import Any, Iterator, Dict, List, Tuple, Union
+class ItemList(list):
+    """A list of scraped items with export capabilities."""
+    def to_json(self, path: Union[str, Path], *, indent: bool = False):
+        """Export items to a JSON file.
+        :param path: Path to the output file
+        :param indent: Pretty-print with 2-space indentation (slightly slower)
+        """
+        options = orjson.OPT_SERIALIZE_NUMPY
+        if indent:
+            options |= orjson.OPT_INDENT_2
+        file = Path(path)
+        file.parent.mkdir(parents=True, exist_ok=True)
+        file.write_bytes(orjson.dumps(list(self), option=options))
+        log.info("Saved %d items to %s", len(self), path)
+    def to_jsonl(self, path: Union[str, Path]):
+        """Export items as JSON Lines (one JSON object per line).
+        :param path: Path to the output file
+        """
+        Path(path).parent.mkdir(parents=True, exist_ok=True)
+        with open(path, "wb") as f:
+            for item in self:
+                f.write(orjson.dumps(item, option=orjson.OPT_SERIALIZE_NUMPY))
+                f.write(b"\n")
+        log.info("Saved %d items to %s", len(self), path)
+@dataclass
+class CrawlStats:
+    """Statistics for a crawl run."""
+    requests_count: int = 0
+    concurrent_requests: int = 0
+    concurrent_requests_per_domain: int = 0
+    failed_requests_count: int = 0
+    offsite_requests_count: int = 0
+    response_bytes: int = 0
+    items_scraped: int = 0
+    items_dropped: int = 0
+    start_time: float = 0.0
+    end_time: float = 0.0
+    download_delay: float = 0.0
+    blocked_requests_count: int = 0
+    custom_stats: Dict = field(default_factory=dict)
+    response_status_count: Dict = field(default_factory=dict)
+    domains_response_bytes: Dict = field(default_factory=dict)
+    sessions_requests_count: Dict = field(default_factory=dict)
+    proxies: List[str | Dict | Tuple] = field(default_factory=list)
+    log_levels_counter: Dict = field(default_factory=dict)
+    @property
+    def elapsed_seconds(self) -> float:
+        return self.end_time - self.start_time
+    @property
+    def requests_per_second(self) -> float:
+        if self.elapsed_seconds == 0:
+            return 0.0
+        return self.requests_count / self.elapsed_seconds
+    def increment_status(self, status: int) -> None:
+        self.response_status_count[f"status_{status}"] = self.response_status_count.get(f"status_{status}", 0) + 1
+    def increment_response_bytes(self, domain: str, count: int) -> None:
+        self.response_bytes += count
+        self.domains_response_bytes[domain] = self.domains_response_bytes.get(domain, 0) + count
+    def increment_requests_count(self, sid: str) -> None:
+        self.requests_count += 1
+        self.sessions_requests_count[sid] = self.sessions_requests_count.get(sid, 0) + 1
+    def to_dict(self) -> dict[str, Any]:
+        return {
+            "items_scraped": self.items_scraped,
+            "items_dropped": self.items_dropped,
+            "elapsed_seconds": round(self.elapsed_seconds, 2),
+            "download_delay": round(self.download_delay, 2),
+            "concurrent_requests": self.concurrent_requests,
+            "concurrent_requests_per_domain": self.concurrent_requests_per_domain,
+            "requests_count": self.requests_count,
+            "requests_per_second": round(self.requests_per_second, 2),
+            "sessions_requests_count": self.sessions_requests_count,
+            "failed_requests_count": self.failed_requests_count,
+            "offsite_requests_count": self.offsite_requests_count,
+            "blocked_requests_count": self.blocked_requests_count,
+            "response_status_count": self.response_status_count,
+            "response_bytes": self.response_bytes,
+            "domains_response_bytes": self.domains_response_bytes,
+            "proxies": self.proxies,
+            "custom_stats": self.custom_stats,
+            "log_count": self.log_levels_counter,
+        }
+@dataclass
+class CrawlResult:
+    """Complete result from a spider run."""
+    stats: CrawlStats
+    items: ItemList
+    paused: bool = False
+    @property
+    def completed(self) -> bool:
+        """True if the crawl completed normally (not paused)."""
+        return not self.paused
+    def __len__(self) -> int:
+        return len(self.items)
+    def __iter__(self) -> Iterator[dict[str, Any]]:
+        return iter(self.items)

spiders/scheduler.py ADDED Viewed

	@@ -0,0 +1,80 @@

+import asyncio
+from itertools import count
+from scrapling.core.utils import log
+from scrapling.spiders.request import Request
+from scrapling.core._types import List, Set, Tuple, TYPE_CHECKING
+if TYPE_CHECKING:
+    from scrapling.spiders.checkpoint import CheckpointData
+class Scheduler:
+    """
+    Priority queue with URL deduplication. (heapq)
+    Higher priority requests are processed first.
+    Duplicate URLs are filtered unless dont_filter=True.
+    """
+    def __init__(self, include_kwargs: bool = False, include_headers: bool = False, keep_fragments: bool = False):
+        self._queue: asyncio.PriorityQueue[tuple[int, int, Request]] = asyncio.PriorityQueue()
+        self._seen: set[bytes] = set()
+        self._counter = count()
+        # Mirror dict for snapshot without draining queue
+        self._pending: dict[int, tuple[int, int, Request]] = {}
+        self._include_kwargs = include_kwargs
+        self._include_headers = include_headers
+        self._keep_fragments = keep_fragments
+    async def enqueue(self, request: Request) -> bool:
+        """Add a request to the queue."""
+        fingerprint = request.update_fingerprint(self._include_kwargs, self._include_headers, self._keep_fragments)
+        if not request.dont_filter and fingerprint in self._seen:
+            log.debug("Dropped duplicate request: %s", request)
+            return False
+        self._seen.add(fingerprint)
+        # Negative priority so higher priority = dequeued first
+        counter = next(self._counter)
+        item = (-request.priority, counter, request)
+        self._pending[counter] = item
+        await self._queue.put(item)
+        return True
+    async def dequeue(self) -> Request:
+        """Get the next request to process."""
+        _, counter, request = await self._queue.get()
+        self._pending.pop(counter, None)
+        return request
+    def __len__(self) -> int:
+        return self._queue.qsize()
+    @property
+    def is_empty(self) -> bool:
+        return self._queue.empty()
+    def snapshot(self) -> Tuple[List[Request], Set[bytes]]:
+        """Create a snapshot of the current state for checkpoints."""
+        sorted_items = sorted(self._pending.values(), key=lambda x: (x[0], x[1]))  # Maintain queue order
+        requests = [item[2] for item in sorted_items]
+        return requests, self._seen.copy()
+    def restore(self, data: "CheckpointData") -> None:
+        """Restore scheduler state from checkpoint data.
+        :param data: CheckpointData containing requests and seen set
+        """
+        self._seen = data.seen.copy()
+        # Restore pending requests in order (they're already sorted by priority)
+        for request in data.requests:
+            counter = next(self._counter)
+            item = (-request.priority, counter, request)
+            self._pending[counter] = item
+            self._queue.put_nowait(item)
+        log.info(f"Scheduler restored: {len(data.requests)} requests, {len(data.seen)} seen")

spiders/session.py ADDED Viewed

	@@ -0,0 +1,145 @@

+from asyncio import Lock
+from scrapling.spiders.request import Request
+from scrapling.engines.static import _ASyncSessionLogic
+from scrapling.engines.toolbelt.convertor import Response
+from scrapling.core._types import Set, cast, SUPPORTED_HTTP_METHODS
+from scrapling.fetchers import AsyncDynamicSession, AsyncStealthySession, FetcherSession
+# Type alias covering the session classes that SessionManager stores and fetches with.
+Session = FetcherSession | AsyncDynamicSession | AsyncStealthySession
+class SessionManager:
+    """Manages pre-configured session instances."""
+    def __init__(self) -> None:
+        self._sessions: dict[str, Session] = {}
+        self._default_session_id: str | None = None
+        self._started: bool = False
+        self._lazy_sessions: Set[str] = set()
+        self._lazy_lock = Lock()
+    def add(self, session_id: str, session: Session, *, default: bool = False, lazy: bool = False) -> "SessionManager":
+        """Register a session instance.
+        :param session_id: Name to reference this session in requests
+        :param session: Your pre-configured session instance
+        :param default: If True, this becomes the default session
+        :param lazy: If True, the session will be started only when a request uses its ID.
+        """
+        if session_id in self._sessions:
+            raise ValueError(f"Session '{session_id}' already registered")
+        self._sessions[session_id] = session
+        if default or self._default_session_id is None:
+            self._default_session_id = session_id
+        if lazy:
+            self._lazy_sessions.add(session_id)
+        return self
+    def remove(self, session_id: str) -> None:
+        """Removes a session.
+        :param session_id: ID of session to remove
+        """
+        _ = self.pop(session_id)
+    def pop(self, session_id: str) -> Session:
+        """Remove and returns a session.
+        :param session_id: ID of session to remove
+        """
+        if session_id not in self._sessions:
+            raise KeyError(f"Session '{session_id}' not found")
+        session = self._sessions.pop(session_id)
+        if session_id in self._lazy_sessions:
+            self._lazy_sessions.remove(session_id)
+        if session and self._default_session_id == session_id:
+            self._default_session_id = next(iter(self._sessions), None)
+        return session
+    @property
+    def default_session_id(self) -> str:
+        if self._default_session_id is None:
+            raise RuntimeError("No sessions registered")
+        return self._default_session_id
+    @property
+    def session_ids(self) -> list[str]:
+        return list(self._sessions.keys())
+    def get(self, session_id: str) -> Session:
+        if session_id not in self._sessions:
+            available = ", ".join(self._sessions.keys())
+            raise KeyError(f"Session '{session_id}' not found. Available: {available}")
+        return self._sessions[session_id]
+    async def start(self) -> None:
+        """Start all sessions that aren't already alive."""
+        if self._started:
+            return
+        for sid, session in self._sessions.items():
+            if sid not in self._lazy_sessions and not session._is_alive:
+                await session.__aenter__()
+        self._started = True
+    async def close(self) -> None:
+        """Close all registered sessions."""
+        for session in self._sessions.values():
+            _ = await session.__aexit__(None, None, None)
+        self._started = False
+    async def fetch(self, request: Request) -> Response:
+        sid = request.sid if request.sid else self.default_session_id
+        session = self.get(sid)
+        if session:
+            if sid in self._lazy_sessions and not session._is_alive:
+                async with self._lazy_lock:
+                    if not session._is_alive:
+                        await session.__aenter__()
+            if isinstance(session, FetcherSession):
+                client = session._client
+                if isinstance(client, _ASyncSessionLogic):
+                    response = await client._make_request(
+                        method=cast(SUPPORTED_HTTP_METHODS, request._session_kwargs.pop("method", "GET")),
+                        url=request.url,
+                        **request._session_kwargs,
+                    )
+                else:
+                    # Sync session or other types - shouldn't happen in async context
+                    raise TypeError(f"Session type {type(client)} not supported for async fetch")
+            else:
+                response = await session.fetch(url=request.url, **request._session_kwargs)
+            response.request = request
+            # Merge request meta into response meta (response meta takes priority)
+            response.meta = {**request.meta, **response.meta}
+            return response
+        raise RuntimeError("No session found with the request session id")
+    async def __aenter__(self) -> "SessionManager":
+        await self.start()
+        return self
+    async def __aexit__(self, *exc) -> None:
+        await self.close()
+    def __contains__(self, session_id: str) -> bool:
+        """Check if a session ID is registered."""
+        return session_id in self._sessions
+    def __len__(self) -> int:
+        """Number of registered sessions."""
+        return len(self._sessions)

spiders/spider.py ADDED Viewed

	@@ -0,0 +1,316 @@

+import signal
+import logging
+from pathlib import Path
+from abc import ABC, abstractmethod
+import anyio
+from anyio import Path as AsyncPath
+from scrapling.spiders.request import Request
+from scrapling.spiders.engine import CrawlerEngine
+from scrapling.spiders.session import SessionManager
+from scrapling.core.utils import set_logger, reset_logger
+from scrapling.spiders.result import CrawlResult, CrawlStats
+from scrapling.core._types import Set, Any, Dict, Optional, Union, TYPE_CHECKING, AsyncGenerator
+# HTTP status codes that the default `Spider.is_blocked()` treats as a blocked response.
+BLOCKED_CODES = {401, 403, 407, 429, 444, 500, 502, 503, 504}
+if TYPE_CHECKING:
+    from scrapling.engines.toolbelt.custom import Response
+class LogCounterHandler(logging.Handler):
+    """A logging handler that counts log messages by level."""
+    def __init__(self):
+        super().__init__()
+        self.counts = {
+            logging.DEBUG: 0,
+            logging.INFO: 0,
+            logging.WARNING: 0,
+            logging.ERROR: 0,
+            logging.CRITICAL: 0,
+        }
+    def emit(self, record: logging.LogRecord) -> None:
+        level = record.levelno
+        # Map to the closest standard level
+        if level >= logging.CRITICAL:
+            self.counts[logging.CRITICAL] += 1
+        elif level >= logging.ERROR:
+            self.counts[logging.ERROR] += 1
+        elif level >= logging.WARNING:
+            self.counts[logging.WARNING] += 1
+        elif level >= logging.INFO:
+            self.counts[logging.INFO] += 1
+        else:
+            self.counts[logging.DEBUG] += 1
+    def get_counts(self) -> Dict[str, int]:
+        """Return counts as a dictionary with string keys."""
+        return {
+            "debug": self.counts[logging.DEBUG],
+            "info": self.counts[logging.INFO],
+            "warning": self.counts[logging.WARNING],
+            "error": self.counts[logging.ERROR],
+            "critical": self.counts[logging.CRITICAL],
+        }
+class SessionConfigurationError(Exception):
+    """Raised when session configuration fails."""
+    pass
+class Spider(ABC):
+    """An abstract base class for creating web spiders.
+    Check the documentation website for more information.
+    """
+    name: Optional[str] = None
+    start_urls: list[str] = []
+    allowed_domains: Set[str] = set()
+    # Concurrency settings
+    concurrent_requests: int = 4
+    concurrent_requests_per_domain: int = 0
+    download_delay: float = 0.0
+    max_blocked_retries: int = 3
+    # Fingerprint adjustments
+    fp_include_kwargs: bool = False
+    fp_keep_fragments: bool = False
+    fp_include_headers: bool = False
+    # Logging settings
+    logging_level: int = logging.DEBUG
+    logging_format: str = "[%(asctime)s]:({spider_name}) %(levelname)s: %(message)s"
+    logging_date_format: str = "%Y-%m-%d %H:%M:%S"
+    log_file: Optional[str] = None
+    def __init__(self, crawldir: Optional[Union[str, Path, AsyncPath]] = None, interval: float = 300.0):
+        """Initialize the spider.
+        :param crawldir: Directory for checkpoint files. If provided, enables pause/resume.
+        :param interval: Seconds between periodic checkpoint saves (default 5 minutes).
+        """
+        if self.name is None:
+            raise ValueError(f"{self.__class__.__name__} must have a name.")
+        self.logger = logging.getLogger(f"scrapling.spiders.{self.name}")
+        self.logger.setLevel(self.logging_level)
+        self.logger.handlers.clear()
+        self.logger.propagate = False  # Don't propagate to parent 'scrapling' logger
+        formatter = logging.Formatter(
+            fmt=self.logging_format.format(spider_name=self.name), datefmt=self.logging_date_format
+        )
+        # Add a log counter handler to track log counts by level
+        self._log_counter = LogCounterHandler()
+        self.logger.addHandler(self._log_counter)
+        console_handler = logging.StreamHandler()
+        console_handler.setFormatter(formatter)
+        self.logger.addHandler(console_handler)
+        if self.log_file:
+            Path(self.log_file).parent.mkdir(parents=True, exist_ok=True)
+            file_handler = logging.FileHandler(self.log_file)
+            file_handler.setFormatter(formatter)
+            self.logger.addHandler(file_handler)
+        self.crawldir: Optional[Path] = Path(crawldir) if crawldir else None
+        self._interval = interval
+        self._engine: Optional[CrawlerEngine] = None
+        self._original_sigint_handler: Any = None
+        self._session_manager = SessionManager()
+        try:
+            self.configure_sessions(self._session_manager)
+        except Exception as e:
+            raise SessionConfigurationError(f"Error in {self.__class__.__name__}.configure_sessions(): {e}") from e
+        if len(self._session_manager) == 0:
+            raise SessionConfigurationError(f"{self.__class__.__name__}.configure_sessions() did not add any sessions")
+        self.logger.info("Spider initialized")
+    async def start_requests(self) -> AsyncGenerator[Request, None]:
+        """Generate initial requests to start the crawl.
+        By default, this generates Request objects for each URL in `start_urls`
+        using the session manager's default session and `parse()` as callback.
+        Override this method for more control over initial requests
+        (e.g., to add custom headers, use different callbacks, etc.)
+        """
+        if not self.start_urls:
+            raise RuntimeError(
+                "Spider has no starting point, either set `start_urls` or override `start_requests` function."
+            )
+        for url in self.start_urls:
+            yield Request(url, sid=self._session_manager.default_session_id)
+    @abstractmethod
+    async def parse(self, response: "Response") -> AsyncGenerator[Dict[str, Any] | Request | None, None]:
+        """Default callback for processing responses"""
+        raise NotImplementedError(f"{self.__class__.__name__} must implement parse() method")
+        yield  # Make this a generator for type checkers
+    async def on_start(self, resuming: bool = False) -> None:
+        """Called before crawling starts. Override for setup logic.
+        :param resuming: It's enabled if the spider is resuming from a checkpoint, left for the user to use.
+        """
+        if resuming:
+            self.logger.debug("Resuming spider from checkpoint")
+        else:
+            self.logger.debug("Starting spider")
+    async def on_close(self) -> None:
+        """Called after crawling finishes. Override for cleanup logic."""
+        self.logger.debug("Spider closed")
+    async def on_error(self, request: Request, error: Exception) -> None:
+        """
+        Handle request errors for all spider requests.
+        Override for custom error handling.
+        """
+        pass
+    async def on_scraped_item(self, item: Dict[str, Any]) -> Dict[str, Any] | None:
+        """A hook to be overridden by users to do some processing on scraped items, return `None` to drop the item silently."""
+        return item
+    async def is_blocked(self, response: "Response") -> bool:
+        """Check if the response is blocked. Users should override this for custom detection logic."""
+        if response.status in BLOCKED_CODES:
+            return True
+        return False
+    async def retry_blocked_request(self, request: Request, response: "Response") -> Request:
+        """Users should override this to prepare the blocked request before retrying, if needed."""
+        return request
+    def __repr__(self) -> str:
+        """String representation of the spider."""
+        return f"<{self.__class__.__name__} '{self.name}'>"
+    def configure_sessions(self, manager: SessionManager) -> None:
+        """Configure sessions for this spider.
+        Override this method to add custom sessions.
+        The default implementation creates a FetcherSession session.
+        The first session added becomes the default for `start_requests()` unless specified otherwise.
+        :param manager: SessionManager to configure
+        """
+        from scrapling.fetchers import FetcherSession
+        manager.add("default", FetcherSession())
+    def pause(self):
+        """Request graceful shutdown of the crawling process."""
+        if self._engine:
+            self._engine.request_pause()
+        else:
+            raise RuntimeError("No active crawl to stop")
+    def _setup_signal_handler(self) -> None:
+        """Set up SIGINT handler for graceful pause."""
+        def handler(_signum: int, _frame: Any) -> None:
+            if self._engine:
+                self._engine.request_pause()
+            else:
+                # No engine yet, just raise KeyboardInterrupt
+                raise KeyboardInterrupt
+        try:
+            self._original_sigint_handler = signal.signal(signal.SIGINT, handler)
+        except ValueError:
+            self._original_sigint_handler = None
+    def _restore_signal_handler(self) -> None:
+        """Restore original SIGINT handler."""
+        if self._original_sigint_handler is not None:
+            try:
+                signal.signal(signal.SIGINT, self._original_sigint_handler)
+            except ValueError:
+                pass
+    async def __run(self) -> CrawlResult:
+        token = set_logger(self.logger)
+        try:
+            self._engine = CrawlerEngine(self, self._session_manager, self.crawldir, self._interval)
+            stats = await self._engine.crawl()
+            paused = self._engine.paused
+            return CrawlResult(stats=stats, items=self._engine.items, paused=paused)
+        finally:
+            self._engine = None
+            reset_logger(token)
+            # Close any file handlers to release file resources.
+            if self.log_file:
+                for handler in self.logger.handlers:
+                    if isinstance(handler, logging.FileHandler):
+                        handler.close()
+    def start(self, use_uvloop: bool = False, **backend_options: Any) -> CrawlResult:
+        """Run the spider and return results.
+        This is the main entry point for running a spider.
+        Handles async execution internally via anyio.
+        Pressing Ctrl+C will initiate graceful shutdown (waits for active tasks to complete).
+        Pressing Ctrl+C a second time will force immediate stop.
+        If crawldir is set, a checkpoint will also be saved on graceful shutdown,
+        allowing you to resume the crawl later by running the spider again.
+        :param use_uvloop: Whether to use the faster uvloop/winloop event loop implementation, if available.
+        :param backend_options: Asyncio backend options to be used with `anyio.run`
+        """
+        backend_options = backend_options or {}
+        if use_uvloop:
+            backend_options.update({"use_uvloop": True})
+        # Set up SIGINT handler for graceful shutdown
+        self._setup_signal_handler()
+        try:
+            return anyio.run(self.__run, backend="asyncio", backend_options=backend_options)
+        finally:
+            self._restore_signal_handler()
+    async def stream(self) -> AsyncGenerator[Dict[str, Any], None]:
+        """Stream items as they're scraped. Ideal for long-running spiders or building applications on top of the spiders.
+        Must be called from an async context. Yields items one by one as they are scraped.
+        Access `spider.stats` during iteration for real-time statistics.
+        Note: SIGINT handling for pause/resume is not available in stream mode.
+        """
+        token = set_logger(self.logger)
+        try:
+            self._engine = CrawlerEngine(self, self._session_manager, self.crawldir, self._interval)
+            async for item in self._engine:
+                yield item
+        finally:
+            self._engine = None
+            reset_logger(token)
+            if self.log_file:
+                for handler in self.logger.handlers:
+                    if isinstance(handler, logging.FileHandler):
+                        handler.close()
+    @property
+    def stats(self) -> CrawlStats:
+        """Access current crawl stats (works during streaming)."""
+        if self._engine:
+            return self._engine.stats
+        raise RuntimeError("No active crawl. Use this property inside `async for item in spider.stream():`")

ui.py ADDED Viewed

	@@ -0,0 +1,57 @@

+import gradio as gr
+from scrapling.core.ai import ScraplingMCPServer
+import asyncio
+from typing import Any
+def create_ui():
+    with gr.Blocks(title="Scrapling") as demo:
+        gr.Markdown("# Scrapling Web Interface")
+        with gr.Tab("Fetch (HTTP)"):
+            gr.Markdown("Standard HTTP Fetcher. Fast but less stealthy.")
+            url_input = gr.Textbox(label="URL", placeholder="https://example.com")
+            selector_input = gr.Textbox(label="CSS Selector (Optional)", placeholder=".content")
+            output = gr.JSON(label="Result")
+            fetch_btn = gr.Button("Fetch")
+            async def fetch_wrapper(url, selector):
+                if not url:
+                    return {"error": "URL is required"}
+                try:
+                    # ScraplingMCPServer.get is synchronous or async?
+                    # In code: staticmethod def get(...) -> ResponseModel:
+                    # It calls Fetcher.get which is synchronous.
+                    # Gradio handles async/sync. But running sync function in async context might block.
+                    # Since it is blocking, we should probably run it in executor or just let Gradio handle it.
+                    # But ScraplingMCPServer.get uses 'impersonate' which uses curl_cffi.
+                    result = ScraplingMCPServer.get(url, css_selector=selector if selector else None)
+                    return result.model_dump()
+                except Exception as e:
+                    return {"error": str(e)}
+            fetch_btn.click(fetch_wrapper, inputs=[url_input, selector_input], outputs=output)
+        with gr.Tab("Stealthy Fetch (Browser)"):
+            gr.Markdown("Stealthy Browser Fetcher (Playwright). Slower but bypasses bot protection.")
+            s_url_input = gr.Textbox(label="URL")
+            s_selector_input = gr.Textbox(label="CSS Selector (Optional)")
+            s_headless = gr.Checkbox(label="Headless", value=True)
+            s_output = gr.JSON(label="Result")
+            s_fetch_btn = gr.Button("Stealthy Fetch")
+            async def stealthy_fetch_wrapper(url, selector, headless):
+                if not url:
+                    return {"error": "URL is required"}
+                try:
+                    result = await ScraplingMCPServer.stealthy_fetch(
+                        url,
+                        css_selector=selector if selector else None,
+                        headless=headless
+                    )
+                    return result.model_dump()
+                except Exception as e:
+                    return {"error": str(e)}
+            s_fetch_btn.click(stealthy_fetch_wrapper, inputs=[s_url_input, s_selector_input, s_headless], outputs=s_output)
+    return demo