Karim shoair committed on
Commit
0cd97d9
·
1 Parent(s): 3ced0d2

feat(fetchers): Adding the foundation of the new browser-based fetchers logic

Browse files
scrapling/core/_types.py CHANGED
@@ -24,6 +24,7 @@ from typing import (
24
 
25
  SUPPORTED_HTTP_METHODS = Literal["GET", "POST", "PUT", "DELETE"]
26
  SelectorWaitStates = Literal["attached", "detached", "hidden", "visible"]
 
27
  StrOrBytes = Union[str, bytes]
28
 
29
  try:
 
24
 
25
  SUPPORTED_HTTP_METHODS = Literal["GET", "POST", "PUT", "DELETE"]
26
  SelectorWaitStates = Literal["attached", "detached", "hidden", "visible"]
27
+ PageLoadStates = Literal["commit", "domcontentloaded", "load", "networkidle"]
28
  StrOrBytes = Union[str, bytes]
29
 
30
  try:
scrapling/engines/_browsers/__init__.py ADDED
@@ -0,0 +1 @@
 
 
1
+ from ._controllers import DynamicSession, AsyncDynamicSession
scrapling/engines/_browsers/_config_tools.py ADDED
@@ -0,0 +1,99 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from functools import lru_cache
2
+
3
+ from scrapling.core._types import Tuple
4
+ from scrapling.engines.constants import DEFAULT_STEALTH_FLAGS, HARMFUL_DEFAULT_ARGS
5
+ from scrapling.engines.toolbelt import js_bypass_path, generate_headers
6
+
7
+ __default_useragent__ = generate_headers(browser_mode=True).get("User-Agent")
8
+
9
+
10
@lru_cache(1)
def _compiled_stealth_scripts():
    """Read the stealth bypass scripts once and cache their contents.

    Returns a tuple of JS source strings (not paths), ready to be passed to
    ``page.add_init_script(script=...)``.
    """
    # Basic bypasses nothing fancy as I'm still working on it
    # But with adding these bypasses to the above config, it bypasses many online tests like
    # https://bot.sannysoft.com/
    # https://kaliiiiiiiiii.github.io/brotector/
    # https://pixelscan.net/
    # https://iphey.com/
    # https://www.browserscan.net/bot-detection <== this one also checks for the CDP runtime fingerprint
    # https://arh.antoinevastel.com/bots/areyouheadless/
    # https://prescience-data.github.io/execution-monitor.html
    script_names = (
        # Order is important
        "webdriver_fully.js",
        "window_chrome.js",
        "navigator_plugins.js",
        "pdf_viewer.js",
        "notification_permission.js",
        "screen_props.js",
        "playwright_fingerprint.js",
    )
    scripts = []
    for script_name in script_names:
        # Explicit UTF-8: the default locale encoding (e.g. cp1252 on Windows)
        # could corrupt non-ASCII characters inside the JS payloads.
        with open(js_bypass_path(script_name), "r", encoding="utf-8") as f:
            scripts.append(f.read())
    return tuple(scripts)
40
+
41
+
42
@lru_cache(2, typed=True)
def _set_flags(hide_canvas, disable_webgl):
    """Returns the flags that will be used while launching the browser if stealth mode is enabled"""
    # Copy into a fresh tuple so the `+=` below can never mutate the shared
    # DEFAULT_STEALTH_FLAGS constant in place (it would if it were a list).
    flags = tuple(DEFAULT_STEALTH_FLAGS)
    if hide_canvas:
        flags += ("--fingerprinting-canvas-image-data-noise",)
    if disable_webgl:
        flags += (
            "--disable-webgl",
            "--disable-webgl-image-chromium",
            "--disable-webgl2",
        )

    return flags
56
+
57
+
58
@lru_cache(2, typed=True)
def _launch_kwargs(headless, real_chrome, stealth, hide_canvas, disable_webgl) -> Tuple:
    """Build the keyword arguments used while launching playwright's browser.

    The result is returned as a hashable tuple of items (so it can live in the
    lru_cache); callers convert it back to a dict before passing to Playwright.
    """
    options = {
        "headless": headless,
        "ignore_default_args": HARMFUL_DEFAULT_ARGS,
        "channel": "chrome" if real_chrome else "chromium",
    }
    if stealth:
        # Stealth launches get the anti-fingerprinting flags and the sandbox
        options["args"] = _set_flags(hide_canvas, disable_webgl)
        options["chromium_sandbox"] = True

    return tuple(options.items())
72
+
73
+
74
@lru_cache(2, typed=True)
def _context_kwargs(proxy, locale, extra_headers, useragent, stealth) -> Tuple:
    """Build the keyword arguments for the browser context.

    Returned as a hashable tuple of items for lru_cache compatibility; callers
    convert it back to a dict before use.
    """
    options = {
        "proxy": proxy or tuple(),
        "locale": locale,
        "color_scheme": "dark",  # Bypasses the 'prefersLightColor' check in creepjs
        "device_scale_factor": 2,
        "extra_http_headers": extra_headers or tuple(),
        "user_agent": useragent or __default_useragent__,
    }
    if stealth:
        options["is_mobile"] = False
        options["has_touch"] = False
        # I'm thinking about disabling it to rest from all Service Workers' headache, but let's keep it as it is for now
        options["service_workers"] = "allow"
        options["ignore_https_errors"] = True
        options["screen"] = {"width": 1920, "height": 1080}
        options["viewport"] = {"width": 1920, "height": 1080}
        options["permissions"] = ["geolocation", "notifications"]

    return tuple(options.items())
scrapling/engines/_browsers/_controllers.py ADDED
@@ -0,0 +1,615 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import time
2
+ import asyncio
3
+
4
+ # from camoufox import AsyncNewBrowser, NewBrowser
5
+ from playwright.sync_api import (
6
+ sync_playwright,
7
+ BrowserType,
8
+ Browser,
9
+ BrowserContext,
10
+ Playwright,
11
+ Locator,
12
+ )
13
+ from playwright.async_api import (
14
+ async_playwright,
15
+ BrowserType as AsyncBrowserType,
16
+ Browser as AsyncBrowser,
17
+ BrowserContext as AsyncBrowserContext,
18
+ Playwright as AsyncPlaywright,
19
+ Locator as AsyncLocator,
20
+ )
21
+ from playwright.sync_api import Response as SyncPlaywrightResponse
22
+ from playwright.async_api import Response as AsyncPlaywrightResponse
23
+ from rebrowser_playwright.sync_api import sync_playwright as sync_rebrowser_playwright
24
+ from rebrowser_playwright.async_api import (
25
+ async_playwright as async_rebrowser_playwright,
26
+ )
27
+
28
+ from scrapling.core.utils import log
29
+ from ._page import PageInfo, PagePool
30
+ from ._validators import validate, PlaywrightConfig
31
+ from ._config_tools import _compiled_stealth_scripts, _launch_kwargs, _context_kwargs
32
+ from scrapling.core._types import (
33
+ Dict,
34
+ Optional,
35
+ Union,
36
+ Iterable,
37
+ Callable,
38
+ SelectorWaitStates,
39
+ )
40
+ from scrapling.engines.toolbelt import (
41
+ Response,
42
+ ResponseFactory,
43
+ generate_convincing_referer,
44
+ intercept_route,
45
+ async_intercept_route,
46
+ )
47
+
48
+
49
class DynamicSession:
    """A Browser session manager with page pooling."""

    __slots__ = (
        "max_pages",
        "headless",
        "hide_canvas",
        "disable_webgl",
        "real_chrome",
        "stealth",
        "google_search",
        "proxy",
        "locale",
        "extra_headers",
        "useragent",
        "timeout",
        "cookies",
        "disable_resources",
        "network_idle",
        "wait_selector",
        "wait_selector_state",
        "wait",
        "playwright",
        "browser",
        "context",
        "page_pool",
        "_closed",
        "adaptor_arguments",
        "page_action",
        "launch_options",
        "context_options",
        "cdp_url",
    )

    def __init__(
        self,
        max_pages: int = 1,
        headless: bool = True,
        google_search: bool = True,
        hide_canvas: bool = False,
        disable_webgl: bool = False,
        real_chrome: bool = False,
        stealth: bool = False,
        wait: Union[int, float] = 0,
        page_action: Optional[Callable] = None,
        proxy: Optional[Union[str, Dict[str, str]]] = None,
        locale: str = "en-US",
        extra_headers: Optional[Dict[str, str]] = None,
        useragent: Optional[str] = None,
        cdp_url: Optional[str] = None,
        timeout: Union[int, float] = 30000,
        disable_resources: bool = False,
        wait_selector: Optional[str] = None,
        cookies: Optional[Iterable[Dict]] = None,
        network_idle: bool = False,
        wait_selector_state: SelectorWaitStates = "attached",
        adaptor_arguments: Optional[Dict] = None,
    ):
        """A Browser session manager with page pooling

        :param headless: Run the browser in headless/hidden (default), or headful/visible mode.
        :param disable_resources: Drop requests of unnecessary resources for a speed boost. It depends, but it made requests ~25% faster in my tests for some websites.
            Requests dropped are of type `font`, `image`, `media`, `beacon`, `object`, `imageset`, `texttrack`, `websocket`, `csp_report`, and `stylesheet`.
            This can help save your proxy usage but be careful with this option as it makes some websites never finish loading.
        :param useragent: Pass a useragent string to be used. Otherwise the fetcher will generate a real Useragent of the same browser and use it.
        :param cookies: Set cookies for the next request.
        :param network_idle: Wait for the page until there are no network connections for at least 500 ms.
        :param timeout: The timeout in milliseconds that is used in all operations and waits through the page. The default is 30,000
        :param wait: The time (milliseconds) the fetcher will wait after everything finishes before closing the page and returning the ` Response ` object.
        :param page_action: Added for automation. A function that takes the `page` object, does the automation you need, then returns `page` again.
        :param wait_selector: Wait for a specific CSS selector to be in a specific state.
        :param locale: Set the locale for the browser if wanted. The default value is `en-US`.
        :param wait_selector_state: The state to wait for the selector given with `wait_selector`. The default state is `attached`.
        :param stealth: Enables stealth mode, check the documentation to see what stealth mode does currently.
        :param real_chrome: If you have a Chrome browser installed on your device, enable this, and the Fetcher will launch an instance of your browser and use it.
        :param hide_canvas: Add random noise to canvas operations to prevent fingerprinting.
        :param disable_webgl: Disables WebGL and WebGL 2.0 support entirely.
        :param cdp_url: Instead of launching a new browser instance, connect to this CDP URL to control real browsers/NSTBrowser through CDP.
        :param google_search: Enabled by default, Scrapling will set the referer header to be as if this request came from a Google search for this website's domain name.
        :param extra_headers: A dictionary of extra headers to add to the request. _The referer set by the `google_search` argument takes priority over the referer set here if used together._
        :param proxy: The proxy to be used with requests, it can be a string or a dictionary with the keys 'server', 'username', and 'password' only.
        :param max_pages: The maximum number of pages to be opened at the same time. It will be used in rotation through a PagePool.
        :param adaptor_arguments: The arguments that will be passed in the end while creating the final Adaptor's class.
        """

        params = {
            "max_pages": max_pages,
            "headless": headless,
            "google_search": google_search,
            "hide_canvas": hide_canvas,
            "disable_webgl": disable_webgl,
            "real_chrome": real_chrome,
            "stealth": stealth,
            "wait": wait,
            "page_action": page_action,
            "proxy": proxy,
            "locale": locale,
            "extra_headers": extra_headers,
            "useragent": useragent,
            "timeout": timeout,
            "adaptor_arguments": adaptor_arguments,
            "disable_resources": disable_resources,
            "wait_selector": wait_selector,
            "cookies": cookies,
            "network_idle": network_idle,
            "wait_selector_state": wait_selector_state,
            "cdp_url": cdp_url,
        }
        # Centralized validation; raises on out-of-range/invalid arguments
        config = validate(params, PlaywrightConfig)

        self.max_pages = config.max_pages
        self.headless = config.headless
        self.hide_canvas = config.hide_canvas
        self.disable_webgl = config.disable_webgl
        self.real_chrome = config.real_chrome
        self.stealth = config.stealth
        self.google_search = config.google_search
        self.wait = config.wait
        self.proxy = config.proxy
        self.locale = config.locale
        self.extra_headers = config.extra_headers
        self.useragent = config.useragent
        self.timeout = config.timeout
        self.cookies = list(config.cookies) if config.cookies else []
        self.disable_resources = config.disable_resources
        self.cdp_url = config.cdp_url
        self.network_idle = config.network_idle
        self.wait_selector = config.wait_selector
        self.wait_selector_state = config.wait_selector_state

        self.playwright: Optional[Playwright] = None
        self.browser: Optional[Union[BrowserType, Browser]] = None
        self.context: Optional[BrowserContext] = None
        self.page_pool = PagePool(self.max_pages)
        self._closed = False
        self.adaptor_arguments = config.adaptor_arguments or {}
        self.page_action = config.page_action
        self.__initiate_browser_options__()

    def __initiate_browser_options__(self):
        """Build the launch/context option dicts from the cached kwarg tuples."""
        # The kwargs helpers return hashable tuples of items (for lru_cache);
        # convert them back into the dicts Playwright expects.
        self.launch_options = dict(
            _launch_kwargs(
                self.headless,
                self.real_chrome,
                self.stealth,
                self.hide_canvas,
                self.disable_webgl,
            )
        )
        self.context_options = dict(
            _context_kwargs(
                self.proxy,
                self.locale,
                tuple(self.extra_headers.items()) if self.extra_headers else tuple(),
                self.useragent,
                self.stealth,
            )
        )
        self.context_options["extra_http_headers"] = dict(
            self.context_options["extra_http_headers"]
        )
        # Playwright expects `proxy=None`, not an empty mapping, when unset
        self.context_options["proxy"] = dict(self.context_options["proxy"]) or None

    def __create__(self):
        """Create a browser for this instance and context."""
        sync_context = sync_rebrowser_playwright
        if not self.stealth or self.real_chrome:
            # Because rebrowser_playwright doesn't play well with real browsers
            sync_context = sync_playwright

        self.playwright = sync_context().start()

        browser_launcher = getattr(
            self.playwright, "chrome" if self.real_chrome else "chromium"
        )
        if self.cdp_url:
            self.browser = browser_launcher.connect_over_cdp(endpoint_url=self.cdp_url)
        else:
            self.browser = browser_launcher.launch(**self.launch_options)

        self.context = self.browser.new_context(**self.context_options)
        if self.cookies:
            self.context.add_cookies(self.cookies)

    def __enter__(self):
        self.__create__()
        return self

    def __exit__(self, exc_type, exc_val, exc_tb):
        self.close()

    def close(self):
        """Close all resources"""
        if self._closed:
            return

        if self.context:
            self.context.close()
            self.context = None

        if self.browser:
            self.browser.close()
            self.browser = None

        if self.playwright:
            self.playwright.stop()
            self.playwright = None

        self._closed = True

    def _get_or_create_page(self) -> PageInfo:
        """Get an available page or create a new one"""
        # Try to get a ready page first
        page_info = self.page_pool.get_ready_page()
        if page_info:
            return page_info

        # Create new page if under limit
        if self.page_pool.pages_count < self.max_pages:
            page = self.context.new_page()
            page.set_default_navigation_timeout(self.timeout)
            page.set_default_timeout(self.timeout)
            if self.extra_headers:
                page.set_extra_http_headers(self.extra_headers)

            if self.disable_resources:
                page.route("**/*", intercept_route)

            if self.stealth:
                for script in _compiled_stealth_scripts():
                    # `_compiled_stealth_scripts` yields the scripts' source
                    # code, not file paths, so pass it through `script=`
                    page.add_init_script(script=script)

            return self.page_pool.add_page(page)

        # Wait for a page to become available
        max_wait = 30
        start_time = time.time()

        while time.time() - start_time < max_wait:
            page_info = self.page_pool.get_ready_page()
            if page_info:
                return page_info
            time.sleep(0.05)

        raise TimeoutError("No pages available within timeout period")

    def fetch(self, url: str) -> Response:
        """Opens up the browser and do your request based on your chosen options.

        :param url: The Target url.
        :return: A `Response` object.
        """
        if self._closed:
            raise RuntimeError("Context manager has been closed")

        final_response = None
        referer = generate_convincing_referer(url) if self.google_search else None

        def handle_response(finished_response: SyncPlaywrightResponse):
            # Track the last navigation response for the main document so
            # redirects end up reported correctly
            nonlocal final_response
            if (
                finished_response.request.resource_type == "document"
                and finished_response.request.is_navigation_request()
            ):
                final_response = finished_response

        page_info = self._get_or_create_page()
        page_info.mark_busy(url=url)

        # Register the listener for this request only; it's removed in the
        # `finally` below so reused pool pages don't accumulate handlers
        page_info.page.on("response", handle_response)
        try:
            # Navigate to URL and wait for a specified state
            first_response = page_info.page.goto(url, referer=referer)
            page_info.page.wait_for_load_state(state="domcontentloaded")

            if self.network_idle:
                page_info.page.wait_for_load_state("networkidle")

            if not first_response:
                raise RuntimeError(f"Failed to get response for {url}")

            if self.page_action is not None:
                try:
                    page_info.page = self.page_action(page_info.page)
                except Exception as e:
                    log.error(f"Error executing page_action: {e}")

            if self.wait_selector:
                try:
                    waiter: Locator = page_info.page.locator(self.wait_selector)
                    waiter.first.wait_for(state=self.wait_selector_state)
                    # Wait again after waiting for the selector, helpful with protections like Cloudflare
                    page_info.page.wait_for_load_state(state="load")
                    page_info.page.wait_for_load_state(state="domcontentloaded")
                    if self.network_idle:
                        page_info.page.wait_for_load_state("networkidle")
                except Exception as e:
                    log.error(f"Error waiting for selector {self.wait_selector}: {e}")

            page_info.page.wait_for_timeout(self.wait)

            # Create response object
            response = ResponseFactory.from_playwright_response(
                page_info.page, first_response, final_response, self.adaptor_arguments
            )

            # Mark page as ready for next use
            page_info.mark_ready()

            return response

        except Exception:
            page_info.mark_error()
            raise
        finally:
            page_info.page.remove_listener("response", handle_response)

    def get_pool_stats(self) -> Dict[str, int]:
        """Get statistics about the current page pool"""
        return {
            "total_pages": self.page_pool.pages_count,
            "ready_pages": self.page_pool.ready_count,
            "busy_pages": self.page_pool.busy_count,
            "max_pages": self.max_pages,
        }
372
+
373
+
374
class AsyncDynamicSession(DynamicSession):
    """A Browser session manager with page pooling"""

    def __init__(
        self,
        max_pages: int = 1,
        headless: bool = True,
        google_search: bool = True,
        hide_canvas: bool = False,
        disable_webgl: bool = False,
        real_chrome: bool = False,
        stealth: bool = False,
        wait: Union[int, float] = 0,
        page_action: Optional[Callable] = None,
        proxy: Optional[Union[str, Dict[str, str]]] = None,
        locale: str = "en-US",
        extra_headers: Optional[Dict[str, str]] = None,
        useragent: Optional[str] = None,
        cdp_url: Optional[str] = None,
        timeout: Union[int, float] = 30000,
        disable_resources: bool = False,
        wait_selector: Optional[str] = None,
        cookies: Optional[Iterable[Dict]] = None,
        network_idle: bool = False,
        wait_selector_state: SelectorWaitStates = "attached",
        adaptor_arguments: Optional[Dict] = None,
    ):
        """A Browser session manager with page pooling

        :param headless: Run the browser in headless/hidden (default), or headful/visible mode.
        :param disable_resources: Drop requests of unnecessary resources for a speed boost. It depends, but it made requests ~25% faster in my tests for some websites.
            Requests dropped are of type `font`, `image`, `media`, `beacon`, `object`, `imageset`, `texttrack`, `websocket`, `csp_report`, and `stylesheet`.
            This can help save your proxy usage but be careful with this option as it makes some websites never finish loading.
        :param useragent: Pass a useragent string to be used. Otherwise the fetcher will generate a real Useragent of the same browser and use it.
        :param cookies: Set cookies for the next request.
        :param network_idle: Wait for the page until there are no network connections for at least 500 ms.
        :param timeout: The timeout in milliseconds that is used in all operations and waits through the page. The default is 30,000
        :param wait: The time (milliseconds) the fetcher will wait after everything finishes before closing the page and returning the ` Response ` object.
        :param page_action: Added for automation. A function that takes the `page` object, does the automation you need, then returns `page` again.
        :param wait_selector: Wait for a specific CSS selector to be in a specific state.
        :param locale: Set the locale for the browser if wanted. The default value is `en-US`.
        :param wait_selector_state: The state to wait for the selector given with `wait_selector`. The default state is `attached`.
        :param stealth: Enables stealth mode, check the documentation to see what stealth mode does currently.
        :param real_chrome: If you have a Chrome browser installed on your device, enable this, and the Fetcher will launch an instance of your browser and use it.
        :param hide_canvas: Add random noise to canvas operations to prevent fingerprinting.
        :param disable_webgl: Disables WebGL and WebGL 2.0 support entirely.
        :param cdp_url: Instead of launching a new browser instance, connect to this CDP URL to control real browsers/NSTBrowser through CDP.
        :param google_search: Enabled by default, Scrapling will set the referer header to be as if this request came from a Google search for this website's domain name.
        :param extra_headers: A dictionary of extra headers to add to the request. _The referer set by the `google_search` argument takes priority over the referer set here if used together._
        :param proxy: The proxy to be used with requests, it can be a string or a dictionary with the keys 'server', 'username', and 'password' only.
        :param max_pages: The maximum number of pages to be opened at the same time. It will be used in rotation through a PagePool.
        :param adaptor_arguments: The arguments that will be passed in the end while creating the final Adaptor's class.
        """

        super().__init__(
            max_pages,
            headless,
            google_search,
            hide_canvas,
            disable_webgl,
            real_chrome,
            stealth,
            wait,
            page_action,
            proxy,
            locale,
            extra_headers,
            useragent,
            cdp_url,
            timeout,
            disable_resources,
            wait_selector,
            cookies,
            network_idle,
            wait_selector_state,
            adaptor_arguments,
        )

        self.playwright: Optional[AsyncPlaywright] = None
        self.browser: Optional[Union[AsyncBrowserType, AsyncBrowser]] = None
        self.context: Optional[AsyncBrowserContext] = None
        # Serializes page creation/lookup across concurrent fetches
        self._lock = asyncio.Lock()

    def __enter__(self):
        # Dunder methods are looked up on the type, so assigning
        # `self.__enter__ = None` (as instance attributes) cannot disable the
        # inherited sync protocol; override it to fail loudly instead.
        raise NotImplementedError(
            "AsyncDynamicSession only supports `async with`, not `with`"
        )

    def __exit__(self, exc_type, exc_val, exc_tb):
        raise NotImplementedError(
            "AsyncDynamicSession only supports `async with`, not `with`"
        )

    async def __create__(self):
        """Create a browser for this instance and context."""
        async_context = async_rebrowser_playwright
        if not self.stealth or self.real_chrome:
            # Because rebrowser_playwright doesn't play well with real browsers
            async_context = async_playwright

        self.playwright: AsyncPlaywright = await async_context().start()

        browser_launcher: AsyncBrowserType = getattr(
            self.playwright, "chrome" if self.real_chrome else "chromium"
        )
        if self.cdp_url:
            self.browser = await browser_launcher.connect_over_cdp(
                endpoint_url=self.cdp_url
            )
        else:
            self.browser = await browser_launcher.launch(**self.launch_options)

        self.context: AsyncBrowserContext = await self.browser.new_context(
            **self.context_options
        )

        if self.cookies:
            await self.context.add_cookies(self.cookies)

    async def __aenter__(self):
        await self.__create__()
        return self

    async def __aexit__(self, exc_type, exc_val, exc_tb):
        await self.close()

    async def close(self):
        """Close all resources"""
        if self._closed:
            return

        if self.context:
            await self.context.close()
            self.context = None

        if self.browser:
            await self.browser.close()
            self.browser = None

        if self.playwright:
            await self.playwright.stop()
            self.playwright = None

        self._closed = True

    async def _get_or_create_page(self) -> PageInfo:
        """Get an available page or create a new one"""
        async with self._lock:
            # Try to get a ready page first
            page_info = self.page_pool.get_ready_page()
            if page_info:
                return page_info

            # Create new page if under limit
            if self.page_pool.pages_count < self.max_pages:
                page = await self.context.new_page()
                page.set_default_navigation_timeout(self.timeout)
                page.set_default_timeout(self.timeout)
                if self.extra_headers:
                    await page.set_extra_http_headers(self.extra_headers)

                if self.disable_resources:
                    await page.route("**/*", async_intercept_route)

                if self.stealth:
                    for script in _compiled_stealth_scripts():
                        # `_compiled_stealth_scripts` yields the scripts'
                        # source code, not file paths, so pass `script=`
                        await page.add_init_script(script=script)

                return self.page_pool.add_page(page)

        # Wait for a page to become available
        max_wait = 30  # seconds
        start_time = time.time()

        while time.time() - start_time < max_wait:
            page_info = self.page_pool.get_ready_page()
            if page_info:
                return page_info
            await asyncio.sleep(0.05)

        raise TimeoutError("No pages available within timeout period")

    async def fetch(self, url: str) -> Response:
        """Opens up the browser and do your request based on your chosen options.

        :param url: The Target url.
        :return: A `Response` object.
        """
        if self._closed:
            raise RuntimeError("Context manager has been closed")

        final_response = None
        referer = generate_convincing_referer(url) if self.google_search else None

        async def handle_response(finished_response: AsyncPlaywrightResponse):
            # Track the last navigation response for the main document so
            # redirects end up reported correctly
            nonlocal final_response
            if (
                finished_response.request.resource_type == "document"
                and finished_response.request.is_navigation_request()
            ):
                final_response = finished_response

        page_info = await self._get_or_create_page()
        page_info.mark_busy(url=url)

        # Register the listener for this request only; it's removed in the
        # `finally` below so reused pool pages don't accumulate handlers
        page_info.page.on("response", handle_response)
        try:
            # Navigate to URL and wait for a specified state
            first_response = await page_info.page.goto(url, referer=referer)
            await page_info.page.wait_for_load_state(state="domcontentloaded")

            if self.network_idle:
                await page_info.page.wait_for_load_state("networkidle")

            if not first_response:
                raise RuntimeError(f"Failed to get response for {url}")

            if self.page_action is not None:
                try:
                    page_info.page = await self.page_action(page_info.page)
                except Exception as e:
                    log.error(f"Error executing page_action: {e}")

            if self.wait_selector:
                try:
                    waiter: AsyncLocator = page_info.page.locator(self.wait_selector)
                    await waiter.first.wait_for(state=self.wait_selector_state)
                    # Wait again after waiting for the selector, helpful with protections like Cloudflare
                    await page_info.page.wait_for_load_state(state="load")
                    await page_info.page.wait_for_load_state(state="domcontentloaded")
                    if self.network_idle:
                        await page_info.page.wait_for_load_state("networkidle")
                except Exception as e:
                    log.error(f"Error waiting for selector {self.wait_selector}: {e}")

            await page_info.page.wait_for_timeout(self.wait)

            # Create response object
            response = await ResponseFactory.from_async_playwright_response(
                page_info.page, first_response, final_response, self.adaptor_arguments
            )

            # Mark page as ready for next use
            page_info.mark_ready()

            return response

        except Exception:
            page_info.mark_error()
            raise
        finally:
            page_info.page.remove_listener("response", handle_response)
scrapling/engines/_browsers/_page.py ADDED
@@ -0,0 +1,93 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from threading import RLock
2
+ from dataclasses import dataclass
3
+
4
+ from playwright.sync_api import Page as SyncPage
5
+ from playwright.async_api import Page as AsyncPage
6
+
7
+ from scrapling.core._types import Optional, Union, List, Literal
8
+
9
+ PageState = Literal["ready", "busy", "error"] # States that a page can be in
10
+
11
+
12
@dataclass
class PageInfo:
    """Information about the page and its current state"""

    __slots__ = ("page", "state", "url")
    page: Union[SyncPage, AsyncPage]  # the wrapped Playwright page/tab
    state: PageState  # one of "ready" / "busy" / "error"
    url: Optional[str]  # URL currently being served when busy, "" otherwise

    def mark_busy(self, url: str = ""):
        """Mark the page as busy"""
        self.state = "busy"
        self.url = url

    def mark_ready(self):
        """Mark the page as ready for new requests"""
        self.state = "ready"
        self.url = ""

    def mark_error(self):
        """Mark the page as having an error"""
        self.state = "error"

    def __repr__(self):
        # `!r` already wraps the value in quotes; wrapping it in literal
        # quotes as well produced doubled quoting like URL="'...'"
        return f"Page(url={self.url!r}, state={self.state!r})"

    def __eq__(self, other_page):
        """Comparing this page to another page object."""
        if other_page.__class__ is not self.__class__:
            return NotImplemented
        return self.page == other_page.page
43
+
44
+
45
class PagePool:
    """Manages a pool of browser pages/tabs with state tracking"""

    __slots__ = ("max_pages", "pages", "_lock")

    def __init__(self, max_pages: int = 5):
        self.max_pages = max_pages  # hard cap on concurrently tracked pages
        self.pages: List[PageInfo] = []
        self._lock = RLock()  # guards all reads/writes of `pages`

    def add_page(self, page: Union[SyncPage, AsyncPage]) -> PageInfo:
        """Register a new page with the pool and return its tracking record."""
        with self._lock:
            if len(self.pages) >= self.max_pages:
                raise RuntimeError(f"Maximum page limit ({self.max_pages}) reached")

            info = PageInfo(page, "ready", "")
            self.pages.append(info)
            return info

    def get_ready_page(self) -> Optional[PageInfo]:
        """Return the first page in the 'ready' state, or None if all are taken."""
        with self._lock:
            return next(
                (info for info in self.pages if info.state == "ready"), None
            )

    @property
    def pages_count(self) -> int:
        """Total number of pages currently tracked by the pool."""
        return len(self.pages)

    @property
    def ready_count(self) -> int:
        """Number of pages currently in the 'ready' state."""
        with self._lock:
            return sum(info.state == "ready" for info in self.pages)

    @property
    def busy_count(self) -> int:
        """Number of pages currently in the 'busy' state."""
        with self._lock:
            return sum(info.state == "busy" for info in self.pages)

    def cleanup_error_pages(self):
        """Drop every page that has been marked as errored."""
        with self._lock:
            self.pages = [info for info in self.pages if info.state != "error"]
scrapling/engines/_browsers/_validators.py ADDED
@@ -0,0 +1,88 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import msgspec
2
+ from urllib.parse import urlparse
3
+
4
+ from scrapling.core._types import (
5
+ Optional,
6
+ Union,
7
+ Dict,
8
+ Callable,
9
+ Iterable,
10
+ SelectorWaitStates,
11
+ )
12
+ from scrapling.engines.toolbelt import construct_proxy_dict
13
+
14
+
15
class PlaywrightConfig(msgspec.Struct, kw_only=True, frozen=False):
    """Configuration struct for validating DynamicFetcher/Playwright arguments."""

    max_pages: int = 1
    cdp_url: Optional[str] = None
    headless: bool = True
    google_search: bool = True
    hide_canvas: bool = False
    disable_webgl: bool = False
    real_chrome: bool = False
    stealth: bool = False
    wait: Union[int, float] = 0
    page_action: Optional[Callable] = None
    proxy: Optional[Union[str, Dict[str, str]]] = (
        None  # The default value for proxy in Playwright's source is `None`
    )
    locale: str = "en-US"
    extra_headers: Optional[Dict[str, str]] = None
    useragent: Optional[str] = None
    timeout: Union[int, float] = 30000
    disable_resources: bool = False
    wait_selector: Optional[str] = None
    cookies: Optional[Iterable[Dict]] = None
    network_idle: bool = False
    wait_selector_state: SelectorWaitStates = "attached"
    adaptor_arguments: Optional[Dict] = None

    def __post_init__(self):
        """Custom validation after msgspec validation.

        :raises ValueError: On out-of-range or malformed values.
        :raises TypeError: If `page_action` is not callable.
        """
        if self.max_pages < 1 or self.max_pages > 50:
            raise ValueError("max_pages must be between 1 and 50")
        # Safety net for direct construction; `msgspec.convert` already enforces the Literal
        if self.wait_selector_state not in (
            "attached",
            "detached",
            "hidden",
            "visible",
        ):
            raise ValueError(f"Invalid wait_selector_state: {self.wait_selector_state}")
        if self.timeout < 0:
            raise ValueError("timeout must be >= 0")
        if self.page_action is not None and not callable(self.page_action):
            raise TypeError(
                f"page_action must be callable, got {type(self.page_action).__name__}"
            )
        if self.proxy:
            # Normalize whatever shape the user passed into Playwright's proxy format
            self.proxy = construct_proxy_dict(self.proxy, as_tuple=True)
        if self.cdp_url:
            self.__validate_cdp(self.cdp_url)

    @staticmethod
    def __validate_cdp(cdp_url):
        """Validate that `cdp_url` looks like a usable CDP websocket endpoint.

        :raises ValueError: If the URL has a wrong scheme, no hostname, or is malformed.
        """
        try:
            # Check the scheme
            if not cdp_url.startswith(("ws://", "wss://")):
                raise ValueError("CDP URL must use 'ws://' or 'wss://' scheme")

            # Validate hostname and port
            if not urlparse(cdp_url).netloc:
                raise ValueError("Invalid hostname for the CDP URL")

        except ValueError:
            # Fix: the blanket `except Exception` below used to catch the two
            # descriptive ValueErrors raised above and re-wrap them into a second,
            # redundant message. Let them propagate as-is.
            raise

        except AttributeError as e:
            # e.g. a non-string was passed, so `.startswith` does not exist
            raise ValueError(f"Malformed CDP URL: {cdp_url}: {str(e)}")

        except Exception as e:
            raise ValueError(f"Invalid CDP URL '{cdp_url}': {str(e)}")
80
+
81
+
82
def validate(params, model):
    """Convert and validate `params` into an instance of `model`.

    :param params: Mapping of raw keyword arguments to validate.
    :param model: The `msgspec.Struct` config class to validate against
        (e.g. `PlaywrightConfig`); its `__post_init__` runs as part of conversion.
    :return: A validated instance of `model`.
    :raises TypeError: If any argument has an invalid type.
    """
    try:
        config = msgspec.convert(params, model)
    except msgspec.ValidationError as e:
        # Chain the original error so the offending field/path stays visible
        raise TypeError(f"Invalid argument type: {e}") from e

    return config
scrapling/engines/constants.py CHANGED
@@ -12,6 +12,15 @@ DEFAULT_DISABLED_RESOURCES = {
12
  "stylesheet",
13
  }
14
 
 
 
 
 
 
 
 
 
 
15
  DEFAULT_STEALTH_FLAGS = (
16
  # Explanation: https://peter.sh/experiments/chromium-command-line-switches/
17
  # Generally this will make the browser faster and less detectable
 
12
  "stylesheet",
13
  }
14
 
15
HARMFUL_DEFAULT_ARGS = (
    # Chromium launch flags that are stripped from the browser defaults: they make
    # automation easier to detect, and `--disable-popup-blocking` enables abuse of
    # the popup crashing bug: https://issues.chromium.org/issues/340836884
    "--enable-automation",
    "--disable-popup-blocking",
    # Candidate flags kept for reference; deliberately not stripped for now:
    # '--disable-component-update',
    # '--disable-default-apps',
    # '--disable-extensions',
)
23
+
24
  DEFAULT_STEALTH_FLAGS = (
25
  # Explanation: https://peter.sh/experiments/chromium-command-line-switches/
26
  # Generally this will make the browser faster and less detectable