Spaces:

lenson78
/

Scrapling

Paused

App Files Files Community

Karim Shoair committed on Feb 6

Commit

47dd985

1 Parent(s): f67ebd1

feat(browsers): Add option to block requests to specific domains

Browse files

Files changed (8) hide show

scrapling/engines/_browsers/_base.py +20 -10
scrapling/engines/_browsers/_controllers.py +6 -2
scrapling/engines/_browsers/_stealth.py +6 -2
scrapling/engines/_browsers/_types.py +3 -0
scrapling/engines/_browsers/_validators.py +4 -0
scrapling/engines/toolbelt/navigation.py +50 -22
scrapling/fetchers/chrome.py +4 -2
scrapling/fetchers/stealth_chrome.py +2 -0

scrapling/engines/_browsers/_base.py CHANGED Viewed

@@ -24,11 +24,16 @@ from scrapling.parser import Selector
 from scrapling.engines._browsers._page import PageInfo, PagePool
 from scrapling.engines._browsers._validators import validate, PlaywrightConfig, StealthConfig
 from scrapling.engines._browsers._config_tools import __default_chrome_useragent__, __default_useragent__
-from scrapling.engines.toolbelt.navigation import construct_proxy_dict, intercept_route, async_intercept_route
 from scrapling.core._types import (
     Any,
     Dict,
     List,
     Optional,
     Callable,
     TYPE_CHECKING,
@@ -105,6 +110,7 @@ class SyncSession:
         timeout: int | float,
         extra_headers: Optional[Dict[str, str]],
         disable_resources: bool,
         context: Optional[BrowserContext] = None,
     ) -> PageInfo[Page]:  # pragma: no cover
         """Get a new page to use"""
@@ -117,9 +123,8 @@ class SyncSession:
         if extra_headers:
             page.set_extra_http_headers(extra_headers)
-        if disable_resources:
-            page.route("**/*", intercept_route)
         page_info = self.page_pool.add_page(page)
         page_info.mark_busy()
         return page_info
@@ -173,6 +178,7 @@ class SyncSession:
         extra_headers: Optional[Dict[str, str]],
         disable_resources: bool,
         proxy: Optional[ProxyType] = None,
     ) -> Generator["PageInfo[Page]", None, None]:
         """Acquire a page - either from persistent context or fresh context with proxy."""
         if proxy:
@@ -184,13 +190,13 @@ class SyncSession:
             try:
                 context = self._initialize_context(self._config, context)
-                page_info = self._get_page(timeout, extra_headers, disable_resources, context=context)
                 yield page_info
             finally:
                 context.close()
         else:
             # Standard mode: use PagePool with persistent context
-            page_info = self._get_page(timeout, extra_headers, disable_resources)
             try:
                 yield page_info
             finally:
@@ -261,6 +267,7 @@ class AsyncSession:
         timeout: int | float,
         extra_headers: Optional[Dict[str, str]],
         disable_resources: bool,
         context: Optional[AsyncBrowserContext] = None,
     ) -> PageInfo[AsyncPage]:  # pragma: no cover
         """Get a new page to use"""
@@ -288,8 +295,8 @@ class AsyncSession:
             if extra_headers:
                 await page.set_extra_http_headers(extra_headers)
-            if disable_resources:
-                await page.route("**/*", async_intercept_route)
             return self.page_pool.add_page(page)
@@ -342,6 +349,7 @@ class AsyncSession:
         extra_headers: Optional[Dict[str, str]],
         disable_resources: bool,
         proxy: Optional[ProxyType] = None,
     ) -> AsyncGenerator["PageInfo[AsyncPage]", None]:
         """Acquire a page - either from persistent context or fresh context with proxy."""
         if proxy:
@@ -353,13 +361,15 @@ class AsyncSession:
             try:
                 context = await self._initialize_context(self._config, context)
-                page_info = await self._get_page(timeout, extra_headers, disable_resources, context=context)
                 yield page_info
             finally:
                 await context.close()
         else:
             # Standard mode: use PagePool with persistent context
-            page_info = await self._get_page(timeout, extra_headers, disable_resources)
             try:
                 yield page_info
             finally:

 from scrapling.engines._browsers._page import PageInfo, PagePool
 from scrapling.engines._browsers._validators import validate, PlaywrightConfig, StealthConfig
 from scrapling.engines._browsers._config_tools import __default_chrome_useragent__, __default_useragent__
+from scrapling.engines.toolbelt.navigation import (
+    construct_proxy_dict,
+    create_intercept_handler,
+    create_async_intercept_handler,
+)
 from scrapling.core._types import (
     Any,
     Dict,
     List,
+    Set,
     Optional,
     Callable,
     TYPE_CHECKING,
         timeout: int | float,
         extra_headers: Optional[Dict[str, str]],
         disable_resources: bool,
+        blocked_domains: Optional[Set[str]] = None,
         context: Optional[BrowserContext] = None,
     ) -> PageInfo[Page]:  # pragma: no cover
         """Get a new page to use"""
         if extra_headers:
             page.set_extra_http_headers(extra_headers)
+        if disable_resources or blocked_domains:
+            page.route("**/*", create_intercept_handler(disable_resources, blocked_domains))
         page_info = self.page_pool.add_page(page)
         page_info.mark_busy()
         return page_info
         extra_headers: Optional[Dict[str, str]],
         disable_resources: bool,
         proxy: Optional[ProxyType] = None,
+        blocked_domains: Optional[Set[str]] = None,
     ) -> Generator["PageInfo[Page]", None, None]:
         """Acquire a page - either from persistent context or fresh context with proxy."""
         if proxy:
             try:
                 context = self._initialize_context(self._config, context)
+                page_info = self._get_page(timeout, extra_headers, disable_resources, blocked_domains, context=context)
                 yield page_info
             finally:
                 context.close()
         else:
             # Standard mode: use PagePool with persistent context
+            page_info = self._get_page(timeout, extra_headers, disable_resources, blocked_domains)
             try:
                 yield page_info
             finally:
         timeout: int | float,
         extra_headers: Optional[Dict[str, str]],
         disable_resources: bool,
+        blocked_domains: Optional[Set[str]] = None,
         context: Optional[AsyncBrowserContext] = None,
     ) -> PageInfo[AsyncPage]:  # pragma: no cover
         """Get a new page to use"""
             if extra_headers:
                 await page.set_extra_http_headers(extra_headers)
+            if disable_resources or blocked_domains:
+                await page.route("**/*", create_async_intercept_handler(disable_resources, blocked_domains))
             return self.page_pool.add_page(page)
         extra_headers: Optional[Dict[str, str]],
         disable_resources: bool,
         proxy: Optional[ProxyType] = None,
+        blocked_domains: Optional[Set[str]] = None,
     ) -> AsyncGenerator["PageInfo[AsyncPage]", None]:
         """Acquire a page - either from persistent context or fresh context with proxy."""
         if proxy:
             try:
                 context = await self._initialize_context(self._config, context)
+                page_info = await self._get_page(
+                    timeout, extra_headers, disable_resources, blocked_domains, context=context
+                )
                 yield page_info
             finally:
                 await context.close()
         else:
             # Standard mode: use PagePool with persistent context
+            page_info = await self._get_page(timeout, extra_headers, disable_resources, blocked_domains)
             try:
                 yield page_info
             finally:

scrapling/engines/_browsers/_controllers.py CHANGED Viewed

@@ -43,6 +43,7 @@ class DynamicSession(SyncSession, DynamicSessionMixin):
         :param headless: Run the browser in headless/hidden (default), or headful/visible mode.
         :param disable_resources: Drop requests for unnecessary resources for a speed boost.
             Requests dropped are of type `font`, `image`, `media`, `beacon`, `object`, `imageset`, `texttrack`, `websocket`, `csp_report`, and `stylesheet`.
         :param useragent: Pass a useragent string to be used. Otherwise the fetcher will generate a real Useragent of the same browser and use it.
         :param cookies: Set cookies for the next request.
         :param network_idle: Wait for the page until there are no network connections for at least 500 ms.
@@ -110,6 +111,7 @@ class DynamicSession(SyncSession, DynamicSessionMixin):
         :param extra_headers: A dictionary of extra headers to add to the request. _The referer set by the `google_search` argument takes priority over the referer set here if used together._
         :param disable_resources: Drop requests for unnecessary resources for a speed boost.
             Requests dropped are of type `font`, `image`, `media`, `beacon`, `object`, `imageset`, `texttrack`, `websocket`, `csp_report`, and `stylesheet`.
         :param wait_selector: Wait for a specific CSS selector to be in a specific state.
         :param wait_selector_state: The state to wait for the selector given with `wait_selector`. The default state is `attached`.
         :param network_idle: Wait for the page until there are no network connections for at least 500 ms.
@@ -138,7 +140,7 @@ class DynamicSession(SyncSession, DynamicSessionMixin):
                 proxy = static_proxy
             with self._page_generator(
-                params.timeout, params.extra_headers, params.disable_resources, proxy
             ) as page_info:
                 final_response = [None]
                 page = page_info.page
@@ -208,6 +210,7 @@ class AsyncDynamicSession(AsyncSession, DynamicSessionMixin):
         :param headless: Run the browser in headless/hidden (default), or headful/visible mode.
         :param disable_resources: Drop requests for unnecessary resources for a speed boost.
             Requests dropped are of type `font`, `image`, `media`, `beacon`, `object`, `imageset`, `texttrack`, `websocket`, `csp_report`, and `stylesheet`.
         :param useragent: Pass a useragent string to be used. Otherwise the fetcher will generate a real Useragent of the same browser and use it.
         :param cookies: Set cookies for the next request.
         :param network_idle: Wait for the page until there are no network connections for at least 500 ms.
@@ -277,6 +280,7 @@ class AsyncDynamicSession(AsyncSession, DynamicSessionMixin):
         :param extra_headers: A dictionary of extra headers to add to the request. _The referer set by the `google_search` argument takes priority over the referer set here if used together._
         :param disable_resources: Drop requests for unnecessary resources for a speed boost.
             Requests dropped are of type `font`, `image`, `media`, `beacon`, `object`, `imageset`, `texttrack`, `websocket`, `csp_report`, and `stylesheet`.
         :param wait_selector: Wait for a specific CSS selector to be in a specific state.
         :param wait_selector_state: The state to wait for the selector given with `wait_selector`. The default state is `attached`.
         :param network_idle: Wait for the page until there are no network connections for at least 500 ms.
@@ -306,7 +310,7 @@ class AsyncDynamicSession(AsyncSession, DynamicSessionMixin):
                 proxy = static_proxy
             async with self._page_generator(
-                params.timeout, params.extra_headers, params.disable_resources, proxy
             ) as page_info:
                 final_response = [None]
                 page = page_info.page

         :param headless: Run the browser in headless/hidden (default), or headful/visible mode.
         :param disable_resources: Drop requests for unnecessary resources for a speed boost.
             Requests dropped are of type `font`, `image`, `media`, `beacon`, `object`, `imageset`, `texttrack`, `websocket`, `csp_report`, and `stylesheet`.
+        :param blocked_domains: A set of domain names to block requests to. Subdomains are also matched (e.g., ``"example.com"`` blocks ``"sub.example.com"`` too).
         :param useragent: Pass a useragent string to be used. Otherwise the fetcher will generate a real Useragent of the same browser and use it.
         :param cookies: Set cookies for the next request.
         :param network_idle: Wait for the page until there are no network connections for at least 500 ms.
         :param extra_headers: A dictionary of extra headers to add to the request. _The referer set by the `google_search` argument takes priority over the referer set here if used together._
         :param disable_resources: Drop requests for unnecessary resources for a speed boost.
             Requests dropped are of type `font`, `image`, `media`, `beacon`, `object`, `imageset`, `texttrack`, `websocket`, `csp_report`, and `stylesheet`.
+        :param blocked_domains: A set of domain names to block requests to. Subdomains are also matched (e.g., ``"example.com"`` blocks ``"sub.example.com"`` too).
         :param wait_selector: Wait for a specific CSS selector to be in a specific state.
         :param wait_selector_state: The state to wait for the selector given with `wait_selector`. The default state is `attached`.
         :param network_idle: Wait for the page until there are no network connections for at least 500 ms.
                 proxy = static_proxy
             with self._page_generator(
+                params.timeout, params.extra_headers, params.disable_resources, proxy, params.blocked_domains
             ) as page_info:
                 final_response = [None]
                 page = page_info.page
         :param headless: Run the browser in headless/hidden (default), or headful/visible mode.
         :param disable_resources: Drop requests for unnecessary resources for a speed boost.
             Requests dropped are of type `font`, `image`, `media`, `beacon`, `object`, `imageset`, `texttrack`, `websocket`, `csp_report`, and `stylesheet`.
+        :param blocked_domains: A set of domain names to block requests to. Subdomains are also matched (e.g., ``"example.com"`` blocks ``"sub.example.com"`` too).
         :param useragent: Pass a useragent string to be used. Otherwise the fetcher will generate a real Useragent of the same browser and use it.
         :param cookies: Set cookies for the next request.
         :param network_idle: Wait for the page until there are no network connections for at least 500 ms.
         :param extra_headers: A dictionary of extra headers to add to the request. _The referer set by the `google_search` argument takes priority over the referer set here if used together._
         :param disable_resources: Drop requests for unnecessary resources for a speed boost.
             Requests dropped are of type `font`, `image`, `media`, `beacon`, `object`, `imageset`, `texttrack`, `websocket`, `csp_report`, and `stylesheet`.
+        :param blocked_domains: A set of domain names to block requests to. Subdomains are also matched (e.g., ``"example.com"`` blocks ``"sub.example.com"`` too).
         :param wait_selector: Wait for a specific CSS selector to be in a specific state.
         :param wait_selector_state: The state to wait for the selector given with `wait_selector`. The default state is `attached`.
         :param network_idle: Wait for the page until there are no network connections for at least 500 ms.
                 proxy = static_proxy
             async with self._page_generator(
+                params.timeout, params.extra_headers, params.disable_resources, proxy, params.blocked_domains
             ) as page_info:
                 final_response = [None]
                 page = page_info.page

scrapling/engines/_browsers/_stealth.py CHANGED Viewed

@@ -47,6 +47,7 @@ class StealthySession(SyncSession, StealthySessionMixin):
         :param headless: Run the browser in headless/hidden (default), or headful/visible mode.
         :param disable_resources: Drop requests for unnecessary resources for a speed boost.
             Requests dropped are of type `font`, `image`, `media`, `beacon`, `object`, `imageset`, `texttrack`, `websocket`, `csp_report`, and `stylesheet`.
         :param useragent: Pass a useragent string to be used. Otherwise the fetcher will generate a real Useragent of the same browser and use it.
         :param cookies: Set cookies for the next request.
         :param network_idle: Wait for the page until there are no network connections for at least 500 ms.
@@ -198,6 +199,7 @@ class StealthySession(SyncSession, StealthySessionMixin):
         :param extra_headers: A dictionary of extra headers to add to the request. _The referer set by the `google_search` argument takes priority over the referer set here if used together._
         :param disable_resources: Drop requests for unnecessary resources for a speed boost.
             Requests dropped are of type `font`, `image`, `media`, `beacon`, `object`, `imageset`, `texttrack`, `websocket`, `csp_report`, and `stylesheet`.
         :param wait_selector: Wait for a specific CSS selector to be in a specific state.
         :param wait_selector_state: The state to wait for the selector given with `wait_selector`. The default state is `attached`.
         :param network_idle: Wait for the page until there are no network connections for at least 500 ms.
@@ -227,7 +229,7 @@ class StealthySession(SyncSession, StealthySessionMixin):
                 proxy = static_proxy
             with self._page_generator(
-                params.timeout, params.extra_headers, params.disable_resources, proxy
             ) as page_info:
                 final_response = [None]
                 page = page_info.page
@@ -302,6 +304,7 @@ class AsyncStealthySession(AsyncSession, StealthySessionMixin):
         :param headless: Run the browser in headless/hidden (default), or headful/visible mode.
         :param disable_resources: Drop requests for unnecessary resources for a speed boost.
             Requests dropped are of type `font`, `image`, `media`, `beacon`, `object`, `imageset`, `texttrack`, `websocket`, `csp_report`, and `stylesheet`.
         :param useragent: Pass a useragent string to be used. Otherwise the fetcher will generate a real Useragent of the same browser and use it.
         :param cookies: Set cookies for the next request.
         :param network_idle: Wait for the page until there are no network connections for at least 500 ms.
@@ -454,6 +457,7 @@ class AsyncStealthySession(AsyncSession, StealthySessionMixin):
         :param extra_headers: A dictionary of extra headers to add to the request. _The referer set by the `google_search` argument takes priority over the referer set here if used together._
         :param disable_resources: Drop requests for unnecessary resources for a speed boost.
             Requests dropped are of type `font`, `image`, `media`, `beacon`, `object`, `imageset`, `texttrack`, `websocket`, `csp_report`, and `stylesheet`.
         :param wait_selector: Wait for a specific CSS selector to be in a specific state.
         :param wait_selector_state: The state to wait for the selector given with `wait_selector`. The default state is `attached`.
         :param network_idle: Wait for the page until there are no network connections for at least 500 ms.
@@ -484,7 +488,7 @@ class AsyncStealthySession(AsyncSession, StealthySessionMixin):
                 proxy = static_proxy
             async with self._page_generator(
-                params.timeout, params.extra_headers, params.disable_resources, proxy
             ) as page_info:
                 final_response = [None]
                 page = page_info.page

         :param headless: Run the browser in headless/hidden (default), or headful/visible mode.
         :param disable_resources: Drop requests for unnecessary resources for a speed boost.
             Requests dropped are of type `font`, `image`, `media`, `beacon`, `object`, `imageset`, `texttrack`, `websocket`, `csp_report`, and `stylesheet`.
+        :param blocked_domains: A set of domain names to block requests to. Subdomains are also matched (e.g., ``"example.com"`` blocks ``"sub.example.com"`` too).
         :param useragent: Pass a useragent string to be used. Otherwise the fetcher will generate a real Useragent of the same browser and use it.
         :param cookies: Set cookies for the next request.
         :param network_idle: Wait for the page until there are no network connections for at least 500 ms.
         :param extra_headers: A dictionary of extra headers to add to the request. _The referer set by the `google_search` argument takes priority over the referer set here if used together._
         :param disable_resources: Drop requests for unnecessary resources for a speed boost.
             Requests dropped are of type `font`, `image`, `media`, `beacon`, `object`, `imageset`, `texttrack`, `websocket`, `csp_report`, and `stylesheet`.
+        :param blocked_domains: A set of domain names to block requests to. Subdomains are also matched (e.g., ``"example.com"`` blocks ``"sub.example.com"`` too).
         :param wait_selector: Wait for a specific CSS selector to be in a specific state.
         :param wait_selector_state: The state to wait for the selector given with `wait_selector`. The default state is `attached`.
         :param network_idle: Wait for the page until there are no network connections for at least 500 ms.
                 proxy = static_proxy
             with self._page_generator(
+                params.timeout, params.extra_headers, params.disable_resources, proxy, params.blocked_domains
             ) as page_info:
                 final_response = [None]
                 page = page_info.page
         :param headless: Run the browser in headless/hidden (default), or headful/visible mode.
         :param disable_resources: Drop requests for unnecessary resources for a speed boost.
             Requests dropped are of type `font`, `image`, `media`, `beacon`, `object`, `imageset`, `texttrack`, `websocket`, `csp_report`, and `stylesheet`.
+        :param blocked_domains: A set of domain names to block requests to. Subdomains are also matched (e.g., ``"example.com"`` blocks ``"sub.example.com"`` too).
         :param useragent: Pass a useragent string to be used. Otherwise the fetcher will generate a real Useragent of the same browser and use it.
         :param cookies: Set cookies for the next request.
         :param network_idle: Wait for the page until there are no network connections for at least 500 ms.
         :param extra_headers: A dictionary of extra headers to add to the request. _The referer set by the `google_search` argument takes priority over the referer set here if used together._
         :param disable_resources: Drop requests for unnecessary resources for a speed boost.
             Requests dropped are of type `font`, `image`, `media`, `beacon`, `object`, `imageset`, `texttrack`, `websocket`, `csp_report`, and `stylesheet`.
+        :param blocked_domains: A set of domain names to block requests to. Subdomains are also matched (e.g., ``"example.com"`` blocks ``"sub.example.com"`` too).
         :param wait_selector: Wait for a specific CSS selector to be in a specific state.
         :param wait_selector_state: The state to wait for the selector given with `wait_selector`. The default state is `attached`.
         :param network_idle: Wait for the page until there are no network connections for at least 500 ms.
                 proxy = static_proxy
             async with self._page_generator(
+                params.timeout, params.extra_headers, params.disable_resources, proxy, params.blocked_domains
             ) as page_info:
                 final_response = [None]
                 page = page_info.page

scrapling/engines/_browsers/_types.py CHANGED Viewed

@@ -9,6 +9,7 @@ from curl_cffi.requests import (
 from scrapling.core._types import (
     Dict,
     List,
     Tuple,
     Mapping,
     Optional,
@@ -84,6 +85,7 @@ if TYPE_CHECKING:  # pragma: no cover
         cdp_url: Optional[str]
         useragent: Optional[str]
         extra_flags: Optional[List[str]]
         retries: int
         retry_delay: int | float
@@ -99,6 +101,7 @@ if TYPE_CHECKING:  # pragma: no cover
         selector_config: Optional[Dict]
         extra_headers: Optional[Dict[str, str]]
         wait_selector_state: SelectorWaitStates
         proxy: Optional[str | Dict[str, str]]
     class StealthSession(PlaywrightSession, total=False):

 from scrapling.core._types import (
     Dict,
     List,
+    Set,
     Tuple,
     Mapping,
     Optional,
         cdp_url: Optional[str]
         useragent: Optional[str]
         extra_flags: Optional[List[str]]
+        blocked_domains: Optional[Set[str]]
         retries: int
         retry_delay: int | float
         selector_config: Optional[Dict]
         extra_headers: Optional[Dict[str, str]]
         wait_selector_state: SelectorWaitStates
+        blocked_domains: Optional[Set[str]]
         proxy: Optional[str | Dict[str, str]]
     class StealthSession(PlaywrightSession, total=False):

scrapling/engines/_browsers/_validators.py CHANGED Viewed

@@ -10,6 +10,7 @@ from scrapling.core._types import (
     Any,
     Dict,
     List,
     Tuple,
     Optional,
     Callable,
@@ -83,6 +84,7 @@ class PlaywrightConfig(Struct, kw_only=True, frozen=False, weakref=True):
     cdp_url: Optional[str] = None
     useragent: Optional[str] = None
     extra_flags: Optional[List[str]] = None
     retries: RetriesCount = 3
     retry_delay: Seconds = 1
@@ -145,6 +147,7 @@ class _fetch_params:
     wait_selector_state: SelectorWaitStates
     network_idle: bool
     load_dom: bool
     solve_cloudflare: bool
     selector_config: Dict
@@ -183,6 +186,7 @@ def validate_fetch(
     # solve_cloudflare defaults to False for models that don't have it (PlaywrightConfig)
     result.setdefault("solve_cloudflare", False)
     return _fetch_params(**result)

     Any,
     Dict,
     List,
+    Set,
     Tuple,
     Optional,
     Callable,
     cdp_url: Optional[str] = None
     useragent: Optional[str] = None
     extra_flags: Optional[List[str]] = None
+    blocked_domains: Optional[Set[str]] = None
     retries: RetriesCount = 3
     retry_delay: Seconds = 1
     wait_selector_state: SelectorWaitStates
     network_idle: bool
     load_dom: bool
+    blocked_domains: Optional[Set[str]]
     solve_cloudflare: bool
     selector_config: Dict
     # solve_cloudflare defaults to False for models that don't have it (PlaywrightConfig)
     result.setdefault("solve_cloudflare", False)
+    result.setdefault("blocked_domains", None)
     return _fetch_params(**result)

scrapling/engines/toolbelt/navigation.py CHANGED Viewed

@@ -11,7 +11,7 @@ from msgspec import Struct, structs, convert, ValidationError
 from playwright.sync_api import Route
 from scrapling.core.utils import log
-from scrapling.core._types import Dict, Tuple
 from scrapling.engines.constants import DEFAULT_DISABLED_RESOURCES
 __BYPASSES_DIR__ = Path(__file__).parent / "bypasses"
@@ -23,30 +23,58 @@ class ProxyDict(Struct):
     password: str = ""
-def intercept_route(route: Route):
-    """This is just a route handler, but it drops requests that its type falls in `DEFAULT_DISABLED_RESOURCES`
-    :param route: PlayWright `Route` object of the current page
-    :return: PlayWright `Route` object
     """
-    if route.request.resource_type in DEFAULT_DISABLED_RESOURCES:
-        log.debug(f'Blocking background resource "{route.request.url}" of type "{route.request.resource_type}"')
-        route.abort()
-    else:
-        route.continue_()
-async def async_intercept_route(route: async_Route):
-    """This is just a route handler, but it drops requests that its type falls in `DEFAULT_DISABLED_RESOURCES`
-    :param route: PlayWright `Route` object of the current page
-    :return: PlayWright `Route` object
     """
-    if route.request.resource_type in DEFAULT_DISABLED_RESOURCES:
-        log.debug(f'Blocking background resource "{route.request.url}" of type "{route.request.resource_type}"')
-        await route.abort()
-    else:
-        await route.continue_()
 def construct_proxy_dict(proxy_string: str | Dict[str, str] | Tuple) -> Dict:

 from playwright.sync_api import Route
 from scrapling.core.utils import log
+from scrapling.core._types import Dict, Set, Tuple, Optional, Callable
 from scrapling.engines.constants import DEFAULT_DISABLED_RESOURCES
 __BYPASSES_DIR__ = Path(__file__).parent / "bypasses"
     password: str = ""
def create_intercept_handler(disable_resources: bool, blocked_domains: Optional[Set[str]] = None) -> Callable:
    """Create a route handler that blocks both resource types and specific domains.

    Domain matching is case-insensitive and also covers subdomains: ``"example.com"`` blocks
    ``"sub.example.com"`` but not ``"notexample.com"``. A trailing dot on either the configured
    domain or the requested hostname is ignored.

    :param disable_resources: Whether to block default resource types.
    :param blocked_domains: Set of domain names to block requests to.
    :return: A sync route handler function.
    """
    disabled_resources = DEFAULT_DISABLED_RESOURCES if disable_resources else set()
    # `urlparse(...).hostname` is always lower case, so the configured domains are normalized once
    # here rather than on every request. Entries that are empty after cleanup are dropped, otherwise
    # an empty string would match every URL that has no hostname (e.g. `data:` URLs).
    domains = frozenset(
        filter(None, (domain.strip().strip(".").lower() for domain in blocked_domains or ()))
    )
    # `str.endswith` accepts a tuple, so a single call covers every "<anything>.<domain>" check.
    subdomain_suffixes = tuple(f".{domain}" for domain in domains)

    def handler(route: Route):
        request = route.request
        if request.resource_type in disabled_resources:
            log.debug(f'Blocking background resource "{request.url}" of type "{request.resource_type}"')
            route.abort()
            return

        if domains:
            # URLs without a host yield `None`; fully-qualified names may end with a dot.
            hostname = (urlparse(request.url).hostname or "").rstrip(".")
            if hostname in domains or hostname.endswith(subdomain_suffixes):
                log.debug(f'Blocking request to blocked domain "{hostname}" ({request.url})')
                route.abort()
                return

        route.continue_()

    return handler
def create_async_intercept_handler(disable_resources: bool, blocked_domains: Optional[Set[str]] = None) -> Callable:
    """Create an async route handler that blocks both resource types and specific domains.

    Domain matching is case-insensitive and also covers subdomains: ``"example.com"`` blocks
    ``"sub.example.com"`` but not ``"notexample.com"``. A trailing dot on either the configured
    domain or the requested hostname is ignored.

    :param disable_resources: Whether to block default resource types.
    :param blocked_domains: Set of domain names to block requests to.
    :return: An async route handler function.
    """
    disabled_resources = DEFAULT_DISABLED_RESOURCES if disable_resources else set()
    # `urlparse(...).hostname` is always lower case, so the configured domains are normalized once
    # here rather than on every request. Entries that are empty after cleanup are dropped, otherwise
    # an empty string would match every URL that has no hostname (e.g. `data:` URLs).
    domains = frozenset(
        filter(None, (domain.strip().strip(".").lower() for domain in blocked_domains or ()))
    )
    # `str.endswith` accepts a tuple, so a single call covers every "<anything>.<domain>" check.
    subdomain_suffixes = tuple(f".{domain}" for domain in domains)

    async def handler(route: async_Route):
        request = route.request
        if request.resource_type in disabled_resources:
            log.debug(f'Blocking background resource "{request.url}" of type "{request.resource_type}"')
            await route.abort()
            return

        if domains:
            # URLs without a host yield `None`; fully-qualified names may end with a dot.
            hostname = (urlparse(request.url).hostname or "").rstrip(".")
            if hostname in domains or hostname.endswith(subdomain_suffixes):
                log.debug(f'Blocking request to blocked domain "{hostname}" ({request.url})')
                await route.abort()
                return

        await route.continue_()

    return handler
 def construct_proxy_dict(proxy_string: str | Dict[str, str] | Tuple) -> Dict:

scrapling/fetchers/chrome.py CHANGED Viewed

@@ -13,7 +13,8 @@ class DynamicFetcher(BaseFetcher):
         :param url: Target url.
         :param headless: Run the browser in headless/hidden (default), or headful/visible mode.
-        :param disable_resources: Drop requests of unnecessary resources for a speed boost.
         :param useragent: Pass a useragent string to be used. Otherwise the fetcher will generate a real Useragent of the same browser and use it.
         :param cookies: Set cookies for the next request.
         :param network_idle: Wait for the page until there are no network connections for at least 500 ms.
@@ -55,7 +56,8 @@ class DynamicFetcher(BaseFetcher):
         :param url: Target url.
         :param headless: Run the browser in headless/hidden (default), or headful/visible mode.
-        :param disable_resources: Drop requests of unnecessary resources for a speed boost.
         :param useragent: Pass a useragent string to be used. Otherwise the fetcher will generate a real Useragent of the same browser and use it.
         :param cookies: Set cookies for the next request.
         :param network_idle: Wait for the page until there are no network connections for at least 500 ms.

         :param url: Target url.
         :param headless: Run the browser in headless/hidden (default), or headful/visible mode.
+        :param disable_resources: Drop requests for unnecessary resources for a speed boost.
+        :param blocked_domains: A set of domain names to block requests to. Subdomains are also matched (e.g., ``"example.com"`` blocks ``"sub.example.com"`` too).
         :param useragent: Pass a useragent string to be used. Otherwise the fetcher will generate a real Useragent of the same browser and use it.
         :param cookies: Set cookies for the next request.
         :param network_idle: Wait for the page until there are no network connections for at least 500 ms.
         :param url: Target url.
         :param headless: Run the browser in headless/hidden (default), or headful/visible mode.
+        :param disable_resources: Drop requests for unnecessary resources for a speed boost.
+        :param blocked_domains: A set of domain names to block requests to. Subdomains are also matched (e.g., ``"example.com"`` blocks ``"sub.example.com"`` too).
         :param useragent: Pass a useragent string to be used. Otherwise the fetcher will generate a real Useragent of the same browser and use it.
         :param cookies: Set cookies for the next request.
         :param network_idle: Wait for the page until there are no network connections for at least 500 ms.

scrapling/fetchers/stealth_chrome.py CHANGED Viewed

@@ -19,6 +19,7 @@ class StealthyFetcher(BaseFetcher):
         :param headless: Run the browser in headless/hidden (default), or headful/visible mode.
         :param disable_resources: Drop requests for unnecessary resources for a speed boost.
             Requests dropped are of type `font`, `image`, `media`, `beacon`, `object`, `imageset`, `texttrack`, `websocket`, `csp_report`, and `stylesheet`.
         :param useragent: Pass a useragent string to be used. Otherwise the fetcher will generate a real Useragent of the same browser and use it.
         :param cookies: Set cookies for the next request.
         :param network_idle: Wait for the page until there are no network connections for at least 500 ms.
@@ -67,6 +68,7 @@ class StealthyFetcher(BaseFetcher):
         :param headless: Run the browser in headless/hidden (default), or headful/visible mode.
         :param disable_resources: Drop requests for unnecessary resources for a speed boost.
             Requests dropped are of type `font`, `image`, `media`, `beacon`, `object`, `imageset`, `texttrack`, `websocket`, `csp_report`, and `stylesheet`.
         :param useragent: Pass a useragent string to be used. Otherwise the fetcher will generate a real Useragent of the same browser and use it.
         :param cookies: Set cookies for the next request.
         :param network_idle: Wait for the page until there are no network connections for at least 500 ms.

         :param headless: Run the browser in headless/hidden (default), or headful/visible mode.
         :param disable_resources: Drop requests for unnecessary resources for a speed boost.
             Requests dropped are of type `font`, `image`, `media`, `beacon`, `object`, `imageset`, `texttrack`, `websocket`, `csp_report`, and `stylesheet`.
+        :param blocked_domains: A set of domain names to block requests to. Subdomains are also matched (e.g., ``"example.com"`` blocks ``"sub.example.com"`` too).
         :param useragent: Pass a useragent string to be used. Otherwise the fetcher will generate a real Useragent of the same browser and use it.
         :param cookies: Set cookies for the next request.
         :param network_idle: Wait for the page until there are no network connections for at least 500 ms.
         :param headless: Run the browser in headless/hidden (default), or headful/visible mode.
         :param disable_resources: Drop requests for unnecessary resources for a speed boost.
             Requests dropped are of type `font`, `image`, `media`, `beacon`, `object`, `imageset`, `texttrack`, `websocket`, `csp_report`, and `stylesheet`.
+        :param blocked_domains: A set of domain names to block requests to. Subdomains are also matched (e.g., ``"example.com"`` blocks ``"sub.example.com"`` too).
         :param useragent: Pass a useragent string to be used. Otherwise the fetcher will generate a real Useragent of the same browser and use it.
         :param cookies: Set cookies for the next request.
         :param network_idle: Wait for the page until there are no network connections for at least 500 ms.