Karim shoair committed on
Commit ·
47dd985
1
Parent(s): f67ebd1
feat(browsers): Add option to block requests to specific domains
Browse files- scrapling/engines/_browsers/_base.py +20 -10
- scrapling/engines/_browsers/_controllers.py +6 -2
- scrapling/engines/_browsers/_stealth.py +6 -2
- scrapling/engines/_browsers/_types.py +3 -0
- scrapling/engines/_browsers/_validators.py +4 -0
- scrapling/engines/toolbelt/navigation.py +50 -22
- scrapling/fetchers/chrome.py +4 -2
- scrapling/fetchers/stealth_chrome.py +2 -0
scrapling/engines/_browsers/_base.py
CHANGED
|
@@ -24,11 +24,16 @@ from scrapling.parser import Selector
|
|
| 24 |
from scrapling.engines._browsers._page import PageInfo, PagePool
|
| 25 |
from scrapling.engines._browsers._validators import validate, PlaywrightConfig, StealthConfig
|
| 26 |
from scrapling.engines._browsers._config_tools import __default_chrome_useragent__, __default_useragent__
|
| 27 |
-
from scrapling.engines.toolbelt.navigation import
|
|
|
|
|
|
|
|
|
|
|
|
|
| 28 |
from scrapling.core._types import (
|
| 29 |
Any,
|
| 30 |
Dict,
|
| 31 |
List,
|
|
|
|
| 32 |
Optional,
|
| 33 |
Callable,
|
| 34 |
TYPE_CHECKING,
|
|
@@ -105,6 +110,7 @@ class SyncSession:
|
|
| 105 |
timeout: int | float,
|
| 106 |
extra_headers: Optional[Dict[str, str]],
|
| 107 |
disable_resources: bool,
|
|
|
|
| 108 |
context: Optional[BrowserContext] = None,
|
| 109 |
) -> PageInfo[Page]: # pragma: no cover
|
| 110 |
"""Get a new page to use"""
|
|
@@ -117,9 +123,8 @@ class SyncSession:
|
|
| 117 |
if extra_headers:
|
| 118 |
page.set_extra_http_headers(extra_headers)
|
| 119 |
|
| 120 |
-
if disable_resources:
|
| 121 |
-
page.route("**/*",
|
| 122 |
-
|
| 123 |
page_info = self.page_pool.add_page(page)
|
| 124 |
page_info.mark_busy()
|
| 125 |
return page_info
|
|
@@ -173,6 +178,7 @@ class SyncSession:
|
|
| 173 |
extra_headers: Optional[Dict[str, str]],
|
| 174 |
disable_resources: bool,
|
| 175 |
proxy: Optional[ProxyType] = None,
|
|
|
|
| 176 |
) -> Generator["PageInfo[Page]", None, None]:
|
| 177 |
"""Acquire a page - either from persistent context or fresh context with proxy."""
|
| 178 |
if proxy:
|
|
@@ -184,13 +190,13 @@ class SyncSession:
|
|
| 184 |
|
| 185 |
try:
|
| 186 |
context = self._initialize_context(self._config, context)
|
| 187 |
-
page_info = self._get_page(timeout, extra_headers, disable_resources, context=context)
|
| 188 |
yield page_info
|
| 189 |
finally:
|
| 190 |
context.close()
|
| 191 |
else:
|
| 192 |
# Standard mode: use PagePool with persistent context
|
| 193 |
-
page_info = self._get_page(timeout, extra_headers, disable_resources)
|
| 194 |
try:
|
| 195 |
yield page_info
|
| 196 |
finally:
|
|
@@ -261,6 +267,7 @@ class AsyncSession:
|
|
| 261 |
timeout: int | float,
|
| 262 |
extra_headers: Optional[Dict[str, str]],
|
| 263 |
disable_resources: bool,
|
|
|
|
| 264 |
context: Optional[AsyncBrowserContext] = None,
|
| 265 |
) -> PageInfo[AsyncPage]: # pragma: no cover
|
| 266 |
"""Get a new page to use"""
|
|
@@ -288,8 +295,8 @@ class AsyncSession:
|
|
| 288 |
if extra_headers:
|
| 289 |
await page.set_extra_http_headers(extra_headers)
|
| 290 |
|
| 291 |
-
if disable_resources:
|
| 292 |
-
await page.route("**/*",
|
| 293 |
|
| 294 |
return self.page_pool.add_page(page)
|
| 295 |
|
|
@@ -342,6 +349,7 @@ class AsyncSession:
|
|
| 342 |
extra_headers: Optional[Dict[str, str]],
|
| 343 |
disable_resources: bool,
|
| 344 |
proxy: Optional[ProxyType] = None,
|
|
|
|
| 345 |
) -> AsyncGenerator["PageInfo[AsyncPage]", None]:
|
| 346 |
"""Acquire a page - either from persistent context or fresh context with proxy."""
|
| 347 |
if proxy:
|
|
@@ -353,13 +361,15 @@ class AsyncSession:
|
|
| 353 |
|
| 354 |
try:
|
| 355 |
context = await self._initialize_context(self._config, context)
|
| 356 |
-
page_info = await self._get_page(
|
|
|
|
|
|
|
| 357 |
yield page_info
|
| 358 |
finally:
|
| 359 |
await context.close()
|
| 360 |
else:
|
| 361 |
# Standard mode: use PagePool with persistent context
|
| 362 |
-
page_info = await self._get_page(timeout, extra_headers, disable_resources)
|
| 363 |
try:
|
| 364 |
yield page_info
|
| 365 |
finally:
|
|
|
|
| 24 |
from scrapling.engines._browsers._page import PageInfo, PagePool
|
| 25 |
from scrapling.engines._browsers._validators import validate, PlaywrightConfig, StealthConfig
|
| 26 |
from scrapling.engines._browsers._config_tools import __default_chrome_useragent__, __default_useragent__
|
| 27 |
+
from scrapling.engines.toolbelt.navigation import (
|
| 28 |
+
construct_proxy_dict,
|
| 29 |
+
create_intercept_handler,
|
| 30 |
+
create_async_intercept_handler,
|
| 31 |
+
)
|
| 32 |
from scrapling.core._types import (
|
| 33 |
Any,
|
| 34 |
Dict,
|
| 35 |
List,
|
| 36 |
+
Set,
|
| 37 |
Optional,
|
| 38 |
Callable,
|
| 39 |
TYPE_CHECKING,
|
|
|
|
| 110 |
timeout: int | float,
|
| 111 |
extra_headers: Optional[Dict[str, str]],
|
| 112 |
disable_resources: bool,
|
| 113 |
+
blocked_domains: Optional[Set[str]] = None,
|
| 114 |
context: Optional[BrowserContext] = None,
|
| 115 |
) -> PageInfo[Page]: # pragma: no cover
|
| 116 |
"""Get a new page to use"""
|
|
|
|
| 123 |
if extra_headers:
|
| 124 |
page.set_extra_http_headers(extra_headers)
|
| 125 |
|
| 126 |
+
if disable_resources or blocked_domains:
|
| 127 |
+
page.route("**/*", create_intercept_handler(disable_resources, blocked_domains))
|
|
|
|
| 128 |
page_info = self.page_pool.add_page(page)
|
| 129 |
page_info.mark_busy()
|
| 130 |
return page_info
|
|
|
|
| 178 |
extra_headers: Optional[Dict[str, str]],
|
| 179 |
disable_resources: bool,
|
| 180 |
proxy: Optional[ProxyType] = None,
|
| 181 |
+
blocked_domains: Optional[Set[str]] = None,
|
| 182 |
) -> Generator["PageInfo[Page]", None, None]:
|
| 183 |
"""Acquire a page - either from persistent context or fresh context with proxy."""
|
| 184 |
if proxy:
|
|
|
|
| 190 |
|
| 191 |
try:
|
| 192 |
context = self._initialize_context(self._config, context)
|
| 193 |
+
page_info = self._get_page(timeout, extra_headers, disable_resources, blocked_domains, context=context)
|
| 194 |
yield page_info
|
| 195 |
finally:
|
| 196 |
context.close()
|
| 197 |
else:
|
| 198 |
# Standard mode: use PagePool with persistent context
|
| 199 |
+
page_info = self._get_page(timeout, extra_headers, disable_resources, blocked_domains)
|
| 200 |
try:
|
| 201 |
yield page_info
|
| 202 |
finally:
|
|
|
|
| 267 |
timeout: int | float,
|
| 268 |
extra_headers: Optional[Dict[str, str]],
|
| 269 |
disable_resources: bool,
|
| 270 |
+
blocked_domains: Optional[Set[str]] = None,
|
| 271 |
context: Optional[AsyncBrowserContext] = None,
|
| 272 |
) -> PageInfo[AsyncPage]: # pragma: no cover
|
| 273 |
"""Get a new page to use"""
|
|
|
|
| 295 |
if extra_headers:
|
| 296 |
await page.set_extra_http_headers(extra_headers)
|
| 297 |
|
| 298 |
+
if disable_resources or blocked_domains:
|
| 299 |
+
await page.route("**/*", create_async_intercept_handler(disable_resources, blocked_domains))
|
| 300 |
|
| 301 |
return self.page_pool.add_page(page)
|
| 302 |
|
|
|
|
| 349 |
extra_headers: Optional[Dict[str, str]],
|
| 350 |
disable_resources: bool,
|
| 351 |
proxy: Optional[ProxyType] = None,
|
| 352 |
+
blocked_domains: Optional[Set[str]] = None,
|
| 353 |
) -> AsyncGenerator["PageInfo[AsyncPage]", None]:
|
| 354 |
"""Acquire a page - either from persistent context or fresh context with proxy."""
|
| 355 |
if proxy:
|
|
|
|
| 361 |
|
| 362 |
try:
|
| 363 |
context = await self._initialize_context(self._config, context)
|
| 364 |
+
page_info = await self._get_page(
|
| 365 |
+
timeout, extra_headers, disable_resources, blocked_domains, context=context
|
| 366 |
+
)
|
| 367 |
yield page_info
|
| 368 |
finally:
|
| 369 |
await context.close()
|
| 370 |
else:
|
| 371 |
# Standard mode: use PagePool with persistent context
|
| 372 |
+
page_info = await self._get_page(timeout, extra_headers, disable_resources, blocked_domains)
|
| 373 |
try:
|
| 374 |
yield page_info
|
| 375 |
finally:
|
scrapling/engines/_browsers/_controllers.py
CHANGED
|
@@ -43,6 +43,7 @@ class DynamicSession(SyncSession, DynamicSessionMixin):
|
|
| 43 |
:param headless: Run the browser in headless/hidden (default), or headful/visible mode.
|
| 44 |
:param disable_resources: Drop requests for unnecessary resources for a speed boost.
|
| 45 |
Requests dropped are of type `font`, `image`, `media`, `beacon`, `object`, `imageset`, `texttrack`, `websocket`, `csp_report`, and `stylesheet`.
|
|
|
|
| 46 |
:param useragent: Pass a useragent string to be used. Otherwise the fetcher will generate a real Useragent of the same browser and use it.
|
| 47 |
:param cookies: Set cookies for the next request.
|
| 48 |
:param network_idle: Wait for the page until there are no network connections for at least 500 ms.
|
|
@@ -110,6 +111,7 @@ class DynamicSession(SyncSession, DynamicSessionMixin):
|
|
| 110 |
:param extra_headers: A dictionary of extra headers to add to the request. _The referer set by the `google_search` argument takes priority over the referer set here if used together._
|
| 111 |
:param disable_resources: Drop requests for unnecessary resources for a speed boost.
|
| 112 |
Requests dropped are of type `font`, `image`, `media`, `beacon`, `object`, `imageset`, `texttrack`, `websocket`, `csp_report`, and `stylesheet`.
|
|
|
|
| 113 |
:param wait_selector: Wait for a specific CSS selector to be in a specific state.
|
| 114 |
:param wait_selector_state: The state to wait for the selector given with `wait_selector`. The default state is `attached`.
|
| 115 |
:param network_idle: Wait for the page until there are no network connections for at least 500 ms.
|
|
@@ -138,7 +140,7 @@ class DynamicSession(SyncSession, DynamicSessionMixin):
|
|
| 138 |
proxy = static_proxy
|
| 139 |
|
| 140 |
with self._page_generator(
|
| 141 |
-
params.timeout, params.extra_headers, params.disable_resources, proxy
|
| 142 |
) as page_info:
|
| 143 |
final_response = [None]
|
| 144 |
page = page_info.page
|
|
@@ -208,6 +210,7 @@ class AsyncDynamicSession(AsyncSession, DynamicSessionMixin):
|
|
| 208 |
:param headless: Run the browser in headless/hidden (default), or headful/visible mode.
|
| 209 |
:param disable_resources: Drop requests for unnecessary resources for a speed boost.
|
| 210 |
Requests dropped are of type `font`, `image`, `media`, `beacon`, `object`, `imageset`, `texttrack`, `websocket`, `csp_report`, and `stylesheet`.
|
|
|
|
| 211 |
:param useragent: Pass a useragent string to be used. Otherwise the fetcher will generate a real Useragent of the same browser and use it.
|
| 212 |
:param cookies: Set cookies for the next request.
|
| 213 |
:param network_idle: Wait for the page until there are no network connections for at least 500 ms.
|
|
@@ -277,6 +280,7 @@ class AsyncDynamicSession(AsyncSession, DynamicSessionMixin):
|
|
| 277 |
:param extra_headers: A dictionary of extra headers to add to the request. _The referer set by the `google_search` argument takes priority over the referer set here if used together._
|
| 278 |
:param disable_resources: Drop requests for unnecessary resources for a speed boost.
|
| 279 |
Requests dropped are of type `font`, `image`, `media`, `beacon`, `object`, `imageset`, `texttrack`, `websocket`, `csp_report`, and `stylesheet`.
|
|
|
|
| 280 |
:param wait_selector: Wait for a specific CSS selector to be in a specific state.
|
| 281 |
:param wait_selector_state: The state to wait for the selector given with `wait_selector`. The default state is `attached`.
|
| 282 |
:param network_idle: Wait for the page until there are no network connections for at least 500 ms.
|
|
@@ -306,7 +310,7 @@ class AsyncDynamicSession(AsyncSession, DynamicSessionMixin):
|
|
| 306 |
proxy = static_proxy
|
| 307 |
|
| 308 |
async with self._page_generator(
|
| 309 |
-
params.timeout, params.extra_headers, params.disable_resources, proxy
|
| 310 |
) as page_info:
|
| 311 |
final_response = [None]
|
| 312 |
page = page_info.page
|
|
|
|
| 43 |
:param headless: Run the browser in headless/hidden (default), or headful/visible mode.
|
| 44 |
:param disable_resources: Drop requests for unnecessary resources for a speed boost.
|
| 45 |
Requests dropped are of type `font`, `image`, `media`, `beacon`, `object`, `imageset`, `texttrack`, `websocket`, `csp_report`, and `stylesheet`.
|
| 46 |
+
:param blocked_domains: A set of domain names to block requests to. Subdomains are also matched (e.g., ``"example.com"`` blocks ``"sub.example.com"`` too).
|
| 47 |
:param useragent: Pass a useragent string to be used. Otherwise the fetcher will generate a real Useragent of the same browser and use it.
|
| 48 |
:param cookies: Set cookies for the next request.
|
| 49 |
:param network_idle: Wait for the page until there are no network connections for at least 500 ms.
|
|
|
|
| 111 |
:param extra_headers: A dictionary of extra headers to add to the request. _The referer set by the `google_search` argument takes priority over the referer set here if used together._
|
| 112 |
:param disable_resources: Drop requests for unnecessary resources for a speed boost.
|
| 113 |
Requests dropped are of type `font`, `image`, `media`, `beacon`, `object`, `imageset`, `texttrack`, `websocket`, `csp_report`, and `stylesheet`.
|
| 114 |
+
:param blocked_domains: A set of domain names to block requests to. Subdomains are also matched (e.g., ``"example.com"`` blocks ``"sub.example.com"`` too).
|
| 115 |
:param wait_selector: Wait for a specific CSS selector to be in a specific state.
|
| 116 |
:param wait_selector_state: The state to wait for the selector given with `wait_selector`. The default state is `attached`.
|
| 117 |
:param network_idle: Wait for the page until there are no network connections for at least 500 ms.
|
|
|
|
| 140 |
proxy = static_proxy
|
| 141 |
|
| 142 |
with self._page_generator(
|
| 143 |
+
params.timeout, params.extra_headers, params.disable_resources, proxy, params.blocked_domains
|
| 144 |
) as page_info:
|
| 145 |
final_response = [None]
|
| 146 |
page = page_info.page
|
|
|
|
| 210 |
:param headless: Run the browser in headless/hidden (default), or headful/visible mode.
|
| 211 |
:param disable_resources: Drop requests for unnecessary resources for a speed boost.
|
| 212 |
Requests dropped are of type `font`, `image`, `media`, `beacon`, `object`, `imageset`, `texttrack`, `websocket`, `csp_report`, and `stylesheet`.
|
| 213 |
+
:param blocked_domains: A set of domain names to block requests to. Subdomains are also matched (e.g., ``"example.com"`` blocks ``"sub.example.com"`` too).
|
| 214 |
:param useragent: Pass a useragent string to be used. Otherwise the fetcher will generate a real Useragent of the same browser and use it.
|
| 215 |
:param cookies: Set cookies for the next request.
|
| 216 |
:param network_idle: Wait for the page until there are no network connections for at least 500 ms.
|
|
|
|
| 280 |
:param extra_headers: A dictionary of extra headers to add to the request. _The referer set by the `google_search` argument takes priority over the referer set here if used together._
|
| 281 |
:param disable_resources: Drop requests for unnecessary resources for a speed boost.
|
| 282 |
Requests dropped are of type `font`, `image`, `media`, `beacon`, `object`, `imageset`, `texttrack`, `websocket`, `csp_report`, and `stylesheet`.
|
| 283 |
+
:param blocked_domains: A set of domain names to block requests to. Subdomains are also matched (e.g., ``"example.com"`` blocks ``"sub.example.com"`` too).
|
| 284 |
:param wait_selector: Wait for a specific CSS selector to be in a specific state.
|
| 285 |
:param wait_selector_state: The state to wait for the selector given with `wait_selector`. The default state is `attached`.
|
| 286 |
:param network_idle: Wait for the page until there are no network connections for at least 500 ms.
|
|
|
|
| 310 |
proxy = static_proxy
|
| 311 |
|
| 312 |
async with self._page_generator(
|
| 313 |
+
params.timeout, params.extra_headers, params.disable_resources, proxy, params.blocked_domains
|
| 314 |
) as page_info:
|
| 315 |
final_response = [None]
|
| 316 |
page = page_info.page
|
scrapling/engines/_browsers/_stealth.py
CHANGED
|
@@ -47,6 +47,7 @@ class StealthySession(SyncSession, StealthySessionMixin):
|
|
| 47 |
:param headless: Run the browser in headless/hidden (default), or headful/visible mode.
|
| 48 |
:param disable_resources: Drop requests for unnecessary resources for a speed boost.
|
| 49 |
Requests dropped are of type `font`, `image`, `media`, `beacon`, `object`, `imageset`, `texttrack`, `websocket`, `csp_report`, and `stylesheet`.
|
|
|
|
| 50 |
:param useragent: Pass a useragent string to be used. Otherwise the fetcher will generate a real Useragent of the same browser and use it.
|
| 51 |
:param cookies: Set cookies for the next request.
|
| 52 |
:param network_idle: Wait for the page until there are no network connections for at least 500 ms.
|
|
@@ -198,6 +199,7 @@ class StealthySession(SyncSession, StealthySessionMixin):
|
|
| 198 |
:param extra_headers: A dictionary of extra headers to add to the request. _The referer set by the `google_search` argument takes priority over the referer set here if used together._
|
| 199 |
:param disable_resources: Drop requests for unnecessary resources for a speed boost.
|
| 200 |
Requests dropped are of type `font`, `image`, `media`, `beacon`, `object`, `imageset`, `texttrack`, `websocket`, `csp_report`, and `stylesheet`.
|
|
|
|
| 201 |
:param wait_selector: Wait for a specific CSS selector to be in a specific state.
|
| 202 |
:param wait_selector_state: The state to wait for the selector given with `wait_selector`. The default state is `attached`.
|
| 203 |
:param network_idle: Wait for the page until there are no network connections for at least 500 ms.
|
|
@@ -227,7 +229,7 @@ class StealthySession(SyncSession, StealthySessionMixin):
|
|
| 227 |
proxy = static_proxy
|
| 228 |
|
| 229 |
with self._page_generator(
|
| 230 |
-
params.timeout, params.extra_headers, params.disable_resources, proxy
|
| 231 |
) as page_info:
|
| 232 |
final_response = [None]
|
| 233 |
page = page_info.page
|
|
@@ -302,6 +304,7 @@ class AsyncStealthySession(AsyncSession, StealthySessionMixin):
|
|
| 302 |
:param headless: Run the browser in headless/hidden (default), or headful/visible mode.
|
| 303 |
:param disable_resources: Drop requests for unnecessary resources for a speed boost.
|
| 304 |
Requests dropped are of type `font`, `image`, `media`, `beacon`, `object`, `imageset`, `texttrack`, `websocket`, `csp_report`, and `stylesheet`.
|
|
|
|
| 305 |
:param useragent: Pass a useragent string to be used. Otherwise the fetcher will generate a real Useragent of the same browser and use it.
|
| 306 |
:param cookies: Set cookies for the next request.
|
| 307 |
:param network_idle: Wait for the page until there are no network connections for at least 500 ms.
|
|
@@ -454,6 +457,7 @@ class AsyncStealthySession(AsyncSession, StealthySessionMixin):
|
|
| 454 |
:param extra_headers: A dictionary of extra headers to add to the request. _The referer set by the `google_search` argument takes priority over the referer set here if used together._
|
| 455 |
:param disable_resources: Drop requests for unnecessary resources for a speed boost.
|
| 456 |
Requests dropped are of type `font`, `image`, `media`, `beacon`, `object`, `imageset`, `texttrack`, `websocket`, `csp_report`, and `stylesheet`.
|
|
|
|
| 457 |
:param wait_selector: Wait for a specific CSS selector to be in a specific state.
|
| 458 |
:param wait_selector_state: The state to wait for the selector given with `wait_selector`. The default state is `attached`.
|
| 459 |
:param network_idle: Wait for the page until there are no network connections for at least 500 ms.
|
|
@@ -484,7 +488,7 @@ class AsyncStealthySession(AsyncSession, StealthySessionMixin):
|
|
| 484 |
proxy = static_proxy
|
| 485 |
|
| 486 |
async with self._page_generator(
|
| 487 |
-
params.timeout, params.extra_headers, params.disable_resources, proxy
|
| 488 |
) as page_info:
|
| 489 |
final_response = [None]
|
| 490 |
page = page_info.page
|
|
|
|
| 47 |
:param headless: Run the browser in headless/hidden (default), or headful/visible mode.
|
| 48 |
:param disable_resources: Drop requests for unnecessary resources for a speed boost.
|
| 49 |
Requests dropped are of type `font`, `image`, `media`, `beacon`, `object`, `imageset`, `texttrack`, `websocket`, `csp_report`, and `stylesheet`.
|
| 50 |
+
:param blocked_domains: A set of domain names to block requests to. Subdomains are also matched (e.g., ``"example.com"`` blocks ``"sub.example.com"`` too).
|
| 51 |
:param useragent: Pass a useragent string to be used. Otherwise the fetcher will generate a real Useragent of the same browser and use it.
|
| 52 |
:param cookies: Set cookies for the next request.
|
| 53 |
:param network_idle: Wait for the page until there are no network connections for at least 500 ms.
|
|
|
|
| 199 |
:param extra_headers: A dictionary of extra headers to add to the request. _The referer set by the `google_search` argument takes priority over the referer set here if used together._
|
| 200 |
:param disable_resources: Drop requests for unnecessary resources for a speed boost.
|
| 201 |
Requests dropped are of type `font`, `image`, `media`, `beacon`, `object`, `imageset`, `texttrack`, `websocket`, `csp_report`, and `stylesheet`.
|
| 202 |
+
:param blocked_domains: A set of domain names to block requests to. Subdomains are also matched (e.g., ``"example.com"`` blocks ``"sub.example.com"`` too).
|
| 203 |
:param wait_selector: Wait for a specific CSS selector to be in a specific state.
|
| 204 |
:param wait_selector_state: The state to wait for the selector given with `wait_selector`. The default state is `attached`.
|
| 205 |
:param network_idle: Wait for the page until there are no network connections for at least 500 ms.
|
|
|
|
| 229 |
proxy = static_proxy
|
| 230 |
|
| 231 |
with self._page_generator(
|
| 232 |
+
params.timeout, params.extra_headers, params.disable_resources, proxy, params.blocked_domains
|
| 233 |
) as page_info:
|
| 234 |
final_response = [None]
|
| 235 |
page = page_info.page
|
|
|
|
| 304 |
:param headless: Run the browser in headless/hidden (default), or headful/visible mode.
|
| 305 |
:param disable_resources: Drop requests for unnecessary resources for a speed boost.
|
| 306 |
Requests dropped are of type `font`, `image`, `media`, `beacon`, `object`, `imageset`, `texttrack`, `websocket`, `csp_report`, and `stylesheet`.
|
| 307 |
+
:param blocked_domains: A set of domain names to block requests to. Subdomains are also matched (e.g., ``"example.com"`` blocks ``"sub.example.com"`` too).
|
| 308 |
:param useragent: Pass a useragent string to be used. Otherwise the fetcher will generate a real Useragent of the same browser and use it.
|
| 309 |
:param cookies: Set cookies for the next request.
|
| 310 |
:param network_idle: Wait for the page until there are no network connections for at least 500 ms.
|
|
|
|
| 457 |
:param extra_headers: A dictionary of extra headers to add to the request. _The referer set by the `google_search` argument takes priority over the referer set here if used together._
|
| 458 |
:param disable_resources: Drop requests for unnecessary resources for a speed boost.
|
| 459 |
Requests dropped are of type `font`, `image`, `media`, `beacon`, `object`, `imageset`, `texttrack`, `websocket`, `csp_report`, and `stylesheet`.
|
| 460 |
+
:param blocked_domains: A set of domain names to block requests to. Subdomains are also matched (e.g., ``"example.com"`` blocks ``"sub.example.com"`` too).
|
| 461 |
:param wait_selector: Wait for a specific CSS selector to be in a specific state.
|
| 462 |
:param wait_selector_state: The state to wait for the selector given with `wait_selector`. The default state is `attached`.
|
| 463 |
:param network_idle: Wait for the page until there are no network connections for at least 500 ms.
|
|
|
|
| 488 |
proxy = static_proxy
|
| 489 |
|
| 490 |
async with self._page_generator(
|
| 491 |
+
params.timeout, params.extra_headers, params.disable_resources, proxy, params.blocked_domains
|
| 492 |
) as page_info:
|
| 493 |
final_response = [None]
|
| 494 |
page = page_info.page
|
scrapling/engines/_browsers/_types.py
CHANGED
|
@@ -9,6 +9,7 @@ from curl_cffi.requests import (
|
|
| 9 |
from scrapling.core._types import (
|
| 10 |
Dict,
|
| 11 |
List,
|
|
|
|
| 12 |
Tuple,
|
| 13 |
Mapping,
|
| 14 |
Optional,
|
|
@@ -84,6 +85,7 @@ if TYPE_CHECKING: # pragma: no cover
|
|
| 84 |
cdp_url: Optional[str]
|
| 85 |
useragent: Optional[str]
|
| 86 |
extra_flags: Optional[List[str]]
|
|
|
|
| 87 |
retries: int
|
| 88 |
retry_delay: int | float
|
| 89 |
|
|
@@ -99,6 +101,7 @@ if TYPE_CHECKING: # pragma: no cover
|
|
| 99 |
selector_config: Optional[Dict]
|
| 100 |
extra_headers: Optional[Dict[str, str]]
|
| 101 |
wait_selector_state: SelectorWaitStates
|
|
|
|
| 102 |
proxy: Optional[str | Dict[str, str]]
|
| 103 |
|
| 104 |
class StealthSession(PlaywrightSession, total=False):
|
|
|
|
| 9 |
from scrapling.core._types import (
|
| 10 |
Dict,
|
| 11 |
List,
|
| 12 |
+
Set,
|
| 13 |
Tuple,
|
| 14 |
Mapping,
|
| 15 |
Optional,
|
|
|
|
| 85 |
cdp_url: Optional[str]
|
| 86 |
useragent: Optional[str]
|
| 87 |
extra_flags: Optional[List[str]]
|
| 88 |
+
blocked_domains: Optional[Set[str]]
|
| 89 |
retries: int
|
| 90 |
retry_delay: int | float
|
| 91 |
|
|
|
|
| 101 |
selector_config: Optional[Dict]
|
| 102 |
extra_headers: Optional[Dict[str, str]]
|
| 103 |
wait_selector_state: SelectorWaitStates
|
| 104 |
+
blocked_domains: Optional[Set[str]]
|
| 105 |
proxy: Optional[str | Dict[str, str]]
|
| 106 |
|
| 107 |
class StealthSession(PlaywrightSession, total=False):
|
scrapling/engines/_browsers/_validators.py
CHANGED
|
@@ -10,6 +10,7 @@ from scrapling.core._types import (
|
|
| 10 |
Any,
|
| 11 |
Dict,
|
| 12 |
List,
|
|
|
|
| 13 |
Tuple,
|
| 14 |
Optional,
|
| 15 |
Callable,
|
|
@@ -83,6 +84,7 @@ class PlaywrightConfig(Struct, kw_only=True, frozen=False, weakref=True):
|
|
| 83 |
cdp_url: Optional[str] = None
|
| 84 |
useragent: Optional[str] = None
|
| 85 |
extra_flags: Optional[List[str]] = None
|
|
|
|
| 86 |
retries: RetriesCount = 3
|
| 87 |
retry_delay: Seconds = 1
|
| 88 |
|
|
@@ -145,6 +147,7 @@ class _fetch_params:
|
|
| 145 |
wait_selector_state: SelectorWaitStates
|
| 146 |
network_idle: bool
|
| 147 |
load_dom: bool
|
|
|
|
| 148 |
solve_cloudflare: bool
|
| 149 |
selector_config: Dict
|
| 150 |
|
|
@@ -183,6 +186,7 @@ def validate_fetch(
|
|
| 183 |
|
| 184 |
# solve_cloudflare defaults to False for models that don't have it (PlaywrightConfig)
|
| 185 |
result.setdefault("solve_cloudflare", False)
|
|
|
|
| 186 |
|
| 187 |
return _fetch_params(**result)
|
| 188 |
|
|
|
|
| 10 |
Any,
|
| 11 |
Dict,
|
| 12 |
List,
|
| 13 |
+
Set,
|
| 14 |
Tuple,
|
| 15 |
Optional,
|
| 16 |
Callable,
|
|
|
|
| 84 |
cdp_url: Optional[str] = None
|
| 85 |
useragent: Optional[str] = None
|
| 86 |
extra_flags: Optional[List[str]] = None
|
| 87 |
+
blocked_domains: Optional[Set[str]] = None
|
| 88 |
retries: RetriesCount = 3
|
| 89 |
retry_delay: Seconds = 1
|
| 90 |
|
|
|
|
| 147 |
wait_selector_state: SelectorWaitStates
|
| 148 |
network_idle: bool
|
| 149 |
load_dom: bool
|
| 150 |
+
blocked_domains: Optional[Set[str]]
|
| 151 |
solve_cloudflare: bool
|
| 152 |
selector_config: Dict
|
| 153 |
|
|
|
|
| 186 |
|
| 187 |
# solve_cloudflare defaults to False for models that don't have it (PlaywrightConfig)
|
| 188 |
result.setdefault("solve_cloudflare", False)
|
| 189 |
+
result.setdefault("blocked_domains", None)
|
| 190 |
|
| 191 |
return _fetch_params(**result)
|
| 192 |
|
scrapling/engines/toolbelt/navigation.py
CHANGED
|
@@ -11,7 +11,7 @@ from msgspec import Struct, structs, convert, ValidationError
|
|
| 11 |
from playwright.sync_api import Route
|
| 12 |
|
| 13 |
from scrapling.core.utils import log
|
| 14 |
-
from scrapling.core._types import Dict, Tuple
|
| 15 |
from scrapling.engines.constants import DEFAULT_DISABLED_RESOURCES
|
| 16 |
|
| 17 |
__BYPASSES_DIR__ = Path(__file__).parent / "bypasses"
|
|
@@ -23,30 +23,58 @@ class ProxyDict(Struct):
|
|
| 23 |
password: str = ""
|
| 24 |
|
| 25 |
|
| 26 |
-
def
|
| 27 |
-
"""
|
| 28 |
|
| 29 |
-
:param
|
| 30 |
-
:
|
|
|
|
| 31 |
"""
|
| 32 |
-
if
|
| 33 |
-
|
| 34 |
-
|
| 35 |
-
|
| 36 |
-
route.
|
| 37 |
-
|
| 38 |
-
|
| 39 |
-
|
| 40 |
-
|
| 41 |
-
|
| 42 |
-
|
| 43 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 44 |
"""
|
| 45 |
-
if
|
| 46 |
-
|
| 47 |
-
|
| 48 |
-
|
| 49 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 50 |
|
| 51 |
|
| 52 |
def construct_proxy_dict(proxy_string: str | Dict[str, str] | Tuple) -> Dict:
|
|
|
|
| 11 |
from playwright.sync_api import Route
|
| 12 |
|
| 13 |
from scrapling.core.utils import log
|
| 14 |
+
from scrapling.core._types import Dict, Set, Tuple, Optional, Callable
|
| 15 |
from scrapling.engines.constants import DEFAULT_DISABLED_RESOURCES
|
| 16 |
|
| 17 |
__BYPASSES_DIR__ = Path(__file__).parent / "bypasses"
|
|
|
|
| 23 |
password: str = ""
|
| 24 |
|
| 25 |
|
| 26 |
+
def create_intercept_handler(disable_resources: bool, blocked_domains: Optional[Set[str]] = None) -> Callable:
    """Create a route handler that blocks both resource types and specific domains.

    A request is aborted when its resource type is one of the default disabled
    resource types (only if ``disable_resources`` is true), or when its hostname
    equals one of the blocked domains or is a subdomain of one. Every other
    request is allowed to continue.

    Blocked domain names are normalized once, when the handler is created:
    surrounding whitespace and leading/trailing dots are removed, names are
    lower-cased, and empty entries are discarded. The caller's set is not mutated.

    :param disable_resources: Whether to block default resource types.
    :param blocked_domains: Set of domain names to block requests to. Subdomains are also matched
        (e.g., ``"example.com"`` blocks ``"sub.example.com"`` too).
    :return: A sync route handler function.
    """
    disabled_resources = DEFAULT_DISABLED_RESOURCES if disable_resources else set()

    # `urlparse(...).hostname` is always lower-case, so the configured names must be
    # lower-cased too or mixed-case entries would silently never match. Empty entries
    # are dropped because "" would otherwise match every URL that has no hostname
    # (e.g. `data:` URLs).
    domains = frozenset(
        cleaned for cleaned in (d.strip().strip(".").lower() for d in (blocked_domains or ())) if cleaned
    )
    # Precomputed once so the per-request check is a single `str.endswith` call.
    subdomain_suffixes = tuple("." + d for d in domains)

    def handler(route: Route):
        request = route.request

        if request.resource_type in disabled_resources:
            log.debug(f'Blocking background resource "{request.url}" of type "{request.resource_type}"')
            route.abort()
            return

        if domains:
            # Drop the trailing dot of fully-qualified hostnames ("example.com.") so they
            # cannot be used to bypass the block list.
            hostname = (urlparse(request.url).hostname or "").rstrip(".")
            if hostname in domains or hostname.endswith(subdomain_suffixes):
                log.debug(f'Blocking request to blocked domain "{hostname}" ({request.url})')
                route.abort()
                return

        route.continue_()

    return handler
|
| 51 |
+
|
| 52 |
+
|
| 53 |
+
def create_async_intercept_handler(disable_resources: bool, blocked_domains: Optional[Set[str]] = None) -> Callable:
    """Create an async route handler that blocks both resource types and specific domains.

    A request is aborted when its resource type is one of the default disabled
    resource types (only if ``disable_resources`` is true), or when its hostname
    equals one of the blocked domains or is a subdomain of one. Every other
    request is allowed to continue.

    Blocked domain names are normalized once, when the handler is created:
    surrounding whitespace and leading/trailing dots are removed, names are
    lower-cased, and empty entries are discarded. The caller's set is not mutated.

    :param disable_resources: Whether to block default resource types.
    :param blocked_domains: Set of domain names to block requests to. Subdomains are also matched
        (e.g., ``"example.com"`` blocks ``"sub.example.com"`` too).
    :return: An async route handler function.
    """
    disabled_resources = DEFAULT_DISABLED_RESOURCES if disable_resources else set()

    # `urlparse(...).hostname` is always lower-case, so the configured names must be
    # lower-cased too or mixed-case entries would silently never match. Empty entries
    # are dropped because "" would otherwise match every URL that has no hostname
    # (e.g. `data:` URLs).
    domains = frozenset(
        cleaned for cleaned in (d.strip().strip(".").lower() for d in (blocked_domains or ())) if cleaned
    )
    # Precomputed once so the per-request check is a single `str.endswith` call.
    subdomain_suffixes = tuple("." + d for d in domains)

    async def handler(route: async_Route):
        request = route.request

        if request.resource_type in disabled_resources:
            log.debug(f'Blocking background resource "{request.url}" of type "{request.resource_type}"')
            await route.abort()
            return

        if domains:
            # Drop the trailing dot of fully-qualified hostnames ("example.com.") so they
            # cannot be used to bypass the block list.
            hostname = (urlparse(request.url).hostname or "").rstrip(".")
            if hostname in domains or hostname.endswith(subdomain_suffixes):
                log.debug(f'Blocking request to blocked domain "{hostname}" ({request.url})')
                await route.abort()
                return

        await route.continue_()

    return handler
|
| 78 |
|
| 79 |
|
| 80 |
def construct_proxy_dict(proxy_string: str | Dict[str, str] | Tuple) -> Dict:
|
scrapling/fetchers/chrome.py
CHANGED
|
@@ -13,7 +13,8 @@ class DynamicFetcher(BaseFetcher):
|
|
| 13 |
|
| 14 |
:param url: Target url.
|
| 15 |
:param headless: Run the browser in headless/hidden (default), or headful/visible mode.
|
| 16 |
-
:param disable_resources: Drop requests
|
|
|
|
| 17 |
:param useragent: Pass a useragent string to be used. Otherwise the fetcher will generate a real Useragent of the same browser and use it.
|
| 18 |
:param cookies: Set cookies for the next request.
|
| 19 |
:param network_idle: Wait for the page until there are no network connections for at least 500 ms.
|
|
@@ -55,7 +56,8 @@ class DynamicFetcher(BaseFetcher):
|
|
| 55 |
|
| 56 |
:param url: Target url.
|
| 57 |
:param headless: Run the browser in headless/hidden (default), or headful/visible mode.
|
| 58 |
-
:param disable_resources: Drop requests
|
|
|
|
| 59 |
:param useragent: Pass a useragent string to be used. Otherwise the fetcher will generate a real Useragent of the same browser and use it.
|
| 60 |
:param cookies: Set cookies for the next request.
|
| 61 |
:param network_idle: Wait for the page until there are no network connections for at least 500 ms.
|
|
|
|
| 13 |
|
| 14 |
:param url: Target url.
|
| 15 |
:param headless: Run the browser in headless/hidden (default), or headful/visible mode.
|
| 16 |
+
:param disable_resources: Drop requests for unnecessary resources for a speed boost.
|
| 17 |
+
:param blocked_domains: A set of domain names to block requests to. Subdomains are also matched (e.g., ``"example.com"`` blocks ``"sub.example.com"`` too).
|
| 18 |
:param useragent: Pass a useragent string to be used. Otherwise the fetcher will generate a real Useragent of the same browser and use it.
|
| 19 |
:param cookies: Set cookies for the next request.
|
| 20 |
:param network_idle: Wait for the page until there are no network connections for at least 500 ms.
|
|
|
|
| 56 |
|
| 57 |
:param url: Target url.
|
| 58 |
:param headless: Run the browser in headless/hidden (default), or headful/visible mode.
|
| 59 |
+
:param disable_resources: Drop requests for unnecessary resources for a speed boost.
|
| 60 |
+
:param blocked_domains: A set of domain names to block requests to. Subdomains are also matched (e.g., ``"example.com"`` blocks ``"sub.example.com"`` too).
|
| 61 |
:param useragent: Pass a useragent string to be used. Otherwise the fetcher will generate a real Useragent of the same browser and use it.
|
| 62 |
:param cookies: Set cookies for the next request.
|
| 63 |
:param network_idle: Wait for the page until there are no network connections for at least 500 ms.
|
scrapling/fetchers/stealth_chrome.py
CHANGED
|
@@ -19,6 +19,7 @@ class StealthyFetcher(BaseFetcher):
|
|
| 19 |
:param headless: Run the browser in headless/hidden (default), or headful/visible mode.
|
| 20 |
:param disable_resources: Drop requests for unnecessary resources for a speed boost.
|
| 21 |
Requests dropped are of type `font`, `image`, `media`, `beacon`, `object`, `imageset`, `texttrack`, `websocket`, `csp_report`, and `stylesheet`.
|
|
|
|
| 22 |
:param useragent: Pass a useragent string to be used. Otherwise the fetcher will generate a real Useragent of the same browser and use it.
|
| 23 |
:param cookies: Set cookies for the next request.
|
| 24 |
:param network_idle: Wait for the page until there are no network connections for at least 500 ms.
|
|
@@ -67,6 +68,7 @@ class StealthyFetcher(BaseFetcher):
|
|
| 67 |
:param headless: Run the browser in headless/hidden (default), or headful/visible mode.
|
| 68 |
:param disable_resources: Drop requests for unnecessary resources for a speed boost.
|
| 69 |
Requests dropped are of type `font`, `image`, `media`, `beacon`, `object`, `imageset`, `texttrack`, `websocket`, `csp_report`, and `stylesheet`.
|
|
|
|
| 70 |
:param useragent: Pass a useragent string to be used. Otherwise the fetcher will generate a real Useragent of the same browser and use it.
|
| 71 |
:param cookies: Set cookies for the next request.
|
| 72 |
:param network_idle: Wait for the page until there are no network connections for at least 500 ms.
|
|
|
|
| 19 |
:param headless: Run the browser in headless/hidden (default), or headful/visible mode.
|
| 20 |
:param disable_resources: Drop requests for unnecessary resources for a speed boost.
|
| 21 |
Requests dropped are of type `font`, `image`, `media`, `beacon`, `object`, `imageset`, `texttrack`, `websocket`, `csp_report`, and `stylesheet`.
|
| 22 |
+
:param blocked_domains: A set of domain names to block requests to. Subdomains are also matched (e.g., ``"example.com"`` blocks ``"sub.example.com"`` too).
|
| 23 |
:param useragent: Pass a useragent string to be used. Otherwise the fetcher will generate a real Useragent of the same browser and use it.
|
| 24 |
:param cookies: Set cookies for the next request.
|
| 25 |
:param network_idle: Wait for the page until there are no network connections for at least 500 ms.
|
|
|
|
| 68 |
:param headless: Run the browser in headless/hidden (default), or headful/visible mode.
|
| 69 |
:param disable_resources: Drop requests for unnecessary resources for a speed boost.
|
| 70 |
Requests dropped are of type `font`, `image`, `media`, `beacon`, `object`, `imageset`, `texttrack`, `websocket`, `csp_report`, and `stylesheet`.
|
| 71 |
+
:param blocked_domains: A set of domain names to block requests to. Subdomains are also matched (e.g., ``"example.com"`` blocks ``"sub.example.com"`` too).
|
| 72 |
:param useragent: Pass a useragent string to be used. Otherwise the fetcher will generate a real Useragent of the same browser and use it.
|
| 73 |
:param cookies: Set cookies for the next request.
|
| 74 |
:param network_idle: Wait for the page until there are no network connections for at least 500 ms.
|