Karim shoair commited on
Commit
47dd985
·
1 Parent(s): f67ebd1

feat(browsers): Add option to block requests to specific domains

Browse files
scrapling/engines/_browsers/_base.py CHANGED
@@ -24,11 +24,16 @@ from scrapling.parser import Selector
24
  from scrapling.engines._browsers._page import PageInfo, PagePool
25
  from scrapling.engines._browsers._validators import validate, PlaywrightConfig, StealthConfig
26
  from scrapling.engines._browsers._config_tools import __default_chrome_useragent__, __default_useragent__
27
- from scrapling.engines.toolbelt.navigation import construct_proxy_dict, intercept_route, async_intercept_route
 
 
 
 
28
  from scrapling.core._types import (
29
  Any,
30
  Dict,
31
  List,
 
32
  Optional,
33
  Callable,
34
  TYPE_CHECKING,
@@ -105,6 +110,7 @@ class SyncSession:
105
  timeout: int | float,
106
  extra_headers: Optional[Dict[str, str]],
107
  disable_resources: bool,
 
108
  context: Optional[BrowserContext] = None,
109
  ) -> PageInfo[Page]: # pragma: no cover
110
  """Get a new page to use"""
@@ -117,9 +123,8 @@ class SyncSession:
117
  if extra_headers:
118
  page.set_extra_http_headers(extra_headers)
119
 
120
- if disable_resources:
121
- page.route("**/*", intercept_route)
122
-
123
  page_info = self.page_pool.add_page(page)
124
  page_info.mark_busy()
125
  return page_info
@@ -173,6 +178,7 @@ class SyncSession:
173
  extra_headers: Optional[Dict[str, str]],
174
  disable_resources: bool,
175
  proxy: Optional[ProxyType] = None,
 
176
  ) -> Generator["PageInfo[Page]", None, None]:
177
  """Acquire a page - either from persistent context or fresh context with proxy."""
178
  if proxy:
@@ -184,13 +190,13 @@ class SyncSession:
184
 
185
  try:
186
  context = self._initialize_context(self._config, context)
187
- page_info = self._get_page(timeout, extra_headers, disable_resources, context=context)
188
  yield page_info
189
  finally:
190
  context.close()
191
  else:
192
  # Standard mode: use PagePool with persistent context
193
- page_info = self._get_page(timeout, extra_headers, disable_resources)
194
  try:
195
  yield page_info
196
  finally:
@@ -261,6 +267,7 @@ class AsyncSession:
261
  timeout: int | float,
262
  extra_headers: Optional[Dict[str, str]],
263
  disable_resources: bool,
 
264
  context: Optional[AsyncBrowserContext] = None,
265
  ) -> PageInfo[AsyncPage]: # pragma: no cover
266
  """Get a new page to use"""
@@ -288,8 +295,8 @@ class AsyncSession:
288
  if extra_headers:
289
  await page.set_extra_http_headers(extra_headers)
290
 
291
- if disable_resources:
292
- await page.route("**/*", async_intercept_route)
293
 
294
  return self.page_pool.add_page(page)
295
 
@@ -342,6 +349,7 @@ class AsyncSession:
342
  extra_headers: Optional[Dict[str, str]],
343
  disable_resources: bool,
344
  proxy: Optional[ProxyType] = None,
 
345
  ) -> AsyncGenerator["PageInfo[AsyncPage]", None]:
346
  """Acquire a page - either from persistent context or fresh context with proxy."""
347
  if proxy:
@@ -353,13 +361,15 @@ class AsyncSession:
353
 
354
  try:
355
  context = await self._initialize_context(self._config, context)
356
- page_info = await self._get_page(timeout, extra_headers, disable_resources, context=context)
 
 
357
  yield page_info
358
  finally:
359
  await context.close()
360
  else:
361
  # Standard mode: use PagePool with persistent context
362
- page_info = await self._get_page(timeout, extra_headers, disable_resources)
363
  try:
364
  yield page_info
365
  finally:
 
24
  from scrapling.engines._browsers._page import PageInfo, PagePool
25
  from scrapling.engines._browsers._validators import validate, PlaywrightConfig, StealthConfig
26
  from scrapling.engines._browsers._config_tools import __default_chrome_useragent__, __default_useragent__
27
+ from scrapling.engines.toolbelt.navigation import (
28
+ construct_proxy_dict,
29
+ create_intercept_handler,
30
+ create_async_intercept_handler,
31
+ )
32
  from scrapling.core._types import (
33
  Any,
34
  Dict,
35
  List,
36
+ Set,
37
  Optional,
38
  Callable,
39
  TYPE_CHECKING,
 
110
  timeout: int | float,
111
  extra_headers: Optional[Dict[str, str]],
112
  disable_resources: bool,
113
+ blocked_domains: Optional[Set[str]] = None,
114
  context: Optional[BrowserContext] = None,
115
  ) -> PageInfo[Page]: # pragma: no cover
116
  """Get a new page to use"""
 
123
  if extra_headers:
124
  page.set_extra_http_headers(extra_headers)
125
 
126
+ if disable_resources or blocked_domains:
127
+ page.route("**/*", create_intercept_handler(disable_resources, blocked_domains))
 
128
  page_info = self.page_pool.add_page(page)
129
  page_info.mark_busy()
130
  return page_info
 
178
  extra_headers: Optional[Dict[str, str]],
179
  disable_resources: bool,
180
  proxy: Optional[ProxyType] = None,
181
+ blocked_domains: Optional[Set[str]] = None,
182
  ) -> Generator["PageInfo[Page]", None, None]:
183
  """Acquire a page - either from persistent context or fresh context with proxy."""
184
  if proxy:
 
190
 
191
  try:
192
  context = self._initialize_context(self._config, context)
193
+ page_info = self._get_page(timeout, extra_headers, disable_resources, blocked_domains, context=context)
194
  yield page_info
195
  finally:
196
  context.close()
197
  else:
198
  # Standard mode: use PagePool with persistent context
199
+ page_info = self._get_page(timeout, extra_headers, disable_resources, blocked_domains)
200
  try:
201
  yield page_info
202
  finally:
 
267
  timeout: int | float,
268
  extra_headers: Optional[Dict[str, str]],
269
  disable_resources: bool,
270
+ blocked_domains: Optional[Set[str]] = None,
271
  context: Optional[AsyncBrowserContext] = None,
272
  ) -> PageInfo[AsyncPage]: # pragma: no cover
273
  """Get a new page to use"""
 
295
  if extra_headers:
296
  await page.set_extra_http_headers(extra_headers)
297
 
298
+ if disable_resources or blocked_domains:
299
+ await page.route("**/*", create_async_intercept_handler(disable_resources, blocked_domains))
300
 
301
  return self.page_pool.add_page(page)
302
 
 
349
  extra_headers: Optional[Dict[str, str]],
350
  disable_resources: bool,
351
  proxy: Optional[ProxyType] = None,
352
+ blocked_domains: Optional[Set[str]] = None,
353
  ) -> AsyncGenerator["PageInfo[AsyncPage]", None]:
354
  """Acquire a page - either from persistent context or fresh context with proxy."""
355
  if proxy:
 
361
 
362
  try:
363
  context = await self._initialize_context(self._config, context)
364
+ page_info = await self._get_page(
365
+ timeout, extra_headers, disable_resources, blocked_domains, context=context
366
+ )
367
  yield page_info
368
  finally:
369
  await context.close()
370
  else:
371
  # Standard mode: use PagePool with persistent context
372
+ page_info = await self._get_page(timeout, extra_headers, disable_resources, blocked_domains)
373
  try:
374
  yield page_info
375
  finally:
scrapling/engines/_browsers/_controllers.py CHANGED
@@ -43,6 +43,7 @@ class DynamicSession(SyncSession, DynamicSessionMixin):
43
  :param headless: Run the browser in headless/hidden (default), or headful/visible mode.
44
  :param disable_resources: Drop requests for unnecessary resources for a speed boost.
45
  Requests dropped are of type `font`, `image`, `media`, `beacon`, `object`, `imageset`, `texttrack`, `websocket`, `csp_report`, and `stylesheet`.
 
46
  :param useragent: Pass a useragent string to be used. Otherwise the fetcher will generate a real Useragent of the same browser and use it.
47
  :param cookies: Set cookies for the next request.
48
  :param network_idle: Wait for the page until there are no network connections for at least 500 ms.
@@ -110,6 +111,7 @@ class DynamicSession(SyncSession, DynamicSessionMixin):
110
  :param extra_headers: A dictionary of extra headers to add to the request. _The referer set by the `google_search` argument takes priority over the referer set here if used together._
111
  :param disable_resources: Drop requests for unnecessary resources for a speed boost.
112
  Requests dropped are of type `font`, `image`, `media`, `beacon`, `object`, `imageset`, `texttrack`, `websocket`, `csp_report`, and `stylesheet`.
 
113
  :param wait_selector: Wait for a specific CSS selector to be in a specific state.
114
  :param wait_selector_state: The state to wait for the selector given with `wait_selector`. The default state is `attached`.
115
  :param network_idle: Wait for the page until there are no network connections for at least 500 ms.
@@ -138,7 +140,7 @@ class DynamicSession(SyncSession, DynamicSessionMixin):
138
  proxy = static_proxy
139
 
140
  with self._page_generator(
141
- params.timeout, params.extra_headers, params.disable_resources, proxy
142
  ) as page_info:
143
  final_response = [None]
144
  page = page_info.page
@@ -208,6 +210,7 @@ class AsyncDynamicSession(AsyncSession, DynamicSessionMixin):
208
  :param headless: Run the browser in headless/hidden (default), or headful/visible mode.
209
  :param disable_resources: Drop requests for unnecessary resources for a speed boost.
210
  Requests dropped are of type `font`, `image`, `media`, `beacon`, `object`, `imageset`, `texttrack`, `websocket`, `csp_report`, and `stylesheet`.
 
211
  :param useragent: Pass a useragent string to be used. Otherwise the fetcher will generate a real Useragent of the same browser and use it.
212
  :param cookies: Set cookies for the next request.
213
  :param network_idle: Wait for the page until there are no network connections for at least 500 ms.
@@ -277,6 +280,7 @@ class AsyncDynamicSession(AsyncSession, DynamicSessionMixin):
277
  :param extra_headers: A dictionary of extra headers to add to the request. _The referer set by the `google_search` argument takes priority over the referer set here if used together._
278
  :param disable_resources: Drop requests for unnecessary resources for a speed boost.
279
  Requests dropped are of type `font`, `image`, `media`, `beacon`, `object`, `imageset`, `texttrack`, `websocket`, `csp_report`, and `stylesheet`.
 
280
  :param wait_selector: Wait for a specific CSS selector to be in a specific state.
281
  :param wait_selector_state: The state to wait for the selector given with `wait_selector`. The default state is `attached`.
282
  :param network_idle: Wait for the page until there are no network connections for at least 500 ms.
@@ -306,7 +310,7 @@ class AsyncDynamicSession(AsyncSession, DynamicSessionMixin):
306
  proxy = static_proxy
307
 
308
  async with self._page_generator(
309
- params.timeout, params.extra_headers, params.disable_resources, proxy
310
  ) as page_info:
311
  final_response = [None]
312
  page = page_info.page
 
43
  :param headless: Run the browser in headless/hidden (default), or headful/visible mode.
44
  :param disable_resources: Drop requests for unnecessary resources for a speed boost.
45
  Requests dropped are of type `font`, `image`, `media`, `beacon`, `object`, `imageset`, `texttrack`, `websocket`, `csp_report`, and `stylesheet`.
46
+ :param blocked_domains: A set of domain names to block requests to. Subdomains are also matched (e.g., ``"example.com"`` blocks ``"sub.example.com"`` too).
47
  :param useragent: Pass a useragent string to be used. Otherwise the fetcher will generate a real Useragent of the same browser and use it.
48
  :param cookies: Set cookies for the next request.
49
  :param network_idle: Wait for the page until there are no network connections for at least 500 ms.
 
111
  :param extra_headers: A dictionary of extra headers to add to the request. _The referer set by the `google_search` argument takes priority over the referer set here if used together._
112
  :param disable_resources: Drop requests for unnecessary resources for a speed boost.
113
  Requests dropped are of type `font`, `image`, `media`, `beacon`, `object`, `imageset`, `texttrack`, `websocket`, `csp_report`, and `stylesheet`.
114
+ :param blocked_domains: A set of domain names to block requests to. Subdomains are also matched (e.g., ``"example.com"`` blocks ``"sub.example.com"`` too).
115
  :param wait_selector: Wait for a specific CSS selector to be in a specific state.
116
  :param wait_selector_state: The state to wait for the selector given with `wait_selector`. The default state is `attached`.
117
  :param network_idle: Wait for the page until there are no network connections for at least 500 ms.
 
140
  proxy = static_proxy
141
 
142
  with self._page_generator(
143
+ params.timeout, params.extra_headers, params.disable_resources, proxy, params.blocked_domains
144
  ) as page_info:
145
  final_response = [None]
146
  page = page_info.page
 
210
  :param headless: Run the browser in headless/hidden (default), or headful/visible mode.
211
  :param disable_resources: Drop requests for unnecessary resources for a speed boost.
212
  Requests dropped are of type `font`, `image`, `media`, `beacon`, `object`, `imageset`, `texttrack`, `websocket`, `csp_report`, and `stylesheet`.
213
+ :param blocked_domains: A set of domain names to block requests to. Subdomains are also matched (e.g., ``"example.com"`` blocks ``"sub.example.com"`` too).
214
  :param useragent: Pass a useragent string to be used. Otherwise the fetcher will generate a real Useragent of the same browser and use it.
215
  :param cookies: Set cookies for the next request.
216
  :param network_idle: Wait for the page until there are no network connections for at least 500 ms.
 
280
  :param extra_headers: A dictionary of extra headers to add to the request. _The referer set by the `google_search` argument takes priority over the referer set here if used together._
281
  :param disable_resources: Drop requests for unnecessary resources for a speed boost.
282
  Requests dropped are of type `font`, `image`, `media`, `beacon`, `object`, `imageset`, `texttrack`, `websocket`, `csp_report`, and `stylesheet`.
283
+ :param blocked_domains: A set of domain names to block requests to. Subdomains are also matched (e.g., ``"example.com"`` blocks ``"sub.example.com"`` too).
284
  :param wait_selector: Wait for a specific CSS selector to be in a specific state.
285
  :param wait_selector_state: The state to wait for the selector given with `wait_selector`. The default state is `attached`.
286
  :param network_idle: Wait for the page until there are no network connections for at least 500 ms.
 
310
  proxy = static_proxy
311
 
312
  async with self._page_generator(
313
+ params.timeout, params.extra_headers, params.disable_resources, proxy, params.blocked_domains
314
  ) as page_info:
315
  final_response = [None]
316
  page = page_info.page
scrapling/engines/_browsers/_stealth.py CHANGED
@@ -47,6 +47,7 @@ class StealthySession(SyncSession, StealthySessionMixin):
47
  :param headless: Run the browser in headless/hidden (default), or headful/visible mode.
48
  :param disable_resources: Drop requests for unnecessary resources for a speed boost.
49
  Requests dropped are of type `font`, `image`, `media`, `beacon`, `object`, `imageset`, `texttrack`, `websocket`, `csp_report`, and `stylesheet`.
 
50
  :param useragent: Pass a useragent string to be used. Otherwise the fetcher will generate a real Useragent of the same browser and use it.
51
  :param cookies: Set cookies for the next request.
52
  :param network_idle: Wait for the page until there are no network connections for at least 500 ms.
@@ -198,6 +199,7 @@ class StealthySession(SyncSession, StealthySessionMixin):
198
  :param extra_headers: A dictionary of extra headers to add to the request. _The referer set by the `google_search` argument takes priority over the referer set here if used together._
199
  :param disable_resources: Drop requests for unnecessary resources for a speed boost.
200
  Requests dropped are of type `font`, `image`, `media`, `beacon`, `object`, `imageset`, `texttrack`, `websocket`, `csp_report`, and `stylesheet`.
 
201
  :param wait_selector: Wait for a specific CSS selector to be in a specific state.
202
  :param wait_selector_state: The state to wait for the selector given with `wait_selector`. The default state is `attached`.
203
  :param network_idle: Wait for the page until there are no network connections for at least 500 ms.
@@ -227,7 +229,7 @@ class StealthySession(SyncSession, StealthySessionMixin):
227
  proxy = static_proxy
228
 
229
  with self._page_generator(
230
- params.timeout, params.extra_headers, params.disable_resources, proxy
231
  ) as page_info:
232
  final_response = [None]
233
  page = page_info.page
@@ -302,6 +304,7 @@ class AsyncStealthySession(AsyncSession, StealthySessionMixin):
302
  :param headless: Run the browser in headless/hidden (default), or headful/visible mode.
303
  :param disable_resources: Drop requests for unnecessary resources for a speed boost.
304
  Requests dropped are of type `font`, `image`, `media`, `beacon`, `object`, `imageset`, `texttrack`, `websocket`, `csp_report`, and `stylesheet`.
 
305
  :param useragent: Pass a useragent string to be used. Otherwise the fetcher will generate a real Useragent of the same browser and use it.
306
  :param cookies: Set cookies for the next request.
307
  :param network_idle: Wait for the page until there are no network connections for at least 500 ms.
@@ -454,6 +457,7 @@ class AsyncStealthySession(AsyncSession, StealthySessionMixin):
454
  :param extra_headers: A dictionary of extra headers to add to the request. _The referer set by the `google_search` argument takes priority over the referer set here if used together._
455
  :param disable_resources: Drop requests for unnecessary resources for a speed boost.
456
  Requests dropped are of type `font`, `image`, `media`, `beacon`, `object`, `imageset`, `texttrack`, `websocket`, `csp_report`, and `stylesheet`.
 
457
  :param wait_selector: Wait for a specific CSS selector to be in a specific state.
458
  :param wait_selector_state: The state to wait for the selector given with `wait_selector`. The default state is `attached`.
459
  :param network_idle: Wait for the page until there are no network connections for at least 500 ms.
@@ -484,7 +488,7 @@ class AsyncStealthySession(AsyncSession, StealthySessionMixin):
484
  proxy = static_proxy
485
 
486
  async with self._page_generator(
487
- params.timeout, params.extra_headers, params.disable_resources, proxy
488
  ) as page_info:
489
  final_response = [None]
490
  page = page_info.page
 
47
  :param headless: Run the browser in headless/hidden (default), or headful/visible mode.
48
  :param disable_resources: Drop requests for unnecessary resources for a speed boost.
49
  Requests dropped are of type `font`, `image`, `media`, `beacon`, `object`, `imageset`, `texttrack`, `websocket`, `csp_report`, and `stylesheet`.
50
+ :param blocked_domains: A set of domain names to block requests to. Subdomains are also matched (e.g., ``"example.com"`` blocks ``"sub.example.com"`` too).
51
  :param useragent: Pass a useragent string to be used. Otherwise the fetcher will generate a real Useragent of the same browser and use it.
52
  :param cookies: Set cookies for the next request.
53
  :param network_idle: Wait for the page until there are no network connections for at least 500 ms.
 
199
  :param extra_headers: A dictionary of extra headers to add to the request. _The referer set by the `google_search` argument takes priority over the referer set here if used together._
200
  :param disable_resources: Drop requests for unnecessary resources for a speed boost.
201
  Requests dropped are of type `font`, `image`, `media`, `beacon`, `object`, `imageset`, `texttrack`, `websocket`, `csp_report`, and `stylesheet`.
202
+ :param blocked_domains: A set of domain names to block requests to. Subdomains are also matched (e.g., ``"example.com"`` blocks ``"sub.example.com"`` too).
203
  :param wait_selector: Wait for a specific CSS selector to be in a specific state.
204
  :param wait_selector_state: The state to wait for the selector given with `wait_selector`. The default state is `attached`.
205
  :param network_idle: Wait for the page until there are no network connections for at least 500 ms.
 
229
  proxy = static_proxy
230
 
231
  with self._page_generator(
232
+ params.timeout, params.extra_headers, params.disable_resources, proxy, params.blocked_domains
233
  ) as page_info:
234
  final_response = [None]
235
  page = page_info.page
 
304
  :param headless: Run the browser in headless/hidden (default), or headful/visible mode.
305
  :param disable_resources: Drop requests for unnecessary resources for a speed boost.
306
  Requests dropped are of type `font`, `image`, `media`, `beacon`, `object`, `imageset`, `texttrack`, `websocket`, `csp_report`, and `stylesheet`.
307
+ :param blocked_domains: A set of domain names to block requests to. Subdomains are also matched (e.g., ``"example.com"`` blocks ``"sub.example.com"`` too).
308
  :param useragent: Pass a useragent string to be used. Otherwise the fetcher will generate a real Useragent of the same browser and use it.
309
  :param cookies: Set cookies for the next request.
310
  :param network_idle: Wait for the page until there are no network connections for at least 500 ms.
 
457
  :param extra_headers: A dictionary of extra headers to add to the request. _The referer set by the `google_search` argument takes priority over the referer set here if used together._
458
  :param disable_resources: Drop requests for unnecessary resources for a speed boost.
459
  Requests dropped are of type `font`, `image`, `media`, `beacon`, `object`, `imageset`, `texttrack`, `websocket`, `csp_report`, and `stylesheet`.
460
+ :param blocked_domains: A set of domain names to block requests to. Subdomains are also matched (e.g., ``"example.com"`` blocks ``"sub.example.com"`` too).
461
  :param wait_selector: Wait for a specific CSS selector to be in a specific state.
462
  :param wait_selector_state: The state to wait for the selector given with `wait_selector`. The default state is `attached`.
463
  :param network_idle: Wait for the page until there are no network connections for at least 500 ms.
 
488
  proxy = static_proxy
489
 
490
  async with self._page_generator(
491
+ params.timeout, params.extra_headers, params.disable_resources, proxy, params.blocked_domains
492
  ) as page_info:
493
  final_response = [None]
494
  page = page_info.page
scrapling/engines/_browsers/_types.py CHANGED
@@ -9,6 +9,7 @@ from curl_cffi.requests import (
9
  from scrapling.core._types import (
10
  Dict,
11
  List,
 
12
  Tuple,
13
  Mapping,
14
  Optional,
@@ -84,6 +85,7 @@ if TYPE_CHECKING: # pragma: no cover
84
  cdp_url: Optional[str]
85
  useragent: Optional[str]
86
  extra_flags: Optional[List[str]]
 
87
  retries: int
88
  retry_delay: int | float
89
 
@@ -99,6 +101,7 @@ if TYPE_CHECKING: # pragma: no cover
99
  selector_config: Optional[Dict]
100
  extra_headers: Optional[Dict[str, str]]
101
  wait_selector_state: SelectorWaitStates
 
102
  proxy: Optional[str | Dict[str, str]]
103
 
104
  class StealthSession(PlaywrightSession, total=False):
 
9
  from scrapling.core._types import (
10
  Dict,
11
  List,
12
+ Set,
13
  Tuple,
14
  Mapping,
15
  Optional,
 
85
  cdp_url: Optional[str]
86
  useragent: Optional[str]
87
  extra_flags: Optional[List[str]]
88
+ blocked_domains: Optional[Set[str]]
89
  retries: int
90
  retry_delay: int | float
91
 
 
101
  selector_config: Optional[Dict]
102
  extra_headers: Optional[Dict[str, str]]
103
  wait_selector_state: SelectorWaitStates
104
+ blocked_domains: Optional[Set[str]]
105
  proxy: Optional[str | Dict[str, str]]
106
 
107
  class StealthSession(PlaywrightSession, total=False):
scrapling/engines/_browsers/_validators.py CHANGED
@@ -10,6 +10,7 @@ from scrapling.core._types import (
10
  Any,
11
  Dict,
12
  List,
 
13
  Tuple,
14
  Optional,
15
  Callable,
@@ -83,6 +84,7 @@ class PlaywrightConfig(Struct, kw_only=True, frozen=False, weakref=True):
83
  cdp_url: Optional[str] = None
84
  useragent: Optional[str] = None
85
  extra_flags: Optional[List[str]] = None
 
86
  retries: RetriesCount = 3
87
  retry_delay: Seconds = 1
88
 
@@ -145,6 +147,7 @@ class _fetch_params:
145
  wait_selector_state: SelectorWaitStates
146
  network_idle: bool
147
  load_dom: bool
 
148
  solve_cloudflare: bool
149
  selector_config: Dict
150
 
@@ -183,6 +186,7 @@ def validate_fetch(
183
 
184
  # solve_cloudflare defaults to False for models that don't have it (PlaywrightConfig)
185
  result.setdefault("solve_cloudflare", False)
 
186
 
187
  return _fetch_params(**result)
188
 
 
10
  Any,
11
  Dict,
12
  List,
13
+ Set,
14
  Tuple,
15
  Optional,
16
  Callable,
 
84
  cdp_url: Optional[str] = None
85
  useragent: Optional[str] = None
86
  extra_flags: Optional[List[str]] = None
87
+ blocked_domains: Optional[Set[str]] = None
88
  retries: RetriesCount = 3
89
  retry_delay: Seconds = 1
90
 
 
147
  wait_selector_state: SelectorWaitStates
148
  network_idle: bool
149
  load_dom: bool
150
+ blocked_domains: Optional[Set[str]]
151
  solve_cloudflare: bool
152
  selector_config: Dict
153
 
 
186
 
187
  # solve_cloudflare defaults to False for models that don't have it (PlaywrightConfig)
188
  result.setdefault("solve_cloudflare", False)
189
+ result.setdefault("blocked_domains", None)
190
 
191
  return _fetch_params(**result)
192
 
scrapling/engines/toolbelt/navigation.py CHANGED
@@ -11,7 +11,7 @@ from msgspec import Struct, structs, convert, ValidationError
11
  from playwright.sync_api import Route
12
 
13
  from scrapling.core.utils import log
14
- from scrapling.core._types import Dict, Tuple
15
  from scrapling.engines.constants import DEFAULT_DISABLED_RESOURCES
16
 
17
  __BYPASSES_DIR__ = Path(__file__).parent / "bypasses"
@@ -23,30 +23,58 @@ class ProxyDict(Struct):
23
  password: str = ""
24
 
25
 
26
- def intercept_route(route: Route):
27
- """This is just a route handler, but it drops requests that its type falls in `DEFAULT_DISABLED_RESOURCES`
28
 
29
- :param route: PlayWright `Route` object of the current page
30
- :return: PlayWright `Route` object
 
31
  """
32
- if route.request.resource_type in DEFAULT_DISABLED_RESOURCES:
33
- log.debug(f'Blocking background resource "{route.request.url}" of type "{route.request.resource_type}"')
34
- route.abort()
35
- else:
36
- route.continue_()
37
-
38
-
39
- async def async_intercept_route(route: async_Route):
40
- """This is just a route handler, but it drops requests that its type falls in `DEFAULT_DISABLED_RESOURCES`
41
-
42
- :param route: PlayWright `Route` object of the current page
43
- :return: PlayWright `Route` object
 
 
 
 
 
 
 
 
 
 
 
 
 
 
44
  """
45
- if route.request.resource_type in DEFAULT_DISABLED_RESOURCES:
46
- log.debug(f'Blocking background resource "{route.request.url}" of type "{route.request.resource_type}"')
47
- await route.abort()
48
- else:
49
- await route.continue_()
 
 
 
 
 
 
 
 
 
 
 
 
 
50
 
51
 
52
  def construct_proxy_dict(proxy_string: str | Dict[str, str] | Tuple) -> Dict:
 
11
  from playwright.sync_api import Route
12
 
13
  from scrapling.core.utils import log
14
+ from scrapling.core._types import Dict, Set, Tuple, Optional, Callable
15
  from scrapling.engines.constants import DEFAULT_DISABLED_RESOURCES
16
 
17
  __BYPASSES_DIR__ = Path(__file__).parent / "bypasses"
 
23
  password: str = ""
24
 
25
 
26
+ def create_intercept_handler(disable_resources: bool, blocked_domains: Optional[Set[str]] = None) -> Callable:
27
+ """Create a route handler that blocks both resource types and specific domains.
28
 
29
+ :param disable_resources: Whether to block default resource types.
30
+ :param blocked_domains: Set of domain names to block requests to.
31
+ :return: A sync route handler function.
32
  """
33
+ disabled_resources = DEFAULT_DISABLED_RESOURCES if disable_resources else set()
34
+ domains = blocked_domains or set()
35
+
36
+ def handler(route: Route):
37
+ if route.request.resource_type in disabled_resources:
38
+ log.debug(f'Blocking background resource "{route.request.url}" of type "{route.request.resource_type}"')
39
+ route.abort()
40
+ elif domains:
41
+ hostname = urlparse(route.request.url).hostname or ""
42
+ if any(hostname == d or hostname.endswith("." + d) for d in domains):
43
+ log.debug(f'Blocking request to blocked domain "{hostname}" ({route.request.url})')
44
+ route.abort()
45
+ else:
46
+ route.continue_()
47
+ else:
48
+ route.continue_()
49
+
50
+ return handler
51
+
52
+
53
+ def create_async_intercept_handler(disable_resources: bool, blocked_domains: Optional[Set[str]] = None) -> Callable:
54
+ """Create an async route handler that blocks both resource types and specific domains.
55
+
56
+ :param disable_resources: Whether to block default resource types.
57
+ :param blocked_domains: Set of domain names to block requests to.
58
+ :return: An async route handler function.
59
  """
60
+ disabled_resources = DEFAULT_DISABLED_RESOURCES if disable_resources else set()
61
+ domains = blocked_domains or set()
62
+
63
+ async def handler(route: async_Route):
64
+ if route.request.resource_type in disabled_resources:
65
+ log.debug(f'Blocking background resource "{route.request.url}" of type "{route.request.resource_type}"')
66
+ await route.abort()
67
+ elif domains:
68
+ hostname = urlparse(route.request.url).hostname or ""
69
+ if any(hostname == d or hostname.endswith("." + d) for d in domains):
70
+ log.debug(f'Blocking request to blocked domain "{hostname}" ({route.request.url})')
71
+ await route.abort()
72
+ else:
73
+ await route.continue_()
74
+ else:
75
+ await route.continue_()
76
+
77
+ return handler
78
 
79
 
80
  def construct_proxy_dict(proxy_string: str | Dict[str, str] | Tuple) -> Dict:
scrapling/fetchers/chrome.py CHANGED
@@ -13,7 +13,8 @@ class DynamicFetcher(BaseFetcher):
13
 
14
  :param url: Target url.
15
  :param headless: Run the browser in headless/hidden (default), or headful/visible mode.
16
- :param disable_resources: Drop requests of unnecessary resources for a speed boost.
 
17
  :param useragent: Pass a useragent string to be used. Otherwise the fetcher will generate a real Useragent of the same browser and use it.
18
  :param cookies: Set cookies for the next request.
19
  :param network_idle: Wait for the page until there are no network connections for at least 500 ms.
@@ -55,7 +56,8 @@ class DynamicFetcher(BaseFetcher):
55
 
56
  :param url: Target url.
57
  :param headless: Run the browser in headless/hidden (default), or headful/visible mode.
58
- :param disable_resources: Drop requests of unnecessary resources for a speed boost.
 
59
  :param useragent: Pass a useragent string to be used. Otherwise the fetcher will generate a real Useragent of the same browser and use it.
60
  :param cookies: Set cookies for the next request.
61
  :param network_idle: Wait for the page until there are no network connections for at least 500 ms.
 
13
 
14
  :param url: Target url.
15
  :param headless: Run the browser in headless/hidden (default), or headful/visible mode.
16
+ :param disable_resources: Drop requests for unnecessary resources for a speed boost.
17
+ :param blocked_domains: A set of domain names to block requests to. Subdomains are also matched (e.g., ``"example.com"`` blocks ``"sub.example.com"`` too).
18
  :param useragent: Pass a useragent string to be used. Otherwise the fetcher will generate a real Useragent of the same browser and use it.
19
  :param cookies: Set cookies for the next request.
20
  :param network_idle: Wait for the page until there are no network connections for at least 500 ms.
 
56
 
57
  :param url: Target url.
58
  :param headless: Run the browser in headless/hidden (default), or headful/visible mode.
59
+ :param disable_resources: Drop requests for unnecessary resources for a speed boost.
60
+ :param blocked_domains: A set of domain names to block requests to. Subdomains are also matched (e.g., ``"example.com"`` blocks ``"sub.example.com"`` too).
61
  :param useragent: Pass a useragent string to be used. Otherwise the fetcher will generate a real Useragent of the same browser and use it.
62
  :param cookies: Set cookies for the next request.
63
  :param network_idle: Wait for the page until there are no network connections for at least 500 ms.
scrapling/fetchers/stealth_chrome.py CHANGED
@@ -19,6 +19,7 @@ class StealthyFetcher(BaseFetcher):
19
  :param headless: Run the browser in headless/hidden (default), or headful/visible mode.
20
  :param disable_resources: Drop requests for unnecessary resources for a speed boost.
21
  Requests dropped are of type `font`, `image`, `media`, `beacon`, `object`, `imageset`, `texttrack`, `websocket`, `csp_report`, and `stylesheet`.
 
22
  :param useragent: Pass a useragent string to be used. Otherwise the fetcher will generate a real Useragent of the same browser and use it.
23
  :param cookies: Set cookies for the next request.
24
  :param network_idle: Wait for the page until there are no network connections for at least 500 ms.
@@ -67,6 +68,7 @@ class StealthyFetcher(BaseFetcher):
67
  :param headless: Run the browser in headless/hidden (default), or headful/visible mode.
68
  :param disable_resources: Drop requests for unnecessary resources for a speed boost.
69
  Requests dropped are of type `font`, `image`, `media`, `beacon`, `object`, `imageset`, `texttrack`, `websocket`, `csp_report`, and `stylesheet`.
 
70
  :param useragent: Pass a useragent string to be used. Otherwise the fetcher will generate a real Useragent of the same browser and use it.
71
  :param cookies: Set cookies for the next request.
72
  :param network_idle: Wait for the page until there are no network connections for at least 500 ms.
 
19
  :param headless: Run the browser in headless/hidden (default), or headful/visible mode.
20
  :param disable_resources: Drop requests for unnecessary resources for a speed boost.
21
  Requests dropped are of type `font`, `image`, `media`, `beacon`, `object`, `imageset`, `texttrack`, `websocket`, `csp_report`, and `stylesheet`.
22
+ :param blocked_domains: A set of domain names to block requests to. Subdomains are also matched (e.g., ``"example.com"`` blocks ``"sub.example.com"`` too).
23
  :param useragent: Pass a useragent string to be used. Otherwise the fetcher will generate a real Useragent of the same browser and use it.
24
  :param cookies: Set cookies for the next request.
25
  :param network_idle: Wait for the page until there are no network connections for at least 500 ms.
 
68
  :param headless: Run the browser in headless/hidden (default), or headful/visible mode.
69
  :param disable_resources: Drop requests for unnecessary resources for a speed boost.
70
  Requests dropped are of type `font`, `image`, `media`, `beacon`, `object`, `imageset`, `texttrack`, `websocket`, `csp_report`, and `stylesheet`.
71
+ :param blocked_domains: A set of domain names to block requests to. Subdomains are also matched (e.g., ``"example.com"`` blocks ``"sub.example.com"`` too).
72
  :param useragent: Pass a useragent string to be used. Otherwise the fetcher will generate a real Useragent of the same browser and use it.
73
  :param cookies: Set cookies for the next request.
74
  :param network_idle: Wait for the page until there are no network connections for at least 500 ms.