Karim shoair committed on
Commit
ed96cdc
·
1 Parent(s): 88b6ab5

feat(spiders/fetchers): Adding proxy rotation logic and change retry logic

Browse files

- User passes a single proxy to the browser session, and it will keep using the same context. Pass a proxy manager that automatically refreshes the IP with each proxy, and you get speed but sacrifice some stealth.

- User imports the proxy rotator class and passes proxies to it and a rotation strategy (Round robin by default). Then pass the instance to the session class, which will create a context and a tab for each proxy returned by the rotator. This way, you sacrifice speed a bit, but you get maximum stealth since each context is created with the proxy that will be used.

- Normal requests use what you pass without issues, of course.

- All errors are retried now, and proxies are rotated on retries automatically.

scrapling/core/_types.py CHANGED
@@ -33,6 +33,8 @@ from typing import (
33
  SupportsIndex,
34
  )
35
 
 
 
36
  SUPPORTED_HTTP_METHODS = Literal["GET", "POST", "PUT", "DELETE"]
37
  SelectorWaitStates = Literal["attached", "detached", "hidden", "visible"]
38
  PageLoadStates = Literal["commit", "domcontentloaded", "load", "networkidle"]
 
33
  SupportsIndex,
34
  )
35
 
36
+ # Proxy can be a string URL or a dict (Playwright format: {"server": "...", "username": "...", "password": "..."})
37
+ ProxyType = Union[str, Dict[str, str]]
38
  SUPPORTED_HTTP_METHODS = Literal["GET", "POST", "PUT", "DELETE"]
39
  SelectorWaitStates = Literal["attached", "detached", "hidden", "visible"]
40
  PageLoadStates = Literal["commit", "domcontentloaded", "load", "networkidle"]
scrapling/engines/_browsers/_base.py CHANGED
@@ -1,9 +1,11 @@
1
  from time import time
2
  from asyncio import sleep as asyncio_sleep, Lock
 
3
 
4
  from playwright.sync_api._generated import Page
5
  from playwright.sync_api import (
6
  Frame,
 
7
  BrowserContext,
8
  Playwright,
9
  Response as SyncPlaywrightResponse,
@@ -11,18 +13,31 @@ from playwright.sync_api import (
11
  from playwright.async_api._generated import Page as AsyncPage
12
  from playwright.async_api import (
13
  Frame as AsyncFrame,
 
14
  Playwright as AsyncPlaywright,
15
  Response as AsyncPlaywrightResponse,
16
  BrowserContext as AsyncBrowserContext,
17
  )
18
  from playwright._impl._errors import Error as PlaywrightError
19
 
20
- from ._page import PageInfo, PagePool
21
  from scrapling.parser import Selector
22
- from ._validators import validate, PlaywrightConfig, StealthConfig
23
- from ._config_tools import __default_chrome_useragent__, __default_useragent__
24
- from scrapling.engines.toolbelt.navigation import intercept_route, async_intercept_route
25
- from scrapling.core._types import Any, cast, Dict, List, Optional, Callable, TYPE_CHECKING, overload, Tuple
 
 
 
 
 
 
 
 
 
 
 
 
 
26
  from scrapling.engines.constants import (
27
  DEFAULT_STEALTH_FLAGS,
28
  HARMFUL_DEFAULT_ARGS,
@@ -31,12 +46,19 @@ from scrapling.engines.constants import (
31
 
32
 
33
  class SyncSession:
 
 
 
 
 
 
34
  def __init__(self, max_pages: int = 1):
35
  self.max_pages = max_pages
36
  self.page_pool = PagePool(max_pages)
37
  self._max_wait_for_page = 60
38
  self.playwright: Playwright | Any = None
39
  self.context: BrowserContext | Any = None
 
40
  self._is_alive = False
41
 
42
  def start(self):
@@ -51,6 +73,10 @@ class SyncSession:
51
  self.context.close()
52
  self.context = None
53
 
 
 
 
 
54
  if self.playwright:
55
  self.playwright.stop()
56
  self.playwright = None # pyright: ignore
@@ -64,17 +90,28 @@ class SyncSession:
64
  def __exit__(self, exc_type, exc_val, exc_tb):
65
  self.close()
66
 
 
 
 
 
 
 
 
 
 
 
67
  def _get_page(
68
  self,
69
  timeout: int | float,
70
  extra_headers: Optional[Dict[str, str]],
71
  disable_resources: bool,
 
72
  ) -> PageInfo[Page]: # pragma: no cover
73
  """Get a new page to use"""
74
-
75
  # No need to check if a page is available or not in sync code because the code blocked before reaching here till the page closed, ofc.
76
- assert self.context is not None, "Browser context not initialized"
77
- page = self.context.new_page()
 
78
  page.set_default_navigation_timeout(timeout)
79
  page.set_default_timeout(timeout)
80
  if extra_headers:
@@ -129,14 +166,52 @@ class SyncSession:
129
 
130
  return handle_response
131
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
132
 
133
  class AsyncSession:
 
 
 
 
 
 
134
  def __init__(self, max_pages: int = 1):
135
  self.max_pages = max_pages
136
  self.page_pool = PagePool(max_pages)
137
  self._max_wait_for_page = 60
138
  self.playwright: AsyncPlaywright | Any = None
139
  self.context: AsyncBrowserContext | Any = None
 
140
  self._is_alive = False
141
  self._lock = Lock()
142
 
@@ -152,6 +227,10 @@ class AsyncSession:
152
  await self.context.close()
153
  self.context = None # pyright: ignore
154
 
 
 
 
 
155
  if self.playwright:
156
  await self.playwright.stop()
157
  self.playwright = None # pyright: ignore
@@ -165,19 +244,34 @@ class AsyncSession:
165
  async def __aexit__(self, exc_type, exc_val, exc_tb):
166
  await self.close()
167
 
 
 
 
 
 
 
 
 
 
 
 
 
168
  async def _get_page(
169
  self,
170
  timeout: int | float,
171
  extra_headers: Optional[Dict[str, str]],
172
  disable_resources: bool,
 
173
  ) -> PageInfo[AsyncPage]: # pragma: no cover
174
  """Get a new page to use"""
 
175
  if TYPE_CHECKING:
176
- assert self.context is not None, "Browser context not initialized"
177
 
178
  async with self._lock:
179
  # If we're at max capacity after cleanup, wait for busy pages to finish
180
- if self.page_pool.pages_count >= self.max_pages:
 
181
  start_time = time()
182
  while time() - start_time < self._max_wait_for_page:
183
  await asyncio_sleep(0.05)
@@ -188,7 +282,7 @@ class AsyncSession:
188
  f"No pages finished to clear place in the pool within the {self._max_wait_for_page}s timeout period"
189
  )
190
 
191
- page = await self.context.new_page()
192
  page.set_default_navigation_timeout(timeout)
193
  page.set_default_timeout(timeout)
194
  if extra_headers:
@@ -241,6 +335,37 @@ class AsyncSession:
241
 
242
  return handle_response
243
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
244
 
245
  class BaseSessionMixin:
246
  @overload
@@ -254,7 +379,7 @@ class BaseSessionMixin:
254
  ) -> PlaywrightConfig | StealthConfig:
255
  # Dark color scheme bypasses the 'prefersLightColor' check in creepjs
256
  self._context_options: Dict[str, Any] = {"color_scheme": "dark", "device_scale_factor": 2}
257
- self._launch_options: Dict[str, Any] = self._context_options | {
258
  "args": DEFAULT_FLAGS,
259
  "ignore_default_args": HARMFUL_DEFAULT_ARGS,
260
  }
@@ -269,7 +394,7 @@ class BaseSessionMixin:
269
  return config
270
 
271
  def __generate_options__(self, extra_flags: Tuple | None = None) -> None:
272
- config = cast(PlaywrightConfig, getattr(self, "_config", None))
273
  self._context_options.update(
274
  {
275
  "proxy": config.proxy,
@@ -287,36 +412,40 @@ class BaseSessionMixin:
287
  )
288
 
289
  if not config.cdp_url:
290
- self._launch_options |= self._context_options
291
- self._context_options = {}
292
- flags = self._launch_options["args"]
293
  if config.extra_flags or extra_flags:
294
  flags = list(set(flags + (config.extra_flags or extra_flags)))
295
 
296
- self._launch_options.update(
297
  {
298
  "args": flags,
299
  "headless": config.headless,
300
- "user_data_dir": config.user_data_dir,
301
  "channel": "chrome" if config.real_chrome else "chromium",
302
  }
303
  )
304
 
305
- if config.additional_args:
306
- self._launch_options.update(config.additional_args)
307
  else:
308
- # while `context_options` is left to be used when cdp mode is enabled
309
- self._launch_options = dict()
310
- if config.additional_args:
311
- self._context_options.update(config.additional_args)
312
 
313
- @staticmethod
314
- def _is_retriable(error: Exception) -> bool:
315
- """Check if an error is retriable (transient network/timeout issues)."""
316
- if isinstance(error, TimeoutError):
317
- return True
318
- error_msg = str(error).lower()
319
- return "net::" in error_msg or "failed to get response" in error_msg
 
 
 
 
 
 
 
 
 
 
 
320
 
321
 
322
  class DynamicSessionMixin(BaseSessionMixin):
 
1
  from time import time
2
  from asyncio import sleep as asyncio_sleep, Lock
3
+ from contextlib import contextmanager, asynccontextmanager
4
 
5
  from playwright.sync_api._generated import Page
6
  from playwright.sync_api import (
7
  Frame,
8
+ Browser,
9
  BrowserContext,
10
  Playwright,
11
  Response as SyncPlaywrightResponse,
 
13
  from playwright.async_api._generated import Page as AsyncPage
14
  from playwright.async_api import (
15
  Frame as AsyncFrame,
16
+ Browser as AsyncBrowser,
17
  Playwright as AsyncPlaywright,
18
  Response as AsyncPlaywrightResponse,
19
  BrowserContext as AsyncBrowserContext,
20
  )
21
  from playwright._impl._errors import Error as PlaywrightError
22
 
 
23
  from scrapling.parser import Selector
24
+ from scrapling.engines._browsers._page import PageInfo, PagePool
25
+ from scrapling.engines._browsers._validators import validate, PlaywrightConfig, StealthConfig
26
+ from scrapling.engines._browsers._config_tools import __default_chrome_useragent__, __default_useragent__
27
+ from scrapling.engines.toolbelt.navigation import construct_proxy_dict, intercept_route, async_intercept_route
28
+ from scrapling.core._types import (
29
+ Any,
30
+ Dict,
31
+ List,
32
+ Optional,
33
+ Callable,
34
+ TYPE_CHECKING,
35
+ overload,
36
+ Tuple,
37
+ ProxyType,
38
+ Generator,
39
+ AsyncGenerator,
40
+ )
41
  from scrapling.engines.constants import (
42
  DEFAULT_STEALTH_FLAGS,
43
  HARMFUL_DEFAULT_ARGS,
 
46
 
47
 
48
  class SyncSession:
49
+ _config: "PlaywrightConfig | StealthConfig"
50
+ _context_options: Dict[str, Any]
51
+
52
+ def _build_context_with_proxy(self, proxy: Optional[ProxyType] = None) -> Dict[str, Any]:
53
+ raise NotImplementedError # pragma: no cover
54
+
55
  def __init__(self, max_pages: int = 1):
56
  self.max_pages = max_pages
57
  self.page_pool = PagePool(max_pages)
58
  self._max_wait_for_page = 60
59
  self.playwright: Playwright | Any = None
60
  self.context: BrowserContext | Any = None
61
+ self.browser: Optional[Browser] = None
62
  self._is_alive = False
63
 
64
  def start(self):
 
73
  self.context.close()
74
  self.context = None
75
 
76
+ if self.browser:
77
+ self.browser.close()
78
+ self.browser = None
79
+
80
  if self.playwright:
81
  self.playwright.stop()
82
  self.playwright = None # pyright: ignore
 
90
  def __exit__(self, exc_type, exc_val, exc_tb):
91
  self.close()
92
 
93
+ def _initialize_context(self, config: PlaywrightConfig | StealthConfig, ctx: BrowserContext) -> BrowserContext:
94
+ """Initialize the browser context."""
95
+ if config.init_script:
96
+ ctx.add_init_script(path=config.init_script)
97
+
98
+ if config.cookies: # pragma: no cover
99
+ ctx.add_cookies(config.cookies)
100
+
101
+ return ctx
102
+
103
  def _get_page(
104
  self,
105
  timeout: int | float,
106
  extra_headers: Optional[Dict[str, str]],
107
  disable_resources: bool,
108
+ context: Optional[BrowserContext] = None,
109
  ) -> PageInfo[Page]: # pragma: no cover
110
  """Get a new page to use"""
 
111
  # No need to check if a page is available or not in sync code because the code blocked before reaching here till the page closed, ofc.
112
+ ctx = context if context is not None else self.context
113
+ assert ctx is not None, "Browser context not initialized"
114
+ page = ctx.new_page()
115
  page.set_default_navigation_timeout(timeout)
116
  page.set_default_timeout(timeout)
117
  if extra_headers:
 
166
 
167
  return handle_response
168
 
169
+ @contextmanager
170
+ def _page_generator(
171
+ self,
172
+ timeout: int | float,
173
+ extra_headers: Optional[Dict[str, str]],
174
+ disable_resources: bool,
175
+ proxy: Optional[ProxyType] = None,
176
+ ) -> Generator["PageInfo[Page]", None, None]:
177
+ """Acquire a page - either from persistent context or fresh context with proxy."""
178
+ if self._config.proxy_rotator:
179
+ # Rotation mode: create fresh context with the provided proxy
180
+ if not self.browser: # pragma: no cover
181
+ raise RuntimeError("Browser not initialized for proxy rotation mode")
182
+ context_options = self._build_context_with_proxy(proxy)
183
+ context: BrowserContext = self.browser.new_context(**context_options)
184
+
185
+ try:
186
+ context = self._initialize_context(self._config, context)
187
+ page_info = self._get_page(timeout, extra_headers, disable_resources, context=context)
188
+ yield page_info
189
+ finally:
190
+ context.close()
191
+ else:
192
+ # Standard mode: use PagePool with persistent context
193
+ page_info = self._get_page(timeout, extra_headers, disable_resources)
194
+ try:
195
+ yield page_info
196
+ finally:
197
+ page_info.page.close()
198
+ self.page_pool.pages.remove(page_info)
199
+
200
 
201
  class AsyncSession:
202
+ _config: "PlaywrightConfig | StealthConfig"
203
+ _context_options: Dict[str, Any]
204
+
205
+ def _build_context_with_proxy(self, proxy: Optional[ProxyType] = None) -> Dict[str, Any]:
206
+ raise NotImplementedError # pragma: no cover
207
+
208
  def __init__(self, max_pages: int = 1):
209
  self.max_pages = max_pages
210
  self.page_pool = PagePool(max_pages)
211
  self._max_wait_for_page = 60
212
  self.playwright: AsyncPlaywright | Any = None
213
  self.context: AsyncBrowserContext | Any = None
214
+ self.browser: Optional[AsyncBrowser] = None
215
  self._is_alive = False
216
  self._lock = Lock()
217
 
 
227
  await self.context.close()
228
  self.context = None # pyright: ignore
229
 
230
+ if self.browser:
231
+ await self.browser.close()
232
+ self.browser = None
233
+
234
  if self.playwright:
235
  await self.playwright.stop()
236
  self.playwright = None # pyright: ignore
 
244
  async def __aexit__(self, exc_type, exc_val, exc_tb):
245
  await self.close()
246
 
247
+ async def _initialize_context(
248
+ self, config: PlaywrightConfig | StealthConfig, ctx: AsyncBrowserContext
249
+ ) -> AsyncBrowserContext:
250
+ """Initialize the browser context."""
251
+ if config.init_script: # pragma: no cover
252
+ await ctx.add_init_script(path=config.init_script)
253
+
254
+ if config.cookies: # pragma: no cover
255
+ await ctx.add_cookies(config.cookies)
256
+
257
+ return ctx
258
+
259
  async def _get_page(
260
  self,
261
  timeout: int | float,
262
  extra_headers: Optional[Dict[str, str]],
263
  disable_resources: bool,
264
+ context: Optional[AsyncBrowserContext] = None,
265
  ) -> PageInfo[AsyncPage]: # pragma: no cover
266
  """Get a new page to use"""
267
+ ctx = context if context is not None else self.context
268
  if TYPE_CHECKING:
269
+ assert ctx is not None, "Browser context not initialized"
270
 
271
  async with self._lock:
272
  # If we're at max capacity after cleanup, wait for busy pages to finish
273
+ if context is None and self.page_pool.pages_count >= self.max_pages:
274
+ # Only applies when using persistent context
275
  start_time = time()
276
  while time() - start_time < self._max_wait_for_page:
277
  await asyncio_sleep(0.05)
 
282
  f"No pages finished to clear place in the pool within the {self._max_wait_for_page}s timeout period"
283
  )
284
 
285
+ page = await ctx.new_page()
286
  page.set_default_navigation_timeout(timeout)
287
  page.set_default_timeout(timeout)
288
  if extra_headers:
 
335
 
336
  return handle_response
337
 
338
+ @asynccontextmanager
339
+ async def _page_generator(
340
+ self,
341
+ timeout: int | float,
342
+ extra_headers: Optional[Dict[str, str]],
343
+ disable_resources: bool,
344
+ proxy: Optional[ProxyType] = None,
345
+ ) -> AsyncGenerator["PageInfo[AsyncPage]", None]:
346
+ """Acquire a page - either from persistent context or fresh context with proxy."""
347
+ if self._config.proxy_rotator:
348
+ # Rotation mode: create fresh context with the provided proxy
349
+ if not self.browser: # pragma: no cover
350
+ raise RuntimeError("Browser not initialized for proxy rotation mode")
351
+ context_options = self._build_context_with_proxy(proxy)
352
+ context: AsyncBrowserContext = await self.browser.new_context(**context_options)
353
+
354
+ try:
355
+ context = await self._initialize_context(self._config, context)
356
+ page_info = await self._get_page(timeout, extra_headers, disable_resources, context=context)
357
+ yield page_info
358
+ finally:
359
+ await context.close()
360
+ else:
361
+ # Standard mode: use PagePool with persistent context
362
+ page_info = await self._get_page(timeout, extra_headers, disable_resources)
363
+ try:
364
+ yield page_info
365
+ finally:
366
+ await page_info.page.close()
367
+ self.page_pool.pages.remove(page_info)
368
+
369
 
370
  class BaseSessionMixin:
371
  @overload
 
379
  ) -> PlaywrightConfig | StealthConfig:
380
  # Dark color scheme bypasses the 'prefersLightColor' check in creepjs
381
  self._context_options: Dict[str, Any] = {"color_scheme": "dark", "device_scale_factor": 2}
382
+ self._browser_options: Dict[str, Any] = {
383
  "args": DEFAULT_FLAGS,
384
  "ignore_default_args": HARMFUL_DEFAULT_ARGS,
385
  }
 
394
  return config
395
 
396
  def __generate_options__(self, extra_flags: Tuple | None = None) -> None:
397
+ config: PlaywrightConfig | StealthConfig = self._config # type: ignore[has-type]
398
  self._context_options.update(
399
  {
400
  "proxy": config.proxy,
 
412
  )
413
 
414
  if not config.cdp_url:
415
+ flags = self._browser_options["args"]
 
 
416
  if config.extra_flags or extra_flags:
417
  flags = list(set(flags + (config.extra_flags or extra_flags)))
418
 
419
+ self._browser_options.update(
420
  {
421
  "args": flags,
422
  "headless": config.headless,
 
423
  "channel": "chrome" if config.real_chrome else "chromium",
424
  }
425
  )
426
 
427
+ self._user_data_dir = config.user_data_dir
 
428
  else:
429
+ self._browser_options = {}
 
 
 
430
 
431
+ if config.additional_args:
432
+ self._context_options.update(config.additional_args)
433
+
434
+ def _build_context_with_proxy(self, proxy: Optional[ProxyType] = None) -> Dict[str, Any]:
435
+ """
436
+ Build context options with a specific proxy for rotation mode.
437
+
438
+ :param proxy: Proxy URL string or Playwright-style proxy dict to use for this context.
439
+ :return: Dictionary of context options for browser.new_context().
440
+ """
441
+
442
+ context_options = self._context_options.copy()
443
+
444
+ # Override proxy if provided
445
+ if proxy:
446
+ context_options["proxy"] = construct_proxy_dict(proxy)
447
+
448
+ return context_options
449
 
450
 
451
  class DynamicSessionMixin(BaseSessionMixin):
scrapling/engines/_browsers/_controllers.py CHANGED
@@ -12,12 +12,13 @@ from playwright.async_api import (
12
  )
13
 
14
  from scrapling.core.utils import log
15
- from scrapling.core._types import Unpack, TYPE_CHECKING
16
- from ._types import PlaywrightSession, PlaywrightFetchParams
17
- from ._base import SyncSession, AsyncSession, DynamicSessionMixin
18
- from ._validators import validate_fetch as _validate, PlaywrightConfig
19
  from scrapling.engines.toolbelt.convertor import Response, ResponseFactory
20
  from scrapling.engines.toolbelt.fingerprints import generate_convincing_referer
 
 
 
21
 
22
 
23
  class DynamicSession(SyncSession, DynamicSessionMixin):
@@ -26,7 +27,9 @@ class DynamicSession(SyncSession, DynamicSessionMixin):
26
  __slots__ = (
27
  "_config",
28
  "_context_options",
29
- "_launch_options",
 
 
30
  "max_pages",
31
  "page_pool",
32
  "_max_wait_for_page",
@@ -73,16 +76,19 @@ class DynamicSession(SyncSession, DynamicSessionMixin):
73
 
74
  try:
75
  if self._config.cdp_url: # pragma: no cover
76
- browser = self.playwright.chromium.connect_over_cdp(endpoint_url=self._config.cdp_url)
77
- self.context = browser.new_context(**self._context_options)
 
 
 
78
  else:
79
- self.context = self.playwright.chromium.launch_persistent_context(**self._launch_options)
80
-
81
- if self._config.init_script: # pragma: no cover
82
- self.context.add_init_script(path=self._config.init_script)
83
 
84
- if self._config.cookies: # pragma: no cover
85
- self.context.add_cookies(self._config.cookies)
86
 
87
  self._is_alive = True
88
  except Exception:
@@ -123,60 +129,73 @@ class DynamicSession(SyncSession, DynamicSessionMixin):
123
  )
124
 
125
  for attempt in range(self._config.retries):
126
- page_info = self._get_page(params.timeout, params.extra_headers, params.disable_resources)
127
- final_response = [None]
128
- handle_response = self._create_response_handler(page_info, final_response)
129
-
130
- try: # pragma: no cover
131
- page_info.page.on("response", handle_response)
132
- first_response = page_info.page.goto(url, referer=referer)
133
- self._wait_for_page_stability(page_info.page, params.load_dom, params.network_idle)
134
-
135
- if not first_response:
136
- raise RuntimeError(f"Failed to get response for {url}")
137
-
138
- if params.page_action:
139
- try:
140
- _ = params.page_action(page_info.page)
141
- except Exception as e: # pragma: no cover
142
- log.error(f"Error executing page_action: {e}")
143
-
144
- if params.wait_selector:
145
- try:
146
- waiter: Locator = page_info.page.locator(params.wait_selector)
147
- waiter.first.wait_for(state=params.wait_selector_state)
148
- self._wait_for_page_stability(page_info.page, params.load_dom, params.network_idle)
149
- except Exception as e: # pragma: no cover
150
- log.error(f"Error waiting for selector {params.wait_selector}: {e}")
151
-
152
- page_info.page.wait_for_timeout(params.wait)
153
-
154
- response = ResponseFactory.from_playwright_response(
155
- page_info.page, first_response, final_response[0], params.selector_config
156
- )
157
-
158
- page_info.page.close()
159
- self.page_pool.pages.remove(page_info)
160
- return response
161
-
162
- except Exception as e:
163
- page_info.mark_error()
164
- page_info.page.close()
165
- self.page_pool.pages.remove(page_info)
166
-
167
- if attempt < self._config.retries - 1 and self._is_retriable(e):
168
- log.warning(f"Attempt {attempt + 1} failed: {e}. Retrying in {self._config.retry_delay}s...")
169
- time_sleep(self._config.retry_delay)
170
- else:
171
- raise
172
-
173
- # For type checking purposes only
174
- raise AssertionError("Unreachable: retry loop must return or raise") # pragma: no cover
 
 
 
 
 
175
 
176
 
177
  class AsyncDynamicSession(AsyncSession, DynamicSessionMixin):
178
  """An async Browser session manager with page pooling, it's using a persistent browser Context by default with a temporary user profile directory."""
179
 
 
 
 
 
 
 
 
 
180
  def __init__(self, **kwargs: Unpack[PlaywrightSession]):
181
  """A Browser session manager with page pooling
182
 
@@ -216,18 +235,21 @@ class AsyncDynamicSession(AsyncSession, DynamicSessionMixin):
216
  self.playwright = await async_playwright().start()
217
  try:
218
  if self._config.cdp_url:
219
- browser = await self.playwright.chromium.connect_over_cdp(endpoint_url=self._config.cdp_url)
220
- self.context: AsyncBrowserContext = await browser.new_context(**self._context_options)
 
 
 
221
  else:
 
 
 
222
  self.context: AsyncBrowserContext = await self.playwright.chromium.launch_persistent_context(
223
- **self._launch_options
224
  )
225
 
226
- if self._config.init_script: # pragma: no cover
227
- await self.context.add_init_script(path=self._config.init_script)
228
-
229
- if self._config.cookies:
230
- await self.context.add_cookies(self._config.cookies) # pyright: ignore
231
 
232
  self._is_alive = True
233
  except Exception:
@@ -269,58 +291,57 @@ class AsyncDynamicSession(AsyncSession, DynamicSessionMixin):
269
  )
270
 
271
  for attempt in range(self._config.retries):
272
- page_info = await self._get_page(params.timeout, params.extra_headers, params.disable_resources)
273
- final_response = [None]
274
- handle_response = self._create_response_handler(page_info, final_response)
275
-
276
- if TYPE_CHECKING:
277
- from playwright.async_api import Page as async_Page
278
-
279
- if not isinstance(page_info.page, async_Page):
280
- raise TypeError
281
-
282
- try:
283
- page_info.page.on("response", handle_response)
284
- first_response = await page_info.page.goto(url, referer=referer)
285
- await self._wait_for_page_stability(page_info.page, params.load_dom, params.network_idle)
286
-
287
- if not first_response:
288
- raise RuntimeError(f"Failed to get response for {url}")
289
-
290
- if params.page_action:
291
- try:
292
- _ = await params.page_action(page_info.page)
293
- except Exception as e:
294
- log.error(f"Error executing page_action: {e}")
295
-
296
- if params.wait_selector:
297
- try:
298
- waiter: AsyncLocator = page_info.page.locator(params.wait_selector)
299
- await waiter.first.wait_for(state=params.wait_selector_state)
300
- await self._wait_for_page_stability(page_info.page, params.load_dom, params.network_idle)
301
- except Exception as e:
302
- log.error(f"Error waiting for selector {params.wait_selector}: {e}")
303
-
304
- await page_info.page.wait_for_timeout(params.wait)
305
-
306
- response = await ResponseFactory.from_async_playwright_response(
307
- page_info.page, first_response, final_response[0], params.selector_config
308
- )
309
-
310
- await page_info.page.close()
311
- self.page_pool.pages.remove(page_info)
312
- return response
313
-
314
- except Exception as e: # pragma: no cover
315
- page_info.mark_error()
316
- await page_info.page.close()
317
- self.page_pool.pages.remove(page_info)
318
-
319
- if attempt < self._config.retries - 1 and self._is_retriable(e):
320
- log.warning(f"Attempt {attempt + 1} failed: {e}. Retrying in {self._config.retry_delay}s...")
321
- await asyncio_sleep(self._config.retry_delay)
322
- else:
323
- raise
324
-
325
- # For type checking purposes only
326
- raise AssertionError("Unreachable: retry loop must return or raise") # pragma: no cover
 
12
  )
13
 
14
  from scrapling.core.utils import log
15
+ from scrapling.core._types import Unpack
16
+ from scrapling.engines.toolbelt.proxy_rotation import is_proxy_error
 
 
17
  from scrapling.engines.toolbelt.convertor import Response, ResponseFactory
18
  from scrapling.engines.toolbelt.fingerprints import generate_convincing_referer
19
+ from scrapling.engines._browsers._types import PlaywrightSession, PlaywrightFetchParams
20
+ from scrapling.engines._browsers._base import SyncSession, AsyncSession, DynamicSessionMixin
21
+ from scrapling.engines._browsers._validators import validate_fetch as _validate, PlaywrightConfig
22
 
23
 
24
  class DynamicSession(SyncSession, DynamicSessionMixin):
 
27
  __slots__ = (
28
  "_config",
29
  "_context_options",
30
+ "_browser_options",
31
+ "_user_data_dir",
32
+ "_headers_keys",
33
  "max_pages",
34
  "page_pool",
35
  "_max_wait_for_page",
 
76
 
77
  try:
78
  if self._config.cdp_url: # pragma: no cover
79
+ self.browser = self.playwright.chromium.connect_over_cdp(endpoint_url=self._config.cdp_url)
80
+ if not self._config.proxy_rotator and self.browser:
81
+ self.context = self.browser.new_context(**self._context_options)
82
+ elif self._config.proxy_rotator:
83
+ self.browser = self.playwright.chromium.launch(**self._browser_options)
84
  else:
85
+ persistent_options = (
86
+ self._browser_options | self._context_options | {"user_data_dir": self._user_data_dir}
87
+ )
88
+ self.context = self.playwright.chromium.launch_persistent_context(**persistent_options)
89
 
90
+ if self.context:
91
+ self.context = self._initialize_context(self._config, self.context)
92
 
93
  self._is_alive = True
94
  except Exception:
 
129
  )
130
 
131
  for attempt in range(self._config.retries):
132
+ proxy = self._config.proxy_rotator.get_proxy() if self._config.proxy_rotator else None
133
+
134
+ with self._page_generator(
135
+ params.timeout, params.extra_headers, params.disable_resources, proxy
136
+ ) as page_info:
137
+ final_response = [None]
138
+ page = page_info.page
139
+ page.on("response", self._create_response_handler(page_info, final_response))
140
+
141
+ try:
142
+ first_response = page.goto(url, referer=referer)
143
+ self._wait_for_page_stability(page, params.load_dom, params.network_idle)
144
+
145
+ if not first_response:
146
+ raise RuntimeError(f"Failed to get response for {url}")
147
+
148
+ if params.page_action:
149
+ try:
150
+ _ = params.page_action(page)
151
+ except Exception as e: # pragma: no cover
152
+ log.error(f"Error executing page_action: {e}")
153
+
154
+ if params.wait_selector:
155
+ try:
156
+ waiter: Locator = page.locator(params.wait_selector)
157
+ waiter.first.wait_for(state=params.wait_selector_state)
158
+ self._wait_for_page_stability(page, params.load_dom, params.network_idle)
159
+ except Exception as e: # pragma: no cover
160
+ log.error(f"Error waiting for selector {params.wait_selector}: {e}")
161
+
162
+ page.wait_for_timeout(params.wait)
163
+
164
+ response = ResponseFactory.from_playwright_response(
165
+ page, first_response, final_response[0], params.selector_config
166
+ )
167
+ return response
168
+
169
+ except Exception as e:
170
+ page_info.mark_error()
171
+ if attempt < self._config.retries - 1:
172
+ if is_proxy_error(e):
173
+ log.warning(
174
+ f"Proxy '{proxy}' failed (attempt {attempt + 1}) | Retrying in {self._config.retry_delay}s..."
175
+ )
176
+ else:
177
+ log.warning(
178
+ f"Attempt {attempt + 1} failed: {e}. Retrying in {self._config.retry_delay}s..."
179
+ )
180
+ time_sleep(self._config.retry_delay)
181
+ else:
182
+ log.error(f"Failed after {self._config.retries} attempts: {e}")
183
+ raise
184
+
185
+ raise RuntimeError("Request failed") # pragma: no cover
186
 
187
 
188
  class AsyncDynamicSession(AsyncSession, DynamicSessionMixin):
189
  """An async Browser session manager with page pooling, it's using a persistent browser Context by default with a temporary user profile directory."""
190
 
191
+ __slots__ = (
192
+ "_config",
193
+ "_context_options",
194
+ "_browser_options",
195
+ "_user_data_dir",
196
+ "_headers_keys",
197
+ )
198
+
199
  def __init__(self, **kwargs: Unpack[PlaywrightSession]):
200
  """A Browser session manager with page pooling
201
 
 
235
  self.playwright = await async_playwright().start()
236
  try:
237
  if self._config.cdp_url:
238
+ self.browser = await self.playwright.chromium.connect_over_cdp(endpoint_url=self._config.cdp_url)
239
+ if not self._config.proxy_rotator and self.browser:
240
+ self.context: AsyncBrowserContext = await self.browser.new_context(**self._context_options)
241
+ elif self._config.proxy_rotator:
242
+ self.browser = await self.playwright.chromium.launch(**self._browser_options)
243
  else:
244
+ persistent_options = (
245
+ self._browser_options | self._context_options | {"user_data_dir": self._user_data_dir}
246
+ )
247
  self.context: AsyncBrowserContext = await self.playwright.chromium.launch_persistent_context(
248
+ **persistent_options
249
  )
250
 
251
+ if self.context:
252
+ self.context = await self._initialize_context(self._config, self.context)
 
 
 
253
 
254
  self._is_alive = True
255
  except Exception:
 
291
  )
292
 
293
  for attempt in range(self._config.retries):
294
+ proxy = self._config.proxy_rotator.get_proxy() if self._config.proxy_rotator else None
295
+
296
+ async with self._page_generator(
297
+ params.timeout, params.extra_headers, params.disable_resources, proxy
298
+ ) as page_info:
299
+ final_response = [None]
300
+ page = page_info.page
301
+ page.on("response", self._create_response_handler(page_info, final_response))
302
+
303
+ try:
304
+ first_response = await page.goto(url, referer=referer)
305
+ await self._wait_for_page_stability(page, params.load_dom, params.network_idle)
306
+
307
+ if not first_response:
308
+ raise RuntimeError(f"Failed to get response for {url}")
309
+
310
+ if params.page_action:
311
+ try:
312
+ _ = await params.page_action(page)
313
+ except Exception as e: # pragma: no cover
314
+ log.error(f"Error executing page_action: {e}")
315
+
316
+ if params.wait_selector:
317
+ try:
318
+ waiter: AsyncLocator = page.locator(params.wait_selector)
319
+ await waiter.first.wait_for(state=params.wait_selector_state)
320
+ await self._wait_for_page_stability(page, params.load_dom, params.network_idle)
321
+ except Exception as e: # pragma: no cover
322
+ log.error(f"Error waiting for selector {params.wait_selector}: {e}")
323
+
324
+ await page.wait_for_timeout(params.wait)
325
+
326
+ response = await ResponseFactory.from_async_playwright_response(
327
+ page, first_response, final_response[0], params.selector_config
328
+ )
329
+ return response
330
+
331
+ except Exception as e:
332
+ page_info.mark_error()
333
+ if attempt < self._config.retries - 1:
334
+ if is_proxy_error(e):
335
+ log.warning(
336
+ f"Proxy '{proxy}' failed (attempt {attempt + 1}) | Retrying in {self._config.retry_delay}s..."
337
+ )
338
+ else:
339
+ log.warning(
340
+ f"Attempt {attempt + 1} failed: {e}. Retrying in {self._config.retry_delay}s..."
341
+ )
342
+ await asyncio_sleep(self._config.retry_delay)
343
+ else:
344
+ log.error(f"Failed after {self._config.retries} attempts: {e}")
345
+ raise
346
+
347
+ raise RuntimeError("Request failed") # pragma: no cover
 
scrapling/engines/_browsers/_stealth.py CHANGED
@@ -3,10 +3,7 @@ from re import compile as re_compile
3
  from time import sleep as time_sleep
4
  from asyncio import sleep as asyncio_sleep
5
 
6
- from playwright.sync_api import (
7
- Locator,
8
- Page,
9
- )
10
  from playwright.async_api import (
11
  Page as async_Page,
12
  Locator as AsyncLocator,
@@ -17,12 +14,13 @@ from patchright.async_api import async_playwright
17
 
18
  from scrapling.core.utils import log
19
  from scrapling.core._types import Any, Unpack
20
- from ._config_tools import _compiled_stealth_scripts
21
- from ._types import StealthSession, StealthFetchParams
22
- from ._base import SyncSession, AsyncSession, StealthySessionMixin
23
- from ._validators import validate_fetch as _validate, StealthConfig
24
  from scrapling.engines.toolbelt.convertor import Response, ResponseFactory
25
  from scrapling.engines.toolbelt.fingerprints import generate_convincing_referer
 
 
 
 
26
 
27
  __CF_PATTERN__ = re_compile("challenges.cloudflare.com/cdn-cgi/challenge-platform/.*")
28
 
@@ -33,7 +31,9 @@ class StealthySession(SyncSession, StealthySessionMixin):
33
  __slots__ = (
34
  "_config",
35
  "_context_options",
36
- "_launch_options",
 
 
37
  "max_pages",
38
  "page_pool",
39
  "_max_wait_for_page",
@@ -84,19 +84,20 @@ class StealthySession(SyncSession, StealthySessionMixin):
84
 
85
  try:
86
  if self._config.cdp_url: # pragma: no cover
87
- browser = self.playwright.chromium.connect_over_cdp(endpoint_url=self._config.cdp_url)
88
- self.context = browser.new_context(**self._context_options)
 
 
 
 
89
  else:
90
- self.context = self.playwright.chromium.launch_persistent_context(**self._launch_options)
91
-
92
- for script in _compiled_stealth_scripts():
93
- self.context.add_init_script(script=script)
94
-
95
- if self._config.init_script: # pragma: no cover
96
- self.context.add_init_script(path=self._config.init_script)
97
 
98
- if self._config.cookies: # pragma: no cover
99
- self.context.add_cookies(self._config.cookies)
100
 
101
  self._is_alive = True
102
  except Exception:
@@ -107,6 +108,14 @@ class StealthySession(SyncSession, StealthySessionMixin):
107
  else:
108
  raise RuntimeError("Session has been already started")
109
 
 
 
 
 
 
 
 
 
110
  def _cloudflare_solver(self, page: Page) -> None: # pragma: no cover
111
  """Solve the cloudflare challenge displayed on the playwright page passed
112
 
@@ -209,70 +218,78 @@ class StealthySession(SyncSession, StealthySessionMixin):
209
  )
210
 
211
  for attempt in range(self._config.retries):
212
- page_info = self._get_page(params.timeout, params.extra_headers, params.disable_resources)
213
- final_response = [None]
214
- handle_response = self._create_response_handler(page_info, final_response)
215
-
216
- try: # pragma: no cover
217
- # Navigate to URL and wait for a specified state
218
- page_info.page.on("response", handle_response)
219
- first_response = page_info.page.goto(url, referer=referer)
220
- self._wait_for_page_stability(page_info.page, params.load_dom, params.network_idle)
221
-
222
- if not first_response:
223
- raise RuntimeError(f"Failed to get response for {url}")
224
-
225
- if params.solve_cloudflare:
226
- self._cloudflare_solver(page_info.page)
227
- # Make sure the page is fully loaded after the captcha
228
- self._wait_for_page_stability(page_info.page, params.load_dom, params.network_idle)
229
-
230
- if params.page_action:
231
- try:
232
- _ = params.page_action(page_info.page)
233
- except Exception as e: # pragma: no cover
234
- log.error(f"Error executing page_action: {e}")
235
-
236
- if params.wait_selector:
237
- try:
238
- waiter: Locator = page_info.page.locator(params.wait_selector)
239
- waiter.first.wait_for(state=params.wait_selector_state)
240
- # Wait again after waiting for the selector, helpful with protections like Cloudflare
241
- self._wait_for_page_stability(page_info.page, params.load_dom, params.network_idle)
242
- except Exception as e: # pragma: no cover
243
- log.error(f"Error waiting for selector {params.wait_selector}: {e}")
244
-
245
- page_info.page.wait_for_timeout(params.wait)
246
-
247
- # Create response object
248
- response = ResponseFactory.from_playwright_response(
249
- page_info.page, first_response, final_response[0], params.selector_config
250
- )
251
-
252
- # Close the page to free up resources
253
- page_info.page.close()
254
- self.page_pool.pages.remove(page_info)
255
-
256
- return response
257
-
258
- except Exception as e:
259
- page_info.mark_error()
260
- page_info.page.close()
261
- self.page_pool.pages.remove(page_info)
262
-
263
- if attempt < self._config.retries - 1 and self._is_retriable(e):
264
- log.warning(f"Attempt {attempt + 1} failed: {e}. Retrying in {self._config.retry_delay}s...")
265
- time_sleep(self._config.retry_delay)
266
- else:
267
- raise
268
-
269
- # For type checking purposes only
270
- raise AssertionError("Unreachable: retry loop must return or raise") # pragma: no cover
271
 
272
 
273
  class AsyncStealthySession(AsyncSession, StealthySessionMixin):
274
  """An async Stealthy Browser session manager with page pooling."""
275
 
 
 
 
 
 
 
 
 
276
  def __init__(self, **kwargs: Unpack[StealthSession]):
277
  """A Browser session manager with page pooling, it's using a persistent browser Context by default with a temporary user profile directory.
278
 
@@ -315,21 +332,22 @@ class AsyncStealthySession(AsyncSession, StealthySessionMixin):
315
  self.playwright = await async_playwright().start()
316
  try:
317
  if self._config.cdp_url:
318
- browser = await self.playwright.chromium.connect_over_cdp(endpoint_url=self._config.cdp_url)
319
- self.context: AsyncBrowserContext = await browser.new_context(**self._context_options)
 
 
 
 
320
  else:
 
 
 
321
  self.context: AsyncBrowserContext = await self.playwright.chromium.launch_persistent_context(
322
- **self._launch_options
323
  )
324
 
325
- for script in _compiled_stealth_scripts():
326
- await self.context.add_init_script(script=script)
327
-
328
- if self._config.init_script: # pragma: no cover
329
- await self.context.add_init_script(path=self._config.init_script)
330
-
331
- if self._config.cookies:
332
- await self.context.add_cookies(self._config.cookies) # pyright: ignore
333
 
334
  self._is_alive = True
335
  except Exception:
@@ -340,6 +358,14 @@ class AsyncStealthySession(AsyncSession, StealthySessionMixin):
340
  else:
341
  raise RuntimeError("Session has been already started")
342
 
 
 
 
 
 
 
 
 
343
  async def _cloudflare_solver(self, page: async_Page) -> None: # pragma: no cover
344
  """Solve the cloudflare challenge displayed on the playwright page passed
345
 
@@ -443,61 +469,62 @@ class AsyncStealthySession(AsyncSession, StealthySessionMixin):
443
  )
444
 
445
  for attempt in range(self._config.retries):
446
- page_info = await self._get_page(params.timeout, params.extra_headers, params.disable_resources)
447
- final_response = [None]
448
- handle_response = self._create_response_handler(page_info, final_response)
449
-
450
- try:
451
- # Navigate to URL and wait for a specified state
452
- page_info.page.on("response", handle_response)
453
- first_response = await page_info.page.goto(url, referer=referer)
454
- await self._wait_for_page_stability(page_info.page, params.load_dom, params.network_idle)
455
-
456
- if not first_response:
457
- raise RuntimeError(f"Failed to get response for {url}")
458
-
459
- if params.solve_cloudflare:
460
- await self._cloudflare_solver(page_info.page)
461
- # Make sure the page is fully loaded after the captcha
462
- await self._wait_for_page_stability(page_info.page, params.load_dom, params.network_idle)
463
-
464
- if params.page_action:
465
- try:
466
- _ = await params.page_action(page_info.page)
467
- except Exception as e:
468
- log.error(f"Error executing page_action: {e}")
469
-
470
- if params.wait_selector:
471
- try:
472
- waiter: AsyncLocator = page_info.page.locator(params.wait_selector)
473
- await waiter.first.wait_for(state=params.wait_selector_state)
474
- # Wait again after waiting for the selector, helpful with protections like Cloudflare
475
- await self._wait_for_page_stability(page_info.page, params.load_dom, params.network_idle)
476
- except Exception as e:
477
- log.error(f"Error waiting for selector {params.wait_selector}: {e}")
478
-
479
- await page_info.page.wait_for_timeout(params.wait)
480
-
481
- # Create response object
482
- response = await ResponseFactory.from_async_playwright_response(
483
- page_info.page, first_response, final_response[0], params.selector_config
484
- )
485
-
486
- # Close the page to free up resources
487
- await page_info.page.close()
488
- self.page_pool.pages.remove(page_info)
489
- return response
490
-
491
- except Exception as e: # pragma: no cover
492
- page_info.mark_error()
493
- await page_info.page.close()
494
- self.page_pool.pages.remove(page_info)
495
-
496
- if attempt < self._config.retries - 1 and self._is_retriable(e):
497
- log.warning(f"Attempt {attempt + 1} failed: {e}. Retrying in {self._config.retry_delay}s...")
498
- await asyncio_sleep(self._config.retry_delay)
499
- else:
500
- raise
501
-
502
- # For type checking purposes only
503
- raise AssertionError("Unreachable: retry loop must return or raise") # pragma: no cover
 
 
3
  from time import sleep as time_sleep
4
  from asyncio import sleep as asyncio_sleep
5
 
6
+ from playwright.sync_api import Locator, Page, BrowserContext
 
 
 
7
  from playwright.async_api import (
8
  Page as async_Page,
9
  Locator as AsyncLocator,
 
14
 
15
  from scrapling.core.utils import log
16
  from scrapling.core._types import Any, Unpack
17
+ from scrapling.engines.toolbelt.proxy_rotation import is_proxy_error
 
 
 
18
  from scrapling.engines.toolbelt.convertor import Response, ResponseFactory
19
  from scrapling.engines.toolbelt.fingerprints import generate_convincing_referer
20
+ from scrapling.engines._browsers._config_tools import _compiled_stealth_scripts
21
+ from scrapling.engines._browsers._types import StealthSession, StealthFetchParams
22
+ from scrapling.engines._browsers._base import SyncSession, AsyncSession, StealthySessionMixin
23
+ from scrapling.engines._browsers._validators import validate_fetch as _validate, StealthConfig
24
 
25
  __CF_PATTERN__ = re_compile("challenges.cloudflare.com/cdn-cgi/challenge-platform/.*")
26
 
 
31
  __slots__ = (
32
  "_config",
33
  "_context_options",
34
+ "_browser_options",
35
+ "_user_data_dir",
36
+ "_headers_keys",
37
  "max_pages",
38
  "page_pool",
39
  "_max_wait_for_page",
 
84
 
85
  try:
86
  if self._config.cdp_url: # pragma: no cover
87
+ self.browser = self.playwright.chromium.connect_over_cdp(endpoint_url=self._config.cdp_url)
88
+ if not self._config.proxy_rotator:
89
+ assert self.browser is not None
90
+ self.context = self.browser.new_context(**self._context_options)
91
+ elif self._config.proxy_rotator:
92
+ self.browser = self.playwright.chromium.launch(**self._browser_options)
93
  else:
94
+ persistent_options = (
95
+ self._browser_options | self._context_options | {"user_data_dir": self._user_data_dir}
96
+ )
97
+ self.context = self.playwright.chromium.launch_persistent_context(**persistent_options)
 
 
 
98
 
99
+ if self.context:
100
+ self.context = self._initialize_context(self._config, self.context)
101
 
102
  self._is_alive = True
103
  except Exception:
 
108
  else:
109
  raise RuntimeError("Session has been already started")
110
 
111
+ def _initialize_context(self, config, ctx: BrowserContext) -> BrowserContext:
112
+ """Initialize the browser context."""
113
+ for script in _compiled_stealth_scripts():
114
+ ctx.add_init_script(script=script)
115
+
116
+ ctx = super()._initialize_context(config, ctx)
117
+ return ctx
118
+
119
  def _cloudflare_solver(self, page: Page) -> None: # pragma: no cover
120
  """Solve the cloudflare challenge displayed on the playwright page passed
121
 
 
218
  )
219
 
220
  for attempt in range(self._config.retries):
221
+ proxy = self._config.proxy_rotator.get_proxy() if self._config.proxy_rotator else None
222
+
223
+ with self._page_generator(
224
+ params.timeout, params.extra_headers, params.disable_resources, proxy
225
+ ) as page_info:
226
+ final_response = [None]
227
+ page = page_info.page
228
+ page.on("response", self._create_response_handler(page_info, final_response))
229
+
230
+ try:
231
+ first_response = page.goto(url, referer=referer)
232
+ self._wait_for_page_stability(page, params.load_dom, params.network_idle)
233
+
234
+ if not first_response:
235
+ raise RuntimeError(f"Failed to get response for {url}")
236
+
237
+ if params.solve_cloudflare:
238
+ self._cloudflare_solver(page)
239
+ # Make sure the page is fully loaded after the captcha
240
+ self._wait_for_page_stability(page, params.load_dom, params.network_idle)
241
+
242
+ if params.page_action:
243
+ try:
244
+ _ = params.page_action(page)
245
+ except Exception as e: # pragma: no cover
246
+ log.error(f"Error executing page_action: {e}")
247
+
248
+ if params.wait_selector:
249
+ try:
250
+ waiter: Locator = page.locator(params.wait_selector)
251
+ waiter.first.wait_for(state=params.wait_selector_state)
252
+ self._wait_for_page_stability(page, params.load_dom, params.network_idle)
253
+ except Exception as e: # pragma: no cover
254
+ log.error(f"Error waiting for selector {params.wait_selector}: {e}")
255
+
256
+ page.wait_for_timeout(params.wait)
257
+
258
+ response = ResponseFactory.from_playwright_response(
259
+ page, first_response, final_response[0], params.selector_config
260
+ )
261
+ return response
262
+
263
+ except Exception as e:
264
+ page_info.mark_error()
265
+ if attempt < self._config.retries - 1:
266
+ if is_proxy_error(e):
267
+ log.warning(
268
+ f"Proxy '{proxy}' failed (attempt {attempt + 1}) | Retrying in {self._config.retry_delay}s..."
269
+ )
270
+ else:
271
+ log.warning(
272
+ f"Attempt {attempt + 1} failed: {e}. Retrying in {self._config.retry_delay}s..."
273
+ )
274
+ time_sleep(self._config.retry_delay)
275
+ else:
276
+ log.error(f"Failed after {self._config.retries} attempts: {e}")
277
+ raise
278
+
279
+ raise RuntimeError("Request failed") # pragma: no cover
280
 
281
 
282
  class AsyncStealthySession(AsyncSession, StealthySessionMixin):
283
  """An async Stealthy Browser session manager with page pooling."""
284
 
285
+ __slots__ = (
286
+ "_config",
287
+ "_context_options",
288
+ "_browser_options",
289
+ "_user_data_dir",
290
+ "_headers_keys",
291
+ )
292
+
293
  def __init__(self, **kwargs: Unpack[StealthSession]):
294
  """A Browser session manager with page pooling, it's using a persistent browser Context by default with a temporary user profile directory.
295
 
 
332
  self.playwright = await async_playwright().start()
333
  try:
334
  if self._config.cdp_url:
335
+ self.browser = await self.playwright.chromium.connect_over_cdp(endpoint_url=self._config.cdp_url)
336
+ if not self._config.proxy_rotator:
337
+ assert self.browser is not None
338
+ self.context: AsyncBrowserContext = await self.browser.new_context(**self._context_options)
339
+ elif self._config.proxy_rotator:
340
+ self.browser = await self.playwright.chromium.launch(**self._browser_options)
341
  else:
342
+ persistent_options = (
343
+ self._browser_options | self._context_options | {"user_data_dir": self._user_data_dir}
344
+ )
345
  self.context: AsyncBrowserContext = await self.playwright.chromium.launch_persistent_context(
346
+ **persistent_options
347
  )
348
 
349
+ if self.context:
350
+ self.context = await self._initialize_context(self._config, self.context)
 
 
 
 
 
 
351
 
352
  self._is_alive = True
353
  except Exception:
 
358
  else:
359
  raise RuntimeError("Session has been already started")
360
 
361
+ async def _initialize_context(self, config, ctx: AsyncBrowserContext) -> AsyncBrowserContext:
362
+ """Initialize the browser context."""
363
+ for script in _compiled_stealth_scripts():
364
+ await ctx.add_init_script(script=script)
365
+
366
+ ctx = await super()._initialize_context(config, ctx)
367
+ return ctx
368
+
369
  async def _cloudflare_solver(self, page: async_Page) -> None: # pragma: no cover
370
  """Solve the cloudflare challenge displayed on the playwright page passed
371
 
 
469
  )
470
 
471
  for attempt in range(self._config.retries):
472
+ proxy = self._config.proxy_rotator.get_proxy() if self._config.proxy_rotator else None
473
+
474
+ async with self._page_generator(
475
+ params.timeout, params.extra_headers, params.disable_resources, proxy
476
+ ) as page_info:
477
+ final_response = [None]
478
+ page = page_info.page
479
+ page.on("response", self._create_response_handler(page_info, final_response))
480
+
481
+ try:
482
+ first_response = await page.goto(url, referer=referer)
483
+ await self._wait_for_page_stability(page, params.load_dom, params.network_idle)
484
+
485
+ if not first_response:
486
+ raise RuntimeError(f"Failed to get response for {url}")
487
+
488
+ if params.solve_cloudflare:
489
+ await self._cloudflare_solver(page)
490
+ # Make sure the page is fully loaded after the captcha
491
+ await self._wait_for_page_stability(page, params.load_dom, params.network_idle)
492
+
493
+ if params.page_action:
494
+ try:
495
+ _ = await params.page_action(page)
496
+ except Exception as e: # pragma: no cover
497
+ log.error(f"Error executing page_action: {e}")
498
+
499
+ if params.wait_selector:
500
+ try:
501
+ waiter: AsyncLocator = page.locator(params.wait_selector)
502
+ await waiter.first.wait_for(state=params.wait_selector_state)
503
+ await self._wait_for_page_stability(page, params.load_dom, params.network_idle)
504
+ except Exception as e: # pragma: no cover
505
+ log.error(f"Error waiting for selector {params.wait_selector}: {e}")
506
+
507
+ await page.wait_for_timeout(params.wait)
508
+
509
+ response = await ResponseFactory.from_async_playwright_response(
510
+ page, first_response, final_response[0], params.selector_config
511
+ )
512
+ return response
513
+
514
+ except Exception as e:
515
+ page_info.mark_error()
516
+ if attempt < self._config.retries - 1:
517
+ if is_proxy_error(e):
518
+ log.warning(
519
+ f"Proxy '{proxy}' failed (attempt {attempt + 1}) | Retrying in {self._config.retry_delay}s..."
520
+ )
521
+ else:
522
+ log.warning(
523
+ f"Attempt {attempt + 1} failed: {e}. Retrying in {self._config.retry_delay}s..."
524
+ )
525
+ await asyncio_sleep(self._config.retry_delay)
526
+ else:
527
+ log.error(f"Failed after {self._config.retries} attempts: {e}")
528
+ raise
529
+
530
+ raise RuntimeError("Request failed") # pragma: no cover
scrapling/engines/_browsers/_types.py CHANGED
@@ -20,6 +20,7 @@ from scrapling.core._types import (
20
  SelectorWaitStates,
21
  TYPE_CHECKING,
22
  )
 
23
 
24
  # Type alias for `impersonate` parameter - accepts a single browser or list of browsers
25
  ImpersonateType: TypeAlias = BrowserTypeLiteral | List[BrowserTypeLiteral] | None
@@ -34,6 +35,7 @@ if TYPE_CHECKING: # pragma: no cover
34
  proxies: Optional[ProxySpec]
35
  proxy: Optional[str]
36
  proxy_auth: Optional[Tuple[str, str]]
 
37
  timeout: Optional[int | float]
38
  headers: Optional[Mapping[str, Optional[str]]]
39
  retries: Optional[int]
@@ -70,6 +72,7 @@ if TYPE_CHECKING: # pragma: no cover
70
  timezone_id: str | None
71
  page_action: Optional[Callable]
72
  proxy: Optional[str | Dict[str, str] | Tuple]
 
73
  extra_headers: Optional[Dict[str, str]]
74
  timeout: int | float
75
  init_script: Optional[str]
 
20
  SelectorWaitStates,
21
  TYPE_CHECKING,
22
  )
23
+ from scrapling.engines.toolbelt.proxy_rotation import ProxyRotator
24
 
25
  # Type alias for `impersonate` parameter - accepts a single browser or list of browsers
26
  ImpersonateType: TypeAlias = BrowserTypeLiteral | List[BrowserTypeLiteral] | None
 
35
  proxies: Optional[ProxySpec]
36
  proxy: Optional[str]
37
  proxy_auth: Optional[Tuple[str, str]]
38
+ proxy_rotator: Optional[ProxyRotator]
39
  timeout: Optional[int | float]
40
  headers: Optional[Mapping[str, Optional[str]]]
41
  retries: Optional[int]
 
72
  timezone_id: str | None
73
  page_action: Optional[Callable]
74
  proxy: Optional[str | Dict[str, str] | Tuple]
75
+ proxy_rotator: Optional[ProxyRotator]
76
  extra_headers: Optional[Dict[str, str]]
77
  timeout: int | float
78
  init_script: Optional[str]
scrapling/engines/_browsers/_validators.py CHANGED
@@ -18,6 +18,7 @@ from scrapling.core._types import (
18
  SetCookieParam,
19
  SelectorWaitStates,
20
  )
 
21
  from scrapling.engines.toolbelt.navigation import construct_proxy_dict
22
  from scrapling.engines._browsers._types import PlaywrightFetchParams, StealthFetchParams
23
 
@@ -70,6 +71,7 @@ class PlaywrightConfig(Struct, kw_only=True, frozen=False, weakref=True):
70
  timezone_id: str | None = ""
71
  page_action: Optional[Callable] = None
72
  proxy: Optional[str | Dict[str, str] | Tuple] = None # The default value for proxy in Playwright's source is `None`
 
73
  extra_headers: Optional[Dict[str, str]] = None
74
  timeout: Seconds = 30000
75
  init_script: Optional[str] = None
@@ -88,6 +90,11 @@ class PlaywrightConfig(Struct, kw_only=True, frozen=False, weakref=True):
88
  """Custom validation after msgspec validation"""
89
  if self.page_action and not callable(self.page_action):
90
  raise TypeError(f"page_action must be callable, got {type(self.page_action).__name__}")
 
 
 
 
 
91
  if self.proxy:
92
  self.proxy = construct_proxy_dict(self.proxy)
93
  if self.cdp_url:
 
18
  SetCookieParam,
19
  SelectorWaitStates,
20
  )
21
+ from scrapling.engines.toolbelt.proxy_rotation import ProxyRotator
22
  from scrapling.engines.toolbelt.navigation import construct_proxy_dict
23
  from scrapling.engines._browsers._types import PlaywrightFetchParams, StealthFetchParams
24
 
 
71
  timezone_id: str | None = ""
72
  page_action: Optional[Callable] = None
73
  proxy: Optional[str | Dict[str, str] | Tuple] = None # The default value for proxy in Playwright's source is `None`
74
+ proxy_rotator: Optional[ProxyRotator] = None
75
  extra_headers: Optional[Dict[str, str]] = None
76
  timeout: Seconds = 30000
77
  init_script: Optional[str] = None
 
90
  """Custom validation after msgspec validation"""
91
  if self.page_action and not callable(self.page_action):
92
  raise TypeError(f"page_action must be callable, got {type(self.page_action).__name__}")
93
+ if self.proxy and self.proxy_rotator:
94
+ raise ValueError(
95
+ "Cannot use 'proxy_rotator' together with 'proxy'. "
96
+ "Use either a static proxy or proxy rotation, not both."
97
+ )
98
  if self.proxy:
99
  self.proxy = construct_proxy_dict(self.proxy)
100
  if self.cdp_url:
scrapling/engines/constants.py CHANGED
@@ -57,7 +57,6 @@ DEFAULT_STEALTH_FLAGS = (
57
  "--ignore-gpu-blocklist",
58
  "--enable-tcp-fast-open",
59
  "--enable-web-bluetooth",
60
- "--disable-hang-monitor",
61
  "--disable-cloud-import",
62
  "--disable-print-preview",
63
  "--disable-dev-shm-usage",
@@ -84,7 +83,6 @@ DEFAULT_STEALTH_FLAGS = (
84
  "--prerender-from-omnibox=disabled",
85
  "--safebrowsing-disable-auto-update",
86
  "--disable-offer-upload-credit-cards",
87
- "--disable-features=site-per-process",
88
  "--disable-background-timer-throttling",
89
  "--disable-new-content-rendering-timeout",
90
  "--run-all-compositor-stages-before-draw",
 
57
  "--ignore-gpu-blocklist",
58
  "--enable-tcp-fast-open",
59
  "--enable-web-bluetooth",
 
60
  "--disable-cloud-import",
61
  "--disable-print-preview",
62
  "--disable-dev-shm-usage",
 
83
  "--prerender-from-omnibox=disabled",
84
  "--safebrowsing-disable-auto-update",
85
  "--disable-offer-upload-credit-cards",
 
86
  "--disable-background-timer-throttling",
87
  "--disable-new-content-rendering-timeout",
88
  "--run-all-compositor-stages-before-draw",
scrapling/engines/static.py CHANGED
@@ -22,9 +22,10 @@ from scrapling.core._types import (
22
  SUPPORTED_HTTP_METHODS,
23
  )
24
 
25
- from ._browsers._types import RequestsSession, GetRequestParams, DataRequestParams, ImpersonateType
26
  from .toolbelt.custom import Response
27
  from .toolbelt.convertor import ResponseFactory
 
 
28
  from .toolbelt.fingerprints import generate_convincing_referer, generate_headers, __default_useragent__
29
 
30
  _NO_SESSION: Any = object()
@@ -63,6 +64,7 @@ class _ConfigurationLogic(ABC):
63
  "_default_http3",
64
  "selector_config",
65
  "_is_alive",
 
66
  )
67
 
68
  def __init__(self, **kwargs: Unpack[RequestsSession]):
@@ -82,6 +84,13 @@ class _ConfigurationLogic(ABC):
82
  self._default_http3 = kwargs.get("http3", False)
83
  self.selector_config = kwargs.get("selector_config") or {}
84
  self._is_alive = False
 
 
 
 
 
 
 
85
 
86
  @staticmethod
87
  def _get_param(kwargs: Dict, key: str, default: Any) -> Any:
@@ -218,7 +227,7 @@ class _SyncSessionLogic(_ConfigurationLogic):
218
  selector_config = self._get_param(kwargs, "selector_config", self.selector_config) or self.selector_config
219
  max_retries = self._get_param(kwargs, "retries", self._default_retries)
220
  retry_delay = self._get_param(kwargs, "retry_delay", self._default_retry_delay)
221
- request_args = self._merge_request_args(stealth=stealth, **kwargs)
222
 
223
  session = self._curl_session
224
  one_off_request = False
@@ -228,22 +237,38 @@ class _SyncSessionLogic(_ConfigurationLogic):
228
  session = CurlSession()
229
  one_off_request = True
230
 
231
- if session:
 
 
 
232
  for attempt in range(max_retries):
 
 
 
 
 
 
233
  try:
234
  response = session.request(method, **request_args)
235
  result = ResponseFactory.from_http_request(response, selector_config)
236
  return result
237
  except CurlError as e: # pragma: no cover
238
  if attempt < max_retries - 1:
239
- log.error(f"Attempt {attempt + 1} failed: {e}. Retrying in {retry_delay} seconds...")
 
 
 
 
 
 
 
240
  time_sleep(retry_delay)
241
  else:
242
  log.error(f"Failed after {max_retries} attempts: {e}")
243
  raise # Raise the exception if all retries fail
244
- finally:
245
- if session and one_off_request:
246
- session.close()
247
 
248
  raise RuntimeError("No active session available.") # pragma: no cover
249
 
@@ -415,7 +440,7 @@ class _ASyncSessionLogic(_ConfigurationLogic):
415
  selector_config = self._get_param(kwargs, "selector_config", self.selector_config) or self.selector_config
416
  max_retries = self._get_param(kwargs, "retries", self._default_retries)
417
  retry_delay = self._get_param(kwargs, "retry_delay", self._default_retry_delay)
418
- request_args = self._merge_request_args(stealth=stealth, **kwargs)
419
 
420
  session = self._async_curl_session
421
  one_off_request = False
@@ -427,22 +452,40 @@ class _ASyncSessionLogic(_ConfigurationLogic):
427
  session = AsyncCurlSession()
428
  one_off_request = True
429
 
430
- if session:
 
 
 
 
431
  for attempt in range(max_retries):
 
 
 
 
 
 
432
  try:
433
  response = await session.request(method, **request_args)
434
  result = ResponseFactory.from_http_request(response, selector_config)
435
  return result
436
  except CurlError as e: # pragma: no cover
437
  if attempt < max_retries - 1:
438
- log.error(f"Attempt {attempt + 1} failed: {e}. Retrying in {retry_delay} seconds...")
 
 
 
 
 
 
 
 
439
  await asyncio_sleep(retry_delay)
440
  else:
441
  log.error(f"Failed after {max_retries} attempts: {e}")
442
  raise # Raise the exception if all retries fail
443
- finally:
444
- if session and one_off_request:
445
- await session.close()
446
 
447
  raise RuntimeError("No active session available.") # pragma: no cover
448
 
@@ -604,6 +647,7 @@ class FetcherSession:
604
  "selector_config",
605
  "_client",
606
  "_is_alive",
 
607
  )
608
 
609
  def __init__(
@@ -623,6 +667,7 @@ class FetcherSession:
623
  verify: bool = True,
624
  cert: Optional[str | Tuple[str, str]] = None,
625
  selector_config: Optional[Dict] = None,
 
626
  ):
627
  """
628
  :param impersonate: Browser version to impersonate. Can be a single browser string or a list of browser strings for random selection. (Default: latest available Chrome version)
@@ -641,6 +686,7 @@ class FetcherSession:
641
  :param verify: Whether to verify HTTPS certificates. Defaults to True.
642
  :param cert: Tuple of (cert, key) filenames for the client certificate.
643
  :param selector_config: Arguments passed when creating the final Selector class.
 
644
  """
645
  self._default_impersonate: ImpersonateType = impersonate
646
  self._stealth = stealthy_headers
@@ -659,6 +705,7 @@ class FetcherSession:
659
  self.selector_config = selector_config or {}
660
  self._is_alive = False
661
  self._client: _SyncSessionLogic | _ASyncSessionLogic | None = None
 
662
 
663
  def __enter__(self) -> _SyncSessionLogic:
664
  """Creates and returns a new synchronous Fetcher Session"""
@@ -667,6 +714,7 @@ class FetcherSession:
667
  config = {k.replace("_default_", ""): getattr(self, k) for k in self.__slots__ if k.startswith("_default")}
668
  config["stealthy_headers"] = self._stealth
669
  config["selector_config"] = self.selector_config
 
670
  self._client = _SyncSessionLogic(**config)
671
  self._is_alive = True
672
  return self._client.__enter__()
@@ -687,6 +735,7 @@ class FetcherSession:
687
  config = {k.replace("_default_", ""): getattr(self, k) for k in self.__slots__ if k.startswith("_default")}
688
  config["stealthy_headers"] = self._stealth
689
  config["selector_config"] = self.selector_config
 
690
  self._client = _ASyncSessionLogic(**config)
691
  self._is_alive = True
692
  return await self._client.__aenter__()
 
22
  SUPPORTED_HTTP_METHODS,
23
  )
24
 
 
25
  from .toolbelt.custom import Response
26
  from .toolbelt.convertor import ResponseFactory
27
+ from .toolbelt.proxy_rotation import ProxyRotator, is_proxy_error
28
+ from ._browsers._types import RequestsSession, GetRequestParams, DataRequestParams, ImpersonateType
29
  from .toolbelt.fingerprints import generate_convincing_referer, generate_headers, __default_useragent__
30
 
31
  _NO_SESSION: Any = object()
 
64
  "_default_http3",
65
  "selector_config",
66
  "_is_alive",
67
+ "_proxy_rotator",
68
  )
69
 
70
  def __init__(self, **kwargs: Unpack[RequestsSession]):
 
84
  self._default_http3 = kwargs.get("http3", False)
85
  self.selector_config = kwargs.get("selector_config") or {}
86
  self._is_alive = False
87
+ self._proxy_rotator: Optional[ProxyRotator] = kwargs.get("proxy_rotator")
88
+
89
+ if self._proxy_rotator and (self._default_proxy or self._default_proxies):
90
+ raise ValueError(
91
+ "Cannot use 'proxy_rotator' together with 'proxy' or 'proxies'. "
92
+ "Use either a static proxy or proxy rotation, not both."
93
+ )
94
 
95
  @staticmethod
96
  def _get_param(kwargs: Dict, key: str, default: Any) -> Any:
 
227
  selector_config = self._get_param(kwargs, "selector_config", self.selector_config) or self.selector_config
228
  max_retries = self._get_param(kwargs, "retries", self._default_retries)
229
  retry_delay = self._get_param(kwargs, "retry_delay", self._default_retry_delay)
230
+ static_proxy = kwargs.pop("proxy", None)
231
 
232
  session = self._curl_session
233
  one_off_request = False
 
237
  session = CurlSession()
238
  one_off_request = True
239
 
240
+ if not session:
241
+ raise RuntimeError("No active session available.") # pragma: no cover
242
+
243
+ try:
244
  for attempt in range(max_retries):
245
+ if self._proxy_rotator and static_proxy is None:
246
+ proxy = self._proxy_rotator.get_proxy()
247
+ else:
248
+ proxy = static_proxy
249
+
250
+ request_args = self._merge_request_args(stealth=stealth, proxy=proxy, **kwargs)
251
  try:
252
  response = session.request(method, **request_args)
253
  result = ResponseFactory.from_http_request(response, selector_config)
254
  return result
255
  except CurlError as e: # pragma: no cover
256
  if attempt < max_retries - 1:
257
+ # Now if the rotator is enabled, we will try again with the new proxy
258
+ # If it's not enabled, then we will try again with the same proxy
259
+ if is_proxy_error(e):
260
+ log.warning(
261
+ f"Proxy '{proxy}' failed (attempt {attempt + 1}) | Retrying in {retry_delay} seconds..."
262
+ )
263
+ else:
264
+ log.warning(f"Attempt {attempt + 1} failed: {e}. Retrying in {retry_delay} seconds...")
265
  time_sleep(retry_delay)
266
  else:
267
  log.error(f"Failed after {max_retries} attempts: {e}")
268
  raise # Raise the exception if all retries fail
269
+ finally:
270
+ if session and one_off_request:
271
+ session.close()
272
 
273
  raise RuntimeError("No active session available.") # pragma: no cover
274
 
 
440
  selector_config = self._get_param(kwargs, "selector_config", self.selector_config) or self.selector_config
441
  max_retries = self._get_param(kwargs, "retries", self._default_retries)
442
  retry_delay = self._get_param(kwargs, "retry_delay", self._default_retry_delay)
443
+ static_proxy = kwargs.pop("proxy", None)
444
 
445
  session = self._async_curl_session
446
  one_off_request = False
 
452
  session = AsyncCurlSession()
453
  one_off_request = True
454
 
455
+ if not session:
456
+ raise RuntimeError("No active session available.") # pragma: no cover
457
+
458
+ try:
459
+ # Determine if we should use proxy rotation
460
  for attempt in range(max_retries):
461
+ if self._proxy_rotator and static_proxy is None:
462
+ proxy = self._proxy_rotator.get_proxy()
463
+ else:
464
+ proxy = static_proxy
465
+
466
+ request_args = self._merge_request_args(stealth=stealth, proxy=proxy, **kwargs)
467
  try:
468
  response = await session.request(method, **request_args)
469
  result = ResponseFactory.from_http_request(response, selector_config)
470
  return result
471
  except CurlError as e: # pragma: no cover
472
  if attempt < max_retries - 1:
473
+ # Now if the rotator is enabled, we will try again with the new proxy
474
+ # If it's not enabled, then we will try again with the same proxy
475
+ if is_proxy_error(e):
476
+ log.warning(
477
+ f"Proxy '{proxy}' failed (attempt {attempt + 1}) | Retrying in {retry_delay} seconds..."
478
+ )
479
+ else:
480
+ log.warning(f"Attempt {attempt + 1} failed: {e}. Retrying in {retry_delay} seconds...")
481
+
482
  await asyncio_sleep(retry_delay)
483
  else:
484
  log.error(f"Failed after {max_retries} attempts: {e}")
485
  raise # Raise the exception if all retries fail
486
+ finally:
487
+ if session and one_off_request:
488
+ await session.close()
489
 
490
  raise RuntimeError("No active session available.") # pragma: no cover
491
 
 
647
  "selector_config",
648
  "_client",
649
  "_is_alive",
650
+ "_proxy_rotator",
651
  )
652
 
653
  def __init__(
 
667
  verify: bool = True,
668
  cert: Optional[str | Tuple[str, str]] = None,
669
  selector_config: Optional[Dict] = None,
670
+ proxy_rotator: Optional[ProxyRotator] = None,
671
  ):
672
  """
673
  :param impersonate: Browser version to impersonate. Can be a single browser string or a list of browser strings for random selection. (Default: latest available Chrome version)
 
686
  :param verify: Whether to verify HTTPS certificates. Defaults to True.
687
  :param cert: Tuple of (cert, key) filenames for the client certificate.
688
  :param selector_config: Arguments passed when creating the final Selector class.
689
+ :param proxy_rotator: A ProxyRotator instance for automatic proxy rotation.
690
  """
691
  self._default_impersonate: ImpersonateType = impersonate
692
  self._stealth = stealthy_headers
 
705
  self.selector_config = selector_config or {}
706
  self._is_alive = False
707
  self._client: _SyncSessionLogic | _ASyncSessionLogic | None = None
708
+ self._proxy_rotator = proxy_rotator
709
 
710
  def __enter__(self) -> _SyncSessionLogic:
711
  """Creates and returns a new synchronous Fetcher Session"""
 
714
  config = {k.replace("_default_", ""): getattr(self, k) for k in self.__slots__ if k.startswith("_default")}
715
  config["stealthy_headers"] = self._stealth
716
  config["selector_config"] = self.selector_config
717
+ config["proxy_rotator"] = self._proxy_rotator
718
  self._client = _SyncSessionLogic(**config)
719
  self._is_alive = True
720
  return self._client.__enter__()
 
735
  config = {k.replace("_default_", ""): getattr(self, k) for k in self.__slots__ if k.startswith("_default")}
736
  config["stealthy_headers"] = self._stealth
737
  config["selector_config"] = self.selector_config
738
+ config["proxy_rotator"] = self._proxy_rotator
739
  self._client = _ASyncSessionLogic(**config)
740
  self._is_alive = True
741
  return await self._client.__aenter__()
scrapling/engines/toolbelt/__init__.py CHANGED
@@ -1 +1,3 @@
 
1
 
 
 
1
+ from .proxy_rotation import ProxyRotator, is_proxy_error, round_robin
2
 
3
+ __all__ = ["ProxyRotator", "is_proxy_error", "round_robin"]
scrapling/engines/toolbelt/proxy_rotation.py ADDED
@@ -0,0 +1,104 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from threading import Lock
2
+
3
+ from scrapling.core._types import Callable, Dict, List, Tuple, ProxyType
4
+
5
+
6
+ RotationStrategy = Callable[[List[ProxyType], int], Tuple[ProxyType, int]]
7
+ _PROXY_ERROR_INDICATORS = {
8
+ "net::err_proxy",
9
+ "net::err_tunnel",
10
+ "connection refused",
11
+ "connection reset",
12
+ "connection timed out",
13
+ "failed to connect",
14
+ "could not resolve proxy",
15
+ }
16
+
17
+
18
def _get_proxy_key(proxy: ProxyType) -> str:
    """Generate a stable lookup key for a proxy (server plus username for dicts)."""
    if isinstance(proxy, str):
        # A URL string is already its own identity.
        return proxy
    # Playwright-style dict: key on endpoint + account, deliberately ignoring
    # the password so the same server/user pair always maps to one slot.
    return f'{proxy.get("server", "")}|{proxy.get("username", "")}'
25
+
26
+
27
def is_proxy_error(error: Exception) -> bool:
    """Check if an error is proxy-related. Works for both HTTP and browser errors."""
    # Match case-insensitively against the known proxy failure markers.
    message = str(error).lower()
    for indicator in _PROXY_ERROR_INDICATORS:
        if indicator in message:
            return True
    return False
31
+
32
+
33
def round_robin(proxies: List[ProxyType], current_index: int) -> Tuple[ProxyType, int]:
    """Default round-robin rotation strategy."""
    pool_size = len(proxies)
    # Wrap the incoming cursor so an out-of-range index is tolerated.
    position = current_index % pool_size
    next_position = (position + 1) % pool_size
    return proxies[position], next_position
37
+
38
+
39
class ProxyRotator:
    """
    Thread-safe proxy rotation with a pluggable selection strategy.

    Handles:
        - Round-robin cycling (the default strategy)
        - Any custom strategy supplied as a callable
        - Proxies given as URL strings or Playwright-style dicts
    """

    __slots__ = ("_proxies", "_proxy_to_index", "_strategy", "_current_index", "_lock")

    def __init__(
        self,
        proxies: List[ProxyType],
        strategy: RotationStrategy = round_robin,
    ):
        """
        Set up the rotator with its proxy pool and selection strategy.

        :param proxies: List of proxy URLs or Playwright-style proxy dicts.
            - String format: "http://proxy1:8080" or "http://user:pass@proxy:8080"
            - Dict format: {"server": "http://proxy:8080", "username": "user", "password": "pass"}
        :param strategy: Rotation strategy function. Takes (proxies, current_index) and returns (proxy, next_index). Defaults to round_robin.
        :raises ValueError: If no proxies are given or a dict proxy lacks a 'server' key.
        :raises TypeError: If the strategy isn't callable or a proxy isn't a str/dict.
        """
        if not proxies:
            raise ValueError("At least one proxy must be provided")

        if not callable(strategy):
            raise TypeError(f"strategy must be callable, got {type(strategy).__name__}")

        self._strategy = strategy
        self._lock = Lock()

        # Validate entries up front and keep an O(1) reverse lookup keyed by
        # server + username (via the module-level proxy key helper).
        self._proxies: List[ProxyType] = []
        self._proxy_to_index: Dict[str, int] = {}
        for position, entry in enumerate(proxies):
            if not isinstance(entry, (str, dict)):
                raise TypeError(f"Invalid proxy type: {type(entry)}. Expected str or dict.")
            if isinstance(entry, dict) and "server" not in entry:
                raise ValueError("Proxy dict must have a 'server' key")

            self._proxy_to_index[_get_proxy_key(entry)] = position
            self._proxies.append(entry)

        self._current_index = 0

    def get_proxy(self) -> ProxyType:
        """Return the next proxy chosen by the rotation strategy (thread-safe)."""
        # Strategy call and cursor update are atomic with respect to other threads.
        with self._lock:
            chosen, self._current_index = self._strategy(self._proxies, self._current_index)
        return chosen

    @property
    def proxies(self) -> List[ProxyType]:
        """A shallow copy of the configured proxy pool."""
        return self._proxies.copy()

    def __len__(self) -> int:
        """Return the total number of configured proxies."""
        return len(self._proxies)

    def __repr__(self) -> str:
        return "ProxyRotator(proxies={})".format(len(self._proxies))