Karim shoair committed on
Commit
024cbba
·
1 Parent(s): e251ff4

feat(browsers): Add option to retry tabs that give errors

Browse files
scrapling/engines/_browsers/_base.py CHANGED
@@ -310,6 +310,14 @@ class BaseSessionMixin:
310
  if config.additional_args:
311
  self._context_options.update(config.additional_args)
312
 
 
 
 
 
 
 
 
 
313
 
314
  class DynamicSessionMixin(BaseSessionMixin):
315
  def __validate__(self, **params):
 
310
  if config.additional_args:
311
  self._context_options.update(config.additional_args)
312
 
313
+ @staticmethod
314
+ def _is_retriable(error: Exception) -> bool:
315
+ """Check if an error is retriable (transient network/timeout issues)."""
316
+ if isinstance(error, TimeoutError):
317
+ return True
318
+ error_msg = str(error).lower()
319
+ return "net::" in error_msg or "failed to get response" in error_msg
320
+
321
 
322
  class DynamicSessionMixin(BaseSessionMixin):
323
  def __validate__(self, **params):
scrapling/engines/_browsers/_controllers.py CHANGED
@@ -1,3 +1,6 @@
 
 
 
1
  from playwright.sync_api import (
2
  Locator,
3
  Playwright,
@@ -115,52 +118,56 @@ class DynamicSession(SyncSession, DynamicSessionMixin):
115
  else None
116
  )
117
 
118
- page_info = self._get_page(params.timeout, params.extra_headers, params.disable_resources)
119
- final_response = [None]
120
- handle_response = self._create_response_handler(page_info, final_response)
121
-
122
- try: # pragma: no cover
123
- # Navigate to URL and wait for a specified state
124
- page_info.page.on("response", handle_response)
125
- first_response = page_info.page.goto(url, referer=referer)
126
- self._wait_for_page_stability(page_info.page, params.load_dom, params.network_idle)
127
-
128
- if not first_response:
129
- raise RuntimeError(f"Failed to get response for {url}")
130
-
131
- if params.page_action:
132
- try:
133
- _ = params.page_action(page_info.page)
134
- except Exception as e: # pragma: no cover
135
- log.error(f"Error executing page_action: {e}")
136
-
137
- if params.wait_selector:
138
- try:
139
- waiter: Locator = page_info.page.locator(params.wait_selector)
140
- waiter.first.wait_for(state=params.wait_selector_state)
141
- # Wait again after waiting for the selector, helpful with protections like Cloudflare
142
- self._wait_for_page_stability(page_info.page, params.load_dom, params.network_idle)
143
- except Exception as e: # pragma: no cover
144
- log.error(f"Error waiting for selector {params.wait_selector}: {e}")
145
-
146
- page_info.page.wait_for_timeout(params.wait)
 
 
 
147
 
148
- # Create response object
149
- response = ResponseFactory.from_playwright_response(
150
- page_info.page, first_response, final_response[0], params.selector_config
151
- )
152
 
153
- # Close the page to free up resources
154
- page_info.page.close()
155
- self.page_pool.pages.remove(page_info)
 
156
 
157
- return response
 
 
 
 
158
 
159
- except Exception as e:
160
- page_info.mark_error()
161
- page_info.page.close()
162
- self.page_pool.pages.remove(page_info)
163
- raise e
164
 
165
 
166
  class AsyncDynamicSession(AsyncSession, DynamicSessionMixin):
@@ -252,54 +259,59 @@ class AsyncDynamicSession(AsyncSession, DynamicSessionMixin):
252
  else None
253
  )
254
 
255
- page_info = await self._get_page(params.timeout, params.extra_headers, params.disable_resources)
256
- final_response = [None]
257
- handle_response = self._create_response_handler(page_info, final_response)
258
-
259
- if TYPE_CHECKING:
260
- from playwright.async_api import Page as async_Page
261
-
262
- if not isinstance(page_info.page, async_Page):
263
- raise TypeError
264
-
265
- try:
266
- # Navigate to URL and wait for a specified state
267
- page_info.page.on("response", handle_response)
268
- first_response = await page_info.page.goto(url, referer=referer)
269
- await self._wait_for_page_stability(page_info.page, params.load_dom, params.network_idle)
270
-
271
- if not first_response:
272
- raise RuntimeError(f"Failed to get response for {url}")
273
-
274
- if params.page_action:
275
- try:
276
- _ = await params.page_action(page_info.page)
277
- except Exception as e:
278
- log.error(f"Error executing page_action: {e}")
279
-
280
- if params.wait_selector:
281
- try:
282
- waiter: AsyncLocator = page_info.page.locator(params.wait_selector)
283
- await waiter.first.wait_for(state=params.wait_selector_state)
284
- # Wait again after waiting for the selector, helpful with protections like Cloudflare
285
- await self._wait_for_page_stability(page_info.page, params.load_dom, params.network_idle)
286
- except Exception as e:
287
- log.error(f"Error waiting for selector {params.wait_selector}: {e}")
288
-
289
- await page_info.page.wait_for_timeout(params.wait)
290
-
291
- # Create response object
292
- response = await ResponseFactory.from_async_playwright_response(
293
- page_info.page, first_response, final_response[0], params.selector_config
294
- )
295
-
296
- # Close the page to free up resources
297
- await page_info.page.close()
298
- self.page_pool.pages.remove(page_info)
299
- return response
300
-
301
- except Exception as e: # pragma: no cover
302
- page_info.mark_error()
303
- await page_info.page.close()
304
- self.page_pool.pages.remove(page_info)
305
- raise e
 
 
 
 
 
 
1
+ from time import sleep as time_sleep
2
+ from asyncio import sleep as asyncio_sleep
3
+
4
  from playwright.sync_api import (
5
  Locator,
6
  Playwright,
 
118
  else None
119
  )
120
 
121
+ for attempt in range(self._config.retries):
122
+ page_info = self._get_page(params.timeout, params.extra_headers, params.disable_resources)
123
+ final_response = [None]
124
+ handle_response = self._create_response_handler(page_info, final_response)
125
+
126
+ try: # pragma: no cover
127
+ page_info.page.on("response", handle_response)
128
+ first_response = page_info.page.goto(url, referer=referer)
129
+ self._wait_for_page_stability(page_info.page, params.load_dom, params.network_idle)
130
+
131
+ if not first_response:
132
+ raise RuntimeError(f"Failed to get response for {url}")
133
+
134
+ if params.page_action:
135
+ try:
136
+ _ = params.page_action(page_info.page)
137
+ except Exception as e: # pragma: no cover
138
+ log.error(f"Error executing page_action: {e}")
139
+
140
+ if params.wait_selector:
141
+ try:
142
+ waiter: Locator = page_info.page.locator(params.wait_selector)
143
+ waiter.first.wait_for(state=params.wait_selector_state)
144
+ self._wait_for_page_stability(page_info.page, params.load_dom, params.network_idle)
145
+ except Exception as e: # pragma: no cover
146
+ log.error(f"Error waiting for selector {params.wait_selector}: {e}")
147
+
148
+ page_info.page.wait_for_timeout(params.wait)
149
+
150
+ response = ResponseFactory.from_playwright_response(
151
+ page_info.page, first_response, final_response[0], params.selector_config
152
+ )
153
 
154
+ page_info.page.close()
155
+ self.page_pool.pages.remove(page_info)
156
+ return response
 
157
 
158
+ except Exception as e:
159
+ page_info.mark_error()
160
+ page_info.page.close()
161
+ self.page_pool.pages.remove(page_info)
162
 
163
+ if attempt < self._config.retries - 1 and self._is_retriable(e):
164
+ log.warning(f"Attempt {attempt + 1} failed: {e}. Retrying in {self._config.retry_delay}s...")
165
+ time_sleep(self._config.retry_delay)
166
+ else:
167
+ raise
168
 
169
+ # For type checking purposes only
170
+ raise AssertionError("Unreachable: retry loop must return or raise") # pragma: no cover
 
 
 
171
 
172
 
173
  class AsyncDynamicSession(AsyncSession, DynamicSessionMixin):
 
259
  else None
260
  )
261
 
262
+ for attempt in range(self._config.retries):
263
+ page_info = await self._get_page(params.timeout, params.extra_headers, params.disable_resources)
264
+ final_response = [None]
265
+ handle_response = self._create_response_handler(page_info, final_response)
266
+
267
+ if TYPE_CHECKING:
268
+ from playwright.async_api import Page as async_Page
269
+
270
+ if not isinstance(page_info.page, async_Page):
271
+ raise TypeError
272
+
273
+ try:
274
+ page_info.page.on("response", handle_response)
275
+ first_response = await page_info.page.goto(url, referer=referer)
276
+ await self._wait_for_page_stability(page_info.page, params.load_dom, params.network_idle)
277
+
278
+ if not first_response:
279
+ raise RuntimeError(f"Failed to get response for {url}")
280
+
281
+ if params.page_action:
282
+ try:
283
+ _ = await params.page_action(page_info.page)
284
+ except Exception as e:
285
+ log.error(f"Error executing page_action: {e}")
286
+
287
+ if params.wait_selector:
288
+ try:
289
+ waiter: AsyncLocator = page_info.page.locator(params.wait_selector)
290
+ await waiter.first.wait_for(state=params.wait_selector_state)
291
+ await self._wait_for_page_stability(page_info.page, params.load_dom, params.network_idle)
292
+ except Exception as e:
293
+ log.error(f"Error waiting for selector {params.wait_selector}: {e}")
294
+
295
+ await page_info.page.wait_for_timeout(params.wait)
296
+
297
+ response = await ResponseFactory.from_async_playwright_response(
298
+ page_info.page, first_response, final_response[0], params.selector_config
299
+ )
300
+
301
+ await page_info.page.close()
302
+ self.page_pool.pages.remove(page_info)
303
+ return response
304
+
305
+ except Exception as e: # pragma: no cover
306
+ page_info.mark_error()
307
+ await page_info.page.close()
308
+ self.page_pool.pages.remove(page_info)
309
+
310
+ if attempt < self._config.retries - 1 and self._is_retriable(e):
311
+ log.warning(f"Attempt {attempt + 1} failed: {e}. Retrying in {self._config.retry_delay}s...")
312
+ await asyncio_sleep(self._config.retry_delay)
313
+ else:
314
+ raise
315
+
316
+ # For type checking purposes only
317
+ raise AssertionError("Unreachable: retry loop must return or raise") # pragma: no cover
scrapling/engines/_browsers/_stealth.py CHANGED
@@ -1,5 +1,7 @@
1
  from random import randint
2
  from re import compile as re_compile
 
 
3
 
4
  from playwright.sync_api import (
5
  Locator,
@@ -202,57 +204,66 @@ class StealthySession(SyncSession, StealthySessionMixin):
202
  else None
203
  )
204
 
205
- page_info = self._get_page(params.timeout, params.extra_headers, params.disable_resources)
206
- final_response = [None]
207
- handle_response = self._create_response_handler(page_info, final_response)
 
208
 
209
- try: # pragma: no cover
210
- # Navigate to URL and wait for a specified state
211
- page_info.page.on("response", handle_response)
212
- first_response = page_info.page.goto(url, referer=referer)
213
- self._wait_for_page_stability(page_info.page, params.load_dom, params.network_idle)
214
-
215
- if not first_response:
216
- raise RuntimeError(f"Failed to get response for {url}")
217
-
218
- if params.solve_cloudflare:
219
- self._cloudflare_solver(page_info.page)
220
- # Make sure the page is fully loaded after the captcha
221
  self._wait_for_page_stability(page_info.page, params.load_dom, params.network_idle)
222
 
223
- if params.page_action:
224
- try:
225
- _ = params.page_action(page_info.page)
226
- except Exception as e: # pragma: no cover
227
- log.error(f"Error executing page_action: {e}")
228
-
229
- if params.wait_selector:
230
- try:
231
- waiter: Locator = page_info.page.locator(params.wait_selector)
232
- waiter.first.wait_for(state=params.wait_selector_state)
233
- # Wait again after waiting for the selector, helpful with protections like Cloudflare
234
  self._wait_for_page_stability(page_info.page, params.load_dom, params.network_idle)
235
- except Exception as e: # pragma: no cover
236
- log.error(f"Error waiting for selector {params.wait_selector}: {e}")
237
 
238
- page_info.page.wait_for_timeout(params.wait)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
239
 
240
- # Create response object
241
- response = ResponseFactory.from_playwright_response(
242
- page_info.page, first_response, final_response[0], params.selector_config
243
- )
244
 
245
- # Close the page to free up resources
246
- page_info.page.close()
247
- self.page_pool.pages.remove(page_info)
248
 
249
- return response
 
 
 
250
 
251
- except Exception as e:
252
- page_info.mark_error()
253
- page_info.page.close()
254
- self.page_pool.pages.remove(page_info)
255
- raise e
 
 
 
256
 
257
 
258
  class AsyncStealthySession(AsyncSession, StealthySessionMixin):
@@ -422,53 +433,62 @@ class AsyncStealthySession(AsyncSession, StealthySessionMixin):
422
  else None
423
  )
424
 
425
- page_info = await self._get_page(params.timeout, params.extra_headers, params.disable_resources)
426
- final_response = [None]
427
- handle_response = self._create_response_handler(page_info, final_response)
428
-
429
- try:
430
- # Navigate to URL and wait for a specified state
431
- page_info.page.on("response", handle_response)
432
- first_response = await page_info.page.goto(url, referer=referer)
433
- await self._wait_for_page_stability(page_info.page, params.load_dom, params.network_idle)
434
-
435
- if not first_response:
436
- raise RuntimeError(f"Failed to get response for {url}")
437
 
438
- if params.solve_cloudflare:
439
- await self._cloudflare_solver(page_info.page)
440
- # Make sure the page is fully loaded after the captcha
 
441
  await self._wait_for_page_stability(page_info.page, params.load_dom, params.network_idle)
442
 
443
- if params.page_action:
444
- try:
445
- _ = await params.page_action(page_info.page)
446
- except Exception as e:
447
- log.error(f"Error executing page_action: {e}")
448
-
449
- if params.wait_selector:
450
- try:
451
- waiter: AsyncLocator = page_info.page.locator(params.wait_selector)
452
- await waiter.first.wait_for(state=params.wait_selector_state)
453
- # Wait again after waiting for the selector, helpful with protections like Cloudflare
454
  await self._wait_for_page_stability(page_info.page, params.load_dom, params.network_idle)
455
- except Exception as e:
456
- log.error(f"Error waiting for selector {params.wait_selector}: {e}")
457
-
458
- await page_info.page.wait_for_timeout(params.wait)
459
-
460
- # Create response object
461
- response = await ResponseFactory.from_async_playwright_response(
462
- page_info.page, first_response, final_response[0], params.selector_config
463
- )
464
-
465
- # Close the page to free up resources
466
- await page_info.page.close()
467
- self.page_pool.pages.remove(page_info)
468
- return response
469
-
470
- except Exception as e: # pragma: no cover
471
- page_info.mark_error()
472
- await page_info.page.close()
473
- self.page_pool.pages.remove(page_info)
474
- raise e
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
  from random import randint
2
  from re import compile as re_compile
3
+ from time import sleep as time_sleep
4
+ from asyncio import sleep as asyncio_sleep
5
 
6
  from playwright.sync_api import (
7
  Locator,
 
204
  else None
205
  )
206
 
207
+ for attempt in range(self._config.retries):
208
+ page_info = self._get_page(params.timeout, params.extra_headers, params.disable_resources)
209
+ final_response = [None]
210
+ handle_response = self._create_response_handler(page_info, final_response)
211
 
212
+ try: # pragma: no cover
213
+ # Navigate to URL and wait for a specified state
214
+ page_info.page.on("response", handle_response)
215
+ first_response = page_info.page.goto(url, referer=referer)
 
 
 
 
 
 
 
 
216
  self._wait_for_page_stability(page_info.page, params.load_dom, params.network_idle)
217
 
218
+ if not first_response:
219
+ raise RuntimeError(f"Failed to get response for {url}")
220
+
221
+ if params.solve_cloudflare:
222
+ self._cloudflare_solver(page_info.page)
223
+ # Make sure the page is fully loaded after the captcha
 
 
 
 
 
224
  self._wait_for_page_stability(page_info.page, params.load_dom, params.network_idle)
 
 
225
 
226
+ if params.page_action:
227
+ try:
228
+ _ = params.page_action(page_info.page)
229
+ except Exception as e: # pragma: no cover
230
+ log.error(f"Error executing page_action: {e}")
231
+
232
+ if params.wait_selector:
233
+ try:
234
+ waiter: Locator = page_info.page.locator(params.wait_selector)
235
+ waiter.first.wait_for(state=params.wait_selector_state)
236
+ # Wait again after waiting for the selector, helpful with protections like Cloudflare
237
+ self._wait_for_page_stability(page_info.page, params.load_dom, params.network_idle)
238
+ except Exception as e: # pragma: no cover
239
+ log.error(f"Error waiting for selector {params.wait_selector}: {e}")
240
+
241
+ page_info.page.wait_for_timeout(params.wait)
242
+
243
+ # Create response object
244
+ response = ResponseFactory.from_playwright_response(
245
+ page_info.page, first_response, final_response[0], params.selector_config
246
+ )
247
 
248
+ # Close the page to free up resources
249
+ page_info.page.close()
250
+ self.page_pool.pages.remove(page_info)
 
251
 
252
+ return response
 
 
253
 
254
+ except Exception as e:
255
+ page_info.mark_error()
256
+ page_info.page.close()
257
+ self.page_pool.pages.remove(page_info)
258
 
259
+ if attempt < self._config.retries - 1 and self._is_retriable(e):
260
+ log.warning(f"Attempt {attempt + 1} failed: {e}. Retrying in {self._config.retry_delay}s...")
261
+ time_sleep(self._config.retry_delay)
262
+ else:
263
+ raise
264
+
265
+ # For type checking purposes only
266
+ raise AssertionError("Unreachable: retry loop must return or raise") # pragma: no cover
267
 
268
 
269
  class AsyncStealthySession(AsyncSession, StealthySessionMixin):
 
433
  else None
434
  )
435
 
436
+ for attempt in range(self._config.retries):
437
+ page_info = await self._get_page(params.timeout, params.extra_headers, params.disable_resources)
438
+ final_response = [None]
439
+ handle_response = self._create_response_handler(page_info, final_response)
 
 
 
 
 
 
 
 
440
 
441
+ try:
442
+ # Navigate to URL and wait for a specified state
443
+ page_info.page.on("response", handle_response)
444
+ first_response = await page_info.page.goto(url, referer=referer)
445
  await self._wait_for_page_stability(page_info.page, params.load_dom, params.network_idle)
446
 
447
+ if not first_response:
448
+ raise RuntimeError(f"Failed to get response for {url}")
449
+
450
+ if params.solve_cloudflare:
451
+ await self._cloudflare_solver(page_info.page)
452
+ # Make sure the page is fully loaded after the captcha
 
 
 
 
 
453
  await self._wait_for_page_stability(page_info.page, params.load_dom, params.network_idle)
454
+
455
+ if params.page_action:
456
+ try:
457
+ _ = await params.page_action(page_info.page)
458
+ except Exception as e:
459
+ log.error(f"Error executing page_action: {e}")
460
+
461
+ if params.wait_selector:
462
+ try:
463
+ waiter: AsyncLocator = page_info.page.locator(params.wait_selector)
464
+ await waiter.first.wait_for(state=params.wait_selector_state)
465
+ # Wait again after waiting for the selector, helpful with protections like Cloudflare
466
+ await self._wait_for_page_stability(page_info.page, params.load_dom, params.network_idle)
467
+ except Exception as e:
468
+ log.error(f"Error waiting for selector {params.wait_selector}: {e}")
469
+
470
+ await page_info.page.wait_for_timeout(params.wait)
471
+
472
+ # Create response object
473
+ response = await ResponseFactory.from_async_playwright_response(
474
+ page_info.page, first_response, final_response[0], params.selector_config
475
+ )
476
+
477
+ # Close the page to free up resources
478
+ await page_info.page.close()
479
+ self.page_pool.pages.remove(page_info)
480
+ return response
481
+
482
+ except Exception as e: # pragma: no cover
483
+ page_info.mark_error()
484
+ await page_info.page.close()
485
+ self.page_pool.pages.remove(page_info)
486
+
487
+ if attempt < self._config.retries - 1 and self._is_retriable(e):
488
+ log.warning(f"Attempt {attempt + 1} failed: {e}. Retrying in {self._config.retry_delay}s...")
489
+ await asyncio_sleep(self._config.retry_delay)
490
+ else:
491
+ raise
492
+
493
+ # For type checking purposes only
494
+ raise AssertionError("Unreachable: retry loop must return or raise") # pragma: no cover
scrapling/engines/_browsers/_types.py CHANGED
@@ -79,6 +79,8 @@ if TYPE_CHECKING: # pragma: no cover
79
  cdp_url: Optional[str]
80
  useragent: Optional[str]
81
  extra_flags: Optional[List[str]]
 
 
82
 
83
  class PlaywrightFetchParams(TypedDict, total=False):
84
  load_dom: bool
 
79
  cdp_url: Optional[str]
80
  useragent: Optional[str]
81
  extra_flags: Optional[List[str]]
82
+ retries: int
83
+ retry_delay: int | float
84
 
85
  class PlaywrightFetchParams(TypedDict, total=False):
86
  load_dom: bool
scrapling/engines/_browsers/_validators.py CHANGED
@@ -50,6 +50,7 @@ def _is_invalid_cdp_url(cdp_url: str) -> bool | str:
50
 
51
  # Type aliases for cleaner annotations
52
  PagesCount = Annotated[int, Meta(ge=1, le=50)]
 
53
  Seconds = Annotated[int, float, Meta(ge=0)]
54
 
55
 
@@ -80,6 +81,8 @@ class PlaywrightConfig(Struct, kw_only=True, frozen=False, weakref=True):
80
  cdp_url: Optional[str] = None
81
  useragent: Optional[str] = None
82
  extra_flags: Optional[List[str]] = None
 
 
83
 
84
  def __post_init__(self): # pragma: no cover
85
  """Custom validation after msgspec validation"""
 
50
 
51
  # Type aliases for cleaner annotations
52
  PagesCount = Annotated[int, Meta(ge=1, le=50)]
53
+ RetriesCount = Annotated[int, Meta(ge=1, le=10)]
54
  Seconds = Annotated[int, float, Meta(ge=0)]
55
 
56
 
 
81
  cdp_url: Optional[str] = None
82
  useragent: Optional[str] = None
83
  extra_flags: Optional[List[str]] = None
84
+ retries: RetriesCount = 3
85
+ retry_delay: Seconds = 1
86
 
87
  def __post_init__(self): # pragma: no cover
88
  """Custom validation after msgspec validation"""