Karim shoair committed on
Commit
c181b7d
·
1 Parent(s): 967fb23

refactor/feat(browser fetchers): make it possible to have a configuration per page in sessions

Browse files
scrapling/engines/_browsers/_base.py CHANGED
@@ -22,6 +22,7 @@ from ._config_tools import _compiled_stealth_scripts
22
  from ._validators import validate, PlaywrightConfig, CamoufoxConfig
23
  from ._config_tools import _launch_kwargs, _context_kwargs
24
  from scrapling.core._types import (
 
25
  Dict,
26
  Optional,
27
  )
@@ -38,7 +39,12 @@ class SyncSession:
38
  self.context: Optional[BrowserContext] = None
39
  self._closed = False
40
 
41
- def _get_page(self) -> PageInfo: # pragma: no cover
 
 
 
 
 
42
  """Get a new page to use"""
43
 
44
  # Close all finished pages to ensure clean state
@@ -59,13 +65,12 @@ class SyncSession:
59
  )
60
 
61
  page = self.context.new_page()
62
- timeout = getattr(self, "timeout", 30000)
63
  page.set_default_navigation_timeout(timeout)
64
  page.set_default_timeout(timeout)
65
- if getattr(self, "extra_headers", False):
66
- page.set_extra_http_headers(getattr(self, "extra_headers"))
67
 
68
- if getattr(self, "disable_resources", False):
69
  page.route("**/*", intercept_route)
70
 
71
  if getattr(self, "stealth", False):
@@ -74,6 +79,13 @@ class SyncSession:
74
 
75
  return self.page_pool.add_page(page)
76
 
 
 
 
 
 
 
 
77
  def get_pool_stats(self) -> Dict[str, int]:
78
  """Get statistics about the current page pool"""
79
  return {
@@ -90,7 +102,12 @@ class AsyncSession(SyncSession):
90
  self.context: Optional[AsyncBrowserContext] = None
91
  self._lock = Lock()
92
 
93
- async def _get_page(self) -> PageInfo: # pragma: no cover
 
 
 
 
 
94
  """Get a new page to use"""
95
  async with self._lock:
96
  # Close all finished pages to ensure clean state
@@ -111,13 +128,12 @@ class AsyncSession(SyncSession):
111
  )
112
 
113
  page = await self.context.new_page()
114
- timeout = getattr(self, "timeout", 30000)
115
  page.set_default_navigation_timeout(timeout)
116
  page.set_default_timeout(timeout)
117
- if getattr(self, "extra_headers", False):
118
- await page.set_extra_http_headers(getattr(self, "extra_headers"))
119
 
120
- if getattr(self, "disable_resources", False):
121
  await page.route("**/*", async_intercept_route)
122
 
123
  if getattr(self, "stealth", False):
@@ -334,7 +350,6 @@ class StealthySessionMixin:
334
  self.geoip = config.geoip
335
  self.selector_config = config.selector_config
336
  self.additional_args = config.additional_args
337
- self.selector_config = config.selector_config
338
  self.page_action = config.page_action
339
  self._headers_keys = (
340
  set(map(str.lower, self.extra_headers.keys()))
 
22
  from ._validators import validate, PlaywrightConfig, CamoufoxConfig
23
  from ._config_tools import _launch_kwargs, _context_kwargs
24
  from scrapling.core._types import (
25
+ Any,
26
  Dict,
27
  Optional,
28
  )
 
39
  self.context: Optional[BrowserContext] = None
40
  self._closed = False
41
 
42
+ def _get_page(
43
+ self,
44
+ timeout: int | float,
45
+ extra_headers: Optional[Dict[str, str]],
46
+ disable_resources: bool,
47
+ ) -> PageInfo: # pragma: no cover
48
  """Get a new page to use"""
49
 
50
  # Close all finished pages to ensure clean state
 
65
  )
66
 
67
  page = self.context.new_page()
 
68
  page.set_default_navigation_timeout(timeout)
69
  page.set_default_timeout(timeout)
70
+ if extra_headers:
71
+ page.set_extra_http_headers(extra_headers)
72
 
73
+ if disable_resources:
74
  page.route("**/*", intercept_route)
75
 
76
  if getattr(self, "stealth", False):
 
79
 
80
  return self.page_pool.add_page(page)
81
 
82
+ @staticmethod
83
+ def _get_with_precedence(
84
+ request_value: Any, session_value: Any, sentinel_value: object
85
+ ) -> Any:
86
+ """Get value with request-level priority over session-level"""
87
+ return request_value if request_value is not sentinel_value else session_value
88
+
89
  def get_pool_stats(self) -> Dict[str, int]:
90
  """Get statistics about the current page pool"""
91
  return {
 
102
  self.context: Optional[AsyncBrowserContext] = None
103
  self._lock = Lock()
104
 
105
+ async def _get_page(
106
+ self,
107
+ timeout: int | float,
108
+ extra_headers: Optional[Dict[str, str]],
109
+ disable_resources: bool,
110
+ ) -> PageInfo: # pragma: no cover
111
  """Get a new page to use"""
112
  async with self._lock:
113
  # Close all finished pages to ensure clean state
 
128
  )
129
 
130
  page = await self.context.new_page()
 
131
  page.set_default_navigation_timeout(timeout)
132
  page.set_default_timeout(timeout)
133
+ if extra_headers:
134
+ await page.set_extra_http_headers(extra_headers)
135
 
136
+ if disable_resources:
137
  await page.route("**/*", async_intercept_route)
138
 
139
  if getattr(self, "stealth", False):
 
350
  self.geoip = config.geoip
351
  self.selector_config = config.selector_config
352
  self.additional_args = config.additional_args
 
353
  self.page_action = config.page_action
354
  self._headers_keys = (
355
  set(map(str.lower, self.extra_headers.keys()))
scrapling/engines/_browsers/_camoufox.py CHANGED
@@ -31,6 +31,7 @@ from scrapling.engines.toolbelt import (
31
  )
32
 
33
  __CF_PATTERN__ = re_compile("challenges.cloudflare.com/cdn-cgi/challenge-platform/.*")
 
34
 
35
 
36
  class StealthySession(StealthySessionMixin, SyncSession):
@@ -247,19 +248,74 @@ class StealthySession(StealthySessionMixin, SyncSession):
247
  log.info("Cloudflare captcha is solved")
248
  return
249
 
250
- def fetch(self, url: str) -> Response:
 
 
 
 
 
 
 
 
 
 
 
 
 
 
251
  """Opens up the browser and do your request based on your chosen options.
252
 
253
  :param url: The Target url.
 
 
 
 
 
 
 
 
 
 
 
 
 
254
  :return: A `Response` object.
255
  """
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
256
  if self._closed: # pragma: no cover
257
  raise RuntimeError("Context manager has been closed")
258
 
259
  final_response = None
260
  referer = (
261
  generate_convincing_referer(url)
262
- if (self.google_search and "referer" not in self._headers_keys)
263
  else None
264
  )
265
 
@@ -271,7 +327,7 @@ class StealthySession(StealthySessionMixin, SyncSession):
271
  ):
272
  final_response = finished_response
273
 
274
- page_info = self._get_page()
275
  page_info.mark_busy(url=url)
276
 
277
  try: # pragma: no cover
@@ -280,41 +336,41 @@ class StealthySession(StealthySessionMixin, SyncSession):
280
  first_response = page_info.page.goto(url, referer=referer)
281
  page_info.page.wait_for_load_state(state="domcontentloaded")
282
 
283
- if self.network_idle:
284
  page_info.page.wait_for_load_state("networkidle")
285
 
286
  if not first_response:
287
  raise RuntimeError(f"Failed to get response for {url}")
288
 
289
- if self.solve_cloudflare:
290
  self._solve_cloudflare(page_info.page)
291
  # Make sure the page is fully loaded after the captcha
292
  page_info.page.wait_for_load_state(state="load")
293
  page_info.page.wait_for_load_state(state="domcontentloaded")
294
- if self.network_idle:
295
  page_info.page.wait_for_load_state("networkidle")
296
 
297
- if self.page_action is not None:
298
  try:
299
- page_info.page = self.page_action(page_info.page)
300
  except Exception as e:
301
  log.error(f"Error executing page_action: {e}")
302
 
303
- if self.wait_selector:
304
  try:
305
- waiter: Locator = page_info.page.locator(self.wait_selector)
306
- waiter.first.wait_for(state=self.wait_selector_state)
307
  # Wait again after waiting for the selector, helpful with protections like Cloudflare
308
  page_info.page.wait_for_load_state(state="load")
309
  page_info.page.wait_for_load_state(state="domcontentloaded")
310
- if self.network_idle:
311
  page_info.page.wait_for_load_state("networkidle")
312
  except Exception as e:
313
- log.error(f"Error waiting for selector {self.wait_selector}: {e}")
314
 
315
- page_info.page.wait_for_timeout(self.wait)
316
  response = ResponseFactory.from_playwright_response(
317
- page_info.page, first_response, final_response, self.selector_config
318
  )
319
 
320
  # Mark the page as finished for next use
@@ -508,19 +564,74 @@ class AsyncStealthySession(StealthySessionMixin, AsyncSession):
508
  log.info("Cloudflare captcha is solved")
509
  return
510
 
511
- async def fetch(self, url: str) -> Response:
 
 
 
 
 
 
 
 
 
 
 
 
 
 
512
  """Opens up the browser and do your request based on your chosen options.
513
 
514
  :param url: The Target url.
 
 
 
 
 
 
 
 
 
 
 
 
 
515
  :return: A `Response` object.
516
  """
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
517
  if self._closed: # pragma: no cover
518
  raise RuntimeError("Context manager has been closed")
519
 
520
  final_response = None
521
  referer = (
522
  generate_convincing_referer(url)
523
- if (self.google_search and "referer" not in self._headers_keys)
524
  else None
525
  )
526
 
@@ -532,7 +643,7 @@ class AsyncStealthySession(StealthySessionMixin, AsyncSession):
532
  ):
533
  final_response = finished_response
534
 
535
- page_info = await self._get_page()
536
  page_info.mark_busy(url=url)
537
 
538
  try:
@@ -541,43 +652,43 @@ class AsyncStealthySession(StealthySessionMixin, AsyncSession):
541
  first_response = await page_info.page.goto(url, referer=referer)
542
  await page_info.page.wait_for_load_state(state="domcontentloaded")
543
 
544
- if self.network_idle:
545
  await page_info.page.wait_for_load_state("networkidle")
546
 
547
  if not first_response:
548
  raise RuntimeError(f"Failed to get response for {url}")
549
 
550
- if self.solve_cloudflare:
551
  await self._solve_cloudflare(page_info.page)
552
  # Make sure the page is fully loaded after the captcha
553
  await page_info.page.wait_for_load_state(state="load")
554
  await page_info.page.wait_for_load_state(state="domcontentloaded")
555
- if self.network_idle:
556
  await page_info.page.wait_for_load_state("networkidle")
557
 
558
- if self.page_action is not None:
559
  try:
560
- page_info.page = await self.page_action(page_info.page)
561
  except Exception as e:
562
  log.error(f"Error executing page_action: {e}")
563
 
564
- if self.wait_selector:
565
  try:
566
- waiter: AsyncLocator = page_info.page.locator(self.wait_selector)
567
- await waiter.first.wait_for(state=self.wait_selector_state)
568
  # Wait again after waiting for the selector, helpful with protections like Cloudflare
569
  await page_info.page.wait_for_load_state(state="load")
570
  await page_info.page.wait_for_load_state(state="domcontentloaded")
571
- if self.network_idle:
572
  await page_info.page.wait_for_load_state("networkidle")
573
  except Exception as e:
574
- log.error(f"Error waiting for selector {self.wait_selector}: {e}")
575
 
576
- await page_info.page.wait_for_timeout(self.wait)
577
 
578
  # Create response object
579
  response = await ResponseFactory.from_async_playwright_response(
580
- page_info.page, first_response, final_response, self.selector_config
581
  )
582
 
583
  # Mark the page as finished for next use
 
31
  )
32
 
33
  __CF_PATTERN__ = re_compile("challenges.cloudflare.com/cdn-cgi/challenge-platform/.*")
34
+ _UNSET = object()
35
 
36
 
37
  class StealthySession(StealthySessionMixin, SyncSession):
 
248
  log.info("Cloudflare captcha is solved")
249
  return
250
 
251
+ def fetch(
252
+ self,
253
+ url: str,
254
+ google_search: bool = _UNSET,
255
+ timeout: int | float = _UNSET,
256
+ wait: int | float = _UNSET,
257
+ page_action: Optional[Callable] = _UNSET,
258
+ extra_headers: Optional[Dict[str, str]] = _UNSET,
259
+ disable_resources: bool = _UNSET,
260
+ wait_selector: Optional[str] = _UNSET,
261
+ wait_selector_state: SelectorWaitStates = _UNSET,
262
+ network_idle: bool = _UNSET,
263
+ solve_cloudflare: bool = _UNSET,
264
+ selector_config: Optional[Dict] = _UNSET,
265
+ ) -> Response:
266
  """Opens up the browser and do your request based on your chosen options.
267
 
268
  :param url: The Target url.
269
+ :param google_search: Enabled by default, Scrapling will set the referer header to be as if this request came from a Google search of this website's domain name.
270
+ :param timeout: The timeout in milliseconds that is used in all operations and waits through the page. The default is 30,000
271
+ :param wait: The time (milliseconds) the fetcher will wait after everything finishes before closing the page and returning the ` Response ` object.
272
+ :param page_action: Added for automation. A function that takes the `page` object, does the automation you need, then returns `page` again.
273
+ :param extra_headers: A dictionary of extra headers to add to the request. _The referer set by the `google_search` argument takes priority over the referer set here if used together._
274
+ :param disable_resources: Drop requests of unnecessary resources for a speed boost. It depends, but it made requests ~25% faster in my tests for some websites.
275
+ Requests dropped are of type `font`, `image`, `media`, `beacon`, `object`, `imageset`, `texttrack`, `websocket`, `csp_report`, and `stylesheet`.
276
+ This can help save your proxy usage but be careful with this option as it makes some websites never finish loading.
277
+ :param wait_selector: Wait for a specific CSS selector to be in a specific state.
278
+ :param wait_selector_state: The state to wait for the selector given with `wait_selector`. The default state is `attached`.
279
+ :param network_idle: Wait for the page until there are no network connections for at least 500 ms.
280
+ :param solve_cloudflare: Solves all 3 types of the Cloudflare's Turnstile wait page before returning the response to you.
281
+ :param selector_config: The arguments that will be passed in the end while creating the final Selector's class.
282
  :return: A `Response` object.
283
  """
284
+ google_search = self._get_with_precedence(
285
+ google_search, self.google_search, _UNSET
286
+ )
287
+ timeout = self._get_with_precedence(timeout, self.timeout, _UNSET)
288
+ wait = self._get_with_precedence(wait, self.wait, _UNSET)
289
+ page_action = self._get_with_precedence(page_action, self.page_action, _UNSET)
290
+ extra_headers = self._get_with_precedence(
291
+ extra_headers, self.extra_headers, _UNSET
292
+ )
293
+ disable_resources = self._get_with_precedence(
294
+ disable_resources, self.disable_resources, _UNSET
295
+ )
296
+ wait_selector = self._get_with_precedence(
297
+ wait_selector, self.wait_selector, _UNSET
298
+ )
299
+ wait_selector_state = self._get_with_precedence(
300
+ wait_selector_state, self.wait_selector_state, _UNSET
301
+ )
302
+ network_idle = self._get_with_precedence(
303
+ network_idle, self.network_idle, _UNSET
304
+ )
305
+ solve_cloudflare = self._get_with_precedence(
306
+ solve_cloudflare, self.solve_cloudflare, _UNSET
307
+ )
308
+ selector_config = self._get_with_precedence(
309
+ selector_config, self.selector_config, _UNSET
310
+ )
311
+
312
  if self._closed: # pragma: no cover
313
  raise RuntimeError("Context manager has been closed")
314
 
315
  final_response = None
316
  referer = (
317
  generate_convincing_referer(url)
318
+ if (google_search and "referer" not in self._headers_keys)
319
  else None
320
  )
321
 
 
327
  ):
328
  final_response = finished_response
329
 
330
+ page_info = self._get_page(timeout, extra_headers, disable_resources)
331
  page_info.mark_busy(url=url)
332
 
333
  try: # pragma: no cover
 
336
  first_response = page_info.page.goto(url, referer=referer)
337
  page_info.page.wait_for_load_state(state="domcontentloaded")
338
 
339
+ if network_idle:
340
  page_info.page.wait_for_load_state("networkidle")
341
 
342
  if not first_response:
343
  raise RuntimeError(f"Failed to get response for {url}")
344
 
345
+ if solve_cloudflare:
346
  self._solve_cloudflare(page_info.page)
347
  # Make sure the page is fully loaded after the captcha
348
  page_info.page.wait_for_load_state(state="load")
349
  page_info.page.wait_for_load_state(state="domcontentloaded")
350
+ if network_idle:
351
  page_info.page.wait_for_load_state("networkidle")
352
 
353
+ if page_action is not None:
354
  try:
355
+ _ = page_action(page_info.page)
356
  except Exception as e:
357
  log.error(f"Error executing page_action: {e}")
358
 
359
+ if wait_selector:
360
  try:
361
+ waiter: Locator = page_info.page.locator(wait_selector)
362
+ waiter.first.wait_for(state=wait_selector_state)
363
  # Wait again after waiting for the selector, helpful with protections like Cloudflare
364
  page_info.page.wait_for_load_state(state="load")
365
  page_info.page.wait_for_load_state(state="domcontentloaded")
366
+ if network_idle:
367
  page_info.page.wait_for_load_state("networkidle")
368
  except Exception as e:
369
+ log.error(f"Error waiting for selector {wait_selector}: {e}")
370
 
371
+ page_info.page.wait_for_timeout(wait)
372
  response = ResponseFactory.from_playwright_response(
373
+ page_info.page, first_response, final_response, selector_config
374
  )
375
 
376
  # Mark the page as finished for next use
 
564
  log.info("Cloudflare captcha is solved")
565
  return
566
 
567
+ async def fetch(
568
+ self,
569
+ url: str,
570
+ google_search: bool = _UNSET,
571
+ timeout: int | float = _UNSET,
572
+ wait: int | float = _UNSET,
573
+ page_action: Optional[Callable] = _UNSET,
574
+ extra_headers: Optional[Dict[str, str]] = _UNSET,
575
+ disable_resources: bool = _UNSET,
576
+ wait_selector: Optional[str] = _UNSET,
577
+ wait_selector_state: SelectorWaitStates = _UNSET,
578
+ network_idle: bool = _UNSET,
579
+ solve_cloudflare: bool = _UNSET,
580
+ selector_config: Optional[Dict] = _UNSET,
581
+ ) -> Response:
582
  """Opens up the browser and do your request based on your chosen options.
583
 
584
  :param url: The Target url.
585
+ :param google_search: Enabled by default, Scrapling will set the referer header to be as if this request came from a Google search of this website's domain name.
586
+ :param timeout: The timeout in milliseconds that is used in all operations and waits through the page. The default is 30,000
587
+ :param wait: The time (milliseconds) the fetcher will wait after everything finishes before closing the page and returning the ` Response ` object.
588
+ :param page_action: Added for automation. A function that takes the `page` object, does the automation you need, then returns `page` again.
589
+ :param extra_headers: A dictionary of extra headers to add to the request. _The referer set by the `google_search` argument takes priority over the referer set here if used together._
590
+ :param disable_resources: Drop requests of unnecessary resources for a speed boost. It depends, but it made requests ~25% faster in my tests for some websites.
591
+ Requests dropped are of type `font`, `image`, `media`, `beacon`, `object`, `imageset`, `texttrack`, `websocket`, `csp_report`, and `stylesheet`.
592
+ This can help save your proxy usage but be careful with this option as it makes some websites never finish loading.
593
+ :param wait_selector: Wait for a specific CSS selector to be in a specific state.
594
+ :param wait_selector_state: The state to wait for the selector given with `wait_selector`. The default state is `attached`.
595
+ :param network_idle: Wait for the page until there are no network connections for at least 500 ms.
596
+ :param solve_cloudflare: Solves all 3 types of the Cloudflare's Turnstile wait page before returning the response to you.
597
+ :param selector_config: The arguments that will be passed in the end while creating the final Selector's class.
598
  :return: A `Response` object.
599
  """
600
+ google_search = self._get_with_precedence(
601
+ google_search, self.google_search, _UNSET
602
+ )
603
+ timeout = self._get_with_precedence(timeout, self.timeout, _UNSET)
604
+ wait = self._get_with_precedence(wait, self.wait, _UNSET)
605
+ page_action = self._get_with_precedence(page_action, self.page_action, _UNSET)
606
+ extra_headers = self._get_with_precedence(
607
+ extra_headers, self.extra_headers, _UNSET
608
+ )
609
+ disable_resources = self._get_with_precedence(
610
+ disable_resources, self.disable_resources, _UNSET
611
+ )
612
+ wait_selector = self._get_with_precedence(
613
+ wait_selector, self.wait_selector, _UNSET
614
+ )
615
+ wait_selector_state = self._get_with_precedence(
616
+ wait_selector_state, self.wait_selector_state, _UNSET
617
+ )
618
+ network_idle = self._get_with_precedence(
619
+ network_idle, self.network_idle, _UNSET
620
+ )
621
+ solve_cloudflare = self._get_with_precedence(
622
+ solve_cloudflare, self.solve_cloudflare, _UNSET
623
+ )
624
+ selector_config = self._get_with_precedence(
625
+ selector_config, self.selector_config, _UNSET
626
+ )
627
+
628
  if self._closed: # pragma: no cover
629
  raise RuntimeError("Context manager has been closed")
630
 
631
  final_response = None
632
  referer = (
633
  generate_convincing_referer(url)
634
+ if (google_search and "referer" not in self._headers_keys)
635
  else None
636
  )
637
 
 
643
  ):
644
  final_response = finished_response
645
 
646
+ page_info = await self._get_page(timeout, extra_headers, disable_resources)
647
  page_info.mark_busy(url=url)
648
 
649
  try:
 
652
  first_response = await page_info.page.goto(url, referer=referer)
653
  await page_info.page.wait_for_load_state(state="domcontentloaded")
654
 
655
+ if network_idle:
656
  await page_info.page.wait_for_load_state("networkidle")
657
 
658
  if not first_response:
659
  raise RuntimeError(f"Failed to get response for {url}")
660
 
661
+ if solve_cloudflare:
662
  await self._solve_cloudflare(page_info.page)
663
  # Make sure the page is fully loaded after the captcha
664
  await page_info.page.wait_for_load_state(state="load")
665
  await page_info.page.wait_for_load_state(state="domcontentloaded")
666
+ if network_idle:
667
  await page_info.page.wait_for_load_state("networkidle")
668
 
669
+ if page_action is not None:
670
  try:
671
+ _ = await page_action(page_info.page)
672
  except Exception as e:
673
  log.error(f"Error executing page_action: {e}")
674
 
675
+ if wait_selector:
676
  try:
677
+ waiter: AsyncLocator = page_info.page.locator(wait_selector)
678
+ await waiter.first.wait_for(state=wait_selector_state)
679
  # Wait again after waiting for the selector, helpful with protections like Cloudflare
680
  await page_info.page.wait_for_load_state(state="load")
681
  await page_info.page.wait_for_load_state(state="domcontentloaded")
682
+ if network_idle:
683
  await page_info.page.wait_for_load_state("networkidle")
684
  except Exception as e:
685
+ log.error(f"Error waiting for selector {wait_selector}: {e}")
686
 
687
+ await page_info.page.wait_for_timeout(wait)
688
 
689
  # Create response object
690
  response = await ResponseFactory.from_async_playwright_response(
691
+ page_info.page, first_response, final_response, selector_config
692
  )
693
 
694
  # Mark the page as finished for next use
scrapling/engines/_browsers/_controllers.py CHANGED
@@ -31,6 +31,8 @@ from scrapling.engines.toolbelt import (
31
  generate_convincing_referer,
32
  )
33
 
 
 
34
 
35
  class DynamicSession(DynamicSessionMixin, SyncSession):
36
  """A Browser session manager with page pooling."""
@@ -198,19 +200,66 @@ class DynamicSession(DynamicSessionMixin, SyncSession):
198
  def fetch(
199
  self,
200
  url: str,
 
 
 
 
 
 
 
 
 
 
201
  ) -> Response:
202
  """Opens up the browser and do your request based on your chosen options.
203
 
204
  :param url: The Target url.
 
 
 
 
 
 
 
 
 
 
 
 
205
  :return: A `Response` object.
206
  """
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
207
  if self._closed: # pragma: no cover
208
  raise RuntimeError("Context manager has been closed")
209
 
210
  final_response = None
211
  referer = (
212
  generate_convincing_referer(url)
213
- if (self.google_search and "referer" not in self._headers_keys)
214
  else None
215
  )
216
 
@@ -222,7 +271,7 @@ class DynamicSession(DynamicSessionMixin, SyncSession):
222
  ):
223
  final_response = finished_response
224
 
225
- page_info = self._get_page()
226
  page_info.mark_busy(url=url)
227
 
228
  try: # pragma: no cover
@@ -231,35 +280,35 @@ class DynamicSession(DynamicSessionMixin, SyncSession):
231
  first_response = page_info.page.goto(url, referer=referer)
232
  page_info.page.wait_for_load_state(state="domcontentloaded")
233
 
234
- if self.network_idle:
235
  page_info.page.wait_for_load_state("networkidle")
236
 
237
  if not first_response:
238
  raise RuntimeError(f"Failed to get response for {url}")
239
 
240
- if self.page_action is not None:
241
  try:
242
- page_info.page = self.page_action(page_info.page)
243
  except Exception as e: # pragma: no cover
244
  log.error(f"Error executing page_action: {e}")
245
 
246
- if self.wait_selector:
247
  try:
248
- waiter: Locator = page_info.page.locator(self.wait_selector)
249
- waiter.first.wait_for(state=self.wait_selector_state)
250
  # Wait again after waiting for the selector, helpful with protections like Cloudflare
251
  page_info.page.wait_for_load_state(state="load")
252
  page_info.page.wait_for_load_state(state="domcontentloaded")
253
- if self.network_idle:
254
  page_info.page.wait_for_load_state("networkidle")
255
  except Exception as e: # pragma: no cover
256
- log.error(f"Error waiting for selector {self.wait_selector}: {e}")
257
 
258
- page_info.page.wait_for_timeout(self.wait)
259
 
260
  # Create response object
261
  response = ResponseFactory.from_playwright_response(
262
- page_info.page, first_response, final_response, self.selector_config
263
  )
264
 
265
  # Mark the page as finished for next use
@@ -409,19 +458,69 @@ class AsyncDynamicSession(DynamicSessionMixin, AsyncSession):
409
 
410
  self._closed = True
411
 
412
- async def fetch(self, url: str) -> Response:
 
 
 
 
 
 
 
 
 
 
 
 
 
413
  """Opens up the browser and do your request based on your chosen options.
414
 
415
  :param url: The Target url.
 
 
 
 
 
 
 
 
 
 
 
 
416
  :return: A `Response` object.
417
  """
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
418
  if self._closed: # pragma: no cover
419
  raise RuntimeError("Context manager has been closed")
420
 
421
  final_response = None
422
  referer = (
423
  generate_convincing_referer(url)
424
- if (self.google_search and "referer" not in self._headers_keys)
425
  else None
426
  )
427
 
@@ -433,7 +532,7 @@ class AsyncDynamicSession(DynamicSessionMixin, AsyncSession):
433
  ):
434
  final_response = finished_response
435
 
436
- page_info = await self._get_page()
437
  page_info.mark_busy(url=url)
438
 
439
  try:
@@ -442,35 +541,35 @@ class AsyncDynamicSession(DynamicSessionMixin, AsyncSession):
442
  first_response = await page_info.page.goto(url, referer=referer)
443
  await page_info.page.wait_for_load_state(state="domcontentloaded")
444
 
445
- if self.network_idle:
446
  await page_info.page.wait_for_load_state("networkidle")
447
 
448
  if not first_response:
449
  raise RuntimeError(f"Failed to get response for {url}")
450
 
451
- if self.page_action is not None:
452
  try:
453
- page_info.page = await self.page_action(page_info.page)
454
  except Exception as e:
455
  log.error(f"Error executing page_action: {e}")
456
 
457
- if self.wait_selector:
458
  try:
459
- waiter: AsyncLocator = page_info.page.locator(self.wait_selector)
460
- await waiter.first.wait_for(state=self.wait_selector_state)
461
  # Wait again after waiting for the selector, helpful with protections like Cloudflare
462
  await page_info.page.wait_for_load_state(state="load")
463
  await page_info.page.wait_for_load_state(state="domcontentloaded")
464
- if self.network_idle:
465
  await page_info.page.wait_for_load_state("networkidle")
466
  except Exception as e:
467
- log.error(f"Error waiting for selector {self.wait_selector}: {e}")
468
 
469
- await page_info.page.wait_for_timeout(self.wait)
470
 
471
  # Create response object
472
  response = await ResponseFactory.from_async_playwright_response(
473
- page_info.page, first_response, final_response, self.selector_config
474
  )
475
 
476
  # Mark the page as finished for next use
 
31
  generate_convincing_referer,
32
  )
33
 
34
+ _UNSET = object()
35
+
36
 
37
  class DynamicSession(DynamicSessionMixin, SyncSession):
38
  """A Browser session manager with page pooling."""
 
200
  def fetch(
201
  self,
202
  url: str,
203
+ google_search: bool = _UNSET,
204
+ timeout: int | float = _UNSET,
205
+ wait: int | float = _UNSET,
206
+ page_action: Optional[Callable] = _UNSET,
207
+ extra_headers: Optional[Dict[str, str]] = _UNSET,
208
+ disable_resources: bool = _UNSET,
209
+ wait_selector: Optional[str] = _UNSET,
210
+ wait_selector_state: SelectorWaitStates = _UNSET,
211
+ network_idle: bool = _UNSET,
212
+ selector_config: Optional[Dict] = _UNSET,
213
  ) -> Response:
214
  """Opens up the browser and do your request based on your chosen options.
215
 
216
  :param url: The Target url.
217
+ :param google_search: Enabled by default, Scrapling will set the referer header to be as if this request came from a Google search of this website's domain name.
218
+ :param timeout: The timeout in milliseconds that is used in all operations and waits through the page. The default is 30,000
219
+ :param wait: The time (milliseconds) the fetcher will wait after everything finishes before closing the page and returning the ` Response ` object.
220
+ :param page_action: Added for automation. A function that takes the `page` object, does the automation you need, then returns `page` again.
221
+ :param extra_headers: A dictionary of extra headers to add to the request. _The referer set by the `google_search` argument takes priority over the referer set here if used together._
222
+ :param disable_resources: Drop requests of unnecessary resources for a speed boost. It depends, but it made requests ~25% faster in my tests for some websites.
223
+ Requests dropped are of type `font`, `image`, `media`, `beacon`, `object`, `imageset`, `texttrack`, `websocket`, `csp_report`, and `stylesheet`.
224
+ This can help save your proxy usage but be careful with this option as it makes some websites never finish loading.
225
+ :param wait_selector: Wait for a specific CSS selector to be in a specific state.
226
+ :param wait_selector_state: The state to wait for the selector given with `wait_selector`. The default state is `attached`.
227
+ :param network_idle: Wait for the page until there are no network connections for at least 500 ms.
228
+ :param selector_config: The arguments that will be passed in the end while creating the final Selector's class.
229
  :return: A `Response` object.
230
  """
231
+ google_search = self._get_with_precedence(
232
+ google_search, self.google_search, _UNSET
233
+ )
234
+ timeout = self._get_with_precedence(timeout, self.timeout, _UNSET)
235
+ wait = self._get_with_precedence(wait, self.wait, _UNSET)
236
+ page_action = self._get_with_precedence(page_action, self.page_action, _UNSET)
237
+ extra_headers = self._get_with_precedence(
238
+ extra_headers, self.extra_headers, _UNSET
239
+ )
240
+ disable_resources = self._get_with_precedence(
241
+ disable_resources, self.disable_resources, _UNSET
242
+ )
243
+ wait_selector = self._get_with_precedence(
244
+ wait_selector, self.wait_selector, _UNSET
245
+ )
246
+ wait_selector_state = self._get_with_precedence(
247
+ wait_selector_state, self.wait_selector_state, _UNSET
248
+ )
249
+ network_idle = self._get_with_precedence(
250
+ network_idle, self.network_idle, _UNSET
251
+ )
252
+ selector_config = self._get_with_precedence(
253
+ selector_config, self.selector_config, _UNSET
254
+ )
255
+
256
  if self._closed: # pragma: no cover
257
  raise RuntimeError("Context manager has been closed")
258
 
259
  final_response = None
260
  referer = (
261
  generate_convincing_referer(url)
262
+ if (google_search and "referer" not in self._headers_keys)
263
  else None
264
  )
265
 
 
271
  ):
272
  final_response = finished_response
273
 
274
+ page_info = self._get_page(timeout, extra_headers, disable_resources)
275
  page_info.mark_busy(url=url)
276
 
277
  try: # pragma: no cover
 
280
  first_response = page_info.page.goto(url, referer=referer)
281
  page_info.page.wait_for_load_state(state="domcontentloaded")
282
 
283
+ if network_idle:
284
  page_info.page.wait_for_load_state("networkidle")
285
 
286
  if not first_response:
287
  raise RuntimeError(f"Failed to get response for {url}")
288
 
289
+ if page_action is not None:
290
  try:
291
+ _ = page_action(page_info.page)
292
  except Exception as e: # pragma: no cover
293
  log.error(f"Error executing page_action: {e}")
294
 
295
+ if wait_selector:
296
  try:
297
+ waiter: Locator = page_info.page.locator(wait_selector)
298
+ waiter.first.wait_for(state=wait_selector_state)
299
  # Wait again after waiting for the selector, helpful with protections like Cloudflare
300
  page_info.page.wait_for_load_state(state="load")
301
  page_info.page.wait_for_load_state(state="domcontentloaded")
302
+ if network_idle:
303
  page_info.page.wait_for_load_state("networkidle")
304
  except Exception as e: # pragma: no cover
305
+ log.error(f"Error waiting for selector {wait_selector}: {e}")
306
 
307
+ page_info.page.wait_for_timeout(wait)
308
 
309
  # Create response object
310
  response = ResponseFactory.from_playwright_response(
311
+ page_info.page, first_response, final_response, selector_config
312
  )
313
 
314
  # Mark the page as finished for next use
 
458
 
459
  self._closed = True
460
 
461
+ async def fetch(
462
+ self,
463
+ url: str,
464
+ google_search: bool = _UNSET,
465
+ timeout: int | float = _UNSET,
466
+ wait: int | float = _UNSET,
467
+ page_action: Optional[Callable] = _UNSET,
468
+ extra_headers: Optional[Dict[str, str]] = _UNSET,
469
+ disable_resources: bool = _UNSET,
470
+ wait_selector: Optional[str] = _UNSET,
471
+ wait_selector_state: SelectorWaitStates = _UNSET,
472
+ network_idle: bool = _UNSET,
473
+ selector_config: Optional[Dict] = _UNSET,
474
+ ) -> Response:
475
  """Opens up the browser and do your request based on your chosen options.
476
 
477
  :param url: The Target url.
478
+ :param google_search: Enabled by default, Scrapling will set the referer header to be as if this request came from a Google search of this website's domain name.
479
+ :param timeout: The timeout in milliseconds that is used in all operations and waits through the page. The default is 30,000
480
+ :param wait: The time (milliseconds) the fetcher will wait after everything finishes before closing the page and returning the ` Response ` object.
481
+ :param page_action: Added for automation. A function that takes the `page` object, does the automation you need, then returns `page` again.
482
+ :param extra_headers: A dictionary of extra headers to add to the request. _The referer set by the `google_search` argument takes priority over the referer set here if used together._
483
+ :param disable_resources: Drop requests of unnecessary resources for a speed boost. It depends, but it made requests ~25% faster in my tests for some websites.
484
+ Requests dropped are of type `font`, `image`, `media`, `beacon`, `object`, `imageset`, `texttrack`, `websocket`, `csp_report`, and `stylesheet`.
485
+ This can help save your proxy usage but be careful with this option as it makes some websites never finish loading.
486
+ :param wait_selector: Wait for a specific CSS selector to be in a specific state.
487
+ :param wait_selector_state: The state to wait for the selector given with `wait_selector`. The default state is `attached`.
488
+ :param network_idle: Wait for the page until there are no network connections for at least 500 ms.
489
+ :param selector_config: The arguments that will be passed in the end while creating the final Selector's class.
490
  :return: A `Response` object.
491
  """
492
+ google_search = self._get_with_precedence(
493
+ google_search, self.google_search, _UNSET
494
+ )
495
+ timeout = self._get_with_precedence(timeout, self.timeout, _UNSET)
496
+ wait = self._get_with_precedence(wait, self.wait, _UNSET)
497
+ page_action = self._get_with_precedence(page_action, self.page_action, _UNSET)
498
+ extra_headers = self._get_with_precedence(
499
+ extra_headers, self.extra_headers, _UNSET
500
+ )
501
+ disable_resources = self._get_with_precedence(
502
+ disable_resources, self.disable_resources, _UNSET
503
+ )
504
+ wait_selector = self._get_with_precedence(
505
+ wait_selector, self.wait_selector, _UNSET
506
+ )
507
+ wait_selector_state = self._get_with_precedence(
508
+ wait_selector_state, self.wait_selector_state, _UNSET
509
+ )
510
+ network_idle = self._get_with_precedence(
511
+ network_idle, self.network_idle, _UNSET
512
+ )
513
+ selector_config = self._get_with_precedence(
514
+ selector_config, self.selector_config, _UNSET
515
+ )
516
+
517
  if self._closed: # pragma: no cover
518
  raise RuntimeError("Context manager has been closed")
519
 
520
  final_response = None
521
  referer = (
522
  generate_convincing_referer(url)
523
+ if (google_search and "referer" not in self._headers_keys)
524
  else None
525
  )
526
 
 
532
  ):
533
  final_response = finished_response
534
 
535
+ page_info = await self._get_page(timeout, extra_headers, disable_resources)
536
  page_info.mark_busy(url=url)
537
 
538
  try:
 
541
  first_response = await page_info.page.goto(url, referer=referer)
542
  await page_info.page.wait_for_load_state(state="domcontentloaded")
543
 
544
+ if network_idle:
545
  await page_info.page.wait_for_load_state("networkidle")
546
 
547
  if not first_response:
548
  raise RuntimeError(f"Failed to get response for {url}")
549
 
550
+ if page_action is not None:
551
  try:
552
+ _ = await page_action(page_info.page)
553
  except Exception as e:
554
  log.error(f"Error executing page_action: {e}")
555
 
556
+ if wait_selector:
557
  try:
558
+ waiter: AsyncLocator = page_info.page.locator(wait_selector)
559
+ await waiter.first.wait_for(state=wait_selector_state)
560
  # Wait again after waiting for the selector, helpful with protections like Cloudflare
561
  await page_info.page.wait_for_load_state(state="load")
562
  await page_info.page.wait_for_load_state(state="domcontentloaded")
563
+ if network_idle:
564
  await page_info.page.wait_for_load_state("networkidle")
565
  except Exception as e:
566
+ log.error(f"Error waiting for selector {wait_selector}: {e}")
567
 
568
+ await page_info.page.wait_for_timeout(wait)
569
 
570
  # Create response object
571
  response = await ResponseFactory.from_async_playwright_response(
572
+ page_info.page, first_response, final_response, selector_config
573
  )
574
 
575
  # Mark the page as finished for next use