Karim shoair committed on
Commit
ee2299e
·
1 Parent(s): 123011a

refactor(fetchers)!: Replace Camoufox with patchright and many optimizations

Browse files

- DynamicFetcher became 20% faster
- StealthyFetcher became 99% faster
- Scrapling size decreased
- Code became ~400 lines shorter
- Most importantly, scrapling is more stable and reliable now.
- Less confusing for new users.
- More...

scrapling/cli.py CHANGED
@@ -125,14 +125,9 @@ def install(force): # pragma: no cover
125
  "playwright",
126
  "install-deps",
127
  "chromium",
128
- "firefox",
129
  ],
130
  "Playwright dependencies",
131
  )
132
- __Execute(
133
- [python_executable, "-m", "camoufox", "fetch", "--browserforge"],
134
- "Camoufox browser and databases",
135
- )
136
  # if no errors raised by the above commands, then we add the below file
137
  __PACKAGE_DIR__.joinpath(".scrapling_dependencies_installed").touch()
138
  else:
@@ -611,16 +606,10 @@ def delete(
611
  )
612
  @option("--wait-selector", help="CSS selector to wait for before proceeding")
613
  @option("--locale", default="en-US", help="Browser locale (default: en-US)")
614
- @option("--stealth/--no-stealth", default=False, help="Enable stealth mode (default: False)")
615
- @option(
616
- "--hide-canvas/--show-canvas",
617
- default=False,
618
- help="Add noise to canvas operations (default: False)",
619
- )
620
  @option(
621
- "--disable-webgl/--enable-webgl",
622
  default=False,
623
- help="Disable WebGL support (default: False)",
624
  )
625
  @option("--proxy", help='Proxy URL in format "http://username:password@host:port"')
626
  @option(
@@ -640,9 +629,7 @@ def fetch(
640
  css_selector,
641
  wait_selector,
642
  locale,
643
- stealth,
644
- hide_canvas,
645
- disable_webgl,
646
  proxy,
647
  extra_headers,
648
  ):
@@ -659,9 +646,7 @@ def fetch(
659
  :param css_selector: CSS selector to extract specific content.
660
  :param wait_selector: Wait for a specific CSS selector to be in a specific state.
661
  :param locale: Set the locale for the browser.
662
- :param stealth: Enables stealth mode.
663
- :param hide_canvas: Add random noise to canvas operations to prevent fingerprinting.
664
- :param disable_webgl: Disables WebGL and WebGL 2.0 support entirely.
665
  :param proxy: The proxy to be used with requests.
666
  :param extra_headers: Extra headers to add to the request.
667
  """
@@ -676,9 +661,7 @@ def fetch(
676
  "network_idle": network_idle,
677
  "timeout": timeout,
678
  "locale": locale,
679
- "stealth": stealth,
680
- "hide_canvas": hide_canvas,
681
- "disable_webgl": disable_webgl,
682
  }
683
 
684
  if wait > 0:
@@ -703,11 +686,6 @@ def fetch(
703
  default=True,
704
  help="Run browser in headless mode (default: True)",
705
  )
706
- @option(
707
- "--block-images/--allow-images",
708
- default=False,
709
- help="Block image loading (default: False)",
710
- )
711
  @option(
712
  "--disable-resources/--enable-resources",
713
  default=False,
@@ -718,11 +696,6 @@ def fetch(
718
  default=False,
719
  help="Block WebRTC entirely (default: False)",
720
  )
721
- @option(
722
- "--humanize/--no-humanize",
723
- default=False,
724
- help="Humanize cursor movement (default: False)",
725
- )
726
  @option(
727
  "--solve-cloudflare/--no-solve-cloudflare",
728
  default=False,
@@ -735,9 +708,14 @@ def fetch(
735
  help="Wait for network idle (default: False)",
736
  )
737
  @option(
738
- "--disable-ads/--allow-ads",
739
  default=False,
740
- help="Install uBlock Origin addon (default: False)",
 
 
 
 
 
741
  )
742
  @option(
743
  "--timeout",
@@ -757,11 +735,6 @@ def fetch(
757
  help="CSS selector to extract specific content from the page. It returns all matches.",
758
  )
759
  @option("--wait-selector", help="CSS selector to wait for before proceeding")
760
- @option(
761
- "--geoip/--no-geoip",
762
- default=False,
763
- help="Use IP geolocation for timezone/locale (default: False)",
764
- )
765
  @option("--proxy", help='Proxy URL in format "http://username:password@host:port"')
766
  @option(
767
  "--extra-headers",
@@ -773,19 +746,17 @@ def stealthy_fetch(
773
  url,
774
  output_file,
775
  headless,
776
- block_images,
777
  disable_resources,
778
  block_webrtc,
779
- humanize,
780
  solve_cloudflare,
781
  allow_webgl,
782
  network_idle,
783
- disable_ads,
 
784
  timeout,
785
  wait,
786
  css_selector,
787
  wait_selector,
788
- geoip,
789
  proxy,
790
  extra_headers,
791
  ):
@@ -795,19 +766,17 @@ def stealthy_fetch(
795
  :param url: Target url.
796
  :param output_file: Output file path (.md for Markdown, .html for HTML).
797
  :param headless: Run the browser in headless/hidden, or headful/visible mode.
798
- :param block_images: Prevent the loading of images through Firefox preferences.
799
  :param disable_resources: Drop requests of unnecessary resources for a speed boost.
800
  :param block_webrtc: Blocks WebRTC entirely.
801
- :param humanize: Humanize the cursor movement.
802
  :param solve_cloudflare: Solves all types of the Cloudflare's Turnstile/Interstitial challenges.
803
  :param allow_webgl: Allow WebGL (recommended to keep enabled).
804
  :param network_idle: Wait for the page until there are no network connections for at least 500 ms.
805
- :param disable_ads: Install the uBlock Origin addon on the browser.
 
806
  :param timeout: The timeout in milliseconds that is used in all operations and waits through the page.
807
  :param wait: The time (milliseconds) the fetcher will wait after everything finishes before returning.
808
  :param css_selector: CSS selector to extract specific content.
809
  :param wait_selector: Wait for a specific CSS selector to be in a specific state.
810
- :param geoip: Automatically use IP's longitude, latitude, timezone, country, locale.
811
  :param proxy: The proxy to be used with requests.
812
  :param extra_headers: Extra headers to add to the request.
813
  """
@@ -818,16 +787,14 @@ def stealthy_fetch(
818
  # Build request arguments
819
  kwargs = {
820
  "headless": headless,
821
- "block_images": block_images,
822
  "disable_resources": disable_resources,
823
  "block_webrtc": block_webrtc,
824
- "humanize": humanize,
825
  "solve_cloudflare": solve_cloudflare,
826
  "allow_webgl": allow_webgl,
827
  "network_idle": network_idle,
828
- "disable_ads": disable_ads,
 
829
  "timeout": timeout,
830
- "geoip": geoip,
831
  }
832
 
833
  if wait > 0:
 
125
  "playwright",
126
  "install-deps",
127
  "chromium",
 
128
  ],
129
  "Playwright dependencies",
130
  )
 
 
 
 
131
  # if no errors raised by the above commands, then we add the below file
132
  __PACKAGE_DIR__.joinpath(".scrapling_dependencies_installed").touch()
133
  else:
 
606
  )
607
  @option("--wait-selector", help="CSS selector to wait for before proceeding")
608
  @option("--locale", default="en-US", help="Browser locale (default: en-US)")
 
 
 
 
 
 
609
  @option(
610
+ "--real-chrome/--no-real-chrome",
611
  default=False,
612
+ help="If you have a Chrome browser installed on your device, enable this, and the Fetcher will launch an instance of your browser and use it. (default: False)",
613
  )
614
  @option("--proxy", help='Proxy URL in format "http://username:password@host:port"')
615
  @option(
 
629
  css_selector,
630
  wait_selector,
631
  locale,
632
+ real_chrome,
 
 
633
  proxy,
634
  extra_headers,
635
  ):
 
646
  :param css_selector: CSS selector to extract specific content.
647
  :param wait_selector: Wait for a specific CSS selector to be in a specific state.
648
  :param locale: Set the locale for the browser.
649
+ :param real_chrome: If you have a Chrome browser installed on your device, enable this, and the Fetcher will launch an instance of your browser and use it.
 
 
650
  :param proxy: The proxy to be used with requests.
651
  :param extra_headers: Extra headers to add to the request.
652
  """
 
661
  "network_idle": network_idle,
662
  "timeout": timeout,
663
  "locale": locale,
664
+ "real_chrome": real_chrome,
 
 
665
  }
666
 
667
  if wait > 0:
 
686
  default=True,
687
  help="Run browser in headless mode (default: True)",
688
  )
 
 
 
 
 
689
  @option(
690
  "--disable-resources/--enable-resources",
691
  default=False,
 
696
  default=False,
697
  help="Block WebRTC entirely (default: False)",
698
  )
 
 
 
 
 
699
  @option(
700
  "--solve-cloudflare/--no-solve-cloudflare",
701
  default=False,
 
708
  help="Wait for network idle (default: False)",
709
  )
710
  @option(
711
+ "--real-chrome/--no-real-chrome",
712
  default=False,
713
+ help="If you have a Chrome browser installed on your device, enable this, and the Fetcher will launch an instance of your browser and use it. (default: False)",
714
+ )
715
+ @option(
716
+ "--hide-canvas/--show-canvas",
717
+ default=False,
718
+ help="Add noise to canvas operations (default: False)",
719
  )
720
  @option(
721
  "--timeout",
 
735
  help="CSS selector to extract specific content from the page. It returns all matches.",
736
  )
737
  @option("--wait-selector", help="CSS selector to wait for before proceeding")
 
 
 
 
 
738
  @option("--proxy", help='Proxy URL in format "http://username:password@host:port"')
739
  @option(
740
  "--extra-headers",
 
746
  url,
747
  output_file,
748
  headless,
 
749
  disable_resources,
750
  block_webrtc,
 
751
  solve_cloudflare,
752
  allow_webgl,
753
  network_idle,
754
+ real_chrome,
755
+ hide_canvas,
756
  timeout,
757
  wait,
758
  css_selector,
759
  wait_selector,
 
760
  proxy,
761
  extra_headers,
762
  ):
 
766
  :param url: Target url.
767
  :param output_file: Output file path (.md for Markdown, .html for HTML).
768
  :param headless: Run the browser in headless/hidden, or headful/visible mode.
 
769
  :param disable_resources: Drop requests of unnecessary resources for a speed boost.
770
  :param block_webrtc: Blocks WebRTC entirely.
 
771
  :param solve_cloudflare: Solves all types of the Cloudflare's Turnstile/Interstitial challenges.
772
  :param allow_webgl: Allow WebGL (recommended to keep enabled).
773
  :param network_idle: Wait for the page until there are no network connections for at least 500 ms.
774
+ :param real_chrome: If you have a Chrome browser installed on your device, enable this, and the Fetcher will launch an instance of your browser and use it.
775
+ :param hide_canvas: Add random noise to canvas operations to prevent fingerprinting.
776
  :param timeout: The timeout in milliseconds that is used in all operations and waits through the page.
777
  :param wait: The time (milliseconds) the fetcher will wait after everything finishes before returning.
778
  :param css_selector: CSS selector to extract specific content.
779
  :param wait_selector: Wait for a specific CSS selector to be in a specific state.
 
780
  :param proxy: The proxy to be used with requests.
781
  :param extra_headers: Extra headers to add to the request.
782
  """
 
787
  # Build request arguments
788
  kwargs = {
789
  "headless": headless,
 
790
  "disable_resources": disable_resources,
791
  "block_webrtc": block_webrtc,
 
792
  "solve_cloudflare": solve_cloudflare,
793
  "allow_webgl": allow_webgl,
794
  "network_idle": network_idle,
795
+ "real_chrome": real_chrome,
796
+ "hide_canvas": hide_canvas,
797
  "timeout": timeout,
 
798
  }
799
 
800
  if wait > 0:
scrapling/core/_types.py CHANGED
@@ -57,3 +57,17 @@ except ImportError: # pragma: no cover
57
  from typing_extensions import Self # Backport
58
  except ImportError:
59
  Self = object
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
57
  from typing_extensions import Self # Backport
58
  except ImportError:
59
  Self = object
60
+
61
+
62
+ # Copied from `playwright._impl._api_structures.SetCookieParam`
63
+ class SetCookieParam(TypedDict, total=False):
64
+ name: str
65
+ value: str
66
+ url: Optional[str]
67
+ domain: Optional[str]
68
+ path: Optional[str]
69
+ expires: Optional[float]
70
+ httpOnly: Optional[bool]
71
+ secure: Optional[bool]
72
+ sameSite: Optional[Literal["Lax", "None", "Strict"]]
73
+ partitionKey: Optional[str]
scrapling/core/ai.py CHANGED
@@ -213,13 +213,11 @@ class ScraplingMCPServer:
213
  main_content_only: bool = True,
214
  headless: bool = False,
215
  google_search: bool = True,
216
- hide_canvas: bool = False,
217
- disable_webgl: bool = False,
218
  real_chrome: bool = False,
219
- stealth: bool = False,
220
  wait: int | float = 0,
221
  proxy: Optional[str | Dict[str, str]] = None,
222
- locale: str = "en-US",
 
223
  extra_headers: Optional[Dict[str, str]] = None,
224
  useragent: Optional[str] = None,
225
  cdp_url: Optional[str] = None,
@@ -251,12 +249,11 @@ class ScraplingMCPServer:
251
  :param timeout: The timeout in milliseconds that is used in all operations and waits through the page. The default is 30,000
252
  :param wait: The time (milliseconds) the fetcher will wait after everything finishes before closing the page and returning the ` Response ` object.
253
  :param wait_selector: Wait for a specific CSS selector to be in a specific state.
254
- :param locale: Set the locale for the browser if wanted. The default value is `en-US`.
 
 
255
  :param wait_selector_state: The state to wait for the selector given with `wait_selector`. The default state is `attached`.
256
- :param stealth: Enables stealth mode, check the documentation to see what stealth mode does currently.
257
  :param real_chrome: If you have a Chrome browser installed on your device, enable this, and the Fetcher will launch an instance of your browser and use it.
258
- :param hide_canvas: Add random noise to canvas operations to prevent fingerprinting.
259
- :param disable_webgl: Disables WebGL and WebGL 2.0 support entirely.
260
  :param cdp_url: Instead of launching a new browser instance, connect to this CDP URL to control real browsers through CDP.
261
  :param google_search: Enabled by default, Scrapling will set the referer header to be as if this request came from a Google search of this website's domain name.
262
  :param extra_headers: A dictionary of extra headers to add to the request. _The referer set by the `google_search` argument takes priority over the referer set here if used together._
@@ -269,15 +266,13 @@ class ScraplingMCPServer:
269
  locale=locale,
270
  timeout=timeout,
271
  cookies=cookies,
272
- stealth=stealth,
273
  cdp_url=cdp_url,
274
  headless=headless,
275
  useragent=useragent,
276
- hide_canvas=hide_canvas,
277
  real_chrome=real_chrome,
278
  network_idle=network_idle,
279
  wait_selector=wait_selector,
280
- disable_webgl=disable_webgl,
281
  extra_headers=extra_headers,
282
  google_search=google_search,
283
  disable_resources=disable_resources,
@@ -301,13 +296,11 @@ class ScraplingMCPServer:
301
  main_content_only: bool = True,
302
  headless: bool = False,
303
  google_search: bool = True,
304
- hide_canvas: bool = False,
305
- disable_webgl: bool = False,
306
  real_chrome: bool = False,
307
- stealth: bool = False,
308
  wait: int | float = 0,
309
  proxy: Optional[str | Dict[str, str]] = None,
310
- locale: str = "en-US",
 
311
  extra_headers: Optional[Dict[str, str]] = None,
312
  useragent: Optional[str] = None,
313
  cdp_url: Optional[str] = None,
@@ -339,12 +332,11 @@ class ScraplingMCPServer:
339
  :param timeout: The timeout in milliseconds that is used in all operations and waits through the page. The default is 30,000
340
  :param wait: The time (milliseconds) the fetcher will wait after everything finishes before closing the page and returning the ` Response ` object.
341
  :param wait_selector: Wait for a specific CSS selector to be in a specific state.
342
- :param locale: Set the locale for the browser if wanted. The default value is `en-US`.
 
 
343
  :param wait_selector_state: The state to wait for the selector given with `wait_selector`. The default state is `attached`.
344
- :param stealth: Enables stealth mode, check the documentation to see what stealth mode does currently.
345
  :param real_chrome: If you have a Chrome browser installed on your device, enable this, and the Fetcher will launch an instance of your browser and use it.
346
- :param hide_canvas: Add random noise to canvas operations to prevent fingerprinting.
347
- :param disable_webgl: Disables WebGL and WebGL 2.0 support entirely.
348
  :param cdp_url: Instead of launching a new browser instance, connect to this CDP URL to control real browsers through CDP.
349
  :param google_search: Enabled by default, Scrapling will set the referer header to be as if this request came from a Google search of this website's domain name.
350
  :param extra_headers: A dictionary of extra headers to add to the request. _The referer set by the `google_search` argument takes priority over the referer set here if used together._
@@ -356,17 +348,15 @@ class ScraplingMCPServer:
356
  locale=locale,
357
  timeout=timeout,
358
  cookies=cookies,
359
- stealth=stealth,
360
  cdp_url=cdp_url,
361
  headless=headless,
362
  max_pages=len(urls),
363
  useragent=useragent,
364
- hide_canvas=hide_canvas,
365
  real_chrome=real_chrome,
366
  network_idle=network_idle,
367
  wait_selector=wait_selector,
368
  google_search=google_search,
369
- disable_webgl=disable_webgl,
370
  extra_headers=extra_headers,
371
  disable_resources=disable_resources,
372
  wait_selector_state=wait_selector_state,
@@ -393,29 +383,29 @@ class ScraplingMCPServer:
393
  css_selector: Optional[str] = None,
394
  main_content_only: bool = True,
395
  headless: bool = True, # noqa: F821
396
- block_images: bool = False,
397
- disable_resources: bool = False,
398
- block_webrtc: bool = False,
399
- allow_webgl: bool = True,
400
- network_idle: bool = False,
401
- humanize: bool | float = True,
402
- solve_cloudflare: bool = False,
403
  wait: int | float = 0,
 
 
 
 
 
 
 
404
  timeout: int | float = 30000,
 
405
  wait_selector: Optional[str] = None,
406
- addons: Optional[List[str]] = None,
407
- wait_selector_state: SelectorWaitStates = "attached",
408
  cookies: Optional[List[Dict]] = None,
409
- google_search: bool = True,
410
- extra_headers: Optional[Dict[str, str]] = None,
411
- proxy: Optional[str | Dict[str, str]] = None,
412
- os_randomize: bool = False,
413
- disable_ads: bool = False,
414
- geoip: bool = False,
415
  additional_args: Optional[Dict] = None,
416
  ) -> ResponseModel:
417
- """Use Scrapling's version of the Camoufox browser to fetch a URL and return a structured output of the result.
418
- Note: This is best suitable for high protection levels. It's slower than the other tools.
419
  Note: If the `css_selector` resolves to more than one element, all the elements will be returned.
420
 
421
  :param url: The URL to request.
@@ -426,54 +416,53 @@ class ScraplingMCPServer:
426
  :param css_selector: CSS selector to extract the content from the page. If main_content_only is True, then it will be executed on the main content of the page. Defaults to None.
427
  :param main_content_only: Whether to extract only the main content of the page. Defaults to True. The main content here is the data inside the `<body>` tag.
428
  :param headless: Run the browser in headless/hidden (default), or headful/visible mode.
429
- :param block_images: Prevent the loading of images through Firefox preferences.
430
- This can help save your proxy usage but be careful with this option as it makes some websites never finish loading.
431
  :param disable_resources: Drop requests of unnecessary resources for a speed boost. It depends, but it made requests ~25% faster in my tests for some websites.
432
  Requests dropped are of type `font`, `image`, `media`, `beacon`, `object`, `imageset`, `texttrack`, `websocket`, `csp_report`, and `stylesheet`.
433
  This can help save your proxy usage but be careful with this option as it makes some websites never finish loading.
434
- :param block_webrtc: Blocks WebRTC entirely.
435
  :param cookies: Set cookies for the next request.
436
- :param addons: List of Firefox addons to use. Must be paths to extracted addons.
437
- :param humanize: Humanize the cursor movement. Takes either True or the MAX duration in seconds of the cursor movement. The cursor typically takes up to 1.5 seconds to move across the window.
438
  :param solve_cloudflare: Solves all types of the Cloudflare's Turnstile/Interstitial challenges before returning the response to you.
439
  :param allow_webgl: Enabled by default. Disabling WebGL is not recommended as many WAFs now check if WebGL is enabled.
440
  :param network_idle: Wait for the page until there are no network connections for at least 500 ms.
441
- :param disable_ads: Disabled by default, this installs the `uBlock Origin` addon on the browser if enabled.
442
- :param os_randomize: If enabled, Scrapling will randomize the OS fingerprints used. The default is Scrapling matching the fingerprints with the current OS.
443
  :param wait: The time (milliseconds) the fetcher will wait after everything finishes before closing the page and returning the ` Response ` object.
444
  :param timeout: The timeout in milliseconds that is used in all operations and waits through the page. The default is 30,000
445
  :param wait_selector: Wait for a specific CSS selector to be in a specific state.
446
- :param geoip: Recommended to use with proxies; Automatically use IP's longitude, latitude, timezone, country, locale, and spoof the WebRTC IP address.
447
- It will also calculate and spoof the browser's language based on the distribution of language speakers in the target region.
 
448
  :param wait_selector_state: The state to wait for the selector given with `wait_selector`. The default state is `attached`.
 
 
 
 
449
  :param google_search: Enabled by default, Scrapling will set the referer header to be as if this request came from a Google search of this website's domain name.
450
  :param extra_headers: A dictionary of extra headers to add to the request. _The referer set by the `google_search` argument takes priority over the referer set here if used together._
451
  :param proxy: The proxy to be used with requests, it can be a string or a dictionary with the keys 'server', 'username', and 'password' only.
452
- :param additional_args: Additional arguments to be passed to Camoufox as additional settings, and it takes higher priority than Scrapling's settings.
453
  """
454
  page = await StealthyFetcher.async_fetch(
455
  url,
456
  wait=wait,
457
  proxy=proxy,
458
- geoip=geoip,
459
- addons=addons,
460
  timeout=timeout,
461
  cookies=cookies,
462
  headless=headless,
463
- humanize=humanize,
 
 
 
464
  allow_webgl=allow_webgl,
465
- disable_ads=disable_ads,
466
  network_idle=network_idle,
467
- block_images=block_images,
468
  block_webrtc=block_webrtc,
469
- os_randomize=os_randomize,
470
  wait_selector=wait_selector,
471
  google_search=google_search,
472
  extra_headers=extra_headers,
 
473
  solve_cloudflare=solve_cloudflare,
474
  disable_resources=disable_resources,
475
  wait_selector_state=wait_selector_state,
476
- additional_args=additional_args,
477
  )
478
  return _ContentTranslator(
479
  Convertor._extract_content(
@@ -492,29 +481,29 @@ class ScraplingMCPServer:
492
  css_selector: Optional[str] = None,
493
  main_content_only: bool = True,
494
  headless: bool = True, # noqa: F821
495
- block_images: bool = False,
496
- disable_resources: bool = False,
497
- block_webrtc: bool = False,
498
- allow_webgl: bool = True,
499
- network_idle: bool = False,
500
- humanize: bool | float = True,
501
- solve_cloudflare: bool = False,
502
  wait: int | float = 0,
 
 
 
 
 
 
 
503
  timeout: int | float = 30000,
 
504
  wait_selector: Optional[str] = None,
505
- addons: Optional[List[str]] = None,
506
- wait_selector_state: SelectorWaitStates = "attached",
507
  cookies: Optional[List[Dict]] = None,
508
- google_search: bool = True,
509
- extra_headers: Optional[Dict[str, str]] = None,
510
- proxy: Optional[str | Dict[str, str]] = None,
511
- os_randomize: bool = False,
512
- disable_ads: bool = False,
513
- geoip: bool = False,
514
  additional_args: Optional[Dict] = None,
515
  ) -> List[ResponseModel]:
516
- """Use Scrapling's version of the Camoufox browser to fetch a group of URLs at the same time, and for each page return a structured output of the result.
517
- Note: This is best suitable for high protection levels. It's slower than the other tools.
518
  Note: If the `css_selector` resolves to more than one element, all the elements will be returned.
519
 
520
  :param urls: A tuple of the URLs to request.
@@ -525,54 +514,52 @@ class ScraplingMCPServer:
525
  :param css_selector: CSS selector to extract the content from the page. If main_content_only is True, then it will be executed on the main content of the page. Defaults to None.
526
  :param main_content_only: Whether to extract only the main content of the page. Defaults to True. The main content here is the data inside the `<body>` tag.
527
  :param headless: Run the browser in headless/hidden (default), or headful/visible mode.
528
- :param block_images: Prevent the loading of images through Firefox preferences.
529
- This can help save your proxy usage but be careful with this option as it makes some websites never finish loading.
530
  :param disable_resources: Drop requests of unnecessary resources for a speed boost. It depends, but it made requests ~25% faster in my tests for some websites.
531
  Requests dropped are of type `font`, `image`, `media`, `beacon`, `object`, `imageset`, `texttrack`, `websocket`, `csp_report`, and `stylesheet`.
532
  This can help save your proxy usage but be careful with this option as it makes some websites never finish loading.
533
- :param block_webrtc: Blocks WebRTC entirely.
534
  :param cookies: Set cookies for the next request.
535
- :param addons: List of Firefox addons to use. Must be paths to extracted addons.
536
- :param humanize: Humanize the cursor movement. Takes either True or the MAX duration in seconds of the cursor movement. The cursor typically takes up to 1.5 seconds to move across the window.
537
  :param solve_cloudflare: Solves all types of the Cloudflare's Turnstile/Interstitial challenges before returning the response to you.
538
  :param allow_webgl: Enabled by default. Disabling WebGL is not recommended as many WAFs now check if WebGL is enabled.
539
  :param network_idle: Wait for the page until there are no network connections for at least 500 ms.
540
- :param disable_ads: Disabled by default, this installs the `uBlock Origin` addon on the browser if enabled.
541
- :param os_randomize: If enabled, Scrapling will randomize the OS fingerprints used. The default is Scrapling matching the fingerprints with the current OS.
542
  :param wait: The time (milliseconds) the fetcher will wait after everything finishes before closing the page and returning the ` Response ` object.
543
  :param timeout: The timeout in milliseconds that is used in all operations and waits through the page. The default is 30,000
544
  :param wait_selector: Wait for a specific CSS selector to be in a specific state.
545
- :param geoip: Recommended to use with proxies; Automatically use IP's longitude, latitude, timezone, country, locale, and spoof the WebRTC IP address.
546
- It will also calculate and spoof the browser's language based on the distribution of language speakers in the target region.
 
547
  :param wait_selector_state: The state to wait for the selector given with `wait_selector`. The default state is `attached`.
 
 
 
 
548
  :param google_search: Enabled by default, Scrapling will set the referer header to be as if this request came from a Google search of this website's domain name.
549
  :param extra_headers: A dictionary of extra headers to add to the request. _The referer set by the `google_search` argument takes priority over the referer set here if used together._
550
  :param proxy: The proxy to be used with requests, it can be a string or a dictionary with the keys 'server', 'username', and 'password' only.
551
- :param additional_args: Additional arguments to be passed to Camoufox as additional settings, and it takes higher priority than Scrapling's settings.
552
  """
553
  async with AsyncStealthySession(
554
  wait=wait,
555
  proxy=proxy,
556
- geoip=geoip,
557
- addons=addons,
558
  timeout=timeout,
559
  cookies=cookies,
560
  headless=headless,
561
- humanize=humanize,
562
- max_pages=len(urls),
 
 
563
  allow_webgl=allow_webgl,
564
- disable_ads=disable_ads,
565
- block_images=block_images,
566
- block_webrtc=block_webrtc,
567
  network_idle=network_idle,
568
- os_randomize=os_randomize,
569
  wait_selector=wait_selector,
570
  google_search=google_search,
571
  extra_headers=extra_headers,
 
572
  solve_cloudflare=solve_cloudflare,
573
  disable_resources=disable_resources,
574
  wait_selector_state=wait_selector_state,
575
- additional_args=additional_args,
576
  ) as session:
577
  tasks = [session.fetch(url) for url in urls]
578
  responses = await gather(*tasks)
 
213
  main_content_only: bool = True,
214
  headless: bool = False,
215
  google_search: bool = True,
 
 
216
  real_chrome: bool = False,
 
217
  wait: int | float = 0,
218
  proxy: Optional[str | Dict[str, str]] = None,
219
+ timezone_id: str | None = None,
220
+ locale: str | None = None,
221
  extra_headers: Optional[Dict[str, str]] = None,
222
  useragent: Optional[str] = None,
223
  cdp_url: Optional[str] = None,
 
249
  :param timeout: The timeout in milliseconds that is used in all operations and waits through the page. The default is 30,000
250
  :param wait: The time (milliseconds) the fetcher will wait after everything finishes before closing the page and returning the ` Response ` object.
251
  :param wait_selector: Wait for a specific CSS selector to be in a specific state.
252
+ :param timezone_id: Changes the timezone of the browser. Defaults to the system timezone.
253
+ :param locale: Specify user locale, for example, `en-GB`, `de-DE`, etc. Locale will affect navigator.language value, Accept-Language request header value as well as number and date formatting
254
+ rules. Defaults to the system default locale.
255
  :param wait_selector_state: The state to wait for the selector given with `wait_selector`. The default state is `attached`.
 
256
  :param real_chrome: If you have a Chrome browser installed on your device, enable this, and the Fetcher will launch an instance of your browser and use it.
 
 
257
  :param cdp_url: Instead of launching a new browser instance, connect to this CDP URL to control real browsers through CDP.
258
  :param google_search: Enabled by default, Scrapling will set the referer header to be as if this request came from a Google search of this website's domain name.
259
  :param extra_headers: A dictionary of extra headers to add to the request. _The referer set by the `google_search` argument takes priority over the referer set here if used together._
 
266
  locale=locale,
267
  timeout=timeout,
268
  cookies=cookies,
 
269
  cdp_url=cdp_url,
270
  headless=headless,
271
  useragent=useragent,
272
+ timezone_id=timezone_id,
273
  real_chrome=real_chrome,
274
  network_idle=network_idle,
275
  wait_selector=wait_selector,
 
276
  extra_headers=extra_headers,
277
  google_search=google_search,
278
  disable_resources=disable_resources,
 
296
  main_content_only: bool = True,
297
  headless: bool = False,
298
  google_search: bool = True,
 
 
299
  real_chrome: bool = False,
 
300
  wait: int | float = 0,
301
  proxy: Optional[str | Dict[str, str]] = None,
302
+ timezone_id: str | None = None,
303
+ locale: str | None = None,
304
  extra_headers: Optional[Dict[str, str]] = None,
305
  useragent: Optional[str] = None,
306
  cdp_url: Optional[str] = None,
 
332
  :param timeout: The timeout in milliseconds that is used in all operations and waits through the page. The default is 30,000
333
  :param wait: The time (milliseconds) the fetcher will wait after everything finishes before closing the page and returning the ` Response ` object.
334
  :param wait_selector: Wait for a specific CSS selector to be in a specific state.
335
+ :param timezone_id: Changes the timezone of the browser. Defaults to the system timezone.
336
+ :param locale: Specify user locale, for example, `en-GB`, `de-DE`, etc. Locale will affect navigator.language value, Accept-Language request header value as well as number and date formatting
337
+ rules. Defaults to the system default locale.
338
  :param wait_selector_state: The state to wait for the selector given with `wait_selector`. The default state is `attached`.
 
339
  :param real_chrome: If you have a Chrome browser installed on your device, enable this, and the Fetcher will launch an instance of your browser and use it.
 
 
340
  :param cdp_url: Instead of launching a new browser instance, connect to this CDP URL to control real browsers through CDP.
341
  :param google_search: Enabled by default, Scrapling will set the referer header to be as if this request came from a Google search of this website's domain name.
342
  :param extra_headers: A dictionary of extra headers to add to the request. _The referer set by the `google_search` argument takes priority over the referer set here if used together._
 
348
  locale=locale,
349
  timeout=timeout,
350
  cookies=cookies,
 
351
  cdp_url=cdp_url,
352
  headless=headless,
353
  max_pages=len(urls),
354
  useragent=useragent,
355
+ timezone_id=timezone_id,
356
  real_chrome=real_chrome,
357
  network_idle=network_idle,
358
  wait_selector=wait_selector,
359
  google_search=google_search,
 
360
  extra_headers=extra_headers,
361
  disable_resources=disable_resources,
362
  wait_selector_state=wait_selector_state,
 
383
  css_selector: Optional[str] = None,
384
  main_content_only: bool = True,
385
  headless: bool = True, # noqa: F821
386
+ google_search: bool = True,
387
+ real_chrome: bool = False,
 
 
 
 
 
388
  wait: int | float = 0,
389
+ proxy: Optional[str | Dict[str, str]] = None,
390
+ timezone_id: str | None = None,
391
+ locale: str | None = None,
392
+ extra_headers: Optional[Dict[str, str]] = None,
393
+ useragent: Optional[str] = None,
394
+ hide_canvas: bool = False,
395
+ cdp_url: Optional[str] = None,
396
  timeout: int | float = 30000,
397
+ disable_resources: bool = False,
398
  wait_selector: Optional[str] = None,
 
 
399
  cookies: Optional[List[Dict]] = None,
400
+ network_idle: bool = False,
401
+ wait_selector_state: SelectorWaitStates = "attached",
402
+ block_webrtc: bool = False,
403
+ allow_webgl: bool = True,
404
+ solve_cloudflare: bool = False,
 
405
  additional_args: Optional[Dict] = None,
406
  ) -> ResponseModel:
407
+ """Use the stealthy fetcher to fetch a URL and return a structured output of the result.
408
+ Note: This is the only suitable fetcher for high protection levels.
409
  Note: If the `css_selector` resolves to more than one element, all the elements will be returned.
410
 
411
  :param url: The URL to request.
 
416
  :param css_selector: CSS selector to extract the content from the page. If main_content_only is True, then it will be executed on the main content of the page. Defaults to None.
417
  :param main_content_only: Whether to extract only the main content of the page. Defaults to True. The main content here is the data inside the `<body>` tag.
418
  :param headless: Run the browser in headless/hidden (default), or headful/visible mode.
 
 
419
  :param disable_resources: Drop requests of unnecessary resources for a speed boost. It depends, but it made requests ~25% faster in my tests for some websites.
420
  Requests dropped are of type `font`, `image`, `media`, `beacon`, `object`, `imageset`, `texttrack`, `websocket`, `csp_report`, and `stylesheet`.
421
  This can help save your proxy usage but be careful with this option as it makes some websites never finish loading.
422
+ :param useragent: Pass a useragent string to be used. Otherwise the fetcher will generate a real Useragent of the same browser and use it.
423
  :param cookies: Set cookies for the next request.
 
 
424
  :param solve_cloudflare: Solves all types of the Cloudflare's Turnstile/Interstitial challenges before returning the response to you.
425
  :param allow_webgl: Enabled by default. Disabling WebGL is not recommended as many WAFs now check if WebGL is enabled.
426
  :param network_idle: Wait for the page until there are no network connections for at least 500 ms.
 
 
427
  :param wait: The time (milliseconds) the fetcher will wait after everything finishes before closing the page and returning the ` Response ` object.
428
  :param timeout: The timeout in milliseconds that is used in all operations and waits through the page. The default is 30,000
429
  :param wait_selector: Wait for a specific CSS selector to be in a specific state.
430
+ :param timezone_id: Changes the timezone of the browser. Defaults to the system timezone.
431
+ :param locale: Specify user locale, for example, `en-GB`, `de-DE`, etc. Locale will affect navigator.language value, Accept-Language request header value as well as number and date formatting
432
+ rules. Defaults to the system default locale.
433
  :param wait_selector_state: The state to wait for the selector given with `wait_selector`. The default state is `attached`.
434
+ :param real_chrome: If you have a Chrome browser installed on your device, enable this, and the Fetcher will launch an instance of your browser and use it.
435
+ :param hide_canvas: Add random noise to canvas operations to prevent fingerprinting.
436
+ :param block_webrtc: Forces WebRTC to respect proxy settings to prevent local IP address leak.
437
+ :param cdp_url: Instead of launching a new browser instance, connect to this CDP URL to control real browsers through CDP.
438
  :param google_search: Enabled by default, Scrapling will set the referer header to be as if this request came from a Google search of this website's domain name.
439
  :param extra_headers: A dictionary of extra headers to add to the request. _The referer set by the `google_search` argument takes priority over the referer set here if used together._
440
  :param proxy: The proxy to be used with requests, it can be a string or a dictionary with the keys 'server', 'username', and 'password' only.
441
+ :param additional_args: Additional arguments to be passed to Playwright's context as additional settings, and it takes higher priority than Scrapling's settings.
442
  """
443
  page = await StealthyFetcher.async_fetch(
444
  url,
445
  wait=wait,
446
  proxy=proxy,
447
+ locale=locale,
448
+ cdp_url=cdp_url,
449
  timeout=timeout,
450
  cookies=cookies,
451
  headless=headless,
452
+ useragent=useragent,
453
+ timezone_id=timezone_id,
454
+ real_chrome=real_chrome,
455
+ hide_canvas=hide_canvas,
456
  allow_webgl=allow_webgl,
 
457
  network_idle=network_idle,
 
458
  block_webrtc=block_webrtc,
 
459
  wait_selector=wait_selector,
460
  google_search=google_search,
461
  extra_headers=extra_headers,
462
+ additional_args=additional_args,
463
  solve_cloudflare=solve_cloudflare,
464
  disable_resources=disable_resources,
465
  wait_selector_state=wait_selector_state,
 
466
  )
467
  return _ContentTranslator(
468
  Convertor._extract_content(
 
481
  css_selector: Optional[str] = None,
482
  main_content_only: bool = True,
483
  headless: bool = True, # noqa: F821
484
+ google_search: bool = True,
485
+ real_chrome: bool = False,
 
 
 
 
 
486
  wait: int | float = 0,
487
+ proxy: Optional[str | Dict[str, str]] = None,
488
+ timezone_id: str | None = None,
489
+ locale: str | None = None,
490
+ extra_headers: Optional[Dict[str, str]] = None,
491
+ useragent: Optional[str] = None,
492
+ hide_canvas: bool = False,
493
+ cdp_url: Optional[str] = None,
494
  timeout: int | float = 30000,
495
+ disable_resources: bool = False,
496
  wait_selector: Optional[str] = None,
 
 
497
  cookies: Optional[List[Dict]] = None,
498
+ network_idle: bool = False,
499
+ wait_selector_state: SelectorWaitStates = "attached",
500
+ block_webrtc: bool = False,
501
+ allow_webgl: bool = True,
502
+ solve_cloudflare: bool = False,
 
503
  additional_args: Optional[Dict] = None,
504
  ) -> List[ResponseModel]:
505
+ """Use the stealthy fetcher to fetch a group of URLs at the same time, and for each page return a structured output of the result.
506
+ Note: This is the only suitable fetcher for high protection levels.
507
  Note: If the `css_selector` resolves to more than one element, all the elements will be returned.
508
 
509
  :param urls: A tuple of the URLs to request.
 
514
  :param css_selector: CSS selector to extract the content from the page. If main_content_only is True, then it will be executed on the main content of the page. Defaults to None.
515
  :param main_content_only: Whether to extract only the main content of the page. Defaults to True. The main content here is the data inside the `<body>` tag.
516
  :param headless: Run the browser in headless/hidden (default), or headful/visible mode.
 
 
517
  :param disable_resources: Drop requests of unnecessary resources for a speed boost. It depends, but it made requests ~25% faster in my tests for some websites.
518
  Requests dropped are of type `font`, `image`, `media`, `beacon`, `object`, `imageset`, `texttrack`, `websocket`, `csp_report`, and `stylesheet`.
519
  This can help save your proxy usage but be careful with this option as it makes some websites never finish loading.
520
+ :param useragent: Pass a useragent string to be used. Otherwise the fetcher will generate a real Useragent of the same browser and use it.
521
  :param cookies: Set cookies for the next request.
 
 
522
  :param solve_cloudflare: Solves all types of the Cloudflare's Turnstile/Interstitial challenges before returning the response to you.
523
  :param allow_webgl: Enabled by default. Disabling WebGL is not recommended as many WAFs now check if WebGL is enabled.
524
  :param network_idle: Wait for the page until there are no network connections for at least 500 ms.
 
 
525
  :param wait: The time (milliseconds) the fetcher will wait after everything finishes before closing the page and returning the ` Response ` object.
526
  :param timeout: The timeout in milliseconds that is used in all operations and waits through the page. The default is 30,000
527
  :param wait_selector: Wait for a specific CSS selector to be in a specific state.
528
+ :param timezone_id: Changes the timezone of the browser. Defaults to the system timezone.
529
+ :param locale: Specify user locale, for example, `en-GB`, `de-DE`, etc. Locale will affect navigator.language value, Accept-Language request header value as well as number and date formatting
530
+ rules. Defaults to the system default locale.
531
  :param wait_selector_state: The state to wait for the selector given with `wait_selector`. The default state is `attached`.
532
+ :param real_chrome: If you have a Chrome browser installed on your device, enable this, and the Fetcher will launch an instance of your browser and use it.
533
+ :param hide_canvas: Add random noise to canvas operations to prevent fingerprinting.
534
+ :param block_webrtc: Forces WebRTC to respect proxy settings to prevent local IP address leak.
535
+ :param cdp_url: Instead of launching a new browser instance, connect to this CDP URL to control real browsers through CDP.
536
  :param google_search: Enabled by default, Scrapling will set the referer header to be as if this request came from a Google search of this website's domain name.
537
  :param extra_headers: A dictionary of extra headers to add to the request. _The referer set by the `google_search` argument takes priority over the referer set here if used together._
538
  :param proxy: The proxy to be used with requests, it can be a string or a dictionary with the keys 'server', 'username', and 'password' only.
539
+ :param additional_args: Additional arguments to be passed to Playwright's context as additional settings, and it takes higher priority than Scrapling's settings.
540
  """
541
  async with AsyncStealthySession(
542
  wait=wait,
543
  proxy=proxy,
544
+ locale=locale,
545
+ cdp_url=cdp_url,
546
  timeout=timeout,
547
  cookies=cookies,
548
  headless=headless,
549
+ useragent=useragent,
550
+ timezone_id=timezone_id,
551
+ real_chrome=real_chrome,
552
+ hide_canvas=hide_canvas,
553
  allow_webgl=allow_webgl,
 
 
 
554
  network_idle=network_idle,
555
+ block_webrtc=block_webrtc,
556
  wait_selector=wait_selector,
557
  google_search=google_search,
558
  extra_headers=extra_headers,
559
+ additional_args=additional_args,
560
  solve_cloudflare=solve_cloudflare,
561
  disable_resources=disable_resources,
562
  wait_selector_state=wait_selector_state,
 
563
  ) as session:
564
  tasks = [session.fetch(url) for url in urls]
565
  responses = await gather(*tasks)
scrapling/engines/_browsers/_base.py CHANGED
@@ -1,7 +1,6 @@
1
  from time import time
2
  from asyncio import sleep as asyncio_sleep, Lock
3
 
4
- from camoufox import DefaultAddons
5
  from playwright.sync_api._generated import Page
6
  from playwright.sync_api import (
7
  Frame,
@@ -17,18 +16,18 @@ from playwright.async_api import (
17
  BrowserContext as AsyncBrowserContext,
18
  )
19
  from playwright._impl._errors import Error as PlaywrightError
20
- from camoufox.pkgman import installed_verstr as camoufox_version
21
- from camoufox.utils import launch_options as generate_launch_options
22
 
23
  from ._page import PageInfo, PagePool
24
  from scrapling.parser import Selector
25
- from scrapling.core._types import Any, cast, Dict, List, Optional, Callable, TYPE_CHECKING
26
- from scrapling.engines.toolbelt.fingerprints import get_os_name
27
- from ._validators import validate, PlaywrightConfig, CamoufoxConfig
28
- from ._config_tools import _compiled_stealth_scripts, _launch_kwargs, _context_kwargs
29
  from scrapling.engines.toolbelt.navigation import intercept_route, async_intercept_route
30
-
31
- __ff_version_str__ = camoufox_version().split(".", 1)[0]
 
 
 
 
32
 
33
 
34
  class SyncSession:
@@ -84,10 +83,6 @@ class SyncSession:
84
  if disable_resources:
85
  page.route("**/*", intercept_route)
86
 
87
- if getattr(self, "stealth", False):
88
- for script in _compiled_stealth_scripts():
89
- page.add_init_script(script=script)
90
-
91
  page_info = self.page_pool.add_page(page)
92
  page_info.mark_busy()
93
  return page_info
@@ -202,10 +197,6 @@ class AsyncSession:
202
  if disable_resources:
203
  await page.route("**/*", async_intercept_route)
204
 
205
- if getattr(self, "stealth", False):
206
- for script in _compiled_stealth_scripts():
207
- await page.add_init_script(script=script)
208
-
209
  return self.page_pool.add_page(page)
210
 
211
  def get_pool_stats(self) -> Dict[str, int]:
@@ -251,151 +242,118 @@ class AsyncSession:
251
  return handle_response
252
 
253
 
254
- class DynamicSessionMixin:
255
- def __validate__(self, **params):
 
 
 
 
 
 
 
 
 
 
 
 
 
 
256
  if "__max_pages" in params:
257
  params["max_pages"] = params.pop("__max_pages")
258
 
259
- config = validate(params, model=PlaywrightConfig)
260
-
261
- self._max_pages = config.max_pages
262
- self._headless = config.headless
263
- self._hide_canvas = config.hide_canvas
264
- self._disable_webgl = config.disable_webgl
265
- self._real_chrome = config.real_chrome
266
- self._stealth = config.stealth
267
- self._google_search = config.google_search
268
- self._wait = config.wait
269
- self._proxy = config.proxy
270
- self._locale = config.locale
271
- self._extra_headers = config.extra_headers
272
- self._useragent = config.useragent
273
- self._timeout = config.timeout
274
- self._cookies = config.cookies
275
- self._disable_resources = config.disable_resources
276
- self._cdp_url = config.cdp_url
277
- self._network_idle = config.network_idle
278
- self._load_dom = config.load_dom
279
- self._wait_selector = config.wait_selector
280
- self._init_script = config.init_script
281
- self._wait_selector_state = config.wait_selector_state
282
- self._extra_flags = config.extra_flags
283
- self._selector_config = config.selector_config
284
- self._timezone_id = config.timezone_id
285
- self._additional_args = config.additional_args
286
- self._page_action = config.page_action
287
- self._user_data_dir = config.user_data_dir
288
- self._headers_keys = {header.lower() for header in self._extra_headers.keys()} if self._extra_headers else set()
289
- self.__initiate_browser_options__()
290
-
291
- def __initiate_browser_options__(self):
292
- if TYPE_CHECKING:
293
- assert isinstance(self._proxy, tuple)
294
-
295
- if not self._cdp_url:
296
- # `launch_options` is used with persistent context
297
- self.launch_options = dict(
298
- _launch_kwargs(
299
- self._headless,
300
- self._proxy,
301
- self._locale,
302
- tuple(self._extra_headers.items()) if self._extra_headers else tuple(),
303
- self._useragent,
304
- self._real_chrome,
305
- self._stealth,
306
- self._hide_canvas,
307
- self._disable_webgl,
308
- self._timezone_id,
309
- tuple(self._extra_flags) if self._extra_flags else tuple(),
310
- )
311
  )
312
- self.launch_options["extra_http_headers"] = dict(self.launch_options["extra_http_headers"])
313
- self.launch_options["proxy"] = dict(self.launch_options["proxy"]) or None
314
- self.launch_options["user_data_dir"] = self._user_data_dir
315
- self.launch_options.update(cast(Dict, self._additional_args))
316
- self.context_options = dict()
 
 
 
 
 
 
 
 
 
 
 
 
 
 
317
  else:
318
  # while `context_options` is left to be used when cdp mode is enabled
319
- self.launch_options = dict()
320
- self.context_options = dict(
321
- _context_kwargs(
322
- self._proxy,
323
- self._locale,
324
- tuple(self._extra_headers.items()) if self._extra_headers else tuple(),
325
- self._useragent,
326
- self._stealth,
327
- )
328
- )
329
- self.context_options["extra_http_headers"] = dict(self.context_options["extra_http_headers"])
330
- self.context_options["proxy"] = dict(self.context_options["proxy"]) or None
331
- self.context_options.update(cast(Dict, self._additional_args))
332
 
333
 
334
- class StealthySessionMixin:
335
  def __validate__(self, **params):
336
- if "__max_pages" in params:
337
- params["max_pages"] = params.pop("__max_pages")
338
 
339
- config: CamoufoxConfig = validate(params, model=CamoufoxConfig)
340
-
341
- self._max_pages = config.max_pages
342
- self._headless = config.headless
343
- self._block_images = config.block_images
344
- self._disable_resources = config.disable_resources
345
- self._block_webrtc = config.block_webrtc
346
- self._allow_webgl = config.allow_webgl
347
- self._network_idle = config.network_idle
348
- self._load_dom = config.load_dom
349
- self._humanize = config.humanize
350
- self._solve_cloudflare = config.solve_cloudflare
351
- self._wait = config.wait
352
- self._timeout = config.timeout
353
- self._page_action = config.page_action
354
- self._wait_selector = config.wait_selector
355
- self._init_script = config.init_script
356
- self._addons = config.addons
357
- self._wait_selector_state = config.wait_selector_state
358
- self._cookies = config.cookies
359
- self._google_search = config.google_search
360
- self._extra_headers = config.extra_headers
361
- self._proxy = config.proxy
362
- self._os_randomize = config.os_randomize
363
- self._disable_ads = config.disable_ads
364
- self._geoip = config.geoip
365
- self._selector_config = config.selector_config
366
- self._additional_args = config.additional_args
367
- self._user_data_dir = config.user_data_dir
368
- self._headers_keys = {header.lower() for header in self._extra_headers.keys()} if self._extra_headers else set()
369
- self.__initiate_browser_options__()
370
-
371
- def __initiate_browser_options__(self):
372
- """Initiate browser options."""
373
- self.launch_options: Dict[str, Any] = generate_launch_options(
374
- **{
375
- "geoip": self._geoip,
376
- "proxy": dict(self._proxy) if self._proxy and isinstance(self._proxy, tuple) else self._proxy,
377
- "addons": self._addons,
378
- "exclude_addons": [] if self._disable_ads else [DefaultAddons.UBO],
379
- "headless": self._headless,
380
- "humanize": True if self._solve_cloudflare else self._humanize,
381
- "i_know_what_im_doing": True, # To turn warnings off with the user configurations
382
- "allow_webgl": self._allow_webgl,
383
- "block_webrtc": self._block_webrtc,
384
- "block_images": self._block_images, # Careful! it makes some websites don't finish loading at all like stackoverflow even in headful mode.
385
- "os": None if self._os_randomize else get_os_name(),
386
- "user_data_dir": self._user_data_dir,
387
- "ff_version": __ff_version_str__,
388
- "firefox_user_prefs": {
389
- # This is what enabling `enable_cache` does internally, so we do it from here instead
390
- "browser.sessionhistory.max_entries": 10,
391
- "browser.sessionhistory.max_total_viewers": -1,
392
- "browser.cache.memory.enable": True,
393
- "browser.cache.disk_cache_ssl": True,
394
- "browser.cache.disk.smart_size.enabled": True,
395
- },
396
- **cast(Dict, self._additional_args),
397
  }
398
  )
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
399
 
400
  @staticmethod
401
  def _detect_cloudflare(page_content: str) -> str | None:
 
1
  from time import time
2
  from asyncio import sleep as asyncio_sleep, Lock
3
 
 
4
  from playwright.sync_api._generated import Page
5
  from playwright.sync_api import (
6
  Frame,
 
16
  BrowserContext as AsyncBrowserContext,
17
  )
18
  from playwright._impl._errors import Error as PlaywrightError
 
 
19
 
20
  from ._page import PageInfo, PagePool
21
  from scrapling.parser import Selector
22
+ from ._validators import validate, PlaywrightConfig, StealthConfig
23
+ from ._config_tools import __default_chrome_useragent__, __default_useragent__
 
 
24
  from scrapling.engines.toolbelt.navigation import intercept_route, async_intercept_route
25
+ from scrapling.core._types import Any, cast, Dict, List, Optional, Callable, TYPE_CHECKING, overload, Tuple
26
+ from scrapling.engines.constants import (
27
+ DEFAULT_STEALTH_FLAGS,
28
+ HARMFUL_DEFAULT_ARGS,
29
+ DEFAULT_FLAGS,
30
+ )
31
 
32
 
33
  class SyncSession:
 
83
  if disable_resources:
84
  page.route("**/*", intercept_route)
85
 
 
 
 
 
86
  page_info = self.page_pool.add_page(page)
87
  page_info.mark_busy()
88
  return page_info
 
197
  if disable_resources:
198
  await page.route("**/*", async_intercept_route)
199
 
 
 
 
 
200
  return self.page_pool.add_page(page)
201
 
202
  def get_pool_stats(self) -> Dict[str, int]:
 
242
  return handle_response
243
 
244
 
245
+ class BaseSessionMixin:
246
+ @overload
247
+ def __validate_routine__(self, params: Dict, model: type[StealthConfig]) -> StealthConfig: ...
248
+
249
+ @overload
250
+ def __validate_routine__(self, params: Dict, model: type[PlaywrightConfig]) -> PlaywrightConfig: ...
251
+
252
+ def __validate_routine__(
253
+ self, params: Dict, model: type[PlaywrightConfig] | type[StealthConfig]
254
+ ) -> PlaywrightConfig | StealthConfig:
255
+ # Dark color scheme bypasses the 'prefersLightColor' check in creepjs
256
+ self._context_options: Dict[str, Any] = {"color_scheme": "dark", "device_scale_factor": 2}
257
+ self._launch_options: Dict[str, Any] = self._context_options | {
258
+ "args": DEFAULT_FLAGS,
259
+ "ignore_default_args": HARMFUL_DEFAULT_ARGS,
260
+ }
261
  if "__max_pages" in params:
262
  params["max_pages"] = params.pop("__max_pages")
263
 
264
+ config = validate(params, model=model)
265
+ self._headers_keys = (
266
+ {header.lower() for header in config.extra_headers.keys()} if config.extra_headers else set()
267
+ )
268
+
269
+ return config
270
+
271
+ def __generate_options__(self, extra_flags: Tuple | None = None) -> None:
272
+ config = cast(PlaywrightConfig, getattr(self, "_config", None))
273
+ self._context_options.update(
274
+ {
275
+ "proxy": config.proxy,
276
+ "locale": config.locale,
277
+ "timezone_id": config.timezone_id,
278
+ "extra_http_headers": config.extra_headers,
279
+ }
280
+ )
281
+ # The default useragent in the headful is always correct now in the current versions of Playwright
282
+ if config.useragent:
283
+ self._context_options["user_agent"] = config.useragent
284
+ elif not config.useragent and config.headless:
285
+ self._context_options["user_agent"] = (
286
+ __default_chrome_useragent__ if config.real_chrome else __default_useragent__
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
287
  )
288
+
289
+ if not config.cdp_url:
290
+ self._launch_options |= self._context_options
291
+ self._context_options = {}
292
+ flags = self._launch_options["args"]
293
+ if config.extra_flags or extra_flags:
294
+ flags = list(set(flags + (config.extra_flags or extra_flags)))
295
+
296
+ self._launch_options.update(
297
+ {
298
+ "args": flags,
299
+ "headless": config.headless,
300
+ "user_data_dir": config.user_data_dir,
301
+ "channel": "chrome" if config.real_chrome else "chromium",
302
+ }
303
+ )
304
+
305
+ if config.additional_args:
306
+ self._launch_options.update(config.additional_args)
307
  else:
308
  # while `context_options` is left to be used when cdp mode is enabled
309
+ self._launch_options = dict()
310
+ if config.additional_args:
311
+ self._context_options.update(config.additional_args)
 
 
 
 
 
 
 
 
 
 
312
 
313
 
314
+ class DynamicSessionMixin(BaseSessionMixin):
315
  def __validate__(self, **params):
316
+ self._config = self.__validate_routine__(params, model=PlaywrightConfig)
317
+ self.__generate_options__()
318
 
319
+
320
+ class StealthySessionMixin(BaseSessionMixin):
321
+ def __validate__(self, **params):
322
+ self._config: StealthConfig = self.__validate_routine__(params, model=StealthConfig)
323
+ self._context_options.update(
324
+ {
325
+ "is_mobile": False,
326
+ "has_touch": False,
327
+ # I'm thinking about disabling it to rest from all Service Workers' headache, but let's keep it as it is for now
328
+ "service_workers": "allow",
329
+ "ignore_https_errors": True,
330
+ "screen": {"width": 1920, "height": 1080},
331
+ "viewport": {"width": 1920, "height": 1080},
332
+ "permissions": ["geolocation", "notifications"],
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
333
  }
334
  )
335
+ self.__generate_stealth_options()
336
+
337
+ def __generate_stealth_options(self) -> None:
338
+ flags = tuple()
339
+ if not self._config.cdp_url:
340
+ flags = DEFAULT_FLAGS + DEFAULT_STEALTH_FLAGS
341
+
342
+ if self._config.block_webrtc:
343
+ flags += (
344
+ "--webrtc-ip-handling-policy=disable_non_proxied_udp",
345
+ "--force-webrtc-ip-handling-policy", # Ensures the policy is enforced
346
+ )
347
+ if not self._config.allow_webgl:
348
+ flags += (
349
+ "--disable-webgl",
350
+ "--disable-webgl-image-chromium",
351
+ "--disable-webgl2",
352
+ )
353
+ if self._config.hide_canvas:
354
+ flags += ("--fingerprinting-canvas-image-data-noise",)
355
+
356
+ super(StealthySessionMixin, self).__generate_options__(flags)
357
 
358
  @staticmethod
359
  def _detect_cloudflare(page_content: str) -> str | None:
scrapling/engines/_browsers/_config_tools.py CHANGED
@@ -58,88 +58,3 @@ def _set_flags(hide_canvas, disable_webgl): # pragma: no cover
58
  )
59
 
60
  return flags
61
-
62
-
63
- @lru_cache(2, typed=True)
64
- def _launch_kwargs(
65
- headless,
66
- proxy: Tuple,
67
- locale,
68
- extra_headers,
69
- useragent,
70
- real_chrome,
71
- stealth,
72
- hide_canvas,
73
- disable_webgl,
74
- timezone_id,
75
- extra_flags: Tuple,
76
- ) -> Tuple:
77
- """Creates the arguments we will use while launching playwright's browser"""
78
- base_args = DEFAULT_FLAGS
79
- if extra_flags:
80
- base_args = base_args + extra_flags
81
-
82
- launch_kwargs = {
83
- "locale": locale,
84
- "timezone_id": timezone_id or None,
85
- "headless": headless,
86
- "args": base_args,
87
- "color_scheme": "dark", # Bypasses the 'prefersLightColor' check in creepjs
88
- "proxy": proxy or tuple(),
89
- "device_scale_factor": 2,
90
- "ignore_default_args": HARMFUL_DEFAULT_ARGS,
91
- "channel": "chrome" if real_chrome else "chromium",
92
- "extra_http_headers": extra_headers or tuple(),
93
- }
94
- # The default useragent in the headful is always correct now in the current versions of Playwright
95
- if useragent:
96
- launch_kwargs["user_agent"] = useragent
97
- elif not useragent and headless:
98
- launch_kwargs["user_agent"] = __default_chrome_useragent__ if real_chrome else __default_useragent__
99
-
100
- if stealth:
101
- stealth_args = base_args + _set_flags(hide_canvas, disable_webgl)
102
- launch_kwargs.update(
103
- {
104
- "args": stealth_args,
105
- "chromium_sandbox": True,
106
- "is_mobile": False,
107
- "has_touch": False,
108
- # I'm thinking about disabling it to rest from all Service Workers' headache, but let's keep it as it is for now
109
- "service_workers": "allow",
110
- "ignore_https_errors": True,
111
- "screen": {"width": 1920, "height": 1080},
112
- "viewport": {"width": 1920, "height": 1080},
113
- "permissions": ["geolocation", "notifications"],
114
- }
115
- )
116
-
117
- return tuple(launch_kwargs.items())
118
-
119
-
120
- @lru_cache(2, typed=True)
121
- def _context_kwargs(proxy, locale, extra_headers, useragent, stealth) -> Tuple:
122
- """Creates the arguments for the browser context"""
123
- context_kwargs = {
124
- "proxy": proxy or tuple(),
125
- "locale": locale,
126
- "color_scheme": "dark", # Bypasses the 'prefersLightColor' check in creepjs
127
- "device_scale_factor": 2,
128
- "extra_http_headers": extra_headers or tuple(),
129
- "user_agent": useragent or __default_useragent__,
130
- }
131
- if stealth:
132
- context_kwargs.update(
133
- {
134
- "is_mobile": False,
135
- "has_touch": False,
136
- # I'm thinking about disabling it to rest from all Service Workers' headache, but let's keep it as it is for now
137
- "service_workers": "allow",
138
- "ignore_https_errors": True,
139
- "screen": {"width": 1920, "height": 1080},
140
- "viewport": {"width": 1920, "height": 1080},
141
- "permissions": ["geolocation", "notifications"],
142
- }
143
- )
144
-
145
- return tuple(context_kwargs.items())
 
58
  )
59
 
60
  return flags
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
scrapling/engines/_browsers/_controllers.py CHANGED
@@ -9,8 +9,6 @@ from playwright.async_api import (
9
  Playwright as AsyncPlaywright,
10
  BrowserContext as AsyncBrowserContext,
11
  )
12
- from patchright.sync_api import sync_playwright as sync_patchright
13
- from patchright.async_api import async_playwright as async_patchright
14
 
15
  from scrapling.core.utils import log
16
  from scrapling.core._types import Unpack, TYPE_CHECKING
@@ -21,44 +19,19 @@ from scrapling.engines.toolbelt.convertor import Response, ResponseFactory
21
  from scrapling.engines.toolbelt.fingerprints import generate_convincing_referer
22
 
23
 
24
- class DynamicSession(DynamicSessionMixin, SyncSession):
25
  """A Browser session manager with page pooling."""
26
 
27
  __slots__ = (
28
- "_max_pages",
29
- "_headless",
30
- "_hide_canvas",
31
- "_disable_webgl",
32
- "_real_chrome",
33
- "_stealth",
34
- "_google_search",
35
- "_proxy",
36
- "_locale",
37
- "_extra_headers",
38
- "_useragent",
39
- "_timeout",
40
- "_cookies",
41
- "_disable_resources",
42
- "_network_idle",
43
- "_load_dom",
44
- "_wait_selector",
45
- "_init_script",
46
- "_wait_selector_state",
47
- "_wait",
48
  "playwright",
49
- "browser",
50
  "context",
51
- "page_pool",
52
  "_closed",
53
- "_selector_config",
54
- "_page_action",
55
- "launch_options",
56
- "context_options",
57
- "_cdp_url",
58
- "_headers_keys",
59
- "_extra_flags",
60
- "_additional_args",
61
- "_user_data_dir",
62
  )
63
 
64
  def __init__(self, **kwargs: Unpack[PlaywrightSession]):
@@ -76,8 +49,9 @@ class DynamicSession(DynamicSessionMixin, SyncSession):
76
  :param page_action: Added for automation. A function that takes the `page` object and does the automation you need.
77
  :param wait_selector: Wait for a specific CSS selector to be in a specific state.
78
  :param init_script: An absolute path to a JavaScript file to be executed on page creation for all pages in this session.
79
- :param locale: Set the locale for the browser if wanted. The default value is `en-US`.
80
- :param timezone_id: Set the timezone for the browser if wanted.
 
81
  :param wait_selector_state: The state to wait for the selector given with `wait_selector`. The default state is `attached`.
82
  :param stealth: Enables stealth mode, check the documentation to see what stealth mode does currently.
83
  :param real_chrome: If you have a Chrome browser installed on your device, enable this, and the Fetcher will launch an instance of your browser and use it.
@@ -94,27 +68,24 @@ class DynamicSession(DynamicSessionMixin, SyncSession):
94
  :param additional_args: Additional arguments to be passed to Playwright's context as additional settings, and it takes higher priority than Scrapling's settings.
95
  """
96
  self.__validate__(**kwargs)
97
- super().__init__(max_pages=self._max_pages)
98
 
99
  def start(self):
100
  """Create a browser for this instance and context."""
101
  if not self.playwright:
102
- sync_context = sync_patchright if self._stealth else sync_playwright
103
-
104
- self.playwright: Playwright = sync_context().start() # pyright: ignore [reportAttributeAccessIssue]
105
 
106
- if self._cdp_url: # pragma: no cover
107
- self.context = self.playwright.chromium.connect_over_cdp(endpoint_url=self._cdp_url).new_context(
108
- **self.context_options
109
- )
110
  else:
111
- self.context = self.playwright.chromium.launch_persistent_context(**self.launch_options)
112
 
113
- if self._init_script: # pragma: no cover
114
- self.context.add_init_script(path=self._init_script)
115
 
116
- if self._cookies: # pragma: no cover
117
- self.context.add_cookies(self._cookies)
118
  else:
119
  raise RuntimeError("Session has been already started")
120
 
@@ -139,7 +110,6 @@ class DynamicSession(DynamicSessionMixin, SyncSession):
139
  :return: A `Response` object.
140
  """
141
  params = _validate(kwargs, self, PlaywrightConfig)
142
-
143
  if self._closed: # pragma: no cover
144
  raise RuntimeError("Context manager has been closed")
145
 
@@ -193,7 +163,7 @@ class DynamicSession(DynamicSessionMixin, SyncSession):
193
  raise e
194
 
195
 
196
- class AsyncDynamicSession(DynamicSessionMixin, AsyncSession):
197
  """An async Browser session manager with page pooling, it's using a persistent browser Context by default with a temporary user profile directory."""
198
 
199
  def __init__(self, **kwargs: Unpack[PlaywrightSession]):
@@ -212,8 +182,9 @@ class AsyncDynamicSession(DynamicSessionMixin, AsyncSession):
212
  :param page_action: Added for automation. A function that takes the `page` object and does the automation you need.
213
  :param wait_selector: Wait for a specific CSS selector to be in a specific state.
214
  :param init_script: An absolute path to a JavaScript file to be executed on page creation for all pages in this session.
215
- :param locale: Set the locale for the browser if wanted. The default value is `en-US`.
216
- :param timezone_id: Set the timezone for the browser if wanted.
 
217
  :param wait_selector_state: The state to wait for the selector given with `wait_selector`. The default state is `attached`.
218
  :param stealth: Enables stealth mode, check the documentation to see what stealth mode does currently.
219
  :param real_chrome: If you have a Chrome browser installed on your device, enable this, and the Fetcher will launch an instance of your browser and use it.
@@ -230,28 +201,26 @@ class AsyncDynamicSession(DynamicSessionMixin, AsyncSession):
230
  :param additional_args: Additional arguments to be passed to Playwright's context as additional settings, and it takes higher priority than Scrapling's settings.
231
  """
232
  self.__validate__(**kwargs)
233
- super().__init__(max_pages=self._max_pages)
234
 
235
  async def start(self):
236
  """Create a browser for this instance and context."""
237
  if not self.playwright:
238
- async_context = async_patchright if self._stealth else async_playwright
239
-
240
- self.playwright: AsyncPlaywright = await async_context().start() # pyright: ignore [reportAttributeAccessIssue]
241
 
242
- if self._cdp_url:
243
- browser = await self.playwright.chromium.connect_over_cdp(endpoint_url=self._cdp_url)
244
- self.context: AsyncBrowserContext = await browser.new_context(**self.context_options)
245
  else:
246
  self.context: AsyncBrowserContext = await self.playwright.chromium.launch_persistent_context(
247
- **self.launch_options
248
  )
249
 
250
- if self._init_script: # pragma: no cover
251
- await self.context.add_init_script(path=self._init_script)
252
 
253
- if self._cookies:
254
- await self.context.add_cookies(self._cookies) # pyright: ignore
255
  else:
256
  raise RuntimeError("Session has been already started")
257
 
 
9
  Playwright as AsyncPlaywright,
10
  BrowserContext as AsyncBrowserContext,
11
  )
 
 
12
 
13
  from scrapling.core.utils import log
14
  from scrapling.core._types import Unpack, TYPE_CHECKING
 
19
  from scrapling.engines.toolbelt.fingerprints import generate_convincing_referer
20
 
21
 
22
+ class DynamicSession(SyncSession, DynamicSessionMixin):
23
  """A Browser session manager with page pooling."""
24
 
25
  __slots__ = (
26
+ "_config",
27
+ "_context_options",
28
+ "_launch_options",
29
+ "max_pages",
30
+ "page_pool",
31
+ "_max_wait_for_page",
 
 
 
 
 
 
 
 
 
 
 
 
 
 
32
  "playwright",
 
33
  "context",
 
34
  "_closed",
 
 
 
 
 
 
 
 
 
35
  )
36
 
37
  def __init__(self, **kwargs: Unpack[PlaywrightSession]):
 
49
  :param page_action: Added for automation. A function that takes the `page` object and does the automation you need.
50
  :param wait_selector: Wait for a specific CSS selector to be in a specific state.
51
  :param init_script: An absolute path to a JavaScript file to be executed on page creation for all pages in this session.
52
+ :param locale: Specify user locale, for example, `en-GB`, `de-DE`, etc. Locale will affect navigator.language value, Accept-Language request header value as well as number and date formatting
53
+ rules. Defaults to the system default locale.
54
+ :param timezone_id: Changes the timezone of the browser. Defaults to the system timezone.
55
  :param wait_selector_state: The state to wait for the selector given with `wait_selector`. The default state is `attached`.
56
  :param stealth: Enables stealth mode, check the documentation to see what stealth mode does currently.
57
  :param real_chrome: If you have a Chrome browser installed on your device, enable this, and the Fetcher will launch an instance of your browser and use it.
 
68
  :param additional_args: Additional arguments to be passed to Playwright's context as additional settings, and it takes higher priority than Scrapling's settings.
69
  """
70
  self.__validate__(**kwargs)
71
+ super().__init__()
72
 
73
  def start(self):
74
  """Create a browser for this instance and context."""
75
  if not self.playwright:
76
+ self.playwright: Playwright = sync_playwright().start() # pyright: ignore [reportAttributeAccessIssue]
 
 
77
 
78
+ if self._config.cdp_url: # pragma: no cover
79
+ browser = self.playwright.chromium.connect_over_cdp(endpoint_url=self._config.cdp_url)
80
+ self.context = browser.new_context(**self._context_options)
 
81
  else:
82
+ self.context = self.playwright.chromium.launch_persistent_context(**self._launch_options)
83
 
84
+ if self._config.init_script: # pragma: no cover
85
+ self.context.add_init_script(path=self._config.init_script)
86
 
87
+ if self._config.cookies: # pragma: no cover
88
+ self.context.add_cookies(self._config.cookies)
89
  else:
90
  raise RuntimeError("Session has been already started")
91
 
 
110
  :return: A `Response` object.
111
  """
112
  params = _validate(kwargs, self, PlaywrightConfig)
 
113
  if self._closed: # pragma: no cover
114
  raise RuntimeError("Context manager has been closed")
115
 
 
163
  raise e
164
 
165
 
166
+ class AsyncDynamicSession(AsyncSession, DynamicSessionMixin):
167
  """An async Browser session manager with page pooling, it's using a persistent browser Context by default with a temporary user profile directory."""
168
 
169
  def __init__(self, **kwargs: Unpack[PlaywrightSession]):
 
182
  :param page_action: Added for automation. A function that takes the `page` object and does the automation you need.
183
  :param wait_selector: Wait for a specific CSS selector to be in a specific state.
184
  :param init_script: An absolute path to a JavaScript file to be executed on page creation for all pages in this session.
185
+ :param locale: Specify user locale, for example, `en-GB`, `de-DE`, etc. Locale will affect navigator.language value, Accept-Language request header value as well as number and date formatting
186
+ rules. Defaults to the system default locale.
187
+ :param timezone_id: Changes the timezone of the browser. Defaults to the system timezone.
188
  :param wait_selector_state: The state to wait for the selector given with `wait_selector`. The default state is `attached`.
189
  :param stealth: Enables stealth mode, check the documentation to see what stealth mode does currently.
190
  :param real_chrome: If you have a Chrome browser installed on your device, enable this, and the Fetcher will launch an instance of your browser and use it.
 
201
  :param additional_args: Additional arguments to be passed to Playwright's context as additional settings, and it takes higher priority than Scrapling's settings.
202
  """
203
  self.__validate__(**kwargs)
204
+ super().__init__(max_pages=self._config.max_pages)
205
 
206
  async def start(self):
207
  """Create a browser for this instance and context."""
208
  if not self.playwright:
209
+ self.playwright: AsyncPlaywright = await async_playwright().start() # pyright: ignore [reportAttributeAccessIssue]
 
 
210
 
211
+ if self._config.cdp_url:
212
+ browser = await self.playwright.chromium.connect_over_cdp(endpoint_url=self._config.cdp_url)
213
+ self.context: AsyncBrowserContext = await browser.new_context(**self._context_options)
214
  else:
215
  self.context: AsyncBrowserContext = await self.playwright.chromium.launch_persistent_context(
216
+ **self._launch_options
217
  )
218
 
219
+ if self._config.init_script: # pragma: no cover
220
+ await self.context.add_init_script(path=self._config.init_script)
221
 
222
+ if self._config.cookies:
223
+ await self.context.add_cookies(self._config.cookies) # pyright: ignore
224
  else:
225
  raise RuntimeError("Session has been already started")
226
 
scrapling/engines/_browsers/{_camoufox.py → _stealth.py} RENAMED
@@ -2,117 +2,102 @@ from random import randint
2
  from re import compile as re_compile
3
 
4
  from playwright.sync_api import (
5
- Page,
6
  Locator,
7
- sync_playwright,
 
8
  )
9
  from playwright.async_api import (
10
- async_playwright,
11
  Page as async_Page,
12
  Locator as AsyncLocator,
13
  Playwright as AsyncPlaywright,
14
  BrowserContext as AsyncBrowserContext,
15
  )
 
 
16
 
17
  from scrapling.core.utils import log
18
- from ._types import CamoufoxSession, CamoufoxFetchParams
19
- from scrapling.core._types import Any, Unpack, TYPE_CHECKING
 
20
  from ._base import SyncSession, AsyncSession, StealthySessionMixin
21
- from ._validators import validate_fetch as _validate, CamoufoxConfig
22
  from scrapling.engines.toolbelt.convertor import Response, ResponseFactory
23
  from scrapling.engines.toolbelt.fingerprints import generate_convincing_referer
24
 
25
  __CF_PATTERN__ = re_compile("challenges.cloudflare.com/cdn-cgi/challenge-platform/.*")
26
 
27
 
28
- class StealthySession(StealthySessionMixin, SyncSession):
29
- """A Stealthy session manager with page pooling."""
30
 
31
  __slots__ = (
32
- "_max_pages",
33
- "_headless",
34
- "_block_images",
35
- "_disable_resources",
36
- "_block_webrtc",
37
- "_allow_webgl",
38
- "_network_idle",
39
- "_load_dom",
40
- "_humanize",
41
- "_solve_cloudflare",
42
- "_wait",
43
- "_timeout",
44
- "_page_action",
45
- "_wait_selector",
46
- "_init_script",
47
- "_addons",
48
- "_wait_selector_state",
49
- "_cookies",
50
- "_google_search",
51
- "_extra_headers",
52
- "_proxy",
53
- "_os_randomize",
54
- "_disable_ads",
55
- "_geoip",
56
- "_selector_config",
57
- "_additional_args",
58
  "playwright",
59
- "browser",
60
  "context",
61
- "page_pool",
62
  "_closed",
63
- "launch_options",
64
- "_headers_keys",
65
- "_user_data_dir",
66
  )
67
 
68
- def __init__(self, **kwargs: Unpack[CamoufoxSession]):
69
- """A Browser session manager with page pooling
70
 
71
  :param headless: Run the browser in headless/hidden (default), or headful/visible mode.
72
- :param block_images: Prevent the loading of images through Firefox preferences.
73
- This can help save your proxy usage but be careful with this option as it makes some websites never finish loading.
74
  :param disable_resources: Drop requests of unnecessary resources for a speed boost. It depends, but it made requests ~25% faster in my tests for some websites.
75
  Requests dropped are of type `font`, `image`, `media`, `beacon`, `object`, `imageset`, `texttrack`, `websocket`, `csp_report`, and `stylesheet`.
76
  This can help save your proxy usage but be careful with this option as it makes some websites never finish loading.
77
- :param block_webrtc: Blocks WebRTC entirely.
78
  :param cookies: Set cookies for the next request.
79
- :param addons: List of Firefox addons to use. Must be paths to extracted addons.
80
- :param humanize: Humanize the cursor movement. Takes either True or the MAX duration in seconds of the cursor movement. The cursor typically takes up to 1.5 seconds to move across the window.
81
- :param solve_cloudflare: Solves all types of the Cloudflare's Turnstile/Interstitial challenges before returning the response to you.
82
- :param allow_webgl: Enabled by default. Disabling WebGL is not recommended as many WAFs now check if WebGL is enabled.
83
  :param network_idle: Wait for the page until there are no network connections for at least 500 ms.
84
- :param load_dom: Enabled by default, wait for all JavaScript on page(s) to fully load and execute.
85
- :param disable_ads: Disabled by default, this installs the `uBlock Origin` addon on the browser if enabled.
86
- :param os_randomize: If enabled, Scrapling will randomize the OS fingerprints used. The default is Scrapling matching the fingerprints with the current OS.
87
- :param wait: The time (milliseconds) the fetcher will wait after everything finishes before closing the page and returning the ` Response ` object.
88
  :param timeout: The timeout in milliseconds that is used in all operations and waits through the page. The default is 30,000
 
89
  :param page_action: Added for automation. A function that takes the `page` object and does the automation you need.
90
  :param wait_selector: Wait for a specific CSS selector to be in a specific state.
91
  :param init_script: An absolute path to a JavaScript file to be executed on page creation for all pages in this session.
92
- :param geoip: Recommended to use with proxies; Automatically use IP's longitude, latitude, timezone, country, locale, and spoof the WebRTC IP address.
93
- It will also calculate and spoof the browser's language based on the distribution of language speakers in the target region.
 
94
  :param wait_selector_state: The state to wait for the selector given with `wait_selector`. The default state is `attached`.
 
 
 
 
 
 
 
95
  :param google_search: Enabled by default, Scrapling will set the referer header to be as if this request came from a Google search of this website's domain name.
96
  :param extra_headers: A dictionary of extra headers to add to the request. _The referer set by the `google_search` argument takes priority over the referer set here if used together._
97
  :param proxy: The proxy to be used with requests, it can be a string or a dictionary with the keys 'server', 'username', and 'password' only.
98
  :param user_data_dir: Path to a User Data Directory, which stores browser session data like cookies and local storage. The default is to create a temporary directory.
 
99
  :param selector_config: The arguments that will be passed in the end while creating the final Selector's class.
100
- :param additional_args: Additional arguments to be passed to Camoufox as additional settings, and it takes higher priority than Scrapling's settings.
101
  """
102
  self.__validate__(**kwargs)
103
- super().__init__(max_pages=self._max_pages)
104
 
105
  def start(self):
106
  """Create a browser for this instance and context."""
107
  if not self.playwright:
108
- self.playwright = sync_playwright().start()
109
- self.context = self.playwright.firefox.launch_persistent_context(**self.launch_options)
110
 
111
- if self._init_script: # pragma: no cover
112
- self.context.add_init_script(path=self._init_script)
 
 
 
 
 
 
 
 
 
113
 
114
- if self._cookies: # pragma: no cover
115
- self.context.add_cookies(self._cookies)
116
  else:
117
  raise RuntimeError("Session has been already started")
118
 
@@ -148,22 +133,27 @@ class StealthySession(StealthySessionMixin, SyncSession):
148
  outer_box = {}
149
  iframe = page.frame(url=__CF_PATTERN__)
150
  if iframe is not None:
151
- self._wait_for_page_stability(iframe, True, True)
152
 
153
  if challenge_type != "embedded":
154
  while not iframe.frame_element().is_visible():
155
  # Double-checking that the iframe is loaded
156
  page.wait_for_timeout(500)
 
157
  outer_box: Any = iframe.frame_element().bounding_box()
158
 
159
  if not iframe or not outer_box:
 
 
 
 
160
  outer_box: Any = page.locator(box_selector).last.bounding_box()
161
 
162
  # Calculate the Captcha coordinates for any viewport
163
  captcha_x, captcha_y = outer_box["x"] + randint(26, 28), outer_box["y"] + randint(25, 27)
164
 
165
  # Move the mouse to the center of the window, then press and hold the left mouse button
166
- page.mouse.click(captcha_x, captcha_y, delay=60, button="left")
167
  self._wait_for_networkidle(page)
168
  if iframe is not None:
169
  # Wait for the frame to be removed from the page (with 30s timeout = 300 iterations * 100 ms)
@@ -182,7 +172,7 @@ class StealthySession(StealthySessionMixin, SyncSession):
182
  log.info("Cloudflare captcha is solved")
183
  return
184
 
185
- def fetch(self, url: str, **kwargs: Unpack[CamoufoxFetchParams]) -> Response:
186
  """Opens up the browser and do your request based on your chosen options.
187
 
188
  :param url: The Target url.
@@ -203,8 +193,7 @@ class StealthySession(StealthySessionMixin, SyncSession):
203
  - selector_config: The arguments that will be passed in the end while creating the final Selector's class.
204
  :return: A `Response` object.
205
  """
206
- params = _validate(kwargs, self, CamoufoxConfig)
207
-
208
  if self._closed: # pragma: no cover
209
  raise RuntimeError("Context manager has been closed")
210
 
@@ -233,7 +222,7 @@ class StealthySession(StealthySessionMixin, SyncSession):
233
  if params.page_action:
234
  try:
235
  _ = params.page_action(page_info.page)
236
- except Exception as e:
237
  log.error(f"Error executing page_action: {e}")
238
 
239
  if params.wait_selector:
@@ -242,10 +231,12 @@ class StealthySession(StealthySessionMixin, SyncSession):
242
  waiter.first.wait_for(state=params.wait_selector_state)
243
  # Wait again after waiting for the selector, helpful with protections like Cloudflare
244
  self._wait_for_page_stability(page_info.page, params.load_dom, params.network_idle)
245
- except Exception as e:
246
  log.error(f"Error waiting for selector {params.wait_selector}: {e}")
247
 
248
  page_info.page.wait_for_timeout(params.wait)
 
 
249
  response = ResponseFactory.from_playwright_response(
250
  page_info.page, first_response, final_response[0], params.selector_config
251
  )
@@ -256,72 +247,79 @@ class StealthySession(StealthySessionMixin, SyncSession):
256
 
257
  return response
258
 
259
- except Exception as e: # pragma: no cover
260
  page_info.mark_error()
261
  raise e
262
 
263
 
264
- class AsyncStealthySession(StealthySessionMixin, AsyncSession):
265
- """A Stealthy session manager with page pooling."""
266
 
267
- def __init__(self, **kwargs: Unpack[CamoufoxSession]):
268
- """A Browser session manager with page pooling
269
 
270
  :param headless: Run the browser in headless/hidden (default), or headful/visible mode.
271
- :param block_images: Prevent the loading of images through Firefox preferences.
272
- This can help save your proxy usage but be careful with this option as it makes some websites never finish loading.
273
  :param disable_resources: Drop requests of unnecessary resources for a speed boost. It depends, but it made requests ~25% faster in my tests for some websites.
274
  Requests dropped are of type `font`, `image`, `media`, `beacon`, `object`, `imageset`, `texttrack`, `websocket`, `csp_report`, and `stylesheet`.
275
  This can help save your proxy usage but be careful with this option as it makes some websites never finish loading.
276
- :param block_webrtc: Blocks WebRTC entirely.
277
  :param cookies: Set cookies for the next request.
278
- :param addons: List of Firefox addons to use. Must be paths to extracted addons.
279
- :param humanize: Humanize the cursor movement. Takes either True or the MAX duration in seconds of the cursor movement. The cursor typically takes up to 1.5 seconds to move across the window.
280
- :param solve_cloudflare: Solves all types of the Cloudflare's Turnstile/Interstitial challenges before returning the response to you.
281
- :param allow_webgl: Enabled by default. Disabling WebGL is not recommended as many WAFs now check if WebGL is enabled.
282
  :param network_idle: Wait for the page until there are no network connections for at least 500 ms.
283
- :param load_dom: Enabled by default, wait for all JavaScript on page(s) to fully load and execute.
284
- :param disable_ads: Disabled by default, this installs the `uBlock Origin` addon on the browser if enabled.
285
- :param os_randomize: If enabled, Scrapling will randomize the OS fingerprints used. The default is Scrapling matching the fingerprints with the current OS.
286
- :param wait: The time (milliseconds) the fetcher will wait after everything finishes before closing the page and returning the ` Response ` object.
287
  :param timeout: The timeout in milliseconds that is used in all operations and waits through the page. The default is 30,000
 
288
  :param page_action: Added for automation. A function that takes the `page` object and does the automation you need.
289
  :param wait_selector: Wait for a specific CSS selector to be in a specific state.
290
  :param init_script: An absolute path to a JavaScript file to be executed on page creation for all pages in this session.
291
- :param geoip: Recommended to use with proxies; Automatically use IP's longitude, latitude, timezone, country, locale, and spoof the WebRTC IP address.
292
- It will also calculate and spoof the browser's language based on the distribution of language speakers in the target region.
 
293
  :param wait_selector_state: The state to wait for the selector given with `wait_selector`. The default state is `attached`.
 
 
 
 
 
 
 
294
  :param google_search: Enabled by default, Scrapling will set the referer header to be as if this request came from a Google search of this website's domain name.
295
  :param extra_headers: A dictionary of extra headers to add to the request. _The referer set by the `google_search` argument takes priority over the referer set here if used together._
296
  :param proxy: The proxy to be used with requests, it can be a string or a dictionary with the keys 'server', 'username', and 'password' only.
297
- :param max_pages: The maximum number of tabs to be opened at the same time. It will be used in rotation through a PagePool.
298
  :param user_data_dir: Path to a User Data Directory, which stores browser session data like cookies and local storage. The default is to create a temporary directory.
 
299
  :param selector_config: The arguments that will be passed in the end while creating the final Selector's class.
300
- :param additional_args: Additional arguments to be passed to Camoufox as additional settings, and it takes higher priority than Scrapling's settings.
301
  """
302
  self.__validate__(**kwargs)
303
- super().__init__(max_pages=self._max_pages)
304
 
305
  async def start(self):
306
  """Create a browser for this instance and context."""
307
  if not self.playwright:
308
- self.playwright: AsyncPlaywright = await async_playwright().start()
309
- self.context: AsyncBrowserContext = await self.playwright.firefox.launch_persistent_context(
310
- **self.launch_options
311
- )
 
 
 
 
 
312
 
313
- if self._init_script: # pragma: no cover
314
- await self.context.add_init_script(path=self._init_script)
315
 
316
- if self._cookies:
317
- await self.context.add_cookies(self._cookies) # pyright: ignore [reportArgumentType]
 
 
 
318
  else:
319
  raise RuntimeError("Session has been already started")
320
 
321
- async def _cloudflare_solver(self, page: async_Page): # pragma: no cover
322
- """Solve the cloudflare challenge displayed on the playwright page passed. The async version
323
 
324
- :param page: The async targeted page
325
  :return:
326
  """
327
  await self._wait_for_networkidle(page, timeout=5000)
@@ -331,7 +329,7 @@ class AsyncStealthySession(StealthySessionMixin, AsyncSession):
331
  return
332
  else:
333
  log.info(f'The turnstile version discovered is "{challenge_type}"')
334
- if challenge_type == "non-interactive": # pragma: no cover
335
  while "<title>Just a moment...</title>" in (await ResponseFactory._get_async_page_content(page)):
336
  log.info("Waiting for Cloudflare wait page to disappear.")
337
  await page.wait_for_timeout(1000)
@@ -350,22 +348,27 @@ class AsyncStealthySession(StealthySessionMixin, AsyncSession):
350
  outer_box = {}
351
  iframe = page.frame(url=__CF_PATTERN__)
352
  if iframe is not None:
353
- await self._wait_for_page_stability(iframe, True, True)
354
 
355
  if challenge_type != "embedded":
356
  while not await (await iframe.frame_element()).is_visible():
357
  # Double-checking that the iframe is loaded
358
  await page.wait_for_timeout(500)
359
- outer_box: Any = await (await iframe.frame_element()).bounding_box()
 
360
 
361
  if not iframe or not outer_box:
 
 
 
 
362
  outer_box: Any = await page.locator(box_selector).last.bounding_box()
363
 
364
  # Calculate the Captcha coordinates for any viewport
365
  captcha_x, captcha_y = outer_box["x"] + randint(26, 28), outer_box["y"] + randint(25, 27)
366
 
367
  # Move the mouse to the center of the window, then press and hold the left mouse button
368
- await page.mouse.click(captcha_x, captcha_y, delay=60, button="left")
369
  await self._wait_for_networkidle(page)
370
  if iframe is not None:
371
  # Wait for the frame to be removed from the page (with 30s timeout = 300 iterations * 100 ms)
@@ -377,14 +380,14 @@ class AsyncStealthySession(StealthySessionMixin, AsyncSession):
377
  await page.wait_for_timeout(100)
378
  attempts += 1
379
  if challenge_type != "embedded":
380
- await page.locator(box_selector).wait_for(state="detached")
381
  await page.locator(".zone-name-title").wait_for(state="hidden")
382
  await self._wait_for_page_stability(page, True, False)
383
 
384
  log.info("Cloudflare captcha is solved")
385
  return
386
 
387
- async def fetch(self, url: str, **kwargs: Unpack[CamoufoxFetchParams]) -> Response:
388
  """Opens up the browser and do your request based on your chosen options.
389
 
390
  :param url: The Target url.
@@ -405,7 +408,7 @@ class AsyncStealthySession(StealthySessionMixin, AsyncSession):
405
  - selector_config: The arguments that will be passed in the end while creating the final Selector's class.
406
  :return: A `Response` object.
407
  """
408
- params = _validate(kwargs, self, CamoufoxConfig)
409
 
410
  if self._closed: # pragma: no cover
411
  raise RuntimeError("Context manager has been closed")
@@ -418,10 +421,6 @@ class AsyncStealthySession(StealthySessionMixin, AsyncSession):
418
  final_response = [None]
419
  handle_response = self._create_response_handler(page_info, final_response)
420
 
421
- if TYPE_CHECKING:
422
- if not isinstance(page_info.page, async_Page):
423
- raise TypeError
424
-
425
  try:
426
  # Navigate to URL and wait for a specified state
427
  page_info.page.on("response", handle_response)
@@ -461,9 +460,8 @@ class AsyncStealthySession(StealthySessionMixin, AsyncSession):
461
  # Close the page to free up resources
462
  await page_info.page.close()
463
  self.page_pool.pages.remove(page_info)
464
-
465
  return response
466
 
467
- except Exception as e:
468
  page_info.mark_error()
469
  raise e
 
2
  from re import compile as re_compile
3
 
4
  from playwright.sync_api import (
 
5
  Locator,
6
+ Page,
7
+ Playwright,
8
  )
9
  from playwright.async_api import (
 
10
  Page as async_Page,
11
  Locator as AsyncLocator,
12
  Playwright as AsyncPlaywright,
13
  BrowserContext as AsyncBrowserContext,
14
  )
15
+ from patchright.sync_api import sync_playwright
16
+ from patchright.async_api import async_playwright
17
 
18
  from scrapling.core.utils import log
19
+ from scrapling.core._types import Any, Unpack
20
+ from ._config_tools import _compiled_stealth_scripts
21
+ from ._types import StealthSession, StealthFetchParams
22
  from ._base import SyncSession, AsyncSession, StealthySessionMixin
23
+ from ._validators import validate_fetch as _validate, StealthConfig
24
  from scrapling.engines.toolbelt.convertor import Response, ResponseFactory
25
  from scrapling.engines.toolbelt.fingerprints import generate_convincing_referer
26
 
27
  __CF_PATTERN__ = re_compile("challenges.cloudflare.com/cdn-cgi/challenge-platform/.*")
28
 
29
 
30
+ class StealthySession(SyncSession, StealthySessionMixin):
31
+ """A Stealthy Browser session manager with page pooling."""
32
 
33
  __slots__ = (
34
+ "_config",
35
+ "_context_options",
36
+ "_launch_options",
37
+ "max_pages",
38
+ "page_pool",
39
+ "_max_wait_for_page",
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
40
  "playwright",
 
41
  "context",
 
42
  "_closed",
 
 
 
43
  )
44
 
45
+ def __init__(self, **kwargs: Unpack[StealthSession]):
46
+ """A Browser session manager with page pooling that uses a persistent browser context by default with a temporary user profile directory.
47
 
48
  :param headless: Run the browser in headless/hidden (default), or headful/visible mode.
 
 
49
  :param disable_resources: Drop requests of unnecessary resources for a speed boost. It depends, but it made requests ~25% faster in my tests for some websites.
50
  Requests dropped are of type `font`, `image`, `media`, `beacon`, `object`, `imageset`, `texttrack`, `websocket`, `csp_report`, and `stylesheet`.
51
  This can help save your proxy usage but be careful with this option as it makes some websites never finish loading.
52
+ :param useragent: Pass a useragent string to be used. Otherwise, the fetcher will generate and use a real user agent string matching the same browser.
53
  :param cookies: Set cookies for the next request.
 
 
 
 
54
  :param network_idle: Wait for the page until there are no network connections for at least 500 ms.
 
 
 
 
55
  :param timeout: The timeout in milliseconds that is used in all operations and waits through the page. The default is 30,000
56
+ :param wait: The time (milliseconds) the fetcher will wait after everything finishes before closing the page and returning the `Response` object.
57
  :param page_action: Added for automation. A function that takes the `page` object and does the automation you need.
58
  :param wait_selector: Wait for a specific CSS selector to be in a specific state.
59
  :param init_script: An absolute path to a JavaScript file to be executed on page creation for all pages in this session.
60
+ :param locale: Specify user locale, for example, `en-GB`, `de-DE`, etc. Locale will affect navigator.language value, Accept-Language request header value as well as number and date formatting
61
+ rules. Defaults to the system default locale.
62
+ :param timezone_id: Changes the timezone of the browser. Defaults to the system timezone.
63
  :param wait_selector_state: The state to wait for the selector given with `wait_selector`. The default state is `attached`.
64
+ :param solve_cloudflare: Solves all types of Cloudflare's Turnstile/Interstitial challenges before returning the response to you.
65
+ :param real_chrome: If you have a Chrome browser installed on your device, enable this, and the Fetcher will launch an instance of your browser and use it.
66
+ :param hide_canvas: Add random noise to canvas operations to prevent fingerprinting.
67
+ :param block_webrtc: Forces WebRTC to respect proxy settings to prevent local IP address leak.
68
+ :param allow_webgl: Enabled by default. Disabling it disables WebGL and WebGL 2.0 support entirely. Disabling WebGL is not recommended as many WAFs now check if WebGL is enabled.
69
+ :param load_dom: Enabled by default, wait for all JavaScript on page(s) to fully load and execute.
70
+ :param cdp_url: Instead of launching a new browser instance, connect to this CDP URL to control real browsers through CDP.
71
  :param google_search: Enabled by default, Scrapling will set the referer header to be as if this request came from a Google search of this website's domain name.
72
  :param extra_headers: A dictionary of extra headers to add to the request. _The referer set by the `google_search` argument takes priority over the referer set here if used together._
73
  :param proxy: The proxy to be used with requests, it can be a string or a dictionary with the keys 'server', 'username', and 'password' only.
74
  :param user_data_dir: Path to a User Data Directory, which stores browser session data like cookies and local storage. The default is to create a temporary directory.
75
+ :param extra_flags: A list of additional browser flags to pass to the browser on launch.
76
  :param selector_config: The arguments that will be passed in the end while creating the final Selector's class.
77
+ :param additional_args: Additional arguments to be passed to Playwright's context as additional settings, and it takes higher priority than Scrapling's settings.
78
  """
79
  self.__validate__(**kwargs)
80
+ super().__init__()
81
 
82
  def start(self):
83
  """Create a browser for this instance and context."""
84
  if not self.playwright:
85
+ self.playwright: Playwright = sync_playwright().start() # pyright: ignore [reportAttributeAccessIssue]
 
86
 
87
+ if self._config.cdp_url: # pragma: no cover
88
+ browser = self.playwright.chromium.connect_over_cdp(endpoint_url=self._config.cdp_url)
89
+ self.context = browser.new_context(**self._context_options)
90
+ else:
91
+ self.context = self.playwright.chromium.launch_persistent_context(**self._launch_options)
92
+
93
+ for script in _compiled_stealth_scripts():
94
+ self.context.add_init_script(script=script)
95
+
96
+ if self._config.init_script: # pragma: no cover
97
+ self.context.add_init_script(path=self._config.init_script)
98
 
99
+ if self._config.cookies: # pragma: no cover
100
+ self.context.add_cookies(self._config.cookies)
101
  else:
102
  raise RuntimeError("Session has been already started")
103
 
 
133
  outer_box = {}
134
  iframe = page.frame(url=__CF_PATTERN__)
135
  if iframe is not None:
136
+ self._wait_for_page_stability(iframe, True, False)
137
 
138
  if challenge_type != "embedded":
139
  while not iframe.frame_element().is_visible():
140
  # Double-checking that the iframe is loaded
141
  page.wait_for_timeout(500)
142
+
143
  outer_box: Any = iframe.frame_element().bounding_box()
144
 
145
  if not iframe or not outer_box:
146
+ if "<title>Just a moment...</title>" not in (ResponseFactory._get_page_content(page)):
147
+ log.info("Cloudflare captcha is solved")
148
+ return
149
+
150
  outer_box: Any = page.locator(box_selector).last.bounding_box()
151
 
152
  # Calculate the Captcha coordinates for any viewport
153
  captcha_x, captcha_y = outer_box["x"] + randint(26, 28), outer_box["y"] + randint(25, 27)
154
 
155
  # Move the mouse to the center of the window, then press and hold the left mouse button
156
+ page.mouse.click(captcha_x, captcha_y, delay=randint(100, 200), button="left")
157
  self._wait_for_networkidle(page)
158
  if iframe is not None:
159
  # Wait for the frame to be removed from the page (with 30s timeout = 300 iterations * 100 ms)
 
172
  log.info("Cloudflare captcha is solved")
173
  return
174
 
175
+ def fetch(self, url: str, **kwargs: Unpack[StealthFetchParams]) -> Response:
176
  """Opens up the browser and do your request based on your chosen options.
177
 
178
  :param url: The Target url.
 
193
  - selector_config: The arguments that will be passed in the end while creating the final Selector's class.
194
  :return: A `Response` object.
195
  """
196
+ params = _validate(kwargs, self, StealthConfig)
 
197
  if self._closed: # pragma: no cover
198
  raise RuntimeError("Context manager has been closed")
199
 
 
222
  if params.page_action:
223
  try:
224
  _ = params.page_action(page_info.page)
225
+ except Exception as e: # pragma: no cover
226
  log.error(f"Error executing page_action: {e}")
227
 
228
  if params.wait_selector:
 
231
  waiter.first.wait_for(state=params.wait_selector_state)
232
  # Wait again after waiting for the selector, helpful with protections like Cloudflare
233
  self._wait_for_page_stability(page_info.page, params.load_dom, params.network_idle)
234
+ except Exception as e: # pragma: no cover
235
  log.error(f"Error waiting for selector {params.wait_selector}: {e}")
236
 
237
  page_info.page.wait_for_timeout(params.wait)
238
+
239
+ # Create response object
240
  response = ResponseFactory.from_playwright_response(
241
  page_info.page, first_response, final_response[0], params.selector_config
242
  )
 
247
 
248
  return response
249
 
250
+ except Exception as e:
251
  page_info.mark_error()
252
  raise e
253
 
254
 
255
+ class AsyncStealthySession(AsyncSession, StealthySessionMixin):
256
+ """An async Stealthy Browser session manager with page pooling."""
257
 
258
+ def __init__(self, **kwargs: Unpack[StealthSession]):
259
+ """A Browser session manager with page pooling that uses a persistent browser context by default with a temporary user profile directory.
260
 
261
  :param headless: Run the browser in headless/hidden (default), or headful/visible mode.
 
 
262
  :param disable_resources: Drop requests of unnecessary resources for a speed boost. It depends, but it made requests ~25% faster in my tests for some websites.
263
  Requests dropped are of type `font`, `image`, `media`, `beacon`, `object`, `imageset`, `texttrack`, `websocket`, `csp_report`, and `stylesheet`.
264
  This can help save your proxy usage but be careful with this option as it makes some websites never finish loading.
265
+ :param useragent: Pass a useragent string to be used. Otherwise, the fetcher will generate and use a real user agent string matching the same browser.
266
  :param cookies: Set cookies for the next request.
 
 
 
 
267
  :param network_idle: Wait for the page until there are no network connections for at least 500 ms.
 
 
 
 
268
  :param timeout: The timeout in milliseconds that is used in all operations and waits through the page. The default is 30,000
269
+ :param wait: The time (milliseconds) the fetcher will wait after everything finishes before closing the page and returning the `Response` object.
270
  :param page_action: Added for automation. A function that takes the `page` object and does the automation you need.
271
  :param wait_selector: Wait for a specific CSS selector to be in a specific state.
272
  :param init_script: An absolute path to a JavaScript file to be executed on page creation for all pages in this session.
273
+ :param locale: Specify user locale, for example, `en-GB`, `de-DE`, etc. Locale will affect navigator.language value, Accept-Language request header value as well as number and date formatting
274
+ rules. Defaults to the system default locale.
275
+ :param timezone_id: Changes the timezone of the browser. Defaults to the system timezone.
276
  :param wait_selector_state: The state to wait for the selector given with `wait_selector`. The default state is `attached`.
277
+ :param solve_cloudflare: Solves all types of Cloudflare's Turnstile/Interstitial challenges before returning the response to you.
278
+ :param real_chrome: If you have a Chrome browser installed on your device, enable this, and the Fetcher will launch an instance of your browser and use it.
279
+ :param hide_canvas: Add random noise to canvas operations to prevent fingerprinting.
280
+ :param block_webrtc: Forces WebRTC to respect proxy settings to prevent local IP address leak.
281
+ :param allow_webgl: Enabled by default. Disabling it disables WebGL and WebGL 2.0 support entirely. Disabling WebGL is not recommended as many WAFs now check if WebGL is enabled.
282
+ :param load_dom: Enabled by default, wait for all JavaScript on page(s) to fully load and execute.
283
+ :param cdp_url: Instead of launching a new browser instance, connect to this CDP URL to control real browsers through CDP.
284
  :param google_search: Enabled by default, Scrapling will set the referer header to be as if this request came from a Google search of this website's domain name.
285
  :param extra_headers: A dictionary of extra headers to add to the request. _The referer set by the `google_search` argument takes priority over the referer set here if used together._
286
  :param proxy: The proxy to be used with requests, it can be a string or a dictionary with the keys 'server', 'username', and 'password' only.
 
287
  :param user_data_dir: Path to a User Data Directory, which stores browser session data like cookies and local storage. The default is to create a temporary directory.
288
+ :param extra_flags: A list of additional browser flags to pass to the browser on launch.
289
  :param selector_config: The arguments that will be passed in the end while creating the final Selector's class.
290
+ :param additional_args: Additional arguments to be passed to Playwright's context as additional settings, and it takes higher priority than Scrapling's settings.
291
  """
292
  self.__validate__(**kwargs)
293
+ super().__init__(max_pages=self._config.max_pages)
294
 
295
  async def start(self):
296
  """Create a browser for this instance and context."""
297
  if not self.playwright:
298
+ self.playwright: AsyncPlaywright = await async_playwright().start() # pyright: ignore [reportAttributeAccessIssue]
299
+
300
+ if self._config.cdp_url:
301
+ browser = await self.playwright.chromium.connect_over_cdp(endpoint_url=self._config.cdp_url)
302
+ self.context: AsyncBrowserContext = await browser.new_context(**self._context_options)
303
+ else:
304
+ self.context: AsyncBrowserContext = await self.playwright.chromium.launch_persistent_context(
305
+ **self._launch_options
306
+ )
307
 
308
+ for script in _compiled_stealth_scripts():
309
+ await self.context.add_init_script(script=script)
310
 
311
+ if self._config.init_script: # pragma: no cover
312
+ await self.context.add_init_script(path=self._config.init_script)
313
+
314
+ if self._config.cookies:
315
+ await self.context.add_cookies(self._config.cookies) # pyright: ignore
316
  else:
317
  raise RuntimeError("Session has been already started")
318
 
319
+ async def _cloudflare_solver(self, page: async_Page) -> None: # pragma: no cover
320
+ """Solve the Cloudflare challenge displayed on the passed Playwright page
321
 
322
+ :param page: The targeted page
323
  :return:
324
  """
325
  await self._wait_for_networkidle(page, timeout=5000)
 
329
  return
330
  else:
331
  log.info(f'The turnstile version discovered is "{challenge_type}"')
332
+ if challenge_type == "non-interactive":
333
  while "<title>Just a moment...</title>" in (await ResponseFactory._get_async_page_content(page)):
334
  log.info("Waiting for Cloudflare wait page to disappear.")
335
  await page.wait_for_timeout(1000)
 
348
  outer_box = {}
349
  iframe = page.frame(url=__CF_PATTERN__)
350
  if iframe is not None:
351
+ await self._wait_for_page_stability(iframe, True, False)
352
 
353
  if challenge_type != "embedded":
354
  while not await (await iframe.frame_element()).is_visible():
355
  # Double-checking that the iframe is loaded
356
  await page.wait_for_timeout(500)
357
+
358
+ outer_box: Any = (await iframe.frame_element()).bounding_box()
359
 
360
  if not iframe or not outer_box:
361
+ if "<title>Just a moment...</title>" not in (await ResponseFactory._get_async_page_content(page)):
362
+ log.info("Cloudflare captcha is solved")
363
+ return
364
+
365
  outer_box: Any = await page.locator(box_selector).last.bounding_box()
366
 
367
  # Calculate the Captcha coordinates for any viewport
368
  captcha_x, captcha_y = outer_box["x"] + randint(26, 28), outer_box["y"] + randint(25, 27)
369
 
370
  # Move the mouse to the center of the window, then press and hold the left mouse button
371
+ await page.mouse.click(captcha_x, captcha_y, delay=randint(100, 200), button="left")
372
  await self._wait_for_networkidle(page)
373
  if iframe is not None:
374
  # Wait for the frame to be removed from the page (with 30s timeout = 300 iterations * 100 ms)
 
380
  await page.wait_for_timeout(100)
381
  attempts += 1
382
  if challenge_type != "embedded":
383
+ await page.locator(box_selector).last.wait_for(state="detached")
384
  await page.locator(".zone-name-title").wait_for(state="hidden")
385
  await self._wait_for_page_stability(page, True, False)
386
 
387
  log.info("Cloudflare captcha is solved")
388
  return
389
 
390
+ async def fetch(self, url: str, **kwargs: Unpack[StealthFetchParams]) -> Response:
391
  """Opens up the browser and do your request based on your chosen options.
392
 
393
  :param url: The Target url.
 
408
  - selector_config: The arguments that will be passed in the end while creating the final Selector's class.
409
  :return: A `Response` object.
410
  """
411
+ params = _validate(kwargs, self, StealthConfig)
412
 
413
  if self._closed: # pragma: no cover
414
  raise RuntimeError("Context manager has been closed")
 
421
  final_response = [None]
422
  handle_response = self._create_response_handler(page_info, final_response)
423
 
 
 
 
 
424
  try:
425
  # Navigate to URL and wait for a specified state
426
  page_info.page.on("response", handle_response)
 
460
  # Close the page to free up resources
461
  await page_info.page.close()
462
  self.page_pool.pages.remove(page_info)
 
463
  return response
464
 
465
+ except Exception as e: # pragma: no cover
466
  page_info.mark_error()
467
  raise e
scrapling/engines/_browsers/_types.py CHANGED
@@ -53,7 +53,7 @@ if TYPE_CHECKING: # pragma: no cover
53
  json: Optional[Dict | List]
54
 
55
  # Types for browser session
56
- class BrowserSession(TypedDict, total=False):
57
  max_pages: int
58
  headless: bool
59
  disable_resources: bool
@@ -64,6 +64,7 @@ if TYPE_CHECKING: # pragma: no cover
64
  cookies: Optional[Iterable[Dict]]
65
  google_search: bool
66
  wait: int | float
 
67
  page_action: Optional[Callable]
68
  proxy: Optional[str | Dict[str, str] | Tuple]
69
  extra_headers: Optional[Dict[str, str]]
@@ -72,42 +73,32 @@ if TYPE_CHECKING: # pragma: no cover
72
  user_data_dir: str
73
  selector_config: Optional[Dict]
74
  additional_args: Optional[Dict]
75
-
76
- class PlaywrightSession(BrowserSession, total=False):
77
- cdp_url: Optional[str]
78
- hide_canvas: bool
79
- disable_webgl: bool
80
  real_chrome: bool
81
- stealth: bool
82
- locale: str
83
  useragent: Optional[str]
84
  extra_flags: Optional[List[str]]
85
 
86
  class PlaywrightFetchParams(TypedDict, total=False):
 
 
 
87
  google_search: bool
88
  timeout: int | float
89
- wait: int | float
90
- page_action: Optional[Callable]
91
- extra_headers: Optional[Dict[str, str]]
92
  disable_resources: bool
93
  wait_selector: Optional[str]
94
- wait_selector_state: SelectorWaitStates
95
- network_idle: bool
96
- load_dom: bool
97
  selector_config: Optional[Dict]
 
 
98
 
99
- class CamoufoxSession(BrowserSession, total=False):
100
- block_images: bool
101
- block_webrtc: bool
102
  allow_webgl: bool
103
- humanize: bool | float
 
104
  solve_cloudflare: bool
105
- addons: Optional[List[str]]
106
- os_randomize: bool
107
- disable_ads: bool
108
- geoip: bool
109
 
110
- class CamoufoxFetchParams(PlaywrightFetchParams, total=False):
111
  solve_cloudflare: bool
112
 
113
  else: # pragma: no cover
@@ -116,5 +107,5 @@ else: # pragma: no cover
116
  DataRequestParams = TypedDict
117
  PlaywrightSession = TypedDict
118
  PlaywrightFetchParams = TypedDict
119
- CamoufoxSession = TypedDict
120
- CamoufoxFetchParams = TypedDict
 
53
  json: Optional[Dict | List]
54
 
55
  # Types for browser session
56
+ class PlaywrightSession(TypedDict, total=False):
57
  max_pages: int
58
  headless: bool
59
  disable_resources: bool
 
64
  cookies: Optional[Iterable[Dict]]
65
  google_search: bool
66
  wait: int | float
67
+ timezone_id: str | None
68
  page_action: Optional[Callable]
69
  proxy: Optional[str | Dict[str, str] | Tuple]
70
  extra_headers: Optional[Dict[str, str]]
 
73
  user_data_dir: str
74
  selector_config: Optional[Dict]
75
  additional_args: Optional[Dict]
76
+ locale: Optional[str]
 
 
 
 
77
  real_chrome: bool
78
+ cdp_url: Optional[str]
 
79
  useragent: Optional[str]
80
  extra_flags: Optional[List[str]]
81
 
82
  class PlaywrightFetchParams(TypedDict, total=False):
83
+ load_dom: bool
84
+ wait: int | float
85
+ network_idle: bool
86
  google_search: bool
87
  timeout: int | float
 
 
 
88
  disable_resources: bool
89
  wait_selector: Optional[str]
90
+ page_action: Optional[Callable]
 
 
91
  selector_config: Optional[Dict]
92
+ extra_headers: Optional[Dict[str, str]]
93
+ wait_selector_state: SelectorWaitStates
94
 
95
+ class StealthSession(PlaywrightSession, total=False):
 
 
96
  allow_webgl: bool
97
+ hide_canvas: bool
98
+ block_webrtc: bool
99
  solve_cloudflare: bool
 
 
 
 
100
 
101
+ class StealthFetchParams(PlaywrightFetchParams, total=False):
102
  solve_cloudflare: bool
103
 
104
  else: # pragma: no cover
 
107
  DataRequestParams = TypedDict
108
  PlaywrightSession = TypedDict
109
  PlaywrightFetchParams = TypedDict
110
+ StealthSession = TypedDict
111
+ StealthFetchParams = TypedDict
scrapling/engines/_browsers/_validators.py CHANGED
@@ -14,11 +14,13 @@ from scrapling.core._types import (
14
  Optional,
15
  Callable,
16
  Iterable,
17
- SelectorWaitStates,
18
  overload,
 
 
19
  )
20
  from scrapling.engines.toolbelt.navigation import construct_proxy_dict
21
- from scrapling.engines._browsers._types import PlaywrightFetchParams, CamoufoxFetchParams
22
 
23
 
24
  # Custom validators for msgspec
@@ -68,26 +70,26 @@ class PlaywrightConfig(Struct, kw_only=True, frozen=False, weakref=True):
68
  cdp_url: Optional[str] = None
69
  headless: bool = True
70
  google_search: bool = True
71
- hide_canvas: bool = False
72
- disable_webgl: bool = False
73
  real_chrome: bool = False
74
- stealth: bool = False
75
  wait: Seconds = 0
76
  page_action: Optional[Callable] = None
77
  proxy: Optional[str | Dict[str, str] | Tuple] = None # The default value for proxy in Playwright's source is `None`
78
- locale: str = "en-US"
79
  extra_headers: Optional[Dict[str, str]] = None
80
  useragent: Optional[str] = None
81
  timeout: Seconds = 30000
82
  init_script: Optional[str] = None
83
  disable_resources: bool = False
84
  wait_selector: Optional[str] = None
85
- cookies: Optional[Iterable[Dict]] = None
86
  network_idle: bool = False
87
  load_dom: bool = True
88
  wait_selector_state: SelectorWaitStates = "attached"
89
  user_data_dir: str = ""
90
- timezone_id: str = ""
91
  extra_flags: Optional[List[str]] = None
92
  selector_config: Optional[Dict] = {}
93
  additional_args: Optional[Dict] = {}
@@ -118,64 +120,18 @@ class PlaywrightConfig(Struct, kw_only=True, frozen=False, weakref=True):
118
  raise ValueError(validation_msg)
119
 
120
 
121
- class CamoufoxConfig(Struct, kw_only=True, frozen=False, weakref=True):
122
- """Configuration struct for validation"""
123
-
124
- max_pages: PagesCount = 1
125
- headless: bool = True # noqa: F821
126
- block_images: bool = False
127
- disable_resources: bool = False
128
- block_webrtc: bool = False
129
  allow_webgl: bool = True
130
- network_idle: bool = False
131
- load_dom: bool = True
132
- humanize: bool | float = True
133
  solve_cloudflare: bool = False
134
- wait: Seconds = 0
135
- timeout: Seconds = 30000
136
- init_script: Optional[str] = None
137
- page_action: Optional[Callable] = None
138
- wait_selector: Optional[str] = None
139
- addons: Optional[List[str]] = None
140
- wait_selector_state: SelectorWaitStates = "attached"
141
- cookies: Optional[Iterable[Dict]] = None
142
- google_search: bool = True
143
- extra_headers: Optional[Dict[str, str]] = None
144
- proxy: Optional[str | Dict[str, str] | Tuple] = None # The default value for proxy in Playwright's source is `None`
145
- os_randomize: bool = False
146
- disable_ads: bool = False
147
- geoip: bool = False
148
- user_data_dir: str = ""
149
- selector_config: Optional[Dict] = {}
150
- additional_args: Optional[Dict] = {}
151
 
152
  def __post_init__(self):
153
  """Custom validation after msgspec validation"""
154
- if self.page_action and not callable(self.page_action):
155
- raise TypeError(f"page_action must be callable, got {type(self.page_action).__name__}")
156
- if self.proxy:
157
- self.proxy = construct_proxy_dict(self.proxy, as_tuple=True)
158
-
159
- if self.addons:
160
- for addon in self.addons:
161
- _validate_addon_path(addon)
162
- else:
163
- self.addons = []
164
-
165
- if self.init_script is not None:
166
- validation_msg = _is_invalid_file_path(self.init_script)
167
- if validation_msg:
168
- raise ValueError(validation_msg)
169
-
170
- if not self.cookies:
171
- self.cookies = []
172
  # Cloudflare timeout adjustment
173
  if self.solve_cloudflare and self.timeout < 60_000:
174
  self.timeout = 60_000
175
- if not self.selector_config:
176
- self.selector_config = {}
177
- if not self.additional_args:
178
- self.additional_args = {}
179
 
180
 
181
  @dataclass
@@ -197,9 +153,9 @@ class _fetch_params:
197
 
198
 
199
  def validate_fetch(
200
- method_kwargs: Dict | PlaywrightFetchParams | CamoufoxFetchParams,
201
  session: Any,
202
- model: type[PlaywrightConfig] | type[CamoufoxConfig],
203
  ) -> _fetch_params: # pragma: no cover
204
  result = {}
205
  overrides = {}
@@ -210,21 +166,20 @@ def validate_fetch(
210
  for key in fetch_param_fields:
211
  if key in method_kwargs:
212
  overrides[key] = method_kwargs[key]
213
- else:
214
- # Check for underscore-prefixed attribute (private)
215
- attr_name = f"_{key}"
216
- if hasattr(session, attr_name):
217
- result[key] = getattr(session, attr_name)
218
 
219
  if overrides:
220
  validated_config = validate(overrides, model)
221
- # Extract only the fields that _fetch_params needs from validated_config
 
222
  validated_dict = {
223
- f.name: getattr(validated_config, f.name)
224
- for f in fields(_fetch_params)
225
- if hasattr(validated_config, f.name)
226
  }
227
- validated_dict.setdefault("solve_cloudflare", False)
 
 
 
228
 
229
  # Start with session defaults, then overwrite with validated overrides
230
  result.update(validated_dict)
@@ -238,7 +193,7 @@ def validate_fetch(
238
  # Cache default values for each model to reduce validation overhead
239
  models_default_values = {}
240
 
241
- for _model in (CamoufoxConfig, PlaywrightConfig):
242
  _defaults = {}
243
  if hasattr(_model, "__struct_defaults__") and hasattr(_model, "__struct_fields__"):
244
  for field_name, default_value in zip(_model.__struct_fields__, _model.__struct_defaults__): # type: ignore
@@ -256,14 +211,14 @@ def _filter_defaults(params: Dict, model: str) -> Dict:
256
 
257
 
258
  @overload
259
- def validate(params: Dict, model: type[PlaywrightConfig]) -> PlaywrightConfig: ...
260
 
261
 
262
  @overload
263
- def validate(params: Dict, model: type[CamoufoxConfig]) -> CamoufoxConfig: ...
264
 
265
 
266
- def validate(params: Dict, model: type[PlaywrightConfig] | type[CamoufoxConfig]) -> PlaywrightConfig | CamoufoxConfig:
267
  try:
268
  # Filter out params with the default values (no need to validate them) to speed up validation
269
  filtered = _filter_defaults(params, model.__name__)
 
14
  Optional,
15
  Callable,
16
  Iterable,
17
+ Sequence,
18
  overload,
19
+ SetCookieParam,
20
+ SelectorWaitStates,
21
  )
22
  from scrapling.engines.toolbelt.navigation import construct_proxy_dict
23
+ from scrapling.engines._browsers._types import PlaywrightFetchParams, StealthFetchParams
24
 
25
 
26
  # Custom validators for msgspec
 
70
  cdp_url: Optional[str] = None
71
  headless: bool = True
72
  google_search: bool = True
73
+ # hide_canvas: bool = False
74
+ # disable_webgl: bool = False
75
  real_chrome: bool = False
76
+ # stealth: bool = False
77
  wait: Seconds = 0
78
  page_action: Optional[Callable] = None
79
  proxy: Optional[str | Dict[str, str] | Tuple] = None # The default value for proxy in Playwright's source is `None`
80
+ locale: str | None = None
81
  extra_headers: Optional[Dict[str, str]] = None
82
  useragent: Optional[str] = None
83
  timeout: Seconds = 30000
84
  init_script: Optional[str] = None
85
  disable_resources: bool = False
86
  wait_selector: Optional[str] = None
87
+ cookies: Sequence[SetCookieParam] | None = []
88
  network_idle: bool = False
89
  load_dom: bool = True
90
  wait_selector_state: SelectorWaitStates = "attached"
91
  user_data_dir: str = ""
92
+ timezone_id: str | None = ""
93
  extra_flags: Optional[List[str]] = None
94
  selector_config: Optional[Dict] = {}
95
  additional_args: Optional[Dict] = {}
 
120
  raise ValueError(validation_msg)
121
 
122
 
123
+ class StealthConfig(PlaywrightConfig, kw_only=True, frozen=False, weakref=True):
 
 
 
 
 
 
 
124
  allow_webgl: bool = True
125
+ hide_canvas: bool = False
126
+ block_webrtc: bool = False
 
127
  solve_cloudflare: bool = False
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
128
 
129
  def __post_init__(self):
130
  """Custom validation after msgspec validation"""
131
+ super(StealthConfig, self).__post_init__()
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
132
  # Cloudflare timeout adjustment
133
  if self.solve_cloudflare and self.timeout < 60_000:
134
  self.timeout = 60_000
 
 
 
 
135
 
136
 
137
  @dataclass
 
153
 
154
 
155
  def validate_fetch(
156
+ method_kwargs: Dict | PlaywrightFetchParams | StealthFetchParams,
157
  session: Any,
158
+ model: type[PlaywrightConfig] | type[StealthConfig],
159
  ) -> _fetch_params: # pragma: no cover
160
  result = {}
161
  overrides = {}
 
166
  for key in fetch_param_fields:
167
  if key in method_kwargs:
168
  overrides[key] = method_kwargs[key]
169
+ elif hasattr(session, "_config") and hasattr(session._config, key):
170
+ result[key] = getattr(session._config, key)
 
 
 
171
 
172
  if overrides:
173
  validated_config = validate(overrides, model)
174
+ # Extract ONLY the fields that were actually overridden (not all fields)
175
+ # This prevents validated defaults from overwriting session config values
176
  validated_dict = {
177
+ field: getattr(validated_config, field) for field in overrides.keys() if hasattr(validated_config, field)
 
 
178
  }
179
+
180
+ # Preserve solve_cloudflare if the user explicitly provided it, even if the model doesn't have it
181
+ if "solve_cloudflare" in overrides:
182
+ validated_dict["solve_cloudflare"] = overrides["solve_cloudflare"]
183
 
184
  # Start with session defaults, then overwrite with validated overrides
185
  result.update(validated_dict)
 
193
  # Cache default values for each model to reduce validation overhead
194
  models_default_values = {}
195
 
196
+ for _model in (StealthConfig, PlaywrightConfig):
197
  _defaults = {}
198
  if hasattr(_model, "__struct_defaults__") and hasattr(_model, "__struct_fields__"):
199
  for field_name, default_value in zip(_model.__struct_fields__, _model.__struct_defaults__): # type: ignore
 
211
 
212
 
213
  @overload
214
+ def validate(params: Dict, model: type[StealthConfig]) -> StealthConfig: ...
215
 
216
 
217
  @overload
218
+ def validate(params: Dict, model: type[PlaywrightConfig]) -> PlaywrightConfig: ...
219
 
220
 
221
+ def validate(params: Dict, model: type[PlaywrightConfig] | type[StealthConfig]) -> PlaywrightConfig | StealthConfig:
222
  try:
223
  # Filter out params with the default values (no need to validate them) to speed up validation
224
  filtered = _filter_defaults(params, model.__name__)
scrapling/engines/constants.py CHANGED
@@ -74,7 +74,6 @@ DEFAULT_STEALTH_FLAGS = (
74
  "--disable-domain-reliability",
75
  "--disable-threaded-animation",
76
  "--disable-threaded-scrolling",
77
- # '--disable-reading-from-canvas', # For Firefox
78
  "--enable-simple-cache-backend",
79
  "--disable-background-networking",
80
  "--enable-surface-synchronization",
 
74
  "--disable-domain-reliability",
75
  "--disable-threaded-animation",
76
  "--disable-threaded-scrolling",
 
77
  "--enable-simple-cache-backend",
78
  "--disable-background-networking",
79
  "--enable-surface-synchronization",
scrapling/fetchers/__init__.py CHANGED
@@ -3,7 +3,7 @@ from typing import TYPE_CHECKING, Any
3
  if TYPE_CHECKING:
4
  from scrapling.fetchers.requests import Fetcher, AsyncFetcher, FetcherSession
5
  from scrapling.fetchers.chrome import DynamicFetcher, DynamicSession, AsyncDynamicSession
6
- from scrapling.fetchers.firefox import StealthyFetcher, StealthySession, AsyncStealthySession
7
 
8
 
9
  # Lazy import mapping
@@ -14,9 +14,9 @@ _LAZY_IMPORTS = {
14
  "DynamicFetcher": ("scrapling.fetchers.chrome", "DynamicFetcher"),
15
  "DynamicSession": ("scrapling.fetchers.chrome", "DynamicSession"),
16
  "AsyncDynamicSession": ("scrapling.fetchers.chrome", "AsyncDynamicSession"),
17
- "StealthyFetcher": ("scrapling.fetchers.firefox", "StealthyFetcher"),
18
- "StealthySession": ("scrapling.fetchers.firefox", "StealthySession"),
19
- "AsyncStealthySession": ("scrapling.fetchers.firefox", "AsyncStealthySession"),
20
  }
21
 
22
  __all__ = [
 
3
  if TYPE_CHECKING:
4
  from scrapling.fetchers.requests import Fetcher, AsyncFetcher, FetcherSession
5
  from scrapling.fetchers.chrome import DynamicFetcher, DynamicSession, AsyncDynamicSession
6
+ from scrapling.fetchers.stealth_chrome import StealthyFetcher, StealthySession, AsyncStealthySession
7
 
8
 
9
  # Lazy import mapping
 
14
  "DynamicFetcher": ("scrapling.fetchers.chrome", "DynamicFetcher"),
15
  "DynamicSession": ("scrapling.fetchers.chrome", "DynamicSession"),
16
  "AsyncDynamicSession": ("scrapling.fetchers.chrome", "AsyncDynamicSession"),
17
+ "StealthyFetcher": ("scrapling.fetchers.stealth_chrome", "StealthyFetcher"),
18
+ "StealthySession": ("scrapling.fetchers.stealth_chrome", "StealthySession"),
19
+ "AsyncStealthySession": ("scrapling.fetchers.stealth_chrome", "AsyncStealthySession"),
20
  }
21
 
22
  __all__ = [
scrapling/fetchers/{firefox.py → stealth_chrome.py} RENAMED
@@ -1,48 +1,52 @@
1
  from scrapling.core._types import Unpack
2
- from scrapling.engines._browsers._types import CamoufoxSession
3
  from scrapling.engines.toolbelt.custom import BaseFetcher, Response
4
- from scrapling.engines._browsers._camoufox import StealthySession, AsyncStealthySession
5
 
6
 
7
  class StealthyFetcher(BaseFetcher):
8
- """A `Fetcher` class type that is a completely stealthy fetcher that uses a modified version of Firefox.
9
 
10
- It works as real browsers passing almost all online tests/protections based on Camoufox.
11
- Other added flavors include setting the faked OS fingerprints to match the user's OS, and the referer of every request is set as if this request came from Google's search of this URL's domain.
12
  """
13
 
14
  @classmethod
15
- def fetch(cls, url: str, **kwargs: Unpack[CamoufoxSession]) -> Response:
16
  """
17
  Opens up a browser and do your request based on your chosen options below.
18
 
19
  :param url: Target url.
20
  :param kwargs: Browser session configuration options including:
21
  - headless: Run the browser in headless/hidden (default), or headful/visible mode.
22
- - block_images: Prevent the loading of images through Firefox preferences.
23
- - disable_resources: Drop requests of unnecessary resources for a speed boost.
24
- - block_webrtc: Blocks WebRTC entirely.
25
- - allow_webgl: Enabled by default. Disabling WebGL is not recommended as many WAFs now check if WebGL is enabled.
 
26
  - network_idle: Wait for the page until there are no network connections for at least 500 ms.
27
- - load_dom: Enabled by default, wait for all JavaScript on page(s) to fully load and execute.
28
- - humanize: Humanize the cursor movement. Takes either True or the MAX duration in seconds of the cursor movement.
29
- - solve_cloudflare: Solves all types of the Cloudflare's Turnstile/Interstitial challenges before returning the response to you.
30
- - wait: The time (milliseconds) the fetcher will wait after everything finishes before closing the page and returning the Response object.
31
  - timeout: The timeout in milliseconds that is used in all operations and waits through the page. The default is 30,000
 
32
  - page_action: Added for automation. A function that takes the `page` object and does the automation you need.
33
  - wait_selector: Wait for a specific CSS selector to be in a specific state.
34
- - init_script: An absolute path to a JavaScript file to be executed on page creation with this request.
35
- - addons: List of Firefox addons to use. Must be paths to extracted addons.
 
 
36
  - wait_selector_state: The state to wait for the selector given with `wait_selector`. The default state is `attached`.
37
- - cookies: Set cookies for the next request.
 
 
 
 
 
 
38
  - google_search: Enabled by default, Scrapling will set the referer header to be as if this request came from a Google search of this website's domain name.
39
- - extra_headers: A dictionary of extra headers to add to the request.
40
  - proxy: The proxy to be used with requests, it can be a string or a dictionary with the keys 'server', 'username', and 'password' only.
41
- - os_randomize: If enabled, Scrapling will randomize the OS fingerprints used.
42
- - disable_ads: Disabled by default, this installs the `uBlock Origin` addon on the browser if enabled.
43
- - geoip: Recommended to use with proxies; Automatically use IP's longitude, latitude, timezone, country, locale, and spoof the WebRTC IP address.
44
  - selector_config: The arguments that will be passed in the end while creating the final Selector's class.
45
- - additional_args: Additional arguments to be passed to Camoufox as additional settings.
46
  :return: A `Response` object.
47
  """
48
  selector_config = kwargs.get("selector_config", {}) or kwargs.get(
@@ -57,37 +61,42 @@ class StealthyFetcher(BaseFetcher):
57
  return engine.fetch(url)
58
 
59
  @classmethod
60
- async def async_fetch(cls, url: str, **kwargs: Unpack[CamoufoxSession]) -> Response:
61
  """
62
  Opens up a browser and do your request based on your chosen options below.
63
 
64
  :param url: Target url.
65
  :param kwargs: Browser session configuration options including:
66
  - headless: Run the browser in headless/hidden (default), or headful/visible mode.
67
- - block_images: Prevent the loading of images through Firefox preferences.
68
- - disable_resources: Drop requests of unnecessary resources for a speed boost.
69
- - block_webrtc: Blocks WebRTC entirely.
70
- - allow_webgl: Enabled by default. Disabling WebGL is not recommended as many WAFs now check if WebGL is enabled.
 
71
  - network_idle: Wait for the page until there are no network connections for at least 500 ms.
72
- - load_dom: Enabled by default, wait for all JavaScript on page(s) to fully load and execute.
73
- - humanize: Humanize the cursor movement. Takes either True or the MAX duration in seconds of the cursor movement.
74
- - solve_cloudflare: Solves all types of the Cloudflare's Turnstile/Interstitial challenges before returning the response to you.
75
- - wait: The time (milliseconds) the fetcher will wait after everything finishes before closing the page and returning the Response object.
76
  - timeout: The timeout in milliseconds that is used in all operations and waits through the page. The default is 30,000
 
77
  - page_action: Added for automation. A function that takes the `page` object and does the automation you need.
78
  - wait_selector: Wait for a specific CSS selector to be in a specific state.
79
- - init_script: An absolute path to a JavaScript file to be executed on page creation with this request.
80
- - addons: List of Firefox addons to use. Must be paths to extracted addons.
 
 
81
  - wait_selector_state: The state to wait for the selector given with `wait_selector`. The default state is `attached`.
82
- - cookies: Set cookies for the next request.
 
 
 
 
 
 
83
  - google_search: Enabled by default, Scrapling will set the referer header to be as if this request came from a Google search of this website's domain name.
84
- - extra_headers: A dictionary of extra headers to add to the request.
85
  - proxy: The proxy to be used with requests, it can be a string or a dictionary with the keys 'server', 'username', and 'password' only.
86
- - os_randomize: If enabled, Scrapling will randomize the OS fingerprints used.
87
- - disable_ads: Disabled by default, this installs the `uBlock Origin` addon on the browser if enabled.
88
- - geoip: Recommended to use with proxies; Automatically use IP's longitude, latitude, timezone, country, locale, and spoof the WebRTC IP address.
89
  - selector_config: The arguments that will be passed in the end while creating the final Selector's class.
90
- - additional_args: Additional arguments to be passed to Camoufox as additional settings.
91
  :return: A `Response` object.
92
  """
93
  selector_config = kwargs.get("selector_config", {}) or kwargs.get(
 
1
  from scrapling.core._types import Unpack
2
+ from scrapling.engines._browsers._types import StealthSession
3
  from scrapling.engines.toolbelt.custom import BaseFetcher, Response
4
+ from scrapling.engines._browsers._stealth import StealthySession, AsyncStealthySession
5
 
6
 
7
  class StealthyFetcher(BaseFetcher):
8
+ """A `Fetcher` class type which is a completely stealthy fetcher built on top of Chromium.
9
 
10
+ It works like real browsers, passing almost all online tests/protections, with many customization options.
 
11
  """
12
 
13
  @classmethod
14
+ def fetch(cls, url: str, **kwargs: Unpack[StealthSession]) -> Response:
15
  """
16
  Opens up a browser and does your request based on your chosen options below.
17
 
18
  :param url: Target url.
19
  :param kwargs: Browser session configuration options including:
20
  - headless: Run the browser in headless/hidden (default), or headful/visible mode.
21
+ - disable_resources: Drop requests of unnecessary resources for a speed boost. It depends, but it made requests ~25% faster in my tests for some websites.
22
+ Requests dropped are of type `font`, `image`, `media`, `beacon`, `object`, `imageset`, `texttrack`, `websocket`, `csp_report`, and `stylesheet`.
23
+ This can help save your proxy usage but be careful with this option as it makes some websites never finish loading.
24
+ - useragent: Pass a useragent string to be used. Otherwise the fetcher will generate a real Useragent of the same browser and use it.
25
+ - cookies: Set cookies for the next request.
26
  - network_idle: Wait for the page until there are no network connections for at least 500 ms.
 
 
 
 
27
  - timeout: The timeout in milliseconds that is used in all operations and waits through the page. The default is 30,000
28
+ - wait: The time (milliseconds) the fetcher will wait after everything finishes before closing the page and returning the `Response` object.
29
  - page_action: Added for automation. A function that takes the `page` object and does the automation you need.
30
  - wait_selector: Wait for a specific CSS selector to be in a specific state.
31
+ - init_script: An absolute path to a JavaScript file to be executed on page creation for all pages in this session.
32
+ - locale: Specify user locale, for example, `en-GB`, `de-DE`, etc. Locale will affect navigator.language value, Accept-Language request header value as well as number and date formatting
33
+ rules. Defaults to the system default locale.
34
+ - timezone_id: Changes the timezone of the browser. Defaults to the system timezone.
35
  - wait_selector_state: The state to wait for the selector given with `wait_selector`. The default state is `attached`.
36
+ - solve_cloudflare: Solves all types of the Cloudflare's Turnstile/Interstitial challenges before returning the response to you.
37
+ - real_chrome: If you have a Chrome browser installed on your device, enable this, and the Fetcher will launch an instance of your browser and use it.
38
+ - hide_canvas: Add random noise to canvas operations to prevent fingerprinting.
39
+ - block_webrtc: Forces WebRTC to respect proxy settings to prevent local IP address leak.
40
+ - allow_webgl: Enabled by default. Disabling it disables WebGL and WebGL 2.0 support entirely. Disabling WebGL is not recommended as many WAFs now check if WebGL is enabled.
41
+ - load_dom: Enabled by default, wait for all JavaScript on page(s) to fully load and execute.
42
+ - cdp_url: Instead of launching a new browser instance, connect to this CDP URL to control real browsers through CDP.
43
  - google_search: Enabled by default, Scrapling will set the referer header to be as if this request came from a Google search of this website's domain name.
44
+ - extra_headers: A dictionary of extra headers to add to the request. _The referer set by the `google_search` argument takes priority over the referer set here if used together._
45
  - proxy: The proxy to be used with requests, it can be a string or a dictionary with the keys 'server', 'username', and 'password' only.
46
+ - user_data_dir: Path to a User Data Directory, which stores browser session data like cookies and local storage. The default is to create a temporary directory.
47
+ - extra_flags: A list of additional browser flags to pass to the browser on launch.
 
48
  - selector_config: The arguments that will be passed in the end while creating the final Selector's class.
49
+ - additional_args: Additional arguments to be passed to Playwright's context as additional settings, and it takes higher priority than Scrapling's settings.
50
  :return: A `Response` object.
51
  """
52
  selector_config = kwargs.get("selector_config", {}) or kwargs.get(
 
61
  return engine.fetch(url)
62
 
63
  @classmethod
64
+ async def async_fetch(cls, url: str, **kwargs: Unpack[StealthSession]) -> Response:
65
  """
66
  Opens up a browser and does your request based on your chosen options below.
67
 
68
  :param url: Target url.
69
  :param kwargs: Browser session configuration options including:
70
  - headless: Run the browser in headless/hidden (default), or headful/visible mode.
71
+ - disable_resources: Drop requests of unnecessary resources for a speed boost. It depends, but it made requests ~25% faster in my tests for some websites.
72
+ Requests dropped are of type `font`, `image`, `media`, `beacon`, `object`, `imageset`, `texttrack`, `websocket`, `csp_report`, and `stylesheet`.
73
+ This can help save your proxy usage but be careful with this option as it makes some websites never finish loading.
74
+ - useragent: Pass a useragent string to be used. Otherwise the fetcher will generate a real Useragent of the same browser and use it.
75
+ - cookies: Set cookies for the next request.
76
  - network_idle: Wait for the page until there are no network connections for at least 500 ms.
 
 
 
 
77
  - timeout: The timeout in milliseconds that is used in all operations and waits through the page. The default is 30,000
78
+ - wait: The time (milliseconds) the fetcher will wait after everything finishes before closing the page and returning the `Response` object.
79
  - page_action: Added for automation. A function that takes the `page` object and does the automation you need.
80
  - wait_selector: Wait for a specific CSS selector to be in a specific state.
81
+ - init_script: An absolute path to a JavaScript file to be executed on page creation for all pages in this session.
82
+ - locale: Specify user locale, for example, `en-GB`, `de-DE`, etc. Locale will affect navigator.language value, Accept-Language request header value as well as number and date formatting
83
+ rules. Defaults to the system default locale.
84
+ - timezone_id: Changes the timezone of the browser. Defaults to the system timezone.
85
  - wait_selector_state: The state to wait for the selector given with `wait_selector`. The default state is `attached`.
86
+ - solve_cloudflare: Solves all types of the Cloudflare's Turnstile/Interstitial challenges before returning the response to you.
87
+ - real_chrome: If you have a Chrome browser installed on your device, enable this, and the Fetcher will launch an instance of your browser and use it.
88
+ - hide_canvas: Add random noise to canvas operations to prevent fingerprinting.
89
+ - block_webrtc: Forces WebRTC to respect proxy settings to prevent local IP address leak.
90
+ - allow_webgl: Enabled by default. Disabling it disables WebGL and WebGL 2.0 support entirely. Disabling WebGL is not recommended as many WAFs now check if WebGL is enabled.
91
+ - load_dom: Enabled by default, wait for all JavaScript on page(s) to fully load and execute.
92
+ - cdp_url: Instead of launching a new browser instance, connect to this CDP URL to control real browsers through CDP.
93
  - google_search: Enabled by default, Scrapling will set the referer header to be as if this request came from a Google search of this website's domain name.
94
+ - extra_headers: A dictionary of extra headers to add to the request. _The referer set by the `google_search` argument takes priority over the referer set here if used together._
95
  - proxy: The proxy to be used with requests, it can be a string or a dictionary with the keys 'server', 'username', and 'password' only.
96
+ - user_data_dir: Path to a User Data Directory, which stores browser session data like cookies and local storage. The default is to create a temporary directory.
97
+ - extra_flags: A list of additional browser flags to pass to the browser on launch.
 
98
  - selector_config: The arguments that will be passed in the end while creating the final Selector's class.
99
+ - additional_args: Additional arguments to be passed to Playwright's context as additional settings, and it takes higher priority than Scrapling's settings.
100
  :return: A `Response` object.
101
  """
102
  selector_config = kwargs.get("selector_config", {}) or kwargs.get(