Karim shoair committed on
Commit ·
ee2299e
1
Parent(s): 123011a
refactor(fetchers)!: Replace Camoufox with patchright and many optimizations
Browse files
- DynamicFetcher became 20% faster
- StealthyFetcher became 99% faster
- Scrapling size decreased
- Code became ~400 lines shorter
- Most importantly, Scrapling is more stable and reliable now.
- Less confusing for new users.
- More...
- scrapling/cli.py +18 -51
- scrapling/core/_types.py +14 -0
- scrapling/core/ai.py +79 -92
- scrapling/engines/_browsers/_base.py +108 -150
- scrapling/engines/_browsers/_config_tools.py +0 -85
- scrapling/engines/_browsers/_controllers.py +34 -65
- scrapling/engines/_browsers/{_camoufox.py → _stealth.py} +112 -114
- scrapling/engines/_browsers/_types.py +16 -25
- scrapling/engines/_browsers/_validators.py +29 -74
- scrapling/engines/constants.py +0 -1
- scrapling/fetchers/__init__.py +4 -4
- scrapling/fetchers/{firefox.py → stealth_chrome.py} +48 -39
scrapling/cli.py
CHANGED
|
@@ -125,14 +125,9 @@ def install(force): # pragma: no cover
|
|
| 125 |
"playwright",
|
| 126 |
"install-deps",
|
| 127 |
"chromium",
|
| 128 |
-
"firefox",
|
| 129 |
],
|
| 130 |
"Playwright dependencies",
|
| 131 |
)
|
| 132 |
-
__Execute(
|
| 133 |
-
[python_executable, "-m", "camoufox", "fetch", "--browserforge"],
|
| 134 |
-
"Camoufox browser and databases",
|
| 135 |
-
)
|
| 136 |
# if no errors raised by the above commands, then we add the below file
|
| 137 |
__PACKAGE_DIR__.joinpath(".scrapling_dependencies_installed").touch()
|
| 138 |
else:
|
|
@@ -611,16 +606,10 @@ def delete(
|
|
| 611 |
)
|
| 612 |
@option("--wait-selector", help="CSS selector to wait for before proceeding")
|
| 613 |
@option("--locale", default="en-US", help="Browser locale (default: en-US)")
|
| 614 |
-
@option("--stealth/--no-stealth", default=False, help="Enable stealth mode (default: False)")
|
| 615 |
-
@option(
|
| 616 |
-
"--hide-canvas/--show-canvas",
|
| 617 |
-
default=False,
|
| 618 |
-
help="Add noise to canvas operations (default: False)",
|
| 619 |
-
)
|
| 620 |
@option(
|
| 621 |
-
"--
|
| 622 |
default=False,
|
| 623 |
-
help="
|
| 624 |
)
|
| 625 |
@option("--proxy", help='Proxy URL in format "http://username:password@host:port"')
|
| 626 |
@option(
|
|
@@ -640,9 +629,7 @@ def fetch(
|
|
| 640 |
css_selector,
|
| 641 |
wait_selector,
|
| 642 |
locale,
|
| 643 |
-
|
| 644 |
-
hide_canvas,
|
| 645 |
-
disable_webgl,
|
| 646 |
proxy,
|
| 647 |
extra_headers,
|
| 648 |
):
|
|
@@ -659,9 +646,7 @@ def fetch(
|
|
| 659 |
:param css_selector: CSS selector to extract specific content.
|
| 660 |
:param wait_selector: Wait for a specific CSS selector to be in a specific state.
|
| 661 |
:param locale: Set the locale for the browser.
|
| 662 |
-
:param
|
| 663 |
-
:param hide_canvas: Add random noise to canvas operations to prevent fingerprinting.
|
| 664 |
-
:param disable_webgl: Disables WebGL and WebGL 2.0 support entirely.
|
| 665 |
:param proxy: The proxy to be used with requests.
|
| 666 |
:param extra_headers: Extra headers to add to the request.
|
| 667 |
"""
|
|
@@ -676,9 +661,7 @@ def fetch(
|
|
| 676 |
"network_idle": network_idle,
|
| 677 |
"timeout": timeout,
|
| 678 |
"locale": locale,
|
| 679 |
-
"
|
| 680 |
-
"hide_canvas": hide_canvas,
|
| 681 |
-
"disable_webgl": disable_webgl,
|
| 682 |
}
|
| 683 |
|
| 684 |
if wait > 0:
|
|
@@ -703,11 +686,6 @@ def fetch(
|
|
| 703 |
default=True,
|
| 704 |
help="Run browser in headless mode (default: True)",
|
| 705 |
)
|
| 706 |
-
@option(
|
| 707 |
-
"--block-images/--allow-images",
|
| 708 |
-
default=False,
|
| 709 |
-
help="Block image loading (default: False)",
|
| 710 |
-
)
|
| 711 |
@option(
|
| 712 |
"--disable-resources/--enable-resources",
|
| 713 |
default=False,
|
|
@@ -718,11 +696,6 @@ def fetch(
|
|
| 718 |
default=False,
|
| 719 |
help="Block WebRTC entirely (default: False)",
|
| 720 |
)
|
| 721 |
-
@option(
|
| 722 |
-
"--humanize/--no-humanize",
|
| 723 |
-
default=False,
|
| 724 |
-
help="Humanize cursor movement (default: False)",
|
| 725 |
-
)
|
| 726 |
@option(
|
| 727 |
"--solve-cloudflare/--no-solve-cloudflare",
|
| 728 |
default=False,
|
|
@@ -735,9 +708,14 @@ def fetch(
|
|
| 735 |
help="Wait for network idle (default: False)",
|
| 736 |
)
|
| 737 |
@option(
|
| 738 |
-
"--
|
| 739 |
default=False,
|
| 740 |
-
help="
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 741 |
)
|
| 742 |
@option(
|
| 743 |
"--timeout",
|
|
@@ -757,11 +735,6 @@ def fetch(
|
|
| 757 |
help="CSS selector to extract specific content from the page. It returns all matches.",
|
| 758 |
)
|
| 759 |
@option("--wait-selector", help="CSS selector to wait for before proceeding")
|
| 760 |
-
@option(
|
| 761 |
-
"--geoip/--no-geoip",
|
| 762 |
-
default=False,
|
| 763 |
-
help="Use IP geolocation for timezone/locale (default: False)",
|
| 764 |
-
)
|
| 765 |
@option("--proxy", help='Proxy URL in format "http://username:password@host:port"')
|
| 766 |
@option(
|
| 767 |
"--extra-headers",
|
|
@@ -773,19 +746,17 @@ def stealthy_fetch(
|
|
| 773 |
url,
|
| 774 |
output_file,
|
| 775 |
headless,
|
| 776 |
-
block_images,
|
| 777 |
disable_resources,
|
| 778 |
block_webrtc,
|
| 779 |
-
humanize,
|
| 780 |
solve_cloudflare,
|
| 781 |
allow_webgl,
|
| 782 |
network_idle,
|
| 783 |
-
|
|
|
|
| 784 |
timeout,
|
| 785 |
wait,
|
| 786 |
css_selector,
|
| 787 |
wait_selector,
|
| 788 |
-
geoip,
|
| 789 |
proxy,
|
| 790 |
extra_headers,
|
| 791 |
):
|
|
@@ -795,19 +766,17 @@ def stealthy_fetch(
|
|
| 795 |
:param url: Target url.
|
| 796 |
:param output_file: Output file path (.md for Markdown, .html for HTML).
|
| 797 |
:param headless: Run the browser in headless/hidden, or headful/visible mode.
|
| 798 |
-
:param block_images: Prevent the loading of images through Firefox preferences.
|
| 799 |
:param disable_resources: Drop requests of unnecessary resources for a speed boost.
|
| 800 |
:param block_webrtc: Blocks WebRTC entirely.
|
| 801 |
-
:param humanize: Humanize the cursor movement.
|
| 802 |
:param solve_cloudflare: Solves all types of the Cloudflare's Turnstile/Interstitial challenges.
|
| 803 |
:param allow_webgl: Allow WebGL (recommended to keep enabled).
|
| 804 |
:param network_idle: Wait for the page until there are no network connections for at least 500 ms.
|
| 805 |
-
:param
|
|
|
|
| 806 |
:param timeout: The timeout in milliseconds that is used in all operations and waits through the page.
|
| 807 |
:param wait: The time (milliseconds) the fetcher will wait after everything finishes before returning.
|
| 808 |
:param css_selector: CSS selector to extract specific content.
|
| 809 |
:param wait_selector: Wait for a specific CSS selector to be in a specific state.
|
| 810 |
-
:param geoip: Automatically use IP's longitude, latitude, timezone, country, locale.
|
| 811 |
:param proxy: The proxy to be used with requests.
|
| 812 |
:param extra_headers: Extra headers to add to the request.
|
| 813 |
"""
|
|
@@ -818,16 +787,14 @@ def stealthy_fetch(
|
|
| 818 |
# Build request arguments
|
| 819 |
kwargs = {
|
| 820 |
"headless": headless,
|
| 821 |
-
"block_images": block_images,
|
| 822 |
"disable_resources": disable_resources,
|
| 823 |
"block_webrtc": block_webrtc,
|
| 824 |
-
"humanize": humanize,
|
| 825 |
"solve_cloudflare": solve_cloudflare,
|
| 826 |
"allow_webgl": allow_webgl,
|
| 827 |
"network_idle": network_idle,
|
| 828 |
-
"
|
|
|
|
| 829 |
"timeout": timeout,
|
| 830 |
-
"geoip": geoip,
|
| 831 |
}
|
| 832 |
|
| 833 |
if wait > 0:
|
|
|
|
| 125 |
"playwright",
|
| 126 |
"install-deps",
|
| 127 |
"chromium",
|
|
|
|
| 128 |
],
|
| 129 |
"Playwright dependencies",
|
| 130 |
)
|
|
|
|
|
|
|
|
|
|
|
|
|
| 131 |
# if no errors raised by the above commands, then we add the below file
|
| 132 |
__PACKAGE_DIR__.joinpath(".scrapling_dependencies_installed").touch()
|
| 133 |
else:
|
|
|
|
| 606 |
)
|
| 607 |
@option("--wait-selector", help="CSS selector to wait for before proceeding")
|
| 608 |
@option("--locale", default="en-US", help="Browser locale (default: en-US)")
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 609 |
@option(
|
| 610 |
+
"--real-chrome/--no-real-chrome",
|
| 611 |
default=False,
|
| 612 |
+
help="If you have a Chrome browser installed on your device, enable this, and the Fetcher will launch an instance of your browser and use it. (default: False)",
|
| 613 |
)
|
| 614 |
@option("--proxy", help='Proxy URL in format "http://username:password@host:port"')
|
| 615 |
@option(
|
|
|
|
| 629 |
css_selector,
|
| 630 |
wait_selector,
|
| 631 |
locale,
|
| 632 |
+
real_chrome,
|
|
|
|
|
|
|
| 633 |
proxy,
|
| 634 |
extra_headers,
|
| 635 |
):
|
|
|
|
| 646 |
:param css_selector: CSS selector to extract specific content.
|
| 647 |
:param wait_selector: Wait for a specific CSS selector to be in a specific state.
|
| 648 |
:param locale: Set the locale for the browser.
|
| 649 |
+
:param real_chrome: If you have a Chrome browser installed on your device, enable this, and the Fetcher will launch an instance of your browser and use it.
|
|
|
|
|
|
|
| 650 |
:param proxy: The proxy to be used with requests.
|
| 651 |
:param extra_headers: Extra headers to add to the request.
|
| 652 |
"""
|
|
|
|
| 661 |
"network_idle": network_idle,
|
| 662 |
"timeout": timeout,
|
| 663 |
"locale": locale,
|
| 664 |
+
"real_chrome": real_chrome,
|
|
|
|
|
|
|
| 665 |
}
|
| 666 |
|
| 667 |
if wait > 0:
|
|
|
|
| 686 |
default=True,
|
| 687 |
help="Run browser in headless mode (default: True)",
|
| 688 |
)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 689 |
@option(
|
| 690 |
"--disable-resources/--enable-resources",
|
| 691 |
default=False,
|
|
|
|
| 696 |
default=False,
|
| 697 |
help="Block WebRTC entirely (default: False)",
|
| 698 |
)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 699 |
@option(
|
| 700 |
"--solve-cloudflare/--no-solve-cloudflare",
|
| 701 |
default=False,
|
|
|
|
| 708 |
help="Wait for network idle (default: False)",
|
| 709 |
)
|
| 710 |
@option(
|
| 711 |
+
"--real-chrome/--no-real-chrome",
|
| 712 |
default=False,
|
| 713 |
+
help="If you have a Chrome browser installed on your device, enable this, and the Fetcher will launch an instance of your browser and use it. (default: False)",
|
| 714 |
+
)
|
| 715 |
+
@option(
|
| 716 |
+
"--hide-canvas/--show-canvas",
|
| 717 |
+
default=False,
|
| 718 |
+
help="Add noise to canvas operations (default: False)",
|
| 719 |
)
|
| 720 |
@option(
|
| 721 |
"--timeout",
|
|
|
|
| 735 |
help="CSS selector to extract specific content from the page. It returns all matches.",
|
| 736 |
)
|
| 737 |
@option("--wait-selector", help="CSS selector to wait for before proceeding")
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 738 |
@option("--proxy", help='Proxy URL in format "http://username:password@host:port"')
|
| 739 |
@option(
|
| 740 |
"--extra-headers",
|
|
|
|
| 746 |
url,
|
| 747 |
output_file,
|
| 748 |
headless,
|
|
|
|
| 749 |
disable_resources,
|
| 750 |
block_webrtc,
|
|
|
|
| 751 |
solve_cloudflare,
|
| 752 |
allow_webgl,
|
| 753 |
network_idle,
|
| 754 |
+
real_chrome,
|
| 755 |
+
hide_canvas,
|
| 756 |
timeout,
|
| 757 |
wait,
|
| 758 |
css_selector,
|
| 759 |
wait_selector,
|
|
|
|
| 760 |
proxy,
|
| 761 |
extra_headers,
|
| 762 |
):
|
|
|
|
| 766 |
:param url: Target url.
|
| 767 |
:param output_file: Output file path (.md for Markdown, .html for HTML).
|
| 768 |
:param headless: Run the browser in headless/hidden, or headful/visible mode.
|
|
|
|
| 769 |
:param disable_resources: Drop requests of unnecessary resources for a speed boost.
|
| 770 |
:param block_webrtc: Blocks WebRTC entirely.
|
|
|
|
| 771 |
:param solve_cloudflare: Solves all types of the Cloudflare's Turnstile/Interstitial challenges.
|
| 772 |
:param allow_webgl: Allow WebGL (recommended to keep enabled).
|
| 773 |
:param network_idle: Wait for the page until there are no network connections for at least 500 ms.
|
| 774 |
+
:param real_chrome: If you have a Chrome browser installed on your device, enable this, and the Fetcher will launch an instance of your browser and use it.
|
| 775 |
+
:param hide_canvas: Add random noise to canvas operations to prevent fingerprinting.
|
| 776 |
:param timeout: The timeout in milliseconds that is used in all operations and waits through the page.
|
| 777 |
:param wait: The time (milliseconds) the fetcher will wait after everything finishes before returning.
|
| 778 |
:param css_selector: CSS selector to extract specific content.
|
| 779 |
:param wait_selector: Wait for a specific CSS selector to be in a specific state.
|
|
|
|
| 780 |
:param proxy: The proxy to be used with requests.
|
| 781 |
:param extra_headers: Extra headers to add to the request.
|
| 782 |
"""
|
|
|
|
| 787 |
# Build request arguments
|
| 788 |
kwargs = {
|
| 789 |
"headless": headless,
|
|
|
|
| 790 |
"disable_resources": disable_resources,
|
| 791 |
"block_webrtc": block_webrtc,
|
|
|
|
| 792 |
"solve_cloudflare": solve_cloudflare,
|
| 793 |
"allow_webgl": allow_webgl,
|
| 794 |
"network_idle": network_idle,
|
| 795 |
+
"real_chrome": real_chrome,
|
| 796 |
+
"hide_canvas": hide_canvas,
|
| 797 |
"timeout": timeout,
|
|
|
|
| 798 |
}
|
| 799 |
|
| 800 |
if wait > 0:
|
scrapling/core/_types.py
CHANGED
|
@@ -57,3 +57,17 @@ except ImportError: # pragma: no cover
|
|
| 57 |
from typing_extensions import Self # Backport
|
| 58 |
except ImportError:
|
| 59 |
Self = object
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 57 |
from typing_extensions import Self # Backport
|
| 58 |
except ImportError:
|
| 59 |
Self = object
|
| 60 |
+
|
| 61 |
+
|
| 62 |
+
# Copied from `playwright._impl._api_structures.SetCookieParam`
|
| 63 |
+
class SetCookieParam(TypedDict, total=False):
|
| 64 |
+
name: str
|
| 65 |
+
value: str
|
| 66 |
+
url: Optional[str]
|
| 67 |
+
domain: Optional[str]
|
| 68 |
+
path: Optional[str]
|
| 69 |
+
expires: Optional[float]
|
| 70 |
+
httpOnly: Optional[bool]
|
| 71 |
+
secure: Optional[bool]
|
| 72 |
+
sameSite: Optional[Literal["Lax", "None", "Strict"]]
|
| 73 |
+
partitionKey: Optional[str]
|
scrapling/core/ai.py
CHANGED
|
@@ -213,13 +213,11 @@ class ScraplingMCPServer:
|
|
| 213 |
main_content_only: bool = True,
|
| 214 |
headless: bool = False,
|
| 215 |
google_search: bool = True,
|
| 216 |
-
hide_canvas: bool = False,
|
| 217 |
-
disable_webgl: bool = False,
|
| 218 |
real_chrome: bool = False,
|
| 219 |
-
stealth: bool = False,
|
| 220 |
wait: int | float = 0,
|
| 221 |
proxy: Optional[str | Dict[str, str]] = None,
|
| 222 |
-
|
|
|
|
| 223 |
extra_headers: Optional[Dict[str, str]] = None,
|
| 224 |
useragent: Optional[str] = None,
|
| 225 |
cdp_url: Optional[str] = None,
|
|
@@ -251,12 +249,11 @@ class ScraplingMCPServer:
|
|
| 251 |
:param timeout: The timeout in milliseconds that is used in all operations and waits through the page. The default is 30,000
|
| 252 |
:param wait: The time (milliseconds) the fetcher will wait after everything finishes before closing the page and returning the ` Response ` object.
|
| 253 |
:param wait_selector: Wait for a specific CSS selector to be in a specific state.
|
| 254 |
-
:param
|
|
|
|
|
|
|
| 255 |
:param wait_selector_state: The state to wait for the selector given with `wait_selector`. The default state is `attached`.
|
| 256 |
-
:param stealth: Enables stealth mode, check the documentation to see what stealth mode does currently.
|
| 257 |
:param real_chrome: If you have a Chrome browser installed on your device, enable this, and the Fetcher will launch an instance of your browser and use it.
|
| 258 |
-
:param hide_canvas: Add random noise to canvas operations to prevent fingerprinting.
|
| 259 |
-
:param disable_webgl: Disables WebGL and WebGL 2.0 support entirely.
|
| 260 |
:param cdp_url: Instead of launching a new browser instance, connect to this CDP URL to control real browsers through CDP.
|
| 261 |
:param google_search: Enabled by default, Scrapling will set the referer header to be as if this request came from a Google search of this website's domain name.
|
| 262 |
:param extra_headers: A dictionary of extra headers to add to the request. _The referer set by the `google_search` argument takes priority over the referer set here if used together._
|
|
@@ -269,15 +266,13 @@ class ScraplingMCPServer:
|
|
| 269 |
locale=locale,
|
| 270 |
timeout=timeout,
|
| 271 |
cookies=cookies,
|
| 272 |
-
stealth=stealth,
|
| 273 |
cdp_url=cdp_url,
|
| 274 |
headless=headless,
|
| 275 |
useragent=useragent,
|
| 276 |
-
|
| 277 |
real_chrome=real_chrome,
|
| 278 |
network_idle=network_idle,
|
| 279 |
wait_selector=wait_selector,
|
| 280 |
-
disable_webgl=disable_webgl,
|
| 281 |
extra_headers=extra_headers,
|
| 282 |
google_search=google_search,
|
| 283 |
disable_resources=disable_resources,
|
|
@@ -301,13 +296,11 @@ class ScraplingMCPServer:
|
|
| 301 |
main_content_only: bool = True,
|
| 302 |
headless: bool = False,
|
| 303 |
google_search: bool = True,
|
| 304 |
-
hide_canvas: bool = False,
|
| 305 |
-
disable_webgl: bool = False,
|
| 306 |
real_chrome: bool = False,
|
| 307 |
-
stealth: bool = False,
|
| 308 |
wait: int | float = 0,
|
| 309 |
proxy: Optional[str | Dict[str, str]] = None,
|
| 310 |
-
|
|
|
|
| 311 |
extra_headers: Optional[Dict[str, str]] = None,
|
| 312 |
useragent: Optional[str] = None,
|
| 313 |
cdp_url: Optional[str] = None,
|
|
@@ -339,12 +332,11 @@ class ScraplingMCPServer:
|
|
| 339 |
:param timeout: The timeout in milliseconds that is used in all operations and waits through the page. The default is 30,000
|
| 340 |
:param wait: The time (milliseconds) the fetcher will wait after everything finishes before closing the page and returning the ` Response ` object.
|
| 341 |
:param wait_selector: Wait for a specific CSS selector to be in a specific state.
|
| 342 |
-
:param
|
|
|
|
|
|
|
| 343 |
:param wait_selector_state: The state to wait for the selector given with `wait_selector`. The default state is `attached`.
|
| 344 |
-
:param stealth: Enables stealth mode, check the documentation to see what stealth mode does currently.
|
| 345 |
:param real_chrome: If you have a Chrome browser installed on your device, enable this, and the Fetcher will launch an instance of your browser and use it.
|
| 346 |
-
:param hide_canvas: Add random noise to canvas operations to prevent fingerprinting.
|
| 347 |
-
:param disable_webgl: Disables WebGL and WebGL 2.0 support entirely.
|
| 348 |
:param cdp_url: Instead of launching a new browser instance, connect to this CDP URL to control real browsers through CDP.
|
| 349 |
:param google_search: Enabled by default, Scrapling will set the referer header to be as if this request came from a Google search of this website's domain name.
|
| 350 |
:param extra_headers: A dictionary of extra headers to add to the request. _The referer set by the `google_search` argument takes priority over the referer set here if used together._
|
|
@@ -356,17 +348,15 @@ class ScraplingMCPServer:
|
|
| 356 |
locale=locale,
|
| 357 |
timeout=timeout,
|
| 358 |
cookies=cookies,
|
| 359 |
-
stealth=stealth,
|
| 360 |
cdp_url=cdp_url,
|
| 361 |
headless=headless,
|
| 362 |
max_pages=len(urls),
|
| 363 |
useragent=useragent,
|
| 364 |
-
|
| 365 |
real_chrome=real_chrome,
|
| 366 |
network_idle=network_idle,
|
| 367 |
wait_selector=wait_selector,
|
| 368 |
google_search=google_search,
|
| 369 |
-
disable_webgl=disable_webgl,
|
| 370 |
extra_headers=extra_headers,
|
| 371 |
disable_resources=disable_resources,
|
| 372 |
wait_selector_state=wait_selector_state,
|
|
@@ -393,29 +383,29 @@ class ScraplingMCPServer:
|
|
| 393 |
css_selector: Optional[str] = None,
|
| 394 |
main_content_only: bool = True,
|
| 395 |
headless: bool = True, # noqa: F821
|
| 396 |
-
|
| 397 |
-
|
| 398 |
-
block_webrtc: bool = False,
|
| 399 |
-
allow_webgl: bool = True,
|
| 400 |
-
network_idle: bool = False,
|
| 401 |
-
humanize: bool | float = True,
|
| 402 |
-
solve_cloudflare: bool = False,
|
| 403 |
wait: int | float = 0,
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 404 |
timeout: int | float = 30000,
|
|
|
|
| 405 |
wait_selector: Optional[str] = None,
|
| 406 |
-
addons: Optional[List[str]] = None,
|
| 407 |
-
wait_selector_state: SelectorWaitStates = "attached",
|
| 408 |
cookies: Optional[List[Dict]] = None,
|
| 409 |
-
|
| 410 |
-
|
| 411 |
-
|
| 412 |
-
|
| 413 |
-
|
| 414 |
-
geoip: bool = False,
|
| 415 |
additional_args: Optional[Dict] = None,
|
| 416 |
) -> ResponseModel:
|
| 417 |
-
"""Use
|
| 418 |
-
Note: This is
|
| 419 |
Note: If the `css_selector` resolves to more than one element, all the elements will be returned.
|
| 420 |
|
| 421 |
:param url: The URL to request.
|
|
@@ -426,54 +416,53 @@ class ScraplingMCPServer:
|
|
| 426 |
:param css_selector: CSS selector to extract the content from the page. If main_content_only is True, then it will be executed on the main content of the page. Defaults to None.
|
| 427 |
:param main_content_only: Whether to extract only the main content of the page. Defaults to True. The main content here is the data inside the `<body>` tag.
|
| 428 |
:param headless: Run the browser in headless/hidden (default), or headful/visible mode.
|
| 429 |
-
:param block_images: Prevent the loading of images through Firefox preferences.
|
| 430 |
-
This can help save your proxy usage but be careful with this option as it makes some websites never finish loading.
|
| 431 |
:param disable_resources: Drop requests of unnecessary resources for a speed boost. It depends, but it made requests ~25% faster in my tests for some websites.
|
| 432 |
Requests dropped are of type `font`, `image`, `media`, `beacon`, `object`, `imageset`, `texttrack`, `websocket`, `csp_report`, and `stylesheet`.
|
| 433 |
This can help save your proxy usage but be careful with this option as it makes some websites never finish loading.
|
| 434 |
-
:param
|
| 435 |
:param cookies: Set cookies for the next request.
|
| 436 |
-
:param addons: List of Firefox addons to use. Must be paths to extracted addons.
|
| 437 |
-
:param humanize: Humanize the cursor movement. Takes either True or the MAX duration in seconds of the cursor movement. The cursor typically takes up to 1.5 seconds to move across the window.
|
| 438 |
:param solve_cloudflare: Solves all types of the Cloudflare's Turnstile/Interstitial challenges before returning the response to you.
|
| 439 |
:param allow_webgl: Enabled by default. Disabling WebGL is not recommended as many WAFs now check if WebGL is enabled.
|
| 440 |
:param network_idle: Wait for the page until there are no network connections for at least 500 ms.
|
| 441 |
-
:param disable_ads: Disabled by default, this installs the `uBlock Origin` addon on the browser if enabled.
|
| 442 |
-
:param os_randomize: If enabled, Scrapling will randomize the OS fingerprints used. The default is Scrapling matching the fingerprints with the current OS.
|
| 443 |
:param wait: The time (milliseconds) the fetcher will wait after everything finishes before closing the page and returning the ` Response ` object.
|
| 444 |
:param timeout: The timeout in milliseconds that is used in all operations and waits through the page. The default is 30,000
|
| 445 |
:param wait_selector: Wait for a specific CSS selector to be in a specific state.
|
| 446 |
-
:param
|
| 447 |
-
|
|
|
|
| 448 |
:param wait_selector_state: The state to wait for the selector given with `wait_selector`. The default state is `attached`.
|
|
|
|
|
|
|
|
|
|
|
|
|
| 449 |
:param google_search: Enabled by default, Scrapling will set the referer header to be as if this request came from a Google search of this website's domain name.
|
| 450 |
:param extra_headers: A dictionary of extra headers to add to the request. _The referer set by the `google_search` argument takes priority over the referer set here if used together._
|
| 451 |
:param proxy: The proxy to be used with requests, it can be a string or a dictionary with the keys 'server', 'username', and 'password' only.
|
| 452 |
-
:param additional_args: Additional arguments to be passed to
|
| 453 |
"""
|
| 454 |
page = await StealthyFetcher.async_fetch(
|
| 455 |
url,
|
| 456 |
wait=wait,
|
| 457 |
proxy=proxy,
|
| 458 |
-
|
| 459 |
-
|
| 460 |
timeout=timeout,
|
| 461 |
cookies=cookies,
|
| 462 |
headless=headless,
|
| 463 |
-
|
|
|
|
|
|
|
|
|
|
| 464 |
allow_webgl=allow_webgl,
|
| 465 |
-
disable_ads=disable_ads,
|
| 466 |
network_idle=network_idle,
|
| 467 |
-
block_images=block_images,
|
| 468 |
block_webrtc=block_webrtc,
|
| 469 |
-
os_randomize=os_randomize,
|
| 470 |
wait_selector=wait_selector,
|
| 471 |
google_search=google_search,
|
| 472 |
extra_headers=extra_headers,
|
|
|
|
| 473 |
solve_cloudflare=solve_cloudflare,
|
| 474 |
disable_resources=disable_resources,
|
| 475 |
wait_selector_state=wait_selector_state,
|
| 476 |
-
additional_args=additional_args,
|
| 477 |
)
|
| 478 |
return _ContentTranslator(
|
| 479 |
Convertor._extract_content(
|
|
@@ -492,29 +481,29 @@ class ScraplingMCPServer:
|
|
| 492 |
css_selector: Optional[str] = None,
|
| 493 |
main_content_only: bool = True,
|
| 494 |
headless: bool = True, # noqa: F821
|
| 495 |
-
|
| 496 |
-
|
| 497 |
-
block_webrtc: bool = False,
|
| 498 |
-
allow_webgl: bool = True,
|
| 499 |
-
network_idle: bool = False,
|
| 500 |
-
humanize: bool | float = True,
|
| 501 |
-
solve_cloudflare: bool = False,
|
| 502 |
wait: int | float = 0,
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 503 |
timeout: int | float = 30000,
|
|
|
|
| 504 |
wait_selector: Optional[str] = None,
|
| 505 |
-
addons: Optional[List[str]] = None,
|
| 506 |
-
wait_selector_state: SelectorWaitStates = "attached",
|
| 507 |
cookies: Optional[List[Dict]] = None,
|
| 508 |
-
|
| 509 |
-
|
| 510 |
-
|
| 511 |
-
|
| 512 |
-
|
| 513 |
-
geoip: bool = False,
|
| 514 |
additional_args: Optional[Dict] = None,
|
| 515 |
) -> List[ResponseModel]:
|
| 516 |
-
"""Use
|
| 517 |
-
Note: This is
|
| 518 |
Note: If the `css_selector` resolves to more than one element, all the elements will be returned.
|
| 519 |
|
| 520 |
:param urls: A tuple of the URLs to request.
|
|
@@ -525,54 +514,52 @@ class ScraplingMCPServer:
|
|
| 525 |
:param css_selector: CSS selector to extract the content from the page. If main_content_only is True, then it will be executed on the main content of the page. Defaults to None.
|
| 526 |
:param main_content_only: Whether to extract only the main content of the page. Defaults to True. The main content here is the data inside the `<body>` tag.
|
| 527 |
:param headless: Run the browser in headless/hidden (default), or headful/visible mode.
|
| 528 |
-
:param block_images: Prevent the loading of images through Firefox preferences.
|
| 529 |
-
This can help save your proxy usage but be careful with this option as it makes some websites never finish loading.
|
| 530 |
:param disable_resources: Drop requests of unnecessary resources for a speed boost. It depends, but it made requests ~25% faster in my tests for some websites.
|
| 531 |
Requests dropped are of type `font`, `image`, `media`, `beacon`, `object`, `imageset`, `texttrack`, `websocket`, `csp_report`, and `stylesheet`.
|
| 532 |
This can help save your proxy usage but be careful with this option as it makes some websites never finish loading.
|
| 533 |
-
:param
|
| 534 |
:param cookies: Set cookies for the next request.
|
| 535 |
-
:param addons: List of Firefox addons to use. Must be paths to extracted addons.
|
| 536 |
-
:param humanize: Humanize the cursor movement. Takes either True or the MAX duration in seconds of the cursor movement. The cursor typically takes up to 1.5 seconds to move across the window.
|
| 537 |
:param solve_cloudflare: Solves all types of the Cloudflare's Turnstile/Interstitial challenges before returning the response to you.
|
| 538 |
:param allow_webgl: Enabled by default. Disabling WebGL is not recommended as many WAFs now check if WebGL is enabled.
|
| 539 |
:param network_idle: Wait for the page until there are no network connections for at least 500 ms.
|
| 540 |
-
:param disable_ads: Disabled by default, this installs the `uBlock Origin` addon on the browser if enabled.
|
| 541 |
-
:param os_randomize: If enabled, Scrapling will randomize the OS fingerprints used. The default is Scrapling matching the fingerprints with the current OS.
|
| 542 |
:param wait: The time (milliseconds) the fetcher will wait after everything finishes before closing the page and returning the ` Response ` object.
|
| 543 |
:param timeout: The timeout in milliseconds that is used in all operations and waits through the page. The default is 30,000
|
| 544 |
:param wait_selector: Wait for a specific CSS selector to be in a specific state.
|
| 545 |
-
:param
|
| 546 |
-
|
|
|
|
| 547 |
:param wait_selector_state: The state to wait for the selector given with `wait_selector`. The default state is `attached`.
|
|
|
|
|
|
|
|
|
|
|
|
|
| 548 |
:param google_search: Enabled by default, Scrapling will set the referer header to be as if this request came from a Google search of this website's domain name.
|
| 549 |
:param extra_headers: A dictionary of extra headers to add to the request. _The referer set by the `google_search` argument takes priority over the referer set here if used together._
|
| 550 |
:param proxy: The proxy to be used with requests, it can be a string or a dictionary with the keys 'server', 'username', and 'password' only.
|
| 551 |
-
:param additional_args: Additional arguments to be passed to
|
| 552 |
"""
|
| 553 |
async with AsyncStealthySession(
|
| 554 |
wait=wait,
|
| 555 |
proxy=proxy,
|
| 556 |
-
|
| 557 |
-
|
| 558 |
timeout=timeout,
|
| 559 |
cookies=cookies,
|
| 560 |
headless=headless,
|
| 561 |
-
|
| 562 |
-
|
|
|
|
|
|
|
| 563 |
allow_webgl=allow_webgl,
|
| 564 |
-
disable_ads=disable_ads,
|
| 565 |
-
block_images=block_images,
|
| 566 |
-
block_webrtc=block_webrtc,
|
| 567 |
network_idle=network_idle,
|
| 568 |
-
|
| 569 |
wait_selector=wait_selector,
|
| 570 |
google_search=google_search,
|
| 571 |
extra_headers=extra_headers,
|
|
|
|
| 572 |
solve_cloudflare=solve_cloudflare,
|
| 573 |
disable_resources=disable_resources,
|
| 574 |
wait_selector_state=wait_selector_state,
|
| 575 |
-
additional_args=additional_args,
|
| 576 |
) as session:
|
| 577 |
tasks = [session.fetch(url) for url in urls]
|
| 578 |
responses = await gather(*tasks)
|
|
|
|
| 213 |
main_content_only: bool = True,
|
| 214 |
headless: bool = False,
|
| 215 |
google_search: bool = True,
|
|
|
|
|
|
|
| 216 |
real_chrome: bool = False,
|
|
|
|
| 217 |
wait: int | float = 0,
|
| 218 |
proxy: Optional[str | Dict[str, str]] = None,
|
| 219 |
+
timezone_id: str | None = None,
|
| 220 |
+
locale: str | None = None,
|
| 221 |
extra_headers: Optional[Dict[str, str]] = None,
|
| 222 |
useragent: Optional[str] = None,
|
| 223 |
cdp_url: Optional[str] = None,
|
|
|
|
| 249 |
:param timeout: The timeout in milliseconds that is used in all operations and waits through the page. The default is 30,000
|
| 250 |
:param wait: The time (milliseconds) the fetcher will wait after everything finishes before closing the page and returning the ` Response ` object.
|
| 251 |
:param wait_selector: Wait for a specific CSS selector to be in a specific state.
|
| 252 |
+
:param timezone_id: Changes the timezone of the browser. Defaults to the system timezone.
|
| 253 |
+
:param locale: Specify user locale, for example, `en-GB`, `de-DE`, etc. Locale will affect navigator.language value, Accept-Language request header value as well as number and date formatting
|
| 254 |
+
rules. Defaults to the system default locale.
|
| 255 |
:param wait_selector_state: The state to wait for the selector given with `wait_selector`. The default state is `attached`.
|
|
|
|
| 256 |
:param real_chrome: If you have a Chrome browser installed on your device, enable this, and the Fetcher will launch an instance of your browser and use it.
|
|
|
|
|
|
|
| 257 |
:param cdp_url: Instead of launching a new browser instance, connect to this CDP URL to control real browsers through CDP.
|
| 258 |
:param google_search: Enabled by default, Scrapling will set the referer header to be as if this request came from a Google search of this website's domain name.
|
| 259 |
:param extra_headers: A dictionary of extra headers to add to the request. _The referer set by the `google_search` argument takes priority over the referer set here if used together._
|
|
|
|
| 266 |
locale=locale,
|
| 267 |
timeout=timeout,
|
| 268 |
cookies=cookies,
|
|
|
|
| 269 |
cdp_url=cdp_url,
|
| 270 |
headless=headless,
|
| 271 |
useragent=useragent,
|
| 272 |
+
timezone_id=timezone_id,
|
| 273 |
real_chrome=real_chrome,
|
| 274 |
network_idle=network_idle,
|
| 275 |
wait_selector=wait_selector,
|
|
|
|
| 276 |
extra_headers=extra_headers,
|
| 277 |
google_search=google_search,
|
| 278 |
disable_resources=disable_resources,
|
|
|
|
| 296 |
main_content_only: bool = True,
|
| 297 |
headless: bool = False,
|
| 298 |
google_search: bool = True,
|
|
|
|
|
|
|
| 299 |
real_chrome: bool = False,
|
|
|
|
| 300 |
wait: int | float = 0,
|
| 301 |
proxy: Optional[str | Dict[str, str]] = None,
|
| 302 |
+
timezone_id: str | None = None,
|
| 303 |
+
locale: str | None = None,
|
| 304 |
extra_headers: Optional[Dict[str, str]] = None,
|
| 305 |
useragent: Optional[str] = None,
|
| 306 |
cdp_url: Optional[str] = None,
|
|
|
|
| 332 |
:param timeout: The timeout in milliseconds that is used in all operations and waits through the page. The default is 30,000
|
| 333 |
:param wait: The time (milliseconds) the fetcher will wait after everything finishes before closing the page and returning the ` Response ` object.
|
| 334 |
:param wait_selector: Wait for a specific CSS selector to be in a specific state.
|
| 335 |
+
:param timezone_id: Changes the timezone of the browser. Defaults to the system timezone.
|
| 336 |
+
:param locale: Specify user locale, for example, `en-GB`, `de-DE`, etc. Locale will affect navigator.language value, Accept-Language request header value as well as number and date formatting
|
| 337 |
+
rules. Defaults to the system default locale.
|
| 338 |
:param wait_selector_state: The state to wait for the selector given with `wait_selector`. The default state is `attached`.
|
|
|
|
| 339 |
:param real_chrome: If you have a Chrome browser installed on your device, enable this, and the Fetcher will launch an instance of your browser and use it.
|
|
|
|
|
|
|
| 340 |
:param cdp_url: Instead of launching a new browser instance, connect to this CDP URL to control real browsers through CDP.
|
| 341 |
:param google_search: Enabled by default, Scrapling will set the referer header to be as if this request came from a Google search of this website's domain name.
|
| 342 |
:param extra_headers: A dictionary of extra headers to add to the request. _The referer set by the `google_search` argument takes priority over the referer set here if used together._
|
|
|
|
| 348 |
locale=locale,
|
| 349 |
timeout=timeout,
|
| 350 |
cookies=cookies,
|
|
|
|
| 351 |
cdp_url=cdp_url,
|
| 352 |
headless=headless,
|
| 353 |
max_pages=len(urls),
|
| 354 |
useragent=useragent,
|
| 355 |
+
timezone_id=timezone_id,
|
| 356 |
real_chrome=real_chrome,
|
| 357 |
network_idle=network_idle,
|
| 358 |
wait_selector=wait_selector,
|
| 359 |
google_search=google_search,
|
|
|
|
| 360 |
extra_headers=extra_headers,
|
| 361 |
disable_resources=disable_resources,
|
| 362 |
wait_selector_state=wait_selector_state,
|
|
|
|
| 383 |
css_selector: Optional[str] = None,
|
| 384 |
main_content_only: bool = True,
|
| 385 |
headless: bool = True, # noqa: F821
|
| 386 |
+
google_search: bool = True,
|
| 387 |
+
real_chrome: bool = False,
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 388 |
wait: int | float = 0,
|
| 389 |
+
proxy: Optional[str | Dict[str, str]] = None,
|
| 390 |
+
timezone_id: str | None = None,
|
| 391 |
+
locale: str | None = None,
|
| 392 |
+
extra_headers: Optional[Dict[str, str]] = None,
|
| 393 |
+
useragent: Optional[str] = None,
|
| 394 |
+
hide_canvas: bool = False,
|
| 395 |
+
cdp_url: Optional[str] = None,
|
| 396 |
timeout: int | float = 30000,
|
| 397 |
+
disable_resources: bool = False,
|
| 398 |
wait_selector: Optional[str] = None,
|
|
|
|
|
|
|
| 399 |
cookies: Optional[List[Dict]] = None,
|
| 400 |
+
network_idle: bool = False,
|
| 401 |
+
wait_selector_state: SelectorWaitStates = "attached",
|
| 402 |
+
block_webrtc: bool = False,
|
| 403 |
+
allow_webgl: bool = True,
|
| 404 |
+
solve_cloudflare: bool = False,
|
|
|
|
| 405 |
additional_args: Optional[Dict] = None,
|
| 406 |
) -> ResponseModel:
|
| 407 |
+
"""Use the stealthy fetcher to fetch a URL and return a structured output of the result.
|
| 408 |
+
Note: This is the only suitable fetcher for high protection levels.
|
| 409 |
Note: If the `css_selector` resolves to more than one element, all the elements will be returned.
|
| 410 |
|
| 411 |
:param url: The URL to request.
|
|
|
|
| 416 |
:param css_selector: CSS selector to extract the content from the page. If main_content_only is True, then it will be executed on the main content of the page. Defaults to None.
|
| 417 |
:param main_content_only: Whether to extract only the main content of the page. Defaults to True. The main content here is the data inside the `<body>` tag.
|
| 418 |
:param headless: Run the browser in headless/hidden (default), or headful/visible mode.
|
|
|
|
|
|
|
| 419 |
:param disable_resources: Drop requests of unnecessary resources for a speed boost. It depends, but it made requests ~25% faster in my tests for some websites.
|
| 420 |
Requests dropped are of type `font`, `image`, `media`, `beacon`, `object`, `imageset`, `texttrack`, `websocket`, `csp_report`, and `stylesheet`.
|
| 421 |
This can help save your proxy usage but be careful with this option as it makes some websites never finish loading.
|
| 422 |
+
:param useragent: Pass a useragent string to be used. Otherwise the fetcher will generate a real Useragent of the same browser and use it.
|
| 423 |
:param cookies: Set cookies for the next request.
|
|
|
|
|
|
|
| 424 |
:param solve_cloudflare: Solves all types of the Cloudflare's Turnstile/Interstitial challenges before returning the response to you.
|
| 425 |
:param allow_webgl: Enabled by default. Disabling WebGL is not recommended as many WAFs now check if WebGL is enabled.
|
| 426 |
:param network_idle: Wait for the page until there are no network connections for at least 500 ms.
|
|
|
|
|
|
|
| 427 |
:param wait: The time (milliseconds) the fetcher will wait after everything finishes before closing the page and returning the ` Response ` object.
|
| 428 |
:param timeout: The timeout in milliseconds that is used in all operations and waits through the page. The default is 30,000
|
| 429 |
:param wait_selector: Wait for a specific CSS selector to be in a specific state.
|
| 430 |
+
:param timezone_id: Changes the timezone of the browser. Defaults to the system timezone.
|
| 431 |
+
:param locale: Specify user locale, for example, `en-GB`, `de-DE`, etc. Locale will affect navigator.language value, Accept-Language request header value as well as number and date formatting
|
| 432 |
+
rules. Defaults to the system default locale.
|
| 433 |
:param wait_selector_state: The state to wait for the selector given with `wait_selector`. The default state is `attached`.
|
| 434 |
+
:param real_chrome: If you have a Chrome browser installed on your device, enable this, and the Fetcher will launch an instance of your browser and use it.
|
| 435 |
+
:param hide_canvas: Add random noise to canvas operations to prevent fingerprinting.
|
| 436 |
+
:param block_webrtc: Forces WebRTC to respect proxy settings to prevent local IP address leak.
|
| 437 |
+
:param cdp_url: Instead of launching a new browser instance, connect to this CDP URL to control real browsers through CDP.
|
| 438 |
:param google_search: Enabled by default, Scrapling will set the referer header to be as if this request came from a Google search of this website's domain name.
|
| 439 |
:param extra_headers: A dictionary of extra headers to add to the request. _The referer set by the `google_search` argument takes priority over the referer set here if used together._
|
| 440 |
:param proxy: The proxy to be used with requests, it can be a string or a dictionary with the keys 'server', 'username', and 'password' only.
|
| 441 |
+
:param additional_args: Additional arguments to be passed to Playwright's context as additional settings, and it takes higher priority than Scrapling's settings.
|
| 442 |
"""
|
| 443 |
page = await StealthyFetcher.async_fetch(
|
| 444 |
url,
|
| 445 |
wait=wait,
|
| 446 |
proxy=proxy,
|
| 447 |
+
locale=locale,
|
| 448 |
+
cdp_url=cdp_url,
|
| 449 |
timeout=timeout,
|
| 450 |
cookies=cookies,
|
| 451 |
headless=headless,
|
| 452 |
+
useragent=useragent,
|
| 453 |
+
timezone_id=timezone_id,
|
| 454 |
+
real_chrome=real_chrome,
|
| 455 |
+
hide_canvas=hide_canvas,
|
| 456 |
allow_webgl=allow_webgl,
|
|
|
|
| 457 |
network_idle=network_idle,
|
|
|
|
| 458 |
block_webrtc=block_webrtc,
|
|
|
|
| 459 |
wait_selector=wait_selector,
|
| 460 |
google_search=google_search,
|
| 461 |
extra_headers=extra_headers,
|
| 462 |
+
additional_args=additional_args,
|
| 463 |
solve_cloudflare=solve_cloudflare,
|
| 464 |
disable_resources=disable_resources,
|
| 465 |
wait_selector_state=wait_selector_state,
|
|
|
|
| 466 |
)
|
| 467 |
return _ContentTranslator(
|
| 468 |
Convertor._extract_content(
|
|
|
|
| 481 |
css_selector: Optional[str] = None,
|
| 482 |
main_content_only: bool = True,
|
| 483 |
headless: bool = True, # noqa: F821
|
| 484 |
+
google_search: bool = True,
|
| 485 |
+
real_chrome: bool = False,
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 486 |
wait: int | float = 0,
|
| 487 |
+
proxy: Optional[str | Dict[str, str]] = None,
|
| 488 |
+
timezone_id: str | None = None,
|
| 489 |
+
locale: str | None = None,
|
| 490 |
+
extra_headers: Optional[Dict[str, str]] = None,
|
| 491 |
+
useragent: Optional[str] = None,
|
| 492 |
+
hide_canvas: bool = False,
|
| 493 |
+
cdp_url: Optional[str] = None,
|
| 494 |
timeout: int | float = 30000,
|
| 495 |
+
disable_resources: bool = False,
|
| 496 |
wait_selector: Optional[str] = None,
|
|
|
|
|
|
|
| 497 |
cookies: Optional[List[Dict]] = None,
|
| 498 |
+
network_idle: bool = False,
|
| 499 |
+
wait_selector_state: SelectorWaitStates = "attached",
|
| 500 |
+
block_webrtc: bool = False,
|
| 501 |
+
allow_webgl: bool = True,
|
| 502 |
+
solve_cloudflare: bool = False,
|
|
|
|
| 503 |
additional_args: Optional[Dict] = None,
|
| 504 |
) -> List[ResponseModel]:
|
| 505 |
+
"""Use the stealthy fetcher to fetch a group of URLs at the same time, and for each page return a structured output of the result.
|
| 506 |
+
Note: This is the only suitable fetcher for high protection levels.
|
| 507 |
Note: If the `css_selector` resolves to more than one element, all the elements will be returned.
|
| 508 |
|
| 509 |
:param urls: A tuple of the URLs to request.
|
|
|
|
| 514 |
:param css_selector: CSS selector to extract the content from the page. If main_content_only is True, then it will be executed on the main content of the page. Defaults to None.
|
| 515 |
:param main_content_only: Whether to extract only the main content of the page. Defaults to True. The main content here is the data inside the `<body>` tag.
|
| 516 |
:param headless: Run the browser in headless/hidden (default), or headful/visible mode.
|
|
|
|
|
|
|
| 517 |
:param disable_resources: Drop requests of unnecessary resources for a speed boost. It depends, but it made requests ~25% faster in my tests for some websites.
|
| 518 |
Requests dropped are of type `font`, `image`, `media`, `beacon`, `object`, `imageset`, `texttrack`, `websocket`, `csp_report`, and `stylesheet`.
|
| 519 |
This can help save your proxy usage but be careful with this option as it makes some websites never finish loading.
|
| 520 |
+
:param useragent: Pass a useragent string to be used. Otherwise the fetcher will generate a real Useragent of the same browser and use it.
|
| 521 |
:param cookies: Set cookies for the next request.
|
|
|
|
|
|
|
| 522 |
:param solve_cloudflare: Solves all types of the Cloudflare's Turnstile/Interstitial challenges before returning the response to you.
|
| 523 |
:param allow_webgl: Enabled by default. Disabling WebGL is not recommended as many WAFs now check if WebGL is enabled.
|
| 524 |
:param network_idle: Wait for the page until there are no network connections for at least 500 ms.
|
|
|
|
|
|
|
| 525 |
:param wait: The time (milliseconds) the fetcher will wait after everything finishes before closing the page and returning the ` Response ` object.
|
| 526 |
:param timeout: The timeout in milliseconds that is used in all operations and waits through the page. The default is 30,000
|
| 527 |
:param wait_selector: Wait for a specific CSS selector to be in a specific state.
|
| 528 |
+
:param timezone_id: Changes the timezone of the browser. Defaults to the system timezone.
|
| 529 |
+
:param locale: Specify user locale, for example, `en-GB`, `de-DE`, etc. Locale will affect navigator.language value, Accept-Language request header value as well as number and date formatting
|
| 530 |
+
rules. Defaults to the system default locale.
|
| 531 |
:param wait_selector_state: The state to wait for the selector given with `wait_selector`. The default state is `attached`.
|
| 532 |
+
:param real_chrome: If you have a Chrome browser installed on your device, enable this, and the Fetcher will launch an instance of your browser and use it.
|
| 533 |
+
:param hide_canvas: Add random noise to canvas operations to prevent fingerprinting.
|
| 534 |
+
:param block_webrtc: Forces WebRTC to respect proxy settings to prevent local IP address leak.
|
| 535 |
+
:param cdp_url: Instead of launching a new browser instance, connect to this CDP URL to control real browsers through CDP.
|
| 536 |
:param google_search: Enabled by default, Scrapling will set the referer header to be as if this request came from a Google search of this website's domain name.
|
| 537 |
:param extra_headers: A dictionary of extra headers to add to the request. _The referer set by the `google_search` argument takes priority over the referer set here if used together._
|
| 538 |
:param proxy: The proxy to be used with requests, it can be a string or a dictionary with the keys 'server', 'username', and 'password' only.
|
| 539 |
+
:param additional_args: Additional arguments to be passed to Playwright's context as additional settings, and it takes higher priority than Scrapling's settings.
|
| 540 |
"""
|
| 541 |
async with AsyncStealthySession(
|
| 542 |
wait=wait,
|
| 543 |
proxy=proxy,
|
| 544 |
+
locale=locale,
|
| 545 |
+
cdp_url=cdp_url,
|
| 546 |
timeout=timeout,
|
| 547 |
cookies=cookies,
|
| 548 |
headless=headless,
|
| 549 |
+
useragent=useragent,
|
| 550 |
+
timezone_id=timezone_id,
|
| 551 |
+
real_chrome=real_chrome,
|
| 552 |
+
hide_canvas=hide_canvas,
|
| 553 |
allow_webgl=allow_webgl,
|
|
|
|
|
|
|
|
|
|
| 554 |
network_idle=network_idle,
|
| 555 |
+
block_webrtc=block_webrtc,
|
| 556 |
wait_selector=wait_selector,
|
| 557 |
google_search=google_search,
|
| 558 |
extra_headers=extra_headers,
|
| 559 |
+
additional_args=additional_args,
|
| 560 |
solve_cloudflare=solve_cloudflare,
|
| 561 |
disable_resources=disable_resources,
|
| 562 |
wait_selector_state=wait_selector_state,
|
|
|
|
| 563 |
) as session:
|
| 564 |
tasks = [session.fetch(url) for url in urls]
|
| 565 |
responses = await gather(*tasks)
|
scrapling/engines/_browsers/_base.py
CHANGED
|
@@ -1,7 +1,6 @@
|
|
| 1 |
from time import time
|
| 2 |
from asyncio import sleep as asyncio_sleep, Lock
|
| 3 |
|
| 4 |
-
from camoufox import DefaultAddons
|
| 5 |
from playwright.sync_api._generated import Page
|
| 6 |
from playwright.sync_api import (
|
| 7 |
Frame,
|
|
@@ -17,18 +16,18 @@ from playwright.async_api import (
|
|
| 17 |
BrowserContext as AsyncBrowserContext,
|
| 18 |
)
|
| 19 |
from playwright._impl._errors import Error as PlaywrightError
|
| 20 |
-
from camoufox.pkgman import installed_verstr as camoufox_version
|
| 21 |
-
from camoufox.utils import launch_options as generate_launch_options
|
| 22 |
|
| 23 |
from ._page import PageInfo, PagePool
|
| 24 |
from scrapling.parser import Selector
|
| 25 |
-
from
|
| 26 |
-
from
|
| 27 |
-
from ._validators import validate, PlaywrightConfig, CamoufoxConfig
|
| 28 |
-
from ._config_tools import _compiled_stealth_scripts, _launch_kwargs, _context_kwargs
|
| 29 |
from scrapling.engines.toolbelt.navigation import intercept_route, async_intercept_route
|
| 30 |
-
|
| 31 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
| 32 |
|
| 33 |
|
| 34 |
class SyncSession:
|
|
@@ -84,10 +83,6 @@ class SyncSession:
|
|
| 84 |
if disable_resources:
|
| 85 |
page.route("**/*", intercept_route)
|
| 86 |
|
| 87 |
-
if getattr(self, "stealth", False):
|
| 88 |
-
for script in _compiled_stealth_scripts():
|
| 89 |
-
page.add_init_script(script=script)
|
| 90 |
-
|
| 91 |
page_info = self.page_pool.add_page(page)
|
| 92 |
page_info.mark_busy()
|
| 93 |
return page_info
|
|
@@ -202,10 +197,6 @@ class AsyncSession:
|
|
| 202 |
if disable_resources:
|
| 203 |
await page.route("**/*", async_intercept_route)
|
| 204 |
|
| 205 |
-
if getattr(self, "stealth", False):
|
| 206 |
-
for script in _compiled_stealth_scripts():
|
| 207 |
-
await page.add_init_script(script=script)
|
| 208 |
-
|
| 209 |
return self.page_pool.add_page(page)
|
| 210 |
|
| 211 |
def get_pool_stats(self) -> Dict[str, int]:
|
|
@@ -251,151 +242,118 @@ class AsyncSession:
|
|
| 251 |
return handle_response
|
| 252 |
|
| 253 |
|
| 254 |
-
class
|
| 255 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 256 |
if "__max_pages" in params:
|
| 257 |
params["max_pages"] = params.pop("__max_pages")
|
| 258 |
|
| 259 |
-
config = validate(params, model=
|
| 260 |
-
|
| 261 |
-
|
| 262 |
-
|
| 263 |
-
|
| 264 |
-
|
| 265 |
-
|
| 266 |
-
|
| 267 |
-
|
| 268 |
-
self.
|
| 269 |
-
|
| 270 |
-
|
| 271 |
-
|
| 272 |
-
|
| 273 |
-
|
| 274 |
-
|
| 275 |
-
|
| 276 |
-
|
| 277 |
-
|
| 278 |
-
|
| 279 |
-
|
| 280 |
-
|
| 281 |
-
|
| 282 |
-
self._extra_flags = config.extra_flags
|
| 283 |
-
self._selector_config = config.selector_config
|
| 284 |
-
self._timezone_id = config.timezone_id
|
| 285 |
-
self._additional_args = config.additional_args
|
| 286 |
-
self._page_action = config.page_action
|
| 287 |
-
self._user_data_dir = config.user_data_dir
|
| 288 |
-
self._headers_keys = {header.lower() for header in self._extra_headers.keys()} if self._extra_headers else set()
|
| 289 |
-
self.__initiate_browser_options__()
|
| 290 |
-
|
| 291 |
-
def __initiate_browser_options__(self):
|
| 292 |
-
if TYPE_CHECKING:
|
| 293 |
-
assert isinstance(self._proxy, tuple)
|
| 294 |
-
|
| 295 |
-
if not self._cdp_url:
|
| 296 |
-
# `launch_options` is used with persistent context
|
| 297 |
-
self.launch_options = dict(
|
| 298 |
-
_launch_kwargs(
|
| 299 |
-
self._headless,
|
| 300 |
-
self._proxy,
|
| 301 |
-
self._locale,
|
| 302 |
-
tuple(self._extra_headers.items()) if self._extra_headers else tuple(),
|
| 303 |
-
self._useragent,
|
| 304 |
-
self._real_chrome,
|
| 305 |
-
self._stealth,
|
| 306 |
-
self._hide_canvas,
|
| 307 |
-
self._disable_webgl,
|
| 308 |
-
self._timezone_id,
|
| 309 |
-
tuple(self._extra_flags) if self._extra_flags else tuple(),
|
| 310 |
-
)
|
| 311 |
)
|
| 312 |
-
|
| 313 |
-
|
| 314 |
-
self.
|
| 315 |
-
self.
|
| 316 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 317 |
else:
|
| 318 |
# while `context_options` is left to be used when cdp mode is enabled
|
| 319 |
-
self.
|
| 320 |
-
|
| 321 |
-
|
| 322 |
-
self._proxy,
|
| 323 |
-
self._locale,
|
| 324 |
-
tuple(self._extra_headers.items()) if self._extra_headers else tuple(),
|
| 325 |
-
self._useragent,
|
| 326 |
-
self._stealth,
|
| 327 |
-
)
|
| 328 |
-
)
|
| 329 |
-
self.context_options["extra_http_headers"] = dict(self.context_options["extra_http_headers"])
|
| 330 |
-
self.context_options["proxy"] = dict(self.context_options["proxy"]) or None
|
| 331 |
-
self.context_options.update(cast(Dict, self._additional_args))
|
| 332 |
|
| 333 |
|
| 334 |
-
class
|
| 335 |
def __validate__(self, **params):
|
| 336 |
-
|
| 337 |
-
|
| 338 |
|
| 339 |
-
|
| 340 |
-
|
| 341 |
-
|
| 342 |
-
self.
|
| 343 |
-
self.
|
| 344 |
-
|
| 345 |
-
|
| 346 |
-
|
| 347 |
-
|
| 348 |
-
|
| 349 |
-
|
| 350 |
-
|
| 351 |
-
|
| 352 |
-
|
| 353 |
-
self._page_action = config.page_action
|
| 354 |
-
self._wait_selector = config.wait_selector
|
| 355 |
-
self._init_script = config.init_script
|
| 356 |
-
self._addons = config.addons
|
| 357 |
-
self._wait_selector_state = config.wait_selector_state
|
| 358 |
-
self._cookies = config.cookies
|
| 359 |
-
self._google_search = config.google_search
|
| 360 |
-
self._extra_headers = config.extra_headers
|
| 361 |
-
self._proxy = config.proxy
|
| 362 |
-
self._os_randomize = config.os_randomize
|
| 363 |
-
self._disable_ads = config.disable_ads
|
| 364 |
-
self._geoip = config.geoip
|
| 365 |
-
self._selector_config = config.selector_config
|
| 366 |
-
self._additional_args = config.additional_args
|
| 367 |
-
self._user_data_dir = config.user_data_dir
|
| 368 |
-
self._headers_keys = {header.lower() for header in self._extra_headers.keys()} if self._extra_headers else set()
|
| 369 |
-
self.__initiate_browser_options__()
|
| 370 |
-
|
| 371 |
-
def __initiate_browser_options__(self):
|
| 372 |
-
"""Initiate browser options."""
|
| 373 |
-
self.launch_options: Dict[str, Any] = generate_launch_options(
|
| 374 |
-
**{
|
| 375 |
-
"geoip": self._geoip,
|
| 376 |
-
"proxy": dict(self._proxy) if self._proxy and isinstance(self._proxy, tuple) else self._proxy,
|
| 377 |
-
"addons": self._addons,
|
| 378 |
-
"exclude_addons": [] if self._disable_ads else [DefaultAddons.UBO],
|
| 379 |
-
"headless": self._headless,
|
| 380 |
-
"humanize": True if self._solve_cloudflare else self._humanize,
|
| 381 |
-
"i_know_what_im_doing": True, # To turn warnings off with the user configurations
|
| 382 |
-
"allow_webgl": self._allow_webgl,
|
| 383 |
-
"block_webrtc": self._block_webrtc,
|
| 384 |
-
"block_images": self._block_images, # Careful! it makes some websites don't finish loading at all like stackoverflow even in headful mode.
|
| 385 |
-
"os": None if self._os_randomize else get_os_name(),
|
| 386 |
-
"user_data_dir": self._user_data_dir,
|
| 387 |
-
"ff_version": __ff_version_str__,
|
| 388 |
-
"firefox_user_prefs": {
|
| 389 |
-
# This is what enabling `enable_cache` does internally, so we do it from here instead
|
| 390 |
-
"browser.sessionhistory.max_entries": 10,
|
| 391 |
-
"browser.sessionhistory.max_total_viewers": -1,
|
| 392 |
-
"browser.cache.memory.enable": True,
|
| 393 |
-
"browser.cache.disk_cache_ssl": True,
|
| 394 |
-
"browser.cache.disk.smart_size.enabled": True,
|
| 395 |
-
},
|
| 396 |
-
**cast(Dict, self._additional_args),
|
| 397 |
}
|
| 398 |
)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 399 |
|
| 400 |
@staticmethod
|
| 401 |
def _detect_cloudflare(page_content: str) -> str | None:
|
|
|
|
| 1 |
from time import time
|
| 2 |
from asyncio import sleep as asyncio_sleep, Lock
|
| 3 |
|
|
|
|
| 4 |
from playwright.sync_api._generated import Page
|
| 5 |
from playwright.sync_api import (
|
| 6 |
Frame,
|
|
|
|
| 16 |
BrowserContext as AsyncBrowserContext,
|
| 17 |
)
|
| 18 |
from playwright._impl._errors import Error as PlaywrightError
|
|
|
|
|
|
|
| 19 |
|
| 20 |
from ._page import PageInfo, PagePool
|
| 21 |
from scrapling.parser import Selector
|
| 22 |
+
from ._validators import validate, PlaywrightConfig, StealthConfig
|
| 23 |
+
from ._config_tools import __default_chrome_useragent__, __default_useragent__
|
|
|
|
|
|
|
| 24 |
from scrapling.engines.toolbelt.navigation import intercept_route, async_intercept_route
|
| 25 |
+
from scrapling.core._types import Any, cast, Dict, List, Optional, Callable, TYPE_CHECKING, overload, Tuple
|
| 26 |
+
from scrapling.engines.constants import (
|
| 27 |
+
DEFAULT_STEALTH_FLAGS,
|
| 28 |
+
HARMFUL_DEFAULT_ARGS,
|
| 29 |
+
DEFAULT_FLAGS,
|
| 30 |
+
)
|
| 31 |
|
| 32 |
|
| 33 |
class SyncSession:
|
|
|
|
| 83 |
if disable_resources:
|
| 84 |
page.route("**/*", intercept_route)
|
| 85 |
|
|
|
|
|
|
|
|
|
|
|
|
|
| 86 |
page_info = self.page_pool.add_page(page)
|
| 87 |
page_info.mark_busy()
|
| 88 |
return page_info
|
|
|
|
| 197 |
if disable_resources:
|
| 198 |
await page.route("**/*", async_intercept_route)
|
| 199 |
|
|
|
|
|
|
|
|
|
|
|
|
|
| 200 |
return self.page_pool.add_page(page)
|
| 201 |
|
| 202 |
def get_pool_stats(self) -> Dict[str, int]:
|
|
|
|
| 242 |
return handle_response
|
| 243 |
|
| 244 |
|
| 245 |
+
class BaseSessionMixin:
|
| 246 |
+
@overload
|
| 247 |
+
def __validate_routine__(self, params: Dict, model: type[StealthConfig]) -> StealthConfig: ...
|
| 248 |
+
|
| 249 |
+
@overload
|
| 250 |
+
def __validate_routine__(self, params: Dict, model: type[PlaywrightConfig]) -> PlaywrightConfig: ...
|
| 251 |
+
|
| 252 |
+
def __validate_routine__(
|
| 253 |
+
self, params: Dict, model: type[PlaywrightConfig] | type[StealthConfig]
|
| 254 |
+
) -> PlaywrightConfig | StealthConfig:
|
| 255 |
+
# Dark color scheme bypasses the 'prefersLightColor' check in creepjs
|
| 256 |
+
self._context_options: Dict[str, Any] = {"color_scheme": "dark", "device_scale_factor": 2}
|
| 257 |
+
self._launch_options: Dict[str, Any] = self._context_options | {
|
| 258 |
+
"args": DEFAULT_FLAGS,
|
| 259 |
+
"ignore_default_args": HARMFUL_DEFAULT_ARGS,
|
| 260 |
+
}
|
| 261 |
if "__max_pages" in params:
|
| 262 |
params["max_pages"] = params.pop("__max_pages")
|
| 263 |
|
| 264 |
+
config = validate(params, model=model)
|
| 265 |
+
self._headers_keys = (
|
| 266 |
+
{header.lower() for header in config.extra_headers.keys()} if config.extra_headers else set()
|
| 267 |
+
)
|
| 268 |
+
|
| 269 |
+
return config
|
| 270 |
+
|
| 271 |
+
def __generate_options__(self, extra_flags: Tuple | None = None) -> None:
|
| 272 |
+
config = cast(PlaywrightConfig, getattr(self, "_config", None))
|
| 273 |
+
self._context_options.update(
|
| 274 |
+
{
|
| 275 |
+
"proxy": config.proxy,
|
| 276 |
+
"locale": config.locale,
|
| 277 |
+
"timezone_id": config.timezone_id,
|
| 278 |
+
"extra_http_headers": config.extra_headers,
|
| 279 |
+
}
|
| 280 |
+
)
|
| 281 |
+
# The default useragent in the headful is always correct now in the current versions of Playwright
|
| 282 |
+
if config.useragent:
|
| 283 |
+
self._context_options["user_agent"] = config.useragent
|
| 284 |
+
elif not config.useragent and config.headless:
|
| 285 |
+
self._context_options["user_agent"] = (
|
| 286 |
+
__default_chrome_useragent__ if config.real_chrome else __default_useragent__
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 287 |
)
|
| 288 |
+
|
| 289 |
+
if not config.cdp_url:
|
| 290 |
+
self._launch_options |= self._context_options
|
| 291 |
+
self._context_options = {}
|
| 292 |
+
flags = self._launch_options["args"]
|
| 293 |
+
if config.extra_flags or extra_flags:
|
| 294 |
+
flags = list(set(flags + (config.extra_flags or extra_flags)))
|
| 295 |
+
|
| 296 |
+
self._launch_options.update(
|
| 297 |
+
{
|
| 298 |
+
"args": flags,
|
| 299 |
+
"headless": config.headless,
|
| 300 |
+
"user_data_dir": config.user_data_dir,
|
| 301 |
+
"channel": "chrome" if config.real_chrome else "chromium",
|
| 302 |
+
}
|
| 303 |
+
)
|
| 304 |
+
|
| 305 |
+
if config.additional_args:
|
| 306 |
+
self._launch_options.update(config.additional_args)
|
| 307 |
else:
|
| 308 |
# while `context_options` is left to be used when cdp mode is enabled
|
| 309 |
+
self._launch_options = dict()
|
| 310 |
+
if config.additional_args:
|
| 311 |
+
self._context_options.update(config.additional_args)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 312 |
|
| 313 |
|
| 314 |
+
class DynamicSessionMixin(BaseSessionMixin):
    """Session mixin that validates its parameters against `PlaywrightConfig`."""

    def __validate__(self, **params):
        """Validate `params`, keep the resulting config in `self._config`, and generate the browser options."""
        validated = self.__validate_routine__(params, model=PlaywrightConfig)
        self._config = validated
        self.__generate_options__()
|
| 318 |
|
| 319 |
+
|
| 320 |
+
class StealthySessionMixin(BaseSessionMixin):
    """Session mixin that validates its parameters against `StealthConfig` and adds stealth-specific options."""

    def __validate__(self, **params):
        """Validate `params`, add the fixed stealth context options, then generate the browser options."""
        self._config: StealthConfig = self.__validate_routine__(params, model=StealthConfig)

        stealth_context: Dict[str, Any] = {}
        stealth_context["is_mobile"] = False
        stealth_context["has_touch"] = False
        # I'm thinking about disabling it to rest from all Service Workers' headache, but let's keep it as it is for now
        stealth_context["service_workers"] = "allow"
        stealth_context["ignore_https_errors"] = True
        # Screen and viewport share the same size but are kept as two separate dictionaries
        stealth_context["screen"] = {"width": 1920, "height": 1080}
        stealth_context["viewport"] = {"width": 1920, "height": 1080}
        stealth_context["permissions"] = ["geolocation", "notifications"]

        self._context_options.update(stealth_context)
        self.__generate_stealth_options()

    def __generate_stealth_options(self) -> None:
        """Collect the stealth browser flags and hand them to the base `__generate_options__`.

        When `cdp_url` is set, no flags are collected and an empty tuple is passed on.
        """
        config = self._config
        stealth_flags = ()
        if not config.cdp_url:
            stealth_flags = DEFAULT_FLAGS + DEFAULT_STEALTH_FLAGS

            if config.block_webrtc:
                stealth_flags += (
                    "--webrtc-ip-handling-policy=disable_non_proxied_udp",
                    "--force-webrtc-ip-handling-policy",  # Ensures the policy is enforced
                )

            webgl_disabled = not config.allow_webgl
            if webgl_disabled:
                stealth_flags += (
                    "--disable-webgl",
                    "--disable-webgl-image-chromium",
                    "--disable-webgl2",
                )

            if config.hide_canvas:
                stealth_flags += ("--fingerprinting-canvas-image-data-noise",)

        super().__generate_options__(stealth_flags)
|
| 357 |
|
| 358 |
@staticmethod
|
| 359 |
def _detect_cloudflare(page_content: str) -> str | None:
|
scrapling/engines/_browsers/_config_tools.py
CHANGED
|
@@ -58,88 +58,3 @@ def _set_flags(hide_canvas, disable_webgl): # pragma: no cover
|
|
| 58 |
)
|
| 59 |
|
| 60 |
return flags
|
| 61 |
-
|
| 62 |
-
|
| 63 |
-
@lru_cache(2, typed=True)
|
| 64 |
-
def _launch_kwargs(
|
| 65 |
-
headless,
|
| 66 |
-
proxy: Tuple,
|
| 67 |
-
locale,
|
| 68 |
-
extra_headers,
|
| 69 |
-
useragent,
|
| 70 |
-
real_chrome,
|
| 71 |
-
stealth,
|
| 72 |
-
hide_canvas,
|
| 73 |
-
disable_webgl,
|
| 74 |
-
timezone_id,
|
| 75 |
-
extra_flags: Tuple,
|
| 76 |
-
) -> Tuple:
|
| 77 |
-
"""Creates the arguments we will use while launching playwright's browser"""
|
| 78 |
-
base_args = DEFAULT_FLAGS
|
| 79 |
-
if extra_flags:
|
| 80 |
-
base_args = base_args + extra_flags
|
| 81 |
-
|
| 82 |
-
launch_kwargs = {
|
| 83 |
-
"locale": locale,
|
| 84 |
-
"timezone_id": timezone_id or None,
|
| 85 |
-
"headless": headless,
|
| 86 |
-
"args": base_args,
|
| 87 |
-
"color_scheme": "dark", # Bypasses the 'prefersLightColor' check in creepjs
|
| 88 |
-
"proxy": proxy or tuple(),
|
| 89 |
-
"device_scale_factor": 2,
|
| 90 |
-
"ignore_default_args": HARMFUL_DEFAULT_ARGS,
|
| 91 |
-
"channel": "chrome" if real_chrome else "chromium",
|
| 92 |
-
"extra_http_headers": extra_headers or tuple(),
|
| 93 |
-
}
|
| 94 |
-
# The default useragent in the headful is always correct now in the current versions of Playwright
|
| 95 |
-
if useragent:
|
| 96 |
-
launch_kwargs["user_agent"] = useragent
|
| 97 |
-
elif not useragent and headless:
|
| 98 |
-
launch_kwargs["user_agent"] = __default_chrome_useragent__ if real_chrome else __default_useragent__
|
| 99 |
-
|
| 100 |
-
if stealth:
|
| 101 |
-
stealth_args = base_args + _set_flags(hide_canvas, disable_webgl)
|
| 102 |
-
launch_kwargs.update(
|
| 103 |
-
{
|
| 104 |
-
"args": stealth_args,
|
| 105 |
-
"chromium_sandbox": True,
|
| 106 |
-
"is_mobile": False,
|
| 107 |
-
"has_touch": False,
|
| 108 |
-
# I'm thinking about disabling it to rest from all Service Workers' headache, but let's keep it as it is for now
|
| 109 |
-
"service_workers": "allow",
|
| 110 |
-
"ignore_https_errors": True,
|
| 111 |
-
"screen": {"width": 1920, "height": 1080},
|
| 112 |
-
"viewport": {"width": 1920, "height": 1080},
|
| 113 |
-
"permissions": ["geolocation", "notifications"],
|
| 114 |
-
}
|
| 115 |
-
)
|
| 116 |
-
|
| 117 |
-
return tuple(launch_kwargs.items())
|
| 118 |
-
|
| 119 |
-
|
| 120 |
-
@lru_cache(2, typed=True)
|
| 121 |
-
def _context_kwargs(proxy, locale, extra_headers, useragent, stealth) -> Tuple:
|
| 122 |
-
"""Creates the arguments for the browser context"""
|
| 123 |
-
context_kwargs = {
|
| 124 |
-
"proxy": proxy or tuple(),
|
| 125 |
-
"locale": locale,
|
| 126 |
-
"color_scheme": "dark", # Bypasses the 'prefersLightColor' check in creepjs
|
| 127 |
-
"device_scale_factor": 2,
|
| 128 |
-
"extra_http_headers": extra_headers or tuple(),
|
| 129 |
-
"user_agent": useragent or __default_useragent__,
|
| 130 |
-
}
|
| 131 |
-
if stealth:
|
| 132 |
-
context_kwargs.update(
|
| 133 |
-
{
|
| 134 |
-
"is_mobile": False,
|
| 135 |
-
"has_touch": False,
|
| 136 |
-
# I'm thinking about disabling it to rest from all Service Workers' headache, but let's keep it as it is for now
|
| 137 |
-
"service_workers": "allow",
|
| 138 |
-
"ignore_https_errors": True,
|
| 139 |
-
"screen": {"width": 1920, "height": 1080},
|
| 140 |
-
"viewport": {"width": 1920, "height": 1080},
|
| 141 |
-
"permissions": ["geolocation", "notifications"],
|
| 142 |
-
}
|
| 143 |
-
)
|
| 144 |
-
|
| 145 |
-
return tuple(context_kwargs.items())
|
|
|
|
| 58 |
)
|
| 59 |
|
| 60 |
return flags
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
scrapling/engines/_browsers/_controllers.py
CHANGED
|
@@ -9,8 +9,6 @@ from playwright.async_api import (
|
|
| 9 |
Playwright as AsyncPlaywright,
|
| 10 |
BrowserContext as AsyncBrowserContext,
|
| 11 |
)
|
| 12 |
-
from patchright.sync_api import sync_playwright as sync_patchright
|
| 13 |
-
from patchright.async_api import async_playwright as async_patchright
|
| 14 |
|
| 15 |
from scrapling.core.utils import log
|
| 16 |
from scrapling.core._types import Unpack, TYPE_CHECKING
|
|
@@ -21,44 +19,19 @@ from scrapling.engines.toolbelt.convertor import Response, ResponseFactory
|
|
| 21 |
from scrapling.engines.toolbelt.fingerprints import generate_convincing_referer
|
| 22 |
|
| 23 |
|
| 24 |
-
class DynamicSession(
|
| 25 |
"""A Browser session manager with page pooling."""
|
| 26 |
|
| 27 |
__slots__ = (
|
| 28 |
-
"
|
| 29 |
-
"
|
| 30 |
-
"
|
| 31 |
-
"
|
| 32 |
-
"
|
| 33 |
-
"
|
| 34 |
-
"_google_search",
|
| 35 |
-
"_proxy",
|
| 36 |
-
"_locale",
|
| 37 |
-
"_extra_headers",
|
| 38 |
-
"_useragent",
|
| 39 |
-
"_timeout",
|
| 40 |
-
"_cookies",
|
| 41 |
-
"_disable_resources",
|
| 42 |
-
"_network_idle",
|
| 43 |
-
"_load_dom",
|
| 44 |
-
"_wait_selector",
|
| 45 |
-
"_init_script",
|
| 46 |
-
"_wait_selector_state",
|
| 47 |
-
"_wait",
|
| 48 |
"playwright",
|
| 49 |
-
"browser",
|
| 50 |
"context",
|
| 51 |
-
"page_pool",
|
| 52 |
"_closed",
|
| 53 |
-
"_selector_config",
|
| 54 |
-
"_page_action",
|
| 55 |
-
"launch_options",
|
| 56 |
-
"context_options",
|
| 57 |
-
"_cdp_url",
|
| 58 |
-
"_headers_keys",
|
| 59 |
-
"_extra_flags",
|
| 60 |
-
"_additional_args",
|
| 61 |
-
"_user_data_dir",
|
| 62 |
)
|
| 63 |
|
| 64 |
def __init__(self, **kwargs: Unpack[PlaywrightSession]):
|
|
@@ -76,8 +49,9 @@ class DynamicSession(DynamicSessionMixin, SyncSession):
|
|
| 76 |
:param page_action: Added for automation. A function that takes the `page` object and does the automation you need.
|
| 77 |
:param wait_selector: Wait for a specific CSS selector to be in a specific state.
|
| 78 |
:param init_script: An absolute path to a JavaScript file to be executed on page creation for all pages in this session.
|
| 79 |
-
:param locale:
|
| 80 |
-
|
|
|
|
| 81 |
:param wait_selector_state: The state to wait for the selector given with `wait_selector`. The default state is `attached`.
|
| 82 |
:param stealth: Enables stealth mode, check the documentation to see what stealth mode does currently.
|
| 83 |
:param real_chrome: If you have a Chrome browser installed on your device, enable this, and the Fetcher will launch an instance of your browser and use it.
|
|
@@ -94,27 +68,24 @@ class DynamicSession(DynamicSessionMixin, SyncSession):
|
|
| 94 |
:param additional_args: Additional arguments to be passed to Playwright's context as additional settings, and it takes higher priority than Scrapling's settings.
|
| 95 |
"""
|
| 96 |
self.__validate__(**kwargs)
|
| 97 |
-
super().__init__(
|
| 98 |
|
| 99 |
def start(self):
|
| 100 |
"""Create a browser for this instance and context."""
|
| 101 |
if not self.playwright:
|
| 102 |
-
|
| 103 |
-
|
| 104 |
-
self.playwright: Playwright = sync_context().start() # pyright: ignore [reportAttributeAccessIssue]
|
| 105 |
|
| 106 |
-
if self.
|
| 107 |
-
|
| 108 |
-
|
| 109 |
-
)
|
| 110 |
else:
|
| 111 |
-
self.context = self.playwright.chromium.launch_persistent_context(**self.
|
| 112 |
|
| 113 |
-
if self.
|
| 114 |
-
self.context.add_init_script(path=self.
|
| 115 |
|
| 116 |
-
if self.
|
| 117 |
-
self.context.add_cookies(self.
|
| 118 |
else:
|
| 119 |
raise RuntimeError("Session has been already started")
|
| 120 |
|
|
@@ -139,7 +110,6 @@ class DynamicSession(DynamicSessionMixin, SyncSession):
|
|
| 139 |
:return: A `Response` object.
|
| 140 |
"""
|
| 141 |
params = _validate(kwargs, self, PlaywrightConfig)
|
| 142 |
-
|
| 143 |
if self._closed: # pragma: no cover
|
| 144 |
raise RuntimeError("Context manager has been closed")
|
| 145 |
|
|
@@ -193,7 +163,7 @@ class DynamicSession(DynamicSessionMixin, SyncSession):
|
|
| 193 |
raise e
|
| 194 |
|
| 195 |
|
| 196 |
-
class AsyncDynamicSession(
|
| 197 |
"""An async Browser session manager with page pooling, it's using a persistent browser Context by default with a temporary user profile directory."""
|
| 198 |
|
| 199 |
def __init__(self, **kwargs: Unpack[PlaywrightSession]):
|
|
@@ -212,8 +182,9 @@ class AsyncDynamicSession(DynamicSessionMixin, AsyncSession):
|
|
| 212 |
:param page_action: Added for automation. A function that takes the `page` object and does the automation you need.
|
| 213 |
:param wait_selector: Wait for a specific CSS selector to be in a specific state.
|
| 214 |
:param init_script: An absolute path to a JavaScript file to be executed on page creation for all pages in this session.
|
| 215 |
-
:param locale:
|
| 216 |
-
|
|
|
|
| 217 |
:param wait_selector_state: The state to wait for the selector given with `wait_selector`. The default state is `attached`.
|
| 218 |
:param stealth: Enables stealth mode, check the documentation to see what stealth mode does currently.
|
| 219 |
:param real_chrome: If you have a Chrome browser installed on your device, enable this, and the Fetcher will launch an instance of your browser and use it.
|
|
@@ -230,28 +201,26 @@ class AsyncDynamicSession(DynamicSessionMixin, AsyncSession):
|
|
| 230 |
:param additional_args: Additional arguments to be passed to Playwright's context as additional settings, and it takes higher priority than Scrapling's settings.
|
| 231 |
"""
|
| 232 |
self.__validate__(**kwargs)
|
| 233 |
-
super().__init__(max_pages=self.
|
| 234 |
|
| 235 |
async def start(self):
|
| 236 |
"""Create a browser for this instance and context."""
|
| 237 |
if not self.playwright:
|
| 238 |
-
|
| 239 |
-
|
| 240 |
-
self.playwright: AsyncPlaywright = await async_context().start() # pyright: ignore [reportAttributeAccessIssue]
|
| 241 |
|
| 242 |
-
if self.
|
| 243 |
-
browser = await self.playwright.chromium.connect_over_cdp(endpoint_url=self.
|
| 244 |
-
self.context: AsyncBrowserContext = await browser.new_context(**self.
|
| 245 |
else:
|
| 246 |
self.context: AsyncBrowserContext = await self.playwright.chromium.launch_persistent_context(
|
| 247 |
-
**self.
|
| 248 |
)
|
| 249 |
|
| 250 |
-
if self.
|
| 251 |
-
await self.context.add_init_script(path=self.
|
| 252 |
|
| 253 |
-
if self.
|
| 254 |
-
await self.context.add_cookies(self.
|
| 255 |
else:
|
| 256 |
raise RuntimeError("Session has been already started")
|
| 257 |
|
|
|
|
| 9 |
Playwright as AsyncPlaywright,
|
| 10 |
BrowserContext as AsyncBrowserContext,
|
| 11 |
)
|
|
|
|
|
|
|
| 12 |
|
| 13 |
from scrapling.core.utils import log
|
| 14 |
from scrapling.core._types import Unpack, TYPE_CHECKING
|
|
|
|
| 19 |
from scrapling.engines.toolbelt.fingerprints import generate_convincing_referer
|
| 20 |
|
| 21 |
|
| 22 |
+
class DynamicSession(SyncSession, DynamicSessionMixin):
|
| 23 |
"""A Browser session manager with page pooling."""
|
| 24 |
|
| 25 |
__slots__ = (
|
| 26 |
+
"_config",
|
| 27 |
+
"_context_options",
|
| 28 |
+
"_launch_options",
|
| 29 |
+
"max_pages",
|
| 30 |
+
"page_pool",
|
| 31 |
+
"_max_wait_for_page",
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 32 |
"playwright",
|
|
|
|
| 33 |
"context",
|
|
|
|
| 34 |
"_closed",
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 35 |
)
|
| 36 |
|
| 37 |
def __init__(self, **kwargs: Unpack[PlaywrightSession]):
|
|
|
|
| 49 |
:param page_action: Added for automation. A function that takes the `page` object and does the automation you need.
|
| 50 |
:param wait_selector: Wait for a specific CSS selector to be in a specific state.
|
| 51 |
:param init_script: An absolute path to a JavaScript file to be executed on page creation for all pages in this session.
|
| 52 |
+
:param locale: Specify user locale, for example, `en-GB`, `de-DE`, etc. Locale will affect navigator.language value, Accept-Language request header value as well as number and date formatting
|
| 53 |
+
rules. Defaults to the system default locale.
|
| 54 |
+
:param timezone_id: Changes the timezone of the browser. Defaults to the system timezone.
|
| 55 |
:param wait_selector_state: The state to wait for the selector given with `wait_selector`. The default state is `attached`.
|
| 56 |
:param stealth: Enables stealth mode, check the documentation to see what stealth mode does currently.
|
| 57 |
:param real_chrome: If you have a Chrome browser installed on your device, enable this, and the Fetcher will launch an instance of your browser and use it.
|
|
|
|
| 68 |
:param additional_args: Additional arguments to be passed to Playwright's context as additional settings, and it takes higher priority than Scrapling's settings.
|
| 69 |
"""
|
| 70 |
self.__validate__(**kwargs)
|
| 71 |
+
super().__init__()
|
| 72 |
|
| 73 |
def start(self):
|
| 74 |
"""Create a browser for this instance and context."""
|
| 75 |
if not self.playwright:
|
| 76 |
+
self.playwright: Playwright = sync_playwright().start() # pyright: ignore [reportAttributeAccessIssue]
|
|
|
|
|
|
|
| 77 |
|
| 78 |
+
if self._config.cdp_url: # pragma: no cover
|
| 79 |
+
browser = self.playwright.chromium.connect_over_cdp(endpoint_url=self._config.cdp_url)
|
| 80 |
+
self.context = browser.new_context(**self._context_options)
|
|
|
|
| 81 |
else:
|
| 82 |
+
self.context = self.playwright.chromium.launch_persistent_context(**self._launch_options)
|
| 83 |
|
| 84 |
+
if self._config.init_script: # pragma: no cover
|
| 85 |
+
self.context.add_init_script(path=self._config.init_script)
|
| 86 |
|
| 87 |
+
if self._config.cookies: # pragma: no cover
|
| 88 |
+
self.context.add_cookies(self._config.cookies)
|
| 89 |
else:
|
| 90 |
raise RuntimeError("Session has been already started")
|
| 91 |
|
|
|
|
| 110 |
:return: A `Response` object.
|
| 111 |
"""
|
| 112 |
params = _validate(kwargs, self, PlaywrightConfig)
|
|
|
|
| 113 |
if self._closed: # pragma: no cover
|
| 114 |
raise RuntimeError("Context manager has been closed")
|
| 115 |
|
|
|
|
| 163 |
raise e
|
| 164 |
|
| 165 |
|
| 166 |
+
class AsyncDynamicSession(AsyncSession, DynamicSessionMixin):
|
| 167 |
"""An async Browser session manager with page pooling, it's using a persistent browser Context by default with a temporary user profile directory."""
|
| 168 |
|
| 169 |
def __init__(self, **kwargs: Unpack[PlaywrightSession]):
|
|
|
|
| 182 |
:param page_action: Added for automation. A function that takes the `page` object and does the automation you need.
|
| 183 |
:param wait_selector: Wait for a specific CSS selector to be in a specific state.
|
| 184 |
:param init_script: An absolute path to a JavaScript file to be executed on page creation for all pages in this session.
|
| 185 |
+
:param locale: Specify user locale, for example, `en-GB`, `de-DE`, etc. Locale will affect navigator.language value, Accept-Language request header value as well as number and date formatting
|
| 186 |
+
rules. Defaults to the system default locale.
|
| 187 |
+
:param timezone_id: Changes the timezone of the browser. Defaults to the system timezone.
|
| 188 |
:param wait_selector_state: The state to wait for the selector given with `wait_selector`. The default state is `attached`.
|
| 189 |
:param stealth: Enables stealth mode, check the documentation to see what stealth mode does currently.
|
| 190 |
:param real_chrome: If you have a Chrome browser installed on your device, enable this, and the Fetcher will launch an instance of your browser and use it.
|
|
|
|
| 201 |
:param additional_args: Additional arguments to be passed to Playwright's context as additional settings, and it takes higher priority than Scrapling's settings.
|
| 202 |
"""
|
| 203 |
self.__validate__(**kwargs)
|
| 204 |
+
super().__init__(max_pages=self._config.max_pages)
|
| 205 |
|
| 206 |
async def start(self):
|
| 207 |
"""Create a browser for this instance and context."""
|
| 208 |
if not self.playwright:
|
| 209 |
+
self.playwright: AsyncPlaywright = await async_playwright().start() # pyright: ignore [reportAttributeAccessIssue]
|
|
|
|
|
|
|
| 210 |
|
| 211 |
+
if self._config.cdp_url:
|
| 212 |
+
browser = await self.playwright.chromium.connect_over_cdp(endpoint_url=self._config.cdp_url)
|
| 213 |
+
self.context: AsyncBrowserContext = await browser.new_context(**self._context_options)
|
| 214 |
else:
|
| 215 |
self.context: AsyncBrowserContext = await self.playwright.chromium.launch_persistent_context(
|
| 216 |
+
**self._launch_options
|
| 217 |
)
|
| 218 |
|
| 219 |
+
if self._config.init_script: # pragma: no cover
|
| 220 |
+
await self.context.add_init_script(path=self._config.init_script)
|
| 221 |
|
| 222 |
+
if self._config.cookies:
|
| 223 |
+
await self.context.add_cookies(self._config.cookies) # pyright: ignore
|
| 224 |
else:
|
| 225 |
raise RuntimeError("Session has been already started")
|
| 226 |
|
scrapling/engines/_browsers/{_camoufox.py → _stealth.py}
RENAMED
|
@@ -2,117 +2,102 @@ from random import randint
|
|
| 2 |
from re import compile as re_compile
|
| 3 |
|
| 4 |
from playwright.sync_api import (
|
| 5 |
-
Page,
|
| 6 |
Locator,
|
| 7 |
-
|
|
|
|
| 8 |
)
|
| 9 |
from playwright.async_api import (
|
| 10 |
-
async_playwright,
|
| 11 |
Page as async_Page,
|
| 12 |
Locator as AsyncLocator,
|
| 13 |
Playwright as AsyncPlaywright,
|
| 14 |
BrowserContext as AsyncBrowserContext,
|
| 15 |
)
|
|
|
|
|
|
|
| 16 |
|
| 17 |
from scrapling.core.utils import log
|
| 18 |
-
from ._types import
|
| 19 |
-
from
|
|
|
|
| 20 |
from ._base import SyncSession, AsyncSession, StealthySessionMixin
|
| 21 |
-
from ._validators import validate_fetch as _validate,
|
| 22 |
from scrapling.engines.toolbelt.convertor import Response, ResponseFactory
|
| 23 |
from scrapling.engines.toolbelt.fingerprints import generate_convincing_referer
|
| 24 |
|
| 25 |
__CF_PATTERN__ = re_compile("challenges.cloudflare.com/cdn-cgi/challenge-platform/.*")
|
| 26 |
|
| 27 |
|
| 28 |
-
class StealthySession(
|
| 29 |
-
"""A Stealthy session manager with page pooling."""
|
| 30 |
|
| 31 |
__slots__ = (
|
| 32 |
-
"
|
| 33 |
-
"
|
| 34 |
-
"
|
| 35 |
-
"
|
| 36 |
-
"
|
| 37 |
-
"
|
| 38 |
-
"_network_idle",
|
| 39 |
-
"_load_dom",
|
| 40 |
-
"_humanize",
|
| 41 |
-
"_solve_cloudflare",
|
| 42 |
-
"_wait",
|
| 43 |
-
"_timeout",
|
| 44 |
-
"_page_action",
|
| 45 |
-
"_wait_selector",
|
| 46 |
-
"_init_script",
|
| 47 |
-
"_addons",
|
| 48 |
-
"_wait_selector_state",
|
| 49 |
-
"_cookies",
|
| 50 |
-
"_google_search",
|
| 51 |
-
"_extra_headers",
|
| 52 |
-
"_proxy",
|
| 53 |
-
"_os_randomize",
|
| 54 |
-
"_disable_ads",
|
| 55 |
-
"_geoip",
|
| 56 |
-
"_selector_config",
|
| 57 |
-
"_additional_args",
|
| 58 |
"playwright",
|
| 59 |
-
"browser",
|
| 60 |
"context",
|
| 61 |
-
"page_pool",
|
| 62 |
"_closed",
|
| 63 |
-
"launch_options",
|
| 64 |
-
"_headers_keys",
|
| 65 |
-
"_user_data_dir",
|
| 66 |
)
|
| 67 |
|
| 68 |
-
def __init__(self, **kwargs: Unpack[
|
| 69 |
-
"""A Browser session manager with page pooling
|
| 70 |
|
| 71 |
:param headless: Run the browser in headless/hidden (default), or headful/visible mode.
|
| 72 |
-
:param block_images: Prevent the loading of images through Firefox preferences.
|
| 73 |
-
This can help save your proxy usage but be careful with this option as it makes some websites never finish loading.
|
| 74 |
:param disable_resources: Drop requests of unnecessary resources for a speed boost. It depends, but it made requests ~25% faster in my tests for some websites.
|
| 75 |
Requests dropped are of type `font`, `image`, `media`, `beacon`, `object`, `imageset`, `texttrack`, `websocket`, `csp_report`, and `stylesheet`.
|
| 76 |
This can help save your proxy usage but be careful with this option as it makes some websites never finish loading.
|
| 77 |
-
:param
|
| 78 |
:param cookies: Set cookies for the next request.
|
| 79 |
-
:param addons: List of Firefox addons to use. Must be paths to extracted addons.
|
| 80 |
-
:param humanize: Humanize the cursor movement. Takes either True or the MAX duration in seconds of the cursor movement. The cursor typically takes up to 1.5 seconds to move across the window.
|
| 81 |
-
:param solve_cloudflare: Solves all types of the Cloudflare's Turnstile/Interstitial challenges before returning the response to you.
|
| 82 |
-
:param allow_webgl: Enabled by default. Disabling WebGL is not recommended as many WAFs now check if WebGL is enabled.
|
| 83 |
:param network_idle: Wait for the page until there are no network connections for at least 500 ms.
|
| 84 |
-
:param load_dom: Enabled by default, wait for all JavaScript on page(s) to fully load and execute.
|
| 85 |
-
:param disable_ads: Disabled by default, this installs the `uBlock Origin` addon on the browser if enabled.
|
| 86 |
-
:param os_randomize: If enabled, Scrapling will randomize the OS fingerprints used. The default is Scrapling matching the fingerprints with the current OS.
|
| 87 |
-
:param wait: The time (milliseconds) the fetcher will wait after everything finishes before closing the page and returning the ` Response ` object.
|
| 88 |
:param timeout: The timeout in milliseconds that is used in all operations and waits through the page. The default is 30,000
|
|
|
|
| 89 |
:param page_action: Added for automation. A function that takes the `page` object and does the automation you need.
|
| 90 |
:param wait_selector: Wait for a specific CSS selector to be in a specific state.
|
| 91 |
:param init_script: An absolute path to a JavaScript file to be executed on page creation for all pages in this session.
|
| 92 |
-
:param
|
| 93 |
-
|
|
|
|
| 94 |
:param wait_selector_state: The state to wait for the selector given with `wait_selector`. The default state is `attached`.
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 95 |
:param google_search: Enabled by default, Scrapling will set the referer header to be as if this request came from a Google search of this website's domain name.
|
| 96 |
:param extra_headers: A dictionary of extra headers to add to the request. _The referer set by the `google_search` argument takes priority over the referer set here if used together._
|
| 97 |
:param proxy: The proxy to be used with requests, it can be a string or a dictionary with the keys 'server', 'username', and 'password' only.
|
| 98 |
:param user_data_dir: Path to a User Data Directory, which stores browser session data like cookies and local storage. The default is to create a temporary directory.
|
|
|
|
| 99 |
:param selector_config: The arguments that will be passed in the end while creating the final Selector's class.
|
| 100 |
-
:param additional_args: Additional arguments to be passed to
|
| 101 |
"""
|
| 102 |
self.__validate__(**kwargs)
|
| 103 |
-
super().__init__(
|
| 104 |
|
| 105 |
def start(self):
|
| 106 |
"""Create a browser for this instance and context."""
|
| 107 |
if not self.playwright:
|
| 108 |
-
self.playwright = sync_playwright().start()
|
| 109 |
-
self.context = self.playwright.firefox.launch_persistent_context(**self.launch_options)
|
| 110 |
|
| 111 |
-
if self.
|
| 112 |
-
self.
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 113 |
|
| 114 |
-
if self.
|
| 115 |
-
self.context.add_cookies(self.
|
| 116 |
else:
|
| 117 |
raise RuntimeError("Session has been already started")
|
| 118 |
|
|
@@ -148,22 +133,27 @@ class StealthySession(StealthySessionMixin, SyncSession):
|
|
| 148 |
outer_box = {}
|
| 149 |
iframe = page.frame(url=__CF_PATTERN__)
|
| 150 |
if iframe is not None:
|
| 151 |
-
self._wait_for_page_stability(iframe, True,
|
| 152 |
|
| 153 |
if challenge_type != "embedded":
|
| 154 |
while not iframe.frame_element().is_visible():
|
| 155 |
# Double-checking that the iframe is loaded
|
| 156 |
page.wait_for_timeout(500)
|
|
|
|
| 157 |
outer_box: Any = iframe.frame_element().bounding_box()
|
| 158 |
|
| 159 |
if not iframe or not outer_box:
|
|
|
|
|
|
|
|
|
|
|
|
|
| 160 |
outer_box: Any = page.locator(box_selector).last.bounding_box()
|
| 161 |
|
| 162 |
# Calculate the Captcha coordinates for any viewport
|
| 163 |
captcha_x, captcha_y = outer_box["x"] + randint(26, 28), outer_box["y"] + randint(25, 27)
|
| 164 |
|
| 165 |
# Move the mouse to the center of the window, then press and hold the left mouse button
|
| 166 |
-
page.mouse.click(captcha_x, captcha_y, delay=
|
| 167 |
self._wait_for_networkidle(page)
|
| 168 |
if iframe is not None:
|
| 169 |
# Wait for the frame to be removed from the page (with 30s timeout = 300 iterations * 100 ms)
|
|
@@ -182,7 +172,7 @@ class StealthySession(StealthySessionMixin, SyncSession):
|
|
| 182 |
log.info("Cloudflare captcha is solved")
|
| 183 |
return
|
| 184 |
|
| 185 |
-
def fetch(self, url: str, **kwargs: Unpack[
|
| 186 |
"""Opens up the browser and do your request based on your chosen options.
|
| 187 |
|
| 188 |
:param url: The Target url.
|
|
@@ -203,8 +193,7 @@ class StealthySession(StealthySessionMixin, SyncSession):
|
|
| 203 |
- selector_config: The arguments that will be passed in the end while creating the final Selector's class.
|
| 204 |
:return: A `Response` object.
|
| 205 |
"""
|
| 206 |
-
params = _validate(kwargs, self,
|
| 207 |
-
|
| 208 |
if self._closed: # pragma: no cover
|
| 209 |
raise RuntimeError("Context manager has been closed")
|
| 210 |
|
|
@@ -233,7 +222,7 @@ class StealthySession(StealthySessionMixin, SyncSession):
|
|
| 233 |
if params.page_action:
|
| 234 |
try:
|
| 235 |
_ = params.page_action(page_info.page)
|
| 236 |
-
except Exception as e:
|
| 237 |
log.error(f"Error executing page_action: {e}")
|
| 238 |
|
| 239 |
if params.wait_selector:
|
|
@@ -242,10 +231,12 @@ class StealthySession(StealthySessionMixin, SyncSession):
|
|
| 242 |
waiter.first.wait_for(state=params.wait_selector_state)
|
| 243 |
# Wait again after waiting for the selector, helpful with protections like Cloudflare
|
| 244 |
self._wait_for_page_stability(page_info.page, params.load_dom, params.network_idle)
|
| 245 |
-
except Exception as e:
|
| 246 |
log.error(f"Error waiting for selector {params.wait_selector}: {e}")
|
| 247 |
|
| 248 |
page_info.page.wait_for_timeout(params.wait)
|
|
|
|
|
|
|
| 249 |
response = ResponseFactory.from_playwright_response(
|
| 250 |
page_info.page, first_response, final_response[0], params.selector_config
|
| 251 |
)
|
|
@@ -256,72 +247,79 @@ class StealthySession(StealthySessionMixin, SyncSession):
|
|
| 256 |
|
| 257 |
return response
|
| 258 |
|
| 259 |
-
except Exception as e:
|
| 260 |
page_info.mark_error()
|
| 261 |
raise e
|
| 262 |
|
| 263 |
|
| 264 |
-
class AsyncStealthySession(
|
| 265 |
-
"""
|
| 266 |
|
| 267 |
-
def __init__(self, **kwargs: Unpack[
|
| 268 |
-
"""A Browser session manager with page pooling
|
| 269 |
|
| 270 |
:param headless: Run the browser in headless/hidden (default), or headful/visible mode.
|
| 271 |
-
:param block_images: Prevent the loading of images through Firefox preferences.
|
| 272 |
-
This can help save your proxy usage but be careful with this option as it makes some websites never finish loading.
|
| 273 |
:param disable_resources: Drop requests of unnecessary resources for a speed boost. It depends, but it made requests ~25% faster in my tests for some websites.
|
| 274 |
Requests dropped are of type `font`, `image`, `media`, `beacon`, `object`, `imageset`, `texttrack`, `websocket`, `csp_report`, and `stylesheet`.
|
| 275 |
This can help save your proxy usage but be careful with this option as it makes some websites never finish loading.
|
| 276 |
-
:param
|
| 277 |
:param cookies: Set cookies for the next request.
|
| 278 |
-
:param addons: List of Firefox addons to use. Must be paths to extracted addons.
|
| 279 |
-
:param humanize: Humanize the cursor movement. Takes either True or the MAX duration in seconds of the cursor movement. The cursor typically takes up to 1.5 seconds to move across the window.
|
| 280 |
-
:param solve_cloudflare: Solves all types of the Cloudflare's Turnstile/Interstitial challenges before returning the response to you.
|
| 281 |
-
:param allow_webgl: Enabled by default. Disabling WebGL is not recommended as many WAFs now check if WebGL is enabled.
|
| 282 |
:param network_idle: Wait for the page until there are no network connections for at least 500 ms.
|
| 283 |
-
:param load_dom: Enabled by default, wait for all JavaScript on page(s) to fully load and execute.
|
| 284 |
-
:param disable_ads: Disabled by default, this installs the `uBlock Origin` addon on the browser if enabled.
|
| 285 |
-
:param os_randomize: If enabled, Scrapling will randomize the OS fingerprints used. The default is Scrapling matching the fingerprints with the current OS.
|
| 286 |
-
:param wait: The time (milliseconds) the fetcher will wait after everything finishes before closing the page and returning the ` Response ` object.
|
| 287 |
:param timeout: The timeout in milliseconds that is used in all operations and waits through the page. The default is 30,000
|
|
|
|
| 288 |
:param page_action: Added for automation. A function that takes the `page` object and does the automation you need.
|
| 289 |
:param wait_selector: Wait for a specific CSS selector to be in a specific state.
|
| 290 |
:param init_script: An absolute path to a JavaScript file to be executed on page creation for all pages in this session.
|
| 291 |
-
:param
|
| 292 |
-
|
|
|
|
| 293 |
:param wait_selector_state: The state to wait for the selector given with `wait_selector`. The default state is `attached`.
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 294 |
:param google_search: Enabled by default, Scrapling will set the referer header to be as if this request came from a Google search of this website's domain name.
|
| 295 |
:param extra_headers: A dictionary of extra headers to add to the request. _The referer set by the `google_search` argument takes priority over the referer set here if used together._
|
| 296 |
:param proxy: The proxy to be used with requests, it can be a string or a dictionary with the keys 'server', 'username', and 'password' only.
|
| 297 |
-
:param max_pages: The maximum number of tabs to be opened at the same time. It will be used in rotation through a PagePool.
|
| 298 |
:param user_data_dir: Path to a User Data Directory, which stores browser session data like cookies and local storage. The default is to create a temporary directory.
|
|
|
|
| 299 |
:param selector_config: The arguments that will be passed in the end while creating the final Selector's class.
|
| 300 |
-
:param additional_args: Additional arguments to be passed to
|
| 301 |
"""
|
| 302 |
self.__validate__(**kwargs)
|
| 303 |
-
super().__init__(max_pages=self.
|
| 304 |
|
| 305 |
async def start(self):
|
| 306 |
"""Create a browser for this instance and context."""
|
| 307 |
if not self.playwright:
|
| 308 |
-
self.playwright: AsyncPlaywright = await async_playwright().start()
|
| 309 |
-
|
| 310 |
-
|
| 311 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 312 |
|
| 313 |
-
|
| 314 |
-
await self.context.add_init_script(
|
| 315 |
|
| 316 |
-
if self.
|
| 317 |
-
await self.context.
|
|
|
|
|
|
|
|
|
|
| 318 |
else:
|
| 319 |
raise RuntimeError("Session has been already started")
|
| 320 |
|
| 321 |
-
async def _cloudflare_solver(self, page: async_Page): # pragma: no cover
|
| 322 |
-
"""Solve the cloudflare challenge displayed on the playwright page passed
|
| 323 |
|
| 324 |
-
:param page: The
|
| 325 |
:return:
|
| 326 |
"""
|
| 327 |
await self._wait_for_networkidle(page, timeout=5000)
|
|
@@ -331,7 +329,7 @@ class AsyncStealthySession(StealthySessionMixin, AsyncSession):
|
|
| 331 |
return
|
| 332 |
else:
|
| 333 |
log.info(f'The turnstile version discovered is "{challenge_type}"')
|
| 334 |
-
if challenge_type == "non-interactive":
|
| 335 |
while "<title>Just a moment...</title>" in (await ResponseFactory._get_async_page_content(page)):
|
| 336 |
log.info("Waiting for Cloudflare wait page to disappear.")
|
| 337 |
await page.wait_for_timeout(1000)
|
|
@@ -350,22 +348,27 @@ class AsyncStealthySession(StealthySessionMixin, AsyncSession):
|
|
| 350 |
outer_box = {}
|
| 351 |
iframe = page.frame(url=__CF_PATTERN__)
|
| 352 |
if iframe is not None:
|
| 353 |
-
await self._wait_for_page_stability(iframe, True,
|
| 354 |
|
| 355 |
if challenge_type != "embedded":
|
| 356 |
while not await (await iframe.frame_element()).is_visible():
|
| 357 |
# Double-checking that the iframe is loaded
|
| 358 |
await page.wait_for_timeout(500)
|
| 359 |
-
|
|
|
|
| 360 |
|
| 361 |
if not iframe or not outer_box:
|
|
|
|
|
|
|
|
|
|
|
|
|
| 362 |
outer_box: Any = await page.locator(box_selector).last.bounding_box()
|
| 363 |
|
| 364 |
# Calculate the Captcha coordinates for any viewport
|
| 365 |
captcha_x, captcha_y = outer_box["x"] + randint(26, 28), outer_box["y"] + randint(25, 27)
|
| 366 |
|
| 367 |
# Move the mouse to the center of the window, then press and hold the left mouse button
|
| 368 |
-
await page.mouse.click(captcha_x, captcha_y, delay=
|
| 369 |
await self._wait_for_networkidle(page)
|
| 370 |
if iframe is not None:
|
| 371 |
# Wait for the frame to be removed from the page (with 30s timeout = 300 iterations * 100 ms)
|
|
@@ -377,14 +380,14 @@ class AsyncStealthySession(StealthySessionMixin, AsyncSession):
|
|
| 377 |
await page.wait_for_timeout(100)
|
| 378 |
attempts += 1
|
| 379 |
if challenge_type != "embedded":
|
| 380 |
-
await page.locator(box_selector).wait_for(state="detached")
|
| 381 |
await page.locator(".zone-name-title").wait_for(state="hidden")
|
| 382 |
await self._wait_for_page_stability(page, True, False)
|
| 383 |
|
| 384 |
log.info("Cloudflare captcha is solved")
|
| 385 |
return
|
| 386 |
|
| 387 |
-
async def fetch(self, url: str, **kwargs: Unpack[
|
| 388 |
"""Opens up the browser and does your request based on your chosen options.
|
| 389 |
|
| 390 |
:param url: The Target url.
|
|
@@ -405,7 +408,7 @@ class AsyncStealthySession(StealthySessionMixin, AsyncSession):
|
|
| 405 |
- selector_config: The arguments that will be passed in the end while creating the final Selector's class.
|
| 406 |
:return: A `Response` object.
|
| 407 |
"""
|
| 408 |
-
params = _validate(kwargs, self,
|
| 409 |
|
| 410 |
if self._closed: # pragma: no cover
|
| 411 |
raise RuntimeError("Context manager has been closed")
|
|
@@ -418,10 +421,6 @@ class AsyncStealthySession(StealthySessionMixin, AsyncSession):
|
|
| 418 |
final_response = [None]
|
| 419 |
handle_response = self._create_response_handler(page_info, final_response)
|
| 420 |
|
| 421 |
-
if TYPE_CHECKING:
|
| 422 |
-
if not isinstance(page_info.page, async_Page):
|
| 423 |
-
raise TypeError
|
| 424 |
-
|
| 425 |
try:
|
| 426 |
# Navigate to URL and wait for a specified state
|
| 427 |
page_info.page.on("response", handle_response)
|
|
@@ -461,9 +460,8 @@ class AsyncStealthySession(StealthySessionMixin, AsyncSession):
|
|
| 461 |
# Close the page to free up resources
|
| 462 |
await page_info.page.close()
|
| 463 |
self.page_pool.pages.remove(page_info)
|
| 464 |
-
|
| 465 |
return response
|
| 466 |
|
| 467 |
-
except Exception as e:
|
| 468 |
page_info.mark_error()
|
| 469 |
raise e
|
|
|
|
| 2 |
from re import compile as re_compile
|
| 3 |
|
| 4 |
from playwright.sync_api import (
|
|
|
|
| 5 |
Locator,
|
| 6 |
+
Page,
|
| 7 |
+
Playwright,
|
| 8 |
)
|
| 9 |
from playwright.async_api import (
|
|
|
|
| 10 |
Page as async_Page,
|
| 11 |
Locator as AsyncLocator,
|
| 12 |
Playwright as AsyncPlaywright,
|
| 13 |
BrowserContext as AsyncBrowserContext,
|
| 14 |
)
|
| 15 |
+
from patchright.sync_api import sync_playwright
|
| 16 |
+
from patchright.async_api import async_playwright
|
| 17 |
|
| 18 |
from scrapling.core.utils import log
|
| 19 |
+
from scrapling.core._types import Any, Unpack
|
| 20 |
+
from ._config_tools import _compiled_stealth_scripts
|
| 21 |
+
from ._types import StealthSession, StealthFetchParams
|
| 22 |
from ._base import SyncSession, AsyncSession, StealthySessionMixin
|
| 23 |
+
from ._validators import validate_fetch as _validate, StealthConfig
|
| 24 |
from scrapling.engines.toolbelt.convertor import Response, ResponseFactory
|
| 25 |
from scrapling.engines.toolbelt.fingerprints import generate_convincing_referer
|
| 26 |
|
| 27 |
__CF_PATTERN__ = re_compile("challenges.cloudflare.com/cdn-cgi/challenge-platform/.*")
|
| 28 |
|
| 29 |
|
| 30 |
+
class StealthySession(SyncSession, StealthySessionMixin):
|
| 31 |
+
"""A Stealthy Browser session manager with page pooling."""
|
| 32 |
|
| 33 |
__slots__ = (
|
| 34 |
+
"_config",
|
| 35 |
+
"_context_options",
|
| 36 |
+
"_launch_options",
|
| 37 |
+
"max_pages",
|
| 38 |
+
"page_pool",
|
| 39 |
+
"_max_wait_for_page",
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 40 |
"playwright",
|
|
|
|
| 41 |
"context",
|
|
|
|
| 42 |
"_closed",
|
|
|
|
|
|
|
|
|
|
| 43 |
)
|
| 44 |
|
| 45 |
+
def __init__(self, **kwargs: Unpack[StealthSession]):
|
| 46 |
+
"""A browser session manager with page pooling. It uses a persistent browser context by default, with a temporary user profile directory.
|
| 47 |
|
| 48 |
:param headless: Run the browser in headless/hidden (default), or headful/visible mode.
|
|
|
|
|
|
|
| 49 |
:param disable_resources: Drop requests of unnecessary resources for a speed boost. It depends, but it made requests ~25% faster in my tests for some websites.
|
| 50 |
Requests dropped are of type `font`, `image`, `media`, `beacon`, `object`, `imageset`, `texttrack`, `websocket`, `csp_report`, and `stylesheet`.
|
| 51 |
This can help save your proxy usage but be careful with this option as it makes some websites never finish loading.
|
| 52 |
+
:param useragent: Pass a useragent string to be used. Otherwise the fetcher will generate a real Useragent of the same browser and use it.
|
| 53 |
:param cookies: Set cookies for the next request.
|
|
|
|
|
|
|
|
|
|
|
|
|
| 54 |
:param network_idle: Wait for the page until there are no network connections for at least 500 ms.
|
|
|
|
|
|
|
|
|
|
|
|
|
| 55 |
:param timeout: The timeout in milliseconds that is used in all operations and waits through the page. The default is 30,000
|
| 56 |
+
:param wait: The time (milliseconds) the fetcher will wait after everything finishes before closing the page and returning the `Response` object.
|
| 57 |
:param page_action: Added for automation. A function that takes the `page` object and does the automation you need.
|
| 58 |
:param wait_selector: Wait for a specific CSS selector to be in a specific state.
|
| 59 |
:param init_script: An absolute path to a JavaScript file to be executed on page creation for all pages in this session.
|
| 60 |
+
:param locale: Specify user locale, for example, `en-GB`, `de-DE`, etc. Locale will affect navigator.language value, Accept-Language request header value as well as number and date formatting
|
| 61 |
+
rules. Defaults to the system default locale.
|
| 62 |
+
:param timezone_id: Changes the timezone of the browser. Defaults to the system timezone.
|
| 63 |
:param wait_selector_state: The state to wait for the selector given with `wait_selector`. The default state is `attached`.
|
| 64 |
+
:param solve_cloudflare: Solves all types of Cloudflare's Turnstile/Interstitial challenges before returning the response to you.
|
| 65 |
+
:param real_chrome: If you have a Chrome browser installed on your device, enable this, and the Fetcher will launch an instance of your browser and use it.
|
| 66 |
+
:param hide_canvas: Add random noise to canvas operations to prevent fingerprinting.
|
| 67 |
+
:param block_webrtc: Forces WebRTC to respect proxy settings to prevent local IP address leak.
|
| 68 |
+
:param allow_webgl: Enabled by default. Disabling it disables WebGL and WebGL 2.0 support entirely. Disabling WebGL is not recommended as many WAFs now check if WebGL is enabled.
|
| 69 |
+
:param load_dom: Enabled by default, wait for all JavaScript on page(s) to fully load and execute.
|
| 70 |
+
:param cdp_url: Instead of launching a new browser instance, connect to this CDP URL to control real browsers through CDP.
|
| 71 |
:param google_search: Enabled by default, Scrapling will set the referer header to be as if this request came from a Google search of this website's domain name.
|
| 72 |
:param extra_headers: A dictionary of extra headers to add to the request. _The referer set by the `google_search` argument takes priority over the referer set here if used together._
|
| 73 |
:param proxy: The proxy to be used with requests, it can be a string or a dictionary with the keys 'server', 'username', and 'password' only.
|
| 74 |
:param user_data_dir: Path to a User Data Directory, which stores browser session data like cookies and local storage. The default is to create a temporary directory.
|
| 75 |
+
:param extra_flags: A list of additional browser flags to pass to the browser on launch.
|
| 76 |
:param selector_config: The arguments that will be passed in the end while creating the final Selector's class.
|
| 77 |
+
:param additional_args: Additional arguments to be passed to Playwright's context as additional settings, and it takes higher priority than Scrapling's settings.
|
| 78 |
"""
|
| 79 |
self.__validate__(**kwargs)
|
| 80 |
+
super().__init__()
|
| 81 |
|
| 82 |
def start(self):
|
| 83 |
"""Create a browser for this instance and context."""
|
| 84 |
if not self.playwright:
|
| 85 |
+
self.playwright: Playwright = sync_playwright().start() # pyright: ignore [reportAttributeAccessIssue]
|
|
|
|
| 86 |
|
| 87 |
+
if self._config.cdp_url: # pragma: no cover
|
| 88 |
+
browser = self.playwright.chromium.connect_over_cdp(endpoint_url=self._config.cdp_url)
|
| 89 |
+
self.context = browser.new_context(**self._context_options)
|
| 90 |
+
else:
|
| 91 |
+
self.context = self.playwright.chromium.launch_persistent_context(**self._launch_options)
|
| 92 |
+
|
| 93 |
+
for script in _compiled_stealth_scripts():
|
| 94 |
+
self.context.add_init_script(script=script)
|
| 95 |
+
|
| 96 |
+
if self._config.init_script: # pragma: no cover
|
| 97 |
+
self.context.add_init_script(path=self._config.init_script)
|
| 98 |
|
| 99 |
+
if self._config.cookies: # pragma: no cover
|
| 100 |
+
self.context.add_cookies(self._config.cookies)
|
| 101 |
else:
|
| 102 |
raise RuntimeError("Session has been already started")
|
| 103 |
|
|
|
|
| 133 |
outer_box = {}
|
| 134 |
iframe = page.frame(url=__CF_PATTERN__)
|
| 135 |
if iframe is not None:
|
| 136 |
+
self._wait_for_page_stability(iframe, True, False)
|
| 137 |
|
| 138 |
if challenge_type != "embedded":
|
| 139 |
while not iframe.frame_element().is_visible():
|
| 140 |
# Double-checking that the iframe is loaded
|
| 141 |
page.wait_for_timeout(500)
|
| 142 |
+
|
| 143 |
outer_box: Any = iframe.frame_element().bounding_box()
|
| 144 |
|
| 145 |
if not iframe or not outer_box:
|
| 146 |
+
if "<title>Just a moment...</title>" not in (ResponseFactory._get_page_content(page)):
|
| 147 |
+
log.info("Cloudflare captcha is solved")
|
| 148 |
+
return
|
| 149 |
+
|
| 150 |
outer_box: Any = page.locator(box_selector).last.bounding_box()
|
| 151 |
|
| 152 |
# Calculate the Captcha coordinates for any viewport
|
| 153 |
captcha_x, captcha_y = outer_box["x"] + randint(26, 28), outer_box["y"] + randint(25, 27)
|
| 154 |
|
| 155 |
# Left-click at the computed captcha coordinates, with a random 100-200 ms delay between mouse down and mouse up
|
| 156 |
+
page.mouse.click(captcha_x, captcha_y, delay=randint(100, 200), button="left")
|
| 157 |
self._wait_for_networkidle(page)
|
| 158 |
if iframe is not None:
|
| 159 |
# Wait for the frame to be removed from the page (with 30s timeout = 300 iterations * 100 ms)
|
|
|
|
| 172 |
log.info("Cloudflare captcha is solved")
|
| 173 |
return
|
| 174 |
|
| 175 |
+
def fetch(self, url: str, **kwargs: Unpack[StealthFetchParams]) -> Response:
|
| 176 |
"""Opens up the browser and does your request based on your chosen options.
|
| 177 |
|
| 178 |
:param url: The Target url.
|
|
|
|
| 193 |
- selector_config: The arguments that will be passed in the end while creating the final Selector's class.
|
| 194 |
:return: A `Response` object.
|
| 195 |
"""
|
| 196 |
+
params = _validate(kwargs, self, StealthConfig)
|
|
|
|
| 197 |
if self._closed: # pragma: no cover
|
| 198 |
raise RuntimeError("Context manager has been closed")
|
| 199 |
|
|
|
|
| 222 |
if params.page_action:
|
| 223 |
try:
|
| 224 |
_ = params.page_action(page_info.page)
|
| 225 |
+
except Exception as e: # pragma: no cover
|
| 226 |
log.error(f"Error executing page_action: {e}")
|
| 227 |
|
| 228 |
if params.wait_selector:
|
|
|
|
| 231 |
waiter.first.wait_for(state=params.wait_selector_state)
|
| 232 |
# Wait again after waiting for the selector, helpful with protections like Cloudflare
|
| 233 |
self._wait_for_page_stability(page_info.page, params.load_dom, params.network_idle)
|
| 234 |
+
except Exception as e: # pragma: no cover
|
| 235 |
log.error(f"Error waiting for selector {params.wait_selector}: {e}")
|
| 236 |
|
| 237 |
page_info.page.wait_for_timeout(params.wait)
|
| 238 |
+
|
| 239 |
+
# Create response object
|
| 240 |
response = ResponseFactory.from_playwright_response(
|
| 241 |
page_info.page, first_response, final_response[0], params.selector_config
|
| 242 |
)
|
|
|
|
| 247 |
|
| 248 |
return response
|
| 249 |
|
| 250 |
+
except Exception as e:
|
| 251 |
page_info.mark_error()
|
| 252 |
raise e
|
| 253 |
|
| 254 |
|
| 255 |
+
class AsyncStealthySession(AsyncSession, StealthySessionMixin):
|
| 256 |
+
"""An async Stealthy Browser session manager with page pooling."""
|
| 257 |
|
| 258 |
+
def __init__(self, **kwargs: Unpack[StealthSession]):
|
| 259 |
+
"""A browser session manager with page pooling. It uses a persistent browser context by default, with a temporary user profile directory.
|
| 260 |
|
| 261 |
:param headless: Run the browser in headless/hidden (default), or headful/visible mode.
|
|
|
|
|
|
|
| 262 |
:param disable_resources: Drop requests of unnecessary resources for a speed boost. It depends, but it made requests ~25% faster in my tests for some websites.
|
| 263 |
Requests dropped are of type `font`, `image`, `media`, `beacon`, `object`, `imageset`, `texttrack`, `websocket`, `csp_report`, and `stylesheet`.
|
| 264 |
This can help save your proxy usage but be careful with this option as it makes some websites never finish loading.
|
| 265 |
+
:param useragent: Pass a useragent string to be used. Otherwise the fetcher will generate a real Useragent of the same browser and use it.
|
| 266 |
:param cookies: Set cookies for the next request.
|
|
|
|
|
|
|
|
|
|
|
|
|
| 267 |
:param network_idle: Wait for the page until there are no network connections for at least 500 ms.
|
|
|
|
|
|
|
|
|
|
|
|
|
| 268 |
:param timeout: The timeout in milliseconds that is used in all operations and waits through the page. The default is 30,000
|
| 269 |
+
:param wait: The time (milliseconds) the fetcher will wait after everything finishes before closing the page and returning the `Response` object.
|
| 270 |
:param page_action: Added for automation. A function that takes the `page` object and does the automation you need.
|
| 271 |
:param wait_selector: Wait for a specific CSS selector to be in a specific state.
|
| 272 |
:param init_script: An absolute path to a JavaScript file to be executed on page creation for all pages in this session.
|
| 273 |
+
:param locale: Specify user locale, for example, `en-GB`, `de-DE`, etc. Locale will affect navigator.language value, Accept-Language request header value as well as number and date formatting
|
| 274 |
+
rules. Defaults to the system default locale.
|
| 275 |
+
:param timezone_id: Changes the timezone of the browser. Defaults to the system timezone.
|
| 276 |
:param wait_selector_state: The state to wait for the selector given with `wait_selector`. The default state is `attached`.
|
| 277 |
+
:param solve_cloudflare: Solves all types of Cloudflare's Turnstile/Interstitial challenges before returning the response to you.
|
| 278 |
+
:param real_chrome: If you have a Chrome browser installed on your device, enable this, and the Fetcher will launch an instance of your browser and use it.
|
| 279 |
+
:param hide_canvas: Add random noise to canvas operations to prevent fingerprinting.
|
| 280 |
+
:param block_webrtc: Forces WebRTC to respect proxy settings to prevent local IP address leak.
|
| 281 |
+
:param allow_webgl: Enabled by default. Disabling it disables WebGL and WebGL 2.0 support entirely. Disabling WebGL is not recommended as many WAFs now check if WebGL is enabled.
|
| 282 |
+
:param load_dom: Enabled by default, wait for all JavaScript on page(s) to fully load and execute.
|
| 283 |
+
:param cdp_url: Instead of launching a new browser instance, connect to this CDP URL to control real browsers through CDP.
|
| 284 |
:param google_search: Enabled by default, Scrapling will set the referer header to be as if this request came from a Google search of this website's domain name.
|
| 285 |
:param extra_headers: A dictionary of extra headers to add to the request. _The referer set by the `google_search` argument takes priority over the referer set here if used together._
|
| 286 |
:param proxy: The proxy to be used with requests, it can be a string or a dictionary with the keys 'server', 'username', and 'password' only.
|
|
|
|
| 287 |
:param user_data_dir: Path to a User Data Directory, which stores browser session data like cookies and local storage. The default is to create a temporary directory.
|
| 288 |
+
:param extra_flags: A list of additional browser flags to pass to the browser on launch.
|
| 289 |
:param selector_config: The arguments that will be passed in the end while creating the final Selector's class.
|
| 290 |
+
:param additional_args: Additional arguments to be passed to Playwright's context as additional settings, and it takes higher priority than Scrapling's settings.
|
| 291 |
"""
|
| 292 |
self.__validate__(**kwargs)
|
| 293 |
+
super().__init__(max_pages=self._config.max_pages)
|
| 294 |
|
| 295 |
async def start(self):
|
| 296 |
"""Create a browser for this instance and context."""
|
| 297 |
if not self.playwright:
|
| 298 |
+
self.playwright: AsyncPlaywright = await async_playwright().start() # pyright: ignore [reportAttributeAccessIssue]
|
| 299 |
+
|
| 300 |
+
if self._config.cdp_url:
|
| 301 |
+
browser = await self.playwright.chromium.connect_over_cdp(endpoint_url=self._config.cdp_url)
|
| 302 |
+
self.context: AsyncBrowserContext = await browser.new_context(**self._context_options)
|
| 303 |
+
else:
|
| 304 |
+
self.context: AsyncBrowserContext = await self.playwright.chromium.launch_persistent_context(
|
| 305 |
+
**self._launch_options
|
| 306 |
+
)
|
| 307 |
|
| 308 |
+
for script in _compiled_stealth_scripts():
|
| 309 |
+
await self.context.add_init_script(script=script)
|
| 310 |
|
| 311 |
+
if self._config.init_script: # pragma: no cover
|
| 312 |
+
await self.context.add_init_script(path=self._config.init_script)
|
| 313 |
+
|
| 314 |
+
if self._config.cookies:
|
| 315 |
+
await self.context.add_cookies(self._config.cookies) # pyright: ignore
|
| 316 |
else:
|
| 317 |
raise RuntimeError("Session has been already started")
|
| 318 |
|
| 319 |
+
async def _cloudflare_solver(self, page: async_Page) -> None: # pragma: no cover
|
| 320 |
+
"""Solve the cloudflare challenge displayed on the playwright page passed
|
| 321 |
|
| 322 |
+
:param page: The targeted page
|
| 323 |
:return:
|
| 324 |
"""
|
| 325 |
await self._wait_for_networkidle(page, timeout=5000)
|
|
|
|
| 329 |
return
|
| 330 |
else:
|
| 331 |
log.info(f'The turnstile version discovered is "{challenge_type}"')
|
| 332 |
+
if challenge_type == "non-interactive":
|
| 333 |
while "<title>Just a moment...</title>" in (await ResponseFactory._get_async_page_content(page)):
|
| 334 |
log.info("Waiting for Cloudflare wait page to disappear.")
|
| 335 |
await page.wait_for_timeout(1000)
|
|
|
|
| 348 |
outer_box = {}
|
| 349 |
iframe = page.frame(url=__CF_PATTERN__)
|
| 350 |
if iframe is not None:
|
| 351 |
+
await self._wait_for_page_stability(iframe, True, False)
|
| 352 |
|
| 353 |
if challenge_type != "embedded":
|
| 354 |
while not await (await iframe.frame_element()).is_visible():
|
| 355 |
# Double-checking that the iframe is loaded
|
| 356 |
await page.wait_for_timeout(500)
|
| 357 |
+
|
| 358 |
+
outer_box: Any = (await iframe.frame_element()).bounding_box()
|
| 359 |
|
| 360 |
if not iframe or not outer_box:
|
| 361 |
+
if "<title>Just a moment...</title>" not in (await ResponseFactory._get_async_page_content(page)):
|
| 362 |
+
log.info("Cloudflare captcha is solved")
|
| 363 |
+
return
|
| 364 |
+
|
| 365 |
outer_box: Any = await page.locator(box_selector).last.bounding_box()
|
| 366 |
|
| 367 |
# Calculate the Captcha coordinates for any viewport
|
| 368 |
captcha_x, captcha_y = outer_box["x"] + randint(26, 28), outer_box["y"] + randint(25, 27)
|
| 369 |
|
| 370 |
# Left-click at the computed captcha coordinates, with a random 100-200 ms delay between mouse down and mouse up
|
| 371 |
+
await page.mouse.click(captcha_x, captcha_y, delay=randint(100, 200), button="left")
|
| 372 |
await self._wait_for_networkidle(page)
|
| 373 |
if iframe is not None:
|
| 374 |
# Wait for the frame to be removed from the page (with 30s timeout = 300 iterations * 100 ms)
|
|
|
|
| 380 |
await page.wait_for_timeout(100)
|
| 381 |
attempts += 1
|
| 382 |
if challenge_type != "embedded":
|
| 383 |
+
await page.locator(box_selector).last.wait_for(state="detached")
|
| 384 |
await page.locator(".zone-name-title").wait_for(state="hidden")
|
| 385 |
await self._wait_for_page_stability(page, True, False)
|
| 386 |
|
| 387 |
log.info("Cloudflare captcha is solved")
|
| 388 |
return
|
| 389 |
|
| 390 |
+
async def fetch(self, url: str, **kwargs: Unpack[StealthFetchParams]) -> Response:
|
| 391 |
"""Opens up the browser and does your request based on your chosen options.
|
| 392 |
|
| 393 |
:param url: The Target url.
|
|
|
|
| 408 |
- selector_config: The arguments that will be passed in the end while creating the final Selector's class.
|
| 409 |
:return: A `Response` object.
|
| 410 |
"""
|
| 411 |
+
params = _validate(kwargs, self, StealthConfig)
|
| 412 |
|
| 413 |
if self._closed: # pragma: no cover
|
| 414 |
raise RuntimeError("Context manager has been closed")
|
|
|
|
| 421 |
final_response = [None]
|
| 422 |
handle_response = self._create_response_handler(page_info, final_response)
|
| 423 |
|
|
|
|
|
|
|
|
|
|
|
|
|
| 424 |
try:
|
| 425 |
# Navigate to URL and wait for a specified state
|
| 426 |
page_info.page.on("response", handle_response)
|
|
|
|
| 460 |
# Close the page to free up resources
|
| 461 |
await page_info.page.close()
|
| 462 |
self.page_pool.pages.remove(page_info)
|
|
|
|
| 463 |
return response
|
| 464 |
|
| 465 |
+
except Exception as e: # pragma: no cover
|
| 466 |
page_info.mark_error()
|
| 467 |
raise e
|
scrapling/engines/_browsers/_types.py
CHANGED
|
@@ -53,7 +53,7 @@ if TYPE_CHECKING: # pragma: no cover
|
|
| 53 |
json: Optional[Dict | List]
|
| 54 |
|
| 55 |
# Types for browser session
|
| 56 |
-
class
|
| 57 |
max_pages: int
|
| 58 |
headless: bool
|
| 59 |
disable_resources: bool
|
|
@@ -64,6 +64,7 @@ if TYPE_CHECKING: # pragma: no cover
|
|
| 64 |
cookies: Optional[Iterable[Dict]]
|
| 65 |
google_search: bool
|
| 66 |
wait: int | float
|
|
|
|
| 67 |
page_action: Optional[Callable]
|
| 68 |
proxy: Optional[str | Dict[str, str] | Tuple]
|
| 69 |
extra_headers: Optional[Dict[str, str]]
|
|
@@ -72,42 +73,32 @@ if TYPE_CHECKING: # pragma: no cover
|
|
| 72 |
user_data_dir: str
|
| 73 |
selector_config: Optional[Dict]
|
| 74 |
additional_args: Optional[Dict]
|
| 75 |
-
|
| 76 |
-
class PlaywrightSession(BrowserSession, total=False):
|
| 77 |
-
cdp_url: Optional[str]
|
| 78 |
-
hide_canvas: bool
|
| 79 |
-
disable_webgl: bool
|
| 80 |
real_chrome: bool
|
| 81 |
-
|
| 82 |
-
locale: str
|
| 83 |
useragent: Optional[str]
|
| 84 |
extra_flags: Optional[List[str]]
|
| 85 |
|
| 86 |
class PlaywrightFetchParams(TypedDict, total=False):
|
|
|
|
|
|
|
|
|
|
| 87 |
google_search: bool
|
| 88 |
timeout: int | float
|
| 89 |
-
wait: int | float
|
| 90 |
-
page_action: Optional[Callable]
|
| 91 |
-
extra_headers: Optional[Dict[str, str]]
|
| 92 |
disable_resources: bool
|
| 93 |
wait_selector: Optional[str]
|
| 94 |
-
|
| 95 |
-
network_idle: bool
|
| 96 |
-
load_dom: bool
|
| 97 |
selector_config: Optional[Dict]
|
|
|
|
|
|
|
| 98 |
|
| 99 |
-
class
|
| 100 |
-
block_images: bool
|
| 101 |
-
block_webrtc: bool
|
| 102 |
allow_webgl: bool
|
| 103 |
-
|
|
|
|
| 104 |
solve_cloudflare: bool
|
| 105 |
-
addons: Optional[List[str]]
|
| 106 |
-
os_randomize: bool
|
| 107 |
-
disable_ads: bool
|
| 108 |
-
geoip: bool
|
| 109 |
|
| 110 |
-
class
|
| 111 |
solve_cloudflare: bool
|
| 112 |
|
| 113 |
else: # pragma: no cover
|
|
@@ -116,5 +107,5 @@ else: # pragma: no cover
|
|
| 116 |
DataRequestParams = TypedDict
|
| 117 |
PlaywrightSession = TypedDict
|
| 118 |
PlaywrightFetchParams = TypedDict
|
| 119 |
-
|
| 120 |
-
|
|
|
|
| 53 |
json: Optional[Dict | List]
|
| 54 |
|
| 55 |
# Types for browser session
|
| 56 |
+
class PlaywrightSession(TypedDict, total=False):
|
| 57 |
max_pages: int
|
| 58 |
headless: bool
|
| 59 |
disable_resources: bool
|
|
|
|
| 64 |
cookies: Optional[Iterable[Dict]]
|
| 65 |
google_search: bool
|
| 66 |
wait: int | float
|
| 67 |
+
timezone_id: str | None
|
| 68 |
page_action: Optional[Callable]
|
| 69 |
proxy: Optional[str | Dict[str, str] | Tuple]
|
| 70 |
extra_headers: Optional[Dict[str, str]]
|
|
|
|
| 73 |
user_data_dir: str
|
| 74 |
selector_config: Optional[Dict]
|
| 75 |
additional_args: Optional[Dict]
|
| 76 |
+
locale: Optional[str]
|
|
|
|
|
|
|
|
|
|
|
|
|
| 77 |
real_chrome: bool
|
| 78 |
+
cdp_url: Optional[str]
|
|
|
|
| 79 |
useragent: Optional[str]
|
| 80 |
extra_flags: Optional[List[str]]
|
| 81 |
|
| 82 |
class PlaywrightFetchParams(TypedDict, total=False):
|
| 83 |
+
load_dom: bool
|
| 84 |
+
wait: int | float
|
| 85 |
+
network_idle: bool
|
| 86 |
google_search: bool
|
| 87 |
timeout: int | float
|
|
|
|
|
|
|
|
|
|
| 88 |
disable_resources: bool
|
| 89 |
wait_selector: Optional[str]
|
| 90 |
+
page_action: Optional[Callable]
|
|
|
|
|
|
|
| 91 |
selector_config: Optional[Dict]
|
| 92 |
+
extra_headers: Optional[Dict[str, str]]
|
| 93 |
+
wait_selector_state: SelectorWaitStates
|
| 94 |
|
| 95 |
+
class StealthSession(PlaywrightSession, total=False):
|
|
|
|
|
|
|
| 96 |
allow_webgl: bool
|
| 97 |
+
hide_canvas: bool
|
| 98 |
+
block_webrtc: bool
|
| 99 |
solve_cloudflare: bool
|
|
|
|
|
|
|
|
|
|
|
|
|
| 100 |
|
| 101 |
+
class StealthFetchParams(PlaywrightFetchParams, total=False):
|
| 102 |
solve_cloudflare: bool
|
| 103 |
|
| 104 |
else: # pragma: no cover
|
|
|
|
| 107 |
DataRequestParams = TypedDict
|
| 108 |
PlaywrightSession = TypedDict
|
| 109 |
PlaywrightFetchParams = TypedDict
|
| 110 |
+
StealthSession = TypedDict
|
| 111 |
+
StealthFetchParams = TypedDict
|
scrapling/engines/_browsers/_validators.py
CHANGED
|
@@ -14,11 +14,13 @@ from scrapling.core._types import (
|
|
| 14 |
Optional,
|
| 15 |
Callable,
|
| 16 |
Iterable,
|
| 17 |
-
|
| 18 |
overload,
|
|
|
|
|
|
|
| 19 |
)
|
| 20 |
from scrapling.engines.toolbelt.navigation import construct_proxy_dict
|
| 21 |
-
from scrapling.engines._browsers._types import PlaywrightFetchParams,
|
| 22 |
|
| 23 |
|
| 24 |
# Custom validators for msgspec
|
|
@@ -68,26 +70,26 @@ class PlaywrightConfig(Struct, kw_only=True, frozen=False, weakref=True):
|
|
| 68 |
cdp_url: Optional[str] = None
|
| 69 |
headless: bool = True
|
| 70 |
google_search: bool = True
|
| 71 |
-
hide_canvas: bool = False
|
| 72 |
-
disable_webgl: bool = False
|
| 73 |
real_chrome: bool = False
|
| 74 |
-
stealth: bool = False
|
| 75 |
wait: Seconds = 0
|
| 76 |
page_action: Optional[Callable] = None
|
| 77 |
proxy: Optional[str | Dict[str, str] | Tuple] = None # The default value for proxy in Playwright's source is `None`
|
| 78 |
-
locale: str =
|
| 79 |
extra_headers: Optional[Dict[str, str]] = None
|
| 80 |
useragent: Optional[str] = None
|
| 81 |
timeout: Seconds = 30000
|
| 82 |
init_script: Optional[str] = None
|
| 83 |
disable_resources: bool = False
|
| 84 |
wait_selector: Optional[str] = None
|
| 85 |
-
cookies:
|
| 86 |
network_idle: bool = False
|
| 87 |
load_dom: bool = True
|
| 88 |
wait_selector_state: SelectorWaitStates = "attached"
|
| 89 |
user_data_dir: str = ""
|
| 90 |
-
timezone_id: str = ""
|
| 91 |
extra_flags: Optional[List[str]] = None
|
| 92 |
selector_config: Optional[Dict] = {}
|
| 93 |
additional_args: Optional[Dict] = {}
|
|
@@ -118,64 +120,18 @@ class PlaywrightConfig(Struct, kw_only=True, frozen=False, weakref=True):
|
|
| 118 |
raise ValueError(validation_msg)
|
| 119 |
|
| 120 |
|
| 121 |
-
class
|
| 122 |
-
"""Configuration struct for validation"""
|
| 123 |
-
|
| 124 |
-
max_pages: PagesCount = 1
|
| 125 |
-
headless: bool = True # noqa: F821
|
| 126 |
-
block_images: bool = False
|
| 127 |
-
disable_resources: bool = False
|
| 128 |
-
block_webrtc: bool = False
|
| 129 |
allow_webgl: bool = True
|
| 130 |
-
|
| 131 |
-
|
| 132 |
-
humanize: bool | float = True
|
| 133 |
solve_cloudflare: bool = False
|
| 134 |
-
wait: Seconds = 0
|
| 135 |
-
timeout: Seconds = 30000
|
| 136 |
-
init_script: Optional[str] = None
|
| 137 |
-
page_action: Optional[Callable] = None
|
| 138 |
-
wait_selector: Optional[str] = None
|
| 139 |
-
addons: Optional[List[str]] = None
|
| 140 |
-
wait_selector_state: SelectorWaitStates = "attached"
|
| 141 |
-
cookies: Optional[Iterable[Dict]] = None
|
| 142 |
-
google_search: bool = True
|
| 143 |
-
extra_headers: Optional[Dict[str, str]] = None
|
| 144 |
-
proxy: Optional[str | Dict[str, str] | Tuple] = None # The default value for proxy in Playwright's source is `None`
|
| 145 |
-
os_randomize: bool = False
|
| 146 |
-
disable_ads: bool = False
|
| 147 |
-
geoip: bool = False
|
| 148 |
-
user_data_dir: str = ""
|
| 149 |
-
selector_config: Optional[Dict] = {}
|
| 150 |
-
additional_args: Optional[Dict] = {}
|
| 151 |
|
| 152 |
def __post_init__(self):
|
| 153 |
"""Custom validation after msgspec validation"""
|
| 154 |
-
|
| 155 |
-
raise TypeError(f"page_action must be callable, got {type(self.page_action).__name__}")
|
| 156 |
-
if self.proxy:
|
| 157 |
-
self.proxy = construct_proxy_dict(self.proxy, as_tuple=True)
|
| 158 |
-
|
| 159 |
-
if self.addons:
|
| 160 |
-
for addon in self.addons:
|
| 161 |
-
_validate_addon_path(addon)
|
| 162 |
-
else:
|
| 163 |
-
self.addons = []
|
| 164 |
-
|
| 165 |
-
if self.init_script is not None:
|
| 166 |
-
validation_msg = _is_invalid_file_path(self.init_script)
|
| 167 |
-
if validation_msg:
|
| 168 |
-
raise ValueError(validation_msg)
|
| 169 |
-
|
| 170 |
-
if not self.cookies:
|
| 171 |
-
self.cookies = []
|
| 172 |
# Cloudflare timeout adjustment
|
| 173 |
if self.solve_cloudflare and self.timeout < 60_000:
|
| 174 |
self.timeout = 60_000
|
| 175 |
-
if not self.selector_config:
|
| 176 |
-
self.selector_config = {}
|
| 177 |
-
if not self.additional_args:
|
| 178 |
-
self.additional_args = {}
|
| 179 |
|
| 180 |
|
| 181 |
@dataclass
|
|
@@ -197,9 +153,9 @@ class _fetch_params:
|
|
| 197 |
|
| 198 |
|
| 199 |
def validate_fetch(
|
| 200 |
-
method_kwargs: Dict | PlaywrightFetchParams |
|
| 201 |
session: Any,
|
| 202 |
-
model: type[PlaywrightConfig] | type[
|
| 203 |
) -> _fetch_params: # pragma: no cover
|
| 204 |
result = {}
|
| 205 |
overrides = {}
|
|
@@ -210,21 +166,20 @@ def validate_fetch(
|
|
| 210 |
for key in fetch_param_fields:
|
| 211 |
if key in method_kwargs:
|
| 212 |
overrides[key] = method_kwargs[key]
|
| 213 |
-
|
| 214 |
-
|
| 215 |
-
attr_name = f"_{key}"
|
| 216 |
-
if hasattr(session, attr_name):
|
| 217 |
-
result[key] = getattr(session, attr_name)
|
| 218 |
|
| 219 |
if overrides:
|
| 220 |
validated_config = validate(overrides, model)
|
| 221 |
-
# Extract
|
|
|
|
| 222 |
validated_dict = {
|
| 223 |
-
|
| 224 |
-
for f in fields(_fetch_params)
|
| 225 |
-
if hasattr(validated_config, f.name)
|
| 226 |
}
|
| 227 |
-
|
|
|
|
|
|
|
|
|
|
| 228 |
|
| 229 |
# Start with session defaults, then overwrite with validated overrides
|
| 230 |
result.update(validated_dict)
|
|
@@ -238,7 +193,7 @@ def validate_fetch(
|
|
| 238 |
# Cache default values for each model to reduce validation overhead
|
| 239 |
models_default_values = {}
|
| 240 |
|
| 241 |
-
for _model in (
|
| 242 |
_defaults = {}
|
| 243 |
if hasattr(_model, "__struct_defaults__") and hasattr(_model, "__struct_fields__"):
|
| 244 |
for field_name, default_value in zip(_model.__struct_fields__, _model.__struct_defaults__): # type: ignore
|
|
@@ -256,14 +211,14 @@ def _filter_defaults(params: Dict, model: str) -> Dict:
|
|
| 256 |
|
| 257 |
|
| 258 |
@overload
|
| 259 |
-
def validate(params: Dict, model: type[
|
| 260 |
|
| 261 |
|
| 262 |
@overload
|
| 263 |
-
def validate(params: Dict, model: type[
|
| 264 |
|
| 265 |
|
| 266 |
-
def validate(params: Dict, model: type[PlaywrightConfig] | type[
|
| 267 |
try:
|
| 268 |
# Filter out params with the default values (no need to validate them) to speed up validation
|
| 269 |
filtered = _filter_defaults(params, model.__name__)
|
|
|
|
| 14 |
Optional,
|
| 15 |
Callable,
|
| 16 |
Iterable,
|
| 17 |
+
Sequence,
|
| 18 |
overload,
|
| 19 |
+
SetCookieParam,
|
| 20 |
+
SelectorWaitStates,
|
| 21 |
)
|
| 22 |
from scrapling.engines.toolbelt.navigation import construct_proxy_dict
|
| 23 |
+
from scrapling.engines._browsers._types import PlaywrightFetchParams, StealthFetchParams
|
| 24 |
|
| 25 |
|
| 26 |
# Custom validators for msgspec
|
|
|
|
| 70 |
cdp_url: Optional[str] = None
|
| 71 |
headless: bool = True
|
| 72 |
google_search: bool = True
|
| 73 |
+
# hide_canvas: bool = False
|
| 74 |
+
# disable_webgl: bool = False
|
| 75 |
real_chrome: bool = False
|
| 76 |
+
# stealth: bool = False
|
| 77 |
wait: Seconds = 0
|
| 78 |
page_action: Optional[Callable] = None
|
| 79 |
proxy: Optional[str | Dict[str, str] | Tuple] = None # The default value for proxy in Playwright's source is `None`
|
| 80 |
+
locale: str | None = None
|
| 81 |
extra_headers: Optional[Dict[str, str]] = None
|
| 82 |
useragent: Optional[str] = None
|
| 83 |
timeout: Seconds = 30000
|
| 84 |
init_script: Optional[str] = None
|
| 85 |
disable_resources: bool = False
|
| 86 |
wait_selector: Optional[str] = None
|
| 87 |
+
cookies: Sequence[SetCookieParam] | None = []
|
| 88 |
network_idle: bool = False
|
| 89 |
load_dom: bool = True
|
| 90 |
wait_selector_state: SelectorWaitStates = "attached"
|
| 91 |
user_data_dir: str = ""
|
| 92 |
+
timezone_id: str | None = ""
|
| 93 |
extra_flags: Optional[List[str]] = None
|
| 94 |
selector_config: Optional[Dict] = {}
|
| 95 |
additional_args: Optional[Dict] = {}
|
|
|
|
| 120 |
raise ValueError(validation_msg)
|
| 121 |
|
| 122 |
|
| 123 |
+
class StealthConfig(PlaywrightConfig, kw_only=True, frozen=False, weakref=True):
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 124 |
allow_webgl: bool = True
|
| 125 |
+
hide_canvas: bool = False
|
| 126 |
+
block_webrtc: bool = False
|
|
|
|
| 127 |
solve_cloudflare: bool = False
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 128 |
|
| 129 |
def __post_init__(self):
|
| 130 |
"""Custom validation after msgspec validation"""
|
| 131 |
+
super(StealthConfig, self).__post_init__()
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 132 |
# Cloudflare timeout adjustment
|
| 133 |
if self.solve_cloudflare and self.timeout < 60_000:
|
| 134 |
self.timeout = 60_000
|
|
|
|
|
|
|
|
|
|
|
|
|
| 135 |
|
| 136 |
|
| 137 |
@dataclass
|
|
|
|
| 153 |
|
| 154 |
|
| 155 |
def validate_fetch(
|
| 156 |
+
method_kwargs: Dict | PlaywrightFetchParams | StealthFetchParams,
|
| 157 |
session: Any,
|
| 158 |
+
model: type[PlaywrightConfig] | type[StealthConfig],
|
| 159 |
) -> _fetch_params: # pragma: no cover
|
| 160 |
result = {}
|
| 161 |
overrides = {}
|
|
|
|
| 166 |
for key in fetch_param_fields:
|
| 167 |
if key in method_kwargs:
|
| 168 |
overrides[key] = method_kwargs[key]
|
| 169 |
+
elif hasattr(session, "_config") and hasattr(session._config, key):
|
| 170 |
+
result[key] = getattr(session._config, key)
|
|
|
|
|
|
|
|
|
|
| 171 |
|
| 172 |
if overrides:
|
| 173 |
validated_config = validate(overrides, model)
|
| 174 |
+
# Extract ONLY the fields that were actually overridden (not all fields)
|
| 175 |
+
# This prevents validated defaults from overwriting session config values
|
| 176 |
validated_dict = {
|
| 177 |
+
field: getattr(validated_config, field) for field in overrides.keys() if hasattr(validated_config, field)
|
|
|
|
|
|
|
| 178 |
}
|
| 179 |
+
|
| 180 |
+
# Preserve solve_cloudflare if the user explicitly provided it, even if the model doesn't have it
|
| 181 |
+
if "solve_cloudflare" in overrides:
|
| 182 |
+
validated_dict["solve_cloudflare"] = overrides["solve_cloudflare"]
|
| 183 |
|
| 184 |
# Start with session defaults, then overwrite with validated overrides
|
| 185 |
result.update(validated_dict)
|
|
|
|
| 193 |
# Cache default values for each model to reduce validation overhead
|
| 194 |
models_default_values = {}
|
| 195 |
|
| 196 |
+
for _model in (StealthConfig, PlaywrightConfig):
|
| 197 |
_defaults = {}
|
| 198 |
if hasattr(_model, "__struct_defaults__") and hasattr(_model, "__struct_fields__"):
|
| 199 |
for field_name, default_value in zip(_model.__struct_fields__, _model.__struct_defaults__): # type: ignore
|
|
|
|
| 211 |
|
| 212 |
|
| 213 |
@overload
|
| 214 |
+
def validate(params: Dict, model: type[StealthConfig]) -> StealthConfig: ...
|
| 215 |
|
| 216 |
|
| 217 |
@overload
|
| 218 |
+
def validate(params: Dict, model: type[PlaywrightConfig]) -> PlaywrightConfig: ...
|
| 219 |
|
| 220 |
|
| 221 |
+
def validate(params: Dict, model: type[PlaywrightConfig] | type[StealthConfig]) -> PlaywrightConfig | StealthConfig:
|
| 222 |
try:
|
| 223 |
# Filter out params with the default values (no need to validate them) to speed up validation
|
| 224 |
filtered = _filter_defaults(params, model.__name__)
|
scrapling/engines/constants.py
CHANGED
|
@@ -74,7 +74,6 @@ DEFAULT_STEALTH_FLAGS = (
|
|
| 74 |
"--disable-domain-reliability",
|
| 75 |
"--disable-threaded-animation",
|
| 76 |
"--disable-threaded-scrolling",
|
| 77 |
-
# '--disable-reading-from-canvas', # For Firefox
|
| 78 |
"--enable-simple-cache-backend",
|
| 79 |
"--disable-background-networking",
|
| 80 |
"--enable-surface-synchronization",
|
|
|
|
| 74 |
"--disable-domain-reliability",
|
| 75 |
"--disable-threaded-animation",
|
| 76 |
"--disable-threaded-scrolling",
|
|
|
|
| 77 |
"--enable-simple-cache-backend",
|
| 78 |
"--disable-background-networking",
|
| 79 |
"--enable-surface-synchronization",
|
scrapling/fetchers/__init__.py
CHANGED
|
@@ -3,7 +3,7 @@ from typing import TYPE_CHECKING, Any
|
|
| 3 |
if TYPE_CHECKING:
|
| 4 |
from scrapling.fetchers.requests import Fetcher, AsyncFetcher, FetcherSession
|
| 5 |
from scrapling.fetchers.chrome import DynamicFetcher, DynamicSession, AsyncDynamicSession
|
| 6 |
-
from scrapling.fetchers.
|
| 7 |
|
| 8 |
|
| 9 |
# Lazy import mapping
|
|
@@ -14,9 +14,9 @@ _LAZY_IMPORTS = {
|
|
| 14 |
"DynamicFetcher": ("scrapling.fetchers.chrome", "DynamicFetcher"),
|
| 15 |
"DynamicSession": ("scrapling.fetchers.chrome", "DynamicSession"),
|
| 16 |
"AsyncDynamicSession": ("scrapling.fetchers.chrome", "AsyncDynamicSession"),
|
| 17 |
-
"StealthyFetcher": ("scrapling.fetchers.
|
| 18 |
-
"StealthySession": ("scrapling.fetchers.
|
| 19 |
-
"AsyncStealthySession": ("scrapling.fetchers.
|
| 20 |
}
|
| 21 |
|
| 22 |
__all__ = [
|
|
|
|
| 3 |
if TYPE_CHECKING:
|
| 4 |
from scrapling.fetchers.requests import Fetcher, AsyncFetcher, FetcherSession
|
| 5 |
from scrapling.fetchers.chrome import DynamicFetcher, DynamicSession, AsyncDynamicSession
|
| 6 |
+
from scrapling.fetchers.stealth_chrome import StealthyFetcher, StealthySession, AsyncStealthySession
|
| 7 |
|
| 8 |
|
| 9 |
# Lazy import mapping
|
|
|
|
| 14 |
"DynamicFetcher": ("scrapling.fetchers.chrome", "DynamicFetcher"),
|
| 15 |
"DynamicSession": ("scrapling.fetchers.chrome", "DynamicSession"),
|
| 16 |
"AsyncDynamicSession": ("scrapling.fetchers.chrome", "AsyncDynamicSession"),
|
| 17 |
+
"StealthyFetcher": ("scrapling.fetchers.stealth_chrome", "StealthyFetcher"),
|
| 18 |
+
"StealthySession": ("scrapling.fetchers.stealth_chrome", "StealthySession"),
|
| 19 |
+
"AsyncStealthySession": ("scrapling.fetchers.stealth_chrome", "AsyncStealthySession"),
|
| 20 |
}
|
| 21 |
|
| 22 |
__all__ = [
|
scrapling/fetchers/{firefox.py → stealth_chrome.py}
RENAMED
|
@@ -1,48 +1,52 @@
|
|
| 1 |
from scrapling.core._types import Unpack
|
| 2 |
-
from scrapling.engines._browsers._types import
|
| 3 |
from scrapling.engines.toolbelt.custom import BaseFetcher, Response
|
| 4 |
-
from scrapling.engines._browsers.
|
| 5 |
|
| 6 |
|
| 7 |
class StealthyFetcher(BaseFetcher):
|
| 8 |
-
"""A `Fetcher` class type
|
| 9 |
|
| 10 |
-
It works as real browsers passing almost all online tests/protections
|
| 11 |
-
Other added flavors include setting the faked OS fingerprints to match the user's OS, and the referer of every request is set as if this request came from Google's search of this URL's domain.
|
| 12 |
"""
|
| 13 |
|
| 14 |
@classmethod
|
| 15 |
-
def fetch(cls, url: str, **kwargs: Unpack[
|
| 16 |
"""
|
| 17 |
Opens up a browser and do your request based on your chosen options below.
|
| 18 |
|
| 19 |
:param url: Target url.
|
| 20 |
:param kwargs: Browser session configuration options including:
|
| 21 |
- headless: Run the browser in headless/hidden (default), or headful/visible mode.
|
| 22 |
-
-
|
| 23 |
-
|
| 24 |
-
|
| 25 |
-
-
|
|
|
|
| 26 |
- network_idle: Wait for the page until there are no network connections for at least 500 ms.
|
| 27 |
-
- load_dom: Enabled by default, wait for all JavaScript on page(s) to fully load and execute.
|
| 28 |
-
- humanize: Humanize the cursor movement. Takes either True or the MAX duration in seconds of the cursor movement.
|
| 29 |
-
- solve_cloudflare: Solves all types of the Cloudflare's Turnstile/Interstitial challenges before returning the response to you.
|
| 30 |
-
- wait: The time (milliseconds) the fetcher will wait after everything finishes before closing the page and returning the Response object.
|
| 31 |
- timeout: The timeout in milliseconds that is used in all operations and waits through the page. The default is 30,000
|
|
|
|
| 32 |
- page_action: Added for automation. A function that takes the `page` object and does the automation you need.
|
| 33 |
- wait_selector: Wait for a specific CSS selector to be in a specific state.
|
| 34 |
-
- init_script: An absolute path to a JavaScript file to be executed on page creation
|
| 35 |
-
-
|
|
|
|
|
|
|
| 36 |
- wait_selector_state: The state to wait for the selector given with `wait_selector`. The default state is `attached`.
|
| 37 |
-
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 38 |
- google_search: Enabled by default, Scrapling will set the referer header to be as if this request came from a Google search of this website's domain name.
|
| 39 |
-
- extra_headers: A dictionary of extra headers to add to the request.
|
| 40 |
- proxy: The proxy to be used with requests, it can be a string or a dictionary with the keys 'server', 'username', and 'password' only.
|
| 41 |
-
-
|
| 42 |
-
-
|
| 43 |
-
- geoip: Recommended to use with proxies; Automatically use IP's longitude, latitude, timezone, country, locale, and spoof the WebRTC IP address.
|
| 44 |
- selector_config: The arguments that will be passed in the end while creating the final Selector's class.
|
| 45 |
-
- additional_args: Additional arguments to be passed to
|
| 46 |
:return: A `Response` object.
|
| 47 |
"""
|
| 48 |
selector_config = kwargs.get("selector_config", {}) or kwargs.get(
|
|
@@ -57,37 +61,42 @@ class StealthyFetcher(BaseFetcher):
|
|
| 57 |
return engine.fetch(url)
|
| 58 |
|
| 59 |
@classmethod
|
| 60 |
-
async def async_fetch(cls, url: str, **kwargs: Unpack[
|
| 61 |
"""
|
| 62 |
Opens up a browser and do your request based on your chosen options below.
|
| 63 |
|
| 64 |
:param url: Target url.
|
| 65 |
:param kwargs: Browser session configuration options including:
|
| 66 |
- headless: Run the browser in headless/hidden (default), or headful/visible mode.
|
| 67 |
-
-
|
| 68 |
-
|
| 69 |
-
|
| 70 |
-
-
|
|
|
|
| 71 |
- network_idle: Wait for the page until there are no network connections for at least 500 ms.
|
| 72 |
-
- load_dom: Enabled by default, wait for all JavaScript on page(s) to fully load and execute.
|
| 73 |
-
- humanize: Humanize the cursor movement. Takes either True or the MAX duration in seconds of the cursor movement.
|
| 74 |
-
- solve_cloudflare: Solves all types of the Cloudflare's Turnstile/Interstitial challenges before returning the response to you.
|
| 75 |
-
- wait: The time (milliseconds) the fetcher will wait after everything finishes before closing the page and returning the Response object.
|
| 76 |
- timeout: The timeout in milliseconds that is used in all operations and waits through the page. The default is 30,000
|
|
|
|
| 77 |
- page_action: Added for automation. A function that takes the `page` object and does the automation you need.
|
| 78 |
- wait_selector: Wait for a specific CSS selector to be in a specific state.
|
| 79 |
-
- init_script: An absolute path to a JavaScript file to be executed on page creation
|
| 80 |
-
-
|
|
|
|
|
|
|
| 81 |
- wait_selector_state: The state to wait for the selector given with `wait_selector`. The default state is `attached`.
|
| 82 |
-
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 83 |
- google_search: Enabled by default, Scrapling will set the referer header to be as if this request came from a Google search of this website's domain name.
|
| 84 |
-
- extra_headers: A dictionary of extra headers to add to the request.
|
| 85 |
- proxy: The proxy to be used with requests, it can be a string or a dictionary with the keys 'server', 'username', and 'password' only.
|
| 86 |
-
-
|
| 87 |
-
-
|
| 88 |
-
- geoip: Recommended to use with proxies; Automatically use IP's longitude, latitude, timezone, country, locale, and spoof the WebRTC IP address.
|
| 89 |
- selector_config: The arguments that will be passed in the end while creating the final Selector's class.
|
| 90 |
-
- additional_args: Additional arguments to be passed to
|
| 91 |
:return: A `Response` object.
|
| 92 |
"""
|
| 93 |
selector_config = kwargs.get("selector_config", {}) or kwargs.get(
|
|
|
|
| 1 |
from scrapling.core._types import Unpack
|
| 2 |
+
from scrapling.engines._browsers._types import StealthSession
|
| 3 |
from scrapling.engines.toolbelt.custom import BaseFetcher, Response
|
| 4 |
+
from scrapling.engines._browsers._stealth import StealthySession, AsyncStealthySession
|
| 5 |
|
| 6 |
|
| 7 |
class StealthyFetcher(BaseFetcher):
|
| 8 |
+
"""A `Fetcher` class type that is completely stealthy and built on top of Chromium.
|
| 9 |
|
| 10 |
+
It works as real browsers passing almost all online tests/protections with many customization options.
|
|
|
|
| 11 |
"""
|
| 12 |
|
| 13 |
@classmethod
|
| 14 |
+
def fetch(cls, url: str, **kwargs: Unpack[StealthSession]) -> Response:
|
| 15 |
"""
|
| 16 |
Opens up a browser and do your request based on your chosen options below.
|
| 17 |
|
| 18 |
:param url: Target url.
|
| 19 |
:param kwargs: Browser session configuration options including:
|
| 20 |
- headless: Run the browser in headless/hidden (default), or headful/visible mode.
|
| 21 |
+
- disable_resources: Drop requests of unnecessary resources for a speed boost. It depends, but it made requests ~25% faster in my tests for some websites.
|
| 22 |
+
Requests dropped are of type `font`, `image`, `media`, `beacon`, `object`, `imageset`, `texttrack`, `websocket`, `csp_report`, and `stylesheet`.
|
| 23 |
+
This can help save your proxy usage but be careful with this option as it makes some websites never finish loading.
|
| 24 |
+
- useragent: Pass a useragent string to be used. Otherwise the fetcher will generate a real Useragent of the same browser and use it.
|
| 25 |
+
- cookies: Set cookies for the next request.
|
| 26 |
- network_idle: Wait for the page until there are no network connections for at least 500 ms.
|
|
|
|
|
|
|
|
|
|
|
|
|
| 27 |
- timeout: The timeout in milliseconds that is used in all operations and waits through the page. The default is 30,000
|
| 28 |
+
- wait: The time (milliseconds) the fetcher will wait after everything finishes before closing the page and returning the `Response` object.
|
| 29 |
- page_action: Added for automation. A function that takes the `page` object and does the automation you need.
|
| 30 |
- wait_selector: Wait for a specific CSS selector to be in a specific state.
|
| 31 |
+
- init_script: An absolute path to a JavaScript file to be executed on page creation for all pages in this session.
|
| 32 |
+
- locale: Specify user locale, for example, `en-GB`, `de-DE`, etc. Locale will affect navigator.language value, Accept-Language request header value as well as number and date formatting
|
| 33 |
+
rules. Defaults to the system default locale.
|
| 34 |
+
- timezone_id: Changes the timezone of the browser. Defaults to the system timezone.
|
| 35 |
- wait_selector_state: The state to wait for the selector given with `wait_selector`. The default state is `attached`.
|
| 36 |
+
- solve_cloudflare: Solves all types of the Cloudflare's Turnstile/Interstitial challenges before returning the response to you.
|
| 37 |
+
- real_chrome: If you have a Chrome browser installed on your device, enable this, and the Fetcher will launch an instance of your browser and use it.
|
| 38 |
+
- hide_canvas: Add random noise to canvas operations to prevent fingerprinting.
|
| 39 |
+
- block_webrtc: Forces WebRTC to respect proxy settings to prevent local IP address leak.
|
| 40 |
+
- allow_webgl: Enabled by default. Disabling it disables WebGL and WebGL 2.0 support entirely. Disabling WebGL is not recommended as many WAFs now check if WebGL is enabled.
|
| 41 |
+
- load_dom: Enabled by default, wait for all JavaScript on page(s) to fully load and execute.
|
| 42 |
+
- cdp_url: Instead of launching a new browser instance, connect to this CDP URL to control real browsers through CDP.
|
| 43 |
- google_search: Enabled by default, Scrapling will set the referer header to be as if this request came from a Google search of this website's domain name.
|
| 44 |
+
- extra_headers: A dictionary of extra headers to add to the request. _The referer set by the `google_search` argument takes priority over the referer set here if used together._
|
| 45 |
- proxy: The proxy to be used with requests, it can be a string or a dictionary with the keys 'server', 'username', and 'password' only.
|
| 46 |
+
- user_data_dir: Path to a User Data Directory, which stores browser session data like cookies and local storage. The default is to create a temporary directory.
|
| 47 |
+
- extra_flags: A list of additional browser flags to pass to the browser on launch.
|
|
|
|
| 48 |
- selector_config: The arguments that will be passed in the end while creating the final Selector's class.
|
| 49 |
+
- additional_args: Additional arguments to be passed to Playwright's context as additional settings, and it takes higher priority than Scrapling's settings.
|
| 50 |
:return: A `Response` object.
|
| 51 |
"""
|
| 52 |
selector_config = kwargs.get("selector_config", {}) or kwargs.get(
|
|
|
|
| 61 |
return engine.fetch(url)
|
| 62 |
|
| 63 |
@classmethod
|
| 64 |
+
async def async_fetch(cls, url: str, **kwargs: Unpack[StealthSession]) -> Response:
|
| 65 |
"""
|
| 66 |
Opens up a browser and do your request based on your chosen options below.
|
| 67 |
|
| 68 |
:param url: Target url.
|
| 69 |
:param kwargs: Browser session configuration options including:
|
| 70 |
- headless: Run the browser in headless/hidden (default), or headful/visible mode.
|
| 71 |
+
- disable_resources: Drop requests of unnecessary resources for a speed boost. It depends, but it made requests ~25% faster in my tests for some websites.
|
| 72 |
+
Requests dropped are of type `font`, `image`, `media`, `beacon`, `object`, `imageset`, `texttrack`, `websocket`, `csp_report`, and `stylesheet`.
|
| 73 |
+
This can help save your proxy usage but be careful with this option as it makes some websites never finish loading.
|
| 74 |
+
- useragent: Pass a useragent string to be used. Otherwise the fetcher will generate a real Useragent of the same browser and use it.
|
| 75 |
+
- cookies: Set cookies for the next request.
|
| 76 |
- network_idle: Wait for the page until there are no network connections for at least 500 ms.
|
|
|
|
|
|
|
|
|
|
|
|
|
| 77 |
- timeout: The timeout in milliseconds that is used in all operations and waits through the page. The default is 30,000
|
| 78 |
+
- wait: The time (milliseconds) the fetcher will wait after everything finishes before closing the page and returning the `Response` object.
|
| 79 |
- page_action: Added for automation. A function that takes the `page` object and does the automation you need.
|
| 80 |
- wait_selector: Wait for a specific CSS selector to be in a specific state.
|
| 81 |
+
- init_script: An absolute path to a JavaScript file to be executed on page creation for all pages in this session.
|
| 82 |
+
- locale: Specify user locale, for example, `en-GB`, `de-DE`, etc. Locale will affect navigator.language value, Accept-Language request header value as well as number and date formatting
|
| 83 |
+
rules. Defaults to the system default locale.
|
| 84 |
+
- timezone_id: Changes the timezone of the browser. Defaults to the system timezone.
|
| 85 |
- wait_selector_state: The state to wait for the selector given with `wait_selector`. The default state is `attached`.
|
| 86 |
+
- solve_cloudflare: Solves all types of the Cloudflare's Turnstile/Interstitial challenges before returning the response to you.
|
| 87 |
+
- real_chrome: If you have a Chrome browser installed on your device, enable this, and the Fetcher will launch an instance of your browser and use it.
|
| 88 |
+
- hide_canvas: Add random noise to canvas operations to prevent fingerprinting.
|
| 89 |
+
- block_webrtc: Forces WebRTC to respect proxy settings to prevent local IP address leak.
|
| 90 |
+
- allow_webgl: Enabled by default. Disabling it disables WebGL and WebGL 2.0 support entirely. Disabling WebGL is not recommended as many WAFs now check if WebGL is enabled.
|
| 91 |
+
- load_dom: Enabled by default, wait for all JavaScript on page(s) to fully load and execute.
|
| 92 |
+
- cdp_url: Instead of launching a new browser instance, connect to this CDP URL to control real browsers through CDP.
|
| 93 |
- google_search: Enabled by default, Scrapling will set the referer header to be as if this request came from a Google search of this website's domain name.
|
| 94 |
+
- extra_headers: A dictionary of extra headers to add to the request. _The referer set by the `google_search` argument takes priority over the referer set here if used together._
|
| 95 |
- proxy: The proxy to be used with requests, it can be a string or a dictionary with the keys 'server', 'username', and 'password' only.
|
| 96 |
+
- user_data_dir: Path to a User Data Directory, which stores browser session data like cookies and local storage. The default is to create a temporary directory.
|
| 97 |
+
- extra_flags: A list of additional browser flags to pass to the browser on launch.
|
|
|
|
| 98 |
- selector_config: The arguments that will be passed in the end while creating the final Selector's class.
|
| 99 |
+
- additional_args: Additional arguments to be passed to Playwright's context as additional settings, and it takes higher priority than Scrapling's settings.
|
| 100 |
:return: A `Response` object.
|
| 101 |
"""
|
| 102 |
selector_config = kwargs.get("selector_config", {}) or kwargs.get(
|