Karim shoair committed on
Commit ·
0cd97d9
1
Parent(s): 3ced0d2
feat(fetchers): Adding the foundation of the new browser-based fetchers logic
Browse files- scrapling/core/_types.py +1 -0
- scrapling/engines/_browsers/__init__.py +1 -0
- scrapling/engines/_browsers/_config_tools.py +99 -0
- scrapling/engines/_browsers/_controllers.py +615 -0
- scrapling/engines/_browsers/_page.py +93 -0
- scrapling/engines/_browsers/_validators.py +88 -0
- scrapling/engines/constants.py +9 -0
scrapling/core/_types.py
CHANGED
|
@@ -24,6 +24,7 @@ from typing import (
|
|
| 24 |
|
| 25 |
SUPPORTED_HTTP_METHODS = Literal["GET", "POST", "PUT", "DELETE"]
|
| 26 |
SelectorWaitStates = Literal["attached", "detached", "hidden", "visible"]
|
|
|
|
| 27 |
StrOrBytes = Union[str, bytes]
|
| 28 |
|
| 29 |
try:
|
|
|
|
| 24 |
|
| 25 |
SUPPORTED_HTTP_METHODS = Literal["GET", "POST", "PUT", "DELETE"]
|
| 26 |
SelectorWaitStates = Literal["attached", "detached", "hidden", "visible"]
|
| 27 |
+
PageLoadStates = Literal["commit", "domcontentloaded", "load", "networkidle"]
|
| 28 |
StrOrBytes = Union[str, bytes]
|
| 29 |
|
| 30 |
try:
|
scrapling/engines/_browsers/__init__.py
ADDED
|
@@ -0,0 +1 @@
|
|
|
|
|
|
|
| 1 |
+
from ._controllers import DynamicSession, AsyncDynamicSession
|
scrapling/engines/_browsers/_config_tools.py
ADDED
|
@@ -0,0 +1,99 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
from functools import lru_cache
|
| 2 |
+
|
| 3 |
+
from scrapling.core._types import Tuple
|
| 4 |
+
from scrapling.engines.constants import DEFAULT_STEALTH_FLAGS, HARMFUL_DEFAULT_ARGS
|
| 5 |
+
from scrapling.engines.toolbelt import js_bypass_path, generate_headers
|
| 6 |
+
|
| 7 |
+
__default_useragent__ = generate_headers(browser_mode=True).get("User-Agent")
|
| 8 |
+
|
| 9 |
+
|
| 10 |
+
@lru_cache(1)
def _compiled_stealth_scripts():
    """Read the stealth bypass scripts from disk once and cache their contents.

    :return: A tuple of JavaScript source strings, in injection order.
    """
    # Basic bypasses nothing fancy as I'm still working on it
    # But with adding these bypasses to the above config, it bypasses many online tests like
    # https://bot.sannysoft.com/
    # https://kaliiiiiiiiii.github.io/brotector/
    # https://pixelscan.net/
    # https://iphey.com/
    # https://www.browserscan.net/bot-detection <== this one also checks for the CDP runtime fingerprint
    # https://arh.antoinevastel.com/bots/areyouheadless/
    # https://prescience-data.github.io/execution-monitor.html
    stealth_scripts_paths = tuple(
        js_bypass_path(script)
        for script in (
            # Order is important
            "webdriver_fully.js",
            "window_chrome.js",
            "navigator_plugins.js",
            "pdf_viewer.js",
            "notification_permission.js",
            "screen_props.js",
            "playwright_fingerprint.js",
        )
    )
    scripts = []
    for script_path in stealth_scripts_paths:
        # Read as UTF-8 explicitly so the JS payloads don't depend on the
        # platform's default locale encoding.
        with open(script_path, "r", encoding="utf-8") as f:
            scripts.append(f.read())
    return tuple(scripts)
|
| 40 |
+
|
| 41 |
+
|
| 42 |
+
@lru_cache(2, typed=True)
def _set_flags(hide_canvas, disable_webgl):
    """Build the Chromium launch flags used when stealth mode is enabled.

    :param hide_canvas: Add random noise to canvas operations when truthy.
    :param disable_webgl: Turn off WebGL/WebGL2 support entirely when truthy.
    :return: A tuple of flag strings (the stealth defaults plus any extras).
    """
    extra_flags = []
    if hide_canvas:
        extra_flags.append("--fingerprinting-canvas-image-data-noise")
    if disable_webgl:
        extra_flags.extend(
            (
                "--disable-webgl",
                "--disable-webgl-image-chromium",
                "--disable-webgl2",
            )
        )

    return DEFAULT_STEALTH_FLAGS + tuple(extra_flags)
|
| 56 |
+
|
| 57 |
+
|
| 58 |
+
@lru_cache(2, typed=True)
def _launch_kwargs(headless, real_chrome, stealth, hide_canvas, disable_webgl) -> Tuple:
    """Assemble the playwright browser launch options.

    Returned as a tuple of (key, value) items so the result stays hashable
    for the lru_cache and can be rebuilt into a dict by the caller.
    """
    channel = "chrome" if real_chrome else "chromium"
    options = {
        "headless": headless,
        "ignore_default_args": HARMFUL_DEFAULT_ARGS,
        "channel": channel,
    }
    if stealth:
        # Stealth launches get the anti-fingerprinting flags and the sandbox
        options["args"] = _set_flags(hide_canvas, disable_webgl)
        options["chromium_sandbox"] = True

    return tuple(options.items())
|
| 72 |
+
|
| 73 |
+
|
| 74 |
+
@lru_cache(2, typed=True)
def _context_kwargs(proxy, locale, extra_headers, useragent, stealth) -> Tuple:
    """Assemble the browser-context options.

    Returned as a tuple of (key, value) items so the result stays hashable
    for the lru_cache and can be rebuilt into a dict by the caller.
    """
    options = {
        "proxy": proxy or tuple(),
        "locale": locale,
        # Bypasses the 'prefersLightColor' check in creepjs
        "color_scheme": "dark",
        "device_scale_factor": 2,
        "extra_http_headers": extra_headers or tuple(),
        "user_agent": useragent or __default_useragent__,
    }
    if stealth:
        options["is_mobile"] = False
        options["has_touch"] = False
        # I'm thinking about disabling it to rest from all Service Workers' headache, but let's keep it as it is for now
        options["service_workers"] = "allow"
        options["ignore_https_errors"] = True
        options["screen"] = {"width": 1920, "height": 1080}
        options["viewport"] = {"width": 1920, "height": 1080}
        options["permissions"] = ["geolocation", "notifications"]

    return tuple(options.items())
|
scrapling/engines/_browsers/_controllers.py
ADDED
|
@@ -0,0 +1,615 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import time
|
| 2 |
+
import asyncio
|
| 3 |
+
|
| 4 |
+
# from camoufox import AsyncNewBrowser, NewBrowser
|
| 5 |
+
from playwright.sync_api import (
|
| 6 |
+
sync_playwright,
|
| 7 |
+
BrowserType,
|
| 8 |
+
Browser,
|
| 9 |
+
BrowserContext,
|
| 10 |
+
Playwright,
|
| 11 |
+
Locator,
|
| 12 |
+
)
|
| 13 |
+
from playwright.async_api import (
|
| 14 |
+
async_playwright,
|
| 15 |
+
BrowserType as AsyncBrowserType,
|
| 16 |
+
Browser as AsyncBrowser,
|
| 17 |
+
BrowserContext as AsyncBrowserContext,
|
| 18 |
+
Playwright as AsyncPlaywright,
|
| 19 |
+
Locator as AsyncLocator,
|
| 20 |
+
)
|
| 21 |
+
from playwright.sync_api import Response as SyncPlaywrightResponse
|
| 22 |
+
from playwright.async_api import Response as AsyncPlaywrightResponse
|
| 23 |
+
from rebrowser_playwright.sync_api import sync_playwright as sync_rebrowser_playwright
|
| 24 |
+
from rebrowser_playwright.async_api import (
|
| 25 |
+
async_playwright as async_rebrowser_playwright,
|
| 26 |
+
)
|
| 27 |
+
|
| 28 |
+
from scrapling.core.utils import log
|
| 29 |
+
from ._page import PageInfo, PagePool
|
| 30 |
+
from ._validators import validate, PlaywrightConfig
|
| 31 |
+
from ._config_tools import _compiled_stealth_scripts, _launch_kwargs, _context_kwargs
|
| 32 |
+
from scrapling.core._types import (
|
| 33 |
+
Dict,
|
| 34 |
+
Optional,
|
| 35 |
+
Union,
|
| 36 |
+
Iterable,
|
| 37 |
+
Callable,
|
| 38 |
+
SelectorWaitStates,
|
| 39 |
+
)
|
| 40 |
+
from scrapling.engines.toolbelt import (
|
| 41 |
+
Response,
|
| 42 |
+
ResponseFactory,
|
| 43 |
+
generate_convincing_referer,
|
| 44 |
+
intercept_route,
|
| 45 |
+
async_intercept_route,
|
| 46 |
+
)
|
| 47 |
+
|
| 48 |
+
|
| 49 |
+
class DynamicSession:
    """A Browser session manager with page pooling."""

    __slots__ = (
        "max_pages",
        "headless",
        "hide_canvas",
        "disable_webgl",
        "real_chrome",
        "stealth",
        "google_search",
        "proxy",
        "locale",
        "extra_headers",
        "useragent",
        "timeout",
        "cookies",
        "disable_resources",
        "network_idle",
        "wait_selector",
        "wait_selector_state",
        "wait",
        "playwright",
        "browser",
        "context",
        "page_pool",
        "_closed",
        "adaptor_arguments",
        "page_action",
        "launch_options",
        "context_options",
        "cdp_url",
    )

    def __init__(
        self,
        max_pages: int = 1,
        headless: bool = True,
        google_search: bool = True,
        hide_canvas: bool = False,
        disable_webgl: bool = False,
        real_chrome: bool = False,
        stealth: bool = False,
        wait: Union[int, float] = 0,
        page_action: Optional[Callable] = None,
        proxy: Optional[Union[str, Dict[str, str]]] = None,
        locale: str = "en-US",
        extra_headers: Optional[Dict[str, str]] = None,
        useragent: Optional[str] = None,
        cdp_url: Optional[str] = None,
        timeout: Union[int, float] = 30000,
        disable_resources: bool = False,
        wait_selector: Optional[str] = None,
        cookies: Optional[Iterable[Dict]] = None,
        network_idle: bool = False,
        wait_selector_state: SelectorWaitStates = "attached",
        adaptor_arguments: Optional[Dict] = None,
    ):
        """A Browser session manager with page pooling

        :param headless: Run the browser in headless/hidden (default), or headful/visible mode.
        :param disable_resources: Drop requests of unnecessary resources for a speed boost. It depends, but it made requests ~25% faster in my tests for some websites.
            Requests dropped are of type `font`, `image`, `media`, `beacon`, `object`, `imageset`, `texttrack`, `websocket`, `csp_report`, and `stylesheet`.
            This can help save your proxy usage but be careful with this option as it makes some websites never finish loading.
        :param useragent: Pass a useragent string to be used. Otherwise the fetcher will generate a real Useragent of the same browser and use it.
        :param cookies: Set cookies for the next request.
        :param network_idle: Wait for the page until there are no network connections for at least 500 ms.
        :param timeout: The timeout in milliseconds that is used in all operations and waits through the page. The default is 30,000
        :param wait: The time (milliseconds) the fetcher will wait after everything finishes before closing the page and returning the ` Response ` object.
        :param page_action: Added for automation. A function that takes the `page` object, does the automation you need, then returns `page` again.
        :param wait_selector: Wait for a specific CSS selector to be in a specific state.
        :param locale: Set the locale for the browser if wanted. The default value is `en-US`.
        :param wait_selector_state: The state to wait for the selector given with `wait_selector`. The default state is `attached`.
        :param stealth: Enables stealth mode, check the documentation to see what stealth mode does currently.
        :param real_chrome: If you have a Chrome browser installed on your device, enable this, and the Fetcher will launch an instance of your browser and use it.
        :param hide_canvas: Add random noise to canvas operations to prevent fingerprinting.
        :param disable_webgl: Disables WebGL and WebGL 2.0 support entirely.
        :param cdp_url: Instead of launching a new browser instance, connect to this CDP URL to control real browsers/NSTBrowser through CDP.
        :param google_search: Enabled by default, Scrapling will set the referer header to be as if this request came from a Google search for this website's domain name.
        :param extra_headers: A dictionary of extra headers to add to the request. _The referer set by the `google_search` argument takes priority over the referer set here if used together._
        :param proxy: The proxy to be used with requests, it can be a string or a dictionary with the keys 'server', 'username', and 'password' only.
        :param max_pages: The maximum number of pages to be opened at the same time. It will be used in rotation through a PagePool.
        :param adaptor_arguments: The arguments that will be passed in the end while creating the final Adaptor's class.
        """

        # Validate/normalize everything in one pass before touching any attribute
        params = {
            "max_pages": max_pages,
            "headless": headless,
            "google_search": google_search,
            "hide_canvas": hide_canvas,
            "disable_webgl": disable_webgl,
            "real_chrome": real_chrome,
            "stealth": stealth,
            "wait": wait,
            "page_action": page_action,
            "proxy": proxy,
            "locale": locale,
            "extra_headers": extra_headers,
            "useragent": useragent,
            "timeout": timeout,
            "adaptor_arguments": adaptor_arguments,
            "disable_resources": disable_resources,
            "wait_selector": wait_selector,
            "cookies": cookies,
            "network_idle": network_idle,
            "wait_selector_state": wait_selector_state,
            "cdp_url": cdp_url,
        }
        config = validate(params, PlaywrightConfig)

        self.max_pages = config.max_pages
        self.headless = config.headless
        self.hide_canvas = config.hide_canvas
        self.disable_webgl = config.disable_webgl
        self.real_chrome = config.real_chrome
        self.stealth = config.stealth
        self.google_search = config.google_search
        self.wait = config.wait
        self.proxy = config.proxy
        self.locale = config.locale
        self.extra_headers = config.extra_headers
        self.useragent = config.useragent
        self.timeout = config.timeout
        self.cookies = list(config.cookies) if config.cookies else []
        self.disable_resources = config.disable_resources
        self.cdp_url = config.cdp_url
        self.network_idle = config.network_idle
        self.wait_selector = config.wait_selector
        self.wait_selector_state = config.wait_selector_state

        # Browser resources are created lazily in __create__()
        self.playwright: Optional[Playwright] = None
        self.browser: Optional[Union[BrowserType, Browser]] = None
        self.context: Optional[BrowserContext] = None
        self.page_pool = PagePool(self.max_pages)
        self._closed = False
        self.adaptor_arguments = config.adaptor_arguments or {}
        self.page_action = config.page_action
        self.__initiate_browser_options__()

    def __initiate_browser_options__(self):
        """Precompute the launch/context option dicts from the cached helpers."""
        # The helpers return hashable (key, value) tuples (for lru_cache); rebuild dicts here
        self.launch_options = dict(
            _launch_kwargs(
                self.headless,
                self.real_chrome,
                self.stealth,
                self.hide_canvas,
                self.disable_webgl,
            )
        )
        self.context_options = dict(
            _context_kwargs(
                self.proxy,
                self.locale,
                tuple(self.extra_headers.items()) if self.extra_headers else tuple(),
                self.useragent,
                self.stealth,
            )
        )
        # Convert the hashable placeholders back into what playwright expects
        self.context_options["extra_http_headers"] = dict(
            self.context_options["extra_http_headers"]
        )
        self.context_options["proxy"] = dict(self.context_options["proxy"]) or None

    def __create__(self):
        """Create a browser for this instance and context."""
        sync_context = sync_rebrowser_playwright
        if not self.stealth or self.real_chrome:
            # Because rebrowser_playwright doesn't play well with real browsers
            sync_context = sync_playwright

        self.playwright = sync_context().start()

        # BUGFIX: playwright only exposes `chromium`/`firefox`/`webkit` launchers,
        # so `getattr(self.playwright, "chrome")` raised AttributeError when
        # `real_chrome=True`. A real Chrome install is selected through the
        # `channel` launch option (already set by `_launch_kwargs`).
        browser_launcher = self.playwright.chromium
        if self.cdp_url:
            self.browser = browser_launcher.connect_over_cdp(endpoint_url=self.cdp_url)
        else:
            self.browser = browser_launcher.launch(**self.launch_options)

        self.context = self.browser.new_context(**self.context_options)
        if self.cookies:
            self.context.add_cookies(self.cookies)

    def __enter__(self):
        self.__create__()
        return self

    def __exit__(self, exc_type, exc_val, exc_tb):
        self.close()

    def close(self):
        """Close all resources (idempotent)."""
        if self._closed:
            return

        if self.context:
            self.context.close()
            self.context = None

        if self.browser:
            self.browser.close()
            self.browser = None

        if self.playwright:
            self.playwright.stop()
            self.playwright = None

        self._closed = True

    def _get_or_create_page(self) -> PageInfo:
        """Get an available page or create a new one.

        :raises TimeoutError: If the pool is full and no page frees up in time.
        """
        # Try to get a ready page first
        page_info = self.page_pool.get_ready_page()
        if page_info:
            return page_info

        # Create new page if under limit
        if self.page_pool.pages_count < self.max_pages:
            page = self.context.new_page()
            page.set_default_navigation_timeout(self.timeout)
            page.set_default_timeout(self.timeout)
            if self.extra_headers:
                page.set_extra_http_headers(self.extra_headers)

            if self.disable_resources:
                page.route("**/*", intercept_route)

            if self.stealth:
                # BUGFIX: `_compiled_stealth_scripts()` yields script *contents*
                # (it reads the files itself), so they must be passed as the
                # `script` argument — passing them as `path=` made playwright
                # treat the JS source as a filename.
                for script in _compiled_stealth_scripts():
                    page.add_init_script(script)

            return self.page_pool.add_page(page)

        # Pool is full: busy-wait for a page to become available
        max_wait = 30  # seconds
        start_time = time.time()

        while time.time() - start_time < max_wait:
            page_info = self.page_pool.get_ready_page()
            if page_info:
                return page_info
            time.sleep(0.05)

        raise TimeoutError("No pages available within timeout period")

    def fetch(self, url: str) -> Response:
        """Opens up the browser and do your request based on your chosen options.

        :param url: The Target url.
        :return: A `Response` object.
        :raises RuntimeError: If the session was already closed or navigation failed.
        """
        if self._closed:
            raise RuntimeError("Context manager has been closed")

        final_response = None
        referer = generate_convincing_referer(url) if self.google_search else None

        def handle_response(finished_response: SyncPlaywrightResponse):
            # Track the last top-level document navigation response (e.g. after redirects)
            nonlocal final_response
            if (
                finished_response.request.resource_type == "document"
                and finished_response.request.is_navigation_request()
            ):
                final_response = finished_response

        page_info = self._get_or_create_page()
        page_info.mark_busy(url=url)

        page_info.page.on("response", handle_response)
        try:
            # Navigate to URL and wait for a specified state
            first_response = page_info.page.goto(url, referer=referer)
            page_info.page.wait_for_load_state(state="domcontentloaded")

            if self.network_idle:
                page_info.page.wait_for_load_state("networkidle")

            if not first_response:
                raise RuntimeError(f"Failed to get response for {url}")

            if self.page_action is not None:
                try:
                    page_info.page = self.page_action(page_info.page)
                except Exception as e:
                    log.error(f"Error executing page_action: {e}")

            if self.wait_selector:
                try:
                    waiter: Locator = page_info.page.locator(self.wait_selector)
                    waiter.first.wait_for(state=self.wait_selector_state)
                    # Wait again after waiting for the selector, helpful with protections like Cloudflare
                    page_info.page.wait_for_load_state(state="load")
                    page_info.page.wait_for_load_state(state="domcontentloaded")
                    if self.network_idle:
                        page_info.page.wait_for_load_state("networkidle")
                except Exception as e:
                    log.error(f"Error waiting for selector {self.wait_selector}: {e}")

            page_info.page.wait_for_timeout(self.wait)

            # Create response object
            response = ResponseFactory.from_playwright_response(
                page_info.page, first_response, final_response, self.adaptor_arguments
            )

            # Mark page as ready for next use
            page_info.mark_ready()

            return response

        except Exception:
            page_info.mark_error()
            raise
        finally:
            # BUGFIX: pages are pooled and reused, so the listener must be
            # detached or every fetch() would stack another handler on the
            # same page (leaking the closures and their captured responses).
            page_info.page.remove_listener("response", handle_response)

    def get_pool_stats(self) -> Dict[str, int]:
        """Get statistics about the current page pool"""
        return {
            "total_pages": self.page_pool.pages_count,
            "ready_pages": self.page_pool.ready_count,
            "busy_pages": self.page_pool.busy_count,
            "max_pages": self.max_pages,
        }
|
| 372 |
+
|
| 373 |
+
|
| 374 |
+
class AsyncDynamicSession(DynamicSession):
|
| 375 |
+
"""A Browser session manager with page pooling"""
|
| 376 |
+
|
| 377 |
+
    def __init__(
        self,
        max_pages: int = 1,
        headless: bool = True,
        google_search: bool = True,
        hide_canvas: bool = False,
        disable_webgl: bool = False,
        real_chrome: bool = False,
        stealth: bool = False,
        wait: Union[int, float] = 0,
        page_action: Optional[Callable] = None,
        proxy: Optional[Union[str, Dict[str, str]]] = None,
        locale: str = "en-US",
        extra_headers: Optional[Dict[str, str]] = None,
        useragent: Optional[str] = None,
        cdp_url: Optional[str] = None,
        timeout: Union[int, float] = 30000,
        disable_resources: bool = False,
        wait_selector: Optional[str] = None,
        cookies: Optional[Iterable[Dict]] = None,
        network_idle: bool = False,
        wait_selector_state: SelectorWaitStates = "attached",
        adaptor_arguments: Optional[Dict] = None,
    ):
        """An async Browser session manager with page pooling.

        :param headless: Run the browser in headless/hidden (default), or headful/visible mode.
        :param disable_resources: Drop requests of unnecessary resources for a speed boost. It depends, but it made requests ~25% faster in my tests for some websites.
            Requests dropped are of type `font`, `image`, `media`, `beacon`, `object`, `imageset`, `texttrack`, `websocket`, `csp_report`, and `stylesheet`.
            This can help save your proxy usage but be careful with this option as it makes some websites never finish loading.
        :param useragent: Pass a useragent string to be used. Otherwise the fetcher will generate a real Useragent of the same browser and use it.
        :param cookies: Set cookies for the next request.
        :param network_idle: Wait for the page until there are no network connections for at least 500 ms.
        :param timeout: The timeout in milliseconds that is used in all operations and waits through the page. The default is 30,000
        :param wait: The time (milliseconds) the fetcher will wait after everything finishes before closing the page and returning the ` Response ` object.
        :param page_action: Added for automation. A function that takes the `page` object, does the automation you need, then returns `page` again.
        :param wait_selector: Wait for a specific CSS selector to be in a specific state.
        :param locale: Set the locale for the browser if wanted. The default value is `en-US`.
        :param wait_selector_state: The state to wait for the selector given with `wait_selector`. The default state is `attached`.
        :param stealth: Enables stealth mode, check the documentation to see what stealth mode does currently.
        :param real_chrome: If you have a Chrome browser installed on your device, enable this, and the Fetcher will launch an instance of your browser and use it.
        :param hide_canvas: Add random noise to canvas operations to prevent fingerprinting.
        :param disable_webgl: Disables WebGL and WebGL 2.0 support entirely.
        :param cdp_url: Instead of launching a new browser instance, connect to this CDP URL to control real browsers/NSTBrowser through CDP.
        :param google_search: Enabled by default, Scrapling will set the referer header to be as if this request came from a Google search for this website's domain name.
        :param extra_headers: A dictionary of extra headers to add to the request. _The referer set by the `google_search` argument takes priority over the referer set here if used together._
        :param proxy: The proxy to be used with requests, it can be a string or a dictionary with the keys 'server', 'username', and 'password' only.
        :param max_pages: The maximum number of pages to be opened at the same time. It will be used in rotation through a PagePool.
        :param adaptor_arguments: The arguments that will be passed in the end while creating the final Adaptor's class.
        """

        # NOTE: positional call — the order must stay in lock-step with
        # DynamicSession.__init__'s parameter order.
        super().__init__(
            max_pages,
            headless,
            google_search,
            hide_canvas,
            disable_webgl,
            real_chrome,
            stealth,
            wait,
            page_action,
            proxy,
            locale,
            extra_headers,
            useragent,
            cdp_url,
            timeout,
            disable_resources,
            wait_selector,
            cookies,
            network_idle,
            wait_selector_state,
            adaptor_arguments,
        )

        # Re-declare the browser handles with their async playwright types;
        # they are populated by the async __create__().
        self.playwright: Optional[AsyncPlaywright] = None
        self.browser: Optional[Union[AsyncBrowserType, AsyncBrowser]] = None
        self.context: Optional[AsyncBrowserContext] = None
        # Serializes page creation across concurrent fetches
        self._lock = asyncio.Lock()
        # NOTE(review): these assignments do NOT disable the inherited sync
        # context manager — Python looks dunder methods up on the type, not the
        # instance, so `with session:` still resolves DynamicSession.__enter__.
        # Override __enter__/__exit__ to raise if sync usage must be blocked.
        self.__enter__ = None
        self.__exit__ = None
|
| 458 |
+
|
| 459 |
+
async def __create__(self):
|
| 460 |
+
"""Create a browser for this instance and context."""
|
| 461 |
+
async_context = async_rebrowser_playwright
|
| 462 |
+
if not self.stealth or self.real_chrome:
|
| 463 |
+
# Because rebrowser_playwright doesn't play well with real browsers
|
| 464 |
+
async_context = async_playwright
|
| 465 |
+
|
| 466 |
+
self.playwright: AsyncPlaywright = await async_context().start()
|
| 467 |
+
|
| 468 |
+
browser_launcher: AsyncBrowserType = getattr(
|
| 469 |
+
self.playwright, "chrome" if self.real_chrome else "chromium"
|
| 470 |
+
)
|
| 471 |
+
if self.cdp_url:
|
| 472 |
+
self.browser = await browser_launcher.connect_over_cdp(
|
| 473 |
+
endpoint_url=self.cdp_url
|
| 474 |
+
)
|
| 475 |
+
else:
|
| 476 |
+
self.browser = await browser_launcher.launch(**self.launch_options)
|
| 477 |
+
|
| 478 |
+
self.context: AsyncBrowserContext = await self.browser.new_context(
|
| 479 |
+
**self.context_options
|
| 480 |
+
)
|
| 481 |
+
|
| 482 |
+
if self.cookies:
|
| 483 |
+
await self.context.add_cookies(self.cookies)
|
| 484 |
+
|
| 485 |
+
    async def __aenter__(self):
        """Async context-manager entry: build the browser/context, then return the session."""
        await self.__create__()
        return self
|
| 488 |
+
|
| 489 |
+
    async def __aexit__(self, exc_type, exc_val, exc_tb):
        """Async context-manager exit: release the context, browser, and Playwright driver."""
        await self.close()
|
| 491 |
+
|
| 492 |
+
    async def close(self):
        """Close all resources"""
        # Idempotent: a second call (e.g. explicit close() plus __aexit__) is a no-op.
        if self._closed:
            return

        # Tear down in reverse order of creation: context -> browser -> driver.
        if self.context:
            await self.context.close()
            self.context = None

        if self.browser:
            await self.browser.close()
            self.browser = None

        if self.playwright:
            await self.playwright.stop()
            self.playwright = None

        self._closed = True
|
| 510 |
+
|
| 511 |
+
async def _get_or_create_page(self) -> PageInfo:
|
| 512 |
+
"""Get an available page or create a new one"""
|
| 513 |
+
async with self._lock:
|
| 514 |
+
# Try to get a ready page first
|
| 515 |
+
page_info = self.page_pool.get_ready_page()
|
| 516 |
+
if page_info:
|
| 517 |
+
return page_info
|
| 518 |
+
|
| 519 |
+
# Create new page if under limit
|
| 520 |
+
if self.page_pool.pages_count < self.max_pages:
|
| 521 |
+
page = await self.context.new_page()
|
| 522 |
+
page.set_default_navigation_timeout(self.timeout)
|
| 523 |
+
page.set_default_timeout(self.timeout)
|
| 524 |
+
if self.extra_headers:
|
| 525 |
+
await page.set_extra_http_headers(self.extra_headers)
|
| 526 |
+
|
| 527 |
+
if self.disable_resources:
|
| 528 |
+
await page.route("**/*", async_intercept_route)
|
| 529 |
+
|
| 530 |
+
if self.stealth:
|
| 531 |
+
for script in _compiled_stealth_scripts():
|
| 532 |
+
await page.add_init_script(path=script)
|
| 533 |
+
|
| 534 |
+
return self.page_pool.add_page(page)
|
| 535 |
+
|
| 536 |
+
# Wait for a page to become available
|
| 537 |
+
max_wait = 30 # seconds
|
| 538 |
+
start_time = time.time()
|
| 539 |
+
|
| 540 |
+
while time.time() - start_time < max_wait:
|
| 541 |
+
page_info = self.page_pool.get_ready_page()
|
| 542 |
+
if page_info:
|
| 543 |
+
return page_info
|
| 544 |
+
await asyncio.sleep(0.05)
|
| 545 |
+
|
| 546 |
+
raise TimeoutError("No pages available within timeout period")
|
| 547 |
+
|
| 548 |
+
async def fetch(self, url: str) -> Response:
|
| 549 |
+
"""Opens up the browser and do your request based on your chosen options.
|
| 550 |
+
|
| 551 |
+
:param url: The Target url.
|
| 552 |
+
:return: A `Response` object.
|
| 553 |
+
"""
|
| 554 |
+
if self._closed:
|
| 555 |
+
raise RuntimeError("Context manager has been closed")
|
| 556 |
+
|
| 557 |
+
final_response = None
|
| 558 |
+
referer = generate_convincing_referer(url) if self.google_search else None
|
| 559 |
+
|
| 560 |
+
async def handle_response(finished_response: AsyncPlaywrightResponse):
|
| 561 |
+
nonlocal final_response
|
| 562 |
+
if (
|
| 563 |
+
finished_response.request.resource_type == "document"
|
| 564 |
+
and finished_response.request.is_navigation_request()
|
| 565 |
+
):
|
| 566 |
+
final_response = finished_response
|
| 567 |
+
|
| 568 |
+
page_info = await self._get_or_create_page()
|
| 569 |
+
page_info.mark_busy(url=url)
|
| 570 |
+
|
| 571 |
+
try:
|
| 572 |
+
# Navigate to URL and wait for a specified state
|
| 573 |
+
page_info.page.on("response", handle_response)
|
| 574 |
+
first_response = await page_info.page.goto(url, referer=referer)
|
| 575 |
+
await page_info.page.wait_for_load_state(state="domcontentloaded")
|
| 576 |
+
|
| 577 |
+
if self.network_idle:
|
| 578 |
+
await page_info.page.wait_for_load_state("networkidle")
|
| 579 |
+
|
| 580 |
+
if not first_response:
|
| 581 |
+
raise RuntimeError(f"Failed to get response for {url}")
|
| 582 |
+
|
| 583 |
+
if self.page_action is not None:
|
| 584 |
+
try:
|
| 585 |
+
page_info.page = await self.page_action(page_info.page)
|
| 586 |
+
except Exception as e:
|
| 587 |
+
log.error(f"Error executing page_action: {e}")
|
| 588 |
+
|
| 589 |
+
if self.wait_selector:
|
| 590 |
+
try:
|
| 591 |
+
waiter: AsyncLocator = page_info.page.locator(self.wait_selector)
|
| 592 |
+
await waiter.first.wait_for(state=self.wait_selector_state)
|
| 593 |
+
# Wait again after waiting for the selector, helpful with protections like Cloudflare
|
| 594 |
+
await page_info.page.wait_for_load_state(state="load")
|
| 595 |
+
await page_info.page.wait_for_load_state(state="domcontentloaded")
|
| 596 |
+
if self.network_idle:
|
| 597 |
+
await page_info.page.wait_for_load_state("networkidle")
|
| 598 |
+
except Exception as e:
|
| 599 |
+
log.error(f"Error waiting for selector {self.wait_selector}: {e}")
|
| 600 |
+
|
| 601 |
+
await page_info.page.wait_for_timeout(self.wait)
|
| 602 |
+
|
| 603 |
+
# Create response object
|
| 604 |
+
response = await ResponseFactory.from_async_playwright_response(
|
| 605 |
+
page_info.page, first_response, final_response, self.adaptor_arguments
|
| 606 |
+
)
|
| 607 |
+
|
| 608 |
+
# Mark page as ready for next use
|
| 609 |
+
page_info.mark_ready()
|
| 610 |
+
|
| 611 |
+
return response
|
| 612 |
+
|
| 613 |
+
except Exception as e:
|
| 614 |
+
page_info.mark_error()
|
| 615 |
+
raise e
|
scrapling/engines/_browsers/_page.py
ADDED
|
@@ -0,0 +1,93 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
from threading import RLock
|
| 2 |
+
from dataclasses import dataclass
|
| 3 |
+
|
| 4 |
+
from playwright.sync_api import Page as SyncPage
|
| 5 |
+
from playwright.async_api import Page as AsyncPage
|
| 6 |
+
|
| 7 |
+
from scrapling.core._types import Optional, Union, List, Literal
|
| 8 |
+
|
| 9 |
+
PageState = Literal["ready", "busy", "error"] # States that a page can be in
|
| 10 |
+
|
| 11 |
+
|
| 12 |
+
@dataclass
class PageInfo:
    """Information about the page and its current state"""

    __slots__ = ("page", "state", "url")
    page: Union[SyncPage, AsyncPage]  # the browser tab this record tracks
    state: PageState  # one of "ready" | "busy" | "error"
    url: Optional[str]  # URL currently being served by the page, if busy

    def mark_busy(self, url: str = ""):
        """Mark the page as busy serving the given URL"""
        self.state = "busy"
        self.url = url

    def mark_ready(self):
        """Mark the page as ready for new requests"""
        self.state = "ready"
        self.url = ""

    def mark_error(self):
        """Mark the page as having an error"""
        self.state = "error"

    def __repr__(self):
        # `!r` already quotes the value; wrapping it in literal quotes as well
        # produced doubled quoting like URL="'...'", so the quotes are dropped.
        return f"Page(URL={self.url!r}, state={self.state!r})"

    def __eq__(self, other_page):
        """Two PageInfo records are equal when they wrap the same page object."""
        if other_page.__class__ is not self.__class__:
            return NotImplemented
        return self.page == other_page.page
|
| 43 |
+
|
| 44 |
+
|
| 45 |
+
class PagePool:
    """Manages a pool of browser pages/tabs with state tracking"""

    __slots__ = ("max_pages", "pages", "_lock")

    def __init__(self, max_pages: int = 5):
        self.max_pages = max_pages
        self.pages: List[PageInfo] = []
        self._lock = RLock()

    def add_page(self, page: Union[SyncPage, AsyncPage]) -> PageInfo:
        """Register a new page with the pool and return its tracking record."""
        with self._lock:
            if len(self.pages) >= self.max_pages:
                raise RuntimeError(f"Maximum page limit ({self.max_pages}) reached")

            info = PageInfo(page, "ready", "")
            self.pages.append(info)
            return info

    def get_ready_page(self) -> Optional[PageInfo]:
        """Return the first page in the 'ready' state, or None if none is free."""
        with self._lock:
            return next((p for p in self.pages if p.state == "ready"), None)

    @property
    def pages_count(self) -> int:
        """Total number of pages tracked by the pool."""
        return len(self.pages)

    @property
    def ready_count(self) -> int:
        """Number of pages currently in the 'ready' state."""
        with self._lock:
            return len([p for p in self.pages if p.state == "ready"])

    @property
    def busy_count(self) -> int:
        """Number of pages currently in the 'busy' state."""
        with self._lock:
            return len([p for p in self.pages if p.state == "busy"])

    def cleanup_error_pages(self):
        """Drop every page that ended up in the 'error' state."""
        with self._lock:
            self.pages = [p for p in self.pages if p.state != "error"]
|
scrapling/engines/_browsers/_validators.py
ADDED
|
@@ -0,0 +1,88 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import msgspec
|
| 2 |
+
from urllib.parse import urlparse
|
| 3 |
+
|
| 4 |
+
from scrapling.core._types import (
|
| 5 |
+
Optional,
|
| 6 |
+
Union,
|
| 7 |
+
Dict,
|
| 8 |
+
Callable,
|
| 9 |
+
Iterable,
|
| 10 |
+
SelectorWaitStates,
|
| 11 |
+
)
|
| 12 |
+
from scrapling.engines.toolbelt import construct_proxy_dict
|
| 13 |
+
|
| 14 |
+
|
| 15 |
+
class PlaywrightConfig(msgspec.Struct, kw_only=True, frozen=False):
    """Configuration struct for validation"""

    max_pages: int = 1
    cdp_url: Optional[str] = None
    headless: bool = True
    google_search: bool = True
    hide_canvas: bool = False
    disable_webgl: bool = False
    real_chrome: bool = False
    stealth: bool = False
    wait: Union[int, float] = 0
    page_action: Optional[Callable] = None
    proxy: Optional[Union[str, Dict[str, str]]] = (
        None  # The default value for proxy in Playwright's source is `None`
    )
    locale: str = "en-US"
    extra_headers: Optional[Dict[str, str]] = None
    useragent: Optional[str] = None
    timeout: Union[int, float] = 30000
    disable_resources: bool = False
    wait_selector: Optional[str] = None
    cookies: Optional[Iterable[Dict]] = None
    network_idle: bool = False
    wait_selector_state: SelectorWaitStates = "attached"
    adaptor_arguments: Optional[Dict] = None

    def __post_init__(self):
        """Custom validation after msgspec validation"""
        if self.max_pages < 1 or self.max_pages > 50:
            raise ValueError("max_pages must be between 1 and 50")
        # Must stay in sync with the `SelectorWaitStates` Literal
        if self.wait_selector_state not in (
            "attached",
            "detached",
            "hidden",
            "visible",
        ):
            raise ValueError(f"Invalid wait_selector_state: {self.wait_selector_state}")
        if self.timeout < 0:
            raise ValueError("timeout must be >= 0")
        if self.page_action is not None and not callable(self.page_action):
            raise TypeError(
                f"page_action must be callable, got {type(self.page_action).__name__}"
            )
        if self.proxy:
            # Normalize user-supplied proxy (string or dict) into Playwright's shape
            self.proxy = construct_proxy_dict(self.proxy, as_tuple=True)
        if self.cdp_url:
            self.__validate_cdp(self.cdp_url)

    @staticmethod
    def __validate_cdp(cdp_url):
        """Validate that `cdp_url` looks like a usable CDP websocket endpoint.

        :raises ValueError: with a specific message for known problems; any
            unexpected failure is wrapped in a generic ValueError.
        """
        # Probe the URL first; only unexpected failures get wrapped. (Raising
        # the specific ValueErrors inside this try would let the broad
        # `except Exception` re-wrap them, doubling the error message.)
        try:
            scheme_ok = cdp_url.startswith(("ws://", "wss://"))
            host_ok = bool(urlparse(cdp_url).netloc)
        except AttributeError as e:
            # `cdp_url` isn't a string-like object at all
            raise ValueError(f"Malformed CDP URL: {cdp_url}: {str(e)}")
        except Exception as e:
            raise ValueError(f"Invalid CDP URL '{cdp_url}': {str(e)}")

        # Check the scheme
        if not scheme_ok:
            raise ValueError("CDP URL must use 'ws://' or 'wss://' scheme")

        # Validate hostname and port
        if not host_ok:
            raise ValueError("Invalid hostname for the CDP URL")
|
| 80 |
+
|
| 81 |
+
|
| 82 |
+
def validate(params, model):
    """Convert `params` into a validated instance of `model`.

    msgspec validation failures are surfaced to the caller as TypeError.
    """
    try:
        return msgspec.convert(params, model)
    except msgspec.ValidationError as e:
        raise TypeError(f"Invalid argument type: {e}")
|
scrapling/engines/constants.py
CHANGED
|
@@ -12,6 +12,15 @@ DEFAULT_DISABLED_RESOURCES = {
|
|
| 12 |
"stylesheet",
|
| 13 |
}
|
| 14 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 15 |
DEFAULT_STEALTH_FLAGS = (
|
| 16 |
# Explanation: https://peter.sh/experiments/chromium-command-line-switches/
|
| 17 |
# Generally this will make the browser faster and less detectable
|
|
|
|
| 12 |
"stylesheet",
|
| 13 |
}
|
| 14 |
|
| 15 |
+
# Chromium switches that are filtered out of the launch arguments
# (commented entries are candidates kept for reference, not active).
HARMFUL_DEFAULT_ARGS = (
    # This will be ignored to avoid detection more and possibly avoid the popup crashing bug abuse: https://issues.chromium.org/issues/340836884
    "--enable-automation",
    "--disable-popup-blocking",
    # '--disable-component-update',
    # '--disable-default-apps',
    # '--disable-extensions',
)
|
| 23 |
+
|
| 24 |
DEFAULT_STEALTH_FLAGS = (
|
| 25 |
# Explanation: https://peter.sh/experiments/chromium-command-line-switches/
|
| 26 |
# Generally this will make the browser faster and less detectable
|