from pathlib import Path
from typing import Annotated
from functools import lru_cache
from urllib.parse import urlparse
from dataclasses import dataclass, fields
from msgspec import Struct, Meta, convert, ValidationError
from scrapling.core._types import (
Any,
Dict,
List,
Set,
Tuple,
Optional,
Callable,
Sequence,
overload,
SetCookieParam,
SelectorWaitStates,
)
from scrapling.engines.toolbelt.proxy_rotation import ProxyRotator
from scrapling.engines.toolbelt.navigation import construct_proxy_dict
from scrapling.engines._browsers._types import PlaywrightFetchParams, StealthFetchParams
# Custom validators for msgspec
@lru_cache(8)
def _is_invalid_file_path(value: str) -> bool | str: # pragma: no cover
"""Fast file path validation"""
path = Path(value)
if not path.exists():
return f"Init script path not found: {value}"
if not path.is_file():
return f"Init script is not a file: {value}"
if not path.is_absolute():
return f"Init script is not a absolute path: {value}"
return False
@lru_cache(2)
def _is_invalid_cdp_url(cdp_url: str) -> bool | str:
"""Fast CDP URL validation"""
if not cdp_url.startswith(("ws://", "wss://")):
return "CDP URL must use 'ws://' or 'wss://' scheme"
netloc = urlparse(cdp_url).netloc
if not netloc: # pragma: no cover
return "Invalid hostname for the CDP URL"
return False
# Type aliases for cleaner annotations
PagesCount = Annotated[int, Meta(ge=1, le=50)]  # integer constrained to 1..50
RetriesCount = Annotated[int, Meta(ge=1, le=10)]  # integer constrained to 1..10
# `Annotated` treats only its first argument as the type and everything after it as
# metadata, so `int` and `float` must be combined into a union for both to be accepted.
Seconds = Annotated[int | float, Meta(ge=0)]  # non-negative int or float
class PlaywrightConfig(Struct, kw_only=True, frozen=False, weakref=True):
    """Keyword-only, mutable option set checked by msgspec.

    Field types and numeric bounds are enforced by msgspec; `__post_init__`
    then applies the checks that cannot be expressed as annotations and
    normalises empty containers.
    """

    max_pages: PagesCount = 1
    headless: bool = True
    disable_resources: bool = False
    network_idle: bool = False
    load_dom: bool = True
    wait_selector: Optional[str] = None
    wait_selector_state: SelectorWaitStates = "attached"
    cookies: Sequence[SetCookieParam] | None = []
    google_search: bool = True
    wait: Seconds = 0
    timezone_id: str | None = ""
    page_action: Optional[Callable] = None
    proxy: Optional[str | Dict[str, str] | Tuple] = None  # The default value for proxy in Playwright's source is `None`
    proxy_rotator: Optional[ProxyRotator] = None
    extra_headers: Optional[Dict[str, str]] = None
    timeout: Seconds = 30000
    init_script: Optional[str] = None
    user_data_dir: str = ""
    selector_config: Optional[Dict] = {}
    additional_args: Optional[Dict] = {}
    locale: str | None = None
    real_chrome: bool = False
    cdp_url: Optional[str] = None
    useragent: Optional[str] = None
    extra_flags: Optional[List[str]] = None
    blocked_domains: Optional[Set[str]] = None
    retries: RetriesCount = 3
    retry_delay: Seconds = 1

    def __post_init__(self):  # pragma: no cover
        """Run the extra checks and normalisation after msgspec's own validation.

        :raises TypeError: If `page_action` is set but is not callable.
        :raises ValueError: If both `proxy` and `proxy_rotator` are set, or if
            `cdp_url` / `init_script` fail their validators.
        """
        action = self.page_action
        if action and not callable(action):
            raise TypeError(f"page_action must be callable, got {type(action).__name__}")

        if self.proxy:
            # A static proxy and a rotator are mutually exclusive.
            if self.proxy_rotator:
                raise ValueError(
                    "Cannot use 'proxy_rotator' together with 'proxy'. "
                    "Use either a static proxy or proxy rotation, not both."
                )
            self.proxy = construct_proxy_dict(self.proxy)

        if self.cdp_url and (cdp_error := _is_invalid_cdp_url(self.cdp_url)):
            raise ValueError(cdp_error)

        # Replace any falsy container option (e.g. `None`) with a fresh empty container.
        for attr_name, make_empty in (
            ("cookies", list),
            ("extra_flags", list),
            ("selector_config", dict),
            ("additional_args", dict),
        ):
            if not getattr(self, attr_name):
                setattr(self, attr_name, make_empty())

        if self.init_script is not None and (path_error := _is_invalid_file_path(self.init_script)):
            raise ValueError(path_error)
class StealthConfig(PlaywrightConfig, kw_only=True, frozen=False, weakref=True):
    """`PlaywrightConfig` extended with the stealth-specific switches below."""

    allow_webgl: bool = True
    hide_canvas: bool = False
    block_webrtc: bool = False
    solve_cloudflare: bool = False

    def __post_init__(self):
        """Run the parent checks, then enforce the Cloudflare timeout floor."""
        super(StealthConfig, self).__post_init__()
        if self.solve_cloudflare:
            # Cloudflare timeout adjustment: never go below 60_000 when solving is enabled
            self.timeout = max(self.timeout, 60_000)
@dataclass
class _fetch_params:
    """Container for the per-request parameters assembled by `validate_fetch`.

    Every field is required; the field names are also what `validate_fetch`
    uses to decide which keys to collect.
    """

    # Navigation and timing
    google_search: bool
    timeout: Seconds
    wait: Seconds
    # Page interaction and request headers
    page_action: Optional[Callable]
    extra_headers: Optional[Dict[str, str]]
    # Loading and waiting behaviour
    disable_resources: bool
    wait_selector: Optional[str]
    wait_selector_state: SelectorWaitStates
    network_idle: bool
    load_dom: bool
    # Blocking, Cloudflare handling and selector options
    blocked_domains: Optional[Set[str]]
    solve_cloudflare: bool
    selector_config: Dict
def validate_fetch(
    method_kwargs: Dict | PlaywrightFetchParams | StealthFetchParams,
    session: Any,
    model: type[PlaywrightConfig] | type[StealthConfig],
) -> _fetch_params:  # pragma: no cover
    """Build the `_fetch_params` for one fetch call.

    For each `_fetch_params` field, a value passed in `method_kwargs` wins and
    is validated through `model`; otherwise the value is read from
    `session._config` when the session has one with that attribute.

    :param method_kwargs: The keyword arguments given to the fetch call.
    :param session: Object whose `_config` attribute (if present) supplies fallback values.
    :param model: The config struct used to validate the overridden values.
    :return: A populated `_fetch_params` instance.
    """
    supplied: Dict[str, Any] = dict(method_kwargs)
    overrides: Dict[str, Any] = {}
    merged: Dict[str, Any] = {}

    # Split the fields `_fetch_params` needs into per-call overrides and session fallbacks
    for name in {spec.name for spec in fields(_fetch_params)}:
        if name in supplied:
            overrides[name] = supplied[name]
            continue
        if hasattr(session, "_config") and hasattr(session._config, name):
            merged[name] = getattr(session._config, name)

    if overrides:
        checked = validate(overrides, model)
        # Copy back ONLY the keys the caller overrode, so the model's defaults
        # for untouched fields never replace values taken from the session config.
        for name in overrides:
            if hasattr(checked, name):
                merged[name] = getattr(checked, name)
        # Keep an explicit solve_cloudflare even when the model has no such field
        if "solve_cloudflare" in overrides:
            merged["solve_cloudflare"] = overrides["solve_cloudflare"]

    # Fallbacks for keys that neither the call nor the session provided
    merged.setdefault("solve_cloudflare", False)
    merged.setdefault("blocked_domains", None)
    return _fetch_params(**merged)
# Per-model cache of plain (non-factory) field defaults, keyed by model class name,
# so validation can skip parameters that were left at their default value.
models_default_values = {}
for _model in (StealthConfig, PlaywrightConfig):
    _defaults = {}
    if hasattr(_model, "__struct_defaults__") and hasattr(_model, "__struct_fields__"):
        _defaults = {
            field_name: default_value
            for field_name, default_value in zip(_model.__struct_fields__, _model.__struct_defaults__)  # type: ignore
            # Factory defaults (msgspec._core.Factory instances) are left out of the cache
            if type(default_value).__name__ != "Factory"
        }
    models_default_values[_model.__name__] = _defaults
def _filter_defaults(params: Dict, model: str) -> Dict:
    """Return a copy of `params` without the entries that equal the model's cached default.

    :param params: The parameters to filter.
    :param model: The model class name used as the key into `models_default_values`.
    :return: A new dict holding only the keys with no cached default or a non-default value.
    """
    known_defaults = models_default_values[model]
    kept = {}
    for name, value in params.items():
        if name not in known_defaults or value != known_defaults[name]:
            kept[name] = value
    return kept
@overload
def validate(params: Dict, model: type[StealthConfig]) -> StealthConfig: ...


@overload
def validate(params: Dict, model: type[PlaywrightConfig]) -> PlaywrightConfig: ...


def validate(params: Dict, model: type[PlaywrightConfig] | type[StealthConfig]) -> PlaywrightConfig | StealthConfig:
    """Convert `params` into an instance of `model`.

    Parameters equal to the model's cached defaults are dropped first, since
    they need no validation.

    :param params: The raw parameters to validate.
    :param model: The config struct class to build.
    :return: The validated config instance.
    :raises TypeError: If msgspec rejects the parameters.
    """
    try:
        non_default = _filter_defaults(params, model.__name__)
        instance = convert(non_default, model)
    except ValidationError as exc:
        raise TypeError(f"Invalid argument type: {exc}") from exc
    return instance
|