File size: 8,183 Bytes
11165c4
da46469
8bf21c0
da46469
8bf21c0
da46469
 
0cd97d9
 
c908f33
0cd97d9
62613cb
47dd985
da46469
 
 
ee2299e
e5ecf76
ee2299e
 
0cd97d9
ed96cdc
a85d2c8
ee2299e
0cd97d9
 
da46469
8bf21c0
624e46f
da46469
 
 
8bf21c0
da46469
8bf21c0
da46469
8bf21c0
 
da46469
 
8bf21c0
 
da46469
8bf21c0
 
da46469
8bf21c0
624e46f
8bf21c0
 
da46469
 
 
 
024cbba
da46469
 
 
8bf21c0
0cd97d9
 
da46469
0cd97d9
05b2188
 
 
 
 
 
0cd97d9
da46469
05b2188
0cd97d9
e5ecf76
ed96cdc
0cd97d9
da46469
03de577
c61a805
e5ecf76
0de8025
05b2188
 
 
 
 
47dd985
024cbba
 
0cd97d9
624e46f
0cd97d9
226b463
 
ed96cdc
 
 
 
 
0cd97d9
f62e5ea
0cd97d9
8bf21c0
 
 
da46469
62613cb
 
b6969b2
 
8e67a4c
 
0de8025
 
0cd97d9
03de577
8bf21c0
 
 
0cd97d9
 
ee2299e
42a1f3d
ee2299e
 
42a1f3d
 
 
 
ee2299e
da46469
42a1f3d
 
 
 
da46469
 
 
 
 
 
 
 
 
 
 
 
 
 
47dd985
da46469
e5ecf76
da46469
 
8bf21c0
ee2299e
c908f33
ee2299e
624e46f
31c2447
 
 
da46469
c908f33
 
 
 
31c2447
 
ee2299e
 
da46469
 
8bf21c0
ee2299e
 
8bf21c0
ee2299e
8bf21c0
ee2299e
 
 
 
da46469
4836205
 
8bf21c0
4836205
8bf21c0
47dd985
da46469
 
 
 
8bf21c0
 
 
ee2299e
8bf21c0
 
 
 
 
 
 
 
 
 
 
 
 
 
e5ecf76
 
 
ee2299e
e5ecf76
 
 
ee2299e
e5ecf76
 
ee2299e
0cd97d9
8bf21c0
 
 
e67328d
da46469
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
from pathlib import Path
from typing import Annotated
from functools import lru_cache
from urllib.parse import urlparse
from dataclasses import dataclass, fields

from msgspec import Struct, Meta, convert, ValidationError

from scrapling.core._types import (
    Any,
    Dict,
    List,
    Set,
    Tuple,
    Optional,
    Callable,
    Sequence,
    overload,
    SetCookieParam,
    SelectorWaitStates,
)
from scrapling.engines.toolbelt.proxy_rotation import ProxyRotator
from scrapling.engines.toolbelt.navigation import construct_proxy_dict
from scrapling.engines._browsers._types import PlaywrightFetchParams, StealthFetchParams


# Custom validators for msgspec
@lru_cache(8)
def _is_invalid_file_path(value: str) -> bool | str:  # pragma: no cover
    """Fast file path validation"""
    path = Path(value)
    if not path.exists():
        return f"Init script path not found: {value}"
    if not path.is_file():
        return f"Init script is not a file: {value}"
    if not path.is_absolute():
        return f"Init script is not a absolute path: {value}"
    return False


@lru_cache(2)
def _is_invalid_cdp_url(cdp_url: str) -> bool | str:
    """Fast CDP URL validation"""
    if not cdp_url.startswith(("ws://", "wss://")):
        return "CDP URL must use 'ws://' or 'wss://' scheme"

    netloc = urlparse(cdp_url).netloc
    if not netloc:  # pragma: no cover
        return "Invalid hostname for the CDP URL"
    return False


# Type aliases for cleaner annotations
PagesCount = Annotated[int, Meta(ge=1, le=50)]
RetriesCount = Annotated[int, Meta(ge=1, le=10)]
# `Annotated` takes exactly ONE type followed by metadata, so "int or float" has to be spelled as a union.
# Writing `Annotated[int, float, ...]` would only attach `float` as metadata and leave the alias int-only.
Seconds = Annotated[int | float, Meta(ge=0)]


class PlaywrightConfig(Struct, kw_only=True, frozen=False, weakref=True):
    """Configuration struct for validation

    All fields are keyword-only and have defaults. After the struct is built, `__post_init__` runs the checks
    that annotations alone cannot express and replaces falsy container options with fresh empty containers.
    """

    max_pages: PagesCount = 1
    headless: bool = True
    disable_resources: bool = False
    network_idle: bool = False
    load_dom: bool = True
    wait_selector: Optional[str] = None
    wait_selector_state: SelectorWaitStates = "attached"
    cookies: Sequence[SetCookieParam] | None = []
    google_search: bool = True
    wait: Seconds = 0
    timezone_id: str | None = ""
    page_action: Optional[Callable] = None
    proxy: Optional[str | Dict[str, str] | Tuple] = None  # The default value for proxy in Playwright's source is `None`
    proxy_rotator: Optional[ProxyRotator] = None
    extra_headers: Optional[Dict[str, str]] = None
    timeout: Seconds = 30000
    init_script: Optional[str] = None
    user_data_dir: str = ""
    selector_config: Optional[Dict] = {}
    additional_args: Optional[Dict] = {}
    locale: str | None = None
    real_chrome: bool = False
    cdp_url: Optional[str] = None
    useragent: Optional[str] = None
    extra_flags: Optional[List[str]] = None
    blocked_domains: Optional[Set[str]] = None
    retries: RetriesCount = 3
    retry_delay: Seconds = 1

    def __post_init__(self):  # pragma: no cover
        """Run the cross-field checks and normalize empty values.

        :raises TypeError: If `page_action` is set to something that is not callable.
        :raises ValueError: If both `proxy` and `proxy_rotator` are set, or if `cdp_url` / `init_script`
            fails its validator.
        """
        action = self.page_action
        if action and not callable(action):
            raise TypeError(f"page_action must be callable, got {type(action).__name__}")

        if self.proxy:
            # A static proxy and a rotator are mutually exclusive
            if self.proxy_rotator:
                raise ValueError(
                    "Cannot use 'proxy_rotator' together with 'proxy'. "
                    "Use either a static proxy or proxy rotation, not both."
                )
            self.proxy = construct_proxy_dict(self.proxy)

        if self.cdp_url and (cdp_problem := _is_invalid_cdp_url(self.cdp_url)):
            raise ValueError(cdp_problem)

        # Falsy values (including `None`) for these options become a fresh empty container
        empty_containers = (
            ("cookies", list),
            ("extra_flags", list),
            ("selector_config", dict),
            ("additional_args", dict),
        )
        for option, make_empty in empty_containers:
            if not getattr(self, option):
                setattr(self, option, make_empty())

        # Only `None` skips the path check; any string, even an empty one, is validated
        if self.init_script is not None and (script_problem := _is_invalid_file_path(self.init_script)):
            raise ValueError(script_problem)


class StealthConfig(PlaywrightConfig, kw_only=True, frozen=False, weakref=True):
    """`PlaywrightConfig` plus four stealth-specific boolean switches."""

    allow_webgl: bool = True
    hide_canvas: bool = False
    block_webrtc: bool = False
    solve_cloudflare: bool = False

    def __post_init__(self):
        """Run the parent validation, then enforce a minimum `timeout` when `solve_cloudflare` is on."""
        super(StealthConfig, self).__post_init__()
        if not self.solve_cloudflare:
            return
        # Cloudflare timeout adjustment: raise anything below 60_000, leave larger values untouched
        if self.timeout < 60_000:
            self.timeout = 60_000


@dataclass
class _fetch_params:
    """A dataclass of all parameters used by `fetch` calls

    `validate_fetch` builds instances of this class by keyword. The set of field names declared here is also
    what `validate_fetch` uses to decide which keys to read from the per-call arguments and from the
    session's `_config`.
    """

    google_search: bool
    timeout: Seconds
    wait: Seconds
    page_action: Optional[Callable]
    extra_headers: Optional[Dict[str, str]]
    disable_resources: bool
    wait_selector: Optional[str]
    wait_selector_state: SelectorWaitStates
    network_idle: bool
    load_dom: bool
    # `validate_fetch` falls back to `None` (blocked_domains) and `False` (solve_cloudflare)
    # when neither the call nor the session supplies these two
    blocked_domains: Optional[Set[str]]
    solve_cloudflare: bool
    selector_config: Dict


def validate_fetch(
    method_kwargs: Dict | PlaywrightFetchParams | StealthFetchParams,
    session: Any,
    model: type[PlaywrightConfig] | type[StealthConfig],
) -> _fetch_params:  # pragma: no cover
    """Merge per-call fetch arguments with the session configuration into a `_fetch_params`.

    For every field declared on `_fetch_params`, a value given in `method_kwargs` wins; otherwise the value is
    read from `session._config` when the session has that attribute and it defines the field. Only the per-call
    values go through `validate`.

    :param method_kwargs: The keyword arguments passed to the fetch call.
    :param session: The session object; its `_config` attribute, if present, supplies the fallback values.
    :param model: The config struct class used to validate the per-call values.
    :return: The merged parameters.
    :raises TypeError: Raised by `validate` when a per-call value is rejected.
    """
    call_kwargs: Dict[str, Any] = dict(method_kwargs)
    session_has_config = hasattr(session, "_config")
    merged: Dict[str, Any] = {}
    overrides: Dict[str, Any] = {}

    # Only names declared on `_fetch_params` are considered; other keys in `method_kwargs` are ignored
    for name in {f.name for f in fields(_fetch_params)}:
        if name in call_kwargs:
            overrides[name] = call_kwargs[name]
            continue
        if session_has_config and hasattr(session._config, name):
            merged[name] = getattr(session._config, name)

    if overrides:
        checked = validate(overrides, model)
        # Copy back ONLY the fields the caller actually overrode (not all fields),
        # so validated defaults never overwrite values taken from the session config
        for name in overrides:
            if hasattr(checked, name):
                merged[name] = getattr(checked, name)

        # An explicitly provided `solve_cloudflare` is kept as given, even if the model doesn't have the field
        if "solve_cloudflare" in overrides:
            merged["solve_cloudflare"] = overrides["solve_cloudflare"]

    # solve_cloudflare defaults to False for models that don't have it (PlaywrightConfig)
    merged.setdefault("solve_cloudflare", False)
    merged.setdefault("blocked_domains", None)

    return _fetch_params(**merged)


# Cache default values for each model to reduce validation overhead
# Shape: {model class name: {field name: default value}}; read by `_filter_defaults`
models_default_values = {}

for _model in (StealthConfig, PlaywrightConfig):
    _defaults = {}
    if hasattr(_model, "__struct_defaults__") and hasattr(_model, "__struct_fields__"):
        _field_names = _model.__struct_fields__  # type: ignore
        _default_values = _model.__struct_defaults__  # type: ignore
        # NOTE(review): msgspec appears to keep defaults for the trailing fields only (like a function's
        # `__defaults__`), so pair them with the tail of the field tuple rather than zipping from the start.
        # While every field has a default (the case for both models today) the two pairings are identical.
        _first_default = len(_field_names) - len(_default_values)
        for field_name, default_value in zip(_field_names[_first_default:], _default_values):
            # Skip factory defaults - these are msgspec._core.Factory instances
            if type(default_value).__name__ != "Factory":
                _defaults[field_name] = default_value

    models_default_values[_model.__name__] = _defaults.copy()


def _filter_defaults(params: Dict, model: str) -> Dict:
    """Drop the parameters whose value equals the cached default for `model`.

    :param params: The parameters to filter; the mapping itself is not modified.
    :param model: The model class name used as the key into `models_default_values`.
    :return: A new dict with the parameters that are unknown to the model or differ from its default.
    """
    known_defaults = models_default_values[model]
    kept = {}
    for name, value in params.items():
        if name not in known_defaults:
            # No cached default for this name, so it always goes through validation
            kept[name] = value
        elif value != known_defaults[name]:
            kept[name] = value
    return kept


@overload
def validate(params: Dict, model: type[StealthConfig]) -> StealthConfig: ...


@overload
def validate(params: Dict, model: type[PlaywrightConfig]) -> PlaywrightConfig: ...


def validate(params: Dict, model: type[PlaywrightConfig] | type[StealthConfig]) -> PlaywrightConfig | StealthConfig:
    """Convert `params` into an instance of `model`, reporting msgspec validation failures as `TypeError`.

    :param params: The raw parameters to validate.
    :param model: The config struct class to build.
    :return: The validated config instance.
    :raises TypeError: If msgspec rejects the parameters; the original `ValidationError` is chained as the cause.
    """
    try:
        # Values equal to the model's defaults are dropped first (no need to validate them) to speed things up
        non_default = _filter_defaults(params, model.__name__)
        config = convert(non_default, model)
    except ValidationError as exc:
        raise TypeError(f"Invalid argument type: {exc}") from exc
    return config