GraziePrego committed on
Commit
5fabe30
·
verified ·
1 Parent(s): cff1e76

Upload folder using huggingface_hub

Browse files
__init__.py ADDED
@@ -0,0 +1,38 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ __author__ = "Karim Shoair (karim.shoair@pm.me)"
2
+ __version__ = "0.4.1"
3
+ __copyright__ = "Copyright (c) 2024 Karim Shoair"
4
+
5
+ from typing import Any, TYPE_CHECKING
6
+
7
+ if TYPE_CHECKING:
8
+ from scrapling.parser import Selector, Selectors
9
+ from scrapling.core.custom_types import AttributesHandler, TextHandler
10
+ from scrapling.fetchers import Fetcher, AsyncFetcher, StealthyFetcher, DynamicFetcher
11
+
12
+
13
+ # Lazy import mapping
14
# Public name -> (module path, attribute name); consulted by the module-level
# ``__getattr__`` so the heavy submodules are only imported on first use.
_LAZY_IMPORTS = {
    "Fetcher": ("scrapling.fetchers", "Fetcher"),
    "Selector": ("scrapling.parser", "Selector"),
    "Selectors": ("scrapling.parser", "Selectors"),
    "AttributesHandler": ("scrapling.core.custom_types", "AttributesHandler"),
    "TextHandler": ("scrapling.core.custom_types", "TextHandler"),
    "AsyncFetcher": ("scrapling.fetchers", "AsyncFetcher"),
    "StealthyFetcher": ("scrapling.fetchers", "StealthyFetcher"),
    "DynamicFetcher": ("scrapling.fetchers", "DynamicFetcher"),
}

# NOTE(review): "Selectors", "AttributesHandler" and "TextHandler" are lazily
# importable above but are not listed here -- confirm the omission is intentional.
__all__ = [
    "Selector",
    "Fetcher",
    "AsyncFetcher",
    "StealthyFetcher",
    "DynamicFetcher",
]
25
+
26
+
27
def __getattr__(name: str) -> Any:
    """Resolve a lazily exported name on first attribute access.

    Python calls this module-level hook only when normal lookup of ``name`` on
    the package fails. Names listed in ``_LAZY_IMPORTS`` are imported on demand
    and then stored in the module namespace, so later accesses are plain
    attribute lookups that never reach this function again.

    :param name: The attribute requested on the package.
    :return: The object exported under ``name``.
    :raises AttributeError: If ``name`` is not one of the lazily exported names.
    """
    try:
        module_path, class_name = _LAZY_IMPORTS[name]
    except KeyError:
        raise AttributeError(f"module {__name__!r} has no attribute {name!r}") from None

    from importlib import import_module

    resolved = getattr(import_module(module_path), class_name)
    # Cache the result so repeated accesses skip the import machinery entirely
    globals()[name] = resolved
    return resolved
34
+
35
+
36
def __dir__() -> list[str]:
    """Return the sorted names advertised to ``dir()`` and autocompletion.

    The listing is the public ``__all__`` plus the submodule names and the
    package metadata attributes.
    """
    advertised = [
        *__all__,
        "fetchers",
        "parser",
        "cli",
        "core",
        "__author__",
        "__version__",
        "__copyright__",
    ]
    advertised.sort()
    return advertised
cli.py ADDED
@@ -0,0 +1,826 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from pathlib import Path
2
+ from subprocess import check_output
3
+ from sys import executable as python_executable
4
+
5
+ from scrapling.core.utils import log
6
+ from scrapling.engines.toolbelt.custom import Response
7
+ from scrapling.core.utils._shell import _CookieParser, _ParseHeaders
8
+ from scrapling.core._types import List, Optional, Dict, Tuple, Any, Callable
9
+
10
+ from orjson import loads as json_loads, JSONDecodeError
11
+
12
+ try:
13
+ from click import command, option, Choice, group, argument
14
+ except (ImportError, ModuleNotFoundError) as e:
15
+ raise ModuleNotFoundError(
16
+ "You need to install scrapling with any of the extras to enable Shell commands. See: https://scrapling.readthedocs.io/en/latest/#installation"
17
+ ) from e
18
+
19
+ __OUTPUT_FILE_HELP__ = "The output file path can be an HTML file, a Markdown file of the HTML content, or the text content itself. Use file extensions (`.html`/`.md`/`.txt`) respectively."
20
+ __PACKAGE_DIR__ = Path(__file__).parent
21
+
22
+
23
def __Execute(cmd: List[str], help_line: str) -> None:  # pragma: no cover
    """Announce an installation step and run its command to completion.

    :param cmd: Argument vector passed to the subprocess; no shell is involved.
    :param help_line: Human-readable name of what is being installed, used in
        the printed progress message.

    Any error raised by ``check_output`` is deliberately left to propagate.
    """
    print(f"Installing {help_line}...")
    # No try/except on purpose: a failing step should stop the caller
    check_output(cmd, shell=False)  # nosec B603
27
+
28
+
29
def __ParseJSONData(json_string: Optional[str] = None) -> Optional[Dict[str, Any]]:
    """Parse a JSON string into a Python object.

    :param json_string: Raw JSON text; ``None`` or an empty string means that
        no data was supplied.
    :return: The decoded value, or ``None`` when nothing was supplied.
    :raises ValueError: If the text is not valid JSON. The decoder's own error
        is chained as the cause so the original traceback is preserved.
    """
    if not json_string:
        return None

    try:
        return json_loads(json_string)
    except JSONDecodeError as err:  # pragma: no cover
        raise ValueError(f"Invalid JSON data '{json_string}': {err}") from err
38
+
39
+
40
def __Request_and_Save(
    fetcher_func: Callable[..., Response],
    url: str,
    output_file: str,
    css_selector: Optional[str] = None,
    **kwargs,
) -> None:
    """Fetch ``url`` with the given fetcher callable and write the result to a file.

    :param fetcher_func: Callable invoked as ``fetcher_func(url, **kwargs)``.
    :param url: Target URL handed to the fetcher.
    :param output_file: Destination path; a relative path is resolved against
        the current working directory.
    :param css_selector: Optional CSS selector forwarded to the file writer.
    :param kwargs: Extra keyword arguments forwarded to ``fetcher_func``.
    """
    from scrapling.core.shell import Convertor

    # Anchor relative destinations to the current working directory
    destination = Path(output_file)
    destination = destination if destination.is_absolute() else Path.cwd() / output_file

    page = fetcher_func(url, **kwargs)
    Convertor.write_content_to_file(page, str(destination), css_selector)
    log.info(f"Content successfully saved to '{destination}'")
58
+
59
+
60
def __ParseExtractArguments(
    headers: List[str], cookies: Optional[str], params: List[str], json: Optional[str] = None
) -> Tuple[Dict[str, str], Dict[str, str], Dict[str, str], Optional[Dict[str, str]]]:
    """Parse the raw command-line values shared by the ``extract`` sub-commands.

    :param headers: Header strings handed to ``_ParseHeaders``, which returns
        both the parsed headers and an initial cookies mapping.
    :param cookies: Optional cookie string; its pairs are merged into the
        cookies mapping produced from the headers.
    :param params: Sequence of ``key=value`` strings. Entries without ``=``
        are ignored.
    :param json: Optional JSON text for the request body.
    :return: ``(headers, cookies, params, json)`` in parsed form.
    :raises ValueError: If the cookie string cannot be parsed, or the JSON
        text is invalid.
    """
    parsed_headers, parsed_cookies = _ParseHeaders(headers)
    if cookies:
        # The guard has to cover the iteration itself: any error raised while
        # `_CookieParser` consumes the string surfaces when the loop pulls the
        # next pair, not when the pair is stored in the mapping.
        try:
            for key, value in _CookieParser(cookies):
                parsed_cookies[key] = value
        except Exception as err:
            raise ValueError(f"Could not parse cookies '{cookies}': {err}") from err

    parsed_json = __ParseJSONData(json)
    parsed_params = {}
    for param in params:
        if "=" in param:
            # Split on the first "=" only so values may themselves contain "="
            key, value = param.split("=", 1)
            parsed_params[key] = value

    return parsed_headers, parsed_cookies, parsed_params, parsed_json
80
+
81
+
82
def __BuildRequest(
    headers: List[str], cookies: Optional[str], params: List[str], json: Optional[str] = None, **kwargs
) -> Dict:
    """Build the keyword arguments for a fetcher call from raw command-line values.

    :param headers: Header strings to parse.
    :param cookies: Optional cookie string to parse.
    :param params: Sequence of ``key=value`` query parameter strings.
    :param json: Optional JSON text for the request body.
    :param kwargs: Remaining options, passed through to the result. A
        comma-separated ``impersonate`` value is split into a list of names.
    :return: A single dictionary combining the parsed values and ``kwargs``.
    :raises ValueError: If the cookies or the JSON text cannot be parsed.
    """
    parsed_headers, parsed_cookies, parsed_params, parsed_json = __ParseExtractArguments(
        headers, cookies, params, json
    )

    request_kwargs: Dict[str, Any] = {
        "headers": parsed_headers if parsed_headers else None,
        "cookies": parsed_cookies if parsed_cookies else None,
    }
    # Compare against None rather than truthiness: `{}`, `[]`, `0` and `false`
    # are valid JSON bodies the user asked for and must not be dropped.
    if parsed_json is not None:
        request_kwargs["json"] = parsed_json
    if parsed_params:
        request_kwargs["params"] = parsed_params
    if "proxy" in kwargs:
        request_kwargs["proxy"] = kwargs.pop("proxy")

    # A comma-separated impersonate value becomes a list (for random selection)
    impersonate = kwargs.get("impersonate")
    if impersonate and "," in impersonate:
        kwargs["impersonate"] = [browser.strip() for browser in impersonate.split(",")]

    return {**request_kwargs, **kwargs}
103
+
104
+
105
@command(help="Install all Scrapling's Fetchers dependencies")
@option(
    "-f",
    "--force",
    "force",
    is_flag=True,
    default=False,
    type=bool,
    help="Force Scrapling to reinstall all Fetchers dependencies",
)
def install(force):  # pragma: no cover
    """Install the browser dependencies unless a previous run already did.

    A marker file inside the package directory records a completed install.
    It is only created after every step ran without raising.

    :param force: Reinstall even when the marker file exists.
    """
    marker = __PACKAGE_DIR__.joinpath(".scrapling_dependencies_installed")
    if not force and marker.exists():
        print("The dependencies are already installed")
        return

    playwright = [python_executable, "-m", "playwright"]
    __Execute([*playwright, "install", "chromium"], "Playwright browsers")
    __Execute([*playwright, "install-deps", "chromium"], "Playwright dependencies")

    from tld.utils import update_tld_names

    update_tld_names(fail_silently=True)
    # Reached only when none of the steps above raised
    marker.touch()
138
+
139
+
140
@command(help="Run Scrapling's MCP server (Check the docs for more info).")
@option(
    "--http",
    is_flag=True,
    default=False,
    help="Whether to run the MCP server in streamable-http transport or leave it as stdio (Default: False)",
)
@option(
    "--host",
    type=str,
    default="0.0.0.0",
    help="The host to use if streamable-http transport is enabled (Default: '0.0.0.0')",
)
@option("--port", type=int, default=8000, help="The port to use if streamable-http transport is enabled (Default: 8000)")
def mcp(http, host, port):
    """Create a ``ScraplingMCPServer`` and start serving.

    :param http: Flag forwarded to the server's ``serve`` call; per the option
        help it selects streamable-http transport instead of stdio.
    :param host: Host forwarded to ``serve``.
    :param port: Port forwarded to ``serve``.
    """
    # Imported here so the server module is only loaded when this command runs
    from scrapling.core.ai import ScraplingMCPServer

    ScraplingMCPServer().serve(http, host, port)
161
+
162
+
163
@command(help="Interactive scraping console")
@option(
    "-c",
    "--code",
    "code",
    is_flag=False,
    default="",
    type=str,
    help="Evaluate the code in the shell, print the result and exit",
)
@option(
    "-L",
    "--loglevel",
    "level",
    is_flag=False,
    default="debug",
    type=Choice(["debug", "info", "warning", "error", "critical", "fatal"], case_sensitive=False),
    help="Log level (default: DEBUG)",
)
def shell(code, level):
    """Start the custom interactive shell.

    :param code: Code string handed to ``CustomShell`` (empty by default).
    :param level: Log level name handed to ``CustomShell``.
    """
    # Imported here so the shell module is only loaded when this command runs
    from scrapling.core.shell import CustomShell

    CustomShell(code=code, log_level=level).start()
187
+
188
+
189
@group(
    help="Fetch web pages using various fetchers and extract full/selected HTML content as HTML, Markdown, or extract text content."
)
def extract():
    """Command group whose sub-commands fetch a page and save its content to a file."""
195
+
196
+
197
@extract.command(help=f"Perform a GET request and save the content to a file.\n\n{__OUTPUT_FILE_HELP__}")
@argument("url", required=True)
@argument("output_file", required=True)
@option("--headers", "-H", multiple=True, help='HTTP headers in format "Key: Value" (can be used multiple times)')
@option("--cookies", help='Cookies string in format "name1=value1; name2=value2"')
@option("--timeout", type=int, default=30, help="Request timeout in seconds (default: 30)")
@option("--proxy", help='Proxy URL in format "http://username:password@host:port"')
@option("--css-selector", "-s", help="CSS selector to extract specific content from the page. It returns all matches.")
@option("--params", "-p", multiple=True, help='Query parameters in format "key=value" (can be used multiple times)')
@option("--follow-redirects/--no-follow-redirects", default=True, help="Whether to follow redirects (default: True)")
@option("--verify/--no-verify", default=True, help="Whether to verify SSL certificates (default: True)")
@option(
    "--impersonate",
    help="Browser to impersonate. Can be a single browser (e.g., chrome) or comma-separated list for random selection (e.g., chrome,firefox,safari).",
)
@option("--stealthy-headers/--no-stealthy-headers", default=True, help="Use stealthy browser headers (default: True)")
def get(
    url,
    output_file,
    headers,
    cookies,
    timeout,
    proxy,
    css_selector,
    params,
    follow_redirects,
    verify,
    impersonate,
    stealthy_headers,
):
    """
    Send a GET request with ``Fetcher.get`` and save the response content to a file.

    :param url: Target URL for the request.
    :param output_file: Output file path; the extension (`.html`/`.md`/`.txt`) selects the saved format.
    :param headers: HTTP header strings to include in the request.
    :param cookies: Cookie string to use in the request.
    :param timeout: Number of seconds to wait before timing out.
    :param proxy: Proxy URL to use. (Format: "http://username:password@localhost:8030")
    :param css_selector: CSS selector to extract specific content.
    :param params: Query string parameters for the request.
    :param follow_redirects: Whether to follow redirects.
    :param verify: Whether to verify HTTPS certificates.
    :param impersonate: Browser to impersonate, or a comma-separated list of browsers.
    :param stealthy_headers: If enabled, creates and adds real browser headers.
    """
    request_options = {
        "timeout": timeout,
        "follow_redirects": follow_redirects,
        "verify": verify,
        "stealthy_headers": stealthy_headers,
        "impersonate": impersonate,
        "proxy": proxy,
    }
    # GET carries no JSON body, hence the explicit None
    request = __BuildRequest(headers, cookies, params, None, **request_options)

    from scrapling.fetchers import Fetcher

    __Request_and_Save(Fetcher.get, url, output_file, css_selector, **request)
285
+
286
+
287
@extract.command(help=f"Perform a POST request and save the content to a file.\n\n{__OUTPUT_FILE_HELP__}")
@argument("url", required=True)
@argument("output_file", required=True)
@option(
    "--data",
    "-d",
    help='Form data to include in the request body (as string, ex: "param1=value1&param2=value2")',
)
@option("--json", "-j", help="JSON data to include in the request body (as string)")
@option("--headers", "-H", multiple=True, help='HTTP headers in format "Key: Value" (can be used multiple times)')
@option("--cookies", help='Cookies string in format "name1=value1; name2=value2"')
@option("--timeout", type=int, default=30, help="Request timeout in seconds (default: 30)")
@option("--proxy", help='Proxy URL in format "http://username:password@host:port"')
@option("--css-selector", "-s", help="CSS selector to extract specific content from the page. It returns all matches.")
@option("--params", "-p", multiple=True, help='Query parameters in format "key=value" (can be used multiple times)')
@option("--follow-redirects/--no-follow-redirects", default=True, help="Whether to follow redirects (default: True)")
@option("--verify/--no-verify", default=True, help="Whether to verify SSL certificates (default: True)")
@option(
    "--impersonate",
    help="Browser to impersonate. Can be a single browser (e.g., chrome) or comma-separated list for random selection (e.g., chrome,firefox,safari).",
)
@option("--stealthy-headers/--no-stealthy-headers", default=True, help="Use stealthy browser headers (default: True)")
def post(
    url,
    output_file,
    data,
    json,
    headers,
    cookies,
    timeout,
    proxy,
    css_selector,
    params,
    follow_redirects,
    verify,
    impersonate,
    stealthy_headers,
):
    """
    Send a POST request with ``Fetcher.post`` and save the response content to a file.

    :param url: Target URL for the request.
    :param output_file: Output file path; the extension (`.html`/`.md`/`.txt`) selects the saved format.
    :param data: Form data to include in the request body. (as string, ex: "param1=value1&param2=value2")
    :param json: JSON text to include in the body of the request.
    :param headers: HTTP header strings to include in the request.
    :param cookies: Cookie string to use in the request.
    :param timeout: Number of seconds to wait before timing out.
    :param proxy: Proxy URL to use.
    :param css_selector: CSS selector to extract specific content.
    :param params: Query string parameters for the request.
    :param follow_redirects: Whether to follow redirects.
    :param verify: Whether to verify HTTPS certificates.
    :param impersonate: Browser to impersonate, or a comma-separated list of browsers.
    :param stealthy_headers: If enabled, creates and adds real browser headers.
    """
    request_options = {
        "timeout": timeout,
        "follow_redirects": follow_redirects,
        "verify": verify,
        "stealthy_headers": stealthy_headers,
        "impersonate": impersonate,
        "proxy": proxy,
        "data": data,
    }
    request = __BuildRequest(headers, cookies, params, json, **request_options)

    from scrapling.fetchers import Fetcher

    __Request_and_Save(Fetcher.post, url, output_file, css_selector, **request)
386
+
387
+
388
@extract.command(help=f"Perform a PUT request and save the content to a file.\n\n{__OUTPUT_FILE_HELP__}")
@argument("url", required=True)
@argument("output_file", required=True)
@option("--data", "-d", help="Form data to include in the request body")
@option("--json", "-j", help="JSON data to include in the request body (as string)")
@option("--headers", "-H", multiple=True, help='HTTP headers in format "Key: Value" (can be used multiple times)')
@option("--cookies", help='Cookies string in format "name1=value1; name2=value2"')
@option("--timeout", type=int, default=30, help="Request timeout in seconds (default: 30)")
@option("--proxy", help='Proxy URL in format "http://username:password@host:port"')
@option("--css-selector", "-s", help="CSS selector to extract specific content from the page. It returns all matches.")
@option("--params", "-p", multiple=True, help='Query parameters in format "key=value" (can be used multiple times)')
@option("--follow-redirects/--no-follow-redirects", default=True, help="Whether to follow redirects (default: True)")
@option("--verify/--no-verify", default=True, help="Whether to verify SSL certificates (default: True)")
@option(
    "--impersonate",
    help="Browser to impersonate. Can be a single browser (e.g., chrome) or comma-separated list for random selection (e.g., chrome,firefox,safari).",
)
@option("--stealthy-headers/--no-stealthy-headers", default=True, help="Use stealthy browser headers (default: True)")
def put(
    url,
    output_file,
    data,
    json,
    headers,
    cookies,
    timeout,
    proxy,
    css_selector,
    params,
    follow_redirects,
    verify,
    impersonate,
    stealthy_headers,
):
    """
    Send a PUT request with ``Fetcher.put`` and save the response content to a file.

    :param url: Target URL for the request.
    :param output_file: Output file path; the extension (`.html`/`.md`/`.txt`) selects the saved format.
    :param data: Form data to include in the request body.
    :param json: JSON text to include in the body of the request.
    :param headers: HTTP header strings to include in the request.
    :param cookies: Cookie string to use in the request.
    :param timeout: Number of seconds to wait before timing out.
    :param proxy: Proxy URL to use.
    :param css_selector: CSS selector to extract specific content.
    :param params: Query string parameters for the request.
    :param follow_redirects: Whether to follow redirects.
    :param verify: Whether to verify HTTPS certificates.
    :param impersonate: Browser to impersonate, or a comma-separated list of browsers.
    :param stealthy_headers: If enabled, creates and adds real browser headers.
    """
    request_options = {
        "timeout": timeout,
        "follow_redirects": follow_redirects,
        "verify": verify,
        "stealthy_headers": stealthy_headers,
        "impersonate": impersonate,
        "proxy": proxy,
        "data": data,
    }
    request = __BuildRequest(headers, cookies, params, json, **request_options)

    from scrapling.fetchers import Fetcher

    __Request_and_Save(Fetcher.put, url, output_file, css_selector, **request)
483
+
484
+
485
@extract.command(help=f"Perform a DELETE request and save the content to a file.\n\n{__OUTPUT_FILE_HELP__}")
@argument("url", required=True)
@argument("output_file", required=True)
@option("--headers", "-H", multiple=True, help='HTTP headers in format "Key: Value" (can be used multiple times)')
@option("--cookies", help='Cookies string in format "name1=value1; name2=value2"')
@option("--timeout", type=int, default=30, help="Request timeout in seconds (default: 30)")
@option("--proxy", help='Proxy URL in format "http://username:password@host:port"')
@option("--css-selector", "-s", help="CSS selector to extract specific content from the page. It returns all matches.")
@option("--params", "-p", multiple=True, help='Query parameters in format "key=value" (can be used multiple times)')
@option("--follow-redirects/--no-follow-redirects", default=True, help="Whether to follow redirects (default: True)")
@option("--verify/--no-verify", default=True, help="Whether to verify SSL certificates (default: True)")
@option(
    "--impersonate",
    help="Browser to impersonate. Can be a single browser (e.g., chrome) or comma-separated list for random selection (e.g., chrome,firefox,safari).",
)
@option("--stealthy-headers/--no-stealthy-headers", default=True, help="Use stealthy browser headers (default: True)")
def delete(
    url,
    output_file,
    headers,
    cookies,
    timeout,
    proxy,
    css_selector,
    params,
    follow_redirects,
    verify,
    impersonate,
    stealthy_headers,
):
    """
    Send a DELETE request with ``Fetcher.delete`` and save the response content to a file.

    :param url: Target URL for the request.
    :param output_file: Output file path; the extension (`.html`/`.md`/`.txt`) selects the saved format.
    :param headers: HTTP header strings to include in the request.
    :param cookies: Cookie string to use in the request.
    :param timeout: Number of seconds to wait before timing out.
    :param proxy: Proxy URL to use.
    :param css_selector: CSS selector to extract specific content.
    :param params: Query string parameters for the request.
    :param follow_redirects: Whether to follow redirects.
    :param verify: Whether to verify HTTPS certificates.
    :param impersonate: Browser to impersonate, or a comma-separated list of browsers.
    :param stealthy_headers: If enabled, creates and adds real browser headers.
    """
    request_options = {
        "timeout": timeout,
        "follow_redirects": follow_redirects,
        "verify": verify,
        "stealthy_headers": stealthy_headers,
        "impersonate": impersonate,
        "proxy": proxy,
    }
    # No JSON body is passed for DELETE, hence the explicit None
    request = __BuildRequest(headers, cookies, params, None, **request_options)

    from scrapling.fetchers import Fetcher

    __Request_and_Save(Fetcher.delete, url, output_file, css_selector, **request)
573
+
574
+
575
@extract.command(help=f"Use DynamicFetcher to fetch content with browser automation.\n\n{__OUTPUT_FILE_HELP__}")
@argument("url", required=True)
@argument("output_file", required=True)
@option("--headless/--no-headless", default=True, help="Run browser in headless mode (default: True)")
@option(
    "--disable-resources/--enable-resources",
    default=False,
    help="Drop unnecessary resources for speed boost (default: False)",
)
@option("--network-idle/--no-network-idle", default=False, help="Wait for network idle (default: False)")
@option("--timeout", type=int, default=30000, help="Timeout in milliseconds (default: 30000)")
@option("--wait", type=int, default=0, help="Additional wait time in milliseconds after page load (default: 0)")
@option("--css-selector", "-s", help="CSS selector to extract specific content from the page. It returns all matches.")
@option("--wait-selector", help="CSS selector to wait for before proceeding")
@option("--locale", default=None, help="Specify user locale. Defaults to the system default locale.")
@option(
    "--real-chrome/--no-real-chrome",
    default=False,
    help="If you have a Chrome browser installed on your device, enable this, and the Fetcher will launch an instance of your browser and use it. (default: False)",
)
@option("--proxy", help='Proxy URL in format "http://username:password@host:port"')
@option("--extra-headers", "-H", multiple=True, help='Extra headers in format "Key: Value" (can be used multiple times)')
def fetch(
    url,
    output_file,
    headless,
    disable_resources,
    network_idle,
    timeout,
    wait,
    css_selector,
    wait_selector,
    locale,
    real_chrome,
    proxy,
    extra_headers,
):
    """
    Fetch a page with ``DynamicFetcher.fetch`` and save its content to a file.

    :param url: Target url.
    :param output_file: Output file path; the extension (`.html`/`.md`/`.txt`) selects the saved format.
    :param headless: Run the browser in headless/hidden or headful/visible mode.
    :param disable_resources: Drop requests of unnecessary resources for a speed boost.
    :param network_idle: Wait for the page until there are no network connections for at least 500 ms.
    :param timeout: The timeout in milliseconds that is used in all operations and waits through the page.
    :param wait: The time (milliseconds) the fetcher will wait after everything finishes before returning.
        Only forwarded when greater than zero.
    :param css_selector: CSS selector to extract specific content.
    :param wait_selector: Wait for a specific CSS selector to be in a specific state. Only forwarded when set.
    :param locale: Set the locale for the browser.
    :param real_chrome: If you have a Chrome browser installed on your device, enable this, and the Fetcher
        will launch an instance of your browser and use it.
    :param proxy: The proxy to be used with requests. Only forwarded when set.
    :param extra_headers: Extra header strings to add to the request. Only forwarded when any were parsed.
    """
    headers_map, _ = _ParseHeaders(extra_headers, False)

    # Options that are always forwarded to the fetcher
    kwargs = {
        "headless": headless,
        "disable_resources": disable_resources,
        "network_idle": network_idle,
        "timeout": timeout,
        "locale": locale,
        "real_chrome": real_chrome,
    }

    # Options forwarded only when they carry a meaningful value
    optional = {
        "wait": wait if wait > 0 else None,
        "wait_selector": wait_selector,
        "proxy": proxy,
        "extra_headers": headers_map,
    }
    kwargs.update({key: value for key, value in optional.items() if value})

    from scrapling.fetchers import DynamicFetcher

    __Request_and_Save(DynamicFetcher.fetch, url, output_file, css_selector, **kwargs)
682
+
683
+
684
@extract.command(help=f"Use StealthyFetcher to fetch content with advanced stealth features.\n\n{__OUTPUT_FILE_HELP__}")
@argument("url", required=True)
@argument("output_file", required=True)
@option("--headless/--no-headless", default=True, help="Run browser in headless mode (default: True)")
@option(
    "--disable-resources/--enable-resources",
    default=False,
    help="Drop unnecessary resources for speed boost (default: False)",
)
@option("--block-webrtc/--allow-webrtc", default=False, help="Block WebRTC entirely (default: False)")
@option(
    "--solve-cloudflare/--no-solve-cloudflare",
    default=False,
    help="Solve Cloudflare challenges (default: False)",
)
@option("--allow-webgl/--block-webgl", default=True, help="Allow WebGL (default: True)")
@option("--network-idle/--no-network-idle", default=False, help="Wait for network idle (default: False)")
@option(
    "--real-chrome/--no-real-chrome",
    default=False,
    help="If you have a Chrome browser installed on your device, enable this, and the Fetcher will launch an instance of your browser and use it. (default: False)",
)
@option("--hide-canvas/--show-canvas", default=False, help="Add noise to canvas operations (default: False)")
@option("--timeout", type=int, default=30000, help="Timeout in milliseconds (default: 30000)")
@option("--wait", type=int, default=0, help="Additional wait time in milliseconds after page load (default: 0)")
@option("--css-selector", "-s", help="CSS selector to extract specific content from the page. It returns all matches.")
@option("--wait-selector", help="CSS selector to wait for before proceeding")
@option("--proxy", help='Proxy URL in format "http://username:password@host:port"')
@option("--extra-headers", "-H", multiple=True, help='Extra headers in format "Key: Value" (can be used multiple times)')
def stealthy_fetch(
    url,
    output_file,
    headless,
    disable_resources,
    block_webrtc,
    solve_cloudflare,
    allow_webgl,
    network_idle,
    real_chrome,
    hide_canvas,
    timeout,
    wait,
    css_selector,
    wait_selector,
    proxy,
    extra_headers,
):
    """
    Fetch a page with ``StealthyFetcher.fetch`` and save its content to a file.

    :param url: Target url.
    :param output_file: Output file path; the extension (`.html`/`.md`/`.txt`) selects the saved format.
    :param headless: Run the browser in headless/hidden, or headful/visible mode.
    :param disable_resources: Drop requests of unnecessary resources for a speed boost.
    :param block_webrtc: Blocks WebRTC entirely.
    :param solve_cloudflare: Solves all types of the Cloudflare's Turnstile/Interstitial challenges.
    :param allow_webgl: Allow WebGL (recommended to keep enabled).
    :param network_idle: Wait for the page until there are no network connections for at least 500 ms.
    :param real_chrome: If you have a Chrome browser installed on your device, enable this, and the Fetcher
        will launch an instance of your browser and use it.
    :param hide_canvas: Add random noise to canvas operations to prevent fingerprinting.
    :param timeout: The timeout in milliseconds that is used in all operations and waits through the page.
    :param wait: The time (milliseconds) the fetcher will wait after everything finishes before returning.
        Only forwarded when greater than zero.
    :param css_selector: CSS selector to extract specific content.
    :param wait_selector: Wait for a specific CSS selector to be in a specific state. Only forwarded when set.
    :param proxy: The proxy to be used with requests. Only forwarded when set.
    :param extra_headers: Extra header strings to add to the request. Only forwarded when any were parsed.
    """
    headers_map, _ = _ParseHeaders(extra_headers, False)

    # Options that are always forwarded to the fetcher
    kwargs = {
        "headless": headless,
        "disable_resources": disable_resources,
        "block_webrtc": block_webrtc,
        "solve_cloudflare": solve_cloudflare,
        "allow_webgl": allow_webgl,
        "network_idle": network_idle,
        "real_chrome": real_chrome,
        "hide_canvas": hide_canvas,
        "timeout": timeout,
    }

    # Options forwarded only when they carry a meaningful value
    optional = {
        "wait": wait if wait > 0 else None,
        "wait_selector": wait_selector,
        "proxy": proxy,
        "extra_headers": headers_map,
    }
    kwargs.update({key: value for key, value in optional.items() if value})

    from scrapling.fetchers import StealthyFetcher

    __Request_and_Save(StealthyFetcher.fetch, url, output_file, css_selector, **kwargs)
815
+
816
+
817
# Root command group. It is documented with comments rather than a docstring
# because click would otherwise use the docstring as the group's help text.
@group()
def main():
    pass


# Register every sub-command on the root group, in this order
for _subcommand in (install, shell, extract, mcp):
    main.add_command(_subcommand)
del _subcommand
core/__init__.py ADDED
File without changes
core/_shell_signatures.py ADDED
@@ -0,0 +1,100 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from scrapling.core._types import (
2
+ Any,
3
+ Dict,
4
+ List,
5
+ Tuple,
6
+ Sequence,
7
+ Callable,
8
+ Optional,
9
+ SetCookieParam,
10
+ SelectorWaitStates,
11
+ )
12
+
13
# Parameter definitions for shell function signatures (defined once at module level)
# Mirrors TypedDict definitions from _types.py but runtime-accessible for IPython introspection
#
# Maps parameter name -> type annotation for the HTTP request helpers. `Signatures_map`
# uses this table as-is for `get` and `delete`, and extends it for `post` and `put`.
_REQUESTS_PARAMS = {
    "params": Optional[Dict | List | Tuple],
    "cookies": Any,
    "auth": Optional[Tuple[str, str]],
    "impersonate": Any,
    "http3": Optional[bool],
    "stealthy_headers": Optional[bool],
    "proxies": Any,
    "proxy": Optional[str],
    "proxy_auth": Optional[Tuple[str, str]],
    "timeout": Optional[int | float],
    "headers": Any,
    "retries": Optional[int],
    "retry_delay": Optional[int],
    "follow_redirects": Optional[bool],
    "max_redirects": Optional[int],
    "verify": Optional[bool],
    "cert": Optional[str | Tuple[str, str]],
    "selector_config": Optional[Dict],
}
35
+
36
# Maps parameter name -> type annotation for the `fetch` shell function
# (see the "fetch" entry of `Signatures_map`).
_FETCH_PARAMS = {
    "headless": bool,
    "disable_resources": bool,
    "network_idle": bool,
    "load_dom": bool,
    "wait_selector": Optional[str],
    "wait_selector_state": SelectorWaitStates,
    "cookies": Sequence[SetCookieParam],
    "google_search": bool,
    "wait": int | float,
    "timezone_id": str | None,
    "page_action": Optional[Callable],
    "proxy": Optional[str | Dict[str, str] | Tuple],
    "extra_headers": Optional[Dict[str, str]],
    "timeout": int | float,
    "init_script": Optional[str],
    "user_data_dir": str,
    "selector_config": Optional[Dict],
    "additional_args": Optional[Dict],
    "locale": Optional[str],
    "real_chrome": bool,
    "cdp_url": Optional[str],
    "useragent": Optional[str],
    "extra_flags": Optional[List[str]],
}
61
+
62
# Maps parameter name -> type annotation for the `stealthy_fetch` shell function.
# It accepts every `fetch` parameter plus four stealth-specific switches, so the table
# is derived from `_FETCH_PARAMS` instead of repeating its entries; this keeps the two
# tables from drifting apart. Key order is unchanged: shared parameters first, then the
# stealth-only ones.
_STEALTHY_FETCH_PARAMS = {
    **_FETCH_PARAMS,
    "allow_webgl": bool,
    "hide_canvas": bool,
    "block_webrtc": bool,
    "solve_cloudflare": bool,
}
91
+
92
# Lookup table: shell function name -> its {parameter name: annotation} table.
# `get` and `delete` share the base request table directly, while `post` and `put`
# each get their own copy extended with the two body parameters.
Signatures_map = {
    "get": _REQUESTS_PARAMS,
    "post": dict(_REQUESTS_PARAMS, data=Optional[Dict | str], json=Optional[Dict | List]),
    "put": dict(_REQUESTS_PARAMS, data=Optional[Dict | str], json=Optional[Dict | List]),
    "delete": _REQUESTS_PARAMS,
    "fetch": _FETCH_PARAMS,
    "stealthy_fetch": _STEALTHY_FETCH_PARAMS,
}
core/_types.py ADDED
@@ -0,0 +1,57 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ Type definitions for type checking purposes.
3
+ """
4
+
5
+ from typing import (
6
+ TYPE_CHECKING,
7
+ TypedDict,
8
+ TypeAlias,
9
+ cast,
10
+ overload,
11
+ Any,
12
+ Callable,
13
+ Dict,
14
+ Generator,
15
+ AsyncGenerator,
16
+ Generic,
17
+ Iterable,
18
+ List,
19
+ Set,
20
+ Literal,
21
+ Optional,
22
+ Iterator,
23
+ Pattern,
24
+ Sequence,
25
+ Tuple,
26
+ TypeVar,
27
+ Union,
28
+ Match,
29
+ Mapping,
30
+ Awaitable,
31
+ Protocol,
32
+ Coroutine,
33
+ SupportsIndex,
34
+ )
35
+ from typing_extensions import Self, Unpack
36
+
37
# Proxy can be a string URL or a dict (Playwright format: {"server": "...", "username": "...", "password": "..."})
ProxyType = Union[str, Dict[str, str]]
# The aliases below are closed sets of accepted string values, written as `Literal`
# so that type checkers can validate call sites.
SUPPORTED_HTTP_METHODS = Literal["GET", "POST", "PUT", "DELETE"]
SelectorWaitStates = Literal["attached", "detached", "hidden", "visible"]
PageLoadStates = Literal["commit", "domcontentloaded", "load", "networkidle"]
extraction_types = Literal["text", "html", "markdown"]
# Either text or raw bytes.
StrOrBytes = Union[str, bytes]
44
+
45
+
46
# Copied from `playwright._impl._api_structures.SetCookieParam`
class SetCookieParam(TypedDict, total=False):
    """Dictionary shape describing a single cookie.

    The class is declared with ``total=False``, so every key may be omitted.
    """

    name: str
    value: str
    url: Optional[str]
    domain: Optional[str]
    path: Optional[str]
    expires: Optional[float]
    httpOnly: Optional[bool]
    secure: Optional[bool]
    sameSite: Optional[Literal["Lax", "None", "Strict"]]
    partitionKey: Optional[str]
core/ai.py ADDED
@@ -0,0 +1,653 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from asyncio import gather
2
+
3
+ from mcp.server.fastmcp import FastMCP
4
+ from pydantic import BaseModel, Field
5
+ from starlette.requests import Request
6
+ from starlette.responses import Response, JSONResponse
7
+
8
+ from scrapling.core.shell import Convertor
9
+ from scrapling.engines.toolbelt.custom import Response as _ScraplingResponse
10
+ from scrapling.engines.static import ImpersonateType
11
+ from scrapling.fetchers import (
12
+ Fetcher,
13
+ FetcherSession,
14
+ DynamicFetcher,
15
+ AsyncDynamicSession,
16
+ StealthyFetcher,
17
+ AsyncStealthySession,
18
+ )
19
+ from scrapling.core._types import (
20
+ Optional,
21
+ Tuple,
22
+ Mapping,
23
+ Dict,
24
+ List,
25
+ Any,
26
+ Generator,
27
+ Sequence,
28
+ SetCookieParam,
29
+ extraction_types,
30
+ SelectorWaitStates,
31
+ )
32
+
33
+
34
class ResponseModel(BaseModel):
    """Request's response information structure."""

    # Instances are built by `_content_translator` from a fetched page: `status` and
    # `url` are read from the page object, and `content` collects every chunk yielded
    # by the content generator (hence a list instead of a single string).
    status: int = Field(description="The status code returned by the website.")
    content: list[str] = Field(description="The content as Markdown/HTML or the text content of the page.")
    url: str = Field(description="The URL given by the user that resulted in this response.")
40
+
41
+
42
def _content_translator(content: Generator[str, None, None], page: _ScraplingResponse) -> ResponseModel:
    """Pack the extracted content and the page metadata into one ``ResponseModel``.

    :param content: Generator yielding the extracted pieces of content; it is fully consumed here.
    :param page: The fetched page the content came from; supplies the status code and the URL.
    :return: A single ``ResponseModel`` holding the status, all content pieces in order, and the URL.
    """
    collected = list(content)
    return ResponseModel(status=page.status, content=collected, url=page.url)
45
+
46
+
47
+ def _normalize_credentials(credentials: Optional[Dict[str, str]]) -> Optional[Tuple[str, str]]:
48
+ """Convert a credentials dictionary to a tuple accepted by fetchers."""
49
+ if not credentials:
50
+ return None
51
+
52
+ username = credentials.get("username")
53
+ password = credentials.get("password")
54
+
55
+ if username is None or password is None:
56
+ raise ValueError("Credentials dictionary must contain both 'username' and 'password' keys")
57
+
58
+ return username, password
59
+
60
+
61
+ class ScraplingMCPServer:
62
@staticmethod
def get(
    url: str,
    impersonate: ImpersonateType = "chrome",
    extraction_type: extraction_types = "markdown",
    css_selector: Optional[str] = None,
    main_content_only: bool = True,
    params: Optional[Dict] = None,
    headers: Optional[Mapping[str, Optional[str]]] = None,
    cookies: Optional[Dict[str, str]] = None,
    timeout: Optional[int | float] = 30,
    follow_redirects: bool = True,
    max_redirects: int = 30,
    retries: Optional[int] = 3,
    retry_delay: Optional[int] = 1,
    proxy: Optional[str] = None,
    proxy_auth: Optional[Dict[str, str]] = None,
    auth: Optional[Dict[str, str]] = None,
    verify: Optional[bool] = True,
    http3: Optional[bool] = False,
    stealthy_headers: Optional[bool] = True,
) -> ResponseModel:
    """Make a GET HTTP request to a URL and return a structured output of the result.
    Note: This is only suitable for low-mid protection levels. For high-protection levels or websites that require JS loading, use the other tools directly.
    Note: If the `css_selector` resolves to more than one element, all the elements will be returned.

    :param url: The URL to request.
    :param impersonate: Browser version to impersonate its fingerprint. It's using the latest chrome version by default.
    :param extraction_type: The type of content to extract from the page. Defaults to "markdown". Options are:
        - Markdown will convert the page content to Markdown format.
        - HTML will return the raw HTML content of the page.
        - Text will return the text content of the page.
    :param css_selector: CSS selector to extract the content from the page. If main_content_only is True, then it will be executed on the main content of the page. Defaults to None.
    :param main_content_only: Whether to extract only the main content of the page. Defaults to True. The main content here is the data inside the `<body>` tag.
    :param params: Query string parameters for the request.
    :param headers: Headers to include in the request.
    :param cookies: Cookies to use in the request.
    :param timeout: Number of seconds to wait before timing out.
    :param follow_redirects: Whether to follow redirects. Defaults to True.
    :param max_redirects: Maximum number of redirects. Default 30, use -1 for unlimited.
    :param retries: Number of retry attempts. Defaults to 3.
    :param retry_delay: Number of seconds to wait between retry attempts. Defaults to 1 second.
    :param proxy: Proxy URL to use. Format: "http://username:password@localhost:8030".
    :param proxy_auth: HTTP basic auth for proxy in dictionary format with `username` and `password` keys.
    :param auth: HTTP basic auth in dictionary format with `username` and `password` keys.
    :param verify: Whether to verify HTTPS certificates.
    :param http3: Whether to use HTTP3. Defaults to False. It might be problematic if used together with `impersonate`.
    :param stealthy_headers: If enabled (default), it creates and adds real browser headers. It also sets the referer header as if this request came from a Google search of URL's domain.
    :raises ValueError: If `proxy_auth` or `auth` is given without both the `username` and `password` keys.
    """
    # The tool receives credentials as dictionaries; the fetcher is handed (username, password) tuples.
    proxy_credentials = _normalize_credentials(proxy_auth)
    basic_credentials = _normalize_credentials(auth)

    request_options = {
        "params": params,
        "headers": headers,
        "cookies": cookies,
        "timeout": timeout,
        "retries": retries,
        "retry_delay": retry_delay,
        "follow_redirects": follow_redirects,
        "max_redirects": max_redirects,
        "proxy": proxy,
        "proxy_auth": proxy_credentials,
        "auth": basic_credentials,
        "verify": verify,
        "http3": http3,
        "impersonate": impersonate,
        "stealthy_headers": stealthy_headers,
    }
    page = Fetcher.get(url, **request_options)

    extracted = Convertor._extract_content(
        page,
        css_selector=css_selector,
        extraction_type=extraction_type,
        main_content_only=main_content_only,
    )
    return _content_translator(extracted, page)
142
+
143
@staticmethod
async def bulk_get(
    urls: List[str],
    impersonate: ImpersonateType = "chrome",
    extraction_type: extraction_types = "markdown",
    css_selector: Optional[str] = None,
    main_content_only: bool = True,
    params: Optional[Dict] = None,
    headers: Optional[Mapping[str, Optional[str]]] = None,
    cookies: Optional[Dict[str, str]] = None,
    timeout: Optional[int | float] = 30,
    follow_redirects: bool = True,
    max_redirects: int = 30,
    retries: Optional[int] = 3,
    retry_delay: Optional[int] = 1,
    proxy: Optional[str] = None,
    proxy_auth: Optional[Dict[str, str]] = None,
    auth: Optional[Dict[str, str]] = None,
    verify: Optional[bool] = True,
    http3: Optional[bool] = False,
    stealthy_headers: Optional[bool] = True,
) -> List[ResponseModel]:
    """Make GET HTTP requests to a group of URLs and, for each URL, return a structured output of the result.
    Note: This is only suitable for low-mid protection levels. For high-protection levels or websites that require JS loading, use the other tools directly.
    Note: If the `css_selector` resolves to more than one element, all the elements will be returned.

    :param urls: A list of the URLs to request.
    :param impersonate: Browser version to impersonate its fingerprint. It's using the latest chrome version by default.
    :param extraction_type: The type of content to extract from the page. Defaults to "markdown". Options are:
        - Markdown will convert the page content to Markdown format.
        - HTML will return the raw HTML content of the page.
        - Text will return the text content of the page.
    :param css_selector: CSS selector to extract the content from the page. If main_content_only is True, then it will be executed on the main content of the page. Defaults to None.
    :param main_content_only: Whether to extract only the main content of the page. Defaults to True. The main content here is the data inside the `<body>` tag.
    :param params: Query string parameters for the request.
    :param headers: Headers to include in the request.
    :param cookies: Cookies to use in the request.
    :param timeout: Number of seconds to wait before timing out.
    :param follow_redirects: Whether to follow redirects. Defaults to True.
    :param max_redirects: Maximum number of redirects. Default 30, use -1 for unlimited.
    :param retries: Number of retry attempts. Defaults to 3.
    :param retry_delay: Number of seconds to wait between retry attempts. Defaults to 1 second.
    :param proxy: Proxy URL to use. Format: "http://username:password@localhost:8030".
    :param proxy_auth: HTTP basic auth for proxy in dictionary format with `username` and `password` keys.
    :param auth: HTTP basic auth in dictionary format with `username` and `password` keys.
    :param verify: Whether to verify HTTPS certificates.
    :param http3: Whether to use HTTP3. Defaults to False. It might be problematic if used together with `impersonate`.
    :param stealthy_headers: If enabled (default), it creates and adds real browser headers. It also sets the referer header as if this request came from a Google search of URL's domain.
    :raises ValueError: If `proxy_auth` or `auth` is given without both the `username` and `password` keys.
    """
    # The tool receives credentials as dictionaries; the session is handed (username, password) tuples.
    proxy_credentials = _normalize_credentials(proxy_auth)
    basic_credentials = _normalize_credentials(auth)

    # Every URL is requested with the same options, so they are assembled once.
    shared_options = {
        "params": params,
        "headers": headers,
        "cookies": cookies,
        "timeout": timeout,
        "retries": retries,
        "retry_delay": retry_delay,
        "follow_redirects": follow_redirects,
        "max_redirects": max_redirects,
        "proxy": proxy,
        "proxy_auth": proxy_credentials,
        "auth": basic_credentials,
        "verify": verify,
        "http3": http3,
        "impersonate": impersonate,
        "stealthy_headers": stealthy_headers,
    }

    async with FetcherSession() as session:
        # One request per URL, awaited together; results come back in the order of `urls`.
        pages = await gather(*(session.get(target, **shared_options) for target in urls))
        return [
            _content_translator(
                Convertor._extract_content(
                    page,
                    css_selector=css_selector,
                    extraction_type=extraction_type,
                    main_content_only=main_content_only,
                ),
                page,
            )
            for page in pages
        ]
231
+
232
@staticmethod
async def fetch(
    url: str,
    extraction_type: extraction_types = "markdown",
    css_selector: Optional[str] = None,
    main_content_only: bool = True,
    headless: bool = True,  # noqa: F821
    google_search: bool = True,
    real_chrome: bool = False,
    wait: int | float = 0,
    proxy: Optional[str | Dict[str, str]] = None,
    timezone_id: str | None = None,
    locale: str | None = None,
    extra_headers: Optional[Dict[str, str]] = None,
    useragent: Optional[str] = None,
    cdp_url: Optional[str] = None,
    timeout: int | float = 30000,
    disable_resources: bool = False,
    wait_selector: Optional[str] = None,
    cookies: Sequence[SetCookieParam] | None = None,
    network_idle: bool = False,
    wait_selector_state: SelectorWaitStates = "attached",
) -> ResponseModel:
    """Use playwright to open a browser to fetch a URL and return a structured output of the result.
    Note: This is only suitable for low-mid protection levels.
    Note: If the `css_selector` resolves to more than one element, all the elements will be returned.

    :param url: The URL to request.
    :param extraction_type: The type of content to extract from the page. Defaults to "markdown". Options are:
        - Markdown will convert the page content to Markdown format.
        - HTML will return the raw HTML content of the page.
        - Text will return the text content of the page.
    :param css_selector: CSS selector to extract the content from the page. If main_content_only is True, then it will be executed on the main content of the page. Defaults to None.
    :param main_content_only: Whether to extract only the main content of the page. Defaults to True. The main content here is the data inside the `<body>` tag.
    :param headless: Run the browser in headless/hidden (default), or headful/visible mode.
    :param google_search: Enabled by default, Scrapling will set the referer header to be as if this request came from a Google search of this website's domain name.
    :param real_chrome: If you have a Chrome browser installed on your device, enable this, and the Fetcher will launch an instance of your browser and use it.
    :param wait: The time (milliseconds) the fetcher will wait after everything finishes before closing the page and returning the `Response` object.
    :param proxy: The proxy to be used with requests, it can be a string or a dictionary with the keys 'server', 'username', and 'password' only.
    :param timezone_id: Changes the timezone of the browser. Defaults to the system timezone.
    :param locale: Specify user locale, for example, `en-GB`, `de-DE`, etc. Locale will affect navigator.language value, Accept-Language request header value as well as number and date formatting
        rules. Defaults to the system default locale.
    :param extra_headers: A dictionary of extra headers to add to the request. _The referer set by the `google_search` argument takes priority over the referer set here if used together._
    :param useragent: Pass a useragent string to be used. Otherwise the fetcher will generate a real Useragent of the same browser and use it.
    :param cdp_url: Instead of launching a new browser instance, connect to this CDP URL to control real browsers through CDP.
    :param timeout: The timeout in milliseconds that is used in all operations and waits through the page. The default is 30,000
    :param disable_resources: Drop requests for unnecessary resources for a speed boost.
        Requests dropped are of type `font`, `image`, `media`, `beacon`, `object`, `imageset`, `texttrack`, `websocket`, `csp_report`, and `stylesheet`.
    :param wait_selector: Wait for a specific CSS selector to be in a specific state.
    :param cookies: Set cookies for the next request. It should be in a dictionary format that Playwright accepts.
    :param network_idle: Wait for the page until there are no network connections for at least 500 ms.
    :param wait_selector_state: The state to wait for the selector given with `wait_selector`. The default state is `attached`.
    """
    browser_options = {
        "headless": headless,
        "google_search": google_search,
        "real_chrome": real_chrome,
        "wait": wait,
        "proxy": proxy,
        "timezone_id": timezone_id,
        "locale": locale,
        "extra_headers": extra_headers,
        "useragent": useragent,
        "cdp_url": cdp_url,
        "timeout": timeout,
        "disable_resources": disable_resources,
        "wait_selector": wait_selector,
        "cookies": cookies,
        "network_idle": network_idle,
        "wait_selector_state": wait_selector_state,
    }
    page = await DynamicFetcher.async_fetch(url, **browser_options)

    extracted = Convertor._extract_content(
        page,
        css_selector=css_selector,
        extraction_type=extraction_type,
        main_content_only=main_content_only,
    )
    return _content_translator(extracted, page)
313
+
314
+ @staticmethod
315
+ async def bulk_fetch(
316
+ urls: List[str],
317
+ extraction_type: extraction_types = "markdown",
318
+ css_selector: Optional[str] = None,
319
+ main_content_only: bool = True,
320
+ headless: bool = True, # noqa: F821
321
+ google_search: bool = True,
322
+ real_chrome: bool = False,
323
+ wait: int | float = 0,
324
+ proxy: Optional[str | Dict[str, str]] = None,
325
+ timezone_id: str | None = None,
326
+ locale: str | None = None,
327
+ extra_headers: Optional[Dict[str, str]] = None,
328
+ useragent: Optional[str] = None,
329
+ cdp_url: Optional[str] = None,
330
+ timeout: int | float = 30000,
331
+ disable_resources: bool = False,
332
+ wait_selector: Optional[str] = None,
333
+ cookies: Sequence[SetCookieParam] | None = None,
334
+ network_idle: bool = False,
335
+ wait_selector_state: SelectorWaitStates = "attached",
336
+ ) -> List[ResponseModel]:
337
+ """Use playwright to open a browser, then fetch a group of URLs at the same time, and for each page return a structured output of the result.
338
+ Note: This is only suitable for low-mid protection levels.
339
+ Note: If the `css_selector` resolves to more than one element, all the elements will be returned.
340
+
341
+ :param urls: A list of the URLs to request.
342
+ :param extraction_type: The type of content to extract from the page. Defaults to "markdown". Options are:
343
+ - Markdown will convert the page content to Markdown format.
344
+ - HTML will return the raw HTML content of the page.
345
+ - Text will return the text content of the page.
346
+ :param css_selector: CSS selector to extract the content from the page. If main_content_only is True, then it will be executed on the main content of the page. Defaults to None.
347
+ :param main_content_only: Whether to extract only the main content of the page. Defaults to True. The main content here is the data inside the `<body>` tag.
348
+ :param headless: Run the browser in headless/hidden (default), or headful/visible mode.
349
+ :param disable_resources: Drop requests for unnecessary resources for a speed boost.
350
+ Requests dropped are of type `font`, `image`, `media`, `beacon`, `object`, `imageset`, `texttrack`, `websocket`, `csp_report`, and `stylesheet`.
351
+ :param useragent: Pass a useragent string to be used. Otherwise the fetcher will generate a real Useragent of the same browser and use it.
352
+ :param cookies: Set cookies for the next request. It should be in a dictionary format that Playwright accepts.
353
+ :param network_idle: Wait for the page until there are no network connections for at least 500 ms.
354
+ :param timeout: The timeout in milliseconds that is used in all operations and waits through the page. The default is 30,000
355
+ :param wait: The time (milliseconds) the fetcher will wait after everything finishes before closing the page and returning the ` Response ` object.
356
+ :param wait_selector: Wait for a specific CSS selector to be in a specific state.
357
+ :param timezone_id: Changes the timezone of the browser. Defaults to the system timezone.
358
+ :param locale: Specify user locale, for example, `en-GB`, `de-DE`, etc. Locale will affect navigator.language value, Accept-Language request header value as well as number and date formatting
359
+ rules. Defaults to the system default locale.
360
+ :param wait_selector_state: The state to wait for the selector given with `wait_selector`. The default state is `attached`.
361
+ :param real_chrome: If you have a Chrome browser installed on your device, enable this, and the Fetcher will launch an instance of your browser and use it.
362
+ :param cdp_url: Instead of launching a new browser instance, connect to this CDP URL to control real browsers through CDP.
363
+ :param google_search: Enabled by default, Scrapling will set the referer header to be as if this request came from a Google search of this website's domain name.
364
+ :param extra_headers: A dictionary of extra headers to add to the request. _The referer set by the `google_search` argument takes priority over the referer set here if used together._
365
+ :param proxy: The proxy to be used with requests, it can be a string or a dictionary with the keys 'server', 'username', and 'password' only.
366
+ """
367
+ async with AsyncDynamicSession(
368
+ wait=wait,
369
+ proxy=proxy,
370
+ locale=locale,
371
+ timeout=timeout,
372
+ cookies=cookies,
373
+ cdp_url=cdp_url,
374
+ headless=headless,
375
+ max_pages=len(urls),
376
+ useragent=useragent,
377
+ timezone_id=timezone_id,
378
+ real_chrome=real_chrome,
379
+ network_idle=network_idle,
380
+ wait_selector=wait_selector,
381
+ google_search=google_search,
382
+ extra_headers=extra_headers,
383
+ disable_resources=disable_resources,
384
+ wait_selector_state=wait_selector_state,
385
+ ) as session:
386
+ tasks = [session.fetch(url) for url in urls]
387
+ responses = await gather(*tasks)
388
+ return [
389
+ _content_translator(
390
+ Convertor._extract_content(
391
+ page,
392
+ css_selector=css_selector,
393
+ extraction_type=extraction_type,
394
+ main_content_only=main_content_only,
395
+ ),
396
+ page,
397
+ )
398
+ for page in responses
399
+ ]
400
+
401
@staticmethod
async def stealthy_fetch(
    url: str,
    extraction_type: extraction_types = "markdown",
    css_selector: Optional[str] = None,
    main_content_only: bool = True,
    headless: bool = True,  # noqa: F821
    google_search: bool = True,
    real_chrome: bool = False,
    wait: int | float = 0,
    proxy: Optional[str | Dict[str, str]] = None,
    timezone_id: str | None = None,
    locale: str | None = None,
    extra_headers: Optional[Dict[str, str]] = None,
    useragent: Optional[str] = None,
    hide_canvas: bool = False,
    cdp_url: Optional[str] = None,
    timeout: int | float = 30000,
    disable_resources: bool = False,
    wait_selector: Optional[str] = None,
    cookies: Sequence[SetCookieParam] | None = None,
    network_idle: bool = False,
    wait_selector_state: SelectorWaitStates = "attached",
    block_webrtc: bool = False,
    allow_webgl: bool = True,
    solve_cloudflare: bool = False,
    additional_args: Optional[Dict] = None,
) -> ResponseModel:
    """Use the stealthy fetcher to fetch a URL and return a structured output of the result.
    Note: This is the only suitable fetcher for high protection levels.
    Note: If the `css_selector` resolves to more than one element, all the elements will be returned.

    :param url: The URL to request.
    :param extraction_type: The type of content to extract from the page. Defaults to "markdown". Options are:
        - Markdown will convert the page content to Markdown format.
        - HTML will return the raw HTML content of the page.
        - Text will return the text content of the page.
    :param css_selector: CSS selector to extract the content from the page. If main_content_only is True, then it will be executed on the main content of the page. Defaults to None.
    :param main_content_only: Whether to extract only the main content of the page. Defaults to True. The main content here is the data inside the `<body>` tag.
    :param headless: Run the browser in headless/hidden (default), or headful/visible mode.
    :param google_search: Enabled by default, Scrapling will set the referer header to be as if this request came from a Google search of this website's domain name.
    :param real_chrome: If you have a Chrome browser installed on your device, enable this, and the Fetcher will launch an instance of your browser and use it.
    :param wait: The time (milliseconds) the fetcher will wait after everything finishes before closing the page and returning the `Response` object.
    :param proxy: The proxy to be used with requests, it can be a string or a dictionary with the keys 'server', 'username', and 'password' only.
    :param timezone_id: Changes the timezone of the browser. Defaults to the system timezone.
    :param locale: Specify user locale, for example, `en-GB`, `de-DE`, etc. Locale will affect navigator.language value, Accept-Language request header value as well as number and date formatting
        rules. Defaults to the system default locale.
    :param extra_headers: A dictionary of extra headers to add to the request. _The referer set by the `google_search` argument takes priority over the referer set here if used together._
    :param useragent: Pass a useragent string to be used. Otherwise the fetcher will generate a real Useragent of the same browser and use it.
    :param hide_canvas: Add random noise to canvas operations to prevent fingerprinting.
    :param cdp_url: Instead of launching a new browser instance, connect to this CDP URL to control real browsers through CDP.
    :param timeout: The timeout in milliseconds that is used in all operations and waits through the page. The default is 30,000
    :param disable_resources: Drop requests for unnecessary resources for a speed boost.
        Requests dropped are of type `font`, `image`, `media`, `beacon`, `object`, `imageset`, `texttrack`, `websocket`, `csp_report`, and `stylesheet`.
    :param wait_selector: Wait for a specific CSS selector to be in a specific state.
    :param cookies: Set cookies for the next request.
    :param network_idle: Wait for the page until there are no network connections for at least 500 ms.
    :param wait_selector_state: The state to wait for the selector given with `wait_selector`. The default state is `attached`.
    :param block_webrtc: Forces WebRTC to respect proxy settings to prevent local IP address leak.
    :param allow_webgl: Enabled by default. Disabling WebGL is not recommended as many WAFs now check if WebGL is enabled.
    :param solve_cloudflare: Solves all types of the Cloudflare's Turnstile/Interstitial challenges before returning the response to you.
    :param additional_args: Additional arguments to be passed to Playwright's context as additional settings, and it takes higher priority than Scrapling's settings.
    """
    browser_options = {
        "headless": headless,
        "google_search": google_search,
        "real_chrome": real_chrome,
        "wait": wait,
        "proxy": proxy,
        "timezone_id": timezone_id,
        "locale": locale,
        "extra_headers": extra_headers,
        "useragent": useragent,
        "hide_canvas": hide_canvas,
        "cdp_url": cdp_url,
        "timeout": timeout,
        "disable_resources": disable_resources,
        "wait_selector": wait_selector,
        "cookies": cookies,
        "network_idle": network_idle,
        "wait_selector_state": wait_selector_state,
        "block_webrtc": block_webrtc,
        "allow_webgl": allow_webgl,
        "solve_cloudflare": solve_cloudflare,
        "additional_args": additional_args,
    }
    page = await StealthyFetcher.async_fetch(url, **browser_options)

    extracted = Convertor._extract_content(
        page,
        css_selector=css_selector,
        extraction_type=extraction_type,
        main_content_only=main_content_only,
    )
    return _content_translator(extracted, page)
497
+
498
@staticmethod
async def bulk_stealthy_fetch(
    urls: List[str],
    extraction_type: extraction_types = "markdown",
    css_selector: Optional[str] = None,
    main_content_only: bool = True,
    headless: bool = True,  # noqa: F821
    google_search: bool = True,
    real_chrome: bool = False,
    wait: int | float = 0,
    proxy: Optional[str | Dict[str, str]] = None,
    timezone_id: str | None = None,
    locale: str | None = None,
    extra_headers: Optional[Dict[str, str]] = None,
    useragent: Optional[str] = None,
    hide_canvas: bool = False,
    cdp_url: Optional[str] = None,
    timeout: int | float = 30000,
    disable_resources: bool = False,
    wait_selector: Optional[str] = None,
    cookies: Sequence[SetCookieParam] | None = None,
    network_idle: bool = False,
    wait_selector_state: SelectorWaitStates = "attached",
    block_webrtc: bool = False,
    allow_webgl: bool = True,
    solve_cloudflare: bool = False,
    additional_args: Optional[Dict] = None,
) -> List[ResponseModel]:
    """Use the stealthy fetcher to fetch a group of URLs at the same time, and for each page return a structured output of the result.
    Note: This is the only suitable fetcher for high protection levels.
    Note: If the `css_selector` resolves to more than one element, all the elements will be returned.

    :param urls: A list of the URLs to request.
    :param extraction_type: The type of content to extract from the page. Defaults to "markdown". Options are:
        - Markdown will convert the page content to Markdown format.
        - HTML will return the raw HTML content of the page.
        - Text will return the text content of the page.
    :param css_selector: CSS selector to extract the content from the page. If main_content_only is True, then it will be executed on the main content of the page. Defaults to None.
    :param main_content_only: Whether to extract only the main content of the page. Defaults to True. The main content here is the data inside the `<body>` tag.
    :param headless: Run the browser in headless/hidden (default), or headful/visible mode.
    :param disable_resources: Drop requests for unnecessary resources for a speed boost.
        Requests dropped are of type `font`, `image`, `media`, `beacon`, `object`, `imageset`, `texttrack`, `websocket`, `csp_report`, and `stylesheet`.
    :param useragent: Pass a useragent string to be used. Otherwise the fetcher will generate a real Useragent of the same browser and use it.
    :param cookies: Set cookies for the next request.
    :param solve_cloudflare: Solves all types of the Cloudflare's Turnstile/Interstitial challenges before returning the response to you.
    :param allow_webgl: Enabled by default. Disabling WebGL is not recommended as many WAFs now check if WebGL is enabled.
    :param network_idle: Wait for the page until there are no network connections for at least 500 ms.
    :param wait: The time (milliseconds) the fetcher will wait after everything finishes before closing the page and returning the ` Response ` object.
    :param timeout: The timeout in milliseconds that is used in all operations and waits through the page. The default is 30,000
    :param wait_selector: Wait for a specific CSS selector to be in a specific state.
    :param timezone_id: Changes the timezone of the browser. Defaults to the system timezone.
    :param locale: Specify user locale, for example, `en-GB`, `de-DE`, etc. Locale will affect navigator.language value, Accept-Language request header value as well as number and date formatting
        rules. Defaults to the system default locale.
    :param wait_selector_state: The state to wait for the selector given with `wait_selector`. The default state is `attached`.
    :param real_chrome: If you have a Chrome browser installed on your device, enable this, and the Fetcher will launch an instance of your browser and use it.
    :param hide_canvas: Add random noise to canvas operations to prevent fingerprinting.
    :param block_webrtc: Forces WebRTC to respect proxy settings to prevent local IP address leak.
    :param cdp_url: Instead of launching a new browser instance, connect to this CDP URL to control real browsers through CDP.
    :param google_search: Enabled by default, Scrapling will set the referer header to be as if this request came from a Google search of this website's domain name.
    :param extra_headers: A dictionary of extra headers to add to the request. _The referer set by the `google_search` argument takes priority over the referer set here if used together._
    :param proxy: The proxy to be used with requests, it can be a string or a dictionary with the keys 'server', 'username', and 'password' only.
    :param additional_args: Additional arguments to be passed to Playwright's context as additional settings, and it takes higher priority than Scrapling's settings.
    """
    # NOTE: this docstring is published verbatim as the MCP tool description by `serve()`,
    # so its wording is part of the runtime behaviour.

    # Every browser-level option is forwarded unchanged to one shared session.
    session_options = dict(
        wait=wait,
        proxy=proxy,
        locale=locale,
        cdp_url=cdp_url,
        timeout=timeout,
        cookies=cookies,
        headless=headless,
        useragent=useragent,
        timezone_id=timezone_id,
        real_chrome=real_chrome,
        hide_canvas=hide_canvas,
        allow_webgl=allow_webgl,
        network_idle=network_idle,
        block_webrtc=block_webrtc,
        wait_selector=wait_selector,
        google_search=google_search,
        extra_headers=extra_headers,
        additional_args=additional_args,
        solve_cloudflare=solve_cloudflare,
        disable_resources=disable_resources,
        wait_selector_state=wait_selector_state,
    )
    async with AsyncStealthySession(**session_options) as session:
        # Schedule one fetch per URL and await them together; `gather` keeps the input order.
        pending = [session.fetch(url) for url in urls]
        fetched_pages = await gather(*pending)

        structured = []
        for page in fetched_pages:
            extracted = Convertor._extract_content(
                page,
                css_selector=css_selector,
                extraction_type=extraction_type,
                main_content_only=main_content_only,
            )
            structured.append(_content_translator(extracted, page))
        return structured
598
+
599
def serve(self, http: bool, host: str, port: int):
    """Build the Scrapling MCP server, register its tools and routes, and run it.

    :param http: When true, serve over HTTP through uvicorn (with the Gradio UI mounted if it can be imported);
        otherwise run the server on the stdio transport.
    :param host: Host passed to FastMCP and, in HTTP mode, to uvicorn.
    :param port: Port passed to FastMCP and, in HTTP mode, to uvicorn.
    """
    server = FastMCP(name="Scrapling", host=host, port=port)

    # Each fetch helper is exposed as a tool named after itself, described by its own docstring.
    exposed_tools = ("get", "bulk_get", "fetch", "bulk_fetch", "stealthy_fetch", "bulk_stealthy_fetch")
    for tool_name in exposed_tools:
        handler = getattr(self, tool_name)
        server.add_tool(handler, title=tool_name, description=handler.__doc__, structured_output=True)

    @server.custom_route("/health", methods=["GET"])
    async def health_check(request: Request) -> Response:
        return JSONResponse({"status": "healthy"})

    @server.custom_route("/api-docs", methods=["GET"])
    async def api_docs(request: Request) -> Response:
        registered = await server.list_tools()
        return JSONResponse([tool.model_dump() for tool in registered])

    if not http:
        server.run(transport="stdio")
        return

    # Imported lazily so that stdio mode does not need uvicorn.
    import uvicorn

    # The ASGI application that FastMCP builds for the streamable HTTP transport.
    mcp_app = server.streamable_http_app()

    try:
        import gradio as gr
        from scrapling.ui import create_ui

        demo = create_ui()
        # Attach the Gradio UI at the root path of the MCP application.
        app = gr.mount_gradio_app(mcp_app, demo, path="/")
    except ImportError:  # also covers ModuleNotFoundError, which subclasses ImportError
        app = mcp_app
        print("Gradio not installed or failed to load, running MCP server only.")

    uvicorn.run(app, host=host, port=port)
core/custom_types.py ADDED
@@ -0,0 +1,345 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from collections.abc import Mapping
2
+ from types import MappingProxyType
3
+ from re import compile as re_compile, UNICODE, IGNORECASE
4
+
5
+ from orjson import dumps, loads
6
+ from w3lib.html import replace_entities as _replace_entities
7
+
8
+ from scrapling.core._types import (
9
+ Any,
10
+ cast,
11
+ Dict,
12
+ List,
13
+ Union,
14
+ overload,
15
+ TypeVar,
16
+ Literal,
17
+ Pattern,
18
+ Iterable,
19
+ Generator,
20
+ SupportsIndex,
21
+ )
22
+ from scrapling.core.utils import _is_iterable, flatten, __CONSECUTIVE_SPACES_REGEX__
23
+
24
# Type variable bound to TextHandler; used as the value type of AttributesHandler
_TextHandlerType = TypeVar("_TextHandlerType", bound="TextHandler")
# Translation table that turns tab, carriage return, and newline into one space each.
# The mapping form is used because the two-string form of `str.maketrans` raises
# ValueError unless both strings have exactly the same length.
__CLEANING_TABLE__ = str.maketrans({"\t": " ", "\r": " ", "\n": " "})
27
+
28
+
29
+ class TextHandler(str):
30
+ """Extends standard Python string by adding more functionality"""
31
+
32
+ __slots__ = ()
33
+
34
+ def __getitem__(self, key: SupportsIndex | slice) -> "TextHandler": # pragma: no cover
35
+ lst = super().__getitem__(key)
36
+ return TextHandler(lst)
37
+
38
+ def split(self, sep: str | None = None, maxsplit: SupportsIndex = -1) -> list[Any]: # pragma: no cover
39
+ return TextHandlers([TextHandler(s) for s in super().split(sep, maxsplit)])
40
+
41
+ def strip(self, chars: str | None = None) -> Union[str, "TextHandler"]: # pragma: no cover
42
+ return TextHandler(super().strip(chars))
43
+
44
+ def lstrip(self, chars: str | None = None) -> Union[str, "TextHandler"]: # pragma: no cover
45
+ return TextHandler(super().lstrip(chars))
46
+
47
+ def rstrip(self, chars: str | None = None) -> Union[str, "TextHandler"]: # pragma: no cover
48
+ return TextHandler(super().rstrip(chars))
49
+
50
+ def capitalize(self) -> Union[str, "TextHandler"]: # pragma: no cover
51
+ return TextHandler(super().capitalize())
52
+
53
+ def casefold(self) -> Union[str, "TextHandler"]: # pragma: no cover
54
+ return TextHandler(super().casefold())
55
+
56
+ def center(self, width: SupportsIndex, fillchar: str = " ") -> Union[str, "TextHandler"]: # pragma: no cover
57
+ return TextHandler(super().center(width, fillchar))
58
+
59
+ def expandtabs(self, tabsize: SupportsIndex = 8) -> Union[str, "TextHandler"]: # pragma: no cover
60
+ return TextHandler(super().expandtabs(tabsize))
61
+
62
+ def format(self, *args: object, **kwargs: object) -> Union[str, "TextHandler"]: # pragma: no cover
63
+ return TextHandler(super().format(*args, **kwargs))
64
+
65
+ def format_map(self, mapping) -> Union[str, "TextHandler"]: # pragma: no cover
66
+ return TextHandler(super().format_map(mapping))
67
+
68
+ def join(self, iterable: Iterable[str]) -> Union[str, "TextHandler"]: # pragma: no cover
69
+ return TextHandler(super().join(iterable))
70
+
71
+ def ljust(self, width: SupportsIndex, fillchar: str = " ") -> Union[str, "TextHandler"]: # pragma: no cover
72
+ return TextHandler(super().ljust(width, fillchar))
73
+
74
+ def rjust(self, width: SupportsIndex, fillchar: str = " ") -> Union[str, "TextHandler"]: # pragma: no cover
75
+ return TextHandler(super().rjust(width, fillchar))
76
+
77
+ def swapcase(self) -> Union[str, "TextHandler"]: # pragma: no cover
78
+ return TextHandler(super().swapcase())
79
+
80
+ def title(self) -> Union[str, "TextHandler"]: # pragma: no cover
81
+ return TextHandler(super().title())
82
+
83
+ def translate(self, table) -> Union[str, "TextHandler"]: # pragma: no cover
84
+ return TextHandler(super().translate(table))
85
+
86
+ def zfill(self, width: SupportsIndex) -> Union[str, "TextHandler"]: # pragma: no cover
87
+ return TextHandler(super().zfill(width))
88
+
89
+ def replace(self, old: str, new: str, count: SupportsIndex = -1) -> Union[str, "TextHandler"]:
90
+ return TextHandler(super().replace(old, new, count))
91
+
92
+ def upper(self) -> Union[str, "TextHandler"]:
93
+ return TextHandler(super().upper())
94
+
95
+ def lower(self) -> Union[str, "TextHandler"]:
96
+ return TextHandler(super().lower())
97
+
98
+ ##############
99
+
100
+ def sort(self, reverse: bool = False) -> Union[str, "TextHandler"]:
101
+ """Return a sorted version of the string"""
102
+ return self.__class__("".join(sorted(self, reverse=reverse)))
103
+
104
+ def clean(self, remove_entities=False) -> Union[str, "TextHandler"]:
105
+ """Return a new version of the string after removing all white spaces and consecutive spaces"""
106
+ data = self.translate(__CLEANING_TABLE__)
107
+ if remove_entities:
108
+ data = _replace_entities(data)
109
+ return self.__class__(__CONSECUTIVE_SPACES_REGEX__.sub(" ", data).strip())
110
+
111
+ # For easy copy-paste from Scrapy/parsel code when needed :)
112
+ def get(self, default=None): # pragma: no cover
113
+ return self
114
+
115
+ def get_all(self): # pragma: no cover
116
+ return self
117
+
118
+ extract = get_all
119
+ extract_first = get
120
+
121
+ def json(self) -> Dict:
122
+ """Return JSON response if the response is jsonable otherwise throw error"""
123
+ # Using str function as a workaround for orjson issue with subclasses of str.
124
+ # Check this out: https://github.com/ijl/orjson/issues/445
125
+ return loads(str(self))
126
+
127
+ @overload
128
+ def re(
129
+ self,
130
+ regex: str | Pattern,
131
+ replace_entities: bool = True,
132
+ clean_match: bool = False,
133
+ case_sensitive: bool = True,
134
+ *,
135
+ check_match: Literal[True],
136
+ ) -> bool: ...
137
+
138
+ @overload
139
+ def re(
140
+ self,
141
+ regex: str | Pattern,
142
+ replace_entities: bool = True,
143
+ clean_match: bool = False,
144
+ case_sensitive: bool = True,
145
+ check_match: Literal[False] = False,
146
+ ) -> "TextHandlers": ...
147
+
148
+ def re(
149
+ self,
150
+ regex: str | Pattern,
151
+ replace_entities: bool = True,
152
+ clean_match: bool = False,
153
+ case_sensitive: bool = True,
154
+ check_match: bool = False,
155
+ ) -> Union["TextHandlers", bool]:
156
+ """Apply the given regex to the current text and return a list of strings with the matches.
157
+
158
+ :param regex: Can be either a compiled regular expression or a string.
159
+ :param replace_entities: If enabled character entity references are replaced by their corresponding character
160
+ :param clean_match: If enabled, this will ignore all whitespaces and consecutive spaces while matching
161
+ :param case_sensitive: If disabled, function will set the regex to ignore the letters-case while compiling it
162
+ :param check_match: Used to quickly check if this regex matches or not without any operations on the results
163
+
164
+ """
165
+ if isinstance(regex, str):
166
+ if case_sensitive:
167
+ regex = re_compile(regex, UNICODE)
168
+ else:
169
+ regex = re_compile(regex, flags=UNICODE | IGNORECASE)
170
+
171
+ input_text = self.clean() if clean_match else self
172
+ results = regex.findall(input_text)
173
+ if check_match:
174
+ return bool(results)
175
+
176
+ if all(_is_iterable(res) for res in results):
177
+ results = flatten(results)
178
+
179
+ if not replace_entities:
180
+ return TextHandlers([TextHandler(string) for string in results])
181
+
182
+ return TextHandlers([TextHandler(_replace_entities(s)) for s in results])
183
+
184
+ def re_first(
185
+ self,
186
+ regex: str | Pattern,
187
+ default: Any = None,
188
+ replace_entities: bool = True,
189
+ clean_match: bool = False,
190
+ case_sensitive: bool = True,
191
+ ) -> "TextHandler":
192
+ """Apply the given regex to text and return the first match if found, otherwise return the default value.
193
+
194
+ :param regex: Can be either a compiled regular expression or a string.
195
+ :param default: The default value to be returned if there is no match
196
+ :param replace_entities: If enabled character entity references are replaced by their corresponding character
197
+ :param clean_match: If enabled, this will ignore all whitespaces and consecutive spaces while matching
198
+ :param case_sensitive: If disabled, function will set the regex to ignore the letters-case while compiling it
199
+
200
+ """
201
+ result = self.re(
202
+ regex,
203
+ replace_entities,
204
+ clean_match=clean_match,
205
+ case_sensitive=case_sensitive,
206
+ )
207
+ return result[0] if result else default
208
+
209
+
210
class TextHandlers(List[TextHandler]):
    """
    A list of :class:`TextHandler` items that adds regex helpers and parsel-style accessors on top of ``List``.
    """

    __slots__ = ()

    @overload
    def __getitem__(self, pos: SupportsIndex) -> TextHandler:  # pragma: no cover
        pass

    @overload
    def __getitem__(self, pos: slice) -> "TextHandlers":  # pragma: no cover
        pass

    def __getitem__(self, pos: SupportsIndex | slice) -> Union[TextHandler, "TextHandlers"]:
        picked = super().__getitem__(pos)
        # A single index gives one TextHandler; a slice gives a new TextHandlers.
        if not isinstance(pos, slice):
            return TextHandler(cast(TextHandler, picked))
        return TextHandlers(cast(List[TextHandler], picked))

    def re(
        self,
        regex: str | Pattern,
        replace_entities: bool = True,
        clean_match: bool = False,
        case_sensitive: bool = True,
    ) -> "TextHandlers":
        """Apply ``.re()`` to every element and return all the matches flattened into one TextHandlers.

        :param regex: Can be either a compiled regular expression or a string.
        :param replace_entities: If enabled, character entity references are replaced by their corresponding character
        :param clean_match: If enabled, matching runs on the cleaned version of each element
        :param case_sensitive: If disabled, a string regex is compiled to ignore the letters-case
        """
        per_item_matches = [item.re(regex, replace_entities, clean_match, case_sensitive) for item in self]
        return TextHandlers(flatten(per_item_matches))

    def re_first(
        self,
        regex: str | Pattern,
        default: Any = None,
        replace_entities: bool = True,
        clean_match: bool = False,
        case_sensitive: bool = True,
    ) -> TextHandler:  # pragma: no cover
        """Walk the elements in order and return the first regex match found in any of them,
        or the default value when no element matches.

        :param regex: Can be either a compiled regular expression or a string.
        :param default: The value returned when there is no match
        :param replace_entities: If enabled, character entity references are replaced by their corresponding character
        :param clean_match: If enabled, matching runs on the cleaned version of each element
        :param case_sensitive: If disabled, a string regex is compiled to ignore the letters-case
        """
        for item in self:
            item_matches = item.re(regex, replace_entities, clean_match, case_sensitive)
            for match in item_matches:
                return match
        return default

    # Scrapy/parsel-compatible accessors
    def get(self, default=None):
        """Return the first item of this list.

        :param default: the value to return when the list is empty
        """
        if len(self) > 0:
            return self[0]
        return default

    def extract(self):
        return self

    extract_first = get
    get_all = extract
283
+
284
+
285
class AttributesHandler(Mapping[str, _TextHandlerType]):
    """Read-only mapping of attributes, stored behind a `MappingProxyType`, with a few extra helpers.
    String values are wrapped in `TextHandler`. Call `dict()` on an instance to get a regular dictionary.
    """

    __slots__ = ("_data",)

    def __init__(self, mapping: Any = None, **kwargs: Any) -> None:
        wrapped: Dict[str, Any] = {}

        # Positional mapping entries come first, keyword entries are applied on top of them.
        if mapping is not None:
            for key, value in mapping.items():
                wrapped[key] = TextHandler(value) if isinstance(value, str) else value

        for key, value in kwargs.items():
            wrapped[key] = TextHandler(value) if isinstance(value, str) else value

        # The proxy exposes the dictionary without allowing writes through it
        self._data: Mapping[str, Any] = MappingProxyType(wrapped)

    def get(self, key: str, default: Any = None) -> _TextHandlerType:
        """Return the value stored under `key`, or `default` when the key is absent"""
        return self._data.get(key, default)

    def search_values(self, keyword: str, partial: bool = False) -> Generator["AttributesHandler", None, None]:
        """Yield a one-item `AttributesHandler` for every attribute whose value matches the keyword.

        :param keyword: The keyword to look for in the attribute values
        :param partial: If True, a value matches when it contains the keyword; otherwise it must equal it
        """
        for name, value in self._data.items():
            matched = (keyword in value) if partial else (keyword == value)
            if matched:
                yield AttributesHandler({name: value})

    @property
    def json_string(self) -> bytes:
        """Serialize the attributes with `dumps` and return the resulting bytes; serialization errors propagate"""
        plain = dict(self._data)
        return dumps(plain)

    def __getitem__(self, key: str) -> _TextHandlerType:
        return self._data[key]

    def __iter__(self):
        return iter(self._data)

    def __len__(self):
        return len(self._data)

    def __repr__(self):
        class_name = self.__class__.__name__
        return f"{class_name}({self._data})"

    def __str__(self):
        return str(self._data)

    def __contains__(self, key):
        return key in self._data
core/mixins.py ADDED
@@ -0,0 +1,85 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from scrapling.core._types import Any, Dict
2
+
3
+
4
class SelectorsGeneration:
    """
    Functions for generating selectors
    Trying to generate selectors like Firefox or maybe cleaner ones!? Ehm
    Inspiration: https://searchfox.org/mozilla-central/source/devtools/shared/inspector/css-logic.js#591
    """

    # Note: This is a mixin class meant to be used with Selector.
    # The methods access Selector attributes (._root, .parent, .attrib, .tag, etc.)
    # through self, which will be a Selector instance at runtime.

    def _general_selection(self: Any, selection: str = "css", full_path: bool = False) -> str:
        """Generate a selector for the current element by walking up through its ancestors.

        :param selection: "css" (case-insensitive) for a CSS selector; any other value produces XPath.
        :param full_path: If False, stop at the first element that has an `id`; if True, keep climbing
            until the `html` element (or the top of the tree) is reached.
        :return: A string of the generated selector, or an empty string for text nodes.
        """
        if self._is_text_node(self._root):
            return ""

        css = selection.lower() == "css"
        # Steps are collected from the element upwards and reversed when rendering.
        steps = []

        def _render() -> str:
            ordered = reversed(steps)
            return " > ".join(ordered) if css else "//" + "/".join(ordered)

        target = self
        while target is not None:
            if not target.parent:
                break

            element_id = target.attrib.get("id")
            if element_id:
                # An id is enough to identify the element. The XPath step carries the `*` node test
                # itself so that it stays valid when it is not the first step (full path mode).
                steps.append(f"#{element_id}" if css else f"*[@id='{element_id}']")
                if not full_path:
                    return _render()
            else:
                step = f"{target.tag}"
                # Classes are not used because some websites share the exact classes between elements.
                # Instead, count the siblings with the same tag up to and including the target.
                position = 0
                for child in target.parent.children:
                    if child.tag == target.tag:
                        position += 1
                    if child._root == target._root:
                        break

                if position > 1:
                    step += f":nth-of-type({position})" if css else f"[{position}]"

                steps.append(step)

            target = target.parent
            if target is None or target.tag == "html":
                return _render()

        return _render()

    @property
    def generate_css_selector(self: Any) -> str:
        """Generate a CSS selector for the current element
        :return: A string of the generated selector.
        """
        return self._general_selection()

    @property
    def generate_full_css_selector(self: Any) -> str:
        """Generate a complete CSS selector for the current element
        :return: A string of the generated selector.
        """
        return self._general_selection(full_path=True)

    @property
    def generate_xpath_selector(self: Any) -> str:
        """Generate an XPath selector for the current element
        :return: A string of the generated selector.
        """
        return self._general_selection("xpath")

    @property
    def generate_full_xpath_selector(self: Any) -> str:
        """Generate a complete XPath selector for the current element
        :return: A string of the generated selector.
        """
        return self._general_selection("xpath", full_path=True)
core/shell.py ADDED
@@ -0,0 +1,643 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # -*- coding: utf-8 -*-
2
+ from sys import stderr
3
+ from copy import deepcopy
4
+ from functools import wraps
5
+ from re import sub as re_sub
6
+ from collections import namedtuple
7
+ from shlex import split as shlex_split
8
+ from inspect import signature, Parameter
9
+ from tempfile import mkstemp as make_temp_file
10
+ from argparse import ArgumentParser, SUPPRESS
11
+ from webbrowser import open as open_in_browser
12
+ from urllib.parse import urlparse, urlunparse, parse_qsl
13
+ from logging import (
14
+ DEBUG,
15
+ INFO,
16
+ WARNING,
17
+ ERROR,
18
+ CRITICAL,
19
+ FATAL,
20
+ getLogger,
21
+ getLevelName,
22
+ )
23
+
24
+ from orjson import loads as json_loads, JSONDecodeError
25
+
26
+ from ._shell_signatures import Signatures_map
27
+ from scrapling import __version__
28
+ from scrapling.core.utils import log
29
+ from scrapling.parser import Selector, Selectors
30
+ from scrapling.core.custom_types import TextHandler
31
+ from scrapling.engines.toolbelt.custom import Response
32
+ from scrapling.core.utils._shell import _ParseHeaders, _CookieParser
33
+ from scrapling.core._types import (
34
+ Callable,
35
+ Dict,
36
+ Any,
37
+ cast,
38
+ Optional,
39
+ Generator,
40
+ extraction_types,
41
+ )
42
+
43
+
44
# Lowercase level names mapped to the matching `logging` level constants
_known_logging_levels = dict(
    debug=DEBUG,
    info=INFO,
    warning=WARNING,
    error=ERROR,
    critical=CRITICAL,
    fatal=FATAL,
)
52
+
53
+
54
# Container for a parsed curl command, reduced to the fields passed on as Fetcher arguments
Request = namedtuple(
    "Request",
    (
        "method",
        "url",
        "params",
        "data",  # str, bytes, or dict (for urlencoded payloads)
        "json_data",  # Python object (dict/list) holding a JSON payload
        "headers",
        "cookies",
        "proxy",
        "follow_redirects",  # set from the -L flag
    ),
)
69
+
70
+
71
# argparse normally terminates the process on errors; this variant raises instead so callers can recover
class NoExitArgumentParser(ArgumentParser):  # pragma: no cover
    """`ArgumentParser` whose `error` and `exit` hooks log and raise `ValueError` instead of exiting."""

    def error(self, message):
        """Log the parsing failure and raise it as a `ValueError`."""
        failure = f"Curl arguments parsing error: {message}"
        log.error(failure)
        raise ValueError(failure)

    def exit(self, status=0, message=None):
        """Log and print the exit message when one is given, then always raise `ValueError`."""
        if message:
            log.error(f"Scrapling shell exited with status {status}: {message}")
            self._print_message(message, stderr)

        reason = message or "Unknown reason"
        raise ValueError(f"Scrapling shell exited with status {status}: {reason}")
82
+
83
+
84
+ class CurlParser:
85
+ """Builds the argument parser for relevant curl flags from DevTools."""
86
+
87
def __init__(self) -> None:
    """Store the Fetcher class and build the argparse parser for the supported curl flags."""
    from scrapling.fetchers import Fetcher as __Fetcher

    self.__fetcher = __Fetcher

    # The curl command is parsed with argparse rather than regex. The flags covered are mainly
    # the ones that appear in commands copied from the DevTools network tab.
    curl_args = NoExitArgumentParser(add_help=False)  # No default help option

    # Basic curl arguments
    curl_args.add_argument("curl_command_placeholder", nargs="?", help=SUPPRESS)
    curl_args.add_argument("url")
    curl_args.add_argument("-X", "--request", dest="method", default=None)
    curl_args.add_argument("-H", "--header", action="append", default=[])
    # Note: DevTools usually includes the user agent in -H
    curl_args.add_argument("-A", "--user-agent", help="Will be parsed from -H if present")

    # Data arguments: plain --data, --data-raw (often used by browsers for JSON bodies), and --data-binary
    for data_flags in (("-d", "--data"), ("--data-raw",), ("--data-binary",)):
        curl_args.add_argument(*data_flags, default=None)
    # Kept for completeness, though less common in commands copied from a browser
    curl_args.add_argument("--data-urlencode", action="append", default=[])
    curl_args.add_argument("-G", "--get", action="store_true")  # Use GET and put data in the URL

    curl_args.add_argument(
        "-b",
        "--cookie",
        default=None,
        help="Send cookies from string/file (string format used by DevTools)",
    )

    # Proxy and basic proxy authentication
    curl_args.add_argument("-x", "--proxy", default=None)
    curl_args.add_argument("-U", "--proxy-user", default=None)

    # Boolean switches: connection/security flags first, then flags that are often present
    # but may not map directly to request arguments
    switches = (
        ("-k", "--insecure"),
        ("--compressed",),  # Very common from browsers
        ("-i", "--include"),
        ("-s", "--silent"),
        ("-v", "--verbose"),
    )
    for switch_flags in switches:
        curl_args.add_argument(*switch_flags, action="store_true")

    self.parser: NoExitArgumentParser = curl_args
    self._supported_methods = ("get", "post", "put", "delete")
133
+
134
+ # --- Main Parsing Logic ---
135
+ def parse(self, curl_command: str) -> Optional[Request]:
136
+ """Parses the curl command string into a structured context for Fetcher."""
137
+
138
+ clean_command = curl_command.strip().lstrip("curl").strip().replace("\\\n", " ")
139
+
140
+ try:
141
+ tokens = shlex_split(clean_command) # Split the string using shell-like syntax
142
+ except ValueError as e: # pragma: no cover
143
+ log.error(f"Could not split command line: {e}")
144
+ return None
145
+
146
+ try:
147
+ parsed_args, unknown = self.parser.parse_known_args(tokens)
148
+ if unknown:
149
+ raise AttributeError(f"Unknown/Unsupported curl arguments: {unknown}")
150
+
151
+ except ValueError: # pragma: no cover
152
+ return None
153
+
154
+ except AttributeError:
155
+ raise
156
+
157
+ except Exception as e: # pragma: no cover
158
+ log.error(f"An unexpected error occurred during curl arguments parsing: {e}")
159
+ return None
160
+
161
+ # --- Determine Method ---
162
+ method = "get" # Default
163
+ if parsed_args.get: # `-G` forces GET
164
+ method = "get"
165
+
166
+ elif parsed_args.method:
167
+ method = parsed_args.method.strip().lower()
168
+
169
+ # Infer POST if data is present (unless overridden by -X or -G)
170
+ elif any(
171
+ [
172
+ parsed_args.data,
173
+ parsed_args.data_raw,
174
+ parsed_args.data_binary,
175
+ parsed_args.data_urlencode,
176
+ ]
177
+ ):
178
+ method = "post"
179
+
180
+ headers, cookies = _ParseHeaders(parsed_args.header)
181
+
182
+ if parsed_args.cookie:
183
+ # We are focusing on the string format from DevTools.
184
+ try:
185
+ for key, value in _CookieParser(parsed_args.cookie):
186
+ # Update the cookie dict, potentially overwriting cookies with the same name from -H 'cookie:'
187
+ cookies[key] = value
188
+ log.debug(f"Parsed cookies from -b argument: {list(cookies.keys())}")
189
+ except Exception as e: # pragma: no cover
190
+ log.error(f"Could not parse cookie string from -b '{parsed_args.cookie}': {e}")
191
+
192
+ # --- Process Data Payload ---
193
+ params = dict()
194
+ data_payload: Optional[str | bytes | Dict] = None
195
+ json_payload: Optional[Any] = None
196
+
197
+ # DevTools often uses --data-raw for JSON bodies
198
+ # Precedence: --data-binary > --data-raw / -d > --data-urlencode
199
+ if parsed_args.data_binary is not None: # pragma: no cover
200
+ try:
201
+ data_payload = parsed_args.data_binary.encode("utf-8")
202
+ log.debug("Using data from --data-binary as bytes.")
203
+ except Exception as e:
204
+ log.warning(
205
+ f"Could not encode binary data '{parsed_args.data_binary}' as bytes: {e}. Using raw string."
206
+ )
207
+ data_payload = parsed_args.data_binary # Fallback to string
208
+
209
+ elif parsed_args.data_raw is not None:
210
+ data_payload = parsed_args.data_raw.lstrip("$")
211
+
212
+ elif parsed_args.data is not None:
213
+ data_payload = parsed_args.data
214
+
215
+ elif parsed_args.data_urlencode: # pragma: no cover
216
+ # Combine and parse urlencoded data
217
+ combined_data = "&".join(parsed_args.data_urlencode)
218
+ try:
219
+ data_payload = dict(parse_qsl(combined_data, keep_blank_values=True))
220
+ except Exception as e:
221
+ log.warning(f"Could not parse urlencoded data '{combined_data}': {e}. Treating as raw string.")
222
+ data_payload = combined_data
223
+
224
+ # Check if raw data looks like JSON, prefer 'json' param if so
225
+ if isinstance(data_payload, str):
226
+ try:
227
+ maybe_json = json_loads(data_payload)
228
+ if isinstance(maybe_json, (dict, list)):
229
+ json_payload = maybe_json
230
+ data_payload = None
231
+ except JSONDecodeError:
232
+ pass # Not JSON, keep it in data_payload
233
+
234
+ # Handle `-G`: Move data to params if the method is GET
235
+ if method == "get" and data_payload: # pragma: no cover
236
+ if isinstance(data_payload, dict): # From --data-urlencode likely
237
+ params.update(data_payload)
238
+ elif isinstance(data_payload, str):
239
+ try:
240
+ params.update(dict(parse_qsl(data_payload, keep_blank_values=True)))
241
+ except ValueError:
242
+ log.warning(f"Could not parse data '{data_payload}' into GET parameters for -G.")
243
+
244
+ if params:
245
+ data_payload = None # Clear data as it's moved to params
246
+ json_payload = None # Should not have JSON body with -G
247
+
248
+ # --- Process Proxy ---
249
+ proxies: Optional[Dict[str, str]] = None
250
+ if parsed_args.proxy:
251
+ proxy_url = f"http://{parsed_args.proxy}" if "://" not in parsed_args.proxy else parsed_args.proxy
252
+
253
+ if parsed_args.proxy_user:
254
+ user_pass = parsed_args.proxy_user
255
+ parts = urlparse(proxy_url)
256
+ netloc_parts = parts.netloc.split("@")
257
+ netloc = f"{user_pass}@{netloc_parts[-1]}" if len(netloc_parts) > 1 else f"{user_pass}@{parts.netloc}"
258
+ proxy_url = urlunparse(
259
+ (
260
+ parts.scheme,
261
+ netloc,
262
+ parts.path,
263
+ parts.params,
264
+ parts.query,
265
+ parts.fragment,
266
+ )
267
+ )
268
+
269
+ # Standard proxy dict format
270
+ proxies = {"http": proxy_url, "https": proxy_url}
271
+ log.debug(f"Using proxy configuration: {proxies}")
272
+
273
+ # --- Final Context ---
274
+ return Request(
275
+ method=method,
276
+ url=parsed_args.url,
277
+ params=params,
278
+ data=data_payload,
279
+ json_data=json_payload,
280
+ headers=headers,
281
+ cookies=cookies,
282
+ proxy=proxies,
283
+ follow_redirects=True, # Scrapling default is True
284
+ )
285
+
286
+ def convert2fetcher(self, curl_command: Request | str) -> Optional[Response]:
287
+ if isinstance(curl_command, (Request, str)):
288
+ request = self.parse(curl_command) if isinstance(curl_command, str) else curl_command
289
+
290
+ # Ensure request parsing was successful before proceeding
291
+ if request is None: # pragma: no cover
292
+ log.error("Failed to parse curl command, cannot convert to fetcher.")
293
+ return None
294
+
295
+ request_args = request._asdict()
296
+ method = request_args.pop("method").strip().lower()
297
+ if method in self._supported_methods:
298
+ request_args["json"] = request_args.pop("json_data")
299
+
300
+ # Ensure data/json are removed for non-POST/PUT methods
301
+ if method not in ("post", "put"):
302
+ _ = request_args.pop("data", None)
303
+ _ = request_args.pop("json", None)
304
+
305
+ try:
306
+ return getattr(self.__fetcher, method)(**request_args)
307
+ except Exception as e: # pragma: no cover
308
+ log.error(f"Error calling Fetcher.{method}: {e}")
309
+ return None
310
+ else: # pragma: no cover
311
+ log.error(f'Request method "{method}" isn\'t supported by Scrapling yet')
312
+ return None
313
+
314
+ else: # pragma: no cover
315
+ log.error("Input must be a valid curl command string or a Request object.")
316
+ return None
317
+
318
+
319
def _unpack_signature(func, signature_name=None):
    """
    Rebuild the signature of `func` with its `**kwargs` expanded into the fields listed in `Signatures_map`.

    This lets the interactive shell display individual parameters instead of only `**kwargs`.

    :param func: The callable to inspect.
    :param signature_name: Key to look up in `Signatures_map`; falls back to `func.__name__` when not given.
    :return: The expanded signature, or the plain signature when the name is unknown, when the number of
        parameters did not change, or when anything goes wrong.
    """
    try:
        original = signature(func)
        lookup_name = signature_name if signature_name else getattr(func, "__name__", None)

        # Nothing to expand for functions without known parameters
        if lookup_name not in Signatures_map:
            return original

        rebuilt = []
        for param in original.parameters.values():
            if param.kind != Parameter.VAR_KEYWORD:
                rebuilt.append(param)
                continue

            # Swap **kwargs for one keyword-only parameter per known field
            rebuilt.extend(
                Parameter(field_name, Parameter.KEYWORD_ONLY, default=Parameter.empty, annotation=field_type)
                for field_name, field_type in Signatures_map[lookup_name].items()
            )

        # Only build a new signature when the expansion changed the parameter count
        if len(rebuilt) == len(original.parameters):
            return original
        return original.replace(parameters=rebuilt)

    except Exception:  # pragma: no cover
        return signature(func)
351
+
352
+
353
def show_page_in_browser(page: Selector):  # pragma: no cover
    """Write the page's HTML to a temporary `.html` file and open that file in a browser.

    Invalid input and failures are logged; nothing is raised and nothing is returned.

    :param page: The `Selector` whose `html_content` should be displayed.
    """
    if not (page and isinstance(page, Selector)):
        log.error("Input must be of type `Selector`")
        return

    try:
        # NOTE(review): `make_temp_file` is presumably `tempfile.mkstemp` (returns a descriptor and a path) - confirm
        descriptor, path = make_temp_file(prefix="scrapling_view_", suffix=".html")
        with open(descriptor, "w", encoding=page.encoding) as handle:
            handle.write(page.html_content)

        open_in_browser(f"file://{path}")
    except IOError as e:
        log.error(f"Failed to write temporary file for viewing: {e}")
    except Exception as e:
        log.error(f"An unexpected error occurred while viewing the page: {e}")
368
+
369
+
370
class CustomShell:
    """A custom IPython shell with minimal dependencies"""

    def __init__(self, code, log_level="debug"):
        """Import the fetchers lazily, resolve the log level, and initialize the shell components.

        :param code: Code to run instead of opening the interactive prompt (see `start`).
        :param log_level: Name of the logging level; unknown names fall back to DEBUG.
        """
        from IPython.terminal.embed import InteractiveShellEmbed as __InteractiveShellEmbed
        from scrapling.fetchers import (
            Fetcher as __Fetcher,
            AsyncFetcher as __AsyncFetcher,
            FetcherSession as __FetcherSession,
            DynamicFetcher as __DynamicFetcher,
            DynamicSession as __DynamicSession,
            AsyncDynamicSession as __AsyncDynamicSession,
            StealthyFetcher as __StealthyFetcher,
            StealthySession as __StealthySession,
            AsyncStealthySession as __AsyncStealthySession,
        )

        self.__InteractiveShellEmbed = __InteractiveShellEmbed
        self.__Fetcher = __Fetcher
        self.__AsyncFetcher = __AsyncFetcher
        self.__FetcherSession = __FetcherSession
        self.__DynamicFetcher = __DynamicFetcher
        self.__DynamicSession = __DynamicSession
        self.__AsyncDynamicSession = __AsyncDynamicSession
        self.__StealthyFetcher = __StealthyFetcher
        self.__StealthySession = __StealthySession
        self.__AsyncStealthySession = __AsyncStealthySession
        self.code = code
        self.page = None
        self.pages = Selectors([])
        self._curl_parser = CurlParser()

        requested_level = log_level.strip().lower()
        resolved_level = _known_logging_levels.get(requested_level)
        if not resolved_level:  # pragma: no cover
            log.warning(f'Unknown log level "{requested_level}", defaulting to "DEBUG"')
            resolved_level = DEBUG
        self.log_level = resolved_level

        self.shell = None

        # Initialize your application components
        self.init_components()

    def init_components(self):
        """Apply the log level to the `scrapling` logger and log the startup information."""
        if self.log_level:
            getLogger("scrapling").setLevel(self.log_level)

        settings = self.__Fetcher.display_config()
        # The storage entries are left out of the logged settings
        for hidden_key in ("storage", "storage_args"):
            settings.pop(hidden_key, None)

        log.info(f"Scrapling {__version__} shell started")
        log.info(f"Logging level is set to '{getLevelName(self.log_level)}'")
        log.info(f"Fetchers' parsing settings: {settings}")

    @staticmethod
    def banner():
        """Return the help text shown when the shell starts and when `help()` is called."""
        return f"""
-> Available Scrapling objects:
   - Fetcher/AsyncFetcher/FetcherSession
   - DynamicFetcher/DynamicSession/AsyncDynamicSession
   - StealthyFetcher/StealthySession/AsyncStealthySession
   - Selector

-> Useful shortcuts:
   - {"get":<30} Shortcut for `Fetcher.get`
   - {"post":<30} Shortcut for `Fetcher.post`
   - {"put":<30} Shortcut for `Fetcher.put`
   - {"delete":<30} Shortcut for `Fetcher.delete`
   - {"fetch":<30} Shortcut for `DynamicFetcher.fetch`
   - {"stealthy_fetch":<30} Shortcut for `StealthyFetcher.fetch`

-> Useful commands
   - {"page / response":<30} The response object of the last page you fetched
   - {"pages":<30} Selectors object of the last 5 response objects you fetched
   - {"uncurl('curl_command')":<30} Convert curl command to a Request object. (Optimized to handle curl commands copied from DevTools network tab.)
   - {"curl2fetcher('curl_command')":<30} Convert curl command and make the request with Fetcher. (Optimized to handle curl commands copied from DevTools network tab.)
   - {"view(page)":<30} View page in a browser
   - {"help()":<30} Show this help message (Shell help)

Type 'exit' or press Ctrl+D to exit.
        """

    def update_page(self, result):  # pragma: no cover
        """Remember `result` as the current page and keep a history of the last five pages.

        :param result: The value returned by a wrapped fetch function.
        :return: `result`, unchanged.
        """
        self.page = result
        if isinstance(result, (Response, Selector)):
            history = self.pages
            history.append(result)
            if len(history) > 5:
                history.pop(0)  # Remove the oldest item

            # Update in IPython namespace too
            if self.shell:
                namespace = self.shell.user_ns
                namespace["page"] = self.page
                namespace["response"] = self.page
                namespace["pages"] = self.pages

        return result

    def create_wrapper(
        self, func: Callable, get_signature: bool = True, signature_name: Optional[str] = None
    ) -> Callable:
        """Wrap `func` so each call also updates the current page, while exposing a useful signature.

        :param func: The callable to wrap.
        :param get_signature: When True the signature is expanded through `_unpack_signature`,
            otherwise the plain signature of `func` is used.
        :param signature_name: Lookup name forwarded to `_unpack_signature`.
        :return: The wrapping callable.
        """

        @wraps(func)
        def wrapper(*args: Any, **kwargs: Any) -> Any:
            return self.update_page(func(*args, **kwargs))

        # Set the signature explicitly for IPython introspection and autocompletion
        exposed = _unpack_signature(func, signature_name) if get_signature else signature(func)
        wrapper.__signature__ = exposed
        return wrapper

    def get_namespace(self):
        """Build the dictionary of names that are available inside the shell."""
        wrap = self.create_wrapper

        # Wrapped versions of the fetch functions keep `page`/`pages` up to date
        return {
            "get": wrap(self.__Fetcher.get),
            "post": wrap(self.__Fetcher.post),
            "put": wrap(self.__Fetcher.put),
            "delete": wrap(self.__Fetcher.delete),
            "Fetcher": self.__Fetcher,
            "AsyncFetcher": self.__AsyncFetcher,
            "FetcherSession": self.__FetcherSession,
            "DynamicSession": self.__DynamicSession,
            "AsyncDynamicSession": self.__AsyncDynamicSession,
            "StealthySession": self.__StealthySession,
            "AsyncStealthySession": self.__AsyncStealthySession,
            "fetch": wrap(self.__DynamicFetcher.fetch),
            "DynamicFetcher": self.__DynamicFetcher,
            "stealthy_fetch": wrap(self.__StealthyFetcher.fetch, signature_name="stealthy_fetch"),
            "StealthyFetcher": self.__StealthyFetcher,
            "Selector": Selector,
            "page": self.page,
            "response": self.page,
            "pages": self.pages,
            "view": show_page_in_browser,
            "uncurl": self._curl_parser.parse,
            "curl2fetcher": wrap(self._curl_parser.convert2fetcher, get_signature=False),
            "help": self.show_help,
        }

    def show_help(self):  # pragma: no cover
        """Print the banner."""
        print(self.banner())

    def start(self):  # pragma: no cover
        """Create the embedded IPython shell, then either run the provided code or open the prompt."""
        embedded = self.__InteractiveShellEmbed(
            banner1=self.banner(),
            banner2="",
            enable_tip=False,
            exit_msg="Bye Bye",
            user_ns=self.get_namespace(),
        )
        self.shell = embedded

        # Without provided code, go straight to the interactive prompt
        if not self.code:
            embedded()
            return

        # If a command was provided, execute it and exit
        log.info(f"Executing provided code: {self.code}")
        try:
            embedded.run_cell(self.code, store_history=False)
        except Exception as e:
            log.error(f"Error executing initial code: {e}")
557
+
558
+
559
+ class Convertor:
560
+ """Utils for the extract shell command"""
561
+
562
+ _extension_map: Dict[str, extraction_types] = {
563
+ "md": "markdown",
564
+ "html": "html",
565
+ "txt": "text",
566
+ }
567
+
568
+ @classmethod
569
+ def _convert_to_markdown(cls, body: TextHandler) -> str:
570
+ """Convert HTML content to Markdown"""
571
+ from markdownify import markdownify
572
+
573
+ return markdownify(body)
574
+
575
+ @classmethod
576
+ def _strip_noise_tags(cls, page: Selector) -> Selector:
577
+ """Return a copy of the Selector with noise tags removed."""
578
+ clean_root = deepcopy(page._root)
579
+ for element in clean_root.iter(*{"script", "style", "noscript", "svg"}):
580
+ element.drop_tree()
581
+ return Selector(root=clean_root, url=page.url)
582
+
583
+ @classmethod
584
+ def _extract_content(
585
+ cls,
586
+ page: Selector,
587
+ extraction_type: extraction_types = "markdown",
588
+ css_selector: Optional[str] = None,
589
+ main_content_only: bool = False,
590
+ ) -> Generator[str, None, None]:
591
+ """Extract the content of a Selector"""
592
+ if not page or not isinstance(page, Selector): # pragma: no cover
593
+ raise TypeError("Input must be of type `Selector`")
594
+ elif not extraction_type or extraction_type not in cls._extension_map.values():
595
+ raise ValueError(f"Unknown extraction type: {extraction_type}")
596
+ else:
597
+ if main_content_only:
598
+ page = cast(Selector, page.css("body").first) or page
599
+ page = cls._strip_noise_tags(page)
600
+
601
+ pages = [page] if not css_selector else cast(Selectors, page.css(css_selector))
602
+ for page in pages:
603
+ match extraction_type:
604
+ case "markdown":
605
+ yield cls._convert_to_markdown(page.html_content)
606
+ case "html":
607
+ yield page.html_content
608
+ case "text":
609
+ txt_content = page.get_all_text(
610
+ strip=True, ignore_tags=("script", "style", "noscript", "svg", "iframe")
611
+ )
612
+ for s in (
613
+ "\n",
614
+ "\r",
615
+ "\t",
616
+ " ",
617
+ ):
618
+ # Remove consecutive white-spaces
619
+ txt_content = TextHandler(re_sub(f"[{s}]+", s, txt_content))
620
+ yield txt_content
621
+ yield ""
622
+
623
+ @classmethod
624
+ def write_content_to_file(cls, page: Selector, filename: str, css_selector: Optional[str] = None) -> None:
625
+ """Write a Selector's content to a file"""
626
+ if not page or not isinstance(page, Selector): # pragma: no cover
627
+ raise TypeError("Input must be of type `Selector`")
628
+ elif not filename or not isinstance(filename, str) or not filename.strip():
629
+ raise ValueError("Filename must be provided")
630
+ elif not filename.endswith((".md", ".html", ".txt")):
631
+ raise ValueError("Unknown file type: filename must end with '.md', '.html', or '.txt'")
632
+ else:
633
+ with open(filename, "w", encoding=page.encoding) as f:
634
+ extension = filename.split(".")[-1]
635
+ f.write(
636
+ "".join(
637
+ cls._extract_content(
638
+ page,
639
+ cls._extension_map[extension],
640
+ css_selector=css_selector,
641
+ )
642
+ )
643
+ )
core/storage.py ADDED
@@ -0,0 +1,156 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from hashlib import sha256
2
+ from threading import RLock
3
+ from functools import lru_cache
4
+ from abc import ABC, abstractmethod
5
+ from sqlite3 import connect as db_connect
6
+
7
+ from orjson import dumps, loads
8
+ from lxml.html import HtmlElement
9
+
10
+ from scrapling.core.utils import _StorageTools, log
11
+ from scrapling.core._types import Dict, Optional, Any, cast
12
+
13
+
14
class StorageSystemMixin(ABC):  # pragma: no cover
    """Base class for storage systems.

    Subclasses have to implement `save` and `retrieve`.
    """

    # If you want to make your own storage system, you have to inherit from this
    def __init__(self, url: Optional[str] = None):
        """
        :param url: URL of the website we are working on to separate it from other websites data
        """
        # Make the url in lowercase to handle this edge case until it's updated: https://github.com/barseghyanartur/tld/issues/124
        self.url = url.lower() if (url and isinstance(url, str)) else None

    # NOTE(review): `lru_cache` on an instance method keys on `self`, so cached instances stay referenced
    # by the cache until they are evicted.
    @lru_cache(64, typed=True)
    def _get_base_url(self, default_value: str = "default") -> str:
        """Return the first-level domain (or the domain) of `self.url`.

        :param default_value: Returned when there is no URL or when nothing could be extracted from it.
        """
        if not self.url:
            return default_value

        try:
            from tld import get_tld, Result

            # Fixing the inaccurate return type hint in `get_tld`
            extracted: Result | None = cast(
                Result, get_tld(self.url, as_object=True, fail_silently=True, fix_protocol=True)
            )
            if not extracted:
                return default_value
            return extracted.fld or extracted.domain or default_value
        except AttributeError:
            return default_value

    @abstractmethod
    def save(self, element: HtmlElement, identifier: str) -> None:
        """Saves the element's unique properties to the storage for retrieval and relocation later

        :param element: The element itself which we want to save to storage.
        :param identifier: This is the identifier that will be used to retrieve the element later from the storage. See
            the docs for more info.
        """
        raise NotImplementedError("Storage system must implement `save` method")

    @abstractmethod
    def retrieve(self, identifier: str) -> Optional[Dict]:
        """Using the identifier, we search the storage and return the unique properties of the element

        :param identifier: This is the identifier that will be used to retrieve the element from the storage. See
            the docs for more info.
        :return: A dictionary of the unique properties
        """
        # The message used to name `save` here (copy-paste mistake)
        raise NotImplementedError("Storage system must implement `retrieve` method")

    @staticmethod
    @lru_cache(128, typed=True)
    def _get_hash(identifier: str) -> str:
        """Helper for storage systems that want to hash identifiers.

        The identifier is lowercased and stripped before hashing.

        :return: The SHA-256 hex digest followed by `_` and the byte length of the normalized identifier.
        """
        _identifier = identifier.lower().strip()
        # Hash functions have to take bytes
        _identifier_bytes = _identifier.encode("utf-8")

        hash_value = sha256(_identifier_bytes).hexdigest()
        return f"{hash_value}_{len(_identifier_bytes)}"  # Length to reduce collision chance
71
+
72
+
73
@lru_cache(1, typed=True)
class SQLiteStorageSystem(StorageSystemMixin):
    """The recommended system to use, it's race condition safe and thread safe.
    Mainly built, so the library can run in threaded frameworks like scrapy or threaded tools
    > It's optimized for threaded applications, but running it without threads shouldn't make it slow."""

    def __init__(self, storage_file: str, url: Optional[str] = None):
        """
        :param storage_file: File to be used to store elements' data.
        :param url: URL of the website we are working on to separate it from other websites data

        """
        super().__init__(url)
        self.storage_file = storage_file
        self.lock = RLock()  # Better than Lock for reentrancy
        # >SQLite default mode in the earlier version is 1 not 2 (1=thread-safe 2=serialized)
        # `check_same_thread=False` to allow it to be used across different threads.
        self.connection = db_connect(self.storage_file, check_same_thread=False)
        # WAL (Write-Ahead Logging) allows for better concurrency.
        self.connection.execute("PRAGMA journal_mode=WAL")
        self.cursor = self.connection.cursor()
        # From here on there is something for `close` to release
        self._closed = False
        self._setup_database()
        log.debug(f'Storage system loaded with arguments (storage_file="{storage_file}", url="{url}")')

    def _setup_database(self) -> None:
        """Create the `storage` table when it does not exist yet."""
        self.cursor.execute("""
            CREATE TABLE IF NOT EXISTS storage (
                id INTEGER PRIMARY KEY,
                url TEXT,
                identifier TEXT,
                element_data TEXT,
                UNIQUE (url, identifier)
            )
        """)
        self.connection.commit()

    def save(self, element: HtmlElement, identifier: str) -> None:
        """Saves the elements unique properties to the storage for retrieval and relocation later

        :param element: The element itself which we want to save to storage.
        :param identifier: This is the identifier that will be used to retrieve the element later from the storage. See
            the docs for more info.
        """
        url = self._get_base_url()
        element_data = _StorageTools.element_to_dict(element)
        with self.lock:
            # `INSERT OR REPLACE` together with UNIQUE (url, identifier) keeps one row per pair
            self.cursor.execute(
                """
                INSERT OR REPLACE INTO storage (url, identifier, element_data)
                VALUES (?, ?, ?)
                """,
                (url, identifier, dumps(element_data)),
            )
            self.connection.commit()

    def retrieve(self, identifier: str) -> Optional[Dict[str, Any]]:
        """Using the identifier, we search the storage and return the unique properties of the element

        :param identifier: This is the identifier that will be used to retrieve the element from the storage. See
            the docs for more info.
        :return: A dictionary of the unique properties
        """
        url = self._get_base_url()
        with self.lock:
            self.cursor.execute(
                "SELECT element_data FROM storage WHERE url = ? AND identifier = ?",
                (url, identifier),
            )
            result = self.cursor.fetchone()
            if result:
                return loads(result[0])
            return None

    def close(self):
        """Close all connections. It will be useful when with some things like scrapy Spider.closed() function/signal

        Calling it more than once is safe: later calls do nothing.
        """
        lock = getattr(self, "lock", None)
        if lock is None:
            # `__init__` failed before anything was opened
            return

        with lock:
            # A missing flag means the connection/cursor were never fully set up
            if getattr(self, "_closed", True):
                return
            self._closed = True
            try:
                self.connection.commit()
            finally:
                # Release the handles even when the final commit fails
                self.cursor.close()
                self.connection.close()

    def __del__(self):
        """To ensure all connections are closed when the object is destroyed."""
        self.close()
core/translator.py ADDED
@@ -0,0 +1,134 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ Most of this file is an adapted version of the parsel library's translator with some modifications simply for 1 important reason...
3
+
4
+ To add pseudo-elements ``::text`` and ``::attr(ATTR_NAME)`` so we match the Parsel/Scrapy selectors format which will be important in future releases but most importantly...
5
+
6
+ So you don't have to learn a new selectors/API method, like what bs4 did with soupsieve :)
7
+
8
+ If you want to learn about this, head to https://cssselect.readthedocs.io/en/latest/#cssselect.FunctionalPseudoElement
9
+ """
10
+
11
+ from functools import lru_cache
12
+
13
+ from cssselect import HTMLTranslator as OriginalHTMLTranslator
14
+ from cssselect.xpath import ExpressionError, XPathExpr as OriginalXPathExpr
15
+ from cssselect.parser import Element, FunctionalPseudoElement, PseudoElement
16
+
17
+ from scrapling.core._types import Any, Protocol, Self
18
+
19
+
20
class XPathExpr(OriginalXPathExpr):
    """XPath expression that can additionally select text nodes or an attribute value."""

    # When True, the rendered XPath selects text nodes
    textnode: bool = False
    # When set, the rendered XPath selects this attribute
    attribute: str | None = None

    @classmethod
    def from_xpath(
        cls,
        xpath: OriginalXPathExpr,
        textnode: bool = False,
        attribute: str | None = None,
    ) -> Self:
        """Build an instance from an existing expression.

        :param xpath: The expression whose path, element and condition are copied.
        :param textnode: Value for the `textnode` flag of the new expression.
        :param attribute: Value for the `attribute` of the new expression.
        """
        expression = cls(path=xpath.path, element=xpath.element, condition=xpath.condition)
        expression.textnode = textnode
        expression.attribute = attribute
        return expression

    def __str__(self) -> str:
        """Render the base expression and append the text/attribute selection when requested."""
        rendered = super().__str__()

        if self.textnode:
            if rendered == "*":  # pragma: no cover
                rendered = "text()"
            elif rendered.endswith("::*/*"):  # pragma: no cover
                rendered = f"{rendered[:-3]}text()"
            else:
                rendered = f"{rendered}/text()"

        if self.attribute is not None:
            if rendered.endswith("::*/*"):  # pragma: no cover
                rendered = rendered[:-2]
            rendered = f"{rendered}/@{self.attribute}"

        return rendered

    def join(
        self: Self,
        combiner: str,
        other: OriginalXPathExpr,
        *args: Any,
        **kwargs: Any,
    ) -> Self:
        """Join with `other` and take over its `textnode` and `attribute` values.

        :raises ValueError: When `other` is not an `XPathExpr` of this module (or a subclass of it).
        """
        if isinstance(other, XPathExpr):
            super().join(combiner, other, *args, **kwargs)
            self.textnode, self.attribute = other.textnode, other.attribute
            return self

        raise ValueError(  # pragma: no cover
            f"Expressions of type {__name__}.XPathExpr can ony join expressions"
            f" of the same type (or its descendants), got {type(other)}"
        )
69
+
70
+
71
+ # e.g. cssselect.GenericTranslator, cssselect.HTMLTranslator
72
class TranslatorProtocol(Protocol):
    """Structural type describing the translator methods this module relies on."""

    def xpath_element(self, selector: Element) -> OriginalXPathExpr:  # pyright: ignore  # pragma: no cover
        """Translate an element selector into an XPath expression."""
        ...

    def css_to_xpath(self, css: str, prefix: str = ...) -> str:  # pyright: ignore  # pragma: no cover
        """Translate a CSS query into an XPath query string."""
        ...
78
+
79
+
80
class TranslatorMixin:
    """Mixin that adds CSS pseudo-element support through dynamic dispatch.

    Currently supported pseudo-elements are ``::text`` and ``::attr(ATTR_NAME)``.
    """

    def xpath_element(self: TranslatorProtocol, selector: Element) -> XPathExpr:
        """Translate the element as the parent translator does, then convert the result to `XPathExpr`."""
        # https://github.com/python/mypy/issues/14757
        base_expression = super().xpath_element(selector)  # type: ignore[safe-super]
        return XPathExpr.from_xpath(base_expression)

    def xpath_pseudo_element(self, xpath: OriginalXPathExpr, pseudo_element: PseudoElement) -> OriginalXPathExpr:
        """
        Dispatch method that transforms XPath to support the pseudo-element.

        The handler is looked up by name on the translator: functional pseudo-elements use
        ``xpath_<name>_functional_pseudo_element`` and simple ones use ``xpath_<name>_simple_pseudo_element``.

        :raises ExpressionError: When no handler exists for the pseudo-element.
        """
        if isinstance(pseudo_element, FunctionalPseudoElement):
            handler_name = f"xpath_{pseudo_element.name.replace('-', '_')}_functional_pseudo_element"
            handler = getattr(self, handler_name, None)
            if not handler:  # pragma: no cover
                raise ExpressionError(f"The functional pseudo-element ::{pseudo_element.name}() is unknown")
            return handler(xpath, pseudo_element)

        handler_name = f"xpath_{pseudo_element.replace('-', '_')}_simple_pseudo_element"
        handler = getattr(self, handler_name, None)
        if not handler:  # pragma: no cover
            raise ExpressionError(f"The pseudo-element ::{pseudo_element} is unknown")
        return handler(xpath)

    @staticmethod
    def xpath_attr_functional_pseudo_element(xpath: OriginalXPathExpr, function: FunctionalPseudoElement) -> XPathExpr:
        """Support selecting attribute values using ::attr() pseudo-element"""
        accepted_argument_types = (["STRING"], ["IDENT"])
        if function.argument_types() not in accepted_argument_types:  # pragma: no cover
            raise ExpressionError(f"Expected a single string or ident for ::attr(), got {function.arguments!r}")

        attribute_name = function.arguments[0].value
        return XPathExpr.from_xpath(xpath, attribute=attribute_name)

    @staticmethod
    def xpath_text_simple_pseudo_element(xpath: OriginalXPathExpr) -> XPathExpr:
        """Support selecting text nodes using ::text pseudo-element"""
        return XPathExpr.from_xpath(xpath, textnode=True)
120
+
121
+
122
class HTMLTranslator(TranslatorMixin, OriginalHTMLTranslator):
    """HTML translator extended with the pseudo-element support of `TranslatorMixin`."""

    def css_to_xpath(self, css: str, prefix: str = "descendant-or-self::") -> str:
        """Translate a CSS query into XPath through the parent implementation."""
        xpath_query = super().css_to_xpath(css, prefix)
        return xpath_query
125
+
126
+
127
# Single translator instance shared by every call of `css_to_xpath` below
translator = HTMLTranslator()
# Using a function instead of the translator directly to avoid Pyright override error


@lru_cache(maxsize=256)
def css_to_xpath(query: str) -> str:
    """Translate a CSS query into XPath with the shared translator; results are cached per query."""
    xpath_query = translator.css_to_xpath(query)
    return xpath_query
core/utils/__init__.py ADDED
@@ -0,0 +1,11 @@
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from ._utils import (
2
+ log,
3
+ set_logger,
4
+ reset_logger,
5
+ __CONSECUTIVE_SPACES_REGEX__,
6
+ flatten,
7
+ _is_iterable,
8
+ _StorageTools,
9
+ clean_spaces,
10
+ html_forbidden,
11
+ )
core/utils/_shell.py ADDED
@@ -0,0 +1,48 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from http import cookies as Cookie
2
+
3
+
4
+ from scrapling.core._types import (
5
+ List,
6
+ Dict,
7
+ Tuple,
8
+ )
9
+
10
+
11
def _CookieParser(cookie_string):
    """Yield ``(name, value)`` pairs parsed from a raw ``Cookie`` header value.

    Parsing errors are deliberately not handled here, so the caller can report them with its own context.

    :param cookie_string: The raw value of a ``Cookie`` header, e.g. ``"a=1; b=2"``.
    """
    cookie_parser = Cookie.SimpleCookie()
    cookie_parser.load(cookie_string)
    for key, morsel in cookie_parser.items():
        yield key, morsel.value


def _ParseHeaders(header_lines: List[str], parse_cookies: bool = True) -> Tuple[Dict[str, str], Dict[str, str]]:
    """Parse raw header lines into separate header and cookie dictionaries.

    A line without a colon is accepted only when it ends with ``;``; it is then stored as a header with an
    empty value. When ``parse_cookies`` is true, every ``Cookie`` header (case-insensitive) is parsed into the
    cookie dictionary and left out of the header dictionary. Cookies from several ``Cookie`` headers are merged,
    with later values overriding earlier ones of the same name.

    :param header_lines: Lines shaped like ``"Name: value"`` or ``"Name;"``.
    :param parse_cookies: Whether ``Cookie`` headers are split into the cookie dictionary.
    :return: A ``(headers, cookies)`` tuple.
    :raises ValueError: If a line has neither a colon nor a trailing ``;``, or a cookie string cannot be parsed.
    """
    header_dict: Dict[str, str] = {}
    cookie_dict: Dict[str, str] = {}

    for header_line in header_lines:
        if ":" not in header_line:
            if not header_line.endswith(";"):
                raise ValueError(f"Could not parse header without colon: '{header_line}'.")
            # "Name;" denotes a header that is sent with an empty value
            header_dict[header_line[:-1].strip()] = ""
            continue

        header_key, header_value = (part.strip() for part in header_line.split(":", 1))

        if parse_cookies and header_key.lower() == "cookie":
            try:
                # Merge instead of rebinding so an earlier Cookie header is not discarded by a later one
                cookie_dict.update(_CookieParser(header_value))
            except Exception as e:  # pragma: no cover
                raise ValueError(f"Could not parse cookie string from header '{header_value}': {e}") from e
        else:
            header_dict[header_key] = header_value

    return header_dict, cookie_dict
core/utils/_utils.py ADDED
@@ -0,0 +1,120 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import logging
2
+ from itertools import chain
3
+ from re import compile as re_compile
4
+ from contextvars import ContextVar, Token
5
+
6
+ from lxml import html
7
+
8
+ from scrapling.core._types import Any, Dict, Iterable, List
9
+
10
+ # Using cache on top of a class is a brilliant way to achieve a Singleton design pattern without much code
11
+ from functools import lru_cache # isort:skip
12
+
13
# Node types that are ignored when listing an element's children (HTML comments)
html_forbidden = (html.HtmlComment,)

# Translation table used by `clean_spaces`: tabs become spaces; newlines and carriage returns are removed
__CLEANING_TABLE__ = str.maketrans({"\t": " ", "\n": None, "\r": None})
# Matches a run of one or more space characters
__CONSECUTIVE_SPACES_REGEX__ = re_compile(r" +")
17
+
18
+
19
@lru_cache(1, typed=True)
def setup_logger():
    """Build the library logger with the standard console format.

    The function is cached, so the configuration runs once and every later call returns the same object.

    :returns: logging.Logger: The configured ``"scrapling"`` logger
    """
    stream_handler = logging.StreamHandler()
    stream_handler.setFormatter(
        logging.Formatter(fmt="[%(asctime)s] %(levelname)s: %(message)s", datefmt="%Y-%m-%d %H:%M:%S")
    )

    scrapling_logger = logging.getLogger("scrapling")
    scrapling_logger.setLevel(logging.INFO)

    # Attach the console handler only when the logger has no handlers yet
    if len(scrapling_logger.handlers) == 0:
        scrapling_logger.addHandler(stream_handler)

    return scrapling_logger
38
+
39
+
40
# Context-local logger; it defaults to the logger returned by `setup_logger()`
_current_logger: ContextVar[logging.Logger] = ContextVar("scrapling_logger", default=setup_logger())


class LoggerProxy:
    """Proxy that forwards attribute lookups to the logger currently stored in ``_current_logger``."""

    def __getattr__(self, name: str):
        # Looked up on every access, so a logger installed through `set_logger` is used right away
        return getattr(_current_logger.get(), name)


# Shared proxy instance used as the logging entry point
log = LoggerProxy()
49
+
50
+
51
def set_logger(logger: logging.Logger) -> Token:
    """Set the logger for the current context.

    :param logger: The logger to store in ``_current_logger``.
    :return: The token produced by ``ContextVar.set``; pass it to ``reset_logger`` to restore the previous logger.
    """
    return _current_logger.set(logger)
54
+
55
+
56
def reset_logger(token: Token) -> None:
    """Restore the logger that was active before the matching ``set_logger`` call.

    :param token: The token returned by ``set_logger``.
    """
    _current_logger.reset(token)
59
+
60
+
61
def flatten(lst: Iterable[Any]) -> List[Any]:
    """Flatten one nesting level of ``lst`` into a new list, keeping the order of the items."""
    return [item for inner in lst for item in inner]
63
+
64
+
65
def _is_iterable(obj: Any) -> bool:
    """Tell whether ``obj`` is a list or a tuple.

    Used by the regex helpers to accept a collection while rejecting strings and bytes.
    """
    return isinstance(obj, list) or isinstance(obj, tuple)
74
+
75
+
76
class _StorageTools:
    """Helpers that turn an lxml element into a plain dictionary description."""

    @staticmethod
    def __clean_attributes(element: html.HtmlElement, forbidden: tuple = ()) -> Dict:
        """Return the element's attributes with values stripped, dropping blank values and forbidden names."""
        if not element.attrib:
            return {}

        cleaned = {}
        for name, value in element.attrib.items():
            if not value or name in forbidden:
                continue
            stripped = value.strip()
            if stripped:
                cleaned[name] = stripped
        return cleaned

    @classmethod
    def element_to_dict(cls, element: html.HtmlElement) -> Dict:
        """Describe ``element`` as a dictionary.

        The result always has ``tag``, ``attributes``, ``text`` and ``path``. When the element has a parent,
        ``parent_name``, ``parent_attribs`` and ``parent_text`` are added, plus ``siblings`` if there are any.
        ``children`` is added when the element has children that are not of a ``html_forbidden`` type.
        """
        parent = element.getparent()
        own_text = element.text.strip() if element.text else None

        result = {
            "tag": str(element.tag),
            "attributes": cls.__clean_attributes(element),
            "text": own_text,
            "path": cls._get_element_path(element),
        }

        if parent is not None:
            result["parent_name"] = parent.tag
            result["parent_attribs"] = dict(parent.attrib)
            result["parent_text"] = parent.text.strip() if parent.text else None

            sibling_tags = tuple(child.tag for child in parent.iterchildren() if child != element)
            if sibling_tags:
                result["siblings"] = sibling_tags

        child_tags = tuple(child.tag for child in element.iterchildren() if not isinstance(child, html_forbidden))
        if child_tags:
            result["children"] = child_tags

        return result

    @classmethod
    def _get_element_path(cls, element: html.HtmlElement):
        """Return the tags from the topmost ancestor down to ``element`` as a tuple."""
        tags = [element.tag]
        ancestor = element.getparent()
        while ancestor is not None:
            tags.append(ancestor.tag)
            ancestor = ancestor.getparent()
        tags.reverse()
        return tuple(tags)
115
+
116
+
117
@lru_cache(128, typed=True)
def clean_spaces(string):
    """Normalize whitespace in ``string``.

    The string is first translated with ``__CLEANING_TABLE__``, then every run of spaces is collapsed
    into a single space. Results are memoized by ``lru_cache``.
    """
    translated = string.translate(__CLEANING_TABLE__)
    collapsed = __CONSECUTIVE_SPACES_REGEX__.sub(" ", translated)
    return collapsed
engines/__init__.py ADDED
File without changes
engines/_browsers/__init__.py ADDED
File without changes
engines/_browsers/_base.py ADDED
@@ -0,0 +1,534 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from time import time
2
+ from asyncio import sleep as asyncio_sleep, Lock
3
+ from contextlib import contextmanager, asynccontextmanager
4
+
5
+ from playwright.sync_api._generated import Page
6
+ from playwright.sync_api import (
7
+ Frame,
8
+ BrowserContext,
9
+ Response as SyncPlaywrightResponse,
10
+ )
11
+ from playwright.async_api._generated import Page as AsyncPage
12
+ from playwright.async_api import (
13
+ Frame as AsyncFrame,
14
+ Response as AsyncPlaywrightResponse,
15
+ BrowserContext as AsyncBrowserContext,
16
+ )
17
+ from playwright._impl._errors import Error as PlaywrightError
18
+
19
+ from scrapling.parser import Selector
20
+ from scrapling.engines._browsers._page import PageInfo, PagePool
21
+ from scrapling.engines._browsers._validators import validate, PlaywrightConfig, StealthConfig
22
+ from scrapling.engines._browsers._config_tools import __default_chrome_useragent__, __default_useragent__
23
+ from scrapling.engines.toolbelt.navigation import (
24
+ construct_proxy_dict,
25
+ create_intercept_handler,
26
+ create_async_intercept_handler,
27
+ )
28
+ from scrapling.core._types import (
29
+ Any,
30
+ Dict,
31
+ List,
32
+ Set,
33
+ Optional,
34
+ Callable,
35
+ TYPE_CHECKING,
36
+ cast,
37
+ overload,
38
+ Tuple,
39
+ ProxyType,
40
+ Generator,
41
+ AsyncGenerator,
42
+ )
43
+ from scrapling.engines.constants import STEALTH_ARGS, HARMFUL_ARGS, DEFAULT_ARGS
44
+
45
+
46
class SyncSession:
    """Synchronous base session holding the Playwright handles and the page pool.

    ``start`` is a no-op here and ``_build_context_with_proxy`` raises ``NotImplementedError``; concrete
    sessions are expected to supply both.
    """

    _config: "PlaywrightConfig | StealthConfig"
    _context_options: Dict[str, Any]

    def _build_context_with_proxy(self, proxy: Optional[ProxyType] = None) -> Dict[str, Any]:
        """Return the context options to use with ``proxy``. Must be provided by the concrete session."""
        # NOTE(review): for a class declared as `class X(SyncSession, SomeMixin)` this stub precedes the
        # mixin's implementation in the MRO - confirm the concrete sessions override it.
        raise NotImplementedError  # pragma: no cover

    def __init__(self, max_pages: int = 1):
        """
        :param max_pages: The capacity passed to this session's ``PagePool``.
        """
        self.max_pages = max_pages
        self.page_pool = PagePool(max_pages)
        self._max_wait_for_page = 60
        self.playwright: Any = None
        self.context: Any = None
        self.browser: Any = None
        self._is_alive = False

    def start(self) -> None:
        """Start the session. No-op in the base class."""
        pass

    def close(self):  # pragma: no cover
        """Close all resources"""
        if not self._is_alive:
            return

        if self.context:
            self.context.close()
            self.context = None

        if self.browser:
            self.browser.close()
            self.browser = None

        if self.playwright:
            self.playwright.stop()
            self.playwright = None  # pyright: ignore

        self._is_alive = False

    def __enter__(self):
        self.start()
        return self

    def __exit__(self, exc_type, exc_val, exc_tb):
        self.close()

    def _initialize_context(self, config: PlaywrightConfig | StealthConfig, ctx: BrowserContext) -> BrowserContext:
        """Initialize the browser context.

        Adds the configured init script and cookies to ``ctx`` (each only when set) and returns ``ctx``.
        """
        if config.init_script:
            ctx.add_init_script(path=config.init_script)

        if config.cookies:  # pragma: no cover
            ctx.add_cookies(config.cookies)

        return ctx

    def _get_page(
        self,
        timeout: int | float,
        extra_headers: Optional[Dict[str, str]],
        disable_resources: bool,
        blocked_domains: Optional[Set[str]] = None,
        context: Optional[BrowserContext] = None,
    ) -> PageInfo[Page]:  # pragma: no cover
        """Get a new page to use

        The page is opened in ``context`` when given, otherwise in the session's own context. Timeouts,
        extra headers and request interception are applied before the page is added to the pool and marked busy.
        """
        # Sync code does not wait for a free slot here: the previous call blocks until its page is closed.
        ctx = context if context is not None else self.context
        assert ctx is not None, "Browser context not initialized"
        page = ctx.new_page()
        page.set_default_navigation_timeout(timeout)
        page.set_default_timeout(timeout)
        if extra_headers:
            page.set_extra_http_headers(extra_headers)

        if disable_resources or blocked_domains:
            page.route("**/*", create_intercept_handler(disable_resources, blocked_domains))
        page_info = self.page_pool.add_page(page)
        page_info.mark_busy()
        return page_info

    def get_pool_stats(self) -> Dict[str, int]:
        """Get statistics about the current page pool"""
        return {
            "total_pages": self.page_pool.pages_count,
            "busy_pages": self.page_pool.busy_count,
            "max_pages": self.max_pages,
        }

    @staticmethod
    def _wait_for_networkidle(page: Page | Frame, timeout: Optional[int] = None):
        """Wait for the page to become idle (no network activity) even if there are never-ending requests."""
        try:
            page.wait_for_load_state("networkidle", timeout=timeout)
        except Exception:
            # Best effort by design: a page that never goes idle must not fail the fetch.
            pass

    def _wait_for_page_stability(self, page: Page | Frame, load_dom: bool, network_idle: bool):
        """Wait for the ``load`` state, then optionally for ``domcontentloaded`` and network idle."""
        page.wait_for_load_state(state="load")
        if load_dom:
            page.wait_for_load_state(state="domcontentloaded")
        if network_idle:
            self._wait_for_networkidle(page)

    @staticmethod
    def _create_response_handler(page_info: PageInfo[Page], response_container: List) -> Callable:
        """Create a response handler that captures the final navigation response.

        :param page_info: The PageInfo object containing the page
        :param response_container: A list to store the final response (mutable container)
        :return: A callback function for page.on("response", ...)
        """

        def handle_response(finished_response: SyncPlaywrightResponse):
            if (
                finished_response.request.resource_type == "document"
                and finished_response.request.is_navigation_request()
                and finished_response.request.frame == page_info.page.main_frame
            ):
                response_container[0] = finished_response

        return handle_response

    @contextmanager
    def _page_generator(
        self,
        timeout: int | float,
        extra_headers: Optional[Dict[str, str]],
        disable_resources: bool,
        proxy: Optional[ProxyType] = None,
        blocked_domains: Optional[Set[str]] = None,
    ) -> Generator["PageInfo[Page]", None, None]:
        """Acquire a page - either from persistent context or fresh context with proxy.

        On exit the page's pool entry is always removed, even when closing the page or the context fails.

        :raises RuntimeError: If ``proxy`` is given but no browser is available.
        """
        if proxy:
            # Rotation mode: create fresh context with the provided proxy
            if not self.browser:  # pragma: no cover
                raise RuntimeError("Browser not initialized for proxy rotation mode")
            context_options = self._build_context_with_proxy(proxy)
            context: BrowserContext = self.browser.new_context(**context_options)

            page_info = None
            try:
                context = self._initialize_context(self._config, context)
                page_info = self._get_page(timeout, extra_headers, disable_resources, blocked_domains, context=context)
                yield page_info
            finally:
                try:
                    context.close()
                finally:
                    # `_get_page` registered the page in the pool; drop the entry so it does not go stale
                    if page_info is not None:
                        try:
                            self.page_pool.pages.remove(page_info)
                        except (ValueError, KeyError):
                            pass
        else:
            # Standard mode: use PagePool with persistent context
            page_info = self._get_page(timeout, extra_headers, disable_resources, blocked_domains)
            try:
                yield page_info
            finally:
                try:
                    page_info.page.close()
                finally:
                    # Runs even when close() raises, so the pool never keeps a stale entry
                    self.page_pool.pages.remove(page_info)
198
+
199
+
200
class AsyncSession:
    """Asynchronous base session holding the Playwright handles and the page pool.

    ``start`` is a no-op here and ``_build_context_with_proxy`` raises ``NotImplementedError``; concrete
    sessions are expected to supply both.
    """

    _config: "PlaywrightConfig | StealthConfig"
    _context_options: Dict[str, Any]

    def _build_context_with_proxy(self, proxy: Optional[ProxyType] = None) -> Dict[str, Any]:
        """Return the context options to use with ``proxy``. Must be provided by the concrete session."""
        # NOTE(review): for a class declared as `class X(AsyncSession, SomeMixin)` this stub precedes the
        # mixin's implementation in the MRO - confirm the concrete sessions override it.
        raise NotImplementedError  # pragma: no cover

    def __init__(self, max_pages: int = 1):
        """
        :param max_pages: The capacity passed to this session's ``PagePool``.
        """
        self.max_pages = max_pages
        self.page_pool = PagePool(max_pages)
        self._max_wait_for_page = 60
        self.playwright: Any = None
        self.context: Any = None
        self.browser: Any = None
        self._is_alive = False
        self._lock = Lock()

    async def start(self) -> None:
        """Start the session. No-op in the base class."""
        pass

    async def close(self):
        """Close all resources"""
        if not self._is_alive:  # pragma: no cover
            return

        if self.context:
            await self.context.close()
            self.context = None  # pyright: ignore

        if self.browser:
            await self.browser.close()
            self.browser = None

        if self.playwright:
            await self.playwright.stop()
            self.playwright = None  # pyright: ignore

        self._is_alive = False

    async def __aenter__(self):
        await self.start()
        return self

    async def __aexit__(self, exc_type, exc_val, exc_tb):
        await self.close()

    async def _initialize_context(
        self, config: PlaywrightConfig | StealthConfig, ctx: AsyncBrowserContext
    ) -> AsyncBrowserContext:
        """Initialize the browser context.

        Adds the configured init script and cookies to ``ctx`` (each only when set) and returns ``ctx``.
        """
        if config.init_script:  # pragma: no cover
            await ctx.add_init_script(path=config.init_script)

        if config.cookies:  # pragma: no cover
            await ctx.add_cookies(config.cookies)

        return ctx

    async def _get_page(
        self,
        timeout: int | float,
        extra_headers: Optional[Dict[str, str]],
        disable_resources: bool,
        blocked_domains: Optional[Set[str]] = None,
        context: Optional[AsyncBrowserContext] = None,
    ) -> PageInfo[AsyncPage]:  # pragma: no cover
        """Get a new page to use

        The page is opened in ``context`` when given, otherwise in the session's own context. When the
        session's own context is used and the pool is full, this waits up to ``_max_wait_for_page`` seconds
        for a slot to free up.

        :raises TimeoutError: If no slot frees up within the waiting period.
        """
        ctx = context if context is not None else self.context
        if TYPE_CHECKING:
            assert ctx is not None, "Browser context not initialized"

        async with self._lock:
            # If we're at max capacity after cleanup, wait for busy pages to finish
            if context is None and self.page_pool.pages_count >= self.max_pages:
                # Only applies when using persistent context
                start_time = time()
                while time() - start_time < self._max_wait_for_page:
                    await asyncio_sleep(0.05)
                    if self.page_pool.pages_count < self.max_pages:
                        break
                else:
                    # while-else: reached only when the loop ran out of time without a `break`
                    raise TimeoutError(
                        f"No pages finished to clear place in the pool within the {self._max_wait_for_page}s timeout period"
                    )

            page = await ctx.new_page()
            page.set_default_navigation_timeout(timeout)
            page.set_default_timeout(timeout)
            if extra_headers:
                await page.set_extra_http_headers(extra_headers)

            if disable_resources or blocked_domains:
                await page.route("**/*", create_async_intercept_handler(disable_resources, blocked_domains))

            return self.page_pool.add_page(page)

    def get_pool_stats(self) -> Dict[str, int]:
        """Get statistics about the current page pool"""
        return {
            "total_pages": self.page_pool.pages_count,
            "busy_pages": self.page_pool.busy_count,
            "max_pages": self.max_pages,
        }

    @staticmethod
    async def _wait_for_networkidle(page: AsyncPage | AsyncFrame, timeout: Optional[int] = None):
        """Wait for the page to become idle (no network activity) even if there are never-ending requests."""
        try:
            await page.wait_for_load_state("networkidle", timeout=timeout)
        except Exception:
            # Best effort by design: a page that never goes idle must not fail the fetch.
            pass

    async def _wait_for_page_stability(self, page: AsyncPage | AsyncFrame, load_dom: bool, network_idle: bool):
        """Wait for the ``load`` state, then optionally for ``domcontentloaded`` and network idle."""
        await page.wait_for_load_state(state="load")
        if load_dom:
            await page.wait_for_load_state(state="domcontentloaded")
        if network_idle:
            await self._wait_for_networkidle(page)

    @staticmethod
    def _create_response_handler(page_info: PageInfo[AsyncPage], response_container: List) -> Callable:
        """Create an async response handler that captures the final navigation response.

        :param page_info: The PageInfo object containing the page
        :param response_container: A list to store the final response (mutable container)
        :return: A callback function for page.on("response", ...)
        """

        async def handle_response(finished_response: AsyncPlaywrightResponse):
            if (
                finished_response.request.resource_type == "document"
                and finished_response.request.is_navigation_request()
                and finished_response.request.frame == page_info.page.main_frame
            ):
                response_container[0] = finished_response

        return handle_response

    @asynccontextmanager
    async def _page_generator(
        self,
        timeout: int | float,
        extra_headers: Optional[Dict[str, str]],
        disable_resources: bool,
        proxy: Optional[ProxyType] = None,
        blocked_domains: Optional[Set[str]] = None,
    ) -> AsyncGenerator["PageInfo[AsyncPage]", None]:
        """Acquire a page - either from persistent context or fresh context with proxy.

        On exit the page's pool entry is always removed, even when closing the page or the context fails.

        :raises RuntimeError: If ``proxy`` is given but no browser is available.
        """
        if proxy:
            # Rotation mode: create fresh context with the provided proxy
            if not self.browser:  # pragma: no cover
                raise RuntimeError("Browser not initialized for proxy rotation mode")
            context_options = self._build_context_with_proxy(proxy)
            context: AsyncBrowserContext = await self.browser.new_context(**context_options)

            page_info = None
            try:
                context = await self._initialize_context(self._config, context)
                page_info = await self._get_page(
                    timeout, extra_headers, disable_resources, blocked_domains, context=context
                )
                yield page_info
            finally:
                try:
                    await context.close()
                finally:
                    # `_get_page` registered the page in the pool; drop the entry so it does not go stale
                    if page_info is not None:
                        try:
                            self.page_pool.pages.remove(page_info)
                        except (ValueError, KeyError):
                            pass
        else:
            # Standard mode: use PagePool with persistent context
            page_info = await self._get_page(timeout, extra_headers, disable_resources, blocked_domains)
            try:
                yield page_info
            finally:
                try:
                    await page_info.page.close()
                finally:
                    # Runs even when close() raises, so the pool never keeps a stale entry
                    self.page_pool.pages.remove(page_info)
371
+
372
+
373
class BaseSessionMixin:
    """Shared logic that validates session parameters and derives Playwright browser/context options."""

    _config: "PlaywrightConfig | StealthConfig"

    @overload
    def __validate_routine__(self, params: Dict, model: type[StealthConfig]) -> StealthConfig: ...

    @overload
    def __validate_routine__(self, params: Dict, model: type[PlaywrightConfig]) -> PlaywrightConfig: ...

    def __validate_routine__(
        self, params: Dict, model: type[PlaywrightConfig] | type[StealthConfig]
    ) -> PlaywrightConfig | StealthConfig:
        """Reset the option dictionaries and validate ``params`` against ``model``.

        Besides returning the validated config, this sets ``_context_options``, ``_browser_options`` and
        ``_headers_keys`` (the lower-cased names of the configured extra headers).

        :param params: Raw session parameters; a ``"__max_pages"`` key is renamed to ``"max_pages"`` in place.
        :param model: The config model to validate against.
        :return: The validated config instance.
        """
        # Dark color scheme bypasses the 'prefersLightColor' check in creepjs
        self._context_options: Dict[str, Any] = {"color_scheme": "dark", "device_scale_factor": 2}
        self._browser_options: Dict[str, Any] = {
            "args": DEFAULT_ARGS,
            "ignore_default_args": HARMFUL_ARGS,
        }
        if "__max_pages" in params:
            params["max_pages"] = params.pop("__max_pages")

        config = validate(params, model=model)
        self._headers_keys = (
            {header.lower() for header in config.extra_headers.keys()} if config.extra_headers else set()
        )

        return config

    def __generate_options__(self, extra_flags: Tuple | None = None) -> None:
        """Fill ``_context_options`` and ``_browser_options`` from ``self._config``.

        :param extra_flags: Additional launch flags supplied by the calling mixin. They are merged with the
            current ``args`` and with ``config.extra_flags``; duplicates are dropped and first-seen order is kept.
        """
        config: PlaywrightConfig | StealthConfig = self._config
        self._context_options.update(
            {
                "proxy": config.proxy,
                "locale": config.locale,
                "timezone_id": config.timezone_id,
                "extra_http_headers": config.extra_headers,
            }
        )
        # The default useragent in the headful is always correct now in the current versions of Playwright
        if config.useragent:
            self._context_options["user_agent"] = config.useragent
        elif config.headless:
            self._context_options["user_agent"] = (
                __default_chrome_useragent__ if config.real_chrome else __default_useragent__
            )

        if not config.cdp_url:
            flags = self._browser_options["args"]
            # Keep BOTH sources of extra flags (mixin-provided and user-provided); normalizing to tuples
            # also avoids a tuple + list concatenation error.
            additional = tuple(extra_flags or ()) + tuple(config.extra_flags or ())
            if additional:
                # dict.fromkeys removes duplicates while preserving order
                flags = list(dict.fromkeys((*flags, *additional)))

            self._browser_options.update(
                {
                    "args": flags,
                    "headless": config.headless,
                    "channel": "chrome" if config.real_chrome else "chromium",
                }
            )

            self._user_data_dir = config.user_data_dir
        else:
            # Connecting over CDP: no browser is launched, so there are no launch options
            self._browser_options = {}

        if config.additional_args:
            # Applied last so these settings override the ones computed above
            self._context_options.update(config.additional_args)

    def _build_context_with_proxy(self, proxy: Optional[ProxyType] = None) -> Dict[str, Any]:
        """
        Build context options with a specific proxy for rotation mode.

        :param proxy: Proxy URL string or Playwright-style proxy dict to use for this context.
        :return: Dictionary of context options for browser.new_context().
        """

        context_options = self._context_options.copy()

        # Override proxy if provided
        if proxy:
            context_options["proxy"] = construct_proxy_dict(proxy)

        return context_options
454
+
455
+
456
class DynamicSessionMixin(BaseSessionMixin):
    """Session mixin that validates its parameters against ``PlaywrightConfig``."""

    def __validate__(self, **params):
        """Validate ``params``, store the result in ``self._config`` and generate the browser/context options."""
        self._config = self.__validate_routine__(params, model=PlaywrightConfig)
        self.__generate_options__()
460
+
461
+
462
class StealthySessionMixin(BaseSessionMixin):
    """Session mixin that validates against ``StealthConfig`` and adds stealth-oriented options."""

    def __validate__(self, **params):
        """Validate ``params``, apply the stealth context settings and generate the browser options."""
        self._config = self.__validate_routine__(params, model=StealthConfig)
        stealth_context_settings = {
            "is_mobile": False,
            "has_touch": False,
            # Service workers cause their share of headaches; they are left enabled for now
            "service_workers": "allow",
            "ignore_https_errors": True,
            "screen": {"width": 1920, "height": 1080},
            "viewport": {"width": 1920, "height": 1080},
            "permissions": ["geolocation", "notifications"],
        }
        self._context_options.update(stealth_context_settings)
        self.__generate_stealth_options()

    def __generate_stealth_options(self) -> None:
        """Assemble the stealth launch flags and pass them to ``__generate_options__``.

        No flags are assembled when the session connects through ``cdp_url``.
        """
        config = cast(StealthConfig, self._config)
        flags: Tuple[str, ...] = ()
        if not config.cdp_url:
            flags = DEFAULT_ARGS + STEALTH_ARGS
            flags += (
                (
                    "--webrtc-ip-handling-policy=disable_non_proxied_udp",
                    "--force-webrtc-ip-handling-policy",  # Ensures the policy is enforced
                )
                if config.block_webrtc
                else ()
            )
            flags += (
                ()
                if config.allow_webgl
                else (
                    "--disable-webgl",
                    "--disable-webgl-image-chromium",
                    "--disable-webgl2",
                )
            )
            flags += ("--fingerprinting-canvas-image-data-noise",) if config.hide_canvas else ()

        super().__generate_options__(flags)

    @staticmethod
    def _detect_cloudflare(page_content: str) -> str | None:
        """
        Detect the type of Cloudflare challenge present in the provided page content.

        The content is first searched for a ``cType: '<type>'`` marker of one of the challenge types
        ``non-interactive``, ``managed`` or ``interactive`` (checked in that order). If none is found, the
        page is parsed and searched for a Turnstile script tag.

        Args:
            page_content (str): The content of the page to analyze for Cloudflare
                challenge types.

        Returns:
            str | None: The matched challenge type, ``"embedded"`` when only a Turnstile script
                is found, or None if nothing matches.
        """
        known_types = ("non-interactive", "managed", "interactive")
        matched = next((ctype for ctype in known_types if f"cType: '{ctype}'" in page_content), None)
        if matched is not None:
            return matched

        # Check if turnstile captcha is embedded inside the page (Usually inside a closed Shadow iframe)
        turnstile_scripts = Selector(content=page_content).css('script[src*="challenges.cloudflare.com/turnstile/v"]')
        return "embedded" if turnstile_scripts else None
engines/_browsers/_config_tools.py ADDED
@@ -0,0 +1,4 @@
 
 
 
 
 
1
from scrapling.engines.toolbelt.fingerprints import generate_headers

# Default User-Agent strings, taken from generated headers once when this module is imported.
# `__default_useragent__` uses `browser_mode=True`; `__default_chrome_useragent__` uses `browser_mode="chrome"`.
__default_useragent__ = generate_headers(browser_mode=True).get("User-Agent")
__default_chrome_useragent__ = generate_headers(browser_mode="chrome").get("User-Agent")
engines/_browsers/_controllers.py ADDED
@@ -0,0 +1,362 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from time import sleep as time_sleep
2
+ from asyncio import sleep as asyncio_sleep
3
+
4
+ from playwright.sync_api import (
5
+ Locator,
6
+ sync_playwright,
7
+ )
8
+ from playwright.async_api import (
9
+ async_playwright,
10
+ Locator as AsyncLocator,
11
+ )
12
+
13
+ from scrapling.core.utils import log
14
+ from scrapling.core._types import Optional, ProxyType, Unpack
15
+ from scrapling.engines.toolbelt.proxy_rotation import is_proxy_error
16
+ from scrapling.engines.toolbelt.convertor import Response, ResponseFactory
17
+ from scrapling.engines.toolbelt.fingerprints import generate_convincing_referer
18
+ from scrapling.engines._browsers._types import PlaywrightSession, PlaywrightFetchParams
19
+ from scrapling.engines._browsers._base import SyncSession, AsyncSession, DynamicSessionMixin
20
+ from scrapling.engines._browsers._validators import validate_fetch as _validate, PlaywrightConfig
21
+
22
+
23
+ class DynamicSession(SyncSession, DynamicSessionMixin):
24
+ """A Browser session manager with page pooling."""
25
+
26
+ __slots__ = (
27
+ "_config",
28
+ "_context_options",
29
+ "_browser_options",
30
+ "_user_data_dir",
31
+ "_headers_keys",
32
+ "max_pages",
33
+ "page_pool",
34
+ "_max_wait_for_page",
35
+ "playwright",
36
+ "context",
37
+ )
38
+
39
+ def __init__(self, **kwargs: Unpack[PlaywrightSession]):
40
+ """A Browser session manager with page pooling, it's using a persistent browser Context by default with a temporary user profile directory.
41
+
42
+ :param headless: Run the browser in headless/hidden (default), or headful/visible mode.
43
+ :param disable_resources: Drop requests for unnecessary resources for a speed boost.
44
+ Requests dropped are of type `font`, `image`, `media`, `beacon`, `object`, `imageset`, `texttrack`, `websocket`, `csp_report`, and `stylesheet`.
45
+ :param blocked_domains: A set of domain names to block requests to. Subdomains are also matched (e.g., ``"example.com"`` blocks ``"sub.example.com"`` too).
46
+ :param useragent: Pass a useragent string to be used. Otherwise the fetcher will generate a real Useragent of the same browser and use it.
47
+ :param cookies: Set cookies for the next request.
48
+ :param network_idle: Wait for the page until there are no network connections for at least 500 ms.
49
+ :param timeout: The timeout in milliseconds that is used in all operations and waits through the page. The default is 30,000
50
+ :param wait: The time (milliseconds) the fetcher will wait after everything finishes before closing the page and returning the ` Response ` object.
51
+ :param page_action: Added for automation. A function that takes the `page` object and does the automation you need.
52
+ :param wait_selector: Wait for a specific CSS selector to be in a specific state.
53
+ :param init_script: An absolute path to a JavaScript file to be executed on page creation for all pages in this session.
54
+ :param locale: Specify user locale, for example, `en-GB`, `de-DE`, etc. Locale will affect navigator.language value, Accept-Language request header value as well as number and date formatting
55
+ rules. Defaults to the system default locale.
56
+ :param timezone_id: Changes the timezone of the browser. Defaults to the system timezone.
57
+ :param wait_selector_state: The state to wait for the selector given with `wait_selector`. The default state is `attached`.
58
+ :param real_chrome: If you have a Chrome browser installed on your device, enable this, and the Fetcher will launch an instance of your browser and use it.
59
+ :param load_dom: Enabled by default, wait for all JavaScript on page(s) to fully load and execute.
60
+ :param cdp_url: Instead of launching a new browser instance, connect to this CDP URL to control real browsers through CDP.
61
+ :param google_search: Enabled by default, Scrapling will set the referer header to be as if this request came from a Google search of this website's domain name.
62
+ :param extra_headers: A dictionary of extra headers to add to the request. _The referer set by the `google_search` argument takes priority over the referer set here if used together._
63
+ :param proxy: The proxy to be used with requests, it can be a string or a dictionary with the keys 'server', 'username', and 'password' only.
64
+ :param user_data_dir: Path to a User Data Directory, which stores browser session data like cookies and local storage. The default is to create a temporary directory.
65
+ :param extra_flags: A list of additional browser flags to pass to the browser on launch.
66
+ :param selector_config: The arguments that will be passed in the end while creating the final Selector's class.
67
+ :param additional_args: Additional arguments to be passed to Playwright's context as additional settings, and it takes higher priority than Scrapling's settings.
68
+ """
69
+ self.__validate__(**kwargs)
70
+ super().__init__()
71
+
72
def start(self):
    """Start Playwright and create the browser and/or context for this session.

    The setup is picked from the session config:

    * ``cdp_url`` is set: attach to an existing browser over CDP. A context is created immediately only
      when no proxy rotator is configured and a browser object was returned.
    * a proxy rotator is configured (and no ``cdp_url``): launch a plain browser; no context is created
      in this method.
    * otherwise: launch a persistent context from the merged browser options, context options and
      ``user_data_dir``.

    If a context exists afterwards it is passed through ``_initialize_context``, then the session is
    flagged as alive.

    :raises RuntimeError: If the session has already been started.
    """
    if self.playwright:
        raise RuntimeError("Session has been already started")

    self.playwright = sync_playwright().start()
    try:
        chromium = self.playwright.chromium

        if self._config.cdp_url:  # pragma: no cover
            self.browser = chromium.connect_over_cdp(endpoint_url=self._config.cdp_url)
            if not self._config.proxy_rotator and self.browser:
                self.context = self.browser.new_context(**self._context_options)
        elif self._config.proxy_rotator:
            self.browser = chromium.launch(**self._browser_options)
        else:
            persistent_options = self._browser_options | self._context_options
            persistent_options = persistent_options | {"user_data_dir": self._user_data_dir}
            self.context = chromium.launch_persistent_context(**persistent_options)

        if self.context:
            self.context = self._initialize_context(self._config, self.context)

        self._is_alive = True
    except Exception:
        # Setup failed part-way: stop the Playwright driver and reset the handle before re-raising,
        # so a later `start()` call is not rejected as "already started".
        self.playwright.stop()
        self.playwright = None
        raise
101
+
102
def fetch(self, url: str, **kwargs: Unpack[PlaywrightFetchParams]) -> Response:
    """Load ``url`` in a pooled browser page and return the result as a ``Response``.

    Every attempt acquires a page from ``_page_generator``, navigates to the URL, waits for the page to
    settle, optionally runs ``page_action`` and waits for ``wait_selector``, then builds the response.
    A failing attempt is retried until the configured ``retries`` count is exhausted, sleeping
    ``retry_delay`` seconds between attempts; the exception of the final attempt is re-raised.
    Errors raised by ``page_action`` or while waiting for the selector are only logged.

    :param url: The target URL.
    :param google_search: Enabled by default. Sends a referer that looks like a Google search for the site's
        domain name. No referer is generated when the extra headers already contain a ``referer`` key
        (compared case-insensitively).
    :param timeout: The timeout in milliseconds that is used in all operations and waits through the page. The default is 30,000
    :param wait: Extra time (milliseconds) to wait after everything else finished, right before the `Response` is built.
    :param page_action: Added for automation. A function that receives the `page` object and performs the automation you need.
    :param extra_headers: A dictionary of extra headers to add to the request. A ``referer`` entry given here
        disables the referer generated for ``google_search``.
    :param disable_resources: Drop requests for unnecessary resources for a speed boost.
        Requests dropped are of type `font`, `image`, `media`, `beacon`, `object`, `imageset`, `texttrack`, `websocket`, `csp_report`, and `stylesheet`.
    :param blocked_domains: A set of domain names to block requests to. Subdomains are also matched (e.g., ``"example.com"`` blocks ``"sub.example.com"`` too).
    :param wait_selector: Wait for a specific CSS selector to be in a specific state.
    :param wait_selector_state: The state to wait for the selector given with `wait_selector`. The default state is `attached`.
    :param network_idle: Wait for the page until there are no network connections for at least 500 ms.
    :param load_dom: Enabled by default, wait for all JavaScript on page(s) to fully load and execute.
    :param selector_config: The arguments that will be passed in the end while creating the final Selector's class.
    :param proxy: Static proxy to override rotator and session proxy. A new browser context will be created and used with it.
    :raises RuntimeError: If the session is not alive, or when navigation yields no response on the last attempt.
    :return: A `Response` object.
    """
    # `proxy` is handled here and must not reach the validator with the other fetch parameters.
    static_proxy = kwargs.pop("proxy", None)

    params = _validate(kwargs, self, PlaywrightConfig)
    if not self._is_alive:  # pragma: no cover
        raise RuntimeError("Context manager has been closed")

    # Header names are case-insensitive, so compare them lowercased.
    request_headers_keys = {h.lower() for h in params.extra_headers.keys()} if params.extra_headers else set()
    referer = None
    if params.google_search and "referer" not in request_headers_keys:
        referer = generate_convincing_referer(url)

    for attempt in range(self._config.retries):
        # A per-call proxy wins over the rotator; with neither of them `proxy` is None.
        proxy: Optional[ProxyType] = (
            self._config.proxy_rotator.get_proxy()
            if (self._config.proxy_rotator and static_proxy is None)
            else static_proxy
        )

        with self._page_generator(
            params.timeout, params.extra_headers, params.disable_resources, proxy, params.blocked_domains
        ) as page_info:
            # One-element list used as a mutable cell that the response handler can write into.
            final_response = [None]
            page = page_info.page
            page.on("response", self._create_response_handler(page_info, final_response))

            try:
                first_response = page.goto(url, referer=referer)
                self._wait_for_page_stability(page, params.load_dom, params.network_idle)

                if not first_response:
                    raise RuntimeError(f"Failed to get response for {url}")

                if params.page_action:
                    try:
                        params.page_action(page)
                    except Exception as e:  # pragma: no cover
                        log.error(f"Error executing page_action: {e}")

                if params.wait_selector:
                    try:
                        page.locator(params.wait_selector).first.wait_for(state=params.wait_selector_state)
                        # The selector showing up may have triggered more loading, so settle again.
                        self._wait_for_page_stability(page, params.load_dom, params.network_idle)
                    except Exception as e:  # pragma: no cover
                        log.error(f"Error waiting for selector {params.wait_selector}: {e}")

                page.wait_for_timeout(params.wait)

                return ResponseFactory.from_playwright_response(
                    page, first_response, final_response[0], params.selector_config, meta={"proxy": proxy}
                )

            except Exception as e:
                page_info.mark_error()

                if attempt >= self._config.retries - 1:
                    log.error(f"Failed after {self._config.retries} attempts: {e}")
                    raise

                if is_proxy_error(e):
                    log.warning(
                        f"Proxy '{proxy}' failed (attempt {attempt + 1}) | Retrying in {self._config.retry_delay}s..."
                    )
                else:
                    log.warning(f"Attempt {attempt + 1} failed: {e}. Retrying in {self._config.retry_delay}s...")
                time_sleep(self._config.retry_delay)

    # Only reachable when the loop body never ran or never returned/raised.
    raise RuntimeError("Request failed")  # pragma: no cover
194
+
195
+
196
class AsyncDynamicSession(AsyncSession, DynamicSessionMixin):
    """Asynchronous browser session manager with page pooling.

    By default it uses a persistent browser context with a temporary user profile directory.
    """

    __slots__ = (
        "_config",
        "_context_options",
        "_browser_options",
        "_user_data_dir",
        "_headers_keys",
    )

    def __init__(self, **kwargs: Unpack[PlaywrightSession]):
        """Validate the session options and set up the async session with the configured page limit.

        :param headless: Run the browser in headless/hidden (default), or headful/visible mode.
        :param disable_resources: Drop requests for unnecessary resources for a speed boost.
            Requests dropped are of type `font`, `image`, `media`, `beacon`, `object`, `imageset`, `texttrack`, `websocket`, `csp_report`, and `stylesheet`.
        :param blocked_domains: A set of domain names to block requests to. Subdomains are also matched (e.g., ``"example.com"`` blocks ``"sub.example.com"`` too).
        :param useragent: Pass a useragent string to be used. Otherwise the fetcher will generate a real Useragent of the same browser and use it.
        :param cookies: Set cookies for the next request.
        :param network_idle: Wait for the page until there are no network connections for at least 500 ms.
        :param load_dom: Enabled by default, wait for all JavaScript on page(s) to fully load and execute.
        :param timeout: The timeout in milliseconds that is used in all operations and waits through the page. The default is 30,000
        :param wait: Extra time (milliseconds) to wait after everything else finished, right before the `Response` is built.
        :param page_action: Added for automation. A function that receives the `page` object and performs the automation you need.
            In this async session the result of calling it is awaited by `fetch`.
        :param wait_selector: Wait for a specific CSS selector to be in a specific state.
        :param init_script: An absolute path to a JavaScript file to be executed on page creation for all pages in this session.
        :param locale: Specify user locale, for example, `en-GB`, `de-DE`, etc. Locale will affect navigator.language value, Accept-Language request header value as well as number and date formatting
            rules. Defaults to the system default locale.
        :param timezone_id: Changes the timezone of the browser. Defaults to the system timezone.
        :param wait_selector_state: The state to wait for the selector given with `wait_selector`. The default state is `attached`.
        :param real_chrome: If you have a Chrome browser installed on your device, enable this, and the Fetcher will launch an instance of your browser and use it.
        :param cdp_url: Instead of launching a new browser instance, connect to this CDP URL to control real browsers through CDP.
        :param google_search: Enabled by default. Sends a referer that looks like a Google search for the site's domain name.
            `fetch` generates no referer when the extra headers already contain a ``referer`` key.
        :param extra_headers: A dictionary of extra headers to add to the request. A ``referer`` entry given here
            disables the referer generated for ``google_search``.
        :param proxy: The proxy to be used with requests, it can be a string or a dictionary with the keys 'server', 'username', and 'password' only.
        :param max_pages: The maximum number of tabs to be opened at the same time. It will be used in rotation through a PagePool.
        :param user_data_dir: Path to a User Data Directory, which stores browser session data like cookies and local storage. The default is to create a temporary directory.
        :param extra_flags: A list of additional browser flags to pass to the browser on launch.
        :param selector_config: The arguments that will be passed in the end while creating the final Selector's class.
        :param additional_args: Additional arguments to be passed to Playwright's context as additional settings, and it takes higher priority than Scrapling's settings.
        """
        # Validation must run first: the base initializer below reads `self._config`.
        self.__validate__(**kwargs)
        super().__init__(max_pages=self._config.max_pages)

    async def start(self) -> None:
        """Start Playwright and create the browser and/or context for this session.

        The setup is picked from the session config:

        * ``cdp_url`` is set: attach to an existing browser over CDP. A context is created immediately only
          when no proxy rotator is configured and a browser object was returned.
        * a proxy rotator is configured (and no ``cdp_url``): launch a plain browser; no context is created
          in this method.
        * otherwise: launch a persistent context from the merged browser options, context options and
          ``user_data_dir``.

        If a context exists afterwards it is passed through ``_initialize_context``, then the session is
        flagged as alive.

        :raises RuntimeError: If the session has already been started.
        """
        if self.playwright:
            raise RuntimeError("Session has been already started")

        self.playwright = await async_playwright().start()
        try:
            chromium = self.playwright.chromium

            if self._config.cdp_url:
                self.browser = await chromium.connect_over_cdp(endpoint_url=self._config.cdp_url)
                if not self._config.proxy_rotator and self.browser:
                    self.context = await self.browser.new_context(**self._context_options)
            elif self._config.proxy_rotator:
                self.browser = await chromium.launch(**self._browser_options)
            else:
                persistent_options = self._browser_options | self._context_options
                persistent_options = persistent_options | {"user_data_dir": self._user_data_dir}
                self.context = await chromium.launch_persistent_context(**persistent_options)

            if self.context:
                self.context = await self._initialize_context(self._config, self.context)

            self._is_alive = True
        except Exception:
            # Setup failed part-way: stop the Playwright driver and reset the handle before re-raising,
            # so a later `start()` call is not rejected as "already started".
            await self.playwright.stop()
            self.playwright = None
            raise

    async def fetch(self, url: str, **kwargs: Unpack[PlaywrightFetchParams]) -> Response:
        """Load ``url`` in a pooled browser page and return the result as a ``Response``.

        Every attempt acquires a page from ``_page_generator``, navigates to the URL, waits for the page to
        settle, optionally awaits ``page_action`` and waits for ``wait_selector``, then builds the response.
        A failing attempt is retried until the configured ``retries`` count is exhausted, sleeping
        ``retry_delay`` seconds between attempts; the exception of the final attempt is re-raised.
        Errors raised by ``page_action`` or while waiting for the selector are only logged.

        :param url: The target URL.
        :param google_search: Enabled by default. Sends a referer that looks like a Google search for the site's
            domain name. No referer is generated when the extra headers already contain a ``referer`` key
            (compared case-insensitively).
        :param timeout: The timeout in milliseconds that is used in all operations and waits through the page. The default is 30,000
        :param wait: Extra time (milliseconds) to wait after everything else finished, right before the `Response` is built.
        :param page_action: Added for automation. A function that receives the `page` object; its result is awaited.
        :param extra_headers: A dictionary of extra headers to add to the request. A ``referer`` entry given here
            disables the referer generated for ``google_search``.
        :param disable_resources: Drop requests for unnecessary resources for a speed boost.
            Requests dropped are of type `font`, `image`, `media`, `beacon`, `object`, `imageset`, `texttrack`, `websocket`, `csp_report`, and `stylesheet`.
        :param blocked_domains: A set of domain names to block requests to. Subdomains are also matched (e.g., ``"example.com"`` blocks ``"sub.example.com"`` too).
        :param wait_selector: Wait for a specific CSS selector to be in a specific state.
        :param wait_selector_state: The state to wait for the selector given with `wait_selector`. The default state is `attached`.
        :param network_idle: Wait for the page until there are no network connections for at least 500 ms.
        :param load_dom: Enabled by default, wait for all JavaScript on page(s) to fully load and execute.
        :param selector_config: The arguments that will be passed in the end while creating the final Selector's class.
        :param proxy: Static proxy to override rotator and session proxy. A new browser context will be created and used with it.
        :raises RuntimeError: If the session is not alive, or when navigation yields no response on the last attempt.
        :return: A `Response` object.
        """
        # `proxy` is handled here and must not reach the validator with the other fetch parameters.
        static_proxy = kwargs.pop("proxy", None)

        params = _validate(kwargs, self, PlaywrightConfig)

        if not self._is_alive:  # pragma: no cover
            raise RuntimeError("Context manager has been closed")

        # Header names are case-insensitive, so compare them lowercased.
        request_headers_keys = {h.lower() for h in params.extra_headers.keys()} if params.extra_headers else set()
        referer = None
        if params.google_search and "referer" not in request_headers_keys:
            referer = generate_convincing_referer(url)

        for attempt in range(self._config.retries):
            # A per-call proxy wins over the rotator; with neither of them `proxy` is None.
            proxy: Optional[ProxyType] = (
                self._config.proxy_rotator.get_proxy()
                if (self._config.proxy_rotator and static_proxy is None)
                else static_proxy
            )

            async with self._page_generator(
                params.timeout, params.extra_headers, params.disable_resources, proxy, params.blocked_domains
            ) as page_info:
                # One-element list used as a mutable cell that the response handler can write into.
                final_response = [None]
                page = page_info.page
                page.on("response", self._create_response_handler(page_info, final_response))

                try:
                    first_response = await page.goto(url, referer=referer)
                    await self._wait_for_page_stability(page, params.load_dom, params.network_idle)

                    if not first_response:
                        raise RuntimeError(f"Failed to get response for {url}")

                    if params.page_action:
                        try:
                            await params.page_action(page)
                        except Exception as e:  # pragma: no cover
                            log.error(f"Error executing page_action: {e}")

                    if params.wait_selector:
                        try:
                            await page.locator(params.wait_selector).first.wait_for(state=params.wait_selector_state)
                            # The selector showing up may have triggered more loading, so settle again.
                            await self._wait_for_page_stability(page, params.load_dom, params.network_idle)
                        except Exception as e:  # pragma: no cover
                            log.error(f"Error waiting for selector {params.wait_selector}: {e}")

                    await page.wait_for_timeout(params.wait)

                    return await ResponseFactory.from_async_playwright_response(
                        page, first_response, final_response[0], params.selector_config, meta={"proxy": proxy}
                    )

                except Exception as e:
                    page_info.mark_error()

                    if attempt >= self._config.retries - 1:
                        log.error(f"Failed after {self._config.retries} attempts: {e}")
                        raise

                    if is_proxy_error(e):
                        log.warning(
                            f"Proxy '{proxy}' failed (attempt {attempt + 1}) | Retrying in {self._config.retry_delay}s..."
                        )
                    else:
                        log.warning(f"Attempt {attempt + 1} failed: {e}. Retrying in {self._config.retry_delay}s...")
                    await asyncio_sleep(self._config.retry_delay)

        # Only reachable when the loop body never ran or never returned/raised.
        raise RuntimeError("Request failed")  # pragma: no cover
engines/_browsers/_page.py ADDED
@@ -0,0 +1,87 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from threading import RLock
2
+ from dataclasses import dataclass
3
+
4
+ from playwright.sync_api._generated import Page as SyncPage
5
+ from playwright.async_api._generated import Page as AsyncPage
6
+
7
+ from scrapling.core._types import Optional, List, Literal, overload, TypeVar, Generic, cast
8
+
9
# The closed set of states a pooled page can be in.
PageState = Literal["ready", "busy", "error"]
# Constrained type variable: a `PageInfo` wraps either a sync or an async Playwright page.
PageType = TypeVar("PageType", SyncPage, AsyncPage)
11
+
12
+
13
@dataclass
class PageInfo(Generic[PageType]):
    """A browser page together with its pool state and the URL it was last marked busy with."""

    __slots__ = ("page", "state", "url")
    # The wrapped Playwright page (sync or async flavour).
    page: PageType
    # One of the `PageState` literals.
    state: PageState
    # URL recorded by `mark_busy`.
    url: Optional[str]

    def mark_busy(self, url: str = ""):
        """Flag the page as busy and remember the URL it is working on."""
        self.state, self.url = "busy", url

    def mark_error(self):
        """Flag the page as broken; `PagePool.cleanup_error_pages` drops pages in this state."""
        self.state = "error"

    def __repr__(self):
        return 'Page(URL="{!r}", state={!r})'.format(self.url, self.state)

    def __eq__(self, other_page):
        """Two infos are equal when they are of the exact same class and wrap equal pages."""
        if other_page.__class__ is self.__class__:
            return self.page == other_page.page
        return NotImplemented
39
+
40
+
41
class PagePool:
    """Bounded collection of browser pages/tabs that tracks the state of each page.

    Mutations of the page list and the busy count are performed while holding a re-entrant lock.
    """

    __slots__ = ("max_pages", "pages", "_lock")

    def __init__(self, max_pages: int = 5):
        """Create an empty pool.

        :param max_pages: Upper bound on the number of pages the pool accepts.
        """
        self._lock = RLock()
        self.pages: List[PageInfo[SyncPage] | PageInfo[AsyncPage]] = []
        self.max_pages = max_pages

    @overload
    def add_page(self, page: SyncPage) -> PageInfo[SyncPage]: ...

    @overload
    def add_page(self, page: AsyncPage) -> PageInfo[AsyncPage]: ...

    def add_page(self, page: SyncPage | AsyncPage) -> PageInfo[SyncPage] | PageInfo[AsyncPage]:
        """Wrap ``page`` in a `PageInfo` in the "ready" state and register it in the pool.

        :param page: The sync or async Playwright page to track.
        :raises RuntimeError: If the pool already holds ``max_pages`` pages.
        :return: The newly created `PageInfo`.
        """
        with self._lock:
            if len(self.pages) >= self.max_pages:
                raise RuntimeError(f"Maximum page limit ({self.max_pages}) reached")

            # The casts only narrow the static type; the same wrapper is built for both page flavours.
            page_info: PageInfo[SyncPage] | PageInfo[AsyncPage]
            if isinstance(page, AsyncPage):
                page_info = cast(PageInfo[AsyncPage], PageInfo(page, "ready", ""))
            else:
                page_info = cast(PageInfo[SyncPage], PageInfo(page, "ready", ""))

            self.pages.append(page_info)
            return page_info

    @property
    def pages_count(self) -> int:
        """Number of pages currently tracked, whatever their state."""
        return len(self.pages)

    @property
    def busy_count(self) -> int:
        """Number of tracked pages whose state is "busy"."""
        with self._lock:
            return len([tracked for tracked in self.pages if tracked.state == "busy"])

    def cleanup_error_pages(self):
        """Drop every page whose state is "error" from the pool."""
        with self._lock:
            survivors = []
            for tracked in self.pages:
                if tracked.state != "error":
                    survivors.append(tracked)
            self.pages = survivors
engines/_browsers/_stealth.py ADDED
@@ -0,0 +1,541 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from random import randint
2
+ from re import compile as re_compile
3
+ from time import sleep as time_sleep
4
+ from asyncio import sleep as asyncio_sleep
5
+
6
+ from playwright.sync_api import Locator, Page, BrowserContext
7
+ from playwright.async_api import (
8
+ Page as async_Page,
9
+ Locator as AsyncLocator,
10
+ BrowserContext as AsyncBrowserContext,
11
+ )
12
+ from patchright.sync_api import sync_playwright
13
+ from patchright.async_api import async_playwright
14
+
15
+ from scrapling.core.utils import log
16
+ from scrapling.core._types import Any, Optional, ProxyType, Unpack
17
+ from scrapling.engines.toolbelt.proxy_rotation import is_proxy_error
18
+ from scrapling.engines.toolbelt.convertor import Response, ResponseFactory
19
+ from scrapling.engines.toolbelt.fingerprints import generate_convincing_referer
20
+ from scrapling.engines._browsers._types import StealthSession, StealthFetchParams
21
+ from scrapling.engines._browsers._base import SyncSession, AsyncSession, StealthySessionMixin
22
+ from scrapling.engines._browsers._validators import validate_fetch as _validate, StealthConfig
23
+
24
# URL pattern of the Cloudflare challenge-platform frame; `_cloudflare_solver` uses it to look up the challenge iframe.
__CF_PATTERN__ = re_compile(r"^https?://challenges\.cloudflare\.com/cdn-cgi/challenge-platform/.*")
25
+
26
+
27
+ class StealthySession(SyncSession, StealthySessionMixin):
28
+ """A Stealthy Browser session manager with page pooling."""
29
+
30
+ __slots__ = (
31
+ "_config",
32
+ "_context_options",
33
+ "_browser_options",
34
+ "_user_data_dir",
35
+ "_headers_keys",
36
+ "max_pages",
37
+ "page_pool",
38
+ "_max_wait_for_page",
39
+ "playwright",
40
+ "context",
41
+ )
42
+
43
def __init__(self, **kwargs: Unpack[StealthSession]):
    """A Browser session manager with page pooling, it's using a persistent browser Context by default with a temporary user profile directory.

    The keyword arguments are first checked by ``__validate__``; only then is the base session initialized.

    :param headless: Run the browser in headless/hidden (default), or headful/visible mode.
    :param disable_resources: Drop requests for unnecessary resources for a speed boost.
        Requests dropped are of type `font`, `image`, `media`, `beacon`, `object`, `imageset`, `texttrack`, `websocket`, `csp_report`, and `stylesheet`.
    :param blocked_domains: A set of domain names to block requests to. Subdomains are also matched (e.g., ``"example.com"`` blocks ``"sub.example.com"`` too).
    :param useragent: Pass a useragent string to be used. Otherwise the fetcher will generate a real Useragent of the same browser and use it.
    :param cookies: Set cookies for the next request.
    :param network_idle: Wait for the page until there are no network connections for at least 500 ms.
    :param timeout: The timeout in milliseconds that is used in all operations and waits through the page. The default is 30,000
    :param wait: The time (milliseconds) the fetcher will wait after everything finishes before closing the page and returning the `Response` object.
    :param page_action: Added for automation. A function that takes the `page` object and does the automation you need.
    :param wait_selector: Wait for a specific CSS selector to be in a specific state.
    :param init_script: An absolute path to a JavaScript file to be executed on page creation for all pages in this session.
    :param locale: Specify user locale, for example, `en-GB`, `de-DE`, etc. Locale will affect navigator.language value, Accept-Language request header value as well as number and date formatting
        rules. Defaults to the system default locale.
    :param timezone_id: Changes the timezone of the browser. Defaults to the system timezone.
    :param wait_selector_state: The state to wait for the selector given with `wait_selector`. The default state is `attached`.
    :param solve_cloudflare: Solves all types of the Cloudflare's Turnstile/Interstitial challenges before returning the response to you.
    :param real_chrome: If you have a Chrome browser installed on your device, enable this, and the Fetcher will launch an instance of your browser and use it.
    :param hide_canvas: Add random noise to canvas operations to prevent fingerprinting.
    :param block_webrtc: Forces WebRTC to respect proxy settings to prevent local IP address leak.
    :param allow_webgl: Enabled by default. Disabling it disables WebGL and WebGL 2.0 support entirely. Disabling WebGL is not recommended as many WAFs now check if WebGL is enabled.
    :param load_dom: Enabled by default, wait for all JavaScript on page(s) to fully load and execute.
    :param cdp_url: Instead of launching a new browser instance, connect to this CDP URL to control real browsers through CDP.
    :param google_search: Enabled by default, Scrapling will set the referer header to be as if this request came from a Google search of this website's domain name.
    :param extra_headers: A dictionary of extra headers to add to the request. NOTE(review): `fetch` only generates the
        `google_search` referer when the extra headers contain no ``referer`` key, so a referer set here appears to
        take priority over the generated one — confirm against `_page_generator`.
    :param proxy: The proxy to be used with requests, it can be a string or a dictionary with the keys 'server', 'username', and 'password' only.
    :param user_data_dir: Path to a User Data Directory, which stores browser session data like cookies and local storage. The default is to create a temporary directory.
    :param extra_flags: A list of additional browser flags to pass to the browser on launch.
    :param selector_config: The arguments that will be passed in the end while creating the final Selector's class.
    :param additional_args: Additional arguments to be passed to Playwright's context as additional settings, and it takes higher priority than Scrapling's settings.
    """
    # Validate/normalize the options before the base class sets up its own state.
    self.__validate__(**kwargs)
    super().__init__()
79
+
80
def start(self) -> None:
    """Start the (patched) Playwright driver and create the browser and/or context for this session.

    The setup is picked from the session config:

    * ``cdp_url`` is set: attach to an existing browser over CDP. A context is created immediately only
      when no proxy rotator is configured.
    * a proxy rotator is configured (and no ``cdp_url``): launch a plain browser; no context is created
      in this method.
    * otherwise: launch a persistent context from the merged browser options, context options and
      ``user_data_dir``.

    If a context exists afterwards it is passed through ``_initialize_context``, then the session is
    flagged as alive.

    :raises RuntimeError: If the session has already been started.
    """
    if self.playwright:
        raise RuntimeError("Session has been already started")

    self.playwright = sync_playwright().start()
    try:
        chromium = self.playwright.chromium

        if self._config.cdp_url:  # pragma: no cover
            self.browser = chromium.connect_over_cdp(endpoint_url=self._config.cdp_url)
            if not self._config.proxy_rotator:
                assert self.browser is not None
                self.context = self.browser.new_context(**self._context_options)
        elif self._config.proxy_rotator:
            self.browser = chromium.launch(**self._browser_options)
        else:
            persistent_options = self._browser_options | self._context_options
            persistent_options = persistent_options | {"user_data_dir": self._user_data_dir}
            self.context = chromium.launch_persistent_context(**persistent_options)

        if self.context:
            self.context = self._initialize_context(self._config, self.context)

        self._is_alive = True
    except Exception:
        # Setup failed part-way: stop the Playwright driver and reset the handle before re-raising,
        # so a later `start()` call is not rejected as "already started".
        self.playwright.stop()
        self.playwright = None
        raise
110
+
111
def _cloudflare_solver(self, page: Page) -> None:  # pragma: no cover
    """Solve the cloudflare challenge displayed on the playwright page passed

    The challenge type comes from ``_detect_cloudflare`` applied to the page content. For the
    "non-interactive" type the method just polls until the "Just a moment..." title is gone. For the
    other types it locates the challenge widget, clicks inside it, waits for the page to settle and, if the
    "Just a moment..." title is still present, calls itself again.

    NOTE(review): several polling loops and the final recursion have no upper bound — confirm that callers
    are fine with this method blocking for as long as the challenge stays on the page.

    :param page: The targeted page
    :return: Always ``None``; progress is reported through the logger.
    """
    # Let the initial network activity settle (5 s budget) before inspecting the page content.
    self._wait_for_networkidle(page, timeout=5000)
    challenge_type = self._detect_cloudflare(ResponseFactory._get_page_content(page))
    if not challenge_type:
        log.error("No Cloudflare challenge found.")
        return None
    else:
        log.info(f'The turnstile version discovered is "{challenge_type}"')
        if challenge_type == "non-interactive":
            # Nothing to click for this type: re-check the content once per second until the wait page is gone.
            while "<title>Just a moment...</title>" in (ResponseFactory._get_page_content(page)):
                log.info("Waiting for Cloudflare wait page to disappear.")
                page.wait_for_timeout(1000)
                page.wait_for_load_state()
            log.info("Cloudflare captcha is solved")
            return None

        else:
            # Default selector targets the embedded widget; any other type uses the full-page layout selector.
            box_selector = "#cf_turnstile div, #cf-turnstile div, .turnstile>div>div"
            if challenge_type != "embedded":
                box_selector = ".main-content p+div>div>div"
                while "Verifying you are human." in ResponseFactory._get_page_content(page):
                    # Wait for the verification text to disappear, re-checking every 500 ms
                    page.wait_for_timeout(500)

            # Preferred source of the click area: the bounding box of the challenge iframe element.
            outer_box: Any = {}
            iframe = page.frame(url=__CF_PATTERN__)
            if iframe is not None:
                self._wait_for_page_stability(iframe, True, False)

                if challenge_type != "embedded":
                    while not iframe.frame_element().is_visible():
                        # Double-checking that the iframe is loaded
                        page.wait_for_timeout(500)

                outer_box = iframe.frame_element().bounding_box()

            if not iframe or not outer_box:
                # No usable iframe box: either the challenge is already gone, or fall back to the selector.
                if "<title>Just a moment...</title>" not in (ResponseFactory._get_page_content(page)):
                    log.info("Cloudflare captcha is solved")
                    return None

                # NOTE(review): `bounding_box()` may presumably return None here, which would make the
                # subscripting below fail — verify against the Playwright Locator documentation.
                outer_box = page.locator(box_selector).last.bounding_box()

            # Click target: a small random offset (26-28 px right, 25-27 px down) from the box's top-left corner
            captcha_x, captcha_y = outer_box["x"] + randint(26, 28), outer_box["y"] + randint(25, 27)

            # Left-click at the computed point; `delay` is randomized between 100 and 200
            page.mouse.click(captcha_x, captcha_y, delay=randint(100, 200), button="left")
            self._wait_for_networkidle(page)

            if challenge_type != "embedded":
                attempts = 0
                while "<title>Just a moment...</title>" in ResponseFactory._get_page_content(page):
                    # Poll every 100 ms, giving up after 100 polls (about 10 s)
                    if attempts >= 100:
                        log.info("Cloudflare page didn't disappear after 10s, continuing...")
                        break
                    page.wait_for_timeout(100)
                    attempts += 1

            # page.locator(box_selector).last.wait_for(state="detached")
            # page.locator(".zone-name-title").wait_for(state="hidden")

            self._wait_for_page_stability(page, True, False)

            if "<title>Just a moment...</title>" not in (ResponseFactory._get_page_content(page)):
                log.info("Cloudflare captcha is solved")
                return None
            else:
                # Still on the challenge page: run the whole procedure again on the same page.
                log.info("Looks like Cloudflare captcha is still present, solving again")
                return self._cloudflare_solver(page)
187
+
188
+ def fetch(self, url: str, **kwargs: Unpack[StealthFetchParams]) -> Response:
189
+ """Opens up the browser and do your request based on your chosen options.
190
+
191
+ :param url: The Target url.
192
+ :param google_search: Enabled by default, Scrapling will set the referer header to be as if this request came from a Google search of this website's domain name.
193
+ :param timeout: The timeout in milliseconds that is used in all operations and waits through the page. The default is 30,000
194
+ :param wait: The time (milliseconds) the fetcher will wait after everything finishes before closing the page and returning the ` Response ` object.
195
+ :param page_action: Added for automation. A function that takes the `page` object and does the automation you need.
196
+ :param extra_headers: A dictionary of extra headers to add to the request. _The referer set by the `google_search` argument takes priority over the referer set here if used together._
197
+ :param disable_resources: Drop requests for unnecessary resources for a speed boost.
198
+ Requests dropped are of type `font`, `image`, `media`, `beacon`, `object`, `imageset`, `texttrack`, `websocket`, `csp_report`, and `stylesheet`.
199
+ :param blocked_domains: A set of domain names to block requests to. Subdomains are also matched (e.g., ``"example.com"`` blocks ``"sub.example.com"`` too).
200
+ :param wait_selector: Wait for a specific CSS selector to be in a specific state.
201
+ :param wait_selector_state: The state to wait for the selector given with `wait_selector`. The default state is `attached`.
202
+ :param network_idle: Wait for the page until there are no network connections for at least 500 ms.
203
+ :param load_dom: Enabled by default, wait for all JavaScript on page(s) to fully load and execute.
204
+ :param solve_cloudflare: Solves all types of the Cloudflare's Turnstile/Interstitial challenges before returning the response to you.
205
+ :param selector_config: The arguments that will be passed in the end while creating the final Selector's class.
206
+ :param proxy: Static proxy to override rotator and session proxy. A new browser context will be created and used with it.
207
+ :return: A `Response` object.
208
+ """
209
+ static_proxy = kwargs.pop("proxy", None)
210
+
211
+ params = _validate(kwargs, self, StealthConfig)
212
+ if not self._is_alive: # pragma: no cover
213
+ raise RuntimeError("Context manager has been closed")
214
+
215
+ request_headers_keys = {h.lower() for h in params.extra_headers.keys()} if params.extra_headers else set()
216
+ referer = (
217
+ generate_convincing_referer(url)
218
+ if (params.google_search and "referer" not in request_headers_keys)
219
+ else None
220
+ )
221
+
222
+ for attempt in range(self._config.retries):
223
+ proxy: Optional[ProxyType] = None
224
+ if self._config.proxy_rotator and static_proxy is None:
225
+ proxy = self._config.proxy_rotator.get_proxy()
226
+ else:
227
+ proxy = static_proxy
228
+
229
+ with self._page_generator(
230
+ params.timeout, params.extra_headers, params.disable_resources, proxy, params.blocked_domains
231
+ ) as page_info:
232
+ final_response = [None]
233
+ page = page_info.page
234
+ page.on("response", self._create_response_handler(page_info, final_response))
235
+
236
+ try:
237
+ first_response = page.goto(url, referer=referer)
238
+ self._wait_for_page_stability(page, params.load_dom, params.network_idle)
239
+
240
+ if not first_response:
241
+ raise RuntimeError(f"Failed to get response for {url}")
242
+
243
+ if params.solve_cloudflare:
244
+ self._cloudflare_solver(page)
245
+ # Make sure the page is fully loaded after the captcha
246
+ self._wait_for_page_stability(page, params.load_dom, params.network_idle)
247
+
248
+ if params.page_action:
249
+ try:
250
+ _ = params.page_action(page)
251
+ except Exception as e: # pragma: no cover
252
+ log.error(f"Error executing page_action: {e}")
253
+
254
+ if params.wait_selector:
255
+ try:
256
+ waiter: Locator = page.locator(params.wait_selector)
257
+ waiter.first.wait_for(state=params.wait_selector_state)
258
+ self._wait_for_page_stability(page, params.load_dom, params.network_idle)
259
+ except Exception as e: # pragma: no cover
260
+ log.error(f"Error waiting for selector {params.wait_selector}: {e}")
261
+
262
+ page.wait_for_timeout(params.wait)
263
+
264
+ response = ResponseFactory.from_playwright_response(
265
+ page, first_response, final_response[0], params.selector_config, meta={"proxy": proxy}
266
+ )
267
+ return response
268
+
269
+ except Exception as e:
270
+ page_info.mark_error()
271
+ if attempt < self._config.retries - 1:
272
+ if is_proxy_error(e):
273
+ log.warning(
274
+ f"Proxy '{proxy}' failed (attempt {attempt + 1}) | Retrying in {self._config.retry_delay}s..."
275
+ )
276
+ else:
277
+ log.warning(
278
+ f"Attempt {attempt + 1} failed: {e}. Retrying in {self._config.retry_delay}s..."
279
+ )
280
+ time_sleep(self._config.retry_delay)
281
+ else:
282
+ log.error(f"Failed after {self._config.retries} attempts: {e}")
283
+ raise
284
+
285
+ raise RuntimeError("Request failed") # pragma: no cover
286
+
287
+
288
+ class AsyncStealthySession(AsyncSession, StealthySessionMixin):
289
+ """An async Stealthy Browser session manager with page pooling."""
290
+
291
+ __slots__ = (
292
+ "_config",
293
+ "_context_options",
294
+ "_browser_options",
295
+ "_user_data_dir",
296
+ "_headers_keys",
297
+ )
298
+
299
    def __init__(self, **kwargs: Unpack[StealthSession]):
        """Validate the session options and set up the page pool.

        By default, the session uses a persistent browser context with a temporary user profile directory.
        The browser itself is not launched here; that happens in `start()`.

        :param headless: Run the browser in headless/hidden (default), or headful/visible mode.
        :param disable_resources: Drop requests for unnecessary resources for a speed boost.
            Requests dropped are of type `font`, `image`, `media`, `beacon`, `object`, `imageset`, `texttrack`, `websocket`, `csp_report`, and `stylesheet`.
        :param blocked_domains: A set of domain names to block requests to. Subdomains are also matched (e.g., ``"example.com"`` blocks ``"sub.example.com"`` too).
        :param useragent: Pass a useragent string to be used. Otherwise, the fetcher will generate a real Useragent of the same browser and use it.
        :param cookies: Set cookies for the next request.
        :param network_idle: Wait for the page until there are no network connections for at least 500 ms.
        :param timeout: The timeout in milliseconds that is used in all operations and waits through the page. The default is 30,000
        :param wait: The time (milliseconds) the fetcher will wait after everything finishes before closing the page and returning the `Response` object.
        :param page_action: Added for automation. A function that takes the `page` object and does the automation you need.
        :param wait_selector: Wait for a specific CSS selector to be in a specific state.
        :param init_script: An absolute path to a JavaScript file to be executed on page creation for all pages in this session.
        :param locale: Specify user locale, for example, `en-GB`, `de-DE`, etc. Locale will affect navigator.language value, Accept-Language request header value as well as number and date formatting
            rules. Defaults to the system default locale.
        :param timezone_id: Changes the timezone of the browser. Defaults to the system timezone.
        :param wait_selector_state: The state to wait for the selector given with `wait_selector`. The default state is `attached`.
        :param solve_cloudflare: Solves all types of the Cloudflare's Turnstile/Interstitial challenges before returning the response to you.
        :param real_chrome: If you have a Chrome browser installed on your device, enable this, and the Fetcher will launch an instance of your browser and use it.
        :param hide_canvas: Add random noise to canvas operations to prevent fingerprinting.
        :param block_webrtc: Forces WebRTC to respect proxy settings to prevent local IP address leak.
        :param allow_webgl: Enabled by default. Disabling it disables WebGL and WebGL 2.0 support entirely. Disabling WebGL is not recommended as many WAFs now check if WebGL is enabled.
        :param load_dom: Enabled by default, wait for all JavaScript on page(s) to fully load and execute.
        :param cdp_url: Instead of launching a new browser instance, connect to this CDP URL to control real browsers through CDP.
        :param google_search: Enabled by default, Scrapling will set the referer header to be as if this request came from a Google search of this website's domain name.
        :param extra_headers: A dictionary of extra headers to add to the request. If it includes a `Referer` header, `fetch` does not generate the `google_search` referer.
        :param proxy: The proxy to be used with requests, it can be a string or a dictionary with the keys 'server', 'username', and 'password' only.
        :param user_data_dir: Path to a User Data Directory, which stores browser session data like cookies and local storage. The default is to create a temporary directory.
        :param extra_flags: A list of additional browser flags to pass to the browser on launch.
        :param selector_config: The arguments that will be passed in the end while creating the final Selector's class.
        :param additional_args: Additional arguments to be passed to Playwright's context as additional settings, and it takes higher priority than Scrapling's settings.
        """
        # NOTE(review): `__validate__` is not defined in this class; presumably it comes from
        # `StealthySessionMixin` and populates `self._config` (read on the next line) - confirm.
        self.__validate__(**kwargs)
        super().__init__(max_pages=self._config.max_pages)
335
+
336
+ async def start(self) -> None:
337
+ """Create a browser for this instance and context."""
338
+ if not self.playwright:
339
+ self.playwright = await async_playwright().start()
340
+ try:
341
+ if self._config.cdp_url:
342
+ self.browser = await self.playwright.chromium.connect_over_cdp(endpoint_url=self._config.cdp_url)
343
+ if not self._config.proxy_rotator:
344
+ assert self.browser is not None
345
+ self.context = await self.browser.new_context(**self._context_options)
346
+ elif self._config.proxy_rotator:
347
+ self.browser = await self.playwright.chromium.launch(**self._browser_options)
348
+ else:
349
+ persistent_options = (
350
+ self._browser_options | self._context_options | {"user_data_dir": self._user_data_dir}
351
+ )
352
+ self.context = await self.playwright.chromium.launch_persistent_context(**persistent_options)
353
+
354
+ if self.context:
355
+ self.context = await self._initialize_context(self._config, self.context)
356
+
357
+ self._is_alive = True
358
+ except Exception:
359
+ # Clean up playwright if browser setup fails
360
+ await self.playwright.stop()
361
+ self.playwright = None
362
+ raise
363
+ else:
364
+ raise RuntimeError("Session has been already started")
365
+
366
    async def _cloudflare_solver(self, page: async_Page) -> None:  # pragma: no cover
        """Try to get past the Cloudflare challenge shown on the given Playwright page.

        The challenge type is detected from the page content. A "non-interactive" challenge is only waited
        out. For the other types, the challenge box is located and clicked; if the "Just a moment..." title
        is still present afterwards, the method calls itself again.

        NOTE(review): the retry at the end recurses with no attempt limit, and `bounding_box()` may return
        `None`, which would make the coordinate calculation fail - confirm whether callers rely on the
        page timeout to bound this.

        :param page: The targeted page
        :return: None
        """
        await self._wait_for_networkidle(page, timeout=5000)
        challenge_type = self._detect_cloudflare(await ResponseFactory._get_async_page_content(page))
        if not challenge_type:
            log.error("No Cloudflare challenge found.")
            return None
        else:
            log.info(f'The turnstile version discovered is "{challenge_type}"')
            if challenge_type == "non-interactive":
                # Nothing to click here: poll once per second until the interstitial title is gone
                while "<title>Just a moment...</title>" in (await ResponseFactory._get_async_page_content(page)):
                    log.info("Waiting for Cloudflare wait page to disappear.")
                    await page.wait_for_timeout(1000)
                    await page.wait_for_load_state()
                log.info("Cloudflare captcha is solved")
                return None

            else:
                # Fallback selector for the challenge box, used when the iframe can't provide a bounding box
                box_selector = "#cf_turnstile div, #cf-turnstile div, .turnstile>div>div"
                if challenge_type != "embedded":
                    box_selector = ".main-content p+div>div>div"
                    while "Verifying you are human." in (await ResponseFactory._get_async_page_content(page)):
                        # Waiting for the verify spinner to disappear, checking every 500ms if it disappeared
                        await page.wait_for_timeout(500)

                outer_box: Any = {}
                iframe = page.frame(url=__CF_PATTERN__)
                if iframe is not None:
                    await self._wait_for_page_stability(iframe, True, False)

                    if challenge_type != "embedded":
                        while not await (await iframe.frame_element()).is_visible():
                            # Double-checking that the iframe is loaded
                            await page.wait_for_timeout(500)

                    outer_box = await (await iframe.frame_element()).bounding_box()

                if not iframe or not outer_box:
                    # No usable iframe box: the challenge may already be gone, otherwise fall back to the selector
                    if "<title>Just a moment...</title>" not in (await ResponseFactory._get_async_page_content(page)):
                        log.info("Cloudflare captcha is solved")
                        return None

                    outer_box = await page.locator(box_selector).last.bounding_box()

                # Calculate the Captcha coordinates for any viewport
                # (a small random offset from the top-left corner of the box)
                captcha_x, captcha_y = outer_box["x"] + randint(26, 28), outer_box["y"] + randint(25, 27)

                # Left-click at the computed coordinates, holding the button for a random 100-200 ms
                await page.mouse.click(captcha_x, captcha_y, delay=randint(100, 200), button="left")
                await self._wait_for_networkidle(page)

                if challenge_type != "embedded":
                    attempts = 0
                    while "<title>Just a moment...</title>" in (await ResponseFactory._get_async_page_content(page)):
                        # Poll every 100 ms, giving up after 100 polls (about 10 seconds)
                        if attempts >= 100:
                            log.info("Cloudflare page didn't disappear after 10s, continuing...")
                            break
                        await page.wait_for_timeout(100)
                        attempts += 1

                    # await page.locator(box_selector).last.wait_for(state="detached")
                    # await page.locator(".zone-name-title").wait_for(state="hidden")

                await self._wait_for_page_stability(page, True, False)

                if "<title>Just a moment...</title>" not in (await ResponseFactory._get_async_page_content(page)):
                    log.info("Cloudflare captcha is solved")
                    return None
                else:
                    # Still on the interstitial: run the whole procedure again
                    log.info("Looks like Cloudflare captcha is still present, solving again")
                    return await self._cloudflare_solver(page)
442
+
443
+ async def fetch(self, url: str, **kwargs: Unpack[StealthFetchParams]) -> Response:
444
+ """Opens up the browser and do your request based on your chosen options.
445
+
446
+ :param url: The Target url.
447
+ :param google_search: Enabled by default, Scrapling will set the referer header to be as if this request came from a Google search of this website's domain name.
448
+ :param timeout: The timeout in milliseconds that is used in all operations and waits through the page. The default is 30,000
449
+ :param wait: The time (milliseconds) the fetcher will wait after everything finishes before closing the page and returning the ` Response ` object.
450
+ :param page_action: Added for automation. A function that takes the `page` object and does the automation you need.
451
+ :param extra_headers: A dictionary of extra headers to add to the request. _The referer set by the `google_search` argument takes priority over the referer set here if used together._
452
+ :param disable_resources: Drop requests for unnecessary resources for a speed boost.
453
+ Requests dropped are of type `font`, `image`, `media`, `beacon`, `object`, `imageset`, `texttrack`, `websocket`, `csp_report`, and `stylesheet`.
454
+ :param blocked_domains: A set of domain names to block requests to. Subdomains are also matched (e.g., ``"example.com"`` blocks ``"sub.example.com"`` too).
455
+ :param wait_selector: Wait for a specific CSS selector to be in a specific state.
456
+ :param wait_selector_state: The state to wait for the selector given with `wait_selector`. The default state is `attached`.
457
+ :param network_idle: Wait for the page until there are no network connections for at least 500 ms.
458
+ :param load_dom: Enabled by default, wait for all JavaScript on page(s) to fully load and execute.
459
+ :param solve_cloudflare: Solves all types of the Cloudflare's Turnstile/Interstitial challenges before returning the response to you.
460
+ :param selector_config: The arguments that will be passed in the end while creating the final Selector's class.
461
+ :param proxy: Static proxy to override rotator and session proxy. A new browser context will be created and used with it.
462
+ :return: A `Response` object.
463
+ """
464
+ static_proxy = kwargs.pop("proxy", None)
465
+
466
+ params = _validate(kwargs, self, StealthConfig)
467
+
468
+ if not self._is_alive: # pragma: no cover
469
+ raise RuntimeError("Context manager has been closed")
470
+
471
+ request_headers_keys = {h.lower() for h in params.extra_headers.keys()} if params.extra_headers else set()
472
+ referer = (
473
+ generate_convincing_referer(url)
474
+ if (params.google_search and "referer" not in request_headers_keys)
475
+ else None
476
+ )
477
+
478
+ for attempt in range(self._config.retries):
479
+ proxy: Optional[ProxyType] = None
480
+ if self._config.proxy_rotator and static_proxy is None:
481
+ proxy = self._config.proxy_rotator.get_proxy()
482
+ else:
483
+ proxy = static_proxy
484
+
485
+ async with self._page_generator(
486
+ params.timeout, params.extra_headers, params.disable_resources, proxy, params.blocked_domains
487
+ ) as page_info:
488
+ final_response = [None]
489
+ page = page_info.page
490
+ page.on("response", self._create_response_handler(page_info, final_response))
491
+
492
+ try:
493
+ first_response = await page.goto(url, referer=referer)
494
+ await self._wait_for_page_stability(page, params.load_dom, params.network_idle)
495
+
496
+ if not first_response:
497
+ raise RuntimeError(f"Failed to get response for {url}")
498
+
499
+ if params.solve_cloudflare:
500
+ await self._cloudflare_solver(page)
501
+ # Make sure the page is fully loaded after the captcha
502
+ await self._wait_for_page_stability(page, params.load_dom, params.network_idle)
503
+
504
+ if params.page_action:
505
+ try:
506
+ _ = await params.page_action(page)
507
+ except Exception as e: # pragma: no cover
508
+ log.error(f"Error executing page_action: {e}")
509
+
510
+ if params.wait_selector:
511
+ try:
512
+ waiter: AsyncLocator = page.locator(params.wait_selector)
513
+ await waiter.first.wait_for(state=params.wait_selector_state)
514
+ await self._wait_for_page_stability(page, params.load_dom, params.network_idle)
515
+ except Exception as e: # pragma: no cover
516
+ log.error(f"Error waiting for selector {params.wait_selector}: {e}")
517
+
518
+ await page.wait_for_timeout(params.wait)
519
+
520
+ response = await ResponseFactory.from_async_playwright_response(
521
+ page, first_response, final_response[0], params.selector_config, meta={"proxy": proxy}
522
+ )
523
+ return response
524
+
525
+ except Exception as e:
526
+ page_info.mark_error()
527
+ if attempt < self._config.retries - 1:
528
+ if is_proxy_error(e):
529
+ log.warning(
530
+ f"Proxy '{proxy}' failed (attempt {attempt + 1}) | Retrying in {self._config.retry_delay}s..."
531
+ )
532
+ else:
533
+ log.warning(
534
+ f"Attempt {attempt + 1} failed: {e}. Retrying in {self._config.retry_delay}s..."
535
+ )
536
+ await asyncio_sleep(self._config.retry_delay)
537
+ else:
538
+ log.error(f"Failed after {self._config.retries} attempts: {e}")
539
+ raise
540
+
541
+ raise RuntimeError("Request failed") # pragma: no cover
engines/_browsers/_types.py ADDED
@@ -0,0 +1,118 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from io import BytesIO
2
+
3
+ from curl_cffi.requests import (
4
+ ProxySpec,
5
+ CookieTypes,
6
+ BrowserTypeLiteral,
7
+ )
8
+
9
+ from scrapling.core._types import (
10
+ Dict,
11
+ List,
12
+ Set,
13
+ Tuple,
14
+ Mapping,
15
+ Optional,
16
+ Callable,
17
+ Sequence,
18
+ TypedDict,
19
+ TypeAlias,
20
+ SetCookieParam,
21
+ SelectorWaitStates,
22
+ )
23
+ from scrapling.engines.toolbelt.proxy_rotation import ProxyRotator
24
+
25
+ # Type alias for `impersonate` parameter - accepts a single browser or list of browsers
26
+ ImpersonateType: TypeAlias = BrowserTypeLiteral | List[BrowserTypeLiteral] | None
27
+
28
+
29
+ # Types for session initialization
30
class RequestsSession(TypedDict, total=False):
    """Keyword arguments accepted when initializing a requests-based session.

    Every key is optional (`total=False`).
    """

    impersonate: ImpersonateType  # A single browser name, a list of them, or None
    http3: Optional[bool]
    stealthy_headers: Optional[bool]
    proxies: Optional[ProxySpec]
    proxy: Optional[str]
    proxy_auth: Optional[Tuple[str, str]]
    proxy_rotator: Optional[ProxyRotator]
    timeout: Optional[int | float]
    headers: Optional[Mapping[str, Optional[str]]]
    retries: Optional[int]
    retry_delay: Optional[int]
    follow_redirects: Optional[bool]
    max_redirects: Optional[int]
    verify: Optional[bool]
    cert: Optional[str | Tuple[str, str]]
    selector_config: Optional[Dict]
47
+
48
+
49
+ # Types for GET request method parameters
50
class GetRequestParams(RequestsSession, total=False):
    """Keyword arguments for GET requests: every `RequestsSession` key plus the ones below."""

    params: Optional[Dict | List | Tuple]
    cookies: Optional[CookieTypes]
    auth: Optional[Tuple[str, str]]
54
+
55
+
56
+ # Types for POST/PUT/DELETE request method parameters
57
class DataRequestParams(GetRequestParams, total=False):
    """Keyword arguments for POST/PUT/DELETE requests: every `GetRequestParams` key plus a request body."""

    data: Optional[Dict[str, str] | List[Tuple] | str | BytesIO | bytes]
    json: Optional[Dict | List]
60
+
61
+
62
+ # Types for browser session
63
class PlaywrightSession(TypedDict, total=False):
    """Keyword arguments accepted when creating a browser session.

    Every key is optional (`total=False`). The keys mirror the fields of `PlaywrightConfig` in
    `_validators.py`, which holds the defaults and validation rules.
    """

    max_pages: int
    headless: bool
    disable_resources: bool
    network_idle: bool
    load_dom: bool
    wait_selector: Optional[str]
    wait_selector_state: SelectorWaitStates
    cookies: Sequence[SetCookieParam] | None
    google_search: bool
    wait: int | float
    timezone_id: str | None
    page_action: Optional[Callable]
    proxy: Optional[str | Dict[str, str] | Tuple]
    proxy_rotator: Optional[ProxyRotator]
    extra_headers: Optional[Dict[str, str]]
    timeout: int | float
    init_script: Optional[str]
    user_data_dir: str
    selector_config: Optional[Dict]
    additional_args: Optional[Dict]
    locale: Optional[str]
    real_chrome: bool
    cdp_url: Optional[str]
    useragent: Optional[str]
    extra_flags: Optional[List[str]]
    blocked_domains: Optional[Set[str]]
    retries: int
    retry_delay: int | float
92
+
93
+
94
class PlaywrightFetchParams(TypedDict, total=False):
    """Keyword arguments accepted by a browser session's `fetch` call.

    Every key is optional (`total=False`).
    """

    load_dom: bool
    wait: int | float
    network_idle: bool
    google_search: bool
    timeout: int | float
    disable_resources: bool
    wait_selector: Optional[str]
    page_action: Optional[Callable]
    selector_config: Optional[Dict]
    extra_headers: Optional[Dict[str, str]]
    wait_selector_state: SelectorWaitStates
    blocked_domains: Optional[Set[str]]
    proxy: Optional[str | Dict[str, str]]
108
+
109
+
110
class StealthSession(PlaywrightSession, total=False):
    """Keyword arguments for a stealthy browser session: every `PlaywrightSession` key plus the ones below."""

    allow_webgl: bool
    hide_canvas: bool
    block_webrtc: bool
    solve_cloudflare: bool
115
+
116
+
117
class StealthFetchParams(PlaywrightFetchParams, total=False):
    """Keyword arguments for a stealthy `fetch` call: every `PlaywrightFetchParams` key plus `solve_cloudflare`."""

    solve_cloudflare: bool
engines/_browsers/_validators.py ADDED
@@ -0,0 +1,229 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from pathlib import Path
2
+ from typing import Annotated
3
+ from functools import lru_cache
4
+ from urllib.parse import urlparse
5
+ from dataclasses import dataclass, fields
6
+
7
+ from msgspec import Struct, Meta, convert, ValidationError
8
+
9
+ from scrapling.core._types import (
10
+ Any,
11
+ Dict,
12
+ List,
13
+ Set,
14
+ Tuple,
15
+ Optional,
16
+ Callable,
17
+ Sequence,
18
+ overload,
19
+ SetCookieParam,
20
+ SelectorWaitStates,
21
+ )
22
+ from scrapling.engines.toolbelt.proxy_rotation import ProxyRotator
23
+ from scrapling.engines.toolbelt.navigation import construct_proxy_dict
24
+ from scrapling.engines._browsers._types import PlaywrightFetchParams, StealthFetchParams
25
+
26
+
27
+ # Custom validators for msgspec
28
+ @lru_cache(8)
29
+ def _is_invalid_file_path(value: str) -> bool | str: # pragma: no cover
30
+ """Fast file path validation"""
31
+ path = Path(value)
32
+ if not path.exists():
33
+ return f"Init script path not found: {value}"
34
+ if not path.is_file():
35
+ return f"Init script is not a file: {value}"
36
+ if not path.is_absolute():
37
+ return f"Init script is not a absolute path: {value}"
38
+ return False
39
+
40
+
41
+ @lru_cache(2)
42
+ def _is_invalid_cdp_url(cdp_url: str) -> bool | str:
43
+ """Fast CDP URL validation"""
44
+ if not cdp_url.startswith(("ws://", "wss://")):
45
+ return "CDP URL must use 'ws://' or 'wss://' scheme"
46
+
47
+ netloc = urlparse(cdp_url).netloc
48
+ if not netloc: # pragma: no cover
49
+ return "Invalid hostname for the CDP URL"
50
+ return False
51
+
52
+
53
# Type aliases for cleaner annotations
PagesCount = Annotated[int, Meta(ge=1, le=50)]
RetriesCount = Annotated[int, Meta(ge=1, le=10)]
# Non-negative duration that may be whole or fractional, matching the `int | float` declared for
# `wait`, `timeout`, and `retry_delay` in `_types.py`. The union has to be the single type argument:
# in `Annotated[int, float, ...]` only `int` is the type and `float` is treated as plain metadata.
Seconds = Annotated[int | float, Meta(ge=0)]
57
+
58
+
59
class PlaywrightConfig(Struct, kw_only=True, frozen=False, weakref=True):
    """Validated settings for a Playwright-based browser session.

    msgspec checks the declared types and numeric bounds; `__post_init__` then applies the checks and
    normalization that annotations alone cannot express.
    """

    max_pages: PagesCount = 1
    headless: bool = True
    disable_resources: bool = False
    network_idle: bool = False
    load_dom: bool = True
    wait_selector: Optional[str] = None
    wait_selector_state: SelectorWaitStates = "attached"
    cookies: Sequence[SetCookieParam] | None = []
    google_search: bool = True
    wait: Seconds = 0
    timezone_id: str | None = ""
    page_action: Optional[Callable] = None
    proxy: Optional[str | Dict[str, str] | Tuple] = None  # The default value for proxy in Playwright's source is `None`
    proxy_rotator: Optional[ProxyRotator] = None
    extra_headers: Optional[Dict[str, str]] = None
    timeout: Seconds = 30000
    init_script: Optional[str] = None
    user_data_dir: str = ""
    selector_config: Optional[Dict] = {}
    additional_args: Optional[Dict] = {}
    locale: str | None = None
    real_chrome: bool = False
    cdp_url: Optional[str] = None
    useragent: Optional[str] = None
    extra_flags: Optional[List[str]] = None
    blocked_domains: Optional[Set[str]] = None
    retries: RetriesCount = 3
    retry_delay: Seconds = 1

    def __post_init__(self):  # pragma: no cover
        """Run the cross-field checks and normalize empty values once msgspec has built the struct.

        :raises TypeError: If `page_action` is set but not callable.
        :raises ValueError: If both `proxy` and `proxy_rotator` are given, or if `cdp_url` or `init_script`
            fail their validators.
        """
        action = self.page_action
        if action and not callable(action):
            raise TypeError(f"page_action must be callable, got {type(action).__name__}")

        if self.proxy:
            if self.proxy_rotator:
                raise ValueError(
                    "Cannot use 'proxy_rotator' together with 'proxy'. "
                    "Use either a static proxy or proxy rotation, not both."
                )
            self.proxy = construct_proxy_dict(self.proxy)

        if self.cdp_url and (cdp_msg := _is_invalid_cdp_url(self.cdp_url)):
            raise ValueError(cdp_msg)

        # `None` and empty values all become a fresh empty container of the expected kind
        self.cookies = self.cookies or []
        self.extra_flags = self.extra_flags or []
        self.selector_config = self.selector_config or {}
        self.additional_args = self.additional_args or {}

        if self.init_script is not None and (validation_msg := _is_invalid_file_path(self.init_script)):
            raise ValueError(validation_msg)
120
+
121
+
122
class StealthConfig(PlaywrightConfig, kw_only=True, frozen=False, weakref=True):
    """`PlaywrightConfig` extended with the stealth-only switches."""

    allow_webgl: bool = True
    hide_canvas: bool = False
    block_webrtc: bool = False
    solve_cloudflare: bool = False

    def __post_init__(self):
        """Run the base validation, then enforce a minimum timeout when Cloudflare solving is on."""
        super(StealthConfig, self).__post_init__()
        if self.solve_cloudflare:
            # Solving the challenge takes time, so the timeout is never left below 60 seconds
            self.timeout = max(self.timeout, 60_000)
134
+
135
+
136
@dataclass
class _fetch_params:
    """A dataclass of all parameters used by `fetch` calls.

    Built by `validate_fetch`, which fills each field from the call's keyword arguments or, when a
    key is not passed, from the session's `_config`.
    """

    google_search: bool
    timeout: Seconds
    wait: Seconds
    page_action: Optional[Callable]
    extra_headers: Optional[Dict[str, str]]
    disable_resources: bool
    wait_selector: Optional[str]
    wait_selector_state: SelectorWaitStates
    network_idle: bool
    load_dom: bool
    blocked_domains: Optional[Set[str]]
    solve_cloudflare: bool  # `validate_fetch` defaults this to False when neither source provides it
    selector_config: Dict
153
+
154
+
155
def validate_fetch(
    method_kwargs: Dict | PlaywrightFetchParams | StealthFetchParams,
    session: Any,
    model: type[PlaywrightConfig] | type[StealthConfig],
) -> _fetch_params:  # pragma: no cover
    """Merge per-call `fetch` arguments with the session's configuration.

    For every field of `_fetch_params`, a value passed in `method_kwargs` wins and is validated through
    `model`; otherwise the value is read from `session._config` when it has that attribute.

    :param method_kwargs: The keyword arguments given to `fetch`.
    :param session: The session object; its `_config` attribute (if any) supplies the fallback values.
    :param model: The config struct used to validate the explicitly passed values.
    :return: The merged parameters.
    :raises TypeError: If an explicitly passed value fails validation (raised by `validate`).
    """
    supplied: Dict[str, Any] = dict(method_kwargs)
    missing = object()
    session_config = getattr(session, "_config", None)

    merged: Dict[str, Any] = {}
    explicit: Dict[str, Any] = {}
    for spec in fields(_fetch_params):
        name = spec.name
        if name in supplied:
            explicit[name] = supplied[name]
            continue
        inherited = getattr(session_config, name, missing)
        if inherited is not missing:
            merged[name] = inherited

    if explicit:
        checked = validate(explicit, model)
        # Copy back only what the caller passed, so the model's defaults never replace session values
        for name in explicit:
            if hasattr(checked, name):
                merged[name] = getattr(checked, name)

        # Keep an explicit solve_cloudflare even when the model has no such field
        if "solve_cloudflare" in explicit:
            merged["solve_cloudflare"] = explicit["solve_cloudflare"]

    # Fields that some models/sessions don't define still need a value to build the dataclass
    merged.setdefault("solve_cloudflare", False)
    merged.setdefault("blocked_domains", None)

    return _fetch_params(**merged)
193
+
194
+
195
# Cache default values for each model to reduce validation overhead.
# Maps a model's class name to a {field name: default value} dict; `_filter_defaults` reads it.
models_default_values = {}

for _model in (StealthConfig, PlaywrightConfig):
    _defaults = {}
    # Both attributes are read off the msgspec struct class; the guard keeps this safe if they are absent
    if hasattr(_model, "__struct_defaults__") and hasattr(_model, "__struct_fields__"):
        # NOTE(review): zip() pairs the two tuples by position, which assumes they have the same
        # length (i.e., every field has a default) - confirm against msgspec's documentation.
        for field_name, default_value in zip(_model.__struct_fields__, _model.__struct_defaults__):  # type: ignore
            # Skip factory defaults - these are msgspec._core.Factory instances
            if type(default_value).__name__ != "Factory":
                _defaults[field_name] = default_value

    models_default_values[_model.__name__] = _defaults.copy()
207
+
208
+
209
def _filter_defaults(params: Dict, model: str) -> Dict:
    """Return a copy of `params` without the entries that equal the model's cached default.

    :param params: The parameters about to be validated.
    :param model: The model's class name, used as the key into `models_default_values`.
    :return: A new dict holding only the keys with no cached default or with a non-default value.
    """
    known_defaults = models_default_values[model]
    kept = {}
    for name, value in params.items():
        # Order matters: a key without a cached default is kept without comparing anything
        if name not in known_defaults or value != known_defaults[name]:
            kept[name] = value
    return kept
213
+
214
+
215
@overload
def validate(params: Dict, model: type[StealthConfig]) -> StealthConfig: ...


@overload
def validate(params: Dict, model: type[PlaywrightConfig]) -> PlaywrightConfig: ...


def validate(params: Dict, model: type[PlaywrightConfig] | type[StealthConfig]) -> PlaywrightConfig | StealthConfig:
    """Build a validated `model` instance from `params`.

    :param params: The raw parameters.
    :param model: The config struct class to convert into.
    :return: An instance of `model`.
    :raises TypeError: If msgspec rejects the parameters; the original `ValidationError` is chained.
    """
    try:
        # Values equal to the model's defaults are dropped first, so only real overrides get validated
        return convert(_filter_defaults(params, model.__name__), model)
    except ValidationError as e:
        raise TypeError(f"Invalid argument type: {e}") from e
engines/constants.py ADDED
@@ -0,0 +1,99 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
# Resource types whose requests are dropped for speed when `disable_resources` is enabled
# (the same list is quoted in the `disable_resources` docstrings of the fetchers).
EXTRA_RESOURCES = {
    "font",
    "image",
    "media",
    "beacon",
    "object",
    "imageset",
    "texttrack",
    "websocket",
    "csp_report",
    "stylesheet",
}
14
+
15
# Browser flags that are treated as harmful.
# NOTE(review): presumably these are removed from the browser's default launch arguments - confirm at the usage site.
HARMFUL_ARGS = (
    # These are ignored to reduce detection and possibly avoid abuse of the popup crashing bug: https://issues.chromium.org/issues/340836884
    "--enable-automation",
    "--disable-popup-blocking",
    "--disable-component-update",
    "--disable-default-apps",
    "--disable-extensions",
)
23
+
24
# Baseline Chromium command-line flags.
DEFAULT_ARGS = (
    # Speed up chromium browsers by default
    "--no-pings",
    "--no-first-run",
    "--disable-infobars",
    "--disable-breakpad",
    "--no-service-autorun",
    "--homepage=about:blank",
    "--password-store=basic",
    "--disable-hang-monitor",
    "--no-default-browser-check",
    "--disable-session-crashed-bubble",
    "--disable-search-engine-choice-screen",
)
38
+
39
# Additional Chromium command-line flags for the stealth setup.
STEALTH_ARGS = (
    # Explanation: https://peter.sh/experiments/chromium-command-line-switches/
    # Generally this will make the browser faster and less detectable
    # "--incognito",
    "--test-type",
    "--lang=en-US",
    "--mute-audio",
    "--disable-sync",
    "--hide-scrollbars",
    "--disable-logging",
    "--start-maximized",  # For headless check bypass
    "--enable-async-dns",
    "--accept-lang=en-US",
    "--use-mock-keychain",
    "--disable-translate",
    "--disable-voice-input",
    "--window-position=0,0",
    "--disable-wake-on-wifi",
    "--ignore-gpu-blocklist",
    "--enable-tcp-fast-open",
    "--enable-web-bluetooth",
    "--disable-cloud-import",
    "--disable-print-preview",
    "--disable-dev-shm-usage",
    # '--disable-popup-blocking',
    "--metrics-recording-only",
    "--disable-crash-reporter",
    "--disable-partial-raster",
    "--disable-gesture-typing",
    "--disable-checker-imaging",
    "--disable-prompt-on-repost",
    "--force-color-profile=srgb",
    "--font-render-hinting=none",
    "--aggressive-cache-discard",
    "--disable-cookie-encryption",
    "--disable-domain-reliability",
    "--disable-threaded-animation",
    "--disable-threaded-scrolling",
    "--enable-simple-cache-backend",
    "--disable-background-networking",
    "--enable-surface-synchronization",
    "--disable-image-animation-resync",
    "--disable-renderer-backgrounding",
    "--disable-ipc-flooding-protection",
    "--prerender-from-omnibox=disabled",
    "--safebrowsing-disable-auto-update",
    "--disable-offer-upload-credit-cards",
    "--disable-background-timer-throttling",
    "--disable-new-content-rendering-timeout",
    "--run-all-compositor-stages-before-draw",
    "--disable-client-side-phishing-detection",
    "--disable-backgrounding-occluded-windows",
    "--disable-layer-tree-host-memory-pressure",
    "--autoplay-policy=user-gesture-required",
    "--disable-offer-store-unmasked-wallet-cards",
    "--disable-blink-features=AutomationControlled",
    "--disable-component-extensions-with-background-pages",
    "--enable-features=NetworkService,NetworkServiceInProcess,TrustTokens,TrustTokensAlwaysAllowIssuance",
    "--blink-settings=primaryHoverType=2,availableHoverTypes=2,primaryPointerType=4,availablePointerTypes=4",
    "--disable-features=AudioServiceOutOfProcess,TranslateUI,BlinkGenPropertyTrees",
)
engines/static.py ADDED
@@ -0,0 +1,770 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from abc import ABC
2
+ from random import choice
3
+ from time import sleep as time_sleep
4
+ from asyncio import sleep as asyncio_sleep
5
+
6
+ from curl_cffi.curl import CurlError
7
+ from curl_cffi import CurlHttpVersion
8
+ from curl_cffi.requests import (
9
+ BrowserTypeLiteral,
10
+ Session as CurlSession,
11
+ AsyncSession as AsyncCurlSession,
12
+ )
13
+
14
+ from scrapling.core.utils import log
15
+ from scrapling.core._types import (
16
+ Any,
17
+ Dict,
18
+ Tuple,
19
+ Unpack,
20
+ Optional,
21
+ Awaitable,
22
+ SUPPORTED_HTTP_METHODS,
23
+ )
24
+
25
+ from .toolbelt.custom import Response
26
+ from .toolbelt.convertor import ResponseFactory
27
+ from .toolbelt.proxy_rotation import ProxyRotator, is_proxy_error
28
+ from ._browsers._types import RequestsSession, GetRequestParams, DataRequestParams, ImpersonateType
29
+ from .toolbelt.fingerprints import generate_convincing_referer, generate_headers, __default_useragent__
30
+
31
# Sentinel compared by identity in the `_make_request` methods below: when the stored
# session attribute is this object (and the context-manager entry method is `None`),
# a throwaway curl session is created for that single request and closed afterwards.
_NO_SESSION: Any = object()
32
+
33
+
34
+ def _select_random_browser(impersonate: ImpersonateType) -> Optional[BrowserTypeLiteral]:
35
+ """
36
+ Handle browser selection logic for the ` impersonate ` parameter.
37
+
38
+ If impersonate is a list, randomly select one browser from it.
39
+ If it's a string or None, return as is.
40
+ """
41
+ if isinstance(impersonate, list):
42
+ if not impersonate:
43
+ return None
44
+ return choice(impersonate)
45
+ return impersonate
46
+
47
+
48
+ class _ConfigurationLogic(ABC):
49
+ # Core Logic Handler (Internal Engine)
50
+ __slots__ = (
51
+ "_default_impersonate",
52
+ "_stealth",
53
+ "_default_proxies",
54
+ "_default_proxy",
55
+ "_default_proxy_auth",
56
+ "_default_timeout",
57
+ "_default_headers",
58
+ "_default_retries",
59
+ "_default_retry_delay",
60
+ "_default_follow_redirects",
61
+ "_default_max_redirects",
62
+ "_default_verify",
63
+ "_default_cert",
64
+ "_default_http3",
65
+ "selector_config",
66
+ "_is_alive",
67
+ "_proxy_rotator",
68
+ )
69
+
70
+ def __init__(self, **kwargs: Unpack[RequestsSession]):
71
+ self._default_impersonate = kwargs.get("impersonate", "chrome")
72
+ self._stealth = kwargs.get("stealthy_headers", True)
73
+ self._default_proxies = kwargs.get("proxies") or {}
74
+ self._default_proxy = kwargs.get("proxy") or None
75
+ self._default_proxy_auth = kwargs.get("proxy_auth") or None
76
+ self._default_timeout = kwargs.get("timeout", 30)
77
+ self._default_headers = kwargs.get("headers") or {}
78
+ self._default_retries = kwargs.get("retries", 3)
79
+ self._default_retry_delay = kwargs.get("retry_delay", 1)
80
+ self._default_follow_redirects = kwargs.get("follow_redirects", True)
81
+ self._default_max_redirects = kwargs.get("max_redirects", 30)
82
+ self._default_verify = kwargs.get("verify", True)
83
+ self._default_cert = kwargs.get("cert") or None
84
+ self._default_http3 = kwargs.get("http3", False)
85
+ self.selector_config = kwargs.get("selector_config") or {}
86
+ self._is_alive = False
87
+ self._proxy_rotator: Optional[ProxyRotator] = kwargs.get("proxy_rotator")
88
+
89
+ if self._proxy_rotator and (self._default_proxy or self._default_proxies):
90
+ raise ValueError(
91
+ "Cannot use 'proxy_rotator' together with 'proxy' or 'proxies'. "
92
+ "Use either a static proxy or proxy rotation, not both."
93
+ )
94
+
95
+ @staticmethod
96
+ def _get_param(kwargs: Dict, key: str, default: Any) -> Any:
97
+ """Get parameter from kwargs if present, otherwise return default."""
98
+ return kwargs[key] if key in kwargs else default
99
+
100
+ def _merge_request_args(self, **method_kwargs) -> Dict[str, Any]:
101
+ """Merge request-specific arguments with default session arguments."""
102
+ url = method_kwargs.pop("url")
103
+
104
+ # Get parameters from kwargs or use defaults
105
+ impersonate = self._get_param(method_kwargs, "impersonate", self._default_impersonate)
106
+ impersonate = _select_random_browser(impersonate)
107
+ http3_enabled = self._get_param(method_kwargs, "http3", self._default_http3)
108
+ stealth = self._get_param(method_kwargs, "stealth", self._stealth)
109
+
110
+ final_args = {
111
+ "url": url,
112
+ # Curl automatically generates the suitable browser headers when you use `impersonate`
113
+ "headers": self._headers_job(
114
+ url,
115
+ self._get_param(method_kwargs, "headers", self._default_headers),
116
+ stealth,
117
+ bool(impersonate),
118
+ ),
119
+ "proxies": self._get_param(method_kwargs, "proxies", self._default_proxies),
120
+ "proxy": self._get_param(method_kwargs, "proxy", self._default_proxy),
121
+ "proxy_auth": self._get_param(method_kwargs, "proxy_auth", self._default_proxy_auth),
122
+ "timeout": self._get_param(method_kwargs, "timeout", self._default_timeout),
123
+ "allow_redirects": self._get_param(method_kwargs, "follow_redirects", self._default_follow_redirects),
124
+ "max_redirects": self._get_param(method_kwargs, "max_redirects", self._default_max_redirects),
125
+ "verify": self._get_param(method_kwargs, "verify", self._default_verify),
126
+ "cert": self._get_param(method_kwargs, "cert", self._default_cert),
127
+ "impersonate": impersonate,
128
+ }
129
+
130
+ # Add any remaining parameters that weren't explicitly handled above
131
+ # Skip the ones we already processed plus internal params
132
+ skip_keys = {
133
+ "impersonate",
134
+ "http3",
135
+ "stealth",
136
+ "headers",
137
+ "proxies",
138
+ "proxy",
139
+ "proxy_auth",
140
+ "timeout",
141
+ "follow_redirects",
142
+ "max_redirects",
143
+ "verify",
144
+ "cert",
145
+ "retries",
146
+ "retry_delay",
147
+ "selector_config",
148
+ # Browser session params (ignored by HTTP sessions)
149
+ "extra_headers",
150
+ "google_search",
151
+ }
152
+ for k, v in method_kwargs.items():
153
+ if k not in skip_keys and v is not None:
154
+ final_args[k] = v
155
+
156
+ if http3_enabled: # pragma: no cover
157
+ final_args["http_version"] = CurlHttpVersion.V3ONLY
158
+ if impersonate:
159
+ log.warning(
160
+ "The argument `http3` might cause errors if used with `impersonate` argument, try switching it off if you encounter any curl errors."
161
+ )
162
+
163
+ return final_args
164
+
165
+ def _headers_job(self, url, headers: Dict, stealth: bool, impersonate_enabled: bool) -> Dict:
166
+ """
167
+ 1. Adds a useragent to the headers if it doesn't have one
168
+ 2. Generates real headers and append them to current headers
169
+ 3. Generates a referer header that looks like as if this request came from a Google's search of the current URL's domain.
170
+ """
171
+ # Merge session headers with request headers, request takes precedence (if it was set)
172
+ final_headers = {**self._default_headers, **(headers if headers else {})}
173
+ headers_keys = {k.lower() for k in final_headers}
174
+ if stealth:
175
+ if "referer" not in headers_keys:
176
+ final_headers["referer"] = generate_convincing_referer(url)
177
+
178
+ if not impersonate_enabled: # Curl will generate the suitable headers
179
+ extra_headers = generate_headers(browser_mode=False)
180
+ final_headers.update(
181
+ {k: v for k, v in extra_headers.items() if k.lower() not in headers_keys}
182
+ ) # Don't overwrite user-supplied headers
183
+
184
+ elif "user-agent" not in headers_keys and not impersonate_enabled: # pragma: no cover
185
+ final_headers["User-Agent"] = __default_useragent__
186
+ log.debug(f"Can't find useragent in headers so '{final_headers['User-Agent']}' was used.")
187
+
188
+ return final_headers
189
+
190
+
191
class _SyncSessionLogic(_ConfigurationLogic):
    """Synchronous request logic that sends requests through a `curl_cffi` `Session`."""

    __slots__ = ("_curl_session",)

    def __init__(self, **kwargs: Unpack[RequestsSession]):
        super().__init__(**kwargs)
        self._curl_session: Optional[CurlSession] = None

    def __enter__(self):
        """
        Creates and returns a new synchronous Fetcher Session

        :raises RuntimeError: If this instance already has an active session.
        """
        if self._is_alive:
            raise RuntimeError("This FetcherSession instance already has an active synchronous session.")

        self._curl_session = CurlSession()
        self._is_alive = True
        return self

    def __exit__(self, exc_type, exc_val, exc_tb):
        """Closes the active synchronous session managed by this instance, if any."""
        # For type checking (not accessed error)
        _ = (
            exc_type,
            exc_val,
            exc_tb,
        )
        if self._curl_session:
            self._curl_session.close()
            self._curl_session = None

        self._is_alive = False

    def _make_request(self, method: SUPPORTED_HTTP_METHODS, stealth: Optional[bool] = None, **kwargs) -> Response:
        """
        Perform an HTTP request using the configured session, retrying when curl raises an error.

        :param method: The HTTP method to use.
        :param stealth: Per-request override of the session's stealthy headers setting. `None` keeps the session default.
        :param kwargs: The request arguments (`url` is required); they are merged with the session defaults.
        :return: A `Response` object.
        :raises RuntimeError: If there is no session available to send the request with.
        :raises CurlError: If the last attempt fails as well.
        """
        stealth = self._stealth if stealth is None else stealth

        selector_config = self._get_param(kwargs, "selector_config", self.selector_config) or self.selector_config
        max_retries = self._get_param(kwargs, "retries", self._default_retries)
        retry_delay = self._get_param(kwargs, "retry_delay", self._default_retry_delay)
        # `proxy` is always forwarded explicitly to `_merge_request_args` below, which treats a present key
        # as an override. So the session-level proxy must be resolved here, otherwise it is silently dropped.
        # An explicit per-request `proxy=None` still takes precedence over the session default.
        static_proxy = kwargs.pop("proxy", self._default_proxy)

        session = self._curl_session
        one_off_request = False
        if session is _NO_SESSION and self.__enter__ is None:
            # For usage inside FetcherClient
            # It turns out `curl_cffi` caches impersonation state, so if you turned it off, then on then off, it won't be off on the last time.
            session = CurlSession()
            one_off_request = True

        if not session:
            raise RuntimeError("No active session available.")  # pragma: no cover

        try:
            for attempt in range(max_retries):
                # A new proxy is requested from the rotator on every attempt unless a static proxy is in effect
                if self._proxy_rotator and static_proxy is None:
                    proxy = self._proxy_rotator.get_proxy()
                else:
                    proxy = static_proxy

                request_args = self._merge_request_args(stealth=stealth, proxy=proxy, **kwargs)
                try:
                    response = session.request(method, **request_args)
                    result = ResponseFactory.from_http_request(response, selector_config, meta={"proxy": proxy})
                    return result
                except CurlError as e:  # pragma: no cover
                    if attempt < max_retries - 1:
                        # Now if the rotator is enabled, we will try again with the new proxy
                        # If it's not enabled, then we will try again with the same proxy
                        if is_proxy_error(e):
                            log.warning(
                                f"Proxy '{proxy}' failed (attempt {attempt + 1}) | Retrying in {retry_delay} seconds..."
                            )
                        else:
                            log.warning(f"Attempt {attempt + 1} failed: {e}. Retrying in {retry_delay} seconds...")
                        time_sleep(retry_delay)
                    else:
                        log.error(f"Failed after {max_retries} attempts: {e}")
                        raise  # Raise the exception if all retries fail
        finally:
            # Sessions created only for this request are not kept around
            if session and one_off_request:
                session.close()

        raise RuntimeError("No active session available.")  # pragma: no cover

    def get(self, url: str, **kwargs: Unpack[GetRequestParams]) -> Response:
        """
        Perform a GET request.

        Any additional keyword arguments are passed to the `curl_cffi.requests.Session().request()` method.

        :param url: Target URL for the request.
        :param kwargs: Additional keyword arguments including:
            - params: Query string parameters for the request.
            - headers: Headers to include in the request.
            - cookies: Cookies to use in the request.
            - timeout: Number of seconds to wait before timing out.
            - follow_redirects: Whether to follow redirects. Defaults to True.
            - max_redirects: Maximum number of redirects. Default 30, use -1 for unlimited.
            - retries: Number of retry attempts. Defaults to 3.
            - retry_delay: Number of seconds to wait between retry attempts. Defaults to 1 second.
            - proxies: Dict of proxies to use.
            - proxy: Proxy URL to use. Format: "http://username:password@localhost:8030".
            - proxy_auth: HTTP basic auth for proxy, tuple of (username, password).
            - auth: HTTP basic auth tuple of (username, password). Only basic auth is supported.
            - verify: Whether to verify HTTPS certificates.
            - cert: Tuple of (cert, key) filenames for the client certificate.
            - impersonate: Browser version to impersonate. Automatically defaults to the latest available Chrome version.
            - http3: Whether to use HTTP3. Defaults to False. It might be problematic if used with `impersonate`.
            - stealthy_headers: If enabled (default), it creates and adds real browser headers.
        :return: A `Response` object.
        """
        stealthy_headers = kwargs.pop("stealthy_headers", None)
        return self._make_request("GET", stealth=stealthy_headers, url=url, **kwargs)

    def post(self, url: str, **kwargs: Unpack[DataRequestParams]) -> Response:
        """
        Perform a POST request.

        Any additional keyword arguments are passed to the `curl_cffi.requests.Session().request()` method.

        :param url: Target URL for the request.
        :param kwargs: Additional keyword arguments including:
            - data: Form data to include in the request body.
            - json: A JSON serializable object to include in the body of the request.
            - params: Query string parameters for the request.
            - headers: Headers to include in the request.
            - cookies: Cookies to use in the request.
            - timeout: Number of seconds to wait before timing out.
            - follow_redirects: Whether to follow redirects. Defaults to True.
            - max_redirects: Maximum number of redirects. Default 30, use -1 for unlimited.
            - retries: Number of retry attempts. Defaults to 3.
            - retry_delay: Number of seconds to wait between retry attempts. Defaults to 1 second.
            - proxies: Dict of proxies to use.
            - proxy: Proxy URL to use. Format: "http://username:password@localhost:8030".
            - proxy_auth: HTTP basic auth for proxy, tuple of (username, password).
            - auth: HTTP basic auth tuple of (username, password). Only basic auth is supported.
            - verify: Whether to verify HTTPS certificates.
            - cert: Tuple of (cert, key) filenames for the client certificate.
            - impersonate: Browser version to impersonate. Automatically defaults to the latest available Chrome version.
            - http3: Whether to use HTTP3. Defaults to False. It might be problematic if used with `impersonate`.
            - stealthy_headers: If enabled (default), it creates and adds real browser headers.
        :return: A `Response` object.
        """
        stealthy_headers = kwargs.pop("stealthy_headers", None)
        return self._make_request("POST", stealth=stealthy_headers, url=url, **kwargs)

    def put(self, url: str, **kwargs: Unpack[DataRequestParams]) -> Response:
        """
        Perform a PUT request.

        Any additional keyword arguments are passed to the `curl_cffi.requests.Session().request()` method.

        :param url: Target URL for the request.
        :param kwargs: Additional keyword arguments including:
            - data: Form data to include in the request body.
            - json: A JSON serializable object to include in the body of the request.
            - params: Query string parameters for the request.
            - headers: Headers to include in the request.
            - cookies: Cookies to use in the request.
            - timeout: Number of seconds to wait before timing out.
            - follow_redirects: Whether to follow redirects. Defaults to True.
            - max_redirects: Maximum number of redirects. Default 30, use -1 for unlimited.
            - retries: Number of retry attempts. Defaults to 3.
            - retry_delay: Number of seconds to wait between retry attempts. Defaults to 1 second.
            - proxies: Dict of proxies to use.
            - proxy: Proxy URL to use. Format: "http://username:password@localhost:8030".
            - proxy_auth: HTTP basic auth for proxy, tuple of (username, password).
            - auth: HTTP basic auth tuple of (username, password). Only basic auth is supported.
            - verify: Whether to verify HTTPS certificates.
            - cert: Tuple of (cert, key) filenames for the client certificate.
            - impersonate: Browser version to impersonate. Automatically defaults to the latest available Chrome version.
            - http3: Whether to use HTTP3. Defaults to False. It might be problematic if used with `impersonate`.
            - stealthy_headers: If enabled (default), it creates and adds real browser headers.
        :return: A `Response` object.
        """
        stealthy_headers = kwargs.pop("stealthy_headers", None)
        return self._make_request("PUT", stealth=stealthy_headers, url=url, **kwargs)

    def delete(self, url: str, **kwargs: Unpack[DataRequestParams]) -> Response:
        """
        Perform a DELETE request.

        Any additional keyword arguments are passed to the `curl_cffi.requests.Session().request()` method.

        :param url: Target URL for the request.
        :param kwargs: Additional keyword arguments including:
            - data: Form data to include in the request body.
            - json: A JSON serializable object to include in the body of the request.
            - params: Query string parameters for the request.
            - headers: Headers to include in the request.
            - cookies: Cookies to use in the request.
            - timeout: Number of seconds to wait before timing out.
            - follow_redirects: Whether to follow redirects. Defaults to True.
            - max_redirects: Maximum number of redirects. Default 30, use -1 for unlimited.
            - retries: Number of retry attempts. Defaults to 3.
            - retry_delay: Number of seconds to wait between retry attempts. Defaults to 1 second.
            - proxies: Dict of proxies to use.
            - proxy: Proxy URL to use. Format: "http://username:password@localhost:8030".
            - proxy_auth: HTTP basic auth for proxy, tuple of (username, password).
            - auth: HTTP basic auth tuple of (username, password). Only basic auth is supported.
            - verify: Whether to verify HTTPS certificates.
            - cert: Tuple of (cert, key) filenames for the client certificate.
            - impersonate: Browser version to impersonate. Automatically defaults to the latest available Chrome version.
            - http3: Whether to use HTTP3. Defaults to False. It might be problematic if used with `impersonate`.
            - stealthy_headers: If enabled (default), it creates and adds real browser headers.
        :return: A `Response` object.
        """
        # Careful of sending a body in a DELETE request, it might cause some websites to reject the request as per https://www.rfc-editor.org/rfc/rfc7231#section-4.3.5,
        # But some websites accept it, it depends on the implementation used.
        stealthy_headers = kwargs.pop("stealthy_headers", None)
        return self._make_request("DELETE", stealth=stealthy_headers, url=url, **kwargs)
402
+
403
+
404
class _ASyncSessionLogic(_ConfigurationLogic):
    """Asynchronous request logic that sends requests through a `curl_cffi` `AsyncSession`."""

    __slots__ = ("_async_curl_session",)

    def __init__(self, **kwargs: Unpack[RequestsSession]):
        super().__init__(**kwargs)
        self._async_curl_session: Optional[AsyncCurlSession] = None

    async def __aenter__(self):  # pragma: no cover
        """
        Creates and returns a new asynchronous Session.

        :raises RuntimeError: If this instance already has an active session.
        """
        if self._is_alive:
            raise RuntimeError("This FetcherSession instance already has an active asynchronous session.")

        self._async_curl_session = AsyncCurlSession()
        self._is_alive = True
        return self

    async def __aexit__(self, exc_type, exc_val, exc_tb):
        """Closes the active asynchronous session managed by this instance, if any."""
        # For type checking (not accessed error)
        _ = (
            exc_type,
            exc_val,
            exc_tb,
        )
        if self._async_curl_session:
            await self._async_curl_session.close()
            self._async_curl_session = None

        self._is_alive = False

    async def _make_request(self, method: SUPPORTED_HTTP_METHODS, stealth: Optional[bool] = None, **kwargs) -> Response:
        """
        Perform an HTTP request using the configured session, retrying when curl raises an error.

        :param method: The HTTP method to use.
        :param stealth: Per-request override of the session's stealthy headers setting. `None` keeps the session default.
        :param kwargs: The request arguments (`url` is required); they are merged with the session defaults.
        :return: A `Response` object.
        :raises RuntimeError: If there is no session available to send the request with.
        :raises CurlError: If the last attempt fails as well.
        """
        stealth = self._stealth if stealth is None else stealth

        selector_config = self._get_param(kwargs, "selector_config", self.selector_config) or self.selector_config
        max_retries = self._get_param(kwargs, "retries", self._default_retries)
        retry_delay = self._get_param(kwargs, "retry_delay", self._default_retry_delay)
        # `proxy` is always forwarded explicitly to `_merge_request_args` below, which treats a present key
        # as an override. So the session-level proxy must be resolved here, otherwise it is silently dropped.
        # An explicit per-request `proxy=None` still takes precedence over the session default.
        static_proxy = kwargs.pop("proxy", self._default_proxy)

        session = self._async_curl_session
        one_off_request = False
        if session is _NO_SESSION and self.__aenter__ is None:
            # For usage inside the ` AsyncFetcherClient ` class, and that's for several reasons
            # 1. It turns out `curl_cffi` caches impersonation state, so if you turned it off, then on then off, it won't be off on the last time.
            # 2. `curl_cffi` doesn't support making async requests without sessions
            # 3. Using a single session for many requests at the same time in async doesn't sit well with curl_cffi.
            session = AsyncCurlSession()
            one_off_request = True

        if not session:
            raise RuntimeError("No active session available.")  # pragma: no cover

        try:
            for attempt in range(max_retries):
                # A new proxy is requested from the rotator on every attempt unless a static proxy is in effect
                if self._proxy_rotator and static_proxy is None:
                    proxy = self._proxy_rotator.get_proxy()
                else:
                    proxy = static_proxy

                request_args = self._merge_request_args(stealth=stealth, proxy=proxy, **kwargs)
                try:
                    response = await session.request(method, **request_args)
                    result = ResponseFactory.from_http_request(response, selector_config, meta={"proxy": proxy})
                    return result
                except CurlError as e:  # pragma: no cover
                    if attempt < max_retries - 1:
                        # Now if the rotator is enabled, we will try again with the new proxy
                        # If it's not enabled, then we will try again with the same proxy
                        if is_proxy_error(e):
                            log.warning(
                                f"Proxy '{proxy}' failed (attempt {attempt + 1}) | Retrying in {retry_delay} seconds..."
                            )
                        else:
                            log.warning(f"Attempt {attempt + 1} failed: {e}. Retrying in {retry_delay} seconds...")

                        await asyncio_sleep(retry_delay)
                    else:
                        log.error(f"Failed after {max_retries} attempts: {e}")
                        raise  # Raise the exception if all retries fail
        finally:
            # Sessions created only for this request are not kept around
            if session and one_off_request:
                await session.close()

        raise RuntimeError("No active session available.")  # pragma: no cover

    def get(self, url: str, **kwargs: Unpack[GetRequestParams]) -> Awaitable[Response]:
        """
        Perform a GET request.

        Any additional keyword arguments are passed to the `curl_cffi.requests.AsyncSession().request()` method.

        :param url: Target URL for the request.
        :param kwargs: Additional keyword arguments including:
            - params: Query string parameters for the request.
            - headers: Headers to include in the request.
            - cookies: Cookies to use in the request.
            - timeout: Number of seconds to wait before timing out.
            - follow_redirects: Whether to follow redirects. Defaults to True.
            - max_redirects: Maximum number of redirects. Default 30, use -1 for unlimited.
            - retries: Number of retry attempts. Defaults to 3.
            - retry_delay: Number of seconds to wait between retry attempts. Defaults to 1 second.
            - proxies: Dict of proxies to use.
            - proxy: Proxy URL to use. Format: "http://username:password@localhost:8030".
            - proxy_auth: HTTP basic auth for proxy, tuple of (username, password).
            - auth: HTTP basic auth tuple of (username, password). Only basic auth is supported.
            - verify: Whether to verify HTTPS certificates.
            - cert: Tuple of (cert, key) filenames for the client certificate.
            - impersonate: Browser version to impersonate. Automatically defaults to the latest available Chrome version.
            - http3: Whether to use HTTP3. Defaults to False. It might be problematic if used with `impersonate`.
            - stealthy_headers: If enabled (default), it creates and adds real browser headers.
        :return: An awaitable that resolves to a `Response` object.
        """
        stealthy_headers = kwargs.pop("stealthy_headers", None)
        return self._make_request("GET", stealth=stealthy_headers, url=url, **kwargs)

    def post(self, url: str, **kwargs: Unpack[DataRequestParams]) -> Awaitable[Response]:
        """
        Perform a POST request.

        Any additional keyword arguments are passed to the `curl_cffi.requests.AsyncSession().request()` method.

        :param url: Target URL for the request.
        :param kwargs: Additional keyword arguments including:
            - data: Form data to include in the request body.
            - json: A JSON serializable object to include in the body of the request.
            - params: Query string parameters for the request.
            - headers: Headers to include in the request.
            - cookies: Cookies to use in the request.
            - timeout: Number of seconds to wait before timing out.
            - follow_redirects: Whether to follow redirects. Defaults to True.
            - max_redirects: Maximum number of redirects. Default 30, use -1 for unlimited.
            - retries: Number of retry attempts. Defaults to 3.
            - retry_delay: Number of seconds to wait between retry attempts. Defaults to 1 second.
            - proxies: Dict of proxies to use.
            - proxy: Proxy URL to use. Format: "http://username:password@localhost:8030".
            - proxy_auth: HTTP basic auth for proxy, tuple of (username, password).
            - auth: HTTP basic auth tuple of (username, password). Only basic auth is supported.
            - verify: Whether to verify HTTPS certificates.
            - cert: Tuple of (cert, key) filenames for the client certificate.
            - impersonate: Browser version to impersonate. Automatically defaults to the latest available Chrome version.
            - http3: Whether to use HTTP3. Defaults to False. It might be problematic if used with `impersonate`.
            - stealthy_headers: If enabled (default), it creates and adds real browser headers.
        :return: An awaitable that resolves to a `Response` object.
        """
        stealthy_headers = kwargs.pop("stealthy_headers", None)
        return self._make_request("POST", stealth=stealthy_headers, url=url, **kwargs)

    def put(self, url: str, **kwargs: Unpack[DataRequestParams]) -> Awaitable[Response]:
        """
        Perform a PUT request.

        Any additional keyword arguments are passed to the `curl_cffi.requests.AsyncSession().request()` method.

        :param url: Target URL for the request.
        :param kwargs: Additional keyword arguments including:
            - data: Form data to include in the request body.
            - json: A JSON serializable object to include in the body of the request.
            - params: Query string parameters for the request.
            - headers: Headers to include in the request.
            - cookies: Cookies to use in the request.
            - timeout: Number of seconds to wait before timing out.
            - follow_redirects: Whether to follow redirects. Defaults to True.
            - max_redirects: Maximum number of redirects. Default 30, use -1 for unlimited.
            - retries: Number of retry attempts. Defaults to 3.
            - retry_delay: Number of seconds to wait between retry attempts. Defaults to 1 second.
            - proxies: Dict of proxies to use.
            - proxy: Proxy URL to use. Format: "http://username:password@localhost:8030".
            - proxy_auth: HTTP basic auth for proxy, tuple of (username, password).
            - auth: HTTP basic auth tuple of (username, password). Only basic auth is supported.
            - verify: Whether to verify HTTPS certificates.
            - cert: Tuple of (cert, key) filenames for the client certificate.
            - impersonate: Browser version to impersonate. Automatically defaults to the latest available Chrome version.
            - http3: Whether to use HTTP3. Defaults to False. It might be problematic if used with `impersonate`.
            - stealthy_headers: If enabled (default), it creates and adds real browser headers.
        :return: An awaitable that resolves to a `Response` object.
        """
        stealthy_headers = kwargs.pop("stealthy_headers", None)
        return self._make_request("PUT", stealth=stealthy_headers, url=url, **kwargs)

    def delete(self, url: str, **kwargs: Unpack[DataRequestParams]) -> Awaitable[Response]:
        """
        Perform a DELETE request.

        Any additional keyword arguments are passed to the `curl_cffi.requests.AsyncSession().request()` method.

        :param url: Target URL for the request.
        :param kwargs: Additional keyword arguments including:
            - data: Form data to include in the request body.
            - json: A JSON serializable object to include in the body of the request.
            - params: Query string parameters for the request.
            - headers: Headers to include in the request.
            - cookies: Cookies to use in the request.
            - timeout: Number of seconds to wait before timing out.
            - follow_redirects: Whether to follow redirects. Defaults to True.
            - max_redirects: Maximum number of redirects. Default 30, use -1 for unlimited.
            - retries: Number of retry attempts. Defaults to 3.
            - retry_delay: Number of seconds to wait between retry attempts. Defaults to 1 second.
            - proxies: Dict of proxies to use.
            - proxy: Proxy URL to use. Format: "http://username:password@localhost:8030".
            - proxy_auth: HTTP basic auth for proxy, tuple of (username, password).
            - auth: HTTP basic auth tuple of (username, password). Only basic auth is supported.
            - verify: Whether to verify HTTPS certificates.
            - cert: Tuple of (cert, key) filenames for the client certificate.
            - impersonate: Browser version to impersonate. Automatically defaults to the latest available Chrome version.
            - http3: Whether to use HTTP3. Defaults to False. It might be problematic if used with `impersonate`.
            - stealthy_headers: If enabled (default), it creates and adds real browser headers.
        :return: An awaitable that resolves to a `Response` object.
        """
        # Careful of sending a body in a DELETE request, it might cause some websites to reject the request as per https://www.rfc-editor.org/rfc/rfc7231#section-4.3.5,
        # But some websites accept it, it depends on the implementation used.
        stealthy_headers = kwargs.pop("stealthy_headers", None)
        return self._make_request("DELETE", stealth=stealthy_headers, url=url, **kwargs)
619
+
620
+
621
class FetcherSession:
    """
    A factory context manager that provides configured Fetcher sessions.

    When this manager is used in a 'with' or 'async with' block,
    it yields a new session configured with the manager's defaults.
    A single instance of this manager should ideally be used for one active
    session at a time (or sequentially). Re-entering a context with the
    same manager instance while a session is already active is disallowed.

    The manager always returns to its idle state when a session fails to open or
    raises while closing, so the same instance can be used again afterwards.
    """

    __slots__ = (
        "_default_impersonate",
        "_stealth",
        "_default_proxies",
        "_default_proxy",
        "_default_proxy_auth",
        "_default_timeout",
        "_default_headers",
        "_default_retries",
        "_default_retry_delay",
        "_default_follow_redirects",
        "_default_max_redirects",
        "_default_verify",
        "_default_cert",
        "_default_http3",
        "selector_config",
        "_client",
        "_is_alive",
        "_proxy_rotator",
    )

    def __init__(
        self,
        impersonate: ImpersonateType = "chrome",
        http3: Optional[bool] = False,
        stealthy_headers: Optional[bool] = True,
        proxies: Optional[Dict[str, str]] = None,
        proxy: Optional[str] = None,
        proxy_auth: Optional[Tuple[str, str]] = None,
        timeout: Optional[int | float] = 30,
        headers: Optional[Dict[str, str]] = None,
        retries: Optional[int] = 3,
        retry_delay: Optional[int] = 1,
        follow_redirects: bool = True,
        max_redirects: int = 30,
        verify: bool = True,
        cert: Optional[str | Tuple[str, str]] = None,
        selector_config: Optional[Dict] = None,
        proxy_rotator: Optional[ProxyRotator] = None,
    ):
        """
        :param impersonate: Browser version to impersonate. Can be a single browser string or a list of browser strings for random selection. (Default: latest available Chrome version)
        :param http3: Whether to use HTTP3. Defaults to False. It might be problematic if used with `impersonate`.
        :param stealthy_headers: If enabled (default), it creates and adds real browser headers. It also sets the referer header as if this request came from a Google search of URL's domain.
        :param proxies: Dict of proxies to use. Format: {"http": proxy_url, "https": proxy_url}.
        :param proxy: Proxy URL to use. Format: "http://username:password@localhost:8030".
                     Cannot be used together with the `proxies` parameter.
        :param proxy_auth: HTTP basic auth for proxy, tuple of (username, password).
        :param timeout: Number of seconds to wait before timing out.
        :param headers: Headers to include in the session with every request.
        :param retries: Number of retry attempts. Defaults to 3.
        :param retry_delay: Number of seconds to wait between retry attempts. Defaults to 1 second.
        :param follow_redirects: Whether to follow redirects. Defaults to True.
        :param max_redirects: Maximum number of redirects. Default 30, use -1 for unlimited.
        :param verify: Whether to verify HTTPS certificates. Defaults to True.
        :param cert: Tuple of (cert, key) filenames for the client certificate.
        :param selector_config: Arguments passed when creating the final Selector class.
        :param proxy_rotator: A ProxyRotator instance for automatic proxy rotation.
        """
        self._default_impersonate: ImpersonateType = impersonate
        self._stealth = stealthy_headers
        self._default_proxies = proxies or {}
        self._default_proxy = proxy or None
        self._default_proxy_auth = proxy_auth or None
        self._default_timeout = timeout
        self._default_headers = headers or {}
        self._default_retries = retries
        self._default_retry_delay = retry_delay
        self._default_follow_redirects = follow_redirects
        self._default_max_redirects = max_redirects
        self._default_verify = verify
        self._default_cert = cert
        self._default_http3 = http3
        self.selector_config = selector_config or {}
        self._is_alive = False
        self._client: _SyncSessionLogic | _ASyncSessionLogic | None = None
        self._proxy_rotator = proxy_rotator

    def _build_config(self) -> Dict[str, Any]:
        """Collect the keyword arguments used to construct the underlying session object.

        Every `_default_*` slot is forwarded under its name without the `_default_` prefix
        (the class uses `__slots__`, so the slot names are the source of truth); the
        remaining options are added explicitly.
        """
        config: Dict[str, Any] = {
            slot.replace("_default_", ""): getattr(self, slot)
            for slot in self.__slots__
            if slot.startswith("_default")
        }
        config["stealthy_headers"] = self._stealth
        config["selector_config"] = self.selector_config
        config["proxy_rotator"] = self._proxy_rotator
        return config

    def _reset(self) -> None:
        """Return the manager to its idle state so that it can be entered again."""
        self._client = None
        self._is_alive = False

    def __enter__(self) -> _SyncSessionLogic:
        """Creates and returns a new synchronous Fetcher Session

        :raises RuntimeError: If this manager already has an active session.
        """
        if self._client is not None:
            raise RuntimeError("This FetcherSession instance already has an active synchronous session.")

        client = _SyncSessionLogic(**self._build_config())
        self._client = client
        self._is_alive = True
        try:
            return client.__enter__()
        except BaseException:
            # A session that failed to open must not keep the manager marked as busy,
            # otherwise every later `with` on this instance would be rejected.
            self._reset()
            raise

    def __exit__(self, exc_type, exc_val, exc_tb):
        """Close the active synchronous session.

        :raises RuntimeError: If there is no active synchronous session to close.
        """
        if not isinstance(self._client, _SyncSessionLogic):
            raise RuntimeError("Cannot exit invalid session")

        try:
            self._client.__exit__(exc_type, exc_val, exc_tb)
        finally:
            # Reset even if closing raised, so the manager stays reusable.
            self._reset()

    async def __aenter__(self) -> _ASyncSessionLogic:
        """Creates and returns a new asynchronous Session.

        :raises RuntimeError: If this manager already has an active session.
        """
        if self._client is not None:
            raise RuntimeError("This FetcherSession instance already has an active asynchronous session.")

        client = _ASyncSessionLogic(**self._build_config())
        # Mark the manager as busy before awaiting so a concurrent entry is still rejected.
        self._client = client
        self._is_alive = True
        try:
            return await client.__aenter__()
        except BaseException:
            self._reset()
            raise

    async def __aexit__(self, exc_type, exc_val, exc_tb):
        """Close the active asynchronous session.

        :raises RuntimeError: If there is no active asynchronous session to close.
        """
        if not isinstance(self._client, _ASyncSessionLogic):
            raise RuntimeError("Cannot exit invalid session")

        try:
            await self._client.__aexit__(exc_type, exc_val, exc_tb)
        finally:
            # Reset even if closing raised, so the manager stays reusable.
            self._reset()
751
+
752
+
753
class FetcherClient(_SyncSessionLogic):
    """Synchronous session logic with its context-manager hooks blanked out.

    `__enter__` and `__exit__` are declared as slots and set to `None` on every instance, and
    the curl session attribute is set to the `_NO_SESSION` sentinel, so this object is
    presumably meant to be called directly rather than used in a `with` block.
    """

    __slots__ = ("__enter__", "__exit__")

    def __init__(self, **kwargs: Any) -> None:
        super().__init__(**kwargs)
        # Shadow the context-manager hooks with `None` on the instance.
        for hook_name in ("__enter__", "__exit__"):
            setattr(self, hook_name, None)
        self._curl_session: Any = _NO_SESSION
761
+
762
+
763
class AsyncFetcherClient(_ASyncSessionLogic):
    """Asynchronous session logic with its context-manager hooks blanked out.

    `__aenter__` and `__aexit__` are declared as slots and set to `None` on every instance, and
    the async curl session attribute is set to the `_NO_SESSION` sentinel, so this object is
    presumably meant to be called directly rather than used in an `async with` block.
    """

    __slots__ = ("__aenter__", "__aexit__")

    def __init__(self, **kwargs: Any) -> None:
        super().__init__(**kwargs)
        # Shadow the async context-manager hooks with `None` on the instance.
        for hook_name in ("__aenter__", "__aexit__"):
            setattr(self, hook_name, None)
        self._async_curl_session: Any = _NO_SESSION
engines/toolbelt/__init__.py ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ from .proxy_rotation import ProxyRotator, is_proxy_error, cyclic_rotation
2
+
3
+ __all__ = ["ProxyRotator", "is_proxy_error", "cyclic_rotation"]
engines/toolbelt/convertor.py ADDED
@@ -0,0 +1,306 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from functools import lru_cache
2
+ from re import compile as re_compile
3
+
4
+ from curl_cffi.requests import Response as CurlResponse
5
+ from playwright._impl._errors import Error as PlaywrightError
6
+ from playwright.sync_api import Page as SyncPage, Response as SyncResponse
7
+ from playwright.async_api import Page as AsyncPage, Response as AsyncResponse
8
+
9
+ from scrapling.core.utils import log
10
+ from .custom import Response, StatusText
11
+ from scrapling.core._types import Dict, Optional
12
+
13
# Matches the `charset` parameter of a Content-Type header value.
# Parameter names are case-insensitive and the value may be a quoted string,
# so `Charset=UTF-8` and `charset="utf-8"` must both be recognised.
__CHARSET_RE__ = re_compile(r'(?i)charset\s*=\s*"?([\w-]+)')


class ResponseFactory:
    """
    Factory class for creating `Response` objects from various sources.

    This class provides class and static methods for building standardized `Response` objects
    from diverse input sources such as Playwright responses, asynchronous Playwright responses,
    and raw HTTP request responses. It supports handling response histories, constructing the proper
    response objects, and managing encoding, headers, cookies, and other attributes.
    """

    @classmethod
    @lru_cache(maxsize=16)
    def __extract_browser_encoding(cls, content_type: str | None, default: str = "utf-8") -> str:
        """Extract browser encoding from headers.
        Ex: from header "content-type: text/html; charset=utf-8" -> "utf-8"

        :param content_type: The value of the `content-type` header, if any.
        :param default: The encoding returned when the header is empty or carries no charset.
        :return: The charset named in the header, or `default`.
        """
        if content_type:
            # Because Playwright can't do that by themselves like all libraries for some reason :3
            match = __CHARSET_RE__.search(content_type)
            return match.group(1) if match else default
        return default

    @classmethod
    def _build_history_entry(
        cls,
        url: str,
        response: SyncResponse | AsyncResponse | None,
        response_headers: Dict,
        request_headers: Dict,
        parser_arguments: Dict,
    ) -> Response:
        """Build the `Response` object that represents one redirect hop.

        Shared by the sync and async history builders; the callers resolve the headers
        themselves because `all_headers()` must be awaited in the async API.

        :param url: URL of the redirected request.
        :param response: The Playwright response of that request, or a falsy value when unavailable.
        :param response_headers: Already-resolved response headers (`{}` when there is no response).
        :param request_headers: Already-resolved request headers.
        :param parser_arguments: Extra arguments unpacked into the `Response` constructor.
        :return: A `Response` with empty content for this hop.
        """
        if response:
            status = response.status
            reason = response.status_text or StatusText.get(response.status)
            encoding = cls.__extract_browser_encoding(response.headers.get("content-type", ""))
        else:
            # No response object for this hop: fall back to a generic permanent redirect.
            status = 301
            reason = StatusText.get(301)
            encoding = "utf-8"

        return Response(
            **{
                "url": url,
                # using current_response.text() will trigger "Error: Response.text: Response body is unavailable for redirect responses"
                "content": "",
                "status": status,
                "reason": reason,
                "encoding": encoding,
                "cookies": tuple(),
                "headers": response_headers,
                "request_headers": request_headers,
                **parser_arguments,
            }
        )

    @classmethod
    def _process_response_history(cls, first_response: SyncResponse, parser_arguments: Dict) -> list[Response]:
        """Process response history to build a list of `Response` objects

        Walks the `redirected_from` chain of the first response's request and returns the hops
        ordered from the earliest request to the latest. Errors are logged and stop the walk;
        the hops collected so far are still returned.
        """
        history: list[Response] = []
        current_request = first_response.request.redirected_from

        try:
            while current_request:
                try:
                    current_response = current_request.response()
                    history.insert(
                        0,
                        cls._build_history_entry(
                            current_request.url,
                            current_response,
                            current_response.all_headers() if current_response else {},
                            current_request.all_headers(),
                            parser_arguments,
                        ),
                    )
                except Exception as e:  # pragma: no cover
                    log.error(f"Error processing redirect: {e}")
                    break

                current_request = current_request.redirected_from
        except Exception as e:  # pragma: no cover
            log.error(f"Error processing response history: {e}")

        return history

    @classmethod
    def from_playwright_response(
        cls,
        page: SyncPage,
        first_response: SyncResponse,
        final_response: Optional[SyncResponse],
        parser_arguments: Dict,
        meta: Optional[Dict] = None,
    ) -> Response:
        """
        Transforms a Playwright response into an internal `Response` object, encapsulating
        the page's content, response status, headers, and relevant metadata.

        The function handles potential issues, such as empty or missing final responses,
        by falling back to the first response if necessary. Encoding and status text
        are also derived from the provided response headers or reasonable defaults.
        Additionally, the page content and cookies are extracted for further use.

        :param page: A synchronous Playwright `Page` instance that represents the current browser page. Required to retrieve the page's URL, cookies, and content.
        :param first_response: An earlier or initial Playwright `Response` object that may serve as a fallback response in the absence of the final one.
        :param final_response: The last response received for the given request from the Playwright instance. Typically used as the main response object to derive status, headers, and other metadata.
        :param parser_arguments: A dictionary containing additional arguments needed for parsing or further customization of the returned `Response`. These arguments are dynamically unpacked into
            the `Response` object.
        :param meta: Additional meta data to be saved with the response.

        :return: A fully populated `Response` object containing the page's URL, content, status, headers, cookies, and other derived metadata.
        :rtype: Response
        :raises ValueError: If neither a final nor a first response is available.
        """
        # In case we didn't catch a document type somehow
        final_response = final_response if final_response else first_response
        if not final_response:
            raise ValueError("Failed to get a response from the page")

        encoding = cls.__extract_browser_encoding(final_response.headers.get("content-type", ""))
        # The Playwright API sometimes gives an empty status text for some reason!
        status_text = final_response.status_text or StatusText.get(final_response.status)

        history = cls._process_response_history(first_response, parser_arguments)
        try:
            if "html" in final_response.all_headers().get("content-type", ""):
                # NOTE(review): the rendered page is always encoded as UTF-8 here while `encoding`
                # comes from the header; confirm the parser copes when the two differ.
                page_content = cls._get_page_content(page).encode("utf-8")
            else:
                page_content = final_response.body()
        except Exception as e:  # pragma: no cover
            log.error(f"Error getting page content: {e}")
            page_content = b""

        return Response(
            **{
                "url": page.url,
                "content": page_content,
                "status": final_response.status,
                "reason": status_text,
                "encoding": encoding,
                "cookies": tuple(dict(cookie) for cookie in page.context.cookies()),
                "headers": first_response.all_headers(),
                "request_headers": first_response.request.all_headers(),
                "history": history,
                "meta": meta,
                **parser_arguments,
            }
        )

    @classmethod
    async def _async_process_response_history(
        cls, first_response: AsyncResponse, parser_arguments: Dict
    ) -> list[Response]:
        """Process response history to build a list of `Response` objects

        Async counterpart of `_process_response_history`: walks the `redirected_from` chain and
        returns the hops ordered from the earliest request to the latest. Errors are logged and
        stop the walk; the hops collected so far are still returned.
        """
        history: list[Response] = []
        current_request = first_response.request.redirected_from

        try:
            while current_request:
                try:
                    current_response = await current_request.response()
                    history.insert(
                        0,
                        cls._build_history_entry(
                            current_request.url,
                            current_response,
                            await current_response.all_headers() if current_response else {},
                            await current_request.all_headers(),
                            parser_arguments,
                        ),
                    )
                except Exception as e:  # pragma: no cover
                    log.error(f"Error processing redirect: {e}")
                    break

                current_request = current_request.redirected_from
        except Exception as e:  # pragma: no cover
            log.error(f"Error processing response history: {e}")

        return history

    @classmethod
    def _get_page_content(cls, page: SyncPage) -> str:
        """
        A workaround for the Playwright issue with `page.content()` on Windows. Ref.: https://github.com/microsoft/playwright/issues/16108

        Retries every 500 ms, for as long as `page.content()` raises a Playwright error.

        :param page: The page to extract content from.
        :return: The page's HTML, or an empty string when the page reports no content.
        """
        while True:
            try:
                return page.content() or ""
            except PlaywrightError:
                page.wait_for_timeout(500)

    @classmethod
    async def _get_async_page_content(cls, page: AsyncPage) -> str:
        """
        A workaround for the Playwright issue with `page.content()` on Windows. Ref.: https://github.com/microsoft/playwright/issues/16108

        Retries every 500 ms, for as long as `page.content()` raises a Playwright error.

        :param page: The page to extract content from.
        :return: The page's HTML, or an empty string when the page reports no content.
        """
        while True:
            try:
                return (await page.content()) or ""
            except PlaywrightError:
                await page.wait_for_timeout(500)

    @classmethod
    async def from_async_playwright_response(
        cls,
        page: AsyncPage,
        first_response: AsyncResponse,
        final_response: Optional[AsyncResponse],
        parser_arguments: Dict,
        meta: Optional[Dict] = None,
    ) -> Response:
        """
        Transforms a Playwright response into an internal `Response` object, encapsulating
        the page's content, response status, headers, and relevant metadata.

        The function handles potential issues, such as empty or missing final responses,
        by falling back to the first response if necessary. Encoding and status text
        are also derived from the provided response headers or reasonable defaults.
        Additionally, the page content and cookies are extracted for further use.

        :param page: An asynchronous Playwright `Page` instance that represents the current browser page. Required to retrieve the page's URL, cookies, and content.
        :param first_response: An earlier or initial Playwright `Response` object that may serve as a fallback response in the absence of the final one.
        :param final_response: The last response received for the given request from the Playwright instance. Typically used as the main response object to derive status, headers, and other metadata.
        :param parser_arguments: A dictionary containing additional arguments needed for parsing or further customization of the returned `Response`. These arguments are dynamically unpacked into
            the `Response` object.
        :param meta: Additional meta data to be saved with the response.

        :return: A fully populated `Response` object containing the page's URL, content, status, headers, cookies, and other derived metadata.
        :rtype: Response
        :raises ValueError: If neither a final nor a first response is available.
        """
        # In case we didn't catch a document type somehow
        final_response = final_response if final_response else first_response
        if not final_response:
            raise ValueError("Failed to get a response from the page")

        encoding = cls.__extract_browser_encoding(final_response.headers.get("content-type", ""))
        # The Playwright API sometimes gives an empty status text for some reason!
        status_text = final_response.status_text or StatusText.get(final_response.status)

        history = await cls._async_process_response_history(first_response, parser_arguments)
        try:
            if "html" in (await final_response.all_headers()).get("content-type", ""):
                # NOTE(review): the rendered page is always encoded as UTF-8 here while `encoding`
                # comes from the header; confirm the parser copes when the two differ.
                page_content = (await cls._get_async_page_content(page)).encode("utf-8")
            else:
                page_content = await final_response.body()
        except Exception as e:  # pragma: no cover
            log.error(f"Error getting page content in async: {e}")
            page_content = b""

        return Response(
            **{
                "url": page.url,
                "content": page_content,
                "status": final_response.status,
                "reason": status_text,
                "encoding": encoding,
                "cookies": tuple(dict(cookie) for cookie in await page.context.cookies()),
                "headers": await first_response.all_headers(),
                "request_headers": await first_response.request.all_headers(),
                "history": history,
                "meta": meta,
                **parser_arguments,
            }
        )

    @staticmethod
    def from_http_request(response: CurlResponse, parser_arguments: Dict, meta: Optional[Dict] = None) -> Response:
        """Takes `curl_cffi` response and generates `Response` object from it.

        :param response: `curl_cffi` response object
        :param parser_arguments: Additional arguments to be passed to the `Response` object constructor.
        :param meta: Optional metadata dictionary to attach to the Response.
        :return: A `Response` object that is the same as `Selector` object except it has these added attributes: `status`, `reason`, `cookies`, `headers`, and `request_headers`
        """
        return Response(
            **{
                "url": response.url,
                "content": response.content,
                "status": response.status_code,
                "reason": response.reason,
                "encoding": response.encoding or "utf-8",
                "cookies": dict(response.cookies),
                "headers": dict(response.headers),
                "request_headers": dict(response.request.headers) if response.request else {},
                "method": response.request.method if response.request else "GET",
                "history": response.history,  # https://github.com/lexiforest/curl_cffi/issues/82
                "meta": meta,
                **parser_arguments,
            }
        )
engines/toolbelt/custom.py ADDED
@@ -0,0 +1,295 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ Functions related to custom types or type checking
3
+ """
4
+
5
+ from functools import lru_cache
6
+
7
+ from scrapling.core.utils import log
8
+ from scrapling.core._types import (
9
+ Any,
10
+ Dict,
11
+ cast,
12
+ List,
13
+ Tuple,
14
+ Union,
15
+ Optional,
16
+ Callable,
17
+ Sequence,
18
+ TYPE_CHECKING,
19
+ AsyncGenerator,
20
+ )
21
+ from scrapling.core.custom_types import MappingProxyType
22
+ from scrapling.parser import Selector, SQLiteStorageSystem
23
+
24
+ if TYPE_CHECKING:
25
+ from scrapling.spiders import Request
26
+
27
+
28
class Response(Selector):
    """This class is returned by all engines as a way to unify the response type between different libraries."""

    def __init__(
        self,
        url: str,
        content: str | bytes,
        status: int,
        reason: str,
        cookies: Tuple[Dict[str, str], ...] | Dict[str, str],
        headers: Dict,
        request_headers: Dict,
        encoding: str = "utf-8",
        method: str = "GET",
        history: List | None = None,
        meta: Dict[str, Any] | None = None,
        **selector_config: Any,
    ):
        """
        :param url: The URL that was fetched.
        :param content: The response body; a `str` is encoded as UTF-8 before parsing.
        :param status: The HTTP status code.
        :param reason: The HTTP status text.
        :param cookies: The cookies that came with the response.
        :param headers: The response headers.
        :param request_headers: The headers that were sent with the request.
        :param encoding: The encoding passed to the parser.
        :param method: The HTTP method used; only shown in the log line.
        :param history: The responses that led to this one, if any.
        :param meta: Additional meta data to be saved with the response.
        :param selector_config: Arguments passed on to the `Selector` constructor. `adaptive_domain`,
            when given and non-empty, replaces `url` as the URL handed to the parser.
        :raises TypeError: If `meta` is given and is not a dictionary.
        """
        if isinstance(content, str):
            content = content.encode("utf-8")

        adaptive_domain: str = cast(str, selector_config.pop("adaptive_domain", ""))
        self.status = status
        self.reason = reason
        self.cookies = cookies
        self.headers = headers
        self.request_headers = request_headers
        self.history = history or []
        super().__init__(
            content=content,
            url=adaptive_domain or url,
            encoding=encoding,
            **selector_config,
        )
        # For easier debugging while working from a Python shell
        log.info(f"Fetched ({status}) <{method} {url}> (referer: {request_headers.get('referer')})")

        if meta and not isinstance(meta, dict):
            raise TypeError(f"Response meta should be dictionary but got {type(meta).__name__} instead!")

        self.meta: Dict[str, Any] = meta or {}
        self.request: Optional["Request"] = None  # Will be set by crawler

    @property
    def body(self) -> bytes:
        """Return the raw body of the response as bytes."""
        return cast(bytes, cast(Sequence, self._raw_body))

    def follow(
        self,
        url: str,
        sid: str = "",
        callback: Callable[["Response"], AsyncGenerator[Union[Dict[str, Any], "Request", None], None]] | None = None,
        priority: int | None = None,
        dont_filter: bool = False,
        meta: dict[str, Any] | None = None,
        referer_flow: bool = True,
        **kwargs: Any,
    ) -> Any:
        """Create a Request to follow a URL.

        This is a helper method for spiders to easily follow links found in pages.

        **IMPORTANT**: If any of the arguments below is left empty, the corresponding value from the previous request is used. The only exception is `dont_filter`.

        :param url: The URL to follow (can be relative, will be joined with current URL)
        :param sid: The session id to use
        :param callback: Spider callback method to use
        :param priority: The priority number to use, the higher the number, the higher priority to be processed first.
        :param dont_filter: If this request has been done before, disable the filter to allow it again.
        :param meta: Additional meta data to include in the request
        :param referer_flow: Enabled by default, set the current response url as referer for the new request url.
        :param kwargs: Additional Request arguments
        :return: Request object ready to be yielded
        :raises TypeError: If this response has no request attached yet.
        """
        from scrapling.spiders import Request

        if not self.request or not isinstance(self.request, Request):
            raise TypeError("This response has no request set yet.")

        # Merge original session kwargs with new kwargs (new takes precedence)
        session_kwargs = {**self.request._session_kwargs, **kwargs}

        if referer_flow:
            # The merge above is shallow, so the header dicts may be the very objects stored on the
            # previous request (or passed in by the caller). Build new dicts instead of editing them
            # in place, otherwise the referer would leak back into the previous request.

            # For requests
            session_kwargs["headers"] = {**(session_kwargs.get("headers") or {}), "referer": self.url}

            # For browsers
            session_kwargs["extra_headers"] = {**(session_kwargs.get("extra_headers") or {}), "referer": self.url}

            session_kwargs["google_search"] = False

        return Request(
            url=self.urljoin(url),
            sid=sid or self.request.sid,
            callback=callback or self.request.callback,
            priority=priority if priority is not None else self.request.priority,
            dont_filter=dont_filter,
            meta={**(self.meta or {}), **(meta or {})},
            **session_kwargs,
        )

    def __str__(self) -> str:
        return f"<{self.status} {self.url}>"
136
+
137
+
138
class BaseFetcher:
    """Base class holding the parser settings shared by the fetchers.

    The settings are class attributes: `configure` changes them on the class itself and
    `_generate_parser_arguments` packs them into the keyword arguments for the parser.
    """

    __slots__ = ()
    huge_tree: bool = True
    adaptive: Optional[bool] = False
    storage: Any = SQLiteStorageSystem
    keep_cdata: Optional[bool] = False
    storage_args: Optional[Dict] = None
    keep_comments: Optional[bool] = False
    adaptive_domain: str = ""
    parser_keywords: Tuple = (
        "huge_tree",
        "adaptive",
        "storage",
        "keep_cdata",
        "storage_args",
        "keep_comments",
        "adaptive_domain",
    )  # Left open for the user

    def __init__(self, *args, **kwargs):
        """Deprecated: instantiation has no effect and only logs a warning pointing to `configure`."""
        # For backward-compatibility before 0.2.99
        # Stringify the positional arguments first: joining raw non-string values would raise
        # TypeError and turn this harmless deprecation warning into a crash.
        args_str = ", ".join(map(str, args))
        kwargs_str = ", ".join(f"{k}={v}" for k, v in kwargs.items())
        if args_str:
            args_str += ", "

        log.warning(
            f"This logic is deprecated now, and have no effect; It will be removed with v0.3. Use `{self.__class__.__name__}.configure({args_str}{kwargs_str})` instead before fetching"
        )

    @classmethod
    def display_config(cls):
        """Return the current parser settings of this class as a dictionary."""
        return dict(
            huge_tree=cls.huge_tree,
            keep_comments=cls.keep_comments,
            keep_cdata=cls.keep_cdata,
            adaptive=cls.adaptive,
            storage=cls.storage,
            storage_args=cls.storage_args,
            adaptive_domain=cls.adaptive_domain,
        )

    @classmethod
    def configure(cls, **kwargs):
        """Set multiple arguments for the parser at once globally

        :param kwargs: The keywords can be any arguments of the following: huge_tree, keep_comments, keep_cdata, adaptive, storage, storage_args, adaptive_domain
        :raises AttributeError: If no keyword is passed, or a keyword names a class attribute that is not a parser argument.
        :raises ValueError: If a keyword does not match any attribute of the class.
        """
        if not kwargs:
            raise AttributeError(f"You must pass a keyword to configure, current keywords: {cls.parser_keywords}?")

        for key, value in kwargs.items():
            key = key.strip().lower()
            if not hasattr(cls, key):
                raise ValueError(f'Unknown parser argument: "{key}"; maybe you meant {cls.parser_keywords}?')
            if key not in cls.parser_keywords:
                # Yup, no fun allowed LOL
                raise AttributeError(f'Unknown parser argument: "{key}"; maybe you meant {cls.parser_keywords}?')
            setattr(cls, key, value)

    @classmethod
    def _generate_parser_arguments(cls) -> Dict:
        """Return the parser settings as the keyword arguments given to the response/selector class."""
        # Selector class parameters
        # I won't validate Selector's class parameters here again, I will leave it to be validated later
        parser_arguments = dict(
            huge_tree=cls.huge_tree,
            keep_comments=cls.keep_comments,
            keep_cdata=cls.keep_cdata,
            adaptive=cls.adaptive,
            storage=cls.storage,
            storage_args=cls.storage_args,
            adaptive_domain=cls.adaptive_domain,
        )

        return parser_arguments
216
+
217
+
218
class StatusText:
    """Lookup table from HTTP status codes to their reason phrases.

    Reference: https://developer.mozilla.org/en-US/docs/Web/HTTP/Status
    """

    _phrases = MappingProxyType(
        {
            # 1xx: informational
            100: "Continue",
            101: "Switching Protocols",
            102: "Processing",
            103: "Early Hints",
            # 2xx: success
            200: "OK",
            201: "Created",
            202: "Accepted",
            203: "Non-Authoritative Information",
            204: "No Content",
            205: "Reset Content",
            206: "Partial Content",
            207: "Multi-Status",
            208: "Already Reported",
            226: "IM Used",
            # 3xx: redirection
            300: "Multiple Choices",
            301: "Moved Permanently",
            302: "Found",
            303: "See Other",
            304: "Not Modified",
            305: "Use Proxy",
            307: "Temporary Redirect",
            308: "Permanent Redirect",
            # 4xx: client errors
            400: "Bad Request",
            401: "Unauthorized",
            402: "Payment Required",
            403: "Forbidden",
            404: "Not Found",
            405: "Method Not Allowed",
            406: "Not Acceptable",
            407: "Proxy Authentication Required",
            408: "Request Timeout",
            409: "Conflict",
            410: "Gone",
            411: "Length Required",
            412: "Precondition Failed",
            413: "Payload Too Large",
            414: "URI Too Long",
            415: "Unsupported Media Type",
            416: "Range Not Satisfiable",
            417: "Expectation Failed",
            418: "I'm a teapot",
            421: "Misdirected Request",
            422: "Unprocessable Entity",
            423: "Locked",
            424: "Failed Dependency",
            425: "Too Early",
            426: "Upgrade Required",
            428: "Precondition Required",
            429: "Too Many Requests",
            431: "Request Header Fields Too Large",
            451: "Unavailable For Legal Reasons",
            # 5xx: server errors
            500: "Internal Server Error",
            501: "Not Implemented",
            502: "Bad Gateway",
            503: "Service Unavailable",
            504: "Gateway Timeout",
            505: "HTTP Version Not Supported",
            506: "Variant Also Negotiates",
            507: "Insufficient Storage",
            508: "Loop Detected",
            510: "Not Extended",
            511: "Network Authentication Required",
        }
    )

    @classmethod
    @lru_cache(maxsize=128)
    def get(cls, status_code: int) -> str:
        """Return the phrase for `status_code`, or "Unknown Status Code" when the code is not in the table."""
        known_phrases = cls._phrases
        if status_code in known_phrases:
            return known_phrases[status_code]
        return "Unknown Status Code"
engines/toolbelt/fingerprints.py ADDED
@@ -0,0 +1,88 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ Functions related to generating headers and fingerprints generally
3
+ """
4
+
5
+ from functools import lru_cache
6
+ from platform import system as platform_system
7
+
8
+ from tld import get_tld, Result
9
+ from browserforge.headers import Browser, HeaderGenerator
10
+ from browserforge.headers.generator import SUPPORTED_OPERATING_SYSTEMS
11
+
12
+ from scrapling.core._types import Dict, Literal, Tuple, cast
13
+
14
# Raw platform name as reported by `platform.system()`, read once at import time
__OS_NAME__ = platform_system()
# The OS identifiers this module hands over to browserforge
OSName = Literal["linux", "macos", "windows"]
# Current versions hardcoded for now (Playwright doesn't allow to know the version of a browser without launching it)
chromium_version = 141  # Used by `generate_headers` for every mode except "chrome"
chrome_version = 143  # Used by `generate_headers` when `browser_mode == "chrome"`
19
+
20
+
21
@lru_cache(10, typed=True)
def generate_convincing_referer(url: str) -> str | None:
    """Build a referer that looks like a Google search for the website behind ``url``.

    Only the registered domain name is used, without the subdomain or the suffix.

    >>> generate_convincing_referer('https://www.somewebsite.com/blah')
    'https://www.google.com/search?q=somewebsite'

    :param url: The URL you are about to fetch.
    :return: Google's search URL of the domain name, or None for localhost/IP addresses
    """
    # `get_tld` has an inaccurate return type hint, hence the cast
    parsed: Result | None = cast(Result, get_tld(url, as_object=True, fail_silently=True))
    if not parsed:
        return None

    name = parsed.domain

    # No referer for localhost, loopback addresses, or when there is no valid domain/TLD
    unusable = not name or not parsed.tld or name in ("localhost", "127.0.0.1", "::1")
    if unusable:
        return None

    # Simple IPv4 detection: every non-empty dot-separated piece is numeric
    pieces = [piece for piece in name.split(".") if piece]
    looks_like_ip = all(piece.isdigit() for piece in pieces)
    return None if looks_like_ip else f"https://www.google.com/search?q={name}"
47
+
48
+
49
@lru_cache(1, typed=True)
def get_os_name() -> OSName | Tuple:
    """Translate the current platform name into the identifier browserforge expects.

    :return: ``"linux"``, ``"macos"`` or ``"windows"`` for the matching platform. For any other
        platform, browserforge's ``SUPPORTED_OPERATING_SYSTEMS`` is returned instead
        (presumably so browserforge can pick from all of them).
    """
    known_platforms = {
        "Linux": "linux",
        "Darwin": "macos",
        "Windows": "windows",
    }
    return known_platforms.get(__OS_NAME__, SUPPORTED_OPERATING_SYSTEMS)
64
+
65
+
66
def generate_headers(browser_mode: bool | str = False) -> Dict:
    """Generate real browser-like headers using browserforge's generator

    :param browser_mode: If enabled, the headers created are used for playwright, so it has to match everything.
        The string ``"chrome"`` pins the headers to `chrome_version`; anything else uses `chromium_version`.
    :return: A dictionary of the generated headers
    """
    # In the browser mode, only the OS and the browser type/version have to match the browser in use,
    # so websites fingerprinting us don't see any inconsistency
    target_os = get_os_name()
    if browser_mode == "chrome":
        pinned = chrome_version
    else:
        pinned = chromium_version
    candidates = [Browser(name="chrome", min_version=pinned, max_version=pinned)]

    if not browser_mode:
        # Outside the browser mode any desktop OS and a wider browser pool are acceptable
        target_os = ("windows", "macos", "linux")
        candidates.append(Browser(name="firefox", min_version=142))
        candidates.append(Browser(name="edge", min_version=140))

    generator = HeaderGenerator(browser=candidates, os=target_os, device="desktop")
    return generator.generate()
86
+
87
+
88
# Generated once when this module is imported, using the non-browser header pool
__default_useragent__ = generate_headers(browser_mode=False).get("User-Agent")
engines/toolbelt/navigation.py ADDED
@@ -0,0 +1,109 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ Functions related to files and URLs
3
+ """
4
+
5
+ from urllib.parse import urlparse
6
+
7
+ from playwright.async_api import Route as async_Route
8
+ from msgspec import Struct, structs, convert, ValidationError
9
+ from playwright.sync_api import Route
10
+
11
+ from scrapling.core.utils import log
12
+ from scrapling.core._types import Dict, Set, Tuple, Optional, Callable
13
+ from scrapling.engines.constants import EXTRA_RESOURCES
14
+
15
+
16
class ProxyDict(Struct):
    """Schema that `construct_proxy_dict` validates dictionary-style proxies against."""

    server: str  # Required: there is no default value
    username: str = ""  # Optional, defaults to an empty string
    password: str = ""  # Optional, defaults to an empty string
20
+
21
+
22
def create_intercept_handler(disable_resources: bool, blocked_domains: Optional[Set[str]] = None) -> Callable:
    """Build a sync route handler that aborts unwanted resource types and requests to blocked domains.

    The resource-type check runs first; the domain check only runs for requests that passed it.

    :param disable_resources: Whether to block the resource types listed in `EXTRA_RESOURCES`.
    :param blocked_domains: Set of domain names to block requests to. A domain also matches its subdomains.
    :return: A sync route handler function.
    """
    blocked_types = EXTRA_RESOURCES if disable_resources else set()
    blocked_hosts = blocked_domains or set()

    def handler(route: Route):
        if route.request.resource_type in blocked_types:
            log.debug(f'Blocking background resource "{route.request.url}" of type "{route.request.resource_type}"')
            route.abort()
            return

        if blocked_hosts:
            hostname = urlparse(route.request.url).hostname or ""
            for domain in blocked_hosts:
                # Exact match, or a subdomain of the blocked domain
                if hostname == domain or hostname.endswith("." + domain):
                    log.debug(f'Blocking request to blocked domain "{hostname}" ({route.request.url})')
                    route.abort()
                    return

        route.continue_()

    return handler
47
+
48
+
49
def create_async_intercept_handler(disable_resources: bool, blocked_domains: Optional[Set[str]] = None) -> Callable:
    """Build an async route handler that aborts unwanted resource types and requests to blocked domains.

    The resource-type check runs first; the domain check only runs for requests that passed it.

    :param disable_resources: Whether to block the resource types listed in `EXTRA_RESOURCES`.
    :param blocked_domains: Set of domain names to block requests to. A domain also matches its subdomains.
    :return: An async route handler function.
    """
    blocked_types = EXTRA_RESOURCES if disable_resources else set()
    blocked_hosts = blocked_domains or set()

    async def handler(route: async_Route):
        if route.request.resource_type in blocked_types:
            log.debug(f'Blocking background resource "{route.request.url}" of type "{route.request.resource_type}"')
            await route.abort()
            return

        if blocked_hosts:
            hostname = urlparse(route.request.url).hostname or ""
            for domain in blocked_hosts:
                # Exact match, or a subdomain of the blocked domain
                if hostname == domain or hostname.endswith("." + domain):
                    log.debug(f'Blocking request to blocked domain "{hostname}" ({route.request.url})')
                    await route.abort()
                    return

        await route.continue_()

    return handler
74
+
75
+
76
+ def construct_proxy_dict(proxy_string: str | Dict[str, str] | Tuple) -> Dict:
77
+ """Validate a proxy and return it in the acceptable format for Playwright
78
+ Reference: https://playwright.dev/python/docs/network#http-proxy
79
+
80
+ :param proxy_string: A string or a dictionary representation of the proxy.
81
+ :return:
82
+ """
83
+ if isinstance(proxy_string, str):
84
+ proxy = urlparse(proxy_string)
85
+ if proxy.scheme not in ("http", "https", "socks4", "socks5") or not proxy.hostname:
86
+ raise ValueError("Invalid proxy string!")
87
+
88
+ try:
89
+ result = {
90
+ "server": f"{proxy.scheme}://{proxy.hostname}",
91
+ "username": proxy.username or "",
92
+ "password": proxy.password or "",
93
+ }
94
+ if proxy.port:
95
+ result["server"] += f":{proxy.port}"
96
+ return result
97
+ except ValueError:
98
+ # Urllib will say that one of the parameters above can't be casted to the correct type like `int` for port etc...
99
+ raise ValueError("The proxy argument's string is in invalid format!")
100
+
101
+ elif isinstance(proxy_string, dict):
102
+ try:
103
+ validated = convert(proxy_string, ProxyDict)
104
+ result_dict = structs.asdict(validated)
105
+ return result_dict
106
+ except ValidationError as e:
107
+ raise TypeError(f"Invalid proxy dictionary: {e}")
108
+
109
+ raise TypeError(f"Invalid proxy string: {proxy_string}")
engines/toolbelt/proxy_rotation.py ADDED
@@ -0,0 +1,104 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from threading import Lock
2
+
3
+ from scrapling.core._types import Callable, Dict, List, Tuple, ProxyType
4
+
5
+
6
# Signature of a rotation strategy: takes (proxies, current_index) and returns (proxy, next_index)
RotationStrategy = Callable[[List[ProxyType], int], Tuple[ProxyType, int]]
# Lowercase substrings that `is_proxy_error` looks for in a lowercased error message
_PROXY_ERROR_INDICATORS = {
    "net::err_proxy",
    "net::err_tunnel",
    "connection refused",
    "connection reset",
    "connection timed out",
    "failed to connect",
    "could not resolve proxy",
}
16
+
17
+
18
def _get_proxy_key(proxy: ProxyType) -> str:
    """Return the key that identifies a proxy.

    A string proxy is its own key; a dict proxy is keyed by ``"<server>|<username>"``,
    with missing entries treated as empty strings.
    """
    if not isinstance(proxy, str):
        return f'{proxy.get("server", "")}|{proxy.get("username", "")}'
    return proxy
25
+
26
+
27
def is_proxy_error(error: Exception) -> bool:
    """Tell whether an error looks proxy-related. Works for both HTTP and browser errors.

    The lowercased error message is searched for any of the `_PROXY_ERROR_INDICATORS` substrings.
    """
    message = str(error).lower()
    for marker in _PROXY_ERROR_INDICATORS:
        if marker in message:
            return True
    return False
31
+
32
+
33
def cyclic_rotation(proxies: List[ProxyType], current_index: int) -> Tuple[ProxyType, int]:
    """Default rotation strategy: walk through the proxies in order and wrap around at the end.

    :param proxies: The proxies to pick from.
    :param current_index: Index of the proxy to hand out; it is wrapped into the valid range first.
    :return: The selected proxy and the index to use on the next call.
    """
    total = len(proxies)
    position = current_index % total
    upcoming = (position + 1) % total
    return proxies[position], upcoming
37
+
38
+
39
class ProxyRotator:
    """
    A proxy rotator with pluggable rotation strategies; proxy selection is guarded by a lock.

    Supports:
    - Cyclic rotation (default)
    - Custom rotation strategies via callable
    - Both string URLs and Playwright-style dict proxies
    """

    __slots__ = ("_proxies", "_proxy_to_index", "_strategy", "_current_index", "_lock")

    def __init__(
        self,
        proxies: List[ProxyType],
        strategy: RotationStrategy = cyclic_rotation,
    ):
        """
        Set up the rotator and validate every proxy.

        :param proxies: List of proxy URLs or Playwright-style proxy dicts.
            - String format: "http://proxy1:8080" or "http://user:pass@proxy:8080"
            - Dict format: {"server": "http://proxy:8080", "username": "user", "password": "pass"}
        :param strategy: Rotation strategy function. Takes (proxies, current_index) and returns (proxy, next_index). Defaults to cyclic_rotation.
        :raises ValueError: If no proxies are given, or a proxy dict lacks the 'server' key.
        :raises TypeError: If the strategy is not callable, or a proxy is neither a string nor a dict.
        """
        if not proxies:
            raise ValueError("At least one proxy must be provided")

        if not callable(strategy):
            raise TypeError(f"strategy must be callable, got {type(strategy).__name__}")

        self._strategy = strategy
        self._lock = Lock()
        self._current_index = 0

        # Validated proxies, plus an index lookup by unique key (server + username)
        self._proxies: List[ProxyType] = []
        self._proxy_to_index: Dict[str, int] = {}
        for position, candidate in enumerate(proxies):
            if not isinstance(candidate, (str, dict)):
                raise TypeError(f"Invalid proxy type: {type(candidate)}. Expected str or dict.")
            if isinstance(candidate, dict) and "server" not in candidate:
                raise ValueError("Proxy dict must have a 'server' key")

            self._proxy_to_index[_get_proxy_key(candidate)] = position
            self._proxies.append(candidate)

    def get_proxy(self) -> ProxyType:
        """Return the next proxy chosen by the rotation strategy and remember the next index."""
        with self._lock:
            selected, upcoming = self._strategy(self._proxies, self._current_index)
            self._current_index = upcoming
            return selected

    @property
    def proxies(self) -> List[ProxyType]:
        """A copy of the configured proxies, so callers cannot alter the internal list."""
        return self._proxies.copy()

    def __len__(self) -> int:
        """Number of configured proxies."""
        return len(self._proxies)

    def __repr__(self) -> str:
        count = len(self._proxies)
        return f"ProxyRotator(proxies={count})"
fetchers/__init__.py ADDED
@@ -0,0 +1,48 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from typing import TYPE_CHECKING, Any
2
+ from scrapling.engines.toolbelt import ProxyRotator
3
+
4
+ if TYPE_CHECKING:
5
+ from scrapling.fetchers.requests import Fetcher, AsyncFetcher, FetcherSession
6
+ from scrapling.fetchers.chrome import DynamicFetcher, DynamicSession, AsyncDynamicSession
7
+ from scrapling.fetchers.stealth_chrome import StealthyFetcher, StealthySession, AsyncStealthySession
8
+
9
+
10
# Lazy import mapping: public name -> (module path, attribute name).
# The module-level `__getattr__` below resolves these on first access, so the
# fetcher submodules are only imported when one of these names is requested.
_LAZY_IMPORTS = {
    "Fetcher": ("scrapling.fetchers.requests", "Fetcher"),
    "AsyncFetcher": ("scrapling.fetchers.requests", "AsyncFetcher"),
    "FetcherSession": ("scrapling.fetchers.requests", "FetcherSession"),
    "DynamicFetcher": ("scrapling.fetchers.chrome", "DynamicFetcher"),
    "DynamicSession": ("scrapling.fetchers.chrome", "DynamicSession"),
    "AsyncDynamicSession": ("scrapling.fetchers.chrome", "AsyncDynamicSession"),
    "StealthyFetcher": ("scrapling.fetchers.stealth_chrome", "StealthyFetcher"),
    "StealthySession": ("scrapling.fetchers.stealth_chrome", "StealthySession"),
    "AsyncStealthySession": ("scrapling.fetchers.stealth_chrome", "AsyncStealthySession"),
}

# Public API: the lazy names above plus `ProxyRotator`, which is imported eagerly at the top of this module
__all__ = [
    "Fetcher",
    "AsyncFetcher",
    "ProxyRotator",
    "FetcherSession",
    "DynamicFetcher",
    "DynamicSession",
    "AsyncDynamicSession",
    "StealthyFetcher",
    "StealthySession",
    "AsyncStealthySession",
]
35
+
36
+
37
def __getattr__(name: str) -> Any:
    """Resolve the names listed in `_LAZY_IMPORTS` on first access.

    :param name: The attribute requested from this module.
    :return: The object imported from the mapped module.
    :raises AttributeError: If the name is not one of the lazily imported names.
    """
    target = _LAZY_IMPORTS.get(name)
    if target is None:
        raise AttributeError(f"module {__name__!r} has no attribute {name!r}")

    module_path, class_name = target
    # A non-empty `fromlist` makes `__import__` return the submodule itself, not the top-level package
    loaded = __import__(module_path, fromlist=[class_name])
    return getattr(loaded, class_name)
44
+
45
+
46
def __dir__() -> list[str]:
    """Support for dir() and autocomplete.

    :return: The sorted public names of this module: everything in `__all__` (which includes the
        eagerly imported `ProxyRotator`) together with every lazily imported name.
    """
    # Union of both sources so a name present in only one of them is never left out
    return sorted(set(__all__) | set(_LAZY_IMPORTS))
fetchers/chrome.py ADDED
@@ -0,0 +1,91 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from scrapling.core._types import Unpack
2
+ from scrapling.engines._browsers._types import PlaywrightSession
3
+ from scrapling.engines.toolbelt.custom import BaseFetcher, Response
4
+ from scrapling.engines._browsers._controllers import DynamicSession, AsyncDynamicSession
5
+
6
+
7
class DynamicFetcher(BaseFetcher):
    """A `Fetcher` offering many options to fetch/load websites' pages through chromium-based browsers."""

    @classmethod
    def fetch(cls, url: str, **kwargs: Unpack[PlaywrightSession]) -> Response:
        """Open a browser through a `DynamicSession` and fetch `url` with the options below.

        :param url: Target url.
        :param headless: Run the browser in headless/hidden (default), or headful/visible mode.
        :param disable_resources: Drop requests for unnecessary resources for a speed boost.
        :param blocked_domains: A set of domain names to block requests to. Subdomains are also matched (e.g., ``"example.com"`` blocks ``"sub.example.com"`` too).
        :param useragent: Pass a useragent string to be used. Otherwise the fetcher will generate a real Useragent of the same browser and use it.
        :param cookies: Set cookies for the next request.
        :param network_idle: Wait for the page until there are no network connections for at least 500 ms.
        :param load_dom: Enabled by default, wait for all JavaScript on page(s) to fully load and execute.
        :param timeout: The timeout in milliseconds that is used in all operations and waits through the page. The default is 30,000
        :param wait: The time (milliseconds) the fetcher will wait after everything finishes before closing the page and returning the Response object.
        :param page_action: Added for automation. A function that takes the `page` object and does the automation you need.
        :param wait_selector: Wait for a specific CSS selector to be in a specific state.
        :param init_script: An absolute path to a JavaScript file to be executed on page creation with this request.
        :param locale: Set the locale for the browser if wanted. Defaults to the system default locale.
        :param wait_selector_state: The state to wait for the selector given with `wait_selector`. The default state is `attached`.
        :param real_chrome: If you have a Chrome browser installed on your device, enable this, and the Fetcher will launch an instance of your browser and use it.
        :param cdp_url: Instead of launching a new browser instance, connect to this CDP URL to control real browsers through CDP.
        :param google_search: Enabled by default, Scrapling will set the referer header to be as if this request came from a Google search of this website's domain name.
        :param extra_headers: A dictionary of extra headers to add to the request.
        :param proxy: The proxy to be used with requests, it can be a string or a dictionary with the keys 'server', 'username', and 'password' only.
        :param extra_flags: A list of additional browser flags to pass to the browser on launch.
        :param selector_config: The arguments that will be passed in the end while creating the final Selector's class.
        :param additional_args: Additional arguments to be passed to Playwright's context as additional settings.
        :return: A `Response` object.
        :raises TypeError: If `selector_config` is not a dictionary.
        """
        # `custom_config` is only consulted for backward compatibility
        user_config = kwargs.get("selector_config", {}) or kwargs.get("custom_config", {})
        if not isinstance(user_config, dict):
            raise TypeError("Argument `selector_config` must be a dictionary.")

        # Start from the class-level parser arguments; the caller's entries win on conflicts
        merged_config = dict(cls._generate_parser_arguments())
        merged_config.update(user_config)
        kwargs["selector_config"] = merged_config

        with DynamicSession(**kwargs) as session:
            return session.fetch(url)

    @classmethod
    async def async_fetch(cls, url: str, **kwargs: Unpack[PlaywrightSession]) -> Response:
        """Open a browser through an `AsyncDynamicSession` and fetch `url` with the options below.

        :param url: Target url.
        :param headless: Run the browser in headless/hidden (default), or headful/visible mode.
        :param disable_resources: Drop requests for unnecessary resources for a speed boost.
        :param blocked_domains: A set of domain names to block requests to. Subdomains are also matched (e.g., ``"example.com"`` blocks ``"sub.example.com"`` too).
        :param useragent: Pass a useragent string to be used. Otherwise the fetcher will generate a real Useragent of the same browser and use it.
        :param cookies: Set cookies for the next request.
        :param network_idle: Wait for the page until there are no network connections for at least 500 ms.
        :param load_dom: Enabled by default, wait for all JavaScript on page(s) to fully load and execute.
        :param timeout: The timeout in milliseconds that is used in all operations and waits through the page. The default is 30,000
        :param wait: The time (milliseconds) the fetcher will wait after everything finishes before closing the page and returning the Response object.
        :param page_action: Added for automation. A function that takes the `page` object and does the automation you need.
        :param wait_selector: Wait for a specific CSS selector to be in a specific state.
        :param init_script: An absolute path to a JavaScript file to be executed on page creation with this request.
        :param locale: Set the locale for the browser if wanted. Defaults to the system default locale.
        :param wait_selector_state: The state to wait for the selector given with `wait_selector`. The default state is `attached`.
        :param real_chrome: If you have a Chrome browser installed on your device, enable this, and the Fetcher will launch an instance of your browser and use it.
        :param cdp_url: Instead of launching a new browser instance, connect to this CDP URL to control real browsers through CDP.
        :param google_search: Enabled by default, Scrapling will set the referer header to be as if this request came from a Google search of this website's domain name.
        :param extra_headers: A dictionary of extra headers to add to the request.
        :param proxy: The proxy to be used with requests, it can be a string or a dictionary with the keys 'server', 'username', and 'password' only.
        :param extra_flags: A list of additional browser flags to pass to the browser on launch.
        :param selector_config: The arguments that will be passed in the end while creating the final Selector's class.
        :param additional_args: Additional arguments to be passed to Playwright's context as additional settings.
        :return: A `Response` object.
        :raises TypeError: If `selector_config` is not a dictionary.
        """
        # `custom_config` is only consulted for backward compatibility
        user_config = kwargs.get("selector_config", {}) or kwargs.get("custom_config", {})
        if not isinstance(user_config, dict):
            raise TypeError("Argument `selector_config` must be a dictionary.")

        # Start from the class-level parser arguments; the caller's entries win on conflicts
        merged_config = dict(cls._generate_parser_arguments())
        merged_config.update(user_config)
        kwargs["selector_config"] = merged_config

        async with AsyncDynamicSession(**kwargs) as session:
            return await session.fetch(url)


PlayWrightFetcher = DynamicFetcher  # For backward-compatibility
fetchers/requests.py ADDED
@@ -0,0 +1,28 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from scrapling.engines.static import (
2
+ FetcherSession,
3
+ FetcherClient as _FetcherClient,
4
+ AsyncFetcherClient as _AsyncFetcherClient,
5
+ )
6
+ from scrapling.engines.toolbelt.custom import BaseFetcher
7
+
8
+
9
# Module-level client instances created at import time; the `Fetcher` and `AsyncFetcher`
# classes below expose the bound HTTP methods of these shared instances.
__FetcherClientInstance__ = _FetcherClient()
__AsyncFetcherClientInstance__ = _AsyncFetcherClient()
11
+
12
+
13
class Fetcher(BaseFetcher):
    """A basic `Fetcher` class type that can only do basic GET, POST, PUT, and DELETE HTTP requests based on `curl_cffi`."""

    # Each attribute is the bound method of the shared module-level `FetcherClient` instance,
    # so it can be called straight from the class, e.g. `Fetcher.get(...)`.
    get = __FetcherClientInstance__.get
    post = __FetcherClientInstance__.post
    put = __FetcherClientInstance__.put
    delete = __FetcherClientInstance__.delete
20
+
21
+
22
class AsyncFetcher(BaseFetcher):
    """A basic `Fetcher` class type that can only do basic GET, POST, PUT, and DELETE HTTP requests based on `curl_cffi`.

    Same interface as `Fetcher`, but backed by the shared `AsyncFetcherClient` instance.
    """

    # Each attribute is the bound method of the shared module-level `AsyncFetcherClient` instance.
    # NOTE(review): presumably these are coroutine functions that must be awaited - confirm against `scrapling.engines.static`.
    get = __AsyncFetcherClientInstance__.get
    post = __AsyncFetcherClientInstance__.post
    put = __AsyncFetcherClientInstance__.put
    delete = __AsyncFetcherClientInstance__.delete
fetchers/stealth_chrome.py ADDED
@@ -0,0 +1,109 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from scrapling.core._types import Unpack
2
+ from scrapling.engines._browsers._types import StealthSession
3
+ from scrapling.engines.toolbelt.custom import BaseFetcher, Response
4
+ from scrapling.engines._browsers._stealth import StealthySession, AsyncStealthySession
5
+
6
+
7
class StealthyFetcher(BaseFetcher):
    """A completely stealthy `Fetcher` class type built on top of Chromium.

    It works as real browsers passing almost all online tests/protections with many customization options.
    """

    @classmethod
    def fetch(cls, url: str, **kwargs: Unpack[StealthSession]) -> Response:
        """
        Open a browser through a `StealthySession` and fetch `url` with the options below.

        :param url: Target url.
        :param headless: Run the browser in headless/hidden (default), or headful/visible mode.
        :param disable_resources: Drop requests for unnecessary resources for a speed boost.
            Requests dropped are of type `font`, `image`, `media`, `beacon`, `object`, `imageset`, `texttrack`, `websocket`, `csp_report`, and `stylesheet`.
        :param blocked_domains: A set of domain names to block requests to. Subdomains are also matched (e.g., ``"example.com"`` blocks ``"sub.example.com"`` too).
        :param useragent: Pass a useragent string to be used. Otherwise the fetcher will generate a real Useragent of the same browser and use it.
        :param cookies: Set cookies for the next request.
        :param network_idle: Wait for the page until there are no network connections for at least 500 ms.
        :param timeout: The timeout in milliseconds that is used in all operations and waits through the page. The default is 30,000
        :param wait: The time (milliseconds) the fetcher will wait after everything finishes before closing the page and returning the `Response` object.
        :param page_action: Added for automation. A function that takes the `page` object and does the automation you need.
        :param wait_selector: Wait for a specific CSS selector to be in a specific state.
        :param init_script: An absolute path to a JavaScript file to be executed on page creation for all pages in this session.
        :param locale: Specify user locale, for example, `en-GB`, `de-DE`, etc. Locale will affect navigator.language value, Accept-Language request header value as well as number and date formatting
            rules. Defaults to the system default locale.
        :param timezone_id: Changes the timezone of the browser. Defaults to the system timezone.
        :param wait_selector_state: The state to wait for the selector given with `wait_selector`. The default state is `attached`.
        :param solve_cloudflare: Solves all types of the Cloudflare's Turnstile/Interstitial challenges before returning the response to you.
        :param real_chrome: If you have a Chrome browser installed on your device, enable this, and the Fetcher will launch an instance of your browser and use it.
        :param hide_canvas: Add random noise to canvas operations to prevent fingerprinting.
        :param block_webrtc: Forces WebRTC to respect proxy settings to prevent local IP address leak.
        :param allow_webgl: Enabled by default. Disabling it disables WebGL and WebGL 2.0 support entirely. Disabling WebGL is not recommended as many WAFs now check if WebGL is enabled.
        :param load_dom: Enabled by default, wait for all JavaScript on page(s) to fully load and execute.
        :param cdp_url: Instead of launching a new browser instance, connect to this CDP URL to control real browsers through CDP.
        :param google_search: Enabled by default, Scrapling will set the referer header to be as if this request came from a Google search of this website's domain name.
        :param extra_headers: A dictionary of extra headers to add to the request. _The referer set by the `google_search` argument takes priority over the referer set here if used together._
        :param proxy: The proxy to be used with requests, it can be a string or a dictionary with the keys 'server', 'username', and 'password' only.
        :param user_data_dir: Path to a User Data Directory, which stores browser session data like cookies and local storage. The default is to create a temporary directory.
        :param extra_flags: A list of additional browser flags to pass to the browser on launch.
        :param selector_config: The arguments that will be passed in the end while creating the final Selector's class.
        :param additional_args: Additional arguments to be passed to Playwright's context as additional settings, and it takes higher priority than Scrapling's settings.
        :return: A `Response` object.
        :raises TypeError: If `selector_config` is not a dictionary.
        """
        # `custom_config` is only consulted for backward compatibility
        user_config = kwargs.get("selector_config", {}) or kwargs.get("custom_config", {})
        if not isinstance(user_config, dict):
            raise TypeError("Argument `selector_config` must be a dictionary.")

        # Start from the class-level parser arguments; the caller's entries win on conflicts
        merged_config = dict(cls._generate_parser_arguments())
        merged_config.update(user_config)
        kwargs["selector_config"] = merged_config

        with StealthySession(**kwargs) as engine:
            return engine.fetch(url)

    @classmethod
    async def async_fetch(cls, url: str, **kwargs: Unpack[StealthSession]) -> Response:
        """
        Open a browser through an `AsyncStealthySession` and fetch `url` with the options below.

        :param url: Target url.
        :param headless: Run the browser in headless/hidden (default), or headful/visible mode.
        :param disable_resources: Drop requests for unnecessary resources for a speed boost.
            Requests dropped are of type `font`, `image`, `media`, `beacon`, `object`, `imageset`, `texttrack`, `websocket`, `csp_report`, and `stylesheet`.
        :param blocked_domains: A set of domain names to block requests to. Subdomains are also matched (e.g., ``"example.com"`` blocks ``"sub.example.com"`` too).
        :param useragent: Pass a useragent string to be used. Otherwise the fetcher will generate a real Useragent of the same browser and use it.
        :param cookies: Set cookies for the next request.
        :param network_idle: Wait for the page until there are no network connections for at least 500 ms.
        :param timeout: The timeout in milliseconds that is used in all operations and waits through the page. The default is 30,000
        :param wait: The time (milliseconds) the fetcher will wait after everything finishes before closing the page and returning the `Response` object.
        :param page_action: Added for automation. A function that takes the `page` object and does the automation you need.
        :param wait_selector: Wait for a specific CSS selector to be in a specific state.
        :param init_script: An absolute path to a JavaScript file to be executed on page creation for all pages in this session.
        :param locale: Specify user locale, for example, `en-GB`, `de-DE`, etc. Locale will affect navigator.language value, Accept-Language request header value as well as number and date formatting
            rules. Defaults to the system default locale.
        :param timezone_id: Changes the timezone of the browser. Defaults to the system timezone.
        :param wait_selector_state: The state to wait for the selector given with `wait_selector`. The default state is `attached`.
        :param solve_cloudflare: Solves all types of the Cloudflare's Turnstile/Interstitial challenges before returning the response to you.
        :param real_chrome: If you have a Chrome browser installed on your device, enable this, and the Fetcher will launch an instance of your browser and use it.
        :param hide_canvas: Add random noise to canvas operations to prevent fingerprinting.
        :param block_webrtc: Forces WebRTC to respect proxy settings to prevent local IP address leak.
        :param allow_webgl: Enabled by default. Disabling it disables WebGL and WebGL 2.0 support entirely. Disabling WebGL is not recommended as many WAFs now check if WebGL is enabled.
        :param load_dom: Enabled by default, wait for all JavaScript on page(s) to fully load and execute.
        :param cdp_url: Instead of launching a new browser instance, connect to this CDP URL to control real browsers through CDP.
        :param google_search: Enabled by default, Scrapling will set the referer header to be as if this request came from a Google search of this website's domain name.
        :param extra_headers: A dictionary of extra headers to add to the request. _The referer set by the `google_search` argument takes priority over the referer set here if used together._
        :param proxy: The proxy to be used with requests, it can be a string or a dictionary with the keys 'server', 'username', and 'password' only.
        :param user_data_dir: Path to a User Data Directory, which stores browser session data like cookies and local storage. The default is to create a temporary directory.
        :param extra_flags: A list of additional browser flags to pass to the browser on launch.
        :param selector_config: The arguments that will be passed in the end while creating the final Selector's class.
        :param additional_args: Additional arguments to be passed to Playwright's context as additional settings, and it takes higher priority than Scrapling's settings.
        :return: A `Response` object.
        :raises TypeError: If `selector_config` is not a dictionary.
        """
        # `custom_config` is only consulted for backward compatibility
        user_config = kwargs.get("selector_config", {}) or kwargs.get("custom_config", {})
        if not isinstance(user_config, dict):
            raise TypeError("Argument `selector_config` must be a dictionary.")

        # Start from the class-level parser arguments; the caller's entries win on conflicts
        merged_config = dict(cls._generate_parser_arguments())
        merged_config.update(user_config)
        kwargs["selector_config"] = merged_config

        async with AsyncStealthySession(**kwargs) as engine:
            return await engine.fetch(url)
parser.py ADDED
@@ -0,0 +1,1363 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from pathlib import Path
2
+ from inspect import signature
3
+ from urllib.parse import urljoin
4
+ from difflib import SequenceMatcher
5
+ from re import Pattern as re_Pattern
6
+
7
+ from lxml.html import HtmlElement, HTMLParser
8
+ from cssselect import SelectorError, SelectorSyntaxError, parse as split_selectors
9
+ from lxml.etree import (
10
+ XPath,
11
+ tostring,
12
+ fromstring,
13
+ XPathError,
14
+ XPathEvalError,
15
+ _ElementUnicodeResult,
16
+ )
17
+
18
+ from scrapling.core._types import (
19
+ Any,
20
+ Set,
21
+ Dict,
22
+ cast,
23
+ List,
24
+ Tuple,
25
+ Union,
26
+ TypeVar,
27
+ Pattern,
28
+ Callable,
29
+ Literal,
30
+ Optional,
31
+ Iterable,
32
+ overload,
33
+ Generator,
34
+ SupportsIndex,
35
+ TYPE_CHECKING,
36
+ )
37
+ from scrapling.core.custom_types import AttributesHandler, TextHandler, TextHandlers
38
+ from scrapling.core.mixins import SelectorsGeneration
39
+ from scrapling.core.storage import (
40
+ SQLiteStorageSystem,
41
+ StorageSystemMixin,
42
+ _StorageTools,
43
+ )
44
+ from scrapling.core.translator import css_to_xpath as _css_to_xpath
45
+ from scrapling.core.utils import clean_spaces, flatten, html_forbidden, log
46
+
47
# Default storage file handed to the storage class when adaptive mode is on and no
# `storage_args` are given; it sits in the same directory as this module.
__DEFAULT_DB_FILE__ = str(Path(__file__).parent / "elements_storage.db")
# Keyword-argument aliases for attribute names that are Python reserved words and can't be used directly
# Ex: find_all('a', class="blah") -> find_all('a', class_="blah")
# https://www.w3schools.com/python/python_ref_keywords.asp
_whitelisted = {
    "class_": "class",
    "for_": "for",
}
_T = TypeVar("_T")
# Pre-compiled selectors for efficiency
# Matches every descendant element of the context node
_find_all_elements = XPath(".//*")
_find_all_elements_with_spaces = XPath(
    ".//*[normalize-space(text())]"
)  # This selector gets all elements with text content
61
+
62
+
63
+ class Selector(SelectorsGeneration):
64
# Fixed set of instance attributes; every name listed here is assigned in `__init__`
# (or lazily by the `tag`, `text` and `attrib` properties).
__slots__ = (
    "url",
    "encoding",
    "__adaptive_enabled",
    "_root",
    "_storage",
    "__keep_comments",
    "__huge_tree_enabled",
    "__attributes",
    "__text",
    "__tag",
    "__keep_cdata",
    "_raw_body",
)
78
+
79
def __init__(
    self,
    content: Optional[str | bytes] = None,
    url: str = "",
    encoding: str = "utf-8",
    huge_tree: bool = True,
    root: Optional[HtmlElement] = None,
    keep_comments: Optional[bool] = False,
    keep_cdata: Optional[bool] = False,
    adaptive: Optional[bool] = False,
    _storage: Optional[StorageSystemMixin] = None,
    storage: Any = SQLiteStorageSystem,
    storage_args: Optional[Dict] = None,
    **_,
):
    """The main class that works as a wrapper for the HTML input data. Using this class, you can search for elements
    with expressions in CSS, XPath, or with simply text. Check the docs for more info.

    Here we try to extend module ``lxml.html.HtmlElement`` while maintaining a simpler interface. We are not
    inheriting from the ``lxml.html.HtmlElement`` because it's not pickleable, which makes a lot of reference jobs
    not possible. You can test it here and see the code explode with `AssertionError: invalid Element proxy at...`.
    It's an old issue with lxml, see `this entry <https://bugs.launchpad.net/lxml/+bug/736708>`

    :param content: HTML content as either string or bytes. Required when ``root`` is not given.
    :param url: It allows storing a URL with the HTML data for retrieving later.
    :param encoding: The encoding type that will be used in HTML parsing, default is `UTF-8`
    :param huge_tree: Enabled by default, should always be enabled when parsing large HTML documents. This controls
        the libxml2 feature that forbids parsing certain large documents to protect from possible memory exhaustion.
    :param root: Used internally to pass etree objects instead of text/body arguments, it takes the highest priority.
        Don't use it unless you know what you are doing!
    :param keep_comments: While parsing the HTML body, keep comments or drop them. Disabled (dropped) by default.
    :param keep_cdata: While parsing the HTML body, keep cdata or drop it. Disabled (dropped) by default for cleaner HTML.
    :param adaptive: Enables the adaptive feature for this instance; disabled by default. When it is off, the
        adaptive-related arguments of the selection methods are ignored.
    :param _storage: Used internally to hand an already-created storage object to new instances; when given (and
        adaptive is enabled) it is used as-is and ``storage``/``storage_args`` are not consulted.
    :param storage: The storage class to be passed for adaptive functionalities, see ``Docs`` for more info.
    :param storage_args: A dictionary of ``argument->value`` pairs to be passed for the storage class.
        If empty, default values will be used.
    :raises ValueError: If neither ``content`` nor ``root`` is given, or if the storage class is not usable.
    :raises TypeError: If ``content`` is neither ``str`` nor ``bytes``.
    """
    if root is None and content is None:
        raise ValueError("Selector class needs HTML content, or root arguments to work")

    self.url = url
    self._raw_body: str | bytes = ""
    self.encoding = encoding
    self.__keep_cdata = keep_cdata
    self.__huge_tree_enabled = huge_tree
    self.__keep_comments = keep_comments
    # For selector stuff: these are filled lazily by the `text`, `attrib` and `tag` properties
    self.__text: Optional[TextHandler] = None
    self.__attributes: Optional[AttributesHandler] = None
    self.__tag: Optional[str] = None
    self._storage: Optional[StorageSystemMixin] = None
    if root is None:
        body: str | bytes
        if isinstance(content, str):
            # Strip NUL characters; an empty string falls back to a minimal document
            body = content.strip().replace("\x00", "") or "<html/>"
        elif isinstance(content, bytes):
            body = content.replace(b"\x00", b"")
        else:
            raise TypeError(f"content argument must be str or bytes, got {type(content)}")

        # https://lxml.de/api/lxml.etree.HTMLParser-class.html
        _parser_kwargs: Dict[str, Any] = dict(
            recover=True,
            remove_blank_text=True,
            remove_comments=(not keep_comments),
            encoding=encoding,
            compact=True,
            huge_tree=huge_tree,
            default_doctype=True,  # Supported by lxml but missing from stubs
            strip_cdata=(not keep_cdata),
        )
        parser = HTMLParser(**_parser_kwargs)
        # `body or "<html/>"` covers empty bytes input, which the str branch already handles above
        self._root = cast(HtmlElement, fromstring(body or "<html/>", parser=parser, base_url=url or ""))
        # The untouched input is kept so the `body` property can return it
        self._raw_body = content

    else:
        self._root = cast(HtmlElement, root)

    if self._is_text_node(root):
        # Text nodes never use adaptive storage, so stop before any storage setup
        self.__adaptive_enabled = False
        return

    self.__adaptive_enabled = bool(adaptive)

    if self.__adaptive_enabled:
        if _storage is not None:
            self._storage = _storage
        else:
            if not storage_args:
                storage_args = {
                    "storage_file": __DEFAULT_DB_FILE__,
                    "url": url,
                }

            # The storage class is expected to expose `__wrapped__`, i.e. be decorated with `lru_cache`
            if not hasattr(storage, "__wrapped__"):
                raise ValueError("Storage class must be wrapped with lru_cache decorator, see docs for info")

            if not issubclass(storage.__wrapped__, StorageSystemMixin):  # pragma: no cover
                raise ValueError("Storage system must be inherited from class `StorageSystemMixin`")

            self._storage = storage(**storage_args)
181
+
182
def __getitem__(self, key: str) -> TextHandler:
    """Return the value of the attribute named ``key``.

    :param key: The attribute name to look up.
    :raises TypeError: If this selector wraps a text node, which has no attributes.
    """
    if not self._is_text_node(self._root):
        return self.attrib[key]
    raise TypeError("Text nodes do not have attributes")
186
+
187
def __contains__(self, key: str) -> bool:
    """Tell whether the element has an attribute named ``key``; always ``False`` for text nodes."""
    return (not self._is_text_node(self._root)) and key in self.attrib
191
+
192
+ # Node functionalities, I wanted to move to a separate Mixin class, but it had a slight impact on performance
193
@staticmethod
def _is_text_node(
    element: HtmlElement | _ElementUnicodeResult,
) -> bool:
    """Tell whether ``element`` came out of a string-producing expression rather than being an element.

    Examples of such expressions:
        XPath -> '/text()', '/@attribute', etc...
        CSS3  -> '::text', '::attr(attrib)'...
    """
    # A type check is faster than testing `element.is_attribute or element.is_text or element.is_tail`
    element_type = type(element)
    return issubclass(element_type, _ElementUnicodeResult)
204
+
205
def __element_convertor(self, element: HtmlElement | _ElementUnicodeResult) -> "Selector":
    """Wrap one HtmlElement or text node in a new ``Selector`` that inherits this instance's settings.

    Used internally; no validation is done on ``element``.
    """
    inherited_settings = {
        "url": self.url,
        "encoding": self.encoding,
        "adaptive": self.__adaptive_enabled,
        "_storage": self._storage,
        "keep_comments": self.__keep_comments,
        "keep_cdata": self.__keep_cdata,
        "huge_tree": self.__huge_tree_enabled,
    }
    return Selector(root=element, **inherited_settings)
217
+
218
def __elements_convertor(self, elements: List[HtmlElement | _ElementUnicodeResult]) -> "Selectors":
    """Wrap every item of ``elements`` in a ``Selector`` and collect them into a ``Selectors`` object."""
    # Read the shared settings once instead of once per element
    shared_settings = dict(
        url=self.url,
        encoding=self.encoding,
        adaptive=self.__adaptive_enabled,
        _storage=self._storage,
        keep_comments=self.__keep_comments,
        keep_cdata=self.__keep_cdata,
        huge_tree=self.__huge_tree_enabled,
    )
    return Selectors(Selector(root=node, **shared_settings) for node in elements)
241
+
242
def __handle_elements(self, result: List[HtmlElement | _ElementUnicodeResult]) -> "Selectors":
    """Convert raw query results to ``Selectors`` in bulk; an empty/falsy result yields an empty ``Selectors``."""
    return self.__elements_convertor(result) if result else Selectors()
248
+
249
def __getstate__(self) -> Any:
    """Refuse pickling: always raises ``TypeError``."""
    # The wrapped lxml objects are not pickleable, so fail early with a clear message
    raise TypeError("Can't pickle Selector objects")
252
+
253
# The following properties are computed lazily instead of being set as plain attributes in `__init__`,
# so they don't slow down the creation of many instances of the class. Each one is evaluated only
# when the user first needs it for that specific element, and the result is cached for later accesses.
# That change alone made the library's performance tests run several times faster than before,
# because these values used to be computed on initialization :))
258
@property
def tag(self) -> str:
    """Tag name of the element, or ``"#text"`` when this selector wraps a text node."""
    if self._is_text_node(self._root):
        return "#text"
    cached_tag = self.__tag
    if not cached_tag:
        # First access: read the name from the underlying element and remember it
        cached_tag = self.__tag = str(self._root.tag)
    return cached_tag or ""
266
+
267
@property
def text(self) -> TextHandler:
    """Text content of the element (its own leading text only), or the node's string value for text nodes."""
    root = self._root
    if self._is_text_node(root):
        return TextHandler(str(root))
    if self.__text is not None:
        return self.__text
    # lxml stops `.text` at the first comment, e.g. `<span>CONDITION: <!-- -->Excellent</span>`.
    # To get the text without that interruption, keep `keep_comments` set to False when creating
    # the first Selector so the comments are removed while parsing.
    self.__text = TextHandler(root.text or "")
    return self.__text
277
+
278
def get_all_text(
    self,
    separator: str = "\n",
    strip: bool = False,
    ignore_tags: Tuple = (
        "script",
        "style",
    ),
    valid_values: bool = True,
) -> TextHandler:
    """Collect the text of this element and of every element below it, joined with ``separator``.

    :param separator: The string placed between the collected pieces.
    :param strip: If True, each piece is stripped before being joined.
    :param ignore_tags: Tag names to skip; everything nested inside such a tag is skipped as well.
    :param valid_values: If enabled, pieces that are empty or whitespace-only are left out.

    :return: A TextHandler
    """
    if self._is_text_node(self._root):
        return TextHandler(str(self._root))

    # Every ignored element together with all of its descendants
    skipped: set[Any] = set()
    if ignore_tags:
        for ignored in self._root.iter(*ignore_tags):
            skipped.add(ignored)
            skipped.update(cast(list, _find_all_elements(ignored)))

    pieces = []
    for node in self._root.iter():
        if node in skipped:
            continue
        raw_text = node.text
        # Only real, non-empty strings count
        if not raw_text or not isinstance(raw_text, str):
            continue
        piece = raw_text.strip() if strip else raw_text
        if valid_values and not piece.strip():
            continue
        pieces.append(piece)

    joiner = TextHandler(separator)
    return cast(TextHandler, joiner.join(pieces))
316
+
317
+ def urljoin(self, relative_url: str) -> str:
318
+ """Join this Selector's url with a relative url to form an absolute full URL."""
319
+ return urljoin(self.url, relative_url)
320
+
321
@property
def attrib(self) -> AttributesHandler:
    """Get attributes of the element.

    :return: An ``AttributesHandler`` built from the underlying element's attributes and cached on first
        access. Text nodes have no attributes, so they always get a fresh empty handler.
    """
    if self._is_text_node(self._root):
        return AttributesHandler({})
    # Test against ``None`` (as the `text` property does) rather than truthiness, so a handler that
    # evaluates as falsy -- e.g. one for an element without attributes -- is cached too instead of
    # being rebuilt on every access.
    if self.__attributes is None:
        self.__attributes = AttributesHandler(self._root.attrib)
    return self.__attributes
329
+
330
@property
def html_content(self) -> TextHandler:
    """HTML serialization of the element (its own tag included, tail text excluded).

    For text nodes, the node's string value is returned instead.
    """
    root = self._root
    if self._is_text_node(root):
        return TextHandler(str(root))
    serialized = tostring(root, encoding=self.encoding, method="html", with_tail=False)
    markup = serialized.strip().decode(self.encoding) if isinstance(serialized, bytes) else serialized
    return TextHandler(markup)
339
+
340
@property
def body(self) -> str | bytes:
    """Raw, unprocessed body stored on this `Selector`; an empty string for text nodes.

    Useful for binary and non-HTML requests.
    """
    return "" if self._is_text_node(self._root) else self._raw_body
346
+
347
def prettify(self) -> TextHandler:
    """Pretty-printed HTML serialization of the element; for text nodes, the node's string value."""
    root = self._root
    if self._is_text_node(root):
        return TextHandler(str(root))
    serialize_options = dict(
        encoding=self.encoding,
        pretty_print=True,
        method="html",
        with_tail=False,
    )
    rendered = tostring(root, **serialize_options)
    if isinstance(rendered, bytes):
        rendered = rendered.strip().decode(self.encoding)
    return TextHandler(rendered)
361
+
362
def has_class(self, class_name: str) -> bool:
    """Check whether the element carries a given class.

    :param class_name: The class name to look for
    :return: True if the element has that class; False otherwise, and always False for text nodes
    """
    return (not self._is_text_node(self._root)) and class_name in self._root.classes
370
+
371
@property
def parent(self) -> Optional["Selector"]:
    """The direct parent of the element as a ``Selector``, or ``None`` when there is no parent."""
    parent_node = self._root.getparent()
    if parent_node is None:
        return None
    return self.__element_convertor(parent_node)
376
+
377
@property
def below_elements(self) -> "Selectors":
    """Every element below the current one in the DOM tree; empty for text nodes."""
    if self._is_text_node(self._root):
        return Selectors()
    descendants = cast(List, _find_all_elements(self._root))
    if descendants is None:
        return Selectors()
    return self.__elements_convertor(descendants)
384
+
385
@property
def children(self) -> "Selectors":
    """Direct child elements of the current element; empty for text nodes or when there are none."""
    if self._is_text_node(self._root):
        return Selectors()

    wrapped_children = []
    for node in self._root.iterchildren():
        # Skip the node types listed in `html_forbidden`
        if isinstance(node, html_forbidden):
            continue
        wrapped_children.append(self.__element_convertor(node))
    return Selectors(wrapped_children)
395
+
396
@property
def siblings(self) -> "Selectors":
    """The other children of this element's parent; empty when there is no parent."""
    parent_selector = self.parent
    if not parent_selector:
        return Selectors()
    own_root = self._root
    return Selectors(sibling for sibling in parent_selector.children if sibling._root != own_root)
402
+
403
def iterancestors(self) -> Generator["Selector", None, None]:
    """Lazily yield every ancestor of the element as a ``Selector``, starting with its parent.

    Text nodes yield nothing.
    """
    if not self._is_text_node(self._root):
        yield from (self.__element_convertor(node) for node in self._root.iterancestors())
409
+
410
def find_ancestor(self, func: Callable[["Selector"], bool]) -> Optional["Selector"]:
    """Walk up the ancestors and stop at the first one accepted by ``func``.

    :param func: A function that receives each ancestor and returns True/False
    :return: The first ancestor for which ``func`` returns a truthy value, or ``None`` if there is none.
    """
    # `filter` is lazy, so ancestors after the first hit are never tested
    for accepted in filter(func, self.iterancestors()):
        return accepted
    return None
419
+
420
@property
def path(self) -> "Selectors":
    """A `Selectors` list holding all ancestors of the current element, in `iterancestors` order (parent first)."""
    return Selectors(list(self.iterancestors()))
425
+
426
@property
def next(self) -> Optional["Selector"]:
    """The next sibling element under the same parent, or ``None`` when there is none (or for text nodes)."""
    if self._is_text_node(self._root):
        return None

    sibling = self._root.getnext()
    while sibling is not None:
        # HTML comments and other unwanted node types are stepped over
        if not isinstance(sibling, html_forbidden):
            return self.__element_convertor(sibling)
        sibling = sibling.getnext()
    return None
437
+
438
@property
def previous(self) -> Optional["Selector"]:
    """The previous sibling element under the same parent, or ``None`` when there is none (or for text nodes)."""
    if self._is_text_node(self._root):
        return None

    sibling = self._root.getprevious()
    while sibling is not None:
        # HTML comments and other unwanted node types are stepped over
        if not isinstance(sibling, html_forbidden):
            return self.__element_convertor(sibling)
        sibling = sibling.getprevious()
    return None
449
+
450
def get(self) -> TextHandler:
    """
    Serialize this element to a string.
    Text nodes give their text value; HTML elements give the same string as ``html_content``.
    """
    root = self._root
    return TextHandler(str(root)) if self._is_text_node(root) else self.html_content
458
+
459
def getall(self) -> TextHandlers:
    """Return a one-item ``TextHandlers`` list holding the result of ``get()``."""
    serialized = self.get()
    return TextHandlers([serialized])
462
+
463
+ extract = getall
464
+ extract_first = get
465
+
466
def __str__(self) -> str:
    """String form: the text value for text nodes, otherwise the element's ``html_content``."""
    root = self._root
    return str(root) if self._is_text_node(root) else self.html_content
470
+
471
def __repr__(self) -> str:
    """Short debug representation: a truncated preview of the node and, for elements, of its parent."""
    length_limit = 40

    def shorten(value):
        # Keep at most `length_limit` characters and mark any truncation with an ellipsis
        if len(value) > length_limit:
            return value[:length_limit].strip() + "..."
        return value

    if self._is_text_node(self._root):
        return f"<text='{shorten(str(self._root))}'>"

    parts = [f"<data='{shorten(clean_spaces(self.html_content))}'"]

    parent_selector = self.parent
    if parent_selector:
        parts.append(f" parent='{shorten(clean_spaces(parent_selector.html_content))}'")

    parts.append(">")
    return "".join(parts)
493
+
494
+ # From here we start with the selecting functions
495
@overload
def relocate(
    self, element: Union[Dict, HtmlElement, "Selector"], percentage: int, selector_type: Literal[True]
) -> "Selectors": ...

@overload
def relocate(
    self, element: Union[Dict, HtmlElement, "Selector"], percentage: int, selector_type: Literal[False] = False
) -> List[HtmlElement]: ...

def relocate(
    self,
    element: Union[Dict, HtmlElement, "Selector"],
    percentage: int = 0,
    selector_type: bool = False,
) -> Union[List[HtmlElement], "Selectors"]:
    """This function will search again for the element in the page tree, used automatically on page structure change

    :param element: The element we want to relocate in the tree. It can be the stored dictionary form,
        an ``HtmlElement``, or a ``Selector``; the latter two are converted to the dictionary form first.
    :param percentage: The minimum percentage to accept and not going lower than that. Be aware that the percentage
        calculation depends solely on the page structure, so don't play with this number unless you know
        what you are doing!
    :param selector_type: If True, the return result will be converted to `Selectors` object
    :return: The elements that share the highest matching score: a list of pure HTML elements, or a
        `Selectors` object when ``selector_type`` is True. When nothing reaches ``percentage`` the result is
        empty, and it is still a `Selectors` object if ``selector_type`` is True.
    """
    # Note: `element` will most likely always be a dictionary at this point.
    if isinstance(element, self.__class__):
        element = element._root

    if isinstance(element, HtmlElement):
        element = _StorageTools.element_to_dict(element)

    target = cast(Dict, element)

    # Score every element in the page against the target and group the elements by score.
    # The scan doesn't stop at a 100% match because other elements may share the same score.
    score_table: Dict[float, List[Any]] = {}
    for node in cast(List, _find_all_elements(self._root)):
        score = self.__calculate_similarity_score(target, node)
        score_table.setdefault(score, []).append(node)

    if score_table:
        highest_probability = max(score_table)
        if highest_probability >= percentage:
            best_matches = score_table[highest_probability]
            if log.getEffectiveLevel() < 20:
                # No need to execute this part if the logging level is not debugging
                log.debug(f"Highest probability was {highest_probability}%")
                log.debug("Top 5 best matching elements are: ")
                for percent in sorted(score_table, reverse=True)[:5]:
                    log.debug(f"{percent} -> {self.__elements_convertor(score_table[percent])}")

            if selector_type:
                return self.__elements_convertor(best_matches)
            return best_matches

    # Nothing was good enough: keep the promised return type for `selector_type=True`
    return Selectors() if selector_type else []
549
+
550
def css(
    self,
    selector: str,
    identifier: str = "",
    adaptive: bool = False,
    auto_save: bool = False,
    percentage: int = 0,
) -> "Selectors":
    """Search the current tree with CSS3 selectors

    The selector is translated to XPath and the search itself is delegated to ``xpath``.

    **Important:
    It's recommended to use the identifier argument if you plan to use a different selector later
    and want to relocate the same element(s)**

    :param selector: The CSS3 selector to be used.
    :param adaptive: Enabled will make the function try to relocate the element if it was 'saved' before
    :param identifier: A string that will be used to save/retrieve element's data in adaptive,
        otherwise the selector will be used.
    :param auto_save: Automatically save new elements for `adaptive` later
    :param percentage: The minimum percentage to accept while `adaptive` is working and not going lower than that.
        Be aware that the percentage calculation depends solely on the page structure, so don't play with this
        number unless you know what you are doing!

    :return: `Selectors` class.
    :raises SelectorSyntaxError: If the selector can't be parsed or translated.
    """
    if self._is_text_node(self._root):
        return Selectors()

    try:
        # Splitting is only needed when adaptive mode is on and the selector combines several parts
        needs_split = self.__adaptive_enabled and "," in selector
        if not needs_split:
            return self.xpath(
                _css_to_xpath(selector),
                identifier or selector,
                adaptive,
                auto_save,
                percentage,
            )

        # Query each comma-separated part on its own so that saved data is stored
        # per part when the parts point to different elements.
        combined = Selectors()
        for part in split_selectors(selector):
            canonical = part.canonical()
            combined += self.xpath(
                _css_to_xpath(canonical),
                identifier or canonical,
                adaptive,
                auto_save,
                percentage,
            )

        return Selectors(combined)
    except (
        SelectorError,
        SelectorSyntaxError,
    ) as exc:
        raise SelectorSyntaxError(f"Invalid CSS selector '{selector}': {str(exc)}") from exc
609
+
610
def xpath(
    self,
    selector: str,
    identifier: str = "",
    adaptive: bool = False,
    auto_save: bool = False,
    percentage: int = 0,
    **kwargs: Any,
) -> "Selectors":
    """Search the current tree with XPath selectors

    **Important:
    It's recommended to use the identifier argument if you plan to use a different selector later
    and want to relocate the same element(s)**

    Note: **Additional keyword arguments will be passed as XPath variables in the XPath expression!**

    :param selector: The XPath selector to be used.
    :param adaptive: Enabled will make the function try to relocate the element if it was 'saved' before
    :param identifier: A string that will be used to save/retrieve element's data in adaptive,
        otherwise the selector will be used.
    :param auto_save: Automatically save new elements for `adaptive` later
    :param percentage: The minimum percentage to accept while `adaptive` is working and not going lower than that.
        Be aware that the percentage calculation depends solely on the page structure, so don't play with this
        number unless you know what you are doing!

    :return: `Selectors` class. It is empty for text nodes and when nothing was found or relocated.
    :raises SelectorSyntaxError: If the XPath expression is invalid or fails to evaluate.
    """
    if self._is_text_node(self._root):
        return Selectors()

    try:
        if elements := self._root.xpath(selector, **kwargs):
            # Direct hit: optionally remember the first match for later relocation
            if auto_save:
                if self.__adaptive_enabled:
                    self.save(elements[0], identifier or selector)
                else:
                    log.warning(
                        "Argument `auto_save` will be ignored because `adaptive` wasn't enabled on initialization. Check docs for more info."
                    )

            return self.__handle_elements(elements)
        elif self.__adaptive_enabled:
            # Nothing matched: try to relocate the element from previously saved data
            if adaptive:
                element_data = self.retrieve(identifier or selector)
                if element_data:
                    elements = self.relocate(element_data, percentage)
                    # `relocate` returns an empty list when no candidate reaches `percentage`,
                    # so test for emptiness before indexing to avoid an IndexError.
                    if elements and auto_save:
                        self.save(elements[0], identifier or selector)

            return self.__handle_elements(elements)
        else:
            if adaptive:
                log.warning(
                    "Argument `adaptive` will be ignored because `adaptive` wasn't enabled on initialization. Check docs for more info."
                )
            elif auto_save:
                log.warning(
                    "Argument `auto_save` will be ignored because `adaptive` wasn't enabled on initialization. Check docs for more info."
                )

            return self.__handle_elements(elements)

    except (
        SelectorError,
        SelectorSyntaxError,
        XPathError,
        XPathEvalError,
    ) as e:
        raise SelectorSyntaxError(f"Invalid XPath selector: {selector}") from e
679
+
680
def find_all(
    self,
    *args: str | Iterable[str] | Pattern | Callable | Dict[str, str],
    **kwargs: str,
) -> "Selectors":
    """Find elements by filters of your creations for ease.

    Tag names and attributes are combined into one CSS selector; regex patterns and functions are then
    applied as filters on the matched elements (or on all elements below this one when no tag or
    attribute filter was given).

    :param args: Tag name(s), iterable of tag names, regex patterns, function, or a dictionary of elements' attributes.
    :param kwargs: The attributes you want to filter elements based on it. Use ``class_``/``for_`` for the
        reserved words ``class``/``for``.
    :return: The `Selectors` object of the elements or empty list
    :raises TypeError: If no filter is passed at all, or if a filter has an unsupported type or shape.
    """
    if self._is_text_node(self._root):
        return Selectors()

    if not args and not kwargs:
        raise TypeError("You have to pass something to search with, like tag name(s), tag attributes, or both.")

    attributes: Dict[str, Any] = dict()
    tags: Set[str] = set()
    patterns: Set[Pattern] = set()
    results, functions, selectors = Selectors(), [], []

    # Sort every positional argument into tags, attributes, patterns, or filter functions
    for arg in args:
        if isinstance(arg, str):
            tags.add(arg)

        elif type(arg) in (list, tuple, set):
            arg = cast(Iterable, arg)  # Type narrowing for type checkers like pyright
            if not all(map(lambda x: isinstance(x, str), arg)):
                raise TypeError("Nested Iterables are not accepted, only iterables of tag names are accepted")
            tags.update(set(arg))

        elif isinstance(arg, dict):
            if not all([(isinstance(k, str) and isinstance(v, str)) for k, v in arg.items()]):
                raise TypeError(
                    "Nested dictionaries are not accepted, only string keys and string values are accepted"
                )
            attributes.update(arg)

        elif isinstance(arg, re_Pattern):
            patterns.add(arg)

        elif callable(arg):
            # A filter function must be able to receive the element it is tested against
            if len(signature(arg).parameters) > 0:
                functions.append(arg)
            else:
                raise TypeError(
                    "Callable filter function must have at least one argument to take `Selector` objects."
                )

        else:
            raise TypeError(f'Argument with type "{type(arg)}" is not accepted, please read the docs.')

    if not all([(isinstance(k, str) and isinstance(v, str)) for k, v in kwargs.items()]):
        raise TypeError("Only string values are accepted for arguments")

    for attribute_name, value in kwargs.items():
        # Only replace names for kwargs, replacing them in dictionaries doesn't make sense
        attribute_name = _whitelisted.get(attribute_name, attribute_name)
        attributes[attribute_name] = value

    # It's easier and faster to build a selector than traversing the tree
    tags = tags or set("*")
    for tag in tags:
        selector = tag
        for key, value in attributes.items():
            value = value.replace('"', r"\"")  # Escape double quotes in user input
            # Not escaping anything with the key so the user can pass patterns like {'href*': '/p/'} or get errors :)
            selector += '[{}="{}"]'.format(key, value)
        # A bare "*" means there is no tag or attribute filter, so no selector is needed for it
        if selector != "*":
            selectors.append(selector)

    if selectors:
        results = cast(Selectors, self.css(", ".join(selectors)))
        if results:
            # From the results, get the ones that fulfill passed regex patterns
            for pattern in patterns:
                results = results.filter(lambda e: e.text.re(pattern, check_match=True))

            # From the results, get the ones that fulfill passed functions
            for function in functions:
                results = results.filter(function)
    else:
        # Only patterns and/or functions were given: start from every element below this one
        results = results or self.below_elements
        for pattern in patterns:
            results = results.filter(lambda e: e.text.re(pattern, check_match=True))

        # Keep only the elements that fulfill the passed functions
        for function in functions:
            results = results.filter(function)

    return results
773
+
774
+ def find(
775
+ self,
776
+ *args: str | Iterable[str] | Pattern | Callable | Dict[str, str],
777
+ **kwargs: str,
778
+ ) -> Optional["Selector"]:
779
+ """Find elements by filters of your creations for ease, then return the first result. Otherwise return `None`.
780
+
781
+ :param args: Tag name(s), iterable of tag names, regex patterns, function, or a dictionary of elements' attributes. Leave empty for selecting all.
782
+ :param kwargs: The attributes you want to filter elements based on it.
783
+ :return: The `Selector` object of the element or `None` if the result didn't match
784
+ """
785
+ for element in self.find_all(*args, **kwargs):
786
+ return element
787
+ return None
788
+
789
+ def __calculate_similarity_score(self, original: Dict, candidate: HtmlElement) -> float:
790
+ """Used internally to calculate a score that shows how a candidate element similar to the original one
791
+
792
+ :param original: The original element in the form of the dictionary generated from `element_to_dict` function
793
+ :param candidate: The element to compare with the original element.
794
+ :return: A percentage score of how similar is the candidate to the original element
795
+ """
796
+ score: float = 0
797
+ checks: int = 0
798
+ data = _StorageTools.element_to_dict(candidate)
799
+
800
+ score += 1 if original["tag"] == data["tag"] else 0
801
+ checks += 1
802
+
803
+ if original["text"]:
804
+ score += SequenceMatcher(None, original["text"], data.get("text") or "").ratio()
805
+ checks += 1
806
+
807
+ # if both don't have attributes, it still counts for something!
808
+ score += self.__calculate_dict_diff(original["attributes"], data["attributes"])
809
+ checks += 1
810
+
811
+ # Separate similarity test for class, id, href,... this will help in full structural changes
812
+ for attrib in (
813
+ "class",
814
+ "id",
815
+ "href",
816
+ "src",
817
+ ):
818
+ if original["attributes"].get(attrib):
819
+ score += SequenceMatcher(
820
+ None,
821
+ original["attributes"][attrib],
822
+ data["attributes"].get(attrib) or "",
823
+ ).ratio()
824
+ checks += 1
825
+
826
+ score += SequenceMatcher(None, original["path"], data["path"]).ratio()
827
+ checks += 1
828
+
829
+ if original.get("parent_name"):
830
+ # Then we start comparing parents' data
831
+ if data.get("parent_name"):
832
+ score += SequenceMatcher(None, original["parent_name"], data.get("parent_name") or "").ratio()
833
+ checks += 1
834
+
835
+ score += self.__calculate_dict_diff(original["parent_attribs"], data.get("parent_attribs") or {})
836
+ checks += 1
837
+
838
+ if original["parent_text"]:
839
+ score += SequenceMatcher(
840
+ None,
841
+ original["parent_text"],
842
+ data.get("parent_text") or "",
843
+ ).ratio()
844
+ checks += 1
845
+ # else:
846
+ # # The original element has a parent and this one not, this is not a good sign
847
+ # score -= 0.1
848
+
849
+ if original.get("siblings"):
850
+ score += SequenceMatcher(None, original["siblings"], data.get("siblings") or []).ratio()
851
+ checks += 1
852
+
853
+ # How % sure? let's see
854
+ return round((score / checks) * 100, 2)
855
+
856
+ @staticmethod
857
+ def __calculate_dict_diff(dict1: Dict, dict2: Dict) -> float:
858
+ """Used internally to calculate similarity between two dictionaries as SequenceMatcher doesn't accept dictionaries"""
859
+ score = SequenceMatcher(None, tuple(dict1.keys()), tuple(dict2.keys())).ratio() * 0.5
860
+ score += SequenceMatcher(None, tuple(dict1.values()), tuple(dict2.values())).ratio() * 0.5
861
+ return score
862
+
863
+ def save(self, element: HtmlElement, identifier: str) -> None:
864
+ """Saves the element's unique properties to the storage for retrieval and relocation later
865
+
866
+ :param element: The element itself that we want to save to storage, it can be a ` Selector ` or pure ` HtmlElement `
867
+ :param identifier: This is the identifier that will be used to retrieve the element later from the storage. See
868
+ the docs for more info.
869
+ """
870
+ if self.__adaptive_enabled and self._storage:
871
+ target_element: Any = element
872
+ if isinstance(target_element, self.__class__):
873
+ target_element = target_element._root
874
+
875
+ if self._is_text_node(target_element):
876
+ target_element = target_element.getparent()
877
+
878
+ self._storage.save(target_element, identifier)
879
+ else:
880
+ raise RuntimeError(
881
+ "Can't use `adaptive` features while it's disabled globally, you have to start a new class instance."
882
+ )
883
+
884
+ def retrieve(self, identifier: str) -> Optional[Dict[str, Any]]:
885
+ """Using the identifier, we search the storage and return the unique properties of the element
886
+
887
+ :param identifier: This is the identifier that will be used to retrieve the element from the storage. See
888
+ the docs for more info.
889
+ :return: A dictionary of the unique properties
890
+ """
891
+ if self.__adaptive_enabled and self._storage:
892
+ return self._storage.retrieve(identifier)
893
+
894
+ raise RuntimeError(
895
+ "Can't use `adaptive` features while it's disabled globally, you have to start a new class instance."
896
+ )
897
+
898
+ # Operations on text functions
899
+ def json(self) -> Dict:
900
+ """Return JSON response if the response is jsonable otherwise throws error"""
901
+ if self._is_text_node(self._root):
902
+ return TextHandler(str(self._root)).json()
903
+ if self._raw_body and isinstance(self._raw_body, (str, bytes)):
904
+ if isinstance(self._raw_body, str):
905
+ return TextHandler(self._raw_body).json()
906
+ else:
907
+ if TYPE_CHECKING:
908
+ assert isinstance(self._raw_body, bytes)
909
+ return TextHandler(self._raw_body.decode()).json()
910
+ elif self.text:
911
+ return self.text.json()
912
+ else:
913
+ return self.get_all_text(strip=True).json()
914
+
915
+ def re(
916
+ self,
917
+ regex: str | Pattern[str],
918
+ replace_entities: bool = True,
919
+ clean_match: bool = False,
920
+ case_sensitive: bool = True,
921
+ ) -> TextHandlers:
922
+ """Apply the given regex to the current text and return a list of strings with the matches.
923
+
924
+ :param regex: Can be either a compiled regular expression or a string.
925
+ :param replace_entities: If enabled character entity references are replaced by their corresponding character
926
+ :param clean_match: if enabled, this will ignore all whitespaces and consecutive spaces while matching
927
+ :param case_sensitive: if disabled, the function will set the regex to ignore the letters case while compiling it
928
+ """
929
+ return self.text.re(regex, replace_entities, clean_match, case_sensitive)
930
+
931
+ def re_first(
932
+ self,
933
+ regex: str | Pattern[str],
934
+ default=None,
935
+ replace_entities: bool = True,
936
+ clean_match: bool = False,
937
+ case_sensitive: bool = True,
938
+ ) -> TextHandler:
939
+ """Apply the given regex to text and return the first match if found, otherwise return the default value.
940
+
941
+ :param regex: Can be either a compiled regular expression or a string.
942
+ :param default: The default value to be returned if there is no match
943
+ :param replace_entities: if enabled character entity references are replaced by their corresponding character
944
+ :param clean_match: if enabled, this will ignore all whitespaces and consecutive spaces while matching
945
+ :param case_sensitive: if disabled, the function will set the regex to ignore the letters case while compiling it
946
+ """
947
+ return self.text.re_first(regex, default, replace_entities, clean_match, case_sensitive)
948
+
949
+ @staticmethod
950
+ def __get_attributes(element: HtmlElement, ignore_attributes: List | Tuple) -> Dict:
951
+ """Return attributes dictionary without the ignored list"""
952
+ return {k: v for k, v in element.attrib.items() if k not in ignore_attributes}
953
+
954
+ def __are_alike(
955
+ self,
956
+ original: HtmlElement,
957
+ original_attributes: Dict,
958
+ candidate: HtmlElement,
959
+ ignore_attributes: List | Tuple,
960
+ similarity_threshold: float,
961
+ match_text: bool = False,
962
+ ) -> bool:
963
+ """Calculate a score of how much these elements are alike and return True
964
+ if the score is higher or equals the threshold"""
965
+ candidate_attributes = (
966
+ self.__get_attributes(candidate, ignore_attributes) if ignore_attributes else candidate.attrib
967
+ )
968
+ score: float = 0
969
+ checks: int = 0
970
+
971
+ if original_attributes:
972
+ score += sum(
973
+ SequenceMatcher(None, v, candidate_attributes.get(k, "")).ratio()
974
+ for k, v in original_attributes.items()
975
+ )
976
+ checks += len(candidate_attributes)
977
+ else:
978
+ if not candidate_attributes:
979
+ # Both don't have attributes, this must mean something
980
+ score += 1
981
+ checks += 1
982
+
983
+ if match_text:
984
+ score += SequenceMatcher(
985
+ None,
986
+ clean_spaces(original.text or ""),
987
+ clean_spaces(candidate.text or ""),
988
+ ).ratio()
989
+ checks += 1
990
+
991
+ if checks:
992
+ return round(score / checks, 2) >= similarity_threshold
993
+ return False
994
+
995
+ def find_similar(
996
+ self,
997
+ similarity_threshold: float = 0.2,
998
+ ignore_attributes: List | Tuple = (
999
+ "href",
1000
+ "src",
1001
+ ),
1002
+ match_text: bool = False,
1003
+ ) -> "Selectors":
1004
+ """Find elements that are in the same tree depth in the page with the same tag name and same parent tag etc...
1005
+ then return the ones that match the current element attributes with a percentage higher than the input threshold.
1006
+
1007
+ This function is inspired by AutoScraper and made for cases where you, for example, found a product div inside
1008
+ a products-list container and want to find other products using that element as a starting point EXCEPT
1009
+ this function works in any case without depending on the element type.
1010
+
1011
+ :param similarity_threshold: The percentage to use while comparing element attributes.
1012
+ Note: Elements found before attributes matching/comparison will be sharing the same depth, same tag name,
1013
+ same parent tag name, and same grand parent tag name. So they are 99% likely to be correct unless you are
1014
+ extremely unlucky, then attributes matching comes into play, so don't play with this number unless
1015
+ you are getting the results you don't want.
1016
+ Also, if the current element doesn't have attributes and the similar element as well, then it's a 100% match.
1017
+ :param ignore_attributes: Attribute names passed will be ignored while matching the attributes in the last step.
1018
+ The default value is to ignore `href` and `src` as URLs can change a lot between elements, so it's unreliable
1019
+ :param match_text: If True, element text content will be taken into calculation while matching.
1020
+ Not recommended to use in normal cases, but it depends.
1021
+
1022
+ :return: A ``Selectors`` container of ``Selector`` objects or empty list
1023
+ """
1024
+ if self._is_text_node(self._root):
1025
+ return Selectors()
1026
+
1027
+ # We will use the elements' root from now on to get the speed boost of using Lxml directly
1028
+ root = self._root
1029
+ similar_elements = list()
1030
+
1031
+ current_depth = len(list(root.iterancestors()))
1032
+ target_attrs = self.__get_attributes(root, ignore_attributes) if ignore_attributes else root.attrib
1033
+
1034
+ path_parts = [self.tag]
1035
+ if (parent := root.getparent()) is not None:
1036
+ path_parts.insert(0, parent.tag)
1037
+ if (grandparent := parent.getparent()) is not None:
1038
+ path_parts.insert(0, grandparent.tag)
1039
+
1040
+ xpath_path = "//{}".format("/".join(path_parts))
1041
+ potential_matches = root.xpath(f"{xpath_path}[count(ancestor::*) = {current_depth}]")
1042
+
1043
+ for potential_match in potential_matches:
1044
+ if potential_match != root and self.__are_alike(
1045
+ root,
1046
+ target_attrs,
1047
+ potential_match,
1048
+ ignore_attributes,
1049
+ similarity_threshold,
1050
+ match_text,
1051
+ ):
1052
+ similar_elements.append(potential_match)
1053
+
1054
+ return Selectors(map(self.__element_convertor, similar_elements))
1055
+
1056
+ @overload
1057
+ def find_by_text(
1058
+ self,
1059
+ text: str,
1060
+ first_match: Literal[True] = ...,
1061
+ partial: bool = ...,
1062
+ case_sensitive: bool = ...,
1063
+ clean_match: bool = ...,
1064
+ ) -> "Selector": ...
1065
+
1066
+ @overload
1067
+ def find_by_text(
1068
+ self,
1069
+ text: str,
1070
+ first_match: Literal[False],
1071
+ partial: bool = ...,
1072
+ case_sensitive: bool = ...,
1073
+ clean_match: bool = ...,
1074
+ ) -> "Selectors": ...
1075
+
1076
+ def find_by_text(
1077
+ self,
1078
+ text: str,
1079
+ first_match: bool = True,
1080
+ partial: bool = False,
1081
+ case_sensitive: bool = False,
1082
+ clean_match: bool = True,
1083
+ ) -> Union["Selectors", "Selector"]:
1084
+ """Find elements that its text content fully/partially matches input.
1085
+ :param text: Text query to match
1086
+ :param first_match: Returns the first element that matches conditions, enabled by default
1087
+ :param partial: If enabled, the function returns elements that contain the input text
1088
+ :param case_sensitive: if enabled, the letters case will be taken into consideration
1089
+ :param clean_match: if enabled, this will ignore all whitespaces and consecutive spaces while matching
1090
+ """
1091
+ if self._is_text_node(self._root):
1092
+ return Selectors()
1093
+
1094
+ results = Selectors()
1095
+ if not case_sensitive:
1096
+ text = text.lower()
1097
+
1098
+ possible_targets = cast(List, _find_all_elements_with_spaces(self._root))
1099
+ if possible_targets:
1100
+ for node in self.__elements_convertor(possible_targets):
1101
+ """Check if element matches given text otherwise, traverse the children tree and iterate"""
1102
+ node_text: TextHandler = node.text
1103
+ if clean_match:
1104
+ node_text = TextHandler(node_text.clean())
1105
+
1106
+ if not case_sensitive:
1107
+ node_text = TextHandler(node_text.lower())
1108
+
1109
+ if partial:
1110
+ if text in node_text:
1111
+ results.append(node)
1112
+ elif text == node_text:
1113
+ results.append(node)
1114
+
1115
+ if first_match and results:
1116
+ # we got an element so we should stop
1117
+ break
1118
+
1119
+ if first_match:
1120
+ if results:
1121
+ return results[0]
1122
+ return results
1123
+
1124
+ @overload
1125
+ def find_by_regex(
1126
+ self,
1127
+ query: str | Pattern[str],
1128
+ first_match: Literal[True] = ...,
1129
+ case_sensitive: bool = ...,
1130
+ clean_match: bool = ...,
1131
+ ) -> "Selector": ...
1132
+
1133
+ @overload
1134
+ def find_by_regex(
1135
+ self,
1136
+ query: str | Pattern[str],
1137
+ first_match: Literal[False],
1138
+ case_sensitive: bool = ...,
1139
+ clean_match: bool = ...,
1140
+ ) -> "Selectors": ...
1141
+
1142
+ def find_by_regex(
1143
+ self,
1144
+ query: str | Pattern[str],
1145
+ first_match: bool = True,
1146
+ case_sensitive: bool = False,
1147
+ clean_match: bool = True,
1148
+ ) -> Union["Selectors", "Selector"]:
1149
+ """Find elements that its text content matches the input regex pattern.
1150
+ :param query: Regex query/pattern to match
1151
+ :param first_match: Return the first element that matches conditions; enabled by default.
1152
+ :param case_sensitive: If enabled, the letters case will be taken into consideration in the regex.
1153
+ :param clean_match: If enabled, this will ignore all whitespaces and consecutive spaces while matching.
1154
+ """
1155
+ if self._is_text_node(self._root):
1156
+ return Selectors()
1157
+
1158
+ results = Selectors()
1159
+
1160
+ possible_targets = cast(List, _find_all_elements_with_spaces(self._root))
1161
+ if possible_targets:
1162
+ for node in self.__elements_convertor(possible_targets):
1163
+ """Check if element matches given regex otherwise, traverse the children tree and iterate"""
1164
+ node_text = node.text
1165
+ if node_text.re(
1166
+ query,
1167
+ check_match=True,
1168
+ clean_match=clean_match,
1169
+ case_sensitive=case_sensitive,
1170
+ ):
1171
+ results.append(node)
1172
+
1173
+ if first_match and results:
1174
+ # we got an element so we should stop
1175
+ break
1176
+
1177
+ if results and first_match:
1178
+ return results[0]
1179
+ return results
1180
+
1181
+
1182
class Selectors(List[Selector]):
    """
    A list of ``Selector`` objects: a subclass of the builtin ``List`` class with a few extra helpers
    that run an operation over every contained element.
    """

    __slots__ = ()

    @overload
    def __getitem__(self, pos: SupportsIndex) -> Selector: ...

    @overload
    def __getitem__(self, pos: slice) -> "Selectors": ...

    def __getitem__(self, pos: SupportsIndex | slice) -> Union[Selector, "Selectors"]:
        picked = super().__getitem__(pos)
        if not isinstance(pos, slice):
            return cast(Selector, picked)
        # Slices are re-wrapped so the extra helpers stay available on the result
        return self.__class__(cast(List[Selector], picked))

    def xpath(
        self,
        selector: str,
        identifier: str = "",
        auto_save: bool = False,
        percentage: int = 0,
        **kwargs: Any,
    ) -> "Selectors":
        """
        Run ``.xpath()`` on every element of this list and return all the
        results flattened into another `Selectors` object.

        **Important:
        It's recommended to use the identifier argument if you plan to use a different selector later
        and want to relocate the same element(s)**

        Note: **Additional keyword arguments will be passed as XPath variables in the XPath expression!**

        :param selector: The XPath selector to be used.
        :param identifier: A string that will be used to retrieve element's data in adaptive,
            otherwise the selector will be used.
        :param auto_save: Automatically save new elements for `adaptive` later
        :param percentage: The minimum percentage to accept while `adaptive` is working and not going lower than that.
            Be aware that the percentage calculation depends solely on the page structure, so don't play with this
            number unless you must know what you are doing!

        :return: `Selectors` class.
        """
        key = identifier or selector
        per_element = [element.xpath(selector, key, False, auto_save, percentage, **kwargs) for element in self]
        return self.__class__(flatten(per_element))

    def css(
        self,
        selector: str,
        identifier: str = "",
        auto_save: bool = False,
        percentage: int = 0,
    ) -> "Selectors":
        """
        Run ``.css()`` on every element of this list and return all the
        results flattened into another `Selectors` object.

        **Important:
        It's recommended to use the identifier argument if you plan to use a different selector later
        and want to relocate the same element(s)**

        :param selector: The CSS3 selector to be used.
        :param identifier: A string that will be used to retrieve element's data in adaptive,
            otherwise the selector will be used.
        :param auto_save: Automatically save new elements for `adaptive` later
        :param percentage: The minimum percentage to accept while `adaptive` is working and not going lower than that.
            Be aware that the percentage calculation depends solely on the page structure, so don't play with this
            number unless you must know what you are doing!

        :return: `Selectors` class.
        """
        key = identifier or selector
        per_element = [element.css(selector, key, False, auto_save, percentage) for element in self]
        return self.__class__(flatten(per_element))

    def re(
        self,
        regex: str | Pattern,
        replace_entities: bool = True,
        clean_match: bool = False,
        case_sensitive: bool = True,
    ) -> TextHandlers:
        """Run ``.re()`` on every element of this list and return all the
        matches flattened as List of TextHandler.

        :param regex: Can be either a compiled regular expression or a string.
        :param replace_entities: If enabled character entity references are replaced by their corresponding character
        :param clean_match: if enabled, this will ignore all whitespaces and consecutive spaces while matching
        :param case_sensitive: if disabled, the function will set the regex to ignore the letters case while compiling it
        """
        per_element = [element.re(regex, replace_entities, clean_match, case_sensitive) for element in self]
        return TextHandlers(flatten(per_element))

    def re_first(
        self,
        regex: str | Pattern,
        default: Any = None,
        replace_entities: bool = True,
        clean_match: bool = False,
        case_sensitive: bool = True,
    ) -> TextHandler:
        """Run ``.re()`` on the elements of this list one by one and return
        the very first match found, or the default value when there is none.

        :param regex: Can be either a compiled regular expression or a string.
        :param default: The default value to be returned if there is no match
        :param replace_entities: if enabled character entity references are replaced by their corresponding character
        :param clean_match: if enabled, this will ignore all whitespaces and consecutive spaces while matching
        :param case_sensitive: if disabled, function will set the regex to ignore the letters case while compiling it
        """
        for element in self:
            # Returning from inside the loop stops at the first match of the first element that has one
            for match in element.re(regex, replace_entities, clean_match, case_sensitive):
                return match
        return default

    def search(self, func: Callable[["Selector"], bool]) -> Optional["Selector"]:
        """Return the first element of this list that the passed function accepts.

        :param func: A function that takes each element as an argument and returns True/False
        :return: The first element that match the function or ``None`` otherwise.
        """
        return next((element for element in self if func(element)), None)

    def filter(self, func: Callable[["Selector"], bool]) -> "Selectors":
        """Keep only the elements of this list that the passed function accepts.

        :param func: A function that takes each element as an argument and returns True/False
        :return: The new `Selectors` object or empty list otherwise.
        """
        accepted = []
        for element in self:
            if func(element):
                accepted.append(element)
        return self.__class__(accepted)

    @overload
    def get(self) -> Optional[TextHandler]: ...

    @overload
    def get(self, default: _T) -> Union[TextHandler, _T]: ...

    def get(self, default=None):
        """Returns the serialized string of the first element, or ``default`` if empty.

        :param default: the default value to return if the current list is empty
        """
        if not self:
            return default
        return self[0].get()

    def getall(self) -> TextHandlers:
        """Serialize every element and return the results as a TextHandlers list."""
        serialized = [element.get() for element in self]
        return TextHandlers(serialized)

    extract = getall
    extract_first = get

    @property
    def first(self) -> Optional[Selector]:
        """Returns the first Selector item of the current list or `None` if the list is empty"""
        if len(self) > 0:
            return self[0]
        return None

    @property
    def last(self) -> Optional[Selector]:
        """Returns the last Selector item of the current list or `None` if the list is empty"""
        if len(self) > 0:
            return self[-1]
        return None

    @property
    def length(self) -> int:
        """Returns the length of the current list"""
        return len(self)

    def __getstate__(self) -> Any:  # pragma: no cover
        # lxml don't like it :)
        raise TypeError("Can't pickle Selectors object")
1359
+
1360
+
1361
# For backward compatibility: `Adaptor` and `Adaptors` are plain aliases that point to
# the very same classes as `Selector` and `Selectors`.
Adaptor = Selector
Adaptors = Selectors
py.typed ADDED
@@ -0,0 +1 @@
 
 
1
+
spiders/__init__.py ADDED
@@ -0,0 +1,18 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from .request import Request
2
+ from .result import CrawlResult
3
+ from .scheduler import Scheduler
4
+ from .engine import CrawlerEngine
5
+ from .session import SessionManager
6
+ from .spider import Spider, SessionConfigurationError
7
+ from scrapling.engines.toolbelt.custom import Response
8
+
9
# Names exported by `from <this package> import *`.
__all__ = [
    "Spider",
    "SessionConfigurationError",
    "Request",
    "CrawlerEngine",
    "CrawlResult",
    "SessionManager",
    "Scheduler",
    "Response",
]
spiders/checkpoint.py ADDED
@@ -0,0 +1,90 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import pickle
2
+ from pathlib import Path
3
+ from dataclasses import dataclass, field
4
+
5
+ import anyio
6
+ from anyio import Path as AsyncPath
7
+
8
+ from scrapling.core.utils import log
9
+ from scrapling.core._types import Set, List, Optional, TYPE_CHECKING
10
+
11
+ if TYPE_CHECKING:
12
+ from scrapling.spiders.request import Request
13
+
14
+
15
@dataclass
class CheckpointData:
    """Container for checkpoint state.

    Both fields default to a fresh empty container per instance.
    """

    # Requests held in the checkpoint
    requests: List["Request"] = field(default_factory=list)
    # Byte-string entries of what was already seen (reported as "seen URLs" in the checkpoint log messages)
    seen: Set[bytes] = field(default_factory=set)
21
+
22
+
23
class CheckpointManager:
    """Manages saving and loading checkpoint state to/from disk.

    The state is pickled into ``CHECKPOINT_FILE`` inside the crawl directory.
    """

    CHECKPOINT_FILE = "checkpoint.pkl"

    def __init__(self, crawldir: str | Path | AsyncPath, interval: float = 300.0):
        """
        :param crawldir: Directory that holds the checkpoint file.
        :param interval: Checkpoints interval; it is only validated and stored here.
        :raises TypeError: If the interval is not an integer or a float.
        :raises ValueError: If the interval is negative.
        """
        # Validate first so an invalid manager is never left half-initialised
        if not isinstance(interval, (int, float)):
            raise TypeError("Checkpoints interval must be integer or float.")
        if interval < 0:
            raise ValueError("Checkpoints interval must be equal or greater than 0.")

        self.crawldir = AsyncPath(crawldir)
        self._checkpoint_path = self.crawldir / self.CHECKPOINT_FILE
        self.interval = interval

    async def has_checkpoint(self) -> bool:
        """Check if a checkpoint exists."""
        return await self._checkpoint_path.exists()

    async def save(self, data: CheckpointData) -> None:
        """Save checkpoint data to disk atomically.

        The data is written to a temporary file next to the checkpoint, which then
        replaces the checkpoint file in one step. Any failure is logged and re-raised
        after the temporary file is removed.

        :param data: The checkpoint state to persist.
        """
        await self.crawldir.mkdir(parents=True, exist_ok=True)

        temp_path = self._checkpoint_path.with_suffix(".tmp")

        try:
            serialized = pickle.dumps(data, protocol=pickle.HIGHEST_PROTOCOL)
            async with await anyio.open_file(temp_path, "wb") as f:
                await f.write(serialized)

            # `replace` overwrites an existing checkpoint on every platform, while `rename`
            # raises FileExistsError on Windows once a previous checkpoint is present.
            await temp_path.replace(self._checkpoint_path)

            log.info(f"Checkpoint saved: {len(data.requests)} requests, {len(data.seen)} seen URLs")
        except Exception as e:
            # Clean up temp file if it exists
            if await temp_path.exists():
                await temp_path.unlink()
            log.error(f"Failed to save checkpoint: {e}")
            raise

    async def load(self) -> Optional[CheckpointData]:
        """Load checkpoint data from disk.

        Returns None if no checkpoint exists or if loading fails (including the case
        where the file does not contain a `CheckpointData` object).
        """
        if not await self.has_checkpoint():
            return None

        try:
            async with await anyio.open_file(self._checkpoint_path, "rb") as f:
                content = await f.read()

            # SECURITY: unpickling can execute arbitrary code, so the crawl directory
            # must only ever contain checkpoint files written by a trusted source.
            data = pickle.loads(content)
            if not isinstance(data, CheckpointData):
                raise TypeError(f"Unexpected checkpoint content of type {type(data).__name__!r}")

            log.info(f"Checkpoint loaded: {len(data.requests)} requests, {len(data.seen)} seen URLs")
            return data

        except Exception as e:
            # Deliberately best-effort: a broken checkpoint must not prevent a fresh start
            log.error(f"Failed to load checkpoint (starting fresh): {e}")
            return None

    async def cleanup(self) -> None:
        """Delete checkpoint file after successful completion.

        Failures are only logged as warnings and never raised.
        """
        try:
            if await self._checkpoint_path.exists():
                await self._checkpoint_path.unlink()
                log.debug("Checkpoint file cleaned up")
        except Exception as e:
            log.warning(f"Failed to cleanup checkpoint file: {e}")
spiders/engine.py ADDED
@@ -0,0 +1,333 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import json
2
+ import pprint
3
+ from pathlib import Path
4
+
5
+ import anyio
6
+ from anyio import Path as AsyncPath
7
+ from anyio import create_task_group, CapacityLimiter, create_memory_object_stream, EndOfStream
8
+
9
+ from scrapling.core.utils import log
10
+ from scrapling.spiders.request import Request
11
+ from scrapling.spiders.scheduler import Scheduler
12
+ from scrapling.spiders.session import SessionManager
13
+ from scrapling.spiders.result import CrawlStats, ItemList
14
+ from scrapling.spiders.checkpoint import CheckpointManager, CheckpointData
15
+ from scrapling.core._types import Dict, Union, Optional, TYPE_CHECKING, Any, AsyncGenerator
16
+
17
+ if TYPE_CHECKING:
18
+ from scrapling.spiders.spider import Spider
19
+
20
+
21
def _dump(obj: Dict) -> str:
    """Render `obj` as a JSON string indented with four spaces."""
    rendered = json.dumps(obj, indent=4)
    return rendered
23
+
24
+
25
+ class CrawlerEngine:
26
+ """Orchestrates the crawling process."""
27
+
28
    def __init__(
        self,
        spider: "Spider",
        session_manager: SessionManager,
        crawldir: Optional[Union[str, Path, AsyncPath]] = None,
        interval: float = 300.0,
    ):
        """Set up the scheduler, limiters, statistics, and checkpoint state for a crawl.

        :param spider: The spider whose settings (fingerprint options, concurrency, allowed domains) are read here.
        :param session_manager: The session manager stored on the engine.
        :param crawldir: Directory handed to the checkpoint manager; checkpoints are flagged as enabled
            only when this value is truthy.
        :param interval: Checkpoints interval handed to the checkpoint manager.
        """
        self.spider = spider
        self.session_manager = session_manager
        # The scheduler is configured from the spider's fingerprint settings
        self.scheduler = Scheduler(
            include_kwargs=spider.fp_include_kwargs,
            include_headers=spider.fp_include_headers,
            keep_fragments=spider.fp_keep_fragments,
        )
        self.stats = CrawlStats()

        # One global limiter, plus per-domain limiters that are created lazily
        self._global_limiter = CapacityLimiter(spider.concurrent_requests)
        self._domain_limiters: dict[str, CapacityLimiter] = {}
        # An empty set means no domain restriction is applied
        self._allowed_domains: set[str] = spider.allowed_domains or set()

        self._active_tasks: int = 0
        self._running: bool = False
        self._items: ItemList = ItemList()
        self._item_stream: Any = None

        # The manager is always constructed (with "" when no crawldir is given),
        # so `interval` is passed to it even when checkpoints are disabled
        self._checkpoint_system_enabled = bool(crawldir)
        self._checkpoint_manager = CheckpointManager(crawldir or "", interval)
        self._last_checkpoint_time: float = 0.0
        self._pause_requested: bool = False
        self._force_stop: bool = False
        self.paused: bool = False
59
+
60
+ def _is_domain_allowed(self, request: Request) -> bool:
61
+ """Check if the request's domain is in allowed_domains."""
62
+ if not self._allowed_domains:
63
+ return True
64
+
65
+ domain = request.domain
66
+ for allowed in self._allowed_domains:
67
+ if domain == allowed or domain.endswith("." + allowed):
68
+ return True
69
+ return False
70
+
71
+ def _rate_limiter(self, domain: str) -> CapacityLimiter:
72
+ """Get or create a per-domain concurrency limiter if enabled, otherwise use the global limiter."""
73
+ if self.spider.concurrent_requests_per_domain:
74
+ if domain not in self._domain_limiters:
75
+ self._domain_limiters[domain] = CapacityLimiter(self.spider.concurrent_requests_per_domain)
76
+ return self._domain_limiters[domain]
77
+ return self._global_limiter
78
+
79
+ def _normalize_request(self, request: Request) -> None:
80
+ """Normalize request fields before enqueueing.
81
+
82
+ Resolves empty sid to the session manager's default session ID.
83
+ This ensures consistent fingerprinting for requests using the same session.
84
+ """
85
+ if not request.sid:
86
+ request.sid = self.session_manager.default_session_id
87
+
88
+ async def _process_request(self, request: Request) -> None:
89
+ """Download and process a single request."""
90
+ async with self._rate_limiter(request.domain):
91
+ if self.spider.download_delay:
92
+ await anyio.sleep(self.spider.download_delay)
93
+
94
+ if request._session_kwargs.get("proxy"):
95
+ self.stats.proxies.append(request._session_kwargs["proxy"])
96
+ if request._session_kwargs.get("proxies"):
97
+ self.stats.proxies.append(dict(request._session_kwargs["proxies"]))
98
+ try:
99
+ response = await self.session_manager.fetch(request)
100
+ self.stats.increment_requests_count(request.sid or self.session_manager.default_session_id)
101
+ self.stats.increment_response_bytes(request.domain, len(response.body))
102
+ self.stats.increment_status(response.status)
103
+
104
+ except Exception as e:
105
+ self.stats.failed_requests_count += 1
106
+ await self.spider.on_error(request, e)
107
+ return
108
+
109
+ if await self.spider.is_blocked(response):
110
+ self.stats.blocked_requests_count += 1
111
+ if request._retry_count < self.spider.max_blocked_retries:
112
+ retry_request = request.copy()
113
+ retry_request._retry_count += 1
114
+ retry_request.priority -= 1 # Don't retry immediately
115
+ retry_request.dont_filter = True
116
+ retry_request._session_kwargs.pop("proxy", None)
117
+ retry_request._session_kwargs.pop("proxies", None)
118
+
119
+ new_request = await self.spider.retry_blocked_request(retry_request, response)
120
+ self._normalize_request(new_request)
121
+ await self.scheduler.enqueue(new_request)
122
+ log.info(
123
+ f"Scheduled blocked request for retry ({retry_request._retry_count}/{self.spider.max_blocked_retries}): {request.url}"
124
+ )
125
+ else:
126
+ log.warning(f"Max retries exceeded for blocked request: {request.url}")
127
+ return
128
+
129
+ callback = request.callback if request.callback else self.spider.parse
130
+ try:
131
+ async for result in callback(response):
132
+ if isinstance(result, Request):
133
+ if self._is_domain_allowed(result):
134
+ self._normalize_request(result)
135
+ await self.scheduler.enqueue(result)
136
+ else:
137
+ self.stats.offsite_requests_count += 1
138
+ log.debug(f"Filtered offsite request to: {result.url}")
139
+ elif isinstance(result, dict):
140
+ processed_result = await self.spider.on_scraped_item(result)
141
+ if processed_result:
142
+ self.stats.items_scraped += 1
143
+ log.debug(f"Scraped from {str(response)}\n{pprint.pformat(processed_result)}")
144
+ if self._item_stream:
145
+ await self._item_stream.send(processed_result)
146
+ else:
147
+ self._items.append(processed_result)
148
+ else:
149
+ self.stats.items_dropped += 1
150
+ log.warning(f"Dropped from {str(response)}\n{processed_result}")
151
+ elif result is not None:
152
+ log.error(f"Spider must return Request, dict or None, got '{type(result)}' in {request}")
153
+ except Exception as e:
154
+ msg = f"Spider error processing {request}:\n {e}"
155
+ log.error(msg, exc_info=e)
156
+ await self.spider.on_error(request, e)
157
+
158
+ async def _task_wrapper(self, request: Request) -> None:
159
+ """Wrapper to track active task count."""
160
+ try:
161
+ await self._process_request(request)
162
+ finally:
163
+ self._active_tasks -= 1
164
+
165
+ def request_pause(self) -> None:
166
+ """Request a graceful pause of the crawl.
167
+
168
+ First call: requests graceful pause (waits for active tasks).
169
+ Second call: forces immediate stop.
170
+ """
171
+ if self._force_stop:
172
+ return # Already forcing stop
173
+
174
+ if self._pause_requested:
175
+ # Second Ctrl+C - force stop
176
+ self._force_stop = True
177
+ log.warning("Force stop requested, cancelling immediately...")
178
+ else:
179
+ self._pause_requested = True
180
+ log.info(
181
+ "Pause requested, waiting for in-flight requests to complete (press Ctrl+C again to force stop)..."
182
+ )
183
+
184
+ async def _save_checkpoint(self) -> None:
185
+ """Save current state to checkpoint files."""
186
+ requests, seen = self.scheduler.snapshot()
187
+ data = CheckpointData(requests=requests, seen=seen)
188
+ await self._checkpoint_manager.save(data)
189
+ self._last_checkpoint_time = anyio.current_time()
190
+
191
+ def _is_checkpoint_time(self) -> bool:
192
+ """Check if it's time for the periodic checkpoint."""
193
+ if not self._checkpoint_system_enabled:
194
+ return False
195
+
196
+ if self._checkpoint_manager.interval == 0:
197
+ return False
198
+
199
+ current_time = anyio.current_time()
200
+ return (current_time - self._last_checkpoint_time) >= self._checkpoint_manager.interval
201
+
202
+ async def _restore_from_checkpoint(self) -> bool:
203
+ """Attempt to restore state from checkpoint.
204
+
205
+ Returns True if successfully restored, False otherwise.
206
+ """
207
+ if not self._checkpoint_system_enabled:
208
+ raise
209
+
210
+ data = await self._checkpoint_manager.load()
211
+ if data is None:
212
+ return False
213
+
214
+ self.scheduler.restore(data)
215
+
216
+ # Restore callbacks from spider after scheduler restore
217
+ for request in data.requests:
218
+ request._restore_callback(self.spider)
219
+
220
+ return True
221
+
222
+ async def crawl(self) -> CrawlStats:
223
+ """Run the spider and return CrawlStats."""
224
+ self._running = True
225
+ self._items.clear()
226
+ self.paused = False
227
+ self._pause_requested = False
228
+ self._force_stop = False
229
+ self.stats = CrawlStats(start_time=anyio.current_time())
230
+
231
+ # Check for existing checkpoint
232
+ resuming = (await self._restore_from_checkpoint()) if self._checkpoint_system_enabled else False
233
+ self._last_checkpoint_time = anyio.current_time()
234
+
235
+ async with self.session_manager:
236
+ self.stats.concurrent_requests = self.spider.concurrent_requests
237
+ self.stats.concurrent_requests_per_domain = self.spider.concurrent_requests_per_domain
238
+ self.stats.download_delay = self.spider.download_delay
239
+ await self.spider.on_start(resuming=resuming)
240
+
241
+ try:
242
+ if not resuming:
243
+ async for request in self.spider.start_requests():
244
+ self._normalize_request(request)
245
+ await self.scheduler.enqueue(request)
246
+ else:
247
+ log.info("Resuming from checkpoint, skipping start_requests()")
248
+
249
+ # Process queue
250
+ async with create_task_group() as tg:
251
+ while self._running:
252
+ if self._pause_requested:
253
+ if self._active_tasks == 0 or self._force_stop:
254
+ if self._force_stop:
255
+ log.warning(f"Force stopping with {self._active_tasks} active tasks")
256
+ tg.cancel_scope.cancel()
257
+
258
+ # Only save checkpoint if checkpoint system is enabled
259
+ if self._checkpoint_system_enabled:
260
+ await self._save_checkpoint()
261
+ self.paused = True
262
+ log.info("Spider paused, checkpoint saved")
263
+ else:
264
+ log.info("Spider stopped gracefully")
265
+
266
+ self._running = False
267
+ break
268
+
269
+ # Wait briefly and check again
270
+ await anyio.sleep(0.05)
271
+ continue
272
+
273
+ if self._checkpoint_system_enabled and self._is_checkpoint_time():
274
+ await self._save_checkpoint()
275
+
276
+ if self.scheduler.is_empty:
277
+ # Empty queue + no active tasks = done
278
+ if self._active_tasks == 0:
279
+ self._running = False
280
+ log.debug("Spider idle")
281
+ break
282
+
283
+ # Brief wait for callbacks to enqueue new requests
284
+ await anyio.sleep(0.05)
285
+ continue
286
+
287
+ # Only spawn tasks up to concurrent_requests limit
288
+ # This prevents spawning thousands of waiting tasks
289
+ if self._active_tasks >= self.spider.concurrent_requests:
290
+ await anyio.sleep(0.01)
291
+ continue
292
+
293
+ request = await self.scheduler.dequeue()
294
+ self._active_tasks += 1
295
+ tg.start_soon(self._task_wrapper, request)
296
+
297
+ finally:
298
+ await self.spider.on_close()
299
+ # Clean up checkpoint files on successful completion (not paused)
300
+ if not self.paused and self._checkpoint_system_enabled:
301
+ await self._checkpoint_manager.cleanup()
302
+
303
+ self.stats.log_levels_counter = self.spider._log_counter.get_counts()
304
+ self.stats.end_time = anyio.current_time()
305
+ log.info(_dump(self.stats.to_dict()))
306
+ return self.stats
307
+
308
+ @property
309
+ def items(self) -> ItemList:
310
+ """Access scraped items."""
311
+ return self._items
312
+
313
+ def __aiter__(self) -> AsyncGenerator[dict, None]:
314
+ return self._stream()
315
+
316
+ async def _stream(self) -> AsyncGenerator[dict, None]:
317
+ """Async generator that runs crawl and yields items."""
318
+ send, recv = create_memory_object_stream[dict](100)
319
+ self._item_stream = send
320
+
321
+ async def run():
322
+ try:
323
+ await self.crawl()
324
+ finally:
325
+ await send.aclose()
326
+
327
+ async with create_task_group() as tg:
328
+ tg.start_soon(run)
329
+ try:
330
+ async for item in recv:
331
+ yield item
332
+ except EndOfStream:
333
+ pass
spiders/request.py ADDED
@@ -0,0 +1,163 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import hashlib
2
+ from io import BytesIO
3
+ from functools import cached_property
4
+ from urllib.parse import urlparse, urlencode
5
+
6
+ import orjson
7
+ from w3lib.url import canonicalize_url
8
+
9
+ from scrapling.engines.toolbelt.custom import Response
10
+ from scrapling.core._types import Any, AsyncGenerator, Callable, Dict, Optional, Union, Tuple, TYPE_CHECKING
11
+
12
+ if TYPE_CHECKING:
13
+ from scrapling.spiders.spider import Spider
14
+
15
+
16
+ def _convert_to_bytes(value: str | bytes) -> bytes:
17
+ if isinstance(value, bytes):
18
+ return value
19
+ if not isinstance(value, str):
20
+ raise TypeError(f"Can't convert {type(value).__name__} to bytes")
21
+
22
+ return value.encode(encoding="utf-8", errors="ignore")
23
+
24
+
25
class Request:
    """A single crawl request: the URL plus scheduling data and the keyword arguments passed to the session."""

    def __init__(
        self,
        url: str,
        sid: str = "",
        callback: Callable[[Response], AsyncGenerator[Union[Dict[str, Any], "Request", None], None]] | None = None,
        priority: int = 0,
        dont_filter: bool = False,
        meta: dict[str, Any] | None = None,
        _retry_count: int = 0,
        **kwargs: Any,
    ) -> None:
        """Create a request.

        :param url: Target URL.
        :param sid: Session ID to fetch with; empty means "not chosen yet".
        :param callback: Async generator callback that receives the response.
        :param priority: Scheduling priority; compared by the ordering operators.
        :param dont_filter: If True, the scheduler does not drop this request as a duplicate.
        :param meta: Arbitrary data carried with the request; falsy values become a new empty dict.
        :param _retry_count: Number of retries already performed for this request.
        :param kwargs: Extra keyword arguments stored for the session call (e.g. ``method``, ``data``, ``headers``).
        """
        self.url: str = url
        self.sid: str = sid
        self.callback = callback
        self.priority: int = priority
        self.dont_filter: bool = dont_filter
        self.meta: dict[str, Any] = meta if meta else {}
        self._retry_count: int = _retry_count
        self._session_kwargs = kwargs if kwargs else {}
        self._fp: Optional[bytes] = None

    def copy(self) -> "Request":
        """Create a copy of this request.

        ``meta`` and the session kwargs are shallow-copied; the cached fingerprint is not carried over.
        """
        return Request(
            url=self.url,
            sid=self.sid,
            callback=self.callback,
            priority=self.priority,
            dont_filter=self.dont_filter,
            meta=self.meta.copy(),
            _retry_count=self._retry_count,
            **self._session_kwargs,
        )

    @cached_property
    def domain(self) -> str:
        """Network location (``netloc``) part of the URL, computed once per instance."""
        return urlparse(self.url).netloc

    def update_fingerprint(
        self,
        include_kwargs: bool = False,
        include_headers: bool = False,
        keep_fragments: bool = False,
    ) -> bytes:
        """Generate a unique fingerprint for deduplication.

        Caches the result in self._fp after first computation.

        :param include_kwargs: Also hash the names of the session keyword arguments (except ``data``/``json``).
        :param include_headers: Also hash the request headers (lower-cased).
        :param keep_fragments: Keep the URL fragment when canonicalizing the URL.
        :return: SHA-1 digest of the normalized request data.
        """
        if self._fp is not None:
            return self._fp

        post_data = self._session_kwargs.get("data", {})
        body = b""
        if post_data:
            if isinstance(post_data, dict | list | tuple):
                body = urlencode(post_data).encode()
            elif isinstance(post_data, str):
                body = post_data.encode()
            elif isinstance(post_data, BytesIO):
                body = post_data.getvalue()
            elif isinstance(post_data, bytes):
                body = post_data
        else:
            post_data = self._session_kwargs.get("json", {})
            body = orjson.dumps(post_data) if post_data else b""

        data: Dict[str, str | Tuple] = {
            "sid": self.sid,
            "body": body.hex(),
            "method": self._session_kwargs.get("method", "GET"),
            "url": canonicalize_url(self.url, keep_fragments=keep_fragments),
        }

        if include_kwargs:
            kwargs = (key.lower() for key in self._session_kwargs.keys() if key.lower() not in ("data", "json"))
            # Sort the de-duplicated names: iteration order of a set of strings changes between
            # interpreter runs (hash randomization), which would make the fingerprint unstable
            # and break deduplication after resuming from a checkpoint.
            data["kwargs"] = "".join(sorted({_convert_to_bytes(key).hex() for key in kwargs}))

        if include_headers:
            headers = self._session_kwargs.get("headers") or self._session_kwargs.get("extra_headers") or {}
            processed_headers = {}
            # Some header normalization
            for key, value in headers.items():
                processed_headers[_convert_to_bytes(key.lower()).hex()] = _convert_to_bytes(value.lower()).hex()
            # Sorted so the same headers given in a different order produce the same fingerprint
            data["headers"] = tuple(sorted(processed_headers.items()))

        fp = hashlib.sha1(orjson.dumps(data, option=orjson.OPT_SORT_KEYS), usedforsecurity=False).digest()
        self._fp = fp
        return fp

    def __repr__(self) -> str:
        callback_name = getattr(self.callback, "__name__", None) or "None"
        return f"<Request({self.url}) priority={self.priority} callback={callback_name}>"

    def __str__(self) -> str:
        return self.url

    def __lt__(self, other: object) -> bool:
        """Compare requests by priority"""
        if not isinstance(other, Request):
            return NotImplemented
        return self.priority < other.priority

    def __gt__(self, other: object) -> bool:
        """Compare requests by priority"""
        if not isinstance(other, Request):
            return NotImplemented
        return self.priority > other.priority

    def __eq__(self, other: object) -> bool:
        """Requests are equal if they have the same fingerprint.

        :raises RuntimeError: If either request has no fingerprint yet.
        """
        if not isinstance(other, Request):
            return NotImplemented
        if self._fp is None or other._fp is None:
            raise RuntimeError("Cannot compare requests before generating their fingerprints!")
        return self._fp == other._fp

    def __getstate__(self) -> dict[str, Any]:
        """Prepare state for pickling - store callback as name string for pickle compatibility."""
        state = self.__dict__.copy()
        state["_callback_name"] = getattr(self.callback, "__name__", None) if self.callback is not None else None
        state["callback"] = None  # Don't pickle the actual callable
        return state

    def __setstate__(self, state: dict[str, Any]) -> None:
        """Restore state from pickle - callback restored later via _restore_callback()."""
        self._callback_name: str | None = state.pop("_callback_name", None)
        self.__dict__.update(state)

    def _restore_callback(self, spider: "Spider") -> None:
        """Restore callback from spider after unpickling.

        Falls back to ``spider.parse`` when the stored name is not found on the spider.
        The temporary ``_callback_name`` attribute is removed afterwards.

        :param spider: Spider instance to look up callback method on
        """
        if hasattr(self, "_callback_name") and self._callback_name:
            self.callback = getattr(spider, self._callback_name, None) or spider.parse
            del self._callback_name
        elif hasattr(self, "_callback_name"):
            del self._callback_name
spiders/result.py ADDED
@@ -0,0 +1,125 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from pathlib import Path
2
+ from dataclasses import dataclass, field
3
+
4
+ import orjson
5
+
6
+ from scrapling.core.utils import log
7
+ from scrapling.core._types import Any, Iterator, Dict, List, Tuple, Union
8
+
9
+
10
class ItemList(list):
    """List subclass holding scraped items, with helpers to write them to disk."""

    def to_json(self, path: Union[str, Path], *, indent: bool = False):
        """Write all items to ``path`` as a single JSON array.

        Missing parent directories are created first.

        :param path: Path to the output file
        :param indent: Pretty-print with 2-space indentation (slightly slower)
        """
        flags = orjson.OPT_SERIALIZE_NUMPY
        if indent:
            flags = flags | orjson.OPT_INDENT_2

        target = Path(path)
        target.parent.mkdir(parents=True, exist_ok=True)
        payload = orjson.dumps(list(self), option=flags)
        target.write_bytes(payload)
        log.info("Saved %d items to %s", len(self), path)

    def to_jsonl(self, path: Union[str, Path]):
        """Write the items to ``path`` in JSON Lines format, one serialized item per line.

        Missing parent directories are created first.

        :param path: Path to the output file
        """
        Path(path).parent.mkdir(parents=True, exist_ok=True)
        with open(path, "wb") as out:
            for entry in self:
                line = orjson.dumps(entry, option=orjson.OPT_SERIALIZE_NUMPY)
                out.write(line)
                out.write(b"\n")
        log.info("Saved %d items to %s", len(self), path)
39
+
40
+
41
+ @dataclass
42
+ class CrawlStats:
43
+ """Statistics for a crawl run."""
44
+
45
+ requests_count: int = 0
46
+ concurrent_requests: int = 0
47
+ concurrent_requests_per_domain: int = 0
48
+ failed_requests_count: int = 0
49
+ offsite_requests_count: int = 0
50
+ response_bytes: int = 0
51
+ items_scraped: int = 0
52
+ items_dropped: int = 0
53
+ start_time: float = 0.0
54
+ end_time: float = 0.0
55
+ download_delay: float = 0.0
56
+ blocked_requests_count: int = 0
57
+ custom_stats: Dict = field(default_factory=dict)
58
+ response_status_count: Dict = field(default_factory=dict)
59
+ domains_response_bytes: Dict = field(default_factory=dict)
60
+ sessions_requests_count: Dict = field(default_factory=dict)
61
+ proxies: List[str | Dict | Tuple] = field(default_factory=list)
62
+ log_levels_counter: Dict = field(default_factory=dict)
63
+
64
+ @property
65
+ def elapsed_seconds(self) -> float:
66
+ return self.end_time - self.start_time
67
+
68
+ @property
69
+ def requests_per_second(self) -> float:
70
+ if self.elapsed_seconds == 0:
71
+ return 0.0
72
+ return self.requests_count / self.elapsed_seconds
73
+
74
+ def increment_status(self, status: int) -> None:
75
+ self.response_status_count[f"status_{status}"] = self.response_status_count.get(f"status_{status}", 0) + 1
76
+
77
+ def increment_response_bytes(self, domain: str, count: int) -> None:
78
+ self.response_bytes += count
79
+ self.domains_response_bytes[domain] = self.domains_response_bytes.get(domain, 0) + count
80
+
81
+ def increment_requests_count(self, sid: str) -> None:
82
+ self.requests_count += 1
83
+ self.sessions_requests_count[sid] = self.sessions_requests_count.get(sid, 0) + 1
84
+
85
+ def to_dict(self) -> dict[str, Any]:
86
+ return {
87
+ "items_scraped": self.items_scraped,
88
+ "items_dropped": self.items_dropped,
89
+ "elapsed_seconds": round(self.elapsed_seconds, 2),
90
+ "download_delay": round(self.download_delay, 2),
91
+ "concurrent_requests": self.concurrent_requests,
92
+ "concurrent_requests_per_domain": self.concurrent_requests_per_domain,
93
+ "requests_count": self.requests_count,
94
+ "requests_per_second": round(self.requests_per_second, 2),
95
+ "sessions_requests_count": self.sessions_requests_count,
96
+ "failed_requests_count": self.failed_requests_count,
97
+ "offsite_requests_count": self.offsite_requests_count,
98
+ "blocked_requests_count": self.blocked_requests_count,
99
+ "response_status_count": self.response_status_count,
100
+ "response_bytes": self.response_bytes,
101
+ "domains_response_bytes": self.domains_response_bytes,
102
+ "proxies": self.proxies,
103
+ "custom_stats": self.custom_stats,
104
+ "log_count": self.log_levels_counter,
105
+ }
106
+
107
+
108
@dataclass
class CrawlResult:
    """Bundle returned for a spider run: its stats, its items, and whether it was paused."""

    stats: CrawlStats
    items: ItemList
    paused: bool = False

    @property
    def completed(self) -> bool:
        """Opposite of ``paused``: True when the crawl was not paused."""
        was_paused = self.paused
        return not was_paused

    def __len__(self) -> int:
        """Number of items in the result."""
        collected = self.items
        return len(collected)

    def __iter__(self) -> Iterator[dict[str, Any]]:
        """Iterate over the items."""
        collected = self.items
        return iter(collected)
spiders/scheduler.py ADDED
@@ -0,0 +1,80 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import asyncio
2
+ from itertools import count
3
+
4
+ from scrapling.core.utils import log
5
+ from scrapling.spiders.request import Request
6
+ from scrapling.core._types import List, Set, Tuple, TYPE_CHECKING
7
+
8
+ if TYPE_CHECKING:
9
+ from scrapling.spiders.checkpoint import CheckpointData
10
+
11
+
12
class Scheduler:
    """
    Request queue ordered by priority, with fingerprint-based deduplication.

    Requests with a higher priority are dequeued first; requests with the same
    priority come out in the order they were enqueued.
    A request whose fingerprint was already seen is dropped unless dont_filter=True.
    """

    def __init__(self, include_kwargs: bool = False, include_headers: bool = False, keep_fragments: bool = False):
        """Create an empty scheduler.

        The three flags are forwarded to ``Request.update_fingerprint`` for every enqueued request.
        """
        self._include_kwargs = include_kwargs
        self._include_headers = include_headers
        self._keep_fragments = keep_fragments
        self._counter = count()
        self._seen: set[bytes] = set()
        # Copy of the queued entries keyed by counter, so a snapshot never has to drain the queue
        self._pending: dict[int, tuple[int, int, Request]] = {}
        self._queue: asyncio.PriorityQueue[tuple[int, int, Request]] = asyncio.PriorityQueue()

    async def enqueue(self, request: Request) -> bool:
        """Queue ``request`` unless it is a filtered duplicate.

        :return: True if the request was queued, False if it was dropped as a duplicate.
        """
        fp = request.update_fingerprint(self._include_kwargs, self._include_headers, self._keep_fragments)

        is_duplicate = fp in self._seen
        if is_duplicate and not request.dont_filter:
            log.debug("Dropped duplicate request: %s", request)
            return False

        self._seen.add(fp)

        # The priority is negated because the queue hands out the smallest entry first
        sequence = next(self._counter)
        entry = (-request.priority, sequence, request)
        self._pending[sequence] = entry
        await self._queue.put(entry)
        return True

    async def dequeue(self) -> Request:
        """Wait for and return the next request, removing it from the pending mirror."""
        entry = await self._queue.get()
        sequence, request = entry[1], entry[2]
        self._pending.pop(sequence, None)
        return request

    def __len__(self) -> int:
        """Number of requests currently in the queue."""
        return self._queue.qsize()

    @property
    def is_empty(self) -> bool:
        """True when the queue holds no requests."""
        return self._queue.empty()

    def snapshot(self) -> Tuple[List[Request], Set[bytes]]:
        """Return the pending requests (in dequeue order) and a copy of the seen fingerprints."""
        ordered = sorted(self._pending.values(), key=lambda entry: entry[:2])  # Same order as the queue
        pending_requests = [entry[2] for entry in ordered]
        return pending_requests, self._seen.copy()

    def restore(self, data: "CheckpointData") -> None:
        """Load the seen set and the pending requests from checkpoint data.

        :param data: CheckpointData containing requests and seen set
        """
        self._seen = data.seen.copy()

        # Requests are re-queued in the stored order, each with a fresh counter value
        for request in data.requests:
            sequence = next(self._counter)
            entry = (-request.priority, sequence, request)
            self._pending[sequence] = entry
            self._queue.put_nowait(entry)

        log.info(f"Scheduler restored: {len(data.requests)} requests, {len(data.seen)} seen")
spiders/session.py ADDED
@@ -0,0 +1,145 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from asyncio import Lock
2
+
3
+ from scrapling.spiders.request import Request
4
+ from scrapling.engines.static import _ASyncSessionLogic
5
+ from scrapling.engines.toolbelt.convertor import Response
6
+ from scrapling.core._types import Set, cast, SUPPORTED_HTTP_METHODS
7
+ from scrapling.fetchers import AsyncDynamicSession, AsyncStealthySession, FetcherSession
8
+
9
+ Session = FetcherSession | AsyncDynamicSession | AsyncStealthySession
10
+
11
+
12
class SessionManager:
    """Manages pre-configured session instances.

    Sessions are registered under an ID, one of them is the default, and
    requests are fetched through the session their ``sid`` points to.
    """

    def __init__(self) -> None:
        self._sessions: dict[str, Session] = {}
        self._default_session_id: str | None = None
        self._started: bool = False
        # IDs of sessions that are only started when a request first uses them
        self._lazy_sessions: Set[str] = set()
        # Serializes the first start of a lazy session across concurrent fetches
        self._lazy_lock = Lock()

    def add(self, session_id: str, session: Session, *, default: bool = False, lazy: bool = False) -> "SessionManager":
        """Register a session instance.

        The first registered session becomes the default until another one is added with ``default=True``.

        :param session_id: Name to reference this session in requests
        :param session: Your pre-configured session instance
        :param default: If True, this becomes the default session
        :param lazy: If True, the session will be started only when a request uses its ID.
        :return: The manager itself, so calls can be chained.
        :raises ValueError: If ``session_id`` is already registered.
        """
        if session_id in self._sessions:
            raise ValueError(f"Session '{session_id}' already registered")

        self._sessions[session_id] = session

        if default or self._default_session_id is None:
            self._default_session_id = session_id

        if lazy:
            self._lazy_sessions.add(session_id)

        return self

    def remove(self, session_id: str) -> None:
        """Removes a session.

        :param session_id: ID of session to remove
        :raises KeyError: If ``session_id`` is not registered.
        """
        _ = self.pop(session_id)

    def pop(self, session_id: str) -> Session:
        """Remove and returns a session.

        If the removed session was the default, the first remaining session (if any) becomes the default.

        :param session_id: ID of session to remove
        :raises KeyError: If ``session_id`` is not registered.
        """
        if session_id not in self._sessions:
            raise KeyError(f"Session '{session_id}' not found")

        session = self._sessions.pop(session_id)
        if session_id in self._lazy_sessions:
            self._lazy_sessions.remove(session_id)

        if session and self._default_session_id == session_id:
            self._default_session_id = next(iter(self._sessions), None)

        return session

    @property
    def default_session_id(self) -> str:
        """ID of the default session.

        :raises RuntimeError: If there is no default session.
        """
        if self._default_session_id is None:
            raise RuntimeError("No sessions registered")
        return self._default_session_id

    @property
    def session_ids(self) -> list[str]:
        """IDs of all registered sessions, in registration order."""
        return list(self._sessions.keys())

    def get(self, session_id: str) -> Session:
        """Return the session registered under ``session_id``.

        :raises KeyError: If ``session_id`` is not registered.
        """
        if session_id not in self._sessions:
            available = ", ".join(self._sessions.keys())
            raise KeyError(f"Session '{session_id}' not found. Available: {available}")
        return self._sessions[session_id]

    async def start(self) -> None:
        """Start all sessions that aren't already alive.

        Lazy sessions are skipped here; calling this again after a successful start does nothing.
        """
        if self._started:
            return

        for sid, session in self._sessions.items():
            if sid not in self._lazy_sessions and not session._is_alive:
                await session.__aenter__()

        self._started = True

    async def close(self) -> None:
        """Close all registered sessions."""
        for session in self._sessions.values():
            _ = await session.__aexit__(None, None, None)

        self._started = False

    async def fetch(self, request: Request) -> Response:
        """Fetch ``request`` with the session its ``sid`` refers to (the default session when ``sid`` is empty).

        The request is attached to the response, and the request meta is merged into the response meta.

        :param request: Request to download.
        :return: The response produced by the session.
        :raises KeyError: If the session ID is not registered.
        :raises TypeError: If a ``FetcherSession`` does not hold an async client.
        """
        sid = request.sid if request.sid else self.default_session_id
        session = self.get(sid)

        if session:
            if sid in self._lazy_sessions and not session._is_alive:
                async with self._lazy_lock:
                    # Checked again under the lock: another task may have started the session meanwhile
                    if not session._is_alive:
                        await session.__aenter__()

            if isinstance(session, FetcherSession):
                client = session._client

                if isinstance(client, _ASyncSessionLogic):
                    # Work on a copy: popping "method" from the request itself would strip it
                    # permanently, so a retried copy of a POST request would silently become a GET.
                    session_kwargs = dict(request._session_kwargs)
                    method = cast(SUPPORTED_HTTP_METHODS, session_kwargs.pop("method", "GET"))
                    response = await client._make_request(
                        method=method,
                        url=request.url,
                        **session_kwargs,
                    )
                else:
                    # Sync session or other types - shouldn't happen in async context
                    raise TypeError(f"Session type {type(client)} not supported for async fetch")
            else:
                response = await session.fetch(url=request.url, **request._session_kwargs)

            response.request = request
            # Merge request meta into response meta (response meta takes priority)
            response.meta = {**request.meta, **response.meta}
            return response
        raise RuntimeError("No session found with the request session id")

    async def __aenter__(self) -> "SessionManager":
        await self.start()
        return self

    async def __aexit__(self, *exc) -> None:
        await self.close()

    def __contains__(self, session_id: str) -> bool:
        """Check if a session ID is registered."""
        return session_id in self._sessions

    def __len__(self) -> int:
        """Number of registered sessions."""
        return len(self._sessions)
spiders/spider.py ADDED
@@ -0,0 +1,316 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import signal
2
+ import logging
3
+ from pathlib import Path
4
+ from abc import ABC, abstractmethod
5
+
6
+ import anyio
7
+ from anyio import Path as AsyncPath
8
+
9
+ from scrapling.spiders.request import Request
10
+ from scrapling.spiders.engine import CrawlerEngine
11
+ from scrapling.spiders.session import SessionManager
12
+ from scrapling.core.utils import set_logger, reset_logger
13
+ from scrapling.spiders.result import CrawlResult, CrawlStats
14
+ from scrapling.core._types import Set, Any, Dict, Optional, Union, TYPE_CHECKING, AsyncGenerator
15
+
16
+ BLOCKED_CODES = {401, 403, 407, 429, 444, 500, 502, 503, 504}
17
+ if TYPE_CHECKING:
18
+ from scrapling.engines.toolbelt.custom import Response
19
+
20
+
21
class LogCounterHandler(logging.Handler):
    """Logging handler that tallies handled records per standard severity level."""

    def __init__(self):
        super().__init__()
        # One bucket per standard level, every tally starting at zero.
        self.counts = dict.fromkeys(
            (logging.DEBUG, logging.INFO, logging.WARNING, logging.ERROR, logging.CRITICAL),
            0,
        )

    def emit(self, record: logging.LogRecord) -> None:
        """Increment the bucket of the highest standard level not above the record's level."""
        severity = record.levelno
        # Scan from most to least severe; anything below INFO lands in the DEBUG bucket.
        descending = (logging.CRITICAL, logging.ERROR, logging.WARNING, logging.INFO)
        bucket = next((threshold for threshold in descending if severity >= threshold), logging.DEBUG)
        self.counts[bucket] += 1

    def get_counts(self) -> Dict[str, int]:
        """Return the tallies keyed by lowercase level name."""
        labelled = (
            ("debug", logging.DEBUG),
            ("info", logging.INFO),
            ("warning", logging.WARNING),
            ("error", logging.ERROR),
            ("critical", logging.CRITICAL),
        )
        return {label: self.counts[level] for label, level in labelled}
57
+
58
+
59
class SessionConfigurationError(Exception):
    """Signals that configuring the sessions failed."""
63
+
64
+
65
class Spider(ABC):
    """An abstract base class for creating web spiders.

    Subclasses must define ``name`` and implement :meth:`parse`. Sessions are registered
    through :meth:`configure_sessions`, and a crawl is run with :meth:`start` (blocking)
    or :meth:`stream` (async generator).

    Check the documentation website for more information.
    """

    name: Optional[str] = None
    # NOTE: the two defaults below are mutable objects shared through the class. Subclasses
    # should assign their own values rather than mutate these in place.
    start_urls: list[str] = []
    allowed_domains: Set[str] = set()

    # Concurrency settings
    concurrent_requests: int = 4
    concurrent_requests_per_domain: int = 0
    download_delay: float = 0.0
    max_blocked_retries: int = 3

    # Fingerprint adjustments
    fp_include_kwargs: bool = False
    fp_keep_fragments: bool = False
    fp_include_headers: bool = False

    # Logging settings
    logging_level: int = logging.DEBUG
    logging_format: str = "[%(asctime)s]:({spider_name}) %(levelname)s: %(message)s"
    logging_date_format: str = "%Y-%m-%d %H:%M:%S"
    log_file: Optional[str] = None

    def __init__(self, crawldir: Optional[Union[str, Path, AsyncPath]] = None, interval: float = 300.0):
        """Initialize the spider.

        :param crawldir: Directory for checkpoint files. If provided, enables pause/resume.
        :param interval: Seconds between periodic checkpoint saves (default 5 minutes).
        :raises ValueError: If the spider class does not define ``name``.
        :raises SessionConfigurationError: If ``configure_sessions()`` raises or adds no sessions.
        """
        if self.name is None:
            raise ValueError(f"{self.__class__.__name__} must have a name.")

        self.logger = logging.getLogger(f"scrapling.spiders.{self.name}")
        self.logger.setLevel(self.logging_level)
        # `getLogger` hands back the same logger for the same spider name, so release any
        # file handlers still attached to it before dropping them.
        self._close_file_handlers()
        self.logger.handlers.clear()
        self.logger.propagate = False  # Don't propagate to parent 'scrapling' logger

        formatter = logging.Formatter(
            fmt=self.logging_format.format(spider_name=self.name), datefmt=self.logging_date_format
        )

        # Add a log counter handler to track log counts by level
        self._log_counter = LogCounterHandler()
        self.logger.addHandler(self._log_counter)

        console_handler = logging.StreamHandler()
        console_handler.setFormatter(formatter)
        self.logger.addHandler(console_handler)

        if self.log_file:
            Path(self.log_file).parent.mkdir(parents=True, exist_ok=True)
            file_handler = logging.FileHandler(self.log_file)
            file_handler.setFormatter(formatter)
            self.logger.addHandler(file_handler)

        self.crawldir: Optional[Path] = Path(crawldir) if crawldir else None
        self._interval = interval
        self._engine: Optional[CrawlerEngine] = None
        self._original_sigint_handler: Any = None

        self._session_manager = SessionManager()

        try:
            self.configure_sessions(self._session_manager)
        except Exception as e:
            raise SessionConfigurationError(f"Error in {self.__class__.__name__}.configure_sessions(): {e}") from e

        if len(self._session_manager) == 0:
            raise SessionConfigurationError(f"{self.__class__.__name__}.configure_sessions() did not add any sessions")

        self.logger.info("Spider initialized")

    async def start_requests(self) -> AsyncGenerator[Request, None]:
        """Generate initial requests to start the crawl.

        By default, this generates Request objects for each URL in `start_urls`
        using the session manager's default session.

        Override this method for more control over initial requests
        (e.g., to add custom headers, use different callbacks, etc.)

        :raises RuntimeError: If `start_urls` is empty.
        """
        if not self.start_urls:
            raise RuntimeError(
                "Spider has no starting point, either set `start_urls` or override `start_requests` function."
            )

        for url in self.start_urls:
            yield Request(url, sid=self._session_manager.default_session_id)

    @abstractmethod
    async def parse(self, response: "Response") -> AsyncGenerator[Dict[str, Any] | Request | None, None]:
        """Default callback for processing responses"""
        raise NotImplementedError(f"{self.__class__.__name__} must implement parse() method")
        yield  # Make this a generator for type checkers

    async def on_start(self, resuming: bool = False) -> None:
        """Called before crawling starts. Override for setup logic.

        :param resuming: It's enabled if the spider is resuming from a checkpoint, left for the user to use.
        """
        if resuming:
            self.logger.debug("Resuming spider from checkpoint")
        else:
            self.logger.debug("Starting spider")

    async def on_close(self) -> None:
        """Called after crawling finishes. Override for cleanup logic."""
        self.logger.debug("Spider closed")

    async def on_error(self, request: Request, error: Exception) -> None:
        """
        Handle request errors for all spider requests.

        Override for custom error handling. The default implementation does nothing.
        """
        pass

    async def on_scraped_item(self, item: Dict[str, Any]) -> Dict[str, Any] | None:
        """A hook to be overridden by users to do some processing on scraped items, return `None` to drop the item silently."""
        return item

    async def is_blocked(self, response: "Response") -> bool:
        """Check if the response is blocked. Users should override this for custom detection logic.

        The default treats any status listed in `BLOCKED_CODES` as blocked.
        """
        return response.status in BLOCKED_CODES

    async def retry_blocked_request(self, request: Request, response: "Response") -> Request:
        """Users should override this to prepare the blocked request before retrying, if needed."""
        return request

    def __repr__(self) -> str:
        """String representation of the spider."""
        return f"<{self.__class__.__name__} '{self.name}'>"

    def configure_sessions(self, manager: SessionManager) -> None:
        """Configure sessions for this spider.

        Override this method to add custom sessions.
        The default implementation creates a FetcherSession session.

        The first session added becomes the default for `start_requests()` unless specified otherwise.

        :param manager: SessionManager to configure
        """
        # Imported here rather than at module level, as in the original implementation.
        from scrapling.fetchers import FetcherSession

        manager.add("default", FetcherSession())

    def pause(self):
        """Request graceful shutdown of the crawling process.

        :raises RuntimeError: If no crawl is currently running.
        """
        if self._engine:
            self._engine.request_pause()
        else:
            raise RuntimeError("No active crawl to stop")

    def _setup_signal_handler(self) -> None:
        """Set up SIGINT handler for graceful pause."""

        def handler(_signum: int, _frame: Any) -> None:
            if self._engine:
                self._engine.request_pause()
            else:
                # No engine yet, just raise KeyboardInterrupt
                raise KeyboardInterrupt

        try:
            self._original_sigint_handler = signal.signal(signal.SIGINT, handler)
        except ValueError:
            # `signal.signal` raises ValueError when called outside the main thread;
            # in that case no handler is installed and nothing needs restoring.
            self._original_sigint_handler = None

    def _restore_signal_handler(self) -> None:
        """Restore original SIGINT handler."""
        if self._original_sigint_handler is not None:
            try:
                signal.signal(signal.SIGINT, self._original_sigint_handler)
            except ValueError:
                pass

    def _close_file_handlers(self) -> None:
        """Close every `logging.FileHandler` attached to the spider logger to release file resources."""
        for handler in self.logger.handlers:
            if isinstance(handler, logging.FileHandler):
                handler.close()

    async def __run(self) -> CrawlResult:
        """Run one crawl with a fresh engine and package its stats, items and paused flag."""
        token = set_logger(self.logger)
        try:
            self._engine = CrawlerEngine(self, self._session_manager, self.crawldir, self._interval)
            stats = await self._engine.crawl()
            paused = self._engine.paused
            return CrawlResult(stats=stats, items=self._engine.items, paused=paused)
        finally:
            self._engine = None
            reset_logger(token)
            # Close any file handlers to release file resources.
            if self.log_file:
                self._close_file_handlers()

    def start(self, use_uvloop: bool = False, **backend_options: Any) -> CrawlResult:
        """Run the spider and return results.

        This is the main entry point for running a spider.
        Handles async execution internally via anyio.

        Pressing Ctrl+C will initiate graceful shutdown (waits for active tasks to complete).
        Pressing Ctrl+C a second time will force immediate stop.

        If crawldir is set, a checkpoint will also be saved on graceful shutdown,
        allowing you to resume the crawl later by running the spider again.

        :param use_uvloop: Whether to use the faster uvloop/winloop event loop implementation, if available.
        :param backend_options: Asyncio backend options to be used with `anyio.run`
        """
        if use_uvloop:
            backend_options["use_uvloop"] = True

        # Set up SIGINT handler for graceful shutdown
        self._setup_signal_handler()
        try:
            return anyio.run(self.__run, backend="asyncio", backend_options=backend_options)
        finally:
            self._restore_signal_handler()

    async def stream(self) -> AsyncGenerator[Dict[str, Any], None]:
        """Stream items as they're scraped. Ideal for long-running spiders or building applications on top of the spiders.

        Must be called from an async context. Yields items one by one as they are scraped.
        Access `spider.stats` during iteration for real-time statistics.

        Note: SIGINT handling for pause/resume is not available in stream mode.
        """
        token = set_logger(self.logger)
        try:
            self._engine = CrawlerEngine(self, self._session_manager, self.crawldir, self._interval)
            async for item in self._engine:
                yield item
        finally:
            self._engine = None
            reset_logger(token)
            # Close any file handlers to release file resources.
            if self.log_file:
                self._close_file_handlers()

    @property
    def stats(self) -> CrawlStats:
        """Access current crawl stats (works during streaming).

        :raises RuntimeError: If no crawl is currently running.
        """
        if self._engine:
            return self._engine.stats
        raise RuntimeError("No active crawl. Use this property inside `async for item in spider.stream():`")
ui.py ADDED
@@ -0,0 +1,57 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import gradio as gr
2
+ from scrapling.core.ai import ScraplingMCPServer
3
+ import asyncio
4
+ from typing import Any
5
+
6
def create_ui():
    """Build the Gradio interface for Scrapling.

    The interface has two tabs: a plain HTTP fetch backed by ``ScraplingMCPServer.get``
    and a browser-based fetch backed by ``ScraplingMCPServer.stealthy_fetch``. Both
    return the dumped result model, or ``{"error": ...}`` when the URL is missing or
    the fetch raises.

    :return: The ``gr.Blocks`` instance holding the interface.
    """
    with gr.Blocks(title="Scrapling") as demo:
        gr.Markdown("# Scrapling Web Interface")

        with gr.Tab("Fetch (HTTP)"):
            gr.Markdown("Standard HTTP Fetcher. Fast but less stealthy.")
            url_input = gr.Textbox(label="URL", placeholder="https://example.com")
            selector_input = gr.Textbox(label="CSS Selector (Optional)", placeholder=".content")
            output = gr.JSON(label="Result")
            fetch_btn = gr.Button("Fetch")

            async def fetch_wrapper(url, selector):
                """Fetch ``url`` over HTTP, optionally narrowed to a CSS selector."""
                if not url:
                    return {"error": "URL is required"}
                try:
                    # `ScraplingMCPServer.get` is called without `await`, i.e. it is a blocking
                    # call; run it in a worker thread so it does not stall the event loop
                    # while the request is in flight.
                    result = await asyncio.to_thread(
                        ScraplingMCPServer.get, url, css_selector=selector or None
                    )
                    return result.model_dump()
                except Exception as e:
                    # UI boundary: report the failure in the JSON output instead of raising.
                    return {"error": str(e)}

            fetch_btn.click(fetch_wrapper, inputs=[url_input, selector_input], outputs=output)

        with gr.Tab("Stealthy Fetch (Browser)"):
            gr.Markdown("Stealthy Browser Fetcher (Playwright). Slower but bypasses bot protection.")
            s_url_input = gr.Textbox(label="URL")
            s_selector_input = gr.Textbox(label="CSS Selector (Optional)")
            s_headless = gr.Checkbox(label="Headless", value=True)
            s_output = gr.JSON(label="Result")
            s_fetch_btn = gr.Button("Stealthy Fetch")

            async def stealthy_fetch_wrapper(url, selector, headless):
                """Fetch ``url`` with the stealthy browser fetcher, optionally narrowed to a CSS selector."""
                if not url:
                    return {"error": "URL is required"}
                try:
                    result = await ScraplingMCPServer.stealthy_fetch(
                        url,
                        css_selector=selector or None,
                        headless=headless,
                    )
                    return result.model_dump()
                except Exception as e:
                    # UI boundary: report the failure in the JSON output instead of raising.
                    return {"error": str(e)}

            s_fetch_btn.click(
                stealthy_fetch_wrapper,
                inputs=[s_url_input, s_selector_input, s_headless],
                outputs=s_output,
            )

    return demo