Spaces:
Paused
Paused
Upload folder using huggingface_hub
Browse files- __init__.py +38 -0
- cli.py +826 -0
- core/__init__.py +0 -0
- core/_shell_signatures.py +100 -0
- core/_types.py +57 -0
- core/ai.py +653 -0
- core/custom_types.py +345 -0
- core/mixins.py +85 -0
- core/shell.py +643 -0
- core/storage.py +156 -0
- core/translator.py +134 -0
- core/utils/__init__.py +11 -0
- core/utils/_shell.py +48 -0
- core/utils/_utils.py +120 -0
- engines/__init__.py +0 -0
- engines/_browsers/__init__.py +0 -0
- engines/_browsers/_base.py +534 -0
- engines/_browsers/_config_tools.py +4 -0
- engines/_browsers/_controllers.py +362 -0
- engines/_browsers/_page.py +87 -0
- engines/_browsers/_stealth.py +541 -0
- engines/_browsers/_types.py +118 -0
- engines/_browsers/_validators.py +229 -0
- engines/constants.py +99 -0
- engines/static.py +770 -0
- engines/toolbelt/__init__.py +3 -0
- engines/toolbelt/convertor.py +306 -0
- engines/toolbelt/custom.py +295 -0
- engines/toolbelt/fingerprints.py +88 -0
- engines/toolbelt/navigation.py +109 -0
- engines/toolbelt/proxy_rotation.py +104 -0
- fetchers/__init__.py +48 -0
- fetchers/chrome.py +91 -0
- fetchers/requests.py +28 -0
- fetchers/stealth_chrome.py +109 -0
- parser.py +1363 -0
- py.typed +1 -0
- spiders/__init__.py +18 -0
- spiders/checkpoint.py +90 -0
- spiders/engine.py +333 -0
- spiders/request.py +163 -0
- spiders/result.py +125 -0
- spiders/scheduler.py +80 -0
- spiders/session.py +145 -0
- spiders/spider.py +316 -0
- ui.py +57 -0
__init__.py
ADDED
|
@@ -0,0 +1,38 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# Package metadata, exposed as module-level dunder attributes.
__author__ = "Karim Shoair (karim.shoair@pm.me)"
__version__ = "0.4.1"
__copyright__ = "Copyright (c) 2024 Karim Shoair"
|
| 4 |
+
|
| 5 |
+
from typing import Any, TYPE_CHECKING
|
| 6 |
+
|
| 7 |
+
if TYPE_CHECKING:
|
| 8 |
+
from scrapling.parser import Selector, Selectors
|
| 9 |
+
from scrapling.core.custom_types import AttributesHandler, TextHandler
|
| 10 |
+
from scrapling.fetchers import Fetcher, AsyncFetcher, StealthyFetcher, DynamicFetcher
|
| 11 |
+
|
| 12 |
+
|
| 13 |
+
# Lazy import mapping: exported name -> (module path, attribute name inside that module).
_LAZY_IMPORTS = dict(
    Fetcher=("scrapling.fetchers", "Fetcher"),
    Selector=("scrapling.parser", "Selector"),
    Selectors=("scrapling.parser", "Selectors"),
    AttributesHandler=("scrapling.core.custom_types", "AttributesHandler"),
    TextHandler=("scrapling.core.custom_types", "TextHandler"),
    AsyncFetcher=("scrapling.fetchers", "AsyncFetcher"),
    StealthyFetcher=("scrapling.fetchers", "StealthyFetcher"),
    DynamicFetcher=("scrapling.fetchers", "DynamicFetcher"),
)
# Names exported by a star-import of this package.
__all__ = ["Selector", "Fetcher", "AsyncFetcher", "StealthyFetcher", "DynamicFetcher"]
|
| 25 |
+
|
| 26 |
+
|
| 27 |
+
def __getattr__(name: str) -> Any:
    """Resolve a lazily exported name the first time it is accessed.

    This is the module-level ``__getattr__`` hook, which Python calls only
    when regular attribute lookup on the module fails.

    :param name: The attribute being looked up on this module.
    :return: The object named by the matching ``_LAZY_IMPORTS`` entry.
    :raises AttributeError: If ``name`` is not one of the lazily exported names.
    """
    try:
        module_path, attribute = _LAZY_IMPORTS[name]
    except KeyError:
        raise AttributeError(f"module {__name__!r} has no attribute {name!r}") from None

    from importlib import import_module

    resolved = getattr(import_module(module_path), attribute)
    # Store the result in the module namespace so later lookups succeed
    # directly and never reach this hook again.
    globals()[name] = resolved
    return resolved
|
| 34 |
+
|
| 35 |
+
|
| 36 |
+
def __dir__() -> list[str]:
    """Return the sorted attribute names reported by ``dir()`` (also used for autocomplete)."""
    extras = ["fetchers", "parser", "cli", "core", "__author__", "__version__", "__copyright__"]
    listing = [*__all__, *extras]
    listing.sort()
    return listing
|
cli.py
ADDED
|
@@ -0,0 +1,826 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
from pathlib import Path
|
| 2 |
+
from subprocess import check_output
|
| 3 |
+
from sys import executable as python_executable
|
| 4 |
+
|
| 5 |
+
from scrapling.core.utils import log
|
| 6 |
+
from scrapling.engines.toolbelt.custom import Response
|
| 7 |
+
from scrapling.core.utils._shell import _CookieParser, _ParseHeaders
|
| 8 |
+
from scrapling.core._types import List, Optional, Dict, Tuple, Any, Callable
|
| 9 |
+
|
| 10 |
+
from orjson import loads as json_loads, JSONDecodeError
|
| 11 |
+
|
| 12 |
+
try:
|
| 13 |
+
from click import command, option, Choice, group, argument
|
| 14 |
+
except (ImportError, ModuleNotFoundError) as e:
|
| 15 |
+
raise ModuleNotFoundError(
|
| 16 |
+
"You need to install scrapling with any of the extras to enable Shell commands. See: https://scrapling.readthedocs.io/en/latest/#installation"
|
| 17 |
+
) from e
|
| 18 |
+
|
| 19 |
+
# Paragraph appended to the help text of the `extract` sub-commands.
__OUTPUT_FILE_HELP__ = "The output file path can be an HTML file, a Markdown file of the HTML content, or the text content itself. Use file extensions (`.html`/`.md`/`.txt`) respectively."
# Directory containing this module; `install` keeps its marker file here.
__PACKAGE_DIR__ = Path(__file__).parent
|
| 21 |
+
|
| 22 |
+
|
| 23 |
+
def __Execute(cmd: List[str], help_line: str) -> None:  # pragma: no cover
    """Announce an installation step, then run ``cmd`` and wait for it to finish.

    :param cmd: The command and its arguments as an argument vector (run without a shell).
    :param help_line: Human-readable name of what is being installed; used in the progress message.
    """
    banner = f"Installing {help_line}..."
    print(banner)
    # Deliberately no try/except: a failure of the command is left to propagate to the caller.
    check_output(cmd, shell=False)  # nosec B603
|
| 27 |
+
|
| 28 |
+
|
| 29 |
+
def __ParseJSONData(json_string: Optional[str] = None) -> Optional[Dict[str, Any]]:
    """Parse a JSON string into a Python object.

    :param json_string: Raw JSON text; ``None`` or an empty string means "no data".
    :return: The decoded value, or ``None`` when no data was given.
    :raises ValueError: If the text is not valid JSON. The decoder's own error is
        attached as the explicit cause so the original failure stays visible.
    """
    if not json_string:
        return None

    try:
        return json_loads(json_string)
    except JSONDecodeError as err:  # pragma: no cover
        raise ValueError(f"Invalid JSON data '{json_string}': {err}") from err
|
| 38 |
+
|
| 39 |
+
|
| 40 |
+
def __Request_and_Save(
    fetcher_func: Callable[..., Response],
    url: str,
    output_file: str,
    css_selector: Optional[str] = None,
    **kwargs,
) -> None:
    """Fetch ``url`` with ``fetcher_func`` and write the response to ``output_file``.

    :param fetcher_func: Callable invoked as ``fetcher_func(url, **kwargs)`` to obtain the response.
    :param url: Target URL handed to the fetcher.
    :param output_file: Destination path; a relative path is resolved against the current working directory.
    :param css_selector: Optional CSS selector forwarded to the content writer.
    :param kwargs: Extra keyword arguments forwarded to ``fetcher_func`` unchanged.
    """
    from scrapling.core.shell import Convertor

    # Anchor relative paths at the current working directory; absolute paths are used as given.
    given = Path(output_file)
    destination = given if given.is_absolute() else Path.cwd() / output_file

    page = fetcher_func(url, **kwargs)
    Convertor.write_content_to_file(page, str(destination), css_selector)
    log.info(f"Content successfully saved to '{destination}'")
|
| 58 |
+
|
| 59 |
+
|
| 60 |
+
def __ParseExtractArguments(
    headers: List[str], cookies: Optional[str], params: List[str], json: Optional[str] = None
) -> Tuple[Dict[str, str], Dict[str, str], Dict[str, str], Optional[Dict[str, Any]]]:
    """Parse the raw command-line values of an ``extract`` command.

    :param headers: Header strings, handed to ``_ParseHeaders`` which returns a
        ``(headers, cookies)`` pair.
    :param cookies: Optional cookie string; every pair produced by ``_CookieParser``
        is added to the cookies obtained from the headers, overriding equal names.
    :param params: Sequence of ``"key=value"`` strings; entries without ``=`` are ignored.
    :param json: Optional JSON text for the request body.
    :return: ``(parsed_headers, parsed_cookies, parsed_params, parsed_json)``.
    :raises ValueError: If the cookie string cannot be parsed or the JSON text is invalid.
    """
    parsed_headers, parsed_cookies = _ParseHeaders(headers)
    if cookies:
        try:
            # The whole loop is guarded: a failure can come from iterating the
            # parser, not only from storing a pair.
            for key, value in _CookieParser(cookies):
                parsed_cookies[key] = value
        except Exception as err:
            raise ValueError(f"Could not parse cookies '{cookies}': {err}") from err

    parsed_json = __ParseJSONData(json)

    # Split on the first "=" only, so values may themselves contain "=".
    parsed_params = dict(param.split("=", 1) for param in (params or ()) if "=" in param)

    return parsed_headers, parsed_cookies, parsed_params, parsed_json
|
| 80 |
+
|
| 81 |
+
|
| 82 |
+
def __BuildRequest(
    headers: List[str], cookies: Optional[str], params: List[str], json: Optional[str] = None, **kwargs
) -> Dict:
    """Build the keyword arguments for a fetcher call from command-line values.

    :param headers: Header strings from the command line.
    :param cookies: Optional cookie string from the command line.
    :param params: Sequence of ``"key=value"`` query-parameter strings.
    :param json: Optional JSON text for the request body.
    :param kwargs: Remaining options, merged into the result. ``proxy`` is moved
        into the request arguments, and a comma-separated ``impersonate`` string is
        turned into a list of stripped names.
    :return: A single dict combining the parsed request arguments and ``kwargs``.
    :raises ValueError: Propagated from argument parsing on bad cookies or JSON.
    """
    # Parse parameters
    parsed_headers, parsed_cookies, parsed_params, parsed_json = __ParseExtractArguments(headers, cookies, params, json)

    # Build request arguments; empty header/cookie mappings are passed as None.
    request_kwargs: Dict[str, Any] = {
        "headers": parsed_headers or None,
        "cookies": parsed_cookies or None,
    }
    # Compare against None so an explicitly supplied empty body (e.g. "{}") is still sent.
    if parsed_json is not None:
        request_kwargs["json"] = parsed_json
    if parsed_params:
        request_kwargs["params"] = parsed_params
    if "proxy" in kwargs:
        request_kwargs["proxy"] = kwargs.pop("proxy")

    # A comma-separated impersonate value becomes a list (for random selection).
    impersonate = kwargs.get("impersonate")
    if impersonate and "," in impersonate:
        kwargs["impersonate"] = [browser.strip() for browser in impersonate.split(",")]

    return {**request_kwargs, **kwargs}
|
| 103 |
+
|
| 104 |
+
|
| 105 |
+
@command(help="Install all Scrapling's Fetchers dependencies")
@option(
    "-f",
    "--force",
    "force",
    is_flag=True,
    default=False,
    type=bool,
    help="Force Scrapling to reinstall all Fetchers dependencies",
)
def install(force):  # pragma: no cover
    """Install the Playwright Chromium browser, its dependencies, and refresh the TLD names.

    A marker file inside the package directory records a completed installation;
    when it exists and ``force`` is false, nothing is installed.

    :param force: Reinstall even if the marker file is already present.
    """
    marker = __PACKAGE_DIR__.joinpath(".scrapling_dependencies_installed")
    if not force and marker.exists():
        print("The dependencies are already installed")
        return

    playwright = [python_executable, "-m", "playwright"]
    __Execute([*playwright, "install", "chromium"], "Playwright browsers")
    __Execute([*playwright, "install-deps", "chromium"], "Playwright dependencies")

    from tld.utils import update_tld_names

    update_tld_names(fail_silently=True)
    # Reaching this line means nothing above raised, so record the installation.
    marker.touch()
|
| 138 |
+
|
| 139 |
+
|
| 140 |
+
@command(help="Run Scrapling's MCP server (Check the docs for more info).")
@option(
    "--http",
    is_flag=True,
    default=False,
    help="Whether to run the MCP server in streamable-http transport or leave it as stdio (Default: False)",
)
@option(
    "--host",
    type=str,
    default="0.0.0.0",
    help="The host to use if streamable-http transport is enabled (Default: '0.0.0.0')",
)
@option("--port", type=int, default=8000, help="The port to use if streamable-http transport is enabled (Default: 8000)")
def mcp(http, host, port):
    """Create a ``ScraplingMCPServer`` and start serving.

    :param http: Flag selecting the streamable-http transport instead of stdio.
    :param host: Host passed to the server (described as used with the http transport).
    :param port: Port passed to the server (described as used with the http transport).
    """
    # Imported on demand so the AI module is loaded only when this command runs.
    from scrapling.core.ai import ScraplingMCPServer

    ScraplingMCPServer().serve(http, host, port)
|
| 161 |
+
|
| 162 |
+
|
| 163 |
+
@command(help="Interactive scraping console")
@option(
    "-c",
    "--code",
    "code",
    is_flag=False,
    default="",
    type=str,
    help="Evaluate the code in the shell, print the result and exit",
)
@option(
    "-L",
    "--loglevel",
    "level",
    is_flag=False,
    default="debug",
    type=Choice(["debug", "info", "warning", "error", "critical", "fatal"], case_sensitive=False),
    help="Log level (default: DEBUG)",
)
def shell(code, level):
    """Start the interactive scraping console.

    :param code: Code string handed to the shell (empty by default).
    :param level: Log level name handed to the shell.
    """
    # Imported on demand so the shell module is loaded only when this command runs.
    from scrapling.core.shell import CustomShell

    CustomShell(code=code, log_level=level).start()
|
| 187 |
+
|
| 188 |
+
|
| 189 |
+
@group(
    help="Fetch web pages using various fetchers and extract full/selected HTML content as HTML, Markdown, or extract text content."
)
def extract():
    """Command group for the sub-commands that fetch a page and save its content to a file."""
|
| 195 |
+
|
| 196 |
+
|
| 197 |
+
@extract.command(help=f"Perform a GET request and save the content to a file.\n\n{__OUTPUT_FILE_HELP__}")
@argument("url", required=True)
@argument("output_file", required=True)
@option("--headers", "-H", multiple=True, help='HTTP headers in format "Key: Value" (can be used multiple times)')
@option("--cookies", help='Cookies string in format "name1=value1; name2=value2"')
@option("--timeout", type=int, default=30, help="Request timeout in seconds (default: 30)")
@option("--proxy", help='Proxy URL in format "http://username:password@host:port"')
@option("--css-selector", "-s", help="CSS selector to extract specific content from the page. It returns all matches.")
@option("--params", "-p", multiple=True, help='Query parameters in format "key=value" (can be used multiple times)')
@option("--follow-redirects/--no-follow-redirects", default=True, help="Whether to follow redirects (default: True)")
@option("--verify/--no-verify", default=True, help="Whether to verify SSL certificates (default: True)")
@option(
    "--impersonate",
    help="Browser to impersonate. Can be a single browser (e.g., chrome) or comma-separated list for random selection (e.g., chrome,firefox,safari).",
)
@option("--stealthy-headers/--no-stealthy-headers", default=True, help="Use stealthy browser headers (default: True)")
def get(
    url,
    output_file,
    headers,
    cookies,
    timeout,
    proxy,
    css_selector,
    params,
    follow_redirects,
    verify,
    impersonate,
    stealthy_headers,
):
    """
    Send a GET request with ``Fetcher.get`` and save the response content to a file.

    :param url: Target URL for the request.
    :param output_file: Output file path; use a `.html`, `.md` or `.txt` extension to choose the saved form.
    :param headers: HTTP headers to include, each as "Key: Value".
    :param cookies: Cookies string to use in the request.
    :param timeout: Number of seconds to wait before timing out.
    :param proxy: Proxy URL to use. (Format: "http://username:password@localhost:8030")
    :param css_selector: CSS selector to extract specific content.
    :param params: Query string parameters, each as "key=value".
    :param follow_redirects: Whether to follow redirects.
    :param verify: Whether to verify HTTPS certificates.
    :param impersonate: Browser to impersonate, or a comma-separated list of browsers.
    :param stealthy_headers: If enabled, creates and adds real browser headers.
    """
    passthrough = {
        "timeout": timeout,
        "follow_redirects": follow_redirects,
        "verify": verify,
        "stealthy_headers": stealthy_headers,
        "impersonate": impersonate,
        "proxy": proxy,
    }
    # GET carries no JSON body, hence the explicit None.
    request_kwargs = __BuildRequest(headers, cookies, params, None, **passthrough)

    from scrapling.fetchers import Fetcher

    __Request_and_Save(Fetcher.get, url, output_file, css_selector, **request_kwargs)
|
| 285 |
+
|
| 286 |
+
|
| 287 |
+
@extract.command(help=f"Perform a POST request and save the content to a file.\n\n{__OUTPUT_FILE_HELP__}")
@argument("url", required=True)
@argument("output_file", required=True)
@option(
    "--data",
    "-d",
    help='Form data to include in the request body (as string, ex: "param1=value1&param2=value2")',
)
@option("--json", "-j", help="JSON data to include in the request body (as string)")
@option(
    "--headers",
    "-H",
    multiple=True,
    help='HTTP headers in format "Key: Value" (can be used multiple times)',
)
@option("--cookies", help='Cookies string in format "name1=value1; name2=value2"')
@option("--timeout", type=int, default=30, help="Request timeout in seconds (default: 30)")
@option("--proxy", help='Proxy URL in format "http://username:password@host:port"')
@option(
    "--css-selector",
    "-s",
    help="CSS selector to extract specific content from the page. It returns all matches.",
)
@option(
    "--params",
    "-p",
    multiple=True,
    help='Query parameters in format "key=value" (can be used multiple times)',
)
@option(
    "--follow-redirects/--no-follow-redirects",
    default=True,
    help="Whether to follow redirects (default: True)",
)
@option(
    "--verify/--no-verify",
    default=True,
    help="Whether to verify SSL certificates (default: True)",
)
@option(
    "--impersonate",
    help="Browser to impersonate. Can be a single browser (e.g., chrome) or comma-separated list for random selection (e.g., chrome,firefox,safari).",
)
@option(
    "--stealthy-headers/--no-stealthy-headers",
    default=True,
    help="Use stealthy browser headers (default: True)",
)
def post(
    url,
    output_file,
    data,
    json,
    headers,
    cookies,
    timeout,
    proxy,
    css_selector,
    params,
    follow_redirects,
    verify,
    impersonate,
    stealthy_headers,
):
    """
    Perform a POST request with ``Fetcher.post`` and save the content to a file.

    :param url: Target URL for the request.
    :param output_file: Output file path; use a `.html`, `.md` or `.txt` extension to choose the saved form.
    :param data: Form data to include in the request body. (as string, ex: "param1=value1&param2=value2")
    :param json: JSON text to include in the body of the request.
    :param headers: Headers to include in the request, each as "Key: Value".
    :param cookies: Cookies string to use in the request.
    :param timeout: Number of seconds to wait before timing out.
    :param proxy: Proxy URL to use.
    :param css_selector: CSS selector to extract specific content.
    :param params: Query string parameters, each as "key=value".
    :param follow_redirects: Whether to follow redirects.
    :param verify: Whether to verify HTTPS certificates.
    :param impersonate: Browser to impersonate, or a comma-separated list of browsers.
    :param stealthy_headers: If enabled, creates and adds real browser headers.
    """

    kwargs = __BuildRequest(
        headers,
        cookies,
        params,
        json,
        timeout=timeout,
        follow_redirects=follow_redirects,
        verify=verify,
        stealthy_headers=stealthy_headers,
        impersonate=impersonate,
        proxy=proxy,
        data=data,
    )
    from scrapling.fetchers import Fetcher

    __Request_and_Save(Fetcher.post, url, output_file, css_selector, **kwargs)
|
| 386 |
+
|
| 387 |
+
|
| 388 |
+
@extract.command(help=f"Perform a PUT request and save the content to a file.\n\n{__OUTPUT_FILE_HELP__}")
@argument("url", required=True)
@argument("output_file", required=True)
@option("--data", "-d", help="Form data to include in the request body")
@option("--json", "-j", help="JSON data to include in the request body (as string)")
@option("--headers", "-H", multiple=True, help='HTTP headers in format "Key: Value" (can be used multiple times)')
@option("--cookies", help='Cookies string in format "name1=value1; name2=value2"')
@option("--timeout", type=int, default=30, help="Request timeout in seconds (default: 30)")
@option("--proxy", help='Proxy URL in format "http://username:password@host:port"')
@option("--css-selector", "-s", help="CSS selector to extract specific content from the page. It returns all matches.")
@option("--params", "-p", multiple=True, help='Query parameters in format "key=value" (can be used multiple times)')
@option("--follow-redirects/--no-follow-redirects", default=True, help="Whether to follow redirects (default: True)")
@option("--verify/--no-verify", default=True, help="Whether to verify SSL certificates (default: True)")
@option(
    "--impersonate",
    help="Browser to impersonate. Can be a single browser (e.g., chrome) or comma-separated list for random selection (e.g., chrome,firefox,safari).",
)
@option("--stealthy-headers/--no-stealthy-headers", default=True, help="Use stealthy browser headers (default: True)")
def put(
    url,
    output_file,
    data,
    json,
    headers,
    cookies,
    timeout,
    proxy,
    css_selector,
    params,
    follow_redirects,
    verify,
    impersonate,
    stealthy_headers,
):
    """
    Send a PUT request with ``Fetcher.put`` and save the response content to a file.

    :param url: Target URL for the request.
    :param output_file: Output file path; use a `.html`, `.md` or `.txt` extension to choose the saved form.
    :param data: Form data to include in the request body.
    :param json: JSON text to include in the body of the request.
    :param headers: Headers to include in the request, each as "Key: Value".
    :param cookies: Cookies string to use in the request.
    :param timeout: Number of seconds to wait before timing out.
    :param proxy: Proxy URL to use.
    :param css_selector: CSS selector to extract specific content.
    :param params: Query string parameters, each as "key=value".
    :param follow_redirects: Whether to follow redirects.
    :param verify: Whether to verify HTTPS certificates.
    :param impersonate: Browser to impersonate, or a comma-separated list of browsers.
    :param stealthy_headers: If enabled, creates and adds real browser headers.
    """
    passthrough = {
        "timeout": timeout,
        "follow_redirects": follow_redirects,
        "verify": verify,
        "stealthy_headers": stealthy_headers,
        "impersonate": impersonate,
        "proxy": proxy,
        "data": data,
    }
    request_kwargs = __BuildRequest(headers, cookies, params, json, **passthrough)

    from scrapling.fetchers import Fetcher

    __Request_and_Save(Fetcher.put, url, output_file, css_selector, **request_kwargs)
|
| 483 |
+
|
| 484 |
+
|
| 485 |
+
@extract.command(help=f"Perform a DELETE request and save the content to a file.\n\n{__OUTPUT_FILE_HELP__}")
@argument("url", required=True)
@argument("output_file", required=True)
@option("--headers", "-H", multiple=True, help='HTTP headers in format "Key: Value" (can be used multiple times)')
@option("--cookies", help='Cookies string in format "name1=value1; name2=value2"')
@option("--timeout", type=int, default=30, help="Request timeout in seconds (default: 30)")
@option("--proxy", help='Proxy URL in format "http://username:password@host:port"')
@option("--css-selector", "-s", help="CSS selector to extract specific content from the page. It returns all matches.")
@option("--params", "-p", multiple=True, help='Query parameters in format "key=value" (can be used multiple times)')
@option("--follow-redirects/--no-follow-redirects", default=True, help="Whether to follow redirects (default: True)")
@option("--verify/--no-verify", default=True, help="Whether to verify SSL certificates (default: True)")
@option(
    "--impersonate",
    help="Browser to impersonate. Can be a single browser (e.g., chrome) or comma-separated list for random selection (e.g., chrome,firefox,safari).",
)
@option("--stealthy-headers/--no-stealthy-headers", default=True, help="Use stealthy browser headers (default: True)")
def delete(
    url,
    output_file,
    headers,
    cookies,
    timeout,
    proxy,
    css_selector,
    params,
    follow_redirects,
    verify,
    impersonate,
    stealthy_headers,
):
    """
    Send a DELETE request with ``Fetcher.delete`` and save the response content to a file.

    :param url: Target URL for the request.
    :param output_file: Output file path; use a `.html`, `.md` or `.txt` extension to choose the saved form.
    :param headers: Headers to include in the request, each as "Key: Value".
    :param cookies: Cookies string to use in the request.
    :param timeout: Number of seconds to wait before timing out.
    :param proxy: Proxy URL to use.
    :param css_selector: CSS selector to extract specific content.
    :param params: Query string parameters, each as "key=value".
    :param follow_redirects: Whether to follow redirects.
    :param verify: Whether to verify HTTPS certificates.
    :param impersonate: Browser to impersonate, or a comma-separated list of browsers.
    :param stealthy_headers: If enabled, creates and adds real browser headers.
    """
    passthrough = {
        "timeout": timeout,
        "follow_redirects": follow_redirects,
        "verify": verify,
        "stealthy_headers": stealthy_headers,
        "impersonate": impersonate,
        "proxy": proxy,
    }
    # DELETE carries no JSON body here, hence the explicit None.
    request_kwargs = __BuildRequest(headers, cookies, params, None, **passthrough)

    from scrapling.fetchers import Fetcher

    __Request_and_Save(Fetcher.delete, url, output_file, css_selector, **request_kwargs)
|
| 573 |
+
|
| 574 |
+
|
| 575 |
+
@extract.command(help=f"Use DynamicFetcher to fetch content with browser automation.\n\n{__OUTPUT_FILE_HELP__}")
@argument("url", required=True)
@argument("output_file", required=True)
@option("--headless/--no-headless", default=True, help="Run browser in headless mode (default: True)")
@option(
    "--disable-resources/--enable-resources",
    default=False,
    help="Drop unnecessary resources for speed boost (default: False)",
)
@option("--network-idle/--no-network-idle", default=False, help="Wait for network idle (default: False)")
@option("--timeout", type=int, default=30000, help="Timeout in milliseconds (default: 30000)")
@option(
    "--wait",
    type=int,
    default=0,
    help="Additional wait time in milliseconds after page load (default: 0)",
)
@option(
    "--css-selector",
    "-s",
    help="CSS selector to extract specific content from the page. It returns all matches.",
)
@option(
    "--wait-selector",
    help="CSS selector to wait for before proceeding",
)
@option(
    "--locale",
    default=None,
    help="Specify user locale. Defaults to the system default locale.",
)
@option(
    "--real-chrome/--no-real-chrome",
    default=False,
    help="If you have a Chrome browser installed on your device, enable this, and the Fetcher will launch an instance of your browser and use it. (default: False)",
)
@option(
    "--proxy",
    help='Proxy URL in format "http://username:password@host:port"',
)
@option(
    "--extra-headers",
    "-H",
    multiple=True,
    help='Extra headers in format "Key: Value" (can be used multiple times)',
)
def fetch(
    url,
    output_file,
    headless,
    disable_resources,
    network_idle,
    timeout,
    wait,
    css_selector,
    wait_selector,
    locale,
    real_chrome,
    proxy,
    extra_headers,
):
    """
    Fetch a page through DynamicFetcher (browser automation) and save the result to a file.

    :param url: Target url.
    :param output_file: Output file path (.md for Markdown, .html for HTML).
    :param headless: Whether the browser runs hidden (headless) or visible (headful).
    :param disable_resources: Drop requests of unnecessary resources for a speed boost.
    :param network_idle: Wait until the page has had no network connections for at least 500 ms.
    :param timeout: Timeout, in milliseconds, applied to all page operations and waits.
    :param wait: Extra time, in milliseconds, to wait after everything finishes. Forwarded only when positive.
    :param css_selector: CSS selector used to extract specific content.
    :param wait_selector: CSS selector to wait for before proceeding. Forwarded only when given.
    :param locale: Locale to set for the browser.
    :param real_chrome: Launch and use the Chrome browser installed on this device.
    :param proxy: Proxy to be used with requests. Forwarded only when given.
    :param extra_headers: Extra "Key: Value" headers to add to the request.
    """

    # Turn the raw "Key: Value" strings into a mapping; the second returned value is not needed here
    header_map, _ = _ParseHeaders(extra_headers, False)

    # Settings that are always forwarded to the fetcher
    fetch_options = dict(
        headless=headless,
        disable_resources=disable_resources,
        network_idle=network_idle,
        timeout=timeout,
        locale=locale,
        real_chrome=real_chrome,
    )

    # Settings that are forwarded only when the user actually supplied a meaningful value
    if wait > 0:
        fetch_options["wait"] = wait
    for option_name, option_value in (
        ("wait_selector", wait_selector),
        ("proxy", proxy),
        ("extra_headers", header_map),
    ):
        if option_value:
            fetch_options[option_name] = option_value

    from scrapling.fetchers import DynamicFetcher

    __Request_and_Save(DynamicFetcher.fetch, url, output_file, css_selector, **fetch_options)
|
| 682 |
+
|
| 683 |
+
|
| 684 |
+
@extract.command(help=f"Use StealthyFetcher to fetch content with advanced stealth features.\n\n{__OUTPUT_FILE_HELP__}")
@argument("url", required=True)
@argument("output_file", required=True)
@option("--headless/--no-headless", default=True, help="Run browser in headless mode (default: True)")
@option(
    "--disable-resources/--enable-resources",
    default=False,
    help="Drop unnecessary resources for speed boost (default: False)",
)
@option("--block-webrtc/--allow-webrtc", default=False, help="Block WebRTC entirely (default: False)")
@option(
    "--solve-cloudflare/--no-solve-cloudflare",
    default=False,
    help="Solve Cloudflare challenges (default: False)",
)
@option(
    "--allow-webgl/--block-webgl",
    default=True,
    help="Allow WebGL (default: True)",
)
@option("--network-idle/--no-network-idle", default=False, help="Wait for network idle (default: False)")
@option(
    "--real-chrome/--no-real-chrome",
    default=False,
    help="If you have a Chrome browser installed on your device, enable this, and the Fetcher will launch an instance of your browser and use it. (default: False)",
)
@option("--hide-canvas/--show-canvas", default=False, help="Add noise to canvas operations (default: False)")
@option("--timeout", type=int, default=30000, help="Timeout in milliseconds (default: 30000)")
@option(
    "--wait",
    type=int,
    default=0,
    help="Additional wait time in milliseconds after page load (default: 0)",
)
@option(
    "--css-selector",
    "-s",
    help="CSS selector to extract specific content from the page. It returns all matches.",
)
@option(
    "--wait-selector",
    help="CSS selector to wait for before proceeding",
)
@option(
    "--proxy",
    help='Proxy URL in format "http://username:password@host:port"',
)
@option(
    "--extra-headers",
    "-H",
    multiple=True,
    help='Extra headers in format "Key: Value" (can be used multiple times)',
)
def stealthy_fetch(
    url,
    output_file,
    headless,
    disable_resources,
    block_webrtc,
    solve_cloudflare,
    allow_webgl,
    network_idle,
    real_chrome,
    hide_canvas,
    timeout,
    wait,
    css_selector,
    wait_selector,
    proxy,
    extra_headers,
):
    """
    Fetch a page through StealthyFetcher (browser with stealth features) and save the result to a file.

    :param url: Target url.
    :param output_file: Output file path (.md for Markdown, .html for HTML).
    :param headless: Whether the browser runs hidden (headless) or visible (headful).
    :param disable_resources: Drop requests of unnecessary resources for a speed boost.
    :param block_webrtc: Block WebRTC entirely.
    :param solve_cloudflare: Solve Cloudflare's Turnstile/Interstitial challenges.
    :param allow_webgl: Allow WebGL (recommended to keep enabled).
    :param network_idle: Wait until the page has had no network connections for at least 500 ms.
    :param real_chrome: Launch and use the Chrome browser installed on this device.
    :param hide_canvas: Add random noise to canvas operations to prevent fingerprinting.
    :param timeout: Timeout, in milliseconds, applied to all page operations and waits.
    :param wait: Extra time, in milliseconds, to wait after everything finishes. Forwarded only when positive.
    :param css_selector: CSS selector used to extract specific content.
    :param wait_selector: CSS selector to wait for before proceeding. Forwarded only when given.
    :param proxy: Proxy to be used with requests. Forwarded only when given.
    :param extra_headers: Extra "Key: Value" headers to add to the request.
    """

    # Turn the raw "Key: Value" strings into a mapping; the second returned value is not needed here
    header_map, _ = _ParseHeaders(extra_headers, False)

    # Settings that are always forwarded to the fetcher
    fetch_options = dict(
        headless=headless,
        disable_resources=disable_resources,
        block_webrtc=block_webrtc,
        solve_cloudflare=solve_cloudflare,
        allow_webgl=allow_webgl,
        network_idle=network_idle,
        real_chrome=real_chrome,
        hide_canvas=hide_canvas,
        timeout=timeout,
    )

    # Settings that are forwarded only when the user actually supplied a meaningful value
    if wait > 0:
        fetch_options["wait"] = wait
    for option_name, option_value in (
        ("wait_selector", wait_selector),
        ("proxy", proxy),
        ("extra_headers", header_map),
    ):
        if option_value:
            fetch_options[option_name] = option_value

    from scrapling.fetchers import StealthyFetcher

    __Request_and_Save(StealthyFetcher.fetch, url, output_file, css_selector, **fetch_options)
|
| 815 |
+
|
| 816 |
+
|
| 817 |
+
# Root command group of the CLI; it does nothing itself and only hosts the sub-commands below.
# (Deliberately no docstring here: click would display it as the group's help text.)
@group()
def main():
    pass


# Register the sub-commands on the root group, in display order
for _command in (install, shell, extract, mcp):
    main.add_command(_command)
|
core/__init__.py
ADDED
|
File without changes
|
core/_shell_signatures.py
ADDED
|
@@ -0,0 +1,100 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
from scrapling.core._types import (
|
| 2 |
+
Any,
|
| 3 |
+
Dict,
|
| 4 |
+
List,
|
| 5 |
+
Tuple,
|
| 6 |
+
Sequence,
|
| 7 |
+
Callable,
|
| 8 |
+
Optional,
|
| 9 |
+
SetCookieParam,
|
| 10 |
+
SelectorWaitStates,
|
| 11 |
+
)
|
| 12 |
+
|
| 13 |
+
# Parameter tables used to build the shell function signatures (created once, at import time).
# They mirror the TypedDict definitions from _types.py but stay runtime-accessible for IPython introspection.

# Keyword arguments shared by all the HTTP request helpers
_REQUESTS_PARAMS = {
    "params": Optional[Dict | List | Tuple],
    "cookies": Any,
    "auth": Optional[Tuple[str, str]],
    "impersonate": Any,
    "http3": Optional[bool],
    "stealthy_headers": Optional[bool],
    "proxies": Any,
    "proxy": Optional[str],
    "proxy_auth": Optional[Tuple[str, str]],
    "timeout": Optional[int | float],
    "headers": Any,
    "retries": Optional[int],
    "retry_delay": Optional[int],
    "follow_redirects": Optional[bool],
    "max_redirects": Optional[int],
    "verify": Optional[bool],
    "cert": Optional[str | Tuple[str, str]],
    "selector_config": Optional[Dict],
}

# Keyword arguments of the browser-based `fetch` helper
_FETCH_PARAMS = {
    "headless": bool,
    "disable_resources": bool,
    "network_idle": bool,
    "load_dom": bool,
    "wait_selector": Optional[str],
    "wait_selector_state": SelectorWaitStates,
    "cookies": Sequence[SetCookieParam],
    "google_search": bool,
    "wait": int | float,
    "timezone_id": str | None,
    "page_action": Optional[Callable],
    "proxy": Optional[str | Dict[str, str] | Tuple],
    "extra_headers": Optional[Dict[str, str]],
    "timeout": int | float,
    "init_script": Optional[str],
    "user_data_dir": str,
    "selector_config": Optional[Dict],
    "additional_args": Optional[Dict],
    "locale": Optional[str],
    "real_chrome": bool,
    "cdp_url": Optional[str],
    "useragent": Optional[str],
    "extra_flags": Optional[List[str]],
}

# `stealthy_fetch` takes every `fetch` argument (same order) followed by its stealth-only switches
_STEALTHY_FETCH_PARAMS = {
    **_FETCH_PARAMS,
    "allow_webgl": bool,
    "hide_canvas": bool,
    "block_webrtc": bool,
    "solve_cloudflare": bool,
}

# Mapping of function names to their parameter definitions.
# `post` and `put` extend the shared request arguments with the request-body arguments.
Signatures_map = {
    "get": _REQUESTS_PARAMS,
    "post": {
        **_REQUESTS_PARAMS,
        "data": Optional[Dict | str],
        "json": Optional[Dict | List],
    },
    "put": {
        **_REQUESTS_PARAMS,
        "data": Optional[Dict | str],
        "json": Optional[Dict | List],
    },
    "delete": _REQUESTS_PARAMS,
    "fetch": _FETCH_PARAMS,
    "stealthy_fetch": _STEALTHY_FETCH_PARAMS,
}
|
core/_types.py
ADDED
|
@@ -0,0 +1,57 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""
|
| 2 |
+
Type definitions for type checking purposes.
|
| 3 |
+
"""
|
| 4 |
+
|
| 5 |
+
from typing import (
|
| 6 |
+
TYPE_CHECKING,
|
| 7 |
+
TypedDict,
|
| 8 |
+
TypeAlias,
|
| 9 |
+
cast,
|
| 10 |
+
overload,
|
| 11 |
+
Any,
|
| 12 |
+
Callable,
|
| 13 |
+
Dict,
|
| 14 |
+
Generator,
|
| 15 |
+
AsyncGenerator,
|
| 16 |
+
Generic,
|
| 17 |
+
Iterable,
|
| 18 |
+
List,
|
| 19 |
+
Set,
|
| 20 |
+
Literal,
|
| 21 |
+
Optional,
|
| 22 |
+
Iterator,
|
| 23 |
+
Pattern,
|
| 24 |
+
Sequence,
|
| 25 |
+
Tuple,
|
| 26 |
+
TypeVar,
|
| 27 |
+
Union,
|
| 28 |
+
Match,
|
| 29 |
+
Mapping,
|
| 30 |
+
Awaitable,
|
| 31 |
+
Protocol,
|
| 32 |
+
Coroutine,
|
| 33 |
+
SupportsIndex,
|
| 34 |
+
)
|
| 35 |
+
from typing_extensions import Self, Unpack
|
| 36 |
+
|
| 37 |
+
# Proxy can be a string URL or a dict (Playwright format: {"server": "...", "username": "...", "password": "..."})
ProxyType = Union[str, Dict[str, str]]
# Closed set of upper-case HTTP method names
SUPPORTED_HTTP_METHODS = Literal["GET", "POST", "PUT", "DELETE"]
# Closed set of element states accepted when waiting for a selector
SelectorWaitStates = Literal["attached", "detached", "hidden", "visible"]
# Closed set of page load state names
PageLoadStates = Literal["commit", "domcontentloaded", "load", "networkidle"]
# Closed set of output formats for content extraction
extraction_types = Literal["text", "html", "markdown"]
# Either text or raw bytes
StrOrBytes = Union[str, bytes]
|
| 44 |
+
|
| 45 |
+
|
| 46 |
+
# Copied from `playwright._impl._api_structures.SetCookieParam`
class SetCookieParam(TypedDict, total=False):
    """Dictionary shape describing a single cookie to set.

    Declared with `total=False`, so at the type level every key may be omitted.
    """

    name: str
    value: str
    url: Optional[str]
    domain: Optional[str]
    path: Optional[str]
    expires: Optional[float]
    httpOnly: Optional[bool]
    secure: Optional[bool]
    # Restricted to the three literal values below
    sameSite: Optional[Literal["Lax", "None", "Strict"]]
    partitionKey: Optional[str]
|
core/ai.py
ADDED
|
@@ -0,0 +1,653 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
from asyncio import gather
|
| 2 |
+
|
| 3 |
+
from mcp.server.fastmcp import FastMCP
|
| 4 |
+
from pydantic import BaseModel, Field
|
| 5 |
+
from starlette.requests import Request
|
| 6 |
+
from starlette.responses import Response, JSONResponse
|
| 7 |
+
|
| 8 |
+
from scrapling.core.shell import Convertor
|
| 9 |
+
from scrapling.engines.toolbelt.custom import Response as _ScraplingResponse
|
| 10 |
+
from scrapling.engines.static import ImpersonateType
|
| 11 |
+
from scrapling.fetchers import (
|
| 12 |
+
Fetcher,
|
| 13 |
+
FetcherSession,
|
| 14 |
+
DynamicFetcher,
|
| 15 |
+
AsyncDynamicSession,
|
| 16 |
+
StealthyFetcher,
|
| 17 |
+
AsyncStealthySession,
|
| 18 |
+
)
|
| 19 |
+
from scrapling.core._types import (
|
| 20 |
+
Optional,
|
| 21 |
+
Tuple,
|
| 22 |
+
Mapping,
|
| 23 |
+
Dict,
|
| 24 |
+
List,
|
| 25 |
+
Any,
|
| 26 |
+
Generator,
|
| 27 |
+
Sequence,
|
| 28 |
+
SetCookieParam,
|
| 29 |
+
extraction_types,
|
| 30 |
+
SelectorWaitStates,
|
| 31 |
+
)
|
| 32 |
+
|
| 33 |
+
|
| 34 |
+
class ResponseModel(BaseModel):
    """Request's response information structure.

    Pydantic model holding the status code, the extracted content pieces, and the URL of one response.
    The `Field` descriptions below are part of the model's schema.
    """

    # Status code of the response
    status: int = Field(description="The status code returned by the website.")
    # One string per extracted piece of content
    content: list[str] = Field(description="The content as Markdown/HTML or the text content of the page.")
    # URL associated with the response
    url: str = Field(description="The URL given by the user that resulted in this response.")
|
| 40 |
+
|
| 41 |
+
|
| 42 |
+
def _content_translator(content: Generator[str, None, None], page: _ScraplingResponse) -> ResponseModel:
    """Drain a content generator into a single `ResponseModel` for the given page.

    :param content: Generator yielding the extracted pieces of content as strings.
    :param page: The response the content was extracted from; supplies `status` and `url`.
    :return: One `ResponseModel` whose `content` holds every yielded string, in order.
    """
    # `list()` consumes the generator eagerly, so the content is fully materialized before returning
    return ResponseModel(status=page.status, content=list(content), url=page.url)
|
| 45 |
+
|
| 46 |
+
|
| 47 |
+
def _normalize_credentials(credentials: Optional[Dict[str, str]]) -> Optional[Tuple[str, str]]:
|
| 48 |
+
"""Convert a credentials dictionary to a tuple accepted by fetchers."""
|
| 49 |
+
if not credentials:
|
| 50 |
+
return None
|
| 51 |
+
|
| 52 |
+
username = credentials.get("username")
|
| 53 |
+
password = credentials.get("password")
|
| 54 |
+
|
| 55 |
+
if username is None or password is None:
|
| 56 |
+
raise ValueError("Credentials dictionary must contain both 'username' and 'password' keys")
|
| 57 |
+
|
| 58 |
+
return username, password
|
| 59 |
+
|
| 60 |
+
|
| 61 |
+
class ScraplingMCPServer:
|
| 62 |
+
@staticmethod
def get(
    url: str,
    impersonate: ImpersonateType = "chrome",
    extraction_type: extraction_types = "markdown",
    css_selector: Optional[str] = None,
    main_content_only: bool = True,
    params: Optional[Dict] = None,
    headers: Optional[Mapping[str, Optional[str]]] = None,
    cookies: Optional[Dict[str, str]] = None,
    timeout: Optional[int | float] = 30,
    follow_redirects: bool = True,
    max_redirects: int = 30,
    retries: Optional[int] = 3,
    retry_delay: Optional[int] = 1,
    proxy: Optional[str] = None,
    proxy_auth: Optional[Dict[str, str]] = None,
    auth: Optional[Dict[str, str]] = None,
    verify: Optional[bool] = True,
    http3: Optional[bool] = False,
    stealthy_headers: Optional[bool] = True,
) -> ResponseModel:
    """Make a GET HTTP request to a URL and return a structured output of the result.
    Note: This is only suitable for low-mid protection levels. For high-protection levels or websites that require JS loading, use the other tools directly.
    Note: If the `css_selector` resolves to more than one element, all the elements will be returned.

    :param url: The URL to request.
    :param impersonate: Browser version whose fingerprint is impersonated. Uses the latest chrome version by default.
    :param extraction_type: The type of content to extract from the page. Defaults to "markdown". Options are:
        - Markdown will convert the page content to Markdown format.
        - HTML will return the raw HTML content of the page.
        - Text will return the text content of the page.
    :param css_selector: CSS selector to extract the content from the page. If main_content_only is True, then it will be executed on the main content of the page. Defaults to None.
    :param main_content_only: Whether to extract only the main content of the page. Defaults to True. The main content here is the data inside the `<body>` tag.
    :param params: Query string parameters for the request.
    :param headers: Headers to include in the request.
    :param cookies: Cookies to use in the request.
    :param timeout: Number of seconds to wait before timing out.
    :param follow_redirects: Whether to follow redirects. Defaults to True.
    :param max_redirects: Maximum number of redirects. Default 30, use -1 for unlimited.
    :param retries: Number of retry attempts. Defaults to 3.
    :param retry_delay: Number of seconds to wait between retry attempts. Defaults to 1 second.
    :param proxy: Proxy URL to use. Format: "http://username:password@localhost:8030".
    :param proxy_auth: HTTP basic auth for proxy in dictionary format with `username` and `password` keys.
    :param auth: HTTP basic auth in dictionary format with `username` and `password` keys.
    :param verify: Whether to verify HTTPS certificates.
    :param http3: Whether to use HTTP3. Defaults to False. It might be problematic if used with `impersonate`.
    :param stealthy_headers: If enabled (default), it creates and adds real browser headers. It also sets the referer header as if this request came from a Google search of URL's domain.
    """
    # Credentials arrive as dictionaries but the fetcher expects (username, password) tuples
    proxy_credentials = _normalize_credentials(proxy_auth)
    site_credentials = _normalize_credentials(auth)

    request_options = dict(
        auth=site_credentials,
        proxy=proxy,
        http3=http3,
        verify=verify,
        params=params,
        proxy_auth=proxy_credentials,
        retry_delay=retry_delay,
        stealthy_headers=stealthy_headers,
        impersonate=impersonate,
        headers=headers,
        cookies=cookies,
        timeout=timeout,
        retries=retries,
        max_redirects=max_redirects,
        follow_redirects=follow_redirects,
    )
    page = Fetcher.get(url, **request_options)

    # Extract the requested content from the page, then wrap it with the response metadata
    extracted = Convertor._extract_content(
        page,
        css_selector=css_selector,
        extraction_type=extraction_type,
        main_content_only=main_content_only,
    )
    return _content_translator(extracted, page)
|
| 142 |
+
|
| 143 |
+
@staticmethod
async def bulk_get(
    urls: List[str],
    impersonate: ImpersonateType = "chrome",
    extraction_type: extraction_types = "markdown",
    css_selector: Optional[str] = None,
    main_content_only: bool = True,
    params: Optional[Dict] = None,
    headers: Optional[Mapping[str, Optional[str]]] = None,
    cookies: Optional[Dict[str, str]] = None,
    timeout: Optional[int | float] = 30,
    follow_redirects: bool = True,
    max_redirects: int = 30,
    retries: Optional[int] = 3,
    retry_delay: Optional[int] = 1,
    proxy: Optional[str] = None,
    proxy_auth: Optional[Dict[str, str]] = None,
    auth: Optional[Dict[str, str]] = None,
    verify: Optional[bool] = True,
    http3: Optional[bool] = False,
    stealthy_headers: Optional[bool] = True,
) -> List[ResponseModel]:
    """Make a GET HTTP request to a group of URLs and, for each URL, return a structured output of the result.
    Note: This is only suitable for low-mid protection levels. For high-protection levels or websites that require JS loading, use the other tools directly.
    Note: If the `css_selector` resolves to more than one element, all the elements will be returned.

    :param urls: A list of the URLs to request.
    :param impersonate: Browser version whose fingerprint is impersonated. Uses the latest chrome version by default.
    :param extraction_type: The type of content to extract from the page. Defaults to "markdown". Options are:
        - Markdown will convert the page content to Markdown format.
        - HTML will return the raw HTML content of the page.
        - Text will return the text content of the page.
    :param css_selector: CSS selector to extract the content from the page. If main_content_only is True, then it will be executed on the main content of the page. Defaults to None.
    :param main_content_only: Whether to extract only the main content of the page. Defaults to True. The main content here is the data inside the `<body>` tag.
    :param params: Query string parameters for the request.
    :param headers: Headers to include in the request.
    :param cookies: Cookies to use in the request.
    :param timeout: Number of seconds to wait before timing out.
    :param follow_redirects: Whether to follow redirects. Defaults to True.
    :param max_redirects: Maximum number of redirects. Default 30, use -1 for unlimited.
    :param retries: Number of retry attempts. Defaults to 3.
    :param retry_delay: Number of seconds to wait between retry attempts. Defaults to 1 second.
    :param proxy: Proxy URL to use. Format: "http://username:password@localhost:8030".
    :param proxy_auth: HTTP basic auth for proxy in dictionary format with `username` and `password` keys.
    :param auth: HTTP basic auth in dictionary format with `username` and `password` keys.
    :param verify: Whether to verify HTTPS certificates.
    :param http3: Whether to use HTTP3. Defaults to False. It might be problematic if used with `impersonate`.
    :param stealthy_headers: If enabled (default), it creates and adds real browser headers. It also sets the referer header as if this request came from a Google search of URL's domain.
    """
    # Credentials arrive as dictionaries but the session expects (username, password) tuples
    proxy_credentials = _normalize_credentials(proxy_auth)
    site_credentials = _normalize_credentials(auth)

    # Every URL is requested with exactly the same options
    shared_options = dict(
        auth=site_credentials,
        proxy=proxy,
        http3=http3,
        verify=verify,
        params=params,
        headers=headers,
        cookies=cookies,
        timeout=timeout,
        retries=retries,
        proxy_auth=proxy_credentials,
        retry_delay=retry_delay,
        impersonate=impersonate,
        max_redirects=max_redirects,
        follow_redirects=follow_redirects,
        stealthy_headers=stealthy_headers,
    )

    async with FetcherSession() as session:
        # Create one request per URL on the shared session and await them all together
        pending: List[Any] = [session.get(target, **shared_options) for target in urls]
        pages = await gather(*pending)

        results = []
        for page in pages:
            extracted = Convertor._extract_content(
                page,
                css_selector=css_selector,
                extraction_type=extraction_type,
                main_content_only=main_content_only,
            )
            results.append(_content_translator(extracted, page))
        return results
|
| 231 |
+
|
| 232 |
+
@staticmethod
async def fetch(
    url: str,
    extraction_type: extraction_types = "markdown",
    css_selector: Optional[str] = None,
    main_content_only: bool = True,
    headless: bool = True,  # noqa: F821
    google_search: bool = True,
    real_chrome: bool = False,
    wait: int | float = 0,
    proxy: Optional[str | Dict[str, str]] = None,
    timezone_id: str | None = None,
    locale: str | None = None,
    extra_headers: Optional[Dict[str, str]] = None,
    useragent: Optional[str] = None,
    cdp_url: Optional[str] = None,
    timeout: int | float = 30000,
    disable_resources: bool = False,
    wait_selector: Optional[str] = None,
    cookies: Sequence[SetCookieParam] | None = None,
    network_idle: bool = False,
    wait_selector_state: SelectorWaitStates = "attached",
) -> ResponseModel:
    """Use playwright to open a browser to fetch a URL and return a structured output of the result.
    Note: This is only suitable for low-mid protection levels.
    Note: If the `css_selector` resolves to more than one element, all the elements will be returned.

    :param url: The URL to request.
    :param extraction_type: The type of content to extract from the page. Defaults to "markdown". Options are:
        - Markdown will convert the page content to Markdown format.
        - HTML will return the raw HTML content of the page.
        - Text will return the text content of the page.
    :param css_selector: CSS selector to extract the content from the page. If main_content_only is True, then it will be executed on the main content of the page. Defaults to None.
    :param main_content_only: Whether to extract only the main content of the page. Defaults to True. The main content here is the data inside the `<body>` tag.
    :param headless: Run the browser in headless/hidden (default), or headful/visible mode.
    :param disable_resources: Drop requests for unnecessary resources for a speed boost.
        Requests dropped are of type `font`, `image`, `media`, `beacon`, `object`, `imageset`, `texttrack`, `websocket`, `csp_report`, and `stylesheet`.
    :param useragent: Pass a useragent string to be used. Otherwise the fetcher will generate a real Useragent of the same browser and use it.
    :param cookies: Set cookies for the next request. It should be in a dictionary format that Playwright accepts.
    :param network_idle: Wait for the page until there are no network connections for at least 500 ms.
    :param timeout: The timeout in milliseconds that is used in all operations and waits through the page. The default is 30,000
    :param wait: The time (milliseconds) the fetcher will wait after everything finishes before closing the page and returning the ` Response ` object.
    :param wait_selector: Wait for a specific CSS selector to be in a specific state.
    :param timezone_id: Changes the timezone of the browser. Defaults to the system timezone.
    :param locale: Specify user locale, for example, `en-GB`, `de-DE`, etc. Locale will affect navigator.language value, Accept-Language request header value as well as number and date formatting
        rules. Defaults to the system default locale.
    :param wait_selector_state: The state to wait for the selector given with `wait_selector`. The default state is `attached`.
    :param real_chrome: If you have a Chrome browser installed on your device, enable this, and the Fetcher will launch an instance of your browser and use it.
    :param cdp_url: Instead of launching a new browser instance, connect to this CDP URL to control real browsers through CDP.
    :param google_search: Enabled by default, Scrapling will set the referer header to be as if this request came from a Google search of this website's domain name.
    :param extra_headers: A dictionary of extra headers to add to the request. _The referer set by the `google_search` argument takes priority over the referer set here if used together._
    :param proxy: The proxy to be used with requests, it can be a string or a dictionary with the keys 'server', 'username', and 'password' only.
    """
    # NOTE: the docstring above is published verbatim as the MCP tool description
    # (see `serve`), so its wording is part of the tool's public surface.

    # Browser-level settings are forwarded unchanged to the dynamic fetcher.
    browser_options = dict(
        wait=wait,
        proxy=proxy,
        locale=locale,
        timeout=timeout,
        cookies=cookies,
        cdp_url=cdp_url,
        headless=headless,
        useragent=useragent,
        timezone_id=timezone_id,
        real_chrome=real_chrome,
        network_idle=network_idle,
        wait_selector=wait_selector,
        extra_headers=extra_headers,
        google_search=google_search,
        disable_resources=disable_resources,
        wait_selector_state=wait_selector_state,
    )
    page = await DynamicFetcher.async_fetch(url, **browser_options)

    # Extraction settings only affect how the fetched page is converted.
    extracted = Convertor._extract_content(
        page,
        css_selector=css_selector,
        extraction_type=extraction_type,
        main_content_only=main_content_only,
    )
    return _content_translator(extracted, page)
|
| 313 |
+
|
| 314 |
+
@staticmethod
|
| 315 |
+
async def bulk_fetch(
|
| 316 |
+
urls: List[str],
|
| 317 |
+
extraction_type: extraction_types = "markdown",
|
| 318 |
+
css_selector: Optional[str] = None,
|
| 319 |
+
main_content_only: bool = True,
|
| 320 |
+
headless: bool = True, # noqa: F821
|
| 321 |
+
google_search: bool = True,
|
| 322 |
+
real_chrome: bool = False,
|
| 323 |
+
wait: int | float = 0,
|
| 324 |
+
proxy: Optional[str | Dict[str, str]] = None,
|
| 325 |
+
timezone_id: str | None = None,
|
| 326 |
+
locale: str | None = None,
|
| 327 |
+
extra_headers: Optional[Dict[str, str]] = None,
|
| 328 |
+
useragent: Optional[str] = None,
|
| 329 |
+
cdp_url: Optional[str] = None,
|
| 330 |
+
timeout: int | float = 30000,
|
| 331 |
+
disable_resources: bool = False,
|
| 332 |
+
wait_selector: Optional[str] = None,
|
| 333 |
+
cookies: Sequence[SetCookieParam] | None = None,
|
| 334 |
+
network_idle: bool = False,
|
| 335 |
+
wait_selector_state: SelectorWaitStates = "attached",
|
| 336 |
+
) -> List[ResponseModel]:
|
| 337 |
+
"""Use playwright to open a browser, then fetch a group of URLs at the same time, and for each page return a structured output of the result.
|
| 338 |
+
Note: This is only suitable for low-mid protection levels.
|
| 339 |
+
Note: If the `css_selector` resolves to more than one element, all the elements will be returned.
|
| 340 |
+
|
| 341 |
+
:param urls: A list of the URLs to request.
|
| 342 |
+
:param extraction_type: The type of content to extract from the page. Defaults to "markdown". Options are:
|
| 343 |
+
- Markdown will convert the page content to Markdown format.
|
| 344 |
+
- HTML will return the raw HTML content of the page.
|
| 345 |
+
- Text will return the text content of the page.
|
| 346 |
+
:param css_selector: CSS selector to extract the content from the page. If main_content_only is True, then it will be executed on the main content of the page. Defaults to None.
|
| 347 |
+
:param main_content_only: Whether to extract only the main content of the page. Defaults to True. The main content here is the data inside the `<body>` tag.
|
| 348 |
+
:param headless: Run the browser in headless/hidden (default), or headful/visible mode.
|
| 349 |
+
:param disable_resources: Drop requests for unnecessary resources for a speed boost.
|
| 350 |
+
Requests dropped are of type `font`, `image`, `media`, `beacon`, `object`, `imageset`, `texttrack`, `websocket`, `csp_report`, and `stylesheet`.
|
| 351 |
+
:param useragent: Pass a useragent string to be used. Otherwise the fetcher will generate a real Useragent of the same browser and use it.
|
| 352 |
+
:param cookies: Set cookies for the next request. It should be in a dictionary format that Playwright accepts.
|
| 353 |
+
:param network_idle: Wait for the page until there are no network connections for at least 500 ms.
|
| 354 |
+
:param timeout: The timeout in milliseconds that is used in all operations and waits through the page. The default is 30,000
|
| 355 |
+
:param wait: The time (milliseconds) the fetcher will wait after everything finishes before closing the page and returning the ` Response ` object.
|
| 356 |
+
:param wait_selector: Wait for a specific CSS selector to be in a specific state.
|
| 357 |
+
:param timezone_id: Changes the timezone of the browser. Defaults to the system timezone.
|
| 358 |
+
:param locale: Specify user locale, for example, `en-GB`, `de-DE`, etc. Locale will affect navigator.language value, Accept-Language request header value as well as number and date formatting
|
| 359 |
+
rules. Defaults to the system default locale.
|
| 360 |
+
:param wait_selector_state: The state to wait for the selector given with `wait_selector`. The default state is `attached`.
|
| 361 |
+
:param real_chrome: If you have a Chrome browser installed on your device, enable this, and the Fetcher will launch an instance of your browser and use it.
|
| 362 |
+
:param cdp_url: Instead of launching a new browser instance, connect to this CDP URL to control real browsers through CDP.
|
| 363 |
+
:param google_search: Enabled by default, Scrapling will set the referer header to be as if this request came from a Google search of this website's domain name.
|
| 364 |
+
:param extra_headers: A dictionary of extra headers to add to the request. _The referer set by the `google_search` argument takes priority over the referer set here if used together._
|
| 365 |
+
:param proxy: The proxy to be used with requests, it can be a string or a dictionary with the keys 'server', 'username', and 'password' only.
|
| 366 |
+
"""
|
| 367 |
+
async with AsyncDynamicSession(
|
| 368 |
+
wait=wait,
|
| 369 |
+
proxy=proxy,
|
| 370 |
+
locale=locale,
|
| 371 |
+
timeout=timeout,
|
| 372 |
+
cookies=cookies,
|
| 373 |
+
cdp_url=cdp_url,
|
| 374 |
+
headless=headless,
|
| 375 |
+
max_pages=len(urls),
|
| 376 |
+
useragent=useragent,
|
| 377 |
+
timezone_id=timezone_id,
|
| 378 |
+
real_chrome=real_chrome,
|
| 379 |
+
network_idle=network_idle,
|
| 380 |
+
wait_selector=wait_selector,
|
| 381 |
+
google_search=google_search,
|
| 382 |
+
extra_headers=extra_headers,
|
| 383 |
+
disable_resources=disable_resources,
|
| 384 |
+
wait_selector_state=wait_selector_state,
|
| 385 |
+
) as session:
|
| 386 |
+
tasks = [session.fetch(url) for url in urls]
|
| 387 |
+
responses = await gather(*tasks)
|
| 388 |
+
return [
|
| 389 |
+
_content_translator(
|
| 390 |
+
Convertor._extract_content(
|
| 391 |
+
page,
|
| 392 |
+
css_selector=css_selector,
|
| 393 |
+
extraction_type=extraction_type,
|
| 394 |
+
main_content_only=main_content_only,
|
| 395 |
+
),
|
| 396 |
+
page,
|
| 397 |
+
)
|
| 398 |
+
for page in responses
|
| 399 |
+
]
|
| 400 |
+
|
| 401 |
+
@staticmethod
async def stealthy_fetch(
    url: str,
    extraction_type: extraction_types = "markdown",
    css_selector: Optional[str] = None,
    main_content_only: bool = True,
    headless: bool = True,  # noqa: F821
    google_search: bool = True,
    real_chrome: bool = False,
    wait: int | float = 0,
    proxy: Optional[str | Dict[str, str]] = None,
    timezone_id: str | None = None,
    locale: str | None = None,
    extra_headers: Optional[Dict[str, str]] = None,
    useragent: Optional[str] = None,
    hide_canvas: bool = False,
    cdp_url: Optional[str] = None,
    timeout: int | float = 30000,
    disable_resources: bool = False,
    wait_selector: Optional[str] = None,
    cookies: Sequence[SetCookieParam] | None = None,
    network_idle: bool = False,
    wait_selector_state: SelectorWaitStates = "attached",
    block_webrtc: bool = False,
    allow_webgl: bool = True,
    solve_cloudflare: bool = False,
    additional_args: Optional[Dict] = None,
) -> ResponseModel:
    """Use the stealthy fetcher to fetch a URL and return a structured output of the result.
    Note: This is the only suitable fetcher for high protection levels.
    Note: If the `css_selector` resolves to more than one element, all the elements will be returned.

    :param url: The URL to request.
    :param extraction_type: The type of content to extract from the page. Defaults to "markdown". Options are:
        - Markdown will convert the page content to Markdown format.
        - HTML will return the raw HTML content of the page.
        - Text will return the text content of the page.
    :param css_selector: CSS selector to extract the content from the page. If main_content_only is True, then it will be executed on the main content of the page. Defaults to None.
    :param main_content_only: Whether to extract only the main content of the page. Defaults to True. The main content here is the data inside the `<body>` tag.
    :param headless: Run the browser in headless/hidden (default), or headful/visible mode.
    :param disable_resources: Drop requests for unnecessary resources for a speed boost.
        Requests dropped are of type `font`, `image`, `media`, `beacon`, `object`, `imageset`, `texttrack`, `websocket`, `csp_report`, and `stylesheet`.
    :param useragent: Pass a useragent string to be used. Otherwise the fetcher will generate a real Useragent of the same browser and use it.
    :param cookies: Set cookies for the next request.
    :param solve_cloudflare: Solves all types of the Cloudflare's Turnstile/Interstitial challenges before returning the response to you.
    :param allow_webgl: Enabled by default. Disabling WebGL is not recommended as many WAFs now check if WebGL is enabled.
    :param network_idle: Wait for the page until there are no network connections for at least 500 ms.
    :param wait: The time (milliseconds) the fetcher will wait after everything finishes before closing the page and returning the ` Response ` object.
    :param timeout: The timeout in milliseconds that is used in all operations and waits through the page. The default is 30,000
    :param wait_selector: Wait for a specific CSS selector to be in a specific state.
    :param timezone_id: Changes the timezone of the browser. Defaults to the system timezone.
    :param locale: Specify user locale, for example, `en-GB`, `de-DE`, etc. Locale will affect navigator.language value, Accept-Language request header value as well as number and date formatting
        rules. Defaults to the system default locale.
    :param wait_selector_state: The state to wait for the selector given with `wait_selector`. The default state is `attached`.
    :param real_chrome: If you have a Chrome browser installed on your device, enable this, and the Fetcher will launch an instance of your browser and use it.
    :param hide_canvas: Add random noise to canvas operations to prevent fingerprinting.
    :param block_webrtc: Forces WebRTC to respect proxy settings to prevent local IP address leak.
    :param cdp_url: Instead of launching a new browser instance, connect to this CDP URL to control real browsers through CDP.
    :param google_search: Enabled by default, Scrapling will set the referer header to be as if this request came from a Google search of this website's domain name.
    :param extra_headers: A dictionary of extra headers to add to the request. _The referer set by the `google_search` argument takes priority over the referer set here if used together._
    :param proxy: The proxy to be used with requests, it can be a string or a dictionary with the keys 'server', 'username', and 'password' only.
    :param additional_args: Additional arguments to be passed to Playwright's context as additional settings, and it takes higher priority than Scrapling's settings.
    """
    # NOTE: the docstring above is published verbatim as the MCP tool description
    # (see `serve`), so its wording is part of the tool's public surface.

    # Browser and stealth settings are forwarded unchanged to the stealthy fetcher.
    stealth_options = dict(
        wait=wait,
        proxy=proxy,
        locale=locale,
        cdp_url=cdp_url,
        timeout=timeout,
        cookies=cookies,
        headless=headless,
        useragent=useragent,
        timezone_id=timezone_id,
        real_chrome=real_chrome,
        hide_canvas=hide_canvas,
        allow_webgl=allow_webgl,
        network_idle=network_idle,
        block_webrtc=block_webrtc,
        wait_selector=wait_selector,
        google_search=google_search,
        extra_headers=extra_headers,
        additional_args=additional_args,
        solve_cloudflare=solve_cloudflare,
        disable_resources=disable_resources,
        wait_selector_state=wait_selector_state,
    )
    page = await StealthyFetcher.async_fetch(url, **stealth_options)

    # Extraction settings only affect how the fetched page is converted.
    extracted = Convertor._extract_content(
        page,
        css_selector=css_selector,
        extraction_type=extraction_type,
        main_content_only=main_content_only,
    )
    return _content_translator(extracted, page)
|
| 497 |
+
|
| 498 |
+
@staticmethod
|
| 499 |
+
async def bulk_stealthy_fetch(
|
| 500 |
+
urls: List[str],
|
| 501 |
+
extraction_type: extraction_types = "markdown",
|
| 502 |
+
css_selector: Optional[str] = None,
|
| 503 |
+
main_content_only: bool = True,
|
| 504 |
+
headless: bool = True, # noqa: F821
|
| 505 |
+
google_search: bool = True,
|
| 506 |
+
real_chrome: bool = False,
|
| 507 |
+
wait: int | float = 0,
|
| 508 |
+
proxy: Optional[str | Dict[str, str]] = None,
|
| 509 |
+
timezone_id: str | None = None,
|
| 510 |
+
locale: str | None = None,
|
| 511 |
+
extra_headers: Optional[Dict[str, str]] = None,
|
| 512 |
+
useragent: Optional[str] = None,
|
| 513 |
+
hide_canvas: bool = False,
|
| 514 |
+
cdp_url: Optional[str] = None,
|
| 515 |
+
timeout: int | float = 30000,
|
| 516 |
+
disable_resources: bool = False,
|
| 517 |
+
wait_selector: Optional[str] = None,
|
| 518 |
+
cookies: Sequence[SetCookieParam] | None = None,
|
| 519 |
+
network_idle: bool = False,
|
| 520 |
+
wait_selector_state: SelectorWaitStates = "attached",
|
| 521 |
+
block_webrtc: bool = False,
|
| 522 |
+
allow_webgl: bool = True,
|
| 523 |
+
solve_cloudflare: bool = False,
|
| 524 |
+
additional_args: Optional[Dict] = None,
|
| 525 |
+
) -> List[ResponseModel]:
|
| 526 |
+
"""Use the stealthy fetcher to fetch a group of URLs at the same time, and for each page return a structured output of the result.
|
| 527 |
+
Note: This is the only suitable fetcher for high protection levels.
|
| 528 |
+
Note: If the `css_selector` resolves to more than one element, all the elements will be returned.
|
| 529 |
+
|
| 530 |
+
:param urls: A list of the URLs to request.
|
| 531 |
+
:param extraction_type: The type of content to extract from the page. Defaults to "markdown". Options are:
|
| 532 |
+
- Markdown will convert the page content to Markdown format.
|
| 533 |
+
- HTML will return the raw HTML content of the page.
|
| 534 |
+
- Text will return the text content of the page.
|
| 535 |
+
:param css_selector: CSS selector to extract the content from the page. If main_content_only is True, then it will be executed on the main content of the page. Defaults to None.
|
| 536 |
+
:param main_content_only: Whether to extract only the main content of the page. Defaults to True. The main content here is the data inside the `<body>` tag.
|
| 537 |
+
:param headless: Run the browser in headless/hidden (default), or headful/visible mode.
|
| 538 |
+
:param disable_resources: Drop requests for unnecessary resources for a speed boost.
|
| 539 |
+
Requests dropped are of type `font`, `image`, `media`, `beacon`, `object`, `imageset`, `texttrack`, `websocket`, `csp_report`, and `stylesheet`.
|
| 540 |
+
:param useragent: Pass a useragent string to be used. Otherwise the fetcher will generate a real Useragent of the same browser and use it.
|
| 541 |
+
:param cookies: Set cookies for the next request.
|
| 542 |
+
:param solve_cloudflare: Solves all types of the Cloudflare's Turnstile/Interstitial challenges before returning the response to you.
|
| 543 |
+
:param allow_webgl: Enabled by default. Disabling WebGL is not recommended as many WAFs now check if WebGL is enabled.
|
| 544 |
+
:param network_idle: Wait for the page until there are no network connections for at least 500 ms.
|
| 545 |
+
:param wait: The time (milliseconds) the fetcher will wait after everything finishes before closing the page and returning the ` Response ` object.
|
| 546 |
+
:param timeout: The timeout in milliseconds that is used in all operations and waits through the page. The default is 30,000
|
| 547 |
+
:param wait_selector: Wait for a specific CSS selector to be in a specific state.
|
| 548 |
+
:param timezone_id: Changes the timezone of the browser. Defaults to the system timezone.
|
| 549 |
+
:param locale: Specify user locale, for example, `en-GB`, `de-DE`, etc. Locale will affect navigator.language value, Accept-Language request header value as well as number and date formatting
|
| 550 |
+
rules. Defaults to the system default locale.
|
| 551 |
+
:param wait_selector_state: The state to wait for the selector given with `wait_selector`. The default state is `attached`.
|
| 552 |
+
:param real_chrome: If you have a Chrome browser installed on your device, enable this, and the Fetcher will launch an instance of your browser and use it.
|
| 553 |
+
:param hide_canvas: Add random noise to canvas operations to prevent fingerprinting.
|
| 554 |
+
:param block_webrtc: Forces WebRTC to respect proxy settings to prevent local IP address leak.
|
| 555 |
+
:param cdp_url: Instead of launching a new browser instance, connect to this CDP URL to control real browsers through CDP.
|
| 556 |
+
:param google_search: Enabled by default, Scrapling will set the referer header to be as if this request came from a Google search of this website's domain name.
|
| 557 |
+
:param extra_headers: A dictionary of extra headers to add to the request. _The referer set by the `google_search` argument takes priority over the referer set here if used together._
|
| 558 |
+
:param proxy: The proxy to be used with requests, it can be a string or a dictionary with the keys 'server', 'username', and 'password' only.
|
| 559 |
+
:param additional_args: Additional arguments to be passed to Playwright's context as additional settings, and it takes higher priority than Scrapling's settings.
|
| 560 |
+
"""
|
| 561 |
+
async with AsyncStealthySession(
|
| 562 |
+
wait=wait,
|
| 563 |
+
proxy=proxy,
|
| 564 |
+
locale=locale,
|
| 565 |
+
cdp_url=cdp_url,
|
| 566 |
+
timeout=timeout,
|
| 567 |
+
cookies=cookies,
|
| 568 |
+
headless=headless,
|
| 569 |
+
useragent=useragent,
|
| 570 |
+
timezone_id=timezone_id,
|
| 571 |
+
real_chrome=real_chrome,
|
| 572 |
+
hide_canvas=hide_canvas,
|
| 573 |
+
allow_webgl=allow_webgl,
|
| 574 |
+
network_idle=network_idle,
|
| 575 |
+
block_webrtc=block_webrtc,
|
| 576 |
+
wait_selector=wait_selector,
|
| 577 |
+
google_search=google_search,
|
| 578 |
+
extra_headers=extra_headers,
|
| 579 |
+
additional_args=additional_args,
|
| 580 |
+
solve_cloudflare=solve_cloudflare,
|
| 581 |
+
disable_resources=disable_resources,
|
| 582 |
+
wait_selector_state=wait_selector_state,
|
| 583 |
+
) as session:
|
| 584 |
+
tasks = [session.fetch(url) for url in urls]
|
| 585 |
+
responses = await gather(*tasks)
|
| 586 |
+
return [
|
| 587 |
+
_content_translator(
|
| 588 |
+
Convertor._extract_content(
|
| 589 |
+
page,
|
| 590 |
+
css_selector=css_selector,
|
| 591 |
+
extraction_type=extraction_type,
|
| 592 |
+
main_content_only=main_content_only,
|
| 593 |
+
),
|
| 594 |
+
page,
|
| 595 |
+
)
|
| 596 |
+
for page in responses
|
| 597 |
+
]
|
| 598 |
+
|
| 599 |
+
def serve(self, http: bool, host: str, port: int):
    """Start the Scrapling MCP server and block until it stops.

    Every fetch tool of this class is registered with its docstring as the tool
    description, and two plain HTTP routes are added: `/health` and `/api-docs`.

    :param http: If True, serve the streamable-HTTP app through uvicorn (with the Gradio UI mounted at `/`
        when it can be imported). Otherwise, run the server over the stdio transport.
    :param host: Host passed to FastMCP and, in HTTP mode, to uvicorn.
    :param port: Port passed to FastMCP and, in HTTP mode, to uvicorn.
    """
    server = FastMCP(name="Scrapling", host=host, port=port)

    # Each tool is published under its own method name; its docstring becomes the description.
    tool_names = (
        "get",
        "bulk_get",
        "fetch",
        "bulk_fetch",
        "stealthy_fetch",
        "bulk_stealthy_fetch",
    )
    for tool_name in tool_names:
        handler = getattr(self, tool_name)
        server.add_tool(handler, title=tool_name, description=handler.__doc__, structured_output=True)

    @server.custom_route("/health", methods=["GET"])
    async def health_check(request: Request) -> Response:
        return JSONResponse({"status": "healthy"})

    @server.custom_route("/api-docs", methods=["GET"])
    async def api_docs(request: Request) -> Response:
        registered = await server.list_tools()
        return JSONResponse([entry.model_dump() for entry in registered])

    if not http:
        server.run(transport="stdio")
        return

    # uvicorn is only needed (and only imported) for the HTTP transport.
    import uvicorn

    # Get the Starlette app from FastMCP
    mcp_app = server.streamable_http_app()

    try:
        import gradio as gr
        from scrapling.ui import create_ui

        # `mount_gradio_app` adds the Gradio routes to `mcp_app` and returns the app to serve.
        app = gr.mount_gradio_app(mcp_app, create_ui(), path="/")
    except ImportError:  # also covers ModuleNotFoundError, which subclasses ImportError
        app = mcp_app
        print("Gradio not installed or failed to load, running MCP server only.")

    uvicorn.run(app, host=host, port=port)
|
core/custom_types.py
ADDED
|
@@ -0,0 +1,345 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
from collections.abc import Mapping
|
| 2 |
+
from types import MappingProxyType
|
| 3 |
+
from re import compile as re_compile, UNICODE, IGNORECASE
|
| 4 |
+
|
| 5 |
+
from orjson import dumps, loads
|
| 6 |
+
from w3lib.html import replace_entities as _replace_entities
|
| 7 |
+
|
| 8 |
+
from scrapling.core._types import (
|
| 9 |
+
Any,
|
| 10 |
+
cast,
|
| 11 |
+
Dict,
|
| 12 |
+
List,
|
| 13 |
+
Union,
|
| 14 |
+
overload,
|
| 15 |
+
TypeVar,
|
| 16 |
+
Literal,
|
| 17 |
+
Pattern,
|
| 18 |
+
Iterable,
|
| 19 |
+
Generator,
|
| 20 |
+
SupportsIndex,
|
| 21 |
+
)
|
| 22 |
+
from scrapling.core.utils import _is_iterable, flatten, __CONSECUTIVE_SPACES_REGEX__
|
| 23 |
+
|
| 24 |
+
# Type variable bound to TextHandler (and its subclasses)
|
| 25 |
+
_TextHandlerType = TypeVar("_TextHandlerType", bound="TextHandler")
|
| 26 |
+
__CLEANING_TABLE__ = str.maketrans("\t\r\n", " ")
|
| 27 |
+
|
| 28 |
+
|
| 29 |
+
class TextHandler(str):
|
| 30 |
+
"""Extends standard Python string by adding more functionality"""
|
| 31 |
+
|
| 32 |
+
__slots__ = ()
|
| 33 |
+
|
| 34 |
+
def __getitem__(self, key: SupportsIndex | slice) -> "TextHandler": # pragma: no cover
|
| 35 |
+
lst = super().__getitem__(key)
|
| 36 |
+
return TextHandler(lst)
|
| 37 |
+
|
| 38 |
+
def split(self, sep: str | None = None, maxsplit: SupportsIndex = -1) -> list[Any]: # pragma: no cover
|
| 39 |
+
return TextHandlers([TextHandler(s) for s in super().split(sep, maxsplit)])
|
| 40 |
+
|
| 41 |
+
def strip(self, chars: str | None = None) -> Union[str, "TextHandler"]: # pragma: no cover
|
| 42 |
+
return TextHandler(super().strip(chars))
|
| 43 |
+
|
| 44 |
+
def lstrip(self, chars: str | None = None) -> Union[str, "TextHandler"]: # pragma: no cover
|
| 45 |
+
return TextHandler(super().lstrip(chars))
|
| 46 |
+
|
| 47 |
+
def rstrip(self, chars: str | None = None) -> Union[str, "TextHandler"]: # pragma: no cover
|
| 48 |
+
return TextHandler(super().rstrip(chars))
|
| 49 |
+
|
| 50 |
+
def capitalize(self) -> Union[str, "TextHandler"]: # pragma: no cover
|
| 51 |
+
return TextHandler(super().capitalize())
|
| 52 |
+
|
| 53 |
+
def casefold(self) -> Union[str, "TextHandler"]: # pragma: no cover
|
| 54 |
+
return TextHandler(super().casefold())
|
| 55 |
+
|
| 56 |
+
def center(self, width: SupportsIndex, fillchar: str = " ") -> Union[str, "TextHandler"]: # pragma: no cover
|
| 57 |
+
return TextHandler(super().center(width, fillchar))
|
| 58 |
+
|
| 59 |
+
def expandtabs(self, tabsize: SupportsIndex = 8) -> Union[str, "TextHandler"]: # pragma: no cover
|
| 60 |
+
return TextHandler(super().expandtabs(tabsize))
|
| 61 |
+
|
| 62 |
+
def format(self, *args: object, **kwargs: object) -> Union[str, "TextHandler"]: # pragma: no cover
|
| 63 |
+
return TextHandler(super().format(*args, **kwargs))
|
| 64 |
+
|
| 65 |
+
def format_map(self, mapping) -> Union[str, "TextHandler"]: # pragma: no cover
|
| 66 |
+
return TextHandler(super().format_map(mapping))
|
| 67 |
+
|
| 68 |
+
def join(self, iterable: Iterable[str]) -> Union[str, "TextHandler"]: # pragma: no cover
|
| 69 |
+
return TextHandler(super().join(iterable))
|
| 70 |
+
|
| 71 |
+
def ljust(self, width: SupportsIndex, fillchar: str = " ") -> Union[str, "TextHandler"]: # pragma: no cover
|
| 72 |
+
return TextHandler(super().ljust(width, fillchar))
|
| 73 |
+
|
| 74 |
+
def rjust(self, width: SupportsIndex, fillchar: str = " ") -> Union[str, "TextHandler"]: # pragma: no cover
|
| 75 |
+
return TextHandler(super().rjust(width, fillchar))
|
| 76 |
+
|
| 77 |
+
def swapcase(self) -> Union[str, "TextHandler"]: # pragma: no cover
|
| 78 |
+
return TextHandler(super().swapcase())
|
| 79 |
+
|
| 80 |
+
def title(self) -> Union[str, "TextHandler"]: # pragma: no cover
|
| 81 |
+
return TextHandler(super().title())
|
| 82 |
+
|
| 83 |
+
def translate(self, table) -> Union[str, "TextHandler"]: # pragma: no cover
|
| 84 |
+
return TextHandler(super().translate(table))
|
| 85 |
+
|
| 86 |
+
def zfill(self, width: SupportsIndex) -> Union[str, "TextHandler"]: # pragma: no cover
|
| 87 |
+
return TextHandler(super().zfill(width))
|
| 88 |
+
|
| 89 |
+
def replace(self, old: str, new: str, count: SupportsIndex = -1) -> Union[str, "TextHandler"]:
|
| 90 |
+
return TextHandler(super().replace(old, new, count))
|
| 91 |
+
|
| 92 |
+
def upper(self) -> Union[str, "TextHandler"]:
|
| 93 |
+
return TextHandler(super().upper())
|
| 94 |
+
|
| 95 |
+
def lower(self) -> Union[str, "TextHandler"]:
|
| 96 |
+
return TextHandler(super().lower())
|
| 97 |
+
|
| 98 |
+
##############
|
| 99 |
+
|
| 100 |
+
def sort(self, reverse: bool = False) -> Union[str, "TextHandler"]:
|
| 101 |
+
"""Return a sorted version of the string"""
|
| 102 |
+
return self.__class__("".join(sorted(self, reverse=reverse)))
|
| 103 |
+
|
| 104 |
+
def clean(self, remove_entities=False) -> Union[str, "TextHandler"]:
|
| 105 |
+
"""Return a new version of the string after removing all white spaces and consecutive spaces"""
|
| 106 |
+
data = self.translate(__CLEANING_TABLE__)
|
| 107 |
+
if remove_entities:
|
| 108 |
+
data = _replace_entities(data)
|
| 109 |
+
return self.__class__(__CONSECUTIVE_SPACES_REGEX__.sub(" ", data).strip())
|
| 110 |
+
|
| 111 |
+
# For easy copy-paste from Scrapy/parsel code when needed :)
|
| 112 |
+
def get(self, default=None): # pragma: no cover
|
| 113 |
+
return self
|
| 114 |
+
|
| 115 |
+
def get_all(self): # pragma: no cover
|
| 116 |
+
return self
|
| 117 |
+
|
| 118 |
+
extract = get_all
|
| 119 |
+
extract_first = get
|
| 120 |
+
|
| 121 |
+
def json(self) -> Dict:
|
| 122 |
+
"""Return JSON response if the response is jsonable otherwise throw error"""
|
| 123 |
+
# Using str function as a workaround for orjson issue with subclasses of str.
|
| 124 |
+
# Check this out: https://github.com/ijl/orjson/issues/445
|
| 125 |
+
return loads(str(self))
|
| 126 |
+
|
| 127 |
+
@overload
|
| 128 |
+
def re(
|
| 129 |
+
self,
|
| 130 |
+
regex: str | Pattern,
|
| 131 |
+
replace_entities: bool = True,
|
| 132 |
+
clean_match: bool = False,
|
| 133 |
+
case_sensitive: bool = True,
|
| 134 |
+
*,
|
| 135 |
+
check_match: Literal[True],
|
| 136 |
+
) -> bool: ...
|
| 137 |
+
|
| 138 |
+
@overload
|
| 139 |
+
def re(
|
| 140 |
+
self,
|
| 141 |
+
regex: str | Pattern,
|
| 142 |
+
replace_entities: bool = True,
|
| 143 |
+
clean_match: bool = False,
|
| 144 |
+
case_sensitive: bool = True,
|
| 145 |
+
check_match: Literal[False] = False,
|
| 146 |
+
) -> "TextHandlers": ...
|
| 147 |
+
|
| 148 |
+
def re(
|
| 149 |
+
self,
|
| 150 |
+
regex: str | Pattern,
|
| 151 |
+
replace_entities: bool = True,
|
| 152 |
+
clean_match: bool = False,
|
| 153 |
+
case_sensitive: bool = True,
|
| 154 |
+
check_match: bool = False,
|
| 155 |
+
) -> Union["TextHandlers", bool]:
|
| 156 |
+
"""Apply the given regex to the current text and return a list of strings with the matches.
|
| 157 |
+
|
| 158 |
+
:param regex: Can be either a compiled regular expression or a string.
|
| 159 |
+
:param replace_entities: If enabled character entity references are replaced by their corresponding character
|
| 160 |
+
:param clean_match: If enabled, this will ignore all whitespaces and consecutive spaces while matching
|
| 161 |
+
:param case_sensitive: If disabled, function will set the regex to ignore the letters-case while compiling it
|
| 162 |
+
:param check_match: Used to quickly check if this regex matches or not without any operations on the results
|
| 163 |
+
|
| 164 |
+
"""
|
| 165 |
+
if isinstance(regex, str):
|
| 166 |
+
if case_sensitive:
|
| 167 |
+
regex = re_compile(regex, UNICODE)
|
| 168 |
+
else:
|
| 169 |
+
regex = re_compile(regex, flags=UNICODE | IGNORECASE)
|
| 170 |
+
|
| 171 |
+
input_text = self.clean() if clean_match else self
|
| 172 |
+
results = regex.findall(input_text)
|
| 173 |
+
if check_match:
|
| 174 |
+
return bool(results)
|
| 175 |
+
|
| 176 |
+
if all(_is_iterable(res) for res in results):
|
| 177 |
+
results = flatten(results)
|
| 178 |
+
|
| 179 |
+
if not replace_entities:
|
| 180 |
+
return TextHandlers([TextHandler(string) for string in results])
|
| 181 |
+
|
| 182 |
+
return TextHandlers([TextHandler(_replace_entities(s)) for s in results])
|
| 183 |
+
|
| 184 |
+
def re_first(
|
| 185 |
+
self,
|
| 186 |
+
regex: str | Pattern,
|
| 187 |
+
default: Any = None,
|
| 188 |
+
replace_entities: bool = True,
|
| 189 |
+
clean_match: bool = False,
|
| 190 |
+
case_sensitive: bool = True,
|
| 191 |
+
) -> "TextHandler":
|
| 192 |
+
"""Apply the given regex to text and return the first match if found, otherwise return the default value.
|
| 193 |
+
|
| 194 |
+
:param regex: Can be either a compiled regular expression or a string.
|
| 195 |
+
:param default: The default value to be returned if there is no match
|
| 196 |
+
:param replace_entities: If enabled character entity references are replaced by their corresponding character
|
| 197 |
+
:param clean_match: If enabled, this will ignore all whitespaces and consecutive spaces while matching
|
| 198 |
+
:param case_sensitive: If disabled, function will set the regex to ignore the letters-case while compiling it
|
| 199 |
+
|
| 200 |
+
"""
|
| 201 |
+
result = self.re(
|
| 202 |
+
regex,
|
| 203 |
+
replace_entities,
|
| 204 |
+
clean_match=clean_match,
|
| 205 |
+
case_sensitive=case_sensitive,
|
| 206 |
+
)
|
| 207 |
+
return result[0] if result else default
|
| 208 |
+
|
| 209 |
+
|
| 210 |
+
class TextHandlers(List[TextHandler]):
    """
    The :class:`TextHandlers` class is a subclass of the builtin ``List`` class, which provides a few additional methods.
    """

    __slots__ = ()

    @overload
    def __getitem__(self, pos: SupportsIndex) -> TextHandler:  # pragma: no cover
        pass

    @overload
    def __getitem__(self, pos: slice) -> "TextHandlers":  # pragma: no cover
        pass

    def __getitem__(self, pos: SupportsIndex | slice) -> Union[TextHandler, "TextHandlers"]:
        lst = super().__getitem__(pos)
        # A slice stays a TextHandlers list; a single index yields a TextHandler
        if isinstance(pos, slice):
            return TextHandlers(cast(List[TextHandler], lst))
        return TextHandler(cast(TextHandler, lst))

    def re(
        self,
        regex: str | Pattern,
        replace_entities: bool = True,
        clean_match: bool = False,
        case_sensitive: bool = True,
    ) -> "TextHandlers":
        """Call the ``.re()`` method for each element in this list and return
        their results flattened as TextHandlers.

        :param regex: Can be either a compiled regular expression or a string.
        :param replace_entities: If enabled character entity references are replaced by their corresponding character
        :param clean_match: if enabled, this will ignore all whitespaces and consecutive spaces while matching
        :param case_sensitive: if disabled, the function will set the regex to ignore the letters-case while compiling it
        """
        results = [n.re(regex, replace_entities, clean_match, case_sensitive) for n in self]
        return TextHandlers(flatten(results))

    def re_first(
        self,
        regex: str | Pattern,
        default: Any = None,
        replace_entities: bool = True,
        clean_match: bool = False,
        case_sensitive: bool = True,
    ) -> TextHandler:  # pragma: no cover
        """Call the ``.re_first()`` method for each element in this list and return
        the first result or the default value otherwise.

        :param regex: Can be either a compiled regular expression or a string.
        :param default: The default value to be returned if there is no match
        :param replace_entities: If enabled character entity references are replaced by their corresponding character
        :param clean_match: If enabled, this will ignore all whitespaces and consecutive spaces while matching
        :param case_sensitive: If disabled, function will set the regex to ignore the letters-case while compiling it
        """
        # Stop at the very first match found in the first element that has one
        for n in self:
            for result in n.re(regex, replace_entities, clean_match, case_sensitive):
                return result
        return default

    # For easy copy-paste from Scrapy/parsel code when needed :)
    def get(self, default=None):
        """Returns the first item of the current list
        :param default: the default value to return if the current list is empty
        """
        return self[0] if len(self) > 0 else default

    def extract(self):
        return self

    extract_first = get
    get_all = extract
|
| 283 |
+
|
| 284 |
+
|
| 285 |
+
class AttributesHandler(Mapping[str, _TextHandlerType]):
    """A read-only mapping to use instead of the standard dictionary for the speed boost, but at the same time I use it to add more functionalities.
    If the standard dictionary is needed, convert this class to a dictionary with the `dict` function
    """

    __slots__ = ("_data",)

    def __init__(self, mapping: Any = None, **kwargs: Any) -> None:
        """Build the read-only mapping from ``mapping`` and/or keyword arguments.

        String values are wrapped in ``TextHandler``; any other value is stored unchanged.
        Keyword arguments override keys that are also present in ``mapping``.
        """
        mapping = (
            {key: TextHandler(value) if isinstance(value, str) else value for key, value in mapping.items()}
            if mapping is not None
            else {}
        )

        if kwargs:
            mapping.update(
                {key: TextHandler(value) if isinstance(value, str) else value for key, value in kwargs.items()}
            )

        # Fastest read-only mapping type
        self._data: Mapping[str, Any] = MappingProxyType(mapping)

    def get(self, key: str, default: Any = None) -> _TextHandlerType:
        """Acts like the standard dictionary `.get()` method"""
        return self._data.get(key, default)

    def search_values(self, keyword: str, partial: bool = False) -> Generator["AttributesHandler", None, None]:
        """Search current attributes by values and return a dictionary of each matching item
        :param keyword: The keyword to search for in the attribute values
        :param partial: If True, the function will search if keyword in each value instead of perfect match
        """
        for key, value in self._data.items():
            if partial:
                # NOTE(review): `in` assumes the value supports containment (e.g. a string) - confirm for non-str values
                if keyword in value:
                    yield AttributesHandler({key: value})
            else:
                if keyword == value:
                    yield AttributesHandler({key: value})

    @property
    def json_string(self) -> bytes:
        """Convert current attributes to JSON bytes if the attributes are JSON serializable otherwise throws error"""
        return dumps(dict(self._data))

    def __getitem__(self, key: str) -> _TextHandlerType:
        return self._data[key]

    def __iter__(self):
        return iter(self._data)

    def __len__(self):
        return len(self._data)

    def __repr__(self):
        return f"{self.__class__.__name__}({self._data})"

    def __str__(self):
        return str(self._data)

    def __contains__(self, key):
        return key in self._data
|
core/mixins.py
ADDED
|
@@ -0,0 +1,85 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
from scrapling.core._types import Any, Dict
|
| 2 |
+
|
| 3 |
+
|
| 4 |
+
class SelectorsGeneration:
    """
    Functions for generating selectors
    Trying to generate selectors like Firefox or maybe cleaner ones!? Ehm
    Inspiration: https://searchfox.org/mozilla-central/source/devtools/shared/inspector/css-logic.js#591
    """

    # Note: This is a mixin class meant to be used with Selector.
    # The methods access Selector attributes (._root, .parent, .attrib, .tag, etc.)
    # through self, which will be a Selector instance at runtime.

    def _general_selection(self: Any, selection: str = "css", full_path: bool = False) -> str:
        """Generate a selector for the current element.

        :param selection: ``"css"`` (case-insensitive) for a CSS selector; any other value produces XPath.
        :param full_path: If False, stop at the nearest element (the current one included) that has an ``id``.
            If True, keep walking up until the ``html`` element or an element without a parent is reached.
        :return: A string of the generated selector. An empty string is returned for text nodes.
        """
        if self._is_text_node(self._root):
            return ""

        selectorPath = []
        target = self
        css = selection.lower() == "css"

        def render() -> str:
            # The steps were collected from the element upwards, so reverse them into document order
            steps = reversed(selectorPath)
            return " > ".join(steps) if css else "//" + "/".join(steps)

        while target is not None:
            if target.parent:
                if target.attrib.get("id"):
                    # id is enough
                    # In XPath the predicate needs a node test (`*`) in front of it; a bare `[@id='x']`
                    # step is only valid right after `//`, so it broke every full-path XPath selector.
                    part = f"#{target.attrib['id']}" if css else f"*[@id='{target.attrib['id']}']"
                    selectorPath.append(part)
                    if not full_path:
                        return render()
                else:
                    part = f"{target.tag}"
                    # We won't use classes anymore because some websites share exact classes between elements
                    # Count same-tag siblings up to and including the target to get its position
                    counter: Dict[str, int] = {}
                    for child in target.parent.children:
                        counter.setdefault(child.tag, 0)
                        counter[child.tag] += 1
                        if child._root == target._root:
                            break

                    if counter[target.tag] > 1:
                        part += f":nth-of-type({counter[target.tag]})" if css else f"[{counter[target.tag]}]"

                    selectorPath.append(part)

                target = target.parent
                if target is None or target.tag == "html":
                    return render()
            else:
                break

        return render()

    @property
    def generate_css_selector(self: Any) -> str:
        """Generate a CSS selector for the current element
        :return: A string of the generated selector.
        """
        return self._general_selection()

    @property
    def generate_full_css_selector(self: Any) -> str:
        """Generate a complete CSS selector for the current element
        :return: A string of the generated selector.
        """
        return self._general_selection(full_path=True)

    @property
    def generate_xpath_selector(self: Any) -> str:
        """Generate an XPath selector for the current element
        :return: A string of the generated selector.
        """
        return self._general_selection("xpath")

    @property
    def generate_full_xpath_selector(self: Any) -> str:
        """Generate a complete XPath selector for the current element
        :return: A string of the generated selector.
        """
        return self._general_selection("xpath", full_path=True)
|
core/shell.py
ADDED
|
@@ -0,0 +1,643 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# -*- coding: utf-8 -*-
|
| 2 |
+
from sys import stderr
|
| 3 |
+
from copy import deepcopy
|
| 4 |
+
from functools import wraps
|
| 5 |
+
from re import sub as re_sub
|
| 6 |
+
from collections import namedtuple
|
| 7 |
+
from shlex import split as shlex_split
|
| 8 |
+
from inspect import signature, Parameter
|
| 9 |
+
from tempfile import mkstemp as make_temp_file
|
| 10 |
+
from argparse import ArgumentParser, SUPPRESS
|
| 11 |
+
from webbrowser import open as open_in_browser
|
| 12 |
+
from urllib.parse import urlparse, urlunparse, parse_qsl
|
| 13 |
+
from logging import (
|
| 14 |
+
DEBUG,
|
| 15 |
+
INFO,
|
| 16 |
+
WARNING,
|
| 17 |
+
ERROR,
|
| 18 |
+
CRITICAL,
|
| 19 |
+
FATAL,
|
| 20 |
+
getLogger,
|
| 21 |
+
getLevelName,
|
| 22 |
+
)
|
| 23 |
+
|
| 24 |
+
from orjson import loads as json_loads, JSONDecodeError
|
| 25 |
+
|
| 26 |
+
from ._shell_signatures import Signatures_map
|
| 27 |
+
from scrapling import __version__
|
| 28 |
+
from scrapling.core.utils import log
|
| 29 |
+
from scrapling.parser import Selector, Selectors
|
| 30 |
+
from scrapling.core.custom_types import TextHandler
|
| 31 |
+
from scrapling.engines.toolbelt.custom import Response
|
| 32 |
+
from scrapling.core.utils._shell import _ParseHeaders, _CookieParser
|
| 33 |
+
from scrapling.core._types import (
|
| 34 |
+
Callable,
|
| 35 |
+
Dict,
|
| 36 |
+
Any,
|
| 37 |
+
cast,
|
| 38 |
+
Optional,
|
| 39 |
+
Generator,
|
| 40 |
+
extraction_types,
|
| 41 |
+
)
|
| 42 |
+
|
| 43 |
+
|
| 44 |
+
# Maps lowercase level names to the corresponding `logging` level constants
_known_logging_levels = {
    "debug": DEBUG,
    "info": INFO,
    "warning": WARNING,
    "error": ERROR,
    "critical": CRITICAL,
    "fatal": FATAL,
}
|
| 52 |
+
|
| 53 |
+
|
| 54 |
+
# Define the structure for parsed context - Simplified for Fetcher args
Request = namedtuple(
    "Request",
    [
        "method",
        "url",
        "params",
        "data",  # Can be str, bytes, or dict (for urlencoded)
        "json_data",  # Python object (dict/list) for JSON payload
        "headers",
        "cookies",
        "proxy",
        "follow_redirects",  # Added for -L flag
    ],
)
|
| 69 |
+
|
| 70 |
+
|
| 71 |
+
# Suppress exit on error to handle parsing errors gracefully
class NoExitArgumentParser(ArgumentParser):  # pragma: no cover
    """An ``ArgumentParser`` that logs and raises ``ValueError`` instead of terminating the process."""

    def error(self, message):
        """Log the parsing error and raise it as ``ValueError``."""
        log.error(f"Curl arguments parsing error: {message}")
        raise ValueError(f"Curl arguments parsing error: {message}")

    def exit(self, status=0, message=None):
        """Log and print the message (if any), then raise ``ValueError`` rather than exiting."""
        if message:
            log.error(f"Scrapling shell exited with status {status}: {message}")
            self._print_message(message, stderr)
        raise ValueError(f"Scrapling shell exited with status {status}: {message or 'Unknown reason'}")
|
| 82 |
+
|
| 83 |
+
|
| 84 |
+
class CurlParser:
|
| 85 |
+
"""Builds the argument parser for relevant curl flags from DevTools."""
|
| 86 |
+
|
| 87 |
+
def __init__(self) -> None:
|
| 88 |
+
from scrapling.fetchers import Fetcher as __Fetcher
|
| 89 |
+
|
| 90 |
+
self.__fetcher = __Fetcher
|
| 91 |
+
# We will use argparse parser to parse the curl command directly instead of regex
|
| 92 |
+
# We will focus more on flags that will show up on curl commands copied from DevTools's network tab
|
| 93 |
+
_parser = NoExitArgumentParser(add_help=False) # Disable default help
|
| 94 |
+
# Basic curl arguments
|
| 95 |
+
_parser.add_argument("curl_command_placeholder", nargs="?", help=SUPPRESS)
|
| 96 |
+
_parser.add_argument("url")
|
| 97 |
+
_parser.add_argument("-X", "--request", dest="method", default=None)
|
| 98 |
+
_parser.add_argument("-H", "--header", action="append", default=[])
|
| 99 |
+
_parser.add_argument(
|
| 100 |
+
"-A", "--user-agent", help="Will be parsed from -H if present"
|
| 101 |
+
) # Note: DevTools usually includes this in -H
|
| 102 |
+
|
| 103 |
+
# Data arguments (prioritizing types common from DevTools)
|
| 104 |
+
_parser.add_argument("-d", "--data", default=None)
|
| 105 |
+
_parser.add_argument("--data-raw", default=None) # Often used by browsers for JSON body
|
| 106 |
+
_parser.add_argument("--data-binary", default=None)
|
| 107 |
+
# Keep urlencode for completeness, though less common from browser copy/paste
|
| 108 |
+
_parser.add_argument("--data-urlencode", action="append", default=[])
|
| 109 |
+
_parser.add_argument("-G", "--get", action="store_true") # Use GET and put data in URL
|
| 110 |
+
|
| 111 |
+
_parser.add_argument(
|
| 112 |
+
"-b",
|
| 113 |
+
"--cookie",
|
| 114 |
+
default=None,
|
| 115 |
+
help="Send cookies from string/file (string format used by DevTools)",
|
| 116 |
+
)
|
| 117 |
+
|
| 118 |
+
# Proxy
|
| 119 |
+
_parser.add_argument("-x", "--proxy", default=None)
|
| 120 |
+
_parser.add_argument("-U", "--proxy-user", default=None) # Basic proxy auth
|
| 121 |
+
|
| 122 |
+
# Connection/Security
|
| 123 |
+
_parser.add_argument("-k", "--insecure", action="store_true")
|
| 124 |
+
_parser.add_argument("--compressed", action="store_true") # Very common from browsers
|
| 125 |
+
|
| 126 |
+
# Other flags often included but may not map directly to request args
|
| 127 |
+
_parser.add_argument("-i", "--include", action="store_true")
|
| 128 |
+
_parser.add_argument("-s", "--silent", action="store_true")
|
| 129 |
+
_parser.add_argument("-v", "--verbose", action="store_true")
|
| 130 |
+
|
| 131 |
+
self.parser: NoExitArgumentParser = _parser
|
| 132 |
+
self._supported_methods = ("get", "post", "put", "delete")
|
| 133 |
+
|
| 134 |
+
# --- Main Parsing Logic ---
|
| 135 |
+
def parse(self, curl_command: str) -> Optional[Request]:
|
| 136 |
+
"""Parses the curl command string into a structured context for Fetcher."""
|
| 137 |
+
|
| 138 |
+
clean_command = curl_command.strip().lstrip("curl").strip().replace("\\\n", " ")
|
| 139 |
+
|
| 140 |
+
try:
|
| 141 |
+
tokens = shlex_split(clean_command) # Split the string using shell-like syntax
|
| 142 |
+
except ValueError as e: # pragma: no cover
|
| 143 |
+
log.error(f"Could not split command line: {e}")
|
| 144 |
+
return None
|
| 145 |
+
|
| 146 |
+
try:
|
| 147 |
+
parsed_args, unknown = self.parser.parse_known_args(tokens)
|
| 148 |
+
if unknown:
|
| 149 |
+
raise AttributeError(f"Unknown/Unsupported curl arguments: {unknown}")
|
| 150 |
+
|
| 151 |
+
except ValueError: # pragma: no cover
|
| 152 |
+
return None
|
| 153 |
+
|
| 154 |
+
except AttributeError:
|
| 155 |
+
raise
|
| 156 |
+
|
| 157 |
+
except Exception as e: # pragma: no cover
|
| 158 |
+
log.error(f"An unexpected error occurred during curl arguments parsing: {e}")
|
| 159 |
+
return None
|
| 160 |
+
|
| 161 |
+
# --- Determine Method ---
|
| 162 |
+
method = "get" # Default
|
| 163 |
+
if parsed_args.get: # `-G` forces GET
|
| 164 |
+
method = "get"
|
| 165 |
+
|
| 166 |
+
elif parsed_args.method:
|
| 167 |
+
method = parsed_args.method.strip().lower()
|
| 168 |
+
|
| 169 |
+
# Infer POST if data is present (unless overridden by -X or -G)
|
| 170 |
+
elif any(
|
| 171 |
+
[
|
| 172 |
+
parsed_args.data,
|
| 173 |
+
parsed_args.data_raw,
|
| 174 |
+
parsed_args.data_binary,
|
| 175 |
+
parsed_args.data_urlencode,
|
| 176 |
+
]
|
| 177 |
+
):
|
| 178 |
+
method = "post"
|
| 179 |
+
|
| 180 |
+
headers, cookies = _ParseHeaders(parsed_args.header)
|
| 181 |
+
|
| 182 |
+
if parsed_args.cookie:
|
| 183 |
+
# We are focusing on the string format from DevTools.
|
| 184 |
+
try:
|
| 185 |
+
for key, value in _CookieParser(parsed_args.cookie):
|
| 186 |
+
# Update the cookie dict, potentially overwriting cookies with the same name from -H 'cookie:'
|
| 187 |
+
cookies[key] = value
|
| 188 |
+
log.debug(f"Parsed cookies from -b argument: {list(cookies.keys())}")
|
| 189 |
+
except Exception as e: # pragma: no cover
|
| 190 |
+
log.error(f"Could not parse cookie string from -b '{parsed_args.cookie}': {e}")
|
| 191 |
+
|
| 192 |
+
# --- Process Data Payload ---
|
| 193 |
+
params = dict()
|
| 194 |
+
data_payload: Optional[str | bytes | Dict] = None
|
| 195 |
+
json_payload: Optional[Any] = None
|
| 196 |
+
|
| 197 |
+
# DevTools often uses --data-raw for JSON bodies
|
| 198 |
+
# Precedence: --data-binary > --data-raw / -d > --data-urlencode
|
| 199 |
+
if parsed_args.data_binary is not None: # pragma: no cover
|
| 200 |
+
try:
|
| 201 |
+
data_payload = parsed_args.data_binary.encode("utf-8")
|
| 202 |
+
log.debug("Using data from --data-binary as bytes.")
|
| 203 |
+
except Exception as e:
|
| 204 |
+
log.warning(
|
| 205 |
+
f"Could not encode binary data '{parsed_args.data_binary}' as bytes: {e}. Using raw string."
|
| 206 |
+
)
|
| 207 |
+
data_payload = parsed_args.data_binary # Fallback to string
|
| 208 |
+
|
| 209 |
+
elif parsed_args.data_raw is not None:
|
| 210 |
+
data_payload = parsed_args.data_raw.lstrip("$")
|
| 211 |
+
|
| 212 |
+
elif parsed_args.data is not None:
|
| 213 |
+
data_payload = parsed_args.data
|
| 214 |
+
|
| 215 |
+
elif parsed_args.data_urlencode: # pragma: no cover
|
| 216 |
+
# Combine and parse urlencoded data
|
| 217 |
+
combined_data = "&".join(parsed_args.data_urlencode)
|
| 218 |
+
try:
|
| 219 |
+
data_payload = dict(parse_qsl(combined_data, keep_blank_values=True))
|
| 220 |
+
except Exception as e:
|
| 221 |
+
log.warning(f"Could not parse urlencoded data '{combined_data}': {e}. Treating as raw string.")
|
| 222 |
+
data_payload = combined_data
|
| 223 |
+
|
| 224 |
+
# Check if raw data looks like JSON, prefer 'json' param if so
|
| 225 |
+
if isinstance(data_payload, str):
|
| 226 |
+
try:
|
| 227 |
+
maybe_json = json_loads(data_payload)
|
| 228 |
+
if isinstance(maybe_json, (dict, list)):
|
| 229 |
+
json_payload = maybe_json
|
| 230 |
+
data_payload = None
|
| 231 |
+
except JSONDecodeError:
|
| 232 |
+
pass # Not JSON, keep it in data_payload
|
| 233 |
+
|
| 234 |
+
# Handle `-G`: Move data to params if the method is GET
|
| 235 |
+
if method == "get" and data_payload: # pragma: no cover
|
| 236 |
+
if isinstance(data_payload, dict): # From --data-urlencode likely
|
| 237 |
+
params.update(data_payload)
|
| 238 |
+
elif isinstance(data_payload, str):
|
| 239 |
+
try:
|
| 240 |
+
params.update(dict(parse_qsl(data_payload, keep_blank_values=True)))
|
| 241 |
+
except ValueError:
|
| 242 |
+
log.warning(f"Could not parse data '{data_payload}' into GET parameters for -G.")
|
| 243 |
+
|
| 244 |
+
if params:
|
| 245 |
+
data_payload = None # Clear data as it's moved to params
|
| 246 |
+
json_payload = None # Should not have JSON body with -G
|
| 247 |
+
|
| 248 |
+
# --- Process Proxy ---
|
| 249 |
+
proxies: Optional[Dict[str, str]] = None
|
| 250 |
+
if parsed_args.proxy:
|
| 251 |
+
proxy_url = f"http://{parsed_args.proxy}" if "://" not in parsed_args.proxy else parsed_args.proxy
|
| 252 |
+
|
| 253 |
+
if parsed_args.proxy_user:
|
| 254 |
+
user_pass = parsed_args.proxy_user
|
| 255 |
+
parts = urlparse(proxy_url)
|
| 256 |
+
netloc_parts = parts.netloc.split("@")
|
| 257 |
+
netloc = f"{user_pass}@{netloc_parts[-1]}" if len(netloc_parts) > 1 else f"{user_pass}@{parts.netloc}"
|
| 258 |
+
proxy_url = urlunparse(
|
| 259 |
+
(
|
| 260 |
+
parts.scheme,
|
| 261 |
+
netloc,
|
| 262 |
+
parts.path,
|
| 263 |
+
parts.params,
|
| 264 |
+
parts.query,
|
| 265 |
+
parts.fragment,
|
| 266 |
+
)
|
| 267 |
+
)
|
| 268 |
+
|
| 269 |
+
# Standard proxy dict format
|
| 270 |
+
proxies = {"http": proxy_url, "https": proxy_url}
|
| 271 |
+
log.debug(f"Using proxy configuration: {proxies}")
|
| 272 |
+
|
| 273 |
+
# --- Final Context ---
|
| 274 |
+
return Request(
|
| 275 |
+
method=method,
|
| 276 |
+
url=parsed_args.url,
|
| 277 |
+
params=params,
|
| 278 |
+
data=data_payload,
|
| 279 |
+
json_data=json_payload,
|
| 280 |
+
headers=headers,
|
| 281 |
+
cookies=cookies,
|
| 282 |
+
proxy=proxies,
|
| 283 |
+
follow_redirects=True, # Scrapling default is True
|
| 284 |
+
)
|
| 285 |
+
|
| 286 |
+
def convert2fetcher(self, curl_command: Request | str) -> Optional[Response]:
    """Run a curl command (or an already parsed `Request`) through the matching fetcher method.

    :param curl_command: Either a raw curl command string, which is parsed first, or a `Request` object.
    :return: Whatever the fetcher method returns, or `None` when the input is invalid, parsing failed,
        the HTTP method is not in `_supported_methods`, or the fetcher call raised.
    """
    if not isinstance(curl_command, (Request, str)):  # pragma: no cover
        log.error("Input must be a valid curl command string or a Request object.")
        return None

    request = curl_command if not isinstance(curl_command, str) else self.parse(curl_command)

    # Ensure request parsing was successful before proceeding
    if request is None:  # pragma: no cover
        log.error("Failed to parse curl command, cannot convert to fetcher.")
        return None

    fetcher_kwargs = request._asdict()
    method = fetcher_kwargs.pop("method").strip().lower()
    if method not in self._supported_methods:  # pragma: no cover
        log.error(f'Request method "{method}" isn\'t supported by Scrapling yet')
        return None

    # The fetcher methods take `json`, while the Request object stores it as `json_data`
    fetcher_kwargs["json"] = fetcher_kwargs.pop("json_data")

    # Ensure data/json are removed for non-POST/PUT methods
    if method not in ("post", "put"):
        for body_key in ("data", "json"):
            fetcher_kwargs.pop(body_key, None)

    try:
        return getattr(self.__fetcher, method)(**fetcher_kwargs)
    except Exception as e:  # pragma: no cover
        log.error(f"Error calling Fetcher.{method}: {e}")
        return None
|
| 317 |
+
|
| 318 |
+
|
| 319 |
+
def _unpack_signature(func, signature_name=None):
|
| 320 |
+
"""
|
| 321 |
+
Unpack TypedDict from Unpack[TypedDict] annotations in **kwargs and reconstruct the signature.
|
| 322 |
+
|
| 323 |
+
This allows the interactive shell to show individual parameters instead of just **kwargs, similar to how IDEs display them.
|
| 324 |
+
"""
|
| 325 |
+
try:
|
| 326 |
+
sig = signature(func)
|
| 327 |
+
func_name = signature_name or getattr(func, "__name__", None)
|
| 328 |
+
|
| 329 |
+
# Check if this function has known parameters
|
| 330 |
+
if func_name not in Signatures_map:
|
| 331 |
+
return sig
|
| 332 |
+
|
| 333 |
+
new_params = []
|
| 334 |
+
for param in sig.parameters.values():
|
| 335 |
+
if param.kind == Parameter.VAR_KEYWORD:
|
| 336 |
+
# Replace **kwargs with individual keyword-only parameters
|
| 337 |
+
for field_name, field_type in Signatures_map[func_name].items():
|
| 338 |
+
new_params.append(
|
| 339 |
+
Parameter(field_name, Parameter.KEYWORD_ONLY, default=Parameter.empty, annotation=field_type)
|
| 340 |
+
)
|
| 341 |
+
else:
|
| 342 |
+
new_params.append(param)
|
| 343 |
+
|
| 344 |
+
# Reconstruct signature with unpacked parameters
|
| 345 |
+
if len(new_params) != len(sig.parameters):
|
| 346 |
+
return sig.replace(parameters=new_params)
|
| 347 |
+
return sig
|
| 348 |
+
|
| 349 |
+
except Exception: # pragma: no cover
|
| 350 |
+
return signature(func)
|
| 351 |
+
|
| 352 |
+
|
| 353 |
+
def show_page_in_browser(page: Selector):  # pragma: no cover
    """Write a page's HTML to a temporary `.html` file and open that file in the browser.

    Invalid input and any failure while writing or opening the file are logged, not raised.

    :param page: The `Selector` whose `html_content` should be displayed.
    """
    if not (page and isinstance(page, Selector)):
        log.error("Input must be of type `Selector`")
        return

    try:
        descriptor, path = make_temp_file(prefix="scrapling_view_", suffix=".html")
        # The descriptor is handed to `open`, so it is closed together with the file object
        with open(descriptor, "w", encoding=page.encoding) as handle:
            handle.write(page.html_content)

        open_in_browser(f"file://{path}")
    except IOError as e:
        log.error(f"Failed to write temporary file for viewing: {e}")
    except Exception as e:
        log.error(f"An unexpected error occurred while viewing the page: {e}")
|
| 368 |
+
|
| 369 |
+
|
| 370 |
+
class CustomShell:
    """A custom IPython shell with minimal dependencies"""

    def __init__(self, code, log_level="debug"):
        """
        :param code: Code to execute once the shell object exists. When it is truthy, `start()` runs it
            and returns instead of opening the interactive prompt.
        :param log_level: Name of the logging level. It is stripped and lower-cased, then looked up in
            `_known_logging_levels`; unknown names fall back to `DEBUG` with a warning.
        """
        # Imported inside the method rather than at module level. The double-underscore aliases are
        # name-mangled by Python because they are used inside a class body.
        from IPython.terminal.embed import InteractiveShellEmbed as __InteractiveShellEmbed
        from scrapling.fetchers import (
            Fetcher as __Fetcher,
            AsyncFetcher as __AsyncFetcher,
            FetcherSession as __FetcherSession,
            DynamicFetcher as __DynamicFetcher,
            DynamicSession as __DynamicSession,
            AsyncDynamicSession as __AsyncDynamicSession,
            StealthyFetcher as __StealthyFetcher,
            StealthySession as __StealthySession,
            AsyncStealthySession as __AsyncStealthySession,
        )

        # Keep references on the instance so the other methods can reach the lazily imported classes
        self.__InteractiveShellEmbed = __InteractiveShellEmbed
        self.__Fetcher = __Fetcher
        self.__AsyncFetcher = __AsyncFetcher
        self.__FetcherSession = __FetcherSession
        self.__DynamicFetcher = __DynamicFetcher
        self.__DynamicSession = __DynamicSession
        self.__AsyncDynamicSession = __AsyncDynamicSession
        self.__StealthyFetcher = __StealthyFetcher
        self.__StealthySession = __StealthySession
        self.__AsyncStealthySession = __AsyncStealthySession
        self.code = code
        # `page` is the last result handed to `update_page`; `pages` is the short history of them
        self.page = None
        self.pages = Selectors([])
        self._curl_parser = CurlParser()
        log_level = log_level.strip().lower()

        if _known_logging_levels.get(log_level):
            self.log_level = _known_logging_levels[log_level]
        else:  # pragma: no cover
            log.warning(f'Unknown log level "{log_level}", defaulting to "DEBUG"')
            self.log_level = DEBUG

        # Set by `start()` once the IPython shell has been created
        self.shell = None

        # Initialize your application components
        self.init_components()

    def init_components(self):
        """Initialize application components"""
        # This is where you'd set up your application-specific objects
        if self.log_level:
            getLogger("scrapling").setLevel(self.log_level)

        settings = self.__Fetcher.display_config()
        # The storage-related entries are left out of the startup log line
        settings.pop("storage", None)
        settings.pop("storage_args", None)
        log.info(f"Scrapling {__version__} shell started")
        log.info(f"Logging level is set to '{getLevelName(self.log_level)}'")
        log.info(f"Fetchers' parsing settings: {settings}")

    @staticmethod
    def banner():
        """Create a custom banner for the shell"""
        return f"""
-> Available Scrapling objects:
- Fetcher/AsyncFetcher/FetcherSession
- DynamicFetcher/DynamicSession/AsyncDynamicSession
- StealthyFetcher/StealthySession/AsyncStealthySession
- Selector

-> Useful shortcuts:
- {"get":<30} Shortcut for `Fetcher.get`
- {"post":<30} Shortcut for `Fetcher.post`
- {"put":<30} Shortcut for `Fetcher.put`
- {"delete":<30} Shortcut for `Fetcher.delete`
- {"fetch":<30} Shortcut for `DynamicFetcher.fetch`
- {"stealthy_fetch":<30} Shortcut for `StealthyFetcher.fetch`

-> Useful commands
- {"page / response":<30} The response object of the last page you fetched
- {"pages":<30} Selectors object of the last 5 response objects you fetched
- {"uncurl('curl_command')":<30} Convert curl command to a Request object. (Optimized to handle curl commands copied from DevTools network tab.)
- {"curl2fetcher('curl_command')":<30} Convert curl command and make the request with Fetcher. (Optimized to handle curl commands copied from DevTools network tab.)
- {"view(page)":<30} View page in a browser
- {"help()":<30} Show this help message (Shell help)

Type 'exit' or press Ctrl+D to exit.
"""

    def update_page(self, result):  # pragma: no cover
        """Update the current page and add to pages history

        :param result: The value returned by a wrapped fetch function. It always becomes `self.page`;
            it is appended to `self.pages` only when it is a `Response` or `Selector`.
        :return: `result`, unchanged, so wrappers can return it to the caller.
        """
        self.page = result
        if isinstance(result, (Response, Selector)):
            self.pages.append(result)
            if len(self.pages) > 5:
                self.pages.pop(0)  # Remove the oldest item

        # Update in IPython namespace too
        if self.shell:
            self.shell.user_ns["page"] = self.page
            self.shell.user_ns["response"] = self.page
            self.shell.user_ns["pages"] = self.pages

        return result

    def create_wrapper(
        self, func: Callable, get_signature: bool = True, signature_name: Optional[str] = None
    ) -> Callable:
        """Create a wrapper that preserves function signature but updates page

        :param func: The function to wrap; its result is passed through `update_page`.
        :param get_signature: When true, the wrapper's `__signature__` comes from `_unpack_signature`;
            otherwise the plain `signature(func)` is used.
        :param signature_name: Forwarded to `_unpack_signature` as the lookup name.
        :return: The wrapping callable.
        """

        @wraps(func)
        def wrapper(*args: Any, **kwargs: Any) -> Any:
            result = func(*args, **kwargs)
            return self.update_page(result)

        if get_signature:
            # Explicitly preserve and unpack signature for IPython introspection and autocompletion
            setattr(wrapper, "__signature__", _unpack_signature(func, signature_name))
        else:
            setattr(wrapper, "__signature__", signature(func))

        return wrapper

    def get_namespace(self):
        """Create a namespace with application-specific objects

        :return: A dict mapping the names exposed inside the shell to fetcher classes, wrapped
            shortcuts, the current page/history and the helper commands.
        """

        # Create wrapped versions of fetch functions
        get = self.create_wrapper(self.__Fetcher.get)
        post = self.create_wrapper(self.__Fetcher.post)
        put = self.create_wrapper(self.__Fetcher.put)
        delete = self.create_wrapper(self.__Fetcher.delete)
        dynamic_fetch = self.create_wrapper(self.__DynamicFetcher.fetch)
        stealthy_fetch = self.create_wrapper(self.__StealthyFetcher.fetch, signature_name="stealthy_fetch")
        curl2fetcher = self.create_wrapper(self._curl_parser.convert2fetcher, get_signature=False)

        # Create the namespace dictionary
        return {
            "get": get,
            "post": post,
            "put": put,
            "delete": delete,
            "Fetcher": self.__Fetcher,
            "AsyncFetcher": self.__AsyncFetcher,
            "FetcherSession": self.__FetcherSession,
            "DynamicSession": self.__DynamicSession,
            "AsyncDynamicSession": self.__AsyncDynamicSession,
            "StealthySession": self.__StealthySession,
            "AsyncStealthySession": self.__AsyncStealthySession,
            "fetch": dynamic_fetch,
            "DynamicFetcher": self.__DynamicFetcher,
            "stealthy_fetch": stealthy_fetch,
            "StealthyFetcher": self.__StealthyFetcher,
            "Selector": Selector,
            "page": self.page,
            "response": self.page,
            "pages": self.pages,
            "view": show_page_in_browser,
            "uncurl": self._curl_parser.parse,
            "curl2fetcher": curl2fetcher,
            "help": self.show_help,
        }

    def show_help(self):  # pragma: no cover
        """Show help information"""
        print(self.banner())

    def start(self):  # pragma: no cover
        """Start the interactive shell"""

        # Get our namespace with application objects
        namespace = self.get_namespace()
        ipython_shell = self.__InteractiveShellEmbed(
            banner1=self.banner(),
            banner2="",
            enable_tip=False,
            exit_msg="Bye Bye",
            user_ns=namespace,
        )
        # Stored so `update_page` can refresh `page`/`response`/`pages` in the live namespace
        self.shell = ipython_shell

        # If a command was provided, execute it and exit
        if self.code:
            log.info(f"Executing provided code: {self.code}")
            try:
                ipython_shell.run_cell(self.code, store_history=False)
            except Exception as e:
                log.error(f"Error executing initial code: {e}")
            return

        ipython_shell()
|
| 557 |
+
|
| 558 |
+
|
| 559 |
+
class Convertor:
|
| 560 |
+
"""Utils for the extract shell command"""
|
| 561 |
+
|
| 562 |
+
_extension_map: Dict[str, extraction_types] = {
|
| 563 |
+
"md": "markdown",
|
| 564 |
+
"html": "html",
|
| 565 |
+
"txt": "text",
|
| 566 |
+
}
|
| 567 |
+
|
| 568 |
+
@classmethod
|
| 569 |
+
def _convert_to_markdown(cls, body: TextHandler) -> str:
|
| 570 |
+
"""Convert HTML content to Markdown"""
|
| 571 |
+
from markdownify import markdownify
|
| 572 |
+
|
| 573 |
+
return markdownify(body)
|
| 574 |
+
|
| 575 |
+
@classmethod
|
| 576 |
+
def _strip_noise_tags(cls, page: Selector) -> Selector:
|
| 577 |
+
"""Return a copy of the Selector with noise tags removed."""
|
| 578 |
+
clean_root = deepcopy(page._root)
|
| 579 |
+
for element in clean_root.iter(*{"script", "style", "noscript", "svg"}):
|
| 580 |
+
element.drop_tree()
|
| 581 |
+
return Selector(root=clean_root, url=page.url)
|
| 582 |
+
|
| 583 |
+
@classmethod
|
| 584 |
+
def _extract_content(
|
| 585 |
+
cls,
|
| 586 |
+
page: Selector,
|
| 587 |
+
extraction_type: extraction_types = "markdown",
|
| 588 |
+
css_selector: Optional[str] = None,
|
| 589 |
+
main_content_only: bool = False,
|
| 590 |
+
) -> Generator[str, None, None]:
|
| 591 |
+
"""Extract the content of a Selector"""
|
| 592 |
+
if not page or not isinstance(page, Selector): # pragma: no cover
|
| 593 |
+
raise TypeError("Input must be of type `Selector`")
|
| 594 |
+
elif not extraction_type or extraction_type not in cls._extension_map.values():
|
| 595 |
+
raise ValueError(f"Unknown extraction type: {extraction_type}")
|
| 596 |
+
else:
|
| 597 |
+
if main_content_only:
|
| 598 |
+
page = cast(Selector, page.css("body").first) or page
|
| 599 |
+
page = cls._strip_noise_tags(page)
|
| 600 |
+
|
| 601 |
+
pages = [page] if not css_selector else cast(Selectors, page.css(css_selector))
|
| 602 |
+
for page in pages:
|
| 603 |
+
match extraction_type:
|
| 604 |
+
case "markdown":
|
| 605 |
+
yield cls._convert_to_markdown(page.html_content)
|
| 606 |
+
case "html":
|
| 607 |
+
yield page.html_content
|
| 608 |
+
case "text":
|
| 609 |
+
txt_content = page.get_all_text(
|
| 610 |
+
strip=True, ignore_tags=("script", "style", "noscript", "svg", "iframe")
|
| 611 |
+
)
|
| 612 |
+
for s in (
|
| 613 |
+
"\n",
|
| 614 |
+
"\r",
|
| 615 |
+
"\t",
|
| 616 |
+
" ",
|
| 617 |
+
):
|
| 618 |
+
# Remove consecutive white-spaces
|
| 619 |
+
txt_content = TextHandler(re_sub(f"[{s}]+", s, txt_content))
|
| 620 |
+
yield txt_content
|
| 621 |
+
yield ""
|
| 622 |
+
|
| 623 |
+
@classmethod
|
| 624 |
+
def write_content_to_file(cls, page: Selector, filename: str, css_selector: Optional[str] = None) -> None:
|
| 625 |
+
"""Write a Selector's content to a file"""
|
| 626 |
+
if not page or not isinstance(page, Selector): # pragma: no cover
|
| 627 |
+
raise TypeError("Input must be of type `Selector`")
|
| 628 |
+
elif not filename or not isinstance(filename, str) or not filename.strip():
|
| 629 |
+
raise ValueError("Filename must be provided")
|
| 630 |
+
elif not filename.endswith((".md", ".html", ".txt")):
|
| 631 |
+
raise ValueError("Unknown file type: filename must end with '.md', '.html', or '.txt'")
|
| 632 |
+
else:
|
| 633 |
+
with open(filename, "w", encoding=page.encoding) as f:
|
| 634 |
+
extension = filename.split(".")[-1]
|
| 635 |
+
f.write(
|
| 636 |
+
"".join(
|
| 637 |
+
cls._extract_content(
|
| 638 |
+
page,
|
| 639 |
+
cls._extension_map[extension],
|
| 640 |
+
css_selector=css_selector,
|
| 641 |
+
)
|
| 642 |
+
)
|
| 643 |
+
)
|
core/storage.py
ADDED
|
@@ -0,0 +1,156 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
from hashlib import sha256
|
| 2 |
+
from threading import RLock
|
| 3 |
+
from functools import lru_cache
|
| 4 |
+
from abc import ABC, abstractmethod
|
| 5 |
+
from sqlite3 import connect as db_connect
|
| 6 |
+
|
| 7 |
+
from orjson import dumps, loads
|
| 8 |
+
from lxml.html import HtmlElement
|
| 9 |
+
|
| 10 |
+
from scrapling.core.utils import _StorageTools, log
|
| 11 |
+
from scrapling.core._types import Dict, Optional, Any, cast
|
| 12 |
+
|
| 13 |
+
|
| 14 |
+
class StorageSystemMixin(ABC):  # pragma: no cover
    """Abstract base for storage backends that save and retrieve element properties."""

    # If you want to make your own storage system, you have to inherit from this
    def __init__(self, url: Optional[str] = None):
        """
        :param url: URL of the website we are working on to separate it from other websites data
        """
        # Make the url in lowercase to handle this edge case until it's updated: https://github.com/barseghyanartur/tld/issues/124
        self.url = url.lower() if (url and isinstance(url, str)) else None

    # NOTE(review): `lru_cache` on an instance method keys on `self`, so cached instances stay
    # referenced by the cache and a later change to `self.url` is not reflected in the result.
    @lru_cache(64, typed=True)
    def _get_base_url(self, default_value: str = "default") -> str:
        """Return the first-level domain of `self.url`, used to keep each website's data separate.

        :param default_value: Returned when there is no URL or the domain cannot be extracted.
        :return: `fld` of the parsed URL, else its `domain`, else `default_value`.
        """
        if not self.url:
            return default_value

        try:
            from tld import get_tld, Result

            # Fixing the inaccurate return type hint in `get_tld`
            extracted: Result | None = cast(
                Result, get_tld(self.url, as_object=True, fail_silently=True, fix_protocol=True)
            )
            if not extracted:
                return default_value
            return extracted.fld or extracted.domain or default_value
        except AttributeError:
            return default_value

    @abstractmethod
    def save(self, element: HtmlElement, identifier: str) -> None:
        """Saves the element's unique properties to the storage for retrieval and relocation later

        :param element: The element itself which we want to save to storage.
        :param identifier: This is the identifier that will be used to retrieve the element later from the storage. See
            the docs for more info.
        """
        raise NotImplementedError("Storage system must implement `save` method")

    @abstractmethod
    def retrieve(self, identifier: str) -> Optional[Dict]:
        """Using the identifier, we search the storage and return the unique properties of the element

        :param identifier: This is the identifier that will be used to retrieve the element from the storage. See
            the docs for more info.
        :return: A dictionary of the unique properties
        """
        # The message used to name `save`, which pointed implementers at the wrong method
        raise NotImplementedError("Storage system must implement `retrieve` method")

    @staticmethod
    @lru_cache(128, typed=True)
    def _get_hash(identifier: str) -> str:
        """Hash an identifier; use this helper if your storage system stores hashed identifiers.

        The identifier is lower-cased and stripped before hashing.

        :return: The SHA-256 hex digest followed by `_` and the byte length of the normalized identifier.
        """
        _identifier = identifier.lower().strip()
        # Hash functions have to take bytes
        _identifier_bytes = _identifier.encode("utf-8")

        hash_value = sha256(_identifier_bytes).hexdigest()
        return f"{hash_value}_{len(_identifier_bytes)}"  # Length to reduce collision chance
|
| 71 |
+
|
| 72 |
+
|
| 73 |
+
@lru_cache(1, typed=True)
class SQLiteStorageSystem(StorageSystemMixin):
    """The recommended system to use, it's race condition safe and thread safe.
    Mainly built, so the library can run in threaded frameworks like scrapy or threaded tools
    > It's optimized for threaded applications, but running it without threads shouldn't make it slow."""

    def __init__(self, storage_file: str, url: Optional[str] = None):
        """
        :param storage_file: File to be used to store elements' data.
        :param url: URL of the website we are working on to separate it from other websites data

        """
        super().__init__(url)
        self.storage_file = storage_file
        self.lock = RLock()  # Better than Lock for reentrancy
        # >SQLite default mode in the earlier version is 1 not 2 (1=thread-safe 2=serialized)
        # `check_same_thread=False` to allow it to be used across different threads.
        self.connection = db_connect(self.storage_file, check_same_thread=False)
        # Lets `close()` be called more than once (explicitly and again from `__del__`)
        self._closed = False
        # WAL (Write-Ahead Logging) allows for better concurrency.
        self.connection.execute("PRAGMA journal_mode=WAL")
        self.cursor = self.connection.cursor()
        self._setup_database()
        log.debug(f'Storage system loaded with arguments (storage_file="{storage_file}", url="{url}")')

    def _setup_database(self) -> None:
        """Create the `storage` table if it does not exist yet and commit."""
        self.cursor.execute("""
            CREATE TABLE IF NOT EXISTS storage (
                id INTEGER PRIMARY KEY,
                url TEXT,
                identifier TEXT,
                element_data TEXT,
                UNIQUE (url, identifier)
            )
        """)
        self.connection.commit()

    def save(self, element: HtmlElement, identifier: str) -> None:
        """Saves the elements unique properties to the storage for retrieval and relocation later

        An existing row with the same (url, identifier) pair is replaced.

        :param element: The element itself which we want to save to storage.
        :param identifier: This is the identifier that will be used to retrieve the element later from the storage. See
            the docs for more info.
        """
        url = self._get_base_url()
        element_data = _StorageTools.element_to_dict(element)
        with self.lock:
            self.cursor.execute(
                """
                INSERT OR REPLACE INTO storage (url, identifier, element_data)
                VALUES (?, ?, ?)
            """,
                (url, identifier, dumps(element_data)),
            )
            self.cursor.fetchall()
            self.connection.commit()

    def retrieve(self, identifier: str) -> Optional[Dict[str, Any]]:
        """Using the identifier, we search the storage and return the unique properties of the element

        :param identifier: This is the identifier that will be used to retrieve the element from the storage. See
            the docs for more info.
        :return: A dictionary of the unique properties, or `None` when nothing is stored for this identifier
        """
        url = self._get_base_url()
        with self.lock:
            self.cursor.execute(
                "SELECT element_data FROM storage WHERE url = ? AND identifier = ?",
                (url, identifier),
            )
            result = self.cursor.fetchone()
            if result:
                return loads(result[0])
            return None

    def close(self):
        """Close all connections. It will be useful when with some things like scrapy Spider.closed() function/signal

        Safe to call more than once, and safe on an instance whose `__init__` failed part-way:
        `__del__` always calls this, and committing on an already closed SQLite connection raises.
        """
        lock = getattr(self, "lock", None)
        connection = getattr(self, "connection", None)
        if lock is None or connection is None:
            # `__init__` did not get as far as opening the connection; nothing to release
            return

        with lock:
            if getattr(self, "_closed", False):
                return
            # Mark first so a failure below is not retried by a later call
            self._closed = True
            try:
                connection.commit()
                cursor = getattr(self, "cursor", None)
                if cursor is not None:
                    cursor.close()
            finally:
                connection.close()

    def __del__(self):
        """To ensure all connections are closed when the object is destroyed."""
        self.close()
|
core/translator.py
ADDED
|
@@ -0,0 +1,134 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""
|
| 2 |
+
Most of this file is an adapted version of the parsel library's translator with some modifications simply for 1 important reason...
|
| 3 |
+
|
| 4 |
+
To add pseudo-elements ``::text`` and ``::attr(ATTR_NAME)`` so we match the Parsel/Scrapy selectors format which will be important in future releases but most importantly...
|
| 5 |
+
|
| 6 |
+
So you don't have to learn a new selectors/API method like what bs4 did with soupsieve :)
|
| 7 |
+
|
| 8 |
+
If you want to learn about this, head to https://cssselect.readthedocs.io/en/latest/#cssselect.FunctionalPseudoElement
|
| 9 |
+
"""
|
| 10 |
+
|
| 11 |
+
from functools import lru_cache
|
| 12 |
+
|
| 13 |
+
from cssselect import HTMLTranslator as OriginalHTMLTranslator
|
| 14 |
+
from cssselect.xpath import ExpressionError, XPathExpr as OriginalXPathExpr
|
| 15 |
+
from cssselect.parser import Element, FunctionalPseudoElement, PseudoElement
|
| 16 |
+
|
| 17 |
+
from scrapling.core._types import Any, Protocol, Self
|
| 18 |
+
|
| 19 |
+
|
| 20 |
+
class XPathExpr(OriginalXPathExpr):
    """XPath expression that can additionally end in a text-node or attribute step."""

    # When true, the rendered path selects text nodes
    textnode: bool = False
    # When not None, the rendered path selects this attribute
    attribute: str | None = None

    @classmethod
    def from_xpath(
        cls,
        xpath: OriginalXPathExpr,
        textnode: bool = False,
        attribute: str | None = None,
    ) -> Self:
        """Build an instance from an existing expression's path, element and condition."""
        expr = cls(path=xpath.path, element=xpath.element, condition=xpath.condition)
        expr.textnode, expr.attribute = textnode, attribute
        return expr

    def __str__(self) -> str:
        """Render the base expression, then append the text and/or attribute step."""
        rendered = super().__str__()

        if self.textnode:
            if rendered == "*":  # pragma: no cover
                rendered = "text()"
            elif rendered.endswith("::*/*"):  # pragma: no cover
                rendered = f"{rendered[:-3]}text()"
            else:
                rendered = f"{rendered}/text()"

        if self.attribute is not None:
            if rendered.endswith("::*/*"):  # pragma: no cover
                rendered = rendered[:-2]
            rendered = f"{rendered}/@{self.attribute}"

        return rendered

    def join(
        self: Self,
        combiner: str,
        other: OriginalXPathExpr,
        *args: Any,
        **kwargs: Any,
    ) -> Self:
        """Join with `other` and take over its text/attribute targets.

        :raises ValueError: If `other` is not an instance of this class (or a descendant).
        """
        if isinstance(other, XPathExpr):
            super().join(combiner, other, *args, **kwargs)
            self.textnode = other.textnode
            self.attribute = other.attribute
            return self

        raise ValueError(  # pragma: no cover
            f"Expressions of type {__name__}.XPathExpr can ony join expressions"
            f" of the same type (or its descendants), got {type(other)}"
        )
|
| 69 |
+
|
| 70 |
+
|
| 71 |
+
# e.g. cssselect.GenericTranslator, cssselect.HTMLTranslator
|
| 72 |
+
class TranslatorProtocol(Protocol):
    """Structural type for the translator methods that `TranslatorMixin` relies on."""

    def xpath_element(self, selector: Element) -> OriginalXPathExpr:  # pyright: ignore  # pragma: no cover
        ...

    def css_to_xpath(self, css: str, prefix: str = ...) -> str:  # pyright: ignore  # pragma: no cover
        ...
|
| 78 |
+
|
| 79 |
+
|
| 80 |
+
class TranslatorMixin:
    """This mixin adds support to CSS pseudo elements via dynamic dispatch.

    Currently supported pseudo-elements are ``::text`` and ``::attr(ATTR_NAME)``.
    """

    def xpath_element(self: TranslatorProtocol, selector: Element) -> XPathExpr:
        """Translate an element selector and re-wrap the result in the extended `XPathExpr`."""
        # https://github.com/python/mypy/issues/14757
        base_expr = super().xpath_element(selector)  # type: ignore[safe-super]
        return XPathExpr.from_xpath(base_expr)

    def xpath_pseudo_element(self, xpath: OriginalXPathExpr, pseudo_element: PseudoElement) -> OriginalXPathExpr:
        """
        Dispatch method that transforms XPath to support the pseudo-element.

        The handler is looked up by name on the instance; an unknown pseudo-element raises `ExpressionError`.
        """
        if isinstance(pseudo_element, FunctionalPseudoElement):
            handler_name = f"xpath_{pseudo_element.name.replace('-', '_')}_functional_pseudo_element"
            handler = getattr(self, handler_name, None)
            if not handler:  # pragma: no cover
                raise ExpressionError(f"The functional pseudo-element ::{pseudo_element.name}() is unknown")
            return handler(xpath, pseudo_element)

        handler_name = f"xpath_{pseudo_element.replace('-', '_')}_simple_pseudo_element"
        handler = getattr(self, handler_name, None)
        if not handler:  # pragma: no cover
            raise ExpressionError(f"The pseudo-element ::{pseudo_element} is unknown")
        return handler(xpath)

    @staticmethod
    def xpath_attr_functional_pseudo_element(xpath: OriginalXPathExpr, function: FunctionalPseudoElement) -> XPathExpr:
        """Support selecting attribute values using ::attr() pseudo-element"""
        accepted_argument_types = (["STRING"], ["IDENT"])
        if function.argument_types() not in accepted_argument_types:  # pragma: no cover
            raise ExpressionError(f"Expected a single string or ident for ::attr(), got {function.arguments!r}")

        attribute_name = function.arguments[0].value
        return XPathExpr.from_xpath(xpath, attribute=attribute_name)

    @staticmethod
    def xpath_text_simple_pseudo_element(xpath: OriginalXPathExpr) -> XPathExpr:
        """Support selecting text nodes using ::text pseudo-element"""
        return XPathExpr.from_xpath(xpath, textnode=True)
|
| 120 |
+
|
| 121 |
+
|
| 122 |
+
class HTMLTranslator(TranslatorMixin, OriginalHTMLTranslator):
    """HTML translator extended with the pseudo-element support from `TranslatorMixin`."""

    def css_to_xpath(self, css: str, prefix: str = "descendant-or-self::") -> str:
        """Translate a CSS query to XPath by delegating to the parent implementation."""
        xpath_query = super().css_to_xpath(css, prefix)
        return xpath_query
|
| 125 |
+
|
| 126 |
+
|
| 127 |
+
# Single shared translator instance used by `css_to_xpath` below
translator = HTMLTranslator()


# Using a function instead of the translator directly to avoid Pyright override error
@lru_cache(maxsize=256)
def css_to_xpath(query: str) -> str:
    """Return the translated XPath version of a given CSS query

    Results are memoized per query string (up to 256 entries).
    """
    xpath_query = translator.css_to_xpath(query)
    return xpath_query
|
core/utils/__init__.py
ADDED
|
@@ -0,0 +1,11 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
from ._utils import (
|
| 2 |
+
log,
|
| 3 |
+
set_logger,
|
| 4 |
+
reset_logger,
|
| 5 |
+
__CONSECUTIVE_SPACES_REGEX__,
|
| 6 |
+
flatten,
|
| 7 |
+
_is_iterable,
|
| 8 |
+
_StorageTools,
|
| 9 |
+
clean_spaces,
|
| 10 |
+
html_forbidden,
|
| 11 |
+
)
|
core/utils/_shell.py
ADDED
|
@@ -0,0 +1,48 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
from http import cookies as Cookie
|
| 2 |
+
|
| 3 |
+
|
| 4 |
+
from scrapling.core._types import (
|
| 5 |
+
List,
|
| 6 |
+
Dict,
|
| 7 |
+
Tuple,
|
| 8 |
+
)
|
| 9 |
+
|
| 10 |
+
|
| 11 |
+
def _CookieParser(cookie_string):
|
| 12 |
+
# Errors will be handled on call so the log can be specified
|
| 13 |
+
cookie_parser = Cookie.SimpleCookie()
|
| 14 |
+
cookie_parser.load(cookie_string)
|
| 15 |
+
for key, morsel in cookie_parser.items():
|
| 16 |
+
yield key, morsel.value
|
| 17 |
+
|
| 18 |
+
|
| 19 |
+
def _ParseHeaders(header_lines: List[str], parse_cookies: bool = True) -> Tuple[Dict[str, str], Dict[str, str]]:
    """Parse raw header lines into separate header and cookie dictionaries.

    Each line is expected in ``Name: value`` form. A line without a colon is only
    accepted when it ends with ``;``, in which case it is stored as a header with
    an empty value.

    :param header_lines: The raw header lines to parse.
    :param parse_cookies: When True, ``Cookie`` headers are parsed into the cookie
        dictionary and left out of the header dictionary. When False, they are
        kept as regular headers.
    :return: A ``(headers, cookies)`` tuple of dictionaries.
    :raises ValueError: If a line has no colon and no trailing ``;``, or if a
        cookie header cannot be parsed.
    """
    header_dict: Dict[str, str] = {}
    cookie_dict: Dict[str, str] = {}

    for header_line in header_lines:
        if ":" not in header_line:
            if not header_line.endswith(";"):
                raise ValueError(f"Could not parse header without colon: '{header_line}'.")
            # "Name;" form: a header that is present but has an empty value
            header_dict[header_line[:-1].strip()] = ""
            continue

        header_key, header_value = (part.strip() for part in header_line.split(":", 1))

        if parse_cookies and header_key.lower() == "cookie":
            try:
                # Merge rather than replace, so several Cookie headers accumulate
                cookie_dict.update(_CookieParser(header_value))
            except Exception as e:  # pragma: no cover
                raise ValueError(f"Could not parse cookie string from header '{header_value}': {e}") from e
        else:
            header_dict[header_key] = header_value

    return header_dict, cookie_dict
|
core/utils/_utils.py
ADDED
|
@@ -0,0 +1,120 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import logging
|
| 2 |
+
from itertools import chain
|
| 3 |
+
from re import compile as re_compile
|
| 4 |
+
from contextvars import ContextVar, Token
|
| 5 |
+
|
| 6 |
+
from lxml import html
|
| 7 |
+
|
| 8 |
+
from scrapling.core._types import Any, Dict, Iterable, List
|
| 9 |
+
|
| 10 |
+
# Using cache on top of a class is a brilliant way to achieve a Singleton design pattern without much code
|
| 11 |
+
from functools import lru_cache # isort:skip
|
| 12 |
+
|
| 13 |
+
html_forbidden = (html.HtmlComment,)
|
| 14 |
+
|
| 15 |
+
__CLEANING_TABLE__ = str.maketrans({"\t": " ", "\n": None, "\r": None})
|
| 16 |
+
__CONSECUTIVE_SPACES_REGEX__ = re_compile(r" +")
|
| 17 |
+
|
| 18 |
+
|
| 19 |
+
@lru_cache(1, typed=True)
def setup_logger():
    """Build the ``scrapling`` logger with a console handler and a standard format.

    The result is cached, so repeated calls return the same logger object.

    :returns: logging.Logger: Configured logger instance
    """
    scrapling_logger = logging.getLogger("scrapling")
    scrapling_logger.setLevel(logging.INFO)

    stream_handler = logging.StreamHandler()
    stream_handler.setFormatter(
        logging.Formatter(fmt="[%(asctime)s] %(levelname)s: %(message)s", datefmt="%Y-%m-%d %H:%M:%S")
    )

    # Attach the console handler only when the logger has none yet
    if len(scrapling_logger.handlers) == 0:
        scrapling_logger.addHandler(stream_handler)

    return scrapling_logger
|
| 38 |
+
|
| 39 |
+
|
| 40 |
+
_current_logger: ContextVar[logging.Logger] = ContextVar("scrapling_logger", default=setup_logger())
|
| 41 |
+
|
| 42 |
+
|
| 43 |
+
class LoggerProxy:
    """Proxy that forwards attribute lookups to the logger stored in ``_current_logger``."""

    def __getattr__(self, name: str):
        # Resolve the logger at access time so the context's current value is used
        active_logger = _current_logger.get()
        return getattr(active_logger, name)
|
| 46 |
+
|
| 47 |
+
|
| 48 |
+
log = LoggerProxy()
|
| 49 |
+
|
| 50 |
+
|
| 51 |
+
def set_logger(logger: logging.Logger) -> Token:
    """Make ``logger`` the logger for the current context.

    :param logger: The logger to install.
    :return: A token that can be passed to ``reset_logger`` to undo the change.
    """
    reset_token = _current_logger.set(logger)
    return reset_token
|
| 54 |
+
|
| 55 |
+
|
| 56 |
+
def reset_logger(token: Token) -> None:
    """Restore the context logger to the value it had before the ``set_logger`` call that produced ``token``.

    :param token: The token returned by ``set_logger``.
    """
    _current_logger.reset(token)
    return None
|
| 59 |
+
|
| 60 |
+
|
| 61 |
+
def flatten(lst: Iterable[Any]) -> List[Any]:
    """Flatten one level of nesting: concatenate the items of every inner iterable into a list."""
    return [item for inner in lst for item in inner]
|
| 63 |
+
|
| 64 |
+
|
| 65 |
+
def _is_iterable(obj: Any) -> bool:
    """Return True only when ``obj`` is a list or a tuple."""
    # Used only by the regex functions: they need an iterable that is not a string/bytes
    accepted_types = (list, tuple)
    return isinstance(obj, accepted_types)
|
| 74 |
+
|
| 75 |
+
|
| 76 |
+
class _StorageTools:
    """Helpers that describe an lxml element as plain dictionaries/tuples."""

    @staticmethod
    def __clean_attributes(element: html.HtmlElement, forbidden: tuple = ()) -> Dict:
        """Return the element's attributes with stripped values.

        Attributes whose value is empty or whitespace-only, or whose name is in
        ``forbidden``, are left out.
        """
        if not element.attrib:
            return {}

        cleaned = {}
        for name, value in element.attrib.items():
            if value and value.strip() and name not in forbidden:
                cleaned[name] = value.strip()
        return cleaned

    @classmethod
    def element_to_dict(cls, element: html.HtmlElement) -> Dict:
        """Summarize ``element`` as a dictionary.

        Always contains ``tag``, ``attributes``, ``text`` and ``path``. When the
        element has a parent, ``parent_name``, ``parent_attribs`` and ``parent_text``
        are added, plus ``siblings`` if there are any. ``children`` is added when
        the element has children that are not instances of ``html_forbidden``.
        """
        parent = element.getparent()
        own_text = element.text.strip() if element.text else None
        result = {
            "tag": str(element.tag),
            "attributes": cls.__clean_attributes(element),
            "text": own_text,
            "path": cls._get_element_path(element),
        }

        if parent is not None:
            result["parent_name"] = parent.tag
            result["parent_attribs"] = dict(parent.attrib)
            result["parent_text"] = parent.text.strip() if parent.text else None

            sibling_tags = tuple(child.tag for child in parent.iterchildren() if child != element)
            if sibling_tags:
                result["siblings"] = sibling_tags

        child_tags = tuple(child.tag for child in element.iterchildren() if not isinstance(child, html_forbidden))
        if child_tags:
            result["children"] = child_tags

        return result

    @classmethod
    def _get_element_path(cls, element: html.HtmlElement):
        """Return the tuple of tags from the topmost ancestor down to ``element``."""
        tags = []
        node = element
        # Walk up to the root collecting tags, then flip to root-first order
        while node is not None:
            tags.append(node.tag)
            node = node.getparent()
        tags.reverse()
        return tuple(tags)
|
| 115 |
+
|
| 116 |
+
|
| 117 |
+
@lru_cache(128, typed=True)
def clean_spaces(string):
    """Normalize whitespace in ``string``.

    Applies ``__CLEANING_TABLE__`` (tabs become spaces, newlines and carriage
    returns are removed), then collapses runs of spaces into a single space.
    """
    translated = string.translate(__CLEANING_TABLE__)
    collapsed = __CONSECUTIVE_SPACES_REGEX__.sub(" ", translated)
    return collapsed
|
engines/__init__.py
ADDED
|
File without changes
|
engines/_browsers/__init__.py
ADDED
|
File without changes
|
engines/_browsers/_base.py
ADDED
|
@@ -0,0 +1,534 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
from time import time
|
| 2 |
+
from asyncio import sleep as asyncio_sleep, Lock
|
| 3 |
+
from contextlib import contextmanager, asynccontextmanager
|
| 4 |
+
|
| 5 |
+
from playwright.sync_api._generated import Page
|
| 6 |
+
from playwright.sync_api import (
|
| 7 |
+
Frame,
|
| 8 |
+
BrowserContext,
|
| 9 |
+
Response as SyncPlaywrightResponse,
|
| 10 |
+
)
|
| 11 |
+
from playwright.async_api._generated import Page as AsyncPage
|
| 12 |
+
from playwright.async_api import (
|
| 13 |
+
Frame as AsyncFrame,
|
| 14 |
+
Response as AsyncPlaywrightResponse,
|
| 15 |
+
BrowserContext as AsyncBrowserContext,
|
| 16 |
+
)
|
| 17 |
+
from playwright._impl._errors import Error as PlaywrightError
|
| 18 |
+
|
| 19 |
+
from scrapling.parser import Selector
|
| 20 |
+
from scrapling.engines._browsers._page import PageInfo, PagePool
|
| 21 |
+
from scrapling.engines._browsers._validators import validate, PlaywrightConfig, StealthConfig
|
| 22 |
+
from scrapling.engines._browsers._config_tools import __default_chrome_useragent__, __default_useragent__
|
| 23 |
+
from scrapling.engines.toolbelt.navigation import (
|
| 24 |
+
construct_proxy_dict,
|
| 25 |
+
create_intercept_handler,
|
| 26 |
+
create_async_intercept_handler,
|
| 27 |
+
)
|
| 28 |
+
from scrapling.core._types import (
|
| 29 |
+
Any,
|
| 30 |
+
Dict,
|
| 31 |
+
List,
|
| 32 |
+
Set,
|
| 33 |
+
Optional,
|
| 34 |
+
Callable,
|
| 35 |
+
TYPE_CHECKING,
|
| 36 |
+
cast,
|
| 37 |
+
overload,
|
| 38 |
+
Tuple,
|
| 39 |
+
ProxyType,
|
| 40 |
+
Generator,
|
| 41 |
+
AsyncGenerator,
|
| 42 |
+
)
|
| 43 |
+
from scrapling.engines.constants import STEALTH_ARGS, HARMFUL_ARGS, DEFAULT_ARGS
|
| 44 |
+
|
| 45 |
+
|
| 46 |
+
class SyncSession:
    """Shared plumbing for synchronous browser sessions.

    Holds the Playwright handles (``playwright``, ``browser``, ``context``), a
    ``PagePool`` created with ``max_pages``, and the helpers used to open pages
    and wait for them to settle. ``_build_context_with_proxy`` must be supplied
    by a subclass/mixin; the version here raises ``NotImplementedError``.
    """

    _config: "PlaywrightConfig | StealthConfig"
    _context_options: Dict[str, Any]

    def _build_context_with_proxy(self, proxy: Optional[ProxyType] = None) -> Dict[str, Any]:
        raise NotImplementedError  # pragma: no cover

    def __init__(self, max_pages: int = 1):
        self.max_pages = max_pages
        self.page_pool = PagePool(max_pages)
        self._max_wait_for_page = 60
        self.playwright: Any = None
        self.context: Any = None
        self.browser: Any = None
        self._is_alive = False

    def start(self) -> None:
        pass

    def close(self):  # pragma: no cover
        """Close all resources"""
        if not self._is_alive:
            return

        if self.context:
            self.context.close()
            self.context = None

        if self.browser:
            self.browser.close()
            self.browser = None

        if self.playwright:
            self.playwright.stop()
            self.playwright = None  # pyright: ignore

        self._is_alive = False

    def __enter__(self):
        self.start()
        return self

    def __exit__(self, exc_type, exc_val, exc_tb):
        self.close()

    def _initialize_context(self, config: PlaywrightConfig | StealthConfig, ctx: BrowserContext) -> BrowserContext:
        """Initialize the browser context with the configured init script and cookies, if any."""
        if config.init_script:
            ctx.add_init_script(path=config.init_script)

        if config.cookies:  # pragma: no cover
            ctx.add_cookies(config.cookies)

        return ctx

    def _get_page(
        self,
        timeout: int | float,
        extra_headers: Optional[Dict[str, str]],
        disable_resources: bool,
        blocked_domains: Optional[Set[str]] = None,
        context: Optional[BrowserContext] = None,
    ) -> PageInfo[Page]:  # pragma: no cover
        """Open a new page, configure it, register it in the page pool, and mark it busy.

        :param timeout: Value used for both the default and the navigation timeout of the page.
        :param extra_headers: Extra HTTP headers to set on the page, if any.
        :param disable_resources: Passed to the intercept handler installed on the page.
        :param blocked_domains: Passed to the intercept handler installed on the page.
        :param context: Context to open the page in; defaults to the session's own context.
        :return: The ``PageInfo`` created by the pool for the new page.
        """
        # The sync code needs no availability check here: it blocks until the previous page is closed.
        ctx = context if context is not None else self.context
        assert ctx is not None, "Browser context not initialized"
        page = ctx.new_page()
        page.set_default_navigation_timeout(timeout)
        page.set_default_timeout(timeout)
        if extra_headers:
            page.set_extra_http_headers(extra_headers)

        if disable_resources or blocked_domains:
            page.route("**/*", create_intercept_handler(disable_resources, blocked_domains))
        page_info = self.page_pool.add_page(page)
        page_info.mark_busy()
        return page_info

    def get_pool_stats(self) -> Dict[str, int]:
        """Get statistics about the current page pool"""
        return {
            "total_pages": self.page_pool.pages_count,
            "busy_pages": self.page_pool.busy_count,
            "max_pages": self.max_pages,
        }

    @staticmethod
    def _wait_for_networkidle(page: Page | Frame, timeout: Optional[int] = None):
        """Wait for the page to become idle (no network activity) even if there are never-ending requests."""
        try:
            page.wait_for_load_state("networkidle", timeout=timeout)
        except (PlaywrightError, Exception):
            # Deliberate best-effort: a page that never goes idle must not fail the request
            pass

    def _wait_for_page_stability(self, page: Page | Frame, load_dom: bool, network_idle: bool):
        """Wait for the ``load`` state, then optionally for ``domcontentloaded`` and network idle."""
        page.wait_for_load_state(state="load")
        if load_dom:
            page.wait_for_load_state(state="domcontentloaded")
        if network_idle:
            self._wait_for_networkidle(page)

    @staticmethod
    def _create_response_handler(page_info: PageInfo[Page], response_container: List) -> Callable:
        """Create a response handler that captures the final navigation response.

        :param page_info: The PageInfo object containing the page
        :param response_container: A list to store the final response (mutable container)
        :return: A callback function for page.on("response", ...)
        """

        def handle_response(finished_response: SyncPlaywrightResponse):
            if (
                finished_response.request.resource_type == "document"
                and finished_response.request.is_navigation_request()
                and finished_response.request.frame == page_info.page.main_frame
            ):
                response_container[0] = finished_response

        return handle_response

    @contextmanager
    def _page_generator(
        self,
        timeout: int | float,
        extra_headers: Optional[Dict[str, str]],
        disable_resources: bool,
        proxy: Optional[ProxyType] = None,
        blocked_domains: Optional[Set[str]] = None,
    ) -> Generator["PageInfo[Page]", None, None]:
        """Acquire a page - either from persistent context or fresh context with proxy.

        The page is always dropped from the page pool on exit, even when closing
        the page/context raises.

        :raises RuntimeError: If ``proxy`` is given but no browser is available.
        """
        if proxy:
            # Rotation mode: create fresh context with the provided proxy
            if not self.browser:  # pragma: no cover
                raise RuntimeError("Browser not initialized for proxy rotation mode")
            context_options = self._build_context_with_proxy(proxy)
            context: BrowserContext = self.browser.new_context(**context_options)
            page_info = None

            try:
                context = self._initialize_context(self._config, context)
                page_info = self._get_page(timeout, extra_headers, disable_resources, blocked_domains, context=context)
                yield page_info
            finally:
                try:
                    context.close()
                finally:
                    # `_get_page` registered the page in the pool; drop it now that its context is gone
                    if page_info is not None and page_info in self.page_pool.pages:
                        self.page_pool.pages.remove(page_info)
        else:
            # Standard mode: use PagePool with persistent context
            page_info = self._get_page(timeout, extra_headers, disable_resources, blocked_domains)
            try:
                yield page_info
            finally:
                try:
                    page_info.page.close()
                finally:
                    # Free the pool slot even if closing the page failed
                    self.page_pool.pages.remove(page_info)
|
| 198 |
+
|
| 199 |
+
|
| 200 |
+
class AsyncSession:
    """Shared plumbing for asynchronous browser sessions.

    Holds the Playwright handles (``playwright``, ``browser``, ``context``), a
    ``PagePool`` created with ``max_pages``, an ``asyncio.Lock`` guarding page
    creation, and the helpers used to open pages and wait for them to settle.
    ``_build_context_with_proxy`` must be supplied by a subclass/mixin; the
    version here raises ``NotImplementedError``.
    """

    _config: "PlaywrightConfig | StealthConfig"
    _context_options: Dict[str, Any]

    def _build_context_with_proxy(self, proxy: Optional[ProxyType] = None) -> Dict[str, Any]:
        raise NotImplementedError  # pragma: no cover

    def __init__(self, max_pages: int = 1):
        self.max_pages = max_pages
        self.page_pool = PagePool(max_pages)
        self._max_wait_for_page = 60
        self.playwright: Any = None
        self.context: Any = None
        self.browser: Any = None
        self._is_alive = False
        self._lock = Lock()

    async def start(self) -> None:
        pass

    async def close(self):
        """Close all resources"""
        if not self._is_alive:  # pragma: no cover
            return

        if self.context:
            await self.context.close()
            self.context = None  # pyright: ignore

        if self.browser:
            await self.browser.close()
            self.browser = None

        if self.playwright:
            await self.playwright.stop()
            self.playwright = None  # pyright: ignore

        self._is_alive = False

    async def __aenter__(self):
        await self.start()
        return self

    async def __aexit__(self, exc_type, exc_val, exc_tb):
        await self.close()

    async def _initialize_context(
        self, config: PlaywrightConfig | StealthConfig, ctx: AsyncBrowserContext
    ) -> AsyncBrowserContext:
        """Initialize the browser context with the configured init script and cookies, if any."""
        if config.init_script:  # pragma: no cover
            await ctx.add_init_script(path=config.init_script)

        if config.cookies:  # pragma: no cover
            await ctx.add_cookies(config.cookies)

        return ctx

    async def _get_page(
        self,
        timeout: int | float,
        extra_headers: Optional[Dict[str, str]],
        disable_resources: bool,
        blocked_domains: Optional[Set[str]] = None,
        context: Optional[AsyncBrowserContext] = None,
    ) -> PageInfo[AsyncPage]:  # pragma: no cover
        """Open a new page, configure it, and register it in the page pool.

        When the persistent context is used and the pool is full, this waits up to
        ``_max_wait_for_page`` seconds for a slot to free up.

        :param timeout: Value used for both the default and the navigation timeout of the page.
        :param extra_headers: Extra HTTP headers to set on the page, if any.
        :param disable_resources: Passed to the intercept handler installed on the page.
        :param blocked_domains: Passed to the intercept handler installed on the page.
        :param context: Context to open the page in; defaults to the session's own context.
        :return: The ``PageInfo`` created by the pool for the new page.
        :raises TimeoutError: If no pool slot frees up within the waiting period.
        """
        ctx = context if context is not None else self.context
        if TYPE_CHECKING:
            assert ctx is not None, "Browser context not initialized"

        async with self._lock:
            # If we're at max capacity after cleanup, wait for busy pages to finish
            if context is None and self.page_pool.pages_count >= self.max_pages:
                # Only applies when using persistent context
                start_time = time()
                while time() - start_time < self._max_wait_for_page:
                    await asyncio_sleep(0.05)
                    if self.page_pool.pages_count < self.max_pages:
                        break
                else:
                    raise TimeoutError(
                        f"No pages finished to clear place in the pool within the {self._max_wait_for_page}s timeout period"
                    )

            page = await ctx.new_page()
            page.set_default_navigation_timeout(timeout)
            page.set_default_timeout(timeout)
            if extra_headers:
                await page.set_extra_http_headers(extra_headers)

            if disable_resources or blocked_domains:
                await page.route("**/*", create_async_intercept_handler(disable_resources, blocked_domains))

            return self.page_pool.add_page(page)

    def get_pool_stats(self) -> Dict[str, int]:
        """Get statistics about the current page pool"""
        return {
            "total_pages": self.page_pool.pages_count,
            "busy_pages": self.page_pool.busy_count,
            "max_pages": self.max_pages,
        }

    @staticmethod
    async def _wait_for_networkidle(page: AsyncPage | AsyncFrame, timeout: Optional[int] = None):
        """Wait for the page to become idle (no network activity) even if there are never-ending requests."""
        try:
            await page.wait_for_load_state("networkidle", timeout=timeout)
        except (PlaywrightError, Exception):
            # Deliberate best-effort: a page that never goes idle must not fail the request
            pass

    async def _wait_for_page_stability(self, page: AsyncPage | AsyncFrame, load_dom: bool, network_idle: bool):
        """Wait for the ``load`` state, then optionally for ``domcontentloaded`` and network idle."""
        await page.wait_for_load_state(state="load")
        if load_dom:
            await page.wait_for_load_state(state="domcontentloaded")
        if network_idle:
            await self._wait_for_networkidle(page)

    @staticmethod
    def _create_response_handler(page_info: PageInfo[AsyncPage], response_container: List) -> Callable:
        """Create an async response handler that captures the final navigation response.

        :param page_info: The PageInfo object containing the page
        :param response_container: A list to store the final response (mutable container)
        :return: A callback function for page.on("response", ...)
        """

        async def handle_response(finished_response: AsyncPlaywrightResponse):
            if (
                finished_response.request.resource_type == "document"
                and finished_response.request.is_navigation_request()
                and finished_response.request.frame == page_info.page.main_frame
            ):
                response_container[0] = finished_response

        return handle_response

    @asynccontextmanager
    async def _page_generator(
        self,
        timeout: int | float,
        extra_headers: Optional[Dict[str, str]],
        disable_resources: bool,
        proxy: Optional[ProxyType] = None,
        blocked_domains: Optional[Set[str]] = None,
    ) -> AsyncGenerator["PageInfo[AsyncPage]", None]:
        """Acquire a page - either from persistent context or fresh context with proxy.

        The page is always dropped from the page pool on exit, even when closing
        the page/context raises.

        :raises RuntimeError: If ``proxy`` is given but no browser is available.
        """
        if proxy:
            # Rotation mode: create fresh context with the provided proxy
            if not self.browser:  # pragma: no cover
                raise RuntimeError("Browser not initialized for proxy rotation mode")
            context_options = self._build_context_with_proxy(proxy)
            context: AsyncBrowserContext = await self.browser.new_context(**context_options)
            page_info = None

            try:
                context = await self._initialize_context(self._config, context)
                page_info = await self._get_page(
                    timeout, extra_headers, disable_resources, blocked_domains, context=context
                )
                yield page_info
            finally:
                try:
                    await context.close()
                finally:
                    # `_get_page` registered the page in the pool; drop it now that its context is gone
                    if page_info is not None and page_info in self.page_pool.pages:
                        self.page_pool.pages.remove(page_info)
        else:
            # Standard mode: use PagePool with persistent context
            page_info = await self._get_page(timeout, extra_headers, disable_resources, blocked_domains)
            try:
                yield page_info
            finally:
                try:
                    await page_info.page.close()
                finally:
                    # Free the pool slot even if closing the page failed
                    self.page_pool.pages.remove(page_info)
|
| 371 |
+
|
| 372 |
+
|
| 373 |
+
class BaseSessionMixin:
    """Common configuration logic shared by the browser session mixins.

    Validates user parameters into a config object and derives the
    ``_context_options`` / ``_browser_options`` dictionaries from it.
    """

    _config: "PlaywrightConfig | StealthConfig"

    @overload
    def __validate_routine__(self, params: Dict, model: type[StealthConfig]) -> StealthConfig: ...

    @overload
    def __validate_routine__(self, params: Dict, model: type[PlaywrightConfig]) -> PlaywrightConfig: ...

    def __validate_routine__(
        self, params: Dict, model: type[PlaywrightConfig] | type[StealthConfig]
    ) -> PlaywrightConfig | StealthConfig:
        """Reset the option dictionaries to their defaults and validate ``params`` against ``model``.

        :param params: Raw session parameters. A ``__max_pages`` key is renamed to ``max_pages`` in place.
        :param model: The config class to validate against.
        :return: The validated config object.
        """
        # Dark color scheme bypasses the 'prefersLightColor' check in creepjs
        self._context_options: Dict[str, Any] = {"color_scheme": "dark", "device_scale_factor": 2}
        self._browser_options: Dict[str, Any] = {
            "args": DEFAULT_ARGS,
            "ignore_default_args": HARMFUL_ARGS,
        }
        if "__max_pages" in params:
            params["max_pages"] = params.pop("__max_pages")

        config = validate(params, model=model)
        # Lower-cased names of the user-supplied extra headers
        self._headers_keys = (
            {header.lower() for header in config.extra_headers.keys()} if config.extra_headers else set()
        )

        return config

    def __generate_options__(self, extra_flags: Tuple | None = None) -> None:
        """Fill the context and browser options from ``self._config``.

        :param extra_flags: Additional browser flags supplied by the calling mixin. They are
            combined with the default flags and with ``config.extra_flags``; duplicates are
            dropped while keeping first-seen order.
        """
        config: PlaywrightConfig | StealthConfig = self._config
        self._context_options.update(
            {
                "proxy": config.proxy,
                "locale": config.locale,
                "timezone_id": config.timezone_id,
                "extra_http_headers": config.extra_headers,
            }
        )
        # The default useragent in the headful is always correct now in the current versions of Playwright
        if config.useragent:
            self._context_options["user_agent"] = config.useragent
        elif config.headless:
            self._context_options["user_agent"] = (
                __default_chrome_useragent__ if config.real_chrome else __default_useragent__
            )

        if not config.cdp_url:
            flags = self._browser_options["args"]
            if config.extra_flags or extra_flags:
                # Keep BOTH the mixin-supplied and the user-supplied flags. Unpacking works for any
                # mix of tuples/lists, and dict.fromkeys de-duplicates in a deterministic order.
                flags = list(dict.fromkeys((*flags, *(extra_flags or ()), *(config.extra_flags or ()))))

            self._browser_options.update(
                {
                    "args": flags,
                    "headless": config.headless,
                    "channel": "chrome" if config.real_chrome else "chromium",
                }
            )

            self._user_data_dir = config.user_data_dir
        else:
            # With a CDP URL no browser options are kept
            self._browser_options = {}

        if config.additional_args:
            self._context_options.update(config.additional_args)

    def _build_context_with_proxy(self, proxy: Optional[ProxyType] = None) -> Dict[str, Any]:
        """
        Build context options with a specific proxy for rotation mode.

        :param proxy: Proxy URL string or Playwright-style proxy dict to use for this context.
        :return: Dictionary of context options for browser.new_context().
        """

        # Shallow copy so the session-wide options are not modified
        context_options = self._context_options.copy()

        # Override proxy if provided
        if proxy:
            context_options["proxy"] = construct_proxy_dict(proxy)

        return context_options
|
| 454 |
+
|
| 455 |
+
|
| 456 |
+
class DynamicSessionMixin(BaseSessionMixin):
    """Session mixin whose parameters are validated against ``PlaywrightConfig``."""

    def __validate__(self, **params):
        """Validate ``params``, store the resulting config, and generate the session options."""
        validated_config = self.__validate_routine__(params, model=PlaywrightConfig)
        self._config = validated_config
        self.__generate_options__()
|
| 460 |
+
|
| 461 |
+
|
| 462 |
+
class StealthySessionMixin(BaseSessionMixin):
    """Session mixin whose parameters are validated against ``StealthConfig``."""

    def __validate__(self, **params):
        """Validate ``params``, apply the stealth context defaults, and generate the session options."""
        self._config = self.__validate_routine__(params, model=StealthConfig)

        full_hd = {"width": 1920, "height": 1080}
        stealth_context_defaults = {
            "is_mobile": False,
            "has_touch": False,
            # Disabling service workers would avoid some headaches, but they stay allowed for now
            "service_workers": "allow",
            "ignore_https_errors": True,
            "screen": dict(full_hd),
            "viewport": dict(full_hd),
            "permissions": ["geolocation", "notifications"],
        }
        self._context_options.update(stealth_context_defaults)
        self.__generate_stealth_options()

    def __generate_stealth_options(self) -> None:
        """Assemble the stealth browser flags and pass them to ``__generate_options__``."""
        config = cast(StealthConfig, self._config)
        flags: Tuple[str, ...] = tuple() if config.cdp_url else DEFAULT_ARGS + STEALTH_ARGS

        if config.block_webrtc:
            webrtc_flags = (
                "--webrtc-ip-handling-policy=disable_non_proxied_udp",
                "--force-webrtc-ip-handling-policy",  # Ensures the policy is enforced
            )
            flags = flags + webrtc_flags

        if not config.allow_webgl:
            webgl_flags = (
                "--disable-webgl",
                "--disable-webgl-image-chromium",
                "--disable-webgl2",
            )
            flags = flags + webgl_flags

        if config.hide_canvas:
            flags = flags + ("--fingerprinting-canvas-image-data-noise",)

        super().__generate_options__(flags)

    @staticmethod
    def _detect_cloudflare(page_content: str) -> str | None:
        """
        Detect the type of Cloudflare challenge present in the provided page content.

        The content is first searched for a ``cType: '<type>'`` marker for each of
        the challenge types ``non-interactive``, ``managed`` and ``interactive``
        (checked in that order). If none is found, the content is parsed and
        checked for a Turnstile script tag, in which case ``"embedded"`` is returned.

        Args:
            page_content (str): The content of the page to analyze for Cloudflare
                challenge types.

        Returns:
            str | None: The detected challenge type (``"non-interactive"``,
                ``"managed"``, ``"interactive"`` or ``"embedded"``), or None if
                nothing matches.
        """
        known_types = ("non-interactive", "managed", "interactive")
        detected = next((ctype for ctype in known_types if f"cType: '{ctype}'" in page_content), None)
        if detected is not None:
            return detected

        # Check if turnstile captcha is embedded inside the page (Usually inside a closed Shadow iframe)
        turnstile_scripts = Selector(content=page_content).css('script[src*="challenges.cloudflare.com/turnstile/v"]')
        return "embedded" if turnstile_scripts else None
|
engines/_browsers/_config_tools.py
ADDED
|
@@ -0,0 +1,4 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
from scrapling.engines.toolbelt.fingerprints import generate_headers
|
| 2 |
+
|
| 3 |
+
# Module-level default: the "User-Agent" value from the headers produced by
# `generate_headers(browser_mode=True)`; `.get` yields None if that key is absent.
__default_useragent__ = generate_headers(browser_mode=True).get("User-Agent")
|
| 4 |
+
# Same as above, but the headers are generated with `browser_mode="chrome"`.
# NOTE(review): presumably this pins the generated User-Agent to Chrome — confirm in `generate_headers`.
__default_chrome_useragent__ = generate_headers(browser_mode="chrome").get("User-Agent")
|
engines/_browsers/_controllers.py
ADDED
|
@@ -0,0 +1,362 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
from time import sleep as time_sleep
|
| 2 |
+
from asyncio import sleep as asyncio_sleep
|
| 3 |
+
|
| 4 |
+
from playwright.sync_api import (
|
| 5 |
+
Locator,
|
| 6 |
+
sync_playwright,
|
| 7 |
+
)
|
| 8 |
+
from playwright.async_api import (
|
| 9 |
+
async_playwright,
|
| 10 |
+
Locator as AsyncLocator,
|
| 11 |
+
)
|
| 12 |
+
|
| 13 |
+
from scrapling.core.utils import log
|
| 14 |
+
from scrapling.core._types import Optional, ProxyType, Unpack
|
| 15 |
+
from scrapling.engines.toolbelt.proxy_rotation import is_proxy_error
|
| 16 |
+
from scrapling.engines.toolbelt.convertor import Response, ResponseFactory
|
| 17 |
+
from scrapling.engines.toolbelt.fingerprints import generate_convincing_referer
|
| 18 |
+
from scrapling.engines._browsers._types import PlaywrightSession, PlaywrightFetchParams
|
| 19 |
+
from scrapling.engines._browsers._base import SyncSession, AsyncSession, DynamicSessionMixin
|
| 20 |
+
from scrapling.engines._browsers._validators import validate_fetch as _validate, PlaywrightConfig
|
| 21 |
+
|
| 22 |
+
|
| 23 |
+
class DynamicSession(SyncSession, DynamicSessionMixin):
    """A synchronous Browser session manager with page pooling."""

    __slots__ = (
        "_config",
        "_context_options",
        "_browser_options",
        "_user_data_dir",
        "_headers_keys",
        "max_pages",
        "page_pool",
        "_max_wait_for_page",
        "playwright",
        "context",
    )

    def __init__(self, **kwargs: Unpack[PlaywrightSession]):
        """Validate the given options and prepare the session; by default it runs on a persistent browser Context with a temporary user profile directory.

        :param headless: Whether the browser runs hidden (headless, the default) or visible (headful).
        :param disable_resources: Drop requests for unnecessary resources for a speed boost.
            The dropped request types are `font`, `image`, `media`, `beacon`, `object`, `imageset`, `texttrack`, `websocket`, `csp_report`, and `stylesheet`.
        :param blocked_domains: Domain names whose requests are blocked. Subdomains match as well (e.g., ``"example.com"`` also blocks ``"sub.example.com"``).
        :param useragent: A useragent string to use. When omitted, the fetcher generates a real Useragent of the same browser and uses it.
        :param cookies: Cookies to set for the next request.
        :param network_idle: Wait until the page has had no network connections for at least 500 ms.
        :param timeout: Timeout in milliseconds applied to all operations and waits on the page. The default is 30,000
        :param wait: Time in milliseconds the fetcher waits once everything has finished, before closing the page and returning the `Response` object.
        :param page_action: Automation hook: a function that receives the `page` object and performs the automation you need.
        :param wait_selector: A CSS selector to wait for until it reaches a specific state.
        :param init_script: Absolute path to a JavaScript file that is executed on page creation for every page in this session.
        :param locale: User locale, for example, `en-GB`, `de-DE`, etc. It affects the navigator.language value, the Accept-Language request header value, and number and date formatting
            rules. Defaults to the system default locale.
        :param timezone_id: Overrides the browser timezone. Defaults to the system timezone.
        :param wait_selector_state: The state awaited for the selector given with `wait_selector`. The default state is `attached`.
        :param real_chrome: Enable this if Chrome is installed on your device, and the Fetcher will launch an instance of your browser and use it.
        :param load_dom: Enabled by default; wait for all JavaScript on page(s) to fully load and execute.
        :param cdp_url: Connect to this CDP URL to control a real browser through CDP instead of launching a new browser instance.
        :param google_search: Enabled by default; the referer header is set as if the request came from a Google search of this website's domain name.
        :param extra_headers: A dictionary of extra headers to add to the request.
            NOTE(review): `fetch` only generates the `google_search` referer when the effective extra headers contain no referer key, so a referer supplied through headers wins.
        :param proxy: The proxy used with requests; either a string or a dictionary with the keys 'server', 'username', and 'password' only.
        :param user_data_dir: Path to a User Data Directory, which stores browser session data like cookies and local storage. By default a temporary directory is created.
        :param extra_flags: Additional browser flags passed to the browser on launch.
        :param selector_config: Arguments passed at the end when the final Selector's class is created.
        :param additional_args: Extra settings handed to Playwright's context; they take higher priority than Scrapling's settings.
        """
        self.__validate__(**kwargs)
        super().__init__()

    def start(self):
        """Start Playwright and create the browser and/or context for this session.

        Which objects get created depends on the configuration:

        - `cdp_url` set: connect over CDP; a new context is created only when no proxy rotator is configured.
        - proxy rotator set (and no `cdp_url`): launch a browser only, without creating a context here.
        - otherwise: launch a persistent context on the configured user data directory.

        :raises RuntimeError: If the session has already been started.
        """
        if self.playwright:
            raise RuntimeError("Session has been already started")

        self.playwright = sync_playwright().start()
        try:
            chromium = self.playwright.chromium
            if self._config.cdp_url:  # pragma: no cover
                self.browser = chromium.connect_over_cdp(endpoint_url=self._config.cdp_url)
                if not self._config.proxy_rotator and self.browser:
                    self.context = self.browser.new_context(**self._context_options)
            elif self._config.proxy_rotator:
                self.browser = chromium.launch(**self._browser_options)
            else:
                # Later operands win on key clashes: context options override browser options.
                launch_options = self._browser_options | self._context_options
                launch_options = launch_options | {"user_data_dir": self._user_data_dir}
                self.context = chromium.launch_persistent_context(**launch_options)

            if self.context:
                self.context = self._initialize_context(self._config, self.context)

            self._is_alive = True
        except Exception:
            # Browser setup failed: stop Playwright and reset the handle before re-raising.
            self.playwright.stop()
            self.playwright = None
            raise

    def fetch(self, url: str, **kwargs: Unpack[PlaywrightFetchParams]) -> Response:
        """Open the URL in a page from this session and build a `Response` from it, retrying on failure.

        :param url: The Target url.
        :param google_search: Enabled by default; the referer header is set as if the request came from a Google search of this website's domain name.
        :param timeout: Timeout in milliseconds applied to all operations and waits on the page. The default is 30,000
        :param wait: Time in milliseconds the fetcher waits once everything has finished, before closing the page and returning the `Response` object.
        :param page_action: Automation hook: a function that receives the `page` object and performs the automation you need. Errors it raises are logged, not propagated.
        :param extra_headers: A dictionary of extra headers to add to the request. If it contains a referer key (any letter case), the `google_search` referer is not generated.
        :param disable_resources: Drop requests for unnecessary resources for a speed boost.
            The dropped request types are `font`, `image`, `media`, `beacon`, `object`, `imageset`, `texttrack`, `websocket`, `csp_report`, and `stylesheet`.
        :param blocked_domains: Domain names whose requests are blocked. Subdomains match as well (e.g., ``"example.com"`` also blocks ``"sub.example.com"``).
        :param wait_selector: A CSS selector to wait for until it reaches a specific state. Errors while waiting are logged, not propagated.
        :param wait_selector_state: The state awaited for the selector given with `wait_selector`. The default state is `attached`.
        :param network_idle: Wait until the page has had no network connections for at least 500 ms.
        :param load_dom: Enabled by default; wait for all JavaScript on page(s) to fully load and execute.
        :param selector_config: Arguments passed at the end when the final Selector's class is created.
        :param proxy: Static proxy to override rotator and session proxy. A new browser context will be created and used with it.
        :return: A `Response` object.
        :raises RuntimeError: If the session is not alive, or if navigation produced no response on the last attempt.
        """
        static_proxy = kwargs.pop("proxy", None)

        params = _validate(kwargs, self, PlaywrightConfig)
        if not self._is_alive:  # pragma: no cover
            raise RuntimeError("Context manager has been closed")

        # Header names are compared case-insensitively when looking for a user-supplied referer.
        lowered_header_names = set()
        if params.extra_headers:
            lowered_header_names = {name.lower() for name in params.extra_headers.keys()}

        referer = None
        if params.google_search and "referer" not in lowered_header_names:
            referer = generate_convincing_referer(url)

        for attempt in range(self._config.retries):
            # A static proxy always wins; otherwise ask the rotator (if any) on every attempt.
            rotator = self._config.proxy_rotator
            proxy: Optional[ProxyType] = rotator.get_proxy() if (rotator and static_proxy is None) else static_proxy

            with self._page_generator(
                params.timeout, params.extra_headers, params.disable_resources, proxy, params.blocked_domains
            ) as page_info:
                # One-element list used as a mutable slot that the response handler receives.
                final_response = [None]
                page = page_info.page
                page.on("response", self._create_response_handler(page_info, final_response))

                try:
                    first_response = page.goto(url, referer=referer)
                    self._wait_for_page_stability(page, params.load_dom, params.network_idle)

                    if not first_response:
                        raise RuntimeError(f"Failed to get response for {url}")

                    if params.page_action:
                        try:
                            params.page_action(page)
                        except Exception as e:  # pragma: no cover
                            log.error(f"Error executing page_action: {e}")

                    if params.wait_selector:
                        try:
                            target: Locator = page.locator(params.wait_selector)
                            target.first.wait_for(state=params.wait_selector_state)
                            self._wait_for_page_stability(page, params.load_dom, params.network_idle)
                        except Exception as e:  # pragma: no cover
                            log.error(f"Error waiting for selector {params.wait_selector}: {e}")

                    page.wait_for_timeout(params.wait)

                    return ResponseFactory.from_playwright_response(
                        page, first_response, final_response[0], params.selector_config, meta={"proxy": proxy}
                    )

                except Exception as e:
                    page_info.mark_error()
                    if attempt >= self._config.retries - 1:
                        log.error(f"Failed after {self._config.retries} attempts: {e}")
                        raise

                    # NOTE(review): `proxy` may be a dict carrying a password (see `__init__` docs); it is logged as-is.
                    if is_proxy_error(e):
                        log.warning(
                            f"Proxy '{proxy}' failed (attempt {attempt + 1}) | Retrying in {self._config.retry_delay}s..."
                        )
                    else:
                        log.warning(
                            f"Attempt {attempt + 1} failed: {e}. Retrying in {self._config.retry_delay}s..."
                        )
                    time_sleep(self._config.retry_delay)

        raise RuntimeError("Request failed")  # pragma: no cover
|
| 194 |
+
|
| 195 |
+
|
| 196 |
+
class AsyncDynamicSession(AsyncSession, DynamicSessionMixin):
    """An asynchronous Browser session manager with page pooling; by default it runs on a persistent browser Context with a temporary user profile directory."""

    __slots__ = (
        "_config",
        "_context_options",
        "_browser_options",
        "_user_data_dir",
        "_headers_keys",
    )

    def __init__(self, **kwargs: Unpack[PlaywrightSession]):
        """Validate the given options and prepare the async session with page pooling.

        :param headless: Whether the browser runs hidden (headless, the default) or visible (headful).
        :param disable_resources: Drop requests for unnecessary resources for a speed boost.
            The dropped request types are `font`, `image`, `media`, `beacon`, `object`, `imageset`, `texttrack`, `websocket`, `csp_report`, and `stylesheet`.
        :param blocked_domains: Domain names whose requests are blocked. Subdomains match as well (e.g., ``"example.com"`` also blocks ``"sub.example.com"``).
        :param useragent: A useragent string to use. When omitted, the fetcher generates a real Useragent of the same browser and uses it.
        :param cookies: Cookies to set for the next request.
        :param network_idle: Wait until the page has had no network connections for at least 500 ms.
        :param load_dom: Enabled by default; wait for all JavaScript on page(s) to fully load and execute.
        :param timeout: Timeout in milliseconds applied to all operations and waits on the page. The default is 30,000
        :param wait: Time in milliseconds the fetcher waits once everything has finished, before closing the page and returning the `Response` object.
        :param page_action: Automation hook: a function that receives the `page` object and performs the automation you need.
        :param wait_selector: A CSS selector to wait for until it reaches a specific state.
        :param init_script: Absolute path to a JavaScript file that is executed on page creation for every page in this session.
        :param locale: User locale, for example, `en-GB`, `de-DE`, etc. It affects the navigator.language value, the Accept-Language request header value, and number and date formatting
            rules. Defaults to the system default locale.
        :param timezone_id: Overrides the browser timezone. Defaults to the system timezone.
        :param wait_selector_state: The state awaited for the selector given with `wait_selector`. The default state is `attached`.
        :param real_chrome: Enable this if Chrome is installed on your device, and the Fetcher will launch an instance of your browser and use it.
        :param cdp_url: Connect to this CDP URL to control a real browser through CDP instead of launching a new browser instance.
        :param google_search: Enabled by default; the referer header is set as if the request came from a Google search of this website's domain name.
        :param extra_headers: A dictionary of extra headers to add to the request.
            NOTE(review): `fetch` only generates the `google_search` referer when the effective extra headers contain no referer key, so a referer supplied through headers wins.
        :param proxy: The proxy used with requests; either a string or a dictionary with the keys 'server', 'username', and 'password' only.
        :param max_pages: The maximum number of tabs open at the same time. They are used in rotation through a PagePool.
        :param user_data_dir: Path to a User Data Directory, which stores browser session data like cookies and local storage. By default a temporary directory is created.
        :param extra_flags: Additional browser flags passed to the browser on launch.
        :param selector_config: Arguments passed at the end when the final Selector's class is created.
        :param additional_args: Extra settings handed to Playwright's context; they take higher priority than Scrapling's settings.
        """
        self.__validate__(**kwargs)
        super().__init__(max_pages=self._config.max_pages)

    async def start(self) -> None:
        """Start Playwright and create the browser and/or context for this session.

        Which objects get created depends on the configuration:

        - `cdp_url` set: connect over CDP; a new context is created only when no proxy rotator is configured.
        - proxy rotator set (and no `cdp_url`): launch a browser only, without creating a context here.
        - otherwise: launch a persistent context on the configured user data directory.

        :raises RuntimeError: If the session has already been started.
        """
        if self.playwright:
            raise RuntimeError("Session has been already started")

        self.playwright = await async_playwright().start()
        try:
            chromium = self.playwright.chromium
            if self._config.cdp_url:
                self.browser = await chromium.connect_over_cdp(endpoint_url=self._config.cdp_url)
                if not self._config.proxy_rotator and self.browser:
                    self.context = await self.browser.new_context(**self._context_options)
            elif self._config.proxy_rotator:
                self.browser = await chromium.launch(**self._browser_options)
            else:
                # Later operands win on key clashes: context options override browser options.
                launch_options = self._browser_options | self._context_options
                launch_options = launch_options | {"user_data_dir": self._user_data_dir}
                self.context = await chromium.launch_persistent_context(**launch_options)

            if self.context:
                self.context = await self._initialize_context(self._config, self.context)

            self._is_alive = True
        except Exception:
            # Browser setup failed: stop Playwright and reset the handle before re-raising.
            await self.playwright.stop()
            self.playwright = None
            raise

    async def fetch(self, url: str, **kwargs: Unpack[PlaywrightFetchParams]) -> Response:
        """Open the URL in a page from this session and build a `Response` from it, retrying on failure.

        :param url: The Target url.
        :param google_search: Enabled by default; the referer header is set as if the request came from a Google search of this website's domain name.
        :param timeout: Timeout in milliseconds applied to all operations and waits on the page. The default is 30,000
        :param wait: Time in milliseconds the fetcher waits once everything has finished, before closing the page and returning the `Response` object.
        :param page_action: Automation hook: a function that receives the `page` object and performs the automation you need. Its result is awaited; errors it raises are logged, not propagated.
        :param extra_headers: A dictionary of extra headers to add to the request. If it contains a referer key (any letter case), the `google_search` referer is not generated.
        :param disable_resources: Drop requests for unnecessary resources for a speed boost.
            The dropped request types are `font`, `image`, `media`, `beacon`, `object`, `imageset`, `texttrack`, `websocket`, `csp_report`, and `stylesheet`.
        :param blocked_domains: Domain names whose requests are blocked. Subdomains match as well (e.g., ``"example.com"`` also blocks ``"sub.example.com"``).
        :param wait_selector: A CSS selector to wait for until it reaches a specific state. Errors while waiting are logged, not propagated.
        :param wait_selector_state: The state awaited for the selector given with `wait_selector`. The default state is `attached`.
        :param network_idle: Wait until the page has had no network connections for at least 500 ms.
        :param load_dom: Enabled by default; wait for all JavaScript on page(s) to fully load and execute.
        :param selector_config: Arguments passed at the end when the final Selector's class is created.
        :param proxy: Static proxy to override rotator and session proxy. A new browser context will be created and used with it.
        :return: A `Response` object.
        :raises RuntimeError: If the session is not alive, or if navigation produced no response on the last attempt.
        """
        static_proxy = kwargs.pop("proxy", None)

        params = _validate(kwargs, self, PlaywrightConfig)

        if not self._is_alive:  # pragma: no cover
            raise RuntimeError("Context manager has been closed")

        # Header names are compared case-insensitively when looking for a user-supplied referer.
        lowered_header_names = set()
        if params.extra_headers:
            lowered_header_names = {name.lower() for name in params.extra_headers.keys()}

        referer = None
        if params.google_search and "referer" not in lowered_header_names:
            referer = generate_convincing_referer(url)

        for attempt in range(self._config.retries):
            # A static proxy always wins; otherwise ask the rotator (if any) on every attempt.
            rotator = self._config.proxy_rotator
            proxy: Optional[ProxyType] = rotator.get_proxy() if (rotator and static_proxy is None) else static_proxy

            async with self._page_generator(
                params.timeout, params.extra_headers, params.disable_resources, proxy, params.blocked_domains
            ) as page_info:
                # One-element list used as a mutable slot that the response handler receives.
                final_response = [None]
                page = page_info.page
                page.on("response", self._create_response_handler(page_info, final_response))

                try:
                    first_response = await page.goto(url, referer=referer)
                    await self._wait_for_page_stability(page, params.load_dom, params.network_idle)

                    if not first_response:
                        raise RuntimeError(f"Failed to get response for {url}")

                    if params.page_action:
                        try:
                            await params.page_action(page)
                        except Exception as e:  # pragma: no cover
                            log.error(f"Error executing page_action: {e}")

                    if params.wait_selector:
                        try:
                            target: AsyncLocator = page.locator(params.wait_selector)
                            await target.first.wait_for(state=params.wait_selector_state)
                            await self._wait_for_page_stability(page, params.load_dom, params.network_idle)
                        except Exception as e:  # pragma: no cover
                            log.error(f"Error waiting for selector {params.wait_selector}: {e}")

                    await page.wait_for_timeout(params.wait)

                    return await ResponseFactory.from_async_playwright_response(
                        page, first_response, final_response[0], params.selector_config, meta={"proxy": proxy}
                    )

                except Exception as e:
                    page_info.mark_error()
                    if attempt >= self._config.retries - 1:
                        log.error(f"Failed after {self._config.retries} attempts: {e}")
                        raise

                    # NOTE(review): `proxy` may be a dict carrying a password (see `__init__` docs); it is logged as-is.
                    if is_proxy_error(e):
                        log.warning(
                            f"Proxy '{proxy}' failed (attempt {attempt + 1}) | Retrying in {self._config.retry_delay}s..."
                        )
                    else:
                        log.warning(
                            f"Attempt {attempt + 1} failed: {e}. Retrying in {self._config.retry_delay}s..."
                        )
                    await asyncio_sleep(self._config.retry_delay)

        raise RuntimeError("Request failed")  # pragma: no cover
|
engines/_browsers/_page.py
ADDED
|
@@ -0,0 +1,87 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
from threading import RLock
|
| 2 |
+
from dataclasses import dataclass
|
| 3 |
+
|
| 4 |
+
from playwright.sync_api._generated import Page as SyncPage
|
| 5 |
+
from playwright.async_api._generated import Page as AsyncPage
|
| 6 |
+
|
| 7 |
+
from scrapling.core._types import Optional, List, Literal, overload, TypeVar, Generic, cast
|
| 8 |
+
|
| 9 |
+
PageState = Literal["ready", "busy", "error"] # States that a page can be in
|
| 10 |
+
# Type variable constrained to exactly the sync or the async Playwright `Page` class;
# it parameterizes `PageInfo` so that a wrapper keeps track of which page flavour it holds.
PageType = TypeVar("PageType", SyncPage, AsyncPage)
|
| 11 |
+
|
| 12 |
+
|
| 13 |
+
@dataclass
class PageInfo(Generic[PageType]):
    """Information about the page and its current state.

    A small mutable record pairing a page object with its pool state and the URL
    it was last marked busy with. Equality is defined on the wrapped page only.
    """

    __slots__ = ("page", "state", "url")
    page: PageType  # The wrapped sync or async page object
    state: PageState  # One of "ready", "busy" or "error"
    url: Optional[str]  # URL recorded by `mark_busy`

    def mark_busy(self, url: str = ""):
        """Mark the page as busy and record the URL it is being used for.

        :param url: The URL associated with the current use of the page. Defaults to an empty string.
        """
        self.state = "busy"
        self.url = url

    def mark_error(self):
        """Mark the page as having an error (the stored URL is left untouched)."""
        self.state = "error"

    def __repr__(self):
        # `!r` already quotes the URL; wrapping it in literal quotes as well produced
        # doubled quoting such as URL="'https://example.com'".
        return f"Page(URL={self.url!r}, state={self.state!r})"

    def __eq__(self, other_page):
        """Compare two `PageInfo` objects by their wrapped page.

        :param other_page: The object to compare against.
        :return: `NotImplemented` when `other_page` is not exactly of this class, otherwise
            whether both wrap equal page objects (state and URL are ignored).
        """
        if other_page.__class__ is not self.__class__:
            return NotImplemented
        return self.page == other_page.page
|
| 39 |
+
|
| 40 |
+
|
| 41 |
+
class PagePool:
    """Bounded collection of browser pages/tabs, each tracked with its state through a `PageInfo`."""

    __slots__ = ("max_pages", "pages", "_lock")

    def __init__(self, max_pages: int = 5):
        """Create an empty pool.

        :param max_pages: Upper bound on the number of pages the pool may hold. Defaults to 5.
        """
        self.max_pages = max_pages
        self.pages: List[PageInfo[SyncPage] | PageInfo[AsyncPage]] = []
        # Re-entrant lock guarding mutations and scans of `pages`.
        self._lock = RLock()

    @overload
    def add_page(self, page: SyncPage) -> PageInfo[SyncPage]: ...

    @overload
    def add_page(self, page: AsyncPage) -> PageInfo[AsyncPage]: ...

    def add_page(self, page: SyncPage | AsyncPage) -> PageInfo[SyncPage] | PageInfo[AsyncPage]:
        """Wrap a page in a `PageInfo` in the "ready" state and register it in the pool.

        :param page: The sync or async page object to add.
        :return: The `PageInfo` created for the page.
        :raises RuntimeError: If the pool already holds `max_pages` pages.
        """
        with self._lock:
            if not len(self.pages) < self.max_pages:
                raise RuntimeError(f"Maximum page limit ({self.max_pages}) reached")

            # The two branches differ only in the static type given to the type checker.
            entry: PageInfo[SyncPage] | PageInfo[AsyncPage]
            if isinstance(page, AsyncPage):
                entry = cast(PageInfo[AsyncPage], PageInfo(page, "ready", ""))
            else:
                entry = cast(PageInfo[SyncPage], PageInfo(page, "ready", ""))

            self.pages.append(entry)
            return entry

    @property
    def pages_count(self) -> int:
        """Total number of pages currently held by the pool."""
        return len(self.pages)

    @property
    def busy_count(self) -> int:
        """Number of pages whose state is "busy"."""
        with self._lock:
            return len([entry for entry in self.pages if entry.state == "busy"])

    def cleanup_error_pages(self):
        """Drop every page whose state is "error" from the pool."""
        with self._lock:
            self.pages = list(filter(lambda entry: entry.state != "error", self.pages))
|
engines/_browsers/_stealth.py
ADDED
|
@@ -0,0 +1,541 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
from random import randint
|
| 2 |
+
from re import compile as re_compile
|
| 3 |
+
from time import sleep as time_sleep
|
| 4 |
+
from asyncio import sleep as asyncio_sleep
|
| 5 |
+
|
| 6 |
+
from playwright.sync_api import Locator, Page, BrowserContext
|
| 7 |
+
from playwright.async_api import (
|
| 8 |
+
Page as async_Page,
|
| 9 |
+
Locator as AsyncLocator,
|
| 10 |
+
BrowserContext as AsyncBrowserContext,
|
| 11 |
+
)
|
| 12 |
+
from patchright.sync_api import sync_playwright
|
| 13 |
+
from patchright.async_api import async_playwright
|
| 14 |
+
|
| 15 |
+
from scrapling.core.utils import log
|
| 16 |
+
from scrapling.core._types import Any, Optional, ProxyType, Unpack
|
| 17 |
+
from scrapling.engines.toolbelt.proxy_rotation import is_proxy_error
|
| 18 |
+
from scrapling.engines.toolbelt.convertor import Response, ResponseFactory
|
| 19 |
+
from scrapling.engines.toolbelt.fingerprints import generate_convincing_referer
|
| 20 |
+
from scrapling.engines._browsers._types import StealthSession, StealthFetchParams
|
| 21 |
+
from scrapling.engines._browsers._base import SyncSession, AsyncSession, StealthySessionMixin
|
| 22 |
+
from scrapling.engines._browsers._validators import validate_fetch as _validate, StealthConfig
|
| 23 |
+
|
| 24 |
+
# Compiled URL pattern for Cloudflare's challenge-platform endpoint; the solvers below pass it to
# `page.frame(url=...)` to find the challenge iframe on the page.
__CF_PATTERN__ = re_compile(r"^https?://challenges\.cloudflare\.com/cdn-cgi/challenge-platform/.*")
|
| 25 |
+
|
| 26 |
+
|
| 27 |
+
class StealthySession(SyncSession, StealthySessionMixin):
    """A Stealthy Browser session manager with page pooling."""

    __slots__ = (
        "_config",
        "_context_options",
        "_browser_options",
        "_user_data_dir",
        "_headers_keys",
        "max_pages",
        "page_pool",
        "_max_wait_for_page",
        "playwright",
        "context",
    )

    def __init__(self, **kwargs: Unpack[StealthSession]):
        """A browser session manager with page pooling. It uses a persistent browser context by default, with a temporary user profile directory.

        :param headless: Run the browser in headless/hidden (default), or headful/visible mode.
        :param disable_resources: Drop requests for unnecessary resources for a speed boost.
            Requests dropped are of type `font`, `image`, `media`, `beacon`, `object`, `imageset`, `texttrack`, `websocket`, `csp_report`, and `stylesheet`.
        :param blocked_domains: A set of domain names to block requests to. Subdomains are also matched (e.g., ``"example.com"`` blocks ``"sub.example.com"`` too).
        :param useragent: Pass a useragent string to be used. Otherwise the fetcher will generate a real Useragent of the same browser and use it.
        :param cookies: Set cookies for the next request.
        :param network_idle: Wait for the page until there are no network connections for at least 500 ms.
        :param timeout: The timeout in milliseconds that is used in all operations and waits through the page. The default is 30,000
        :param wait: The time (milliseconds) the fetcher will wait after everything finishes before closing the page and returning the `Response` object.
        :param page_action: Added for automation. A function that takes the `page` object and does the automation you need.
        :param wait_selector: Wait for a specific CSS selector to be in a specific state.
        :param init_script: An absolute path to a JavaScript file to be executed on page creation for all pages in this session.
        :param locale: Specify user locale, for example, `en-GB`, `de-DE`, etc. Locale will affect navigator.language value, Accept-Language request header value as well as number and date formatting
            rules. Defaults to the system default locale.
        :param timezone_id: Changes the timezone of the browser. Defaults to the system timezone.
        :param wait_selector_state: The state to wait for the selector given with `wait_selector`. The default state is `attached`.
        :param solve_cloudflare: Solves all types of the Cloudflare's Turnstile/Interstitial challenges before returning the response to you.
        :param real_chrome: If you have a Chrome browser installed on your device, enable this, and the Fetcher will launch an instance of your browser and use it.
        :param hide_canvas: Add random noise to canvas operations to prevent fingerprinting.
        :param block_webrtc: Forces WebRTC to respect proxy settings to prevent local IP address leak.
        :param allow_webgl: Enabled by default. Disabling it disables WebGL and WebGL 2.0 support entirely. Disabling WebGL is not recommended as many WAFs now check if WebGL is enabled.
        :param load_dom: Enabled by default, wait for all JavaScript on page(s) to fully load and execute.
        :param cdp_url: Instead of launching a new browser instance, connect to this CDP URL to control real browsers through CDP.
        :param google_search: Enabled by default, Scrapling will set the referer header to be as if this request came from a Google search of this website's domain name.
        :param extra_headers: A dictionary of extra headers to add to the request. _The referer set by the `google_search` argument takes priority over the referer set here if used together._
        :param proxy: The proxy to be used with requests, it can be a string or a dictionary with the keys 'server', 'username', and 'password' only.
        :param user_data_dir: Path to a User Data Directory, which stores browser session data like cookies and local storage. The default is to create a temporary directory.
        :param extra_flags: A list of additional browser flags to pass to the browser on launch.
        :param selector_config: The arguments that will be passed in the end while creating the final Selector's class.
        :param additional_args: Additional arguments to be passed to Playwright's context as additional settings, and it takes higher priority than Scrapling's settings.
        """
        self.__validate__(**kwargs)
        super().__init__()

    def start(self) -> None:
        """Create a browser for this instance and context.

        Three launch modes are used, in this order of precedence:

        * ``cdp_url`` set: connect to the remote browser over CDP; a context is only created here when
          no proxy rotator is configured.
        * proxy rotator set: launch a plain browser and create no context here.
        * otherwise: launch a persistent context using the configured user data directory.

        :raises RuntimeError: If the session has been started already.
        """
        if self.playwright:
            raise RuntimeError("Session has been already started")

        self.playwright = sync_playwright().start()

        try:
            if self._config.cdp_url:  # pragma: no cover
                self.browser = self.playwright.chromium.connect_over_cdp(endpoint_url=self._config.cdp_url)
                if not self._config.proxy_rotator:
                    assert self.browser is not None
                    self.context = self.browser.new_context(**self._context_options)
            elif self._config.proxy_rotator:
                self.browser = self.playwright.chromium.launch(**self._browser_options)
            else:
                persistent_options = (
                    self._browser_options | self._context_options | {"user_data_dir": self._user_data_dir}
                )
                self.context = self.playwright.chromium.launch_persistent_context(**persistent_options)

            if self.context:
                self.context = self._initialize_context(self._config, self.context)

            self._is_alive = True
        except Exception:
            # Clean up playwright if browser setup fails
            self.playwright.stop()
            self.playwright = None
            raise

    def _cloudflare_solver(self, page: Page) -> None:  # pragma: no cover
        """Solve the cloudflare challenge displayed on the playwright page passed

        Non-interactive challenges are only waited out. For the other types, the challenge box is located
        (through the challenge iframe when present, otherwise through a CSS selector), clicked, and the page
        is re-checked. If the interstitial title is still present, the solver calls itself again.

        :param page: The targeted page
        :raises RuntimeError: If the challenge box cannot be measured, so there is nothing to click.
        :return:
        """
        self._wait_for_networkidle(page, timeout=5000)
        challenge_type = self._detect_cloudflare(ResponseFactory._get_page_content(page))
        if not challenge_type:
            log.error("No Cloudflare challenge found.")
            return None

        log.info(f'The turnstile version discovered is "{challenge_type}"')
        if challenge_type == "non-interactive":
            while "<title>Just a moment...</title>" in (ResponseFactory._get_page_content(page)):
                log.info("Waiting for Cloudflare wait page to disappear.")
                page.wait_for_timeout(1000)
                page.wait_for_load_state()
            log.info("Cloudflare captcha is solved")
            return None

        box_selector = "#cf_turnstile div, #cf-turnstile div, .turnstile>div>div"
        if challenge_type != "embedded":
            box_selector = ".main-content p+div>div>div"
            while "Verifying you are human." in ResponseFactory._get_page_content(page):
                # Waiting for the verify spinner to disappear, checking every 500 ms
                page.wait_for_timeout(500)

        outer_box: Any = {}
        iframe = page.frame(url=__CF_PATTERN__)
        if iframe is not None:
            self._wait_for_page_stability(iframe, True, False)

            if challenge_type != "embedded":
                while not iframe.frame_element().is_visible():
                    # Double-checking that the iframe is loaded
                    page.wait_for_timeout(500)

            outer_box = iframe.frame_element().bounding_box()

        if not iframe or not outer_box:
            if "<title>Just a moment...</title>" not in (ResponseFactory._get_page_content(page)):
                log.info("Cloudflare captcha is solved")
                return None

            outer_box = page.locator(box_selector).last.bounding_box()
            if not outer_box:
                # `bounding_box()` returns None for an element that is not visible; fail with a clear
                # message instead of a TypeError from subscripting None below.
                raise RuntimeError("Could not locate the Cloudflare challenge box on the page")

        # Calculate the Captcha coordinates for any viewport
        captcha_x, captcha_y = outer_box["x"] + randint(26, 28), outer_box["y"] + randint(25, 27)

        # Click at a small randomized offset from the box's top-left corner (presumably where the
        # checkbox is rendered), holding the button for a randomized 100-200 ms
        page.mouse.click(captcha_x, captcha_y, delay=randint(100, 200), button="left")
        self._wait_for_networkidle(page)

        if challenge_type != "embedded":
            attempts = 0
            while "<title>Just a moment...</title>" in ResponseFactory._get_page_content(page):
                # Wait for the page
                if attempts >= 100:
                    log.info("Cloudflare page didn't disappear after 10s, continuing...")
                    break
                page.wait_for_timeout(100)
                attempts += 1

        self._wait_for_page_stability(page, True, False)

        if "<title>Just a moment...</title>" not in (ResponseFactory._get_page_content(page)):
            log.info("Cloudflare captcha is solved")
            return None

        # NOTE(review): there is no cap on these re-tries; a challenge that never clears keeps recursing.
        log.info("Looks like Cloudflare captcha is still present, solving again")
        return self._cloudflare_solver(page)

    def fetch(self, url: str, **kwargs: Unpack[StealthFetchParams]) -> Response:
        """Opens up the browser and do your request based on your chosen options.

        :param url: The Target url.
        :param google_search: Enabled by default, Scrapling will set the referer header to be as if this request came from a Google search of this website's domain name.
        :param timeout: The timeout in milliseconds that is used in all operations and waits through the page. The default is 30,000
        :param wait: The time (milliseconds) the fetcher will wait after everything finishes before closing the page and returning the `Response` object.
        :param page_action: Added for automation. A function that takes the `page` object and does the automation you need.
            Exceptions raised by it are logged and do not fail the request.
        :param extra_headers: A dictionary of extra headers to add to the request. _If it contains a `referer` header (any letter case), the `google_search` referer is not generated for this request._
        :param disable_resources: Drop requests for unnecessary resources for a speed boost.
            Requests dropped are of type `font`, `image`, `media`, `beacon`, `object`, `imageset`, `texttrack`, `websocket`, `csp_report`, and `stylesheet`.
        :param blocked_domains: A set of domain names to block requests to. Subdomains are also matched (e.g., ``"example.com"`` blocks ``"sub.example.com"`` too).
        :param wait_selector: Wait for a specific CSS selector to be in a specific state.
        :param wait_selector_state: The state to wait for the selector given with `wait_selector`. The default state is `attached`.
        :param network_idle: Wait for the page until there are no network connections for at least 500 ms.
        :param load_dom: Enabled by default, wait for all JavaScript on page(s) to fully load and execute.
        :param solve_cloudflare: Solves all types of the Cloudflare's Turnstile/Interstitial challenges before returning the response to you.
        :param selector_config: The arguments that will be passed in the end while creating the final Selector's class.
        :param proxy: Static proxy to override rotator and session proxy. A new browser context will be created and used with it.
        :raises RuntimeError: If the session is not alive, or if navigation returns no response on the last attempt.
        :return: A `Response` object.
        """
        static_proxy = kwargs.pop("proxy", None)

        params = _validate(kwargs, self, StealthConfig)
        if not self._is_alive:  # pragma: no cover
            raise RuntimeError("Context manager has been closed")

        request_headers_keys = {h.lower() for h in params.extra_headers.keys()} if params.extra_headers else set()
        # Only fabricate a referer when the caller has not supplied one through the extra headers
        referer = (
            generate_convincing_referer(url)
            if (params.google_search and "referer" not in request_headers_keys)
            else None
        )

        for attempt in range(self._config.retries):
            proxy: Optional[ProxyType] = None
            if self._config.proxy_rotator and static_proxy is None:
                # A fresh proxy is requested from the rotator on every attempt
                proxy = self._config.proxy_rotator.get_proxy()
            else:
                proxy = static_proxy

            with self._page_generator(
                params.timeout, params.extra_headers, params.disable_resources, proxy, params.blocked_domains
            ) as page_info:
                # One-element list so the response handler can hand back the last response it saw
                final_response = [None]
                page = page_info.page
                page.on("response", self._create_response_handler(page_info, final_response))

                try:
                    first_response = page.goto(url, referer=referer)
                    self._wait_for_page_stability(page, params.load_dom, params.network_idle)

                    if not first_response:
                        raise RuntimeError(f"Failed to get response for {url}")

                    if params.solve_cloudflare:
                        self._cloudflare_solver(page)
                        # Make sure the page is fully loaded after the captcha
                        self._wait_for_page_stability(page, params.load_dom, params.network_idle)

                    if params.page_action:
                        try:
                            _ = params.page_action(page)
                        except Exception as e:  # pragma: no cover
                            log.error(f"Error executing page_action: {e}")

                    if params.wait_selector:
                        try:
                            waiter: Locator = page.locator(params.wait_selector)
                            waiter.first.wait_for(state=params.wait_selector_state)
                            self._wait_for_page_stability(page, params.load_dom, params.network_idle)
                        except Exception as e:  # pragma: no cover
                            log.error(f"Error waiting for selector {params.wait_selector}: {e}")

                    page.wait_for_timeout(params.wait)

                    response = ResponseFactory.from_playwright_response(
                        page, first_response, final_response[0], params.selector_config, meta={"proxy": proxy}
                    )
                    return response

                except Exception as e:
                    page_info.mark_error()
                    if attempt < self._config.retries - 1:
                        if is_proxy_error(e):
                            log.warning(
                                f"Proxy '{proxy}' failed (attempt {attempt + 1}) | Retrying in {self._config.retry_delay}s..."
                            )
                        else:
                            log.warning(
                                f"Attempt {attempt + 1} failed: {e}. Retrying in {self._config.retry_delay}s..."
                            )
                        time_sleep(self._config.retry_delay)
                    else:
                        log.error(f"Failed after {self._config.retries} attempts: {e}")
                        raise

        raise RuntimeError("Request failed")  # pragma: no cover
|
| 286 |
+
|
| 287 |
+
|
| 288 |
+
class AsyncStealthySession(AsyncSession, StealthySessionMixin):
    """An async Stealthy Browser session manager with page pooling."""

    __slots__ = (
        "_config",
        "_context_options",
        "_browser_options",
        "_user_data_dir",
        "_headers_keys",
    )

    def __init__(self, **kwargs: Unpack[StealthSession]):
        """A browser session manager with page pooling. It uses a persistent browser context by default, with a temporary user profile directory.

        :param headless: Run the browser in headless/hidden (default), or headful/visible mode.
        :param disable_resources: Drop requests for unnecessary resources for a speed boost.
            Requests dropped are of type `font`, `image`, `media`, `beacon`, `object`, `imageset`, `texttrack`, `websocket`, `csp_report`, and `stylesheet`.
        :param blocked_domains: A set of domain names to block requests to. Subdomains are also matched (e.g., ``"example.com"`` blocks ``"sub.example.com"`` too).
        :param useragent: Pass a useragent string to be used. Otherwise the fetcher will generate a real Useragent of the same browser and use it.
        :param cookies: Set cookies for the next request.
        :param network_idle: Wait for the page until there are no network connections for at least 500 ms.
        :param timeout: The timeout in milliseconds that is used in all operations and waits through the page. The default is 30,000
        :param wait: The time (milliseconds) the fetcher will wait after everything finishes before closing the page and returning the `Response` object.
        :param page_action: Added for automation. A function that takes the `page` object and does the automation you need.
        :param wait_selector: Wait for a specific CSS selector to be in a specific state.
        :param init_script: An absolute path to a JavaScript file to be executed on page creation for all pages in this session.
        :param locale: Specify user locale, for example, `en-GB`, `de-DE`, etc. Locale will affect navigator.language value, Accept-Language request header value as well as number and date formatting
            rules. Defaults to the system default locale.
        :param timezone_id: Changes the timezone of the browser. Defaults to the system timezone.
        :param wait_selector_state: The state to wait for the selector given with `wait_selector`. The default state is `attached`.
        :param solve_cloudflare: Solves all types of the Cloudflare's Turnstile/Interstitial challenges before returning the response to you.
        :param real_chrome: If you have a Chrome browser installed on your device, enable this, and the Fetcher will launch an instance of your browser and use it.
        :param hide_canvas: Add random noise to canvas operations to prevent fingerprinting.
        :param block_webrtc: Forces WebRTC to respect proxy settings to prevent local IP address leak.
        :param allow_webgl: Enabled by default. Disabling it disables WebGL and WebGL 2.0 support entirely. Disabling WebGL is not recommended as many WAFs now check if WebGL is enabled.
        :param load_dom: Enabled by default, wait for all JavaScript on page(s) to fully load and execute.
        :param cdp_url: Instead of launching a new browser instance, connect to this CDP URL to control real browsers through CDP.
        :param google_search: Enabled by default, Scrapling will set the referer header to be as if this request came from a Google search of this website's domain name.
        :param extra_headers: A dictionary of extra headers to add to the request. _The referer set by the `google_search` argument takes priority over the referer set here if used together._
        :param proxy: The proxy to be used with requests, it can be a string or a dictionary with the keys 'server', 'username', and 'password' only.
        :param user_data_dir: Path to a User Data Directory, which stores browser session data like cookies and local storage. The default is to create a temporary directory.
        :param extra_flags: A list of additional browser flags to pass to the browser on launch.
        :param selector_config: The arguments that will be passed in the end while creating the final Selector's class.
        :param additional_args: Additional arguments to be passed to Playwright's context as additional settings, and it takes higher priority than Scrapling's settings.
        """
        self.__validate__(**kwargs)
        super().__init__(max_pages=self._config.max_pages)

    async def start(self) -> None:
        """Create a browser for this instance and context.

        Three launch modes are used, in this order of precedence:

        * ``cdp_url`` set: connect to the remote browser over CDP; a context is only created here when
          no proxy rotator is configured.
        * proxy rotator set: launch a plain browser and create no context here.
        * otherwise: launch a persistent context using the configured user data directory.

        :raises RuntimeError: If the session has been started already.
        """
        if self.playwright:
            raise RuntimeError("Session has been already started")

        self.playwright = await async_playwright().start()
        try:
            if self._config.cdp_url:
                self.browser = await self.playwright.chromium.connect_over_cdp(endpoint_url=self._config.cdp_url)
                if not self._config.proxy_rotator:
                    assert self.browser is not None
                    self.context = await self.browser.new_context(**self._context_options)
            elif self._config.proxy_rotator:
                self.browser = await self.playwright.chromium.launch(**self._browser_options)
            else:
                persistent_options = (
                    self._browser_options | self._context_options | {"user_data_dir": self._user_data_dir}
                )
                self.context = await self.playwright.chromium.launch_persistent_context(**persistent_options)

            if self.context:
                self.context = await self._initialize_context(self._config, self.context)

            self._is_alive = True
        except Exception:
            # Clean up playwright if browser setup fails
            await self.playwright.stop()
            self.playwright = None
            raise

    async def _cloudflare_solver(self, page: async_Page) -> None:  # pragma: no cover
        """Solve the cloudflare challenge displayed on the playwright page passed

        Non-interactive challenges are only waited out. For the other types, the challenge box is located
        (through the challenge iframe when present, otherwise through a CSS selector), clicked, and the page
        is re-checked. If the interstitial title is still present, the solver calls itself again.

        :param page: The targeted page
        :raises RuntimeError: If the challenge box cannot be measured, so there is nothing to click.
        :return:
        """
        await self._wait_for_networkidle(page, timeout=5000)
        challenge_type = self._detect_cloudflare(await ResponseFactory._get_async_page_content(page))
        if not challenge_type:
            log.error("No Cloudflare challenge found.")
            return None

        log.info(f'The turnstile version discovered is "{challenge_type}"')
        if challenge_type == "non-interactive":
            while "<title>Just a moment...</title>" in (await ResponseFactory._get_async_page_content(page)):
                log.info("Waiting for Cloudflare wait page to disappear.")
                await page.wait_for_timeout(1000)
                await page.wait_for_load_state()
            log.info("Cloudflare captcha is solved")
            return None

        box_selector = "#cf_turnstile div, #cf-turnstile div, .turnstile>div>div"
        if challenge_type != "embedded":
            box_selector = ".main-content p+div>div>div"
            while "Verifying you are human." in (await ResponseFactory._get_async_page_content(page)):
                # Waiting for the verify spinner to disappear, checking every 500 ms
                await page.wait_for_timeout(500)

        outer_box: Any = {}
        iframe = page.frame(url=__CF_PATTERN__)
        if iframe is not None:
            await self._wait_for_page_stability(iframe, True, False)

            if challenge_type != "embedded":
                while not await (await iframe.frame_element()).is_visible():
                    # Double-checking that the iframe is loaded
                    await page.wait_for_timeout(500)

            outer_box = await (await iframe.frame_element()).bounding_box()

        if not iframe or not outer_box:
            if "<title>Just a moment...</title>" not in (await ResponseFactory._get_async_page_content(page)):
                log.info("Cloudflare captcha is solved")
                return None

            outer_box = await page.locator(box_selector).last.bounding_box()
            if not outer_box:
                # `bounding_box()` returns None for an element that is not visible; fail with a clear
                # message instead of a TypeError from subscripting None below.
                raise RuntimeError("Could not locate the Cloudflare challenge box on the page")

        # Calculate the Captcha coordinates for any viewport
        captcha_x, captcha_y = outer_box["x"] + randint(26, 28), outer_box["y"] + randint(25, 27)

        # Click at a small randomized offset from the box's top-left corner (presumably where the
        # checkbox is rendered), holding the button for a randomized 100-200 ms
        await page.mouse.click(captcha_x, captcha_y, delay=randint(100, 200), button="left")
        await self._wait_for_networkidle(page)

        if challenge_type != "embedded":
            attempts = 0
            while "<title>Just a moment...</title>" in (await ResponseFactory._get_async_page_content(page)):
                # Wait for the page
                if attempts >= 100:
                    log.info("Cloudflare page didn't disappear after 10s, continuing...")
                    break
                await page.wait_for_timeout(100)
                attempts += 1

        await self._wait_for_page_stability(page, True, False)

        if "<title>Just a moment...</title>" not in (await ResponseFactory._get_async_page_content(page)):
            log.info("Cloudflare captcha is solved")
            return None

        # NOTE(review): there is no cap on these re-tries; a challenge that never clears keeps recursing.
        log.info("Looks like Cloudflare captcha is still present, solving again")
        return await self._cloudflare_solver(page)

    async def fetch(self, url: str, **kwargs: Unpack[StealthFetchParams]) -> Response:
        """Opens up the browser and do your request based on your chosen options.

        :param url: The Target url.
        :param google_search: Enabled by default, Scrapling will set the referer header to be as if this request came from a Google search of this website's domain name.
        :param timeout: The timeout in milliseconds that is used in all operations and waits through the page. The default is 30,000
        :param wait: The time (milliseconds) the fetcher will wait after everything finishes before closing the page and returning the `Response` object.
        :param page_action: Added for automation. A function that takes the `page` object and does the automation you need.
            Its return value is awaited here. Exceptions raised by it are logged and do not fail the request.
        :param extra_headers: A dictionary of extra headers to add to the request. _If it contains a `referer` header (any letter case), the `google_search` referer is not generated for this request._
        :param disable_resources: Drop requests for unnecessary resources for a speed boost.
            Requests dropped are of type `font`, `image`, `media`, `beacon`, `object`, `imageset`, `texttrack`, `websocket`, `csp_report`, and `stylesheet`.
        :param blocked_domains: A set of domain names to block requests to. Subdomains are also matched (e.g., ``"example.com"`` blocks ``"sub.example.com"`` too).
        :param wait_selector: Wait for a specific CSS selector to be in a specific state.
        :param wait_selector_state: The state to wait for the selector given with `wait_selector`. The default state is `attached`.
        :param network_idle: Wait for the page until there are no network connections for at least 500 ms.
        :param load_dom: Enabled by default, wait for all JavaScript on page(s) to fully load and execute.
        :param solve_cloudflare: Solves all types of the Cloudflare's Turnstile/Interstitial challenges before returning the response to you.
        :param selector_config: The arguments that will be passed in the end while creating the final Selector's class.
        :param proxy: Static proxy to override rotator and session proxy. A new browser context will be created and used with it.
        :raises RuntimeError: If the session is not alive, or if navigation returns no response on the last attempt.
        :return: A `Response` object.
        """
        static_proxy = kwargs.pop("proxy", None)

        params = _validate(kwargs, self, StealthConfig)

        if not self._is_alive:  # pragma: no cover
            raise RuntimeError("Context manager has been closed")

        request_headers_keys = {h.lower() for h in params.extra_headers.keys()} if params.extra_headers else set()
        # Only fabricate a referer when the caller has not supplied one through the extra headers
        referer = (
            generate_convincing_referer(url)
            if (params.google_search and "referer" not in request_headers_keys)
            else None
        )

        for attempt in range(self._config.retries):
            proxy: Optional[ProxyType] = None
            if self._config.proxy_rotator and static_proxy is None:
                # A fresh proxy is requested from the rotator on every attempt
                proxy = self._config.proxy_rotator.get_proxy()
            else:
                proxy = static_proxy

            async with self._page_generator(
                params.timeout, params.extra_headers, params.disable_resources, proxy, params.blocked_domains
            ) as page_info:
                # One-element list so the response handler can hand back the last response it saw
                final_response = [None]
                page = page_info.page
                page.on("response", self._create_response_handler(page_info, final_response))

                try:
                    first_response = await page.goto(url, referer=referer)
                    await self._wait_for_page_stability(page, params.load_dom, params.network_idle)

                    if not first_response:
                        raise RuntimeError(f"Failed to get response for {url}")

                    if params.solve_cloudflare:
                        await self._cloudflare_solver(page)
                        # Make sure the page is fully loaded after the captcha
                        await self._wait_for_page_stability(page, params.load_dom, params.network_idle)

                    if params.page_action:
                        try:
                            _ = await params.page_action(page)
                        except Exception as e:  # pragma: no cover
                            log.error(f"Error executing page_action: {e}")

                    if params.wait_selector:
                        try:
                            waiter: AsyncLocator = page.locator(params.wait_selector)
                            await waiter.first.wait_for(state=params.wait_selector_state)
                            await self._wait_for_page_stability(page, params.load_dom, params.network_idle)
                        except Exception as e:  # pragma: no cover
                            log.error(f"Error waiting for selector {params.wait_selector}: {e}")

                    await page.wait_for_timeout(params.wait)

                    response = await ResponseFactory.from_async_playwright_response(
                        page, first_response, final_response[0], params.selector_config, meta={"proxy": proxy}
                    )
                    return response

                except Exception as e:
                    page_info.mark_error()
                    if attempt < self._config.retries - 1:
                        if is_proxy_error(e):
                            log.warning(
                                f"Proxy '{proxy}' failed (attempt {attempt + 1}) | Retrying in {self._config.retry_delay}s..."
                            )
                        else:
                            log.warning(
                                f"Attempt {attempt + 1} failed: {e}. Retrying in {self._config.retry_delay}s..."
                            )
                        await asyncio_sleep(self._config.retry_delay)
                    else:
                        log.error(f"Failed after {self._config.retries} attempts: {e}")
                        raise

        raise RuntimeError("Request failed")  # pragma: no cover
|
engines/_browsers/_types.py
ADDED
|
@@ -0,0 +1,118 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
from io import BytesIO
|
| 2 |
+
|
| 3 |
+
from curl_cffi.requests import (
|
| 4 |
+
ProxySpec,
|
| 5 |
+
CookieTypes,
|
| 6 |
+
BrowserTypeLiteral,
|
| 7 |
+
)
|
| 8 |
+
|
| 9 |
+
from scrapling.core._types import (
|
| 10 |
+
Dict,
|
| 11 |
+
List,
|
| 12 |
+
Set,
|
| 13 |
+
Tuple,
|
| 14 |
+
Mapping,
|
| 15 |
+
Optional,
|
| 16 |
+
Callable,
|
| 17 |
+
Sequence,
|
| 18 |
+
TypedDict,
|
| 19 |
+
TypeAlias,
|
| 20 |
+
SetCookieParam,
|
| 21 |
+
SelectorWaitStates,
|
| 22 |
+
)
|
| 23 |
+
from scrapling.engines.toolbelt.proxy_rotation import ProxyRotator
|
| 24 |
+
|
| 25 |
+
# Type alias for the `impersonate` parameter - accepts a single browser name,
# a list of browser names, or `None`.
ImpersonateType: TypeAlias = BrowserTypeLiteral | List[BrowserTypeLiteral] | None
|
| 27 |
+
|
| 28 |
+
|
| 29 |
+
# Types for session initialization
|
| 30 |
+
class RequestsSession(TypedDict, total=False):
    """Keyword arguments accepted when configuring an HTTP requests session.

    Declared with ``total=False``, so every key may be omitted.
    """

    # A single browser name, a list of browser names, or None (see `ImpersonateType`)
    impersonate: ImpersonateType
    http3: Optional[bool]
    stealthy_headers: Optional[bool]
    # Proxy settings: `proxies` uses curl_cffi's `ProxySpec` type, `proxy` is a plain string
    proxies: Optional[ProxySpec]
    proxy: Optional[str]
    proxy_auth: Optional[Tuple[str, str]]
    proxy_rotator: Optional[ProxyRotator]
    timeout: Optional[int | float]
    # Header values may be `None`
    headers: Optional[Mapping[str, Optional[str]]]
    retries: Optional[int]
    retry_delay: Optional[int]
    follow_redirects: Optional[bool]
    max_redirects: Optional[int]
    verify: Optional[bool]
    # Either a single string or a pair of strings
    cert: Optional[str | Tuple[str, str]]
    selector_config: Optional[Dict]
|
| 47 |
+
|
| 48 |
+
|
| 49 |
+
# Types for GET request method parameters
|
| 50 |
+
class GetRequestParams(RequestsSession, total=False):
    """Keyword arguments for GET requests.

    Extends ``RequestsSession`` with the request-level keys below; every key is optional.
    """

    params: Optional[Dict | List | Tuple]
    # Uses curl_cffi's `CookieTypes`
    cookies: Optional[CookieTypes]
    auth: Optional[Tuple[str, str]]
|
| 54 |
+
|
| 55 |
+
|
| 56 |
+
# Types for POST/PUT/DELETE request method parameters
|
| 57 |
+
class DataRequestParams(GetRequestParams, total=False):
    """Keyword arguments for POST/PUT/DELETE requests.

    Extends ``GetRequestParams`` with the body-carrying keys below; every key is optional.
    """

    data: Optional[Dict[str, str] | List[Tuple] | str | BytesIO | bytes]
    json: Optional[Dict | List]
|
| 60 |
+
|
| 61 |
+
|
| 62 |
+
# Types for browser session
|
| 63 |
+
class PlaywrightSession(TypedDict, total=False):
    """Keyword arguments accepted when configuring a browser session.

    Declared with ``total=False``, so every key may be omitted.
    The keys mirror the fields of the ``PlaywrightConfig`` validation struct.
    """

    max_pages: int
    headless: bool
    disable_resources: bool
    network_idle: bool
    load_dom: bool
    wait_selector: Optional[str]
    wait_selector_state: SelectorWaitStates
    cookies: Sequence[SetCookieParam] | None
    google_search: bool
    wait: int | float
    timezone_id: str | None
    page_action: Optional[Callable]
    # A proxy may be given as a string, a dictionary, or a tuple
    proxy: Optional[str | Dict[str, str] | Tuple]
    proxy_rotator: Optional[ProxyRotator]
    extra_headers: Optional[Dict[str, str]]
    timeout: int | float
    init_script: Optional[str]
    user_data_dir: str
    selector_config: Optional[Dict]
    additional_args: Optional[Dict]
    locale: Optional[str]
    real_chrome: bool
    cdp_url: Optional[str]
    useragent: Optional[str]
    extra_flags: Optional[List[str]]
    blocked_domains: Optional[Set[str]]
    retries: int
    retry_delay: int | float
|
| 92 |
+
|
| 93 |
+
|
| 94 |
+
class PlaywrightFetchParams(TypedDict, total=False):
    """Keyword arguments that can be supplied to an individual browser ``fetch`` call.

    Declared with ``total=False``, so every key may be omitted.
    """

    load_dom: bool
    wait: int | float
    network_idle: bool
    google_search: bool
    timeout: int | float
    disable_resources: bool
    wait_selector: Optional[str]
    page_action: Optional[Callable]
    selector_config: Optional[Dict]
    extra_headers: Optional[Dict[str, str]]
    wait_selector_state: SelectorWaitStates
    blocked_domains: Optional[Set[str]]
    # Unlike the session-level `proxy`, a tuple is not part of this annotation
    proxy: Optional[str | Dict[str, str]]
|
| 108 |
+
|
| 109 |
+
|
| 110 |
+
class StealthSession(PlaywrightSession, total=False):
    """Session keyword arguments for the stealth browser.

    Extends ``PlaywrightSession`` with the stealth-only flags below; every key is optional.
    """

    allow_webgl: bool
    hide_canvas: bool
    block_webrtc: bool
    solve_cloudflare: bool
|
| 115 |
+
|
| 116 |
+
|
| 117 |
+
class StealthFetchParams(PlaywrightFetchParams, total=False):
    """Per-fetch keyword arguments for the stealth browser.

    Extends ``PlaywrightFetchParams`` with ``solve_cloudflare``; every key is optional.
    """

    solve_cloudflare: bool
|
engines/_browsers/_validators.py
ADDED
|
@@ -0,0 +1,229 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
from pathlib import Path
|
| 2 |
+
from typing import Annotated
|
| 3 |
+
from functools import lru_cache
|
| 4 |
+
from urllib.parse import urlparse
|
| 5 |
+
from dataclasses import dataclass, fields
|
| 6 |
+
|
| 7 |
+
from msgspec import Struct, Meta, convert, ValidationError
|
| 8 |
+
|
| 9 |
+
from scrapling.core._types import (
|
| 10 |
+
Any,
|
| 11 |
+
Dict,
|
| 12 |
+
List,
|
| 13 |
+
Set,
|
| 14 |
+
Tuple,
|
| 15 |
+
Optional,
|
| 16 |
+
Callable,
|
| 17 |
+
Sequence,
|
| 18 |
+
overload,
|
| 19 |
+
SetCookieParam,
|
| 20 |
+
SelectorWaitStates,
|
| 21 |
+
)
|
| 22 |
+
from scrapling.engines.toolbelt.proxy_rotation import ProxyRotator
|
| 23 |
+
from scrapling.engines.toolbelt.navigation import construct_proxy_dict
|
| 24 |
+
from scrapling.engines._browsers._types import PlaywrightFetchParams, StealthFetchParams
|
| 25 |
+
|
| 26 |
+
|
| 27 |
+
# Custom validators for msgspec
|
| 28 |
+
@lru_cache(8)
def _is_invalid_file_path(value: str) -> bool | str:  # pragma: no cover
    """Check that ``value`` is an existing file referenced by an absolute path.

    The checks run in order (exists, is a file, is absolute) and stop at the first failure.
    Results are memoized by ``lru_cache``, so a path is only inspected the first time it is seen
    while it stays in the cache.

    :param value: The init script path to check.
    :return: ``False`` when the path passes every check, otherwise the message of the first failed check.
    """
    candidate = Path(value)
    # Each entry pairs a lazily-evaluated check with the message returned when it fails
    checks = (
        (candidate.exists, f"Init script path not found: {value}"),
        (candidate.is_file, f"Init script is not a file: {value}"),
        (candidate.is_absolute, f"Init script is not a absolute path: {value}"),
    )
    for passes, failure_message in checks:
        if not passes():
            return failure_message
    return False
|
| 39 |
+
|
| 40 |
+
|
| 41 |
+
@lru_cache(2)
def _is_invalid_cdp_url(cdp_url: str) -> bool | str:
    """Fast CDP URL validation.

    :param cdp_url: The CDP endpoint URL to check.
    :return: ``False`` when the URL is acceptable, otherwise a message describing the problem.
    """
    if not cdp_url.startswith(("ws://", "wss://")):
        return "CDP URL must use 'ws://' or 'wss://' scheme"

    # `netloc` is still non-empty for URLs such as "ws://:9222" that carry a port but no host,
    # so the parsed hostname is what gets checked.
    try:
        hostname = urlparse(cdp_url).hostname
    except ValueError:
        # `urlparse` raises for malformed authorities (e.g., an unterminated IPv6 bracket)
        hostname = None

    if not hostname:
        return "Invalid hostname for the CDP URL"
    return False
|
| 51 |
+
|
| 52 |
+
|
| 53 |
+
# Type aliases for cleaner annotations
|
| 54 |
+
# Bounded integer counts
PagesCount = Annotated[int, Meta(ge=1, le=50)]
RetriesCount = Annotated[int, Meta(ge=1, le=10)]
# Non-negative number. The union has to be the first argument of `Annotated`: written as
# `Annotated[int, float, ...]`, `float` is just extra metadata and the validated type stays `int`,
# which rejects the float values that the `int | float` hints in `_types.py` allow.
Seconds = Annotated[int | float, Meta(ge=0)]
|
| 57 |
+
|
| 58 |
+
|
| 59 |
+
class PlaywrightConfig(Struct, kw_only=True, frozen=False, weakref=True):
    """msgspec struct that validates and normalizes browser session settings."""

    max_pages: PagesCount = 1
    headless: bool = True
    disable_resources: bool = False
    network_idle: bool = False
    load_dom: bool = True
    wait_selector: Optional[str] = None
    wait_selector_state: SelectorWaitStates = "attached"
    cookies: Sequence[SetCookieParam] | None = []
    google_search: bool = True
    wait: Seconds = 0
    timezone_id: str | None = ""
    page_action: Optional[Callable] = None
    proxy: Optional[str | Dict[str, str] | Tuple] = None  # The default value for proxy in Playwright's source is `None`
    proxy_rotator: Optional[ProxyRotator] = None
    extra_headers: Optional[Dict[str, str]] = None
    timeout: Seconds = 30000
    init_script: Optional[str] = None
    user_data_dir: str = ""
    selector_config: Optional[Dict] = {}
    additional_args: Optional[Dict] = {}
    locale: str | None = None
    real_chrome: bool = False
    cdp_url: Optional[str] = None
    useragent: Optional[str] = None
    extra_flags: Optional[List[str]] = None
    blocked_domains: Optional[Set[str]] = None
    retries: RetriesCount = 3
    retry_delay: Seconds = 1

    def __post_init__(self):  # pragma: no cover
        """Apply the checks and normalization that run once msgspec has validated the field types.

        :raises TypeError: If ``page_action`` is set but is not callable.
        :raises ValueError: If ``proxy`` and ``proxy_rotator`` are both set, or if ``cdp_url``
            or ``init_script`` fails its check.
        """
        if self.page_action and not callable(self.page_action):
            raise TypeError(f"page_action must be callable, got {type(self.page_action).__name__}")

        if self.proxy:
            # A static proxy and a rotator are mutually exclusive
            if self.proxy_rotator:
                raise ValueError(
                    "Cannot use 'proxy_rotator' together with 'proxy'. "
                    "Use either a static proxy or proxy rotation, not both."
                )
            self.proxy = construct_proxy_dict(self.proxy)

        if self.cdp_url and (cdp_msg := _is_invalid_cdp_url(self.cdp_url)):
            raise ValueError(cdp_msg)

        # Replace `None`/empty values with fresh empty containers
        self.cookies = self.cookies or []
        self.extra_flags = self.extra_flags or []
        self.selector_config = self.selector_config or {}
        self.additional_args = self.additional_args or {}

        if self.init_script is not None and (validation_msg := _is_invalid_file_path(self.init_script)):
            raise ValueError(validation_msg)
|
| 120 |
+
|
| 121 |
+
|
| 122 |
+
class StealthConfig(PlaywrightConfig, kw_only=True, frozen=False, weakref=True):
    """``PlaywrightConfig`` extended with the stealth-only switches."""

    allow_webgl: bool = True
    hide_canvas: bool = False
    block_webrtc: bool = False
    solve_cloudflare: bool = False

    def __post_init__(self):
        """Run the parent checks, then enforce the minimum timeout used when solving Cloudflare."""
        super(StealthConfig, self).__post_init__()
        if not self.solve_cloudflare:
            return

        # Cloudflare timeout adjustment: never go below 60_000 while solving is enabled
        if self.timeout < 60_000:
            self.timeout = 60_000
|
| 134 |
+
|
| 135 |
+
|
| 136 |
+
@dataclass
class _fetch_params:
    """A dataclass of all parameters used by `fetch` calls.

    Instances are built by ``validate_fetch``, which fills each field from the call's keyword
    arguments or, when a key is absent there, from the session's ``_config``.
    """

    google_search: bool
    timeout: Seconds
    wait: Seconds
    page_action: Optional[Callable]
    extra_headers: Optional[Dict[str, str]]
    disable_resources: bool
    wait_selector: Optional[str]
    wait_selector_state: SelectorWaitStates
    network_idle: bool
    load_dom: bool
    # `validate_fetch` falls back to `None` when no value is available
    blocked_domains: Optional[Set[str]]
    # `validate_fetch` falls back to `False` when no value is available
    solve_cloudflare: bool
    selector_config: Dict
|
| 153 |
+
|
| 154 |
+
|
| 155 |
+
def validate_fetch(
    method_kwargs: Dict | PlaywrightFetchParams | StealthFetchParams,
    session: Any,
    model: type[PlaywrightConfig] | type[StealthConfig],
) -> _fetch_params:  # pragma: no cover
    """Resolve the effective parameters for one ``fetch`` call.

    For every field of ``_fetch_params``, a value passed in ``method_kwargs`` wins and is validated
    through ``model``; otherwise the value is read from ``session._config`` when it has that attribute.

    :param method_kwargs: The keyword arguments given to the ``fetch`` call.
    :param session: The session object; its ``_config`` attribute (if any) supplies the fallbacks.
    :param model: The config struct used to validate the overridden values.
    :return: A ``_fetch_params`` instance holding the merged values.
    """
    call_kwargs: Dict[str, Any] = dict(method_kwargs)
    overrides: Dict[str, Any] = {}
    result: Dict[str, Any] = {}

    # Split the needed fields into per-call overrides and session-level fallbacks
    for param in fields(_fetch_params):
        name = param.name
        if name in call_kwargs:
            overrides[name] = call_kwargs[name]
        elif hasattr(session, "_config") and hasattr(session._config, name):
            result[name] = getattr(session._config, name)

    if overrides:
        validated_config = validate(overrides, model)
        # Copy back ONLY the overridden fields, so validated defaults never overwrite the
        # values taken from the session config
        for name, raw_value in overrides.items():
            if name == "solve_cloudflare":
                # Kept exactly as the user provided it, even if the model doesn't have the field
                result[name] = raw_value
            elif hasattr(validated_config, name):
                result[name] = getattr(validated_config, name)

    # Fields that may be missing from both sources (e.g., PlaywrightConfig has no `solve_cloudflare`)
    result.setdefault("solve_cloudflare", False)
    result.setdefault("blocked_domains", None)

    return _fetch_params(**result)
|
| 193 |
+
|
| 194 |
+
|
| 195 |
+
# Cache default values for each model to reduce validation overhead
|
| 196 |
+
# Maps a model's class name to a {field name: default value} dictionary.
# Filled once at import time by the loop below and read by `_filter_defaults`.
models_default_values = {}

for _model in (StealthConfig, PlaywrightConfig):
    _defaults = {}
    # Only read the msgspec struct metadata when the class actually exposes it
    if hasattr(_model, "__struct_defaults__") and hasattr(_model, "__struct_fields__"):
        # Field names and defaults are paired positionally
        for field_name, default_value in zip(_model.__struct_fields__, _model.__struct_defaults__):  # type: ignore
            # Skip factory defaults - these are msgspec._core.Factory instances
            if type(default_value).__name__ != "Factory":
                _defaults[field_name] = default_value

    # Store a copy, so later rebinding/mutation of `_defaults` can't affect the cached entry
    models_default_values[_model.__name__] = _defaults.copy()
|
| 207 |
+
|
| 208 |
+
|
| 209 |
+
def _filter_defaults(params: Dict, model: str) -> Dict:
    """Drop the parameters whose value equals the model's cached default.

    :param params: The parameters about to be validated.
    :param model: The class name of the model, used as the key into ``models_default_values``.
    :return: A new dictionary with only the parameters that are unknown to the cache or differ from their default.
    """
    defaults = models_default_values[model]
    kept = {}
    for name, value in params.items():
        unchanged = name in defaults and not (value != defaults[name])
        if unchanged:
            continue
        kept[name] = value
    return kept
|
| 213 |
+
|
| 214 |
+
|
| 215 |
+
@overload
def validate(params: Dict, model: type[StealthConfig]) -> StealthConfig: ...


@overload
def validate(params: Dict, model: type[PlaywrightConfig]) -> PlaywrightConfig: ...


def validate(params: Dict, model: type[PlaywrightConfig] | type[StealthConfig]) -> PlaywrightConfig | StealthConfig:
    """Convert ``params`` into an instance of ``model``.

    :param params: The raw parameters to validate.
    :param model: The config struct class to build.
    :return: The validated config instance.
    :raises TypeError: If msgspec rejects the parameters; the original ``ValidationError`` is chained.
    """
    try:
        # Values equal to the model defaults are dropped first, so there is less to validate
        non_default_params = _filter_defaults(params, model.__name__)
        validated = convert(non_default_params, model)
    except ValidationError as e:
        raise TypeError(f"Invalid argument type: {e}") from e
    return validated
|
engines/constants.py
ADDED
|
@@ -0,0 +1,99 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# Resource types whose loading is disabled for speed.
# NOTE(review): presumably applied when `disable_resources` is enabled - confirm at the call site.
EXTRA_RESOURCES = {
    "font",
    "image",
    "media",
    "beacon",
    "object",
    "imageset",
    "texttrack",
    "websocket",
    "csp_report",
    "stylesheet",
}
|
| 14 |
+
|
| 15 |
+
HARMFUL_ARGS = (
    # These arguments are ignored to reduce the chance of detection and to possibly avoid abuse of
    # the popup crashing bug: https://issues.chromium.org/issues/340836884
    "--enable-automation",
    "--disable-popup-blocking",
    "--disable-component-update",
    "--disable-default-apps",
    "--disable-extensions",
)
|
| 23 |
+
|
| 24 |
+
DEFAULT_ARGS = (
    # Command-line switches used by default to speed up Chromium browsers
    "--no-pings",
    "--no-first-run",
    "--disable-infobars",
    "--disable-breakpad",
    "--no-service-autorun",
    "--homepage=about:blank",
    "--password-store=basic",
    "--disable-hang-monitor",
    "--no-default-browser-check",
    "--disable-session-crashed-bubble",
    "--disable-search-engine-choice-screen",
)
|
| 38 |
+
|
| 39 |
+
STEALTH_ARGS = (
    # Reference for what each switch does: https://peter.sh/experiments/chromium-command-line-switches/
    # Generally, these make the browser faster and less detectable
    # "--incognito",
    "--test-type",
    "--lang=en-US",
    "--mute-audio",
    "--disable-sync",
    "--hide-scrollbars",
    "--disable-logging",
    "--start-maximized",  # For headless check bypass
    "--enable-async-dns",
    "--accept-lang=en-US",
    "--use-mock-keychain",
    "--disable-translate",
    "--disable-voice-input",
    "--window-position=0,0",
    "--disable-wake-on-wifi",
    "--ignore-gpu-blocklist",
    "--enable-tcp-fast-open",
    "--enable-web-bluetooth",
    "--disable-cloud-import",
    "--disable-print-preview",
    "--disable-dev-shm-usage",
    # '--disable-popup-blocking',
    "--metrics-recording-only",
    "--disable-crash-reporter",
    "--disable-partial-raster",
    "--disable-gesture-typing",
    "--disable-checker-imaging",
    "--disable-prompt-on-repost",
    "--force-color-profile=srgb",
    "--font-render-hinting=none",
    "--aggressive-cache-discard",
    "--disable-cookie-encryption",
    "--disable-domain-reliability",
    "--disable-threaded-animation",
    "--disable-threaded-scrolling",
    "--enable-simple-cache-backend",
    "--disable-background-networking",
    "--enable-surface-synchronization",
    "--disable-image-animation-resync",
    "--disable-renderer-backgrounding",
    "--disable-ipc-flooding-protection",
    "--prerender-from-omnibox=disabled",
    "--safebrowsing-disable-auto-update",
    "--disable-offer-upload-credit-cards",
    "--disable-background-timer-throttling",
    "--disable-new-content-rendering-timeout",
    "--run-all-compositor-stages-before-draw",
    "--disable-client-side-phishing-detection",
    "--disable-backgrounding-occluded-windows",
    "--disable-layer-tree-host-memory-pressure",
    "--autoplay-policy=user-gesture-required",
    "--disable-offer-store-unmasked-wallet-cards",
    "--disable-blink-features=AutomationControlled",
    "--disable-component-extensions-with-background-pages",
    "--enable-features=NetworkService,NetworkServiceInProcess,TrustTokens,TrustTokensAlwaysAllowIssuance",
    "--blink-settings=primaryHoverType=2,availableHoverTypes=2,primaryPointerType=4,availablePointerTypes=4",
    "--disable-features=AudioServiceOutOfProcess,TranslateUI,BlinkGenPropertyTrees",
)
|
engines/static.py
ADDED
|
@@ -0,0 +1,770 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
from abc import ABC
|
| 2 |
+
from random import choice
|
| 3 |
+
from time import sleep as time_sleep
|
| 4 |
+
from asyncio import sleep as asyncio_sleep
|
| 5 |
+
|
| 6 |
+
from curl_cffi.curl import CurlError
|
| 7 |
+
from curl_cffi import CurlHttpVersion
|
| 8 |
+
from curl_cffi.requests import (
|
| 9 |
+
BrowserTypeLiteral,
|
| 10 |
+
Session as CurlSession,
|
| 11 |
+
AsyncSession as AsyncCurlSession,
|
| 12 |
+
)
|
| 13 |
+
|
| 14 |
+
from scrapling.core.utils import log
|
| 15 |
+
from scrapling.core._types import (
|
| 16 |
+
Any,
|
| 17 |
+
Dict,
|
| 18 |
+
Tuple,
|
| 19 |
+
Unpack,
|
| 20 |
+
Optional,
|
| 21 |
+
Awaitable,
|
| 22 |
+
SUPPORTED_HTTP_METHODS,
|
| 23 |
+
)
|
| 24 |
+
|
| 25 |
+
from .toolbelt.custom import Response
|
| 26 |
+
from .toolbelt.convertor import ResponseFactory
|
| 27 |
+
from .toolbelt.proxy_rotation import ProxyRotator, is_proxy_error
|
| 28 |
+
from ._browsers._types import RequestsSession, GetRequestParams, DataRequestParams, ImpersonateType
|
| 29 |
+
from .toolbelt.fingerprints import generate_convincing_referer, generate_headers, __default_useragent__
|
| 30 |
+
|
| 31 |
+
# Unique sentinel object, distinguishable from `None` by identity.
# NOTE(review): presumably marks "no session available" - its use is outside this part of the file, confirm there.
_NO_SESSION: Any = object()
|
| 32 |
+
|
| 33 |
+
|
| 34 |
+
def _select_random_browser(impersonate: ImpersonateType) -> Optional[BrowserTypeLiteral]:
    """
    Resolve the `impersonate` parameter to a single value.

    A list yields one randomly chosen element (or None when the list is empty).
    Anything else (a string or None) is returned unchanged.
    """
    if not isinstance(impersonate, list):
        return impersonate
    return choice(impersonate) if impersonate else None
|
| 46 |
+
|
| 47 |
+
|
| 48 |
+
class _ConfigurationLogic(ABC):
    """Core Logic Handler (Internal Engine).

    Holds the session-wide defaults and merges them with the arguments of each request.
    """

    __slots__ = (
        "_default_impersonate",
        "_stealth",
        "_default_proxies",
        "_default_proxy",
        "_default_proxy_auth",
        "_default_timeout",
        "_default_headers",
        "_default_retries",
        "_default_retry_delay",
        "_default_follow_redirects",
        "_default_max_redirects",
        "_default_verify",
        "_default_cert",
        "_default_http3",
        "selector_config",
        "_is_alive",
        "_proxy_rotator",
    )

    def __init__(self, **kwargs: Unpack[RequestsSession]):
        """Store the session defaults.

        :raises ValueError: If ``proxy_rotator`` is combined with ``proxy`` or ``proxies``.
        """
        option = kwargs.get

        # Options with a fallback that applies only when the key is missing
        self._default_impersonate = option("impersonate", "chrome")
        self._stealth = option("stealthy_headers", True)
        self._default_timeout = option("timeout", 30)
        self._default_retries = option("retries", 3)
        self._default_retry_delay = option("retry_delay", 1)
        self._default_follow_redirects = option("follow_redirects", True)
        self._default_max_redirects = option("max_redirects", 30)
        self._default_verify = option("verify", True)
        self._default_http3 = option("http3", False)

        # Options where any falsy value collapses into the empty fallback
        self._default_proxies = option("proxies") or {}
        self._default_proxy = option("proxy") or None
        self._default_proxy_auth = option("proxy_auth") or None
        self._default_headers = option("headers") or {}
        self._default_cert = option("cert") or None
        self.selector_config = option("selector_config") or {}

        self._is_alive = False
        self._proxy_rotator: Optional[ProxyRotator] = option("proxy_rotator")

        if self._proxy_rotator and (self._default_proxy or self._default_proxies):
            raise ValueError(
                "Cannot use 'proxy_rotator' together with 'proxy' or 'proxies'. "
                "Use either a static proxy or proxy rotation, not both."
            )

    @staticmethod
    def _get_param(kwargs: Dict, key: str, default: Any) -> Any:
        """Return ``kwargs[key]`` when the key is present (even if its value is ``None``), else ``default``."""
        return kwargs.get(key, default)

    def _merge_request_args(self, **method_kwargs) -> Dict[str, Any]:
        """Combine the arguments of a single request with the session defaults.

        :param method_kwargs: The request arguments; must contain ``url``.
        :return: The keyword arguments dictionary built for the request.
        """
        url = method_kwargs.pop("url")

        def pick(key, default):
            # Request-level value when given, session default otherwise
            return self._get_param(method_kwargs, key, default)

        impersonate = _select_random_browser(pick("impersonate", self._default_impersonate))
        http3_enabled = pick("http3", self._default_http3)
        stealth = pick("stealth", self._stealth)

        final_args = {
            "url": url,
            # Curl automatically generates the suitable browser headers when you use `impersonate`
            "headers": self._headers_job(url, pick("headers", self._default_headers), stealth, bool(impersonate)),
            "proxies": pick("proxies", self._default_proxies),
            "proxy": pick("proxy", self._default_proxy),
            "proxy_auth": pick("proxy_auth", self._default_proxy_auth),
            "timeout": pick("timeout", self._default_timeout),
            "allow_redirects": pick("follow_redirects", self._default_follow_redirects),
            "max_redirects": pick("max_redirects", self._default_max_redirects),
            "verify": pick("verify", self._default_verify),
            "cert": pick("cert", self._default_cert),
            "impersonate": impersonate,
        }

        # Keys that were processed above, plus the internal ones that must not be forwarded
        already_handled = {
            "impersonate",
            "http3",
            "stealth",
            "headers",
            "proxies",
            "proxy",
            "proxy_auth",
            "timeout",
            "follow_redirects",
            "max_redirects",
            "verify",
            "cert",
            "retries",
            "retry_delay",
            "selector_config",
            # Browser session params (ignored by HTTP sessions)
            "extra_headers",
            "google_search",
        }
        # Forward every remaining argument that has a value
        final_args.update(
            {name: value for name, value in method_kwargs.items() if name not in already_handled and value is not None}
        )

        if http3_enabled:  # pragma: no cover
            final_args["http_version"] = CurlHttpVersion.V3ONLY
            if impersonate:
                log.warning(
                    "The argument `http3` might cause errors if used with `impersonate` argument, try switching it off if you encounter any curl errors."
                )

        return final_args

    def _headers_job(self, url, headers: Dict, stealth: bool, impersonate_enabled: bool) -> Dict:
        """
        Build the headers of a request.

        1. Session headers are merged with the request headers; the request ones take precedence.
        2. In stealth mode, a referer is generated when none was given, and, unless impersonation is enabled,
           generated headers are added without overwriting the existing ones.
        3. Outside stealth mode, the default useragent is added when there is none and impersonation is disabled.
        """
        final_headers = {**self._default_headers, **(headers or {})}
        # Case-insensitive snapshot of the names present before anything is generated
        headers_keys = {name.lower() for name in final_headers}

        if not stealth:
            if "user-agent" not in headers_keys and not impersonate_enabled:  # pragma: no cover
                final_headers["User-Agent"] = __default_useragent__
                log.debug(f"Can't find useragent in headers so '{final_headers['User-Agent']}' was used.")
            return final_headers

        if "referer" not in headers_keys:
            final_headers["referer"] = generate_convincing_referer(url)

        if not impersonate_enabled:  # Curl will generate the suitable headers
            for name, value in generate_headers(browser_mode=False).items():
                # Don't overwrite user-supplied headers
                if name.lower() not in headers_keys:
                    final_headers[name] = value

        return final_headers
|
| 189 |
+
|
| 190 |
+
|
| 191 |
+
class _SyncSessionLogic(_ConfigurationLogic):
    """Synchronous HTTP session logic built on top of a `curl_cffi` session."""

    __slots__ = ("_curl_session",)

    def __init__(self, **kwargs: Unpack[RequestsSession]):
        super().__init__(**kwargs)
        # Created in `__enter__` and released in `__exit__`
        self._curl_session: Optional[CurlSession] = None

    def __enter__(self):
        """Creates and returns a new synchronous Fetcher Session"""
        if self._is_alive:
            raise RuntimeError("This FetcherSession instance already has an active synchronous session.")

        self._curl_session = CurlSession()
        self._is_alive = True
        return self

    def __exit__(self, exc_type, exc_val, exc_tb):
        """Closes the active synchronous session managed by this instance, if any."""
        # For type checking (not accessed error)
        _ = (
            exc_type,
            exc_val,
            exc_tb,
        )
        if self._curl_session:
            self._curl_session.close()
            self._curl_session = None

        self._is_alive = False

    def _make_request(self, method: SUPPORTED_HTTP_METHODS, stealth: Optional[bool] = None, **kwargs) -> Response:
        """
        Perform an HTTP request using the configured session.

        :param method: The HTTP method to use.
        :param stealth: Whether to use stealthy headers for this request. If None, the session's setting is used.
        :param kwargs: Request arguments. `selector_config`, `retries`, `retry_delay`, and `proxy` are read here,
            the rest is merged with the session defaults before being passed to the `curl_cffi` session.
        :return: A `Response` object.
        :raises RuntimeError: If there is no active session to send the request with.
        :raises CurlError: If the request still fails on the last attempt.
        """
        stealth = self._stealth if stealth is None else stealth

        selector_config = self._get_param(kwargs, "selector_config", self.selector_config) or self.selector_config
        max_retries = self._get_param(kwargs, "retries", self._default_retries)
        retry_delay = self._get_param(kwargs, "retry_delay", self._default_retry_delay)
        static_proxy = kwargs.pop("proxy", None)

        # `retries` counts the total attempts, so a value below 1 would skip the loop entirely and end up
        # raising the unrelated "No active session" error without sending anything. Always try at least once.
        max_retries = max(1, max_retries)

        session = self._curl_session
        one_off_request = False
        if session is _NO_SESSION and self.__enter__ is None:
            # For usage inside FetcherClient
            # It turns out `curl_cffi` caches impersonation state, so if you turned it off, then on then off, it won't be off on the last time.
            session = CurlSession()
            one_off_request = True

        if not session:
            raise RuntimeError("No active session available.")  # pragma: no cover

        try:
            for attempt in range(max_retries):
                # A proxy passed with the request always wins over the rotator
                if self._proxy_rotator and static_proxy is None:
                    proxy = self._proxy_rotator.get_proxy()
                else:
                    proxy = static_proxy

                request_args = self._merge_request_args(stealth=stealth, proxy=proxy, **kwargs)
                try:
                    response = session.request(method, **request_args)
                    result = ResponseFactory.from_http_request(response, selector_config, meta={"proxy": proxy})
                    return result
                except CurlError as e:  # pragma: no cover
                    if attempt < max_retries - 1:
                        # Now if the rotator is enabled, we will try again with the new proxy
                        # If it's not enabled, then we will try again with the same proxy
                        if is_proxy_error(e):
                            log.warning(
                                f"Proxy '{proxy}' failed (attempt {attempt + 1}) | Retrying in {retry_delay} seconds..."
                            )
                        else:
                            log.warning(f"Attempt {attempt + 1} failed: {e}. Retrying in {retry_delay} seconds...")
                        time_sleep(retry_delay)
                    else:
                        log.error(f"Failed after {max_retries} attempts: {e}")
                        raise  # Raise the exception if all retries fail
        finally:
            # Sessions created just for this request are never reused, so release them right away
            if session and one_off_request:
                session.close()

        raise RuntimeError("No active session available.")  # pragma: no cover

    def get(self, url: str, **kwargs: Unpack[GetRequestParams]) -> Response:
        """
        Perform a GET request.

        Any additional keyword arguments are passed to the `curl_cffi.requests.Session().request()` method.

        :param url: Target URL for the request.
        :param kwargs: Additional keyword arguments including:
            - params: Query string parameters for the request.
            - headers: Headers to include in the request.
            - cookies: Cookies to use in the request.
            - timeout: Number of seconds to wait before timing out.
            - follow_redirects: Whether to follow redirects. Defaults to True.
            - max_redirects: Maximum number of redirects. Default 30, use -1 for unlimited.
            - retries: Number of retry attempts. Defaults to 3.
            - retry_delay: Number of seconds to wait between retry attempts. Defaults to 1 second.
            - proxies: Dict of proxies to use.
            - proxy: Proxy URL to use. Format: "http://username:password@localhost:8030".
            - proxy_auth: HTTP basic auth for proxy, tuple of (username, password).
            - auth: HTTP basic auth tuple of (username, password). Only basic auth is supported.
            - verify: Whether to verify HTTPS certificates.
            - cert: Tuple of (cert, key) filenames for the client certificate.
            - impersonate: Browser version to impersonate. Automatically defaults to the latest available Chrome version.
            - http3: Whether to use HTTP3. Defaults to False. It might be problematic if used with `impersonate`.
            - stealthy_headers: If enabled (default), it creates and adds real browser headers.
        :return: A `Response` object.
        """
        stealthy_headers = kwargs.pop("stealthy_headers", None)
        return self._make_request("GET", stealth=stealthy_headers, url=url, **kwargs)

    def post(self, url: str, **kwargs: Unpack[DataRequestParams]) -> Response:
        """
        Perform a POST request.

        Any additional keyword arguments are passed to the `curl_cffi.requests.Session().request()` method.

        :param url: Target URL for the request.
        :param kwargs: Additional keyword arguments including:
            - data: Form data to include in the request body.
            - json: A JSON serializable object to include in the body of the request.
            - params: Query string parameters for the request.
            - headers: Headers to include in the request.
            - cookies: Cookies to use in the request.
            - timeout: Number of seconds to wait before timing out.
            - follow_redirects: Whether to follow redirects. Defaults to True.
            - max_redirects: Maximum number of redirects. Default 30, use -1 for unlimited.
            - retries: Number of retry attempts. Defaults to 3.
            - retry_delay: Number of seconds to wait between retry attempts. Defaults to 1 second.
            - proxies: Dict of proxies to use.
            - proxy: Proxy URL to use. Format: "http://username:password@localhost:8030".
            - proxy_auth: HTTP basic auth for proxy, tuple of (username, password).
            - auth: HTTP basic auth tuple of (username, password). Only basic auth is supported.
            - verify: Whether to verify HTTPS certificates.
            - cert: Tuple of (cert, key) filenames for the client certificate.
            - impersonate: Browser version to impersonate. Automatically defaults to the latest available Chrome version.
            - http3: Whether to use HTTP3. Defaults to False. It might be problematic if used with `impersonate`.
            - stealthy_headers: If enabled (default), it creates and adds real browser headers.
        :return: A `Response` object.
        """
        stealthy_headers = kwargs.pop("stealthy_headers", None)
        return self._make_request("POST", stealth=stealthy_headers, url=url, **kwargs)

    def put(self, url: str, **kwargs: Unpack[DataRequestParams]) -> Response:
        """
        Perform a PUT request.

        Any additional keyword arguments are passed to the `curl_cffi.requests.Session().request()` method.

        :param url: Target URL for the request.
        :param kwargs: Additional keyword arguments including:
            - data: Form data to include in the request body.
            - json: A JSON serializable object to include in the body of the request.
            - params: Query string parameters for the request.
            - headers: Headers to include in the request.
            - cookies: Cookies to use in the request.
            - timeout: Number of seconds to wait before timing out.
            - follow_redirects: Whether to follow redirects. Defaults to True.
            - max_redirects: Maximum number of redirects. Default 30, use -1 for unlimited.
            - retries: Number of retry attempts. Defaults to 3.
            - retry_delay: Number of seconds to wait between retry attempts. Defaults to 1 second.
            - proxies: Dict of proxies to use.
            - proxy: Proxy URL to use. Format: "http://username:password@localhost:8030".
            - proxy_auth: HTTP basic auth for proxy, tuple of (username, password).
            - auth: HTTP basic auth tuple of (username, password). Only basic auth is supported.
            - verify: Whether to verify HTTPS certificates.
            - cert: Tuple of (cert, key) filenames for the client certificate.
            - impersonate: Browser version to impersonate. Automatically defaults to the latest available Chrome version.
            - http3: Whether to use HTTP3. Defaults to False. It might be problematic if used with `impersonate`.
            - stealthy_headers: If enabled (default), it creates and adds real browser headers.
        :return: A `Response` object.
        """
        stealthy_headers = kwargs.pop("stealthy_headers", None)
        return self._make_request("PUT", stealth=stealthy_headers, url=url, **kwargs)

    def delete(self, url: str, **kwargs: Unpack[DataRequestParams]) -> Response:
        """
        Perform a DELETE request.

        Any additional keyword arguments are passed to the `curl_cffi.requests.Session().request()` method.

        :param url: Target URL for the request.
        :param kwargs: Additional keyword arguments including:
            - data: Form data to include in the request body.
            - json: A JSON serializable object to include in the body of the request.
            - params: Query string parameters for the request.
            - headers: Headers to include in the request.
            - cookies: Cookies to use in the request.
            - timeout: Number of seconds to wait before timing out.
            - follow_redirects: Whether to follow redirects. Defaults to True.
            - max_redirects: Maximum number of redirects. Default 30, use -1 for unlimited.
            - retries: Number of retry attempts. Defaults to 3.
            - retry_delay: Number of seconds to wait between retry attempts. Defaults to 1 second.
            - proxies: Dict of proxies to use.
            - proxy: Proxy URL to use. Format: "http://username:password@localhost:8030".
            - proxy_auth: HTTP basic auth for proxy, tuple of (username, password).
            - auth: HTTP basic auth tuple of (username, password). Only basic auth is supported.
            - verify: Whether to verify HTTPS certificates.
            - cert: Tuple of (cert, key) filenames for the client certificate.
            - impersonate: Browser version to impersonate. Automatically defaults to the latest available Chrome version.
            - http3: Whether to use HTTP3. Defaults to False. It might be problematic if used with `impersonate`.
            - stealthy_headers: If enabled (default), it creates and adds real browser headers.
        :return: A `Response` object.
        """
        # Careful of sending a body in a DELETE request, it might cause some websites to reject the request as per https://www.rfc-editor.org/rfc/rfc7231#section-4.3.5,
        # But some websites accept it, it depends on the implementation used.
        stealthy_headers = kwargs.pop("stealthy_headers", None)
        return self._make_request("DELETE", stealth=stealthy_headers, url=url, **kwargs)
|
| 402 |
+
|
| 403 |
+
|
| 404 |
+
class _ASyncSessionLogic(_ConfigurationLogic):
    """Asynchronous HTTP session logic built on top of a `curl_cffi` async session."""

    __slots__ = ("_async_curl_session",)

    def __init__(self, **kwargs: Unpack[RequestsSession]):
        super().__init__(**kwargs)
        # Created in `__aenter__` and released in `__aexit__`
        self._async_curl_session: Optional[AsyncCurlSession] = None

    async def __aenter__(self):  # pragma: no cover
        """Creates and returns a new asynchronous Session."""
        if self._is_alive:
            raise RuntimeError("This FetcherSession instance already has an active asynchronous session.")

        self._async_curl_session = AsyncCurlSession()
        self._is_alive = True
        return self

    async def __aexit__(self, exc_type, exc_val, exc_tb):
        """Closes the active asynchronous session managed by this instance, if any."""
        # For type checking (not accessed error)
        _ = (
            exc_type,
            exc_val,
            exc_tb,
        )
        if self._async_curl_session:
            await self._async_curl_session.close()
            self._async_curl_session = None

        self._is_alive = False

    async def _make_request(self, method: SUPPORTED_HTTP_METHODS, stealth: Optional[bool] = None, **kwargs) -> Response:
        """
        Perform an HTTP request using the configured session.

        :param method: The HTTP method to use.
        :param stealth: Whether to use stealthy headers for this request. If None, the session's setting is used.
        :param kwargs: Request arguments. `selector_config`, `retries`, `retry_delay`, and `proxy` are read here,
            the rest is merged with the session defaults before being passed to the `curl_cffi` session.
        :return: A `Response` object.
        :raises RuntimeError: If there is no active session to send the request with.
        :raises CurlError: If the request still fails on the last attempt.
        """
        stealth = self._stealth if stealth is None else stealth

        selector_config = self._get_param(kwargs, "selector_config", self.selector_config) or self.selector_config
        max_retries = self._get_param(kwargs, "retries", self._default_retries)
        retry_delay = self._get_param(kwargs, "retry_delay", self._default_retry_delay)
        static_proxy = kwargs.pop("proxy", None)

        # `retries` counts the total attempts, so a value below 1 would skip the loop entirely and end up
        # raising the unrelated "No active session" error without sending anything. Always try at least once.
        max_retries = max(1, max_retries)

        session = self._async_curl_session
        one_off_request = False
        if session is _NO_SESSION and self.__aenter__ is None:
            # For usage inside the ` AsyncFetcherClient ` class, and that's for several reasons
            # 1. It turns out `curl_cffi` caches impersonation state, so if you turned it off, then on then off, it won't be off on the last time.
            # 2. `curl_cffi` doesn't support making async requests without sessions
            # 3. Using a single session for many requests at the same time in async doesn't sit well with curl_cffi.
            session = AsyncCurlSession()
            one_off_request = True

        if not session:
            raise RuntimeError("No active session available.")  # pragma: no cover

        try:
            for attempt in range(max_retries):
                # Determine if we should use proxy rotation: a proxy passed with the request always wins
                if self._proxy_rotator and static_proxy is None:
                    proxy = self._proxy_rotator.get_proxy()
                else:
                    proxy = static_proxy

                request_args = self._merge_request_args(stealth=stealth, proxy=proxy, **kwargs)
                try:
                    response = await session.request(method, **request_args)
                    result = ResponseFactory.from_http_request(response, selector_config, meta={"proxy": proxy})
                    return result
                except CurlError as e:  # pragma: no cover
                    if attempt < max_retries - 1:
                        # Now if the rotator is enabled, we will try again with the new proxy
                        # If it's not enabled, then we will try again with the same proxy
                        if is_proxy_error(e):
                            log.warning(
                                f"Proxy '{proxy}' failed (attempt {attempt + 1}) | Retrying in {retry_delay} seconds..."
                            )
                        else:
                            log.warning(f"Attempt {attempt + 1} failed: {e}. Retrying in {retry_delay} seconds...")

                        await asyncio_sleep(retry_delay)
                    else:
                        log.error(f"Failed after {max_retries} attempts: {e}")
                        raise  # Raise the exception if all retries fail
        finally:
            # Sessions created just for this request are never reused, so release them right away
            if session and one_off_request:
                await session.close()

        raise RuntimeError("No active session available.")  # pragma: no cover

    def get(self, url: str, **kwargs: Unpack[GetRequestParams]) -> Awaitable[Response]:
        """
        Perform a GET request.

        Any additional keyword arguments are passed to the `curl_cffi.requests.AsyncSession().request()` method.

        :param url: Target URL for the request.
        :param kwargs: Additional keyword arguments including:
            - params: Query string parameters for the request.
            - headers: Headers to include in the request.
            - cookies: Cookies to use in the request.
            - timeout: Number of seconds to wait before timing out.
            - follow_redirects: Whether to follow redirects. Defaults to True.
            - max_redirects: Maximum number of redirects. Default 30, use -1 for unlimited.
            - retries: Number of retry attempts. Defaults to 3.
            - retry_delay: Number of seconds to wait between retry attempts. Defaults to 1 second.
            - proxies: Dict of proxies to use.
            - proxy: Proxy URL to use. Format: "http://username:password@localhost:8030".
            - proxy_auth: HTTP basic auth for proxy, tuple of (username, password).
            - auth: HTTP basic auth tuple of (username, password). Only basic auth is supported.
            - verify: Whether to verify HTTPS certificates.
            - cert: Tuple of (cert, key) filenames for the client certificate.
            - impersonate: Browser version to impersonate. Automatically defaults to the latest available Chrome version.
            - http3: Whether to use HTTP3. Defaults to False. It might be problematic if used with `impersonate`.
            - stealthy_headers: If enabled (default), it creates and adds real browser headers.
        :return: A `Response` object.
        """
        stealthy_headers = kwargs.pop("stealthy_headers", None)
        return self._make_request("GET", stealth=stealthy_headers, url=url, **kwargs)

    def post(self, url: str, **kwargs: Unpack[DataRequestParams]) -> Awaitable[Response]:
        """
        Perform a POST request.

        Any additional keyword arguments are passed to the `curl_cffi.requests.AsyncSession().request()` method.

        :param url: Target URL for the request.
        :param kwargs: Additional keyword arguments including:
            - data: Form data to include in the request body.
            - json: A JSON serializable object to include in the body of the request.
            - params: Query string parameters for the request.
            - headers: Headers to include in the request.
            - cookies: Cookies to use in the request.
            - timeout: Number of seconds to wait before timing out.
            - follow_redirects: Whether to follow redirects. Defaults to True.
            - max_redirects: Maximum number of redirects. Default 30, use -1 for unlimited.
            - retries: Number of retry attempts. Defaults to 3.
            - retry_delay: Number of seconds to wait between retry attempts. Defaults to 1 second.
            - proxies: Dict of proxies to use.
            - proxy: Proxy URL to use. Format: "http://username:password@localhost:8030".
            - proxy_auth: HTTP basic auth for proxy, tuple of (username, password).
            - auth: HTTP basic auth tuple of (username, password). Only basic auth is supported.
            - verify: Whether to verify HTTPS certificates.
            - cert: Tuple of (cert, key) filenames for the client certificate.
            - impersonate: Browser version to impersonate. Automatically defaults to the latest available Chrome version.
            - http3: Whether to use HTTP3. Defaults to False. It might be problematic if used with `impersonate`.
            - stealthy_headers: If enabled (default), it creates and adds real browser headers.
        :return: A `Response` object.
        """
        stealthy_headers = kwargs.pop("stealthy_headers", None)
        return self._make_request("POST", stealth=stealthy_headers, url=url, **kwargs)

    def put(self, url: str, **kwargs: Unpack[DataRequestParams]) -> Awaitable[Response]:
        """
        Perform a PUT request.

        Any additional keyword arguments are passed to the `curl_cffi.requests.AsyncSession().request()` method.

        :param url: Target URL for the request.
        :param kwargs: Additional keyword arguments including:
            - data: Form data to include in the request body.
            - json: A JSON serializable object to include in the body of the request.
            - params: Query string parameters for the request.
            - headers: Headers to include in the request.
            - cookies: Cookies to use in the request.
            - timeout: Number of seconds to wait before timing out.
            - follow_redirects: Whether to follow redirects. Defaults to True.
            - max_redirects: Maximum number of redirects. Default 30, use -1 for unlimited.
            - retries: Number of retry attempts. Defaults to 3.
            - retry_delay: Number of seconds to wait between retry attempts. Defaults to 1 second.
            - proxies: Dict of proxies to use.
            - proxy: Proxy URL to use. Format: "http://username:password@localhost:8030".
            - proxy_auth: HTTP basic auth for proxy, tuple of (username, password).
            - auth: HTTP basic auth tuple of (username, password). Only basic auth is supported.
            - verify: Whether to verify HTTPS certificates.
            - cert: Tuple of (cert, key) filenames for the client certificate.
            - impersonate: Browser version to impersonate. Automatically defaults to the latest available Chrome version.
            - http3: Whether to use HTTP3. Defaults to False. It might be problematic if used with `impersonate`.
            - stealthy_headers: If enabled (default), it creates and adds real browser headers.
        :return: A `Response` object.
        """
        stealthy_headers = kwargs.pop("stealthy_headers", None)
        return self._make_request("PUT", stealth=stealthy_headers, url=url, **kwargs)

    def delete(self, url: str, **kwargs: Unpack[DataRequestParams]) -> Awaitable[Response]:
        """
        Perform a DELETE request.

        Any additional keyword arguments are passed to the `curl_cffi.requests.AsyncSession().request()` method.

        :param url: Target URL for the request.
        :param kwargs: Additional keyword arguments including:
            - data: Form data to include in the request body.
            - json: A JSON serializable object to include in the body of the request.
            - params: Query string parameters for the request.
            - headers: Headers to include in the request.
            - cookies: Cookies to use in the request.
            - timeout: Number of seconds to wait before timing out.
            - follow_redirects: Whether to follow redirects. Defaults to True.
            - max_redirects: Maximum number of redirects. Default 30, use -1 for unlimited.
            - retries: Number of retry attempts. Defaults to 3.
            - retry_delay: Number of seconds to wait between retry attempts. Defaults to 1 second.
            - proxies: Dict of proxies to use.
            - proxy: Proxy URL to use. Format: "http://username:password@localhost:8030".
            - proxy_auth: HTTP basic auth for proxy, tuple of (username, password).
            - auth: HTTP basic auth tuple of (username, password). Only basic auth is supported.
            - verify: Whether to verify HTTPS certificates.
            - cert: Tuple of (cert, key) filenames for the client certificate.
            - impersonate: Browser version to impersonate. Automatically defaults to the latest available Chrome version.
            - http3: Whether to use HTTP3. Defaults to False. It might be problematic if used with `impersonate`.
            - stealthy_headers: If enabled (default), it creates and adds real browser headers.
        :return: A `Response` object.
        """
        # Careful of sending a body in a DELETE request, it might cause some websites to reject the request as per https://www.rfc-editor.org/rfc/rfc7231#section-4.3.5,
        # But some websites accept it, it depends on the implementation used.
        stealthy_headers = kwargs.pop("stealthy_headers", None)
        return self._make_request("DELETE", stealth=stealthy_headers, url=url, **kwargs)
|
| 619 |
+
|
| 620 |
+
|
| 621 |
+
class FetcherSession:
|
| 622 |
+
"""
|
| 623 |
+
A factory context manager that provides configured Fetcher sessions.
|
| 624 |
+
|
| 625 |
+
When this manager is used in a 'with' or 'async with' block,
|
| 626 |
+
it yields a new session configured with the manager's defaults.
|
| 627 |
+
A single instance of this manager should ideally be used for one active
|
| 628 |
+
session at a time (or sequentially). Re-entering a context with the
|
| 629 |
+
same manager instance while a session is already active is disallowed.
|
| 630 |
+
"""
|
| 631 |
+
|
| 632 |
+
__slots__ = (
|
| 633 |
+
"_default_impersonate",
|
| 634 |
+
"_stealth",
|
| 635 |
+
"_default_proxies",
|
| 636 |
+
"_default_proxy",
|
| 637 |
+
"_default_proxy_auth",
|
| 638 |
+
"_default_timeout",
|
| 639 |
+
"_default_headers",
|
| 640 |
+
"_default_retries",
|
| 641 |
+
"_default_retry_delay",
|
| 642 |
+
"_default_follow_redirects",
|
| 643 |
+
"_default_max_redirects",
|
| 644 |
+
"_default_verify",
|
| 645 |
+
"_default_cert",
|
| 646 |
+
"_default_http3",
|
| 647 |
+
"selector_config",
|
| 648 |
+
"_client",
|
| 649 |
+
"_is_alive",
|
| 650 |
+
"_proxy_rotator",
|
| 651 |
+
)
|
| 652 |
+
|
| 653 |
+
def __init__(
|
| 654 |
+
self,
|
| 655 |
+
impersonate: ImpersonateType = "chrome",
|
| 656 |
+
http3: Optional[bool] = False,
|
| 657 |
+
stealthy_headers: Optional[bool] = True,
|
| 658 |
+
proxies: Optional[Dict[str, str]] = None,
|
| 659 |
+
proxy: Optional[str] = None,
|
| 660 |
+
proxy_auth: Optional[Tuple[str, str]] = None,
|
| 661 |
+
timeout: Optional[int | float] = 30,
|
| 662 |
+
headers: Optional[Dict[str, str]] = None,
|
| 663 |
+
retries: Optional[int] = 3,
|
| 664 |
+
retry_delay: Optional[int] = 1,
|
| 665 |
+
follow_redirects: bool = True,
|
| 666 |
+
max_redirects: int = 30,
|
| 667 |
+
verify: bool = True,
|
| 668 |
+
cert: Optional[str | Tuple[str, str]] = None,
|
| 669 |
+
selector_config: Optional[Dict] = None,
|
| 670 |
+
proxy_rotator: Optional[ProxyRotator] = None,
|
| 671 |
+
):
|
| 672 |
+
"""
|
| 673 |
+
:param impersonate: Browser version to impersonate. Can be a single browser string or a list of browser strings for random selection. (Default: latest available Chrome version)
|
| 674 |
+
:param http3: Whether to use HTTP3. Defaults to False. It might be problematic if used it with `impersonate`.
|
| 675 |
+
:param stealthy_headers: If enabled (default), it creates and adds real browser headers. It also sets the referer header as if this request came from a Google search of URL's domain.
|
| 676 |
+
:param proxies: Dict of proxies to use. Format: {"http": proxy_url, "https": proxy_url}.
|
| 677 |
+
:param proxy: Proxy URL to use. Format: "http://username:password@localhost:8030".
|
| 678 |
+
Cannot be used together with the `proxies` parameter.
|
| 679 |
+
:param proxy_auth: HTTP basic auth for proxy, tuple of (username, password).
|
| 680 |
+
:param timeout: Number of seconds to wait before timing out.
|
| 681 |
+
:param headers: Headers to include in the session with every request.
|
| 682 |
+
:param retries: Number of retry attempts. Defaults to 3.
|
| 683 |
+
:param retry_delay: Number of seconds to wait between retry attempts. Defaults to 1 second.
|
| 684 |
+
:param follow_redirects: Whether to follow redirects. Defaults to True.
|
| 685 |
+
:param max_redirects: Maximum number of redirects. Default 30, use -1 for unlimited.
|
| 686 |
+
:param verify: Whether to verify HTTPS certificates. Defaults to True.
|
| 687 |
+
:param cert: Tuple of (cert, key) filenames for the client certificate.
|
| 688 |
+
:param selector_config: Arguments passed when creating the final Selector class.
|
| 689 |
+
:param proxy_rotator: A ProxyRotator instance for automatic proxy rotation.
|
| 690 |
+
"""
|
| 691 |
+
self._default_impersonate: ImpersonateType = impersonate
|
| 692 |
+
self._stealth = stealthy_headers
|
| 693 |
+
self._default_proxies = proxies or {}
|
| 694 |
+
self._default_proxy = proxy or None
|
| 695 |
+
self._default_proxy_auth = proxy_auth or None
|
| 696 |
+
self._default_timeout = timeout
|
| 697 |
+
self._default_headers = headers or {}
|
| 698 |
+
self._default_retries = retries
|
| 699 |
+
self._default_retry_delay = retry_delay
|
| 700 |
+
self._default_follow_redirects = follow_redirects
|
| 701 |
+
self._default_max_redirects = max_redirects
|
| 702 |
+
self._default_verify = verify
|
| 703 |
+
self._default_cert = cert
|
| 704 |
+
self._default_http3 = http3
|
| 705 |
+
self.selector_config = selector_config or {}
|
| 706 |
+
self._is_alive = False
|
| 707 |
+
self._client: _SyncSessionLogic | _ASyncSessionLogic | None = None
|
| 708 |
+
self._proxy_rotator = proxy_rotator
|
| 709 |
+
|
| 710 |
+
def __enter__(self) -> _SyncSessionLogic:
    """Create, open and return a new synchronous session built from this instance's defaults.

    :return: Whatever the underlying `_SyncSessionLogic.__enter__()` returns.
    :raises RuntimeError: If this instance already holds an active session.
    """
    if self._client is not None:
        raise RuntimeError("This FetcherSession instance already has an active synchronous session.")

    # Every `_default_*` slot becomes a constructor argument with the prefix stripped
    config = {k.replace("_default_", ""): getattr(self, k) for k in self.__slots__ if k.startswith("_default")}
    config["stealthy_headers"] = self._stealth
    config["selector_config"] = self.selector_config
    config["proxy_rotator"] = self._proxy_rotator

    client = _SyncSessionLogic(**config)
    session = client.__enter__()
    # Record the client only after it was entered successfully. If entering fails, `__exit__` is never
    # called by the `with` statement, so storing it earlier would leave this instance stuck as "active".
    self._client = client
    self._is_alive = True
    return session
|
| 722 |
+
|
| 723 |
+
def __exit__(self, exc_type, exc_val, exc_tb):
    """Close the active synchronous session and reset this instance so it can be entered again.

    :raises RuntimeError: If there is no active session, or the active one is not a synchronous session.
    """
    active = self._client
    if active is None or not isinstance(active, _SyncSessionLogic):
        raise RuntimeError("Cannot exit invalid session")

    active.__exit__(exc_type, exc_val, exc_tb)
    self._client = None
    self._is_alive = False
|
| 730 |
+
|
| 731 |
+
async def __aenter__(self) -> _ASyncSessionLogic:
    """Create, open and return a new asynchronous session built from this instance's defaults.

    :return: Whatever the underlying `_ASyncSessionLogic.__aenter__()` returns.
    :raises RuntimeError: If this instance already holds an active session.
    """
    if self._client is not None:
        raise RuntimeError("This FetcherSession instance already has an active asynchronous session.")

    # Every `_default_*` slot becomes a constructor argument with the prefix stripped
    config = {k.replace("_default_", ""): getattr(self, k) for k in self.__slots__ if k.startswith("_default")}
    config["stealthy_headers"] = self._stealth
    config["selector_config"] = self.selector_config
    config["proxy_rotator"] = self._proxy_rotator

    client = _ASyncSessionLogic(**config)
    session = await client.__aenter__()
    # Record the client only after it was entered successfully. If entering fails, `__aexit__` is never
    # called by `async with`, so storing it earlier would leave this instance stuck as "active".
    self._client = client
    self._is_alive = True
    return session
|
| 743 |
+
|
| 744 |
+
async def __aexit__(self, exc_type, exc_val, exc_tb):
    """Close the active asynchronous session and reset this instance so it can be entered again.

    :raises RuntimeError: If there is no active session, or the active one is not an asynchronous session.
    """
    active = self._client
    if active is None or not isinstance(active, _ASyncSessionLogic):
        raise RuntimeError("Cannot exit invalid session")

    await active.__aexit__(exc_type, exc_val, exc_tb)
    self._client = None
    self._is_alive = False
|
| 751 |
+
|
| 752 |
+
|
| 753 |
+
class FetcherClient(_SyncSessionLogic):
    """Synchronous session logic that is not meant to be used as a context manager.

    The `__enter__`/`__exit__` slots declared here take the place of the methods inherited from
    `_SyncSessionLogic`, and every instance stores `None` in them.
    """

    __slots__ = ("__enter__", "__exit__")

    def __init__(self, **kwargs: Any) -> None:
        super().__init__(**kwargs)
        disabled: Any = None
        # Blank out the context-manager hooks on this instance
        self.__enter__ = disabled
        self.__exit__ = disabled
        # NOTE(review): `_NO_SESSION` is defined elsewhere in this module; presumably a sentinel meaning "no persistent curl session" - confirm
        self._curl_session: Any = _NO_SESSION
|
| 761 |
+
|
| 762 |
+
|
| 763 |
+
class AsyncFetcherClient(_ASyncSessionLogic):
    """Asynchronous session logic that is not meant to be used as an async context manager.

    The `__aenter__`/`__aexit__` slots declared here take the place of the methods inherited from
    `_ASyncSessionLogic`, and every instance stores `None` in them.
    """

    __slots__ = ("__aenter__", "__aexit__")

    def __init__(self, **kwargs: Any) -> None:
        super().__init__(**kwargs)
        disabled: Any = None
        # Blank out the async context-manager hooks on this instance
        self.__aenter__ = disabled
        self.__aexit__ = disabled
        # NOTE(review): `_NO_SESSION` is defined elsewhere in this module; presumably a sentinel meaning "no persistent curl session" - confirm
        self._async_curl_session: Any = _NO_SESSION
|
engines/toolbelt/__init__.py
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
from .proxy_rotation import ProxyRotator, is_proxy_error, cyclic_rotation
|
| 2 |
+
|
| 3 |
+
__all__ = ["ProxyRotator", "is_proxy_error", "cyclic_rotation"]
|
engines/toolbelt/convertor.py
ADDED
|
@@ -0,0 +1,306 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
from functools import lru_cache
|
| 2 |
+
from re import compile as re_compile
|
| 3 |
+
|
| 4 |
+
from curl_cffi.requests import Response as CurlResponse
|
| 5 |
+
from playwright._impl._errors import Error as PlaywrightError
|
| 6 |
+
from playwright.sync_api import Page as SyncPage, Response as SyncResponse
|
| 7 |
+
from playwright.async_api import Page as AsyncPage, Response as AsyncResponse
|
| 8 |
+
|
| 9 |
+
from scrapling.core.utils import log
|
| 10 |
+
from .custom import Response, StatusText
|
| 11 |
+
from scrapling.core._types import Dict, Optional
|
| 12 |
+
|
| 13 |
+
# Captures the charset parameter of a `content-type` header value. Parameter names are
# case-insensitive and values may be quoted, so `Charset=UTF-8` and `charset="utf-8"` must match too.
__CHARSET_RE__ = re_compile(r"(?i)charset=[\"']?([\w-]+)")
|
| 14 |
+
|
| 15 |
+
|
| 16 |
+
class ResponseFactory:
    """
    Factory class for creating `Response` objects from various sources.

    This class provides multiple static and class methods for building standardized `Response` objects
    from diverse input sources such as Playwright responses, asynchronous Playwright responses,
    and raw HTTP request responses. It supports handling response histories, constructing the proper
    response objects, and managing encoding, headers, cookies, and other attributes.
    """

    @classmethod
    @lru_cache(maxsize=16)
    def __extract_browser_encoding(cls, content_type: str | None, default: str = "utf-8") -> str:
        """Extract browser encoding from headers.
        Ex: from header "content-type: text/html; charset=utf-8" -> "utf-8"

        :param content_type: The raw `content-type` header value; may be empty or None.
        :param default: The encoding returned when no charset is found.
        :return: The charset found in the header value, otherwise `default`.
        """
        if content_type:
            # Because Playwright can't do that by themselves like all libraries for some reason :3
            match = __CHARSET_RE__.search(content_type)
            return match.group(1) if match else default
        return default

    @classmethod
    def _build_history_entry(
        cls,
        url: str,
        response: Optional[SyncResponse | AsyncResponse],
        headers: Dict,
        request_headers: Dict,
        parser_arguments: Dict,
    ) -> Response:
        """Build the `Response` describing one redirect hop. Shared by the sync and async history walkers.

        :param url: URL of the redirected request.
        :param response: The Playwright response of that request, or a falsy value when it is unavailable.
        :param headers: The already-resolved response headers (empty when there is no response).
        :param request_headers: The already-resolved request headers.
        :param parser_arguments: Extra arguments unpacked into the `Response`.
        :return: A `Response` with empty content for that hop.
        """
        if response:
            status = response.status
            reason = response.status_text or StatusText.get(response.status)
            encoding = cls.__extract_browser_encoding(response.headers.get("content-type", ""))
        else:
            # No response object for this hop: fall back to a plain 301
            status = 301
            reason = StatusText.get(301)
            encoding = "utf-8"

        return Response(
            **{
                "url": url,
                # using response.text() will trigger "Error: Response.text: Response body is unavailable for redirect responses"
                "content": "",
                "status": status,
                "reason": reason,
                "encoding": encoding,
                "cookies": tuple(),
                "headers": headers,
                "request_headers": request_headers,
                **parser_arguments,
            }
        )

    @classmethod
    def _process_response_history(cls, first_response: SyncResponse, parser_arguments: Dict) -> list[Response]:
        """Process response history to build a list of `Response` objects

        Walks the `redirected_from` chain backwards from `first_response` and inserts each hop at the front,
        so the returned list is ordered from the earliest request to the latest. Errors are logged and stop the
        walk; whatever was collected so far is returned.
        """
        history: list[Response] = []
        current_request = first_response.request.redirected_from

        try:
            while current_request:
                try:
                    current_response = current_request.response()
                    history.insert(
                        0,
                        cls._build_history_entry(
                            current_request.url,
                            current_response,
                            current_response.all_headers() if current_response else {},
                            current_request.all_headers(),
                            parser_arguments,
                        ),
                    )
                except Exception as e:  # pragma: no cover
                    log.error(f"Error processing redirect: {e}")
                    break

                current_request = current_request.redirected_from
        except Exception as e:  # pragma: no cover
            log.error(f"Error processing response history: {e}")

        return history

    @classmethod
    def from_playwright_response(
        cls,
        page: SyncPage,
        first_response: SyncResponse,
        final_response: Optional[SyncResponse],
        parser_arguments: Dict,
        meta: Optional[Dict] = None,
    ) -> Response:
        """
        Transforms a Playwright response into an internal `Response` object, encapsulating
        the page's content, response status, headers, and relevant metadata.

        The function handles potential issues, such as empty or missing final responses,
        by falling back to the first response if necessary. Encoding and status text
        are also derived from the provided response headers or reasonable defaults.
        Additionally, the page content and cookies are extracted for further use.

        :param page: A synchronous Playwright `Page` instance that represents the current browser page. Required to retrieve the page's URL, cookies, and content.
        :param first_response: An earlier or initial Playwright `Response` object that may serve as a fallback response in the absence of the final one. Its headers and request headers are the ones stored on the result.
        :param final_response: The last response received for the given request from the Playwright instance. Typically used as the main response object to derive status, encoding, and content.
        :param parser_arguments: A dictionary containing additional arguments needed for parsing or further customization of the returned `Response`. These arguments are dynamically unpacked into
            the `Response` object.
        :param meta: Additional meta data to be saved with the response.

        :return: A fully populated `Response` object containing the page's URL, content, status, headers, cookies, and other derived metadata.
        :raises ValueError: If neither a final nor a first response is available.
        :rtype: Response
        """
        # In case we didn't catch a document type somehow
        final_response = final_response if final_response else first_response
        if not final_response:
            raise ValueError("Failed to get a response from the page")

        encoding = cls.__extract_browser_encoding(final_response.headers.get("content-type", ""))
        # PlayWright API sometimes give empty status text for some reason!
        status_text = final_response.status_text or StatusText.get(final_response.status)

        history = cls._process_response_history(first_response, parser_arguments)
        try:
            if "html" in final_response.all_headers().get("content-type", ""):
                # NOTE(review): the DOM text is encoded as UTF-8 here while `encoding` still carries the
                # header charset - confirm the parser copes when the two differ.
                page_content = cls._get_page_content(page).encode("utf-8")
            else:
                page_content = final_response.body()
        except Exception as e:  # pragma: no cover
            log.error(f"Error getting page content: {e}")
            page_content = b""

        return Response(
            **{
                "url": page.url,
                "content": page_content,
                "status": final_response.status,
                "reason": status_text,
                "encoding": encoding,
                "cookies": tuple(dict(cookie) for cookie in page.context.cookies()),
                "headers": first_response.all_headers(),
                "request_headers": first_response.request.all_headers(),
                "history": history,
                "meta": meta,
                **parser_arguments,
            }
        )

    @classmethod
    async def _async_process_response_history(
        cls, first_response: AsyncResponse, parser_arguments: Dict
    ) -> list[Response]:
        """Process response history to build a list of `Response` objects

        Async counterpart of `_process_response_history`: walks the `redirected_from` chain backwards and
        returns the hops ordered from the earliest request to the latest. Errors are logged and stop the walk.
        """
        history: list[Response] = []
        current_request = first_response.request.redirected_from

        try:
            while current_request:
                try:
                    current_response = await current_request.response()
                    history.insert(
                        0,
                        cls._build_history_entry(
                            current_request.url,
                            current_response,
                            await current_response.all_headers() if current_response else {},
                            await current_request.all_headers(),
                            parser_arguments,
                        ),
                    )
                except Exception as e:  # pragma: no cover
                    log.error(f"Error processing redirect: {e}")
                    break

                current_request = current_request.redirected_from
        except Exception as e:  # pragma: no cover
            log.error(f"Error processing response history: {e}")

        return history

    @classmethod
    def _get_page_content(cls, page: SyncPage) -> str:
        """
        A workaround for the Playwright issue with `page.content()` on Windows. Ref.: https://github.com/microsoft/playwright/issues/16108
        :param page: The page to extract content from.
        :return: The page content, or an empty string when Playwright returns nothing.
        """
        # Retry until `page.content()` succeeds, pausing 500ms after each Playwright error
        while True:
            try:
                return page.content() or ""
            except PlaywrightError:
                page.wait_for_timeout(500)

    @classmethod
    async def _get_async_page_content(cls, page: AsyncPage) -> str:
        """
        A workaround for the Playwright issue with `page.content()` on Windows. Ref.: https://github.com/microsoft/playwright/issues/16108
        :param page: The page to extract content from.
        :return: The page content, or an empty string when Playwright returns nothing.
        """
        # Retry until `page.content()` succeeds, pausing 500ms after each Playwright error
        while True:
            try:
                return (await page.content()) or ""
            except PlaywrightError:
                await page.wait_for_timeout(500)

    @classmethod
    async def from_async_playwright_response(
        cls,
        page: AsyncPage,
        first_response: AsyncResponse,
        final_response: Optional[AsyncResponse],
        parser_arguments: Dict,
        meta: Optional[Dict] = None,
    ) -> Response:
        """
        Transforms a Playwright response into an internal `Response` object, encapsulating
        the page's content, response status, headers, and relevant metadata.

        The function handles potential issues, such as empty or missing final responses,
        by falling back to the first response if necessary. Encoding and status text
        are also derived from the provided response headers or reasonable defaults.
        Additionally, the page content and cookies are extracted for further use.

        :param page: An asynchronous Playwright `Page` instance that represents the current browser page. Required to retrieve the page's URL, cookies, and content.
        :param first_response: An earlier or initial Playwright `Response` object that may serve as a fallback response in the absence of the final one. Its headers and request headers are the ones stored on the result.
        :param final_response: The last response received for the given request from the Playwright instance. Typically used as the main response object to derive status, encoding, and content.
        :param parser_arguments: A dictionary containing additional arguments needed for parsing or further customization of the returned `Response`. These arguments are dynamically unpacked into
            the `Response` object.
        :param meta: Additional meta data to be saved with the response.

        :return: A fully populated `Response` object containing the page's URL, content, status, headers, cookies, and other derived metadata.
        :raises ValueError: If neither a final nor a first response is available.
        :rtype: Response
        """
        # In case we didn't catch a document type somehow
        final_response = final_response if final_response else first_response
        if not final_response:
            raise ValueError("Failed to get a response from the page")

        encoding = cls.__extract_browser_encoding(final_response.headers.get("content-type", ""))
        # PlayWright API sometimes give empty status text for some reason!
        status_text = final_response.status_text or StatusText.get(final_response.status)

        history = await cls._async_process_response_history(first_response, parser_arguments)
        try:
            if "html" in (await final_response.all_headers()).get("content-type", ""):
                # NOTE(review): the DOM text is encoded as UTF-8 here while `encoding` still carries the
                # header charset - confirm the parser copes when the two differ.
                page_content = (await cls._get_async_page_content(page)).encode("utf-8")
            else:
                page_content = await final_response.body()
        except Exception as e:  # pragma: no cover
            log.error(f"Error getting page content in async: {e}")
            page_content = b""

        return Response(
            **{
                "url": page.url,
                "content": page_content,
                "status": final_response.status,
                "reason": status_text,
                "encoding": encoding,
                "cookies": tuple(dict(cookie) for cookie in await page.context.cookies()),
                "headers": await first_response.all_headers(),
                "request_headers": await first_response.request.all_headers(),
                "history": history,
                "meta": meta,
                **parser_arguments,
            }
        )

    @staticmethod
    def from_http_request(response: CurlResponse, parser_arguments: Dict, meta: Optional[Dict] = None) -> Response:
        """Takes `curl_cffi` response and generates `Response` object from it.

        :param response: `curl_cffi` response object
        :param parser_arguments: Additional arguments to be passed to the `Response` object constructor.
        :param meta: Optional metadata dictionary to attach to the Response.
        :return: A `Response` object that is the same as `Selector` object except it has these added attributes: `status`, `reason`, `cookies`, `headers`, and `request_headers`
        """
        return Response(
            **{
                "url": response.url,
                "content": response.content,
                "status": response.status_code,
                "reason": response.reason,
                "encoding": response.encoding or "utf-8",
                "cookies": dict(response.cookies),
                "headers": dict(response.headers),
                "request_headers": dict(response.request.headers) if response.request else {},
                "method": response.request.method if response.request else "GET",
                "history": response.history,  # https://github.com/lexiforest/curl_cffi/issues/82
                "meta": meta,
                **parser_arguments,
            }
        )
|
engines/toolbelt/custom.py
ADDED
|
@@ -0,0 +1,295 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""
|
| 2 |
+
Functions related to custom types or type checking
|
| 3 |
+
"""
|
| 4 |
+
|
| 5 |
+
from functools import lru_cache
|
| 6 |
+
|
| 7 |
+
from scrapling.core.utils import log
|
| 8 |
+
from scrapling.core._types import (
|
| 9 |
+
Any,
|
| 10 |
+
Dict,
|
| 11 |
+
cast,
|
| 12 |
+
List,
|
| 13 |
+
Tuple,
|
| 14 |
+
Union,
|
| 15 |
+
Optional,
|
| 16 |
+
Callable,
|
| 17 |
+
Sequence,
|
| 18 |
+
TYPE_CHECKING,
|
| 19 |
+
AsyncGenerator,
|
| 20 |
+
)
|
| 21 |
+
from scrapling.core.custom_types import MappingProxyType
|
| 22 |
+
from scrapling.parser import Selector, SQLiteStorageSystem
|
| 23 |
+
|
| 24 |
+
if TYPE_CHECKING:
|
| 25 |
+
from scrapling.spiders import Request
|
| 26 |
+
|
| 27 |
+
|
| 28 |
+
class Response(Selector):
    """This class is returned by all engines as a way to unify the response type between different libraries."""

    def __init__(
        self,
        url: str,
        content: str | bytes,
        status: int,
        reason: str,
        cookies: Tuple[Dict[str, str], ...] | Dict[str, str],
        headers: Dict,
        request_headers: Dict,
        encoding: str = "utf-8",
        method: str = "GET",
        history: List | None = None,
        meta: Dict[str, Any] | None = None,
        **selector_config: Any,
    ):
        """Store the response attributes and initialize the underlying `Selector` with the content.

        :param url: The URL of the response. Used for logging, and for the `Selector` unless `adaptive_domain` is given.
        :param content: The response body; a string is encoded to UTF-8 bytes first.
        :param status: The response status code.
        :param reason: The status text of the response.
        :param cookies: The response cookies.
        :param headers: The response headers.
        :param request_headers: The headers of the request; its `referer` entry is included in the log line.
        :param encoding: The encoding passed to the `Selector`.
        :param method: The request method, only used in the log line.
        :param history: The previous responses leading to this one; defaults to an empty list.
        :param meta: Additional meta data to be saved with the response; defaults to an empty dict.
        :param selector_config: Arguments passed to the `Selector`. `adaptive_domain` is popped from it and,
            when not empty, replaces `url` as the URL passed to the `Selector`.
        :raises TypeError: If `meta` is truthy and not a dictionary.
        """
        if isinstance(content, str):
            content = content.encode("utf-8")

        adaptive_domain: str = cast(str, selector_config.pop("adaptive_domain", ""))
        self.status = status
        self.reason = reason
        self.cookies = cookies
        self.headers = headers
        self.request_headers = request_headers
        self.history = history or []
        super().__init__(
            content=content,
            url=adaptive_domain or url,
            encoding=encoding,
            **selector_config,
        )
        # For easier debugging while working from a Python shell
        log.info(f"Fetched ({status}) <{method} {url}> (referer: {request_headers.get('referer')})")

        if meta and not isinstance(meta, dict):
            raise TypeError(f"Response meta should be dictionary but got {type(meta).__name__} instead!")

        self.meta: Dict[str, Any] = meta or {}
        self.request: Optional["Request"] = None  # Will be set by crawler

    @property
    def body(self) -> bytes:
        """Return the raw body of the response as bytes."""
        return cast(bytes, cast(Sequence, self._raw_body))

    def follow(
        self,
        url: str,
        sid: str = "",
        callback: Callable[["Response"], AsyncGenerator[Union[Dict[str, Any], "Request", None], None]] | None = None,
        priority: int | None = None,
        dont_filter: bool = False,
        meta: dict[str, Any] | None = None,
        referer_flow: bool = True,
        **kwargs: Any,
    ) -> Any:
        """Create a Request to follow a URL.

        This is a helper method for spiders to easily follow links found in pages.

        **IMPORTANT**: The below arguments if left empty, the corresponding value from the previous request will be used. The only exception is `dont_filter`.

        :param url: The URL to follow (can be relative, will be joined with current URL)
        :param sid: The session id to use
        :param callback: Spider callback method to use
        :param priority: The priority number to use, the higher the number, the higher priority to be processed first.
        :param dont_filter: If this request has been done before, disable the filter to allow it again.
        :param meta: Additional meta data to be included in the request; merged over this response's meta.
        :param referer_flow: Enabled by default, set the current response url as referer for the new request url.
        :param kwargs: Additional Request arguments
        :return: Request object ready to be yielded
        :raises TypeError: If this response has no request attached yet.
        """
        from scrapling.spiders import Request

        if not self.request or not isinstance(self.request, Request):
            raise TypeError("This response has no request set yet.")

        # Merge original session kwargs with new kwargs (new takes precedence)
        session_kwargs = {**self.request._session_kwargs, **kwargs}

        if referer_flow:
            # The merge above is shallow, so the header dicts may be the very objects held by the parent
            # request or passed in by the caller. Build new dicts instead of mutating those in place,
            # and tolerate an explicit `None` value.

            # For requests
            session_kwargs["headers"] = {**(session_kwargs.get("headers") or {}), "referer": self.url}

            # For browsers
            session_kwargs["extra_headers"] = {**(session_kwargs.get("extra_headers") or {}), "referer": self.url}

            session_kwargs["google_search"] = False

        return Request(
            url=self.urljoin(url),
            sid=sid or self.request.sid,
            callback=callback or self.request.callback,
            priority=priority if priority is not None else self.request.priority,
            dont_filter=dont_filter,
            meta={**(self.meta or {}), **(meta or {})},
            **session_kwargs,
        )

    def __str__(self) -> str:
        return f"<{self.status} {self.url}>"
|
| 136 |
+
|
| 137 |
+
|
| 138 |
+
class BaseFetcher:
    """Base class holding the parser settings shared by fetchers as class-level attributes.

    The settings are changed through `configure()` and read back through `display_config()` or
    `_generate_parser_arguments()`.
    """

    __slots__ = ()
    huge_tree: bool = True
    adaptive: Optional[bool] = False
    storage: Any = SQLiteStorageSystem
    keep_cdata: Optional[bool] = False
    storage_args: Optional[Dict] = None
    keep_comments: Optional[bool] = False
    adaptive_domain: str = ""
    parser_keywords: Tuple = (
        "huge_tree",
        "adaptive",
        "storage",
        "keep_cdata",
        "storage_args",
        "keep_comments",
        "adaptive_domain",
    )  # Left open for the user

    def __init__(self, *args, **kwargs):
        """Only logs a deprecation warning; the passed arguments have no effect."""
        # For backward-compatibility before 0.2.99
        # Render every argument with str() so non-string positionals can't break the join, and join
        # everything at once so the suggested call never ends with a dangling ", "
        passed = [str(arg) for arg in args]
        passed.extend(f"{k}={v}" for k, v in kwargs.items())
        call_args = ", ".join(passed)

        log.warning(
            f"This logic is deprecated now, and have no effect; It will be removed with v0.3. Use `{self.__class__.__name__}.configure({call_args})` instead before fetching"
        )

    @classmethod
    def display_config(cls):
        """Return the current parser settings of this class as a dictionary."""
        return dict(
            huge_tree=cls.huge_tree,
            keep_comments=cls.keep_comments,
            keep_cdata=cls.keep_cdata,
            adaptive=cls.adaptive,
            storage=cls.storage,
            storage_args=cls.storage_args,
            adaptive_domain=cls.adaptive_domain,
        )

    @classmethod
    def configure(cls, **kwargs):
        """Set multiple arguments for the parser at once globally

        :param kwargs: The keywords can be any arguments of the following: huge_tree, keep_comments, keep_cdata, adaptive, storage, storage_args, adaptive_domain
        :raises AttributeError: If no keyword is passed, or a keyword names an existing attribute that is not a parser keyword.
        :raises ValueError: If a keyword does not match any attribute of the class.
        """
        if not kwargs:
            raise AttributeError(f"You must pass a keyword to configure, current keywords: {cls.parser_keywords}?")

        # Keywords are applied one by one, so the ones before an invalid keyword stay applied
        for key, value in kwargs.items():
            key = key.strip().lower()
            if not hasattr(cls, key):
                raise ValueError(f'Unknown parser argument: "{key}"; maybe you meant {cls.parser_keywords}?')
            if key not in cls.parser_keywords:
                # Yup, no fun allowed LOL
                raise AttributeError(f'Unknown parser argument: "{key}"; maybe you meant {cls.parser_keywords}?')
            setattr(cls, key, value)

    @classmethod
    def _generate_parser_arguments(cls) -> Dict:
        """Return the current parser settings as the arguments dictionary for the Selector class."""
        # Selector class parameters
        # I won't validate Selector's class parameters here again, I will leave it to be validated later
        parser_arguments = dict(
            huge_tree=cls.huge_tree,
            keep_comments=cls.keep_comments,
            keep_cdata=cls.keep_cdata,
            adaptive=cls.adaptive,
            storage=cls.storage,
            storage_args=cls.storage_args,
            adaptive_domain=cls.adaptive_domain,
        )

        return parser_arguments
|
| 216 |
+
|
| 217 |
+
|
| 218 |
+
class StatusText:
    """Lookup table from HTTP response status codes to their reason phrases.

    Reference: https://developer.mozilla.org/en-US/docs/Web/HTTP/Status
    """

    # Wrapped in a MappingProxyType so the table is exposed read-only
    _phrases = MappingProxyType(
        {
            # 1xx
            100: "Continue",
            101: "Switching Protocols",
            102: "Processing",
            103: "Early Hints",
            # 2xx
            200: "OK",
            201: "Created",
            202: "Accepted",
            203: "Non-Authoritative Information",
            204: "No Content",
            205: "Reset Content",
            206: "Partial Content",
            207: "Multi-Status",
            208: "Already Reported",
            226: "IM Used",
            # 3xx
            300: "Multiple Choices",
            301: "Moved Permanently",
            302: "Found",
            303: "See Other",
            304: "Not Modified",
            305: "Use Proxy",
            307: "Temporary Redirect",
            308: "Permanent Redirect",
            # 4xx
            400: "Bad Request",
            401: "Unauthorized",
            402: "Payment Required",
            403: "Forbidden",
            404: "Not Found",
            405: "Method Not Allowed",
            406: "Not Acceptable",
            407: "Proxy Authentication Required",
            408: "Request Timeout",
            409: "Conflict",
            410: "Gone",
            411: "Length Required",
            412: "Precondition Failed",
            413: "Payload Too Large",
            414: "URI Too Long",
            415: "Unsupported Media Type",
            416: "Range Not Satisfiable",
            417: "Expectation Failed",
            418: "I'm a teapot",
            421: "Misdirected Request",
            422: "Unprocessable Entity",
            423: "Locked",
            424: "Failed Dependency",
            425: "Too Early",
            426: "Upgrade Required",
            428: "Precondition Required",
            429: "Too Many Requests",
            431: "Request Header Fields Too Large",
            451: "Unavailable For Legal Reasons",
            # 5xx
            500: "Internal Server Error",
            501: "Not Implemented",
            502: "Bad Gateway",
            503: "Service Unavailable",
            504: "Gateway Timeout",
            505: "HTTP Version Not Supported",
            506: "Variant Also Negotiates",
            507: "Insufficient Storage",
            508: "Loop Detected",
            510: "Not Extended",
            511: "Network Authentication Required",
        }
    )

    @classmethod
    @lru_cache(maxsize=128)
    def get(cls, status_code: int) -> str:
        """Return the reason phrase for `status_code`, or "Unknown Status Code" when the code is not in the table.

        :param status_code: The HTTP status code to look up.
        :return: The matching phrase, or the fallback text for codes that are not listed.
        """
        try:
            return cls._phrases[status_code]
        except KeyError:
            return "Unknown Status Code"
|
engines/toolbelt/fingerprints.py
ADDED
|
@@ -0,0 +1,88 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""
|
| 2 |
+
Functions related to generating headers and fingerprints generally
|
| 3 |
+
"""
|
| 4 |
+
|
| 5 |
+
from functools import lru_cache
|
| 6 |
+
from platform import system as platform_system
|
| 7 |
+
|
| 8 |
+
from tld import get_tld, Result
|
| 9 |
+
from browserforge.headers import Browser, HeaderGenerator
|
| 10 |
+
from browserforge.headers.generator import SUPPORTED_OPERATING_SYSTEMS
|
| 11 |
+
|
| 12 |
+
from scrapling.core._types import Dict, Literal, Tuple, cast
|
| 13 |
+
|
| 14 |
+
# Operating system name as reported by `platform.system()`, read once when the module is imported
__OS_NAME__ = platform_system()
# The lowercase OS identifiers that `get_os_name` returns for a recognized system
OSName = Literal["linux", "macos", "windows"]
# Current versions hardcoded for now (Playwright doesn't allow to know the version of a browser without launching it)
chromium_version = 141
chrome_version = 143
|
| 19 |
+
|
| 20 |
+
|
| 21 |
+
@lru_cache(10, typed=True)
def generate_convincing_referer(url: str) -> str | None:
    """Build a Google search URL for the bare domain name of `url`, so the request looks like it came from a search.

    >>> generate_convincing_referer('https://www.somewebsite.com/blah')
    'https://www.google.com/search?q=somewebsite'

    :param url: The URL you are about to fetch.
    :return: Google's search URL of the domain name, or None for localhost/IP addresses
    """
    # The cast works around the inaccurate return type hint of `get_tld`
    parsed: Result | None = cast(Result, get_tld(url, as_object=True, fail_silently=True))
    if not parsed:
        return None

    name = parsed.domain

    # No referer for localhost, loopback addresses, or when there is no usable domain/TLD
    if not (name and parsed.tld) or name in ("localhost", "127.0.0.1", "::1"):
        return None

    # Simple IPv4 check: every non-empty dot-separated piece is made of digits
    pieces = [piece for piece in name.split(".") if piece]
    if all(piece.isdigit() for piece in pieces):
        return None

    return f"https://www.google.com/search?q={name}"
|
| 47 |
+
|
| 48 |
+
|
| 49 |
+
@lru_cache(1, typed=True)
def get_os_name() -> OSName | Tuple:
    """Translate the detected OS name into the spelling browserforge expects.

    :return: "linux", "macos", or "windows" when the OS is recognized; otherwise `SUPPORTED_OPERATING_SYSTEMS`, so every supported OS is allowed.
    """
    known_systems = {
        "Linux": "linux",
        "Darwin": "macos",
        "Windows": "windows",
    }
    # Anything unrecognized falls back to the full list of supported systems
    return known_systems.get(__OS_NAME__, SUPPORTED_OPERATING_SYSTEMS)  # pragma: no cover
|
| 64 |
+
|
| 65 |
+
|
| 66 |
+
def generate_headers(browser_mode: bool | str = False) -> Dict:
    """Generate real browser-like headers using browserforge's generator

    :param browser_mode: If enabled, the headers created are used for playwright, so it has to match everything.
        The value "chrome" pins the headers to `chrome_version`; any other value uses `chromium_version`.
    :return: A dictionary of the generated headers
    """
    # In the browser mode, only the OS and the browser type/version need to match the browser in use,
    # so websites fingerprinting us don't see any inconsistency
    target_os = get_os_name()
    pinned_version = chrome_version if browser_mode == "chrome" else chromium_version
    candidates = [Browser(name="chrome", min_version=pinned_version, max_version=pinned_version)]

    if not browser_mode:
        # Outside the browser mode nothing has to match, so allow every desktop OS and more browsers
        target_os = ("windows", "macos", "linux")
        candidates += [
            Browser(name="firefox", min_version=142),
            Browser(name="edge", min_version=140),
        ]

    generator = HeaderGenerator(browser=candidates, os=target_os, device="desktop")
    return generator.generate()
|
| 86 |
+
|
| 87 |
+
|
| 88 |
+
# "User-Agent" value of one set of headers generated outside the browser mode; evaluated once when the module is imported
__default_useragent__ = generate_headers(browser_mode=False).get("User-Agent")
|
engines/toolbelt/navigation.py
ADDED
|
@@ -0,0 +1,109 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""
|
| 2 |
+
Functions related to files and URLs
|
| 3 |
+
"""
|
| 4 |
+
|
| 5 |
+
from urllib.parse import urlparse
|
| 6 |
+
|
| 7 |
+
from playwright.async_api import Route as async_Route
|
| 8 |
+
from msgspec import Struct, structs, convert, ValidationError
|
| 9 |
+
from playwright.sync_api import Route
|
| 10 |
+
|
| 11 |
+
from scrapling.core.utils import log
|
| 12 |
+
from scrapling.core._types import Dict, Set, Tuple, Optional, Callable
|
| 13 |
+
from scrapling.engines.constants import EXTRA_RESOURCES
|
| 14 |
+
|
| 15 |
+
|
| 16 |
+
class ProxyDict(Struct):
    """Schema that `construct_proxy_dict` converts dictionary proxies into for validation."""

    # Required; the only field without a default
    server: str
    # Credentials default to empty strings
    username: str = ""
    password: str = ""
|
| 20 |
+
|
| 21 |
+
|
| 22 |
+
def create_intercept_handler(disable_resources: bool, blocked_domains: Optional[Set[str]] = None) -> Callable:
    """Build a sync route handler that aborts requests by resource type and by domain.

    :param disable_resources: Whether to block default resource types.
    :param blocked_domains: Set of domain names to block requests to.
    :return: A sync route handler function.
    """
    blocked_types = EXTRA_RESOURCES if disable_resources else set()
    blocked_hosts = blocked_domains or set()

    def handler(route: Route):
        request = route.request

        # The resource type is checked first; domains are only consulted for requests that pass it
        if request.resource_type in blocked_types:
            log.debug(f'Blocking background resource "{request.url}" of type "{request.resource_type}"')
            route.abort()
            return

        if blocked_hosts:
            hostname = urlparse(request.url).hostname or ""
            for domain in blocked_hosts:
                # Exact match, or any subdomain of a blocked domain
                if hostname == domain or hostname.endswith("." + domain):
                    log.debug(f'Blocking request to blocked domain "{hostname}" ({request.url})')
                    route.abort()
                    return

        route.continue_()

    return handler
|
| 47 |
+
|
| 48 |
+
|
| 49 |
+
def create_async_intercept_handler(disable_resources: bool, blocked_domains: Optional[Set[str]] = None) -> Callable:
    """Build an async route handler that aborts requests by resource type and by domain.

    :param disable_resources: Whether to block default resource types.
    :param blocked_domains: Set of domain names to block requests to.
    :return: An async route handler function.
    """
    blocked_types = EXTRA_RESOURCES if disable_resources else set()
    blocked_hosts = blocked_domains or set()

    async def handler(route: async_Route):
        request = route.request

        # The resource type is checked first; domains are only consulted for requests that pass it
        if request.resource_type in blocked_types:
            log.debug(f'Blocking background resource "{request.url}" of type "{request.resource_type}"')
            await route.abort()
            return

        if blocked_hosts:
            hostname = urlparse(request.url).hostname or ""
            for domain in blocked_hosts:
                # Exact match, or any subdomain of a blocked domain
                if hostname == domain or hostname.endswith("." + domain):
                    log.debug(f'Blocking request to blocked domain "{hostname}" ({request.url})')
                    await route.abort()
                    return

        await route.continue_()

    return handler
|
| 74 |
+
|
| 75 |
+
|
| 76 |
+
def construct_proxy_dict(proxy_string: str | Dict[str, str] | Tuple) -> Dict:
|
| 77 |
+
"""Validate a proxy and return it in the acceptable format for Playwright
|
| 78 |
+
Reference: https://playwright.dev/python/docs/network#http-proxy
|
| 79 |
+
|
| 80 |
+
:param proxy_string: A string or a dictionary representation of the proxy.
|
| 81 |
+
:return:
|
| 82 |
+
"""
|
| 83 |
+
if isinstance(proxy_string, str):
|
| 84 |
+
proxy = urlparse(proxy_string)
|
| 85 |
+
if proxy.scheme not in ("http", "https", "socks4", "socks5") or not proxy.hostname:
|
| 86 |
+
raise ValueError("Invalid proxy string!")
|
| 87 |
+
|
| 88 |
+
try:
|
| 89 |
+
result = {
|
| 90 |
+
"server": f"{proxy.scheme}://{proxy.hostname}",
|
| 91 |
+
"username": proxy.username or "",
|
| 92 |
+
"password": proxy.password or "",
|
| 93 |
+
}
|
| 94 |
+
if proxy.port:
|
| 95 |
+
result["server"] += f":{proxy.port}"
|
| 96 |
+
return result
|
| 97 |
+
except ValueError:
|
| 98 |
+
# Urllib will say that one of the parameters above can't be casted to the correct type like `int` for port etc...
|
| 99 |
+
raise ValueError("The proxy argument's string is in invalid format!")
|
| 100 |
+
|
| 101 |
+
elif isinstance(proxy_string, dict):
|
| 102 |
+
try:
|
| 103 |
+
validated = convert(proxy_string, ProxyDict)
|
| 104 |
+
result_dict = structs.asdict(validated)
|
| 105 |
+
return result_dict
|
| 106 |
+
except ValidationError as e:
|
| 107 |
+
raise TypeError(f"Invalid proxy dictionary: {e}")
|
| 108 |
+
|
| 109 |
+
raise TypeError(f"Invalid proxy string: {proxy_string}")
|
engines/toolbelt/proxy_rotation.py
ADDED
|
@@ -0,0 +1,104 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
from threading import Lock
|
| 2 |
+
|
| 3 |
+
from scrapling.core._types import Callable, Dict, List, Tuple, ProxyType
|
| 4 |
+
|
| 5 |
+
|
| 6 |
+
# Signature of a rotation strategy: called with (proxies, current_index), returns (chosen proxy, next index)
RotationStrategy = Callable[[List[ProxyType], int], Tuple[ProxyType, int]]
# Fragments that `is_proxy_error` searches for in the lowercased error message, hence all lowercase
_PROXY_ERROR_INDICATORS = {
    "net::err_proxy",
    "net::err_tunnel",
    "connection refused",
    "connection reset",
    "connection timed out",
    "failed to connect",
    "could not resolve proxy",
}
|
| 16 |
+
|
| 17 |
+
|
| 18 |
+
def _get_proxy_key(proxy: ProxyType) -> str:
    """Return the key that identifies a proxy: the string itself, or "server|username" for dict proxies."""
    if not isinstance(proxy, str):
        # Missing fields count as empty strings
        return f'{proxy.get("server", "")}|{proxy.get("username", "")}'
    return proxy
|
| 25 |
+
|
| 26 |
+
|
| 27 |
+
def is_proxy_error(error: Exception) -> bool:
    """Tell whether an error looks proxy-related, judging by its message. Works for both HTTP and browser errors.

    :param error: The exception to inspect.
    :return: True when the lowercased message contains any of the known proxy error indicators.
    """
    message = str(error).lower()
    for fragment in _PROXY_ERROR_INDICATORS:
        if fragment in message:
            return True
    return False
|
| 31 |
+
|
| 32 |
+
|
| 33 |
+
def cyclic_rotation(proxies: List[ProxyType], current_index: int) -> Tuple[ProxyType, int]:
    """Default cyclic rotation strategy: walk the proxies in order and wrap around after the last one.

    :param proxies: The proxies to rotate through.
    :param current_index: The index to serve now; it is wrapped into range first.
    :return: The proxy at the wrapped index and the index to use next.
    """
    total = len(proxies)
    position = current_index % total
    following = (position + 1) % total
    return proxies[position], following
|
| 37 |
+
|
| 38 |
+
|
| 39 |
+
class ProxyRotator:
    """
    A thread-safe proxy rotator with pluggable rotation strategies.

    Supports:
    - Cyclic rotation (default)
    - Custom rotation strategies via callable
    - Both string URLs and Playwright-style dict proxies
    """

    __slots__ = ("_proxies", "_proxy_to_index", "_strategy", "_current_index", "_lock")

    def __init__(
        self,
        proxies: List[ProxyType],
        strategy: RotationStrategy = cyclic_rotation,
    ):
        """
        Initialize the proxy rotator.

        :param proxies: List of proxy URLs or Playwright-style proxy dicts.
            - String format: "http://proxy1:8080" or "http://user:pass@proxy:8080"
            - Dict format: {"server": "http://proxy:8080", "username": "user", "password": "pass"}
        :param strategy: Rotation strategy function. Takes (proxies, current_index) and returns (proxy, next_index). Defaults to cyclic_rotation.
        :raises ValueError: If no proxy is provided, or a proxy dict has no 'server' key.
        :raises TypeError: If `proxies` is a single string/dict instead of a collection, a proxy is neither a string nor a dict, or `strategy` is not callable.
        """
        if not proxies:
            raise ValueError("At least one proxy must be provided")

        # A lone string would be iterated character by character (and a dict key by key),
        # and every piece would pass the per-item type check below, so reject it explicitly
        if isinstance(proxies, (str, dict)):
            raise TypeError(
                f"proxies must be a list of proxies, got a single {type(proxies).__name__}. Wrap it in a list."
            )

        if not callable(strategy):
            raise TypeError(f"strategy must be callable, got {type(strategy).__name__}")

        self._strategy = strategy
        self._lock = Lock()

        # Validate and store proxies
        self._proxies: List[ProxyType] = []
        self._proxy_to_index: Dict[str, int] = {}  # O(1) lookup by unique key (server + username)
        for i, proxy in enumerate(proxies):
            if not isinstance(proxy, (str, dict)):
                raise TypeError(f"Invalid proxy type: {type(proxy)}. Expected str or dict.")

            if isinstance(proxy, dict) and "server" not in proxy:
                raise ValueError("Proxy dict must have a 'server' key")

            # If two proxies share a key, the index of the last one is kept
            self._proxy_to_index[_get_proxy_key(proxy)] = i
            self._proxies.append(proxy)

        # Iterables without a length (e.g., generators) are always truthy, so emptiness is re-checked after consuming them
        if not self._proxies:
            raise ValueError("At least one proxy must be provided")

        self._current_index = 0

    def get_proxy(self) -> ProxyType:
        """Get the next proxy according to the rotation strategy."""
        # The lock makes picking the proxy and storing the next index a single step across threads
        with self._lock:
            proxy, self._current_index = self._strategy(self._proxies, self._current_index)
            return proxy

    @property
    def proxies(self) -> List[ProxyType]:
        """Get a copy of all configured proxies."""
        return list(self._proxies)

    def __len__(self) -> int:
        """Return the total number of configured proxies."""
        return len(self._proxies)

    def __repr__(self) -> str:
        return f"ProxyRotator(proxies={len(self._proxies)})"
|
fetchers/__init__.py
ADDED
|
@@ -0,0 +1,48 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
from typing import TYPE_CHECKING, Any
|
| 2 |
+
from scrapling.engines.toolbelt import ProxyRotator
|
| 3 |
+
|
| 4 |
+
if TYPE_CHECKING:
|
| 5 |
+
from scrapling.fetchers.requests import Fetcher, AsyncFetcher, FetcherSession
|
| 6 |
+
from scrapling.fetchers.chrome import DynamicFetcher, DynamicSession, AsyncDynamicSession
|
| 7 |
+
from scrapling.fetchers.stealth_chrome import StealthyFetcher, StealthySession, AsyncStealthySession
|
| 8 |
+
|
| 9 |
+
|
| 10 |
+
# Lazy import mapping: exported name -> (module path, attribute name).
# The module-level `__getattr__` below imports the module when one of these names is first requested.
_LAZY_IMPORTS = {
    "Fetcher": ("scrapling.fetchers.requests", "Fetcher"),
    "AsyncFetcher": ("scrapling.fetchers.requests", "AsyncFetcher"),
    "FetcherSession": ("scrapling.fetchers.requests", "FetcherSession"),
    "DynamicFetcher": ("scrapling.fetchers.chrome", "DynamicFetcher"),
    "DynamicSession": ("scrapling.fetchers.chrome", "DynamicSession"),
    "AsyncDynamicSession": ("scrapling.fetchers.chrome", "AsyncDynamicSession"),
    "StealthyFetcher": ("scrapling.fetchers.stealth_chrome", "StealthyFetcher"),
    "StealthySession": ("scrapling.fetchers.stealth_chrome", "StealthySession"),
    "AsyncStealthySession": ("scrapling.fetchers.stealth_chrome", "AsyncStealthySession"),
}
|
| 22 |
+
|
| 23 |
+
# Public names of this package: every lazily imported name plus `ProxyRotator`, which is imported eagerly above
__all__ = [
    "Fetcher",
    "AsyncFetcher",
    "ProxyRotator",
    "FetcherSession",
    "DynamicFetcher",
    "DynamicSession",
    "AsyncDynamicSession",
    "StealthyFetcher",
    "StealthySession",
    "AsyncStealthySession",
]
|
| 35 |
+
|
| 36 |
+
|
| 37 |
+
def __getattr__(name: str) -> Any:
    """Import and return a lazily exported name the first time it is looked up on the module.

    :param name: The attribute being requested from the module.
    :return: The object registered for `name` in `_LAZY_IMPORTS`.
    :raises AttributeError: If `name` is not one of the lazily exported names.
    """
    target = _LAZY_IMPORTS.get(name)
    if target is None:
        raise AttributeError(f"module {__name__!r} has no attribute {name!r}")

    module_path, class_name = target
    # A non-empty `fromlist` makes `__import__` return the submodule itself instead of the top-level package
    loaded_module = __import__(module_path, fromlist=[class_name])
    return getattr(loaded_module, class_name)
|
| 44 |
+
|
| 45 |
+
|
| 46 |
+
def __dir__() -> list[str]:
    """Support for dir() and autocomplete.

    :return: The sorted public names of the module: everything in `__all__` (which includes the eagerly
        imported `ProxyRotator`) together with every lazily imported name.
    """
    # Listing only the lazy names would hide `ProxyRotator`, so both sources are merged
    return sorted({*__all__, *_LAZY_IMPORTS})
|
fetchers/chrome.py
ADDED
|
@@ -0,0 +1,91 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
from scrapling.core._types import Unpack
|
| 2 |
+
from scrapling.engines._browsers._types import PlaywrightSession
|
| 3 |
+
from scrapling.engines.toolbelt.custom import BaseFetcher, Response
|
| 4 |
+
from scrapling.engines._browsers._controllers import DynamicSession, AsyncDynamicSession
|
| 5 |
+
|
| 6 |
+
|
| 7 |
+
class DynamicFetcher(BaseFetcher):
    """A `Fetcher` that provide many options to fetch/load websites' pages through chromium-based browsers."""

    @classmethod
    def fetch(cls, url: str, **kwargs: Unpack[PlaywrightSession]) -> Response:
        """Open a browser session with the options below, fetch `url` once, and close the session.

        :param url: Target url.
        :param headless: Run the browser in headless/hidden (default), or headful/visible mode.
        :param disable_resources: Drop requests for unnecessary resources for a speed boost.
        :param blocked_domains: A set of domain names to block requests to. Subdomains are also matched (e.g., ``"example.com"`` blocks ``"sub.example.com"`` too).
        :param useragent: Pass a useragent string to be used. Otherwise the fetcher will generate a real Useragent of the same browser and use it.
        :param cookies: Set cookies for the next request.
        :param network_idle: Wait for the page until there are no network connections for at least 500 ms.
        :param load_dom: Enabled by default, wait for all JavaScript on page(s) to fully load and execute.
        :param timeout: The timeout in milliseconds that is used in all operations and waits through the page. The default is 30,000
        :param wait: The time (milliseconds) the fetcher will wait after everything finishes before closing the page and returning the Response object.
        :param page_action: Added for automation. A function that takes the `page` object and does the automation you need.
        :param wait_selector: Wait for a specific CSS selector to be in a specific state.
        :param init_script: An absolute path to a JavaScript file to be executed on page creation with this request.
        :param locale: Set the locale for the browser if wanted. Defaults to the system default locale.
        :param wait_selector_state: The state to wait for the selector given with `wait_selector`. The default state is `attached`.
        :param real_chrome: If you have a Chrome browser installed on your device, enable this, and the Fetcher will launch an instance of your browser and use it.
        :param cdp_url: Instead of launching a new browser instance, connect to this CDP URL to control real browsers through CDP.
        :param google_search: Enabled by default, Scrapling will set the referer header to be as if this request came from a Google search of this website's domain name.
        :param extra_headers: A dictionary of extra headers to add to the request.
        :param proxy: The proxy to be used with requests, it can be a string or a dictionary with the keys 'server', 'username', and 'password' only.
        :param extra_flags: A list of additional browser flags to pass to the browser on launch.
        :param selector_config: The arguments that will be passed in the end while creating the final Selector's class.
        :param additional_args: Additional arguments to be passed to Playwright's context as additional settings.
        :return: A `Response` object.
        :raises TypeError: If `selector_config` is not a dictionary.
        """
        # `custom_config` is still checked for backward compatibility
        # NOTE(review): a `custom_config` key stays in `kwargs` and is forwarded to the session — confirm the session accepts it
        overrides = kwargs.get("selector_config", {}) or kwargs.get("custom_config", {})
        if not isinstance(overrides, dict):
            raise TypeError("Argument `selector_config` must be a dictionary.")

        # Class-level parser settings first, so the values passed by the caller win
        defaults = cls._generate_parser_arguments()
        kwargs["selector_config"] = {**defaults, **overrides}

        with DynamicSession(**kwargs) as session:
            return session.fetch(url)

    @classmethod
    async def async_fetch(cls, url: str, **kwargs: Unpack[PlaywrightSession]) -> Response:
        """Open an async browser session with the options below, fetch `url` once, and close the session.

        :param url: Target url.
        :param headless: Run the browser in headless/hidden (default), or headful/visible mode.
        :param disable_resources: Drop requests for unnecessary resources for a speed boost.
        :param blocked_domains: A set of domain names to block requests to. Subdomains are also matched (e.g., ``"example.com"`` blocks ``"sub.example.com"`` too).
        :param useragent: Pass a useragent string to be used. Otherwise the fetcher will generate a real Useragent of the same browser and use it.
        :param cookies: Set cookies for the next request.
        :param network_idle: Wait for the page until there are no network connections for at least 500 ms.
        :param load_dom: Enabled by default, wait for all JavaScript on page(s) to fully load and execute.
        :param timeout: The timeout in milliseconds that is used in all operations and waits through the page. The default is 30,000
        :param wait: The time (milliseconds) the fetcher will wait after everything finishes before closing the page and returning the Response object.
        :param page_action: Added for automation. A function that takes the `page` object and does the automation you need.
        :param wait_selector: Wait for a specific CSS selector to be in a specific state.
        :param init_script: An absolute path to a JavaScript file to be executed on page creation with this request.
        :param locale: Set the locale for the browser if wanted. Defaults to the system default locale.
        :param wait_selector_state: The state to wait for the selector given with `wait_selector`. The default state is `attached`.
        :param real_chrome: If you have a Chrome browser installed on your device, enable this, and the Fetcher will launch an instance of your browser and use it.
        :param cdp_url: Instead of launching a new browser instance, connect to this CDP URL to control real browsers through CDP.
        :param google_search: Enabled by default, Scrapling will set the referer header to be as if this request came from a Google search of this website's domain name.
        :param extra_headers: A dictionary of extra headers to add to the request.
        :param proxy: The proxy to be used with requests, it can be a string or a dictionary with the keys 'server', 'username', and 'password' only.
        :param extra_flags: A list of additional browser flags to pass to the browser on launch.
        :param selector_config: The arguments that will be passed in the end while creating the final Selector's class.
        :param additional_args: Additional arguments to be passed to Playwright's context as additional settings.
        :return: A `Response` object.
        :raises TypeError: If `selector_config` is not a dictionary.
        """
        # `custom_config` is still checked for backward compatibility
        # NOTE(review): a `custom_config` key stays in `kwargs` and is forwarded to the session — confirm the session accepts it
        overrides = kwargs.get("selector_config", {}) or kwargs.get("custom_config", {})
        if not isinstance(overrides, dict):
            raise TypeError("Argument `selector_config` must be a dictionary.")

        # Class-level parser settings first, so the values passed by the caller win
        defaults = cls._generate_parser_arguments()
        kwargs["selector_config"] = {**defaults, **overrides}

        async with AsyncDynamicSession(**kwargs) as session:
            return await session.fetch(url)
|
| 89 |
+
|
| 90 |
+
|
| 91 |
+
# Alias that keeps the name `PlayWrightFetcher` importable; it is the very same class as `DynamicFetcher`
PlayWrightFetcher = DynamicFetcher  # For backward-compatibility
|
fetchers/requests.py
ADDED
|
@@ -0,0 +1,28 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
from scrapling.engines.static import (
|
| 2 |
+
FetcherSession,
|
| 3 |
+
FetcherClient as _FetcherClient,
|
| 4 |
+
AsyncFetcherClient as _AsyncFetcherClient,
|
| 5 |
+
)
|
| 6 |
+
from scrapling.engines.toolbelt.custom import BaseFetcher
|
| 7 |
+
|
| 8 |
+
|
| 9 |
+
# Client instances created once at import time; `Fetcher` and `AsyncFetcher` below expose their bound request methods.
# Names that also end with two underscores are not name-mangled, so they can be referenced inside the class bodies.
__FetcherClientInstance__ = _FetcherClient()
__AsyncFetcherClientInstance__ = _AsyncFetcherClient()
|
| 11 |
+
|
| 12 |
+
|
| 13 |
+
class Fetcher(BaseFetcher):
    """A basic `Fetcher` class type that can only do basic GET, POST, PUT, and DELETE HTTP requests based on `curl_cffi`."""

    # Each attribute is a bound method of the module-level `FetcherClient` instance,
    # so calls like `Fetcher.get(...)` go to that one shared client without instantiating `Fetcher`
    get = __FetcherClientInstance__.get
    post = __FetcherClientInstance__.post
    put = __FetcherClientInstance__.put
    delete = __FetcherClientInstance__.delete
|
| 20 |
+
|
| 21 |
+
|
| 22 |
+
class AsyncFetcher(BaseFetcher):
    """A basic `AsyncFetcher` class type that exposes the GET, POST, PUT, and DELETE request methods of the shared `AsyncFetcherClient` instance, based on `curl_cffi`."""

    # Each attribute is a bound method of the module-level `AsyncFetcherClient` instance
    # NOTE(review): presumably these are coroutine functions that must be awaited — confirm against `scrapling.engines.static`
    get = __AsyncFetcherClientInstance__.get
    post = __AsyncFetcherClientInstance__.post
    put = __AsyncFetcherClientInstance__.put
    delete = __AsyncFetcherClientInstance__.delete
|
fetchers/stealth_chrome.py
ADDED
|
@@ -0,0 +1,109 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
from scrapling.core._types import Unpack
|
| 2 |
+
from scrapling.engines._browsers._types import StealthSession
|
| 3 |
+
from scrapling.engines.toolbelt.custom import BaseFetcher, Response
|
| 4 |
+
from scrapling.engines._browsers._stealth import StealthySession, AsyncStealthySession
|
| 5 |
+
|
| 6 |
+
|
| 7 |
+
class StealthyFetcher(BaseFetcher):
    """A completely stealthy `Fetcher` class type built on top of Chromium.

    It works like a real browser, passing almost all online tests/protections, with many customization options.
    """

    @classmethod
    def fetch(cls, url: str, **kwargs: Unpack[StealthSession]) -> Response:
        """
        Open a browser and perform the request according to the options chosen below.

        :param url: Target url.
        :param headless: Run the browser in headless/hidden (default), or headful/visible mode.
        :param disable_resources: Drop requests for unnecessary resources for a speed boost.
            Requests dropped are of type `font`, `image`, `media`, `beacon`, `object`, `imageset`, `texttrack`, `websocket`, `csp_report`, and `stylesheet`.
        :param blocked_domains: A set of domain names to block requests to. Subdomains are also matched (e.g., ``"example.com"`` blocks ``"sub.example.com"`` too).
        :param useragent: A useragent string to use. Otherwise, the fetcher generates a real Useragent of the same browser and uses it.
        :param cookies: Cookies to set for the next request.
        :param network_idle: Wait for the page until there are no network connections for at least 500 ms.
        :param timeout: The timeout in milliseconds used in all operations and waits through the page. The default is 30,000.
        :param wait: The time (milliseconds) the fetcher waits after everything finishes before closing the page and returning the `Response` object.
        :param page_action: Added for automation. A function that takes the `page` object and does the automation you need.
        :param wait_selector: Wait for a specific CSS selector to be in a specific state.
        :param init_script: An absolute path to a JavaScript file to be executed on page creation for all pages in this session.
        :param locale: User locale, for example, `en-GB`, `de-DE`, etc. Locale affects the navigator.language value, the Accept-Language request header value,
            as well as number and date formatting rules. Defaults to the system default locale.
        :param timezone_id: Changes the timezone of the browser. Defaults to the system timezone.
        :param wait_selector_state: The state to wait for the selector given with `wait_selector`. The default state is `attached`.
        :param solve_cloudflare: Solves all types of Cloudflare's Turnstile/Interstitial challenges before returning the response to you.
        :param real_chrome: If you have a Chrome browser installed on your device, enable this, and the Fetcher will launch an instance of your browser and use it.
        :param hide_canvas: Add random noise to canvas operations to prevent fingerprinting.
        :param block_webrtc: Forces WebRTC to respect proxy settings to prevent local IP address leak.
        :param allow_webgl: Enabled by default. Disabling it disables WebGL and WebGL 2.0 support entirely. Disabling WebGL is not recommended as many WAFs now check if WebGL is enabled.
        :param load_dom: Enabled by default, wait for all JavaScript on page(s) to fully load and execute.
        :param cdp_url: Instead of launching a new browser instance, connect to this CDP URL to control real browsers through CDP.
        :param google_search: Enabled by default, Scrapling sets the referer header as if this request came from a Google search of this website's domain name.
        :param extra_headers: A dictionary of extra headers to add to the request. _The referer set by the `google_search` argument takes priority over the referer set here if used together._
        :param proxy: The proxy to be used with requests, it can be a string or a dictionary with the keys 'server', 'username', and 'password' only.
        :param user_data_dir: Path to a User Data Directory, which stores browser session data like cookies and local storage. The default is to create a temporary directory.
        :param extra_flags: A list of additional browser flags to pass to the browser on launch.
        :param selector_config: The arguments that will be passed in the end while creating the final Selector's class.
        :param additional_args: Additional arguments to be passed to Playwright's context as additional settings, and it takes higher priority than Scrapling's settings.
        :return: A `Response` object.
        :raises TypeError: If the resolved `selector_config` is not a dictionary.
        """
        # Prefer `selector_config`; fall back to the legacy `custom_config` key for backward compatibility
        parser_overrides = kwargs.get("selector_config", {})
        if not parser_overrides:
            parser_overrides = kwargs.get("custom_config", {})
        if not isinstance(parser_overrides, dict):
            raise TypeError("Argument `selector_config` must be a dictionary.")

        # Values supplied by the caller override the parser arguments generated by the class
        parser_defaults = cls._generate_parser_arguments()
        kwargs["selector_config"] = {**parser_defaults, **parser_overrides}

        with StealthySession(**kwargs) as session:
            return session.fetch(url)

    @classmethod
    async def async_fetch(cls, url: str, **kwargs: Unpack[StealthSession]) -> Response:
        """
        Open a browser and perform the request according to the options chosen below (awaitable variant of `fetch`).

        :param url: Target url.
        :param headless: Run the browser in headless/hidden (default), or headful/visible mode.
        :param disable_resources: Drop requests for unnecessary resources for a speed boost.
            Requests dropped are of type `font`, `image`, `media`, `beacon`, `object`, `imageset`, `texttrack`, `websocket`, `csp_report`, and `stylesheet`.
        :param blocked_domains: A set of domain names to block requests to. Subdomains are also matched (e.g., ``"example.com"`` blocks ``"sub.example.com"`` too).
        :param useragent: A useragent string to use. Otherwise, the fetcher generates a real Useragent of the same browser and uses it.
        :param cookies: Cookies to set for the next request.
        :param network_idle: Wait for the page until there are no network connections for at least 500 ms.
        :param timeout: The timeout in milliseconds used in all operations and waits through the page. The default is 30,000.
        :param wait: The time (milliseconds) the fetcher waits after everything finishes before closing the page and returning the `Response` object.
        :param page_action: Added for automation. A function that takes the `page` object and does the automation you need.
        :param wait_selector: Wait for a specific CSS selector to be in a specific state.
        :param init_script: An absolute path to a JavaScript file to be executed on page creation for all pages in this session.
        :param locale: User locale, for example, `en-GB`, `de-DE`, etc. Locale affects the navigator.language value, the Accept-Language request header value,
            as well as number and date formatting rules. Defaults to the system default locale.
        :param timezone_id: Changes the timezone of the browser. Defaults to the system timezone.
        :param wait_selector_state: The state to wait for the selector given with `wait_selector`. The default state is `attached`.
        :param solve_cloudflare: Solves all types of Cloudflare's Turnstile/Interstitial challenges before returning the response to you.
        :param real_chrome: If you have a Chrome browser installed on your device, enable this, and the Fetcher will launch an instance of your browser and use it.
        :param hide_canvas: Add random noise to canvas operations to prevent fingerprinting.
        :param block_webrtc: Forces WebRTC to respect proxy settings to prevent local IP address leak.
        :param allow_webgl: Enabled by default. Disabling it disables WebGL and WebGL 2.0 support entirely. Disabling WebGL is not recommended as many WAFs now check if WebGL is enabled.
        :param load_dom: Enabled by default, wait for all JavaScript on page(s) to fully load and execute.
        :param cdp_url: Instead of launching a new browser instance, connect to this CDP URL to control real browsers through CDP.
        :param google_search: Enabled by default, Scrapling sets the referer header as if this request came from a Google search of this website's domain name.
        :param extra_headers: A dictionary of extra headers to add to the request. _The referer set by the `google_search` argument takes priority over the referer set here if used together._
        :param proxy: The proxy to be used with requests, it can be a string or a dictionary with the keys 'server', 'username', and 'password' only.
        :param user_data_dir: Path to a User Data Directory, which stores browser session data like cookies and local storage. The default is to create a temporary directory.
        :param extra_flags: A list of additional browser flags to pass to the browser on launch.
        :param selector_config: The arguments that will be passed in the end while creating the final Selector's class.
        :param additional_args: Additional arguments to be passed to Playwright's context as additional settings, and it takes higher priority than Scrapling's settings.
        :return: A `Response` object.
        :raises TypeError: If the resolved `selector_config` is not a dictionary.
        """
        # Prefer `selector_config`; fall back to the legacy `custom_config` key for backward compatibility
        parser_overrides = kwargs.get("selector_config", {})
        if not parser_overrides:
            parser_overrides = kwargs.get("custom_config", {})
        if not isinstance(parser_overrides, dict):
            raise TypeError("Argument `selector_config` must be a dictionary.")

        # Values supplied by the caller override the parser arguments generated by the class
        parser_defaults = cls._generate_parser_arguments()
        kwargs["selector_config"] = {**parser_defaults, **parser_overrides}

        async with AsyncStealthySession(**kwargs) as session:
            return await session.fetch(url)
|
parser.py
ADDED
|
@@ -0,0 +1,1363 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
from pathlib import Path
|
| 2 |
+
from inspect import signature
|
| 3 |
+
from urllib.parse import urljoin
|
| 4 |
+
from difflib import SequenceMatcher
|
| 5 |
+
from re import Pattern as re_Pattern
|
| 6 |
+
|
| 7 |
+
from lxml.html import HtmlElement, HTMLParser
|
| 8 |
+
from cssselect import SelectorError, SelectorSyntaxError, parse as split_selectors
|
| 9 |
+
from lxml.etree import (
|
| 10 |
+
XPath,
|
| 11 |
+
tostring,
|
| 12 |
+
fromstring,
|
| 13 |
+
XPathError,
|
| 14 |
+
XPathEvalError,
|
| 15 |
+
_ElementUnicodeResult,
|
| 16 |
+
)
|
| 17 |
+
|
| 18 |
+
from scrapling.core._types import (
|
| 19 |
+
Any,
|
| 20 |
+
Set,
|
| 21 |
+
Dict,
|
| 22 |
+
cast,
|
| 23 |
+
List,
|
| 24 |
+
Tuple,
|
| 25 |
+
Union,
|
| 26 |
+
TypeVar,
|
| 27 |
+
Pattern,
|
| 28 |
+
Callable,
|
| 29 |
+
Literal,
|
| 30 |
+
Optional,
|
| 31 |
+
Iterable,
|
| 32 |
+
overload,
|
| 33 |
+
Generator,
|
| 34 |
+
SupportsIndex,
|
| 35 |
+
TYPE_CHECKING,
|
| 36 |
+
)
|
| 37 |
+
from scrapling.core.custom_types import AttributesHandler, TextHandler, TextHandlers
|
| 38 |
+
from scrapling.core.mixins import SelectorsGeneration
|
| 39 |
+
from scrapling.core.storage import (
|
| 40 |
+
SQLiteStorageSystem,
|
| 41 |
+
StorageSystemMixin,
|
| 42 |
+
_StorageTools,
|
| 43 |
+
)
|
| 44 |
+
from scrapling.core.translator import css_to_xpath as _css_to_xpath
|
| 45 |
+
from scrapling.core.utils import clean_spaces, flatten, html_forbidden, log
|
| 46 |
+
|
| 47 |
+
# Default storage file passed to the storage class when the adaptive feature is enabled
# without explicit `storage_args`; it is located next to this module.
__DEFAULT_DB_FILE__ = str(Path(__file__).parent / "elements_storage.db")
# Attributes that are Python reserved words and can't be used directly
# Ex: find_all('a', class="blah") -> find_all('a', class_="blah")
# https://www.w3schools.com/python/python_ref_keywords.asp
# Maps the underscore-suffixed keyword-argument spelling to the real attribute name.
_whitelisted = {
    "class_": "class",
    "for_": "for",
}
_T = TypeVar("_T")
# Pre-compiled selectors for efficiency
_find_all_elements = XPath(".//*")  # Every descendant element of the context node
_find_all_elements_with_spaces = XPath(
    ".//*[normalize-space(text())]"
)  # This selector gets all elements with text content
|
| 61 |
+
|
| 62 |
+
|
| 63 |
+
class Selector(SelectorsGeneration):
|
| 64 |
+
__slots__ = (
|
| 65 |
+
"url",
|
| 66 |
+
"encoding",
|
| 67 |
+
"__adaptive_enabled",
|
| 68 |
+
"_root",
|
| 69 |
+
"_storage",
|
| 70 |
+
"__keep_comments",
|
| 71 |
+
"__huge_tree_enabled",
|
| 72 |
+
"__attributes",
|
| 73 |
+
"__text",
|
| 74 |
+
"__tag",
|
| 75 |
+
"__keep_cdata",
|
| 76 |
+
"_raw_body",
|
| 77 |
+
)
|
| 78 |
+
|
| 79 |
+
def __init__(
    self,
    content: Optional[str | bytes] = None,
    url: str = "",
    encoding: str = "utf-8",
    huge_tree: bool = True,
    root: Optional[HtmlElement] = None,
    keep_comments: Optional[bool] = False,
    keep_cdata: Optional[bool] = False,
    adaptive: Optional[bool] = False,
    _storage: Optional[StorageSystemMixin] = None,
    storage: Any = SQLiteStorageSystem,
    storage_args: Optional[Dict] = None,
    **_,
):
    """The main class that works as a wrapper for the HTML input data. Using this class, you can search for elements
    with expressions in CSS, XPath, or with simply text. Check the docs for more info.

    Here we try to extend module ``lxml.html.HtmlElement`` while maintaining a simpler interface. We are not
    inheriting from the ``lxml.html.HtmlElement`` because it's not pickleable, which makes a lot of reference jobs
    not possible. You can test it here and see code explodes with `AssertionError: invalid Element proxy at...`.
    It's an old issue with lxml, see `this entry <https://bugs.launchpad.net/lxml/+bug/736708>`

    :param content: HTML content as either string or bytes.
    :param url: It allows storing a URL with the HTML data for retrieving later.
    :param encoding: The encoding type that will be used in HTML parsing, default is `UTF-8`
    :param huge_tree: Enabled by default, should always be enabled when parsing large HTML documents. This controls
        the libxml2 feature that forbids parsing certain large documents to protect from possible memory exhaustion.
    :param root: Used internally to pass etree objects instead of text/body arguments, it takes the highest priority.
        Don't use it unless you know what you are doing!
    :param keep_comments: While parsing the HTML body, drop comments or not. Disabled by default for obvious reasons
    :param keep_cdata: While parsing the HTML body, drop cdata or not. Disabled by default for cleaner HTML.
    :param adaptive: Globally turn the adaptive feature on or off (off by default) in all functions, this argument
        takes higher priority over all adaptive related arguments/functions in the class.
    :param _storage: Used internally to reuse an already created storage object; when given (and ``adaptive`` is
        enabled) it is used as is, and ``storage``/``storage_args`` are ignored.
    :param storage: The storage class to be passed for adaptive functionalities, see ``Docs`` for more info.
    :param storage_args: A dictionary of ``argument->value`` pairs to be passed for the storage class.
        If empty, default values will be used.
    :raises ValueError: If both ``content`` and ``root`` are missing, if the storage class is not wrapped with
        ``lru_cache``, or if it does not inherit from ``StorageSystemMixin``.
    :raises TypeError: If ``content`` is neither ``str`` nor ``bytes`` (checked only when ``root`` is not given).
    """
    if root is None and content is None:
        raise ValueError("Selector class needs HTML content, or root arguments to work")

    self.url = url
    self._raw_body: str | bytes = ""
    self.encoding = encoding
    self.__keep_cdata = keep_cdata
    self.__huge_tree_enabled = huge_tree
    self.__keep_comments = keep_comments
    # For selector stuff
    # These three are filled lazily by the `text`, `attrib`, and `tag` properties
    self.__text: Optional[TextHandler] = None
    self.__attributes: Optional[AttributesHandler] = None
    self.__tag: Optional[str] = None
    self._storage: Optional[StorageSystemMixin] = None
    if root is None:
        body: str | bytes
        if isinstance(content, str):
            # Null characters are removed; if nothing is left, an empty document is parsed instead
            body = content.strip().replace("\x00", "") or "<html/>"
        elif isinstance(content, bytes):
            body = content.replace(b"\x00", b"")
        else:
            raise TypeError(f"content argument must be str or bytes, got {type(content)}")

        # https://lxml.de/api/lxml.etree.HTMLParser-class.html
        _parser_kwargs: Dict[str, Any] = dict(
            recover=True,
            remove_blank_text=True,
            remove_comments=(not keep_comments),
            encoding=encoding,
            compact=True,
            huge_tree=huge_tree,
            default_doctype=True,  # Supported by lxml but missing from stubs
            strip_cdata=(not keep_cdata),
        )
        parser = HTMLParser(**_parser_kwargs)
        # `body or "<html/>"` also covers bytes input that became empty after the cleanup above
        self._root = cast(HtmlElement, fromstring(body or "<html/>", parser=parser, base_url=url or ""))
        # The untouched input is stored, not the cleaned `body`
        self._raw_body = content

    else:
        self._root = cast(HtmlElement, root)

        if self._is_text_node(root):
            # Text nodes never use the adaptive feature, so no storage is set up for them
            self.__adaptive_enabled = False
            return

    self.__adaptive_enabled = bool(adaptive)

    if self.__adaptive_enabled:
        if _storage is not None:
            self._storage = _storage
        else:
            if not storage_args:
                storage_args = {
                    "storage_file": __DEFAULT_DB_FILE__,
                    "url": url,
                }

            # `__wrapped__` is how the `lru_cache` wrapping is detected
            if not hasattr(storage, "__wrapped__"):
                raise ValueError("Storage class must be wrapped with lru_cache decorator, see docs for info")

            if not issubclass(storage.__wrapped__, StorageSystemMixin):  # pragma: no cover
                raise ValueError("Storage system must be inherited from class `StorageSystemMixin`")

            self._storage = storage(**storage_args)
|
| 181 |
+
|
| 182 |
+
def __getitem__(self, key: str) -> TextHandler:
    """Return the value of the attribute named ``key``.

    :raises TypeError: If this Selector wraps a text node, since text nodes have no attributes.
    """
    if not self._is_text_node(self._root):
        return self.attrib[key]
    raise TypeError("Text nodes do not have attributes")
|
| 186 |
+
|
| 187 |
+
def __contains__(self, key: str) -> bool:
    """Tell whether the element has an attribute named ``key``; always False for text nodes."""
    return not self._is_text_node(self._root) and key in self.attrib
|
| 191 |
+
|
| 192 |
+
# Node functionalities, I wanted to move to a separate Mixin class, but it had a slight impact on performance
|
| 193 |
+
@staticmethod
def _is_text_node(
    element: HtmlElement | _ElementUnicodeResult,
) -> bool:
    """Tell whether ``element`` is the result of a string expression rather than an element

    Examples:
        XPath -> '/text()', '/@attribute', etc...
        CSS3  -> '::text', '::attr(attrib)'...
    """
    # Faster than checking `element.is_attribute or element.is_text or element.is_tail`
    element_type = type(element)
    return issubclass(element_type, _ElementUnicodeResult)
|
| 204 |
+
|
| 205 |
+
def __element_convertor(self, element: HtmlElement | _ElementUnicodeResult) -> "Selector":
    """Wrap one HtmlElement or text node in a new Selector, without any checks, reusing this Selector's settings"""
    inherited_settings = {
        "url": self.url,
        "encoding": self.encoding,
        "adaptive": self.__adaptive_enabled,
        "_storage": self._storage,
        "keep_comments": self.__keep_comments,
        "keep_cdata": self.__keep_cdata,
        "huge_tree": self.__huge_tree_enabled,
    }
    return Selector(root=element, **inherited_settings)
|
| 217 |
+
|
| 218 |
+
def __elements_convertor(self, elements: List[HtmlElement | _ElementUnicodeResult]) -> "Selectors":
    """Wrap every given element/text node in a Selector that reuses this Selector's settings"""
    # Read the settings once so they are not looked up again for every element
    inherited_settings = {
        "url": self.url,
        "encoding": self.encoding,
        "adaptive": self.__adaptive_enabled,
        "_storage": self._storage,
        "keep_comments": self.__keep_comments,
        "keep_cdata": self.__keep_cdata,
        "huge_tree": self.__huge_tree_enabled,
    }
    return Selectors(Selector(root=node, **inherited_settings) for node in elements)
|
| 241 |
+
|
| 242 |
+
def __handle_elements(self, result: List[HtmlElement | _ElementUnicodeResult]) -> "Selectors":
    """Convert a list of results to Selectors in bulk; an empty result gives an empty Selectors"""
    return self.__elements_convertor(result) if result else Selectors()
|
| 248 |
+
|
| 249 |
+
def __getstate__(self) -> Any:
    """Refuse pickling by always raising ``TypeError`` (lxml objects don't support it)."""
    reason = "Can't pickle Selector objects"
    raise TypeError(reason)
|
| 252 |
+
|
| 253 |
+
# The following four properties I made them into functions instead of variables directly
|
| 254 |
+
# So they don't slow down the process of initializing many instances of the class and gets executed only
|
| 255 |
+
# when the user needs them for the first time for that specific element and gets cached for next times
|
| 256 |
+
# Doing just that made the library's performance tests skyrocket, running multiple times faster than before
|
| 257 |
+
# because I was executing them on initialization before :))
|
| 258 |
+
@property
def tag(self) -> str:
    """Tag name of the element; text nodes report ``"#text"``. The name is computed once and cached."""
    if self._is_text_node(self._root):
        return "#text"
    cached_name = self.__tag
    if not cached_name:
        cached_name = self.__tag = str(self._root.tag)
    return cached_name or ""
|
| 266 |
+
|
| 267 |
+
@property
def text(self) -> TextHandler:
    """Text content of the element (the node's own string for text nodes), cached after the first access"""
    node = self._root
    if self._is_text_node(node):
        return TextHandler(str(node))
    if self.__text is not None:
        return self.__text
    # If you want to escape lxml default behavior and remove comments like this `<span>CONDITION: <!-- -->Excellent</span>`
    # before extracting text, then keep `keep_comments` set to False while initializing the first class
    self.__text = TextHandler(node.text or "")
    return self.__text
|
| 277 |
+
|
| 278 |
+
def get_all_text(
    self,
    separator: str = "\n",
    strip: bool = False,
    ignore_tags: Tuple = (
        "script",
        "style",
    ),
    valid_values: bool = True,
) -> TextHandler:
    """Collect the ``text`` of this element and of all the elements below it, joined with the given separator.

    :param separator: The string placed between the collected strings.
    :param strip: If True, each string is stripped before being joined.
    :param ignore_tags: A tuple of tag names to ignore; everything nested inside them is ignored as well
    :param valid_values: If enabled, strings that are empty or only whitespaces are left out

    :return: A TextHandler
    """
    if self._is_text_node(self._root):
        return TextHandler(str(self._root))

    # Ignored elements plus all of their descendants
    skipped: set[Any] = set()
    if ignore_tags:
        for banned in self._root.iter(*ignore_tags):
            skipped.add(banned)
            skipped.update(cast(list, _find_all_elements(banned)))

    pieces = []
    for node in self._root.iter():
        if node in skipped:
            continue
        raw_text = node.text
        if not raw_text or not isinstance(raw_text, str):
            continue
        candidate = raw_text.strip() if strip else raw_text
        if valid_values and not candidate.strip():
            continue
        pieces.append(candidate)

    return cast(TextHandler, TextHandler(separator).join(pieces))
|
| 316 |
+
|
| 317 |
+
def urljoin(self, relative_url: str) -> str:
    """Resolve ``relative_url`` against this Selector's ``url`` and return the combined URL."""
    base_url = self.url
    return urljoin(base_url, relative_url)
|
| 320 |
+
|
| 321 |
+
@property
def attrib(self) -> AttributesHandler:
    """Return the element's attributes wrapped in an ``AttributesHandler``.

    Text nodes get an empty handler. Otherwise the handler is built from the
    underlying element's ``attrib`` and kept on the instance for reuse.
    """
    if self._is_text_node(self._root):
        return AttributesHandler({})
    if self.__attributes:
        return self.__attributes
    # Nothing cached yet (or the cached handler is falsy): build it now.
    self.__attributes = AttributesHandler(self._root.attrib)
    return self.__attributes
|
| 329 |
+
|
| 330 |
+
@property
def html_content(self) -> TextHandler:
    """Serialize the element to HTML markup (its tail text is left out).

    For a text node the node's own string value is returned instead.
    """
    root = self._root
    if self._is_text_node(root):
        return TextHandler(str(root))
    serialized = tostring(root, encoding=self.encoding, method="html", with_tail=False)
    if not isinstance(serialized, bytes):
        return TextHandler(serialized)
    # Bytes output is trimmed first, then decoded with the same encoding used to serialize.
    return TextHandler(serialized.strip().decode(self.encoding))
|
| 339 |
+
|
| 340 |
+
@property
|
| 341 |
+
def body(self) -> str | bytes:
|
| 342 |
+
"""Return the raw body of the current `Selector` without any processing. Useful for binary and non-HTML requests."""
|
| 343 |
+
if self._is_text_node(self._root):
|
| 344 |
+
return ""
|
| 345 |
+
return self._raw_body
|
| 346 |
+
|
| 347 |
+
def prettify(self) -> TextHandler:
    """Return the element's HTML serialized with pretty-printing enabled (tail text excluded).

    For a text node the node's own string value is returned instead.
    """
    node = self._root
    if self._is_text_node(node):
        return TextHandler(str(node))
    rendered = tostring(node, encoding=self.encoding, pretty_print=True, method="html", with_tail=False)
    if isinstance(rendered, bytes):
        rendered = rendered.strip().decode(self.encoding)
    return TextHandler(rendered)
|
| 361 |
+
|
| 362 |
+
def has_class(self, class_name: str) -> bool:
    """Tell whether ``class_name`` is one of the element's classes.

    :param class_name: The class name to look for.
    :return: True when the element carries that class; always False for text nodes.
    """
    is_element = not self._is_text_node(self._root)
    return is_element and class_name in self._root.classes
|
| 370 |
+
|
| 371 |
+
@property
def parent(self) -> Optional["Selector"]:
    """Return the element's direct parent as a ``Selector``, or ``None`` when it has no parent."""
    holder = self._root.getparent()
    if holder is None:
        return None
    return self.__element_convertor(holder)
|
| 376 |
+
|
| 377 |
+
@property
def below_elements(self) -> "Selectors":
    """Return every element found under the current one; empty ``Selectors`` for text nodes."""
    if self._is_text_node(self._root):
        return Selectors()
    descendants = cast(List, _find_all_elements(self._root))
    if descendants is None:
        return Selectors()
    return self.__elements_convertor(descendants)
|
| 384 |
+
|
| 385 |
+
@property
def children(self) -> "Selectors":
    """Return the element's direct children; empty ``Selectors`` for text nodes.

    Children that are instances of ``html_forbidden`` are left out.
    """
    if self._is_text_node(self._root):
        return Selectors()
    wrapped = (
        self.__element_convertor(kid) for kid in self._root.iterchildren() if not isinstance(kid, html_forbidden)
    )
    return Selectors(wrapped)
|
| 395 |
+
|
| 396 |
+
@property
def siblings(self) -> "Selectors":
    """Return the parent's other children, or empty ``Selectors`` when there is no usable parent."""
    holder = self.parent
    if not holder:
        return Selectors()
    # Siblings are compared by their underlying nodes so the current element is left out.
    return Selectors(child for child in holder.children if child._root != self._root)
|
| 402 |
+
|
| 403 |
+
def iterancestors(self) -> Generator["Selector", None, None]:
    """Yield each ancestor of the element as a ``Selector``, in the order the underlying node reports them.

    Text nodes yield nothing.
    """
    if not self._is_text_node(self._root):
        for elder in self._root.iterancestors():
            yield self.__element_convertor(elder)
|
| 409 |
+
|
| 410 |
+
def find_ancestor(self, func: Callable[["Selector"], bool]) -> Optional["Selector"]:
    """Walk the ancestors and return the first one accepted by ``func``.

    :param func: A callable that receives each ancestor and returns a truthy value to accept it.
    :return: The first accepted ancestor, or ``None`` when none is accepted.
    """
    for candidate in self.iterancestors():
        if not func(candidate):
            continue
        return candidate
    return None
|
| 419 |
+
|
| 420 |
+
@property
def path(self) -> "Selectors":
    """Return the element's ancestors, as produced by ``iterancestors()``, wrapped in ``Selectors``."""
    return Selectors(list(self.iterancestors()))
|
| 425 |
+
|
| 426 |
+
@property
def next(self) -> Optional["Selector"]:
    """Return the following sibling as a ``Selector``, or ``None`` when there is none.

    Siblings that are instances of ``html_forbidden`` (e.g. HTML comments) are skipped.
    Text nodes always give ``None``.
    """
    if self._is_text_node(self._root):
        return None
    sibling = self._root.getnext()
    while True:
        if sibling is None:
            return None
        if not isinstance(sibling, html_forbidden):
            return self.__element_convertor(sibling)
        # Unwanted node type: keep moving forward.
        sibling = sibling.getnext()
|
| 437 |
+
|
| 438 |
+
@property
def previous(self) -> Optional["Selector"]:
    """Return the preceding sibling as a ``Selector``, or ``None`` when there is none.

    Siblings that are instances of ``html_forbidden`` (e.g. HTML comments) are skipped.
    Text nodes always give ``None``.
    """
    if self._is_text_node(self._root):
        return None
    sibling = self._root.getprevious()
    while True:
        if sibling is None:
            return None
        if not isinstance(sibling, html_forbidden):
            return self.__element_convertor(sibling)
        # Unwanted node type: keep moving backward.
        sibling = sibling.getprevious()
|
| 449 |
+
|
| 450 |
+
def get(self) -> TextHandler:
    """Serialize this element to a string.

    Text nodes give their text value; HTML elements give whatever ``html_content`` returns.
    """
    root = self._root
    return TextHandler(str(root)) if self._is_text_node(root) else self.html_content
|
| 458 |
+
|
| 459 |
+
def getall(self) -> TextHandlers:
    """Return a one-item ``TextHandlers`` list holding the result of ``get()``."""
    serialized = self.get()
    return TextHandlers([serialized])
|
| 462 |
+
|
| 463 |
+
# Alternative names bound to the very same functions as `get` / `getall`.
extract_first = get
extract = getall
|
| 465 |
+
|
| 466 |
+
def __str__(self) -> str:
    """Return the text value for text nodes, otherwise the element's ``html_content``."""
    root = self._root
    return str(root) if self._is_text_node(root) else self.html_content
|
| 470 |
+
|
| 471 |
+
def __repr__(self) -> str:
    """Build a short debugging representation.

    Text nodes render as ``<text='...'>``. Elements render as ``<data='...'>`` and,
    when a parent exists, also include ``parent='...'``. Every quoted value is cut
    to 40 characters followed by ``...`` when longer.
    """
    length_limit = 40

    def _shorten(value):
        # Truncate first, then trim whitespace left at the cut edge.
        if len(value) > length_limit:
            return value[:length_limit].strip() + "..."
        return value

    if self._is_text_node(self._root):
        return f"<text='{_shorten(str(self._root))}'>"

    pieces = [f"<data='{_shorten(clean_spaces(self.html_content))}'"]
    if self.parent:
        pieces.append(f" parent='{_shorten(clean_spaces(self.parent.html_content))}'")
    pieces.append(">")
    return "".join(pieces)
|
| 493 |
+
|
| 494 |
+
# From here we start with the selecting functions
|
| 495 |
+
@overload
def relocate(
    self, element: Union[Dict, HtmlElement, "Selector"], percentage: int, selector_type: Literal[True]
) -> "Selectors": ...

@overload
def relocate(
    self, element: Union[Dict, HtmlElement, "Selector"], percentage: int, selector_type: Literal[False] = False
) -> List[HtmlElement]: ...

def relocate(
    self,
    element: Union[Dict, HtmlElement, "Selector"],
    percentage: int = 0,
    selector_type: bool = False,
) -> Union[List[HtmlElement], "Selectors"]:
    """Search the page tree again for the element(s) most similar to ``element``.

    Every element found under the current root is scored against ``element``; the
    group sharing the highest score is returned when that score reaches ``percentage``.

    :param element: The element to relocate: a stored properties dictionary, an ``HtmlElement``,
        or a ``Selector`` (the latter two are converted to a dictionary first).
    :param percentage: The minimum score to accept. Be aware that the score depends solely on the
        page structure, so don't change this number unless you know what you are doing!
    :param selector_type: If True, the result is converted to a ``Selectors`` object.
    :return: The raw elements with the highest score, or a ``Selectors`` object when
        ``selector_type`` is True. When nothing qualifies, an empty list or an empty ``Selectors``.
    """
    score_table: Dict[float, List[Any]] = {}
    # Note: `element` will most likely always be a dictionary at this point.
    if isinstance(element, self.__class__):
        element = element._root

    if isinstance(element, HtmlElement):
        element = _StorageTools.element_to_dict(element)

    for node in cast(List, _find_all_elements(self._root)):
        # Scoring never stops early, even at 100%, because several elements can share the top score.
        score = self.__calculate_similarity_score(cast(Dict, element), node)
        score_table.setdefault(score, []).append(node)

    if score_table:
        highest_probability = max(score_table)
        if highest_probability >= percentage:
            best_matches = score_table[highest_probability]
            if log.getEffectiveLevel() < 20:
                # Building this report is only worth it when debug messages will be emitted.
                log.debug(f"Highest probability was {highest_probability}%")
                log.debug("Top 5 best matching elements are: ")
                for percent in sorted(score_table, reverse=True)[:5]:
                    log.debug(f"{percent} -> {self.__elements_convertor(score_table[percent])}")

            return self.__elements_convertor(best_matches) if selector_type else best_matches

    # Keep the "nothing found" result in line with the type promised for `selector_type`.
    return Selectors() if selector_type else []
|
| 549 |
+
|
| 550 |
+
def css(
    self,
    selector: str,
    identifier: str = "",
    adaptive: bool = False,
    auto_save: bool = False,
    percentage: int = 0,
) -> "Selectors":
    """Search the current tree with CSS3 selectors.

    The selector is translated to XPath and the lookup is delegated to ``xpath()``.

    **Important:
    It's recommended to use the identifier argument if you plan to use a different selector later
    and want to relocate the same element(s)**

    :param selector: The CSS3 selector to be used.
    :param identifier: A string used to save/retrieve the element's data in adaptive mode;
        when empty, the selector itself is used.
    :param adaptive: If enabled, try to relocate the element if it was 'saved' before.
    :param auto_save: Automatically save new elements for ``adaptive`` later.
    :param percentage: The minimum percentage to accept while ``adaptive`` is working.
        Be aware that the percentage calculation depends solely on the page structure, so don't
        change this number unless you know what you are doing!

    :return: ``Selectors`` class.
    :raises SelectorSyntaxError: When the selector cannot be processed.
    """
    if self._is_text_node(self._root):
        return Selectors()

    try:
        needs_split = self.__adaptive_enabled and "," in selector
        if not needs_split:
            # One translation and one lookup are enough here.
            return self.xpath(_css_to_xpath(selector), identifier or selector, adaptive, auto_save, percentage)

        # Combined selectors are handled part by part so each part is saved under its own identifier.
        collected = Selectors()
        for part in split_selectors(selector):
            canonical = part.canonical()
            collected += self.xpath(
                _css_to_xpath(canonical), identifier or canonical, adaptive, auto_save, percentage
            )
        return Selectors(collected)
    except (SelectorError, SelectorSyntaxError) as e:
        raise SelectorSyntaxError(f"Invalid CSS selector '{selector}': {str(e)}") from e
|
| 609 |
+
|
| 610 |
+
def xpath(
    self,
    selector: str,
    identifier: str = "",
    adaptive: bool = False,
    auto_save: bool = False,
    percentage: int = 0,
    **kwargs: Any,
) -> "Selectors":
    """Search the current tree with XPath selectors.

    **Important:
    It's recommended to use the identifier argument if you plan to use a different selector later
    and want to relocate the same element(s)**

    Note: **Additional keyword arguments will be passed as XPath variables in the XPath expression!**

    :param selector: The XPath selector to be used.
    :param identifier: A string used to save/retrieve the element's data in adaptive mode;
        when empty, the selector itself is used.
    :param adaptive: If enabled, try to relocate the element if it was 'saved' before.
    :param auto_save: Automatically save new elements for ``adaptive`` later.
    :param percentage: The minimum percentage to accept while ``adaptive`` is working.
        Be aware that the percentage calculation depends solely on the page structure, so don't
        change this number unless you know what you are doing!

    :return: ``Selectors`` class.
    :raises SelectorSyntaxError: When evaluating the selector fails.
    """
    if self._is_text_node(self._root):
        return Selectors()

    try:
        if elements := self._root.xpath(selector, **kwargs):
            if auto_save:
                if self.__adaptive_enabled:
                    self.save(elements[0], identifier or selector)
                else:
                    log.warning(
                        "Argument `auto_save` will be ignored because `adaptive` wasn't enabled on initialization. Check docs for more info."
                    )
            return self.__handle_elements(elements)

        if self.__adaptive_enabled:
            if adaptive:
                element_data = self.retrieve(identifier or selector)
                if element_data:
                    elements = self.relocate(element_data, percentage)
                    # `relocate` gives back an empty list (never None) when no candidate
                    # reaches `percentage`, so emptiness must be checked before indexing.
                    if elements and auto_save:
                        self.save(elements[0], identifier or selector)
        elif adaptive:
            log.warning(
                "Argument `adaptive` will be ignored because `adaptive` wasn't enabled on initialization. Check docs for more info."
            )
        elif auto_save:
            log.warning(
                "Argument `auto_save` will be ignored because `adaptive` wasn't enabled on initialization. Check docs for more info."
            )

        return self.__handle_elements(elements)

    except (
        SelectorError,
        SelectorSyntaxError,
        XPathError,
        XPathEvalError,
    ) as e:
        raise SelectorSyntaxError(f"Invalid XPath selector: {selector}") from e
|
| 679 |
+
|
| 680 |
+
def find_all(
|
| 681 |
+
self,
|
| 682 |
+
*args: str | Iterable[str] | Pattern | Callable | Dict[str, str],
|
| 683 |
+
**kwargs: str,
|
| 684 |
+
) -> "Selectors":
|
| 685 |
+
"""Find elements by filters of your creations for ease.
|
| 686 |
+
|
| 687 |
+
:param args: Tag name(s), iterable of tag names, regex patterns, function, or a dictionary of elements' attributes. Leave empty for selecting all.
|
| 688 |
+
:param kwargs: The attributes you want to filter elements based on it.
|
| 689 |
+
:return: The `Selectors` object of the elements or empty list
|
| 690 |
+
"""
|
| 691 |
+
if self._is_text_node(self._root):
|
| 692 |
+
return Selectors()
|
| 693 |
+
|
| 694 |
+
if not args and not kwargs:
|
| 695 |
+
raise TypeError("You have to pass something to search with, like tag name(s), tag attributes, or both.")
|
| 696 |
+
|
| 697 |
+
attributes: Dict[str, Any] = dict()
|
| 698 |
+
tags: Set[str] = set()
|
| 699 |
+
patterns: Set[Pattern] = set()
|
| 700 |
+
results, functions, selectors = Selectors(), [], []
|
| 701 |
+
|
| 702 |
+
# Brace yourself for a wonderful journey!
|
| 703 |
+
for arg in args:
|
| 704 |
+
if isinstance(arg, str):
|
| 705 |
+
tags.add(arg)
|
| 706 |
+
|
| 707 |
+
elif type(arg) in (list, tuple, set):
|
| 708 |
+
arg = cast(Iterable, arg) # Type narrowing for type checkers like pyright
|
| 709 |
+
if not all(map(lambda x: isinstance(x, str), arg)):
|
| 710 |
+
raise TypeError("Nested Iterables are not accepted, only iterables of tag names are accepted")
|
| 711 |
+
tags.update(set(arg))
|
| 712 |
+
|
| 713 |
+
elif isinstance(arg, dict):
|
| 714 |
+
if not all([(isinstance(k, str) and isinstance(v, str)) for k, v in arg.items()]):
|
| 715 |
+
raise TypeError(
|
| 716 |
+
"Nested dictionaries are not accepted, only string keys and string values are accepted"
|
| 717 |
+
)
|
| 718 |
+
attributes.update(arg)
|
| 719 |
+
|
| 720 |
+
elif isinstance(arg, re_Pattern):
|
| 721 |
+
patterns.add(arg)
|
| 722 |
+
|
| 723 |
+
elif callable(arg):
|
| 724 |
+
if len(signature(arg).parameters) > 0:
|
| 725 |
+
functions.append(arg)
|
| 726 |
+
else:
|
| 727 |
+
raise TypeError(
|
| 728 |
+
"Callable filter function must have at least one argument to take `Selector` objects."
|
| 729 |
+
)
|
| 730 |
+
|
| 731 |
+
else:
|
| 732 |
+
raise TypeError(f'Argument with type "{type(arg)}" is not accepted, please read the docs.')
|
| 733 |
+
|
| 734 |
+
if not all([(isinstance(k, str) and isinstance(v, str)) for k, v in kwargs.items()]):
|
| 735 |
+
raise TypeError("Only string values are accepted for arguments")
|
| 736 |
+
|
| 737 |
+
for attribute_name, value in kwargs.items():
|
| 738 |
+
# Only replace names for kwargs, replacing them in dictionaries doesn't make sense
|
| 739 |
+
attribute_name = _whitelisted.get(attribute_name, attribute_name)
|
| 740 |
+
attributes[attribute_name] = value
|
| 741 |
+
|
| 742 |
+
# It's easier and faster to build a selector than traversing the tree
|
| 743 |
+
tags = tags or set("*")
|
| 744 |
+
for tag in tags:
|
| 745 |
+
selector = tag
|
| 746 |
+
for key, value in attributes.items():
|
| 747 |
+
value = value.replace('"', r"\"") # Escape double quotes in user input
|
| 748 |
+
# Not escaping anything with the key so the user can pass patterns like {'href*': '/p/'} or get errors :)
|
| 749 |
+
selector += '[{}="{}"]'.format(key, value)
|
| 750 |
+
if selector != "*":
|
| 751 |
+
selectors.append(selector)
|
| 752 |
+
|
| 753 |
+
if selectors:
|
| 754 |
+
results = cast(Selectors, self.css(", ".join(selectors)))
|
| 755 |
+
if results:
|
| 756 |
+
# From the results, get the ones that fulfill passed regex patterns
|
| 757 |
+
for pattern in patterns:
|
| 758 |
+
results = results.filter(lambda e: e.text.re(pattern, check_match=True))
|
| 759 |
+
|
| 760 |
+
# From the results, get the ones that fulfill passed functions
|
| 761 |
+
for function in functions:
|
| 762 |
+
results = results.filter(function)
|
| 763 |
+
else:
|
| 764 |
+
results = results or self.below_elements
|
| 765 |
+
for pattern in patterns:
|
| 766 |
+
results = results.filter(lambda e: e.text.re(pattern, check_match=True))
|
| 767 |
+
|
| 768 |
+
# Collect an element if it fulfills the passed function otherwise
|
| 769 |
+
for function in functions:
|
| 770 |
+
results = results.filter(function)
|
| 771 |
+
|
| 772 |
+
return results
|
| 773 |
+
|
| 774 |
+
def find(
|
| 775 |
+
self,
|
| 776 |
+
*args: str | Iterable[str] | Pattern | Callable | Dict[str, str],
|
| 777 |
+
**kwargs: str,
|
| 778 |
+
) -> Optional["Selector"]:
|
| 779 |
+
"""Find elements by filters of your creations for ease, then return the first result. Otherwise return `None`.
|
| 780 |
+
|
| 781 |
+
:param args: Tag name(s), iterable of tag names, regex patterns, function, or a dictionary of elements' attributes. Leave empty for selecting all.
|
| 782 |
+
:param kwargs: The attributes you want to filter elements based on it.
|
| 783 |
+
:return: The `Selector` object of the element or `None` if the result didn't match
|
| 784 |
+
"""
|
| 785 |
+
for element in self.find_all(*args, **kwargs):
|
| 786 |
+
return element
|
| 787 |
+
return None
|
| 788 |
+
|
| 789 |
+
def __calculate_similarity_score(self, original: Dict, candidate: HtmlElement) -> float:
    """Score how similar ``candidate`` is to the stored ``original`` element.

    Several checks each add a value between 0 and 1; the result is the average of
    the checks that were performed, expressed as a percentage.

    :param original: The original element as a dictionary generated by ``element_to_dict``.
    :param candidate: The element to compare with the original one.
    :return: The similarity percentage, rounded to two decimals.
    """

    def _ratio(left, right) -> float:
        return SequenceMatcher(None, left, right).ratio()

    probe = _StorageTools.element_to_dict(candidate)
    total: float = 0
    count: int = 0

    # Tag name: all or nothing.
    if original["tag"] == probe["tag"]:
        total += 1
    count += 1

    # Text is only compared when the original actually had some.
    if original["text"]:
        total += _ratio(original["text"], probe.get("text") or "")
        count += 1

    # Attributes as a whole; two attribute-less elements still count as similar here.
    total += self.__calculate_dict_diff(original["attributes"], probe["attributes"])
    count += 1

    # Selected attributes get their own comparison, which helps after full structural changes.
    for attrib in ("class", "id", "href", "src"):
        if not original["attributes"].get(attrib):
            continue
        total += _ratio(original["attributes"][attrib], probe["attributes"].get(attrib) or "")
        count += 1

    total += _ratio(original["path"], probe["path"])
    count += 1

    # Parent data is compared only when both sides report a parent name;
    # a candidate without one is neither rewarded nor penalized.
    if original.get("parent_name") and probe.get("parent_name"):
        total += _ratio(original["parent_name"], probe.get("parent_name") or "")
        count += 1

        total += self.__calculate_dict_diff(original["parent_attribs"], probe.get("parent_attribs") or {})
        count += 1

        if original["parent_text"]:
            total += _ratio(original["parent_text"], probe.get("parent_text") or "")
            count += 1

    if original.get("siblings"):
        total += _ratio(original["siblings"], probe.get("siblings") or [])
        count += 1

    # Average over the performed checks, as a percentage.
    return round((total / count) * 100, 2)
|
| 855 |
+
|
| 856 |
+
@staticmethod
def __calculate_dict_diff(dict1: Dict, dict2: Dict) -> float:
    """Return a 0..1 similarity between two dictionaries.

    ``SequenceMatcher`` doesn't accept dictionaries, so the keys and the values are
    compared separately as tuples and each comparison contributes half of the result.
    """
    key_similarity = SequenceMatcher(None, tuple(dict1.keys()), tuple(dict2.keys())).ratio()
    value_similarity = SequenceMatcher(None, tuple(dict1.values()), tuple(dict2.values())).ratio()
    return key_similarity * 0.5 + value_similarity * 0.5
|
| 862 |
+
|
| 863 |
+
def save(self, element: HtmlElement, identifier: str) -> None:
    """Store the element's properties in the storage for retrieval and relocation later.

    :param element: The element to save; it can be a ``Selector`` or a pure ``HtmlElement``.
        For a text node, its parent is saved instead.
    :param identifier: The identifier used to retrieve the element from the storage later.
    :raises RuntimeError: When adaptive features are disabled or no storage is available.
    """
    if not (self.__adaptive_enabled and self._storage):
        raise RuntimeError(
            "Can't use `adaptive` features while it's disabled globally, you have to start a new class instance."
        )

    target_element: Any = element
    if isinstance(target_element, self.__class__):
        # Unwrap to the underlying node.
        target_element = target_element._root

    if self._is_text_node(target_element):
        target_element = target_element.getparent()

    self._storage.save(target_element, identifier)
|
| 883 |
+
|
| 884 |
+
def retrieve(self, identifier: str) -> Optional[Dict[str, Any]]:
    """Look the identifier up in the storage and return what was stored for it.

    :param identifier: The identifier used to retrieve the element's data from the storage.
    :return: Whatever the storage returns for that identifier (a dictionary of properties, or None).
    :raises RuntimeError: When adaptive features are disabled or no storage is available.
    """
    if not (self.__adaptive_enabled and self._storage):
        raise RuntimeError(
            "Can't use `adaptive` features while it's disabled globally, you have to start a new class instance."
        )
    return self._storage.retrieve(identifier)
|
| 897 |
+
|
| 898 |
+
# Operations on text functions
|
| 899 |
+
def json(self) -> Dict:
    """Parse JSON out of this object, trying the available sources in order.

    Order: the text node's own value, the raw body (bytes are decoded first),
    the element's ``text``, and finally the stripped result of ``get_all_text``.
    Parsing errors raised by the underlying ``json()`` call are not handled here.
    """
    if self._is_text_node(self._root):
        return TextHandler(str(self._root)).json()

    raw = self._raw_body
    if raw and isinstance(raw, (str, bytes)):
        payload = raw if isinstance(raw, str) else raw.decode()
        return TextHandler(payload).json()

    if self.text:
        return self.text.json()
    return self.get_all_text(strip=True).json()
|
| 914 |
+
|
| 915 |
+
def re(
    self,
    regex: str | Pattern[str],
    replace_entities: bool = True,
    clean_match: bool = False,
    case_sensitive: bool = True,
) -> TextHandlers:
    """Apply the given regex to the element's ``text`` and return the matches.

    All arguments are forwarded unchanged to ``self.text.re``.

    :param regex: Can be either a compiled regular expression or a string.
    :param replace_entities: If enabled, character entity references are replaced by their corresponding character.
    :param clean_match: If enabled, whitespaces and consecutive spaces are ignored while matching.
    :param case_sensitive: If disabled, the regex ignores letter case.
    """
    handler = self.text
    return handler.re(regex, replace_entities, clean_match, case_sensitive)
|
| 930 |
+
|
| 931 |
+
def re_first(
    self,
    regex: str | Pattern[str],
    default=None,
    replace_entities: bool = True,
    clean_match: bool = False,
    case_sensitive: bool = True,
) -> TextHandler:
    """Apply the given regex to the element's ``text`` and return the first match, or ``default``.

    All arguments are forwarded unchanged to ``self.text.re_first``.

    :param regex: Can be either a compiled regular expression or a string.
    :param default: The value to return when there is no match.
    :param replace_entities: If enabled, character entity references are replaced by their corresponding character.
    :param clean_match: If enabled, whitespaces and consecutive spaces are ignored while matching.
    :param case_sensitive: If disabled, the regex ignores letter case.
    """
    handler = self.text
    return handler.re_first(regex, default, replace_entities, clean_match, case_sensitive)
|
| 948 |
+
|
| 949 |
+
@staticmethod
def __get_attributes(element: HtmlElement, ignore_attributes: List | Tuple) -> Dict:
    """Return a plain dictionary of the element's attributes, minus the ignored names."""
    kept = {}
    for name, value in element.attrib.items():
        if name in ignore_attributes:
            continue
        kept[name] = value
    return kept
|
| 953 |
+
|
| 954 |
+
def __are_alike(
    self,
    original: HtmlElement,
    original_attributes: Dict,
    candidate: HtmlElement,
    ignore_attributes: List | Tuple,
    similarity_threshold: float,
    match_text: bool = False,
) -> bool:
    """Decide whether ``candidate`` is similar enough to ``original``.

    :param original: The reference element (only its text is read here).
    :param original_attributes: The reference element's attributes to compare against.
    :param candidate: The element being judged.
    :param ignore_attributes: Attribute names removed from the candidate before comparing.
    :param similarity_threshold: The minimum averaged score required to return True.
    :param match_text: If True, cleaned text similarity is added as one more check.
    :return: True when the rounded average score is at least the threshold; False
        otherwise, including when no check could be performed.
    """
    if ignore_attributes:
        candidate_attributes = self.__get_attributes(candidate, ignore_attributes)
    else:
        candidate_attributes = candidate.attrib

    score: float = 0
    checks: int = 0

    if original_attributes:
        for name, value in original_attributes.items():
            score += SequenceMatcher(None, value, candidate_attributes.get(name, "")).ratio()
        # NOTE(review): the sum runs over the original's attributes while the divisor
        # counts the candidate's attributes — confirm this asymmetry is intended.
        checks += len(candidate_attributes)
    elif not candidate_attributes:
        # Neither side has attributes, which counts as one full match.
        score += 1
        checks += 1

    if match_text:
        score += SequenceMatcher(
            None,
            clean_spaces(original.text or ""),
            clean_spaces(candidate.text or ""),
        ).ratio()
        checks += 1

    if not checks:
        return False
    return round(score / checks, 2) >= similarity_threshold
|
| 994 |
+
|
| 995 |
+
def find_similar(
    self,
    similarity_threshold: float = 0.2,
    ignore_attributes: List | Tuple = (
        "href",
        "src",
    ),
    match_text: bool = False,
) -> "Selectors":
    """Find elements that sit at the same tree depth as this one and share its tag, parent tag
    and grandparent tag, then keep those whose attributes are similar enough to this element's.

    Inspired by AutoScraper: given, for example, one product div inside a products-list container,
    this locates the other products using that element as the starting point -- without depending
    on the element type.

    :param similarity_threshold: The minimum attribute-similarity score a candidate must reach.
        Candidates already share depth, tag name, parent tag name and grandparent tag name before
        attributes are compared, so only change this number if you get results you don't want.
        When neither this element nor a candidate has attributes, the candidate is a full match.
    :param ignore_attributes: Attribute names left out of the attribute comparison. Defaults to
        `href` and `src` because URLs vary a lot between elements and are therefore unreliable.
    :param match_text: If True, the elements' text is included in the similarity calculation.
        Not recommended for normal cases, but it depends.

    :return: A ``Selectors`` container of ``Selector`` objects or empty list
    """
    if self._is_text_node(self._root):
        return Selectors()

    # Work on the underlying lxml element directly for speed
    root = self._root

    current_depth = len(list(root.iterancestors()))
    if ignore_attributes:
        target_attrs = self.__get_attributes(root, ignore_attributes)
    else:
        target_attrs = root.attrib

    # Build "grandparent/parent/tag", using as many of the two ancestors as exist
    tags = [self.tag]
    ancestor = root.getparent()
    for _ in range(2):
        if ancestor is None:
            break
        tags.insert(0, ancestor.tag)
        ancestor = ancestor.getparent()

    xpath_path = "//" + "/".join(tags)
    potential_matches = root.xpath(f"{xpath_path}[count(ancestor::*) = {current_depth}]")

    similar_elements = [
        candidate
        for candidate in potential_matches
        if candidate != root
        and self.__are_alike(
            root,
            target_attrs,
            candidate,
            ignore_attributes,
            similarity_threshold,
            match_text,
        )
    ]

    return Selectors(map(self.__element_convertor, similar_elements))
|
| 1055 |
+
|
| 1056 |
+
@overload
def find_by_text(
    self,
    text: str,
    first_match: Literal[True] = ...,
    partial: bool = ...,
    case_sensitive: bool = ...,
    clean_match: bool = ...,
) -> "Selector": ...

@overload
def find_by_text(
    self,
    text: str,
    first_match: Literal[False],
    partial: bool = ...,
    case_sensitive: bool = ...,
    clean_match: bool = ...,
) -> "Selectors": ...

def find_by_text(
    self,
    text: str,
    first_match: bool = True,
    partial: bool = False,
    case_sensitive: bool = False,
    clean_match: bool = True,
) -> Union["Selectors", "Selector"]:
    """Find elements whose text content equals, or contains, the given text.

    :param text: Text query to match
    :param first_match: Return only the first matching element; enabled by default.
        When nothing matches, an empty ``Selectors`` is returned either way.
    :param partial: If enabled, elements whose text merely contains the query are matched
    :param case_sensitive: If enabled, letter case is taken into consideration
    :param clean_match: If enabled, each element's text is cleaned before it is compared
    """
    if self._is_text_node(self._root):
        return Selectors()

    results = Selectors()
    needle = text if case_sensitive else text.lower()

    possible_targets = cast(List, _find_all_elements_with_spaces(self._root))
    if not possible_targets:
        return results

    for node in self.__elements_convertor(possible_targets):
        # Normalise this node's text the same way the query was normalised
        node_text: TextHandler = node.text
        if clean_match:
            node_text = TextHandler(node_text.clean())
        if not case_sensitive:
            node_text = TextHandler(node_text.lower())

        matched = (needle in node_text) if partial else (needle == node_text)
        if not matched:
            continue

        results.append(node)
        if first_match:
            # One element is all that was asked for, so stop here
            return results[0]

    return results
|
| 1123 |
+
|
| 1124 |
+
@overload
def find_by_regex(
    self,
    query: str | Pattern[str],
    first_match: Literal[True] = ...,
    case_sensitive: bool = ...,
    clean_match: bool = ...,
) -> "Selector": ...

@overload
def find_by_regex(
    self,
    query: str | Pattern[str],
    first_match: Literal[False],
    case_sensitive: bool = ...,
    clean_match: bool = ...,
) -> "Selectors": ...

def find_by_regex(
    self,
    query: str | Pattern[str],
    first_match: bool = True,
    case_sensitive: bool = False,
    clean_match: bool = True,
) -> Union["Selectors", "Selector"]:
    """Find elements whose text content matches the given regex pattern.

    :param query: Regex query/pattern to match
    :param first_match: Return only the first matching element; enabled by default.
        When nothing matches, an empty ``Selectors`` is returned either way.
    :param case_sensitive: If enabled, letter case is taken into consideration in the regex.
    :param clean_match: If enabled, this will ignore all whitespaces and consecutive spaces while matching.
    """
    if self._is_text_node(self._root):
        return Selectors()

    results = Selectors()

    possible_targets = cast(List, _find_all_elements_with_spaces(self._root))
    if not possible_targets:
        return results

    for node in self.__elements_convertor(possible_targets):
        is_match = node.text.re(
            query,
            check_match=True,
            clean_match=clean_match,
            case_sensitive=case_sensitive,
        )
        if not is_match:
            continue

        results.append(node)
        if first_match:
            # One element is all that was asked for, so stop here
            return results[0]

    return results
|
| 1180 |
+
|
| 1181 |
+
|
| 1182 |
+
class Selectors(List[Selector]):
    """
    A list of ``Selector`` objects (a subclass of the builtin ``List``) with a few extra
    convenience methods that operate on every contained element.
    """

    __slots__ = ()

    @overload
    def __getitem__(self, pos: SupportsIndex) -> Selector: ...

    @overload
    def __getitem__(self, pos: slice) -> "Selectors": ...

    def __getitem__(self, pos: SupportsIndex | slice) -> Union[Selector, "Selectors"]:
        """Return a single ``Selector`` for an index, or a new container of the same class for a slice."""
        picked = super().__getitem__(pos)
        if not isinstance(pos, slice):
            return cast(Selector, picked)
        return self.__class__(cast(List[Selector], picked))

    def xpath(
        self,
        selector: str,
        identifier: str = "",
        auto_save: bool = False,
        percentage: int = 0,
        **kwargs: Any,
    ) -> "Selectors":
        """
        Run ``.xpath()`` on every element of this list and return all results
        flattened into another `Selectors` object.

        **Important:
        It's recommended to use the identifier argument if you plan to use a different selector later
        and want to relocate the same element(s)**

        Note: **Additional keyword arguments will be passed as XPath variables in the XPath expression!**

        :param selector: The XPath selector to be used.
        :param identifier: A string that will be used to retrieve element's data in adaptive,
         otherwise the selector will be used.
        :param auto_save: Automatically save new elements for `adaptive` later
        :param percentage: The minimum percentage to accept while `adaptive` is working and not going lower than that.
         Be aware that the percentage calculation depends solely on the page structure, so don't play with this
         number unless you must know what you are doing!

        :return: `Selectors` class.
        """
        key = identifier or selector
        per_element = [
            element.xpath(selector, key, False, auto_save, percentage, **kwargs) for element in self
        ]
        return self.__class__(flatten(per_element))

    def css(
        self,
        selector: str,
        identifier: str = "",
        auto_save: bool = False,
        percentage: int = 0,
    ) -> "Selectors":
        """
        Run ``.css()`` on every element of this list and return all results
        flattened into another `Selectors` object.

        **Important:
        It's recommended to use the identifier argument if you plan to use a different selector later
        and want to relocate the same element(s)**

        :param selector: The CSS3 selector to be used.
        :param identifier: A string that will be used to retrieve element's data in adaptive,
         otherwise the selector will be used.
        :param auto_save: Automatically save new elements for `adaptive` later
        :param percentage: The minimum percentage to accept while `adaptive` is working and not going lower than that.
         Be aware that the percentage calculation depends solely on the page structure, so don't play with this
         number unless you must know what you are doing!

        :return: `Selectors` class.
        """
        key = identifier or selector
        per_element = [element.css(selector, key, False, auto_save, percentage) for element in self]
        return self.__class__(flatten(per_element))

    def re(
        self,
        regex: str | Pattern,
        replace_entities: bool = True,
        clean_match: bool = False,
        case_sensitive: bool = True,
    ) -> TextHandlers:
        """Run ``.re()`` on every element of this list and return all matches flattened
        into a single ``TextHandlers`` list.

        :param regex: Can be either a compiled regular expression or a string.
        :param replace_entities: If enabled character entity references are replaced by their corresponding character
        :param clean_match: if enabled, this will ignore all whitespaces and consecutive spaces while matching
        :param case_sensitive: if disabled, the function will set the regex to ignore the letters case while compiling it
        """
        per_element = [
            element.re(regex, replace_entities, clean_match, case_sensitive) for element in self
        ]
        return TextHandlers(flatten(per_element))

    def re_first(
        self,
        regex: str | Pattern,
        default: Any = None,
        replace_entities: bool = True,
        clean_match: bool = False,
        case_sensitive: bool = True,
    ) -> TextHandler:
        """Walk the elements in order, running ``.re()`` on each, and return the very first
        match produced; return ``default`` if no element yields a match.

        :param regex: Can be either a compiled regular expression or a string.
        :param default: The default value to be returned if there is no match
        :param replace_entities: if enabled character entity references are replaced by their corresponding character
        :param clean_match: if enabled, this will ignore all whitespaces and consecutive spaces while matching
        :param case_sensitive: if disabled, function will set the regex to ignore the letters case while compiling it
        """
        # Lazy: elements after the first one that produces a match are never queried
        matches = (
            match
            for element in self
            for match in element.re(regex, replace_entities, clean_match, case_sensitive)
        )
        return next(matches, default)

    def search(self, func: Callable[["Selector"], bool]) -> Optional["Selector"]:
        """Return the first element for which the passed function returns a truthy value.

        :param func: A function that takes each element as an argument and returns True/False
        :return: The first element that match the function or ``None`` otherwise.
        """
        return next((element for element in self if func(element)), None)

    def filter(self, func: Callable[["Selector"], bool]) -> "Selectors":
        """Keep only the elements for which the passed function returns a truthy value.

        :param func: A function that takes each element as an argument and returns True/False
        :return: The new `Selectors` object or empty list otherwise.
        """
        kept = []
        for element in self:
            if func(element):
                kept.append(element)
        return self.__class__(kept)

    @overload
    def get(self) -> Optional[TextHandler]: ...

    @overload
    def get(self, default: _T) -> Union[TextHandler, _T]: ...

    def get(self, default=None):
        """Return the result of ``.get()`` on the first element, or ``default`` if the list is empty.

        :param default: the default value to return if the current list is empty
        """
        if not self:
            return default
        return self[0].get()

    def getall(self) -> TextHandlers:
        """Call ``.get()`` on every element and return the results as a TextHandlers list."""
        return TextHandlers([element.get() for element in self])

    extract = getall
    extract_first = get

    @property
    def first(self) -> Optional[Selector]:
        """Returns the first Selector item of the current list or `None` if the list is empty"""
        if not self:
            return None
        return self[0]

    @property
    def last(self) -> Optional[Selector]:
        """Returns the last Selector item of the current list or `None` if the list is empty"""
        if not self:
            return None
        return self[-1]

    @property
    def length(self) -> int:
        """Returns the length of the current list"""
        return len(self)

    def __getstate__(self) -> Any:  # pragma: no cover
        # Pickling is refused outright: the wrapped lxml objects don't support it
        raise TypeError("Can't pickle Selectors object")
|
| 1359 |
+
|
| 1360 |
+
|
| 1361 |
+
# For backward compatibility: `Adaptor` and `Adaptors` are plain aliases bound to the very same
# class objects as `Selector` and `Selectors`, so code using either spelling works unchanged.
Adaptor = Selector
Adaptors = Selectors
|
py.typed
ADDED
|
@@ -0,0 +1 @@
|
|
|
|
|
|
|
| 1 |
+
|
spiders/__init__.py
ADDED
|
@@ -0,0 +1,18 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
from .request import Request
|
| 2 |
+
from .result import CrawlResult
|
| 3 |
+
from .scheduler import Scheduler
|
| 4 |
+
from .engine import CrawlerEngine
|
| 5 |
+
from .session import SessionManager
|
| 6 |
+
from .spider import Spider, SessionConfigurationError
|
| 7 |
+
from scrapling.engines.toolbelt.custom import Response
|
| 8 |
+
|
| 9 |
+
# Names exported by `from <this package> import *`; each one is imported at the top of this module.
__all__ = [
    "Spider",
    "SessionConfigurationError",
    "Request",
    "CrawlerEngine",
    "CrawlResult",
    "SessionManager",
    "Scheduler",
    "Response",
]
|
spiders/checkpoint.py
ADDED
|
@@ -0,0 +1,90 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import pickle
|
| 2 |
+
from pathlib import Path
|
| 3 |
+
from dataclasses import dataclass, field
|
| 4 |
+
|
| 5 |
+
import anyio
|
| 6 |
+
from anyio import Path as AsyncPath
|
| 7 |
+
|
| 8 |
+
from scrapling.core.utils import log
|
| 9 |
+
from scrapling.core._types import Set, List, Optional, TYPE_CHECKING
|
| 10 |
+
|
| 11 |
+
if TYPE_CHECKING:
|
| 12 |
+
from scrapling.spiders.request import Request
|
| 13 |
+
|
| 14 |
+
|
| 15 |
+
@dataclass
class CheckpointData:
    """Container for checkpoint state.

    Both fields use ``default_factory`` so every instance starts with its own empty
    container instead of sharing one mutable default.
    """

    # Requests captured in the checkpoint
    requests: List["Request"] = field(default_factory=list)
    # Byte entries recorded as already seen (reported as "seen URLs" in the checkpoint log messages)
    seen: Set[bytes] = field(default_factory=set)
|
| 21 |
+
|
| 22 |
+
|
| 23 |
+
class CheckpointManager:
    """Manages saving and loading checkpoint state to/from disk.

    The state is pickled into ``<crawldir>/checkpoint.pkl``. Saving goes through a temporary
    file that is then moved over the real checkpoint, so a failed write never leaves a
    half-written checkpoint behind.
    """

    CHECKPOINT_FILE = "checkpoint.pkl"

    def __init__(self, crawldir: str | Path | AsyncPath, interval: float = 300.0):
        """
        :param crawldir: Directory that holds (or will hold) the checkpoint file.
        :param interval: Checkpoint interval; must be a non-negative integer or float.
        :raises TypeError: If ``interval`` is not an integer or float.
        :raises ValueError: If ``interval`` is negative.
        """
        # Validate before touching any state so a rejected argument leaves nothing half-initialised
        if not isinstance(interval, (int, float)):
            raise TypeError("Checkpoints interval must be integer or float.")
        if interval < 0:
            raise ValueError("Checkpoints interval must be equal or greater than 0.")

        self.crawldir = AsyncPath(crawldir)
        self._checkpoint_path = self.crawldir / self.CHECKPOINT_FILE
        self.interval = interval

    async def has_checkpoint(self) -> bool:
        """Check if a checkpoint exists."""
        return await self._checkpoint_path.exists()

    async def save(self, data: CheckpointData) -> None:
        """Save checkpoint data to disk atomically.

        The data is written to a sibling ``.tmp`` file first and then moved over the
        checkpoint file. On any failure the temporary file is removed, the error is
        logged, and the exception is re-raised.

        :param data: The checkpoint state to persist.
        """
        await self.crawldir.mkdir(parents=True, exist_ok=True)

        temp_path = self._checkpoint_path.with_suffix(".tmp")

        try:
            serialized = pickle.dumps(data, protocol=pickle.HIGHEST_PROTOCOL)
            async with await anyio.open_file(temp_path, "wb") as f:
                await f.write(serialized)

            # `replace` overwrites an existing checkpoint on every platform, whereas `rename`
            # raises on Windows when the target already exists (i.e. on every save after the first).
            await temp_path.replace(self._checkpoint_path)

            log.info(f"Checkpoint saved: {len(data.requests)} requests, {len(data.seen)} seen URLs")
        except Exception as e:
            # Clean up temp file if it exists
            if await temp_path.exists():
                await temp_path.unlink()
            log.error(f"Failed to save checkpoint: {e}")
            raise

    async def load(self) -> Optional[CheckpointData]:
        """Load checkpoint data from disk.

        Returns None if no checkpoint exists or if loading fails.
        """
        if not await self.has_checkpoint():
            return None

        try:
            async with await anyio.open_file(self._checkpoint_path, "rb") as f:
                content = await f.read()
                # SECURITY NOTE: unpickling can execute arbitrary code, so the crawl directory
                # must only ever contain checkpoint files written by a trusted source.
                data: CheckpointData = pickle.loads(content)

            log.info(f"Checkpoint loaded: {len(data.requests)} requests, {len(data.seen)} seen URLs")
            return data

        except Exception as e:
            # Deliberate best effort: an unreadable checkpoint is logged and treated as absent
            log.error(f"Failed to load checkpoint (starting fresh): {e}")
            return None

    async def cleanup(self) -> None:
        """Delete checkpoint file after successful completion.

        Failures are logged as warnings and never raised.
        """
        try:
            if await self._checkpoint_path.exists():
                await self._checkpoint_path.unlink()
                log.debug("Checkpoint file cleaned up")
        except Exception as e:
            log.warning(f"Failed to cleanup checkpoint file: {e}")
|
spiders/engine.py
ADDED
|
@@ -0,0 +1,333 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import json
|
| 2 |
+
import pprint
|
| 3 |
+
from pathlib import Path
|
| 4 |
+
|
| 5 |
+
import anyio
|
| 6 |
+
from anyio import Path as AsyncPath
|
| 7 |
+
from anyio import create_task_group, CapacityLimiter, create_memory_object_stream, EndOfStream
|
| 8 |
+
|
| 9 |
+
from scrapling.core.utils import log
|
| 10 |
+
from scrapling.spiders.request import Request
|
| 11 |
+
from scrapling.spiders.scheduler import Scheduler
|
| 12 |
+
from scrapling.spiders.session import SessionManager
|
| 13 |
+
from scrapling.spiders.result import CrawlStats, ItemList
|
| 14 |
+
from scrapling.spiders.checkpoint import CheckpointManager, CheckpointData
|
| 15 |
+
from scrapling.core._types import Dict, Union, Optional, TYPE_CHECKING, Any, AsyncGenerator
|
| 16 |
+
|
| 17 |
+
if TYPE_CHECKING:
|
| 18 |
+
from scrapling.spiders.spider import Spider
|
| 19 |
+
|
| 20 |
+
|
| 21 |
+
def _dump(obj: Dict) -> str:
    """Serialize ``obj`` to a JSON string indented with four spaces."""
    serialized = json.dumps(obj, indent=4)
    return serialized
|
| 23 |
+
|
| 24 |
+
|
| 25 |
+
class CrawlerEngine:
|
| 26 |
+
"""Orchestrates the crawling process."""
|
| 27 |
+
|
| 28 |
+
def __init__(
    self,
    spider: "Spider",
    session_manager: SessionManager,
    crawldir: Optional[Union[str, Path, AsyncPath]] = None,
    interval: float = 300.0,
):
    """Wire the engine to a spider and a session manager and reset all run-time state.

    :param spider: The spider being run; its fingerprint options, concurrency limit and
        allowed domains are read here.
    :param session_manager: The session manager this engine will use.
    :param crawldir: Directory for checkpoint files; a falsy value disables the checkpoint system.
    :param interval: Checkpoint interval handed to the ``CheckpointManager``.
    """
    self.spider = spider
    self.session_manager = session_manager

    # The scheduler takes its fingerprinting options straight from the spider
    fingerprint_options = dict(
        include_kwargs=spider.fp_include_kwargs,
        include_headers=spider.fp_include_headers,
        keep_fragments=spider.fp_keep_fragments,
    )
    self.scheduler = Scheduler(**fingerprint_options)
    self.stats = CrawlStats()

    # Concurrency limits: one global limiter, per-domain limiters are stored in the dict
    self._global_limiter = CapacityLimiter(spider.concurrent_requests)
    self._domain_limiters: dict[str, CapacityLimiter] = {}
    self._allowed_domains: set[str] = spider.allowed_domains or set()

    # Run-time bookkeeping
    self._active_tasks: int = 0
    self._running: bool = False
    self._items: ItemList = ItemList()
    self._item_stream: Any = None

    # Checkpointing: the manager is always created, but only counts as enabled with a crawldir
    self._checkpoint_system_enabled = bool(crawldir)
    self._checkpoint_manager = CheckpointManager(crawldir or "", interval)
    self._last_checkpoint_time: float = 0.0

    # Pause / stop flags
    self._pause_requested: bool = False
    self._force_stop: bool = False
    self.paused: bool = False
|
| 59 |
+
|
| 60 |
+
def _is_domain_allowed(self, request: Request) -> bool:
|
| 61 |
+
"""Check if the request's domain is in allowed_domains."""
|
| 62 |
+
if not self._allowed_domains:
|
| 63 |
+
return True
|
| 64 |
+
|
| 65 |
+
domain = request.domain
|
| 66 |
+
for allowed in self._allowed_domains:
|
| 67 |
+
if domain == allowed or domain.endswith("." + allowed):
|
| 68 |
+
return True
|
| 69 |
+
return False
|
| 70 |
+
|
| 71 |
+
def _rate_limiter(self, domain: str) -> CapacityLimiter:
|
| 72 |
+
"""Get or create a per-domain concurrency limiter if enabled, otherwise use the global limiter."""
|
| 73 |
+
if self.spider.concurrent_requests_per_domain:
|
| 74 |
+
if domain not in self._domain_limiters:
|
| 75 |
+
self._domain_limiters[domain] = CapacityLimiter(self.spider.concurrent_requests_per_domain)
|
| 76 |
+
return self._domain_limiters[domain]
|
| 77 |
+
return self._global_limiter
|
| 78 |
+
|
| 79 |
+
def _normalize_request(self, request: Request) -> None:
|
| 80 |
+
"""Normalize request fields before enqueueing.
|
| 81 |
+
|
| 82 |
+
Resolves empty sid to the session manager's default session ID.
|
| 83 |
+
This ensures consistent fingerprinting for requests using the same session.
|
| 84 |
+
"""
|
| 85 |
+
if not request.sid:
|
| 86 |
+
request.sid = self.session_manager.default_session_id
|
| 87 |
+
|
| 88 |
+
async def _process_request(self, request: Request) -> None:
    """Download and process a single request.

    Steps, in order:
    1. Fetch the request through the session manager while holding the limiter returned by
       ``_rate_limiter`` (after an optional ``download_delay`` sleep), updating the stats.
       A failed fetch is counted, reported to ``spider.on_error`` and ends processing.
    2. If the spider reports the response as blocked, schedule a retry copy (up to
       ``max_blocked_retries``) and stop.
    3. Otherwise iterate the callback (``request.callback`` or ``spider.parse``): yielded
       requests are enqueued or counted as offsite, yielded dicts are passed through
       ``spider.on_scraped_item`` and then stored, streamed, or counted as dropped.

    NOTE(review): the nesting below was reconstructed from a listing whose indentation was
    lost -- confirm that the blocked check and the callback run outside the limiter.
    """
    async with self._rate_limiter(request.domain):
        if self.spider.download_delay:
            await anyio.sleep(self.spider.download_delay)

        # Record whichever proxy settings this request carries
        if request._session_kwargs.get("proxy"):
            self.stats.proxies.append(request._session_kwargs["proxy"])
        if request._session_kwargs.get("proxies"):
            self.stats.proxies.append(dict(request._session_kwargs["proxies"]))
        try:
            response = await self.session_manager.fetch(request)
            self.stats.increment_requests_count(request.sid or self.session_manager.default_session_id)
            self.stats.increment_response_bytes(request.domain, len(response.body))
            self.stats.increment_status(response.status)

        except Exception as e:
            # Any fetch failure is counted and handed to the spider; nothing else runs for this request
            self.stats.failed_requests_count += 1
            await self.spider.on_error(request, e)
            return

    if await self.spider.is_blocked(response):
        self.stats.blocked_requests_count += 1
        if request._retry_count < self.spider.max_blocked_retries:
            retry_request = request.copy()
            retry_request._retry_count += 1
            retry_request.priority -= 1  # Don't retry immediately
            # The retry targets a URL that was already requested, so it must skip the duplicate filter
            retry_request.dont_filter = True
            # Drop the proxy settings that were used for the blocked attempt
            retry_request._session_kwargs.pop("proxy", None)
            retry_request._session_kwargs.pop("proxies", None)

            # The spider may adjust or swap the retry request before it is scheduled
            new_request = await self.spider.retry_blocked_request(retry_request, response)
            self._normalize_request(new_request)
            await self.scheduler.enqueue(new_request)
            log.info(
                f"Scheduled blocked request for retry ({retry_request._retry_count}/{self.spider.max_blocked_retries}): {request.url}"
            )
        else:
            log.warning(f"Max retries exceeded for blocked request: {request.url}")
        # A blocked response is never passed to the callback
        return

    callback = request.callback if request.callback else self.spider.parse
    try:
        async for result in callback(response):
            if isinstance(result, Request):
                if self._is_domain_allowed(result):
                    self._normalize_request(result)
                    await self.scheduler.enqueue(result)
                else:
                    self.stats.offsite_requests_count += 1
                    log.debug(f"Filtered offsite request to: {result.url}")
            elif isinstance(result, dict):
                # The spider hook may transform the item; a falsy return value drops it
                processed_result = await self.spider.on_scraped_item(result)
                if processed_result:
                    self.stats.items_scraped += 1
                    log.debug(f"Scraped from {str(response)}\n{pprint.pformat(processed_result)}")
                    # Items go to the stream when one is set, otherwise into the in-memory list
                    if self._item_stream:
                        await self._item_stream.send(processed_result)
                    else:
                        self._items.append(processed_result)
                else:
                    self.stats.items_dropped += 1
                    log.warning(f"Dropped from {str(response)}\n{processed_result}")
            elif result is not None:
                log.error(f"Spider must return Request, dict or None, got '{type(result)}' in {request}")
    except Exception as e:
        # Errors raised by the callback are logged with traceback and reported to the spider
        msg = f"Spider error processing {request}:\n {e}"
        log.error(msg, exc_info=e)
        await self.spider.on_error(request, e)
|
| 157 |
+
|
| 158 |
+
async def _task_wrapper(self, request: Request) -> None:
|
| 159 |
+
"""Wrapper to track active task count."""
|
| 160 |
+
try:
|
| 161 |
+
await self._process_request(request)
|
| 162 |
+
finally:
|
| 163 |
+
self._active_tasks -= 1
|
| 164 |
+
|
| 165 |
+
def request_pause(self) -> None:
|
| 166 |
+
"""Request a graceful pause of the crawl.
|
| 167 |
+
|
| 168 |
+
First call: requests graceful pause (waits for active tasks).
|
| 169 |
+
Second call: forces immediate stop.
|
| 170 |
+
"""
|
| 171 |
+
if self._force_stop:
|
| 172 |
+
return # Already forcing stop
|
| 173 |
+
|
| 174 |
+
if self._pause_requested:
|
| 175 |
+
# Second Ctrl+C - force stop
|
| 176 |
+
self._force_stop = True
|
| 177 |
+
log.warning("Force stop requested, cancelling immediately...")
|
| 178 |
+
else:
|
| 179 |
+
self._pause_requested = True
|
| 180 |
+
log.info(
|
| 181 |
+
"Pause requested, waiting for in-flight requests to complete (press Ctrl+C again to force stop)..."
|
| 182 |
+
)
|
| 183 |
+
|
| 184 |
+
async def _save_checkpoint(self) -> None:
    """Write the scheduler's current snapshot through the checkpoint manager.

    Also records the current time as the moment of the last checkpoint.
    """
    pending, fingerprints = self.scheduler.snapshot()
    checkpoint = CheckpointData(requests=pending, seen=fingerprints)
    await self._checkpoint_manager.save(checkpoint)
    self._last_checkpoint_time = anyio.current_time()
|
| 190 |
+
|
| 191 |
+
def _is_checkpoint_time(self) -> bool:
|
| 192 |
+
"""Check if it's time for the periodic checkpoint."""
|
| 193 |
+
if not self._checkpoint_system_enabled:
|
| 194 |
+
return False
|
| 195 |
+
|
| 196 |
+
if self._checkpoint_manager.interval == 0:
|
| 197 |
+
return False
|
| 198 |
+
|
| 199 |
+
current_time = anyio.current_time()
|
| 200 |
+
return (current_time - self._last_checkpoint_time) >= self._checkpoint_manager.interval
|
| 201 |
+
|
| 202 |
+
async def _restore_from_checkpoint(self) -> bool:
|
| 203 |
+
"""Attempt to restore state from checkpoint.
|
| 204 |
+
|
| 205 |
+
Returns True if successfully restored, False otherwise.
|
| 206 |
+
"""
|
| 207 |
+
if not self._checkpoint_system_enabled:
|
| 208 |
+
raise
|
| 209 |
+
|
| 210 |
+
data = await self._checkpoint_manager.load()
|
| 211 |
+
if data is None:
|
| 212 |
+
return False
|
| 213 |
+
|
| 214 |
+
self.scheduler.restore(data)
|
| 215 |
+
|
| 216 |
+
# Restore callbacks from spider after scheduler restore
|
| 217 |
+
for request in data.requests:
|
| 218 |
+
request._restore_callback(self.spider)
|
| 219 |
+
|
| 220 |
+
return True
|
| 221 |
+
|
| 222 |
+
async def crawl(self) -> CrawlStats:
    """Run the spider and return CrawlStats.

    Resets the run state, optionally resumes from a checkpoint, then keeps
    dequeuing requests into a task group until the queue is empty with no
    active tasks, or until a pause / force stop is requested.

    :return: The statistics object collected for this run
    """
    self._running = True
    self._items.clear()
    self.paused = False
    self._pause_requested = False
    self._force_stop = False
    self.stats = CrawlStats(start_time=anyio.current_time())

    # Check for existing checkpoint
    resuming = (await self._restore_from_checkpoint()) if self._checkpoint_system_enabled else False
    self._last_checkpoint_time = anyio.current_time()

    async with self.session_manager:
        self.stats.concurrent_requests = self.spider.concurrent_requests
        self.stats.concurrent_requests_per_domain = self.spider.concurrent_requests_per_domain
        self.stats.download_delay = self.spider.download_delay
        await self.spider.on_start(resuming=resuming)

        try:
            if not resuming:
                async for request in self.spider.start_requests():
                    self._normalize_request(request)
                    await self.scheduler.enqueue(request)
            else:
                log.info("Resuming from checkpoint, skipping start_requests()")

            # Process queue
            async with create_task_group() as tg:
                while self._running:
                    if self._pause_requested:
                        if self._active_tasks == 0 or self._force_stop:
                            if self._force_stop:
                                log.warning(f"Force stopping with {self._active_tasks} active tasks")
                                tg.cancel_scope.cancel()

                            # Only save checkpoint if checkpoint system is enabled
                            if self._checkpoint_system_enabled:
                                # After a force stop the task group's scope is cancelled, and this loop
                                # runs inside that scope. Shield the save so it is not cancelled too,
                                # which would skip ``paused = True`` and let the cleanup below run.
                                with anyio.CancelScope(shield=True):
                                    await self._save_checkpoint()
                                self.paused = True
                                log.info("Spider paused, checkpoint saved")
                            else:
                                log.info("Spider stopped gracefully")

                            self._running = False
                            break

                        # Wait briefly and check again
                        await anyio.sleep(0.05)
                        continue

                    if self._checkpoint_system_enabled and self._is_checkpoint_time():
                        await self._save_checkpoint()

                    if self.scheduler.is_empty:
                        # Empty queue + no active tasks = done
                        if self._active_tasks == 0:
                            self._running = False
                            log.debug("Spider idle")
                            break

                        # Brief wait for callbacks to enqueue new requests
                        await anyio.sleep(0.05)
                        continue

                    # Only spawn tasks up to concurrent_requests limit
                    # This prevents spawning thousands of waiting tasks
                    if self._active_tasks >= self.spider.concurrent_requests:
                        await anyio.sleep(0.01)
                        continue

                    request = await self.scheduler.dequeue()
                    # Counted here; ``_task_wrapper`` decrements it when the task ends
                    self._active_tasks += 1
                    tg.start_soon(self._task_wrapper, request)

        finally:
            await self.spider.on_close()
            # Clean up checkpoint files on successful completion (not paused)
            if not self.paused and self._checkpoint_system_enabled:
                await self._checkpoint_manager.cleanup()

    self.stats.log_levels_counter = self.spider._log_counter.get_counts()
    self.stats.end_time = anyio.current_time()
    log.info(_dump(self.stats.to_dict()))
    return self.stats
|
| 307 |
+
|
| 308 |
+
@property
def items(self) -> ItemList:
    """The list that scraped items are collected into."""
    return self._items
|
| 312 |
+
|
| 313 |
+
def __aiter__(self) -> AsyncGenerator[dict, None]:
|
| 314 |
+
return self._stream()
|
| 315 |
+
|
| 316 |
+
async def _stream(self) -> AsyncGenerator[dict, None]:
    """Async generator that runs crawl and yields items.

    The crawl runs as a background task that sends items into a memory
    object stream (buffer size 100), while this generator yields them from
    the receiving end until the sending end is closed.
    """
    send, recv = create_memory_object_stream[dict](100)
    self._item_stream = send

    async def run():
        try:
            await self.crawl()
        finally:
            # Detach the stream before closing it, so a later ``crawl()`` call
            # does not try to send items into a closed stream.
            self._item_stream = None
            await send.aclose()

    async with create_task_group() as tg:
        tg.start_soon(run)
        # Close the receiving end as well once iteration stops
        async with recv:
            try:
                async for item in recv:
                    yield item
            except EndOfStream:
                pass
|
spiders/request.py
ADDED
|
@@ -0,0 +1,163 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import hashlib
|
| 2 |
+
from io import BytesIO
|
| 3 |
+
from functools import cached_property
|
| 4 |
+
from urllib.parse import urlparse, urlencode
|
| 5 |
+
|
| 6 |
+
import orjson
|
| 7 |
+
from w3lib.url import canonicalize_url
|
| 8 |
+
|
| 9 |
+
from scrapling.engines.toolbelt.custom import Response
|
| 10 |
+
from scrapling.core._types import Any, AsyncGenerator, Callable, Dict, Optional, Union, Tuple, TYPE_CHECKING
|
| 11 |
+
|
| 12 |
+
if TYPE_CHECKING:
|
| 13 |
+
from scrapling.spiders.spider import Spider
|
| 14 |
+
|
| 15 |
+
|
| 16 |
+
def _convert_to_bytes(value: str | bytes) -> bytes:
|
| 17 |
+
if isinstance(value, bytes):
|
| 18 |
+
return value
|
| 19 |
+
if not isinstance(value, str):
|
| 20 |
+
raise TypeError(f"Can't convert {type(value).__name__} to bytes")
|
| 21 |
+
|
| 22 |
+
return value.encode(encoding="utf-8", errors="ignore")
|
| 23 |
+
|
| 24 |
+
|
| 25 |
+
class Request:
    """A crawl request: the URL to fetch plus session, callback, and scheduling details.

    Keyword arguments that are not named in the constructor signature are kept
    in ``_session_kwargs`` and passed along when the request is fetched.
    """

    def __init__(
        self,
        url: str,
        sid: str = "",
        callback: Callable[[Response], AsyncGenerator[Union[Dict[str, Any], "Request", None], None]] | None = None,
        priority: int = 0,
        dont_filter: bool = False,
        meta: dict[str, Any] | None = None,
        _retry_count: int = 0,
        **kwargs: Any,
    ) -> None:
        """
        :param url: URL to request
        :param sid: ID of the session to use; an empty string leaves the choice to the session manager
        :param callback: Callable that receives the response for this request
        :param priority: Scheduling priority; requests with a higher value are dequeued first
        :param dont_filter: If True, the scheduler does not drop this request as a duplicate
        :param meta: Arbitrary data carried with the request; a falsy value becomes a new empty dict
        :param _retry_count: Internal retry counter
        :param kwargs: Extra arguments stored as-is for the session (e.g. ``method``, ``data``, ``json``, ``headers``)
        """
        self.url: str = url
        self.sid: str = sid
        self.callback = callback
        self.priority: int = priority
        self.dont_filter: bool = dont_filter
        self.meta: dict[str, Any] = meta if meta else {}
        self._retry_count: int = _retry_count
        self._session_kwargs = kwargs if kwargs else {}
        # Fingerprint cache, filled by update_fingerprint()
        self._fp: Optional[bytes] = None

    def copy(self) -> "Request":
        """Create a copy of this request.

        ``meta`` is shallow-copied; the cached fingerprint is not carried over.
        """
        return Request(
            url=self.url,
            sid=self.sid,
            callback=self.callback,
            priority=self.priority,
            dont_filter=self.dont_filter,
            meta=self.meta.copy(),
            _retry_count=self._retry_count,
            **self._session_kwargs,
        )

    @cached_property
    def domain(self) -> str:
        """Network location (``netloc``) part of the request URL."""
        return urlparse(self.url).netloc

    def update_fingerprint(
        self,
        include_kwargs: bool = False,
        include_headers: bool = False,
        keep_fragments: bool = False,
    ) -> bytes:
        """Generate a unique fingerprint for deduplication.

        Caches the result in self._fp after first computation, so later calls
        return the cached value whatever arguments they pass.

        :param include_kwargs: Also hash the names of the extra session arguments (except ``data``/``json``)
        :param include_headers: Also hash the lower-cased ``headers`` / ``extra_headers`` entries
        :param keep_fragments: Passed to ``canonicalize_url`` to keep the URL fragment
        :return: SHA-1 digest of the collected request data
        """
        if self._fp is not None:
            return self._fp

        post_data = self._session_kwargs.get("data", {})
        body = b""
        if post_data:
            if isinstance(post_data, dict | list | tuple):
                body = urlencode(post_data).encode()
            elif isinstance(post_data, str):
                body = post_data.encode()
            elif isinstance(post_data, BytesIO):
                body = post_data.getvalue()
            elif isinstance(post_data, bytes):
                body = post_data
        else:
            # No form data: fall back to the JSON payload, if any
            post_data = self._session_kwargs.get("json", {})
            body = orjson.dumps(post_data) if post_data else b""

        data: Dict[str, str | Tuple] = {
            "sid": self.sid,
            "body": body.hex(),
            "method": self._session_kwargs.get("method", "GET"),
            "url": canonicalize_url(self.url, keep_fragments=keep_fragments),
        }

        if include_kwargs:
            names = {key.lower() for key in self._session_kwargs if key.lower() not in ("data", "json")}
            # Sorted: joining a set directly follows its iteration order, which changes with
            # string hash randomisation and made the fingerprint differ between runs.
            data["kwargs"] = "".join(sorted(_convert_to_bytes(name).hex() for name in names))

        if include_headers:
            headers = self._session_kwargs.get("headers") or self._session_kwargs.get("extra_headers") or {}
            processed_headers = {}
            # Some header normalization
            for key, value in headers.items():
                processed_headers[_convert_to_bytes(key.lower()).hex()] = _convert_to_bytes(value.lower()).hex()
            # Sorted so the same headers given in a different order produce the same fingerprint
            data["headers"] = tuple(sorted(processed_headers.items()))

        fp = hashlib.sha1(orjson.dumps(data, option=orjson.OPT_SORT_KEYS), usedforsecurity=False).digest()
        self._fp = fp
        return fp

    def __repr__(self) -> str:
        callback_name = getattr(self.callback, "__name__", None) or "None"
        return f"<Request({self.url}) priority={self.priority} callback={callback_name}>"

    def __str__(self) -> str:
        return self.url

    def __lt__(self, other: object) -> bool:
        """Compare requests by priority"""
        if not isinstance(other, Request):
            return NotImplemented
        return self.priority < other.priority

    def __gt__(self, other: object) -> bool:
        """Compare requests by priority"""
        if not isinstance(other, Request):
            return NotImplemented
        return self.priority > other.priority

    def __eq__(self, other: object) -> bool:
        """Requests are equal if they have the same fingerprint.

        :raises RuntimeError: If either request has no fingerprint yet
        """
        if not isinstance(other, Request):
            return NotImplemented
        if self._fp is None or other._fp is None:
            raise RuntimeError("Cannot compare requests before generating their fingerprints!")
        return self._fp == other._fp

    def __getstate__(self) -> dict[str, Any]:
        """Prepare state for pickling - store callback as name string for pickle compatibility."""
        state = self.__dict__.copy()
        state["_callback_name"] = getattr(self.callback, "__name__", None) if self.callback is not None else None
        state["callback"] = None  # Don't pickle the actual callable
        return state

    def __setstate__(self, state: dict[str, Any]) -> None:
        """Restore state from pickle - callback restored later via _restore_callback()."""
        self._callback_name: str | None = state.pop("_callback_name", None)
        self.__dict__.update(state)

    def _restore_callback(self, spider: "Spider") -> None:
        """Restore callback from spider after unpickling.

        Falls back to ``spider.parse`` when the stored name is not found on the spider.
        The temporary ``_callback_name`` attribute is removed afterwards.

        :param spider: Spider instance to look up callback method on
        """
        if not hasattr(self, "_callback_name"):
            return
        if self._callback_name:
            self.callback = getattr(spider, self._callback_name, None) or spider.parse
        del self._callback_name
|
spiders/result.py
ADDED
|
@@ -0,0 +1,125 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
from pathlib import Path
|
| 2 |
+
from dataclasses import dataclass, field
|
| 3 |
+
|
| 4 |
+
import orjson
|
| 5 |
+
|
| 6 |
+
from scrapling.core.utils import log
|
| 7 |
+
from scrapling.core._types import Any, Iterator, Dict, List, Tuple, Union
|
| 8 |
+
|
| 9 |
+
|
| 10 |
+
class ItemList(list):
    """List of scraped items that can write itself to JSON or JSON Lines files."""

    def to_json(self, path: Union[str, Path], *, indent: bool = False):
        """Write all items to ``path`` as a single JSON array.

        Missing parent directories are created first.

        :param path: Path to the output file
        :param indent: Pretty-print with 2-space indentation (slightly slower)
        """
        flags = orjson.OPT_SERIALIZE_NUMPY
        if indent:
            flags = flags | orjson.OPT_INDENT_2

        target = Path(path)
        target.parent.mkdir(parents=True, exist_ok=True)
        payload = orjson.dumps(list(self), option=flags)
        target.write_bytes(payload)
        log.info("Saved %d items to %s", len(self), path)

    def to_jsonl(self, path: Union[str, Path]):
        """Write the items to ``path`` in JSON Lines form, one item per line.

        Missing parent directories are created first.

        :param path: Path to the output file
        """
        Path(path).parent.mkdir(parents=True, exist_ok=True)
        with open(path, "wb") as sink:
            for entry in self:
                sink.write(orjson.dumps(entry, option=orjson.OPT_SERIALIZE_NUMPY) + b"\n")
        log.info("Saved %d items to %s", len(self), path)
|
| 39 |
+
|
| 40 |
+
|
| 41 |
+
@dataclass
|
| 42 |
+
class CrawlStats:
|
| 43 |
+
"""Statistics for a crawl run."""
|
| 44 |
+
|
| 45 |
+
requests_count: int = 0
|
| 46 |
+
concurrent_requests: int = 0
|
| 47 |
+
concurrent_requests_per_domain: int = 0
|
| 48 |
+
failed_requests_count: int = 0
|
| 49 |
+
offsite_requests_count: int = 0
|
| 50 |
+
response_bytes: int = 0
|
| 51 |
+
items_scraped: int = 0
|
| 52 |
+
items_dropped: int = 0
|
| 53 |
+
start_time: float = 0.0
|
| 54 |
+
end_time: float = 0.0
|
| 55 |
+
download_delay: float = 0.0
|
| 56 |
+
blocked_requests_count: int = 0
|
| 57 |
+
custom_stats: Dict = field(default_factory=dict)
|
| 58 |
+
response_status_count: Dict = field(default_factory=dict)
|
| 59 |
+
domains_response_bytes: Dict = field(default_factory=dict)
|
| 60 |
+
sessions_requests_count: Dict = field(default_factory=dict)
|
| 61 |
+
proxies: List[str | Dict | Tuple] = field(default_factory=list)
|
| 62 |
+
log_levels_counter: Dict = field(default_factory=dict)
|
| 63 |
+
|
| 64 |
+
@property
|
| 65 |
+
def elapsed_seconds(self) -> float:
|
| 66 |
+
return self.end_time - self.start_time
|
| 67 |
+
|
| 68 |
+
@property
|
| 69 |
+
def requests_per_second(self) -> float:
|
| 70 |
+
if self.elapsed_seconds == 0:
|
| 71 |
+
return 0.0
|
| 72 |
+
return self.requests_count / self.elapsed_seconds
|
| 73 |
+
|
| 74 |
+
def increment_status(self, status: int) -> None:
|
| 75 |
+
self.response_status_count[f"status_{status}"] = self.response_status_count.get(f"status_{status}", 0) + 1
|
| 76 |
+
|
| 77 |
+
def increment_response_bytes(self, domain: str, count: int) -> None:
|
| 78 |
+
self.response_bytes += count
|
| 79 |
+
self.domains_response_bytes[domain] = self.domains_response_bytes.get(domain, 0) + count
|
| 80 |
+
|
| 81 |
+
def increment_requests_count(self, sid: str) -> None:
|
| 82 |
+
self.requests_count += 1
|
| 83 |
+
self.sessions_requests_count[sid] = self.sessions_requests_count.get(sid, 0) + 1
|
| 84 |
+
|
| 85 |
+
def to_dict(self) -> dict[str, Any]:
|
| 86 |
+
return {
|
| 87 |
+
"items_scraped": self.items_scraped,
|
| 88 |
+
"items_dropped": self.items_dropped,
|
| 89 |
+
"elapsed_seconds": round(self.elapsed_seconds, 2),
|
| 90 |
+
"download_delay": round(self.download_delay, 2),
|
| 91 |
+
"concurrent_requests": self.concurrent_requests,
|
| 92 |
+
"concurrent_requests_per_domain": self.concurrent_requests_per_domain,
|
| 93 |
+
"requests_count": self.requests_count,
|
| 94 |
+
"requests_per_second": round(self.requests_per_second, 2),
|
| 95 |
+
"sessions_requests_count": self.sessions_requests_count,
|
| 96 |
+
"failed_requests_count": self.failed_requests_count,
|
| 97 |
+
"offsite_requests_count": self.offsite_requests_count,
|
| 98 |
+
"blocked_requests_count": self.blocked_requests_count,
|
| 99 |
+
"response_status_count": self.response_status_count,
|
| 100 |
+
"response_bytes": self.response_bytes,
|
| 101 |
+
"domains_response_bytes": self.domains_response_bytes,
|
| 102 |
+
"proxies": self.proxies,
|
| 103 |
+
"custom_stats": self.custom_stats,
|
| 104 |
+
"log_count": self.log_levels_counter,
|
| 105 |
+
}
|
| 106 |
+
|
| 107 |
+
|
| 108 |
+
@dataclass
class CrawlResult:
    """Outcome of a spider run: its stats, its items, and whether it was paused."""

    stats: CrawlStats
    items: ItemList
    paused: bool = False

    @property
    def completed(self) -> bool:
        """Whether the run finished without being paused."""
        return not self.paused

    def __len__(self) -> int:
        """Number of items in the result."""
        return len(self.items)

    def __iter__(self) -> Iterator[dict[str, Any]]:
        """Iterate over the items in the result."""
        return iter(self.items)
|
spiders/scheduler.py
ADDED
|
@@ -0,0 +1,80 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import asyncio
|
| 2 |
+
from itertools import count
|
| 3 |
+
|
| 4 |
+
from scrapling.core.utils import log
|
| 5 |
+
from scrapling.spiders.request import Request
|
| 6 |
+
from scrapling.core._types import List, Set, Tuple, TYPE_CHECKING
|
| 7 |
+
|
| 8 |
+
if TYPE_CHECKING:
|
| 9 |
+
from scrapling.spiders.checkpoint import CheckpointData
|
| 10 |
+
|
| 11 |
+
|
| 12 |
+
class Scheduler:
    """
    Request queue ordered by priority, with fingerprint-based deduplication.

    Requests with a higher priority are handed out first; requests with equal
    priority keep their insertion order.
    A request whose fingerprint was already seen is dropped unless it sets dont_filter=True.
    """

    def __init__(self, include_kwargs: bool = False, include_headers: bool = False, keep_fragments: bool = False):
        """
        :param include_kwargs: Forwarded to ``Request.update_fingerprint``
        :param include_headers: Forwarded to ``Request.update_fingerprint``
        :param keep_fragments: Forwarded to ``Request.update_fingerprint``
        """
        self._queue: asyncio.PriorityQueue[tuple[int, int, Request]] = asyncio.PriorityQueue()
        self._seen: set[bytes] = set()
        # Insertion counter, used as tie-breaker between equal priorities
        self._counter = count()
        # Copy of the queued entries keyed by counter, so a snapshot never has to drain the queue
        self._pending: dict[int, tuple[int, int, Request]] = {}
        self._include_kwargs = include_kwargs
        self._include_headers = include_headers
        self._keep_fragments = keep_fragments

    async def enqueue(self, request: Request) -> bool:
        """Queue a request unless it is a filtered duplicate.

        :param request: Request to add
        :return: True if the request was queued, False if it was dropped as a duplicate
        """
        fingerprint = request.update_fingerprint(self._include_kwargs, self._include_headers, self._keep_fragments)

        already_seen = fingerprint in self._seen
        if already_seen and not request.dont_filter:
            log.debug("Dropped duplicate request: %s", request)
            return False

        self._seen.add(fingerprint)

        # The priority is negated because the queue returns the smallest entry first
        ticket = next(self._counter)
        entry = (-request.priority, ticket, request)
        self._pending[ticket] = entry
        await self._queue.put(entry)
        return True

    async def dequeue(self) -> Request:
        """Wait for the next queued request and return it."""
        entry = await self._queue.get()
        self._pending.pop(entry[1], None)
        return entry[2]

    def __len__(self) -> int:
        """Number of entries currently in the queue."""
        return self._queue.qsize()

    @property
    def is_empty(self) -> bool:
        """Whether the queue currently holds no entries."""
        return self._queue.empty()

    def snapshot(self) -> Tuple[List[Request], Set[bytes]]:
        """Capture the pending requests (in queue order) and a copy of the seen fingerprints."""
        ordered = sorted(self._pending.values(), key=lambda entry: entry[:2])  # (negated priority, counter)
        return [entry[2] for entry in ordered], self._seen.copy()

    def restore(self, data: "CheckpointData") -> None:
        """Load the seen set and pending requests from checkpoint data.

        :param data: CheckpointData containing requests and seen set
        """
        self._seen = data.seen.copy()

        # Requests are re-queued in the order given, each getting a fresh counter value
        for request in data.requests:
            ticket = next(self._counter)
            entry = (-request.priority, ticket, request)
            self._pending[ticket] = entry
            self._queue.put_nowait(entry)

        log.info(f"Scheduler restored: {len(data.requests)} requests, {len(data.seen)} seen")
|
spiders/session.py
ADDED
|
@@ -0,0 +1,145 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
from asyncio import Lock
|
| 2 |
+
|
| 3 |
+
from scrapling.spiders.request import Request
|
| 4 |
+
from scrapling.engines.static import _ASyncSessionLogic
|
| 5 |
+
from scrapling.engines.toolbelt.convertor import Response
|
| 6 |
+
from scrapling.core._types import Set, cast, SUPPORTED_HTTP_METHODS
|
| 7 |
+
from scrapling.fetchers import AsyncDynamicSession, AsyncStealthySession, FetcherSession
|
| 8 |
+
|
| 9 |
+
# Union of the session classes accepted wherever a ``Session`` is annotated in this module.
Session = FetcherSession | AsyncDynamicSession | AsyncStealthySession
|
| 10 |
+
|
| 11 |
+
|
| 12 |
+
class SessionManager:
    """Manages pre-configured session instances.

    Sessions are registered under an ID, started together (or lazily on first
    use), and used by ``fetch()`` to perform the request a ``Request`` describes.
    """

    def __init__(self) -> None:
        self._sessions: dict[str, Session] = {}
        self._default_session_id: str | None = None
        self._started: bool = False
        # IDs of sessions that are started on first use instead of in start()
        self._lazy_sessions: Set[str] = set()
        # Guards the lazy start so concurrent fetches don't start a session twice
        self._lazy_lock = Lock()

    def add(self, session_id: str, session: Session, *, default: bool = False, lazy: bool = False) -> "SessionManager":
        """Register a session instance.

        The first registered session becomes the default even without ``default=True``.

        :param session_id: Name to reference this session in requests
        :param session: Your pre-configured session instance
        :param default: If True, this becomes the default session
        :param lazy: If True, the session will be started only when a request uses its ID.
        :return: This manager, so calls can be chained
        :raises ValueError: If ``session_id`` is already registered
        """
        if session_id in self._sessions:
            raise ValueError(f"Session '{session_id}' already registered")

        self._sessions[session_id] = session

        if default or self._default_session_id is None:
            self._default_session_id = session_id

        if lazy:
            self._lazy_sessions.add(session_id)

        return self

    def remove(self, session_id: str) -> None:
        """Removes a session.

        :param session_id: ID of session to remove
        :raises KeyError: If ``session_id`` is not registered
        """
        _ = self.pop(session_id)

    def pop(self, session_id: str) -> Session:
        """Remove and returns a session.

        If the removed session was the default, the first remaining session
        (if any) becomes the new default.

        :param session_id: ID of session to remove
        :raises KeyError: If ``session_id`` is not registered
        """
        if session_id not in self._sessions:
            raise KeyError(f"Session '{session_id}' not found")

        session = self._sessions.pop(session_id)
        self._lazy_sessions.discard(session_id)

        if session and self._default_session_id == session_id:
            self._default_session_id = next(iter(self._sessions), None)

        return session

    @property
    def default_session_id(self) -> str:
        """ID of the default session.

        :raises RuntimeError: If there is no default session
        """
        if self._default_session_id is None:
            raise RuntimeError("No sessions registered")
        return self._default_session_id

    @property
    def session_ids(self) -> list[str]:
        """IDs of all registered sessions, in registration order."""
        return list(self._sessions.keys())

    def get(self, session_id: str) -> Session:
        """Return the session registered under ``session_id``.

        :raises KeyError: If ``session_id`` is not registered
        """
        if session_id not in self._sessions:
            available = ", ".join(self._sessions.keys())
            raise KeyError(f"Session '{session_id}' not found. Available: {available}")
        return self._sessions[session_id]

    async def start(self) -> None:
        """Start all non-lazy sessions that aren't already alive.

        Does nothing if the manager was already started.
        """
        if self._started:
            return

        for sid, session in self._sessions.items():
            if sid not in self._lazy_sessions and not session._is_alive:
                await session.__aenter__()

        self._started = True

    async def close(self) -> None:
        """Close all registered sessions."""
        for session in self._sessions.values():
            _ = await session.__aexit__(None, None, None)

        self._started = False

    async def fetch(self, request: Request) -> Response:
        """Fetch ``request`` with the session it names, or with the default session.

        A lazy session that is not alive yet is started first.

        :param request: Request to perform; its extra keyword arguments are passed to the session
        :return: The response, with ``request`` attached and the request meta merged into its meta
        :raises KeyError: If the session ID is not registered
        :raises TypeError: If a ``FetcherSession`` holds a client that is not async
        """
        sid = request.sid if request.sid else self.default_session_id
        session = self.get(sid)

        if session:
            if sid in self._lazy_sessions and not session._is_alive:
                async with self._lazy_lock:
                    # Re-check under the lock: another task may have started it meanwhile
                    if not session._is_alive:
                        await session.__aenter__()

            if isinstance(session, FetcherSession):
                client = session._client

                if isinstance(client, _ASyncSessionLogic):
                    # Work on a copy: popping "method" from the request's own kwargs would make
                    # any later copy()/retry/pickle of this request silently fall back to GET.
                    call_kwargs = dict(request._session_kwargs)
                    method = cast(SUPPORTED_HTTP_METHODS, call_kwargs.pop("method", "GET"))
                    response = await client._make_request(
                        method=method,
                        url=request.url,
                        **call_kwargs,
                    )
                else:
                    # Sync session or other types - shouldn't happen in async context
                    raise TypeError(f"Session type {type(client)} not supported for async fetch")
            else:
                response = await session.fetch(url=request.url, **request._session_kwargs)

            response.request = request
            # Merge request meta into response meta (response meta takes priority)
            response.meta = {**request.meta, **response.meta}
            return response
        raise RuntimeError("No session found with the request session id")

    async def __aenter__(self) -> "SessionManager":
        await self.start()
        return self

    async def __aexit__(self, *exc) -> None:
        await self.close()

    def __contains__(self, session_id: str) -> bool:
        """Check if a session ID is registered."""
        return session_id in self._sessions

    def __len__(self) -> int:
        """Number of registered sessions."""
        return len(self._sessions)
|
spiders/spider.py
ADDED
|
@@ -0,0 +1,316 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import signal
|
| 2 |
+
import logging
|
| 3 |
+
from pathlib import Path
|
| 4 |
+
from abc import ABC, abstractmethod
|
| 5 |
+
|
| 6 |
+
import anyio
|
| 7 |
+
from anyio import Path as AsyncPath
|
| 8 |
+
|
| 9 |
+
from scrapling.spiders.request import Request
|
| 10 |
+
from scrapling.spiders.engine import CrawlerEngine
|
| 11 |
+
from scrapling.spiders.session import SessionManager
|
| 12 |
+
from scrapling.core.utils import set_logger, reset_logger
|
| 13 |
+
from scrapling.spiders.result import CrawlResult, CrawlStats
|
| 14 |
+
from scrapling.core._types import Set, Any, Dict, Optional, Union, TYPE_CHECKING, AsyncGenerator
|
| 15 |
+
|
| 16 |
+
BLOCKED_CODES = {401, 403, 407, 429, 444, 500, 502, 503, 504}
|
| 17 |
+
if TYPE_CHECKING:
|
| 18 |
+
from scrapling.engines.toolbelt.custom import Response
|
| 19 |
+
|
| 20 |
+
|
| 21 |
+
class LogCounterHandler(logging.Handler):
    """Logging handler that tallies the records it receives per standard level."""

    # Severity thresholds checked from highest to lowest; anything below INFO is
    # counted as DEBUG.
    _THRESHOLDS = (logging.CRITICAL, logging.ERROR, logging.WARNING, logging.INFO)

    # Public names used by `get_counts()`, paired with the level they report.
    _LABELS = (
        ("debug", logging.DEBUG),
        ("info", logging.INFO),
        ("warning", logging.WARNING),
        ("error", logging.ERROR),
        ("critical", logging.CRITICAL),
    )

    def __init__(self):
        super().__init__()
        # One counter per standard level, all starting at zero.
        self.counts = {level: 0 for _, level in self._LABELS}

    def emit(self, record: logging.LogRecord) -> None:
        """Increment the counter of the closest standard level at or below the record's level."""
        bucket = logging.DEBUG
        for threshold in self._THRESHOLDS:
            if record.levelno >= threshold:
                bucket = threshold
                break
        self.counts[bucket] += 1

    def get_counts(self) -> Dict[str, int]:
        """Return the counters keyed by lowercase level name instead of numeric level."""
        return {label: self.counts[level] for label, level in self._LABELS}
|
| 57 |
+
|
| 58 |
+
|
| 59 |
+
class SessionConfigurationError(Exception):
    """Signals that a spider's sessions could not be configured."""
|
| 63 |
+
|
| 64 |
+
|
| 65 |
+
class Spider(ABC):
    """An abstract base class for creating web spiders.

    Subclasses must define `name` and implement `parse()`. The class-level
    attributes below are the spider's settings; override them in subclasses.

    Check the documentation website for more information.
    """

    name: Optional[str] = None
    start_urls: list[str] = []
    allowed_domains: Set[str] = set()

    # Concurrency settings
    concurrent_requests: int = 4
    concurrent_requests_per_domain: int = 0
    download_delay: float = 0.0
    max_blocked_retries: int = 3

    # Fingerprint adjustments
    fp_include_kwargs: bool = False
    fp_keep_fragments: bool = False
    fp_include_headers: bool = False

    # Logging settings
    logging_level: int = logging.DEBUG
    logging_format: str = "[%(asctime)s]:({spider_name}) %(levelname)s: %(message)s"
    logging_date_format: str = "%Y-%m-%d %H:%M:%S"
    log_file: Optional[str] = None

    def __init__(self, crawldir: Optional[Union[str, Path, AsyncPath]] = None, interval: float = 300.0):
        """Initialize the spider.

        :param crawldir: Directory for checkpoint files. If provided, enables pause/resume.
        :param interval: Seconds between periodic checkpoint saves (default 5 minutes).
        :raises ValueError: If the spider class does not define `name`.
        :raises SessionConfigurationError: If `configure_sessions()` raises or registers no sessions.
        """
        if self.name is None:
            raise ValueError(f"{self.__class__.__name__} must have a name.")

        self.logger = logging.getLogger(f"scrapling.spiders.{self.name}")
        self.logger.setLevel(self.logging_level)
        # `logging.getLogger` returns the same object for the same name, so an earlier
        # spider with this name may have left handlers attached. Close them as they are
        # removed; merely clearing the list would leak any open log file.
        for stale_handler in list(self.logger.handlers):
            self.logger.removeHandler(stale_handler)
            stale_handler.close()
        self.logger.propagate = False  # Don't propagate to parent 'scrapling' logger

        formatter = logging.Formatter(
            fmt=self.logging_format.format(spider_name=self.name), datefmt=self.logging_date_format
        )

        # Add a log counter handler to track log counts by level
        self._log_counter = LogCounterHandler()
        self.logger.addHandler(self._log_counter)

        console_handler = logging.StreamHandler()
        console_handler.setFormatter(formatter)
        self.logger.addHandler(console_handler)

        if self.log_file:
            # Make sure the log file's directory exists before opening the file.
            Path(self.log_file).parent.mkdir(parents=True, exist_ok=True)
            file_handler = logging.FileHandler(self.log_file)
            file_handler.setFormatter(formatter)
            self.logger.addHandler(file_handler)

        self.crawldir: Optional[Path] = Path(crawldir) if crawldir else None
        self._interval = interval
        self._engine: Optional[CrawlerEngine] = None
        self._original_sigint_handler: Any = None

        self._session_manager = SessionManager()

        try:
            self.configure_sessions(self._session_manager)
        except Exception as e:
            raise SessionConfigurationError(f"Error in {self.__class__.__name__}.configure_sessions(): {e}") from e

        if len(self._session_manager) == 0:
            raise SessionConfigurationError(f"{self.__class__.__name__}.configure_sessions() did not add any sessions")

        self.logger.info("Spider initialized")

    async def start_requests(self) -> AsyncGenerator[Request, None]:
        """Generate initial requests to start the crawl.

        By default, this generates Request objects for each URL in `start_urls`
        using the session manager's default session and `parse()` as callback.

        Override this method for more control over initial requests
        (e.g., to add custom headers, use different callbacks, etc.)

        :raises RuntimeError: If `start_urls` is empty and this method was not overridden.
        """
        if not self.start_urls:
            raise RuntimeError(
                "Spider has no starting point, either set `start_urls` or override `start_requests` function."
            )

        for url in self.start_urls:
            yield Request(url, sid=self._session_manager.default_session_id)

    @abstractmethod
    async def parse(self, response: "Response") -> AsyncGenerator[Dict[str, Any] | Request | None, None]:
        """Default callback for processing responses"""
        raise NotImplementedError(f"{self.__class__.__name__} must implement parse() method")
        yield  # Make this a generator for type checkers

    async def on_start(self, resuming: bool = False) -> None:
        """Called before crawling starts. Override for setup logic.

        :param resuming: It's enabled if the spider is resuming from a checkpoint, left for the user to use.
        """
        if resuming:
            self.logger.debug("Resuming spider from checkpoint")
        else:
            self.logger.debug("Starting spider")

    async def on_close(self) -> None:
        """Called after crawling finishes. Override for cleanup logic."""
        self.logger.debug("Spider closed")

    async def on_error(self, request: Request, error: Exception) -> None:
        """
        Handle request errors for all spider requests.

        Override for custom error handling.
        """
        pass

    async def on_scraped_item(self, item: Dict[str, Any]) -> Dict[str, Any] | None:
        """A hook to be overridden by users to do some processing on scraped items, return `None` to drop the item silently."""
        return item

    async def is_blocked(self, response: "Response") -> bool:
        """Check if the response is blocked. Users should override this for custom detection logic.

        The default treats any status code listed in `BLOCKED_CODES` as blocked.
        """
        return response.status in BLOCKED_CODES

    async def retry_blocked_request(self, request: Request, response: "Response") -> Request:
        """Users should override this to prepare the blocked request before retrying, if needed."""
        return request

    def __repr__(self) -> str:
        """String representation of the spider."""
        return f"<{self.__class__.__name__} '{self.name}'>"

    def configure_sessions(self, manager: SessionManager) -> None:
        """Configure sessions for this spider.

        Override this method to add custom sessions.
        The default implementation creates a FetcherSession session.

        The first session added becomes the default for `start_requests()` unless specified otherwise.

        :param manager: SessionManager to configure
        """
        # Imported here rather than at module level, as in the original implementation.
        from scrapling.fetchers import FetcherSession

        manager.add("default", FetcherSession())

    def pause(self):
        """Request graceful shutdown of the crawling process.

        :raises RuntimeError: If no crawl is currently running.
        """
        if self._engine:
            self._engine.request_pause()
        else:
            raise RuntimeError("No active crawl to stop")

    def _setup_signal_handler(self) -> None:
        """Set up SIGINT handler for graceful pause."""

        def handler(_signum: int, _frame: Any) -> None:
            if self._engine:
                self._engine.request_pause()
            else:
                # No engine yet, just raise KeyboardInterrupt
                raise KeyboardInterrupt

        try:
            self._original_sigint_handler = signal.signal(signal.SIGINT, handler)
        except ValueError:
            # `signal.signal` raises ValueError outside the main thread; run without the hook.
            self._original_sigint_handler = None

    def _restore_signal_handler(self) -> None:
        """Restore original SIGINT handler."""
        if self._original_sigint_handler is not None:
            try:
                signal.signal(signal.SIGINT, self._original_sigint_handler)
            except ValueError:
                pass

    def _close_file_handlers(self) -> None:
        """Close any file handlers on the spider logger to release file resources."""
        if not self.log_file:
            return
        for handler in self.logger.handlers:
            if isinstance(handler, logging.FileHandler):
                handler.close()

    async def __run(self) -> CrawlResult:
        """Run one full crawl through a fresh engine and package its stats and items."""
        token = set_logger(self.logger)
        try:
            self._engine = CrawlerEngine(self, self._session_manager, self.crawldir, self._interval)
            stats = await self._engine.crawl()
            paused = self._engine.paused
            return CrawlResult(stats=stats, items=self._engine.items, paused=paused)
        finally:
            self._engine = None
            reset_logger(token)
            self._close_file_handlers()

    def start(self, use_uvloop: bool = False, **backend_options: Any) -> CrawlResult:
        """Run the spider and return results.

        This is the main entry point for running a spider.
        Handles async execution internally via anyio.

        Pressing Ctrl+C will initiate graceful shutdown (waits for active tasks to complete).
        Pressing Ctrl+C a second time will force immediate stop.

        If crawldir is set, a checkpoint will also be saved on graceful shutdown,
        allowing you to resume the crawl later by running the spider again.

        :param use_uvloop: Whether to use the faster uvloop/winloop event loop implementation, if available.
        :param backend_options: Asyncio backend options to be used with `anyio.run`
        :return: The crawl result produced by the run.
        """
        if use_uvloop:
            backend_options["use_uvloop"] = True

        # Set up SIGINT handler for graceful shutdown
        self._setup_signal_handler()
        try:
            return anyio.run(self.__run, backend="asyncio", backend_options=backend_options)
        finally:
            self._restore_signal_handler()

    async def stream(self) -> AsyncGenerator[Dict[str, Any], None]:
        """Stream items as they're scraped. Ideal for long-running spiders or building applications on top of the spiders.

        Must be called from an async context. Yields items one by one as they are scraped.
        Access `spider.stats` during iteration for real-time statistics.

        Note: SIGINT handling for pause/resume is not available in stream mode.
        """
        token = set_logger(self.logger)
        try:
            self._engine = CrawlerEngine(self, self._session_manager, self.crawldir, self._interval)
            async for item in self._engine:
                yield item
        finally:
            self._engine = None
            reset_logger(token)
            self._close_file_handlers()

    @property
    def stats(self) -> CrawlStats:
        """Access current crawl stats (works during streaming).

        :raises RuntimeError: If no crawl is currently active.
        """
        if self._engine:
            return self._engine.stats
        raise RuntimeError("No active crawl. Use this property inside `async for item in spider.stream():`")
|
ui.py
ADDED
|
@@ -0,0 +1,57 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import gradio as gr
|
| 2 |
+
from scrapling.core.ai import ScraplingMCPServer
|
| 3 |
+
import asyncio
|
| 4 |
+
from typing import Any
|
| 5 |
+
|
| 6 |
+
def create_ui():
    """Build the Gradio web interface for Scrapling.

    The interface has two tabs: a plain HTTP fetch and a stealthy browser fetch.
    Each tab forwards the form values to the matching `ScraplingMCPServer` call
    and shows the result, or an `{"error": ...}` payload, as JSON.

    :return: The constructed `gr.Blocks` app (not launched).
    """
    with gr.Blocks(title="Scrapling") as demo:
        gr.Markdown("# Scrapling Web Interface")

        with gr.Tab("Fetch (HTTP)"):
            gr.Markdown("Standard HTTP Fetcher. Fast but less stealthy.")
            url_input = gr.Textbox(label="URL", placeholder="https://example.com")
            selector_input = gr.Textbox(label="CSS Selector (Optional)", placeholder=".content")
            output = gr.JSON(label="Result")
            fetch_btn = gr.Button("Fetch")

            # Deliberately a plain (non-async) function: `ScraplingMCPServer.get` is called
            # without `await`, so an `async def` handler would hold the event loop for the
            # whole request. Gradio runs synchronous handlers in a worker thread instead.
            def fetch_wrapper(url, selector):
                """Fetch `url` over HTTP and return the result as a dict, or an error dict."""
                if not url:
                    return {"error": "URL is required"}
                try:
                    result = ScraplingMCPServer.get(url, css_selector=selector if selector else None)
                    return result.model_dump()
                except Exception as e:
                    # Show any failure in the JSON output instead of crashing the handler.
                    return {"error": str(e)}

            fetch_btn.click(fetch_wrapper, inputs=[url_input, selector_input], outputs=output)

        with gr.Tab("Stealthy Fetch (Browser)"):
            gr.Markdown("Stealthy Browser Fetcher (Playwright). Slower but bypasses bot protection.")
            s_url_input = gr.Textbox(label="URL")
            s_selector_input = gr.Textbox(label="CSS Selector (Optional)")
            s_headless = gr.Checkbox(label="Headless", value=True)
            s_output = gr.JSON(label="Result")
            s_fetch_btn = gr.Button("Stealthy Fetch")

            async def stealthy_fetch_wrapper(url, selector, headless):
                """Fetch `url` with the stealthy browser fetcher and return a dict, or an error dict."""
                if not url:
                    return {"error": "URL is required"}
                try:
                    result = await ScraplingMCPServer.stealthy_fetch(
                        url,
                        css_selector=selector if selector else None,
                        headless=headless,
                    )
                    return result.model_dump()
                except Exception as e:
                    # Show any failure in the JSON output instead of crashing the handler.
                    return {"error": str(e)}

            s_fetch_btn.click(stealthy_fetch_wrapper, inputs=[s_url_input, s_selector_input, s_headless], outputs=s_output)

    return demo
|