Karim shoair committed on
Commit ·
0e7f15c
1
Parent(s): 33c6b04
fix(shell): dynamically build the signature of shortcuts after last changes
Browse files- scrapling/core/_shell_signatures.py +95 -0
- scrapling/core/shell.py +48 -6
scrapling/core/_shell_signatures.py
ADDED
|
@@ -0,0 +1,95 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
from scrapling.core._types import (
|
| 2 |
+
Dict,
|
| 3 |
+
Any,
|
| 4 |
+
List,
|
| 5 |
+
Tuple,
|
| 6 |
+
Optional,
|
| 7 |
+
)
|
| 8 |
+
|
| 9 |
+
# Parameter definitions for shell function signatures (defined once at module level)
|
| 10 |
+
# Mirrors TypedDict definitions from _types.py but runtime-accessible for IPython introspection
|
| 11 |
+
_REQUESTS_PARAMS = {
|
| 12 |
+
"params": Optional[Dict | List | Tuple],
|
| 13 |
+
"cookies": Any,
|
| 14 |
+
"auth": Optional[Tuple[str, str]],
|
| 15 |
+
"impersonate": Any,
|
| 16 |
+
"http3": Optional[bool],
|
| 17 |
+
"stealthy_headers": Optional[bool],
|
| 18 |
+
"proxies": Any,
|
| 19 |
+
"proxy": Optional[str],
|
| 20 |
+
"proxy_auth": Optional[Tuple[str, str]],
|
| 21 |
+
"timeout": Optional[int | float],
|
| 22 |
+
"headers": Any,
|
| 23 |
+
"retries": Optional[int],
|
| 24 |
+
"retry_delay": Optional[int],
|
| 25 |
+
"follow_redirects": Optional[bool],
|
| 26 |
+
"max_redirects": Optional[int],
|
| 27 |
+
"verify": Optional[bool],
|
| 28 |
+
"cert": Optional[str | Tuple[str, str]],
|
| 29 |
+
"selector_config": Optional[Dict],
|
| 30 |
+
}
|
| 31 |
+
|
| 32 |
+
_FETCH_PARAMS = {
|
| 33 |
+
"headless": bool,
|
| 34 |
+
"google_search": bool,
|
| 35 |
+
"hide_canvas": bool,
|
| 36 |
+
"disable_webgl": bool,
|
| 37 |
+
"real_chrome": bool,
|
| 38 |
+
"stealth": bool,
|
| 39 |
+
"wait": int | float,
|
| 40 |
+
"page_action": Optional[Any],
|
| 41 |
+
"proxy": Optional[str | Dict],
|
| 42 |
+
"locale": str,
|
| 43 |
+
"extra_headers": Optional[Dict[str, str]],
|
| 44 |
+
"useragent": Optional[str],
|
| 45 |
+
"cdp_url": Optional[str],
|
| 46 |
+
"timeout": int | float,
|
| 47 |
+
"disable_resources": bool,
|
| 48 |
+
"wait_selector": Optional[str],
|
| 49 |
+
"init_script": Optional[str],
|
| 50 |
+
"cookies": Optional[List[Dict]],
|
| 51 |
+
"network_idle": bool,
|
| 52 |
+
"load_dom": bool,
|
| 53 |
+
"wait_selector_state": Any,
|
| 54 |
+
"extra_flags": Optional[List[str]],
|
| 55 |
+
"additional_args": Optional[Dict],
|
| 56 |
+
"custom_config": Optional[Dict],
|
| 57 |
+
}
|
| 58 |
+
|
| 59 |
+
_STEALTHY_FETCH_PARAMS = {
|
| 60 |
+
"headless": bool,
|
| 61 |
+
"block_images": bool,
|
| 62 |
+
"disable_resources": bool,
|
| 63 |
+
"block_webrtc": bool,
|
| 64 |
+
"allow_webgl": bool,
|
| 65 |
+
"network_idle": bool,
|
| 66 |
+
"load_dom": bool,
|
| 67 |
+
"humanize": bool | float,
|
| 68 |
+
"solve_cloudflare": bool,
|
| 69 |
+
"wait": int | float,
|
| 70 |
+
"timeout": int | float,
|
| 71 |
+
"page_action": Optional[Any],
|
| 72 |
+
"wait_selector": Optional[str],
|
| 73 |
+
"init_script": Optional[str],
|
| 74 |
+
"addons": Optional[List[str]],
|
| 75 |
+
"wait_selector_state": Any,
|
| 76 |
+
"cookies": Optional[List[Dict]],
|
| 77 |
+
"google_search": bool,
|
| 78 |
+
"extra_headers": Optional[Dict[str, str]],
|
| 79 |
+
"proxy": Optional[str | Dict],
|
| 80 |
+
"os_randomize": bool,
|
| 81 |
+
"disable_ads": bool,
|
| 82 |
+
"geoip": bool,
|
| 83 |
+
"custom_config": Optional[Dict],
|
| 84 |
+
"additional_args": Optional[Dict],
|
| 85 |
+
}
|
| 86 |
+
|
| 87 |
+
# Mapping of function names to their parameter definitions
|
| 88 |
+
Signatures_map = {
|
| 89 |
+
"get": _REQUESTS_PARAMS,
|
| 90 |
+
"post": {**_REQUESTS_PARAMS, "data": Optional[Dict | str], "json": Optional[Dict | List]},
|
| 91 |
+
"put": {**_REQUESTS_PARAMS, "data": Optional[Dict | str], "json": Optional[Dict | List]},
|
| 92 |
+
"delete": _REQUESTS_PARAMS,
|
| 93 |
+
"fetch": _FETCH_PARAMS,
|
| 94 |
+
"stealthy_fetch": _STEALTHY_FETCH_PARAMS,
|
| 95 |
+
}
|
scrapling/core/shell.py
CHANGED
|
@@ -1,13 +1,14 @@
|
|
| 1 |
# -*- coding: utf-8 -*-
|
| 2 |
-
from re import sub as re_sub
|
| 3 |
from sys import stderr
|
| 4 |
from functools import wraps
|
|
|
|
| 5 |
from collections import namedtuple
|
| 6 |
from shlex import split as shlex_split
|
|
|
|
| 7 |
from tempfile import mkstemp as make_temp_file
|
| 8 |
-
from urllib.parse import urlparse, urlunparse, parse_qsl
|
| 9 |
from argparse import ArgumentParser, SUPPRESS
|
| 10 |
from webbrowser import open as open_in_browser
|
|
|
|
| 11 |
from logging import (
|
| 12 |
DEBUG,
|
| 13 |
INFO,
|
|
@@ -21,6 +22,7 @@ from logging import (
|
|
| 21 |
|
| 22 |
from orjson import loads as json_loads, JSONDecodeError
|
| 23 |
|
|
|
|
| 24 |
from scrapling import __version__
|
| 25 |
from scrapling.core.utils import log
|
| 26 |
from scrapling.parser import Selector, Selectors
|
|
@@ -28,12 +30,12 @@ from scrapling.core.custom_types import TextHandler
|
|
| 28 |
from scrapling.engines.toolbelt.custom import Response
|
| 29 |
from scrapling.core.utils._shell import _ParseHeaders, _CookieParser
|
| 30 |
from scrapling.core._types import (
|
| 31 |
-
Optional,
|
| 32 |
Dict,
|
| 33 |
Any,
|
| 34 |
cast,
|
| 35 |
-
|
| 36 |
Generator,
|
|
|
|
| 37 |
)
|
| 38 |
|
| 39 |
|
|
@@ -312,6 +314,40 @@ class CurlParser:
|
|
| 312 |
return None
|
| 313 |
|
| 314 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 315 |
def show_page_in_browser(page: Selector): # pragma: no cover
|
| 316 |
if not page or not isinstance(page, Selector):
|
| 317 |
log.error("Input must be of type `Selector`")
|
|
@@ -431,7 +467,7 @@ Type 'exit' or press Ctrl+D to exit.
|
|
| 431 |
|
| 432 |
return result
|
| 433 |
|
| 434 |
-
def create_wrapper(self, func):
|
| 435 |
"""Create a wrapper that preserves function signature but updates page"""
|
| 436 |
|
| 437 |
@wraps(func)
|
|
@@ -439,6 +475,12 @@ Type 'exit' or press Ctrl+D to exit.
|
|
| 439 |
result = func(*args, **kwargs)
|
| 440 |
return self.update_page(result)
|
| 441 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 442 |
return wrapper
|
| 443 |
|
| 444 |
def get_namespace(self):
|
|
@@ -451,7 +493,7 @@ Type 'exit' or press Ctrl+D to exit.
|
|
| 451 |
delete = self.create_wrapper(self.__Fetcher.delete)
|
| 452 |
dynamic_fetch = self.create_wrapper(self.__DynamicFetcher.fetch)
|
| 453 |
stealthy_fetch = self.create_wrapper(self.__StealthyFetcher.fetch)
|
| 454 |
-
curl2fetcher = self.create_wrapper(self._curl_parser.convert2fetcher)
|
| 455 |
|
| 456 |
# Create the namespace dictionary
|
| 457 |
return {
|
|
|
|
| 1 |
# -*- coding: utf-8 -*-
|
|
|
|
| 2 |
from sys import stderr
|
| 3 |
from functools import wraps
|
| 4 |
+
from re import sub as re_sub
|
| 5 |
from collections import namedtuple
|
| 6 |
from shlex import split as shlex_split
|
| 7 |
+
from inspect import signature, Parameter
|
| 8 |
from tempfile import mkstemp as make_temp_file
|
|
|
|
| 9 |
from argparse import ArgumentParser, SUPPRESS
|
| 10 |
from webbrowser import open as open_in_browser
|
| 11 |
+
from urllib.parse import urlparse, urlunparse, parse_qsl
|
| 12 |
from logging import (
|
| 13 |
DEBUG,
|
| 14 |
INFO,
|
|
|
|
| 22 |
|
| 23 |
from orjson import loads as json_loads, JSONDecodeError
|
| 24 |
|
| 25 |
+
from ._shell_signatures import Signatures_map
|
| 26 |
from scrapling import __version__
|
| 27 |
from scrapling.core.utils import log
|
| 28 |
from scrapling.parser import Selector, Selectors
|
|
|
|
| 30 |
from scrapling.engines.toolbelt.custom import Response
|
| 31 |
from scrapling.core.utils._shell import _ParseHeaders, _CookieParser
|
| 32 |
from scrapling.core._types import (
|
|
|
|
| 33 |
Dict,
|
| 34 |
Any,
|
| 35 |
cast,
|
| 36 |
+
Optional,
|
| 37 |
Generator,
|
| 38 |
+
extraction_types,
|
| 39 |
)
|
| 40 |
|
| 41 |
|
|
|
|
| 314 |
return None
|
| 315 |
|
| 316 |
|
| 317 |
+
def _unpack_signature(func):
|
| 318 |
+
"""
|
| 319 |
+
Unpack TypedDict from Unpack[TypedDict] annotations in **kwargs and reconstruct the signature.
|
| 320 |
+
|
| 321 |
+
This allows the interactive shell to show individual parameters instead of just **kwargs, similar to how IDEs display them.
|
| 322 |
+
"""
|
| 323 |
+
try:
|
| 324 |
+
sig = signature(func)
|
| 325 |
+
func_name = getattr(func, "__name__", None)
|
| 326 |
+
|
| 327 |
+
# Check if this function has known parameters
|
| 328 |
+
if func_name not in Signatures_map:
|
| 329 |
+
return sig
|
| 330 |
+
|
| 331 |
+
new_params = []
|
| 332 |
+
for param in sig.parameters.values():
|
| 333 |
+
if param.kind == Parameter.VAR_KEYWORD:
|
| 334 |
+
# Replace **kwargs with individual keyword-only parameters
|
| 335 |
+
for field_name, field_type in Signatures_map[func_name].items():
|
| 336 |
+
new_params.append(
|
| 337 |
+
Parameter(field_name, Parameter.KEYWORD_ONLY, default=Parameter.empty, annotation=field_type)
|
| 338 |
+
)
|
| 339 |
+
else:
|
| 340 |
+
new_params.append(param)
|
| 341 |
+
|
| 342 |
+
# Reconstruct signature with unpacked parameters
|
| 343 |
+
if len(new_params) != len(sig.parameters):
|
| 344 |
+
return sig.replace(parameters=new_params)
|
| 345 |
+
return sig
|
| 346 |
+
|
| 347 |
+
except Exception: # pragma: no cover
|
| 348 |
+
return signature(func)
|
| 349 |
+
|
| 350 |
+
|
| 351 |
def show_page_in_browser(page: Selector): # pragma: no cover
|
| 352 |
if not page or not isinstance(page, Selector):
|
| 353 |
log.error("Input must be of type `Selector`")
|
|
|
|
| 467 |
|
| 468 |
return result
|
| 469 |
|
| 470 |
+
def create_wrapper(self, func, get_signature=True):
|
| 471 |
"""Create a wrapper that preserves function signature but updates page"""
|
| 472 |
|
| 473 |
@wraps(func)
|
|
|
|
| 475 |
result = func(*args, **kwargs)
|
| 476 |
return self.update_page(result)
|
| 477 |
|
| 478 |
+
if get_signature:
|
| 479 |
+
# Explicitly preserve and unpack signature for IPython introspection and autocompletion
|
| 480 |
+
wrapper.__signature__ = _unpack_signature(func) # pyright: ignore
|
| 481 |
+
else:
|
| 482 |
+
wrapper.__signature__ = signature(func) # pyright: ignore
|
| 483 |
+
|
| 484 |
return wrapper
|
| 485 |
|
| 486 |
def get_namespace(self):
|
|
|
|
| 493 |
delete = self.create_wrapper(self.__Fetcher.delete)
|
| 494 |
dynamic_fetch = self.create_wrapper(self.__DynamicFetcher.fetch)
|
| 495 |
stealthy_fetch = self.create_wrapper(self.__StealthyFetcher.fetch)
|
| 496 |
+
curl2fetcher = self.create_wrapper(self._curl_parser.convert2fetcher, get_signature=False)
|
| 497 |
|
| 498 |
# Create the namespace dictionary
|
| 499 |
return {
|