style: Fix all mypy errors and add type hints to untyped function bodies
Resolved all 65 mypy errors across 14 files and added type annotations to all previously untyped function bodies. Final result: 0 errors with --check-untyped-defs enabled, all 454 tests pass.
`scrapling/core/_types.py`
- Removed broken Self = object fallback — now requires typing_extensions for Python < 3.11
`scrapling/core/storage.py`
- Fixed str/bytes mismatch in _get_hash() — used separate _identifier_bytes variable instead of reassigning from str to bytes
`scrapling/core/custom_types.py`
- split() return type: Union[List, "TextHandlers"] → list[Any] (avoids LSP violation with parent list[str])
- format() kwargs: **kwargs: str → **kwargs: object (matches parent str.format signature)
- AttributesHandler.__init__: Added mapping: Any = None, **kwargs: Any and -> None
- json_string property: Added -> bytes return type
`scrapling/core/mixins.py`
- Changed self: "Selector" to self: Any on all mixin methods (mypy can't handle forward-reference self types on non-subclass mixins)
- Added Dict[str, int] annotation for counter variable
- Removed unused TYPE_CHECKING / Selector imports
`scrapling/parser.py` (~30 errors)
- Added body: str | bytes pre-annotation for dual-type if/elif assignment
- Used Dict[str, Any] kwargs dict for HTMLParser(...) to bypass incomplete lxml stubs missing default_doctype
- Changed base_url=url or None → base_url=url or "" (avoids str | None vs str | bytes)
- bool(adaptive) to guarantee bool type for __adaptive_enabled
- Declared __text: Optional[TextHandler], __tag: Optional[str], __attributes: Optional[AttributesHandler] at top of __init__
- cast(List, ...) for all XPath() call results (_find_all_elements, _find_all_elements_with_spaces)
- Added Dict[float, List[Any]] for score_table, Dict[str, Any] for attributes
- Changed score, checks = 0, 0 → score: float = 0; checks: int = 0 (two locations)
- Renamed target → target_element in save() to avoid variable redefinition with different types
- Wrapped node_text.clean() / .lower() in TextHandler(...) to preserve type
`scrapling/engines/_browsers/_page.py`
- Added PageInfo[SyncPage] | PageInfo[AsyncPage] union type annotation to page_info variable
`scrapling/engines/_browsers/_validators.py`
- Convert method_kwargs (TypedDict) to plain Dict[str, Any] before dynamic key access
`scrapling/engines/_browsers/_base.py`
- Added _config declaration to BaseSessionMixin
- Used cast(StealthConfig, self._config) in __generate_stealth_options to access stealth-only attributes
- Added Tuple[str, ...] annotation for flags
- Removed redundant narrower StealthConfig type annotation on self._config in StealthySessionMixin.__validate__
- Widened SyncSession and AsyncSession fields (playwright, context, browser) to Any to support both playwright and patchright types
- Added -> None to both start() methods
`scrapling/engines/_browsers/_stealth.py`
- Added Optional, ProxyType imports
- Annotated proxy: Optional[ProxyType] in both sync/async fetch loops
- Annotated outer_box: Any at first declaration, removed duplicate type annotations in subsequent branches
- Added -> None to sync and async start()
- Added config: Any parameter type to _initialize_context
- Removed redundant self.context: AsyncBrowserContext re-annotations in conditional branches
`scrapling/engines/_browsers/_controllers.py`
- Added Optional, ProxyType imports
- Annotated proxy: Optional[ProxyType] in both sync/async fetch loops
- Added -> None to async start()
- Removed redundant self.context: AsyncBrowserContext re-annotations
`scrapling/spiders/request.py`
- Added Optional import, typed _fp: Optional[bytes] = None
- Removed redundant body: bytes re-annotation
`scrapling/spiders/session.py`
- Used separate client variable instead of reassigning session = session._client (avoids type incompatibility and fixes a bug where session._make_request was called instead of client._make_request)
- Added -> None to SessionManager.__init__
`scrapling/engines/toolbelt/convertor.py`
- Added list[Response] annotation for history in both sync/async methods
`scrapling/engines/static.py`
- FetcherClient.__init__ and AsyncFetcherClient.__init__: Added **kwargs: Any and -> None
`scrapling/core/shell.py`
- Wrapped re_sub(...) result in TextHandler(...) to maintain correct type
- Added -> None to CurlParser.__init__
- Added full type signature to create_wrapper, replaced wrapper.__signature__ = ... with setattr(wrapper, "__signature__", ...) to satisfy mypy
- Added Callable to imports
- scrapling/core/_types.py +1 -21
- scrapling/core/custom_types.py +5 -7
- scrapling/core/mixins.py +11 -10
- scrapling/core/shell.py +9 -6
- scrapling/core/storage.py +4 -5
- scrapling/engines/_browsers/_base.py +19 -19
- scrapling/engines/_browsers/_controllers.py +6 -7
- scrapling/engines/_browsers/_page.py +3 -1
- scrapling/engines/_browsers/_stealth.py +14 -14
- scrapling/engines/_browsers/_validators.py +5 -4
- scrapling/engines/static.py +2 -2
- scrapling/engines/toolbelt/convertor.py +4 -2
- scrapling/parser.py +42 -43
- scrapling/spiders/request.py +3 -3
- scrapling/spiders/session.py +5 -5
|
@@ -32,6 +32,7 @@ from typing import (
|
|
| 32 |
Coroutine,
|
| 33 |
SupportsIndex,
|
| 34 |
)
|
|
|
|
| 35 |
|
| 36 |
# Proxy can be a string URL or a dict (Playwright format: {"server": "...", "username": "...", "password": "..."})
|
| 37 |
ProxyType = Union[str, Dict[str, str]]
|
|
@@ -41,27 +42,6 @@ PageLoadStates = Literal["commit", "domcontentloaded", "load", "networkidle"]
|
|
| 41 |
extraction_types = Literal["text", "html", "markdown"]
|
| 42 |
StrOrBytes = Union[str, bytes]
|
| 43 |
|
| 44 |
-
if TYPE_CHECKING: # pragma: no cover
|
| 45 |
-
from typing_extensions import Unpack
|
| 46 |
-
else: # pragma: no cover
|
| 47 |
-
|
| 48 |
-
class _Unpack:
|
| 49 |
-
@staticmethod
|
| 50 |
-
def __getitem__(*args, **kwargs):
|
| 51 |
-
pass
|
| 52 |
-
|
| 53 |
-
Unpack = _Unpack()
|
| 54 |
-
|
| 55 |
-
|
| 56 |
-
try:
|
| 57 |
-
# Python 3.11+
|
| 58 |
-
from typing import Self # novermin
|
| 59 |
-
except ImportError: # pragma: no cover
|
| 60 |
-
try:
|
| 61 |
-
from typing_extensions import Self # Backport
|
| 62 |
-
except ImportError:
|
| 63 |
-
Self = object
|
| 64 |
-
|
| 65 |
|
| 66 |
# Copied from `playwright._impl._api_structures.SetCookieParam`
|
| 67 |
class SetCookieParam(TypedDict, total=False):
|
|
|
|
| 32 |
Coroutine,
|
| 33 |
SupportsIndex,
|
| 34 |
)
|
| 35 |
+
from typing_extensions import Self, Unpack
|
| 36 |
|
| 37 |
# Proxy can be a string URL or a dict (Playwright format: {"server": "...", "username": "...", "password": "..."})
|
| 38 |
ProxyType = Union[str, Dict[str, str]]
|
|
|
|
| 42 |
extraction_types = Literal["text", "html", "markdown"]
|
| 43 |
StrOrBytes = Union[str, bytes]
|
| 44 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 45 |
|
| 46 |
# Copied from `playwright._impl._api_structures.SetCookieParam`
|
| 47 |
class SetCookieParam(TypedDict, total=False):
|
|
@@ -35,9 +35,7 @@ class TextHandler(str):
|
|
| 35 |
lst = super().__getitem__(key)
|
| 36 |
return TextHandler(lst)
|
| 37 |
|
| 38 |
-
def split(
|
| 39 |
-
self, sep: str | None = None, maxsplit: SupportsIndex = -1
|
| 40 |
-
) -> Union[List, "TextHandlers"]: # pragma: no cover
|
| 41 |
return TextHandlers([TextHandler(s) for s in super().split(sep, maxsplit)])
|
| 42 |
|
| 43 |
def strip(self, chars: str | None = None) -> Union[str, "TextHandler"]: # pragma: no cover
|
|
@@ -61,7 +59,7 @@ class TextHandler(str):
|
|
| 61 |
def expandtabs(self, tabsize: SupportsIndex = 8) -> Union[str, "TextHandler"]: # pragma: no cover
|
| 62 |
return TextHandler(super().expandtabs(tabsize))
|
| 63 |
|
| 64 |
-
def format(self, *args: object, **kwargs:
|
| 65 |
return TextHandler(super().format(*args, **kwargs))
|
| 66 |
|
| 67 |
def format_map(self, mapping) -> Union[str, "TextHandler"]: # pragma: no cover
|
|
@@ -291,7 +289,7 @@ class AttributesHandler(Mapping[str, _TextHandlerType]):
|
|
| 291 |
|
| 292 |
__slots__ = ("_data",)
|
| 293 |
|
| 294 |
-
def __init__(self, mapping=None, **kwargs):
|
| 295 |
mapping = (
|
| 296 |
{key: TextHandler(value) if isinstance(value, str) else value for key, value in mapping.items()}
|
| 297 |
if mapping is not None
|
|
@@ -324,8 +322,8 @@ class AttributesHandler(Mapping[str, _TextHandlerType]):
|
|
| 324 |
yield AttributesHandler({key: value})
|
| 325 |
|
| 326 |
@property
|
| 327 |
-
def json_string(self):
|
| 328 |
-
"""Convert current attributes to JSON
|
| 329 |
return dumps(dict(self._data))
|
| 330 |
|
| 331 |
def __getitem__(self, key: str) -> _TextHandlerType:
|
|
|
|
| 35 |
lst = super().__getitem__(key)
|
| 36 |
return TextHandler(lst)
|
| 37 |
|
| 38 |
+
def split(self, sep: str | None = None, maxsplit: SupportsIndex = -1) -> list[Any]: # pragma: no cover
|
|
|
|
|
|
|
| 39 |
return TextHandlers([TextHandler(s) for s in super().split(sep, maxsplit)])
|
| 40 |
|
| 41 |
def strip(self, chars: str | None = None) -> Union[str, "TextHandler"]: # pragma: no cover
|
|
|
|
| 59 |
def expandtabs(self, tabsize: SupportsIndex = 8) -> Union[str, "TextHandler"]: # pragma: no cover
|
| 60 |
return TextHandler(super().expandtabs(tabsize))
|
| 61 |
|
| 62 |
+
def format(self, *args: object, **kwargs: object) -> Union[str, "TextHandler"]: # pragma: no cover
|
| 63 |
return TextHandler(super().format(*args, **kwargs))
|
| 64 |
|
| 65 |
def format_map(self, mapping) -> Union[str, "TextHandler"]: # pragma: no cover
|
|
|
|
| 289 |
|
| 290 |
__slots__ = ("_data",)
|
| 291 |
|
| 292 |
+
def __init__(self, mapping: Any = None, **kwargs: Any) -> None:
|
| 293 |
mapping = (
|
| 294 |
{key: TextHandler(value) if isinstance(value, str) else value for key, value in mapping.items()}
|
| 295 |
if mapping is not None
|
|
|
|
| 322 |
yield AttributesHandler({key: value})
|
| 323 |
|
| 324 |
@property
|
| 325 |
+
def json_string(self) -> bytes:
|
| 326 |
+
"""Convert current attributes to JSON bytes if the attributes are JSON serializable otherwise throws error"""
|
| 327 |
return dumps(dict(self._data))
|
| 328 |
|
| 329 |
def __getitem__(self, key: str) -> _TextHandlerType:
|
|
@@ -1,7 +1,4 @@
|
|
| 1 |
-
from scrapling.core._types import
|
| 2 |
-
|
| 3 |
-
if TYPE_CHECKING:
|
| 4 |
-
from scrapling.parser import Selector
|
| 5 |
|
| 6 |
|
| 7 |
class SelectorsGeneration:
|
|
@@ -11,7 +8,11 @@ class SelectorsGeneration:
|
|
| 11 |
Inspiration: https://searchfox.org/mozilla-central/source/devtools/shared/inspector/css-logic.js#591
|
| 12 |
"""
|
| 13 |
|
| 14 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
| 15 |
"""Generate a selector for the current element.
|
| 16 |
:return: A string of the generated selector.
|
| 17 |
"""
|
|
@@ -36,7 +37,7 @@ class SelectorsGeneration:
|
|
| 36 |
# if classes and css:
|
| 37 |
# part += f".{'.'.join(classes)}"
|
| 38 |
# else:
|
| 39 |
-
counter = {}
|
| 40 |
for child in target.parent.children:
|
| 41 |
counter.setdefault(child.tag, 0)
|
| 42 |
counter[child.tag] += 1
|
|
@@ -56,28 +57,28 @@ class SelectorsGeneration:
|
|
| 56 |
return " > ".join(reversed(selectorPath)) if css else "//" + "/".join(reversed(selectorPath))
|
| 57 |
|
| 58 |
@property
|
| 59 |
-
def generate_css_selector(self:
|
| 60 |
"""Generate a CSS selector for the current element
|
| 61 |
:return: A string of the generated selector.
|
| 62 |
"""
|
| 63 |
return self._general_selection()
|
| 64 |
|
| 65 |
@property
|
| 66 |
-
def generate_full_css_selector(self:
|
| 67 |
"""Generate a complete CSS selector for the current element
|
| 68 |
:return: A string of the generated selector.
|
| 69 |
"""
|
| 70 |
return self._general_selection(full_path=True)
|
| 71 |
|
| 72 |
@property
|
| 73 |
-
def generate_xpath_selector(self:
|
| 74 |
"""Generate an XPath selector for the current element
|
| 75 |
:return: A string of the generated selector.
|
| 76 |
"""
|
| 77 |
return self._general_selection("xpath")
|
| 78 |
|
| 79 |
@property
|
| 80 |
-
def generate_full_xpath_selector(self:
|
| 81 |
"""Generate a complete XPath selector for the current element
|
| 82 |
:return: A string of the generated selector.
|
| 83 |
"""
|
|
|
|
| 1 |
+
from scrapling.core._types import Any, Dict
|
|
|
|
|
|
|
|
|
|
| 2 |
|
| 3 |
|
| 4 |
class SelectorsGeneration:
|
|
|
|
| 8 |
Inspiration: https://searchfox.org/mozilla-central/source/devtools/shared/inspector/css-logic.js#591
|
| 9 |
"""
|
| 10 |
|
| 11 |
+
# Note: This is a mixin class meant to be used with Selector.
|
| 12 |
+
# The methods access Selector attributes (._root, .parent, .attrib, .tag, etc.)
|
| 13 |
+
# through self, which will be a Selector instance at runtime.
|
| 14 |
+
|
| 15 |
+
def _general_selection(self: Any, selection: str = "css", full_path: bool = False) -> str:
|
| 16 |
"""Generate a selector for the current element.
|
| 17 |
:return: A string of the generated selector.
|
| 18 |
"""
|
|
|
|
| 37 |
# if classes and css:
|
| 38 |
# part += f".{'.'.join(classes)}"
|
| 39 |
# else:
|
| 40 |
+
counter: Dict[str, int] = {}
|
| 41 |
for child in target.parent.children:
|
| 42 |
counter.setdefault(child.tag, 0)
|
| 43 |
counter[child.tag] += 1
|
|
|
|
| 57 |
return " > ".join(reversed(selectorPath)) if css else "//" + "/".join(reversed(selectorPath))
|
| 58 |
|
| 59 |
@property
|
| 60 |
+
def generate_css_selector(self: Any) -> str:
|
| 61 |
"""Generate a CSS selector for the current element
|
| 62 |
:return: A string of the generated selector.
|
| 63 |
"""
|
| 64 |
return self._general_selection()
|
| 65 |
|
| 66 |
@property
|
| 67 |
+
def generate_full_css_selector(self: Any) -> str:
|
| 68 |
"""Generate a complete CSS selector for the current element
|
| 69 |
:return: A string of the generated selector.
|
| 70 |
"""
|
| 71 |
return self._general_selection(full_path=True)
|
| 72 |
|
| 73 |
@property
|
| 74 |
+
def generate_xpath_selector(self: Any) -> str:
|
| 75 |
"""Generate an XPath selector for the current element
|
| 76 |
:return: A string of the generated selector.
|
| 77 |
"""
|
| 78 |
return self._general_selection("xpath")
|
| 79 |
|
| 80 |
@property
|
| 81 |
+
def generate_full_xpath_selector(self: Any) -> str:
|
| 82 |
"""Generate a complete XPath selector for the current element
|
| 83 |
:return: A string of the generated selector.
|
| 84 |
"""
|
|
@@ -30,6 +30,7 @@ from scrapling.core.custom_types import TextHandler
|
|
| 30 |
from scrapling.engines.toolbelt.custom import Response
|
| 31 |
from scrapling.core.utils._shell import _ParseHeaders, _CookieParser
|
| 32 |
from scrapling.core._types import (
|
|
|
|
| 33 |
Dict,
|
| 34 |
Any,
|
| 35 |
cast,
|
|
@@ -82,7 +83,7 @@ class NoExitArgumentParser(ArgumentParser): # pragma: no cover
|
|
| 82 |
class CurlParser:
|
| 83 |
"""Builds the argument parser for relevant curl flags from DevTools."""
|
| 84 |
|
| 85 |
-
def __init__(self):
|
| 86 |
from scrapling.fetchers import Fetcher as __Fetcher
|
| 87 |
|
| 88 |
self.__fetcher = __Fetcher
|
|
@@ -467,19 +468,21 @@ Type 'exit' or press Ctrl+D to exit.
|
|
| 467 |
|
| 468 |
return result
|
| 469 |
|
| 470 |
-
def create_wrapper(
|
|
|
|
|
|
|
| 471 |
"""Create a wrapper that preserves function signature but updates page"""
|
| 472 |
|
| 473 |
@wraps(func)
|
| 474 |
-
def wrapper(*args, **kwargs):
|
| 475 |
result = func(*args, **kwargs)
|
| 476 |
return self.update_page(result)
|
| 477 |
|
| 478 |
if get_signature:
|
| 479 |
# Explicitly preserve and unpack signature for IPython introspection and autocompletion
|
| 480 |
-
wrapper
|
| 481 |
else:
|
| 482 |
-
wrapper
|
| 483 |
|
| 484 |
return wrapper
|
| 485 |
|
|
@@ -601,7 +604,7 @@ class Convertor:
|
|
| 601 |
" ",
|
| 602 |
):
|
| 603 |
# Remove consecutive white-spaces
|
| 604 |
-
txt_content = re_sub(f"[{s}]+", s, txt_content)
|
| 605 |
yield txt_content
|
| 606 |
yield ""
|
| 607 |
|
|
|
|
| 30 |
from scrapling.engines.toolbelt.custom import Response
|
| 31 |
from scrapling.core.utils._shell import _ParseHeaders, _CookieParser
|
| 32 |
from scrapling.core._types import (
|
| 33 |
+
Callable,
|
| 34 |
Dict,
|
| 35 |
Any,
|
| 36 |
cast,
|
|
|
|
| 83 |
class CurlParser:
|
| 84 |
"""Builds the argument parser for relevant curl flags from DevTools."""
|
| 85 |
|
| 86 |
+
def __init__(self) -> None:
|
| 87 |
from scrapling.fetchers import Fetcher as __Fetcher
|
| 88 |
|
| 89 |
self.__fetcher = __Fetcher
|
|
|
|
| 468 |
|
| 469 |
return result
|
| 470 |
|
| 471 |
+
def create_wrapper(
|
| 472 |
+
self, func: Callable, get_signature: bool = True, signature_name: Optional[str] = None
|
| 473 |
+
) -> Callable:
|
| 474 |
"""Create a wrapper that preserves function signature but updates page"""
|
| 475 |
|
| 476 |
@wraps(func)
|
| 477 |
+
def wrapper(*args: Any, **kwargs: Any) -> Any:
|
| 478 |
result = func(*args, **kwargs)
|
| 479 |
return self.update_page(result)
|
| 480 |
|
| 481 |
if get_signature:
|
| 482 |
# Explicitly preserve and unpack signature for IPython introspection and autocompletion
|
| 483 |
+
setattr(wrapper, "__signature__", _unpack_signature(func, signature_name))
|
| 484 |
else:
|
| 485 |
+
setattr(wrapper, "__signature__", signature(func))
|
| 486 |
|
| 487 |
return wrapper
|
| 488 |
|
|
|
|
| 604 |
" ",
|
| 605 |
):
|
| 606 |
# Remove consecutive white-spaces
|
| 607 |
+
txt_content = TextHandler(re_sub(f"[{s}]+", s, txt_content))
|
| 608 |
yield txt_content
|
| 609 |
yield ""
|
| 610 |
|
|
@@ -63,12 +63,11 @@ class StorageSystemMixin(ABC): # pragma: no cover
|
|
| 63 |
def _get_hash(identifier: str) -> str:
|
| 64 |
"""If you want to hash identifier in your storage system, use this safer"""
|
| 65 |
_identifier = identifier.lower().strip()
|
| 66 |
-
|
| 67 |
-
|
| 68 |
-
_identifier = _identifier.encode("utf-8")
|
| 69 |
|
| 70 |
-
hash_value = sha256(
|
| 71 |
-
return f"{hash_value}_{len(
|
| 72 |
|
| 73 |
|
| 74 |
@lru_cache(1, typed=True)
|
|
|
|
| 63 |
def _get_hash(identifier: str) -> str:
|
| 64 |
"""If you want to hash identifier in your storage system, use this safer"""
|
| 65 |
_identifier = identifier.lower().strip()
|
| 66 |
+
# Hash functions have to take bytes
|
| 67 |
+
_identifier_bytes = _identifier.encode("utf-8")
|
|
|
|
| 68 |
|
| 69 |
+
hash_value = sha256(_identifier_bytes).hexdigest()
|
| 70 |
+
return f"{hash_value}_{len(_identifier_bytes)}" # Length to reduce collision chance
|
| 71 |
|
| 72 |
|
| 73 |
@lru_cache(1, typed=True)
|
|
@@ -5,16 +5,12 @@ from contextlib import contextmanager, asynccontextmanager
|
|
| 5 |
from playwright.sync_api._generated import Page
|
| 6 |
from playwright.sync_api import (
|
| 7 |
Frame,
|
| 8 |
-
Browser,
|
| 9 |
BrowserContext,
|
| 10 |
-
Playwright,
|
| 11 |
Response as SyncPlaywrightResponse,
|
| 12 |
)
|
| 13 |
from playwright.async_api._generated import Page as AsyncPage
|
| 14 |
from playwright.async_api import (
|
| 15 |
Frame as AsyncFrame,
|
| 16 |
-
Browser as AsyncBrowser,
|
| 17 |
-
Playwright as AsyncPlaywright,
|
| 18 |
Response as AsyncPlaywrightResponse,
|
| 19 |
BrowserContext as AsyncBrowserContext,
|
| 20 |
)
|
|
@@ -37,6 +33,7 @@ from scrapling.core._types import (
|
|
| 37 |
Optional,
|
| 38 |
Callable,
|
| 39 |
TYPE_CHECKING,
|
|
|
|
| 40 |
overload,
|
| 41 |
Tuple,
|
| 42 |
ProxyType,
|
|
@@ -61,12 +58,12 @@ class SyncSession:
|
|
| 61 |
self.max_pages = max_pages
|
| 62 |
self.page_pool = PagePool(max_pages)
|
| 63 |
self._max_wait_for_page = 60
|
| 64 |
-
self.playwright:
|
| 65 |
-
self.context:
|
| 66 |
-
self.browser:
|
| 67 |
self._is_alive = False
|
| 68 |
|
| 69 |
-
def start(self):
|
| 70 |
pass
|
| 71 |
|
| 72 |
def close(self): # pragma: no cover
|
|
@@ -215,13 +212,13 @@ class AsyncSession:
|
|
| 215 |
self.max_pages = max_pages
|
| 216 |
self.page_pool = PagePool(max_pages)
|
| 217 |
self._max_wait_for_page = 60
|
| 218 |
-
self.playwright:
|
| 219 |
-
self.context:
|
| 220 |
-
self.browser:
|
| 221 |
self._is_alive = False
|
| 222 |
self._lock = Lock()
|
| 223 |
|
| 224 |
-
async def start(self):
|
| 225 |
pass
|
| 226 |
|
| 227 |
async def close(self):
|
|
@@ -378,6 +375,8 @@ class AsyncSession:
|
|
| 378 |
|
| 379 |
|
| 380 |
class BaseSessionMixin:
|
|
|
|
|
|
|
| 381 |
@overload
|
| 382 |
def __validate_routine__(self, params: Dict, model: type[StealthConfig]) -> StealthConfig: ...
|
| 383 |
|
|
@@ -404,7 +403,7 @@ class BaseSessionMixin:
|
|
| 404 |
return config
|
| 405 |
|
| 406 |
def __generate_options__(self, extra_flags: Tuple | None = None) -> None:
|
| 407 |
-
config: PlaywrightConfig | StealthConfig = self._config
|
| 408 |
self._context_options.update(
|
| 409 |
{
|
| 410 |
"proxy": config.proxy,
|
|
@@ -466,7 +465,7 @@ class DynamicSessionMixin(BaseSessionMixin):
|
|
| 466 |
|
| 467 |
class StealthySessionMixin(BaseSessionMixin):
|
| 468 |
def __validate__(self, **params):
|
| 469 |
-
self._config
|
| 470 |
self._context_options.update(
|
| 471 |
{
|
| 472 |
"is_mobile": False,
|
|
@@ -482,22 +481,23 @@ class StealthySessionMixin(BaseSessionMixin):
|
|
| 482 |
self.__generate_stealth_options()
|
| 483 |
|
| 484 |
def __generate_stealth_options(self) -> None:
|
| 485 |
-
|
| 486 |
-
|
|
|
|
| 487 |
flags = DEFAULT_FLAGS + DEFAULT_STEALTH_FLAGS
|
| 488 |
|
| 489 |
-
if
|
| 490 |
flags += (
|
| 491 |
"--webrtc-ip-handling-policy=disable_non_proxied_udp",
|
| 492 |
"--force-webrtc-ip-handling-policy", # Ensures the policy is enforced
|
| 493 |
)
|
| 494 |
-
if not
|
| 495 |
flags += (
|
| 496 |
"--disable-webgl",
|
| 497 |
"--disable-webgl-image-chromium",
|
| 498 |
"--disable-webgl2",
|
| 499 |
)
|
| 500 |
-
if
|
| 501 |
flags += ("--fingerprinting-canvas-image-data-noise",)
|
| 502 |
|
| 503 |
super(StealthySessionMixin, self).__generate_options__(flags)
|
|
|
|
| 5 |
from playwright.sync_api._generated import Page
|
| 6 |
from playwright.sync_api import (
|
| 7 |
Frame,
|
|
|
|
| 8 |
BrowserContext,
|
|
|
|
| 9 |
Response as SyncPlaywrightResponse,
|
| 10 |
)
|
| 11 |
from playwright.async_api._generated import Page as AsyncPage
|
| 12 |
from playwright.async_api import (
|
| 13 |
Frame as AsyncFrame,
|
|
|
|
|
|
|
| 14 |
Response as AsyncPlaywrightResponse,
|
| 15 |
BrowserContext as AsyncBrowserContext,
|
| 16 |
)
|
|
|
|
| 33 |
Optional,
|
| 34 |
Callable,
|
| 35 |
TYPE_CHECKING,
|
| 36 |
+
cast,
|
| 37 |
overload,
|
| 38 |
Tuple,
|
| 39 |
ProxyType,
|
|
|
|
| 58 |
self.max_pages = max_pages
|
| 59 |
self.page_pool = PagePool(max_pages)
|
| 60 |
self._max_wait_for_page = 60
|
| 61 |
+
self.playwright: Any = None
|
| 62 |
+
self.context: Any = None
|
| 63 |
+
self.browser: Any = None
|
| 64 |
self._is_alive = False
|
| 65 |
|
| 66 |
+
def start(self) -> None:
|
| 67 |
pass
|
| 68 |
|
| 69 |
def close(self): # pragma: no cover
|
|
|
|
| 212 |
self.max_pages = max_pages
|
| 213 |
self.page_pool = PagePool(max_pages)
|
| 214 |
self._max_wait_for_page = 60
|
| 215 |
+
self.playwright: Any = None
|
| 216 |
+
self.context: Any = None
|
| 217 |
+
self.browser: Any = None
|
| 218 |
self._is_alive = False
|
| 219 |
self._lock = Lock()
|
| 220 |
|
| 221 |
+
async def start(self) -> None:
|
| 222 |
pass
|
| 223 |
|
| 224 |
async def close(self):
|
|
|
|
| 375 |
|
| 376 |
|
| 377 |
class BaseSessionMixin:
|
| 378 |
+
_config: "PlaywrightConfig | StealthConfig"
|
| 379 |
+
|
| 380 |
@overload
|
| 381 |
def __validate_routine__(self, params: Dict, model: type[StealthConfig]) -> StealthConfig: ...
|
| 382 |
|
|
|
|
| 403 |
return config
|
| 404 |
|
| 405 |
def __generate_options__(self, extra_flags: Tuple | None = None) -> None:
|
| 406 |
+
config: PlaywrightConfig | StealthConfig = self._config
|
| 407 |
self._context_options.update(
|
| 408 |
{
|
| 409 |
"proxy": config.proxy,
|
|
|
|
| 465 |
|
| 466 |
class StealthySessionMixin(BaseSessionMixin):
|
| 467 |
def __validate__(self, **params):
|
| 468 |
+
self._config = self.__validate_routine__(params, model=StealthConfig)
|
| 469 |
self._context_options.update(
|
| 470 |
{
|
| 471 |
"is_mobile": False,
|
|
|
|
| 481 |
self.__generate_stealth_options()
|
| 482 |
|
| 483 |
def __generate_stealth_options(self) -> None:
|
| 484 |
+
config = cast(StealthConfig, self._config)
|
| 485 |
+
flags: Tuple[str, ...] = tuple()
|
| 486 |
+
if not config.cdp_url:
|
| 487 |
flags = DEFAULT_FLAGS + DEFAULT_STEALTH_FLAGS
|
| 488 |
|
| 489 |
+
if config.block_webrtc:
|
| 490 |
flags += (
|
| 491 |
"--webrtc-ip-handling-policy=disable_non_proxied_udp",
|
| 492 |
"--force-webrtc-ip-handling-policy", # Ensures the policy is enforced
|
| 493 |
)
|
| 494 |
+
if not config.allow_webgl:
|
| 495 |
flags += (
|
| 496 |
"--disable-webgl",
|
| 497 |
"--disable-webgl-image-chromium",
|
| 498 |
"--disable-webgl2",
|
| 499 |
)
|
| 500 |
+
if config.hide_canvas:
|
| 501 |
flags += ("--fingerprinting-canvas-image-data-noise",)
|
| 502 |
|
| 503 |
super(StealthySessionMixin, self).__generate_options__(flags)
|
|
@@ -8,11 +8,10 @@ from playwright.sync_api import (
|
|
| 8 |
from playwright.async_api import (
|
| 9 |
async_playwright,
|
| 10 |
Locator as AsyncLocator,
|
| 11 |
-
BrowserContext as AsyncBrowserContext,
|
| 12 |
)
|
| 13 |
|
| 14 |
from scrapling.core.utils import log
|
| 15 |
-
from scrapling.core._types import Unpack
|
| 16 |
from scrapling.engines.toolbelt.proxy_rotation import is_proxy_error
|
| 17 |
from scrapling.engines.toolbelt.convertor import Response, ResponseFactory
|
| 18 |
from scrapling.engines.toolbelt.fingerprints import generate_convincing_referer
|
|
@@ -134,6 +133,7 @@ class DynamicSession(SyncSession, DynamicSessionMixin):
|
|
| 134 |
)
|
| 135 |
|
| 136 |
for attempt in range(self._config.retries):
|
|
|
|
| 137 |
if self._config.proxy_rotator and static_proxy is None:
|
| 138 |
proxy = self._config.proxy_rotator.get_proxy()
|
| 139 |
else:
|
|
@@ -238,7 +238,7 @@ class AsyncDynamicSession(AsyncSession, DynamicSessionMixin):
|
|
| 238 |
self.__validate__(**kwargs)
|
| 239 |
super().__init__(max_pages=self._config.max_pages)
|
| 240 |
|
| 241 |
-
async def start(self):
|
| 242 |
"""Create a browser for this instance and context."""
|
| 243 |
if not self.playwright:
|
| 244 |
self.playwright = await async_playwright().start()
|
|
@@ -246,16 +246,14 @@ class AsyncDynamicSession(AsyncSession, DynamicSessionMixin):
|
|
| 246 |
if self._config.cdp_url:
|
| 247 |
self.browser = await self.playwright.chromium.connect_over_cdp(endpoint_url=self._config.cdp_url)
|
| 248 |
if not self._config.proxy_rotator and self.browser:
|
| 249 |
-
self.context
|
| 250 |
elif self._config.proxy_rotator:
|
| 251 |
self.browser = await self.playwright.chromium.launch(**self._browser_options)
|
| 252 |
else:
|
| 253 |
persistent_options = (
|
| 254 |
self._browser_options | self._context_options | {"user_data_dir": self._user_data_dir}
|
| 255 |
)
|
| 256 |
-
self.context
|
| 257 |
-
**persistent_options
|
| 258 |
-
)
|
| 259 |
|
| 260 |
if self.context:
|
| 261 |
self.context = await self._initialize_context(self._config, self.context)
|
|
@@ -304,6 +302,7 @@ class AsyncDynamicSession(AsyncSession, DynamicSessionMixin):
|
|
| 304 |
)
|
| 305 |
|
| 306 |
for attempt in range(self._config.retries):
|
|
|
|
| 307 |
if self._config.proxy_rotator and static_proxy is None:
|
| 308 |
proxy = self._config.proxy_rotator.get_proxy()
|
| 309 |
else:
|
|
|
|
| 8 |
from playwright.async_api import (
|
| 9 |
async_playwright,
|
| 10 |
Locator as AsyncLocator,
|
|
|
|
| 11 |
)
|
| 12 |
|
| 13 |
from scrapling.core.utils import log
|
| 14 |
+
from scrapling.core._types import Optional, ProxyType, Unpack
|
| 15 |
from scrapling.engines.toolbelt.proxy_rotation import is_proxy_error
|
| 16 |
from scrapling.engines.toolbelt.convertor import Response, ResponseFactory
|
| 17 |
from scrapling.engines.toolbelt.fingerprints import generate_convincing_referer
|
|
|
|
| 133 |
)
|
| 134 |
|
| 135 |
for attempt in range(self._config.retries):
|
| 136 |
+
proxy: Optional[ProxyType] = None
|
| 137 |
if self._config.proxy_rotator and static_proxy is None:
|
| 138 |
proxy = self._config.proxy_rotator.get_proxy()
|
| 139 |
else:
|
|
|
|
| 238 |
self.__validate__(**kwargs)
|
| 239 |
super().__init__(max_pages=self._config.max_pages)
|
| 240 |
|
| 241 |
+
async def start(self) -> None:
|
| 242 |
"""Create a browser for this instance and context."""
|
| 243 |
if not self.playwright:
|
| 244 |
self.playwright = await async_playwright().start()
|
|
|
|
| 246 |
if self._config.cdp_url:
|
| 247 |
self.browser = await self.playwright.chromium.connect_over_cdp(endpoint_url=self._config.cdp_url)
|
| 248 |
if not self._config.proxy_rotator and self.browser:
|
| 249 |
+
self.context = await self.browser.new_context(**self._context_options)
|
| 250 |
elif self._config.proxy_rotator:
|
| 251 |
self.browser = await self.playwright.chromium.launch(**self._browser_options)
|
| 252 |
else:
|
| 253 |
persistent_options = (
|
| 254 |
self._browser_options | self._context_options | {"user_data_dir": self._user_data_dir}
|
| 255 |
)
|
| 256 |
+
self.context = await self.playwright.chromium.launch_persistent_context(**persistent_options)
|
|
|
|
|
|
|
| 257 |
|
| 258 |
if self.context:
|
| 259 |
self.context = await self._initialize_context(self._config, self.context)
|
|
|
|
| 302 |
)
|
| 303 |
|
| 304 |
for attempt in range(self._config.retries):
|
| 305 |
+
proxy: Optional[ProxyType] = None
|
| 306 |
if self._config.proxy_rotator and static_proxy is None:
|
| 307 |
proxy = self._config.proxy_rotator.get_proxy()
|
| 308 |
else:
|
|
@@ -61,7 +61,9 @@ class PagePool:
|
|
| 61 |
raise RuntimeError(f"Maximum page limit ({self.max_pages}) reached")
|
| 62 |
|
| 63 |
if isinstance(page, AsyncPage):
|
| 64 |
-
page_info
|
|
|
|
|
|
|
| 65 |
else:
|
| 66 |
page_info = cast(PageInfo[SyncPage], PageInfo(page, "ready", ""))
|
| 67 |
|
|
|
|
| 61 |
raise RuntimeError(f"Maximum page limit ({self.max_pages}) reached")
|
| 62 |
|
| 63 |
if isinstance(page, AsyncPage):
|
| 64 |
+
page_info: PageInfo[SyncPage] | PageInfo[AsyncPage] = cast(
|
| 65 |
+
PageInfo[AsyncPage], PageInfo(page, "ready", "")
|
| 66 |
+
)
|
| 67 |
else:
|
| 68 |
page_info = cast(PageInfo[SyncPage], PageInfo(page, "ready", ""))
|
| 69 |
|
|
@@ -13,7 +13,7 @@ from patchright.sync_api import sync_playwright
|
|
| 13 |
from patchright.async_api import async_playwright
|
| 14 |
|
| 15 |
from scrapling.core.utils import log
|
| 16 |
-
from scrapling.core._types import Any, Unpack
|
| 17 |
from scrapling.engines.toolbelt.proxy_rotation import is_proxy_error
|
| 18 |
from scrapling.engines.toolbelt.convertor import Response, ResponseFactory
|
| 19 |
from scrapling.engines.toolbelt.fingerprints import generate_convincing_referer
|
|
@@ -78,7 +78,7 @@ class StealthySession(SyncSession, StealthySessionMixin):
|
|
| 78 |
self.__validate__(**kwargs)
|
| 79 |
super().__init__()
|
| 80 |
|
| 81 |
-
def start(self):
|
| 82 |
"""Create a browser for this instance and context."""
|
| 83 |
if not self.playwright:
|
| 84 |
self.playwright = sync_playwright().start()
|
|
@@ -146,7 +146,7 @@ class StealthySession(SyncSession, StealthySessionMixin):
|
|
| 146 |
# Waiting for the verify spinner to disappear, checking every 1s if it disappeared
|
| 147 |
page.wait_for_timeout(500)
|
| 148 |
|
| 149 |
-
outer_box = {}
|
| 150 |
iframe = page.frame(url=__CF_PATTERN__)
|
| 151 |
if iframe is not None:
|
| 152 |
self._wait_for_page_stability(iframe, True, False)
|
|
@@ -156,14 +156,14 @@ class StealthySession(SyncSession, StealthySessionMixin):
|
|
| 156 |
# Double-checking that the iframe is loaded
|
| 157 |
page.wait_for_timeout(500)
|
| 158 |
|
| 159 |
-
outer_box
|
| 160 |
|
| 161 |
if not iframe or not outer_box:
|
| 162 |
if "<title>Just a moment...</title>" not in (ResponseFactory._get_page_content(page)):
|
| 163 |
log.info("Cloudflare captcha is solved")
|
| 164 |
return
|
| 165 |
|
| 166 |
-
outer_box
|
| 167 |
|
| 168 |
# Calculate the Captcha coordinates for any viewport
|
| 169 |
captcha_x, captcha_y = outer_box["x"] + randint(26, 28), outer_box["y"] + randint(25, 27)
|
|
@@ -223,6 +223,7 @@ class StealthySession(SyncSession, StealthySessionMixin):
|
|
| 223 |
)
|
| 224 |
|
| 225 |
for attempt in range(self._config.retries):
|
|
|
|
| 226 |
if self._config.proxy_rotator and static_proxy is None:
|
| 227 |
proxy = self._config.proxy_rotator.get_proxy()
|
| 228 |
else:
|
|
@@ -335,7 +336,7 @@ class AsyncStealthySession(AsyncSession, StealthySessionMixin):
|
|
| 335 |
self.__validate__(**kwargs)
|
| 336 |
super().__init__(max_pages=self._config.max_pages)
|
| 337 |
|
| 338 |
-
async def start(self):
|
| 339 |
"""Create a browser for this instance and context."""
|
| 340 |
if not self.playwright:
|
| 341 |
self.playwright = await async_playwright().start()
|
|
@@ -344,16 +345,14 @@ class AsyncStealthySession(AsyncSession, StealthySessionMixin):
|
|
| 344 |
self.browser = await self.playwright.chromium.connect_over_cdp(endpoint_url=self._config.cdp_url)
|
| 345 |
if not self._config.proxy_rotator:
|
| 346 |
assert self.browser is not None
|
| 347 |
-
self.context
|
| 348 |
elif self._config.proxy_rotator:
|
| 349 |
self.browser = await self.playwright.chromium.launch(**self._browser_options)
|
| 350 |
else:
|
| 351 |
persistent_options = (
|
| 352 |
self._browser_options | self._context_options | {"user_data_dir": self._user_data_dir}
|
| 353 |
)
|
| 354 |
-
self.context
|
| 355 |
-
**persistent_options
|
| 356 |
-
)
|
| 357 |
|
| 358 |
if self.context:
|
| 359 |
self.context = await self._initialize_context(self._config, self.context)
|
|
@@ -367,7 +366,7 @@ class AsyncStealthySession(AsyncSession, StealthySessionMixin):
|
|
| 367 |
else:
|
| 368 |
raise RuntimeError("Session has been already started")
|
| 369 |
|
| 370 |
-
async def _initialize_context(self, config, ctx: AsyncBrowserContext) -> AsyncBrowserContext:
|
| 371 |
"""Initialize the browser context."""
|
| 372 |
for script in _compiled_stealth_scripts():
|
| 373 |
await ctx.add_init_script(script=script)
|
|
@@ -404,7 +403,7 @@ class AsyncStealthySession(AsyncSession, StealthySessionMixin):
|
|
| 404 |
# Waiting for the verify spinner to disappear, checking every 1s if it disappeared
|
| 405 |
await page.wait_for_timeout(500)
|
| 406 |
|
| 407 |
-
outer_box = {}
|
| 408 |
iframe = page.frame(url=__CF_PATTERN__)
|
| 409 |
if iframe is not None:
|
| 410 |
await self._wait_for_page_stability(iframe, True, False)
|
|
@@ -414,14 +413,14 @@ class AsyncStealthySession(AsyncSession, StealthySessionMixin):
|
|
| 414 |
# Double-checking that the iframe is loaded
|
| 415 |
await page.wait_for_timeout(500)
|
| 416 |
|
| 417 |
-
outer_box
|
| 418 |
|
| 419 |
if not iframe or not outer_box:
|
| 420 |
if "<title>Just a moment...</title>" not in (await ResponseFactory._get_async_page_content(page)):
|
| 421 |
log.info("Cloudflare captcha is solved")
|
| 422 |
return
|
| 423 |
|
| 424 |
-
outer_box
|
| 425 |
|
| 426 |
# Calculate the Captcha coordinates for any viewport
|
| 427 |
captcha_x, captcha_y = outer_box["x"] + randint(26, 28), outer_box["y"] + randint(25, 27)
|
|
@@ -482,6 +481,7 @@ class AsyncStealthySession(AsyncSession, StealthySessionMixin):
|
|
| 482 |
)
|
| 483 |
|
| 484 |
for attempt in range(self._config.retries):
|
|
|
|
| 485 |
if self._config.proxy_rotator and static_proxy is None:
|
| 486 |
proxy = self._config.proxy_rotator.get_proxy()
|
| 487 |
else:
|
|
|
|
| 13 |
from patchright.async_api import async_playwright
|
| 14 |
|
| 15 |
from scrapling.core.utils import log
|
| 16 |
+
from scrapling.core._types import Any, Optional, ProxyType, Unpack
|
| 17 |
from scrapling.engines.toolbelt.proxy_rotation import is_proxy_error
|
| 18 |
from scrapling.engines.toolbelt.convertor import Response, ResponseFactory
|
| 19 |
from scrapling.engines.toolbelt.fingerprints import generate_convincing_referer
|
|
|
|
| 78 |
self.__validate__(**kwargs)
|
| 79 |
super().__init__()
|
| 80 |
|
| 81 |
+
def start(self) -> None:
|
| 82 |
"""Create a browser for this instance and context."""
|
| 83 |
if not self.playwright:
|
| 84 |
self.playwright = sync_playwright().start()
|
|
|
|
| 146 |
# Waiting for the verify spinner to disappear, checking every 1s if it disappeared
|
| 147 |
page.wait_for_timeout(500)
|
| 148 |
|
| 149 |
+
outer_box: Any = {}
|
| 150 |
iframe = page.frame(url=__CF_PATTERN__)
|
| 151 |
if iframe is not None:
|
| 152 |
self._wait_for_page_stability(iframe, True, False)
|
|
|
|
| 156 |
# Double-checking that the iframe is loaded
|
| 157 |
page.wait_for_timeout(500)
|
| 158 |
|
| 159 |
+
outer_box = iframe.frame_element().bounding_box()
|
| 160 |
|
| 161 |
if not iframe or not outer_box:
|
| 162 |
if "<title>Just a moment...</title>" not in (ResponseFactory._get_page_content(page)):
|
| 163 |
log.info("Cloudflare captcha is solved")
|
| 164 |
return
|
| 165 |
|
| 166 |
+
outer_box = page.locator(box_selector).last.bounding_box()
|
| 167 |
|
| 168 |
# Calculate the Captcha coordinates for any viewport
|
| 169 |
captcha_x, captcha_y = outer_box["x"] + randint(26, 28), outer_box["y"] + randint(25, 27)
|
|
|
|
| 223 |
)
|
| 224 |
|
| 225 |
for attempt in range(self._config.retries):
|
| 226 |
+
proxy: Optional[ProxyType] = None
|
| 227 |
if self._config.proxy_rotator and static_proxy is None:
|
| 228 |
proxy = self._config.proxy_rotator.get_proxy()
|
| 229 |
else:
|
|
|
|
| 336 |
self.__validate__(**kwargs)
|
| 337 |
super().__init__(max_pages=self._config.max_pages)
|
| 338 |
|
| 339 |
+
async def start(self) -> None:
|
| 340 |
"""Create a browser for this instance and context."""
|
| 341 |
if not self.playwright:
|
| 342 |
self.playwright = await async_playwright().start()
|
|
|
|
| 345 |
self.browser = await self.playwright.chromium.connect_over_cdp(endpoint_url=self._config.cdp_url)
|
| 346 |
if not self._config.proxy_rotator:
|
| 347 |
assert self.browser is not None
|
| 348 |
+
self.context = await self.browser.new_context(**self._context_options)
|
| 349 |
elif self._config.proxy_rotator:
|
| 350 |
self.browser = await self.playwright.chromium.launch(**self._browser_options)
|
| 351 |
else:
|
| 352 |
persistent_options = (
|
| 353 |
self._browser_options | self._context_options | {"user_data_dir": self._user_data_dir}
|
| 354 |
)
|
| 355 |
+
self.context = await self.playwright.chromium.launch_persistent_context(**persistent_options)
|
|
|
|
|
|
|
| 356 |
|
| 357 |
if self.context:
|
| 358 |
self.context = await self._initialize_context(self._config, self.context)
|
|
|
|
| 366 |
else:
|
| 367 |
raise RuntimeError("Session has been already started")
|
| 368 |
|
| 369 |
+
async def _initialize_context(self, config: Any, ctx: AsyncBrowserContext) -> AsyncBrowserContext:
|
| 370 |
"""Initialize the browser context."""
|
| 371 |
for script in _compiled_stealth_scripts():
|
| 372 |
await ctx.add_init_script(script=script)
|
|
|
|
| 403 |
# Waiting for the verify spinner to disappear, checking every 1s if it disappeared
|
| 404 |
await page.wait_for_timeout(500)
|
| 405 |
|
| 406 |
+
outer_box: Any = {}
|
| 407 |
iframe = page.frame(url=__CF_PATTERN__)
|
| 408 |
if iframe is not None:
|
| 409 |
await self._wait_for_page_stability(iframe, True, False)
|
|
|
|
| 413 |
# Double-checking that the iframe is loaded
|
| 414 |
await page.wait_for_timeout(500)
|
| 415 |
|
| 416 |
+
outer_box = await (await iframe.frame_element()).bounding_box()
|
| 417 |
|
| 418 |
if not iframe or not outer_box:
|
| 419 |
if "<title>Just a moment...</title>" not in (await ResponseFactory._get_async_page_content(page)):
|
| 420 |
log.info("Cloudflare captcha is solved")
|
| 421 |
return
|
| 422 |
|
| 423 |
+
outer_box = await page.locator(box_selector).last.bounding_box()
|
| 424 |
|
| 425 |
# Calculate the Captcha coordinates for any viewport
|
| 426 |
captcha_x, captcha_y = outer_box["x"] + randint(26, 28), outer_box["y"] + randint(25, 27)
|
|
|
|
| 481 |
)
|
| 482 |
|
| 483 |
for attempt in range(self._config.retries):
|
| 484 |
+
proxy: Optional[ProxyType] = None
|
| 485 |
if self._config.proxy_rotator and static_proxy is None:
|
| 486 |
proxy = self._config.proxy_rotator.get_proxy()
|
| 487 |
else:
|
|
@@ -157,15 +157,16 @@ def validate_fetch(
|
|
| 157 |
session: Any,
|
| 158 |
model: type[PlaywrightConfig] | type[StealthConfig],
|
| 159 |
) -> _fetch_params: # pragma: no cover
|
| 160 |
-
result = {}
|
| 161 |
-
overrides = {}
|
|
|
|
| 162 |
|
| 163 |
# Get all field names that _fetch_params needs
|
| 164 |
fetch_param_fields = {f.name for f in fields(_fetch_params)}
|
| 165 |
|
| 166 |
for key in fetch_param_fields:
|
| 167 |
-
if key in
|
| 168 |
-
overrides[key] =
|
| 169 |
elif hasattr(session, "_config") and hasattr(session._config, key):
|
| 170 |
result[key] = getattr(session._config, key)
|
| 171 |
|
|
|
|
| 157 |
session: Any,
|
| 158 |
model: type[PlaywrightConfig] | type[StealthConfig],
|
| 159 |
) -> _fetch_params: # pragma: no cover
|
| 160 |
+
result: Dict[str, Any] = {}
|
| 161 |
+
overrides: Dict[str, Any] = {}
|
| 162 |
+
kwargs_dict: Dict[str, Any] = dict(method_kwargs)
|
| 163 |
|
| 164 |
# Get all field names that _fetch_params needs
|
| 165 |
fetch_param_fields = {f.name for f in fields(_fetch_params)}
|
| 166 |
|
| 167 |
for key in fetch_param_fields:
|
| 168 |
+
if key in kwargs_dict:
|
| 169 |
+
overrides[key] = kwargs_dict[key]
|
| 170 |
elif hasattr(session, "_config") and hasattr(session._config, key):
|
| 171 |
result[key] = getattr(session._config, key)
|
| 172 |
|
|
@@ -753,7 +753,7 @@ class FetcherSession:
|
|
| 753 |
class FetcherClient(_SyncSessionLogic):
|
| 754 |
__slots__ = ("__enter__", "__exit__")
|
| 755 |
|
| 756 |
-
def __init__(self, **kwargs):
|
| 757 |
super().__init__(**kwargs)
|
| 758 |
self.__enter__: Any = None
|
| 759 |
self.__exit__: Any = None
|
|
@@ -763,7 +763,7 @@ class FetcherClient(_SyncSessionLogic):
|
|
| 763 |
class AsyncFetcherClient(_ASyncSessionLogic):
|
| 764 |
__slots__ = ("__aenter__", "__aexit__")
|
| 765 |
|
| 766 |
-
def __init__(self, **kwargs):
|
| 767 |
super().__init__(**kwargs)
|
| 768 |
self.__aenter__: Any = None
|
| 769 |
self.__aexit__: Any = None
|
|
|
|
| 753 |
class FetcherClient(_SyncSessionLogic):
|
| 754 |
__slots__ = ("__enter__", "__exit__")
|
| 755 |
|
| 756 |
+
def __init__(self, **kwargs: Any) -> None:
|
| 757 |
super().__init__(**kwargs)
|
| 758 |
self.__enter__: Any = None
|
| 759 |
self.__exit__: Any = None
|
|
|
|
| 763 |
class AsyncFetcherClient(_ASyncSessionLogic):
|
| 764 |
__slots__ = ("__aenter__", "__aexit__")
|
| 765 |
|
| 766 |
+
def __init__(self, **kwargs: Any) -> None:
|
| 767 |
super().__init__(**kwargs)
|
| 768 |
self.__aenter__: Any = None
|
| 769 |
self.__aexit__: Any = None
|
|
@@ -38,7 +38,7 @@ class ResponseFactory:
|
|
| 38 |
@classmethod
|
| 39 |
def _process_response_history(cls, first_response: SyncResponse, parser_arguments: Dict) -> list[Response]:
|
| 40 |
"""Process response history to build a list of `Response` objects"""
|
| 41 |
-
history = []
|
| 42 |
current_request = first_response.request.redirected_from
|
| 43 |
|
| 44 |
try:
|
|
@@ -101,6 +101,7 @@ class ResponseFactory:
|
|
| 101 |
:param first_response: An earlier or initial Playwright `Response` object that may serve as a fallback response in the absence of the final one.
|
| 102 |
:param parser_arguments: A dictionary containing additional arguments needed for parsing or further customization of the returned `Response`. These arguments are dynamically unpacked into
|
| 103 |
the `Response` object.
|
|
|
|
| 104 |
|
| 105 |
:return: A fully populated `Response` object containing the page's URL, content, status, headers, cookies, and other derived metadata.
|
| 106 |
:rtype: Response
|
|
@@ -145,7 +146,7 @@ class ResponseFactory:
|
|
| 145 |
cls, first_response: AsyncResponse, parser_arguments: Dict
|
| 146 |
) -> list[Response]:
|
| 147 |
"""Process response history to build a list of `Response` objects"""
|
| 148 |
-
history = []
|
| 149 |
current_request = first_response.request.redirected_from
|
| 150 |
|
| 151 |
try:
|
|
@@ -238,6 +239,7 @@ class ResponseFactory:
|
|
| 238 |
:param first_response: An earlier or initial Playwright `Response` object that may serve as a fallback response in the absence of the final one.
|
| 239 |
:param parser_arguments: A dictionary containing additional arguments needed for parsing or further customization of the returned `Response`. These arguments are dynamically unpacked into
|
| 240 |
the `Response` object.
|
|
|
|
| 241 |
|
| 242 |
:return: A fully populated `Response` object containing the page's URL, content, status, headers, cookies, and other derived metadata.
|
| 243 |
:rtype: Response
|
|
|
|
| 38 |
@classmethod
|
| 39 |
def _process_response_history(cls, first_response: SyncResponse, parser_arguments: Dict) -> list[Response]:
|
| 40 |
"""Process response history to build a list of `Response` objects"""
|
| 41 |
+
history: list[Response] = []
|
| 42 |
current_request = first_response.request.redirected_from
|
| 43 |
|
| 44 |
try:
|
|
|
|
| 101 |
:param first_response: An earlier or initial Playwright `Response` object that may serve as a fallback response in the absence of the final one.
|
| 102 |
:param parser_arguments: A dictionary containing additional arguments needed for parsing or further customization of the returned `Response`. These arguments are dynamically unpacked into
|
| 103 |
the `Response` object.
|
| 104 |
+
:param meta: Additional meta data to be saved with the response.
|
| 105 |
|
| 106 |
:return: A fully populated `Response` object containing the page's URL, content, status, headers, cookies, and other derived metadata.
|
| 107 |
:rtype: Response
|
|
|
|
| 146 |
cls, first_response: AsyncResponse, parser_arguments: Dict
|
| 147 |
) -> list[Response]:
|
| 148 |
"""Process response history to build a list of `Response` objects"""
|
| 149 |
+
history: list[Response] = []
|
| 150 |
current_request = first_response.request.redirected_from
|
| 151 |
|
| 152 |
try:
|
|
|
|
| 239 |
:param first_response: An earlier or initial Playwright `Response` object that may serve as a fallback response in the absence of the final one.
|
| 240 |
:param parser_arguments: A dictionary containing additional arguments needed for parsing or further customization of the returned `Response`. These arguments are dynamically unpacked into
|
| 241 |
the `Response` object.
|
| 242 |
+
:param meta: Additional meta data to be saved with the response.
|
| 243 |
|
| 244 |
:return: A fully populated `Response` object containing the page's URL, content, status, headers, cookies, and other derived metadata.
|
| 245 |
:rtype: Response
|
|
@@ -118,8 +118,11 @@ class Selector(SelectorsGeneration):
|
|
| 118 |
if root is None and content is None:
|
| 119 |
raise ValueError("Selector class needs HTML content, or root arguments to work")
|
| 120 |
|
| 121 |
-
self.__text = None
|
|
|
|
|
|
|
| 122 |
if root is None:
|
|
|
|
| 123 |
if isinstance(content, str):
|
| 124 |
body = content.strip().replace("\x00", "") or "<html/>"
|
| 125 |
elif isinstance(content, bytes):
|
|
@@ -128,17 +131,18 @@ class Selector(SelectorsGeneration):
|
|
| 128 |
raise TypeError(f"content argument must be str or bytes, got {type(content)}")
|
| 129 |
|
| 130 |
# https://lxml.de/api/lxml.etree.HTMLParser-class.html
|
| 131 |
-
|
| 132 |
recover=True,
|
| 133 |
remove_blank_text=True,
|
| 134 |
remove_comments=(not keep_comments),
|
| 135 |
encoding=encoding,
|
| 136 |
compact=True,
|
| 137 |
huge_tree=huge_tree,
|
| 138 |
-
default_doctype=True,
|
| 139 |
strip_cdata=(not keep_cdata),
|
| 140 |
)
|
| 141 |
-
|
|
|
|
| 142 |
self._raw_body = content
|
| 143 |
|
| 144 |
else:
|
|
@@ -164,7 +168,7 @@ class Selector(SelectorsGeneration):
|
|
| 164 |
self._root = cast(HtmlElement, root)
|
| 165 |
self._raw_body = ""
|
| 166 |
|
| 167 |
-
self.__adaptive_enabled = adaptive
|
| 168 |
|
| 169 |
if self.__adaptive_enabled:
|
| 170 |
if _storage is not None:
|
|
@@ -277,8 +281,8 @@ class Selector(SelectorsGeneration):
|
|
| 277 |
if self._is_text_node(self._root):
|
| 278 |
return "#text"
|
| 279 |
if not self.__tag:
|
| 280 |
-
self.__tag = self._root.tag
|
| 281 |
-
return self.__tag
|
| 282 |
|
| 283 |
@property
|
| 284 |
def text(self) -> TextHandler:
|
|
@@ -313,11 +317,11 @@ class Selector(SelectorsGeneration):
|
|
| 313 |
if self._is_text_node(self._root):
|
| 314 |
return TextHandler(str(self._root))
|
| 315 |
|
| 316 |
-
ignored_elements = set()
|
| 317 |
if ignore_tags:
|
| 318 |
for element in self._root.iter(*ignore_tags):
|
| 319 |
ignored_elements.add(element)
|
| 320 |
-
ignored_elements.update(
|
| 321 |
|
| 322 |
_all_strings = []
|
| 323 |
for node in self._root.iter():
|
|
@@ -395,7 +399,7 @@ class Selector(SelectorsGeneration):
|
|
| 395 |
"""Return all elements under the current element in the DOM tree"""
|
| 396 |
if self._is_text_node(self._root):
|
| 397 |
return Selectors()
|
| 398 |
-
below = _find_all_elements(self._root)
|
| 399 |
return self.__elements_convertor(below) if below is not None else Selectors()
|
| 400 |
|
| 401 |
@property
|
|
@@ -533,7 +537,7 @@ class Selector(SelectorsGeneration):
|
|
| 533 |
:param selector_type: If True, the return result will be converted to `Selectors` object
|
| 534 |
:return: List of pure HTML elements that got the highest matching score or 'Selectors' object
|
| 535 |
"""
|
| 536 |
-
score_table = {}
|
| 537 |
# Note: `element` will most likely always be a dictionary at this point.
|
| 538 |
if isinstance(element, self.__class__):
|
| 539 |
element = element._root
|
|
@@ -541,11 +545,11 @@ class Selector(SelectorsGeneration):
|
|
| 541 |
if issubclass(type(element), HtmlElement):
|
| 542 |
element = _StorageTools.element_to_dict(element)
|
| 543 |
|
| 544 |
-
for node in _find_all_elements(self._root):
|
| 545 |
# Collect all elements in the page, then for each element get the matching score of it against the node.
|
| 546 |
# Hence: the code doesn't stop even if the score was 100%
|
| 547 |
# because there might be another element(s) left in page with the same score
|
| 548 |
-
score = self.__calculate_similarity_score(element, node)
|
| 549 |
score_table.setdefault(score, []).append(node)
|
| 550 |
|
| 551 |
if score_table:
|
|
@@ -710,7 +714,7 @@ class Selector(SelectorsGeneration):
|
|
| 710 |
if not args and not kwargs:
|
| 711 |
raise TypeError("You have to pass something to search with, like tag name(s), tag attributes, or both.")
|
| 712 |
|
| 713 |
-
attributes = dict()
|
| 714 |
tags: Set[str] = set()
|
| 715 |
patterns: Set[Pattern] = set()
|
| 716 |
results, functions, selectors = Selectors(), [], []
|
|
@@ -809,21 +813,19 @@ class Selector(SelectorsGeneration):
|
|
| 809 |
:param candidate: The element to compare with the original element.
|
| 810 |
:return: A percentage score of how similar is the candidate to the original element
|
| 811 |
"""
|
| 812 |
-
score
|
|
|
|
| 813 |
data = _StorageTools.element_to_dict(candidate)
|
| 814 |
|
| 815 |
-
|
| 816 |
-
# Study the idea of giving weight to each test below so some are more important than others
|
| 817 |
-
# Current results: With weights some websites had better score while it was worse for others
|
| 818 |
-
score += 1 if original["tag"] == data["tag"] else 0 # * 0.3 # 30%
|
| 819 |
checks += 1
|
| 820 |
|
| 821 |
if original["text"]:
|
| 822 |
-
score += SequenceMatcher(None, original["text"], data.get("text") or "").ratio()
|
| 823 |
checks += 1
|
| 824 |
|
| 825 |
# if both don't have attributes, it still counts for something!
|
| 826 |
-
score += self.__calculate_dict_diff(original["attributes"], data["attributes"])
|
| 827 |
checks += 1
|
| 828 |
|
| 829 |
# Separate similarity test for class, id, href,... this will help in full structural changes
|
|
@@ -838,23 +840,19 @@ class Selector(SelectorsGeneration):
|
|
| 838 |
None,
|
| 839 |
original["attributes"][attrib],
|
| 840 |
data["attributes"].get(attrib) or "",
|
| 841 |
-
).ratio()
|
| 842 |
checks += 1
|
| 843 |
|
| 844 |
-
score += SequenceMatcher(None, original["path"], data["path"]).ratio()
|
| 845 |
checks += 1
|
| 846 |
|
| 847 |
if original.get("parent_name"):
|
| 848 |
# Then we start comparing parents' data
|
| 849 |
if data.get("parent_name"):
|
| 850 |
-
score += SequenceMatcher(
|
| 851 |
-
None, original["parent_name"], data.get("parent_name") or ""
|
| 852 |
-
).ratio() # * 0.2 # 20%
|
| 853 |
checks += 1
|
| 854 |
|
| 855 |
-
score += self.__calculate_dict_diff(
|
| 856 |
-
original["parent_attribs"], data.get("parent_attribs") or {}
|
| 857 |
-
) # * 0.2 # 20%
|
| 858 |
checks += 1
|
| 859 |
|
| 860 |
if original["parent_text"]:
|
|
@@ -862,14 +860,14 @@ class Selector(SelectorsGeneration):
|
|
| 862 |
None,
|
| 863 |
original["parent_text"],
|
| 864 |
data.get("parent_text") or "",
|
| 865 |
-
).ratio()
|
| 866 |
checks += 1
|
| 867 |
# else:
|
| 868 |
# # The original element has a parent and this one not, this is not a good sign
|
| 869 |
# score -= 0.1
|
| 870 |
|
| 871 |
if original.get("siblings"):
|
| 872 |
-
score += SequenceMatcher(None, original["siblings"], data.get("siblings") or []).ratio()
|
| 873 |
checks += 1
|
| 874 |
|
| 875 |
# How % sure? let's see
|
|
@@ -890,14 +888,14 @@ class Selector(SelectorsGeneration):
|
|
| 890 |
the docs for more info.
|
| 891 |
"""
|
| 892 |
if self.__adaptive_enabled:
|
| 893 |
-
|
| 894 |
-
if isinstance(
|
| 895 |
-
|
| 896 |
|
| 897 |
-
if self._is_text_node(
|
| 898 |
-
|
| 899 |
|
| 900 |
-
self._storage.save(
|
| 901 |
else:
|
| 902 |
raise RuntimeError(
|
| 903 |
"Can't use `adaptive` features while it's disabled globally, you have to start a new class instance."
|
|
@@ -987,7 +985,8 @@ class Selector(SelectorsGeneration):
|
|
| 987 |
candidate_attributes = (
|
| 988 |
self.__get_attributes(candidate, ignore_attributes) if ignore_attributes else candidate.attrib
|
| 989 |
)
|
| 990 |
-
score
|
|
|
|
| 991 |
|
| 992 |
if original_attributes:
|
| 993 |
score += sum(
|
|
@@ -1116,16 +1115,16 @@ class Selector(SelectorsGeneration):
|
|
| 1116 |
if not case_sensitive:
|
| 1117 |
text = text.lower()
|
| 1118 |
|
| 1119 |
-
possible_targets = _find_all_elements_with_spaces(self._root)
|
| 1120 |
if possible_targets:
|
| 1121 |
for node in self.__elements_convertor(possible_targets):
|
| 1122 |
"""Check if element matches given text otherwise, traverse the children tree and iterate"""
|
| 1123 |
-
node_text = node.text
|
| 1124 |
if clean_match:
|
| 1125 |
-
node_text = node_text.clean()
|
| 1126 |
|
| 1127 |
if not case_sensitive:
|
| 1128 |
-
node_text = node_text.lower()
|
| 1129 |
|
| 1130 |
if partial:
|
| 1131 |
if text in node_text:
|
|
@@ -1178,7 +1177,7 @@ class Selector(SelectorsGeneration):
|
|
| 1178 |
|
| 1179 |
results = Selectors()
|
| 1180 |
|
| 1181 |
-
possible_targets = _find_all_elements_with_spaces(self._root)
|
| 1182 |
if possible_targets:
|
| 1183 |
for node in self.__elements_convertor(possible_targets):
|
| 1184 |
"""Check if element matches given regex otherwise, traverse the children tree and iterate"""
|
|
|
|
| 118 |
if root is None and content is None:
|
| 119 |
raise ValueError("Selector class needs HTML content, or root arguments to work")
|
| 120 |
|
| 121 |
+
self.__text: Optional[TextHandler] = None
|
| 122 |
+
self.__tag: Optional[str] = None
|
| 123 |
+
self.__attributes: Optional[AttributesHandler] = None
|
| 124 |
if root is None:
|
| 125 |
+
body: str | bytes
|
| 126 |
if isinstance(content, str):
|
| 127 |
body = content.strip().replace("\x00", "") or "<html/>"
|
| 128 |
elif isinstance(content, bytes):
|
|
|
|
| 131 |
raise TypeError(f"content argument must be str or bytes, got {type(content)}")
|
| 132 |
|
| 133 |
# https://lxml.de/api/lxml.etree.HTMLParser-class.html
|
| 134 |
+
_parser_kwargs: Dict[str, Any] = dict(
|
| 135 |
recover=True,
|
| 136 |
remove_blank_text=True,
|
| 137 |
remove_comments=(not keep_comments),
|
| 138 |
encoding=encoding,
|
| 139 |
compact=True,
|
| 140 |
huge_tree=huge_tree,
|
| 141 |
+
default_doctype=True, # Supported by lxml but missing from stubs
|
| 142 |
strip_cdata=(not keep_cdata),
|
| 143 |
)
|
| 144 |
+
parser = HTMLParser(**_parser_kwargs)
|
| 145 |
+
self._root = cast(HtmlElement, fromstring(body or "<html/>", parser=parser, base_url=url or ""))
|
| 146 |
self._raw_body = content
|
| 147 |
|
| 148 |
else:
|
|
|
|
| 168 |
self._root = cast(HtmlElement, root)
|
| 169 |
self._raw_body = ""
|
| 170 |
|
| 171 |
+
self.__adaptive_enabled = bool(adaptive)
|
| 172 |
|
| 173 |
if self.__adaptive_enabled:
|
| 174 |
if _storage is not None:
|
|
|
|
| 281 |
if self._is_text_node(self._root):
|
| 282 |
return "#text"
|
| 283 |
if not self.__tag:
|
| 284 |
+
self.__tag = str(self._root.tag)
|
| 285 |
+
return self.__tag or ""
|
| 286 |
|
| 287 |
@property
|
| 288 |
def text(self) -> TextHandler:
|
|
|
|
| 317 |
if self._is_text_node(self._root):
|
| 318 |
return TextHandler(str(self._root))
|
| 319 |
|
| 320 |
+
ignored_elements: set[Any] = set()
|
| 321 |
if ignore_tags:
|
| 322 |
for element in self._root.iter(*ignore_tags):
|
| 323 |
ignored_elements.add(element)
|
| 324 |
+
ignored_elements.update(cast(list, _find_all_elements(element)))
|
| 325 |
|
| 326 |
_all_strings = []
|
| 327 |
for node in self._root.iter():
|
|
|
|
| 399 |
"""Return all elements under the current element in the DOM tree"""
|
| 400 |
if self._is_text_node(self._root):
|
| 401 |
return Selectors()
|
| 402 |
+
below = cast(List, _find_all_elements(self._root))
|
| 403 |
return self.__elements_convertor(below) if below is not None else Selectors()
|
| 404 |
|
| 405 |
@property
|
|
|
|
| 537 |
:param selector_type: If True, the return result will be converted to `Selectors` object
|
| 538 |
:return: List of pure HTML elements that got the highest matching score or 'Selectors' object
|
| 539 |
"""
|
| 540 |
+
score_table: Dict[float, List[Any]] = {}
|
| 541 |
# Note: `element` will most likely always be a dictionary at this point.
|
| 542 |
if isinstance(element, self.__class__):
|
| 543 |
element = element._root
|
|
|
|
| 545 |
if issubclass(type(element), HtmlElement):
|
| 546 |
element = _StorageTools.element_to_dict(element)
|
| 547 |
|
| 548 |
+
for node in cast(List, _find_all_elements(self._root)):
|
| 549 |
# Collect all elements in the page, then for each element get the matching score of it against the node.
|
| 550 |
# Hence: the code doesn't stop even if the score was 100%
|
| 551 |
# because there might be another element(s) left in page with the same score
|
| 552 |
+
score = self.__calculate_similarity_score(cast(Dict, element), node)
|
| 553 |
score_table.setdefault(score, []).append(node)
|
| 554 |
|
| 555 |
if score_table:
|
|
|
|
| 714 |
if not args and not kwargs:
|
| 715 |
raise TypeError("You have to pass something to search with, like tag name(s), tag attributes, or both.")
|
| 716 |
|
| 717 |
+
attributes: Dict[str, Any] = dict()
|
| 718 |
tags: Set[str] = set()
|
| 719 |
patterns: Set[Pattern] = set()
|
| 720 |
results, functions, selectors = Selectors(), [], []
|
|
|
|
| 813 |
:param candidate: The element to compare with the original element.
|
| 814 |
:return: A percentage score of how similar is the candidate to the original element
|
| 815 |
"""
|
| 816 |
+
score: float = 0
|
| 817 |
+
checks: int = 0
|
| 818 |
data = _StorageTools.element_to_dict(candidate)
|
| 819 |
|
| 820 |
+
score += 1 if original["tag"] == data["tag"] else 0
|
|
|
|
|
|
|
|
|
|
| 821 |
checks += 1
|
| 822 |
|
| 823 |
if original["text"]:
|
| 824 |
+
score += SequenceMatcher(None, original["text"], data.get("text") or "").ratio()
|
| 825 |
checks += 1
|
| 826 |
|
| 827 |
# if both don't have attributes, it still counts for something!
|
| 828 |
+
score += self.__calculate_dict_diff(original["attributes"], data["attributes"])
|
| 829 |
checks += 1
|
| 830 |
|
| 831 |
# Separate similarity test for class, id, href,... this will help in full structural changes
|
|
|
|
| 840 |
None,
|
| 841 |
original["attributes"][attrib],
|
| 842 |
data["attributes"].get(attrib) or "",
|
| 843 |
+
).ratio()
|
| 844 |
checks += 1
|
| 845 |
|
| 846 |
+
score += SequenceMatcher(None, original["path"], data["path"]).ratio()
|
| 847 |
checks += 1
|
| 848 |
|
| 849 |
if original.get("parent_name"):
|
| 850 |
# Then we start comparing parents' data
|
| 851 |
if data.get("parent_name"):
|
| 852 |
+
score += SequenceMatcher(None, original["parent_name"], data.get("parent_name") or "").ratio()
|
|
|
|
|
|
|
| 853 |
checks += 1
|
| 854 |
|
| 855 |
+
score += self.__calculate_dict_diff(original["parent_attribs"], data.get("parent_attribs") or {})
|
|
|
|
|
|
|
| 856 |
checks += 1
|
| 857 |
|
| 858 |
if original["parent_text"]:
|
|
|
|
| 860 |
None,
|
| 861 |
original["parent_text"],
|
| 862 |
data.get("parent_text") or "",
|
| 863 |
+
).ratio()
|
| 864 |
checks += 1
|
| 865 |
# else:
|
| 866 |
# # The original element has a parent and this one not, this is not a good sign
|
| 867 |
# score -= 0.1
|
| 868 |
|
| 869 |
if original.get("siblings"):
|
| 870 |
+
score += SequenceMatcher(None, original["siblings"], data.get("siblings") or []).ratio()
|
| 871 |
checks += 1
|
| 872 |
|
| 873 |
# How % sure? let's see
|
|
|
|
| 888 |
the docs for more info.
|
| 889 |
"""
|
| 890 |
if self.__adaptive_enabled:
|
| 891 |
+
target_element: Any = element
|
| 892 |
+
if isinstance(target_element, self.__class__):
|
| 893 |
+
target_element = target_element._root
|
| 894 |
|
| 895 |
+
if self._is_text_node(target_element):
|
| 896 |
+
target_element = target_element.getparent()
|
| 897 |
|
| 898 |
+
self._storage.save(target_element, identifier)
|
| 899 |
else:
|
| 900 |
raise RuntimeError(
|
| 901 |
"Can't use `adaptive` features while it's disabled globally, you have to start a new class instance."
|
|
|
|
| 985 |
candidate_attributes = (
|
| 986 |
self.__get_attributes(candidate, ignore_attributes) if ignore_attributes else candidate.attrib
|
| 987 |
)
|
| 988 |
+
score: float = 0
|
| 989 |
+
checks: int = 0
|
| 990 |
|
| 991 |
if original_attributes:
|
| 992 |
score += sum(
|
|
|
|
| 1115 |
if not case_sensitive:
|
| 1116 |
text = text.lower()
|
| 1117 |
|
| 1118 |
+
possible_targets = cast(List, _find_all_elements_with_spaces(self._root))
|
| 1119 |
if possible_targets:
|
| 1120 |
for node in self.__elements_convertor(possible_targets):
|
| 1121 |
"""Check if element matches given text otherwise, traverse the children tree and iterate"""
|
| 1122 |
+
node_text: TextHandler = node.text
|
| 1123 |
if clean_match:
|
| 1124 |
+
node_text = TextHandler(node_text.clean())
|
| 1125 |
|
| 1126 |
if not case_sensitive:
|
| 1127 |
+
node_text = TextHandler(node_text.lower())
|
| 1128 |
|
| 1129 |
if partial:
|
| 1130 |
if text in node_text:
|
|
|
|
| 1177 |
|
| 1178 |
results = Selectors()
|
| 1179 |
|
| 1180 |
+
possible_targets = cast(List, _find_all_elements_with_spaces(self._root))
|
| 1181 |
if possible_targets:
|
| 1182 |
for node in self.__elements_convertor(possible_targets):
|
| 1183 |
"""Check if element matches given regex otherwise, traverse the children tree and iterate"""
|
|
@@ -7,7 +7,7 @@ import orjson
|
|
| 7 |
from w3lib.url import canonicalize_url
|
| 8 |
|
| 9 |
from scrapling.engines.toolbelt.custom import Response
|
| 10 |
-
from scrapling.core._types import Any, AsyncGenerator, Callable, Dict, Union, Tuple, TYPE_CHECKING
|
| 11 |
|
| 12 |
if TYPE_CHECKING:
|
| 13 |
from scrapling.spiders.spider import Spider
|
|
@@ -42,7 +42,7 @@ class Request:
|
|
| 42 |
self.meta: dict[str, Any] = meta if meta else {}
|
| 43 |
self._retry_count: int = _retry_count
|
| 44 |
self._session_kwargs = kwargs if kwargs else {}
|
| 45 |
-
self._fp = None
|
| 46 |
|
| 47 |
def copy(self) -> "Request":
|
| 48 |
"""Create a copy of this request."""
|
|
@@ -89,7 +89,7 @@ class Request:
|
|
| 89 |
body = b""
|
| 90 |
else:
|
| 91 |
post_data = self._session_kwargs.get("json", {})
|
| 92 |
-
body
|
| 93 |
|
| 94 |
data: Dict[str, str | Tuple] = {
|
| 95 |
"sid": self.sid,
|
|
|
|
| 7 |
from w3lib.url import canonicalize_url
|
| 8 |
|
| 9 |
from scrapling.engines.toolbelt.custom import Response
|
| 10 |
+
from scrapling.core._types import Any, AsyncGenerator, Callable, Dict, Optional, Union, Tuple, TYPE_CHECKING
|
| 11 |
|
| 12 |
if TYPE_CHECKING:
|
| 13 |
from scrapling.spiders.spider import Spider
|
|
|
|
| 42 |
self.meta: dict[str, Any] = meta if meta else {}
|
| 43 |
self._retry_count: int = _retry_count
|
| 44 |
self._session_kwargs = kwargs if kwargs else {}
|
| 45 |
+
self._fp: Optional[bytes] = None
|
| 46 |
|
| 47 |
def copy(self) -> "Request":
|
| 48 |
"""Create a copy of this request."""
|
|
|
|
| 89 |
body = b""
|
| 90 |
else:
|
| 91 |
post_data = self._session_kwargs.get("json", {})
|
| 92 |
+
body = orjson.dumps(post_data) if post_data else b""
|
| 93 |
|
| 94 |
data: Dict[str, str | Tuple] = {
|
| 95 |
"sid": self.sid,
|
|
@@ -12,7 +12,7 @@ Session = FetcherSession | AsyncDynamicSession | AsyncStealthySession
|
|
| 12 |
class SessionManager:
|
| 13 |
"""Manages pre-configured session instances."""
|
| 14 |
|
| 15 |
-
def __init__(self):
|
| 16 |
self._sessions: dict[str, Session] = {}
|
| 17 |
self._default_session_id: str | None = None
|
| 18 |
self._started: bool = False
|
|
@@ -109,17 +109,17 @@ class SessionManager:
|
|
| 109 |
await session.__aenter__()
|
| 110 |
|
| 111 |
if isinstance(session, FetcherSession):
|
| 112 |
-
|
| 113 |
|
| 114 |
-
if isinstance(
|
| 115 |
-
response = await
|
| 116 |
method=cast(SUPPORTED_HTTP_METHODS, request._session_kwargs.pop("method", "GET")),
|
| 117 |
url=request.url,
|
| 118 |
**request._session_kwargs,
|
| 119 |
)
|
| 120 |
else:
|
| 121 |
# Sync session or other types - shouldn't happen in async context
|
| 122 |
-
raise TypeError(f"Session type {type(
|
| 123 |
else:
|
| 124 |
response = await session.fetch(url=request.url, **request._session_kwargs)
|
| 125 |
|
|
|
|
| 12 |
class SessionManager:
|
| 13 |
"""Manages pre-configured session instances."""
|
| 14 |
|
| 15 |
+
def __init__(self) -> None:
|
| 16 |
self._sessions: dict[str, Session] = {}
|
| 17 |
self._default_session_id: str | None = None
|
| 18 |
self._started: bool = False
|
|
|
|
| 109 |
await session.__aenter__()
|
| 110 |
|
| 111 |
if isinstance(session, FetcherSession):
|
| 112 |
+
client = session._client
|
| 113 |
|
| 114 |
+
if isinstance(client, _ASyncSessionLogic):
|
| 115 |
+
response = await client._make_request(
|
| 116 |
method=cast(SUPPORTED_HTTP_METHODS, request._session_kwargs.pop("method", "GET")),
|
| 117 |
url=request.url,
|
| 118 |
**request._session_kwargs,
|
| 119 |
)
|
| 120 |
else:
|
| 121 |
# Sync session or other types - shouldn't happen in async context
|
| 122 |
+
raise TypeError(f"Session type {type(client)} not supported for async fetch")
|
| 123 |
else:
|
| 124 |
response = await session.fetch(url=request.url, **request._session_kwargs)
|
| 125 |
|