Karim shoair committed
Commit e5ecf76 · Parent(s): c4135c8
refactor: Making all the codebase acceptable by PyRight
- scrapling/core/_types.py +3 -0
- scrapling/core/ai.py +2 -1
- scrapling/core/custom_types.py +20 -27
- scrapling/core/mixins.py +15 -9
- scrapling/core/shell.py +4 -3
- scrapling/core/storage.py +5 -5
- scrapling/core/translator.py +13 -8
- scrapling/engines/_browsers/_base.py +28 -11
- scrapling/engines/_browsers/_camoufox.py +12 -5
- scrapling/engines/_browsers/_config_tools.py +1 -1
- scrapling/engines/_browsers/_controllers.py +14 -7
- scrapling/engines/_browsers/_validators.py +28 -11
- scrapling/engines/static.py +4 -4
- scrapling/engines/toolbelt/convertor.py +6 -4
- scrapling/engines/toolbelt/custom.py +6 -9
- scrapling/engines/toolbelt/fingerprints.py +17 -10
- scrapling/engines/toolbelt/navigation.py +11 -3
- scrapling/fetchers/__init__.py +11 -1
- scrapling/fetchers/chrome.py +3 -4
- scrapling/fetchers/firefox.py +0 -4
- scrapling/parser.py +105 -80
scrapling/core/_types.py
CHANGED
@@ -12,9 +12,11 @@ from typing import (
     Generator,
     Iterable,
     List,
+    Set,
     Literal,
     Optional,
     Pattern,
+    Sequence,
     Tuple,
     TypeVar,
     Union,
@@ -22,6 +24,7 @@ from typing import (
     Mapping,
     Awaitable,
     Protocol,
+    Coroutine,
     SupportsIndex,
 )
scrapling/core/ai.py
CHANGED
@@ -20,6 +20,7 @@ from scrapling.core._types import (
     Mapping,
     Dict,
     List,
+    Any,
     SelectorWaitStates,
     Generator,
 )
@@ -171,7 +172,7 @@ class ScraplingMCPServer:
         :param stealthy_headers: If enabled (default), it creates and adds real browser headers. It also sets the referer header as if this request came from a Google search of URL's domain.
         """
         async with FetcherSession() as session:
-            tasks = [
+            tasks: List[Any] = [
                 session.get(
                     url,
                     auth=auth,
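The `tasks: List[Any]` annotation keeps Pyright from pinning the list to one inferred coroutine type and then arguing with whatever consumes it. A minimal, runnable sketch of the same fix — `fetch` and the URLs are stand-ins, not Scrapling's API:

import asyncio
from typing import Any, List


async def fetch(url: str) -> str:
    # Stand-in for session.get(); returns the URL as a fake response body.
    await asyncio.sleep(0)
    return url


async def main() -> None:
    urls = ["https://a.example", "https://b.example"]
    # Without the annotation, Pyright infers List[Coroutine[Any, Any, str]],
    # which must then line up exactly with the consumer's expectations.
    # An explicit List[Any] trades precision for a quiet checker.
    tasks: List[Any] = [fetch(url) for url in urls]
    results = await asyncio.gather(*tasks)
    print(results)


asyncio.run(main())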
scrapling/core/custom_types.py
CHANGED
@@ -5,6 +5,7 @@ from re import compile as re_compile, UNICODE, IGNORECASE
 from orjson import dumps, loads

 from scrapling.core._types import (
+    Any,
     cast,
     Dict,
     List,
@@ -14,7 +15,6 @@ from scrapling.core._types import (
     Literal,
     Pattern,
     Iterable,
-    Optional,
     Generator,
     SupportsIndex,
 )
@@ -33,23 +33,20 @@ class TextHandler(str):

     def __getitem__(self, key: SupportsIndex | slice) -> "TextHandler":  # pragma: no cover
         lst = super().__getitem__(key)
-        return …
+        return TextHandler(lst)

-    def split(
-        …
-            [TextHandler(s) for s in super().split(sep, maxsplit)],
-        )
-    )
+    def split(
+        self, sep: str | None = None, maxsplit: SupportsIndex = -1
+    ) -> Union[List, "TextHandlers"]:  # pragma: no cover
+        return TextHandlers([TextHandler(s) for s in super().split(sep, maxsplit)])

-    def strip(self, chars: str = None) -> Union[str, "TextHandler"]:  # pragma: no cover
+    def strip(self, chars: str | None = None) -> Union[str, "TextHandler"]:  # pragma: no cover
         return TextHandler(super().strip(chars))

-    def lstrip(self, chars: str = None) -> Union[str, "TextHandler"]:  # pragma: no cover
+    def lstrip(self, chars: str | None = None) -> Union[str, "TextHandler"]:  # pragma: no cover
         return TextHandler(super().lstrip(chars))

-    def rstrip(self, chars: str = None) -> Union[str, "TextHandler"]:  # pragma: no cover
+    def rstrip(self, chars: str | None = None) -> Union[str, "TextHandler"]:  # pragma: no cover
         return TextHandler(super().rstrip(chars))

     def capitalize(self) -> Union[str, "TextHandler"]:  # pragma: no cover
@@ -64,7 +61,7 @@ class TextHandler(str):
     def expandtabs(self, tabsize: SupportsIndex = 8) -> Union[str, "TextHandler"]:  # pragma: no cover
         return TextHandler(super().expandtabs(tabsize))

-    def format(self, *args: …
+    def format(self, *args: object, **kwargs: str) -> Union[str, "TextHandler"]:  # pragma: no cover
         return TextHandler(super().format(*args, **kwargs))

     def format_map(self, mapping) -> Union[str, "TextHandler"]:  # pragma: no cover
@@ -131,10 +128,11 @@ class TextHandler(str):
     def re(
         self,
         regex: str | Pattern,
-        check_match: Literal[True],
         replace_entities: bool = True,
         clean_match: bool = False,
         case_sensitive: bool = True,
+        *,
+        check_match: Literal[True],
     ) -> bool: ...

     @overload
@@ -179,19 +177,14 @@ class TextHandler(str):
         results = flatten(results)

         if not replace_entities:
-            return TextHandlers(…
+            return TextHandlers([TextHandler(string) for string in results])

-        return TextHandlers(
-            cast(
-                List[_TextHandlerType],
-                [TextHandler(_replace_entities(s)) for s in results],
-            )
-        )
+        return TextHandlers([TextHandler(_replace_entities(s)) for s in results])

     def re_first(
         self,
         regex: str | Pattern,
-        default=None,
+        default: Any = None,
         replace_entities: bool = True,
         clean_match: bool = False,
         case_sensitive: bool = True,
@@ -232,8 +225,8 @@ class TextHandlers(List[TextHandler]):
     def __getitem__(self, pos: SupportsIndex | slice) -> Union[TextHandler, "TextHandlers"]:
         lst = super().__getitem__(pos)
         if isinstance(pos, slice):
-            return TextHandlers(cast(List[…
-        return cast(…
+            return TextHandlers(cast(List[TextHandler], lst))
+        return TextHandler(cast(TextHandler, lst))

     def re(
         self,
@@ -256,7 +249,7 @@ class TextHandlers(List[TextHandler]):
     def re_first(
         self,
         regex: str | Pattern,
-        default=None,
+        default: Any = None,
         replace_entities: bool = True,
         clean_match: bool = False,
         case_sensitive: bool = True,
@@ -309,9 +302,9 @@ class AttributesHandler(Mapping[str, _TextHandlerType]):
         )

         # Fastest read-only mapping type
-        self._data = MappingProxyType(mapping)
+        self._data: Mapping[str, Any] = MappingProxyType(mapping)

-    def get(self, key: str, default: …
+    def get(self, key: str, default: Any = None) -> _TextHandlerType:
         """Acts like the standard dictionary `.get()` method"""
         return self._data.get(key, default)
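The reordered `re` overload works because `check_match` is now keyword-only (behind `*`), so the `Literal[True]` overload no longer clashes with the positional parameters that carry defaults before it. A reduced, runnable sketch of the pattern with illustrative names:

from typing import Literal, Union, overload


class Text(str):
    @overload
    def re(self, regex: str, clean: bool = False, *, check_match: Literal[True]) -> bool: ...

    @overload
    def re(self, regex: str, clean: bool = False, check_match: bool = False) -> list[str]: ...

    def re(self, regex: str, clean: bool = False, check_match: bool = False) -> Union[bool, list[str]]:
        import re as _re

        matches = _re.findall(regex, self)
        if check_match:
            return bool(matches)  # caller asked only whether anything matched
        return matches


t = Text("abc 123")
flag: bool = t.re(r"\d+", check_match=True)  # Pyright selects the bool overload
hits: list[str] = t.re(r"\d+")               # and the list overload here
print(flag, hits)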
scrapling/core/mixins.py
CHANGED
@@ -1,3 +1,9 @@
+from scrapling.core._types import TYPE_CHECKING
+
+if TYPE_CHECKING:
+    from scrapling.parser import Selector
+
+
 class SelectorsGeneration:
     """
     Functions for generating selectors
@@ -5,7 +11,7 @@ class SelectorsGeneration:
     Inspiration: https://searchfox.org/mozilla-central/source/devtools/shared/inspector/css-logic.js#591
     """

-    def …
+    def _general_selection(self: "Selector", selection: str = "css", full_path: bool = False) -> str:  # type: ignore[name-defined]
         """Generate a selector for the current element.
         :return: A string of the generated selector.
         """
@@ -47,29 +53,29 @@ class SelectorsGeneration:
         return " > ".join(reversed(selectorPath)) if css else "//" + "/".join(reversed(selectorPath))

     @property
-    def generate_css_selector(self) -> str:
+    def generate_css_selector(self: "Selector") -> str:  # type: ignore[name-defined]
         """Generate a CSS selector for the current element
         :return: A string of the generated selector.
         """
-        return self.…
+        return self._general_selection()

     @property
-    def generate_full_css_selector(self) -> str:
+    def generate_full_css_selector(self: "Selector") -> str:  # type: ignore[name-defined]
         """Generate a complete CSS selector for the current element
         :return: A string of the generated selector.
         """
-        return self.…
+        return self._general_selection(full_path=True)

     @property
-    def generate_xpath_selector(self) -> str:
+    def generate_xpath_selector(self: "Selector") -> str:  # type: ignore[name-defined]
         """Generate an XPath selector for the current element
         :return: A string of the generated selector.
         """
-        return self.…
+        return self._general_selection("xpath")

     @property
-    def generate_full_xpath_selector(self) -> str:
+    def generate_full_xpath_selector(self: "Selector") -> str:  # type: ignore[name-defined]
         """Generate a complete XPath selector for the current element
         :return: A string of the generated selector.
         """
-        return self.…
+        return self._general_selection("xpath", full_path=True)
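Annotating `self` with the concrete `Selector` type, imported only under `TYPE_CHECKING`, is what lets the mixin use attributes it never defines without Pyright objecting — and without a circular import at runtime. A generic, single-module sketch of the idea (in the real layout the concrete class lives in another module, hence the guarded import):

class SelectorMixin:
    # Annotating `self` with the concrete class tells Pyright that
    # attributes like `tag` exist even though the mixin never defines them.
    def describe(self: "Node") -> str:
        return f"<{self.tag}>"


class Node(SelectorMixin):
    def __init__(self, tag: str) -> None:
        self.tag = tag


print(Node("div").describe())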
scrapling/core/shell.py
CHANGED
@@ -31,6 +31,7 @@ from scrapling.core._types import (
     Optional,
     Dict,
     Any,
+    cast,
     extraction_types,
     Generator,
 )
@@ -540,15 +541,15 @@ class Convertor:
             raise ValueError(f"Unknown extraction type: {extraction_type}")
         else:
             if main_content_only:
-                page = page.css_first("body") or page
+                page = cast(Selector, page.css_first("body")) or page

-            pages = [page] if not css_selector else page.css(css_selector)
+            pages = [page] if not css_selector else cast(Selectors, page.css(css_selector))
             for page in pages:
                 match extraction_type:
                     case "markdown":
                         yield cls._convert_to_markdown(page.html_content)
                     case "html":
-                        yield page.…
+                        yield page.html_content
                     case "text":
                         txt_content = page.get_all_text(strip=True)
                         for s in (
scrapling/core/storage.py
CHANGED
@@ -56,13 +56,13 @@ class StorageSystemMixin(ABC):  # pragma: no cover
     @lru_cache(128, typed=True)
     def _get_hash(identifier: str) -> str:
         """If you want to hash identifier in your storage system, use this safer"""
-        …
-        if isinstance(…
+        _identifier = identifier.lower().strip()
+        if isinstance(_identifier, str):
             # Hash functions have to take bytes
-            …
+            _identifier = _identifier.encode("utf-8")

-        hash_value = sha256(…
-        return f"{hash_value}_{len(…
+        hash_value = sha256(_identifier).hexdigest()
+        return f"{hash_value}_{len(_identifier)}"  # Length to reduce collision chance
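The `_identifier` rename matters to Pyright: reassigning the `str` parameter itself to `bytes` would give the variable the union type `str | bytes` at the `sha256()` call. A minimal sketch of the same shape (simplified — the isinstance guard from the diff is dropped here):

from hashlib import sha256


def get_hash(identifier: str) -> str:
    # Rebinding `identifier` to bytes would widen its type to str | bytes,
    # which a bytes-only API like sha256() rejects. A fresh name keeps
    # each variable's type stable.
    _identifier: bytes = identifier.lower().strip().encode("utf-8")
    digest = sha256(_identifier).hexdigest()
    return f"{digest}_{len(_identifier)}"  # length suffix reduces collision chance


print(get_hash("Example.com"))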
scrapling/core/translator.py
CHANGED
@@ -10,24 +10,23 @@ So you don't have to learn a new selectors/api method like what bs4 done with so

 from functools import lru_cache

-from cssselect.xpath import ExpressionError
-from cssselect.xpath import XPathExpr as OriginalXPathExpr
 from cssselect import HTMLTranslator as OriginalHTMLTranslator
+from cssselect.xpath import ExpressionError, XPathExpr as OriginalXPathExpr
 from cssselect.parser import Element, FunctionalPseudoElement, PseudoElement

-from scrapling.core._types import Any, …
+from scrapling.core._types import Any, Protocol, Self


 class XPathExpr(OriginalXPathExpr):
     textnode: bool = False
-    attribute: …
+    attribute: str | None = None

     @classmethod
     def from_xpath(
         cls,
         xpath: OriginalXPathExpr,
         textnode: bool = False,
-        attribute: …
+        attribute: str | None = None,
     ) -> Self:
         x = cls(path=xpath.path, element=xpath.element, condition=xpath.condition)
         x.textnode = textnode
@@ -71,10 +70,10 @@ class XPathExpr(OriginalXPathExpr):

 # e.g. cssselect.GenericTranslator, cssselect.HTMLTranslator
 class TranslatorProtocol(Protocol):
-    def xpath_element(self, selector: Element) -> OriginalXPathExpr:  # pragma: no cover
+    def xpath_element(self, selector: Element) -> OriginalXPathExpr:  # pyright: ignore # pragma: no cover
         pass

-    def css_to_xpath(self, css: str, prefix: str = ...) -> str:  # pragma: no cover
+    def css_to_xpath(self, css: str, prefix: str = ...) -> str:  # pyright: ignore # pragma: no cover
         pass

@@ -121,9 +120,15 @@ class TranslatorMixin:


 class HTMLTranslator(TranslatorMixin, OriginalHTMLTranslator):
-    @lru_cache(maxsize=256)
     def css_to_xpath(self, css: str, prefix: str = "descendant-or-self::") -> str:
         return super().css_to_xpath(css, prefix)


 translator = HTMLTranslator()
+# Using a function instead of the translator directly to avoid Pyright override error
+
+
+@lru_cache(maxsize=256)
+def css_to_xpath(query: str) -> str:
+    """Return translated XPath version of a given CSS query"""
+    return translator.css_to_xpath(query)
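Moving `lru_cache` off the method avoids Pyright's override complaint: the decorator turns the method into an `_lru_cache_wrapper`, which no longer matches the base-class signature. A runnable sketch of the shape, with invented class names:

from functools import lru_cache


class Base:
    def convert(self, query: str) -> str:
        return query


class Derived(Base):
    # An @lru_cache here would make `convert` a _lru_cache_wrapper,
    # which Pyright reports as an incompatible override of Base.convert.
    def convert(self, query: str) -> str:
        return f"translated({query})"


_instance = Derived()


@lru_cache(maxsize=256)
def convert(query: str) -> str:
    # The cache lives on a plain module-level function, so no override
    # rule is involved (and no `self` is held alive by the cache).
    return _instance.convert(query)


print(convert("div.item"), convert("div.item"))  # second call is a cache hit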
scrapling/engines/_browsers/_base.py
CHANGED
@@ -7,14 +7,12 @@ from playwright.async_api import (
     BrowserContext as AsyncBrowserContext,
     Playwright as AsyncPlaywright,
 )
-from camoufox.…
-    …
-    installed_verstr as camoufox_version,
-)
+from camoufox.pkgman import installed_verstr as camoufox_version
+from camoufox.utils import launch_options as generate_launch_options

 from ._page import PageInfo, PagePool
 from scrapling.parser import Selector
-from scrapling.core._types import Dict, Optional
+from scrapling.core._types import Any, cast, Dict, Optional, TYPE_CHECKING
 from scrapling.engines.toolbelt.fingerprints import get_os_name
 from ._validators import validate, PlaywrightConfig, CamoufoxConfig
 from ._config_tools import _compiled_stealth_scripts, _launch_kwargs, _context_kwargs
@@ -41,6 +39,7 @@ class SyncSession:
         """Get a new page to use"""

         # No need to check if a page is available or not in sync code because the code blocked before reaching here till the page closed, ofc.
+        assert self.context is not None, "Browser context not initialized"
         page = self.context.new_page()
         page.set_default_navigation_timeout(timeout)
         page.set_default_timeout(timeout)
@@ -65,11 +64,14 @@ class SyncSession:
         }


-class AsyncSession(SyncSession):
+class AsyncSession:
     def __init__(self, max_pages: int = 1):
-        …
+        self.max_pages = max_pages
+        self.page_pool = PagePool(max_pages)
+        self._max_wait_for_page = 60
         self.playwright: Optional[AsyncPlaywright] = None
         self.context: Optional[AsyncBrowserContext] = None
+        self._closed = False
         self._lock = Lock()

     async def _get_page(
@@ -79,6 +81,9 @@ class AsyncSession(SyncSession):
         disable_resources: bool,
     ) -> PageInfo:  # pragma: no cover
         """Get a new page to use"""
+        if TYPE_CHECKING:
+            assert self.context is not None, "Browser context not initialized"
+
         async with self._lock:
             # If we're at max capacity after cleanup, wait for busy pages to finish
             if self.page_pool.pages_count >= self.max_pages:
@@ -92,6 +97,7 @@ class AsyncSession(SyncSession):
                     f"No pages finished to clear place in the pool within the {self._max_wait_for_page}s timeout period"
                 )

+        assert self.context is not None, "Browser context not initialized"
         page = await self.context.new_page()
         page.set_default_navigation_timeout(timeout)
         page.set_default_timeout(timeout)
@@ -107,6 +113,14 @@ class AsyncSession(SyncSession):

         return self.page_pool.add_page(page)

+    def get_pool_stats(self) -> Dict[str, int]:
+        """Get statistics about the current page pool"""
+        return {
+            "total_pages": self.page_pool.pages_count,
+            "busy_pages": self.page_pool.busy_count,
+            "max_pages": self.max_pages,
+        }
+

 class DynamicSessionMixin:
     def __validate__(self, **params):
@@ -139,6 +153,9 @@ class DynamicSessionMixin:
         self.__initiate_browser_options__()

     def __initiate_browser_options__(self):
+        if TYPE_CHECKING:
+            assert isinstance(self.proxy, tuple)
+
         if not self.cdp_url:
             # `launch_options` is used with persistent context
             self.launch_options = dict(
@@ -175,7 +192,7 @@ class DynamicSessionMixin:

 class StealthySessionMixin:
     def __validate__(self, **params):
-        config = validate(params, model=CamoufoxConfig)
+        config: CamoufoxConfig = validate(params, model=CamoufoxConfig)

         self.max_pages = config.max_pages
         self.headless = config.headless
@@ -209,10 +226,10 @@ class StealthySessionMixin:

     def __initiate_browser_options__(self):
         """Initiate browser options."""
-        self.launch_options = generate_launch_options(
+        self.launch_options: Dict[str, Any] = generate_launch_options(
             **{
                 "geoip": self.geoip,
-                "proxy": dict(self.proxy) if self.proxy else self.proxy,
+                "proxy": dict(self.proxy) if self.proxy and isinstance(self.proxy, tuple) else self.proxy,
                 "addons": self.addons,
                 "exclude_addons": [] if self.disable_ads else [DefaultAddons.UBO],
                 "headless": self.headless,
@@ -232,7 +249,7 @@ class StealthySessionMixin:
                 "browser.cache.disk_cache_ssl": True,
                 "browser.cache.disk.smart_size.enabled": True,
             },
-            **self.additional_args,
+            **cast(Dict, self.additional_args),
         }
     )
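The two assert styles used above do the same narrowing job for Pyright on `Optional` attributes; the plain `assert` also guards at runtime, while the `if TYPE_CHECKING:` form costs nothing in production. A runnable sketch:

from typing import Optional, TYPE_CHECKING


class Session:
    def __init__(self) -> None:
        self.context: Optional[str] = None  # set later by a setup step

    def use_runtime_assert(self) -> str:
        # Narrows str | None -> str for Pyright AND raises early if misused.
        assert self.context is not None, "context not initialized"
        return self.context.upper()

    def use_checker_only_assert(self) -> str:
        # Pyright treats TYPE_CHECKING as True, so the same narrowing applies,
        # but the assert never executes at runtime.
        if TYPE_CHECKING:
            assert self.context is not None
        return self.context.upper()


s = Session()
s.context = "ready"
print(s.use_runtime_assert(), s.use_checker_only_assert())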
scrapling/engines/_browsers/_camoufox.py
CHANGED
@@ -26,6 +26,7 @@ from scrapling.core._types import (
     List,
     Optional,
     Callable,
+    TYPE_CHECKING,
     SelectorWaitStates,
 )
 from scrapling.engines.toolbelt.convertor import (
@@ -205,7 +206,7 @@ class StealthySession(StealthySessionMixin, SyncSession):
         self._closed = True

     @staticmethod
-    def _get_page_content(page: Page) -> str…
+    def _get_page_content(page: Page) -> str:
         """
         A workaround for Playwright issue with `page.content()` on Windows. Ref.: https://github.com/microsoft/playwright/issues/16108
         :param page: The page to extract content from.
@@ -217,6 +218,7 @@ class StealthySession(StealthySessionMixin, SyncSession):
         except PlaywrightError:
             page.wait_for_timeout(1000)
             continue
+        return ""  # pyright: ignore

     def _solve_cloudflare(self, page: Page) -> None:  # pragma: no cover
         """Solve the cloudflare challenge displayed on the playwright page passed
@@ -502,8 +504,8 @@ class AsyncStealthySession(StealthySessionMixin, AsyncSession):

     async def __create__(self):
         """Create a browser for this instance and context."""
-        self.playwright: AsyncPlaywright = await async_playwright().start()
-        self.context: AsyncBrowserContext = await self.playwright.firefox.launch_persistent_context(
+        self.playwright: AsyncPlaywright | None = await async_playwright().start()
+        self.context: AsyncBrowserContext | None = await self.playwright.firefox.launch_persistent_context(
             **self.launch_options
         )
@@ -511,7 +513,7 @@ class AsyncStealthySession(StealthySessionMixin, AsyncSession):
         await self.context.add_init_script(path=self.init_script)

         if self.cookies:
-            await self.context.add_cookies(self.cookies)
+            await self.context.add_cookies(self.cookies)  # pyright: ignore [reportArgumentType]

     async def __aenter__(self):
         await self.__create__()
@@ -536,7 +538,7 @@ class AsyncStealthySession(StealthySessionMixin, AsyncSession):
         self._closed = True

     @staticmethod
-    async def _get_page_content(page: async_Page) -> str…
+    async def _get_page_content(page: async_Page) -> str:
         """
         A workaround for Playwright issue with `page.content()` on Windows. Ref.: https://github.com/microsoft/playwright/issues/16108
         :param page: The page to extract content from.
@@ -548,6 +550,7 @@ class AsyncStealthySession(StealthySessionMixin, AsyncSession):
         except PlaywrightError:
             await page.wait_for_timeout(1000)
             continue
+        return ""  # pyright: ignore

     async def _solve_cloudflare(self, page: async_Page):
         """Solve the cloudflare challenge displayed on the playwright page passed. The async version
@@ -679,6 +682,10 @@ class AsyncStealthySession(StealthySessionMixin, AsyncSession):
         page_info = await self._get_page(params.timeout, params.extra_headers, params.disable_resources)
         page_info.mark_busy(url=url)

+        if TYPE_CHECKING:
+            if not isinstance(page_info.page, async_Page):
+                raise TypeError
+
         try:
             # Navigate to URL and wait for a specified state
             page_info.page.on("response", handle_response)
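The trailing `return ""` exists so that every static path out of the retry loop yields a `str`; in the real loop it is effectively unreachable, hence the `# pyright: ignore`. A simplified, runnable version of the shape — the page call is a stand-in, not Playwright's API:

def get_content(attempts: int = 3) -> str:
    for _ in range(attempts):
        try:
            return "page content"  # stand-in for page.content()
        except RuntimeError:
            continue  # transient failure: retry
    # Gives the function an explicit str return on every static path,
    # so the declared `-> str` holds for the checker.
    return ""


print(get_content())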
scrapling/engines/_browsers/_config_tools.py
CHANGED
@@ -62,7 +62,7 @@ def _set_flags(hide_canvas, disable_webgl):  # pragma: no cover
 @lru_cache(2, typed=True)
 def _launch_kwargs(
     headless,
-    proxy,
+    proxy: Tuple,
     locale,
     extra_headers,
     useragent,
scrapling/engines/_browsers/_controllers.py
CHANGED
@@ -10,6 +10,7 @@ from playwright.async_api import (
     BrowserContext as AsyncBrowserContext,
     Playwright as AsyncPlaywright,
     Locator as AsyncLocator,
+    Page as async_Page,
 )
 from patchright.sync_api import sync_playwright as sync_patchright
 from patchright.async_api import async_playwright as async_patchright
@@ -18,10 +19,12 @@ from scrapling.core.utils import log
 from ._base import SyncSession, AsyncSession, DynamicSessionMixin
 from ._validators import validate_fetch as _validate
 from scrapling.core._types import (
+    Any,
     Dict,
     List,
     Optional,
     Callable,
+    TYPE_CHECKING,
     SelectorWaitStates,
 )
 from scrapling.engines.toolbelt.convertor import (
@@ -30,7 +33,7 @@ from scrapling.engines.toolbelt.convertor import (
 )
 from scrapling.engines.toolbelt.fingerprints import generate_convincing_referer

-_UNSET = object()
+_UNSET: Any = object()


 class DynamicSession(DynamicSessionMixin, SyncSession):
@@ -154,7 +157,7 @@ class DynamicSession(DynamicSessionMixin, SyncSession):
         """Create a browser for this instance and context."""
         sync_context = sync_patchright if self.stealth else sync_playwright

-        self.playwright: Playwright = sync_context().start()
+        self.playwright: Playwright = sync_context().start()  # pyright: ignore [reportAttributeAccessIssue]

         if self.cdp_url:  # pragma: no cover
             self.context = self.playwright.chromium.connect_over_cdp(endpoint_url=self.cdp_url).new_context(
@@ -187,7 +190,7 @@ class DynamicSession(DynamicSessionMixin, SyncSession):

         if self.playwright:
             self.playwright.stop()
-            self.playwright = None
+            self.playwright = None  # pyright: ignore

         self._closed = True

@@ -399,7 +402,7 @@ class AsyncDynamicSession(DynamicSessionMixin, AsyncSession):
         """Create a browser for this instance and context."""
         async_context = async_patchright if self.stealth else async_playwright

-        self.playwright: AsyncPlaywright = await async_context().start()
+        self.playwright: AsyncPlaywright = await async_context().start()  # pyright: ignore [reportAttributeAccessIssue]

         if self.cdp_url:
             browser = await self.playwright.chromium.connect_over_cdp(endpoint_url=self.cdp_url)
@@ -413,7 +416,7 @@ class AsyncDynamicSession(DynamicSessionMixin, AsyncSession):
         await self.context.add_init_script(path=self.init_script)

         if self.cookies:
-            await self.context.add_cookies(self.cookies)
+            await self.context.add_cookies(self.cookies)  # pyright: ignore

     async def __aenter__(self):
         await self.__create__()
@@ -429,11 +432,11 @@ class AsyncDynamicSession(DynamicSessionMixin, AsyncSession):

         if self.context:
             await self.context.close()
-            self.context = None
+            self.context = None  # pyright: ignore

         if self.playwright:
             await self.playwright.stop()
-            self.playwright = None
+            self.playwright = None  # pyright: ignore

         self._closed = True

@@ -506,6 +509,10 @@ class AsyncDynamicSession(DynamicSessionMixin, AsyncSession):
         page_info = await self._get_page(params.timeout, params.extra_headers, params.disable_resources)
         page_info.mark_busy(url=url)

+        if TYPE_CHECKING:
+            if not isinstance(page_info.page, async_Page):
+                raise TypeError
+
         try:
             # Navigate to URL and wait for a specified state
             page_info.page.on("response", handle_response)
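Annotating the sentinel as `Any` is what lets `_UNSET` serve as a default for parameters of any declared type; a bare `object()` would be typed as `object` and rejected wherever a narrower type is expected. A runnable sketch of the sentinel pattern:

from typing import Any

_UNSET: Any = object()  # typed as Any so it can stand in for any parameter


def fetch(url: str, timeout: float = _UNSET) -> str:
    # An `is` identity check distinguishes "not passed" from an explicit
    # value, including falsy ones like 0.
    if timeout is _UNSET:
        timeout = 30.0
    return f"GET {url} (timeout={timeout})"


print(fetch("https://example.com"))
print(fetch("https://example.com", timeout=5))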
scrapling/engines/_browsers/_validators.py
CHANGED
@@ -11,7 +11,10 @@ from scrapling.core._types import (
     Tuple,
     Optional,
     Callable,
+    Iterable,
     SelectorWaitStates,
+    cast,
+    overload,
 )
 from scrapling.engines.toolbelt.navigation import construct_proxy_dict

@@ -73,7 +76,7 @@ class PlaywrightConfig(Struct, kw_only=True, frozen=False):
     stealth: bool = False
     wait: Seconds = 0
     page_action: Optional[Callable] = None
-    proxy: Optional[str | Dict[str, str]] = None  # The default value for proxy in Playwright's source is `None`
+    proxy: Optional[str | Dict[str, str] | Tuple] = None  # The default value for proxy in Playwright's source is `None`
     locale: str = "en-US"
     extra_headers: Optional[Dict[str, str]] = None
     useragent: Optional[str] = None
@@ -81,11 +84,11 @@ class PlaywrightConfig(Struct, kw_only=True, frozen=False):
     init_script: Optional[str] = None
     disable_resources: bool = False
     wait_selector: Optional[str] = None
-    cookies: Optional[…
+    cookies: Optional[Iterable[Dict]] = None
     network_idle: bool = False
     load_dom: bool = True
     wait_selector_state: SelectorWaitStates = "attached"
-    selector_config: Optional[Dict] = …
+    selector_config: Optional[Dict] = {}

     def __post_init__(self):
         """Custom validation after msgspec validation"""
@@ -125,15 +128,15 @@ class CamoufoxConfig(Struct, kw_only=True, frozen=False):
     wait_selector: Optional[str] = None
     addons: Optional[List[str]] = None
     wait_selector_state: SelectorWaitStates = "attached"
-    cookies: Optional[…
+    cookies: Optional[Iterable[Dict]] = None
     google_search: bool = True
     extra_headers: Optional[Dict[str, str]] = None
-    proxy: Optional[str | Dict[str, str]] = None  # The default value for proxy in Playwright's source is `None`
+    proxy: Optional[str | Dict[str, str] | Tuple] = None  # The default value for proxy in Playwright's source is `None`
     os_randomize: bool = False
     disable_ads: bool = False
     geoip: bool = False
-    selector_config: Optional[Dict] = …
-    additional_args: Optional[Dict] = …
+    selector_config: Optional[Dict] = {}
+    additional_args: Optional[Dict] = {}

     def __post_init__(self):
         """Custom validation after msgspec validation"""
@@ -177,7 +180,7 @@ class FetchConfig(Struct, kw_only=True):
     network_idle: bool = False
     load_dom: bool = True
     solve_cloudflare: bool = False
-    selector_config: …
+    selector_config: Dict = {}

     def to_dict(self):
         return {f: getattr(self, f) for f in self.__struct_fields__}
@@ -198,7 +201,7 @@ class _fetch_params:
     network_idle: bool
     load_dom: bool
     solve_cloudflare: bool
-    selector_config: …
+    selector_config: Dict


 def validate_fetch(params: List[Tuple], sentinel=None) -> _fetch_params:
@@ -212,7 +215,7 @@ def validate_fetch(params: List[Tuple], sentinel=None) -> _fetch_params:
         result[arg] = session_value

     if overrides:
-        overrides = validate(overrides, FetchConfig).to_dict()
+        overrides = cast(FetchConfig, validate(overrides, FetchConfig)).to_dict()
         overrides.update(result)
         return _fetch_params(**overrides)

@@ -222,7 +225,21 @@ def validate_fetch(params: List[Tuple], sentinel=None) -> _fetch_params:
     return _fetch_params(**result)


-…
+@overload
+def validate(params: Dict, model: type[PlaywrightConfig]) -> PlaywrightConfig: ...
+
+
+@overload
+def validate(params: Dict, model: type[CamoufoxConfig]) -> CamoufoxConfig: ...
+
+
+@overload
+def validate(params: Dict, model: type[FetchConfig]) -> FetchConfig: ...
+
+
+def validate(
+    params: Dict, model: type[PlaywrightConfig] | type[CamoufoxConfig] | type[FetchConfig]
+) -> PlaywrightConfig | CamoufoxConfig | FetchConfig:
     try:
         return convert(params, model)
     except ValidationError as e:
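The `validate` overloads restore the model-in, instance-out relationship that a single union-returning signature erases, so callers get back the concrete config type. A self-contained sketch using dataclasses in place of msgspec Structs:

from dataclasses import dataclass
from typing import Dict, overload


@dataclass
class BrowserConfig:
    headless: bool = True


@dataclass
class FetchConfig:
    timeout: float = 30.0


@overload
def validate(params: Dict, model: type[BrowserConfig]) -> BrowserConfig: ...
@overload
def validate(params: Dict, model: type[FetchConfig]) -> FetchConfig: ...


def validate(
    params: Dict, model: type[BrowserConfig] | type[FetchConfig]
) -> BrowserConfig | FetchConfig:
    # Stand-in for msgspec.convert(params, model) plus its error handling.
    return model(**params)


cfg = validate({"headless": False}, BrowserConfig)
print(cfg.headless)  # Pyright knows cfg is BrowserConfig, not a union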
scrapling/engines/static.py
CHANGED
@@ -182,7 +182,7 @@ class FetcherSession:

         return headers

-    def __enter__(self):
+    def __enter__(self) -> "FetcherClient":
         """Creates and returns a new synchronous Fetcher Session"""
         if self._curl_session:
             raise RuntimeError(
@@ -197,7 +197,7 @@ class FetcherSession:
         )

         self._curl_session = CurlSession()
-        return self
+        return cast("FetcherClient", self)

     def __exit__(self, exc_type, exc_val, exc_tb):
         """Closes the active synchronous session managed by this instance, if any."""
@@ -205,7 +205,7 @@ class FetcherSession:
         self._curl_session.close()
         self._curl_session = None

-    async def __aenter__(self):
+    async def __aenter__(self) -> "AsyncFetcherClient":
         """Creates and returns a new asynchronous Session."""
         if self._async_curl_session:
             raise RuntimeError(
@@ -220,7 +220,7 @@ class FetcherSession:
         )

         self._async_curl_session = AsyncCurlSession()
-        return cast("AsyncFetcherClient", self)

     async def __aexit__(self, exc_type, exc_val, exc_tb):
         """Closes the active asynchronous session managed by this instance, if any."""
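Typing `__enter__` (and `__aenter__`) as returning a narrower client type, with a `cast` on `self`, controls what API the `with` target exposes to the checker even though one class backs both modes. A runnable sketch with invented class names:

from typing import cast


class SyncClient:
    def get(self, url: str) -> str:
        return f"GET {url}"


class Session(SyncClient):
    # The declared return type shapes what Pyright lets callers do with
    # the value bound by `with ... as s`.
    def __enter__(self) -> "SyncClient":
        return cast("SyncClient", self)

    def __exit__(self, exc_type, exc_val, exc_tb) -> None:
        pass


with Session() as s:
    print(s.get("https://example.com"))  # s is typed as SyncClient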
scrapling/engines/toolbelt/convertor.py
CHANGED
@@ -58,7 +58,8 @@ class ResponseFactory:
                 "encoding": cls.__extract_browser_encoding(
                     current_response.headers.get("content-type", "")
                 )
-                …
+                if current_response
+                else "utf-8",
                 "cookies": tuple(),
                 "headers": current_response.all_headers() if current_response else {},
                 "request_headers": current_request.all_headers(),
@@ -161,7 +162,8 @@ class ResponseFactory:
                 "encoding": cls.__extract_browser_encoding(
                     current_response.headers.get("content-type", "")
                 )
-                …
+                if current_response
+                else "utf-8",
                 "cookies": tuple(),
                 "headers": await current_response.all_headers() if current_response else {},
                 "request_headers": await current_request.all_headers(),
@@ -255,8 +257,8 @@ class ResponseFactory:
                 "encoding": response.encoding or "utf-8",
                 "cookies": dict(response.cookies),
                 "headers": dict(response.headers),
-                "request_headers": dict(response.request.headers),
-                "method": response.request.method,
+                "request_headers": dict(response.request.headers) if response.request else {},
+                "method": response.request.method if response.request else "GET",
                 "history": response.history,  # https://github.com/lexiforest/curl_cffi/issues/82
                 **parser_arguments,
             }
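The inline conditionals narrow the `Optional` request object at each access instead of risking an `AttributeError` when it is `None`. The same shape in isolation — the stub classes below are illustrative, not curl_cffi's:

from typing import Optional


class Request:
    def __init__(self, method: str) -> None:
        self.method = method
        self.headers = {"user-agent": "demo"}


class Response:
    def __init__(self, request: Optional[Request]) -> None:
        self.request = request


def summarize(response: Response) -> dict:
    return {
        # Conditional expressions narrow the Optional on the spot,
        # with explicit fallbacks for the None case.
        "request_headers": dict(response.request.headers) if response.request else {},
        "method": response.request.method if response.request else "GET",
    }


print(summarize(Response(Request("POST"))))
print(summarize(Response(None)))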
scrapling/engines/toolbelt/custom.py
CHANGED
@@ -8,6 +8,7 @@ from scrapling.core.utils import log
 from scrapling.core._types import (
     Any,
     Dict,
+    cast,
     List,
     Optional,
     Tuple,
@@ -30,10 +31,10 @@ class Response(Selector):
         request_headers: Dict,
         encoding: str = "utf-8",
         method: str = "GET",
-        history: List = None,
-        **selector_config: …
+        history: List | None = None,
+        **selector_config: Any,
     ):
-        adaptive_domain = selector_config.pop("adaptive_domain", …
+        adaptive_domain: str = cast(str, selector_config.pop("adaptive_domain", ""))
         self.status = status
         self.reason = reason
         self.cookies = cookies
@@ -58,7 +59,7 @@ class BaseFetcher:
     keep_cdata: Optional[bool] = False
     storage_args: Optional[Dict] = None
     keep_comments: Optional[bool] = False
-    adaptive_domain: …
+    adaptive_domain: str = ""
     parser_keywords: Tuple = (
         "huge_tree",
         "adaptive",
@@ -124,12 +125,8 @@ class BaseFetcher:
             adaptive=cls.adaptive,
             storage=cls.storage,
             storage_args=cls.storage_args,
+            adaptive_domain=cls.adaptive_domain,
         )
-        if cls.adaptive_domain:
-            if not isinstance(cls.adaptive_domain, str):
-                log.warning('[Ignored] The argument "adaptive_domain" must be of string type')
-            else:
-                parser_arguments.update({"adaptive_domain": cls.adaptive_domain})

         return parser_arguments
scrapling/engines/toolbelt/fingerprints.py
CHANGED
@@ -8,9 +8,10 @@ from platform import system as platform_system
 from tldextract import extract
 from browserforge.headers import Browser, HeaderGenerator

-from scrapling.core._types import Dict, …
+from scrapling.core._types import Dict, Literal

 __OS_NAME__ = platform_system()
+OSName = Literal["linux", "macos", "windows"]


 @lru_cache(10, typed=True)
@@ -28,16 +29,20 @@ def generate_convincing_referer(url: str) -> str:


 @lru_cache(1, typed=True)
-def get_os_name() -> …
-    """Get the current OS name in the same format needed for browserforge…
+def get_os_name() -> OSName | None:
+    """Get the current OS name in the same format needed for browserforge, if the OS is Unknown, return None so browserforge uses all.

     :return: Current OS name or `None` otherwise
     """
-    …
-        "Linux": …
-    …
+    match __OS_NAME__:
+        case "Linux":
+            return "linux"
+        case "Darwin":
+            return "macos"
+        case "Windows":
+            return "windows"
+        case _:
+            return None


 def generate_headers(browser_mode: bool = False) -> Dict:
@@ -58,8 +63,10 @@ def generate_headers(browser_mode: bool = False) -> Dict:
             Browser(name="edge", min_version=130),
         ]
     )
-    …
+    if os_name:
+        return HeaderGenerator(browser=browsers, os=os_name, device="desktop").generate()
+    else:
+        return HeaderGenerator(browser=browsers, device="desktop").generate()


 __default_useragent__ = generate_headers(browser_mode=False).get("User-Agent")
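Returning the `Literal` union rather than a bare `str` is what lets downstream APIs that accept only those exact strings typecheck; each `match` arm returns a value Pyright can prove is a member of the union. A reduced, runnable sketch:

from platform import system
from typing import Literal, Optional

OSName = Literal["linux", "macos", "windows"]


def get_os_name() -> Optional[OSName]:
    # Each case returns a value provably inside OSName.
    match system():
        case "Linux":
            return "linux"
        case "Darwin":
            return "macos"
        case "Windows":
            return "windows"
        case _:
            return None  # unknown platform: let the caller fall back


print(get_os_name())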
scrapling/engines/toolbelt/navigation.py
CHANGED

@@ -11,7 +11,7 @@ from msgspec import Struct, structs, convert, ValidationError
 from playwright.sync_api import Route

 from scrapling.core.utils import log
-from scrapling.core._types import Dict, …
+from scrapling.core._types import Dict, Tuple, overload, Literal
 from scrapling.engines.constants import DEFAULT_DISABLED_RESOURCES

 __BYPASSES_DIR__ = Path(__file__).parent / "bypasses"
@@ -49,7 +49,15 @@ async def async_intercept_route(route: async_Route):
     await route.continue_()


-def construct_proxy_dict(proxy_string: str | Dict[str, str], as_tuple=False) -> …
+@overload
+def construct_proxy_dict(proxy_string: str | Dict[str, str] | Tuple, as_tuple: Literal[True]) -> Tuple: ...
+
+
+@overload
+def construct_proxy_dict(proxy_string: str | Dict[str, str] | Tuple, as_tuple: Literal[False] = False) -> Dict: ...
+
+
+def construct_proxy_dict(proxy_string: str | Dict[str, str] | Tuple, as_tuple: bool = False) -> Dict | Tuple:
     """Validate a proxy and return it in the acceptable format for Playwright
     Reference: https://playwright.dev/python/docs/network#http-proxy

@@ -83,7 +91,7 @@ def construct_proxy_dict(proxy_string: str | Dict[str, str], as_tuple=False) ->
     except ValidationError as e:
         raise TypeError(f"Invalid proxy dictionary: {e}")

-    …
+    raise TypeError(f"Invalid proxy string: {proxy_string}")


 @lru_cache(10, typed=True)
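The two `@overload` stubs tie the runtime flag `as_tuple` to the return type, so callers see `Tuple` or `Dict` rather than the imprecise union, and the new unconditional `raise` at the end guarantees to Pyright that no code path falls off the function returning `None`. A self-contained sketch of the flag-to-return-type pattern (function and keys are hypothetical):

from typing import Dict, Literal, Tuple, overload

@overload
def parse(value: str, as_tuple: Literal[True]) -> Tuple: ...
@overload
def parse(value: str, as_tuple: Literal[False] = False) -> Dict: ...
def parse(value: str, as_tuple: bool = False) -> Dict | Tuple:
    # The overload stubs exist only for the type checker; this body runs.
    if value.startswith("http"):
        result = {"server": value}
        return tuple(result.items()) if as_tuple else result
    raise TypeError(f"Invalid proxy string: {value}")

pair = parse("http://127.0.0.1:8080", as_tuple=True)  # checker infers Tuple
mapping = parse("http://127.0.0.1:8080")              # checker infers Dict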
scrapling/fetchers/__init__.py
CHANGED

@@ -19,7 +19,17 @@ _LAZY_IMPORTS = {
     "AsyncStealthySession": ("scrapling.fetchers.firefox", "AsyncStealthySession"),
 }

-__all__ = […]
+__all__ = [
+    "Fetcher",
+    "AsyncFetcher",
+    "FetcherSession",
+    "DynamicFetcher",
+    "DynamicSession",
+    "AsyncDynamicSession",
+    "StealthyFetcher",
+    "StealthySession",
+    "AsyncStealthySession",
+]


 def __getattr__(name: str) -> Any:
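An explicit multi-line `__all__` matters here because the fetcher classes are materialized lazily through the module-level `__getattr__` (PEP 562); without the list, static tools cannot see what the package exports. A minimal sketch of the whole pattern (the module path below is a placeholder, not Scrapling's layout):

from importlib import import_module
from typing import Any

_LAZY_IMPORTS = {
    "Fetcher": ("mypackage.fetchers.http", "Fetcher"),  # placeholder path
}
__all__ = ["Fetcher"]

def __getattr__(name: str) -> Any:
    # Invoked only for names not already present in the module's globals,
    # so each class is imported on first attribute access.
    if name in _LAZY_IMPORTS:
        module_path, attribute = _LAZY_IMPORTS[name]
        return getattr(import_module(module_path), attribute)
    raise AttributeError(f"module {__name__!r} has no attribute {name!r}")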
scrapling/fetchers/chrome.py
CHANGED

@@ -1,10 +1,9 @@
 from scrapling.core._types import (
     Callable,
-    Dict,
     List,
+    Dict,
     Optional,
     SelectorWaitStates,
-    Iterable,
 )
 from scrapling.engines.toolbelt.custom import BaseFetcher, Response
 from scrapling.engines._browsers._controllers import DynamicSession, AsyncDynamicSession
@@ -47,7 +46,7 @@ class DynamicFetcher(BaseFetcher):
         disable_resources: bool = False,
         wait_selector: Optional[str] = None,
         init_script: Optional[str] = None,
-        cookies: Optional[…
+        cookies: Optional[List[Dict]] = None,
         network_idle: bool = False,
         load_dom: bool = True,
         wait_selector_state: SelectorWaitStates = "attached",
@@ -134,7 +133,7 @@ class DynamicFetcher(BaseFetcher):
         disable_resources: bool = False,
         wait_selector: Optional[str] = None,
         init_script: Optional[str] = None,
-        cookies: Optional[…
+        cookies: Optional[List[Dict]] = None,
         network_idle: bool = False,
         load_dom: bool = True,
         wait_selector_state: SelectorWaitStates = "attached",
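Annotating `cookies` as `Optional[List[Dict]]` documents the expected shape (a list of cookie dicts) and gives Pyright something concrete to verify at call sites. A hedged sketch of a conforming argument (the `fetch` function below is illustrative only):

from typing import Dict, List, Optional

def fetch(url: str, cookies: Optional[List[Dict]] = None) -> None:
    # `cookies or []` keeps iteration safe when the default None is passed.
    for cookie in cookies or []:
        print(cookie["name"], "=", cookie["value"])

fetch(
    "https://example.com",
    cookies=[{"name": "session", "value": "abc123", "domain": "example.com"}],
)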
scrapling/fetchers/firefox.py
CHANGED

@@ -83,8 +83,6 @@ class StealthyFetcher(BaseFetcher):
         """
         if not custom_config:
             custom_config = {}
-        elif not isinstance(custom_config, dict):
-            ValueError(f"The custom parser config must be of type dictionary, got {cls.__class__}")

         with StealthySession(
             wait=wait,
@@ -182,8 +180,6 @@ class StealthyFetcher(BaseFetcher):
         """
         if not custom_config:
             custom_config = {}
-        elif not isinstance(custom_config, dict):
-            ValueError(f"The custom parser config must be of type dictionary, got {cls.__class__}")

         async with AsyncStealthySession(
             wait=wait,
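The deleted `elif` branch was dead code twice over: it constructed a `ValueError` without `raise`, so the check silently did nothing, and its message formatted `cls.__class__` (the metaclass) rather than the offending value. A tiny reproduction of the missing-`raise` bug:

def check(custom_config) -> None:
    if not isinstance(custom_config, dict):
        # Bug: the exception object is created and immediately discarded;
        # without `raise`, this line is a no-op.
        ValueError("The custom parser config must be of type dictionary")

check("not-a-dict")  # completes silently instead of failing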
scrapling/parser.py
CHANGED

@@ -17,17 +17,21 @@ from lxml.etree import (

 from scrapling.core._types import (
     Any,
+    Set,
     Dict,
+    cast,
     List,
     Tuple,
     Union,
     Pattern,
     Callable,
+    Literal,
     Optional,
     Iterable,
     overload,
     Generator,
     SupportsIndex,
+    TYPE_CHECKING,
 )
 from scrapling.core.custom_types import AttributesHandler, TextHandler, TextHandlers
 from scrapling.core.mixins import SelectorsGeneration
@@ -36,7 +40,7 @@ from scrapling.core.storage import (
     StorageSystemMixin,
     _StorageTools,
 )
-from scrapling.core.translator import …
+from scrapling.core.translator import css_to_xpath as _css_to_xpath
 from scrapling.core.utils import clean_spaces, flatten, html_forbidden, log

 __DEFAULT_DB_FILE__ = str(Path(__file__).parent / "elements_storage.db")
@@ -70,20 +74,23 @@ class Selector(SelectorsGeneration):
         "_raw_body",
     )

+    if TYPE_CHECKING:
+        _storage: StorageSystemMixin
+
     def __init__(
         self,
         content: Optional[str | bytes] = None,
-        url: …
+        url: str = "",
         encoding: str = "utf-8",
         huge_tree: bool = True,
         root: Optional[HtmlElement] = None,
         keep_comments: Optional[bool] = False,
         keep_cdata: Optional[bool] = False,
         adaptive: Optional[bool] = False,
-        _storage: …
+        _storage: Optional[StorageSystemMixin] = None,
         storage: Any = SQLiteStorageSystem,
         storage_args: Optional[Dict] = None,
-        **…
+        **_,
     ):
         """The main class that works as a wrapper for the HTML input data. Using this class, you can search for elements
         with expressions in CSS, XPath, or with simply text. Check the docs for more info.
@@ -131,7 +138,7 @@ class Selector(SelectorsGeneration):
                 default_doctype=True,
                 strip_cdata=(not keep_cdata),
             )
-            self._root = fromstring(body, parser=parser, base_url=url)
+            self._root = cast(HtmlElement, fromstring(body, parser=parser, base_url=url or None))
            self._raw_body = content

         else:
@@ -141,7 +148,7 @@ class Selector(SelectorsGeneration):
                     f"Root have to be a valid element of `html` module types to work, not of type {type(root)}"
                 )

-            self._root = root
+            self._root = cast(HtmlElement, root)
             self._raw_body = ""

         self.__adaptive_enabled = adaptive
@@ -238,6 +245,9 @@ class Selector(SelectorsGeneration):
             **self.__response_data,
         )

+    def __elements_convertor(self, elements: List[HtmlElement]) -> "Selectors":
+        return Selectors(map(self.__element_convertor, elements))
+
     def __handle_element(
         self, element: Optional[HtmlElement | _ElementUnicodeResult]
     ) -> Optional[Union[TextHandler, "Selector"]]:
@@ -262,7 +272,7 @@ class Selector(SelectorsGeneration):
         if self._is_text_node(result[0]):
             return TextHandlers(map(TextHandler, result))

-        return …
+        return self.__elements_convertor(result)

     def __getstate__(self) -> Any:
         # lxml don't like it :)
@@ -323,7 +333,7 @@ class Selector(SelectorsGeneration):
             if not valid_values or processed_text.strip():
                 _all_strings.append(processed_text)

-        return TextHandler(separator).join(_all_strings)
+        return cast(TextHandler, TextHandler(separator).join(_all_strings))

     def urljoin(self, relative_url: str) -> str:
         """Join this Selector's url with a relative url to form an absolute full URL."""
@@ -372,13 +382,14 @@ class Selector(SelectorsGeneration):
     @property
     def parent(self) -> Optional["Selector"]:
         """Return the direct parent of the element or ``None`` otherwise"""
-        …
+        _parent = self._root.getparent()
+        return self.__element_convertor(_parent) if _parent is not None else None

     @property
     def below_elements(self) -> "Selectors":
         """Return all elements under the current element in the DOM tree"""
         below = _find_all_elements(self._root)
-        return self.…
+        return self.__elements_convertor(below) if below is not None else Selectors()

     @property
     def children(self) -> "Selectors":
@@ -425,7 +436,7 @@ class Selector(SelectorsGeneration):
             # Ignore HTML comments and unwanted types
             next_element = next_element.getnext()

-        return self.…
+        return self.__element_convertor(next_element) if next_element is not None else None

     @property
     def previous(self) -> Optional["Selector"]:
@@ -435,10 +446,10 @@ class Selector(SelectorsGeneration):
             # Ignore HTML comments and unwanted types
             prev_element = prev_element.getprevious()

-        return self.…
+        return self.__element_convertor(prev_element) if prev_element is not None else None

     # For easy copy-paste from Scrapy/parsel code when needed :)
-    def get(self, default=None):
+    def get(self, default=None):  # pyright: ignore
         return self

     def get_all(self):
@@ -468,6 +479,16 @@ class Selector(SelectorsGeneration):
         return data + ">"

     # From here we start with the selecting functions
+    @overload
+    def relocate(
+        self, element: Union[Dict, HtmlElement, "Selector"], percentage: int, selector_type: Literal[True]
+    ) -> "Selectors": ...
+
+    @overload
+    def relocate(
+        self, element: Union[Dict, HtmlElement, "Selector"], percentage: int, selector_type: Literal[False] = False
+    ) -> List[HtmlElement]: ...
+
     def relocate(
         self,
         element: Union[Dict, HtmlElement, "Selector"],
@@ -506,11 +527,11 @@ class Selector(SelectorsGeneration):
             log.debug(f"Highest probability was {highest_probability}%")
             log.debug("Top 5 best matching elements are: ")
             for percent in tuple(sorted(score_table.keys(), reverse=True))[:5]:
-                log.debug(f"{percent} -> {self.…
+                log.debug(f"{percent} -> {self.__elements_convertor(score_table[percent])}")

             if not selector_type:
                 return score_table[highest_probability]
-            return self.…
+            return self.__elements_convertor(score_table[highest_probability])
         return []

     def css_first(
@@ -593,7 +614,7 @@ class Selector(SelectorsGeneration):
         auto_save: bool = False,
         percentage: int = 0,
         **kwargs: Any,
-    ) -> Union["Selectors", List, "TextHandlers"]:
+    ) -> Union["Selectors", List[Any], "TextHandlers"]:
         """Search the current tree with CSS3 selectors

         **Important:
@@ -614,7 +635,7 @@ class Selector(SelectorsGeneration):
         try:
             if not self.__adaptive_enabled or "," not in selector:
                 # No need to split selectors in this case, let's save some CPU cycles :)
-                xpath_selector = …
+                xpath_selector = _css_to_xpath(selector)
                 return self.xpath(
                     xpath_selector,
                     identifier or selector,
@@ -628,7 +649,7 @@ class Selector(SelectorsGeneration):
             for single_selector in split_selectors(selector):
                 # I'm doing this only so the `save` function saves data correctly for combined selectors
                 # Like using the ',' to combine two different selectors that point to different elements.
-                xpath_selector = …
+                xpath_selector = _css_to_xpath(single_selector.canonical())
                 results += self.xpath(
                     xpath_selector,
                     identifier or single_selector.canonical(),
@@ -731,7 +752,8 @@ class Selector(SelectorsGeneration):
             raise TypeError("You have to pass something to search with, like tag name(s), tag attributes, or both.")

         attributes = dict()
-        tags…
+        tags: Set[str] = set()
+        patterns: Set[Pattern] = set()
         results, functions, selectors = Selectors(), [], []

         # Brace yourself for a wonderful journey!
@@ -740,6 +762,7 @@ class Selector(SelectorsGeneration):
                 tags.add(arg)

             elif type(arg) in (list, tuple, set):
+                arg = cast(Iterable, arg)  # Type narrowing for type checkers like pyright
                 if not all(map(lambda x: isinstance(x, str), arg)):
                     raise TypeError("Nested Iterables are not accepted, only iterables of tag names are accepted")
                 tags.update(set(arg))
@@ -774,7 +797,7 @@ class Selector(SelectorsGeneration):
                 attributes[attribute_name] = value

         # It's easier and faster to build a selector than traversing the tree
-        tags = tags or …
+        tags = tags or set("*")
         for tag in tags:
             selector = tag
             for key, value in attributes.items():
@@ -785,7 +808,7 @@ class Selector(SelectorsGeneration):
             selectors.append(selector)

         if selectors:
-            results = self.css(", ".join(selectors))
+            results = cast(Selectors, self.css(", ".join(selectors)))
             if results:
                 # From the results, get the ones that fulfill passed regex patterns
                 for pattern in patterns:
@@ -828,20 +851,20 @@ class Selector(SelectorsGeneration):
         :return: A percentage score of how similar is the candidate to the original element
         """
         score, checks = 0, 0
-        …
+        data = _StorageTools.element_to_dict(candidate)

         # Possible TODO:
         # Study the idea of giving weight to each test below so some are more important than others
         # Current results: With weights some websites had better score while it was worse for others
-        score += 1 if original["tag"] == …
+        score += 1 if original["tag"] == data["tag"] else 0  # * 0.3 # 30%
         checks += 1

         if original["text"]:
-            score += SequenceMatcher(None, original["text"], …
+            score += SequenceMatcher(None, original["text"], data.get("text") or "").ratio()  # * 0.3 # 30%
             checks += 1

         # if both don't have attributes, it still counts for something!
-        score += self.__calculate_dict_diff(original["attributes"], …
+        score += self.__calculate_dict_diff(original["attributes"], data["attributes"])  # * 0.3 # 30%
         checks += 1

         # Separate similarity test for class, id, href,... this will help in full structural changes
@@ -855,23 +878,23 @@ class Selector(SelectorsGeneration):
             score += SequenceMatcher(
                 None,
                 original["attributes"][attrib],
-                …
+                data["attributes"].get(attrib) or "",
             ).ratio()  # * 0.3 # 30%
             checks += 1

-        score += SequenceMatcher(None, original["path"], …
+        score += SequenceMatcher(None, original["path"], data["path"]).ratio()  # * 0.1 # 10%
         checks += 1

         if original.get("parent_name"):
             # Then we start comparing parents' data
-            if …
+            if data.get("parent_name"):
                 score += SequenceMatcher(
-                    None, original["parent_name"], …
+                    None, original["parent_name"], data.get("parent_name") or ""
                 ).ratio()  # * 0.2 # 20%
                 checks += 1

                 score += self.__calculate_dict_diff(
-                    original["parent_attribs"], …
+                    original["parent_attribs"], data.get("parent_attribs") or {}
                 )  # * 0.2 # 20%
                 checks += 1
@@ -879,7 +902,7 @@ class Selector(SelectorsGeneration):
                 score += SequenceMatcher(
                     None,
                     original["parent_text"],
-                    …
+                    data.get("parent_text") or "",
                 ).ratio()  # * 0.1 # 10%
                 checks += 1
             # else:
@@ -887,9 +910,7 @@ class Selector(SelectorsGeneration):
             #     score -= 0.1

         if original.get("siblings"):
-            score += SequenceMatcher(
-                None, original["siblings"], candidate.get("siblings") or []
-            ).ratio()  # * 0.1 # 10%
+            score += SequenceMatcher(None, original["siblings"], data.get("siblings") or []).ratio()  # * 0.1 # 10%
             checks += 1

         # How % sure? let's see
@@ -902,7 +923,7 @@ class Selector(SelectorsGeneration):
         score += SequenceMatcher(None, tuple(dict1.values()), tuple(dict2.values())).ratio() * 0.5
         return score

-    def save(self, element: …
+    def save(self, element: HtmlElement, identifier: str) -> None:
         """Saves the element's unique properties to the storage for retrieval and relocation later

         :param element: The element itself that we want to save to storage, it can be a ` Selector ` or pure ` HtmlElement `
@@ -910,15 +931,16 @@ class Selector(SelectorsGeneration):
            the docs for more info.
         """
         if self.__adaptive_enabled:
-            …
-            …
+            target = element
+            if isinstance(target, self.__class__):
+                target: HtmlElement = target._root

-            if self._is_text_node(…
-                …
+            if self._is_text_node(target):
+                target: HtmlElement = target.getparent()

-            self._storage.save(…
+            self._storage.save(target, identifier)
         else:
-            …
+            raise RuntimeError(
                 "Can't use `adaptive` features while it's disabled globally, you have to start a new class instance."
             )
@@ -932,10 +954,9 @@ class Selector(SelectorsGeneration):
         if self.__adaptive_enabled:
             return self._storage.retrieve(identifier)

-        …
+        raise RuntimeError(
             "Can't use `adaptive` features while it's disabled globally, you have to start a new class instance."
         )
-        return None

     # Operations on text functions
     def json(self) -> Dict:
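Two recurring moves in the hunks above deserve a note. Declaring `_storage: StorageSystemMixin` under `if TYPE_CHECKING:` tells Pyright the attribute's final type even though it is assigned dynamically in `__init__`, and replacing the old warn-and-return-`None` paths in `save`/`retrieve` with `raise RuntimeError(...)` removes `None` from the inferred return types. A compact sketch of both ideas together (the classes below are stand-ins, not Scrapling's):

from typing import TYPE_CHECKING, Optional

class Storage:
    def retrieve(self, identifier: str) -> dict:
        return {"identifier": identifier}

class Page:
    if TYPE_CHECKING:
        # Visible to the type checker only; no runtime attribute is created.
        _storage: Storage

    def __init__(self, storage: Optional[Storage] = None, enabled: bool = True):
        self._storage = storage or Storage()
        self._enabled = enabled

    def retrieve(self, identifier: str) -> dict:
        if self._enabled:
            return self._storage.retrieve(identifier)
        # Raising (instead of returning None) keeps the return type `dict`.
        raise RuntimeError("adaptive features are disabled for this instance")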
@@ -1104,28 +1125,30 @@
         if not case_sensitive:
             text = text.lower()

-        …
-            node_text = …
-        …
+        possible_targets = _find_all_elements_with_spaces(self._root)
+        if possible_targets:
+            for node in self.__elements_convertor(possible_targets):
+                """Check if element matches given text otherwise, traverse the children tree and iterate"""
+                node_text = node.text
+                if clean_match:
+                    node_text = node_text.clean()
+
+                if not case_sensitive:
+                    node_text = node_text.lower()
+
+                if partial:
+                    if text in node_text:
+                        results.append(node)
+                elif text == node_text:
                     results.append(node)
-        elif text == node_text:
-            results.append(node)

-        …
+                if first_match and results:
+                    # we got an element so we should stop
+                    break

-        …
+        if first_match:
+            if results:
+                return results[0]
         return results

     def find_by_regex(
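The rewritten `find_by_text` above (and `find_by_regex` below) drop per-node recursion in favor of flattening the tree once and filtering, with an early `break` once `first_match` is satisfied. A rough standalone equivalent using plain lxml, where `iter()` stands in for the module's private `_find_all_elements_with_spaces` helper:

from lxml.html import fromstring

root = fromstring("<div><p>Price</p><span>Price today</span><b>Other</b></div>")

results = []
for node in root.iter():  # flatten the whole subtree once
    node_text = (node.text or "").strip()
    if "Price" in node_text:  # partial=True style matching
        results.append(node)

print([node.tag for node in results])  # ['p', 'span']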
@@ -1143,23 +1166,25 @@
         """
         results = Selectors()

-        …
+        possible_targets = _find_all_elements_with_spaces(self._root)
+        if possible_targets:
+            for node in self.__elements_convertor(possible_targets):
+                """Check if element matches given regex otherwise, traverse the children tree and iterate"""
+                node_text = node.text
+                if node_text.re(
+                    query,
+                    check_match=True,
+                    clean_match=clean_match,
+                    case_sensitive=case_sensitive,
+                ):
+                    results.append(node)

-        …
+                if first_match and results:
+                    # we got an element so we should stop
+                    break

-        …
+        if results and first_match:
+            return results[0]
         return results
@@ -1181,9 +1206,9 @@ class Selectors(List[Selector]):
     def __getitem__(self, pos: SupportsIndex | slice) -> Union[Selector, "Selectors"]:
         lst = super().__getitem__(pos)
         if isinstance(pos, slice):
-            return self.__class__(lst)
+            return self.__class__(cast(List[Selector], lst))
         else:
-            return lst
+            return cast(Selector, lst)

     def xpath(
         self,
@@ -1265,7 +1290,7 @@ class Selectors(List[Selector]):
     def re_first(
         self,
         regex: str | Pattern,
-        default=None,
+        default: Any = None,
         replace_entities: bool = True,
         clean_match: bool = False,
         case_sensitive: bool = True,