Karim shoair committed on
Commit ·
916182a
1
Parent(s): a2a8556
style: A lot of type hint corrections
Browse files
Since we are now using Python 3.10 as the minimum version, we remove Union where possible
- scrapling/core/_html_utils.py +2 -2
- scrapling/core/_types.py +0 -1
- scrapling/core/ai.py +22 -23
- scrapling/core/custom_types.py +31 -40
- scrapling/core/shell.py +2 -3
- scrapling/core/storage.py +7 -7
- scrapling/engines/_browsers/_camoufox.py +11 -12
- scrapling/engines/_browsers/_controllers.py +7 -8
- scrapling/engines/_browsers/_page.py +3 -3
- scrapling/engines/_browsers/_validators.py +7 -8
- scrapling/engines/static.py +25 -26
- scrapling/engines/toolbelt/__init__.py +0 -1
- scrapling/engines/toolbelt/custom.py +1 -51
- scrapling/engines/toolbelt/fingerprints.py +2 -2
- scrapling/engines/toolbelt/navigation.py +3 -3
- scrapling/fetchers.py +16 -17
- scrapling/parser.py +39 -45
scrapling/core/_html_utils.py
CHANGED
|
@@ -6,7 +6,7 @@ Repo source code: https://github.com/scrapy/w3lib/blob/master/w3lib/html.py
|
|
| 6 |
|
| 7 |
from re import compile as _re_compile, IGNORECASE
|
| 8 |
|
| 9 |
-
from scrapling.core._types import Iterable,
|
| 10 |
|
| 11 |
_ent_re = _re_compile(
|
| 12 |
r"&((?P<named>[a-z\d]+)|#(?P<dec>\d+)|#x(?P<hex>[a-f\d]+))(?P<semicolon>;?)",
|
|
@@ -270,7 +270,7 @@ name2codepoint = {
|
|
| 270 |
|
| 271 |
|
| 272 |
def to_unicode(
|
| 273 |
-
text: StrOrBytes, encoding:
|
| 274 |
) -> str:
|
| 275 |
"""Return the Unicode representation of a bytes object `text`. If `text`
|
| 276 |
is already a Unicode object, return it as-is."""
|
|
|
|
| 6 |
|
| 7 |
from re import compile as _re_compile, IGNORECASE
|
| 8 |
|
| 9 |
+
from scrapling.core._types import Iterable, Optional, Match, StrOrBytes
|
| 10 |
|
| 11 |
_ent_re = _re_compile(
|
| 12 |
r"&((?P<named>[a-z\d]+)|#(?P<dec>\d+)|#x(?P<hex>[a-f\d]+))(?P<semicolon>;?)",
|
|
|
|
| 270 |
|
| 271 |
|
| 272 |
def to_unicode(
|
| 273 |
+
text: StrOrBytes, encoding: Optional[str] = None, errors: str = "strict"
|
| 274 |
) -> str:
|
| 275 |
"""Return the Unicode representation of a bytes object `text`. If `text`
|
| 276 |
is already a Unicode object, return it as-is."""
|
scrapling/core/_types.py
CHANGED
|
@@ -16,7 +16,6 @@ from typing import (
|
|
| 16 |
Optional,
|
| 17 |
Pattern,
|
| 18 |
Tuple,
|
| 19 |
-
Type,
|
| 20 |
TypeVar,
|
| 21 |
Union,
|
| 22 |
Match,
|
|
|
|
| 16 |
Optional,
|
| 17 |
Pattern,
|
| 18 |
Tuple,
|
|
|
|
| 19 |
TypeVar,
|
| 20 |
Union,
|
| 21 |
Match,
|
scrapling/core/ai.py
CHANGED
|
@@ -17,7 +17,6 @@ from scrapling.core._types import (
|
|
| 17 |
Optional,
|
| 18 |
Tuple,
|
| 19 |
extraction_types,
|
| 20 |
-
Union,
|
| 21 |
Mapping,
|
| 22 |
Dict,
|
| 23 |
List,
|
|
@@ -61,10 +60,10 @@ class ScraplingMCPServer:
|
|
| 61 |
extraction_type: extraction_types = "markdown",
|
| 62 |
css_selector: Optional[str] = None,
|
| 63 |
main_content_only: bool = True,
|
| 64 |
-
params: Optional[
|
| 65 |
headers: Optional[Mapping[str, Optional[str]]] = None,
|
| 66 |
-
cookies: Optional[
|
| 67 |
-
timeout: Optional[
|
| 68 |
follow_redirects: bool = True,
|
| 69 |
max_redirects: int = 30,
|
| 70 |
retries: Optional[int] = 3,
|
|
@@ -140,10 +139,10 @@ class ScraplingMCPServer:
|
|
| 140 |
extraction_type: extraction_types = "markdown",
|
| 141 |
css_selector: Optional[str] = None,
|
| 142 |
main_content_only: bool = True,
|
| 143 |
-
params: Optional[
|
| 144 |
headers: Optional[Mapping[str, Optional[str]]] = None,
|
| 145 |
-
cookies: Optional[
|
| 146 |
-
timeout: Optional[
|
| 147 |
follow_redirects: bool = True,
|
| 148 |
max_redirects: int = 30,
|
| 149 |
retries: Optional[int] = 3,
|
|
@@ -232,13 +231,13 @@ class ScraplingMCPServer:
|
|
| 232 |
disable_webgl: bool = False,
|
| 233 |
real_chrome: bool = False,
|
| 234 |
stealth: bool = False,
|
| 235 |
-
wait:
|
| 236 |
-
proxy: Optional[
|
| 237 |
locale: str = "en-US",
|
| 238 |
extra_headers: Optional[Dict[str, str]] = None,
|
| 239 |
useragent: Optional[str] = None,
|
| 240 |
cdp_url: Optional[str] = None,
|
| 241 |
-
timeout:
|
| 242 |
disable_resources: bool = False,
|
| 243 |
wait_selector: Optional[str] = None,
|
| 244 |
cookies: Optional[List[Dict]] = None,
|
|
@@ -321,13 +320,13 @@ class ScraplingMCPServer:
|
|
| 321 |
disable_webgl: bool = False,
|
| 322 |
real_chrome: bool = False,
|
| 323 |
stealth: bool = False,
|
| 324 |
-
wait:
|
| 325 |
-
proxy: Optional[
|
| 326 |
locale: str = "en-US",
|
| 327 |
extra_headers: Optional[Dict[str, str]] = None,
|
| 328 |
useragent: Optional[str] = None,
|
| 329 |
cdp_url: Optional[str] = None,
|
| 330 |
-
timeout:
|
| 331 |
disable_resources: bool = False,
|
| 332 |
wait_selector: Optional[str] = None,
|
| 333 |
cookies: Optional[List[Dict]] = None,
|
|
@@ -409,23 +408,23 @@ class ScraplingMCPServer:
|
|
| 409 |
extraction_type: extraction_types = "markdown",
|
| 410 |
css_selector: Optional[str] = None,
|
| 411 |
main_content_only: bool = True,
|
| 412 |
-
headless:
|
| 413 |
block_images: bool = False,
|
| 414 |
disable_resources: bool = False,
|
| 415 |
block_webrtc: bool = False,
|
| 416 |
allow_webgl: bool = True,
|
| 417 |
network_idle: bool = False,
|
| 418 |
-
humanize:
|
| 419 |
solve_cloudflare: bool = False,
|
| 420 |
-
wait:
|
| 421 |
-
timeout:
|
| 422 |
wait_selector: Optional[str] = None,
|
| 423 |
addons: Optional[List[str]] = None,
|
| 424 |
wait_selector_state: SelectorWaitStates = "attached",
|
| 425 |
cookies: Optional[List[Dict]] = None,
|
| 426 |
google_search: bool = True,
|
| 427 |
extra_headers: Optional[Dict[str, str]] = None,
|
| 428 |
-
proxy: Optional[
|
| 429 |
os_randomize: bool = False,
|
| 430 |
disable_ads: bool = False,
|
| 431 |
geoip: bool = False,
|
|
@@ -509,23 +508,23 @@ class ScraplingMCPServer:
|
|
| 509 |
extraction_type: extraction_types = "markdown",
|
| 510 |
css_selector: Optional[str] = None,
|
| 511 |
main_content_only: bool = True,
|
| 512 |
-
headless:
|
| 513 |
block_images: bool = False,
|
| 514 |
disable_resources: bool = False,
|
| 515 |
block_webrtc: bool = False,
|
| 516 |
allow_webgl: bool = True,
|
| 517 |
network_idle: bool = False,
|
| 518 |
-
humanize:
|
| 519 |
solve_cloudflare: bool = False,
|
| 520 |
-
wait:
|
| 521 |
-
timeout:
|
| 522 |
wait_selector: Optional[str] = None,
|
| 523 |
addons: Optional[List[str]] = None,
|
| 524 |
wait_selector_state: SelectorWaitStates = "attached",
|
| 525 |
cookies: Optional[List[Dict]] = None,
|
| 526 |
google_search: bool = True,
|
| 527 |
extra_headers: Optional[Dict[str, str]] = None,
|
| 528 |
-
proxy: Optional[
|
| 529 |
os_randomize: bool = False,
|
| 530 |
disable_ads: bool = False,
|
| 531 |
geoip: bool = False,
|
|
|
|
| 17 |
Optional,
|
| 18 |
Tuple,
|
| 19 |
extraction_types,
|
|
|
|
| 20 |
Mapping,
|
| 21 |
Dict,
|
| 22 |
List,
|
|
|
|
| 60 |
extraction_type: extraction_types = "markdown",
|
| 61 |
css_selector: Optional[str] = None,
|
| 62 |
main_content_only: bool = True,
|
| 63 |
+
params: Optional[Dict | List | Tuple] = None,
|
| 64 |
headers: Optional[Mapping[str, Optional[str]]] = None,
|
| 65 |
+
cookies: Optional[Dict[str, str] | list[tuple[str, str]]] = None,
|
| 66 |
+
timeout: Optional[int | float] = 30,
|
| 67 |
follow_redirects: bool = True,
|
| 68 |
max_redirects: int = 30,
|
| 69 |
retries: Optional[int] = 3,
|
|
|
|
| 139 |
extraction_type: extraction_types = "markdown",
|
| 140 |
css_selector: Optional[str] = None,
|
| 141 |
main_content_only: bool = True,
|
| 142 |
+
params: Optional[Dict | List | Tuple] = None,
|
| 143 |
headers: Optional[Mapping[str, Optional[str]]] = None,
|
| 144 |
+
cookies: Optional[Dict[str, str] | list[tuple[str, str]]] = None,
|
| 145 |
+
timeout: Optional[int | float] = 30,
|
| 146 |
follow_redirects: bool = True,
|
| 147 |
max_redirects: int = 30,
|
| 148 |
retries: Optional[int] = 3,
|
|
|
|
| 231 |
disable_webgl: bool = False,
|
| 232 |
real_chrome: bool = False,
|
| 233 |
stealth: bool = False,
|
| 234 |
+
wait: int | float = 0,
|
| 235 |
+
proxy: Optional[str | Dict[str, str]] = None,
|
| 236 |
locale: str = "en-US",
|
| 237 |
extra_headers: Optional[Dict[str, str]] = None,
|
| 238 |
useragent: Optional[str] = None,
|
| 239 |
cdp_url: Optional[str] = None,
|
| 240 |
+
timeout: int | float = 30000,
|
| 241 |
disable_resources: bool = False,
|
| 242 |
wait_selector: Optional[str] = None,
|
| 243 |
cookies: Optional[List[Dict]] = None,
|
|
|
|
| 320 |
disable_webgl: bool = False,
|
| 321 |
real_chrome: bool = False,
|
| 322 |
stealth: bool = False,
|
| 323 |
+
wait: int | float = 0,
|
| 324 |
+
proxy: Optional[str | Dict[str, str]] = None,
|
| 325 |
locale: str = "en-US",
|
| 326 |
extra_headers: Optional[Dict[str, str]] = None,
|
| 327 |
useragent: Optional[str] = None,
|
| 328 |
cdp_url: Optional[str] = None,
|
| 329 |
+
timeout: int | float = 30000,
|
| 330 |
disable_resources: bool = False,
|
| 331 |
wait_selector: Optional[str] = None,
|
| 332 |
cookies: Optional[List[Dict]] = None,
|
|
|
|
| 408 |
extraction_type: extraction_types = "markdown",
|
| 409 |
css_selector: Optional[str] = None,
|
| 410 |
main_content_only: bool = True,
|
| 411 |
+
headless: bool = True, # noqa: F821
|
| 412 |
block_images: bool = False,
|
| 413 |
disable_resources: bool = False,
|
| 414 |
block_webrtc: bool = False,
|
| 415 |
allow_webgl: bool = True,
|
| 416 |
network_idle: bool = False,
|
| 417 |
+
humanize: bool | float = True,
|
| 418 |
solve_cloudflare: bool = False,
|
| 419 |
+
wait: int | float = 0,
|
| 420 |
+
timeout: int | float = 30000,
|
| 421 |
wait_selector: Optional[str] = None,
|
| 422 |
addons: Optional[List[str]] = None,
|
| 423 |
wait_selector_state: SelectorWaitStates = "attached",
|
| 424 |
cookies: Optional[List[Dict]] = None,
|
| 425 |
google_search: bool = True,
|
| 426 |
extra_headers: Optional[Dict[str, str]] = None,
|
| 427 |
+
proxy: Optional[str | Dict[str, str]] = None,
|
| 428 |
os_randomize: bool = False,
|
| 429 |
disable_ads: bool = False,
|
| 430 |
geoip: bool = False,
|
|
|
|
| 508 |
extraction_type: extraction_types = "markdown",
|
| 509 |
css_selector: Optional[str] = None,
|
| 510 |
main_content_only: bool = True,
|
| 511 |
+
headless: bool = True, # noqa: F821
|
| 512 |
block_images: bool = False,
|
| 513 |
disable_resources: bool = False,
|
| 514 |
block_webrtc: bool = False,
|
| 515 |
allow_webgl: bool = True,
|
| 516 |
network_idle: bool = False,
|
| 517 |
+
humanize: bool | float = True,
|
| 518 |
solve_cloudflare: bool = False,
|
| 519 |
+
wait: int | float = 0,
|
| 520 |
+
timeout: int | float = 30000,
|
| 521 |
wait_selector: Optional[str] = None,
|
| 522 |
addons: Optional[List[str]] = None,
|
| 523 |
wait_selector_state: SelectorWaitStates = "attached",
|
| 524 |
cookies: Optional[List[Dict]] = None,
|
| 525 |
google_search: bool = True,
|
| 526 |
extra_headers: Optional[Dict[str, str]] = None,
|
| 527 |
+
proxy: Optional[str | Dict[str, str]] = None,
|
| 528 |
os_randomize: bool = False,
|
| 529 |
disable_ads: bool = False,
|
| 530 |
geoip: bool = False,
|
scrapling/core/custom_types.py
CHANGED
|
@@ -8,7 +8,6 @@ from scrapling.core._types import (
|
|
| 8 |
cast,
|
| 9 |
Dict,
|
| 10 |
List,
|
| 11 |
-
Union,
|
| 12 |
overload,
|
| 13 |
TypeVar,
|
| 14 |
Literal,
|
|
@@ -34,7 +33,7 @@ class TextHandler(str):
|
|
| 34 |
def __new__(cls, string):
|
| 35 |
return super().__new__(cls, str(string))
|
| 36 |
|
| 37 |
-
def __getitem__(self, key:
|
| 38 |
lst = super().__getitem__(key)
|
| 39 |
return cast(_TextHandlerType, TextHandler(lst))
|
| 40 |
|
|
@@ -46,78 +45,72 @@ class TextHandler(str):
|
|
| 46 |
)
|
| 47 |
)
|
| 48 |
|
| 49 |
-
def strip(self, chars: str = None) ->
|
| 50 |
return TextHandler(super().strip(chars))
|
| 51 |
|
| 52 |
-
def lstrip(self, chars: str = None) ->
|
| 53 |
return TextHandler(super().lstrip(chars))
|
| 54 |
|
| 55 |
-
def rstrip(self, chars: str = None) ->
|
| 56 |
return TextHandler(super().rstrip(chars))
|
| 57 |
|
| 58 |
-
def capitalize(self) ->
|
| 59 |
return TextHandler(super().capitalize())
|
| 60 |
|
| 61 |
-
def casefold(self) ->
|
| 62 |
return TextHandler(super().casefold())
|
| 63 |
|
| 64 |
-
def center(
|
| 65 |
-
self, width: SupportsIndex, fillchar: str = " "
|
| 66 |
-
) -> Union[str, "TextHandler"]:
|
| 67 |
return TextHandler(super().center(width, fillchar))
|
| 68 |
|
| 69 |
-
def expandtabs(self, tabsize: SupportsIndex = 8) ->
|
| 70 |
return TextHandler(super().expandtabs(tabsize))
|
| 71 |
|
| 72 |
-
def format(self, *args: str, **kwargs: str) ->
|
| 73 |
return TextHandler(super().format(*args, **kwargs))
|
| 74 |
|
| 75 |
-
def format_map(self, mapping) ->
|
| 76 |
return TextHandler(super().format_map(mapping))
|
| 77 |
|
| 78 |
-
def join(self, iterable: Iterable[str]) ->
|
| 79 |
return TextHandler(super().join(iterable))
|
| 80 |
|
| 81 |
-
def ljust(
|
| 82 |
-
self, width: SupportsIndex, fillchar: str = " "
|
| 83 |
-
) -> Union[str, "TextHandler"]:
|
| 84 |
return TextHandler(super().ljust(width, fillchar))
|
| 85 |
|
| 86 |
-
def rjust(
|
| 87 |
-
self, width: SupportsIndex, fillchar: str = " "
|
| 88 |
-
) -> Union[str, "TextHandler"]:
|
| 89 |
return TextHandler(super().rjust(width, fillchar))
|
| 90 |
|
| 91 |
-
def swapcase(self) ->
|
| 92 |
return TextHandler(super().swapcase())
|
| 93 |
|
| 94 |
-
def title(self) ->
|
| 95 |
return TextHandler(super().title())
|
| 96 |
|
| 97 |
-
def translate(self, table) ->
|
| 98 |
return TextHandler(super().translate(table))
|
| 99 |
|
| 100 |
-
def zfill(self, width: SupportsIndex) ->
|
| 101 |
return TextHandler(super().zfill(width))
|
| 102 |
|
| 103 |
def replace(
|
| 104 |
self, old: str, new: str, count: SupportsIndex = -1
|
| 105 |
-
) ->
|
| 106 |
return TextHandler(super().replace(old, new, count))
|
| 107 |
|
| 108 |
-
def upper(self) ->
|
| 109 |
return TextHandler(super().upper())
|
| 110 |
|
| 111 |
-
def lower(self) ->
|
| 112 |
return TextHandler(super().lower())
|
| 113 |
|
| 114 |
##############
|
| 115 |
|
| 116 |
-
def sort(self, reverse: bool = False) ->
|
| 117 |
"""Return a sorted version of the string"""
|
| 118 |
return self.__class__("".join(sorted(self, reverse=reverse)))
|
| 119 |
|
| 120 |
-
def clean(self) ->
|
| 121 |
"""Return a new version of the string after removing all white spaces and consecutive spaces"""
|
| 122 |
data = self.translate(__CLEANING_TABLE__)
|
| 123 |
return self.__class__(__CONSECUTIVE_SPACES_REGEX__.sub(" ", data).strip())
|
|
@@ -141,7 +134,7 @@ class TextHandler(str):
|
|
| 141 |
@overload
|
| 142 |
def re(
|
| 143 |
self,
|
| 144 |
-
regex:
|
| 145 |
check_match: Literal[True],
|
| 146 |
replace_entities: bool = True,
|
| 147 |
clean_match: bool = False,
|
|
@@ -151,7 +144,7 @@ class TextHandler(str):
|
|
| 151 |
@overload
|
| 152 |
def re(
|
| 153 |
self,
|
| 154 |
-
regex:
|
| 155 |
replace_entities: bool = True,
|
| 156 |
clean_match: bool = False,
|
| 157 |
case_sensitive: bool = True,
|
|
@@ -160,12 +153,12 @@ class TextHandler(str):
|
|
| 160 |
|
| 161 |
def re(
|
| 162 |
self,
|
| 163 |
-
regex:
|
| 164 |
replace_entities: bool = True,
|
| 165 |
clean_match: bool = False,
|
| 166 |
case_sensitive: bool = True,
|
| 167 |
check_match: bool = False,
|
| 168 |
-
) ->
|
| 169 |
"""Apply the given regex to the current text and return a list of strings with the matches.
|
| 170 |
|
| 171 |
:param regex: Can be either a compiled regular expression or a string.
|
|
@@ -205,7 +198,7 @@ class TextHandler(str):
|
|
| 205 |
|
| 206 |
def re_first(
|
| 207 |
self,
|
| 208 |
-
regex:
|
| 209 |
default=None,
|
| 210 |
replace_entities: bool = True,
|
| 211 |
clean_match: bool = False,
|
|
@@ -244,9 +237,7 @@ class TextHandlers(List[TextHandler]):
|
|
| 244 |
def __getitem__(self, pos: slice) -> "TextHandlers":
|
| 245 |
pass
|
| 246 |
|
| 247 |
-
def __getitem__(
|
| 248 |
-
self, pos: Union[SupportsIndex, slice]
|
| 249 |
-
) -> Union[TextHandler, "TextHandlers"]:
|
| 250 |
lst = super().__getitem__(pos)
|
| 251 |
if isinstance(pos, slice):
|
| 252 |
lst = [TextHandler(s) for s in lst]
|
|
@@ -255,7 +246,7 @@ class TextHandlers(List[TextHandler]):
|
|
| 255 |
|
| 256 |
def re(
|
| 257 |
self,
|
| 258 |
-
regex:
|
| 259 |
replace_entities: bool = True,
|
| 260 |
clean_match: bool = False,
|
| 261 |
case_sensitive: bool = True,
|
|
@@ -275,7 +266,7 @@ class TextHandlers(List[TextHandler]):
|
|
| 275 |
|
| 276 |
def re_first(
|
| 277 |
self,
|
| 278 |
-
regex:
|
| 279 |
default=None,
|
| 280 |
replace_entities: bool = True,
|
| 281 |
clean_match: bool = False,
|
|
@@ -339,7 +330,7 @@ class AttributesHandler(Mapping[str, _TextHandlerType]):
|
|
| 339 |
|
| 340 |
def get(
|
| 341 |
self, key: str, default: Optional[str] = None
|
| 342 |
-
) ->
|
| 343 |
"""Acts like the standard dictionary `.get()` method"""
|
| 344 |
return self._data.get(key, default)
|
| 345 |
|
|
|
|
| 8 |
cast,
|
| 9 |
Dict,
|
| 10 |
List,
|
|
|
|
| 11 |
overload,
|
| 12 |
TypeVar,
|
| 13 |
Literal,
|
|
|
|
| 33 |
def __new__(cls, string):
|
| 34 |
return super().__new__(cls, str(string))
|
| 35 |
|
| 36 |
+
def __getitem__(self, key: SupportsIndex | slice) -> "TextHandler":
|
| 37 |
lst = super().__getitem__(key)
|
| 38 |
return cast(_TextHandlerType, TextHandler(lst))
|
| 39 |
|
|
|
|
| 45 |
)
|
| 46 |
)
|
| 47 |
|
| 48 |
+
def strip(self, chars: str = None) -> str | "TextHandler":
|
| 49 |
return TextHandler(super().strip(chars))
|
| 50 |
|
| 51 |
+
def lstrip(self, chars: str = None) -> str | "TextHandler":
|
| 52 |
return TextHandler(super().lstrip(chars))
|
| 53 |
|
| 54 |
+
def rstrip(self, chars: str = None) -> str | "TextHandler":
|
| 55 |
return TextHandler(super().rstrip(chars))
|
| 56 |
|
| 57 |
+
def capitalize(self) -> str | "TextHandler":
|
| 58 |
return TextHandler(super().capitalize())
|
| 59 |
|
| 60 |
+
def casefold(self) -> str | "TextHandler":
|
| 61 |
return TextHandler(super().casefold())
|
| 62 |
|
| 63 |
+
def center(self, width: SupportsIndex, fillchar: str = " ") -> str | "TextHandler":
|
|
|
|
|
|
|
| 64 |
return TextHandler(super().center(width, fillchar))
|
| 65 |
|
| 66 |
+
def expandtabs(self, tabsize: SupportsIndex = 8) -> str | "TextHandler":
|
| 67 |
return TextHandler(super().expandtabs(tabsize))
|
| 68 |
|
| 69 |
+
def format(self, *args: str, **kwargs: str) -> str | "TextHandler":
|
| 70 |
return TextHandler(super().format(*args, **kwargs))
|
| 71 |
|
| 72 |
+
def format_map(self, mapping) -> str | "TextHandler":
|
| 73 |
return TextHandler(super().format_map(mapping))
|
| 74 |
|
| 75 |
+
def join(self, iterable: Iterable[str]) -> str | "TextHandler":
|
| 76 |
return TextHandler(super().join(iterable))
|
| 77 |
|
| 78 |
+
def ljust(self, width: SupportsIndex, fillchar: str = " ") -> str | "TextHandler":
|
|
|
|
|
|
|
| 79 |
return TextHandler(super().ljust(width, fillchar))
|
| 80 |
|
| 81 |
+
def rjust(self, width: SupportsIndex, fillchar: str = " ") -> str | "TextHandler":
|
|
|
|
|
|
|
| 82 |
return TextHandler(super().rjust(width, fillchar))
|
| 83 |
|
| 84 |
+
def swapcase(self) -> str | "TextHandler":
|
| 85 |
return TextHandler(super().swapcase())
|
| 86 |
|
| 87 |
+
def title(self) -> str | "TextHandler":
|
| 88 |
return TextHandler(super().title())
|
| 89 |
|
| 90 |
+
def translate(self, table) -> str | "TextHandler":
|
| 91 |
return TextHandler(super().translate(table))
|
| 92 |
|
| 93 |
+
def zfill(self, width: SupportsIndex) -> str | "TextHandler":
|
| 94 |
return TextHandler(super().zfill(width))
|
| 95 |
|
| 96 |
def replace(
|
| 97 |
self, old: str, new: str, count: SupportsIndex = -1
|
| 98 |
+
) -> str | "TextHandler":
|
| 99 |
return TextHandler(super().replace(old, new, count))
|
| 100 |
|
| 101 |
+
def upper(self) -> str | "TextHandler":
|
| 102 |
return TextHandler(super().upper())
|
| 103 |
|
| 104 |
+
def lower(self) -> str | "TextHandler":
|
| 105 |
return TextHandler(super().lower())
|
| 106 |
|
| 107 |
##############
|
| 108 |
|
| 109 |
+
def sort(self, reverse: bool = False) -> str | "TextHandler":
|
| 110 |
"""Return a sorted version of the string"""
|
| 111 |
return self.__class__("".join(sorted(self, reverse=reverse)))
|
| 112 |
|
| 113 |
+
def clean(self) -> str | "TextHandler":
|
| 114 |
"""Return a new version of the string after removing all white spaces and consecutive spaces"""
|
| 115 |
data = self.translate(__CLEANING_TABLE__)
|
| 116 |
return self.__class__(__CONSECUTIVE_SPACES_REGEX__.sub(" ", data).strip())
|
|
|
|
| 134 |
@overload
|
| 135 |
def re(
|
| 136 |
self,
|
| 137 |
+
regex: str | Pattern,
|
| 138 |
check_match: Literal[True],
|
| 139 |
replace_entities: bool = True,
|
| 140 |
clean_match: bool = False,
|
|
|
|
| 144 |
@overload
|
| 145 |
def re(
|
| 146 |
self,
|
| 147 |
+
regex: str | Pattern,
|
| 148 |
replace_entities: bool = True,
|
| 149 |
clean_match: bool = False,
|
| 150 |
case_sensitive: bool = True,
|
|
|
|
| 153 |
|
| 154 |
def re(
|
| 155 |
self,
|
| 156 |
+
regex: str | Pattern,
|
| 157 |
replace_entities: bool = True,
|
| 158 |
clean_match: bool = False,
|
| 159 |
case_sensitive: bool = True,
|
| 160 |
check_match: bool = False,
|
| 161 |
+
) -> "TextHandlers" | bool:
|
| 162 |
"""Apply the given regex to the current text and return a list of strings with the matches.
|
| 163 |
|
| 164 |
:param regex: Can be either a compiled regular expression or a string.
|
|
|
|
| 198 |
|
| 199 |
def re_first(
|
| 200 |
self,
|
| 201 |
+
regex: str | Pattern,
|
| 202 |
default=None,
|
| 203 |
replace_entities: bool = True,
|
| 204 |
clean_match: bool = False,
|
|
|
|
| 237 |
def __getitem__(self, pos: slice) -> "TextHandlers":
|
| 238 |
pass
|
| 239 |
|
| 240 |
+
def __getitem__(self, pos: SupportsIndex | slice) -> TextHandler | "TextHandlers":
|
|
|
|
|
|
|
| 241 |
lst = super().__getitem__(pos)
|
| 242 |
if isinstance(pos, slice):
|
| 243 |
lst = [TextHandler(s) for s in lst]
|
|
|
|
| 246 |
|
| 247 |
def re(
|
| 248 |
self,
|
| 249 |
+
regex: str | Pattern,
|
| 250 |
replace_entities: bool = True,
|
| 251 |
clean_match: bool = False,
|
| 252 |
case_sensitive: bool = True,
|
|
|
|
| 266 |
|
| 267 |
def re_first(
|
| 268 |
self,
|
| 269 |
+
regex: str | Pattern,
|
| 270 |
default=None,
|
| 271 |
replace_entities: bool = True,
|
| 272 |
clean_match: bool = False,
|
|
|
|
| 330 |
|
| 331 |
def get(
|
| 332 |
self, key: str, default: Optional[str] = None
|
| 333 |
+
) -> Optional[_TextHandlerType]:
|
| 334 |
"""Acts like the standard dictionary `.get()` method"""
|
| 335 |
return self._data.get(key, default)
|
| 336 |
|
scrapling/core/shell.py
CHANGED
|
@@ -33,7 +33,6 @@ from scrapling.core._types import (
|
|
| 33 |
Dict,
|
| 34 |
Tuple,
|
| 35 |
Any,
|
| 36 |
-
Union,
|
| 37 |
extraction_types,
|
| 38 |
Generator,
|
| 39 |
)
|
|
@@ -254,7 +253,7 @@ class CurlParser:
|
|
| 254 |
|
| 255 |
# --- Process Data Payload ---
|
| 256 |
params = dict()
|
| 257 |
-
data_payload:
|
| 258 |
json_payload: Optional[Any] = None
|
| 259 |
|
| 260 |
# DevTools often uses --data-raw for JSON bodies
|
|
@@ -358,7 +357,7 @@ class CurlParser:
|
|
| 358 |
follow_redirects=True, # Scrapling default is True
|
| 359 |
)
|
| 360 |
|
| 361 |
-
def convert2fetcher(self, curl_command:
|
| 362 |
if isinstance(curl_command, (Request, str)):
|
| 363 |
request = (
|
| 364 |
self.parse(curl_command)
|
|
|
|
| 33 |
Dict,
|
| 34 |
Tuple,
|
| 35 |
Any,
|
|
|
|
| 36 |
extraction_types,
|
| 37 |
Generator,
|
| 38 |
)
|
|
|
|
| 253 |
|
| 254 |
# --- Process Data Payload ---
|
| 255 |
params = dict()
|
| 256 |
+
data_payload: Optional[str | bytes | Dict] = None
|
| 257 |
json_payload: Optional[Any] = None
|
| 258 |
|
| 259 |
# DevTools often uses --data-raw for JSON bodies
|
|
|
|
| 357 |
follow_redirects=True, # Scrapling default is True
|
| 358 |
)
|
| 359 |
|
| 360 |
+
def convert2fetcher(self, curl_command: Request | str) -> Optional[Response]:
|
| 361 |
if isinstance(curl_command, (Request, str)):
|
| 362 |
request = (
|
| 363 |
self.parse(curl_command)
|
scrapling/core/storage.py
CHANGED
|
@@ -1,20 +1,20 @@
|
|
| 1 |
-
from sqlite3 import connect as db_connect
|
| 2 |
-
from threading import RLock
|
| 3 |
-
from abc import ABC, abstractmethod
|
| 4 |
from hashlib import sha256
|
|
|
|
| 5 |
from functools import lru_cache
|
|
|
|
|
|
|
| 6 |
|
| 7 |
-
from lxml.html import HtmlElement
|
| 8 |
from orjson import dumps, loads
|
|
|
|
| 9 |
from tldextract import extract as tld
|
| 10 |
|
| 11 |
from scrapling.core.utils import _StorageTools, log
|
| 12 |
-
from scrapling.core._types import Dict, Optional,
|
| 13 |
|
| 14 |
|
| 15 |
class StorageSystemMixin(ABC):
|
| 16 |
# If you want to make your own storage system, you have to inherit from this
|
| 17 |
-
def __init__(self, url:
|
| 18 |
"""
|
| 19 |
:param url: URL of the website we are working on to separate it from other websites data
|
| 20 |
"""
|
|
@@ -74,7 +74,7 @@ class SQLiteStorageSystem(StorageSystemMixin):
|
|
| 74 |
Mainly built, so the library can run in threaded frameworks like scrapy or threaded tools
|
| 75 |
> It's optimized for threaded applications, but running it without threads shouldn't make it slow."""
|
| 76 |
|
| 77 |
-
def __init__(self, storage_file: str, url:
|
| 78 |
"""
|
| 79 |
:param storage_file: File to be used to store elements' data.
|
| 80 |
:param url: URL of the website we are working on to separate it from other websites data
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
from hashlib import sha256
|
| 2 |
+
from threading import RLock
|
| 3 |
from functools import lru_cache
|
| 4 |
+
from abc import ABC, abstractmethod
|
| 5 |
+
from sqlite3 import connect as db_connect
|
| 6 |
|
|
|
|
| 7 |
from orjson import dumps, loads
|
| 8 |
+
from lxml.html import HtmlElement
|
| 9 |
from tldextract import extract as tld
|
| 10 |
|
| 11 |
from scrapling.core.utils import _StorageTools, log
|
| 12 |
+
from scrapling.core._types import Dict, Optional, Any
|
| 13 |
|
| 14 |
|
| 15 |
class StorageSystemMixin(ABC):
|
| 16 |
# If you want to make your own storage system, you have to inherit from this
|
| 17 |
+
def __init__(self, url: Optional[str] = None):
|
| 18 |
"""
|
| 19 |
:param url: URL of the website we are working on to separate it from other websites data
|
| 20 |
"""
|
|
|
|
| 74 |
Mainly built, so the library can run in threaded frameworks like scrapy or threaded tools
|
| 75 |
> It's optimized for threaded applications, but running it without threads shouldn't make it slow."""
|
| 76 |
|
| 77 |
+
def __init__(self, storage_file: str, url: Optional[str] = None):
|
| 78 |
"""
|
| 79 |
:param storage_file: File to be used to store elements' data.
|
| 80 |
:param url: URL of the website we are working on to separate it from other websites data
|
scrapling/engines/_browsers/_camoufox.py
CHANGED
|
@@ -26,10 +26,9 @@ from ._page import PageInfo, PagePool
|
|
| 26 |
from ._validators import validate, CamoufoxConfig
|
| 27 |
from scrapling.core._types import (
|
| 28 |
Dict,
|
|
|
|
| 29 |
Optional,
|
| 30 |
-
Union,
|
| 31 |
Callable,
|
| 32 |
-
List,
|
| 33 |
SelectorWaitStates,
|
| 34 |
)
|
| 35 |
from scrapling.engines.toolbelt import (
|
|
@@ -84,16 +83,16 @@ class StealthySession:
|
|
| 84 |
def __init__(
|
| 85 |
self,
|
| 86 |
max_pages: int = 1,
|
| 87 |
-
headless:
|
| 88 |
block_images: bool = False,
|
| 89 |
disable_resources: bool = False,
|
| 90 |
block_webrtc: bool = False,
|
| 91 |
allow_webgl: bool = True,
|
| 92 |
network_idle: bool = False,
|
| 93 |
-
humanize:
|
| 94 |
solve_cloudflare: bool = False,
|
| 95 |
-
wait:
|
| 96 |
-
timeout:
|
| 97 |
page_action: Optional[Callable] = None,
|
| 98 |
wait_selector: Optional[str] = None,
|
| 99 |
addons: Optional[List[str]] = None,
|
|
@@ -101,7 +100,7 @@ class StealthySession:
|
|
| 101 |
cookies: Optional[List[Dict]] = None,
|
| 102 |
google_search: bool = True,
|
| 103 |
extra_headers: Optional[Dict[str, str]] = None,
|
| 104 |
-
proxy: Optional[
|
| 105 |
os_randomize: bool = False,
|
| 106 |
disable_ads: bool = False,
|
| 107 |
geoip: bool = False,
|
|
@@ -461,16 +460,16 @@ class AsyncStealthySession(StealthySession):
|
|
| 461 |
def __init__(
|
| 462 |
self,
|
| 463 |
max_pages: int = 1,
|
| 464 |
-
headless:
|
| 465 |
block_images: bool = False,
|
| 466 |
disable_resources: bool = False,
|
| 467 |
block_webrtc: bool = False,
|
| 468 |
allow_webgl: bool = True,
|
| 469 |
network_idle: bool = False,
|
| 470 |
-
humanize:
|
| 471 |
solve_cloudflare: bool = False,
|
| 472 |
-
wait:
|
| 473 |
-
timeout:
|
| 474 |
page_action: Optional[Callable] = None,
|
| 475 |
wait_selector: Optional[str] = None,
|
| 476 |
addons: Optional[List[str]] = None,
|
|
@@ -478,7 +477,7 @@ class AsyncStealthySession(StealthySession):
|
|
| 478 |
cookies: Optional[List[Dict]] = None,
|
| 479 |
google_search: bool = True,
|
| 480 |
extra_headers: Optional[Dict[str, str]] = None,
|
| 481 |
-
proxy: Optional[
|
| 482 |
os_randomize: bool = False,
|
| 483 |
disable_ads: bool = False,
|
| 484 |
geoip: bool = False,
|
|
|
|
| 26 |
from ._validators import validate, CamoufoxConfig
|
| 27 |
from scrapling.core._types import (
|
| 28 |
Dict,
|
| 29 |
+
List,
|
| 30 |
Optional,
|
|
|
|
| 31 |
Callable,
|
|
|
|
| 32 |
SelectorWaitStates,
|
| 33 |
)
|
| 34 |
from scrapling.engines.toolbelt import (
|
|
|
|
| 83 |
def __init__(
|
| 84 |
self,
|
| 85 |
max_pages: int = 1,
|
| 86 |
+
headless: bool = True, # noqa: F821
|
| 87 |
block_images: bool = False,
|
| 88 |
disable_resources: bool = False,
|
| 89 |
block_webrtc: bool = False,
|
| 90 |
allow_webgl: bool = True,
|
| 91 |
network_idle: bool = False,
|
| 92 |
+
humanize: bool | float = True,
|
| 93 |
solve_cloudflare: bool = False,
|
| 94 |
+
wait: int | float = 0,
|
| 95 |
+
timeout: int | float = 30000,
|
| 96 |
page_action: Optional[Callable] = None,
|
| 97 |
wait_selector: Optional[str] = None,
|
| 98 |
addons: Optional[List[str]] = None,
|
|
|
|
| 100 |
cookies: Optional[List[Dict]] = None,
|
| 101 |
google_search: bool = True,
|
| 102 |
extra_headers: Optional[Dict[str, str]] = None,
|
| 103 |
+
proxy: Optional[str | Dict[str, str]] = None,
|
| 104 |
os_randomize: bool = False,
|
| 105 |
disable_ads: bool = False,
|
| 106 |
geoip: bool = False,
|
|
|
|
| 460 |
def __init__(
|
| 461 |
self,
|
| 462 |
max_pages: int = 1,
|
| 463 |
+
headless: bool = True, # noqa: F821
|
| 464 |
block_images: bool = False,
|
| 465 |
disable_resources: bool = False,
|
| 466 |
block_webrtc: bool = False,
|
| 467 |
allow_webgl: bool = True,
|
| 468 |
network_idle: bool = False,
|
| 469 |
+
humanize: bool | float = True,
|
| 470 |
solve_cloudflare: bool = False,
|
| 471 |
+
wait: int | float = 0,
|
| 472 |
+
timeout: int | float = 30000,
|
| 473 |
page_action: Optional[Callable] = None,
|
| 474 |
wait_selector: Optional[str] = None,
|
| 475 |
addons: Optional[List[str]] = None,
|
|
|
|
| 477 |
cookies: Optional[List[Dict]] = None,
|
| 478 |
google_search: bool = True,
|
| 479 |
extra_headers: Optional[Dict[str, str]] = None,
|
| 480 |
+
proxy: Optional[str | Dict[str, str]] = None,
|
| 481 |
os_randomize: bool = False,
|
| 482 |
disable_ads: bool = False,
|
| 483 |
geoip: bool = False,
|
scrapling/engines/_browsers/_controllers.py
CHANGED
|
@@ -28,9 +28,8 @@ from ._validators import validate, PlaywrightConfig
|
|
| 28 |
from ._config_tools import _compiled_stealth_scripts, _launch_kwargs, _context_kwargs
|
| 29 |
from scrapling.core._types import (
|
| 30 |
Dict,
|
| 31 |
-
Optional,
|
| 32 |
-
Union,
|
| 33 |
List,
|
|
|
|
| 34 |
Callable,
|
| 35 |
SelectorWaitStates,
|
| 36 |
)
|
|
@@ -87,14 +86,14 @@ class DynamicSession:
|
|
| 87 |
disable_webgl: bool = False,
|
| 88 |
real_chrome: bool = False,
|
| 89 |
stealth: bool = False,
|
| 90 |
-
wait:
|
| 91 |
page_action: Optional[Callable] = None,
|
| 92 |
-
proxy: Optional[
|
| 93 |
locale: str = "en-US",
|
| 94 |
extra_headers: Optional[Dict[str, str]] = None,
|
| 95 |
useragent: Optional[str] = None,
|
| 96 |
cdp_url: Optional[str] = None,
|
| 97 |
-
timeout:
|
| 98 |
disable_resources: bool = False,
|
| 99 |
wait_selector: Optional[str] = None,
|
| 100 |
cookies: Optional[List[Dict]] = None,
|
|
@@ -404,14 +403,14 @@ class AsyncDynamicSession(DynamicSession):
|
|
| 404 |
disable_webgl: bool = False,
|
| 405 |
real_chrome: bool = False,
|
| 406 |
stealth: bool = False,
|
| 407 |
-
wait: Union[int, float] = 0,
|
| 408 |
page_action: Optional[Callable] = None,
|
| 409 |
-
proxy: Optional[Union[str, Dict[str, str]]] = None,
|
| 410 |
locale: str = "en-US",
|
| 411 |
extra_headers: Optional[Dict[str, str]] = None,
|
| 412 |
useragent: Optional[str] = None,
|
| 413 |
cdp_url: Optional[str] = None,
|
| 414 |
-
timeout: Union[int, float] = 30000,
|
| 415 |
disable_resources: bool = False,
|
| 416 |
wait_selector: Optional[str] = None,
|
| 417 |
cookies: Optional[List[Dict]] = None,
|
|
|
|
| 28 |
from ._config_tools import _compiled_stealth_scripts, _launch_kwargs, _context_kwargs
|
| 29 |
from scrapling.core._types import (
|
| 30 |
Dict,
|
|
|
|
|
|
|
| 31 |
List,
|
| 32 |
+
Optional,
|
| 33 |
Callable,
|
| 34 |
SelectorWaitStates,
|
| 35 |
)
|
|
|
|
| 86 |
disable_webgl: bool = False,
|
| 87 |
real_chrome: bool = False,
|
| 88 |
stealth: bool = False,
|
| 89 |
+
wait: int | float = 0,
|
| 90 |
page_action: Optional[Callable] = None,
|
| 91 |
+
proxy: Optional[str | Dict[str, str]] = None,
|
| 92 |
locale: str = "en-US",
|
| 93 |
extra_headers: Optional[Dict[str, str]] = None,
|
| 94 |
useragent: Optional[str] = None,
|
| 95 |
cdp_url: Optional[str] = None,
|
| 96 |
+
timeout: int | float = 30000,
|
| 97 |
disable_resources: bool = False,
|
| 98 |
wait_selector: Optional[str] = None,
|
| 99 |
cookies: Optional[List[Dict]] = None,
|
|
|
|
| 403 |
disable_webgl: bool = False,
|
| 404 |
real_chrome: bool = False,
|
| 405 |
stealth: bool = False,
|
| 406 |
+
wait: int | float = 0,
|
| 407 |
page_action: Optional[Callable] = None,
|
| 408 |
+
proxy: Optional[str | Dict[str, str]] = None,
|
| 409 |
locale: str = "en-US",
|
| 410 |
extra_headers: Optional[Dict[str, str]] = None,
|
| 411 |
useragent: Optional[str] = None,
|
| 412 |
cdp_url: Optional[str] = None,
|
| 413 |
+
timeout: int | float = 30000,
|
| 414 |
disable_resources: bool = False,
|
| 415 |
wait_selector: Optional[str] = None,
|
| 416 |
cookies: Optional[List[Dict]] = None,
|
scrapling/engines/_browsers/_page.py
CHANGED
|
@@ -4,7 +4,7 @@ from dataclasses import dataclass
|
|
| 4 |
from playwright.sync_api import Page as SyncPage
|
| 5 |
from playwright.async_api import Page as AsyncPage
|
| 6 |
|
| 7 |
-
from scrapling.core._types import Optional, Union, List, Literal
|
| 8 |
|
| 9 |
PageState = Literal["ready", "busy", "error"] # States that a page can be in
|
| 10 |
|
|
@@ -14,7 +14,7 @@ class PageInfo:
|
|
| 14 |
"""Information about the page and its current state"""
|
| 15 |
|
| 16 |
__slots__ = ("page", "state", "url")
|
| 17 |
-
page: Union[SyncPage, AsyncPage]
|
| 18 |
state: PageState
|
| 19 |
url: Optional[str]
|
| 20 |
|
|
@@ -52,7 +52,7 @@ class PagePool:
|
|
| 52 |
self.pages: List[PageInfo] = []
|
| 53 |
self._lock = RLock()
|
| 54 |
|
| 55 |
-
def add_page(self, page: Union[SyncPage, AsyncPage]) -> PageInfo:
|
| 56 |
"""Add a new page to the pool"""
|
| 57 |
with self._lock:
|
| 58 |
if len(self.pages) >= self.max_pages:
|
|
|
|
| 4 |
from playwright.sync_api import Page as SyncPage
|
| 5 |
from playwright.async_api import Page as AsyncPage
|
| 6 |
|
| 7 |
+
from scrapling.core._types import Optional, List, Literal
|
| 8 |
|
| 9 |
PageState = Literal["ready", "busy", "error"] # States that a page can be in
|
| 10 |
|
|
|
|
| 14 |
"""Information about the page and its current state"""
|
| 15 |
|
| 16 |
__slots__ = ("page", "state", "url")
|
| 17 |
+
page: SyncPage | AsyncPage
|
| 18 |
state: PageState
|
| 19 |
url: Optional[str]
|
| 20 |
|
|
|
|
| 52 |
self.pages: List[PageInfo] = []
|
| 53 |
self._lock = RLock()
|
| 54 |
|
| 55 |
+
def add_page(self, page: SyncPage | AsyncPage) -> PageInfo:
|
| 56 |
"""Add a new page to the pool"""
|
| 57 |
with self._lock:
|
| 58 |
if len(self.pages) >= self.max_pages:
|
scrapling/engines/_browsers/_validators.py
CHANGED
|
@@ -4,7 +4,6 @@ from pathlib import Path
|
|
| 4 |
|
| 5 |
from scrapling.core._types import (
|
| 6 |
Optional,
|
| 7 |
-
Union,
|
| 8 |
Dict,
|
| 9 |
Callable,
|
| 10 |
List,
|
|
@@ -24,15 +23,15 @@ class PlaywrightConfig(Struct, kw_only=True, frozen=False):
|
|
| 24 |
disable_webgl: bool = False
|
| 25 |
real_chrome: bool = False
|
| 26 |
stealth: bool = False
|
| 27 |
-
wait: Union[int, float] = 0
|
| 28 |
page_action: Optional[Callable] = None
|
| 29 |
-
proxy: Optional[Union[str, Dict[str, str]]] = (
|
| 30 |
None # The default value for proxy in Playwright's source is `None`
|
| 31 |
)
|
| 32 |
locale: str = "en-US"
|
| 33 |
extra_headers: Optional[Dict[str, str]] = None
|
| 34 |
useragent: Optional[str] = None
|
| 35 |
-
timeout: Union[int, float] = 30000
|
| 36 |
disable_resources: bool = False
|
| 37 |
wait_selector: Optional[str] = None
|
| 38 |
cookies: Optional[List[Dict]] = None
|
|
@@ -87,10 +86,10 @@ class CamoufoxConfig(Struct, kw_only=True, frozen=False):
|
|
| 87 |
block_webrtc: bool = False
|
| 88 |
allow_webgl: bool = True
|
| 89 |
network_idle: bool = False
|
| 90 |
-
humanize: Union[bool, float] = True
|
| 91 |
solve_cloudflare: bool = False
|
| 92 |
-
wait: Union[int, float] = 0
|
| 93 |
-
timeout: Union[int, float] = 30000
|
| 94 |
page_action: Optional[Callable] = None
|
| 95 |
wait_selector: Optional[str] = None
|
| 96 |
addons: Optional[List[str]] = None
|
|
@@ -98,7 +97,7 @@ class CamoufoxConfig(Struct, kw_only=True, frozen=False):
|
|
| 98 |
cookies: Optional[List[Dict]] = None
|
| 99 |
google_search: bool = True
|
| 100 |
extra_headers: Optional[Dict[str, str]] = None
|
| 101 |
-
proxy: Optional[Union[str, Dict[str, str]]] = (
|
| 102 |
None # The default value for proxy in Playwright's source is `None`
|
| 103 |
)
|
| 104 |
os_randomize: bool = False
|
|
|
|
| 4 |
|
| 5 |
from scrapling.core._types import (
|
| 6 |
Optional,
|
|
|
|
| 7 |
Dict,
|
| 8 |
Callable,
|
| 9 |
List,
|
|
|
|
| 23 |
disable_webgl: bool = False
|
| 24 |
real_chrome: bool = False
|
| 25 |
stealth: bool = False
|
| 26 |
+
wait: int | float = 0
|
| 27 |
page_action: Optional[Callable] = None
|
| 28 |
+
proxy: Optional[str | Dict[str, str]] = (
|
| 29 |
None # The default value for proxy in Playwright's source is `None`
|
| 30 |
)
|
| 31 |
locale: str = "en-US"
|
| 32 |
extra_headers: Optional[Dict[str, str]] = None
|
| 33 |
useragent: Optional[str] = None
|
| 34 |
+
timeout: int | float = 30000
|
| 35 |
disable_resources: bool = False
|
| 36 |
wait_selector: Optional[str] = None
|
| 37 |
cookies: Optional[List[Dict]] = None
|
|
|
|
| 86 |
block_webrtc: bool = False
|
| 87 |
allow_webgl: bool = True
|
| 88 |
network_idle: bool = False
|
| 89 |
+
humanize: bool | float = True
|
| 90 |
solve_cloudflare: bool = False
|
| 91 |
+
wait: int | float = 0
|
| 92 |
+
timeout: int | float = 30000
|
| 93 |
page_action: Optional[Callable] = None
|
| 94 |
wait_selector: Optional[str] = None
|
| 95 |
addons: Optional[List[str]] = None
|
|
|
|
| 97 |
cookies: Optional[List[Dict]] = None
|
| 98 |
google_search: bool = True
|
| 99 |
extra_headers: Optional[Dict[str, str]] = None
|
| 100 |
+
proxy: Optional[str | Dict[str, str]] = (
|
| 101 |
None # The default value for proxy in Playwright's source is `None`
|
| 102 |
)
|
| 103 |
os_randomize: bool = False
|
scrapling/engines/static.py
CHANGED
|
@@ -17,7 +17,6 @@ from scrapling.core._types import (
|
|
| 17 |
Dict,
|
| 18 |
Optional,
|
| 19 |
Tuple,
|
| 20 |
-
Union,
|
| 21 |
Mapping,
|
| 22 |
SUPPORTED_HTTP_METHODS,
|
| 23 |
Awaitable,
|
|
@@ -55,14 +54,14 @@ class FetcherSession:
|
|
| 55 |
proxies: Optional[Dict[str, str]] = None,
|
| 56 |
proxy: Optional[str] = None,
|
| 57 |
proxy_auth: Optional[Tuple[str, str]] = None,
|
| 58 |
-
timeout: Optional[Union[int, float]] = 30,
|
| 59 |
headers: Optional[Dict[str, str]] = None,
|
| 60 |
retries: Optional[int] = 3,
|
| 61 |
retry_delay: Optional[int] = 1,
|
| 62 |
follow_redirects: bool = True,
|
| 63 |
max_redirects: int = 30,
|
| 64 |
verify: bool = True,
|
| 65 |
-
cert: Optional[Union[str, Tuple[str, str]]] = None,
|
| 66 |
selector_config: Optional[Dict] = None,
|
| 67 |
):
|
| 68 |
"""
|
|
@@ -357,7 +356,7 @@ class FetcherSession:
|
|
| 357 |
method: SUPPORTED_HTTP_METHODS,
|
| 358 |
stealth: Optional[bool] = None,
|
| 359 |
**kwargs,
|
| 360 |
-
) -> Union[Response, Awaitable[Response]]:
|
| 361 |
"""
|
| 362 |
Internal dispatcher. Prepares arguments and calls sync or async request helper.
|
| 363 |
|
|
@@ -390,10 +389,10 @@ class FetcherSession:
|
|
| 390 |
def get(
|
| 391 |
self,
|
| 392 |
url: str,
|
| 393 |
-
params: Optional[Union[Dict, List, Tuple]] = None,
|
| 394 |
headers: Optional[Mapping[str, Optional[str]]] = _UNSET,
|
| 395 |
cookies: Optional[CookieTypes] = None,
|
| 396 |
-
timeout: Optional[Union[int, float]] = _UNSET,
|
| 397 |
follow_redirects: Optional[bool] = _UNSET,
|
| 398 |
max_redirects: Optional[int] = _UNSET,
|
| 399 |
retries: Optional[int] = _UNSET,
|
|
@@ -403,12 +402,12 @@ class FetcherSession:
|
|
| 403 |
proxy_auth: Optional[Tuple[str, str]] = _UNSET,
|
| 404 |
auth: Optional[Tuple[str, str]] = None,
|
| 405 |
verify: Optional[bool] = _UNSET,
|
| 406 |
-
cert: Optional[Union[str, Tuple[str, str]]] = _UNSET,
|
| 407 |
impersonate: Optional[BrowserTypeLiteral] = _UNSET,
|
| 408 |
http3: Optional[bool] = _UNSET,
|
| 409 |
stealthy_headers: Optional[bool] = _UNSET,
|
| 410 |
**kwargs,
|
| 411 |
-
) -> Union[Response, Awaitable[Response]]:
|
| 412 |
"""
|
| 413 |
Perform a GET request.
|
| 414 |
|
|
@@ -461,12 +460,12 @@ class FetcherSession:
|
|
| 461 |
def post(
|
| 462 |
self,
|
| 463 |
url: str,
|
| 464 |
-
data: Optional[Union[Dict, str]] = None,
|
| 465 |
-
json: Optional[Union[Dict, List]] = None,
|
| 466 |
headers: Optional[Mapping[str, Optional[str]]] = _UNSET,
|
| 467 |
-
params: Optional[Union[Dict, List, Tuple]] = None,
|
| 468 |
cookies: Optional[CookieTypes] = None,
|
| 469 |
-
timeout: Optional[Union[int, float]] = _UNSET,
|
| 470 |
follow_redirects: Optional[bool] = _UNSET,
|
| 471 |
max_redirects: Optional[int] = _UNSET,
|
| 472 |
retries: Optional[int] = _UNSET,
|
|
@@ -476,12 +475,12 @@ class FetcherSession:
|
|
| 476 |
proxy_auth: Optional[Tuple[str, str]] = _UNSET,
|
| 477 |
auth: Optional[Tuple[str, str]] = None,
|
| 478 |
verify: Optional[bool] = _UNSET,
|
| 479 |
-
cert: Optional[Union[str, Tuple[str, str]]] = _UNSET,
|
| 480 |
impersonate: Optional[BrowserTypeLiteral] = _UNSET,
|
| 481 |
http3: Optional[bool] = _UNSET,
|
| 482 |
stealthy_headers: Optional[bool] = _UNSET,
|
| 483 |
**kwargs,
|
| 484 |
-
) -> Union[Response, Awaitable[Response]]:
|
| 485 |
"""
|
| 486 |
Perform a POST request.
|
| 487 |
|
|
@@ -538,12 +537,12 @@ class FetcherSession:
|
|
| 538 |
def put(
|
| 539 |
self,
|
| 540 |
url: str,
|
| 541 |
-
data: Optional[Union[Dict, str]] = None,
|
| 542 |
-
json: Optional[Union[Dict, List]] = None,
|
| 543 |
headers: Optional[Mapping[str, Optional[str]]] = _UNSET,
|
| 544 |
-
params: Optional[Union[Dict, List, Tuple]] = None,
|
| 545 |
cookies: Optional[CookieTypes] = None,
|
| 546 |
-
timeout: Optional[Union[int, float]] = _UNSET,
|
| 547 |
follow_redirects: Optional[bool] = _UNSET,
|
| 548 |
max_redirects: Optional[int] = _UNSET,
|
| 549 |
retries: Optional[int] = _UNSET,
|
|
@@ -553,12 +552,12 @@ class FetcherSession:
|
|
| 553 |
proxy_auth: Optional[Tuple[str, str]] = _UNSET,
|
| 554 |
auth: Optional[Tuple[str, str]] = None,
|
| 555 |
verify: Optional[bool] = _UNSET,
|
| 556 |
-
cert: Optional[Union[str, Tuple[str, str]]] = _UNSET,
|
| 557 |
impersonate: Optional[BrowserTypeLiteral] = _UNSET,
|
| 558 |
http3: Optional[bool] = _UNSET,
|
| 559 |
stealthy_headers: Optional[bool] = _UNSET,
|
| 560 |
**kwargs,
|
| 561 |
-
) -> Union[Response, Awaitable[Response]]:
|
| 562 |
"""
|
| 563 |
Perform a PUT request.
|
| 564 |
|
|
@@ -615,12 +614,12 @@ class FetcherSession:
|
|
| 615 |
def delete(
|
| 616 |
self,
|
| 617 |
url: str,
|
| 618 |
-
data: Optional[Union[Dict, str]] = None,
|
| 619 |
-
json: Optional[Union[Dict, List]] = None,
|
| 620 |
headers: Optional[Mapping[str, Optional[str]]] = _UNSET,
|
| 621 |
-
params: Optional[Union[Dict, List, Tuple]] = None,
|
| 622 |
cookies: Optional[CookieTypes] = None,
|
| 623 |
-
timeout: Optional[Union[int, float]] = _UNSET,
|
| 624 |
follow_redirects: Optional[bool] = _UNSET,
|
| 625 |
max_redirects: Optional[int] = _UNSET,
|
| 626 |
retries: Optional[int] = _UNSET,
|
|
@@ -630,12 +629,12 @@ class FetcherSession:
|
|
| 630 |
proxy_auth: Optional[Tuple[str, str]] = _UNSET,
|
| 631 |
auth: Optional[Tuple[str, str]] = None,
|
| 632 |
verify: Optional[bool] = _UNSET,
|
| 633 |
-
cert: Optional[Union[str, Tuple[str, str]]] = _UNSET,
|
| 634 |
impersonate: Optional[BrowserTypeLiteral] = _UNSET,
|
| 635 |
http3: Optional[bool] = _UNSET,
|
| 636 |
stealthy_headers: Optional[bool] = _UNSET,
|
| 637 |
**kwargs,
|
| 638 |
-
) -> Union[Response, Awaitable[Response]]:
|
| 639 |
"""
|
| 640 |
Perform a DELETE request.
|
| 641 |
|
|
|
|
| 17 |
Dict,
|
| 18 |
Optional,
|
| 19 |
Tuple,
|
|
|
|
| 20 |
Mapping,
|
| 21 |
SUPPORTED_HTTP_METHODS,
|
| 22 |
Awaitable,
|
|
|
|
| 54 |
proxies: Optional[Dict[str, str]] = None,
|
| 55 |
proxy: Optional[str] = None,
|
| 56 |
proxy_auth: Optional[Tuple[str, str]] = None,
|
| 57 |
+
timeout: Optional[int | float] = 30,
|
| 58 |
headers: Optional[Dict[str, str]] = None,
|
| 59 |
retries: Optional[int] = 3,
|
| 60 |
retry_delay: Optional[int] = 1,
|
| 61 |
follow_redirects: bool = True,
|
| 62 |
max_redirects: int = 30,
|
| 63 |
verify: bool = True,
|
| 64 |
+
cert: Optional[str | Tuple[str, str]] = None,
|
| 65 |
selector_config: Optional[Dict] = None,
|
| 66 |
):
|
| 67 |
"""
|
|
|
|
| 356 |
method: SUPPORTED_HTTP_METHODS,
|
| 357 |
stealth: Optional[bool] = None,
|
| 358 |
**kwargs,
|
| 359 |
+
) -> Response | Awaitable[Response]:
|
| 360 |
"""
|
| 361 |
Internal dispatcher. Prepares arguments and calls sync or async request helper.
|
| 362 |
|
|
|
|
| 389 |
def get(
|
| 390 |
self,
|
| 391 |
url: str,
|
| 392 |
+
params: Optional[Dict | List | Tuple] = None,
|
| 393 |
headers: Optional[Mapping[str, Optional[str]]] = _UNSET,
|
| 394 |
cookies: Optional[CookieTypes] = None,
|
| 395 |
+
timeout: Optional[int | float] = _UNSET,
|
| 396 |
follow_redirects: Optional[bool] = _UNSET,
|
| 397 |
max_redirects: Optional[int] = _UNSET,
|
| 398 |
retries: Optional[int] = _UNSET,
|
|
|
|
| 402 |
proxy_auth: Optional[Tuple[str, str]] = _UNSET,
|
| 403 |
auth: Optional[Tuple[str, str]] = None,
|
| 404 |
verify: Optional[bool] = _UNSET,
|
| 405 |
+
cert: Optional[str | Tuple[str, str]] = _UNSET,
|
| 406 |
impersonate: Optional[BrowserTypeLiteral] = _UNSET,
|
| 407 |
http3: Optional[bool] = _UNSET,
|
| 408 |
stealthy_headers: Optional[bool] = _UNSET,
|
| 409 |
**kwargs,
|
| 410 |
+
) -> Response | Awaitable[Response]:
|
| 411 |
"""
|
| 412 |
Perform a GET request.
|
| 413 |
|
|
|
|
| 460 |
def post(
|
| 461 |
self,
|
| 462 |
url: str,
|
| 463 |
+
data: Optional[Dict | str] = None,
|
| 464 |
+
json: Optional[Dict | List] = None,
|
| 465 |
headers: Optional[Mapping[str, Optional[str]]] = _UNSET,
|
| 466 |
+
params: Optional[Dict | List | Tuple] = None,
|
| 467 |
cookies: Optional[CookieTypes] = None,
|
| 468 |
+
timeout: Optional[int | float] = _UNSET,
|
| 469 |
follow_redirects: Optional[bool] = _UNSET,
|
| 470 |
max_redirects: Optional[int] = _UNSET,
|
| 471 |
retries: Optional[int] = _UNSET,
|
|
|
|
| 475 |
proxy_auth: Optional[Tuple[str, str]] = _UNSET,
|
| 476 |
auth: Optional[Tuple[str, str]] = None,
|
| 477 |
verify: Optional[bool] = _UNSET,
|
| 478 |
+
cert: Optional[str | Tuple[str, str]] = _UNSET,
|
| 479 |
impersonate: Optional[BrowserTypeLiteral] = _UNSET,
|
| 480 |
http3: Optional[bool] = _UNSET,
|
| 481 |
stealthy_headers: Optional[bool] = _UNSET,
|
| 482 |
**kwargs,
|
| 483 |
+
) -> Response | Awaitable[Response]:
|
| 484 |
"""
|
| 485 |
Perform a POST request.
|
| 486 |
|
|
|
|
| 537 |
def put(
|
| 538 |
self,
|
| 539 |
url: str,
|
| 540 |
+
data: Optional[Dict | str] = None,
|
| 541 |
+
json: Optional[Dict | List] = None,
|
| 542 |
headers: Optional[Mapping[str, Optional[str]]] = _UNSET,
|
| 543 |
+
params: Optional[Dict | List | Tuple] = None,
|
| 544 |
cookies: Optional[CookieTypes] = None,
|
| 545 |
+
timeout: Optional[int | float] = _UNSET,
|
| 546 |
follow_redirects: Optional[bool] = _UNSET,
|
| 547 |
max_redirects: Optional[int] = _UNSET,
|
| 548 |
retries: Optional[int] = _UNSET,
|
|
|
|
| 552 |
proxy_auth: Optional[Tuple[str, str]] = _UNSET,
|
| 553 |
auth: Optional[Tuple[str, str]] = None,
|
| 554 |
verify: Optional[bool] = _UNSET,
|
| 555 |
+
cert: Optional[str | Tuple[str, str]] = _UNSET,
|
| 556 |
impersonate: Optional[BrowserTypeLiteral] = _UNSET,
|
| 557 |
http3: Optional[bool] = _UNSET,
|
| 558 |
stealthy_headers: Optional[bool] = _UNSET,
|
| 559 |
**kwargs,
|
| 560 |
+
) -> Response | Awaitable[Response]:
|
| 561 |
"""
|
| 562 |
Perform a PUT request.
|
| 563 |
|
|
|
|
| 614 |
def delete(
|
| 615 |
self,
|
| 616 |
url: str,
|
| 617 |
+
data: Optional[Dict | str] = None,
|
| 618 |
+
json: Optional[Dict | List] = None,
|
| 619 |
headers: Optional[Mapping[str, Optional[str]]] = _UNSET,
|
| 620 |
+
params: Optional[Dict | List | Tuple] = None,
|
| 621 |
cookies: Optional[CookieTypes] = None,
|
| 622 |
+
timeout: Optional[int | float] = _UNSET,
|
| 623 |
follow_redirects: Optional[bool] = _UNSET,
|
| 624 |
max_redirects: Optional[int] = _UNSET,
|
| 625 |
retries: Optional[int] = _UNSET,
|
|
|
|
| 629 |
proxy_auth: Optional[Tuple[str, str]] = _UNSET,
|
| 630 |
auth: Optional[Tuple[str, str]] = None,
|
| 631 |
verify: Optional[bool] = _UNSET,
|
| 632 |
+
cert: Optional[str | Tuple[str, str]] = _UNSET,
|
| 633 |
impersonate: Optional[BrowserTypeLiteral] = _UNSET,
|
| 634 |
http3: Optional[bool] = _UNSET,
|
| 635 |
stealthy_headers: Optional[bool] = _UNSET,
|
| 636 |
**kwargs,
|
| 637 |
+
) -> Response | Awaitable[Response]:
|
| 638 |
"""
|
| 639 |
Perform a DELETE request.
|
| 640 |
|
scrapling/engines/toolbelt/__init__.py
CHANGED
|
@@ -2,7 +2,6 @@ from .custom import (
|
|
| 2 |
BaseFetcher,
|
| 3 |
Response,
|
| 4 |
StatusText,
|
| 5 |
-
check_type_validity,
|
| 6 |
get_variable_name,
|
| 7 |
)
|
| 8 |
from .fingerprints import (
|
|
|
|
| 2 |
BaseFetcher,
|
| 3 |
Response,
|
| 4 |
StatusText,
|
|
|
|
| 5 |
get_variable_name,
|
| 6 |
)
|
| 7 |
from .fingerprints import (
|
scrapling/engines/toolbelt/custom.py
CHANGED
|
@@ -10,8 +10,6 @@ from scrapling.core._types import (
|
|
| 10 |
List,
|
| 11 |
Optional,
|
| 12 |
Tuple,
|
| 13 |
-
Type,
|
| 14 |
-
Union,
|
| 15 |
)
|
| 16 |
from scrapling.core.custom_types import MappingProxyType
|
| 17 |
from scrapling.core.utils import log, lru_cache
|
|
@@ -106,7 +104,7 @@ class Response(Selector):
|
|
| 106 |
content: str | bytes,
|
| 107 |
status: int,
|
| 108 |
reason: str,
|
| 109 |
-
cookies: Union[Tuple[Dict[str, str], ...], Dict[str, str]],
|
| 110 |
headers: Dict,
|
| 111 |
request_headers: Dict,
|
| 112 |
encoding: str = "utf-8",
|
|
@@ -318,51 +316,3 @@ def get_variable_name(var: Any) -> Optional[str]:
|
|
| 318 |
if value is var:
|
| 319 |
return name
|
| 320 |
return None
|
| 321 |
-
|
| 322 |
-
|
| 323 |
-
def check_type_validity(
|
| 324 |
-
variable: Any,
|
| 325 |
-
valid_types: Union[List[Type], None],
|
| 326 |
-
default_value: Any = None,
|
| 327 |
-
critical: bool = False,
|
| 328 |
-
param_name: Optional[str] = None,
|
| 329 |
-
) -> Any:
|
| 330 |
-
"""Check if a variable matches the specified type constraints.
|
| 331 |
-
:param variable: The variable to check
|
| 332 |
-
:param valid_types: List of valid types for the variable
|
| 333 |
-
:param default_value: Value to return if type check fails
|
| 334 |
-
:param critical: If True, raises TypeError instead of logging error
|
| 335 |
-
:param param_name: Optional parameter name for error messages
|
| 336 |
-
:return: The original variable if valid, default_value if invalid
|
| 337 |
-
:raise TypeError: If critical=True and type check fails
|
| 338 |
-
"""
|
| 339 |
-
# Use provided param_name or try to get it automatically
|
| 340 |
-
var_name = param_name or get_variable_name(variable) or "Unknown"
|
| 341 |
-
|
| 342 |
-
# Convert valid_types to a list if None
|
| 343 |
-
valid_types = valid_types or []
|
| 344 |
-
|
| 345 |
-
# Handle None value
|
| 346 |
-
if variable is None:
|
| 347 |
-
if type(None) in valid_types:
|
| 348 |
-
return variable
|
| 349 |
-
error_msg = f'Argument "{var_name}" cannot be None'
|
| 350 |
-
if critical:
|
| 351 |
-
raise TypeError(error_msg)
|
| 352 |
-
log.error(f"[Ignored] {error_msg}")
|
| 353 |
-
return default_value
|
| 354 |
-
|
| 355 |
-
# If no valid_types specified and variable has a value, return it
|
| 356 |
-
if not valid_types:
|
| 357 |
-
return variable
|
| 358 |
-
|
| 359 |
-
# Check if variable type matches any of the valid types
|
| 360 |
-
if not any(isinstance(variable, t) for t in valid_types):
|
| 361 |
-
type_names = [t.__name__ for t in valid_types]
|
| 362 |
-
error_msg = f'Argument "{var_name}" must be of type {" or ".join(type_names)}'
|
| 363 |
-
if critical:
|
| 364 |
-
raise TypeError(error_msg)
|
| 365 |
-
log.error(f"[Ignored] {error_msg}")
|
| 366 |
-
return default_value
|
| 367 |
-
|
| 368 |
-
return variable
|
|
|
|
| 10 |
List,
|
| 11 |
Optional,
|
| 12 |
Tuple,
|
|
|
|
|
|
|
| 13 |
)
|
| 14 |
from scrapling.core.custom_types import MappingProxyType
|
| 15 |
from scrapling.core.utils import log, lru_cache
|
|
|
|
| 104 |
content: str | bytes,
|
| 105 |
status: int,
|
| 106 |
reason: str,
|
| 107 |
+
cookies: Tuple[Dict[str, str], ...] | Dict[str, str],
|
| 108 |
headers: Dict,
|
| 109 |
request_headers: Dict,
|
| 110 |
encoding: str = "utf-8",
|
|
|
|
| 316 |
if value is var:
|
| 317 |
return name
|
| 318 |
return None
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
scrapling/engines/toolbelt/fingerprints.py
CHANGED
|
@@ -7,7 +7,7 @@ from platform import system as platform_system
|
|
| 7 |
from tldextract import extract
|
| 8 |
from browserforge.headers import Browser, HeaderGenerator
|
| 9 |
|
| 10 |
-
from scrapling.core._types import Dict, Union
|
| 11 |
from scrapling.core.utils import lru_cache
|
| 12 |
|
| 13 |
__OS_NAME__ = platform_system()
|
|
@@ -28,7 +28,7 @@ def generate_convincing_referer(url: str) -> str:
|
|
| 28 |
|
| 29 |
|
| 30 |
@lru_cache(1, typed=True)
|
| 31 |
-
def get_os_name() -> Union[str, None]:
|
| 32 |
"""Get the current OS name in the same format needed for browserforge
|
| 33 |
|
| 34 |
:return: Current OS name or `None` otherwise
|
|
|
|
| 7 |
from tldextract import extract
|
| 8 |
from browserforge.headers import Browser, HeaderGenerator
|
| 9 |
|
| 10 |
+
from scrapling.core._types import Dict, Optional
|
| 11 |
from scrapling.core.utils import lru_cache
|
| 12 |
|
| 13 |
__OS_NAME__ = platform_system()
|
|
|
|
| 28 |
|
| 29 |
|
| 30 |
@lru_cache(1, typed=True)
|
| 31 |
+
def get_os_name() -> Optional[str]:
|
| 32 |
"""Get the current OS name in the same format needed for browserforge
|
| 33 |
|
| 34 |
:return: Current OS name or `None` otherwise
|
scrapling/engines/toolbelt/navigation.py
CHANGED
|
@@ -11,7 +11,7 @@ from msgspec import Struct, structs, convert, ValidationError
|
|
| 11 |
from playwright.sync_api import Route
|
| 12 |
|
| 13 |
from scrapling.core.utils import log
|
| 14 |
-
from scrapling.core._types import Dict, Optional, Union, Tuple
|
| 15 |
from scrapling.engines.constants import DEFAULT_DISABLED_RESOURCES
|
| 16 |
|
| 17 |
__BYPASSES_DIR__ = Path(__file__).parent / "bypasses"
|
|
@@ -54,8 +54,8 @@ async def async_intercept_route(route: async_Route):
|
|
| 54 |
|
| 55 |
|
| 56 |
def construct_proxy_dict(
|
| 57 |
-
proxy_string: Union[str, Dict[str, str]], as_tuple=False
|
| 58 |
-
) -> Union[Dict, Tuple, None]:
|
| 59 |
"""Validate a proxy and return it in the acceptable format for Playwright
|
| 60 |
Reference: https://playwright.dev/python/docs/network#http-proxy
|
| 61 |
|
|
|
|
| 11 |
from playwright.sync_api import Route
|
| 12 |
|
| 13 |
from scrapling.core.utils import log
|
| 14 |
+
from scrapling.core._types import Dict, Optional, Tuple
|
| 15 |
from scrapling.engines.constants import DEFAULT_DISABLED_RESOURCES
|
| 16 |
|
| 17 |
__BYPASSES_DIR__ = Path(__file__).parent / "bypasses"
|
|
|
|
| 54 |
|
| 55 |
|
| 56 |
def construct_proxy_dict(
|
| 57 |
+
proxy_string: str | Dict[str, str], as_tuple=False
|
| 58 |
+
) -> Optional[Dict | Tuple]:
|
| 59 |
"""Validate a proxy and return it in the acceptable format for Playwright
|
| 60 |
Reference: https://playwright.dev/python/docs/network#http-proxy
|
| 61 |
|
scrapling/fetchers.py
CHANGED
|
@@ -4,7 +4,6 @@ from scrapling.core._types import (
|
|
| 4 |
List,
|
| 5 |
Optional,
|
| 6 |
SelectorWaitStates,
|
| 7 |
-
Union,
|
| 8 |
Iterable,
|
| 9 |
)
|
| 10 |
from scrapling.engines import (
|
|
@@ -51,16 +50,16 @@ class StealthyFetcher(BaseFetcher):
|
|
| 51 |
def fetch(
|
| 52 |
cls,
|
| 53 |
url: str,
|
| 54 |
-
headless:
|
| 55 |
block_images: bool = False,
|
| 56 |
disable_resources: bool = False,
|
| 57 |
block_webrtc: bool = False,
|
| 58 |
allow_webgl: bool = True,
|
| 59 |
network_idle: bool = False,
|
| 60 |
-
humanize: Union[bool, float] = True,
|
| 61 |
solve_cloudflare: bool = False,
|
| 62 |
-
wait: Union[int, float] = 0,
|
| 63 |
-
timeout: Union[int, float] = 30000,
|
| 64 |
page_action: Optional[Callable] = None,
|
| 65 |
wait_selector: Optional[str] = None,
|
| 66 |
addons: Optional[List[str]] = None,
|
|
@@ -68,7 +67,7 @@ class StealthyFetcher(BaseFetcher):
|
|
| 68 |
cookies: Optional[List[Dict]] = None,
|
| 69 |
google_search: bool = True,
|
| 70 |
extra_headers: Optional[Dict[str, str]] = None,
|
| 71 |
-
proxy: Optional[Union[str, Dict[str, str]]] = None,
|
| 72 |
os_randomize: bool = False,
|
| 73 |
disable_ads: bool = False,
|
| 74 |
geoip: bool = False,
|
|
@@ -147,16 +146,16 @@ class StealthyFetcher(BaseFetcher):
|
|
| 147 |
async def async_fetch(
|
| 148 |
cls,
|
| 149 |
url: str,
|
| 150 |
-
headless:
|
| 151 |
block_images: bool = False,
|
| 152 |
disable_resources: bool = False,
|
| 153 |
block_webrtc: bool = False,
|
| 154 |
allow_webgl: bool = True,
|
| 155 |
network_idle: bool = False,
|
| 156 |
-
humanize: Union[bool, float] = True,
|
| 157 |
solve_cloudflare: bool = False,
|
| 158 |
-
wait: Union[int, float] = 0,
|
| 159 |
-
timeout: Union[int, float] = 30000,
|
| 160 |
page_action: Optional[Callable] = None,
|
| 161 |
wait_selector: Optional[str] = None,
|
| 162 |
addons: Optional[List[str]] = None,
|
|
@@ -164,7 +163,7 @@ class StealthyFetcher(BaseFetcher):
|
|
| 164 |
cookies: Optional[List[Dict]] = None,
|
| 165 |
google_search: bool = True,
|
| 166 |
extra_headers: Optional[Dict[str, str]] = None,
|
| 167 |
-
proxy: Optional[Union[str, Dict[str, str]]] = None,
|
| 168 |
os_randomize: bool = False,
|
| 169 |
disable_ads: bool = False,
|
| 170 |
geoip: bool = False,
|
|
@@ -267,14 +266,14 @@ class DynamicFetcher(BaseFetcher):
|
|
| 267 |
disable_webgl: bool = False,
|
| 268 |
real_chrome: bool = False,
|
| 269 |
stealth: bool = False,
|
| 270 |
-
wait: Union[int, float] = 0,
|
| 271 |
page_action: Optional[Callable] = None,
|
| 272 |
-
proxy: Optional[Union[str, Dict[str, str]]] = None,
|
| 273 |
locale: str = "en-US",
|
| 274 |
extra_headers: Optional[Dict[str, str]] = None,
|
| 275 |
useragent: Optional[str] = None,
|
| 276 |
cdp_url: Optional[str] = None,
|
| 277 |
-
timeout: Union[int, float] = 30000,
|
| 278 |
disable_resources: bool = False,
|
| 279 |
wait_selector: Optional[str] = None,
|
| 280 |
cookies: Optional[Iterable[Dict]] = None,
|
|
@@ -350,14 +349,14 @@ class DynamicFetcher(BaseFetcher):
|
|
| 350 |
disable_webgl: bool = False,
|
| 351 |
real_chrome: bool = False,
|
| 352 |
stealth: bool = False,
|
| 353 |
-
wait: Union[int, float] = 0,
|
| 354 |
page_action: Optional[Callable] = None,
|
| 355 |
-
proxy: Optional[Union[str, Dict[str, str]]] = None,
|
| 356 |
locale: str = "en-US",
|
| 357 |
extra_headers: Optional[Dict[str, str]] = None,
|
| 358 |
useragent: Optional[str] = None,
|
| 359 |
cdp_url: Optional[str] = None,
|
| 360 |
-
timeout: Union[int, float] = 30000,
|
| 361 |
disable_resources: bool = False,
|
| 362 |
wait_selector: Optional[str] = None,
|
| 363 |
cookies: Optional[Iterable[Dict]] = None,
|
|
|
|
| 4 |
List,
|
| 5 |
Optional,
|
| 6 |
SelectorWaitStates,
|
|
|
|
| 7 |
Iterable,
|
| 8 |
)
|
| 9 |
from scrapling.engines import (
|
|
|
|
| 50 |
def fetch(
|
| 51 |
cls,
|
| 52 |
url: str,
|
| 53 |
+
headless: bool = True, # noqa: F821
|
| 54 |
block_images: bool = False,
|
| 55 |
disable_resources: bool = False,
|
| 56 |
block_webrtc: bool = False,
|
| 57 |
allow_webgl: bool = True,
|
| 58 |
network_idle: bool = False,
|
| 59 |
+
humanize: bool | float = True,
|
| 60 |
solve_cloudflare: bool = False,
|
| 61 |
+
wait: int | float = 0,
|
| 62 |
+
timeout: int | float = 30000,
|
| 63 |
page_action: Optional[Callable] = None,
|
| 64 |
wait_selector: Optional[str] = None,
|
| 65 |
addons: Optional[List[str]] = None,
|
|
|
|
| 67 |
cookies: Optional[List[Dict]] = None,
|
| 68 |
google_search: bool = True,
|
| 69 |
extra_headers: Optional[Dict[str, str]] = None,
|
| 70 |
+
proxy: Optional[str | Dict[str, str]] = None,
|
| 71 |
os_randomize: bool = False,
|
| 72 |
disable_ads: bool = False,
|
| 73 |
geoip: bool = False,
|
|
|
|
| 146 |
async def async_fetch(
|
| 147 |
cls,
|
| 148 |
url: str,
|
| 149 |
+
headless: bool = True, # noqa: F821
|
| 150 |
block_images: bool = False,
|
| 151 |
disable_resources: bool = False,
|
| 152 |
block_webrtc: bool = False,
|
| 153 |
allow_webgl: bool = True,
|
| 154 |
network_idle: bool = False,
|
| 155 |
+
humanize: bool | float = True,
|
| 156 |
solve_cloudflare: bool = False,
|
| 157 |
+
wait: int | float = 0,
|
| 158 |
+
timeout: int | float = 30000,
|
| 159 |
page_action: Optional[Callable] = None,
|
| 160 |
wait_selector: Optional[str] = None,
|
| 161 |
addons: Optional[List[str]] = None,
|
|
|
|
| 163 |
cookies: Optional[List[Dict]] = None,
|
| 164 |
google_search: bool = True,
|
| 165 |
extra_headers: Optional[Dict[str, str]] = None,
|
| 166 |
+
proxy: Optional[str | Dict[str, str]] = None,
|
| 167 |
os_randomize: bool = False,
|
| 168 |
disable_ads: bool = False,
|
| 169 |
geoip: bool = False,
|
|
|
|
| 266 |
disable_webgl: bool = False,
|
| 267 |
real_chrome: bool = False,
|
| 268 |
stealth: bool = False,
|
| 269 |
+
wait: int | float = 0,
|
| 270 |
page_action: Optional[Callable] = None,
|
| 271 |
+
proxy: Optional[str | Dict[str, str]] = None,
|
| 272 |
locale: str = "en-US",
|
| 273 |
extra_headers: Optional[Dict[str, str]] = None,
|
| 274 |
useragent: Optional[str] = None,
|
| 275 |
cdp_url: Optional[str] = None,
|
| 276 |
+
timeout: int | float = 30000,
|
| 277 |
disable_resources: bool = False,
|
| 278 |
wait_selector: Optional[str] = None,
|
| 279 |
cookies: Optional[Iterable[Dict]] = None,
|
|
|
|
| 349 |
disable_webgl: bool = False,
|
| 350 |
real_chrome: bool = False,
|
| 351 |
stealth: bool = False,
|
| 352 |
+
wait: int | float = 0,
|
| 353 |
page_action: Optional[Callable] = None,
|
| 354 |
+
proxy: Optional[str | Dict[str, str]] = None,
|
| 355 |
locale: str = "en-US",
|
| 356 |
extra_headers: Optional[Dict[str, str]] = None,
|
| 357 |
useragent: Optional[str] = None,
|
| 358 |
cdp_url: Optional[str] = None,
|
| 359 |
+
timeout: int | float = 30000,
|
| 360 |
disable_resources: bool = False,
|
| 361 |
wait_selector: Optional[str] = None,
|
| 362 |
cookies: Optional[Iterable[Dict]] = None,
|
scrapling/parser.py
CHANGED
|
@@ -59,7 +59,7 @@ class Selector(SelectorsGeneration):
|
|
| 59 |
|
| 60 |
def __init__(
|
| 61 |
self,
|
| 62 |
-
content: Optional[
|
| 63 |
url: Optional[str] = None,
|
| 64 |
encoding: str = "utf8",
|
| 65 |
huge_tree: bool = True,
|
|
@@ -197,7 +197,7 @@ class Selector(SelectorsGeneration):
|
|
| 197 |
# Node functionalities, I wanted to move to a separate Mixin class, but it had a slight impact on performance
|
| 198 |
@staticmethod
|
| 199 |
def _is_text_node(
|
| 200 |
-
element:
|
| 201 |
) -> bool:
|
| 202 |
"""Return True if the given element is a result of a string expression
|
| 203 |
Examples:
|
|
@@ -209,7 +209,7 @@ class Selector(SelectorsGeneration):
|
|
| 209 |
|
| 210 |
@staticmethod
|
| 211 |
def __content_convertor(
|
| 212 |
-
element:
|
| 213 |
) -> TextHandler:
|
| 214 |
"""Used internally to convert a single element's text content to TextHandler directly without checks
|
| 215 |
|
|
@@ -235,8 +235,8 @@ class Selector(SelectorsGeneration):
|
|
| 235 |
)
|
| 236 |
|
| 237 |
def __handle_element(
|
| 238 |
-
self, element:
|
| 239 |
-
) ->
|
| 240 |
"""Used internally in all functions to convert a single element to type (Selector|TextHandler) when possible"""
|
| 241 |
if element is None:
|
| 242 |
return None
|
|
@@ -247,7 +247,7 @@ class Selector(SelectorsGeneration):
|
|
| 247 |
return self.__element_convertor(element)
|
| 248 |
|
| 249 |
def __handle_elements(
|
| 250 |
-
self, result: List[
|
| 251 |
) -> Union["Selectors", "TextHandlers"]:
|
| 252 |
"""Used internally in all functions to convert results to type (Selectors|TextHandlers) in bulk when possible"""
|
| 253 |
if not len(
|
|
@@ -364,18 +364,18 @@ class Selector(SelectorsGeneration):
|
|
| 364 |
return class_name in self._root.classes
|
| 365 |
|
| 366 |
@property
|
| 367 |
-
def parent(self) ->
|
| 368 |
"""Return the direct parent of the element or ``None`` otherwise"""
|
| 369 |
return self.__handle_element(self._root.getparent())
|
| 370 |
|
| 371 |
@property
|
| 372 |
-
def below_elements(self) -> "Selectors
|
| 373 |
"""Return all elements under the current element in the DOM tree"""
|
| 374 |
below = self._root.xpath(".//*")
|
| 375 |
return self.__handle_elements(below)
|
| 376 |
|
| 377 |
@property
|
| 378 |
-
def children(self) -> "Selectors
|
| 379 |
"""Return the children elements of the current element or empty list otherwise"""
|
| 380 |
return Selectors(
|
| 381 |
self.__element_convertor(child)
|
|
@@ -384,7 +384,7 @@ class Selector(SelectorsGeneration):
|
|
| 384 |
)
|
| 385 |
|
| 386 |
@property
|
| 387 |
-
def siblings(self) -> "Selectors
|
| 388 |
"""Return other children of the current element's parent or empty list otherwise"""
|
| 389 |
if self.parent:
|
| 390 |
return Selectors(
|
|
@@ -397,9 +397,7 @@ class Selector(SelectorsGeneration):
|
|
| 397 |
for ancestor in self._root.iterancestors():
|
| 398 |
yield self.__element_convertor(ancestor)
|
| 399 |
|
| 400 |
-
def find_ancestor(
|
| 401 |
-
self, func: Callable[["Selector"], bool]
|
| 402 |
-
) -> Union["Selector", None]:
|
| 403 |
"""Loop over all ancestors of the element till one match the passed function
|
| 404 |
:param func: A function that takes each ancestor as an argument and returns True/False
|
| 405 |
:return: The first ancestor that match the function or ``None`` otherwise.
|
|
@@ -410,13 +408,13 @@ class Selector(SelectorsGeneration):
|
|
| 410 |
return None
|
| 411 |
|
| 412 |
@property
|
| 413 |
-
def path(self) -> "Selectors
|
| 414 |
"""Returns a list of type `Selectors` that contains the path leading to the current element from the root."""
|
| 415 |
lst = list(self.iterancestors())
|
| 416 |
return Selectors(lst)
|
| 417 |
|
| 418 |
@property
|
| 419 |
-
def next(self) ->
|
| 420 |
"""Returns the next element of the current element in the children of the parent or ``None`` otherwise."""
|
| 421 |
next_element = self._root.getnext()
|
| 422 |
if next_element is not None:
|
|
@@ -427,7 +425,7 @@ class Selector(SelectorsGeneration):
|
|
| 427 |
return self.__handle_element(next_element)
|
| 428 |
|
| 429 |
@property
|
| 430 |
-
def previous(self) ->
|
| 431 |
"""Returns the previous element of the current element in the children of the parent or ``None`` otherwise."""
|
| 432 |
prev_element = self._root.getprevious()
|
| 433 |
if prev_element is not None:
|
|
@@ -470,10 +468,10 @@ class Selector(SelectorsGeneration):
|
|
| 470 |
# From here we start with the selecting functions
|
| 471 |
def relocate(
|
| 472 |
self,
|
| 473 |
-
element:
|
| 474 |
percentage: int = 0,
|
| 475 |
selector_type: bool = False,
|
| 476 |
-
) ->
|
| 477 |
"""This function will search again for the element in the page tree, used automatically on page structure change
|
| 478 |
|
| 479 |
:param element: The element we want to relocate in the tree
|
|
@@ -581,7 +579,7 @@ class Selector(SelectorsGeneration):
|
|
| 581 |
adaptive: bool = False,
|
| 582 |
auto_save: bool = False,
|
| 583 |
percentage: int = 0,
|
| 584 |
-
) ->
|
| 585 |
"""Search the current tree with CSS3 selectors
|
| 586 |
|
| 587 |
**Important:
|
|
@@ -644,7 +642,7 @@ class Selector(SelectorsGeneration):
|
|
| 644 |
auto_save: bool = False,
|
| 645 |
percentage: int = 0,
|
| 646 |
**kwargs: Any,
|
| 647 |
-
) ->
|
| 648 |
"""Search the current tree with XPath selectors
|
| 649 |
|
| 650 |
**Important:
|
|
@@ -708,7 +706,7 @@ class Selector(SelectorsGeneration):
|
|
| 708 |
|
| 709 |
def find_all(
|
| 710 |
self,
|
| 711 |
-
*args:
|
| 712 |
**kwargs: str,
|
| 713 |
) -> "Selectors":
|
| 714 |
"""Find elements by filters of your creations for ease.
|
|
@@ -815,9 +813,9 @@ class Selector(SelectorsGeneration):
|
|
| 815 |
|
| 816 |
def find(
|
| 817 |
self,
|
| 818 |
-
*args:
|
| 819 |
**kwargs: str,
|
| 820 |
-
) ->
|
| 821 |
"""Find elements by filters of your creations for ease, then return the first result. Otherwise return `None`.
|
| 822 |
|
| 823 |
:param args: Tag name(s), iterable of tag names, regex patterns, function, or a dictionary of elements' attributes. Leave empty for selecting all.
|
|
@@ -924,7 +922,7 @@ class Selector(SelectorsGeneration):
|
|
| 924 |
)
|
| 925 |
return score
|
| 926 |
|
| 927 |
-
def save(self, element:
|
| 928 |
"""Saves the element's unique properties to the storage for retrieval and relocation later
|
| 929 |
|
| 930 |
:param element: The element itself that we want to save to storage, it can be a ` Selector ` or pure ` HtmlElement `
|
|
@@ -969,7 +967,7 @@ class Selector(SelectorsGeneration):
|
|
| 969 |
|
| 970 |
def re(
|
| 971 |
self,
|
| 972 |
-
regex:
|
| 973 |
replace_entities: bool = True,
|
| 974 |
clean_match: bool = False,
|
| 975 |
case_sensitive: bool = True,
|
|
@@ -985,7 +983,7 @@ class Selector(SelectorsGeneration):
|
|
| 985 |
|
| 986 |
def re_first(
|
| 987 |
self,
|
| 988 |
-
regex:
|
| 989 |
default=None,
|
| 990 |
replace_entities: bool = True,
|
| 991 |
clean_match: bool = False,
|
|
@@ -1004,9 +1002,7 @@ class Selector(SelectorsGeneration):
|
|
| 1004 |
)
|
| 1005 |
|
| 1006 |
@staticmethod
|
| 1007 |
-
def __get_attributes(
|
| 1008 |
-
element: HtmlElement, ignore_attributes: Union[List, Tuple]
|
| 1009 |
-
) -> Dict:
|
| 1010 |
"""Return attributes dictionary without the ignored list"""
|
| 1011 |
return {k: v for k, v in element.attrib.items() if k not in ignore_attributes}
|
| 1012 |
|
|
@@ -1015,7 +1011,7 @@ class Selector(SelectorsGeneration):
|
|
| 1015 |
original: HtmlElement,
|
| 1016 |
original_attributes: Dict,
|
| 1017 |
candidate: HtmlElement,
|
| 1018 |
-
ignore_attributes:
|
| 1019 |
similarity_threshold: float,
|
| 1020 |
match_text: bool = False,
|
| 1021 |
) -> bool:
|
|
@@ -1055,12 +1051,12 @@ class Selector(SelectorsGeneration):
|
|
| 1055 |
def find_similar(
|
| 1056 |
self,
|
| 1057 |
similarity_threshold: float = 0.2,
|
| 1058 |
-
ignore_attributes:
|
| 1059 |
"href",
|
| 1060 |
"src",
|
| 1061 |
),
|
| 1062 |
match_text: bool = False,
|
| 1063 |
-
) ->
|
| 1064 |
"""Find elements that are in the same tree depth in the page with the same tag name and same parent tag etc...
|
| 1065 |
then return the ones that match the current element attributes with a percentage higher than the input threshold.
|
| 1066 |
|
|
@@ -1123,7 +1119,7 @@ class Selector(SelectorsGeneration):
|
|
| 1123 |
partial: bool = False,
|
| 1124 |
case_sensitive: bool = False,
|
| 1125 |
clean_match: bool = True,
|
| 1126 |
-
) -> Union["Selectors
|
| 1127 |
"""Find elements that its text content fully/partially matches input.
|
| 1128 |
:param text: Text query to match
|
| 1129 |
:param first_match: Returns the first element that matches conditions, enabled by default
|
|
@@ -1165,11 +1161,11 @@ class Selector(SelectorsGeneration):
|
|
| 1165 |
|
| 1166 |
def find_by_regex(
|
| 1167 |
self,
|
| 1168 |
-
query:
|
| 1169 |
first_match: bool = True,
|
| 1170 |
case_sensitive: bool = False,
|
| 1171 |
clean_match: bool = True,
|
| 1172 |
-
) -> Union["Selectors
|
| 1173 |
"""Find elements that its text content matches the input regex pattern.
|
| 1174 |
:param query: Regex query/pattern to match
|
| 1175 |
:param first_match: Return the first element that matches conditions; enabled by default.
|
|
@@ -1216,9 +1212,7 @@ class Selectors(List[Selector]):
|
|
| 1216 |
def __getitem__(self, pos: slice) -> "Selectors":
|
| 1217 |
pass
|
| 1218 |
|
| 1219 |
-
def __getitem__(
|
| 1220 |
-
self, pos: Union[SupportsIndex, slice]
|
| 1221 |
-
) -> Union[Selector, "Selectors"]:
|
| 1222 |
lst = super().__getitem__(pos)
|
| 1223 |
if isinstance(pos, slice):
|
| 1224 |
return self.__class__(lst)
|
|
@@ -1232,7 +1226,7 @@ class Selectors(List[Selector]):
|
|
| 1232 |
auto_save: bool = False,
|
| 1233 |
percentage: int = 0,
|
| 1234 |
**kwargs: Any,
|
| 1235 |
-
) -> "Selectors
|
| 1236 |
"""
|
| 1237 |
Call the ``.xpath()`` method for each element in this list and return
|
| 1238 |
their results as another `Selectors` class.
|
|
@@ -1267,7 +1261,7 @@ class Selectors(List[Selector]):
|
|
| 1267 |
identifier: str = "",
|
| 1268 |
auto_save: bool = False,
|
| 1269 |
percentage: int = 0,
|
| 1270 |
-
) -> "Selectors
|
| 1271 |
"""
|
| 1272 |
Call the ``.css()`` method for each element in this list and return
|
| 1273 |
their results flattened as another `Selectors` class.
|
|
@@ -1294,11 +1288,11 @@ class Selectors(List[Selector]):
|
|
| 1294 |
|
| 1295 |
def re(
|
| 1296 |
self,
|
| 1297 |
-
regex:
|
| 1298 |
replace_entities: bool = True,
|
| 1299 |
clean_match: bool = False,
|
| 1300 |
case_sensitive: bool = True,
|
| 1301 |
-
) -> TextHandlers
|
| 1302 |
"""Call the ``.re()`` method for each element in this list and return
|
| 1303 |
their results flattened as List of TextHandler.
|
| 1304 |
|
|
@@ -1315,7 +1309,7 @@ class Selectors(List[Selector]):
|
|
| 1315 |
|
| 1316 |
def re_first(
|
| 1317 |
self,
|
| 1318 |
-
regex:
|
| 1319 |
default=None,
|
| 1320 |
replace_entities: bool = True,
|
| 1321 |
clean_match: bool = False,
|
|
@@ -1335,7 +1329,7 @@ class Selectors(List[Selector]):
|
|
| 1335 |
return result
|
| 1336 |
return default
|
| 1337 |
|
| 1338 |
-
def search(self, func: Callable[["Selector"], bool]) ->
|
| 1339 |
"""Loop over all current elements and return the first element that matches the passed function
|
| 1340 |
:param func: A function that takes each element as an argument and returns True/False
|
| 1341 |
:return: The first element that match the function or ``None`` otherwise.
|
|
@@ -1345,7 +1339,7 @@ class Selectors(List[Selector]):
|
|
| 1345 |
return element
|
| 1346 |
return None
|
| 1347 |
|
| 1348 |
-
def filter(self, func: Callable[["Selector"], bool]) -> "Selectors
|
| 1349 |
"""Filter current elements based on the passed function
|
| 1350 |
:param func: A function that takes each element as an argument and returns True/False
|
| 1351 |
:return: The new `Selectors` object or empty list otherwise.
|
|
|
|
| 59 |
|
| 60 |
def __init__(
|
| 61 |
self,
|
| 62 |
+
content: Optional[str | bytes] = None,
|
| 63 |
url: Optional[str] = None,
|
| 64 |
encoding: str = "utf8",
|
| 65 |
huge_tree: bool = True,
|
|
|
|
| 197 |
# Node functionalities, I wanted to move to a separate Mixin class, but it had a slight impact on performance
|
| 198 |
@staticmethod
|
| 199 |
def _is_text_node(
|
| 200 |
+
element: HtmlElement | _ElementUnicodeResult,
|
| 201 |
) -> bool:
|
| 202 |
"""Return True if the given element is a result of a string expression
|
| 203 |
Examples:
|
|
|
|
| 209 |
|
| 210 |
@staticmethod
|
| 211 |
def __content_convertor(
|
| 212 |
+
element: HtmlElement | _ElementUnicodeResult,
|
| 213 |
) -> TextHandler:
|
| 214 |
"""Used internally to convert a single element's text content to TextHandler directly without checks
|
| 215 |
|
|
|
|
| 235 |
)
|
| 236 |
|
| 237 |
def __handle_element(
|
| 238 |
+
self, element: HtmlElement | _ElementUnicodeResult
|
| 239 |
+
) -> Optional[TextHandler | "Selector"]:
|
| 240 |
"""Used internally in all functions to convert a single element to type (Selector|TextHandler) when possible"""
|
| 241 |
if element is None:
|
| 242 |
return None
|
|
|
|
| 247 |
return self.__element_convertor(element)
|
| 248 |
|
| 249 |
def __handle_elements(
|
| 250 |
+
self, result: List[HtmlElement | _ElementUnicodeResult]
|
| 251 |
) -> Union["Selectors", "TextHandlers"]:
|
| 252 |
"""Used internally in all functions to convert results to type (Selectors|TextHandlers) in bulk when possible"""
|
| 253 |
if not len(
|
|
|
|
| 364 |
return class_name in self._root.classes
|
| 365 |
|
| 366 |
@property
|
| 367 |
+
def parent(self) -> Optional["Selector"]:
|
| 368 |
"""Return the direct parent of the element or ``None`` otherwise"""
|
| 369 |
return self.__handle_element(self._root.getparent())
|
| 370 |
|
| 371 |
@property
|
| 372 |
+
def below_elements(self) -> "Selectors":
|
| 373 |
"""Return all elements under the current element in the DOM tree"""
|
| 374 |
below = self._root.xpath(".//*")
|
| 375 |
return self.__handle_elements(below)
|
| 376 |
|
| 377 |
@property
|
| 378 |
+
def children(self) -> "Selectors":
|
| 379 |
"""Return the children elements of the current element or empty list otherwise"""
|
| 380 |
return Selectors(
|
| 381 |
self.__element_convertor(child)
|
|
|
|
| 384 |
)
|
| 385 |
|
| 386 |
@property
|
| 387 |
+
def siblings(self) -> "Selectors":
|
| 388 |
"""Return other children of the current element's parent or empty list otherwise"""
|
| 389 |
if self.parent:
|
| 390 |
return Selectors(
|
|
|
|
| 397 |
for ancestor in self._root.iterancestors():
|
| 398 |
yield self.__element_convertor(ancestor)
|
| 399 |
|
| 400 |
+
def find_ancestor(self, func: Callable[["Selector"], bool]) -> Optional["Selector"]:
|
|
|
|
|
|
|
| 401 |
"""Loop over all ancestors of the element till one match the passed function
|
| 402 |
:param func: A function that takes each ancestor as an argument and returns True/False
|
| 403 |
:return: The first ancestor that match the function or ``None`` otherwise.
|
|
|
|
| 408 |
return None
|
| 409 |
|
| 410 |
@property
|
| 411 |
+
def path(self) -> "Selectors":
|
| 412 |
"""Returns a list of type `Selectors` that contains the path leading to the current element from the root."""
|
| 413 |
lst = list(self.iterancestors())
|
| 414 |
return Selectors(lst)
|
| 415 |
|
| 416 |
@property
|
| 417 |
+
def next(self) -> Optional["Selector"]:
|
| 418 |
"""Returns the next element of the current element in the children of the parent or ``None`` otherwise."""
|
| 419 |
next_element = self._root.getnext()
|
| 420 |
if next_element is not None:
|
|
|
|
| 425 |
return self.__handle_element(next_element)
|
| 426 |
|
| 427 |
@property
|
| 428 |
+
def previous(self) -> Optional["Selector"]:
|
| 429 |
"""Returns the previous element of the current element in the children of the parent or ``None`` otherwise."""
|
| 430 |
prev_element = self._root.getprevious()
|
| 431 |
if prev_element is not None:
|
|
|
|
| 468 |
# From here we start with the selecting functions
|
| 469 |
def relocate(
|
| 470 |
self,
|
| 471 |
+
element: Dict | HtmlElement | "Selector",
|
| 472 |
percentage: int = 0,
|
| 473 |
selector_type: bool = False,
|
| 474 |
+
) -> List[HtmlElement] | "Selectors":
|
| 475 |
"""This function will search again for the element in the page tree, used automatically on page structure change
|
| 476 |
|
| 477 |
:param element: The element we want to relocate in the tree
|
|
|
|
| 579 |
adaptive: bool = False,
|
| 580 |
auto_save: bool = False,
|
| 581 |
percentage: int = 0,
|
| 582 |
+
) -> "Selectors" | List | "TextHandlers":
|
| 583 |
"""Search the current tree with CSS3 selectors
|
| 584 |
|
| 585 |
**Important:
|
|
|
|
| 642 |
auto_save: bool = False,
|
| 643 |
percentage: int = 0,
|
| 644 |
**kwargs: Any,
|
| 645 |
+
) -> "Selectors" | List | "TextHandlers":
|
| 646 |
"""Search the current tree with XPath selectors
|
| 647 |
|
| 648 |
**Important:
|
|
|
|
| 706 |
|
| 707 |
def find_all(
|
| 708 |
self,
|
| 709 |
+
*args: str | Iterable[str] | Pattern | Callable | Dict[str, str],
|
| 710 |
**kwargs: str,
|
| 711 |
) -> "Selectors":
|
| 712 |
"""Find elements by filters of your creations for ease.
|
|
|
|
| 813 |
|
| 814 |
def find(
|
| 815 |
self,
|
| 816 |
+
*args: str | Iterable[str] | Pattern | Callable | Dict[str, str],
|
| 817 |
**kwargs: str,
|
| 818 |
+
) -> Optional["Selector"]:
|
| 819 |
"""Find elements by filters of your creations for ease, then return the first result. Otherwise return `None`.
|
| 820 |
|
| 821 |
:param args: Tag name(s), iterable of tag names, regex patterns, function, or a dictionary of elements' attributes. Leave empty for selecting all.
|
|
|
|
| 922 |
)
|
| 923 |
return score
|
| 924 |
|
| 925 |
+
def save(self, element: "Selector" | HtmlElement, identifier: str) -> None:
|
| 926 |
"""Saves the element's unique properties to the storage for retrieval and relocation later
|
| 927 |
|
| 928 |
:param element: The element itself that we want to save to storage, it can be a ` Selector ` or pure ` HtmlElement `
|
|
|
|
| 967 |
|
| 968 |
def re(
|
| 969 |
self,
|
| 970 |
+
regex: str | Pattern[str],
|
| 971 |
replace_entities: bool = True,
|
| 972 |
clean_match: bool = False,
|
| 973 |
case_sensitive: bool = True,
|
|
|
|
| 983 |
|
| 984 |
def re_first(
|
| 985 |
self,
|
| 986 |
+
regex: str | Pattern[str],
|
| 987 |
default=None,
|
| 988 |
replace_entities: bool = True,
|
| 989 |
clean_match: bool = False,
|
|
|
|
| 1002 |
)
|
| 1003 |
|
| 1004 |
@staticmethod
|
| 1005 |
+
def __get_attributes(element: HtmlElement, ignore_attributes: List | Tuple) -> Dict:
|
|
|
|
|
|
|
| 1006 |
"""Return attributes dictionary without the ignored list"""
|
| 1007 |
return {k: v for k, v in element.attrib.items() if k not in ignore_attributes}
|
| 1008 |
|
|
|
|
| 1011 |
original: HtmlElement,
|
| 1012 |
original_attributes: Dict,
|
| 1013 |
candidate: HtmlElement,
|
| 1014 |
+
ignore_attributes: List | Tuple,
|
| 1015 |
similarity_threshold: float,
|
| 1016 |
match_text: bool = False,
|
| 1017 |
) -> bool:
|
|
|
|
| 1051 |
def find_similar(
|
| 1052 |
self,
|
| 1053 |
similarity_threshold: float = 0.2,
|
| 1054 |
+
ignore_attributes: List | Tuple = (
|
| 1055 |
"href",
|
| 1056 |
"src",
|
| 1057 |
),
|
| 1058 |
match_text: bool = False,
|
| 1059 |
+
) -> "Selectors" | List:
|
| 1060 |
"""Find elements that are in the same tree depth in the page with the same tag name and same parent tag etc...
|
| 1061 |
then return the ones that match the current element attributes with a percentage higher than the input threshold.
|
| 1062 |
|
|
|
|
| 1119 |
partial: bool = False,
|
| 1120 |
case_sensitive: bool = False,
|
| 1121 |
clean_match: bool = True,
|
| 1122 |
+
) -> Union["Selectors", "Selector"]:
|
| 1123 |
"""Find elements that its text content fully/partially matches input.
|
| 1124 |
:param text: Text query to match
|
| 1125 |
:param first_match: Returns the first element that matches conditions, enabled by default
|
|
|
|
| 1161 |
|
| 1162 |
def find_by_regex(
|
| 1163 |
self,
|
| 1164 |
+
query: str | Pattern[str],
|
| 1165 |
first_match: bool = True,
|
| 1166 |
case_sensitive: bool = False,
|
| 1167 |
clean_match: bool = True,
|
| 1168 |
+
) -> Union["Selectors", "Selector"]:
|
| 1169 |
"""Find elements that its text content matches the input regex pattern.
|
| 1170 |
:param query: Regex query/pattern to match
|
| 1171 |
:param first_match: Return the first element that matches conditions; enabled by default.
|
|
|
|
| 1212 |
def __getitem__(self, pos: slice) -> "Selectors":
|
| 1213 |
pass
|
| 1214 |
|
| 1215 |
+
def __getitem__(self, pos: SupportsIndex | slice) -> Selector | "Selectors":
|
|
|
|
|
|
|
| 1216 |
lst = super().__getitem__(pos)
|
| 1217 |
if isinstance(pos, slice):
|
| 1218 |
return self.__class__(lst)
|
|
|
|
| 1226 |
auto_save: bool = False,
|
| 1227 |
percentage: int = 0,
|
| 1228 |
**kwargs: Any,
|
| 1229 |
+
) -> "Selectors":
|
| 1230 |
"""
|
| 1231 |
Call the ``.xpath()`` method for each element in this list and return
|
| 1232 |
their results as another `Selectors` class.
|
|
|
|
| 1261 |
identifier: str = "",
|
| 1262 |
auto_save: bool = False,
|
| 1263 |
percentage: int = 0,
|
| 1264 |
+
) -> "Selectors":
|
| 1265 |
"""
|
| 1266 |
Call the ``.css()`` method for each element in this list and return
|
| 1267 |
their results flattened as another `Selectors` class.
|
|
|
|
| 1288 |
|
| 1289 |
def re(
|
| 1290 |
self,
|
| 1291 |
+
regex: str | Pattern,
|
| 1292 |
replace_entities: bool = True,
|
| 1293 |
clean_match: bool = False,
|
| 1294 |
case_sensitive: bool = True,
|
| 1295 |
+
) -> TextHandlers:
|
| 1296 |
"""Call the ``.re()`` method for each element in this list and return
|
| 1297 |
their results flattened as List of TextHandler.
|
| 1298 |
|
|
|
|
| 1309 |
|
| 1310 |
def re_first(
|
| 1311 |
self,
|
| 1312 |
+
regex: str | Pattern,
|
| 1313 |
default=None,
|
| 1314 |
replace_entities: bool = True,
|
| 1315 |
clean_match: bool = False,
|
|
|
|
| 1329 |
return result
|
| 1330 |
return default
|
| 1331 |
|
| 1332 |
+
def search(self, func: Callable[["Selector"], bool]) -> Optional["Selector"]:
|
| 1333 |
"""Loop over all current elements and return the first element that matches the passed function
|
| 1334 |
:param func: A function that takes each element as an argument and returns True/False
|
| 1335 |
:return: The first element that match the function or ``None`` otherwise.
|
|
|
|
| 1339 |
return element
|
| 1340 |
return None
|
| 1341 |
|
| 1342 |
+
def filter(self, func: Callable[["Selector"], bool]) -> "Selectors":
|
| 1343 |
"""Filter current elements based on the passed function
|
| 1344 |
:param func: A function that takes each element as an argument and returns True/False
|
| 1345 |
:return: The new `Selectors` object or empty list otherwise.
|