Karim shoair committed on
Commit
e5ecf76
·
1 Parent(s): c4135c8

refactor: Making the whole codebase acceptable to Pyright

Browse files
scrapling/core/_types.py CHANGED
@@ -12,9 +12,11 @@ from typing import (
12
  Generator,
13
  Iterable,
14
  List,
 
15
  Literal,
16
  Optional,
17
  Pattern,
 
18
  Tuple,
19
  TypeVar,
20
  Union,
@@ -22,6 +24,7 @@ from typing import (
22
  Mapping,
23
  Awaitable,
24
  Protocol,
 
25
  SupportsIndex,
26
  )
27
 
 
12
  Generator,
13
  Iterable,
14
  List,
15
+ Set,
16
  Literal,
17
  Optional,
18
  Pattern,
19
+ Sequence,
20
  Tuple,
21
  TypeVar,
22
  Union,
 
24
  Mapping,
25
  Awaitable,
26
  Protocol,
27
+ Coroutine,
28
  SupportsIndex,
29
  )
30
 
scrapling/core/ai.py CHANGED
@@ -20,6 +20,7 @@ from scrapling.core._types import (
20
  Mapping,
21
  Dict,
22
  List,
 
23
  SelectorWaitStates,
24
  Generator,
25
  )
@@ -171,7 +172,7 @@ class ScraplingMCPServer:
171
  :param stealthy_headers: If enabled (default), it creates and adds real browser headers. It also sets the referer header as if this request came from a Google search of URL's domain.
172
  """
173
  async with FetcherSession() as session:
174
- tasks = [
175
  session.get(
176
  url,
177
  auth=auth,
 
20
  Mapping,
21
  Dict,
22
  List,
23
+ Any,
24
  SelectorWaitStates,
25
  Generator,
26
  )
 
172
  :param stealthy_headers: If enabled (default), it creates and adds real browser headers. It also sets the referer header as if this request came from a Google search of URL's domain.
173
  """
174
  async with FetcherSession() as session:
175
+ tasks: List[Any] = [
176
  session.get(
177
  url,
178
  auth=auth,
scrapling/core/custom_types.py CHANGED
@@ -5,6 +5,7 @@ from re import compile as re_compile, UNICODE, IGNORECASE
5
  from orjson import dumps, loads
6
 
7
  from scrapling.core._types import (
 
8
  cast,
9
  Dict,
10
  List,
@@ -14,7 +15,6 @@ from scrapling.core._types import (
14
  Literal,
15
  Pattern,
16
  Iterable,
17
- Optional,
18
  Generator,
19
  SupportsIndex,
20
  )
@@ -33,23 +33,20 @@ class TextHandler(str):
33
 
34
  def __getitem__(self, key: SupportsIndex | slice) -> "TextHandler": # pragma: no cover
35
  lst = super().__getitem__(key)
36
- return cast(_TextHandlerType, TextHandler(lst))
37
 
38
- def split(self, sep: str = None, maxsplit: SupportsIndex = -1) -> "TextHandlers": # pragma: no cover
39
- return TextHandlers(
40
- cast(
41
- List[_TextHandlerType],
42
- [TextHandler(s) for s in super().split(sep, maxsplit)],
43
- )
44
- )
45
 
46
- def strip(self, chars: str = None) -> Union[str, "TextHandler"]: # pragma: no cover
47
  return TextHandler(super().strip(chars))
48
 
49
- def lstrip(self, chars: str = None) -> Union[str, "TextHandler"]: # pragma: no cover
50
  return TextHandler(super().lstrip(chars))
51
 
52
- def rstrip(self, chars: str = None) -> Union[str, "TextHandler"]: # pragma: no cover
53
  return TextHandler(super().rstrip(chars))
54
 
55
  def capitalize(self) -> Union[str, "TextHandler"]: # pragma: no cover
@@ -64,7 +61,7 @@ class TextHandler(str):
64
  def expandtabs(self, tabsize: SupportsIndex = 8) -> Union[str, "TextHandler"]: # pragma: no cover
65
  return TextHandler(super().expandtabs(tabsize))
66
 
67
- def format(self, *args: str, **kwargs: str) -> Union[str, "TextHandler"]: # pragma: no cover
68
  return TextHandler(super().format(*args, **kwargs))
69
 
70
  def format_map(self, mapping) -> Union[str, "TextHandler"]: # pragma: no cover
@@ -131,10 +128,11 @@ class TextHandler(str):
131
  def re(
132
  self,
133
  regex: str | Pattern,
134
- check_match: Literal[True],
135
  replace_entities: bool = True,
136
  clean_match: bool = False,
137
  case_sensitive: bool = True,
 
 
138
  ) -> bool: ...
139
 
140
  @overload
@@ -179,19 +177,14 @@ class TextHandler(str):
179
  results = flatten(results)
180
 
181
  if not replace_entities:
182
- return TextHandlers(cast(List[_TextHandlerType], [TextHandler(string) for string in results]))
183
 
184
- return TextHandlers(
185
- cast(
186
- List[_TextHandlerType],
187
- [TextHandler(_replace_entities(s)) for s in results],
188
- )
189
- )
190
 
191
  def re_first(
192
  self,
193
  regex: str | Pattern,
194
- default=None,
195
  replace_entities: bool = True,
196
  clean_match: bool = False,
197
  case_sensitive: bool = True,
@@ -232,8 +225,8 @@ class TextHandlers(List[TextHandler]):
232
  def __getitem__(self, pos: SupportsIndex | slice) -> Union[TextHandler, "TextHandlers"]:
233
  lst = super().__getitem__(pos)
234
  if isinstance(pos, slice):
235
- return TextHandlers(cast(List[_TextHandlerType], lst))
236
- return cast(_TextHandlerType, TextHandler(lst))
237
 
238
  def re(
239
  self,
@@ -256,7 +249,7 @@ class TextHandlers(List[TextHandler]):
256
  def re_first(
257
  self,
258
  regex: str | Pattern,
259
- default=None,
260
  replace_entities: bool = True,
261
  clean_match: bool = False,
262
  case_sensitive: bool = True,
@@ -309,9 +302,9 @@ class AttributesHandler(Mapping[str, _TextHandlerType]):
309
  )
310
 
311
  # Fastest read-only mapping type
312
- self._data = MappingProxyType(mapping)
313
 
314
- def get(self, key: str, default: Optional[str] = None) -> Optional[_TextHandlerType]:
315
  """Acts like the standard dictionary `.get()` method"""
316
  return self._data.get(key, default)
317
 
 
5
  from orjson import dumps, loads
6
 
7
  from scrapling.core._types import (
8
+ Any,
9
  cast,
10
  Dict,
11
  List,
 
15
  Literal,
16
  Pattern,
17
  Iterable,
 
18
  Generator,
19
  SupportsIndex,
20
  )
 
33
 
34
  def __getitem__(self, key: SupportsIndex | slice) -> "TextHandler": # pragma: no cover
35
  lst = super().__getitem__(key)
36
+ return TextHandler(lst)
37
 
38
+ def split(
39
+ self, sep: str | None = None, maxsplit: SupportsIndex = -1
40
+ ) -> Union[List, "TextHandlers"]: # pragma: no cover
41
+ return TextHandlers([TextHandler(s) for s in super().split(sep, maxsplit)])
 
 
 
42
 
43
+ def strip(self, chars: str | None = None) -> Union[str, "TextHandler"]: # pragma: no cover
44
  return TextHandler(super().strip(chars))
45
 
46
+ def lstrip(self, chars: str | None = None) -> Union[str, "TextHandler"]: # pragma: no cover
47
  return TextHandler(super().lstrip(chars))
48
 
49
+ def rstrip(self, chars: str | None = None) -> Union[str, "TextHandler"]: # pragma: no cover
50
  return TextHandler(super().rstrip(chars))
51
 
52
  def capitalize(self) -> Union[str, "TextHandler"]: # pragma: no cover
 
61
  def expandtabs(self, tabsize: SupportsIndex = 8) -> Union[str, "TextHandler"]: # pragma: no cover
62
  return TextHandler(super().expandtabs(tabsize))
63
 
64
+ def format(self, *args: object, **kwargs: str) -> Union[str, "TextHandler"]: # pragma: no cover
65
  return TextHandler(super().format(*args, **kwargs))
66
 
67
  def format_map(self, mapping) -> Union[str, "TextHandler"]: # pragma: no cover
 
128
  def re(
129
  self,
130
  regex: str | Pattern,
 
131
  replace_entities: bool = True,
132
  clean_match: bool = False,
133
  case_sensitive: bool = True,
134
+ *,
135
+ check_match: Literal[True],
136
  ) -> bool: ...
137
 
138
  @overload
 
177
  results = flatten(results)
178
 
179
  if not replace_entities:
180
+ return TextHandlers([TextHandler(string) for string in results])
181
 
182
+ return TextHandlers([TextHandler(_replace_entities(s)) for s in results])
 
 
 
 
 
183
 
184
  def re_first(
185
  self,
186
  regex: str | Pattern,
187
+ default: Any = None,
188
  replace_entities: bool = True,
189
  clean_match: bool = False,
190
  case_sensitive: bool = True,
 
225
  def __getitem__(self, pos: SupportsIndex | slice) -> Union[TextHandler, "TextHandlers"]:
226
  lst = super().__getitem__(pos)
227
  if isinstance(pos, slice):
228
+ return TextHandlers(cast(List[TextHandler], lst))
229
+ return TextHandler(cast(TextHandler, lst))
230
 
231
  def re(
232
  self,
 
249
  def re_first(
250
  self,
251
  regex: str | Pattern,
252
+ default: Any = None,
253
  replace_entities: bool = True,
254
  clean_match: bool = False,
255
  case_sensitive: bool = True,
 
302
  )
303
 
304
  # Fastest read-only mapping type
305
+ self._data: Mapping[str, Any] = MappingProxyType(mapping)
306
 
307
+ def get(self, key: str, default: Any = None) -> _TextHandlerType:
308
  """Acts like the standard dictionary `.get()` method"""
309
  return self._data.get(key, default)
310
 
scrapling/core/mixins.py CHANGED
@@ -1,3 +1,9 @@
 
 
 
 
 
 
1
  class SelectorsGeneration:
2
  """
3
  Functions for generating selectors
@@ -5,7 +11,7 @@ class SelectorsGeneration:
5
  Inspiration: https://searchfox.org/mozilla-central/source/devtools/shared/inspector/css-logic.js#591
6
  """
7
 
8
- def __general_selection(self, selection: str = "css", full_path: bool = False) -> str:
9
  """Generate a selector for the current element.
10
  :return: A string of the generated selector.
11
  """
@@ -47,29 +53,29 @@ class SelectorsGeneration:
47
  return " > ".join(reversed(selectorPath)) if css else "//" + "/".join(reversed(selectorPath))
48
 
49
  @property
50
- def generate_css_selector(self) -> str:
51
  """Generate a CSS selector for the current element
52
  :return: A string of the generated selector.
53
  """
54
- return self.__general_selection()
55
 
56
  @property
57
- def generate_full_css_selector(self) -> str:
58
  """Generate a complete CSS selector for the current element
59
  :return: A string of the generated selector.
60
  """
61
- return self.__general_selection(full_path=True)
62
 
63
  @property
64
- def generate_xpath_selector(self) -> str:
65
  """Generate an XPath selector for the current element
66
  :return: A string of the generated selector.
67
  """
68
- return self.__general_selection("xpath")
69
 
70
  @property
71
- def generate_full_xpath_selector(self) -> str:
72
  """Generate a complete XPath selector for the current element
73
  :return: A string of the generated selector.
74
  """
75
- return self.__general_selection("xpath", full_path=True)
 
1
+ from scrapling.core._types import TYPE_CHECKING
2
+
3
+ if TYPE_CHECKING:
4
+ from scrapling.parser import Selector
5
+
6
+
7
  class SelectorsGeneration:
8
  """
9
  Functions for generating selectors
 
11
  Inspiration: https://searchfox.org/mozilla-central/source/devtools/shared/inspector/css-logic.js#591
12
  """
13
 
14
+ def _general_selection(self: "Selector", selection: str = "css", full_path: bool = False) -> str: # type: ignore[name-defined]
15
  """Generate a selector for the current element.
16
  :return: A string of the generated selector.
17
  """
 
53
  return " > ".join(reversed(selectorPath)) if css else "//" + "/".join(reversed(selectorPath))
54
 
55
  @property
56
+ def generate_css_selector(self: "Selector") -> str: # type: ignore[name-defined]
57
  """Generate a CSS selector for the current element
58
  :return: A string of the generated selector.
59
  """
60
+ return self._general_selection()
61
 
62
  @property
63
+ def generate_full_css_selector(self: "Selector") -> str: # type: ignore[name-defined]
64
  """Generate a complete CSS selector for the current element
65
  :return: A string of the generated selector.
66
  """
67
+ return self._general_selection(full_path=True)
68
 
69
  @property
70
+ def generate_xpath_selector(self: "Selector") -> str: # type: ignore[name-defined]
71
  """Generate an XPath selector for the current element
72
  :return: A string of the generated selector.
73
  """
74
+ return self._general_selection("xpath")
75
 
76
  @property
77
+ def generate_full_xpath_selector(self: "Selector") -> str: # type: ignore[name-defined]
78
  """Generate a complete XPath selector for the current element
79
  :return: A string of the generated selector.
80
  """
81
+ return self._general_selection("xpath", full_path=True)
scrapling/core/shell.py CHANGED
@@ -31,6 +31,7 @@ from scrapling.core._types import (
31
  Optional,
32
  Dict,
33
  Any,
 
34
  extraction_types,
35
  Generator,
36
  )
@@ -540,15 +541,15 @@ class Convertor:
540
  raise ValueError(f"Unknown extraction type: {extraction_type}")
541
  else:
542
  if main_content_only:
543
- page = page.css_first("body") or page
544
 
545
- pages = [page] if not css_selector else page.css(css_selector)
546
  for page in pages:
547
  match extraction_type:
548
  case "markdown":
549
  yield cls._convert_to_markdown(page.html_content)
550
  case "html":
551
- yield page.body
552
  case "text":
553
  txt_content = page.get_all_text(strip=True)
554
  for s in (
 
31
  Optional,
32
  Dict,
33
  Any,
34
+ cast,
35
  extraction_types,
36
  Generator,
37
  )
 
541
  raise ValueError(f"Unknown extraction type: {extraction_type}")
542
  else:
543
  if main_content_only:
544
+ page = cast(Selector, page.css_first("body")) or page
545
 
546
+ pages = [page] if not css_selector else cast(Selectors, page.css(css_selector))
547
  for page in pages:
548
  match extraction_type:
549
  case "markdown":
550
  yield cls._convert_to_markdown(page.html_content)
551
  case "html":
552
+ yield page.html_content
553
  case "text":
554
  txt_content = page.get_all_text(strip=True)
555
  for s in (
scrapling/core/storage.py CHANGED
@@ -56,13 +56,13 @@ class StorageSystemMixin(ABC): # pragma: no cover
56
  @lru_cache(128, typed=True)
57
  def _get_hash(identifier: str) -> str:
58
  """If you want to hash identifier in your storage system, use this safer"""
59
- identifier = identifier.lower().strip()
60
- if isinstance(identifier, str):
61
  # Hash functions have to take bytes
62
- identifier = identifier.encode("utf-8")
63
 
64
- hash_value = sha256(identifier).hexdigest()
65
- return f"{hash_value}_{len(identifier)}" # Length to reduce collision chance
66
 
67
 
68
  @lru_cache(1, typed=True)
 
56
  @lru_cache(128, typed=True)
57
  def _get_hash(identifier: str) -> str:
58
  """If you want to hash identifier in your storage system, use this safer"""
59
+ _identifier = identifier.lower().strip()
60
+ if isinstance(_identifier, str):
61
  # Hash functions have to take bytes
62
+ _identifier = _identifier.encode("utf-8")
63
 
64
+ hash_value = sha256(_identifier).hexdigest()
65
+ return f"{hash_value}_{len(_identifier)}" # Length to reduce collision chance
66
 
67
 
68
  @lru_cache(1, typed=True)
scrapling/core/translator.py CHANGED
@@ -10,24 +10,23 @@ So you don't have to learn a new selectors/api method like what bs4 done with so
10
 
11
  from functools import lru_cache
12
 
13
- from cssselect.xpath import ExpressionError
14
- from cssselect.xpath import XPathExpr as OriginalXPathExpr
15
  from cssselect import HTMLTranslator as OriginalHTMLTranslator
 
16
  from cssselect.parser import Element, FunctionalPseudoElement, PseudoElement
17
 
18
- from scrapling.core._types import Any, Optional, Protocol, Self
19
 
20
 
21
  class XPathExpr(OriginalXPathExpr):
22
  textnode: bool = False
23
- attribute: Optional[str] = None
24
 
25
  @classmethod
26
  def from_xpath(
27
  cls,
28
  xpath: OriginalXPathExpr,
29
  textnode: bool = False,
30
- attribute: Optional[str] = None,
31
  ) -> Self:
32
  x = cls(path=xpath.path, element=xpath.element, condition=xpath.condition)
33
  x.textnode = textnode
@@ -71,10 +70,10 @@ class XPathExpr(OriginalXPathExpr):
71
 
72
  # e.g. cssselect.GenericTranslator, cssselect.HTMLTranslator
73
  class TranslatorProtocol(Protocol):
74
- def xpath_element(self, selector: Element) -> OriginalXPathExpr: # pragma: no cover
75
  pass
76
 
77
- def css_to_xpath(self, css: str, prefix: str = ...) -> str: # pragma: no cover
78
  pass
79
 
80
 
@@ -121,9 +120,15 @@ class TranslatorMixin:
121
 
122
 
123
  class HTMLTranslator(TranslatorMixin, OriginalHTMLTranslator):
124
- @lru_cache(maxsize=256)
125
  def css_to_xpath(self, css: str, prefix: str = "descendant-or-self::") -> str:
126
  return super().css_to_xpath(css, prefix)
127
 
128
 
129
  translator = HTMLTranslator()
 
 
 
 
 
 
 
 
10
 
11
  from functools import lru_cache
12
 
 
 
13
  from cssselect import HTMLTranslator as OriginalHTMLTranslator
14
+ from cssselect.xpath import ExpressionError, XPathExpr as OriginalXPathExpr
15
  from cssselect.parser import Element, FunctionalPseudoElement, PseudoElement
16
 
17
+ from scrapling.core._types import Any, Protocol, Self
18
 
19
 
20
  class XPathExpr(OriginalXPathExpr):
21
  textnode: bool = False
22
+ attribute: str | None = None
23
 
24
  @classmethod
25
  def from_xpath(
26
  cls,
27
  xpath: OriginalXPathExpr,
28
  textnode: bool = False,
29
+ attribute: str | None = None,
30
  ) -> Self:
31
  x = cls(path=xpath.path, element=xpath.element, condition=xpath.condition)
32
  x.textnode = textnode
 
70
 
71
  # e.g. cssselect.GenericTranslator, cssselect.HTMLTranslator
72
  class TranslatorProtocol(Protocol):
73
+ def xpath_element(self, selector: Element) -> OriginalXPathExpr: # pyright: ignore # pragma: no cover
74
  pass
75
 
76
+ def css_to_xpath(self, css: str, prefix: str = ...) -> str: # pyright: ignore # pragma: no cover
77
  pass
78
 
79
 
 
120
 
121
 
122
  class HTMLTranslator(TranslatorMixin, OriginalHTMLTranslator):
 
123
  def css_to_xpath(self, css: str, prefix: str = "descendant-or-self::") -> str:
124
  return super().css_to_xpath(css, prefix)
125
 
126
 
127
  translator = HTMLTranslator()
128
+ # Using a function instead of the translator directly to avoid Pyright override error
129
+
130
+
131
+ @lru_cache(maxsize=256)
132
+ def css_to_xpath(query: str) -> str:
133
+ """Return translated XPath version of a given CSS query"""
134
+ return translator.css_to_xpath(query)
scrapling/engines/_browsers/_base.py CHANGED
@@ -7,14 +7,12 @@ from playwright.async_api import (
7
  BrowserContext as AsyncBrowserContext,
8
  Playwright as AsyncPlaywright,
9
  )
10
- from camoufox.utils import (
11
- launch_options as generate_launch_options,
12
- installed_verstr as camoufox_version,
13
- )
14
 
15
  from ._page import PageInfo, PagePool
16
  from scrapling.parser import Selector
17
- from scrapling.core._types import Dict, Optional
18
  from scrapling.engines.toolbelt.fingerprints import get_os_name
19
  from ._validators import validate, PlaywrightConfig, CamoufoxConfig
20
  from ._config_tools import _compiled_stealth_scripts, _launch_kwargs, _context_kwargs
@@ -41,6 +39,7 @@ class SyncSession:
41
  """Get a new page to use"""
42
 
43
  # No need to check if a page is available or not in sync code because the code blocked before reaching here till the page closed, ofc.
 
44
  page = self.context.new_page()
45
  page.set_default_navigation_timeout(timeout)
46
  page.set_default_timeout(timeout)
@@ -65,11 +64,14 @@ class SyncSession:
65
  }
66
 
67
 
68
- class AsyncSession(SyncSession):
69
  def __init__(self, max_pages: int = 1):
70
- super().__init__(max_pages)
 
 
71
  self.playwright: Optional[AsyncPlaywright] = None
72
  self.context: Optional[AsyncBrowserContext] = None
 
73
  self._lock = Lock()
74
 
75
  async def _get_page(
@@ -79,6 +81,9 @@ class AsyncSession(SyncSession):
79
  disable_resources: bool,
80
  ) -> PageInfo: # pragma: no cover
81
  """Get a new page to use"""
 
 
 
82
  async with self._lock:
83
  # If we're at max capacity after cleanup, wait for busy pages to finish
84
  if self.page_pool.pages_count >= self.max_pages:
@@ -92,6 +97,7 @@ class AsyncSession(SyncSession):
92
  f"No pages finished to clear place in the pool within the {self._max_wait_for_page}s timeout period"
93
  )
94
 
 
95
  page = await self.context.new_page()
96
  page.set_default_navigation_timeout(timeout)
97
  page.set_default_timeout(timeout)
@@ -107,6 +113,14 @@ class AsyncSession(SyncSession):
107
 
108
  return self.page_pool.add_page(page)
109
 
 
 
 
 
 
 
 
 
110
 
111
  class DynamicSessionMixin:
112
  def __validate__(self, **params):
@@ -139,6 +153,9 @@ class DynamicSessionMixin:
139
  self.__initiate_browser_options__()
140
 
141
  def __initiate_browser_options__(self):
 
 
 
142
  if not self.cdp_url:
143
  # `launch_options` is used with persistent context
144
  self.launch_options = dict(
@@ -175,7 +192,7 @@ class DynamicSessionMixin:
175
 
176
  class StealthySessionMixin:
177
  def __validate__(self, **params):
178
- config = validate(params, model=CamoufoxConfig)
179
 
180
  self.max_pages = config.max_pages
181
  self.headless = config.headless
@@ -209,10 +226,10 @@ class StealthySessionMixin:
209
 
210
  def __initiate_browser_options__(self):
211
  """Initiate browser options."""
212
- self.launch_options = generate_launch_options(
213
  **{
214
  "geoip": self.geoip,
215
- "proxy": dict(self.proxy) if self.proxy else self.proxy,
216
  "addons": self.addons,
217
  "exclude_addons": [] if self.disable_ads else [DefaultAddons.UBO],
218
  "headless": self.headless,
@@ -232,7 +249,7 @@ class StealthySessionMixin:
232
  "browser.cache.disk_cache_ssl": True,
233
  "browser.cache.disk.smart_size.enabled": True,
234
  },
235
- **self.additional_args,
236
  }
237
  )
238
 
 
7
  BrowserContext as AsyncBrowserContext,
8
  Playwright as AsyncPlaywright,
9
  )
10
+ from camoufox.pkgman import installed_verstr as camoufox_version
11
+ from camoufox.utils import launch_options as generate_launch_options
 
 
12
 
13
  from ._page import PageInfo, PagePool
14
  from scrapling.parser import Selector
15
+ from scrapling.core._types import Any, cast, Dict, Optional, TYPE_CHECKING
16
  from scrapling.engines.toolbelt.fingerprints import get_os_name
17
  from ._validators import validate, PlaywrightConfig, CamoufoxConfig
18
  from ._config_tools import _compiled_stealth_scripts, _launch_kwargs, _context_kwargs
 
39
  """Get a new page to use"""
40
 
41
  # No need to check if a page is available or not in sync code because the code blocked before reaching here till the page closed, ofc.
42
+ assert self.context is not None, "Browser context not initialized"
43
  page = self.context.new_page()
44
  page.set_default_navigation_timeout(timeout)
45
  page.set_default_timeout(timeout)
 
64
  }
65
 
66
 
67
+ class AsyncSession:
68
  def __init__(self, max_pages: int = 1):
69
+ self.max_pages = max_pages
70
+ self.page_pool = PagePool(max_pages)
71
+ self._max_wait_for_page = 60
72
  self.playwright: Optional[AsyncPlaywright] = None
73
  self.context: Optional[AsyncBrowserContext] = None
74
+ self._closed = False
75
  self._lock = Lock()
76
 
77
  async def _get_page(
 
81
  disable_resources: bool,
82
  ) -> PageInfo: # pragma: no cover
83
  """Get a new page to use"""
84
+ if TYPE_CHECKING:
85
+ assert self.context is not None, "Browser context not initialized"
86
+
87
  async with self._lock:
88
  # If we're at max capacity after cleanup, wait for busy pages to finish
89
  if self.page_pool.pages_count >= self.max_pages:
 
97
  f"No pages finished to clear place in the pool within the {self._max_wait_for_page}s timeout period"
98
  )
99
 
100
+ assert self.context is not None, "Browser context not initialized"
101
  page = await self.context.new_page()
102
  page.set_default_navigation_timeout(timeout)
103
  page.set_default_timeout(timeout)
 
113
 
114
  return self.page_pool.add_page(page)
115
 
116
+ def get_pool_stats(self) -> Dict[str, int]:
117
+ """Get statistics about the current page pool"""
118
+ return {
119
+ "total_pages": self.page_pool.pages_count,
120
+ "busy_pages": self.page_pool.busy_count,
121
+ "max_pages": self.max_pages,
122
+ }
123
+
124
 
125
  class DynamicSessionMixin:
126
  def __validate__(self, **params):
 
153
  self.__initiate_browser_options__()
154
 
155
  def __initiate_browser_options__(self):
156
+ if TYPE_CHECKING:
157
+ assert isinstance(self.proxy, tuple)
158
+
159
  if not self.cdp_url:
160
  # `launch_options` is used with persistent context
161
  self.launch_options = dict(
 
192
 
193
  class StealthySessionMixin:
194
  def __validate__(self, **params):
195
+ config: CamoufoxConfig = validate(params, model=CamoufoxConfig)
196
 
197
  self.max_pages = config.max_pages
198
  self.headless = config.headless
 
226
 
227
  def __initiate_browser_options__(self):
228
  """Initiate browser options."""
229
+ self.launch_options: Dict[str, Any] = generate_launch_options(
230
  **{
231
  "geoip": self.geoip,
232
+ "proxy": dict(self.proxy) if self.proxy and isinstance(self.proxy, tuple) else self.proxy,
233
  "addons": self.addons,
234
  "exclude_addons": [] if self.disable_ads else [DefaultAddons.UBO],
235
  "headless": self.headless,
 
249
  "browser.cache.disk_cache_ssl": True,
250
  "browser.cache.disk.smart_size.enabled": True,
251
  },
252
+ **cast(Dict, self.additional_args),
253
  }
254
  )
255
 
scrapling/engines/_browsers/_camoufox.py CHANGED
@@ -26,6 +26,7 @@ from scrapling.core._types import (
26
  List,
27
  Optional,
28
  Callable,
 
29
  SelectorWaitStates,
30
  )
31
  from scrapling.engines.toolbelt.convertor import (
@@ -205,7 +206,7 @@ class StealthySession(StealthySessionMixin, SyncSession):
205
  self._closed = True
206
 
207
  @staticmethod
208
- def _get_page_content(page: Page) -> str | None:
209
  """
210
  A workaround for Playwright issue with `page.content()` on Windows. Ref.: https://github.com/microsoft/playwright/issues/16108
211
  :param page: The page to extract content from.
@@ -217,6 +218,7 @@ class StealthySession(StealthySessionMixin, SyncSession):
217
  except PlaywrightError:
218
  page.wait_for_timeout(1000)
219
  continue
 
220
 
221
  def _solve_cloudflare(self, page: Page) -> None: # pragma: no cover
222
  """Solve the cloudflare challenge displayed on the playwright page passed
@@ -502,8 +504,8 @@ class AsyncStealthySession(StealthySessionMixin, AsyncSession):
502
 
503
  async def __create__(self):
504
  """Create a browser for this instance and context."""
505
- self.playwright: AsyncPlaywright = await async_playwright().start()
506
- self.context: AsyncBrowserContext = await self.playwright.firefox.launch_persistent_context(
507
  **self.launch_options
508
  )
509
 
@@ -511,7 +513,7 @@ class AsyncStealthySession(StealthySessionMixin, AsyncSession):
511
  await self.context.add_init_script(path=self.init_script)
512
 
513
  if self.cookies:
514
- await self.context.add_cookies(self.cookies)
515
 
516
  async def __aenter__(self):
517
  await self.__create__()
@@ -536,7 +538,7 @@ class AsyncStealthySession(StealthySessionMixin, AsyncSession):
536
  self._closed = True
537
 
538
  @staticmethod
539
- async def _get_page_content(page: async_Page) -> str | None:
540
  """
541
  A workaround for Playwright issue with `page.content()` on Windows. Ref.: https://github.com/microsoft/playwright/issues/16108
542
  :param page: The page to extract content from.
@@ -548,6 +550,7 @@ class AsyncStealthySession(StealthySessionMixin, AsyncSession):
548
  except PlaywrightError:
549
  await page.wait_for_timeout(1000)
550
  continue
 
551
 
552
  async def _solve_cloudflare(self, page: async_Page):
553
  """Solve the cloudflare challenge displayed on the playwright page passed. The async version
@@ -679,6 +682,10 @@ class AsyncStealthySession(StealthySessionMixin, AsyncSession):
679
  page_info = await self._get_page(params.timeout, params.extra_headers, params.disable_resources)
680
  page_info.mark_busy(url=url)
681
 
 
 
 
 
682
  try:
683
  # Navigate to URL and wait for a specified state
684
  page_info.page.on("response", handle_response)
 
26
  List,
27
  Optional,
28
  Callable,
29
+ TYPE_CHECKING,
30
  SelectorWaitStates,
31
  )
32
  from scrapling.engines.toolbelt.convertor import (
 
206
  self._closed = True
207
 
208
  @staticmethod
209
+ def _get_page_content(page: Page) -> str:
210
  """
211
  A workaround for Playwright issue with `page.content()` on Windows. Ref.: https://github.com/microsoft/playwright/issues/16108
212
  :param page: The page to extract content from.
 
218
  except PlaywrightError:
219
  page.wait_for_timeout(1000)
220
  continue
221
+ return "" # pyright: ignore
222
 
223
  def _solve_cloudflare(self, page: Page) -> None: # pragma: no cover
224
  """Solve the cloudflare challenge displayed on the playwright page passed
 
504
 
505
  async def __create__(self):
506
  """Create a browser for this instance and context."""
507
+ self.playwright: AsyncPlaywright | None = await async_playwright().start()
508
+ self.context: AsyncBrowserContext | None = await self.playwright.firefox.launch_persistent_context(
509
  **self.launch_options
510
  )
511
 
 
513
  await self.context.add_init_script(path=self.init_script)
514
 
515
  if self.cookies:
516
+ await self.context.add_cookies(self.cookies) # pyright: ignore [reportArgumentType]
517
 
518
  async def __aenter__(self):
519
  await self.__create__()
 
538
  self._closed = True
539
 
540
  @staticmethod
541
+ async def _get_page_content(page: async_Page) -> str:
542
  """
543
  A workaround for Playwright issue with `page.content()` on Windows. Ref.: https://github.com/microsoft/playwright/issues/16108
544
  :param page: The page to extract content from.
 
550
  except PlaywrightError:
551
  await page.wait_for_timeout(1000)
552
  continue
553
+ return "" # pyright: ignore
554
 
555
  async def _solve_cloudflare(self, page: async_Page):
556
  """Solve the cloudflare challenge displayed on the playwright page passed. The async version
 
682
  page_info = await self._get_page(params.timeout, params.extra_headers, params.disable_resources)
683
  page_info.mark_busy(url=url)
684
 
685
+ if TYPE_CHECKING:
686
+ if not isinstance(page_info.page, async_Page):
687
+ raise TypeError
688
+
689
  try:
690
  # Navigate to URL and wait for a specified state
691
  page_info.page.on("response", handle_response)
scrapling/engines/_browsers/_config_tools.py CHANGED
@@ -62,7 +62,7 @@ def _set_flags(hide_canvas, disable_webgl): # pragma: no cover
62
  @lru_cache(2, typed=True)
63
  def _launch_kwargs(
64
  headless,
65
- proxy,
66
  locale,
67
  extra_headers,
68
  useragent,
 
62
  @lru_cache(2, typed=True)
63
  def _launch_kwargs(
64
  headless,
65
+ proxy: Tuple,
66
  locale,
67
  extra_headers,
68
  useragent,
scrapling/engines/_browsers/_controllers.py CHANGED
@@ -10,6 +10,7 @@ from playwright.async_api import (
10
  BrowserContext as AsyncBrowserContext,
11
  Playwright as AsyncPlaywright,
12
  Locator as AsyncLocator,
 
13
  )
14
  from patchright.sync_api import sync_playwright as sync_patchright
15
  from patchright.async_api import async_playwright as async_patchright
@@ -18,10 +19,12 @@ from scrapling.core.utils import log
18
  from ._base import SyncSession, AsyncSession, DynamicSessionMixin
19
  from ._validators import validate_fetch as _validate
20
  from scrapling.core._types import (
 
21
  Dict,
22
  List,
23
  Optional,
24
  Callable,
 
25
  SelectorWaitStates,
26
  )
27
  from scrapling.engines.toolbelt.convertor import (
@@ -30,7 +33,7 @@ from scrapling.engines.toolbelt.convertor import (
30
  )
31
  from scrapling.engines.toolbelt.fingerprints import generate_convincing_referer
32
 
33
- _UNSET = object()
34
 
35
 
36
  class DynamicSession(DynamicSessionMixin, SyncSession):
@@ -154,7 +157,7 @@ class DynamicSession(DynamicSessionMixin, SyncSession):
154
  """Create a browser for this instance and context."""
155
  sync_context = sync_patchright if self.stealth else sync_playwright
156
 
157
- self.playwright: Playwright = sync_context().start()
158
 
159
  if self.cdp_url: # pragma: no cover
160
  self.context = self.playwright.chromium.connect_over_cdp(endpoint_url=self.cdp_url).new_context(
@@ -187,7 +190,7 @@ class DynamicSession(DynamicSessionMixin, SyncSession):
187
 
188
  if self.playwright:
189
  self.playwright.stop()
190
- self.playwright = None
191
 
192
  self._closed = True
193
 
@@ -399,7 +402,7 @@ class AsyncDynamicSession(DynamicSessionMixin, AsyncSession):
399
  """Create a browser for this instance and context."""
400
  async_context = async_patchright if self.stealth else async_playwright
401
 
402
- self.playwright: AsyncPlaywright = await async_context().start()
403
 
404
  if self.cdp_url:
405
  browser = await self.playwright.chromium.connect_over_cdp(endpoint_url=self.cdp_url)
@@ -413,7 +416,7 @@ class AsyncDynamicSession(DynamicSessionMixin, AsyncSession):
413
  await self.context.add_init_script(path=self.init_script)
414
 
415
  if self.cookies:
416
- await self.context.add_cookies(self.cookies)
417
 
418
  async def __aenter__(self):
419
  await self.__create__()
@@ -429,11 +432,11 @@ class AsyncDynamicSession(DynamicSessionMixin, AsyncSession):
429
 
430
  if self.context:
431
  await self.context.close()
432
- self.context = None
433
 
434
  if self.playwright:
435
  await self.playwright.stop()
436
- self.playwright = None
437
 
438
  self._closed = True
439
 
@@ -506,6 +509,10 @@ class AsyncDynamicSession(DynamicSessionMixin, AsyncSession):
506
  page_info = await self._get_page(params.timeout, params.extra_headers, params.disable_resources)
507
  page_info.mark_busy(url=url)
508
 
 
 
 
 
509
  try:
510
  # Navigate to URL and wait for a specified state
511
  page_info.page.on("response", handle_response)
 
10
  BrowserContext as AsyncBrowserContext,
11
  Playwright as AsyncPlaywright,
12
  Locator as AsyncLocator,
13
+ Page as async_Page,
14
  )
15
  from patchright.sync_api import sync_playwright as sync_patchright
16
  from patchright.async_api import async_playwright as async_patchright
 
19
  from ._base import SyncSession, AsyncSession, DynamicSessionMixin
20
  from ._validators import validate_fetch as _validate
21
  from scrapling.core._types import (
22
+ Any,
23
  Dict,
24
  List,
25
  Optional,
26
  Callable,
27
+ TYPE_CHECKING,
28
  SelectorWaitStates,
29
  )
30
  from scrapling.engines.toolbelt.convertor import (
 
33
  )
34
  from scrapling.engines.toolbelt.fingerprints import generate_convincing_referer
35
 
36
+ _UNSET: Any = object()
37
 
38
 
39
  class DynamicSession(DynamicSessionMixin, SyncSession):
 
157
  """Create a browser for this instance and context."""
158
  sync_context = sync_patchright if self.stealth else sync_playwright
159
 
160
+ self.playwright: Playwright = sync_context().start() # pyright: ignore [reportAttributeAccessIssue]
161
 
162
  if self.cdp_url: # pragma: no cover
163
  self.context = self.playwright.chromium.connect_over_cdp(endpoint_url=self.cdp_url).new_context(
 
190
 
191
  if self.playwright:
192
  self.playwright.stop()
193
+ self.playwright = None # pyright: ignore
194
 
195
  self._closed = True
196
 
 
402
  """Create a browser for this instance and context."""
403
  async_context = async_patchright if self.stealth else async_playwright
404
 
405
+ self.playwright: AsyncPlaywright = await async_context().start() # pyright: ignore [reportAttributeAccessIssue]
406
 
407
  if self.cdp_url:
408
  browser = await self.playwright.chromium.connect_over_cdp(endpoint_url=self.cdp_url)
 
416
  await self.context.add_init_script(path=self.init_script)
417
 
418
  if self.cookies:
419
+ await self.context.add_cookies(self.cookies) # pyright: ignore
420
 
421
  async def __aenter__(self):
422
  await self.__create__()
 
432
 
433
  if self.context:
434
  await self.context.close()
435
+ self.context = None # pyright: ignore
436
 
437
  if self.playwright:
438
  await self.playwright.stop()
439
+ self.playwright = None # pyright: ignore
440
 
441
  self._closed = True
442
 
 
509
  page_info = await self._get_page(params.timeout, params.extra_headers, params.disable_resources)
510
  page_info.mark_busy(url=url)
511
 
512
+ if TYPE_CHECKING:
513
+ if not isinstance(page_info.page, async_Page):
514
+ raise TypeError
515
+
516
  try:
517
  # Navigate to URL and wait for a specified state
518
  page_info.page.on("response", handle_response)
scrapling/engines/_browsers/_validators.py CHANGED
@@ -11,7 +11,10 @@ from scrapling.core._types import (
11
  Tuple,
12
  Optional,
13
  Callable,
 
14
  SelectorWaitStates,
 
 
15
  )
16
  from scrapling.engines.toolbelt.navigation import construct_proxy_dict
17
 
@@ -73,7 +76,7 @@ class PlaywrightConfig(Struct, kw_only=True, frozen=False):
73
  stealth: bool = False
74
  wait: Seconds = 0
75
  page_action: Optional[Callable] = None
76
- proxy: Optional[str | Dict[str, str]] = None # The default value for proxy in Playwright's source is `None`
77
  locale: str = "en-US"
78
  extra_headers: Optional[Dict[str, str]] = None
79
  useragent: Optional[str] = None
@@ -81,11 +84,11 @@ class PlaywrightConfig(Struct, kw_only=True, frozen=False):
81
  init_script: Optional[str] = None
82
  disable_resources: bool = False
83
  wait_selector: Optional[str] = None
84
- cookies: Optional[List[Dict]] = None
85
  network_idle: bool = False
86
  load_dom: bool = True
87
  wait_selector_state: SelectorWaitStates = "attached"
88
- selector_config: Optional[Dict] = None
89
 
90
  def __post_init__(self):
91
  """Custom validation after msgspec validation"""
@@ -125,15 +128,15 @@ class CamoufoxConfig(Struct, kw_only=True, frozen=False):
125
  wait_selector: Optional[str] = None
126
  addons: Optional[List[str]] = None
127
  wait_selector_state: SelectorWaitStates = "attached"
128
- cookies: Optional[List[Dict]] = None
129
  google_search: bool = True
130
  extra_headers: Optional[Dict[str, str]] = None
131
- proxy: Optional[str | Dict[str, str]] = None # The default value for proxy in Playwright's source is `None`
132
  os_randomize: bool = False
133
  disable_ads: bool = False
134
  geoip: bool = False
135
- selector_config: Optional[Dict] = None
136
- additional_args: Optional[Dict] = None
137
 
138
  def __post_init__(self):
139
  """Custom validation after msgspec validation"""
@@ -177,7 +180,7 @@ class FetchConfig(Struct, kw_only=True):
177
  network_idle: bool = False
178
  load_dom: bool = True
179
  solve_cloudflare: bool = False
180
- selector_config: Optional[Dict] = {}
181
 
182
  def to_dict(self):
183
  return {f: getattr(self, f) for f in self.__struct_fields__}
@@ -198,7 +201,7 @@ class _fetch_params:
198
  network_idle: bool
199
  load_dom: bool
200
  solve_cloudflare: bool
201
- selector_config: Optional[Dict]
202
 
203
 
204
  def validate_fetch(params: List[Tuple], sentinel=None) -> _fetch_params:
@@ -212,7 +215,7 @@ def validate_fetch(params: List[Tuple], sentinel=None) -> _fetch_params:
212
  result[arg] = session_value
213
 
214
  if overrides:
215
- overrides = validate(overrides, FetchConfig).to_dict()
216
  overrides.update(result)
217
  return _fetch_params(**overrides)
218
 
@@ -222,7 +225,21 @@ def validate_fetch(params: List[Tuple], sentinel=None) -> _fetch_params:
222
  return _fetch_params(**result)
223
 
224
 
225
- def validate(params: Dict, model) -> PlaywrightConfig | CamoufoxConfig | FetchConfig:
 
 
 
 
 
 
 
 
 
 
 
 
 
 
226
  try:
227
  return convert(params, model)
228
  except ValidationError as e:
 
11
  Tuple,
12
  Optional,
13
  Callable,
14
+ Iterable,
15
  SelectorWaitStates,
16
+ cast,
17
+ overload,
18
  )
19
  from scrapling.engines.toolbelt.navigation import construct_proxy_dict
20
 
 
76
  stealth: bool = False
77
  wait: Seconds = 0
78
  page_action: Optional[Callable] = None
79
+ proxy: Optional[str | Dict[str, str] | Tuple] = None # The default value for proxy in Playwright's source is `None`
80
  locale: str = "en-US"
81
  extra_headers: Optional[Dict[str, str]] = None
82
  useragent: Optional[str] = None
 
84
  init_script: Optional[str] = None
85
  disable_resources: bool = False
86
  wait_selector: Optional[str] = None
87
+ cookies: Optional[Iterable[Dict]] = None
88
  network_idle: bool = False
89
  load_dom: bool = True
90
  wait_selector_state: SelectorWaitStates = "attached"
91
+ selector_config: Optional[Dict] = {}
92
 
93
  def __post_init__(self):
94
  """Custom validation after msgspec validation"""
 
128
  wait_selector: Optional[str] = None
129
  addons: Optional[List[str]] = None
130
  wait_selector_state: SelectorWaitStates = "attached"
131
+ cookies: Optional[Iterable[Dict]] = None
132
  google_search: bool = True
133
  extra_headers: Optional[Dict[str, str]] = None
134
+ proxy: Optional[str | Dict[str, str] | Tuple] = None # The default value for proxy in Playwright's source is `None`
135
  os_randomize: bool = False
136
  disable_ads: bool = False
137
  geoip: bool = False
138
+ selector_config: Optional[Dict] = {}
139
+ additional_args: Optional[Dict] = {}
140
 
141
  def __post_init__(self):
142
  """Custom validation after msgspec validation"""
 
180
  network_idle: bool = False
181
  load_dom: bool = True
182
  solve_cloudflare: bool = False
183
+ selector_config: Dict = {}
184
 
185
  def to_dict(self):
186
  return {f: getattr(self, f) for f in self.__struct_fields__}
 
201
  network_idle: bool
202
  load_dom: bool
203
  solve_cloudflare: bool
204
+ selector_config: Dict
205
 
206
 
207
  def validate_fetch(params: List[Tuple], sentinel=None) -> _fetch_params:
 
215
  result[arg] = session_value
216
 
217
  if overrides:
218
+ overrides = cast(FetchConfig, validate(overrides, FetchConfig)).to_dict()
219
  overrides.update(result)
220
  return _fetch_params(**overrides)
221
 
 
225
  return _fetch_params(**result)
226
 
227
 
228
+ @overload
229
+ def validate(params: Dict, model: type[PlaywrightConfig]) -> PlaywrightConfig: ...
230
+
231
+
232
+ @overload
233
+ def validate(params: Dict, model: type[CamoufoxConfig]) -> CamoufoxConfig: ...
234
+
235
+
236
+ @overload
237
+ def validate(params: Dict, model: type[FetchConfig]) -> FetchConfig: ...
238
+
239
+
240
+ def validate(
241
+ params: Dict, model: type[PlaywrightConfig] | type[CamoufoxConfig] | type[FetchConfig]
242
+ ) -> PlaywrightConfig | CamoufoxConfig | FetchConfig:
243
  try:
244
  return convert(params, model)
245
  except ValidationError as e:
scrapling/engines/static.py CHANGED
@@ -182,7 +182,7 @@ class FetcherSession:
182
 
183
  return headers
184
 
185
- def __enter__(self):
186
  """Creates and returns a new synchronous Fetcher Session"""
187
  if self._curl_session:
188
  raise RuntimeError(
@@ -197,7 +197,7 @@ class FetcherSession:
197
  )
198
 
199
  self._curl_session = CurlSession()
200
- return self
201
 
202
  def __exit__(self, exc_type, exc_val, exc_tb):
203
  """Closes the active synchronous session managed by this instance, if any."""
@@ -205,7 +205,7 @@ class FetcherSession:
205
  self._curl_session.close()
206
  self._curl_session = None
207
 
208
- async def __aenter__(self):
209
  """Creates and returns a new asynchronous Session."""
210
  if self._async_curl_session:
211
  raise RuntimeError(
@@ -220,7 +220,7 @@ class FetcherSession:
220
  )
221
 
222
  self._async_curl_session = AsyncCurlSession()
223
- return self
224
 
225
  async def __aexit__(self, exc_type, exc_val, exc_tb):
226
  """Closes the active asynchronous session managed by this instance, if any."""
 
182
 
183
  return headers
184
 
185
+ def __enter__(self) -> "FetcherClient":
186
  """Creates and returns a new synchronous Fetcher Session"""
187
  if self._curl_session:
188
  raise RuntimeError(
 
197
  )
198
 
199
  self._curl_session = CurlSession()
200
+ return cast("FetcherClient", self)
201
 
202
  def __exit__(self, exc_type, exc_val, exc_tb):
203
  """Closes the active synchronous session managed by this instance, if any."""
 
205
  self._curl_session.close()
206
  self._curl_session = None
207
 
208
+ async def __aenter__(self) -> "AsyncFetcherClient":
209
  """Creates and returns a new asynchronous Session."""
210
  if self._async_curl_session:
211
  raise RuntimeError(
 
220
  )
221
 
222
  self._async_curl_session = AsyncCurlSession()
223
+ return cast("AsyncFetcherClient", self)
224
 
225
  async def __aexit__(self, exc_type, exc_val, exc_tb):
226
  """Closes the active asynchronous session managed by this instance, if any."""
scrapling/engines/toolbelt/convertor.py CHANGED
@@ -58,7 +58,8 @@ class ResponseFactory:
58
  "encoding": cls.__extract_browser_encoding(
59
  current_response.headers.get("content-type", "")
60
  )
61
- or "utf-8",
 
62
  "cookies": tuple(),
63
  "headers": current_response.all_headers() if current_response else {},
64
  "request_headers": current_request.all_headers(),
@@ -161,7 +162,8 @@ class ResponseFactory:
161
  "encoding": cls.__extract_browser_encoding(
162
  current_response.headers.get("content-type", "")
163
  )
164
- or "utf-8",
 
165
  "cookies": tuple(),
166
  "headers": await current_response.all_headers() if current_response else {},
167
  "request_headers": await current_request.all_headers(),
@@ -255,8 +257,8 @@ class ResponseFactory:
255
  "encoding": response.encoding or "utf-8",
256
  "cookies": dict(response.cookies),
257
  "headers": dict(response.headers),
258
- "request_headers": dict(response.request.headers),
259
- "method": response.request.method,
260
  "history": response.history, # https://github.com/lexiforest/curl_cffi/issues/82
261
  **parser_arguments,
262
  }
 
58
  "encoding": cls.__extract_browser_encoding(
59
  current_response.headers.get("content-type", "")
60
  )
61
+ if current_response
62
+ else "utf-8",
63
  "cookies": tuple(),
64
  "headers": current_response.all_headers() if current_response else {},
65
  "request_headers": current_request.all_headers(),
 
162
  "encoding": cls.__extract_browser_encoding(
163
  current_response.headers.get("content-type", "")
164
  )
165
+ if current_response
166
+ else "utf-8",
167
  "cookies": tuple(),
168
  "headers": await current_response.all_headers() if current_response else {},
169
  "request_headers": await current_request.all_headers(),
 
257
  "encoding": response.encoding or "utf-8",
258
  "cookies": dict(response.cookies),
259
  "headers": dict(response.headers),
260
+ "request_headers": dict(response.request.headers) if response.request else {},
261
+ "method": response.request.method if response.request else "GET",
262
  "history": response.history, # https://github.com/lexiforest/curl_cffi/issues/82
263
  **parser_arguments,
264
  }
scrapling/engines/toolbelt/custom.py CHANGED
@@ -8,6 +8,7 @@ from scrapling.core.utils import log
8
  from scrapling.core._types import (
9
  Any,
10
  Dict,
 
11
  List,
12
  Optional,
13
  Tuple,
@@ -30,10 +31,10 @@ class Response(Selector):
30
  request_headers: Dict,
31
  encoding: str = "utf-8",
32
  method: str = "GET",
33
- history: List = None,
34
- **selector_config: Dict,
35
  ):
36
- adaptive_domain = selector_config.pop("adaptive_domain", None)
37
  self.status = status
38
  self.reason = reason
39
  self.cookies = cookies
@@ -58,7 +59,7 @@ class BaseFetcher:
58
  keep_cdata: Optional[bool] = False
59
  storage_args: Optional[Dict] = None
60
  keep_comments: Optional[bool] = False
61
- adaptive_domain: Optional[str] = None
62
  parser_keywords: Tuple = (
63
  "huge_tree",
64
  "adaptive",
@@ -124,12 +125,8 @@ class BaseFetcher:
124
  adaptive=cls.adaptive,
125
  storage=cls.storage,
126
  storage_args=cls.storage_args,
 
127
  )
128
- if cls.adaptive_domain:
129
- if not isinstance(cls.adaptive_domain, str):
130
- log.warning('[Ignored] The argument "adaptive_domain" must be of string type')
131
- else:
132
- parser_arguments.update({"adaptive_domain": cls.adaptive_domain})
133
 
134
  return parser_arguments
135
 
 
8
  from scrapling.core._types import (
9
  Any,
10
  Dict,
11
+ cast,
12
  List,
13
  Optional,
14
  Tuple,
 
31
  request_headers: Dict,
32
  encoding: str = "utf-8",
33
  method: str = "GET",
34
+ history: List | None = None,
35
+ **selector_config: Any,
36
  ):
37
+ adaptive_domain: str = cast(str, selector_config.pop("adaptive_domain", ""))
38
  self.status = status
39
  self.reason = reason
40
  self.cookies = cookies
 
59
  keep_cdata: Optional[bool] = False
60
  storage_args: Optional[Dict] = None
61
  keep_comments: Optional[bool] = False
62
+ adaptive_domain: str = ""
63
  parser_keywords: Tuple = (
64
  "huge_tree",
65
  "adaptive",
 
125
  adaptive=cls.adaptive,
126
  storage=cls.storage,
127
  storage_args=cls.storage_args,
128
+ adaptive_domain=cls.adaptive_domain,
129
  )
 
 
 
 
 
130
 
131
  return parser_arguments
132
 
scrapling/engines/toolbelt/fingerprints.py CHANGED
@@ -8,9 +8,10 @@ from platform import system as platform_system
8
  from tldextract import extract
9
  from browserforge.headers import Browser, HeaderGenerator
10
 
11
- from scrapling.core._types import Dict, Optional
12
 
13
  __OS_NAME__ = platform_system()
 
14
 
15
 
16
  @lru_cache(10, typed=True)
@@ -28,16 +29,20 @@ def generate_convincing_referer(url: str) -> str:
28
 
29
 
30
  @lru_cache(1, typed=True)
31
- def get_os_name() -> Optional[str]:
32
- """Get the current OS name in the same format needed for browserforge
33
 
34
  :return: Current OS name or `None` otherwise
35
  """
36
- return {
37
- "Linux": "linux",
38
- "Darwin": "macos",
39
- "Windows": "windows",
40
- }.get(__OS_NAME__)
 
 
 
 
41
 
42
 
43
  def generate_headers(browser_mode: bool = False) -> Dict:
@@ -58,8 +63,10 @@ def generate_headers(browser_mode: bool = False) -> Dict:
58
  Browser(name="edge", min_version=130),
59
  ]
60
  )
61
-
62
- return HeaderGenerator(browser=browsers, os=os_name, device="desktop").generate()
 
 
63
 
64
 
65
  __default_useragent__ = generate_headers(browser_mode=False).get("User-Agent")
 
8
  from tldextract import extract
9
  from browserforge.headers import Browser, HeaderGenerator
10
 
11
+ from scrapling.core._types import Dict, Literal
12
 
13
  __OS_NAME__ = platform_system()
14
+ OSName = Literal["linux", "macos", "windows"]
15
 
16
 
17
  @lru_cache(10, typed=True)
 
29
 
30
 
31
  @lru_cache(1, typed=True)
32
+ def get_os_name() -> OSName | None:
33
+ """Get the current OS name in the same format needed for browserforge, if the OS is Unknown, return None so browserforge uses all.
34
 
35
  :return: Current OS name or `None` otherwise
36
  """
37
+ match __OS_NAME__:
38
+ case "Linux":
39
+ return "linux"
40
+ case "Darwin":
41
+ return "macos"
42
+ case "Windows":
43
+ return "windows"
44
+ case _:
45
+ return None
46
 
47
 
48
  def generate_headers(browser_mode: bool = False) -> Dict:
 
63
  Browser(name="edge", min_version=130),
64
  ]
65
  )
66
+ if os_name:
67
+ return HeaderGenerator(browser=browsers, os=os_name, device="desktop").generate()
68
+ else:
69
+ return HeaderGenerator(browser=browsers, device="desktop").generate()
70
 
71
 
72
  __default_useragent__ = generate_headers(browser_mode=False).get("User-Agent")
scrapling/engines/toolbelt/navigation.py CHANGED
@@ -11,7 +11,7 @@ from msgspec import Struct, structs, convert, ValidationError
11
  from playwright.sync_api import Route
12
 
13
  from scrapling.core.utils import log
14
- from scrapling.core._types import Dict, Optional, Tuple
15
  from scrapling.engines.constants import DEFAULT_DISABLED_RESOURCES
16
 
17
  __BYPASSES_DIR__ = Path(__file__).parent / "bypasses"
@@ -49,7 +49,15 @@ async def async_intercept_route(route: async_Route):
49
  await route.continue_()
50
 
51
 
52
- def construct_proxy_dict(proxy_string: str | Dict[str, str], as_tuple=False) -> Optional[Dict | Tuple]:
 
 
 
 
 
 
 
 
53
  """Validate a proxy and return it in the acceptable format for Playwright
54
  Reference: https://playwright.dev/python/docs/network#http-proxy
55
 
@@ -83,7 +91,7 @@ def construct_proxy_dict(proxy_string: str | Dict[str, str], as_tuple=False) ->
83
  except ValidationError as e:
84
  raise TypeError(f"Invalid proxy dictionary: {e}")
85
 
86
- return None
87
 
88
 
89
  @lru_cache(10, typed=True)
 
11
  from playwright.sync_api import Route
12
 
13
  from scrapling.core.utils import log
14
+ from scrapling.core._types import Dict, Tuple, overload, Literal
15
  from scrapling.engines.constants import DEFAULT_DISABLED_RESOURCES
16
 
17
  __BYPASSES_DIR__ = Path(__file__).parent / "bypasses"
 
49
  await route.continue_()
50
 
51
 
52
+ @overload
53
+ def construct_proxy_dict(proxy_string: str | Dict[str, str] | Tuple, as_tuple: Literal[True]) -> Tuple: ...
54
+
55
+
56
+ @overload
57
+ def construct_proxy_dict(proxy_string: str | Dict[str, str] | Tuple, as_tuple: Literal[False] = False) -> Dict: ...
58
+
59
+
60
+ def construct_proxy_dict(proxy_string: str | Dict[str, str] | Tuple, as_tuple: bool = False) -> Dict | Tuple:
61
  """Validate a proxy and return it in the acceptable format for Playwright
62
  Reference: https://playwright.dev/python/docs/network#http-proxy
63
 
 
91
  except ValidationError as e:
92
  raise TypeError(f"Invalid proxy dictionary: {e}")
93
 
94
+ raise TypeError(f"Invalid proxy string: {proxy_string}")
95
 
96
 
97
  @lru_cache(10, typed=True)
scrapling/fetchers/__init__.py CHANGED
@@ -19,7 +19,17 @@ _LAZY_IMPORTS = {
19
  "AsyncStealthySession": ("scrapling.fetchers.firefox", "AsyncStealthySession"),
20
  }
21
 
22
- __all__ = ["Fetcher", "AsyncFetcher", "StealthyFetcher", "DynamicFetcher"]
 
 
 
 
 
 
 
 
 
 
23
 
24
 
25
  def __getattr__(name: str) -> Any:
 
19
  "AsyncStealthySession": ("scrapling.fetchers.firefox", "AsyncStealthySession"),
20
  }
21
 
22
+ __all__ = [
23
+ "Fetcher",
24
+ "AsyncFetcher",
25
+ "FetcherSession",
26
+ "DynamicFetcher",
27
+ "DynamicSession",
28
+ "AsyncDynamicSession",
29
+ "StealthyFetcher",
30
+ "StealthySession",
31
+ "AsyncStealthySession",
32
+ ]
33
 
34
 
35
  def __getattr__(name: str) -> Any:
scrapling/fetchers/chrome.py CHANGED
@@ -1,10 +1,9 @@
1
  from scrapling.core._types import (
2
  Callable,
3
- Dict,
4
  List,
 
5
  Optional,
6
  SelectorWaitStates,
7
- Iterable,
8
  )
9
  from scrapling.engines.toolbelt.custom import BaseFetcher, Response
10
  from scrapling.engines._browsers._controllers import DynamicSession, AsyncDynamicSession
@@ -47,7 +46,7 @@ class DynamicFetcher(BaseFetcher):
47
  disable_resources: bool = False,
48
  wait_selector: Optional[str] = None,
49
  init_script: Optional[str] = None,
50
- cookies: Optional[Iterable[Dict]] = None,
51
  network_idle: bool = False,
52
  load_dom: bool = True,
53
  wait_selector_state: SelectorWaitStates = "attached",
@@ -134,7 +133,7 @@ class DynamicFetcher(BaseFetcher):
134
  disable_resources: bool = False,
135
  wait_selector: Optional[str] = None,
136
  init_script: Optional[str] = None,
137
- cookies: Optional[Iterable[Dict]] = None,
138
  network_idle: bool = False,
139
  load_dom: bool = True,
140
  wait_selector_state: SelectorWaitStates = "attached",
 
1
  from scrapling.core._types import (
2
  Callable,
 
3
  List,
4
+ Dict,
5
  Optional,
6
  SelectorWaitStates,
 
7
  )
8
  from scrapling.engines.toolbelt.custom import BaseFetcher, Response
9
  from scrapling.engines._browsers._controllers import DynamicSession, AsyncDynamicSession
 
46
  disable_resources: bool = False,
47
  wait_selector: Optional[str] = None,
48
  init_script: Optional[str] = None,
49
+ cookies: Optional[List[Dict]] = None,
50
  network_idle: bool = False,
51
  load_dom: bool = True,
52
  wait_selector_state: SelectorWaitStates = "attached",
 
133
  disable_resources: bool = False,
134
  wait_selector: Optional[str] = None,
135
  init_script: Optional[str] = None,
136
+ cookies: Optional[List[Dict]] = None,
137
  network_idle: bool = False,
138
  load_dom: bool = True,
139
  wait_selector_state: SelectorWaitStates = "attached",
scrapling/fetchers/firefox.py CHANGED
@@ -83,8 +83,6 @@ class StealthyFetcher(BaseFetcher):
83
  """
84
  if not custom_config:
85
  custom_config = {}
86
- elif not isinstance(custom_config, dict):
87
- ValueError(f"The custom parser config must be of type dictionary, got {cls.__class__}")
88
 
89
  with StealthySession(
90
  wait=wait,
@@ -182,8 +180,6 @@ class StealthyFetcher(BaseFetcher):
182
  """
183
  if not custom_config:
184
  custom_config = {}
185
- elif not isinstance(custom_config, dict):
186
- ValueError(f"The custom parser config must be of type dictionary, got {cls.__class__}")
187
 
188
  async with AsyncStealthySession(
189
  wait=wait,
 
83
  """
84
  if not custom_config:
85
  custom_config = {}
 
 
86
 
87
  with StealthySession(
88
  wait=wait,
 
180
  """
181
  if not custom_config:
182
  custom_config = {}
 
 
183
 
184
  async with AsyncStealthySession(
185
  wait=wait,
scrapling/parser.py CHANGED
@@ -17,17 +17,21 @@ from lxml.etree import (
17
 
18
  from scrapling.core._types import (
19
  Any,
 
20
  Dict,
 
21
  List,
22
  Tuple,
23
  Union,
24
  Pattern,
25
  Callable,
 
26
  Optional,
27
  Iterable,
28
  overload,
29
  Generator,
30
  SupportsIndex,
 
31
  )
32
  from scrapling.core.custom_types import AttributesHandler, TextHandler, TextHandlers
33
  from scrapling.core.mixins import SelectorsGeneration
@@ -36,7 +40,7 @@ from scrapling.core.storage import (
36
  StorageSystemMixin,
37
  _StorageTools,
38
  )
39
- from scrapling.core.translator import translator as _translator
40
  from scrapling.core.utils import clean_spaces, flatten, html_forbidden, log
41
 
42
  __DEFAULT_DB_FILE__ = str(Path(__file__).parent / "elements_storage.db")
@@ -70,20 +74,23 @@ class Selector(SelectorsGeneration):
70
  "_raw_body",
71
  )
72
 
 
 
 
73
  def __init__(
74
  self,
75
  content: Optional[str | bytes] = None,
76
- url: Optional[str] = None,
77
  encoding: str = "utf-8",
78
  huge_tree: bool = True,
79
  root: Optional[HtmlElement] = None,
80
  keep_comments: Optional[bool] = False,
81
  keep_cdata: Optional[bool] = False,
82
  adaptive: Optional[bool] = False,
83
- _storage: object = None,
84
  storage: Any = SQLiteStorageSystem,
85
  storage_args: Optional[Dict] = None,
86
- **kwargs,
87
  ):
88
  """The main class that works as a wrapper for the HTML input data. Using this class, you can search for elements
89
  with expressions in CSS, XPath, or with simply text. Check the docs for more info.
@@ -131,7 +138,7 @@ class Selector(SelectorsGeneration):
131
  default_doctype=True,
132
  strip_cdata=(not keep_cdata),
133
  )
134
- self._root = fromstring(body, parser=parser, base_url=url)
135
  self._raw_body = content
136
 
137
  else:
@@ -141,7 +148,7 @@ class Selector(SelectorsGeneration):
141
  f"Root have to be a valid element of `html` module types to work, not of type {type(root)}"
142
  )
143
 
144
- self._root = root
145
  self._raw_body = ""
146
 
147
  self.__adaptive_enabled = adaptive
@@ -238,6 +245,9 @@ class Selector(SelectorsGeneration):
238
  **self.__response_data,
239
  )
240
 
 
 
 
241
  def __handle_element(
242
  self, element: Optional[HtmlElement | _ElementUnicodeResult]
243
  ) -> Optional[Union[TextHandler, "Selector"]]:
@@ -262,7 +272,7 @@ class Selector(SelectorsGeneration):
262
  if self._is_text_node(result[0]):
263
  return TextHandlers(map(TextHandler, result))
264
 
265
- return Selectors(map(self.__element_convertor, result))
266
 
267
  def __getstate__(self) -> Any:
268
  # lxml don't like it :)
@@ -323,7 +333,7 @@ class Selector(SelectorsGeneration):
323
  if not valid_values or processed_text.strip():
324
  _all_strings.append(processed_text)
325
 
326
- return TextHandler(separator).join(_all_strings)
327
 
328
  def urljoin(self, relative_url: str) -> str:
329
  """Join this Selector's url with a relative url to form an absolute full URL."""
@@ -372,13 +382,14 @@ class Selector(SelectorsGeneration):
372
  @property
373
  def parent(self) -> Optional["Selector"]:
374
  """Return the direct parent of the element or ``None`` otherwise"""
375
- return self.__handle_element(self._root.getparent())
 
376
 
377
  @property
378
  def below_elements(self) -> "Selectors":
379
  """Return all elements under the current element in the DOM tree"""
380
  below = _find_all_elements(self._root)
381
- return self.__handle_elements(below)
382
 
383
  @property
384
  def children(self) -> "Selectors":
@@ -425,7 +436,7 @@ class Selector(SelectorsGeneration):
425
  # Ignore HTML comments and unwanted types
426
  next_element = next_element.getnext()
427
 
428
- return self.__handle_element(next_element)
429
 
430
  @property
431
  def previous(self) -> Optional["Selector"]:
@@ -435,10 +446,10 @@ class Selector(SelectorsGeneration):
435
  # Ignore HTML comments and unwanted types
436
  prev_element = prev_element.getprevious()
437
 
438
- return self.__handle_element(prev_element)
439
 
440
  # For easy copy-paste from Scrapy/parsel code when needed :)
441
- def get(self, default=None):
442
  return self
443
 
444
  def get_all(self):
@@ -468,6 +479,16 @@ class Selector(SelectorsGeneration):
468
  return data + ">"
469
 
470
  # From here we start with the selecting functions
 
 
 
 
 
 
 
 
 
 
471
  def relocate(
472
  self,
473
  element: Union[Dict, HtmlElement, "Selector"],
@@ -506,11 +527,11 @@ class Selector(SelectorsGeneration):
506
  log.debug(f"Highest probability was {highest_probability}%")
507
  log.debug("Top 5 best matching elements are: ")
508
  for percent in tuple(sorted(score_table.keys(), reverse=True))[:5]:
509
- log.debug(f"{percent} -> {self.__handle_elements(score_table[percent])}")
510
 
511
  if not selector_type:
512
  return score_table[highest_probability]
513
- return self.__handle_elements(score_table[highest_probability])
514
  return []
515
 
516
  def css_first(
@@ -593,7 +614,7 @@ class Selector(SelectorsGeneration):
593
  auto_save: bool = False,
594
  percentage: int = 0,
595
  **kwargs: Any,
596
- ) -> Union["Selectors", List, "TextHandlers"]:
597
  """Search the current tree with CSS3 selectors
598
 
599
  **Important:
@@ -614,7 +635,7 @@ class Selector(SelectorsGeneration):
614
  try:
615
  if not self.__adaptive_enabled or "," not in selector:
616
  # No need to split selectors in this case, let's save some CPU cycles :)
617
- xpath_selector = _translator.css_to_xpath(selector)
618
  return self.xpath(
619
  xpath_selector,
620
  identifier or selector,
@@ -628,7 +649,7 @@ class Selector(SelectorsGeneration):
628
  for single_selector in split_selectors(selector):
629
  # I'm doing this only so the `save` function saves data correctly for combined selectors
630
  # Like using the ',' to combine two different selectors that point to different elements.
631
- xpath_selector = _translator.css_to_xpath(single_selector.canonical())
632
  results += self.xpath(
633
  xpath_selector,
634
  identifier or single_selector.canonical(),
@@ -731,7 +752,8 @@ class Selector(SelectorsGeneration):
731
  raise TypeError("You have to pass something to search with, like tag name(s), tag attributes, or both.")
732
 
733
  attributes = dict()
734
- tags, patterns = set(), set()
 
735
  results, functions, selectors = Selectors(), [], []
736
 
737
  # Brace yourself for a wonderful journey!
@@ -740,6 +762,7 @@ class Selector(SelectorsGeneration):
740
  tags.add(arg)
741
 
742
  elif type(arg) in (list, tuple, set):
 
743
  if not all(map(lambda x: isinstance(x, str), arg)):
744
  raise TypeError("Nested Iterables are not accepted, only iterables of tag names are accepted")
745
  tags.update(set(arg))
@@ -774,7 +797,7 @@ class Selector(SelectorsGeneration):
774
  attributes[attribute_name] = value
775
 
776
  # It's easier and faster to build a selector than traversing the tree
777
- tags = tags or ["*"]
778
  for tag in tags:
779
  selector = tag
780
  for key, value in attributes.items():
@@ -785,7 +808,7 @@ class Selector(SelectorsGeneration):
785
  selectors.append(selector)
786
 
787
  if selectors:
788
- results = self.css(", ".join(selectors))
789
  if results:
790
  # From the results, get the ones that fulfill passed regex patterns
791
  for pattern in patterns:
@@ -828,20 +851,20 @@ class Selector(SelectorsGeneration):
828
  :return: A percentage score of how similar is the candidate to the original element
829
  """
830
  score, checks = 0, 0
831
- candidate = _StorageTools.element_to_dict(candidate)
832
 
833
  # Possible TODO:
834
  # Study the idea of giving weight to each test below so some are more important than others
835
  # Current results: With weights some websites had better score while it was worse for others
836
- score += 1 if original["tag"] == candidate["tag"] else 0 # * 0.3 # 30%
837
  checks += 1
838
 
839
  if original["text"]:
840
- score += SequenceMatcher(None, original["text"], candidate.get("text") or "").ratio() # * 0.3 # 30%
841
  checks += 1
842
 
843
  # if both don't have attributes, it still counts for something!
844
- score += self.__calculate_dict_diff(original["attributes"], candidate["attributes"]) # * 0.3 # 30%
845
  checks += 1
846
 
847
  # Separate similarity test for class, id, href,... this will help in full structural changes
@@ -855,23 +878,23 @@ class Selector(SelectorsGeneration):
855
  score += SequenceMatcher(
856
  None,
857
  original["attributes"][attrib],
858
- candidate["attributes"].get(attrib) or "",
859
  ).ratio() # * 0.3 # 30%
860
  checks += 1
861
 
862
- score += SequenceMatcher(None, original["path"], candidate["path"]).ratio() # * 0.1 # 10%
863
  checks += 1
864
 
865
  if original.get("parent_name"):
866
  # Then we start comparing parents' data
867
- if candidate.get("parent_name"):
868
  score += SequenceMatcher(
869
- None, original["parent_name"], candidate.get("parent_name") or ""
870
  ).ratio() # * 0.2 # 20%
871
  checks += 1
872
 
873
  score += self.__calculate_dict_diff(
874
- original["parent_attribs"], candidate.get("parent_attribs") or {}
875
  ) # * 0.2 # 20%
876
  checks += 1
877
 
@@ -879,7 +902,7 @@ class Selector(SelectorsGeneration):
879
  score += SequenceMatcher(
880
  None,
881
  original["parent_text"],
882
- candidate.get("parent_text") or "",
883
  ).ratio() # * 0.1 # 10%
884
  checks += 1
885
  # else:
@@ -887,9 +910,7 @@ class Selector(SelectorsGeneration):
887
  # score -= 0.1
888
 
889
  if original.get("siblings"):
890
- score += SequenceMatcher(
891
- None, original["siblings"], candidate.get("siblings") or []
892
- ).ratio() # * 0.1 # 10%
893
  checks += 1
894
 
895
  # How % sure? let's see
@@ -902,7 +923,7 @@ class Selector(SelectorsGeneration):
902
  score += SequenceMatcher(None, tuple(dict1.values()), tuple(dict2.values())).ratio() * 0.5
903
  return score
904
 
905
- def save(self, element: Union["Selector", HtmlElement], identifier: str) -> None:
906
  """Saves the element's unique properties to the storage for retrieval and relocation later
907
 
908
  :param element: The element itself that we want to save to storage, it can be a ` Selector ` or pure ` HtmlElement `
@@ -910,15 +931,16 @@ class Selector(SelectorsGeneration):
910
  the docs for more info.
911
  """
912
  if self.__adaptive_enabled:
913
- if isinstance(element, self.__class__):
914
- element = element._root
 
915
 
916
- if self._is_text_node(element):
917
- element = element.getparent()
918
 
919
- self._storage.save(element, identifier)
920
  else:
921
- log.critical(
922
  "Can't use `adaptive` features while it's disabled globally, you have to start a new class instance."
923
  )
924
 
@@ -932,10 +954,9 @@ class Selector(SelectorsGeneration):
932
  if self.__adaptive_enabled:
933
  return self._storage.retrieve(identifier)
934
 
935
- log.critical(
936
  "Can't use `adaptive` features while it's disabled globally, you have to start a new class instance."
937
  )
938
- return None
939
 
940
  # Operations on text functions
941
  def json(self) -> Dict:
@@ -1104,28 +1125,30 @@ class Selector(SelectorsGeneration):
1104
  if not case_sensitive:
1105
  text = text.lower()
1106
 
1107
- for node in self.__handle_elements(_find_all_elements_with_spaces(self._root)):
1108
- """Check if element matches given text otherwise, traverse the children tree and iterate"""
1109
- node_text = node.text
1110
- if clean_match:
1111
- node_text = node_text.clean()
1112
-
1113
- if not case_sensitive:
1114
- node_text = node_text.lower()
1115
-
1116
- if partial:
1117
- if text in node_text:
 
 
 
 
1118
  results.append(node)
1119
- elif text == node_text:
1120
- results.append(node)
1121
 
1122
- if first_match and results:
1123
- # we got an element so we should stop
1124
- break
1125
 
1126
- if first_match:
1127
- if results:
1128
- return results[0]
1129
  return results
1130
 
1131
  def find_by_regex(
@@ -1143,23 +1166,25 @@ class Selector(SelectorsGeneration):
1143
  """
1144
  results = Selectors()
1145
 
1146
- for node in self.__handle_elements(_find_all_elements_with_spaces(self._root)):
1147
- """Check if element matches given regex otherwise, traverse the children tree and iterate"""
1148
- node_text = node.text
1149
- if node_text.re(
1150
- query,
1151
- check_match=True,
1152
- clean_match=clean_match,
1153
- case_sensitive=case_sensitive,
1154
- ):
1155
- results.append(node)
 
 
1156
 
1157
- if first_match and results:
1158
- # we got an element so we should stop
1159
- break
1160
 
1161
- if results and first_match:
1162
- return results[0]
1163
  return results
1164
 
1165
 
@@ -1181,9 +1206,9 @@ class Selectors(List[Selector]):
1181
  def __getitem__(self, pos: SupportsIndex | slice) -> Union[Selector, "Selectors"]:
1182
  lst = super().__getitem__(pos)
1183
  if isinstance(pos, slice):
1184
- return self.__class__(lst)
1185
  else:
1186
- return lst
1187
 
1188
  def xpath(
1189
  self,
@@ -1265,7 +1290,7 @@ class Selectors(List[Selector]):
1265
  def re_first(
1266
  self,
1267
  regex: str | Pattern,
1268
- default=None,
1269
  replace_entities: bool = True,
1270
  clean_match: bool = False,
1271
  case_sensitive: bool = True,
 
17
 
18
  from scrapling.core._types import (
19
  Any,
20
+ Set,
21
  Dict,
22
+ cast,
23
  List,
24
  Tuple,
25
  Union,
26
  Pattern,
27
  Callable,
28
+ Literal,
29
  Optional,
30
  Iterable,
31
  overload,
32
  Generator,
33
  SupportsIndex,
34
+ TYPE_CHECKING,
35
  )
36
  from scrapling.core.custom_types import AttributesHandler, TextHandler, TextHandlers
37
  from scrapling.core.mixins import SelectorsGeneration
 
40
  StorageSystemMixin,
41
  _StorageTools,
42
  )
43
+ from scrapling.core.translator import css_to_xpath as _css_to_xpath
44
  from scrapling.core.utils import clean_spaces, flatten, html_forbidden, log
45
 
46
  __DEFAULT_DB_FILE__ = str(Path(__file__).parent / "elements_storage.db")
 
74
  "_raw_body",
75
  )
76
 
77
+ if TYPE_CHECKING:
78
+ _storage: StorageSystemMixin
79
+
80
  def __init__(
81
  self,
82
  content: Optional[str | bytes] = None,
83
+ url: str = "",
84
  encoding: str = "utf-8",
85
  huge_tree: bool = True,
86
  root: Optional[HtmlElement] = None,
87
  keep_comments: Optional[bool] = False,
88
  keep_cdata: Optional[bool] = False,
89
  adaptive: Optional[bool] = False,
90
+ _storage: Optional[StorageSystemMixin] = None,
91
  storage: Any = SQLiteStorageSystem,
92
  storage_args: Optional[Dict] = None,
93
+ **_,
94
  ):
95
  """The main class that works as a wrapper for the HTML input data. Using this class, you can search for elements
96
  with expressions in CSS, XPath, or with simply text. Check the docs for more info.
 
138
  default_doctype=True,
139
  strip_cdata=(not keep_cdata),
140
  )
141
+ self._root = cast(HtmlElement, fromstring(body, parser=parser, base_url=url or None))
142
  self._raw_body = content
143
 
144
  else:
 
148
  f"Root have to be a valid element of `html` module types to work, not of type {type(root)}"
149
  )
150
 
151
+ self._root = cast(HtmlElement, root)
152
  self._raw_body = ""
153
 
154
  self.__adaptive_enabled = adaptive
 
245
  **self.__response_data,
246
  )
247
 
248
+ def __elements_convertor(self, elements: List[HtmlElement]) -> "Selectors":
249
+ return Selectors(map(self.__element_convertor, elements))
250
+
251
  def __handle_element(
252
  self, element: Optional[HtmlElement | _ElementUnicodeResult]
253
  ) -> Optional[Union[TextHandler, "Selector"]]:
 
272
  if self._is_text_node(result[0]):
273
  return TextHandlers(map(TextHandler, result))
274
 
275
+ return self.__elements_convertor(result)
276
 
277
  def __getstate__(self) -> Any:
278
  # lxml don't like it :)
 
333
  if not valid_values or processed_text.strip():
334
  _all_strings.append(processed_text)
335
 
336
+ return cast(TextHandler, TextHandler(separator).join(_all_strings))
337
 
338
  def urljoin(self, relative_url: str) -> str:
339
  """Join this Selector's url with a relative url to form an absolute full URL."""
 
382
  @property
383
  def parent(self) -> Optional["Selector"]:
384
  """Return the direct parent of the element or ``None`` otherwise"""
385
+ _parent = self._root.getparent()
386
+ return self.__element_convertor(_parent) if _parent is not None else None
387
 
388
  @property
389
  def below_elements(self) -> "Selectors":
390
  """Return all elements under the current element in the DOM tree"""
391
  below = _find_all_elements(self._root)
392
+ return self.__elements_convertor(below) if below is not None else Selectors()
393
 
394
  @property
395
  def children(self) -> "Selectors":
 
436
  # Ignore HTML comments and unwanted types
437
  next_element = next_element.getnext()
438
 
439
+ return self.__element_convertor(next_element) if next_element is not None else None
440
 
441
  @property
442
  def previous(self) -> Optional["Selector"]:
 
446
  # Ignore HTML comments and unwanted types
447
  prev_element = prev_element.getprevious()
448
 
449
+ return self.__element_convertor(prev_element) if prev_element is not None else None
450
 
451
  # For easy copy-paste from Scrapy/parsel code when needed :)
452
+ def get(self, default=None): # pyright: ignore
453
  return self
454
 
455
  def get_all(self):
 
479
  return data + ">"
480
 
481
  # From here we start with the selecting functions
482
+ @overload
483
+ def relocate(
484
+ self, element: Union[Dict, HtmlElement, "Selector"], percentage: int, selector_type: Literal[True]
485
+ ) -> "Selectors": ...
486
+
487
+ @overload
488
+ def relocate(
489
+ self, element: Union[Dict, HtmlElement, "Selector"], percentage: int, selector_type: Literal[False] = False
490
+ ) -> List[HtmlElement]: ...
491
+
492
  def relocate(
493
  self,
494
  element: Union[Dict, HtmlElement, "Selector"],
 
527
  log.debug(f"Highest probability was {highest_probability}%")
528
  log.debug("Top 5 best matching elements are: ")
529
  for percent in tuple(sorted(score_table.keys(), reverse=True))[:5]:
530
+ log.debug(f"{percent} -> {self.__elements_convertor(score_table[percent])}")
531
 
532
  if not selector_type:
533
  return score_table[highest_probability]
534
+ return self.__elements_convertor(score_table[highest_probability])
535
  return []
536
 
537
  def css_first(
 
614
  auto_save: bool = False,
615
  percentage: int = 0,
616
  **kwargs: Any,
617
+ ) -> Union["Selectors", List[Any], "TextHandlers"]:
618
  """Search the current tree with CSS3 selectors
619
 
620
  **Important:
 
635
  try:
636
  if not self.__adaptive_enabled or "," not in selector:
637
  # No need to split selectors in this case, let's save some CPU cycles :)
638
+ xpath_selector = _css_to_xpath(selector)
639
  return self.xpath(
640
  xpath_selector,
641
  identifier or selector,
 
649
  for single_selector in split_selectors(selector):
650
  # I'm doing this only so the `save` function saves data correctly for combined selectors
651
  # Like using the ',' to combine two different selectors that point to different elements.
652
+ xpath_selector = _css_to_xpath(single_selector.canonical())
653
  results += self.xpath(
654
  xpath_selector,
655
  identifier or single_selector.canonical(),
 
752
  raise TypeError("You have to pass something to search with, like tag name(s), tag attributes, or both.")
753
 
754
  attributes = dict()
755
+ tags: Set[str] = set()
756
+ patterns: Set[Pattern] = set()
757
  results, functions, selectors = Selectors(), [], []
758
 
759
  # Brace yourself for a wonderful journey!
 
762
  tags.add(arg)
763
 
764
  elif type(arg) in (list, tuple, set):
765
+ arg = cast(Iterable, arg) # Type narrowing for type checkers like pyright
766
  if not all(map(lambda x: isinstance(x, str), arg)):
767
  raise TypeError("Nested Iterables are not accepted, only iterables of tag names are accepted")
768
  tags.update(set(arg))
 
797
  attributes[attribute_name] = value
798
 
799
  # It's easier and faster to build a selector than traversing the tree
800
+ tags = tags or set("*")
801
  for tag in tags:
802
  selector = tag
803
  for key, value in attributes.items():
 
808
  selectors.append(selector)
809
 
810
  if selectors:
811
+ results = cast(Selectors, self.css(", ".join(selectors)))
812
  if results:
813
  # From the results, get the ones that fulfill passed regex patterns
814
  for pattern in patterns:
 
851
  :return: A percentage score of how similar is the candidate to the original element
852
  """
853
  score, checks = 0, 0
854
+ data = _StorageTools.element_to_dict(candidate)
855
 
856
  # Possible TODO:
857
  # Study the idea of giving weight to each test below so some are more important than others
858
  # Current results: With weights some websites had better score while it was worse for others
859
+ score += 1 if original["tag"] == data["tag"] else 0 # * 0.3 # 30%
860
  checks += 1
861
 
862
  if original["text"]:
863
+ score += SequenceMatcher(None, original["text"], data.get("text") or "").ratio() # * 0.3 # 30%
864
  checks += 1
865
 
866
  # if both don't have attributes, it still counts for something!
867
+ score += self.__calculate_dict_diff(original["attributes"], data["attributes"]) # * 0.3 # 30%
868
  checks += 1
869
 
870
  # Separate similarity test for class, id, href,... this will help in full structural changes
 
878
  score += SequenceMatcher(
879
  None,
880
  original["attributes"][attrib],
881
+ data["attributes"].get(attrib) or "",
882
  ).ratio() # * 0.3 # 30%
883
  checks += 1
884
 
885
+ score += SequenceMatcher(None, original["path"], data["path"]).ratio() # * 0.1 # 10%
886
  checks += 1
887
 
888
  if original.get("parent_name"):
889
  # Then we start comparing parents' data
890
+ if data.get("parent_name"):
891
  score += SequenceMatcher(
892
+ None, original["parent_name"], data.get("parent_name") or ""
893
  ).ratio() # * 0.2 # 20%
894
  checks += 1
895
 
896
  score += self.__calculate_dict_diff(
897
+ original["parent_attribs"], data.get("parent_attribs") or {}
898
  ) # * 0.2 # 20%
899
  checks += 1
900
 
 
902
  score += SequenceMatcher(
903
  None,
904
  original["parent_text"],
905
+ data.get("parent_text") or "",
906
  ).ratio() # * 0.1 # 10%
907
  checks += 1
908
  # else:
 
910
  # score -= 0.1
911
 
912
  if original.get("siblings"):
913
+ score += SequenceMatcher(None, original["siblings"], data.get("siblings") or []).ratio() # * 0.1 # 10%
 
 
914
  checks += 1
915
 
916
  # How % sure? let's see
 
923
  score += SequenceMatcher(None, tuple(dict1.values()), tuple(dict2.values())).ratio() * 0.5
924
  return score
925
 
926
+ def save(self, element: HtmlElement, identifier: str) -> None:
927
  """Saves the element's unique properties to the storage for retrieval and relocation later
928
 
929
  :param element: The element itself that we want to save to storage, it can be a ` Selector ` or pure ` HtmlElement `
 
931
  the docs for more info.
932
  """
933
  if self.__adaptive_enabled:
934
+ target = element
935
+ if isinstance(target, self.__class__):
936
+ target: HtmlElement = target._root
937
 
938
+ if self._is_text_node(target):
939
+ target: HtmlElement = target.getparent()
940
 
941
+ self._storage.save(target, identifier)
942
  else:
943
+ raise RuntimeError(
944
  "Can't use `adaptive` features while it's disabled globally, you have to start a new class instance."
945
  )
946
 
 
954
  if self.__adaptive_enabled:
955
  return self._storage.retrieve(identifier)
956
 
957
+ raise RuntimeError(
958
  "Can't use `adaptive` features while it's disabled globally, you have to start a new class instance."
959
  )
 
960
 
961
  # Operations on text functions
962
  def json(self) -> Dict:
 
1125
  if not case_sensitive:
1126
  text = text.lower()
1127
 
1128
+ possible_targets = _find_all_elements_with_spaces(self._root)
1129
+ if possible_targets:
1130
+ for node in self.__elements_convertor(possible_targets):
1131
+ """Check if element matches given text otherwise, traverse the children tree and iterate"""
1132
+ node_text = node.text
1133
+ if clean_match:
1134
+ node_text = node_text.clean()
1135
+
1136
+ if not case_sensitive:
1137
+ node_text = node_text.lower()
1138
+
1139
+ if partial:
1140
+ if text in node_text:
1141
+ results.append(node)
1142
+ elif text == node_text:
1143
  results.append(node)
 
 
1144
 
1145
+ if first_match and results:
1146
+ # we got an element so we should stop
1147
+ break
1148
 
1149
+ if first_match:
1150
+ if results:
1151
+ return results[0]
1152
  return results
1153
 
1154
  def find_by_regex(
 
1166
  """
1167
  results = Selectors()
1168
 
1169
+ possible_targets = _find_all_elements_with_spaces(self._root)
1170
+ if possible_targets:
1171
+ for node in self.__elements_convertor(possible_targets):
1172
+ """Check if element matches given regex otherwise, traverse the children tree and iterate"""
1173
+ node_text = node.text
1174
+ if node_text.re(
1175
+ query,
1176
+ check_match=True,
1177
+ clean_match=clean_match,
1178
+ case_sensitive=case_sensitive,
1179
+ ):
1180
+ results.append(node)
1181
 
1182
+ if first_match and results:
1183
+ # we got an element so we should stop
1184
+ break
1185
 
1186
+ if results and first_match:
1187
+ return results[0]
1188
  return results
1189
 
1190
 
 
1206
  def __getitem__(self, pos: SupportsIndex | slice) -> Union[Selector, "Selectors"]:
1207
  lst = super().__getitem__(pos)
1208
  if isinstance(pos, slice):
1209
+ return self.__class__(cast(List[Selector], lst))
1210
  else:
1211
+ return cast(Selector, lst)
1212
 
1213
  def xpath(
1214
  self,
 
1290
  def re_first(
1291
  self,
1292
  regex: str | Pattern,
1293
+ default: Any = None,
1294
  replace_entities: bool = True,
1295
  clean_match: bool = False,
1296
  case_sensitive: bool = True,