Karim shoair committed on
Commit
a85d2c8
·
1 Parent(s): 085d32d

refactor: Make all fetchers as an optional dependency group

Browse files
pyproject.toml CHANGED
@@ -61,6 +61,10 @@ dependencies = [
61
  "click>=8.2.1",
62
  "orjson>=3.11.3",
63
  "tldextract>=5.3.0",
 
 
 
 
64
  "curl_cffi>=0.13.0",
65
  "playwright>=1.52.0",
66
  "rebrowser-playwright>=1.52.0",
@@ -68,15 +72,15 @@ dependencies = [
68
  "geoip2>=5.1.0",
69
  "msgspec>=0.19.0",
70
  ]
71
-
72
- [project.optional-dependencies]
73
  ai = [
74
  "mcp>=1.14.0",
75
  "markdownify>=1.2.0",
 
76
  ]
77
  shell = [
78
  "IPython>=8.37", # The last version that supports Python 3.10
79
  "markdownify>=1.2.0",
 
80
  ]
81
  all = [
82
  "scrapling[ai,shell]",
 
61
  "click>=8.2.1",
62
  "orjson>=3.11.3",
63
  "tldextract>=5.3.0",
64
+ ]
65
+
66
+ [project.optional-dependencies]
67
+ fetchers = [
68
  "curl_cffi>=0.13.0",
69
  "playwright>=1.52.0",
70
  "rebrowser-playwright>=1.52.0",
 
72
  "geoip2>=5.1.0",
73
  "msgspec>=0.19.0",
74
  ]
 
 
75
  ai = [
76
  "mcp>=1.14.0",
77
  "markdownify>=1.2.0",
78
+ "scrapling[fetchers]",
79
  ]
80
  shell = [
81
  "IPython>=8.37", # The last version that supports Python 3.10
82
  "markdownify>=1.2.0",
83
+ "scrapling[fetchers]",
84
  ]
85
  all = [
86
  "scrapling[ai,shell]",
scrapling/cli.py CHANGED
@@ -2,11 +2,9 @@ from pathlib import Path
2
  from subprocess import check_output
3
  from sys import executable as python_executable
4
 
5
- from scrapling.core.utils import log
6
- from scrapling.engines.toolbelt import Response
7
  from scrapling.core._types import List, Optional, Dict, Tuple, Any, Callable
8
- from scrapling.fetchers import Fetcher, DynamicFetcher, StealthyFetcher
9
- from scrapling.core.shell import Convertor, _CookieParser, _ParseHeaders
10
 
11
  from orjson import loads as json_loads, JSONDecodeError
12
  from click import command, option, Choice, group, argument
@@ -40,6 +38,8 @@ def __Request_and_Save(
40
  **kwargs,
41
  ) -> None:
42
  """Make a request using the specified fetcher function and save the result"""
 
 
43
  # Handle relative paths - convert to an absolute path based on the current working directory
44
  output_path = Path(output_file)
45
  if not output_path.is_absolute():
@@ -251,6 +251,8 @@ def get(
251
  impersonate=impersonate,
252
  proxy=proxy,
253
  )
 
 
254
  __Request_and_Save(Fetcher.get, url, output_file, css_selector, **kwargs)
255
 
256
 
@@ -347,6 +349,8 @@ def post(
347
  proxy=proxy,
348
  data=data,
349
  )
 
 
350
  __Request_and_Save(Fetcher.post, url, output_file, css_selector, **kwargs)
351
 
352
 
@@ -439,6 +443,8 @@ def put(
439
  proxy=proxy,
440
  data=data,
441
  )
 
 
442
  __Request_and_Save(Fetcher.put, url, output_file, css_selector, **kwargs)
443
 
444
 
@@ -524,6 +530,8 @@ def delete(
524
  impersonate=impersonate,
525
  proxy=proxy,
526
  )
 
 
527
  __Request_and_Save(Fetcher.delete, url, output_file, css_selector, **kwargs)
528
 
529
 
@@ -643,6 +651,8 @@ def fetch(
643
  if parsed_headers:
644
  kwargs["extra_headers"] = parsed_headers
645
 
 
 
646
  __Request_and_Save(DynamicFetcher.fetch, url, output_file, css_selector, **kwargs)
647
 
648
 
@@ -790,6 +800,8 @@ def stealthy_fetch(
790
  if parsed_headers:
791
  kwargs["extra_headers"] = parsed_headers
792
 
 
 
793
  __Request_and_Save(StealthyFetcher.fetch, url, output_file, css_selector, **kwargs)
794
 
795
 
 
2
  from subprocess import check_output
3
  from sys import executable as python_executable
4
 
5
+ from scrapling.engines.toolbelt.custom import Response
6
+ from scrapling.core.utils import log, _CookieParser, _ParseHeaders
7
  from scrapling.core._types import List, Optional, Dict, Tuple, Any, Callable
 
 
8
 
9
  from orjson import loads as json_loads, JSONDecodeError
10
  from click import command, option, Choice, group, argument
 
38
  **kwargs,
39
  ) -> None:
40
  """Make a request using the specified fetcher function and save the result"""
41
+ from scrapling.core.shell import Convertor
42
+
43
  # Handle relative paths - convert to an absolute path based on the current working directory
44
  output_path = Path(output_file)
45
  if not output_path.is_absolute():
 
251
  impersonate=impersonate,
252
  proxy=proxy,
253
  )
254
+ from scrapling.fetchers import Fetcher
255
+
256
  __Request_and_Save(Fetcher.get, url, output_file, css_selector, **kwargs)
257
 
258
 
 
349
  proxy=proxy,
350
  data=data,
351
  )
352
+ from scrapling.fetchers import Fetcher
353
+
354
  __Request_and_Save(Fetcher.post, url, output_file, css_selector, **kwargs)
355
 
356
 
 
443
  proxy=proxy,
444
  data=data,
445
  )
446
+ from scrapling.fetchers import Fetcher
447
+
448
  __Request_and_Save(Fetcher.put, url, output_file, css_selector, **kwargs)
449
 
450
 
 
530
  impersonate=impersonate,
531
  proxy=proxy,
532
  )
533
+ from scrapling.fetchers import Fetcher
534
+
535
  __Request_and_Save(Fetcher.delete, url, output_file, css_selector, **kwargs)
536
 
537
 
 
651
  if parsed_headers:
652
  kwargs["extra_headers"] = parsed_headers
653
 
654
+ from scrapling.fetchers import DynamicFetcher
655
+
656
  __Request_and_Save(DynamicFetcher.fetch, url, output_file, css_selector, **kwargs)
657
 
658
 
 
800
  if parsed_headers:
801
  kwargs["extra_headers"] = parsed_headers
802
 
803
+ from scrapling.fetchers import StealthyFetcher
804
+
805
  __Request_and_Save(StealthyFetcher.fetch, url, output_file, css_selector, **kwargs)
806
 
807
 
scrapling/core/shell.py CHANGED
@@ -2,7 +2,6 @@
2
  from re import sub as re_sub
3
  from sys import stderr
4
  from functools import wraps
5
- from http import cookies as Cookie
6
  from collections import namedtuple
7
  from shlex import split as shlex_split
8
  from tempfile import mkstemp as make_temp_file
@@ -23,25 +22,17 @@ from logging import (
23
  from orjson import loads as json_loads, JSONDecodeError
24
 
25
  from scrapling import __version__
26
- from scrapling.core.custom_types import TextHandler
27
- from scrapling.core.utils import log
28
  from scrapling.parser import Selector, Selectors
 
 
 
29
  from scrapling.core._types import (
30
- List,
31
  Optional,
32
  Dict,
33
- Tuple,
34
  Any,
35
  extraction_types,
36
  Generator,
37
  )
38
- from scrapling.fetchers import (
39
- Fetcher,
40
- AsyncFetcher,
41
- DynamicFetcher,
42
- StealthyFetcher,
43
- Response,
44
- )
45
 
46
 
47
  _known_logging_levels = {
@@ -71,46 +62,6 @@ Request = namedtuple(
71
  )
72
 
73
 
74
- def _CookieParser(cookie_string):
75
- # Errors will be handled on call so the log can be specified
76
- cookie_parser = Cookie.SimpleCookie()
77
- cookie_parser.load(cookie_string)
78
- for key, morsel in cookie_parser.items():
79
- yield key, morsel.value
80
-
81
-
82
- def _ParseHeaders(header_lines: List[str], parse_cookies: bool = True) -> Tuple[Dict[str, str], Dict[str, str]]:
83
- """Parses headers into separate header and cookie dictionaries."""
84
- header_dict = dict()
85
- cookie_dict = dict()
86
-
87
- for header_line in header_lines:
88
- if ":" not in header_line:
89
- if header_line.endswith(";"):
90
- header_key = header_line[:-1].strip()
91
- header_value = ""
92
- header_dict[header_key] = header_value
93
- else:
94
- raise ValueError(f"Could not parse header without colon: '{header_line}'.")
95
- else:
96
- header_key, header_value = header_line.split(":", 1)
97
- header_key = header_key.strip()
98
- header_value = header_value.strip()
99
-
100
- if parse_cookies:
101
- if header_key.lower() == "cookie":
102
- try:
103
- cookie_dict = {key: value for key, value in _CookieParser(header_value)}
104
- except Exception as e: # pragma: no cover
105
- raise ValueError(f"Could not parse cookie string from header '{header_value}': {e}")
106
- else:
107
- header_dict[header_key] = header_value
108
- else:
109
- header_dict[header_key] = header_value
110
-
111
- return header_dict, cookie_dict
112
-
113
-
114
  # Suppress exit on error to handle parsing errors gracefully
115
  class NoExitArgumentParser(ArgumentParser): # pragma: no cover
116
  def error(self, message):
@@ -128,6 +79,9 @@ class CurlParser:
128
  """Builds the argument parser for relevant curl flags from DevTools."""
129
 
130
  def __init__(self):
 
 
 
131
  # We will use argparse parser to parse the curl command directly instead of regex
132
  # We will focus more on flags that will show up on curl commands copied from DevTools's network tab
133
  _parser = NoExitArgumentParser(add_help=False) # Disable default help
@@ -343,7 +297,7 @@ class CurlParser:
343
  _ = request_args.pop("json", None)
344
 
345
  try:
346
- return getattr(Fetcher, method)(**request_args)
347
  except Exception as e: # pragma: no cover
348
  log.error(f"Error calling Fetcher.{method}: {e}")
349
  return None
@@ -377,6 +331,19 @@ class CustomShell:
377
  """A custom IPython shell with minimal dependencies"""
378
 
379
  def __init__(self, code, log_level="debug"):
 
 
 
 
 
 
 
 
 
 
 
 
 
380
  self.code = code
381
  self.page = None
382
  self.pages = Selectors([])
@@ -400,7 +367,7 @@ class CustomShell:
400
  if self.log_level:
401
  getLogger("scrapling").setLevel(self.log_level)
402
 
403
- settings = Fetcher.display_config()
404
  settings.pop("storage", None)
405
  settings.pop("storage_args", None)
406
  log.info(f"Scrapling {__version__} shell started")
@@ -466,12 +433,12 @@ Type 'exit' or press Ctrl+D to exit.
466
  """Create a namespace with application-specific objects"""
467
 
468
  # Create wrapped versions of fetch functions
469
- get = self.create_wrapper(Fetcher.get)
470
- post = self.create_wrapper(Fetcher.post)
471
- put = self.create_wrapper(Fetcher.put)
472
- delete = self.create_wrapper(Fetcher.delete)
473
- dynamic_fetch = self.create_wrapper(DynamicFetcher.fetch)
474
- stealthy_fetch = self.create_wrapper(StealthyFetcher.fetch)
475
  curl2fetcher = self.create_wrapper(self._curl_parser.convert2fetcher)
476
 
477
  # Create the namespace dictionary
@@ -480,12 +447,12 @@ Type 'exit' or press Ctrl+D to exit.
480
  "post": post,
481
  "put": put,
482
  "delete": delete,
483
- "Fetcher": Fetcher,
484
- "AsyncFetcher": AsyncFetcher,
485
  "fetch": dynamic_fetch,
486
- "DynamicFetcher": DynamicFetcher,
487
  "stealthy_fetch": stealthy_fetch,
488
- "StealthyFetcher": StealthyFetcher,
489
  "Selector": Selector,
490
  "page": self.page,
491
  "response": self.page,
@@ -502,11 +469,10 @@ Type 'exit' or press Ctrl+D to exit.
502
 
503
  def start(self): # pragma: no cover
504
  """Start the interactive shell"""
505
- from IPython.terminal.embed import InteractiveShellEmbed
506
 
507
  # Get our namespace with application objects
508
  namespace = self.get_namespace()
509
- ipython_shell = InteractiveShellEmbed(
510
  banner1=self.banner(),
511
  banner2="",
512
  enable_tip=False,
 
2
  from re import sub as re_sub
3
  from sys import stderr
4
  from functools import wraps
 
5
  from collections import namedtuple
6
  from shlex import split as shlex_split
7
  from tempfile import mkstemp as make_temp_file
 
22
  from orjson import loads as json_loads, JSONDecodeError
23
 
24
  from scrapling import __version__
 
 
25
  from scrapling.parser import Selector, Selectors
26
+ from scrapling.core.custom_types import TextHandler
27
+ from scrapling.engines.toolbelt.custom import Response
28
+ from scrapling.core.utils import log, _ParseHeaders, _CookieParser
29
  from scrapling.core._types import (
 
30
  Optional,
31
  Dict,
 
32
  Any,
33
  extraction_types,
34
  Generator,
35
  )
 
 
 
 
 
 
 
36
 
37
 
38
  _known_logging_levels = {
 
62
  )
63
 
64
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
65
  # Suppress exit on error to handle parsing errors gracefully
66
  class NoExitArgumentParser(ArgumentParser): # pragma: no cover
67
  def error(self, message):
 
79
  """Builds the argument parser for relevant curl flags from DevTools."""
80
 
81
  def __init__(self):
82
+ from scrapling.fetchers import Fetcher as __Fetcher
83
+
84
+ self.__fetcher = __Fetcher
85
  # We will use argparse parser to parse the curl command directly instead of regex
86
  # We will focus more on flags that will show up on curl commands copied from DevTools's network tab
87
  _parser = NoExitArgumentParser(add_help=False) # Disable default help
 
297
  _ = request_args.pop("json", None)
298
 
299
  try:
300
+ return getattr(self.__Fetcher, method)(**request_args)
301
  except Exception as e: # pragma: no cover
302
  log.error(f"Error calling Fetcher.{method}: {e}")
303
  return None
 
331
  """A custom IPython shell with minimal dependencies"""
332
 
333
  def __init__(self, code, log_level="debug"):
334
+ from IPython.terminal.embed import InteractiveShellEmbed as __InteractiveShellEmbed
335
+ from scrapling.fetchers import (
336
+ Fetcher as __Fetcher,
337
+ AsyncFetcher as __AsyncFetcher,
338
+ DynamicFetcher as __DynamicFetcher,
339
+ StealthyFetcher as __StealthyFetcher,
340
+ )
341
+
342
+ self.__InteractiveShellEmbed = __InteractiveShellEmbed
343
+ self.__Fetcher = __Fetcher
344
+ self.__AsyncFetcher = __AsyncFetcher
345
+ self.__DynamicFetcher = __DynamicFetcher
346
+ self.__StealthyFetcher = __StealthyFetcher
347
  self.code = code
348
  self.page = None
349
  self.pages = Selectors([])
 
367
  if self.log_level:
368
  getLogger("scrapling").setLevel(self.log_level)
369
 
370
+ settings = self.__Fetcher.display_config()
371
  settings.pop("storage", None)
372
  settings.pop("storage_args", None)
373
  log.info(f"Scrapling {__version__} shell started")
 
433
  """Create a namespace with application-specific objects"""
434
 
435
  # Create wrapped versions of fetch functions
436
+ get = self.create_wrapper(self.__Fetcher.get)
437
+ post = self.create_wrapper(self.__Fetcher.post)
438
+ put = self.create_wrapper(self.__Fetcher.put)
439
+ delete = self.create_wrapper(self.__Fetcher.delete)
440
+ dynamic_fetch = self.create_wrapper(self.__DynamicFetcher.fetch)
441
+ stealthy_fetch = self.create_wrapper(self.__StealthyFetcher.fetch)
442
  curl2fetcher = self.create_wrapper(self._curl_parser.convert2fetcher)
443
 
444
  # Create the namespace dictionary
 
447
  "post": post,
448
  "put": put,
449
  "delete": delete,
450
+ "Fetcher": self.__Fetcher,
451
+ "AsyncFetcher": self.__AsyncFetcher,
452
  "fetch": dynamic_fetch,
453
+ "DynamicFetcher": self.__DynamicFetcher,
454
  "stealthy_fetch": stealthy_fetch,
455
+ "StealthyFetcher": self.__StealthyFetcher,
456
  "Selector": Selector,
457
  "page": self.page,
458
  "response": self.page,
 
469
 
470
  def start(self): # pragma: no cover
471
  """Start the interactive shell"""
 
472
 
473
  # Get our namespace with application objects
474
  namespace = self.get_namespace()
475
+ ipython_shell = self.__InteractiveShellEmbed(
476
  banner1=self.banner(),
477
  banner2="",
478
  enable_tip=False,
scrapling/core/translator.py CHANGED
@@ -10,10 +10,10 @@ So you don't have to learn a new selectors/api method like what bs4 done with so
10
 
11
  from functools import lru_cache
12
 
13
- from cssselect import HTMLTranslator as OriginalHTMLTranslator
14
- from cssselect.parser import Element, FunctionalPseudoElement, PseudoElement
15
  from cssselect.xpath import ExpressionError
16
  from cssselect.xpath import XPathExpr as OriginalXPathExpr
 
 
17
 
18
  from scrapling.core._types import Any, Optional, Protocol, Self
19
 
 
10
 
11
  from functools import lru_cache
12
 
 
 
13
  from cssselect.xpath import ExpressionError
14
  from cssselect.xpath import XPathExpr as OriginalXPathExpr
15
+ from cssselect import HTMLTranslator as OriginalHTMLTranslator
16
+ from cssselect.parser import Element, FunctionalPseudoElement, PseudoElement
17
 
18
  from scrapling.core._types import Any, Optional, Protocol, Self
19
 
scrapling/core/utils/__init__.py ADDED
@@ -0,0 +1,10 @@
 
 
 
 
 
 
 
 
 
 
 
1
+ from ._utils import (
2
+ log,
3
+ __CONSECUTIVE_SPACES_REGEX__,
4
+ flatten,
5
+ _is_iterable,
6
+ _StorageTools,
7
+ clean_spaces,
8
+ html_forbidden,
9
+ )
10
+ from ._shell import _CookieParser, _ParseHeaders
scrapling/core/utils/_shell.py ADDED
@@ -0,0 +1,48 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from http import cookies as Cookie
2
+
3
+
4
+ from scrapling.core._types import (
5
+ List,
6
+ Dict,
7
+ Tuple,
8
+ )
9
+
10
+
11
+ def _CookieParser(cookie_string):
12
+ # Errors will be handled on call so the log can be specified
13
+ cookie_parser = Cookie.SimpleCookie()
14
+ cookie_parser.load(cookie_string)
15
+ for key, morsel in cookie_parser.items():
16
+ yield key, morsel.value
17
+
18
+
19
+ def _ParseHeaders(header_lines: List[str], parse_cookies: bool = True) -> Tuple[Dict[str, str], Dict[str, str]]:
20
+ """Parses headers into separate header and cookie dictionaries."""
21
+ header_dict = dict()
22
+ cookie_dict = dict()
23
+
24
+ for header_line in header_lines:
25
+ if ":" not in header_line:
26
+ if header_line.endswith(";"):
27
+ header_key = header_line[:-1].strip()
28
+ header_value = ""
29
+ header_dict[header_key] = header_value
30
+ else:
31
+ raise ValueError(f"Could not parse header without colon: '{header_line}'.")
32
+ else:
33
+ header_key, header_value = header_line.split(":", 1)
34
+ header_key = header_key.strip()
35
+ header_value = header_value.strip()
36
+
37
+ if parse_cookies:
38
+ if header_key.lower() == "cookie":
39
+ try:
40
+ cookie_dict = {key: value for key, value in _CookieParser(header_value)}
41
+ except Exception as e: # pragma: no cover
42
+ raise ValueError(f"Could not parse cookie string from header '{header_value}': {e}")
43
+ else:
44
+ header_dict[header_key] = header_value
45
+ else:
46
+ header_dict[header_key] = header_value
47
+
48
+ return header_dict, cookie_dict
scrapling/core/{utils.py → utils/_utils.py} RENAMED
File without changes
scrapling/engines/__init__.py CHANGED
@@ -1,16 +0,0 @@
1
- from .constants import DEFAULT_DISABLED_RESOURCES, DEFAULT_STEALTH_FLAGS, DEFAULT_FLAGS
2
- from .static import FetcherSession, FetcherClient, AsyncFetcherClient
3
- from ._browsers import (
4
- DynamicSession,
5
- AsyncDynamicSession,
6
- StealthySession,
7
- AsyncStealthySession,
8
- )
9
-
10
- __all__ = [
11
- "FetcherSession",
12
- "DynamicSession",
13
- "AsyncDynamicSession",
14
- "StealthySession",
15
- "AsyncStealthySession",
16
- ]
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
scrapling/engines/_browsers/_base.py CHANGED
@@ -12,20 +12,17 @@ from camoufox.utils import (
12
  installed_verstr as camoufox_version,
13
  )
14
 
15
- from scrapling.engines.toolbelt import (
16
- intercept_route,
17
- async_intercept_route,
18
- get_os_name,
19
- )
20
- from ._page import PageInfo, PagePool
21
- from ._config_tools import _compiled_stealth_scripts
22
- from ._validators import validate, PlaywrightConfig, CamoufoxConfig
23
- from ._config_tools import _launch_kwargs, _context_kwargs
24
  from scrapling.core._types import (
25
  Any,
26
  Dict,
27
  Optional,
28
  )
 
 
 
 
 
29
 
30
  __ff_version_str__ = camoufox_version().split(".", 1)[0]
31
 
 
12
  installed_verstr as camoufox_version,
13
  )
14
 
15
+ from scrapling.engines.toolbelt.navigation import intercept_route, async_intercept_route
 
 
 
 
 
 
 
 
16
  from scrapling.core._types import (
17
  Any,
18
  Dict,
19
  Optional,
20
  )
21
+ from ._page import PageInfo, PagePool
22
+ from ._config_tools import _compiled_stealth_scripts
23
+ from ._config_tools import _launch_kwargs, _context_kwargs
24
+ from scrapling.engines.toolbelt.fingerprints import get_os_name
25
+ from ._validators import validate, PlaywrightConfig, CamoufoxConfig
26
 
27
  __ff_version_str__ = camoufox_version().split(".", 1)[0]
28
 
scrapling/engines/_browsers/_camoufox.py CHANGED
@@ -25,11 +25,11 @@ from scrapling.core._types import (
25
  Callable,
26
  SelectorWaitStates,
27
  )
28
- from scrapling.engines.toolbelt import (
29
  Response,
30
  ResponseFactory,
31
- generate_convincing_referer,
32
  )
 
33
 
34
  __CF_PATTERN__ = re_compile("challenges.cloudflare.com/cdn-cgi/challenge-platform/.*")
35
  _UNSET = object()
 
25
  Callable,
26
  SelectorWaitStates,
27
  )
28
+ from scrapling.engines.toolbelt.convertor import (
29
  Response,
30
  ResponseFactory,
 
31
  )
32
+ from scrapling.engines.toolbelt.fingerprints import generate_convincing_referer
33
 
34
  __CF_PATTERN__ = re_compile("challenges.cloudflare.com/cdn-cgi/challenge-platform/.*")
35
  _UNSET = object()
scrapling/engines/_browsers/_config_tools.py CHANGED
@@ -6,7 +6,8 @@ from scrapling.engines.constants import (
6
  HARMFUL_DEFAULT_ARGS,
7
  DEFAULT_FLAGS,
8
  )
9
- from scrapling.engines.toolbelt import js_bypass_path, generate_headers
 
10
 
11
  __default_useragent__ = generate_headers(browser_mode=True).get("User-Agent")
12
 
 
6
  HARMFUL_DEFAULT_ARGS,
7
  DEFAULT_FLAGS,
8
  )
9
+ from scrapling.engines.toolbelt.navigation import js_bypass_path
10
+ from scrapling.engines.toolbelt.fingerprints import generate_headers
11
 
12
  __default_useragent__ = generate_headers(browser_mode=True).get("User-Agent")
13
 
scrapling/engines/_browsers/_controllers.py CHANGED
@@ -26,11 +26,11 @@ from scrapling.core._types import (
26
  Callable,
27
  SelectorWaitStates,
28
  )
29
- from scrapling.engines.toolbelt import (
30
  Response,
31
  ResponseFactory,
32
- generate_convincing_referer,
33
  )
 
34
 
35
  _UNSET = object()
36
 
 
26
  Callable,
27
  SelectorWaitStates,
28
  )
29
+ from scrapling.engines.toolbelt.convertor import (
30
  Response,
31
  ResponseFactory,
 
32
  )
33
+ from scrapling.engines.toolbelt.fingerprints import generate_convincing_referer
34
 
35
  _UNSET = object()
36
 
scrapling/engines/_browsers/_validators.py CHANGED
@@ -9,7 +9,7 @@ from scrapling.core._types import (
9
  List,
10
  SelectorWaitStates,
11
  )
12
- from scrapling.engines.toolbelt import construct_proxy_dict
13
 
14
 
15
  class PlaywrightConfig(Struct, kw_only=True, frozen=False):
 
9
  List,
10
  SelectorWaitStates,
11
  )
12
+ from scrapling.engines.toolbelt.navigation import construct_proxy_dict
13
 
14
 
15
  class PlaywrightConfig(Struct, kw_only=True, frozen=False):
scrapling/engines/static.py CHANGED
@@ -26,11 +26,11 @@ from scrapling.core._types import (
26
 
27
  from .toolbelt import (
28
  Response,
29
- generate_convincing_referer,
30
  generate_headers,
31
- ResponseFactory,
32
  __default_useragent__,
33
  )
 
 
34
 
35
  _UNSET = object()
36
 
 
26
 
27
  from .toolbelt import (
28
  Response,
 
29
  generate_headers,
 
30
  __default_useragent__,
31
  )
32
+ from .toolbelt.convertor import ResponseFactory
33
+ from .toolbelt.fingerprints import generate_convincing_referer
34
 
35
  _UNSET = object()
36
 
scrapling/engines/toolbelt/__init__.py CHANGED
@@ -5,16 +5,7 @@ from .custom import (
5
  get_variable_name,
6
  )
7
  from .fingerprints import (
8
- generate_convincing_referer,
9
  generate_headers,
10
  get_os_name,
11
  __default_useragent__,
12
  )
13
- from .navigation import (
14
- async_intercept_route,
15
- construct_cdp_url,
16
- construct_proxy_dict,
17
- intercept_route,
18
- js_bypass_path,
19
- )
20
- from .convertor import ResponseFactory
 
5
  get_variable_name,
6
  )
7
  from .fingerprints import (
 
8
  generate_headers,
9
  get_os_name,
10
  __default_useragent__,
11
  )
 
 
 
 
 
 
 
 
scrapling/engines/toolbelt/custom.py CHANGED
@@ -2,8 +2,10 @@
2
  Functions related to custom types or type checking
3
  """
4
 
 
5
  from email.message import Message
6
 
 
7
  from scrapling.core._types import (
8
  Any,
9
  Dict,
@@ -12,7 +14,6 @@ from scrapling.core._types import (
12
  Tuple,
13
  )
14
  from scrapling.core.custom_types import MappingProxyType
15
- from scrapling.core.utils import log, lru_cache
16
  from scrapling.parser import Selector, SQLiteStorageSystem
17
 
18
 
 
2
  Functions related to custom types or type checking
3
  """
4
 
5
+ from functools import lru_cache
6
  from email.message import Message
7
 
8
+ from scrapling.core.utils import log
9
  from scrapling.core._types import (
10
  Any,
11
  Dict,
 
14
  Tuple,
15
  )
16
  from scrapling.core.custom_types import MappingProxyType
 
17
  from scrapling.parser import Selector, SQLiteStorageSystem
18
 
19
 
scrapling/engines/toolbelt/fingerprints.py CHANGED
@@ -2,13 +2,13 @@
2
  Functions related to generating headers and fingerprints generally
3
  """
4
 
 
5
  from platform import system as platform_system
6
 
7
  from tldextract import extract
8
  from browserforge.headers import Browser, HeaderGenerator
9
 
10
  from scrapling.core._types import Dict, Optional
11
- from scrapling.core.utils import lru_cache
12
 
13
  __OS_NAME__ = platform_system()
14
 
@@ -37,8 +37,6 @@ def get_os_name() -> Optional[str]:
37
  "Linux": "linux",
38
  "Darwin": "macos",
39
  "Windows": "windows",
40
- # For the future? because why not?
41
- "iOS": "ios",
42
  }.get(__OS_NAME__)
43
 
44
 
 
2
  Functions related to generating headers and fingerprints generally
3
  """
4
 
5
+ from functools import lru_cache
6
  from platform import system as platform_system
7
 
8
  from tldextract import extract
9
  from browserforge.headers import Browser, HeaderGenerator
10
 
11
  from scrapling.core._types import Dict, Optional
 
12
 
13
  __OS_NAME__ = platform_system()
14
 
 
37
  "Linux": "linux",
38
  "Darwin": "macos",
39
  "Windows": "windows",
 
 
40
  }.get(__OS_NAME__)
41
 
42
 
scrapling/engines/toolbelt/navigation.py CHANGED
@@ -86,51 +86,6 @@ def construct_proxy_dict(proxy_string: str | Dict[str, str], as_tuple=False) ->
86
  return None
87
 
88
 
89
- def construct_cdp_url(cdp_url: str, query_params: Optional[Dict] = None) -> str:
90
- """Takes a CDP URL, reconstruct it to check it's valid, then adds encoded parameters if exists
91
-
92
- :param cdp_url: The target URL.
93
- :param query_params: A dictionary of the parameters to add.
94
- :return: The new CDP URL.
95
- """
96
- try:
97
- # Validate the base URL structure
98
- parsed = urlparse(cdp_url)
99
-
100
- # Check scheme
101
- if parsed.scheme not in ("ws", "wss"):
102
- raise ValueError("CDP URL must use 'ws://' or 'wss://' scheme")
103
-
104
- # Validate hostname and port
105
- if not parsed.netloc:
106
- raise ValueError("Invalid hostname for the CDP URL")
107
-
108
- try:
109
- # Checking if the port is valid (if available)
110
- _ = parsed.port
111
- except ValueError:
112
- # urlparse will raise `ValueError` if the port can't be casted to integer
113
- raise ValueError("Invalid port for the CDP URL")
114
-
115
- # Ensure the path starts with /
116
- path = parsed.path
117
- if not path.startswith("/"):
118
- path = "/" + path
119
-
120
- # Reconstruct the base URL with validated parts
121
- validated_base = f"{parsed.scheme}://{parsed.netloc}{path}"
122
-
123
- # Add query parameters
124
- if query_params:
125
- query_string = urlencode(query_params)
126
- return f"{validated_base}?{query_string}"
127
-
128
- return validated_base
129
-
130
- except Exception as e:
131
- raise ValueError(f"Invalid CDP URL: {str(e)}")
132
-
133
-
134
  @lru_cache(10, typed=True)
135
  def js_bypass_path(filename: str) -> str:
136
  """Takes the base filename of a JS file inside the `bypasses` folder, then return the full path of it
 
86
  return None
87
 
88
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
89
  @lru_cache(10, typed=True)
90
  def js_bypass_path(filename: str) -> str:
91
  """Takes the base filename of a JS file inside the `bypasses` folder, then return the full path of it
scrapling/fetchers.py CHANGED
@@ -6,15 +6,17 @@ from scrapling.core._types import (
6
  SelectorWaitStates,
7
  Iterable,
8
  )
9
- from scrapling.engines import (
10
  FetcherSession,
11
- StealthySession,
12
- AsyncStealthySession,
13
- DynamicSession,
14
- AsyncDynamicSession,
15
  FetcherClient as _FetcherClient,
16
  AsyncFetcherClient as _AsyncFetcherClient,
17
  )
 
 
 
 
 
 
18
  from scrapling.engines.toolbelt import BaseFetcher, Response
19
 
20
  __FetcherClientInstance__ = _FetcherClient()
 
6
  SelectorWaitStates,
7
  Iterable,
8
  )
9
+ from scrapling.engines.static import (
10
  FetcherSession,
 
 
 
 
11
  FetcherClient as _FetcherClient,
12
  AsyncFetcherClient as _AsyncFetcherClient,
13
  )
14
+ from scrapling.engines._browsers import (
15
+ DynamicSession,
16
+ StealthySession,
17
+ AsyncDynamicSession,
18
+ AsyncStealthySession,
19
+ )
20
  from scrapling.engines.toolbelt import BaseFetcher, Response
21
 
22
  __FetcherClientInstance__ = _FetcherClient()
scrapling/parser.py CHANGED
@@ -1,12 +1,11 @@
1
- from pathlib import Path
2
  import re
 
3
  from inspect import signature
4
- from difflib import SequenceMatcher
5
  from urllib.parse import urljoin
 
6
 
7
- from cssselect import SelectorError, SelectorSyntaxError
8
- from cssselect import parse as split_selectors
9
  from lxml.html import HtmlElement, HtmlMixin, HTMLParser
 
10
  from lxml.etree import (
11
  XPath,
12
  tostring,
 
 
1
  import re
2
+ from pathlib import Path
3
  from inspect import signature
 
4
  from urllib.parse import urljoin
5
+ from difflib import SequenceMatcher
6
 
 
 
7
  from lxml.html import HtmlElement, HtmlMixin, HTMLParser
8
+ from cssselect import SelectorError, SelectorSyntaxError, parse as split_selectors
9
  from lxml.etree import (
10
  XPath,
11
  tostring,