Karim shoair committed on
Commit
8e67a4c
·
1 Parent(s): d4fe1d6

refactor: huge change, many features/classes got better naming

Browse files

- `Adaptor` became `Selector`
- `Adaptors` became `Selectors`
- `auto_match` argument/feature became `adaptive`
- `adaptor_arguments` argument became `selector_config`
- `automatch_domain` argument became `adaptive_domain`
- `additional_arguments` argument became `additional_args`
- `storage_adaptors` file became just `storage`

README.md CHANGED
@@ -52,14 +52,14 @@ Scrapling is a high-performance, intelligent web scraping library for Python tha
52
 
53
  ```python
54
  >> from scrapling.fetchers import Fetcher, AsyncFetcher, StealthyFetcher, DynamicFetcher
55
- >> StealthyFetcher.auto_match = True
56
  # Fetch websites' source under the radar!
57
  >> page = StealthyFetcher.fetch('https://example.com', headless=True, network_idle=True)
58
  >> print(page.status)
59
  200
60
  >> products = page.css('.product', auto_save=True) # Scrape data that survives website design changes!
61
- >> # Later, if the website structure changes, pass `auto_match=True`
62
- >> products = page.css('.product', auto_match=True) # and Scrapling still finds them!
63
  ```
64
 
65
  # Sponsors
@@ -150,7 +150,7 @@ Tired of your PC slowing you down? Can’t keep your machine on 24/7 for scrapin
150
  ```python
151
  from scrapling.fetchers import Fetcher
152
 
153
- # Do HTTP GET request to a web page and create an Adaptor instance
154
  page = Fetcher.get('https://quotes.toscrape.com/', stealthy_headers=True)
155
  # Get all text content from all HTML tags in the page except the `script` and `style` tags
156
  page.get_all_text(ignore_tags=('script', 'style'))
@@ -219,7 +219,7 @@ Here are the results:
219
  | Scrapling | 2.51 | 1.0x |
220
  | AutoScraper | 11.41 | 4.546x |
221
 
222
- Scrapling can find elements with more methods and returns the entire element's `Adaptor` object, not only text like AutoScraper. So, to make this test fair, both libraries will extract an element with text, find similar elements, and then extract the text content for all of them.
223
 
224
  As you see, Scrapling is still 4.5 times faster at the same task.
225
 
 
52
 
53
  ```python
54
  >> from scrapling.fetchers import Fetcher, AsyncFetcher, StealthyFetcher, DynamicFetcher
55
+ >> StealthyFetcher.adaptive = True
56
  # Fetch websites' source under the radar!
57
  >> page = StealthyFetcher.fetch('https://example.com', headless=True, network_idle=True)
58
  >> print(page.status)
59
  200
60
  >> products = page.css('.product', auto_save=True) # Scrape data that survives website design changes!
61
+ >> # Later, if the website structure changes, pass `adaptive=True`
62
+ >> products = page.css('.product', adaptive=True) # and Scrapling still finds them!
63
  ```
64
 
65
  # Sponsors
 
150
  ```python
151
  from scrapling.fetchers import Fetcher
152
 
153
+ # Do HTTP GET request to a web page and create a Selector instance
154
  page = Fetcher.get('https://quotes.toscrape.com/', stealthy_headers=True)
155
  # Get all text content from all HTML tags in the page except the `script` and `style` tags
156
  page.get_all_text(ignore_tags=('script', 'style'))
 
219
  | Scrapling | 2.51 | 1.0x |
220
  | AutoScraper | 11.41 | 4.546x |
221
 
222
+ Scrapling can find elements with more methods and returns the entire element's `Selector` object, not only text like AutoScraper. So, to make this test fair, both libraries will extract an element with text, find similar elements, and then extract the text content for all of them.
223
 
224
  As you see, Scrapling is still 4.5 times faster at the same task.
225
 
benchmarks.py CHANGED
@@ -12,7 +12,7 @@ from parsel import Selector
12
  from pyquery import PyQuery as pq
13
  from selectolax.parser import HTMLParser
14
 
15
- from scrapling import Adaptor
16
 
17
  large_html = (
18
  "<html><body>" + '<div class="item">' * 5000 + "</div>" * 5000 + "</body></html>"
@@ -73,9 +73,9 @@ def test_pyquery():
73
  @benchmark
74
  def test_scrapling():
75
  # No need to do `.extract()` like parsel to extract text
76
- # Also, this is faster than `[t.text for t in Adaptor(large_html, auto_match=False).css('.item')]`
77
  # for obvious reasons, of course.
78
- return Adaptor(large_html, auto_match=False).css(".item::text")
79
 
80
 
81
  @benchmark
@@ -112,7 +112,7 @@ def test_scrapling_text(request_html):
112
  # Will loop over resulted elements to get text too to make comparison even more fair otherwise Scrapling will be even faster
113
  return [
114
  element.text
115
- for element in Adaptor(request_html, auto_match=False)
116
  .find_by_text("Tipping the Velvet", first_match=True)
117
  .find_similar(ignore_attributes=["title"])
118
  ]
 
12
  from pyquery import PyQuery as pq
13
  from selectolax.parser import HTMLParser
14
 
15
+ from scrapling import Selector as ScraplingSelector
16
 
17
  large_html = (
18
  "<html><body>" + '<div class="item">' * 5000 + "</div>" * 5000 + "</body></html>"
 
73
  @benchmark
74
  def test_scrapling():
75
  # No need to do `.extract()` like parsel to extract text
76
+ # Also, this is faster than `[t.text for t in Selector(large_html, adaptive=False).css('.item')]`
77
  # for obvious reasons, of course.
78
+ return ScraplingSelector(large_html, adaptive=False).css(".item::text")
79
 
80
 
81
  @benchmark
 
112
  # Will loop over resulted elements to get text too to make comparison even more fair otherwise Scrapling will be even faster
113
  return [
114
  element.text
115
+ for element in ScraplingSelector(request_html, adaptive=False)
116
  .find_by_text("Tipping the Velvet", first_match=True)
117
  .find_similar(ignore_attributes=["title"])
118
  ]
scrapling/__init__.py CHANGED
@@ -10,12 +10,12 @@ def __getattr__(name):
10
  from scrapling.fetchers import Fetcher as cls
11
 
12
  return cls
13
- elif name == "Adaptor":
14
- from scrapling.parser import Adaptor as cls
15
 
16
  return cls
17
- elif name == "Adaptors":
18
- from scrapling.parser import Adaptors as cls
19
 
20
  return cls
21
  elif name == "AttributesHandler":
@@ -46,4 +46,4 @@ def __getattr__(name):
46
  raise AttributeError(f"module 'scrapling' has no attribute '{name}'")
47
 
48
 
49
- __all__ = ["Adaptor", "Fetcher", "AsyncFetcher", "StealthyFetcher", "DynamicFetcher"]
 
10
  from scrapling.fetchers import Fetcher as cls
11
 
12
  return cls
13
+ elif name == "Selector":
14
+ from scrapling.parser import Selector as cls
15
 
16
  return cls
17
+ elif name == "Selectors":
18
+ from scrapling.parser import Selectors as cls
19
 
20
  return cls
21
  elif name == "AttributesHandler":
 
46
  raise AttributeError(f"module 'scrapling' has no attribute '{name}'")
47
 
48
 
49
+ __all__ = ["Selector", "Fetcher", "AsyncFetcher", "StealthyFetcher", "DynamicFetcher"]
scrapling/core/ai.py CHANGED
@@ -430,7 +430,7 @@ class ScraplingMCPServer:
430
  os_randomize: bool = False,
431
  disable_ads: bool = False,
432
  geoip: bool = False,
433
- additional_arguments: Optional[Dict] = None,
434
  ) -> ResponseModel:
435
  """Use Scrapling's version of the Camoufox browser to fetch a URL and return a structured output of the result.
436
  Note: This is best suitable for high protection levels. It's slower than the other tools.
@@ -467,7 +467,7 @@ class ScraplingMCPServer:
467
  :param google_search: Enabled by default, Scrapling will set the referer header to be as if this request came from a Google search of this website's domain name.
468
  :param extra_headers: A dictionary of extra headers to add to the request. _The referer set by the `google_search` argument takes priority over the referer set here if used together._
469
  :param proxy: The proxy to be used with requests, it can be a string or a dictionary with the keys 'server', 'username', and 'password' only.
470
- :param additional_arguments: Additional arguments to be passed to Camoufox as additional settings, and it takes higher priority than Scrapling's settings.
471
  """
472
  page = await StealthyFetcher.async_fetch(
473
  url,
@@ -491,7 +491,7 @@ class ScraplingMCPServer:
491
  solve_cloudflare=solve_cloudflare,
492
  disable_resources=disable_resources,
493
  wait_selector_state=wait_selector_state,
494
- additional_arguments=additional_arguments,
495
  )
496
  return _ContentTranslator(
497
  Convertor._extract_content(
@@ -530,7 +530,7 @@ class ScraplingMCPServer:
530
  os_randomize: bool = False,
531
  disable_ads: bool = False,
532
  geoip: bool = False,
533
- additional_arguments: Optional[Dict] = None,
534
  ) -> List[ResponseModel]:
535
  """Use Scrapling's version of the Camoufox browser to fetch a group of URLs at the same time, and for each page return a structured output of the result.
536
  Note: This is best suitable for high protection levels. It's slower than the other tools.
@@ -567,7 +567,7 @@ class ScraplingMCPServer:
567
  :param google_search: Enabled by default, Scrapling will set the referer header to be as if this request came from a Google search of this website's domain name.
568
  :param extra_headers: A dictionary of extra headers to add to the request. _The referer set by the `google_search` argument takes priority over the referer set here if used together._
569
  :param proxy: The proxy to be used with requests, it can be a string or a dictionary with the keys 'server', 'username', and 'password' only.
570
- :param additional_arguments: Additional arguments to be passed to Camoufox as additional settings, and it takes higher priority than Scrapling's settings.
571
  """
572
  async with AsyncStealthySession(
573
  wait=wait,
@@ -591,7 +591,7 @@ class ScraplingMCPServer:
591
  solve_cloudflare=solve_cloudflare,
592
  disable_resources=disable_resources,
593
  wait_selector_state=wait_selector_state,
594
- additional_arguments=additional_arguments,
595
  ) as session:
596
  tasks = [session.fetch(url) for url in urls]
597
  responses = await gather(*tasks)
 
430
  os_randomize: bool = False,
431
  disable_ads: bool = False,
432
  geoip: bool = False,
433
+ additional_args: Optional[Dict] = None,
434
  ) -> ResponseModel:
435
  """Use Scrapling's version of the Camoufox browser to fetch a URL and return a structured output of the result.
436
  Note: This is best suitable for high protection levels. It's slower than the other tools.
 
467
  :param google_search: Enabled by default, Scrapling will set the referer header to be as if this request came from a Google search of this website's domain name.
468
  :param extra_headers: A dictionary of extra headers to add to the request. _The referer set by the `google_search` argument takes priority over the referer set here if used together._
469
  :param proxy: The proxy to be used with requests, it can be a string or a dictionary with the keys 'server', 'username', and 'password' only.
470
+ :param additional_args: Additional arguments to be passed to Camoufox as additional settings, and it takes higher priority than Scrapling's settings.
471
  """
472
  page = await StealthyFetcher.async_fetch(
473
  url,
 
491
  solve_cloudflare=solve_cloudflare,
492
  disable_resources=disable_resources,
493
  wait_selector_state=wait_selector_state,
494
+ additional_args=additional_args,
495
  )
496
  return _ContentTranslator(
497
  Convertor._extract_content(
 
530
  os_randomize: bool = False,
531
  disable_ads: bool = False,
532
  geoip: bool = False,
533
+ additional_args: Optional[Dict] = None,
534
  ) -> List[ResponseModel]:
535
  """Use Scrapling's version of the Camoufox browser to fetch a group of URLs at the same time, and for each page return a structured output of the result.
536
  Note: This is best suitable for high protection levels. It's slower than the other tools.
 
567
  :param google_search: Enabled by default, Scrapling will set the referer header to be as if this request came from a Google search of this website's domain name.
568
  :param extra_headers: A dictionary of extra headers to add to the request. _The referer set by the `google_search` argument takes priority over the referer set here if used together._
569
  :param proxy: The proxy to be used with requests, it can be a string or a dictionary with the keys 'server', 'username', and 'password' only.
570
+ :param additional_args: Additional arguments to be passed to Camoufox as additional settings, and it takes higher priority than Scrapling's settings.
571
  """
572
  async with AsyncStealthySession(
573
  wait=wait,
 
591
  solve_cloudflare=solve_cloudflare,
592
  disable_resources=disable_resources,
593
  wait_selector_state=wait_selector_state,
594
+ additional_args=additional_args,
595
  ) as session:
596
  tasks = [session.fetch(url) for url in urls]
597
  responses = await gather(*tasks)
scrapling/core/shell.py CHANGED
@@ -27,7 +27,7 @@ from orjson import loads as json_loads, JSONDecodeError
27
  from scrapling import __version__
28
  from scrapling.core.custom_types import TextHandler
29
  from scrapling.core.utils import log
30
- from scrapling.parser import Adaptor, Adaptors
31
  from scrapling.core._types import (
32
  List,
33
  Optional,
@@ -399,9 +399,9 @@ class CurlParser:
399
  return None
400
 
401
 
402
- def show_page_in_browser(page: Adaptor):
403
- if not page or not isinstance(page, Adaptor):
404
- log.error("Input must be of type `Adaptor`")
405
  return
406
 
407
  try:
@@ -421,7 +421,7 @@ class CustomShell:
421
  def __init__(self, code, log_level="debug"):
422
  self.code = code
423
  self.page = None
424
- self.pages = Adaptors([])
425
  self._curl_parser = CurlParser()
426
  log_level = log_level.strip().lower()
427
 
@@ -457,7 +457,7 @@ class CustomShell:
457
  - Fetcher/AsyncFetcher
458
  - DynamicFetcher
459
  - StealthyFetcher
460
- - Adaptor
461
 
462
  -> Useful shortcuts:
463
  - {"get":<30} Shortcut for `Fetcher.get`
@@ -469,7 +469,7 @@ class CustomShell:
469
 
470
  -> Useful commands
471
  - {"page / response":<30} The response object of the last page you fetched
472
- - {"pages":<30} Adaptors object of the last 5 response objects you fetched
473
  - {"uncurl('curl_command')":<30} Convert curl command to a Request object. (Optimized to handle curl commands copied from DevTools network tab.)
474
  - {"curl2fetcher('curl_command')":<30} Convert curl command and make the request with Fetcher. (Optimized to handle curl commands copied from DevTools network tab.)
475
  - {"view(page)":<30} View page in a browser
@@ -481,7 +481,7 @@ Type 'exit' or press Ctrl+D to exit.
481
  def update_page(self, result):
482
  """Update the current page and add to pages history"""
483
  self.page = result
484
- if isinstance(result, (Response, Adaptor)):
485
  self.pages.append(result)
486
  if len(self.pages) > 5:
487
  self.pages.pop(0) # Remove oldest item
@@ -528,7 +528,7 @@ Type 'exit' or press Ctrl+D to exit.
528
  "DynamicFetcher": DynamicFetcher,
529
  "stealthy_fetch": stealthy_fetch,
530
  "StealthyFetcher": StealthyFetcher,
531
- "Adaptor": Adaptor,
532
  "page": self.page,
533
  "response": self.page,
534
  "pages": self.pages,
@@ -586,14 +586,14 @@ class Convertor:
586
  @classmethod
587
  def _extract_content(
588
  cls,
589
- page: Adaptor,
590
  extraction_type: extraction_types = "markdown",
591
  css_selector: Optional[str] = None,
592
  main_content_only: bool = False,
593
  ) -> Generator[str, None, None]:
594
- """Extract the content of an Adaptor"""
595
- if not page or not isinstance(page, Adaptor):
596
- raise TypeError("Input must be of type `Adaptor`")
597
  elif not extraction_type or extraction_type not in cls._extension_map.values():
598
  raise ValueError(f"Unknown extraction type: {extraction_type}")
599
  else:
@@ -622,11 +622,11 @@ class Convertor:
622
 
623
  @classmethod
624
  def write_content_to_file(
625
- cls, page: Adaptor, filename: str, css_selector: Optional[str] = None
626
  ) -> None:
627
- """Write an Adaptor's content to a file"""
628
- if not page or not isinstance(page, Adaptor):
629
- raise TypeError("Input must be of type `Adaptor`")
630
  elif not filename or not isinstance(filename, str) or not filename.strip():
631
  raise ValueError("Filename must be provided")
632
  elif not filename.endswith((".md", ".html", ".txt")):
 
27
  from scrapling import __version__
28
  from scrapling.core.custom_types import TextHandler
29
  from scrapling.core.utils import log
30
+ from scrapling.parser import Selector, Selectors
31
  from scrapling.core._types import (
32
  List,
33
  Optional,
 
399
  return None
400
 
401
 
402
+ def show_page_in_browser(page: Selector):
403
+ if not page or not isinstance(page, Selector):
404
+ log.error("Input must be of type `Selector`")
405
  return
406
 
407
  try:
 
421
  def __init__(self, code, log_level="debug"):
422
  self.code = code
423
  self.page = None
424
+ self.pages = Selectors([])
425
  self._curl_parser = CurlParser()
426
  log_level = log_level.strip().lower()
427
 
 
457
  - Fetcher/AsyncFetcher
458
  - DynamicFetcher
459
  - StealthyFetcher
460
+ - Selector
461
 
462
  -> Useful shortcuts:
463
  - {"get":<30} Shortcut for `Fetcher.get`
 
469
 
470
  -> Useful commands
471
  - {"page / response":<30} The response object of the last page you fetched
472
+ - {"pages":<30} Selectors object of the last 5 response objects you fetched
473
  - {"uncurl('curl_command')":<30} Convert curl command to a Request object. (Optimized to handle curl commands copied from DevTools network tab.)
474
  - {"curl2fetcher('curl_command')":<30} Convert curl command and make the request with Fetcher. (Optimized to handle curl commands copied from DevTools network tab.)
475
  - {"view(page)":<30} View page in a browser
 
481
  def update_page(self, result):
482
  """Update the current page and add to pages history"""
483
  self.page = result
484
+ if isinstance(result, (Response, Selector)):
485
  self.pages.append(result)
486
  if len(self.pages) > 5:
487
  self.pages.pop(0) # Remove oldest item
 
528
  "DynamicFetcher": DynamicFetcher,
529
  "stealthy_fetch": stealthy_fetch,
530
  "StealthyFetcher": StealthyFetcher,
531
+ "Selector": Selector,
532
  "page": self.page,
533
  "response": self.page,
534
  "pages": self.pages,
 
586
  @classmethod
587
  def _extract_content(
588
  cls,
589
+ page: Selector,
590
  extraction_type: extraction_types = "markdown",
591
  css_selector: Optional[str] = None,
592
  main_content_only: bool = False,
593
  ) -> Generator[str, None, None]:
594
+ """Extract the content of a Selector"""
595
+ if not page or not isinstance(page, Selector):
596
+ raise TypeError("Input must be of type `Selector`")
597
  elif not extraction_type or extraction_type not in cls._extension_map.values():
598
  raise ValueError(f"Unknown extraction type: {extraction_type}")
599
  else:
 
622
 
623
  @classmethod
624
  def write_content_to_file(
625
+ cls, page: Selector, filename: str, css_selector: Optional[str] = None
626
  ) -> None:
627
+ """Write a Selector's content to a file"""
628
+ if not page or not isinstance(page, Selector):
629
+ raise TypeError("Input must be of type `Selector`")
630
  elif not filename or not isinstance(filename, str) or not filename.strip():
631
  raise ValueError("Filename must be provided")
632
  elif not filename.endswith((".md", ".html", ".txt")):
scrapling/core/{storage_adaptors.py → storage.py} RENAMED
File without changes
scrapling/engines/_browsers/_camoufox.py CHANGED
@@ -70,8 +70,8 @@ class StealthySession:
70
  "os_randomize",
71
  "disable_ads",
72
  "geoip",
73
- "adaptor_arguments",
74
- "additional_arguments",
75
  "playwright",
76
  "browser",
77
  "context",
@@ -105,8 +105,8 @@ class StealthySession:
105
  os_randomize: bool = False,
106
  disable_ads: bool = False,
107
  geoip: bool = False,
108
- adaptor_arguments: Optional[Dict] = None,
109
- additional_arguments: Optional[Dict] = None,
110
  ):
111
  """A Browser session manager with page pooling
112
 
@@ -136,8 +136,8 @@ class StealthySession:
136
  :param extra_headers: A dictionary of extra headers to add to the request. _The referer set by the `google_search` argument takes priority over the referer set here if used together._
137
  :param proxy: The proxy to be used with requests, it can be a string or a dictionary with the keys 'server', 'username', and 'password' only.
138
  :param max_pages: The maximum number of tabs to be opened at the same time. It will be used in rotation through a PagePool.
139
- :param adaptor_arguments: The arguments that will be passed in the end while creating the final Adaptor's class.
140
- :param additional_arguments: Additional arguments to be passed to Camoufox as additional settings, and it takes higher priority than Scrapling's settings.
141
  """
142
 
143
  params = {
@@ -163,8 +163,8 @@ class StealthySession:
163
  "os_randomize": os_randomize,
164
  "disable_ads": disable_ads,
165
  "geoip": geoip,
166
- "adaptor_arguments": adaptor_arguments,
167
- "additional_arguments": additional_arguments,
168
  }
169
  config = validate(params, CamoufoxConfig)
170
 
@@ -190,14 +190,14 @@ class StealthySession:
190
  self.os_randomize = config.os_randomize
191
  self.disable_ads = config.disable_ads
192
  self.geoip = config.geoip
193
- self.adaptor_arguments = config.adaptor_arguments
194
- self.additional_arguments = config.additional_arguments
195
 
196
  self.playwright: Optional[Playwright] = None
197
  self.context: Optional[BrowserContext] = None
198
  self.page_pool = PagePool(self.max_pages)
199
  self._closed = False
200
- self.adaptor_arguments = config.adaptor_arguments
201
  self.page_action = config.page_action
202
  self._headers_keys = (
203
  set(map(str.lower, self.extra_headers.keys()))
@@ -223,7 +223,7 @@ class StealthySession:
223
  "block_images": self.block_images, # Careful! it makes some websites don't finish loading at all like stackoverflow even in headful mode.
224
  "os": None if self.os_randomize else get_os_name(),
225
  "user_data_dir": "",
226
- **self.additional_arguments,
227
  }
228
  )
229
 
@@ -433,7 +433,7 @@ class StealthySession:
433
 
434
  page_info.page.wait_for_timeout(self.wait)
435
  response = ResponseFactory.from_playwright_response(
436
- page_info.page, first_response, final_response, self.adaptor_arguments
437
  )
438
 
439
  # Mark the page as ready for next use
@@ -482,8 +482,8 @@ class AsyncStealthySession(StealthySession):
482
  os_randomize: bool = False,
483
  disable_ads: bool = False,
484
  geoip: bool = False,
485
- adaptor_arguments: Optional[Dict] = None,
486
- additional_arguments: Optional[Dict] = None,
487
  ):
488
  """A Browser session manager with page pooling
489
 
@@ -513,8 +513,8 @@ class AsyncStealthySession(StealthySession):
513
  :param extra_headers: A dictionary of extra headers to add to the request. _The referer set by the `google_search` argument takes priority over the referer set here if used together._
514
  :param proxy: The proxy to be used with requests, it can be a string or a dictionary with the keys 'server', 'username', and 'password' only.
515
  :param max_pages: The maximum number of tabs to be opened at the same time. It will be used in rotation through a PagePool.
516
- :param adaptor_arguments: The arguments that will be passed in the end while creating the final Adaptor's class.
517
- :param additional_arguments: Additional arguments to be passed to Camoufox as additional settings, and it takes higher priority than Scrapling's settings.
518
  """
519
  super().__init__(
520
  max_pages,
@@ -539,8 +539,8 @@ class AsyncStealthySession(StealthySession):
539
  os_randomize,
540
  disable_ads,
541
  geoip,
542
- adaptor_arguments,
543
- additional_arguments,
544
  )
545
  self.playwright: Optional[AsyncPlaywright] = None
546
  self.context: Optional[AsyncBrowserContext] = None
@@ -731,7 +731,7 @@ class AsyncStealthySession(StealthySession):
731
 
732
  # Create response object
733
  response = await ResponseFactory.from_async_playwright_response(
734
- page_info.page, first_response, final_response, self.adaptor_arguments
735
  )
736
 
737
  # Mark the page as ready for next use
 
70
  "os_randomize",
71
  "disable_ads",
72
  "geoip",
73
+ "selector_config",
74
+ "additional_args",
75
  "playwright",
76
  "browser",
77
  "context",
 
105
  os_randomize: bool = False,
106
  disable_ads: bool = False,
107
  geoip: bool = False,
108
+ selector_config: Optional[Dict] = None,
109
+ additional_args: Optional[Dict] = None,
110
  ):
111
  """A Browser session manager with page pooling
112
 
 
136
  :param extra_headers: A dictionary of extra headers to add to the request. _The referer set by the `google_search` argument takes priority over the referer set here if used together._
137
  :param proxy: The proxy to be used with requests, it can be a string or a dictionary with the keys 'server', 'username', and 'password' only.
138
  :param max_pages: The maximum number of tabs to be opened at the same time. It will be used in rotation through a PagePool.
139
+ :param selector_config: The arguments that will be passed in the end while creating the final Selector's class.
140
+ :param additional_args: Additional arguments to be passed to Camoufox as additional settings, and it takes higher priority than Scrapling's settings.
141
  """
142
 
143
  params = {
 
163
  "os_randomize": os_randomize,
164
  "disable_ads": disable_ads,
165
  "geoip": geoip,
166
+ "selector_config": selector_config,
167
+ "additional_args": additional_args,
168
  }
169
  config = validate(params, CamoufoxConfig)
170
 
 
190
  self.os_randomize = config.os_randomize
191
  self.disable_ads = config.disable_ads
192
  self.geoip = config.geoip
193
+ self.selector_config = config.selector_config
194
+ self.additional_args = config.additional_args
195
 
196
  self.playwright: Optional[Playwright] = None
197
  self.context: Optional[BrowserContext] = None
198
  self.page_pool = PagePool(self.max_pages)
199
  self._closed = False
200
+ self.selector_config = config.selector_config
201
  self.page_action = config.page_action
202
  self._headers_keys = (
203
  set(map(str.lower, self.extra_headers.keys()))
 
223
  "block_images": self.block_images, # Careful! it makes some websites don't finish loading at all like stackoverflow even in headful mode.
224
  "os": None if self.os_randomize else get_os_name(),
225
  "user_data_dir": "",
226
+ **self.additional_args,
227
  }
228
  )
229
 
 
433
 
434
  page_info.page.wait_for_timeout(self.wait)
435
  response = ResponseFactory.from_playwright_response(
436
+ page_info.page, first_response, final_response, self.selector_config
437
  )
438
 
439
  # Mark the page as ready for next use
 
482
  os_randomize: bool = False,
483
  disable_ads: bool = False,
484
  geoip: bool = False,
485
+ selector_config: Optional[Dict] = None,
486
+ additional_args: Optional[Dict] = None,
487
  ):
488
  """A Browser session manager with page pooling
489
 
 
513
  :param extra_headers: A dictionary of extra headers to add to the request. _The referer set by the `google_search` argument takes priority over the referer set here if used together._
514
  :param proxy: The proxy to be used with requests, it can be a string or a dictionary with the keys 'server', 'username', and 'password' only.
515
  :param max_pages: The maximum number of tabs to be opened at the same time. It will be used in rotation through a PagePool.
516
+ :param selector_config: The arguments that will be passed in the end while creating the final Selector's class.
517
+ :param additional_args: Additional arguments to be passed to Camoufox as additional settings, and it takes higher priority than Scrapling's settings.
518
  """
519
  super().__init__(
520
  max_pages,
 
539
  os_randomize,
540
  disable_ads,
541
  geoip,
542
+ selector_config,
543
+ additional_args,
544
  )
545
  self.playwright: Optional[AsyncPlaywright] = None
546
  self.context: Optional[AsyncBrowserContext] = None
 
731
 
732
  # Create response object
733
  response = await ResponseFactory.from_async_playwright_response(
734
+ page_info.page, first_response, final_response, self.selector_config
735
  )
736
 
737
  # Mark the page as ready for next use
scrapling/engines/_browsers/_controllers.py CHANGED
@@ -70,7 +70,7 @@ class DynamicSession:
70
  "context",
71
  "page_pool",
72
  "_closed",
73
- "adaptor_arguments",
74
  "page_action",
75
  "launch_options",
76
  "context_options",
@@ -100,7 +100,7 @@ class DynamicSession:
100
  cookies: Optional[List[Dict]] = None,
101
  network_idle: bool = False,
102
  wait_selector_state: SelectorWaitStates = "attached",
103
- adaptor_arguments: Optional[Dict] = None,
104
  ):
105
  """A Browser session manager with page pooling, it's using a persistent browser Context by default with a temporary user profile directory.
106
 
@@ -125,7 +125,7 @@ class DynamicSession:
125
  :param google_search: Enabled by default, Scrapling will set the referer header to be as if this request came from a Google search of this website's domain name.
126
  :param extra_headers: A dictionary of extra headers to add to the request. _The referer set by the `google_search` argument takes priority over the referer set here if used together._
127
  :param proxy: The proxy to be used with requests, it can be a string or a dictionary with the keys 'server', 'username', and 'password' only.
128
- :param adaptor_arguments: The arguments that will be passed in the end while creating the final Adaptor's class.
129
  """
130
 
131
  params = {
@@ -143,7 +143,7 @@ class DynamicSession:
143
  "extra_headers": extra_headers,
144
  "useragent": useragent,
145
  "timeout": timeout,
146
- "adaptor_arguments": adaptor_arguments,
147
  "disable_resources": disable_resources,
148
  "wait_selector": wait_selector,
149
  "cookies": cookies,
@@ -177,7 +177,7 @@ class DynamicSession:
177
  self.context: Optional[BrowserContext] = None
178
  self.page_pool = PagePool(self.max_pages)
179
  self._closed = False
180
- self.adaptor_arguments = config.adaptor_arguments
181
  self.page_action = config.page_action
182
  self._headers_keys = (
183
  set(map(str.lower, self.extra_headers.keys()))
@@ -370,7 +370,7 @@ class DynamicSession:
370
 
371
  # Create response object
372
  response = ResponseFactory.from_playwright_response(
373
- page_info.page, first_response, final_response, self.adaptor_arguments
374
  )
375
 
376
  # Mark the page as ready for next use
@@ -417,7 +417,7 @@ class AsyncDynamicSession(DynamicSession):
417
  cookies: Optional[List[Dict]] = None,
418
  network_idle: bool = False,
419
  wait_selector_state: SelectorWaitStates = "attached",
420
- adaptor_arguments: Optional[Dict] = None,
421
  ):
422
  """A Browser session manager with page pooling
423
 
@@ -443,7 +443,7 @@ class AsyncDynamicSession(DynamicSession):
443
  :param extra_headers: A dictionary of extra headers to add to the request. _The referer set by the `google_search` argument takes priority over the referer set here if used together._
444
  :param proxy: The proxy to be used with requests, it can be a string or a dictionary with the keys 'server', 'username', and 'password' only.
445
  :param max_pages: The maximum number of tabs to be opened at the same time. It will be used in rotation through a PagePool.
446
- :param adaptor_arguments: The arguments that will be passed in the end while creating the final Adaptor's class.
447
  """
448
 
449
  super().__init__(
@@ -467,7 +467,7 @@ class AsyncDynamicSession(DynamicSession):
467
  cookies,
468
  network_idle,
469
  wait_selector_state,
470
- adaptor_arguments,
471
  )
472
 
473
  self.playwright: Optional[AsyncPlaywright] = None
@@ -623,7 +623,7 @@ class AsyncDynamicSession(DynamicSession):
623
 
624
  # Create response object
625
  response = await ResponseFactory.from_async_playwright_response(
626
- page_info.page, first_response, final_response, self.adaptor_arguments
627
  )
628
 
629
  # Mark the page as ready for next use
 
70
  "context",
71
  "page_pool",
72
  "_closed",
73
+ "selector_config",
74
  "page_action",
75
  "launch_options",
76
  "context_options",
 
100
  cookies: Optional[List[Dict]] = None,
101
  network_idle: bool = False,
102
  wait_selector_state: SelectorWaitStates = "attached",
103
+ selector_config: Optional[Dict] = None,
104
  ):
105
  """A Browser session manager with page pooling, it's using a persistent browser Context by default with a temporary user profile directory.
106
 
 
125
  :param google_search: Enabled by default, Scrapling will set the referer header to be as if this request came from a Google search of this website's domain name.
126
  :param extra_headers: A dictionary of extra headers to add to the request. _The referer set by the `google_search` argument takes priority over the referer set here if used together._
127
  :param proxy: The proxy to be used with requests, it can be a string or a dictionary with the keys 'server', 'username', and 'password' only.
128
+ :param selector_config: The arguments that will be passed in the end while creating the final Selector's class.
129
  """
130
 
131
  params = {
 
143
  "extra_headers": extra_headers,
144
  "useragent": useragent,
145
  "timeout": timeout,
146
+ "selector_config": selector_config,
147
  "disable_resources": disable_resources,
148
  "wait_selector": wait_selector,
149
  "cookies": cookies,
 
177
  self.context: Optional[BrowserContext] = None
178
  self.page_pool = PagePool(self.max_pages)
179
  self._closed = False
180
+ self.selector_config = config.selector_config
181
  self.page_action = config.page_action
182
  self._headers_keys = (
183
  set(map(str.lower, self.extra_headers.keys()))
 
370
 
371
  # Create response object
372
  response = ResponseFactory.from_playwright_response(
373
+ page_info.page, first_response, final_response, self.selector_config
374
  )
375
 
376
  # Mark the page as ready for next use
 
417
  cookies: Optional[List[Dict]] = None,
418
  network_idle: bool = False,
419
  wait_selector_state: SelectorWaitStates = "attached",
420
+ selector_config: Optional[Dict] = None,
421
  ):
422
  """A Browser session manager with page pooling
423
 
 
443
  :param extra_headers: A dictionary of extra headers to add to the request. _The referer set by the `google_search` argument takes priority over the referer set here if used together._
444
  :param proxy: The proxy to be used with requests, it can be a string or a dictionary with the keys 'server', 'username', and 'password' only.
445
  :param max_pages: The maximum number of tabs to be opened at the same time. It will be used in rotation through a PagePool.
446
+ :param selector_config: The arguments that will be passed in the end while creating the final Selector's class.
447
  """
448
 
449
  super().__init__(
 
467
  cookies,
468
  network_idle,
469
  wait_selector_state,
470
+ selector_config,
471
  )
472
 
473
  self.playwright: Optional[AsyncPlaywright] = None
 
623
 
624
  # Create response object
625
  response = await ResponseFactory.from_async_playwright_response(
626
+ page_info.page, first_response, final_response, self.selector_config
627
  )
628
 
629
  # Mark the page as ready for next use
scrapling/engines/_browsers/_validators.py CHANGED
@@ -39,7 +39,7 @@ class PlaywrightConfig(Struct, kw_only=True, frozen=False):
39
  cookies: Optional[List[Dict]] = None
40
  network_idle: bool = False
41
  wait_selector_state: SelectorWaitStates = "attached"
42
- adaptor_arguments: Optional[Dict] = None
43
 
44
  def __post_init__(self):
45
  """Custom validation after msgspec validation"""
@@ -57,8 +57,8 @@ class PlaywrightConfig(Struct, kw_only=True, frozen=False):
57
  self.__validate_cdp(self.cdp_url)
58
  if not self.cookies:
59
  self.cookies = []
60
- if not self.adaptor_arguments:
61
- self.adaptor_arguments = {}
62
 
63
  @staticmethod
64
  def __validate_cdp(cdp_url):
@@ -105,8 +105,8 @@ class CamoufoxConfig(Struct, kw_only=True, frozen=False):
105
  os_randomize: bool = False
106
  disable_ads: bool = False
107
  geoip: bool = False
108
- adaptor_arguments: Optional[Dict] = None
109
- additional_arguments: Optional[Dict] = None
110
 
111
  def __post_init__(self):
112
  """Custom validation after msgspec validation"""
@@ -136,10 +136,10 @@ class CamoufoxConfig(Struct, kw_only=True, frozen=False):
136
  self.cookies = []
137
  if self.solve_cloudflare and self.timeout < 60_000:
138
  self.timeout = 60_000
139
- if not self.adaptor_arguments:
140
- self.adaptor_arguments = {}
141
- if not self.additional_arguments:
142
- self.additional_arguments = {}
143
 
144
 
145
  def validate(params, model):
 
39
  cookies: Optional[List[Dict]] = None
40
  network_idle: bool = False
41
  wait_selector_state: SelectorWaitStates = "attached"
42
+ selector_config: Optional[Dict] = None
43
 
44
  def __post_init__(self):
45
  """Custom validation after msgspec validation"""
 
57
  self.__validate_cdp(self.cdp_url)
58
  if not self.cookies:
59
  self.cookies = []
60
+ if not self.selector_config:
61
+ self.selector_config = {}
62
 
63
  @staticmethod
64
  def __validate_cdp(cdp_url):
 
105
  os_randomize: bool = False
106
  disable_ads: bool = False
107
  geoip: bool = False
108
+ selector_config: Optional[Dict] = None
109
+ additional_args: Optional[Dict] = None
110
 
111
  def __post_init__(self):
112
  """Custom validation after msgspec validation"""
 
136
  self.cookies = []
137
  if self.solve_cloudflare and self.timeout < 60_000:
138
  self.timeout = 60_000
139
+ if not self.selector_config:
140
+ self.selector_config = {}
141
+ if not self.additional_args:
142
+ self.additional_args = {}
143
 
144
 
145
  def validate(params, model):
scrapling/engines/static.py CHANGED
@@ -63,7 +63,7 @@ class FetcherSession:
63
  max_redirects: int = 30,
64
  verify: bool = True,
65
  cert: Optional[Union[str, Tuple[str, str]]] = None,
66
- adaptor_arguments: Optional[Dict] = None,
67
  ):
68
  """
69
  :param impersonate: Browser version to impersonate. Automatically defaults to the latest available Chrome version.
@@ -81,7 +81,7 @@ class FetcherSession:
81
  :param max_redirects: Maximum number of redirects. Default 30, use -1 for unlimited.
82
  :param verify: Whether to verify HTTPS certificates. Defaults to True.
83
  :param cert: Tuple of (cert, key) filenames for the client certificate.
84
- :param adaptor_arguments: Arguments passed when creating the final Adaptor class.
85
  """
86
  self.default_impersonate = impersonate
87
  self.stealth = stealthy_headers
@@ -97,7 +97,7 @@ class FetcherSession:
97
  self.default_verify = verify
98
  self.default_cert = cert
99
  self.default_http3 = http3
100
- self.adaptor_arguments = adaptor_arguments or {}
101
 
102
  self._curl_session: Optional[CurlSession] = None
103
  self._async_curl_session: Optional[AsyncCurlSession] = None
@@ -260,7 +260,7 @@ class FetcherSession:
260
  request_args: Dict[str, Any],
261
  max_retries: int,
262
  retry_delay: int,
263
- adaptor_arguments: Optional[Dict] = None,
264
  ) -> Response:
265
  """
266
  Perform an HTTP request using the configured session.
@@ -270,7 +270,7 @@ class FetcherSession:
270
  :param request_args: Arguments to be passed to the session's `request()` method.
271
  :param max_retries: Maximum number of retries for the request.
272
  :param retry_delay: Number of seconds to wait between retries.
273
- :param adaptor_arguments: Arguments passed when creating the final Adaptor class.
274
  :return: A `Response` object for synchronous requests or an awaitable for asynchronous.
275
  """
276
  session = self._curl_session
@@ -286,9 +286,7 @@ class FetcherSession:
286
  try:
287
  response = session.request(method, **request_args)
288
  # response.raise_for_status() # Retry responses with a status code between 200-400
289
- return ResponseFactory.from_http_request(
290
- response, adaptor_arguments
291
- )
292
  except CurlError as e:
293
  if attempt < max_retries - 1:
294
  log.error(
@@ -307,7 +305,7 @@ class FetcherSession:
307
  request_args: Dict[str, Any],
308
  max_retries: int,
309
  retry_delay: int,
310
- adaptor_arguments: Optional[Dict] = None,
311
  ) -> Response:
312
  """
313
  Perform an HTTP request using the configured session.
@@ -317,7 +315,7 @@ class FetcherSession:
317
  :param request_args: Arguments to be passed to the session's `request()` method.
318
  :param max_retries: Maximum number of retries for the request.
319
  :param retry_delay: Number of seconds to wait between retries.
320
- :param adaptor_arguments: Arguments passed when creating the final Adaptor class.
321
  :return: A `Response` object for synchronous requests or an awaitable for asynchronous.
322
  """
323
  session = self._async_curl_session
@@ -335,9 +333,7 @@ class FetcherSession:
335
  try:
336
  response = await session.request(method, **request_args)
337
  # response.raise_for_status() # Retry responses with a status code between 200-400
338
- return ResponseFactory.from_http_request(
339
- response, adaptor_arguments
340
- )
341
  except CurlError as e:
342
  if attempt < max_retries - 1:
343
  log.error(
@@ -373,9 +369,7 @@ class FetcherSession:
373
  """
374
  stealth = self.stealth if stealth is None else stealth
375
 
376
- adaptor_arguments = (
377
- kwargs.pop("adaptor_arguments", {}) or self.adaptor_arguments
378
- )
379
  max_retries = self.get_with_precedence(kwargs, "retries", self.default_retries)
380
  retry_delay = self.get_with_precedence(
381
  kwargs, "retry_delay", self.default_retry_delay
@@ -383,12 +377,12 @@ class FetcherSession:
383
  request_args = self._merge_request_args(stealth=stealth, **kwargs)
384
  if self._curl_session:
385
  return self.__make_request(
386
- method, request_args, max_retries, retry_delay, adaptor_arguments
387
  )
388
  elif self._async_curl_session:
389
  # The returned value is a Coroutine
390
  return self.__make_async_request(
391
- method, request_args, max_retries, retry_delay, adaptor_arguments
392
  )
393
 
394
  raise RuntimeError("No active session available.")
 
63
  max_redirects: int = 30,
64
  verify: bool = True,
65
  cert: Optional[Union[str, Tuple[str, str]]] = None,
66
+ selector_config: Optional[Dict] = None,
67
  ):
68
  """
69
  :param impersonate: Browser version to impersonate. Automatically defaults to the latest available Chrome version.
 
81
  :param max_redirects: Maximum number of redirects. Default 30, use -1 for unlimited.
82
  :param verify: Whether to verify HTTPS certificates. Defaults to True.
83
  :param cert: Tuple of (cert, key) filenames for the client certificate.
84
+ :param selector_config: Arguments passed when creating the final Selector class.
85
  """
86
  self.default_impersonate = impersonate
87
  self.stealth = stealthy_headers
 
97
  self.default_verify = verify
98
  self.default_cert = cert
99
  self.default_http3 = http3
100
+ self.selector_config = selector_config or {}
101
 
102
  self._curl_session: Optional[CurlSession] = None
103
  self._async_curl_session: Optional[AsyncCurlSession] = None
 
260
  request_args: Dict[str, Any],
261
  max_retries: int,
262
  retry_delay: int,
263
+ selector_config: Optional[Dict] = None,
264
  ) -> Response:
265
  """
266
  Perform an HTTP request using the configured session.
 
270
  :param request_args: Arguments to be passed to the session's `request()` method.
271
  :param max_retries: Maximum number of retries for the request.
272
  :param retry_delay: Number of seconds to wait between retries.
273
+ :param selector_config: Arguments passed when creating the final Selector class.
274
  :return: A `Response` object for synchronous requests or an awaitable for asynchronous.
275
  """
276
  session = self._curl_session
 
286
  try:
287
  response = session.request(method, **request_args)
288
  # response.raise_for_status() # Retry responses with a status code between 200-400
289
+ return ResponseFactory.from_http_request(response, selector_config)
 
 
290
  except CurlError as e:
291
  if attempt < max_retries - 1:
292
  log.error(
 
305
  request_args: Dict[str, Any],
306
  max_retries: int,
307
  retry_delay: int,
308
+ selector_config: Optional[Dict] = None,
309
  ) -> Response:
310
  """
311
  Perform an HTTP request using the configured session.
 
315
  :param request_args: Arguments to be passed to the session's `request()` method.
316
  :param max_retries: Maximum number of retries for the request.
317
  :param retry_delay: Number of seconds to wait between retries.
318
+ :param selector_config: Arguments passed when creating the final Selector class.
319
  :return: A `Response` object for synchronous requests or an awaitable for asynchronous.
320
  """
321
  session = self._async_curl_session
 
333
  try:
334
  response = await session.request(method, **request_args)
335
  # response.raise_for_status() # Retry responses with a status code between 200-400
336
+ return ResponseFactory.from_http_request(response, selector_config)
 
 
337
  except CurlError as e:
338
  if attempt < max_retries - 1:
339
  log.error(
 
369
  """
370
  stealth = self.stealth if stealth is None else stealth
371
 
372
+ selector_config = kwargs.pop("selector_config", {}) or self.selector_config
 
 
373
  max_retries = self.get_with_precedence(kwargs, "retries", self.default_retries)
374
  retry_delay = self.get_with_precedence(
375
  kwargs, "retry_delay", self.default_retry_delay
 
377
  request_args = self._merge_request_args(stealth=stealth, **kwargs)
378
  if self._curl_session:
379
  return self.__make_request(
380
+ method, request_args, max_retries, retry_delay, selector_config
381
  )
382
  elif self._async_curl_session:
383
  # The returned value is a Coroutine
384
  return self.__make_async_request(
385
+ method, request_args, max_retries, retry_delay, selector_config
386
  )
387
 
388
  raise RuntimeError("No active session available.")
scrapling/engines/toolbelt/convertor.py CHANGED
@@ -239,7 +239,7 @@ class ResponseFactory:
239
 
240
  :param response: `curl_cffi` response object
241
  :param parser_arguments: Additional arguments to be passed to the `Response` object constructor.
242
- :return: A `Response` object that is the same as `Adaptor` object except it has these added attributes: `status`, `reason`, `cookies`, `headers`, and `request_headers`
243
  """
244
  return Response(
245
  url=response.url,
 
239
 
240
  :param response: `curl_cffi` response object
241
  :param parser_arguments: Additional arguments to be passed to the `Response` object constructor.
242
+ :return: A `Response` object that is the same as `Selector` object except it has these added attributes: `status`, `reason`, `cookies`, `headers`, and `request_headers`
243
  """
244
  return Response(
245
  url=response.url,
scrapling/engines/toolbelt/custom.py CHANGED
@@ -15,7 +15,7 @@ from scrapling.core._types import (
15
  )
16
  from scrapling.core.custom_types import MappingProxyType
17
  from scrapling.core.utils import log, lru_cache
18
- from scrapling.parser import Adaptor, SQLiteStorageSystem
19
 
20
 
21
  class ResponseEncoding:
@@ -97,7 +97,7 @@ class ResponseEncoding:
97
  return cls.__DEFAULT_ENCODING
98
 
99
 
100
- class Response(Adaptor):
101
  """This class is returned by all engines as a way to unify response type between different libraries."""
102
 
103
  def __init__(
@@ -113,9 +113,9 @@ class Response(Adaptor):
113
  encoding: str = "utf-8",
114
  method: str = "GET",
115
  history: List = None,
116
- **adaptor_arguments: Dict,
117
  ):
118
- automatch_domain = adaptor_arguments.pop("automatch_domain", None)
119
  self.status = status
120
  self.reason = reason
121
  self.cookies = cookies
@@ -126,12 +126,10 @@ class Response(Adaptor):
126
  super().__init__(
127
  text=text,
128
  body=body,
129
- url=automatch_domain or url,
130
  encoding=encoding,
131
- **adaptor_arguments,
132
  )
133
- # For backward compatibility
134
- self.adaptor = self
135
  # For easier debugging while working from a Python shell
136
  log.info(
137
  f"Fetched ({status}) <{method} {url}> (referer: {request_headers.get('referer')})"
@@ -144,20 +142,20 @@ class Response(Adaptor):
144
  class BaseFetcher:
145
  __slots__ = ()
146
  huge_tree: bool = True
147
- auto_match: Optional[bool] = False
148
  storage: Any = SQLiteStorageSystem
149
  keep_cdata: Optional[bool] = False
150
  storage_args: Optional[Dict] = None
151
  keep_comments: Optional[bool] = False
152
- automatch_domain: Optional[str] = None
153
  parser_keywords: Tuple = (
154
  "huge_tree",
155
- "auto_match",
156
  "storage",
157
  "keep_cdata",
158
  "storage_args",
159
  "keep_comments",
160
- "automatch_domain",
161
  ) # Left open for the user
162
 
163
  def __init__(self, *args, **kwargs):
@@ -178,17 +176,17 @@ class BaseFetcher:
178
  huge_tree=cls.huge_tree,
179
  keep_comments=cls.keep_comments,
180
  keep_cdata=cls.keep_cdata,
181
- auto_match=cls.auto_match,
182
  storage=cls.storage,
183
  storage_args=cls.storage_args,
184
- automatch_domain=cls.automatch_domain,
185
  )
186
 
187
  @classmethod
188
  def configure(cls, **kwargs):
189
  """Set multiple arguments for the parser at once globally
190
 
191
- :param kwargs: The keywords can be any arguments of the following: huge_tree, keep_comments, keep_cdata, auto_match, storage, storage_args, automatch_domain
192
  """
193
  for key, value in kwargs.items():
194
  key = key.strip().lower()
@@ -212,23 +210,23 @@ class BaseFetcher:
212
 
213
  @classmethod
214
  def _generate_parser_arguments(cls) -> Dict:
215
- # Adaptor class parameters
216
- # I won't validate Adaptor's class parameters here again, I will leave it to be validated later
217
  parser_arguments = dict(
218
  huge_tree=cls.huge_tree,
219
  keep_comments=cls.keep_comments,
220
  keep_cdata=cls.keep_cdata,
221
- auto_match=cls.auto_match,
222
  storage=cls.storage,
223
  storage_args=cls.storage_args,
224
  )
225
- if cls.automatch_domain:
226
- if type(cls.automatch_domain) is not str:
227
  log.warning(
228
- '[Ignored] The argument "automatch_domain" must be of string type'
229
  )
230
  else:
231
- parser_arguments.update({"automatch_domain": cls.automatch_domain})
232
 
233
  return parser_arguments
234
 
 
15
  )
16
  from scrapling.core.custom_types import MappingProxyType
17
  from scrapling.core.utils import log, lru_cache
18
+ from scrapling.parser import Selector, SQLiteStorageSystem
19
 
20
 
21
  class ResponseEncoding:
 
97
  return cls.__DEFAULT_ENCODING
98
 
99
 
100
+ class Response(Selector):
101
  """This class is returned by all engines as a way to unify response type between different libraries."""
102
 
103
  def __init__(
 
113
  encoding: str = "utf-8",
114
  method: str = "GET",
115
  history: List = None,
116
+ **selector_config: Dict,
117
  ):
118
+ adaptive_domain = selector_config.pop("adaptive_domain", None)
119
  self.status = status
120
  self.reason = reason
121
  self.cookies = cookies
 
126
  super().__init__(
127
  text=text,
128
  body=body,
129
+ url=adaptive_domain or url,
130
  encoding=encoding,
131
+ **selector_config,
132
  )
 
 
133
  # For easier debugging while working from a Python shell
134
  log.info(
135
  f"Fetched ({status}) <{method} {url}> (referer: {request_headers.get('referer')})"
 
142
  class BaseFetcher:
143
  __slots__ = ()
144
  huge_tree: bool = True
145
+ adaptive: Optional[bool] = False
146
  storage: Any = SQLiteStorageSystem
147
  keep_cdata: Optional[bool] = False
148
  storage_args: Optional[Dict] = None
149
  keep_comments: Optional[bool] = False
150
+ adaptive_domain: Optional[str] = None
151
  parser_keywords: Tuple = (
152
  "huge_tree",
153
+ "adaptive",
154
  "storage",
155
  "keep_cdata",
156
  "storage_args",
157
  "keep_comments",
158
+ "adaptive_domain",
159
  ) # Left open for the user
160
 
161
  def __init__(self, *args, **kwargs):
 
176
  huge_tree=cls.huge_tree,
177
  keep_comments=cls.keep_comments,
178
  keep_cdata=cls.keep_cdata,
179
+ adaptive=cls.adaptive,
180
  storage=cls.storage,
181
  storage_args=cls.storage_args,
182
+ adaptive_domain=cls.adaptive_domain,
183
  )
184
 
185
  @classmethod
186
  def configure(cls, **kwargs):
187
  """Set multiple arguments for the parser at once globally
188
 
189
+ :param kwargs: The keywords can be any arguments of the following: huge_tree, keep_comments, keep_cdata, adaptive, storage, storage_args, adaptive_domain
190
  """
191
  for key, value in kwargs.items():
192
  key = key.strip().lower()
 
210
 
211
  @classmethod
212
  def _generate_parser_arguments(cls) -> Dict:
213
+ # Selector class parameters
214
+ # I won't validate Selector's class parameters here again, I will leave it to be validated later
215
  parser_arguments = dict(
216
  huge_tree=cls.huge_tree,
217
  keep_comments=cls.keep_comments,
218
  keep_cdata=cls.keep_cdata,
219
+ adaptive=cls.adaptive,
220
  storage=cls.storage,
221
  storage_args=cls.storage_args,
222
  )
223
+ if cls.adaptive_domain:
224
+ if type(cls.adaptive_domain) is not str:
225
  log.warning(
226
+ '[Ignored] The argument "adaptive_domain" must be of string type'
227
  )
228
  else:
229
+ parser_arguments.update({"adaptive_domain": cls.adaptive_domain})
230
 
231
  return parser_arguments
232
 
scrapling/fetchers.py CHANGED
@@ -74,7 +74,7 @@ class StealthyFetcher(BaseFetcher):
74
  disable_ads: bool = False,
75
  geoip: bool = False,
76
  custom_config: Optional[Dict] = None,
77
- additional_arguments: Optional[Dict] = None,
78
  ) -> Response:
79
  """
80
  Opens up a browser and do your request based on your chosen options below.
@@ -106,7 +106,7 @@ class StealthyFetcher(BaseFetcher):
106
  :param extra_headers: A dictionary of extra headers to add to the request. _The referer set by the `google_search` argument takes priority over the referer set here if used together._
107
  :param proxy: The proxy to be used with requests, it can be a string or a dictionary with the keys 'server', 'username', and 'password' only.
108
  :param custom_config: A dictionary of custom parser arguments to use with this request. Any argument passed will override any class parameters values.
109
- :param additional_arguments: Additional arguments to be passed to Camoufox as additional settings, and it takes higher priority than Scrapling's settings.
110
  :return: A `Response` object.
111
  """
112
  if not custom_config:
@@ -139,8 +139,8 @@ class StealthyFetcher(BaseFetcher):
139
  solve_cloudflare=solve_cloudflare,
140
  disable_resources=disable_resources,
141
  wait_selector_state=wait_selector_state,
142
- adaptor_arguments={**cls._generate_parser_arguments(), **custom_config},
143
- additional_arguments=additional_arguments or {},
144
  ) as engine:
145
  return engine.fetch(url)
146
 
@@ -170,7 +170,7 @@ class StealthyFetcher(BaseFetcher):
170
  disable_ads: bool = False,
171
  geoip: bool = False,
172
  custom_config: Optional[Dict] = None,
173
- additional_arguments: Optional[Dict] = None,
174
  ) -> Response:
175
  """
176
  Opens up a browser and do your request based on your chosen options below.
@@ -202,7 +202,7 @@ class StealthyFetcher(BaseFetcher):
202
  :param extra_headers: A dictionary of extra headers to add to the request. _The referer set by the `google_search` argument takes priority over the referer set here if used together._
203
  :param proxy: The proxy to be used with requests, it can be a string or a dictionary with the keys 'server', 'username', and 'password' only.
204
  :param custom_config: A dictionary of custom parser arguments to use with this request. Any argument passed will override any class parameters values.
205
- :param additional_arguments: Additional arguments to be passed to Camoufox as additional settings, and it takes higher priority than Scrapling's settings.
206
  :return: A `Response` object.
207
  """
208
  if not custom_config:
@@ -235,8 +235,8 @@ class StealthyFetcher(BaseFetcher):
235
  solve_cloudflare=solve_cloudflare,
236
  disable_resources=disable_resources,
237
  wait_selector_state=wait_selector_state,
238
- adaptor_arguments={**cls._generate_parser_arguments(), **custom_config},
239
- additional_arguments=additional_arguments or {},
240
  ) as engine:
241
  return await engine.fetch(url)
242
 
@@ -337,7 +337,7 @@ class DynamicFetcher(BaseFetcher):
337
  disable_webgl=disable_webgl,
338
  disable_resources=disable_resources,
339
  wait_selector_state=wait_selector_state,
340
- adaptor_arguments={**cls._generate_parser_arguments(), **custom_config},
341
  ) as session:
342
  return session.fetch(url)
343
 
@@ -421,7 +421,7 @@ class DynamicFetcher(BaseFetcher):
421
  disable_webgl=disable_webgl,
422
  disable_resources=disable_resources,
423
  wait_selector_state=wait_selector_state,
424
- adaptor_arguments={**cls._generate_parser_arguments(), **custom_config},
425
  ) as session:
426
  return await session.fetch(url)
427
 
 
74
  disable_ads: bool = False,
75
  geoip: bool = False,
76
  custom_config: Optional[Dict] = None,
77
+ additional_args: Optional[Dict] = None,
78
  ) -> Response:
79
  """
80
  Opens up a browser and do your request based on your chosen options below.
 
106
  :param extra_headers: A dictionary of extra headers to add to the request. _The referer set by the `google_search` argument takes priority over the referer set here if used together._
107
  :param proxy: The proxy to be used with requests, it can be a string or a dictionary with the keys 'server', 'username', and 'password' only.
108
  :param custom_config: A dictionary of custom parser arguments to use with this request. Any argument passed will override any class parameters values.
109
+ :param additional_args: Additional arguments to be passed to Camoufox as additional settings, and it takes higher priority than Scrapling's settings.
110
  :return: A `Response` object.
111
  """
112
  if not custom_config:
 
139
  solve_cloudflare=solve_cloudflare,
140
  disable_resources=disable_resources,
141
  wait_selector_state=wait_selector_state,
142
+ selector_config={**cls._generate_parser_arguments(), **custom_config},
143
+ additional_args=additional_args or {},
144
  ) as engine:
145
  return engine.fetch(url)
146
 
 
170
  disable_ads: bool = False,
171
  geoip: bool = False,
172
  custom_config: Optional[Dict] = None,
173
+ additional_args: Optional[Dict] = None,
174
  ) -> Response:
175
  """
176
  Opens up a browser and do your request based on your chosen options below.
 
202
  :param extra_headers: A dictionary of extra headers to add to the request. _The referer set by the `google_search` argument takes priority over the referer set here if used together._
203
  :param proxy: The proxy to be used with requests, it can be a string or a dictionary with the keys 'server', 'username', and 'password' only.
204
  :param custom_config: A dictionary of custom parser arguments to use with this request. Any argument passed will override any class parameters values.
205
+ :param additional_args: Additional arguments to be passed to Camoufox as additional settings, and it takes higher priority than Scrapling's settings.
206
  :return: A `Response` object.
207
  """
208
  if not custom_config:
 
235
  solve_cloudflare=solve_cloudflare,
236
  disable_resources=disable_resources,
237
  wait_selector_state=wait_selector_state,
238
+ selector_config={**cls._generate_parser_arguments(), **custom_config},
239
+ additional_args=additional_args or {},
240
  ) as engine:
241
  return await engine.fetch(url)
242
 
 
337
  disable_webgl=disable_webgl,
338
  disable_resources=disable_resources,
339
  wait_selector_state=wait_selector_state,
340
+ selector_config={**cls._generate_parser_arguments(), **custom_config},
341
  ) as session:
342
  return session.fetch(url)
343
 
 
421
  disable_webgl=disable_webgl,
422
  disable_resources=disable_resources,
423
  wait_selector_state=wait_selector_state,
424
+ selector_config={**cls._generate_parser_arguments(), **custom_config},
425
  ) as session:
426
  return await session.fetch(url)
427
 
scrapling/parser.py CHANGED
@@ -24,7 +24,7 @@ from scrapling.core._types import (
24
  )
25
  from scrapling.core.custom_types import AttributesHandler, TextHandler, TextHandlers
26
  from scrapling.core.mixins import SelectorsGeneration
27
- from scrapling.core.storage_adaptors import (
28
  SQLiteStorageSystem,
29
  StorageSystemMixin,
30
  _StorageTools,
@@ -33,11 +33,11 @@ from scrapling.core.translator import translator_instance
33
  from scrapling.core.utils import clean_spaces, flatten, html_forbidden, is_jsonable, log
34
 
35
 
36
- class Adaptor(SelectorsGeneration):
37
  __slots__ = (
38
  "url",
39
  "encoding",
40
- "__auto_match_enabled",
41
  "_root",
42
  "_storage",
43
  "__keep_comments",
@@ -58,7 +58,7 @@ class Adaptor(SelectorsGeneration):
58
  root: Optional[html.HtmlElement] = None,
59
  keep_comments: Optional[bool] = False,
60
  keep_cdata: Optional[bool] = False,
61
- auto_match: Optional[bool] = False,
62
  _storage: object = None,
63
  storage: Any = SQLiteStorageSystem,
64
  storage_args: Optional[Dict] = None,
@@ -82,7 +82,7 @@ class Adaptor(SelectorsGeneration):
82
  Don't use it unless you know what you are doing!
83
  :param keep_comments: While parsing the HTML body, drop comments or not. Disabled by default for obvious reasons
84
  :param keep_cdata: While parsing the HTML body, drop cdata or not. Disabled by default for cleaner HTML.
85
- :param auto_match: Globally turn off the auto-match feature in all functions, this argument takes higher
86
  priority over all auto-match related arguments/functions in the class.
87
  :param storage: The storage class to be passed for auto-matching functionalities, see ``Docs`` for more info.
88
  :param storage_args: A dictionary of ``argument->value`` pairs to be passed for the storage class.
@@ -90,7 +90,7 @@ class Adaptor(SelectorsGeneration):
90
  """
91
  if root is None and not body and text is None:
92
  raise ValueError(
93
- "Adaptor class needs text, body, or root arguments to work"
94
  )
95
 
96
  self.__text = ""
@@ -134,9 +134,9 @@ class Adaptor(SelectorsGeneration):
134
 
135
  self._root = root
136
 
137
- self.__auto_match_enabled = auto_match
138
 
139
- if self.__auto_match_enabled:
140
  if _storage is not None:
141
  self._storage = _storage
142
  else:
@@ -214,17 +214,17 @@ class Adaptor(SelectorsGeneration):
214
  """
215
  return TextHandler(str(element))
216
 
217
- def __element_convertor(self, element: html.HtmlElement) -> "Adaptor":
218
- """Used internally to convert a single HtmlElement to Adaptor directly without checks"""
219
  db_instance = (
220
  self._storage if (hasattr(self, "_storage") and self._storage) else None
221
  )
222
- return Adaptor(
223
  root=element,
224
  url=self.url,
225
  encoding=self.encoding,
226
- auto_match=self.__auto_match_enabled,
227
- _storage=db_instance, # Reuse existing storage if it exists otherwise it won't be checked if `auto_match` is turned off
228
  keep_comments=self.__keep_comments,
229
  keep_cdata=self.__keep_cdata,
230
  huge_tree=self.__huge_tree_enabled,
@@ -233,8 +233,8 @@ class Adaptor(SelectorsGeneration):
233
 
234
  def __handle_element(
235
  self, element: Union[html.HtmlElement, etree._ElementUnicodeResult]
236
- ) -> Union[TextHandler, "Adaptor", None]:
237
- """Used internally in all functions to convert a single element to type (Adaptor|TextHandler) when possible"""
238
  if element is None:
239
  return None
240
  elif self._is_text_node(element):
@@ -245,23 +245,23 @@ class Adaptor(SelectorsGeneration):
245
 
246
  def __handle_elements(
247
  self, result: List[Union[html.HtmlElement, etree._ElementUnicodeResult]]
248
- ) -> Union["Adaptors", "TextHandlers", List]:
249
- """Used internally in all functions to convert results to type (Adaptors|TextHandlers) in bulk when possible"""
250
  if not len(
251
  result
252
  ): # Lxml will give a warning if I used something like `not result`
253
- return Adaptors([])
254
 
255
  # From within the code, this method will always get a list of the same type,
256
  # so we will continue without checks for a slight performance boost
257
  if self._is_text_node(result[0]):
258
  return TextHandlers(list(map(self.__content_convertor, result)))
259
 
260
- return Adaptors(list(map(self.__element_convertor, result)))
261
 
262
  def __getstate__(self) -> Any:
263
  # lxml don't like it :)
264
- raise TypeError("Can't pickle Adaptor objects")
265
 
266
  # The following four properties I made them into functions instead of variables directly
267
  # So they don't slow down the process of initializing many instances of the class and gets executed only
@@ -322,7 +322,7 @@ class Adaptor(SelectorsGeneration):
322
  return TextHandler(separator).join(_all_strings)
323
 
324
  def urljoin(self, relative_url: str) -> str:
325
- """Join this Adaptor's url with a relative url to form an absolute full URL."""
326
  return urljoin(self.url, relative_url)
327
 
328
  @property
@@ -363,20 +363,20 @@ class Adaptor(SelectorsGeneration):
363
  return class_name in self._root.classes
364
 
365
  @property
366
- def parent(self) -> Union["Adaptor", None]:
367
  """Return the direct parent of the element or ``None`` otherwise"""
368
  return self.__handle_element(self._root.getparent())
369
 
370
  @property
371
- def below_elements(self) -> "Adaptors[Adaptor]":
372
  """Return all elements under the current element in the DOM tree"""
373
  below = self._root.xpath(".//*")
374
  return self.__handle_elements(below)
375
 
376
  @property
377
- def children(self) -> "Adaptors[Adaptor]":
378
  """Return the children elements of the current element or empty list otherwise"""
379
- return Adaptors(
380
  [
381
  self.__element_convertor(child)
382
  for child in self._root.iterchildren()
@@ -385,22 +385,22 @@ class Adaptor(SelectorsGeneration):
385
  )
386
 
387
  @property
388
- def siblings(self) -> "Adaptors[Adaptor]":
389
  """Return other children of the current element's parent or empty list otherwise"""
390
  if self.parent:
391
- return Adaptors(
392
  [child for child in self.parent.children if child._root != self._root]
393
  )
394
- return Adaptors([])
395
 
396
- def iterancestors(self) -> Generator["Adaptor", None, None]:
397
  """Return a generator that loops over all ancestors of the element, starting with the element's parent."""
398
  for ancestor in self._root.iterancestors():
399
  yield self.__element_convertor(ancestor)
400
 
401
  def find_ancestor(
402
- self, func: Callable[["Adaptor"], bool]
403
- ) -> Union["Adaptor", None]:
404
  """Loop over all ancestors of the element till one match the passed function
405
  :param func: A function that takes each ancestor as an argument and returns True/False
406
  :return: The first ancestor that match the function or ``None`` otherwise.
@@ -411,13 +411,13 @@ class Adaptor(SelectorsGeneration):
411
  return None
412
 
413
  @property
414
- def path(self) -> "Adaptors[Adaptor]":
415
- """Returns a list of type `Adaptors` that contains the path leading to the current element from the root."""
416
  lst = list(self.iterancestors())
417
- return Adaptors(lst)
418
 
419
  @property
420
- def next(self) -> Union["Adaptor", None]:
421
  """Returns the next element of the current element in the children of the parent or ``None`` otherwise."""
422
  next_element = self._root.getnext()
423
  if next_element is not None:
@@ -428,7 +428,7 @@ class Adaptor(SelectorsGeneration):
428
  return self.__handle_element(next_element)
429
 
430
  @property
431
- def previous(self) -> Union["Adaptor", None]:
432
  """Returns the previous element of the current element in the children of the parent or ``None`` otherwise."""
433
  prev_element = self._root.getprevious()
434
  if prev_element is not None:
@@ -471,18 +471,18 @@ class Adaptor(SelectorsGeneration):
471
  # From here we start with the selecting functions
472
  def relocate(
473
  self,
474
- element: Union[Dict, html.HtmlElement, "Adaptor"],
475
  percentage: int = 0,
476
- adaptor_type: bool = False,
477
- ) -> Union[List[Union[html.HtmlElement, None]], "Adaptors"]:
478
  """This function will search again for the element in the page tree, used automatically on page structure change
479
 
480
  :param element: The element we want to relocate in the tree
481
  :param percentage: The minimum percentage to accept and not going lower than that. Be aware that the percentage
482
  calculation depends solely on the page structure, so don't play with this number unless you must know
483
  what you are doing!
484
- :param adaptor_type: If True, the return result will be converted to `Adaptors` object
485
- :return: List of pure HTML elements that got the highest matching score or 'Adaptors' object
486
  """
487
  score_table = {}
488
  # Note: `element` will most likely always be a dictionary at this point.
@@ -511,7 +511,7 @@ class Adaptor(SelectorsGeneration):
511
  f"{percent} -> {self.__handle_elements(score_table[percent])}"
512
  )
513
 
514
- if not adaptor_type:
515
  return score_table[highest_probability]
516
  return self.__handle_elements(score_table[highest_probability])
517
  return []
@@ -520,10 +520,10 @@ class Adaptor(SelectorsGeneration):
520
  self,
521
  selector: str,
522
  identifier: str = "",
523
- auto_match: bool = False,
524
  auto_save: bool = False,
525
  percentage: int = 0,
526
- ) -> Union["Adaptor", "TextHandler", None]:
527
  """Search the current tree with CSS3 selectors and return the first result if possible, otherwise return `None`
528
 
529
  **Important:
@@ -531,17 +531,15 @@ class Adaptor(SelectorsGeneration):
531
  and want to relocate the same element(s)**
532
 
533
  :param selector: The CSS3 selector to be used.
534
- :param auto_match: Enabled will make the function try to relocate the element if it was 'saved' before
535
  :param identifier: A string that will be used to save/retrieve element's data in auto-matching,
536
  otherwise the selector will be used.
537
- :param auto_save: Automatically save new elements for `auto_match` later
538
  :param percentage: The minimum percentage to accept while auto-matching and not going lower than that.
539
  Be aware that the percentage calculation depends solely on the page structure, so don't play with this
540
  number unless you must know what you are doing!
541
  """
542
- for element in self.css(
543
- selector, identifier, auto_match, auto_save, percentage
544
- ):
545
  return element
546
  return None
547
 
@@ -549,11 +547,11 @@ class Adaptor(SelectorsGeneration):
549
  self,
550
  selector: str,
551
  identifier: str = "",
552
- auto_match: bool = False,
553
  auto_save: bool = False,
554
  percentage: int = 0,
555
  **kwargs: Any,
556
- ) -> Union["Adaptor", "TextHandler", None]:
557
  """Search the current tree with XPath selectors and return the first result if possible, otherwise return `None`
558
 
559
  **Important:
@@ -563,16 +561,16 @@ class Adaptor(SelectorsGeneration):
563
  Note: **Additional keyword arguments will be passed as XPath variables in the XPath expression!**
564
 
565
  :param selector: The XPath selector to be used.
566
- :param auto_match: Enabled will make the function try to relocate the element if it was 'saved' before
567
  :param identifier: A string that will be used to save/retrieve element's data in auto-matching,
568
  otherwise the selector will be used.
569
- :param auto_save: Automatically save new elements for `auto_match` later
570
  :param percentage: The minimum percentage to accept while auto-matching and not going lower than that.
571
  Be aware that the percentage calculation depends solely on the page structure, so don't play with this
572
  number unless you must know what you are doing!
573
  """
574
  for element in self.xpath(
575
- selector, identifier, auto_match, auto_save, percentage, **kwargs
576
  ):
577
  return element
578
  return None
@@ -581,10 +579,10 @@ class Adaptor(SelectorsGeneration):
581
  self,
582
  selector: str,
583
  identifier: str = "",
584
- auto_match: bool = False,
585
  auto_save: bool = False,
586
  percentage: int = 0,
587
- ) -> Union["Adaptors[Adaptor]", List, "TextHandlers[TextHandler]"]:
588
  """Search the current tree with CSS3 selectors
589
 
590
  **Important:
@@ -592,24 +590,24 @@ class Adaptor(SelectorsGeneration):
592
  and want to relocate the same element(s)**
593
 
594
  :param selector: The CSS3 selector to be used.
595
- :param auto_match: Enabled will make the function try to relocate the element if it was 'saved' before
596
  :param identifier: A string that will be used to save/retrieve element's data in auto-matching,
597
  otherwise the selector will be used.
598
- :param auto_save: Automatically save new elements for `auto_match` later
599
  :param percentage: The minimum percentage to accept while auto-matching and not going lower than that.
600
  Be aware that the percentage calculation depends solely on the page structure, so don't play with this
601
  number unless you must know what you are doing!
602
 
603
- :return: `Adaptors` class.
604
  """
605
  try:
606
- if not self.__auto_match_enabled or "," not in selector:
607
  # No need to split selectors in this case, let's save some CPU cycles :)
608
  xpath_selector = translator_instance.css_to_xpath(selector)
609
  return self.xpath(
610
  xpath_selector,
611
  identifier or selector,
612
- auto_match,
613
  auto_save,
614
  percentage,
615
  )
@@ -625,7 +623,7 @@ class Adaptor(SelectorsGeneration):
625
  results += self.xpath(
626
  xpath_selector,
627
  identifier or single_selector.canonical(),
628
- auto_match,
629
  auto_save,
630
  percentage,
631
  )
@@ -643,11 +641,11 @@ class Adaptor(SelectorsGeneration):
643
  self,
644
  selector: str,
645
  identifier: str = "",
646
- auto_match: bool = False,
647
  auto_save: bool = False,
648
  percentage: int = 0,
649
  **kwargs: Any,
650
- ) -> Union["Adaptors[Adaptor]", List, "TextHandlers[TextHandler]"]:
651
  """Search the current tree with XPath selectors
652
 
653
  **Important:
@@ -657,31 +655,31 @@ class Adaptor(SelectorsGeneration):
657
  Note: **Additional keyword arguments will be passed as XPath variables in the XPath expression!**
658
 
659
  :param selector: The XPath selector to be used.
660
- :param auto_match: Enabled will make the function try to relocate the element if it was 'saved' before
661
  :param identifier: A string that will be used to save/retrieve element's data in auto-matching,
662
  otherwise the selector will be used.
663
- :param auto_save: Automatically save new elements for `auto_match` later
664
  :param percentage: The minimum percentage to accept while auto-matching and not going lower than that.
665
  Be aware that the percentage calculation depends solely on the page structure, so don't play with this
666
  number unless you must know what you are doing!
667
 
668
- :return: `Adaptors` class.
669
  """
670
  try:
671
  elements = self._root.xpath(selector, **kwargs)
672
 
673
  if elements:
674
  if auto_save:
675
- if not self.__auto_match_enabled:
676
  log.warning(
677
- "Argument `auto_save` will be ignored because `auto_match` wasn't enabled on initialization. Check docs for more info."
678
  )
679
  else:
680
  self.save(elements[0], identifier or selector)
681
 
682
  return self.__handle_elements(elements)
683
- elif self.__auto_match_enabled:
684
- if auto_match:
685
  element_data = self.retrieve(identifier or selector)
686
  if element_data:
687
  elements = self.relocate(element_data, percentage)
@@ -690,13 +688,13 @@ class Adaptor(SelectorsGeneration):
690
 
691
  return self.__handle_elements(elements)
692
  else:
693
- if auto_match:
694
  log.warning(
695
- "Argument `auto_match` will be ignored because `auto_match` wasn't enabled on initialization. Check docs for more info."
696
  )
697
  elif auto_save:
698
  log.warning(
699
- "Argument `auto_save` will be ignored because `auto_match` wasn't enabled on initialization. Check docs for more info."
700
  )
701
 
702
  return self.__handle_elements(elements)
@@ -713,12 +711,12 @@ class Adaptor(SelectorsGeneration):
713
  self,
714
  *args: Union[str, Iterable[str], Pattern, Callable, Dict[str, str]],
715
  **kwargs: str,
716
- ) -> "Adaptors":
717
  """Find elements by filters of your creations for ease.
718
 
719
  :param args: Tag name(s), iterable of tag names, regex patterns, function, or a dictionary of elements' attributes. Leave empty for selecting all.
720
  :param kwargs: The attributes you want to filter elements based on it.
721
- :return: The `Adaptors` object of the elements or empty list
722
  """
723
  # Attributes that are Python reserved words and can't be used directly
724
  # Ex: find_all('a', class="blah") -> find_all('a', class_="blah")
@@ -735,7 +733,7 @@ class Adaptor(SelectorsGeneration):
735
 
736
  attributes = dict()
737
  tags, patterns = set(), set()
738
- results, functions, selectors = Adaptors([]), [], []
739
 
740
  # Brace yourself for a wonderful journey!
741
  for arg in args:
@@ -766,7 +764,7 @@ class Adaptor(SelectorsGeneration):
766
  functions.append(arg)
767
  else:
768
  raise TypeError(
769
- "Callable filter function must have at least one argument to take `Adaptor` objects."
770
  )
771
 
772
  else:
@@ -820,12 +818,12 @@ class Adaptor(SelectorsGeneration):
820
  self,
821
  *args: Union[str, Iterable[str], Pattern, Callable, Dict[str, str]],
822
  **kwargs: str,
823
- ) -> Union["Adaptor", None]:
824
  """Find elements by filters of your creations for ease, then return the first result. Otherwise return `None`.
825
 
826
  :param args: Tag name(s), iterable of tag names, regex patterns, function, or a dictionary of elements' attributes. Leave empty for selecting all.
827
  :param kwargs: The attributes you want to filter elements based on it.
828
- :return: The `Adaptor` object of the element or `None` if the result didn't match
829
  """
830
  for element in self.find_all(*args, **kwargs):
831
  return element
@@ -928,15 +926,15 @@ class Adaptor(SelectorsGeneration):
928
  return score
929
 
930
  def save(
931
- self, element: Union["Adaptor", html.HtmlElement], identifier: str
932
  ) -> None:
933
  """Saves the element's unique properties to the storage for retrieval and relocation later
934
 
935
- :param element: The element itself that we want to save to storage, it can be an ` Adaptor ` or pure ` HtmlElement `
936
  :param identifier: This is the identifier that will be used to retrieve the element later from the storage. See
937
  the docs for more info.
938
  """
939
- if self.__auto_match_enabled:
940
  if isinstance(element, self.__class__):
941
  element = element._root
942
 
@@ -956,7 +954,7 @@ class Adaptor(SelectorsGeneration):
956
  the docs for more info.
957
  :return: A dictionary of the unique properties
958
  """
959
- if self.__auto_match_enabled:
960
  return self._storage.retrieve(identifier)
961
 
962
  log.critical(
@@ -1065,7 +1063,7 @@ class Adaptor(SelectorsGeneration):
1065
  "src",
1066
  ),
1067
  match_text: bool = False,
1068
- ) -> Union["Adaptors[Adaptor]", List]:
1069
  """Find elements that are in the same tree depth in the page with the same tag name and same parent tag etc...
1070
  then return the ones that match the current element attributes with a percentage higher than the input threshold.
1071
 
@@ -1084,7 +1082,7 @@ class Adaptor(SelectorsGeneration):
1084
  :param match_text: If True, element text content will be taken into calculation while matching.
1085
  Not recommended to use in normal cases, but it depends.
1086
 
1087
- :return: A ``Adaptors`` container of ``Adaptor`` objects or empty list
1088
  """
1089
  # We will use the elements' root from now on to get the speed boost of using Lxml directly
1090
  root = self._root
@@ -1128,7 +1126,7 @@ class Adaptor(SelectorsGeneration):
1128
  partial: bool = False,
1129
  case_sensitive: bool = False,
1130
  clean_match: bool = True,
1131
- ) -> Union["Adaptors[Adaptor]", "Adaptor"]:
1132
  """Find elements that its text content fully/partially matches input.
1133
  :param text: Text query to match
1134
  :param first_match: Returns the first element that matches conditions, enabled by default
@@ -1137,7 +1135,7 @@ class Adaptor(SelectorsGeneration):
1137
  :param clean_match: if enabled, this will ignore all whitespaces and consecutive spaces while matching
1138
  """
1139
 
1140
- results = Adaptors([])
1141
  if not case_sensitive:
1142
  text = text.lower()
1143
 
@@ -1174,14 +1172,14 @@ class Adaptor(SelectorsGeneration):
1174
  first_match: bool = True,
1175
  case_sensitive: bool = False,
1176
  clean_match: bool = True,
1177
- ) -> Union["Adaptors[Adaptor]", "Adaptor"]:
1178
  """Find elements that its text content matches the input regex pattern.
1179
  :param query: Regex query/pattern to match
1180
  :param first_match: Return the first element that matches conditions; enabled by default.
1181
  :param case_sensitive: If enabled, the letters case will be taken into consideration in the regex.
1182
  :param clean_match: If enabled, this will ignore all whitespaces and consecutive spaces while matching.
1183
  """
1184
- results = Adaptors([])
1185
 
1186
  # This selector gets all elements with text content
1187
  for node in self.__handle_elements(
@@ -1206,24 +1204,24 @@ class Adaptor(SelectorsGeneration):
1206
  return results
1207
 
1208
 
1209
- class Adaptors(List[Adaptor]):
1210
  """
1211
- The `Adaptors` class is a subclass of the builtin ``List`` class, which provides a few additional methods.
1212
  """
1213
 
1214
  __slots__ = ()
1215
 
1216
  @typing.overload
1217
- def __getitem__(self, pos: SupportsIndex) -> Adaptor:
1218
  pass
1219
 
1220
  @typing.overload
1221
- def __getitem__(self, pos: slice) -> "Adaptors":
1222
  pass
1223
 
1224
  def __getitem__(
1225
  self, pos: Union[SupportsIndex, slice]
1226
- ) -> Union[Adaptor, "Adaptors"]:
1227
  lst = super().__getitem__(pos)
1228
  if isinstance(pos, slice):
1229
  return self.__class__(lst)
@@ -1237,10 +1235,10 @@ class Adaptors(List[Adaptor]):
1237
  auto_save: bool = False,
1238
  percentage: int = 0,
1239
  **kwargs: Any,
1240
- ) -> "Adaptors[Adaptor]":
1241
  """
1242
  Call the ``.xpath()`` method for each element in this list and return
1243
- their results as another `Adaptors` class.
1244
 
1245
  **Important:
1246
  It's recommended to use the identifier argument if you plan to use a different selector later
@@ -1251,12 +1249,12 @@ class Adaptors(List[Adaptor]):
1251
  :param selector: The XPath selector to be used.
1252
  :param identifier: A string that will be used to retrieve element's data in auto-matching,
1253
  otherwise the selector will be used.
1254
- :param auto_save: Automatically save new elements for `auto_match` later
1255
  :param percentage: The minimum percentage to accept while auto-matching and not going lower than that.
1256
  Be aware that the percentage calculation depends solely on the page structure, so don't play with this
1257
  number unless you must know what you are doing!
1258
 
1259
- :return: `Adaptors` class.
1260
  """
1261
  results = [
1262
  n.xpath(
@@ -1272,10 +1270,10 @@ class Adaptors(List[Adaptor]):
1272
  identifier: str = "",
1273
  auto_save: bool = False,
1274
  percentage: int = 0,
1275
- ) -> "Adaptors[Adaptor]":
1276
  """
1277
  Call the ``.css()`` method for each element in this list and return
1278
- their results flattened as another `Adaptors` class.
1279
 
1280
  **Important:
1281
  It's recommended to use the identifier argument if you plan to use a different selector later
@@ -1284,12 +1282,12 @@ class Adaptors(List[Adaptor]):
1284
  :param selector: The CSS3 selector to be used.
1285
  :param identifier: A string that will be used to retrieve element's data in auto-matching,
1286
  otherwise the selector will be used.
1287
- :param auto_save: Automatically save new elements for `auto_match` later
1288
  :param percentage: The minimum percentage to accept while auto-matching and not going lower than that.
1289
  Be aware that the percentage calculation depends solely on the page structure, so don't play with this
1290
  number unless you must know what you are doing!
1291
 
1292
- :return: `Adaptors` class.
1293
  """
1294
  results = [
1295
  n.css(selector, identifier or selector, False, auto_save, percentage)
@@ -1340,7 +1338,7 @@ class Adaptors(List[Adaptor]):
1340
  return result
1341
  return default
1342
 
1343
- def search(self, func: Callable[["Adaptor"], bool]) -> Union["Adaptor", None]:
1344
  """Loop over all current elements and return the first element that matches the passed function
1345
  :param func: A function that takes each element as an argument and returns True/False
1346
  :return: The first element that match the function or ``None`` otherwise.
@@ -1350,10 +1348,10 @@ class Adaptors(List[Adaptor]):
1350
  return element
1351
  return None
1352
 
1353
- def filter(self, func: Callable[["Adaptor"], bool]) -> "Adaptors[Adaptor]":
1354
  """Filter current elements based on the passed function
1355
  :param func: A function that takes each element as an argument and returns True/False
1356
- :return: The new `Adaptors` object or empty list otherwise.
1357
  """
1358
  return self.__class__([element for element in self if func(element)])
1359
 
@@ -1382,4 +1380,4 @@ class Adaptors(List[Adaptor]):
1382
 
1383
  def __getstate__(self) -> Any:
1384
  # lxml don't like it :)
1385
- raise TypeError("Can't pickle Adaptors object")
 
24
  )
25
  from scrapling.core.custom_types import AttributesHandler, TextHandler, TextHandlers
26
  from scrapling.core.mixins import SelectorsGeneration
27
+ from scrapling.core.storage import (
28
  SQLiteStorageSystem,
29
  StorageSystemMixin,
30
  _StorageTools,
 
33
  from scrapling.core.utils import clean_spaces, flatten, html_forbidden, is_jsonable, log
34
 
35
 
36
+ class Selector(SelectorsGeneration):
37
  __slots__ = (
38
  "url",
39
  "encoding",
40
+ "__adaptive_enabled",
41
  "_root",
42
  "_storage",
43
  "__keep_comments",
 
58
  root: Optional[html.HtmlElement] = None,
59
  keep_comments: Optional[bool] = False,
60
  keep_cdata: Optional[bool] = False,
61
+ adaptive: Optional[bool] = False,
62
  _storage: object = None,
63
  storage: Any = SQLiteStorageSystem,
64
  storage_args: Optional[Dict] = None,
 
82
  Don't use it unless you know what you are doing!
83
  :param keep_comments: While parsing the HTML body, drop comments or not. Disabled by default for obvious reasons
84
  :param keep_cdata: While parsing the HTML body, drop cdata or not. Disabled by default for cleaner HTML.
85
+ :param adaptive: Globally turn off the auto-match feature in all functions, this argument takes higher
86
  priority over all auto-match related arguments/functions in the class.
87
  :param storage: The storage class to be passed for auto-matching functionalities, see ``Docs`` for more info.
88
  :param storage_args: A dictionary of ``argument->value`` pairs to be passed for the storage class.
 
90
  """
91
  if root is None and not body and text is None:
92
  raise ValueError(
93
+ "Selector class needs text, body, or root arguments to work"
94
  )
95
 
96
  self.__text = ""
 
134
 
135
  self._root = root
136
 
137
+ self.__adaptive_enabled = adaptive
138
 
139
+ if self.__adaptive_enabled:
140
  if _storage is not None:
141
  self._storage = _storage
142
  else:
 
214
  """
215
  return TextHandler(str(element))
216
 
217
+ def __element_convertor(self, element: html.HtmlElement) -> "Selector":
218
+ """Used internally to convert a single HtmlElement to Selector directly without checks"""
219
  db_instance = (
220
  self._storage if (hasattr(self, "_storage") and self._storage) else None
221
  )
222
+ return Selector(
223
  root=element,
224
  url=self.url,
225
  encoding=self.encoding,
226
+ adaptive=self.__adaptive_enabled,
227
+ _storage=db_instance, # Reuse existing storage if it exists otherwise it won't be checked if `adaptive` is turned off
228
  keep_comments=self.__keep_comments,
229
  keep_cdata=self.__keep_cdata,
230
  huge_tree=self.__huge_tree_enabled,
 
233
 
234
  def __handle_element(
235
  self, element: Union[html.HtmlElement, etree._ElementUnicodeResult]
236
+ ) -> Union[TextHandler, "Selector", None]:
237
+ """Used internally in all functions to convert a single element to type (Selector|TextHandler) when possible"""
238
  if element is None:
239
  return None
240
  elif self._is_text_node(element):
 
245
 
246
  def __handle_elements(
247
  self, result: List[Union[html.HtmlElement, etree._ElementUnicodeResult]]
248
+ ) -> Union["Selectors", "TextHandlers", List]:
249
+ """Used internally in all functions to convert results to type (Selectors|TextHandlers) in bulk when possible"""
250
  if not len(
251
  result
252
  ): # Lxml will give a warning if I used something like `not result`
253
+ return Selectors([])
254
 
255
  # From within the code, this method will always get a list of the same type,
256
  # so we will continue without checks for a slight performance boost
257
  if self._is_text_node(result[0]):
258
  return TextHandlers(list(map(self.__content_convertor, result)))
259
 
260
+ return Selectors(list(map(self.__element_convertor, result)))
261
 
262
  def __getstate__(self) -> Any:
263
  # lxml don't like it :)
264
+ raise TypeError("Can't pickle Selector objects")
265
 
266
  # The following four properties I made them into functions instead of variables directly
267
  # So they don't slow down the process of initializing many instances of the class and gets executed only
 
322
  return TextHandler(separator).join(_all_strings)
323
 
324
  def urljoin(self, relative_url: str) -> str:
325
+ """Join this Selector's url with a relative url to form an absolute full URL."""
326
  return urljoin(self.url, relative_url)
327
 
328
  @property
 
363
  return class_name in self._root.classes
364
 
365
  @property
366
+ def parent(self) -> Union["Selector", None]:
367
  """Return the direct parent of the element or ``None`` otherwise"""
368
  return self.__handle_element(self._root.getparent())
369
 
370
  @property
371
+ def below_elements(self) -> "Selectors[Selector]":
372
  """Return all elements under the current element in the DOM tree"""
373
  below = self._root.xpath(".//*")
374
  return self.__handle_elements(below)
375
 
376
  @property
377
+ def children(self) -> "Selectors[Selector]":
378
  """Return the children elements of the current element or empty list otherwise"""
379
+ return Selectors(
380
  [
381
  self.__element_convertor(child)
382
  for child in self._root.iterchildren()
 
385
  )
386
 
387
  @property
388
+ def siblings(self) -> "Selectors[Selector]":
389
  """Return other children of the current element's parent or empty list otherwise"""
390
  if self.parent:
391
+ return Selectors(
392
  [child for child in self.parent.children if child._root != self._root]
393
  )
394
+ return Selectors([])
395
 
396
+ def iterancestors(self) -> Generator["Selector", None, None]:
397
  """Return a generator that loops over all ancestors of the element, starting with the element's parent."""
398
  for ancestor in self._root.iterancestors():
399
  yield self.__element_convertor(ancestor)
400
 
401
  def find_ancestor(
402
+ self, func: Callable[["Selector"], bool]
403
+ ) -> Union["Selector", None]:
404
  """Loop over all ancestors of the element till one match the passed function
405
  :param func: A function that takes each ancestor as an argument and returns True/False
406
  :return: The first ancestor that match the function or ``None`` otherwise.
 
411
  return None
412
 
413
  @property
414
+ def path(self) -> "Selectors[Selector]":
415
+ """Returns a list of type `Selectors` that contains the path leading to the current element from the root."""
416
  lst = list(self.iterancestors())
417
+ return Selectors(lst)
418
 
419
  @property
420
+ def next(self) -> Union["Selector", None]:
421
  """Returns the next element of the current element in the children of the parent or ``None`` otherwise."""
422
  next_element = self._root.getnext()
423
  if next_element is not None:
 
428
  return self.__handle_element(next_element)
429
 
430
  @property
431
+ def previous(self) -> Union["Selector", None]:
432
  """Returns the previous element of the current element in the children of the parent or ``None`` otherwise."""
433
  prev_element = self._root.getprevious()
434
  if prev_element is not None:
 
471
  # From here we start with the selecting functions
472
  def relocate(
473
  self,
474
+ element: Union[Dict, html.HtmlElement, "Selector"],
475
  percentage: int = 0,
476
+ selector_type: bool = False,
477
+ ) -> Union[List[Union[html.HtmlElement, None]], "Selectors"]:
478
  """This function will search again for the element in the page tree, used automatically on page structure change
479
 
480
  :param element: The element we want to relocate in the tree
481
  :param percentage: The minimum percentage to accept and not going lower than that. Be aware that the percentage
482
  calculation depends solely on the page structure, so don't play with this number unless you must know
483
  what you are doing!
484
+ :param selector_type: If True, the return result will be converted to `Selectors` object
485
+ :return: List of pure HTML elements that got the highest matching score or 'Selectors' object
486
  """
487
  score_table = {}
488
  # Note: `element` will most likely always be a dictionary at this point.
 
511
  f"{percent} -> {self.__handle_elements(score_table[percent])}"
512
  )
513
 
514
+ if not selector_type:
515
  return score_table[highest_probability]
516
  return self.__handle_elements(score_table[highest_probability])
517
  return []
 
520
  self,
521
  selector: str,
522
  identifier: str = "",
523
+ adaptive: bool = False,
524
  auto_save: bool = False,
525
  percentage: int = 0,
526
+ ) -> Union["Selector", "TextHandler", None]:
527
  """Search the current tree with CSS3 selectors and return the first result if possible, otherwise return `None`
528
 
529
  **Important:
 
531
  and want to relocate the same element(s)**
532
 
533
  :param selector: The CSS3 selector to be used.
534
+ :param adaptive: Enabled will make the function try to relocate the element if it was 'saved' before
535
  :param identifier: A string that will be used to save/retrieve element's data in auto-matching,
536
  otherwise the selector will be used.
537
+ :param auto_save: Automatically save new elements for `adaptive` later
538
  :param percentage: The minimum percentage to accept while auto-matching and not going lower than that.
539
  Be aware that the percentage calculation depends solely on the page structure, so don't play with this
540
  number unless you must know what you are doing!
541
  """
542
+ for element in self.css(selector, identifier, adaptive, auto_save, percentage):
 
 
543
  return element
544
  return None
545
 
 
547
  self,
548
  selector: str,
549
  identifier: str = "",
550
+ adaptive: bool = False,
551
  auto_save: bool = False,
552
  percentage: int = 0,
553
  **kwargs: Any,
554
+ ) -> Union["Selector", "TextHandler", None]:
555
  """Search the current tree with XPath selectors and return the first result if possible, otherwise return `None`
556
 
557
  **Important:
 
561
  Note: **Additional keyword arguments will be passed as XPath variables in the XPath expression!**
562
 
563
  :param selector: The XPath selector to be used.
564
+ :param adaptive: Enabled will make the function try to relocate the element if it was 'saved' before
565
  :param identifier: A string that will be used to save/retrieve element's data in auto-matching,
566
  otherwise the selector will be used.
567
+ :param auto_save: Automatically save new elements for `adaptive` later
568
  :param percentage: The minimum percentage to accept while auto-matching and not going lower than that.
569
  Be aware that the percentage calculation depends solely on the page structure, so don't play with this
570
  number unless you must know what you are doing!
571
  """
572
  for element in self.xpath(
573
+ selector, identifier, adaptive, auto_save, percentage, **kwargs
574
  ):
575
  return element
576
  return None
 
579
  self,
580
  selector: str,
581
  identifier: str = "",
582
+ adaptive: bool = False,
583
  auto_save: bool = False,
584
  percentage: int = 0,
585
+ ) -> Union["Selectors[Selector]", List, "TextHandlers[TextHandler]"]:
586
  """Search the current tree with CSS3 selectors
587
 
588
  **Important:
 
590
  and want to relocate the same element(s)**
591
 
592
  :param selector: The CSS3 selector to be used.
593
+ :param adaptive: Enabled will make the function try to relocate the element if it was 'saved' before
594
  :param identifier: A string that will be used to save/retrieve element's data in auto-matching,
595
  otherwise the selector will be used.
596
+ :param auto_save: Automatically save new elements for `adaptive` later
597
  :param percentage: The minimum percentage to accept while auto-matching and not going lower than that.
598
  Be aware that the percentage calculation depends solely on the page structure, so don't play with this
599
  number unless you must know what you are doing!
600
 
601
+ :return: `Selectors` class.
602
  """
603
  try:
604
+ if not self.__adaptive_enabled or "," not in selector:
605
  # No need to split selectors in this case, let's save some CPU cycles :)
606
  xpath_selector = translator_instance.css_to_xpath(selector)
607
  return self.xpath(
608
  xpath_selector,
609
  identifier or selector,
610
+ adaptive,
611
  auto_save,
612
  percentage,
613
  )
 
623
  results += self.xpath(
624
  xpath_selector,
625
  identifier or single_selector.canonical(),
626
+ adaptive,
627
  auto_save,
628
  percentage,
629
  )
 
641
  self,
642
  selector: str,
643
  identifier: str = "",
644
+ adaptive: bool = False,
645
  auto_save: bool = False,
646
  percentage: int = 0,
647
  **kwargs: Any,
648
+ ) -> Union["Selectors[Selector]", List, "TextHandlers[TextHandler]"]:
649
  """Search the current tree with XPath selectors
650
 
651
  **Important:
 
655
  Note: **Additional keyword arguments will be passed as XPath variables in the XPath expression!**
656
 
657
  :param selector: The XPath selector to be used.
658
+ :param adaptive: Enabled will make the function try to relocate the element if it was 'saved' before
659
  :param identifier: A string that will be used to save/retrieve element's data in auto-matching,
660
  otherwise the selector will be used.
661
+ :param auto_save: Automatically save new elements for `adaptive` later
662
  :param percentage: The minimum percentage to accept while auto-matching and not going lower than that.
663
  Be aware that the percentage calculation depends solely on the page structure, so don't play with this
664
  number unless you must know what you are doing!
665
 
666
+ :return: `Selectors` class.
667
  """
668
  try:
669
  elements = self._root.xpath(selector, **kwargs)
670
 
671
  if elements:
672
  if auto_save:
673
+ if not self.__adaptive_enabled:
674
  log.warning(
675
+ "Argument `auto_save` will be ignored because `adaptive` wasn't enabled on initialization. Check docs for more info."
676
  )
677
  else:
678
  self.save(elements[0], identifier or selector)
679
 
680
  return self.__handle_elements(elements)
681
+ elif self.__adaptive_enabled:
682
+ if adaptive:
683
  element_data = self.retrieve(identifier or selector)
684
  if element_data:
685
  elements = self.relocate(element_data, percentage)
 
688
 
689
  return self.__handle_elements(elements)
690
  else:
691
+ if adaptive:
692
  log.warning(
693
+ "Argument `adaptive` will be ignored because `adaptive` wasn't enabled on initialization. Check docs for more info."
694
  )
695
  elif auto_save:
696
  log.warning(
697
+ "Argument `auto_save` will be ignored because `adaptive` wasn't enabled on initialization. Check docs for more info."
698
  )
699
 
700
  return self.__handle_elements(elements)
 
711
  self,
712
  *args: Union[str, Iterable[str], Pattern, Callable, Dict[str, str]],
713
  **kwargs: str,
714
+ ) -> "Selectors":
715
  """Find elements by filters of your creations for ease.
716
 
717
  :param args: Tag name(s), iterable of tag names, regex patterns, function, or a dictionary of elements' attributes. Leave empty for selecting all.
718
  :param kwargs: The attributes you want to filter elements based on it.
719
+ :return: The `Selectors` object of the elements or empty list
720
  """
721
  # Attributes that are Python reserved words and can't be used directly
722
  # Ex: find_all('a', class="blah") -> find_all('a', class_="blah")
 
733
 
734
  attributes = dict()
735
  tags, patterns = set(), set()
736
+ results, functions, selectors = Selectors([]), [], []
737
 
738
  # Brace yourself for a wonderful journey!
739
  for arg in args:
 
764
  functions.append(arg)
765
  else:
766
  raise TypeError(
767
+ "Callable filter function must have at least one argument to take `Selector` objects."
768
  )
769
 
770
  else:
 
818
  self,
819
  *args: Union[str, Iterable[str], Pattern, Callable, Dict[str, str]],
820
  **kwargs: str,
821
+ ) -> Union["Selector", None]:
822
  """Find elements by filters of your creations for ease, then return the first result. Otherwise return `None`.
823
 
824
  :param args: Tag name(s), iterable of tag names, regex patterns, function, or a dictionary of elements' attributes. Leave empty for selecting all.
825
  :param kwargs: The attributes you want to filter elements based on it.
826
+ :return: The `Selector` object of the element or `None` if the result didn't match
827
  """
828
  for element in self.find_all(*args, **kwargs):
829
  return element
 
926
  return score
927
 
928
  def save(
929
+ self, element: Union["Selector", html.HtmlElement], identifier: str
930
  ) -> None:
931
  """Saves the element's unique properties to the storage for retrieval and relocation later
932
 
933
+ :param element: The element itself that we want to save to storage, it can be an ` Selector ` or pure ` HtmlElement `
934
  :param identifier: This is the identifier that will be used to retrieve the element later from the storage. See
935
  the docs for more info.
936
  """
937
+ if self.__adaptive_enabled:
938
  if isinstance(element, self.__class__):
939
  element = element._root
940
 
 
954
  the docs for more info.
955
  :return: A dictionary of the unique properties
956
  """
957
+ if self.__adaptive_enabled:
958
  return self._storage.retrieve(identifier)
959
 
960
  log.critical(
 
1063
  "src",
1064
  ),
1065
  match_text: bool = False,
1066
+ ) -> Union["Selectors[Selector]", List]:
1067
  """Find elements that are in the same tree depth in the page with the same tag name and same parent tag etc...
1068
  then return the ones that match the current element attributes with a percentage higher than the input threshold.
1069
 
 
1082
  :param match_text: If True, element text content will be taken into calculation while matching.
1083
  Not recommended to use in normal cases, but it depends.
1084
 
1085
+ :return: A ``Selectors`` container of ``Selector`` objects or empty list
1086
  """
1087
  # We will use the elements' root from now on to get the speed boost of using Lxml directly
1088
  root = self._root
 
1126
  partial: bool = False,
1127
  case_sensitive: bool = False,
1128
  clean_match: bool = True,
1129
+ ) -> Union["Selectors[Selector]", "Selector"]:
1130
  """Find elements that its text content fully/partially matches input.
1131
  :param text: Text query to match
1132
  :param first_match: Returns the first element that matches conditions, enabled by default
 
1135
  :param clean_match: if enabled, this will ignore all whitespaces and consecutive spaces while matching
1136
  """
1137
 
1138
+ results = Selectors([])
1139
  if not case_sensitive:
1140
  text = text.lower()
1141
 
 
1172
  first_match: bool = True,
1173
  case_sensitive: bool = False,
1174
  clean_match: bool = True,
1175
+ ) -> Union["Selectors[Selector]", "Selector"]:
1176
  """Find elements that its text content matches the input regex pattern.
1177
  :param query: Regex query/pattern to match
1178
  :param first_match: Return the first element that matches conditions; enabled by default.
1179
  :param case_sensitive: If enabled, the letters case will be taken into consideration in the regex.
1180
  :param clean_match: If enabled, this will ignore all whitespaces and consecutive spaces while matching.
1181
  """
1182
+ results = Selectors([])
1183
 
1184
  # This selector gets all elements with text content
1185
  for node in self.__handle_elements(
 
1204
  return results
1205
 
1206
 
1207
+ class Selectors(List[Selector]):
1208
  """
1209
+ The `Selectors` class is a subclass of the builtin ``List`` class, which provides a few additional methods.
1210
  """
1211
 
1212
  __slots__ = ()
1213
 
1214
  @typing.overload
1215
+ def __getitem__(self, pos: SupportsIndex) -> Selector:
1216
  pass
1217
 
1218
  @typing.overload
1219
+ def __getitem__(self, pos: slice) -> "Selectors":
1220
  pass
1221
 
1222
  def __getitem__(
1223
  self, pos: Union[SupportsIndex, slice]
1224
+ ) -> Union[Selector, "Selectors"]:
1225
  lst = super().__getitem__(pos)
1226
  if isinstance(pos, slice):
1227
  return self.__class__(lst)
 
1235
  auto_save: bool = False,
1236
  percentage: int = 0,
1237
  **kwargs: Any,
1238
+ ) -> "Selectors[Selector]":
1239
  """
1240
  Call the ``.xpath()`` method for each element in this list and return
1241
+ their results as another `Selectors` class.
1242
 
1243
  **Important:
1244
  It's recommended to use the identifier argument if you plan to use a different selector later
 
1249
  :param selector: The XPath selector to be used.
1250
  :param identifier: A string that will be used to retrieve element's data in auto-matching,
1251
  otherwise the selector will be used.
1252
+ :param auto_save: Automatically save new elements for `adaptive` later
1253
  :param percentage: The minimum percentage to accept while auto-matching and not going lower than that.
1254
  Be aware that the percentage calculation depends solely on the page structure, so don't play with this
1255
  number unless you must know what you are doing!
1256
 
1257
+ :return: `Selectors` class.
1258
  """
1259
  results = [
1260
  n.xpath(
 
1270
  identifier: str = "",
1271
  auto_save: bool = False,
1272
  percentage: int = 0,
1273
+ ) -> "Selectors[Selector]":
1274
  """
1275
  Call the ``.css()`` method for each element in this list and return
1276
+ their results flattened as another `Selectors` class.
1277
 
1278
  **Important:
1279
  It's recommended to use the identifier argument if you plan to use a different selector later
 
1282
  :param selector: The CSS3 selector to be used.
1283
  :param identifier: A string that will be used to retrieve element's data in auto-matching,
1284
  otherwise the selector will be used.
1285
+ :param auto_save: Automatically save new elements for `adaptive` later
1286
  :param percentage: The minimum percentage to accept while auto-matching and not going lower than that.
1287
  Be aware that the percentage calculation depends solely on the page structure, so don't play with this
1288
  number unless you must know what you are doing!
1289
 
1290
+ :return: `Selectors` class.
1291
  """
1292
  results = [
1293
  n.css(selector, identifier or selector, False, auto_save, percentage)
 
1338
  return result
1339
  return default
1340
 
1341
+ def search(self, func: Callable[["Selector"], bool]) -> Union["Selector", None]:
1342
  """Loop over all current elements and return the first element that matches the passed function
1343
  :param func: A function that takes each element as an argument and returns True/False
1344
  :return: The first element that match the function or ``None`` otherwise.
 
1348
  return element
1349
  return None
1350
 
1351
+ def filter(self, func: Callable[["Selector"], bool]) -> "Selectors[Selector]":
1352
  """Filter current elements based on the passed function
1353
  :param func: A function that takes each element as an argument and returns True/False
1354
+ :return: The new `Selectors` object or empty list otherwise.
1355
  """
1356
  return self.__class__([element for element in self if func(element)])
1357
 
 
1380
 
1381
  def __getstate__(self) -> Any:
1382
  # lxml don't like it :)
1383
+ raise TypeError("Can't pickle Selectors object")
tests/fetchers/async/test_camoufox.py CHANGED
@@ -3,7 +3,7 @@ import pytest_httpbin
3
 
4
  from scrapling import StealthyFetcher
5
 
6
- StealthyFetcher.auto_match = True
7
 
8
 
9
  @pytest_httpbin.use_class_based_httpbin
 
3
 
4
  from scrapling import StealthyFetcher
5
 
6
+ StealthyFetcher.adaptive = True
7
 
8
 
9
  @pytest_httpbin.use_class_based_httpbin
tests/fetchers/async/test_dynamic.py CHANGED
@@ -3,7 +3,7 @@ import pytest_httpbin
3
 
4
  from scrapling import DynamicFetcher
5
 
6
- DynamicFetcher.auto_match = True
7
 
8
 
9
  @pytest_httpbin.use_class_based_httpbin
 
3
 
4
  from scrapling import DynamicFetcher
5
 
6
+ DynamicFetcher.adaptive = True
7
 
8
 
9
  @pytest_httpbin.use_class_based_httpbin
tests/fetchers/async/test_requests.py CHANGED
@@ -3,7 +3,7 @@ import pytest_httpbin
3
 
4
  from scrapling.fetchers import AsyncFetcher
5
 
6
- AsyncFetcher.auto_match = True
7
 
8
 
9
  @pytest_httpbin.use_class_based_httpbin
 
3
 
4
  from scrapling.fetchers import AsyncFetcher
5
 
6
+ AsyncFetcher.adaptive = True
7
 
8
 
9
  @pytest_httpbin.use_class_based_httpbin
tests/fetchers/sync/test_camoufox.py CHANGED
@@ -3,7 +3,7 @@ import pytest_httpbin
3
 
4
  from scrapling import StealthyFetcher
5
 
6
- StealthyFetcher.auto_match = True
7
 
8
 
9
  @pytest_httpbin.use_class_based_httpbin
 
3
 
4
  from scrapling import StealthyFetcher
5
 
6
+ StealthyFetcher.adaptive = True
7
 
8
 
9
  @pytest_httpbin.use_class_based_httpbin
tests/fetchers/sync/test_dynamic.py CHANGED
@@ -5,7 +5,7 @@ import pytest_httpbin
5
 
6
  from scrapling import DynamicFetcher
7
 
8
- DynamicFetcher.auto_match = True
9
 
10
 
11
  @pytest_httpbin.use_class_based_httpbin
 
5
 
6
  from scrapling import DynamicFetcher
7
 
8
+ DynamicFetcher.adaptive = True
9
 
10
 
11
  @pytest_httpbin.use_class_based_httpbin
tests/fetchers/sync/test_requests.py CHANGED
@@ -3,7 +3,7 @@ import pytest_httpbin
3
 
4
  from scrapling import Fetcher
5
 
6
- Fetcher.auto_match = True
7
 
8
 
9
  @pytest_httpbin.use_class_based_httpbin
 
3
 
4
  from scrapling import Fetcher
5
 
6
+ Fetcher.adaptive = True
7
 
8
 
9
  @pytest_httpbin.use_class_based_httpbin
tests/parser/{test_automatch.py → test_adaptive.py} RENAMED
@@ -2,10 +2,10 @@ import asyncio
2
 
3
  import pytest
4
 
5
- from scrapling import Adaptor
6
 
7
 
8
- class TestParserAutoMatch:
9
  def test_element_relocation(self):
10
  """Test relocating element after structure change"""
11
  original_html = """
@@ -43,13 +43,13 @@ class TestParserAutoMatch:
43
  </div>
44
  """
45
 
46
- old_page = Adaptor(original_html, url="example.com", auto_match=True)
47
- new_page = Adaptor(changed_html, url="example.com", auto_match=True)
48
 
49
  # 'p1' was used as ID and now it's not and all the path elements have changes
50
  # Also at the same time testing auto-match vs combined selectors
51
  _ = old_page.css("#p1, #p2", auto_save=True)[0]
52
- relocated = new_page.css("#p1", auto_match=True)
53
 
54
  assert relocated is not None
55
  assert relocated[0].attrib["data-id"] == "p1"
@@ -97,13 +97,13 @@ class TestParserAutoMatch:
97
  # Simulate async operation
98
  await asyncio.sleep(0.1) # Minimal async operation
99
 
100
- old_page = Adaptor(original_html, url="example.com", auto_match=True)
101
- new_page = Adaptor(changed_html, url="example.com", auto_match=True)
102
 
103
  # 'p1' was used as ID and now it's not and all the path elements have changes
104
  # Also at the same time testing auto-match vs combined selectors
105
  _ = old_page.css("#p1, #p2", auto_save=True)[0]
106
- relocated = new_page.css("#p1", auto_match=True)
107
 
108
  assert relocated is not None
109
  assert relocated[0].attrib["data-id"] == "p1"
 
2
 
3
  import pytest
4
 
5
+ from scrapling import Selector
6
 
7
 
8
+ class TestParserAdaptive:
9
  def test_element_relocation(self):
10
  """Test relocating element after structure change"""
11
  original_html = """
 
43
  </div>
44
  """
45
 
46
+ old_page = Selector(original_html, url="example.com", adaptive=True)
47
+ new_page = Selector(changed_html, url="example.com", adaptive=True)
48
 
49
  # 'p1' was used as ID and now it's not and all the path elements have changes
50
  # Also at the same time testing auto-match vs combined selectors
51
  _ = old_page.css("#p1, #p2", auto_save=True)[0]
52
+ relocated = new_page.css("#p1", adaptive=True)
53
 
54
  assert relocated is not None
55
  assert relocated[0].attrib["data-id"] == "p1"
 
97
  # Simulate async operation
98
  await asyncio.sleep(0.1) # Minimal async operation
99
 
100
+ old_page = Selector(original_html, url="example.com", adaptive=True)
101
+ new_page = Selector(changed_html, url="example.com", adaptive=True)
102
 
103
  # 'p1' was used as ID and now it's not and all the path elements have changes
104
  # Also at the same time testing auto-match vs combined selectors
105
  _ = old_page.css("#p1, #p2", auto_save=True)[0]
106
+ relocated = new_page.css("#p1", adaptive=True)
107
 
108
  assert relocated is not None
109
  assert relocated[0].attrib["data-id"] == "p1"
tests/parser/test_general.py CHANGED
@@ -4,7 +4,7 @@ import time
4
  import pytest
5
  from cssselect import SelectorError, SelectorSyntaxError
6
 
7
- from scrapling import Adaptor
8
 
9
 
10
  @pytest.fixture
@@ -78,7 +78,7 @@ def html_content():
78
 
79
  @pytest.fixture
80
  def page(html_content):
81
- return Adaptor(html_content, auto_match=False)
82
 
83
 
84
  # CSS Selector Tests
@@ -162,26 +162,26 @@ class TestSimilarElements:
162
 
163
  # Error Handling Tests
164
  class TestErrorHandling:
165
- def test_invalid_adaptor_initialization(self):
166
- """Test various invalid Adaptor initializations"""
167
  # No arguments
168
  with pytest.raises(ValueError):
169
- _ = Adaptor(auto_match=False)
170
 
171
  # Invalid argument types
172
  with pytest.raises(TypeError):
173
- _ = Adaptor(root="ayo", auto_match=False)
174
 
175
  with pytest.raises(TypeError):
176
- _ = Adaptor(text=1, auto_match=False)
177
 
178
  with pytest.raises(TypeError):
179
- _ = Adaptor(body=1, auto_match=False)
180
 
181
  def test_invalid_storage(self, page, html_content):
182
  """Test invalid storage parameter"""
183
  with pytest.raises(ValueError):
184
- _ = Adaptor(html_content, storage=object, auto_match=True)
185
 
186
  def test_bad_selectors(self, page):
187
  """Test handling of invalid selectors"""
@@ -195,7 +195,7 @@ class TestErrorHandling:
195
  # Pickling and Object Representation Tests
196
  class TestPicklingAndRepresentation:
197
  def test_unpickleable_objects(self, page):
198
- """Test that Adaptor objects cannot be pickled"""
199
  table = page.css(".product-list")[0]
200
  with pytest.raises(TypeError):
201
  pickle.dumps(table)
@@ -299,7 +299,7 @@ def test_large_html_parsing_performance():
299
  )
300
 
301
  start_time = time.time()
302
- parsed = Adaptor(large_html, auto_match=False)
303
  elements = parsed.css(".item")
304
  end_time = time.time()
305
 
@@ -315,7 +315,7 @@ def test_large_html_parsing_performance():
315
  def test_selectors_generation(page):
316
  """Try to create selectors for all elements in the page"""
317
 
318
- def _traverse(element: Adaptor):
319
  assert isinstance(element.generate_css_selector, str)
320
  assert isinstance(element.generate_xpath_selector, str)
321
  for branch in element.children:
 
4
  import pytest
5
  from cssselect import SelectorError, SelectorSyntaxError
6
 
7
+ from scrapling import Selector
8
 
9
 
10
  @pytest.fixture
 
78
 
79
  @pytest.fixture
80
  def page(html_content):
81
+ return Selector(html_content, adaptive=False)
82
 
83
 
84
  # CSS Selector Tests
 
162
 
163
  # Error Handling Tests
164
  class TestErrorHandling:
165
+ def test_invalid_selector_initialization(self):
166
+ """Test various invalid Selector initializations"""
167
  # No arguments
168
  with pytest.raises(ValueError):
169
+ _ = Selector(adaptive=False)
170
 
171
  # Invalid argument types
172
  with pytest.raises(TypeError):
173
+ _ = Selector(root="ayo", adaptive=False)
174
 
175
  with pytest.raises(TypeError):
176
+ _ = Selector(text=1, adaptive=False)
177
 
178
  with pytest.raises(TypeError):
179
+ _ = Selector(body=1, adaptive=False)
180
 
181
  def test_invalid_storage(self, page, html_content):
182
  """Test invalid storage parameter"""
183
  with pytest.raises(ValueError):
184
+ _ = Selector(html_content, storage=object, adaptive=True)
185
 
186
  def test_bad_selectors(self, page):
187
  """Test handling of invalid selectors"""
 
195
  # Pickling and Object Representation Tests
196
  class TestPicklingAndRepresentation:
197
  def test_unpickleable_objects(self, page):
198
+ """Test that Selector objects cannot be pickled"""
199
  table = page.css(".product-list")[0]
200
  with pytest.raises(TypeError):
201
  pickle.dumps(table)
 
299
  )
300
 
301
  start_time = time.time()
302
+ parsed = Selector(large_html, adaptive=False)
303
  elements = parsed.css(".item")
304
  end_time = time.time()
305
 
 
315
  def test_selectors_generation(page):
316
  """Try to create selectors for all elements in the page"""
317
 
318
+ def _traverse(element: Selector):
319
  assert isinstance(element.generate_css_selector, str)
320
  assert isinstance(element.generate_xpath_selector, str)
321
  for branch in element.children: