Karim shoair committed on
Commit ·
8e67a4c
1
Parent(s): d4fe1d6
refactor: huge change, many features/class got a better naming
Browse files- `Adaptor` became `Selector`
- `Adaptors` became `Selectors`
- `auto_match` argument/feature became `adaptive`
- `adaptor_arguments` argument became `selector_config`
- `automatch_domain` argument became `adaptive_domain`
- `additional_arguments` argument became `additional_args`
- `storage_adaptors` file became just `storage`
- README.md +5 -5
- benchmarks.py +4 -4
- scrapling/__init__.py +5 -5
- scrapling/core/ai.py +6 -6
- scrapling/core/shell.py +17 -17
- scrapling/core/{storage_adaptors.py → storage.py} +0 -0
- scrapling/engines/_browsers/_camoufox.py +20 -20
- scrapling/engines/_browsers/_controllers.py +10 -10
- scrapling/engines/_browsers/_validators.py +9 -9
- scrapling/engines/static.py +12 -18
- scrapling/engines/toolbelt/convertor.py +1 -1
- scrapling/engines/toolbelt/custom.py +20 -22
- scrapling/fetchers.py +10 -10
- scrapling/parser.py +105 -107
- tests/fetchers/async/test_camoufox.py +1 -1
- tests/fetchers/async/test_dynamic.py +1 -1
- tests/fetchers/async/test_requests.py +1 -1
- tests/fetchers/sync/test_camoufox.py +1 -1
- tests/fetchers/sync/test_dynamic.py +1 -1
- tests/fetchers/sync/test_requests.py +1 -1
- tests/parser/{test_automatch.py → test_adaptive.py} +8 -8
- tests/parser/test_general.py +12 -12
README.md
CHANGED
|
@@ -52,14 +52,14 @@ Scrapling is a high-performance, intelligent web scraping library for Python tha
|
|
| 52 |
|
| 53 |
```python
|
| 54 |
>> from scrapling.fetchers import Fetcher, AsyncFetcher, StealthyFetcher, DynamicFetcher
|
| 55 |
-
>> StealthyFetcher.
|
| 56 |
# Fetch websites' source under the radar!
|
| 57 |
>> page = StealthyFetcher.fetch('https://example.com', headless=True, network_idle=True)
|
| 58 |
>> print(page.status)
|
| 59 |
200
|
| 60 |
>> products = page.css('.product', auto_save=True) # Scrape data that survives website design changes!
|
| 61 |
-
>> # Later, if the website structure changes, pass `
|
| 62 |
-
>> products = page.css('.product',
|
| 63 |
```
|
| 64 |
|
| 65 |
# Sponsors
|
|
@@ -150,7 +150,7 @@ Tired of your PC slowing you down? Can’t keep your machine on 24/7 for scrapin
|
|
| 150 |
```python
|
| 151 |
from scrapling.fetchers import Fetcher
|
| 152 |
|
| 153 |
-
# Do HTTP GET request to a web page and create an
|
| 154 |
page = Fetcher.get('https://quotes.toscrape.com/', stealthy_headers=True)
|
| 155 |
# Get all text content from all HTML tags in the page except the `script` and `style` tags
|
| 156 |
page.get_all_text(ignore_tags=('script', 'style'))
|
|
@@ -219,7 +219,7 @@ Here are the results:
|
|
| 219 |
| Scrapling | 2.51 | 1.0x |
|
| 220 |
| AutoScraper | 11.41 | 4.546x |
|
| 221 |
|
| 222 |
-
Scrapling can find elements with more methods and returns the entire element's `
|
| 223 |
|
| 224 |
As you see, Scrapling is still 4.5 times faster at the same task.
|
| 225 |
|
|
|
|
| 52 |
|
| 53 |
```python
|
| 54 |
>> from scrapling.fetchers import Fetcher, AsyncFetcher, StealthyFetcher, DynamicFetcher
|
| 55 |
+
>> StealthyFetcher.adaptive = True
|
| 56 |
# Fetch websites' source under the radar!
|
| 57 |
>> page = StealthyFetcher.fetch('https://example.com', headless=True, network_idle=True)
|
| 58 |
>> print(page.status)
|
| 59 |
200
|
| 60 |
>> products = page.css('.product', auto_save=True) # Scrape data that survives website design changes!
|
| 61 |
+
>> # Later, if the website structure changes, pass `adaptive=True`
|
| 62 |
+
>> products = page.css('.product', adaptive=True) # and Scrapling still finds them!
|
| 63 |
```
|
| 64 |
|
| 65 |
# Sponsors
|
|
|
|
| 150 |
```python
|
| 151 |
from scrapling.fetchers import Fetcher
|
| 152 |
|
| 153 |
+
# Do an HTTP GET request to a web page and create a Selector instance
|
| 154 |
page = Fetcher.get('https://quotes.toscrape.com/', stealthy_headers=True)
|
| 155 |
# Get all text content from all HTML tags in the page except the `script` and `style` tags
|
| 156 |
page.get_all_text(ignore_tags=('script', 'style'))
|
|
|
|
| 219 |
| Scrapling | 2.51 | 1.0x |
|
| 220 |
| AutoScraper | 11.41 | 4.546x |
|
| 221 |
|
| 222 |
+
Scrapling can find elements with more methods and returns the entire element's `Selector` object, not only text like AutoScraper. So, to make this test fair, both libraries will extract an element with text, find similar elements, and then extract the text content for all of them.
|
| 223 |
|
| 224 |
As you see, Scrapling is still 4.5 times faster at the same task.
|
| 225 |
|
benchmarks.py
CHANGED
|
@@ -12,7 +12,7 @@ from parsel import Selector
|
|
| 12 |
from pyquery import PyQuery as pq
|
| 13 |
from selectolax.parser import HTMLParser
|
| 14 |
|
| 15 |
-
from scrapling import
|
| 16 |
|
| 17 |
large_html = (
|
| 18 |
"<html><body>" + '<div class="item">' * 5000 + "</div>" * 5000 + "</body></html>"
|
|
@@ -73,9 +73,9 @@ def test_pyquery():
|
|
| 73 |
@benchmark
|
| 74 |
def test_scrapling():
|
| 75 |
# No need to do `.extract()` like parsel to extract text
|
| 76 |
-
# Also, this is faster than `[t.text for t in
|
| 77 |
# for obvious reasons, of course.
|
| 78 |
-
return
|
| 79 |
|
| 80 |
|
| 81 |
@benchmark
|
|
@@ -112,7 +112,7 @@ def test_scrapling_text(request_html):
|
|
| 112 |
# Will loop over resulted elements to get text too to make comparison even more fair otherwise Scrapling will be even faster
|
| 113 |
return [
|
| 114 |
element.text
|
| 115 |
-
for element in
|
| 116 |
.find_by_text("Tipping the Velvet", first_match=True)
|
| 117 |
.find_similar(ignore_attributes=["title"])
|
| 118 |
]
|
|
|
|
| 12 |
from pyquery import PyQuery as pq
|
| 13 |
from selectolax.parser import HTMLParser
|
| 14 |
|
| 15 |
+
from scrapling import Selector as ScraplingSelector
|
| 16 |
|
| 17 |
large_html = (
|
| 18 |
"<html><body>" + '<div class="item">' * 5000 + "</div>" * 5000 + "</body></html>"
|
|
|
|
| 73 |
@benchmark
|
| 74 |
def test_scrapling():
|
| 75 |
# No need to do `.extract()` like parsel to extract text
|
| 76 |
+
# Also, this is faster than `[t.text for t in Selector(large_html, adaptive=False).css('.item')]`
|
| 77 |
# for obvious reasons, of course.
|
| 78 |
+
return ScraplingSelector(large_html, adaptive=False).css(".item::text")
|
| 79 |
|
| 80 |
|
| 81 |
@benchmark
|
|
|
|
| 112 |
# Will loop over resulted elements to get text too to make comparison even more fair otherwise Scrapling will be even faster
|
| 113 |
return [
|
| 114 |
element.text
|
| 115 |
+
for element in ScraplingSelector(request_html, adaptive=False)
|
| 116 |
.find_by_text("Tipping the Velvet", first_match=True)
|
| 117 |
.find_similar(ignore_attributes=["title"])
|
| 118 |
]
|
scrapling/__init__.py
CHANGED
|
@@ -10,12 +10,12 @@ def __getattr__(name):
|
|
| 10 |
from scrapling.fetchers import Fetcher as cls
|
| 11 |
|
| 12 |
return cls
|
| 13 |
-
elif name == "
|
| 14 |
-
from scrapling.parser import
|
| 15 |
|
| 16 |
return cls
|
| 17 |
-
elif name == "
|
| 18 |
-
from scrapling.parser import
|
| 19 |
|
| 20 |
return cls
|
| 21 |
elif name == "AttributesHandler":
|
|
@@ -46,4 +46,4 @@ def __getattr__(name):
|
|
| 46 |
raise AttributeError(f"module 'scrapling' has no attribute '{name}'")
|
| 47 |
|
| 48 |
|
| 49 |
-
__all__ = ["
|
|
|
|
| 10 |
from scrapling.fetchers import Fetcher as cls
|
| 11 |
|
| 12 |
return cls
|
| 13 |
+
elif name == "Selector":
|
| 14 |
+
from scrapling.parser import Selector as cls
|
| 15 |
|
| 16 |
return cls
|
| 17 |
+
elif name == "Selectors":
|
| 18 |
+
from scrapling.parser import Selectors as cls
|
| 19 |
|
| 20 |
return cls
|
| 21 |
elif name == "AttributesHandler":
|
|
|
|
| 46 |
raise AttributeError(f"module 'scrapling' has no attribute '{name}'")
|
| 47 |
|
| 48 |
|
| 49 |
+
__all__ = ["Selector", "Fetcher", "AsyncFetcher", "StealthyFetcher", "DynamicFetcher"]
|
scrapling/core/ai.py
CHANGED
|
@@ -430,7 +430,7 @@ class ScraplingMCPServer:
|
|
| 430 |
os_randomize: bool = False,
|
| 431 |
disable_ads: bool = False,
|
| 432 |
geoip: bool = False,
|
| 433 |
-
|
| 434 |
) -> ResponseModel:
|
| 435 |
"""Use Scrapling's version of the Camoufox browser to fetch a URL and return a structured output of the result.
|
| 436 |
Note: This is best suitable for high protection levels. It's slower than the other tools.
|
|
@@ -467,7 +467,7 @@ class ScraplingMCPServer:
|
|
| 467 |
:param google_search: Enabled by default, Scrapling will set the referer header to be as if this request came from a Google search of this website's domain name.
|
| 468 |
:param extra_headers: A dictionary of extra headers to add to the request. _The referer set by the `google_search` argument takes priority over the referer set here if used together._
|
| 469 |
:param proxy: The proxy to be used with requests, it can be a string or a dictionary with the keys 'server', 'username', and 'password' only.
|
| 470 |
-
:param
|
| 471 |
"""
|
| 472 |
page = await StealthyFetcher.async_fetch(
|
| 473 |
url,
|
|
@@ -491,7 +491,7 @@ class ScraplingMCPServer:
|
|
| 491 |
solve_cloudflare=solve_cloudflare,
|
| 492 |
disable_resources=disable_resources,
|
| 493 |
wait_selector_state=wait_selector_state,
|
| 494 |
-
|
| 495 |
)
|
| 496 |
return _ContentTranslator(
|
| 497 |
Convertor._extract_content(
|
|
@@ -530,7 +530,7 @@ class ScraplingMCPServer:
|
|
| 530 |
os_randomize: bool = False,
|
| 531 |
disable_ads: bool = False,
|
| 532 |
geoip: bool = False,
|
| 533 |
-
|
| 534 |
) -> List[ResponseModel]:
|
| 535 |
"""Use Scrapling's version of the Camoufox browser to fetch a group of URLs at the same time, and for each page return a structured output of the result.
|
| 536 |
Note: This is best suitable for high protection levels. It's slower than the other tools.
|
|
@@ -567,7 +567,7 @@ class ScraplingMCPServer:
|
|
| 567 |
:param google_search: Enabled by default, Scrapling will set the referer header to be as if this request came from a Google search of this website's domain name.
|
| 568 |
:param extra_headers: A dictionary of extra headers to add to the request. _The referer set by the `google_search` argument takes priority over the referer set here if used together._
|
| 569 |
:param proxy: The proxy to be used with requests, it can be a string or a dictionary with the keys 'server', 'username', and 'password' only.
|
| 570 |
-
:param
|
| 571 |
"""
|
| 572 |
async with AsyncStealthySession(
|
| 573 |
wait=wait,
|
|
@@ -591,7 +591,7 @@ class ScraplingMCPServer:
|
|
| 591 |
solve_cloudflare=solve_cloudflare,
|
| 592 |
disable_resources=disable_resources,
|
| 593 |
wait_selector_state=wait_selector_state,
|
| 594 |
-
|
| 595 |
) as session:
|
| 596 |
tasks = [session.fetch(url) for url in urls]
|
| 597 |
responses = await gather(*tasks)
|
|
|
|
| 430 |
os_randomize: bool = False,
|
| 431 |
disable_ads: bool = False,
|
| 432 |
geoip: bool = False,
|
| 433 |
+
additional_args: Optional[Dict] = None,
|
| 434 |
) -> ResponseModel:
|
| 435 |
"""Use Scrapling's version of the Camoufox browser to fetch a URL and return a structured output of the result.
|
| 436 |
Note: This is best suitable for high protection levels. It's slower than the other tools.
|
|
|
|
| 467 |
:param google_search: Enabled by default, Scrapling will set the referer header to be as if this request came from a Google search of this website's domain name.
|
| 468 |
:param extra_headers: A dictionary of extra headers to add to the request. _The referer set by the `google_search` argument takes priority over the referer set here if used together._
|
| 469 |
:param proxy: The proxy to be used with requests, it can be a string or a dictionary with the keys 'server', 'username', and 'password' only.
|
| 470 |
+
:param additional_args: Additional arguments to be passed to Camoufox as additional settings, and it takes higher priority than Scrapling's settings.
|
| 471 |
"""
|
| 472 |
page = await StealthyFetcher.async_fetch(
|
| 473 |
url,
|
|
|
|
| 491 |
solve_cloudflare=solve_cloudflare,
|
| 492 |
disable_resources=disable_resources,
|
| 493 |
wait_selector_state=wait_selector_state,
|
| 494 |
+
additional_args=additional_args,
|
| 495 |
)
|
| 496 |
return _ContentTranslator(
|
| 497 |
Convertor._extract_content(
|
|
|
|
| 530 |
os_randomize: bool = False,
|
| 531 |
disable_ads: bool = False,
|
| 532 |
geoip: bool = False,
|
| 533 |
+
additional_args: Optional[Dict] = None,
|
| 534 |
) -> List[ResponseModel]:
|
| 535 |
"""Use Scrapling's version of the Camoufox browser to fetch a group of URLs at the same time, and for each page return a structured output of the result.
|
| 536 |
Note: This is best suitable for high protection levels. It's slower than the other tools.
|
|
|
|
| 567 |
:param google_search: Enabled by default, Scrapling will set the referer header to be as if this request came from a Google search of this website's domain name.
|
| 568 |
:param extra_headers: A dictionary of extra headers to add to the request. _The referer set by the `google_search` argument takes priority over the referer set here if used together._
|
| 569 |
:param proxy: The proxy to be used with requests, it can be a string or a dictionary with the keys 'server', 'username', and 'password' only.
|
| 570 |
+
:param additional_args: Additional arguments to be passed to Camoufox as additional settings, and it takes higher priority than Scrapling's settings.
|
| 571 |
"""
|
| 572 |
async with AsyncStealthySession(
|
| 573 |
wait=wait,
|
|
|
|
| 591 |
solve_cloudflare=solve_cloudflare,
|
| 592 |
disable_resources=disable_resources,
|
| 593 |
wait_selector_state=wait_selector_state,
|
| 594 |
+
additional_args=additional_args,
|
| 595 |
) as session:
|
| 596 |
tasks = [session.fetch(url) for url in urls]
|
| 597 |
responses = await gather(*tasks)
|
scrapling/core/shell.py
CHANGED
|
@@ -27,7 +27,7 @@ from orjson import loads as json_loads, JSONDecodeError
|
|
| 27 |
from scrapling import __version__
|
| 28 |
from scrapling.core.custom_types import TextHandler
|
| 29 |
from scrapling.core.utils import log
|
| 30 |
-
from scrapling.parser import
|
| 31 |
from scrapling.core._types import (
|
| 32 |
List,
|
| 33 |
Optional,
|
|
@@ -399,9 +399,9 @@ class CurlParser:
|
|
| 399 |
return None
|
| 400 |
|
| 401 |
|
| 402 |
-
def show_page_in_browser(page:
|
| 403 |
-
if not page or not isinstance(page,
|
| 404 |
-
log.error("Input must be of type `
|
| 405 |
return
|
| 406 |
|
| 407 |
try:
|
|
@@ -421,7 +421,7 @@ class CustomShell:
|
|
| 421 |
def __init__(self, code, log_level="debug"):
|
| 422 |
self.code = code
|
| 423 |
self.page = None
|
| 424 |
-
self.pages =
|
| 425 |
self._curl_parser = CurlParser()
|
| 426 |
log_level = log_level.strip().lower()
|
| 427 |
|
|
@@ -457,7 +457,7 @@ class CustomShell:
|
|
| 457 |
- Fetcher/AsyncFetcher
|
| 458 |
- DynamicFetcher
|
| 459 |
- StealthyFetcher
|
| 460 |
-
-
|
| 461 |
|
| 462 |
-> Useful shortcuts:
|
| 463 |
- {"get":<30} Shortcut for `Fetcher.get`
|
|
@@ -469,7 +469,7 @@ class CustomShell:
|
|
| 469 |
|
| 470 |
-> Useful commands
|
| 471 |
- {"page / response":<30} The response object of the last page you fetched
|
| 472 |
-
- {"pages":<30}
|
| 473 |
- {"uncurl('curl_command')":<30} Convert curl command to a Request object. (Optimized to handle curl commands copied from DevTools network tab.)
|
| 474 |
- {"curl2fetcher('curl_command')":<30} Convert curl command and make the request with Fetcher. (Optimized to handle curl commands copied from DevTools network tab.)
|
| 475 |
- {"view(page)":<30} View page in a browser
|
|
@@ -481,7 +481,7 @@ Type 'exit' or press Ctrl+D to exit.
|
|
| 481 |
def update_page(self, result):
|
| 482 |
"""Update the current page and add to pages history"""
|
| 483 |
self.page = result
|
| 484 |
-
if isinstance(result, (Response,
|
| 485 |
self.pages.append(result)
|
| 486 |
if len(self.pages) > 5:
|
| 487 |
self.pages.pop(0) # Remove oldest item
|
|
@@ -528,7 +528,7 @@ Type 'exit' or press Ctrl+D to exit.
|
|
| 528 |
"DynamicFetcher": DynamicFetcher,
|
| 529 |
"stealthy_fetch": stealthy_fetch,
|
| 530 |
"StealthyFetcher": StealthyFetcher,
|
| 531 |
-
"
|
| 532 |
"page": self.page,
|
| 533 |
"response": self.page,
|
| 534 |
"pages": self.pages,
|
|
@@ -586,14 +586,14 @@ class Convertor:
|
|
| 586 |
@classmethod
|
| 587 |
def _extract_content(
|
| 588 |
cls,
|
| 589 |
-
page:
|
| 590 |
extraction_type: extraction_types = "markdown",
|
| 591 |
css_selector: Optional[str] = None,
|
| 592 |
main_content_only: bool = False,
|
| 593 |
) -> Generator[str, None, None]:
|
| 594 |
-
"""Extract the content of an
|
| 595 |
-
if not page or not isinstance(page,
|
| 596 |
-
raise TypeError("Input must be of type `
|
| 597 |
elif not extraction_type or extraction_type not in cls._extension_map.values():
|
| 598 |
raise ValueError(f"Unknown extraction type: {extraction_type}")
|
| 599 |
else:
|
|
@@ -622,11 +622,11 @@ class Convertor:
|
|
| 622 |
|
| 623 |
@classmethod
|
| 624 |
def write_content_to_file(
|
| 625 |
-
cls, page:
|
| 626 |
) -> None:
|
| 627 |
-
"""Write an
|
| 628 |
-
if not page or not isinstance(page,
|
| 629 |
-
raise TypeError("Input must be of type `
|
| 630 |
elif not filename or not isinstance(filename, str) or not filename.strip():
|
| 631 |
raise ValueError("Filename must be provided")
|
| 632 |
elif not filename.endswith((".md", ".html", ".txt")):
|
|
|
|
| 27 |
from scrapling import __version__
|
| 28 |
from scrapling.core.custom_types import TextHandler
|
| 29 |
from scrapling.core.utils import log
|
| 30 |
+
from scrapling.parser import Selector, Selectors
|
| 31 |
from scrapling.core._types import (
|
| 32 |
List,
|
| 33 |
Optional,
|
|
|
|
| 399 |
return None
|
| 400 |
|
| 401 |
|
| 402 |
+
def show_page_in_browser(page: Selector):
|
| 403 |
+
if not page or not isinstance(page, Selector):
|
| 404 |
+
log.error("Input must be of type `Selector`")
|
| 405 |
return
|
| 406 |
|
| 407 |
try:
|
|
|
|
| 421 |
def __init__(self, code, log_level="debug"):
|
| 422 |
self.code = code
|
| 423 |
self.page = None
|
| 424 |
+
self.pages = Selectors([])
|
| 425 |
self._curl_parser = CurlParser()
|
| 426 |
log_level = log_level.strip().lower()
|
| 427 |
|
|
|
|
| 457 |
- Fetcher/AsyncFetcher
|
| 458 |
- DynamicFetcher
|
| 459 |
- StealthyFetcher
|
| 460 |
+
- Selector
|
| 461 |
|
| 462 |
-> Useful shortcuts:
|
| 463 |
- {"get":<30} Shortcut for `Fetcher.get`
|
|
|
|
| 469 |
|
| 470 |
-> Useful commands
|
| 471 |
- {"page / response":<30} The response object of the last page you fetched
|
| 472 |
+
- {"pages":<30} Selectors object of the last 5 response objects you fetched
|
| 473 |
- {"uncurl('curl_command')":<30} Convert curl command to a Request object. (Optimized to handle curl commands copied from DevTools network tab.)
|
| 474 |
- {"curl2fetcher('curl_command')":<30} Convert curl command and make the request with Fetcher. (Optimized to handle curl commands copied from DevTools network tab.)
|
| 475 |
- {"view(page)":<30} View page in a browser
|
|
|
|
| 481 |
def update_page(self, result):
|
| 482 |
"""Update the current page and add to pages history"""
|
| 483 |
self.page = result
|
| 484 |
+
if isinstance(result, (Response, Selector)):
|
| 485 |
self.pages.append(result)
|
| 486 |
if len(self.pages) > 5:
|
| 487 |
self.pages.pop(0) # Remove oldest item
|
|
|
|
| 528 |
"DynamicFetcher": DynamicFetcher,
|
| 529 |
"stealthy_fetch": stealthy_fetch,
|
| 530 |
"StealthyFetcher": StealthyFetcher,
|
| 531 |
+
"Selector": Selector,
|
| 532 |
"page": self.page,
|
| 533 |
"response": self.page,
|
| 534 |
"pages": self.pages,
|
|
|
|
| 586 |
@classmethod
|
| 587 |
def _extract_content(
|
| 588 |
cls,
|
| 589 |
+
page: Selector,
|
| 590 |
extraction_type: extraction_types = "markdown",
|
| 591 |
css_selector: Optional[str] = None,
|
| 592 |
main_content_only: bool = False,
|
| 593 |
) -> Generator[str, None, None]:
|
| 594 |
+
"""Extract the content of a Selector"""
|
| 595 |
+
if not page or not isinstance(page, Selector):
|
| 596 |
+
raise TypeError("Input must be of type `Selector`")
|
| 597 |
elif not extraction_type or extraction_type not in cls._extension_map.values():
|
| 598 |
raise ValueError(f"Unknown extraction type: {extraction_type}")
|
| 599 |
else:
|
|
|
|
| 622 |
|
| 623 |
@classmethod
|
| 624 |
def write_content_to_file(
|
| 625 |
+
cls, page: Selector, filename: str, css_selector: Optional[str] = None
|
| 626 |
) -> None:
|
| 627 |
+
"""Write an Selector's content to a file"""
|
| 628 |
+
if not page or not isinstance(page, Selector):
|
| 629 |
+
raise TypeError("Input must be of type `Selector`")
|
| 630 |
elif not filename or not isinstance(filename, str) or not filename.strip():
|
| 631 |
raise ValueError("Filename must be provided")
|
| 632 |
elif not filename.endswith((".md", ".html", ".txt")):
|
scrapling/core/{storage_adaptors.py → storage.py}
RENAMED
|
File without changes
|
scrapling/engines/_browsers/_camoufox.py
CHANGED
|
@@ -70,8 +70,8 @@ class StealthySession:
|
|
| 70 |
"os_randomize",
|
| 71 |
"disable_ads",
|
| 72 |
"geoip",
|
| 73 |
-
"
|
| 74 |
-
"
|
| 75 |
"playwright",
|
| 76 |
"browser",
|
| 77 |
"context",
|
|
@@ -105,8 +105,8 @@ class StealthySession:
|
|
| 105 |
os_randomize: bool = False,
|
| 106 |
disable_ads: bool = False,
|
| 107 |
geoip: bool = False,
|
| 108 |
-
|
| 109 |
-
|
| 110 |
):
|
| 111 |
"""A Browser session manager with page pooling
|
| 112 |
|
|
@@ -136,8 +136,8 @@ class StealthySession:
|
|
| 136 |
:param extra_headers: A dictionary of extra headers to add to the request. _The referer set by the `google_search` argument takes priority over the referer set here if used together._
|
| 137 |
:param proxy: The proxy to be used with requests, it can be a string or a dictionary with the keys 'server', 'username', and 'password' only.
|
| 138 |
:param max_pages: The maximum number of tabs to be opened at the same time. It will be used in rotation through a PagePool.
|
| 139 |
-
:param
|
| 140 |
-
:param
|
| 141 |
"""
|
| 142 |
|
| 143 |
params = {
|
|
@@ -163,8 +163,8 @@ class StealthySession:
|
|
| 163 |
"os_randomize": os_randomize,
|
| 164 |
"disable_ads": disable_ads,
|
| 165 |
"geoip": geoip,
|
| 166 |
-
"
|
| 167 |
-
"
|
| 168 |
}
|
| 169 |
config = validate(params, CamoufoxConfig)
|
| 170 |
|
|
@@ -190,14 +190,14 @@ class StealthySession:
|
|
| 190 |
self.os_randomize = config.os_randomize
|
| 191 |
self.disable_ads = config.disable_ads
|
| 192 |
self.geoip = config.geoip
|
| 193 |
-
self.
|
| 194 |
-
self.
|
| 195 |
|
| 196 |
self.playwright: Optional[Playwright] = None
|
| 197 |
self.context: Optional[BrowserContext] = None
|
| 198 |
self.page_pool = PagePool(self.max_pages)
|
| 199 |
self._closed = False
|
| 200 |
-
self.
|
| 201 |
self.page_action = config.page_action
|
| 202 |
self._headers_keys = (
|
| 203 |
set(map(str.lower, self.extra_headers.keys()))
|
|
@@ -223,7 +223,7 @@ class StealthySession:
|
|
| 223 |
"block_images": self.block_images, # Careful! it makes some websites don't finish loading at all like stackoverflow even in headful mode.
|
| 224 |
"os": None if self.os_randomize else get_os_name(),
|
| 225 |
"user_data_dir": "",
|
| 226 |
-
**self.
|
| 227 |
}
|
| 228 |
)
|
| 229 |
|
|
@@ -433,7 +433,7 @@ class StealthySession:
|
|
| 433 |
|
| 434 |
page_info.page.wait_for_timeout(self.wait)
|
| 435 |
response = ResponseFactory.from_playwright_response(
|
| 436 |
-
page_info.page, first_response, final_response, self.
|
| 437 |
)
|
| 438 |
|
| 439 |
# Mark the page as ready for next use
|
|
@@ -482,8 +482,8 @@ class AsyncStealthySession(StealthySession):
|
|
| 482 |
os_randomize: bool = False,
|
| 483 |
disable_ads: bool = False,
|
| 484 |
geoip: bool = False,
|
| 485 |
-
|
| 486 |
-
|
| 487 |
):
|
| 488 |
"""A Browser session manager with page pooling
|
| 489 |
|
|
@@ -513,8 +513,8 @@ class AsyncStealthySession(StealthySession):
|
|
| 513 |
:param extra_headers: A dictionary of extra headers to add to the request. _The referer set by the `google_search` argument takes priority over the referer set here if used together._
|
| 514 |
:param proxy: The proxy to be used with requests, it can be a string or a dictionary with the keys 'server', 'username', and 'password' only.
|
| 515 |
:param max_pages: The maximum number of tabs to be opened at the same time. It will be used in rotation through a PagePool.
|
| 516 |
-
:param
|
| 517 |
-
:param
|
| 518 |
"""
|
| 519 |
super().__init__(
|
| 520 |
max_pages,
|
|
@@ -539,8 +539,8 @@ class AsyncStealthySession(StealthySession):
|
|
| 539 |
os_randomize,
|
| 540 |
disable_ads,
|
| 541 |
geoip,
|
| 542 |
-
|
| 543 |
-
|
| 544 |
)
|
| 545 |
self.playwright: Optional[AsyncPlaywright] = None
|
| 546 |
self.context: Optional[AsyncBrowserContext] = None
|
|
@@ -731,7 +731,7 @@ class AsyncStealthySession(StealthySession):
|
|
| 731 |
|
| 732 |
# Create response object
|
| 733 |
response = await ResponseFactory.from_async_playwright_response(
|
| 734 |
-
page_info.page, first_response, final_response, self.
|
| 735 |
)
|
| 736 |
|
| 737 |
# Mark the page as ready for next use
|
|
|
|
| 70 |
"os_randomize",
|
| 71 |
"disable_ads",
|
| 72 |
"geoip",
|
| 73 |
+
"selector_config",
|
| 74 |
+
"additional_args",
|
| 75 |
"playwright",
|
| 76 |
"browser",
|
| 77 |
"context",
|
|
|
|
| 105 |
os_randomize: bool = False,
|
| 106 |
disable_ads: bool = False,
|
| 107 |
geoip: bool = False,
|
| 108 |
+
selector_config: Optional[Dict] = None,
|
| 109 |
+
additional_args: Optional[Dict] = None,
|
| 110 |
):
|
| 111 |
"""A Browser session manager with page pooling
|
| 112 |
|
|
|
|
| 136 |
:param extra_headers: A dictionary of extra headers to add to the request. _The referer set by the `google_search` argument takes priority over the referer set here if used together._
|
| 137 |
:param proxy: The proxy to be used with requests, it can be a string or a dictionary with the keys 'server', 'username', and 'password' only.
|
| 138 |
:param max_pages: The maximum number of tabs to be opened at the same time. It will be used in rotation through a PagePool.
|
| 139 |
+
:param selector_config: The arguments that will be passed in the end while creating the final Selector's class.
|
| 140 |
+
:param additional_args: Additional arguments to be passed to Camoufox as additional settings, and it takes higher priority than Scrapling's settings.
|
| 141 |
"""
|
| 142 |
|
| 143 |
params = {
|
|
|
|
| 163 |
"os_randomize": os_randomize,
|
| 164 |
"disable_ads": disable_ads,
|
| 165 |
"geoip": geoip,
|
| 166 |
+
"selector_config": selector_config,
|
| 167 |
+
"additional_args": additional_args,
|
| 168 |
}
|
| 169 |
config = validate(params, CamoufoxConfig)
|
| 170 |
|
|
|
|
| 190 |
self.os_randomize = config.os_randomize
|
| 191 |
self.disable_ads = config.disable_ads
|
| 192 |
self.geoip = config.geoip
|
| 193 |
+
self.selector_config = config.selector_config
|
| 194 |
+
self.additional_args = config.additional_args
|
| 195 |
|
| 196 |
self.playwright: Optional[Playwright] = None
|
| 197 |
self.context: Optional[BrowserContext] = None
|
| 198 |
self.page_pool = PagePool(self.max_pages)
|
| 199 |
self._closed = False
|
| 200 |
+
self.selector_config = config.selector_config
|
| 201 |
self.page_action = config.page_action
|
| 202 |
self._headers_keys = (
|
| 203 |
set(map(str.lower, self.extra_headers.keys()))
|
|
|
|
| 223 |
"block_images": self.block_images, # Careful! it makes some websites don't finish loading at all like stackoverflow even in headful mode.
|
| 224 |
"os": None if self.os_randomize else get_os_name(),
|
| 225 |
"user_data_dir": "",
|
| 226 |
+
**self.additional_args,
|
| 227 |
}
|
| 228 |
)
|
| 229 |
|
|
|
|
| 433 |
|
| 434 |
page_info.page.wait_for_timeout(self.wait)
|
| 435 |
response = ResponseFactory.from_playwright_response(
|
| 436 |
+
page_info.page, first_response, final_response, self.selector_config
|
| 437 |
)
|
| 438 |
|
| 439 |
# Mark the page as ready for next use
|
|
|
|
| 482 |
os_randomize: bool = False,
|
| 483 |
disable_ads: bool = False,
|
| 484 |
geoip: bool = False,
|
| 485 |
+
selector_config: Optional[Dict] = None,
|
| 486 |
+
additional_args: Optional[Dict] = None,
|
| 487 |
):
|
| 488 |
"""A Browser session manager with page pooling
|
| 489 |
|
|
|
|
| 513 |
:param extra_headers: A dictionary of extra headers to add to the request. _The referer set by the `google_search` argument takes priority over the referer set here if used together._
|
| 514 |
:param proxy: The proxy to be used with requests, it can be a string or a dictionary with the keys 'server', 'username', and 'password' only.
|
| 515 |
:param max_pages: The maximum number of tabs to be opened at the same time. It will be used in rotation through a PagePool.
|
| 516 |
+
:param selector_config: The arguments that will be passed in the end while creating the final Selector's class.
|
| 517 |
+
:param additional_args: Additional arguments to be passed to Camoufox as additional settings, and it takes higher priority than Scrapling's settings.
|
| 518 |
"""
|
| 519 |
super().__init__(
|
| 520 |
max_pages,
|
|
|
|
| 539 |
os_randomize,
|
| 540 |
disable_ads,
|
| 541 |
geoip,
|
| 542 |
+
selector_config,
|
| 543 |
+
additional_args,
|
| 544 |
)
|
| 545 |
self.playwright: Optional[AsyncPlaywright] = None
|
| 546 |
self.context: Optional[AsyncBrowserContext] = None
|
|
|
|
| 731 |
|
| 732 |
# Create response object
|
| 733 |
response = await ResponseFactory.from_async_playwright_response(
|
| 734 |
+
page_info.page, first_response, final_response, self.selector_config
|
| 735 |
)
|
| 736 |
|
| 737 |
# Mark the page as ready for next use
|
scrapling/engines/_browsers/_controllers.py
CHANGED
|
@@ -70,7 +70,7 @@ class DynamicSession:
|
|
| 70 |
"context",
|
| 71 |
"page_pool",
|
| 72 |
"_closed",
|
| 73 |
-
"
|
| 74 |
"page_action",
|
| 75 |
"launch_options",
|
| 76 |
"context_options",
|
|
@@ -100,7 +100,7 @@ class DynamicSession:
|
|
| 100 |
cookies: Optional[List[Dict]] = None,
|
| 101 |
network_idle: bool = False,
|
| 102 |
wait_selector_state: SelectorWaitStates = "attached",
|
| 103 |
-
|
| 104 |
):
|
| 105 |
"""A Browser session manager with page pooling, it's using a persistent browser Context by default with a temporary user profile directory.
|
| 106 |
|
|
@@ -125,7 +125,7 @@ class DynamicSession:
|
|
| 125 |
:param google_search: Enabled by default, Scrapling will set the referer header to be as if this request came from a Google search of this website's domain name.
|
| 126 |
:param extra_headers: A dictionary of extra headers to add to the request. _The referer set by the `google_search` argument takes priority over the referer set here if used together._
|
| 127 |
:param proxy: The proxy to be used with requests, it can be a string or a dictionary with the keys 'server', 'username', and 'password' only.
|
| 128 |
-
:param
|
| 129 |
"""
|
| 130 |
|
| 131 |
params = {
|
|
@@ -143,7 +143,7 @@ class DynamicSession:
|
|
| 143 |
"extra_headers": extra_headers,
|
| 144 |
"useragent": useragent,
|
| 145 |
"timeout": timeout,
|
| 146 |
-
"
|
| 147 |
"disable_resources": disable_resources,
|
| 148 |
"wait_selector": wait_selector,
|
| 149 |
"cookies": cookies,
|
|
@@ -177,7 +177,7 @@ class DynamicSession:
|
|
| 177 |
self.context: Optional[BrowserContext] = None
|
| 178 |
self.page_pool = PagePool(self.max_pages)
|
| 179 |
self._closed = False
|
| 180 |
-
self.
|
| 181 |
self.page_action = config.page_action
|
| 182 |
self._headers_keys = (
|
| 183 |
set(map(str.lower, self.extra_headers.keys()))
|
|
@@ -370,7 +370,7 @@ class DynamicSession:
|
|
| 370 |
|
| 371 |
# Create response object
|
| 372 |
response = ResponseFactory.from_playwright_response(
|
| 373 |
-
page_info.page, first_response, final_response, self.
|
| 374 |
)
|
| 375 |
|
| 376 |
# Mark the page as ready for next use
|
|
@@ -417,7 +417,7 @@ class AsyncDynamicSession(DynamicSession):
|
|
| 417 |
cookies: Optional[List[Dict]] = None,
|
| 418 |
network_idle: bool = False,
|
| 419 |
wait_selector_state: SelectorWaitStates = "attached",
|
| 420 |
-
|
| 421 |
):
|
| 422 |
"""A Browser session manager with page pooling
|
| 423 |
|
|
@@ -443,7 +443,7 @@ class AsyncDynamicSession(DynamicSession):
|
|
| 443 |
:param extra_headers: A dictionary of extra headers to add to the request. _The referer set by the `google_search` argument takes priority over the referer set here if used together._
|
| 444 |
:param proxy: The proxy to be used with requests, it can be a string or a dictionary with the keys 'server', 'username', and 'password' only.
|
| 445 |
:param max_pages: The maximum number of tabs to be opened at the same time. It will be used in rotation through a PagePool.
|
| 446 |
-
:param
|
| 447 |
"""
|
| 448 |
|
| 449 |
super().__init__(
|
|
@@ -467,7 +467,7 @@ class AsyncDynamicSession(DynamicSession):
|
|
| 467 |
cookies,
|
| 468 |
network_idle,
|
| 469 |
wait_selector_state,
|
| 470 |
-
|
| 471 |
)
|
| 472 |
|
| 473 |
self.playwright: Optional[AsyncPlaywright] = None
|
|
@@ -623,7 +623,7 @@ class AsyncDynamicSession(DynamicSession):
|
|
| 623 |
|
| 624 |
# Create response object
|
| 625 |
response = await ResponseFactory.from_async_playwright_response(
|
| 626 |
-
page_info.page, first_response, final_response, self.
|
| 627 |
)
|
| 628 |
|
| 629 |
# Mark the page as ready for next use
|
|
|
|
| 70 |
"context",
|
| 71 |
"page_pool",
|
| 72 |
"_closed",
|
| 73 |
+
"selector_config",
|
| 74 |
"page_action",
|
| 75 |
"launch_options",
|
| 76 |
"context_options",
|
|
|
|
| 100 |
cookies: Optional[List[Dict]] = None,
|
| 101 |
network_idle: bool = False,
|
| 102 |
wait_selector_state: SelectorWaitStates = "attached",
|
| 103 |
+
selector_config: Optional[Dict] = None,
|
| 104 |
):
|
| 105 |
"""A Browser session manager with page pooling, it's using a persistent browser Context by default with a temporary user profile directory.
|
| 106 |
|
|
|
|
| 125 |
:param google_search: Enabled by default, Scrapling will set the referer header to be as if this request came from a Google search of this website's domain name.
|
| 126 |
:param extra_headers: A dictionary of extra headers to add to the request. _The referer set by the `google_search` argument takes priority over the referer set here if used together._
|
| 127 |
:param proxy: The proxy to be used with requests, it can be a string or a dictionary with the keys 'server', 'username', and 'password' only.
|
| 128 |
+
:param selector_config: The arguments that will be passed in the end while creating the final Selector's class.
|
| 129 |
"""
|
| 130 |
|
| 131 |
params = {
|
|
|
|
| 143 |
"extra_headers": extra_headers,
|
| 144 |
"useragent": useragent,
|
| 145 |
"timeout": timeout,
|
| 146 |
+
"selector_config": selector_config,
|
| 147 |
"disable_resources": disable_resources,
|
| 148 |
"wait_selector": wait_selector,
|
| 149 |
"cookies": cookies,
|
|
|
|
| 177 |
self.context: Optional[BrowserContext] = None
|
| 178 |
self.page_pool = PagePool(self.max_pages)
|
| 179 |
self._closed = False
|
| 180 |
+
self.selector_config = config.selector_config
|
| 181 |
self.page_action = config.page_action
|
| 182 |
self._headers_keys = (
|
| 183 |
set(map(str.lower, self.extra_headers.keys()))
|
|
|
|
| 370 |
|
| 371 |
# Create response object
|
| 372 |
response = ResponseFactory.from_playwright_response(
|
| 373 |
+
page_info.page, first_response, final_response, self.selector_config
|
| 374 |
)
|
| 375 |
|
| 376 |
# Mark the page as ready for next use
|
|
|
|
| 417 |
cookies: Optional[List[Dict]] = None,
|
| 418 |
network_idle: bool = False,
|
| 419 |
wait_selector_state: SelectorWaitStates = "attached",
|
| 420 |
+
selector_config: Optional[Dict] = None,
|
| 421 |
):
|
| 422 |
"""A Browser session manager with page pooling
|
| 423 |
|
|
|
|
| 443 |
:param extra_headers: A dictionary of extra headers to add to the request. _The referer set by the `google_search` argument takes priority over the referer set here if used together._
|
| 444 |
:param proxy: The proxy to be used with requests, it can be a string or a dictionary with the keys 'server', 'username', and 'password' only.
|
| 445 |
:param max_pages: The maximum number of tabs to be opened at the same time. It will be used in rotation through a PagePool.
|
| 446 |
+
:param selector_config: The arguments that will be passed in the end while creating the final Selector's class.
|
| 447 |
"""
|
| 448 |
|
| 449 |
super().__init__(
|
|
|
|
| 467 |
cookies,
|
| 468 |
network_idle,
|
| 469 |
wait_selector_state,
|
| 470 |
+
selector_config,
|
| 471 |
)
|
| 472 |
|
| 473 |
self.playwright: Optional[AsyncPlaywright] = None
|
|
|
|
| 623 |
|
| 624 |
# Create response object
|
| 625 |
response = await ResponseFactory.from_async_playwright_response(
|
| 626 |
+
page_info.page, first_response, final_response, self.selector_config
|
| 627 |
)
|
| 628 |
|
| 629 |
# Mark the page as ready for next use
|
scrapling/engines/_browsers/_validators.py
CHANGED
|
@@ -39,7 +39,7 @@ class PlaywrightConfig(Struct, kw_only=True, frozen=False):
|
|
| 39 |
cookies: Optional[List[Dict]] = None
|
| 40 |
network_idle: bool = False
|
| 41 |
wait_selector_state: SelectorWaitStates = "attached"
|
| 42 |
-
|
| 43 |
|
| 44 |
def __post_init__(self):
|
| 45 |
"""Custom validation after msgspec validation"""
|
|
@@ -57,8 +57,8 @@ class PlaywrightConfig(Struct, kw_only=True, frozen=False):
|
|
| 57 |
self.__validate_cdp(self.cdp_url)
|
| 58 |
if not self.cookies:
|
| 59 |
self.cookies = []
|
| 60 |
-
if not self.
|
| 61 |
-
self.
|
| 62 |
|
| 63 |
@staticmethod
|
| 64 |
def __validate_cdp(cdp_url):
|
|
@@ -105,8 +105,8 @@ class CamoufoxConfig(Struct, kw_only=True, frozen=False):
|
|
| 105 |
os_randomize: bool = False
|
| 106 |
disable_ads: bool = False
|
| 107 |
geoip: bool = False
|
| 108 |
-
|
| 109 |
-
|
| 110 |
|
| 111 |
def __post_init__(self):
|
| 112 |
"""Custom validation after msgspec validation"""
|
|
@@ -136,10 +136,10 @@ class CamoufoxConfig(Struct, kw_only=True, frozen=False):
|
|
| 136 |
self.cookies = []
|
| 137 |
if self.solve_cloudflare and self.timeout < 60_000:
|
| 138 |
self.timeout = 60_000
|
| 139 |
-
if not self.
|
| 140 |
-
self.
|
| 141 |
-
if not self.
|
| 142 |
-
self.
|
| 143 |
|
| 144 |
|
| 145 |
def validate(params, model):
|
|
|
|
| 39 |
cookies: Optional[List[Dict]] = None
|
| 40 |
network_idle: bool = False
|
| 41 |
wait_selector_state: SelectorWaitStates = "attached"
|
| 42 |
+
selector_config: Optional[Dict] = None
|
| 43 |
|
| 44 |
def __post_init__(self):
|
| 45 |
"""Custom validation after msgspec validation"""
|
|
|
|
| 57 |
self.__validate_cdp(self.cdp_url)
|
| 58 |
if not self.cookies:
|
| 59 |
self.cookies = []
|
| 60 |
+
if not self.selector_config:
|
| 61 |
+
self.selector_config = {}
|
| 62 |
|
| 63 |
@staticmethod
|
| 64 |
def __validate_cdp(cdp_url):
|
|
|
|
| 105 |
os_randomize: bool = False
|
| 106 |
disable_ads: bool = False
|
| 107 |
geoip: bool = False
|
| 108 |
+
selector_config: Optional[Dict] = None
|
| 109 |
+
additional_args: Optional[Dict] = None
|
| 110 |
|
| 111 |
def __post_init__(self):
|
| 112 |
"""Custom validation after msgspec validation"""
|
|
|
|
| 136 |
self.cookies = []
|
| 137 |
if self.solve_cloudflare and self.timeout < 60_000:
|
| 138 |
self.timeout = 60_000
|
| 139 |
+
if not self.selector_config:
|
| 140 |
+
self.selector_config = {}
|
| 141 |
+
if not self.additional_args:
|
| 142 |
+
self.additional_args = {}
|
| 143 |
|
| 144 |
|
| 145 |
def validate(params, model):
|
scrapling/engines/static.py
CHANGED
|
@@ -63,7 +63,7 @@ class FetcherSession:
|
|
| 63 |
max_redirects: int = 30,
|
| 64 |
verify: bool = True,
|
| 65 |
cert: Optional[Union[str, Tuple[str, str]]] = None,
|
| 66 |
-
|
| 67 |
):
|
| 68 |
"""
|
| 69 |
:param impersonate: Browser version to impersonate. Automatically defaults to the latest available Chrome version.
|
|
@@ -81,7 +81,7 @@ class FetcherSession:
|
|
| 81 |
:param max_redirects: Maximum number of redirects. Default 30, use -1 for unlimited.
|
| 82 |
:param verify: Whether to verify HTTPS certificates. Defaults to True.
|
| 83 |
:param cert: Tuple of (cert, key) filenames for the client certificate.
|
| 84 |
-
:param
|
| 85 |
"""
|
| 86 |
self.default_impersonate = impersonate
|
| 87 |
self.stealth = stealthy_headers
|
|
@@ -97,7 +97,7 @@ class FetcherSession:
|
|
| 97 |
self.default_verify = verify
|
| 98 |
self.default_cert = cert
|
| 99 |
self.default_http3 = http3
|
| 100 |
-
self.
|
| 101 |
|
| 102 |
self._curl_session: Optional[CurlSession] = None
|
| 103 |
self._async_curl_session: Optional[AsyncCurlSession] = None
|
|
@@ -260,7 +260,7 @@ class FetcherSession:
|
|
| 260 |
request_args: Dict[str, Any],
|
| 261 |
max_retries: int,
|
| 262 |
retry_delay: int,
|
| 263 |
-
|
| 264 |
) -> Response:
|
| 265 |
"""
|
| 266 |
Perform an HTTP request using the configured session.
|
|
@@ -270,7 +270,7 @@ class FetcherSession:
|
|
| 270 |
:param request_args: Arguments to be passed to the session's `request()` method.
|
| 271 |
:param max_retries: Maximum number of retries for the request.
|
| 272 |
:param retry_delay: Number of seconds to wait between retries.
|
| 273 |
-
:param
|
| 274 |
:return: A `Response` object for synchronous requests or an awaitable for asynchronous.
|
| 275 |
"""
|
| 276 |
session = self._curl_session
|
|
@@ -286,9 +286,7 @@ class FetcherSession:
|
|
| 286 |
try:
|
| 287 |
response = session.request(method, **request_args)
|
| 288 |
# response.raise_for_status() # Retry responses with a status code between 200-400
|
| 289 |
-
return ResponseFactory.from_http_request(
|
| 290 |
-
response, adaptor_arguments
|
| 291 |
-
)
|
| 292 |
except CurlError as e:
|
| 293 |
if attempt < max_retries - 1:
|
| 294 |
log.error(
|
|
@@ -307,7 +305,7 @@ class FetcherSession:
|
|
| 307 |
request_args: Dict[str, Any],
|
| 308 |
max_retries: int,
|
| 309 |
retry_delay: int,
|
| 310 |
-
|
| 311 |
) -> Response:
|
| 312 |
"""
|
| 313 |
Perform an HTTP request using the configured session.
|
|
@@ -317,7 +315,7 @@ class FetcherSession:
|
|
| 317 |
:param request_args: Arguments to be passed to the session's `request()` method.
|
| 318 |
:param max_retries: Maximum number of retries for the request.
|
| 319 |
:param retry_delay: Number of seconds to wait between retries.
|
| 320 |
-
:param
|
| 321 |
:return: A `Response` object for synchronous requests or an awaitable for asynchronous.
|
| 322 |
"""
|
| 323 |
session = self._async_curl_session
|
|
@@ -335,9 +333,7 @@ class FetcherSession:
|
|
| 335 |
try:
|
| 336 |
response = await session.request(method, **request_args)
|
| 337 |
# response.raise_for_status() # Retry responses with a status code between 200-400
|
| 338 |
-
return ResponseFactory.from_http_request(
|
| 339 |
-
response, adaptor_arguments
|
| 340 |
-
)
|
| 341 |
except CurlError as e:
|
| 342 |
if attempt < max_retries - 1:
|
| 343 |
log.error(
|
|
@@ -373,9 +369,7 @@ class FetcherSession:
|
|
| 373 |
"""
|
| 374 |
stealth = self.stealth if stealth is None else stealth
|
| 375 |
|
| 376 |
-
|
| 377 |
-
kwargs.pop("adaptor_arguments", {}) or self.adaptor_arguments
|
| 378 |
-
)
|
| 379 |
max_retries = self.get_with_precedence(kwargs, "retries", self.default_retries)
|
| 380 |
retry_delay = self.get_with_precedence(
|
| 381 |
kwargs, "retry_delay", self.default_retry_delay
|
|
@@ -383,12 +377,12 @@ class FetcherSession:
|
|
| 383 |
request_args = self._merge_request_args(stealth=stealth, **kwargs)
|
| 384 |
if self._curl_session:
|
| 385 |
return self.__make_request(
|
| 386 |
-
method, request_args, max_retries, retry_delay,
|
| 387 |
)
|
| 388 |
elif self._async_curl_session:
|
| 389 |
# The returned value is a Coroutine
|
| 390 |
return self.__make_async_request(
|
| 391 |
-
method, request_args, max_retries, retry_delay,
|
| 392 |
)
|
| 393 |
|
| 394 |
raise RuntimeError("No active session available.")
|
|
|
|
| 63 |
max_redirects: int = 30,
|
| 64 |
verify: bool = True,
|
| 65 |
cert: Optional[Union[str, Tuple[str, str]]] = None,
|
| 66 |
+
selector_config: Optional[Dict] = None,
|
| 67 |
):
|
| 68 |
"""
|
| 69 |
:param impersonate: Browser version to impersonate. Automatically defaults to the latest available Chrome version.
|
|
|
|
| 81 |
:param max_redirects: Maximum number of redirects. Default 30, use -1 for unlimited.
|
| 82 |
:param verify: Whether to verify HTTPS certificates. Defaults to True.
|
| 83 |
:param cert: Tuple of (cert, key) filenames for the client certificate.
|
| 84 |
+
:param selector_config: Arguments passed when creating the final Selector class.
|
| 85 |
"""
|
| 86 |
self.default_impersonate = impersonate
|
| 87 |
self.stealth = stealthy_headers
|
|
|
|
| 97 |
self.default_verify = verify
|
| 98 |
self.default_cert = cert
|
| 99 |
self.default_http3 = http3
|
| 100 |
+
self.selector_config = selector_config or {}
|
| 101 |
|
| 102 |
self._curl_session: Optional[CurlSession] = None
|
| 103 |
self._async_curl_session: Optional[AsyncCurlSession] = None
|
|
|
|
| 260 |
request_args: Dict[str, Any],
|
| 261 |
max_retries: int,
|
| 262 |
retry_delay: int,
|
| 263 |
+
selector_config: Optional[Dict] = None,
|
| 264 |
) -> Response:
|
| 265 |
"""
|
| 266 |
Perform an HTTP request using the configured session.
|
|
|
|
| 270 |
:param request_args: Arguments to be passed to the session's `request()` method.
|
| 271 |
:param max_retries: Maximum number of retries for the request.
|
| 272 |
:param retry_delay: Number of seconds to wait between retries.
|
| 273 |
+
:param selector_config: Arguments passed when creating the final Selector class.
|
| 274 |
:return: A `Response` object for synchronous requests or an awaitable for asynchronous.
|
| 275 |
"""
|
| 276 |
session = self._curl_session
|
|
|
|
| 286 |
try:
|
| 287 |
response = session.request(method, **request_args)
|
| 288 |
# response.raise_for_status() # Retry responses with a status code between 200-400
|
| 289 |
+
return ResponseFactory.from_http_request(response, selector_config)
|
|
|
|
|
|
|
| 290 |
except CurlError as e:
|
| 291 |
if attempt < max_retries - 1:
|
| 292 |
log.error(
|
|
|
|
| 305 |
request_args: Dict[str, Any],
|
| 306 |
max_retries: int,
|
| 307 |
retry_delay: int,
|
| 308 |
+
selector_config: Optional[Dict] = None,
|
| 309 |
) -> Response:
|
| 310 |
"""
|
| 311 |
Perform an HTTP request using the configured session.
|
|
|
|
| 315 |
:param request_args: Arguments to be passed to the session's `request()` method.
|
| 316 |
:param max_retries: Maximum number of retries for the request.
|
| 317 |
:param retry_delay: Number of seconds to wait between retries.
|
| 318 |
+
:param selector_config: Arguments passed when creating the final Selector class.
|
| 319 |
:return: A `Response` object for synchronous requests or an awaitable for asynchronous.
|
| 320 |
"""
|
| 321 |
session = self._async_curl_session
|
|
|
|
| 333 |
try:
|
| 334 |
response = await session.request(method, **request_args)
|
| 335 |
# response.raise_for_status() # Retry responses with a status code between 200-400
|
| 336 |
+
return ResponseFactory.from_http_request(response, selector_config)
|
|
|
|
|
|
|
| 337 |
except CurlError as e:
|
| 338 |
if attempt < max_retries - 1:
|
| 339 |
log.error(
|
|
|
|
| 369 |
"""
|
| 370 |
stealth = self.stealth if stealth is None else stealth
|
| 371 |
|
| 372 |
+
selector_config = kwargs.pop("selector_config", {}) or self.selector_config
|
|
|
|
|
|
|
| 373 |
max_retries = self.get_with_precedence(kwargs, "retries", self.default_retries)
|
| 374 |
retry_delay = self.get_with_precedence(
|
| 375 |
kwargs, "retry_delay", self.default_retry_delay
|
|
|
|
| 377 |
request_args = self._merge_request_args(stealth=stealth, **kwargs)
|
| 378 |
if self._curl_session:
|
| 379 |
return self.__make_request(
|
| 380 |
+
method, request_args, max_retries, retry_delay, selector_config
|
| 381 |
)
|
| 382 |
elif self._async_curl_session:
|
| 383 |
# The returned value is a Coroutine
|
| 384 |
return self.__make_async_request(
|
| 385 |
+
method, request_args, max_retries, retry_delay, selector_config
|
| 386 |
)
|
| 387 |
|
| 388 |
raise RuntimeError("No active session available.")
|
scrapling/engines/toolbelt/convertor.py
CHANGED
|
@@ -239,7 +239,7 @@ class ResponseFactory:
|
|
| 239 |
|
| 240 |
:param response: `curl_cffi` response object
|
| 241 |
:param parser_arguments: Additional arguments to be passed to the `Response` object constructor.
|
| 242 |
-
:return: A `Response` object that is the same as `
|
| 243 |
"""
|
| 244 |
return Response(
|
| 245 |
url=response.url,
|
|
|
|
| 239 |
|
| 240 |
:param response: `curl_cffi` response object
|
| 241 |
:param parser_arguments: Additional arguments to be passed to the `Response` object constructor.
|
| 242 |
+
:return: A `Response` object that is the same as `Selector` object except it has these added attributes: `status`, `reason`, `cookies`, `headers`, and `request_headers`
|
| 243 |
"""
|
| 244 |
return Response(
|
| 245 |
url=response.url,
|
scrapling/engines/toolbelt/custom.py
CHANGED
|
@@ -15,7 +15,7 @@ from scrapling.core._types import (
|
|
| 15 |
)
|
| 16 |
from scrapling.core.custom_types import MappingProxyType
|
| 17 |
from scrapling.core.utils import log, lru_cache
|
| 18 |
-
from scrapling.parser import
|
| 19 |
|
| 20 |
|
| 21 |
class ResponseEncoding:
|
|
@@ -97,7 +97,7 @@ class ResponseEncoding:
|
|
| 97 |
return cls.__DEFAULT_ENCODING
|
| 98 |
|
| 99 |
|
| 100 |
-
class Response(
|
| 101 |
"""This class is returned by all engines as a way to unify response type between different libraries."""
|
| 102 |
|
| 103 |
def __init__(
|
|
@@ -113,9 +113,9 @@ class Response(Adaptor):
|
|
| 113 |
encoding: str = "utf-8",
|
| 114 |
method: str = "GET",
|
| 115 |
history: List = None,
|
| 116 |
-
**
|
| 117 |
):
|
| 118 |
-
|
| 119 |
self.status = status
|
| 120 |
self.reason = reason
|
| 121 |
self.cookies = cookies
|
|
@@ -126,12 +126,10 @@ class Response(Adaptor):
|
|
| 126 |
super().__init__(
|
| 127 |
text=text,
|
| 128 |
body=body,
|
| 129 |
-
url=
|
| 130 |
encoding=encoding,
|
| 131 |
-
**
|
| 132 |
)
|
| 133 |
-
# For backward compatibility
|
| 134 |
-
self.adaptor = self
|
| 135 |
# For easier debugging while working from a Python shell
|
| 136 |
log.info(
|
| 137 |
f"Fetched ({status}) <{method} {url}> (referer: {request_headers.get('referer')})"
|
|
@@ -144,20 +142,20 @@ class Response(Adaptor):
|
|
| 144 |
class BaseFetcher:
|
| 145 |
__slots__ = ()
|
| 146 |
huge_tree: bool = True
|
| 147 |
-
|
| 148 |
storage: Any = SQLiteStorageSystem
|
| 149 |
keep_cdata: Optional[bool] = False
|
| 150 |
storage_args: Optional[Dict] = None
|
| 151 |
keep_comments: Optional[bool] = False
|
| 152 |
-
|
| 153 |
parser_keywords: Tuple = (
|
| 154 |
"huge_tree",
|
| 155 |
-
"
|
| 156 |
"storage",
|
| 157 |
"keep_cdata",
|
| 158 |
"storage_args",
|
| 159 |
"keep_comments",
|
| 160 |
-
"
|
| 161 |
) # Left open for the user
|
| 162 |
|
| 163 |
def __init__(self, *args, **kwargs):
|
|
@@ -178,17 +176,17 @@ class BaseFetcher:
|
|
| 178 |
huge_tree=cls.huge_tree,
|
| 179 |
keep_comments=cls.keep_comments,
|
| 180 |
keep_cdata=cls.keep_cdata,
|
| 181 |
-
|
| 182 |
storage=cls.storage,
|
| 183 |
storage_args=cls.storage_args,
|
| 184 |
-
|
| 185 |
)
|
| 186 |
|
| 187 |
@classmethod
|
| 188 |
def configure(cls, **kwargs):
|
| 189 |
"""Set multiple arguments for the parser at once globally
|
| 190 |
|
| 191 |
-
:param kwargs: The keywords can be any arguments of the following: huge_tree, keep_comments, keep_cdata,
|
| 192 |
"""
|
| 193 |
for key, value in kwargs.items():
|
| 194 |
key = key.strip().lower()
|
|
@@ -212,23 +210,23 @@ class BaseFetcher:
|
|
| 212 |
|
| 213 |
@classmethod
|
| 214 |
def _generate_parser_arguments(cls) -> Dict:
|
| 215 |
-
#
|
| 216 |
-
# I won't validate
|
| 217 |
parser_arguments = dict(
|
| 218 |
huge_tree=cls.huge_tree,
|
| 219 |
keep_comments=cls.keep_comments,
|
| 220 |
keep_cdata=cls.keep_cdata,
|
| 221 |
-
|
| 222 |
storage=cls.storage,
|
| 223 |
storage_args=cls.storage_args,
|
| 224 |
)
|
| 225 |
-
if cls.
|
| 226 |
-
if type(cls.
|
| 227 |
log.warning(
|
| 228 |
-
'[Ignored] The argument "
|
| 229 |
)
|
| 230 |
else:
|
| 231 |
-
parser_arguments.update({"
|
| 232 |
|
| 233 |
return parser_arguments
|
| 234 |
|
|
|
|
| 15 |
)
|
| 16 |
from scrapling.core.custom_types import MappingProxyType
|
| 17 |
from scrapling.core.utils import log, lru_cache
|
| 18 |
+
from scrapling.parser import Selector, SQLiteStorageSystem
|
| 19 |
|
| 20 |
|
| 21 |
class ResponseEncoding:
|
|
|
|
| 97 |
return cls.__DEFAULT_ENCODING
|
| 98 |
|
| 99 |
|
| 100 |
+
class Response(Selector):
|
| 101 |
"""This class is returned by all engines as a way to unify response type between different libraries."""
|
| 102 |
|
| 103 |
def __init__(
|
|
|
|
| 113 |
encoding: str = "utf-8",
|
| 114 |
method: str = "GET",
|
| 115 |
history: List = None,
|
| 116 |
+
**selector_config: Dict,
|
| 117 |
):
|
| 118 |
+
adaptive_domain = selector_config.pop("adaptive_domain", None)
|
| 119 |
self.status = status
|
| 120 |
self.reason = reason
|
| 121 |
self.cookies = cookies
|
|
|
|
| 126 |
super().__init__(
|
| 127 |
text=text,
|
| 128 |
body=body,
|
| 129 |
+
url=adaptive_domain or url,
|
| 130 |
encoding=encoding,
|
| 131 |
+
**selector_config,
|
| 132 |
)
|
|
|
|
|
|
|
| 133 |
# For easier debugging while working from a Python shell
|
| 134 |
log.info(
|
| 135 |
f"Fetched ({status}) <{method} {url}> (referer: {request_headers.get('referer')})"
|
|
|
|
| 142 |
class BaseFetcher:
|
| 143 |
__slots__ = ()
|
| 144 |
huge_tree: bool = True
|
| 145 |
+
adaptive: Optional[bool] = False
|
| 146 |
storage: Any = SQLiteStorageSystem
|
| 147 |
keep_cdata: Optional[bool] = False
|
| 148 |
storage_args: Optional[Dict] = None
|
| 149 |
keep_comments: Optional[bool] = False
|
| 150 |
+
adaptive_domain: Optional[str] = None
|
| 151 |
parser_keywords: Tuple = (
|
| 152 |
"huge_tree",
|
| 153 |
+
"adaptive",
|
| 154 |
"storage",
|
| 155 |
"keep_cdata",
|
| 156 |
"storage_args",
|
| 157 |
"keep_comments",
|
| 158 |
+
"adaptive_domain",
|
| 159 |
) # Left open for the user
|
| 160 |
|
| 161 |
def __init__(self, *args, **kwargs):
|
|
|
|
| 176 |
huge_tree=cls.huge_tree,
|
| 177 |
keep_comments=cls.keep_comments,
|
| 178 |
keep_cdata=cls.keep_cdata,
|
| 179 |
+
adaptive=cls.adaptive,
|
| 180 |
storage=cls.storage,
|
| 181 |
storage_args=cls.storage_args,
|
| 182 |
+
adaptive_domain=cls.adaptive_domain,
|
| 183 |
)
|
| 184 |
|
| 185 |
@classmethod
|
| 186 |
def configure(cls, **kwargs):
|
| 187 |
"""Set multiple arguments for the parser at once globally
|
| 188 |
|
| 189 |
+
:param kwargs: The keywords can be any arguments of the following: huge_tree, keep_comments, keep_cdata, adaptive, storage, storage_args, adaptive_domain
|
| 190 |
"""
|
| 191 |
for key, value in kwargs.items():
|
| 192 |
key = key.strip().lower()
|
|
|
|
| 210 |
|
| 211 |
@classmethod
|
| 212 |
def _generate_parser_arguments(cls) -> Dict:
|
| 213 |
+
# Selector class parameters
|
| 214 |
+
# I won't validate Selector's class parameters here again, I will leave it to be validated later
|
| 215 |
parser_arguments = dict(
|
| 216 |
huge_tree=cls.huge_tree,
|
| 217 |
keep_comments=cls.keep_comments,
|
| 218 |
keep_cdata=cls.keep_cdata,
|
| 219 |
+
adaptive=cls.adaptive,
|
| 220 |
storage=cls.storage,
|
| 221 |
storage_args=cls.storage_args,
|
| 222 |
)
|
| 223 |
+
if cls.adaptive_domain:
|
| 224 |
+
if type(cls.adaptive_domain) is not str:
|
| 225 |
log.warning(
|
| 226 |
+
'[Ignored] The argument "adaptive_domain" must be of string type'
|
| 227 |
)
|
| 228 |
else:
|
| 229 |
+
parser_arguments.update({"adaptive_domain": cls.adaptive_domain})
|
| 230 |
|
| 231 |
return parser_arguments
|
| 232 |
|
scrapling/fetchers.py
CHANGED
|
@@ -74,7 +74,7 @@ class StealthyFetcher(BaseFetcher):
|
|
| 74 |
disable_ads: bool = False,
|
| 75 |
geoip: bool = False,
|
| 76 |
custom_config: Optional[Dict] = None,
|
| 77 |
-
|
| 78 |
) -> Response:
|
| 79 |
"""
|
| 80 |
Opens up a browser and do your request based on your chosen options below.
|
|
@@ -106,7 +106,7 @@ class StealthyFetcher(BaseFetcher):
|
|
| 106 |
:param extra_headers: A dictionary of extra headers to add to the request. _The referer set by the `google_search` argument takes priority over the referer set here if used together._
|
| 107 |
:param proxy: The proxy to be used with requests, it can be a string or a dictionary with the keys 'server', 'username', and 'password' only.
|
| 108 |
:param custom_config: A dictionary of custom parser arguments to use with this request. Any argument passed will override any class parameters values.
|
| 109 |
-
:param
|
| 110 |
:return: A `Response` object.
|
| 111 |
"""
|
| 112 |
if not custom_config:
|
|
@@ -139,8 +139,8 @@ class StealthyFetcher(BaseFetcher):
|
|
| 139 |
solve_cloudflare=solve_cloudflare,
|
| 140 |
disable_resources=disable_resources,
|
| 141 |
wait_selector_state=wait_selector_state,
|
| 142 |
-
|
| 143 |
-
|
| 144 |
) as engine:
|
| 145 |
return engine.fetch(url)
|
| 146 |
|
|
@@ -170,7 +170,7 @@ class StealthyFetcher(BaseFetcher):
|
|
| 170 |
disable_ads: bool = False,
|
| 171 |
geoip: bool = False,
|
| 172 |
custom_config: Optional[Dict] = None,
|
| 173 |
-
|
| 174 |
) -> Response:
|
| 175 |
"""
|
| 176 |
Opens up a browser and do your request based on your chosen options below.
|
|
@@ -202,7 +202,7 @@ class StealthyFetcher(BaseFetcher):
|
|
| 202 |
:param extra_headers: A dictionary of extra headers to add to the request. _The referer set by the `google_search` argument takes priority over the referer set here if used together._
|
| 203 |
:param proxy: The proxy to be used with requests, it can be a string or a dictionary with the keys 'server', 'username', and 'password' only.
|
| 204 |
:param custom_config: A dictionary of custom parser arguments to use with this request. Any argument passed will override any class parameters values.
|
| 205 |
-
:param
|
| 206 |
:return: A `Response` object.
|
| 207 |
"""
|
| 208 |
if not custom_config:
|
|
@@ -235,8 +235,8 @@ class StealthyFetcher(BaseFetcher):
|
|
| 235 |
solve_cloudflare=solve_cloudflare,
|
| 236 |
disable_resources=disable_resources,
|
| 237 |
wait_selector_state=wait_selector_state,
|
| 238 |
-
|
| 239 |
-
|
| 240 |
) as engine:
|
| 241 |
return await engine.fetch(url)
|
| 242 |
|
|
@@ -337,7 +337,7 @@ class DynamicFetcher(BaseFetcher):
|
|
| 337 |
disable_webgl=disable_webgl,
|
| 338 |
disable_resources=disable_resources,
|
| 339 |
wait_selector_state=wait_selector_state,
|
| 340 |
-
|
| 341 |
) as session:
|
| 342 |
return session.fetch(url)
|
| 343 |
|
|
@@ -421,7 +421,7 @@ class DynamicFetcher(BaseFetcher):
|
|
| 421 |
disable_webgl=disable_webgl,
|
| 422 |
disable_resources=disable_resources,
|
| 423 |
wait_selector_state=wait_selector_state,
|
| 424 |
-
|
| 425 |
) as session:
|
| 426 |
return await session.fetch(url)
|
| 427 |
|
|
|
|
| 74 |
disable_ads: bool = False,
|
| 75 |
geoip: bool = False,
|
| 76 |
custom_config: Optional[Dict] = None,
|
| 77 |
+
additional_args: Optional[Dict] = None,
|
| 78 |
) -> Response:
|
| 79 |
"""
|
| 80 |
Opens up a browser and do your request based on your chosen options below.
|
|
|
|
| 106 |
:param extra_headers: A dictionary of extra headers to add to the request. _The referer set by the `google_search` argument takes priority over the referer set here if used together._
|
| 107 |
:param proxy: The proxy to be used with requests, it can be a string or a dictionary with the keys 'server', 'username', and 'password' only.
|
| 108 |
:param custom_config: A dictionary of custom parser arguments to use with this request. Any argument passed will override any class parameters values.
|
| 109 |
+
:param additional_args: Additional arguments to be passed to Camoufox as additional settings, and it takes higher priority than Scrapling's settings.
|
| 110 |
:return: A `Response` object.
|
| 111 |
"""
|
| 112 |
if not custom_config:
|
|
|
|
| 139 |
solve_cloudflare=solve_cloudflare,
|
| 140 |
disable_resources=disable_resources,
|
| 141 |
wait_selector_state=wait_selector_state,
|
| 142 |
+
selector_config={**cls._generate_parser_arguments(), **custom_config},
|
| 143 |
+
additional_args=additional_args or {},
|
| 144 |
) as engine:
|
| 145 |
return engine.fetch(url)
|
| 146 |
|
|
|
|
| 170 |
disable_ads: bool = False,
|
| 171 |
geoip: bool = False,
|
| 172 |
custom_config: Optional[Dict] = None,
|
| 173 |
+
additional_args: Optional[Dict] = None,
|
| 174 |
) -> Response:
|
| 175 |
"""
|
| 176 |
Opens up a browser and do your request based on your chosen options below.
|
|
|
|
| 202 |
:param extra_headers: A dictionary of extra headers to add to the request. _The referer set by the `google_search` argument takes priority over the referer set here if used together._
|
| 203 |
:param proxy: The proxy to be used with requests, it can be a string or a dictionary with the keys 'server', 'username', and 'password' only.
|
| 204 |
:param custom_config: A dictionary of custom parser arguments to use with this request. Any argument passed will override any class parameters values.
|
| 205 |
+
:param additional_args: Additional arguments to be passed to Camoufox as additional settings, and it takes higher priority than Scrapling's settings.
|
| 206 |
:return: A `Response` object.
|
| 207 |
"""
|
| 208 |
if not custom_config:
|
|
|
|
| 235 |
solve_cloudflare=solve_cloudflare,
|
| 236 |
disable_resources=disable_resources,
|
| 237 |
wait_selector_state=wait_selector_state,
|
| 238 |
+
selector_config={**cls._generate_parser_arguments(), **custom_config},
|
| 239 |
+
additional_args=additional_args or {},
|
| 240 |
) as engine:
|
| 241 |
return await engine.fetch(url)
|
| 242 |
|
|
|
|
| 337 |
disable_webgl=disable_webgl,
|
| 338 |
disable_resources=disable_resources,
|
| 339 |
wait_selector_state=wait_selector_state,
|
| 340 |
+
selector_config={**cls._generate_parser_arguments(), **custom_config},
|
| 341 |
) as session:
|
| 342 |
return session.fetch(url)
|
| 343 |
|
|
|
|
| 421 |
disable_webgl=disable_webgl,
|
| 422 |
disable_resources=disable_resources,
|
| 423 |
wait_selector_state=wait_selector_state,
|
| 424 |
+
selector_config={**cls._generate_parser_arguments(), **custom_config},
|
| 425 |
) as session:
|
| 426 |
return await session.fetch(url)
|
| 427 |
|
scrapling/parser.py
CHANGED
|
@@ -24,7 +24,7 @@ from scrapling.core._types import (
|
|
| 24 |
)
|
| 25 |
from scrapling.core.custom_types import AttributesHandler, TextHandler, TextHandlers
|
| 26 |
from scrapling.core.mixins import SelectorsGeneration
|
| 27 |
-
from scrapling.core.
|
| 28 |
SQLiteStorageSystem,
|
| 29 |
StorageSystemMixin,
|
| 30 |
_StorageTools,
|
|
@@ -33,11 +33,11 @@ from scrapling.core.translator import translator_instance
|
|
| 33 |
from scrapling.core.utils import clean_spaces, flatten, html_forbidden, is_jsonable, log
|
| 34 |
|
| 35 |
|
| 36 |
-
class
|
| 37 |
__slots__ = (
|
| 38 |
"url",
|
| 39 |
"encoding",
|
| 40 |
-
"
|
| 41 |
"_root",
|
| 42 |
"_storage",
|
| 43 |
"__keep_comments",
|
|
@@ -58,7 +58,7 @@ class Adaptor(SelectorsGeneration):
|
|
| 58 |
root: Optional[html.HtmlElement] = None,
|
| 59 |
keep_comments: Optional[bool] = False,
|
| 60 |
keep_cdata: Optional[bool] = False,
|
| 61 |
-
|
| 62 |
_storage: object = None,
|
| 63 |
storage: Any = SQLiteStorageSystem,
|
| 64 |
storage_args: Optional[Dict] = None,
|
|
@@ -82,7 +82,7 @@ class Adaptor(SelectorsGeneration):
|
|
| 82 |
Don't use it unless you know what you are doing!
|
| 83 |
:param keep_comments: While parsing the HTML body, drop comments or not. Disabled by default for obvious reasons
|
| 84 |
:param keep_cdata: While parsing the HTML body, drop cdata or not. Disabled by default for cleaner HTML.
|
| 85 |
-
:param
|
| 86 |
priority over all auto-match related arguments/functions in the class.
|
| 87 |
:param storage: The storage class to be passed for auto-matching functionalities, see ``Docs`` for more info.
|
| 88 |
:param storage_args: A dictionary of ``argument->value`` pairs to be passed for the storage class.
|
|
@@ -90,7 +90,7 @@ class Adaptor(SelectorsGeneration):
|
|
| 90 |
"""
|
| 91 |
if root is None and not body and text is None:
|
| 92 |
raise ValueError(
|
| 93 |
-
"
|
| 94 |
)
|
| 95 |
|
| 96 |
self.__text = ""
|
|
@@ -134,9 +134,9 @@ class Adaptor(SelectorsGeneration):
|
|
| 134 |
|
| 135 |
self._root = root
|
| 136 |
|
| 137 |
-
self.
|
| 138 |
|
| 139 |
-
if self.
|
| 140 |
if _storage is not None:
|
| 141 |
self._storage = _storage
|
| 142 |
else:
|
|
@@ -214,17 +214,17 @@ class Adaptor(SelectorsGeneration):
|
|
| 214 |
"""
|
| 215 |
return TextHandler(str(element))
|
| 216 |
|
| 217 |
-
def __element_convertor(self, element: html.HtmlElement) -> "
|
| 218 |
-
"""Used internally to convert a single HtmlElement to
|
| 219 |
db_instance = (
|
| 220 |
self._storage if (hasattr(self, "_storage") and self._storage) else None
|
| 221 |
)
|
| 222 |
-
return
|
| 223 |
root=element,
|
| 224 |
url=self.url,
|
| 225 |
encoding=self.encoding,
|
| 226 |
-
|
| 227 |
-
_storage=db_instance, # Reuse existing storage if it exists otherwise it won't be checked if `
|
| 228 |
keep_comments=self.__keep_comments,
|
| 229 |
keep_cdata=self.__keep_cdata,
|
| 230 |
huge_tree=self.__huge_tree_enabled,
|
|
@@ -233,8 +233,8 @@ class Adaptor(SelectorsGeneration):
|
|
| 233 |
|
| 234 |
def __handle_element(
|
| 235 |
self, element: Union[html.HtmlElement, etree._ElementUnicodeResult]
|
| 236 |
-
) -> Union[TextHandler, "
|
| 237 |
-
"""Used internally in all functions to convert a single element to type (
|
| 238 |
if element is None:
|
| 239 |
return None
|
| 240 |
elif self._is_text_node(element):
|
|
@@ -245,23 +245,23 @@ class Adaptor(SelectorsGeneration):
|
|
| 245 |
|
| 246 |
def __handle_elements(
|
| 247 |
self, result: List[Union[html.HtmlElement, etree._ElementUnicodeResult]]
|
| 248 |
-
) -> Union["
|
| 249 |
-
"""Used internally in all functions to convert results to type (
|
| 250 |
if not len(
|
| 251 |
result
|
| 252 |
): # Lxml will give a warning if I used something like `not result`
|
| 253 |
-
return
|
| 254 |
|
| 255 |
# From within the code, this method will always get a list of the same type,
|
| 256 |
# so we will continue without checks for a slight performance boost
|
| 257 |
if self._is_text_node(result[0]):
|
| 258 |
return TextHandlers(list(map(self.__content_convertor, result)))
|
| 259 |
|
| 260 |
-
return
|
| 261 |
|
| 262 |
def __getstate__(self) -> Any:
|
| 263 |
# lxml don't like it :)
|
| 264 |
-
raise TypeError("Can't pickle
|
| 265 |
|
| 266 |
# The following four properties I made them into functions instead of variables directly
|
| 267 |
# So they don't slow down the process of initializing many instances of the class and gets executed only
|
|
@@ -322,7 +322,7 @@ class Adaptor(SelectorsGeneration):
|
|
| 322 |
return TextHandler(separator).join(_all_strings)
|
| 323 |
|
| 324 |
def urljoin(self, relative_url: str) -> str:
|
| 325 |
-
"""Join this
|
| 326 |
return urljoin(self.url, relative_url)
|
| 327 |
|
| 328 |
@property
|
|
@@ -363,20 +363,20 @@ class Adaptor(SelectorsGeneration):
|
|
| 363 |
return class_name in self._root.classes
|
| 364 |
|
| 365 |
@property
|
| 366 |
-
def parent(self) -> Union["
|
| 367 |
"""Return the direct parent of the element or ``None`` otherwise"""
|
| 368 |
return self.__handle_element(self._root.getparent())
|
| 369 |
|
| 370 |
@property
|
| 371 |
-
def below_elements(self) -> "
|
| 372 |
"""Return all elements under the current element in the DOM tree"""
|
| 373 |
below = self._root.xpath(".//*")
|
| 374 |
return self.__handle_elements(below)
|
| 375 |
|
| 376 |
@property
|
| 377 |
-
def children(self) -> "
|
| 378 |
"""Return the children elements of the current element or empty list otherwise"""
|
| 379 |
-
return
|
| 380 |
[
|
| 381 |
self.__element_convertor(child)
|
| 382 |
for child in self._root.iterchildren()
|
|
@@ -385,22 +385,22 @@ class Adaptor(SelectorsGeneration):
|
|
| 385 |
)
|
| 386 |
|
| 387 |
@property
|
| 388 |
-
def siblings(self) -> "
|
| 389 |
"""Return other children of the current element's parent or empty list otherwise"""
|
| 390 |
if self.parent:
|
| 391 |
-
return
|
| 392 |
[child for child in self.parent.children if child._root != self._root]
|
| 393 |
)
|
| 394 |
-
return
|
| 395 |
|
| 396 |
-
def iterancestors(self) -> Generator["
|
| 397 |
"""Return a generator that loops over all ancestors of the element, starting with the element's parent."""
|
| 398 |
for ancestor in self._root.iterancestors():
|
| 399 |
yield self.__element_convertor(ancestor)
|
| 400 |
|
| 401 |
def find_ancestor(
|
| 402 |
-
self, func: Callable[["
|
| 403 |
-
) -> Union["
|
| 404 |
"""Loop over all ancestors of the element till one match the passed function
|
| 405 |
:param func: A function that takes each ancestor as an argument and returns True/False
|
| 406 |
:return: The first ancestor that match the function or ``None`` otherwise.
|
|
@@ -411,13 +411,13 @@ class Adaptor(SelectorsGeneration):
|
|
| 411 |
return None
|
| 412 |
|
| 413 |
@property
|
| 414 |
-
def path(self) -> "
|
| 415 |
-
"""Returns a list of type `
|
| 416 |
lst = list(self.iterancestors())
|
| 417 |
-
return
|
| 418 |
|
| 419 |
@property
|
| 420 |
-
def next(self) -> Union["
|
| 421 |
"""Returns the next element of the current element in the children of the parent or ``None`` otherwise."""
|
| 422 |
next_element = self._root.getnext()
|
| 423 |
if next_element is not None:
|
|
@@ -428,7 +428,7 @@ class Adaptor(SelectorsGeneration):
|
|
| 428 |
return self.__handle_element(next_element)
|
| 429 |
|
| 430 |
@property
|
| 431 |
-
def previous(self) -> Union["
|
| 432 |
"""Returns the previous element of the current element in the children of the parent or ``None`` otherwise."""
|
| 433 |
prev_element = self._root.getprevious()
|
| 434 |
if prev_element is not None:
|
|
@@ -471,18 +471,18 @@ class Adaptor(SelectorsGeneration):
|
|
| 471 |
# From here we start with the selecting functions
|
| 472 |
def relocate(
|
| 473 |
self,
|
| 474 |
-
element: Union[Dict, html.HtmlElement, "
|
| 475 |
percentage: int = 0,
|
| 476 |
-
|
| 477 |
-
) -> Union[List[Union[html.HtmlElement, None]], "
|
| 478 |
"""This function will search again for the element in the page tree, used automatically on page structure change
|
| 479 |
|
| 480 |
:param element: The element we want to relocate in the tree
|
| 481 |
:param percentage: The minimum percentage to accept and not going lower than that. Be aware that the percentage
|
| 482 |
calculation depends solely on the page structure, so don't play with this number unless you must know
|
| 483 |
what you are doing!
|
| 484 |
-
:param
|
| 485 |
-
:return: List of pure HTML elements that got the highest matching score or '
|
| 486 |
"""
|
| 487 |
score_table = {}
|
| 488 |
# Note: `element` will most likely always be a dictionary at this point.
|
|
@@ -511,7 +511,7 @@ class Adaptor(SelectorsGeneration):
|
|
| 511 |
f"{percent} -> {self.__handle_elements(score_table[percent])}"
|
| 512 |
)
|
| 513 |
|
| 514 |
-
if not
|
| 515 |
return score_table[highest_probability]
|
| 516 |
return self.__handle_elements(score_table[highest_probability])
|
| 517 |
return []
|
|
@@ -520,10 +520,10 @@ class Adaptor(SelectorsGeneration):
|
|
| 520 |
self,
|
| 521 |
selector: str,
|
| 522 |
identifier: str = "",
|
| 523 |
-
|
| 524 |
auto_save: bool = False,
|
| 525 |
percentage: int = 0,
|
| 526 |
-
) -> Union["
|
| 527 |
"""Search the current tree with CSS3 selectors and return the first result if possible, otherwise return `None`
|
| 528 |
|
| 529 |
**Important:
|
|
@@ -531,17 +531,15 @@ class Adaptor(SelectorsGeneration):
|
|
| 531 |
and want to relocate the same element(s)**
|
| 532 |
|
| 533 |
:param selector: The CSS3 selector to be used.
|
| 534 |
-
:param
|
| 535 |
:param identifier: A string that will be used to save/retrieve element's data in auto-matching,
|
| 536 |
otherwise the selector will be used.
|
| 537 |
-
:param auto_save: Automatically save new elements for `
|
| 538 |
:param percentage: The minimum percentage to accept while auto-matching and not going lower than that.
|
| 539 |
Be aware that the percentage calculation depends solely on the page structure, so don't play with this
|
| 540 |
number unless you must know what you are doing!
|
| 541 |
"""
|
| 542 |
-
for element in self.css(
|
| 543 |
-
selector, identifier, auto_match, auto_save, percentage
|
| 544 |
-
):
|
| 545 |
return element
|
| 546 |
return None
|
| 547 |
|
|
@@ -549,11 +547,11 @@ class Adaptor(SelectorsGeneration):
|
|
| 549 |
self,
|
| 550 |
selector: str,
|
| 551 |
identifier: str = "",
|
| 552 |
-
|
| 553 |
auto_save: bool = False,
|
| 554 |
percentage: int = 0,
|
| 555 |
**kwargs: Any,
|
| 556 |
-
) -> Union["
|
| 557 |
"""Search the current tree with XPath selectors and return the first result if possible, otherwise return `None`
|
| 558 |
|
| 559 |
**Important:
|
|
@@ -563,16 +561,16 @@ class Adaptor(SelectorsGeneration):
|
|
| 563 |
Note: **Additional keyword arguments will be passed as XPath variables in the XPath expression!**
|
| 564 |
|
| 565 |
:param selector: The XPath selector to be used.
|
| 566 |
-
:param
|
| 567 |
:param identifier: A string that will be used to save/retrieve element's data in auto-matching,
|
| 568 |
otherwise the selector will be used.
|
| 569 |
-
:param auto_save: Automatically save new elements for `
|
| 570 |
:param percentage: The minimum percentage to accept while auto-matching and not going lower than that.
|
| 571 |
Be aware that the percentage calculation depends solely on the page structure, so don't play with this
|
| 572 |
number unless you must know what you are doing!
|
| 573 |
"""
|
| 574 |
for element in self.xpath(
|
| 575 |
-
selector, identifier,
|
| 576 |
):
|
| 577 |
return element
|
| 578 |
return None
|
|
@@ -581,10 +579,10 @@ class Adaptor(SelectorsGeneration):
|
|
| 581 |
self,
|
| 582 |
selector: str,
|
| 583 |
identifier: str = "",
|
| 584 |
-
|
| 585 |
auto_save: bool = False,
|
| 586 |
percentage: int = 0,
|
| 587 |
-
) -> Union["
|
| 588 |
"""Search the current tree with CSS3 selectors
|
| 589 |
|
| 590 |
**Important:
|
|
@@ -592,24 +590,24 @@ class Adaptor(SelectorsGeneration):
|
|
| 592 |
and want to relocate the same element(s)**
|
| 593 |
|
| 594 |
:param selector: The CSS3 selector to be used.
|
| 595 |
-
:param
|
| 596 |
:param identifier: A string that will be used to save/retrieve element's data in auto-matching,
|
| 597 |
otherwise the selector will be used.
|
| 598 |
-
:param auto_save: Automatically save new elements for `
|
| 599 |
:param percentage: The minimum percentage to accept while auto-matching and not going lower than that.
|
| 600 |
Be aware that the percentage calculation depends solely on the page structure, so don't play with this
|
| 601 |
number unless you must know what you are doing!
|
| 602 |
|
| 603 |
-
:return: `
|
| 604 |
"""
|
| 605 |
try:
|
| 606 |
-
if not self.
|
| 607 |
# No need to split selectors in this case, let's save some CPU cycles :)
|
| 608 |
xpath_selector = translator_instance.css_to_xpath(selector)
|
| 609 |
return self.xpath(
|
| 610 |
xpath_selector,
|
| 611 |
identifier or selector,
|
| 612 |
-
|
| 613 |
auto_save,
|
| 614 |
percentage,
|
| 615 |
)
|
|
@@ -625,7 +623,7 @@ class Adaptor(SelectorsGeneration):
|
|
| 625 |
results += self.xpath(
|
| 626 |
xpath_selector,
|
| 627 |
identifier or single_selector.canonical(),
|
| 628 |
-
|
| 629 |
auto_save,
|
| 630 |
percentage,
|
| 631 |
)
|
|
@@ -643,11 +641,11 @@ class Adaptor(SelectorsGeneration):
|
|
| 643 |
self,
|
| 644 |
selector: str,
|
| 645 |
identifier: str = "",
|
| 646 |
-
|
| 647 |
auto_save: bool = False,
|
| 648 |
percentage: int = 0,
|
| 649 |
**kwargs: Any,
|
| 650 |
-
) -> Union["
|
| 651 |
"""Search the current tree with XPath selectors
|
| 652 |
|
| 653 |
**Important:
|
|
@@ -657,31 +655,31 @@ class Adaptor(SelectorsGeneration):
|
|
| 657 |
Note: **Additional keyword arguments will be passed as XPath variables in the XPath expression!**
|
| 658 |
|
| 659 |
:param selector: The XPath selector to be used.
|
| 660 |
-
:param
|
| 661 |
:param identifier: A string that will be used to save/retrieve element's data in auto-matching,
|
| 662 |
otherwise the selector will be used.
|
| 663 |
-
:param auto_save: Automatically save new elements for `
|
| 664 |
:param percentage: The minimum percentage to accept while auto-matching and not going lower than that.
|
| 665 |
Be aware that the percentage calculation depends solely on the page structure, so don't play with this
|
| 666 |
number unless you must know what you are doing!
|
| 667 |
|
| 668 |
-
:return: `
|
| 669 |
"""
|
| 670 |
try:
|
| 671 |
elements = self._root.xpath(selector, **kwargs)
|
| 672 |
|
| 673 |
if elements:
|
| 674 |
if auto_save:
|
| 675 |
-
if not self.
|
| 676 |
log.warning(
|
| 677 |
-
"Argument `auto_save` will be ignored because `
|
| 678 |
)
|
| 679 |
else:
|
| 680 |
self.save(elements[0], identifier or selector)
|
| 681 |
|
| 682 |
return self.__handle_elements(elements)
|
| 683 |
-
elif self.
|
| 684 |
-
if
|
| 685 |
element_data = self.retrieve(identifier or selector)
|
| 686 |
if element_data:
|
| 687 |
elements = self.relocate(element_data, percentage)
|
|
@@ -690,13 +688,13 @@ class Adaptor(SelectorsGeneration):
|
|
| 690 |
|
| 691 |
return self.__handle_elements(elements)
|
| 692 |
else:
|
| 693 |
-
if
|
| 694 |
log.warning(
|
| 695 |
-
"Argument `
|
| 696 |
)
|
| 697 |
elif auto_save:
|
| 698 |
log.warning(
|
| 699 |
-
"Argument `auto_save` will be ignored because `
|
| 700 |
)
|
| 701 |
|
| 702 |
return self.__handle_elements(elements)
|
|
@@ -713,12 +711,12 @@ class Adaptor(SelectorsGeneration):
|
|
| 713 |
self,
|
| 714 |
*args: Union[str, Iterable[str], Pattern, Callable, Dict[str, str]],
|
| 715 |
**kwargs: str,
|
| 716 |
-
) -> "
|
| 717 |
"""Find elements by filters of your creations for ease.
|
| 718 |
|
| 719 |
:param args: Tag name(s), iterable of tag names, regex patterns, function, or a dictionary of elements' attributes. Leave empty for selecting all.
|
| 720 |
:param kwargs: The attributes you want to filter elements based on it.
|
| 721 |
-
:return: The `
|
| 722 |
"""
|
| 723 |
# Attributes that are Python reserved words and can't be used directly
|
| 724 |
# Ex: find_all('a', class="blah") -> find_all('a', class_="blah")
|
|
@@ -735,7 +733,7 @@ class Adaptor(SelectorsGeneration):
|
|
| 735 |
|
| 736 |
attributes = dict()
|
| 737 |
tags, patterns = set(), set()
|
| 738 |
-
results, functions, selectors =
|
| 739 |
|
| 740 |
# Brace yourself for a wonderful journey!
|
| 741 |
for arg in args:
|
|
@@ -766,7 +764,7 @@ class Adaptor(SelectorsGeneration):
|
|
| 766 |
functions.append(arg)
|
| 767 |
else:
|
| 768 |
raise TypeError(
|
| 769 |
-
"Callable filter function must have at least one argument to take `
|
| 770 |
)
|
| 771 |
|
| 772 |
else:
|
|
@@ -820,12 +818,12 @@ class Adaptor(SelectorsGeneration):
|
|
| 820 |
self,
|
| 821 |
*args: Union[str, Iterable[str], Pattern, Callable, Dict[str, str]],
|
| 822 |
**kwargs: str,
|
| 823 |
-
) -> Union["
|
| 824 |
"""Find elements by filters of your creations for ease, then return the first result. Otherwise return `None`.
|
| 825 |
|
| 826 |
:param args: Tag name(s), iterable of tag names, regex patterns, function, or a dictionary of elements' attributes. Leave empty for selecting all.
|
| 827 |
:param kwargs: The attributes you want to filter elements based on it.
|
| 828 |
-
:return: The `
|
| 829 |
"""
|
| 830 |
for element in self.find_all(*args, **kwargs):
|
| 831 |
return element
|
|
@@ -928,15 +926,15 @@ class Adaptor(SelectorsGeneration):
|
|
| 928 |
return score
|
| 929 |
|
| 930 |
def save(
|
| 931 |
-
self, element: Union["
|
| 932 |
) -> None:
|
| 933 |
"""Saves the element's unique properties to the storage for retrieval and relocation later
|
| 934 |
|
| 935 |
-
:param element: The element itself that we want to save to storage, it can be an `
|
| 936 |
:param identifier: This is the identifier that will be used to retrieve the element later from the storage. See
|
| 937 |
the docs for more info.
|
| 938 |
"""
|
| 939 |
-
if self.
|
| 940 |
if isinstance(element, self.__class__):
|
| 941 |
element = element._root
|
| 942 |
|
|
@@ -956,7 +954,7 @@ class Adaptor(SelectorsGeneration):
|
|
| 956 |
the docs for more info.
|
| 957 |
:return: A dictionary of the unique properties
|
| 958 |
"""
|
| 959 |
-
if self.
|
| 960 |
return self._storage.retrieve(identifier)
|
| 961 |
|
| 962 |
log.critical(
|
|
@@ -1065,7 +1063,7 @@ class Adaptor(SelectorsGeneration):
|
|
| 1065 |
"src",
|
| 1066 |
),
|
| 1067 |
match_text: bool = False,
|
| 1068 |
-
) -> Union["
|
| 1069 |
"""Find elements that are in the same tree depth in the page with the same tag name and same parent tag etc...
|
| 1070 |
then return the ones that match the current element attributes with a percentage higher than the input threshold.
|
| 1071 |
|
|
@@ -1084,7 +1082,7 @@ class Adaptor(SelectorsGeneration):
|
|
| 1084 |
:param match_text: If True, element text content will be taken into calculation while matching.
|
| 1085 |
Not recommended to use in normal cases, but it depends.
|
| 1086 |
|
| 1087 |
-
:return: A ``
|
| 1088 |
"""
|
| 1089 |
# We will use the elements' root from now on to get the speed boost of using Lxml directly
|
| 1090 |
root = self._root
|
|
@@ -1128,7 +1126,7 @@ class Adaptor(SelectorsGeneration):
|
|
| 1128 |
partial: bool = False,
|
| 1129 |
case_sensitive: bool = False,
|
| 1130 |
clean_match: bool = True,
|
| 1131 |
-
) -> Union["
|
| 1132 |
"""Find elements that its text content fully/partially matches input.
|
| 1133 |
:param text: Text query to match
|
| 1134 |
:param first_match: Returns the first element that matches conditions, enabled by default
|
|
@@ -1137,7 +1135,7 @@ class Adaptor(SelectorsGeneration):
|
|
| 1137 |
:param clean_match: if enabled, this will ignore all whitespaces and consecutive spaces while matching
|
| 1138 |
"""
|
| 1139 |
|
| 1140 |
-
results =
|
| 1141 |
if not case_sensitive:
|
| 1142 |
text = text.lower()
|
| 1143 |
|
|
@@ -1174,14 +1172,14 @@ class Adaptor(SelectorsGeneration):
|
|
| 1174 |
first_match: bool = True,
|
| 1175 |
case_sensitive: bool = False,
|
| 1176 |
clean_match: bool = True,
|
| 1177 |
-
) -> Union["
|
| 1178 |
"""Find elements that its text content matches the input regex pattern.
|
| 1179 |
:param query: Regex query/pattern to match
|
| 1180 |
:param first_match: Return the first element that matches conditions; enabled by default.
|
| 1181 |
:param case_sensitive: If enabled, the letters case will be taken into consideration in the regex.
|
| 1182 |
:param clean_match: If enabled, this will ignore all whitespaces and consecutive spaces while matching.
|
| 1183 |
"""
|
| 1184 |
-
results =
|
| 1185 |
|
| 1186 |
# This selector gets all elements with text content
|
| 1187 |
for node in self.__handle_elements(
|
|
@@ -1206,24 +1204,24 @@ class Adaptor(SelectorsGeneration):
|
|
| 1206 |
return results
|
| 1207 |
|
| 1208 |
|
| 1209 |
-
class
|
| 1210 |
"""
|
| 1211 |
-
The `
|
| 1212 |
"""
|
| 1213 |
|
| 1214 |
__slots__ = ()
|
| 1215 |
|
| 1216 |
@typing.overload
|
| 1217 |
-
def __getitem__(self, pos: SupportsIndex) ->
|
| 1218 |
pass
|
| 1219 |
|
| 1220 |
@typing.overload
|
| 1221 |
-
def __getitem__(self, pos: slice) -> "
|
| 1222 |
pass
|
| 1223 |
|
| 1224 |
def __getitem__(
|
| 1225 |
self, pos: Union[SupportsIndex, slice]
|
| 1226 |
-
) -> Union[
|
| 1227 |
lst = super().__getitem__(pos)
|
| 1228 |
if isinstance(pos, slice):
|
| 1229 |
return self.__class__(lst)
|
|
@@ -1237,10 +1235,10 @@ class Adaptors(List[Adaptor]):
|
|
| 1237 |
auto_save: bool = False,
|
| 1238 |
percentage: int = 0,
|
| 1239 |
**kwargs: Any,
|
| 1240 |
-
) -> "
|
| 1241 |
"""
|
| 1242 |
Call the ``.xpath()`` method for each element in this list and return
|
| 1243 |
-
their results as another `
|
| 1244 |
|
| 1245 |
**Important:
|
| 1246 |
It's recommended to use the identifier argument if you plan to use a different selector later
|
|
@@ -1251,12 +1249,12 @@ class Adaptors(List[Adaptor]):
|
|
| 1251 |
:param selector: The XPath selector to be used.
|
| 1252 |
:param identifier: A string that will be used to retrieve element's data in auto-matching,
|
| 1253 |
otherwise the selector will be used.
|
| 1254 |
-
:param auto_save: Automatically save new elements for `
|
| 1255 |
:param percentage: The minimum percentage to accept while auto-matching and not going lower than that.
|
| 1256 |
Be aware that the percentage calculation depends solely on the page structure, so don't play with this
|
| 1257 |
number unless you must know what you are doing!
|
| 1258 |
|
| 1259 |
-
:return: `
|
| 1260 |
"""
|
| 1261 |
results = [
|
| 1262 |
n.xpath(
|
|
@@ -1272,10 +1270,10 @@ class Adaptors(List[Adaptor]):
|
|
| 1272 |
identifier: str = "",
|
| 1273 |
auto_save: bool = False,
|
| 1274 |
percentage: int = 0,
|
| 1275 |
-
) -> "
|
| 1276 |
"""
|
| 1277 |
Call the ``.css()`` method for each element in this list and return
|
| 1278 |
-
their results flattened as another `
|
| 1279 |
|
| 1280 |
**Important:
|
| 1281 |
It's recommended to use the identifier argument if you plan to use a different selector later
|
|
@@ -1284,12 +1282,12 @@ class Adaptors(List[Adaptor]):
|
|
| 1284 |
:param selector: The CSS3 selector to be used.
|
| 1285 |
:param identifier: A string that will be used to retrieve element's data in auto-matching,
|
| 1286 |
otherwise the selector will be used.
|
| 1287 |
-
:param auto_save: Automatically save new elements for `
|
| 1288 |
:param percentage: The minimum percentage to accept while auto-matching and not going lower than that.
|
| 1289 |
Be aware that the percentage calculation depends solely on the page structure, so don't play with this
|
| 1290 |
number unless you must know what you are doing!
|
| 1291 |
|
| 1292 |
-
:return: `
|
| 1293 |
"""
|
| 1294 |
results = [
|
| 1295 |
n.css(selector, identifier or selector, False, auto_save, percentage)
|
|
@@ -1340,7 +1338,7 @@ class Adaptors(List[Adaptor]):
|
|
| 1340 |
return result
|
| 1341 |
return default
|
| 1342 |
|
| 1343 |
-
def search(self, func: Callable[["
|
| 1344 |
"""Loop over all current elements and return the first element that matches the passed function
|
| 1345 |
:param func: A function that takes each element as an argument and returns True/False
|
| 1346 |
:return: The first element that match the function or ``None`` otherwise.
|
|
@@ -1350,10 +1348,10 @@ class Adaptors(List[Adaptor]):
|
|
| 1350 |
return element
|
| 1351 |
return None
|
| 1352 |
|
| 1353 |
-
def filter(self, func: Callable[["
|
| 1354 |
"""Filter current elements based on the passed function
|
| 1355 |
:param func: A function that takes each element as an argument and returns True/False
|
| 1356 |
-
:return: The new `
|
| 1357 |
"""
|
| 1358 |
return self.__class__([element for element in self if func(element)])
|
| 1359 |
|
|
@@ -1382,4 +1380,4 @@ class Adaptors(List[Adaptor]):
|
|
| 1382 |
|
| 1383 |
def __getstate__(self) -> Any:
|
| 1384 |
# lxml don't like it :)
|
| 1385 |
-
raise TypeError("Can't pickle
|
|
|
|
| 24 |
)
|
| 25 |
from scrapling.core.custom_types import AttributesHandler, TextHandler, TextHandlers
|
| 26 |
from scrapling.core.mixins import SelectorsGeneration
|
| 27 |
+
from scrapling.core.storage import (
|
| 28 |
SQLiteStorageSystem,
|
| 29 |
StorageSystemMixin,
|
| 30 |
_StorageTools,
|
|
|
|
| 33 |
from scrapling.core.utils import clean_spaces, flatten, html_forbidden, is_jsonable, log
|
| 34 |
|
| 35 |
|
| 36 |
+
class Selector(SelectorsGeneration):
|
| 37 |
__slots__ = (
|
| 38 |
"url",
|
| 39 |
"encoding",
|
| 40 |
+
"__adaptive_enabled",
|
| 41 |
"_root",
|
| 42 |
"_storage",
|
| 43 |
"__keep_comments",
|
|
|
|
| 58 |
root: Optional[html.HtmlElement] = None,
|
| 59 |
keep_comments: Optional[bool] = False,
|
| 60 |
keep_cdata: Optional[bool] = False,
|
| 61 |
+
adaptive: Optional[bool] = False,
|
| 62 |
_storage: object = None,
|
| 63 |
storage: Any = SQLiteStorageSystem,
|
| 64 |
storage_args: Optional[Dict] = None,
|
|
|
|
| 82 |
Don't use it unless you know what you are doing!
|
| 83 |
:param keep_comments: While parsing the HTML body, drop comments or not. Disabled by default for obvious reasons
|
| 84 |
:param keep_cdata: While parsing the HTML body, drop cdata or not. Disabled by default for cleaner HTML.
|
| 85 |
+
:param adaptive: Globally turn off the auto-match feature in all functions, this argument takes higher
|
| 86 |
priority over all auto-match related arguments/functions in the class.
|
| 87 |
:param storage: The storage class to be passed for auto-matching functionalities, see ``Docs`` for more info.
|
| 88 |
:param storage_args: A dictionary of ``argument->value`` pairs to be passed for the storage class.
|
|
|
|
| 90 |
"""
|
| 91 |
if root is None and not body and text is None:
|
| 92 |
raise ValueError(
|
| 93 |
+
"Selector class needs text, body, or root arguments to work"
|
| 94 |
)
|
| 95 |
|
| 96 |
self.__text = ""
|
|
|
|
| 134 |
|
| 135 |
self._root = root
|
| 136 |
|
| 137 |
+
self.__adaptive_enabled = adaptive
|
| 138 |
|
| 139 |
+
if self.__adaptive_enabled:
|
| 140 |
if _storage is not None:
|
| 141 |
self._storage = _storage
|
| 142 |
else:
|
|
|
|
| 214 |
"""
|
| 215 |
return TextHandler(str(element))
|
| 216 |
|
| 217 |
+
def __element_convertor(self, element: html.HtmlElement) -> "Selector":
|
| 218 |
+
"""Used internally to convert a single HtmlElement to Selector directly without checks"""
|
| 219 |
db_instance = (
|
| 220 |
self._storage if (hasattr(self, "_storage") and self._storage) else None
|
| 221 |
)
|
| 222 |
+
return Selector(
|
| 223 |
root=element,
|
| 224 |
url=self.url,
|
| 225 |
encoding=self.encoding,
|
| 226 |
+
adaptive=self.__adaptive_enabled,
|
| 227 |
+
_storage=db_instance, # Reuse existing storage if it exists otherwise it won't be checked if `adaptive` is turned off
|
| 228 |
keep_comments=self.__keep_comments,
|
| 229 |
keep_cdata=self.__keep_cdata,
|
| 230 |
huge_tree=self.__huge_tree_enabled,
|
|
|
|
| 233 |
|
| 234 |
def __handle_element(
|
| 235 |
self, element: Union[html.HtmlElement, etree._ElementUnicodeResult]
|
| 236 |
+
) -> Union[TextHandler, "Selector", None]:
|
| 237 |
+
"""Used internally in all functions to convert a single element to type (Selector|TextHandler) when possible"""
|
| 238 |
if element is None:
|
| 239 |
return None
|
| 240 |
elif self._is_text_node(element):
|
|
|
|
| 245 |
|
| 246 |
def __handle_elements(
|
| 247 |
self, result: List[Union[html.HtmlElement, etree._ElementUnicodeResult]]
|
| 248 |
+
) -> Union["Selectors", "TextHandlers", List]:
|
| 249 |
+
"""Used internally in all functions to convert results to type (Selectors|TextHandlers) in bulk when possible"""
|
| 250 |
if not len(
|
| 251 |
result
|
| 252 |
): # Lxml will give a warning if I used something like `not result`
|
| 253 |
+
return Selectors([])
|
| 254 |
|
| 255 |
# From within the code, this method will always get a list of the same type,
|
| 256 |
# so we will continue without checks for a slight performance boost
|
| 257 |
if self._is_text_node(result[0]):
|
| 258 |
return TextHandlers(list(map(self.__content_convertor, result)))
|
| 259 |
|
| 260 |
+
return Selectors(list(map(self.__element_convertor, result)))
|
| 261 |
|
| 262 |
def __getstate__(self) -> Any:
|
| 263 |
# lxml don't like it :)
|
| 264 |
+
raise TypeError("Can't pickle Selector objects")
|
| 265 |
|
| 266 |
# The following four properties I made them into functions instead of variables directly
|
| 267 |
# So they don't slow down the process of initializing many instances of the class and gets executed only
|
|
|
|
| 322 |
return TextHandler(separator).join(_all_strings)
|
| 323 |
|
| 324 |
def urljoin(self, relative_url: str) -> str:
|
| 325 |
+
"""Join this Selector's url with a relative url to form an absolute full URL."""
|
| 326 |
return urljoin(self.url, relative_url)
|
| 327 |
|
| 328 |
@property
|
|
|
|
| 363 |
return class_name in self._root.classes
|
| 364 |
|
| 365 |
@property
|
| 366 |
+
def parent(self) -> Union["Selector", None]:
|
| 367 |
"""Return the direct parent of the element or ``None`` otherwise"""
|
| 368 |
return self.__handle_element(self._root.getparent())
|
| 369 |
|
| 370 |
@property
|
| 371 |
+
def below_elements(self) -> "Selectors[Selector]":
|
| 372 |
"""Return all elements under the current element in the DOM tree"""
|
| 373 |
below = self._root.xpath(".//*")
|
| 374 |
return self.__handle_elements(below)
|
| 375 |
|
| 376 |
@property
|
| 377 |
+
def children(self) -> "Selectors[Selector]":
|
| 378 |
"""Return the children elements of the current element or empty list otherwise"""
|
| 379 |
+
return Selectors(
|
| 380 |
[
|
| 381 |
self.__element_convertor(child)
|
| 382 |
for child in self._root.iterchildren()
|
|
|
|
| 385 |
)
|
| 386 |
|
| 387 |
@property
|
| 388 |
+
def siblings(self) -> "Selectors[Selector]":
|
| 389 |
"""Return other children of the current element's parent or empty list otherwise"""
|
| 390 |
if self.parent:
|
| 391 |
+
return Selectors(
|
| 392 |
[child for child in self.parent.children if child._root != self._root]
|
| 393 |
)
|
| 394 |
+
return Selectors([])
|
| 395 |
|
| 396 |
+
def iterancestors(self) -> Generator["Selector", None, None]:
|
| 397 |
"""Return a generator that loops over all ancestors of the element, starting with the element's parent."""
|
| 398 |
for ancestor in self._root.iterancestors():
|
| 399 |
yield self.__element_convertor(ancestor)
|
| 400 |
|
| 401 |
def find_ancestor(
|
| 402 |
+
self, func: Callable[["Selector"], bool]
|
| 403 |
+
) -> Union["Selector", None]:
|
| 404 |
"""Loop over all ancestors of the element till one match the passed function
|
| 405 |
:param func: A function that takes each ancestor as an argument and returns True/False
|
| 406 |
:return: The first ancestor that match the function or ``None`` otherwise.
|
|
|
|
| 411 |
return None
|
| 412 |
|
| 413 |
@property
|
| 414 |
+
def path(self) -> "Selectors[Selector]":
|
| 415 |
+
"""Returns a list of type `Selectors` that contains the path leading to the current element from the root."""
|
| 416 |
lst = list(self.iterancestors())
|
| 417 |
+
return Selectors(lst)
|
| 418 |
|
| 419 |
@property
|
| 420 |
+
def next(self) -> Union["Selector", None]:
|
| 421 |
"""Returns the next element of the current element in the children of the parent or ``None`` otherwise."""
|
| 422 |
next_element = self._root.getnext()
|
| 423 |
if next_element is not None:
|
|
|
|
| 428 |
return self.__handle_element(next_element)
|
| 429 |
|
| 430 |
@property
|
| 431 |
+
def previous(self) -> Union["Selector", None]:
|
| 432 |
"""Returns the previous element of the current element in the children of the parent or ``None`` otherwise."""
|
| 433 |
prev_element = self._root.getprevious()
|
| 434 |
if prev_element is not None:
|
|
|
|
| 471 |
# From here we start with the selecting functions
|
| 472 |
def relocate(
|
| 473 |
self,
|
| 474 |
+
element: Union[Dict, html.HtmlElement, "Selector"],
|
| 475 |
percentage: int = 0,
|
| 476 |
+
selector_type: bool = False,
|
| 477 |
+
) -> Union[List[Union[html.HtmlElement, None]], "Selectors"]:
|
| 478 |
"""This function will search again for the element in the page tree, used automatically on page structure change
|
| 479 |
|
| 480 |
:param element: The element we want to relocate in the tree
|
| 481 |
:param percentage: The minimum percentage to accept and not going lower than that. Be aware that the percentage
|
| 482 |
calculation depends solely on the page structure, so don't play with this number unless you must know
|
| 483 |
what you are doing!
|
| 484 |
+
:param selector_type: If True, the return result will be converted to `Selectors` object
|
| 485 |
+
:return: List of pure HTML elements that got the highest matching score or 'Selectors' object
|
| 486 |
"""
|
| 487 |
score_table = {}
|
| 488 |
# Note: `element` will most likely always be a dictionary at this point.
|
|
|
|
| 511 |
f"{percent} -> {self.__handle_elements(score_table[percent])}"
|
| 512 |
)
|
| 513 |
|
| 514 |
+
if not selector_type:
|
| 515 |
return score_table[highest_probability]
|
| 516 |
return self.__handle_elements(score_table[highest_probability])
|
| 517 |
return []
|
|
|
|
| 520 |
self,
|
| 521 |
selector: str,
|
| 522 |
identifier: str = "",
|
| 523 |
+
adaptive: bool = False,
|
| 524 |
auto_save: bool = False,
|
| 525 |
percentage: int = 0,
|
| 526 |
+
) -> Union["Selector", "TextHandler", None]:
|
| 527 |
"""Search the current tree with CSS3 selectors and return the first result if possible, otherwise return `None`
|
| 528 |
|
| 529 |
**Important:
|
|
|
|
| 531 |
and want to relocate the same element(s)**
|
| 532 |
|
| 533 |
:param selector: The CSS3 selector to be used.
|
| 534 |
+
:param adaptive: Enabled will make the function try to relocate the element if it was 'saved' before
|
| 535 |
:param identifier: A string that will be used to save/retrieve element's data in auto-matching,
|
| 536 |
otherwise the selector will be used.
|
| 537 |
+
:param auto_save: Automatically save new elements for `adaptive` later
|
| 538 |
:param percentage: The minimum percentage to accept while auto-matching and not going lower than that.
|
| 539 |
Be aware that the percentage calculation depends solely on the page structure, so don't play with this
|
| 540 |
number unless you must know what you are doing!
|
| 541 |
"""
|
| 542 |
+
for element in self.css(selector, identifier, adaptive, auto_save, percentage):
|
|
|
|
|
|
|
| 543 |
return element
|
| 544 |
return None
|
| 545 |
|
|
|
|
| 547 |
self,
|
| 548 |
selector: str,
|
| 549 |
identifier: str = "",
|
| 550 |
+
adaptive: bool = False,
|
| 551 |
auto_save: bool = False,
|
| 552 |
percentage: int = 0,
|
| 553 |
**kwargs: Any,
|
| 554 |
+
) -> Union["Selector", "TextHandler", None]:
|
| 555 |
"""Search the current tree with XPath selectors and return the first result if possible, otherwise return `None`
|
| 556 |
|
| 557 |
**Important:
|
|
|
|
| 561 |
Note: **Additional keyword arguments will be passed as XPath variables in the XPath expression!**
|
| 562 |
|
| 563 |
:param selector: The XPath selector to be used.
|
| 564 |
+
:param adaptive: Enabled will make the function try to relocate the element if it was 'saved' before
|
| 565 |
:param identifier: A string that will be used to save/retrieve element's data in auto-matching,
|
| 566 |
otherwise the selector will be used.
|
| 567 |
+
:param auto_save: Automatically save new elements for `adaptive` later
|
| 568 |
:param percentage: The minimum percentage to accept while auto-matching and not going lower than that.
|
| 569 |
Be aware that the percentage calculation depends solely on the page structure, so don't play with this
|
| 570 |
number unless you must know what you are doing!
|
| 571 |
"""
|
| 572 |
for element in self.xpath(
|
| 573 |
+
selector, identifier, adaptive, auto_save, percentage, **kwargs
|
| 574 |
):
|
| 575 |
return element
|
| 576 |
return None
|
|
|
|
| 579 |
self,
|
| 580 |
selector: str,
|
| 581 |
identifier: str = "",
|
| 582 |
+
adaptive: bool = False,
|
| 583 |
auto_save: bool = False,
|
| 584 |
percentage: int = 0,
|
| 585 |
+
) -> Union["Selectors[Selector]", List, "TextHandlers[TextHandler]"]:
|
| 586 |
"""Search the current tree with CSS3 selectors
|
| 587 |
|
| 588 |
**Important:
|
|
|
|
| 590 |
and want to relocate the same element(s)**
|
| 591 |
|
| 592 |
:param selector: The CSS3 selector to be used.
|
| 593 |
+
:param adaptive: Enabled will make the function try to relocate the element if it was 'saved' before
|
| 594 |
:param identifier: A string that will be used to save/retrieve element's data in auto-matching,
|
| 595 |
otherwise the selector will be used.
|
| 596 |
+
:param auto_save: Automatically save new elements for `adaptive` later
|
| 597 |
:param percentage: The minimum percentage to accept while auto-matching and not going lower than that.
|
| 598 |
Be aware that the percentage calculation depends solely on the page structure, so don't play with this
|
| 599 |
number unless you must know what you are doing!
|
| 600 |
|
| 601 |
+
:return: `Selectors` class.
|
| 602 |
"""
|
| 603 |
try:
|
| 604 |
+
if not self.__adaptive_enabled or "," not in selector:
|
| 605 |
# No need to split selectors in this case, let's save some CPU cycles :)
|
| 606 |
xpath_selector = translator_instance.css_to_xpath(selector)
|
| 607 |
return self.xpath(
|
| 608 |
xpath_selector,
|
| 609 |
identifier or selector,
|
| 610 |
+
adaptive,
|
| 611 |
auto_save,
|
| 612 |
percentage,
|
| 613 |
)
|
|
|
|
| 623 |
results += self.xpath(
|
| 624 |
xpath_selector,
|
| 625 |
identifier or single_selector.canonical(),
|
| 626 |
+
adaptive,
|
| 627 |
auto_save,
|
| 628 |
percentage,
|
| 629 |
)
|
|
|
|
| 641 |
self,
|
| 642 |
selector: str,
|
| 643 |
identifier: str = "",
|
| 644 |
+
adaptive: bool = False,
|
| 645 |
auto_save: bool = False,
|
| 646 |
percentage: int = 0,
|
| 647 |
**kwargs: Any,
|
| 648 |
+
) -> Union["Selectors[Selector]", List, "TextHandlers[TextHandler]"]:
|
| 649 |
"""Search the current tree with XPath selectors
|
| 650 |
|
| 651 |
**Important:
|
|
|
|
| 655 |
Note: **Additional keyword arguments will be passed as XPath variables in the XPath expression!**
|
| 656 |
|
| 657 |
:param selector: The XPath selector to be used.
|
| 658 |
+
:param adaptive: Enabled will make the function try to relocate the element if it was 'saved' before
|
| 659 |
:param identifier: A string that will be used to save/retrieve element's data in auto-matching,
|
| 660 |
otherwise the selector will be used.
|
| 661 |
+
:param auto_save: Automatically save new elements for `adaptive` later
|
| 662 |
:param percentage: The minimum percentage to accept while auto-matching and not going lower than that.
|
| 663 |
Be aware that the percentage calculation depends solely on the page structure, so don't play with this
|
| 664 |
number unless you must know what you are doing!
|
| 665 |
|
| 666 |
+
:return: `Selectors` class.
|
| 667 |
"""
|
| 668 |
try:
|
| 669 |
elements = self._root.xpath(selector, **kwargs)
|
| 670 |
|
| 671 |
if elements:
|
| 672 |
if auto_save:
|
| 673 |
+
if not self.__adaptive_enabled:
|
| 674 |
log.warning(
|
| 675 |
+
"Argument `auto_save` will be ignored because `adaptive` wasn't enabled on initialization. Check docs for more info."
|
| 676 |
)
|
| 677 |
else:
|
| 678 |
self.save(elements[0], identifier or selector)
|
| 679 |
|
| 680 |
return self.__handle_elements(elements)
|
| 681 |
+
elif self.__adaptive_enabled:
|
| 682 |
+
if adaptive:
|
| 683 |
element_data = self.retrieve(identifier or selector)
|
| 684 |
if element_data:
|
| 685 |
elements = self.relocate(element_data, percentage)
|
|
|
|
| 688 |
|
| 689 |
return self.__handle_elements(elements)
|
| 690 |
else:
|
| 691 |
+
if adaptive:
|
| 692 |
log.warning(
|
| 693 |
+
"Argument `adaptive` will be ignored because `adaptive` wasn't enabled on initialization. Check docs for more info."
|
| 694 |
)
|
| 695 |
elif auto_save:
|
| 696 |
log.warning(
|
| 697 |
+
"Argument `auto_save` will be ignored because `adaptive` wasn't enabled on initialization. Check docs for more info."
|
| 698 |
)
|
| 699 |
|
| 700 |
return self.__handle_elements(elements)
|
|
|
|
| 711 |
self,
|
| 712 |
*args: Union[str, Iterable[str], Pattern, Callable, Dict[str, str]],
|
| 713 |
**kwargs: str,
|
| 714 |
+
) -> "Selectors":
|
| 715 |
"""Find elements by filters of your creations for ease.
|
| 716 |
|
| 717 |
:param args: Tag name(s), iterable of tag names, regex patterns, function, or a dictionary of elements' attributes. Leave empty for selecting all.
|
| 718 |
:param kwargs: The attributes you want to filter elements based on it.
|
| 719 |
+
:return: The `Selectors` object of the elements or empty list
|
| 720 |
"""
|
| 721 |
# Attributes that are Python reserved words and can't be used directly
|
| 722 |
# Ex: find_all('a', class="blah") -> find_all('a', class_="blah")
|
|
|
|
| 733 |
|
| 734 |
attributes = dict()
|
| 735 |
tags, patterns = set(), set()
|
| 736 |
+
results, functions, selectors = Selectors([]), [], []
|
| 737 |
|
| 738 |
# Brace yourself for a wonderful journey!
|
| 739 |
for arg in args:
|
|
|
|
| 764 |
functions.append(arg)
|
| 765 |
else:
|
| 766 |
raise TypeError(
|
| 767 |
+
"Callable filter function must have at least one argument to take `Selector` objects."
|
| 768 |
)
|
| 769 |
|
| 770 |
else:
|
|
|
|
| 818 |
self,
|
| 819 |
*args: Union[str, Iterable[str], Pattern, Callable, Dict[str, str]],
|
| 820 |
**kwargs: str,
|
| 821 |
+
) -> Union["Selector", None]:
|
| 822 |
"""Find elements by filters of your creations for ease, then return the first result. Otherwise return `None`.
|
| 823 |
|
| 824 |
:param args: Tag name(s), iterable of tag names, regex patterns, function, or a dictionary of elements' attributes. Leave empty for selecting all.
|
| 825 |
:param kwargs: The attributes you want to filter elements based on it.
|
| 826 |
+
:return: The `Selector` object of the element or `None` if the result didn't match
|
| 827 |
"""
|
| 828 |
for element in self.find_all(*args, **kwargs):
|
| 829 |
return element
|
|
|
|
| 926 |
return score
|
| 927 |
|
| 928 |
def save(
|
| 929 |
+
self, element: Union["Selector", html.HtmlElement], identifier: str
|
| 930 |
) -> None:
|
| 931 |
"""Saves the element's unique properties to the storage for retrieval and relocation later
|
| 932 |
|
| 933 |
+
:param element: The element itself that we want to save to storage, it can be an ` Selector ` or pure ` HtmlElement `
|
| 934 |
:param identifier: This is the identifier that will be used to retrieve the element later from the storage. See
|
| 935 |
the docs for more info.
|
| 936 |
"""
|
| 937 |
+
if self.__adaptive_enabled:
|
| 938 |
if isinstance(element, self.__class__):
|
| 939 |
element = element._root
|
| 940 |
|
|
|
|
| 954 |
the docs for more info.
|
| 955 |
:return: A dictionary of the unique properties
|
| 956 |
"""
|
| 957 |
+
if self.__adaptive_enabled:
|
| 958 |
return self._storage.retrieve(identifier)
|
| 959 |
|
| 960 |
log.critical(
|
|
|
|
| 1063 |
"src",
|
| 1064 |
),
|
| 1065 |
match_text: bool = False,
|
| 1066 |
+
) -> Union["Selectors[Selector]", List]:
|
| 1067 |
"""Find elements that are in the same tree depth in the page with the same tag name and same parent tag etc...
|
| 1068 |
then return the ones that match the current element attributes with a percentage higher than the input threshold.
|
| 1069 |
|
|
|
|
| 1082 |
:param match_text: If True, element text content will be taken into calculation while matching.
|
| 1083 |
Not recommended to use in normal cases, but it depends.
|
| 1084 |
|
| 1085 |
+
:return: A ``Selectors`` container of ``Selector`` objects or empty list
|
| 1086 |
"""
|
| 1087 |
# We will use the elements' root from now on to get the speed boost of using Lxml directly
|
| 1088 |
root = self._root
|
|
|
|
| 1126 |
partial: bool = False,
|
| 1127 |
case_sensitive: bool = False,
|
| 1128 |
clean_match: bool = True,
|
| 1129 |
+
) -> Union["Selectors[Selector]", "Selector"]:
|
| 1130 |
"""Find elements that its text content fully/partially matches input.
|
| 1131 |
:param text: Text query to match
|
| 1132 |
:param first_match: Returns the first element that matches conditions, enabled by default
|
|
|
|
| 1135 |
:param clean_match: if enabled, this will ignore all whitespaces and consecutive spaces while matching
|
| 1136 |
"""
|
| 1137 |
|
| 1138 |
+
results = Selectors([])
|
| 1139 |
if not case_sensitive:
|
| 1140 |
text = text.lower()
|
| 1141 |
|
|
|
|
| 1172 |
first_match: bool = True,
|
| 1173 |
case_sensitive: bool = False,
|
| 1174 |
clean_match: bool = True,
|
| 1175 |
+
) -> Union["Selectors[Selector]", "Selector"]:
|
| 1176 |
"""Find elements that its text content matches the input regex pattern.
|
| 1177 |
:param query: Regex query/pattern to match
|
| 1178 |
:param first_match: Return the first element that matches conditions; enabled by default.
|
| 1179 |
:param case_sensitive: If enabled, the letters case will be taken into consideration in the regex.
|
| 1180 |
:param clean_match: If enabled, this will ignore all whitespaces and consecutive spaces while matching.
|
| 1181 |
"""
|
| 1182 |
+
results = Selectors([])
|
| 1183 |
|
| 1184 |
# This selector gets all elements with text content
|
| 1185 |
for node in self.__handle_elements(
|
|
|
|
| 1204 |
return results
|
| 1205 |
|
| 1206 |
|
| 1207 |
+
class Selectors(List[Selector]):
|
| 1208 |
"""
|
| 1209 |
+
The `Selectors` class is a subclass of the builtin ``List`` class, which provides a few additional methods.
|
| 1210 |
"""
|
| 1211 |
|
| 1212 |
__slots__ = ()
|
| 1213 |
|
| 1214 |
@typing.overload
|
| 1215 |
+
def __getitem__(self, pos: SupportsIndex) -> Selector:
|
| 1216 |
pass
|
| 1217 |
|
| 1218 |
@typing.overload
|
| 1219 |
+
def __getitem__(self, pos: slice) -> "Selectors":
|
| 1220 |
pass
|
| 1221 |
|
| 1222 |
def __getitem__(
|
| 1223 |
self, pos: Union[SupportsIndex, slice]
|
| 1224 |
+
) -> Union[Selector, "Selectors"]:
|
| 1225 |
lst = super().__getitem__(pos)
|
| 1226 |
if isinstance(pos, slice):
|
| 1227 |
return self.__class__(lst)
|
|
|
|
| 1235 |
auto_save: bool = False,
|
| 1236 |
percentage: int = 0,
|
| 1237 |
**kwargs: Any,
|
| 1238 |
+
) -> "Selectors[Selector]":
|
| 1239 |
"""
|
| 1240 |
Call the ``.xpath()`` method for each element in this list and return
|
| 1241 |
+
their results as another `Selectors` class.
|
| 1242 |
|
| 1243 |
**Important:
|
| 1244 |
It's recommended to use the identifier argument if you plan to use a different selector later
|
|
|
|
| 1249 |
:param selector: The XPath selector to be used.
|
| 1250 |
:param identifier: A string that will be used to retrieve element's data in auto-matching,
|
| 1251 |
otherwise the selector will be used.
|
| 1252 |
+
:param auto_save: Automatically save new elements for `adaptive` later
|
| 1253 |
:param percentage: The minimum percentage to accept while auto-matching and not going lower than that.
|
| 1254 |
Be aware that the percentage calculation depends solely on the page structure, so don't play with this
|
| 1255 |
number unless you must know what you are doing!
|
| 1256 |
|
| 1257 |
+
:return: `Selectors` class.
|
| 1258 |
"""
|
| 1259 |
results = [
|
| 1260 |
n.xpath(
|
|
|
|
| 1270 |
identifier: str = "",
|
| 1271 |
auto_save: bool = False,
|
| 1272 |
percentage: int = 0,
|
| 1273 |
+
) -> "Selectors[Selector]":
|
| 1274 |
"""
|
| 1275 |
Call the ``.css()`` method for each element in this list and return
|
| 1276 |
+
their results flattened as another `Selectors` class.
|
| 1277 |
|
| 1278 |
**Important:
|
| 1279 |
It's recommended to use the identifier argument if you plan to use a different selector later
|
|
|
|
| 1282 |
:param selector: The CSS3 selector to be used.
|
| 1283 |
:param identifier: A string that will be used to retrieve element's data in auto-matching,
|
| 1284 |
otherwise the selector will be used.
|
| 1285 |
+
:param auto_save: Automatically save new elements for `adaptive` later
|
| 1286 |
:param percentage: The minimum percentage to accept while auto-matching and not going lower than that.
|
| 1287 |
Be aware that the percentage calculation depends solely on the page structure, so don't play with this
|
| 1288 |
number unless you must know what you are doing!
|
| 1289 |
|
| 1290 |
+
:return: `Selectors` class.
|
| 1291 |
"""
|
| 1292 |
results = [
|
| 1293 |
n.css(selector, identifier or selector, False, auto_save, percentage)
|
|
|
|
| 1338 |
return result
|
| 1339 |
return default
|
| 1340 |
|
| 1341 |
+
def search(self, func: Callable[["Selector"], bool]) -> Union["Selector", None]:
|
| 1342 |
"""Loop over all current elements and return the first element that matches the passed function
|
| 1343 |
:param func: A function that takes each element as an argument and returns True/False
|
| 1344 |
:return: The first element that match the function or ``None`` otherwise.
|
|
|
|
| 1348 |
return element
|
| 1349 |
return None
|
| 1350 |
|
| 1351 |
+
def filter(self, func: Callable[["Selector"], bool]) -> "Selectors[Selector]":
|
| 1352 |
"""Filter current elements based on the passed function
|
| 1353 |
:param func: A function that takes each element as an argument and returns True/False
|
| 1354 |
+
:return: The new `Selectors` object or empty list otherwise.
|
| 1355 |
"""
|
| 1356 |
return self.__class__([element for element in self if func(element)])
|
| 1357 |
|
|
|
|
| 1380 |
|
| 1381 |
def __getstate__(self) -> Any:
|
| 1382 |
# lxml don't like it :)
|
| 1383 |
+
raise TypeError("Can't pickle Selectors object")
|
tests/fetchers/async/test_camoufox.py
CHANGED
|
@@ -3,7 +3,7 @@ import pytest_httpbin
|
|
| 3 |
|
| 4 |
from scrapling import StealthyFetcher
|
| 5 |
|
| 6 |
-
StealthyFetcher.
|
| 7 |
|
| 8 |
|
| 9 |
@pytest_httpbin.use_class_based_httpbin
|
|
|
|
| 3 |
|
| 4 |
from scrapling import StealthyFetcher
|
| 5 |
|
| 6 |
+
StealthyFetcher.adaptive = True
|
| 7 |
|
| 8 |
|
| 9 |
@pytest_httpbin.use_class_based_httpbin
|
tests/fetchers/async/test_dynamic.py
CHANGED
|
@@ -3,7 +3,7 @@ import pytest_httpbin
|
|
| 3 |
|
| 4 |
from scrapling import DynamicFetcher
|
| 5 |
|
| 6 |
-
DynamicFetcher.
|
| 7 |
|
| 8 |
|
| 9 |
@pytest_httpbin.use_class_based_httpbin
|
|
|
|
| 3 |
|
| 4 |
from scrapling import DynamicFetcher
|
| 5 |
|
| 6 |
+
DynamicFetcher.adaptive = True
|
| 7 |
|
| 8 |
|
| 9 |
@pytest_httpbin.use_class_based_httpbin
|
tests/fetchers/async/test_requests.py
CHANGED
|
@@ -3,7 +3,7 @@ import pytest_httpbin
|
|
| 3 |
|
| 4 |
from scrapling.fetchers import AsyncFetcher
|
| 5 |
|
| 6 |
-
AsyncFetcher.
|
| 7 |
|
| 8 |
|
| 9 |
@pytest_httpbin.use_class_based_httpbin
|
|
|
|
| 3 |
|
| 4 |
from scrapling.fetchers import AsyncFetcher
|
| 5 |
|
| 6 |
+
AsyncFetcher.adaptive = True
|
| 7 |
|
| 8 |
|
| 9 |
@pytest_httpbin.use_class_based_httpbin
|
tests/fetchers/sync/test_camoufox.py
CHANGED
|
@@ -3,7 +3,7 @@ import pytest_httpbin
|
|
| 3 |
|
| 4 |
from scrapling import StealthyFetcher
|
| 5 |
|
| 6 |
-
StealthyFetcher.
|
| 7 |
|
| 8 |
|
| 9 |
@pytest_httpbin.use_class_based_httpbin
|
|
|
|
| 3 |
|
| 4 |
from scrapling import StealthyFetcher
|
| 5 |
|
| 6 |
+
StealthyFetcher.adaptive = True
|
| 7 |
|
| 8 |
|
| 9 |
@pytest_httpbin.use_class_based_httpbin
|
tests/fetchers/sync/test_dynamic.py
CHANGED
|
@@ -5,7 +5,7 @@ import pytest_httpbin
|
|
| 5 |
|
| 6 |
from scrapling import DynamicFetcher
|
| 7 |
|
| 8 |
-
DynamicFetcher.
|
| 9 |
|
| 10 |
|
| 11 |
@pytest_httpbin.use_class_based_httpbin
|
|
|
|
| 5 |
|
| 6 |
from scrapling import DynamicFetcher
|
| 7 |
|
| 8 |
+
DynamicFetcher.adaptive = True
|
| 9 |
|
| 10 |
|
| 11 |
@pytest_httpbin.use_class_based_httpbin
|
tests/fetchers/sync/test_requests.py
CHANGED
|
@@ -3,7 +3,7 @@ import pytest_httpbin
|
|
| 3 |
|
| 4 |
from scrapling import Fetcher
|
| 5 |
|
| 6 |
-
Fetcher.
|
| 7 |
|
| 8 |
|
| 9 |
@pytest_httpbin.use_class_based_httpbin
|
|
|
|
| 3 |
|
| 4 |
from scrapling import Fetcher
|
| 5 |
|
| 6 |
+
Fetcher.adaptive = True
|
| 7 |
|
| 8 |
|
| 9 |
@pytest_httpbin.use_class_based_httpbin
|
tests/parser/{test_automatch.py → test_adaptive.py}
RENAMED
|
@@ -2,10 +2,10 @@ import asyncio
|
|
| 2 |
|
| 3 |
import pytest
|
| 4 |
|
| 5 |
-
from scrapling import
|
| 6 |
|
| 7 |
|
| 8 |
-
class
|
| 9 |
def test_element_relocation(self):
|
| 10 |
"""Test relocating element after structure change"""
|
| 11 |
original_html = """
|
|
@@ -43,13 +43,13 @@ class TestParserAutoMatch:
|
|
| 43 |
</div>
|
| 44 |
"""
|
| 45 |
|
| 46 |
-
old_page =
|
| 47 |
-
new_page =
|
| 48 |
|
| 49 |
# 'p1' was used as ID and now it's not and all the path elements have changes
|
| 50 |
# Also at the same time testing auto-match vs combined selectors
|
| 51 |
_ = old_page.css("#p1, #p2", auto_save=True)[0]
|
| 52 |
-
relocated = new_page.css("#p1",
|
| 53 |
|
| 54 |
assert relocated is not None
|
| 55 |
assert relocated[0].attrib["data-id"] == "p1"
|
|
@@ -97,13 +97,13 @@ class TestParserAutoMatch:
|
|
| 97 |
# Simulate async operation
|
| 98 |
await asyncio.sleep(0.1) # Minimal async operation
|
| 99 |
|
| 100 |
-
old_page =
|
| 101 |
-
new_page =
|
| 102 |
|
| 103 |
# 'p1' was used as ID and now it's not and all the path elements have changes
|
| 104 |
# Also at the same time testing auto-match vs combined selectors
|
| 105 |
_ = old_page.css("#p1, #p2", auto_save=True)[0]
|
| 106 |
-
relocated = new_page.css("#p1",
|
| 107 |
|
| 108 |
assert relocated is not None
|
| 109 |
assert relocated[0].attrib["data-id"] == "p1"
|
|
|
|
| 2 |
|
| 3 |
import pytest
|
| 4 |
|
| 5 |
+
from scrapling import Selector
|
| 6 |
|
| 7 |
|
| 8 |
+
class TestParserAdaptive:
|
| 9 |
def test_element_relocation(self):
|
| 10 |
"""Test relocating element after structure change"""
|
| 11 |
original_html = """
|
|
|
|
| 43 |
</div>
|
| 44 |
"""
|
| 45 |
|
| 46 |
+
old_page = Selector(original_html, url="example.com", adaptive=True)
|
| 47 |
+
new_page = Selector(changed_html, url="example.com", adaptive=True)
|
| 48 |
|
| 49 |
# 'p1' was used as ID and now it's not and all the path elements have changes
|
| 50 |
# Also at the same time testing auto-match vs combined selectors
|
| 51 |
_ = old_page.css("#p1, #p2", auto_save=True)[0]
|
| 52 |
+
relocated = new_page.css("#p1", adaptive=True)
|
| 53 |
|
| 54 |
assert relocated is not None
|
| 55 |
assert relocated[0].attrib["data-id"] == "p1"
|
|
|
|
| 97 |
# Simulate async operation
|
| 98 |
await asyncio.sleep(0.1) # Minimal async operation
|
| 99 |
|
| 100 |
+
old_page = Selector(original_html, url="example.com", adaptive=True)
|
| 101 |
+
new_page = Selector(changed_html, url="example.com", adaptive=True)
|
| 102 |
|
| 103 |
# 'p1' was used as ID and now it's not and all the path elements have changes
|
| 104 |
# Also at the same time testing auto-match vs combined selectors
|
| 105 |
_ = old_page.css("#p1, #p2", auto_save=True)[0]
|
| 106 |
+
relocated = new_page.css("#p1", adaptive=True)
|
| 107 |
|
| 108 |
assert relocated is not None
|
| 109 |
assert relocated[0].attrib["data-id"] == "p1"
|
tests/parser/test_general.py
CHANGED
|
@@ -4,7 +4,7 @@ import time
|
|
| 4 |
import pytest
|
| 5 |
from cssselect import SelectorError, SelectorSyntaxError
|
| 6 |
|
| 7 |
-
from scrapling import
|
| 8 |
|
| 9 |
|
| 10 |
@pytest.fixture
|
|
@@ -78,7 +78,7 @@ def html_content():
|
|
| 78 |
|
| 79 |
@pytest.fixture
|
| 80 |
def page(html_content):
|
| 81 |
-
return
|
| 82 |
|
| 83 |
|
| 84 |
# CSS Selector Tests
|
|
@@ -162,26 +162,26 @@ class TestSimilarElements:
|
|
| 162 |
|
| 163 |
# Error Handling Tests
|
| 164 |
class TestErrorHandling:
|
| 165 |
-
def
|
| 166 |
-
"""Test various invalid
|
| 167 |
# No arguments
|
| 168 |
with pytest.raises(ValueError):
|
| 169 |
-
_ =
|
| 170 |
|
| 171 |
# Invalid argument types
|
| 172 |
with pytest.raises(TypeError):
|
| 173 |
-
_ =
|
| 174 |
|
| 175 |
with pytest.raises(TypeError):
|
| 176 |
-
_ =
|
| 177 |
|
| 178 |
with pytest.raises(TypeError):
|
| 179 |
-
_ =
|
| 180 |
|
| 181 |
def test_invalid_storage(self, page, html_content):
|
| 182 |
"""Test invalid storage parameter"""
|
| 183 |
with pytest.raises(ValueError):
|
| 184 |
-
_ =
|
| 185 |
|
| 186 |
def test_bad_selectors(self, page):
|
| 187 |
"""Test handling of invalid selectors"""
|
|
@@ -195,7 +195,7 @@ class TestErrorHandling:
|
|
| 195 |
# Pickling and Object Representation Tests
|
| 196 |
class TestPicklingAndRepresentation:
|
| 197 |
def test_unpickleable_objects(self, page):
|
| 198 |
-
"""Test that
|
| 199 |
table = page.css(".product-list")[0]
|
| 200 |
with pytest.raises(TypeError):
|
| 201 |
pickle.dumps(table)
|
|
@@ -299,7 +299,7 @@ def test_large_html_parsing_performance():
|
|
| 299 |
)
|
| 300 |
|
| 301 |
start_time = time.time()
|
| 302 |
-
parsed =
|
| 303 |
elements = parsed.css(".item")
|
| 304 |
end_time = time.time()
|
| 305 |
|
|
@@ -315,7 +315,7 @@ def test_large_html_parsing_performance():
|
|
| 315 |
def test_selectors_generation(page):
|
| 316 |
"""Try to create selectors for all elements in the page"""
|
| 317 |
|
| 318 |
-
def _traverse(element:
|
| 319 |
assert isinstance(element.generate_css_selector, str)
|
| 320 |
assert isinstance(element.generate_xpath_selector, str)
|
| 321 |
for branch in element.children:
|
|
|
|
| 4 |
import pytest
|
| 5 |
from cssselect import SelectorError, SelectorSyntaxError
|
| 6 |
|
| 7 |
+
from scrapling import Selector
|
| 8 |
|
| 9 |
|
| 10 |
@pytest.fixture
|
|
|
|
| 78 |
|
| 79 |
@pytest.fixture
|
| 80 |
def page(html_content):
|
| 81 |
+
return Selector(html_content, adaptive=False)
|
| 82 |
|
| 83 |
|
| 84 |
# CSS Selector Tests
|
|
|
|
| 162 |
|
| 163 |
# Error Handling Tests
|
| 164 |
class TestErrorHandling:
|
| 165 |
+
def test_invalid_selector_initialization(self):
|
| 166 |
+
"""Test various invalid Selector initializations"""
|
| 167 |
# No arguments
|
| 168 |
with pytest.raises(ValueError):
|
| 169 |
+
_ = Selector(adaptive=False)
|
| 170 |
|
| 171 |
# Invalid argument types
|
| 172 |
with pytest.raises(TypeError):
|
| 173 |
+
_ = Selector(root="ayo", adaptive=False)
|
| 174 |
|
| 175 |
with pytest.raises(TypeError):
|
| 176 |
+
_ = Selector(text=1, adaptive=False)
|
| 177 |
|
| 178 |
with pytest.raises(TypeError):
|
| 179 |
+
_ = Selector(body=1, adaptive=False)
|
| 180 |
|
| 181 |
def test_invalid_storage(self, page, html_content):
|
| 182 |
"""Test invalid storage parameter"""
|
| 183 |
with pytest.raises(ValueError):
|
| 184 |
+
_ = Selector(html_content, storage=object, adaptive=True)
|
| 185 |
|
| 186 |
def test_bad_selectors(self, page):
|
| 187 |
"""Test handling of invalid selectors"""
|
|
|
|
| 195 |
# Pickling and Object Representation Tests
|
| 196 |
class TestPicklingAndRepresentation:
|
| 197 |
def test_unpickleable_objects(self, page):
|
| 198 |
+
"""Test that Selector objects cannot be pickled"""
|
| 199 |
table = page.css(".product-list")[0]
|
| 200 |
with pytest.raises(TypeError):
|
| 201 |
pickle.dumps(table)
|
|
|
|
| 299 |
)
|
| 300 |
|
| 301 |
start_time = time.time()
|
| 302 |
+
parsed = Selector(large_html, adaptive=False)
|
| 303 |
elements = parsed.css(".item")
|
| 304 |
end_time = time.time()
|
| 305 |
|
|
|
|
| 315 |
def test_selectors_generation(page):
|
| 316 |
"""Try to create selectors for all elements in the page"""
|
| 317 |
|
| 318 |
+
def _traverse(element: Selector):
|
| 319 |
assert isinstance(element.generate_css_selector, str)
|
| 320 |
assert isinstance(element.generate_xpath_selector, str)
|
| 321 |
for branch in element.children:
|