Karim shoair commited on
Commit ·
df3c414
1
Parent(s): 9c43da3
refactor(api)!: Unifying log under 1 logger and removing debug parameter
Browse files
So now you can control the logging and the debugging from the shell through the logger named 'scrapling'.
- .github/ISSUE_TEMPLATE/01-bug_report.yml +1 -1
- CONTRIBUTING.md +5 -1
- README.md +1 -1
- benchmarks.py +3 -3
- scrapling/core/storage_adaptors.py +5 -6
- scrapling/core/translator.py +2 -2
- scrapling/core/utils.py +29 -28
- scrapling/engines/camo.py +2 -3
- scrapling/engines/pw.py +2 -2
- scrapling/engines/static.py +0 -3
- scrapling/engines/toolbelt/custom.py +10 -15
- scrapling/engines/toolbelt/fingerprints.py +3 -3
- scrapling/engines/toolbelt/navigation.py +3 -5
- scrapling/parser.py +11 -15
- tests/parser/test_automatch.py +2 -2
- tests/parser/test_general.py +2 -2
.github/ISSUE_TEMPLATE/01-bug_report.yml
CHANGED
|
@@ -65,7 +65,7 @@ body:
|
|
| 65 |
|
| 66 |
- type: textarea
|
| 67 |
attributes:
|
| 68 |
-
label: "Actual behavior
|
| 69 |
validations:
|
| 70 |
required: true
|
| 71 |
|
|
|
|
| 65 |
|
| 66 |
- type: textarea
|
| 67 |
attributes:
|
| 68 |
+
label: "Actual behavior"
|
| 69 |
validations:
|
| 70 |
required: true
|
| 71 |
|
CONTRIBUTING.md
CHANGED
|
@@ -19,7 +19,11 @@ tests/test_parser_functions.py ................ [100%]
|
|
| 19 |
|
| 20 |
=============================== 16 passed in 0.22s ================================
|
| 21 |
```
|
| 22 |
-
Also, consider setting
|
|
|
|
|
|
|
|
|
|
|
|
|
| 23 |
|
| 24 |
### The process is straight-forward.
|
| 25 |
|
|
|
|
| 19 |
|
| 20 |
=============================== 16 passed in 0.22s ================================
|
| 21 |
```
|
| 22 |
+
Also, consider setting the scrapling logging level to `debug` so it's easier to know what's happening in the background.
|
| 23 |
+
```python
|
| 24 |
+
>>> import logging
|
| 25 |
+
>>> logging.getLogger("scrapling").setLevel(logging.DEBUG)
|
| 26 |
+
```
|
| 27 |
|
| 28 |
### The process is straight-forward.
|
| 29 |
|
README.md
CHANGED
|
@@ -219,7 +219,7 @@ You might be slightly confused by now so let me clear things up. All fetcher-typ
|
|
| 219 |
```python
|
| 220 |
from scrapling import Fetcher, StealthyFetcher, PlayWrightFetcher
|
| 221 |
```
|
| 222 |
-
All of them can take these initialization arguments: `auto_match`, `huge_tree`, `keep_comments`, `keep_cdata`, `storage`,
|
| 223 |
|
| 224 |
If you don't want to pass arguments to the generated `Adaptor` object and want to use the default values, you can use this import instead for cleaner code:
|
| 225 |
```python
|
|
|
|
| 219 |
```python
|
| 220 |
from scrapling import Fetcher, StealthyFetcher, PlayWrightFetcher
|
| 221 |
```
|
| 222 |
+
All of them can take these initialization arguments: `auto_match`, `huge_tree`, `keep_comments`, `keep_cdata`, `storage`, and `storage_args`, which are the same ones you give to the `Adaptor` class.
|
| 223 |
|
| 224 |
If you don't want to pass arguments to the generated `Adaptor` object and want to use the default values, you can use this import instead for cleaner code:
|
| 225 |
```python
|
benchmarks.py
CHANGED
|
@@ -64,9 +64,9 @@ def test_pyquery():
|
|
| 64 |
@benchmark
|
| 65 |
def test_scrapling():
|
| 66 |
# No need to do `.extract()` like parsel to extract text
|
| 67 |
-
# Also, this is faster than `[t.text for t in Adaptor(large_html, auto_match=False
|
| 68 |
# for obvious reasons, of course.
|
| 69 |
-
return Adaptor(large_html, auto_match=False
|
| 70 |
|
| 71 |
|
| 72 |
@benchmark
|
|
@@ -103,7 +103,7 @@ def test_scrapling_text(request_html):
|
|
| 103 |
# Will loop over resulted elements to get text too to make comparison even more fair otherwise Scrapling will be even faster
|
| 104 |
return [
|
| 105 |
element.text for element in Adaptor(
|
| 106 |
-
request_html, auto_match=False
|
| 107 |
).find_by_text('Tipping the Velvet', first_match=True).find_similar(ignore_attributes=['title'])
|
| 108 |
]
|
| 109 |
|
|
|
|
| 64 |
@benchmark
|
| 65 |
def test_scrapling():
|
| 66 |
# No need to do `.extract()` like parsel to extract text
|
| 67 |
+
# Also, this is faster than `[t.text for t in Adaptor(large_html, auto_match=False).css('.item')]`
|
| 68 |
# for obvious reasons, of course.
|
| 69 |
+
return Adaptor(large_html, auto_match=False).css('.item::text')
|
| 70 |
|
| 71 |
|
| 72 |
@benchmark
|
|
|
|
| 103 |
# Will loop over resulted elements to get text too to make comparison even more fair otherwise Scrapling will be even faster
|
| 104 |
return [
|
| 105 |
element.text for element in Adaptor(
|
| 106 |
+
request_html, auto_match=False
|
| 107 |
).find_by_text('Tipping the Velvet', first_match=True).find_similar(ignore_attributes=['title'])
|
| 108 |
]
|
| 109 |
|
scrapling/core/storage_adaptors.py
CHANGED
|
@@ -1,4 +1,3 @@
|
|
| 1 |
-
import logging
|
| 2 |
import sqlite3
|
| 3 |
import threading
|
| 4 |
from abc import ABC, abstractmethod
|
|
@@ -9,7 +8,7 @@ from lxml import html
|
|
| 9 |
from tldextract import extract as tld
|
| 10 |
|
| 11 |
from scrapling.core._types import Dict, Optional, Union
|
| 12 |
-
from scrapling.core.utils import _StorageTools,
|
| 13 |
|
| 14 |
|
| 15 |
class StorageSystemMixin(ABC):
|
|
@@ -20,7 +19,7 @@ class StorageSystemMixin(ABC):
|
|
| 20 |
"""
|
| 21 |
self.url = url
|
| 22 |
|
| 23 |
-
@
|
| 24 |
def _get_base_url(self, default_value: str = 'default') -> str:
|
| 25 |
if not self.url or type(self.url) is not str:
|
| 26 |
return default_value
|
|
@@ -52,7 +51,7 @@ class StorageSystemMixin(ABC):
|
|
| 52 |
raise NotImplementedError('Storage system must implement `save` method')
|
| 53 |
|
| 54 |
@staticmethod
|
| 55 |
-
@
|
| 56 |
def _get_hash(identifier: str) -> str:
|
| 57 |
"""If you want to hash identifier in your storage system, use this safer"""
|
| 58 |
identifier = identifier.lower().strip()
|
|
@@ -64,7 +63,7 @@ class StorageSystemMixin(ABC):
|
|
| 64 |
return f"{hash_value}_{len(identifier)}" # Length to reduce collision chance
|
| 65 |
|
| 66 |
|
| 67 |
-
@
|
| 68 |
class SQLiteStorageSystem(StorageSystemMixin):
|
| 69 |
"""The recommended system to use, it's race condition safe and thread safe.
|
| 70 |
Mainly built so the library can run in threaded frameworks like scrapy or threaded tools
|
|
@@ -86,7 +85,7 @@ class SQLiteStorageSystem(StorageSystemMixin):
|
|
| 86 |
self.connection.execute("PRAGMA journal_mode=WAL")
|
| 87 |
self.cursor = self.connection.cursor()
|
| 88 |
self._setup_database()
|
| 89 |
-
|
| 90 |
f'Storage system loaded with arguments (storage_file="{storage_file}", url="{url}")'
|
| 91 |
)
|
| 92 |
|
|
|
|
|
|
|
| 1 |
import sqlite3
|
| 2 |
import threading
|
| 3 |
from abc import ABC, abstractmethod
|
|
|
|
| 8 |
from tldextract import extract as tld
|
| 9 |
|
| 10 |
from scrapling.core._types import Dict, Optional, Union
|
| 11 |
+
from scrapling.core.utils import _StorageTools, log, lru_cache
|
| 12 |
|
| 13 |
|
| 14 |
class StorageSystemMixin(ABC):
|
|
|
|
| 19 |
"""
|
| 20 |
self.url = url
|
| 21 |
|
| 22 |
+
@lru_cache(None, typed=True)
|
| 23 |
def _get_base_url(self, default_value: str = 'default') -> str:
|
| 24 |
if not self.url or type(self.url) is not str:
|
| 25 |
return default_value
|
|
|
|
| 51 |
raise NotImplementedError('Storage system must implement `save` method')
|
| 52 |
|
| 53 |
@staticmethod
|
| 54 |
+
@lru_cache(None, typed=True)
|
| 55 |
def _get_hash(identifier: str) -> str:
|
| 56 |
"""If you want to hash identifier in your storage system, use this safer"""
|
| 57 |
identifier = identifier.lower().strip()
|
|
|
|
| 63 |
return f"{hash_value}_{len(identifier)}" # Length to reduce collision chance
|
| 64 |
|
| 65 |
|
| 66 |
+
@lru_cache(None, typed=True)
|
| 67 |
class SQLiteStorageSystem(StorageSystemMixin):
|
| 68 |
"""The recommended system to use, it's race condition safe and thread safe.
|
| 69 |
Mainly built so the library can run in threaded frameworks like scrapy or threaded tools
|
|
|
|
| 85 |
self.connection.execute("PRAGMA journal_mode=WAL")
|
| 86 |
self.cursor = self.connection.cursor()
|
| 87 |
self._setup_database()
|
| 88 |
+
log.debug(
|
| 89 |
f'Storage system loaded with arguments (storage_file="{storage_file}", url="{url}")'
|
| 90 |
)
|
| 91 |
|
scrapling/core/translator.py
CHANGED
|
@@ -17,7 +17,7 @@ from cssselect.xpath import XPathExpr as OriginalXPathExpr
|
|
| 17 |
from w3lib.html import HTML5_WHITESPACE
|
| 18 |
|
| 19 |
from scrapling.core._types import Any, Optional, Protocol, Self
|
| 20 |
-
from scrapling.core.utils import
|
| 21 |
|
| 22 |
regex = f"[{HTML5_WHITESPACE}]+"
|
| 23 |
replace_html5_whitespaces = re.compile(regex).sub
|
|
@@ -139,6 +139,6 @@ class TranslatorMixin:
|
|
| 139 |
|
| 140 |
|
| 141 |
class HTMLTranslator(TranslatorMixin, OriginalHTMLTranslator):
|
| 142 |
-
@
|
| 143 |
def css_to_xpath(self, css: str, prefix: str = "descendant-or-self::") -> str:
|
| 144 |
return super().css_to_xpath(css, prefix)
|
|
|
|
| 17 |
from w3lib.html import HTML5_WHITESPACE
|
| 18 |
|
| 19 |
from scrapling.core._types import Any, Optional, Protocol, Self
|
| 20 |
+
from scrapling.core.utils import lru_cache
|
| 21 |
|
| 22 |
regex = f"[{HTML5_WHITESPACE}]+"
|
| 23 |
replace_html5_whitespaces = re.compile(regex).sub
|
|
|
|
| 139 |
|
| 140 |
|
| 141 |
class HTMLTranslator(TranslatorMixin, OriginalHTMLTranslator):
|
| 142 |
+
@lru_cache(maxsize=256)
|
| 143 |
def css_to_xpath(self, css: str, prefix: str = "descendant-or-self::") -> str:
|
| 144 |
return super().css_to_xpath(css, prefix)
|
scrapling/core/utils.py
CHANGED
|
@@ -9,18 +9,36 @@ from scrapling.core._types import Any, Dict, Iterable, Union
|
|
| 9 |
|
| 10 |
# Using cache on top of a class is brilliant way to achieve Singleton design pattern without much code
|
| 11 |
# functools.cache is available on Python 3.9+ only so let's keep lru_cache
|
| 12 |
-
from functools import lru_cache
|
| 13 |
-
|
| 14 |
|
| 15 |
html_forbidden = {html.HtmlComment, }
|
| 16 |
-
|
| 17 |
-
|
| 18 |
-
|
| 19 |
-
|
| 20 |
-
|
| 21 |
-
|
| 22 |
-
|
| 23 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 24 |
|
| 25 |
|
| 26 |
def is_jsonable(content: Union[bytes, str]) -> bool:
|
|
@@ -34,23 +52,6 @@ def is_jsonable(content: Union[bytes, str]) -> bool:
|
|
| 34 |
return False
|
| 35 |
|
| 36 |
|
| 37 |
-
@cache(None, typed=True)
|
| 38 |
-
def setup_basic_logging(level: str = 'debug'):
|
| 39 |
-
levels = {
|
| 40 |
-
'debug': logging.DEBUG,
|
| 41 |
-
'info': logging.INFO,
|
| 42 |
-
'warning': logging.WARNING,
|
| 43 |
-
'error': logging.ERROR,
|
| 44 |
-
'critical': logging.CRITICAL
|
| 45 |
-
}
|
| 46 |
-
formatter = logging.Formatter("[%(asctime)s] %(levelname)s: %(message)s", "%Y-%m-%d %H:%M:%S")
|
| 47 |
-
lvl = levels[level.lower()]
|
| 48 |
-
handler = logging.StreamHandler()
|
| 49 |
-
handler.setFormatter(formatter)
|
| 50 |
-
# Configure the root logger
|
| 51 |
-
logging.basicConfig(level=lvl, handlers=[handler])
|
| 52 |
-
|
| 53 |
-
|
| 54 |
def flatten(lst: Iterable):
|
| 55 |
return list(chain.from_iterable(lst))
|
| 56 |
|
|
@@ -114,7 +115,7 @@ class _StorageTools:
|
|
| 114 |
# return _impl
|
| 115 |
|
| 116 |
|
| 117 |
-
@
|
| 118 |
def clean_spaces(string):
|
| 119 |
string = string.replace('\t', ' ')
|
| 120 |
string = re.sub('[\n|\r]', '', string)
|
|
|
|
| 9 |
|
| 10 |
# Using cache on top of a class is brilliant way to achieve Singleton design pattern without much code
|
| 11 |
# functools.cache is available on Python 3.9+ only so let's keep lru_cache
|
| 12 |
+
from functools import lru_cache # isort:skip
|
|
|
|
| 13 |
|
| 14 |
html_forbidden = {html.HtmlComment, }
|
| 15 |
+
|
| 16 |
+
|
| 17 |
+
@lru_cache(1, typed=True)
|
| 18 |
+
def setup_logger():
|
| 19 |
+
"""Create and configure a logger with a standard format.
|
| 20 |
+
|
| 21 |
+
:returns: logging.Logger: Configured logger instance
|
| 22 |
+
"""
|
| 23 |
+
logger = logging.getLogger('scrapling')
|
| 24 |
+
logger.setLevel(logging.INFO)
|
| 25 |
+
|
| 26 |
+
formatter = logging.Formatter(
|
| 27 |
+
fmt="[%(asctime)s] %(levelname)s: %(message)s",
|
| 28 |
+
datefmt="%Y-%m-%d %H:%M:%S"
|
| 29 |
+
)
|
| 30 |
+
|
| 31 |
+
console_handler = logging.StreamHandler()
|
| 32 |
+
console_handler.setFormatter(formatter)
|
| 33 |
+
|
| 34 |
+
# Add handler to logger (if not already added)
|
| 35 |
+
if not logger.handlers:
|
| 36 |
+
logger.addHandler(console_handler)
|
| 37 |
+
|
| 38 |
+
return logger
|
| 39 |
+
|
| 40 |
+
|
| 41 |
+
log = setup_logger()
|
| 42 |
|
| 43 |
|
| 44 |
def is_jsonable(content: Union[bytes, str]) -> bool:
|
|
|
|
| 52 |
return False
|
| 53 |
|
| 54 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 55 |
def flatten(lst: Iterable):
|
| 56 |
return list(chain.from_iterable(lst))
|
| 57 |
|
|
|
|
| 115 |
# return _impl
|
| 116 |
|
| 117 |
|
| 118 |
+
@lru_cache(None, typed=True)
|
| 119 |
def clean_spaces(string):
|
| 120 |
string = string.replace('\t', ' ')
|
| 121 |
string = re.sub('[\n|\r]', '', string)
|
scrapling/engines/camo.py
CHANGED
|
@@ -1,10 +1,9 @@
|
|
| 1 |
-
import logging
|
| 2 |
-
|
| 3 |
from camoufox import DefaultAddons
|
| 4 |
from camoufox.sync_api import Camoufox
|
| 5 |
|
| 6 |
from scrapling.core._types import (Callable, Dict, List, Literal, Optional,
|
| 7 |
Union)
|
|
|
|
| 8 |
from scrapling.engines.toolbelt import (Response, StatusText,
|
| 9 |
check_type_validity,
|
| 10 |
construct_proxy_dict, do_nothing,
|
|
@@ -63,7 +62,7 @@ class CamoufoxEngine:
|
|
| 63 |
self.page_action = page_action
|
| 64 |
else:
|
| 65 |
self.page_action = do_nothing
|
| 66 |
-
|
| 67 |
|
| 68 |
self.wait_selector = wait_selector
|
| 69 |
self.wait_selector_state = wait_selector_state
|
|
|
|
|
|
|
|
|
|
| 1 |
from camoufox import DefaultAddons
|
| 2 |
from camoufox.sync_api import Camoufox
|
| 3 |
|
| 4 |
from scrapling.core._types import (Callable, Dict, List, Literal, Optional,
|
| 5 |
Union)
|
| 6 |
+
from scrapling.core.utils import log
|
| 7 |
from scrapling.engines.toolbelt import (Response, StatusText,
|
| 8 |
check_type_validity,
|
| 9 |
construct_proxy_dict, do_nothing,
|
|
|
|
| 62 |
self.page_action = page_action
|
| 63 |
else:
|
| 64 |
self.page_action = do_nothing
|
| 65 |
+
log.error('[Ignored] Argument "page_action" must be callable')
|
| 66 |
|
| 67 |
self.wait_selector = wait_selector
|
| 68 |
self.wait_selector_state = wait_selector_state
|
scrapling/engines/pw.py
CHANGED
|
@@ -1,7 +1,7 @@
|
|
| 1 |
import json
|
| 2 |
-
import logging
|
| 3 |
|
| 4 |
from scrapling.core._types import Callable, Dict, List, Optional, Union
|
|
|
|
| 5 |
from scrapling.engines.constants import (DEFAULT_STEALTH_FLAGS,
|
| 6 |
NSTBROWSER_DEFAULT_QUERY)
|
| 7 |
from scrapling.engines.toolbelt import (Response, StatusText,
|
|
@@ -78,7 +78,7 @@ class PlaywrightEngine:
|
|
| 78 |
self.page_action = page_action
|
| 79 |
else:
|
| 80 |
self.page_action = do_nothing
|
| 81 |
-
|
| 82 |
|
| 83 |
self.wait_selector = wait_selector
|
| 84 |
self.wait_selector_state = wait_selector_state
|
|
|
|
| 1 |
import json
|
|
|
|
| 2 |
|
| 3 |
from scrapling.core._types import Callable, Dict, List, Optional, Union
|
| 4 |
+
from scrapling.core.utils import log
|
| 5 |
from scrapling.engines.constants import (DEFAULT_STEALTH_FLAGS,
|
| 6 |
NSTBROWSER_DEFAULT_QUERY)
|
| 7 |
from scrapling.engines.toolbelt import (Response, StatusText,
|
|
|
|
| 78 |
self.page_action = page_action
|
| 79 |
else:
|
| 80 |
self.page_action = do_nothing
|
| 81 |
+
log.error('[Ignored] Argument "page_action" must be callable')
|
| 82 |
|
| 83 |
self.wait_selector = wait_selector
|
| 84 |
self.wait_selector_state = wait_selector_state
|
scrapling/engines/static.py
CHANGED
|
@@ -1,5 +1,3 @@
|
|
| 1 |
-
import logging
|
| 2 |
-
|
| 3 |
import httpx
|
| 4 |
from httpx._models import Response as httpxResponse
|
| 5 |
|
|
@@ -36,7 +34,6 @@ class StaticEngine:
|
|
| 36 |
# Validate headers
|
| 37 |
if not headers.get('user-agent') and not headers.get('User-Agent'):
|
| 38 |
headers['User-Agent'] = generate_headers(browser_mode=False).get('User-Agent')
|
| 39 |
-
logging.info(f"Can't find useragent in headers so '{headers['User-Agent']}' was used.")
|
| 40 |
|
| 41 |
if stealth:
|
| 42 |
extra_headers = generate_headers(browser_mode=False)
|
|
|
|
|
|
|
|
|
|
| 1 |
import httpx
|
| 2 |
from httpx._models import Response as httpxResponse
|
| 3 |
|
|
|
|
| 34 |
# Validate headers
|
| 35 |
if not headers.get('user-agent') and not headers.get('User-Agent'):
|
| 36 |
headers['User-Agent'] = generate_headers(browser_mode=False).get('User-Agent')
|
|
|
|
| 37 |
|
| 38 |
if stealth:
|
| 39 |
extra_headers = generate_headers(browser_mode=False)
|
scrapling/engines/toolbelt/custom.py
CHANGED
|
@@ -2,13 +2,12 @@
|
|
| 2 |
Functions related to custom types or type checking
|
| 3 |
"""
|
| 4 |
import inspect
|
| 5 |
-
import logging
|
| 6 |
from email.message import Message
|
| 7 |
|
| 8 |
from scrapling.core._types import (Any, Callable, Dict, List, Optional, Tuple,
|
| 9 |
Type, Union)
|
| 10 |
from scrapling.core.custom_types import MappingProxyType
|
| 11 |
-
from scrapling.core.utils import
|
| 12 |
from scrapling.parser import Adaptor, SQLiteStorageSystem
|
| 13 |
|
| 14 |
|
|
@@ -17,7 +16,7 @@ class ResponseEncoding:
|
|
| 17 |
__ISO_8859_1_CONTENT_TYPES = {"text/plain", "text/html", "text/css", "text/javascript"}
|
| 18 |
|
| 19 |
@classmethod
|
| 20 |
-
@
|
| 21 |
def __parse_content_type(cls, header_value: str) -> Tuple[str, Dict[str, str]]:
|
| 22 |
"""Parse content type and parameters from a content-type header value.
|
| 23 |
|
|
@@ -39,7 +38,7 @@ class ResponseEncoding:
|
|
| 39 |
return content_type, params
|
| 40 |
|
| 41 |
@classmethod
|
| 42 |
-
@
|
| 43 |
def get_value(cls, content_type: Optional[str], text: Optional[str] = 'test') -> str:
|
| 44 |
"""Determine the appropriate character encoding from a content-type header.
|
| 45 |
|
|
@@ -98,7 +97,7 @@ class Response(Adaptor):
|
|
| 98 |
# For back-ward compatibility
|
| 99 |
self.adaptor = self
|
| 100 |
# For easier debugging while working from a Python shell
|
| 101 |
-
|
| 102 |
|
| 103 |
# def __repr__(self):
|
| 104 |
# return f'<{self.__class__.__name__} [{self.status} {self.reason}]>'
|
|
@@ -107,7 +106,7 @@ class Response(Adaptor):
|
|
| 107 |
class BaseFetcher:
|
| 108 |
def __init__(
|
| 109 |
self, huge_tree: bool = True, keep_comments: Optional[bool] = False, auto_match: Optional[bool] = True,
|
| 110 |
-
storage: Any = SQLiteStorageSystem, storage_args: Optional[Dict] = None,
|
| 111 |
automatch_domain: Optional[str] = None, keep_cdata: Optional[bool] = False,
|
| 112 |
):
|
| 113 |
"""Arguments below are the same from the Adaptor class so you can pass them directly, the rest of Adaptor's arguments
|
|
@@ -124,7 +123,6 @@ class BaseFetcher:
|
|
| 124 |
If empty, default values will be used.
|
| 125 |
:param automatch_domain: For cases where you want to automatch selectors across different websites as if they were on the same website, use this argument to unify them.
|
| 126 |
Otherwise, the domain of the request is used by default.
|
| 127 |
-
:param debug: Enable debug mode
|
| 128 |
"""
|
| 129 |
# Adaptor class parameters
|
| 130 |
# I won't validate Adaptor's class parameters here again, I will leave it to be validated later
|
|
@@ -134,14 +132,11 @@ class BaseFetcher:
|
|
| 134 |
keep_cdata=keep_cdata,
|
| 135 |
auto_match=auto_match,
|
| 136 |
storage=storage,
|
| 137 |
-
storage_args=storage_args
|
| 138 |
-
debug=debug,
|
| 139 |
)
|
| 140 |
-
# If the user used fetchers first, then configure the logger from here instead of the `Adaptor` class
|
| 141 |
-
setup_basic_logging(level='debug' if debug else 'info')
|
| 142 |
if automatch_domain:
|
| 143 |
if type(automatch_domain) is not str:
|
| 144 |
-
|
| 145 |
else:
|
| 146 |
self.adaptor_arguments.update({'automatch_domain': automatch_domain})
|
| 147 |
|
|
@@ -217,7 +212,7 @@ class StatusText:
|
|
| 217 |
})
|
| 218 |
|
| 219 |
@classmethod
|
| 220 |
-
@
|
| 221 |
def get(cls, status_code: int) -> str:
|
| 222 |
"""Get the phrase for a given HTTP status code."""
|
| 223 |
return cls._phrases.get(status_code, "Unknown Status Code")
|
|
@@ -284,7 +279,7 @@ def check_type_validity(variable: Any, valid_types: Union[List[Type], None], def
|
|
| 284 |
error_msg = f'Argument "{var_name}" cannot be None'
|
| 285 |
if critical:
|
| 286 |
raise TypeError(error_msg)
|
| 287 |
-
|
| 288 |
return default_value
|
| 289 |
|
| 290 |
# If no valid_types specified and variable has a value, return it
|
|
@@ -297,7 +292,7 @@ def check_type_validity(variable: Any, valid_types: Union[List[Type], None], def
|
|
| 297 |
error_msg = f'Argument "{var_name}" must be of type {" or ".join(type_names)}'
|
| 298 |
if critical:
|
| 299 |
raise TypeError(error_msg)
|
| 300 |
-
|
| 301 |
return default_value
|
| 302 |
|
| 303 |
return variable
|
|
|
|
| 2 |
Functions related to custom types or type checking
|
| 3 |
"""
|
| 4 |
import inspect
|
|
|
|
| 5 |
from email.message import Message
|
| 6 |
|
| 7 |
from scrapling.core._types import (Any, Callable, Dict, List, Optional, Tuple,
|
| 8 |
Type, Union)
|
| 9 |
from scrapling.core.custom_types import MappingProxyType
|
| 10 |
+
from scrapling.core.utils import log, lru_cache
|
| 11 |
from scrapling.parser import Adaptor, SQLiteStorageSystem
|
| 12 |
|
| 13 |
|
|
|
|
| 16 |
__ISO_8859_1_CONTENT_TYPES = {"text/plain", "text/html", "text/css", "text/javascript"}
|
| 17 |
|
| 18 |
@classmethod
|
| 19 |
+
@lru_cache(maxsize=None)
|
| 20 |
def __parse_content_type(cls, header_value: str) -> Tuple[str, Dict[str, str]]:
|
| 21 |
"""Parse content type and parameters from a content-type header value.
|
| 22 |
|
|
|
|
| 38 |
return content_type, params
|
| 39 |
|
| 40 |
@classmethod
|
| 41 |
+
@lru_cache(maxsize=None)
|
| 42 |
def get_value(cls, content_type: Optional[str], text: Optional[str] = 'test') -> str:
|
| 43 |
"""Determine the appropriate character encoding from a content-type header.
|
| 44 |
|
|
|
|
| 97 |
# For back-ward compatibility
|
| 98 |
self.adaptor = self
|
| 99 |
# For easier debugging while working from a Python shell
|
| 100 |
+
log.info(f'Fetched ({status}) <{method} {url}> (referer: {request_headers.get("referer")})')
|
| 101 |
|
| 102 |
# def __repr__(self):
|
| 103 |
# return f'<{self.__class__.__name__} [{self.status} {self.reason}]>'
|
|
|
|
| 106 |
class BaseFetcher:
|
| 107 |
def __init__(
|
| 108 |
self, huge_tree: bool = True, keep_comments: Optional[bool] = False, auto_match: Optional[bool] = True,
|
| 109 |
+
storage: Any = SQLiteStorageSystem, storage_args: Optional[Dict] = None,
|
| 110 |
automatch_domain: Optional[str] = None, keep_cdata: Optional[bool] = False,
|
| 111 |
):
|
| 112 |
"""Arguments below are the same from the Adaptor class so you can pass them directly, the rest of Adaptor's arguments
|
|
|
|
| 123 |
If empty, default values will be used.
|
| 124 |
:param automatch_domain: For cases where you want to automatch selectors across different websites as if they were on the same website, use this argument to unify them.
|
| 125 |
Otherwise, the domain of the request is used by default.
|
|
|
|
| 126 |
"""
|
| 127 |
# Adaptor class parameters
|
| 128 |
# I won't validate Adaptor's class parameters here again, I will leave it to be validated later
|
|
|
|
| 132 |
keep_cdata=keep_cdata,
|
| 133 |
auto_match=auto_match,
|
| 134 |
storage=storage,
|
| 135 |
+
storage_args=storage_args
|
|
|
|
| 136 |
)
|
|
|
|
|
|
|
| 137 |
if automatch_domain:
|
| 138 |
if type(automatch_domain) is not str:
|
| 139 |
+
log.warning('[Ignored] The argument "automatch_domain" must be of string type')
|
| 140 |
else:
|
| 141 |
self.adaptor_arguments.update({'automatch_domain': automatch_domain})
|
| 142 |
|
|
|
|
| 212 |
})
|
| 213 |
|
| 214 |
@classmethod
|
| 215 |
+
@lru_cache(maxsize=128)
|
| 216 |
def get(cls, status_code: int) -> str:
|
| 217 |
"""Get the phrase for a given HTTP status code."""
|
| 218 |
return cls._phrases.get(status_code, "Unknown Status Code")
|
|
|
|
| 279 |
error_msg = f'Argument "{var_name}" cannot be None'
|
| 280 |
if critical:
|
| 281 |
raise TypeError(error_msg)
|
| 282 |
+
log.error(f'[Ignored] {error_msg}')
|
| 283 |
return default_value
|
| 284 |
|
| 285 |
# If no valid_types specified and variable has a value, return it
|
|
|
|
| 292 |
error_msg = f'Argument "{var_name}" must be of type {" or ".join(type_names)}'
|
| 293 |
if critical:
|
| 294 |
raise TypeError(error_msg)
|
| 295 |
+
log.error(f'[Ignored] {error_msg}')
|
| 296 |
return default_value
|
| 297 |
|
| 298 |
return variable
|
scrapling/engines/toolbelt/fingerprints.py
CHANGED
|
@@ -9,10 +9,10 @@ from browserforge.headers import Browser, HeaderGenerator
|
|
| 9 |
from tldextract import extract
|
| 10 |
|
| 11 |
from scrapling.core._types import Dict, Union
|
| 12 |
-
from scrapling.core.utils import
|
| 13 |
|
| 14 |
|
| 15 |
-
@
|
| 16 |
def generate_convincing_referer(url: str) -> str:
|
| 17 |
"""Takes the domain from the URL without the subdomain/suffix and make it look like you were searching google for this website
|
| 18 |
|
|
@@ -26,7 +26,7 @@ def generate_convincing_referer(url: str) -> str:
|
|
| 26 |
return f'https://www.google.com/search?q={website_name}'
|
| 27 |
|
| 28 |
|
| 29 |
-
@
|
| 30 |
def get_os_name() -> Union[str, None]:
|
| 31 |
"""Get the current OS name in the same format needed for browserforge
|
| 32 |
|
|
|
|
| 9 |
from tldextract import extract
|
| 10 |
|
| 11 |
from scrapling.core._types import Dict, Union
|
| 12 |
+
from scrapling.core.utils import lru_cache
|
| 13 |
|
| 14 |
|
| 15 |
+
@lru_cache(None, typed=True)
|
| 16 |
def generate_convincing_referer(url: str) -> str:
|
| 17 |
"""Takes the domain from the URL without the subdomain/suffix and make it look like you were searching google for this website
|
| 18 |
|
|
|
|
| 26 |
return f'https://www.google.com/search?q={website_name}'
|
| 27 |
|
| 28 |
|
| 29 |
+
@lru_cache(None, typed=True)
|
| 30 |
def get_os_name() -> Union[str, None]:
|
| 31 |
"""Get the current OS name in the same format needed for browserforge
|
| 32 |
|
scrapling/engines/toolbelt/navigation.py
CHANGED
|
@@ -1,15 +1,13 @@
|
|
| 1 |
"""
|
| 2 |
Functions related to files and URLs
|
| 3 |
"""
|
| 4 |
-
|
| 5 |
-
import logging
|
| 6 |
import os
|
| 7 |
from urllib.parse import urlencode, urlparse
|
| 8 |
|
| 9 |
from playwright.sync_api import Route
|
| 10 |
|
| 11 |
from scrapling.core._types import Dict, Optional, Union
|
| 12 |
-
from scrapling.core.utils import
|
| 13 |
from scrapling.engines.constants import DEFAULT_DISABLED_RESOURCES
|
| 14 |
|
| 15 |
|
|
@@ -20,7 +18,7 @@ def intercept_route(route: Route) -> Union[Route, None]:
|
|
| 20 |
:return: PlayWright `Route` object
|
| 21 |
"""
|
| 22 |
if route.request.resource_type in DEFAULT_DISABLED_RESOURCES:
|
| 23 |
-
|
| 24 |
return route.abort()
|
| 25 |
return route.continue_()
|
| 26 |
|
|
@@ -97,7 +95,7 @@ def construct_cdp_url(cdp_url: str, query_params: Optional[Dict] = None) -> str:
|
|
| 97 |
raise ValueError(f"Invalid CDP URL: {str(e)}")
|
| 98 |
|
| 99 |
|
| 100 |
-
@
|
| 101 |
def js_bypass_path(filename: str) -> str:
|
| 102 |
"""Takes the base filename of JS file inside the `bypasses` folder then return the full path of it
|
| 103 |
|
|
|
|
| 1 |
"""
|
| 2 |
Functions related to files and URLs
|
| 3 |
"""
|
|
|
|
|
|
|
| 4 |
import os
|
| 5 |
from urllib.parse import urlencode, urlparse
|
| 6 |
|
| 7 |
from playwright.sync_api import Route
|
| 8 |
|
| 9 |
from scrapling.core._types import Dict, Optional, Union
|
| 10 |
+
from scrapling.core.utils import log, lru_cache
|
| 11 |
from scrapling.engines.constants import DEFAULT_DISABLED_RESOURCES
|
| 12 |
|
| 13 |
|
|
|
|
| 18 |
:return: PlayWright `Route` object
|
| 19 |
"""
|
| 20 |
if route.request.resource_type in DEFAULT_DISABLED_RESOURCES:
|
| 21 |
+
log.debug(f'Blocking background resource "{route.request.url}" of type "{route.request.resource_type}"')
|
| 22 |
return route.abort()
|
| 23 |
return route.continue_()
|
| 24 |
|
|
|
|
| 95 |
raise ValueError(f"Invalid CDP URL: {str(e)}")
|
| 96 |
|
| 97 |
|
| 98 |
+
@lru_cache(None, typed=True)
|
| 99 |
def js_bypass_path(filename: str) -> str:
|
| 100 |
"""Takes the base filename of JS file inside the `bypasses` folder then return the full path of it
|
| 101 |
|
scrapling/parser.py
CHANGED
|
@@ -18,12 +18,12 @@ from scrapling.core.storage_adaptors import (SQLiteStorageSystem,
|
|
| 18 |
StorageSystemMixin, _StorageTools)
|
| 19 |
from scrapling.core.translator import HTMLTranslator
|
| 20 |
from scrapling.core.utils import (clean_spaces, flatten, html_forbidden,
|
| 21 |
-
is_jsonable,
|
| 22 |
|
| 23 |
|
| 24 |
class Adaptor(SelectorsGeneration):
|
| 25 |
__slots__ = (
|
| 26 |
-
'url', 'encoding', '__auto_match_enabled', '_root', '_storage',
|
| 27 |
'__keep_comments', '__huge_tree_enabled', '__attributes', '__text', '__tag',
|
| 28 |
'__keep_cdata', '__raw_body'
|
| 29 |
)
|
|
@@ -41,7 +41,6 @@ class Adaptor(SelectorsGeneration):
|
|
| 41 |
auto_match: Optional[bool] = True,
|
| 42 |
storage: Any = SQLiteStorageSystem,
|
| 43 |
storage_args: Optional[Dict] = None,
|
| 44 |
-
debug: Optional[bool] = True,
|
| 45 |
**kwargs
|
| 46 |
):
|
| 47 |
"""The main class that works as a wrapper for the HTML input data. Using this class, you can search for elements
|
|
@@ -67,7 +66,6 @@ class Adaptor(SelectorsGeneration):
|
|
| 67 |
:param storage: The storage class to be passed for auto-matching functionalities, see ``Docs`` for more info.
|
| 68 |
:param storage_args: A dictionary of ``argument->value`` pairs to be passed for the storage class.
|
| 69 |
If empty, default values will be used.
|
| 70 |
-
:param debug: Enable debug mode
|
| 71 |
"""
|
| 72 |
if root is None and not body and text is None:
|
| 73 |
raise ValueError("Adaptor class needs text, body, or root arguments to work")
|
|
@@ -106,7 +104,6 @@ class Adaptor(SelectorsGeneration):
|
|
| 106 |
|
| 107 |
self._root = root
|
| 108 |
|
| 109 |
-
setup_basic_logging(level='debug' if debug else 'info')
|
| 110 |
self.__auto_match_enabled = auto_match
|
| 111 |
|
| 112 |
if self.__auto_match_enabled:
|
|
@@ -117,7 +114,7 @@ class Adaptor(SelectorsGeneration):
|
|
| 117 |
}
|
| 118 |
|
| 119 |
if not hasattr(storage, '__wrapped__'):
|
| 120 |
-
raise ValueError("Storage class must be wrapped with
|
| 121 |
|
| 122 |
if not issubclass(storage.__wrapped__, StorageSystemMixin):
|
| 123 |
raise ValueError("Storage system must be inherited from class `StorageSystemMixin`")
|
|
@@ -132,7 +129,6 @@ class Adaptor(SelectorsGeneration):
|
|
| 132 |
# For selector stuff
|
| 133 |
self.__attributes = None
|
| 134 |
self.__tag = None
|
| 135 |
-
self.__debug = debug
|
| 136 |
# No need to check if all response attributes exist or not because if `status` exist, then the rest exist (Save some CPU cycles for speed)
|
| 137 |
self.__response_data = {
|
| 138 |
key: getattr(self, key) for key in ('status', 'reason', 'cookies', 'headers', 'request_headers',)
|
|
@@ -164,7 +160,7 @@ class Adaptor(SelectorsGeneration):
|
|
| 164 |
text='', body=b'', # Since root argument is provided, both `text` and `body` will be ignored so this is just a filler
|
| 165 |
url=self.url, encoding=self.encoding, auto_match=self.__auto_match_enabled,
|
| 166 |
keep_comments=self.__keep_comments, keep_cdata=self.__keep_cdata,
|
| 167 |
-
huge_tree=self.__huge_tree_enabled,
|
| 168 |
**self.__response_data
|
| 169 |
)
|
| 170 |
return element
|
|
@@ -417,10 +413,10 @@ class Adaptor(SelectorsGeneration):
|
|
| 417 |
if score_table:
|
| 418 |
highest_probability = max(score_table.keys())
|
| 419 |
if score_table[highest_probability] and highest_probability >= percentage:
|
| 420 |
-
|
| 421 |
-
|
| 422 |
for percent in tuple(sorted(score_table.keys(), reverse=True))[:5]:
|
| 423 |
-
|
| 424 |
if not adaptor_type:
|
| 425 |
return score_table[highest_probability]
|
| 426 |
return self.__convert_results(score_table[highest_probability])
|
|
@@ -546,7 +542,7 @@ class Adaptor(SelectorsGeneration):
|
|
| 546 |
|
| 547 |
if selected_elements:
|
| 548 |
if not self.__auto_match_enabled and auto_save:
|
| 549 |
-
|
| 550 |
|
| 551 |
elif self.__auto_match_enabled and auto_save:
|
| 552 |
self.save(selected_elements[0], identifier or selector)
|
|
@@ -565,7 +561,7 @@ class Adaptor(SelectorsGeneration):
|
|
| 565 |
return self.__convert_results(selected_elements)
|
| 566 |
|
| 567 |
elif not self.__auto_match_enabled and auto_match:
|
| 568 |
-
|
| 569 |
|
| 570 |
return self.__convert_results(selected_elements)
|
| 571 |
|
|
@@ -769,7 +765,7 @@ class Adaptor(SelectorsGeneration):
|
|
| 769 |
|
| 770 |
self._storage.save(element, identifier)
|
| 771 |
else:
|
| 772 |
-
|
| 773 |
"Can't use Auto-match features with disabled globally, you have to start a new class instance."
|
| 774 |
)
|
| 775 |
|
|
@@ -783,7 +779,7 @@ class Adaptor(SelectorsGeneration):
|
|
| 783 |
if self.__auto_match_enabled:
|
| 784 |
return self._storage.retrieve(identifier)
|
| 785 |
|
| 786 |
-
|
| 787 |
"Can't use Auto-match features with disabled globally, you have to start a new class instance."
|
| 788 |
)
|
| 789 |
|
|
|
|
| 18 |
StorageSystemMixin, _StorageTools)
|
| 19 |
from scrapling.core.translator import HTMLTranslator
|
| 20 |
from scrapling.core.utils import (clean_spaces, flatten, html_forbidden,
|
| 21 |
+
is_jsonable, log)
|
| 22 |
|
| 23 |
|
| 24 |
class Adaptor(SelectorsGeneration):
|
| 25 |
__slots__ = (
|
| 26 |
+
'url', 'encoding', '__auto_match_enabled', '_root', '_storage',
|
| 27 |
'__keep_comments', '__huge_tree_enabled', '__attributes', '__text', '__tag',
|
| 28 |
'__keep_cdata', '__raw_body'
|
| 29 |
)
|
|
|
|
| 41 |
auto_match: Optional[bool] = True,
|
| 42 |
storage: Any = SQLiteStorageSystem,
|
| 43 |
storage_args: Optional[Dict] = None,
|
|
|
|
| 44 |
**kwargs
|
| 45 |
):
|
| 46 |
"""The main class that works as a wrapper for the HTML input data. Using this class, you can search for elements
|
|
|
|
| 66 |
:param storage: The storage class to be passed for auto-matching functionalities, see ``Docs`` for more info.
|
| 67 |
:param storage_args: A dictionary of ``argument->value`` pairs to be passed for the storage class.
|
| 68 |
If empty, default values will be used.
|
|
|
|
| 69 |
"""
|
| 70 |
if root is None and not body and text is None:
|
| 71 |
raise ValueError("Adaptor class needs text, body, or root arguments to work")
|
|
|
|
| 104 |
|
| 105 |
self._root = root
|
| 106 |
|
|
|
|
| 107 |
self.__auto_match_enabled = auto_match
|
| 108 |
|
| 109 |
if self.__auto_match_enabled:
|
|
|
|
| 114 |
}
|
| 115 |
|
| 116 |
if not hasattr(storage, '__wrapped__'):
|
| 117 |
+
raise ValueError("Storage class must be wrapped with lru_cache decorator, see docs for info")
|
| 118 |
|
| 119 |
if not issubclass(storage.__wrapped__, StorageSystemMixin):
|
| 120 |
raise ValueError("Storage system must be inherited from class `StorageSystemMixin`")
|
|
|
|
| 129 |
# For selector stuff
|
| 130 |
self.__attributes = None
|
| 131 |
self.__tag = None
|
|
|
|
| 132 |
# No need to check if all response attributes exist or not because if `status` exist, then the rest exist (Save some CPU cycles for speed)
|
| 133 |
self.__response_data = {
|
| 134 |
key: getattr(self, key) for key in ('status', 'reason', 'cookies', 'headers', 'request_headers',)
|
|
|
|
| 160 |
text='', body=b'', # Since root argument is provided, both `text` and `body` will be ignored so this is just a filler
|
| 161 |
url=self.url, encoding=self.encoding, auto_match=self.__auto_match_enabled,
|
| 162 |
keep_comments=self.__keep_comments, keep_cdata=self.__keep_cdata,
|
| 163 |
+
huge_tree=self.__huge_tree_enabled,
|
| 164 |
**self.__response_data
|
| 165 |
)
|
| 166 |
return element
|
|
|
|
| 413 |
if score_table:
|
| 414 |
highest_probability = max(score_table.keys())
|
| 415 |
if score_table[highest_probability] and highest_probability >= percentage:
|
| 416 |
+
log.debug(f'Highest probability was {highest_probability}%')
|
| 417 |
+
log.debug('Top 5 best matching elements are: ')
|
| 418 |
for percent in tuple(sorted(score_table.keys(), reverse=True))[:5]:
|
| 419 |
+
log.debug(f'{percent} -> {self.__convert_results(score_table[percent])}')
|
| 420 |
if not adaptor_type:
|
| 421 |
return score_table[highest_probability]
|
| 422 |
return self.__convert_results(score_table[highest_probability])
|
|
|
|
| 542 |
|
| 543 |
if selected_elements:
|
| 544 |
if not self.__auto_match_enabled and auto_save:
|
| 545 |
+
log.warning("Argument `auto_save` will be ignored because `auto_match` wasn't enabled on initialization. Check docs for more info.")
|
| 546 |
|
| 547 |
elif self.__auto_match_enabled and auto_save:
|
| 548 |
self.save(selected_elements[0], identifier or selector)
|
|
|
|
| 561 |
return self.__convert_results(selected_elements)
|
| 562 |
|
| 563 |
elif not self.__auto_match_enabled and auto_match:
|
| 564 |
+
log.warning("Argument `auto_match` will be ignored because `auto_match` wasn't enabled on initialization. Check docs for more info.")
|
| 565 |
|
| 566 |
return self.__convert_results(selected_elements)
|
| 567 |
|
|
|
|
| 765 |
|
| 766 |
self._storage.save(element, identifier)
|
| 767 |
else:
|
| 768 |
+
log.critical(
|
| 769 |
"Can't use Auto-match features with disabled globally, you have to start a new class instance."
|
| 770 |
)
|
| 771 |
|
|
|
|
| 779 |
if self.__auto_match_enabled:
|
| 780 |
return self._storage.retrieve(identifier)
|
| 781 |
|
| 782 |
+
log.critical(
|
| 783 |
"Can't use Auto-match features with disabled globally, you have to start a new class instance."
|
| 784 |
)
|
| 785 |
|
tests/parser/test_automatch.py
CHANGED
|
@@ -42,8 +42,8 @@ class TestParserAutoMatch(unittest.TestCase):
|
|
| 42 |
</div>
|
| 43 |
'''
|
| 44 |
|
| 45 |
-
old_page = Adaptor(original_html, url='example.com', auto_match=True, debug=True)
|
| 46 |
-
new_page = Adaptor(changed_html, url='example.com', auto_match=True, debug=True)
|
| 47 |
|
| 48 |
# 'p1' was used as ID and now it's not and all the path elements have changes
|
| 49 |
# Also at the same time testing auto-match vs combined selectors
|
|
|
|
| 42 |
</div>
|
| 43 |
'''
|
| 44 |
|
| 45 |
+
old_page = Adaptor(original_html, url='example.com', auto_match=True)
|
| 46 |
+
new_page = Adaptor(changed_html, url='example.com', auto_match=True)
|
| 47 |
|
| 48 |
# 'p1' was used as ID and now it's not and all the path elements have changes
|
| 49 |
# Also at the same time testing auto-match vs combined selectors
|
tests/parser/test_general.py
CHANGED
|
@@ -74,7 +74,7 @@ class TestParser(unittest.TestCase):
|
|
| 74 |
</body>
|
| 75 |
</html>
|
| 76 |
'''
|
| 77 |
-
self.page = Adaptor(self.html, auto_match=False, debug=False)
|
| 78 |
|
| 79 |
def test_css_selector(self):
|
| 80 |
"""Test Selecting elements with complex CSS selectors"""
|
|
@@ -273,7 +273,7 @@ class TestParser(unittest.TestCase):
|
|
| 273 |
large_html = '<html><body>' + '<div class="item">' * 5000 + '</div>' * 5000 + '</body></html>'
|
| 274 |
|
| 275 |
start_time = time.time()
|
| 276 |
-
parsed = Adaptor(large_html, auto_match=False, debug=False)
|
| 277 |
elements = parsed.css('.item')
|
| 278 |
end_time = time.time()
|
| 279 |
|
|
|
|
| 74 |
</body>
|
| 75 |
</html>
|
| 76 |
'''
|
| 77 |
+
self.page = Adaptor(self.html, auto_match=False)
|
| 78 |
|
| 79 |
def test_css_selector(self):
|
| 80 |
"""Test Selecting elements with complex CSS selectors"""
|
|
|
|
| 273 |
large_html = '<html><body>' + '<div class="item">' * 5000 + '</div>' * 5000 + '</body></html>'
|
| 274 |
|
| 275 |
start_time = time.time()
|
| 276 |
+
parsed = Adaptor(large_html, auto_match=False)
|
| 277 |
elements = parsed.css('.item')
|
| 278 |
end_time = time.time()
|
| 279 |
|