Karim shoair commited on
Commit
df3c414
·
1 Parent(s): 9c43da3

refactor(api)!: Unify logging under one logger and remove the debug parameter

Browse files

You can now control logging and debugging from the shell through the logger named 'scrapling'.

.github/ISSUE_TEMPLATE/01-bug_report.yml CHANGED
@@ -65,7 +65,7 @@ body:
65
 
66
  - type: textarea
67
  attributes:
68
- label: "Actual behavior (Remember to use `debug` parameter)"
69
  validations:
70
  required: true
71
 
 
65
 
66
  - type: textarea
67
  attributes:
68
+ label: "Actual behavior"
69
  validations:
70
  required: true
71
 
CONTRIBUTING.md CHANGED
@@ -19,7 +19,11 @@ tests/test_parser_functions.py ................ [100%]
19
 
20
  =============================== 16 passed in 0.22s ================================
21
  ```
22
- Also, consider setting `debug` to `True` while initializing the Adaptor object so it's easier to know what's happening in the background.
 
 
 
 
23
 
24
  ### The process is straight-forward.
25
 
 
19
 
20
  =============================== 16 passed in 0.22s ================================
21
  ```
22
+ Also, consider setting the scrapling logging level to `debug` so it's easier to know what's happening in the background.
23
+ ```python
24
+ >>> import logging
25
+ >>> logging.getLogger("scrapling").setLevel(logging.DEBUG)
26
+ ```
27
 
28
  ### The process is straight-forward.
29
 
README.md CHANGED
@@ -219,7 +219,7 @@ You might be slightly confused by now so let me clear things up. All fetcher-typ
219
  ```python
220
  from scrapling import Fetcher, StealthyFetcher, PlayWrightFetcher
221
  ```
222
- All of them can take these initialization arguments: `auto_match`, `huge_tree`, `keep_comments`, `keep_cdata`, `storage`, `storage_args`, and `debug`, which are the same ones you give to the `Adaptor` class.
223
 
224
  If you don't want to pass arguments to the generated `Adaptor` object and want to use the default values, you can use this import instead for cleaner code:
225
  ```python
 
219
  ```python
220
  from scrapling import Fetcher, StealthyFetcher, PlayWrightFetcher
221
  ```
222
+ All of them can take these initialization arguments: `auto_match`, `huge_tree`, `keep_comments`, `keep_cdata`, `storage`, and `storage_args`, which are the same ones you give to the `Adaptor` class.
223
 
224
  If you don't want to pass arguments to the generated `Adaptor` object and want to use the default values, you can use this import instead for cleaner code:
225
  ```python
benchmarks.py CHANGED
@@ -64,9 +64,9 @@ def test_pyquery():
64
  @benchmark
65
  def test_scrapling():
66
  # No need to do `.extract()` like parsel to extract text
67
- # Also, this is faster than `[t.text for t in Adaptor(large_html, auto_match=False, debug=False).css('.item')]`
68
  # for obvious reasons, of course.
69
- return Adaptor(large_html, auto_match=False, debug=False).css('.item::text')
70
 
71
 
72
  @benchmark
@@ -103,7 +103,7 @@ def test_scrapling_text(request_html):
103
  # Will loop over resulted elements to get text too to make comparison even more fair otherwise Scrapling will be even faster
104
  return [
105
  element.text for element in Adaptor(
106
- request_html, auto_match=False, debug=False
107
  ).find_by_text('Tipping the Velvet', first_match=True).find_similar(ignore_attributes=['title'])
108
  ]
109
 
 
64
  @benchmark
65
  def test_scrapling():
66
  # No need to do `.extract()` like parsel to extract text
67
+ # Also, this is faster than `[t.text for t in Adaptor(large_html, auto_match=False).css('.item')]`
68
  # for obvious reasons, of course.
69
+ return Adaptor(large_html, auto_match=False).css('.item::text')
70
 
71
 
72
  @benchmark
 
103
  # Will loop over resulted elements to get text too to make comparison even more fair otherwise Scrapling will be even faster
104
  return [
105
  element.text for element in Adaptor(
106
+ request_html, auto_match=False
107
  ).find_by_text('Tipping the Velvet', first_match=True).find_similar(ignore_attributes=['title'])
108
  ]
109
 
scrapling/core/storage_adaptors.py CHANGED
@@ -1,4 +1,3 @@
1
- import logging
2
  import sqlite3
3
  import threading
4
  from abc import ABC, abstractmethod
@@ -9,7 +8,7 @@ from lxml import html
9
  from tldextract import extract as tld
10
 
11
  from scrapling.core._types import Dict, Optional, Union
12
- from scrapling.core.utils import _StorageTools, cache
13
 
14
 
15
  class StorageSystemMixin(ABC):
@@ -20,7 +19,7 @@ class StorageSystemMixin(ABC):
20
  """
21
  self.url = url
22
 
23
- @cache(None, typed=True)
24
  def _get_base_url(self, default_value: str = 'default') -> str:
25
  if not self.url or type(self.url) is not str:
26
  return default_value
@@ -52,7 +51,7 @@ class StorageSystemMixin(ABC):
52
  raise NotImplementedError('Storage system must implement `save` method')
53
 
54
  @staticmethod
55
- @cache(None, typed=True)
56
  def _get_hash(identifier: str) -> str:
57
  """If you want to hash identifier in your storage system, use this safer"""
58
  identifier = identifier.lower().strip()
@@ -64,7 +63,7 @@ class StorageSystemMixin(ABC):
64
  return f"{hash_value}_{len(identifier)}" # Length to reduce collision chance
65
 
66
 
67
- @cache(None, typed=True)
68
  class SQLiteStorageSystem(StorageSystemMixin):
69
  """The recommended system to use, it's race condition safe and thread safe.
70
  Mainly built so the library can run in threaded frameworks like scrapy or threaded tools
@@ -86,7 +85,7 @@ class SQLiteStorageSystem(StorageSystemMixin):
86
  self.connection.execute("PRAGMA journal_mode=WAL")
87
  self.cursor = self.connection.cursor()
88
  self._setup_database()
89
- logging.debug(
90
  f'Storage system loaded with arguments (storage_file="{storage_file}", url="{url}")'
91
  )
92
 
 
 
1
  import sqlite3
2
  import threading
3
  from abc import ABC, abstractmethod
 
8
  from tldextract import extract as tld
9
 
10
  from scrapling.core._types import Dict, Optional, Union
11
+ from scrapling.core.utils import _StorageTools, log, lru_cache
12
 
13
 
14
  class StorageSystemMixin(ABC):
 
19
  """
20
  self.url = url
21
 
22
+ @lru_cache(None, typed=True)
23
  def _get_base_url(self, default_value: str = 'default') -> str:
24
  if not self.url or type(self.url) is not str:
25
  return default_value
 
51
  raise NotImplementedError('Storage system must implement `save` method')
52
 
53
  @staticmethod
54
+ @lru_cache(None, typed=True)
55
  def _get_hash(identifier: str) -> str:
56
  """If you want to hash identifier in your storage system, use this safer"""
57
  identifier = identifier.lower().strip()
 
63
  return f"{hash_value}_{len(identifier)}" # Length to reduce collision chance
64
 
65
 
66
+ @lru_cache(None, typed=True)
67
  class SQLiteStorageSystem(StorageSystemMixin):
68
  """The recommended system to use, it's race condition safe and thread safe.
69
  Mainly built so the library can run in threaded frameworks like scrapy or threaded tools
 
85
  self.connection.execute("PRAGMA journal_mode=WAL")
86
  self.cursor = self.connection.cursor()
87
  self._setup_database()
88
+ log.debug(
89
  f'Storage system loaded with arguments (storage_file="{storage_file}", url="{url}")'
90
  )
91
 
scrapling/core/translator.py CHANGED
@@ -17,7 +17,7 @@ from cssselect.xpath import XPathExpr as OriginalXPathExpr
17
  from w3lib.html import HTML5_WHITESPACE
18
 
19
  from scrapling.core._types import Any, Optional, Protocol, Self
20
- from scrapling.core.utils import cache
21
 
22
  regex = f"[{HTML5_WHITESPACE}]+"
23
  replace_html5_whitespaces = re.compile(regex).sub
@@ -139,6 +139,6 @@ class TranslatorMixin:
139
 
140
 
141
  class HTMLTranslator(TranslatorMixin, OriginalHTMLTranslator):
142
- @cache(maxsize=256)
143
  def css_to_xpath(self, css: str, prefix: str = "descendant-or-self::") -> str:
144
  return super().css_to_xpath(css, prefix)
 
17
  from w3lib.html import HTML5_WHITESPACE
18
 
19
  from scrapling.core._types import Any, Optional, Protocol, Self
20
+ from scrapling.core.utils import lru_cache
21
 
22
  regex = f"[{HTML5_WHITESPACE}]+"
23
  replace_html5_whitespaces = re.compile(regex).sub
 
139
 
140
 
141
  class HTMLTranslator(TranslatorMixin, OriginalHTMLTranslator):
142
+ @lru_cache(maxsize=256)
143
  def css_to_xpath(self, css: str, prefix: str = "descendant-or-self::") -> str:
144
  return super().css_to_xpath(css, prefix)
scrapling/core/utils.py CHANGED
@@ -9,18 +9,36 @@ from scrapling.core._types import Any, Dict, Iterable, Union
9
 
10
  # Using cache on top of a class is brilliant way to achieve Singleton design pattern without much code
11
  # functools.cache is available on Python 3.9+ only so let's keep lru_cache
12
- from functools import lru_cache as cache # isort:skip
13
-
14
 
15
  html_forbidden = {html.HtmlComment, }
16
- logging.basicConfig(
17
- level=logging.INFO,
18
- format="[%(asctime)s] %(levelname)s: %(message)s",
19
- datefmt="%Y-%m-%d %H:%M:%S",
20
- handlers=[
21
- logging.StreamHandler()
22
- ]
23
- )
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
24
 
25
 
26
  def is_jsonable(content: Union[bytes, str]) -> bool:
@@ -34,23 +52,6 @@ def is_jsonable(content: Union[bytes, str]) -> bool:
34
  return False
35
 
36
 
37
- @cache(None, typed=True)
38
- def setup_basic_logging(level: str = 'debug'):
39
- levels = {
40
- 'debug': logging.DEBUG,
41
- 'info': logging.INFO,
42
- 'warning': logging.WARNING,
43
- 'error': logging.ERROR,
44
- 'critical': logging.CRITICAL
45
- }
46
- formatter = logging.Formatter("[%(asctime)s] %(levelname)s: %(message)s", "%Y-%m-%d %H:%M:%S")
47
- lvl = levels[level.lower()]
48
- handler = logging.StreamHandler()
49
- handler.setFormatter(formatter)
50
- # Configure the root logger
51
- logging.basicConfig(level=lvl, handlers=[handler])
52
-
53
-
54
  def flatten(lst: Iterable):
55
  return list(chain.from_iterable(lst))
56
 
@@ -114,7 +115,7 @@ class _StorageTools:
114
  # return _impl
115
 
116
 
117
- @cache(None, typed=True)
118
  def clean_spaces(string):
119
  string = string.replace('\t', ' ')
120
  string = re.sub('[\n|\r]', '', string)
 
9
 
10
  # Using cache on top of a class is brilliant way to achieve Singleton design pattern without much code
11
  # functools.cache is available on Python 3.9+ only so let's keep lru_cache
12
+ from functools import lru_cache # isort:skip
 
13
 
14
  html_forbidden = {html.HtmlComment, }
15
+
16
+
17
+ @lru_cache(1, typed=True)
18
+ def setup_logger():
19
+ """Create and configure a logger with a standard format.
20
+
21
+ :returns: logging.Logger: Configured logger instance
22
+ """
23
+ logger = logging.getLogger('scrapling')
24
+ logger.setLevel(logging.INFO)
25
+
26
+ formatter = logging.Formatter(
27
+ fmt="[%(asctime)s] %(levelname)s: %(message)s",
28
+ datefmt="%Y-%m-%d %H:%M:%S"
29
+ )
30
+
31
+ console_handler = logging.StreamHandler()
32
+ console_handler.setFormatter(formatter)
33
+
34
+ # Add handler to logger (if not already added)
35
+ if not logger.handlers:
36
+ logger.addHandler(console_handler)
37
+
38
+ return logger
39
+
40
+
41
+ log = setup_logger()
42
 
43
 
44
  def is_jsonable(content: Union[bytes, str]) -> bool:
 
52
  return False
53
 
54
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
55
  def flatten(lst: Iterable):
56
  return list(chain.from_iterable(lst))
57
 
 
115
  # return _impl
116
 
117
 
118
+ @lru_cache(None, typed=True)
119
  def clean_spaces(string):
120
  string = string.replace('\t', ' ')
121
  string = re.sub('[\n|\r]', '', string)
scrapling/engines/camo.py CHANGED
@@ -1,10 +1,9 @@
1
- import logging
2
-
3
  from camoufox import DefaultAddons
4
  from camoufox.sync_api import Camoufox
5
 
6
  from scrapling.core._types import (Callable, Dict, List, Literal, Optional,
7
  Union)
 
8
  from scrapling.engines.toolbelt import (Response, StatusText,
9
  check_type_validity,
10
  construct_proxy_dict, do_nothing,
@@ -63,7 +62,7 @@ class CamoufoxEngine:
63
  self.page_action = page_action
64
  else:
65
  self.page_action = do_nothing
66
- logging.error('[Ignored] Argument "page_action" must be callable')
67
 
68
  self.wait_selector = wait_selector
69
  self.wait_selector_state = wait_selector_state
 
 
 
1
  from camoufox import DefaultAddons
2
  from camoufox.sync_api import Camoufox
3
 
4
  from scrapling.core._types import (Callable, Dict, List, Literal, Optional,
5
  Union)
6
+ from scrapling.core.utils import log
7
  from scrapling.engines.toolbelt import (Response, StatusText,
8
  check_type_validity,
9
  construct_proxy_dict, do_nothing,
 
62
  self.page_action = page_action
63
  else:
64
  self.page_action = do_nothing
65
+ log.error('[Ignored] Argument "page_action" must be callable')
66
 
67
  self.wait_selector = wait_selector
68
  self.wait_selector_state = wait_selector_state
scrapling/engines/pw.py CHANGED
@@ -1,7 +1,7 @@
1
  import json
2
- import logging
3
 
4
  from scrapling.core._types import Callable, Dict, List, Optional, Union
 
5
  from scrapling.engines.constants import (DEFAULT_STEALTH_FLAGS,
6
  NSTBROWSER_DEFAULT_QUERY)
7
  from scrapling.engines.toolbelt import (Response, StatusText,
@@ -78,7 +78,7 @@ class PlaywrightEngine:
78
  self.page_action = page_action
79
  else:
80
  self.page_action = do_nothing
81
- logging.error('[Ignored] Argument "page_action" must be callable')
82
 
83
  self.wait_selector = wait_selector
84
  self.wait_selector_state = wait_selector_state
 
1
  import json
 
2
 
3
  from scrapling.core._types import Callable, Dict, List, Optional, Union
4
+ from scrapling.core.utils import log
5
  from scrapling.engines.constants import (DEFAULT_STEALTH_FLAGS,
6
  NSTBROWSER_DEFAULT_QUERY)
7
  from scrapling.engines.toolbelt import (Response, StatusText,
 
78
  self.page_action = page_action
79
  else:
80
  self.page_action = do_nothing
81
+ log.error('[Ignored] Argument "page_action" must be callable')
82
 
83
  self.wait_selector = wait_selector
84
  self.wait_selector_state = wait_selector_state
scrapling/engines/static.py CHANGED
@@ -1,5 +1,3 @@
1
- import logging
2
-
3
  import httpx
4
  from httpx._models import Response as httpxResponse
5
 
@@ -36,7 +34,6 @@ class StaticEngine:
36
  # Validate headers
37
  if not headers.get('user-agent') and not headers.get('User-Agent'):
38
  headers['User-Agent'] = generate_headers(browser_mode=False).get('User-Agent')
39
- logging.info(f"Can't find useragent in headers so '{headers['User-Agent']}' was used.")
40
 
41
  if stealth:
42
  extra_headers = generate_headers(browser_mode=False)
 
 
 
1
  import httpx
2
  from httpx._models import Response as httpxResponse
3
 
 
34
  # Validate headers
35
  if not headers.get('user-agent') and not headers.get('User-Agent'):
36
  headers['User-Agent'] = generate_headers(browser_mode=False).get('User-Agent')
 
37
 
38
  if stealth:
39
  extra_headers = generate_headers(browser_mode=False)
scrapling/engines/toolbelt/custom.py CHANGED
@@ -2,13 +2,12 @@
2
  Functions related to custom types or type checking
3
  """
4
  import inspect
5
- import logging
6
  from email.message import Message
7
 
8
  from scrapling.core._types import (Any, Callable, Dict, List, Optional, Tuple,
9
  Type, Union)
10
  from scrapling.core.custom_types import MappingProxyType
11
- from scrapling.core.utils import cache, setup_basic_logging
12
  from scrapling.parser import Adaptor, SQLiteStorageSystem
13
 
14
 
@@ -17,7 +16,7 @@ class ResponseEncoding:
17
  __ISO_8859_1_CONTENT_TYPES = {"text/plain", "text/html", "text/css", "text/javascript"}
18
 
19
  @classmethod
20
- @cache(maxsize=None)
21
  def __parse_content_type(cls, header_value: str) -> Tuple[str, Dict[str, str]]:
22
  """Parse content type and parameters from a content-type header value.
23
 
@@ -39,7 +38,7 @@ class ResponseEncoding:
39
  return content_type, params
40
 
41
  @classmethod
42
- @cache(maxsize=None)
43
  def get_value(cls, content_type: Optional[str], text: Optional[str] = 'test') -> str:
44
  """Determine the appropriate character encoding from a content-type header.
45
 
@@ -98,7 +97,7 @@ class Response(Adaptor):
98
  # For back-ward compatibility
99
  self.adaptor = self
100
  # For easier debugging while working from a Python shell
101
- logging.info(f'Fetched ({status}) <{method} {url}> (referer: {request_headers.get("referer")})')
102
 
103
  # def __repr__(self):
104
  # return f'<{self.__class__.__name__} [{self.status} {self.reason}]>'
@@ -107,7 +106,7 @@ class Response(Adaptor):
107
  class BaseFetcher:
108
  def __init__(
109
  self, huge_tree: bool = True, keep_comments: Optional[bool] = False, auto_match: Optional[bool] = True,
110
- storage: Any = SQLiteStorageSystem, storage_args: Optional[Dict] = None, debug: Optional[bool] = False,
111
  automatch_domain: Optional[str] = None, keep_cdata: Optional[bool] = False,
112
  ):
113
  """Arguments below are the same from the Adaptor class so you can pass them directly, the rest of Adaptor's arguments
@@ -124,7 +123,6 @@ class BaseFetcher:
124
  If empty, default values will be used.
125
  :param automatch_domain: For cases where you want to automatch selectors across different websites as if they were on the same website, use this argument to unify them.
126
  Otherwise, the domain of the request is used by default.
127
- :param debug: Enable debug mode
128
  """
129
  # Adaptor class parameters
130
  # I won't validate Adaptor's class parameters here again, I will leave it to be validated later
@@ -134,14 +132,11 @@ class BaseFetcher:
134
  keep_cdata=keep_cdata,
135
  auto_match=auto_match,
136
  storage=storage,
137
- storage_args=storage_args,
138
- debug=debug,
139
  )
140
- # If the user used fetchers first, then configure the logger from here instead of the `Adaptor` class
141
- setup_basic_logging(level='debug' if debug else 'info')
142
  if automatch_domain:
143
  if type(automatch_domain) is not str:
144
- logging.warning('[Ignored] The argument "automatch_domain" must be of string type')
145
  else:
146
  self.adaptor_arguments.update({'automatch_domain': automatch_domain})
147
 
@@ -217,7 +212,7 @@ class StatusText:
217
  })
218
 
219
  @classmethod
220
- @cache(maxsize=128)
221
  def get(cls, status_code: int) -> str:
222
  """Get the phrase for a given HTTP status code."""
223
  return cls._phrases.get(status_code, "Unknown Status Code")
@@ -284,7 +279,7 @@ def check_type_validity(variable: Any, valid_types: Union[List[Type], None], def
284
  error_msg = f'Argument "{var_name}" cannot be None'
285
  if critical:
286
  raise TypeError(error_msg)
287
- logging.error(f'[Ignored] {error_msg}')
288
  return default_value
289
 
290
  # If no valid_types specified and variable has a value, return it
@@ -297,7 +292,7 @@ def check_type_validity(variable: Any, valid_types: Union[List[Type], None], def
297
  error_msg = f'Argument "{var_name}" must be of type {" or ".join(type_names)}'
298
  if critical:
299
  raise TypeError(error_msg)
300
- logging.error(f'[Ignored] {error_msg}')
301
  return default_value
302
 
303
  return variable
 
2
  Functions related to custom types or type checking
3
  """
4
  import inspect
 
5
  from email.message import Message
6
 
7
  from scrapling.core._types import (Any, Callable, Dict, List, Optional, Tuple,
8
  Type, Union)
9
  from scrapling.core.custom_types import MappingProxyType
10
+ from scrapling.core.utils import log, lru_cache
11
  from scrapling.parser import Adaptor, SQLiteStorageSystem
12
 
13
 
 
16
  __ISO_8859_1_CONTENT_TYPES = {"text/plain", "text/html", "text/css", "text/javascript"}
17
 
18
  @classmethod
19
+ @lru_cache(maxsize=None)
20
  def __parse_content_type(cls, header_value: str) -> Tuple[str, Dict[str, str]]:
21
  """Parse content type and parameters from a content-type header value.
22
 
 
38
  return content_type, params
39
 
40
  @classmethod
41
+ @lru_cache(maxsize=None)
42
  def get_value(cls, content_type: Optional[str], text: Optional[str] = 'test') -> str:
43
  """Determine the appropriate character encoding from a content-type header.
44
 
 
97
  # For back-ward compatibility
98
  self.adaptor = self
99
  # For easier debugging while working from a Python shell
100
+ log.info(f'Fetched ({status}) <{method} {url}> (referer: {request_headers.get("referer")})')
101
 
102
  # def __repr__(self):
103
  # return f'<{self.__class__.__name__} [{self.status} {self.reason}]>'
 
106
  class BaseFetcher:
107
  def __init__(
108
  self, huge_tree: bool = True, keep_comments: Optional[bool] = False, auto_match: Optional[bool] = True,
109
+ storage: Any = SQLiteStorageSystem, storage_args: Optional[Dict] = None,
110
  automatch_domain: Optional[str] = None, keep_cdata: Optional[bool] = False,
111
  ):
112
  """Arguments below are the same from the Adaptor class so you can pass them directly, the rest of Adaptor's arguments
 
123
  If empty, default values will be used.
124
  :param automatch_domain: For cases where you want to automatch selectors across different websites as if they were on the same website, use this argument to unify them.
125
  Otherwise, the domain of the request is used by default.
 
126
  """
127
  # Adaptor class parameters
128
  # I won't validate Adaptor's class parameters here again, I will leave it to be validated later
 
132
  keep_cdata=keep_cdata,
133
  auto_match=auto_match,
134
  storage=storage,
135
+ storage_args=storage_args
 
136
  )
 
 
137
  if automatch_domain:
138
  if type(automatch_domain) is not str:
139
+ log.warning('[Ignored] The argument "automatch_domain" must be of string type')
140
  else:
141
  self.adaptor_arguments.update({'automatch_domain': automatch_domain})
142
 
 
212
  })
213
 
214
  @classmethod
215
+ @lru_cache(maxsize=128)
216
  def get(cls, status_code: int) -> str:
217
  """Get the phrase for a given HTTP status code."""
218
  return cls._phrases.get(status_code, "Unknown Status Code")
 
279
  error_msg = f'Argument "{var_name}" cannot be None'
280
  if critical:
281
  raise TypeError(error_msg)
282
+ log.error(f'[Ignored] {error_msg}')
283
  return default_value
284
 
285
  # If no valid_types specified and variable has a value, return it
 
292
  error_msg = f'Argument "{var_name}" must be of type {" or ".join(type_names)}'
293
  if critical:
294
  raise TypeError(error_msg)
295
+ log.error(f'[Ignored] {error_msg}')
296
  return default_value
297
 
298
  return variable
scrapling/engines/toolbelt/fingerprints.py CHANGED
@@ -9,10 +9,10 @@ from browserforge.headers import Browser, HeaderGenerator
9
  from tldextract import extract
10
 
11
  from scrapling.core._types import Dict, Union
12
- from scrapling.core.utils import cache
13
 
14
 
15
- @cache(None, typed=True)
16
  def generate_convincing_referer(url: str) -> str:
17
  """Takes the domain from the URL without the subdomain/suffix and make it look like you were searching google for this website
18
 
@@ -26,7 +26,7 @@ def generate_convincing_referer(url: str) -> str:
26
  return f'https://www.google.com/search?q={website_name}'
27
 
28
 
29
- @cache(None, typed=True)
30
  def get_os_name() -> Union[str, None]:
31
  """Get the current OS name in the same format needed for browserforge
32
 
 
9
  from tldextract import extract
10
 
11
  from scrapling.core._types import Dict, Union
12
+ from scrapling.core.utils import lru_cache
13
 
14
 
15
+ @lru_cache(None, typed=True)
16
  def generate_convincing_referer(url: str) -> str:
17
  """Takes the domain from the URL without the subdomain/suffix and make it look like you were searching google for this website
18
 
 
26
  return f'https://www.google.com/search?q={website_name}'
27
 
28
 
29
+ @lru_cache(None, typed=True)
30
  def get_os_name() -> Union[str, None]:
31
  """Get the current OS name in the same format needed for browserforge
32
 
scrapling/engines/toolbelt/navigation.py CHANGED
@@ -1,15 +1,13 @@
1
  """
2
  Functions related to files and URLs
3
  """
4
-
5
- import logging
6
  import os
7
  from urllib.parse import urlencode, urlparse
8
 
9
  from playwright.sync_api import Route
10
 
11
  from scrapling.core._types import Dict, Optional, Union
12
- from scrapling.core.utils import cache
13
  from scrapling.engines.constants import DEFAULT_DISABLED_RESOURCES
14
 
15
 
@@ -20,7 +18,7 @@ def intercept_route(route: Route) -> Union[Route, None]:
20
  :return: PlayWright `Route` object
21
  """
22
  if route.request.resource_type in DEFAULT_DISABLED_RESOURCES:
23
- logging.debug(f'Blocking background resource "{route.request.url}" of type "{route.request.resource_type}"')
24
  return route.abort()
25
  return route.continue_()
26
 
@@ -97,7 +95,7 @@ def construct_cdp_url(cdp_url: str, query_params: Optional[Dict] = None) -> str:
97
  raise ValueError(f"Invalid CDP URL: {str(e)}")
98
 
99
 
100
- @cache(None, typed=True)
101
  def js_bypass_path(filename: str) -> str:
102
  """Takes the base filename of JS file inside the `bypasses` folder then return the full path of it
103
 
 
1
  """
2
  Functions related to files and URLs
3
  """
 
 
4
  import os
5
  from urllib.parse import urlencode, urlparse
6
 
7
  from playwright.sync_api import Route
8
 
9
  from scrapling.core._types import Dict, Optional, Union
10
+ from scrapling.core.utils import log, lru_cache
11
  from scrapling.engines.constants import DEFAULT_DISABLED_RESOURCES
12
 
13
 
 
18
  :return: PlayWright `Route` object
19
  """
20
  if route.request.resource_type in DEFAULT_DISABLED_RESOURCES:
21
+ log.debug(f'Blocking background resource "{route.request.url}" of type "{route.request.resource_type}"')
22
  return route.abort()
23
  return route.continue_()
24
 
 
95
  raise ValueError(f"Invalid CDP URL: {str(e)}")
96
 
97
 
98
+ @lru_cache(None, typed=True)
99
  def js_bypass_path(filename: str) -> str:
100
  """Takes the base filename of JS file inside the `bypasses` folder then return the full path of it
101
 
scrapling/parser.py CHANGED
@@ -18,12 +18,12 @@ from scrapling.core.storage_adaptors import (SQLiteStorageSystem,
18
  StorageSystemMixin, _StorageTools)
19
  from scrapling.core.translator import HTMLTranslator
20
  from scrapling.core.utils import (clean_spaces, flatten, html_forbidden,
21
- is_jsonable, logging, setup_basic_logging)
22
 
23
 
24
  class Adaptor(SelectorsGeneration):
25
  __slots__ = (
26
- 'url', 'encoding', '__auto_match_enabled', '_root', '_storage', '__debug',
27
  '__keep_comments', '__huge_tree_enabled', '__attributes', '__text', '__tag',
28
  '__keep_cdata', '__raw_body'
29
  )
@@ -41,7 +41,6 @@ class Adaptor(SelectorsGeneration):
41
  auto_match: Optional[bool] = True,
42
  storage: Any = SQLiteStorageSystem,
43
  storage_args: Optional[Dict] = None,
44
- debug: Optional[bool] = True,
45
  **kwargs
46
  ):
47
  """The main class that works as a wrapper for the HTML input data. Using this class, you can search for elements
@@ -67,7 +66,6 @@ class Adaptor(SelectorsGeneration):
67
  :param storage: The storage class to be passed for auto-matching functionalities, see ``Docs`` for more info.
68
  :param storage_args: A dictionary of ``argument->value`` pairs to be passed for the storage class.
69
  If empty, default values will be used.
70
- :param debug: Enable debug mode
71
  """
72
  if root is None and not body and text is None:
73
  raise ValueError("Adaptor class needs text, body, or root arguments to work")
@@ -106,7 +104,6 @@ class Adaptor(SelectorsGeneration):
106
 
107
  self._root = root
108
 
109
- setup_basic_logging(level='debug' if debug else 'info')
110
  self.__auto_match_enabled = auto_match
111
 
112
  if self.__auto_match_enabled:
@@ -117,7 +114,7 @@ class Adaptor(SelectorsGeneration):
117
  }
118
 
119
  if not hasattr(storage, '__wrapped__'):
120
- raise ValueError("Storage class must be wrapped with cache decorator, see docs for info")
121
 
122
  if not issubclass(storage.__wrapped__, StorageSystemMixin):
123
  raise ValueError("Storage system must be inherited from class `StorageSystemMixin`")
@@ -132,7 +129,6 @@ class Adaptor(SelectorsGeneration):
132
  # For selector stuff
133
  self.__attributes = None
134
  self.__tag = None
135
- self.__debug = debug
136
  # No need to check if all response attributes exist or not because if `status` exist, then the rest exist (Save some CPU cycles for speed)
137
  self.__response_data = {
138
  key: getattr(self, key) for key in ('status', 'reason', 'cookies', 'headers', 'request_headers',)
@@ -164,7 +160,7 @@ class Adaptor(SelectorsGeneration):
164
  text='', body=b'', # Since root argument is provided, both `text` and `body` will be ignored so this is just a filler
165
  url=self.url, encoding=self.encoding, auto_match=self.__auto_match_enabled,
166
  keep_comments=self.__keep_comments, keep_cdata=self.__keep_cdata,
167
- huge_tree=self.__huge_tree_enabled, debug=self.__debug,
168
  **self.__response_data
169
  )
170
  return element
@@ -417,10 +413,10 @@ class Adaptor(SelectorsGeneration):
417
  if score_table:
418
  highest_probability = max(score_table.keys())
419
  if score_table[highest_probability] and highest_probability >= percentage:
420
- logging.debug(f'Highest probability was {highest_probability}%')
421
- logging.debug('Top 5 best matching elements are: ')
422
  for percent in tuple(sorted(score_table.keys(), reverse=True))[:5]:
423
- logging.debug(f'{percent} -> {self.__convert_results(score_table[percent])}')
424
  if not adaptor_type:
425
  return score_table[highest_probability]
426
  return self.__convert_results(score_table[highest_probability])
@@ -546,7 +542,7 @@ class Adaptor(SelectorsGeneration):
546
 
547
  if selected_elements:
548
  if not self.__auto_match_enabled and auto_save:
549
- logging.warning("Argument `auto_save` will be ignored because `auto_match` wasn't enabled on initialization. Check docs for more info.")
550
 
551
  elif self.__auto_match_enabled and auto_save:
552
  self.save(selected_elements[0], identifier or selector)
@@ -565,7 +561,7 @@ class Adaptor(SelectorsGeneration):
565
  return self.__convert_results(selected_elements)
566
 
567
  elif not self.__auto_match_enabled and auto_match:
568
- logging.warning("Argument `auto_match` will be ignored because `auto_match` wasn't enabled on initialization. Check docs for more info.")
569
 
570
  return self.__convert_results(selected_elements)
571
 
@@ -769,7 +765,7 @@ class Adaptor(SelectorsGeneration):
769
 
770
  self._storage.save(element, identifier)
771
  else:
772
- logging.critical(
773
  "Can't use Auto-match features with disabled globally, you have to start a new class instance."
774
  )
775
 
@@ -783,7 +779,7 @@ class Adaptor(SelectorsGeneration):
783
  if self.__auto_match_enabled:
784
  return self._storage.retrieve(identifier)
785
 
786
- logging.critical(
787
  "Can't use Auto-match features with disabled globally, you have to start a new class instance."
788
  )
789
 
 
18
  StorageSystemMixin, _StorageTools)
19
  from scrapling.core.translator import HTMLTranslator
20
  from scrapling.core.utils import (clean_spaces, flatten, html_forbidden,
21
+ is_jsonable, log)
22
 
23
 
24
  class Adaptor(SelectorsGeneration):
25
  __slots__ = (
26
+ 'url', 'encoding', '__auto_match_enabled', '_root', '_storage',
27
  '__keep_comments', '__huge_tree_enabled', '__attributes', '__text', '__tag',
28
  '__keep_cdata', '__raw_body'
29
  )
 
41
  auto_match: Optional[bool] = True,
42
  storage: Any = SQLiteStorageSystem,
43
  storage_args: Optional[Dict] = None,
 
44
  **kwargs
45
  ):
46
  """The main class that works as a wrapper for the HTML input data. Using this class, you can search for elements
 
66
  :param storage: The storage class to be passed for auto-matching functionalities, see ``Docs`` for more info.
67
  :param storage_args: A dictionary of ``argument->value`` pairs to be passed for the storage class.
68
  If empty, default values will be used.
 
69
  """
70
  if root is None and not body and text is None:
71
  raise ValueError("Adaptor class needs text, body, or root arguments to work")
 
104
 
105
  self._root = root
106
 
 
107
  self.__auto_match_enabled = auto_match
108
 
109
  if self.__auto_match_enabled:
 
114
  }
115
 
116
  if not hasattr(storage, '__wrapped__'):
117
+ raise ValueError("Storage class must be wrapped with lru_cache decorator, see docs for info")
118
 
119
  if not issubclass(storage.__wrapped__, StorageSystemMixin):
120
  raise ValueError("Storage system must be inherited from class `StorageSystemMixin`")
 
129
  # For selector stuff
130
  self.__attributes = None
131
  self.__tag = None
 
132
  # No need to check if all response attributes exist or not because if `status` exist, then the rest exist (Save some CPU cycles for speed)
133
  self.__response_data = {
134
  key: getattr(self, key) for key in ('status', 'reason', 'cookies', 'headers', 'request_headers',)
 
160
  text='', body=b'', # Since root argument is provided, both `text` and `body` will be ignored so this is just a filler
161
  url=self.url, encoding=self.encoding, auto_match=self.__auto_match_enabled,
162
  keep_comments=self.__keep_comments, keep_cdata=self.__keep_cdata,
163
+ huge_tree=self.__huge_tree_enabled,
164
  **self.__response_data
165
  )
166
  return element
 
413
  if score_table:
414
  highest_probability = max(score_table.keys())
415
  if score_table[highest_probability] and highest_probability >= percentage:
416
+ log.debug(f'Highest probability was {highest_probability}%')
417
+ log.debug('Top 5 best matching elements are: ')
418
  for percent in tuple(sorted(score_table.keys(), reverse=True))[:5]:
419
+ log.debug(f'{percent} -> {self.__convert_results(score_table[percent])}')
420
  if not adaptor_type:
421
  return score_table[highest_probability]
422
  return self.__convert_results(score_table[highest_probability])
 
542
 
543
  if selected_elements:
544
  if not self.__auto_match_enabled and auto_save:
545
+ log.warning("Argument `auto_save` will be ignored because `auto_match` wasn't enabled on initialization. Check docs for more info.")
546
 
547
  elif self.__auto_match_enabled and auto_save:
548
  self.save(selected_elements[0], identifier or selector)
 
561
  return self.__convert_results(selected_elements)
562
 
563
  elif not self.__auto_match_enabled and auto_match:
564
+ log.warning("Argument `auto_match` will be ignored because `auto_match` wasn't enabled on initialization. Check docs for more info.")
565
 
566
  return self.__convert_results(selected_elements)
567
 
 
765
 
766
  self._storage.save(element, identifier)
767
  else:
768
+ log.critical(
769
  "Can't use Auto-match features with disabled globally, you have to start a new class instance."
770
  )
771
 
 
779
  if self.__auto_match_enabled:
780
  return self._storage.retrieve(identifier)
781
 
782
+ log.critical(
783
  "Can't use Auto-match features with disabled globally, you have to start a new class instance."
784
  )
785
 
tests/parser/test_automatch.py CHANGED
@@ -42,8 +42,8 @@ class TestParserAutoMatch(unittest.TestCase):
42
  </div>
43
  '''
44
 
45
- old_page = Adaptor(original_html, url='example.com', auto_match=True, debug=True)
46
- new_page = Adaptor(changed_html, url='example.com', auto_match=True, debug=True)
47
 
48
  # 'p1' was used as ID and now it's not and all the path elements have changes
49
  # Also at the same time testing auto-match vs combined selectors
 
42
  </div>
43
  '''
44
 
45
+ old_page = Adaptor(original_html, url='example.com', auto_match=True)
46
+ new_page = Adaptor(changed_html, url='example.com', auto_match=True)
47
 
48
  # 'p1' was used as ID and now it's not and all the path elements have changes
49
  # Also at the same time testing auto-match vs combined selectors
tests/parser/test_general.py CHANGED
@@ -74,7 +74,7 @@ class TestParser(unittest.TestCase):
74
  </body>
75
  </html>
76
  '''
77
- self.page = Adaptor(self.html, auto_match=False, debug=False)
78
 
79
  def test_css_selector(self):
80
  """Test Selecting elements with complex CSS selectors"""
@@ -273,7 +273,7 @@ class TestParser(unittest.TestCase):
273
  large_html = '<html><body>' + '<div class="item">' * 5000 + '</div>' * 5000 + '</body></html>'
274
 
275
  start_time = time.time()
276
- parsed = Adaptor(large_html, auto_match=False, debug=False)
277
  elements = parsed.css('.item')
278
  end_time = time.time()
279
 
 
74
  </body>
75
  </html>
76
  '''
77
+ self.page = Adaptor(self.html, auto_match=False)
78
 
79
  def test_css_selector(self):
80
  """Test Selecting elements with complex CSS selectors"""
 
273
  large_html = '<html><body>' + '<div class="item">' * 5000 + '</div>' * 5000 + '</body></html>'
274
 
275
  start_time = time.time()
276
+ parsed = Adaptor(large_html, auto_match=False)
277
  elements = parsed.css('.item')
278
  end_time = time.time()
279