Karim shoair commited on
Commit ·
df3c414
1
Parent(s): 9c43da3
refactor(api)!: Unifying log under 1 logger and removing debug parameter
Browse files
So now you can control the logging and the debugging from the shell through the logger named 'scrapling'.
- .github/ISSUE_TEMPLATE/01-bug_report.yml +1 -1
- CONTRIBUTING.md +5 -1
- README.md +1 -1
- benchmarks.py +3 -3
- scrapling/core/storage_adaptors.py +5 -6
- scrapling/core/translator.py +2 -2
- scrapling/core/utils.py +29 -28
- scrapling/engines/camo.py +2 -3
- scrapling/engines/pw.py +2 -2
- scrapling/engines/static.py +0 -3
- scrapling/engines/toolbelt/custom.py +10 -15
- scrapling/engines/toolbelt/fingerprints.py +3 -3
- scrapling/engines/toolbelt/navigation.py +3 -5
- scrapling/parser.py +11 -15
- tests/parser/test_automatch.py +2 -2
- tests/parser/test_general.py +2 -2
.github/ISSUE_TEMPLATE/01-bug_report.yml
CHANGED
|
@@ -65,7 +65,7 @@ body:
|
|
| 65 |
|
| 66 |
- type: textarea
|
| 67 |
attributes:
|
| 68 |
-
label: "Actual behavior
|
| 69 |
validations:
|
| 70 |
required: true
|
| 71 |
|
|
|
|
| 65 |
|
| 66 |
- type: textarea
|
| 67 |
attributes:
|
| 68 |
+
label: "Actual behavior"
|
| 69 |
validations:
|
| 70 |
required: true
|
| 71 |
|
CONTRIBUTING.md
CHANGED
|
@@ -19,7 +19,11 @@ tests/test_parser_functions.py ................ [100%]
|
|
| 19 |
|
| 20 |
=============================== 16 passed in 0.22s ================================
|
| 21 |
```
|
| 22 |
-
Also, consider setting
|
|
|
|
|
|
|
|
|
|
|
|
|
| 23 |
|
| 24 |
### The process is straight-forward.
|
| 25 |
|
|
|
|
| 19 |
|
| 20 |
=============================== 16 passed in 0.22s ================================
|
| 21 |
```
|
| 22 |
+
Also, consider setting the scrapling logging level to `debug` so it's easier to know what's happening in the background.
|
| 23 |
+
```python
|
| 24 |
+
>>> import logging
|
| 25 |
+
>>> logging.getLogger("scrapling").setLevel(logging.DEBUG)
|
| 26 |
+
```
|
| 27 |
|
| 28 |
### The process is straight-forward.
|
| 29 |
|
README.md
CHANGED
|
@@ -219,7 +219,7 @@ You might be slightly confused by now so let me clear things up. All fetcher-typ
|
|
| 219 |
```python
|
| 220 |
from scrapling import Fetcher, StealthyFetcher, PlayWrightFetcher
|
| 221 |
```
|
| 222 |
-
All of them can take these initialization arguments: `auto_match`, `huge_tree`, `keep_comments`, `keep_cdata`, `storage`,
|
| 223 |
|
| 224 |
If you don't want to pass arguments to the generated `Adaptor` object and want to use the default values, you can use this import instead for cleaner code:
|
| 225 |
```python
|
|
|
|
| 219 |
```python
|
| 220 |
from scrapling import Fetcher, StealthyFetcher, PlayWrightFetcher
|
| 221 |
```
|
| 222 |
+
All of them can take these initialization arguments: `auto_match`, `huge_tree`, `keep_comments`, `keep_cdata`, `storage`, and `storage_args`, which are the same ones you give to the `Adaptor` class.
|
| 223 |
|
| 224 |
If you don't want to pass arguments to the generated `Adaptor` object and want to use the default values, you can use this import instead for cleaner code:
|
| 225 |
```python
|
benchmarks.py
CHANGED
|
@@ -64,9 +64,9 @@ def test_pyquery():
|
|
| 64 |
@benchmark
|
| 65 |
def test_scrapling():
|
| 66 |
# No need to do `.extract()` like parsel to extract text
|
| 67 |
-
# Also, this is faster than `[t.text for t in Adaptor(large_html, auto_match=False
|
| 68 |
# for obvious reasons, of course.
|
| 69 |
-
return Adaptor(large_html, auto_match=False
|
| 70 |
|
| 71 |
|
| 72 |
@benchmark
|
|
@@ -103,7 +103,7 @@ def test_scrapling_text(request_html):
|
|
| 103 |
# Will loop over resulted elements to get text too to make comparison even more fair otherwise Scrapling will be even faster
|
| 104 |
return [
|
| 105 |
element.text for element in Adaptor(
|
| 106 |
-
request_html, auto_match=False
|
| 107 |
).find_by_text('Tipping the Velvet', first_match=True).find_similar(ignore_attributes=['title'])
|
| 108 |
]
|
| 109 |
|
|
|
|
| 64 |
@benchmark
|
| 65 |
def test_scrapling():
|
| 66 |
# No need to do `.extract()` like parsel to extract text
|
| 67 |
+
# Also, this is faster than `[t.text for t in Adaptor(large_html, auto_match=False).css('.item')]`
|
| 68 |
# for obvious reasons, of course.
|
| 69 |
+
return Adaptor(large_html, auto_match=False).css('.item::text')
|
| 70 |
|
| 71 |
|
| 72 |
@benchmark
|
|
|
|
| 103 |
# Will loop over resulted elements to get text too to make comparison even more fair otherwise Scrapling will be even faster
|
| 104 |
return [
|
| 105 |
element.text for element in Adaptor(
|
| 106 |
+
request_html, auto_match=False
|
| 107 |
).find_by_text('Tipping the Velvet', first_match=True).find_similar(ignore_attributes=['title'])
|
| 108 |
]
|
| 109 |
|
scrapling/core/storage_adaptors.py
CHANGED
|
@@ -1,4 +1,3 @@
|
|
| 1 |
-
import logging
|
| 2 |
import sqlite3
|
| 3 |
import threading
|
| 4 |
from abc import ABC, abstractmethod
|
|
@@ -9,7 +8,7 @@ from lxml import html
|
|
| 9 |
from tldextract import extract as tld
|
| 10 |
|
| 11 |
from scrapling.core._types import Dict, Optional, Union
|
| 12 |
-
from scrapling.core.utils import _StorageTools,
|
| 13 |
|
| 14 |
|
| 15 |
class StorageSystemMixin(ABC):
|
|
@@ -20,7 +19,7 @@ class StorageSystemMixin(ABC):
|
|
| 20 |
"""
|
| 21 |
self.url = url
|
| 22 |
|
| 23 |
-
@
|
| 24 |
def _get_base_url(self, default_value: str = 'default') -> str:
|
| 25 |
if not self.url or type(self.url) is not str:
|
| 26 |
return default_value
|
|
@@ -52,7 +51,7 @@ class StorageSystemMixin(ABC):
|
|
| 52 |
raise NotImplementedError('Storage system must implement `save` method')
|
| 53 |
|
| 54 |
@staticmethod
|
| 55 |
-
@
|
| 56 |
def _get_hash(identifier: str) -> str:
|
| 57 |
"""If you want to hash identifier in your storage system, use this safer"""
|
| 58 |
identifier = identifier.lower().strip()
|
|
@@ -64,7 +63,7 @@ class StorageSystemMixin(ABC):
|
|
| 64 |
return f"{hash_value}_{len(identifier)}" # Length to reduce collision chance
|
| 65 |
|
| 66 |
|
| 67 |
-
@
|
| 68 |
class SQLiteStorageSystem(StorageSystemMixin):
|
| 69 |
"""The recommended system to use, it's race condition safe and thread safe.
|
| 70 |
Mainly built so the library can run in threaded frameworks like scrapy or threaded tools
|
|
@@ -86,7 +85,7 @@ class SQLiteStorageSystem(StorageSystemMixin):
|
|
| 86 |
self.connection.execute("PRAGMA journal_mode=WAL")
|
| 87 |
self.cursor = self.connection.cursor()
|
| 88 |
self._setup_database()
|
| 89 |
-
|
| 90 |
f'Storage system loaded with arguments (storage_file="{storage_file}", url="{url}")'
|
| 91 |
)
|
| 92 |
|
|
|
|
|
|
|
| 1 |
import sqlite3
|
| 2 |
import threading
|
| 3 |
from abc import ABC, abstractmethod
|
|
|
|
| 8 |
from tldextract import extract as tld
|
| 9 |
|
| 10 |
from scrapling.core._types import Dict, Optional, Union
|
| 11 |
+
from scrapling.core.utils import _StorageTools, log, lru_cache
|
| 12 |
|
| 13 |
|
| 14 |
class StorageSystemMixin(ABC):
|
|
|
|
| 19 |
"""
|
| 20 |
self.url = url
|
| 21 |
|
| 22 |
+
@lru_cache(None, typed=True)
|
| 23 |
def _get_base_url(self, default_value: str = 'default') -> str:
|
| 24 |
if not self.url or type(self.url) is not str:
|
| 25 |
return default_value
|
|
|
|
| 51 |
raise NotImplementedError('Storage system must implement `save` method')
|
| 52 |
|
| 53 |
@staticmethod
|
| 54 |
+
@lru_cache(None, typed=True)
|
| 55 |
def _get_hash(identifier: str) -> str:
|
| 56 |
"""If you want to hash identifier in your storage system, use this safer"""
|
| 57 |
identifier = identifier.lower().strip()
|
|
|
|
| 63 |
return f"{hash_value}_{len(identifier)}" # Length to reduce collision chance
|
| 64 |
|
| 65 |
|
| 66 |
+
@lru_cache(None, typed=True)
|
| 67 |
class SQLiteStorageSystem(StorageSystemMixin):
|
| 68 |
"""The recommended system to use, it's race condition safe and thread safe.
|
| 69 |
Mainly built so the library can run in threaded frameworks like scrapy or threaded tools
|
|
|
|
| 85 |
self.connection.execute("PRAGMA journal_mode=WAL")
|
| 86 |
self.cursor = self.connection.cursor()
|
| 87 |
self._setup_database()
|
| 88 |
+
log.debug(
|
| 89 |
f'Storage system loaded with arguments (storage_file="{storage_file}", url="{url}")'
|
| 90 |
)
|
| 91 |
|
scrapling/core/translator.py
CHANGED
|
@@ -17,7 +17,7 @@ from cssselect.xpath import XPathExpr as OriginalXPathExpr
|
|
| 17 |
from w3lib.html import HTML5_WHITESPACE
|
| 18 |
|
| 19 |
from scrapling.core._types import Any, Optional, Protocol, Self
|
| 20 |
-
from scrapling.core.utils import
|
| 21 |
|
| 22 |
regex = f"[{HTML5_WHITESPACE}]+"
|
| 23 |
replace_html5_whitespaces = re.compile(regex).sub
|
|
@@ -139,6 +139,6 @@ class TranslatorMixin:
|
|
| 139 |
|
| 140 |
|
| 141 |
class HTMLTranslator(TranslatorMixin, OriginalHTMLTranslator):
|
| 142 |
-
@
|
| 143 |
def css_to_xpath(self, css: str, prefix: str = "descendant-or-self::") -> str:
|
| 144 |
return super().css_to_xpath(css, prefix)
|
|
|
|
| 17 |
from w3lib.html import HTML5_WHITESPACE
|
| 18 |
|
| 19 |
from scrapling.core._types import Any, Optional, Protocol, Self
|
| 20 |
+
from scrapling.core.utils import lru_cache
|
| 21 |
|
| 22 |
regex = f"[{HTML5_WHITESPACE}]+"
|
| 23 |
replace_html5_whitespaces = re.compile(regex).sub
|
|
|
|
| 139 |
|
| 140 |
|
| 141 |
class HTMLTranslator(TranslatorMixin, OriginalHTMLTranslator):
|
| 142 |
+
@lru_cache(maxsize=256)
|
| 143 |
def css_to_xpath(self, css: str, prefix: str = "descendant-or-self::") -> str:
|
| 144 |
return super().css_to_xpath(css, prefix)
|
scrapling/core/utils.py
CHANGED
|
@@ -9,18 +9,36 @@ from scrapling.core._types import Any, Dict, Iterable, Union
|
|
| 9 |
|
| 10 |
# Using cache on top of a class is brilliant way to achieve Singleton design pattern without much code
|
| 11 |
# functools.cache is available on Python 3.9+ only so let's keep lru_cache
|
| 12 |
-
from functools import lru_cache
|
| 13 |
-
|
| 14 |
|
| 15 |
html_forbidden = {html.HtmlComment, }
|
| 16 |
-
|
| 17 |
-
|
| 18 |
-
|
| 19 |
-
|
| 20 |
-
|
| 21 |
-
|
| 22 |
-
|
| 23 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 24 |
|
| 25 |
|
| 26 |
def is_jsonable(content: Union[bytes, str]) -> bool:
|
|
@@ -34,23 +52,6 @@ def is_jsonable(content: Union[bytes, str]) -> bool:
|
|
| 34 |
return False
|
| 35 |
|
| 36 |
|
| 37 |
-
@cache(None, typed=True)
|
| 38 |
-
def setup_basic_logging(level: str = 'debug'):
|
| 39 |
-
levels = {
|
| 40 |
-
'debug': logging.DEBUG,
|
| 41 |
-
'info': logging.INFO,
|
| 42 |
-
'warning': logging.WARNING,
|
| 43 |
-
'error': logging.ERROR,
|
| 44 |
-
'critical': logging.CRITICAL
|
| 45 |
-
}
|
| 46 |
-
formatter = logging.Formatter("[%(asctime)s] %(levelname)s: %(message)s", "%Y-%m-%d %H:%M:%S")
|
| 47 |
-
lvl = levels[level.lower()]
|
| 48 |
-
handler = logging.StreamHandler()
|
| 49 |
-
handler.setFormatter(formatter)
|
| 50 |
-
# Configure the root logger
|
| 51 |
-
logging.basicConfig(level=lvl, handlers=[handler])
|
| 52 |
-
|
| 53 |
-
|
| 54 |
def flatten(lst: Iterable):
|
| 55 |
return list(chain.from_iterable(lst))
|
| 56 |
|
|
@@ -114,7 +115,7 @@ class _StorageTools:
|
|
| 114 |
# return _impl
|
| 115 |
|
| 116 |
|
| 117 |
-
@
|
| 118 |
def clean_spaces(string):
|
| 119 |
string = string.replace('\t', ' ')
|
| 120 |
string = re.sub('[\n|\r]', '', string)
|
|
|
|
| 9 |
|
| 10 |
# Using cache on top of a class is brilliant way to achieve Singleton design pattern without much code
|
| 11 |
# functools.cache is available on Python 3.9+ only so let's keep lru_cache
|
| 12 |
+
from functools import lru_cache # isort:skip
|
|
|
|
| 13 |
|
| 14 |
html_forbidden = {html.HtmlComment, }
|
| 15 |
+
|
| 16 |
+
|
| 17 |
+
@lru_cache(1, typed=True)
|
| 18 |
+
def setup_logger():
|
| 19 |
+
"""Create and configure a logger with a standard format.
|
| 20 |
+
|
| 21 |
+
:returns: logging.Logger: Configured logger instance
|
| 22 |
+
"""
|
| 23 |
+
logger = logging.getLogger('scrapling')
|
| 24 |
+
logger.setLevel(logging.INFO)
|
| 25 |
+
|
| 26 |
+
formatter = logging.Formatter(
|
| 27 |
+
fmt="[%(asctime)s] %(levelname)s: %(message)s",
|
| 28 |
+
datefmt="%Y-%m-%d %H:%M:%S"
|
| 29 |
+
)
|
| 30 |
+
|
| 31 |
+
console_handler = logging.StreamHandler()
|
| 32 |
+
console_handler.setFormatter(formatter)
|
| 33 |
+
|
| 34 |
+
# Add handler to logger (if not already added)
|
| 35 |
+
if not logger.handlers:
|
| 36 |
+
logger.addHandler(console_handler)
|
| 37 |
+
|
| 38 |
+
return logger
|
| 39 |
+
|
| 40 |
+
|
| 41 |
+
log = setup_logger()
|
| 42 |
|
| 43 |
|
| 44 |
def is_jsonable(content: Union[bytes, str]) -> bool:
|
|
|
|
| 52 |
return False
|
| 53 |
|
| 54 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 55 |
def flatten(lst: Iterable):
|
| 56 |
return list(chain.from_iterable(lst))
|
| 57 |
|
|
|
|
| 115 |
# return _impl
|
| 116 |
|
| 117 |
|
| 118 |
+
@lru_cache(None, typed=True)
|
| 119 |
def clean_spaces(string):
|
| 120 |
string = string.replace('\t', ' ')
|
| 121 |
string = re.sub('[\n|\r]', '', string)
|
scrapling/engines/camo.py
CHANGED
|
@@ -1,10 +1,9 @@
|
|
| 1 |
-
import logging
|
| 2 |
-
|
| 3 |
from camoufox import DefaultAddons
|
| 4 |
from camoufox.sync_api import Camoufox
|
| 5 |
|
| 6 |
from scrapling.core._types import (Callable, Dict, List, Literal, Optional,
|
| 7 |
Union)
|
|
|
|
| 8 |
from scrapling.engines.toolbelt import (Response, StatusText,
|
| 9 |
check_type_validity,
|
| 10 |
construct_proxy_dict, do_nothing,
|
|
@@ -63,7 +62,7 @@ class CamoufoxEngine:
|
|
| 63 |
self.page_action = page_action
|
| 64 |
else:
|
| 65 |
self.page_action = do_nothing
|
| 66 |
-
|
| 67 |
|
| 68 |
self.wait_selector = wait_selector
|
| 69 |
self.wait_selector_state = wait_selector_state
|
|
|
|
|
|
|
|
|
|
| 1 |
from camoufox import DefaultAddons
|
| 2 |
from camoufox.sync_api import Camoufox
|
| 3 |
|
| 4 |
from scrapling.core._types import (Callable, Dict, List, Literal, Optional,
|
| 5 |
Union)
|
| 6 |
+
from scrapling.core.utils import log
|
| 7 |
from scrapling.engines.toolbelt import (Response, StatusText,
|
| 8 |
check_type_validity,
|
| 9 |
construct_proxy_dict, do_nothing,
|
|
|
|
| 62 |
self.page_action = page_action
|
| 63 |
else:
|
| 64 |
self.page_action = do_nothing
|
| 65 |
+
log.error('[Ignored] Argument "page_action" must be callable')
|
| 66 |
|
| 67 |
self.wait_selector = wait_selector
|
| 68 |
self.wait_selector_state = wait_selector_state
|
scrapling/engines/pw.py
CHANGED
|
@@ -1,7 +1,7 @@
|
|
| 1 |
import json
|
| 2 |
-
import logging
|
| 3 |
|
| 4 |
from scrapling.core._types import Callable, Dict, List, Optional, Union
|
|
|
|
| 5 |
from scrapling.engines.constants import (DEFAULT_STEALTH_FLAGS,
|
| 6 |
NSTBROWSER_DEFAULT_QUERY)
|
| 7 |
from scrapling.engines.toolbelt import (Response, StatusText,
|
|
@@ -78,7 +78,7 @@ class PlaywrightEngine:
|
|
| 78 |
self.page_action = page_action
|
| 79 |
else:
|
| 80 |
self.page_action = do_nothing
|
| 81 |
-
|
| 82 |
|
| 83 |
self.wait_selector = wait_selector
|
| 84 |
self.wait_selector_state = wait_selector_state
|
|
|
|
| 1 |
import json
|
|
|
|
| 2 |
|
| 3 |
from scrapling.core._types import Callable, Dict, List, Optional, Union
|
| 4 |
+
from scrapling.core.utils import log
|
| 5 |
from scrapling.engines.constants import (DEFAULT_STEALTH_FLAGS,
|
| 6 |
NSTBROWSER_DEFAULT_QUERY)
|
| 7 |
from scrapling.engines.toolbelt import (Response, StatusText,
|
|
|
|
| 78 |
self.page_action = page_action
|
| 79 |
else:
|
| 80 |
self.page_action = do_nothing
|
| 81 |
+
log.error('[Ignored] Argument "page_action" must be callable')
|
| 82 |
|
| 83 |
self.wait_selector = wait_selector
|
| 84 |
self.wait_selector_state = wait_selector_state
|
scrapling/engines/static.py
CHANGED
|
@@ -1,5 +1,3 @@
|
|
| 1 |
-
import logging
|
| 2 |
-
|
| 3 |
import httpx
|
| 4 |
from httpx._models import Response as httpxResponse
|
| 5 |
|
|
@@ -36,7 +34,6 @@ class StaticEngine:
|
|
| 36 |
# Validate headers
|
| 37 |
if not headers.get('user-agent') and not headers.get('User-Agent'):
|
| 38 |
headers['User-Agent'] = generate_headers(browser_mode=False).get('User-Agent')
|
| 39 |
-
logging.info(f"Can't find useragent in headers so '{headers['User-Agent']}' was used.")
|
| 40 |
|
| 41 |
if stealth:
|
| 42 |
extra_headers = generate_headers(browser_mode=False)
|
|
|
|
|
|
|
|
|
|
| 1 |
import httpx
|
| 2 |
from httpx._models import Response as httpxResponse
|
| 3 |
|
|
|
|
| 34 |
# Validate headers
|
| 35 |
if not headers.get('user-agent') and not headers.get('User-Agent'):
|
| 36 |
headers['User-Agent'] = generate_headers(browser_mode=False).get('User-Agent')
|
|
|
|
| 37 |
|
| 38 |
if stealth:
|
| 39 |
extra_headers = generate_headers(browser_mode=False)
|
scrapling/engines/toolbelt/custom.py
CHANGED
|
@@ -2,13 +2,12 @@
|
|
| 2 |
Functions related to custom types or type checking
|
| 3 |
"""
|
| 4 |
import inspect
|
| 5 |
-
import logging
|
| 6 |
from email.message import Message
|
| 7 |
|
| 8 |
from scrapling.core._types import (Any, Callable, Dict, List, Optional, Tuple,
|
| 9 |
Type, Union)
|
| 10 |
from scrapling.core.custom_types import MappingProxyType
|
| 11 |
-
from scrapling.core.utils import
|
| 12 |
from scrapling.parser import Adaptor, SQLiteStorageSystem
|
| 13 |
|
| 14 |
|
|
@@ -17,7 +16,7 @@ class ResponseEncoding:
|
|
| 17 |
__ISO_8859_1_CONTENT_TYPES = {"text/plain", "text/html", "text/css", "text/javascript"}
|
| 18 |
|
| 19 |
@classmethod
|
| 20 |
-
@
|
| 21 |
def __parse_content_type(cls, header_value: str) -> Tuple[str, Dict[str, str]]:
|
| 22 |
"""Parse content type and parameters from a content-type header value.
|
| 23 |
|
|
@@ -39,7 +38,7 @@ class ResponseEncoding:
|
|
| 39 |
return content_type, params
|
| 40 |
|
| 41 |
@classmethod
|
| 42 |
-
@
|
| 43 |
def get_value(cls, content_type: Optional[str], text: Optional[str] = 'test') -> str:
|
| 44 |
"""Determine the appropriate character encoding from a content-type header.
|
| 45 |
|
|
@@ -98,7 +97,7 @@ class Response(Adaptor):
|
|
| 98 |
# For back-ward compatibility
|
| 99 |
self.adaptor = self
|
| 100 |
# For easier debugging while working from a Python shell
|
| 101 |
-
|
| 102 |
|
| 103 |
# def __repr__(self):
|
| 104 |
# return f'<{self.__class__.__name__} [{self.status} {self.reason}]>'
|
|
@@ -107,7 +106,7 @@ class Response(Adaptor):
|
|
| 107 |
class BaseFetcher:
|
| 108 |
def __init__(
|
| 109 |
self, huge_tree: bool = True, keep_comments: Optional[bool] = False, auto_match: Optional[bool] = True,
|
| 110 |
-
storage: Any = SQLiteStorageSystem, storage_args: Optional[Dict] = None,
|
| 111 |
automatch_domain: Optional[str] = None, keep_cdata: Optional[bool] = False,
|
| 112 |
):
|
| 113 |
"""Arguments below are the same from the Adaptor class so you can pass them directly, the rest of Adaptor's arguments
|
|
@@ -124,7 +123,6 @@ class BaseFetcher:
|
|
| 124 |
If empty, default values will be used.
|
| 125 |
:param automatch_domain: For cases where you want to automatch selectors across different websites as if they were on the same website, use this argument to unify them.
|
| 126 |
Otherwise, the domain of the request is used by default.
|
| 127 |
-
:param debug: Enable debug mode
|
| 128 |
"""
|
| 129 |
# Adaptor class parameters
|
| 130 |
# I won't validate Adaptor's class parameters here again, I will leave it to be validated later
|
|
@@ -134,14 +132,11 @@ class BaseFetcher:
|
|
| 134 |
keep_cdata=keep_cdata,
|
| 135 |
auto_match=auto_match,
|
| 136 |
storage=storage,
|
| 137 |
-
storage_args=storage_args
|
| 138 |
-
debug=debug,
|
| 139 |
)
|
| 140 |
-
# If the user used fetchers first, then configure the logger from here instead of the `Adaptor` class
|
| 141 |
-
setup_basic_logging(level='debug' if debug else 'info')
|
| 142 |
if automatch_domain:
|
| 143 |
if type(automatch_domain) is not str:
|
| 144 |
-
|
| 145 |
else:
|
| 146 |
self.adaptor_arguments.update({'automatch_domain': automatch_domain})
|
| 147 |
|
|
@@ -217,7 +212,7 @@ class StatusText:
|
|
| 217 |
})
|
| 218 |
|
| 219 |
@classmethod
|
| 220 |
-
@
|
| 221 |
def get(cls, status_code: int) -> str:
|
| 222 |
"""Get the phrase for a given HTTP status code."""
|
| 223 |
return cls._phrases.get(status_code, "Unknown Status Code")
|
|
@@ -284,7 +279,7 @@ def check_type_validity(variable: Any, valid_types: Union[List[Type], None], def
|
|
| 284 |
error_msg = f'Argument "{var_name}" cannot be None'
|
| 285 |
if critical:
|
| 286 |
raise TypeError(error_msg)
|
| 287 |
-
|
| 288 |
return default_value
|
| 289 |
|
| 290 |
# If no valid_types specified and variable has a value, return it
|
|
@@ -297,7 +292,7 @@ def check_type_validity(variable: Any, valid_types: Union[List[Type], None], def
|
|
| 297 |
error_msg = f'Argument "{var_name}" must be of type {" or ".join(type_names)}'
|
| 298 |
if critical:
|
| 299 |
raise TypeError(error_msg)
|
| 300 |
-
|
| 301 |
return default_value
|
| 302 |
|
| 303 |
return variable
|
|
|
|
| 2 |
Functions related to custom types or type checking
|
| 3 |
"""
|
| 4 |
import inspect
|
|
|
|
| 5 |
from email.message import Message
|
| 6 |
|
| 7 |
from scrapling.core._types import (Any, Callable, Dict, List, Optional, Tuple,
|
| 8 |
Type, Union)
|
| 9 |
from scrapling.core.custom_types import MappingProxyType
|
| 10 |
+
from scrapling.core.utils import log, lru_cache
|
| 11 |
from scrapling.parser import Adaptor, SQLiteStorageSystem
|
| 12 |
|
| 13 |
|
|
|
|
| 16 |
__ISO_8859_1_CONTENT_TYPES = {"text/plain", "text/html", "text/css", "text/javascript"}
|
| 17 |
|
| 18 |
@classmethod
|
| 19 |
+
@lru_cache(maxsize=None)
|
| 20 |
def __parse_content_type(cls, header_value: str) -> Tuple[str, Dict[str, str]]:
|
| 21 |
"""Parse content type and parameters from a content-type header value.
|
| 22 |
|
|
|
|
| 38 |
return content_type, params
|
| 39 |
|
| 40 |
@classmethod
|
| 41 |
+
@lru_cache(maxsize=None)
|
| 42 |
def get_value(cls, content_type: Optional[str], text: Optional[str] = 'test') -> str:
|
| 43 |
"""Determine the appropriate character encoding from a content-type header.
|
| 44 |
|
|
|
|
| 97 |
# For back-ward compatibility
|
| 98 |
self.adaptor = self
|
| 99 |
# For easier debugging while working from a Python shell
|
| 100 |
+
log.info(f'Fetched ({status}) <{method} {url}> (referer: {request_headers.get("referer")})')
|
| 101 |
|
| 102 |
# def __repr__(self):
|
| 103 |
# return f'<{self.__class__.__name__} [{self.status} {self.reason}]>'
|
|
|
|
| 106 |
class BaseFetcher:
|
| 107 |
def __init__(
|
| 108 |
self, huge_tree: bool = True, keep_comments: Optional[bool] = False, auto_match: Optional[bool] = True,
|
| 109 |
+
storage: Any = SQLiteStorageSystem, storage_args: Optional[Dict] = None,
|
| 110 |
automatch_domain: Optional[str] = None, keep_cdata: Optional[bool] = False,
|
| 111 |
):
|
| 112 |
"""Arguments below are the same from the Adaptor class so you can pass them directly, the rest of Adaptor's arguments
|
|
|
|
| 123 |
If empty, default values will be used.
|
| 124 |
:param automatch_domain: For cases where you want to automatch selectors across different websites as if they were on the same website, use this argument to unify them.
|
| 125 |
Otherwise, the domain of the request is used by default.
|
|
|
|
| 126 |
"""
|
| 127 |
# Adaptor class parameters
|
| 128 |
# I won't validate Adaptor's class parameters here again, I will leave it to be validated later
|
|
|
|
| 132 |
keep_cdata=keep_cdata,
|
| 133 |
auto_match=auto_match,
|
| 134 |
storage=storage,
|
| 135 |
+
storage_args=storage_args
|
|
|
|
| 136 |
)
|
|
|
|
|
|
|
| 137 |
if automatch_domain:
|
| 138 |
if type(automatch_domain) is not str:
|
| 139 |
+
log.warning('[Ignored] The argument "automatch_domain" must be of string type')
|
| 140 |
else:
|
| 141 |
self.adaptor_arguments.update({'automatch_domain': automatch_domain})
|
| 142 |
|
|
|
|
| 212 |
})
|
| 213 |
|
| 214 |
@classmethod
|
| 215 |
+
@lru_cache(maxsize=128)
|
| 216 |
def get(cls, status_code: int) -> str:
|
| 217 |
"""Get the phrase for a given HTTP status code."""
|
| 218 |
return cls._phrases.get(status_code, "Unknown Status Code")
|
|
|
|
| 279 |
error_msg = f'Argument "{var_name}" cannot be None'
|
| 280 |
if critical:
|
| 281 |
raise TypeError(error_msg)
|
| 282 |
+
log.error(f'[Ignored] {error_msg}')
|
| 283 |
return default_value
|
| 284 |
|
| 285 |
# If no valid_types specified and variable has a value, return it
|
|
|
|
| 292 |
error_msg = f'Argument "{var_name}" must be of type {" or ".join(type_names)}'
|
| 293 |
if critical:
|
| 294 |
raise TypeError(error_msg)
|
| 295 |
+
log.error(f'[Ignored] {error_msg}')
|
| 296 |
return default_value
|
| 297 |
|
| 298 |
return variable
|
scrapling/engines/toolbelt/fingerprints.py
CHANGED
|
@@ -9,10 +9,10 @@ from browserforge.headers import Browser, HeaderGenerator
|
|
| 9 |
from tldextract import extract
|
| 10 |
|
| 11 |
from scrapling.core._types import Dict, Union
|
| 12 |
-
from scrapling.core.utils import
|
| 13 |
|
| 14 |
|
| 15 |
-
@
|
| 16 |
def generate_convincing_referer(url: str) -> str:
|
| 17 |
"""Takes the domain from the URL without the subdomain/suffix and make it look like you were searching google for this website
|
| 18 |
|
|
@@ -26,7 +26,7 @@ def generate_convincing_referer(url: str) -> str:
|
|
| 26 |
return f'https://www.google.com/search?q={website_name}'
|
| 27 |
|
| 28 |
|
| 29 |
-
@
|
| 30 |
def get_os_name() -> Union[str, None]:
|
| 31 |
"""Get the current OS name in the same format needed for browserforge
|
| 32 |
|
|
|
|
| 9 |
from tldextract import extract
|
| 10 |
|
| 11 |
from scrapling.core._types import Dict, Union
|
| 12 |
+
from scrapling.core.utils import lru_cache
|
| 13 |
|
| 14 |
|
| 15 |
+
@lru_cache(None, typed=True)
|
| 16 |
def generate_convincing_referer(url: str) -> str:
|
| 17 |
"""Takes the domain from the URL without the subdomain/suffix and make it look like you were searching google for this website
|
| 18 |
|
|
|
|
| 26 |
return f'https://www.google.com/search?q={website_name}'
|
| 27 |
|
| 28 |
|
| 29 |
+
@lru_cache(None, typed=True)
|
| 30 |
def get_os_name() -> Union[str, None]:
|
| 31 |
"""Get the current OS name in the same format needed for browserforge
|
| 32 |
|
scrapling/engines/toolbelt/navigation.py
CHANGED
|
@@ -1,15 +1,13 @@
|
|
| 1 |
"""
|
| 2 |
Functions related to files and URLs
|
| 3 |
"""
|
| 4 |
-
|
| 5 |
-
import logging
|
| 6 |
import os
|
| 7 |
from urllib.parse import urlencode, urlparse
|
| 8 |
|
| 9 |
from playwright.sync_api import Route
|
| 10 |
|
| 11 |
from scrapling.core._types import Dict, Optional, Union
|
| 12 |
-
from scrapling.core.utils import
|
| 13 |
from scrapling.engines.constants import DEFAULT_DISABLED_RESOURCES
|
| 14 |
|
| 15 |
|
|
@@ -20,7 +18,7 @@ def intercept_route(route: Route) -> Union[Route, None]:
|
|
| 20 |
:return: PlayWright `Route` object
|
| 21 |
"""
|
| 22 |
if route.request.resource_type in DEFAULT_DISABLED_RESOURCES:
|
| 23 |
-
|
| 24 |
return route.abort()
|
| 25 |
return route.continue_()
|
| 26 |
|
|
@@ -97,7 +95,7 @@ def construct_cdp_url(cdp_url: str, query_params: Optional[Dict] = None) -> str:
|
|
| 97 |
raise ValueError(f"Invalid CDP URL: {str(e)}")
|
| 98 |
|
| 99 |
|
| 100 |
-
@
|
| 101 |
def js_bypass_path(filename: str) -> str:
|
| 102 |
"""Takes the base filename of JS file inside the `bypasses` folder then return the full path of it
|
| 103 |
|
|
|
|
| 1 |
"""
|
| 2 |
Functions related to files and URLs
|
| 3 |
"""
|
|
|
|
|
|
|
| 4 |
import os
|
| 5 |
from urllib.parse import urlencode, urlparse
|
| 6 |
|
| 7 |
from playwright.sync_api import Route
|
| 8 |
|
| 9 |
from scrapling.core._types import Dict, Optional, Union
|
| 10 |
+
from scrapling.core.utils import log, lru_cache
|
| 11 |
from scrapling.engines.constants import DEFAULT_DISABLED_RESOURCES
|
| 12 |
|
| 13 |
|
|
|
|
| 18 |
:return: PlayWright `Route` object
|
| 19 |
"""
|
| 20 |
if route.request.resource_type in DEFAULT_DISABLED_RESOURCES:
|
| 21 |
+
log.debug(f'Blocking background resource "{route.request.url}" of type "{route.request.resource_type}"')
|
| 22 |
return route.abort()
|
| 23 |
return route.continue_()
|
| 24 |
|
|
|
|
| 95 |
raise ValueError(f"Invalid CDP URL: {str(e)}")
|
| 96 |
|
| 97 |
|
| 98 |
+
@lru_cache(None, typed=True)
|
| 99 |
def js_bypass_path(filename: str) -> str:
|
| 100 |
"""Takes the base filename of JS file inside the `bypasses` folder then return the full path of it
|
| 101 |
|
scrapling/parser.py
CHANGED
|
@@ -18,12 +18,12 @@ from scrapling.core.storage_adaptors import (SQLiteStorageSystem,
|
|
| 18 |
StorageSystemMixin, _StorageTools)
|
| 19 |
from scrapling.core.translator import HTMLTranslator
|
| 20 |
from scrapling.core.utils import (clean_spaces, flatten, html_forbidden,
|
| 21 |
-
is_jsonable,
|
| 22 |
|
| 23 |
|
| 24 |
class Adaptor(SelectorsGeneration):
|
| 25 |
__slots__ = (
|
| 26 |
-
'url', 'encoding', '__auto_match_enabled', '_root', '_storage',
|
| 27 |
'__keep_comments', '__huge_tree_enabled', '__attributes', '__text', '__tag',
|
| 28 |
'__keep_cdata', '__raw_body'
|
| 29 |
)
|
|
@@ -41,7 +41,6 @@ class Adaptor(SelectorsGeneration):
|
|
| 41 |
auto_match: Optional[bool] = True,
|
| 42 |
storage: Any = SQLiteStorageSystem,
|
| 43 |
storage_args: Optional[Dict] = None,
|
| 44 |
-
debug: Optional[bool] = True,
|
| 45 |
**kwargs
|
| 46 |
):
|
| 47 |
"""The main class that works as a wrapper for the HTML input data. Using this class, you can search for elements
|
|
@@ -67,7 +66,6 @@ class Adaptor(SelectorsGeneration):
|
|
| 67 |
:param storage: The storage class to be passed for auto-matching functionalities, see ``Docs`` for more info.
|
| 68 |
:param storage_args: A dictionary of ``argument->value`` pairs to be passed for the storage class.
|
| 69 |
If empty, default values will be used.
|
| 70 |
-
:param debug: Enable debug mode
|
| 71 |
"""
|
| 72 |
if root is None and not body and text is None:
|
| 73 |
raise ValueError("Adaptor class needs text, body, or root arguments to work")
|
|
@@ -106,7 +104,6 @@ class Adaptor(SelectorsGeneration):
|
|
| 106 |
|
| 107 |
self._root = root
|
| 108 |
|
| 109 |
-
setup_basic_logging(level='debug' if debug else 'info')
|
| 110 |
self.__auto_match_enabled = auto_match
|
| 111 |
|
| 112 |
if self.__auto_match_enabled:
|
|
@@ -117,7 +114,7 @@ class Adaptor(SelectorsGeneration):
|
|
| 117 |
}
|
| 118 |
|
| 119 |
if not hasattr(storage, '__wrapped__'):
|
| 120 |
-
raise ValueError("Storage class must be wrapped with
|
| 121 |
|
| 122 |
if not issubclass(storage.__wrapped__, StorageSystemMixin):
|
| 123 |
raise ValueError("Storage system must be inherited from class `StorageSystemMixin`")
|
|
@@ -132,7 +129,6 @@ class Adaptor(SelectorsGeneration):
|
|
| 132 |
# For selector stuff
|
| 133 |
self.__attributes = None
|
| 134 |
self.__tag = None
|
| 135 |
-
self.__debug = debug
|
| 136 |
# No need to check if all response attributes exist or not because if `status` exist, then the rest exist (Save some CPU cycles for speed)
|
| 137 |
self.__response_data = {
|
| 138 |
key: getattr(self, key) for key in ('status', 'reason', 'cookies', 'headers', 'request_headers',)
|
|
@@ -164,7 +160,7 @@ class Adaptor(SelectorsGeneration):
|
|
| 164 |
text='', body=b'', # Since root argument is provided, both `text` and `body` will be ignored so this is just a filler
|
| 165 |
url=self.url, encoding=self.encoding, auto_match=self.__auto_match_enabled,
|
| 166 |
keep_comments=self.__keep_comments, keep_cdata=self.__keep_cdata,
|
| 167 |
-
huge_tree=self.__huge_tree_enabled,
|
| 168 |
**self.__response_data
|
| 169 |
)
|
| 170 |
return element
|
|
@@ -417,10 +413,10 @@ class Adaptor(SelectorsGeneration):
|
|
| 417 |
if score_table:
|
| 418 |
highest_probability = max(score_table.keys())
|
| 419 |
if score_table[highest_probability] and highest_probability >= percentage:
|
| 420 |
-
|
| 421 |
-
|
| 422 |
for percent in tuple(sorted(score_table.keys(), reverse=True))[:5]:
|
| 423 |
-
|
| 424 |
if not adaptor_type:
|
| 425 |
return score_table[highest_probability]
|
| 426 |
return self.__convert_results(score_table[highest_probability])
|
|
@@ -546,7 +542,7 @@ class Adaptor(SelectorsGeneration):
|
|
| 546 |
|
| 547 |
if selected_elements:
|
| 548 |
if not self.__auto_match_enabled and auto_save:
|
| 549 |
-
|
| 550 |
|
| 551 |
elif self.__auto_match_enabled and auto_save:
|
| 552 |
self.save(selected_elements[0], identifier or selector)
|
|
@@ -565,7 +561,7 @@ class Adaptor(SelectorsGeneration):
|
|
| 565 |
return self.__convert_results(selected_elements)
|
| 566 |
|
| 567 |
elif not self.__auto_match_enabled and auto_match:
|
| 568 |
-
|
| 569 |
|
| 570 |
return self.__convert_results(selected_elements)
|
| 571 |
|
|
@@ -769,7 +765,7 @@ class Adaptor(SelectorsGeneration):
|
|
| 769 |
|
| 770 |
self._storage.save(element, identifier)
|
| 771 |
else:
|
| 772 |
-
|
| 773 |
"Can't use Auto-match features with disabled globally, you have to start a new class instance."
|
| 774 |
)
|
| 775 |
|
|
@@ -783,7 +779,7 @@ class Adaptor(SelectorsGeneration):
|
|
| 783 |
if self.__auto_match_enabled:
|
| 784 |
return self._storage.retrieve(identifier)
|
| 785 |
|
| 786 |
-
|
| 787 |
"Can't use Auto-match features with disabled globally, you have to start a new class instance."
|
| 788 |
)
|
| 789 |
|
|
|
|
| 18 |
StorageSystemMixin, _StorageTools)
|
| 19 |
from scrapling.core.translator import HTMLTranslator
|
| 20 |
from scrapling.core.utils import (clean_spaces, flatten, html_forbidden,
|
| 21 |
+
is_jsonable, log)
|
| 22 |
|
| 23 |
|
| 24 |
class Adaptor(SelectorsGeneration):
|
| 25 |
__slots__ = (
|
| 26 |
+
'url', 'encoding', '__auto_match_enabled', '_root', '_storage',
|
| 27 |
'__keep_comments', '__huge_tree_enabled', '__attributes', '__text', '__tag',
|
| 28 |
'__keep_cdata', '__raw_body'
|
| 29 |
)
|
|
|
|
| 41 |
auto_match: Optional[bool] = True,
|
| 42 |
storage: Any = SQLiteStorageSystem,
|
| 43 |
storage_args: Optional[Dict] = None,
|
|
|
|
| 44 |
**kwargs
|
| 45 |
):
|
| 46 |
"""The main class that works as a wrapper for the HTML input data. Using this class, you can search for elements
|
|
|
|
| 66 |
:param storage: The storage class to be passed for auto-matching functionalities, see ``Docs`` for more info.
|
| 67 |
:param storage_args: A dictionary of ``argument->value`` pairs to be passed for the storage class.
|
| 68 |
If empty, default values will be used.
|
|
|
|
| 69 |
"""
|
| 70 |
if root is None and not body and text is None:
|
| 71 |
raise ValueError("Adaptor class needs text, body, or root arguments to work")
|
|
|
|
| 104 |
|
| 105 |
self._root = root
|
| 106 |
|
|
|
|
| 107 |
self.__auto_match_enabled = auto_match
|
| 108 |
|
| 109 |
if self.__auto_match_enabled:
|
|
|
|
| 114 |
}
|
| 115 |
|
| 116 |
if not hasattr(storage, '__wrapped__'):
|
| 117 |
+
raise ValueError("Storage class must be wrapped with lru_cache decorator, see docs for info")
|
| 118 |
|
| 119 |
if not issubclass(storage.__wrapped__, StorageSystemMixin):
|
| 120 |
raise ValueError("Storage system must be inherited from class `StorageSystemMixin`")
|
|
|
|
| 129 |
# For selector stuff
|
| 130 |
self.__attributes = None
|
| 131 |
self.__tag = None
|
|
|
|
| 132 |
# No need to check if all response attributes exist or not because if `status` exist, then the rest exist (Save some CPU cycles for speed)
|
| 133 |
self.__response_data = {
|
| 134 |
key: getattr(self, key) for key in ('status', 'reason', 'cookies', 'headers', 'request_headers',)
|
|
|
|
| 160 |
text='', body=b'', # Since root argument is provided, both `text` and `body` will be ignored so this is just a filler
|
| 161 |
url=self.url, encoding=self.encoding, auto_match=self.__auto_match_enabled,
|
| 162 |
keep_comments=self.__keep_comments, keep_cdata=self.__keep_cdata,
|
| 163 |
+
huge_tree=self.__huge_tree_enabled,
|
| 164 |
**self.__response_data
|
| 165 |
)
|
| 166 |
return element
|
|
|
|
| 413 |
if score_table:
|
| 414 |
highest_probability = max(score_table.keys())
|
| 415 |
if score_table[highest_probability] and highest_probability >= percentage:
|
| 416 |
+
log.debug(f'Highest probability was {highest_probability}%')
|
| 417 |
+
log.debug('Top 5 best matching elements are: ')
|
| 418 |
for percent in tuple(sorted(score_table.keys(), reverse=True))[:5]:
|
| 419 |
+
log.debug(f'{percent} -> {self.__convert_results(score_table[percent])}')
|
| 420 |
if not adaptor_type:
|
| 421 |
return score_table[highest_probability]
|
| 422 |
return self.__convert_results(score_table[highest_probability])
|
|
|
|
| 542 |
|
| 543 |
if selected_elements:
|
| 544 |
if not self.__auto_match_enabled and auto_save:
|
| 545 |
+
log.warning("Argument `auto_save` will be ignored because `auto_match` wasn't enabled on initialization. Check docs for more info.")
|
| 546 |
|
| 547 |
elif self.__auto_match_enabled and auto_save:
|
| 548 |
self.save(selected_elements[0], identifier or selector)
|
|
|
|
| 561 |
return self.__convert_results(selected_elements)
|
| 562 |
|
| 563 |
elif not self.__auto_match_enabled and auto_match:
|
| 564 |
+
log.warning("Argument `auto_match` will be ignored because `auto_match` wasn't enabled on initialization. Check docs for more info.")
|
| 565 |
|
| 566 |
return self.__convert_results(selected_elements)
|
| 567 |
|
|
|
|
| 765 |
|
| 766 |
self._storage.save(element, identifier)
|
| 767 |
else:
|
| 768 |
+
log.critical(
|
| 769 |
"Can't use Auto-match features with disabled globally, you have to start a new class instance."
|
| 770 |
)
|
| 771 |
|
|
|
|
| 779 |
if self.__auto_match_enabled:
|
| 780 |
return self._storage.retrieve(identifier)
|
| 781 |
|
| 782 |
+
log.critical(
|
| 783 |
"Can't use Auto-match features with disabled globally, you have to start a new class instance."
|
| 784 |
)
|
| 785 |
|
tests/parser/test_automatch.py
CHANGED
|
@@ -42,8 +42,8 @@ class TestParserAutoMatch(unittest.TestCase):
|
|
| 42 |
</div>
|
| 43 |
'''
|
| 44 |
|
| 45 |
-
old_page = Adaptor(original_html, url='example.com', auto_match=True, debug=True)
|
| 46 |
-
new_page = Adaptor(changed_html, url='example.com', auto_match=True, debug=True)
|
| 47 |
|
| 48 |
# 'p1' was used as ID and now it's not and all the path elements have changes
|
| 49 |
# Also at the same time testing auto-match vs combined selectors
|
|
|
|
| 42 |
</div>
|
| 43 |
'''
|
| 44 |
|
| 45 |
+
old_page = Adaptor(original_html, url='example.com', auto_match=True)
|
| 46 |
+
new_page = Adaptor(changed_html, url='example.com', auto_match=True)
|
| 47 |
|
| 48 |
# 'p1' was used as ID and now it's not and all the path elements have changes
|
| 49 |
# Also at the same time testing auto-match vs combined selectors
|
tests/parser/test_general.py
CHANGED
|
@@ -74,7 +74,7 @@ class TestParser(unittest.TestCase):
|
|
| 74 |
</body>
|
| 75 |
</html>
|
| 76 |
'''
|
| 77 |
-
self.page = Adaptor(self.html, auto_match=False, debug=False)
|
| 78 |
|
| 79 |
def test_css_selector(self):
|
| 80 |
"""Test Selecting elements with complex CSS selectors"""
|
|
@@ -273,7 +273,7 @@ class TestParser(unittest.TestCase):
|
|
| 273 |
large_html = '<html><body>' + '<div class="item">' * 5000 + '</div>' * 5000 + '</body></html>'
|
| 274 |
|
| 275 |
start_time = time.time()
|
| 276 |
-
parsed = Adaptor(large_html, auto_match=False, debug=False)
|
| 277 |
elements = parsed.css('.item')
|
| 278 |
end_time = time.time()
|
| 279 |
|
|
|
|
| 74 |
</body>
|
| 75 |
</html>
|
| 76 |
'''
|
| 77 |
+
self.page = Adaptor(self.html, auto_match=False)
|
| 78 |
|
| 79 |
def test_css_selector(self):
|
| 80 |
"""Test Selecting elements with complex CSS selectors"""
|
|
|
|
| 273 |
large_html = '<html><body>' + '<div class="item">' * 5000 + '</div>' * 5000 + '</body></html>'
|
| 274 |
|
| 275 |
start_time = time.time()
|
| 276 |
+
parsed = Adaptor(large_html, auto_match=False)
|
| 277 |
elements = parsed.css('.item')
|
| 278 |
end_time = time.time()
|
| 279 |
|