# File size: 3,684 Bytes
import logging
from itertools import chain
from re import compile as re_compile
from contextvars import ContextVar, Token
from lxml import html
from scrapling.core._types import Any, Dict, Iterable, List
# Using cache on top of a class is a brilliant way to achieve a Singleton design pattern without much code
from functools import lru_cache # isort:skip
# Node types filtered out with isinstance() when element tags are collected in `_StorageTools` (currently only HTML comments)
html_forbidden = (html.HtmlComment,)
# Translation table used by `clean_spaces`: a tab becomes a space, "\n" and "\r" are deleted
__CLEANING_TABLE__ = str.maketrans({"\t": " ", "\n": None, "\r": None})
# Matches a run of one or more space characters
__CONSECUTIVE_SPACES_REGEX__ = re_compile(r" +")
@lru_cache(1, typed=True)
def setup_logger():
    """Build the "scrapling" logger and give it a console handler.

    The logger level is set to INFO. A stream handler with a timestamped
    format is attached only when the logger has no handlers yet. The result
    is cached, so repeated calls return the same logger object.

    :returns: logging.Logger: Configured logger instance
    """
    scrapling_logger = logging.getLogger("scrapling")
    scrapling_logger.setLevel(logging.INFO)

    stream_handler = logging.StreamHandler()
    stream_handler.setFormatter(
        logging.Formatter(
            fmt="[%(asctime)s] %(levelname)s: %(message)s",
            datefmt="%Y-%m-%d %H:%M:%S",
        )
    )

    # A logger that already has handlers is returned as-is, so no second handler is stacked on it
    if scrapling_logger.handlers:
        return scrapling_logger

    scrapling_logger.addHandler(stream_handler)
    return scrapling_logger
# Context variable holding the active logger; its default is the logger returned by setup_logger(), evaluated once when this statement runs
_current_logger: ContextVar[logging.Logger] = ContextVar("scrapling_logger", default=setup_logger())
class LoggerProxy:
    """Stand-in object that forwards attribute lookups to the logger of the current context."""

    def __getattr__(self, name: str):
        # The logger is fetched from the context variable on every access instead of being cached here
        active_logger = _current_logger.get()
        return getattr(active_logger, name)
# Shared proxy instance; attribute access on it is forwarded to the logger held in `_current_logger`
log = LoggerProxy()
def set_logger(logger: logging.Logger) -> Token:
    """Install ``logger`` as the logger of the current context.

    :param logger: Logger to store in the context variable.
    :returns: Token that ``reset_logger`` accepts to undo this change.
    """
    reset_token = _current_logger.set(logger)
    return reset_token
def reset_logger(token: Token) -> None:
    """Put back the context logger that was active before the matching ``set_logger`` call.

    :param token: Token previously returned by ``set_logger``.
    """
    _current_logger.reset(token)
    return None
def flatten(lst: Iterable[Any]) -> List[Any]:
    """Remove one level of nesting: gather the items of every iterable in ``lst`` into a single list."""
    return [item for inner in lst for item in inner]
def _is_iterable(obj: Any) -> bool:
    """Tell whether ``obj`` is a list or a tuple (instances of their subclasses count too)."""
    # Used only in the regex functions: the value must be iterable, but a string/bytes must not qualify
    return isinstance(obj, list) or isinstance(obj, tuple)
class _StorageTools:
    """Helpers that describe an lxml HTML element as a plain dictionary."""

    @staticmethod
    def __clean_attributes(element: html.HtmlElement, forbidden: tuple = ()) -> Dict:
        """Return the element's attributes with whitespace stripped from the values.

        Attributes whose value is empty or whitespace-only, and attributes whose
        name appears in ``forbidden``, are left out.

        :param element: Element whose ``attrib`` mapping is read.
        :param forbidden: Attribute names to exclude.
        :returns: Mapping of attribute name to stripped value; empty when the element has no attributes.
        """
        if not element.attrib:
            return {}

        cleaned = {}
        for name, value in element.attrib.items():
            if not value or name in forbidden:
                continue
            # Strip once and reuse the result for both the emptiness check and the stored value
            stripped = value.strip()
            if stripped:
                cleaned[name] = stripped
        return cleaned

    @classmethod
    def element_to_dict(cls, element: html.HtmlElement) -> Dict:
        """Describe ``element`` and its immediate surroundings as a dictionary.

        The result always holds "tag", "attributes", "text" and "path". When the
        element has a parent, "parent_name", "parent_attribs" and "parent_text"
        are added, plus "siblings" if the parent has other children. "children"
        is added if the element has children. Nodes whose type is listed in
        ``html_forbidden`` are left out of both "siblings" and "children".

        :param element: Element to describe.
        :returns: Dictionary with the keys listed above.
        """
        parent = element.getparent()
        result = {
            "tag": str(element.tag),
            "attributes": cls.__clean_attributes(element),
            "text": element.text.strip() if element.text else None,
            "path": cls._get_element_path(element),
        }
        if parent is not None:
            result.update(
                {
                    "parent_name": parent.tag,
                    "parent_attribs": dict(parent.attrib),
                    "parent_text": parent.text.strip() if parent.text else None,
                }
            )

            # Forbidden nodes (comments) are skipped here just as they are for children:
            # a comment's `.tag` is a factory function, not a tag name, so it must not be recorded.
            siblings = [
                child.tag
                for child in parent.iterchildren()
                if child != element and not isinstance(child, html_forbidden)
            ]
            if siblings:
                result["siblings"] = tuple(siblings)

        children = [child.tag for child in element.iterchildren() if not isinstance(child, html_forbidden)]
        if children:
            result["children"] = tuple(children)

        return result

    @classmethod
    def _get_element_path(cls, element: html.HtmlElement):
        """Return the tags from the topmost ancestor down to ``element`` as a tuple.

        :param element: Element whose ancestry is walked via ``getparent()``.
        :returns: Tuple of tags, outermost ancestor first and ``element`` last.
        """
        # Walk upwards collecting tags, then reverse so the outermost ancestor comes first
        tags = []
        node = element
        while node is not None:
            tags.append(node.tag)
            node = node.getparent()
        tags.reverse()
        return tuple(tags)
@lru_cache(128, typed=True)
def clean_spaces(string):
    """Normalize the whitespace in ``string``.

    The string is first passed through ``__CLEANING_TABLE__`` (tabs become
    spaces, newlines and carriage returns are removed), then every run of
    spaces is collapsed into a single space. Results are cached by
    ``lru_cache`` with a maximum of 128 entries.
    """
    return __CONSECUTIVE_SPACES_REGEX__.sub(" ", string.translate(__CLEANING_TABLE__))
|