Scrapling / scrapling /core /translator.py
Karim shoair
fix: remove forgotten unused code
a7cc20d
"""
Most of this file is an adapted version of the parsel library's translator, modified for one important reason:
to add the pseudo-elements ``::text`` and ``::attr(ATTR_NAME)`` so that we match the Parsel/Scrapy selector format, which will be important in future releases. But most importantly,
it means you don't have to learn a new selector API, like what bs4 did with soupsieve :)
If you want to learn about this, head to https://cssselect.readthedocs.io/en/latest/#cssselect.FunctionalPseudoElement
"""
from functools import lru_cache
from cssselect import HTMLTranslator as OriginalHTMLTranslator
from cssselect.xpath import ExpressionError, XPathExpr as OriginalXPathExpr
from cssselect.parser import Element, FunctionalPseudoElement, PseudoElement
from scrapling.core._types import Any, Protocol, Self
class XPathExpr(OriginalXPathExpr):
    """XPath expression that can also target text nodes or an attribute value.

    Extends cssselect's ``XPathExpr`` with two extra pieces of state that are
    only applied when the expression is rendered through ``str()``:

    * ``textnode`` - when true, the rendered path selects ``text()`` nodes.
    * ``attribute`` - when not ``None``, ``/@<attribute>`` is appended to the
      rendered path.
    """

    # Set when the selector ended with the ``::text`` pseudo-element.
    textnode: bool = False
    # Attribute name given to ``::attr(NAME)``; ``None`` when not requested.
    attribute: str | None = None

    @classmethod
    def from_xpath(
        cls,
        xpath: OriginalXPathExpr,
        textnode: bool = False,
        attribute: str | None = None,
    ) -> Self:
        """Build an instance of this class from an existing XPath expression.

        :param xpath: The expression whose ``path``, ``element`` and ``condition`` are copied.
        :param textnode: Value to store on the new expression's ``textnode`` flag.
        :param attribute: Value to store on the new expression's ``attribute`` field.
        :return: A new expression carrying the copied parts and the given flags.
        """
        x = cls(path=xpath.path, element=xpath.element, condition=xpath.condition)
        x.textnode = textnode
        x.attribute = attribute
        return x

    def __str__(self) -> str:
        """Render the expression, applying the text-node and attribute suffixes."""
        path = super().__str__()
        if self.textnode:
            if path == "*":  # pragma: no cover
                # A bare wildcard is replaced entirely by the text-node test.
                path = "text()"
            elif path.endswith("::*/*"):  # pragma: no cover
                # Swap the trailing "*/*" for "text()", keeping the axis prefix.
                path = path[:-3] + "text()"
            else:
                path += "/text()"

        if self.attribute is not None:
            if path.endswith("::*/*"):  # pragma: no cover
                # Drop the trailing "/*" so the attribute hangs off the axis wildcard.
                path = path[:-2]
            path += f"/@{self.attribute}"

        return path

    def join(
        self: Self,
        combiner: str,
        other: OriginalXPathExpr,
        *args: Any,
        **kwargs: Any,
    ) -> Self:
        """Join ``other`` onto this expression and adopt its pseudo-element state.

        The actual joining is delegated to the parent class; afterwards this
        expression takes over ``other``'s ``textnode`` and ``attribute`` values,
        so the pseudo-element of the right-most part of the selector wins.

        :param combiner: Passed through unchanged to the parent ``join``.
        :param other: The expression to join; must be an instance of this class.
        :return: This same expression, to allow chaining.
        :raises ValueError: If ``other`` is not an instance of this ``XPathExpr`` class.
        """
        if not isinstance(other, XPathExpr):
            raise ValueError(  # pragma: no cover
                f"Expressions of type {__name__}.XPathExpr can only join expressions"
                f" of the same type (or its descendants), got {type(other)}"
            )
        super().join(combiner, other, *args, **kwargs)
        self.textnode = other.textnode
        self.attribute = other.attribute
        return self
# e.g. cssselect.GenericTranslator, cssselect.HTMLTranslator
class TranslatorProtocol(Protocol):
    """Structural type describing the translator methods the mixin relies on."""

    def xpath_element(self, selector: Element) -> OriginalXPathExpr:  # pyright: ignore # pragma: no cover
        """Translate an element selector into an XPath expression (stub only)."""
        ...

    def css_to_xpath(self, css: str, prefix: str = ...) -> str:  # pyright: ignore # pragma: no cover
        """Translate a CSS query string into an XPath string (stub only)."""
        ...
class TranslatorMixin:
    """Mixin that teaches a CSS translator about pseudo-elements.

    Handlers are looked up by name at translation time, so supporting a new
    pseudo-element only requires adding a suitably named method.
    The ones provided here are ``::text`` and ``::attr(ATTR_NAME)``.
    """

    def xpath_element(self: TranslatorProtocol, selector: Element) -> XPathExpr:
        """Translate an element selector and wrap the result in this module's ``XPathExpr``."""
        # https://github.com/python/mypy/issues/14757
        base_expr = super().xpath_element(selector)  # type: ignore[safe-super]
        return XPathExpr.from_xpath(base_expr)

    def xpath_pseudo_element(self, xpath: OriginalXPathExpr, pseudo_element: PseudoElement) -> OriginalXPathExpr:
        """
        Route a pseudo-element to the handler method named after it.

        Functional pseudo-elements resolve to ``xpath_<name>_functional_pseudo_element``
        and receive both the expression and the pseudo-element; simple ones resolve to
        ``xpath_<name>_simple_pseudo_element`` and receive only the expression.
        Dashes in the name are turned into underscores before the lookup.

        :raises ExpressionError: If no handler with the expected name exists.
        """
        if isinstance(pseudo_element, FunctionalPseudoElement):
            normalized = pseudo_element.name.replace("-", "_")
            handler = getattr(self, f"xpath_{normalized}_functional_pseudo_element", None)
            if not handler:  # pragma: no cover
                raise ExpressionError(f"The functional pseudo-element ::{pseudo_element.name}() is unknown")
            return handler(xpath, pseudo_element)

        # Anything else is a simple pseudo-element, given as a plain name.
        normalized = pseudo_element.replace("-", "_")
        handler = getattr(self, f"xpath_{normalized}_simple_pseudo_element", None)
        if not handler:  # pragma: no cover
            raise ExpressionError(f"The pseudo-element ::{pseudo_element} is unknown")
        return handler(xpath)

    @staticmethod
    def xpath_attr_functional_pseudo_element(xpath: OriginalXPathExpr, function: FunctionalPseudoElement) -> XPathExpr:
        """Handle ``::attr(NAME)``: produce an expression that selects the attribute's value."""
        arg_types = function.argument_types()
        # Exactly one argument is accepted, and it must be a string or an identifier.
        if arg_types != ["STRING"] and arg_types != ["IDENT"]:  # pragma: no cover
            raise ExpressionError(f"Expected a single string or ident for ::attr(), got {function.arguments!r}")
        attr_name = function.arguments[0].value
        return XPathExpr.from_xpath(xpath, attribute=attr_name)

    @staticmethod
    def xpath_text_simple_pseudo_element(xpath: OriginalXPathExpr) -> XPathExpr:
        """Handle ``::text``: produce an expression that selects text nodes."""
        return XPathExpr.from_xpath(xpath, textnode=True)
class HTMLTranslator(TranslatorMixin, OriginalHTMLTranslator):
    """cssselect's HTML translator combined with the pseudo-element mixin."""

    def css_to_xpath(self, css: str, prefix: str = "descendant-or-self::") -> str:
        """Translate ``css`` into an XPath string by delegating to the parent implementation.

        :param css: The CSS query to translate.
        :param prefix: Prefix handed to the parent translator; defaults to ``descendant-or-self::``.
        :return: The XPath string produced by the parent translator.
        """
        translated = super().css_to_xpath(css, prefix)
        return translated
# Single shared translator instance used by the cached helper below.
translator = HTMLTranslator()


# Exposed as a plain function rather than the translator's bound method
# to avoid a Pyright override error.
@lru_cache(maxsize=256)
def css_to_xpath(query: str) -> str:
    """Translate a CSS query into its XPath equivalent.

    Results are memoized for up to 256 distinct query strings.

    :param query: The CSS query to translate.
    :return: The XPath string produced by the shared translator.
    """
    xpath_query = translator.css_to_xpath(query)
    return xpath_query