""" Most of this file is an adapted version of the parsel library's translator with some modifications simply for 1 important reason... To add pseudo-elements ``::text`` and ``::attr(ATTR_NAME)`` so we match the Parsel/Scrapy selectors format which will be important in future releases but most importantly... So you don't have to learn a new selectors/api method like what bs4 done with soupsieve :) If you want to learn about this, head to https://cssselect.readthedocs.io/en/latest/#cssselect.FunctionalPseudoElement """ from functools import lru_cache from cssselect import HTMLTranslator as OriginalHTMLTranslator from cssselect.xpath import ExpressionError, XPathExpr as OriginalXPathExpr from cssselect.parser import Element, FunctionalPseudoElement, PseudoElement from scrapling.core._types import Any, Protocol, Self class XPathExpr(OriginalXPathExpr): textnode: bool = False attribute: str | None = None @classmethod def from_xpath( cls, xpath: OriginalXPathExpr, textnode: bool = False, attribute: str | None = None, ) -> Self: x = cls(path=xpath.path, element=xpath.element, condition=xpath.condition) x.textnode = textnode x.attribute = attribute return x def __str__(self) -> str: path = super().__str__() if self.textnode: if path == "*": # pragma: no cover path = "text()" elif path.endswith("::*/*"): # pragma: no cover path = path[:-3] + "text()" else: path += "/text()" if self.attribute is not None: if path.endswith("::*/*"): # pragma: no cover path = path[:-2] path += f"/@{self.attribute}" return path def join( self: Self, combiner: str, other: OriginalXPathExpr, *args: Any, **kwargs: Any, ) -> Self: if not isinstance(other, XPathExpr): raise ValueError( # pragma: no cover f"Expressions of type {__name__}.XPathExpr can ony join expressions" f" of the same type (or its descendants), got {type(other)}" ) super().join(combiner, other, *args, **kwargs) self.textnode = other.textnode self.attribute = other.attribute return self # e.g. cssselect.GenericTranslator, cssselect.HTMLTranslator class TranslatorProtocol(Protocol): def xpath_element(self, selector: Element) -> OriginalXPathExpr: # pyright: ignore # pragma: no cover pass def css_to_xpath(self, css: str, prefix: str = ...) -> str: # pyright: ignore # pragma: no cover pass class TranslatorMixin: """This mixin adds support to CSS pseudo elements via dynamic dispatch. Currently supported pseudo-elements are ``::text`` and ``::attr(ATTR_NAME)``. """ def xpath_element(self: TranslatorProtocol, selector: Element) -> XPathExpr: # https://github.com/python/mypy/issues/14757 xpath = super().xpath_element(selector) # type: ignore[safe-super] return XPathExpr.from_xpath(xpath) def xpath_pseudo_element(self, xpath: OriginalXPathExpr, pseudo_element: PseudoElement) -> OriginalXPathExpr: """ Dispatch method that transforms XPath to support the pseudo-element. """ if isinstance(pseudo_element, FunctionalPseudoElement): method_name = f"xpath_{pseudo_element.name.replace('-', '_')}_functional_pseudo_element" method = getattr(self, method_name, None) if not method: # pragma: no cover raise ExpressionError(f"The functional pseudo-element ::{pseudo_element.name}() is unknown") xpath = method(xpath, pseudo_element) else: method_name = f"xpath_{pseudo_element.replace('-', '_')}_simple_pseudo_element" method = getattr(self, method_name, None) if not method: # pragma: no cover raise ExpressionError(f"The pseudo-element ::{pseudo_element} is unknown") xpath = method(xpath) return xpath @staticmethod def xpath_attr_functional_pseudo_element(xpath: OriginalXPathExpr, function: FunctionalPseudoElement) -> XPathExpr: """Support selecting attribute values using ::attr() pseudo-element""" if function.argument_types() not in (["STRING"], ["IDENT"]): # pragma: no cover raise ExpressionError(f"Expected a single string or ident for ::attr(), got {function.arguments!r}") return XPathExpr.from_xpath(xpath, attribute=function.arguments[0].value) @staticmethod def xpath_text_simple_pseudo_element(xpath: OriginalXPathExpr) -> XPathExpr: """Support selecting text nodes using ::text pseudo-element""" return XPathExpr.from_xpath(xpath, textnode=True) class HTMLTranslator(TranslatorMixin, OriginalHTMLTranslator): def css_to_xpath(self, css: str, prefix: str = "descendant-or-self::") -> str: return super().css_to_xpath(css, prefix) translator = HTMLTranslator() # Using a function instead of the translator directly to avoid Pyright override error @lru_cache(maxsize=256) def css_to_xpath(query: str) -> str: """Return the translated XPath version of a given CSS query""" return translator.css_to_xpath(query)