# -*- coding:utf-8 -*- import re from ultradata_math_parser.utils import * from ultradata_math_parser.parsers.base_parser import BaseParser from ultradata_math_parser.parsers.title_parser import TitleParser class CustomParser(BaseParser): def __init__(self) -> None: super().__init__() def use_clean_rule(self, tree, clean_rules): for clean_rule in clean_rules: for x in tree.xpath(clean_rule): self.remove_node(x) return tree def use_extract_rule(self, tree, extract_rule): if "/text()" in extract_rule["value"]: return "".join(tree.xpath(extract_rule["value"])).strip() return tree.xpath(extract_rule["value"])[0] def extract(self, html="", base_url="", rule={}, **kwargs) -> dict: self.include_images = kwargs.get("include_images", False) tree = load_html(html) if tree is None: raise ValueError # base_url base_href = tree.xpath("//base/@href") if base_href and "http" in base_href[0]: base_url = base_href[0] if "clean" in rule: tree = self.use_clean_rule(tree, rule["clean"]) # 获取title if "title" not in rule: title = TitleParser().process(tree) else: title = self.use_extract_rule(tree, rule["title"]) # 文章区域 try: body_tree = self.use_extract_rule(tree, rule["content"]) except: raise ValueError if not self.include_images: self._remove_images_from_tree(body_tree) body_html = tostring(body_tree, encoding=str) body_html = self._strip_images_from_html(body_html) text_length = self._text_length_from_html(body_html) return { "xp_num": "custom", "drop_list": False, "html": body_html, "title": title, "base_url": base_url, "text_length": text_length, }