File size: 2,000 Bytes
a579dd2 |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 |
# -*- coding:utf-8 -*-
import re
from ultradata_math_parser.utils import *
from ultradata_math_parser.parsers.base_parser import BaseParser
from ultradata_math_parser.parsers.title_parser import TitleParser
class CustomParser(BaseParser):
    """Parser that extracts title and body using caller-supplied XPath rules.

    The ``rule`` mapping passed to :meth:`extract` may contain:

    * ``"clean"``   -- iterable of XPath expressions; every matched node is
      removed before extraction.
    * ``"title"``   -- mapping with a ``"value"`` XPath used to get the title.
      When absent, ``TitleParser`` is used instead.
    * ``"content"`` -- mapping with a ``"value"`` XPath selecting the article
      body. Required; extraction fails with ``ValueError`` without it.
    """

    def __init__(self) -> None:
        super().__init__()

    def use_clean_rule(self, tree, clean_rules):
        """Remove every node matched by any XPath in *clean_rules*.

        Removal is delegated to ``self.remove_node`` (not defined in this
        class; presumably inherited from ``BaseParser``). The same *tree*
        object is returned so the call can be used in an assignment.
        """
        for clean_rule in clean_rules:
            for node in tree.xpath(clean_rule):
                self.remove_node(node)
        return tree

    def use_extract_rule(self, tree, extract_rule):
        """Evaluate ``extract_rule["value"]`` as an XPath against *tree*.

        Returns:
            If the expression contains ``/text()``, the matched text pieces
            joined together and stripped; otherwise the first matched item.

        Raises:
            KeyError: *extract_rule* has no ``"value"`` entry.
            IndexError: a non-text expression matched nothing.
        """
        xpath_expr = extract_rule["value"]
        matches = tree.xpath(xpath_expr)
        if "/text()" in xpath_expr:
            return "".join(matches).strip()
        return matches[0]

    def extract(self, html="", base_url="", rule=None, **kwargs) -> dict:
        """Extract the article body and title from *html* using *rule*.

        Args:
            html: Raw HTML document.
            base_url: Fallback base URL; overridden by a ``<base href>`` in
                the document when that href contains ``"http"``.
            rule: Extraction rules (see the class docstring). ``None`` is
                treated as an empty mapping.
            **kwargs: ``include_images`` (default ``False``) controls whether
                images are kept in the extracted body.

        Returns:
            A dict with keys ``xp_num`` (always ``"custom"``), ``drop_list``
            (always ``False``), ``html``, ``title``, ``base_url`` and
            ``text_length``.

        Raises:
            ValueError: the HTML could not be loaded, or the content rule is
                missing or failed to select a body node.
        """
        # Use None as the default instead of a shared mutable ``{}``.
        if rule is None:
            rule = {}

        self.include_images = kwargs.get("include_images", False)

        tree = load_html(html)
        if tree is None:
            raise ValueError("failed to load HTML document")

        # Prefer an absolute <base href> declared by the document itself.
        base_href = tree.xpath("//base/@href")
        if base_href and "http" in base_href[0]:
            base_url = base_href[0]

        if "clean" in rule:
            tree = self.use_clean_rule(tree, rule["clean"])

        # Get the title: explicit rule if given, generic title parser otherwise.
        if "title" not in rule:
            title = TitleParser().process(tree)
        else:
            title = self.use_extract_rule(tree, rule["title"])

        # Article region. Any failure here (missing "content" rule, invalid
        # XPath, no match) is reported as ValueError, as before, but without
        # trapping KeyboardInterrupt/SystemExit and with the cause preserved.
        try:
            body_tree = self.use_extract_rule(tree, rule["content"])
        except Exception as exc:
            raise ValueError(
                "failed to extract content with the supplied rule"
            ) from exc

        if not self.include_images:
            self._remove_images_from_tree(body_tree)

        body_html = tostring(body_tree, encoding=str)
        # NOTE(review): called regardless of include_images, as in the
        # original; presumably the helper checks the flag itself -- confirm.
        body_html = self._strip_images_from_html(body_html)
        text_length = self._text_length_from_html(body_html)

        return {
            "xp_num": "custom",
            "drop_list": False,
            "html": body_html,
            "title": title,
            "base_url": base_url,
            "text_length": text_length,
        }
|