File size: 2,802 Bytes
a579dd2 |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 |
# -*- coding:utf-8 -*-
from copy import deepcopy
from ultradata_math_parser.utils import *
from ultradata_math_parser.parsers.base_parser import BaseParser
from ultradata_math_parser.parsers.title_parser import TitleParser
class ArticleParser(BaseParser):
    """Parser that extracts the title and main body HTML of an article page.

    All tree manipulation helpers used here (``convert_tags``, ``clean_tags``,
    ``xp_1_5``, ``prune_unwanted_sections``, ``get_content_html``,
    ``apply_fallbacks`` and the table/image strippers) are looked up on
    ``self`` and are not defined in this class.
    """

    def __init__(self) -> None:
        super().__init__()

    def extract(self, html: str = "", **kwargs) -> dict:
        """Extract the article body from ``html``.

        Args:
            html: Raw HTML source of the page.
            **kwargs: Optional settings. Recognised keys:
                ``base_url`` (default ``""``), ``process_math``,
                ``preserve_math_containers``, ``include_tables`` and
                ``include_images``. The last four default to the value
                currently stored on the instance.

        Returns:
            A dict with the keys ``xp_num``, ``drop_list``, ``html``,
            ``title``, ``base_url``, ``fallback_strategy`` and
            ``text_length``.

        Raises:
            ValueError: If ``load_html`` returns ``None`` for the input.
        """
        base_url = kwargs.get("base_url", "")

        # NOTE: the overrides are written back to the instance, so they stay
        # in effect for later extract() calls on the same parser object.
        self.process_math = kwargs.get("process_math", self.process_math)
        self.preserve_math_containers = kwargs.get(
            "preserve_math_containers", self.preserve_math_containers
        )
        self.include_tables = kwargs.get("include_tables", self.include_tables)
        self.include_images = kwargs.get("include_images", self.include_images)

        # Normalise non-breaking spaces to plain spaces before parsing. The
        # spellings are written out explicitly (entities and an escape for the
        # raw character) so the intent does not hinge on invisible characters
        # in the source file.
        for nbsp in ("&#160;", "&nbsp;", "\u00a0"):
            html = html.replace(nbsp, " ")

        tree = load_html(html)
        if tree is None:
            raise ValueError("failed to parse HTML: load_html() returned None")

        title = TitleParser().process(tree)

        # A <base href> containing "http" takes precedence over the
        # caller-supplied base_url.
        base_href = tree.xpath("//base/@href")
        if base_href and "http" in base_href[0]:
            base_url = base_href[0]

        # CSDN blog pages: drop the "pre-numbering" lists inside the content
        # container (presumably code line-number gutters -- TODO confirm).
        if "://blog.csdn.net/" in base_url:
            for dtree in tree.xpath(
                '//div[@id="content_views"]//ul[@class="pre-numbering"]'
            ):
                self.remove_node(dtree)

        # Snapshot taken before any tag conversion or cleaning; it is handed
        # to apply_fallbacks() below as raw_tree.
        raw_tree = deepcopy(tree)
        working_tree = deepcopy(tree)

        # Tag conversion, including handling of math tags.
        format_tree = self.convert_tags(working_tree, base_url=base_url)
        # NOTE(review): the _remove_* helpers presumably honour
        # include_tables / include_images -- confirm in the base class.
        format_tree = self._remove_tables_from_tree(format_tree)
        format_tree = self._remove_images_from_tree(format_tree)

        # Remove script, style and similar tags together with their content.
        normal_tree = self.clean_tags(format_tree)
        normal_tree = self._remove_tables_from_tree(normal_tree)
        normal_tree = self._remove_images_from_tree(normal_tree)

        # Copy made before xp_1_5()/prune_unwanted_sections() run on
        # normal_tree; it is handed to apply_fallbacks() as normal_tree.
        fallback_tree = deepcopy(normal_tree)

        subtree, xp_num, drop_list = self.xp_1_5(normal_tree)
        if xp_num == "others":
            # xp_1_5 reported "others": fall back to pruning the whole tree.
            subtree, drop_list = self.prune_unwanted_sections(normal_tree)

        body_html = self.get_content_html(subtree, xp_num, base_url)

        body_html, fallback_strategy = self.apply_fallbacks(
            primary_html=body_html,
            base_url=base_url,
            normal_tree=fallback_tree,
            raw_tree=raw_tree,
        )

        # Table/image stripping is applied once more on the final HTML, after
        # apply_fallbacks() has had the chance to replace body_html.
        body_html = self._strip_tables_from_html(body_html)
        body_html = self._strip_images_from_html(body_html)

        text_length = self._text_length_from_html(body_html)

        return {
            "xp_num": xp_num,
            "drop_list": drop_list,
            "html": body_html,
            "title": title,
            "base_url": base_url,
            "fallback_strategy": fallback_strategy,
            "text_length": text_length,
        }
|