|
|
|
|
|
|
|
|
from copy import deepcopy |
|
|
|
|
|
from ultradata_math_parser.utils import * |
|
|
from ultradata_math_parser.parsers.base_parser import BaseParser |
|
|
from ultradata_math_parser.parsers.title_parser import TitleParser |
|
|
|
|
|
|
|
|
class ArticleParser(BaseParser):
    """Parser that extracts the main article body from an HTML document.

    The shared machinery (tag conversion, cleaning, XPath extraction,
    fallbacks, table/image stripping) is inherited from ``BaseParser``;
    this class only wires those steps together in ``extract``.
    """

    def __init__(self) -> None:
        """Initialise the parser with the ``BaseParser`` defaults."""
        super().__init__()

    def extract(self, html="", **kwargs) -> dict:
        """Extract the article content and title from ``html``.

        Args:
            html: Raw HTML markup of the page.
            **kwargs: Optional settings:
                ``base_url`` (default ``""``) - URL of the page; replaced by
                the document's ``<base href>`` when that contains ``"http"``.
                ``process_math``, ``preserve_math_containers``,
                ``include_tables``, ``include_images`` - when given, they
                overwrite the instance attributes of the same name.

        Returns:
            A dict with the keys ``xp_num``, ``drop_list``, ``html``,
            ``title``, ``base_url``, ``fallback_strategy`` and
            ``text_length``.

        Raises:
            ValueError: If ``load_html`` returns ``None`` for the input.
        """
        base_url = kwargs.get("base_url", "")

        # NOTE: overrides are assigned to the instance, so they stay in
        # effect for later calls on the same parser object.
        self.process_math = kwargs.get("process_math", self.process_math)
        self.preserve_math_containers = kwargs.get(
            "preserve_math_containers", self.preserve_math_containers
        )
        self.include_tables = kwargs.get("include_tables", self.include_tables)
        self.include_images = kwargs.get("include_images", self.include_images)

        # Turn every spelling of a non-breaking space (raw U+00A0 and the
        # numeric / named entities) into a plain space before parsing.
        # Escaped literals keep the three forms distinguishable in the source.
        # NOTE(review): confirm this is the intended set of NBSP forms.
        for nbsp_form in ("\u00a0", "&#160;", "&nbsp;"):
            html = html.replace(nbsp_form, " ")

        tree = load_html(html)
        if tree is None:
            raise ValueError("load_html() returned None: the HTML could not be parsed")

        title = TitleParser().process(tree)

        # An absolute <base href> in the document takes precedence over the
        # base_url supplied by the caller.
        base_href = tree.xpath("//base/@href")
        if base_href and "http" in base_href[0]:
            base_url = base_href[0]

        # Site-specific cleanup for blog.csdn.net: remove the
        # <ul class="pre-numbering"> nodes inside the content container.
        if "://blog.csdn.net/" in base_url:
            for numbering_node in tree.xpath(
                '//div[@id="content_views"]//ul[@class="pre-numbering"]'
            ):
                self.remove_node(numbering_node)

        # raw_tree is kept aside for the fallback step; working_tree is the
        # copy handed to the conversion pipeline.
        raw_tree = deepcopy(tree)
        working_tree = deepcopy(tree)

        # Stage 1: tag conversion, then table/image removal.
        # (presumably gated on include_tables / include_images - confirm in
        # BaseParser)
        format_tree = self.convert_tags(working_tree, base_url=base_url)
        format_tree = self._remove_tables_from_tree(format_tree)
        format_tree = self._remove_images_from_tree(format_tree)

        # Stage 2: tag cleaning, followed by the same removal pass again.
        normal_tree = self.clean_tags(format_tree)
        normal_tree = self._remove_tables_from_tree(normal_tree)
        normal_tree = self._remove_images_from_tree(normal_tree)

        # Snapshot taken before xp_1_5 / prune_unwanted_sections run on
        # normal_tree; this copy is what apply_fallbacks receives.
        fallback_tree = deepcopy(normal_tree)

        # Stage 3: primary extraction. When xp_1_5 reports "others", the
        # subtree and drop list come from prune_unwanted_sections instead.
        subtree, xp_num, drop_list = self.xp_1_5(normal_tree)
        if xp_num == "others":
            subtree, drop_list = self.prune_unwanted_sections(normal_tree)
        body_html = self.get_content_html(subtree, xp_num, base_url)

        # Stage 4: let the fallback logic keep or replace the primary result.
        body_html, fallback_strategy = self.apply_fallbacks(
            primary_html=body_html,
            base_url=base_url,
            normal_tree=fallback_tree,
            raw_tree=raw_tree,
        )

        # Final table/image strip on the serialized HTML, since the fallback
        # step may have produced markup from a different tree.
        body_html = self._strip_tables_from_html(body_html)
        body_html = self._strip_images_from_html(body_html)

        text_length = self._text_length_from_html(body_html)

        return {
            "xp_num": xp_num,
            "drop_list": drop_list,
            "html": body_html,
            "title": title,
            "base_url": base_url,
            "fallback_strategy": fallback_strategy,
            "text_length": text_length,
        }
|
|
|