# NOTE(review): stray VCS metadata (author / commit message / commit hash)
# was accidentally committed as bare code lines; kept here as a comment so
# the module remains importable:
#   ZhouChuYue / init / a579dd2
# -*- coding:utf-8 -*-
import re
from ultradata_math_parser.config import Forum_XPATH, Unique_ID
from ultradata_math_parser.utils import *
from ultradata_math_parser.parsers.base_parser import BaseParser
from ultradata_math_parser.parsers.title_parser import TitleParser
class ForumParser(BaseParser):
    """Extract the main discussion content from forum-style HTML pages.

    Builds on BaseParser's generic extraction pipeline, then re-attaches
    forum-specific nodes (posts/replies matched by ``Forum_XPATH``) that the
    generic pruning would otherwise drop, keeping them in document order
    relative to the already-extracted body.
    """

    def __init__(self) -> None:
        super().__init__()

    def extract(self, html: str = "", base_url: str = "", **kwargs) -> dict:
        """Parse *html* and return the cleaned body HTML plus metadata.

        Keyword arguments may override the instance's configuration flags
        (``preserve_math_containers``, ``process_math``, ``include_tables``,
        ``include_images``) for this call only.

        Returns:
            dict with keys ``xp_num``, ``drop_list``, ``html``, ``title``,
            ``base_url`` and ``text_length``.

        Raises:
            ValueError: if *html* cannot be parsed into a tree.
        """
        # Per-call overrides of the instance configuration flags.
        self.preserve_math_containers = kwargs.get(
            "preserve_math_containers", self.preserve_math_containers
        )
        self.process_math = kwargs.get("process_math", self.process_math)
        self.include_tables = kwargs.get("include_tables", self.include_tables)
        self.include_images = kwargs.get("include_images", self.include_images)
        # Forum pages keep comments/replies — they are part of the content.
        self.need_comment = True
        # NOTE(review): these replacements target unusual whitespace code
        # points (they render like plain spaces here) — verify the exact
        # characters against the upstream source.
        html = html.replace(" ", " ").replace(" ", " ")
        tree = load_html(html)
        if tree is None:
            raise ValueError("load_html() could not parse the given HTML")
        # Page title.
        title = TitleParser().process(tree)
        # Prefer an absolute <base href> over the caller-supplied base_url.
        base_href = tree.xpath("//base/@href")
        if base_href and "http" in base_href[0]:
            base_url = base_href[0]
        # Tag every element with a document-order Unique_ID so nodes can be
        # located again after the tree is cloned and cleaned.
        self.generate_unique_id(tree)
        format_tree = self.convert_tags(tree, base_url=base_url)
        format_tree = self._remove_tables_from_tree(format_tree)
        format_tree = self._remove_images_from_tree(format_tree)
        normal_tree = self.clean_tags(format_tree)
        normal_tree = self._remove_tables_from_tree(normal_tree)
        normal_tree = self._remove_images_from_tree(normal_tree)
        # Rule-based extraction first; fall back to generic pruning.
        subtree, xp_num, drop_list = self.xp_1_5(normal_tree)
        if xp_num == "others":
            subtree, drop_list = self.prune_unwanted_sections(normal_tree)
        body_html = self.get_content_html(subtree, xp_num, base_url)
        body_html = self._strip_tables_from_html(body_html)
        body_html = self._strip_images_from_html(body_html)
        # Forum-specific post handling starts here.
        body_html_tree = fromstring(body_html)
        try:
            body_tree = body_html_tree.body
        except Exception:  # narrowed from bare except; fragment may lack <body>
            body_tree = Element("body")
            body_tree.extend(body_html_tree)
        # IDs already present in the extracted body; drop their originals from
        # normal_tree so the loop below does not re-append duplicates.
        main_ids = body_tree.xpath(f".//@{Unique_ID}")
        for main_id in main_ids:
            main_tree = normal_tree.xpath(
                f".//*[@{Unique_ID}={main_id}]"
            )
            if main_tree:
                self.remove_node(main_tree[0])
        if not main_ids:
            # Sentinel: every node compares greater than -1 and gets appended.
            main_ids = [-1]
        if xp_num != "others":
            normal_tree, _ = self.prune_unwanted_sections(normal_tree)
        # Pull forum post/reply nodes back into the body. remove_node() inside
        # the while loop guarantees progress, so the loop terminates.
        for c_xpath in Forum_XPATH:
            while normal_tree.xpath(c_xpath):
                x = normal_tree.xpath(c_xpath)[0]
                self.remove_node(x)
                if "'post-'" in c_xpath:
                    # Only keep nodes whose id genuinely looks like post-123 /
                    # post_123; anything else the loose XPath matched is skipped
                    # (already detached above, i.e. dropped).
                    if not (
                        re.findall(r"post-\d+", x.attrib.get("id", "").lower())
                        or re.findall(r"post_\d+", x.attrib.get("id", "").lower())
                    ):
                        continue
                # Skip page-header chrome.
                if (
                    "header" in x.attrib.get("class", "").lower()
                    or "header" in x.attrib.get("id", "").lower()
                ):
                    continue
                try:
                    if int(x.attrib.get(Unique_ID, "0")) > int(main_ids[-1]):
                        # Node comes entirely after the extracted body: append.
                        body_tree.append(x)
                    else:
                        # Node wraps/precedes the extracted body: split its
                        # descendants into content before (prefix) and after
                        # (suffix) the last extracted node.
                        prefix_div = Element("div")
                        suffix_div = Element("div")
                        need_prefix = False
                        need_suffix = False
                        while x.xpath(
                            f".//*[number(@{Unique_ID}) > {int(main_ids[-1])}]"
                        ):
                            tmp_x = x.xpath(
                                f".//*[number(@{Unique_ID}) > {int(main_ids[-1])}]"
                            )[0]
                            self.remove_node(tmp_x)
                            suffix_div.append(tmp_x)
                            need_suffix = True
                        while x.xpath(
                            f".//*[number(@{Unique_ID}) < {int(main_ids[-1])}]"
                        ):
                            tmp_x = x.xpath(
                                f".//*[number(@{Unique_ID}) < {int(main_ids[-1])}]"
                            )[0]
                            self.remove_node(tmp_x)
                            prefix_div.append(tmp_x)
                            need_prefix = True
                        if need_prefix:
                            body_tree.insert(0, prefix_div)
                        if need_suffix:
                            body_tree.append(suffix_div)
                except Exception:  # narrowed from bare except; best-effort reorder
                    pass
        # Strip the helper Unique_ID attributes from the serialized output.
        body_html = re.sub(
            rf' {Unique_ID}="\d+"',
            "",
            tostring(body_tree, encoding=str),
        )
        text_length = self._text_length_from_html(body_html)
        return {
            "xp_num": xp_num,
            "drop_list": drop_list,
            "html": body_html,
            "title": title,
            "base_url": base_url,
            "text_length": text_length,
        }