File size: 5,418 Bytes
a579dd2 |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 |
# -*- coding:utf-8 -*-
import re
from ultradata_math_parser.config import Forum_XPATH, Unique_ID
from ultradata_math_parser.utils import *
from ultradata_math_parser.parsers.base_parser import BaseParser
from ultradata_math_parser.parsers.title_parser import TitleParser
class ForumParser(BaseParser):
    """Extract the main content of forum-style pages (threads, posts).

    Builds on BaseParser's generic extraction, then re-attaches forum-specific
    fragments (posts matched by ``Forum_XPATH``) that generic pruning would
    otherwise drop, keeping them ordered relative to the detected main content
    via the generated per-node unique ids.
    """

    def __init__(self) -> None:
        super().__init__()

    def extract(self, html="", base_url="", **kwargs) -> dict:
        """Parse *html* and return the extracted forum content.

        Parameters
        ----------
        html : str
            Raw page HTML.
        base_url : str
            Fallback base URL; overridden by a ``<base href>`` tag when the
            tag's value contains ``http``.
        **kwargs
            Optional overrides for ``preserve_math_containers``,
            ``process_math``, ``include_tables`` and ``include_images``.

        Returns
        -------
        dict
            Keys: ``xp_num``, ``drop_list``, ``html``, ``title``,
            ``base_url``, ``text_length``.

        Raises
        ------
        ValueError
            If the HTML cannot be parsed into a tree.
        """
        self.preserve_math_containers = kwargs.get("preserve_math_containers", self.preserve_math_containers)
        self.process_math = kwargs.get("process_math", self.process_math)
        self.include_tables = kwargs.get("include_tables", self.include_tables)
        self.include_images = kwargs.get("include_images", self.include_images)
        self.need_comment = True
        # Normalize special space characters (presumably NBSP/thin spaces —
        # the two replace() targets render identically here; confirm against
        # the original bytes) to plain spaces before parsing.
        html = html.replace(" ", " ").replace(" ", " ")
        tree = load_html(html)
        if tree is None:
            # FIX: attach a message instead of raising a bare ValueError.
            raise ValueError("load_html failed: input could not be parsed as HTML")
        # Extract the page title.
        title = TitleParser().process(tree)
        # Prefer an explicit <base href="..."> over the caller-supplied URL.
        base_href = tree.xpath("//base/@href")
        if base_href and "http" in base_href[0]:
            base_url = base_href[0]
        self.generate_unique_id(tree)
        format_tree = self.convert_tags(tree, base_url=base_url)
        format_tree = self._remove_tables_from_tree(format_tree)
        format_tree = self._remove_images_from_tree(format_tree)
        normal_tree = self.clean_tags(format_tree)
        normal_tree = self._remove_tables_from_tree(normal_tree)
        normal_tree = self._remove_images_from_tree(normal_tree)
        subtree, xp_num, drop_list = self.xp_1_5(normal_tree)
        if xp_num == "others":
            subtree, drop_list = self.prune_unwanted_sections(normal_tree)
        body_html = self.get_content_html(subtree, xp_num, base_url)
        body_html = self._strip_tables_from_html(body_html)
        body_html = self._strip_images_from_html(body_html)
        # Forum-specific handling: wrap the extracted main content in a
        # <body> element so post fragments can be attached around it.
        body_html_tree = fromstring(body_html)
        try:
            body_tree = body_html_tree.body
        except Exception:  # FIX: narrowed from a bare except
            body_tree = Element("body")
            body_tree.extend(body_html_tree)
        # Detach from normal_tree every node already present in the main
        # content, so the Forum_XPATH pass below only sees leftovers.
        main_ids = body_tree.xpath(f".//@{Unique_ID}")
        for main_id in main_ids:
            main_tree = normal_tree.xpath(
                f".//*[@{Unique_ID}={main_id}]"
            )
            if main_tree:
                self.remove_node(main_tree[0])
        if not main_ids:
            # Sentinel smaller than any generated id: every fragment is then
            # treated as coming after the (empty) main content.
            main_ids = [-1]
        if xp_num != "others":
            normal_tree, _ = self.prune_unwanted_sections(normal_tree)
        for c_xpath in Forum_XPATH:
            while normal_tree.xpath(c_xpath):
                x = normal_tree.xpath(c_xpath)[0]
                # NOTE(review): the node is detached *before* the filters
                # below, so fragments skipped by `continue` are dropped, not
                # kept in place — presumably intentional; confirm.
                self.remove_node(x)
                if "'post-'" in c_xpath:
                    # Keep only fragments whose id looks like a post id.
                    # FIX: raw strings for the regex patterns (avoids invalid
                    # escape-sequence warnings; matching is unchanged).
                    if not (re.findall(r'post-\d+', x.attrib.get("id", "").lower())
                            or re.findall(r'post_\d+', x.attrib.get("id", "").lower())):
                        continue
                if (
                    "header" in x.attrib.get("class", "").lower()
                    or "header" in x.attrib.get("id", "").lower()
                ):
                    continue
                try:
                    if int(x.attrib.get(Unique_ID, "0")) > int(
                        main_ids[-1]
                    ):
                        # Whole fragment comes after the main content.
                        body_tree.append(x)
                    else:
                        # Fragment straddles the main content: split its
                        # descendants into before/after the last main id.
                        prefix_div = Element("div")
                        suffix_div = Element("div")
                        need_prefix = False
                        need_suffix = False
                        while x.xpath(
                            f".//*[number(@{Unique_ID}) > {int(main_ids[-1])}]"
                        ):
                            tmp_x = x.xpath(
                                f".//*[number(@{Unique_ID}) > {int(main_ids[-1])}]"
                            )[0]
                            self.remove_node(tmp_x)
                            suffix_div.append(tmp_x)
                            need_suffix = True
                        while x.xpath(
                            f".//*[number(@{Unique_ID}) < {int(main_ids[-1])}]"
                        ):
                            tmp_x = x.xpath(
                                f".//*[number(@{Unique_ID}) < {int(main_ids[-1])}]"
                            )[0]
                            self.remove_node(tmp_x)
                            prefix_div.append(tmp_x)
                            need_prefix = True
                        if need_prefix:
                            body_tree.insert(0, prefix_div)
                        if need_suffix:
                            body_tree.append(suffix_div)
                except Exception:  # FIX: narrowed bare except — id may be non-numeric
                    pass
        # Strip the temporary unique-id attributes from the serialized output.
        # FIX: raw f-string — \d was an invalid escape in a plain string.
        body_html = re.sub(
            rf' {Unique_ID}="\d+"',
            "",
            tostring(body_tree, encoding=str),
        )
        text_length = self._text_length_from_html(body_html)
        return {
            "xp_num": xp_num,
            "drop_list": drop_list,
            "html": body_html,
            "title": title,
            "base_url": base_url,
            "text_length": text_length,
        }
|