# NOTE(review): stray VCS metadata (author / commit message / commit hash)
# was accidentally committed as bare code lines; kept here as a comment so
# the module remains importable:
#   ZhouChuYue / init / a579dd2
# -*- coding:utf-8 -*-
import re
from ultradata_math_parser.config import Forum_XPATH, Unique_ID
from ultradata_math_parser.utils import *
from ultradata_math_parser.parsers.base_parser import BaseParser
from ultradata_math_parser.parsers.title_parser import TitleParser
class ForumParser(BaseParser):
    """Extract the main discussion content from forum-style HTML pages.

    Builds on BaseParser's generic extraction pipeline, then re-attaches
    forum-specific nodes (posts/replies matched by ``Forum_XPATH``) that the
    generic pruning would otherwise drop, keeping them in document order
    relative to the already-extracted body.
    """

    def __init__(self) -> None:
        super().__init__()

    def extract(self, html: str = "", base_url: str = "", **kwargs) -> dict:
        """Parse *html* and return the cleaned body HTML plus metadata.

        Keyword arguments may override the instance's configuration flags
        (``preserve_math_containers``, ``process_math``, ``include_tables``,
        ``include_images``) for this call only.

        Returns:
            dict with keys ``xp_num``, ``drop_list``, ``html``, ``title``,
            ``base_url`` and ``text_length``.

        Raises:
            ValueError: if *html* cannot be parsed into a tree.
        """
        # Per-call overrides of the instance configuration flags.
        self.preserve_math_containers = kwargs.get(
            "preserve_math_containers", self.preserve_math_containers
        )
        self.process_math = kwargs.get("process_math", self.process_math)
        self.include_tables = kwargs.get("include_tables", self.include_tables)
        self.include_images = kwargs.get("include_images", self.include_images)
        # Forum pages keep comments/replies — they are part of the content.
        self.need_comment = True
        # NOTE(review): these replacements target unusual whitespace code
        # points (they render like plain spaces here) — verify the exact
        # characters against the upstream source.
        html = html.replace(" ", " ").replace(" ", " ")
        tree = load_html(html)
        if tree is None:
            raise ValueError("load_html() could not parse the given HTML")
        # Page title.
        title = TitleParser().process(tree)
        # Prefer an absolute <base href> over the caller-supplied base_url.
        base_href = tree.xpath("//base/@href")
        if base_href and "http" in base_href[0]:
            base_url = base_href[0]
        # Tag every element with a document-order Unique_ID so nodes can be
        # located again after the tree is cloned and cleaned.
        self.generate_unique_id(tree)
        format_tree = self.convert_tags(tree, base_url=base_url)
        format_tree = self._remove_tables_from_tree(format_tree)
        format_tree = self._remove_images_from_tree(format_tree)
        normal_tree = self.clean_tags(format_tree)
        normal_tree = self._remove_tables_from_tree(normal_tree)
        normal_tree = self._remove_images_from_tree(normal_tree)
        # Rule-based extraction first; fall back to generic pruning.
        subtree, xp_num, drop_list = self.xp_1_5(normal_tree)
        if xp_num == "others":
            subtree, drop_list = self.prune_unwanted_sections(normal_tree)
        body_html = self.get_content_html(subtree, xp_num, base_url)
        body_html = self._strip_tables_from_html(body_html)
        body_html = self._strip_images_from_html(body_html)
        # Forum-specific post handling starts here.
        body_html_tree = fromstring(body_html)
        try:
            body_tree = body_html_tree.body
        except Exception:  # narrowed from bare except; fragment may lack <body>
            body_tree = Element("body")
            body_tree.extend(body_html_tree)
        # IDs already present in the extracted body; drop their originals from
        # normal_tree so the loop below does not re-append duplicates.
        main_ids = body_tree.xpath(f".//@{Unique_ID}")
        for main_id in main_ids:
            main_tree = normal_tree.xpath(
                f".//*[@{Unique_ID}={main_id}]"
            )
            if main_tree:
                self.remove_node(main_tree[0])
        if not main_ids:
            # Sentinel: every node compares greater than -1 and gets appended.
            main_ids = [-1]
        if xp_num != "others":
            normal_tree, _ = self.prune_unwanted_sections(normal_tree)
        # Pull forum post/reply nodes back into the body. remove_node() inside
        # the while loop guarantees progress, so the loop terminates.
        for c_xpath in Forum_XPATH:
            while normal_tree.xpath(c_xpath):
                x = normal_tree.xpath(c_xpath)[0]
                self.remove_node(x)
                if "'post-'" in c_xpath:
                    # Only keep nodes whose id genuinely looks like post-123 /
                    # post_123; anything else the loose XPath matched is skipped
                    # (already detached above, i.e. dropped).
                    if not (
                        re.findall(r"post-\d+", x.attrib.get("id", "").lower())
                        or re.findall(r"post_\d+", x.attrib.get("id", "").lower())
                    ):
                        continue
                # Skip page-header chrome.
                if (
                    "header" in x.attrib.get("class", "").lower()
                    or "header" in x.attrib.get("id", "").lower()
                ):
                    continue
                try:
                    if int(x.attrib.get(Unique_ID, "0")) > int(main_ids[-1]):
                        # Node comes entirely after the extracted body: append.
                        body_tree.append(x)
                    else:
                        # Node wraps/precedes the extracted body: split its
                        # descendants into content before (prefix) and after
                        # (suffix) the last extracted node.
                        prefix_div = Element("div")
                        suffix_div = Element("div")
                        need_prefix = False
                        need_suffix = False
                        while x.xpath(
                            f".//*[number(@{Unique_ID}) > {int(main_ids[-1])}]"
                        ):
                            tmp_x = x.xpath(
                                f".//*[number(@{Unique_ID}) > {int(main_ids[-1])}]"
                            )[0]
                            self.remove_node(tmp_x)
                            suffix_div.append(tmp_x)
                            need_suffix = True
                        while x.xpath(
                            f".//*[number(@{Unique_ID}) < {int(main_ids[-1])}]"
                        ):
                            tmp_x = x.xpath(
                                f".//*[number(@{Unique_ID}) < {int(main_ids[-1])}]"
                            )[0]
                            self.remove_node(tmp_x)
                            prefix_div.append(tmp_x)
                            need_prefix = True
                        if need_prefix:
                            body_tree.insert(0, prefix_div)
                        if need_suffix:
                            body_tree.append(suffix_div)
                except Exception:  # narrowed from bare except; best-effort reorder
                    pass
        # Strip the helper Unique_ID attributes from the serialized output.
        body_html = re.sub(
            rf' {Unique_ID}="\d+"',
            "",
            tostring(body_tree, encoding=str),
        )
        text_length = self._text_length_from_html(body_html)
        return {
            "xp_num": xp_num,
            "drop_list": drop_list,
            "html": body_html,
            "title": title,
            "base_url": base_url,
            "text_length": text_length,
        }