|
|
|
|
|
import re |
|
|
|
|
|
from ultradata_math_parser.config import Forum_XPATH, Unique_ID |
|
|
from ultradata_math_parser.utils import * |
|
|
from ultradata_math_parser.parsers.base_parser import BaseParser |
|
|
from ultradata_math_parser.parsers.title_parser import TitleParser |
|
|
|
|
|
|
|
|
class ForumParser(BaseParser):
    """Parser that extracts a forum page's main content plus its post nodes.

    The main content is located with the inherited extraction helpers; nodes
    matching the ``Forum_XPATH`` expressions are then pulled out of the rest
    of the page and merged into the resulting ``<body>`` element.
    """

    def __init__(self) -> None:
        super().__init__()

    def extract(self, html="", base_url="", **kwargs) -> dict:
        """Extract the main content and forum posts from ``html``.

        Args:
            html: Raw HTML source of the page.
            base_url: URL forwarded to ``convert_tags`` and
                ``get_content_html``. Overridden by the document's
                ``<base href>`` when that value contains ``"http"``.
            **kwargs: Optional overrides for the instance attributes
                ``preserve_math_containers``, ``process_math``,
                ``include_tables`` and ``include_images``. The overrides are
                stored on the instance, so they persist after this call.

        Returns:
            A dict with the keys ``xp_num``, ``drop_list``, ``html``,
            ``title``, ``base_url`` and ``text_length``.

        Raises:
            ValueError: If ``load_html`` returns ``None`` for the input.
        """
        self.preserve_math_containers = kwargs.get(
            "preserve_math_containers", self.preserve_math_containers
        )
        self.process_math = kwargs.get("process_math", self.process_math)
        self.include_tables = kwargs.get("include_tables", self.include_tables)
        self.include_images = kwargs.get("include_images", self.include_images)
        self.need_comment = True

        # NOTE(review): as transcribed, both replace() calls map a whitespace
        # character to a plain space; the search strings may originally have
        # been non-breaking-space forms -- confirm against the upstream file.
        html = html.replace(" ", " ").replace(" ", " ")
        tree = load_html(html)
        if tree is None:
            raise ValueError("load_html returned None: the HTML could not be parsed")

        title = TitleParser().process(tree)

        # A <base href> containing "http" takes precedence over the caller's URL.
        base_href = tree.xpath("//base/@href")
        if base_href and "http" in base_href[0]:
            base_url = base_href[0]
        self.generate_unique_id(tree)

        format_tree = self.convert_tags(tree, base_url=base_url)
        format_tree = self._remove_tables_from_tree(format_tree)
        format_tree = self._remove_images_from_tree(format_tree)

        normal_tree = self.clean_tags(format_tree)
        normal_tree = self._remove_tables_from_tree(normal_tree)
        normal_tree = self._remove_images_from_tree(normal_tree)

        subtree, xp_num, drop_list = self.xp_1_5(normal_tree)
        if xp_num == "others":
            subtree, drop_list = self.prune_unwanted_sections(normal_tree)
        body_html = self.get_content_html(subtree, xp_num, base_url)
        body_html = self._strip_tables_from_html(body_html)
        body_html = self._strip_images_from_html(body_html)

        # Forum-specific part: rebuild a <body> around the main content.
        body_html_tree = fromstring(body_html)
        try:
            body_tree = body_html_tree.body
        except Exception:
            body_tree = None
        if body_tree is None:
            # No <body> available. Presumably `fromstring` is lxml.html's
            # (it arrives via the star import); depending on the lxml version
            # `.body` then either raises or returns None, so both cases end
            # up here and the parsed children are wrapped in a new <body>.
            body_tree = Element("body")
            body_tree.extend(body_html_tree)
        main_ids = body_tree.xpath(f".//@{Unique_ID}")

        # Drop from the page tree every node that is already part of the main
        # content, so the forum pass below cannot pick it up a second time.
        for main_id in main_ids:
            main_tree = normal_tree.xpath(f".//*[@{Unique_ID}={main_id}]")
            if main_tree:
                self.remove_node(main_tree[0])
        if not main_ids:
            main_ids = [-1]

        if xp_num != "others":
            normal_tree, _ = self.prune_unwanted_sections(normal_tree)

        for c_xpath in Forum_XPATH:
            # Expressions mentioning the literal 'post-' get a stricter id check.
            requires_post_id = "'post-'" in c_xpath
            # NOTE(review): termination relies on remove_node detaching the
            # matched node from normal_tree -- confirm in BaseParser.
            while True:
                matches = normal_tree.xpath(c_xpath)
                if not matches:
                    break
                node = matches[0]
                self.remove_node(node)
                node_id = node.attrib.get("id", "").lower()
                if requires_post_id and not re.search(r"post[-_]\d+", node_id):
                    continue
                if "header" in node.attrib.get("class", "").lower() or "header" in node_id:
                    continue
                try:
                    self._forum_attach_node(body_tree, node, main_ids[-1])
                except Exception:
                    # Best effort: a node that cannot be placed (for example a
                    # non-numeric unique id) is dropped instead of failing the
                    # whole extraction. KeyboardInterrupt/SystemExit propagate.
                    continue

        # Strip the internal unique-id attributes from the serialized output.
        body_html = re.sub(
            rf' {Unique_ID}="\d+"',
            "",
            tostring(body_tree, encoding=str),
        )

        text_length = self._text_length_from_html(body_html)

        return {
            "xp_num": xp_num,
            "drop_list": drop_list,
            "html": body_html,
            "title": title,
            "base_url": base_url,
            "text_length": text_length,
        }

    def _forum_attach_node(self, body_tree, node, last_main_id) -> None:
        """Merge one forum node into ``body_tree`` relative to the main content.

        A node whose unique id is greater than ``last_main_id`` is appended
        whole. Otherwise only its descendants are kept: those with a greater
        id are collected into a ``<div>`` appended to ``body_tree``, those with
        a smaller id into a ``<div>`` inserted at its front.
        """
        pivot = int(last_main_id)
        if int(node.attrib.get(Unique_ID, "0")) > pivot:
            body_tree.append(node)
            return

        suffix_div = Element("div")
        prefix_div = Element("div")
        # The suffix pass runs first, so a moved subtree takes all of its
        # descendants with it before the prefix pass inspects the node.
        need_suffix = self._forum_move_matches(
            node, f".//*[number(@{Unique_ID}) > {pivot}]", suffix_div
        )
        need_prefix = self._forum_move_matches(
            node, f".//*[number(@{Unique_ID}) < {pivot}]", prefix_div
        )
        if need_prefix:
            body_tree.insert(0, prefix_div)
        if need_suffix:
            body_tree.append(suffix_div)

    def _forum_move_matches(self, node, expr, target) -> bool:
        """Move descendants of ``node`` matching ``expr`` into ``target``.

        The first match is moved and the expression re-evaluated until nothing
        matches any more. Returns True if at least one element was moved.
        """
        moved = False
        while True:
            hits = node.xpath(expr)
            if not hits:
                break
            hit = hits[0]
            self.remove_node(hit)
            target.append(hit)
            moved = True
        return moved
|
|
|