|
|
|
|
|
|
|
|
import html |
|
|
import logging |
|
|
from collections import defaultdict |
|
|
from copy import deepcopy |
|
|
from urllib.parse import unquote, urljoin |
|
|
from lxml.etree import Comment, strip_elements |
|
|
from ultradata_math_parser.config import * |
|
|
from ultradata_math_parser.readability_plus import Document as DocumentPlus |
|
|
from ultradata_math_parser.utils import * |
|
|
|
|
|
|
|
|
class BaseParser:
    """Base HTML content parser with math-aware cleaning and text fallbacks."""

    def __init__(self):
        # ids of nodes removed via remove_node(); used to rebuild dropped HTML
        self.drop_ids = []
        # behaviour switches
        self.need_comment = False
        self.process_math = True
        self.preserve_math_containers = True
        self.include_tables = True
        self.include_images = False
        # fallback configuration: minimum text length before falling back,
        # and which fallback strategies are allowed
        self.fallback_min_length = 250
        self.enable_wild_text_fallback = True
        self.enable_readability_fallback = True
        self._logger = logging.getLogger(__name__)
|
|
|
|
|
    def xp_1_5(self, tree: HtmlElement):
        """Try the ordered BODY_XPATH candidates and return the first usable body.

        Returns a tuple ``(result_body, xp_num, drop_list)`` where ``xp_num``
        is the 1-based index (as a string) of the matching expression, or
        "others" when no candidate produced enough text, and ``drop_list``
        flags that list-like blocks were pruned.
        """
        drop_list = False
        xp_num = "others"
        result_body = Element("body")

        for idx, expr in enumerate(BODY_XPATH):
            try:
                # only the first node matched by each expression is considered
                subtree = tree.xpath(expr)[0]
                xp_num = str(idx + 1)
            except IndexError:
                continue

            subtree, drop_list = self.prune_unwanted_sections(subtree)

            # pruning may have emptied the candidate entirely
            if len(subtree) == 0:
                xp_num = "others"
                continue

            # text outside links in the candidate vs. paragraph text in the page
            ptest = subtree.xpath(".//text()[not(ancestor::a)]")
            ptest_len = text_len("".join(ptest))
            all_text_len = text_len(
                "".join(tree.xpath("//p//text()[not(ancestor::a)]"))
            )
            # when list blocks were dropped a larger remainder is required
            # (<=50 chars) before rejecting; otherwise the stricter 20-char
            # floor applies — but only if the page clearly has other text
            if drop_list:
                if ptest_len <= 50:
                    if all_text_len > 100:
                        xp_num = "others"
                        continue
            else:
                if ptest_len <= 20:
                    if all_text_len > 100:
                        xp_num = "others"
                        continue
            # first acceptable candidate wins
            result_body.append(subtree)
            return result_body, xp_num, drop_list

        # nothing matched: empty body, xp_num == "others"
        return result_body, xp_num, drop_list
|
|
|
|
|
def get_content_html(self, cleaned_tree_backup, xp_num="others", base_url=""): |
|
|
|
|
|
doc = DocumentPlus( |
|
|
cleaned_tree_backup, |
|
|
url=base_url, |
|
|
xp_num=xp_num, |
|
|
need_comment=self.need_comment, |
|
|
) |
|
|
body = doc.summary(html_partial=True) |
|
|
|
|
|
return body |
|
|
|
|
|
def _text_length_from_html(self, html_fragment): |
|
|
if not html_fragment: |
|
|
return 0 |
|
|
|
|
|
|
|
|
try: |
|
|
tree = fromstring(html_fragment) |
|
|
text = tree.text_content() |
|
|
return len(text or "") |
|
|
except Exception: |
|
|
return 0 |
|
|
|
|
|
def _is_content_sufficient(self, html_fragment): |
|
|
return self._text_length_from_html(html_fragment) >= self.fallback_min_length |
|
|
|
|
|
def _remove_tables_from_tree(self, tree: HtmlElement) -> HtmlElement: |
|
|
if self.include_tables: |
|
|
return tree |
|
|
for table in list(tree.xpath(".//table")): |
|
|
parent = table.getparent() |
|
|
if parent is not None: |
|
|
parent.remove(table) |
|
|
return tree |
|
|
|
|
|
def _strip_tables_from_html(self, html_fragment: str) -> str: |
|
|
if self.include_tables or not html_fragment: |
|
|
return html_fragment |
|
|
try: |
|
|
wrapper = fromstring(f"<div>{html_fragment}</div>") |
|
|
except Exception: |
|
|
return html_fragment |
|
|
self._remove_tables_from_tree(wrapper) |
|
|
return "".join(tostring(child, encoding=str) for child in wrapper) |
|
|
|
|
|
def _remove_images_from_tree(self, tree: HtmlElement) -> HtmlElement: |
|
|
for node in list(tree.xpath(".//img|.//picture|.//source")): |
|
|
|
|
|
if node.tag == "img": |
|
|
alt = node.get("alt", "") |
|
|
src = node.get("src", "") |
|
|
|
|
|
if alt: |
|
|
|
|
|
alt_decoded = unquote(alt.replace('&space;', ' ').replace('\', '\\')) |
|
|
|
|
|
|
|
|
is_latex = False |
|
|
|
|
|
if alt_decoded.strip().startswith('$') and len(alt_decoded.strip()) > 2: |
|
|
is_latex = True |
|
|
|
|
|
elif alt_decoded.strip().startswith('\\[') or alt_decoded.strip().endswith('\\]'): |
|
|
is_latex = True |
|
|
|
|
|
elif re.search(r'\\[a-zA-Z]+', alt_decoded): |
|
|
is_latex = True |
|
|
|
|
|
elif re.search(r'\^|_\{|_[a-zA-Z0-9]', alt_decoded): |
|
|
is_latex = True |
|
|
|
|
|
elif any(kw in src.lower() for kw in ['latex', 'codecogs', 'math', 'tex', 'equation']): |
|
|
if len(alt_decoded.strip()) > 1: |
|
|
is_latex = True |
|
|
|
|
|
if is_latex: |
|
|
|
|
|
new_span = Element("span") |
|
|
|
|
|
if alt_decoded.strip().startswith('$') or alt_decoded.strip().startswith('\\['): |
|
|
new_span.text = alt_decoded |
|
|
else: |
|
|
new_span.text = wrap_math(alt_decoded) |
|
|
|
|
|
|
|
|
parent = node.getparent() |
|
|
if parent is not None: |
|
|
node.addprevious(new_span) |
|
|
|
|
|
|
|
|
parent = node.getparent() |
|
|
if parent is not None: |
|
|
parent.remove(node) |
|
|
|
|
|
for html_map in list(tree.xpath(".//map")): |
|
|
parent = html_map.getparent() |
|
|
if parent is not None: |
|
|
parent.remove(html_map) |
|
|
return tree |
|
|
|
|
|
def _strip_images_from_html(self, html_fragment: str) -> str: |
|
|
if not html_fragment: |
|
|
return html_fragment |
|
|
try: |
|
|
wrapper = fromstring(f"<div>{html_fragment}</div>") |
|
|
except Exception: |
|
|
return html_fragment |
|
|
self._remove_images_from_tree(wrapper) |
|
|
return "".join(tostring(child, encoding=str) for child in wrapper) |
|
|
|
|
|
def recover_wild_text(self, tree, base_url="", aggressive=False): |
|
|
if tree is None: |
|
|
return None |
|
|
working_tree = deepcopy(tree) |
|
|
try: |
|
|
pruned_tree, _ = self.prune_unwanted_sections(working_tree) |
|
|
except Exception: |
|
|
pruned_tree = working_tree |
|
|
search_expr = ".//p|.//pre|.//code|.//blockquote|.//q|.//quote" |
|
|
if self.include_tables: |
|
|
search_expr += "|.//table" |
|
|
if aggressive: |
|
|
search_expr += "|.//div|.//section|.//article|.//li" |
|
|
try: |
|
|
nodes = pruned_tree.xpath(search_expr) |
|
|
except Exception: |
|
|
nodes = [] |
|
|
if not nodes: |
|
|
return None |
|
|
container = Element("div") |
|
|
seen_texts = set() |
|
|
for node in nodes: |
|
|
try: |
|
|
text_value = trim(node.text_content()) |
|
|
except Exception: |
|
|
text_value = None |
|
|
if not text_value: |
|
|
continue |
|
|
if text_len(text_value) < 10: |
|
|
continue |
|
|
if text_value in seen_texts: |
|
|
continue |
|
|
seen_texts.add(text_value) |
|
|
if node.tag == "table": |
|
|
if self.include_tables: |
|
|
container.append(deepcopy(node)) |
|
|
continue |
|
|
else: |
|
|
paragraph = Element("p") |
|
|
paragraph.text = text_value |
|
|
container.append(paragraph) |
|
|
if len(container) == 0: |
|
|
return None |
|
|
return tostring(container, encoding=str) |
|
|
|
|
|
def readability_fallback(self, tree, base_url=""): |
|
|
if tree is None: |
|
|
return None |
|
|
try: |
|
|
doc = DocumentPlus( |
|
|
deepcopy(tree), |
|
|
url=base_url, |
|
|
xp_num="others", |
|
|
need_comment=self.need_comment, |
|
|
) |
|
|
return doc.summary(html_partial=True) |
|
|
except Exception: |
|
|
return None |
|
|
|
|
|
def apply_fallbacks(self, primary_html, base_url, normal_tree, raw_tree): |
|
|
if self._is_content_sufficient(primary_html): |
|
|
return primary_html, "primary" |
|
|
|
|
|
wild_html = None |
|
|
if self.enable_wild_text_fallback: |
|
|
wild_html = self.recover_wild_text(normal_tree, base_url) |
|
|
if self._is_content_sufficient(wild_html): |
|
|
return wild_html, "wild_text" |
|
|
|
|
|
readability_html = None |
|
|
if self.enable_readability_fallback: |
|
|
readability_html = self.readability_fallback(raw_tree, base_url) |
|
|
if self._is_content_sufficient(readability_html): |
|
|
return readability_html, "readability" |
|
|
|
|
|
for candidate, name in ( |
|
|
(primary_html, "primary"), |
|
|
(wild_html, "wild_text"), |
|
|
(readability_html, "readability"), |
|
|
): |
|
|
if candidate: |
|
|
return candidate, name |
|
|
return "", "primary" |
|
|
|
|
|
def prune_unwanted_nodes(self, tree, nodelist, with_backup=False): |
|
|
if with_backup is True: |
|
|
old_len = len(tree.text_content()) |
|
|
backup = deepcopy(tree) |
|
|
for expr in nodelist: |
|
|
for subtree in tree.xpath(expr): |
|
|
if self.preserve_math_containers and subtree.xpath(".//math"): |
|
|
continue |
|
|
|
|
|
|
|
|
if '"caption"' in expr and subtree.xpath(".//img"): |
|
|
continue |
|
|
|
|
|
if "hidden" in expr: |
|
|
try: |
|
|
if re.findall( |
|
|
"overflow-x:\s*hidden", subtree.attrib["style"] |
|
|
) or re.findall( |
|
|
"overflow-y:\s*hidden", subtree.attrib["style"] |
|
|
): |
|
|
continue |
|
|
if re.findall( |
|
|
"overflow:\s*hidden", subtree.attrib["style"] |
|
|
) and re.findall("height:", subtree.attrib["style"]): |
|
|
height_px = re.findall( |
|
|
"height:\s*(\d+)", subtree.attrib["style"] |
|
|
)[0] |
|
|
if int(height_px) >= 800: |
|
|
continue |
|
|
except: |
|
|
pass |
|
|
|
|
|
if ancestor_node_check(subtree, ['code', 'pre']): |
|
|
continue |
|
|
self.remove_node(subtree) |
|
|
if with_backup is False: |
|
|
return tree |
|
|
|
|
|
new_len = len(tree.text_content()) |
|
|
if new_len > old_len / 7: |
|
|
return tree |
|
|
return backup |
|
|
|
|
|
def prune_html(self, tree): |
|
|
"""Delete selected empty elements""" |
|
|
for element in tree.xpath(".//*[not(node())]"): |
|
|
if element.tag in CUT_EMPTY_ELEMS: |
|
|
self.remove_node(element) |
|
|
return tree |
|
|
|
|
|
    def remove_node(self, node: HtmlElement):
        """Detach *node* from the tree without losing its tail text.

        The tail is merged into the previous sibling's tail (or the parent's
        text when there is no previous sibling), then the node is removed and
        its Unique_ID — if present — recorded in ``self.drop_ids``.
        """
        parent = node.getparent()
        if text_strip(node.tail):
            previous = node.getprevious()
            if previous is None:
                # no sibling before us: tail belongs to the parent's text
                if parent is not None:
                    if text_strip(parent.text):
                        parent.text = "".join([parent.text, node.tail])
                    else:
                        parent.text = node.tail
            else:
                # append our tail to the previous sibling's tail
                if text_strip(previous.tail):
                    previous.tail = "".join([previous.tail, node.tail])
                else:
                    previous.tail = node.tail

        if parent is not None:
            # remember which tagged node was dropped (see generate_unique_id)
            idx = node.attrib.get(Unique_ID, "")
            parent.remove(node)
            if idx:
                self.drop_ids.append(int(idx))
|
|
|
|
|
    def clean_tags(self, tree):
        """Strip comments, noise nodes and unwanted tags from *tree*.

        Returns a new cleaned tree produced by the shared HTML_CLEANER.
        NOTE(review): this mutates the module-level HTML_CLEANER's
        kill_tags/remove_tags — not safe for concurrent use; confirm callers
        are single-threaded.
        """
        strip_elements(tree, Comment)

        # comment sections are noise unless comments were requested
        xp_lists = []
        if not self.need_comment:
            xp_lists.append(REMOVE_COMMENTS_XPATH)
        xp_lists.append(CONTENT_EXTRACTOR_NOISE_XPATHS)
        for xp_list in xp_lists:
            tree = self.prune_unwanted_nodes(tree, xp_list)

        cleaning_list, stripping_list = (
            MANUALLY_CLEANED.copy(),
            MANUALLY_STRIPPED.copy(),
        )

        # figures wrapping tables are demoted to plain divs so the table survives
        for elem in tree.xpath(".//figure[descendant::table]"):
            elem.tag = "div"

        for expression in cleaning_list + ["form"]:
            for element in tree.getiterator(expression):
                # never drop elements that still contain math markup
                if self.preserve_math_containers and element.xpath('.//math'):
                    continue

                if element.tag == "form":
                    # forms with substantial non-link text are kept
                    ptest = element.xpath(".//text()[not(ancestor::a)]")
                    if text_len("".join(ptest)) <= 60:
                        self.remove_node(element)
                else:
                    self.remove_node(element)

        HTML_CLEANER.kill_tags, HTML_CLEANER.remove_tags = cleaning_list, stripping_list
        cleaned_tree = HTML_CLEANER.clean_html(self.prune_html(tree))

        return cleaned_tree
|
|
|
|
|
def generate_unique_id(self, element): |
|
|
idx = 0 |
|
|
for node in iter_node(element): |
|
|
l_tag = node.tag.lower() |
|
|
if l_tag not in ["html", "body"]: |
|
|
node.attrib[Unique_ID] = str(idx) |
|
|
idx += 1 |
|
|
|
|
|
def clean_unique_id(self, raw_element, content_html): |
|
|
ids = re.findall(f' {Unique_ID}="(\d+)"', content_html) |
|
|
self.drop_ids = list(set(self.drop_ids)) |
|
|
self.drop_ids.sort() |
|
|
skip_ids = [-1] |
|
|
for x in ids: |
|
|
if int(x) > int(skip_ids[-1]): |
|
|
skip_ids.append(int(x)) |
|
|
drop_node = raw_element.xpath( |
|
|
f"//*[@{Unique_ID}='{x}']" |
|
|
) |
|
|
if drop_node: |
|
|
new_div = Element("div") |
|
|
for j in self.drop_ids: |
|
|
if int(j) > int(skip_ids[-1]): |
|
|
append_element = drop_node[0].xpath( |
|
|
f".//*[@{Unique_ID}='{j}']" |
|
|
) |
|
|
if append_element: |
|
|
skip_ids.append(j) |
|
|
if len(append_element[0]) > 0: |
|
|
skip_ids.extend( |
|
|
[ |
|
|
int(pjid) |
|
|
for pjid in append_element[0].xpath( |
|
|
f".//*/@{Unique_ID}" |
|
|
) |
|
|
] |
|
|
) |
|
|
append_element[0].tail = None |
|
|
new_div.append(append_element[0]) |
|
|
|
|
|
try: |
|
|
drop_node[0].addnext(new_div) |
|
|
parent = drop_node[0].getparent() |
|
|
if parent is not None: |
|
|
parent.remove(drop_node[0]) |
|
|
except: |
|
|
pass |
|
|
|
|
|
content_html = re.sub(f' {Unique_ID}="\d+"', "", content_html) |
|
|
|
|
|
drop_html = re.sub( |
|
|
f' {Unique_ID}="\d+"', |
|
|
"", |
|
|
tostring(raw_element, encoding=str), |
|
|
) |
|
|
return content_html, drop_html |
|
|
|
|
|
def math_latex_processing(self, node): |
|
|
|
|
|
if node.tag not in ["script", "style"] and text_strip(node.text): |
|
|
regex = r"\\begin{align}(.*?)\\end{align}" |
|
|
text = node.text |
|
|
matches = re.findall(regex, text, re.DOTALL) |
|
|
if matches: |
|
|
node.text = text.replace("\\begin{align}", "").replace( |
|
|
"\\end{align}", "" |
|
|
) |
|
|
|
|
|
if node.tag not in ["script", "style"] and text_strip(node.text): |
|
|
regex = r"\\begin{equation}(.*?)\\end{equation}" |
|
|
text = node.text |
|
|
matches = re.findall(regex, text, re.DOTALL) |
|
|
for match in matches: |
|
|
match = match.replace("\\begin{equation}", "") |
|
|
match = match.replace("\\end{equation}", "") |
|
|
wrapped_text = wrap_math(match, display=True) |
|
|
text = text.replace(match, wrapped_text) |
|
|
if matches: |
|
|
|
|
|
text = text.replace("\\begin{equation}", "").replace( |
|
|
"\\end{equation}", "" |
|
|
) |
|
|
node.text = text |
|
|
|
|
|
if node.tag not in ["script", "style"] and text_strip(node.tail): |
|
|
regex = r"\\begin{align}(.*?)\\end{align}" |
|
|
text = node.tail |
|
|
matches = re.findall(regex, text, re.DOTALL) |
|
|
if matches: |
|
|
node.tail = text.replace("\\begin{align}", "").replace( |
|
|
"\\end{align}", "" |
|
|
) |
|
|
|
|
|
if node.tag not in ["script", "style"] and text_strip(node.tail): |
|
|
regex = r"\\begin{equation}(.*?)\\end{equation}" |
|
|
text = node.tail |
|
|
matches = re.findall(regex, text, re.DOTALL) |
|
|
for match in matches: |
|
|
match = match.replace("\\begin{equation}", "") |
|
|
match = match.replace("\\end{equation}", "") |
|
|
wrapped_text = wrap_math(match, display=True) |
|
|
text = text.replace(match, wrapped_text) |
|
|
if matches: |
|
|
|
|
|
text = text.replace("\\begin{equation}", "").replace( |
|
|
"\\end{equation}", "" |
|
|
) |
|
|
node.tail = text |
|
|
|
|
|
node_class = node.get("class") |
|
|
|
|
|
parent = node.getparent() |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
if node.tag == "img": |
|
|
if node_class: |
|
|
class_list = node_class.split(" ") |
|
|
if any( |
|
|
[img_class in class_list for img_class in latex_image_class_names] |
|
|
): |
|
|
alt = node.get("alt") |
|
|
if text_strip(alt): |
|
|
new_span = Element("span") |
|
|
wrapped_alt = wrap_math(alt) |
|
|
new_span.text = wrapped_alt |
|
|
node.addprevious(new_span) |
|
|
self.remove_node(node) |
|
|
src = node.get("src") |
|
|
if src: |
|
|
if "codecogs.com" in src: |
|
|
try: |
|
|
latex = src.split("?")[1:] |
|
|
latex = "?".join( |
|
|
latex |
|
|
) |
|
|
latex = unquote(latex) |
|
|
new_span = Element("span") |
|
|
wrapped_latex = wrap_math(latex) |
|
|
new_span.text = wrapped_latex |
|
|
node.addprevious(new_span) |
|
|
self.remove_node(node) |
|
|
except: |
|
|
pass |
|
|
if "latex.php" in src: |
|
|
try: |
|
|
|
|
|
alt = node.get("alt") |
|
|
if text_strip(alt): |
|
|
|
|
|
alt = unquote(alt) |
|
|
|
|
|
wrapped_alt = wrap_math(alt) |
|
|
new_span = Element("span") |
|
|
new_span.text = wrapped_alt |
|
|
node.addprevious(new_span) |
|
|
self.remove_node(node) |
|
|
except: |
|
|
pass |
|
|
if "/images/math/codecogs" in src: |
|
|
try: |
|
|
|
|
|
alt = node.get("alt") |
|
|
if text_strip(alt): |
|
|
|
|
|
alt = unquote(alt) |
|
|
|
|
|
wrapped_alt = wrap_math(alt) |
|
|
new_span = Element("span") |
|
|
new_span.text = wrapped_alt |
|
|
node.addprevious(new_span) |
|
|
self.remove_node(node) |
|
|
except: |
|
|
pass |
|
|
if "mimetex.cgi" in src: |
|
|
try: |
|
|
latex = src.split("?")[1:] |
|
|
latex = "?".join( |
|
|
latex |
|
|
) |
|
|
latex = unquote(latex) |
|
|
new_span = Element("span") |
|
|
wrapped_latex = wrap_math(latex) |
|
|
new_span.text = wrapped_latex |
|
|
node.addprevious(new_span) |
|
|
self.remove_node(node) |
|
|
except: |
|
|
pass |
|
|
if "mathtex.cgi" in src: |
|
|
try: |
|
|
latex = src.split("?")[1:] |
|
|
latex = "?".join( |
|
|
latex |
|
|
) |
|
|
latex = unquote(latex) |
|
|
new_span = Element("span") |
|
|
wrapped_latex = wrap_math(latex) |
|
|
new_span.text = wrapped_latex |
|
|
node.addprevious(new_span) |
|
|
self.remove_node(node) |
|
|
except: |
|
|
pass |
|
|
if node_class: |
|
|
if "x-ck12" in node_class: |
|
|
try: |
|
|
latex = node.get("alt") |
|
|
if text_strip(latex): |
|
|
latex = unquote(latex) |
|
|
new_span = Element("span") |
|
|
wrapped_latex = wrap_math(latex) |
|
|
new_span.text = wrapped_latex |
|
|
node.addprevious(new_span) |
|
|
except: |
|
|
pass |
|
|
|
|
|
|
|
|
if node_class == "math-container": |
|
|
try: |
|
|
text = node.text |
|
|
if text_strip(text): |
|
|
new_span = Element("span") |
|
|
wrapped_math = wrap_math(text, display=True) |
|
|
new_span.text = wrapped_math |
|
|
if parent is not None: |
|
|
if text_strip(node.tail): |
|
|
new_span.tail = node.tail |
|
|
parent.replace(node, new_span) |
|
|
except: |
|
|
pass |
|
|
|
|
|
|
|
|
if node_class == "wp-katex-eq": |
|
|
try: |
|
|
text = node.text |
|
|
if text_strip(text): |
|
|
new_span = Element("span") |
|
|
display_attr = node.get("data-display") |
|
|
if display_attr is not None: |
|
|
display = display_attr == "true" |
|
|
else: |
|
|
display = False |
|
|
wrapped_math = wrap_math(text, display=display) |
|
|
new_span.text = wrapped_math |
|
|
if parent is not None: |
|
|
if text_strip(node.tail): |
|
|
new_span.tail = node.tail |
|
|
parent.replace(node, new_span) |
|
|
except: |
|
|
pass |
|
|
|
|
|
|
|
|
if node.tag == "script" and node.get("type") == "math/tex": |
|
|
try: |
|
|
text = node.text |
|
|
if text_strip(text): |
|
|
new_span = Element("span") |
|
|
wrapped_text = wrap_math(text) |
|
|
new_span.text = wrapped_text |
|
|
if parent is not None: |
|
|
if text_strip(node.tail): |
|
|
new_span.tail = node.tail |
|
|
parent.replace(node, new_span) |
|
|
except: |
|
|
pass |
|
|
|
|
|
|
|
|
if node.tag == "script" and node.get("type") == "math/asciimath": |
|
|
try: |
|
|
text = node.text |
|
|
if text_strip(text): |
|
|
new_span = Element("span") |
|
|
wrapped_asciimath = wrap_math(extract_asciimath(text)) |
|
|
new_span.text = wrapped_asciimath |
|
|
if parent is not None: |
|
|
if text_strip(node.tail): |
|
|
new_span.tail = node.tail |
|
|
parent.replace(node, new_span) |
|
|
except: |
|
|
|
|
|
self.remove_node(node) |
|
|
|
|
|
|
|
|
if node_class == "tex": |
|
|
try: |
|
|
|
|
|
expr = node.get("data-expr") |
|
|
if text_strip(expr): |
|
|
|
|
|
new_span = Element("span") |
|
|
wrapped_expr = wrap_math(expr) |
|
|
new_span.text = wrapped_expr |
|
|
if parent is not None: |
|
|
if text_strip(node.tail): |
|
|
new_span.tail = node.tail |
|
|
parent.replace(node, new_span) |
|
|
except: |
|
|
pass |
|
|
|
|
|
|
|
|
if node.tag == "span" and node_class == "katex": |
|
|
|
|
|
katex_html_spans = node.xpath('.//span[@class="katex-html"]') |
|
|
for katex_html_span in katex_html_spans: |
|
|
self.remove_node(katex_html_span) |
|
|
|
|
|
|
|
|
if node.tag == "span" and node_class == "MathJax_Preview": |
|
|
self.remove_node(node) |
|
|
|
|
|
if node.tag == "span" and node_class and "x-ck12-mathEditor" in node_class: |
|
|
try: |
|
|
expr = node.get("data-tex") |
|
|
if text_strip(expr): |
|
|
expr = unquote(expr).replace("\"", "").replace(""", "") |
|
|
|
|
|
new_span = Element("span") |
|
|
wrapped_expr = wrap_math(expr) |
|
|
new_span.text = wrapped_expr |
|
|
if parent is not None: |
|
|
if text_strip(node.tail): |
|
|
new_span.tail = node.tail |
|
|
parent.replace(node, new_span) |
|
|
except: |
|
|
pass |
|
|
|
|
|
|
|
|
if node.tag == "math": |
|
|
annotation_tags = node.xpath('.//annotation[@encoding="application/x-tex"]') |
|
|
if len(annotation_tags) > 0: |
|
|
annotation_tag = annotation_tags[0] |
|
|
text = annotation_tag.text |
|
|
if text_strip(text): |
|
|
new_span = Element("span") |
|
|
wrapped_text = wrap_math(text) |
|
|
new_span.text = wrapped_text |
|
|
if parent is not None: |
|
|
if text_strip(node.tail): |
|
|
new_span.tail = node.tail |
|
|
parent.replace(node, new_span) |
|
|
style_value = parent.get("style") |
|
|
if style_value: |
|
|
normalized_style_value = ( |
|
|
style_value.lower() |
|
|
.strip() |
|
|
.replace(" ", "") |
|
|
.replace(";", "") |
|
|
) |
|
|
if "display:none" in normalized_style_value: |
|
|
parent.style = "" |
|
|
elif text_strip(node.get("alttext")): |
|
|
|
|
|
alttext = node.get("alttext") |
|
|
if text_strip(alttext): |
|
|
new_span = Element("span") |
|
|
wrapped_alttext = wrap_math(alttext) |
|
|
new_span.text = wrapped_alttext |
|
|
if parent is not None: |
|
|
if text_strip(node.tail): |
|
|
new_span.tail = node.tail |
|
|
parent.replace(node, new_span) |
|
|
else: |
|
|
try: |
|
|
|
|
|
tmp_node = deepcopy(node) |
|
|
tmp_node.tail = None |
|
|
mathml = tostring(tmp_node, encoding=str) |
|
|
|
|
|
|
|
|
if "xmlns:mml" in mathml: |
|
|
mathml = mathml.replace("mml:", "") |
|
|
|
|
|
mathml = re.sub(r'xmlns:mml=".*?"', "", mathml) |
|
|
|
|
|
|
|
|
latex = mml_to_latex(mathml) |
|
|
|
|
|
new_span = Element("span") |
|
|
|
|
|
wrapped_latex = wrap_math(latex) |
|
|
new_span.text = wrapped_latex |
|
|
|
|
|
if parent is not None: |
|
|
if text_strip(node.tail): |
|
|
new_span.tail = node.tail |
|
|
parent.replace(node, new_span) |
|
|
except: |
|
|
|
|
|
self.remove_node(node) |
|
|
|
|
|
if node.tag == "mathjax": |
|
|
try: |
|
|
|
|
|
text = node.text |
|
|
if text_strip(text): |
|
|
text = html.unescape(text) |
|
|
|
|
|
matches = re.findall(r"#(.+?)#", text) |
|
|
|
|
|
for match in matches: |
|
|
try: |
|
|
latex = extract_asciimath(match) |
|
|
|
|
|
text = text.replace(f"#{match}#", latex) |
|
|
except: |
|
|
|
|
|
pass |
|
|
|
|
|
new_span = Element("span") |
|
|
|
|
|
new_span.text = text |
|
|
|
|
|
if parent is not None: |
|
|
if text_strip(node.tail): |
|
|
new_span.tail = node.tail |
|
|
parent.replace(node, new_span) |
|
|
except: |
|
|
pass |
|
|
|
|
|
def convert_tags(self, element, base_url=""): |
|
|
USELESS_ATTR_LIST = USELESS_ATTR |
|
|
if not self.need_comment: |
|
|
USELESS_ATTR_LIST = USELESS_ATTR_LIST + ["comment"] |
|
|
for node in iter_node(element): |
|
|
|
|
|
if self.process_math: |
|
|
|
|
|
self.math_latex_processing(node) |
|
|
|
|
|
if "data-src" in node.attrib and "src" not in node.attrib: |
|
|
node.attrib["src"] = node.attrib["data-src"] |
|
|
if "src" in node.attrib and node.attrib["src"] and base_url: |
|
|
src_url = node.attrib["src"] |
|
|
absolute_url = urljoin(base_url, src_url) |
|
|
node.attrib["src"] = absolute_url |
|
|
|
|
|
if node.tag.lower() == "div" and not node.getchildren(): |
|
|
node.tag = "p" |
|
|
|
|
|
class_name = node.get("class") |
|
|
if class_name: |
|
|
if class_name.lower() in USELESS_ATTR_LIST: |
|
|
self.remove_node(node) |
|
|
|
|
|
return element |
|
|
|
|
|
    def delete_by_link_density(
        self, subtree, tagname, backtracking=False, favor_precision=False
    ):
        """Remove link-dense / list-like groups of *tagname* elements.

        Two passes: (1) sibling-group analysis — runs of similar, link-heavy
        siblings (navigation, article listings, "read more" teasers) are
        deleted wholesale; (2) per-element link-density tests, optionally with
        backtracking over short repeated texts.  Returns ``(subtree,
        drop_list)`` where ``drop_list`` tells whether a list-like group was
        removed.
        """
        need_del_par = []
        skip_par = []
        drop_list = False
        for descendant in subtree.iter(tagname):
            pparent = descendant.getparent()
            # parent already handled (deleted or cleared) — skip its children
            if pparent in need_del_par or pparent in skip_par:
                continue
            siblings = descendant.xpath(f"following-sibling::{tagname}")

            # a "list"-classed element with many direct links is navigation
            if 'list' in descendant.get("class", "") and len(descendant.xpath('./a')) >= 5:
                need_del_par.append(descendant)
                need_del_par.extend(siblings)
                continue

            nn = [descendant]
            nn.extend(siblings)
            txt_max_num = 0
            if len(siblings) + 1 >= 4:
                # enough repeated siblings to treat as a group
                pass
            else:
                # small groups only qualify when several members end in a
                # "read more"-style teaser (en/zh variants)
                txt_max_dict = {
                    "read": 0,
                    "more": 0,
                    "...": 0,
                    "阅读": 0,
                    "更多": 0,
                    "详细": 0,
                    "detail": 0,
                    "article": 0,
                    "blog": 0,
                    "news": 0,
                }
                if tagname == "div" or tagname == "article" or tagname == "section":
                    for j in nn:
                        txt = "".join(j.xpath(".//text()")).strip()
                        for x in [
                            "read",
                            "more",
                            "...",
                            "阅读",
                            "更多",
                            "详细",
                            "detail",
                            "article",
                            "blog",
                            "news",
                        ]:
                            if txt.lower().endswith(x):
                                txt_max_dict[x] += 1
                        txt_num = max(txt_max_dict.values())
                        if txt_max_num < txt_num:
                            txt_max_num = txt_num
                        if txt_max_num >= 3:
                            break
                if txt_max_num >= 3:
                    pass
                else:
                    continue
            skip_par.append(pparent)
            # count link-dense siblings (per-tag density thresholds)
            a_num = 0
            for j in siblings:
                if j.xpath(".//a"):
                    if tagname == "p":
                        if density_of_a_text(j, pre=0.8):
                            a_num += 1
                    elif tagname in ["div", "section", "article"]:
                        if density_of_a_text(j, pre=0.2):
                            a_num += 1
                    else:
                        if self.need_comment:
                            # spare elements that look like forum comments
                            break_flg = False
                            for c_xpath in Forum_XPATH[:-1]:
                                if j.xpath(c_xpath.replace(".//*", "self::*")):
                                    break_flg = True
                                    break
                            if break_flg:
                                continue
                        if tagname == "li":
                            # list items with real non-link text are content
                            if text_len("".join(j.xpath(".//text()[not(ancestor::a)]"))) > 50:
                                continue
                        a_num += 1

            # require (nearly) all siblings link-dense, unless the group is huge
            if a_num < len(siblings):
                if a_num >= 15 and (
                    tagname == "div" or tagname == "article" or tagname == "section"
                ):
                    pass
                else:
                    continue

            # structural similarity between the element and its siblings
            similarity_with_siblings_nums = similarity_with_siblings(
                descendant, siblings
            )
            if tagname == "article" or tagname == "item":
                similarity_with_siblings_nums = similarity_with_siblings_nums * 1.5

            # many repeated headings/articles boost the similarity score
            if tagname == "div" or tagname == "article" or tagname == "section":
                title_max_num = 0
                for ll in [".//head[@rend='h2']", ".//head[@rend='h1']", "./article"]:
                    title_num = 0
                    for jj in nn:
                        if jj.xpath(ll):
                            title_num += 1
                    if title_max_num < title_num:
                        title_max_num = title_num
                if title_max_num >= 4:
                    similarity_with_siblings_nums = similarity_with_siblings_nums * 1.5

            if txt_max_num >= 3:
                pass
            elif similarity_with_siblings_nums < 0.84:
                # dissimilar siblings: keep, unless the group is very large
                if len(siblings) >= 15 and (
                    tagname == "div" or tagname == "article" or tagname == "section"
                ):
                    pass
                else:
                    continue

            class_attr = descendant.get("class") if descendant.get("class") else ""
            if (
                re.findall("post-", class_attr, re.I)
                or re.findall("-post", class_attr, re.I)
                or re.findall("blog|aricle", class_attr, re.I)
            ):
                # blog/post listing: keep the one clearly-longest entry
                drop_list = True
                sk_flg = True
                for dl in siblings:
                    if (
                        text_len("".join(descendant.xpath(".//text()"))) * 2
                        < text_len("".join(dl.xpath(".//text()")))
                        and sk_flg
                    ):
                        self.remove_node(descendant)
                        sk_flg = False
                    else:
                        self.remove_node(dl)
            else:
                need_del_par.append(descendant)
                need_del_par.extend(siblings)
        for node in need_del_par:
            drop_list = True
            try:
                self.remove_node(node)
            except Exception as e:
                pass

        # ---- second pass: per-element link-density tests -----------------
        myelems, deletions = defaultdict(list), []

        if tagname == "div":
            for elem in subtree.iter(tagname):
                if density_of_a_text(elem, pre=0.8) and img_div_check(elem):
                    deletions.append(elem)

        for elem in subtree.iter(tagname):
            elemtext = trim(elem.text_content())
            result, templist = link_density_test(elem, elemtext, favor_precision)
            if result is True and img_div_check(elem):
                # elements inside table cells are spared
                if tagname in ['ul', 'li', 'div', 'p'] and ancestor_node_check(elem, ['td']):
                    continue
                deletions.append(elem)
            elif backtracking is True and len(templist) > 0:
                myelems[elemtext].append(elem)
        if backtracking is True:
            # short texts repeated >=3 times are boilerplate
            if favor_precision is False:
                threshold = 100
            else:
                threshold = 200
            for text, elem in myelems.items():
                if 0 < len(text) < threshold and len(elem) >= 3:
                    deletions.extend(elem)

        for elem in uniquify_list(deletions):
            try:
                if self.need_comment:
                    # never delete comment-section candidates
                    break_flg = False
                    for c_xpath in Forum_XPATH[:-1]:
                        if elem.xpath(c_xpath):
                            break_flg = True
                            break
                    if break_flg:
                        continue
                self.remove_node(elem)
            except AttributeError:
                pass
        return subtree, drop_list
|
|
|
|
|
def prune_unwanted_sections(self, tree): |
|
|
tmp_OVERALL_DISCARD_XPATH = OVERALL_DISCARD_XPATH |
|
|
if self.need_comment: |
|
|
tmp_OVERALL_DISCARD_XPATH = tmp_OVERALL_DISCARD_XPATH[:-1] |
|
|
tree = self.prune_unwanted_nodes( |
|
|
tree, tmp_OVERALL_DISCARD_XPATH, with_backup=True |
|
|
) |
|
|
for xp_list in [ |
|
|
PAYWALL_DISCARD_XPATH, |
|
|
TEASER_DISCARD_XPATH, |
|
|
DISCARD_IMAGE_ELEMENTS, |
|
|
]: |
|
|
tree = self.prune_unwanted_nodes(tree, xp_list) |
|
|
|
|
|
tree, drop_list_1 = self.delete_by_link_density( |
|
|
tree, "div", backtracking=True, favor_precision=False |
|
|
) |
|
|
tree, drop_list_1_1 = self.delete_by_link_density( |
|
|
tree, "article", backtracking=False, favor_precision=False |
|
|
) |
|
|
tree, drop_list_1_2 = self.delete_by_link_density( |
|
|
tree, "section", backtracking=False, favor_precision=False |
|
|
) |
|
|
tree, drop_list_2_1 = self.delete_by_link_density( |
|
|
tree, "ul", backtracking=False, favor_precision=False |
|
|
) |
|
|
tree, drop_list_2_2 = self.delete_by_link_density( |
|
|
tree, "li", backtracking=False, favor_precision=False |
|
|
) |
|
|
tree, drop_list_3_1 = self.delete_by_link_density( |
|
|
tree, "dl", backtracking=False, favor_precision=False |
|
|
) |
|
|
tree, drop_list_3_3 = self.delete_by_link_density( |
|
|
tree, "dt", backtracking=False, favor_precision=False |
|
|
) |
|
|
tree, drop_list_3_2 = self.delete_by_link_density( |
|
|
tree, "dd", backtracking=False, favor_precision=False |
|
|
) |
|
|
tree, drop_list_3 = self.delete_by_link_density( |
|
|
tree, "p", backtracking=False, favor_precision=False |
|
|
) |
|
|
|
|
|
return ( |
|
|
tree, |
|
|
drop_list_1 |
|
|
or drop_list_2_1 |
|
|
or drop_list_2_2 |
|
|
or drop_list_3 |
|
|
or drop_list_1_1 |
|
|
or drop_list_1_2 |
|
|
or drop_list_3_1 |
|
|
or drop_list_3_2 |
|
|
or drop_list_3_3, |
|
|
) |
|
|
|