# -*- coding:utf-8 -*-
import html
import logging
from collections import defaultdict
from copy import deepcopy
from urllib.parse import unquote, urljoin
from lxml.etree import Comment, strip_elements
from ultradata_math_parser.config import *
from ultradata_math_parser.readability_plus import Document as DocumentPlus
from ultradata_math_parser.utils import *
class BaseParser:
    """Base HTML parser: extracts main content while preserving math markup."""

    def __init__(self):
        # Unique_ID values of nodes removed during cleaning (see remove_node).
        self.drop_ids = []
        # Keep comment sections instead of pruning them.
        self.need_comment = False
        # Run LaTeX/MathML conversion during convert_tags.
        self.process_math = True
        # Never prune subtrees that contain <math> elements.
        self.preserve_math_containers = True
        # Keep <table> elements in the extracted content.
        self.include_tables = True
        # Keep <img>/<picture>/<source> elements in the extracted content.
        self.include_images = False
        # Minimum visible-text length for a result to count as sufficient.
        self.fallback_min_length = 250
        # Enable the wild-text recovery fallback in apply_fallbacks.
        self.enable_wild_text_fallback = True
        # Enable the readability_plus fallback in apply_fallbacks.
        self.enable_readability_fallback = True
        self._logger = logging.getLogger(__name__)
    def xp_1_5(self, tree: HtmlElement):
        """Try the prioritized BODY_XPATH expressions against ``tree``.

        Returns ``(result_body, xp_num, drop_list)`` where ``xp_num`` is the
        1-based index (as a string) of the expression that matched, or
        "others" when none produced an acceptable subtree; ``drop_list``
        reports whether list-like blocks were removed during pruning.
        """
        drop_list = False
        xp_num = "others"
        result_body = Element("body")
        for idx, expr in enumerate(BODY_XPATH):
            try:
                subtree = tree.xpath(expr)[0]
                xp_num = str(idx + 1)
            except IndexError:
                # Expression matched nothing: try the next one.
                continue
            subtree, drop_list = self.prune_unwanted_sections(subtree)
            if len(subtree) == 0:
                xp_num = "others"
                continue
            # Non-link text inside the candidate vs. the whole page.
            ptest = subtree.xpath(".//text()[not(ancestor::a)]")
            ptest_len = text_len("".join(ptest))
            all_text_len = text_len(
                "".join(tree.xpath("//p//text()[not(ancestor::a)]"))
            )
            # Reject candidates that retained too little text relative to the
            # page; the threshold is looser (50) after list pruning.
            if drop_list:
                if ptest_len <= 50:
                    if all_text_len > 100:
                        xp_num = "others"
                        continue
            else:
                if ptest_len <= 20:
                    if all_text_len > 100:
                        xp_num = "others"
                        continue
            # First acceptable candidate wins.
            result_body.append(subtree)
            return result_body, xp_num, drop_list
        return result_body, xp_num, drop_list
def get_content_html(self, cleaned_tree_backup, xp_num="others", base_url=""):
# readability_plus
doc = DocumentPlus(
cleaned_tree_backup,
url=base_url,
xp_num=xp_num,
need_comment=self.need_comment,
)
body = doc.summary(html_partial=True)
return body
def _text_length_from_html(self, html_fragment):
if not html_fragment:
return 0
# 使用 lxml.html.fromstring 解析后提取 text_content
# 不再依赖 w3m
try:
tree = fromstring(html_fragment)
text = tree.text_content()
return len(text or "")
except Exception:
return 0
def _is_content_sufficient(self, html_fragment):
return self._text_length_from_html(html_fragment) >= self.fallback_min_length
def _remove_tables_from_tree(self, tree: HtmlElement) -> HtmlElement:
if self.include_tables:
return tree
for table in list(tree.xpath(".//table")):
parent = table.getparent()
if parent is not None:
parent.remove(table)
return tree
def _strip_tables_from_html(self, html_fragment: str) -> str:
if self.include_tables or not html_fragment:
return html_fragment
try:
wrapper = fromstring(f"
{html_fragment}
")
except Exception:
return html_fragment
self._remove_tables_from_tree(wrapper)
return "".join(tostring(child, encoding=str) for child in wrapper)
def _remove_images_from_tree(self, tree: HtmlElement) -> HtmlElement:
for node in list(tree.xpath(".//img|.//picture|.//source")):
# 在删除IMG之前,检查ALT是否包含LaTeX公式
if node.tag == "img":
alt = node.get("alt", "")
src = node.get("src", "")
if alt:
# URL解码(处理 &space; 等编码)
alt_decoded = unquote(alt.replace('&space;', ' ').replace('\', '\\'))
# 检测ALT是否包含LaTeX特征
is_latex = False
# 1. 以$开头结尾
if alt_decoded.strip().startswith('$') and len(alt_decoded.strip()) > 2:
is_latex = True
# 2. 以\[开头或\]结尾 (display math)
elif alt_decoded.strip().startswith('\\[') or alt_decoded.strip().endswith('\\]'):
is_latex = True
# 3. 包含LaTeX命令 (\frac, \sum, \alpha等)
elif re.search(r'\\[a-zA-Z]+', alt_decoded):
is_latex = True
# 4. 包含上下标
elif re.search(r'\^|_\{|_[a-zA-Z0-9]', alt_decoded):
is_latex = True
# 5. src包含latex相关关键词(作为辅助判断)
elif any(kw in src.lower() for kw in ['latex', 'codecogs', 'math', 'tex', 'equation']):
if len(alt_decoded.strip()) > 1:
is_latex = True
if is_latex:
# 创建span保存LaTeX公式
new_span = Element("span")
# 确保公式被正确包装
if alt_decoded.strip().startswith('$') or alt_decoded.strip().startswith('\\['):
new_span.text = alt_decoded
else:
new_span.text = wrap_math(alt_decoded)
# 在img之前插入span
parent = node.getparent()
if parent is not None:
node.addprevious(new_span)
# 删除图片节点
parent = node.getparent()
if parent is not None:
parent.remove(node)
for html_map in list(tree.xpath(".//map")):
parent = html_map.getparent()
if parent is not None:
parent.remove(html_map)
return tree
def _strip_images_from_html(self, html_fragment: str) -> str:
if not html_fragment:
return html_fragment
try:
wrapper = fromstring(f"{html_fragment}
")
except Exception:
return html_fragment
self._remove_images_from_tree(wrapper)
return "".join(tostring(child, encoding=str) for child in wrapper)
def recover_wild_text(self, tree, base_url="", aggressive=False):
if tree is None:
return None
working_tree = deepcopy(tree)
try:
pruned_tree, _ = self.prune_unwanted_sections(working_tree)
except Exception:
pruned_tree = working_tree
search_expr = ".//p|.//pre|.//code|.//blockquote|.//q|.//quote"
if self.include_tables:
search_expr += "|.//table"
if aggressive:
search_expr += "|.//div|.//section|.//article|.//li"
try:
nodes = pruned_tree.xpath(search_expr)
except Exception:
nodes = []
if not nodes:
return None
container = Element("div")
seen_texts = set()
for node in nodes:
try:
text_value = trim(node.text_content())
except Exception:
text_value = None
if not text_value:
continue
if text_len(text_value) < 10:
continue
if text_value in seen_texts:
continue
seen_texts.add(text_value)
if node.tag == "table":
if self.include_tables:
container.append(deepcopy(node))
continue
else:
paragraph = Element("p")
paragraph.text = text_value
container.append(paragraph)
if len(container) == 0:
return None
return tostring(container, encoding=str)
def readability_fallback(self, tree, base_url=""):
if tree is None:
return None
try:
doc = DocumentPlus(
deepcopy(tree),
url=base_url,
xp_num="others",
need_comment=self.need_comment,
)
return doc.summary(html_partial=True)
except Exception:
return None
def apply_fallbacks(self, primary_html, base_url, normal_tree, raw_tree):
if self._is_content_sufficient(primary_html):
return primary_html, "primary"
wild_html = None
if self.enable_wild_text_fallback:
wild_html = self.recover_wild_text(normal_tree, base_url)
if self._is_content_sufficient(wild_html):
return wild_html, "wild_text"
readability_html = None
if self.enable_readability_fallback:
readability_html = self.readability_fallback(raw_tree, base_url)
if self._is_content_sufficient(readability_html):
return readability_html, "readability"
for candidate, name in (
(primary_html, "primary"),
(wild_html, "wild_text"),
(readability_html, "readability"),
):
if candidate:
return candidate, name
return "", "primary"
    def prune_unwanted_nodes(self, tree, nodelist, with_backup=False):
        """Remove nodes matched by each XPath in ``nodelist`` from ``tree``.

        When ``with_backup`` is True a deep copy is kept first, and it is
        returned instead if pruning removed too much (remaining text shorter
        than 1/7 of the original).
        """
        if with_backup is True:
            old_len = len(tree.text_content())
            backup = deepcopy(tree)
        for expr in nodelist:
            for subtree in tree.xpath(expr):
                # Keep subtrees containing MathML when math must survive.
                if self.preserve_math_containers and subtree.xpath(".//math"):
                    continue
                # DISCARD_IMAGE_ELEMENTS needs special-casing: keep caption
                # containers that still hold an image.
                if '"caption"' in expr and subtree.xpath(".//img"):
                    continue
                # "hidden" matches may be overflow-hidden layout styling rather
                # than truly hidden content -- keep those.
                if "hidden" in expr:
                    try:
                        if re.findall(
                            "overflow-x:\s*hidden", subtree.attrib["style"]
                        ) or re.findall(
                            "overflow-y:\s*hidden", subtree.attrib["style"]
                        ):
                            continue
                        # Tall (>=800px) overflow:hidden boxes are likely real
                        # content clipped by CSS, not hidden junk.
                        if re.findall(
                            "overflow:\s*hidden", subtree.attrib["style"]
                        ) and re.findall("height:", subtree.attrib["style"]):
                            height_px = re.findall(
                                "height:\s*(\d+)", subtree.attrib["style"]
                            )[0]
                            if int(height_px) >= 800:
                                continue
                    except:
                        # Missing/malformed style attribute: fall through to
                        # the normal removal path.
                        pass
                # Never prune nodes nested inside code/pre samples.
                if ancestor_node_check(subtree, ['code', 'pre']):
                    continue
                self.remove_node(subtree)
        if with_backup is False:
            return tree
        # else:
        new_len = len(tree.text_content())
        if new_len > old_len / 7:
            return tree
        return backup
def prune_html(self, tree):
"""Delete selected empty elements"""
for element in tree.xpath(".//*[not(node())]"):
if element.tag in CUT_EMPTY_ELEMS:
self.remove_node(element)
return tree
def remove_node(self, node: HtmlElement):
parent = node.getparent()
if text_strip(node.tail):
previous = node.getprevious()
if previous is None:
if parent is not None:
if text_strip(parent.text):
parent.text = "".join([parent.text, node.tail])
else:
parent.text = node.tail
else:
if text_strip(previous.tail):
previous.tail = "".join([previous.tail, node.tail])
else:
previous.tail = node.tail
if parent is not None:
idx = node.attrib.get(Unique_ID, "")
parent.remove(node)
if idx:
self.drop_ids.append(int(idx))
    def clean_tags(self, tree):
        """Strip comments and noise, then run the HTML cleaner over the tree."""
        strip_elements(tree, Comment)
        xp_lists = []
        if not self.need_comment:
            xp_lists.append(REMOVE_COMMENTS_XPATH)
        xp_lists.append(CONTENT_EXTRACTOR_NOISE_XPATHS)
        for xp_list in xp_lists:
            tree = self.prune_unwanted_nodes(tree, xp_list)
        cleaning_list, stripping_list = (
            MANUALLY_CLEANED.copy(),
            MANUALLY_STRIPPED.copy(),
        )
        # Demote figures wrapping tables to plain divs so the tables survive
        # the cleaner below.
        for elem in tree.xpath(".//figure[descendant::table]"):
            elem.tag = "div"
        for expression in cleaning_list + ["form"]:
            for element in tree.getiterator(expression):
                if self.preserve_math_containers and element.xpath('.//math'):
                    continue
                # Forms get special treatment: keep ones with substantial
                # non-link text.
                if element.tag == "form":
                    ptest = element.xpath(".//text()[not(ancestor::a)]")
                    if text_len("".join(ptest)) <= 60:  # 50
                        self.remove_node(element)
                else:
                    self.remove_node(element)
        HTML_CLEANER.kill_tags, HTML_CLEANER.remove_tags = cleaning_list, stripping_list
        cleaned_tree = HTML_CLEANER.clean_html(self.prune_html(tree))
        return cleaned_tree
def generate_unique_id(self, element):
idx = 0
for node in iter_node(element):
l_tag = node.tag.lower()
if l_tag not in ["html", "body"]:
node.attrib[Unique_ID] = str(idx)
idx += 1
    def clean_unique_id(self, raw_element, content_html):
        """Regroup dropped nodes inside ``raw_element`` and strip id markers.

        ``ids`` are the Unique_ID values still present in the extracted
        content; ``self.drop_ids`` holds ids removed during cleaning.  For
        each kept node, dropped descendants are re-parented under a fresh
        <div> that replaces it, producing a "dropped HTML" document.  Returns
        ``(content_html, drop_html)`` with all Unique_ID attributes removed.
        """
        ids = re.findall(f' {Unique_ID}="(\d+)"', content_html)
        self.drop_ids = list(set(self.drop_ids))
        self.drop_ids.sort()
        # skip_ids[-1] is the highest id already accounted for.
        skip_ids = [-1]
        for x in ids:
            if int(x) > int(skip_ids[-1]):
                skip_ids.append(int(x))
                drop_node = raw_element.xpath(
                    f"//*[@{Unique_ID}='{x}']"
                )
                if drop_node:
                    new_div = Element("div")
                    for j in self.drop_ids:
                        if int(j) > int(skip_ids[-1]):
                            append_element = drop_node[0].xpath(
                                f".//*[@{Unique_ID}='{j}']"
                            )
                            if append_element:
                                skip_ids.append(j)
                                # Also mark every descendant id as handled so
                                # they are not reprocessed.
                                if len(append_element[0]) > 0:
                                    skip_ids.extend(
                                        [
                                            int(pjid)
                                            for pjid in append_element[0].xpath(
                                                f".//*/@{Unique_ID}"
                                            )
                                        ]
                                    )
                                append_element[0].tail = None
                                new_div.append(append_element[0])
                    try:
                        # Swap the kept node for the div of dropped content.
                        drop_node[0].addnext(new_div)
                        parent = drop_node[0].getparent()
                        if parent is not None:
                            parent.remove(drop_node[0])
                    except:
                        pass
        content_html = re.sub(f' {Unique_ID}="\d+"', "", content_html)
        drop_html = re.sub(
            f' {Unique_ID}="\d+"',
            "",
            tostring(raw_element, encoding=str),
        )
        return content_html, drop_html
def math_latex_processing(self, node):
# 1. 文本中有\\begin{align} 或 \\begin{equation}
if node.tag not in ["script", "style"] and text_strip(node.text):
regex = r"\\begin{align}(.*?)\\end{align}"
text = node.text
matches = re.findall(regex, text, re.DOTALL)
if matches:
node.text = text.replace("\\begin{align}", "").replace(
"\\end{align}", ""
)
if node.tag not in ["script", "style"] and text_strip(node.text):
regex = r"\\begin{equation}(.*?)\\end{equation}"
text = node.text
matches = re.findall(regex, text, re.DOTALL)
for match in matches:
match = match.replace("\\begin{equation}", "")
match = match.replace("\\end{equation}", "")
wrapped_text = wrap_math(match, display=True)
text = text.replace(match, wrapped_text)
if matches:
# Remove the \begin{equation} and \end{equation} tags
text = text.replace("\\begin{equation}", "").replace(
"\\end{equation}", ""
)
node.text = text
if node.tag not in ["script", "style"] and text_strip(node.tail):
regex = r"\\begin{align}(.*?)\\end{align}"
text = node.tail
matches = re.findall(regex, text, re.DOTALL)
if matches:
node.tail = text.replace("\\begin{align}", "").replace(
"\\end{align}", ""
)
if node.tag not in ["script", "style"] and text_strip(node.tail):
regex = r"\\begin{equation}(.*?)\\end{equation}"
text = node.tail
matches = re.findall(regex, text, re.DOTALL)
for match in matches:
match = match.replace("\\begin{equation}", "")
match = match.replace("\\end{equation}", "")
wrapped_text = wrap_math(match, display=True)
text = text.replace(match, wrapped_text)
if matches:
# Remove the \begin{equation} and \end{equation} tags
text = text.replace("\\begin{equation}", "").replace(
"\\end{equation}", ""
)
node.tail = text
node_class = node.get("class")
parent = node.getparent()
# 2. class 为 texerror 的标签
# Find the text between {} (maximum length) and replace the texerror with that text
# 3. img中的latex
if node.tag == "img":
if node_class:
class_list = node_class.split(" ")
if any(
[img_class in class_list for img_class in latex_image_class_names]
):
alt = node.get("alt")
if text_strip(alt):
new_span = Element("span")
wrapped_alt = wrap_math(alt)
new_span.text = wrapped_alt
node.addprevious(new_span)
self.remove_node(node)
src = node.get("src")
if src:
if "codecogs.com" in src:
try:
latex = src.split("?")[1:]
latex = "?".join(
latex
) # In case there are multiple ? in the latex
latex = unquote(latex)
new_span = Element("span")
wrapped_latex = wrap_math(latex)
new_span.text = wrapped_latex
node.addprevious(new_span)
self.remove_node(node)
except:
pass
if "latex.php" in src:
try:
# they usually have "alt='-i u_t + \Delta u = |u|^2 u'"
alt = node.get("alt")
if text_strip(alt):
# Unescape the latex
alt = unquote(alt)
# Get the latex
wrapped_alt = wrap_math(alt)
new_span = Element("span")
new_span.text = wrapped_alt
node.addprevious(new_span)
self.remove_node(node)
except:
pass
if "/images/math/codecogs" in src:
try:
# they usually have "alt='-i u_t + \Delta u = |u|^2 u'"
alt = node.get("alt")
if text_strip(alt):
# Unescape the latex
alt = unquote(alt)
# Get the latex
wrapped_alt = wrap_math(alt)
new_span = Element("span")
new_span.text = wrapped_alt
node.addprevious(new_span)
self.remove_node(node)
except:
pass
if "mimetex.cgi" in src:
try:
latex = src.split("?")[1:]
latex = "?".join(
latex
) # In case there are multiple ? in the latex
latex = unquote(latex)
new_span = Element("span")
wrapped_latex = wrap_math(latex)
new_span.text = wrapped_latex
node.addprevious(new_span)
self.remove_node(node)
except:
pass
if "mathtex.cgi" in src:
try:
latex = src.split("?")[1:]
latex = "?".join(
latex
) # In case there are multiple ? in the latex
latex = unquote(latex)
new_span = Element("span")
wrapped_latex = wrap_math(latex)
new_span.text = wrapped_latex
node.addprevious(new_span)
self.remove_node(node)
except:
pass
if node_class:
if "x-ck12" in node_class:
try:
latex = node.get("alt")
if text_strip(latex):
latex = unquote(latex)
new_span = Element("span")
wrapped_latex = wrap_math(latex)
new_span.text = wrapped_latex
node.addprevious(new_span)
except:
pass
# 4. class 为 math-container
if node_class == "math-container":
try:
text = node.text
if text_strip(text):
new_span = Element("span")
wrapped_math = wrap_math(text, display=True)
new_span.text = wrapped_math
if parent is not None:
if text_strip(node.tail):
new_span.tail = node.tail
parent.replace(node, new_span)
except:
pass
# 5. class 为 wp-katex-eq
if node_class == "wp-katex-eq":
try:
text = node.text
if text_strip(text):
new_span = Element("span")
display_attr = node.get("data-display")
if display_attr is not None:
display = display_attr == "true"
else:
display = False
wrapped_math = wrap_math(text, display=display)
new_span.text = wrapped_math
if parent is not None:
if text_strip(node.tail):
new_span.tail = node.tail
parent.replace(node, new_span)
except:
pass
# 6. script[type="math/tex"]
if node.tag == "script" and node.get("type") == "math/tex":
try:
text = node.text
if text_strip(text):
new_span = Element("span")
wrapped_text = wrap_math(text)
new_span.text = wrapped_text
if parent is not None:
if text_strip(node.tail):
new_span.tail = node.tail
parent.replace(node, new_span)
except:
pass
# 7. script[type="math/asciimath"]
if node.tag == "script" and node.get("type") == "math/asciimath":
try:
text = node.text
if text_strip(text):
new_span = Element("span")
wrapped_asciimath = wrap_math(extract_asciimath(text))
new_span.text = wrapped_asciimath
if parent is not None:
if text_strip(node.tail):
new_span.tail = node.tail
parent.replace(node, new_span)
except:
# Delete this script tag
self.remove_node(node)
# 8. class tex
if node_class == "tex":
try:
# Check if they have data-expr attr
expr = node.get("data-expr")
if text_strip(expr):
# Replace with a span
new_span = Element("span")
wrapped_expr = wrap_math(expr)
new_span.text = wrapped_expr
if parent is not None:
if text_strip(node.tail):
new_span.tail = node.tail
parent.replace(node, new_span)
except:
pass
# 9. span.katex
if node.tag == "span" and node_class == "katex":
# Find any spans with class "katex-html" and remove them
katex_html_spans = node.xpath('.//span[@class="katex-html"]')
for katex_html_span in katex_html_spans:
self.remove_node(katex_html_span)
# 10. Remove any .MathJax_Preview spans
if node.tag == "span" and node_class == "MathJax_Preview":
self.remove_node(node)
if node.tag == "span" and node_class and "x-ck12-mathEditor" in node_class:
try:
expr = node.get("data-tex")
if text_strip(expr):
expr = unquote(expr).replace("\"", "").replace(""", "")
# Replace with a span
new_span = Element("span")
wrapped_expr = wrap_math(expr)
new_span.text = wrapped_expr
if parent is not None:
if text_strip(node.tail):
new_span.tail = node.tail
parent.replace(node, new_span)
except:
pass
# 11. all math tags
if node.tag == "math":
annotation_tags = node.xpath('.//annotation[@encoding="application/x-tex"]')
if len(annotation_tags) > 0:
annotation_tag = annotation_tags[0]
text = annotation_tag.text
if text_strip(text):
new_span = Element("span")
wrapped_text = wrap_math(text)
new_span.text = wrapped_text
if parent is not None:
if text_strip(node.tail):
new_span.tail = node.tail
parent.replace(node, new_span)
style_value = parent.get("style")
if style_value:
normalized_style_value = (
style_value.lower()
.strip()
.replace(" ", "")
.replace(";", "")
)
if "display:none" in normalized_style_value:
parent.style = ""
elif text_strip(node.get("alttext")):
# Get the alttext attribute
alttext = node.get("alttext")
if text_strip(alttext):
new_span = Element("span")
wrapped_alttext = wrap_math(alttext)
new_span.text = wrapped_alttext
if parent is not None:
if text_strip(node.tail):
new_span.tail = node.tail
parent.replace(node, new_span)
else:
try:
# Try translating to LaTeX
tmp_node = deepcopy(node)
tmp_node.tail = None
mathml = tostring(tmp_node, encoding=str)
# If this includes xmlns:mml, then we need to replace all
# instances of mml: with nothing
if "xmlns:mml" in mathml:
mathml = mathml.replace("mml:", "")
# replace xmlns:mml="..." with nothing
mathml = re.sub(r'xmlns:mml=".*?"', "", mathml)
# if 'xmlns=' in mathml:
# mathml = re.sub(r"xmlns='.*?'", '', mathml)
latex = mml_to_latex(mathml)
# Make a new span tag
new_span = Element("span")
# Set the html of the new span tag to the text
wrapped_latex = wrap_math(latex)
new_span.text = wrapped_latex
# Then, we need to replace the math tag with the new span tag
if parent is not None:
if text_strip(node.tail):
new_span.tail = node.tail
parent.replace(node, new_span)
except:
self.remove_node(node)
if node.tag == "mathjax":
try:
# Get the inner text of the mathjax tag
text = node.text
if text_strip(text):
text = html.unescape(text)
# Use regex to find text wrapped in hashes
matches = re.findall(r"#(.+?)#", text)
# For each match, replace the match with the LaTeX
for match in matches:
try:
latex = extract_asciimath(match)
# Replace the match with the LaTeX
text = text.replace(f"#{match}#", latex)
except:
pass
# Create a new span tag
new_span = Element("span")
# Set the html of the new span tag to the text
new_span.text = text
# Then, we need to replace the mathjax tag with the new span tag
if parent is not None:
if text_strip(node.tail):
new_span.tail = node.tail
parent.replace(node, new_span)
except:
pass
def convert_tags(self, element, base_url=""):
USELESS_ATTR_LIST = USELESS_ATTR
if not self.need_comment:
USELESS_ATTR_LIST = USELESS_ATTR_LIST + ["comment"]
for node in iter_node(element):
if self.process_math:
# 增加数学标签转换
self.math_latex_processing(node)
if "data-src" in node.attrib and "src" not in node.attrib:
node.attrib["src"] = node.attrib["data-src"]
if "src" in node.attrib and node.attrib["src"] and base_url:
src_url = node.attrib["src"]
absolute_url = urljoin(base_url, src_url)
node.attrib["src"] = absolute_url
if node.tag.lower() == "div" and not node.getchildren():
node.tag = "p"
class_name = node.get("class")
if class_name:
if class_name.lower() in USELESS_ATTR_LIST:
self.remove_node(node)
return element
    def delete_by_link_density(
        self, subtree, tagname, backtracking=False, favor_precision=False
    ):
        """Remove list-like / link-heavy groups of ``tagname`` siblings.

        Two phases: (1) sibling-group analysis -- groups that look like
        article listings ("read more" tails, high link density, high mutual
        similarity) are deleted wholesale; (2) per-element link-density tests
        with optional backtracking over repeated short texts.  Returns
        ``(subtree, drop_list)`` where ``drop_list`` reports that a listing
        was removed.
        """
        need_del_par = []
        skip_par = []
        drop_list = False
        for descendant in subtree.iter(tagname):
            pparent = descendant.getparent()
            # Skip elements whose parent was already scheduled or inspected.
            if pparent in need_del_par or pparent in skip_par:
                continue
            siblings = descendant.xpath(f"following-sibling::{tagname}")
            # A "list"-classed element with many direct links is navigation.
            if 'list' in descendant.get("class", "") and len(descendant.xpath('./a')) >= 5:
                need_del_par.append(descendant)
                need_del_par.extend(siblings)
                continue
            nn = [descendant]
            nn.extend(siblings)
            txt_max_num = 0
            if len(siblings) + 1 >= 4:
                # Enough siblings to analyze directly.
                pass
            else:
                # Few siblings: only proceed when several of them end with a
                # "read more"-style teaser suffix.
                txt_max_dict = {
                    "read": 0,
                    "more": 0,
                    "...": 0,
                    "阅读": 0,
                    "更多": 0,
                    "详细": 0,
                    "detail": 0,
                    "article": 0,
                    "blog": 0,
                    "news": 0,
                }
                if tagname == "div" or tagname == "article" or tagname == "section":
                    for j in nn:
                        txt = "".join(j.xpath(".//text()")).strip()
                        for x in [
                            "read",
                            "more",
                            "...",
                            "阅读",
                            "更多",
                            "详细",
                            "detail",
                            "article",
                            "blog",
                            "news",
                        ]:
                            if txt.lower().endswith(x):
                                txt_max_dict[x] += 1
                        txt_num = max(txt_max_dict.values())
                        if txt_max_num < txt_num:
                            txt_max_num = txt_num
                        if txt_max_num >= 3:
                            break
                if txt_max_num >= 3:
                    pass
                else:
                    continue
            skip_par.append(pparent)
            # Count link-dense siblings.
            a_num = 0
            for j in siblings:
                if j.xpath(".//a"):
                    if tagname == "p":
                        if density_of_a_text(j, pre=0.8):
                            a_num += 1
                    elif tagname in ["div", "section", "article"]:
                        if density_of_a_text(j, pre=0.2):
                            a_num += 1
                    else:
                        if self.need_comment:
                            # Keep siblings that look like comment sections.
                            break_flg = False
                            for c_xpath in Forum_XPATH[:-1]:
                                if j.xpath(c_xpath.replace(".//*", "self::*")):
                                    break_flg = True
                                    break
                            if break_flg:
                                continue
                        if tagname == "li":
                            # List items with substantial non-link text stay.
                            if text_len("".join(j.xpath(".//text()[not(ancestor::a)]"))) > 50:
                                continue
                        a_num += 1
            if a_num < len(siblings):
                if a_num >= 15 and (
                    tagname == "div" or tagname == "article" or tagname == "section"
                ):
                    pass
                else:
                    continue
            similarity_with_siblings_nums = similarity_with_siblings(
                descendant, siblings
            )
            if tagname == "article" or tagname == "item":  # or tagname == "section"
                similarity_with_siblings_nums = similarity_with_siblings_nums * 1.5
            # Listings are also recognizable when most siblings carry their
            # own heading (h1/h2) or nested article.
            if tagname == "div" or tagname == "article" or tagname == "section":
                title_max_num = 0
                for ll in [".//head[@rend='h2']", ".//head[@rend='h1']", "./article"]:
                    title_num = 0
                    for jj in nn:
                        if jj.xpath(ll):
                            title_num += 1
                    if title_max_num < title_num:
                        title_max_num = title_num
                if title_max_num >= 4:
                    similarity_with_siblings_nums = similarity_with_siblings_nums * 1.5
            if txt_max_num >= 3:
                pass
            elif similarity_with_siblings_nums < 0.84:
                if len(siblings) >= 15 and (
                    tagname == "div" or tagname == "article" or tagname == "section"
                ):
                    pass
                else:
                    continue
            # When the parent holds multiple post-/blog-classed siblings, keep
            # the (longest) first article and delete the rest.
            class_attr = descendant.get("class") if descendant.get("class") else ""
            if (
                re.findall("post-", class_attr, re.I)
                or re.findall("-post", class_attr, re.I)
                or re.findall("blog|aricle", class_attr, re.I)
            ):
                drop_list = True
                sk_flg = True
                for dl in siblings:
                    if (
                        text_len("".join(descendant.xpath(".//text()"))) * 2
                        < text_len("".join(dl.xpath(".//text()")))
                        and sk_flg
                    ):
                        self.remove_node(descendant)
                        sk_flg = False
                    else:
                        self.remove_node(dl)
            else:
                need_del_par.append(descendant)
                need_del_par.extend(siblings)
        for node in need_del_par:
            drop_list = True
            try:
                self.remove_node(node)
            except Exception as e:
                pass
        # Phase 2: per-element link-density pruning.
        myelems, deletions = defaultdict(list), []
        if tagname == "div":
            for elem in subtree.iter(tagname):
                if density_of_a_text(elem, pre=0.8) and img_div_check(elem):
                    deletions.append(elem)
        for elem in subtree.iter(tagname):
            elemtext = trim(elem.text_content())
            result, templist = link_density_test(elem, elemtext, favor_precision)
            if result is True and img_div_check(elem):
                # Keep links that live inside table cells.
                if tagname in ['ul', 'li', 'div', 'p'] and ancestor_node_check(elem, ['td']):
                    continue
                deletions.append(elem)
            elif backtracking is True and len(templist) > 0:  # if?
                myelems[elemtext].append(elem)
        if backtracking is True:
            if favor_precision is False:
                threshold = 100
            else:
                threshold = 200
            # Repeated short texts (>=3 occurrences) are boilerplate.
            for text, elem in myelems.items():
                if 0 < len(text) < threshold and len(elem) >= 3:
                    deletions.extend(elem)
        for elem in uniquify_list(deletions):
            try:
                if self.need_comment:
                    # Keep elements that look like comment sections.
                    break_flg = False
                    for c_xpath in Forum_XPATH[:-1]:
                        if elem.xpath(c_xpath):
                            break_flg = True
                            break
                    if break_flg:
                        continue
                self.remove_node(elem)
            except AttributeError:
                pass
        return subtree, drop_list
def prune_unwanted_sections(self, tree):
tmp_OVERALL_DISCARD_XPATH = OVERALL_DISCARD_XPATH
if self.need_comment:
tmp_OVERALL_DISCARD_XPATH = tmp_OVERALL_DISCARD_XPATH[:-1]
tree = self.prune_unwanted_nodes(
tree, tmp_OVERALL_DISCARD_XPATH, with_backup=True
)
for xp_list in [
PAYWALL_DISCARD_XPATH,
TEASER_DISCARD_XPATH,
DISCARD_IMAGE_ELEMENTS,
]:
tree = self.prune_unwanted_nodes(tree, xp_list)
# remove elements by link density
tree, drop_list_1 = self.delete_by_link_density(
tree, "div", backtracking=True, favor_precision=False
)
tree, drop_list_1_1 = self.delete_by_link_density(
tree, "article", backtracking=False, favor_precision=False
)
tree, drop_list_1_2 = self.delete_by_link_density(
tree, "section", backtracking=False, favor_precision=False
)
tree, drop_list_2_1 = self.delete_by_link_density(
tree, "ul", backtracking=False, favor_precision=False
)
tree, drop_list_2_2 = self.delete_by_link_density(
tree, "li", backtracking=False, favor_precision=False
)
tree, drop_list_3_1 = self.delete_by_link_density(
tree, "dl", backtracking=False, favor_precision=False
)
tree, drop_list_3_3 = self.delete_by_link_density(
tree, "dt", backtracking=False, favor_precision=False
)
tree, drop_list_3_2 = self.delete_by_link_density(
tree, "dd", backtracking=False, favor_precision=False
)
tree, drop_list_3 = self.delete_by_link_density(
tree, "p", backtracking=False, favor_precision=False
)
return (
tree,
drop_list_1
or drop_list_2_1
or drop_list_2_2
or drop_list_3
or drop_list_1_1
or drop_list_1_2
or drop_list_3_1
or drop_list_3_2
or drop_list_3_3,
)