# -*- coding:utf-8 -*-
import html
import logging
from collections import defaultdict
from copy import deepcopy
from urllib.parse import unquote, urljoin

from lxml.etree import Comment, strip_elements

from ultradata_math_parser.config import *
from ultradata_math_parser.readability_plus import Document as DocumentPlus
from ultradata_math_parser.utils import *


class BaseParser:
    """Main-content extractor for math-heavy HTML pages.

    Combines XPath-based body detection (``xp_1_5``), link-density pruning,
    LaTeX/MathML normalisation, and readability-style fallbacks.
    """

    def __init__(self):
        # Unique_ID values of nodes detached by remove_node().
        self.drop_ids = []
        # When True, comment sections are preserved instead of pruned.
        self.need_comment = False
        # Master switch for math normalisation in convert_tags().
        self.process_math = True
        # Never prune a subtree that still contains a <math> element.
        self.preserve_math_containers = True
        self.include_tables = True
        self.include_images = False
        # Minimum extracted text length before fallbacks kick in.
        self.fallback_min_length = 250
        self.enable_wild_text_fallback = True
        self.enable_readability_fallback = True
        self._logger = logging.getLogger(__name__)

    def xp_1_5(self, tree: HtmlElement):
        """Try each expression in BODY_XPATH and keep the first subtree that
        survives pruning and still carries enough non-anchor text.

        Returns:
            (result_body, xp_num, drop_list) where ``xp_num`` is the 1-based
            index (as str) of the matching expression, or "others" when no
            expression produced an acceptable body.
        """
        drop_list = False
        xp_num = "others"
        result_body = Element("body")
        for idx, expr in enumerate(BODY_XPATH):
            try:
                subtree = tree.xpath(expr)[0]
                xp_num = str(idx + 1)
            except IndexError:
                continue
            subtree, drop_list = self.prune_unwanted_sections(subtree)
            if len(subtree) == 0:
                xp_num = "others"
                continue
            ptest = subtree.xpath(".//text()[not(ancestor::a)]")
            ptest_len = text_len("".join(ptest))
            # Non-anchor paragraph text of the whole page; recomputed per
            # iteration because prune_unwanted_sections mutates the tree.
            all_text_len = text_len(
                "".join(tree.xpath("//p//text()[not(ancestor::a)]"))
            )
            # Reject a nearly text-free candidate when the page as a whole
            # still has a meaningful amount of paragraph text.
            if drop_list:
                if ptest_len <= 50:
                    if all_text_len > 100:
                        xp_num = "others"
                        continue
            else:
                if ptest_len <= 20:
                    if all_text_len > 100:
                        xp_num = "others"
                        continue
            result_body.append(subtree)
            return result_body, xp_num, drop_list
        return result_body, xp_num, drop_list

    def get_content_html(self, cleaned_tree_backup, xp_num="others", base_url=""):
        """Run readability_plus over the cleaned tree and return body HTML."""
        doc = DocumentPlus(
            cleaned_tree_backup,
            url=base_url,
            xp_num=xp_num,
            need_comment=self.need_comment,
        )
        body = doc.summary(html_partial=True)
        return body

    def _text_length_from_html(self, html_fragment):
        """Return the visible-text length of an HTML fragment (0 on error)."""
        if not html_fragment:
            return 0
        # Parse with lxml and use text_content(); no external tools (w3m)
        # are required any more.
        try:
            tree = fromstring(html_fragment)
            text = tree.text_content()
            return len(text or "")
        except Exception:
            return 0

    def _is_content_sufficient(self, html_fragment):
        """True when the fragment's text reaches ``fallback_min_length``."""
        return self._text_length_from_html(html_fragment) >= self.fallback_min_length
_is_content_sufficient(self, html_fragment): return self._text_length_from_html(html_fragment) >= self.fallback_min_length def _remove_tables_from_tree(self, tree: HtmlElement) -> HtmlElement: if self.include_tables: return tree for table in list(tree.xpath(".//table")): parent = table.getparent() if parent is not None: parent.remove(table) return tree def _strip_tables_from_html(self, html_fragment: str) -> str: if self.include_tables or not html_fragment: return html_fragment try: wrapper = fromstring(f"
{html_fragment}
") except Exception: return html_fragment self._remove_tables_from_tree(wrapper) return "".join(tostring(child, encoding=str) for child in wrapper) def _remove_images_from_tree(self, tree: HtmlElement) -> HtmlElement: for node in list(tree.xpath(".//img|.//picture|.//source")): # 在删除IMG之前,检查ALT是否包含LaTeX公式 if node.tag == "img": alt = node.get("alt", "") src = node.get("src", "") if alt: # URL解码(处理 &space; 等编码) alt_decoded = unquote(alt.replace('&space;', ' ').replace('\', '\\')) # 检测ALT是否包含LaTeX特征 is_latex = False # 1. 以$开头结尾 if alt_decoded.strip().startswith('$') and len(alt_decoded.strip()) > 2: is_latex = True # 2. 以\[开头或\]结尾 (display math) elif alt_decoded.strip().startswith('\\[') or alt_decoded.strip().endswith('\\]'): is_latex = True # 3. 包含LaTeX命令 (\frac, \sum, \alpha等) elif re.search(r'\\[a-zA-Z]+', alt_decoded): is_latex = True # 4. 包含上下标 elif re.search(r'\^|_\{|_[a-zA-Z0-9]', alt_decoded): is_latex = True # 5. src包含latex相关关键词(作为辅助判断) elif any(kw in src.lower() for kw in ['latex', 'codecogs', 'math', 'tex', 'equation']): if len(alt_decoded.strip()) > 1: is_latex = True if is_latex: # 创建span保存LaTeX公式 new_span = Element("span") # 确保公式被正确包装 if alt_decoded.strip().startswith('$') or alt_decoded.strip().startswith('\\['): new_span.text = alt_decoded else: new_span.text = wrap_math(alt_decoded) # 在img之前插入span parent = node.getparent() if parent is not None: node.addprevious(new_span) # 删除图片节点 parent = node.getparent() if parent is not None: parent.remove(node) for html_map in list(tree.xpath(".//map")): parent = html_map.getparent() if parent is not None: parent.remove(html_map) return tree def _strip_images_from_html(self, html_fragment: str) -> str: if not html_fragment: return html_fragment try: wrapper = fromstring(f"
{html_fragment}
") except Exception: return html_fragment self._remove_images_from_tree(wrapper) return "".join(tostring(child, encoding=str) for child in wrapper) def recover_wild_text(self, tree, base_url="", aggressive=False): if tree is None: return None working_tree = deepcopy(tree) try: pruned_tree, _ = self.prune_unwanted_sections(working_tree) except Exception: pruned_tree = working_tree search_expr = ".//p|.//pre|.//code|.//blockquote|.//q|.//quote" if self.include_tables: search_expr += "|.//table" if aggressive: search_expr += "|.//div|.//section|.//article|.//li" try: nodes = pruned_tree.xpath(search_expr) except Exception: nodes = [] if not nodes: return None container = Element("div") seen_texts = set() for node in nodes: try: text_value = trim(node.text_content()) except Exception: text_value = None if not text_value: continue if text_len(text_value) < 10: continue if text_value in seen_texts: continue seen_texts.add(text_value) if node.tag == "table": if self.include_tables: container.append(deepcopy(node)) continue else: paragraph = Element("p") paragraph.text = text_value container.append(paragraph) if len(container) == 0: return None return tostring(container, encoding=str) def readability_fallback(self, tree, base_url=""): if tree is None: return None try: doc = DocumentPlus( deepcopy(tree), url=base_url, xp_num="others", need_comment=self.need_comment, ) return doc.summary(html_partial=True) except Exception: return None def apply_fallbacks(self, primary_html, base_url, normal_tree, raw_tree): if self._is_content_sufficient(primary_html): return primary_html, "primary" wild_html = None if self.enable_wild_text_fallback: wild_html = self.recover_wild_text(normal_tree, base_url) if self._is_content_sufficient(wild_html): return wild_html, "wild_text" readability_html = None if self.enable_readability_fallback: readability_html = self.readability_fallback(raw_tree, base_url) if self._is_content_sufficient(readability_html): return readability_html, "readability" 
for candidate, name in ( (primary_html, "primary"), (wild_html, "wild_text"), (readability_html, "readability"), ): if candidate: return candidate, name return "", "primary" def prune_unwanted_nodes(self, tree, nodelist, with_backup=False): if with_backup is True: old_len = len(tree.text_content()) backup = deepcopy(tree) for expr in nodelist: for subtree in tree.xpath(expr): if self.preserve_math_containers and subtree.xpath(".//math"): continue # DISCARD_IMAGE_ELEMENTS 需要特殊判断 if '"caption"' in expr and subtree.xpath(".//img"): continue # 有些出现hidden if "hidden" in expr: try: if re.findall( "overflow-x:\s*hidden", subtree.attrib["style"] ) or re.findall( "overflow-y:\s*hidden", subtree.attrib["style"] ): continue if re.findall( "overflow:\s*hidden", subtree.attrib["style"] ) and re.findall("height:", subtree.attrib["style"]): height_px = re.findall( "height:\s*(\d+)", subtree.attrib["style"] )[0] if int(height_px) >= 800: continue except: pass if ancestor_node_check(subtree, ['code', 'pre']): continue self.remove_node(subtree) if with_backup is False: return tree # else: new_len = len(tree.text_content()) if new_len > old_len / 7: return tree return backup def prune_html(self, tree): """Delete selected empty elements""" for element in tree.xpath(".//*[not(node())]"): if element.tag in CUT_EMPTY_ELEMS: self.remove_node(element) return tree def remove_node(self, node: HtmlElement): parent = node.getparent() if text_strip(node.tail): previous = node.getprevious() if previous is None: if parent is not None: if text_strip(parent.text): parent.text = "".join([parent.text, node.tail]) else: parent.text = node.tail else: if text_strip(previous.tail): previous.tail = "".join([previous.tail, node.tail]) else: previous.tail = node.tail if parent is not None: idx = node.attrib.get(Unique_ID, "") parent.remove(node) if idx: self.drop_ids.append(int(idx)) def clean_tags(self, tree): strip_elements(tree, Comment) xp_lists = [] if not self.need_comment: 
xp_lists.append(REMOVE_COMMENTS_XPATH) xp_lists.append(CONTENT_EXTRACTOR_NOISE_XPATHS) for xp_list in xp_lists: tree = self.prune_unwanted_nodes(tree, xp_list) cleaning_list, stripping_list = ( MANUALLY_CLEANED.copy(), MANUALLY_STRIPPED.copy(), ) for elem in tree.xpath(".//figure[descendant::table]"): elem.tag = "div" for expression in cleaning_list + ["form"]: for element in tree.getiterator(expression): if self.preserve_math_containers and element.xpath('.//math'): continue # 针对form 标签特殊处理 if element.tag == "form": ptest = element.xpath(".//text()[not(ancestor::a)]") if text_len("".join(ptest)) <= 60: # 50 self.remove_node(element) else: self.remove_node(element) HTML_CLEANER.kill_tags, HTML_CLEANER.remove_tags = cleaning_list, stripping_list cleaned_tree = HTML_CLEANER.clean_html(self.prune_html(tree)) return cleaned_tree def generate_unique_id(self, element): idx = 0 for node in iter_node(element): l_tag = node.tag.lower() if l_tag not in ["html", "body"]: node.attrib[Unique_ID] = str(idx) idx += 1 def clean_unique_id(self, raw_element, content_html): ids = re.findall(f' {Unique_ID}="(\d+)"', content_html) self.drop_ids = list(set(self.drop_ids)) self.drop_ids.sort() skip_ids = [-1] for x in ids: if int(x) > int(skip_ids[-1]): skip_ids.append(int(x)) drop_node = raw_element.xpath( f"//*[@{Unique_ID}='{x}']" ) if drop_node: new_div = Element("div") for j in self.drop_ids: if int(j) > int(skip_ids[-1]): append_element = drop_node[0].xpath( f".//*[@{Unique_ID}='{j}']" ) if append_element: skip_ids.append(j) if len(append_element[0]) > 0: skip_ids.extend( [ int(pjid) for pjid in append_element[0].xpath( f".//*/@{Unique_ID}" ) ] ) append_element[0].tail = None new_div.append(append_element[0]) try: drop_node[0].addnext(new_div) parent = drop_node[0].getparent() if parent is not None: parent.remove(drop_node[0]) except: pass content_html = re.sub(f' {Unique_ID}="\d+"', "", content_html) drop_html = re.sub( f' {Unique_ID}="\d+"', "", tostring(raw_element, 
encoding=str), ) return content_html, drop_html def math_latex_processing(self, node): # 1. 文本中有\\begin{align} 或 \\begin{equation} if node.tag not in ["script", "style"] and text_strip(node.text): regex = r"\\begin{align}(.*?)\\end{align}" text = node.text matches = re.findall(regex, text, re.DOTALL) if matches: node.text = text.replace("\\begin{align}", "").replace( "\\end{align}", "" ) if node.tag not in ["script", "style"] and text_strip(node.text): regex = r"\\begin{equation}(.*?)\\end{equation}" text = node.text matches = re.findall(regex, text, re.DOTALL) for match in matches: match = match.replace("\\begin{equation}", "") match = match.replace("\\end{equation}", "") wrapped_text = wrap_math(match, display=True) text = text.replace(match, wrapped_text) if matches: # Remove the \begin{equation} and \end{equation} tags text = text.replace("\\begin{equation}", "").replace( "\\end{equation}", "" ) node.text = text if node.tag not in ["script", "style"] and text_strip(node.tail): regex = r"\\begin{align}(.*?)\\end{align}" text = node.tail matches = re.findall(regex, text, re.DOTALL) if matches: node.tail = text.replace("\\begin{align}", "").replace( "\\end{align}", "" ) if node.tag not in ["script", "style"] and text_strip(node.tail): regex = r"\\begin{equation}(.*?)\\end{equation}" text = node.tail matches = re.findall(regex, text, re.DOTALL) for match in matches: match = match.replace("\\begin{equation}", "") match = match.replace("\\end{equation}", "") wrapped_text = wrap_math(match, display=True) text = text.replace(match, wrapped_text) if matches: # Remove the \begin{equation} and \end{equation} tags text = text.replace("\\begin{equation}", "").replace( "\\end{equation}", "" ) node.tail = text node_class = node.get("class") parent = node.getparent() # 2. class 为 texerror 的标签 # Find the text between {} (maximum length) and replace the texerror with that text # 3. 
img中的latex if node.tag == "img": if node_class: class_list = node_class.split(" ") if any( [img_class in class_list for img_class in latex_image_class_names] ): alt = node.get("alt") if text_strip(alt): new_span = Element("span") wrapped_alt = wrap_math(alt) new_span.text = wrapped_alt node.addprevious(new_span) self.remove_node(node) src = node.get("src") if src: if "codecogs.com" in src: try: latex = src.split("?")[1:] latex = "?".join( latex ) # In case there are multiple ? in the latex latex = unquote(latex) new_span = Element("span") wrapped_latex = wrap_math(latex) new_span.text = wrapped_latex node.addprevious(new_span) self.remove_node(node) except: pass if "latex.php" in src: try: # they usually have "alt='-i u_t + \Delta u = |u|^2 u'" alt = node.get("alt") if text_strip(alt): # Unescape the latex alt = unquote(alt) # Get the latex wrapped_alt = wrap_math(alt) new_span = Element("span") new_span.text = wrapped_alt node.addprevious(new_span) self.remove_node(node) except: pass if "/images/math/codecogs" in src: try: # they usually have "alt='-i u_t + \Delta u = |u|^2 u'" alt = node.get("alt") if text_strip(alt): # Unescape the latex alt = unquote(alt) # Get the latex wrapped_alt = wrap_math(alt) new_span = Element("span") new_span.text = wrapped_alt node.addprevious(new_span) self.remove_node(node) except: pass if "mimetex.cgi" in src: try: latex = src.split("?")[1:] latex = "?".join( latex ) # In case there are multiple ? in the latex latex = unquote(latex) new_span = Element("span") wrapped_latex = wrap_math(latex) new_span.text = wrapped_latex node.addprevious(new_span) self.remove_node(node) except: pass if "mathtex.cgi" in src: try: latex = src.split("?")[1:] latex = "?".join( latex ) # In case there are multiple ? 
in the latex latex = unquote(latex) new_span = Element("span") wrapped_latex = wrap_math(latex) new_span.text = wrapped_latex node.addprevious(new_span) self.remove_node(node) except: pass if node_class: if "x-ck12" in node_class: try: latex = node.get("alt") if text_strip(latex): latex = unquote(latex) new_span = Element("span") wrapped_latex = wrap_math(latex) new_span.text = wrapped_latex node.addprevious(new_span) except: pass # 4. class 为 math-container if node_class == "math-container": try: text = node.text if text_strip(text): new_span = Element("span") wrapped_math = wrap_math(text, display=True) new_span.text = wrapped_math if parent is not None: if text_strip(node.tail): new_span.tail = node.tail parent.replace(node, new_span) except: pass # 5. class 为 wp-katex-eq if node_class == "wp-katex-eq": try: text = node.text if text_strip(text): new_span = Element("span") display_attr = node.get("data-display") if display_attr is not None: display = display_attr == "true" else: display = False wrapped_math = wrap_math(text, display=display) new_span.text = wrapped_math if parent is not None: if text_strip(node.tail): new_span.tail = node.tail parent.replace(node, new_span) except: pass # 6. script[type="math/tex"] if node.tag == "script" and node.get("type") == "math/tex": try: text = node.text if text_strip(text): new_span = Element("span") wrapped_text = wrap_math(text) new_span.text = wrapped_text if parent is not None: if text_strip(node.tail): new_span.tail = node.tail parent.replace(node, new_span) except: pass # 7. script[type="math/asciimath"] if node.tag == "script" and node.get("type") == "math/asciimath": try: text = node.text if text_strip(text): new_span = Element("span") wrapped_asciimath = wrap_math(extract_asciimath(text)) new_span.text = wrapped_asciimath if parent is not None: if text_strip(node.tail): new_span.tail = node.tail parent.replace(node, new_span) except: # Delete this script tag self.remove_node(node) # 8. 
class tex if node_class == "tex": try: # Check if they have data-expr attr expr = node.get("data-expr") if text_strip(expr): # Replace with a span new_span = Element("span") wrapped_expr = wrap_math(expr) new_span.text = wrapped_expr if parent is not None: if text_strip(node.tail): new_span.tail = node.tail parent.replace(node, new_span) except: pass # 9. span.katex if node.tag == "span" and node_class == "katex": # Find any spans with class "katex-html" and remove them katex_html_spans = node.xpath('.//span[@class="katex-html"]') for katex_html_span in katex_html_spans: self.remove_node(katex_html_span) # 10. Remove any .MathJax_Preview spans if node.tag == "span" and node_class == "MathJax_Preview": self.remove_node(node) if node.tag == "span" and node_class and "x-ck12-mathEditor" in node_class: try: expr = node.get("data-tex") if text_strip(expr): expr = unquote(expr).replace("\"", "").replace(""", "") # Replace with a span new_span = Element("span") wrapped_expr = wrap_math(expr) new_span.text = wrapped_expr if parent is not None: if text_strip(node.tail): new_span.tail = node.tail parent.replace(node, new_span) except: pass # 11. 
all math tags if node.tag == "math": annotation_tags = node.xpath('.//annotation[@encoding="application/x-tex"]') if len(annotation_tags) > 0: annotation_tag = annotation_tags[0] text = annotation_tag.text if text_strip(text): new_span = Element("span") wrapped_text = wrap_math(text) new_span.text = wrapped_text if parent is not None: if text_strip(node.tail): new_span.tail = node.tail parent.replace(node, new_span) style_value = parent.get("style") if style_value: normalized_style_value = ( style_value.lower() .strip() .replace(" ", "") .replace(";", "") ) if "display:none" in normalized_style_value: parent.style = "" elif text_strip(node.get("alttext")): # Get the alttext attribute alttext = node.get("alttext") if text_strip(alttext): new_span = Element("span") wrapped_alttext = wrap_math(alttext) new_span.text = wrapped_alttext if parent is not None: if text_strip(node.tail): new_span.tail = node.tail parent.replace(node, new_span) else: try: # Try translating to LaTeX tmp_node = deepcopy(node) tmp_node.tail = None mathml = tostring(tmp_node, encoding=str) # If this includes xmlns:mml, then we need to replace all # instances of mml: with nothing if "xmlns:mml" in mathml: mathml = mathml.replace("mml:", "") # replace xmlns:mml="..." 
with nothing mathml = re.sub(r'xmlns:mml=".*?"', "", mathml) # if 'xmlns=' in mathml: # mathml = re.sub(r"xmlns='.*?'", '', mathml) latex = mml_to_latex(mathml) # Make a new span tag new_span = Element("span") # Set the html of the new span tag to the text wrapped_latex = wrap_math(latex) new_span.text = wrapped_latex # Then, we need to replace the math tag with the new span tag if parent is not None: if text_strip(node.tail): new_span.tail = node.tail parent.replace(node, new_span) except: self.remove_node(node) if node.tag == "mathjax": try: # Get the inner text of the mathjax tag text = node.text if text_strip(text): text = html.unescape(text) # Use regex to find text wrapped in hashes matches = re.findall(r"#(.+?)#", text) # For each match, replace the match with the LaTeX for match in matches: try: latex = extract_asciimath(match) # Replace the match with the LaTeX text = text.replace(f"#{match}#", latex) except: pass # Create a new span tag new_span = Element("span") # Set the html of the new span tag to the text new_span.text = text # Then, we need to replace the mathjax tag with the new span tag if parent is not None: if text_strip(node.tail): new_span.tail = node.tail parent.replace(node, new_span) except: pass def convert_tags(self, element, base_url=""): USELESS_ATTR_LIST = USELESS_ATTR if not self.need_comment: USELESS_ATTR_LIST = USELESS_ATTR_LIST + ["comment"] for node in iter_node(element): if self.process_math: # 增加数学标签转换 self.math_latex_processing(node) if "data-src" in node.attrib and "src" not in node.attrib: node.attrib["src"] = node.attrib["data-src"] if "src" in node.attrib and node.attrib["src"] and base_url: src_url = node.attrib["src"] absolute_url = urljoin(base_url, src_url) node.attrib["src"] = absolute_url if node.tag.lower() == "div" and not node.getchildren(): node.tag = "p" class_name = node.get("class") if class_name: if class_name.lower() in USELESS_ATTR_LIST: self.remove_node(node) return element def delete_by_link_density( self, 
subtree, tagname, backtracking=False, favor_precision=False ): need_del_par = [] skip_par = [] drop_list = False for descendant in subtree.iter(tagname): pparent = descendant.getparent() if pparent in need_del_par or pparent in skip_par: continue siblings = descendant.xpath(f"following-sibling::{tagname}") if 'list' in descendant.get("class", "") and len(descendant.xpath('./a')) >= 5: need_del_par.append(descendant) need_del_par.extend(siblings) continue nn = [descendant] nn.extend(siblings) txt_max_num = 0 if len(siblings) + 1 >= 4: pass else: txt_max_dict = { "read": 0, "more": 0, "...": 0, "阅读": 0, "更多": 0, "详细": 0, "detail": 0, "article": 0, "blog": 0, "news": 0, } if tagname == "div" or tagname == "article" or tagname == "section": for j in nn: txt = "".join(j.xpath(".//text()")).strip() for x in [ "read", "more", "...", "阅读", "更多", "详细", "detail", "article", "blog", "news", ]: if txt.lower().endswith(x): txt_max_dict[x] += 1 txt_num = max(txt_max_dict.values()) if txt_max_num < txt_num: txt_max_num = txt_num if txt_max_num >= 3: break if txt_max_num >= 3: pass else: continue skip_par.append(pparent) a_num = 0 for j in siblings: if j.xpath(".//a"): if tagname == "p": if density_of_a_text(j, pre=0.8): a_num += 1 elif tagname in ["div", "section", "article"]: if density_of_a_text(j, pre=0.2): a_num += 1 else: if self.need_comment: # 增加判断是否包含评论 再决定是否删除 break_flg = False for c_xpath in Forum_XPATH[:-1]: if j.xpath(c_xpath.replace(".//*", "self::*")): break_flg = True break if break_flg: continue if tagname == "li": if text_len("".join(j.xpath(".//text()[not(ancestor::a)]"))) > 50: continue a_num += 1 if a_num < len(siblings): if a_num >= 15 and ( tagname == "div" or tagname == "article" or tagname == "section" ): pass else: continue similarity_with_siblings_nums = similarity_with_siblings( descendant, siblings ) if tagname == "article" or tagname == "item": # or tagname == "section" similarity_with_siblings_nums = similarity_with_siblings_nums * 1.5 # 列表有个很特殊的地方 
另一种情况就是 descendant和siblings 都包含title/h1 | h2 标签 if tagname == "div" or tagname == "article" or tagname == "section": title_max_num = 0 for ll in [".//head[@rend='h2']", ".//head[@rend='h1']", "./article"]: title_num = 0 for jj in nn: if jj.xpath(ll): title_num += 1 if title_max_num < title_num: title_max_num = title_num if title_max_num >= 4: similarity_with_siblings_nums = similarity_with_siblings_nums * 1.5 if txt_max_num >= 3: pass elif similarity_with_siblings_nums < 0.84: if len(siblings) >= 15 and ( tagname == "div" or tagname == "article" or tagname == "section" ): pass else: continue # 父div中包含多同级div 且div class post-时,删除其余节点,保留第一篇文章 class_attr = descendant.get("class") if descendant.get("class") else "" if ( re.findall("post-", class_attr, re.I) or re.findall("-post", class_attr, re.I) or re.findall("blog|aricle", class_attr, re.I) ): drop_list = True sk_flg = True for dl in siblings: if ( text_len("".join(descendant.xpath(".//text()"))) * 2 < text_len("".join(dl.xpath(".//text()"))) and sk_flg ): self.remove_node(descendant) sk_flg = False else: self.remove_node(dl) else: need_del_par.append(descendant) need_del_par.extend(siblings) for node in need_del_par: drop_list = True try: self.remove_node(node) except Exception as e: pass myelems, deletions = defaultdict(list), [] if tagname == "div": for elem in subtree.iter(tagname): if density_of_a_text(elem, pre=0.8) and img_div_check(elem): deletions.append(elem) for elem in subtree.iter(tagname): elemtext = trim(elem.text_content()) result, templist = link_density_test(elem, elemtext, favor_precision) if result is True and img_div_check(elem): # 保留table中的链接 if tagname in ['ul', 'li', 'div', 'p'] and ancestor_node_check(elem, ['td']): continue deletions.append(elem) elif backtracking is True and len(templist) > 0: # if? 
myelems[elemtext].append(elem) if backtracking is True: if favor_precision is False: threshold = 100 else: threshold = 200 for text, elem in myelems.items(): if 0 < len(text) < threshold and len(elem) >= 3: deletions.extend(elem) for elem in uniquify_list(deletions): try: if self.need_comment: # 增加判断是否包含评论 再决定是否删除 break_flg = False for c_xpath in Forum_XPATH[:-1]: if elem.xpath(c_xpath): break_flg = True break if break_flg: continue self.remove_node(elem) except AttributeError: pass return subtree, drop_list def prune_unwanted_sections(self, tree): tmp_OVERALL_DISCARD_XPATH = OVERALL_DISCARD_XPATH if self.need_comment: tmp_OVERALL_DISCARD_XPATH = tmp_OVERALL_DISCARD_XPATH[:-1] tree = self.prune_unwanted_nodes( tree, tmp_OVERALL_DISCARD_XPATH, with_backup=True ) for xp_list in [ PAYWALL_DISCARD_XPATH, TEASER_DISCARD_XPATH, DISCARD_IMAGE_ELEMENTS, ]: tree = self.prune_unwanted_nodes(tree, xp_list) # remove elements by link density tree, drop_list_1 = self.delete_by_link_density( tree, "div", backtracking=True, favor_precision=False ) tree, drop_list_1_1 = self.delete_by_link_density( tree, "article", backtracking=False, favor_precision=False ) tree, drop_list_1_2 = self.delete_by_link_density( tree, "section", backtracking=False, favor_precision=False ) tree, drop_list_2_1 = self.delete_by_link_density( tree, "ul", backtracking=False, favor_precision=False ) tree, drop_list_2_2 = self.delete_by_link_density( tree, "li", backtracking=False, favor_precision=False ) tree, drop_list_3_1 = self.delete_by_link_density( tree, "dl", backtracking=False, favor_precision=False ) tree, drop_list_3_3 = self.delete_by_link_density( tree, "dt", backtracking=False, favor_precision=False ) tree, drop_list_3_2 = self.delete_by_link_density( tree, "dd", backtracking=False, favor_precision=False ) tree, drop_list_3 = self.delete_by_link_density( tree, "p", backtracking=False, favor_precision=False ) return ( tree, drop_list_1 or drop_list_2_1 or drop_list_2_2 or drop_list_3 or 
drop_list_1_1 or drop_list_1_2 or drop_list_3_1 or drop_list_3_2 or drop_list_3_3, )