# -*- coding:utf-8 -*-
from ultradata_math_parser.utils import *
from ultradata_math_parser.config import *


class TitleParser:
    """Pick a page title out of a parsed HTML tree.

    Candidate sources, in the order ``process`` consults them: the
    ``METAS`` XPath expressions, the heading texts (h1-h3) combined with
    the ``<title>`` text, the ``<title>`` text alone, and finally the
    first heading's own text.

    NOTE(review): ``HtmlElement``, ``METAS``, ``similarity2`` and
    ``lcs_of_2`` are expected to come from the star imports above; their
    definitions are not visible in this file.
    """

    def extract_by_meta(self, element: HtmlElement):
        """Return the title found by the first ``METAS`` XPath that matches.

        Each expression in ``METAS`` is evaluated against ``element``; the
        first non-empty result is concatenated into one string and
        returned. Returns ``None`` when no expression matches.
        """
        for xpath in METAS:
            title = element.xpath(xpath)
            if title:
                return "".join(title)
        return None

    def extract_by_title(self, element: HtmlElement) -> str:
        """Return the stripped text of every ``<title>`` element, joined.

        Returns an empty string when the document has no ``<title>`` text.
        """
        return "".join(element.xpath("//title//text()")).strip()

    def extract_by_hs(self, element: HtmlElement) -> list:
        """Return every text node found under any h1, h2 or h3 element.

        The texts are returned unstripped. An empty list is returned when
        there are none.
        """
        hs = element.xpath("//h1//text()|//h2//text()|//h3//text()")
        return hs or []

    def extract_by_h(self, element: HtmlElement):
        """Return the first direct text of the first h1, else h2, else h3.

        Only text nodes that are immediate children of the heading are
        considered (``./text()``); text nested in child tags is ignored.
        A heading level whose first element has no direct text is skipped.
        Returns ``None`` when no level yields a text node.
        """
        for xpath in ["//h1", "//h2", "//h3"]:
            children = element.xpath(xpath)
            if not children:
                continue
            texts = children[0].xpath("./text()")
            if texts:
                return texts[0].strip()
        return None

    def process(self, element: HtmlElement):
        """Return the best title candidate for ``element``.

        Resolution order:

        1. A non-empty ``extract_by_meta`` result wins outright.
        2. Otherwise the heading texts are ranked by ``similarity2``
           against the ``<title>`` text, and ``lcs_of_2`` of the top
           heading and the ``<title>`` text is returned if non-empty.
           (Presumably the longest common part of the two strings --
           confirm against the helper's definition.)
        3. Otherwise the ``<title>`` text, if non-empty.
        4. Otherwise whatever ``extract_by_h`` returns (possibly ``None``).
        """
        meta_title = self.extract_by_meta(element)
        if meta_title:
            return meta_title

        page_title = self.extract_by_title(element)
        heading_texts = sorted(
            self.extract_by_hs(element),
            key=lambda text: similarity2(text, page_title),
            reverse=True,
        )
        if heading_texts:
            common = lcs_of_2(heading_texts[0], page_title)
            # An empty common part used to be returned as the title here,
            # which skipped the fallbacks below even when they had a usable
            # value. Fall through instead.
            if common:
                return common

        if page_title:
            return page_title
        return self.extract_by_h(element)