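# NOTE(review): the three tokens below look like repository-page residue
# (author / commit message / short hash) rather than Python code; they are
# kept as comments so the module stays importable.
# ZhouChuYue
# init
# a579dd2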
# -*- coding:utf-8 -*-
# Identifier string constant. Its consumers are outside this file, so its
# exact role (attribute name, marker value, ...) cannot be confirmed here.
Unique_ID = "ultradata_math_parser_id_internal"
# XPath expressions (relative to a context node, note the leading ".//") that
# select div/p elements whose id or class contains a paywall-related keyword:
# paywall, premium, paid-content, paidcontent, obfuscated, blurred,
# restricted, overlay.
# The name suggests matching nodes are discarded; the code that applies the
# expressions is not part of this file.
PAYWALL_DISCARD_XPATH = [
""".//*[(self::div or self::p)][
contains(@id, "paywall") or contains(@id, "premium") or
contains(@class, "paid-content") or contains(@class, "paidcontent") or
contains(@class, "obfuscated") or contains(@class, "blurred") or
contains(@class, "restricted") or contains(@class, "overlay")
]""",
]
# XPath expressions matching page furniture rather than main content. Every
# entry is relative to a context node (leading ".//"). The code that applies
# them is outside this file.
#
# Several predicates use translate(@attr, "X", "x"): that call lowercases only
# the single listed letter, so e.g. "Footer" matches "footer" but "FOOTER"
# does not.
OVERALL_DISCARD_XPATH = [
# Entry 1 (div/item/ul/p/section/span only): navigation, footers, related
# posts, sharing and social widgets, "jp-" ids (jp-post-flair,
# jp-relatedposts), embeds, newsletters, cookie/consent notices, tags,
# sidebars, banners, menus, breadcrumbs, author/byline info, buttons, ratings,
# widgets, ad containers and a few site-specific classes/ids.
# NOTE(review): the paid-content / paidcontent / paywall / obfuscated /
# blurred predicates near the end overlap with PAYWALL_DISCARD_XPATH.
# The last predicate matches any element carrying a
# data-lp-replacement-content attribute, whatever its value.
""".//*[(self::div or self::item or self::ul
or self::p or self::section or self::span)][
contains(translate(@id, "F","f"), "footer") or contains(translate(@class, "F","f"), "footer")
or contains(@id, "related") or contains(translate(@class, "R", "r"), "related") or
contains(@id, "viral") or contains(@class, "viral") or
starts-with(@id, "shar") or starts-with(@class, "shar") or
contains(@class, "share-") or
contains(translate(@id, "S", "s"), "share") or
contains(@id, "social") or contains(@class, "social") or contains(@class, "sociable") or
contains(@id, "syndication") or contains(@class, "syndication") or
starts-with(@id, "jp-") or starts-with(@id, "dpsp-content") or
contains(@class, "embedded") or contains(@class, "embed")
or contains(@id, "newsletter") or contains(@class, "newsletter")
or contains(@class, "subnav") or
contains(@id, "cookie") or contains(@class, "cookie") or contains(@id, "tags")
or contains(@class, "tags") or contains(@id, "sidebar") or
contains(@class, "sidebar") or contains(@id, "banner") or contains(@class, "banner")
or contains(@class, "meta") or
contains(@id, "menu") or contains(@class, "menu") or
contains(translate(@id, "N", "n"), "nav") or contains(translate(@role, "N", "n"), "nav")
or starts-with(@class, "nav") or contains(translate(@class, "N", "n"), "navigation") or
contains(@class, "navbar") or contains(@class, "navbox") or starts-with(@class, "post-nav")
or contains(@id, "breadcrumb") or contains(@class, "breadcrumb") or
contains(@id, "bread-crumb") or contains(@class, "bread-crumb") or
contains(@id, "author") or contains(@class, "author") or
contains(@id, "button") or contains(@class, "button")
or contains(translate(@class, "B", "b"), "byline")
or contains(@class, "rating") or starts-with(@class, "widget") or
contains(@class, "attachment") or contains(@class, "timestamp") or
contains(@class, "user-info") or contains(@class, "user-profile") or
contains(@class, "-ad-") or contains(@class, "-icon")
or contains(@class, "article-infos") or
contains(translate(@class, "I", "i"), "infoline")
or contains(@data-component, "MostPopularStories")
or contains(@class, "outbrain") or contains(@class, "taboola")
or contains(@class, "criteo") or contains(@class, "options")
or contains(@class, "consent") or contains(@class, "modal-content")
or contains(@class, "paid-content") or contains(@class, "paidcontent")
or contains(@id, "premium-") or contains(@id, "paywall")
or contains(@class, "obfuscated") or contains(@class, "blurred")
or contains(@class, " ad ")
or contains(@class, "next-post")
or contains(@class, "yin") or contains(@class, "zlylin") or
contains(@class, "xg1") or contains(@id, "bmdh")
or @data-lp-replacement-content]""",
# Entry 2 (any element): parts hidden via class, id, inline style, the
# `hidden` attribute or aria-hidden="true".
# NOTE(review): only the unspaced "display:none" spelling is tested here,
# whereas CONTENT_EXTRACTOR_NOISE_XPATHS also tests "display: none".
""".//*[starts-with(@class, "hide-") or contains(@class, "hide-print") or contains(@id, "hidden")
or contains(@style, "hidden") or contains(@hidden, "hidden") or contains(@class, "noprint")
or contains(@style, "display:none") or contains(@class, " hidden") or @aria-hidden="true"
or contains(@class, "notloaded")]""",
# Entry 3 (any element): comment-section debris such as comment titles, reply
# widgets, message boxes and akismet markers.
# A commented-out variant of this rule used the narrower predicates
#   contains(@class, "message-container") / contains(@id, "message_container");
# the active expression uses the broader contains(@class, "message") together
# with contains(@id, "message_container").
# NOTE(review): @class="comments-title" is already covered by the
# contains(@class, "comments-title") test that follows it.
# The closing quotes are preceded by a space that is part of the string.
""".//*[@class="comments-title" or contains(@class, "comments-title") or
contains(@class, "nocomments") or starts-with(@id, "reply-") or starts-with(@class, "reply-") or
contains(@class, "-reply-") or contains(@class, "message") or contains(@id, "message_container")
or contains(@id, "akismet") or contains(@class, "akismet")] """,
]
# XPath expression selecting div/item/ul/p/section/span elements whose id or
# class contains "teaser". translate(..., "T", "t") lowercases only the letter
# "T", so "Teaser" matches but "TEASER" does not.
# The code that applies this list is outside this file.
TEASER_DISCARD_XPATH = [
""".//*[(self::div or self::item or self::ul
or self::p or self::section or self::span)][
contains(translate(@id, "T", "t"), "teaser") or contains(translate(@class, "T", "t"), "teaser")
]""",
]
# Broader discard rules: every header element, plus div/item/ul/p/section/span
# elements whose id or class contains "bottom" or "link", or whose inline
# style mentions "border".
# Presumably applied only when precision is favoured over recall (going by the
# name) — confirm against the caller.
PRECISION_DISCARD_XPATH = [
".//header",
""".//*[(self::div or self::item or self::ul
or self::p or self::section or self::span)][
contains(@id, "bottom") or contains(@class, "bottom") or
contains(@id, "link") or contains(@class, "link")
or contains(@style, "border")
]""",
]
# XPath expression selecting div/item/ul/p/section/span elements whose id or
# class contains "caption".
# Presumably used to drop caption containers when images are not kept (going
# by the name) — confirm against the caller.
# The string literal ends with a newline before its closing quotes.
DISCARD_IMAGE_ELEMENTS = [
""".//*[(self::div or self::item or self::ul
or self::p or self::section or self::span)][
contains(@id, "caption") or contains(@class, "caption")
]
"""
]
# XPath expression selecting div/ul/section elements that look like comment
# sections: id, class or name starting with "comment" (translate(..., "C", "c")
# lowercases only the letter "C"), classes containing "article-comments" or
# "post-comments", and ids starting with "comol", "disqus_thread" or
# "dsq-comments".
# The code that applies this list is outside this file.
REMOVE_COMMENTS_XPATH = [
""".//*[(self::div or self::ul or self::section)][
starts-with(translate(@id, "C","c"), 'comment') or
starts-with(translate(@class, "C","c"), 'comment') or starts-with(translate(@name, "C","c"), 'comment') or
contains(@class, 'article-comments') or contains(@class, 'post-comments')
or starts-with(@id, 'comol') or starts-with(@id, 'disqus_thread')
or starts-with(@id, 'dsq-comments')
]"""
]
# Absolute XPath expressions ("//div...") for noise containers: divs whose
# class, name or id starts with "advert", and divs hidden through an inline
# "display: none" / "display:none" style.
CONTENT_EXTRACTOR_NOISE_XPATHS = [
    # Disabled rule, kept for reference:
    # '//div[contains(@class, "comment") or contains(@name, "comment") or contains(@id, "comment")]',
    (
        '//div[starts-with(@class, "advert") or starts-with(@name, "advert") '
        'or starts-with(@id, "advert")]'
    ),
    # Both spellings of the inline style are listed separately.
    '//div[contains(@style, "display: none")]',
    '//div[contains(@style, "display:none")]',
]
# Tag names handled by the "manually cleaned" step (the step itself lives
# outside this file). Per the original note, image, audio and video tags are
# kept, which is why img/audio/video do not appear in this list.
MANUALLY_CLEANED = [
    "aside", "embed", "footer", "head", "iframe", "menu", "object", "script",
    "applet", "canvas", "map", "svg",
    "area", "blink", "button", "datalist", "dialog", "frame", "frameset",
    "fieldset", "hr", "link", "input", "ins", "label", "legend", "marquee",
    "menuitem", "nav", "noscript", "optgroup", "option", "output", "param",
    "progress", "rp", "rt", "rtc", "select", "style", "track", "textarea",
    "time", "use",
]
# Tag names handled by the "manually stripped" step (implemented outside this
# file).
# NOTE(review): "ins" is also listed in MANUALLY_CLEANED — confirm which step
# is meant to win.
MANUALLY_STRIPPED = [
    "abbr", "acronym", "address",
    "bdi", "bdo", "big",
    "cite", "data", "dfn",
    "font", "hgroup", "ins",
    "mark", "meta", "ruby", "small",
    "tbody", "template", "tfoot", "thead",
]
# Set of tag names; presumably the elements that get removed when they turn
# out to be empty (going by the name) — confirm against the caller.
CUT_EMPTY_ELEMS = {
    # block-level containers
    "article", "blockquote", "div", "main", "section", "p", "pre",
    # headings
    "h1", "h2", "h3", "h4", "h5", "h6",
    # list items and definition lists
    "li", "dd", "dt",
    # inline elements
    "b", "em", "i", "q", "span", "strong",
}
# Keywords marking boilerplate; presumably matched against attribute values
# such as class or id (going by the name) — confirm against the caller.
USELESS_ATTR = [
    "share", "contribution",
    "copyright", "copy-right", "disclaimer",
    "recommend", "related",
    "footer", "social", "submeta",
    "report-infor",
]
# Candidate XPath expressions for locating the main content container,
# ordered from specific post/article/entry class and id conventions down to
# generic "content" and "main" names.
# Presumably tried in list order by the caller — confirm against the consumer.
BODY_XPATH = [
# Entry 1: well-known post/article/entry body classes and ids.
# NOTE(review): this expression ends in "][1]" without the wrapping
# parentheses used by the later entries. In XPath, .//*[p][1] selects the
# first matching child of each parent, whereas (.//*[p])[1] selects the first
# match in document order, so this entry can yield more than one node.
""".//*[(self::article or self::div or self::main or self::section)][
@class="post" or @class="entry" or
contains(@class, "post-text") or contains(@class, "post_text") or
contains(@class, "post-body") or contains(@class, "post-entry") or contains(@class, "postentry") or
contains(@class, "post-content") or contains(@class, "post_content") or
contains(@class, "postcontent") or contains(@class, "postContent") or
contains(@class, "article-text") or contains(@class, "articletext") or contains(@class, "articleText")
or contains(@id, "entry-content") or
contains(@class, "entry-content") or contains(@id, "article-content") or
contains(@class, "article-content") or contains(@id, "article__content") or
contains(@class, "article__content") or contains(@id, "article-body") or
contains(@class, "article-body") or contains(@id, "article__body") or
contains(@class, "article__body") or @itemprop="articleBody" or
contains(translate(@id, "B", "b"), "articlebody") or contains(translate(@class, "B", "b"), "articlebody")
or @id="articleContent" or contains(@class, "ArticleContent") or
contains(@class, "page-content") or contains(@class, "text-content") or
contains(@id, "body-text") or contains(@class, "body-text") or contains(@class, "body-content") or contains(translate(@class, "B", "b"), "textbody") or
contains(@class, "article__container") or contains(@id, "art-content") or contains(@class, "art-content")][1]""",
# Entry 2: the first article element in document order.
"(.//article)[1]",
# Entry 3: story/blog/theme style containers and role="article".
# translate(@class, "FULTEX", "fultex") lowercases the letters F, U, L, T, E
# and X, so any capitalisation of "fulltext" matches.
"""(.//*[(self::article or self::div or self::main or self::section)][
contains(@class, 'post-bodycopy') or
contains(@class, 'storycontent') or contains(@class, 'story-content') or
@class='postarea' or @class='art-postcontent' or
contains(@class, 'theme-content') or contains(@class, 'blog-content') or
contains(@class, 'section-content') or contains(@class, 'single-content') or
contains(@class, 'single-post') or
contains(@class, 'main-column') or contains(@class, 'wpb_text_column') or
starts-with(@id, 'primary') or starts-with(@class, 'article ') or @class="text" or
@id="article" or @class="cell" or @id="story" or @class="story" or
contains(@class, "story-body") or contains(@class, "field-body") or
contains(translate(@class, "FULTEX","fultex"), "fulltext")
or @role='article'])[1]""",
# Entry 4: generic "content" containers (content-main, content-body,
# main-content, page-content, or exactly id/class "content").
"""(.//*[(self::article or self::div or self::main or self::section)][
contains(@id, "content-main") or contains(@class, "content-main") or contains(@class, "content_main") or
contains(@id, "content-body") or contains(@class, "content-body") or contains(@id, "contentBody")
or contains(@class, "content__body") or contains(translate(@id, "CM","cm"), "main-content") or contains(translate(@class, "CM","cm"), "main-content")
or contains(translate(@class, "CP","cp"), "page-content") or
@id="content" or @class="content"])[1]""",
# Entry 5: the first article/div/section whose class, id or role starts with
# "main", unioned with the first main element.
'(.//*[(self::article or self::div or self::section)][starts-with(@class, "main") or starts-with(@id, "main") or starts-with(@role, "main")])[1]|(.//main)[1]',
]
# XPath expressions (relative to a context node) selecting forum / Q&A
# building blocks: questions, answers, comments, message containers and
# reply/post bodies. The code that applies them is outside this file.
Forum_XPATH = [
# Question containers.
""".//*[(self::article or self::div or self::main or self::section or self::li or self::tr)][
contains(@id, 'question') or contains(@class, 'question')]""",
# Answer containers.
""".//*[(self::article or self::div or self::main or self::section or self::li or self::tr)][
contains(@id, 'answer') or contains(@class, 'answer')]""",
# Comment containers ("comment" or "Comment" in the class).
""".//*[(self::article or self::div or self::main or self::section or self::li or self::tr)][
contains(@id, 'comment') or contains(@class, 'comment') or contains(@class, 'Comment')]""",
# Message containers.
""".//*[(self::article or self::div or self::main or self::section or self::li or self::tr)][contains(@class, "message-container") or contains(@id, "message_container") or contains(@class, "Messages_container")]""",
# Comment, reply and post bodies; p and span are accepted here as well.
# The original expression tested 'comment-body' and "post-reply" twice each;
# the exact duplicates are dropped, which leaves the matched node set
# unchanged because "or" is idempotent.
""".//*[(self::article or self::div or self::main or self::section or self::p or self::span or self::li or self::tr)][
contains(@id, 'comment-content') or contains(@class, 'comment-content') or
contains(@class, 'comment-body') or
contains(@class, "post-reply") or contains(@class, "reply_content") or
contains(@class, "reply-content") or contains(@class, "reply_post") or
contains(@id, "reply") or
contains(@class, "post-text") or contains(@class, "post_text") or
contains(@class, "post-body") or contains(@class, "postbody") or contains(@class, "post-entry") or contains(@class, "postentry") or contains(@component, 'post') or
contains(@class, "post-content") or contains(@class, "post_content") or contains(@class, "p_content") or contains(@class, "Post_content") or contains(@class, "message-post") or contains(@class, "js-post")]""",
# Ids of the form "post-" / "post_" followed by a number. The test itself only
# checks for the "post-" / "post_" substring.
""".//*[(self::article or self::div or self::main or self::section or self::p or self::span or self::li or self::tr)][contains(@id, 'post-') or contains(@id, 'post_')]""",
]
# Absolute XPath expressions that read the @content of meta elements carrying
# a title. For each title key, the `property` attribute is listed before the
# `name` attribute; keys are matched with starts-with().
METAS = [
    '//meta[starts-with(@{attr}, "{key}")]/@content'.format(attr=attr, key=key)
    for key in ("og:title", "title", "page:title")
    for attr in ("property", "name")
]
# Mapping that is currently empty. Going by the name it is meant to map URL
# patterns to an HTML type — confirm the key/value format with the consumer
# before adding entries.
URL_PATTERNS_TO_HTML_TYPE = {}
# Built-in per-site adaptation rules. Per the original note, entries are
# selected by matching a URL pattern (the dict key) and are processed with
# CustomParser, which is defined outside this file.
BUILTIN_SITE_RULES = {
    # Rules for the answers.com family of sites.
    "answers.com": {
        "clean": ["//script", "//style"],
        "title": {
            "mode": "xpath",
            "value": (
                "//h1[@property='name']//text()"
                " | //h1[contains(@class, 'headline1')]//text()"
            ),
        },
        "content": {
            "mode": "xpath",
            # Extract the answer content only.
            "value": (
                "//div[@property='content']"
                " | //div[contains(@class, 'markdownStyles')]"
            ),
        },
    },
}
# Per-feature weights. As configured here only "content_length" carries a
# non-zero weight; every other feature is set to 0.0. How the weights are
# combined is defined by the consumer, outside this file.
SCORING_WEIGHTS = {
    # the only feature with a non-zero weight
    "content_length": 1.0,
    # features currently weighted 0.0
    "paragraph_quality": 0.0,
    "link_density": 0.0,
    "text_density": 0.0,
    "punctuation_density": 0.0,
    "structure_completeness": 0.0,
    "xpath_confidence": 0.0,
    "noise_elements": 0.0,
    "code_block_quality": 0.0,
    "list_structure": 0.0,
}
# Numeric thresholds keyed by name. Their exact use (for example, what
# "similar" is compared against) is defined by the consumer, outside this file.
SCORE_THRESHOLDS = {
    "min_acceptable_score": 3.0,
    "similar_threshold": 0.5,
}