# tktm8's picture
# Upload 59 files
# fb05e78 verified
from bs4 import BeautifulSoup
from readability import Document
# Page-level chrome removed from the working root before the article is located.
_PAGE_NOISE_SELECTORS = (
    "header", "footer", "nav", "aside",
    ".breadcrumbs", ".ast-breadcrumbs", ".yoast-breadcrumbs",
    ".site-header", ".site-footer",
    ".widget", ".sidebar", ".post-navigation", ".navigation",
    ".comments-area", ".comment-respond", ".entry-footer", ".entry-meta",
    ".jp-relatedposts", ".related-posts", ".sharedaddy", ".share-buttons",
    ".wp-block-search", ".search-form",
    'a[href*="facebook.com/sharer"]', 'a[href*="twitter.com/share"]',
)

# Containers tried, in order, as the article body.
_BODY_SELECTORS = (
    ".entry-content",
    ".nv-content-wrap",
    ".post-content",
    ".single-content",
    ".content",
)

# Extra clutter removed from inside the chosen body container.
_BODY_NOISE_SELECTORS = (
    ".sharedaddy", ".share", ".sns", ".advertisement", ".adsbygoogle",
    ".post-navigation", ".entry-footer", ".toc_container", ".table-of-contents",
    ".jp-relatedposts", ".related-posts",
)


def _remove_matching(root, selectors):
    """Destroy every element under *root* that matches any of *selectors*."""
    for selector in selectors:
        for element in root.select(selector):
            element.decompose()


def extract_main_html(html: str) -> str:
    """Return the title heading and main body of a page as an HTML string.

    The page is parsed with BeautifulSoup; ``main#content`` (or the whole
    document when it is absent) is stripped of navigation, sidebars, share
    widgets and similar chrome.  The first ``article`` (or the stripped root)
    is then searched for an ``h1`` and for one of the known body containers.

    Resolution order for the returned markup:

    1. ``h1`` + the first matching body container inside the article, after
       removing embedded share/ads/TOC/related-post blocks from it.
    2. ``h1`` + the first ``.entry-content`` anywhere in the document.
    3. readability's ``short_title()`` wrapped in ``<h1>`` followed by its
       ``summary()``.

    Args:
        html: Raw HTML source of the page.

    Returns:
        Concatenated HTML: the heading (when one was found) followed by the
        body markup.
    """
    # Imported locally: the ``html`` parameter shadows the stdlib module name
    # inside this function, so ``html.escape`` would not resolve here.
    from html import escape as escape_text

    soup = BeautifulSoup(html, "lxml")
    main = soup.select_one("main#content") or soup

    # The cleanup below removes every <header> under ``main``.  A title that
    # sits inside such a wrapper (e.g. <header class="entry-header">) would be
    # destroyed with it, so its markup is captured beforehand and used only
    # when no heading survives the cleanup.
    early_scope = main.select_one("article") or main
    early_title = early_scope.select_one("h1.entry-title")
    saved_title_html = str(early_title) if early_title is not None else ""

    _remove_matching(main, _PAGE_NOISE_SELECTORS)

    article = main.select_one("article") or main
    h1 = article.select_one("h1.entry-title, h1")
    title_html = str(h1) if h1 is not None else saved_title_html

    content = None
    for selector in _BODY_SELECTORS:
        content = article.select_one(selector)
        if content is not None:
            break

    if content is not None:
        _remove_matching(content, _BODY_NOISE_SELECTORS)
        return title_html + str(content)

    # The body container may live outside the article / main#content subtree.
    global_content = soup.select_one(".entry-content")
    if global_content is not None:
        return title_html + str(global_content)

    # Last resort: let readability pick the title and body.  The title is
    # escaped before being placed in markup so characters such as "&" or "<"
    # cannot break or inject HTML.
    doc = Document(html)
    safe_title = escape_text(doc.short_title() or "", quote=False)
    return f"<h1>{safe_title}</h1>{doc.summary()}"