from bs4 import BeautifulSoup
from readability import Document
def _decompose_all(root, selectors) -> None:
    """Remove every element under *root* matching any CSS selector in *selectors*."""
    for sel in selectors:
        for el in root.select(sel):
            el.decompose()


def _with_title(h1, body) -> str:
    """Serialize an optional title tag followed by *body* as one HTML string."""
    parts = []
    if h1:  # bs4 Tags are falsy when empty; mirror the original truthiness check
        parts.append(str(h1))
    parts.append(str(body))
    return "".join(parts)


# Site chrome stripped from the page before content extraction.
_CHROME_SELECTORS = (
    "header", "footer", "nav", "aside",
    ".breadcrumbs", ".ast-breadcrumbs", ".yoast-breadcrumbs",
    ".site-header", ".site-footer",
    ".widget", ".sidebar", ".post-navigation", ".navigation",
    ".comments-area", ".comment-respond", ".entry-footer", ".entry-meta",
    ".jp-relatedposts", ".related-posts", ".sharedaddy", ".share-buttons",
    ".wp-block-search", ".search-form",
    'a[href*="facebook.com/sharer"]', 'a[href*="twitter.com/share"]',
)

# In-content noise removed from the located content container.
_NOISE_SELECTORS = (
    ".sharedaddy", ".share", ".sns", ".advertisement", ".adsbygoogle",
    ".post-navigation", ".entry-footer", ".toc_container", ".table-of-contents",
    ".jp-relatedposts", ".related-posts",
)

# Known theme content containers, tried in priority order.
_CONTENT_SELECTORS = (
    ".entry-content", ".nv-content-wrap", ".post-content",
    ".single-content", ".content",
)


def extract_main_html(html: str) -> str:
    """Extract the main article markup (title + body) from a full HTML page.

    Strategy, in order:
      1. Strip site chrome from ``main#content`` (or the whole page),
         locate an ``<article>``/``<h1>`` and a known content container,
         clean in-content noise, and return ``<h1>…</h1><content>``.
      2. Fall back to any ``.entry-content`` found anywhere in the page.
      3. Fall back to readability's title/summary extraction.

    Args:
        html: The full HTML document as a string.

    Returns:
        An HTML fragment string (never ``None``).
    """
    soup = BeautifulSoup(html, "lxml")
    main = soup.select_one("main#content") or soup
    _decompose_all(main, _CHROME_SELECTORS)

    article = main.select_one("article") or main
    h1 = article.select_one("h1.entry-title, h1")

    # Try each container in priority order.  A single comma-joined selector
    # would return the first match in *document* order, not priority order,
    # so the containers must be probed one at a time.
    content = next(
        (c for sel in _CONTENT_SELECTORS if (c := article.select_one(sel))),
        None,
    )
    if content:
        _decompose_all(content, _NOISE_SELECTORS)
        return _with_title(h1, content)

    # Fallback: some themes place .entry-content outside <article>/<main>.
    global_content = soup.select_one(".entry-content")
    if global_content:
        return _with_title(h1, global_content)

    # Last resort: readability extraction on the raw, unmodified HTML.
    doc = Document(html)
    return f"<h1>{doc.short_title()}</h1>{doc.summary()}"