Spaces:
Sleeping
Sleeping
| from bs4 import BeautifulSoup | |
| from readability import Document | |
# Page chrome removed from the search root before the article is located.
_CHROME_SELECTORS = (
    "header", "footer", "nav", "aside",
    ".breadcrumbs", ".ast-breadcrumbs", ".yoast-breadcrumbs",
    ".site-header", ".site-footer",
    ".widget", ".sidebar", ".post-navigation", ".navigation",
    ".comments-area", ".comment-respond", ".entry-footer", ".entry-meta",
    ".jp-relatedposts", ".related-posts", ".sharedaddy", ".share-buttons",
    ".wp-block-search", ".search-form",
    'a[href*="facebook.com/sharer"]', 'a[href*="twitter.com/share"]',
)

# Candidate containers for the article body, tried in this order.
_CONTENT_CONTAINER_SELECTORS = (
    ".entry-content",
    ".nv-content-wrap",
    ".post-content",
    ".single-content",
    ".content",
)

# Noise removed from inside the chosen content container.
_CONTENT_NOISE_SELECTORS = (
    ".sharedaddy", ".share", ".sns", ".advertisement", ".adsbygoogle",
    ".post-navigation", ".entry-footer", ".toc_container", ".table-of-contents",
    ".jp-relatedposts", ".related-posts",
)


def _remove_matches(root, selectors):
    """Decompose every element under *root* that matches any of *selectors*."""
    for selector in selectors:
        for element in root.select(selector):
            element.decompose()


def extract_main_html(html: str) -> str:
    """Return the title and main content of an HTML page as an HTML string.

    The page is parsed with BeautifulSoup (``lxml`` parser). Known chrome
    elements are removed from ``main#content`` (or from the whole document
    when that element is absent), then the first ``<article>`` (or the search
    root itself) is searched for an ``<h1>`` and a content container.

    Resolution order for the body:

    1. The first matching container inside the article, with in-content
       noise (share buttons, ads, tables of contents, ...) removed.
    2. The first ``.entry-content`` anywhere in the document, unmodified.
    3. ``readability``'s ``Document`` summary, prefixed with its short title.

    Args:
        html: Raw HTML markup of the page.

    Returns:
        The serialized ``<h1>`` (when one is found) followed by the serialized
        content element, or the readability-based fallback markup.
    """
    # Function-scope import: the ``html`` parameter shadows the stdlib module
    # name inside this function, so only ``escape`` is brought in.
    from html import escape

    soup = BeautifulSoup(html, "lxml")
    main = soup.select_one("main#content") or soup

    # The chrome list contains "header", so a title wrapped in a <header>
    # element is destroyed by the stripping pass below. Serialize it now so
    # it can still be emitted if no <h1> survives.
    entry_title = main.select_one("h1.entry-title")
    preserved_title_html = str(entry_title) if entry_title is not None else ""

    _remove_matches(main, _CHROME_SELECTORS)

    article = main.select_one("article") or main
    h1 = article.select_one("h1.entry-title, h1")
    title_html = str(h1) if h1 is not None else preserved_title_html

    content = None
    for selector in _CONTENT_CONTAINER_SELECTORS:
        content = article.select_one(selector)
        if content is not None:
            break

    if content is not None:
        _remove_matches(content, _CONTENT_NOISE_SELECTORS)
        return f"{title_html}{content}"

    # No container inside the article: fall back to any .entry-content in the
    # document. In-content noise is intentionally not stripped on this path.
    global_content = soup.select_one(".entry-content")
    if global_content is not None:
        return f"{title_html}{global_content}"

    # Last resort: let readability pick the main content. The title is plain
    # text, so escape it before embedding it in markup.
    doc = Document(html)
    return f"<h1>{escape(doc.short_title(), quote=False)}</h1>{doc.summary()}"