from html import escape

from bs4 import BeautifulSoup
from readability import Document

# Page chrome removed from the main container before the article is located.
_CHROME_SELECTORS = (
    "header",
    "footer",
    "nav",
    "aside",
    ".breadcrumbs",
    ".ast-breadcrumbs",
    ".yoast-breadcrumbs",
    ".site-header",
    ".site-footer",
    ".widget",
    ".sidebar",
    ".post-navigation",
    ".navigation",
    ".comments-area",
    ".comment-respond",
    ".entry-footer",
    ".entry-meta",
    ".jp-relatedposts",
    ".related-posts",
    ".sharedaddy",
    ".share-buttons",
    ".wp-block-search",
    ".search-form",
    'a[href*="facebook.com/sharer"]',
    'a[href*="twitter.com/share"]',
)

# Candidate body containers, tried in this order; the first one present wins.
_CONTENT_CONTAINER_SELECTORS = (
    ".entry-content",
    ".nv-content-wrap",
    ".post-content",
    ".single-content",
    ".content",
)

# Extras removed from inside the chosen body container.
_CONTENT_NOISE_SELECTORS = (
    ".sharedaddy",
    ".share",
    ".sns",
    ".advertisement",
    ".adsbygoogle",
    ".post-navigation",
    ".entry-footer",
    ".toc_container",
    ".table-of-contents",
    ".jp-relatedposts",
    ".related-posts",
)


def _remove_matches(root, selectors) -> None:
    """Decompose every element under *root* matched by any of *selectors*."""
    for selector in selectors:
        for element in root.select(selector):
            element.decompose()


def _first_match(root, selectors):
    """Return the match for the earliest selector that hits, or None."""
    for selector in selectors:
        found = root.select_one(selector)
        if found is not None:
            return found
    return None


def _heading_plus_body(heading, body) -> str:
    """Serialize *body*, prefixed by *heading* when a heading was found."""
    pieces = [str(body)] if heading is None else [str(heading), str(body)]
    return "".join(pieces)


def extract_main_html(html: str) -> str:
    """Return the title and main content of a page as an HTML string.

    The page is parsed with BeautifulSoup and searched in three steps; the
    first step that finds something produces the result:

    1. Inside ``main#content`` (or the whole document when that element is
       absent), page chrome is removed, the first ``<article>`` (or the
       container itself) is taken, and the first known body container in it
       is returned after its noise elements are removed.
    2. Otherwise the first ``.entry-content`` anywhere in the document is
       returned as-is.
    3. Otherwise readability's ``Document`` supplies the title and summary.

    In steps 1 and 2 the serialized ``<h1>`` found in the article is
    prepended when there is one.

    Args:
        html: Markup of the full page.

    Returns:
        The heading (when available) followed by the content markup.
    """
    soup = BeautifulSoup(html, "lxml")
    main = soup.select_one("main#content") or soup

    # NOTE(review): "header" is removed before the <h1> lookup below, so a
    # title wrapped in a <header> element inside the container is dropped
    # together with it - confirm that is intended.
    _remove_matches(main, _CHROME_SELECTORS)

    article = main.select_one("article") or main
    h1 = article.select_one("h1.entry-title, h1")

    content = _first_match(article, _CONTENT_CONTAINER_SELECTORS)
    if content is not None:
        _remove_matches(content, _CONTENT_NOISE_SELECTORS)
        return _heading_plus_body(h1, content)

    # Searches the whole document, not just the cleaned container.
    global_content = soup.select_one(".entry-content")
    if global_content is not None:
        return _heading_plus_body(h1, global_content)

    # Nothing recognisable in the DOM: let readability choose the content.
    # The title is wrapped in <h1> so this path also yields a heading followed
    # by a body; it is escaped on the assumption that short_title() returns
    # plain text rather than markup - TODO confirm against readability-lxml.
    doc = Document(html)
    return f"<h1>{escape(doc.short_title())}</h1>\n{doc.summary()}"