File size: 1,906 Bytes
fb05e78
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
from bs4 import BeautifulSoup
from readability import Document

# Page chrome stripped from the main container before the post body is located.
_CHROME_SELECTORS = (
    "header", "footer", "nav", "aside",
    ".breadcrumbs", ".ast-breadcrumbs", ".yoast-breadcrumbs",
    ".site-header", ".site-footer",
    ".widget", ".sidebar", ".post-navigation", ".navigation",
    ".comments-area", ".comment-respond", ".entry-footer", ".entry-meta",
    ".jp-relatedposts", ".related-posts", ".sharedaddy", ".share-buttons",
    ".wp-block-search", ".search-form",
    'a[href*="facebook.com/sharer"]', 'a[href*="twitter.com/share"]',
)

# Post-body containers, tried in this order inside the article.
_CONTENT_SELECTORS = (
    ".entry-content",
    ".nv-content-wrap",
    ".post-content",
    ".single-content",
    ".content",
)

# Share widgets, ads, tables of contents etc. stripped from inside the body.
_IN_CONTENT_SELECTORS = (
    ".sharedaddy", ".share", ".sns", ".advertisement", ".adsbygoogle",
    ".post-navigation", ".entry-footer", ".toc_container", ".table-of-contents",
    ".jp-relatedposts", ".related-posts",
)


def _remove_all(root, selectors) -> None:
    """Decompose every element under *root* that matches any of *selectors*."""
    for selector in selectors:
        for element in root.select(selector):
            element.decompose()


def extract_main_html(html: str) -> str:
    """Return the title heading and main body of an HTML page as an HTML string.

    The page is parsed with BeautifulSoup (lxml parser).  Inside
    ``main#content`` (or the whole document when that element is absent),
    navigation, sidebars, comments, share links and similar chrome are
    removed.  The first ``article`` (or the container itself) is then searched
    for a heading and for a body container, trying the selectors in
    ``_CONTENT_SELECTORS`` in order and finally a document-wide
    ``.entry-content``.  When a body is found, in-body widgets are removed and
    the heading markup (if any) followed by the body markup is returned.

    If no body container is found, the result is built from
    ``readability.Document``: its short title wrapped in ``<h1>`` followed by
    its summary.

    Args:
        html: Raw HTML of the page.

    Returns:
        HTML markup consisting of an optional ``<h1>`` followed by the body.
    """
    # Local import: the ``html`` parameter shadows the stdlib module name.
    from html import escape

    soup = BeautifulSoup(html, "lxml")
    main = soup.select_one("main#content") or soup

    # The cleanup below removes every <header>, and a post title nested in one
    # (e.g. <header class="entry-header"><h1 class="entry-title">) would be
    # destroyed with it.  Serialize it now so it can be used as a fallback.
    early_title = (main.select_one("article") or main).select_one("h1.entry-title")
    fallback_title_html = str(early_title) if early_title is not None else ""

    _remove_all(main, _CHROME_SELECTORS)

    article = main.select_one("article") or main
    h1 = article.select_one("h1.entry-title, h1")
    # Prefer the heading that survived cleanup so that output is unchanged
    # whenever one exists; otherwise fall back to the pre-cleanup title.
    title_html = str(h1) if h1 is not None else fallback_title_html

    content = None
    for selector in _CONTENT_SELECTORS:
        content = article.select_one(selector)
        if content is not None:
            break
    if content is None:
        # The body may live outside the chosen article/main container.
        content = soup.select_one(".entry-content")

    if content is not None:
        _remove_all(content, _IN_CONTENT_SELECTORS)
        return title_html + str(content)

    # Last resort: let readability guess the main content from the raw HTML.
    doc = Document(html)
    # The title is plain text, so escape it before embedding it in markup.
    return f"<h1>{escape(doc.short_title(), quote=False)}</h1>{doc.summary()}"