File size: 2,189 Bytes
00e61ba
 
 
 
5f01545
30f4683
00e61ba
 
5f01545
3c8bf47
 
 
 
00e61ba
 
 
 
 
 
 
 
 
629d00c
00e61ba
 
 
 
 
 
 
 
 
 
 
 
 
3c8bf47
00e61ba
 
30f4683
00e61ba
 
30f4683
00e61ba
 
 
 
 
 
 
 
30f4683
5f01545
00e61ba
 
5f01545
00e61ba
 
5f01545
00e61ba
 
5f01545
 
 
00e61ba
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
import nltk
# Download the NLTK tokenizer resources automatically when the module loads.
for _resource in ("punkt", "punkt_tab"):
    nltk.download(_resource)

import gradio as gr
import trafilatura
import requests
from markdownify import markdownify as md
from sumy.parsers.plaintext import PlaintextParser
from sumy.nlp.tokenizers import Tokenizer
from sumy.summarizers.text_rank import TextRankSummarizer

def summarize_text(text, sentence_count=3):
    """Return an extractive TextRank summary of *text*.

    Args:
        text: The text to summarize.
        sentence_count: Number of sentences requested from the summarizer.

    Returns:
        The sentences chosen by the summarizer, joined with newlines.
    """
    # The "english" tokenizer is used so that Korean text is also split
    # into sentence units.
    tokenizer = Tokenizer("english")
    document = PlaintextParser.from_string(text, tokenizer).document
    chosen = TextRankSummarizer()(document, sentence_count)
    return "\n".join(map(str, chosen))

def extract_and_summarize(url):
    """Fetch a web page, extract its main content as Markdown, and summarize it.

    Args:
        url: Address of the page to fetch. Surrounding whitespace is ignored.

    Returns:
        A ``(markdown_text, summary)`` tuple of strings. On any failure the
        first element is a user-facing error message and the second is ``""``;
        no exception propagates to the caller.
    """
    headers = {"User-Agent": "Mozilla/5.0"}
    try:
        r = requests.get((url or "").strip(), headers=headers, timeout=10)
        r.raise_for_status()

        # When the server declares no charset, requests decodes text bodies
        # with its ISO-8859-1 fallback, which garbles non-Latin pages. Use the
        # encoding detected from the body bytes in that case.
        if "charset" not in r.headers.get("Content-Type", "").lower():
            r.encoding = r.apparent_encoding

        # Extract the main content as HTML.
        html_content = trafilatura.extract(
            r.text,
            output_format="html",
            include_tables=True,
            favor_recall=True,
        )

        if not html_content:
            return "๋ณธ๋ฌธ์„ ์ถ”์ถœํ•  ์ˆ˜ ์—†์Šต๋‹ˆ๋‹ค.", ""

        # Convert HTML to Markdown.
        markdown_text = md(html_content, heading_style="ATX")

        # Build the summary.
        summary = summarize_text(markdown_text, sentence_count=3)

        return markdown_text, summary

    except requests.exceptions.Timeout:
        return "์š”์ฒญ์ด ์‹œ๊ฐ„ ์ดˆ๊ณผ๋˜์—ˆ์Šต๋‹ˆ๋‹ค.", ""
    except requests.exceptions.RequestException as e:
        return f"์š”์ฒญ ์‹คํŒจ: {e}", ""
    except Exception as e:
        # UI boundary: report any other failure as text instead of raising.
        return f"์—๋Ÿฌ ๋ฐœ์ƒ: {e}", ""

# Gradio UI: one URL text box as input; the extracted Markdown body and a
# summary text box as outputs.
iface = gr.Interface(
    title="๋ณธ๋ฌธ ์ถ”์ถœ๊ธฐ + ์ž๋™ ์š”์•ฝ",
    description="์›นํŽ˜์ด์ง€ URL์„ ์ž…๋ ฅํ•˜๋ฉด ๋ณธ๋ฌธ์„ ์ถ”์ถœํ•˜๊ณ , TextRank ์•Œ๊ณ ๋ฆฌ์ฆ˜์œผ๋กœ 3๋ฌธ์žฅ ์š”์•ฝ์„ ์ œ๊ณตํ•ฉ๋‹ˆ๋‹ค.",
    fn=extract_and_summarize,
    inputs=gr.Textbox(placeholder="https://example.com", label="URL ์ž…๋ ฅ"),
    outputs=[
        gr.Markdown(label="์ถ”์ถœ๋œ ๋ณธ๋ฌธ"),
        gr.Textbox(lines=5, label="์ž๋™ ์š”์•ฝ"),
    ],
)

# Launch the app only when this file is run as a script, not when imported.
if __name__ == "__main__":
    iface.launch()