File size: 2,736 Bytes
b454ab3
 
 
 
c221558
 
 
 
f6a9bc3
 
 
e97007d
c221558
f6a9bc3
dbf0822
f6a9bc3
 
e97007d
f6a9bc3
e97007d
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
c221558
 
 
 
 
 
 
 
 
 
 
 
 
e97007d
c221558
 
b454ab3
e97007d
 
c221558
e97007d
c221558
 
e97007d
c221558
 
e97007d
c221558
f6a9bc3
 
e97007d
 
f6a9bc3
e97007d
 
c221558
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
import nltk
# Fetch the NLTK sentence-tokenizer models up front: sumy's Tokenizer("english")
# (used in summarize_text below) needs "punkt"; newer NLTK releases look for the
# split "punkt_tab" resource as well. nltk.download() is a no-op if already cached.
nltk.download("punkt")
nltk.download("punkt_tab")

import gradio as gr
import trafilatura
import requests
from markdownify import markdownify as md
from sumy.parsers.plaintext import PlaintextParser
from sumy.nlp.tokenizers import Tokenizer
from sumy.summarizers.text_rank import TextRankSummarizer
import re

def summarize_text(text: str, sentence_count: int = 3, language: str = "english") -> list[str]:
    """Extract the *sentence_count* most central sentences from *text*.

    Uses sumy's TextRank (graph-based, extractive) summarizer, so the
    returned sentences are verbatim sentences from the input, ranked by
    centrality — not generated text.

    Args:
        text: Plain text (or markdown) to summarize.
        sentence_count: Maximum number of sentences to return; if the input
            has fewer sentences, all of them are returned.
        language: Tokenizer language passed to sumy (default "english",
            matching the original hard-coded value).

    Returns:
        The selected sentences as plain strings, in document order.
    """
    parser = PlaintextParser.from_string(text, Tokenizer(language))
    summarizer = TextRankSummarizer()
    summary_sentences = summarizer(parser.document, sentence_count)
    return [str(sentence) for sentence in summary_sentences]

def paraphrase_text(sentences: list[str]) -> list[str]:
    """Lightly paraphrase Korean sentences without an LLM.

    Two rule-based transforms are applied to each sentence:
    1. A fixed table of literal synonym substitutions.
    2. A word-order tweak: the text is split at the first topic marker
       "๋Š”" and the two halves are swapped around a separator.

    Quality is intentionally basic — this is a local, dependency-free stand-in
    for a real paraphrasing model.

    Args:
        sentences: Sentences to rewrite (an empty list yields an empty list).

    Returns:
        One paraphrased string per input sentence, in the same order.
    """
    # Literal synonym table. Keys are plain substrings, so use str.replace
    # rather than re.sub: re.sub would interpret the keys as regex patterns,
    # which is fragile if any key ever contains a metacharacter.
    replacements = {
        "๋…ธ์กฐ": "๋…ธ๋™์กฐํ•ฉ",
        "์„ฑ๊ณผ๊ธ‰": "์„ฑ๊ณผ ๋ณด์ƒ๊ธˆ",
        "์š”๊ตฌ": "์š”์ฒญ",
        "๋ถˆ๋งŒ": "์ด์˜ ์ œ๊ธฐ",
        "ํ•ฉ์˜์•ˆ": "ํ˜‘์ƒ ๊ฒฐ๊ณผ์•ˆ"
    }
    paraphrased = []
    for sentence in sentences:
        rewritten = sentence
        for old, new in replacements.items():
            rewritten = rewritten.replace(old, new)
        # Reorder around the first topic marker "๋Š”", if present.
        # NOTE(review): the "โ€”" separator looks like a mojibake em dash ("—");
        # kept byte-identical here — confirm the intended character upstream.
        if "๋Š”" in rewritten:
            topic, remainder = rewritten.split("๋Š”", 1)
            rewritten = f"{remainder.strip()} โ€” {topic.strip()}๋Š”"
        paraphrased.append(rewritten)
    return paraphrased

def extract_summarize_paraphrase(url):
    """Fetch *url* and produce the three strings shown by the Gradio UI.

    Pipeline: HTTP GET → trafilatura main-content extraction (as HTML) →
    markdown conversion → 3-sentence TextRank summary → rule-based paraphrase.

    Args:
        url: Web page address entered by the user.

    Returns:
        A 3-tuple ``(markdown_body, summary, paraphrase)``. On extraction
        failure or any exception, the first element carries the (Korean)
        error message and the other two are empty strings — the UI never
        sees a raised exception.
    """
    try:
        # Browser-like UA header: some sites refuse default client UAs.
        response = requests.get(
            url,
            headers={"User-Agent": "Mozilla/5.0"},
            timeout=10,
        )
        response.raise_for_status()

        # favor_recall keeps more borderline content; tables included.
        extracted_html = trafilatura.extract(
            response.text,
            output_format="html",
            include_tables=True,
            favor_recall=True
        )
        if not extracted_html:
            return "๋ณธ๋ฌธ์„ ์ถ”์ถœํ•  ์ˆ˜ ์—†์Šต๋‹ˆ๋‹ค.", "", ""

        body_markdown = md(extracted_html, heading_style="ATX")

        summary = summarize_text(body_markdown, sentence_count=3)
        rewritten = paraphrase_text(summary)

        return body_markdown, "\n".join(summary), "\n".join(rewritten)
    except Exception as e:
        return f"์—๋Ÿฌ ๋ฐœ์ƒ: {e}", "", ""

# Gradio wiring: one URL textbox in; three outputs out (extracted body rendered
# as markdown, plus two plain textboxes for the summary and the paraphrase),
# matching the 3-tuple returned by extract_summarize_paraphrase.
iface = gr.Interface(
    fn=extract_summarize_paraphrase,
    inputs=gr.Textbox(label="URL ์ž…๋ ฅ", placeholder="https://example.com"),
    outputs=[
        gr.Markdown(label="์ถ”์ถœ๋œ ๋ณธ๋ฌธ"),
        gr.Textbox(label="์ž๋™ ์š”์•ฝ", lines=5),
        gr.Textbox(label="์ž๋™ ์žฌ์ž‘์„ฑ", lines=5)
    ],
    title="๋ณธ๋ฌธ ์ถ”์ถœ๊ธฐ + ์ž๋™ ์š”์•ฝ + ์ž๋™ ์žฌ์ž‘์„ฑ",
    description="์›นํŽ˜์ด์ง€ URL์„ ์ž…๋ ฅํ•˜๋ฉด ๋ณธ๋ฌธ์„ ์ถ”์ถœํ•˜๊ณ , 3๋ฌธ์žฅ ์š”์•ฝ๊ณผ ์žฌ์ž‘์„ฑ(Paraphrasing) ๊ฒฐ๊ณผ๋ฅผ ์ œ๊ณตํ•ฉ๋‹ˆ๋‹ค."
)

# Start the local Gradio server only when executed as a script (not on import).
if __name__ == "__main__":
    iface.launch()