orgoflu committed on
Commit
42dfadf
·
verified ·
1 Parent(s): f7edcfe
Files changed (1) hide show
  1. app.py +113 -0
app.py ADDED
@@ -0,0 +1,113 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
# NLTK sentence-tokenizer data ("punkt") is required by sumy's Tokenizer;
# fetch it once at startup before any summarization runs.
import nltk
nltk.download("punkt")

import gradio as gr
import trafilatura
import requests
from markdownify import markdownify as md
from sumy.parsers.plaintext import PlaintextParser
from sumy.nlp.tokenizers import Tokenizer
from sumy.summarizers.text_rank import TextRankSummarizer
import re
# ===== Utilities =====
def clean_text(text: str) -> str:
    """Collapse every run of whitespace into a single space and trim both ends."""
    return re.sub(r'\s+', ' ', text).strip()
def remove_duplicates(sentences):
    """Strip each sentence, drop empties, and de-duplicate.

    Order of first occurrence is preserved (dict.fromkeys keeps insertion order).
    """
    stripped = (s.strip() for s in sentences)
    return list(dict.fromkeys(s for s in stripped if s))
# ===== Summarization =====
def summarize_text(text):
    """Summarize *text* with TextRank and return a de-duplicated list of sentences.

    The number of summary sentences scales with the cleaned text length
    (1 sentence under 300 chars, up to 4 for 1500+).
    """
    normalized = clean_text(text)
    size = len(normalized)

    # Scale the summary length with the document length.
    if size >= 1500:
        n_sentences = 4
    elif size >= 800:
        n_sentences = 3
    elif size >= 300:
        n_sentences = 2
    else:
        n_sentences = 1

    document = PlaintextParser.from_string(normalized, Tokenizer("korean")).document
    ranked = TextRankSummarizer()(document, n_sentences)
    return remove_duplicates([str(sentence) for sentence in ranked])
50
+
51
+ # ===== ๊ฐ„๋‹จ ํŒจ๋Ÿฌํ”„๋ ˆ์ด์ฆˆ =====
52
+ def paraphrase_text(sentences):
53
+ replacements = {
54
+ "๋…ธ์กฐ": "๋…ธ๋™์กฐํ•ฉ",
55
+ "์„ฑ๊ณผ๊ธ‰": "์„ฑ๊ณผ ๋ณด์ƒ๊ธˆ",
56
+ "์š”๊ตฌ": "์š”์ฒญ",
57
+ "๋ถˆ๋งŒ": "์ด์˜ ์ œ๊ธฐ",
58
+ "ํ•ฉ์˜์•ˆ": "ํ˜‘์ƒ ๊ฒฐ๊ณผ์•ˆ"
59
+ }
60
+ paraphrased = []
61
+ for s in sentences:
62
+ new_s = s
63
+ for k, v in replacements.items():
64
+ new_s = re.sub(k, v, new_s)
65
+ # ์–ด์ˆœ ๋ณ€๊ฒฝ ์˜ˆ์‹œ
66
+ if "๋Š”" in new_s:
67
+ parts = new_s.split("๋Š”", 1)
68
+ if len(parts) == 2 and parts[0] and parts[1]:
69
+ new_s = f"{parts[1].strip()} โ€” {parts[0].strip()}๋Š”"
70
+ paraphrased.append(new_s)
71
+ return paraphrased
72
+
73
+ # ===== ์ „์ฒด ํŒŒ์ดํ”„๋ผ์ธ =====
74
+ def extract_summarize_paraphrase(url):
75
+ headers = {"User-Agent": "Mozilla/5.0"}
76
+ try:
77
+ r = requests.get(url, headers=headers, timeout=10)
78
+ r.raise_for_status()
79
+
80
+ html_content = trafilatura.extract(
81
+ r.text,
82
+ output_format="html",
83
+ include_tables=False,
84
+ favor_recall=True
85
+ )
86
+
87
+ if not html_content:
88
+ return "๋ณธ๋ฌธ์„ ์ถ”์ถœํ•  ์ˆ˜ ์—†์Šต๋‹ˆ๋‹ค.", "", ""
89
+
90
+ markdown_text = md(html_content, heading_style="ATX")
91
+ summary_sentences = summarize_text(markdown_text)
92
+ paraphrased_sentences = paraphrase_text(summary_sentences)
93
+
94
+ return markdown_text, "\n".join(summary_sentences), "\n".join(paraphrased_sentences)
95
+
96
+ except Exception as e:
97
+ return f"์—๋Ÿฌ ๋ฐœ์ƒ: {e}", "", ""
98
+
99
+ # ===== Gradio UI =====
100
+ iface = gr.Interface(
101
+ fn=extract_summarize_paraphrase,
102
+ inputs=gr.Textbox(label="URL ์ž…๋ ฅ", placeholder="https://example.com"),
103
+ outputs=[
104
+ gr.Markdown(label="์ถ”์ถœ๋œ ๋ณธ๋ฌธ"),
105
+ gr.Textbox(label="์ž๋™ ์š”์•ฝ", lines=5),
106
+ gr.Textbox(label="์ž๋™ ์žฌ์ž‘์„ฑ", lines=5)
107
+ ],
108
+ title="ํ•œ๊ตญ์–ด ๋ณธ๋ฌธ ์ถ”์ถœ + ์ž๋™ ์š”์•ฝ + ์ž๋™ ์žฌ์ž‘์„ฑ",
109
+ description="์›นํŽ˜์ด์ง€ URL์„ ์ž…๋ ฅํ•˜๋ฉด ๋ณธ๋ฌธ์„ ์ถ”์ถœํ•˜๊ณ , ๊ธธ์ด์— ๋งž์ถฐ ์ž๋™์œผ๋กœ ๋ฌธ์žฅ ์ˆ˜๋ฅผ ์กฐ์ ˆํ•ด ์š”์•ฝํ•˜๋ฉฐ, ๊ฐ„๋‹จํ•œ ์žฌ์ž‘์„ฑ ๊ฒฐ๊ณผ๋ฅผ ์ œ๊ณตํ•ฉ๋‹ˆ๋‹ค."
110
+ )
111
+
112
+ if __name__ == "__main__":
113
+ iface.launch()