orgoflu committed on
Commit
3c8bf47
ยท
verified ยท
1 Parent(s): eebc78a

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +227 -67
app.py CHANGED
@@ -1,100 +1,260 @@
1
- import nltk
2
- nltk.download("punkt")
 
 
3
 
 
 
 
 
 
4
  import gradio as gr
5
- import trafilatura, requests, re
6
  from markdownify import markdownify as md
 
 
 
 
 
 
 
7
  from transformers import AutoModelForCausalLM, AutoTokenizer, pipeline
8
 
9
- # ===== ๋ชจ๋ธ ๋ชฉ๋ก =====
 
 
 
10
  MODEL_OPTIONS = {
11
  "Qwen2.5-1.5B-Instruct": "Qwen/Qwen2.5-1.5B-Instruct",
12
- "CLOVA-Text(๋Œ€์ฒด)": "skt/kogpt2-base-v2" # ํ—ˆ๊ฐ€ ์—†์ด ์‚ฌ์šฉ ๊ฐ€๋Šฅ
 
 
 
 
 
 
 
13
  }
14
 
15
- # ===== ํ…์ŠคํŠธ ๋ชจ๋ธ ๋กœ๋“œ =====
16
- def load_text_model(model_choice):
 
 
 
 
17
  model_name = MODEL_OPTIONS[model_choice]
18
- tokenizer = AutoTokenizer.from_pretrained(model_name)
19
- model = AutoModelForCausalLM.from_pretrained(model_name).to("cpu")
20
- return pipeline("text-generation", model=model, tokenizer=tokenizer, device=-1)
 
 
21
 
22
- # ===== ํ…์ŠคํŠธ ์ „์ฒ˜๋ฆฌ =====
23
- def clean_text(text):
24
- return re.sub(r'\s+', ' ', text).strip()
 
25
 
26
- # ===== ํ…์ŠคํŠธ ๋ถ„ํ•  =====
27
- def chunk_text(text, chunk_size=500):
28
- text = clean_text(text)
29
- return [text[i:i+chunk_size] for i in range(0, len(text), chunk_size)]
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
30
 
31
- # ===== LLM ์š”์•ฝ =====
32
- def llm_summary(text, model_choice):
33
- llm = load_text_model(model_choice)
34
- prompt = f"๋‹ค์Œ ๊ธ€์„ 3๋ฌธ์žฅ ์ด๋‚ด๋กœ ์š”์•ฝ:\n{text}"
35
- out = llm(prompt, max_new_tokens=150, do_sample=False, temperature=0.7,
36
- repetition_penalty=1.2, no_repeat_ngram_size=3)
37
- return out[0]["generated_text"].replace(prompt, "").strip()
38
-
39
- # ===== ๋ถ„ํ•  ์š”์•ฝ โ†’ ํ†ตํ•ฉ ์š”์•ฝ =====
40
- def multi_stage_summary(text, model_choice):
41
- chunks = chunk_text(text)
42
- partial_summaries = [llm_summary(chunk, model_choice) for chunk in chunks]
43
- combined_summary = " ".join(partial_summaries)
44
- return llm_summary(combined_summary, model_choice)
45
-
46
- # ===== ์žฌ์ž‘์„ฑ =====
47
- def rewrite_with_llm(text, model_choice):
48
  llm = load_text_model(model_choice)
49
- prompt = f"""๋‹ค์Œ ๋ฌธ์žฅ์„ ์˜๋ฏธ๋Š” ์œ ์ง€ํ•˜๋˜, ์›๋ฌธ์— ์—†๋Š” ๋‚ด์šฉ์€ ์ ˆ๋Œ€ ์ถ”๊ฐ€ํ•˜์ง€ ๋ง๊ณ ,
50
- ๋ฐ˜๋ณต ์—†์ด ๊ฐ„๊ฒฐํ•˜๊ณ  ๋งค๋„๋Ÿฝ๊ฒŒ ๋ฐ”๊ฟ”์ฃผ์„ธ์š”.
51
-
52
- ๋ฌธ์žฅ:
53
- {text}
54
- """
55
- out = llm(prompt, max_new_tokens=200, do_sample=False, temperature=0.7,
56
- repetition_penalty=1.2, no_repeat_ngram_size=3)
57
- return out[0]["generated_text"].replace(prompt, "").strip()
58
-
59
- # ===== URL ์ฒ˜๋ฆฌ =====
60
- def process_url(url, model_choice):
 
 
 
 
 
 
61
  try:
62
- r = requests.get(url, headers={"User-Agent": "Mozilla/5.0"}, timeout=10)
 
63
  r.raise_for_status()
64
 
65
- # ์›๋ฌธ ์ถ”์ถœ
66
- plain_text = trafilatura.extract(r.text, output_format="txt", include_tables=False, favor_recall=True) or ""
67
- html_content = trafilatura.extract(r.text, output_format="html", include_tables=False, favor_recall=True)
 
 
 
 
 
 
 
 
 
 
 
 
68
  markdown_text = md(html_content or r.text, heading_style="ATX")
69
 
70
- # ์ฒซ ์ค„ ํˆดํŒ
71
- first_line = plain_text.strip().split("\n")[0].strip()
72
- link_html = f'<a href="{url}" title="{first_line}" target="_blank">์›๋ฌธ ๋ณด๊ธฐ</a>'
 
 
 
73
 
74
- # ๋ถ„ํ•  ์š”์•ฝ โ†’ ํ†ตํ•ฉ ์š”์•ฝ
75
- final_summary = multi_stage_summary(plain_text, model_choice)
 
 
 
 
 
 
 
 
 
 
 
76
 
77
- # ์žฌ์ž‘์„ฑ
78
- paraphrased_text = rewrite_with_llm(final_summary, model_choice)
 
 
 
 
79
 
80
- return link_html + "<br><br>" + markdown_text, final_summary, paraphrased_text
81
  except Exception as e:
82
- return f"์—๋Ÿฌ ๋ฐœ์ƒ: {e}", "์š”์•ฝ ์—†์Œ", "์žฌ์ž‘์„ฑ ์—†์Œ"
 
 
 
 
83
 
84
- # ===== Gradio UI =====
85
  iface = gr.Interface(
86
  fn=process_url,
87
  inputs=[
88
- gr.Textbox(label="URL ์ž…๋ ฅ", placeholder="https://example.com"),
89
- gr.Dropdown(choices=list(MODEL_OPTIONS.keys()), value="Qwen2.5-1.5B-Instruct", label="์žฌ์ž‘์„ฑ ๋ชจ๋ธ ์„ ํƒ")
 
 
90
  ],
91
  outputs=[
92
- gr.HTML(label="์›๋ฌธ ๋งํฌ + ์ถ”์ถœ๋œ ๋ณธ๋ฌธ"),
93
- gr.Textbox(label="์ž๋™ ์š”์•ฝ", lines=5),
94
- gr.Textbox(label="์ž๋™ ์žฌ์ž‘์„ฑ (LLM)", lines=5)
 
95
  ],
96
- title="ํ•œ๊ตญ์–ด ๋ณธ๋ฌธ ์ถ”์ถœ + ๋ถ„ํ•  ์š”์•ฝ + LLM ์žฌ์ž‘์„ฑ",
97
- description="๊ธด ์›๋ฌธ๋„ ๋ถ„ํ•  ์š”์•ฝ ํ›„ ํ†ตํ•ฉ ์žฌ์ž‘์„ฑ์œผ๋กœ ํ’ˆ์งˆ ์œ ์ง€"
98
  )
99
 
100
  if __name__ == "__main__":
 
1
+ # app.py
2
+ # ํ•œ๊ตญ์–ด ๊ธฐ์‚ฌ ์ถ”์ถœ โ†’ ์„ ํƒ ์••์ถ•(๋น„๋ถ„ํ• ) โ†’ LLM ์š”์•ฝ โ†’ LLM ์žฌ์ž‘์„ฑ
3
+ # ๋ชจ๋ธ: Qwen2.5-1.5B-Instruct, skt/kogpt2-base-v2 (๋‘˜ ๋‹ค ์œ ์ง€)
4
+ # ํ—›์†Œ๋ฆฌ/๋ฐ˜๋ณต ์–ต์ œ: ์ „์ฒ˜๋ฆฌ, ๋””์ฝ”๋”ฉ ์ œ์•ฝ, ๊ฒฐ๊ณผ ๊ฒ€์ฆ(ํด๋ฐฑ) ์ ์šฉ
5
 
6
+ import re
7
+ import time
8
+ import uuid
9
+ import json
10
+ import requests
11
  import gradio as gr
12
+ import trafilatura
13
  from markdownify import markdownify as md
14
+
15
+ # ์„ ํƒ ์••์ถ•(๋ฌธ๋งฅ ๋ณด์กดํ˜• ๋ฌธ์žฅ ์„ ํƒ)
16
+ from sumy.parsers.plaintext import PlaintextParser
17
+ from sumy.nlp.tokenizers import Tokenizer
18
+ from sumy.summarizers.text_rank import TextRankSummarizer
19
+
20
+ # Hugging Face
21
  from transformers import AutoModelForCausalLM, AutoTokenizer, pipeline
22
 
23
# =========================
# Model presets / loader
# =========================

# Display name -> Hugging Face repo id.  The second entry is a stand-in
# model usable without gated access.
MODEL_OPTIONS = {
    "Qwen2.5-1.5B-Instruct": "Qwen/Qwen2.5-1.5B-Instruct",
    "CLOVA-Text(대체)": "skt/kogpt2-base-v2"
}

# Per-model decoding constraints (greedy decoding plus repetition guards)
# used by llm_generate to curb hallucination/repetition.
PRESETS = {
    "Qwen2.5-1.5B-Instruct": dict(do_sample=False, temperature=0.2, top_p=0.9,
                                  repetition_penalty=1.2, no_repeat_ngram_size=3),
    "CLOVA-Text(대체)": dict(do_sample=False, temperature=0.2, top_p=0.9,
                            repetition_penalty=1.25, no_repeat_ngram_size=4),
}

# Simple in-process cache (avoids reloading a model within a session).
_PIPELINES = {}
41
+
42
def load_text_model(model_choice: str):
    """Return a cached CPU text-generation pipeline for the chosen model key."""
    cached = _PIPELINES.get(model_choice)
    if cached is not None:
        return cached
    repo_id = MODEL_OPTIONS[model_choice]
    tokenizer = AutoTokenizer.from_pretrained(repo_id)
    model = AutoModelForCausalLM.from_pretrained(repo_id)
    generator = pipeline("text-generation", model=model, tokenizer=tokenizer, device=-1)  # CPU
    _PIPELINES[model_choice] = generator
    return generator
51
 
52
def llm_generate(llm, prompt: str, model_choice: str, max_new_tokens: int):
    """Run one generation call with the per-model decoding preset applied.

    Falls back to the Qwen preset when the model key has no preset entry.
    """
    preset = dict(PRESETS.get(model_choice, PRESETS["Qwen2.5-1.5B-Instruct"]))
    result = llm(prompt, max_new_tokens=max_new_tokens, **preset)
    return result[0]["generated_text"]
56
 
57
+ # =========================
58
+ # ์ „์ฒ˜๋ฆฌ / ์„ ํƒ ์••์ถ• / ๊ฐ€๋“œ๋ ˆ์ผ
59
+ # =========================
60
+
61
def dedup_lines(text: str) -> str:
    """Join the text into one space-separated string, dropping blank lines
    and any line already seen (first occurrence wins)."""
    seen_lines = set()
    kept = []
    for raw_line in text.splitlines():
        stripped = raw_line.strip()
        if stripped and stripped not in seen_lines:
            seen_lines.add(stripped)
            kept.append(stripped)
    return " ".join(kept)
69
+
70
def compress_repeated_phrases(text: str) -> str:
    """Collapse a short phrase repeated 4+ consecutive times down to 3 copies
    (a common artifact of scraped/LLM-looped text)."""
    repeat_pattern = re.compile(r"(\S.{3,20}?)\s+(?:\1\s+){3,}")
    return repeat_pattern.sub(r"\1 \1 \1 ", text)
73
+
74
def preprocess(text: str) -> str:
    """Normalize raw article text: dedupe lines, squash phrase repeats,
    then collapse all whitespace runs to single spaces."""
    cleaned = compress_repeated_phrases(dedup_lines(text))
    return re.sub(r"\s+", " ", cleaned).strip()
79
+
80
def select_key_sentences(text: str, target_chars: int = 1200, k: int = 10) -> str:
    """Non-chunking condensation: pick key sentences from the whole article,
    preserving original order as much as possible.

    Uses sumy's TextRank to score up to *k* candidate sentences, re-orders
    them by matching against the source's own sentence split, then keeps
    adding sentences until *target_chars* is reached.
    NOTE(review): assumes sumy's Tokenizer supports "korean" (nltk data may
    be required) — confirm; the except-branch fallback covers failures.
    """
    try:
        parser = PlaintextParser.from_string(text, Tokenizer("korean"))
        s = TextRankSummarizer()
        candidates = [str(x) for x in s(parser.document, k)]
        # Split the source into sentences and keep, in source order, only
        # those containing a TextRank candidate.
        sentences = re.split(r'(?<=[.!?。])\s+', text)
        ordered = [sent for sent in sentences if any(c in sent for c in candidates)]
        out, total = [], 0
        # Greedily accumulate sentences until the character budget is hit.
        for sent in (ordered or candidates):
            if not sent.strip():
                continue
            if total + len(sent) <= target_chars:
                out.append(sent)
                total += len(sent)
            else:
                break
        if out:
            return " ".join(out)
        return text[:target_chars]
    except Exception:
        # Safe fallback when sumy/tokenization fails: plain prefix slice.
        return text[:target_chars]
107
+
108
def hard_limit(s: str, n: int) -> str:
    """Truncate *s* to at most *n* characters and drop trailing whitespace."""
    truncated = s[:n]
    return truncated.rstrip()
110
+
111
def jaccard(a: str, b: str) -> float:
    """Jaccard similarity between the whitespace-token sets of *a* and *b*."""
    tokens_a = set(a.split())
    tokens_b = set(b.split())
    if not (tokens_a and tokens_b):
        return 0.0
    return len(tokens_a & tokens_b) / len(tokens_a | tokens_b)

# Topic words that indicate the model drifted off the source article.
BANNED = ["딸기", "연애", "연예", "커뮤니티"]

def validate(original: str, summary: str, fallback: str) -> str:
    """Return *summary* if it stays close to *original* and contains no
    banned word; otherwise return *fallback*."""
    too_dissimilar = jaccard(original, summary) < 0.15
    contains_banned = any(word in summary for word in BANNED)
    if too_dissimilar or contains_banned:
        return fallback
    return summary
126
+
127
+ # =========================
128
+ # ํ”„๋กฌํ”„ํŠธ
129
+ # =========================
130
+
131
def build_summary_prompt(context: str) -> str:
    """Build the Korean summarization prompt (role + rules + article context).

    The rules ask for a 3-sentence, <=300-character summary and forbid adding
    facts/figures/quotes not present in the source.
    """
    return f"""역할: 한국어 기사 요약 전문가.
규칙:
- 원문에 없는 사실/수치/인용 추가 금지
- 3문장, 300자 이내
- 중복 표현 금지
- 광고/추천 기사/외부 링크 내용 제외

원문:
{context}

요약:"""
143
+
144
def build_rewrite_prompt(summary: str) -> str:
    """Build the Korean polishing prompt for rewriting a summary.

    The rules require meaning preservation (no added/removed facts), a
    2-3 sentence / <=250-character result, and no repeated phrasing.
    """
    return f"""역할: 한국어 문장 다듬기 전문가.
규칙:
- 의미 보존, 사실 추가/삭제 금지
- 2~3문장, 250자 이내
- 같은 구절 반복 금지
- 간결하고 명확하게

대상:
{summary}

개선본:"""
156
+
157
+ # =========================
158
+ # ํŒŒ์ดํ”„๋ผ์ธ
159
+ # =========================
160
+
161
def _generate_clean(llm, prompt: str, model_choice: str, max_new_tokens: int) -> str:
    """Run one LLM call and return the completion with the echoed prompt removed."""
    generated = llm_generate(llm, prompt, model_choice, max_new_tokens=max_new_tokens)
    return generated.replace(prompt, "").strip()


def run_pipeline(plain_text: str, model_choice: str):
    """Preprocess -> condense -> LLM summary -> LLM rewrite, with fallbacks.

    Returns (summary, rewrite, latency_ms, preprocessed_src, condensed_context).
    """
    started = time.time()

    src = preprocess(plain_text)
    condensed = select_key_sentences(src, target_chars=1200, k=10)
    llm = load_text_model(model_choice)

    # Summary pass; validation falls back to the extractive context prefix.
    summary = hard_limit(
        _generate_clean(llm, build_summary_prompt(condensed), model_choice, 220), 300)
    summary = validate(src, summary, condensed[:300])

    # Rewrite pass; validation falls back to the summary itself.
    rewrite = hard_limit(
        _generate_clean(llm, build_rewrite_prompt(summary), model_choice, 200), 250)
    rewrite = validate(src, rewrite, summary)

    latency_ms = int((time.time() - started) * 1000)
    return summary, rewrite, latency_ms, src, condensed
183
+
184
def process_url(url: str, model_choice: str):
    """Fetch a news URL, extract the article, and summarize/rewrite it.

    Returns a 4-tuple matching the Gradio outputs:
    (link + markdown preview HTML, summary, rewrite, latency string).
    On any failure an error HTML string plus empty strings is returned so
    the UI stays consistent.
    """
    try:
        # Fetch
        r = requests.get(url, headers={"User-Agent": "Mozilla/5.0"}, timeout=12)
        r.raise_for_status()
        # FIX: requests defaults to ISO-8859-1 when the server omits a
        # charset, which garbles Korean pages before extraction — prefer
        # the content-sniffed encoding in that case.
        if not r.encoding or r.encoding.lower() == "iso-8859-1":
            r.encoding = r.apparent_encoding or "utf-8"

        # Body extraction: plain text for the pipeline, HTML for the preview.
        plain_text = trafilatura.extract(
            r.text,
            output_format="txt",
            include_tables=False,
            include_comments=False,
            favor_recall=True
        ) or ""
        html_content = trafilatura.extract(
            r.text,
            output_format="html",
            include_tables=False,
            include_comments=False,
            favor_recall=True
        )
        markdown_text = md(html_content or r.text, heading_style="ATX")

        # Run the summarize/rewrite pipeline.
        summary, rewrite, latency_ms, src, condensed = run_pipeline(plain_text, model_choice)

        # Source link with the first article line as hover tooltip.
        header = plain_text.strip().split("\n")[0].strip() if plain_text else url
        link_html = f'<a href="{url}" title="{header}" target="_blank">원문 보기</a>'

        # Structured console log for quick quality monitoring.
        print(json.dumps({
            "id": str(uuid.uuid4()),
            "model": model_choice,
            "url": url,
            "len_src": len(src),
            "len_condensed": len(condensed),
            "len_sum": len(summary),
            "len_rw": len(rewrite),
            "jaccard_sum": jaccard(src, summary),
            "jaccard_rw": jaccard(src, rewrite),
            "latency_ms": latency_ms
        }, ensure_ascii=False))

        return (
            link_html + "<br><br>" + markdown_text,
            summary,
            rewrite,
            f"{latency_ms} ms"
        )

    except Exception as e:
        # Boundary handler: surface the error in the UI instead of crashing.
        return f"<b>에러</b>: {e}", "", "", ""
237
+
238
+ # =========================
239
+ # UI
240
+ # =========================
241
 
 
242
# Gradio interface: URL + model choice in; preview HTML, summary, rewrite,
# and latency out — the four values returned by process_url.
iface = gr.Interface(
    fn=process_url,
    inputs=[
        gr.Textbox(label="URL 입력", placeholder="https://n.news.naver.com/..."),
        gr.Dropdown(choices=list(MODEL_OPTIONS.keys()),
                    value="Qwen2.5-1.5B-Instruct",
                    label="모델 선택")
    ],
    outputs=[
        gr.HTML(label="원문 링크 + 추출된 본문 미리보기"),
        gr.Textbox(label="자동 요약(3문장/300자 이내)", lines=6),
        gr.Textbox(label="자동 재작성(2~3문장/250자 이내)", lines=6),
        gr.Textbox(label="지연 시간", lines=1)
    ],
    title="한국어 뉴스 요약·재작성 (비분할 컨텍스트)",
    description="파싱 원문 전체를 선택적으로 압축해 문맥을 유지하고, LLM 요약/재작성에 강한 제약과 폴백을 적용합니다."
)
259
 
260
  if __name__ == "__main__":