orgoflu committed on
Commit
0dc5312
·
verified ·
1 Parent(s): 629d00c

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +11 -66
app.py CHANGED
@@ -1,6 +1,5 @@
1
  # app.py
2
- # HTML 파싱 → 자동요약(TextRank, 가드) → LLM 재작성
3
- # Qwen2.5-1.5B-Instruct, skt/kogpt2-base-v2
4
 
5
  import requests
6
  import trafilatura
@@ -11,34 +10,10 @@ from sumy.parsers.plaintext import PlaintextParser
11
  from sumy.nlp.tokenizers import Tokenizer
12
  from sumy.summarizers.text_rank import TextRankSummarizer
13
 
14
- from transformers import AutoModelForCausalLM, AutoTokenizer, pipeline
15
-
16
- # ===== 모델 설정 =====
17
-
18
- MODEL_OPTIONS = {
19
- "Qwen2.5-1.5B-Instruct": "Qwen/Qwen2.5-1.5B-Instruct",
20
- "CLOVA-Text(대체)": "skt/kogpt2-base-v2"
21
- }
22
- _PIPELINES = {}
23
- def load_llm(model_choice: str):
24
- if model_choice in _PIPELINES:
25
- return _PIPELINES[model_choice]
26
- model_id = MODEL_OPTIONS[model_choice]
27
- tok = AutoTokenizer.from_pretrained(model_id)
28
- mdl = AutoModelForCausalLM.from_pretrained(model_id)
29
- pl = pipeline("text-generation", model=mdl, tokenizer=tok, device=-1)
30
- _PIPELINES[model_choice] = pl
31
- return pl
32
-
33
  # ===== 자동요약(TextRank) =====
34
-
35
  def auto_summarize(text: str, n_sentences: int = 3) -> str:
36
- """
37
- Sumy TextRank 기반 추출 요약.
38
- 실패하면 앞 500자 폴백.
39
- """
40
  try:
41
- parser = PlaintextParser.from_string(text, Tokenizer("korean"))
42
  summarizer = TextRankSummarizer()
43
  sents = [str(s) for s in summarizer(parser.document, n_sentences)]
44
  summary = " ".join(sents).strip()
@@ -46,26 +21,10 @@ def auto_summarize(text: str, n_sentences: int = 3) -> str:
46
  except Exception:
47
  return text[:500]
48
 
49
- # ===== LLM 재작성 =====
50
-
51
- def rewrite_with_llm(summary: str, model_choice: str) -> str:
52
- llm = load_llm(model_choice)
53
- prompt = f"다음 요약문을 더 간결하고 매끄럽게 다듬어라:\n{summary}\n"
54
- out = llm(
55
- prompt,
56
- max_new_tokens=150,
57
- do_sample=False,
58
- temperature=0.2,
59
- repetition_penalty=1.2,
60
- no_repeat_ngram_size=3
61
- )[0]["generated_text"]
62
- return out.replace(prompt, "").strip()
63
-
64
  # ===== URL 처리 =====
65
-
66
- def process_url(url: str, model_choice: str):
67
  # 1) HTML 파싱
68
- r = requests.get(url, headers={"User-Agent":"Mozilla/5.0"}, timeout=10)
69
  r.raise_for_status()
70
  plain = trafilatura.extract(
71
  r.text,
@@ -83,39 +42,25 @@ def process_url(url: str, model_choice: str):
83
  )
84
  md_preview = md(html or r.text, heading_style="ATX")
85
 
86
- # 2) 자동요약(TextRank, 폴백 포함)
87
  auto_sum = auto_summarize(plain, n_sentences=3)
88
 
89
- # 3) LLM 재작성
90
- final = rewrite_with_llm(auto_sum, model_choice)
91
-
92
- # 4) 결과 리턴
93
  link_html = f'<a href="{url}" target="_blank">원문 보기</a>'
94
- return (
95
- link_html + "<br><br>" + md_preview,
96
- auto_sum,
97
- final
98
- )
99
 
100
  # ===== Gradio UI =====
101
-
102
  iface = gr.Interface(
103
  fn=process_url,
104
  inputs=[
105
- gr.Textbox(label="URL 입력", placeholder="https://n.news.naver.com/..."),
106
- gr.Dropdown(
107
- choices=list(MODEL_OPTIONS.keys()),
108
- value="Qwen2.5-1.5B-Instruct",
109
- label="λͺ¨λΈ 선택"
110
- )
111
  ],
112
  outputs=[
113
  gr.HTML(label="원문 링크 + λ³Έλ¬Έ 미리보기"),
114
- gr.Textbox(label="자동요약", lines=4),
115
- gr.Textbox(label="LLM 재작성", lines=4)
116
  ],
117
- title="HTML 파싱 → 자동요약 → LLM 재작성",
118
- description="TextRank 자동요약 후 Qwen/KoGPT2로 다듬습니다. 요약 단계에서 에러 발생 시 앞 500자 폴백."
119
  )
120
 
121
  if __name__ == "__main__":
 
1
  # app.py
2
+ # HTML 파싱 → 자동요약(TextRank)만 수행
 
3
 
4
  import requests
5
  import trafilatura
 
10
  from sumy.nlp.tokenizers import Tokenizer
11
  from sumy.summarizers.text_rank import TextRankSummarizer
12
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
13
  # ===== 자동요약(TextRank) =====
 
14
  def auto_summarize(text: str, n_sentences: int = 3) -> str:
 
 
 
 
15
  try:
16
+ parser = PlaintextParser.from_string(text, Tokenizer("korean"))
17
  summarizer = TextRankSummarizer()
18
  sents = [str(s) for s in summarizer(parser.document, n_sentences)]
19
  summary = " ".join(sents).strip()
 
21
  except Exception:
22
  return text[:500]
23
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
24
  # ===== URL 처리 =====
25
+ def process_url(url: str):
 
26
  # 1) HTML 파싱
27
+ r = requests.get(url, headers={"User-Agent": "Mozilla/5.0"}, timeout=10)
28
  r.raise_for_status()
29
  plain = trafilatura.extract(
30
  r.text,
 
42
  )
43
  md_preview = md(html or r.text, heading_style="ATX")
44
 
45
+ # 2) 자동요약
46
  auto_sum = auto_summarize(plain, n_sentences=3)
47
 
48
+ # 3) 결과 반환
 
 
 
49
  link_html = f'<a href="{url}" target="_blank">원문 보기</a>'
50
+ return link_html + "<br><br>" + md_preview, auto_sum
 
 
 
 
51
 
52
  # ===== Gradio UI =====
 
53
  iface = gr.Interface(
54
  fn=process_url,
55
  inputs=[
56
+ gr.Textbox(label="URL 입력", placeholder="https://n.news.naver.com/...")
 
 
 
 
 
57
  ],
58
  outputs=[
59
  gr.HTML(label="원문 링크 + λ³Έλ¬Έ 미리보기"),
60
+ gr.Textbox(label="자동요약", lines=4)
 
61
  ],
62
+ title="HTML 파싱 → 자동요약",
63
+ description="HTML에서 본문을 추출하고 TextRank로 자동요약만 수행합니다."
64
  )
65
 
66
  if __name__ == "__main__":