orgoflu committed on
Commit
629d00c
Β·
verified Β·
1 Parent(s): 30f4683

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +30 -23
app.py CHANGED
@@ -1,6 +1,6 @@
1
  # app.py
2
- # HTML νŒŒμ‹± β†’ μžλ™μš”μ•½(TextRank) β†’ LLM μž¬μž‘μ„±
3
- # λͺ¨λΈ: Qwen2.5-1.5B-Instruct, skt/kogpt2-base-v2
4
 
5
  import requests
6
  import trafilatura
@@ -19,33 +19,38 @@ MODEL_OPTIONS = {
19
  "Qwen2.5-1.5B-Instruct": "Qwen/Qwen2.5-1.5B-Instruct",
20
  "CLOVA-Text(λŒ€μ²΄)": "skt/kogpt2-base-v2"
21
  }
22
-
23
  _PIPELINES = {}
24
def load_llm(model_choice: str):
    """Return a cached text-generation pipeline for the chosen model.

    The tokenizer and model are downloaded on first use and the resulting
    pipeline is memoized in the module-level _PIPELINES dict, so repeated
    requests for the same model are free.
    """
    cached = _PIPELINES.get(model_choice)
    if cached is not None:
        return cached
    model_name = MODEL_OPTIONS[model_choice]
    generator = pipeline(
        "text-generation",
        model=AutoModelForCausalLM.from_pretrained(model_name),
        tokenizer=AutoTokenizer.from_pretrained(model_name),
        device=-1,  # CPU-only inference
    )
    _PIPELINES[model_choice] = generator
    return generator
33
 
34
  # ===== μžλ™μš”μ•½(TextRank) =====
35
 
36
def auto_summarize(text: str, sentences_count: int = 3) -> str:
    """Extractive summary of *text* using Sumy's TextRank.

    Args:
        text: Plain text to summarize.
        sentences_count: Number of top-ranked sentences to keep.

    Returns:
        The selected sentences joined by spaces, or the first 500
        characters of *text* as a fallback when summarization fails
        (e.g. missing Korean tokenizer data at runtime) or produces
        an empty result (input too short for TextRank to rank).
    """
    try:
        parser = PlaintextParser.from_string(text, Tokenizer("korean"))
        summarizer = TextRankSummarizer()
        sentences = [str(s) for s in summarizer(parser.document, sentences_count)]
        summary = " ".join(sentences).strip()
        # Very short inputs can yield no ranked sentences; degrade to a prefix.
        return summary or text[:500]
    except Exception:
        # Sumy/NLTK may lack language data in the deployment environment;
        # fall back gracefully instead of crashing the Gradio request.
        return text[:500]
 
 
 
 
 
 
 
 
41
 
42
  # ===== LLM μž¬μž‘μ„± =====
43
 
44
  def rewrite_with_llm(summary: str, model_choice: str) -> str:
45
  llm = load_llm(model_choice)
46
- prompt = f"""λ‹€μŒ μš”μ•½λ¬Έμ„ 더 κ°„κ²°ν•˜κ³  λ§€λ„λŸ½κ²Œ 닀듬어라:
47
- {summary}
48
- """
49
  out = llm(
50
  prompt,
51
  max_new_tokens=150,
@@ -78,13 +83,13 @@ def process_url(url: str, model_choice: str):
78
  )
79
  md_preview = md(html or r.text, heading_style="ATX")
80
 
81
- # 2) μžλ™μš”μ•½(TextRank)
82
- auto_sum = auto_summarize(plain, sentences_count=3)
83
 
84
  # 3) LLM μž¬μž‘μ„±
85
  final = rewrite_with_llm(auto_sum, model_choice)
86
 
87
- # 4) κ²°κ³Ό λ°˜ν™˜
88
  link_html = f'<a href="{url}" target="_blank">원문 보기</a>'
89
  return (
90
  link_html + "<br><br>" + md_preview,
@@ -98,17 +103,19 @@ iface = gr.Interface(
98
  fn=process_url,
99
  inputs=[
100
  gr.Textbox(label="URL μž…λ ₯", placeholder="https://n.news.naver.com/..."),
101
- gr.Dropdown(choices=list(MODEL_OPTIONS.keys()),
102
- value="Qwen2.5-1.5B-Instruct",
103
- label="λͺ¨λΈ 선택")
 
 
104
  ],
105
  outputs=[
106
  gr.HTML(label="원문 링크 + λ³Έλ¬Έ 미리보기"),
107
- gr.Textbox(label="μžλ™μš”μ•½ (3λ¬Έμž₯ TextRank)", lines=4),
108
  gr.Textbox(label="LLM μž¬μž‘μ„±", lines=4)
109
  ],
110
  title="HTML νŒŒμ‹± β†’ μžλ™μš”μ•½ β†’ LLM μž¬μž‘μ„±",
111
- description="κΈ΄ 글은 TextRank둜 λ¨Όμ € μš”μ•½ν•œ λ’€, Qwen/KoGPT2둜 κΉ”λ”ν•˜κ²Œ λ‹€λ“¬μŠ΅λ‹ˆλ‹€."
112
  )
113
 
114
  if __name__ == "__main__":
 
1
  # app.py
2
+ # HTML νŒŒμ‹± β†’ μžλ™μš”μ•½(TextRank, κ°€λ“œ) β†’ LLM μž¬μž‘μ„±
3
+ # Qwen2.5-1.5B-Instruct, skt/kogpt2-base-v2
4
 
5
  import requests
6
  import trafilatura
 
19
  "Qwen2.5-1.5B-Instruct": "Qwen/Qwen2.5-1.5B-Instruct",
20
  "CLOVA-Text(λŒ€μ²΄)": "skt/kogpt2-base-v2"
21
  }
 
22
  _PIPELINES = {}
23
def load_llm(model_choice: str):
    """Fetch (or lazily build and cache) the generation pipeline for *model_choice*."""
    try:
        return _PIPELINES[model_choice]
    except KeyError:
        pass  # first request for this model: build it below
    model_id = MODEL_OPTIONS[model_choice]
    tokenizer = AutoTokenizer.from_pretrained(model_id)
    model = AutoModelForCausalLM.from_pretrained(model_id)
    text_gen = pipeline(
        "text-generation", model=model, tokenizer=tokenizer, device=-1
    )
    _PIPELINES[model_choice] = text_gen
    return text_gen
32
 
33
  # ===== μžλ™μš”μ•½(TextRank) =====
34
 
35
def auto_summarize(text: str, n_sentences: int = 3) -> str:
    """Extractive TextRank summary with a 500-character prefix fallback.

    Runs Sumy's TextRank over *text* and joins the top *n_sentences*
    sentences. Any failure inside Sumy (e.g. missing tokenizer data,
    unparsable input) or an empty summary falls back to the first 500
    characters of *text*.
    """
    fallback = text[:500]
    try:
        document = PlaintextParser.from_string(text, Tokenizer("korean")).document
        ranked = TextRankSummarizer()(document, n_sentences)
        joined = " ".join(str(sentence) for sentence in ranked).strip()
    except Exception:
        return fallback
    return joined if joined else fallback
48
 
49
  # ===== LLM μž¬μž‘μ„± =====
50
 
51
  def rewrite_with_llm(summary: str, model_choice: str) -> str:
52
  llm = load_llm(model_choice)
53
+ prompt = f"λ‹€μŒ μš”μ•½λ¬Έμ„ 더 κ°„κ²°ν•˜κ³  λ§€λ„λŸ½κ²Œ 닀듬어라:\n{summary}\n"
 
 
54
  out = llm(
55
  prompt,
56
  max_new_tokens=150,
 
83
  )
84
  md_preview = md(html or r.text, heading_style="ATX")
85
 
86
+ # 2) μžλ™μš”μ•½(TextRank, 폴백 포함)
87
+ auto_sum = auto_summarize(plain, n_sentences=3)
88
 
89
  # 3) LLM μž¬μž‘μ„±
90
  final = rewrite_with_llm(auto_sum, model_choice)
91
 
92
+ # 4) κ²°κ³Ό 리턴
93
  link_html = f'<a href="{url}" target="_blank">원문 보기</a>'
94
  return (
95
  link_html + "<br><br>" + md_preview,
 
103
  fn=process_url,
104
  inputs=[
105
  gr.Textbox(label="URL μž…λ ₯", placeholder="https://n.news.naver.com/..."),
106
+ gr.Dropdown(
107
+ choices=list(MODEL_OPTIONS.keys()),
108
+ value="Qwen2.5-1.5B-Instruct",
109
+ label="λͺ¨λΈ 선택"
110
+ )
111
  ],
112
  outputs=[
113
  gr.HTML(label="원문 링크 + λ³Έλ¬Έ 미리보기"),
114
+ gr.Textbox(label="μžλ™μš”μ•½", lines=4),
115
  gr.Textbox(label="LLM μž¬μž‘μ„±", lines=4)
116
  ],
117
  title="HTML νŒŒμ‹± β†’ μžλ™μš”μ•½ β†’ LLM μž¬μž‘μ„±",
118
+ description="TextRank μžλ™μš”μ•½ ν›„ Qwen/KoGPT2둜 λ‹€λ“¬μŠ΅λ‹ˆλ‹€. μš”μ•½ λ‹¨κ³„μ—μ„œ μ—λŸ¬ λ°œμƒ μ‹œ μ•ž 500자 폴백."
119
  )
120
 
121
  if __name__ == "__main__":