orgoflu committed on
Commit
eebc78a
·
verified ·
1 Parent(s): 574b3b3

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +35 -36
app.py CHANGED
@@ -4,15 +4,12 @@ nltk.download("punkt")
4
  import gradio as gr
5
  import trafilatura, requests, re
6
  from markdownify import markdownify as md
7
- from sumy.parsers.plaintext import PlaintextParser
8
- from sumy.nlp.tokenizers import Tokenizer
9
- from sumy.summarizers.text_rank import TextRankSummarizer
10
  from transformers import AutoModelForCausalLM, AutoTokenizer, pipeline
11
 
12
  # ===== λͺ¨λΈ λͺ©λ‘ =====
13
  MODEL_OPTIONS = {
14
  "Qwen2.5-1.5B-Instruct": "Qwen/Qwen2.5-1.5B-Instruct",
15
- "CLOVA-Text(λŒ€μ²΄)": "skt/kogpt2-base-v2"
16
  }
17
 
18
  # ===== ν…μŠ€νŠΈ λͺ¨λΈ λ‘œλ“œ =====
@@ -23,38 +20,41 @@ def load_text_model(model_choice):
23
  return pipeline("text-generation", model=model, tokenizer=tokenizer, device=-1)
24
 
25
  # ===== ν…μŠ€νŠΈ μ „μ²˜λ¦¬ =====
26
- def clean_text(text):
27
  return re.sub(r'\s+', ' ', text).strip()
28
 
29
- # ===== μžλ™ μš”μ•½ =====
30
- def summarize_text(text):
31
  text = clean_text(text)
32
- length = len(text)
33
- sentence_count = 1 if length < 300 else 2 if length < 800 else 3 if length < 1500 else 4
34
- try:
35
- parser = PlaintextParser.from_string(text, Tokenizer("korean"))
36
- if not parser.document.sentences: raise ValueError
37
- except:
38
- try:
39
- parser = PlaintextParser.from_string(text, Tokenizer("english"))
40
- if not parser.document.sentences: raise ValueError
41
- except:
42
- return re.split(r'(?<=[.!?])\s+', text)[:sentence_count]
43
- summarizer = TextRankSummarizer()
44
- return [str(s) for s in summarizer(parser.document, sentence_count)]
 
 
 
45
 
46
  # ===== μž¬μž‘μ„± =====
47
- def rewrite_with_llm(sentences, model_choice):
48
- llm_pipeline = load_text_model(model_choice)
49
- joined_text = "\n".join(sentences)
50
  prompt = f"""λ‹€μŒ λ¬Έμž₯을 μ˜λ―ΈλŠ” μœ μ§€ν•˜λ˜, 원문에 μ—†λŠ” λ‚΄μš©μ€ μ ˆλŒ€ μΆ”κ°€ν•˜μ§€ 말고,
51
- λ¬Έμž₯만 더 μžμ—°μŠ€λŸ½κ²Œ λ°”κΏ”μ£Όμ„Έμš”. λ‹€λ₯Έ μ„€λͺ…μ΄λ‚˜ λΆ€μ—° λ¬Έμž₯은 μ“°μ§€ λ§ˆμ„Έμš”.
52
 
53
  λ¬Έμž₯:
54
- {joined_text}
55
  """
56
- result = llm_pipeline(prompt, max_new_tokens=150, do_sample=False, temperature=0)
57
- return result[0]["generated_text"].replace(prompt, "").strip()
 
58
 
59
  # ===== URL 처리 =====
60
  def process_url(url, model_choice):
@@ -62,23 +62,22 @@ def process_url(url, model_choice):
62
  r = requests.get(url, headers={"User-Agent": "Mozilla/5.0"}, timeout=10)
63
  r.raise_for_status()
64
 
65
- # 원문 순수 ν…μŠ€νŠΈ μΆ”μΆœ (μš”μ•½μš©)
66
  plain_text = trafilatura.extract(r.text, output_format="txt", include_tables=False, favor_recall=True) or ""
67
- # HTML β†’ λ§ˆν¬λ‹€μš΄ (좜λ ₯용)
68
  html_content = trafilatura.extract(r.text, output_format="html", include_tables=False, favor_recall=True)
69
  markdown_text = md(html_content or r.text, heading_style="ATX")
70
 
71
- # 첫 쀄 β†’ 툴팁
72
  first_line = plain_text.strip().split("\n")[0].strip()
73
  link_html = f'<a href="{url}" title="{first_line}" target="_blank">원문 보기</a>'
74
 
75
- # μš”μ•½
76
- summary_sentences = summarize_text(plain_text) or ["μš”μ•½ μ—†μŒ"]
77
 
78
  # μž¬μž‘μ„±
79
- paraphrased_text = rewrite_with_llm(summary_sentences, model_choice)
80
 
81
- return link_html + "<br><br>" + markdown_text, "\n".join(summary_sentences), paraphrased_text
82
  except Exception as e:
83
  return f"μ—λŸ¬ λ°œμƒ: {e}", "μš”μ•½ μ—†μŒ", "μž¬μž‘μ„± μ—†μŒ"
84
 
@@ -94,8 +93,8 @@ iface = gr.Interface(
94
  gr.Textbox(label="μžλ™ μš”μ•½", lines=5),
95
  gr.Textbox(label="μžλ™ μž¬μž‘μ„± (LLM)", lines=5)
96
  ],
97
- title="ν•œκ΅­μ–΄ λ³Έλ¬Έ μΆ”μΆœ + μžλ™ μš”μ•½ + LLM μž¬μž‘μ„±",
98
- description="원문 ν…μŠ€νŠΈμ—μ„œ λ°”λ‘œ μš”μ•½ ν›„, μ„ νƒν•œ λͺ¨λΈ(Qwen λ˜λŠ” KoGPT2)둜 μž¬μž‘μ„±ν•©λ‹ˆλ‹€."
99
  )
100
 
101
  if __name__ == "__main__":
 
4
  import gradio as gr
5
  import trafilatura, requests, re
6
  from markdownify import markdownify as md
 
 
 
7
  from transformers import AutoModelForCausalLM, AutoTokenizer, pipeline
8
 
9
  # ===== λͺ¨λΈ λͺ©λ‘ =====
10
  MODEL_OPTIONS = {
11
  "Qwen2.5-1.5B-Instruct": "Qwen/Qwen2.5-1.5B-Instruct",
12
+ "CLOVA-Text(λŒ€μ²΄)": "skt/kogpt2-base-v2" # ν—ˆκ°€ 없이 μ‚¬μš© κ°€λŠ₯
13
  }
14
 
15
  # ===== ν…μŠ€νŠΈ λͺ¨λΈ λ‘œλ“œ =====
 
20
  return pipeline("text-generation", model=model, tokenizer=tokenizer, device=-1)
21
 
22
  # ===== ν…μŠ€νŠΈ μ „μ²˜λ¦¬ =====
23
def clean_text(text):
    """Normalize whitespace: collapse every run of whitespace to one space, trim ends."""
    collapsed = re.sub(r"\s+", " ", text)
    return collapsed.strip()
25
 
26
+ # ===== ν…μŠ€νŠΈ λΆ„ν•  =====
27
def chunk_text(text, chunk_size=500):
    """Split the whitespace-normalized text into fixed-size character chunks.

    The final chunk may be shorter than *chunk_size*; empty input yields [].
    """
    normalized = clean_text(text)
    chunks = []
    for start in range(0, len(normalized), chunk_size):
        chunks.append(normalized[start:start + chunk_size])
    return chunks
30
+
31
+ # ===== LLM μš”μ•½ =====
32
def llm_summary(text, model_choice):
    """Summarize *text* with the selected causal LM.

    Args:
        text: Plain text to summarize.
        model_choice: Key into MODEL_OPTIONS naming the model to load.

    Returns:
        The generated summary string with the echoed prompt removed.
    """
    llm = load_text_model(model_choice)
    prompt = f"λ‹€μŒ 글을 3λ¬Έμž₯ μ΄λ‚΄λ‘œ μš”μ•½:\n{text}"
    # do_sample=False means greedy decoding; passing temperature alongside it
    # is contradictory (the value is ignored and transformers emits a warning),
    # so the temperature kwarg was dropped.
    out = llm(prompt, max_new_tokens=150, do_sample=False,
              repetition_penalty=1.2, no_repeat_ngram_size=3)
    generated = out[0]["generated_text"]
    # Strip the prompt only as a leading prefix. The previous
    # `.replace(prompt, "")` deleted *every* occurrence of the prompt text,
    # which could also mangle a generation that happens to repeat it.
    if generated.startswith(prompt):
        generated = generated[len(prompt):]
    return generated.strip()
38
+
39
+ # ===== λΆ„ν•  μš”μ•½ β†’ 톡합 μš”μ•½ =====
40
def multi_stage_summary(text, model_choice):
    """Map-reduce summarization: summarize each chunk, then summarize the merge.

    Args:
        text: Full plain text (may be long).
        model_choice: Key into MODEL_OPTIONS naming the model to use.

    Returns:
        A single summary string; empty string for empty input.
    """
    chunks = chunk_text(text)
    # Empty input: don't prompt the model with an empty string.
    if not chunks:
        return ""
    # A single chunk needs only one pass; the old code summarized the
    # summary a second time, degrading quality and doubling latency.
    if len(chunks) == 1:
        return llm_summary(chunks[0], model_choice)
    partial_summaries = [llm_summary(chunk, model_choice) for chunk in chunks]
    combined_summary = " ".join(partial_summaries)
    return llm_summary(combined_summary, model_choice)
45
 
46
  # ===== μž¬μž‘μ„± =====
47
def rewrite_with_llm(text, model_choice):
    """Paraphrase *text* with the selected LM, preserving its meaning.

    Args:
        text: The text (typically a summary) to rewrite.
        model_choice: Key into MODEL_OPTIONS naming the model to load.

    Returns:
        The rewritten text with the echoed prompt removed.
    """
    llm = load_text_model(model_choice)
    prompt = f"""λ‹€μŒ λ¬Έμž₯을 μ˜λ―ΈλŠ” μœ μ§€ν•˜λ˜, 원문에 μ—†λŠ” λ‚΄μš©μ€ μ ˆλŒ€ μΆ”κ°€ν•˜μ§€ 말고,
반볡 없이 κ°„κ²°ν•˜κ³  λ§€λ„λŸ½κ²Œ λ°”κΏ”μ£Όμ„Έμš”.

λ¬Έμž₯:
{text}
"""
    # Greedy decoding: temperature has no effect when do_sample=False and
    # only triggers a transformers warning, so it was removed.
    out = llm(prompt, max_new_tokens=200, do_sample=False,
              repetition_penalty=1.2, no_repeat_ngram_size=3)
    generated = out[0]["generated_text"]
    # Remove the prompt only when it is the leading prefix of the output;
    # a blanket `.replace(prompt, "")` also deletes later repetitions.
    if generated.startswith(prompt):
        generated = generated[len(prompt):]
    return generated.strip()
58
 
59
  # ===== URL 처리 =====
60
  def process_url(url, model_choice):
 
62
  r = requests.get(url, headers={"User-Agent": "Mozilla/5.0"}, timeout=10)
63
  r.raise_for_status()
64
 
65
+ # 원문 μΆ”μΆœ
66
  plain_text = trafilatura.extract(r.text, output_format="txt", include_tables=False, favor_recall=True) or ""
 
67
  html_content = trafilatura.extract(r.text, output_format="html", include_tables=False, favor_recall=True)
68
  markdown_text = md(html_content or r.text, heading_style="ATX")
69
 
70
+ # 첫 쀄 툴팁
71
  first_line = plain_text.strip().split("\n")[0].strip()
72
  link_html = f'<a href="{url}" title="{first_line}" target="_blank">원문 보기</a>'
73
 
74
+ # λΆ„ν•  μš”μ•½ β†’ 톡합 μš”μ•½
75
+ final_summary = multi_stage_summary(plain_text, model_choice)
76
 
77
  # μž¬μž‘μ„±
78
+ paraphrased_text = rewrite_with_llm(final_summary, model_choice)
79
 
80
+ return link_html + "<br><br>" + markdown_text, final_summary, paraphrased_text
81
  except Exception as e:
82
  return f"μ—λŸ¬ λ°œμƒ: {e}", "μš”μ•½ μ—†μŒ", "μž¬μž‘μ„± μ—†μŒ"
83
 
 
93
  gr.Textbox(label="μžλ™ μš”μ•½", lines=5),
94
  gr.Textbox(label="μžλ™ μž¬μž‘μ„± (LLM)", lines=5)
95
  ],
96
+ title="ν•œκ΅­μ–΄ λ³Έλ¬Έ μΆ”μΆœ + λΆ„ν•  μš”μ•½ + LLM μž¬μž‘μ„±",
97
+ description="κΈ΄ 원문도 λΆ„ν•  μš”μ•½ ν›„ 톡합 μž¬μž‘μ„±μœΌλ‘œ ν’ˆμ§ˆ μœ μ§€"
98
  )
99
 
100
  if __name__ == "__main__":