Spaces:
Sleeping
Sleeping
Update app.py
Browse files
app.py
CHANGED
|
@@ -4,15 +4,12 @@ nltk.download("punkt")
|
|
| 4 |
import gradio as gr
|
| 5 |
import trafilatura, requests, re
|
| 6 |
from markdownify import markdownify as md
|
| 7 |
-
from sumy.parsers.plaintext import PlaintextParser
|
| 8 |
-
from sumy.nlp.tokenizers import Tokenizer
|
| 9 |
-
from sumy.summarizers.text_rank import TextRankSummarizer
|
| 10 |
from transformers import AutoModelForCausalLM, AutoTokenizer, pipeline
|
| 11 |
|
| 12 |
# ===== Model list =====
|
| 13 |
# Selectable models: display label -> model identifier.
# NOTE(review): the values look like Hugging Face Hub repo ids and are
# presumably consumed by load_text_model -- confirm against its body.
MODEL_OPTIONS = {
    "Qwen2.5-1.5B-Instruct": "Qwen/Qwen2.5-1.5B-Instruct",
    "CLOVA-Text(λ체)": "skt/kogpt2-base-v2"
}
|
| 17 |
|
| 18 |
# ===== Load text model =====
|
|
@@ -23,38 +20,41 @@ def load_text_model(model_choice):
|
|
| 23 |
return pipeline("text-generation", model=model, tokenizer=tokenizer, device=-1)
|
| 24 |
|
| 25 |
# ===== Text preprocessing =====
|
| 26 |
-
def clean_text(text):
    """Normalize whitespace in *text*.

    Each run of whitespace characters (spaces, tabs, newlines, ...) is
    replaced by one space, then leading and trailing whitespace is removed.
    """
    single_spaced = re.sub(r'\s+', ' ', text)
    return single_spaced.strip()
|
| 28 |
|
| 29 |
-
# =====
|
| 30 |
-
def
|
| 31 |
text = clean_text(text)
|
| 32 |
-
|
| 33 |
-
|
| 34 |
-
|
| 35 |
-
|
| 36 |
-
|
| 37 |
-
|
| 38 |
-
|
| 39 |
-
|
| 40 |
-
|
| 41 |
-
|
| 42 |
-
|
| 43 |
-
|
| 44 |
-
|
|
|
|
|
|
|
|
|
|
| 45 |
|
| 46 |
# ===== Rewrite =====
|
| 47 |
-
def rewrite_with_llm(
|
| 48 |
-
|
| 49 |
-
joined_text = "\n".join(sentences)
|
| 50 |
prompt = f"""λ€μ λ¬Έμ₯μ μλ―Έλ μ μ§νλ, μλ¬Έμ μλ λ΄μ©μ μ λ μΆκ°νμ§ λ§κ³ ,
|
| 51 |
-
|
| 52 |
|
| 53 |
λ¬Έμ₯:
|
| 54 |
-
{
|
| 55 |
"""
|
| 56 |
-
|
| 57 |
-
|
|
|
|
| 58 |
|
| 59 |
# ===== URL processing =====
|
| 60 |
def process_url(url, model_choice):
|
|
@@ -62,23 +62,22 @@ def process_url(url, model_choice):
|
|
| 62 |
r = requests.get(url, headers={"User-Agent": "Mozilla/5.0"}, timeout=10)
|
| 63 |
r.raise_for_status()
|
| 64 |
|
| 65 |
-
# μλ¬Έ
|
| 66 |
plain_text = trafilatura.extract(r.text, output_format="txt", include_tables=False, favor_recall=True) or ""
|
| 67 |
-
# HTML β λ§ν¬λ€μ΄ (μΆλ ₯μ©)
|
| 68 |
html_content = trafilatura.extract(r.text, output_format="html", include_tables=False, favor_recall=True)
|
| 69 |
markdown_text = md(html_content or r.text, heading_style="ATX")
|
| 70 |
|
| 71 |
-
# 첫 μ€
|
| 72 |
first_line = plain_text.strip().split("\n")[0].strip()
|
| 73 |
link_html = f'<a href="{url}" title="{first_line}" target="_blank">μλ¬Έ 보기</a>'
|
| 74 |
|
| 75 |
-
# μμ½
|
| 76 |
-
|
| 77 |
|
| 78 |
# μ¬μμ±
|
| 79 |
-
paraphrased_text = rewrite_with_llm(
|
| 80 |
|
| 81 |
-
return link_html + "<br><br>" + markdown_text,
|
| 82 |
except Exception as e:
|
| 83 |
return f"μλ¬ λ°μ: {e}", "μμ½ μμ", "μ¬μμ± μμ"
|
| 84 |
|
|
@@ -94,8 +93,8 @@ iface = gr.Interface(
|
|
| 94 |
gr.Textbox(label="μλ μμ½", lines=5),
|
| 95 |
gr.Textbox(label="μλ μ¬μμ± (LLM)", lines=5)
|
| 96 |
],
|
| 97 |
-
title="νκ΅μ΄ λ³Έλ¬Έ μΆμΆ +
|
| 98 |
-
description="μλ¬Έ
|
| 99 |
)
|
| 100 |
|
| 101 |
if __name__ == "__main__":
|
|
|
|
| 4 |
import gradio as gr
|
| 5 |
import trafilatura, requests, re
|
| 6 |
from markdownify import markdownify as md
|
|
|
|
|
|
|
|
|
|
| 7 |
from transformers import AutoModelForCausalLM, AutoTokenizer, pipeline
|
| 8 |
|
| 9 |
# ===== Model list =====
|
| 10 |
# Selectable models: display label -> model identifier.
# NOTE(review): the values look like Hugging Face Hub repo ids and are
# presumably consumed by load_text_model -- confirm against its body.
MODEL_OPTIONS = {
    "Qwen2.5-1.5B-Instruct": "Qwen/Qwen2.5-1.5B-Instruct",
    "CLOVA-Text(λ체)": "skt/kogpt2-base-v2" # usable without requesting access approval
}
|
| 14 |
|
| 15 |
# ===== Load text model =====
|
|
|
|
| 20 |
return pipeline("text-generation", model=model, tokenizer=tokenizer, device=-1)
|
| 21 |
|
| 22 |
# ===== Text preprocessing =====
|
| 23 |
+
def clean_text(text):
    """Normalize whitespace in *text*.

    Each run of whitespace characters (spaces, tabs, newlines, ...) is
    replaced by one space, then leading and trailing whitespace is removed.
    """
    single_spaced = re.sub(r'\s+', ' ', text)
    return single_spaced.strip()
|
| 25 |
|
| 26 |
+
# ===== Text chunking =====
|
| 27 |
+
def chunk_text(text, chunk_size=500):
    """Split *text* into consecutive pieces of at most *chunk_size* characters.

    The text is first whitespace-normalized with ``clean_text``. Pieces are
    cut purely by character count, so a piece may end in the middle of a
    word or sentence. Text that normalizes to an empty string yields ``[]``.
    """
    normalized = clean_text(text)
    pieces = []
    for start in range(0, len(normalized), chunk_size):
        pieces.append(normalized[start:start + chunk_size])
    return pieces
|
| 30 |
+
|
| 31 |
+
# ===== LLM summary =====
|
| 32 |
+
def llm_summary(text, model_choice):
    """Summarize *text* in at most three sentences with the selected model.

    Args:
        text: Text to summarize.
        model_choice: Model selector, passed through to ``load_text_model``.

    Returns:
        The generated summary with surrounding whitespace stripped, or an
        empty string when *text* is empty or whitespace-only.
    """
    if not text or not text.strip():
        # Nothing to summarize: skip the model load and avoid generating
        # text from a bare instruction with no content.
        return ""

    llm = load_text_model(model_choice)
    # Instruction (Korean): "Summarize the following text in 3 sentences or fewer".
    prompt = f"다음 글을 3문장 이내로 요약:\n{text}"
    out = llm(
        prompt,
        max_new_tokens=150,
        # Greedy decoding; `temperature` only applies when sampling, so it is
        # not passed here.
        do_sample=False,
        repetition_penalty=1.2,
        no_repeat_ngram_size=3,
        # Return only the continuation instead of stripping the prompt with
        # str.replace, which fails whenever the echoed prompt is not an
        # exact copy of the input.
        return_full_text=False,
    )
    return out[0]["generated_text"].strip()
|
| 38 |
+
|
| 39 |
+
# ===== Chunked summaries -> combined summary =====
|
| 40 |
+
def multi_stage_summary(text, model_choice):
    """Summarize long text in two stages: per chunk, then over the chunk summaries.

    Args:
        text: Full text to summarize.
        model_choice: Model selector, passed through to ``llm_summary``.

    Returns:
        The final summary. Empty or whitespace-only *text* yields an empty
        string without invoking the model. Text that fits in a single chunk
        is summarized once rather than summarized and then re-summarized.
    """
    if not text or not text.strip():
        # No content: without this guard the second stage would run the
        # model on an empty combined summary.
        return ""

    chunks = chunk_text(text)
    partial_summaries = [llm_summary(chunk, model_choice) for chunk in chunks]
    if len(partial_summaries) == 1:
        # A lone chunk summary is already the final answer; a second pass
        # would only summarize the summary.
        return partial_summaries[0]

    combined_summary = " ".join(partial_summaries)
    return llm_summary(combined_summary, model_choice)
|
| 45 |
|
| 46 |
# ===== Rewrite =====
|
| 47 |
+
def rewrite_with_llm(text, model_choice):
    """Paraphrase *text* with the selected model without adding new content.

    Args:
        text: Text to rewrite.
        model_choice: Model selector, passed through to ``load_text_model``.

    Returns:
        The rewritten text with surrounding whitespace stripped, or an empty
        string when *text* is empty or whitespace-only.
    """
    if not text or not text.strip():
        # Nothing to rewrite: skip the model load and avoid generating text
        # from the instruction alone.
        return ""

    llm = load_text_model(model_choice)
    # Instruction (Korean): "Keep the meaning of the following sentences, never
    # add anything that is not in the original, and rewrite them concisely and
    # smoothly without repetition." followed by "Sentences:" and the text.
    prompt = f"""다음 문장을 의미는 유지하되, 원문에 없는 내용은 절대 추가하지 말고,
반복 없이 간결하고 매끄럽게 바꿔주세요.

문장:
{text}
"""
    out = llm(
        prompt,
        max_new_tokens=200,
        # Greedy decoding; `temperature` only applies when sampling, so it is
        # not passed here.
        do_sample=False,
        repetition_penalty=1.2,
        no_repeat_ngram_size=3,
        # Return only the continuation instead of stripping the prompt with
        # str.replace, which fails whenever the echoed prompt is not an
        # exact copy of the input.
        return_full_text=False,
    )
    return out[0]["generated_text"].strip()
|
| 58 |
|
| 59 |
# ===== URL processing =====
|
| 60 |
def process_url(url, model_choice):
|
|
|
|
| 62 |
r = requests.get(url, headers={"User-Agent": "Mozilla/5.0"}, timeout=10)
|
| 63 |
r.raise_for_status()
|
| 64 |
|
| 65 |
+
# μλ¬Έ μΆμΆ
|
| 66 |
plain_text = trafilatura.extract(r.text, output_format="txt", include_tables=False, favor_recall=True) or ""
|
|
|
|
| 67 |
html_content = trafilatura.extract(r.text, output_format="html", include_tables=False, favor_recall=True)
|
| 68 |
markdown_text = md(html_content or r.text, heading_style="ATX")
|
| 69 |
|
| 70 |
+
# 첫 μ€ ν΄ν
|
| 71 |
first_line = plain_text.strip().split("\n")[0].strip()
|
| 72 |
link_html = f'<a href="{url}" title="{first_line}" target="_blank">μλ¬Έ 보기</a>'
|
| 73 |
|
| 74 |
+
# λΆν μμ½ β ν΅ν© μμ½
|
| 75 |
+
final_summary = multi_stage_summary(plain_text, model_choice)
|
| 76 |
|
| 77 |
# μ¬μμ±
|
| 78 |
+
paraphrased_text = rewrite_with_llm(final_summary, model_choice)
|
| 79 |
|
| 80 |
+
return link_html + "<br><br>" + markdown_text, final_summary, paraphrased_text
|
| 81 |
except Exception as e:
|
| 82 |
return f"μλ¬ λ°μ: {e}", "μμ½ μμ", "μ¬μμ± μμ"
|
| 83 |
|
|
|
|
| 93 |
gr.Textbox(label="μλ μμ½", lines=5),
|
| 94 |
gr.Textbox(label="μλ μ¬μμ± (LLM)", lines=5)
|
| 95 |
],
|
| 96 |
+
title="νκ΅μ΄ λ³Έλ¬Έ μΆμΆ + λΆν μμ½ + LLM μ¬μμ±",
|
| 97 |
+
description="κΈ΄ μλ¬Έλ λΆν μμ½ ν ν΅ν© μ¬μμ±μΌλ‘ νμ§ μ μ§"
|
| 98 |
)
|
| 99 |
|
| 100 |
if __name__ == "__main__":
|