Spaces:
Sleeping
Sleeping
Update app.py
Browse files
app.py
CHANGED
|
@@ -12,18 +12,22 @@ import re
|
|
| 12 |
import torch
|
| 13 |
from transformers import AutoModelForCausalLM, AutoTokenizer, pipeline
|
| 14 |
|
| 15 |
-
# =====
|
| 16 |
-
|
| 17 |
-
|
| 18 |
-
|
| 19 |
-
|
| 20 |
-
|
| 21 |
-
|
| 22 |
-
|
| 23 |
-
|
| 24 |
-
|
| 25 |
-
|
| 26 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
| 27 |
|
| 28 |
# ===== ์ ํธ =====
|
| 29 |
def clean_text(text: str) -> str:
|
|
@@ -71,21 +75,23 @@ def summarize_text(text):
|
|
| 71 |
summary_list.sort(key=lambda s: text.find(s))
|
| 72 |
return summary_list
|
| 73 |
|
| 74 |
-
# ===== LLM ์๋์ฌ์์ฑ
|
| 75 |
-
def rewrite_with_llm(sentences):
|
|
|
|
|
|
|
|
|
|
| 76 |
joined_text = "\n".join(sentences)
|
| 77 |
-
prompt = f"""๋ค์ ๋ฌธ์ฅ์ ์๋ฏธ๋ ์ ์งํ๋,
|
| 78 |
-
|
| 79 |
|
| 80 |
๋ฌธ์ฅ:
|
| 81 |
{joined_text}
|
| 82 |
"""
|
| 83 |
-
result = llm_pipeline(prompt, max_new_tokens=
|
| 84 |
-
# ํ๋กฌํํธ ๋ถ๋ถ ์ ๊ฑฐ ํ ์๋ ๊ณต๋ฐฑ ์ ๊ฑฐ
|
| 85 |
return result[0]["generated_text"].replace(prompt, "").strip()
|
| 86 |
|
| 87 |
# ===== ์ ์ฒด ํ์ดํ๋ผ์ธ =====
|
| 88 |
-
def extract_summarize_paraphrase(url):
|
| 89 |
headers = {"User-Agent": "Mozilla/5.0"}
|
| 90 |
try:
|
| 91 |
r = requests.get(url, headers=headers, timeout=10)
|
|
@@ -107,7 +113,7 @@ def extract_summarize_paraphrase(url):
|
|
| 107 |
if not summary_sentences:
|
| 108 |
summary_sentences = ["์์ฝ ์์"]
|
| 109 |
|
| 110 |
-
paraphrased_text = rewrite_with_llm(summary_sentences)
|
| 111 |
|
| 112 |
return (
|
| 113 |
markdown_text or "๋ณธ๋ฌธ ์์",
|
|
@@ -121,14 +127,17 @@ def extract_summarize_paraphrase(url):
|
|
| 121 |
# ===== Gradio UI =====
|
| 122 |
iface = gr.Interface(
|
| 123 |
fn=extract_summarize_paraphrase,
|
| 124 |
-
inputs=
|
|
|
|
|
|
|
|
|
|
| 125 |
outputs=[
|
| 126 |
gr.Markdown(label="์ถ์ถ๋ ๋ณธ๋ฌธ"),
|
| 127 |
gr.Textbox(label="์๋ ์์ฝ", lines=5),
|
| 128 |
gr.Textbox(label="์๋ ์ฌ์์ฑ (LLM)", lines=5)
|
| 129 |
],
|
| 130 |
-
title="ํ๊ตญ์ด ๋ณธ๋ฌธ ์ถ์ถ + ์๋ ์์ฝ +
|
| 131 |
-
description="๋ณธ๋ฌธ์ TextRank๋ก ์์ฝํ๊ณ , ์ฌ์์ฑ์ Hugging Face Hub LLM์ผ๋ก ์ฒ๋ฆฌํฉ๋๋ค.
|
| 132 |
)
|
| 133 |
|
| 134 |
if __name__ == "__main__":
|
|
|
|
| 12 |
import torch
|
| 13 |
from transformers import AutoModelForCausalLM, AutoTokenizer, pipeline
|
| 14 |
|
| 15 |
# ===== ์ง์ ๋ชจ๋ธ ๋ชฉ๋ก =====
# Maps the human-readable label shown in the Gradio dropdown to the
# Hugging Face Hub model id that load_model() downloads and runs.
# NOTE(review): the Korean label text appears mojibake-corrupted in this
# copy of the file; bytes kept exactly as found — confirm the intended
# text against the originally encoded source.
MODEL_OPTIONS = {
    "Qwen2.5-1.5B-Instruct (ํ์งโ, ๋๋ฆผ)": "Qwen/Qwen2.5-1.5B-Instruct",
    "Qwen2.5-0.5B-Instruct (๋น ๋ฆ, ๊ฒฝ๋)": "Qwen/Qwen2.5-0.5B-Instruct",
    "Phi-3-Mini-4K-Instruct (๋น ๋ฆ, ๊ฒฝ๋)": "microsoft/Phi-3-mini-4k-instruct",
    "Mistral-7B-Instruct-v0.3": "mistralai/Mistral-7B-Instruct-v0.3",
}
|
| 22 |
+
|
| 23 |
# ===== ๋ชจ๋ธ ๋ก๋ ํจ์ =====
# Cache of already-built pipelines, keyed by Hub model id. Loading a
# transformers checkpoint is expensive (seconds to minutes and gigabytes
# of RAM), and rewrite_with_llm() calls load_model() on every request —
# without this cache the same model would be re-downloaded/re-built each
# time the user submits a URL.
_PIPELINE_CACHE = {}


def load_model(model_name):
    """Return a CPU text-generation pipeline for ``model_name``.

    Parameters
    ----------
    model_name : str
        Hugging Face Hub model id (a value of ``MODEL_OPTIONS``).

    Returns
    -------
    transformers.Pipeline
        A ``"text-generation"`` pipeline bound to CPU (``device=-1``).
        The same pipeline object is returned on repeated calls with the
        same ``model_name``.
    """
    if model_name not in _PIPELINE_CACHE:
        tokenizer = AutoTokenizer.from_pretrained(model_name)
        model = AutoModelForCausalLM.from_pretrained(
            model_name,
            # float32 deliberately: this Space runs on CPU, where half
            # precision is unsupported or slower.
            torch_dtype=torch.float32,
        ).to("cpu")
        _PIPELINE_CACHE[model_name] = pipeline(
            "text-generation", model=model, tokenizer=tokenizer, device=-1
        )
    return _PIPELINE_CACHE[model_name]
|
| 31 |
|
| 32 |
# ===== ์ ํธ =====
|
| 33 |
def clean_text(text: str) -> str:
|
|
|
|
| 75 |
summary_list.sort(key=lambda s: text.find(s))
|
| 76 |
return summary_list
|
| 77 |
|
| 78 |
# ===== LLM ์๋์ฌ์์ฑ =====
def rewrite_with_llm(sentences, model_choice):
    """Paraphrase the summary ``sentences`` with the user-selected LLM.

    Parameters
    ----------
    sentences : list[str]
        Summary sentences to rewrite (joined with newlines into the prompt).
    model_choice : str
        A key of ``MODEL_OPTIONS`` — the label picked in the UI dropdown.

    Returns
    -------
    str
        The model's continuation with the echoed prompt prefix removed
        and surrounding whitespace stripped.

    Raises
    ------
    KeyError
        If ``model_choice`` is not a key of ``MODEL_OPTIONS``.
    """
    model_name = MODEL_OPTIONS[model_choice]
    llm_pipeline = load_model(model_name)

    joined_text = "\n".join(sentences)
    # NOTE(review): the prompt text below appears mojibake-corrupted in
    # this copy of the file; kept byte-for-byte as found — confirm against
    # the originally encoded source.
    prompt = f"""๋ค์ ๋ฌธ์ฅ์ ์๋ฏธ๋ ์ ์งํ๋, ์๋ฌธ์ ์๋ ๋ด์ฉ์ ์ ๋ ์ถ๊ฐํ์ง ๋ง๊ณ ,
๋ฌธ์ฅ๋ง ๋ ์์ฐ์ค๋ฝ๊ฒ ๋ฐ๊ฟ์ฃผ์ธ์. ๋ค๋ฅธ ์ค๋ช์ด๋ ๋ถ์ฐ ๋ฌธ์ฅ์ ์ฐ์ง ๋ง์ธ์.

๋ฌธ์ฅ:
{joined_text}
"""
    # Greedy decoding for reproducible output. The redundant
    # ``temperature=0`` was dropped: with ``do_sample=False`` sampling
    # parameters are ignored, and recent transformers versions warn about
    # (or reject) a non-positive temperature.
    result = llm_pipeline(prompt, max_new_tokens=180, do_sample=False)
    generated = result[0]["generated_text"]
    # "text-generation" pipelines echo the prompt as a prefix of the
    # output. Strip only that prefix — the previous str.replace(prompt, "")
    # would also delete any later repetition of the prompt that the model
    # happened to generate.
    if generated.startswith(prompt):
        generated = generated[len(prompt):]
    return generated.strip()
|
| 92 |
|
| 93 |
# ===== ์ ์ฒด ํ์ดํ๋ผ์ธ =====
|
| 94 |
+
def extract_summarize_paraphrase(url, model_choice):
|
| 95 |
headers = {"User-Agent": "Mozilla/5.0"}
|
| 96 |
try:
|
| 97 |
r = requests.get(url, headers=headers, timeout=10)
|
|
|
|
| 113 |
if not summary_sentences:
|
| 114 |
summary_sentences = ["์์ฝ ์์"]
|
| 115 |
|
| 116 |
+
paraphrased_text = rewrite_with_llm(summary_sentences, model_choice)
|
| 117 |
|
| 118 |
return (
|
| 119 |
markdown_text or "๋ณธ๋ฌธ ์์",
|
|
|
|
| 127 |
# ===== Gradio UI =====
# Two inputs (article URL + paraphrase-model dropdown) mapped onto the
# two parameters of extract_summarize_paraphrase; three outputs
# (extracted body, extractive summary, LLM paraphrase).
# NOTE(review): the Korean label/title/description strings appear
# mojibake-corrupted in this copy of the file (the URL-input label was
# even split mid-character by the scrape and has been rejoined here);
# confirm all of them against the originally encoded source.
iface = gr.Interface(
    fn=extract_summarize_paraphrase,
    inputs=[
        gr.Textbox(label="URL ์๋ ฅ", placeholder="https://example.com"),
        gr.Dropdown(
            choices=list(MODEL_OPTIONS.keys()),
            value="Qwen2.5-0.5B-Instruct (๋น ๋ฆ, ๊ฒฝ๋)",
            label="์ฌ์์ฑ ๋ชจ๋ธ ์ ํ",
        ),
    ],
    outputs=[
        gr.Markdown(label="์ถ์ถ๋ ๋ณธ๋ฌธ"),
        gr.Textbox(label="์๋ ์์ฝ", lines=5),
        gr.Textbox(label="์๋ ์ฌ์์ฑ (LLM)", lines=5),
    ],
    title="ํ๊ตญ์ด ๋ณธ๋ฌธ ์ถ์ถ + ์๋ ์์ฝ + LLM ์ฌ์์ฑ (๋ชจ๋ธ ์ ํ ๊ฐ๋ฅ)",
    description="๋ณธ๋ฌธ์ TextRank๋ก ์์ฝํ๊ณ , ์ฌ์์ฑ์ ์ ํํ Hugging Face Hub LLM์ผ๋ก ์ฒ๋ฆฌํฉ๋๋ค.",
)
|
| 142 |
|
| 143 |
if __name__ == "__main__":
|