orgoflu commited on
Commit
af6f11c
·
verified ·
1 Parent(s): 6cc7695

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +32 -23
app.py CHANGED
@@ -12,18 +12,22 @@ import re
12
  import torch
13
  from transformers import AutoModelForCausalLM, AutoTokenizer, pipeline
14
 
15
- # ===== LLM 로드 (경량 모델로 속도 개선) =====
16
- # Qwen2.5-1.5B-Instruct → 품질은 좋지만 느림
17
- # 속도 개선 위해 phi-3-mini-4k-instruct 같은 경량 모델도 가능
18
- MODEL_NAME = "Qwen/Qwen2.5-1.5B-Instruct"
19
-
20
- tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)
21
- model = AutoModelForCausalLM.from_pretrained(
22
- MODEL_NAME,
23
- torch_dtype=torch.float32
24
- ).to("cpu")
25
-
26
- llm_pipeline = pipeline("text-generation", model=model, tokenizer=tokenizer, device=-1)
 
 
 
 
27
 
28
  # ===== ์œ ํ‹ธ =====
29
  def clean_text(text: str) -> str:
@@ -71,21 +75,23 @@ def summarize_text(text):
71
  summary_list.sort(key=lambda s: text.find(s))
72
  return summary_list
73
 
74
- # ===== LLM 자동재작성 (불필요 안내문 제거) =====
75
- def rewrite_with_llm(sentences):
 
 
 
76
  joined_text = "\n".join(sentences)
77
- prompt = f"""๋‹ค์Œ ๋ฌธ์žฅ์„ ์˜๋ฏธ๋Š” ์œ ์ง€ํ•˜๋˜, ๋” ์ž์—ฐ์Šค๋Ÿฝ๊ณ  ์ฝ๊ธฐ ์‰ฝ๊ฒŒ ์žฌ์ž‘์„ฑํ•ด ์ฃผ์„ธ์š”.
78
- ์ถœ๋ ฅ์€ ์žฌ์ž‘์„ฑ๋œ ๋ฌธ์žฅ๋งŒ ํฌํ•จํ•˜๊ณ , ๋‹ค๋ฅธ ์„ค๋ช…์ด๋‚˜ ๋ถ€์—ฐ ๋ฌธ์žฅ์€ ์ ˆ๋Œ€ ์“ฐ์ง€ ๋งˆ์„ธ์š”.
79
 
80
  ๋ฌธ์žฅ:
81
  {joined_text}
82
  """
83
- result = llm_pipeline(prompt, max_new_tokens=300, do_sample=False)
84
- # ํ”„๋กฌํ”„ํŠธ ๋ถ€๋ถ„ ์ œ๊ฑฐ ํ›„ ์–‘๋ ๊ณต๋ฐฑ ์ œ๊ฑฐ
85
  return result[0]["generated_text"].replace(prompt, "").strip()
86
 
87
  # ===== ์ „์ฒด ํŒŒ์ดํ”„๋ผ์ธ =====
88
- def extract_summarize_paraphrase(url):
89
  headers = {"User-Agent": "Mozilla/5.0"}
90
  try:
91
  r = requests.get(url, headers=headers, timeout=10)
@@ -107,7 +113,7 @@ def extract_summarize_paraphrase(url):
107
  if not summary_sentences:
108
  summary_sentences = ["์š”์•ฝ ์—†์Œ"]
109
 
110
- paraphrased_text = rewrite_with_llm(summary_sentences)
111
 
112
  return (
113
  markdown_text or "๋ณธ๋ฌธ ์—†์Œ",
@@ -121,14 +127,17 @@ def extract_summarize_paraphrase(url):
121
  # ===== Gradio UI =====
122
  iface = gr.Interface(
123
  fn=extract_summarize_paraphrase,
124
- inputs=gr.Textbox(label="URL ์ž…๋ ฅ", placeholder="https://example.com"),
 
 
 
125
  outputs=[
126
  gr.Markdown(label="์ถ”์ถœ๋œ ๋ณธ๋ฌธ"),
127
  gr.Textbox(label="์ž๋™ ์š”์•ฝ", lines=5),
128
  gr.Textbox(label="์ž๋™ ์žฌ์ž‘์„ฑ (LLM)", lines=5)
129
  ],
130
- title="ํ•œ๊ตญ์–ด ๋ณธ๋ฌธ ์ถ”์ถœ + ์ž๋™ ์š”์•ฝ + HF LLM ์žฌ์ž‘์„ฑ",
131
- description="๋ณธ๋ฌธ์€ TextRank๋กœ ์š”์•ฝํ•˜๊ณ , ์žฌ์ž‘์„ฑ์€ Hugging Face Hub LLM์œผ๋กœ ์ฒ˜๋ฆฌํ•ฉ๋‹ˆ๋‹ค. ์ถœ๋ ฅ์€ ์žฌ์ž‘์„ฑ๋œ ๋ฌธ์žฅ๋งŒ ํฌํ•จํ•ฉ๋‹ˆ๋‹ค."
132
  )
133
 
134
  if __name__ == "__main__":
 
12
  import torch
13
  from transformers import AutoModelForCausalLM, AutoTokenizer, pipeline
14
 
15
# ===== Supported rewrite models (UI display label -> HF Hub repo id) =====
# NOTE: the display labels are referenced verbatim by the Gradio Dropdown
# default value, so they must not be altered independently of the UI code.
MODEL_OPTIONS = {
    "Qwen2.5-1.5B-Instruct (ํ’ˆ์งˆโ†‘, ๋А๋ฆผ)": "Qwen/Qwen2.5-1.5B-Instruct",
    "Qwen2.5-0.5B-Instruct (๋น ๋ฆ„, ๊ฒฝ๋Ÿ‰)": "Qwen/Qwen2.5-0.5B-Instruct",
    "Phi-3-Mini-4K-Instruct (๋น ๋ฆ„, ๊ฒฝ๋Ÿ‰)": "microsoft/Phi-3-mini-4k-instruct",
    "Mistral-7B-Instruct-v0.3": "mistralai/Mistral-7B-Instruct-v0.3",
}
22
+
23
# ===== Model loader =====
def load_model(model_name):
    """Return a CPU text-generation pipeline for *model_name*, cached.

    Args:
        model_name: Hugging Face Hub repo id (a value of MODEL_OPTIONS).

    Returns:
        A transformers ``pipeline("text-generation", ...)`` bound to CPU.

    The pipeline is memoized per model name on the function object:
    without the cache, every request re-downloads/re-instantiates the
    full model, which dominates request latency on a CPU-only Space.
    """
    cache = getattr(load_model, "_cache", None)
    if cache is None:
        cache = load_model._cache = {}
    if model_name not in cache:
        tokenizer = AutoTokenizer.from_pretrained(model_name)
        # float32 on CPU — half precision is slow/unsupported without a GPU.
        model = AutoModelForCausalLM.from_pretrained(
            model_name,
            torch_dtype=torch.float32,
        ).to("cpu")
        # device=-1 pins the pipeline to CPU.
        cache[model_name] = pipeline(
            "text-generation", model=model, tokenizer=tokenizer, device=-1
        )
    return cache[model_name]
31
 
32
  # ===== ์œ ํ‹ธ =====
33
  def clean_text(text: str) -> str:
 
75
  summary_list.sort(key=lambda s: text.find(s))
76
  return summary_list
77
 
78
# ===== LLM rewriting =====
def rewrite_with_llm(sentences, model_choice):
    """Paraphrase the summary sentences with the user-selected LLM.

    Args:
        sentences: list of summary sentence strings to rewrite.
        model_choice: display label selected in the UI; must be a key of
            MODEL_OPTIONS.

    Returns:
        The model's rewritten text with the echoed prompt stripped and
        surrounding whitespace removed.
    """
    llm_pipeline = load_model(MODEL_OPTIONS[model_choice])

    joined_text = "\n".join(sentences)
    prompt = f"""다음 문장을 의미는 유지하되, 원문에 없는 내용은 절대 추가하지 말고,
문장만 더 자연스럽게 바꿔주세요. 다른 설명이나 부연 문장은 쓰지 마세요.

문장:
{joined_text}
"""
    # Greedy decoding only. `temperature=0` was removed: recent transformers
    # validates temperature > 0 (warning/error), and with do_sample=False the
    # value is ignored by greedy search anyway.
    result = llm_pipeline(prompt, max_new_tokens=180, do_sample=False)

    generated = result[0]["generated_text"]
    # text-generation pipelines echo the prompt as a prefix by default.
    # Strip only that prefix — str.replace(prompt, "") would also delete any
    # later accidental repetition of the prompt inside the generation.
    if generated.startswith(prompt):
        generated = generated[len(prompt):]
    return generated.strip()
92
 
93
  # ===== ์ „์ฒด ํŒŒ์ดํ”„๋ผ์ธ =====
94
+ def extract_summarize_paraphrase(url, model_choice):
95
  headers = {"User-Agent": "Mozilla/5.0"}
96
  try:
97
  r = requests.get(url, headers=headers, timeout=10)
 
113
  if not summary_sentences:
114
  summary_sentences = ["์š”์•ฝ ์—†์Œ"]
115
 
116
+ paraphrased_text = rewrite_with_llm(summary_sentences, model_choice)
117
 
118
  return (
119
  markdown_text or "๋ณธ๋ฌธ ์—†์Œ",
 
127
  # ===== Gradio UI =====
128
# Gradio front-end: URL + model picker in; extracted body, summary and
# LLM rewrite out. All user-facing strings are kept exactly as-is.
iface = gr.Interface(
    fn=extract_summarize_paraphrase,
    inputs=[
        gr.Textbox(label="URL ์ž…๋ ฅ", placeholder="https://example.com"),
        gr.Dropdown(
            choices=list(MODEL_OPTIONS.keys()),
            value="Qwen2.5-0.5B-Instruct (๋น ๋ฆ„, ๊ฒฝ๋Ÿ‰)",
            label="์žฌ์ž‘์„ฑ ๋ชจ๋ธ ์„ ํƒ",
        ),
    ],
    outputs=[
        gr.Markdown(label="์ถ”์ถœ๋œ ๋ณธ๋ฌธ"),
        gr.Textbox(label="์ž๋™ ์š”์•ฝ", lines=5),
        gr.Textbox(label="์ž๋™ ์žฌ์ž‘์„ฑ (LLM)", lines=5),
    ],
    title="ํ•œ๊ตญ์–ด ๋ณธ๋ฌธ ์ถ”์ถœ + ์ž๋™ ์š”์•ฝ + LLM ์žฌ์ž‘์„ฑ (๋ชจ๋ธ ์„ ํƒ ๊ฐ€๋Šฅ)",
    description="๋ณธ๋ฌธ์€ TextRank๋กœ ์š”์•ฝํ•˜๊ณ , ์žฌ์ž‘์„ฑ์€ ์„ ํƒํ•œ Hugging Face Hub LLM์œผ๋กœ ์ฒ˜๋ฆฌํ•ฉ๋‹ˆ๋‹ค."
)
142
 
143
  if __name__ == "__main__":