orgoflu committed on
Commit
26bd648
·
verified ·
1 Parent(s): 208dd23

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +127 -23
app.py CHANGED
@@ -1,23 +1,30 @@
 
 
 
1
  import gradio as gr
 
 
 
 
 
 
 
2
  import torch
3
- from transformers import pipeline, AutoTokenizer, AutoModelForVision2Seq
4
 
5
- # ===== ๋ชจ๋ธ ๋ชฉ๋ก =====
6
  MODEL_OPTIONS = {
7
  "Qwen2.5-1.5B-Instruct": "Qwen/Qwen2.5-1.5B-Instruct",
8
- "Gemma-3-4B-it": "google/gemma-3-4b-it",
9
  "CLOVA-Donut-CORDv2": "naver-clova-ix/donut-base-finetuned-cord-v2"
10
  }
11
 
12
  # ===== ๋ชจ๋ธ ๋กœ๋“œ =====
13
  def load_model(model_name):
14
  if model_name == "naver-clova-ix/donut-base-finetuned-cord-v2":
15
- # Vision2Seq ๋ชจ๋ธ ๋กœ๋“œ
16
  tokenizer = AutoTokenizer.from_pretrained(model_name)
17
  model = AutoModelForVision2Seq.from_pretrained(model_name)
18
  return pipeline("image-to-text", model=model, tokenizer=tokenizer)
19
  else:
20
- from transformers import AutoModelForCausalLM, AutoTokenizer
21
  tokenizer = AutoTokenizer.from_pretrained(model_name, trust_remote_code=True)
22
  model = AutoModelForCausalLM.from_pretrained(
23
  model_name,
@@ -26,25 +33,122 @@ def load_model(model_name):
26
  ).to("cpu")
27
  return pipeline("text-generation", model=model, tokenizer=tokenizer, device=-1)
28
 
29
- # ===== CLOVA ์ด๋ฏธ์ง€ ์ฒ˜๋ฆฌ =====
30
- def process_image_with_clova(image):
31
- pipe = load_model("naver-clova-ix/donut-base-finetuned-cord-v2")
32
- result = pipe(image)
33
- return result[0]["generated_text"]
34
 
35
- # ===== Gradio UI =====
36
- with gr.Blocks() as iface:
37
- gr.Markdown("## Qwen / Gemma / CLOVA Donut ํ…Œ์ŠคํŠธ")
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
38
 
39
- with gr.Tab("ํ…์ŠคํŠธ URL ์š”์•ฝ/์žฌ์ž‘์„ฑ"):
40
- url_input = gr.Textbox(label="URL ์ž…๋ ฅ")
41
- model_choice = gr.Dropdown(choices=list(MODEL_OPTIONS.keys()), value="Qwen2.5-1.5B-Instruct")
42
- output_text = gr.Textbox(label="์ถœ๋ ฅ")
43
- # ์—ฌ๊ธฐ์— ๊ธฐ์กด URL ์ฒ˜๋ฆฌ ํ•จ์ˆ˜ ์—ฐ๊ฒฐ
44
 
45
- with gr.Tab("CLOVA ์ด๋ฏธ์ง€ โ†’ ํ…์ŠคํŠธ"):
46
- image_input = gr.Image(type="pil", label="์ด๋ฏธ์ง€ ์—…๋กœ๋“œ")
47
- clova_output = gr.Textbox(label="์ธ์‹ ๊ฒฐ๊ณผ")
48
- image_input.change(process_image_with_clova, inputs=image_input, outputs=clova_output)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
49
 
50
- iface.launch()
 
 
1
+ import nltk
2
+ nltk.download("punkt")
3
+
4
  import gradio as gr
5
+ import trafilatura
6
+ import requests
7
+ from markdownify import markdownify as md
8
+ from sumy.parsers.plaintext import PlaintextParser
9
+ from sumy.nlp.tokenizers import Tokenizer
10
+ from sumy.summarizers.text_rank import TextRankSummarizer
11
+ import re
12
  import torch
13
+ from transformers import AutoModelForCausalLM, AutoTokenizer, pipeline, AutoModelForVision2Seq
14
 
# ===== Available models (UI display name -> Hugging Face model id) =====
# Qwen is a text-generation LLM; the Donut id is special-cased by
# load_model(), which builds an image-to-text pipeline for it.
MODEL_OPTIONS = {
    "Qwen2.5-1.5B-Instruct": "Qwen/Qwen2.5-1.5B-Instruct",
    "CLOVA-Donut-CORDv2": "naver-clova-ix/donut-base-finetuned-cord-v2"
}
20
 
21
  # ===== ๋ชจ๋ธ ๋กœ๋“œ =====
22
  def load_model(model_name):
23
  if model_name == "naver-clova-ix/donut-base-finetuned-cord-v2":
 
24
  tokenizer = AutoTokenizer.from_pretrained(model_name)
25
  model = AutoModelForVision2Seq.from_pretrained(model_name)
26
  return pipeline("image-to-text", model=model, tokenizer=tokenizer)
27
  else:
 
28
  tokenizer = AutoTokenizer.from_pretrained(model_name, trust_remote_code=True)
29
  model = AutoModelForCausalLM.from_pretrained(
30
  model_name,
 
33
  ).to("cpu")
34
  return pipeline("text-generation", model=model, tokenizer=tokenizer, device=-1)
35
 
36
# ===== Text preprocessing =====
def clean_text(text: str) -> str:
    """Collapse every run of whitespace into a single space and trim the ends."""
    collapsed = re.sub(r'\s+', ' ', text)
    return collapsed.strip()
 
 
39
 
40
def remove_duplicates(sentences):
    """Strip each sentence and drop blanks and repeats, keeping first-seen order."""
    stripped = (s.strip() for s in sentences)
    # dict.fromkeys preserves insertion order, so the first occurrence wins.
    return list(dict.fromkeys(s for s in stripped if s))
48
+
49
# ===== Automatic summarization =====
def summarize_text(text):
    """Extractively summarize *text* with TextRank.

    The number of summary sentences scales with the cleaned text length.
    Tokenization tries Korean first, then English; if neither yields any
    sentences (or raises), a naive regex sentence split is the last resort.
    Returns a list of sentences restored to original document order.
    """
    text = clean_text(text)
    length = len(text)
    # Longer inputs get proportionally more summary sentences.
    if length < 300:
        sentence_count = 1
    elif length < 800:
        sentence_count = 2
    elif length < 1500:
        sentence_count = 3
    else:
        sentence_count = 4

    # FIX: the original used bare `except:` twice, which also swallows
    # KeyboardInterrupt/SystemExit; narrowed to Exception and collapsed the
    # duplicated try/except ladder into a tokenizer-fallback loop.
    parser = None
    for language in ("korean", "english"):
        try:
            candidate = PlaintextParser.from_string(text, Tokenizer(language))
            if len(candidate.document.sentences) > 0:
                parser = candidate
                break
        except Exception:
            continue

    if parser is None:
        # Fallback: split on sentence-final punctuation followed by whitespace.
        sentences = re.split(r'(?<=[.!?])\s+', text)
        return sentences[:sentence_count]

    summarizer = TextRankSummarizer()
    summary_sentences = summarizer(parser.document, sentence_count)
    summary_list = [str(sentence) for sentence in summary_sentences]
    summary_list = remove_duplicates(summary_list)
    # Restore document order; str.find returns -1 for sentences whose stripped
    # form no longer matches the text exactly, which sorts those first.
    summary_list.sort(key=lambda s: text.find(s))
    return summary_list
81
+
82
# ===== LLM rewriting =====
def rewrite_with_llm(sentences, model_choice):
    """Rewrite the summary *sentences* more fluently with the selected model.

    CLOVA Donut is an image-to-text model with no text-rewriting ability, so
    its branch returns the joined input unchanged; otherwise the sentences are
    wrapped in a Korean rewrite prompt and passed to the text-generation
    pipeline produced by load_model().
    """
    model_name = MODEL_OPTIONS[model_choice]
    llm_pipeline = load_model(model_name)

    joined_text = "\n".join(sentences)

    if model_choice == "CLOVA-Donut-CORDv2":
        # Donut is image-only; pass the text through untouched.
        return joined_text

    prompt = f"""๋‹ค์Œ ๋ฌธ์žฅ์„ ์˜๋ฏธ๋Š” ์œ ์ง€ํ•˜๋˜, ์›๋ฌธ์— ์—†๋Š” ๋‚ด์šฉ์€ ์ ˆ๋Œ€ ์ถ”๊ฐ€ํ•˜์ง€ ๋ง๊ณ ,
๋ฌธ์žฅ๋งŒ ๋” ์ž์—ฐ์Šค๋Ÿฝ๊ฒŒ ๋ฐ”๊ฟ”์ฃผ์„ธ์š”. ๋‹ค๋ฅธ ์„ค๋ช…์ด๋‚˜ ๋ถ€์—ฐ ๋ฌธ์žฅ์€ ์“ฐ์ง€ ๋งˆ์„ธ์š”.

๋ฌธ์žฅ:
{joined_text}
"""
    # FIX: the original passed temperature=0 alongside do_sample=False — a
    # contradictory pair that transformers' generation-config validation
    # warns about (temperature is ignored when sampling is off), so the dead
    # argument is dropped. Greedy decoding behavior is unchanged.
    result = llm_pipeline(prompt, max_new_tokens=150, do_sample=False)
    # text-generation pipelines echo the prompt; strip it from the output.
    return result[0]["generated_text"].replace(prompt, "").strip()
102
+
103
# ===== Full pipeline =====
def extract_summarize_paraphrase(url, model_choice):
    """Fetch *url*, extract its main text, summarize it, then rewrite via LLM.

    Returns a (markdown body, summary, paraphrase) triple; any failure is
    reported through the three outputs instead of raising.
    """
    try:
        r = requests.get(url, headers={"User-Agent": "Mozilla/5.0"}, timeout=10)
        r.raise_for_status()

        # Prefer trafilatura's main-content extraction; fall back to the raw
        # page HTML when it finds nothing.
        html_content = trafilatura.extract(
            r.text,
            output_format="html",
            include_tables=False,
            favor_recall=True
        )
        markdown_text = md(html_content or r.text, heading_style="ATX")

        summary_sentences = summarize_text(markdown_text) or ["์š”์•ฝ ์—†์Œ"]
        paraphrased_text = rewrite_with_llm(summary_sentences, model_choice)

        return (
            markdown_text or "๋ณธ๋ฌธ ์—†์Œ",
            "\n".join(summary_sentences),
            paraphrased_text
        )

    except Exception as e:
        # Surface failures as UI text rather than crashing the Gradio app.
        return f"์—๋Ÿฌ ๋ฐœ์ƒ: {e}", "์š”์•ฝ ์—†์Œ", "์žฌ์ž‘์„ฑ ์—†์Œ"
136
+
137
# ===== Gradio UI =====
# Single-page interface: URL + model choice in; extracted body, summary, and
# LLM paraphrase out. Labels/title are Korean runtime strings and are kept
# byte-identical.
iface = gr.Interface(
    fn=extract_summarize_paraphrase,
    inputs=[
        gr.Textbox(label="URL ์ž…๋ ฅ", placeholder="https://example.com"),
        gr.Dropdown(choices=list(MODEL_OPTIONS.keys()), value="Qwen2.5-1.5B-Instruct", label="์žฌ์ž‘์„ฑ ๋ชจ๋ธ ์„ ํƒ")
    ],
    outputs=[
        gr.Markdown(label="์ถ”์ถœ๋œ ๋ณธ๋ฌธ"),
        gr.Textbox(label="์ž๋™ ์š”์•ฝ", lines=5),
        gr.Textbox(label="์ž๋™ ์žฌ์ž‘์„ฑ (LLM)", lines=5)
    ],
    title="ํ•œ๊ตญ์–ด ๋ณธ๋ฌธ ์ถ”์ถœ + ์ž๋™ ์š”์•ฝ + LLM ์žฌ์ž‘์„ฑ",
    description="Qwen 1.5B ๋˜๋Š” CLOVA Donut(CORDv2)๋กœ ์žฌ์ž‘์„ฑ"
)

# Launch the web app only when run as a script (not when imported).
if __name__ == "__main__":
    iface.launch()