Spaces:

orgoflu
/

moro_text_2

Sleeping

App Files Community

orgoflu committed on Sep 11, 2025

Commit

208dd23

verified ·

1 Parent(s): 276cd92

Update app.py

Browse files

Files changed (1) hide show

app.py +35 -129

app.py CHANGED Viewed

@@ -1,144 +1,50 @@
-import nltk
-nltk.download("punkt")
 import gradio as gr
-import trafilatura
-import requests
-from markdownify import markdownify as md
-from sumy.parsers.plaintext import PlaintextParser
-from sumy.nlp.tokenizers import Tokenizer
-from sumy.summarizers.text_rank import TextRankSummarizer
-import re
 import torch
-from transformers import AutoModelForCausalLM, AutoTokenizer, pipeline
-# ===== 사용할 모델 3개 =====
 MODEL_OPTIONS = {
     "Qwen2.5-1.5B-Instruct": "Qwen/Qwen2.5-1.5B-Instruct",
     "Gemma-3-4B-it": "google/gemma-3-4b-it",
-    "HyperCLOVA-X-Seed-3B": "naver-clova/HyperCLOVA-X-Seed-3B"
 }
 # ===== 모델 로드 =====
 def load_model(model_name):
-    tokenizer = AutoTokenizer.from_pretrained(model_name, trust_remote_code=True)
-    model = AutoModelForCausalLM.from_pretrained(
-        model_name,
-        torch_dtype=torch.float32,
-        trust_remote_code=True
-    ).to("cpu")
-    return pipeline("text-generation", model=model, tokenizer=tokenizer, device=-1)
-# ===== 텍스트 전처리 =====
-def clean_text(text: str) -> str:
-    return re.sub(r'\s+', ' ', text).strip()
-def remove_duplicates(sentences):
-    seen, result = set(), []
-    for s in sentences:
-        s_clean = s.strip()
-        if s_clean and s_clean not in seen:
-            seen.add(s_clean)
-            result.append(s_clean)
-    return result
-# ===== 자동 요약 =====
-def summarize_text(text):
-    text = clean_text(text)
-    length = len(text)
-    if length < 300:
-        sentence_count = 1
-    elif length < 800:
-        sentence_count = 2
-    elif length < 1500:
-        sentence_count = 3
     else:
-        sentence_count = 4
-    try:
-        parser = PlaintextParser.from_string(text, Tokenizer("korean"))
-        if len(parser.document.sentences) == 0:
-            raise ValueError
-    except:
-        try:
-            parser = PlaintextParser.from_string(text, Tokenizer("english"))
-            if len(parser.document.sentences) == 0:
-                raise ValueError
-        except:
-            sentences = re.split(r'(?<=[.!?])\s+', text)
-            return sentences[:sentence_count]
-    summarizer = TextRankSummarizer()
-    summary_sentences = summarizer(parser.document, sentence_count)
-    summary_list = [str(sentence) for sentence in summary_sentences]
-    summary_list = remove_duplicates(summary_list)
-    summary_list.sort(key=lambda s: text.find(s))
-    return summary_list
-# ===== LLM 재작성 =====
-def rewrite_with_llm(sentences, model_choice):
-    model_name = MODEL_OPTIONS[model_choice]
-    llm_pipeline = load_model(model_name)
-    joined_text = "\n".join(sentences)
-    prompt = f"""다음 문장을 의미는 유지하되, 원문에 없는 내용은 절대 추가하지 말고,
-문장만 더 자연스럽게 바꿔주세요. 다른 설명이나 부연 문장은 쓰지 마세요.
-문장:
-{joined_text}
-"""
-    result = llm_pipeline(prompt, max_new_tokens=150, do_sample=False, temperature=0)
-    return result[0]["generated_text"].replace(prompt, "").strip()
-# ===== 전체 파이프라인 =====
-def extract_summarize_paraphrase(url, model_choice):
-    headers = {"User-Agent": "Mozilla/5.0"}
-    try:
-        r = requests.get(url, headers=headers, timeout=10)
-        r.raise_for_status()
-        html_content = trafilatura.extract(
-            r.text,
-            output_format="html",
-            include_tables=False,
-            favor_recall=True
-        )
-        if not html_content:
-            markdown_text = md(r.text, heading_style="ATX")
-        else:
-            markdown_text = md(html_content, heading_style="ATX")
-        summary_sentences = summarize_text(markdown_text)
-        if not summary_sentences:
-            summary_sentences = ["요약 없음"]
-        paraphrased_text = rewrite_with_llm(summary_sentences, model_choice)
-        return (
-            markdown_text or "본문 없음",
-            "\n".join(summary_sentences),
-            paraphrased_text
-        )
-    except Exception as e:
-        return f"에러 발생: {e}", "요약 없음", "재작성 없음"
-# ===== Gradio UI =====
-iface = gr.Interface(
-    fn=extract_summarize_paraphrase,
-    inputs=[
-        gr.Textbox(label="URL 입력", placeholder="https://example.com"),
-        gr.Dropdown(choices=list(MODEL_OPTIONS.keys()), value="Qwen2.5-1.5B-Instruct", label="재작성 모델 선택")
-    ],
-    outputs=[
-        gr.Markdown(label="추출된 본문"),
-        gr.Textbox(label="자동 요약", lines=5),
-        gr.Textbox(label="자동 재작성 (LLM)", lines=5)
-    ],
-    title="한국어 본문 추출 + 자동 요약 + LLM 재작성",
-    description="Qwen 1.5B, Gemma 3 E4B, HyperCLOVA-X-Seed-3B 중 선택하여 재작성"
-)
-if __name__ == "__main__":
-    iface.launch()

 import gradio as gr
 import torch
+from transformers import pipeline, AutoTokenizer, AutoModelForVision2Seq
+# ===== 모델 목록 =====
 MODEL_OPTIONS = {
     "Qwen2.5-1.5B-Instruct": "Qwen/Qwen2.5-1.5B-Instruct",
     "Gemma-3-4B-it": "google/gemma-3-4b-it",
+    "CLOVA-Donut-CORDv2": "naver-clova-ix/donut-base-finetuned-cord-v2"
 }
 # ===== 모델 로드 =====
 def load_model(model_name):
+    if model_name == "naver-clova-ix/donut-base-finetuned-cord-v2":
+        # Vision2Seq 모델 로드
+        tokenizer = AutoTokenizer.from_pretrained(model_name)
+        model = AutoModelForVision2Seq.from_pretrained(model_name)
+        return pipeline("image-to-text", model=model, tokenizer=tokenizer)
     else:
+        from transformers import AutoModelForCausalLM, AutoTokenizer
+        tokenizer = AutoTokenizer.from_pretrained(model_name, trust_remote_code=True)
+        model = AutoModelForCausalLM.from_pretrained(
+            model_name,
+            torch_dtype=torch.float32,
+            trust_remote_code=True
+        ).to("cpu")
+        return pipeline("text-generation", model=model, tokenizer=tokenizer, device=-1)
+# ===== CLOVA 이미지 처리 =====
+def process_image_with_clova(image):
+    pipe = load_model("naver-clova-ix/donut-base-finetuned-cord-v2")
+    result = pipe(image)
+    return result[0]["generated_text"]
+# ===== Gradio UI =====
+with gr.Blocks() as iface:
+    gr.Markdown("## Qwen / Gemma / CLOVA Donut 테스트")
+    with gr.Tab("텍스트 URL 요약/재작성"):
+        url_input = gr.Textbox(label="URL 입력")
+        model_choice = gr.Dropdown(choices=list(MODEL_OPTIONS.keys()), value="Qwen2.5-1.5B-Instruct")
+        output_text = gr.Textbox(label="출력")
+        # 여기에 기존 URL 처리 함수 연결
+    with gr.Tab("CLOVA 이미지 → 텍스트"):
+        image_input = gr.Image(type="pil", label="이미지 업로드")
+        clova_output = gr.Textbox(label="인식 결과")
+        image_input.change(process_image_with_clova, inputs=image_input, outputs=clova_output)
+iface.launch()