Spaces:
Sleeping
Sleeping
File size: 5,178 Bytes
26bd648 42dfadf 26bd648 4601952 26bd648 42dfadf 26bd648 af6f11c 276cd92 208dd23 af6f11c dfdac42 af6f11c 208dd23 42dfadf 208dd23 26bd648 6cc7695 26bd648 7f49497 42dfadf 26bd648 42dfadf 26bd648 42dfadf 26bd648 |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 |
# Standard library
import re

# Third-party
import gradio as gr
import nltk
import requests
import torch
import trafilatura
from markdownify import markdownify as md
from sumy.nlp.tokenizers import Tokenizer
from sumy.parsers.plaintext import PlaintextParser
from sumy.summarizers.text_rank import TextRankSummarizer
from transformers import (
    AutoModelForCausalLM,
    AutoModelForVision2Seq,
    AutoTokenizer,
    pipeline,
)

# sumy's Tokenizer uses NLTK's "punkt" sentence models; fetch them once at
# startup. quiet=True keeps the Space logs free of download progress noise.
nltk.download("punkt", quiet=True)
# ===== The two models available for the rewrite step =====
# Maps the human-readable dropdown label (shown in the Gradio UI) to the
# Hugging Face Hub model id passed to load_model().
MODEL_OPTIONS = {
    "Qwen2.5-1.5B-Instruct": "Qwen/Qwen2.5-1.5B-Instruct",
    "CLOVA-Donut-CORDv2": "naver-clova-ix/donut-base-finetuned-cord-v2"
}
# ===== Model loading =====
# Cache of already-built pipelines keyed by model id, so repeated requests do
# not re-instantiate a multi-GB checkpoint on every call.
_PIPELINE_CACHE = {}


def load_model(model_name):
    """Build (or fetch from cache) an inference pipeline for ``model_name``.

    Parameters
    ----------
    model_name : str
        Hugging Face model id (one of the values of ``MODEL_OPTIONS``).

    Returns
    -------
    transformers.Pipeline
        An ``image-to-text`` pipeline for the Donut model, otherwise a
        CPU ``text-generation`` pipeline.
    """
    if model_name in _PIPELINE_CACHE:
        return _PIPELINE_CACHE[model_name]

    if model_name == "naver-clova-ix/donut-base-finetuned-cord-v2":
        # Donut is a vision encoder-decoder; expose it as image-to-text.
        tokenizer = AutoTokenizer.from_pretrained(model_name)
        model = AutoModelForVision2Seq.from_pretrained(model_name)
        pipe = pipeline("image-to-text", model=model, tokenizer=tokenizer)
    else:
        tokenizer = AutoTokenizer.from_pretrained(model_name, trust_remote_code=True)
        model = AutoModelForCausalLM.from_pretrained(
            model_name,
            torch_dtype=torch.float32,  # full precision: the Space runs on CPU
            trust_remote_code=True,
        ).to("cpu")
        # device=-1 pins the pipeline to CPU.
        pipe = pipeline("text-generation", model=model, tokenizer=tokenizer, device=-1)

    _PIPELINE_CACHE[model_name] = pipe
    return pipe
# ===== Text preprocessing =====
def clean_text(text: str) -> str:
    """Collapse every run of whitespace to a single space and trim the ends."""
    return " ".join(text.split())
def remove_duplicates(sentences):
    """Strip each sentence, dropping blanks and repeats; keep first-seen order."""
    kept = []
    already_seen = set()
    for raw in sentences:
        stripped = raw.strip()
        if not stripped or stripped in already_seen:
            continue
        already_seen.add(stripped)
        kept.append(stripped)
    return kept
# ===== Automatic summarization =====
def summarize_text(text):
    """Produce an extractive TextRank summary of ``text``.

    The number of summary sentences scales with the cleaned text length
    (1 sentence under 300 chars, up to 4 at 1500+). Korean tokenization is
    tried first, then English; if neither yields any sentences, a naive
    punctuation-based regex split is used as a last resort.

    Returns a list of summary sentences in original document order.
    """
    text = clean_text(text)
    length = len(text)
    # Scale summary size with document length.
    if length < 300:
        sentence_count = 1
    elif length < 800:
        sentence_count = 2
    elif length < 1500:
        sentence_count = 3
    else:
        sentence_count = 4

    # Try tokenizers in preference order. The original used bare `except:`,
    # which also swallows SystemExit/KeyboardInterrupt; narrow to Exception.
    parser = None
    for language in ("korean", "english"):
        try:
            candidate = PlaintextParser.from_string(text, Tokenizer(language))
            if len(candidate.document.sentences) > 0:
                parser = candidate
                break
        except Exception:
            continue

    if parser is None:
        # Fallback: naive sentence split on terminal punctuation.
        sentences = re.split(r'(?<=[.!?])\s+', text)
        return sentences[:sentence_count]

    summarizer = TextRankSummarizer()
    summary_sentences = summarizer(parser.document, sentence_count)
    summary_list = remove_duplicates([str(sentence) for sentence in summary_sentences])
    # TextRank returns sentences by rank; restore document order for readability.
    summary_list.sort(key=lambda s: text.find(s))
    return summary_list
# ===== LLM rewriting =====
def rewrite_with_llm(sentences, model_choice):
    """Paraphrase the summary sentences with the selected model.

    Parameters
    ----------
    sentences : list[str]
        Summary sentences to rewrite.
    model_choice : str
        A key of ``MODEL_OPTIONS``.

    Returns
    -------
    str
        The rewritten text, or the joined input unchanged for the Donut model.
    """
    model_name = MODEL_OPTIONS[model_choice]
    llm_pipeline = load_model(model_name)
    joined_text = "\n".join(sentences)
    if model_choice == "CLOVA-Donut-CORDv2":
        # CLOVA Donut is an image-to-text model; for plain text input we just
        # return the text as-is.
        return joined_text
    # Prompt (Korean): keep the meaning, add nothing not in the original,
    # only make the sentences more natural, no extra explanations.
    prompt = f"""다음 문장의 의미는 유지하되, 원문에 없는 내용은 절대 추가하지 말고,
문장만 더 자연스럽게 바꿔주세요. 다른 설명이나 부연 문장은 쓰지 마세요.
문장:
{joined_text}
"""
    # Greedy decoding. temperature is meaningless when do_sample=False and
    # recent transformers versions warn/raise on the combination, so omit it.
    result = llm_pipeline(prompt, max_new_tokens=150, do_sample=False)
    # text-generation output echoes the prompt; strip it to keep the rewrite.
    return result[0]["generated_text"].replace(prompt, "").strip()
# ===== Full pipeline =====
def extract_summarize_paraphrase(url, model_choice):
    """Fetch ``url``, extract its body, summarize it, and rewrite with an LLM.

    Returns a 3-tuple of (extracted markdown body, summary text, paraphrased
    text). On any failure the first element carries the error message and the
    other two carry Korean "none" placeholders — this function is the Gradio
    boundary, so it reports errors instead of raising.
    """
    headers = {"User-Agent": "Mozilla/5.0"}  # some sites block default UA
    try:
        response = requests.get(url, headers=headers, timeout=10)
        response.raise_for_status()
        html_content = trafilatura.extract(
            response.text,
            output_format="html",
            include_tables=False,
            favor_recall=True,
        )
        # Fall back to converting the raw page when extraction found nothing;
        # either way the result is markdown with ATX (#) headings.
        source_html = html_content if html_content else response.text
        markdown_text = md(source_html, heading_style="ATX")

        summary_sentences = summarize_text(markdown_text)
        if not summary_sentences:
            summary_sentences = ["요약 없음"]  # "no summary"
        paraphrased_text = rewrite_with_llm(summary_sentences, model_choice)
        return (
            markdown_text or "본문 없음",  # "no body"
            "\n".join(summary_sentences),
            paraphrased_text,
        )
    except Exception as e:
        # "error occurred" / "no summary" / "no rewrite"
        return f"에러 발생: {e}", "요약 없음", "재작성 없음"
# ===== Gradio UI =====
# Labels/title are Korean: "URL input", "rewrite model selection",
# "extracted body", "auto summary", "auto rewrite (LLM)".
iface = gr.Interface(
    fn=extract_summarize_paraphrase,
    inputs=[
        gr.Textbox(label="URL 입력", placeholder="https://example.com"),
        gr.Dropdown(
            choices=list(MODEL_OPTIONS.keys()),
            value="Qwen2.5-1.5B-Instruct",
            label="재작성 모델 선택",
        ),
    ],
    outputs=[
        gr.Markdown(label="추출된 본문"),
        gr.Textbox(label="자동 요약", lines=5),
        gr.Textbox(label="자동 재작성 (LLM)", lines=5),
    ],
    title="한국어 본문 추출 + 자동 요약 + LLM 재작성",
    description="Qwen 1.5B 또는 CLOVA Donut(CORDv2)로 재작성",
)

if __name__ == "__main__":
    iface.launch()