"""Gradio app: extract the main text of a (Korean) article from a URL,
auto-summarize it with TextRank, and rewrite the summary with an LLM."""

import re

import gradio as gr
import nltk
import requests
import torch
import trafilatura
from markdownify import markdownify as md
from sumy.nlp.tokenizers import Tokenizer
from sumy.parsers.plaintext import PlaintextParser
from sumy.summarizers.text_rank import TextRankSummarizer
from transformers import (
    AutoModelForCausalLM,
    AutoModelForVision2Seq,
    AutoTokenizer,
    pipeline,
)

# Sentence-tokenizer data used by sumy; quiet so startup logs stay clean.
nltk.download("punkt", quiet=True)

# ===== Available rewrite models =====
MODEL_OPTIONS = {
    "Qwen2.5-1.5B-Instruct": "Qwen/Qwen2.5-1.5B-Instruct",
    "CLOVA-Donut-CORDv2": "naver-clova-ix/donut-base-finetuned-cord-v2",
}

# Cache of loaded pipelines so repeated requests don't re-load the weights.
_PIPELINE_CACHE: dict = {}


def load_model(model_name):
    """Return a transformers pipeline for *model_name*, loading it at most once.

    Donut is a vision model and gets an ``image-to-text`` pipeline; every
    other entry is treated as a causal LM running on CPU.
    """
    cached = _PIPELINE_CACHE.get(model_name)
    if cached is not None:
        return cached

    if model_name == "naver-clova-ix/donut-base-finetuned-cord-v2":
        tokenizer = AutoTokenizer.from_pretrained(model_name)
        model = AutoModelForVision2Seq.from_pretrained(model_name)
        pipe = pipeline("image-to-text", model=model, tokenizer=tokenizer)
    else:
        tokenizer = AutoTokenizer.from_pretrained(model_name, trust_remote_code=True)
        model = AutoModelForCausalLM.from_pretrained(
            model_name,
            torch_dtype=torch.float32,
            trust_remote_code=True,
        ).to("cpu")
        pipe = pipeline("text-generation", model=model, tokenizer=tokenizer, device=-1)

    _PIPELINE_CACHE[model_name] = pipe
    return pipe


def clean_text(text: str) -> str:
    """Collapse every whitespace run to a single space and strip the ends."""
    return re.sub(r'\s+', ' ', text).strip()


def remove_duplicates(sentences):
    """Drop duplicate (stripped) sentences, keeping first-seen order."""
    seen, result = set(), []
    for s in sentences:
        s_clean = s.strip()
        if s_clean and s_clean not in seen:
            seen.add(s_clean)
            result.append(s_clean)
    return result


def summarize_text(text):
    """Summarize *text* with TextRank into 1-4 sentences scaled by length.

    Tries the Korean sumy tokenizer first, then English; if neither yields
    any sentences, falls back to a naive punctuation-based split.
    Returns a list of sentence strings in document order.
    """
    text = clean_text(text)
    length = len(text)
    if length < 300:
        sentence_count = 1
    elif length < 800:
        sentence_count = 2
    elif length < 1500:
        sentence_count = 3
    else:
        sentence_count = 4

    parser = None
    for language in ("korean", "english"):
        try:
            candidate = PlaintextParser.from_string(text, Tokenizer(language))
            if len(candidate.document.sentences) > 0:
                parser = candidate
                break
        except Exception:
            # Tokenizer unavailable or parse failure — try the next language.
            continue

    if parser is None:
        # Last resort: naive split on terminal punctuation.
        sentences = re.split(r'(?<=[.!?])\s+', text)
        return sentences[:sentence_count]

    summarizer = TextRankSummarizer()
    summary_sentences = summarizer(parser.document, sentence_count)
    summary_list = [str(sentence) for sentence in summary_sentences]
    summary_list = remove_duplicates(summary_list)
    # TextRank returns sentences by salience; restore original document order.
    summary_list.sort(key=lambda s: text.find(s))
    return summary_list


def rewrite_with_llm(sentences, model_choice):
    """Rewrite the summary *sentences* more naturally with the chosen model.

    Donut is image-only, so for that choice the joined summary is returned
    verbatim — checked BEFORE load_model so we never pay for loading a
    model whose pipeline would go unused.
    """
    joined_text = "\n".join(sentences)
    if model_choice == "CLOVA-Donut-CORDv2":
        # CLOVA Donut is originally image-only; echo the text input as-is.
        return joined_text

    llm_pipeline = load_model(MODEL_OPTIONS[model_choice])
    prompt = f"""다음 문장을 의미는 유지하되, 원문에 없는 내용은 절대 추가하지 말고, 문장만 더 자연스럽게 바꿔주세요. 다른 설명이나 부연 문장은 쓰지 마세요.

문장:
{joined_text}
"""
    # Greedy decoding: do_sample=False; temperature is ignored (and warned
    # about) in that mode, so it is intentionally not passed.
    result = llm_pipeline(prompt, max_new_tokens=150, do_sample=False)
    generated = result[0]["generated_text"]
    # The pipeline echoes the prompt at the start; strip it as a prefix only
    # (a blanket .replace() could delete matching text inside the answer).
    if generated.startswith(prompt):
        generated = generated[len(prompt):]
    return generated.strip()


def extract_summarize_paraphrase(url, model_choice):
    """Full pipeline: fetch *url*, extract body text, summarize, rewrite.

    Returns (markdown body, summary text, rewritten text); on any failure
    returns an error message triple instead of raising.
    """
    headers = {"User-Agent": "Mozilla/5.0"}
    try:
        r = requests.get(url, headers=headers, timeout=10)
        r.raise_for_status()

        html_content = trafilatura.extract(
            r.text,
            output_format="html",
            include_tables=False,
            favor_recall=True,
        )
        # Fall back to converting the raw page when extraction finds nothing.
        if not html_content:
            markdown_text = md(r.text, heading_style="ATX")
        else:
            markdown_text = md(html_content, heading_style="ATX")

        summary_sentences = summarize_text(markdown_text)
        if not summary_sentences:
            summary_sentences = ["요약 없음"]

        paraphrased_text = rewrite_with_llm(summary_sentences, model_choice)
        return (
            markdown_text or "본문 없음",
            "\n".join(summary_sentences),
            paraphrased_text,
        )
    except Exception as e:
        return f"에러 발생: {e}", "요약 없음", "재작성 없음"


# ===== Gradio UI =====
iface = gr.Interface(
    fn=extract_summarize_paraphrase,
    inputs=[
        gr.Textbox(label="URL 입력", placeholder="https://example.com"),
        gr.Dropdown(
            choices=list(MODEL_OPTIONS.keys()),
            value="Qwen2.5-1.5B-Instruct",
            label="재작성 모델 선택",
        ),
    ],
    outputs=[
        gr.Markdown(label="추출된 본문"),
        gr.Textbox(label="자동 요약", lines=5),
        gr.Textbox(label="자동 재작성 (LLM)", lines=5),
    ],
    title="한국어 본문 추출 + 자동 요약 + LLM 재작성",
    description="Qwen 1.5B 또는 CLOVA Donut(CORDv2)로 재작성",
)

if __name__ == "__main__":
    iface.launch()