Spaces:
Sleeping
Sleeping
| import nltk | |
| nltk.download("punkt") | |
| import gradio as gr | |
| import trafilatura | |
| import requests | |
| from markdownify import markdownify as md | |
| from sumy.parsers.plaintext import PlaintextParser | |
| from sumy.nlp.tokenizers import Tokenizer | |
| from sumy.summarizers.text_rank import TextRankSummarizer | |
| import re | |
| import torch | |
| from transformers import AutoModelForCausalLM, AutoTokenizer, pipeline, AutoModelForVision2Seq | |
# ===== The two selectable models =====
# Maps the display name shown in the UI dropdown -> Hugging Face repo id.
MODEL_OPTIONS = {
    "Qwen2.5-1.5B-Instruct": "Qwen/Qwen2.5-1.5B-Instruct",
    "CLOVA-Donut-CORDv2": "naver-clova-ix/donut-base-finetuned-cord-v2"
}
# ===== Model loading =====
# Per-process cache: without it, every rewrite request reloads a multi-GB
# checkpoint from disk (rewrite_with_llm calls load_model on each call).
_PIPELINE_CACHE = {}

def load_model(model_name):
    """Return a transformers pipeline for *model_name*, cached per process.

    The Donut checkpoint is a vision2seq model and gets an "image-to-text"
    pipeline; any other repo id is treated as a causal LM and wrapped in a
    CPU "text-generation" pipeline.
    """
    cached = _PIPELINE_CACHE.get(model_name)
    if cached is not None:
        return cached
    if model_name == "naver-clova-ix/donut-base-finetuned-cord-v2":
        tokenizer = AutoTokenizer.from_pretrained(model_name)
        model = AutoModelForVision2Seq.from_pretrained(model_name)
        pipe = pipeline("image-to-text", model=model, tokenizer=tokenizer)
    else:
        tokenizer = AutoTokenizer.from_pretrained(model_name, trust_remote_code=True)
        model = AutoModelForCausalLM.from_pretrained(
            model_name,
            torch_dtype=torch.float32,  # CPU-friendly dtype
            trust_remote_code=True
        ).to("cpu")
        pipe = pipeline("text-generation", model=model, tokenizer=tokenizer, device=-1)
    _PIPELINE_CACHE[model_name] = pipe
    return pipe
# ===== Text preprocessing =====
def clean_text(text: str) -> str:
    """Collapse every whitespace run to a single space and trim the ends."""
    collapsed = re.sub(r'\s+', ' ', text)
    return collapsed.strip()
def remove_duplicates(sentences):
    """Strip each sentence and keep only the first occurrence of each
    non-empty one, preserving input order."""
    kept = []
    seen = set()
    for raw in sentences:
        stripped = raw.strip()
        if not stripped or stripped in seen:
            continue
        seen.add(stripped)
        kept.append(stripped)
    return kept
# ===== Automatic summarization =====
def _parse_document(text, language):
    """Build a sumy plaintext parser for *text* with the given tokenizer
    language; return None when tokenization fails or yields no sentences."""
    try:
        parser = PlaintextParser.from_string(text, Tokenizer(language))
        if len(parser.document.sentences) == 0:
            return None
        return parser
    # Narrowed from a bare "except:" so KeyboardInterrupt/SystemExit
    # are no longer swallowed.
    except Exception:
        return None


def summarize_text(text):
    """TextRank-summarize *text* into 1-4 sentences scaled by length.

    Tries a Korean tokenizer first, then English; if both fail, falls back
    to a naive split on sentence-final punctuation. Returns a list of
    summary sentences in their original document order.
    """
    text = clean_text(text)
    # Longer inputs get proportionally more summary sentences.
    length = len(text)
    if length < 300:
        sentence_count = 1
    elif length < 800:
        sentence_count = 2
    elif length < 1500:
        sentence_count = 3
    else:
        sentence_count = 4
    parser = _parse_document(text, "korean") or _parse_document(text, "english")
    if parser is None:
        # Last-resort fallback: regex split on ., ! or ? followed by space.
        sentences = re.split(r'(?<=[.!?])\s+', text)
        return sentences[:sentence_count]
    summarizer = TextRankSummarizer()
    summary_sentences = summarizer(parser.document, sentence_count)
    summary_list = remove_duplicates([str(sentence) for sentence in summary_sentences])
    # TextRank returns sentences ranked by salience; restore document order.
    summary_list.sort(key=lambda s: text.find(s))
    return summary_list
# ===== LLM rewriting =====
def rewrite_with_llm(sentences, model_choice):
    """Paraphrase *sentences* (a list of strings) with the selected model.

    The Donut model is image-only, so its branch returns the joined input
    text unchanged. Otherwise the joined text is rewritten by a causal LM
    using a greedy decode.
    """
    model_name = MODEL_OPTIONS[model_choice]
    llm_pipeline = load_model(model_name)
    joined_text = "\n".join(sentences)
    if model_choice == "CLOVA-Donut-CORDv2":
        # CLOVA Donut expects images; pass the text through untouched.
        return joined_text
    prompt = f"""๋ค์ ๋ฌธ์ฅ์ ์๋ฏธ๋ ์ ์งํ๋, ์๋ฌธ์ ์๋ ๋ด์ฉ์ ์ ๋ ์ถ๊ฐํ์ง ๋ง๊ณ ,
๋ฌธ์ฅ๋ง ๋ ์์ฐ์ค๋ฝ๊ฒ ๋ฐ๊ฟ์ฃผ์ธ์. ๋ค๋ฅธ ์ค๋ช ์ด๋ ๋ถ์ฐ ๋ฌธ์ฅ์ ์ฐ์ง ๋ง์ธ์.
๋ฌธ์ฅ:
{joined_text}
"""
    # do_sample=False is a greedy decode; passing temperature=0 alongside it
    # is ignored and triggers warnings (or a ValueError in newer transformers),
    # so it was dropped. return_full_text=False yields only the newly
    # generated tokens, which is safer than the old str.replace(prompt, "")
    # that would also delete the prompt text if the model echoed it mid-answer.
    result = llm_pipeline(
        prompt,
        max_new_tokens=150,
        do_sample=False,
        return_full_text=False,
    )
    return result[0]["generated_text"].strip()
# ===== Full pipeline =====
def extract_summarize_paraphrase(url, model_choice):
    """Fetch *url*, extract the article body as markdown, summarize it,
    and paraphrase the summary with the chosen model.

    Returns a (body markdown, summary text, paraphrased text) triple; any
    failure yields an error message in place of the body.
    """
    try:
        response = requests.get(
            url, headers={"User-Agent": "Mozilla/5.0"}, timeout=10
        )
        response.raise_for_status()
        extracted_html = trafilatura.extract(
            response.text,
            output_format="html",
            include_tables=False,
            favor_recall=True,
        )
        # When extraction finds nothing, convert the raw page instead.
        source_html = extracted_html if extracted_html else response.text
        markdown_text = md(source_html, heading_style="ATX")
        summary_sentences = summarize_text(markdown_text)
        if not summary_sentences:
            summary_sentences = ["์์ฝ ์์"]
        paraphrased_text = rewrite_with_llm(summary_sentences, model_choice)
        return (
            markdown_text or "๋ณธ๋ฌธ ์์",
            "\n".join(summary_sentences),
            paraphrased_text,
        )
    except Exception as e:
        return f"์๋ฌ ๋ฐ์: {e}", "์์ฝ ์์", "์ฌ์์ฑ ์์"
# ===== Gradio UI =====
# One-page interface: a URL plus a model choice in, three panes out
# (extracted body, automatic summary, LLM paraphrase).
iface = gr.Interface(
    fn=extract_summarize_paraphrase,
    inputs=[
        gr.Textbox(label="URL ์ ๋ ฅ", placeholder="https://example.com"),
        gr.Dropdown(choices=list(MODEL_OPTIONS.keys()), value="Qwen2.5-1.5B-Instruct", label="์ฌ์์ฑ ๋ชจ๋ธ ์ ํ")
    ],
    outputs=[
        gr.Markdown(label="์ถ์ถ๋ ๋ณธ๋ฌธ"),
        gr.Textbox(label="์๋ ์์ฝ", lines=5),
        gr.Textbox(label="์๋ ์ฌ์์ฑ (LLM)", lines=5)
    ],
    title="ํ๊ตญ์ด ๋ณธ๋ฌธ ์ถ์ถ + ์๋ ์์ฝ + LLM ์ฌ์์ฑ",
    description="Qwen 1.5B ๋๋ CLOVA Donut(CORDv2)๋ก ์ฌ์์ฑ"
)

# Standard entry guard: launch the app only when executed as a script.
if __name__ == "__main__":
    iface.launch()