# moro_text_image / app.py
# orgoflu's picture
# Update app.py
# e97007d verified
import nltk
nltk.download("punkt")
nltk.download("punkt_tab")
import gradio as gr
import trafilatura
import requests
from markdownify import markdownify as md
from sumy.parsers.plaintext import PlaintextParser
from sumy.nlp.tokenizers import Tokenizer
from sumy.summarizers.text_rank import TextRankSummarizer
import re
def summarize_text(text, sentence_count=3, *, language="english"):
    """Return the sentences of *text* selected by sumy's TextRank summarizer.

    Args:
        text: Plain text to summarize.
        sentence_count: Number of sentences requested from the summarizer.
        language: Language name passed to sumy's ``Tokenizer``. Defaults to
            ``"english"``.

    Returns:
        A list of sentence strings. Empty when *text* is empty or contains
        only whitespace.
    """
    # Nothing to rank: skip building the tokenizer and the parser.
    if not text or not text.strip():
        return []

    parser = PlaintextParser.from_string(text, Tokenizer(language))
    summarizer = TextRankSummarizer()
    return [str(sentence) for sentence in summarizer(parser.document, sentence_count)]
def paraphrase_text(sentences):
    """Produce a crude rule-based paraphrase of each sentence.

    Two steps are applied to every sentence:

    1. Fixed synonym substitution from a small Korean word table.
    2. If the sentence contains "는", the text before its first occurrence
       and the text after it are swapped and joined with an em dash.

    Args:
        sentences: Iterable of sentence strings.

    Returns:
        A new list with one paraphrased string per input sentence, in the
        same order.
    """
    # Simple local paraphrase: synonym substitution + sentence-structure
    # change (runs without an LLM; quality is basic).
    replacements = {
        "노조": "노동조합",
        "성과급": "성과 보상금",
        "요구": "요청",
        "불만": "이의 제기",
        "합의안": "협상 결과안",
    }
    marker = "는"

    paraphrased = []
    for sentence in sentences:
        rewritten = sentence
        # Plain substring replacement: the table holds literal words, so
        # neither side should be interpreted as a regex or a template.
        for old, new in replacements.items():
            rewritten = rewritten.replace(old, new)

        # Example of a slight word-order change: split at the first marker.
        head, found, tail = rewritten.partition(marker)
        head, tail = head.strip(), tail.strip()
        # Reorder only when there is text on both sides; otherwise the
        # result would start or end with a dangling dash.
        if found and head and tail:
            rewritten = f"{tail} — {head}{marker}"

        paraphrased.append(rewritten)
    return paraphrased
def extract_summarize_paraphrase(url):
    """Fetch a web page and return its main text, a summary and a paraphrase.

    Args:
        url: Address of the page to fetch, as entered by the user.

    Returns:
        A 3-tuple of strings ``(markdown_text, summary, paraphrase)``.
        ``summary`` and ``paraphrase`` hold one sentence per line. When
        extraction yields nothing or any step raises, the first element is a
        user-facing message and the other two are empty strings.
    """
    headers = {"User-Agent": "Mozilla/5.0"}
    try:
        # NOTE(security): `url` comes straight from the UI and is fetched
        # from the server side; restrict allowed hosts here if this runs
        # somewhere internal addresses are reachable.
        r = requests.get((url or "").strip(), headers=headers, timeout=10)
        r.raise_for_status()

        # Without a charset in Content-Type, requests decodes text/* bodies
        # as ISO-8859-1, which garbles non-Latin pages. Use the encoding
        # detected from the body instead.
        if "charset" not in r.headers.get("Content-Type", "").lower():
            r.encoding = r.apparent_encoding

        html_content = trafilatura.extract(
            r.text,
            output_format="html",
            include_tables=True,
            favor_recall=True,
        )
        if not html_content:
            # "Could not extract the main content."
            return "본문을 추출할 수 없습니다.", "", ""

        markdown_text = md(html_content, heading_style="ATX")
        summary_sentences = summarize_text(markdown_text, sentence_count=3)
        paraphrased_sentences = paraphrase_text(summary_sentences)
        return (
            markdown_text,
            "\n".join(summary_sentences),
            "\n".join(paraphrased_sentences),
        )
    except Exception as e:
        # UI boundary: show the failure in the first output pane instead of
        # letting the exception propagate. Message reads "Error occurred: ...".
        return f"에러 발생: {e}", "", ""
iface = gr.Interface(
fn=extract_summarize_paraphrase,
inputs=gr.Textbox(label="URL ์ž…๋ ฅ", placeholder="https://example.com"),
outputs=[
gr.Markdown(label="์ถ”์ถœ๋œ ๋ณธ๋ฌธ"),
gr.Textbox(label="์ž๋™ ์š”์•ฝ", lines=5),
gr.Textbox(label="์ž๋™ ์žฌ์ž‘์„ฑ", lines=5)
],
title="๋ณธ๋ฌธ ์ถ”์ถœ๊ธฐ + ์ž๋™ ์š”์•ฝ + ์ž๋™ ์žฌ์ž‘์„ฑ",
description="์›นํŽ˜์ด์ง€ URL์„ ์ž…๋ ฅํ•˜๋ฉด ๋ณธ๋ฌธ์„ ์ถ”์ถœํ•˜๊ณ , 3๋ฌธ์žฅ ์š”์•ฝ๊ณผ ์žฌ์ž‘์„ฑ(Paraphrasing) ๊ฒฐ๊ณผ๋ฅผ ์ œ๊ณตํ•ฉ๋‹ˆ๋‹ค."
)
if __name__ == "__main__":
iface.launch()