moro_text_4 / app.py
orgoflu's picture
Update app.py
00e61ba verified
import nltk
# Automatically download the NLTK tokenizer resources
nltk.download("punkt")
nltk.download("punkt_tab")
import gradio as gr
import trafilatura
import requests
from markdownify import markdownify as md
from sumy.parsers.plaintext import PlaintextParser
from sumy.nlp.tokenizers import Tokenizer
from sumy.summarizers.text_rank import TextRankSummarizer
def summarize_text(text, sentence_count=3):
    """Return a TextRank summary of *text*, one selected sentence per line.

    Args:
        text: Plain text to summarize.
        sentence_count: Number of sentences requested from the summarizer.

    Returns:
        The selected sentences joined with newline characters.
    """
    # The "english" tokenizer is used on purpose so that Korean text is
    # also split into sentence units.
    tokenizer = Tokenizer("english")
    document = PlaintextParser.from_string(text, tokenizer).document
    rank_sentences = TextRankSummarizer()
    picked = rank_sentences(document, sentence_count)
    return "\n".join(map(str, picked))
def extract_and_summarize(url):
    """Fetch a web page, extract its main content as Markdown, and summarize it.

    Args:
        url: Address of the page to fetch. Surrounding whitespace is ignored.

    Returns:
        A ``(markdown_text, summary)`` tuple. On any failure the first
        element is a Korean error message meant for display in the UI and
        the second element is an empty string; no exception propagates.
    """
    headers = {"User-Agent": "Mozilla/5.0"}
    try:
        # NOTE(review): the URL is user-supplied and fetched as-is; if this
        # app is reachable by untrusted users, consider restricting targets.
        r = requests.get((url or "").strip(), headers=headers, timeout=10)
        r.raise_for_status()

        # When the server declares no charset, requests falls back to
        # ISO-8859-1 for text/* responses, which garbles UTF-8 / EUC-KR pages.
        # Let requests detect the encoding from the body in that case.
        if "charset" not in r.headers.get("Content-Type", "").lower():
            r.encoding = r.apparent_encoding

        # Extract the main content as HTML.
        html_content = trafilatura.extract(
            r.text,
            output_format="html",
            include_tables=True,
            favor_recall=True,
        )
        if not html_content:
            # "The main content could not be extracted."
            return "본문을 추출할 수 없습니다.", ""

        # Convert HTML to Markdown.
        markdown_text = md(html_content, heading_style="ATX")

        # Build the summary.
        summary = summarize_text(markdown_text, sentence_count=3)
        return markdown_text, summary
    except requests.exceptions.Timeout:
        # "The request timed out."
        return "요청이 시간 초과되었습니다.", ""
    except requests.exceptions.RequestException as e:
        # "Request failed: ..."
        return f"요청 실패: {e}", ""
    except Exception as e:
        # UI boundary: report any other failure instead of crashing the app.
        # "An error occurred: ..."
        return f"에러 발생: {e}", ""
iface = gr.Interface(
fn=extract_and_summarize,
inputs=gr.Textbox(label="URL ์ž…๋ ฅ", placeholder="https://example.com"),
outputs=[
gr.Markdown(label="์ถ”์ถœ๋œ ๋ณธ๋ฌธ"),
gr.Textbox(label="์ž๋™ ์š”์•ฝ", lines=5)
],
title="๋ณธ๋ฌธ ์ถ”์ถœ๊ธฐ + ์ž๋™ ์š”์•ฝ",
description="์›นํŽ˜์ด์ง€ URL์„ ์ž…๋ ฅํ•˜๋ฉด ๋ณธ๋ฌธ์„ ์ถ”์ถœํ•˜๊ณ , TextRank ์•Œ๊ณ ๋ฆฌ์ฆ˜์œผ๋กœ 3๋ฌธ์žฅ ์š”์•ฝ์„ ์ œ๊ณตํ•ฉ๋‹ˆ๋‹ค."
)
if __name__ == "__main__":
iface.launch()