# orgoflu's picture
# Update app.py
# bba79ad verified
import re
import requests
import trafilatura
from bs4 import BeautifulSoup
import gradio as gr
# Optional TextRank summarizer from the `summa` package.
try:
    from summa.summarizer import summarize as textrank_summarize
except Exception:
    # Any failure while importing just switches the TextRank path off.
    HAS_SUMMA = False
else:
    HAS_SUMMA = True

# Request headers passed to requests.get() by fetch_html.
DEFAULT_HEADERS = {"User-Agent": "Mozilla/5.0"}
# -----------------------------
# Fetch
# -----------------------------
def fetch_html(url: str, timeout: int = 12) -> str:
    """Download *url* and return the response body decoded as text.

    Args:
        url: Address of the page to fetch.
        timeout: Timeout in seconds handed to ``requests.get``.

    Returns:
        The decoded response body.

    Raises:
        requests.RequestException: For connection problems, timeouts,
            malformed URLs, or a 4xx/5xx status (via ``raise_for_status``).
    """
    response = requests.get(url, headers=DEFAULT_HEADERS, timeout=timeout)
    response.raise_for_status()

    # When the server declares no charset, requests falls back to ISO-8859-1
    # for text/* responses, which garbles UTF-8 (e.g. Korean) pages. In that
    # case decode with the encoding detected from the body instead.
    content_type = response.headers.get("Content-Type", "")
    if "charset" not in content_type.lower():
        response.encoding = response.apparent_encoding or response.encoding

    return response.text
# -----------------------------
# HTML -> Text (preserve code blocks)
# -----------------------------
def html_to_text_preserve_code(html: str) -> str:
    """Convert an HTML page to plain text while keeping code readable.

    Steps:
      1) Use trafilatura to obtain HTML that contains only the main content.
      2) Rewrite <pre>, <pre><code> and inline <code> as Markdown code
         notation (fenced blocks and `...`).
      3) Strip the remaining tags and return the text with line breaks kept.

    Args:
        html: Raw HTML of the page.

    Returns:
        The extracted text, or a fixed message string when trafilatura
        returns nothing.
    """
    extracted = trafilatura.extract(
        html,
        output_format="html",
        include_tables=True,
        favor_recall=True,
    )
    if not extracted:
        return "๋ณธ๋ฌธ์„ ์ถ”์ถœํ•  ์ˆ˜ ์—†์Šต๋‹ˆ๋‹ค."

    soup = BeautifulSoup(extracted, "html.parser")

    # <pre> (including a nested <code>) -> fenced block
    for pre in soup.find_all("pre"):
        code_tag = pre.find("code")
        code_text = code_tag.get_text() if code_tag else pre.get_text()
        code_text = code_text.replace("\r\n", "\n")
        # Collapse runs of blank lines and drop leading/trailing newlines.
        code_text = re.sub(r"\n{3,}", "\n\n", code_text).strip("\n")
        pre.replace_with(f"\n```\n{code_text}\n```\n")

    # inline <code> -> `...` (backticks inside the snippet are escaped)
    for code in soup.find_all("code"):
        escaped = code.get_text().replace("`", "\\`")
        code.replace_with(f"`{escaped}`")

    # replace_with() leaves each inserted string as a separate node next to
    # its neighbours, and get_text("\n") puts the separator between every
    # node. Merging adjacent strings first keeps inline code on the same line
    # as the sentence around it. (smooth() needs Beautiful Soup >= 4.8.)
    soup.smooth()

    # strip tags, keep newlines
    text_output = soup.get_text("\n")
    text_output = re.sub(r"\n{3,}", "\n\n", text_output).strip()
    return text_output
# -----------------------------
# Sentence splitting (basic, lang-agnostic-ish)
# -----------------------------
# Split on whitespace that follows sentence-ending punctuation, or on any run
# of newlines. The full-width marks (U+3002, U+FF01, U+FF1F) are written as
# escapes so the pattern survives an encoding mix-up of this source file.
_SENT_SPLIT_REGEX = re.compile(r"(?<=[.!?\u3002\uff01\uff1f])\s+|\n+")


def split_sentences(text: str):
    """Split *text* into sentences with a simple punctuation-based rule.

    Handles ``.``, ``!``, ``?``, their full-width CJK counterparts, and
    newlines. This is a quick heuristic, not a linguistic tokenizer.

    Args:
        text: Text to split.

    Returns:
        A list of non-empty, whitespace-trimmed sentence strings.
    """
    pieces = (piece.strip() for piece in _SENT_SPLIT_REGEX.split(text))
    return [piece for piece in pieces if piece]
# -----------------------------
# Summarize
# -----------------------------
def summarize_text(text: str, max_sentences: int = 3) -> str:
    """Return a short extractive summary of *text*.

    TextRank (via ``summa``) is tried first when it was importable; if it is
    unavailable, fails, or returns nothing, the first sentences of the text
    are used instead.

    Args:
        text: Text to summarize. ``None`` or blank text yields ``""``.
        max_sentences: Maximum number of sentences to keep. Zero or a
            negative value yields ``""``.

    Returns:
        The selected sentences joined by newlines, or ``""``.
    """
    text = (text or "").strip()
    # A negative count would slice from the end of the list and return almost
    # the whole text, so treat every non-positive count as "nothing".
    if not text or max_sentences <= 0:
        return ""

    # Try TextRank via summa first
    if HAS_SUMMA:
        try:
            # Original author's notes: ratio is an approximate length ratio
            # (use the sentences option if the result is too short); summa
            # works for Korean to some extent, but quality depends on the text.
            candidate = textrank_summarize(text, split=True)
            if candidate:
                # Keep the first max_sentences entries summa returned.
                return "\n".join(candidate[:max_sentences]).strip()
        except Exception:
            # Best effort: any summa failure falls through to the fallback.
            pass

    # Fallback: Lead-N sentences
    sents = split_sentences(text)
    if not sents:
        # last resort, truncate
        return text[:800]
    return "\n".join(sents[:max_sentences]).strip()
# -----------------------------
# Handlers
# -----------------------------
def handle_html(url: str) -> str:
    """Button callback: return the raw HTML of *url*.

    A blank URL or any exception raised while fetching is reported as a
    message string instead of being raised.
    """
    target = url.strip() if url else ""
    if target == "":
        return "โŒ URL์„ ์ž…๋ ฅํ•˜์„ธ์š”."

    try:
        page = fetch_html(target)
    except Exception as exc:
        return f"์—๋Ÿฌ: {exc}"
    return page
def handle_text(url: str) -> str:
    """Button callback: return the page at *url* as text with code preserved.

    A blank URL or any exception raised while fetching or converting is
    reported as a message string instead of being raised.
    """
    cleaned = (url or "").strip()
    if len(cleaned) == 0:
        return "โŒ URL์„ ์ž…๋ ฅํ•˜์„ธ์š”."

    try:
        return html_to_text_preserve_code(fetch_html(cleaned))
    except Exception as err:
        return f"์—๋Ÿฌ: {err}"
def handle_summary(url: str, sent_n: int) -> str:
    """Button callback: fetch *url*, extract its text, and summarize it.

    ``sent_n`` is converted with ``int()`` and passed to ``summarize_text``
    as the sentence limit. A blank URL, a failed extraction, an empty
    summary, or any exception is reported as a message string.
    """
    address = (url or "").strip()
    if address == "":
        return "โŒ URL์„ ์ž…๋ ฅํ•˜์„ธ์š”."

    try:
        body = html_to_text_preserve_code(fetch_html(address))

        # Hand back an empty result or the extractor's failure message as-is.
        extraction_failed = body.startswith("๋ณธ๋ฌธ์„ ์ถ”์ถœํ•  ์ˆ˜ ์—†์Šต๋‹ˆ๋‹ค.") if body else True
        if extraction_failed:
            return body

        digest = summarize_text(body, max_sentences=int(sent_n))
        if digest:
            return f"๐Ÿ“ ์ž๋™์š”์•ฝ ({sent_n}๋ฌธ์žฅ)\n\n{digest}"
        return "์š”์•ฝ์„ ์ƒ์„ฑํ•  ์ˆ˜ ์—†์Šต๋‹ˆ๋‹ค."
    except Exception as exc:
        return f"์—๋Ÿฌ: {exc}"
# -----------------------------
# UI
# -----------------------------
# Build the Gradio interface: a URL box, three action buttons, a sentence
# count slider, and one output box shared by all three actions.
# NOTE(review): the original indentation was lost; the nesting of the
# gr.Row() groups below is inferred -- confirm against the intended layout.
with gr.Blocks(css="""
#container { max-width: 920px; margin: 0 auto; }
.small { color:#666; font-size:14px; }
""") as demo:
    # Heading; elem_id matches the #container rule in the CSS above.
    gr.Markdown("## ๋งํฌ ์ž…๋ ฅ โ†’ ์›๋ณธ HTML / ํ…์ŠคํŠธ(์ฝ”๋“œ๋ธ”๋Ÿญ ๋ณด์กด) / ์ž๋™์š”์•ฝ", elem_id="container")
    with gr.Row():
        url_input = gr.Textbox(label="URL", placeholder="https://example.com", scale=4)
    # Hint text styled by the .small CSS class.
    gr.Markdown('<div class="small">URL์„ ์ž…๋ ฅํ•˜๊ณ  ์›ํ•˜๋Š” ๋™์ž‘ ๋ฒ„ํŠผ์„ ๋ˆ„๋ฅด์„ธ์š”.</div>')
    with gr.Row():
        btn_html = gr.Button("์›๋ณธ HTML ๋ณด๊ธฐ", scale=1)
        btn_text = gr.Button("ํ…์ŠคํŠธ ๋ณด๊ธฐ (์ฝ”๋“œ๋ธ”๋Ÿญ ๋ณด์กด)", scale=1)
    with gr.Row():
        # Sentence limit for the summary: 1 to 8, default 3.
        sent_n = gr.Slider(1, 8, value=3, step=1, label="์š”์•ฝ ๋ฌธ์žฅ ์ˆ˜")
        btn_sum = gr.Button("์ž๋™์š”์•ฝ ๋ณด๊ธฐ", scale=1)
    output = gr.Textbox(label="๊ฒฐ๊ณผ", lines=26, show_copy_button=True)
    # Every button writes its result into the same output box.
    btn_html.click(fn=handle_html, inputs=url_input, outputs=output)
    btn_text.click(fn=handle_text, inputs=url_input, outputs=output)
    btn_sum.click(fn=handle_summary, inputs=[url_input, sent_n], outputs=output)

# Start the app only when this file is run as a script.
if __name__ == "__main__":
    demo.launch()