Spaces:
Sleeping
Sleeping
File size: 5,552 Bytes
2382c2d 1ae79cc 2382c2d 1ae79cc bba79ad 2382c2d bba79ad 2382c2d bba79ad 2382c2d bba79ad 2382c2d bba79ad 2382c2d bba79ad 2382c2d bba79ad 2382c2d bba79ad 2382c2d 1a41c9a 1ae79cc bba79ad 2382c2d 1ae79cc 2382c2d bba79ad 2382c2d bba79ad 1ae79cc 2382c2d bba79ad 2382c2d bba79ad 2382c2d bba79ad 1a41c9a 2382c2d bba79ad 2382c2d bba79ad 1a41c9a 2382c2d bba79ad 1ae79cc 1a41c9a |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 |
import re
import requests
import trafilatura
from bs4 import BeautifulSoup
import gradio as gr
# TextRank summarizer (summa): optional dependency. If the import fails for any
# reason, HAS_SUMMA is False and summarize_text() falls back to lead-N sentences.
try:
    from summa.summarizer import summarize as textrank_summarize
    HAS_SUMMA = True
except Exception:
    # Any import failure (package missing, broken install) just disables TextRank.
    HAS_SUMMA = False

# Minimal browser-like User-Agent so simple bot filters don't reject requests.
DEFAULT_HEADERS = {"User-Agent": "Mozilla/5.0"}
# -----------------------------
# Fetch
# -----------------------------
def fetch_html(url: str, timeout: int = 12) -> str:
    """Download *url* and return the response body as text.

    Sends DEFAULT_HEADERS; raises requests.HTTPError on non-2xx status
    and requests exceptions on network/timeout failures.
    """
    response = requests.get(url, headers=DEFAULT_HEADERS, timeout=timeout)
    response.raise_for_status()
    return response.text
# -----------------------------
# HTML -> Text (preserve code blocks)
# -----------------------------
def html_to_text_preserve_code(html: str) -> str:
    """Extract the main article text from *html*, keeping code readable.

    Pipeline:
      1. trafilatura strips boilerplate and returns body-only HTML.
      2. <pre> (with or without a nested <code>) becomes a fenced ``` block;
         remaining inline <code> becomes a backtick span.
      3. All other tags are dropped while newline structure is preserved.
    """
    body_html = trafilatura.extract(
        html,
        output_format="html",
        include_tables=True,
        favor_recall=True,
    )
    if not body_html:
        return "๋ณธ๋ฌธ์ ์ถ์ถํ ์ ์์ต๋๋ค."

    soup = BeautifulSoup(body_html, "html.parser")

    # Handle <pre> first so nested <code> tags are consumed with their <pre>
    # and never reach the inline-code pass below.
    for pre_tag in soup.find_all("pre"):
        inner = pre_tag.find("code")
        snippet = (inner or pre_tag).get_text()
        snippet = snippet.replace("\r\n", "\n")
        snippet = re.sub(r"\n{3,}", "\n\n", snippet).strip("\n")
        pre_tag.replace_with(f"\n```\n{snippet}\n```\n")

    # Remaining <code> tags are inline; escape stray backticks so the
    # surrounding Markdown span stays intact.
    for inline in soup.find_all("code"):
        escaped = inline.get_text().replace("`", "\\`")
        inline.replace_with(f"`{escaped}`")

    # Drop the leftover markup but keep line structure; collapse blank runs.
    flattened = soup.get_text("\n")
    return re.sub(r"\n{3,}", "\n\n", flattened).strip()
# -----------------------------
# Sentence splitting (basic, lang-agnostic-ish)
# -----------------------------
_SENT_SPLIT_REGEX = re.compile(r"(?<=[\.!\?ใ๏ผ๏ผ])\s+|\n+")


def split_sentences(text: str):
    """Split *text* into trimmed, non-empty sentence-ish chunks.

    A boundary is whitespace that follows ./!/? (or their CJK lookalikes in
    the character class) or any run of newlines — crude, but good enough for
    picking lead sentences.
    """
    return [chunk.strip() for chunk in _SENT_SPLIT_REGEX.split(text) if chunk.strip()]
# -----------------------------
# Summarize
# -----------------------------
def summarize_text(text: str, max_sentences: int = 3) -> str:
    """Return an extractive summary of *text*, at most *max_sentences* lines.

    Strategy: try TextRank (summa) when available; on failure or absence fall
    back to the leading sentences, and as a last resort to an 800-character
    truncation. Returns "" for empty input.
    """
    text = (text or "").strip()
    if not text:
        return ""
    # Try TextRank via summa first.
    if HAS_SUMMA:
        try:
            # summa ranks sentences internally; quality varies by language
            # (presumably weaker on Korean — any failure falls through below).
            candidate = textrank_summarize(text, split=True)
            if candidate:
                # Keep only the top-N sentences from the ranked list.
                return "\n".join(candidate[:max_sentences]).strip()
        except Exception:
            pass  # deliberate best-effort: fall back to lead-N on any error
    # Fallback: lead-N sentences.
    sents = split_sentences(text)
    if not sents:
        # Last resort: plain truncation of the raw text.
        return text[:800]
    return "\n".join(sents[:max_sentences]).strip()
# -----------------------------
# Handlers
# -----------------------------
def handle_html(url: str) -> str:
    """Gradio handler: fetch *url* and return its raw HTML, or an error string."""
    url = (url or "").strip()
    if not url:
        # "Please enter a URL" (text kept in the source's original encoding)
        return "โ URL์ ์๋ ฅํ์ธ์."
    try:
        return fetch_html(url)
    except Exception as e:
        # Surface any network/HTTP failure as a user-visible message.
        return f"์๋ฌ: {e}"
def handle_text(url: str) -> str:
    """Gradio handler: fetch *url* and return its extracted text with code
    blocks preserved, or an error string."""
    url = (url or "").strip()
    if not url:
        # "Please enter a URL" (text kept in the source's original encoding)
        return "โ URL์ ์๋ ฅํ์ธ์."
    try:
        html = fetch_html(url)
        return html_to_text_preserve_code(html)
    except Exception as e:
        # Surface any network/extraction failure as a user-visible message.
        return f"์๋ฌ: {e}"
def handle_summary(url: str, sent_n: int) -> str:
    """Gradio handler: fetch *url*, extract its text, and return an
    extractive summary of at most *sent_n* sentences (or an error string)."""
    url = (url or "").strip()
    if not url:
        # "Please enter a URL" (text kept in the source's original encoding)
        return "โ URL์ ์๋ ฅํ์ธ์."
    try:
        html = fetch_html(url)
        text = html_to_text_preserve_code(html)
        # Pass extraction failures straight through to the UI unchanged.
        if not text or text.startswith("๋ณธ๋ฌธ์ ์ถ์ถํ ์ ์์ต๋๋ค."):
            return text
        summary = summarize_text(text, max_sentences=int(sent_n))
        if not summary:
            return "์์ฝ์ ์์ฑํ ์ ์์ต๋๋ค."
        return f"๐ ์๋์์ฝ ({sent_n}๋ฌธ์ฅ)\n\n{summary}"
    except Exception as e:
        return f"์๋ฌ: {e}"
# -----------------------------
# UI
# -----------------------------
with gr.Blocks(css="""
#container { max-width: 920px; margin: 0 auto; }
.small { color:#666; font-size:14px; }
""") as demo:
    # Heading: "Link input -> raw HTML / text (code blocks preserved) / summary"
    gr.Markdown("## ๋งํฌ ์๋ ฅ โ ์๋ณธ HTML / ํ์คํธ(์ฝ๋๋ธ๋ญ ๋ณด์กด) / ์๋์์ฝ", elem_id="container")
    with gr.Row():
        url_input = gr.Textbox(label="URL", placeholder="https://example.com", scale=4)
        gr.Markdown('<div class="small">URL์ ์๋ ฅํ๊ณ ์ํ๋ ๋์ ๋ฒํผ์ ๋๋ฅด์ธ์.</div>')
    with gr.Row():
        btn_html = gr.Button("์๋ณธ HTML ๋ณด๊ธฐ", scale=1)
        btn_text = gr.Button("ํ์คํธ ๋ณด๊ธฐ (์ฝ๋๋ธ๋ญ ๋ณด์กด)", scale=1)
    with gr.Row():
        sent_n = gr.Slider(1, 8, value=3, step=1, label="์์ฝ ๋ฌธ์ฅ ์")
        btn_sum = gr.Button("์๋์์ฝ ๋ณด๊ธฐ", scale=1)
    output = gr.Textbox(label="๊ฒฐ๊ณผ", lines=26, show_copy_button=True)

    # Wire each button to its handler; all three write into the same output box.
    btn_html.click(fn=handle_html, inputs=url_input, outputs=output)
    btn_text.click(fn=handle_text, inputs=url_input, outputs=output)
    btn_sum.click(fn=handle_summary, inputs=[url_input, sent_n], outputs=output)

if __name__ == "__main__":
    demo.launch()