Spaces:
Sleeping
Sleeping
| import re | |
| import requests | |
| import trafilatura | |
| from bs4 import BeautifulSoup | |
| import gradio as gr | |
# Optional dependency: summa's TextRank summarizer.
# HAS_SUMMA records whether the import worked so callers can pick a fallback.
try:
    from summa.summarizer import summarize as textrank_summarize
except Exception:
    HAS_SUMMA = False
else:
    HAS_SUMMA = True

# Headers sent with every outgoing page request.
DEFAULT_HEADERS = {"User-Agent": "Mozilla/5.0"}
| # ----------------------------- | |
| # Fetch | |
| # ----------------------------- | |
def fetch_html(url: str, timeout: int = 12) -> str:
    """Download *url* with an HTTP GET and return the body as decoded text.

    Args:
        url: Address of the page to fetch.
        timeout: Timeout in seconds handed to ``requests.get``.

    Returns:
        The response body decoded to ``str``.

    Raises:
        requests.RequestException: On connection errors, timeouts, or (via
            ``raise_for_status``) a 4xx/5xx response.
    """
    r = requests.get(url, headers=DEFAULT_HEADERS, timeout=timeout)
    r.raise_for_status()
    # Without an explicit charset, requests decodes text/* responses as
    # ISO-8859-1, which garbles UTF-8 (e.g. Korean) pages. Fall back to the
    # encoding detected from the body in that case.
    if "charset" not in r.headers.get("Content-Type", "").lower():
        r.encoding = r.apparent_encoding
    return r.text
| # ----------------------------- | |
| # HTML -> Text (preserve code blocks) | |
| # ----------------------------- | |
def html_to_text_preserve_code(html: str) -> str:
    """Turn a page's HTML into plain text while keeping code as Markdown.

    Steps:
      1. ``trafilatura.extract`` reduces the page to its main content and
         returns it as HTML (tables included, recall favoured).
      2. Each ``<pre>`` becomes a fenced block (```), using the text of its
         nested ``<code>`` when there is one; every remaining ``<code>``
         becomes a backtick-wrapped inline span with inner backticks
         backslash-escaped.
      3. All other tags are dropped with ``get_text`` using newline
         separators, and runs of three or more newlines collapse to two.

    Returns the resulting text, or a fixed message string when trafilatura
    extracts nothing.
    """
    main_html = trafilatura.extract(
        html,
        output_format="html",
        include_tables=True,
        favor_recall=True,
    )
    if not main_html:
        return "๋ณธ๋ฌธ์ ์ถ์ถํ ์ ์์ต๋๋ค."

    tree = BeautifulSoup(main_html, "html.parser")

    # Block-level code: <pre>, optionally wrapping a <code>.
    for block in tree.find_all("pre"):
        inner = block.find("code")
        snippet = (inner or block).get_text()
        snippet = snippet.replace("\r\n", "\n")
        snippet = re.sub(r"\n{3,}", "\n\n", snippet)
        snippet = snippet.strip("\n")
        fenced = "\n".join(["", "```", snippet, "```", ""])
        block.replace_with(fenced)

    # Whatever <code> tags are left are inline spans.
    for span in tree.find_all("code"):
        escaped = span.get_text().replace("`", "\\`")
        span.replace_with("`" + escaped + "`")

    # Drop the remaining markup, keeping line structure.
    plain = tree.get_text("\n")
    plain = re.sub(r"\n{3,}", "\n\n", plain)
    return plain.strip()
| # ----------------------------- | |
| # Sentence splitting (basic, lang-agnostic-ish) | |
| # ----------------------------- | |
# Split after a sentence terminator that is followed by whitespace, or on any
# run of newlines. The CJK terminators (ideographic full stop U+3002,
# full-width exclamation mark U+FF01, full-width question mark U+FF1F) are
# written as escapes so the pattern cannot be damaged by a bad re-encoding
# of this file.
_SENT_SPLIT_REGEX = re.compile(r"(?<=[.!?\u3002\uff01\uff1f])\s+|\n+")


def split_sentences(text: str):
    """Split *text* into a list of non-empty, stripped sentence strings.

    This is a quick heuristic splitter: it breaks on whitespace that follows
    ``.``, ``!``, ``?`` or their CJK counterparts, and on newlines. The
    terminator stays attached to the sentence it ends. Empty input yields
    an empty list.
    """
    pieces = (part.strip() for part in _SENT_SPLIT_REGEX.split(text))
    return [piece for piece in pieces if piece]
| # ----------------------------- | |
| # Summarize | |
| # ----------------------------- | |
def summarize_text(text: str, max_sentences: int = 3) -> str:
    """Return a short summary of *text*, one sentence per line.

    When summa is available (``HAS_SUMMA``), its TextRank summarizer is tried
    first and the first *max_sentences* of the sentences it returns are used.
    If summa is missing, returns nothing, or raises, the function falls back
    to the first *max_sentences* sentences from ``split_sentences``. Should
    the splitter yield nothing, the first 800 characters are returned.

    Empty, whitespace-only, or ``None`` input yields an empty string.
    """
    cleaned = (text or "").strip()
    if cleaned == "":
        return ""

    # Preferred path: TextRank. Any failure here is deliberately ignored so
    # the lead-sentence fallback below still produces a result.
    if HAS_SUMMA:
        try:
            ranked = textrank_summarize(cleaned, split=True)
            if ranked:
                chosen = ranked[:max_sentences]
                return "\n".join(chosen).strip()
        except Exception:
            pass

    # Fallback: leading sentences of the document.
    sentences = split_sentences(cleaned)
    if sentences:
        return "\n".join(sentences[:max_sentences]).strip()

    # Nothing to split on: plain truncation.
    return cleaned[:800]
| # ----------------------------- | |
| # Handlers | |
| # ----------------------------- | |
def handle_html(url: str) -> str:
    """UI handler: return the raw HTML of *url*.

    A blank or missing URL produces a prompt message; any exception raised
    while fetching is turned into an error-message string rather than
    propagated.
    """
    target = (url or "").strip()
    if target == "":
        return "โ URL์ ์ ๋ ฅํ์ธ์."

    try:
        page = fetch_html(target)
    except Exception as err:
        return f"์๋ฌ: {err}"
    return page
def handle_text(url: str) -> str:
    """UI handler: return the extracted text of *url* with code blocks kept.

    A blank or missing URL produces a prompt message; any exception raised
    while fetching or converting is turned into an error-message string
    rather than propagated.
    """
    target = (url or "").strip()
    if target == "":
        return "โ URL์ ์ ๋ ฅํ์ธ์."

    try:
        return html_to_text_preserve_code(fetch_html(target))
    except Exception as err:
        return f"์๋ฌ: {err}"
def handle_summary(url: str, sent_n: int) -> str:
    """UI handler: fetch *url*, extract its text, and summarize it.

    *sent_n* is converted with ``int`` and passed to ``summarize_text`` as
    the sentence limit. A blank or missing URL produces a prompt message.
    When extraction yields nothing (or its failure message), that text is
    returned unchanged. An empty summary produces a fixed failure message,
    and any exception is turned into an error-message string.
    """
    target = (url or "").strip()
    if target == "":
        return "โ URL์ ์ ๋ ฅํ์ธ์."

    try:
        body = html_to_text_preserve_code(fetch_html(target))
        # Pass the extractor's own failure message straight through.
        if not body or body.startswith("๋ณธ๋ฌธ์ ์ถ์ถํ ์ ์์ต๋๋ค."):
            return body

        digest = summarize_text(body, max_sentences=int(sent_n))
        if digest:
            return f"๐ ์๋์์ฝ ({sent_n}๋ฌธ์ฅ)\n\n{digest}"
        return "์์ฝ์ ์์ฑํ ์ ์์ต๋๋ค."
    except Exception as err:
        return f"์๋ฌ: {err}"
| # ----------------------------- | |
| # UI | |
| # ----------------------------- | |
| # Build the Gradio UI: a URL textbox, three action buttons, a sentence-count slider, and one shared output textbox. | |
| # The inline CSS defines the #container and .small rules referenced by the Markdown elements below. | |
| with gr.Blocks(css=""" | |
| #container { max-width: 920px; margin: 0 auto; } | |
| .small { color:#666; font-size:14px; } | |
| """) as demo: | |
| gr.Markdown("## ๋งํฌ ์ ๋ ฅ โ ์๋ณธ HTML / ํ ์คํธ(์ฝ๋๋ธ๋ญ ๋ณด์กด) / ์๋์์ฝ", elem_id="container") | |
| with gr.Row(): | |
| url_input = gr.Textbox(label="URL", placeholder="https://example.com", scale=4) | |
| gr.Markdown('<div class="small">URL์ ์ ๋ ฅํ๊ณ ์ํ๋ ๋์ ๋ฒํผ์ ๋๋ฅด์ธ์.</div>') | |
| with gr.Row(): | |
| btn_html = gr.Button("์๋ณธ HTML ๋ณด๊ธฐ", scale=1) | |
| btn_text = gr.Button("ํ ์คํธ ๋ณด๊ธฐ (์ฝ๋๋ธ๋ญ ๋ณด์กด)", scale=1) | |
| with gr.Row(): | |
| # Slider value (1 to 8, default 3, step 1) is passed to handle_summary as sent_n. | |
| sent_n = gr.Slider(1, 8, value=3, step=1, label="์์ฝ ๋ฌธ์ฅ ์") | |
| btn_sum = gr.Button("์๋์์ฝ ๋ณด๊ธฐ", scale=1) | |
| output = gr.Textbox(label="๊ฒฐ๊ณผ", lines=26, show_copy_button=True) | |
| # Wire each button to its handler; all three write their result into the same output textbox. | |
| btn_html.click(fn=handle_html, inputs=url_input, outputs=output) | |
| btn_text.click(fn=handle_text, inputs=url_input, outputs=output) | |
| btn_sum.click(fn=handle_summary, inputs=[url_input, sent_n], outputs=output) | |
| # Launch the app only when this file is executed as a script. | |
| if __name__ == "__main__": | |
| demo.launch() | |