Spaces:
Sleeping
Sleeping
Update app.py
Browse files
app.py
CHANGED
|
@@ -1,60 +1,85 @@
|
|
| 1 |
-
import
|
| 2 |
-
import trafilatura
|
| 3 |
import requests
|
| 4 |
-
|
| 5 |
-
from
|
| 6 |
-
|
| 7 |
-
from sumy.summarizers.text_rank import TextRankSummarizer
|
| 8 |
|
| 9 |
-
|
| 10 |
-
|
| 11 |
-
|
| 12 |
-
|
| 13 |
-
|
| 14 |
-
|
| 15 |
-
|
| 16 |
-
|
| 17 |
-
|
| 18 |
-
|
| 19 |
-
|
| 20 |
-
|
| 21 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 22 |
return "โ URL์ ์
๋ ฅํ์ธ์."
|
| 23 |
try:
|
| 24 |
-
|
| 25 |
-
|
| 26 |
-
|
| 27 |
-
|
| 28 |
-
html_content = trafilatura.extract(
|
| 29 |
-
r.text,
|
| 30 |
-
output_format="html",
|
| 31 |
-
include_tables=True,
|
| 32 |
-
favor_recall=True
|
| 33 |
-
)
|
| 34 |
-
if not html_content:
|
| 35 |
-
return "๋ณธ๋ฌธ์ ์ถ์ถํ ์ ์์ต๋๋ค."
|
| 36 |
-
|
| 37 |
-
markdown_text = md(html_content, heading_style="ATX")
|
| 38 |
-
summary_text = auto_summarize(markdown_text, sentence_count=3)
|
| 39 |
-
|
| 40 |
-
if mode == "์๋์์ฝ":
|
| 41 |
-
return f"๐ **์๋์์ฝ**\n\n{summary_text}\n\n๐ [์๋ฌธ ๋ณด๊ธฐ]({url})"
|
| 42 |
-
else:
|
| 43 |
-
return f"๐ **์๋ฌธ**\n\n{markdown_text}\n\n๐ [์๋ฌธ ๋ณด๊ธฐ]({url})"
|
| 44 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 45 |
except Exception as e:
|
| 46 |
-
return f"์๋ฌ: {
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 47 |
|
| 48 |
-
# === Gradio UI ===
|
| 49 |
-
with gr.Blocks() as demo:
|
| 50 |
-
gr.Markdown("## ๐ ๋งํฌ โ ์๋์์ฝ / ์๋ฌธ ๋ณด๊ธฐ")
|
| 51 |
with gr.Row():
|
| 52 |
-
|
| 53 |
-
|
| 54 |
-
|
| 55 |
-
|
| 56 |
|
| 57 |
-
|
|
|
|
| 58 |
|
| 59 |
if __name__ == "__main__":
|
| 60 |
demo.launch()
|
|
|
|
| 1 |
+
import re
|
|
|
|
| 2 |
import requests
|
| 3 |
+
import trafilatura
|
| 4 |
+
from bs4 import BeautifulSoup
|
| 5 |
+
import gradio as gr
|
|
|
|
| 6 |
|
| 7 |
+
# HTTP headers passed to requests.get() by fetch_html(); identifies the client
# with a generic browser-style User-Agent string.
DEFAULT_HEADERS = {"User-Agent": "Mozilla/5.0"}
|
| 8 |
+
|
| 9 |
+
def fetch_html(url: str, timeout: int = 12) -> str:
    """Download *url* and return the response body decoded as text.

    Args:
        url: Address of the page to fetch.
        timeout: Value handed to ``requests.get`` as its ``timeout``.

    Returns:
        The decoded response body.

    Raises:
        requests.RequestException: On connection failures, timeouts, or an
            HTTP error status (raised by ``raise_for_status``).
    """
    r = requests.get(url, headers=DEFAULT_HEADERS, timeout=timeout)
    r.raise_for_status()

    # For a text Content-Type that carries no charset, requests assumes
    # ISO-8859-1, which turns UTF-8 pages (e.g. Korean text) into mojibake.
    # Only when the server did not declare a charset, detect it from the body.
    content_type = r.headers.get("Content-Type", "")
    if "charset" not in content_type.lower():
        r.encoding = r.apparent_encoding

    return r.text
|
| 13 |
+
|
| 14 |
+
def html_to_text_preserve_code(html: str) -> str:
    """Reduce *html* to plain text while keeping code snippets recognisable.

    The main content is first extracted with trafilatura (HTML output, tables
    included, recall favoured). In that extract, every ``<pre>`` becomes a
    backtick-fenced block and every remaining ``<code>`` becomes a backtick
    code span; all other tags are dropped and text nodes are joined with
    newlines.

    Args:
        html: Raw HTML of a page.

    Returns:
        The converted text, or a user-facing message when trafilatura
        extracts nothing.
    """
    extracted = trafilatura.extract(
        html,
        output_format="html",
        include_tables=True,
        favor_recall=True,
    )
    if not extracted:
        return "본문을 추출할 수 없습니다."

    soup = BeautifulSoup(extracted, "html.parser")

    # <pre><code> and bare <pre> -> fenced code block.
    for pre in soup.find_all("pre"):
        code_tag = pre.find("code")
        code_text = code_tag.get_text() if code_tag else pre.get_text()
        code_text = code_text.replace("\r\n", "\n")
        code_text = re.sub(r"\n{3,}", "\n\n", code_text).strip("\n")
        # The fence has to be longer than any backtick run inside the snippet;
        # otherwise a ``` line in the code would close the block early.
        longest_run = max(
            (len(run) for run in re.findall(r"`+", code_text)), default=0
        )
        fence = "`" * max(3, longest_run + 1)
        pre.replace_with(f"\n{fence}\n{code_text}\n{fence}\n")

    # Remaining (inline) <code> -> `code span`.
    for c in soup.find_all("code"):
        c_text = c.get_text()
        # Backslash escapes are not recognised inside a code span, so embedded
        # backticks are handled by using a longer delimiter instead.
        longest_run = max(
            (len(run) for run in re.findall(r"`+", c_text)), default=0
        )
        ticks = "`" * (longest_run + 1)
        # A span whose content touches a backtick at either edge needs one
        # space of padding so the delimiter stays distinguishable.
        pad = " " if c_text.startswith("`") or c_text.endswith("`") else ""
        c.replace_with(f"{ticks}{pad}{c_text}{pad}{ticks}")

    # Strip the remaining tags, keeping line breaks between text nodes, and
    # collapse runs of blank lines.
    text_output = soup.get_text("\n")
    text_output = re.sub(r"\n{3,}", "\n\n", text_output).strip()
    return text_output
|
| 43 |
+
|
| 44 |
+
def handle_html(url: str) -> str:
|
| 45 |
+
url = (url or "").strip()
|
| 46 |
+
if not url:
|
| 47 |
return "โ URL์ ์
๋ ฅํ์ธ์."
|
| 48 |
try:
|
| 49 |
+
html = fetch_html(url)
|
| 50 |
+
return html
|
| 51 |
+
except Exception as e:
|
| 52 |
+
return f"์๋ฌ: {e}"
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 53 |
|
| 54 |
+
def handle_text(url: str) -> str:
|
| 55 |
+
url = (url or "").strip()
|
| 56 |
+
if not url:
|
| 57 |
+
return "โ URL์ ์
๋ ฅํ์ธ์."
|
| 58 |
+
try:
|
| 59 |
+
html = fetch_html(url)
|
| 60 |
+
text = html_to_text_preserve_code(html)
|
| 61 |
+
return text
|
| 62 |
except Exception as e:
|
| 63 |
+
return f"์๋ฌ: {e}"
|
| 64 |
+
|
| 65 |
+
with gr.Blocks(css="""
|
| 66 |
+
#container { max-width: 920px; margin: 0 auto; }
|
| 67 |
+
.small { color:#666; font-size:14px; }
|
| 68 |
+
""") as demo:
|
| 69 |
+
gr.Markdown("## ๋งํฌ ์
๋ ฅ โ ์๋ณธ HTML / ํ
์คํธ(์ฝ๋๋ธ๋ญ ๋ณด์กด) ๋ณด๊ธฐ", elem_id="container")
|
| 70 |
+
|
| 71 |
+
with gr.Row():
|
| 72 |
+
url_input = gr.Textbox(label="URL", placeholder="https://example.com", scale=4)
|
| 73 |
+
gr.Markdown('<div class="small">URL์ ์
๋ ฅํ๊ณ ์ํ๋ ๋ฒํผ์ ๋๋ฌ ๊ฒฐ๊ณผ๋ฅผ ์๋์์ ํ์ธํ์ธ์.</div>')
|
| 74 |
|
|
|
|
|
|
|
|
|
|
| 75 |
with gr.Row():
|
| 76 |
+
btn_html = gr.Button("์๋ณธ HTML ๋ณด๊ธฐ", scale=1)
|
| 77 |
+
btn_text = gr.Button("ํ
์คํธ ๋ณด๊ธฐ (์ฝ๋๋ธ๋ญ ๋ณด์กด)", scale=1)
|
| 78 |
+
|
| 79 |
+
output = gr.Textbox(label="๊ฒฐ๊ณผ", lines=24, show_copy_button=True)
|
| 80 |
|
| 81 |
+
btn_html.click(fn=handle_html, inputs=url_input, outputs=output)
|
| 82 |
+
btn_text.click(fn=handle_text, inputs=url_input, outputs=output)
|
| 83 |
|
| 84 |
if __name__ == "__main__":
|
| 85 |
demo.launch()
|