Spaces:

orgoflu
/

moro_flask_proxy

Sleeping

App Files Files Community

orgoflu committed on Sep 11, 2025

Commit

bba79ad

verified ·

1 Parent(s): fcd51b7

Update app.py

Browse files

Files changed (1) hide show

app.py +88 -9

app.py CHANGED Viewed

@@ -4,14 +4,32 @@ import trafilatura
 from bs4 import BeautifulSoup
 import gradio as gr
 DEFAULT_HEADERS = {"User-Agent": "Mozilla/5.0"}
 def fetch_html(url: str, timeout: int = 12) -> str:
     r = requests.get(url, headers=DEFAULT_HEADERS, timeout=timeout)
     r.raise_for_status()
     return r.text
 def html_to_text_preserve_code(html: str) -> str:
     extracted = trafilatura.extract(
         html,
         output_format="html",
@@ -23,7 +41,7 @@ def html_to_text_preserve_code(html: str) -> str:
     soup = BeautifulSoup(extracted, "html.parser")
-    # <pre><code> 및 <pre>를 fenced code block으로
     for pre in soup.find_all("pre"):
         code_tag = pre.find("code")
         code_text = code_tag.get_text() if code_tag else pre.get_text()
@@ -31,55 +49,116 @@ def html_to_text_preserve_code(html: str) -> str:
         code_text = re.sub(r"\n{3,}", "\n\n", code_text).strip("\n")
         pre.replace_with(f"\n```\n{code_text}\n```\n")
-    # inline <code> → `...`
     for c in soup.find_all("code"):
         c_text = c.get_text().replace("`", "\\`")
         c.replace_with(f"`{c_text}`")
-    # 태그 제거 + 줄바꿈 유지
     text_output = soup.get_text("\n")
     text_output = re.sub(r"\n{3,}", "\n\n", text_output).strip()
     return text_output
 def handle_html(url: str) -> str:
     url = (url or "").strip()
     if not url:
         return "❌ URL을 입력하세요."
     try:
-        html = fetch_html(url)
-        return html
     except Exception as e:
         return f"에러: {e}"
 def handle_text(url: str) -> str:
     url = (url or "").strip()
     if not url:
         return "❌ URL을 입력하세요."
     try:
         html = fetch_html(url)
         text = html_to_text_preserve_code(html)
-        return text
     except Exception as e:
         return f"에러: {e}"
 with gr.Blocks(css="""
   #container { max-width: 920px; margin: 0 auto; }
   .small { color:#666; font-size:14px; }
 """) as demo:
-    gr.Markdown("## 링크 입력 → 원본 HTML / 텍스트(코드블럭 보존) 보기", elem_id="container")
     with gr.Row():
         url_input = gr.Textbox(label="URL", placeholder="https://example.com", scale=4)
-    gr.Markdown('<div class="small">URL을 입력하고 원하는 버튼을 눌러 결과를 아래에서 확인하세요.</div>')
     with gr.Row():
         btn_html = gr.Button("원본 HTML 보기", scale=1)
         btn_text = gr.Button("텍스트 보기 (코드블럭 보존)", scale=1)
-    output = gr.Textbox(label="결과", lines=24, show_copy_button=True)
     btn_html.click(fn=handle_html, inputs=url_input, outputs=output)
     btn_text.click(fn=handle_text, inputs=url_input, outputs=output)
 if __name__ == "__main__":
     demo.launch()

 from bs4 import BeautifulSoup
 import gradio as gr
+# TextRank summarizer (summa)
+try:
+    from summa.summarizer import summarize as textrank_summarize
+    HAS_SUMMA = True
+except Exception:
+    HAS_SUMMA = False
 DEFAULT_HEADERS = {"User-Agent": "Mozilla/5.0"}
+# -----------------------------
+# Fetch
+# -----------------------------
 def fetch_html(url: str, timeout: int = 12) -> str:
     # Download `url` with the module's default User-Agent header and return the
     # response body as text. `timeout` is passed straight to requests.get.
     r = requests.get(url, headers=DEFAULT_HEADERS, timeout=timeout)
     # Turn HTTP error statuses into exceptions so callers see the failure.
     r.raise_for_status()
     return r.text
+# -----------------------------
+# HTML -> Text (preserve code blocks)
+# -----------------------------
 def html_to_text_preserve_code(html: str) -> str:
+    """
+    1) trafilatura로 본문만 남긴 HTML을 얻는다.
+    2) <pre>, <pre><code>, inline <code>를 Markdown 코드표기(```, `...`)로 바꾼다.
+    3) 나머지 태그 제거 후 줄바꿈 보존한 텍스트를 반환한다.
+    """
     extracted = trafilatura.extract(
         html,
         output_format="html",
     soup = BeautifulSoup(extracted, "html.parser")
+    # <pre> (including nested <code>) -> fenced block
     for pre in soup.find_all("pre"):
         code_tag = pre.find("code")
         code_text = code_tag.get_text() if code_tag else pre.get_text()
         code_text = re.sub(r"\n{3,}", "\n\n", code_text).strip("\n")
         pre.replace_with(f"\n```\n{code_text}\n```\n")
+    # inline <code> -> `...`
     for c in soup.find_all("code"):
         c_text = c.get_text().replace("`", "\\`")
         c.replace_with(f"`{c_text}`")
+    # strip tags, keep newlines
     text_output = soup.get_text("\n")
     text_output = re.sub(r"\n{3,}", "\n\n", text_output).strip()
     return text_output
+# -----------------------------
+# Sentence splitting (basic, lang-agnostic-ish)
+# -----------------------------
+# Split points: whitespace that follows ., !, ? or their full-width/CJK forms
+# (the punctuation stays on the sentence via lookbehind), or any run of newlines.
+_SENT_SPLIT_REGEX = re.compile(r"(?<=[\.!\?。！？])\s+|\n+")
+def split_sentences(text: str):
+    # Quick-and-dirty splitter handling ., !, ?, CJK punctuation, and newlines.
+    # Returns the pieces stripped of surrounding whitespace, with empty pieces dropped.
+    parts = _SENT_SPLIT_REGEX.split(text)
+    return [s.strip() for s in parts if s.strip()]
+# -----------------------------
+# Summarize
+# -----------------------------
+def summarize_text(text: str, max_sentences: int = 3) -> str:
+    # Return at most `max_sentences` sentences of `text`, joined by newlines.
+    # Empty or whitespace-only input yields "".
+    text = (text or "").strip()
+    if not text:
+        return ""
+    # Try TextRank via summa first
+    if HAS_SUMMA:
+        try:
+            # ratio is only an approximate length ratio; if the result is too short, use the sentences option
+            # summa works for Korean to some extent, but quality varies with the text
+            # NOTE(review): the call below passes neither option, only split=True — confirm summa's default length is intended
+            candidate = textrank_summarize(text, split=True)
+            if candidate:
+                # Pick top-N sentences
+                return "\n".join(candidate[:max_sentences]).strip()
+        except Exception:
+            # Best-effort: any summa failure falls through to the lead-N fallback below.
+            pass
+    # Fallback: Lead-N sentences
+    sents = split_sentences(text)
+    if not sents:
+        # last resort, truncate
+        return text[:800]
+    return "\n".join(sents[:max_sentences]).strip()
+# -----------------------------
+# Handlers
+# -----------------------------
 def handle_html(url: str) -> str:
     # Button handler: return the raw HTML of `url`, or a message string on
     # missing input / failure (the returned string is what the output box shows).
     url = (url or "").strip()
     if not url:
         return "❌ URL을 입력하세요."  # "Please enter a URL."
     try:
+        return fetch_html(url)
     except Exception as e:
         # Report the failure as text instead of raising ("에러" = "error").
         return f"에러: {e}"
 def handle_text(url: str) -> str:
+    # Button handler: fetch `url` and return its text with code blocks preserved,
+    # or a message string on missing input / failure.
+    url = (url or "").strip()
+    if not url:
+        return "❌ URL을 입력하세요."  # "Please enter a URL."
+    try:
+        html = fetch_html(url)
+        return html_to_text_preserve_code(html)
+    except Exception as e:
+        # Report the failure as text instead of raising ("에러" = "error").
+        return f"에러: {e}"
+def handle_summary(url: str, sent_n: int) -> str:
     # Button handler: fetch `url`, extract its text, and return a summary of at
     # most `sent_n` sentences, or a message string on missing input / failure.
     url = (url or "").strip()
     if not url:
         return "❌ URL을 입력하세요."  # "Please enter a URL."
     try:
         html = fetch_html(url)
         text = html_to_text_preserve_code(html)
+        # Pass through empty text or the "could not extract the main content"
+        # message unchanged; presumably that message comes from
+        # html_to_text_preserve_code — its return path is not visible here, confirm.
+        if not text or text.startswith("본문을 추출할 수 없습니다."):
+            return text
+        # int() because the slider value may not arrive as an int — TODO confirm.
+        summary = summarize_text(text, max_sentences=int(sent_n))
+        if not summary:
+            return "요약을 생성할 수 없습니다."  # "Could not generate a summary."
+        # Header reads "Auto-summary (N sentences)"; it interpolates sent_n as received.
+        return f"📝 자동요약 ({sent_n}문장)\n\n{summary}"
     except Exception as e:
         # Report the failure as text instead of raising ("에러" = "error").
         return f"에러: {e}"
+# -----------------------------
+# UI
+# -----------------------------
 # UI layout: one URL box, three action buttons, and a single shared output box.
 with gr.Blocks(css="""
   #container { max-width: 920px; margin: 0 auto; }
   .small { color:#666; font-size:14px; }
 """) as demo:
+    # Heading: "Enter a link → raw HTML / text (code blocks preserved) / auto-summary".
+    gr.Markdown("## 링크 입력 → 원본 HTML / 텍스트(코드블럭 보존) / 자동요약", elem_id="container")
     with gr.Row():
         url_input = gr.Textbox(label="URL", placeholder="https://example.com", scale=4)
+    # Hint line: "Enter a URL and press the action button you want."
+    gr.Markdown('<div class="small">URL을 입력하고 원하는 동작 버튼을 누르세요.</div>')
     with gr.Row():
         btn_html = gr.Button("원본 HTML 보기", scale=1)  # "View raw HTML"
         btn_text = gr.Button("텍스트 보기 (코드블럭 보존)", scale=1)  # "View text (code blocks preserved)"
+    with gr.Row():
+        # Slider label: "Number of summary sentences"; range 1-8, default 3.
+        sent_n = gr.Slider(1, 8, value=3, step=1, label="요약 문장 수")
+        btn_sum = gr.Button("자동요약 보기", scale=1)  # "View auto-summary"
+    # All three handlers write their returned string into this one box ("결과" = "Result").
+    output = gr.Textbox(label="결과", lines=26, show_copy_button=True)
     btn_html.click(fn=handle_html, inputs=url_input, outputs=output)
     btn_text.click(fn=handle_text, inputs=url_input, outputs=output)
+    btn_sum.click(fn=handle_summary, inputs=[url_input, sent_n], outputs=output)
 # Launch the app only when this file is run as a script.
 if __name__ == "__main__":
     demo.launch()