import re

import requests
import trafilatura
from bs4 import BeautifulSoup
import gradio as gr

# TextRank summarizer (summa) is optional; HAS_SUMMA records whether it loaded.
try:
    from summa.summarizer import summarize as textrank_summarize
    HAS_SUMMA = True
except Exception:
    # Deliberately broad: any import-time failure just disables TextRank.
    HAS_SUMMA = False

DEFAULT_HEADERS = {"User-Agent": "Mozilla/5.0"}


# -----------------------------
# Fetch
# -----------------------------
def fetch_html(url: str, timeout: int = 12) -> str:
    """Download *url* and return the response body as text.

    Args:
        url: Address to fetch.
        timeout: Timeout in seconds passed to ``requests.get``.

    Returns:
        The decoded response body.

    Raises:
        requests.RequestException: On connection problems or when
            ``raise_for_status`` rejects the HTTP status.
    """
    r = requests.get(url, headers=DEFAULT_HEADERS, timeout=timeout)
    r.raise_for_status()
    # When the server declares no charset, requests guesses from the headers
    # alone (ISO-8859-1 for text/*), which garbles non-Latin pages. Use the
    # encoding detected from the body instead in that case.
    content_type = r.headers.get("Content-Type", "")
    if "charset" not in content_type.lower():
        r.encoding = r.apparent_encoding
    return r.text


# -----------------------------
# HTML -> Text (preserve code blocks)
# -----------------------------
def html_to_text_preserve_code(html: str) -> str:
    """Extract the main content of *html* as text, keeping code readable.

    Steps:
        1) Ask trafilatura for the main content, as HTML.
        2) Turn each <pre> (using its nested <code> when present) into a
           fenced block (```), and each remaining inline <code> into a
           backtick span (`...`).
        3) Strip the remaining tags and return the text with newlines kept;
           runs of three or more newlines are collapsed to two.

    Returns:
        The extracted text, or the message "본문을 추출할 수 없습니다." when
        trafilatura yields nothing.
    """
    extracted = trafilatura.extract(
        html,
        output_format="html",
        include_tables=True,
        favor_recall=True
    )
    if not extracted:
        return "본문을 추출할 수 없습니다."

    soup = BeautifulSoup(extracted, "html.parser")

    # <pre> (including a nested <code>) -> fenced block
    for pre in soup.find_all("pre"):
        code_tag = pre.find("code")
        code_text = code_tag.get_text() if code_tag else pre.get_text()
        code_text = code_text.replace("\r\n", "\n")
        code_text = re.sub(r"\n{3,}", "\n\n", code_text).strip("\n")
        pre.replace_with(f"\n```\n{code_text}\n```\n")

    # inline <code> -> `...` (the <pre> blocks above are already plain strings)
    for c in soup.find_all("code"):
        c_text = c.get_text().replace("`", "\\`")
        c.replace_with(f"`{c_text}`")

    # strip tags, keep newlines
    text_output = soup.get_text("\n")
    text_output = re.sub(r"\n{3,}", "\n\n", text_output).strip()
    return text_output

# -----------------------------
# Sentence splitting (basic, lang-agnostic-ish)
# -----------------------------
# Sentence boundaries:
#   * after ASCII . ! ? only when whitespace follows, so "3.14" and
#     "example.com" stay in one piece;
#   * after full-width 。！？ even with no whitespace, because CJK text does
#     not put spaces between sentences (the lookahead keeps runs such as
#     "？！" attached to the sentence they end);
#   * at any run of newlines.
_SENT_SPLIT_REGEX = re.compile(
    r"(?<=[.!?])\s+"
    r"|(?<=[。！？])(?![。！？])\s*"
    r"|\n+"
)


def split_sentences(text: str):
    """Split *text* into a list of non-empty, stripped sentences.

    A quick regex-based splitter that handles ``.``, ``!``, ``?``, the
    full-width CJK terminators ``。！？`` and newlines. Terminators stay
    attached to the sentence they end.

    Args:
        text: Text to split.

    Returns:
        List of sentence strings; empty when *text* has no non-blank content.
    """
    parts = _SENT_SPLIT_REGEX.split(text)
    return [s.strip() for s in parts if s.strip()]

# -----------------------------
# Summarize
# -----------------------------
def summarize_text(text: str, max_sentences: int = 3) -> str:
    """Return a summary of *text* of at most *max_sentences* sentences.

    Uses summa's TextRank when it is available and returns something;
    otherwise falls back to the first *max_sentences* sentences of the text.

    Args:
        text: Text to summarize; ``None`` is treated as empty.
        max_sentences: Upper bound on the number of sentences returned.

    Returns:
        Summary sentences joined by newlines, or "" when *text* is blank or
        *max_sentences* is not positive.
    """
    import logging

    text = (text or "").strip()
    if not text:
        return ""
    # A negative bound would slice "all but the last k" sentences below;
    # treat every non-positive bound as "no sentences requested".
    if max_sentences <= 0:
        return ""

    # Try TextRank via summa first
    if HAS_SUMMA:
        try:
            # NOTE(review): summa is called without a length argument, so it
            # may hand back fewer sentences than max_sentences -- confirm
            # whether that is acceptable. Quality on Korean text varies.
            candidate = textrank_summarize(text, split=True)
            if candidate:
                # Keep at most max_sentences of the returned sentences.
                return "\n".join(candidate[:max_sentences]).strip()
        except Exception:
            # Best effort: a summa failure must not break summarization, but
            # record it instead of hiding it, then use the fallback below.
            logging.getLogger(__name__).warning(
                "TextRank summarization failed; falling back to lead sentences",
                exc_info=True,
            )

    # Fallback: Lead-N sentences
    sents = split_sentences(text)
    if not sents:
        # last resort, truncate
        return text[:800]
    return "\n".join(sents[:max_sentences]).strip()

# -----------------------------
# Handlers
# -----------------------------
def handle_html(url: str) -> str:
    """Return the raw HTML of *url*, or a user-facing message on failure.

    A blank or missing URL yields the "enter a URL" prompt; any exception
    raised while fetching is rendered into the returned error string.
    """
    target = url.strip() if url else ""
    if target == "":
        return "❌ URL을 입력하세요."

    try:
        page_source = fetch_html(target)
    except Exception as err:
        return f"에러: {err}"
    return page_source

def handle_text(url: str) -> str:
    """Return the extracted text of *url*, or a user-facing message.

    A blank or missing URL yields the "enter a URL" prompt; any exception
    raised while fetching or converting is rendered into the error string.
    """
    address = "" if not url else url.strip()
    if len(address) == 0:
        return "❌ URL을 입력하세요."

    try:
        raw_page = fetch_html(address)
        plain = html_to_text_preserve_code(raw_page)
    except Exception as err:
        return f"에러: {err}"
    return plain

def handle_summary(url: str, sent_n: int) -> str:
    """Return a labelled summary of the page at *url*.

    Args:
        url: Page address; blank or missing yields the "enter a URL" prompt.
        sent_n: Requested number of summary sentences; converted with
            ``int`` before being passed to ``summarize_text``.

    Returns:
        The summary prefixed with a header line, the extraction result
        unchanged when it is empty or the "cannot extract" message, a
        "cannot summarize" message when the summary is empty, or an error
        string when anything raises.
    """
    address = (url or "").strip()
    if not address:
        return "❌ URL을 입력하세요."

    try:
        body = html_to_text_preserve_code(fetch_html(address))

        # Pass the extraction result straight through when there is nothing
        # usable to summarize.
        extraction_failed = (not body) or body.startswith("본문을 추출할 수 없습니다.")
        if extraction_failed:
            return body

        digest = summarize_text(body, max_sentences=int(sent_n))
        if digest:
            return f"📝 자동요약 ({sent_n}문장)\n\n{digest}"
        return "요약을 생성할 수 없습니다."
    except Exception as err:
        return f"에러: {err}"

# -----------------------------
# UI
# -----------------------------
# Build the single-page UI: one URL box, three action buttons and a shared
# result box that every handler writes to.
with gr.Blocks(css="""
  #container { max-width: 920px; margin: 0 auto; }
  .small { color:#666; font-size:14px; }
""") as demo:
    gr.Markdown("## 링크 입력 → 원본 HTML / 텍스트(코드블럭 보존) / 자동요약", elem_id="container")

    with gr.Row():
        url_input = gr.Textbox(label="URL", placeholder="https://example.com", scale=4)
    # Usage hint. NOTE(review): the markup around this text was lost in the
    # source; the wrapper is reconstructed from the otherwise-unused `.small`
    # CSS rule above -- confirm.
    gr.Markdown('<div class="small">URL을 입력하고 원하는 동작 버튼을 누르세요.</div>')

    with gr.Row():
        btn_html = gr.Button("원본 HTML 보기", scale=1)
        btn_text = gr.Button("텍스트 보기 (코드블럭 보존)", scale=1)

    with gr.Row():
        sent_n = gr.Slider(1, 8, value=3, step=1, label="요약 문장 수")
        btn_sum = gr.Button("자동요약 보기", scale=1)

    output = gr.Textbox(label="결과", lines=26, show_copy_button=True)

    # Each button runs its handler and shows the returned string in `output`.
    btn_html.click(fn=handle_html, inputs=url_input, outputs=output)
    btn_text.click(fn=handle_text, inputs=url_input, outputs=output)
    btn_sum.click(fn=handle_summary, inputs=[url_input, sent_n], outputs=output)

if __name__ == "__main__":
    demo.launch()