Spaces:

orgoflu
/

moro_summer_image_0

Sleeping

File size: 7,002 Bytes

import re
import requests
from bs4 import BeautifulSoup
import gradio as gr
from paddleocr import PaddleOCR
from PIL import Image
import numpy as np
import cv2

# Optional TextRank summarizer. HAS_SUMMA records whether `summa` could be
# imported so summarization can fall back to a simpler strategy without it.
try:
    from summa.summarizer import summarize as textrank_summarize
except Exception:
    HAS_SUMMA = False
else:
    HAS_SUMMA = True

# Default headers for outgoing HTTP requests.
DEFAULT_HEADERS = {
    "User-Agent": "Mozilla/5.0",
}

# -----------------------------
# Fetch
# -----------------------------
def fetch_html(url: str, timeout: int = 12) -> str:
    """Download *url* with an HTTP GET and return the body decoded as text.

    Args:
        url: Address to request.
        timeout: Value passed to ``requests.get`` as its ``timeout``.

    Returns:
        The decoded response body.

    Raises:
        requests.RequestException: On connection problems, timeouts, or an
            error status reported by ``raise_for_status``.
    """
    r = requests.get(url, headers=DEFAULT_HEADERS, timeout=timeout)
    r.raise_for_status()
    # Without a declared charset, requests falls back to ISO-8859-1 for
    # text/* bodies, which garbles UTF-8 / EUC-KR pages.  In that case use
    # the encoding detected from the body itself.
    content_type = r.headers.get("Content-Type", "")
    if "charset" not in content_type.lower():
        r.encoding = r.apparent_encoding or r.encoding
    return r.text

# -----------------------------
# HTML -> Text (preserve code blocks)
# -----------------------------
def _strip_noise_tags(soup: BeautifulSoup):
    """Remove non-content elements from *soup* in place.

    Scripted/embedded elements are removed first, then the page-chrome
    elements (header, footer, nav, aside).
    """
    embedded_tags = ["script", "style", "noscript", "svg", "iframe"]
    chrome_tags = ["header", "footer", "nav", "aside"]
    for tag_names in (embedded_tags, chrome_tags):
        for node in soup.find_all(tag_names):
            node.decompose()

def _pick_content_root(soup: BeautifulSoup):
    """Choose the element most likely to hold the page's main content.

    The first match among ``main``, ``article`` and ``[role='main']`` wins.
    Otherwise the ``div`` with the longest stripped text is returned, and
    when the document has no ``div`` the result is ``soup.body`` (or
    ``soup`` itself if there is no body).
    """
    semantic_selectors = ("main", "article", "[role='main']")
    matches = (soup.select_one(sel) for sel in semantic_selectors)
    semantic_root = next((hit for hit in matches if hit), None)
    if semantic_root is not None:
        return semantic_root

    divs = soup.find_all("div")
    if divs:
        def text_length(div):
            return len(div.get_text(strip=True) or "")

        largest = max(divs, key=text_length)
        if largest:
            return largest
    return soup.body or soup

def html_to_text_preserve_code(html: str) -> str:
    """Convert an HTML document to plain text, keeping code readable.

    Noise tags are stripped, a content root is selected, every ``<pre>``
    becomes a fenced block, and each remaining ``<code>`` element is wrapped
    in single backticks (embedded backticks are backslash-escaped).  Runs of
    three or more newlines are collapsed to two.

    Returns:
        The extracted text, or a fixed "could not extract" message when the
        result is empty.
    """
    empty_message = "본문을 추출할 수 없습니다."

    document = BeautifulSoup(html, "html.parser")
    if not document:
        return empty_message

    _strip_noise_tags(document)
    content = _pick_content_root(document) or document

    # Block-level code: prefer the inner <code> text when present.
    for block in content.find_all("pre"):
        inner = block.find("code")
        source = (inner if inner else block).get_text()
        source = source.replace("\r\n", "\n")
        source = re.sub(r"\n{3,}", "\n\n", source).strip("\n")
        block.replace_with(f"\n```\n{source}\n```\n")

    # Inline code: whatever <code> elements are still in the tree.
    for inline in content.find_all("code"):
        escaped = inline.get_text().replace("`", "\\`")
        inline.replace_with(f"`{escaped}`")

    collapsed = re.sub(r"\n{3,}", "\n\n", content.get_text("\n")).strip()
    return collapsed or empty_message

# -----------------------------
# Sentence splitting
# -----------------------------
# Split after sentence-ending punctuation followed by whitespace, or on any
# run of newlines.
_SENT_SPLIT_REGEX = re.compile(r"(?<=[\.!\?。！？])\s+|\n+")


def split_sentences(text: str):
    """Return the non-empty, whitespace-trimmed sentences found in *text*."""
    trimmed = (piece.strip() for piece in _SENT_SPLIT_REGEX.split(text))
    return [piece for piece in trimmed if piece]

# -----------------------------
# Summarize
# -----------------------------
def summarize_text(text: str, max_sentences: int = 3) -> str:
    """Return up to *max_sentences* summary sentences, one per line.

    When the optional TextRank summarizer is available its result is used;
    if it fails or yields nothing, the leading sentences of the text are
    returned instead.  Empty or ``None`` input yields an empty string.
    """
    cleaned = (text or "").strip()
    if not cleaned:
        return ""

    if HAS_SUMMA:
        try:
            ranked = textrank_summarize(cleaned, split=True)
            if ranked:
                top = ranked[:max_sentences]
                return "\n".join(top).strip()
        except Exception:
            # Best-effort: any TextRank failure falls through to the plain
            # sentence-split fallback below.
            pass

    sentences = split_sentences(cleaned)
    if sentences:
        return "\n".join(sentences[:max_sentences]).strip()
    return cleaned[:800]

# -----------------------------
# Handlers
# -----------------------------
def handle_html(url: str) -> str:
    """Return the raw HTML of *url*, or a user-facing message on failure.

    A blank or missing URL yields a prompt to enter one; any exception
    raised while fetching is reported as an error string.
    """
    target = (url or "").strip()
    if target == "":
        return "❌ URL을 입력하세요."
    try:
        raw_html = fetch_html(target)
    except Exception as exc:
        return f"에러: {exc}"
    return raw_html

def handle_text(url: str) -> str:
    """Return the extracted text of the page at *url*.

    A blank or missing URL yields a prompt to enter one; any exception
    raised while fetching or converting is reported as an error string.
    """
    target = (url or "").strip()
    if target == "":
        return "❌ URL을 입력하세요."
    try:
        return html_to_text_preserve_code(fetch_html(target))
    except Exception as exc:
        return f"에러: {exc}"

def handle_summary(url: str, sent_n: int) -> str:
    """Fetch *url*, extract its text and return a short summary.

    Args:
        url: Page address; blank or ``None`` yields a prompt to enter one.
        sent_n: Requested number of summary sentences.  It is converted with
            ``int()`` once, and that same integer is used both for the
            summarizer and for the header line.

    Returns:
        The summary prefixed with a header, the extraction message when no
        body text could be extracted, a "cannot summarize" message when the
        summary is empty, or an error string if anything raises.
    """
    url = (url or "").strip()
    if not url:
        return "❌ URL을 입력하세요."
    try:
        html = fetch_html(url)
        text = html_to_text_preserve_code(html)
        if not text or text.startswith("본문을 추출할 수 없습니다."):
            return text
        # Normalise once so the header never shows a raw value such as "3.0"
        # while the summarizer was given 3.
        max_sentences = int(sent_n)
        summary = summarize_text(text, max_sentences=max_sentences)
        if not summary:
            return "요약을 생성할 수 없습니다."
        return f"📝 자동요약 ({max_sentences}문장)\n\n{summary}"
    except Exception as e:
        return f"에러: {e}"

# -----------------------------
# Image preprocessing + PaddleOCR
# -----------------------------
def preprocess_image(img):
    """Binarize an image and upscale it when small, ahead of OCR.

    Args:
        img: A NumPy array, or any object ``np.array`` can convert.  Input
            with 3 or 4 channels is converted with ``COLOR_RGB2GRAY``;
            2-D input and ``(H, W, 1)`` input are treated as already
            grayscale instead of being passed to ``cvtColor``.

    Returns:
        A single-channel array thresholded with Otsu's method.  Images less
        than 600 pixels tall are resized so their height becomes about 600.
    """
    arr = img if isinstance(img, np.ndarray) else np.array(img)

    if arr.ndim == 2:
        # Already single-channel; the RGB conversion would reject it.
        gray = arr
    elif arr.ndim == 3 and arr.shape[2] == 1:
        gray = np.ascontiguousarray(arr[:, :, 0])
    else:
        gray = cv2.cvtColor(arr, cv2.COLOR_RGB2GRAY)

    # Boost contrast by binarizing with an automatically chosen threshold.
    # NOTE(review): Otsu presumably needs 8-bit input; confirm callers pass
    # uint8 arrays.
    _, thresh = cv2.threshold(gray, 0, 255, cv2.THRESH_BINARY + cv2.THRESH_OTSU)

    # Enlarge small images; the h > 0 check avoids dividing by zero.
    h, w = thresh.shape
    if 0 < h < 600:
        scale = 600 / h
        thresh = cv2.resize(thresh, None, fx=scale, fy=scale, interpolation=cv2.INTER_LINEAR)
    return thresh

# Module-level PaddleOCR engine: constructed once when the module is imported
# and shared by the OCR handler.
ocr = PaddleOCR(use_angle_cls=True, lang='korean')  # Korean + English support

def handle_image(img) -> str:
    """Run OCR on an uploaded image and return the recognized lines.

    Each output line has the form ``"<text> (conf:<score>)"`` with the score
    shown to two decimals.  ``None`` input yields a prompt to upload an
    image, an empty result yields a "no text" message, and any exception is
    reported as an error string.
    """
    if img is None:
        return "❌ 이미지를 업로드하세요."

    no_text_message = "텍스트를 추출할 수 없습니다."
    try:
        results = ocr.ocr(preprocess_image(img))
        if not results or not results[0]:
            return no_text_message

        formatted = []
        for entry in results[0]:
            # Only entries shaped like [box, (text, confidence)] are used.
            if len(entry) != 2 or not isinstance(entry[1], tuple):
                continue
            recognized, score = entry[1]
            formatted.append(f"{recognized} (conf:{score:.2f})")

        if formatted:
            return "\n".join(formatted)
        return no_text_message
    except Exception as err:
        return f"에러: {err}"

# -----------------------------
# UI
# -----------------------------
# Build the Gradio UI: a URL box with three actions (raw HTML, extracted text,
# summary), an image-upload OCR section, and a single shared result textbox.
with gr.Blocks(css="""
  #container { max-width: 920px; margin: 0 auto; }
  .small { color:#666; font-size:14px; }
""") as demo:
    # Page title; it carries the "container" element id targeted by the CSS.
    gr.Markdown("## URL → HTML/텍스트/요약 + 이미지 OCR (PaddleOCR + 전처리)", elem_id="container")

    # URL entry plus a short usage hint styled with the ".small" class.
    with gr.Row():
        url_input = gr.Textbox(label="URL", placeholder="https://example.com", scale=4)
    gr.Markdown('<div class="small">URL을 입력하고 원하는 동작 버튼을 누르세요.</div>')

    # Actions that operate on the URL: raw HTML and extracted text.
    with gr.Row():
        btn_html = gr.Button("원본 HTML 보기", scale=1)
        btn_text = gr.Button("텍스트 보기 (코드블럭 보존)", scale=1)
    # Summary action with a slider for the sentence count (1-8, default 3).
    with gr.Row():
        sent_n = gr.Slider(1, 8, value=3, step=1, label="요약 문장 수")
        btn_sum = gr.Button("자동요약 보기", scale=1)

    # Image OCR section; the image is handed to the handler as a NumPy array.
    gr.Markdown("### 이미지 업로드 → OCR (PaddleOCR + 전처리)")
    with gr.Row():
        img_input = gr.Image(type="numpy", label="이미지 업로드")
        btn_img = gr.Button("이미지 OCR 실행", scale=1)

    # All four actions write their result into this one textbox.
    output = gr.Textbox(label="결과", lines=26, show_copy_button=True)

    # Wire each button to its handler.
    btn_html.click(fn=handle_html, inputs=url_input, outputs=output)
    btn_text.click(fn=handle_text, inputs=url_input, outputs=output)
    btn_sum.click(fn=handle_summary, inputs=[url_input, sent_n], outputs=output)
    btn_img.click(fn=handle_image, inputs=img_input, outputs=output)

# Launch the app only when this file is executed as a script.
if __name__ == "__main__":
    demo.launch()