import re

import requests
from bs4 import BeautifulSoup
import gradio as gr
from paddleocr import PaddleOCR
from PIL import Image
import numpy as np
import cv2

# Summarizer: TextRank from `summa` is optional. Any failure while importing
# it simply disables that path (summarize_text then uses its own fallback).
try:
    from summa.summarizer import summarize as textrank_summarize
    HAS_SUMMA = True
except Exception:
    HAS_SUMMA = False

DEFAULT_HEADERS = {"User-Agent": "Mozilla/5.0"}


# -----------------------------
# Fetch
# -----------------------------
def fetch_html(url: str, timeout: int = 12) -> str:
    """Download *url* and return the response body decoded as text.

    Args:
        url: Address to request with HTTP GET.
        timeout: Timeout in seconds handed to ``requests.get``.

    Returns:
        The decoded response body.

    Raises:
        requests.RequestException: On connection problems, timeouts, or a
            non-2xx status (via ``raise_for_status``).
    """
    r = requests.get(url, headers=DEFAULT_HEADERS, timeout=timeout)
    r.raise_for_status()
    # When a text/* response declares no charset, requests assumes ISO-8859-1,
    # which turns UTF-8/EUC-KR pages into mojibake. Use the encoding detected
    # from the body in that case; an explicitly declared charset is respected.
    content_type = r.headers.get("Content-Type", "")
    if "charset" not in content_type.lower():
        r.encoding = r.apparent_encoding
    return r.text


# -----------------------------
# HTML -> Text (preserve code blocks)
# -----------------------------
def _strip_noise_tags(soup: BeautifulSoup) -> None:
    """Remove non-content and page-chrome elements from *soup* in place."""
    for t in soup(["script", "style", "noscript", "svg", "iframe"]):
        t.decompose()
    for t in soup.find_all(["header", "footer", "nav", "aside"]):
        t.decompose()


def _pick_content_root(soup: BeautifulSoup):
    """Return the element most likely to hold the main content.

    Tries ``main``, ``article`` and ``[role='main']`` in that order. If none
    match, returns the ``div`` with the most stripped text, or the document
    body (or the soup itself) when the page has no ``div`` at all.
    """
    for selector in ["main", "article", "[role='main']"]:
        node = soup.select_one(selector)
        if node:
            return node
    candidates = soup.find_all("div")
    if not candidates:
        return soup.body or soup
    best = max(candidates, key=lambda d: len(d.get_text(strip=True) or ""))
    return best if best else (soup.body or soup)


def html_to_text_preserve_code(html: str) -> str:
    """Convert *html* to plain text, keeping code in Markdown notation.

    ``<pre>`` blocks become fenced blocks and remaining inline ``<code>``
    elements become backtick spans; runs of three or more newlines are
    collapsed to a single blank line.

    Args:
        html: Raw HTML markup.

    Returns:
        The extracted text, or a fixed "could not extract" message when
        nothing is left after cleaning.
    """
    soup = BeautifulSoup(html, "html.parser")
    if not soup:
        return "본문을 추출할 수 없습니다."
    _strip_noise_tags(soup)
    root = _pick_content_root(soup)
    if not root:
        root = soup

    # Block-level code first: replacing a <pre> also removes any <code>
    # nested inside it, so the inline pass below only sees inline code.
    for pre in root.find_all("pre"):
        code_tag = pre.find("code")
        code_text = code_tag.get_text() if code_tag else pre.get_text()
        code_text = code_text.replace("\r\n", "\n")
        code_text = re.sub(r"\n{3,}", "\n\n", code_text).strip("\n")
        pre.replace_with(f"\n```\n{code_text}\n```\n")

    for c in root.find_all("code"):
        # Escape backticks so the content cannot close the span early.
        c_text = c.get_text().replace("`", "\\`")
        c.replace_with(f"`{c_text}`")

    text_output = root.get_text("\n")
    text_output = re.sub(r"\n{3,}", "\n\n", text_output).strip()
    return text_output or "본문을 추출할 수 없습니다."
# -----------------------------
# Sentence splitting
# -----------------------------
_SENT_SPLIT_REGEX = re.compile(r"(?<=[\.!\?。!?])\s+|\n+")


def split_sentences(text: str):
    """Split *text* into a list of non-empty, stripped sentences.

    A split happens on whitespace that follows sentence-ending punctuation,
    and on any run of newlines.
    """
    parts = _SENT_SPLIT_REGEX.split(text)
    return [s.strip() for s in parts if s.strip()]


# -----------------------------
# Summarize
# -----------------------------
def summarize_text(text: str, max_sentences: int = 3) -> str:
    """Return a summary of *text* of at most *max_sentences* lines.

    Uses TextRank from ``summa`` when it is available and yields a result;
    otherwise falls back to the leading sentences of the text.

    Args:
        text: Text to summarize; ``None`` or blank input yields ``""``.
        max_sentences: Upper bound on returned sentences. Negative values are
            treated as 0 rather than slicing from the end of the list.

    Returns:
        Selected sentences joined by newlines (possibly an empty string).
    """
    text = (text or "").strip()
    if not text:
        return ""
    limit = max(max_sentences, 0)

    if HAS_SUMMA:
        try:
            candidate = textrank_summarize(text, split=True)
        except Exception:
            # Deliberate best effort: any TextRank failure falls through to
            # the simple leading-sentences summary below.
            candidate = None
        if candidate:
            return "\n".join(candidate[:limit]).strip()

    sents = split_sentences(text)
    if not sents:
        return text[:800]
    return "\n".join(sents[:limit]).strip()


# -----------------------------
# Handlers
# -----------------------------
def handle_html(url: str) -> str:
    """Return the raw HTML of *url*, or a user-facing error message."""
    url = (url or "").strip()
    if not url:
        return "❌ URL을 입력하세요."
    try:
        return fetch_html(url)
    except Exception as e:
        return f"에러: {e}"


def handle_text(url: str) -> str:
    """Return the extracted text of *url*, or a user-facing error message."""
    url = (url or "").strip()
    if not url:
        return "❌ URL을 입력하세요."
    try:
        html = fetch_html(url)
        return html_to_text_preserve_code(html)
    except Exception as e:
        return f"에러: {e}"


def handle_summary(url: str, sent_n: int) -> str:
    """Return a summary of the page at *url*, or a user-facing message.

    Args:
        url: Page address; blank input yields a prompt to enter a URL.
        sent_n: Requested number of summary sentences. It is converted with
            ``int`` once, and that value is used both for summarizing and in
            the header, so a float such as ``3.0`` is shown as ``3``.

    Returns:
        The formatted summary, the extraction-failure message, or an error
        string when fetching/processing raises.
    """
    url = (url or "").strip()
    if not url:
        return "❌ URL을 입력하세요."
    try:
        html = fetch_html(url)
        text = html_to_text_preserve_code(html)
        if not text or text.startswith("본문을 추출할 수 없습니다."):
            return text
        count = int(sent_n)
        summary = summarize_text(text, max_sentences=count)
        if not summary:
            return "요약을 생성할 수 없습니다."
        return f"📝 자동요약 ({count}문장)\n\n{summary}"
    except Exception as e:
        return f"에러: {e}"


# -----------------------------
# Image preprocessing + PaddleOCR
# -----------------------------
def preprocess_image(img, min_height: int = 600):
    """Binarize *img* and upscale it when it is shorter than *min_height*.

    Args:
        img: A NumPy array or an object ``np.array`` can convert (for example
            a PIL image). Objects exposing ``convert`` are normalized to RGB
            first. Arrays may be 2-D, single-channel 3-D, or multi-channel
            (assumed RGB channel order — TODO confirm against the caller).
        min_height: Images shorter than this many pixels are scaled up
            proportionally so their height reaches it.

    Returns:
        The Otsu-thresholded single-channel image.
    """
    if isinstance(img, np.ndarray):
        arr = img
    else:
        # Palette/CMYK/bilevel PIL modes do not map to an RGB array directly.
        arr = np.array(img.convert("RGB") if hasattr(img, "convert") else img)

    # cvtColor(RGB2GRAY) rejects input that is already single-channel.
    if arr.ndim == 2:
        gray = arr
    elif arr.ndim == 3 and arr.shape[2] == 1:
        gray = np.ascontiguousarray(arr[:, :, 0])
    else:
        gray = cv2.cvtColor(arr, cv2.COLOR_RGB2GRAY)

    # Increase contrast (binarization).
    _, thresh = cv2.threshold(gray, 0, 255, cv2.THRESH_BINARY + cv2.THRESH_OTSU)

    # Enlarge small images; the lower bound avoids dividing by a zero height.
    h, w = thresh.shape
    if 0 < h < min_height:
        scale = min_height / h
        thresh = cv2.resize(
            thresh, None, fx=scale, fy=scale, interpolation=cv2.INTER_LINEAR
        )
    return thresh


ocr = PaddleOCR(use_angle_cls=True, lang='korean')  # Korean + English support


def handle_image(img) -> str:
    """Run OCR on *img* and return one "text (conf:x.xx)" line per result.

    Returns a prompt when no image is given, a fixed message when nothing is
    recognized, or an error string when preprocessing/OCR raises.
    """
    if img is None:
        return "❌ 이미지를 업로드하세요."
    try:
        proc_img = preprocess_image(img)
        results = ocr.ocr(proc_img)
        if not results or not results[0]:
            return "텍스트를 추출할 수 없습니다."
        lines = []
        for res in results[0]:
            # Only entries shaped like (box, (text, confidence)) are reported.
            if len(res) == 2 and isinstance(res[1], tuple):
                txt, conf = res[1]
                lines.append(f"{txt} (conf:{conf:.2f})")
        return "\n".join(lines) if lines else "텍스트를 추출할 수 없습니다."
    except Exception as e:
        return f"에러: {e}"


# NOTE(review): the UI section below is cut off in the reviewed excerpt, so it
# is kept exactly as found instead of being reformatted.
# ----------------------------- # UI # ----------------------------- with gr.Blocks(css=""" #container { max-width: 920px; margin: 0 auto; } .small { color:#666; font-size:14px; } """) as demo: gr.Markdown("## URL → HTML/텍스트/요약 + 이미지 OCR (PaddleOCR + 전처리)", elem_id="container") with gr.Row(): url_input = gr.Textbox(label="URL", placeholder="https://example.com", scale=4) gr.Markdown('