Spaces:

maxmunzy
/

schoolbridge

Sleeping

File size: 11,224 Bytes

7f105c8

"""HWP/PDF/text 입력 → clean_text 변환.

파이프라인 [1] 단계. 호스트 앱이 어떤 양식으로 보내든 백엔드가 텍스트로 흡수.

지원:
  - text (text/plain) → 그대로
  - PDF (application/pdf) → pdfplumber 본문 + 표 [표] 섹션
  - HWP/HWPX → LibreOffice + H2Orestart로 ODT 변환 → content.xml 직접 파싱

이미지(.jpg/.jpeg/.png/.webp)는 pytesseract(Tesseract 한국어) 1차 OCR로 텍스트화.
표 영역 2차 OCR은 별도 단계 (세종님 OCR 합류 시 추가).

ODT 경로 채택 이유 (vs 이전 HWP→PDF):
  HWP→PDF→pdfplumber는 LibreOffice가 텍스트를 두 번 그려 글자가 중복
  추출되는 문제 ("22002266학학년년도도"). ODT(zip+content.xml)는 구조화된
  단일 출력이라 중복 0. 검증: hwp5txt 28b 실패, docx 0c 실패, odt 1899c
  키워드 6/6 보존.

보안 모델:
  - 원본 filename은 .suffix 추출에만 사용. 추출된 suffix는 화이트리스트 검사
    (ALLOWED_EXTS = PDF_EXTS | HWP_EXTS | TEXT_EXTS | IMG_EXTS) 통과 못 하면
    ParserError로 즉시 거부. 단, suffix가 없으면 text로 취급.
  - 디스크 저장 경로는 tempfile.TemporaryDirectory + 고정 이름 input{suffix}.
    원본 filename은 어떤 경로/명령에도 사용되지 않음.
  - subprocess 호출은 list 인자 형태(쉘 미사용) → 명령 주입 표면 없음.
"""
from __future__ import annotations

import os
import re
import subprocess
import tempfile
import xml.etree.ElementTree as ET
import zipfile
from pathlib import Path

# pdfplumber is a third-party dependency, so its import is guarded to keep
# CI/tests importable without it. Per the original note, production installs it
# via requirements.txt + Dockerfile. When it is missing, the name is bound to
# None and the first PDF call (_pdf_to_text) raises a clear ParserError.
try:
    import pdfplumber  # type: ignore
except ImportError as error:
    print(f"[parser] pdfplumber unavailable: {error}")
    pdfplumber = None


# Lower-cased file suffixes (including the leading dot) grouped by input kind.
PDF_EXTS = {".pdf"}
HWP_EXTS = {".hwp", ".hwpx"}
TEXT_EXTS = {".txt", ".md"}
IMG_EXTS = {".jpg", ".jpeg", ".png", ".webp"}
# Whitelist checked by parse_bytes_to_text before anything is written to disk.
ALLOWED_EXTS = PDF_EXTS | HWP_EXTS | TEXT_EXTS | IMG_EXTS

# LibreOffice conversion timeout in seconds. Large HWP files can override it
# through the PARSER_LIBREOFFICE_TIMEOUT environment variable (read once, at
# import time; a non-integer value makes int() raise here).
LIBREOFFICE_TIMEOUT_SECONDS = int(os.environ.get("PARSER_LIBREOFFICE_TIMEOUT", "300"))


class ParserError(RuntimeError):
    """Single exception type that callers can catch when a conversion fails."""


def normalize(text: str) -> str:
    """Clean extracted text while keeping its line structure.

    NUL characters become spaces, runs of spaces/tabs inside a line collapse to
    a single space, each line is stripped, and consecutive blank lines are
    reduced to one. Leading and trailing whitespace of the whole result is
    removed.

    Args:
        text: Raw text produced by one of the extractors.

    Returns:
        The cleaned text; an empty string when nothing but whitespace remains.
    """
    tidied = [
        re.sub(r"[ \t]+", " ", raw_line).strip()
        for raw_line in text.replace("\x00", " ").split("\n")
    ]

    kept: list[str] = []
    for current in tidied:
        # A blank line is dropped only when the line kept just before it is
        # blank as well, so exactly one separator survives per gap.
        if current == "" and kept and kept[-1] == "":
            continue
        kept.append(current)

    return "\n".join(kept).strip()


def _pdf_to_text(pdf_path: Path) -> str:
    """Extract a PDF as body text followed by a flattened table section.

    For every page, tables returned by ``page.extract_tables()`` are flattened
    row by row (cells joined with `` | ``), and the body text is extracted from
    a view of the page in which characters lying inside any table bounding box
    are filtered out, so table content is not repeated in the body.

    Args:
        pdf_path: Path of the PDF file to read.

    Returns:
        Page bodies joined by blank lines, then (when any table produced rows)
        a ``[표]`` header followed by the tables, each separated by a blank
        line. Empty string when nothing was extracted.

    Raises:
        ParserError: If pdfplumber could not be imported.
    """
    if pdfplumber is None:
        raise ParserError(
            "pdfplumber 미설치. backend 컨테이너 재빌드(docker compose build backend) "
            "또는 pip install pdfplumber 필요."
        )

    page_bodies: list[str] = []
    flattened_tables: list[str] = []

    with pdfplumber.open(pdf_path) as document:
        for page in document.pages:
            for table in page.extract_tables() or []:
                cleaned_rows = (
                    [(cell or "").replace("\n", " ").strip() for cell in raw_row]
                    for raw_row in table
                )
                # Rows whose cells are all empty carry no information.
                lines = [" | ".join(cells) for cells in cleaned_rows if any(cells)]
                if lines:
                    flattened_tables.append("\n".join(lines))

            boxes = [found.bbox for found in (page.find_tables() or [])]

            def keep_object(obj):
                # Only characters are filtered; every other object is kept.
                if obj.get("object_type") != "char":
                    return True
                mid_x = (obj["x0"] + obj["x1"]) / 2
                mid_y = (obj["top"] + obj["bottom"]) / 2
                # Keep the character unless its center falls inside a table.
                return not any(
                    left <= mid_x <= right and upper <= mid_y <= lower
                    for left, upper, right, lower in boxes
                )

            text_source = page.filter(keep_object) if boxes else page
            body = text_source.extract_text() or ""
            if body:
                page_bodies.append(body)

    sections: list[str] = []
    if page_bodies:
        sections.append("\n\n".join(page_bodies))
    if flattened_tables:
        sections.append("[표]\n" + "\n\n".join(flattened_tables))
    return "\n\n".join(sections)


def _image_to_text(img_path: Path) -> str:
    """Run Korean Tesseract OCR over a photographed document.

    The whole image is OCR'd in one pass and the raw OCR output is returned
    unchanged; no noise cleanup and no table-specific second pass happen here
    (callers run ``normalize`` on the result).

    Args:
        img_path: Path of the image file (.jpg/.jpeg/.png/.webp).

    Returns:
        The text recognised by Tesseract, exactly as it was produced.

    Raises:
        ParserError: If pytesseract/Pillow are not installed, the image cannot
            be opened or decoded, or the OCR call itself fails. The original
            exception is attached as ``__cause__``.
    """
    # Imported lazily so the module stays importable without the OCR stack.
    try:
        import pytesseract
        from PIL import Image
    except ImportError as e:
        raise ParserError(f"OCR 의존 미설치: {e}. Docker 재빌드 필요.") from e

    try:
        # Image.open is lazy and keeps the file handle open; the context
        # manager closes it deterministically so the caller's temp directory
        # can be removed. convert() returns an independent in-memory copy.
        with Image.open(img_path) as source:
            img = source.convert("RGB")
    except Exception as e:
        raise ParserError(f"이미지 열기 실패: {e}") from e

    try:
        # psm 3: automatic layout detection, suited to notices that mix
        # tables and paragraphs.
        text = pytesseract.image_to_string(img, lang="kor", config="--psm 3 --oem 1")
    except Exception as e:
        raise ParserError(f"Tesseract OCR 실패: {e}") from e

    return text


# XML namespaces of ODT content.xml, written in ElementTree's "{uri}" prefix
# form so they can be concatenated directly with a local tag name.
_ODT_TEXT_NS = "{urn:oasis:names:tc:opendocument:xmlns:text:1.0}"
_ODT_TABLE_NS = "{urn:oasis:names:tc:opendocument:xmlns:table:1.0}"
_ODT_DRAW_NS = "{urn:oasis:names:tc:opendocument:xmlns:drawing:1.0}"


def _hwp_to_odt(hwp_path: Path, out_dir: Path) -> Path:
    """Convert an HWP/HWPX file to ODT with headless LibreOffice.

    The ODT route avoids the doubled-character problem of the HWP→PDF route:
    ODT is a zip whose content.xml holds body and tables once, in one tree.

    Args:
        hwp_path: Path of the source document.
        out_dir: Directory that receives ``<hwp_path.stem>.odt``; created if
            it does not exist.

    Returns:
        Path of the generated ODT file.

    Raises:
        ParserError: If LibreOffice cannot be started, the conversion exceeds
            ``LIBREOFFICE_TIMEOUT_SECONDS``, or no output file was produced.
    """
    out_dir.mkdir(parents=True, exist_ok=True)
    command = [
        "libreoffice", "--headless",
        "--convert-to", "odt",
        "--outdir", str(out_dir),
        str(hwp_path),
    ]
    try:
        result = subprocess.run(
            command,
            # errors="replace": undecodable bytes in LibreOffice's output must
            # not abort the conversion with UnicodeDecodeError.
            capture_output=True, text=True, errors="replace",
            timeout=LIBREOFFICE_TIMEOUT_SECONDS,
        )
    except subprocess.TimeoutExpired as exc:
        raise ParserError(
            f"LibreOffice 변환 타임아웃 ({LIBREOFFICE_TIMEOUT_SECONDS}초 초과). "
            "큰 HWP면 PARSER_LIBREOFFICE_TIMEOUT 환경변수로 늘릴 수 있음."
        ) from exc
    except OSError as exc:
        # Binary missing or not executable: report it through the module's
        # single exception type instead of leaking FileNotFoundError.
        raise ParserError(
            f"LibreOffice 실행 실패 (설치/PATH 확인 필요): {exc}"
        ) from exc

    odt_path = out_dir / f"{hwp_path.stem}.odt"
    # Per the original note, H2Orestart often dies with signal 11 during
    # cleanup after a successful conversion while the output file is fine.
    # Existence of the file is therefore the success criterion; returncode and
    # stderr are only reported for diagnosis.
    if not odt_path.exists():
        raise ParserError(
            f"LibreOffice ODT 변환 실패 (출력 파일 없음). "
            f"returncode={result.returncode}, stderr={result.stderr.strip()[:200]}"
        )
    return odt_path


def _own_table_rows(table: ET.Element):
    """Yield the rows belonging to ``table`` itself, in document order.

    The walk goes through wrapper elements below the table but never descends
    into a row (or into another table), so rows of tables nested inside cells
    are not yielded.
    """
    table_tag = _ODT_TABLE_NS + "table"
    row_tag = _ODT_TABLE_NS + "table-row"
    pending = list(table)[::-1]  # stack; reversed so pop() keeps document order
    while pending:
        node = pending.pop()
        if node.tag == row_tag:
            yield node
        elif node.tag != table_tag:
            pending.extend(list(node)[::-1])


def _odt_to_text(odt_path: Path, mark_header: bool = False) -> str:
    """Flatten the content.xml of an ODT (zip) file into body text plus tables.

    Paragraphs and headings inside a table or a draw:frame are left out of the
    body so their text is not emitted twice. Each table becomes one block: the
    non-empty cell texts of a row are joined with a space and rows are joined
    with newlines, without a ``|`` separator or ``[표]`` marker (per the
    original note, the downstream sentence splitter separates cells by header
    keywords — not verifiable from this file).

    A table nested inside a cell is emitted once, as part of that cell's text.
    It is not repeated as extra cells, extra rows, or a separate table block.

    Args:
        odt_path: Path of the ODT file.
        mark_header: When True, a table's first row (if it has any non-empty
            cell) is prefixed with ``"[헤더] "`` and its cells are joined with
            ``" | "``. Defaults to False so existing callers are unaffected.

    Returns:
        Body paragraphs joined by newlines, a blank line, then the table
        blocks separated by blank lines. Empty string when nothing is found.
    """
    with zipfile.ZipFile(odt_path) as z:
        with z.open("content.xml") as f:
            tree = ET.parse(f)

    table_tag = _ODT_TABLE_NS + "table"
    cell_tag = _ODT_TABLE_NS + "table-cell"
    paragraph_tags = (_ODT_TEXT_NS + "p", _ODT_TEXT_NS + "h")

    tables = list(tree.iter(table_tag))

    # ids of every element inside a table or draw:frame; those are skipped
    # when collecting body text. The tree stays alive for the whole function,
    # so id() values are stable.
    skipped_ids: set[int] = set()
    nested_table_ids: set[int] = set()
    for table in tables:
        for elem in table.iter():
            skipped_ids.add(id(elem))
            if elem is not table and elem.tag == table_tag:
                nested_table_ids.add(id(elem))
    # draw:frame (text boxes / image frames): per the original note these
    # repeat body text and caused 3x+ duplication.
    for frame in tree.iter(_ODT_DRAW_NS + "frame"):
        skipped_ids.update(id(elem) for elem in frame.iter())

    body_parts: list[str] = []
    for elem in tree.iter():
        if elem.tag not in paragraph_tags or id(elem) in skipped_ids:
            continue
        text = "".join(elem.itertext()).strip()
        if text:
            body_parts.append(text)

    table_blocks: list[str] = []
    for table in tables:
        if id(table) in nested_table_ids:
            # Its text is already contained in the enclosing cell's text.
            continue
        rows: list[str] = []
        for row_idx, row in enumerate(_own_table_rows(table)):
            # Direct children only: Element.iter() would also return the
            # cells of tables nested inside this row.
            cells: list[str] = []
            for cell in row.findall(cell_tag):
                cell_text = "".join(cell.itertext()).strip()
                if cell_text:
                    cells.append(cell_text)
            if not cells:
                continue
            if mark_header and row_idx == 0:
                rows.append("[헤더] " + " | ".join(cells))
            else:
                rows.append(" ".join(cells))
        if rows:
            table_blocks.append("\n".join(rows))

    parts: list[str] = []
    if body_parts:
        parts.append("\n".join(body_parts))
    if table_blocks:
        parts.append("\n\n".join(table_blocks))
    return "\n\n".join(parts)


def parse_bytes_to_text(data: bytes, filename: str) -> str:
    """Turn uploaded bytes plus their file name into normalized clean text.

    Callers (routers) do not need to branch on the file type; the suffix of
    ``filename`` selects the extractor. ``filename`` is used only to read that
    suffix and never becomes part of a path or a command.

    Args:
        data: Raw upload content.
        filename: Original file name.

    Returns:
        The normalized text. Empty string when ``data`` is empty (this is
        checked before the suffix is validated).

    Raises:
        ParserError: If the suffix is not in ``ALLOWED_EXTS``, or the selected
            extractor reports a failure.
    """
    if not data:
        return ""

    suffix = Path(filename).suffix.lower()

    # Whitelist check: reject unknown suffixes early. (subprocess is called in
    # list form, so command injection is not possible anyway; this only
    # reduces the surface.) A missing suffix is allowed and treated as text.
    if suffix and suffix not in ALLOWED_EXTS:
        raise ParserError(f"지원하지 않는 파일 형식: {suffix}")

    if suffix in TEXT_EXTS or suffix == "":
        # "utf-8-sig" decodes exactly like "utf-8" but drops a leading BOM.
        # With plain "utf-8" the BOM survives as U+FEFF, which str.strip()
        # does not treat as whitespace, so it would leak into the clean text.
        return normalize(data.decode("utf-8-sig", errors="replace"))

    with tempfile.TemporaryDirectory() as tmp:
        tmp_dir = Path(tmp)
        # The on-disk path is always a fixed name inside the temp directory;
        # only the whitelisted suffix is taken from the original file name.
        src_path = tmp_dir / f"input{suffix}"
        src_path.write_bytes(data)

        if suffix in PDF_EXTS:
            return normalize(_pdf_to_text(src_path))

        if suffix in HWP_EXTS:
            odt_path = _hwp_to_odt(src_path, tmp_dir)
            return normalize(_odt_to_text(odt_path))

        if suffix in IMG_EXTS:
            return normalize(_image_to_text(src_path))

    # Defensive: only reachable if ALLOWED_EXTS gains a suffix without a
    # matching branch above.
    raise ParserError(f"지원하지 않는 파일 형식: {suffix}")