Spaces:

Corin1998
/

IR_PR_PilotPro

Sleeping

File size: 7,322 Bytes

# rag/ingest.py
from __future__ import annotations
import io, uuid, re, os, traceback
from typing import List, Dict, Tuple, Optional

from irpr.deps import add_to_index

# =========================
# PDF → テキスト（多段フォールバック）
# =========================

def _extract_with_pypdf(pdf_bytes: bytes) -> str:
    try:
        from pypdf import PdfReader  # type: ignore
        reader = PdfReader(io.BytesIO(pdf_bytes))
        texts = [(p.extract_text() or "") for p in reader.pages]
        return "\n".join(texts)
    except Exception:
        try:
            import PyPDF2  # type: ignore
            reader = PyPDF2.PdfReader(io.BytesIO(pdf_bytes))
            texts = [(p.extract_text() or "") for p in reader.pages]
            return "\n".join(texts)
        except Exception:
            return ""

def _extract_with_pdfminer(pdf_bytes: bytes) -> str:
    try:
        from pdfminer.high_level import extract_text  # type: ignore
        return extract_text(io.BytesIO(pdf_bytes)) or ""
    except Exception:
        return ""

def _extract_with_pymupdf_text(pdf_bytes: bytes) -> Tuple[str, int, int]:
    try:
        import fitz  # PyMuPDF
    except Exception:
        return "", 0, 0
    try:
        doc = fitz.open(stream=pdf_bytes, filetype="pdf")
    except Exception:
        return "", 0, 0

    buf: List[str] = []
    text_chars = 0
    for i in range(len(doc)):
        try:
            page = doc.load_page(i)
            t = page.get_text("text") or ""
            text_chars += len(t.strip())
            buf.append(t)
        except Exception:
            buf.append("")
    pages = len(buf)
    doc.close()
    return ("\n".join(buf), text_chars, pages)

def _ocr_with_tesseract_via_pymupdf(pdf_bytes: bytes, dpi_scale: float = 2.0) -> str:
    """
    Tesseract OCR（任意）。pytesseract / Tesseract 本体が無い場合は空文字で返す。
    Tesseract の未導入や言語データ欠如（jpn.traineddata 無し）による FileNotFoundError も
    ここで握りつぶして空文字を返します（上位で「OCRが必要」として案内）。
    """
    try:
        import fitz  # PyMuPDF
        from PIL import Image
        import pytesseract
    except Exception:
        return ""  # OCR不可（依存未導入）

    try:
        doc = fitz.open(stream=pdf_bytes, filetype="pdf")
    except Exception:
        return ""

    lang = os.environ.get("TESSERACT_LANG", "jpn+eng")  # 日本語+英語
    text_buf: List[str] = []
    for i in range(len(doc)):
        try:
            page = doc.load_page(i)
            mat = fitz.Matrix(dpi_scale, dpi_scale)
            pix = page.get_pixmap(matrix=mat, alpha=False)
            img = Image.frombytes("RGB", [pix.width, pix.height], pix.samples)
            try:
                t = pytesseract.image_to_string(img, lang=lang) or ""
            except FileNotFoundError:
                # tesseract バイナリ or lang データが無い
                t = ""
            text_buf.append(t)
        except Exception:
            text_buf.append("")
    doc.close()
    return "\n".join(text_buf).strip()

def _is_meaningful(text: str, min_len: int = 10) -> bool:
    return bool(text and text.strip() and len(text.strip()) >= min_len)

def pdf_bytes_to_text(pdf_bytes: bytes) -> Tuple[str, Dict]:
    """
    Convert PDF bytes to text by trying extractors in a fixed order.

    Order: pypdf / PyPDF2, pdfminer.six, the PyMuPDF text layer, then OCR.
    The first stage whose output passes ``_is_meaningful`` wins.

    Returns ``(text, meta)`` where
      meta = {"method": "...", "scanned_likely": bool, "pages": int|None}

    A FileNotFoundError from the OCR stage is treated as "no OCR text".
    RuntimeError is raised when no stage yields meaningful text.
    """
    # 1) pypdf / PyPDF2, then 2) pdfminer.six. Neither stage reports a page
    # count, so "pages" is None for both.
    text_only_stages = (
        ("pypdf", _extract_with_pypdf),
        ("pdfminer", _extract_with_pdfminer),
    )
    for method, extract in text_only_stages:
        candidate = extract(pdf_bytes)
        if _is_meaningful(candidate):
            return candidate, {"method": method, "scanned_likely": False, "pages": None}

    # 3) PyMuPDF get_text; the page count is reused by the OCR stage's meta.
    pm_text, _, page_count = _extract_with_pymupdf_text(pdf_bytes)
    if _is_meaningful(pm_text):
        return pm_text, {"method": "pymupdf", "scanned_likely": False, "pages": page_count}

    # 4) OCR. A FileNotFoundError (missing binary / language data) counts as
    # an empty result instead of propagating.
    try:
        ocr_text = _ocr_with_tesseract_via_pymupdf(pdf_bytes)
    except FileNotFoundError:
        ocr_text = ""
    if _is_meaningful(ocr_text):
        return ocr_text, {"method": "ocr", "scanned_likely": True, "pages": page_count or None}

    # 5) Every stage came up empty: fail explicitly.
    raise RuntimeError(
        "PDFテキスト抽出に失敗しました（pypdf/PyPDF2/pdfminer.six/PyMuPDF/OCR）。"
        "スキャンPDFの可能性が高いです。OCR を有効化するには "
        "『tesseract-ocr + pytesseract（必要なら tesseract-ocr-jpn）』を導入してください。"
    )

# =========================
# テキスト整形・分割
# =========================
# Matches a run of ASCII spaces, tabs, and full-width (U+3000) spaces.
_WS_RE = re.compile(r"[ \t\u3000]+")


def normalize_text(s: str) -> str:
    """Normalize line endings and whitespace in extracted text.

    Steps, in order: convert CRLF and bare CR to LF, collapse each run of
    spaces / tabs / full-width spaces to one ASCII space, reduce three or
    more consecutive newlines to exactly two, and strip both ends.
    """
    unified_newlines = s.replace("\r\n", "\n").replace("\r", "\n")
    single_spaced = _WS_RE.sub(" ", unified_newlines)
    compact = re.sub(r"\n{3,}", "\n\n", single_spaced)
    return compact.strip()

def chunk_text(text: str, chunk_size: int = 1200, overlap: int = 200, min_chunk: int = 200) -> List[str]:
    """Split text into overlapping character windows.

    Args:
        text: Text to split; it is stripped first.
        chunk_size: Maximum window length in characters. Must be positive.
        overlap: Number of characters shared by consecutive windows. The
            window start always advances by at least one character, even
            when ``overlap >= chunk_size``.
        min_chunk: If the final chunk is shorter than this and there is a
            previous chunk, the two are merged into one.

    Returns:
        The stripped, non-empty chunks in document order; ``[]`` for empty
        or whitespace-only text.

    Raises:
        ValueError: If ``chunk_size`` is not positive.
    """
    if chunk_size <= 0:
        # A non-positive size either produces nothing after a pointless scan
        # or, when negative, slices with negative indices.
        raise ValueError("chunk_size must be a positive integer")

    text = text.strip()
    if not text:
        return []

    n = len(text)
    step = max(1, chunk_size - overlap)

    # Record window offsets so a tail merge can re-slice the source text.
    spans: List[Tuple[int, int]] = []
    i = 0
    while i < n:
        j = min(n, i + chunk_size)
        if text[i:j].strip():
            spans.append((i, j))
        if j >= n:
            break
        i += step

    chunks = [text[start:end].strip() for start, end in spans]

    if len(chunks) >= 2 and len(chunks[-1]) < min_chunk:
        prev_start, prev_end = spans[-2]
        last_start, last_end = spans[-1]
        if last_start <= prev_end:
            # The two windows touch or overlap: take the contiguous source
            # span so the overlap region is not duplicated and no artificial
            # newline is inserted in the middle of the text.
            merged = text[prev_start:last_end].strip()
        else:
            # Whitespace-only windows were skipped in between; join the two
            # chunks rather than pulling that whitespace back in.
            merged = (chunks[-2] + "\n" + chunks[-1]).strip()
        chunks[-2:] = [merged]
    return chunks

# =========================
# 外部公開 API
# =========================
def ingest_pdf_bytes(title: str, source_url: str, pdf_bytes: bytes) -> int:
    """
    Parse PDF bytes and register the resulting chunks in the index.

    Nothing is written to disk by this function. The extracted text is
    normalized, split into chunks, and each chunk becomes one record that
    carries a freshly generated document id, a zero-padded chunk id, the
    title, the source URL, and the chunk text.

    Returns the value reported by ``add_to_index``, converted to ``int``.

    Raises RuntimeError when ``pdf_bytes`` is empty, when extraction reports
    a FileNotFoundError (re-raised as RuntimeError with the original as its
    cause), or when the normalized text is too short. A RuntimeError raised
    by the extraction step itself propagates unchanged.
    """
    if not pdf_bytes:
        raise RuntimeError("empty pdf_bytes")

    try:
        raw_text, _meta = pdf_bytes_to_text(pdf_bytes)
    except FileNotFoundError as e:
        raise RuntimeError(f"OCR 実行に必要なバイナリ/言語データが見つかりません: {e}") from e

    normalized = normalize_text(raw_text)
    if not _is_meaningful(normalized):
        raise RuntimeError("Parsed text is too short or empty after normalization")

    doc_id = str(uuid.uuid4())
    pieces = chunk_text(normalized, chunk_size=1200, overlap=200, min_chunk=200)

    # One record per chunk; every record of this document shares doc_id.
    records: List[Dict] = [
        {
            "doc_id": doc_id,
            "chunk_id": f"{position:04d}",
            "title": title,
            "source_url": source_url,
            "text": piece,
        }
        for position, piece in enumerate(pieces)
    ]

    return int(add_to_index(records))

# 互換ダミー
def ingest_edinet_for_company(edinet_code: str, date: Optional[str] = None) -> int:
    """Compatibility placeholder: ingests nothing and always returns 0.

    Both arguments are accepted and ignored.
    """
    ingested_count = 0
    return ingested_count

def download_edinet_pdf(*args, **kwargs):
    """Placeholder that always raises NotImplementedError.

    Any positional or keyword arguments are accepted and ignored.
    """
    message = "download_edinet_pdf is not implemented in this minimal build."
    raise NotImplementedError(message)