Spaces:

Corin1998
/

IR_PR_PilotPro

Sleeping

App Files Community

Corin1998 committed on Sep 17, 2025

Commit

ab914af

verified ·

1 Parent(s): e04eca5

Update rag/ingest.py

Browse files

Files changed (1) hide show

rag/ingest.py +35 -38

rag/ingest.py CHANGED Viewed

@@ -1,6 +1,6 @@
 # rag/ingest.py
 from __future__ import annotations
-import io, uuid, re, os
 from typing import List, Dict, Tuple, Optional
 from irpr.deps import add_to_index
@@ -10,7 +10,6 @@ from irpr.deps import add_to_index
 # =========================
 def _extract_with_pypdf(pdf_bytes: bytes) -> str:
-    # pypdf → PyPDF2 の順で試す
     try:
         from pypdf import PdfReader  # type: ignore
         reader = PdfReader(io.BytesIO(pdf_bytes))
@@ -33,10 +32,6 @@ def _extract_with_pdfminer(pdf_bytes: bytes) -> str:
         return ""
 def _extract_with_pymupdf_text(pdf_bytes: bytes) -> Tuple[str, int, int]:
-    """
-    PyMuPDF で page.get_text("text") を取得。
-    返り値: (全文, 文字数合計, ページ数)
-    """
     try:
         import fitz  # PyMuPDF
     except Exception:
@@ -56,37 +51,41 @@ def _extract_with_pymupdf_text(pdf_bytes: bytes) -> Tuple[str, int, int]:
             buf.append(t)
         except Exception:
             buf.append("")
     doc.close()
-    return ("\n".join(buf), text_chars, len(buf))
 def _ocr_with_tesseract_via_pymupdf(pdf_bytes: bytes, dpi_scale: float = 2.0) -> str:
     """
-    Tesseract OCR で画像ベースPDFからテキスト抽出。
-    - pytesseract & Tesseract 本体が必要
-    - 依存が無い環境では例外を投げずに空文字返す
     """
     try:
         import fitz  # PyMuPDF
         from PIL import Image
         import pytesseract
     except Exception:
-        return ""  # OCR不可能（依存未導入）
     try:
         doc = fitz.open(stream=pdf_bytes, filetype="pdf")
     except Exception:
         return ""
-    lang = os.environ.get("TESSERACT_LANG", "jpn+eng")  # 日本語+英語をデフォルト
     text_buf: List[str] = []
     for i in range(len(doc)):
         try:
             page = doc.load_page(i)
-            # DPI ~ 72 * scale。2.0 なら 144dpi 相当
             mat = fitz.Matrix(dpi_scale, dpi_scale)
-            pix = page.get_pixmap(matrix=mat, alpha=False)  # RGB
             img = Image.frombytes("RGB", [pix.width, pix.height], pix.samples)
-            t = pytesseract.image_to_string(img, lang=lang) or ""
             text_buf.append(t)
         except Exception:
             text_buf.append("")
@@ -99,11 +98,8 @@ def _is_meaningful(text: str, min_len: int = 10) -> bool:
 def pdf_bytes_to_text(pdf_bytes: bytes) -> Tuple[str, Dict]:
     """
     PDF → テキスト。抽出メタを返す:
-      meta = {
-        "method": "pypdf|pdfminer|pymupdf|ocr",
-        "scanned_likely": bool,
-        "pages": int
-      }
     """
     # 1) pypdf / PyPDF2
     text = _extract_with_pypdf(pdf_bytes)
@@ -116,20 +112,23 @@ def pdf_bytes_to_text(pdf_bytes: bytes) -> Tuple[str, Dict]:
         return text, {"method": "pdfminer", "scanned_likely": False, "pages": None}
     # 3) PyMuPDF get_text
-    pm_text, text_chars, pages = _extract_with_pymupdf_text(pdf_bytes)
     if _is_meaningful(pm_text):
         return pm_text, {"method": "pymupdf", "scanned_likely": False, "pages": pages}
-    # 4) ここまで空 ⇒ スキャンPDFの可能性が高い → OCR を試す
-    ocr_text = _ocr_with_tesseract_via_pymupdf(pdf_bytes)
     if _is_meaningful(ocr_text):
         return ocr_text, {"method": "ocr", "scanned_likely": True, "pages": pages or None}
-    # OCR も不可（依存未導入 or 画像品質不良）
     raise RuntimeError(
-        "Failed to parse PDF with pypdf/PyPDF2/pdfminer.six/PyMuPDF. "
-        "This looks like a scanned (image) PDF and OCR was not available or failed. "
-        "Install Tesseract + pytesseract for OCR (e.g., apt-get install tesseract-ocr; pip install pytesseract Pillow PyMuPDF)."
     )
 # =========================
@@ -143,12 +142,7 @@ def normalize_text(s: str) -> str:
     s = re.sub(r"\n{3,}", "\n\n", s)
     return s.strip()
-def chunk_text(
-    text: str,
-    chunk_size: int = 1200,
-    overlap: int = 200,
-    min_chunk: int = 200,
-) -> List[str]:
     text = text.strip()
     if not text:
         return []
@@ -174,14 +168,17 @@ def chunk_text(
 # =========================
 def ingest_pdf_bytes(title: str, source_url: str, pdf_bytes: bytes) -> int:
     """
-    アップロード API から渡された PDF バイト列を解析・登録（ファイルは書かない）
-    - テキスト抽出は pypdf→pdfminer→PyMuPDF→(任意)OCR の順にフォールバック
-    - OCR は pytesseract + Tesseract がある場合のみ自動使用
     """
     if not pdf_bytes:
-        raise ValueError("empty pdf_bytes")
-    raw, meta = pdf_bytes_to_text(pdf_bytes)
     txt = normalize_text(raw)
     if not _is_meaningful(txt):
         raise RuntimeError("Parsed text is too short or empty after normalization")

 # rag/ingest.py
 from __future__ import annotations
+import io, uuid, re, os, traceback
 from typing import List, Dict, Tuple, Optional
 from irpr.deps import add_to_index
 # =========================
 def _extract_with_pypdf(pdf_bytes: bytes) -> str:
     try:
         from pypdf import PdfReader  # type: ignore
         reader = PdfReader(io.BytesIO(pdf_bytes))
         return ""
 def _extract_with_pymupdf_text(pdf_bytes: bytes) -> Tuple[str, int, int]:
     try:
         import fitz  # PyMuPDF
     except Exception:
             buf.append(t)
         except Exception:
             buf.append("")
+    pages = len(buf)
     doc.close()
+    return ("\n".join(buf), text_chars, pages)
 def _ocr_with_tesseract_via_pymupdf(pdf_bytes: bytes, dpi_scale: float = 2.0) -> str:
     """
+    Tesseract OCR（任意）。pytesseract / Tesseract 本体が無い場合は空文字で返す。
+    Tesseract の未導入や言語データ欠如（jpn.traineddata 無し）による FileNotFoundError も
+    ここで握りつぶして空文字を返します（上位で「OCRが必要」として案内）。
     """
     try:
         import fitz  # PyMuPDF
         from PIL import Image
         import pytesseract
     except Exception:
+        return ""  # OCR不可（依存未導入）
     try:
         doc = fitz.open(stream=pdf_bytes, filetype="pdf")
     except Exception:
         return ""
+    lang = os.environ.get("TESSERACT_LANG", "jpn+eng")  # 日本語+英語
     text_buf: List[str] = []
     for i in range(len(doc)):
         try:
             page = doc.load_page(i)
             mat = fitz.Matrix(dpi_scale, dpi_scale)
+            pix = page.get_pixmap(matrix=mat, alpha=False)
             img = Image.frombytes("RGB", [pix.width, pix.height], pix.samples)
+            try:
+                t = pytesseract.image_to_string(img, lang=lang) or ""
+            except FileNotFoundError:
+                # tesseract バイナリ or lang データが無い
+                t = ""
             text_buf.append(t)
         except Exception:
             text_buf.append("")
 def pdf_bytes_to_text(pdf_bytes: bytes) -> Tuple[str, Dict]:
     """
     PDF → テキスト。抽出メタを返す:
+      meta = {"method": "...", "scanned_likely": bool, "pages": int|None}
+    FileNotFoundError（tesseract 未導入 等）はここで潰して RuntimeError にまとめます。
     """
     # 1) pypdf / PyPDF2
     text = _extract_with_pypdf(pdf_bytes)
         return text, {"method": "pdfminer", "scanned_likely": False, "pages": None}
     # 3) PyMuPDF get_text
+    pm_text, _chars, pages = _extract_with_pymupdf_text(pdf_bytes)
     if _is_meaningful(pm_text):
         return pm_text, {"method": "pymupdf", "scanned_likely": False, "pages": pages}
+    # 4) OCR（依存が無い/学習データが無い時の FileNotFoundError は握りつぶして空文字）
+    try:
+        ocr_text = _ocr_with_tesseract_via_pymupdf(pdf_bytes)
+    except FileNotFoundError:
+        ocr_text = ""
     if _is_meaningful(ocr_text):
         return ocr_text, {"method": "ocr", "scanned_likely": True, "pages": pages or None}
+    # 5) ここまで全滅 → 明示的に RuntimeError
     raise RuntimeError(
+        "PDFテキスト抽出に失敗しました（pypdf/PyPDF2/pdfminer.six/PyMuPDF/OCR）。"
+        "スキャンPDFの可能性が高いです。OCR を有効化するには "
+        "『tesseract-ocr + pytesseract（必要なら tesseract-ocr-jpn）』を導入してください。"
     )
 # =========================
     s = re.sub(r"\n{3,}", "\n\n", s)
     return s.strip()
+def chunk_text(text: str, chunk_size: int = 1200, overlap: int = 200, min_chunk: int = 200) -> List[str]:
     text = text.strip()
     if not text:
         return []
 # =========================
 def ingest_pdf_bytes(title: str, source_url: str, pdf_bytes: bytes) -> int:
     """
+    PDF バイト列を解析し、チャンクをインデックス登録（ファイルは書かない）
+    例外は RuntimeError に正規化して上位に伝えます（FileNotFound は潰す）
     """
     if not pdf_bytes:
+        raise RuntimeError("empty pdf_bytes")
+    try:
+        raw, meta = pdf_bytes_to_text(pdf_bytes)
+    except FileNotFoundError as e:
+        raise RuntimeError(f"OCR 実行に必要なバイナリ/言語データが見つかりません: {e}") from e
     txt = normalize_text(raw)
     if not _is_meaningful(txt):
         raise RuntimeError("Parsed text is too short or empty after normalization")