Spaces:
Sleeping
Sleeping
Update rag/ingest.py
Browse files- rag/ingest.py +105 -52
rag/ingest.py
CHANGED
|
@@ -1,61 +1,136 @@
|
|
| 1 |
# rag/ingest.py
|
| 2 |
from __future__ import annotations
|
| 3 |
-
import io, uuid, re
|
| 4 |
from typing import List, Dict, Tuple, Optional
|
| 5 |
|
| 6 |
-
# ベクタ登録は deps 側に委譲(保存先の作成・権限などもそちらで面倒を見ます)
|
| 7 |
from irpr.deps import add_to_index
|
| 8 |
|
| 9 |
# =========================
|
| 10 |
-
# PDF → テキスト(
|
| 11 |
# =========================
|
|
|
|
| 12 |
def _extract_with_pypdf(pdf_bytes: bytes) -> str:
|
|
|
|
| 13 |
try:
|
| 14 |
-
# pypdf(新名称)
|
| 15 |
from pypdf import PdfReader # type: ignore
|
| 16 |
reader = PdfReader(io.BytesIO(pdf_bytes))
|
| 17 |
-
texts = []
|
| 18 |
-
for p in reader.pages:
|
| 19 |
-
# extract_text() が None の場合があるのでガード
|
| 20 |
-
t = p.extract_text() or ""
|
| 21 |
-
texts.append(t)
|
| 22 |
return "\n".join(texts)
|
| 23 |
except Exception:
|
| 24 |
-
# 旧パッケージ名 PyPDF2 にフォールバック
|
| 25 |
try:
|
| 26 |
import PyPDF2 # type: ignore
|
| 27 |
reader = PyPDF2.PdfReader(io.BytesIO(pdf_bytes))
|
| 28 |
-
texts = []
|
| 29 |
-
for p in reader.pages:
|
| 30 |
-
t = p.extract_text() or ""
|
| 31 |
-
texts.append(t)
|
| 32 |
return "\n".join(texts)
|
| 33 |
except Exception:
|
| 34 |
return ""
|
| 35 |
|
| 36 |
def _extract_with_pdfminer(pdf_bytes: bytes) -> str:
|
| 37 |
try:
|
| 38 |
-
# pdfminer.six(純Python・精度高め)
|
| 39 |
from pdfminer.high_level import extract_text # type: ignore
|
| 40 |
-
# file-like を渡せる
|
| 41 |
return extract_text(io.BytesIO(pdf_bytes)) or ""
|
| 42 |
except Exception:
|
| 43 |
return ""
|
| 44 |
|
| 45 |
-
def
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 46 |
# 1) pypdf / PyPDF2
|
| 47 |
text = _extract_with_pypdf(pdf_bytes)
|
| 48 |
if _is_meaningful(text):
|
| 49 |
-
return text
|
|
|
|
| 50 |
# 2) pdfminer.six
|
| 51 |
text = _extract_with_pdfminer(pdf_bytes)
|
| 52 |
if _is_meaningful(text):
|
| 53 |
-
return text
|
| 54 |
-
|
| 55 |
-
|
|
|
|
|
|
|
|
|
|
| 56 |
|
| 57 |
-
|
| 58 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 59 |
|
| 60 |
# =========================
|
| 61 |
# テキスト整形・分割
|
|
@@ -63,10 +138,8 @@ def _is_meaningful(text: str) -> bool:
|
|
| 63 |
_WS_RE = re.compile(r"[ \t\u3000]+") # 半角/全角スペース畳み込み
|
| 64 |
|
| 65 |
def normalize_text(s: str) -> str:
|
| 66 |
-
# 改行は温存しつつ、連続スペースを1つに
|
| 67 |
s = s.replace("\r\n", "\n").replace("\r", "\n")
|
| 68 |
s = _WS_RE.sub(" ", s)
|
| 69 |
-
# 連続改行は最大2に
|
| 70 |
s = re.sub(r"\n{3,}", "\n\n", s)
|
| 71 |
return s.strip()
|
| 72 |
|
|
@@ -76,20 +149,13 @@ def chunk_text(
|
|
| 76 |
overlap: int = 200,
|
| 77 |
min_chunk: int = 200,
|
| 78 |
) -> List[str]:
|
| 79 |
-
"""
|
| 80 |
-
文字数ベースのシンプル分割。
|
| 81 |
-
- overlap で前後文脈を少し残す
|
| 82 |
-
- 最終チャンクが短すぎる場合は前チャンクに吸収
|
| 83 |
-
"""
|
| 84 |
text = text.strip()
|
| 85 |
if not text:
|
| 86 |
return []
|
| 87 |
-
|
| 88 |
chunks: List[str] = []
|
| 89 |
i = 0
|
| 90 |
n = len(text)
|
| 91 |
step = max(1, chunk_size - overlap)
|
| 92 |
-
|
| 93 |
while i < n:
|
| 94 |
j = min(n, i + chunk_size)
|
| 95 |
chunk = text[i:j].strip()
|
|
@@ -98,12 +164,9 @@ def chunk_text(
|
|
| 98 |
if j >= n:
|
| 99 |
break
|
| 100 |
i += step
|
| 101 |
-
|
| 102 |
-
# 末尾が短すぎる場合はマージ
|
| 103 |
if len(chunks) >= 2 and len(chunks[-1]) < min_chunk:
|
| 104 |
chunks[-2] = (chunks[-2] + "\n" + chunks[-1]).strip()
|
| 105 |
chunks.pop()
|
| 106 |
-
|
| 107 |
return chunks
|
| 108 |
|
| 109 |
# =========================
|
|
@@ -111,22 +174,18 @@ def chunk_text(
|
|
| 111 |
# =========================
|
| 112 |
def ingest_pdf_bytes(title: str, source_url: str, pdf_bytes: bytes) -> int:
|
| 113 |
"""
|
| 114 |
-
アップロード API から渡された PDF バイト列を
|
| 115 |
-
-
|
| 116 |
-
-
|
| 117 |
-
Returns: 追加チャンク数
|
| 118 |
"""
|
| 119 |
if not pdf_bytes:
|
| 120 |
raise ValueError("empty pdf_bytes")
|
| 121 |
|
| 122 |
-
|
| 123 |
-
raw = pdf_bytes_to_text(pdf_bytes)
|
| 124 |
txt = normalize_text(raw)
|
| 125 |
if not _is_meaningful(txt):
|
| 126 |
-
|
| 127 |
-
raise RuntimeError("Parsed text is too short or empty")
|
| 128 |
|
| 129 |
-
# チャンク分割
|
| 130 |
doc_id = str(uuid.uuid4())
|
| 131 |
chunks = chunk_text(txt, chunk_size=1200, overlap=200, min_chunk=200)
|
| 132 |
|
|
@@ -136,22 +195,16 @@ def ingest_pdf_bytes(title: str, source_url: str, pdf_bytes: bytes) -> int:
|
|
| 136 |
"doc_id": doc_id,
|
| 137 |
"chunk_id": f"{idx:04d}",
|
| 138 |
"title": title,
|
| 139 |
-
"source_url": source_url,
|
| 140 |
"text": ck,
|
| 141 |
})
|
| 142 |
|
| 143 |
-
# ベクタ登録(保存先は deps 側が責任もって作成・権限付与)
|
| 144 |
added = add_to_index(records)
|
| 145 |
return int(added)
|
| 146 |
|
| 147 |
-
#
|
| 148 |
def ingest_edinet_for_company(edinet_code: str, date: Optional[str] = None) -> int:
|
| 149 |
-
"""
|
| 150 |
-
将来的に EDINET ダウンロード → 解析を実装する場合のフック。
|
| 151 |
-
現状はアップロード PDF の処理に一本化しているため 0 を返す。
|
| 152 |
-
"""
|
| 153 |
return 0
|
| 154 |
|
| 155 |
-
# 参考: 以前のコード互換のため残しておく(未使用)
|
| 156 |
def download_edinet_pdf(*args, **kwargs):
|
| 157 |
raise NotImplementedError("download_edinet_pdf is not implemented in this minimal build.")
|
|
|
|
| 1 |
# rag/ingest.py
|
| 2 |
from __future__ import annotations
|
| 3 |
+
import io, uuid, re, os
|
| 4 |
from typing import List, Dict, Tuple, Optional
|
| 5 |
|
|
|
|
| 6 |
from irpr.deps import add_to_index
|
| 7 |
|
| 8 |
# =========================
|
| 9 |
+
# PDF → テキスト(多段フォールバック)
|
| 10 |
# =========================
|
| 11 |
+
|
| 12 |
def _extract_with_pypdf(pdf_bytes: bytes) -> str:
|
| 13 |
+
# pypdf → PyPDF2 の順で試す
|
| 14 |
try:
|
|
|
|
| 15 |
from pypdf import PdfReader # type: ignore
|
| 16 |
reader = PdfReader(io.BytesIO(pdf_bytes))
|
| 17 |
+
texts = [(p.extract_text() or "") for p in reader.pages]
|
|
|
|
|
|
|
|
|
|
|
|
|
| 18 |
return "\n".join(texts)
|
| 19 |
except Exception:
|
|
|
|
| 20 |
try:
|
| 21 |
import PyPDF2 # type: ignore
|
| 22 |
reader = PyPDF2.PdfReader(io.BytesIO(pdf_bytes))
|
| 23 |
+
texts = [(p.extract_text() or "") for p in reader.pages]
|
|
|
|
|
|
|
|
|
|
| 24 |
return "\n".join(texts)
|
| 25 |
except Exception:
|
| 26 |
return ""
|
| 27 |
|
| 28 |
def _extract_with_pdfminer(pdf_bytes: bytes) -> str:
|
| 29 |
try:
|
|
|
|
| 30 |
from pdfminer.high_level import extract_text # type: ignore
|
|
|
|
| 31 |
return extract_text(io.BytesIO(pdf_bytes)) or ""
|
| 32 |
except Exception:
|
| 33 |
return ""
|
| 34 |
|
| 35 |
+
def _extract_with_pymupdf_text(pdf_bytes: bytes) -> Tuple[str, int, int]:
|
| 36 |
+
"""
|
| 37 |
+
PyMuPDF で page.get_text("text") を取得。
|
| 38 |
+
返り値: (全文, 文字数合計, ページ数)
|
| 39 |
+
"""
|
| 40 |
+
try:
|
| 41 |
+
import fitz # PyMuPDF
|
| 42 |
+
except Exception:
|
| 43 |
+
return "", 0, 0
|
| 44 |
+
try:
|
| 45 |
+
doc = fitz.open(stream=pdf_bytes, filetype="pdf")
|
| 46 |
+
except Exception:
|
| 47 |
+
return "", 0, 0
|
| 48 |
+
|
| 49 |
+
buf: List[str] = []
|
| 50 |
+
text_chars = 0
|
| 51 |
+
for i in range(len(doc)):
|
| 52 |
+
try:
|
| 53 |
+
page = doc.load_page(i)
|
| 54 |
+
t = page.get_text("text") or ""
|
| 55 |
+
text_chars += len(t.strip())
|
| 56 |
+
buf.append(t)
|
| 57 |
+
except Exception:
|
| 58 |
+
buf.append("")
|
| 59 |
+
doc.close()
|
| 60 |
+
return ("\n".join(buf), text_chars, len(buf))
|
| 61 |
+
|
| 62 |
+
def _ocr_with_tesseract_via_pymupdf(pdf_bytes: bytes, dpi_scale: float = 2.0) -> str:
|
| 63 |
+
"""
|
| 64 |
+
Tesseract OCR で画像ベースPDFからテキスト抽出。
|
| 65 |
+
- pytesseract & Tesseract 本体が必要
|
| 66 |
+
- 依存が無い環境では例外を投げずに空文字返す
|
| 67 |
+
"""
|
| 68 |
+
try:
|
| 69 |
+
import fitz # PyMuPDF
|
| 70 |
+
from PIL import Image
|
| 71 |
+
import pytesseract
|
| 72 |
+
except Exception:
|
| 73 |
+
return "" # OCR不可能(依存未導入)
|
| 74 |
+
|
| 75 |
+
try:
|
| 76 |
+
doc = fitz.open(stream=pdf_bytes, filetype="pdf")
|
| 77 |
+
except Exception:
|
| 78 |
+
return ""
|
| 79 |
+
|
| 80 |
+
lang = os.environ.get("TESSERACT_LANG", "jpn+eng") # 日本語+英語をデフォルト
|
| 81 |
+
text_buf: List[str] = []
|
| 82 |
+
for i in range(len(doc)):
|
| 83 |
+
try:
|
| 84 |
+
page = doc.load_page(i)
|
| 85 |
+
# DPI ~ 72 * scale。2.0 なら 144dpi 相当
|
| 86 |
+
mat = fitz.Matrix(dpi_scale, dpi_scale)
|
| 87 |
+
pix = page.get_pixmap(matrix=mat, alpha=False) # RGB
|
| 88 |
+
img = Image.frombytes("RGB", [pix.width, pix.height], pix.samples)
|
| 89 |
+
t = pytesseract.image_to_string(img, lang=lang) or ""
|
| 90 |
+
text_buf.append(t)
|
| 91 |
+
except Exception:
|
| 92 |
+
text_buf.append("")
|
| 93 |
+
doc.close()
|
| 94 |
+
return "\n".join(text_buf).strip()
|
| 95 |
+
|
| 96 |
+
def _is_meaningful(text: str, min_len: int = 10) -> bool:
|
| 97 |
+
return bool(text and text.strip() and len(text.strip()) >= min_len)
|
| 98 |
+
|
| 99 |
+
def pdf_bytes_to_text(pdf_bytes: bytes) -> Tuple[str, Dict]:
    """Convert PDF bytes to text through a cascade of extractors.

    Order: pypdf/PyPDF2 -> pdfminer.six -> PyMuPDF get_text -> OCR.

    Returns:
        (text, meta) where meta = {
            "method": "pypdf|pdfminer|pymupdf|ocr",
            "scanned_likely": bool,
            "pages": int | None,
        }

    Raises:
        RuntimeError: when every extractor (including OCR) yields nothing.
    """
    # Cheap text-layer extractors first; no page metadata available here.
    for method, extractor in (
        ("pypdf", _extract_with_pypdf),
        ("pdfminer", _extract_with_pdfminer),
    ):
        candidate = extractor(pdf_bytes)
        if _is_meaningful(candidate):
            return candidate, {"method": method, "scanned_likely": False, "pages": None}

    # PyMuPDF also gives us a page count for later use.
    pm_text, _char_total, pages = _extract_with_pymupdf_text(pdf_bytes)
    if _is_meaningful(pm_text):
        return pm_text, {"method": "pymupdf", "scanned_likely": False, "pages": pages}

    # Everything above came back empty: likely a scanned (image) PDF -> OCR.
    ocr_text = _ocr_with_tesseract_via_pymupdf(pdf_bytes)
    if _is_meaningful(ocr_text):
        return ocr_text, {"method": "ocr", "scanned_likely": True, "pages": pages or None}

    # OCR unavailable (deps missing) or image quality too poor.
    raise RuntimeError(
        "Failed to parse PDF with pypdf/PyPDF2/pdfminer.six/PyMuPDF. "
        "This looks like a scanned (image) PDF and OCR was not available or failed. "
        "Install Tesseract + pytesseract for OCR (e.g., apt-get install tesseract-ocr; pip install pytesseract Pillow PyMuPDF)."
    )
|
| 134 |
|
| 135 |
# =========================
|
| 136 |
# テキスト整形・分割
|
|
|
|
| 138 |
_WS_RE = re.compile(r"[ \t\u3000]+")  # runs of ASCII space, tab, or full-width space (U+3000)

def normalize_text(s: str) -> str:
    """Normalize whitespace while preserving line structure.

    Unifies newline styles to "\\n", collapses horizontal-whitespace runs
    into a single space, caps consecutive blank lines at one, and strips
    leading/trailing whitespace.
    """
    unified = s.replace("\r\n", "\n").replace("\r", "\n")
    collapsed = _WS_RE.sub(" ", unified)
    # Three or more newlines -> exactly two (one blank line max).
    capped = re.sub(r"\n{3,}", "\n\n", collapsed)
    return capped.strip()
|
| 145 |
|
|
|
|
| 149 |
overlap: int = 200,
|
| 150 |
min_chunk: int = 200,
|
| 151 |
) -> List[str]:
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 152 |
text = text.strip()
|
| 153 |
if not text:
|
| 154 |
return []
|
|
|
|
| 155 |
chunks: List[str] = []
|
| 156 |
i = 0
|
| 157 |
n = len(text)
|
| 158 |
step = max(1, chunk_size - overlap)
|
|
|
|
| 159 |
while i < n:
|
| 160 |
j = min(n, i + chunk_size)
|
| 161 |
chunk = text[i:j].strip()
|
|
|
|
| 164 |
if j >= n:
|
| 165 |
break
|
| 166 |
i += step
|
|
|
|
|
|
|
| 167 |
if len(chunks) >= 2 and len(chunks[-1]) < min_chunk:
|
| 168 |
chunks[-2] = (chunks[-2] + "\n" + chunks[-1]).strip()
|
| 169 |
chunks.pop()
|
|
|
|
| 170 |
return chunks
|
| 171 |
|
| 172 |
# =========================
|
|
|
|
| 174 |
# =========================
|
| 175 |
def ingest_pdf_bytes(title: str, source_url: str, pdf_bytes: bytes) -> int:
|
| 176 |
"""
|
| 177 |
+
アップロード API から渡された PDF バイト列を解析・登録(ファイルは書かない)
|
| 178 |
+
- テキスト抽出は pypdf→pdfminer→PyMuPDF→(任意)OCR の順にフォールバック
|
| 179 |
+
- OCR は pytesseract + Tesseract がある場合のみ自動使用
|
|
|
|
| 180 |
"""
|
| 181 |
if not pdf_bytes:
|
| 182 |
raise ValueError("empty pdf_bytes")
|
| 183 |
|
| 184 |
+
raw, meta = pdf_bytes_to_text(pdf_bytes)
|
|
|
|
| 185 |
txt = normalize_text(raw)
|
| 186 |
if not _is_meaningful(txt):
|
| 187 |
+
raise RuntimeError("Parsed text is too short or empty after normalization")
|
|
|
|
| 188 |
|
|
|
|
| 189 |
doc_id = str(uuid.uuid4())
|
| 190 |
chunks = chunk_text(txt, chunk_size=1200, overlap=200, min_chunk=200)
|
| 191 |
|
|
|
|
| 195 |
"doc_id": doc_id,
|
| 196 |
"chunk_id": f"{idx:04d}",
|
| 197 |
"title": title,
|
| 198 |
+
"source_url": source_url,
|
| 199 |
"text": ck,
|
| 200 |
})
|
| 201 |
|
|
|
|
| 202 |
added = add_to_index(records)
|
| 203 |
return int(added)
|
| 204 |
|
| 205 |
+
# 互換ダミー
|
| 206 |
def ingest_edinet_for_company(edinet_code: str, date: Optional[str] = None) -> int:
    """Compatibility stub for future EDINET download/ingestion.

    Ingestion is currently handled exclusively through uploaded PDFs,
    so this hook always reports zero added chunks.
    """
    return 0
|
| 208 |
|
|
|
|
| 209 |
def download_edinet_pdf(*args, **kwargs):
    """Legacy shim kept for API compatibility; unconditionally raises."""
    raise NotImplementedError("download_edinet_pdf is not implemented in this minimal build.")
|