Spaces:

Corin1998
/

IR_PR_PilotPro

Sleeping

App Files Files Community

Corin1998 committed on Sep 17, 2025

Commit

359d165

verified ·

1 Parent(s): 0a6f690

Update rag/ingest.py

Browse files

Files changed (1) hide show

rag/ingest.py +144 -32

rag/ingest.py CHANGED Viewed

@@ -1,45 +1,157 @@
 # rag/ingest.py
 from __future__ import annotations
-import io, uuid
-from typing import List
-import fitz  # PyMuPDF
 from irpr.deps import add_to_index
-def _split_text(text: str, chunk_size=800, overlap=150) -> List[str]:
-    text = (text or "").strip()
     if not text:
         return []
-    chunks = []
     i = 0
-    while i < len(text):
-        chunk = text[i:i+chunk_size]
-        chunks.append(chunk)
-        i += chunk_size - overlap
-        if i < 0 or i >= len(text):
             break
     return chunks
 def ingest_pdf_bytes(title: str, source_url: str, pdf_bytes: bytes) -> int:
-    doc = fitz.open(stream=io.BytesIO(pdf_bytes), filetype="pdf")
-    all_chunks = []
     doc_id = str(uuid.uuid4())
-    for page_no in range(doc.page_count):
-        page = doc.load_page(page_no)
-        raw = page.get_text("text")
-        # ページ番号などを付与しておく
-        page_text = f"[p.{page_no+1}] {raw}".strip()
-        for j, ch in enumerate(_split_text(page_text, 900, 150)):
-            all_chunks.append({
-                "text": ch,
-                "title": title,
-                "source_url": source_url,
-                "doc_id": doc_id,
-                "chunk_id": f"{page_no+1}-{j+1}",
-            })
-    doc.close()
-    return add_to_index(all_chunks)
-# ---- EDINET ダミー実装（OpenAI専用版では未サポート）----
-def ingest_edinet_for_company(edinet_code: str, date: str) -> int:
-    # ここでは何もしない（将来実装用の置き場所）
     return 0

 # rag/ingest.py
 from __future__ import annotations
+import io, uuid, re
+from typing import List, Dict, Tuple, Optional
+# ベクタ登録は deps 側に委譲（保存先の作成・権限などもそちらで面倒を見ます）
 from irpr.deps import add_to_index
+# =========================
+# PDF → テキスト（メモリ内で完結）
+# =========================
+def _extract_with_pypdf(pdf_bytes: bytes) -> str:
+    try:
+        # pypdf（新名称）
+        from pypdf import PdfReader  # type: ignore
+        reader = PdfReader(io.BytesIO(pdf_bytes))
+        texts = []
+        for p in reader.pages:
+            # extract_text() が None の場合があるのでガード
+            t = p.extract_text() or ""
+            texts.append(t)
+        return "\n".join(texts)
+    except Exception:
+        # 旧パッケージ名 PyPDF2 にフォールバック
+        try:
+            import PyPDF2  # type: ignore
+            reader = PyPDF2.PdfReader(io.BytesIO(pdf_bytes))
+            texts = []
+            for p in reader.pages:
+                t = p.extract_text() or ""
+                texts.append(t)
+            return "\n".join(texts)
+        except Exception:
+            return ""
+def _extract_with_pdfminer(pdf_bytes: bytes) -> str:
+    try:
+        # pdfminer.six（純Python・精度高め）
+        from pdfminer.high_level import extract_text  # type: ignore
+        # file-like を渡せる
+        return extract_text(io.BytesIO(pdf_bytes)) or ""
+    except Exception:
+        return ""
def pdf_bytes_to_text(pdf_bytes: bytes) -> str:
    """Convert PDF bytes to text, trying each extraction backend in order.

    The pypdf/PyPDF2 extractor is tried first, then pdfminer.six. The first
    result that passes ``_is_meaningful`` is returned.

    Args:
        pdf_bytes: Raw bytes of the PDF document.

    Returns:
        The extracted text from the first backend that yields a meaningful
        result.

    Raises:
        RuntimeError: If no backend produces meaningful text.
    """
    for extractor in (_extract_with_pypdf, _extract_with_pdfminer):
        candidate = extractor(pdf_bytes)
        if _is_meaningful(candidate):
            return candidate
    # Every backend failed or returned too little text.
    raise RuntimeError("Failed to parse PDF with pypdf/PyPDF2/pdfminer.six")
+def _is_meaningful(text: str) -> bool:
+    return bool(text and text.strip() and len(text.strip()) >= 10)
+# =========================
+# テキスト整形・分割
+# =========================
_WS_RE = re.compile(r"[ \t\u3000]+")  # runs of ASCII space, tab, or full-width space
_BLANK_LINES_RE = re.compile(r"\n{3,}")  # three or more consecutive newlines


def normalize_text(s: str) -> str:
    """Normalize line endings and whitespace while keeping line structure.

    Steps, in order:
      1. Convert CRLF and bare CR line endings to LF.
      2. Collapse each run of spaces, tabs, and full-width spaces to a
         single ASCII space.
      3. Drop the space left at the end of a line, so that whitespace-only
         lines become truly empty and are seen by the next step.
      4. Limit runs of consecutive newlines to two.
      5. Strip leading and trailing whitespace from the whole text.

    Args:
        s: Text to normalize.

    Returns:
        The normalized text.
    """
    s = s.replace("\r\n", "\n").replace("\r", "\n")
    s = _WS_RE.sub(" ", s)
    # After step 2 at most one space can precede a newline. Without removing
    # it, a line such as " " would break up a newline run and slip past the
    # blank-line limit below.
    s = s.replace(" \n", "\n")
    s = _BLANK_LINES_RE.sub("\n\n", s)
    return s.strip()
def chunk_text(
    text: str,
    chunk_size: int = 1200,
    overlap: int = 200,
    min_chunk: int = 200,
) -> List[str]:
    """Split *text* into overlapping chunks measured in characters.

    Consecutive windows start ``chunk_size - overlap`` characters apart. The
    stride is clamped to the range ``1..chunk_size`` so the scan always
    advances and never skips text. Each window is stripped, and windows that
    are empty after stripping are dropped. When the final chunk is shorter
    than *min_chunk*, it is folded into the preceding chunk.

    Args:
        text: Text to split; ``None`` or whitespace-only input yields ``[]``.
        chunk_size: Maximum window length in characters; must be positive.
        overlap: Number of characters shared between neighbouring windows.
        min_chunk: Minimum acceptable length of the final chunk.

    Returns:
        The chunks in document order.

    Raises:
        ValueError: If *chunk_size* is not positive.
    """
    text = (text or "").strip()
    if not text:
        return []
    if chunk_size <= 0:
        # A non-positive window would otherwise silently produce no chunks.
        raise ValueError("chunk_size must be a positive integer")

    total = len(text)
    stride = min(chunk_size, max(1, chunk_size - overlap))

    # Keep each chunk's start offset so a short tail can be merged by
    # re-slicing the source text rather than concatenating two windows.
    windows: List[Tuple[int, str]] = []
    start = 0
    while start < total:
        end = min(total, start + chunk_size)
        piece = text[start:end].strip()
        if piece:
            windows.append((start, piece))
        if end >= total:
            break
        start += stride

    if len(windows) >= 2 and len(windows[-1][1]) < min_chunk:
        # The last two windows overlap, so joining their strings would repeat
        # the shared region. Slice once from the earlier start to the end.
        merged_start = windows[-2][0]
        windows[-2:] = [(merged_start, text[merged_start:].strip())]

    return [piece for _, piece in windows]
+# =========================
+# 外部公開 API
+# =========================
 def ingest_pdf_bytes(title: str, source_url: str, pdf_bytes: bytes) -> int:
+    """
+    アップロード API から渡された PDF バイト列をそのまま解析・登録。
+    - 一切ファイルは作らない（= FileNotFound/PermissionDenied を回避）
+    - チャンク化後、irpr.deps.add_to_index へ登録
+    Returns: 追加チャンク数
+    """
+    if not pdf_bytes:
+        raise ValueError("empty pdf_bytes")
+    # PDF → テキスト（メモリ内）
+    raw = pdf_bytes_to_text(pdf_bytes)
+    txt = normalize_text(raw)
+    if not _is_meaningful(txt):
+        # 日本語埋め込みの品質のため最低限の長さチェック
+        raise RuntimeError("Parsed text is too short or empty")
+    # チャンク分割
     doc_id = str(uuid.uuid4())
+    chunks = chunk_text(txt, chunk_size=1200, overlap=200, min_chunk=200)
+    records: List[Dict] = []
+    for idx, ck in enumerate(chunks):
+        records.append({
+            "doc_id": doc_id,
+            "chunk_id": f"{idx:04d}",
+            "title": title,
+            "source_url": source_url,   # /files/uploads/<name> をそのままリンクに
+            "text": ck,
+        })
+    # ベクタ登録（保存先は deps 側が責任もって作成・権限付与）
+    added = add_to_index(records)
+    return int(added)
# Stub kept in case existing endpoints still reference this function.
def ingest_edinet_for_company(edinet_code: str, date: Optional[str] = None) -> int:
    """Placeholder hook for a future EDINET download-and-parse pipeline.

    Ingestion currently goes through uploaded PDFs only, so both arguments
    are ignored and nothing is ingested.

    Args:
        edinet_code: EDINET company code (unused).
        date: Optional date string (unused).

    Returns:
        Always 0.
    """
    ingested_count = 0
    return ingested_count
# Kept for compatibility with earlier code; noted by the author as unused.
def download_edinet_pdf(*args, **kwargs):
    """Placeholder for EDINET PDF download.

    Accepts any arguments and ignores them.

    Raises:
        NotImplementedError: Always.
    """
    message = "download_edinet_pdf is not implemented in this minimal build."
    raise NotImplementedError(message)