Spaces:
Sleeping
Sleeping
Update rag/ingest.py
Browse files- rag/ingest.py +144 -32
rag/ingest.py
CHANGED
|
@@ -1,45 +1,157 @@
|
|
| 1 |
# rag/ingest.py
|
| 2 |
from __future__ import annotations
|
| 3 |
-
import io, uuid
|
| 4 |
-
from typing import List
|
| 5 |
-
|
|
|
|
| 6 |
from irpr.deps import add_to_index
|
| 7 |
|
| 8 |
-
|
| 9 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 10 |
if not text:
|
| 11 |
return []
|
| 12 |
-
|
|
|
|
| 13 |
i = 0
|
| 14 |
-
|
| 15 |
-
|
| 16 |
-
|
| 17 |
-
|
| 18 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
| 19 |
break
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 20 |
return chunks
|
| 21 |
|
|
|
|
|
|
|
|
|
|
| 22 |
def ingest_pdf_bytes(title: str, source_url: str, pdf_bytes: bytes) -> int:
|
| 23 |
-
|
| 24 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 25 |
doc_id = str(uuid.uuid4())
|
| 26 |
-
|
| 27 |
-
|
| 28 |
-
|
| 29 |
-
|
| 30 |
-
|
| 31 |
-
|
| 32 |
-
|
| 33 |
-
|
| 34 |
-
|
| 35 |
-
|
| 36 |
-
|
| 37 |
-
|
| 38 |
-
|
| 39 |
-
|
| 40 |
-
return
|
| 41 |
-
|
| 42 |
-
#
|
| 43 |
-
def ingest_edinet_for_company(edinet_code: str, date: str) -> int:
|
| 44 |
-
|
|
|
|
|
|
|
|
|
|
| 45 |
return 0
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
# rag/ingest.py
|
| 2 |
from __future__ import annotations
|
| 3 |
+
import io, uuid, re
|
| 4 |
+
from typing import List, Dict, Tuple, Optional
|
| 5 |
+
|
| 6 |
+
# ベクタ登録は deps 側に委譲(保存先の作成・権限などもそちらで面倒を見ます)
|
| 7 |
from irpr.deps import add_to_index
|
| 8 |
|
| 9 |
+
# =========================
|
| 10 |
+
# PDF → テキスト(メモリ内で完結)
|
| 11 |
+
# =========================
|
| 12 |
+
def _extract_with_pypdf(pdf_bytes: bytes) -> str:
|
| 13 |
+
try:
|
| 14 |
+
# pypdf(新名称)
|
| 15 |
+
from pypdf import PdfReader # type: ignore
|
| 16 |
+
reader = PdfReader(io.BytesIO(pdf_bytes))
|
| 17 |
+
texts = []
|
| 18 |
+
for p in reader.pages:
|
| 19 |
+
# extract_text() が None の場合があるのでガード
|
| 20 |
+
t = p.extract_text() or ""
|
| 21 |
+
texts.append(t)
|
| 22 |
+
return "\n".join(texts)
|
| 23 |
+
except Exception:
|
| 24 |
+
# 旧パッケージ名 PyPDF2 にフォールバック
|
| 25 |
+
try:
|
| 26 |
+
import PyPDF2 # type: ignore
|
| 27 |
+
reader = PyPDF2.PdfReader(io.BytesIO(pdf_bytes))
|
| 28 |
+
texts = []
|
| 29 |
+
for p in reader.pages:
|
| 30 |
+
t = p.extract_text() or ""
|
| 31 |
+
texts.append(t)
|
| 32 |
+
return "\n".join(texts)
|
| 33 |
+
except Exception:
|
| 34 |
+
return ""
|
| 35 |
+
|
| 36 |
+
def _extract_with_pdfminer(pdf_bytes: bytes) -> str:
|
| 37 |
+
try:
|
| 38 |
+
# pdfminer.six(純Python・精度高め)
|
| 39 |
+
from pdfminer.high_level import extract_text # type: ignore
|
| 40 |
+
# file-like を渡せる
|
| 41 |
+
return extract_text(io.BytesIO(pdf_bytes)) or ""
|
| 42 |
+
except Exception:
|
| 43 |
+
return ""
|
| 44 |
+
|
| 45 |
+
def pdf_bytes_to_text(pdf_bytes: bytes) -> str:
    """Convert in-memory PDF bytes to text, trying each backend in order.

    The pypdf/PyPDF2 extractor runs first; the pdfminer.six extractor is only
    invoked when the first result is not accepted by ``_is_meaningful``.

    Args:
        pdf_bytes: Raw contents of a PDF document.

    Returns:
        The first extraction result that ``_is_meaningful`` accepts.

    Raises:
        RuntimeError: If no backend produced an acceptable result.
    """
    for extractor in (_extract_with_pypdf, _extract_with_pdfminer):
        candidate = extractor(pdf_bytes)
        if _is_meaningful(candidate):
            return candidate
    raise RuntimeError("Failed to parse PDF with pypdf/PyPDF2/pdfminer.six")
|
| 56 |
+
|
| 57 |
+
def _is_meaningful(text: str) -> bool:
|
| 58 |
+
return bool(text and text.strip() and len(text.strip()) >= 10)
|
| 59 |
+
|
| 60 |
+
# =========================
|
| 61 |
+
# テキスト整形・分割
|
| 62 |
+
# =========================
|
| 63 |
+
_WS_RE = re.compile(r"[ \t\u3000]+")  # runs of ASCII space, tab, or full-width space
_NEWLINE_RUN_RE = re.compile(r"\n{3,}")  # three or more consecutive newlines


def normalize_text(s: str) -> str:
    """Normalize whitespace in extracted text while keeping line structure.

    Steps, in order:
      1. Convert ``\\r\\n`` and lone ``\\r`` line endings to ``\\n``.
      2. Collapse each run of spaces, tabs and full-width spaces to one space.
      3. Drop the space left at the end of a line, so that lines containing
         only whitespace count as blank in the next step.
      4. Collapse three or more consecutive newlines to exactly two.
      5. Strip leading and trailing whitespace from the whole string.

    Args:
        s: Text to normalize.

    Returns:
        The normalized text.
    """
    s = s.replace("\r\n", "\n").replace("\r", "\n")
    s = _WS_RE.sub(" ", s)
    # After step 2 every horizontal-whitespace run is a single space, so a
    # plain replace is enough to remove trailing whitespace on each line.
    s = s.replace(" \n", "\n")
    s = _NEWLINE_RUN_RE.sub("\n\n", s)
    return s.strip()
|
| 72 |
+
|
| 73 |
+
def chunk_text(
    text: str,
    chunk_size: int = 1200,
    overlap: int = 200,
    min_chunk: int = 200,
) -> List[str]:
    """Split text into overlapping chunks by character count.

    Windows of ``chunk_size`` characters are taken every
    ``max(1, chunk_size - overlap)`` characters, so consecutive chunks share
    up to ``overlap`` characters of context. Each chunk is stripped, and
    windows that are empty after stripping are skipped. If the final chunk is
    shorter than ``min_chunk``, it is absorbed into the previous chunk.

    Args:
        text: Text to split; surrounding whitespace is stripped first.
        chunk_size: Maximum number of characters per window.
        overlap: Number of characters shared between consecutive windows.
        min_chunk: Minimum length allowed for the final chunk.

    Returns:
        The chunks in document order; ``[]`` for empty or whitespace-only text.

    Raises:
        ValueError: If ``chunk_size`` is less than 1 or ``overlap`` is negative.
    """
    # A non-positive chunk_size would silently yield no chunks, and a negative
    # overlap would silently skip text between windows; reject both.
    if chunk_size < 1:
        raise ValueError("chunk_size must be >= 1")
    if overlap < 0:
        raise ValueError("overlap must be >= 0")

    text = text.strip()
    if not text:
        return []

    total = len(text)
    step = max(1, chunk_size - overlap)

    chunks: List[str] = []
    starts: List[int] = []  # start offset in `text` of each kept chunk
    for start in range(0, total, step):
        end = min(total, start + chunk_size)
        piece = text[start:end].strip()
        if piece:
            chunks.append(piece)
            starts.append(start)
        if end >= total:
            break

    # Absorb a too-short tail into the previous chunk. The tail begins with
    # characters the previous chunk already ends with, so re-slice from the
    # previous start to the end of the text instead of concatenating the two
    # strings, which would duplicate the shared region.
    if len(chunks) >= 2 and len(chunks[-1]) < min_chunk:
        chunks.pop()
        starts.pop()
        chunks[-1] = text[starts[-1]:].strip()

    return chunks
|
| 108 |
|
| 109 |
+
# =========================
|
| 110 |
+
# 外部公開 API
|
| 111 |
+
# =========================
|
| 112 |
def ingest_pdf_bytes(title: str, source_url: str, pdf_bytes: bytes) -> int:
    """Parse uploaded PDF bytes, chunk the text and register it in the index.

    The bytes are converted to text with ``pdf_bytes_to_text``, cleaned with
    ``normalize_text``, split with ``chunk_text`` and handed to
    ``irpr.deps.add_to_index`` as one record per chunk. This function itself
    does not write any file.

    Args:
        title: Title stored on every record.
        source_url: Link stored on every record.
        pdf_bytes: Raw contents of the PDF document.

    Returns:
        The value returned by ``add_to_index``, converted to ``int``
        (documented by the original author as the number of chunks added).

    Raises:
        ValueError: If ``pdf_bytes`` is empty.
        RuntimeError: If the PDF cannot be parsed, or the normalized text is
            rejected by ``_is_meaningful``.
    """
    if not pdf_bytes:
        raise ValueError("empty pdf_bytes")

    # PDF -> text, entirely in memory.
    cleaned = normalize_text(pdf_bytes_to_text(pdf_bytes))
    if not _is_meaningful(cleaned):
        # Minimum-length guard, kept for embedding quality.
        raise RuntimeError("Parsed text is too short or empty")

    doc_id = str(uuid.uuid4())
    pieces = chunk_text(cleaned, chunk_size=1200, overlap=200, min_chunk=200)

    records: List[Dict] = [
        {
            "doc_id": doc_id,
            "chunk_id": f"{position:04d}",
            "title": title,
            "source_url": source_url,  # passed through unchanged for use as a link
            "text": piece,
        }
        for position, piece in enumerate(pieces)
    ]

    # Storage location and permissions are left to the deps module.
    return int(add_to_index(records))
|
| 146 |
+
|
| 147 |
+
# 既存のエンドポイントから参照されている可能性に配慮してダミー実装を残す
|
| 148 |
+
def ingest_edinet_for_company(edinet_code: str, date: Optional[str] = None) -> int:
    """Placeholder hook for a future EDINET download-and-parse pipeline.

    Kept so that existing callers keep working. Both arguments are currently
    ignored because ingestion is handled solely through uploaded PDFs.

    Args:
        edinet_code: Unused.
        date: Unused.

    Returns:
        Always ``0``.
    """
    return 0
|
| 154 |
+
|
| 155 |
+
# 参考: 以前のコード互換のため残しておく(未使用)
|
| 156 |
+
def download_edinet_pdf(*args, **kwargs):
    """Unimplemented stub kept for compatibility with earlier code.

    Accepts any arguments and ignores them.

    Raises:
        NotImplementedError: Always.
    """
    message = "download_edinet_pdf is not implemented in this minimal build."
    raise NotImplementedError(message)
|