Spaces:
Sleeping
Sleeping
Update rag/ingest.py
Browse files- rag/ingest.py +105 -52
rag/ingest.py
CHANGED
|
@@ -1,61 +1,136 @@
|
|
| 1 |
# rag/ingest.py
|
| 2 |
from __future__ import annotations
|
| 3 |
-
import io, uuid, re
|
| 4 |
from typing import List, Dict, Tuple, Optional
|
| 5 |
|
| 6 |
-
# ベクタ登録は deps 側に委譲(保存先の作成・権限などもそちらで面倒を見ます)
|
| 7 |
from irpr.deps import add_to_index
|
| 8 |
|
| 9 |
# =========================
|
| 10 |
-
# PDF → テキスト(
|
| 11 |
# =========================
|
|
|
|
| 12 |
def _extract_with_pypdf(pdf_bytes: bytes) -> str:
|
|
|
|
| 13 |
try:
|
| 14 |
-
# pypdf(新名称)
|
| 15 |
from pypdf import PdfReader # type: ignore
|
| 16 |
reader = PdfReader(io.BytesIO(pdf_bytes))
|
| 17 |
-
texts = []
|
| 18 |
-
for p in reader.pages:
|
| 19 |
-
# extract_text() が None の場合があるのでガード
|
| 20 |
-
t = p.extract_text() or ""
|
| 21 |
-
texts.append(t)
|
| 22 |
return "\n".join(texts)
|
| 23 |
except Exception:
|
| 24 |
-
# 旧パッケージ名 PyPDF2 にフォールバック
|
| 25 |
try:
|
| 26 |
import PyPDF2 # type: ignore
|
| 27 |
reader = PyPDF2.PdfReader(io.BytesIO(pdf_bytes))
|
| 28 |
-
texts = []
|
| 29 |
-
for p in reader.pages:
|
| 30 |
-
t = p.extract_text() or ""
|
| 31 |
-
texts.append(t)
|
| 32 |
return "\n".join(texts)
|
| 33 |
except Exception:
|
| 34 |
return ""
|
| 35 |
|
| 36 |
def _extract_with_pdfminer(pdf_bytes: bytes) -> str:
|
| 37 |
try:
|
| 38 |
-
# pdfminer.six(純Python・精度高め)
|
| 39 |
from pdfminer.high_level import extract_text # type: ignore
|
| 40 |
-
# file-like を渡せる
|
| 41 |
return extract_text(io.BytesIO(pdf_bytes)) or ""
|
| 42 |
except Exception:
|
| 43 |
return ""
|
| 44 |
|
| 45 |
-
def
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 46 |
# 1) pypdf / PyPDF2
|
| 47 |
text = _extract_with_pypdf(pdf_bytes)
|
| 48 |
if _is_meaningful(text):
|
| 49 |
-
return text
|
|
|
|
| 50 |
# 2) pdfminer.six
|
| 51 |
text = _extract_with_pdfminer(pdf_bytes)
|
| 52 |
if _is_meaningful(text):
|
| 53 |
-
return text
|
| 54 |
-
|
| 55 |
-
|
|
|
|
|
|
|
|
|
|
| 56 |
|
| 57 |
-
|
| 58 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 59 |
|
| 60 |
# =========================
|
| 61 |
# テキスト整形・分割
|
|
@@ -63,10 +138,8 @@ def _is_meaningful(text: str) -> bool:
|
|
| 63 |
_WS_RE = re.compile(r"[ \t\u3000]+") # 半角/全角スペース畳み込み
|
| 64 |
|
| 65 |
def normalize_text(s: str) -> str:
|
| 66 |
-
# 改行は温存しつつ、連続スペースを1つに
|
| 67 |
s = s.replace("\r\n", "\n").replace("\r", "\n")
|
| 68 |
s = _WS_RE.sub(" ", s)
|
| 69 |
-
# 連続改行は最大2に
|
| 70 |
s = re.sub(r"\n{3,}", "\n\n", s)
|
| 71 |
return s.strip()
|
| 72 |
|
|
@@ -76,20 +149,13 @@ def chunk_text(
|
|
| 76 |
overlap: int = 200,
|
| 77 |
min_chunk: int = 200,
|
| 78 |
) -> List[str]:
|
| 79 |
-
"""
|
| 80 |
-
文字数ベースのシンプル分割。
|
| 81 |
-
- overlap で前後文脈を少し残す
|
| 82 |
-
- 最終チャンクが短すぎる場合は前チャンクに吸収
|
| 83 |
-
"""
|
| 84 |
text = text.strip()
|
| 85 |
if not text:
|
| 86 |
return []
|
| 87 |
-
|
| 88 |
chunks: List[str] = []
|
| 89 |
i = 0
|
| 90 |
n = len(text)
|
| 91 |
step = max(1, chunk_size - overlap)
|
| 92 |
-
|
| 93 |
while i < n:
|
| 94 |
j = min(n, i + chunk_size)
|
| 95 |
chunk = text[i:j].strip()
|
|
@@ -98,12 +164,9 @@ def chunk_text(
|
|
| 98 |
if j >= n:
|
| 99 |
break
|
| 100 |
i += step
|
| 101 |
-
|
| 102 |
-
# 末尾が短すぎる場合はマージ
|
| 103 |
if len(chunks) >= 2 and len(chunks[-1]) < min_chunk:
|
| 104 |
chunks[-2] = (chunks[-2] + "\n" + chunks[-1]).strip()
|
| 105 |
chunks.pop()
|
| 106 |
-
|
| 107 |
return chunks
|
| 108 |
|
| 109 |
# =========================
|
|
@@ -111,22 +174,18 @@ def chunk_text(
|
|
| 111 |
# =========================
|
| 112 |
def ingest_pdf_bytes(title: str, source_url: str, pdf_bytes: bytes) -> int:
|
| 113 |
"""
|
| 114 |
-
アップロード API から渡された PDF バイト列を
|
| 115 |
-
-
|
| 116 |
-
-
|
| 117 |
-
Returns: 追加チャンク数
|
| 118 |
"""
|
| 119 |
if not pdf_bytes:
|
| 120 |
raise ValueError("empty pdf_bytes")
|
| 121 |
|
| 122 |
-
|
| 123 |
-
raw = pdf_bytes_to_text(pdf_bytes)
|
| 124 |
txt = normalize_text(raw)
|
| 125 |
if not _is_meaningful(txt):
|
| 126 |
-
|
| 127 |
-
raise RuntimeError("Parsed text is too short or empty")
|
| 128 |
|
| 129 |
-
# チャンク分割
|
| 130 |
doc_id = str(uuid.uuid4())
|
| 131 |
chunks = chunk_text(txt, chunk_size=1200, overlap=200, min_chunk=200)
|
| 132 |
|
|
@@ -136,22 +195,16 @@ def ingest_pdf_bytes(title: str, source_url: str, pdf_bytes: bytes) -> int:
|
|
| 136 |
"doc_id": doc_id,
|
| 137 |
"chunk_id": f"{idx:04d}",
|
| 138 |
"title": title,
|
| 139 |
-
"source_url": source_url,
|
| 140 |
"text": ck,
|
| 141 |
})
|
| 142 |
|
| 143 |
-
# ベクタ登録(保存先は deps 側が責任もって作成・権限付与)
|
| 144 |
added = add_to_index(records)
|
| 145 |
return int(added)
|
| 146 |
|
| 147 |
-
#
|
| 148 |
def ingest_edinet_for_company(edinet_code: str, date: Optional[str] = None) -> int:
|
| 149 |
-
"""
|
| 150 |
-
将来的に EDINET ダウンロード → 解析を実装する場合のフック。
|
| 151 |
-
現状はアップロード PDF の処理に一本化しているため 0 を返す。
|
| 152 |
-
"""
|
| 153 |
return 0
|
| 154 |
|
| 155 |
-
# 参考: 以前のコード互換のため残しておく(未使用)
|
| 156 |
def download_edinet_pdf(*args, **kwargs):
|
| 157 |
raise NotImplementedError("download_edinet_pdf is not implemented in this minimal build.")
|
|
|
|
| 1 |
# rag/ingest.py
|
| 2 |
from __future__ import annotations
|
| 3 |
+
import io, uuid, re, os
|
| 4 |
from typing import List, Dict, Tuple, Optional
|
| 5 |
|
|
|
|
| 6 |
from irpr.deps import add_to_index
|
| 7 |
|
| 8 |
# =========================
|
| 9 |
+
# PDF → テキスト(多段フォールバック)
|
| 10 |
# =========================
|
| 11 |
+
|
| 12 |
def _extract_with_pypdf(pdf_bytes: bytes) -> str:
|
| 13 |
+
# pypdf → PyPDF2 の順で試す
|
| 14 |
try:
|
|
|
|
| 15 |
from pypdf import PdfReader # type: ignore
|
| 16 |
reader = PdfReader(io.BytesIO(pdf_bytes))
|
| 17 |
+
texts = [(p.extract_text() or "") for p in reader.pages]
|
|
|
|
|
|
|
|
|
|
|
|
|
| 18 |
return "\n".join(texts)
|
| 19 |
except Exception:
|
|
|
|
| 20 |
try:
|
| 21 |
import PyPDF2 # type: ignore
|
| 22 |
reader = PyPDF2.PdfReader(io.BytesIO(pdf_bytes))
|
| 23 |
+
texts = [(p.extract_text() or "") for p in reader.pages]
|
|
|
|
|
|
|
|
|
|
| 24 |
return "\n".join(texts)
|
| 25 |
except Exception:
|
| 26 |
return ""
|
| 27 |
|
| 28 |
def _extract_with_pdfminer(pdf_bytes: bytes) -> str:
|
| 29 |
try:
|
|
|
|
| 30 |
from pdfminer.high_level import extract_text # type: ignore
|
|
|
|
| 31 |
return extract_text(io.BytesIO(pdf_bytes)) or ""
|
| 32 |
except Exception:
|
| 33 |
return ""
|
| 34 |
|
| 35 |
+
def _extract_with_pymupdf_text(pdf_bytes: bytes) -> Tuple[str, int, int]:
|
| 36 |
+
"""
|
| 37 |
+
PyMuPDF で page.get_text("text") を取得。
|
| 38 |
+
返り値: (全文, 文字数合計, ページ数)
|
| 39 |
+
"""
|
| 40 |
+
try:
|
| 41 |
+
import fitz # PyMuPDF
|
| 42 |
+
except Exception:
|
| 43 |
+
return "", 0, 0
|
| 44 |
+
try:
|
| 45 |
+
doc = fitz.open(stream=pdf_bytes, filetype="pdf")
|
| 46 |
+
except Exception:
|
| 47 |
+
return "", 0, 0
|
| 48 |
+
|
| 49 |
+
buf: List[str] = []
|
| 50 |
+
text_chars = 0
|
| 51 |
+
for i in range(len(doc)):
|
| 52 |
+
try:
|
| 53 |
+
page = doc.load_page(i)
|
| 54 |
+
t = page.get_text("text") or ""
|
| 55 |
+
text_chars += len(t.strip())
|
| 56 |
+
buf.append(t)
|
| 57 |
+
except Exception:
|
| 58 |
+
buf.append("")
|
| 59 |
+
doc.close()
|
| 60 |
+
return ("\n".join(buf), text_chars, len(buf))
|
| 61 |
+
|
| 62 |
+
def _ocr_with_tesseract_via_pymupdf(pdf_bytes: bytes, dpi_scale: float = 2.0) -> str:
|
| 63 |
+
"""
|
| 64 |
+
Tesseract OCR で画像ベースPDFからテキスト抽出。
|
| 65 |
+
- pytesseract & Tesseract 本体が必要
|
| 66 |
+
- 依存が無い環境では例外を投げずに空文字返す
|
| 67 |
+
"""
|
| 68 |
+
try:
|
| 69 |
+
import fitz # PyMuPDF
|
| 70 |
+
from PIL import Image
|
| 71 |
+
import pytesseract
|
| 72 |
+
except Exception:
|
| 73 |
+
return "" # OCR不可能(依存未導入)
|
| 74 |
+
|
| 75 |
+
try:
|
| 76 |
+
doc = fitz.open(stream=pdf_bytes, filetype="pdf")
|
| 77 |
+
except Exception:
|
| 78 |
+
return ""
|
| 79 |
+
|
| 80 |
+
lang = os.environ.get("TESSERACT_LANG", "jpn+eng") # 日本語+英語をデフォルト
|
| 81 |
+
text_buf: List[str] = []
|
| 82 |
+
for i in range(len(doc)):
|
| 83 |
+
try:
|
| 84 |
+
page = doc.load_page(i)
|
| 85 |
+
# DPI ~ 72 * scale。2.0 なら 144dpi 相当
|
| 86 |
+
mat = fitz.Matrix(dpi_scale, dpi_scale)
|
| 87 |
+
pix = page.get_pixmap(matrix=mat, alpha=False) # RGB
|
| 88 |
+
img = Image.frombytes("RGB", [pix.width, pix.height], pix.samples)
|
| 89 |
+
t = pytesseract.image_to_string(img, lang=lang) or ""
|
| 90 |
+
text_buf.append(t)
|
| 91 |
+
except Exception:
|
| 92 |
+
text_buf.append("")
|
| 93 |
+
doc.close()
|
| 94 |
+
return "\n".join(text_buf).strip()
|
| 95 |
+
|
| 96 |
+
def _is_meaningful(text: str, min_len: int = 10) -> bool:
|
| 97 |
+
return bool(text and text.strip() and len(text.strip()) >= min_len)
|
| 98 |
+
|
| 99 |
+
def pdf_bytes_to_text(pdf_bytes: bytes) -> Tuple[str, Dict]:
    """Convert PDF bytes to text through a cascade of extractors.

    Order: pypdf/PyPDF2 -> pdfminer.six -> PyMuPDF get_text -> OCR.

    Returns:
        (text, meta) where meta = {
            "method": "pypdf|pdfminer|pymupdf|ocr",
            "scanned_likely": bool,
            "pages": int | None,
        }

    Raises:
        RuntimeError: when every extractor (including OCR) yields nothing.
    """
    # Cheap text-layer extractors first; no page metadata available here.
    for method, extractor in (
        ("pypdf", _extract_with_pypdf),
        ("pdfminer", _extract_with_pdfminer),
    ):
        candidate = extractor(pdf_bytes)
        if _is_meaningful(candidate):
            return candidate, {"method": method, "scanned_likely": False, "pages": None}

    # PyMuPDF also gives us a page count for later use.
    pm_text, _char_total, pages = _extract_with_pymupdf_text(pdf_bytes)
    if _is_meaningful(pm_text):
        return pm_text, {"method": "pymupdf", "scanned_likely": False, "pages": pages}

    # Everything above came back empty: likely a scanned (image) PDF -> OCR.
    ocr_text = _ocr_with_tesseract_via_pymupdf(pdf_bytes)
    if _is_meaningful(ocr_text):
        return ocr_text, {"method": "ocr", "scanned_likely": True, "pages": pages or None}

    # OCR unavailable (deps missing) or image quality too poor.
    raise RuntimeError(
        "Failed to parse PDF with pypdf/PyPDF2/pdfminer.six/PyMuPDF. "
        "This looks like a scanned (image) PDF and OCR was not available or failed. "
        "Install Tesseract + pytesseract for OCR (e.g., apt-get install tesseract-ocr; pip install pytesseract Pillow PyMuPDF)."
    )
|
| 134 |
|
| 135 |
# =========================
|
| 136 |
# テキスト整形・分割
|
|
|
|
| 138 |
_WS_RE = re.compile(r"[ \t\u3000]+")  # runs of ASCII space, tab, or full-width space (U+3000)

def normalize_text(s: str) -> str:
    """Normalize whitespace while preserving line structure.

    Unifies newline styles to "\\n", collapses horizontal-whitespace runs
    into a single space, caps consecutive blank lines at one, and strips
    leading/trailing whitespace.
    """
    unified = s.replace("\r\n", "\n").replace("\r", "\n")
    collapsed = _WS_RE.sub(" ", unified)
    # Three or more newlines -> exactly two (one blank line max).
    capped = re.sub(r"\n{3,}", "\n\n", collapsed)
    return capped.strip()
|
| 145 |
|
|
|
|
| 149 |
overlap: int = 200,
|
| 150 |
min_chunk: int = 200,
|
| 151 |
) -> List[str]:
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 152 |
text = text.strip()
|
| 153 |
if not text:
|
| 154 |
return []
|
|
|
|
| 155 |
chunks: List[str] = []
|
| 156 |
i = 0
|
| 157 |
n = len(text)
|
| 158 |
step = max(1, chunk_size - overlap)
|
|
|
|
| 159 |
while i < n:
|
| 160 |
j = min(n, i + chunk_size)
|
| 161 |
chunk = text[i:j].strip()
|
|
|
|
| 164 |
if j >= n:
|
| 165 |
break
|
| 166 |
i += step
|
|
|
|
|
|
|
| 167 |
if len(chunks) >= 2 and len(chunks[-1]) < min_chunk:
|
| 168 |
chunks[-2] = (chunks[-2] + "\n" + chunks[-1]).strip()
|
| 169 |
chunks.pop()
|
|
|
|
| 170 |
return chunks
|
| 171 |
|
| 172 |
# =========================
|
|
|
|
| 174 |
# =========================
|
| 175 |
def ingest_pdf_bytes(title: str, source_url: str, pdf_bytes: bytes) -> int:
|
| 176 |
"""
|
| 177 |
+
アップロード API から渡された PDF バイト列を解析・登録(ファイルは書かない)
|
| 178 |
+
- テキスト抽出は pypdf→pdfminer→PyMuPDF→(任意)OCR の順にフォールバック
|
| 179 |
+
- OCR は pytesseract + Tesseract がある場合のみ自動使用
|
|
|
|
| 180 |
"""
|
| 181 |
if not pdf_bytes:
|
| 182 |
raise ValueError("empty pdf_bytes")
|
| 183 |
|
| 184 |
+
raw, meta = pdf_bytes_to_text(pdf_bytes)
|
|
|
|
| 185 |
txt = normalize_text(raw)
|
| 186 |
if not _is_meaningful(txt):
|
| 187 |
+
raise RuntimeError("Parsed text is too short or empty after normalization")
|
|
|
|
| 188 |
|
|
|
|
| 189 |
doc_id = str(uuid.uuid4())
|
| 190 |
chunks = chunk_text(txt, chunk_size=1200, overlap=200, min_chunk=200)
|
| 191 |
|
|
|
|
| 195 |
"doc_id": doc_id,
|
| 196 |
"chunk_id": f"{idx:04d}",
|
| 197 |
"title": title,
|
| 198 |
+
"source_url": source_url,
|
| 199 |
"text": ck,
|
| 200 |
})
|
| 201 |
|
|
|
|
| 202 |
added = add_to_index(records)
|
| 203 |
return int(added)
|
| 204 |
|
| 205 |
+
# 互換ダミー
|
| 206 |
def ingest_edinet_for_company(edinet_code: str, date: Optional[str] = None) -> int:
    """Compatibility stub for future EDINET download/ingestion.

    Ingestion is currently handled exclusively through uploaded PDFs,
    so this hook always reports zero added chunks.
    """
    return 0
|
| 208 |
|
|
|
|
| 209 |
def download_edinet_pdf(*args, **kwargs):
    """Legacy shim kept for API compatibility; unconditionally raises."""
    raise NotImplementedError("download_edinet_pdf is not implemented in this minimal build.")
|