Spaces:
Sleeping
Sleeping
Update rag/ingest.py
Browse files- rag/ingest.py +35 -38
rag/ingest.py
CHANGED
|
@@ -1,6 +1,6 @@
|
|
| 1 |
# rag/ingest.py
|
| 2 |
from __future__ import annotations
|
| 3 |
-
import io, uuid, re, os
|
| 4 |
from typing import List, Dict, Tuple, Optional
|
| 5 |
|
| 6 |
from irpr.deps import add_to_index
|
|
@@ -10,7 +10,6 @@ from irpr.deps import add_to_index
|
|
| 10 |
# =========================
|
| 11 |
|
| 12 |
def _extract_with_pypdf(pdf_bytes: bytes) -> str:
|
| 13 |
-
# pypdf → PyPDF2 の順で試す
|
| 14 |
try:
|
| 15 |
from pypdf import PdfReader # type: ignore
|
| 16 |
reader = PdfReader(io.BytesIO(pdf_bytes))
|
|
@@ -33,10 +32,6 @@ def _extract_with_pdfminer(pdf_bytes: bytes) -> str:
|
|
| 33 |
return ""
|
| 34 |
|
| 35 |
def _extract_with_pymupdf_text(pdf_bytes: bytes) -> Tuple[str, int, int]:
|
| 36 |
-
"""
|
| 37 |
-
PyMuPDF で page.get_text("text") を取得。
|
| 38 |
-
返り値: (全文, 文字数合計, ページ数)
|
| 39 |
-
"""
|
| 40 |
try:
|
| 41 |
import fitz # PyMuPDF
|
| 42 |
except Exception:
|
|
@@ -56,37 +51,41 @@ def _extract_with_pymupdf_text(pdf_bytes: bytes) -> Tuple[str, int, int]:
|
|
| 56 |
buf.append(t)
|
| 57 |
except Exception:
|
| 58 |
buf.append("")
|
|
|
|
| 59 |
doc.close()
|
| 60 |
-
return ("\n".join(buf), text_chars,
|
| 61 |
|
| 62 |
def _ocr_with_tesseract_via_pymupdf(pdf_bytes: bytes, dpi_scale: float = 2.0) -> str:
|
| 63 |
"""
|
| 64 |
-
Tesseract OCR で
|
| 65 |
-
|
| 66 |
-
|
| 67 |
"""
|
| 68 |
try:
|
| 69 |
import fitz # PyMuPDF
|
| 70 |
from PIL import Image
|
| 71 |
import pytesseract
|
| 72 |
except Exception:
|
| 73 |
-
return "" # OCR不可
|
| 74 |
|
| 75 |
try:
|
| 76 |
doc = fitz.open(stream=pdf_bytes, filetype="pdf")
|
| 77 |
except Exception:
|
| 78 |
return ""
|
| 79 |
|
| 80 |
-
lang = os.environ.get("TESSERACT_LANG", "jpn+eng") # 日本語+英語
|
| 81 |
text_buf: List[str] = []
|
| 82 |
for i in range(len(doc)):
|
| 83 |
try:
|
| 84 |
page = doc.load_page(i)
|
| 85 |
-
# DPI ~ 72 * scale。2.0 なら 144dpi 相当
|
| 86 |
mat = fitz.Matrix(dpi_scale, dpi_scale)
|
| 87 |
-
pix = page.get_pixmap(matrix=mat, alpha=False)
|
| 88 |
img = Image.frombytes("RGB", [pix.width, pix.height], pix.samples)
|
| 89 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
| 90 |
text_buf.append(t)
|
| 91 |
except Exception:
|
| 92 |
text_buf.append("")
|
|
@@ -99,11 +98,8 @@ def _is_meaningful(text: str, min_len: int = 10) -> bool:
|
|
| 99 |
def pdf_bytes_to_text(pdf_bytes: bytes) -> Tuple[str, Dict]:
|
| 100 |
"""
|
| 101 |
PDF → テキスト。抽出メタを返す:
|
| 102 |
-
meta = {
|
| 103 |
-
|
| 104 |
-
"scanned_likely": bool,
|
| 105 |
-
"pages": int
|
| 106 |
-
}
|
| 107 |
"""
|
| 108 |
# 1) pypdf / PyPDF2
|
| 109 |
text = _extract_with_pypdf(pdf_bytes)
|
|
@@ -116,20 +112,23 @@ def pdf_bytes_to_text(pdf_bytes: bytes) -> Tuple[str, Dict]:
|
|
| 116 |
return text, {"method": "pdfminer", "scanned_likely": False, "pages": None}
|
| 117 |
|
| 118 |
# 3) PyMuPDF get_text
|
| 119 |
-
pm_text,
|
| 120 |
if _is_meaningful(pm_text):
|
| 121 |
return pm_text, {"method": "pymupdf", "scanned_likely": False, "pages": pages}
|
| 122 |
|
| 123 |
-
# 4)
|
| 124 |
-
|
|
|
|
|
|
|
|
|
|
| 125 |
if _is_meaningful(ocr_text):
|
| 126 |
return ocr_text, {"method": "ocr", "scanned_likely": True, "pages": pages or None}
|
| 127 |
|
| 128 |
-
#
|
| 129 |
raise RuntimeError(
|
| 130 |
-
"
|
| 131 |
-
"
|
| 132 |
-
"
|
| 133 |
)
|
| 134 |
|
| 135 |
# =========================
|
|
@@ -143,12 +142,7 @@ def normalize_text(s: str) -> str:
|
|
| 143 |
s = re.sub(r"\n{3,}", "\n\n", s)
|
| 144 |
return s.strip()
|
| 145 |
|
| 146 |
-
def chunk_text(
|
| 147 |
-
text: str,
|
| 148 |
-
chunk_size: int = 1200,
|
| 149 |
-
overlap: int = 200,
|
| 150 |
-
min_chunk: int = 200,
|
| 151 |
-
) -> List[str]:
|
| 152 |
text = text.strip()
|
| 153 |
if not text:
|
| 154 |
return []
|
|
@@ -174,14 +168,17 @@ def chunk_text(
|
|
| 174 |
# =========================
|
| 175 |
def ingest_pdf_bytes(title: str, source_url: str, pdf_bytes: bytes) -> int:
|
| 176 |
"""
|
| 177 |
-
|
| 178 |
-
|
| 179 |
-
- OCR は pytesseract + Tesseract がある場合のみ自動使用
|
| 180 |
"""
|
| 181 |
if not pdf_bytes:
|
| 182 |
-
raise
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 183 |
|
| 184 |
-
raw, meta = pdf_bytes_to_text(pdf_bytes)
|
| 185 |
txt = normalize_text(raw)
|
| 186 |
if not _is_meaningful(txt):
|
| 187 |
raise RuntimeError("Parsed text is too short or empty after normalization")
|
|
|
|
| 1 |
# rag/ingest.py
|
| 2 |
from __future__ import annotations
|
| 3 |
+
import io, uuid, re, os, traceback
|
| 4 |
from typing import List, Dict, Tuple, Optional
|
| 5 |
|
| 6 |
from irpr.deps import add_to_index
|
|
|
|
| 10 |
# =========================
|
| 11 |
|
| 12 |
def _extract_with_pypdf(pdf_bytes: bytes) -> str:
|
|
|
|
| 13 |
try:
|
| 14 |
from pypdf import PdfReader # type: ignore
|
| 15 |
reader = PdfReader(io.BytesIO(pdf_bytes))
|
|
|
|
| 32 |
return ""
|
| 33 |
|
| 34 |
def _extract_with_pymupdf_text(pdf_bytes: bytes) -> Tuple[str, int, int]:
|
|
|
|
|
|
|
|
|
|
|
|
|
| 35 |
try:
|
| 36 |
import fitz # PyMuPDF
|
| 37 |
except Exception:
|
|
|
|
| 51 |
buf.append(t)
|
| 52 |
except Exception:
|
| 53 |
buf.append("")
|
| 54 |
+
pages = len(buf)
|
| 55 |
doc.close()
|
| 56 |
+
return ("\n".join(buf), text_chars, pages)
|
| 57 |
|
| 58 |
def _ocr_with_tesseract_via_pymupdf(pdf_bytes: bytes, dpi_scale: float = 2.0) -> str:
|
| 59 |
"""
|
| 60 |
+
Tesseract OCR(任意)。pytesseract / Tesseract 本体が無い場合は空文字で返す。
|
| 61 |
+
Tesseract の未導入や言語データ欠如(jpn.traineddata 無し)による FileNotFoundError も
|
| 62 |
+
ここで握りつぶして空文字を返します(上位で「OCRが必要」として案内)。
|
| 63 |
"""
|
| 64 |
try:
|
| 65 |
import fitz # PyMuPDF
|
| 66 |
from PIL import Image
|
| 67 |
import pytesseract
|
| 68 |
except Exception:
|
| 69 |
+
return "" # OCR不可(依存未導入)
|
| 70 |
|
| 71 |
try:
|
| 72 |
doc = fitz.open(stream=pdf_bytes, filetype="pdf")
|
| 73 |
except Exception:
|
| 74 |
return ""
|
| 75 |
|
| 76 |
+
lang = os.environ.get("TESSERACT_LANG", "jpn+eng") # 日本語+英語
|
| 77 |
text_buf: List[str] = []
|
| 78 |
for i in range(len(doc)):
|
| 79 |
try:
|
| 80 |
page = doc.load_page(i)
|
|
|
|
| 81 |
mat = fitz.Matrix(dpi_scale, dpi_scale)
|
| 82 |
+
pix = page.get_pixmap(matrix=mat, alpha=False)
|
| 83 |
img = Image.frombytes("RGB", [pix.width, pix.height], pix.samples)
|
| 84 |
+
try:
|
| 85 |
+
t = pytesseract.image_to_string(img, lang=lang) or ""
|
| 86 |
+
except FileNotFoundError:
|
| 87 |
+
# tesseract バイナリ or lang データが無い
|
| 88 |
+
t = ""
|
| 89 |
text_buf.append(t)
|
| 90 |
except Exception:
|
| 91 |
text_buf.append("")
|
|
|
|
| 98 |
def pdf_bytes_to_text(pdf_bytes: bytes) -> Tuple[str, Dict]:
|
| 99 |
"""
|
| 100 |
PDF → テキスト。抽出メタを返す:
|
| 101 |
+
meta = {"method": "...", "scanned_likely": bool, "pages": int|None}
|
| 102 |
+
FileNotFoundError(tesseract 未導入 等)はここで潰して RuntimeError にまとめます。
|
|
|
|
|
|
|
|
|
|
| 103 |
"""
|
| 104 |
# 1) pypdf / PyPDF2
|
| 105 |
text = _extract_with_pypdf(pdf_bytes)
|
|
|
|
| 112 |
return text, {"method": "pdfminer", "scanned_likely": False, "pages": None}
|
| 113 |
|
| 114 |
# 3) PyMuPDF get_text
|
| 115 |
+
pm_text, _chars, pages = _extract_with_pymupdf_text(pdf_bytes)
|
| 116 |
if _is_meaningful(pm_text):
|
| 117 |
return pm_text, {"method": "pymupdf", "scanned_likely": False, "pages": pages}
|
| 118 |
|
| 119 |
+
# 4) OCR(依存が無い/学習データが無い時の FileNotFoundError は握りつぶして空文字)
|
| 120 |
+
try:
|
| 121 |
+
ocr_text = _ocr_with_tesseract_via_pymupdf(pdf_bytes)
|
| 122 |
+
except FileNotFoundError:
|
| 123 |
+
ocr_text = ""
|
| 124 |
if _is_meaningful(ocr_text):
|
| 125 |
return ocr_text, {"method": "ocr", "scanned_likely": True, "pages": pages or None}
|
| 126 |
|
| 127 |
+
# 5) ここまで全滅 → 明示的に RuntimeError
|
| 128 |
raise RuntimeError(
|
| 129 |
+
"PDFテキスト抽出に失敗しました(pypdf/PyPDF2/pdfminer.six/PyMuPDF/OCR)。"
|
| 130 |
+
"スキャンPDFの可能性が高いです。OCR を有効化するには "
|
| 131 |
+
"『tesseract-ocr + pytesseract(必要なら tesseract-ocr-jpn)』を導入してください。"
|
| 132 |
)
|
| 133 |
|
| 134 |
# =========================
|
|
|
|
| 142 |
s = re.sub(r"\n{3,}", "\n\n", s)
|
| 143 |
return s.strip()
|
| 144 |
|
| 145 |
+
def chunk_text(text: str, chunk_size: int = 1200, overlap: int = 200, min_chunk: int = 200) -> List[str]:
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 146 |
text = text.strip()
|
| 147 |
if not text:
|
| 148 |
return []
|
|
|
|
| 168 |
# =========================
|
| 169 |
def ingest_pdf_bytes(title: str, source_url: str, pdf_bytes: bytes) -> int:
|
| 170 |
"""
|
| 171 |
+
PDF バイト列を解析し、チャンクをインデックス登録(ファイルは書かない)
|
| 172 |
+
例外は RuntimeError に正規化して上位に伝えます(FileNotFound は潰す)
|
|
|
|
| 173 |
"""
|
| 174 |
if not pdf_bytes:
|
| 175 |
+
raise RuntimeError("empty pdf_bytes")
|
| 176 |
+
|
| 177 |
+
try:
|
| 178 |
+
raw, meta = pdf_bytes_to_text(pdf_bytes)
|
| 179 |
+
except FileNotFoundError as e:
|
| 180 |
+
raise RuntimeError(f"OCR 実行に必要なバイナリ/言語データが見つかりません: {e}") from e
|
| 181 |
|
|
|
|
| 182 |
txt = normalize_text(raw)
|
| 183 |
if not _is_meaningful(txt):
|
| 184 |
raise RuntimeError("Parsed text is too short or empty after normalization")
|