Corin1998 committed on
Commit
ab914af
·
verified ·
1 Parent(s): e04eca5

Update rag/ingest.py

Browse files
Files changed (1) hide show
  1. rag/ingest.py +35 -38
rag/ingest.py CHANGED
@@ -1,6 +1,6 @@
1
  # rag/ingest.py
2
  from __future__ import annotations
3
- import io, uuid, re, os
4
  from typing import List, Dict, Tuple, Optional
5
 
6
  from irpr.deps import add_to_index
@@ -10,7 +10,6 @@ from irpr.deps import add_to_index
10
  # =========================
11
 
12
  def _extract_with_pypdf(pdf_bytes: bytes) -> str:
13
- # pypdf → PyPDF2 の順で試す
14
  try:
15
  from pypdf import PdfReader # type: ignore
16
  reader = PdfReader(io.BytesIO(pdf_bytes))
@@ -33,10 +32,6 @@ def _extract_with_pdfminer(pdf_bytes: bytes) -> str:
33
  return ""
34
 
35
  def _extract_with_pymupdf_text(pdf_bytes: bytes) -> Tuple[str, int, int]:
36
- """
37
- PyMuPDF で page.get_text("text") を取得。
38
- 返り値: (全文, 文字数合計, ページ数)
39
- """
40
  try:
41
  import fitz # PyMuPDF
42
  except Exception:
@@ -56,37 +51,41 @@ def _extract_with_pymupdf_text(pdf_bytes: bytes) -> Tuple[str, int, int]:
56
  buf.append(t)
57
  except Exception:
58
  buf.append("")
 
59
  doc.close()
60
- return ("\n".join(buf), text_chars, len(buf))
61
 
62
  def _ocr_with_tesseract_via_pymupdf(pdf_bytes: bytes, dpi_scale: float = 2.0) -> str:
63
  """
64
- Tesseract OCR で画像ベースPDFからテキスト抽出
65
- - pytesseract & Tesseract 本体が必要
66
- - 依存が無い環境は例外を投げずに空文字返す
67
  """
68
  try:
69
  import fitz # PyMuPDF
70
  from PIL import Image
71
  import pytesseract
72
  except Exception:
73
- return "" # OCR不可(依存未導入)
74
 
75
  try:
76
  doc = fitz.open(stream=pdf_bytes, filetype="pdf")
77
  except Exception:
78
  return ""
79
 
80
- lang = os.environ.get("TESSERACT_LANG", "jpn+eng") # 日本語+英語をデフォルト
81
  text_buf: List[str] = []
82
  for i in range(len(doc)):
83
  try:
84
  page = doc.load_page(i)
85
- # DPI ~ 72 * scale。2.0 なら 144dpi 相当
86
  mat = fitz.Matrix(dpi_scale, dpi_scale)
87
- pix = page.get_pixmap(matrix=mat, alpha=False) # RGB
88
  img = Image.frombytes("RGB", [pix.width, pix.height], pix.samples)
89
- t = pytesseract.image_to_string(img, lang=lang) or ""
 
 
 
 
90
  text_buf.append(t)
91
  except Exception:
92
  text_buf.append("")
@@ -99,11 +98,8 @@ def _is_meaningful(text: str, min_len: int = 10) -> bool:
99
  def pdf_bytes_to_text(pdf_bytes: bytes) -> Tuple[str, Dict]:
100
  """
101
  PDF → テキスト。抽出メタを返す:
102
- meta = {
103
- "method": "pypdf|pdfminer|pymupdf|ocr",
104
- "scanned_likely": bool,
105
- "pages": int
106
- }
107
  """
108
  # 1) pypdf / PyPDF2
109
  text = _extract_with_pypdf(pdf_bytes)
@@ -116,20 +112,23 @@ def pdf_bytes_to_text(pdf_bytes: bytes) -> Tuple[str, Dict]:
116
  return text, {"method": "pdfminer", "scanned_likely": False, "pages": None}
117
 
118
  # 3) PyMuPDF get_text
119
- pm_text, text_chars, pages = _extract_with_pymupdf_text(pdf_bytes)
120
  if _is_meaningful(pm_text):
121
  return pm_text, {"method": "pymupdf", "scanned_likely": False, "pages": pages}
122
 
123
- # 4) ここまで空 ⇒ スキャンPDFの可能性 OCR を試す
124
- ocr_text = _ocr_with_tesseract_via_pymupdf(pdf_bytes)
 
 
 
125
  if _is_meaningful(ocr_text):
126
  return ocr_text, {"method": "ocr", "scanned_likely": True, "pages": pages or None}
127
 
128
- # OCR も不可(依存未導入 or 画像品質不良)
129
  raise RuntimeError(
130
- "Failed to parse PDF with pypdf/PyPDF2/pdfminer.six/PyMuPDF. "
131
- "This looks like a scanned (image) PDF and OCR was not available or failed. "
132
- "Install Tesseract + pytesseract for OCR (e.g., apt-get install tesseract-ocr; pip install pytesseract Pillow PyMuPDF)."
133
  )
134
 
135
  # =========================
@@ -143,12 +142,7 @@ def normalize_text(s: str) -> str:
143
  s = re.sub(r"\n{3,}", "\n\n", s)
144
  return s.strip()
145
 
146
- def chunk_text(
147
- text: str,
148
- chunk_size: int = 1200,
149
- overlap: int = 200,
150
- min_chunk: int = 200,
151
- ) -> List[str]:
152
  text = text.strip()
153
  if not text:
154
  return []
@@ -174,14 +168,17 @@ def chunk_text(
174
  # =========================
175
  def ingest_pdf_bytes(title: str, source_url: str, pdf_bytes: bytes) -> int:
176
  """
177
- アップロード API から渡された PDF バイト列を解析登録(ファイルは書かない)
178
- - テキスト抽出pypdf→pdfminer→PyMuPDF→(任意)OCR の順フォールバック
179
- - OCR は pytesseract + Tesseract がある場合のみ自動使用
180
  """
181
  if not pdf_bytes:
182
- raise ValueError("empty pdf_bytes")
 
 
 
 
 
183
 
184
- raw, meta = pdf_bytes_to_text(pdf_bytes)
185
  txt = normalize_text(raw)
186
  if not _is_meaningful(txt):
187
  raise RuntimeError("Parsed text is too short or empty after normalization")
 
1
  # rag/ingest.py
2
  from __future__ import annotations
3
+ import io, uuid, re, os, traceback
4
  from typing import List, Dict, Tuple, Optional
5
 
6
  from irpr.deps import add_to_index
 
10
  # =========================
11
 
12
  def _extract_with_pypdf(pdf_bytes: bytes) -> str:
 
13
  try:
14
  from pypdf import PdfReader # type: ignore
15
  reader = PdfReader(io.BytesIO(pdf_bytes))
 
32
  return ""
33
 
34
  def _extract_with_pymupdf_text(pdf_bytes: bytes) -> Tuple[str, int, int]:
 
 
 
 
35
  try:
36
  import fitz # PyMuPDF
37
  except Exception:
 
51
  buf.append(t)
52
  except Exception:
53
  buf.append("")
54
+ pages = len(buf)
55
  doc.close()
56
+ return ("\n".join(buf), text_chars, pages)
57
 
58
  def _ocr_with_tesseract_via_pymupdf(pdf_bytes: bytes, dpi_scale: float = 2.0) -> str:
59
  """
60
+ Tesseract OCR(任意)。pytesseract / Tesseract 本体が無い場合は空文字を返す。
61
+ Tesseract の未導入や言語データ欠如(jpn.traineddata 無し)による FileNotFoundError
62
+ ここで握りつぶして空文字にします(上位で「OCRが必要」として案内)。
63
  """
64
  try:
65
  import fitz # PyMuPDF
66
  from PIL import Image
67
  import pytesseract
68
  except Exception:
69
+ return "" # OCR不可(依存未導入)
70
 
71
  try:
72
  doc = fitz.open(stream=pdf_bytes, filetype="pdf")
73
  except Exception:
74
  return ""
75
 
76
+ lang = os.environ.get("TESSERACT_LANG", "jpn+eng") # 日本語+英語
77
  text_buf: List[str] = []
78
  for i in range(len(doc)):
79
  try:
80
  page = doc.load_page(i)
 
81
  mat = fitz.Matrix(dpi_scale, dpi_scale)
82
+ pix = page.get_pixmap(matrix=mat, alpha=False)
83
  img = Image.frombytes("RGB", [pix.width, pix.height], pix.samples)
84
+ try:
85
+ t = pytesseract.image_to_string(img, lang=lang) or ""
86
+ except FileNotFoundError:
87
+ # tesseract バイナリ or lang データが無い
88
+ t = ""
89
  text_buf.append(t)
90
  except Exception:
91
  text_buf.append("")
 
98
  def pdf_bytes_to_text(pdf_bytes: bytes) -> Tuple[str, Dict]:
99
  """
100
  PDF → テキスト。抽出メタを返す:
101
+ meta = {"method": "...", "scanned_likely": bool, "pages": int|None}
102
+ FileNotFoundError(tesseract 未導入 等)はここで潰して RuntimeError にまとめます。
 
 
 
103
  """
104
  # 1) pypdf / PyPDF2
105
  text = _extract_with_pypdf(pdf_bytes)
 
112
  return text, {"method": "pdfminer", "scanned_likely": False, "pages": None}
113
 
114
  # 3) PyMuPDF get_text
115
+ pm_text, _chars, pages = _extract_with_pymupdf_text(pdf_bytes)
116
  if _is_meaningful(pm_text):
117
  return pm_text, {"method": "pymupdf", "scanned_likely": False, "pages": pages}
118
 
119
+ # 4) OCR(依存/学習データが無い時の FileNotFoundError は握りつぶして空文字)
120
+ try:
121
+ ocr_text = _ocr_with_tesseract_via_pymupdf(pdf_bytes)
122
+ except FileNotFoundError:
123
+ ocr_text = ""
124
  if _is_meaningful(ocr_text):
125
  return ocr_text, {"method": "ocr", "scanned_likely": True, "pages": pages or None}
126
 
127
+ # 5) ここまで全滅 明示的に RuntimeError
128
  raise RuntimeError(
129
+ "PDFテキスト抽出に失敗しました(pypdf/PyPDF2/pdfminer.six/PyMuPDF/OCR)。"
130
+ "スキャンPDFの可能性が高いです。OCR を有効化するには "
131
+ "『tesseract-ocr + pytesseract(必要なら tesseract-ocr-jpn)』を導入してください。"
132
  )
133
 
134
  # =========================
 
142
  s = re.sub(r"\n{3,}", "\n\n", s)
143
  return s.strip()
144
 
145
+ def chunk_text(text: str, chunk_size: int = 1200, overlap: int = 200, min_chunk: int = 200) -> List[str]:
 
 
 
 
 
146
  text = text.strip()
147
  if not text:
148
  return []
 
168
  # =========================
169
  def ingest_pdf_bytes(title: str, source_url: str, pdf_bytes: bytes) -> int:
170
  """
171
+ PDF バイト列を解析し、チャンクをインデックス登録(ファイルは書かない)
172
+ 例外は RuntimeError に正規化して上位に伝えます(FileNotFound は潰す)。
 
173
  """
174
  if not pdf_bytes:
175
+ raise RuntimeError("empty pdf_bytes")
176
+
177
+ try:
178
+ raw, meta = pdf_bytes_to_text(pdf_bytes)
179
+ except FileNotFoundError as e:
180
+ raise RuntimeError(f"OCR 実行に必要なバイナリ/言語データが見つかりません: {e}") from e
181
 
 
182
  txt = normalize_text(raw)
183
  if not _is_meaningful(txt):
184
  raise RuntimeError("Parsed text is too short or empty after normalization")