Corin1998 committed on
Commit
28ac450
·
verified ·
1 Parent(s): 359d165

Update rag/ingest.py

Browse files
Files changed (1) hide show
  1. rag/ingest.py +105 -52
rag/ingest.py CHANGED
@@ -1,61 +1,136 @@
1
  # rag/ingest.py
2
  from __future__ import annotations
3
- import io, uuid, re
4
  from typing import List, Dict, Tuple, Optional
5
 
6
- # ベクタ登録は deps 側に委譲(保存先の作成・権限などもそちらで面倒を見ます)
7
  from irpr.deps import add_to_index
8
 
9
  # =========================
10
- # PDF → テキスト(メモリ内で完結
11
  # =========================
 
12
  def _extract_with_pypdf(pdf_bytes: bytes) -> str:
 
13
  try:
14
- # pypdf(新名称)
15
  from pypdf import PdfReader # type: ignore
16
  reader = PdfReader(io.BytesIO(pdf_bytes))
17
- texts = []
18
- for p in reader.pages:
19
- # extract_text() が None の場合があるのでガード
20
- t = p.extract_text() or ""
21
- texts.append(t)
22
  return "\n".join(texts)
23
  except Exception:
24
- # 旧パッケージ名 PyPDF2 にフォールバック
25
  try:
26
  import PyPDF2 # type: ignore
27
  reader = PyPDF2.PdfReader(io.BytesIO(pdf_bytes))
28
- texts = []
29
- for p in reader.pages:
30
- t = p.extract_text() or ""
31
- texts.append(t)
32
  return "\n".join(texts)
33
  except Exception:
34
  return ""
35
 
36
  def _extract_with_pdfminer(pdf_bytes: bytes) -> str:
37
  try:
38
- # pdfminer.six(純Python・精度高め)
39
  from pdfminer.high_level import extract_text # type: ignore
40
- # file-like を渡せる
41
  return extract_text(io.BytesIO(pdf_bytes)) or ""
42
  except Exception:
43
  return ""
44
 
45
- def pdf_bytes_to_text(pdf_bytes: bytes) -> str:
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
46
  # 1) pypdf / PyPDF2
47
  text = _extract_with_pypdf(pdf_bytes)
48
  if _is_meaningful(text):
49
- return text
 
50
  # 2) pdfminer.six
51
  text = _extract_with_pdfminer(pdf_bytes)
52
  if _is_meaningful(text):
53
- return text
54
- # 3) どちらも失敗
55
- raise RuntimeError("Failed to parse PDF with pypdf/PyPDF2/pdfminer.six")
 
 
 
56
 
57
- def _is_meaningful(text: str) -> bool:
58
- return bool(text and text.strip() and len(text.strip()) >= 10)
 
 
 
 
 
 
 
 
 
59
 
60
  # =========================
61
  # テキスト整形・分割
@@ -63,10 +138,8 @@ def _is_meaningful(text: str) -> bool:
63
  _WS_RE = re.compile(r"[ \t\u3000]+") # 半角/全角スペース畳み込み
64
 
65
  def normalize_text(s: str) -> str:
66
- # 改行は温存しつつ、連続スペースを1つに
67
  s = s.replace("\r\n", "\n").replace("\r", "\n")
68
  s = _WS_RE.sub(" ", s)
69
- # 連続改行は最大2に
70
  s = re.sub(r"\n{3,}", "\n\n", s)
71
  return s.strip()
72
 
@@ -76,20 +149,13 @@ def chunk_text(
76
  overlap: int = 200,
77
  min_chunk: int = 200,
78
  ) -> List[str]:
79
- """
80
- 文字数ベースのシンプル分割。
81
- - overlap で前後文脈を少し残す
82
- - 最終チャンクが短すぎる場合は前チャンクに吸収
83
- """
84
  text = text.strip()
85
  if not text:
86
  return []
87
-
88
  chunks: List[str] = []
89
  i = 0
90
  n = len(text)
91
  step = max(1, chunk_size - overlap)
92
-
93
  while i < n:
94
  j = min(n, i + chunk_size)
95
  chunk = text[i:j].strip()
@@ -98,12 +164,9 @@ def chunk_text(
98
  if j >= n:
99
  break
100
  i += step
101
-
102
- # 末尾が短すぎる場合はマージ
103
  if len(chunks) >= 2 and len(chunks[-1]) < min_chunk:
104
  chunks[-2] = (chunks[-2] + "\n" + chunks[-1]).strip()
105
  chunks.pop()
106
-
107
  return chunks
108
 
109
  # =========================
@@ -111,22 +174,18 @@ def chunk_text(
111
  # =========================
112
  def ingest_pdf_bytes(title: str, source_url: str, pdf_bytes: bytes) -> int:
113
  """
114
- アップロード API から渡された PDF バイト列をそのまま解析・登録
115
- - 一切ファイル作らない(= FileNotFound/PermissionDenied を回避)
116
- - チャンク化後、irpr.deps.add_to_index へ登録
117
- Returns: 追加チャンク数
118
  """
119
  if not pdf_bytes:
120
  raise ValueError("empty pdf_bytes")
121
 
122
- # PDF テキスト(メモリ内)
123
- raw = pdf_bytes_to_text(pdf_bytes)
124
  txt = normalize_text(raw)
125
  if not _is_meaningful(txt):
126
- # 日本語埋め込みの品質のため最低限の長さチェック
127
- raise RuntimeError("Parsed text is too short or empty")
128
 
129
- # チャンク分割
130
  doc_id = str(uuid.uuid4())
131
  chunks = chunk_text(txt, chunk_size=1200, overlap=200, min_chunk=200)
132
 
@@ -136,22 +195,16 @@ def ingest_pdf_bytes(title: str, source_url: str, pdf_bytes: bytes) -> int:
136
  "doc_id": doc_id,
137
  "chunk_id": f"{idx:04d}",
138
  "title": title,
139
- "source_url": source_url, # /files/uploads/<name> をそのままリンクに
140
  "text": ck,
141
  })
142
 
143
- # ベクタ登録(保存先は deps 側が責任もって作成・権限付与)
144
  added = add_to_index(records)
145
  return int(added)
146
 
147
- # 既存のエンドポイントから参照されている可能性に配慮してダミー実装を残す
148
  def ingest_edinet_for_company(edinet_code: str, date: Optional[str] = None) -> int:
149
- """
150
- 将来的に EDINET ダウンロード → 解析を実装する場合のフック。
151
- 現状はアップロード PDF の処理に一本化しているため 0 を返す。
152
- """
153
  return 0
154
 
155
- # 参考: 以前のコード互換のため残しておく(未使用)
156
  def download_edinet_pdf(*args, **kwargs):
157
  raise NotImplementedError("download_edinet_pdf is not implemented in this minimal build.")
 
1
  # rag/ingest.py
2
  from __future__ import annotations
3
+ import io, uuid, re, os
4
  from typing import List, Dict, Tuple, Optional
5
 
 
6
  from irpr.deps import add_to_index
7
 
8
  # =========================
9
+ # PDF → テキスト(多段フォールバック
10
  # =========================
11
+
12
  def _extract_with_pypdf(pdf_bytes: bytes) -> str:
13
+ # pypdf → PyPDF2 の順で試す
14
  try:
 
15
  from pypdf import PdfReader # type: ignore
16
  reader = PdfReader(io.BytesIO(pdf_bytes))
17
+ texts = [(p.extract_text() or "") for p in reader.pages]
 
 
 
 
18
  return "\n".join(texts)
19
  except Exception:
 
20
  try:
21
  import PyPDF2 # type: ignore
22
  reader = PyPDF2.PdfReader(io.BytesIO(pdf_bytes))
23
+ texts = [(p.extract_text() or "") for p in reader.pages]
 
 
 
24
  return "\n".join(texts)
25
  except Exception:
26
  return ""
27
 
28
  def _extract_with_pdfminer(pdf_bytes: bytes) -> str:
29
  try:
 
30
  from pdfminer.high_level import extract_text # type: ignore
 
31
  return extract_text(io.BytesIO(pdf_bytes)) or ""
32
  except Exception:
33
  return ""
34
 
35
+ def _extract_with_pymupdf_text(pdf_bytes: bytes) -> Tuple[str, int, int]:
36
+ """
37
+ PyMuPDF で page.get_text("text") を取得。
38
+ 返り値: (全文, 文字数合計, ページ数)
39
+ """
40
+ try:
41
+ import fitz # PyMuPDF
42
+ except Exception:
43
+ return "", 0, 0
44
+ try:
45
+ doc = fitz.open(stream=pdf_bytes, filetype="pdf")
46
+ except Exception:
47
+ return "", 0, 0
48
+
49
+ buf: List[str] = []
50
+ text_chars = 0
51
+ for i in range(len(doc)):
52
+ try:
53
+ page = doc.load_page(i)
54
+ t = page.get_text("text") or ""
55
+ text_chars += len(t.strip())
56
+ buf.append(t)
57
+ except Exception:
58
+ buf.append("")
59
+ doc.close()
60
+ return ("\n".join(buf), text_chars, len(buf))
61
+
62
+ def _ocr_with_tesseract_via_pymupdf(pdf_bytes: bytes, dpi_scale: float = 2.0) -> str:
63
+ """
64
+ Tesseract OCR で画像ベースPDFからテキスト抽出。
65
+ - pytesseract & Tesseract 本体が必要
66
+ - 依存が無い環境では例外を投げずに空文字返す
67
+ """
68
+ try:
69
+ import fitz # PyMuPDF
70
+ from PIL import Image
71
+ import pytesseract
72
+ except Exception:
73
+ return "" # OCR不可能(依存未導入)
74
+
75
+ try:
76
+ doc = fitz.open(stream=pdf_bytes, filetype="pdf")
77
+ except Exception:
78
+ return ""
79
+
80
+ lang = os.environ.get("TESSERACT_LANG", "jpn+eng") # 日本語+英語をデフォルト
81
+ text_buf: List[str] = []
82
+ for i in range(len(doc)):
83
+ try:
84
+ page = doc.load_page(i)
85
+ # DPI ~ 72 * scale。2.0 なら 144dpi 相当
86
+ mat = fitz.Matrix(dpi_scale, dpi_scale)
87
+ pix = page.get_pixmap(matrix=mat, alpha=False) # RGB
88
+ img = Image.frombytes("RGB", [pix.width, pix.height], pix.samples)
89
+ t = pytesseract.image_to_string(img, lang=lang) or ""
90
+ text_buf.append(t)
91
+ except Exception:
92
+ text_buf.append("")
93
+ doc.close()
94
+ return "\n".join(text_buf).strip()
95
+
96
+ def _is_meaningful(text: str, min_len: int = 10) -> bool:
97
+ return bool(text and text.strip() and len(text.strip()) >= min_len)
98
+
99
def pdf_bytes_to_text(pdf_bytes: bytes) -> Tuple[str, Dict]:
    """Convert a PDF to text, trying extractors from cheapest to most capable.

    Fallback order: pypdf/PyPDF2 → pdfminer.six → PyMuPDF → Tesseract OCR.

    Returns:
        (text, meta) where meta = {
            "method": "pypdf" | "pdfminer" | "pymupdf" | "ocr",
            "scanned_likely": bool,
            "pages": int | None,
        }

    Raises:
        RuntimeError: when every extractor (including OCR) yields no usable text.
    """
    # 1) & 2) Pure-text extractors; page count is unknown for these.
    for extractor, method in (
        (_extract_with_pypdf, "pypdf"),
        (_extract_with_pdfminer, "pdfminer"),
    ):
        candidate = extractor(pdf_bytes)
        if _is_meaningful(candidate):
            return candidate, {"method": method, "scanned_likely": False, "pages": None}

    # 3) PyMuPDF text layer.
    pm_text, _chars, pages = _extract_with_pymupdf_text(pdf_bytes)
    if _is_meaningful(pm_text):
        return pm_text, {"method": "pymupdf", "scanned_likely": False, "pages": pages}

    # 4) Nothing extractable so far — likely a scanned (image) PDF; try OCR.
    ocr_text = _ocr_with_tesseract_via_pymupdf(pdf_bytes)
    if _is_meaningful(ocr_text):
        return ocr_text, {"method": "ocr", "scanned_likely": True, "pages": pages or None}

    # OCR unavailable (deps missing) or image quality too poor.
    raise RuntimeError(
        "Failed to parse PDF with pypdf/PyPDF2/pdfminer.six/PyMuPDF. "
        "This looks like a scanned (image) PDF and OCR was not available or failed. "
        "Install Tesseract + pytesseract for OCR (e.g., apt-get install tesseract-ocr; pip install pytesseract Pillow PyMuPDF)."
    )
134
 
135
  # =========================
136
  # テキスト整形・分割
 
138
# Collapses runs of ASCII spaces, tabs, and ideographic space (U+3000).
_WS_RE = re.compile(r"[ \t\u3000]+")

def normalize_text(s: str) -> str:
    """Normalize line endings and whitespace while preserving paragraph breaks.

    - CRLF / lone CR are converted to LF.
    - Runs of spaces/tabs (incl. full-width space) collapse to one space.
    - Three or more consecutive newlines collapse to two.
    - Leading/trailing whitespace is stripped.
    """
    unified = s.replace("\r\n", "\n").replace("\r", "\n")
    collapsed = _WS_RE.sub(" ", unified)
    return re.sub(r"\n{3,}", "\n\n", collapsed).strip()
145
 
 
149
def chunk_text(
    text: str,
    chunk_size: int = 1200,
    overlap: int = 200,
    min_chunk: int = 200,
) -> List[str]:
    """Split *text* into character-based chunks with overlapping context.

    - Adjacent chunks share roughly `overlap` characters.
    - A trailing chunk shorter than `min_chunk` is merged into its predecessor.

    NOTE(review): the `def` line was cut by the diff hunk; the `chunk_size`
    default of 1200 matches the only visible call site — confirm in the repo.
    """
    text = text.strip()
    if not text:
        return []

    chunks: List[str] = []
    i = 0
    n = len(text)
    # Guard against a non-positive step when overlap >= chunk_size.
    step = max(1, chunk_size - overlap)

    while i < n:
        j = min(n, i + chunk_size)
        piece = text[i:j].strip()
        if piece:
            chunks.append(piece)
        if j >= n:
            break
        i += step

    # Absorb an overly short tail into the previous chunk.
    if len(chunks) >= 2 and len(chunks[-1]) < min_chunk:
        chunks[-2] = (chunks[-2] + "\n" + chunks[-1]).strip()
        chunks.pop()

    return chunks
171
 
172
  # =========================
 
174
  # =========================
175
def ingest_pdf_bytes(title: str, source_url: str, pdf_bytes: bytes) -> int:
    """Parse an uploaded PDF byte string and register its chunks in the index.

    Extraction falls back pypdf → pdfminer → PyMuPDF → (optional) OCR; no
    temporary files are written. Chunks are registered via
    ``irpr.deps.add_to_index``.

    Args:
        title: Document title stored with each chunk.
        source_url: Link stored with each chunk (e.g. /files/uploads/<name>).
        pdf_bytes: Raw PDF content.

    Returns:
        Number of chunks added to the index.

    Raises:
        ValueError: when *pdf_bytes* is empty.
        RuntimeError: when no usable text could be extracted.
    """
    if not pdf_bytes:
        raise ValueError("empty pdf_bytes")

    # meta (extraction method / scanned flag) is currently unused here,
    # but kept for future logging.
    raw, meta = pdf_bytes_to_text(pdf_bytes)
    txt = normalize_text(raw)
    if not _is_meaningful(txt):
        raise RuntimeError("Parsed text is too short or empty after normalization")

    doc_id = str(uuid.uuid4())
    chunks = chunk_text(txt, chunk_size=1200, overlap=200, min_chunk=200)

    records: List[Dict] = []
    for idx, ck in enumerate(chunks):
        records.append({
            "doc_id": doc_id,
            "chunk_id": f"{idx:04d}",
            "title": title,
            "source_url": source_url,
            "text": ck,
        })

    added = add_to_index(records)
    return int(added)
204
 
205
+ # 互換ダミー
206
def ingest_edinet_for_company(edinet_code: str, date: Optional[str] = None) -> int:
    """Compatibility stub for a future EDINET download/ingest pipeline.

    Ingestion is currently unified around uploaded PDFs, so this always
    returns 0 (no chunks added).
    """
    return 0
208
 
 
209
def download_edinet_pdf(*args, **kwargs):
    """Legacy hook kept for caller compatibility; always raises NotImplementedError."""
    raise NotImplementedError("download_edinet_pdf is not implemented in this minimal build.")