Corin1998 committed on
Commit
81a064e
·
verified ·
1 Parent(s): 9fc4a26

Upload 7 files

Browse files
Files changed (3) hide show
  1. app.py +52 -13
  2. config.yaml +3 -24
  3. ingest.py +29 -31
app.py CHANGED
@@ -1,5 +1,5 @@
1
  from __future__ import annotations
2
- import os, json, yaml, subprocess, sys, pathlib, traceback, shutil, re
3
  from typing import List, Dict, Tuple, Iterable, Optional
4
 
5
  from fastapi import FastAPI, Body
@@ -56,11 +56,18 @@ except Exception as e:
56
  CFG = DEFAULT_CFG
57
  CFG_ERR = "config.yaml 読み込みエラー: " + str(e)
58
 
59
- # ===== paths & utils =====
60
- INDEX_PATH = pathlib.Path("data/index/faiss.index")
61
- META_PATH = pathlib.Path("data/index/meta.jsonl")
62
- PDF_DIR = pathlib.Path("data/pdf")
 
 
 
 
 
 
63
  PDF_DIR.mkdir(parents=True, exist_ok=True)
 
64
 
65
  def _lazy_imports():
66
  global faiss, np, embed_texts, chat, detect_out_of_scope, sanitize, compliance_block, SCOPE_HINT
@@ -172,14 +179,18 @@ def _safe_name(name: str) -> str:
172
 
173
  def save_uploaded_pdfs(file_paths: Optional[Iterable[str]]) -> Tuple[int, List[str], List[str]]:
174
  """
175
- Gradioのgr.Files(type='filepath') から渡される一時ファイルパス群を data/pdf/ にコピー
176
- return: (保存数, 保存先パス一覧, スキップorエラー一覧)
177
  """
178
  saved, skipped = [], []
179
  if not file_paths:
180
  return 0, saved, ["アップロードされたPDFがありません。"]
181
 
182
- PDF_DIR.mkdir(parents=True, exist_ok=True)
 
 
 
 
183
  for fp in file_paths:
184
  if not fp:
185
  continue
@@ -192,8 +203,11 @@ def save_uploaded_pdfs(file_paths: Optional[Iterable[str]]) -> Tuple[int, List[s
192
  continue
193
  dst = PDF_DIR / _safe_name(src.name)
194
  try:
195
- shutil.copy2(src, dst)
 
196
  saved.append(str(dst))
 
 
197
  except Exception as e:
198
  skipped.append(f"{src.name}: コピー失敗 ({e})")
199
  return len(saved), saved, skipped
@@ -202,7 +216,7 @@ def upload_and_rebuild(file_paths: Optional[List[str]]) -> str:
202
  n, saved, skipped = save_uploaded_pdfs(file_paths)
203
  msg = []
204
  if n > 0:
205
- msg.append(f"📥 {n} 件のPDFを data/pdf/ に保存しました。")
206
  msg.extend([f"- {p}" for p in saved[:10]])
207
  if skipped:
208
  msg.append("⚠️ スキップ/エラー:")
@@ -216,9 +230,9 @@ def rebuild_index() -> str:
216
  if not _check_api_key():
217
  return "OPENAI_API_KEY が未設定です。コンソール / Secrets に登録してください。"
218
  if not list(PDF_DIR.glob("*.pdf")):
219
- return "data/pdf/ にPDFがありません。PDFをアップロードして再実行してください。"
220
  try:
221
- out = subprocess.run([sys.executable, "ingest.py"], capture_output=True, text=True, check=True)
222
  # キャッシュ破棄
223
  global _INDEX, _METAS
224
  _INDEX = None
@@ -229,6 +243,29 @@ def rebuild_index() -> str:
229
  except Exception as e:
230
  return "❌ 予期せぬエラー: " + str(e) + "\n" + traceback.format_exc()[-1200:]
231
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
232
  # ===== FastAPI =====
233
  app = FastAPI(title=CFG.get("app_name", "RAG Bot"))
234
  app.add_middleware(
@@ -278,9 +315,11 @@ with gr.Blocks(fill_height=True, title=CFG.get("app_name", "RAG Bot")) as demo:
278
  uploads = gr.Files(label="PDFをドラッグ&ドロップ(複数可)", file_types=[".pdf"], type="filepath")
279
  with gr.Row():
280
  up_btn = gr.Button("アップロード → インデックス再構築", variant="secondary")
 
281
  up_log = gr.Markdown()
 
282
  up_btn.click(fn=upload_and_rebuild, inputs=[uploads], outputs=[up_log])
 
283
 
284
  from gradio.routes import mount_gradio_app
285
  mount_gradio_app(app, demo, path="/")
286
-
 
1
  from __future__ import annotations
2
+ import os, json, yaml, subprocess, sys, pathlib, traceback, shutil, re, getpass, stat
3
  from typing import List, Dict, Tuple, Iterable, Optional
4
 
5
  from fastapi import FastAPI, Body
 
56
  CFG = DEFAULT_CFG
57
  CFG_ERR = "config.yaml 読み込みエラー: " + str(e)
58
 
59
+ # ===== absolute paths =====
60
+ BASE_DIR = pathlib.Path(__file__).resolve().parent
61
+ DATA_DIR = BASE_DIR / "data"
62
+ INDEX_DIR = DATA_DIR / "index"
63
+ PDF_DIR = DATA_DIR / "pdf"
64
+
65
+ INDEX_PATH = INDEX_DIR / "faiss.index"
66
+ META_PATH = INDEX_DIR / "meta.jsonl"
67
+
68
+ # ensure dirs
69
  PDF_DIR.mkdir(parents=True, exist_ok=True)
70
+ INDEX_DIR.mkdir(parents=True, exist_ok=True)
71
 
72
  def _lazy_imports():
73
  global faiss, np, embed_texts, chat, detect_out_of_scope, sanitize, compliance_block, SCOPE_HINT
 
179
 
180
  def save_uploaded_pdfs(file_paths: Optional[Iterable[str]]) -> Tuple[int, List[str], List[str]]:
181
  """
182
+ gr.Files(type='filepath') から一時ファイル群を data/pdf/ に保存
183
+ return: (保存数, 保存先, スキップ/エラー)
184
  """
185
  saved, skipped = [], []
186
  if not file_paths:
187
  return 0, saved, ["アップロードされたPDFがありません。"]
188
 
189
+ try:
190
+ PDF_DIR.mkdir(parents=True, exist_ok=True)
191
+ except Exception as e:
192
+ return 0, [], [f"data/pdf の作成に失敗: {e}"]
193
+
194
  for fp in file_paths:
195
  if not fp:
196
  continue
 
203
  continue
204
  dst = PDF_DIR / _safe_name(src.name)
205
  try:
206
+ # 権限問題を避けるため copyfile(メタデータを引き継がない)
207
+ shutil.copyfile(src, dst)
208
  saved.append(str(dst))
209
+ except PermissionError as e:
210
+ skipped.append(f"{src.name}: Permission denied({dst})。Dockerfileの所有権設定を確認してください。")
211
  except Exception as e:
212
  skipped.append(f"{src.name}: コピー失敗 ({e})")
213
  return len(saved), saved, skipped
 
216
  n, saved, skipped = save_uploaded_pdfs(file_paths)
217
  msg = []
218
  if n > 0:
219
+ msg.append(f"📥 {n} 件のPDFを {PDF_DIR} に保存しました。")
220
  msg.extend([f"- {p}" for p in saved[:10]])
221
  if skipped:
222
  msg.append("⚠️ スキップ/エラー:")
 
230
  if not _check_api_key():
231
  return "OPENAI_API_KEY が未設定です。コンソール / Secrets に登録してください。"
232
  if not list(PDF_DIR.glob("*.pdf")):
233
+ return f"{PDF_DIR} にPDFがありません。PDFをアップロードして再実行してください。"
234
  try:
235
+ out = subprocess.run([sys.executable, str(BASE_DIR / "ingest.py")], capture_output=True, text=True, check=True)
236
  # キャッシュ破棄
237
  global _INDEX, _METAS
238
  _INDEX = None
 
243
  except Exception as e:
244
  return "❌ 予期せぬエラー: " + str(e) + "\n" + traceback.format_exc()[-1200:]
245
 
246
+ # ===== File-system diagnose (optional) =====
247
+ def fs_diagnose() -> str:
248
+ lines = []
249
+ lines.append(f"User: {getpass.getuser()}")
250
+ lines.append(f"CWD : {os.getcwd()}")
251
+ for p in [BASE_DIR, DATA_DIR, PDF_DIR, INDEX_DIR]:
252
+ try:
253
+ st = p.stat()
254
+ mode = stat.filemode(st.st_mode)
255
+ lines.append(f"{p} exists={p.exists()} owner={st.st_uid}:{st.st_gid} mode={mode}")
256
+ except Exception as e:
257
+ lines.append(f"{p} stat error: {e}")
258
+ # 書き込みテスト
259
+ try:
260
+ test = PDF_DIR / "_write_test.tmp"
261
+ with open(test, "wb") as f:
262
+ f.write(b"ok")
263
+ test.unlink()
264
+ lines.append("WRITE TEST: OK (data/pdf に書き込み可能)")
265
+ except Exception as e:
266
+ lines.append(f"WRITE TEST: NG -> {e}")
267
+ return "```\n" + "\n".join(lines) + "\n```"
268
+
269
  # ===== FastAPI =====
270
  app = FastAPI(title=CFG.get("app_name", "RAG Bot"))
271
  app.add_middleware(
 
315
  uploads = gr.Files(label="PDFをドラッグ&ドロップ(複数可)", file_types=[".pdf"], type="filepath")
316
  with gr.Row():
317
  up_btn = gr.Button("アップロード → インデックス再構築", variant="secondary")
318
+ diag_btn = gr.Button("📋 ストレージ診断")
319
  up_log = gr.Markdown()
320
+ diag_log = gr.Markdown()
321
  up_btn.click(fn=upload_and_rebuild, inputs=[uploads], outputs=[up_log])
322
+ diag_btn.click(fn=fs_diagnose, outputs=[diag_log])
323
 
324
  from gradio.routes import mount_gradio_app
325
  mount_gradio_app(app, demo, path="/")
 
config.yaml CHANGED
@@ -1,16 +1,8 @@
1
  app_name: "IR/ESG RAG Bot (OpenAI, 8 languages)"
2
  embedding_model: "text-embedding-3-large"
3
  normalize_embeddings: true
4
-
5
- chunk:
6
- target_chars: 1400
7
- overlap_chars: 180
8
-
9
- retrieval:
10
- top_k: 6
11
- score_threshold: 0.15
12
- mmr_lambda: 0.3
13
-
14
  llm:
15
  model: "gpt-4o-mini"
16
  max_output_tokens: 700
@@ -18,19 +10,6 @@ llm:
18
  system_prompt: |-
19
  あなたは上場企業のIR・ESG開示に特化したRAGアシスタントです。回答は常に根拠(文書名・ページ)を箇条書きで示し、
20
  文書外の推測や断定は避けます。数値は年度と単位を明記し、最新年度を優先してください。
21
-
22
  languages:
23
  preferred: [ja, en, zh, ko, fr, de, es, it]
24
- labels:
25
- ja: "日本語"
26
- en: "English"
27
- zh: "中文"
28
- ko: "한국어"
29
- fr: "Français"
30
- de: "Deutsch"
31
- es: "Español"
32
- it: "Italiano"
33
-
34
- logging:
35
- save_qa: true
36
- path: "logs/qa_log.jsonl"
 
1
  app_name: "IR/ESG RAG Bot (OpenAI, 8 languages)"
2
  embedding_model: "text-embedding-3-large"
3
  normalize_embeddings: true
4
+ chunk: { target_chars: 1400, overlap_chars: 180 }
5
+ retrieval: { top_k: 6, score_threshold: 0.15, mmr_lambda: 0.3 }
 
 
 
 
 
 
 
 
6
  llm:
7
  model: "gpt-4o-mini"
8
  max_output_tokens: 700
 
10
  system_prompt: |-
11
  あなたは上場企業のIR・ESG開示に特化したRAGアシスタントです。回答は常に根拠(文書名・ページ)を箇条書きで示し、
12
  文書外の推測や断定は避けます。数値は年度と単位を明記し、最新年度を優先してください。
 
13
  languages:
14
  preferred: [ja, en, zh, ko, fr, de, es, it]
15
+ labels: { ja: 日本語, en: English, zh: 中文, ko: 한국어, fr: Français, de: Deutsch, es: Español, it: Italiano }
 
 
 
 
 
 
 
 
 
 
 
 
ingest.py CHANGED
@@ -10,18 +10,22 @@ import yaml
10
  from openai_client import embed_texts
11
  from guardrails import sanitize
12
 
13
- CFG = yaml.safe_load(open("config.yaml", encoding="utf-8"))
14
- EMB_MODEL = CFG["embedding_model"]
15
- NORMALIZE = CFG.get("normalize_embeddings", True)
16
-
17
- DATA_DIR = pathlib.Path("data")
18
- PDF_DIR = DATA_DIR / "pdf"
19
  INDEX_DIR = DATA_DIR / "index"
20
- META_PATH = INDEX_DIR / "meta.jsonl" # app.py と一致
21
  INDEX_PATH = INDEX_DIR / "faiss.index"
22
 
23
- def read_pdf_with_pages(path: str) -> List[Tuple[int, str]]:
24
- pages: List[Tuple[int, str]] = []
 
 
 
 
 
 
25
  reader = PdfReader(path)
26
  for i, p in enumerate(reader.pages):
27
  txt = p.extract_text() or ""
@@ -29,8 +33,8 @@ def read_pdf_with_pages(path: str) -> List[Tuple[int, str]]:
29
  pages.append((i + 1, txt))
30
  return pages
31
 
32
- def split_chunks(pages: List[Tuple[int, str]], target_chars: int, overlap_chars: int) -> List[Dict]:
33
- chunks: List[Dict] = []
34
  for page, text in pages:
35
  if not text:
36
  continue
@@ -51,33 +55,27 @@ def l2_normalize(m: np.ndarray) -> np.ndarray:
51
 
52
  def build_index():
53
  INDEX_DIR.mkdir(parents=True, exist_ok=True)
54
- meta_f = open(META_PATH, "w", encoding="utf-8")
55
-
56
- target_chars = CFG["chunk"]["target_chars"]
57
- overlap_chars = CFG["chunk"]["overlap_chars"]
58
-
59
- texts: List[str] = []
60
- for pdf in sorted(PDF_DIR.glob("*.pdf")):
61
- print(f"Processing {pdf.name}...")
62
- pages = read_pdf_with_pages(str(pdf))
63
- chunks = split_chunks(pages, target_chars, overlap_chars)
64
- for c in chunks:
65
- t = c["text"][:1800]
66
- texts.append(t)
67
- meta = {"source": pdf.name, "page": c["page"], "text": sanitize(t)}
68
- meta_f.write(json.dumps(meta, ensure_ascii=False) + "\n")
69
-
70
- meta_f.close()
71
 
72
- if not texts:
73
  raise SystemExit("Put PDFs under data/pdf/")
74
 
 
75
  vecs = embed_texts(texts, EMB_MODEL)
76
  mat = np.array(vecs, dtype="float32")
77
  mat = l2_normalize(mat)
78
 
79
- # コサイン類似(正規化済みベクトル × 内積)
80
- index = faiss.IndexFlatIP(mat.shape[1])
81
  index.add(mat)
82
  faiss.write_index(index, str(INDEX_PATH))
83
  print(f"Index {len(texts)} chunks → {INDEX_PATH}")
 
10
  from openai_client import embed_texts
11
  from guardrails import sanitize
12
 
13
+ # absolute paths
14
+ BASE_DIR = pathlib.Path(__file__).resolve().parent
15
+ DATA_DIR = BASE_DIR / "data"
16
+ PDF_DIR = DATA_DIR / "pdf"
 
 
17
  INDEX_DIR = DATA_DIR / "index"
18
+ META_PATH = INDEX_DIR / "meta.jsonl"
19
  INDEX_PATH = INDEX_DIR / "faiss.index"
20
 
21
+ CFG = yaml.safe_load(open(BASE_DIR / "config.yaml", encoding="utf-8"))
22
+ EMB_MODEL = CFG["embedding_model"]
23
+ NORMALIZE = CFG.get("normalize_embeddings", True)
24
+ target_chars = CFG["chunk"]["target_chars"]
25
+ overlap_chars = CFG["chunk"]["overlap_chars"]
26
+
27
+ def read_pdf_with_pages(path: str):
28
+ pages = []
29
  reader = PdfReader(path)
30
  for i, p in enumerate(reader.pages):
31
  txt = p.extract_text() or ""
 
33
  pages.append((i + 1, txt))
34
  return pages
35
 
36
+ def split_chunks(pages: List[Tuple[int, str]], target_chars: int, overlap_chars: int):
37
+ chunks = []
38
  for page, text in pages:
39
  if not text:
40
  continue
 
55
 
56
  def build_index():
57
  INDEX_DIR.mkdir(parents=True, exist_ok=True)
58
+ with open(META_PATH, "w", encoding="utf-8") as meta_f:
59
+ texts: List[str] = []
60
+ for pdf in sorted(PDF_DIR.glob("*.pdf")):
61
+ print(f"Processing {pdf.name}...")
62
+ pages = read_pdf_with_pages(str(pdf))
63
+ chunks = split_chunks(pages, target_chars, overlap_chars)
64
+ for c in chunks:
65
+ t = c["text"][:1800]
66
+ texts.append(t)
67
+ meta = {"source": pdf.name, "page": c["page"], "text": sanitize(t)}
68
+ meta_f.write(json.dumps(meta, ensure_ascii=False) + "\n")
 
 
 
 
 
 
69
 
70
+ if not META_PATH.exists() or META_PATH.stat().st_size == 0:
71
  raise SystemExit("Put PDFs under data/pdf/")
72
 
73
+ # embed
74
  vecs = embed_texts(texts, EMB_MODEL)
75
  mat = np.array(vecs, dtype="float32")
76
  mat = l2_normalize(mat)
77
 
78
+ index = faiss.IndexFlatIP(mat.shape[1]) # cosine via normalized dot
 
79
  index.add(mat)
80
  faiss.write_index(index, str(INDEX_PATH))
81
  print(f"Index {len(texts)} chunks → {INDEX_PATH}")