Corin1998 committed on
Commit
870d2c7
·
verified ·
1 Parent(s): 81a064e

Upload 7 files

Browse files
Files changed (1) hide show
  1. app.py +34 -25
app.py CHANGED
@@ -15,7 +15,7 @@ DEFAULT_CFG = {
15
  "chunk": {"target_chars": 1400, "overlap_chars": 180},
16
  "retrieval": {"top_k": 6, "score_threshold": 0.15, "mmr_lambda": 0.3},
17
  "llm": {
18
- "model": "gpt-4o-mini",
19
  "max_output_tokens": 700,
20
  "temperature": 0.2,
21
  "system_prompt": (
@@ -65,10 +65,10 @@ PDF_DIR = DATA_DIR / "pdf"
65
  INDEX_PATH = INDEX_DIR / "faiss.index"
66
  META_PATH = INDEX_DIR / "meta.jsonl"
67
 
68
- # ensure dirs
69
  PDF_DIR.mkdir(parents=True, exist_ok=True)
70
  INDEX_DIR.mkdir(parents=True, exist_ok=True)
71
 
 
72
  def _lazy_imports():
73
  global faiss, np, embed_texts, chat, detect_out_of_scope, sanitize, compliance_block, SCOPE_HINT
74
  import faiss, numpy as np
@@ -79,14 +79,16 @@ def _lazy_imports():
79
  def _index_exists(): return INDEX_PATH.exists() and META_PATH.exists()
80
  def _check_api_key(): return bool(os.getenv("OPENAI_API_KEY"))
81
 
82
- # ===== retrieval =====
83
  _INDEX = None
84
  _METAS = None
85
 
86
  def _ensure_index_loaded():
87
  global _INDEX, _METAS
88
- if _INDEX is not None and _METAS is not None: return
89
- if not _index_exists(): raise RuntimeError("index_not_ready")
 
 
90
  faiss, *_ = _lazy_imports()
91
  _INDEX = faiss.read_index(str(INDEX_PATH))
92
  _METAS = [json.loads(l) for l in open(META_PATH, encoding="utf-8")]
@@ -135,7 +137,8 @@ _LANG_INSTRUCTIONS = {
135
 
136
  def generate_answer(q: str, lang: str = "ja"):
137
  q = (q or "").strip()
138
- if not q: return "質問を入力してください。", {}
 
139
  try:
140
  _, _, _, chat, detect_out_of_scope, sanitize, compliance_block, SCOPE_HINT = _lazy_imports()
141
  if detect_out_of_scope(q):
@@ -167,7 +170,7 @@ def generate_answer(q: str, lang: str = "ja"):
167
  except Exception as e:
168
  return "❌ 実行時エラー: " + str(e) + "\n" + traceback.format_exc()[-1200:], {}
169
 
170
- # ===== Upload & Rebuild =====
171
  SAFE_RE = re.compile(r"[^A-Za-z0-9._-]+")
172
 
173
  def _safe_name(name: str) -> str:
@@ -178,14 +181,9 @@ def _safe_name(name: str) -> str:
178
  return base
179
 
180
  def save_uploaded_pdfs(file_paths: Optional[Iterable[str]]) -> Tuple[int, List[str], List[str]]:
181
- """
182
- gr.Files(type='filepath') からの一時ファイル群を data/pdf/ に保存
183
- return: (保存数, 保存先, スキップ/エラー)
184
- """
185
  saved, skipped = [], []
186
  if not file_paths:
187
  return 0, saved, ["アップロードされたPDFがありません。"]
188
-
189
  try:
190
  PDF_DIR.mkdir(parents=True, exist_ok=True)
191
  except Exception as e:
@@ -203,10 +201,10 @@ def save_uploaded_pdfs(file_paths: Optional[Iterable[str]]) -> Tuple[int, List[s
203
  continue
204
  dst = PDF_DIR / _safe_name(src.name)
205
  try:
206
- # 権限問題を避けるため copyfile(メタデータを引き継がない)
207
  shutil.copyfile(src, dst)
208
  saved.append(str(dst))
209
- except PermissionError as e:
210
  skipped.append(f"{src.name}: Permission denied({dst})。Dockerfileの所有権設定を確認してください。")
211
  except Exception as e:
212
  skipped.append(f"{src.name}: コピー失敗 ({e})")
@@ -221,7 +219,6 @@ def upload_and_rebuild(file_paths: Optional[List[str]]) -> str:
221
  if skipped:
222
  msg.append("⚠️ スキップ/エラー:")
223
  msg.extend([f"- {s}" for s in skipped[:10]])
224
- # 自動でインデックス再構築
225
  msg.append("\n🔧 インデックス再構築を開始します…")
226
  msg.append(rebuild_index())
227
  return "\n".join(msg)
@@ -232,8 +229,8 @@ def rebuild_index() -> str:
232
  if not list(PDF_DIR.glob("*.pdf")):
233
  return f"{PDF_DIR} にPDFがありません。PDFをアップロードして再実行してください。"
234
  try:
235
- out = subprocess.run([sys.executable, str(BASE_DIR / "ingest.py")], capture_output=True, text=True, check=True)
236
- # キャッシュ破棄
237
  global _INDEX, _METAS
238
  _INDEX = None
239
  _METAS = None
@@ -243,7 +240,7 @@ def rebuild_index() -> str:
243
  except Exception as e:
244
  return "❌ 予期せぬエラー: " + str(e) + "\n" + traceback.format_exc()[-1200:]
245
 
246
- # ===== File-system diagnose (optional) =====
247
  def fs_diagnose() -> str:
248
  lines = []
249
  lines.append(f"User: {getpass.getuser()}")
@@ -255,7 +252,6 @@ def fs_diagnose() -> str:
255
  lines.append(f"{p} exists={p.exists()} owner={st.st_uid}:{st.st_gid} mode={mode}")
256
  except Exception as e:
257
  lines.append(f"{p} stat error: {e}")
258
- # 書き込みテスト
259
  try:
260
  test = PDF_DIR / "_write_test.tmp"
261
  with open(test, "wb") as f:
@@ -286,7 +282,6 @@ def api_rebuild():
286
 
287
  # ===== Gradio UI mounted at "/" =====
288
  LANGS = CFG["languages"]["preferred"]
289
- LABELS = CFG["languages"].get("labels", {l: l for l in LANGS})
290
 
291
  with gr.Blocks(fill_height=True, title=CFG.get("app_name", "RAG Bot")) as demo:
292
  gr.Markdown("# IR・ESG開示RAG(OpenAI API)— 8言語対応")
@@ -309,16 +304,30 @@ with gr.Blocks(fill_height=True, title=CFG.get("app_name", "RAG Bot")) as demo:
309
  ask.click(fn=generate_answer, inputs=[q, lang], outputs=[ans, cites])
310
  rebuild_btn.click(fn=rebuild_index, outputs=[log])
311
 
312
- # --- Upload & Rebuild ---
313
  gr.Markdown("## 📄 PDFアップロード")
314
  with gr.Row():
315
- uploads = gr.Files(label="PDFをドラッグ&ドロップ(複数可)", file_types=[".pdf"], type="filepath")
 
 
 
 
 
 
 
 
 
 
 
 
316
  with gr.Row():
317
- up_btn = gr.Button("アップロード → インデックス再構築", variant="secondary")
318
  diag_btn = gr.Button("📋 ストレージ診断")
319
- up_log = gr.Markdown()
320
  diag_log = gr.Markdown()
321
- up_btn.click(fn=upload_and_rebuild, inputs=[uploads], outputs=[up_log])
 
 
322
  diag_btn.click(fn=fs_diagnose, outputs=[diag_log])
323
 
324
  from gradio.routes import mount_gradio_app
 
15
  "chunk": {"target_chars": 1400, "overlap_chars": 180},
16
  "retrieval": {"top_k": 6, "score_threshold": 0.15, "mmr_lambda": 0.3},
17
  "llm": {
18
+ "model": "gpt-4o-mini", # 必要に応じて利用可能なモデルに変更
19
  "max_output_tokens": 700,
20
  "temperature": 0.2,
21
  "system_prompt": (
 
65
  INDEX_PATH = INDEX_DIR / "faiss.index"
66
  META_PATH = INDEX_DIR / "meta.jsonl"
67
 
 
68
  PDF_DIR.mkdir(parents=True, exist_ok=True)
69
  INDEX_DIR.mkdir(parents=True, exist_ok=True)
70
 
71
+ # ===== lazy imports =====
72
  def _lazy_imports():
73
  global faiss, np, embed_texts, chat, detect_out_of_scope, sanitize, compliance_block, SCOPE_HINT
74
  import faiss, numpy as np
 
79
  def _index_exists(): return INDEX_PATH.exists() and META_PATH.exists()
80
  def _check_api_key(): return bool(os.getenv("OPENAI_API_KEY"))
81
 
82
+ # ===== retrieval helpers =====
83
  _INDEX = None
84
  _METAS = None
85
 
86
  def _ensure_index_loaded():
87
  global _INDEX, _METAS
88
+ if _INDEX is not None and _METAS is not None:
89
+ return
90
+ if not _index_exists():
91
+ raise RuntimeError("index_not_ready")
92
  faiss, *_ = _lazy_imports()
93
  _INDEX = faiss.read_index(str(INDEX_PATH))
94
  _METAS = [json.loads(l) for l in open(META_PATH, encoding="utf-8")]
 
137
 
138
  def generate_answer(q: str, lang: str = "ja"):
139
  q = (q or "").strip()
140
+ if not q:
141
+ return "質問を入力してください。", {}
142
  try:
143
  _, _, _, chat, detect_out_of_scope, sanitize, compliance_block, SCOPE_HINT = _lazy_imports()
144
  if detect_out_of_scope(q):
 
170
  except Exception as e:
171
  return "❌ 実行時エラー: " + str(e) + "\n" + traceback.format_exc()[-1200:], {}
172
 
173
+ # ===== Upload & Rebuild (helpers) =====
174
  SAFE_RE = re.compile(r"[^A-Za-z0-9._-]+")
175
 
176
  def _safe_name(name: str) -> str:
 
181
  return base
182
 
183
  def save_uploaded_pdfs(file_paths: Optional[Iterable[str]]) -> Tuple[int, List[str], List[str]]:
 
 
 
 
184
  saved, skipped = [], []
185
  if not file_paths:
186
  return 0, saved, ["アップロードされたPDFがありません。"]
 
187
  try:
188
  PDF_DIR.mkdir(parents=True, exist_ok=True)
189
  except Exception as e:
 
201
  continue
202
  dst = PDF_DIR / _safe_name(src.name)
203
  try:
204
+ # メタデータを持ち越さない(権限エラー回避
205
  shutil.copyfile(src, dst)
206
  saved.append(str(dst))
207
+ except PermissionError:
208
  skipped.append(f"{src.name}: Permission denied({dst})。Dockerfileの所有権設定を確認してください。")
209
  except Exception as e:
210
  skipped.append(f"{src.name}: コピー失敗 ({e})")
 
219
  if skipped:
220
  msg.append("⚠️ スキップ/エラー:")
221
  msg.extend([f"- {s}" for s in skipped[:10]])
 
222
  msg.append("\n🔧 インデックス再構築を開始します…")
223
  msg.append(rebuild_index())
224
  return "\n".join(msg)
 
229
  if not list(PDF_DIR.glob("*.pdf")):
230
  return f"{PDF_DIR} にPDFがありません。PDFをアップロードして再実行してください。"
231
  try:
232
+ out = subprocess.run([sys.executable, str(BASE_DIR / "ingest.py")],
233
+ capture_output=True, text=True, check=True)
234
  global _INDEX, _METAS
235
  _INDEX = None
236
  _METAS = None
 
240
  except Exception as e:
241
  return "❌ 予期せぬエラー: " + str(e) + "\n" + traceback.format_exc()[-1200:]
242
 
243
+ # ===== File-system diagnose =====
244
  def fs_diagnose() -> str:
245
  lines = []
246
  lines.append(f"User: {getpass.getuser()}")
 
252
  lines.append(f"{p} exists={p.exists()} owner={st.st_uid}:{st.st_gid} mode={mode}")
253
  except Exception as e:
254
  lines.append(f"{p} stat error: {e}")
 
255
  try:
256
  test = PDF_DIR / "_write_test.tmp"
257
  with open(test, "wb") as f:
 
282
 
283
  # ===== Gradio UI mounted at "/" =====
284
  LANGS = CFG["languages"]["preferred"]
 
285
 
286
  with gr.Blocks(fill_height=True, title=CFG.get("app_name", "RAG Bot")) as demo:
287
  gr.Markdown("# IR・ESG開示RAG(OpenAI API)— 8言語対応")
 
304
  ask.click(fn=generate_answer, inputs=[q, lang], outputs=[ans, cites])
305
  rebuild_btn.click(fn=rebuild_index, outputs=[log])
306
 
307
+ # --- Upload & Rebuild(Stateで安定化) ---
308
  gr.Markdown("## 📄 PDFアップロード")
309
  with gr.Row():
310
+ uploaded_files = gr.State([]) # 一時パス保持
311
+ uploads = gr.Files(
312
+ label="PDFをドラッグ&ドロップ(複数可)",
313
+ file_types=[".pdf"],
314
+ type="filepath",
315
+ file_count="multiple",
316
+ )
317
+
318
+ def _capture_files(fs: Optional[List[str]]) -> List[str]:
319
+ return fs or []
320
+
321
+ uploads.change(fn=_capture_files, inputs=[uploads], outputs=[uploaded_files])
322
+
323
  with gr.Row():
324
+ up_btn = gr.Button("アップロード → インデックス再構築", variant="secondary")
325
  diag_btn = gr.Button("📋 ストレージ診断")
326
+ up_log = gr.Markdown()
327
  diag_log = gr.Markdown()
328
+
329
+ # ボタンは State を入力にする
330
+ up_btn.click(fn=upload_and_rebuild, inputs=[uploaded_files], outputs=[up_log])
331
  diag_btn.click(fn=fs_diagnose, outputs=[diag_log])
332
 
333
  from gradio.routes import mount_gradio_app