Spaces:
Sleeping
Sleeping
Upload 7 files
Browse files
app.py
CHANGED
|
@@ -15,7 +15,7 @@ DEFAULT_CFG = {
|
|
| 15 |
"chunk": {"target_chars": 1400, "overlap_chars": 180},
|
| 16 |
"retrieval": {"top_k": 6, "score_threshold": 0.15, "mmr_lambda": 0.3},
|
| 17 |
"llm": {
|
| 18 |
-
"model": "gpt-4o-mini",
|
| 19 |
"max_output_tokens": 700,
|
| 20 |
"temperature": 0.2,
|
| 21 |
"system_prompt": (
|
|
@@ -65,10 +65,10 @@ PDF_DIR = DATA_DIR / "pdf"
|
|
| 65 |
INDEX_PATH = INDEX_DIR / "faiss.index"
|
| 66 |
META_PATH = INDEX_DIR / "meta.jsonl"
|
| 67 |
|
| 68 |
-
# ensure dirs
|
| 69 |
PDF_DIR.mkdir(parents=True, exist_ok=True)
|
| 70 |
INDEX_DIR.mkdir(parents=True, exist_ok=True)
|
| 71 |
|
|
|
|
| 72 |
def _lazy_imports():
|
| 73 |
global faiss, np, embed_texts, chat, detect_out_of_scope, sanitize, compliance_block, SCOPE_HINT
|
| 74 |
import faiss, numpy as np
|
|
@@ -79,14 +79,16 @@ def _lazy_imports():
|
|
| 79 |
def _index_exists(): return INDEX_PATH.exists() and META_PATH.exists()
|
| 80 |
def _check_api_key(): return bool(os.getenv("OPENAI_API_KEY"))
|
| 81 |
|
| 82 |
-
# ===== retrieval =====
|
| 83 |
_INDEX = None
|
| 84 |
_METAS = None
|
| 85 |
|
| 86 |
def _ensure_index_loaded():
|
| 87 |
global _INDEX, _METAS
|
| 88 |
-
if _INDEX is not None and _METAS is not None:
|
| 89 |
-
|
|
|
|
|
|
|
| 90 |
faiss, *_ = _lazy_imports()
|
| 91 |
_INDEX = faiss.read_index(str(INDEX_PATH))
|
| 92 |
_METAS = [json.loads(l) for l in open(META_PATH, encoding="utf-8")]
|
|
@@ -135,7 +137,8 @@ _LANG_INSTRUCTIONS = {
|
|
| 135 |
|
| 136 |
def generate_answer(q: str, lang: str = "ja"):
|
| 137 |
q = (q or "").strip()
|
| 138 |
-
if not q:
|
|
|
|
| 139 |
try:
|
| 140 |
_, _, _, chat, detect_out_of_scope, sanitize, compliance_block, SCOPE_HINT = _lazy_imports()
|
| 141 |
if detect_out_of_scope(q):
|
|
@@ -167,7 +170,7 @@ def generate_answer(q: str, lang: str = "ja"):
|
|
| 167 |
except Exception as e:
|
| 168 |
return "❌ 実行時エラー: " + str(e) + "\n" + traceback.format_exc()[-1200:], {}
|
| 169 |
|
| 170 |
-
# ===== Upload & Rebuild =====
|
| 171 |
SAFE_RE = re.compile(r"[^A-Za-z0-9._-]+")
|
| 172 |
|
| 173 |
def _safe_name(name: str) -> str:
|
|
@@ -178,14 +181,9 @@ def _safe_name(name: str) -> str:
|
|
| 178 |
return base
|
| 179 |
|
| 180 |
def save_uploaded_pdfs(file_paths: Optional[Iterable[str]]) -> Tuple[int, List[str], List[str]]:
|
| 181 |
-
"""
|
| 182 |
-
gr.Files(type='filepath') からの一時ファイル群を data/pdf/ に保存
|
| 183 |
-
return: (保存数, 保存先, スキップ/エラー)
|
| 184 |
-
"""
|
| 185 |
saved, skipped = [], []
|
| 186 |
if not file_paths:
|
| 187 |
return 0, saved, ["アップロードされたPDFがありません。"]
|
| 188 |
-
|
| 189 |
try:
|
| 190 |
PDF_DIR.mkdir(parents=True, exist_ok=True)
|
| 191 |
except Exception as e:
|
|
@@ -203,10 +201,10 @@ def save_uploaded_pdfs(file_paths: Optional[Iterable[str]]) -> Tuple[int, List[s
|
|
| 203 |
continue
|
| 204 |
dst = PDF_DIR / _safe_name(src.name)
|
| 205 |
try:
|
| 206 |
-
#
|
| 207 |
shutil.copyfile(src, dst)
|
| 208 |
saved.append(str(dst))
|
| 209 |
-
except PermissionError
|
| 210 |
skipped.append(f"{src.name}: Permission denied({dst})。Dockerfileの所有権設定を確認してください。")
|
| 211 |
except Exception as e:
|
| 212 |
skipped.append(f"{src.name}: コピー失敗 ({e})")
|
|
@@ -221,7 +219,6 @@ def upload_and_rebuild(file_paths: Optional[List[str]]) -> str:
|
|
| 221 |
if skipped:
|
| 222 |
msg.append("⚠️ スキップ/エラー:")
|
| 223 |
msg.extend([f"- {s}" for s in skipped[:10]])
|
| 224 |
-
# 自動でインデックス再構築
|
| 225 |
msg.append("\n🔧 インデックス再構築を開始します…")
|
| 226 |
msg.append(rebuild_index())
|
| 227 |
return "\n".join(msg)
|
|
@@ -232,8 +229,8 @@ def rebuild_index() -> str:
|
|
| 232 |
if not list(PDF_DIR.glob("*.pdf")):
|
| 233 |
return f"{PDF_DIR} にPDFがありません。PDFをアップロードして再実行してください。"
|
| 234 |
try:
|
| 235 |
-
out = subprocess.run([sys.executable, str(BASE_DIR / "ingest.py")],
|
| 236 |
-
|
| 237 |
global _INDEX, _METAS
|
| 238 |
_INDEX = None
|
| 239 |
_METAS = None
|
|
@@ -243,7 +240,7 @@ def rebuild_index() -> str:
|
|
| 243 |
except Exception as e:
|
| 244 |
return "❌ 予期せぬエラー: " + str(e) + "\n" + traceback.format_exc()[-1200:]
|
| 245 |
|
| 246 |
-
# ===== File-system diagnose
|
| 247 |
def fs_diagnose() -> str:
|
| 248 |
lines = []
|
| 249 |
lines.append(f"User: {getpass.getuser()}")
|
|
@@ -255,7 +252,6 @@ def fs_diagnose() -> str:
|
|
| 255 |
lines.append(f"{p} exists={p.exists()} owner={st.st_uid}:{st.st_gid} mode={mode}")
|
| 256 |
except Exception as e:
|
| 257 |
lines.append(f"{p} stat error: {e}")
|
| 258 |
-
# 書き込みテスト
|
| 259 |
try:
|
| 260 |
test = PDF_DIR / "_write_test.tmp"
|
| 261 |
with open(test, "wb") as f:
|
|
@@ -286,7 +282,6 @@ def api_rebuild():
|
|
| 286 |
|
| 287 |
# ===== Gradio UI mounted at "/" =====
|
| 288 |
LANGS = CFG["languages"]["preferred"]
|
| 289 |
-
LABELS = CFG["languages"].get("labels", {l: l for l in LANGS})
|
| 290 |
|
| 291 |
with gr.Blocks(fill_height=True, title=CFG.get("app_name", "RAG Bot")) as demo:
|
| 292 |
gr.Markdown("# IR・ESG開示RAG(OpenAI API)— 8言語対応")
|
|
@@ -309,16 +304,30 @@ with gr.Blocks(fill_height=True, title=CFG.get("app_name", "RAG Bot")) as demo:
|
|
| 309 |
ask.click(fn=generate_answer, inputs=[q, lang], outputs=[ans, cites])
|
| 310 |
rebuild_btn.click(fn=rebuild_index, outputs=[log])
|
| 311 |
|
| 312 |
-
# --- Upload & Rebuild ---
|
| 313 |
gr.Markdown("## 📄 PDFアップロード")
|
| 314 |
with gr.Row():
|
| 315 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 316 |
with gr.Row():
|
| 317 |
-
up_btn
|
| 318 |
diag_btn = gr.Button("📋 ストレージ診断")
|
| 319 |
-
up_log
|
| 320 |
diag_log = gr.Markdown()
|
| 321 |
-
|
|
|
|
|
|
|
| 322 |
diag_btn.click(fn=fs_diagnose, outputs=[diag_log])
|
| 323 |
|
| 324 |
from gradio.routes import mount_gradio_app
|
|
|
|
| 15 |
"chunk": {"target_chars": 1400, "overlap_chars": 180},
|
| 16 |
"retrieval": {"top_k": 6, "score_threshold": 0.15, "mmr_lambda": 0.3},
|
| 17 |
"llm": {
|
| 18 |
+
"model": "gpt-4o-mini", # 必要に応じて利用可能なモデルに変更
|
| 19 |
"max_output_tokens": 700,
|
| 20 |
"temperature": 0.2,
|
| 21 |
"system_prompt": (
|
|
|
|
| 65 |
INDEX_PATH = INDEX_DIR / "faiss.index"
|
| 66 |
META_PATH = INDEX_DIR / "meta.jsonl"
|
| 67 |
|
|
|
|
| 68 |
PDF_DIR.mkdir(parents=True, exist_ok=True)
|
| 69 |
INDEX_DIR.mkdir(parents=True, exist_ok=True)
|
| 70 |
|
| 71 |
+
# ===== lazy imports =====
|
| 72 |
def _lazy_imports():
|
| 73 |
global faiss, np, embed_texts, chat, detect_out_of_scope, sanitize, compliance_block, SCOPE_HINT
|
| 74 |
import faiss, numpy as np
|
|
|
|
| 79 |
def _index_exists(): return INDEX_PATH.exists() and META_PATH.exists()
|
| 80 |
def _check_api_key(): return bool(os.getenv("OPENAI_API_KEY"))
|
| 81 |
|
| 82 |
+
# ===== retrieval helpers =====
|
| 83 |
_INDEX = None
|
| 84 |
_METAS = None
|
| 85 |
|
| 86 |
def _ensure_index_loaded():
|
| 87 |
global _INDEX, _METAS
|
| 88 |
+
if _INDEX is not None and _METAS is not None:
|
| 89 |
+
return
|
| 90 |
+
if not _index_exists():
|
| 91 |
+
raise RuntimeError("index_not_ready")
|
| 92 |
faiss, *_ = _lazy_imports()
|
| 93 |
_INDEX = faiss.read_index(str(INDEX_PATH))
|
| 94 |
_METAS = [json.loads(l) for l in open(META_PATH, encoding="utf-8")]
|
|
|
|
| 137 |
|
| 138 |
def generate_answer(q: str, lang: str = "ja"):
|
| 139 |
q = (q or "").strip()
|
| 140 |
+
if not q:
|
| 141 |
+
return "質問を入力してください。", {}
|
| 142 |
try:
|
| 143 |
_, _, _, chat, detect_out_of_scope, sanitize, compliance_block, SCOPE_HINT = _lazy_imports()
|
| 144 |
if detect_out_of_scope(q):
|
|
|
|
| 170 |
except Exception as e:
|
| 171 |
return "❌ 実行時エラー: " + str(e) + "\n" + traceback.format_exc()[-1200:], {}
|
| 172 |
|
| 173 |
+
# ===== Upload & Rebuild (helpers) =====
|
| 174 |
SAFE_RE = re.compile(r"[^A-Za-z0-9._-]+")
|
| 175 |
|
| 176 |
def _safe_name(name: str) -> str:
|
|
|
|
| 181 |
return base
|
| 182 |
|
| 183 |
def save_uploaded_pdfs(file_paths: Optional[Iterable[str]]) -> Tuple[int, List[str], List[str]]:
|
|
|
|
|
|
|
|
|
|
|
|
|
| 184 |
saved, skipped = [], []
|
| 185 |
if not file_paths:
|
| 186 |
return 0, saved, ["アップロードされたPDFがありません。"]
|
|
|
|
| 187 |
try:
|
| 188 |
PDF_DIR.mkdir(parents=True, exist_ok=True)
|
| 189 |
except Exception as e:
|
|
|
|
| 201 |
continue
|
| 202 |
dst = PDF_DIR / _safe_name(src.name)
|
| 203 |
try:
|
| 204 |
+
# メタデータを持ち越さない(権限エラー回避)
|
| 205 |
shutil.copyfile(src, dst)
|
| 206 |
saved.append(str(dst))
|
| 207 |
+
except PermissionError:
|
| 208 |
skipped.append(f"{src.name}: Permission denied({dst})。Dockerfileの所有権設定を確認してください。")
|
| 209 |
except Exception as e:
|
| 210 |
skipped.append(f"{src.name}: コピー失敗 ({e})")
|
|
|
|
| 219 |
if skipped:
|
| 220 |
msg.append("⚠️ スキップ/エラー:")
|
| 221 |
msg.extend([f"- {s}" for s in skipped[:10]])
|
|
|
|
| 222 |
msg.append("\n🔧 インデックス再構築を開始します…")
|
| 223 |
msg.append(rebuild_index())
|
| 224 |
return "\n".join(msg)
|
|
|
|
| 229 |
if not list(PDF_DIR.glob("*.pdf")):
|
| 230 |
return f"{PDF_DIR} にPDFがありません。PDFをアップロードして再実行してください。"
|
| 231 |
try:
|
| 232 |
+
out = subprocess.run([sys.executable, str(BASE_DIR / "ingest.py")],
|
| 233 |
+
capture_output=True, text=True, check=True)
|
| 234 |
global _INDEX, _METAS
|
| 235 |
_INDEX = None
|
| 236 |
_METAS = None
|
|
|
|
| 240 |
except Exception as e:
|
| 241 |
return "❌ 予期せぬエラー: " + str(e) + "\n" + traceback.format_exc()[-1200:]
|
| 242 |
|
| 243 |
+
# ===== File-system diagnose =====
|
| 244 |
def fs_diagnose() -> str:
|
| 245 |
lines = []
|
| 246 |
lines.append(f"User: {getpass.getuser()}")
|
|
|
|
| 252 |
lines.append(f"{p} exists={p.exists()} owner={st.st_uid}:{st.st_gid} mode={mode}")
|
| 253 |
except Exception as e:
|
| 254 |
lines.append(f"{p} stat error: {e}")
|
|
|
|
| 255 |
try:
|
| 256 |
test = PDF_DIR / "_write_test.tmp"
|
| 257 |
with open(test, "wb") as f:
|
|
|
|
| 282 |
|
| 283 |
# ===== Gradio UI mounted at "/" =====
|
| 284 |
LANGS = CFG["languages"]["preferred"]
|
|
|
|
| 285 |
|
| 286 |
with gr.Blocks(fill_height=True, title=CFG.get("app_name", "RAG Bot")) as demo:
|
| 287 |
gr.Markdown("# IR・ESG開示RAG(OpenAI API)— 8言語対応")
|
|
|
|
| 304 |
ask.click(fn=generate_answer, inputs=[q, lang], outputs=[ans, cites])
|
| 305 |
rebuild_btn.click(fn=rebuild_index, outputs=[log])
|
| 306 |
|
| 307 |
+
# --- Upload & Rebuild(Stateで安定化) ---
|
| 308 |
gr.Markdown("## 📄 PDFアップロード")
|
| 309 |
with gr.Row():
|
| 310 |
+
uploaded_files = gr.State([]) # 一時パス保持
|
| 311 |
+
uploads = gr.Files(
|
| 312 |
+
label="PDFをドラッグ&ドロップ(複数可)",
|
| 313 |
+
file_types=[".pdf"],
|
| 314 |
+
type="filepath",
|
| 315 |
+
file_count="multiple",
|
| 316 |
+
)
|
| 317 |
+
|
| 318 |
+
def _capture_files(fs: Optional[List[str]]) -> List[str]:
|
| 319 |
+
return fs or []
|
| 320 |
+
|
| 321 |
+
uploads.change(fn=_capture_files, inputs=[uploads], outputs=[uploaded_files])
|
| 322 |
+
|
| 323 |
with gr.Row():
|
| 324 |
+
up_btn = gr.Button("アップロード → インデックス再構築", variant="secondary")
|
| 325 |
diag_btn = gr.Button("📋 ストレージ診断")
|
| 326 |
+
up_log = gr.Markdown()
|
| 327 |
diag_log = gr.Markdown()
|
| 328 |
+
|
| 329 |
+
# ボタンは State を入力にする
|
| 330 |
+
up_btn.click(fn=upload_and_rebuild, inputs=[uploaded_files], outputs=[up_log])
|
| 331 |
diag_btn.click(fn=fs_diagnose, outputs=[diag_log])
|
| 332 |
|
| 333 |
from gradio.routes import mount_gradio_app
|