Upload 7 files
- app.py +52 -13
- config.yaml +3 -24
- ingest.py +29 -31
app.py
CHANGED
@@ -1,5 +1,5 @@
 from __future__ import annotations
-import os, json, yaml, subprocess, sys, pathlib, traceback, shutil, re
+import os, json, yaml, subprocess, sys, pathlib, traceback, shutil, re, getpass, stat
 from typing import List, Dict, Tuple, Iterable, Optional
 
 from fastapi import FastAPI, Body
@@ -56,11 +56,18 @@ except Exception as e:
     CFG = DEFAULT_CFG
     CFG_ERR = "config.yaml 読み込みエラー: " + str(e)
 
-# ===== paths
-…
+# ===== absolute paths =====
+BASE_DIR = pathlib.Path(__file__).resolve().parent
+DATA_DIR = BASE_DIR / "data"
+INDEX_DIR = DATA_DIR / "index"
+PDF_DIR = DATA_DIR / "pdf"
+
+INDEX_PATH = INDEX_DIR / "faiss.index"
+META_PATH = INDEX_DIR / "meta.jsonl"
+
+# ensure dirs
 PDF_DIR.mkdir(parents=True, exist_ok=True)
+INDEX_DIR.mkdir(parents=True, exist_ok=True)
 
 def _lazy_imports():
     global faiss, np, embed_texts, chat, detect_out_of_scope, sanitize, compliance_block, SCOPE_HINT
@@ -172,14 +179,18 @@ def _safe_name(name: str) -> str:
 
 def save_uploaded_pdfs(file_paths: Optional[Iterable[str]]) -> Tuple[int, List[str], List[str]]:
     """
-    …
-    return: (保存数, 保存先…
+    gr.Files(type='filepath') からの一時ファイル群を data/pdf/ に保存
+    return: (保存数, 保存先, スキップ/エラー)
     """
     saved, skipped = [], []
     if not file_paths:
         return 0, saved, ["アップロードされたPDFがありません。"]
 
-…
+    try:
+        PDF_DIR.mkdir(parents=True, exist_ok=True)
+    except Exception as e:
+        return 0, [], [f"data/pdf の作成に失敗: {e}"]
+
     for fp in file_paths:
         if not fp:
             continue
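The hunk above swaps working-directory-relative paths (the deleted lines) for paths anchored at the module's own location. A minimal sketch of why that matters; the prints are illustrative and not from the repo:

```python
# Sketch: a Space runtime or ASGI server may start the process from any
# working directory. CWD-relative paths then point somewhere else;
# __file__-relative paths do not.
import pathlib

BASE_DIR = pathlib.Path(__file__).resolve().parent
DATA_DIR = BASE_DIR / "data"          # always <repo>/data
CWD_DATA = pathlib.Path("data")       # depends on os.getcwd() at launch

print(DATA_DIR)             # stable absolute path
print(CWD_DATA.resolve())   # moves with the working directory
```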
@@ -192,8 +203,11 @@ def save_uploaded_pdfs(file_paths: Optional[Iterable[str]]) -> Tuple[int, List[s
             continue
         dst = PDF_DIR / _safe_name(src.name)
         try:
-            …
+            # 権限問題を避けるため copyfile(メタデータを引き継がない)
+            shutil.copyfile(src, dst)
             saved.append(str(dst))
+        except PermissionError as e:
+            skipped.append(f"{src.name}: Permission denied({dst})。Dockerfileの所有権設定を確認してください。")
         except Exception as e:
             skipped.append(f"{src.name}: コピー失敗 ({e})")
     return len(saved), saved, skipped
@@ -202,7 +216,7 @@ def upload_and_rebuild(file_paths: Optional[List[str]]) -> str:
     n, saved, skipped = save_uploaded_pdfs(file_paths)
     msg = []
     if n > 0:
-        msg.append(f"📥 {n} 件のPDFを…
+        msg.append(f"📥 {n} 件のPDFを {PDF_DIR} に保存しました。")
         msg.extend([f"- {p}" for p in saved[:10]])
     if skipped:
         msg.append("⚠️ スキップ/エラー:")
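The `shutil.copyfile` choice above is deliberate: `shutil.copy2` also copies permission bits and timestamps (via `copystat`), which can raise `PermissionError` on volumes where the app user may write file contents but not change metadata. A hedged sketch of the trade-off; the helper name `copy_content_only` is ours, not from the repo:

```python
import shutil

def copy_content_only(src: str, dst: str) -> None:
    """Hypothetical fallback wrapper: prefer copy2 (content + metadata),
    drop to copyfile (content only) when metadata operations are denied."""
    try:
        shutil.copy2(src, dst)      # also runs copystat(): mode, mtime, ...
    except PermissionError:
        shutil.copyfile(src, dst)   # contents only; dst gets default perms
```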
@@ -216,9 +230,9 @@ def rebuild_index() -> str:
     if not _check_api_key():
         return "OPENAI_API_KEY が未設定です。コンソール / Secrets に登録してください。"
     if not list(PDF_DIR.glob("*.pdf")):
-        return "…
+        return f"{PDF_DIR} にPDFがありません。PDFをアップロードして再実行してください。"
     try:
-        out = subprocess.run([sys.executable, "ingest.py"], capture_output=True, text=True, check=True)
+        out = subprocess.run([sys.executable, str(BASE_DIR / "ingest.py")], capture_output=True, text=True, check=True)
         # キャッシュ破棄
         global _INDEX, _METAS
         _INDEX = None
@@ -229,6 +243,29 @@ def rebuild_index() -> str:
     except Exception as e:
         return "❌ 予期せぬエラー: " + str(e) + "\n" + traceback.format_exc()[-1200:]
 
+# ===== File-system diagnose (optional) =====
+def fs_diagnose() -> str:
+    lines = []
+    lines.append(f"User: {getpass.getuser()}")
+    lines.append(f"CWD : {os.getcwd()}")
+    for p in [BASE_DIR, DATA_DIR, PDF_DIR, INDEX_DIR]:
+        try:
+            st = p.stat()
+            mode = stat.filemode(st.st_mode)
+            lines.append(f"{p} exists={p.exists()} owner={st.st_uid}:{st.st_gid} mode={mode}")
+        except Exception as e:
+            lines.append(f"{p} stat error: {e}")
+    # 書き込みテスト
+    try:
+        test = PDF_DIR / "_write_test.tmp"
+        with open(test, "wb") as f:
+            f.write(b"ok")
+        test.unlink()
+        lines.append("WRITE TEST: OK (data/pdf に書き込み可能)")
+    except Exception as e:
+        lines.append(f"WRITE TEST: NG -> {e}")
+    return "```\n" + "\n".join(lines) + "\n```"
+
 # ===== FastAPI =====
 app = FastAPI(title=CFG.get("app_name", "RAG Bot"))
 app.add_middleware(
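`rebuild_index` above runs the ingester with `check=True`, so a non-zero exit turns into `subprocess.CalledProcessError` carrying the captured streams. A small sketch of that standard-library pattern in isolation:

```python
import subprocess
import sys

try:
    out = subprocess.run(
        [sys.executable, "ingest.py"],      # same interpreter as the app
        capture_output=True, text=True, check=True,
    )
    print(out.stdout)
except subprocess.CalledProcessError as e:
    # check=True raised: exit code and captured output live on the exception
    print("ingest failed:", e.returncode, e.stderr)
```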
@@ -278,9 +315,11 @@ with gr.Blocks(fill_height=True, title=CFG.get("app_name", "RAG Bot")) as demo:
     uploads = gr.Files(label="PDFをドラッグ&ドロップ(複数可)", file_types=[".pdf"], type="filepath")
     with gr.Row():
         up_btn = gr.Button("アップロード → インデックス再構築", variant="secondary")
+        diag_btn = gr.Button("📋 ストレージ診断")
     up_log = gr.Markdown()
+    diag_log = gr.Markdown()
     up_btn.click(fn=upload_and_rebuild, inputs=[uploads], outputs=[up_log])
+    diag_btn.click(fn=fs_diagnose, outputs=[diag_log])
 
 from gradio.routes import mount_gradio_app
 mount_gradio_app(app, demo, path="/")
-…
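The UI is mounted into the FastAPI app rather than launched standalone, so serving goes through an ASGI server. A minimal sketch of the same pattern, assuming a Gradio version that exposes `gradio.routes.mount_gradio_app` as the diff's import does:

```python
import gradio as gr
from fastapi import FastAPI
from gradio.routes import mount_gradio_app  # same import as the diff

app = FastAPI()

with gr.Blocks() as demo:
    gr.Markdown("placeholder UI")

# Instead of demo.launch(), mount Gradio into FastAPI and serve via e.g.:
#   uvicorn app:app --host 0.0.0.0 --port 7860
app = mount_gradio_app(app, demo, path="/")
```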
config.yaml
CHANGED
@@ -1,16 +1,8 @@
 app_name: "IR/ESG RAG Bot (OpenAI, 8 languages)"
 embedding_model: "text-embedding-3-large"
 normalize_embeddings: true
-…
-  target_chars: 1400
-  overlap_chars: 180
-…
-retrieval:
-  top_k: 6
-  score_threshold: 0.15
-  mmr_lambda: 0.3
-…
+chunk: { target_chars: 1400, overlap_chars: 180 }
+retrieval: { top_k: 6, score_threshold: 0.15, mmr_lambda: 0.3 }
 llm:
   model: "gpt-4o-mini"
   max_output_tokens: 700
@@ -18,19 +10,6 @@ llm:
   system_prompt: |-
     あなたは上場企業のIR・ESG開示に特化したRAGアシスタントです。回答は常に根拠(文書名・ページ)を箇条書きで示し、
     文書外の推測や断定は避けます。数値は年度と単位を明記し、最新年度を優先してください。
-
 languages:
   preferred: [ja, en, zh, ko, fr, de, es, it]
-  labels:
-    ja: "日本語"
-    en: "English"
-    zh: "中文"
-    ko: "한국어"
-    fr: "Français"
-    de: "Deutsch"
-    es: "Español"
-    it: "Italiano"
-
-logging:
-  save_qa: true
-  path: "logs/qa_log.jsonl"
+  labels: { ja: 日本語, en: English, zh: 中文, ko: 한국어, fr: Français, de: Deutsch, es: Español, it: Italiano }
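The config rewrite collapses block mappings into YAML flow style; both forms parse to the same dict, so `CFG["chunk"]["target_chars"]` in ingest.py is unaffected. A quick self-contained check:

```python
import yaml

block = """
chunk:
  target_chars: 1400
  overlap_chars: 180
"""
flow = "chunk: { target_chars: 1400, overlap_chars: 180 }"

# Flow and block style are equivalent YAML
assert yaml.safe_load(block) == yaml.safe_load(flow)
print(yaml.safe_load(flow)["chunk"]["target_chars"])  # 1400
```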
ingest.py
CHANGED
@@ -10,18 +10,22 @@ import yaml
 from openai_client import embed_texts
 from guardrails import sanitize
 
-…
-DATA_DIR = pathlib.Path("data")
-PDF_DIR = DATA_DIR / "pdf"
+# absolute paths
+BASE_DIR = pathlib.Path(__file__).resolve().parent
+DATA_DIR = BASE_DIR / "data"
+PDF_DIR = DATA_DIR / "pdf"
 INDEX_DIR = DATA_DIR / "index"
-META_PATH = INDEX_DIR / "meta.jsonl"
+META_PATH = INDEX_DIR / "meta.jsonl"
 INDEX_PATH = INDEX_DIR / "faiss.index"
 
-…
+CFG = yaml.safe_load(open(BASE_DIR / "config.yaml", encoding="utf-8"))
+EMB_MODEL = CFG["embedding_model"]
+NORMALIZE = CFG.get("normalize_embeddings", True)
+target_chars = CFG["chunk"]["target_chars"]
+overlap_chars = CFG["chunk"]["overlap_chars"]
+
+def read_pdf_with_pages(path: str):
+    pages = []
     reader = PdfReader(path)
     for i, p in enumerate(reader.pages):
         txt = p.extract_text() or ""
@@ -29,8 +33,8 @@ def read_pdf_with_pages(path: str) -> List[Tuple[int, str]]:
         pages.append((i + 1, txt))
     return pages
 
-def split_chunks(pages: List[Tuple[int, str]], target_chars: int, overlap_chars: int)…
-    chunks…
+def split_chunks(pages: List[Tuple[int, str]], target_chars: int, overlap_chars: int):
+    chunks = []
     for page, text in pages:
         if not text:
             continue
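The body of `split_chunks` beyond its first lines is not shown in the diff. A plausible sliding-window implementation consistent with its signature and the chunk dicts consumed later (`c["text"]`, `c["page"]`) might look like this; it is a sketch, not the repo's code:

```python
from typing import Dict, List, Tuple

def split_chunks_sketch(pages: List[Tuple[int, str]],
                        target_chars: int, overlap_chars: int) -> List[Dict]:
    """Hypothetical chunker: cut each page's text into ~target_chars windows
    that overlap by overlap_chars, keeping the page number for citations."""
    chunks: List[Dict] = []
    step = max(1, target_chars - overlap_chars)
    for page, text in pages:
        if not text:
            continue
        for start in range(0, len(text), step):
            piece = text[start:start + target_chars]
            if piece.strip():
                chunks.append({"page": page, "text": piece})
    return chunks
```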
@@ -51,33 +55,27 @@ def l2_normalize(m: np.ndarray) -> np.ndarray:
 
 def build_index():
     INDEX_DIR.mkdir(parents=True, exist_ok=True)
-…
-        t = c["text"][:1800]
-        texts.append(t)
-        meta = {"source": pdf.name, "page": c["page"], "text": sanitize(t)}
-        meta_f.write(json.dumps(meta, ensure_ascii=False) + "\n")
-…
-    meta_f.close()
+    with open(META_PATH, "w", encoding="utf-8") as meta_f:
+        texts: List[str] = []
+        for pdf in sorted(PDF_DIR.glob("*.pdf")):
+            print(f"Processing {pdf.name}...")
+            pages = read_pdf_with_pages(str(pdf))
+            chunks = split_chunks(pages, target_chars, overlap_chars)
+            for c in chunks:
+                t = c["text"][:1800]
+                texts.append(t)
+                meta = {"source": pdf.name, "page": c["page"], "text": sanitize(t)}
+                meta_f.write(json.dumps(meta, ensure_ascii=False) + "\n")
 
-    if not …
+    if not META_PATH.exists() or META_PATH.stat().st_size == 0:
         raise SystemExit("Put PDFs under data/pdf/")
 
+    # embed
     vecs = embed_texts(texts, EMB_MODEL)
     mat = np.array(vecs, dtype="float32")
     mat = l2_normalize(mat)
 
-    #…
-    index = faiss.IndexFlatIP(mat.shape[1])
+    index = faiss.IndexFlatIP(mat.shape[1])  # cosine via normalized dot
     index.add(mat)
     faiss.write_index(index, str(INDEX_PATH))
     print(f"Index {len(texts)} chunks → {INDEX_PATH}")
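`IndexFlatIP` computes plain inner products; because `l2_normalize` first scales every row to unit length, those inner products equal cosine similarities, which is what the "cosine via normalized dot" comment means. A numpy-only check of the identity, mirroring the helper in the diff (faiss itself is not needed for the math):

```python
import numpy as np

def l2_normalize(m: np.ndarray) -> np.ndarray:
    # Same idea as the diff's helper: scale each row to unit L2 norm
    return m / np.linalg.norm(m, axis=1, keepdims=True)

a = np.random.rand(4, 8).astype("float32")   # pretend chunk embeddings
b = np.random.rand(8).astype("float32")      # pretend query embedding

ip = l2_normalize(a) @ (b / np.linalg.norm(b))   # what IndexFlatIP returns
cos = (a @ b) / (np.linalg.norm(a, axis=1) * np.linalg.norm(b))
assert np.allclose(ip, cos, atol=1e-6)
```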