Corin1998 committed on
Commit
f31c318
·
verified ·
1 Parent(s): d17f367

Upload 8 files

Browse files
Files changed (6) hide show
  1. app.py +39 -37
  2. config.yaml +30 -25
  3. guardrails.py +3 -3
  4. ingest.py +2 -5
  5. openai_client.py +2 -2
  6. requirements.txt +1 -1
app.py CHANGED
@@ -1,17 +1,18 @@
1
- # app.py — Gradio SDK / Lazy index loading / Rebuild index button
2
  from __future__ import annotations
3
- import os, json, yaml, subprocess, sys, time, pathlib
4
  from typing import List, Dict, Tuple
5
 
6
  import gradio as gr
7
 
 
8
  CFG = yaml.safe_load(open("config.yaml", encoding="utf-8"))
9
 
10
  INDEX_PATH = pathlib.Path("data/index/faiss.index")
11
  META_PATH = pathlib.Path("data/index/meta.jsonl")
12
 
13
- # ---- Lazy imports (index系は必要時にのみ読み込む) ----
14
- def lazy_imports():
15
  global faiss, np, embed_texts, chat, detect_out_of_scope, sanitize, compliance_block, SCOPE_HINT
16
  import faiss
17
  import numpy as np
@@ -19,13 +20,13 @@ def lazy_imports():
19
  from guardrails import detect_out_of_scope, sanitize, compliance_block, SCOPE_HINT
20
  return faiss, np, embed_texts, chat, detect_out_of_scope, sanitize, compliance_block, SCOPE_HINT
21
 
22
- def index_exists() -> bool:
23
  return INDEX_PATH.exists() and META_PATH.exists()
24
 
25
- def check_api_key() -> bool:
26
  return bool(os.getenv("OPENAI_API_KEY"))
27
 
28
- # ---- Search helpers (load on demand) ----
29
  _INDEX = None
30
  _METAS = None
31
 
@@ -33,20 +34,20 @@ def _ensure_index_loaded():
33
  global _INDEX, _METAS
34
  if _INDEX is not None and _METAS is not None:
35
  return
36
- if not index_exists():
37
  raise RuntimeError("index_not_ready")
38
- faiss, np, *_ = lazy_imports()
39
  _INDEX = faiss.read_index(str(INDEX_PATH))
40
  _METAS = [json.loads(l) for l in open(META_PATH, encoding="utf-8")]
41
 
42
  def _embed_query(q: str):
43
- _, np, embed_texts, *_ = lazy_imports()
44
  v = np.array(embed_texts([q], CFG["embedding_model"])[0], dtype="float32")
45
  v = v / (np.linalg.norm(v) + 1e-12)
46
  return v[None, :]
47
 
48
  def _search(q: str):
49
- faiss, np, *_ = lazy_imports()
50
  _ensure_index_loaded()
51
  TOP_K = CFG["retrieval"]["top_k"]
52
  SCORE_TH = CFG["retrieval"]["score_threshold"]
@@ -72,41 +73,44 @@ def _format_context(chunks: List[Dict]) -> str:
72
 
73
  # ---- Handlers ----
74
  def rebuild_index() -> str:
75
- if not check_api_key():
76
  return "OPENAI_API_KEY が未設定です。Spaces → Settings → Secrets で登録してください。"
77
  pdf_dir = pathlib.Path("data/pdf")
78
- pdfs = list(pdf_dir.glob("*.pdf"))
79
- if not pdfs:
80
  return "data/pdf/ にPDFがありません。PDFを置いて再実行してください。"
81
  try:
82
- # ingest.py を実行して index を作る
83
  out = subprocess.run([sys.executable, "ingest.py"], capture_output=True, text=True, check=True)
84
- return f"✅ インデクス生成完了:\n{out.stdout[-800:]}"
 
 
 
 
85
  except subprocess.CalledProcessError as e:
86
- return f"❌ インデックス生成に失敗:\n{e.stdout}\n{e.stderr}"
 
 
 
 
 
 
 
 
 
 
 
87
 
88
  def generate_answer(q: str, lang: str):
89
  q = (q or "").strip()
90
  if not q:
91
  return "質問を入力してください。", {}
92
  try:
93
- _, _, _, chat, detect_out_of_scope, sanitize, compliance_block, SCOPE_HINT = lazy_imports()
94
  if detect_out_of_scope(q):
95
  return f"{SCOPE_HINT}\nIR/ESG関連の事項についてお尋ねください。", {}
96
  chunks = _search(q)
97
  context = _format_context(chunks)
98
-
99
- lang_note = {
100
- "ja": "回答は日本語で出力してください。",
101
- "en": "Answer in English.",
102
- "zh": "请用中文回答。",
103
- "ko": "한국어로 답변하세요.",
104
- "fr": "Répondez en français.",
105
- "de": "Bitte auf Deutsch antworten.",
106
- "es": "Responde en español.",
107
- "it": "Rispondi in italiano.",
108
- }.get(lang, "Answer in the user's language.")
109
-
110
  user_prompt = (
111
  "以下のコンテキストのみを根拠に、簡潔かつ正確に回答してください。\n"
112
  "必ず箇条書きで根拠(文書名とページ)を列挙してください。\n"
@@ -120,20 +124,17 @@ def generate_answer(q: str, lang: str):
120
  text = sanitize(text) + "\n\n" + compliance_block()
121
  citations = [{"source": c["source"], "page": c["page"], "score": round(c["score"], 3)} for c in chunks]
122
  return text, {"citations": citations}
123
-
124
  except RuntimeError as e:
125
  if str(e) == "index_not_ready":
126
- msg = (
127
  "⚠️ インデックスがまだありません。\n"
128
  "1) data/pdf/ にPDFを置く\n"
129
- "2) 下の『インデックス再構築』ボタンを押す(OpenAI APIキー必須)\n"
130
- )
131
- return msg, {}
132
  raise
133
 
134
  # ---- UI ----
135
  LANGS = CFG["languages"]["preferred"]
136
- LABELS = CFG["languages"].get("labels", {l: l for l in LANGS})
137
 
138
  with gr.Blocks(fill_height=True, title=CFG.get("app_name", "RAG Bot")) as demo:
139
  gr.Markdown("# IR・ESG開示RAG(OpenAI API)— 8言語対応")
@@ -151,4 +152,5 @@ with gr.Blocks(fill_height=True, title=CFG.get("app_name", "RAG Bot")) as demo:
151
  ask.click(fn=generate_answer, inputs=[q, lang], outputs=[ans, cites])
152
  rebuild.click(fn=rebuild_index, outputs=[log])
153
 
154
- demo # Gradio SDK はこの変数を自動起動します
 
 
1
+ # app.py — Gradio SDK / 8 languages / Lazy index loading / Rebuild index button
2
  from __future__ import annotations
3
+ import os, json, yaml, subprocess, sys, pathlib
4
  from typing import List, Dict, Tuple
5
 
6
  import gradio as gr
7
 
8
+ # ---- Load config ----
9
  CFG = yaml.safe_load(open("config.yaml", encoding="utf-8"))
10
 
11
  INDEX_PATH = pathlib.Path("data/index/faiss.index")
12
  META_PATH = pathlib.Path("data/index/meta.jsonl")
13
 
14
+ # ---- Lazy imports ----
15
+ def _lazy_imports():
16
  global faiss, np, embed_texts, chat, detect_out_of_scope, sanitize, compliance_block, SCOPE_HINT
17
  import faiss
18
  import numpy as np
 
20
  from guardrails import detect_out_of_scope, sanitize, compliance_block, SCOPE_HINT
21
  return faiss, np, embed_texts, chat, detect_out_of_scope, sanitize, compliance_block, SCOPE_HINT
22
 
23
+ def _index_exists() -> bool:
24
  return INDEX_PATH.exists() and META_PATH.exists()
25
 
26
+ def _check_api_key() -> bool:
27
  return bool(os.getenv("OPENAI_API_KEY"))
28
 
29
+ # ---- Globals (Lazy) ----
30
  _INDEX = None
31
  _METAS = None
32
 
 
34
  global _INDEX, _METAS
35
  if _INDEX is not None and _METAS is not None:
36
  return
37
+ if not _index_exists():
38
  raise RuntimeError("index_not_ready")
39
+ faiss, *_ = _lazy_imports()
40
  _INDEX = faiss.read_index(str(INDEX_PATH))
41
  _METAS = [json.loads(l) for l in open(META_PATH, encoding="utf-8")]
42
 
43
  def _embed_query(q: str):
44
+ _, np, embed_texts, *_ = _lazy_imports()
45
  v = np.array(embed_texts([q], CFG["embedding_model"])[0], dtype="float32")
46
  v = v / (np.linalg.norm(v) + 1e-12)
47
  return v[None, :]
48
 
49
  def _search(q: str):
50
+ faiss, np, *_ = _lazy_imports()
51
  _ensure_index_loaded()
52
  TOP_K = CFG["retrieval"]["top_k"]
53
  SCORE_TH = CFG["retrieval"]["score_threshold"]
 
73
 
74
  # ---- Handlers ----
75
  def rebuild_index() -> str:
76
+ if not _check_api_key():
77
  return "OPENAI_API_KEY が未設定です。Spaces → Settings → Secrets で登録してください。"
78
  pdf_dir = pathlib.Path("data/pdf")
79
+ pdf_dir.mkdir(parents=True, exist_ok=True)
80
+ if not list(pdf_dir.glob("*.pdf")):
81
  return "data/pdf/ にPDFがありません。PDFを置いて再実行してください。"
82
  try:
 
83
  out = subprocess.run([sys.executable, "ingest.py"], capture_output=True, text=True, check=True)
84
+ # キャッシュ破棄
85
+ global _INDEX, _METAS
86
+ _INDEX = None
87
+ _METAS = None
88
+ return "✅ インデックス生成完了\n```\n" + (out.stdout[-1200:] or "") + "\n```"
89
  except subprocess.CalledProcessError as e:
90
+ return f"❌ インデックス生成に失敗\nstdout:\n{e.stdout}\n\nstderr:\n{e.stderr}"
91
+
92
+ _LANG_INSTRUCTIONS = {
93
+ "ja": "回答は日本語で出力してください。",
94
+ "en": "Answer in English.",
95
+ "zh": "请用中文回答。",
96
+ "ko": "한국어로 답변하세요.",
97
+ "fr": "Répondez en français.",
98
+ "de": "Bitte auf Deutsch antworten.",
99
+ "es": "Responde en español.",
100
+ "it": "Rispondi in italiano.",
101
+ }
102
 
103
  def generate_answer(q: str, lang: str):
104
  q = (q or "").strip()
105
  if not q:
106
  return "質問を入力してください。", {}
107
  try:
108
+ _, _, _, chat, detect_out_of_scope, sanitize, compliance_block, SCOPE_HINT = _lazy_imports()
109
  if detect_out_of_scope(q):
110
  return f"{SCOPE_HINT}\nIR/ESG関連の事項についてお尋ねください。", {}
111
  chunks = _search(q)
112
  context = _format_context(chunks)
113
+ lang_note = _LANG_INSTRUCTIONS.get(lang, "Answer in the user's language.")
 
 
 
 
 
 
 
 
 
 
 
114
  user_prompt = (
115
  "以下のコンテキストのみを根拠に、簡潔かつ正確に回答してください。\n"
116
  "必ず箇条書きで根拠(文書名とページ)を列挙してください。\n"
 
124
  text = sanitize(text) + "\n\n" + compliance_block()
125
  citations = [{"source": c["source"], "page": c["page"], "score": round(c["score"], 3)} for c in chunks]
126
  return text, {"citations": citations}
 
127
  except RuntimeError as e:
128
  if str(e) == "index_not_ready":
129
+ return (
130
  "⚠️ インデックスがまだありません。\n"
131
  "1) data/pdf/ にPDFを置く\n"
132
+ "2) 『インデックス再構築』ボタンを押す(OpenAI APIキー必須)\n"
133
+ ), {}
 
134
  raise
135
 
136
  # ---- UI ----
137
  LANGS = CFG["languages"]["preferred"]
 
138
 
139
  with gr.Blocks(fill_height=True, title=CFG.get("app_name", "RAG Bot")) as demo:
140
  gr.Markdown("# IR・ESG開示RAG(OpenAI API)— 8言語対応")
 
152
  ask.click(fn=generate_answer, inputs=[q, lang], outputs=[ans, cites])
153
  rebuild.click(fn=rebuild_index, outputs=[log])
154
 
155
+ # Gradio SDK はこの変数を自動検出して起動します
156
+ demo
config.yaml CHANGED
@@ -1,31 +1,36 @@
1
- app_name "IR/ESG RAG Bot(OpenAI,8 languages)
2
- embedding_model:"text-embedding-3-large"
3
  normalize_embeddings: true
 
4
  chunk:
5
- target_chars:1400
6
- overlap_chars:180
 
7
  retrieval:
8
- top_k:6
9
- score_threshold:0.15
10
- mmr_lambda:0.3
 
11
  llm:
12
- model:"gpt-4"
13
- max_output_tokens:700
14
- temperature:0.2
15
- system_prompt: |
16
-  あなたは上場企業のIR/ESG開示に特化したRAGアシスタントです。回答は常に根拠(文書名・ページ)を箇条書きで示し、
17
-  文書外の推測や断定は避けます。数値は年度と単位を明記し、最新年度を優先してください。
18
- language:
19
- preferred:[ja,en,zh,ko,de,es,it,fr]
20
- labels:
21
- ja:"日本語"
22
- en:"English"
23
- zh:"中文"
24
- ko:"한국어"
25
- de:"Deutsch"
26
- es:"Español"
27
- it:"Italiano"
28
- fr:"Français"
 
 
29
  logging:
30
  save_qa: true
31
- path:"logs/qa_log.jsonl"
 
1
+ app_name: "IR/ESG RAG Bot (OpenAI, 8 languages)"
2
+ embedding_model: "text-embedding-3-large"
3
  normalize_embeddings: true
4
+
5
  chunk:
6
+ target_chars: 1400
7
+ overlap_chars: 180
8
+
9
  retrieval:
10
+ top_k: 6
11
+ score_threshold: 0.15
12
+ mmr_lambda: 0.3
13
+
14
  llm:
15
+ model: "gpt-4o-mini"
16
+ max_output_tokens: 700
17
+ temperature: 0.2
18
+ system_prompt: |-
19
+ あなたは上場企業のIR/ESG開示に特化したRAGアシスタントです。回答は常に根拠(文書名・ページ)を箇条書きで示し、
20
+ 文書外の推測や断定は避けます。数値は年度と単位を明記し、最新年度を優先してください。
21
+
22
+ languages:
23
+ preferred: [ja, en, zh, ko, fr, de, es, it]
24
+ labels:
25
+ ja: "日本語"
26
+ en: "English"
27
+ zh: "中文"
28
+ ko: "한국어"
29
+ fr: "Français"
30
+ de: "Deutsch"
31
+ es: "Español"
32
+ it: "Italiano"
33
+
34
  logging:
35
  save_qa: true
36
+ path: "logs/qa_log.jsonl"
guardrails.py CHANGED
@@ -7,10 +7,10 @@ ALLOWED_TOPICS = [
7
  ]
8
  OUT_OF_SCOPE_PATTERNS = [r"採用の可否", r"未公開情報", r"株価予想", r"インサイダー", r"個人情報"]
9
 
10
- # 簡易PIIマスク(郵便・電話っぽい連番・メール)
11
  PII = re.compile(
12
- r"(\d{3}-\d{4})" # 郵便番号 123-4567
13
- r"|(\d{2,4}-\d{2,4}-\d{3,4})" # 電話番号 03-1234-5678 など
14
  r"|([A-Za-z0-9._%+-]+@[A-Za-z0-9.-]+)" # メール
15
  )
16
 
 
7
  ]
8
  OUT_OF_SCOPE_PATTERNS = [r"採用の可否", r"未公開情報", r"株価予想", r"インサイダー", r"個人情報"]
9
 
10
+ # 簡易PIIマスク(郵便・電話・メール)
11
  PII = re.compile(
12
+ r"(\d{3}-\d{4})" # 郵便番号
13
+ r"|(\d{2,4}-\d{2,4}-\d{3,4})" # 電話番号
14
  r"|([A-Za-z0-9._%+-]+@[A-Za-z0-9.-]+)" # メール
15
  )
16
 
ingest.py CHANGED
@@ -17,7 +17,7 @@ NORMALIZE = CFG.get("normalize_embeddings", True)
17
  DATA_DIR = pathlib.Path("data")
18
  PDF_DIR = DATA_DIR / "pdf"
19
  INDEX_DIR = DATA_DIR / "index"
20
- META_PATH = INDEX_DIR / "meta.jsonl" # app.py に合わせて .jsonl
21
  INDEX_PATH = INDEX_DIR / "faiss.index"
22
 
23
  def read_pdf_with_pages(path: str) -> List[Tuple[int, str]]:
@@ -37,7 +37,7 @@ def split_chunks(pages: List[Tuple[int, str]], target_chars: int, overlap_chars:
37
  start = 0
38
  while start < len(text):
39
  end = min(len(text), start + target_chars)
40
- chunk = text[start:end] # ← タイポ修正(strart -> start)
41
  if len(chunk.strip()) >= 50:
42
  chunks.append({"page": page, "text": chunk})
43
  start = end - overlap_chars if end - overlap_chars > 0 else end
@@ -57,8 +57,6 @@ def build_index():
57
  overlap_chars = CFG["chunk"]["overlap_chars"]
58
 
59
  texts: List[str] = []
60
- metas: List[Dict] = []
61
-
62
  for pdf in sorted(PDF_DIR.glob("*.pdf")):
63
  print(f"Processing {pdf.name}...")
64
  pages = read_pdf_with_pages(str(pdf))
@@ -67,7 +65,6 @@ def build_index():
67
  t = c["text"][:1800]
68
  texts.append(t)
69
  meta = {"source": pdf.name, "page": c["page"], "text": sanitize(t)}
70
- metas.append(meta)
71
  meta_f.write(json.dumps(meta, ensure_ascii=False) + "\n")
72
 
73
  meta_f.close()
 
17
  DATA_DIR = pathlib.Path("data")
18
  PDF_DIR = DATA_DIR / "pdf"
19
  INDEX_DIR = DATA_DIR / "index"
20
+ META_PATH = INDEX_DIR / "meta.jsonl" # app.py と一致
21
  INDEX_PATH = INDEX_DIR / "faiss.index"
22
 
23
  def read_pdf_with_pages(path: str) -> List[Tuple[int, str]]:
 
37
  start = 0
38
  while start < len(text):
39
  end = min(len(text), start + target_chars)
40
+ chunk = text[start:end]
41
  if len(chunk.strip()) >= 50:
42
  chunks.append({"page": page, "text": chunk})
43
  start = end - overlap_chars if end - overlap_chars > 0 else end
 
57
  overlap_chars = CFG["chunk"]["overlap_chars"]
58
 
59
  texts: List[str] = []
 
 
60
  for pdf in sorted(PDF_DIR.glob("*.pdf")):
61
  print(f"Processing {pdf.name}...")
62
  pages = read_pdf_with_pages(str(pdf))
 
65
  t = c["text"][:1800]
66
  texts.append(t)
67
  meta = {"source": pdf.name, "page": c["page"], "text": sanitize(t)}
 
68
  meta_f.write(json.dumps(meta, ensure_ascii=False) + "\n")
69
 
70
  meta_f.close()
openai_client.py CHANGED
@@ -16,9 +16,9 @@ def embed_texts(texts: List[str], model: str) -> List[List[float]]:
16
  resp = client().embeddings.create(model=model, input=texts)
17
  return [d.embedding for d in resp.data]
18
 
19
- # Responses API(app.py 側の呼び出しシグネチャに合わせる)
20
  def chat(messages: List[Dict], model: str, max_output_tokens: int = 700, temperature: float = 0.2) -> str:
21
- # Responses APIは input=messages 形式
22
  resp = client().responses.create(
23
  model=model,
24
  input=messages,
 
16
  resp = client().embeddings.create(model=model, input=texts)
17
  return [d.embedding for d in resp.data]
18
 
19
+ # Responses API
20
  def chat(messages: List[Dict], model: str, max_output_tokens: int = 700, temperature: float = 0.2) -> str:
21
+ # Responses APIは input=messages
22
  resp = client().responses.create(
23
  model=model,
24
  input=messages,
requirements.txt CHANGED
@@ -6,4 +6,4 @@ gradio==4.44.0
6
  fastapi==0.112.0
7
  uvicorn==0.30.5
8
  httpx==0.27.0
9
- pydantic==2.8.2
 
6
  fastapi==0.112.0
7
  uvicorn==0.30.5
8
  httpx==0.27.0
9
+ pydantic==2.8.2