Corin1998 commited on
Commit
852a240
·
verified ·
1 Parent(s): bba2b70

Upload 8 files

Browse files
Files changed (8) hide show
  1. README.md +36 -12
  2. app.py +110 -0
  3. config.yaml +31 -0
  4. guardrails.py +26 -0
  5. ingest.py +87 -0
  6. openai_client.py +24 -0
  7. repository layout +15 -0
  8. requirements.txt +9 -0
README.md CHANGED
@@ -1,12 +1,36 @@
1
- ---
2
- title: IR ESG RAG Bot
3
- emoji: 🌍
4
- colorFrom: red
5
- colorTo: indigo
6
- sdk: gradio
7
- sdk_version: 5.43.1
8
- app_file: app.py
9
- pinned: false
10
- ---
11
-
12
- Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+
2
+ # IR/ESG開示RAGボット(OpenAI API版・8言語対応)
3
+
4
+
5
+ ## クイックスタート
6
+ 1. `data/pdf/` にIR/ESG PDFを配置
7
+ 2. `pip install -r requirements.txt`
8
+ 3. `python ingest.py` → `data/index/` 生成
9
+ 4. `export OPENAI_API_KEY=...`(必要に応じて `OPENAI_BASE_URL`)
10
+ 5. `python app.py` → Gradio UI / `/api/answer`
11
+
12
+
13
+ ## 埋め込みサンプル
14
+ ```html
15
+ <script>
16
+ async function askRag(question, lang="ja"){
17
+ const r = await fetch("https://<your-host>/api/answer",{
18
+ method:"POST", headers:{"Content-Type":"application/json"},
19
+ body: JSON.stringify({question, lang})
20
+ });
21
+ const data = await r.json();
22
+ console.log(data.text, data.citations);
23
+ }
24
+ </script>
25
+ ```
26
+
27
+
28
+ ## モデル推奨
29
+ - 生成: `gpt-4o-mini`
30
+ - 埋め込み: `text-embedding-3-large`
31
+
32
+
33
+ ## 運用Tips
34
+ - PDF直リンク + `#page=<n>` を `meta.json` に保持すれば、根拠クリックで該当ページに飛べます。
35
+ - 年度更新はPDF差替え→`python ingest.py`。CI/CDで自動化を推奨。
36
+ - ログには個人情報を含めない。
app.py ADDED
@@ -0,0 +1,110 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from__future__import __annotations
2
+ import os, json,yaml
3
+ from typing import List,Dict,Tuple
4
+
5
+ import gradio as gr
6
+ import faiss ,numpy as np
7
+
8
+ from openai_client import embed_texts,chat
9
+ from guardrails import detect_out_of_scope,sanitize,copliance_block,SCOPE_HINT
10
+
11
+ CFG=yaml.safe_load(open("config.yaml",encoding="utf-8"))
12
+ EMB_MODEL=CFG["emb_model"]
13
+ TOP_K=CFG["retrieval"]["top_k"]
14
+ SCORE_TH=CFG["retrieval"]["score_threshold"]
15
+ LLM_MODEL=CFG["llm"]["model"]
16
+ LANGS =CFG["languages"]["preferred"]
17
+ LABELS =CFG["languages"].get("labels",{l: l for l in LANGS})
18
+
19
+ INDEX = faiss.read_index("data/index/index.faiss")
20
+ METAS = [json.loads(l) for l in open("data/index/meta.json", encoding="utf-8")]
21
+
def embed_query(q: str) -> np.ndarray:
    """Embed a single query string and L2-normalise it for similarity search."""
    raw = embed_texts([q], EMB_MODEL)[0]
    vec = np.asarray(raw, dtype=np.float32)
    vec = vec / (np.linalg.norm(vec) + 1e-12)  # epsilon guards the zero vector
    return vec.reshape(1, -1)  # FAISS expects a (1, dim) batch
def search(q: str, top_k: int = TOP_K) -> List[Dict]:
    """Retrieve up to *top_k* chunks above SCORE_TH, deduplicated by (source, page).

    Over-fetches (top_k * 4) so deduplication still yields enough results.
    """
    scores, indices = INDEX.search(embed_query(q), top_k * 4)
    results: List[Dict] = []
    seen_pages = set()
    for score, idx in zip(scores[0], indices[0]):
        if score < SCORE_TH:
            continue
        chunk = METAS[idx]
        dedup_key = (chunk["source"], chunk["page"])
        if dedup_key in seen_pages:
            continue
        seen_pages.add(dedup_key)
        results.append({**chunk, "score": float(score)})
        if len(results) >= top_k:
            break
    return results
def format_context(chunks: List[Dict]) -> str:
    """Render retrieved chunks as bullet lines: source, page, short excerpt.

    The excerpt is computed in a separate statement because a backslash
    (the ``replace('\\n', ' ')`` literal) inside an f-string expression is a
    SyntaxError before Python 3.12.
    """
    lines = []
    for c in chunks:
        snippet = c["text"][:180].replace("\n", " ")
        lines.append(f"-出典:{c['source']} p.{c['page']} |抜粋: {snippet}... ")
    return "\n".join(lines)
# Per-language output instruction appended to the user prompt.
# Keys should match the codes listed in config.yaml under language.preferred.
_LANG_INSTRUCTIONS = {
    "ja": "回答は日本語で出力してください",
    "en": "Answer in English.",
    "zh": "请用中文回答。",
    "ko": "한국어로 대답하십시오.",
    "fr": "Répondez en français.",
    "de": "Antworten Sie auf Deutsch.",
    "es": "Responde en español.",
    "it": "Rispondi in italiano.",
}
def generate_answer(q: str, lang: str) -> Tuple[str, str]:
    """Answer *q* from the indexed IR/ESG documents.

    Returns (answer_markdown, citations_json); the second element is a JSON
    string so it can be passed through the HTTP API unchanged.
    """
    q = (q or "").strip()
    if not q:
        # Original line was a SyntaxError and had a particle slip
        # (「が入力して」→「を入力して」).
        return "質問を入力してください。", "{}"

    if detect_out_of_scope(q):
        return f"{SCOPE_HINT}\nIR/ESG関連の事項についてお尋ねください。", "{}"

    chunks = search(q)
    context = format_context(chunks)

    lang_note = _LANG_INSTRUCTIONS.get(lang, "Answer in the user's preferred language.")
    # The language instruction was computed but never used, and the context
    # was interpolated *before* its [コンテキスト] header; both fixed here.
    user_prompt = (
        "以下のコンテキストのみを根拠に、簡潔かつ正確に回答してください。\n"
        "必ず箇条書きで根拠(文書名とページ)を列挙してください。\n"
        f"{lang_note}\n\n[コンテキスト]\n{context}\n\n|質問|\n{q}"
    )

    messages = [
        {"role": "system", "content": CFG["llm"]["system_prompt"]},
        {"role": "user", "content": user_prompt},
    ]

    text = chat(
        messages,
        model=LLM_MODEL,
        max_output_tokens=CFG["llm"]["max_output_tokens"],
        temperature=CFG["llm"]["temperature"],
    )
    # "complicance_block" was a NameError; guardrails defines compliance_block.
    text = sanitize(text) + "\n\n" + compliance_block()
    # `meta` was undefined in the original — expose the retrieved chunks as
    # the citation payload (README's embed sample reads `data.citations`).
    meta = {"citations": chunks}
    return text, json.dumps(meta, ensure_ascii=False)
89
+ with gr.Blocks(fill_height=True,title=CFG.get("app_name","RAG Bot")) as demo:
90
+ gr.Markdown("#IR/ESG開示RAG(OpenAI API)-8言語対応")
91
+ q= gr.Textbox("質問 / Question / 質问 / 질문 / Question / Frage / Pregunta / Domanda",lines=3,placeholder="例:2024年度のGHG排出量(スコープ1-3)は?")
92
+ lang = gr.Dropdown(choices=LANGS,value=LANGS[0],label="回答言語/Output Language")
93
+ ask = gr.Button("回答する// Answer / 回答 / 답변 / Répondre / Antworten / Responder / Rispondere", variant="primary")
94
+ ans = gr.Markdown()
95
+ cites = gr.JSON(label="根拠メタデータ/Citations")
96
+ ask.click(fn=generate_answer, inputs=[q, lang], outputs=[ans, cites])
97
+
import fastapi

api = fastapi.FastAPI()

@api.post("/api/answer")
async def api_answer(payload: dict):
    """JSON API: {"question": ..., "lang": ...} → {"text": ..., "citations": [...]}."""
    text, meta = generate_answer(payload.get("question", ""), payload.get("lang", "ja"))
    data = json.loads(meta)
    return {"text": text, **data}

# Mount the Gradio UI after the API route is registered so mounting at "/"
# cannot shadow /api/answer.
app = gr.mount_gradio_app(api, demo, path="/")

if __name__ == "__main__":  # was "if__name__" — SyntaxError
    import uvicorn
    uvicorn.run(app, host="0.0.0.0", port=int(os.getenv("PORT", 7860)))
config.yaml ADDED
@@ -0,0 +1,31 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
# Fixed YAML syntax: the original first line lacked ":" and its closing quote,
# and "key:value" without a space after the colon is a plain scalar in YAML,
# not a mapping entry — yaml.safe_load could not produce the expected dict.
app_name: "IR/ESG RAG Bot(OpenAI, 8 languages)"
embedding_model: "text-embedding-3-large"
normalize_embeddings: true
chunk:
  target_chars: 1400
  overlap_chars: 180
retrieval:
  top_k: 6
  score_threshold: 0.15
  mmr_lambda: 0.3
llm:
  model: "gpt-4o-mini"   # README recommends gpt-4o-mini; was "gpt-4"
  max_output_tokens: 700
  temperature: 0.2
  system_prompt: |
    あなたは上場企業のIR/ESG開示に特化したRAGアシスタントです。回答は常に根拠(文書名・ページ)を箇条書きで示し、
    文書外の推測や断定は避けます。数値は年度と単位を明記し、最新年度を優先してください。
language:
  preferred: [ja, en, zh, ko, de, es, it, fr]
  labels:
    ja: "日本語"
    en: "English"
    zh: "中文"
    ko: "한국어"
    de: "Deutsch"
    es: "Español"
    it: "Italiano"
    fr: "Français"
logging:
  save_qa: true
  path: "logs/qa_log.jsonl"
guardrails.py ADDED
@@ -0,0 +1,26 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from__future__import __annotations
2
+ import re
3
+
4
+ ALLOWED_TOPICS =[
5
+ r"IR",r"投資家",r"決算",r"財務",r"ガバナンス",r"統合報告",r"サステナビリティ",
6
+ r"人的資本",r"リスク",r"セグメント",r"株主",r"資本政策",r"ESG",r"GHG"
7
+ ]
8
+ OUT_OF_SCOPE_PATTERNS =[r"採用の可否",r"未公開情報",r"株価予想",r"インサイダー",r"個人情報"]
9
+ PII = re.compile(r"(\d{3}-\d{4})|\d{2,4}-\d{4}|[A-Za-z0-9._%+-]+@[A-Za-z0-9.-]+)")
10
+
11
+ SCOPE_HINT=(
12
+ "このボットはIR/ESG開示文書(統合報告書、サステナ、決算短信、コーポガバ報告)を根拠とするQ&A専用です。"
13
+ )
14
+
15
+ def detect_out_of_scope(q:str)->bool:
16
+ if any(re.search(p,q)for p in OUT_OF_SCOPE_PATTERNS):
17
+ return True
18
+ if not any(re.search(p,q)for p in ALLOWED_TOPICS):
19
+ return True
20
+ return False
21
+
22
+ def sanitize(text:str)->str:
23
+ return PII.sub("[REDACTED]", text)
24
+
25
+ def compliance_block()->str:
26
+ return"※免責:本回答は公開済みIR/ESG資料に基づく情報提供であり、投資判断を目的としません。"
ingest.py ADDED
@@ -0,0 +1,87 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from__future__import __annotations
2
+ import os,json,pathlib
3
+ from typing import List,Dict,Tuple
4
+
5
+ import numpy as np
6
+ import faiss
7
+ from pypdf import PdfReader
8
+ import yaml
9
+
10
+ from openai_client import embed_texts
11
+ from guardrails import sanitize
12
+
13
+ CFG = yaml.safe_load(open("config.yaml",encoding="utf-8"))
14
+ EMB_MODEL = CFG["embedding_model"]
15
+ NORMALIZE = CFG.get("normalize_embeddings", True)
16
+
17
+ DATA_DIR = pathlib.Path("data")
18
+ PDF_DIR = DATA_DIR / "pdf"
19
+ INDEX_DIR = DATA_DIR / "index"
20
+ META_PATH = INDEX_DIR / "meta.json"
21
+ INDEX_PATH = INDEX_DIR / "faiss.index"
22
+
def read_pdf_with_pages(path: str) -> List[Tuple[int, str]]:
    """Extract text per page; returns 1-based (page_number, text) pairs.

    Blank lines and surrounding whitespace are stripped from each page.
    """
    result: List[Tuple[int, str]] = []
    for page_no, page in enumerate(PdfReader(path).pages, start=1):
        raw = page.extract_text() or ""
        cleaned = "\n".join(line.strip() for line in raw.splitlines() if line.strip())
        result.append((page_no, cleaned))
    return result
def split_chunks(pages: List[Tuple[int, str]], target_chars: int, overlap_chars: int) -> List[Dict]:
    """Slice each page's text into overlapping windows of ~target_chars chars.

    Fixes vs. the original:
    - ``strart`` typo raised NameError on the very first chunk.
    - Once a window reaches the end of the text we stop; the original rewound
      by ``overlap_chars`` even at end-of-text, so any page longer than
      ``target_chars`` looped forever.
    - Progress is guaranteed (``> start`` instead of ``> 0``), so a degenerate
      overlap >= target cannot stall the loop either.
    Windows shorter than 50 stripped characters are dropped as noise.
    """
    chunks: List[Dict] = []
    for page, text in pages:
        if not text:
            continue
        start = 0
        while start < len(text):
            end = min(len(text), start + target_chars)
            piece = text[start:end]
            if len(piece.strip()) >= 50:
                chunks.append({"page": page, "text": piece})
            if end >= len(text):
                break
            start = end - overlap_chars if end - overlap_chars > start else end
    return chunks
def l2_normalize(m: np.ndarray) -> np.ndarray:
    """Row-wise L2-normalise *m*; no-op when normalize_embeddings is false.

    The original computed the norms and then had a stray ``;1e-12``
    expression-statement — the epsilon was never added, so an all-zero row
    divided by zero. The epsilon now actually guards the division.
    """
    if not NORMALIZE:
        return m
    norms = np.linalg.norm(m, axis=1, keepdims=True) + 1e-12
    return m / norms
def build_index():
    """Embed every PDF chunk and write the FAISS index plus metadata sidecar.

    Outputs:
    - META_PATH: one JSON object per line ({source, page, text}).
    - INDEX_PATH: IndexFlatL2 over the (optionally normalised) embeddings.

    Raises SystemExit when data/pdf/ contains no PDFs.
    """
    INDEX_DIR.mkdir(parents=True, exist_ok=True)

    target_chars = CFG["chunk"]["target_chars"]
    overlap_chars = CFG["chunk"]["overlap_chars"]

    texts = []
    # Write metadata inside a context manager so the handle is closed even if
    # extraction fails mid-way (the original leaked the open file on error).
    with open(META_PATH, "w", encoding="utf-8") as meta_f:
        for pdf in sorted(PDF_DIR.glob("*.pdf")):
            print(f"Processing {pdf.name}...")
            pages = read_pdf_with_pages(pdf)
            for c in split_chunks(pages, target_chars, overlap_chars):
                t = c["text"][:1800]  # hard cap per chunk sent to the embedder
                texts.append(t)
                meta = {"source": pdf.name, "page": c["page"], "text": sanitize(t)}
                meta_f.write(json.dumps(meta, ensure_ascii=False) + "\n")

    if not texts:
        raise SystemExit("Put PDFs under data/pdf/ ")

    mat = np.array(embed_texts(texts, EMB_MODEL), dtype="float32")
    mat = l2_normalize(mat)

    index = faiss.IndexFlatL2(mat.shape[1])
    index.add(mat)
    faiss.write_index(index, str(INDEX_PATH))
    print(f"Index {len(texts)} chunks → {INDEX_PATH}")
# Script entry point: rebuild the FAISS index from data/pdf/.
if __name__ == "__main__":
    build_index()
openai_client.py ADDED
@@ -0,0 +1,24 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from__future__import annoutations
2
+ from typing import List, Dict
3
+ from openai import OpenAI
4
+
5
+ _client = None
6
+
7
+ def client() -> OpenAI:
8
+ global _client
9
+ if _client is None:
10
+ _client = OpenAI()
11
+ return _client
12
+
def embed_texts(texts: List[str], model: str) -> List[List[float]]:
    """Embed *texts* in one API call; returns one vector per input string.

    Renamed from ``embed_text`` — both app.py and ingest.py import
    ``embed_texts`` — and the parameter is now the name the body actually
    uses (the original accepted ``text`` but referenced undefined ``texts``).
    """
    response = client().embeddings.create(model=model, input=texts)
    return [d.embedding for d in response.data]
def chat(messages: List[Dict], model: str, max_output_tokens: int = 700, temperature: float = 0.2) -> str:
    """Run one chat turn through the Responses API and return the output text.

    The Responses API takes ``input=`` and ``max_output_tokens=``; the
    original passed Chat Completions kwargs (``messages=``/``max_tokens=``),
    which ``responses.create`` rejects with a TypeError.
    """
    response = client().responses.create(
        model=model,
        input=messages,
        max_output_tokens=max_output_tokens,
        temperature=temperature,
    )
    return response.output_text
repository layout ADDED
@@ -0,0 +1,15 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ ir-esg-rag-openai-8lang/
2
+ ├── app.py # Gradio UI + FastAPI (embed可) — 8言語対応
3
+ ├── ingest.py # PDF→チャンク→OpenAI Embeddings→FAISS
4
+ ├── guardrails.py # スコープ/PII/免責
5
+ ├── openai_client.py # Responses API呼び出し・共通ユーティリティ
6
+ ├── config.yaml # モデル/閾値/言語
7
+ ├── requirements.txt
8
+ ├── README.md
9
+ ├── data/
10
+ │ ├── pdf/
11
+ │ └── index/
12
+ │ ├── faiss.index
13
+ │   └── meta.json
14
+ └── logs/
15
+ └── qa_log.jsonl
requirements.txt ADDED
@@ -0,0 +1,9 @@
 
 
 
 
 
 
 
 
 
 
1
+ openai>=1.40.0
2
+ faiss-cpu==1.8.0.post1
3
+ pypdf==4.2.0
4
+ PyYAML==6.0.2
5
+ gradio==4.44.0
6
+ fastapi==0.112.0
7
+ uvicorn==0.30.5
8
+ httpx==0.27.0
9
+ pydantic==2.8.2