# IR_ESG_RAG_Bot / ingest.py
# (Hugging Face upload metadata: Corin1998, "Upload 8 files", commit f31c318 verified)
from __future__ import annotations
import json, pathlib
from typing import List, Dict, Tuple
import numpy as np
import faiss
from pypdf import PdfReader
import yaml
from openai_client import embed_texts
from guardrails import sanitize
# Configuration and filesystem layout.
# Use a context manager so the config file handle is closed deterministically
# (the original `yaml.safe_load(open(...))` leaked the handle).
with open("config.yaml", encoding="utf-8") as _cfg_file:
    CFG = yaml.safe_load(_cfg_file)
EMB_MODEL = CFG["embedding_model"]
NORMALIZE = CFG.get("normalize_embeddings", True)  # L2-normalize embeddings before indexing
DATA_DIR = pathlib.Path("data")
PDF_DIR = DATA_DIR / "pdf"            # input PDFs are read from here
INDEX_DIR = DATA_DIR / "index"        # index artifacts are written here
META_PATH = INDEX_DIR / "meta.jsonl"  # must match the path used by app.py
INDEX_PATH = INDEX_DIR / "faiss.index"
def read_pdf_with_pages(path: str) -> List[Tuple[int, str]]:
    """Extract text from a PDF, returning (1-based page number, text) pairs.

    Each page's text has every line stripped of surrounding whitespace and
    blank lines removed; pages with no extractable text yield an empty string.
    """
    result: List[Tuple[int, str]] = []
    for page_no, page in enumerate(PdfReader(path).pages, start=1):
        raw = page.extract_text() or ""
        stripped = (line.strip() for line in raw.splitlines())
        cleaned = "\n".join(line for line in stripped if line)
        result.append((page_no, cleaned))
    return result
def split_chunks(pages: List[Tuple[int, str]], target_chars: int, overlap_chars: int) -> List[Dict]:
    """Split per-page texts into overlapping character-window chunks.

    Args:
        pages: (page_number, text) pairs as produced by read_pdf_with_pages.
        target_chars: desired chunk length in characters.
        overlap_chars: characters shared between consecutive chunks; clamped
            below target_chars so the window always moves forward.

    Returns:
        List of dicts with keys "page" and "text". Chunks whose stripped text
        is shorter than 50 characters are dropped as noise.

    Bug fixed: the original update `start = end - overlap_chars` never
    terminated once `end` was pinned at len(text) with a positive remainder —
    e.g. a 120-char page with target 100 / overlap 20 looped forever at
    start=100. We now break as soon as the window reaches the end of the text.
    """
    # An overlap >= target would make the window stall or move backwards.
    safe_overlap = min(overlap_chars, max(target_chars - 1, 0))
    chunks: List[Dict] = []
    for page, text in pages:
        if not text:
            continue
        n = len(text)
        start = 0
        while start < n:
            end = min(n, start + target_chars)
            chunk = text[start:end]
            if len(chunk.strip()) >= 50:
                chunks.append({"page": page, "text": chunk})
            if end == n:
                # Last window of this page — stop instead of re-sliding
                # back over the tail forever.
                break
            start = end - safe_overlap if end - safe_overlap > 0 else end
    return chunks
def l2_normalize(m: np.ndarray) -> np.ndarray:
    """Row-wise L2 normalization, gated by the NORMALIZE config flag.

    Returns the input unchanged when normalization is disabled.
    """
    if NORMALIZE:
        # Epsilon keeps all-zero rows from dividing by zero.
        return m / (np.linalg.norm(m, axis=1, keepdims=True) + 1e-12)
    return m
def build_index():
    """Build a FAISS inner-product index over chunked PDF text.

    Reads every PDF under data/pdf/, chunks the extracted text per the
    config, embeds the chunks, and writes:
      - data/index/meta.jsonl : one JSON record per chunk (source, page, text)
      - data/index/faiss.index: the FAISS index (inner product == cosine
        similarity when the embedding rows are L2-normalized)

    Raises:
        SystemExit: when no chunk text could be collected (no PDFs found).
    """
    INDEX_DIR.mkdir(parents=True, exist_ok=True)
    target_chars = CFG["chunk"]["target_chars"]
    overlap_chars = CFG["chunk"]["overlap_chars"]
    texts: List[str] = []
    # Context manager guarantees meta.jsonl is closed even if a PDF fails
    # mid-loop (the original left the handle open on any exception).
    with open(META_PATH, "w", encoding="utf-8") as meta_f:
        for pdf in sorted(PDF_DIR.glob("*.pdf")):
            print(f"Processing {pdf.name}...")
            pages = read_pdf_with_pages(str(pdf))
            for c in split_chunks(pages, target_chars, overlap_chars):
                t = c["text"][:1800]  # hard cap per chunk to bound embedding input size
                texts.append(t)
                meta = {"source": pdf.name, "page": c["page"], "text": sanitize(t)}
                meta_f.write(json.dumps(meta, ensure_ascii=False) + "\n")
    if not texts:
        raise SystemExit("Put PDFs under data/pdf/")
    vecs = embed_texts(texts, EMB_MODEL)
    mat = l2_normalize(np.array(vecs, dtype="float32"))
    # Cosine similarity via inner product over (optionally) normalized vectors.
    index = faiss.IndexFlatIP(mat.shape[1])
    index.add(mat)
    faiss.write_index(index, str(INDEX_PATH))
    print(f"Index {len(texts)} chunks → {INDEX_PATH}")
# Script entry point: build (or rebuild) the index with `python ingest.py`.
if __name__ == "__main__":
    build_index()