"""Build a FAISS index and JSONL metadata from the PDFs under data/pdf."""
from __future__ import annotations

import json
import pathlib
from typing import Dict, List, Tuple

import faiss
import numpy as np
import yaml
from pypdf import PdfReader

from guardrails import sanitize
from openai_client import embed_texts

# Load the configuration once at import time; the context manager closes the
# file handle instead of leaving it to the garbage collector.
with open("config.yaml", encoding="utf-8") as _cfg_file:
    CFG = yaml.safe_load(_cfg_file)

EMB_MODEL = CFG["embedding_model"]
NORMALIZE = CFG.get("normalize_embeddings", True)

DATA_DIR = pathlib.Path("data")
PDF_DIR = DATA_DIR / "pdf"
INDEX_DIR = DATA_DIR / "index"
META_PATH = INDEX_DIR / "meta.jsonl"  # must match app.py
INDEX_PATH = INDEX_DIR / "faiss.index"

# Upper bound on the characters of a chunk that are embedded and stored.
# NOTE(review): presumably keeps inputs within the embedding model's limit
# -- confirm against the embedding client.
MAX_CHUNK_CHARS = 1800

# Chunks whose stripped length is below this are dropped by split_chunks().
MIN_CHUNK_CHARS = 50


def read_pdf_with_pages(path: str) -> List[Tuple[int, str]]:
    """Extract the text of every page of a PDF.

    Args:
        path: Filesystem path of the PDF to read.

    Returns:
        One ``(page_number, text)`` tuple per page, with page numbers starting
        at 1. Each line of the text is stripped and blank lines are removed;
        a page without extractable text yields an empty string.
    """
    pages: List[Tuple[int, str]] = []
    reader = PdfReader(path)
    for page_no, page in enumerate(reader.pages, start=1):
        raw = page.extract_text() or ""
        stripped = (line.strip() for line in raw.splitlines())
        pages.append((page_no, "\n".join(line for line in stripped if line)))
    return pages


def split_chunks(
    pages: List[Tuple[int, str]], target_chars: int, overlap_chars: int
) -> List[Dict]:
    """Cut page texts into overlapping fixed-size character windows.

    Windows never span pages. A window whose stripped length is shorter than
    ``MIN_CHUNK_CHARS`` is discarded.

    Args:
        pages: ``(page_number, text)`` tuples, e.g. from read_pdf_with_pages().
        target_chars: Maximum number of characters per chunk; must be > 0.
        overlap_chars: Number of characters shared by consecutive chunks. When
            it is not smaller than ``target_chars`` the overlap cannot be
            honoured and chunks are laid out back to back instead.

    Returns:
        A list of ``{"page": int, "text": str}`` dicts in reading order.

    Raises:
        ValueError: If ``target_chars`` is not positive.
    """
    if target_chars <= 0:
        raise ValueError("target_chars must be positive")

    # The stride must be strictly positive, otherwise the window would never
    # advance past the start of the page.
    if overlap_chars < target_chars:
        step = target_chars - overlap_chars
    else:
        step = target_chars

    chunks: List[Dict] = []
    for page, text in pages:
        if not text:
            continue
        start = 0
        while start < len(text):
            end = min(len(text), start + target_chars)
            chunk = text[start:end]
            if len(chunk.strip()) >= MIN_CHUNK_CHARS:
                chunks.append({"page": page, "text": chunk})
            if end >= len(text):
                # This window reached the end of the page. Stepping back by
                # the overlap here would revisit the same tail forever.
                break
            start += step
    return chunks


def l2_normalize(m: np.ndarray) -> np.ndarray:
    """Scale every row of ``m`` to unit L2 norm.

    When the ``normalize_embeddings`` config flag is false the input is
    returned unchanged.

    Args:
        m: 2-D array with one vector per row.

    Returns:
        The row-normalized array, or ``m`` itself when normalization is off.
    """
    if not NORMALIZE:
        return m
    # The small epsilon avoids a division by zero for all-zero rows.
    norms = np.linalg.norm(m, axis=1, keepdims=True) + 1e-12
    return m / norms


def build_index() -> None:
    """Embed every PDF chunk and write the FAISS index plus its metadata.

    Reads all ``*.pdf`` files in ``PDF_DIR`` in sorted order, splits them with
    the ``chunk`` settings from the config, embeds the chunk texts and writes
    ``INDEX_PATH`` and ``META_PATH``. Row ``i`` of the index corresponds to
    line ``i`` of the metadata file.

    Raises:
        SystemExit: If no chunk could be produced from ``PDF_DIR``.
    """
    INDEX_DIR.mkdir(parents=True, exist_ok=True)
    target_chars = CFG["chunk"]["target_chars"]
    overlap_chars = CFG["chunk"]["overlap_chars"]

    texts: List[str] = []
    records: List[Dict] = []
    for pdf in sorted(PDF_DIR.glob("*.pdf")):
        print(f"Processing {pdf.name}...")
        pages = read_pdf_with_pages(str(pdf))
        for c in split_chunks(pages, target_chars, overlap_chars):
            t = c["text"][:MAX_CHUNK_CHARS]
            # The raw text is embedded; the stored copy goes through
            # sanitize() first.
            texts.append(t)
            records.append(
                {"source": pdf.name, "page": c["page"], "text": sanitize(t)}
            )

    if not texts:
        raise SystemExit("Put PDFs under data/pdf/")

    vecs = embed_texts(texts, EMB_MODEL)
    mat = l2_normalize(np.array(vecs, dtype="float32"))

    # Inner product over the (optionally) normalized vectors; with
    # normalization enabled this equals cosine similarity.
    index = faiss.IndexFlatIP(mat.shape[1])
    index.add(mat)

    # Write the metadata only after embedding succeeded, so a failed run does
    # not leave a new meta.jsonl next to a stale index.
    with open(META_PATH, "w", encoding="utf-8") as meta_f:
        for record in records:
            meta_f.write(json.dumps(record, ensure_ascii=False) + "\n")
    faiss.write_index(index, str(INDEX_PATH))
    print(f"Index {len(texts)} chunks → {INDEX_PATH}")


if __name__ == "__main__":
    build_index()