Spaces:
Runtime error
Runtime error
| from __future__ import annotations | |
| import json, pathlib | |
| from typing import List, Dict, Tuple | |
| import numpy as np | |
| import faiss | |
| from pypdf import PdfReader | |
| import yaml | |
| from openai_client import embed_texts | |
| from guardrails import sanitize | |
# Load the app-wide configuration once at import time.
# `with` closes the handle deterministically (the original leaked an open file).
with open("config.yaml", encoding="utf-8") as _cfg_file:
    CFG = yaml.safe_load(_cfg_file)

EMB_MODEL = CFG["embedding_model"]                  # embedding model name passed to embed_texts
NORMALIZE = CFG.get("normalize_embeddings", True)   # L2-normalize vectors (cosine via inner product)

DATA_DIR = pathlib.Path("data")
PDF_DIR = DATA_DIR / "pdf"              # input PDFs live here
INDEX_DIR = DATA_DIR / "index"          # output artifacts live here
META_PATH = INDEX_DIR / "meta.jsonl"    # must match the path used by app.py
INDEX_PATH = INDEX_DIR / "faiss.index"
def read_pdf_with_pages(path: str) -> List[Tuple[int, str]]:
    """Extract text from every page of the PDF at *path*.

    Returns a list of (1-based page number, cleaned text) pairs; blank lines
    and surrounding whitespace are removed from each page's text. Pages that
    yield no text produce an empty string (extract_text may return None).
    """
    result: List[Tuple[int, str]] = []
    for page_no, page in enumerate(PdfReader(path).pages, start=1):
        raw = page.extract_text() or ""
        stripped_lines = (ln.strip() for ln in raw.splitlines())
        cleaned = "\n".join(ln for ln in stripped_lines if ln)
        result.append((page_no, cleaned))
    return result
def split_chunks(pages: List[Tuple[int, str]], target_chars: int, overlap_chars: int) -> List[Dict]:
    """Split per-page texts into overlapping character chunks.

    Each chunk is {"page": page_number, "text": chunk}. Chunks whose stripped
    length is under 50 characters are dropped as too small to be useful.

    Bug fix: the original recomputed ``start = end - overlap_chars`` even after
    ``end`` had reached the end of the text, so any page longer than
    ``target_chars`` (with ``overlap_chars > 0``) looped forever re-emitting the
    final chunk. We now break once the tail of the page has been consumed.
    """
    chunks: List[Dict] = []
    for page, text in pages:
        if not text:
            continue
        text_len = len(text)
        start = 0
        while start < text_len:
            end = min(text_len, start + target_chars)
            chunk = text[start:end]
            if len(chunk.strip()) >= 50:  # skip near-empty fragments
                chunks.append({"page": page, "text": chunk})
            if end >= text_len:
                break  # tail consumed — stepping back by overlap would never terminate
            start = end - overlap_chars if end - overlap_chars > 0 else end
    return chunks
def l2_normalize(m: np.ndarray) -> np.ndarray:
    """Scale each row of *m* to unit L2 length.

    Returns *m* unchanged when normalization is disabled via the module-level
    NORMALIZE flag (read from config).
    """
    if NORMALIZE:
        # The tiny epsilon keeps all-zero rows from dividing by zero.
        return m / (np.linalg.norm(m, axis=1, keepdims=True) + 1e-12)
    return m
def build_index() -> None:
    """Build the FAISS index and metadata sidecar from PDFs under data/pdf/.

    For every chunk extracted from every PDF, writes one JSON line
    ({"source", "page", "text"}) to META_PATH and adds one embedding vector to
    a flat inner-product index saved at INDEX_PATH (cosine similarity, since
    vectors are L2-normalized when NORMALIZE is on).

    Raises:
        SystemExit: when no chunks were produced (no PDFs, or all empty).
    """
    INDEX_DIR.mkdir(parents=True, exist_ok=True)
    target_chars = CFG["chunk"]["target_chars"]
    overlap_chars = CFG["chunk"]["overlap_chars"]

    texts: List[str] = []
    # `with` guarantees the metadata file is closed even if a PDF fails to
    # parse (the original left the handle open on any exception).
    with open(META_PATH, "w", encoding="utf-8") as meta_f:
        for pdf in sorted(PDF_DIR.glob("*.pdf")):
            print(f"Processing {pdf.name}...")
            pages = read_pdf_with_pages(str(pdf))
            for c in split_chunks(pages, target_chars, overlap_chars):
                t = c["text"][:1800]  # hard cap per chunk to bound embedding cost
                texts.append(t)
                meta = {"source": pdf.name, "page": c["page"], "text": sanitize(t)}
                meta_f.write(json.dumps(meta, ensure_ascii=False) + "\n")

    if not texts:
        raise SystemExit("Put PDFs under data/pdf/")

    vecs = embed_texts(texts, EMB_MODEL)
    mat = l2_normalize(np.array(vecs, dtype="float32"))
    # Cosine similarity = inner product on L2-normalized vectors.
    index = faiss.IndexFlatIP(mat.shape[1])
    index.add(mat)
    faiss.write_index(index, str(INDEX_PATH))
    print(f"Index {len(texts)} chunks → {INDEX_PATH}")
| if __name__ == "__main__": | |
| build_index() | |