Spaces:
Runtime error
Runtime error
File size: 2,793 Bytes
d17f367 852a240 d17f367 852a240 f31c318 852a240 d17f367 852a240 d17f367 852a240 d17f367 f31c318 d17f367 852a240 d17f367 852a240 d17f367 852a240 d17f367 852a240 d17f367 852a240 d17f367 852a240 d17f367 852a240 d17f367 852a240 d17f367 852a240 d17f367 852a240 d17f367 |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 |
from __future__ import annotations
import json, pathlib
from typing import List, Dict, Tuple
import numpy as np
import faiss
from pypdf import PdfReader
import yaml
from openai_client import embed_texts
from guardrails import sanitize
# --- Module-level configuration -------------------------------------------
# Load settings once at import time. Use a context manager so the config
# file handle is closed promptly (yaml.safe_load(open(...)) leaks it).
with open("config.yaml", encoding="utf-8") as _cfg_file:
    CFG = yaml.safe_load(_cfg_file)

EMB_MODEL = CFG["embedding_model"]                  # embedding model name
NORMALIZE = CFG.get("normalize_embeddings", True)   # L2-normalize vectors?

DATA_DIR = pathlib.Path("data")
PDF_DIR = DATA_DIR / "pdf"                          # input PDFs live here
INDEX_DIR = DATA_DIR / "index"
META_PATH = INDEX_DIR / "meta.jsonl"                # must match the path used by app.py
INDEX_PATH = INDEX_DIR / "faiss.index"
def read_pdf_with_pages(path: str) -> List[Tuple[int, str]]:
    """Extract text from every page of a PDF.

    Returns a list of ``(page_number, text)`` tuples where page numbers are
    1-based and each page's text has blank lines and per-line surrounding
    whitespace stripped.
    """
    result: List[Tuple[int, str]] = []
    for page_no, page in enumerate(PdfReader(path).pages, start=1):
        raw = page.extract_text() or ""
        kept = [ln.strip() for ln in raw.splitlines() if ln.strip()]
        result.append((page_no, "\n".join(kept)))
    return result
def split_chunks(pages: List[Tuple[int, str]], target_chars: int, overlap_chars: int) -> List[Dict]:
    """Slice page texts into overlapping fixed-size chunks.

    Args:
        pages: list of (page_number, text) tuples as produced by
            ``read_pdf_with_pages``.
        target_chars: window size of each chunk in characters.
        overlap_chars: number of characters consecutive chunks share.

    Returns:
        List of ``{"page": int, "text": str}`` dicts. Fragments whose
        stripped length is under 50 characters are dropped as too short
        to embed usefully.

    Bug fix: the original recomputed ``start = end - overlap_chars`` even
    after the window had already reached the end of the text, so ``start``
    never advanced past ``len(text) - overlap_chars`` — an infinite loop
    for any non-empty page when ``overlap_chars > 0``. We now stop as soon
    as a window consumes the tail, and force monotonic progress even if
    the config sets ``overlap_chars >= target_chars``.
    """
    chunks: List[Dict] = []
    for page, text in pages:
        if not text:
            continue
        start = 0
        while start < len(text):
            end = min(len(text), start + target_chars)
            chunk = text[start:end]
            if len(chunk.strip()) >= 50:
                chunks.append({"page": page, "text": chunk})
            if end >= len(text):
                break  # final window reached the end of the page text
            next_start = end - overlap_chars
            # Guarantee forward progress even with pathological overlap.
            start = next_start if next_start > start else end
    return chunks
def l2_normalize(m: np.ndarray, normalize: bool | None = None) -> np.ndarray:
    """Row-wise L2-normalize a matrix of embedding vectors.

    Args:
        m: 2-D float matrix, one embedding per row.
        normalize: explicit override; when ``None`` (the default, so all
            existing callers are unchanged) the module-level ``NORMALIZE``
            config setting decides.

    Returns:
        ``m`` unchanged when normalization is disabled, otherwise a new
        matrix with each row scaled to unit L2 norm.
    """
    if normalize is None:
        normalize = NORMALIZE
    if not normalize:
        return m
    # Tiny epsilon guards against division by zero on all-zero rows.
    norms = np.linalg.norm(m, axis=1, keepdims=True) + 1e-12
    return m / norms
def build_index() -> None:
    """Build the FAISS index and metadata sidecar from PDFs under data/pdf/.

    Side effects:
        - creates ``INDEX_DIR`` if missing,
        - writes one JSON line per chunk to ``META_PATH``,
        - writes the FAISS index to ``INDEX_PATH``.

    Raises:
        SystemExit: when no chunks were produced (no PDFs found).
    """
    INDEX_DIR.mkdir(parents=True, exist_ok=True)
    target_chars = CFG["chunk"]["target_chars"]
    overlap_chars = CFG["chunk"]["overlap_chars"]
    texts: List[str] = []
    # Context manager guarantees the metadata file is closed (and flushed)
    # even if PDF parsing raises mid-loop — the original leaked the handle.
    with open(META_PATH, "w", encoding="utf-8") as meta_f:
        for pdf in sorted(PDF_DIR.glob("*.pdf")):
            print(f"Processing {pdf.name}...")
            pages = read_pdf_with_pages(str(pdf))
            chunks = split_chunks(pages, target_chars, overlap_chars)
            for c in chunks:
                t = c["text"][:1800]  # hard cap per-chunk size before embedding
                texts.append(t)
                meta = {"source": pdf.name, "page": c["page"], "text": sanitize(t)}
                meta_f.write(json.dumps(meta, ensure_ascii=False) + "\n")
    if not texts:
        raise SystemExit("Put PDFs under data/pdf/")
    vecs = embed_texts(texts, EMB_MODEL)
    mat = np.array(vecs, dtype="float32")
    mat = l2_normalize(mat)
    # Cosine similarity = inner product on L2-normalized vectors.
    index = faiss.IndexFlatIP(mat.shape[1])
    index.add(mat)
    faiss.write_index(index, str(INDEX_PATH))
    print(f"Index {len(texts)} chunks → {INDEX_PATH}")
# Script entry point: build the index when run directly (python <this file>).
if __name__ == "__main__":
    build_index()
|