"""Build a FAISS cosine-similarity index from a glossary text file.

Reads TXT_FILE, splits it into paragraph chunks (blank-line separated),
embeds each chunk with the OpenAI embeddings API, L2-normalizes the
vectors (so inner product == cosine similarity), then writes the FAISS
index to OUT_INDEX and the chunk texts to OUT_CHUNKS.
"""

import json
from pathlib import Path

import numpy as np
import faiss
import openai
from dotenv import load_dotenv

# ---------- setup ----------
load_dotenv()  # pulls OPENAI_API_KEY from .env
client = openai.OpenAI()

TXT_FILE = "glossary.txt"
OUT_INDEX = "glossary.index"
OUT_CHUNKS = "chunks.json"
EMBED_MODEL = "text-embedding-3-small"
# The embeddings endpoint caps the number of inputs per request, so large
# glossaries must be sent in batches rather than one giant call.
EMBED_BATCH_SIZE = 1000
# ----------------------------


def load_chunks(path: str) -> list[str]:
    """Return non-empty, whitespace-trimmed paragraph chunks from *path*.

    Paragraphs are separated by blank lines ("\\n\\n"); runs of blank
    lines produce no empty chunks.
    """
    text = Path(path).read_text(encoding="utf8")
    return [c.strip() for c in text.split("\n\n") if c.strip()]


def embed(texts: list[str], batch_size: int = EMBED_BATCH_SIZE) -> list[list[float]]:
    """Embed *texts* with EMBED_MODEL and return one vector per text.

    Requests are issued in batches of at most *batch_size* inputs so a
    large glossary does not exceed the API's per-request input limit.
    Order of the returned vectors matches the order of *texts*.
    """
    vectors: list[list[float]] = []
    for start in range(0, len(texts), batch_size):
        batch = texts[start:start + batch_size]
        res = client.embeddings.create(model=EMBED_MODEL, input=batch)
        vectors.extend(d.embedding for d in res.data)
    return vectors


def build_index(vecs: np.ndarray) -> "faiss.IndexFlatIP":
    """Return an inner-product FAISS index over *vecs* (normalized in place).

    *vecs* must be a 2-D float32 array. Vectors are L2-normalized first
    because inner product equals cosine similarity only on unit vectors.
    """
    faiss.normalize_L2(vecs)  # cosine similarity wants unit vectors
    index = faiss.IndexFlatIP(vecs.shape[1])  # inner product == cosine when vectors norm-1
    index.add(vecs)
    return index


def main() -> None:
    """Chunk, embed, index, and persist the glossary."""
    chunks = load_chunks(TXT_FILE)
    if not chunks:
        # np.array([]) would be 1-D and vecs.shape[1] would raise a
        # confusing IndexError; fail with a clear message instead.
        raise SystemExit(f"No chunks found in {TXT_FILE}; nothing to index.")

    vecs = np.array(embed(chunks), dtype="float32")
    index = build_index(vecs)

    faiss.write_index(index, OUT_INDEX)
    Path(OUT_CHUNKS).write_text(
        json.dumps(chunks, ensure_ascii=False), encoding="utf8"
    )
    print(f"Built {index.ntotal} vectors → {OUT_INDEX}")


if __name__ == "__main__":
    main()