# Build a FAISS similarity index over glossary.txt using OpenAI embeddings.
import json
from pathlib import Path

import faiss
import numpy as np
import openai
from dotenv import load_dotenv
# ---------- setup ----------
load_dotenv()  # pulls OPENAI_API_KEY from .env so the client below can authenticate
client = openai.OpenAI()  # reads OPENAI_API_KEY from the environment

TXT_FILE = "glossary.txt"       # input: plain text, entries separated by blank lines
OUT_INDEX = "glossary.index"    # output: serialized FAISS index file
OUT_CHUNKS = "chunks.json"      # output: JSON list of chunk strings, row-aligned with the index
EMBED_MODEL = "text-embedding-3-small"  # OpenAI embedding model used for every chunk
# ----------------------------
# ---------- load + chunk ----------
# Every blank-line-separated paragraph of the glossary becomes one chunk;
# empty paragraphs (runs of blank lines) are dropped.
txt = Path(TXT_FILE).read_text(encoding="utf8")
chunks = list(filter(None, (part.strip() for part in txt.split("\n\n"))))
| # ---------- embed ---------- | |
# ---------- embed ----------
def embed(texts, batch_size=256):
    """Return one embedding vector per string in *texts*, in input order.

    Requests are sent to the OpenAI embeddings endpoint in batches of
    *batch_size* instead of one giant request, because the endpoint limits
    how many inputs a single call may carry — a large glossary would
    otherwise fail outright.

    Args:
        texts: list of strings to embed.
        batch_size: maximum inputs per API request (new keyword with a
            default, so existing single-argument callers are unaffected).

    Returns:
        list of embedding vectors (list[float]), aligned with *texts*.
    """
    if not texts:
        return []  # the API rejects an empty input list; short-circuit instead
    vectors = []
    for start in range(0, len(texts), batch_size):
        batch = texts[start:start + batch_size]
        res = client.embeddings.create(model=EMBED_MODEL, input=batch)
        # each item in res.data carries its input index; they come back in order
        vectors.extend(d.embedding for d in res.data)
    return vectors
# ---------- embed + build index + save ----------
# Guard: an empty/whitespace-only glossary yields no chunks, and
# np.array([]) has shape (0,), so vecs.shape[1] below would raise a
# cryptic IndexError. Fail early with a clear message instead.
if not chunks:
    raise SystemExit(f"No chunks found in {TXT_FILE}; nothing to index.")

vecs = np.array(embed(chunks), dtype="float32")
faiss.normalize_L2(vecs)  # cosine similarity wants unit vectors

dim = vecs.shape[1]
index = faiss.IndexFlatIP(dim)  # inner product == cosine when vectors norm-1
index.add(vecs)

# Persist the index and the chunk texts side by side: row i of the index
# corresponds to chunks[i], so retrieval can map hits back to text.
faiss.write_index(index, OUT_INDEX)
Path(OUT_CHUNKS).write_text(json.dumps(chunks, ensure_ascii=False), encoding="utf8")
print(f"Built {index.ntotal} vectors → {OUT_INDEX}")