# digitChatBot / build_index.py
# paradox44's picture
# Upload 7 files
# bd7261b verified
import json
from pathlib import Path
import numpy as np
import faiss
import openai
from dotenv import load_dotenv
# ---------- setup ----------
# Names of the input text, the two output artefacts, and the embedding model.
EMBED_MODEL = "text-embedding-3-small"
TXT_FILE = "glossary.txt"
OUT_INDEX = "glossary.index"
OUT_CHUNKS = "chunks.json"

# Read .env before building the client so OPENAI_API_KEY is available to it.
load_dotenv()
client = openai.OpenAI()
# ----------------------------
# ---------- load + chunk ----------
# "utf-8-sig" reads plain UTF-8 too, but also drops a leading byte-order mark;
# str.strip() does not treat the BOM as whitespace, so with plain "utf8" it
# would stay attached to the first chunk.
txt = Path(TXT_FILE).read_text(encoding="utf-8-sig")

# Blank out whitespace-only lines so that "\n \n" separates paragraphs exactly
# like "\n\n" does. Lines that carry content are left untouched.
paragraph_text = "\n".join(
    line if line.strip() else "" for line in txt.split("\n")
)

# One chunk per blank-line-separated paragraph; empty paragraphs are dropped.
chunks = [c.strip() for c in paragraph_text.split("\n\n") if c.strip()]
# ---------- embed ----------
def embed(texts, batch_size=512):
    """Return one embedding per entry of *texts*, in input order.

    The inputs are sent to the embeddings endpoint in slices of at most
    *batch_size* items instead of one request for everything, so a large
    glossary does not run into the endpoint's per-request input limit.

    Args:
        texts: A sequence of strings to embed. A single bare string is
            treated as one input, as the underlying API call does.
        batch_size: Maximum number of inputs per request. The default of
            512 is meant to stay below the per-request cap in the OpenAI
            API reference (2048 inputs at the time of writing — confirm
            against the current limits).

    Returns:
        A list with one embedding per input, batches concatenated in order.
        An empty input yields an empty list without calling the API.

    Raises:
        ValueError: If *batch_size* is smaller than 1.
    """
    if batch_size < 1:
        raise ValueError("batch_size must be at least 1")

    # Keep the original single-string behaviour: one string is one input,
    # not a sequence of characters.
    items = [texts] if isinstance(texts, str) else list(texts)

    vectors = []
    for start in range(0, len(items), batch_size):
        res = client.embeddings.create(
            model=EMBED_MODEL,
            input=items[start:start + batch_size],
        )
        vectors.extend(d.embedding for d in res.data)
    return vectors
# Stop early with a readable message: with no chunks the script would
# otherwise spend an API call and then fail on vecs.shape[1] below.
if not chunks:
    raise SystemExit(f"No text chunks found in {TXT_FILE!r}; nothing to index.")

vecs = np.array(embed(chunks), dtype="float32")

# The index and the chunk list are saved as two separate files; presumably a
# consumer maps index row i back to chunks[i], so refuse to write a mismatched
# pair (verify against the code that loads these files).
if vecs.ndim != 2 or vecs.shape[0] != len(chunks):
    raise SystemExit(
        f"Embedding count mismatch: got array of shape {vecs.shape} "
        f"for {len(chunks)} chunks."
    )

faiss.normalize_L2(vecs)  # cosine similarity wants unit vectors (done in place)

# ---------- build index ----------
dim = vecs.shape[1]
index = faiss.IndexFlatIP(dim)  # inner product == cosine when vectors norm-1
index.add(vecs)

# ---------- save ----------
faiss.write_index(index, OUT_INDEX)
Path(OUT_CHUNKS).write_text(json.dumps(chunks, ensure_ascii=False), encoding="utf8")
print(f"Built {index.ntotal} vectors → {OUT_INDEX}")