# pankaj
# SHL recommender — initial deploy
# 870800f
"""Embed all documents and persist to a Chroma collection at data/chroma/.
Run once. Re-run when the corpus or the embedding model changes.
Uses the same LangChain HuggingFaceEmbeddings wrapper as the live retrieval
path (`recsys.pipeline.get_embeddings`), so document and query vectors live
in the same space.
"""
from __future__ import annotations
import json
from pathlib import Path
import chromadb
from tqdm import tqdm
from recsys.pipeline import get_embeddings
# Project root: the parent of the directory that contains this script.
ROOT = Path(__file__).resolve().parent.parent
# Corpus to index, stored as JSON Lines (one document per line).
DOCS = ROOT.joinpath("data", "documents.jsonl")
# Directory in which the persistent Chroma store is kept.
DB_DIR = ROOT.joinpath("data", "chroma")
# Name of the Chroma collection that main() rebuilds.
COLLECTION = "shl_baseline"
def _load_documents(path: Path) -> list[dict]:
    """Parse a UTF-8 JSON Lines file into a list of decoded records.

    The file is iterated line by line instead of being split with
    ``str.splitlines()``. ``splitlines()`` also breaks on characters such as
    U+2028, U+2029 and U+0085, which may legally appear unescaped inside a
    JSON string; splitting there would cut a record in half and make
    ``json.loads`` fail. Lines that are empty or whitespace-only are skipped.
    """
    with path.open(encoding="utf-8") as fh:
        return [json.loads(line) for line in fh if line.strip()]


def main() -> None:
    """Rebuild the Chroma collection from the document corpus.

    Steps:
      1. Load every record from ``DOCS``.
      2. Drop any existing ``COLLECTION`` in the store at ``DB_DIR`` and
         create a fresh one configured for cosine distance.
      3. Embed the documents in batches and add them, together with their
         ids, metadata and raw text, to the collection.

    Each record is expected to carry the keys ``"id"``, ``"text"`` and
    ``"metadata"``; a missing key raises ``KeyError``.
    """
    docs = _load_documents(DOCS)
    print(f"loaded {len(docs)} documents")

    embedder = get_embeddings()
    # Stored in the collection metadata so the index records which model
    # produced its vectors; falls back to "unknown" if the wrapper does not
    # expose a model_name attribute.
    model_name = getattr(embedder, "model_name", "unknown")
    print(f"embedding model: {model_name}")

    client = chromadb.PersistentClient(path=str(DB_DIR))
    # Always start from an empty collection. Deleting is best-effort: on a
    # first run there is nothing to delete. The catch stays broad because the
    # exception raised for a missing collection is not the same in every
    # chromadb release, but the reason is reported instead of being dropped.
    # If the delete failed for a real reason, create_collection below will
    # surface it.
    try:
        client.delete_collection(COLLECTION)
    except Exception as exc:
        print(
            f"no existing collection '{COLLECTION}' removed "
            f"({type(exc).__name__}: {exc})"
        )
    coll = client.create_collection(
        name=COLLECTION,
        metadata={"hnsw:space": "cosine", "embed_model": model_name},
    )

    batch = 64
    for i in tqdm(range(0, len(docs), batch), desc="embed+index"):
        chunk = docs[i : i + batch]
        ids = [d["id"] for d in chunk]
        texts = [d["text"] for d in chunk]
        metas = [d["metadata"] for d in chunk]
        # embed_documents (no query prefix) for indexing
        embs = embedder.embed_documents(texts)
        coll.add(ids=ids, embeddings=embs, metadatas=metas, documents=texts)

    print(f"\ncollection '{COLLECTION}' has {coll.count()} items at {DB_DIR}")


if __name__ == "__main__":
    main()