"""Embed all documents and persist to a Chroma collection at data/chroma/. Run once. Re-run when the corpus or the embedding model changes. Uses the same LangChain HuggingFaceEmbeddings wrapper as the live retrieval path (`recsys.pipeline.get_embeddings`), so document and query vectors live in the same space. """ from __future__ import annotations import json from pathlib import Path import chromadb from tqdm import tqdm from recsys.pipeline import get_embeddings ROOT = Path(__file__).resolve().parent.parent DOCS = ROOT / "data" / "documents.jsonl" DB_DIR = ROOT / "data" / "chroma" COLLECTION = "shl_baseline" def main() -> None: docs = [json.loads(l) for l in DOCS.read_text(encoding="utf-8").splitlines() if l.strip()] print(f"loaded {len(docs)} documents") embedder = get_embeddings() model_name = getattr(embedder, "model_name", "unknown") print(f"embedding model: {model_name}") client = chromadb.PersistentClient(path=str(DB_DIR)) try: client.delete_collection(COLLECTION) except Exception: pass coll = client.create_collection( name=COLLECTION, metadata={"hnsw:space": "cosine", "embed_model": model_name}, ) batch = 64 for i in tqdm(range(0, len(docs), batch), desc="embed+index"): chunk = docs[i : i + batch] ids = [d["id"] for d in chunk] texts = [d["text"] for d in chunk] metas = [d["metadata"] for d in chunk] # embed_documents (no query prefix) for indexing embs = embedder.embed_documents(texts) coll.add(ids=ids, embeddings=embs, metadatas=metas, documents=texts) print(f"\ncollection '{COLLECTION}' has {coll.count()} items at {DB_DIR}") if __name__ == "__main__": main()