Spaces:
Sleeping
Sleeping
| """Embed all documents and persist to a Chroma collection at data/chroma/. | |
| Run once. Re-run when the corpus or the embedding model changes. | |
| Uses the same LangChain HuggingFaceEmbeddings wrapper as the live retrieval | |
| path (`recsys.pipeline.get_embeddings`), so document and query vectors live | |
| in the same space. | |
| """ | |
| from __future__ import annotations | |
| import json | |
| from pathlib import Path | |
| import chromadb | |
| from tqdm import tqdm | |
| from recsys.pipeline import get_embeddings | |
# Base directory: the parent of the directory that contains this script.
ROOT = Path(__file__).resolve().parent.parent

# Input corpus in JSON Lines form, and the directory of the persistent Chroma store.
DOCS = ROOT.joinpath("data", "documents.jsonl")
DB_DIR = ROOT.joinpath("data", "chroma")

# Name of the Chroma collection that main() drops and rebuilds.
COLLECTION = "shl_baseline"
def main() -> None:
    """Embed every document in ``DOCS`` and rebuild the Chroma collection.

    Steps:
      1. Parse ``DOCS`` as JSON Lines (one JSON value per non-blank line).
      2. Drop any existing ``COLLECTION`` in the persistent store at
         ``DB_DIR`` and create it again, storing the embedder's
         ``model_name`` (or ``"unknown"``) in the collection metadata.
      3. Embed the documents in batches of 64 and add ids, embeddings,
         metadata and raw text to the collection.

    Each record must provide the keys ``id``, ``text`` and ``metadata``.

    Raises:
        OSError: if ``DOCS`` cannot be opened (e.g. ``FileNotFoundError``).
        json.JSONDecodeError: if a non-blank line is not valid JSON.
        KeyError: if a record lacks ``id``, ``text`` or ``metadata``.
    """
    # Iterate the file object instead of calling str.splitlines(): JSON allows
    # raw U+2028 / U+2029 / U+0085 inside strings, and splitlines() treats
    # those as line boundaries, which would cut a valid record in half.
    with DOCS.open(encoding="utf-8") as fh:
        docs = [json.loads(line) for line in fh if line.strip()]
    print(f"loaded {len(docs)} documents")

    embedder = get_embeddings()
    model_name = getattr(embedder, "model_name", "unknown")
    print(f"embedding model: {model_name}")

    client = chromadb.PersistentClient(path=str(DB_DIR))
    try:
        client.delete_collection(COLLECTION)
    except Exception as exc:
        # Still best-effort (on a first run there is nothing to delete), but
        # report the reason instead of hiding it: if the drop failed for a
        # real reason, create_collection() below fails next and this message
        # shows the underlying cause.
        # NOTE(review): the exception type raised for a missing collection
        # appears to differ between chromadb releases, hence the broad catch.
        print(f"skipped dropping collection '{COLLECTION}': {exc!r}")

    coll = client.create_collection(
        name=COLLECTION,
        metadata={"hnsw:space": "cosine", "embed_model": model_name},
    )

    batch = 64
    for i in tqdm(range(0, len(docs), batch), desc="embed+index"):
        chunk = docs[i : i + batch]
        ids = [d["id"] for d in chunk]
        texts = [d["text"] for d in chunk]
        metas = [d["metadata"] for d in chunk]
        # embed_documents (no query prefix) for indexing
        embs = embedder.embed_documents(texts)
        coll.add(ids=ids, embeddings=embs, metadatas=metas, documents=texts)

    print(f"\ncollection '{COLLECTION}' has {coll.count()} items at {DB_DIR}")
# Build the index only when executed as a script; importing this module has
# no side effects beyond defining the constants and main().
if __name__ == "__main__":
    main()