"""One-time ChromaDB indexing of the item catalog. Runs SBERT (all-MiniLM-L6-v2) over every item's title + description and stores the embedding in ChromaDB under data/chroma/. The collection is persistent — subsequent runs skip indexing if it already exists. After this runs, Task B's retrieval layer can do semantic search: "thoughtful literary fiction about identity" → 50 nearest items Usage: python -m scripts.build_index python -m scripts.build_index --force # rebuild from scratch """ from __future__ import annotations import argparse import logging import pandas as pd from core.config import settings from core.retrieval import ItemRetriever logging.basicConfig(level=logging.INFO, format="%(asctime)s [%(levelname)s] %(message)s") log = logging.getLogger(__name__) def main(): ap = argparse.ArgumentParser() ap.add_argument("--force", action="store_true", help="Rebuild the index from scratch (default: skip if already built)") args = ap.parse_args() items_path = settings.processed_dir / "items.parquet" if not items_path.exists(): raise SystemExit(f"Items file not found at {items_path}. Run `python data/prepare_data.py` first.") items = pd.read_parquet(items_path) log.info(f"Loaded {len(items):,} items") retriever = ItemRetriever() log.info(f"Building index at {retriever.persist_dir}") retriever.build_index(items, batch_size=256, force=args.force) # Quick smoke test log.info("Smoke testing retrieval...") results = retriever.retrieve("thoughtful literary fiction about identity", k=5) log.info(f"Test query returned {len(results)} results:") for r in results: log.info(f" [{r.distance:.3f}] {r.domain} | {r.title[:80]}") if __name__ == "__main__": main()