Spaces:
Running
Running
| """One-time ChromaDB indexing of the item catalog. | |
| Runs SBERT (all-MiniLM-L6-v2) over every item's title + description and | |
| stores the embedding in ChromaDB under data/chroma/. The collection is | |
| persistent — subsequent runs skip indexing if it already exists. | |
| After this runs, Task B's retrieval layer can do semantic search: | |
| "thoughtful literary fiction about identity" → 50 nearest items | |
| Usage: | |
| python -m scripts.build_index | |
| python -m scripts.build_index --force # rebuild from scratch | |
| """ | |
| from __future__ import annotations | |
| import argparse | |
| import logging | |
| import pandas as pd | |
| from core.config import settings | |
| from core.retrieval import ItemRetriever | |
# Script-wide logging: INFO and above, each record prefixed with a timestamp
# and its level name.
_LOG_FORMAT = "%(asctime)s [%(levelname)s] %(message)s"
logging.basicConfig(format=_LOG_FORMAT, level=logging.INFO)

# Module-level logger used by everything in this script.
log = logging.getLogger(__name__)
def main():
    """Build the item index from the processed catalog, then smoke-test it.

    Parses the command line (a single optional ``--force`` flag), loads
    ``items.parquet`` from ``settings.processed_dir``, hands the resulting
    frame to ``ItemRetriever.build_index`` and finally runs one sample query,
    logging the hits that come back.

    Raises:
        SystemExit: if the processed items file does not exist.
    """
    parser = argparse.ArgumentParser()
    parser.add_argument(
        "--force",
        action="store_true",
        help="Rebuild the index from scratch (default: skip if already built)",
    )
    cli_args = parser.parse_args()

    # Bail out early with an actionable message when the input is missing.
    items_path = settings.processed_dir / "items.parquet"
    if not items_path.exists():
        raise SystemExit(f"Items file not found at {items_path}. Run `python data/prepare_data.py` first.")

    items = pd.read_parquet(items_path)
    log.info(f"Loaded {len(items):,} items")

    retriever = ItemRetriever()
    log.info(f"Building index at {retriever.persist_dir}")
    # The --force flag is forwarded unchanged to the retriever.
    retriever.build_index(items, batch_size=256, force=cli_args.force)

    # Quick smoke test: issue one fixed query and log whatever comes back.
    smoke_query = "thoughtful literary fiction about identity"
    log.info("Smoke testing retrieval...")
    results = retriever.retrieve(smoke_query, k=5)
    log.info(f"Test query returned {len(results)} results:")
    for r in results:
        # Titles are cut to 80 characters to keep each log line short.
        log.info(f" [{r.distance:.3f}] {r.domain} | {r.title[:80]}")


if __name__ == "__main__":
    # Run only when executed as a script or via ``python -m``, not on import.
    main()