# Recommendation-Agent / scripts / build_index.py
# Uploaded by Israelbliz ("Upload scripts")
# Commit a971a56 (verified)
"""One-time ChromaDB indexing of the item catalog.
Runs SBERT (all-MiniLM-L6-v2) over every item's title + description and
stores the embedding in ChromaDB under data/chroma/. The collection is
persistent — subsequent runs skip indexing if it already exists.
After this runs, Task B's retrieval layer can do semantic search:
"thoughtful literary fiction about identity" → 50 nearest items
Usage:
python -m scripts.build_index
python -m scripts.build_index --force # rebuild from scratch
"""
from __future__ import annotations
import argparse
import logging
import pandas as pd
from core.config import settings
from core.retrieval import ItemRetriever
# Configure root logging once for this script so progress messages carry a
# timestamp and level, then grab a module-scoped logger for main() to use.
logging.basicConfig(
    level=logging.INFO,
    format="%(asctime)s [%(levelname)s] %(message)s",
)
log = logging.getLogger(__name__)
def main():
    """Build the item index from the command line and smoke-test retrieval.

    Parses the ``--force`` flag, reads ``items.parquet`` from
    ``settings.processed_dir``, passes the resulting frame to
    ``ItemRetriever.build_index`` (``batch_size=256``), and finally runs one
    sample query with ``k=5``, logging each returned result.

    Raises:
        SystemExit: If the items parquet file does not exist.
    """
    parser = argparse.ArgumentParser()
    parser.add_argument(
        "--force",
        action="store_true",
        help="Rebuild the index from scratch (default: skip if already built)",
    )
    cli_args = parser.parse_args()

    # Bail out early with an actionable message when the input is missing.
    items_path = settings.processed_dir / "items.parquet"
    if not items_path.exists():
        message = f"Items file not found at {items_path}. Run `python data/prepare_data.py` first."
        raise SystemExit(message)

    items = pd.read_parquet(items_path)
    log.info(f"Loaded {len(items):,} items")

    retriever = ItemRetriever()
    log.info(f"Building index at {retriever.persist_dir}")
    rebuild = cli_args.force
    retriever.build_index(items, batch_size=256, force=rebuild)

    # Quick smoke test: one sample query, log whatever comes back.
    log.info("Smoke testing retrieval...")
    smoke_query = "thoughtful literary fiction about identity"
    results = retriever.retrieve(smoke_query, k=5)
    log.info(f"Test query returned {len(results)} results:")
    for r in results:
        log.info(f" [{r.distance:.3f}] {r.domain} | {r.title[:80]}")
# Run the indexer only when executed as a script (e.g. via
# `python -m scripts.build_index`), not when the module is imported.
if __name__ == "__main__":
    main()