""" scripts/download_data.py - Download ChromaDB data from HuggingFace Dataset """ import os from pathlib import Path from huggingface_hub import snapshot_download import sys def download_chromadb_data(): """Download ChromaDB vector stores from HuggingFace Dataset""" print("=" * 80) print("📥 Downloading ChromaDB data from HuggingFace Dataset...") print("=" * 80) # Target directories data_dir = Path("data") vector_stores_dir = data_dir / "vector_stores" # Check if data already exists if vector_stores_dir.exists() and len(list(vector_stores_dir.glob("pipeline_*_corpus"))) >= 6: print("✅ ChromaDB data already exists locally. Skipping download.") return True # Create directories data_dir.mkdir(exist_ok=True) vector_stores_dir.mkdir(exist_ok=True) try: # Download from HuggingFace Dataset # Replace YOUR_HF_USERNAME with your actual username dataset_repo = "puji4ml/rag-pipeline-chromadb-data" print(f"📦 Downloading from: {dataset_repo}") print("⏳ This may take 5-10 minutes on first run...") # Download using snapshot_download (handles LFS automatically) snapshot_download( repo_id=dataset_repo, repo_type="dataset", local_dir=".", # Download to current directory local_dir_use_symlinks=False, resume_download=True ) print("✅ ChromaDB data downloaded successfully!") print("=" * 80) # Verify download pipelines = ["a", "b", "c", "d", "e", "f"] for pipeline in pipelines: corpus_path = vector_stores_dir / f"pipeline_{pipeline}_corpus" if corpus_path.exists(): print(f" ✓ Pipeline {pipeline.upper()} corpus found") else: print(f" ✗ Pipeline {pipeline.upper()} corpus MISSING!") return False print("=" * 80) return True except Exception as e: print(f"❌ Error downloading data: {e}") print("=" * 80) return False if __name__ == "__main__": success = download_chromadb_data() sys.exit(0 if success else 1)