Spaces:
Sleeping
Sleeping
"""
scripts/download_data.py - Download ChromaDB data from a HuggingFace Dataset.
"""
| import os | |
| from pathlib import Path | |
| from huggingface_hub import snapshot_download | |
| import sys | |
def download_chromadb_data(
    dataset_repo: str = "puji4ml/rag-pipeline-chromadb-data",
) -> bool:
    """Download ChromaDB vector stores from a HuggingFace Dataset.

    The download is skipped when every expected pipeline corpus already
    exists under ``data/vector_stores`` (relative to the current working
    directory). Otherwise the dataset snapshot is downloaded into the
    current directory and the expected corpora are verified.

    Args:
        dataset_repo: HuggingFace dataset repository ID to download from.

    Returns:
        True if all six pipeline corpora (A-F) are present, either because
        they already existed or because the download produced them; False
        if the download raised or any corpus is still missing afterwards.
    """
    rule = "=" * 80
    print(rule)
    print("📥 Downloading ChromaDB data from HuggingFace Dataset...")
    print(rule)

    # Target directories
    data_dir = Path("data")
    vector_stores_dir = data_dir / "vector_stores"

    # One path per expected corpus; used for both the skip check and the
    # post-download verification so the two can never disagree.
    pipelines = ("a", "b", "c", "d", "e", "f")
    corpus_paths = {
        pipeline: vector_stores_dir / f"pipeline_{pipeline}_corpus"
        for pipeline in pipelines
    }

    # Skip only when every expected corpus is present. Counting glob matches
    # would also accept six unrelated "pipeline_*_corpus" entries.
    if all(path.exists() for path in corpus_paths.values()):
        print("✅ ChromaDB data already exists locally. Skipping download.")
        return True

    # Create directories
    data_dir.mkdir(exist_ok=True)
    vector_stores_dir.mkdir(exist_ok=True)

    try:
        print(f"📦 Downloading from: {dataset_repo}")
        print("⏳ This may take 5-10 minutes on first run...")

        # NOTE(review): recent huggingface_hub releases deprecate
        # `local_dir_use_symlinks` and `resume_download` -- confirm against
        # the pinned version before removing them.
        snapshot_download(
            repo_id=dataset_repo,
            repo_type="dataset",
            local_dir=".",  # Download to current directory
            local_dir_use_symlinks=False,
            resume_download=True,
        )
    except Exception as e:
        # Top-level boundary of a CLI script: report the failure and let the
        # caller turn the False result into a non-zero exit code.
        print(f"❌ Error downloading data: {e}")
        print(rule)
        return False

    print("✅ ChromaDB data downloaded successfully!")
    print(rule)

    # Verify download: report every corpus rather than stopping at the
    # first missing one, so a single run shows the full picture.
    all_found = True
    for pipeline, corpus_path in corpus_paths.items():
        if corpus_path.exists():
            print(f" ✓ Pipeline {pipeline.upper()} corpus found")
        else:
            print(f" ✗ Pipeline {pipeline.upper()} corpus MISSING!")
            all_found = False
    print(rule)
    return all_found
if __name__ == "__main__":
    # Script entry point: the process exit status mirrors the download
    # result (0 when the data is available, 1 otherwise).
    exit_code = 0 if download_chromadb_data() else 1
    sys.exit(exit_code)