# RAG-Pipeline-Optimizer / scripts/download_data.py
# Author: puji4ml — "hf dataset loader" (commit 7454e68, verified)
"""
scripts/download_data.py - Download ChromaDB data from HuggingFace Dataset
"""
import os
from pathlib import Path
from huggingface_hub import snapshot_download
import sys
def download_chromadb_data():
    """Download ChromaDB vector stores from a HuggingFace Dataset repo.

    Returns:
        bool: True when the data is available locally (either it already
        existed or was downloaded and verified); False when the download
        fails or any expected pipeline corpus directory is missing.
    """
    print("=" * 80)
    print("📥 Downloading ChromaDB data from HuggingFace Dataset...")
    print("=" * 80)

    # Target directories (relative to the current working directory).
    data_dir = Path("data")
    vector_stores_dir = data_dir / "vector_stores"

    # Skip the download entirely when all six pipeline corpora already exist.
    if vector_stores_dir.exists() and len(list(vector_stores_dir.glob("pipeline_*_corpus"))) >= 6:
        print("✅ ChromaDB data already exists locally. Skipping download.")
        return True

    # Create directories. parents=True makes this robust even if "data"
    # itself does not exist yet, so no separate mkdir for data_dir is needed.
    vector_stores_dir.mkdir(parents=True, exist_ok=True)

    try:
        # Public dataset repo holding the prebuilt ChromaDB vector stores.
        dataset_repo = "puji4ml/rag-pipeline-chromadb-data"
        print(f"📦 Downloading from: {dataset_repo}")
        print("⏳ This may take 5-10 minutes on first run...")

        # snapshot_download handles LFS files and resumes interrupted
        # downloads automatically. The formerly-passed `resume_download` and
        # `local_dir_use_symlinks` arguments are deprecated in current
        # huggingface_hub (resuming and real-file local dirs are the
        # defaults), so they are intentionally omitted.
        snapshot_download(
            repo_id=dataset_repo,
            repo_type="dataset",
            local_dir=".",  # Download into the current directory
        )

        print("✅ ChromaDB data downloaded successfully!")
        print("=" * 80)

        # Verify the download: every pipeline corpus must be present.
        # Report ALL missing corpora (not just the first) before failing,
        # so the log shows the complete picture.
        missing_any = False
        for pipeline in ["a", "b", "c", "d", "e", "f"]:
            corpus_path = vector_stores_dir / f"pipeline_{pipeline}_corpus"
            if corpus_path.exists():
                print(f"   ✓ Pipeline {pipeline.upper()} corpus found")
            else:
                print(f"   ✗ Pipeline {pipeline.upper()} corpus MISSING!")
                missing_any = True
        if missing_any:
            return False

        print("=" * 80)
        return True

    except Exception as e:
        # Broad catch is deliberate at this script boundary: report the
        # error and signal failure via the return value instead of crashing.
        print(f"❌ Error downloading data: {e}")
        print("=" * 80)
        return False
if __name__ == "__main__":
success = download_chromadb_data()
sys.exit(0 if success else 1)