# Scraped from a HuggingFace Space (status: Sleeping) - file size: 2,349 bytes, commit 7454e68.
"""
scripts/download_data.py - Download ChromaDB data from HuggingFace Dataset
"""
import os
from pathlib import Path
from huggingface_hub import snapshot_download
import sys
def download_chromadb_data():
    """Download the ChromaDB vector stores from the HuggingFace dataset repo.

    The download is skipped when every expected ``pipeline_<x>_corpus`` entry
    (pipelines a-f) already exists under ``data/vector_stores``. Otherwise the
    dataset snapshot is fetched into the current working directory and the
    same expected entries are verified afterwards.

    Returns:
        bool: ``True`` when all six corpora are present (already local or
        freshly downloaded); ``False`` when the download raised or at least
        one corpus is still missing. Download errors are printed, not raised.
    """
    banner = "=" * 80
    print(banner)
    print("📥 Downloading ChromaDB data from HuggingFace Dataset...")
    print(banner)

    vector_stores_dir = Path("data") / "vector_stores"
    # One source of truth for both the "already downloaded?" check and the
    # post-download verification, so the two can never disagree.
    expected_corpora = {
        pipeline: vector_stores_dir / f"pipeline_{pipeline}_corpus"
        for pipeline in ("a", "b", "c", "d", "e", "f")
    }

    if all(path.exists() for path in expected_corpora.values()):
        print("✅ ChromaDB data already exists locally. Skipping download.")
        return True

    # Creates "data" as well when it does not exist yet.
    vector_stores_dir.mkdir(parents=True, exist_ok=True)

    try:
        dataset_repo = "puji4ml/rag-pipeline-chromadb-data"
        print(f"📦 Downloading from: {dataset_repo}")
        print("⏳ This may take 5-10 minutes on first run...")

        # NOTE(review): recent huggingface_hub releases deprecate
        # `local_dir_use_symlinks` and `resume_download`; confirm against the
        # pinned version before dropping them.
        snapshot_download(
            repo_id=dataset_repo,
            repo_type="dataset",
            local_dir=".",  # Download into the current working directory
            local_dir_use_symlinks=False,
            resume_download=True,
        )

        print("✅ ChromaDB data downloaded successfully!")
        print(banner)

        # Report every corpus instead of stopping at the first gap, so a
        # partial download is fully visible in the log.
        all_present = True
        for pipeline, corpus_path in expected_corpora.items():
            if corpus_path.exists():
                print(f" ✓ Pipeline {pipeline.upper()} corpus found")
            else:
                print(f" ✗ Pipeline {pipeline.upper()} corpus MISSING!")
                all_present = False

        print(banner)
        return all_present
    except Exception as e:
        # Top-level boundary of a CLI script: report and signal failure via
        # the return value rather than a traceback.
        print(f"❌ Error downloading data: {e}")
        print(banner)
        return False
if __name__ == "__main__":
    # Map the boolean outcome onto a process exit status: 0 on success, 1 on failure.
    exit_status = 0 if download_chromadb_data() else 1
    sys.exit(exit_status)
|