File size: 2,349 Bytes
7454e68
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
"""

scripts/download_data.py - Download ChromaDB data from HuggingFace Dataset

"""

import os
from pathlib import Path
from huggingface_hub import snapshot_download
import sys

def download_chromadb_data():
    """Download ChromaDB vector stores from the HuggingFace Dataset repo.

    Skips the download when all six pipeline corpora already exist under
    ``data/vector_stores/``.  After downloading, verifies that each expected
    corpus directory (``pipeline_a_corpus`` .. ``pipeline_f_corpus``) is
    present.

    Returns:
        bool: True when the data is available locally (either pre-existing
        or freshly downloaded and verified), False on a download error or a
        missing corpus directory.
    """
    print("=" * 80)
    print("📥 Downloading ChromaDB data from HuggingFace Dataset...")
    print("=" * 80)

    # Expected layout: data/vector_stores/pipeline_<x>_corpus for each pipeline.
    data_dir = Path("data")
    vector_stores_dir = data_dir / "vector_stores"
    pipelines = ["a", "b", "c", "d", "e", "f"]

    # Fast path: every corpus already present -> nothing to download.
    if vector_stores_dir.exists() and len(list(vector_stores_dir.glob("pipeline_*_corpus"))) >= len(pipelines):
        print("✅ ChromaDB data already exists locally. Skipping download.")
        return True

    # Ensure the target directories exist before downloading into them.
    data_dir.mkdir(exist_ok=True)
    vector_stores_dir.mkdir(exist_ok=True)

    # Dataset repo that mirrors this project's data/ layout.
    dataset_repo = "puji4ml/rag-pipeline-chromadb-data"

    try:
        print(f"📦 Downloading from: {dataset_repo}")
        print("⏳ This may take 5-10 minutes on first run...")

        # snapshot_download handles Git-LFS files and resumes interrupted
        # transfers automatically.  The formerly-passed kwargs
        # `resume_download=True` and `local_dir_use_symlinks=False` are
        # deprecated no-ops in recent huggingface_hub and only emitted
        # FutureWarnings, so they were dropped.
        snapshot_download(
            repo_id=dataset_repo,
            repo_type="dataset",
            local_dir=".",  # repo layout mirrors this project, so "." is correct
        )

        print("✅ ChromaDB data downloaded successfully!")
        print("=" * 80)

        # Verify each expected pipeline corpus directory actually arrived.
        for pipeline in pipelines:
            corpus_path = vector_stores_dir / f"pipeline_{pipeline}_corpus"
            if corpus_path.exists():
                print(f"  ✓ Pipeline {pipeline.upper()} corpus found")
            else:
                print(f"  ✗ Pipeline {pipeline.upper()} corpus MISSING!")
                return False

        print("=" * 80)
        return True

    except Exception as e:
        # Broad catch is deliberate at this script-level boundary: report the
        # failure and signal it via the return value instead of a traceback.
        print(f"❌ Error downloading data: {e}")
        print("=" * 80)
        return False

if __name__ == "__main__":
    success = download_chromadb_data()
    sys.exit(0 if success else 1)