File size: 1,492 Bytes
773c256
 
 
 
 
 
 
c429a2d
773c256
 
 
 
c429a2d
773c256
 
 
 
c429a2d
 
773c256
 
c429a2d
 
773c256
 
 
 
 
 
 
 
c429a2d
 
773c256
c429a2d
 
773c256
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
from pathlib import Path
import os

def download_data():
    """Download the project dataset from the HuggingFace Hub into ``data/``.

    The dataset repository id is read from the ``HF_DATA_REPO`` environment
    variable and falls back to ``hungnha/do_an_tot_nghiep``. When ``data/``
    is already a non-empty directory the function prints a notice and returns
    without importing ``huggingface_hub`` or touching the network.

    If ``huggingface_hub`` is not importable, it is installed with pip for the
    interpreter that is currently running, and the import is retried.

    Returns:
        None.

    Raises:
        ImportError: ``huggingface_hub`` is missing and could not be
            installed automatically.
        Exception: any error raised by ``snapshot_download`` is re-raised
            unchanged after troubleshooting tips have been printed.
    """
    data_path = Path("data")

    # Check for existing data first so that a populated folder never triggers
    # a package install. ``is_dir`` (rather than ``exists``) keeps ``iterdir``
    # from raising when a regular file happens to be named "data".
    if data_path.is_dir() and any(data_path.iterdir()):
        print("Data folder already exists. Skipping download.")
        print("To re-download, delete the 'data/' folder first.")
        return

    try:
        from huggingface_hub import snapshot_download
    except ImportError:
        # Only needed on this fallback path, so imported locally.
        import importlib
        import subprocess
        import sys

        print("huggingface_hub not installed. Installing...")
        try:
            # ``sys.executable -m pip`` targets the running interpreter's
            # environment; a bare ``pip`` on PATH may belong to another one.
            subprocess.check_call(
                [sys.executable, "-m", "pip", "install", "huggingface_hub"]
            )
        except (subprocess.CalledProcessError, OSError) as err:
            raise ImportError(
                "Could not install huggingface_hub automatically. "
                "Install it manually with: pip install huggingface_hub"
            ) from err
        # Make the import system notice the freshly installed package.
        importlib.invalidate_caches()
        from huggingface_hub import snapshot_download

    # Configuration - CHANGE THIS to your HuggingFace repo
    HF_REPO_ID = os.getenv("HF_DATA_REPO", "hungnha/do_an_tot_nghiep")

    print(f"Downloading data from HuggingFace: {HF_REPO_ID}")
    print("This may take a few minutes...")

    try:
        snapshot_download(
            repo_id=HF_REPO_ID,
            repo_type="dataset",
            local_dir=str(data_path),
            # Download actual files, not symlinks.
            # NOTE(review): newer huggingface_hub releases appear to deprecate
            # this flag - confirm against the installed version.
            local_dir_use_symlinks=False,
        )
    except Exception as e:
        print(f"Error downloading data: {e}")
        print("\nTips:")
        print("   1. Make sure the HF_DATA_REPO environment variable is set correctly")
        print("   2. Or update HF_REPO_ID in this script")
        print("   3. If repo is private, run: huggingface-cli login")
        raise
    else:
        print("Download complete!")
        print(f"Data saved to: {data_path.absolute()}")

# Run the download only when this file is executed as a script, not when it
# is imported as a module.
if __name__ == "__main__":
    download_data()