File size: 766 Bytes
fb11af9
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
import argparse

from huggingface_hub import snapshot_download


"""
python3 scripts/download_hf_data.py --repo_id HuggingFaceFW/fineweb --local_dir ./fineweb/ --allow_patterns 'sample/10BT/*'
"""


def build_parser() -> argparse.ArgumentParser:
    """Build the command-line parser for the dataset download script.

    Options:
        --repo_id: Hub dataset repository to download from.
        --local_dir: directory the files are written to.
        --allow_patterns: zero or more glob patterns; when given, they are
            forwarded to ``snapshot_download`` as a list. Omitting the option
            leaves the value as ``None`` (no pattern filter is passed).
    """
    parser = argparse.ArgumentParser(
        description="Download a dataset snapshot from the Hugging Face Hub."
    )
    parser.add_argument(
        "--repo_id",
        type=str,
        default="HuggingFaceFW/fineweb",
        help="dataset repository id on the Hub",
    )
    parser.add_argument(
        "--local_dir",
        type=str,
        default="./fineweb/",
        help="directory to download the files into",
    )
    # nargs="+" keeps the single-pattern invocation working while also
    # accepting several patterns (or a glob the shell already expanded).
    parser.add_argument(
        "--allow_patterns",
        type=str,
        nargs="+",
        default=None,
        help="one or more glob patterns to download, e.g. 'sample/10BT/*'",
    )
    return parser


def main(argv=None) -> None:
    """Parse the command line and download the requested dataset files.

    Args:
        argv: argument list to parse; ``None`` makes argparse read
            ``sys.argv[1:]``.
    """
    args = build_parser().parse_args(argv)

    snapshot_download(
        args.repo_id,
        repo_type="dataset",
        local_dir=args.local_dir,
        allow_patterns=args.allow_patterns,
    )


if __name__ == "__main__":
    main()