"""
Download and cache the Nemotron-Personas-USA dataset.

Downloads the 1M-row synthetic US persona dataset (~2 GB) from the Hugging Face Hub.
Default location: <project_root>/data/nemotron/
The download only happens once; subsequent calls detect the cached dataset and skip it.

Usage:
    uv run python scripts/setup_data.py
    uv run python scripts/setup_data.py --data-dir /custom/path
"""

import argparse
from pathlib import Path
from datasets import load_dataset, load_from_disk

PROJECT_ROOT = Path(__file__).resolve().parent.parent
DEFAULT_DATA_DIR = PROJECT_ROOT / "data" / "nemotron"


def setup(data_dir: Path = DEFAULT_DATA_DIR):
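    # save_to_disk() writes dataset_info.json alongside the Arrow shards,
    # so its presence marks a previously completed download.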
    if (data_dir / "dataset_info.json").exists():
        ds = load_from_disk(str(data_dir))
        print(f"Dataset already cached: {data_dir}")
        print(f"  {len(ds)} personas, {len(ds.column_names)} fields")
        return ds

    print("Downloading nvidia/Nemotron-Personas-USA (1M rows, ~2GB)...")
    print("This only needs to happen once.\n")

    ds = load_dataset("nvidia/Nemotron-Personas-USA", split="train")
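    # Persist an Arrow copy locally so future runs can use load_from_disk()
    # instead of re-downloading from the Hub.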
    data_dir.mkdir(parents=True, exist_ok=True)
    ds.save_to_disk(str(data_dir))

    print(f"\nSaved to {data_dir}")
    print(f"  {len(ds)} personas, {len(ds.column_names)} fields")
    print(f"  Columns: {ds.column_names}")
    return ds


if __name__ == "__main__":
    parser = argparse.ArgumentParser(description=__doc__)
    parser.add_argument("--data-dir", type=Path, default=DEFAULT_DATA_DIR,
                        help="where to cache the dataset (default: %(default)s)")
    args = parser.parse_args()
    setup(args.data_dir)
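
# Downstream usage (a sketch, not part of this script): once cached, other
# code can load the dataset directly from disk without re-downloading.
# Field contents are not guaranteed here; check ds.column_names for the
# actual schema.
#
#     from datasets import load_from_disk
#     ds = load_from_disk("data/nemotron")
#     print(ds.column_names)  # inspect the available fields
#     print(ds[0])            # first persona record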