# sgo/scripts/setup_data.py
# Eric Xu
# Remove all hardcoded paths, make skill portable
# 74851ea
"""
Download and cache the Nemotron-Personas-USA dataset.

Downloads 1M synthetic US personas (~2GB) from HuggingFace.
Default location: <project_root>/data/nemotron/
The download only happens once — subsequent calls detect the cached
dataset and skip it.

Usage:
    uv run python scripts/setup_data.py
    uv run python scripts/setup_data.py --data-dir /custom/path
"""
import argparse
from pathlib import Path
from datasets import load_dataset, load_from_disk
# Two levels above this file: its own directory, then that directory's parent.
# Resolved from __file__ so the script works regardless of the caller's CWD.
PROJECT_ROOT = Path(__file__).resolve().parent.parent

# Where the dataset is cached unless --data-dir (or setup's argument) overrides it.
DEFAULT_DATA_DIR = PROJECT_ROOT.joinpath("data", "nemotron")
def setup(data_dir: "Path | str" = DEFAULT_DATA_DIR):
    """Return the Nemotron-Personas-USA dataset, downloading it if not cached.

    If ``data_dir`` already contains a ``dataset_info.json`` file, the dataset
    is loaded from that directory with ``load_from_disk``. Otherwise the
    ``train`` split of ``nvidia/Nemotron-Personas-USA`` is fetched with
    ``load_dataset``, written to ``data_dir`` with ``save_to_disk``, and
    returned. A short summary is printed in both cases.

    Args:
        data_dir: Directory used as the on-disk cache. A ``Path`` or a plain
            string path is accepted; it is created (with parents) when the
            dataset has to be downloaded.

    Returns:
        The dataset object produced by ``load_from_disk`` or ``load_dataset``.

    Note:
        Errors raised by the ``datasets`` calls or by directory creation are
        not caught here and propagate to the caller.
    """
    # Coerce so programmatic callers may pass a string path; the ``/``
    # operator and ``mkdir`` below require a Path. A Path input is unchanged.
    data_dir = Path(data_dir)

    # The presence of dataset_info.json is the marker for "already saved".
    if (data_dir / "dataset_info.json").exists():
        ds = load_from_disk(str(data_dir))
        print(f"Dataset already cached: {data_dir}")
        print(f" {len(ds)} personas, {len(ds.column_names)} fields")
        return ds

    print("Downloading nvidia/Nemotron-Personas-USA (1M rows, ~2GB)...")
    print("This only needs to happen once.\n")
    ds = load_dataset("nvidia/Nemotron-Personas-USA", split="train")

    # Persist to the chosen directory so the next call takes the cached branch.
    data_dir.mkdir(parents=True, exist_ok=True)
    ds.save_to_disk(str(data_dir))

    print(f"\nSaved to {data_dir}")
    print(f" {len(ds)} personas, {len(ds.column_names)} fields")
    print(f" Columns: {ds.column_names}")
    return ds
if __name__ == "__main__":
    # Command-line entry point: the only option is where to cache the dataset.
    cli = argparse.ArgumentParser()
    cli.add_argument("--data-dir", type=Path, default=DEFAULT_DATA_DIR)
    # argparse exposes --data-dir as the ``data_dir`` attribute.
    setup(cli.parse_args().data_dir)