"""
Download and cache the Nemotron-Personas-USA dataset.

Downloads 1M synthetic US personas (~2GB) from HuggingFace.
Default location: <project root>/data/nemotron/

Only runs once — subsequent calls detect the cached dataset and skip.

Usage:
    uv run python scripts/setup_data.py
    uv run python scripts/setup_data.py --data-dir /custom/path
"""

import argparse
from pathlib import Path
from typing import Union

from datasets import load_dataset, load_from_disk

# This script is expected to live one directory below the project root
# (e.g. <root>/scripts/setup_data.py), hence the double ``parent``.
PROJECT_ROOT = Path(__file__).resolve().parent.parent
DEFAULT_DATA_DIR = PROJECT_ROOT / "data" / "nemotron"

# File whose presence in the data directory is treated as "already cached".
_CACHE_MARKER = "dataset_info.json"


def setup(data_dir: Union[str, Path] = DEFAULT_DATA_DIR):
    """Return the personas dataset, downloading and caching it if needed.

    If ``data_dir`` already contains ``dataset_info.json`` the dataset is
    loaded from disk; otherwise the ``train`` split of
    ``nvidia/Nemotron-Personas-USA`` is downloaded and saved to ``data_dir``
    (created, including parents, if missing). Progress is printed to stdout.

    Args:
        data_dir: Directory holding (or to hold) the cached dataset. May be a
            ``Path`` or a string; a leading ``~`` is expanded to the user's
            home directory.

    Returns:
        The dataset object returned by ``load_from_disk`` (cache hit) or
        ``load_dataset`` (fresh download).
    """
    # Normalize up front: callers may pass a plain string, and a "~" that the
    # shell did not expand (e.g. ``--data-dir=~/x``) would otherwise create a
    # literal "~" directory under the current working directory.
    data_dir = Path(data_dir).expanduser()

    if (data_dir / _CACHE_MARKER).exists():
        ds = load_from_disk(str(data_dir))
        print(f"Dataset already cached: {data_dir}")
        print(f" {len(ds)} personas, {len(ds.column_names)} fields")
        return ds

    print("Downloading nvidia/Nemotron-Personas-USA (1M rows, ~2GB)...")
    print("This only needs to happen once.\n")
    ds = load_dataset("nvidia/Nemotron-Personas-USA", split="train")

    # Create the target only after the download succeeded, so a failed
    # download does not leave an empty directory behind.
    data_dir.mkdir(parents=True, exist_ok=True)
    ds.save_to_disk(str(data_dir))

    print(f"\nSaved to {data_dir}")
    print(f" {len(ds)} personas, {len(ds.column_names)} fields")
    print(f" Columns: {ds.column_names}")
    return ds


if __name__ == "__main__":
    parser = argparse.ArgumentParser(
        description="Download and cache the Nemotron-Personas-USA dataset.",
    )
    parser.add_argument(
        "--data-dir",
        type=Path,
        default=DEFAULT_DATA_DIR,
        help="directory for the cached dataset (default: %(default)s)",
    )
    args = parser.parse_args()
    setup(args.data_dir)