# Hugging Face Spaces badge: Running on Zero (ZeroGPU)
"""
Download the public dataset from Hugging Face and save it to disk.

Example:
    python scripts/download_hf_dataset.py \
        --dataset ZhangNy/radiology-dataset \
        --split train \
        --output ./hf_dataset_prepared
"""
from __future__ import annotations

import argparse
import sys
from pathlib import Path

# Allow running as `python scripts/*.py` without installing the package.
sys.path.append(str(Path(__file__).resolve().parents[1]))
def main() -> int:
    """Download a Hugging Face dataset split and save it to disk.

    Parses CLI arguments, downloads the requested dataset split with
    `datasets.load_dataset`, and persists it with `save_to_disk`.

    Returns:
        0 on success (suitable as a process exit code).
    """
    parser = argparse.ArgumentParser(description="Download HF dataset to local disk")
    parser.add_argument("--dataset", type=str, default="ZhangNy/radiology-dataset", help="HF dataset repo id")
    parser.add_argument("--split", type=str, default="train", help="Dataset split")
    parser.add_argument("--output", type=str, default="./hf_dataset_prepared", help="Output directory (save_to_disk)")
    parser.add_argument("--cache-dir", type=str, default=None, help="Optional datasets cache dir")
    args = parser.parse_args()

    # Imported lazily so `--help` works even when `datasets` is not installed.
    from datasets import load_dataset

    out_dir = Path(args.output)
    # save_to_disk creates out_dir itself; only its parent must exist.
    out_dir.parent.mkdir(parents=True, exist_ok=True)

    ds = load_dataset(args.dataset, split=args.split, cache_dir=args.cache_dir)
    ds.save_to_disk(str(out_dir))
    # Fixed: the original message began with a mojibake character ("β "),
    # a mis-encoded glyph from an earlier edit; emit plain ASCII instead.
    print(f"Saved dataset to: {out_dir} (rows={len(ds)})")
    return 0
# Run as a script: propagate main()'s return value as the process exit code.
if __name__ == "__main__":
    sys.exit(main())