# Source: Thoracic-Radiology-RAG-System / scripts/download_hf_dataset.py
# Author: ZhangNy
# Commit: 75db650 ("Add Space app files")
"""
Download the public dataset from Hugging Face and save it to disk.
Example:
python scripts/download_hf_dataset.py \
--dataset ZhangNy/radiology-dataset \
--split train \
--output ./hf_dataset_prepared
"""
from __future__ import annotations
import argparse
import sys
from pathlib import Path
# Make the directory two levels above this file importable, so the script can
# be launched directly as `python scripts/<name>.py` without installing the
# package first.
_REPO_ROOT = Path(__file__).resolve().parents[1]
sys.path.append(str(_REPO_ROOT))
def main() -> int:
parser = argparse.ArgumentParser(description="Download HF dataset to local disk")
parser.add_argument("--dataset", type=str, default="ZhangNy/radiology-dataset", help="HF dataset repo id")
parser.add_argument("--split", type=str, default="train", help="Dataset split")
parser.add_argument("--output", type=str, default="./hf_dataset_prepared", help="Output directory (save_to_disk)")
parser.add_argument("--cache-dir", type=str, default=None, help="Optional datasets cache dir")
args = parser.parse_args()
from datasets import load_dataset
out_dir = Path(args.output)
out_dir.parent.mkdir(parents=True, exist_ok=True)
ds = load_dataset(args.dataset, split=args.split, cache_dir=args.cache_dir)
ds.save_to_disk(str(out_dir))
print(f"βœ“ Saved dataset to: {out_dir} (rows={len(ds)})")
return 0
# Script entry point: run the CLI and use main()'s return value as the
# process exit status.
if __name__ == "__main__":
    sys.exit(main())