Spaces:
Running
Running
File size: 1,513 Bytes
"""Push the verified SFT data to an HF dataset repo so Colab/HF-Jobs can pull it.

Usage:
    uv run training/build_dataset.py --n 2000 --out data/train.jsonl  # (re)generate
    huggingface-cli login  # HF write token
    uv run scripts/push_dataset.py --repo build-small-hackathon/scrubdata-sft
"""
from __future__ import annotations
import argparse
import sys
from pathlib import Path
# Dataset repo used when --repo is not given on the command line.
DEFAULT_REPO = "build-small-hackathon/scrubdata-sft"


def main(argv: list[str] | None = None) -> int:
    """Upload the local SFT JSONL file to a Hugging Face dataset repo.

    Args:
        argv: Command-line arguments to parse. ``None`` (the default) makes
            argparse read ``sys.argv[1:]``, so running the file as a script
            behaves exactly as before.

    Returns:
        0 after the upload call completes; 1 if the source path is missing,
        is not a regular file, is empty, or ``huggingface_hub`` cannot be
        imported.
    """
    ap = argparse.ArgumentParser()
    ap.add_argument("--repo", default=DEFAULT_REPO)
    ap.add_argument("--file", default="data/train.jsonl")
    ap.add_argument("--path-in-repo", default="train.jsonl")
    args = ap.parse_args(argv)

    src = Path(args.file)
    # is_file() rather than exists(): a directory exists and has a non-zero
    # st_size, so it would slip past the guard and only fail after the remote
    # repo had already been created.
    if not src.is_file() or src.stat().st_size == 0:
        print(f"No data at {src}. Run training/build_dataset.py first.", file=sys.stderr)
        return 1

    # Imported lazily so a missing dependency yields a short message and a
    # non-zero exit code instead of an import-time traceback.
    try:
        from huggingface_hub import HfApi
    except ImportError:
        print("huggingface_hub not installed.", file=sys.stderr)
        return 1

    api = HfApi()
    # exist_ok=True makes repeated pushes to the same repo safe.
    api.create_repo(args.repo, repo_type="dataset", exist_ok=True)
    api.upload_file(
        path_or_fileobj=str(src),
        path_in_repo=args.path_in_repo,
        repo_id=args.repo,
        repo_type="dataset",
        commit_message="Update ScrubData SFT data",
    )
    print(f"Pushed {src} → https://huggingface.co/datasets/{args.repo}")
    return 0
# Script entry point: hand main()'s return value to the interpreter as the
# process exit status.
if __name__ == "__main__":
    sys.exit(main())
|