Spaces:
Running
Running
| """Push the verified SFT data to an HF dataset repo so Colab/HF-Jobs can pull it. | |
| uv run training/build_dataset.py --n 2000 --out data/train.jsonl # (re)generate | |
| huggingface-cli login # HF write token | |
| uv run scripts/push_dataset.py --repo build-small-hackathon/scrubdata-sft | |
| """ | |
| from __future__ import annotations | |
| import argparse | |
| import sys | |
| from pathlib import Path | |
| DEFAULT_REPO = "build-small-hackathon/scrubdata-sft" | |
| def main() -> int: | |
| ap = argparse.ArgumentParser() | |
| ap.add_argument("--repo", default=DEFAULT_REPO) | |
| ap.add_argument("--file", default="data/train.jsonl") | |
| ap.add_argument("--path-in-repo", default="train.jsonl") | |
| args = ap.parse_args() | |
| src = Path(args.file) | |
| if not src.exists() or src.stat().st_size == 0: | |
| print(f"No data at {src}. Run training/build_dataset.py first.", file=sys.stderr) | |
| return 1 | |
| try: | |
| from huggingface_hub import HfApi | |
| except ImportError: | |
| print("huggingface_hub not installed.", file=sys.stderr) | |
| return 1 | |
| api = HfApi() | |
| api.create_repo(args.repo, repo_type="dataset", exist_ok=True) | |
| api.upload_file(path_or_fileobj=str(src), path_in_repo=args.path_in_repo, | |
| repo_id=args.repo, repo_type="dataset", | |
| commit_message="Update ScrubData SFT data") | |
| print(f"Pushed {src} → https://huggingface.co/datasets/{args.repo}") | |
| return 0 | |
if __name__ == "__main__":
    # Hand main()'s integer result to the interpreter as the exit status.
    sys.exit(main())