File size: 1,513 Bytes
16dc556
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
"""Push the verified SFT data to an HF dataset repo so Colab/HF-Jobs can pull it.

    uv run training/build_dataset.py --n 2000 --out data/train.jsonl   # (re)generate
    huggingface-cli login                                              # HF write token
    uv run scripts/push_dataset.py --repo build-small-hackathon/scrubdata-sft
"""

from __future__ import annotations

import argparse
import sys
from pathlib import Path

# Dataset repo used when --repo is not given on the command line.
DEFAULT_REPO = "build-small-hackathon/scrubdata-sft"


def main(argv: list[str] | None = None) -> int:
    """Upload the local SFT JSONL file to a Hugging Face dataset repo.

    Args:
        argv: Command-line arguments to parse. ``None`` (the default) lets
            argparse fall back to ``sys.argv[1:]``, so the script entry
            point behaves exactly as before.

    Returns:
        0 after the upload call completes; 1 when the source path is not a
        non-empty regular file or when ``huggingface_hub`` cannot be
        imported. Errors raised by ``huggingface_hub`` itself are not caught
        here and propagate to the caller.
    """
    ap = argparse.ArgumentParser(
        description="Push a local JSONL file to a Hugging Face dataset repo.",
    )
    ap.add_argument("--repo", default=DEFAULT_REPO,
                    help="target dataset repo id (default: %(default)s)")
    ap.add_argument("--file", default="data/train.jsonl",
                    help="local file to upload (default: %(default)s)")
    ap.add_argument("--path-in-repo", default="train.jsonl",
                    help="destination path inside the repo (default: %(default)s)")
    args = ap.parse_args(argv)

    src = Path(args.file)
    # is_file() rather than exists(): a directory also "exists" and reports a
    # non-zero size, which would slip past this guard and only fail later
    # inside the upload call with a less helpful error.
    if not src.is_file() or src.stat().st_size == 0:
        print(f"No data at {src}. Run training/build_dataset.py first.", file=sys.stderr)
        return 1

    # Imported lazily so the argument/file checks above work (and report
    # cleanly) even when huggingface_hub is not installed.
    try:
        from huggingface_hub import HfApi
    except ImportError:
        print("huggingface_hub not installed.", file=sys.stderr)
        return 1

    api = HfApi()
    # exist_ok=True: re-running the script against an existing repo is expected.
    api.create_repo(args.repo, repo_type="dataset", exist_ok=True)
    api.upload_file(
        path_or_fileobj=str(src),
        path_in_repo=args.path_in_repo,
        repo_id=args.repo,
        repo_type="dataset",
        commit_message="Update ScrubData SFT data",
    )
    print(f"Pushed {src} → https://huggingface.co/datasets/{args.repo}")
    return 0


if __name__ == "__main__":
    # Propagate main()'s return value as the process exit status.
    sys.exit(main())