File size: 3,735 Bytes
ceee0e3
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
af54ccd
ceee0e3
 
 
 
 
 
 
bc20ef9
9552aaf
ceee0e3
 
 
 
 
 
af54ccd
ceee0e3
 
 
 
 
 
 
 
a1e637f
ceee0e3
 
9552aaf
ceee0e3
9552aaf
 
 
 
 
ceee0e3
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
9552aaf
 
 
ceee0e3
 
 
 
a1e637f
ceee0e3
 
 
 
 
 
 
 
 
 
 
 
 
 
 
bc20ef9
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
"""
Submit ultimate_sota_training.py to Hugging Face GPU Jobs (HfApi.run_job).

The Job command must be a single robust shell line (semicolon-separated). Hugging Face
has been observed to flatten multiline `bash -lc` payloads, which breaks `set` and can
leave the job stuck or failing silently.

Requires: huggingface_hub, `huggingface-cli login`.

Secrets: if SKIP_HUB_PUSH is not 1, the job requests Hub secret name HF_TOKEN mapped into
the container as env HF_TOKEN (Settings → Access Tokens / Job secrets).

Environment (optional):
  HF_JOB_NAMESPACE     default: whoami
  HF_JOB_FLAVOR        default: l4x1 (often faster than T4 for this workload; override with t4-small to save $)
  HF_JOB_IMAGE         default: pytorch CUDA 12.4 devel
  HF_JOB_TIMEOUT       default: 8h
  TRAIN_REPO_GIT_URL, OPENENV_BASE_URL
  TRAIN_MAX_STEPS      default: 80 (faster run; raise for stronger fit)
  ROWS_PER_TASK        default: 32
  GRPO_NUM_GENERATIONS default: 2
  SKIP_HUB_PUSH        default: 0
"""
from __future__ import annotations

import os
import shlex

from huggingface_hub import HfApi
from huggingface_hub.utils import get_token

# --- Job configuration: every knob below is overridable via the environment ---
_DEFAULT_REPO = "https://huggingface.co/spaces/md896/sql-debug-env"

_getenv = os.environ.get
_REPO_URL = _getenv("TRAIN_REPO_GIT_URL", _DEFAULT_REPO)
_OPENENV = _getenv("OPENENV_BASE_URL", "https://md896-sql-debug-env.hf.space")
_MAX_STEPS = _getenv("TRAIN_MAX_STEPS", "80")
_ROWS = _getenv("ROWS_PER_TASK", "32")
_NUM_GEN = _getenv("GRPO_NUM_GENERATIONS", "2")
_SKIP_PUSH = _getenv("SKIP_HUB_PUSH", "0")
_TIMEOUT = _getenv("HF_JOB_TIMEOUT", "8h")
# L4 is a newer GPU and works well for Unsloth; set HF_JOB_FLAVOR=t4-small
# when queue time or cost matters more than speed.
_FLAVOR = _getenv("HF_JOB_FLAVOR", "l4x1")
_IMAGE = _getenv("HF_JOB_IMAGE", "pytorch/pytorch:2.6.0-cuda12.4-cudnn9-devel")
_NAMESPACE = _getenv("HF_JOB_NAMESPACE", "md896")

# Resolve the Hub token used by the job for pushing artifacts.
# Precedence: explicit env vars, then the token cached by `huggingface-cli login`.
_local_hf_token = (
    os.environ.get("HF_TOKEN")
    or os.environ.get("HUGGING_FACE_HUB_TOKEN")
    or get_token()
)

# Forward the token as a job secret only when pushing is enabled AND a token was
# found. With no token the job can still train; push/upload steps in the training
# script will gracefully skip/fail with clear logs.
_SECRETS = None
if _SKIP_PUSH.strip().lower() not in ("1", "true", "yes") and _local_hf_token:
    _SECRETS = {"HF_TOKEN": _local_hf_token}
# One line only — survives UI/API newline flattening.
# Build the job command as a list of steps joined into ONE line — Hugging Face
# has been seen flattening multiline `bash -lc` payloads, which breaks `set`.
_shell_steps = [
    "set -euxo pipefail",
    "export DEBIAN_FRONTEND=noninteractive",
    "apt-get update -qq && apt-get install -y -qq git ca-certificates",
    "export PIP_BREAK_SYSTEM_PACKAGES=1",
    f"rm -rf train-repo; git clone {shlex.quote(_REPO_URL)} train-repo",
    "cd train-repo",
    "python -u ultimate_sota_training.py",
]
_bash = "; ".join(_shell_steps)

# Environment forwarded into the job container; every value must be a string.
_job_env = dict(
    OPENENV_BASE_URL=_OPENENV,
    TRAIN_MAX_STEPS=_MAX_STEPS,
    ROWS_PER_TASK=_ROWS,
    GRPO_NUM_GENERATIONS=_NUM_GEN,
    SKIP_HUB_PUSH=_SKIP_PUSH,
    ARTIFACT_SPACE_ID=os.environ.get("ARTIFACT_SPACE_ID", "md896/sql-debug-env"),
    MODEL_HUB_REPO_ID=os.environ.get("MODEL_HUB_REPO_ID", "md896/sql-debug-agent-qwen05b-grpo"),
    HARD_EVAL_SAMPLES=os.environ.get("HARD_EVAL_SAMPLES", "16"),
)

if __name__ == "__main__":
    api = HfApi()
    ns = _NAMESPACE
    job = api.run_job(
        image=_IMAGE,
        command=["bash", "-lc", _bash],
        flavor=_FLAVOR,
        namespace=ns,
        timeout=_TIMEOUT,
        secrets=_SECRETS,
        env=_job_env,
    )
    print("JOB_ID:", job.id)
    print("JOB_URL:", job.url)
    print("FLAVOR:", _FLAVOR, "| TRAIN_MAX_STEPS:", _MAX_STEPS, "| ROWS_PER_TASK:", _ROWS)
    print(
        "Note: SCHEDULING is Hugging Face queue time, not your script. "
        "Cancel stuck jobs and retry, or try HF_JOB_FLAVOR=t4-small / t4-medium."
    )