"""
Submit ultimate_sota_training.py to Hugging Face GPU Jobs (HfApi.run_job).
The Job command must be a single robust shell line (semicolon-separated). Hugging Face
has been observed to flatten multiline `bash -lc` payloads, which breaks `set` and can
leave the job stuck or failing silently.
Requires: huggingface_hub, `huggingface-cli login`.
Secrets: if SKIP_HUB_PUSH is not 1, the job requests Hub secret name HF_TOKEN mapped into
the container as env HF_TOKEN (Settings → Access Tokens / Job secrets).
Environment (optional):
HF_JOB_NAMESPACE default: whoami
HF_JOB_FLAVOR default: l4x1 (often faster than T4 for this workload; override with t4-small to save $)
HF_JOB_IMAGE default: pytorch CUDA 12.4 devel
HF_JOB_TIMEOUT default: 8h
TRAIN_REPO_GIT_URL, OPENENV_BASE_URL
TRAIN_MAX_STEPS default: 80 (faster run; raise for stronger fit)
ROWS_PER_TASK default: 32
GRPO_NUM_GENERATIONS default: 2
SKIP_HUB_PUSH default: 0
"""
from __future__ import annotations
import os
import shlex
from huggingface_hub import HfApi
from huggingface_hub.utils import get_token
# --- Job configuration, read once at import time from the environment. ---
_env = os.environ.get

# Git URL of the repo containing ultimate_sota_training.py.
_DEFAULT_REPO = "https://huggingface.co/spaces/md896/sql-debug-env"
_REPO_URL = _env("TRAIN_REPO_GIT_URL", _DEFAULT_REPO)
# Base URL of the OpenEnv Space the training script talks to.
_OPENENV = _env("OPENENV_BASE_URL", "https://md896-sql-debug-env.hf.space")
_MAX_STEPS = _env("TRAIN_MAX_STEPS", "80")
_ROWS = _env("ROWS_PER_TASK", "32")
_NUM_GEN = _env("GRPO_NUM_GENERATIONS", "2")
_SKIP_PUSH = _env("SKIP_HUB_PUSH", "0")
_TIMEOUT = _env("HF_JOB_TIMEOUT", "8h")
# l4x1: newer GPU, good for Unsloth; use HF_JOB_FLAVOR=t4-small if queue or cost is better for you
_FLAVOR = _env("HF_JOB_FLAVOR", "l4x1")
_IMAGE = _env("HF_JOB_IMAGE", "pytorch/pytorch:2.6.0-cuda12.4-cudnn9-devel")
_NAMESPACE = _env("HF_JOB_NAMESPACE", "md896")

# Resolve an HF token locally (env vars first, then the cached CLI login).
_local_hf_token = _env("HF_TOKEN") or _env("HUGGING_FACE_HUB_TOKEN") or get_token()
_push_wanted = _SKIP_PUSH.strip().lower() not in ("1", "true", "yes")
# Without a token the job can still train; push/upload steps in the script
# will gracefully skip/fail with clear logs.
_SECRETS = {"HF_TOKEN": _local_hf_token} if (_push_wanted and _local_hf_token) else None
# The Job command must survive Hugging Face's UI/API newline flattening,
# so it is assembled as a single "; "-joined shell line — never multiline.
_bash = "; ".join(
    (
        "set -euxo pipefail",
        "export DEBIAN_FRONTEND=noninteractive",
        "apt-get update -qq && apt-get install -y -qq git ca-certificates",
        "export PIP_BREAK_SYSTEM_PACKAGES=1",
        # shlex.quote guards against shell metacharacters in the repo URL.
        f"rm -rf train-repo; git clone {shlex.quote(_REPO_URL)} train-repo",
        "cd train-repo",
        "python -u ultimate_sota_training.py",
    )
)
# Environment forwarded into the job container (all values are strings).
_job_env = dict(
    OPENENV_BASE_URL=_OPENENV,
    TRAIN_MAX_STEPS=_MAX_STEPS,
    ROWS_PER_TASK=_ROWS,
    GRPO_NUM_GENERATIONS=_NUM_GEN,
    SKIP_HUB_PUSH=_SKIP_PUSH,
    ARTIFACT_SPACE_ID=os.environ.get("ARTIFACT_SPACE_ID", "md896/sql-debug-env"),
    MODEL_HUB_REPO_ID=os.environ.get("MODEL_HUB_REPO_ID", "md896/sql-debug-agent-qwen05b-grpo"),
    HARD_EVAL_SAMPLES=os.environ.get("HARD_EVAL_SAMPLES", "16"),
)
if __name__ == "__main__":
    # Submit the job; run_job returns immediately with a queued-job handle.
    run_kwargs = dict(
        image=_IMAGE,
        command=["bash", "-lc", _bash],  # single-line payload; see _bash above
        flavor=_FLAVOR,
        namespace=_NAMESPACE,
        timeout=_TIMEOUT,
        secrets=_SECRETS,
        env=_job_env,
    )
    job = HfApi().run_job(**run_kwargs)
    print("JOB_ID:", job.id)
    print("JOB_URL:", job.url)
    print("FLAVOR:", _FLAVOR, "| TRAIN_MAX_STEPS:", _MAX_STEPS, "| ROWS_PER_TASK:", _ROWS)
    print(
        "Note: SCHEDULING is Hugging Face queue time, not your script. "
        "Cancel stuck jobs and retry, or try HF_JOB_FLAVOR=t4-small / t4-medium."
    )