# sql-debug-env / launch_job.py  (author: md896)
# Commit a1e637f: "Avoid whoami rate-limit during job submission"
"""
Submit ultimate_sota_training.py to Hugging Face GPU Jobs (HfApi.run_job).
The Job command must be a single robust shell line (semicolon-separated). Hugging Face
has been observed to flatten multiline `bash -lc` payloads, which breaks `set` and can
leave the job stuck or failing silently.
Requires: huggingface_hub, `huggingface-cli login`.
Secrets: if SKIP_HUB_PUSH is not 1, the job requests Hub secret name HF_TOKEN mapped into
the container as env HF_TOKEN (Settings → Access Tokens / Job secrets).
Environment (optional):
HF_JOB_NAMESPACE default: md896 (hardcoded rather than resolved via whoami, to avoid the whoami API rate-limit at submission time)
HF_JOB_FLAVOR default: l4x1 (often faster than T4 for this workload; override with t4-small to save $)
HF_JOB_IMAGE default: pytorch CUDA 12.4 devel
HF_JOB_TIMEOUT default: 8h
TRAIN_REPO_GIT_URL, OPENENV_BASE_URL
TRAIN_MAX_STEPS default: 80 (faster run; raise for stronger fit)
ROWS_PER_TASK default: 32
GRPO_NUM_GENERATIONS default: 2
SKIP_HUB_PUSH default: 0
"""
from __future__ import annotations
import os
import shlex
from huggingface_hub import HfApi
from huggingface_hub.utils import get_token
_DEFAULT_REPO = "https://huggingface.co/spaces/md896/sql-debug-env"
_REPO_URL = os.environ.get("TRAIN_REPO_GIT_URL", _DEFAULT_REPO)
_OPENENV = os.environ.get("OPENENV_BASE_URL", "https://md896-sql-debug-env.hf.space")
_MAX_STEPS = os.environ.get("TRAIN_MAX_STEPS", "80")
_ROWS = os.environ.get("ROWS_PER_TASK", "32")
_NUM_GEN = os.environ.get("GRPO_NUM_GENERATIONS", "2")
_SKIP_PUSH = os.environ.get("SKIP_HUB_PUSH", "0")
_TIMEOUT = os.environ.get("HF_JOB_TIMEOUT", "8h")
# l4x1: newer GPU, good for Unsloth; use HF_JOB_FLAVOR=t4-small if queue or cost is better for you
_FLAVOR = os.environ.get("HF_JOB_FLAVOR", "l4x1")
_IMAGE = os.environ.get(
"HF_JOB_IMAGE",
"pytorch/pytorch:2.6.0-cuda12.4-cudnn9-devel",
)
_NAMESPACE = os.environ.get("HF_JOB_NAMESPACE", "md896")
# Resolve the Hub token for artifact/model pushes: explicit env vars win,
# then fall back to the locally cached CLI login via get_token() (reads the
# token cache rather than hitting the whoami API, so no rate-limit risk).
_local_hf_token = (
    os.environ.get("HF_TOKEN")
    or os.environ.get("HUGGING_FACE_HUB_TOKEN")
    or get_token()
)

_SECRETS = None
if _SKIP_PUSH.strip().lower() not in ("1", "true", "yes") and _local_hf_token:
    # Map the token into the container as the HF_TOKEN secret.
    _SECRETS = {"HF_TOKEN": _local_hf_token}
# With no token the job can still train; the script's push/upload steps will
# gracefully skip/fail with clear logs.
# One line only — survives UI/API newline flattening. Built as semicolon-joined
# steps; the result is a single shell line handed to `bash -lc`.
_bash = "; ".join(
    [
        "set -euxo pipefail",
        "export DEBIAN_FRONTEND=noninteractive",
        "apt-get update -qq && apt-get install -y -qq git ca-certificates",
        "export PIP_BREAK_SYSTEM_PACKAGES=1",
        # shlex.quote guards against shell metacharacters in the repo URL.
        f"rm -rf train-repo; git clone {shlex.quote(_REPO_URL)} train-repo",
        "cd train-repo",
        "python -u ultimate_sota_training.py",
    ]
)

# Environment passed into the job container (all values are strings).
_job_env = {
    "OPENENV_BASE_URL": _OPENENV,
    "TRAIN_MAX_STEPS": _MAX_STEPS,
    "ROWS_PER_TASK": _ROWS,
    "GRPO_NUM_GENERATIONS": _NUM_GEN,
    "SKIP_HUB_PUSH": _SKIP_PUSH,
    "ARTIFACT_SPACE_ID": os.getenv("ARTIFACT_SPACE_ID", "md896/sql-debug-env"),
    "MODEL_HUB_REPO_ID": os.getenv("MODEL_HUB_REPO_ID", "md896/sql-debug-agent-qwen05b-grpo"),
    "HARD_EVAL_SAMPLES": os.getenv("HARD_EVAL_SAMPLES", "16"),
}
if __name__ == "__main__":
    # Submit the training job to Hugging Face GPU Jobs and print tracking info.
    api = HfApi()
    job = api.run_job(
        image=_IMAGE,
        command=["bash", "-lc", _bash],
        flavor=_FLAVOR,
        namespace=_NAMESPACE,
        timeout=_TIMEOUT,
        secrets=_SECRETS,
        env=_job_env,
    )
    # Identifiers needed to monitor or cancel the run.
    print("JOB_ID:", job.id)
    print("JOB_URL:", job.url)
    print("FLAVOR:", _FLAVOR, "| TRAIN_MAX_STEPS:", _MAX_STEPS, "| ROWS_PER_TASK:", _ROWS)
    print(
        "Note: SCHEDULING is Hugging Face queue time, not your script. "
        "Cancel stuck jobs and retry, or try HF_JOB_FLAVOR=t4-small / t4-medium."
    )