"""Build and push the bee-train-online Kaggle kernel from local source.

Source of truth: workers/kaggle-online-train/train.py (the content between
`# === KAGGLE-PASTE START ===` and `# === KAGGLE-PASTE END ===`).
This script wraps it in a one-cell .ipynb, writes the kernel-metadata.json
with the bee-secrets dataset attached, and runs `kaggle kernels push`.
The push triggers a fresh run on Kaggle's GPU.

Why dataset_sources matters here: Kaggle Secrets (UI-only) are stripped
on every CLI push. We attach the secrets dataset via metadata so the
kernel always has access to its tokens; see scripts/bootstrap_kaggle_secrets.py.

Pre-flight guard: refuses to push if the kernel is already running or
queued (unless --force). This prevents the "4 concurrent sessions"
failure mode, where each manual `python scripts/push_kaggle_kernel.py`
spins up a new container alongside live ones, wasting Kaggle quota.
"""
from __future__ import annotations

import argparse
import json
import re
import subprocess
import sys
from pathlib import Path

REPO_ROOT = Path(__file__).resolve().parent.parent
SOURCE = REPO_ROOT / "workers/kaggle-online-train/train.py"
PUSH_DIR = Path("/tmp/bee-kaggle-push")
KERNEL_ID = "ceocxx/bee-train-online"
SECRETS_DATASET = "ceocxx/bee-secrets"

def kernel_status(kernel_id: str) -> str:
    """Return the current Kaggle kernel status as a lower-case string.

    Returns "" if the status check fails; the caller can decide to proceed
    or bail. Status values seen in the wild: "complete", "running",
    "queued", "cancel_requested", "cancel_acknowledged", "error".
    """
    try:
        res = subprocess.run(
            ["kaggle", "kernels", "status", kernel_id],
            capture_output=True, text=True, timeout=30, check=False,
        )
        # Output: ceocxx/bee-train-online has status "KernelWorkerStatus.RUNNING"
        m = re.search(r'status\s+"KernelWorkerStatus\.([A-Z_]+)"', res.stdout)
        return m.group(1).lower() if m else ""
    except Exception:
        return ""
def main() -> None:
    parser = argparse.ArgumentParser(description=__doc__)
    parser.add_argument(
        "--force",
        action="store_true",
        help="Push even if the kernel is currently running/queued (use with care).",
    )
    args = parser.parse_args()
    status = kernel_status(KERNEL_ID)
    if status in {"running", "queued"} and not args.force:
        print(
            f"[refuse] {KERNEL_ID} status={status!r}: pushing now would create "
            f"a duplicate session and waste Kaggle quota.\n"
            f"         Use --force to override, or wait for the current run "
            f"to finish (the cron will pick it up automatically).",
            file=sys.stderr,
        )
        sys.exit(2)
    if status:
        print(f"[ok] {KERNEL_ID} status={status!r}; proceeding to push.")
    src = SOURCE.read_text(encoding="utf-8")
    m = re.search(
        r"# === KAGGLE-PASTE START ===\n(.*?)# === KAGGLE-PASTE END ===",
        src,
        re.DOTALL,
    )
    if not m:
        sys.exit("paste markers not found in workers/kaggle-online-train/train.py")
    cell_source = m.group(1).rstrip() + "\n"
    PUSH_DIR.mkdir(parents=True, exist_ok=True)
    nb = {
        "metadata": {
            "kernelspec": {"language": "python", "display_name": "Python 3", "name": "python3"},
            "language_info": {
                "pygments_lexer": "ipython3",
                "nbconvert_exporter": "python",
                "version": "3.12",
                "file_extension": ".py",
                "codemirror_mode": {"name": "ipython", "version": 3},
                "name": "python",
                "mimetype": "text/x-python",
            },
            "kaggle": {
                "accelerator": "nvidiaTeslaT4",
                "dataSources": [{"sourceType": "datasetVersion", "datasetId": SECRETS_DATASET}],
                "isInternetEnabled": True,
                "language": "python",
                "sourceType": "notebook",
                "isGpuEnabled": True,
            },
        },
        "nbformat_minor": 4,
        "nbformat": 4,
        "cells": [
            {"cell_type": "code", "source": cell_source, "metadata": {"trusted": True},
             "outputs": [], "execution_count": None}
        ],
    }
    (PUSH_DIR / "bee-train-online.ipynb").write_text(json.dumps(nb), encoding="utf-8")
    meta = {
        "id": KERNEL_ID,
        "title": "bee-train-online",
        "code_file": "bee-train-online.ipynb",
        "language": "python",
        "kernel_type": "notebook",
        "is_private": True,
        "enable_gpu": True,
        "enable_tpu": False,
        "enable_internet": True,
        "keywords": [],
        "dataset_sources": [SECRETS_DATASET],
        "kernel_sources": [],
        "competition_sources": [],
        "model_sources": [],
    }
    (PUSH_DIR / "kernel-metadata.json").write_text(json.dumps(meta, indent=2), encoding="utf-8")
    print(f"wrote {PUSH_DIR}/bee-train-online.ipynb ({len(cell_source)} chars in cell)")
    print(f"dataset_sources: [{SECRETS_DATASET}]")
    # Force T4: the documented Kaggle API accelerator string is
    # `NvidiaTeslaT4` (PascalCase). Our previous `gpuT4x2` was silently
    # rejected by the API, falling back to the P100 default. Modern
    # torch (2.10+ cu128) wheels dropped sm_60 support, so any P100
    # allocation fails with `cudaErrorNoKernelImageForDevice` at the
    # first CUDA call. Verified against Kaggle CLI 2.1.0 / API docs
    # 2026-05-01.
    res = subprocess.run(
        ["kaggle", "kernels", "push", "-p", str(PUSH_DIR),
         "--accelerator", "NvidiaTeslaT4"],
        capture_output=True, text=True,
    )
    print(res.stdout.strip())
    if res.returncode != 0:
        print(res.stderr.strip(), file=sys.stderr)
        sys.exit(res.returncode)
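

# Follow-up checks after a successful push (illustrative; both are standard
# Kaggle CLI subcommands):
#   kaggle kernels status ceocxx/bee-train-online          # poll the new run
#   kaggle kernels output ceocxx/bee-train-online -p /tmp  # fetch logs/output once done
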
if __name__ == "__main__":
    main()