#!/usr/bin/env python3
# scripts/push_kaggle_kernel.py — Bee deploy helper
# (HF Space backend deploy, commits de0cba5 / 5e21013)
"""Build and push the bee-train-online Kaggle kernel from local source.
Source of truth: workers/kaggle-online-train/train.py (the content between
`# === KAGGLE-PASTE START ===` and `# === KAGGLE-PASTE END ===`).
This script wraps it in a one-cell .ipynb, writes the kernel-metadata.json
with the bee-secrets dataset attached, and runs `kaggle kernels push`.
The push triggers a fresh run on Kaggle's GPU.
Why dataset_sources matters here: Kaggle Secrets (UI-only) are stripped
on every CLI push. We attach the secrets dataset via metadata so the
kernel always has access to its tokens — see scripts/bootstrap_kaggle_secrets.py.
Pre-flight guard: refuses to push if the kernel is already running or
queued (unless --force). This prevents the "4 concurrent sessions"
failure mode where each manual `python scripts/push_kaggle_kernel.py`
spins up a new container alongside live ones, wasting Kaggle quota.
"""
from __future__ import annotations
import argparse
import json
import re
import subprocess
import sys
from pathlib import Path
# Repo root is two levels above this file (repo/scripts/push_kaggle_kernel.py).
REPO_ROOT = Path(__file__).resolve().parent.parent
# Source of truth for the kernel's single code cell (see module docstring).
SOURCE = REPO_ROOT / "workers/kaggle-online-train/train.py"
# Scratch directory where the .ipynb and kernel-metadata.json are staged for push.
PUSH_DIR = Path("/tmp/bee-kaggle-push")
# Kaggle kernel slug ("user/kernel-name") targeted by status checks and pushes.
KERNEL_ID = "ceocxx/bee-train-online"
# Private dataset carrying the kernel's tokens; must be re-attached on every push.
SECRETS_DATASET = "ceocxx/bee-secrets"
def kernel_status(kernel_id: str) -> str:
    """Look up the kernel's current Kaggle status, lower-cased.

    An empty string means the check itself failed — the caller decides
    whether to proceed or bail.  Status values seen in the wild:
    "complete", "running", "queued", "cancel_requested",
    "cancel_acknowledged", "error".
    """
    cmd = ["kaggle", "kernels", "status", kernel_id]
    try:
        proc = subprocess.run(cmd, capture_output=True, text=True, timeout=30, check=False)
        # Expected output line:
        #   ceocxx/bee-train-online has status "KernelWorkerStatus.RUNNING"
        found = re.search(r'status\s+"KernelWorkerStatus\.([A-Z_]+)"', proc.stdout)
        if found is None:
            return ""
        return found.group(1).lower()
    except Exception:
        # Best-effort probe: a missing CLI, timeout, or API hiccup all
        # collapse to "unknown" rather than crashing the push script.
        return ""
def _extract_cell_source() -> str:
    """Return the notebook cell body: code between the KAGGLE-PASTE markers.

    Exits with an error if the markers are missing from train.py, since
    pushing an empty or partial cell would silently break the kernel.
    """
    src = SOURCE.read_text(encoding="utf-8")
    m = re.search(
        r"# === KAGGLE-PASTE START ===\n(.*?)# === KAGGLE-PASTE END ===",
        src,
        re.DOTALL,
    )
    if not m:
        sys.exit("paste markers not found in workers/kaggle-online-train/train.py")
    return m.group(1).rstrip() + "\n"


def _build_notebook(cell_source: str) -> dict:
    """Wrap *cell_source* in a minimal one-code-cell nbformat-4 notebook."""
    return {
        "metadata": {
            "kernelspec": {"language": "python", "display_name": "Python 3", "name": "python3"},
            "language_info": {
                "pygments_lexer": "ipython3",
                "nbconvert_exporter": "python",
                "version": "3.12",
                "file_extension": ".py",
                "codemirror_mode": {"name": "ipython", "version": 3},
                "name": "python",
                "mimetype": "text/x-python",
            },
            "kaggle": {
                "accelerator": "nvidiaTeslaT4",
                "dataSources": [{"sourceType": "datasetVersion", "datasetId": SECRETS_DATASET}],
                "isInternetEnabled": True,
                "language": "python",
                "sourceType": "notebook",
                "isGpuEnabled": True,
            },
        },
        "nbformat_minor": 4,
        "nbformat": 4,
        "cells": [
            {"cell_type": "code", "source": cell_source, "metadata": {"trusted": True},
             "outputs": [], "execution_count": None}
        ],
    }


def _kernel_metadata() -> dict:
    """Build the kernel-metadata.json payload.

    dataset_sources must list the secrets dataset on every push: Kaggle
    strips UI-configured secrets on CLI pushes (see module docstring).
    """
    return {
        "id": KERNEL_ID,
        "title": "bee-train-online",
        "code_file": "bee-train-online.ipynb",
        "language": "python",
        "kernel_type": "notebook",
        "is_private": True,
        "enable_gpu": True,
        "enable_tpu": False,
        "enable_internet": True,
        "keywords": [],
        "dataset_sources": [SECRETS_DATASET],
        "kernel_sources": [],
        "competition_sources": [],
        "model_sources": [],
    }


def _push_kernel() -> None:
    """Run `kaggle kernels push` on PUSH_DIR; exit non-zero on failure."""
    # Force T4 — the documented Kaggle API accelerator string is
    # `NvidiaTeslaT4` (PascalCase). Our previous `gpuT4x2` was silently
    # rejected by the API, falling back to the P100 default. Modern
    # torch (2.10+ cu128) wheels dropped sm_60 support, so any P100
    # allocation = `cudaErrorNoKernelImageForDevice` at first CUDA call.
    # Verified against Kaggle CLI 2.1.0 / API docs 2026-05-01.
    res = subprocess.run(
        ["kaggle", "kernels", "push", "-p", str(PUSH_DIR),
         "--accelerator", "NvidiaTeslaT4"],
        capture_output=True, text=True,
    )
    print(res.stdout.strip())
    if res.returncode != 0:
        print(res.stderr.strip(), file=sys.stderr)
        sys.exit(res.returncode)


def main() -> None:
    """Build the one-cell kernel notebook from local source and push it.

    Orchestrates: CLI arg parsing, the concurrent-session pre-flight
    guard, cell extraction, notebook/metadata staging, and the push.
    Exits 2 when refusing to push over a live session.
    """
    parser = argparse.ArgumentParser(description=__doc__)
    parser.add_argument(
        "--force",
        action="store_true",
        help="Push even if the kernel is currently running/queued (use with care).",
    )
    args = parser.parse_args()

    # Pre-flight guard: never stack a second live session unless forced.
    status = kernel_status(KERNEL_ID)
    if status in {"running", "queued"} and not args.force:
        print(
            f"[refuse] {KERNEL_ID} status={status!r} — pushing now would create "
            f"a duplicate session and waste Kaggle quota.\n"
            f"    Use --force to override, or wait for the current run "
            f"to finish (the cron will pick up automatically).",
            file=sys.stderr,
        )
        sys.exit(2)
    if status:
        print(f"[ok] {KERNEL_ID} status={status!r} — proceeding to push.")

    cell_source = _extract_cell_source()
    PUSH_DIR.mkdir(parents=True, exist_ok=True)
    (PUSH_DIR / "bee-train-online.ipynb").write_text(
        json.dumps(_build_notebook(cell_source)), encoding="utf-8"
    )
    (PUSH_DIR / "kernel-metadata.json").write_text(
        json.dumps(_kernel_metadata(), indent=2), encoding="utf-8"
    )
    print(f"wrote {PUSH_DIR}/bee-train-online.ipynb ({len(cell_source)} chars in cell)")
    print(f"dataset_sources: [{SECRETS_DATASET}]")
    _push_kernel()
# Script entry point — run the push only when executed directly, not on import.
if __name__ == "__main__":
    main()