"""Build and push the bee-train-online Kaggle kernel from local source. Source of truth: workers/kaggle-online-train/train.py (the content between `# === KAGGLE-PASTE START ===` and `# === KAGGLE-PASTE END ===`). This script wraps it in a one-cell .ipynb, writes the kernel-metadata.json with the bee-secrets dataset attached, and runs `kaggle kernels push`. The push triggers a fresh run on Kaggle's GPU. Why dataset_sources matters here: Kaggle Secrets (UI-only) are stripped on every CLI push. We attach the secrets dataset via metadata so the kernel always has access to its tokens — see scripts/bootstrap_kaggle_secrets.py. Pre-flight guard: refuses to push if the kernel is already running or queued (unless --force). This prevents the "4 concurrent sessions" failure mode where each manual `python scripts/push_kaggle_kernel.py` spins up a new container alongside live ones, wasting Kaggle quota. """ from __future__ import annotations import argparse import json import re import subprocess import sys from pathlib import Path REPO_ROOT = Path(__file__).resolve().parent.parent SOURCE = REPO_ROOT / "workers/kaggle-online-train/train.py" PUSH_DIR = Path("/tmp/bee-kaggle-push") KERNEL_ID = "ceocxx/bee-train-online" SECRETS_DATASET = "ceocxx/bee-secrets" def kernel_status(kernel_id: str) -> str: """Return the current Kaggle kernel status as a lower-case string. Returns "" if the status check fails — caller can decide to proceed or bail. Status values seen in the wild: "complete", "running", "queued", "cancel_requested", "cancel_acknowledged", "error". """ try: res = subprocess.run( ["kaggle", "kernels", "status", kernel_id], capture_output=True, text=True, timeout=30, check=False, ) # Output: ceocxx/bee-train-online has status "KernelWorkerStatus.RUNNING" m = re.search(r'status\s+"KernelWorkerStatus\.([A-Z_]+)"', res.stdout) return m.group(1).lower() if m else "" except Exception: return "" def main() -> None: parser = argparse.ArgumentParser(description=__doc__) parser.add_argument( "--force", action="store_true", help="Push even if the kernel is currently running/queued (use with care).", ) args = parser.parse_args() status = kernel_status(KERNEL_ID) if status in {"running", "queued"} and not args.force: print( f"[refuse] {KERNEL_ID} status={status!r} — pushing now would create " f"a duplicate session and waste Kaggle quota.\n" f" Use --force to override, or wait for the current run " f"to finish (the cron will pick up automatically).", file=sys.stderr, ) sys.exit(2) if status: print(f"[ok] {KERNEL_ID} status={status!r} — proceeding to push.") src = SOURCE.read_text(encoding="utf-8") m = re.search(r"# === KAGGLE-PASTE START ===\n(.*?)# === KAGGLE-PASTE END ===", src, re.DOTALL) if not m: sys.exit("paste markers not found in workers/kaggle-online-train/train.py") cell_source = m.group(1).rstrip() + "\n" PUSH_DIR.mkdir(parents=True, exist_ok=True) nb = { "metadata": { "kernelspec": {"language": "python", "display_name": "Python 3", "name": "python3"}, "language_info": { "pygments_lexer": "ipython3", "nbconvert_exporter": "python", "version": "3.12", "file_extension": ".py", "codemirror_mode": {"name": "ipython", "version": 3}, "name": "python", "mimetype": "text/x-python", }, "kaggle": { "accelerator": "nvidiaTeslaT4", "dataSources": [{"sourceType": "datasetVersion", "datasetId": SECRETS_DATASET}], "isInternetEnabled": True, "language": "python", "sourceType": "notebook", "isGpuEnabled": True, }, }, "nbformat_minor": 4, "nbformat": 4, "cells": [ {"cell_type": "code", "source": cell_source, "metadata": {"trusted": True}, "outputs": [], "execution_count": None} ], } (PUSH_DIR / "bee-train-online.ipynb").write_text(json.dumps(nb), encoding="utf-8") meta = { "id": KERNEL_ID, "title": "bee-train-online", "code_file": "bee-train-online.ipynb", "language": "python", "kernel_type": "notebook", "is_private": True, "enable_gpu": True, "enable_tpu": False, "enable_internet": True, "keywords": [], "dataset_sources": [SECRETS_DATASET], "kernel_sources": [], "competition_sources": [], "model_sources": [], } (PUSH_DIR / "kernel-metadata.json").write_text(json.dumps(meta, indent=2), encoding="utf-8") print(f"wrote {PUSH_DIR}/bee-train-online.ipynb ({len(cell_source)} chars in cell)") print(f"dataset_sources: [{SECRETS_DATASET}]") # Force T4 — the documented Kaggle API accelerator string is # `NvidiaTeslaT4` (PascalCase). Our previous `gpuT4x2` was silently # rejected by the API, falling back to the P100 default. Modern # torch (2.10+ cu128) wheels dropped sm_60 support, so any P100 # allocation = `cudaErrorNoKernelImageForDevice` at first CUDA call. # Verified empty against Kaggle CLI 2.1.0 / API docs 2026-05-01. res = subprocess.run( ["kaggle", "kernels", "push", "-p", str(PUSH_DIR), "--accelerator", "NvidiaTeslaT4"], capture_output=True, text=True, ) print(res.stdout.strip()) if res.returncode != 0: print(res.stderr.strip(), file=sys.stderr) sys.exit(res.returncode) if __name__ == "__main__": main()