"""Build and push the bee-train-online Kaggle kernel from local source.

Source of truth: workers/kaggle-online-train/train.py (the content between
`# === KAGGLE-PASTE START ===` and `# === KAGGLE-PASTE END ===`).
This script wraps it in a one-cell .ipynb, writes the kernel-metadata.json
with the bee-secrets dataset attached, and runs `kaggle kernels push`.
The push triggers a fresh run on Kaggle's GPU.

Why dataset_sources matters here: Kaggle Secrets (UI-only) are stripped
on every CLI push. We attach the secrets dataset via metadata so the
kernel always has access to its tokens; see scripts/bootstrap_kaggle_secrets.py.

Pre-flight guard: refuses to push if the kernel is already running or
queued (unless --force). This prevents the "4 concurrent sessions"
failure mode, where each manual `python scripts/push_kaggle_kernel.py`
spins up a new container alongside live ones, wasting Kaggle quota.
"""
from __future__ import annotations

import argparse
import json
import re
import subprocess
import sys
from pathlib import Path

REPO_ROOT = Path(__file__).resolve().parent.parent
SOURCE = REPO_ROOT / "workers/kaggle-online-train/train.py"
PUSH_DIR = Path("/tmp/bee-kaggle-push")
KERNEL_ID = "ceocxx/bee-train-online"
SECRETS_DATASET = "ceocxx/bee-secrets"

def kernel_status(kernel_id: str) -> str:
    """Return the current Kaggle kernel status as a lower-case string.

    Returns "" if the status check fails; the caller can decide to proceed
    or bail. Status values seen in the wild: "complete", "running",
    "queued", "cancel_requested", "cancel_acknowledged", "error".
    """
    try:
        res = subprocess.run(
            ["kaggle", "kernels", "status", kernel_id],
            capture_output=True, text=True, timeout=30, check=False,
        )
        # Output: ceocxx/bee-train-online has status "KernelWorkerStatus.RUNNING"
        m = re.search(r'status\s+"KernelWorkerStatus\.([A-Z_]+)"', res.stdout)
        return m.group(1).lower() if m else ""
    except Exception:
        return ""
def main() -> None:
    parser = argparse.ArgumentParser(description=__doc__)
    parser.add_argument(
        "--force",
        action="store_true",
        help="Push even if the kernel is currently running/queued (use with care).",
    )
    args = parser.parse_args()
    status = kernel_status(KERNEL_ID)
    if status in {"running", "queued"} and not args.force:
        print(
            f"[refuse] {KERNEL_ID} status={status!r}: pushing now would create "
            f"a duplicate session and waste Kaggle quota.\n"
            f"         Use --force to override, or wait for the current run "
            f"to finish (the cron will pick it up automatically).",
            file=sys.stderr,
        )
        sys.exit(2)
    if status:
        print(f"[ok] {KERNEL_ID} status={status!r}; proceeding to push.")
    src = SOURCE.read_text(encoding="utf-8")
    m = re.search(
        r"# === KAGGLE-PASTE START ===\n(.*?)# === KAGGLE-PASTE END ===",
        src,
        re.DOTALL,
    )
    if not m:
        sys.exit("paste markers not found in workers/kaggle-online-train/train.py")
    cell_source = m.group(1).rstrip() + "\n"
    PUSH_DIR.mkdir(parents=True, exist_ok=True)
    nb = {
        "metadata": {
            "kernelspec": {"language": "python", "display_name": "Python 3", "name": "python3"},
            "language_info": {
                "pygments_lexer": "ipython3",
                "nbconvert_exporter": "python",
                "version": "3.12",
                "file_extension": ".py",
                "codemirror_mode": {"name": "ipython", "version": 3},
                "name": "python",
                "mimetype": "text/x-python",
            },
            "kaggle": {
                "accelerator": "nvidiaTeslaT4",
                "dataSources": [{"sourceType": "datasetVersion", "datasetId": SECRETS_DATASET}],
                "isInternetEnabled": True,
                "language": "python",
                "sourceType": "notebook",
                "isGpuEnabled": True,
            },
        },
        "nbformat_minor": 4,
        "nbformat": 4,
        "cells": [
            {"cell_type": "code", "source": cell_source, "metadata": {"trusted": True},
             "outputs": [], "execution_count": None}
        ],
    }
    (PUSH_DIR / "bee-train-online.ipynb").write_text(json.dumps(nb), encoding="utf-8")
    meta = {
        "id": KERNEL_ID,
        "title": "bee-train-online",
        "code_file": "bee-train-online.ipynb",
        "language": "python",
        "kernel_type": "notebook",
        "is_private": True,
        "enable_gpu": True,
        "enable_tpu": False,
        "enable_internet": True,
        "keywords": [],
        "dataset_sources": [SECRETS_DATASET],
        "kernel_sources": [],
        "competition_sources": [],
        "model_sources": [],
    }
    (PUSH_DIR / "kernel-metadata.json").write_text(json.dumps(meta, indent=2), encoding="utf-8")
    print(f"wrote {PUSH_DIR}/bee-train-online.ipynb ({len(cell_source)} chars in cell)")
    print(f"dataset_sources: [{SECRETS_DATASET}]")
    # Force T4: the documented Kaggle API accelerator string is
    # `NvidiaTeslaT4` (PascalCase). Our previous `gpuT4x2` was silently
    # rejected by the API, falling back to the P100 default. Modern
    # torch (2.10+ cu128) wheels dropped sm_60 support, so any P100
    # allocation fails with `cudaErrorNoKernelImageForDevice` at the
    # first CUDA call. Verified against Kaggle CLI 2.1.0 / API docs
    # 2026-05-01.
    res = subprocess.run(
        ["kaggle", "kernels", "push", "-p", str(PUSH_DIR),
         "--accelerator", "NvidiaTeslaT4"],
        capture_output=True, text=True,
    )
    print(res.stdout.strip())
    if res.returncode != 0:
        print(res.stderr.strip(), file=sys.stderr)
        sys.exit(res.returncode)
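

# Follow-up checks after a successful push (illustrative; both are standard
# Kaggle CLI subcommands):
#   kaggle kernels status ceocxx/bee-train-online          # poll the new run
#   kaggle kernels output ceocxx/bee-train-online -p /tmp  # fetch logs/output once done
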
if __name__ == "__main__":
    main()