File size: 5,847 Bytes
5e21013
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
"""Build and push the bee-train-online Kaggle kernel from local source.

Source of truth: workers/kaggle-online-train/train.py (the content between
`# === KAGGLE-PASTE START ===` and `# === KAGGLE-PASTE END ===`).

This script wraps it in a one-cell .ipynb, writes the kernel-metadata.json
with the bee-secrets dataset attached, and runs `kaggle kernels push`.
The push triggers a fresh run on Kaggle's GPU.

Why dataset_sources matters here: Kaggle Secrets (UI-only) are stripped
on every CLI push. We attach the secrets dataset via metadata so the
kernel always has access to its tokens β€” see scripts/bootstrap_kaggle_secrets.py.

Pre-flight guard: refuses to push if the kernel is already running or
queued (unless --force). This prevents the "4 concurrent sessions"
failure mode where each manual `python scripts/push_kaggle_kernel.py`
spins up a new container alongside live ones, wasting Kaggle quota.
"""
from __future__ import annotations

import argparse
import json
import re
import subprocess
import sys
from pathlib import Path

REPO_ROOT = Path(__file__).resolve().parent.parent  # this script lives in scripts/, repo root is one level up
SOURCE = REPO_ROOT / "workers/kaggle-online-train/train.py"  # source of truth for the kernel cell code
PUSH_DIR = Path("/tmp/bee-kaggle-push")  # scratch dir handed to `kaggle kernels push -p`

KERNEL_ID = "ceocxx/bee-train-online"  # <owner>/<kernel-slug> on Kaggle
SECRETS_DATASET = "ceocxx/bee-secrets"  # private dataset attached so the kernel gets its tokens


def kernel_status(kernel_id: str) -> str:
    """Return the current Kaggle kernel status as a lower-case string.

    Returns "" if the status check fails — caller can decide to proceed
    or bail. Status values seen in the wild: "complete", "running",
    "queued", "cancel_requested", "cancel_acknowledged", "error".
    """
    try:
        proc = subprocess.run(
            ["kaggle", "kernels", "status", kernel_id],
            capture_output=True,
            text=True,
            timeout=30,
            check=False,
        )
    except Exception:
        # Best-effort probe: a missing CLI, a timeout, anything — report "unknown".
        return ""
    # Output: ceocxx/bee-train-online has status "KernelWorkerStatus.RUNNING"
    found = re.search(r'status\s+"KernelWorkerStatus\.([A-Z_]+)"', proc.stdout)
    if found is None:
        return ""
    return found.group(1).lower()


def _extract_cell_source(src: str) -> str:
    """Return the code between the KAGGLE-PASTE markers in *src*.

    Exits the process (status 1) with a diagnostic when the markers are
    missing — there is nothing sensible to push without them.
    """
    m = re.search(
        r"# === KAGGLE-PASTE START ===\n(.*?)# === KAGGLE-PASTE END ===",
        src,
        re.DOTALL,
    )
    if not m:
        sys.exit("paste markers not found in workers/kaggle-online-train/train.py")
    # Normalize trailing whitespace to exactly one final newline.
    return m.group(1).rstrip() + "\n"


def _notebook_json(cell_source: str) -> dict:
    """Build the one-cell notebook structure (nbformat 4) with Kaggle GPU metadata."""
    return {
        "metadata": {
            "kernelspec": {"language": "python", "display_name": "Python 3", "name": "python3"},
            "language_info": {
                "pygments_lexer": "ipython3",
                "nbconvert_exporter": "python",
                "version": "3.12",
                "file_extension": ".py",
                "codemirror_mode": {"name": "ipython", "version": 3},
                "name": "python",
                "mimetype": "text/x-python",
            },
            "kaggle": {
                "accelerator": "nvidiaTeslaT4",
                "dataSources": [{"sourceType": "datasetVersion", "datasetId": SECRETS_DATASET}],
                "isInternetEnabled": True,
                "language": "python",
                "sourceType": "notebook",
                "isGpuEnabled": True,
            },
        },
        "nbformat_minor": 4,
        "nbformat": 4,
        "cells": [
            {"cell_type": "code", "source": cell_source, "metadata": {"trusted": True},
             "outputs": [], "execution_count": None}
        ],
    }


def _kernel_metadata() -> dict:
    """Build kernel-metadata.json content.

    dataset_sources must carry the secrets dataset: Kaggle UI secrets are
    stripped on every CLI push, so the metadata attachment is what keeps
    the kernel's tokens available (see module docstring).
    """
    return {
        "id": KERNEL_ID,
        "title": "bee-train-online",
        "code_file": "bee-train-online.ipynb",
        "language": "python",
        "kernel_type": "notebook",
        "is_private": True,
        "enable_gpu": True,
        "enable_tpu": False,
        "enable_internet": True,
        "keywords": [],
        "dataset_sources": [SECRETS_DATASET],
        "kernel_sources": [],
        "competition_sources": [],
        "model_sources": [],
    }


def main() -> None:
    """Assemble the push directory from local source and run `kaggle kernels push`.

    Exit codes: 2 when refusing to push over a live session; 1 when the
    paste markers are missing; otherwise the push subprocess's returncode.
    """
    parser = argparse.ArgumentParser(description=__doc__)
    parser.add_argument(
        "--force",
        action="store_true",
        help="Push even if the kernel is currently running/queued (use with care).",
    )
    args = parser.parse_args()

    # Pre-flight guard: never stack a second live session on top of a
    # running/queued one — each push spins up a fresh container.
    status = kernel_status(KERNEL_ID)
    if status in {"running", "queued"} and not args.force:
        print(
            f"[refuse] {KERNEL_ID} status={status!r} — pushing now would create "
            f"a duplicate session and waste Kaggle quota.\n"
            f"         Use --force to override, or wait for the current run "
            f"to finish (the cron will pick up automatically).",
            file=sys.stderr,
        )
        sys.exit(2)
    if status:
        print(f"[ok] {KERNEL_ID} status={status!r} — proceeding to push.")

    cell_source = _extract_cell_source(SOURCE.read_text(encoding="utf-8"))

    PUSH_DIR.mkdir(parents=True, exist_ok=True)
    (PUSH_DIR / "bee-train-online.ipynb").write_text(
        json.dumps(_notebook_json(cell_source)), encoding="utf-8"
    )
    (PUSH_DIR / "kernel-metadata.json").write_text(
        json.dumps(_kernel_metadata(), indent=2), encoding="utf-8"
    )

    print(f"wrote {PUSH_DIR}/bee-train-online.ipynb ({len(cell_source)} chars in cell)")
    print(f"dataset_sources: [{SECRETS_DATASET}]")

    # Force T4 — the documented Kaggle API accelerator string is
    # `NvidiaTeslaT4` (PascalCase). Our previous `gpuT4x2` was silently
    # rejected by the API, falling back to the P100 default. Modern
    # torch (2.10+ cu128) wheels dropped sm_60 support, so any P100
    # allocation = `cudaErrorNoKernelImageForDevice` at first CUDA call.
    # Verified against Kaggle CLI 2.1.0 / API docs 2026-05-01.
    res = subprocess.run(
        ["kaggle", "kernels", "push", "-p", str(PUSH_DIR),
         "--accelerator", "NvidiaTeslaT4"],
        capture_output=True, text=True, check=False,
    )
    print(res.stdout.strip())
    if res.returncode != 0:
        print(res.stderr.strip(), file=sys.stderr)
        sys.exit(res.returncode)


if __name__ == "__main__":  # script entry point; importing this module has no push side effects
    main()