"""Build and push the bee-train-online-tpu Kaggle kernel from local source.

Sister script to scripts/push_kaggle_kernel.py: same workflow, different
kernel ID, different runner source (workers/kaggle-tpu-train/train.py),
different accelerator (TpuV6E8). Both kernels can run concurrently on
distinct Kaggle quota pools (30h/week GPU vs 20h/week TPU).

Source of truth: workers/kaggle-tpu-train/train.py (content between
`# === KAGGLE-PASTE START ===` and `# === KAGGLE-PASTE END ===`).

Pre-flight guard: refuses to push if the kernel is already running or
queued (unless --force). Same lesson as the GPU script.
"""
from __future__ import annotations
import argparse
import json
import re
import subprocess
import sys
from pathlib import Path

REPO_ROOT = Path(__file__).resolve().parent.parent
SOURCE = REPO_ROOT / "workers/kaggle-tpu-train/train.py"
PUSH_DIR = Path("/tmp/bee-kaggle-tpu-push")
# Distinct kernel ID so it runs alongside the GPU kernel without colliding
# in the user's kernel list. Same secrets dataset attaches.
KERNEL_ID = "ceocxx/bee-train-online-tpu"
SECRETS_DATASET = "ceocxx/bee-secrets"


def kernel_status(kernel_id: str) -> str:
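    """Return the kernel's current worker status in lowercase (e.g. 'running',
    'queued'), or '' when it cannot be determined."""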
try:
res = subprocess.run(
["kaggle", "kernels", "status", kernel_id],
capture_output=True, text=True, timeout=30, check=False,
)
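        # The CLI reports a line like: ... has status "KernelWorkerStatus.RUNNING";
        # extract the enum name and normalize to lowercase.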
m = re.search(r'status\s+"KernelWorkerStatus\.([A-Z_]+)"', res.stdout)
return m.group(1).lower() if m else ""
except Exception:
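        # Missing CLI, timeout, or auth failure: treat the status as unknown.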
return ""
def main() -> None:
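    """Parse flags, run the duplicate-session guard, then build and push."""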
parser = argparse.ArgumentParser(description=__doc__)
parser.add_argument(
"--force",
action="store_true",
help="Push even if the kernel is currently running/queued (use with care).",
)
args = parser.parse_args()
status = kernel_status(KERNEL_ID)
if status in {"running", "queued"} and not args.force:
print(
f"[refuse] {KERNEL_ID} status={status!r} β pushing now would create "
f"a duplicate session and waste Kaggle quota.\n"
f" Use --force to override, or wait for the current run "
f"to finish (the cron will pick up automatically).",
file=sys.stderr,
)
sys.exit(2)
if status:
print(f"[ok] {KERNEL_ID} status={status!r} β proceeding to push.")
src = SOURCE.read_text(encoding="utf-8")
m = re.search(r"# === KAGGLE-PASTE START ===\n(.*?)# === KAGGLE-PASTE END ===", src, re.DOTALL)
if not m:
sys.exit("paste markers not found in workers/kaggle-tpu-train/train.py")
cell_source = m.group(1).rstrip() + "\n"
PUSH_DIR.mkdir(parents=True, exist_ok=True)
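    # Assemble a minimal one-cell notebook; Kaggle reads the accelerator and
    # data sources from the embedded "kaggle" metadata block below.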
nb = {
"metadata": {
"kernelspec": {"language": "python", "display_name": "Python 3", "name": "python3"},
"language_info": {
"pygments_lexer": "ipython3",
"nbconvert_exporter": "python",
"version": "3.12",
"file_extension": ".py",
"codemirror_mode": {"name": "ipython", "version": 3},
"name": "python",
"mimetype": "text/x-python",
},
"kaggle": {
# Kaggle's TPU offering as of 2026-05 is v6e-8 (8 cores).
"accelerator": "TpuV6E8",
"dataSources": [{"sourceType": "datasetVersion", "datasetId": SECRETS_DATASET}],
"isInternetEnabled": True,
"language": "python",
"sourceType": "notebook",
# `isGpuEnabled` stays false for TPU kernels; Kaggle infers
# the accelerator from the metadata above.
"isGpuEnabled": False,
},
},
"nbformat_minor": 4,
"nbformat": 4,
"cells": [
{"cell_type": "code", "source": cell_source, "metadata": {"trusted": True},
"outputs": [], "execution_count": None}
],
}
(PUSH_DIR / "bee-train-online-tpu.ipynb").write_text(json.dumps(nb), encoding="utf-8")
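
    # kernel-metadata.json is what `kaggle kernels push` reads from the push
    # directory: kernel ID, accelerator toggles, and attached datasets.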
meta = {
"id": KERNEL_ID,
"title": "bee-train-online-tpu",
"code_file": "bee-train-online-tpu.ipynb",
"language": "python",
"kernel_type": "notebook",
"is_private": True,
"enable_gpu": False,
"enable_tpu": True,
"enable_internet": True,
"keywords": [],
"dataset_sources": [SECRETS_DATASET],
"kernel_sources": [],
"competition_sources": [],
"model_sources": [],
}
(PUSH_DIR / "kernel-metadata.json").write_text(json.dumps(meta, indent=2), encoding="utf-8")
print(f"wrote {PUSH_DIR}/bee-train-online-tpu.ipynb ({len(cell_source)} chars in cell)")
print(f"dataset_sources: [{SECRETS_DATASET}]")
    # Force TPU v6e-8 explicitly; same lesson as the GPU side, where the
# CLI silently fell back to the default if the accelerator string
# didn't match Kaggle's expected enum. `TpuV6E8` is the documented
# Kaggle API value as of 2026-05.
res = subprocess.run(
["kaggle", "kernels", "push", "-p", str(PUSH_DIR),
"--accelerator", "TpuV6E8"],
capture_output=True, text=True,
)
print(res.stdout.strip())
if res.returncode != 0:
print(res.stderr.strip(), file=sys.stderr)
        sys.exit(res.returncode)


if __name__ == "__main__":
main()