#!/usr/bin/env python3
# scripts/push_kaggle_kernel.py — Bee deploy helper
# (HF Space backend deploy, commits de0cba5 / 5e21013)
"""Build and push the bee-train-online Kaggle kernel from local source.
Source of truth: workers/kaggle-online-train/train.py (the content between
`# === KAGGLE-PASTE START ===` and `# === KAGGLE-PASTE END ===`).
This script wraps it in a one-cell .ipynb, writes the kernel-metadata.json
with the bee-secrets dataset attached, and runs `kaggle kernels push`.
The push triggers a fresh run on Kaggle's GPU.
Why dataset_sources matters here: Kaggle Secrets (UI-only) are stripped
on every CLI push. We attach the secrets dataset via metadata so the
kernel always has access to its tokens — see scripts/bootstrap_kaggle_secrets.py.
Pre-flight guard: refuses to push if the kernel is already running or
queued (unless --force). This prevents the "4 concurrent sessions"
failure mode where each manual `python scripts/push_kaggle_kernel.py`
spins up a new container alongside live ones, wasting Kaggle quota.
"""
from __future__ import annotations
import argparse
import json
import re
import subprocess
import sys
from pathlib import Path
# Repo root is two levels above this file (repo/scripts/push_kaggle_kernel.py).
REPO_ROOT = Path(__file__).resolve().parent.parent
# Source of truth for the kernel's single code cell (see module docstring).
SOURCE = REPO_ROOT / "workers/kaggle-online-train/train.py"
# Scratch directory where the .ipynb and kernel-metadata.json are staged for push.
PUSH_DIR = Path("/tmp/bee-kaggle-push")
# Kaggle kernel slug ("user/kernel-name") targeted by status checks and pushes.
KERNEL_ID = "ceocxx/bee-train-online"
# Private dataset carrying the kernel's tokens; must be re-attached on every push.
SECRETS_DATASET = "ceocxx/bee-secrets"
def kernel_status(kernel_id: str) -> str:
    """Look up the kernel's current Kaggle status, lower-cased.

    An empty string means the check itself failed — the caller decides
    whether to proceed or bail.  Status values seen in the wild:
    "complete", "running", "queued", "cancel_requested",
    "cancel_acknowledged", "error".
    """
    cmd = ["kaggle", "kernels", "status", kernel_id]
    try:
        proc = subprocess.run(cmd, capture_output=True, text=True, timeout=30, check=False)
        # Expected output line:
        #   ceocxx/bee-train-online has status "KernelWorkerStatus.RUNNING"
        found = re.search(r'status\s+"KernelWorkerStatus\.([A-Z_]+)"', proc.stdout)
        if found is None:
            return ""
        return found.group(1).lower()
    except Exception:
        # Best-effort probe: a missing CLI, timeout, or API hiccup all
        # collapse to "unknown" rather than crashing the push script.
        return ""
def _extract_cell_source() -> str:
    """Return the notebook cell body: code between the KAGGLE-PASTE markers.

    Exits with an error if the markers are missing from train.py, since
    pushing an empty or partial cell would silently break the kernel.
    """
    src = SOURCE.read_text(encoding="utf-8")
    m = re.search(
        r"# === KAGGLE-PASTE START ===\n(.*?)# === KAGGLE-PASTE END ===",
        src,
        re.DOTALL,
    )
    if not m:
        sys.exit("paste markers not found in workers/kaggle-online-train/train.py")
    return m.group(1).rstrip() + "\n"


def _build_notebook(cell_source: str) -> dict:
    """Wrap *cell_source* in a minimal one-code-cell nbformat-4 notebook."""
    return {
        "metadata": {
            "kernelspec": {"language": "python", "display_name": "Python 3", "name": "python3"},
            "language_info": {
                "pygments_lexer": "ipython3",
                "nbconvert_exporter": "python",
                "version": "3.12",
                "file_extension": ".py",
                "codemirror_mode": {"name": "ipython", "version": 3},
                "name": "python",
                "mimetype": "text/x-python",
            },
            "kaggle": {
                "accelerator": "nvidiaTeslaT4",
                "dataSources": [{"sourceType": "datasetVersion", "datasetId": SECRETS_DATASET}],
                "isInternetEnabled": True,
                "language": "python",
                "sourceType": "notebook",
                "isGpuEnabled": True,
            },
        },
        "nbformat_minor": 4,
        "nbformat": 4,
        "cells": [
            {"cell_type": "code", "source": cell_source, "metadata": {"trusted": True},
             "outputs": [], "execution_count": None}
        ],
    }


def _kernel_metadata() -> dict:
    """Build the kernel-metadata.json payload.

    dataset_sources must list the secrets dataset on every push: Kaggle
    strips UI-configured secrets on CLI pushes (see module docstring).
    """
    return {
        "id": KERNEL_ID,
        "title": "bee-train-online",
        "code_file": "bee-train-online.ipynb",
        "language": "python",
        "kernel_type": "notebook",
        "is_private": True,
        "enable_gpu": True,
        "enable_tpu": False,
        "enable_internet": True,
        "keywords": [],
        "dataset_sources": [SECRETS_DATASET],
        "kernel_sources": [],
        "competition_sources": [],
        "model_sources": [],
    }


def _push_kernel() -> None:
    """Run `kaggle kernels push` on PUSH_DIR; exit non-zero on failure."""
    # Force T4 — the documented Kaggle API accelerator string is
    # `NvidiaTeslaT4` (PascalCase). Our previous `gpuT4x2` was silently
    # rejected by the API, falling back to the P100 default. Modern
    # torch (2.10+ cu128) wheels dropped sm_60 support, so any P100
    # allocation = `cudaErrorNoKernelImageForDevice` at first CUDA call.
    # Verified against Kaggle CLI 2.1.0 / API docs 2026-05-01.
    res = subprocess.run(
        ["kaggle", "kernels", "push", "-p", str(PUSH_DIR),
         "--accelerator", "NvidiaTeslaT4"],
        capture_output=True, text=True,
    )
    print(res.stdout.strip())
    if res.returncode != 0:
        print(res.stderr.strip(), file=sys.stderr)
        sys.exit(res.returncode)


def main() -> None:
    """Build the one-cell kernel notebook from local source and push it.

    Orchestrates: CLI arg parsing, the concurrent-session pre-flight
    guard, cell extraction, notebook/metadata staging, and the push.
    Exits 2 when refusing to push over a live session.
    """
    parser = argparse.ArgumentParser(description=__doc__)
    parser.add_argument(
        "--force",
        action="store_true",
        help="Push even if the kernel is currently running/queued (use with care).",
    )
    args = parser.parse_args()

    # Pre-flight guard: never stack a second live session unless forced.
    status = kernel_status(KERNEL_ID)
    if status in {"running", "queued"} and not args.force:
        print(
            f"[refuse] {KERNEL_ID} status={status!r} — pushing now would create "
            f"a duplicate session and waste Kaggle quota.\n"
            f"    Use --force to override, or wait for the current run "
            f"to finish (the cron will pick up automatically).",
            file=sys.stderr,
        )
        sys.exit(2)
    if status:
        print(f"[ok] {KERNEL_ID} status={status!r} — proceeding to push.")

    cell_source = _extract_cell_source()
    PUSH_DIR.mkdir(parents=True, exist_ok=True)
    (PUSH_DIR / "bee-train-online.ipynb").write_text(
        json.dumps(_build_notebook(cell_source)), encoding="utf-8"
    )
    (PUSH_DIR / "kernel-metadata.json").write_text(
        json.dumps(_kernel_metadata(), indent=2), encoding="utf-8"
    )
    print(f"wrote {PUSH_DIR}/bee-train-online.ipynb ({len(cell_source)} chars in cell)")
    print(f"dataset_sources: [{SECRETS_DATASET}]")
    _push_kernel()
# Script entry point — run the push only when executed directly, not on import.
if __name__ == "__main__":
    main()