"""Bootstrap a private Kaggle Dataset that holds the secrets the Bee training
kernel needs.

WHY THIS EXISTS
---------------
Kaggle's `UserSecretsClient` (Add-ons → Secrets) is UI-managed. Bindings
between a kernel and a secret are NOT preserved when the kernel is pushed
via the Kaggle CLI / API — and the cron at /api/cron/kaggle-dispatch pushes
on every tick. So every cron-driven run loses access to its secrets and
aborts.

The fix: store the same secrets in a PRIVATE Kaggle Dataset and attach that
dataset to the kernel via `kernel-metadata.json`'s `dataset_sources`.
Dataset attachments DO survive CLI pushes (they're part of the metadata
file the kernel itself owns).

Security delta vs Kaggle Secrets:
  - Kaggle Secrets: encrypted at rest by Kaggle; UI-only management.
  - Private Dataset: cleartext file inside a private Kaggle Dataset; only
    readable by the dataset owner (you) and the cron's KAGGLE_KEY (also
    yours). For a single-tenant private kernel, practically equivalent.
    Both gated by Kaggle authentication.

Run locally with HF_TOKEN + CRON_SECRET + KAGGLE creds in env:

    HF_TOKEN=... CRON_SECRET=... \\
        python scripts/bootstrap_kaggle_secrets.py
"""

from __future__ import annotations

import json
import os
import subprocess
import sys
import tempfile
from pathlib import Path

DATASET_OWNER = "ceocxx"
DATASET_SLUG = "bee-secrets"
DATASET_TITLE = "Bee training kernel secrets (private)"

# Substring (lower-cased) of the messages the Kaggle CLI prints when the API
# rejects a request: "Dataset creation error: ..." and
# "Dataset version creation error: ...".
# NOTE(review): some Kaggle CLI releases appear to print these and still exit
# with status 0 — confirm against the installed CLI version. Matching the
# marker is harmless when the exit status is already non-zero.
_CLI_ERROR_MARKER = "creation error"


def _env(name: str) -> str:
    """Return environment variable *name* with surrounding whitespace removed.

    Unset variables yield an empty string, so callers can use plain
    truthiness to detect both "missing" and "blank".
    """
    return os.environ.get(name, "").strip()


def _dataset_metadata() -> dict:
    """Build the content of `dataset-metadata.json` for the secrets dataset."""
    return {
        "title": DATASET_TITLE,
        "id": f"{DATASET_OWNER}/{DATASET_SLUG}",
        "licenses": [{"name": "other"}],
        "subtitle": "Cleartext secrets attached to bee-train-online — private only.",
        "description": (
            "PRIVATE. Holds the HF write token and CRON bearer that "
            "the Bee training kernel needs. This dataset is attached "
            "to ceocxx/bee-train-online via the kernel-metadata.json "
            "dataset_sources field. Do not make public."
        ),
        "isPrivate": True,
        "keywords": [],
    }


def _run_kaggle(*args: str) -> tuple[int, str]:
    """Run `kaggle datasets <args>` and return `(exit_code, combined_output)`.

    The returned exit code is non-zero when the process failed OR when the
    CLI printed an API-level error despite a zero exit status.

    Raises:
        SystemExit: if the `kaggle` executable cannot be found on PATH.
    """
    try:
        proc = subprocess.run(
            ["kaggle", "datasets", *args],
            capture_output=True,
            text=True,
        )
    except FileNotFoundError:
        raise SystemExit(
            "The `kaggle` CLI was not found on PATH. Install it (pip install kaggle) "
            "and configure KAGGLE_USERNAME / KAGGLE_KEY."
        ) from None

    output = (proc.stdout + proc.stderr).strip()
    exit_code = proc.returncode
    if exit_code == 0 and _CLI_ERROR_MARKER in output.lower():
        # The CLI reported a failure in its output only; surface it as one.
        exit_code = 1
    return exit_code, output


def main() -> None:
    """Create the private secrets dataset, or push a new version of it.

    Reads HF_TOKEN and CRON_SECRET (falling back to BEE_CRON_SECRET) from the
    environment, writes them plus the two workspace URLs to `secrets.json` in
    a temporary directory, and uploads that directory with the Kaggle CLI.
    `kaggle datasets create` is tried first; if it fails, `kaggle datasets
    version` is used to rotate the existing dataset.

    Raises:
        SystemExit: if a required env var is missing or blank, if the
            `kaggle` CLI is not installed, or (with the CLI's exit code) if
            both the create and the version upload fail.
    """
    hf_token = _env("HF_TOKEN")
    cron_secret = _env("CRON_SECRET") or _env("BEE_CRON_SECRET")
    if not hf_token or not cron_secret:
        raise SystemExit(
            "Both HF_TOKEN and CRON_SECRET (or BEE_CRON_SECRET) env vars are required."
        )

    secrets = {
        "hf_token": hf_token,
        "cron_secret": cron_secret,
        # Not actually secret — but kept here so the kernel has only ONE place
        # to look. Update both Vercel env and this dataset if the workspace
        # URL ever moves.
        "ingest_url": "https://workspace.bee.cuilabs.io/api/training/runs",
        "next_domain_url": "https://workspace.bee.cuilabs.io/api/training/next-domain",
    }

    # The cleartext file only ever lives in a temporary directory that is
    # removed when the `with` block exits.
    with tempfile.TemporaryDirectory() as tmp:
        staging = Path(tmp)
        (staging / "secrets.json").write_text(
            json.dumps(secrets, indent=2), encoding="utf-8"
        )
        (staging / "dataset-metadata.json").write_text(
            json.dumps(_dataset_metadata(), indent=2), encoding="utf-8"
        )

        # First call creates; subsequent calls error → fall back to version.
        exit_code, output = _run_kaggle("create", "-p", str(staging))
        print(output)
        if exit_code == 0:
            return

        print("create failed → trying `datasets version` (rotates existing)")
        exit_code, output = _run_kaggle(
            "version",
            "-p", str(staging),
            "-m", "rotate bee-secrets",
            "--dir-mode", "zip",
        )
        print(output)
        if exit_code != 0:
            sys.exit(exit_code)


if __name__ == "__main__":
    main()