"""Bootstrap a private Kaggle Dataset that holds the secrets the Bee
training kernel needs.
WHY THIS EXISTS
---------------
Kaggle's `UserSecretsClient` (Add-ons → Secrets) is UI-managed. Bindings
between a kernel and a secret are NOT preserved when the kernel is pushed
via the Kaggle CLI / API — and the cron at /api/cron/kaggle-dispatch
pushes on every tick. So every cron-driven run loses access to its
secrets and aborts.
The fix: store the same secrets in a PRIVATE Kaggle Dataset and attach
that dataset to the kernel via `kernel-metadata.json`'s `dataset_sources`.
Dataset attachments DO survive CLI pushes (they're part of the metadata
file the kernel itself owns).
Security delta vs Kaggle Secrets:
- Kaggle Secrets: encrypted at rest by Kaggle; UI-only management.
- Private Dataset: cleartext file inside a private Kaggle Dataset;
only readable by the dataset owner (you) and the cron's
KAGGLE_KEY (also yours). For a single-tenant private kernel,
practically equivalent. Both gated by Kaggle authentication.
Run locally with HF_TOKEN + CRON_SECRET + KAGGLE creds in env:
HF_TOKEN=... CRON_SECRET=... \\
python scripts/bootstrap_kaggle_secrets.py
"""
from __future__ import annotations
import json
import os
import subprocess
import sys
import tempfile
from pathlib import Path
DATASET_OWNER = "ceocxx"
DATASET_SLUG = "bee-secrets"
DATASET_TITLE = "Bee training kernel secrets (private)"
def main() -> None:
    """Create (or rotate) the private `bee-secrets` Kaggle Dataset.

    Reads HF_TOKEN and CRON_SECRET (or BEE_CRON_SECRET) from the
    environment, writes them to a `secrets.json` in a temporary
    directory next to a `dataset-metadata.json`, then uploads with the
    Kaggle CLI: `datasets create` first, falling back to
    `datasets version` when the dataset already exists.

    Raises:
        SystemExit: if either required env var is missing, or (via
            sys.exit) if both the create and the version CLI calls fail.
    """
    hf_token = os.environ.get("HF_TOKEN", "")
    cron_secret = os.environ.get("CRON_SECRET") or os.environ.get("BEE_CRON_SECRET", "")
    if not hf_token or not cron_secret:
        raise SystemExit(
            "Both HF_TOKEN and CRON_SECRET (or BEE_CRON_SECRET) env vars are required."
        )
    secrets = {
        "hf_token": hf_token,
        "cron_secret": cron_secret,
        # Not actually secret — but kept here so the kernel has only ONE place
        # to look. Update both Vercel env and this dataset if the workspace
        # URL ever moves.
        "ingest_url": "https://workspace.bee.cuilabs.io/api/training/runs",
        "next_domain_url": "https://workspace.bee.cuilabs.io/api/training/next-domain",
    }
    # Stage the payload in a throwaway directory so nothing lands on disk
    # outside the upload; the Kaggle CLI consumes the whole directory.
    with tempfile.TemporaryDirectory() as tmp:
        d = Path(tmp)
        (d / "secrets.json").write_text(json.dumps(secrets, indent=2), encoding="utf-8")
        (d / "dataset-metadata.json").write_text(
            json.dumps({
                "title": DATASET_TITLE,
                "id": f"{DATASET_OWNER}/{DATASET_SLUG}",
                "licenses": [{"name": "other"}],
                "subtitle": "Cleartext secrets attached to bee-train-online — private only.",
                "description": (
                    "PRIVATE. Holds the HF write token and CRON bearer that "
                    "the Bee training kernel needs. This dataset is attached "
                    "to ceocxx/bee-train-online via the kernel-metadata.json "
                    "dataset_sources field. Do not make public."
                ),
                "isPrivate": True,
                "keywords": [],
            }, indent=2),
            encoding="utf-8",
        )
        # First call creates; subsequent calls error — fall back to version.
        create = subprocess.run(
            ["kaggle", "datasets", "create", "-p", str(d)],
            capture_output=True, text=True,
        )
        out = (create.stdout + create.stderr).strip()
        print(out)
        if create.returncode != 0:
            print("create failed — trying `datasets version` (rotates existing)")
            ver = subprocess.run(
                ["kaggle", "datasets", "version", "-p", str(d), "-m",
                 "rotate bee-secrets", "--dir-mode", "zip"],
                capture_output=True, text=True,
            )
            print((ver.stdout + ver.stderr).strip())
            if ver.returncode != 0:
                sys.exit(ver.returncode)
# Entry point: run the bootstrap when executed as a script; importing this
# module performs no side effects.
if __name__ == "__main__":
    main()