File size: 3,971 Bytes
5e21013
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
"""Bootstrap a private Kaggle Dataset that holds the secrets the Bee
training kernel needs.

WHY THIS EXISTS
---------------
Kaggle's `UserSecretsClient` (Add-ons → Secrets) is UI-managed. Bindings
between a kernel and a secret are NOT preserved when the kernel is pushed
via the Kaggle CLI / API — and the cron at /api/cron/kaggle-dispatch
pushes on every tick. So every cron-driven run loses access to its
secrets and aborts.

The fix: store the same secrets in a PRIVATE Kaggle Dataset and attach
that dataset to the kernel via `kernel-metadata.json`'s `dataset_sources`.
Dataset attachments DO survive CLI pushes (they're part of the metadata
file the kernel itself owns).

Security delta vs Kaggle Secrets:
  - Kaggle Secrets: encrypted at rest by Kaggle; UI-only management.
  - Private Dataset: cleartext file inside a private Kaggle Dataset;
    only readable by the dataset owner (you) and the cron's
    KAGGLE_KEY (also yours). For a single-tenant private kernel,
    practically equivalent. Both gated by Kaggle authentication.

Run locally with HF_TOKEN + CRON_SECRET + KAGGLE creds in env:

    HF_TOKEN=... CRON_SECRET=... \\
      python scripts/bootstrap_kaggle_secrets.py
"""
from __future__ import annotations

import json
import os
import subprocess
import sys
import tempfile
from pathlib import Path

# Kaggle account that owns the secrets dataset; first half of the dataset id.
DATASET_OWNER = "ceocxx"
# Dataset slug; joined with the owner as "<owner>/<slug>" to form the
# "id" field of dataset-metadata.json.
DATASET_SLUG = "bee-secrets"
# Title written into dataset-metadata.json.
DATASET_TITLE = "Bee training kernel secrets (private)"


def main() -> None:
    hf_token = os.environ.get("HF_TOKEN", "")
    cron_secret = os.environ.get("CRON_SECRET") or os.environ.get("BEE_CRON_SECRET", "")
    if not hf_token or not cron_secret:
        raise SystemExit(
            "Both HF_TOKEN and CRON_SECRET (or BEE_CRON_SECRET) env vars are required."
        )

    secrets = {
        "hf_token": hf_token,
        "cron_secret": cron_secret,
        # Not actually secret β€” but kept here so the kernel has only ONE place
        # to look. Update both Vercel env and this dataset if the workspace
        # URL ever moves.
        "ingest_url": "https://workspace.bee.cuilabs.io/api/training/runs",
        "next_domain_url": "https://workspace.bee.cuilabs.io/api/training/next-domain",
    }

    with tempfile.TemporaryDirectory() as tmp:
        d = Path(tmp)
        (d / "secrets.json").write_text(json.dumps(secrets, indent=2), encoding="utf-8")
        (d / "dataset-metadata.json").write_text(
            json.dumps({
                "title": DATASET_TITLE,
                "id": f"{DATASET_OWNER}/{DATASET_SLUG}",
                "licenses": [{"name": "other"}],
                "subtitle": "Cleartext secrets attached to bee-train-online β€” private only.",
                "description": (
                    "PRIVATE. Holds the HF write token and CRON bearer that "
                    "the Bee training kernel needs. This dataset is attached "
                    "to ceocxx/bee-train-online via the kernel-metadata.json "
                    "dataset_sources field. Do not make public."
                ),
                "isPrivate": True,
                "keywords": [],
            }, indent=2),
            encoding="utf-8",
        )
        # First call creates; subsequent calls error β†’ fall back to version.
        create = subprocess.run(
            ["kaggle", "datasets", "create", "-p", str(d)],
            capture_output=True, text=True,
        )
        out = (create.stdout + create.stderr).strip()
        print(out)
        if create.returncode != 0:
            print("create failed β†’ trying `datasets version` (rotates existing)")
            ver = subprocess.run(
                ["kaggle", "datasets", "version", "-p", str(d), "-m",
                 "rotate bee-secrets", "--dir-mode", "zip"],
                capture_output=True, text=True,
            )
            print((ver.stdout + ver.stderr).strip())
            if ver.returncode != 0:
                sys.exit(ver.returncode)


# Run the bootstrap only when this file is executed as a script, so that
# importing the module has no side effects.
if __name__ == "__main__":
    main()