File size: 9,603 Bytes
0441ab1
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
/**
 * Persistent cache for inferred app categories, backed by a
 * HuggingFace dataset.
 *
 * Why a dataset (not a local file)
 * ────────────────────────────────
 * The website runs in a Docker HF Space. The container's
 * filesystem is wiped on every rebuild (and rebuilds happen
 * on every push, every model update, every Space restart).
 * Re-running 200 LLM calls every cold start would be wasteful
 * and slow the user-visible /api/js-apps for the first 30 s.
 *
 * Pushing the cache to a dataset gives us:
 *   1. Persistence across rebuilds and machine moves
 *   2. A versioned audit log of how categories evolve
 *   3. A single source of truth other tooling can consume
 *      (the mobile shell could even read the dataset directly
 *      if it ever wanted to bypass the website).
 *
 * Storage shape
 * ─────────────
 *   <dataset>/categories.json
 *
 *   {
 *     "version": 1,
 *     "taxonomyVersion": 1,
 *     "updatedAt": "2026-05-10T11:08:42Z",
 *     "entries": {
 *       "<spaceId>": {
 *         "lastModified": "2026-05-08T22:13:01Z",
 *         "categories": ["storytelling", "kids", "voice"],
 *         "categorizedAt": "2026-05-10T11:08:42Z",
 *         "taxonomyVersion": 1
 *       }
 *     }
 *   }
 *
 * In-memory tier
 * ──────────────
 * The Map<spaceId, entry> is the hot path. The dataset is
 * loaded once at boot and only flushed when entries actually
 * change (the warmup batch buffers writes and flushes once
 * at the end). All synchronous access goes through the Map.
 */

import { commit, createRepo } from '@huggingface/hub';

import { TAXONOMY_VERSION } from './categories.js';

// Default location: a per-user dataset that the HF_TOKEN owner
// definitely has write access to. Override with the
// `HF_CATEGORIES_DATASET` env var when promoting to the
// org-owned `pollen-robotics/...` dataset.
const DEFAULT_DATASET = 'tfrere/reachy-mini-app-categories';

// Path of the single JSON file inside the dataset repo. The same
// path is used to build the download URL in load() and as the
// commit target in flush().
const CACHE_FILE_PATH = 'categories.json';
// Written as the top-level `version` field by serialize().
// NOTE(review): load() does not currently read this field back,
// so bumping it alone does not invalidate existing files.
const CACHE_FORMAT_VERSION = 1;

/**
 * In-memory map of `spaceId -> category entry`, mirrored to a single
 * JSON file in a HuggingFace dataset.
 *
 * Lifecycle: `load()` once at boot, `get()` / `needsCategorization()`
 * / `set()` synchronously against the Map, then `flush()` to push the
 * whole Map as one commit when something changed.
 */
class CategoryCache {
  constructor() {
    /** @type {Map<string, {lastModified: string|null, categories: string[], categorizedAt: string|null, taxonomyVersion: number}>} */
    this.entries = new Map();
    this.repoName = process.env.HF_CATEGORIES_DATASET || DEFAULT_DATASET;
    // True as soon as a load has been *started* (not finished).
    this.loaded = false;
    // Promise of the load currently in flight, or null. Lets
    // concurrent load() callers wait for the same download
    // instead of returning early against an empty Map.
    this.loadPromise = null;
    // True when the Map holds changes not yet committed.
    this.dirty = false;
    // Concurrency guard for `flush()` - we never want two
    // commit() calls fighting for the same parent commit.
    this.flushing = false;
  }

  /**
   * Load the dataset cache into memory. Best-effort: a missing
   * dataset, a 404, or a malformed JSON all collapse to "start
   * fresh, the warmup will repopulate". We never let cache load
   * failure block the server boot.
   *
   * Only the first call triggers a download. Calls made while that
   * download is still running resolve when it finishes; calls made
   * afterwards resolve immediately.
   *
   * @returns {Promise<void>} Never rejects.
   */
  async load() {
    if (this.loadPromise) return this.loadPromise;
    if (this.loaded) return undefined;
    this.loaded = true;
    this.loadPromise = this.#fetchRemoteEntries().finally(() => {
      this.loadPromise = null;
    });
    return this.loadPromise;
  }

  /**
   * Download the cache file and copy every usable entry into the
   * Map. All failures are logged and swallowed.
   *
   * @returns {Promise<void>}
   */
  async #fetchRemoteEntries() {
    const url = `https://huggingface.co/datasets/${this.repoName}/resolve/main/${CACHE_FILE_PATH}`;
    try {
      const res = await fetch(url, {
        // Send the token even on a public dataset: it lets HF
        // bump our rate limit and keeps the path identical for
        // a future private dataset migration.
        headers: process.env.HF_TOKEN
          ? { Authorization: `Bearer ${process.env.HF_TOKEN}` }
          : undefined,
      });
      if (!res.ok) {
        if (res.status === 404) {
          console.log(
            `[CategoryCache] Dataset ${this.repoName} or ${CACHE_FILE_PATH} ` +
              `not found yet - starting empty.`,
          );
        } else {
          console.warn(
            `[CategoryCache] HTTP ${res.status} loading cache from ` +
              `${this.repoName}, starting empty.`,
          );
        }
        return;
      }
      const data = await res.json();
      const entries = data?.entries || {};
      let kept = 0;
      let staleTaxonomy = 0;
      for (const [id, raw] of Object.entries(entries)) {
        if (!raw || typeof raw !== 'object') continue;
        // Drop entries from a previous taxonomy: their slugs
        // may no longer exist or may have shifted meaning.
        // The warmup will re-run them.
        if (raw.taxonomyVersion !== TAXONOMY_VERSION) {
          staleTaxonomy++;
          continue;
        }
        this.entries.set(id, {
          lastModified: raw.lastModified || null,
          // The file is remote, hand-editable JSON: keep only
          // string slugs so downstream code never sees junk.
          categories: Array.isArray(raw.categories)
            ? raw.categories.filter((slug) => typeof slug === 'string')
            : [],
          categorizedAt: raw.categorizedAt || null,
          taxonomyVersion: raw.taxonomyVersion,
        });
        kept++;
      }
      console.log(
        `[CategoryCache] Loaded ${kept} entries from ${this.repoName}` +
          (staleTaxonomy ? ` (dropped ${staleTaxonomy} stale taxonomy)` : ''),
      );
    } catch (err) {
      console.warn(
        `[CategoryCache] Load failed (${err?.message || err}); starting empty.`,
      );
    }
  }

  /**
   * @param {string} spaceId
   * @returns {object|null} The cached entry (the live object held
   *   by the Map, not a copy), or null when there is none.
   */
  get(spaceId) {
    return this.entries.get(spaceId) || null;
  }

  /**
   * Decide whether `spaceId` needs a fresh classification call.
   * It does when:
   *   - we have no entry at all, OR
   *   - the Space's `lastModified` has moved past our cached one
   *     (the README may have changed - re-classify), OR
   *   - the taxonomy version moved (handled at load() time, but
   *     belt-and-braces for hot reloads).
   *
   * A falsy `lastModified` argument skips the freshness comparison.
   *
   * @param {string} spaceId
   * @param {string} [lastModified]
   * @returns {boolean}
   */
  needsCategorization(spaceId, lastModified) {
    const entry = this.entries.get(spaceId);
    if (!entry) return true;
    if (entry.taxonomyVersion !== TAXONOMY_VERSION) return true;
    if (lastModified && entry.lastModified !== lastModified) return true;
    return false;
  }

  /**
   * Store the categories for `spaceId` and mark the cache dirty.
   * Ignored when `categories` is not an array. The array is copied,
   * so later mutation by the caller does not leak into the cache.
   *
   * @param {string} spaceId
   * @param {{categories: string[], lastModified?: string|null}} value
   * @returns {void}
   */
  set(spaceId, { categories, lastModified }) {
    if (!Array.isArray(categories)) return;
    const next = {
      lastModified: lastModified || null,
      categories: [...categories],
      categorizedAt: new Date().toISOString(),
      taxonomyVersion: TAXONOMY_VERSION,
    };
    const prev = this.entries.get(spaceId);
    // Skip the dirty flag if nothing actually changed - avoids
    // a useless commit when a refresh confirms the same labels.
    if (
      prev &&
      prev.lastModified === next.lastModified &&
      prev.taxonomyVersion === next.taxonomyVersion &&
      JSON.stringify(prev.categories) === JSON.stringify(next.categories)
    ) {
      return;
    }
    this.entries.set(spaceId, next);
    this.dirty = true;
  }

  /**
   * Persist the in-memory cache to the dataset (one commit, one
   * file). No-op if nothing has changed since the last flush, if
   * another flush is already running, or if `HF_TOKEN` is unset.
   *
   * Auto-creates the dataset on first write if it doesn't exist
   * yet (so a brand-new `HF_CATEGORIES_DATASET` value bootstraps
   * cleanly without manual setup).
   *
   * @returns {Promise<void>} Never rejects; failures are logged and
   *   the cache stays dirty so a later call retries.
   */
  async flush() {
    if (!this.dirty || this.flushing) return;
    if (!process.env.HF_TOKEN) {
      console.warn('[CategoryCache] HF_TOKEN missing; skipping flush.');
      return;
    }
    this.flushing = true;
    // Lower the flag *before* taking the snapshot. A set() that
    // lands while the commit is in flight raises it again, so the
    // change is picked up by the next flush() instead of being
    // wiped by a late `dirty = false`.
    this.dirty = false;
    try {
      const payload = this.serialize();
      const entryCount = this.entries.size;
      const blob = new Blob([JSON.stringify(payload, null, 2)], {
        type: 'application/json',
      });

      const repo = { type: 'dataset', name: this.repoName };
      const credentials = { accessToken: process.env.HF_TOKEN };

      // First attempt: plain commit. If the dataset doesn't
      // exist yet, the SDK throws and we fall through to
      // create-then-commit. We never assume the dataset exists
      // - that lets a fresh deploy auto-bootstrap.
      try {
        await commit({
          repo,
          credentials,
          title: `Update categories (${entryCount} apps)`,
          operations: [
            {
              operation: 'addOrUpdate',
              path: CACHE_FILE_PATH,
              content: blob,
            },
          ],
        });
      } catch (err) {
        const msg = err?.message || '';
        const looksMissing =
          msg.includes('404') ||
          msg.toLowerCase().includes('not found') ||
          msg.toLowerCase().includes('does not exist');
        if (!looksMissing) throw err;
        console.log(
          `[CategoryCache] Dataset ${this.repoName} missing - creating it.`,
        );
        await createRepo({
          repo,
          credentials,
          private: false,
          // Re-using the same blob so the initial commit ships
          // the cache content (instead of an empty repo
          // followed by a no-op commit).
          files: [
            {
              path: CACHE_FILE_PATH,
              content: await blob.arrayBuffer(),
            },
          ],
        });
      }

      console.log(
        `[CategoryCache] Flushed ${entryCount} entries to ${this.repoName}`,
      );
    } catch (err) {
      // We deliberately swallow flush errors so a HF outage
      // doesn't break the running server. Nothing was persisted,
      // so put the dirty flag back: the next flush() retries
      // with whatever the Map holds by then.
      this.dirty = true;
      console.error(
        `[CategoryCache] Flush failed: ${err?.message || err}`,
      );
    } finally {
      this.flushing = false;
    }
  }

  /**
   * Build the JSON-ready document written to the dataset.
   *
   * @returns {{version: number, taxonomyVersion: number, updatedAt: string, entries: Object<string, object>}}
   */
  serialize() {
    return {
      version: CACHE_FORMAT_VERSION,
      taxonomyVersion: TAXONOMY_VERSION,
      updatedAt: new Date().toISOString(),
      // fromEntries defines own properties, so an id such as
      // `__proto__` is kept instead of silently setting the
      // object's prototype the way `obj[id] = entry` would.
      entries: Object.fromEntries(this.entries),
    };
  }

  /**
   * Diagnostic snapshot for /api/js-apps's `categorization`
   * sub-payload. Lets the mobile shell decide whether to show
   * "loading categories..." or to render the chips immediately.
   *
   * @returns {{total: number, dataset: string, taxonomyVersion: number}}
   */
  stats() {
    return {
      total: this.entries.size,
      dataset: this.repoName,
      taxonomyVersion: TAXONOMY_VERSION,
    };
  }
}

// Singleton: there's only one cache per server process.
// Constructing it only initialises fields (no network access);
// nothing is fetched until load() is called on this instance.
export const categoryCache = new CategoryCache();