Spaces:
Running
Running
| /** | |
| * Persistent cache for inferred app categories, backed by a | |
| * HuggingFace dataset. | |
| * | |
| * Why a dataset (not a local file) | |
| * ──────────────────────────────── | |
| * The website runs in a Docker HF Space. The container's | |
| * filesystem is wiped on every rebuild (and rebuilds happen | |
| * on every push, every model update, every Space restart). | |
| * Re-running 200 LLM calls every cold start would be wasteful | |
| * and slow the user-visible /api/js-apps for the first 30 s. | |
| * | |
| * Pushing the cache to a dataset gives us: | |
| * 1. Persistence across rebuilds and machine moves | |
| * 2. A versioned audit log of how categories evolve | |
| * 3. A single source of truth other tooling can consume | |
| * (the mobile shell could even read the dataset directly | |
| * if it ever wanted to bypass the website). | |
| * | |
| * Storage shape | |
| * ───────────── | |
| * <dataset>/categories.json | |
| * | |
| * { | |
| * "version": 1, | |
| * "taxonomyVersion": 1, | |
| * "updatedAt": "2026-05-10T11:08:42Z", | |
| * "entries": { | |
| * "<spaceId>": { | |
| * "lastModified": "2026-05-08T22:13:01Z", | |
| * "categories": ["storytelling", "kids", "voice"], | |
| * "categorizedAt": "2026-05-10T11:08:42Z", | |
| * "taxonomyVersion": 1 | |
| * } | |
| * } | |
| * } | |
| * | |
| * In-memory tier | |
| * ────────────── | |
| * The Map<spaceId, entry> is the hot path. The dataset is | |
| * loaded once at boot and only flushed when entries actually | |
| * change (the warmup batch buffers writes and flushes once | |
| * at the end). All synchronous access goes through the Map. | |
| */ | |
| import { commit, createRepo } from '@huggingface/hub'; | |
| import { TAXONOMY_VERSION } from './categories.js'; | |
| // Default location: a per-user dataset that the HF_TOKEN owner | |
| // definitely has write access to. Override with the env var | |
| // when promoting to the org-owned `pollen-robotics/...` dataset. | |
| const DEFAULT_DATASET = 'tfrere/reachy-mini-app-categories'; | |
| const CACHE_FILE_PATH = 'categories.json'; | |
| const CACHE_FORMAT_VERSION = 1; | |
| class CategoryCache { | |
| constructor() { | |
| this.entries = new Map(); | |
| this.repoName = process.env.HF_CATEGORIES_DATASET || DEFAULT_DATASET; | |
| this.loaded = false; | |
| this.dirty = false; | |
| // Concurrency guard for `flush()` - we never want two | |
| // commit() calls fighting for the same parent commit. | |
| this.flushing = false; | |
| } | |
| /** | |
| * Load the dataset cache into memory. Best-effort: a missing | |
| * dataset, a 404, or a malformed JSON all collapse to "start | |
| * fresh, the warmup will repopulate". We never let cache load | |
| * failure block the server boot. | |
| */ | |
| async load() { | |
| if (this.loaded) return; | |
| this.loaded = true; | |
| const url = `https://huggingface.co/datasets/${this.repoName}/resolve/main/${CACHE_FILE_PATH}`; | |
| try { | |
| const res = await fetch(url, { | |
| // Send the token even on a public dataset: it lets HF | |
| // bump our rate limit and keeps the path identical for | |
| // a future private dataset migration. | |
| headers: process.env.HF_TOKEN | |
| ? { Authorization: `Bearer ${process.env.HF_TOKEN}` } | |
| : undefined, | |
| }); | |
| if (!res.ok) { | |
| if (res.status === 404) { | |
| console.log( | |
| `[CategoryCache] Dataset ${this.repoName} or ${CACHE_FILE_PATH} ` + | |
| `not found yet - starting empty.`, | |
| ); | |
| } else { | |
| console.warn( | |
| `[CategoryCache] HTTP ${res.status} loading cache from ` + | |
| `${this.repoName}, starting empty.`, | |
| ); | |
| } | |
| return; | |
| } | |
| const data = await res.json(); | |
| const entries = data?.entries || {}; | |
| let kept = 0; | |
| let staleTaxonomy = 0; | |
| for (const [id, raw] of Object.entries(entries)) { | |
| if (!raw || typeof raw !== 'object') continue; | |
| // Drop entries from a previous taxonomy: their slugs | |
| // may no longer exist or may have shifted meaning. | |
| // The warmup will re-run them. | |
| if (raw.taxonomyVersion !== TAXONOMY_VERSION) { | |
| staleTaxonomy++; | |
| continue; | |
| } | |
| this.entries.set(id, { | |
| lastModified: raw.lastModified || null, | |
| categories: Array.isArray(raw.categories) ? raw.categories : [], | |
| categorizedAt: raw.categorizedAt || null, | |
| taxonomyVersion: raw.taxonomyVersion, | |
| }); | |
| kept++; | |
| } | |
| console.log( | |
| `[CategoryCache] Loaded ${kept} entries from ${this.repoName}` + | |
| (staleTaxonomy ? ` (dropped ${staleTaxonomy} stale taxonomy)` : ''), | |
| ); | |
| } catch (err) { | |
| console.warn( | |
| `[CategoryCache] Load failed (${err.message}); starting empty.`, | |
| ); | |
| } | |
| } | |
| get(spaceId) { | |
| return this.entries.get(spaceId) || null; | |
| } | |
| /** | |
| * Decide whether `spaceId` needs a fresh classification call. | |
| * It does when: | |
| * - we have no entry at all, OR | |
| * - the Space's `lastModified` has moved past our cached one | |
| * (the README may have changed - re-classify), OR | |
| * - the taxonomy version moved (handled at load() time, but | |
| * belt-and-braces for hot reloads). | |
| */ | |
| needsCategorization(spaceId, lastModified) { | |
| const entry = this.entries.get(spaceId); | |
| if (!entry) return true; | |
| if (entry.taxonomyVersion !== TAXONOMY_VERSION) return true; | |
| if (lastModified && entry.lastModified !== lastModified) return true; | |
| return false; | |
| } | |
| set(spaceId, { categories, lastModified }) { | |
| if (!Array.isArray(categories)) return; | |
| const next = { | |
| lastModified: lastModified || null, | |
| categories: [...categories], | |
| categorizedAt: new Date().toISOString(), | |
| taxonomyVersion: TAXONOMY_VERSION, | |
| }; | |
| const prev = this.entries.get(spaceId); | |
| // Skip the dirty flag if nothing actually changed - avoids | |
| // a useless commit when a refresh confirms the same labels. | |
| if ( | |
| prev && | |
| prev.lastModified === next.lastModified && | |
| prev.taxonomyVersion === next.taxonomyVersion && | |
| JSON.stringify(prev.categories) === JSON.stringify(next.categories) | |
| ) { | |
| return; | |
| } | |
| this.entries.set(spaceId, next); | |
| this.dirty = true; | |
| } | |
| /** | |
| * Persist the in-memory cache to the dataset (one commit, one | |
| * file). No-op if nothing has changed since the last flush. | |
| * | |
| * Auto-creates the dataset on first write if it doesn't exist | |
| * yet (so a brand-new `HF_CATEGORIES_DATASET` value bootstraps | |
| * cleanly without manual setup). | |
| */ | |
| async flush() { | |
| if (!this.dirty || this.flushing) return; | |
| if (!process.env.HF_TOKEN) { | |
| console.warn('[CategoryCache] HF_TOKEN missing; skipping flush.'); | |
| return; | |
| } | |
| this.flushing = true; | |
| try { | |
| const payload = this.serialize(); | |
| const blob = new Blob([JSON.stringify(payload, null, 2)], { | |
| type: 'application/json', | |
| }); | |
| const repo = { type: 'dataset', name: this.repoName }; | |
| const credentials = { accessToken: process.env.HF_TOKEN }; | |
| // First attempt: plain commit. If the dataset doesn't | |
| // exist yet, the SDK throws and we fall through to | |
| // create-then-commit. We never assume the dataset exists | |
| // - that lets a fresh deploy auto-bootstrap. | |
| try { | |
| await commit({ | |
| repo, | |
| credentials, | |
| title: `Update categories (${this.entries.size} apps)`, | |
| operations: [ | |
| { | |
| operation: 'addOrUpdate', | |
| path: CACHE_FILE_PATH, | |
| content: blob, | |
| }, | |
| ], | |
| }); | |
| } catch (err) { | |
| const msg = err?.message || ''; | |
| const looksMissing = | |
| msg.includes('404') || | |
| msg.toLowerCase().includes('not found') || | |
| msg.toLowerCase().includes('does not exist'); | |
| if (!looksMissing) throw err; | |
| console.log( | |
| `[CategoryCache] Dataset ${this.repoName} missing - creating it.`, | |
| ); | |
| await createRepo({ | |
| repo, | |
| credentials, | |
| private: false, | |
| // Re-using the same blob so the initial commit ships | |
| // the cache content (instead of an empty repo | |
| // followed by a no-op commit). | |
| files: [ | |
| { | |
| path: CACHE_FILE_PATH, | |
| content: await blob.arrayBuffer(), | |
| }, | |
| ], | |
| }); | |
| } | |
| this.dirty = false; | |
| console.log( | |
| `[CategoryCache] Flushed ${this.entries.size} entries to ${this.repoName}`, | |
| ); | |
| } catch (err) { | |
| // We deliberately swallow flush errors so a HF outage | |
| // doesn't break the running server. The next set() will | |
| // re-flag dirty=true and the next flush() will retry. | |
| console.error( | |
| `[CategoryCache] Flush failed: ${err?.message || err}`, | |
| ); | |
| } finally { | |
| this.flushing = false; | |
| } | |
| } | |
| serialize() { | |
| const entries = {}; | |
| for (const [id, entry] of this.entries) { | |
| entries[id] = entry; | |
| } | |
| return { | |
| version: CACHE_FORMAT_VERSION, | |
| taxonomyVersion: TAXONOMY_VERSION, | |
| updatedAt: new Date().toISOString(), | |
| entries, | |
| }; | |
| } | |
| /** | |
| * Diagnostic snapshot for /api/js-apps's `categorization` | |
| * sub-payload. Lets the mobile shell decide whether to show | |
| * "loading categories..." or to render the chips immediately. | |
| */ | |
| stats() { | |
| return { | |
| total: this.entries.size, | |
| dataset: this.repoName, | |
| taxonomyVersion: TAXONOMY_VERSION, | |
| }; | |
| } | |
| } | |
| // Singleton: there's only one cache per server process. | |
| export const categoryCache = new CategoryCache(); | |