reachy-mini-api / server /moderationCache.js
tfrere's picture
tfrere HF Staff
refactor(api): clearer config file names
3b73f6b
Raw
History Blame Contribute Delete
8.47 kB
/**
* Persistent cache for app moderation verdicts, backed by a
* HuggingFace dataset. Twin of `categoryCache.js` - same rationale
* (the Docker Space filesystem is wiped on every rebuild, so we
* persist to a dataset to avoid re-running the whole LLM sweep on
* each cold start), same in-memory-hot / dataset-cold tiering.
*
* Storage shape
* ─────────────
* <dataset>/cache/moderation.json
*
* {
* "version": 1,
* "policyVersion": 1,
* "updatedAt": "2026-05-29T09:00:00Z",
* "entries": {
* "<spaceId>": {
* "lastModified": "2026-05-08T22:13:01Z",
* "decision": "allow" | "block" | "review",
* "category": "none",
* "reason": "llm: allow/none",
* "severity": null | "hard" | "soft",
* "source": "regex" | "llm",
* "moderatedAt": "2026-05-29T09:00:00Z",
* "policyVersion": 1
* }
* }
* }
*
* `entries` holds the automated verdicts (regex + LLM), re-computed
* when a Space's README changes or the policy version bumps. The
* MANUAL killswitch lives elsewhere: a hand-edited
* `config/blocked-app-list.json` on the official dataset (see `index.js`),
* so anyone with dataset write access can block an app without
* touching this cache.
*/
import { commit, createRepo } from '@huggingface/hub';
import { MODERATION_POLICY_VERSION } from './moderate.js';
// Single store control-plane dataset (shared with official-app-list.json,
// blocked-app-list.json and categories.json - see index.js `STORE_DATASET`).
// The HF_TOKEN must have WRITE access here since this cache commits
// `moderation.json`. Precedence: a dedicated `HF_MODERATION_DATASET`
// wins (escape hatch), else the unified `STORE_DATASET`, else the
// pollen-robotics default.
//
// Fallback dataset id, used only when neither env var above is set.
const DEFAULT_DATASET = 'pollen-robotics/reachy_mini_store_data';
// Path of the cache file inside the dataset; `load()` reads it from the
// `main` revision and `flush()` commits to it.
const CACHE_FILE_PATH = 'cache/moderation.json';
// Written as the top-level `version` field by `serialize()`. Note that
// `load()` does not currently check this value when reading the file back.
const CACHE_FORMAT_VERSION = 1;
/**
 * In-memory map of moderation verdicts keyed by Space id, with
 * best-effort persistence to a HuggingFace dataset.
 *
 * Lifecycle: `load()` once at boot, `set()` as verdicts are computed,
 * `flush()` to persist whatever changed since the last successful flush.
 */
class ModerationCache {
  constructor() {
    this.entries = new Map();
    this.repoName =
      process.env.HF_MODERATION_DATASET ||
      process.env.STORE_DATASET ||
      DEFAULT_DATASET;
    this.loaded = false;
    this.dirty = false;
    this.flushing = false;
    // In-flight (or settled) promise of the one-time dataset load, so
    // concurrent `load()` callers all wait for the same hydration.
    this.loadPromise = null;
  }

  /**
   * Load the dataset cache into memory. Best-effort: a missing
   * dataset / 404 / malformed JSON collapses to "start empty, the
   * warmup repopulates". Never rejects.
   *
   * The load runs at most once per instance. Callers that arrive while
   * it is still in flight wait for that same load instead of returning
   * early with an empty cache.
   *
   * @returns {Promise<void>}
   */
  async load() {
    if (!this.loadPromise) {
      if (this.loaded) return;
      this.loaded = true;
      this.loadPromise = this.hydrateFromDataset();
    }
    await this.loadPromise;
  }

  /**
   * Fetch the cache file and copy its usable entries into `entries`.
   * Internal helper for `load()`; every failure is logged and swallowed.
   *
   * @returns {Promise<void>}
   */
  async hydrateFromDataset() {
    const url = `https://huggingface.co/datasets/${this.repoName}/resolve/main/${CACHE_FILE_PATH}`;
    try {
      const res = await fetch(url, {
        headers: process.env.HF_TOKEN
          ? { Authorization: `Bearer ${process.env.HF_TOKEN}` }
          : undefined,
      });
      if (!res.ok) {
        if (res.status === 404) {
          console.log(
            `[ModerationCache] Dataset ${this.repoName} or ${CACHE_FILE_PATH} not found yet - starting empty.`,
          );
        } else {
          console.warn(
            `[ModerationCache] HTTP ${res.status} loading cache from ${this.repoName}, starting empty.`,
          );
        }
        return;
      }
      const data = await res.json();
      const persisted = data?.entries || {};
      let kept = 0;
      let stale = 0;
      let malformed = 0;
      for (const [id, raw] of Object.entries(persisted)) {
        // `set()` never stores a verdict without a decision, so an entry
        // lacking one is corrupt. Skipping it lets `needsModeration()`
        // schedule a fresh verdict instead of pinning an empty one.
        if (!raw || typeof raw !== 'object' || !raw.decision) {
          malformed++;
          continue;
        }
        // Verdicts from an older policy version are dropped (the
        // prompt/regex moved, so they must be re-moderated).
        if (raw.policyVersion !== MODERATION_POLICY_VERSION) {
          stale++;
          continue;
        }
        this.entries.set(id, {
          lastModified: raw.lastModified || null,
          decision: raw.decision,
          category: raw.category || 'none',
          reason: raw.reason || '',
          severity: raw.severity ?? null,
          source: raw.source || 'llm',
          moderatedAt: raw.moderatedAt || null,
          policyVersion: raw.policyVersion,
        });
        kept++;
      }
      console.log(
        `[ModerationCache] Loaded ${kept} verdicts from ${this.repoName}` +
          (stale ? ` (dropped ${stale} stale policy)` : '') +
          (malformed ? ` (skipped ${malformed} malformed)` : ''),
      );
    } catch (err) {
      console.warn(
        `[ModerationCache] Load failed (${err.message}); starting empty.`,
      );
    }
  }

  /**
   * @param {string} spaceId
   * @returns {object|null} The cached verdict, or `null` when none exists.
   */
  get(spaceId) {
    return this.entries.get(spaceId) ?? null;
  }

  /**
   * Does `spaceId` need a fresh moderation call? Yes when we have no
   * verdict, the policy version moved, or the Space's `lastModified`
   * differs from our cached one (the README may have changed).
   *
   * @param {string} spaceId
   * @param {string} [lastModified] Current value for the Space; when
   *   falsy the freshness check is skipped.
   * @returns {boolean}
   */
  needsModeration(spaceId, lastModified) {
    const entry = this.entries.get(spaceId);
    if (!entry) return true;
    if (entry.policyVersion !== MODERATION_POLICY_VERSION) return true;
    if (lastModified && entry.lastModified !== lastModified) return true;
    return false;
  }

  /**
   * Record a verdict for `spaceId`. Ignored when `decision` is falsy.
   * The cache is only marked dirty when `lastModified`, policy version,
   * `decision` or `category` differ from the stored entry, so repeated
   * identical verdicts do not trigger a commit.
   *
   * @param {string} spaceId
   * @param {object} verdict
   */
  set(spaceId, { decision, category, reason, severity, source, lastModified }) {
    if (!decision) return;
    const next = {
      lastModified: lastModified || null,
      decision,
      category: category || 'none',
      reason: reason || '',
      severity: severity ?? null,
      source: source || 'llm',
      moderatedAt: new Date().toISOString(),
      policyVersion: MODERATION_POLICY_VERSION,
    };
    const prev = this.entries.get(spaceId);
    if (
      prev &&
      prev.lastModified === next.lastModified &&
      prev.policyVersion === next.policyVersion &&
      prev.decision === next.decision &&
      prev.category === next.category
    ) {
      return; // no material change - skip the dirty flag / commit
    }
    this.entries.set(spaceId, next);
    this.dirty = true;
  }

  /**
   * Persist the in-memory cache to the dataset (one commit, one
   * file). No-op when nothing changed, when a flush is already running,
   * or when `HF_TOKEN` is unset. Creates the dataset when the commit
   * reports it missing, so a fresh `HF_MODERATION_DATASET` bootstraps
   * cleanly. Failures are logged, never thrown.
   *
   * @returns {Promise<void>}
   */
  async flush() {
    if (!this.dirty || this.flushing) return;
    if (!process.env.HF_TOKEN) {
      console.warn('[ModerationCache] HF_TOKEN missing; skipping flush.');
      return;
    }
    this.flushing = true;
    // Clear the flag BEFORE taking the snapshot: a `set()` that lands
    // while the commit is in flight re-marks the cache dirty, and that
    // mark must survive so the next flush picks the new verdict up.
    this.dirty = false;
    try {
      const payload = this.serialize();
      const verdictCount = this.entries.size;
      const blob = new Blob([JSON.stringify(payload, null, 2)], {
        type: 'application/json',
      });
      const repo = { type: 'dataset', name: this.repoName };
      const credentials = { accessToken: process.env.HF_TOKEN };
      try {
        await commit({
          repo,
          credentials,
          title: `Update moderation (${verdictCount} verdicts)`,
          operations: [
            { operation: 'addOrUpdate', path: CACHE_FILE_PATH, content: blob },
          ],
        });
      } catch (err) {
        const msg = err?.message || '';
        const lowered = msg.toLowerCase();
        const looksMissing =
          err?.statusCode === 404 ||
          msg.includes('404') ||
          lowered.includes('not found') ||
          lowered.includes('does not exist');
        if (!looksMissing) throw err;
        console.log(
          `[ModerationCache] Dataset ${this.repoName} missing - creating it.`,
        );
        await createRepo({
          repo,
          credentials,
          private: false,
          files: [{ path: CACHE_FILE_PATH, content: await blob.arrayBuffer() }],
        });
      }
      console.log(
        `[ModerationCache] Flushed ${verdictCount} verdicts to ${this.repoName}`,
      );
    } catch (err) {
      // The snapshot never reached the dataset; keep it pending.
      this.dirty = true;
      console.error(`[ModerationCache] Flush failed: ${err?.message || err}`);
    } finally {
      this.flushing = false;
    }
  }

  /**
   * Build the JSON-serializable document stored in the dataset.
   *
   * @returns {{version: number, policyVersion: number, updatedAt: string, entries: object}}
   */
  serialize() {
    return {
      version: CACHE_FORMAT_VERSION,
      policyVersion: MODERATION_POLICY_VERSION,
      updatedAt: new Date().toISOString(),
      // `Object.fromEntries` defines own properties, so an unusual key
      // such as `__proto__` is stored as data rather than interpreted.
      entries: Object.fromEntries(this.entries),
    };
  }

  /**
   * Diagnostic snapshot for the `/api/js-apps` `moderation`
   * sub-payload. Counts are over the verdict cache only.
   *
   * @returns {{total: number, blocked: number, review: number, dataset: string, policyVersion: number}}
   */
  stats() {
    let blocked = 0;
    let review = 0;
    for (const { decision } of this.entries.values()) {
      if (decision === 'block') blocked++;
      else if (decision === 'review') review++;
    }
    return {
      total: this.entries.size,
      blocked,
      review,
      dataset: this.repoName,
      policyVersion: MODERATION_POLICY_VERSION,
    };
  }
}
// Singleton: one cache per server process. Every importer of this module
// receives this same instance. The constructor resolves the target dataset
// from the environment at construction time, i.e. when the module is first
// evaluated.
export const moderationCache = new ModerationCache();