Spaces:

pollen-robotics
/

Reachy_Mini

Running

App Files Files Community

Reachy_Mini / server /categorize.js

tfrere's picture

tfrere HF Staff

feat(categories): rename `dance` to `motion`, fold dance-with-music into `music`

b38029b about 2 months ago

History Blame Contribute Delete

14.9 kB

	/**
	* LLM-based category inference for JS Reachy Mini apps.
	*
	* Pipeline (`categorizeApp`)
	* ──────────────────────────
	* 1. Fetch the Space's README from HF Hub (raw)
	* 2. Strip frontmatter, images, badges, raw HTML, then truncate
	* 3. Call a chat LLM via HF Inference Providers (OpenAI-compatible)
	* with the predefined taxonomy + the app's name/description
	* 4. Parse JSON, validate against ALLOWED_SLUGS, keep up to
	* MAX_CATEGORIES_PER_APP entries (currently 1)
	*
	* Robustness contract
	* ───────────────────
	* `categorizeApp` NEVER throws on transient failure (network,
	* 429, malformed JSON). It returns `null`, which the cache layer
	* interprets as "not yet categorized; retry on the next pass".
	* Hard errors (HF_TOKEN missing) are signalled by a thrown
	* `HfTokenMissingError` so the caller can short-circuit the
	* whole batch.
	*/

	import {
	buildLlmCategoryList,
	sanitizeSlugs,
	} from './categories.js';

	// HF Inference Providers - OpenAI-compatible router. Auto-routes
	// the request to whichever provider currently serves the model
	// (Together, Nebius, Fireworks, Sambanova...). The token must
	// have `Inference Providers` access (default for all PRO and
	// most FREE tokens since 2025).
	// This is the URL `callLlm` POSTs the chat-completion request to.
	const HF_INFERENCE_URL = 'https://router.huggingface.co/v1/chat/completions';

	// 8B model: cheap, fast (~1 s per call), more than enough for a
	// closed-list multi-label classification with good descriptions.
	// If quality drifts we can swap to 70B without touching anything
	// else - the prompt is generic.
	// Used as the default value of `categorizeApp`'s `model` option.
	const DEFAULT_MODEL = 'meta-llama/Llama-3.1-8B-Instruct';

	// README budget: maximum number of characters of cleaned README
	// text that `cleanReadme` keeps before it truncates.
	const README_MAX_CHARS = 3000;

	// Single-label classification: each app gets EXACTLY ONE slug -
	// the dominant one. The shape stays `string[]` for forward
	// compatibility (if we ever revert to multi-label, no API break),
	// but the array always contains 0 or 1 entry. Mobile chips and
	// "swipers per category" thus surface each app once and only once.
	// Passed as the cap to `sanitizeSlugs` in `categorizeApp`.
	const MAX_CATEGORIES_PER_APP = 1;

	// LLM call budget.
	// - LLM_TIMEOUT_MS: delay after which `categorizeApp` aborts its
	//   AbortController. The same signal is handed to both the README
	//   fetch and the LLM request, so this bounds the whole call, not
	//   just the LLM round-trip.
	// - LLM_MAX_TOKENS: sent as `max_tokens` in the request body.
	// - LLM_TEMPERATURE: sent as `temperature` in the request body.
	const LLM_TIMEOUT_MS = 30_000;
	const LLM_MAX_TOKENS = 120;
	const LLM_TEMPERATURE = 0;

	/**
	 * Error raised when the `HF_TOKEN` environment variable is absent.
	 *
	 * Takes no constructor arguments: the message is fixed, and `name`
	 * is set on every instance so callers can match on it.
	 */
	export class HfTokenMissingError extends Error {
	  // Own instance property, assigned right after `super()` returns.
	  name = 'HfTokenMissingError';

	  constructor() {
	    super('HF_TOKEN env var is not set; cannot call HF Inference Providers.');
	  }
	}

	/**
	 * Fetch a Space's README from HF Hub.
	 *
	 * Best-effort by design: every failure mode (invalid id, non-2xx
	 * status, network error, aborted signal) collapses to `null` so the
	 * caller can fall back to "name + description only", which is still
	 * enough signal for the LLM on most apps.
	 *
	 * @param {string} spaceId - Space identifier, interpolated as-is
	 *   into the `/spaces/<id>/raw/main/README.md` path.
	 * @param {{ signal?: AbortSignal }} [options] - `signal` is
	 *   forwarded to `fetch` for cancellation.
	 * @returns {Promise<string|null>} Raw markdown, or `null` on any
	 *   failure.
	 */
	export async function fetchSpaceReadme(spaceId, { signal } = {}) {
	  if (!spaceId || typeof spaceId !== 'string') return null;

	  // The `raw` endpoint returns the file as-is (no Hub UI wrapping)
	  // and is anonymous-friendly, so no auth header is sent here.
	  const url = `https://huggingface.co/spaces/${spaceId}/raw/main/README.md`;

	  try {
	    const res = await fetch(url, { signal });
	    if (!res.ok) return null;
	    return await res.text();
	  } catch {
	    // Deliberate swallow: a missing README is a normal, non-fatal case.
	    return null;
	  }
	}

	/**
	 * Lightly clean a raw README so the LLM doesn't burn tokens on
	 * boilerplate (HF frontmatter, badges, images) and so the actual
	 * prose surfaces above the truncation budget.
	 *
	 * Transformations are conservative: the surrounding prose is never
	 * edited, only decorative tokens (badges, images, raw HTML tags)
	 * are deleted.
	 *
	 * @param {string} raw - Raw README markdown. Any non-string or empty
	 *   value yields `''`.
	 * @param {{ maxChars?: number }} [options] - `maxChars` is the
	 *   character budget applied before the final trim; it defaults to
	 *   `README_MAX_CHARS`.
	 * @returns {string} Cleaned, truncated and trimmed text.
	 */
	export function cleanReadme(raw, { maxChars = README_MAX_CHARS } = {}) {
	  if (!raw || typeof raw !== 'string') return '';

	  // 0. Normalise CRLF / lone CR to LF. The regexes below are written
	  //    against `\n`, so a Windows-authored README would otherwise
	  //    keep its frontmatter and its runs of blank lines.
	  let txt = raw.replace(/\r\n?/g, '\n');

	  // 1. Strip the YAML frontmatter at the very top (HF Spaces ship a
	  //    mandatory `---\n...metadata...\n---` block whose fields are
	  //    already exposed to us via the catalog payload, so feeding
	  //    them to the LLM is pure noise).
	  txt = txt.replace(/^---\n[\s\S]*?\n---\n?/, '');

	  // 2. Strip shields.io / GitHub badges (markdown links that wrap an
	  //    image). This MUST run before the plain-image pass: otherwise
	  //    the inner `![alt](img)` is removed first, this pattern never
	  //    matches, and a dangling `[](link)` is left behind.
	  txt = txt.replace(/\[!\[[^\]]*\]\([^)]+\)\]\([^)]+\)/g, '');

	  // 3. Drop image markdown (`![alt](url)`) and HTML <img> tags.
	  //    The alt text is sometimes useful but more often it's
	  //    "demo.gif" - low signal/noise ratio.
	  txt = txt.replace(/!\[[^\]]*\]\([^)]+\)/g, '');
	  txt = txt.replace(/<img\b[^>]*>/gi, '');

	  // 4. Generic HTML stripping (`<details>`, `<sub>`, `<center>`...).
	  //    Keep the inner text, drop the tags.
	  txt = txt.replace(/<\/?[a-zA-Z][^>]*>/g, '');

	  // 5. Collapse runs of blank lines so the budget isn't spent on
	  //    the gaps.
	  txt = txt.replace(/\n{3,}/g, '\n\n');

	  // 6. Truncate. Prefer the last paragraph boundary at or before the
	  //    budget, but only if it sits in the second half of the budget;
	  //    otherwise hard-cut at the budget.
	  if (txt.length > maxChars) {
	    const cut = txt.lastIndexOf('\n\n', maxChars);
	    txt = txt.slice(0, cut > maxChars / 2 ? cut : maxChars);
	  }

	  return txt.trim();
	}

	/**
	* Few-shot examples woven into the system prompt.
	*
	* Each entry encodes a pitfall the v1 prompt fell into during the
	* 24-app eval (see `scripts/evaluate-prompt-v2.py`). Keep this list
	* tight - past ~10 examples the model starts pattern-matching
	* literally on the example names rather than applying the rules.
	*
	* Format: [name, description, expected_slugs, brief_justification]
	*
	* How `renderFewShot` consumes each tuple:
	* - name, description: JSON-stringified and printed on the first line
	* - expected_slugs: JSON-stringified into `{"categories": [...]}`
	* - brief_justification: appended verbatim after the JSON, so its
	* wording (including the parentheses) is part of the prompt text
	*
	* Every string below therefore ends up in the system prompt; editing
	* one changes what the model sees.
	*/
	const FEW_SHOT_EXAMPLES = [
	// Arbitrary audio output that must not be mistaken for music.
	[
	'Reachy Morse',
	"Send Morse code through Reachy's speaker.",
	['dev-tools'],
	'(STEP 1 veto: pure technical artefact. NOT music.)',
	],
	// Protocol demo that must not be mistaken for vision.
	[
	'WebRTC Demo',
	'Minimal WebRTC connection between Reachy and the browser.',
	['dev-tools'],
	'(STEP 1 veto: protocol demo. NOT vision.)',
	],
	// User-facing speech output: voice, not dev-tools.
	[
	'TTS Reachy Mini',
	"Browser TTS that plays out of Reachy Mini's speaker.",
	['voice'],
	'(USER-FACING speech output is voice, NOT dev-tools.)',
	],
	// Explicit companion framing in the description.
	[
	'Reachy Mochi - Emotional Companion',
	'Your pocket buddy that develops a mood and personality over time.',
	['companion'],
	'(explicit emotional/companion framing)',
	],
	// Empty README: classify from the name alone.
	[
	'Reachy Alive',
	'(README empty; name suggests autonomy and life-like presence)',
	['companion'],
	"(USE THE NAME when the README is empty; 'alive' = companion-like)",
	],
	// Spoken report: neither storytelling nor kids.
	[
	'Daily Surf Report',
	"Reachy reads today's surf report out loud.",
	['voice'],
	'(NOT storytelling - a report has no narrative arc. ' +
	'NOT kids - surfing/sports are not kid-targeted.)',
	],
	// Tie-break between music and games: single dominant slug.
	[
	'Music Quiz',
	'Play a blind test music game with a dancing Reachy.',
	['music'],
	'(single dominant slug - music wins over games because the app ' +
	"is primarily a music blind-test; the dancing is a side effect " +
	'of the music and is captured by `music` too)',
	],
	// Camera-driven mimicry: vision, not companion.
	[
	'Mime Bot',
	'Reachy mimics your face live from your webcam.',
	['vision'],
	'(NOT companion - mimicry is visual, no emotional framing.)',
	],
	];

	/**
	 * Render `FEW_SHOT_EXAMPLES` as the text block embedded in the
	 * system prompt.
	 *
	 * Each example becomes two lines: a bullet with the JSON-quoted name
	 * and description, then an arrow line with the expected JSON answer
	 * followed by the justification hint.
	 *
	 * @returns {string} All example lines joined with `\n`.
	 */
	function renderFewShot() {
	  const lines = [];
	  for (const [title, blurb, expected, rationale] of FEW_SHOT_EXAMPLES) {
	    const quotedTitle = JSON.stringify(title);
	    const quotedBlurb = JSON.stringify(blurb);
	    const answer = JSON.stringify(expected);
	    lines.push(` - ${quotedTitle}: ${quotedBlurb}`);
	    lines.push(` → {"categories": ${answer}} ${rationale}`);
	  }
	  return lines.join('\n');
	}

	/**
	 * Build the chat messages handed to the LLM.
	 *
	 * The system prompt is structured as a 3-step DECISION ALGORITHM
	 * rather than a flat list of rules, because the 8B-class model we
	 * use (Llama-3.1-8B-Instruct) follows imperative procedures more
	 * reliably than soft constraints. The `dev-tools` veto in STEP 1
	 * is what stops the model from silently combining it with other
	 * slugs on user-facing apps.
	 *
	 * The few-shot examples below the rules cover the v1 pitfalls
	 * (companion hallucinations, music-on-audio, kids-on-personas,
	 * storytelling-on-reports). Keep the set small - more examples
	 * start over-fitting on example wording.
	 *
	 * The prompt text is reproduced verbatim; do not reflow it, since
	 * every character is part of what the model sees.
	 *
	 * @param {{ name?: string, description?: string, readme?: string }} app
	 *   Falsy fields are replaced by explicit placeholders
	 *   (`(unknown)`, `(none)`, `(no README available)`).
	 * @returns {{ role: string, content: string }[]} A system message
	 *   followed by a user message.
	 */
	function buildMessages({ name, description, readme }) {
	  const taxonomy = buildLlmCategoryList();
	  const examples = renderFewShot();
	  const system = `You classify a Reachy Mini robot app into a CLOSED list of categories.

	OUTPUT FORMAT
	Return ONLY a single JSON object: {"categories": ["slug"]}.
	Pick EXACTLY ONE slug - the single dominant category that best
	captures the app's primary identity. Use the EXACT slug. The list
	always contains 0 or 1 entry.
	No prose, no code fences, no commentary outside the JSON.

	DECISION ALGORITHM (apply in order)

	STEP 1 - \`dev-tools\` veto
	Is this app a PURE technical artefact with no user-facing experience
	beyond "here is how the SDK / API works"?
	Examples that pass the veto: WebRTC demo, SDK probe, debug utility,
	raw remote-control interface, dev-only test space.
	Examples that DO NOT pass the veto (they are user-facing apps):
	TTS players, voice chat, music apps, storytelling, companions -
	even when the README is dev-heavy.
	- YES -> return {"categories": ["dev-tools"]} and STOP.
	- NO -> continue to STEP 2.

	STEP 2 - Pick the SINGLE most dominant user-facing slug from the list
	below. Choose the slug that captures the app's primary identity, not
	every aspect it touches. When two slugs feel equally fitting, pick the
	one that a user would name FIRST when describing the app in one word.
	Examples of tie-breaks:
	- music-driven dance party (Reachy dances to a song) -> \`music\`.
	The music is what drives the experience.
	- pure choreography / marionette / motion replay without music ->
	\`motion\`. The movement is the experience.
	- storytelling + kids app -> prefer \`kids\` if it explicitly targets
	children, \`storytelling\` otherwise.
	- vision + games app -> prefer \`games\` if there is a play loop,
	\`vision\` if it is mostly a perception demo.
	If the README is empty or very sparse, USE THE NAME AND DESCRIPTION
	as the primary signal - do not bail to an empty list just because the
	README is thin.

	STEP 3 - Strict slug rules (each must hold, or DO NOT use the slug)
	- \`companion\`: requires EXPLICIT emotional / personality / buddy
	framing (companion, buddy, friend, mood, emotional, personality,
	pet, Tamagotchi-like, "alive", "life companion"). Being friendly is
	not enough.
	- \`music\`: requires actual music - rhythm, melody, songs, beats, DJ
	sets, instruments, music quizzes. Arbitrary audio (Morse, alarms,
	TTS, sound effects) is NOT music.
	- \`vision\`: requires the camera to DRIVE behaviour (tracking,
	classification, mimicry). Merely streaming or displaying the camera
	(WebRTC demos, remote-control viewers) is NOT vision.
	- \`storytelling\`: requires a narrative ARC - plot, characters, scenes.
	Daily reports, news, weather, Q&A are NOT storytelling (they are
	\`voice\`).
	- \`games\`: requires a play loop - score, rounds, win/lose, puzzles,
	quizzes, dice/oracles, sports simulations.
	- \`kids\`: requires kid-targeted framing (kids/children/curious minds/
	bedtime/learning for kids) in the name or description. Lifestyle,
	sports, weather, general conversation are NOT kids.

	AVAILABLE CATEGORIES
	${taxonomy}

	REFERENCE EXAMPLES
	${examples}

	Do not include any text outside the JSON object.`;

	  // `||` (not `??`) on purpose: an empty string should also fall back
	  // to the placeholder rather than leave a blank field in the prompt.
	  const user =
	    `App name: ${name || '(unknown)'}\n` +
	    `Short description: ${description || '(none)'}\n\n` +
	    `README excerpt:\n${readme || '(no README available)'}\n\n` +
	    'Return the JSON now.';

	  return [
	    { role: 'system', content: system },
	    { role: 'user', content: user },
	  ];
	}

	/**
	 * Locate the `}` that closes the `{` at `start`.
	 *
	 * Braces inside JSON string literals are skipped (including after
	 * escaped quotes), so a value such as `"}"` does not end the scan
	 * early.
	 *
	 * @param {string} text - Text to scan.
	 * @param {number} start - Index of an opening `{`.
	 * @returns {number} Index of the matching `}`, or -1 if unbalanced.
	 */
	function findClosingBrace(text, start) {
	  let depth = 0;
	  let inString = false;
	  let escaped = false;
	  for (let i = start; i < text.length; i++) {
	    const ch = text[i];
	    if (inString) {
	      if (escaped) escaped = false;
	      else if (ch === '\\') escaped = true;
	      else if (ch === '"') inString = false;
	      continue;
	    }
	    if (ch === '"') {
	      inString = true;
	    } else if (ch === '{') {
	      depth++;
	    } else if (ch === '}') {
	      depth--;
	      if (depth === 0) return i;
	    }
	  }
	  return -1;
	}

	/**
	 * Best-effort JSON extraction. Some 8B models still wrap the
	 * answer in ``` fences or prepend "Sure, here you go:". We scan
	 * for balanced `{...}` blocks and return the first one that parses.
	 *
	 * A candidate that is unbalanced or fails `JSON.parse` (e.g. a
	 * `{placeholder}` in leading prose) is skipped and the scan resumes
	 * from the next `{`, instead of giving up on the whole reply.
	 *
	 * @param {string} text - Raw assistant reply.
	 * @returns {object|null} The parsed object, or `null` if the input
	 *   is not a non-empty string or contains no parseable block.
	 */
	function extractJsonObject(text) {
	  if (!text || typeof text !== 'string') return null;

	  let start = text.indexOf('{');
	  while (start !== -1) {
	    const end = findClosingBrace(text, start);
	    if (end !== -1) {
	      try {
	        return JSON.parse(text.slice(start, end + 1));
	      } catch {
	        // Not valid JSON from this brace; try the next candidate.
	      }
	    }
	    start = text.indexOf('{', start + 1);
	  }
	  return null;
	}

	/**
	 * Call the HF Inference Providers chat endpoint.
	 *
	 * Transient failures (network error, abort, non-2xx status,
	 * unparseable body, missing or non-string `content`) are logged
	 * where useful and reported as `null`.
	 *
	 * @param {{ messages: object[], model: string, signal?: AbortSignal }} args
	 *   `messages` and `model` are sent in the request body; `signal`
	 *   is forwarded to `fetch`.
	 * @returns {Promise<string|null>} The assistant message text, or
	 *   `null` on any transient failure.
	 * @throws {HfTokenMissingError} When `process.env.HF_TOKEN` is not
	 *   set - a hard configuration error, checked before any request.
	 */
	async function callLlm({ messages, model, signal }) {
	  const token = process.env.HF_TOKEN;
	  if (!token) throw new HfTokenMissingError();

	  const body = {
	    model,
	    messages,
	    temperature: LLM_TEMPERATURE,
	    max_tokens: LLM_MAX_TOKENS,
	    // `response_format` is honoured by some providers (Nebius,
	    // Together) but ignored by others. It's a free upgrade when
	    // present, harmless otherwise; the caller's JSON extraction is
	    // the real safety net.
	    response_format: { type: 'json_object' },
	  };

	  let res;
	  try {
	    res = await fetch(HF_INFERENCE_URL, {
	      method: 'POST',
	      headers: {
	        'Authorization': `Bearer ${token}`,
	        'Content-Type': 'application/json',
	      },
	      body: JSON.stringify(body),
	      signal,
	    });
	  } catch (err) {
	    console.warn(`[categorize] LLM fetch failed: ${err.message}`);
	    return null;
	  }

	  if (!res.ok) {
	    // Reading the error body is itself best-effort.
	    const detail = await res.text().catch(() => '');
	    console.warn(
	      `[categorize] LLM HTTP ${res.status}: ${detail.slice(0, 200)}`,
	    );
	    return null;
	  }

	  let json;
	  try {
	    json = await res.json();
	  } catch (err) {
	    console.warn(`[categorize] LLM response is not valid JSON: ${err.message}`);
	    return null;
	  }

	  // Only hand back real text: callers slice and parse the reply as
	  // a string, so any other content shape is treated as a failure.
	  const content = json?.choices?.[0]?.message?.content;
	  return typeof content === 'string' ? content : null;
	}

	/**
	 * Public entry point: infer the category slugs of one app.
	 *
	 * Returns a string[] of validated slugs (at most
	 * `MAX_CATEGORIES_PER_APP` items), or `null` on transient failure
	 * so the caller can mark the entry "needs retry" without writing a
	 * misleading empty list.
	 *
	 * Treat an empty array `[]` as "the LLM looked and concluded
	 * none fit" - that's a valid, cacheable outcome.
	 *
	 * A single AbortController with an `LLM_TIMEOUT_MS` timer covers
	 * both the README fetch and the LLM request.
	 *
	 * @param {object} [app]
	 * @param {string} [app.name] - App display name, fed to the prompt.
	 * @param {string} [app.description] - Short description, fed to the
	 *   prompt.
	 * @param {string} [app.spaceId] - Space id used to fetch the README;
	 *   when falsy the function returns `null` immediately.
	 * @param {string} [app.model] - Model id; defaults to `DEFAULT_MODEL`.
	 * @returns {Promise<string[]|null>} Validated slugs, or `null`.
	 */
	export async function categorizeApp({
	  name,
	  description,
	  spaceId,
	  model = DEFAULT_MODEL,
	} = {}) {
	  if (!spaceId) return null;

	  const ctrl = new AbortController();
	  const timeoutId = setTimeout(() => ctrl.abort(), LLM_TIMEOUT_MS);

	  try {
	    const rawReadme = await fetchSpaceReadme(spaceId, { signal: ctrl.signal });
	    const readme = cleanReadme(rawReadme);

	    const messages = buildMessages({ name, description, readme });
	    const reply = await callLlm({ messages, model, signal: ctrl.signal });
	    if (reply == null) return null;

	    // A non-string reply is treated like any other malformed answer;
	    // `String(reply)` keeps the log line from throwing on it.
	    const obj = typeof reply === 'string' ? extractJsonObject(reply) : null;
	    if (!obj || !Array.isArray(obj.categories)) {
	      console.warn(
	        `[categorize] ${spaceId}: malformed LLM reply (truncated): ` +
	          `${String(reply).slice(0, 120)}`,
	      );
	      return null;
	    }
	    return sanitizeSlugs(obj.categories, MAX_CATEGORIES_PER_APP);
	  } finally {
	    // Always release the timer so it cannot keep the process alive.
	    clearTimeout(timeoutId);
	  }
	}