Spaces:
Running
Running
| /** | |
| * LLM-based category inference for JS Reachy Mini apps. | |
| * | |
| * Pipeline (`categorizeApp`) | |
| * ────────────────────────── | |
| * 1. Fetch the Space's README from HF Hub (raw) | |
| * 2. Strip frontmatter, images, badges, raw HTML, then truncate | |
| * 3. Call a chat LLM via HF Inference Providers (OpenAI-compatible) | |
| * with the predefined taxonomy + the app's name/description | |
| * 4. Parse JSON, validate against ALLOWED_SLUGS, keep at most `MAX_CATEGORIES_PER_APP` (currently 1) | |
| * | |
| * Robustness contract | |
| * ─────────────────── | |
| * `categorizeApp` NEVER throws on transient failure (network, | |
| * 429, malformed JSON). It returns `null`, which the cache layer | |
| * interprets as "not yet categorized; retry on the next pass". | |
| * Hard errors (HF_TOKEN missing) are signalled by a thrown | |
| * `HfTokenMissingError` so the caller can short-circuit the | |
| * whole batch. | |
| */ | |
| import { | |
| buildLlmCategoryList, | |
| sanitizeSlugs, | |
| } from './categories.js'; | |
| // HF Inference Providers - OpenAI-compatible router. Auto-routes | |
| // the request to whichever provider currently serves the model | |
| // (Together, Nebius, Fireworks, Sambanova...). The token must | |
| // have `Inference Providers` access (default for all PRO and | |
| // most FREE tokens since 2025). | |
| const HF_INFERENCE_URL = 'https://router.huggingface.co/v1/chat/completions'; | |
| // 8B model: cheap, fast (~1 s per call), more than enough for a | |
| // closed-list single-label classification with good descriptions. | |
| // If quality drifts we can swap to 70B without touching anything | |
| // else - the prompt is generic. | |
| const DEFAULT_MODEL = 'meta-llama/Llama-3.1-8B-Instruct'; | |
| // README budget: max characters of cleaned README kept by `cleanReadme`. | |
| const README_MAX_CHARS = 3000; | |
| // Single-label classification: each app gets EXACTLY ONE slug - | |
| // the dominant one. The shape stays `string[]` for forward | |
| // compatibility (if we ever revert to multi-label, no API break), | |
| // but the array always contains 0 or 1 entry. Mobile chips and | |
| // "swipers per category" thus surface each app once and only once. | |
| const MAX_CATEGORIES_PER_APP = 1; | |
| // LLM call budget: abort timeout (ms), completion token cap, sampling temperature. | |
| const LLM_TIMEOUT_MS = 30_000; | |
| const LLM_MAX_TOKENS = 120; | |
| const LLM_TEMPERATURE = 0; | |
/**
 * Hard-failure signal: the `HF_TOKEN` environment variable is absent,
 * so no request to HF Inference Providers can succeed. Thrown (rather
 * than returned as `null`) so a batch caller can stop immediately.
 */
export class HfTokenMissingError extends Error {
  // Own property, so `err.name` identifies the class in logs.
  name = 'HfTokenMissingError';

  constructor() {
    super('HF_TOKEN env var is not set; cannot call HF Inference Providers.');
  }
}
/**
 * Fetch a Space's README from HF Hub.
 *
 * Best-effort by design: every failure mode (invalid id, non-2xx
 * status, network error, abort) yields `null` so the caller can fall
 * back to "name + description only".
 *
 * @param {string} spaceId - Space identifier, e.g. `owner/name`.
 * @param {{ signal?: AbortSignal }} [options] - Optional abort signal
 *   forwarded to `fetch`.
 * @returns {Promise<string | null>} Raw markdown, or `null` on failure.
 */
export async function fetchSpaceReadme(spaceId, { signal } = {}) {
  if (!spaceId || typeof spaceId !== 'string') return null;

  // The id is interpolated into a URL path. Reject empty / dot segments
  // (URL normalisation would let `..` escape `/spaces/`) and
  // percent-encode the rest so `?` or `#` cannot rewrite the request.
  const segments = spaceId.split('/');
  if (segments.some((seg) => seg === '' || seg === '.' || seg === '..')) {
    return null;
  }

  try {
    const safeId = segments.map((seg) => encodeURIComponent(seg)).join('/');
    // The README of a HF Space lives at /spaces/<id>/raw/main/README.md.
    // The `raw` endpoint returns the file as-is (no Hub UI wrapping)
    // and is requested here without any auth header.
    const url = `https://huggingface.co/spaces/${safeId}/raw/main/README.md`;
    const res = await fetch(url, { signal });
    if (!res.ok) return null;
    return await res.text();
  } catch {
    // Deliberate best-effort: network errors, aborts and malformed ids
    // (encodeURIComponent throws on lone surrogates) all mean "no README".
    return null;
  }
}
/**
 * Lightly clean a raw README so the LLM doesn't burn tokens on
 * boilerplate (HF frontmatter, badges, images) and so the actual
 * prose surfaces above the truncation budget.
 *
 * Transformations are conservative: surrounding prose is never
 * edited, only decorative tokens (badges, images, raw HTML tags)
 * are deleted.
 *
 * @param {unknown} raw - Raw README markdown; non-strings yield `''`.
 * @returns {string} Cleaned text, at most `README_MAX_CHARS` long.
 */
export function cleanReadme(raw) {
  if (!raw || typeof raw !== 'string') return '';

  // 0. Normalise: drop a leading BOM and convert CRLF / CR to LF so the
  //    LF-based patterns below also work on Windows-authored READMEs.
  let txt = raw.replace(/^\uFEFF/, '').replace(/\r\n?/g, '\n');

  // 1. Strip the YAML frontmatter at the very top (the
  //    `---\n...metadata...\n---` block is metadata, not prose).
  txt = txt.replace(/^---\n[\s\S]*?\n---\n?/, '');

  // 2. Strip badges: markdown links that wrap an image,
  //    `[![alt](img-url)](link-url)`. This MUST run before plain image
  //    stripping; otherwise the inner image is removed first and an
  //    empty `[](link-url)` residue is left behind.
  txt = txt.replace(/\[!\[[^\]]*\]\([^)]+\)\]\([^)]+\)/g, '');

  // 3. Drop remaining image markdown (`![alt](url)`) and HTML <img>
  //    tags. The alt text is occasionally useful but more often it's
  //    "demo.gif" - low signal/noise ratio.
  txt = txt.replace(/!\[[^\]]*\]\([^)]+\)/g, '');
  txt = txt.replace(/<img\b[^>]*>/gi, '');

  // 4. Generic HTML stripping (`<details>`, `<sub>`, `<center>`...).
  //    Keep the inner text, drop the tags.
  txt = txt.replace(/<\/?[a-zA-Z][^>]*>/g, '');

  // 5. Collapse runs of blank lines so the budget isn't spent on gaps.
  txt = txt.replace(/\n{3,}/g, '\n\n');

  // 6. Truncate. Prefer the last paragraph boundary at or before the
  //    budget, unless that would throw away more than half of it.
  if (txt.length > README_MAX_CHARS) {
    const cut = txt.lastIndexOf('\n\n', README_MAX_CHARS);
    txt = txt.slice(0, cut > README_MAX_CHARS / 2 ? cut : README_MAX_CHARS);
  }
  return txt.trim();
}
/**
 * Reference examples embedded in the system prompt.
 *
 * Every tuple captures a mistake the v1 prompt made during the
 * 24-app eval (see `scripts/evaluate-prompt-v2.py`). Keep the list
 * short - past ~10 examples the model starts pattern-matching
 * literally on the example names rather than applying the rules.
 *
 * Tuple layout: [name, description, expected_slugs, brief_justification]
 */
const FEW_SHOT_EXAMPLES = [
  // --- `dev-tools` veto cases ---
  [
    'Reachy Morse',
    "Send Morse code through Reachy's speaker.",
    ['dev-tools'],
    '(STEP 1 veto: pure technical artefact. NOT music.)',
  ],
  [
    'WebRTC Demo',
    'Minimal WebRTC connection between Reachy and the browser.',
    ['dev-tools'],
    '(STEP 1 veto: protocol demo. NOT vision.)',
  ],
  // --- user-facing apps ---
  [
    'TTS Reachy Mini',
    "Browser TTS that plays out of Reachy Mini's speaker.",
    ['voice'],
    '(USER-FACING speech output is voice, NOT dev-tools.)',
  ],
  [
    'Reachy Mochi - Emotional Companion',
    'Your pocket buddy that develops a mood and personality over time.',
    ['companion'],
    '(explicit emotional/companion framing)',
  ],
  [
    'Reachy Alive',
    '(README empty; name suggests autonomy and life-like presence)',
    ['companion'],
    "(USE THE NAME when the README is empty; 'alive' = companion-like)",
  ],
  [
    'Daily Surf Report',
    "Reachy reads today's surf report out loud.",
    ['voice'],
    '(NOT storytelling - a report has no narrative arc. NOT kids - surfing/sports are not kid-targeted.)',
  ],
  [
    'Music Quiz',
    'Play a blind test music game with a dancing Reachy.',
    ['music'],
    "(single dominant slug - music wins over games because the app is primarily a music blind-test; the dancing is a side effect of the music and is captured by `music` too)",
  ],
  [
    'Mime Bot',
    'Reachy mimics your face live from your webcam.',
    ['vision'],
    '(NOT companion - mimicry is visual, no emotional framing.)',
  ],
];
/**
 * Render `FEW_SHOT_EXAMPLES` as prompt text: two lines per example,
 * the quoted name/description followed by the expected JSON answer
 * and its justification.
 *
 * @returns {string} Newline-joined example block.
 */
function renderFewShot() {
  const lines = [];
  for (const [title, blurb, expected, note] of FEW_SHOT_EXAMPLES) {
    lines.push(` - ${JSON.stringify(title)}: ${JSON.stringify(blurb)}`);
    lines.push(` → {"categories": ${JSON.stringify(expected)}} ${note}`);
  }
  return lines.join('\n');
}
/**
 * Assemble the `[system, user]` chat messages sent to the LLM.
 *
 * The system prompt is written as a 3-step DECISION ALGORITHM
 * instead of a flat rule list, because the 8B-class model in use
 * follows imperative procedures more reliably than soft constraints.
 * STEP 1 (the `dev-tools` veto) keeps the model from mixing that slug
 * into user-facing apps. The taxonomy and the rendered few-shot
 * examples are appended after the rules.
 *
 * @param {{ name?: string, description?: string, readme?: string }} app
 *   Falsy fields are replaced by explicit placeholders in the user turn.
 * @returns {{ role: string, content: string }[]} System then user message.
 */
function buildMessages({ name, description, readme }) {
  const taxonomy = buildLlmCategoryList();
  const examples = renderFewShot();

  // NOTE: the prompt text is behaviour - keep it byte-for-byte stable.
  const system = `You classify a Reachy Mini robot app into a CLOSED list of categories.
OUTPUT FORMAT
Return ONLY a single JSON object: {"categories": ["slug"]}.
Pick EXACTLY ONE slug - the single dominant category that best
captures the app's primary identity. Use the EXACT slug. The list
always contains 0 or 1 entry.
No prose, no code fences, no commentary outside the JSON.
DECISION ALGORITHM (apply in order)
STEP 1 - \`dev-tools\` veto
Is this app a PURE technical artefact with no user-facing experience
beyond "here is how the SDK / API works"?
Examples that pass the veto: WebRTC demo, SDK probe, debug utility,
raw remote-control interface, dev-only test space.
Examples that DO NOT pass the veto (they are user-facing apps):
TTS players, voice chat, music apps, storytelling, companions -
even when the README is dev-heavy.
- YES -> return {"categories": ["dev-tools"]} and STOP.
- NO -> continue to STEP 2.
STEP 2 - Pick the SINGLE most dominant user-facing slug from the list
below. Choose the slug that captures the app's primary identity, not
every aspect it touches. When two slugs feel equally fitting, pick the
one that a user would name FIRST when describing the app in one word.
Examples of tie-breaks:
- music-driven dance party (Reachy dances to a song) -> \`music\`.
The music is what drives the experience.
- pure choreography / marionette / motion replay without music ->
\`motion\`. The movement is the experience.
- storytelling + kids app -> prefer \`kids\` if it explicitly targets
children, \`storytelling\` otherwise.
- vision + games app -> prefer \`games\` if there is a play loop,
\`vision\` if it is mostly a perception demo.
If the README is empty or very sparse, USE THE NAME AND DESCRIPTION
as the primary signal - do not bail to an empty list just because the
README is thin.
STEP 3 - Strict slug rules (each must hold, or DO NOT use the slug)
- \`companion\`: requires EXPLICIT emotional / personality / buddy
framing (companion, buddy, friend, mood, emotional, personality,
pet, Tamagotchi-like, "alive", "life companion"). Being friendly is
not enough.
- \`music\`: requires actual music - rhythm, melody, songs, beats, DJ
sets, instruments, music quizzes. Arbitrary audio (Morse, alarms,
TTS, sound effects) is NOT music.
- \`vision\`: requires the camera to DRIVE behaviour (tracking,
classification, mimicry). Merely streaming or displaying the camera
(WebRTC demos, remote-control viewers) is NOT vision.
- \`storytelling\`: requires a narrative ARC - plot, characters, scenes.
Daily reports, news, weather, Q&A are NOT storytelling (they are
\`voice\`).
- \`games\`: requires a play loop - score, rounds, win/lose, puzzles,
quizzes, dice/oracles, sports simulations.
- \`kids\`: requires kid-targeted framing (kids/children/curious minds/
bedtime/learning for kids) in the name or description. Lifestyle,
sports, weather, general conversation are NOT kids.
AVAILABLE CATEGORIES
${taxonomy}
REFERENCE EXAMPLES
${examples}
Do not include any text outside the JSON object.`;

  // User turn: one field per line, blank lines around the README excerpt.
  const userLines = [
    `App name: ${name || '(unknown)'}`,
    `Short description: ${description || '(none)'}`,
    '',
    `README excerpt:\n${readme || '(no README available)'}`,
    '',
    'Return the JSON now.',
  ];

  const systemMessage = { role: 'system', content: system };
  const userMessage = { role: 'user', content: userLines.join('\n') };
  return [systemMessage, userMessage];
}
/**
 * Best-effort JSON extraction. Some 8B models still wrap the answer
 * in ``` fences or prepend "Sure, here you go:". Scan for the first
 * balanced `{...}` block that actually parses as JSON.
 *
 * Two cases the naive brace counter got wrong are handled:
 *   - braces inside JSON string values (`{"why": "uses } here"}`);
 *   - a non-JSON `{...}` in the preamble before the real object
 *     (`Format is {slug}. Answer: {"categories": [...]}`).
 *
 * @param {unknown} text - Raw assistant reply.
 * @returns {object | null} Parsed object, or `null` if none is found.
 */
function extractJsonObject(text) {
  if (!text || typeof text !== 'string') return null;

  for (
    let start = text.indexOf('{');
    start !== -1;
    start = text.indexOf('{', start + 1)
  ) {
    const end = findBalancedObjectEnd(text, start);
    if (end === -1) continue; // unbalanced from here; a later `{` may still work
    try {
      return JSON.parse(text.slice(start, end + 1));
    } catch {
      // Balanced but not JSON (e.g. "{slug}" in prose): try the next candidate.
    }
  }
  return null;
}

/**
 * Index of the `}` that closes the `{` at `start`, skipping braces
 * that appear inside double-quoted strings; `-1` if never closed.
 */
function findBalancedObjectEnd(text, start) {
  let depth = 0;
  let inString = false;
  let escaped = false;
  for (let i = start; i < text.length; i++) {
    const ch = text[i];
    if (inString) {
      if (escaped) escaped = false;
      else if (ch === '\\') escaped = true;
      else if (ch === '"') inString = false;
      continue;
    }
    if (ch === '"') {
      inString = true;
    } else if (ch === '{') {
      depth++;
    } else if (ch === '}') {
      depth--;
      if (depth === 0) return i;
    }
  }
  return -1;
}
/**
 * Call the HF Inference Providers chat endpoint.
 *
 * @param {{ messages: object[], model: string, signal?: AbortSignal }} args
 * @returns {Promise<string | null>} The assistant message text, or
 *   `null` on any transient failure (network error / abort, non-2xx
 *   status, unparsable body, missing or non-string content). Each
 *   failure with diagnostic detail is logged via `console.warn`.
 * @throws {HfTokenMissingError} When `process.env.HF_TOKEN` is unset -
 *   a hard error no retry can fix.
 */
async function callLlm({ messages, model, signal }) {
  const token = process.env.HF_TOKEN;
  if (!token) throw new HfTokenMissingError();

  const body = {
    model,
    messages,
    temperature: LLM_TEMPERATURE,
    max_tokens: LLM_MAX_TOKENS,
    // `response_format` is honoured by some providers (Nebius,
    // Together) but ignored by others. It's a free upgrade when
    // present, harmless otherwise; callers still run a tolerant
    // JSON extractor on the reply.
    response_format: { type: 'json_object' },
  };

  let res;
  try {
    res = await fetch(HF_INFERENCE_URL, {
      method: 'POST',
      headers: {
        'Authorization': `Bearer ${token}`,
        'Content-Type': 'application/json',
      },
      body: JSON.stringify(body),
      signal,
    });
  } catch (err) {
    // Covers network failures and aborts triggered through `signal`.
    console.warn(`[categorize] LLM fetch failed: ${err?.message ?? err}`);
    return null;
  }

  if (!res.ok) {
    const detail = await res.text().catch(() => '');
    console.warn(
      `[categorize] LLM HTTP ${res.status}: ${detail.slice(0, 200)}`,
    );
    return null;
  }

  let json;
  try {
    json = await res.json();
  } catch (err) {
    // Log like the other failure paths instead of failing silently.
    console.warn(
      `[categorize] LLM response was not valid JSON: ${err?.message ?? err}`,
    );
    return null;
  }

  // Enforce the documented `string | null` contract: some providers
  // return structured (non-string) content, which callers cannot slice.
  const content = json?.choices?.[0]?.message?.content;
  return typeof content === 'string' ? content : null;
}
/**
 * Public entry point: fetch README -> clean -> ask the LLM -> validate.
 *
 * @param {object} [app]
 * @param {string} [app.name] - App display name.
 * @param {string} [app.description] - Short description.
 * @param {string} [app.spaceId] - HF Space id; required.
 * @param {string} [app.model] - Chat model id (defaults to `DEFAULT_MODEL`).
 * @returns {Promise<string[] | null>} Validated slugs, capped at
 *   `MAX_CATEGORIES_PER_APP`, or `null` on transient failure (missing
 *   `spaceId`, LLM unreachable, malformed reply) so the caller can mark
 *   the entry "needs retry" without writing a misleading empty list.
 *   An empty array `[]` means "the LLM looked and concluded none fit" -
 *   a valid, cacheable outcome.
 * @throws Whatever `callLlm` throws for hard errors (it is not caught here).
 */
export async function categorizeApp({
  name,
  description,
  spaceId,
  model = DEFAULT_MODEL,
} = {}) {
  if (!spaceId) return null;

  // Each network phase gets its OWN abort signal. With a single shared
  // controller, a README fetch that times out would hand the LLM call an
  // already-aborted signal, so the "name + description only" fallback
  // could never actually run.
  const rawReadme = await runWithTimeout(LLM_TIMEOUT_MS, (signal) =>
    fetchSpaceReadme(spaceId, { signal }),
  );
  const readme = cleanReadme(rawReadme);
  const messages = buildMessages({ name, description, readme });

  const reply = await runWithTimeout(LLM_TIMEOUT_MS, (signal) =>
    callLlm({ messages, model, signal }),
  );
  if (reply == null) return null;
  if (typeof reply !== 'string') {
    // Guard the `.slice()` below: a non-string reply must not throw.
    console.warn(`[categorize] ${spaceId}: non-string LLM reply`);
    return null;
  }

  const obj = extractJsonObject(reply);
  if (!obj || !Array.isArray(obj.categories)) {
    console.warn(
      `[categorize] ${spaceId}: malformed LLM reply (truncated): ` +
        `${reply.slice(0, 120)}`,
    );
    return null;
  }
  return sanitizeSlugs(obj.categories, MAX_CATEGORIES_PER_APP);
}

/**
 * Run `task(signal)` with an abort signal that fires after `ms`
 * milliseconds; the timer is always cleared once the task settles.
 */
async function runWithTimeout(ms, task) {
  const ctrl = new AbortController();
  const timeoutId = setTimeout(() => ctrl.abort(), ms);
  try {
    return await task(ctrl.signal);
  } finally {
    clearTimeout(timeoutId);
  }
}