Spaces:
Running
Running
File size: 2,940 Bytes
// Coding-model store for the Skill Forge. SEPARATE from runtime.js (the persona/diary
// "Text Generation Model") so picking a coding model never clobbers the writer model.
// All candidates are large (Mellum2 ~8GB, BLS Mini-Code 30B MoE, Nemotron-30B ~24GB) with no
// browser-viable build, so this is server-only (ZeroGPU sidecar or hosted NIM): every choice routes through the same server
// endpoint (/text/generate/stream) the `server` engine uses, by model id. Mellum2
// (TINY_MELLUM_SPACE) and BLS Mini-Code (TINY_BLS_CODE_SPACE) are ZeroGPU sidecars; Nemotron-30B
// routes through hosted NVIDIA NIM (NVIDIA_NIM_API_KEY) since it's too big to self-host.
import { statsTracker } from '/web/genStats.js'
import { streamSse } from '/web/sseText.js'
// Catalog of the coding models that can be selected. `id` is the value persisted for the
// selection and sent as `model` with each generation request; `label`, `params`, `backend`
// and `note` are descriptive strings. The first entry doubles as the lookup fallback.
const MODELS = [
  {
    id: 'nemotron-3-nano-30b-nim',
    label: 'Nemotron 3 Nano 30B-A3B',
    params: '30B (3B active)',
    backend: 'NVIDIA NIM',
    note: 'reasoning + agentic code (NVIDIA)',
  },
  {
    id: 'mellum2-zerogpu',
    label: 'Mellum2 12B-A2.5B',
    params: '12B (2.5B active)',
    backend: 'ZeroGPU sidecar',
    note: 'code model (JetBrains)',
  },
  {
    id: 'bls-mini-code-zerogpu',
    label: 'BLS Mini-Code 1.0',
    params: '30B MoE',
    backend: 'ZeroGPU sidecar',
    note: 'code model (Cohere); reasoning suppressed',
  },
]
// Model id selected when no valid choice has been persisted.
const DEFAULT = 'nemotron-3-nano-30b-nim'
// localStorage key under which the selected model id is stored.
const KEY = 'tinyarmy.codingModel'
// Look up a catalog entry by id; an unknown id resolves to the first catalog entry.
const get = (id) => {
  const match = MODELS.find((entry) => entry.id === id)
  return match ?? MODELS[0]
}
// Best-effort read of a string from localStorage. Returns '' when the key is absent, the
// stored value is empty, or storage access throws.
const loadStr = (k) => {
  let stored = null
  try {
    stored = localStorage.getItem(k)
  } catch {
    return ''
  }
  return stored || ''
}
// Work out the starting selection: the persisted id when it names a catalog entry,
// otherwise the default model.
const _initialSelection = () => {
  const saved = loadStr(KEY)
  const isKnown = MODELS.some(({ id }) => id === saved)
  return isKnown ? saved : DEFAULT
}
// Currently selected coding-model id (module state; reassigned by setCodingModel).
let _sel = _initialSelection()
// Callbacks to run after the selected coding model changes.
const _listeners = new Set()

/**
 * Subscribe to selection changes.
 * @param {Function} fn - called with no arguments each time listeners are notified
 * @returns {Function} unsubscribe; removes `fn` and returns whether it was still registered
 */
export function onCodingModelChange(fn) {
  _listeners.add(fn)
  return () => _listeners.delete(fn)
}

// Invoke every listener that was registered when notification started.
// Iterating a snapshot (rather than the live Set) matters: a Set revisits an entry that is
// deleted and re-added mid-iteration, so a listener that unsubscribes and re-subscribes
// itself would otherwise be called again and again without end. Listeners added during the
// pass wait for the next notification; listeners removed during the pass are skipped.
// A throwing listener is deliberately ignored so it cannot block the others.
const _notify = () => {
  for (const fn of [..._listeners]) {
    if (!_listeners.has(fn)) continue
    try { fn() } catch { /* ignore */ }
  }
}
/** @returns {Array<object>} the model catalog array itself (not a copy). */
export function listCodingModels() {
  return MODELS
}

/** @returns {string} id of the currently selected coding model. */
export function getCodingModelId() {
  return _sel
}

/** @returns {object} catalog entry resolved from the currently selected id. */
export function currentCodingModel() {
  return get(_sel)
}
/**
 * Select a coding model by id. Ids not present in the catalog, and the id that is
 * already selected, are ignored. Otherwise the selection is updated, persisted to
 * localStorage on a best-effort basis, and change listeners are notified.
 * @param {string} id - catalog id of the model to select
 */
export function setCodingModel(id) {
  const isKnown = MODELS.some((entry) => entry.id === id)
  if (!isKnown) return
  if (id === _sel) return

  _sel = id
  try {
    localStorage.setItem(KEY, id)
  } catch {
    /* ignore */
  }
  _notify()
}
/**
 * Stream a completion from the currently selected coding model.
 * Same delta protocol as engineServer.stream: only `delta` events carrying non-empty
 * `content` contribute to the result.
 *
 * @param {string} system - system prompt, forwarded in the request body
 * @param {string} user - user prompt, forwarded in the request body
 * @param {object} [options]
 * @param {number} [options.maxTokens=512] - sent as `max_tokens`
 * @param {number} [options.temperature=0.6] - sent as `temperature`
 * @param {boolean} [options.think=false] - true asks reasoning models (Nemotron, BLS) to
 *   surface their <think>…</think> trace instead of hiding it, so the caller can show it
 *   in a debug panel
 * @param {Function} [options.onToken] - called with each non-empty content piece
 * @param {Function} [options.onStats] - handed to statsTracker
 * @param {*} [options.signal] - forwarded to streamSse (presumably an AbortSignal; confirm
 *   against sseText.js)
 * @returns {Promise<{text: string, stats: *}>} the concatenated pieces and the tracker's
 *   final stats
 */
export async function streamCoding(system, user, { maxTokens = 512, temperature = 0.6, think = false, onToken, onStats, signal } = {}) {
  const tracker = statsTracker(onStats)
  let text = ''

  const payload = {
    model: _sel,
    system,
    user,
    max_tokens: maxTokens,
    temperature,
    think,
  }

  const handleEvent = (evt, parsed) => {
    if (evt !== 'delta') return
    const piece = parsed?.content || ''
    if (!piece) return
    text += piece
    onToken?.(piece)
    tracker.tick()
  }

  await streamSse('/text/generate/stream', payload, { signal, onEvent: handleEvent })

  return { text, stats: tracker.finish() }
}
|