Spaces:
Running
Running
File size: 2,985 Bytes
// Engine: wllama — llama.cpp compiled to WebAssembly, with a WebGPU backend (V3).
// Loads GGUF from HF. Local-first + actual llama.cpp. Runs WASM if no WebGPU.
import { Wllama, ModelManager } from 'https://cdn.jsdelivr.net/npm/@wllama/wllama@3.4.1/esm/index.js'
import { MODELS, DEFAULT_MODEL, getModel } from '/web/modelCatalog.js'
import { statsTracker } from '/web/genStats.js'
// Location of the wllama WebAssembly binary; this object is what ensure() hands to `new Wllama(...)`.
const WASM = {
  default: 'https://cdn.jsdelivr.net/npm/@wllama/wllama@3.4.1/esm/wasm/wllama.wasm',
}

// Model manager queried by the engine's cache helpers (cachedSet / deleteCached).
const mm = new ModelManager()

// Wllama instance that currently has a model loaded, or null.
let _w = null
// Catalog id of the model held by `_w`, or null when nothing is loaded.
let _loadedId = null
// Promise of the most recent load started by ensure(); cleared on failure or unload.
let _loadPromise = null
// Tail of the generation queue: stream() chains each run onto this so runs never overlap.
let _chain = Promise.resolve()
// Catalog id of the model that `_loadPromise` is loading. Only meaningful while a
// load is in flight (`_loadPromise` set and `_loadedId` still null).
let _loadingId = null

/**
 * Makes sure the requested catalog model is loaded and returns its Wllama instance.
 *
 * - If that model is already loaded, the existing instance is returned.
 * - If a load for the SAME model is in flight, the caller joins it (its own
 *   `onProgress` is not attached; progress goes to the caller that started the load).
 * - If a load for a DIFFERENT model is in flight, the call waits for it to settle
 *   and then swaps models, instead of resolving with the wrong model.
 * - If a different model is loaded, it is shut down (errors from `exit()` are
 *   ignored) before the requested one is loaded.
 *
 * @param {string} id - Model id, resolved through `getModel`.
 * @param {(fraction: number) => void} [onProgress] - Called with `loaded / total`
 *   (or 0 when `total` is falsy) while the model loads.
 * @returns {Promise<object>} The Wllama instance holding the requested model.
 * @throws Rejects with the load error; the pending-load state is cleared so a later call can retry.
 */
async function ensure(id, onProgress) {
  const m = getModel(id)

  for (;;) {
    if (_w && _loadedId === m.id) return _w
    const loadInFlight = _loadPromise && _loadedId === null
    if (!loadInFlight) break
    if (_loadingId === m.id) return _loadPromise
    // Another model is mid-load: let it settle, then re-evaluate the state.
    // Its failure belongs to the caller that requested it, not to this one.
    try { await _loadPromise } catch { /* ignore */ }
  }

  // From here to the `_loadPromise` assignment nothing awaits, so concurrent
  // callers always observe either the old state or the new in-flight load.
  const previous = _w
  _w = null
  _loadedId = null
  _loadingId = m.id

  const pending = (async () => {
    if (previous) {
      try { await previous.exit() } catch { /* ignore */ }
    }
    const w = new Wllama(WASM)
    await w.loadModelFromHF({ repo: m.repo, file: m.file }, {
      n_ctx: 2048,
      progressCallback: ({ loaded, total }) => onProgress && onProgress(total ? loaded / total : 0),
    })
    _w = w
    _loadedId = m.id
    return w
  })().catch((e) => {
    // Only clear the slot if it still refers to this load.
    if (_loadPromise === pending) _loadPromise = null
    throw e
  })

  _loadPromise = pending
  return _loadPromise
}
/**
 * Queues a streamed chat completion for the given model.
 *
 * Generations are serialized through `_chain`: a run starts only once the
 * previously queued run has settled, whether it succeeded or failed.
 *
 * @param {string} id - Model id passed to `ensure`.
 * @param {string} system - Content of the system message.
 * @param {string} user - Content of the user message.
 * @param {object} [options]
 * @param {number} [options.maxTokens=200] - Sent as `max_tokens`.
 * @param {number} [options.temperature=0.8] - Sampling temperature.
 * @param {(piece: string) => void} [options.onToken] - Called for every non-empty text delta.
 * @param {Function} [options.onStats] - Handed to `statsTracker`.
 * @returns {Promise<{text: string, stats: *}>} The concatenated text and whatever
 *   the stats tracker's `finish()` returns.
 */
function stream(id, system, user, { maxTokens = 200, temperature = 0.8, onToken, onStats } = {}) {
  const generate = async () => {
    const llm = await ensure(id)
    const tracker = statsTracker(onStats)
    const request = {
      messages: [
        { role: 'system', content: system },
        { role: 'user', content: user },
      ],
      max_tokens: maxTokens,
      temperature,
      top_k: 40,
      top_p: 0.9,
      stream: true,
    }
    const chunks = await llm.createChatCompletion(request)

    let text = ''
    for await (const chunk of chunks) {
      const delta = chunk?.choices?.[0]?.delta?.content || ''
      if (delta) {
        text += delta
        if (onToken) onToken(delta)
        tracker.tick()
      }
    }
    return { text, stats: tracker.finish() }
  }

  // Start after the previous run settles either way; the queue tail swallows
  // rejections so one failed generation does not block the following ones.
  const result = _chain.then(generate, generate)
  _chain = result.catch(() => {})
  return result
}
/**
 * Tells whether a cached model corresponds to a catalog entry.
 *
 * The names of the cached model's files (missing names count as '') are joined
 * with '|' and the result is searched for `entry.file` as a substring.
 *
 * @param {{files?: Array<{name?: string}>}} model - Cached model record.
 * @param {{file: string}} entry - Catalog entry.
 * @returns {boolean} True when the joined file names contain `entry.file`.
 */
const _match = (model, entry) => {
  const files = model.files || []
  const joinedNames = files.map(({ name }) => name || '').join('|')
  return joinedNames.includes(entry.file)
}
/**
 * Engine descriptor for the wllama backend: static metadata, the load/generate
 * entry points, and helpers for inspecting and clearing cached model files.
 */
export const engine = {
  id: 'wllama',
  label: 'wllama · llama.cpp (WASM + WebGPU)',
  requiresWebGPU: false,
  available: () => true,
  models: MODELS,
  defaultModel: DEFAULT_MODEL,
  ensure, stream,

  /**
   * Short label for the backend: WebGPU when `navigator.gpu` is truthy, CPU otherwise.
   * The try/catch covers environments where reading it throws (e.g. `navigator` is not defined).
   * @returns {string}
   */
  backendLabel: () => { try { return navigator.gpu ? '⚡ WebGPU' : 'CPU (WASM)' } catch { return 'CPU (WASM)' } },

  /**
   * Collects the ids of catalog models that match a model reported by the model manager.
   * @returns {Promise<Set<string>>} Matching catalog ids; an empty set if the lookup throws.
   */
  async cachedSet() {
    try {
      const models = await mm.getModels()
      const ids = new Set()
      for (const m of models) {
        for (const c of MODELS) {
          if (_match(m, c)) ids.add(c.id)
        }
      }
      return ids
    } catch { return new Set() }
  },

  /**
   * Removes the cached files of a catalog model, unloading it first if it is the loaded one.
   * Best-effort: errors from shutting down the instance or removing files are ignored.
   * @param {string} id - Model id, resolved through `getModel`.
   * @returns {Promise<void>}
   */
  async deleteCached(id) {
    const c = getModel(id)
    // Compare against the resolved catalog id — the same value ensure() stores in
    // `_loadedId` — so the loaded instance is still shut down when `getModel`
    // maps the argument to a different id.
    if (_loadedId === c.id && _w) {
      try { await _w.exit() } catch { /* ignore */ }
      _w = null
      _loadedId = null
      _loadPromise = null
    }
    try {
      for (const m of await mm.getModels()) {
        if (_match(m, c) && m.remove) await m.remove()
      }
    } catch { /* ignore */ }
  },
}
|