polats committed on
Commit
f9dd2fe
·
1 Parent(s): d011d06

Add Tiny Aya text generation option

Browse files
app.py CHANGED
@@ -263,6 +263,7 @@ _DASHSCOPE_URL = _DASHSCOPE_BASE + "/api/v1/services/audio/tts/customization"
263
  # soundfile`. Lazy-loaded; the Space (cpu-basic) leaves this unset and uses DashScope.
264
  TTS_MODE = os.environ.get("TINY_TTS_MODE", "").strip().lower()
265
  VOXCPM_SPACE = os.environ.get("TINY_VOXCPM_SPACE", "").strip()
 
266
  _local_tts = None # VoiceDesign model
267
  _local_clone = None # Base model (voice clone) — lazy, only if a clone is requested
268
  _local_tts_lock = threading.Lock()
@@ -453,6 +454,19 @@ def _voxcpm_clone(text, ref_audio_b64, ref_text, instruct):
453
  return f.read()
454
 
455
 
 
 
 
 
 
 
 
 
 
 
 
 
 
456
  @fastapi_app.post("/voxcpm-tts")
457
  async def voxcpm_tts(request: Request):
458
  body = await request.json()
@@ -727,6 +741,65 @@ def persona_selftest():
727
  "tok_per_sec": round(n / s, 2) if s else None, **llm.status()}
728
 
729
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
730
  # Persona generation, woid-protocol-compatible so web/personaStream.js consumes it
731
  # unchanged: emits `model` → `delta`* → `persona-done` → `done` (or `error`). The
732
  # blocking llama.cpp generator runs in a worker thread bridged to this async SSE
 
263
  # soundfile`. Lazy-loaded; the Space (cpu-basic) leaves this unset and uses DashScope.
264
  TTS_MODE = os.environ.get("TINY_TTS_MODE", "").strip().lower()
265
  VOXCPM_SPACE = os.environ.get("TINY_VOXCPM_SPACE", "").strip()
266
+ TINY_AYA_SPACE = os.environ.get("TINY_AYA_SPACE", "").strip()
267
  _local_tts = None # VoiceDesign model
268
  _local_clone = None # Base model (voice clone) — lazy, only if a clone is requested
269
  _local_tts_lock = threading.Lock()
 
454
  return f.read()
455
 
456
 
457
def _tiny_aya_generate(system, user, max_tokens, temperature):
    """Run one blocking generation on the Tiny Aya Space and return its text.

    Connects to the Space named by ``TINY_AYA_SPACE`` (authenticating with
    ``HF_TOKEN`` when that is set) and calls its ``/generate`` endpoint.

    Args:
        system: System prompt; a falsy value is sent as "".
        user: User prompt; a falsy value is sent as "".
        max_tokens: Token budget; a falsy value (None, 0) becomes 400.
        temperature: Sampling temperature; only ``None`` becomes 0.8, so an
            explicit 0 is passed through.

    Returns:
        The endpoint's result converted with ``str``; "" when the result is falsy.
    """
    # Imported at call time, so gradio_client is only needed when this path runs.
    from gradio_client import Client

    space = Client(TINY_AYA_SPACE, token=HF_TOKEN or None)
    prompt_args = (
        system or "",
        user or "",
        int(max_tokens or 400),
        float(0.8 if temperature is None else temperature),
    )
    reply = space.predict(*prompt_args, api_name="/generate")
    if not reply:
        return ""
    return str(reply)
468
+
469
+
470
  @fastapi_app.post("/voxcpm-tts")
471
  async def voxcpm_tts(request: Request):
472
  body = await request.json()
 
741
  "tok_per_sec": round(n / s, 2) if s else None, **llm.status()}
742
 
743
 
744
@fastapi_app.post("/text/generate/stream")
async def text_generate_stream(request: Request):
    """Stream generated text for a JSON request as Server-Sent Events.

    Request JSON fields: ``model`` (default "server-local"), ``system``, ``user``,
    ``max_tokens`` or ``maxTokens`` (default 400) and ``temperature`` (default 0.8;
    an explicit 0 is kept).

    Emits a ``model`` event, then ``delta`` events carrying ``{"content": ...}``,
    then ``done``. If the backend raises, an ``error`` event is emitted and the
    stream ends without ``done``.
    """
    payload = await request.json()
    model = (payload.get("model") or "server-local").strip()
    system = payload.get("system") or ""
    user = payload.get("user") or ""
    max_tokens = int(payload.get("max_tokens") or payload.get("maxTokens") or 400)
    requested_temp = payload.get("temperature")
    temperature = float(0.8 if requested_temp is None else requested_temp)
    # Set once the response generator finishes or is closed; the local backend
    # polls it through should_stop.
    stop = threading.Event()

    async def gen():
        yield _sse("model", {"model": model})
        loop = asyncio.get_running_loop()
        events: asyncio.Queue = asyncio.Queue()
        finished = object()  # sentinel: the worker thread has nothing more to send

        def post(item):
            # asyncio.Queue is not thread-safe, so hand items over on the loop thread.
            loop.call_soon_threadsafe(events.put_nowait, item)

        def produce():
            try:
                if model != "tiny-aya-global-zerogpu":
                    pieces = llm.stream_chat(
                        system,
                        user,
                        max_tokens=max_tokens,
                        temperature=temperature,
                        should_stop=stop.is_set,
                    )
                    for piece in pieces:
                        post(("delta", piece))
                else:
                    if not TINY_AYA_SPACE:
                        raise llm.LlmUnavailable("TINY_AYA_SPACE not set")
                    # The Space call returns the whole completion at once, so it
                    # reaches the client as a single delta.
                    whole = _tiny_aya_generate(system, user, max_tokens, temperature)
                    if whole:
                        post(("delta", whole))
            except Exception as exc:  # noqa: BLE001
                post(("error", str(exc)))
            post((finished, None))

        threading.Thread(target=produce, daemon=True).start()

        # NOTE(review): block structure was reconstructed from a flattened diff;
        # the final `done` is assumed to sit after the `finally`, not inside it.
        try:
            while True:
                tag, value = await events.get()
                if tag is finished:
                    break
                if tag != "error":
                    yield _sse("delta", {"content": value})
                    continue
                yield _sse("error", {"error": value})
                return
        finally:
            stop.set()
        yield _sse("done", {"model": model})

    sse_headers = {
        "Cache-Control": "no-cache, no-transform",
        "Connection": "keep-alive",
        "X-Accel-Buffering": "no",
    }
    return StreamingResponse(gen(), media_type="text/event-stream", headers=sse_headers)
801
+
802
+
803
  # Persona generation, woid-protocol-compatible so web/personaStream.js consumes it
804
  # unchanged: emits `model` → `delta`* → `persona-done` → `done` (or `error`). The
805
  # blocking llama.cpp generator runs in a worker thread bridged to this async SSE
web/engineServer.js ADDED
@@ -0,0 +1,81 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ // Engine: server-side text generation. Keeps API keys/model hosts off the client and
2
+ // lets the same picker choose either a configured local llama.cpp server or a ZeroGPU
3
+ // hosted model such as Tiny Aya Global.
4
+ import { statsTracker } from '/web/genStats.js'
5
+
6
+ const MODELS = [
7
+ { id: 'server-local', label: 'Configured server model', params: 'local/remote', note: 'uses TINY_LLM_* on the Space or local app' },
8
+ { id: 'tiny-aya-global-zerogpu', label: 'Tiny Aya Global 3.35B', params: '3.35B', note: 'ZeroGPU sidecar; multilingual' },
9
+ ]
10
+ const get = (id) => MODELS.find((m) => m.id === id) || MODELS[0]
11
+
12
/**
 * POST `body` as JSON to /text/generate/stream and dispatch each Server-Sent Event.
 *
 * @param {object} body - Request payload, serialised with JSON.stringify.
 * @param {object} [opts]
 * @param {(evt: string, parsed: any, data: string) => void} [opts.onEvent] - Called once per
 *   event that carries data. `parsed` is the JSON-decoded data, or null when it is not JSON.
 * @param {AbortSignal} [opts.signal] - Passed to fetch to abort the request.
 * @returns {Promise<void>} Resolves when the response body has been fully read.
 * @throws {Error} `HTTP <status>` when the response is not ok or has no body, and an Error
 *   built from the payload when the server sends an `error` event.
 */
async function streamSse(body, { onEvent, signal } = {}) {
  const res = await fetch('/text/generate/stream', {
    method: 'POST',
    headers: { 'Content-Type': 'application/json' },
    body: JSON.stringify(body),
    signal,
  })
  if (!res.ok || !res.body) throw new Error(`HTTP ${res.status}`)
  const reader = res.body.getReader()
  const decoder = new TextDecoder()
  let buf = ''
  try {
    while (true) {
      const { value, done } = await reader.read()
      if (done) break
      buf += decoder.decode(value, { stream: true })
      // A blank line terminates an event; accept CRLF as well as LF line endings.
      const events = buf.split(/\r?\n\r?\n/)
      // The last piece is an event still being received — keep it for the next read.
      buf = events.pop() ?? ''
      for (const evChunk of events) {
        let evt = 'message'
        const dataLines = []
        for (const line of evChunk.split(/\r?\n/)) {
          if (line.startsWith('event:')) evt = line.slice(6).trim()
          else if (line.startsWith('data:')) dataLines.push(line.slice(5).trimStart())
        }
        const data = dataLines.join('\n')
        if (!data) continue
        let parsed = null
        try { parsed = JSON.parse(data) } catch { /* not JSON: callers still get the raw text */ }
        if (evt === 'error') throw new Error(parsed?.error || data)
        onEvent?.(evt, parsed, data)
      }
    }
  } finally {
    // Leaving early (error event, throwing callback, abort) would otherwise keep the
    // response downloading. Cancelling a fully-read stream is a no-op; a rejection
    // from an already-errored stream is irrelevant here.
    reader.cancel().catch(() => {})
  }
}
46
+
47
/**
 * Generate text with model `id` through the server endpoint, forwarding each piece
 * to `onToken` as it arrives.
 *
 * @param {string} id - Model id; an unknown id falls back to the first entry of MODELS.
 * @param {string} system - System prompt.
 * @param {string} user - User prompt.
 * @param {object} [opts]
 * @param {number} [opts.maxTokens=200] - Sent to the server as `max_tokens`.
 * @param {number} [opts.temperature=0.8]
 * @param {(piece: string) => void} [opts.onToken] - Called for every non-empty delta.
 * @param {Function} [opts.onStats] - Handed to statsTracker.
 * @param {AbortSignal} [opts.signal] - Aborts the underlying request.
 * @returns {Promise<{text: string, stats: any}>} The concatenated text and the tracker's result.
 */
async function stream(id, system, user, { maxTokens = 200, temperature = 0.8, onToken, onStats, signal } = {}) {
  const model = get(id)
  const tracker = statsTracker(onStats)
  const pieces = []
  const request = { model: model.id, system, user, max_tokens: maxTokens, temperature }
  const onEvent = (evt, parsed) => {
    // Only `delta` events carry text; every other event type is ignored here.
    const piece = evt === 'delta' ? parsed?.content || '' : ''
    if (!piece) return
    pieces.push(piece)
    onToken?.(piece)
    tracker.tick()
  }
  await streamSse(request, { signal, onEvent })
  return { text: pieces.join(''), stats: tracker.finish() }
}
70
+
71
+ export const engine = {
72
+ id: 'server',
73
+ label: 'Server / ZeroGPU',
74
+ available: () => true,
75
+ needsDownload: false,
76
+ models: MODELS,
77
+ defaultModel: 'tiny-aya-global-zerogpu',
78
+ ensure: async () => {},
79
+ stream,
80
+ backendLabel: () => 'server',
81
+ }
web/modelBar.js CHANGED
@@ -31,7 +31,7 @@ export function mountModelBar(host, { onChange } = {}) {
31
  const del = el('button', { class: 'model-del', type: 'button', title: 'Delete this model from your browser cache' }, '🗑 delete')
32
  const info = el('div', { class: 'model-info' })
33
  host.append(el('div', { class: 'model-bar' }, [
34
- el('label', { class: 'persona-label' }, 'Engine (benchmark — runs in your browser)'),
35
  engSel,
36
  el('label', { class: 'persona-label' }, 'Model'),
37
  sel, el('div', { class: 'model-row' }, [info, del]),
@@ -53,13 +53,14 @@ export function mountModelBar(host, { onChange } = {}) {
53
  sel.value = cur
54
  const m = currentModel()
55
  const size = sizeOf(m)
56
- info.textContent = `${m.params || ''}${size ? ` · ${size}` : ''} · ${backendLabel()} · ${cached.has(m.id) ? 'cached' : 'downloads on first use'}${storeNote}`
 
57
  del.style.display = (cacheSupported() && cached.has(m.id)) ? '' : 'none'
58
  }
59
  async function refresh() {
60
  cached = cacheSupported() ? await cachedSet() : new Set()
61
  const { usage, quota } = await storageEstimate()
62
- storeNote = quota ? ` · cache ${fmtBytes(usage)}/${fmtBytes(quota)}` : ''
63
  render()
64
  }
65
 
 
31
  const del = el('button', { class: 'model-del', type: 'button', title: 'Delete this model from your browser cache' }, '🗑 delete')
32
  const info = el('div', { class: 'model-info' })
33
  host.append(el('div', { class: 'model-bar' }, [
34
+ el('label', { class: 'persona-label' }, 'Runtime'),
35
  engSel,
36
  el('label', { class: 'persona-label' }, 'Model'),
37
  sel, el('div', { class: 'model-row' }, [info, del]),
 
53
  sel.value = cur
54
  const m = currentModel()
55
  const size = sizeOf(m)
56
+ const cacheText = cacheSupported() ? (cached.has(m.id) ? 'cached' : 'downloads on first use') : (m.note || 'no browser download')
57
+ info.textContent = `${m.params || ''}${size ? ` · ${size}` : ''} · ${backendLabel()} · ${cacheText}${storeNote}`
58
  del.style.display = (cacheSupported() && cached.has(m.id)) ? '' : 'none'
59
  }
60
  async function refresh() {
61
  cached = cacheSupported() ? await cachedSet() : new Set()
62
  const { usage, quota } = await storageEstimate()
63
+ storeNote = cacheSupported() && quota ? ` · cache ${fmtBytes(usage)}/${fmtBytes(quota)}` : ''
64
  render()
65
  }
66
 
web/runtime.js CHANGED
@@ -1,13 +1,14 @@
1
- // Runtime facade — picks the active engine (wllama / Transformers.js / WebLLM) and
2
  // model, and delegates load/stream/cache. Lets you A/B the same model across engines
3
  // and compare tok/s. Panels + the model bar import only from here. (Named runtime.js,
4
  // not engine.js — that one is the game-engine bundle.)
5
  import { engine as wllama } from '/web/engineWllama.js'
6
  import { engine as transformers } from '/web/engineTransformers.js'
7
  import { engine as webllm } from '/web/engineWebllm.js'
 
8
  import { ensurePersistentStorage } from '/web/storage.js'
9
 
10
- const ENGINES = [wllama, transformers, webllm]
11
  // Persisted choices (survive refresh). Defaults: WebLLM where there's WebGPU (fastest),
12
  // else wllama so the app still works without it.
13
  const ENGINE_KEY = 'tinyarmy.llmEngine', MODELS_KEY = 'tinyarmy.llmModels'
@@ -53,6 +54,7 @@ export function setModel(id) {
53
  }
54
 
55
  export const ensureModel = async (onProgress) => {
 
56
  await ensurePersistentStorage() // keep downloads from being evicted across engine switches
57
  return eng().ensure(currentModelId(), onProgress)
58
  }
 
1
+ // Runtime facade — picks the active engine (wllama / Transformers.js / WebLLM / server) and
2
  // model, and delegates load/stream/cache. Lets you A/B the same model across engines
3
  // and compare tok/s. Panels + the model bar import only from here. (Named runtime.js,
4
  // not engine.js — that one is the game-engine bundle.)
5
  import { engine as wllama } from '/web/engineWllama.js'
6
  import { engine as transformers } from '/web/engineTransformers.js'
7
  import { engine as webllm } from '/web/engineWebllm.js'
8
+ import { engine as server } from '/web/engineServer.js'
9
  import { ensurePersistentStorage } from '/web/storage.js'
10
 
11
+ const ENGINES = [wllama, transformers, webllm, server]
12
  // Persisted choices (survive refresh). Defaults: WebLLM where there's WebGPU (fastest),
13
  // else wllama so the app still works without it.
14
  const ENGINE_KEY = 'tinyarmy.llmEngine', MODELS_KEY = 'tinyarmy.llmModels'
 
54
  }
55
 
56
  export const ensureModel = async (onProgress) => {
57
+ if (eng().needsDownload === false) return eng().ensure(currentModelId(), onProgress)
58
  await ensurePersistentStorage() // keep downloads from being evicted across engine switches
59
  return eng().ensure(currentModelId(), onProgress)
60
  }
web/settingsPanel.js CHANGED
@@ -1,7 +1,7 @@
1
  // Inject our settings sections into Gradio's OWN settings page (footer "Settings" or the
2
  // sidebar ⚙ button → ?view=settings). Not an official extension point, so we anchor on
3
  // the "Display Theme" section, clone its styling, and prepend matching sections:
4
- // • Local AI Model — the in-browser LLM engine/model picker (modelBar)
5
  // • Voice — the read-aloud TTS engine/voice picker (ttsBar)
6
  // Both drive the shared runtime.js / tts.js singletons, so every page uses the same
7
  // choice. Fragile by nature (rides Gradio's DOM): if the structure changes the sections
@@ -45,9 +45,9 @@ export function mountSettingsPanel() {
45
  injectSection(sample, 'tac-quality-settings', 'Recommended settings',
46
  'Pick a quality preset — it sets the AI model and voice together, like graphics ' +
47
  'presets in a game. Changing either by hand switches to Custom.', mountQualityBar)
48
- injectSection(sample, 'tac-model-settings', 'Local AI Model',
49
- 'The in-browser model that writes your soldiers and their war diaries. Runs on ' +
50
- 'your device; models cache in your browser.', mountModelBar)
51
  injectSection(sample, 'tac-voice-settings', 'Voice',
52
  'The provider that voices your heroes. Qwen3-TTS designs a voice from each hero’s ' +
53
  'description; Kokoro/Kitten run on your device with a named voice you pick per hero. ' +
 
1
  // Inject our settings sections into Gradio's OWN settings page (footer "Settings" or the
2
  // sidebar ⚙ button → ?view=settings). Not an official extension point, so we anchor on
3
  // the "Display Theme" section, clone its styling, and prepend matching sections:
4
+ // • Text Generation Model — the LLM engine/model picker (modelBar)
5
  // • Voice — the read-aloud TTS engine/voice picker (ttsBar)
6
  // Both drive the shared runtime.js / tts.js singletons, so every page uses the same
7
  // choice. Fragile by nature (rides Gradio's DOM): if the structure changes the sections
 
45
  injectSection(sample, 'tac-quality-settings', 'Recommended settings',
46
  'Pick a quality preset — it sets the AI model and voice together, like graphics ' +
47
  'presets in a game. Changing either by hand switches to Custom.', mountQualityBar)
48
+ injectSection(sample, 'tac-model-settings', 'Text Generation Model',
49
+ 'The model that writes your soldiers and their war diaries. Use browser-local ' +
50
+ 'models, a configured local server, or a ZeroGPU-hosted model.', mountModelBar)
51
  injectSection(sample, 'tac-voice-settings', 'Voice',
52
  'The provider that voices your heroes. Qwen3-TTS designs a voice from each hero’s ' +
53
  'description; Kokoro/Kitten run on your device with a named voice you pick per hero. ' +
web/shell/persona.css CHANGED
@@ -256,7 +256,7 @@
256
  .persona-go-alt:hover { background: var(--p-paper-2) !important; color: var(--p-ink) !important; }
257
  .tts-status { min-height: 14px; }
258
 
259
- /* ── "Local AI Model" section injected into Gradio's own Settings page ──────── */
260
  /* The model bar's styles use --p-* vars (normally scoped to .persona-view); define
261
  them here too so the picker renders correctly inside Gradio's settings modal. */
262
  .tac-set-section {
 
256
  .persona-go-alt:hover { background: var(--p-paper-2) !important; color: var(--p-ink) !important; }
257
  .tts-status { min-height: 14px; }
258
 
259
+ /* ── "Text Generation Model" section injected into Gradio's own Settings page ─ */
260
  /* The model bar's styles use --p-* vars (normally scoped to .persona-view); define
261
  them here too so the picker renders correctly inside Gradio's settings modal. */
262
  .tac-set-section {