Spaces:

build-small-hackathon
/

tiny-army

Running

App Files Files Community

polats committed on Jun 6

Commit

f9dd2fe

1 Parent(s): d011d06

Add Tiny Aya text generation option

Browse files

Files changed (6) hide show

app.py +73 -0
web/engineServer.js +81 -0
web/modelBar.js +4 -3
web/runtime.js +4 -2
web/settingsPanel.js +4 -4
web/shell/persona.css +1 -1

app.py CHANGED Viewed

@@ -263,6 +263,7 @@ _DASHSCOPE_URL = _DASHSCOPE_BASE + "/api/v1/services/audio/tts/customization"
 # soundfile`. Lazy-loaded; the Space (cpu-basic) leaves this unset and uses DashScope.
 TTS_MODE = os.environ.get("TINY_TTS_MODE", "").strip().lower()
 VOXCPM_SPACE = os.environ.get("TINY_VOXCPM_SPACE", "").strip()
 _local_tts = None       # VoiceDesign model
 _local_clone = None     # Base model (voice clone) — lazy, only if a clone is requested
 _local_tts_lock = threading.Lock()
@@ -453,6 +454,19 @@ def _voxcpm_clone(text, ref_audio_b64, ref_text, instruct):
         return f.read()
 @fastapi_app.post("/voxcpm-tts")
 async def voxcpm_tts(request: Request):
     body = await request.json()
@@ -727,6 +741,65 @@ def persona_selftest():
             "tok_per_sec": round(n / s, 2) if s else None, **llm.status()}
 # Persona generation, woid-protocol-compatible so web/personaStream.js consumes it
 # unchanged: emits `model` → `delta`* → `persona-done` → `done` (or `error`). The
 # blocking llama.cpp generator runs in a worker thread bridged to this async SSE

 # soundfile`. Lazy-loaded; the Space (cpu-basic) leaves this unset and uses DashScope.
 TTS_MODE = os.environ.get("TINY_TTS_MODE", "").strip().lower()
 VOXCPM_SPACE = os.environ.get("TINY_VOXCPM_SPACE", "").strip()
+TINY_AYA_SPACE = os.environ.get("TINY_AYA_SPACE", "").strip()  # Space passed to gradio_client.Client for Tiny Aya; empty = not configured
 _local_tts = None       # VoiceDesign model
 _local_clone = None     # Base model (voice clone) — lazy, only if a clone is requested
 _local_tts_lock = threading.Lock()
         return f.read()
+def _tiny_aya_generate(system, user, max_tokens, temperature):
+    from gradio_client import Client
+    client = Client(TINY_AYA_SPACE, token=HF_TOKEN or None)
+    result = client.predict(
+        system or "",
+        user or "",
+        int(max_tokens or 400),
+        float(temperature if temperature is not None else 0.8),
+        api_name="/generate",
+    )
+    return str(result or "")
 @fastapi_app.post("/voxcpm-tts")
 async def voxcpm_tts(request: Request):
     body = await request.json()
             "tok_per_sec": round(n / s, 2) if s else None, **llm.status()}
+@fastapi_app.post("/text/generate/stream")
+async def text_generate_stream(request: Request):
+    body = await request.json()
+    model = (body.get("model") or "server-local").strip()
+    system = body.get("system") or ""
+    user = body.get("user") or ""
+    max_tokens = int(body.get("max_tokens") or body.get("maxTokens") or 400)
+    temperature = float(body.get("temperature") if body.get("temperature") is not None else 0.8)
+    stop = threading.Event()
+    async def gen():
+        yield _sse("model", {"model": model})
+        loop = asyncio.get_running_loop()
+        q: asyncio.Queue = asyncio.Queue()
+        DONE = object()
+        def worker():
+            try:
+                if model == "tiny-aya-global-zerogpu":
+                    if not TINY_AYA_SPACE:
+                        raise llm.LlmUnavailable("TINY_AYA_SPACE not set")
+                    text = _tiny_aya_generate(system, user, max_tokens, temperature)
+                    if text:
+                        loop.call_soon_threadsafe(q.put_nowait, ("delta", text))
+                else:
+                    for chunk in llm.stream_chat(
+                        system,
+                        user,
+                        max_tokens=max_tokens,
+                        temperature=temperature,
+                        should_stop=stop.is_set,
+                    ):
+                        loop.call_soon_threadsafe(q.put_nowait, ("delta", chunk))
+            except Exception as e:  # noqa: BLE001
+                loop.call_soon_threadsafe(q.put_nowait, ("error", str(e)))
+            loop.call_soon_threadsafe(q.put_nowait, (DONE, None))
+        threading.Thread(target=worker, daemon=True).start()
+        try:
+            while True:
+                kind, val = await q.get()
+                if kind is DONE:
+                    break
+                if kind == "error":
+                    yield _sse("error", {"error": val})
+                    return
+                yield _sse("delta", {"content": val})
+        finally:
+            stop.set()
+        yield _sse("done", {"model": model})
+    return StreamingResponse(gen(), media_type="text/event-stream", headers={
+        "Cache-Control": "no-cache, no-transform",
+        "Connection": "keep-alive",
+        "X-Accel-Buffering": "no",
+    })
 # Persona generation, woid-protocol-compatible so web/personaStream.js consumes it
 # unchanged: emits `model` → `delta`* → `persona-done` → `done` (or `error`). The
 # blocking llama.cpp generator runs in a worker thread bridged to this async SSE

web/engineServer.js ADDED Viewed

	@@ -0,0 +1,81 @@

+// Engine: server-side text generation. Keeps API keys/model hosts off the client and
+// lets the same picker choose either a configured local llama.cpp server or a ZeroGPU
+// hosted model such as Tiny Aya Global.
+import { statsTracker } from '/web/genStats.js'
+const MODELS = [ // Models offered by this engine; stream() sends the chosen `id` to the server as `model`.
+  { id: 'server-local', label: 'Configured server model', params: 'local/remote', note: 'uses TINY_LLM_* on the Space or local app' },
+  { id: 'tiny-aya-global-zerogpu', label: 'Tiny Aya Global 3.35B', params: '3.35B', note: 'ZeroGPU sidecar; multilingual' },
+]
+const get = (id) => MODELS.find((m) => m.id === id) || MODELS[0] // Unknown ids fall back to the first entry.
+async function streamSse(body, { onEvent, signal } = {}) {
+  const res = await fetch('/text/generate/stream', {
+    method: 'POST',
+    headers: { 'Content-Type': 'application/json' },
+    body: JSON.stringify(body),
+    signal,
+  })
+  if (!res.ok || !res.body) throw new Error(`HTTP ${res.status}`)
+  const reader = res.body.getReader()
+  const decoder = new TextDecoder()
+  let buf = ''
+  while (true) {
+    const { value, done } = await reader.read()
+    if (done) break
+    buf += decoder.decode(value, { stream: true })
+    const events = buf.split(/\n\n/)
+    buf = events.pop() ?? ''
+    for (const evChunk of events) {
+      const lines = evChunk.split('\n')
+      let evt = 'message'
+      const dataLines = []
+      for (const line of lines) {
+        if (line.startsWith('event:')) evt = line.slice(6).trim()
+        else if (line.startsWith('data:')) dataLines.push(line.slice(5).trimStart())
+      }
+      const data = dataLines.join('\n')
+      if (!data) continue
+      let parsed = null
+      try { parsed = JSON.parse(data) } catch { /* ignore */ }
+      if (evt === 'error') throw new Error(parsed?.error || data)
+      onEvent?.(evt, parsed, data)
+    }
+  }
+}
+async function stream(id, system, user, { maxTokens = 200, temperature = 0.8, onToken, onStats, signal } = {}) {
+  const m = get(id)
+  const st = statsTracker(onStats)
+  let full = ''
+  await streamSse({
+    model: m.id,
+    system,
+    user,
+    max_tokens: maxTokens,
+    temperature,
+  }, {
+    signal,
+    onEvent(evt, parsed) {
+      if (evt !== 'delta') return
+      const piece = parsed?.content || ''
+      if (!piece) return
+      full += piece
+      onToken?.(piece)
+      st.tick()
+    },
+  })
+  return { text: full, stats: st.finish() }
+}
+export const engine = { // Engine descriptor for server-side generation via /text/generate/stream.
+  id: 'server',
+  label: 'Server / ZeroGPU',
+  available: () => true, // always reported as available; no capability check
+  needsDownload: false, // runtime.js ensureModel() skips ensurePersistentStorage() when this is false
+  models: MODELS,
+  defaultModel: 'tiny-aya-global-zerogpu',
+  ensure: async () => {}, // no-op
+  stream,
+  backendLabel: () => 'server',
+}

web/modelBar.js CHANGED Viewed

@@ -31,7 +31,7 @@ export function mountModelBar(host, { onChange } = {}) {
   const del = el('button', { class: 'model-del', type: 'button', title: 'Delete this model from your browser cache' }, '🗑 delete')
   const info = el('div', { class: 'model-info' })
   host.append(el('div', { class: 'model-bar' }, [
-    el('label', { class: 'persona-label' }, 'Engine (benchmark — runs in your browser)'),
     engSel,
     el('label', { class: 'persona-label' }, 'Model'),
     sel, el('div', { class: 'model-row' }, [info, del]),
@@ -53,13 +53,14 @@ export function mountModelBar(host, { onChange } = {}) {
     sel.value = cur
     const m = currentModel()
     const size = sizeOf(m)
-    info.textContent = `${m.params || ''}${size ? ` · ${size}` : ''} · ${backendLabel()} · ${cached.has(m.id) ? 'cached' : 'downloads on first use'}${storeNote}`
     del.style.display = (cacheSupported() && cached.has(m.id)) ? '' : 'none'
   }
   async function refresh() {
     cached = cacheSupported() ? await cachedSet() : new Set()
     const { usage, quota } = await storageEstimate()
-    storeNote = quota ? ` · cache ${fmtBytes(usage)}/${fmtBytes(quota)}` : ''
     render()
   }

   const del = el('button', { class: 'model-del', type: 'button', title: 'Delete this model from your browser cache' }, '🗑 delete')
   const info = el('div', { class: 'model-info' })
   host.append(el('div', { class: 'model-bar' }, [
+    el('label', { class: 'persona-label' }, 'Runtime'),
     engSel,
     el('label', { class: 'persona-label' }, 'Model'),
     sel, el('div', { class: 'model-row' }, [info, del]),
     sel.value = cur
     const m = currentModel()
     const size = sizeOf(m)
+    const cacheText = cacheSupported() ? (cached.has(m.id) ? 'cached' : 'downloads on first use') : (m.note || 'no browser download')
+    info.textContent = `${m.params || ''}${size ? ` · ${size}` : ''} · ${backendLabel()} · ${cacheText}${storeNote}`
     del.style.display = (cacheSupported() && cached.has(m.id)) ? '' : 'none'
   }
   async function refresh() {
     cached = cacheSupported() ? await cachedSet() : new Set()
     const { usage, quota } = await storageEstimate()
+    storeNote = cacheSupported() && quota ? ` · cache ${fmtBytes(usage)}/${fmtBytes(quota)}` : ''
     render()
   }

web/runtime.js CHANGED Viewed

@@ -1,13 +1,14 @@
-// Runtime facade — picks the active engine (wllama / Transformers.js / WebLLM) and
 // model, and delegates load/stream/cache. Lets you A/B the same model across engines
 // and compare tok/s. Panels + the model bar import only from here. (Named runtime.js,
 // not engine.js — that one is the game-engine bundle.)
 import { engine as wllama } from '/web/engineWllama.js'
 import { engine as transformers } from '/web/engineTransformers.js'
 import { engine as webllm } from '/web/engineWebllm.js'
 import { ensurePersistentStorage } from '/web/storage.js'
-const ENGINES = [wllama, transformers, webllm]
 // Persisted choices (survive refresh). Defaults: WebLLM where there's WebGPU (fastest),
 // else wllama so the app still works without it.
 const ENGINE_KEY = 'tinyarmy.llmEngine', MODELS_KEY = 'tinyarmy.llmModels'
@@ -53,6 +54,7 @@ export function setModel(id) {
 }
 // Requests persistent storage, then asks the active engine to ensure the
 // currently selected model, forwarding `onProgress`.
 export const ensureModel = async (onProgress) => {
   await ensurePersistentStorage() // keep downloads from being evicted across engine switches
   return eng().ensure(currentModelId(), onProgress)
 }

+// Runtime facade — picks the active engine (wllama / Transformers.js / WebLLM / server) and
 // model, and delegates load/stream/cache. Lets you A/B the same model across engines
 // and compare tok/s. Panels + the model bar import only from here. (Named runtime.js,
 // not engine.js — that one is the game-engine bundle.)
 import { engine as wllama } from '/web/engineWllama.js'
 import { engine as transformers } from '/web/engineTransformers.js'
 import { engine as webllm } from '/web/engineWebllm.js'
+import { engine as server } from '/web/engineServer.js'
 import { ensurePersistentStorage } from '/web/storage.js'
+const ENGINES = [wllama, transformers, webllm, server]
 // Persisted choices (survive refresh). Defaults: WebLLM where there's WebGPU (fastest),
 // else wllama so the app still works without it.
 const ENGINE_KEY = 'tinyarmy.llmEngine', MODELS_KEY = 'tinyarmy.llmModels'
 }
 // Asks the active engine to ensure the currently selected model, forwarding
 // `onProgress`. Persistent storage is requested first unless the engine
 // declares `needsDownload: false`.
 export const ensureModel = async (onProgress) => {
+  if (eng().needsDownload === false) return eng().ensure(currentModelId(), onProgress) // nothing is downloaded, so skip the storage request
   await ensurePersistentStorage() // keep downloads from being evicted across engine switches
   return eng().ensure(currentModelId(), onProgress)
 }

web/settingsPanel.js CHANGED Viewed

@@ -1,7 +1,7 @@
 // Inject our settings sections into Gradio's OWN settings page (footer "Settings" or the
 // sidebar ⚙ button → ?view=settings). Not an official extension point, so we anchor on
 // the "Display Theme" section, clone its styling, and prepend matching sections:
-//   • Local AI Model — the in-browser LLM engine/model picker (modelBar)
 //   • Voice          — the read-aloud TTS engine/voice picker (ttsBar)
 // Both drive the shared runtime.js / tts.js singletons, so every page uses the same
 // choice. Fragile by nature (rides Gradio's DOM): if the structure changes the sections
@@ -45,9 +45,9 @@ export function mountSettingsPanel() {
     injectSection(sample, 'tac-quality-settings', 'Recommended settings',
       'Pick a quality preset — it sets the AI model and voice together, like graphics ' +
       'presets in a game. Changing either by hand switches to Custom.', mountQualityBar)
-    injectSection(sample, 'tac-model-settings', 'Local AI Model',
-      'The in-browser model that writes your soldiers and their war diaries. Runs on ' +
-      'your device; models cache in your browser.', mountModelBar)
     injectSection(sample, 'tac-voice-settings', 'Voice',
       'The provider that voices your heroes. Qwen3-TTS designs a voice from each hero’s ' +
       'description; Kokoro/Kitten run on your device with a named voice you pick per hero. ' +

 // Inject our settings sections into Gradio's OWN settings page (footer "Settings" or the
 // sidebar ⚙ button → ?view=settings). Not an official extension point, so we anchor on
 // the "Display Theme" section, clone its styling, and prepend matching sections:
+//   • Text Generation Model — the LLM engine/model picker (modelBar)
 //   • Voice          — the read-aloud TTS engine/voice picker (ttsBar)
 // Both drive the shared runtime.js / tts.js singletons, so every page uses the same
 // choice. Fragile by nature (rides Gradio's DOM): if the structure changes the sections
     injectSection(sample, 'tac-quality-settings', 'Recommended settings',
       'Pick a quality preset — it sets the AI model and voice together, like graphics ' +
       'presets in a game. Changing either by hand switches to Custom.', mountQualityBar)
+    injectSection(sample, 'tac-model-settings', 'Text Generation Model',
+      'The model that writes your soldiers and their war diaries. Use browser-local ' +
+      'models, a configured local server, or a ZeroGPU-hosted model.', mountModelBar)
     injectSection(sample, 'tac-voice-settings', 'Voice',
       'The provider that voices your heroes. Qwen3-TTS designs a voice from each hero’s ' +
       'description; Kokoro/Kitten run on your device with a named voice you pick per hero. ' +

web/shell/persona.css CHANGED Viewed

@@ -256,7 +256,7 @@
 .persona-go-alt:hover { background: var(--p-paper-2) !important; color: var(--p-ink) !important; }
 .tts-status { min-height: 14px; }
-/* ── "Local AI Model" section injected into Gradio's own Settings page ──────── */
 /* The model bar's styles use --p-* vars (normally scoped to .persona-view); define
    them here too so the picker renders correctly inside Gradio's settings modal. */
 .tac-set-section {

 .persona-go-alt:hover { background: var(--p-paper-2) !important; color: var(--p-ink) !important; }
 .tts-status { min-height: 14px; }
+/* ── "Text Generation Model" section injected into Gradio's own Settings page ─ */
 /* The model bar's styles use --p-* vars (normally scoped to .persona-view); define
    them here too so the picker renders correctly inside Gradio's settings modal. */
 .tac-set-section {