Spaces:
Running
Running
Add Tiny Aya text generation option
Browse files- app.py +73 -0
- web/engineServer.js +81 -0
- web/modelBar.js +4 -3
- web/runtime.js +4 -2
- web/settingsPanel.js +4 -4
- web/shell/persona.css +1 -1
app.py
CHANGED
|
@@ -263,6 +263,7 @@ _DASHSCOPE_URL = _DASHSCOPE_BASE + "/api/v1/services/audio/tts/customization"
|
|
| 263 |
# soundfile`. Lazy-loaded; the Space (cpu-basic) leaves this unset and uses DashScope.
|
| 264 |
TTS_MODE = os.environ.get("TINY_TTS_MODE", "").strip().lower()
|
| 265 |
VOXCPM_SPACE = os.environ.get("TINY_VOXCPM_SPACE", "").strip()
|
|
|
|
| 266 |
_local_tts = None # VoiceDesign model
|
| 267 |
_local_clone = None # Base model (voice clone) — lazy, only if a clone is requested
|
| 268 |
_local_tts_lock = threading.Lock()
|
|
@@ -453,6 +454,19 @@ def _voxcpm_clone(text, ref_audio_b64, ref_text, instruct):
|
|
| 453 |
return f.read()
|
| 454 |
|
| 455 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 456 |
@fastapi_app.post("/voxcpm-tts")
|
| 457 |
async def voxcpm_tts(request: Request):
|
| 458 |
body = await request.json()
|
|
@@ -727,6 +741,65 @@ def persona_selftest():
|
|
| 727 |
"tok_per_sec": round(n / s, 2) if s else None, **llm.status()}
|
| 728 |
|
| 729 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 730 |
# Persona generation, woid-protocol-compatible so web/personaStream.js consumes it
|
| 731 |
# unchanged: emits `model` → `delta`* → `persona-done` → `done` (or `error`). The
|
| 732 |
# blocking llama.cpp generator runs in a worker thread bridged to this async SSE
|
|
|
|
| 263 |
# soundfile`. Lazy-loaded; the Space (cpu-basic) leaves this unset and uses DashScope.
|
| 264 |
TTS_MODE = os.environ.get("TINY_TTS_MODE", "").strip().lower()
|
| 265 |
VOXCPM_SPACE = os.environ.get("TINY_VOXCPM_SPACE", "").strip()
|
| 266 |
+
TINY_AYA_SPACE = os.environ.get("TINY_AYA_SPACE", "").strip()
|
| 267 |
_local_tts = None # VoiceDesign model
|
| 268 |
_local_clone = None # Base model (voice clone) — lazy, only if a clone is requested
|
| 269 |
_local_tts_lock = threading.Lock()
|
|
|
|
| 454 |
return f.read()
|
| 455 |
|
| 456 |
|
| 457 |
+
def _tiny_aya_generate(system, user, max_tokens, temperature):
    """Run one blocking generation on the Tiny Aya Space and return its text.

    Connects to the Gradio Space named by ``TINY_AYA_SPACE`` (passing ``HF_TOKEN``
    as the token when it is set) and calls its ``/generate`` endpoint with the
    system prompt, user prompt, token limit and temperature, in that order.

    Falsy ``system``/``user`` are sent as empty strings, a falsy ``max_tokens``
    becomes 400 and a ``None`` temperature becomes 0.8. A falsy result is
    returned as ``""``; anything else is converted with ``str``.
    """
    # Imported inside the function, so gradio_client is only loaded on this path.
    from gradio_client import Client

    space = Client(TINY_AYA_SPACE, token=HF_TOKEN or None)
    token_limit = int(max_tokens or 400)
    temp = float(0.8 if temperature is None else temperature)
    reply = space.predict(
        system or "",
        user or "",
        token_limit,
        temp,
        api_name="/generate",
    )
    if not reply:
        return ""
    return str(reply)
|
| 468 |
+
|
| 469 |
+
|
| 470 |
@fastapi_app.post("/voxcpm-tts")
|
| 471 |
async def voxcpm_tts(request: Request):
|
| 472 |
body = await request.json()
|
|
|
|
| 741 |
"tok_per_sec": round(n / s, 2) if s else None, **llm.status()}
|
| 742 |
|
| 743 |
|
| 744 |
+
@fastapi_app.post("/text/generate/stream")
async def text_generate_stream(request: Request):
    """Stream a text completion to the client as Server-Sent Events.

    The JSON body may carry ``model``, ``system``, ``user``, ``max_tokens`` (or
    ``maxTokens``) and ``temperature``. ``model == "tiny-aya-global-zerogpu"``
    routes to the Tiny Aya Space; every other value goes through
    ``llm.stream_chat``.

    The response emits ``model``, then zero or more ``delta`` events, then
    ``done``. A failure inside the generator emits a single ``error`` event and
    ends the stream without ``done``.

    Raises:
        HTTPException: 400 when the body is not a JSON object, or when
            ``max_tokens``/``temperature`` cannot be converted to numbers.
    """
    # Function-scope import so this block does not rely on the file's import list.
    # NOTE(review): assumes `fastapi` is importable here, since the route is
    # registered on `fastapi_app` — confirm.
    from fastapi import HTTPException

    body = await request.json()
    if not isinstance(body, dict):
        raise HTTPException(status_code=400, detail="request body must be a JSON object")

    # str() guards against a non-string "model" (e.g. a number), which has no .strip().
    model = str(body.get("model") or "server-local").strip() or "server-local"
    system = body.get("system") or ""
    user = body.get("user") or ""
    raw_temperature = body.get("temperature")
    try:
        max_tokens = int(body.get("max_tokens") or body.get("maxTokens") or 400)
        # Explicit None check: a temperature of 0 is a valid request, not "missing".
        temperature = float(raw_temperature if raw_temperature is not None else 0.8)
    except (TypeError, ValueError) as exc:
        raise HTTPException(
            status_code=400,
            detail="max_tokens and temperature must be numeric",
        ) from exc

    # Set when the response generator exits so llm.stream_chat can stop early.
    stop = threading.Event()

    async def gen():
        yield _sse("model", {"model": model})
        loop = asyncio.get_running_loop()
        q: asyncio.Queue = asyncio.Queue()
        DONE = object()  # sentinel: the worker has finished, successfully or not

        def worker():
            # Runs in a plain thread. asyncio.Queue is not thread-safe, so every
            # put is handed to the event loop via call_soon_threadsafe.
            try:
                if model == "tiny-aya-global-zerogpu":
                    if not TINY_AYA_SPACE:
                        raise llm.LlmUnavailable("TINY_AYA_SPACE not set")
                    # The Space call returns the whole reply at once: one delta.
                    text = _tiny_aya_generate(system, user, max_tokens, temperature)
                    if text:
                        loop.call_soon_threadsafe(q.put_nowait, ("delta", text))
                else:
                    for chunk in llm.stream_chat(
                        system,
                        user,
                        max_tokens=max_tokens,
                        temperature=temperature,
                        should_stop=stop.is_set,
                    ):
                        loop.call_soon_threadsafe(q.put_nowait, ("delta", chunk))
            except Exception as e:  # noqa: BLE001
                # Any failure is reported to the client as an SSE `error` event.
                loop.call_soon_threadsafe(q.put_nowait, ("error", str(e)))
            loop.call_soon_threadsafe(q.put_nowait, (DONE, None))

        threading.Thread(target=worker, daemon=True).start()

        try:
            while True:
                kind, val = await q.get()
                if kind is DONE:
                    break
                if kind == "error":
                    yield _sse("error", {"error": val})
                    return
                yield _sse("delta", {"content": val})
        finally:
            # Runs on normal completion, on error, and when the generator is closed.
            stop.set()
        yield _sse("done", {"model": model})

    return StreamingResponse(gen(), media_type="text/event-stream", headers={
        "Cache-Control": "no-cache, no-transform",
        "Connection": "keep-alive",
        "X-Accel-Buffering": "no",
    })
|
| 801 |
+
|
| 802 |
+
|
| 803 |
# Persona generation, woid-protocol-compatible so web/personaStream.js consumes it
|
| 804 |
# unchanged: emits `model` → `delta`* → `persona-done` → `done` (or `error`). The
|
| 805 |
# blocking llama.cpp generator runs in a worker thread bridged to this async SSE
|
web/engineServer.js
ADDED
|
@@ -0,0 +1,81 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
// Engine: server-side text generation. Keeps API keys/model hosts off the client and
|
| 2 |
+
// lets the same picker choose either a configured local llama.cpp server or a ZeroGPU
|
| 3 |
+
// hosted model such as Tiny Aya Global.
|
| 4 |
+
import { statsTracker } from '/web/genStats.js'
|
| 5 |
+
|
| 6 |
+
// Models offered by the server engine. `id` is sent to the backend as `model`.
const MODELS = [
  {
    id: 'server-local',
    label: 'Configured server model',
    params: 'local/remote',
    note: 'uses TINY_LLM_* on the Space or local app',
  },
  {
    id: 'tiny-aya-global-zerogpu',
    label: 'Tiny Aya Global 3.35B',
    params: '3.35B',
    note: 'ZeroGPU sidecar; multilingual',
  },
]

// Look a model up by id; an unknown id falls back to the first entry.
function get(id) {
  const match = MODELS.find((entry) => entry.id === id)
  return match || MODELS[0]
}
|
| 11 |
+
|
| 12 |
+
// POST `body` to /text/generate/stream and parse the Server-Sent Events reply.
//   onEvent(evt, parsed, data) — called per event that carries data: the event name,
//                                the JSON-parsed payload (null if not JSON), the raw text.
//   signal                     — optional AbortSignal passed to fetch.
// Throws on a non-OK response and on an `error` event.
async function streamSse(body, { onEvent, signal } = {}) {
  const res = await fetch('/text/generate/stream', {
    method: 'POST',
    headers: { 'Content-Type': 'application/json' },
    body: JSON.stringify(body),
    signal,
  })
  if (!res.ok || !res.body) throw new Error(`HTTP ${res.status}`)
  const reader = res.body.getReader()
  const decoder = new TextDecoder()

  // Parse one raw event block ("event: …\ndata: …") and dispatch it.
  const dispatch = (evChunk) => {
    let evt = 'message'
    const dataLines = []
    for (const line of evChunk.split(/\r?\n/)) {
      if (line.startsWith('event:')) evt = line.slice(6).trim()
      else if (line.startsWith('data:')) dataLines.push(line.slice(5).trimStart())
    }
    const data = dataLines.join('\n')
    if (!data) return
    let parsed = null
    try { parsed = JSON.parse(data) } catch { /* not JSON: the raw text is still passed on */ }
    if (evt === 'error') throw new Error(parsed?.error || data)
    onEvent?.(evt, parsed, data)
  }

  let buf = ''
  try {
    while (true) {
      const { value, done } = await reader.read()
      // At EOF, flush bytes the decoder is still holding from a split multi-byte char.
      buf += done ? decoder.decode() : decoder.decode(value, { stream: true })
      // Events end with a blank line; accept both LF and CRLF framing.
      const events = buf.split(/\r?\n\r?\n/)
      // Mid-stream the last piece may be incomplete, so hold it back; at EOF
      // nothing more is coming, so dispatch it too instead of dropping it.
      buf = done ? '' : (events.pop() ?? '')
      for (const evChunk of events) dispatch(evChunk)
      if (done) break
    }
  } finally {
    // Release the body on every exit path, including an `error` event or a
    // throwing onEvent handler. Settles harmlessly if the stream already ended.
    reader.cancel().catch(() => {})
  }
}
|
| 46 |
+
|
| 47 |
+
// Generate text with model `id` and resolve to { text, stats } once the stream ends.
// Each non-empty `delta` payload is appended to the result, forwarded to `onToken`,
// and counted by the stats tracker; all other events are ignored.
async function stream(id, system, user, { maxTokens = 200, temperature = 0.8, onToken, onStats, signal } = {}) {
  const { id: model } = get(id)
  const tracker = statsTracker(onStats)
  let text = ''

  const handleEvent = (evt, parsed) => {
    const piece = evt === 'delta' ? (parsed?.content || '') : ''
    if (!piece) return
    text += piece
    onToken?.(piece)
    tracker.tick()
  }

  const request = { model, system, user, max_tokens: maxTokens, temperature }
  await streamSse(request, { signal, onEvent: handleEvent })

  return { text, stats: tracker.finish() }
}
|
| 70 |
+
|
| 71 |
+
// Engine descriptor for server-side generation; runtime.js imports this as `engine`.
export const engine = {
  id: 'server',
  label: 'Server / ZeroGPU',
  models: MODELS,
  defaultModel: 'tiny-aya-global-zerogpu',
  needsDownload: false, // nothing is fetched into the browser for this engine
  available() { return true },
  async ensure() {}, // no-op: there is no model to load client-side
  backendLabel() { return 'server' },
  stream,
}
|
web/modelBar.js
CHANGED
|
@@ -31,7 +31,7 @@ export function mountModelBar(host, { onChange } = {}) {
|
|
| 31 |
const del = el('button', { class: 'model-del', type: 'button', title: 'Delete this model from your browser cache' }, '🗑 delete')
|
| 32 |
const info = el('div', { class: 'model-info' })
|
| 33 |
host.append(el('div', { class: 'model-bar' }, [
|
| 34 |
-
el('label', { class: 'persona-label' }, '
|
| 35 |
engSel,
|
| 36 |
el('label', { class: 'persona-label' }, 'Model'),
|
| 37 |
sel, el('div', { class: 'model-row' }, [info, del]),
|
|
@@ -53,13 +53,14 @@ export function mountModelBar(host, { onChange } = {}) {
|
|
| 53 |
sel.value = cur
|
| 54 |
const m = currentModel()
|
| 55 |
const size = sizeOf(m)
|
| 56 |
-
|
|
|
|
| 57 |
del.style.display = (cacheSupported() && cached.has(m.id)) ? '' : 'none'
|
| 58 |
}
|
| 59 |
async function refresh() {
|
| 60 |
cached = cacheSupported() ? await cachedSet() : new Set()
|
| 61 |
const { usage, quota } = await storageEstimate()
|
| 62 |
-
storeNote = quota ? ` · cache ${fmtBytes(usage)}/${fmtBytes(quota)}` : ''
|
| 63 |
render()
|
| 64 |
}
|
| 65 |
|
|
|
|
| 31 |
const del = el('button', { class: 'model-del', type: 'button', title: 'Delete this model from your browser cache' }, '🗑 delete')
|
| 32 |
const info = el('div', { class: 'model-info' })
|
| 33 |
host.append(el('div', { class: 'model-bar' }, [
|
| 34 |
+
el('label', { class: 'persona-label' }, 'Runtime'),
|
| 35 |
engSel,
|
| 36 |
el('label', { class: 'persona-label' }, 'Model'),
|
| 37 |
sel, el('div', { class: 'model-row' }, [info, del]),
|
|
|
|
| 53 |
sel.value = cur
|
| 54 |
const m = currentModel()
|
| 55 |
const size = sizeOf(m)
|
| 56 |
+
const cacheText = cacheSupported() ? (cached.has(m.id) ? 'cached' : 'downloads on first use') : (m.note || 'no browser download')
|
| 57 |
+
info.textContent = `${m.params || ''}${size ? ` · ${size}` : ''} · ${backendLabel()} · ${cacheText}${storeNote}`
|
| 58 |
del.style.display = (cacheSupported() && cached.has(m.id)) ? '' : 'none'
|
| 59 |
}
|
| 60 |
async function refresh() {
|
| 61 |
cached = cacheSupported() ? await cachedSet() : new Set()
|
| 62 |
const { usage, quota } = await storageEstimate()
|
| 63 |
+
storeNote = cacheSupported() && quota ? ` · cache ${fmtBytes(usage)}/${fmtBytes(quota)}` : ''
|
| 64 |
render()
|
| 65 |
}
|
| 66 |
|
web/runtime.js
CHANGED
|
@@ -1,13 +1,14 @@
|
|
| 1 |
-
// Runtime facade — picks the active engine (wllama / Transformers.js / WebLLM) and
|
| 2 |
// model, and delegates load/stream/cache. Lets you A/B the same model across engines
|
| 3 |
// and compare tok/s. Panels + the model bar import only from here. (Named runtime.js,
|
| 4 |
// not engine.js — that one is the game-engine bundle.)
|
| 5 |
import { engine as wllama } from '/web/engineWllama.js'
|
| 6 |
import { engine as transformers } from '/web/engineTransformers.js'
|
| 7 |
import { engine as webllm } from '/web/engineWebllm.js'
|
|
|
|
| 8 |
import { ensurePersistentStorage } from '/web/storage.js'
|
| 9 |
|
| 10 |
-
const ENGINES = [wllama, transformers, webllm]
|
| 11 |
// Persisted choices (survive refresh). Defaults: WebLLM where there's WebGPU (fastest),
|
| 12 |
// else wllama so the app still works without it.
|
| 13 |
const ENGINE_KEY = 'tinyarmy.llmEngine', MODELS_KEY = 'tinyarmy.llmModels'
|
|
@@ -53,6 +54,7 @@ export function setModel(id) {
|
|
| 53 |
}
|
| 54 |
|
| 55 |
export const ensureModel = async (onProgress) => {
|
|
|
|
| 56 |
await ensurePersistentStorage() // keep downloads from being evicted across engine switches
|
| 57 |
return eng().ensure(currentModelId(), onProgress)
|
| 58 |
}
|
|
|
|
| 1 |
+
// Runtime facade — picks the active engine (wllama / Transformers.js / WebLLM / server) and
|
| 2 |
// model, and delegates load/stream/cache. Lets you A/B the same model across engines
|
| 3 |
// and compare tok/s. Panels + the model bar import only from here. (Named runtime.js,
|
| 4 |
// not engine.js — that one is the game-engine bundle.)
|
| 5 |
import { engine as wllama } from '/web/engineWllama.js'
|
| 6 |
import { engine as transformers } from '/web/engineTransformers.js'
|
| 7 |
import { engine as webllm } from '/web/engineWebllm.js'
|
| 8 |
+
import { engine as server } from '/web/engineServer.js'
|
| 9 |
import { ensurePersistentStorage } from '/web/storage.js'
|
| 10 |
|
| 11 |
+
const ENGINES = [wllama, transformers, webllm, server]
|
| 12 |
// Persisted choices (survive refresh). Defaults: WebLLM where there's WebGPU (fastest),
|
| 13 |
// else wllama so the app still works without it.
|
| 14 |
const ENGINE_KEY = 'tinyarmy.llmEngine', MODELS_KEY = 'tinyarmy.llmModels'
|
|
|
|
| 54 |
}
|
| 55 |
|
| 56 |
// Make sure the current model is ready on the active engine.
export const ensureModel = async (onProgress) => {
  // Engines that declare `needsDownload === false` skip the persistent-storage
  // request; for the rest it keeps downloads from being evicted across engine switches.
  const downloads = eng().needsDownload !== false
  if (downloads) await ensurePersistentStorage()
  return eng().ensure(currentModelId(), onProgress)
}
|
web/settingsPanel.js
CHANGED
|
@@ -1,7 +1,7 @@
|
|
| 1 |
// Inject our settings sections into Gradio's OWN settings page (footer "Settings" or the
|
| 2 |
// sidebar ⚙ button → ?view=settings). Not an official extension point, so we anchor on
|
| 3 |
// the "Display Theme" section, clone its styling, and prepend matching sections:
|
| 4 |
-
// •
|
| 5 |
// • Voice — the read-aloud TTS engine/voice picker (ttsBar)
|
| 6 |
// Both drive the shared runtime.js / tts.js singletons, so every page uses the same
|
| 7 |
// choice. Fragile by nature (rides Gradio's DOM): if the structure changes the sections
|
|
@@ -45,9 +45,9 @@ export function mountSettingsPanel() {
|
|
| 45 |
injectSection(sample, 'tac-quality-settings', 'Recommended settings',
|
| 46 |
'Pick a quality preset — it sets the AI model and voice together, like graphics ' +
|
| 47 |
'presets in a game. Changing either by hand switches to Custom.', mountQualityBar)
|
| 48 |
-
injectSection(sample, 'tac-model-settings', '
|
| 49 |
-
'The
|
| 50 |
-
'
|
| 51 |
injectSection(sample, 'tac-voice-settings', 'Voice',
|
| 52 |
'The provider that voices your heroes. Qwen3-TTS designs a voice from each hero’s ' +
|
| 53 |
'description; Kokoro/Kitten run on your device with a named voice you pick per hero. ' +
|
|
|
|
| 1 |
// Inject our settings sections into Gradio's OWN settings page (footer "Settings" or the
|
| 2 |
// sidebar ⚙ button → ?view=settings). Not an official extension point, so we anchor on
|
| 3 |
// the "Display Theme" section, clone its styling, and prepend matching sections:
|
| 4 |
+
// • Text Generation Model — the LLM engine/model picker (modelBar)
|
| 5 |
// • Voice — the read-aloud TTS engine/voice picker (ttsBar)
|
| 6 |
// Both drive the shared runtime.js / tts.js singletons, so every page uses the same
|
| 7 |
// choice. Fragile by nature (rides Gradio's DOM): if the structure changes the sections
|
|
|
|
| 45 |
injectSection(sample, 'tac-quality-settings', 'Recommended settings',
|
| 46 |
'Pick a quality preset — it sets the AI model and voice together, like graphics ' +
|
| 47 |
'presets in a game. Changing either by hand switches to Custom.', mountQualityBar)
|
| 48 |
+
injectSection(sample, 'tac-model-settings', 'Text Generation Model',
|
| 49 |
+
'The model that writes your soldiers and their war diaries. Use browser-local ' +
|
| 50 |
+
'models, a configured local server, or a ZeroGPU-hosted model.', mountModelBar)
|
| 51 |
injectSection(sample, 'tac-voice-settings', 'Voice',
|
| 52 |
'The provider that voices your heroes. Qwen3-TTS designs a voice from each hero’s ' +
|
| 53 |
'description; Kokoro/Kitten run on your device with a named voice you pick per hero. ' +
|
web/shell/persona.css
CHANGED
|
@@ -256,7 +256,7 @@
|
|
| 256 |
.persona-go-alt:hover { background: var(--p-paper-2) !important; color: var(--p-ink) !important; }
|
| 257 |
.tts-status { min-height: 14px; }
|
| 258 |
|
| 259 |
-
/* ── "
|
| 260 |
/* The model bar's styles use --p-* vars (normally scoped to .persona-view); define
|
| 261 |
them here too so the picker renders correctly inside Gradio's settings modal. */
|
| 262 |
.tac-set-section {
|
|
|
|
| 256 |
.persona-go-alt:hover { background: var(--p-paper-2) !important; color: var(--p-ink) !important; }
|
| 257 |
.tts-status { min-height: 14px; }
|
| 258 |
|
| 259 |
+
/* ── "Text Generation Model" section injected into Gradio's own Settings page ─ */
|
| 260 |
/* The model bar's styles use --p-* vars (normally scoped to .persona-view); define
|
| 261 |
them here too so the picker renders correctly inside Gradio's settings modal. */
|
| 262 |
.tac-set-section {
|