Spaces:

build-small-hackathon
/

tiny-army

Running

polats Claude Opus 4.8 (1M context) committed on Jun 4

Commit

e352ff3

1 Parent(s): 0f47058

Voice settings: add "Qwen3-TTS local" engine — localhost-only, default on localhost

- New TTS engine 'qwen3local' (Qwen3-TTS · local (your GPU)) that POSTs to same-origin
/qwen-tts. available() = isLocalhost(), so it's enabled on localhost and shown
DISABLED with a "run the project locally" note in prod.
- Default TTS engine is now qwen3local on localhost, Kokoro in prod (local-first).
- ttsBar shows each disabled engine's note instead of "n/a".
- app.py: TINY_TTS_MODE=local runs the open weights in-process (lazy qwen-tts, GPU),
so the local engine's same-origin /qwen-tts serves the model — the LeLab pattern
(one origin, no CORS/cert). When TINY_TTS_MODE is unset (as on the Space), /qwen-tts uses DashScope as before.

Co-Authored-By: Claude Opus 4.8 (1M context) <noreply@anthropic.com>

Files changed (4) hide show

app.py +35 -2
web/tts.js +5 -4
web/ttsBar.js +1 -1
web/ttsQwen3.js +29 -10

app.py CHANGED Viewed

@@ -242,6 +242,32 @@ DASHSCOPE_KEY = os.environ.get("DASHSCOPE_API_KEY", "")
 _DASHSCOPE_BASE = os.environ.get("DASHSCOPE_BASE", "https://dashscope-intl.aliyuncs.com")
 _DASHSCOPE_URL = _DASHSCOPE_BASE + "/api/v1/services/audio/tts/customization"
 def _dashscope_voice_design(text, instruct):
     payload = _json.dumps({
@@ -273,13 +299,20 @@ def _dashscope_voice_design(text, instruct):
 @fastapi_app.post("/qwen-tts")
 async def qwen_tts(request: Request):
-    if not DASHSCOPE_KEY:
-        return Response("DASHSCOPE_API_KEY not set", status_code=503)
     body = await request.json()
     text = (body.get("text") or "").strip()
     instruct = (body.get("instruct") or "").strip()
     if not text:
         return Response("text required", status_code=400)
     wav, err = await asyncio.to_thread(_dashscope_voice_design, text, instruct)
     if err:
         return Response(err, status_code=502)

 _DASHSCOPE_BASE = os.environ.get("DASHSCOPE_BASE", "https://dashscope-intl.aliyuncs.com")
 _DASHSCOPE_URL = _DASHSCOPE_BASE + "/api/v1/services/audio/tts/customization"
+# TINY_TTS_MODE=local → run the OPEN WEIGHTS in-process (your GPU, off the grid; same
+# origin so no CORS/cert dance — the LeLab pattern). Needs `pip install qwen-tts torch
+# soundfile`. Lazy-loaded; the Space (cpu-basic) leaves this unset and uses DashScope.
+TTS_MODE = os.environ.get("TINY_TTS_MODE", "").strip().lower()
+_local_tts = None
+_local_tts_lock = threading.Lock()
+# Synthesize `text` with the in-process Qwen3-TTS model and return the audio as WAV
+# bytes. `instruct` is the free-form voice description (a default description is
+# substituted when it is empty); `language` is passed through to the model unchanged.
+# The model is created on the first call and cached in the module-level `_local_tts`.
+# Nothing is caught here: import, load and generation errors propagate to the caller.
+def _local_voice_design(text, instruct, language="English"):
+    global _local_tts
+    import io
+    with _local_tts_lock:  # one GPU model can't decode in parallel
+        if _local_tts is None:
+            # First call only: the heavy imports and the model load happen lazily,
+            # while the lock is held, so concurrent first requests load it once.
+            import torch
+            from qwen_tts import Qwen3TTSModel
+            # QWEN_TTS_MODEL overrides the model id; otherwise the VoiceDesign default.
+            mid = os.environ.get("QWEN_TTS_MODEL", "Qwen/Qwen3-TTS-12Hz-1.7B-VoiceDesign")
+            # Use the first CUDA device when available (bfloat16), else CPU (float32).
+            dev = "cuda:0" if torch.cuda.is_available() else "cpu"
+            dt = torch.bfloat16 if dev != "cpu" else torch.float32
+            _local_tts = Qwen3TTSModel.from_pretrained(mid, device_map=dev, dtype=dt)
+        import soundfile as sf
+        wavs, sr = _local_tts.generate_voice_design(
+            text=text, language=language, instruct=instruct or "A clear, natural voice at a moderate pace.")
+    # Encoding runs outside the lock; only the first returned waveform is written.
+    out = io.BytesIO()
+    sf.write(out, wavs[0], sr, format="WAV")
+    return out.getvalue()
 def _dashscope_voice_design(text, instruct):
     payload = _json.dumps({
 @fastapi_app.post("/qwen-tts")
 async def qwen_tts(request: Request):
     body = await request.json()
     text = (body.get("text") or "").strip()
     instruct = (body.get("instruct") or "").strip()
+    language = body.get("language") or "English"
     if not text:
         return Response("text required", status_code=400)
+    if TTS_MODE == "local":  # in-process open weights (dev)
+        try:
+            wav = await asyncio.to_thread(_local_voice_design, text, instruct, language)
+        except Exception as e:  # noqa: BLE001 — surface a clear setup hint
+            return Response(f"local TTS error (pip install qwen-tts torch soundfile?): {e}", status_code=500)
+        return Response(wav, media_type="audio/wav", headers={"Cache-Control": "no-store"})
+    if not DASHSCOPE_KEY:
+        return Response("DASHSCOPE_API_KEY not set (or run with TINY_TTS_MODE=local)", status_code=503)
     wav, err = await asyncio.to_thread(_dashscope_voice_design, text, instruct)
     if err:
         return Response(err, status_code=502)

web/tts.js CHANGED Viewed

@@ -3,14 +3,15 @@
 // reader that speaks sentence-by-sentence so a war diary can narrate itself while the
 // LLM is still writing. Panels + the TTS bar import only from here.
 import { engine as kokoro } from '/web/ttsKokoro.js'
-import { engine as qwen3 } from '/web/ttsQwen3.js'
 import { engine as kitten } from '/web/ttsKitten.js'
 import { engine as webspeech } from '/web/ttsWebSpeech.js'
 import { playSamples, stopAudio } from '/web/ttsAudio.js'
 import { ensurePersistentStorage } from '/web/storage.js'
-const ENGINES = [kokoro, qwen3, kitten, webspeech]
-let activeId = 'kokoro'
 // Qwen3-TTS designs a voice from a free-form description (the persona's `voice`).
 // Panels set it before narrating; previewVoice() plays a one-off sample.
@@ -26,7 +27,7 @@ const voiceSel = {} // engineId -> chosen voice id
 const eng = () => ENGINES.find((e) => e.id === activeId) || ENGINES[0]
 export const listTtsEngines = () =>
-  ENGINES.map((e) => ({ id: e.id, label: e.label, available: e.available(), experimental: !!e.experimental }))
 export const getTtsEngineId = () => activeId
 export function setTtsEngine(id) { if (ENGINES.some((e) => e.id === id)) activeId = id }

 // reader that speaks sentence-by-sentence so a war diary can narrate itself while the
 // LLM is still writing. Panels + the TTS bar import only from here.
 import { engine as kokoro } from '/web/ttsKokoro.js'
+import { engine as qwen3, engineLocal as qwen3local, isLocalhost } from '/web/ttsQwen3.js'
 import { engine as kitten } from '/web/ttsKitten.js'
 import { engine as webspeech } from '/web/ttsWebSpeech.js'
 import { playSamples, stopAudio } from '/web/ttsAudio.js'
 import { ensurePersistentStorage } from '/web/storage.js'
+const ENGINES = [kokoro, qwen3local, qwen3, kitten, webspeech]
+// On localhost the local-GPU Qwen3-TTS is the default; in prod it's Kokoro (local-first).
+let activeId = isLocalhost() ? 'qwen3local' : 'kokoro'
 // Qwen3-TTS designs a voice from a free-form description (the persona's `voice`).
 // Panels set it before narrating; previewVoice() plays a one-off sample.
 const eng = () => ENGINES.find((e) => e.id === activeId) || ENGINES[0]
 export const listTtsEngines = () =>
+  ENGINES.map((e) => ({ id: e.id, label: e.label, available: e.available(), experimental: !!e.experimental, note: e.note || '' }))
 export const getTtsEngineId = () => activeId
 export function setTtsEngine(id) { if (ENGINES.some((e) => e.id === id)) activeId = id }

web/ttsBar.js CHANGED Viewed

@@ -35,7 +35,7 @@ export function mountTtsBar(host, { onChange } = {}) {
   engSel.replaceChildren(...listTtsEngines().map((e) =>
     el('option', { value: e.id, ...(e.available ? {} : { disabled: 'disabled' }) },
-      `${e.label}${e.available ? '' : ' · n/a'}`)))
   engSel.value = getTtsEngineId()
   function renderVoices() {

   engSel.replaceChildren(...listTtsEngines().map((e) =>
     el('option', { value: e.id, ...(e.available ? {} : { disabled: 'disabled' }) },
+      `${e.label}${e.available ? '' : ' · ' + (e.note || 'n/a')}`)))
   engSel.value = getTtsEngineId()
   function renderVoices() {

web/ttsQwen3.js CHANGED Viewed

@@ -31,9 +31,14 @@ const VOICES = [
 ]
 const get = (id) => VOICES.find((v) => v.id === id) || VOICES[0]
-async function synth(text, voiceId) {
   const instruct = (get(voiceId).desc() || '').trim()
-  const resp = await fetch(`${ttsBase()}/qwen-tts`, {
     method: 'POST', headers: { 'Content-Type': 'application/json' },
     body: JSON.stringify({ text, instruct, language: 'English' }),
   })
@@ -41,17 +46,31 @@ async function synth(text, voiceId) {
   return decodeAudio(await resp.arrayBuffer())
 }
 export const engine = {
   id: 'qwen3',
   label: 'Qwen3-TTS · Voice Design (cloud)',
-  mode: 'pcm',
-  needsDownload: false,
-  networked: true,
   available: () => true,
-  listVoices: () => VOICES,
-  defaultVoice: 'persona',
-  ensure: async () => { /* nothing to load — server-side */ },
-  synth,
   backendLabel: () => { const b = ttsBase(); try { return b ? '🖥 ' + new URL(b).host : '☁ DashScope' } catch { return '☁ DashScope' } },
-  setDesc(d) { _desc = (d || '').trim() },
 }

 ]
 const get = (id) => VOICES.find((v) => v.id === id) || VOICES[0]
+// True when the page's hostname is localhost, 127.0.0.1, ::1 (optionally bracketed)
+// or 0.0.0.0 — a case-insensitive match on the whole hostname. Any exception while
+// reading `location.hostname` yields false (presumably for non-browser contexts).
+export const isLocalhost = () => {
+  try { return /^(localhost|127\.0\.0\.1|\[?::1\]?|0\.0\.0\.0)$/i.test(location.hostname) } catch { return false }
+}
+// POST to `${base}/qwen-tts` → WAV → samples. base '' = same-origin.
+async function postSynth(base, text, voiceId) {
   const instruct = (get(voiceId).desc() || '').trim()
+  const resp = await fetch(`${base}/qwen-tts`, {
     method: 'POST', headers: { 'Content-Type': 'application/json' },
     body: JSON.stringify({ text, instruct, language: 'English' }),
   })
   return decodeAudio(await resp.arrayBuffer())
 }
+// Fields shared by both Qwen3-TTS engine objects: each one spreads this in and then
+// adds its own id, label, available(), synth() and backendLabel().
+const common = {
+  mode: 'pcm', needsDownload: false, networked: true,
+  listVoices: () => VOICES, defaultVoice: 'persona',
+  ensure: async () => { /* nothing to load — server-side */ },
+  setDesc(d) { _desc = (d || '').trim() }, // shared _desc across both variants
+}
+// CLOUD: the hosted backend (/qwen-tts → DashScope). `?tts=` can still bridge it.
 export const engine = {
+  ...common,
   id: 'qwen3',
   label: 'Qwen3-TTS · Voice Design (cloud)',
   available: () => true, // never disabled in the engine list
+  synth: (text, voiceId) => postSynth(ttsBase(), text, voiceId), // POSTs to `${ttsBase()}/qwen-tts`
   backendLabel: () => { const b = ttsBase(); try { return b ? '🖥 ' + new URL(b).host : '☁ DashScope' } catch { return '☁ DashScope' } },
+}
+// LOCAL: same-origin /qwen-tts on a locally-run app.py (TINY_TTS_MODE=local → the open
+// weights on your GPU). Only offered on localhost; disabled with a note in prod.
+export const engineLocal = {
+  ...common,
+  id: 'qwen3local',
+  label: 'Qwen3-TTS · local (your GPU)',
+  available: () => isLocalhost(), // enabled only when the page is served from localhost
+  note: 'run the project locally', // text the TTS bar shows instead of "n/a" while disabled
+  synth: (text, voiceId) => postSynth('', text, voiceId), // '' base = same-origin /qwen-tts
+  backendLabel: () => '🖥 local model',
 }