polats Claude Opus 4.8 (1M context) committed on
Commit
3bc1b31
·
1 Parent(s): dffe06d

Add Qwen3-TTS local bridge: ?tts= override + tts_server.py

Browse files

LeLab-style: point the (hosted or local) UI at a self-run Qwen3-TTS server so voices
are designed on YOUR GPU, off the grid.

- ttsQwen3.js: endpoint is configurable via a ?tts=<base> query param (persisted to
localStorage; ?tts= clears it). Default stays the same-origin /qwen-tts (DashScope).
backendLabel shows 🖥 host when bridged.
- tts_server.py: standalone FastAPI server running the open weights
(Qwen3-TTS-12Hz-1.7B-VoiceDesign) via the qwen-tts package, POST /qwen-tts
{text,instruct,language} → WAV, CORS-open. QWEN_TTS_STUB=1 returns a tone so the
bridge can be smoke-tested without a GPU.

Co-Authored-By: Claude Opus 4.8 (1M context) <noreply@anthropic.com>

Files changed (3) hide show
  1. .gitignore +1 -0
  2. tts_server.py +112 -0
  3. web/ttsQwen3.js +19 -3
.gitignore CHANGED
@@ -1,2 +1,3 @@
1
  __pycache__/
2
  *.pyc
 
 
1
  __pycache__/
2
  *.pyc
3
+ .venv/
tts_server.py ADDED
@@ -0,0 +1,112 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """Local Qwen3-TTS Voice Design server — the LeLab-style bridge.
2
+
3
+ Runs the OPEN WEIGHTS on YOUR machine's GPU; the hosted Tiny Army UI calls it via a
4
+ `?tts=` override, so voices are designed locally and off the grid (no DashScope key/cost).
5
+
6
+ Quick start (on a CUDA box; MPS/CPU work but are slow):
7
+
8
+ pip install qwen-tts soundfile "fastapi[standard]" uvicorn torch
9
+ python tts_server.py # serves http://localhost:8800/qwen-tts
10
+
11
+ Then open the app pointed at this server:
12
+
13
+ http://localhost:7860/?tts=http://localhost:8800 # local UI + local TTS
14
+ https://tinyarmy.noods.cc/?tts=http://localhost:8800 # hosted UI + your GPU
15
+ (browsers block https→http://localhost by default; run Chrome with
16
+ --unsafely-treat-insecure-origin-as-secure=http://localhost:8800 or serve TLS)
17
+
18
+ Smoke-test the bridge WITHOUT a GPU (returns a short tone instead of speech):
19
+
20
+ QWEN_TTS_STUB=1 python tts_server.py
21
+
22
+ Env: PORT (8800), QWEN_TTS_MODEL (Qwen/Qwen3-TTS-12Hz-1.7B-VoiceDesign), QWEN_TTS_STUB.
23
+ """
24
+ import asyncio
25
+ import io
26
+ import math
27
+ import os
28
+ import struct
29
+
30
+ from fastapi import FastAPI, Request
31
+ from fastapi.responses import Response
32
+ from fastapi.middleware.cors import CORSMiddleware
33
+
34
+ MODEL_ID = os.environ.get("QWEN_TTS_MODEL", "Qwen/Qwen3-TTS-12Hz-1.7B-VoiceDesign")
35
+ STUB = os.environ.get("QWEN_TTS_STUB", "") not in ("", "0", "false", "False")
36
+ PORT = int(os.environ.get("PORT", "8800"))
37
+
38
# ASGI application; the route handlers in this module register themselves on it.
app = FastAPI()

# The hosted UI is served from a different origin, so cross-origin requests
# must be accepted — from it and from any localhost dev port. Credentials
# stay disabled.
app.add_middleware(
    CORSMiddleware,
    allow_origins=["*"],
    allow_methods=["*"],
    allow_headers=["*"],
    allow_credentials=False,
)

# Lazily created model instance; stays None until the first real synthesis.
_model = None
# Held around each non-stub synthesis so only one runs at a time.
_load_lock = asyncio.Lock()
47
+
48
+
49
def _load_model():
    """Return the shared Qwen3-TTS model, loading it on the first call.

    The device is chosen in order of preference: CUDA (bfloat16), then MPS
    (float32), then CPU (float32). The loaded instance is cached in the
    module-level ``_model`` so later calls return immediately.
    """
    global _model
    if _model is None:
        # Heavy imports are deferred to the first real load.
        import torch
        from qwen_tts import Qwen3TTSModel

        # Older torch builds may not expose ``torch.backends.mps`` at all.
        mps_backend = getattr(torch.backends, "mps", None)
        if torch.cuda.is_available():
            device, dtype = "cuda:0", torch.bfloat16
        elif mps_backend and torch.backends.mps.is_available():
            device, dtype = "mps", torch.float32
        else:
            device, dtype = "cpu", torch.float32

        print(f"[tts] loading {MODEL_ID} on {device} ({dtype})…", flush=True)
        _model = Qwen3TTSModel.from_pretrained(MODEL_ID, device_map=device, dtype=dtype)
        print("[tts] model ready", flush=True)
    return _model
65
+
66
+
67
+ def _stub_wav(text, sr=24000):
68
+ """A short A4 tone — proves the bridge end-to-end without loading the model."""
69
+ secs = min(4.0, max(0.6, len(text) / 18.0))
70
+ n = int(sr * secs)
71
+ buf = io.BytesIO()
72
+ data = b"".join(struct.pack("<h", int(0.25 * 32767 * math.sin(2 * math.pi * 440 * i / sr))) for i in range(n))
73
+ buf.write(b"RIFF"); buf.write(struct.pack("<I", 36 + len(data))); buf.write(b"WAVE")
74
+ buf.write(b"fmt "); buf.write(struct.pack("<IHHIIHH", 16, 1, 1, sr, sr * 2, 2, 16))
75
+ buf.write(b"data"); buf.write(struct.pack("<I", len(data))); buf.write(data)
76
+ return buf.getvalue()
77
+
78
+
79
def _synth(text, instruct, language):
    """Synthesize *text* and return the audio as WAV bytes.

    In stub mode this returns the tone from ``_stub_wav``. Otherwise it calls
    the model's ``generate_voice_design`` with *text*, *language* and
    *instruct*, and encodes the first returned waveform as a WAV file.
    """
    if STUB:
        return _stub_wav(text)

    # Deferred so stub mode does not require soundfile to be installed.
    import soundfile as sf

    model = _load_model()
    wavs, sr = model.generate_voice_design(text=text, language=language, instruct=instruct)
    with io.BytesIO() as out:
        sf.write(out, wavs[0], sr, format="WAV")
        return out.getvalue()
86
+
87
+
88
@app.get("/health")
def health():
    """Report liveness, the configured model id, stub mode, and whether the model is loaded."""
    model_loaded = _model is not None
    return {"ok": True, "model": MODEL_ID, "stub": STUB, "loaded": model_loaded}
91
+
92
+
93
@app.post("/qwen-tts")
async def qwen_tts(request: Request):
    """Synthesize speech for a JSON request and reply with a WAV file.

    Expected body: a JSON object with ``text`` (required), plus optional
    ``instruct`` (voice description) and ``language``; blank or missing
    optional fields fall back to defaults.

    Returns:
        200 with ``audio/wav`` bytes on success; 400 with a plain-text reason
        when the body is not valid JSON, is not a JSON object, or carries no
        usable ``text``.
    """
    try:
        body = await request.json()
    except ValueError:
        # Covers json.JSONDecodeError and UnicodeDecodeError (both ValueError
        # subclasses): a malformed body is a client error, not a server crash.
        return Response("invalid JSON body", status_code=400)
    if not isinstance(body, dict):
        return Response("JSON object required", status_code=400)

    def _field(key):
        # Non-string values (numbers, lists, null) are treated as absent.
        value = body.get(key)
        return value.strip() if isinstance(value, str) else ""

    text = _field("text")
    instruct = _field("instruct") or "A clear, natural voice at a moderate pace."
    language = _field("language") or "English"
    if not text:
        return Response("text required", status_code=400)

    if STUB:
        # The stub tone is synthesized inline, without the lock or a worker thread.
        wav = _synth(text, instruct, language)
    else:
        async with _load_lock:  # one CPU/GPU model can't decode in parallel
            # Run the blocking synthesis in a worker thread to keep the loop responsive.
            wav = await asyncio.to_thread(_synth, text, instruct, language)
    return Response(wav, media_type="audio/wav", headers={"Cache-Control": "no-store"})
107
+
108
+
109
if __name__ == "__main__":
    # Script entry point: announce the endpoint, then serve until interrupted.
    import uvicorn

    # Bind address. The default keeps the original behavior (all interfaces);
    # set HOST=127.0.0.1 to keep the server reachable from this machine only.
    host = os.environ.get("HOST", "").strip() or "0.0.0.0"
    print(f"[tts] http://localhost:{PORT}/qwen-tts (stub={STUB}, model={MODEL_ID})", flush=True)
    uvicorn.run(app, host=host, port=PORT)
web/ttsQwen3.js CHANGED
@@ -4,6 +4,22 @@
4
  // `voice_prompt`. NETWORKED — not local-first (clearly labeled). mode 'pcm'.
5
  import { decodeAudio } from '/web/ttsAudio.js'
6
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
7
  // `desc()` returns the instruct string. 'persona' uses the dynamically-set description.
8
  let _desc = ''
9
  const VOICES = [
@@ -17,9 +33,9 @@ const get = (id) => VOICES.find((v) => v.id === id) || VOICES[0]
17
 
18
  async function synth(text, voiceId) {
19
  const instruct = (get(voiceId).desc() || '').trim()
20
- const resp = await fetch('/qwen-tts', {
21
  method: 'POST', headers: { 'Content-Type': 'application/json' },
22
- body: JSON.stringify({ text, instruct }),
23
  })
24
  if (!resp.ok) throw new Error(`Qwen3-TTS ${resp.status}: ${(await resp.text()).slice(0, 140)}`)
25
  return decodeAudio(await resp.arrayBuffer())
@@ -36,6 +52,6 @@ export const engine = {
36
  defaultVoice: 'persona',
37
  ensure: async () => { /* nothing to load — server-side */ },
38
  synth,
39
- backendLabel: () => '☁ DashScope',
40
  setDesc(d) { _desc = (d || '').trim() },
41
  }
 
4
  // `voice_prompt`. NETWORKED — not local-first (clearly labeled). mode 'pcm'.
5
  import { decodeAudio } from '/web/ttsAudio.js'
6
 
7
+ // Endpoint: default is our Space backend (/qwen-tts → DashScope). A `?tts=<base>` query
8
+ // param (persisted to localStorage) points it at a self-run local server instead —
9
+ // the LeLab-style bridge: hosted UI → Qwen3-TTS on YOUR GPU, off the grid. `?tts=`
10
+ // (empty) clears the override. e.g. ?tts=http://localhost:8800
11
+ const TTS_STORE = 'tinyarmy.ttsBase'
12
function ttsBase() {
  // Resolve the TTS base URL. A `tts` query parameter, when present, updates
  // the persisted override first (an empty value clears it); the persisted
  // value is then returned without trailing slashes. Returns '' when no
  // override is stored or when the lookup throws.
  const stripSlashes = (s) => s.replace(/\/+$/, '')
  try {
    const override = new URLSearchParams(location.search).get('tts')
    if (override === '') {
      localStorage.removeItem(TTS_STORE)
    } else if (override !== null) {
      localStorage.setItem(TTS_STORE, stripSlashes(override))
    }
    const saved = localStorage.getItem(TTS_STORE)
    return saved ? stripSlashes(saved) : ''
  } catch {
    return ''
  }
}
22
+
23
  // `desc()` returns the instruct string. 'persona' uses the dynamically-set description.
24
  let _desc = ''
25
  const VOICES = [
 
33
 
34
  async function synth(text, voiceId) {
35
  const instruct = (get(voiceId).desc() || '').trim()
36
+ const resp = await fetch(`${ttsBase()}/qwen-tts`, {
37
  method: 'POST', headers: { 'Content-Type': 'application/json' },
38
+ body: JSON.stringify({ text, instruct, language: 'English' }),
39
  })
40
  if (!resp.ok) throw new Error(`Qwen3-TTS ${resp.status}: ${(await resp.text()).slice(0, 140)}`)
41
  return decodeAudio(await resp.arrayBuffer())
 
52
  defaultVoice: 'persona',
53
  ensure: async () => { /* nothing to load — server-side */ },
54
  synth,
55
+ backendLabel: () => { const b = ttsBase(); try { return b ? '🖥 ' + new URL(b).host : '☁ DashScope' } catch { return '☁ DashScope' } },
56
  setDesc(d) { _desc = (d || '').trim() },
57
  }