polats Claude Opus 4.8 (1M context) committed on
Commit
e352ff3
·
1 Parent(s): 0f47058

Voice settings: add "Qwen3-TTS local" engine — localhost-only, default on localhost

Browse files

- New TTS engine 'qwen3local' (Qwen3-TTS · local (your GPU)) that POSTs to same-origin
/qwen-tts. available() = isLocalhost(), so it's enabled on localhost and shown
DISABLED with a "run the project locally" note in prod.
- Default TTS engine is now qwen3local on localhost, Kokoro in prod (local-first).
- ttsBar shows each disabled engine's note instead of "n/a".
- app.py: TINY_TTS_MODE=local runs the open weights in-process (lazy qwen-tts, GPU),
so the local engine's same-origin /qwen-tts serves the model — the LeLab pattern
(one origin, no CORS/cert). Unset on the Space → DashScope as before.

Co-Authored-By: Claude Opus 4.8 (1M context) <noreply@anthropic.com>

Files changed (4) hide show
  1. app.py +35 -2
  2. web/tts.js +5 -4
  3. web/ttsBar.js +1 -1
  4. web/ttsQwen3.js +29 -10
app.py CHANGED
@@ -242,6 +242,32 @@ DASHSCOPE_KEY = os.environ.get("DASHSCOPE_API_KEY", "")
242
  _DASHSCOPE_BASE = os.environ.get("DASHSCOPE_BASE", "https://dashscope-intl.aliyuncs.com")
243
  _DASHSCOPE_URL = _DASHSCOPE_BASE + "/api/v1/services/audio/tts/customization"
244
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
245
 
246
  def _dashscope_voice_design(text, instruct):
247
  payload = _json.dumps({
@@ -273,13 +299,20 @@ def _dashscope_voice_design(text, instruct):
273
 
274
  @fastapi_app.post("/qwen-tts")
275
  async def qwen_tts(request: Request):
276
- if not DASHSCOPE_KEY:
277
- return Response("DASHSCOPE_API_KEY not set", status_code=503)
278
  body = await request.json()
279
  text = (body.get("text") or "").strip()
280
  instruct = (body.get("instruct") or "").strip()
 
281
  if not text:
282
  return Response("text required", status_code=400)
 
 
 
 
 
 
 
 
283
  wav, err = await asyncio.to_thread(_dashscope_voice_design, text, instruct)
284
  if err:
285
  return Response(err, status_code=502)
 
242
  _DASHSCOPE_BASE = os.environ.get("DASHSCOPE_BASE", "https://dashscope-intl.aliyuncs.com")
243
  _DASHSCOPE_URL = _DASHSCOPE_BASE + "/api/v1/services/audio/tts/customization"
244
 
245
+ # TINY_TTS_MODE=local → run the OPEN WEIGHTS in-process (your GPU, off the grid; same
246
+ # origin so no CORS/cert dance — the LeLab pattern). Needs `pip install qwen-tts torch
247
+ # soundfile`. Lazy-loaded; the Space (cpu-basic) leaves this unset and uses DashScope.
248
+ TTS_MODE = os.environ.get("TINY_TTS_MODE", "").strip().lower()
249
+ _local_tts = None
250
+ _local_tts_lock = threading.Lock()
251
+
252
+
253
+ def _local_voice_design(text, instruct, language="English"):
254
+ global _local_tts
255
+ import io
256
+ with _local_tts_lock: # one GPU model can't decode in parallel
257
+ if _local_tts is None:
258
+ import torch
259
+ from qwen_tts import Qwen3TTSModel
260
+ mid = os.environ.get("QWEN_TTS_MODEL", "Qwen/Qwen3-TTS-12Hz-1.7B-VoiceDesign")
261
+ dev = "cuda:0" if torch.cuda.is_available() else "cpu"
262
+ dt = torch.bfloat16 if dev != "cpu" else torch.float32
263
+ _local_tts = Qwen3TTSModel.from_pretrained(mid, device_map=dev, dtype=dt)
264
+ import soundfile as sf
265
+ wavs, sr = _local_tts.generate_voice_design(
266
+ text=text, language=language, instruct=instruct or "A clear, natural voice at a moderate pace.")
267
+ out = io.BytesIO()
268
+ sf.write(out, wavs[0], sr, format="WAV")
269
+ return out.getvalue()
270
+
271
 
272
  def _dashscope_voice_design(text, instruct):
273
  payload = _json.dumps({
 
299
 
300
  @fastapi_app.post("/qwen-tts")
301
  async def qwen_tts(request: Request):
 
 
302
  body = await request.json()
303
  text = (body.get("text") or "").strip()
304
  instruct = (body.get("instruct") or "").strip()
305
+ language = body.get("language") or "English"
306
  if not text:
307
  return Response("text required", status_code=400)
308
+ if TTS_MODE == "local": # in-process open weights (dev)
309
+ try:
310
+ wav = await asyncio.to_thread(_local_voice_design, text, instruct, language)
311
+ except Exception as e: # noqa: BLE001 — surface a clear setup hint
312
+ return Response(f"local TTS error (pip install qwen-tts torch soundfile?): {e}", status_code=500)
313
+ return Response(wav, media_type="audio/wav", headers={"Cache-Control": "no-store"})
314
+ if not DASHSCOPE_KEY:
315
+ return Response("DASHSCOPE_API_KEY not set (or run with TINY_TTS_MODE=local)", status_code=503)
316
  wav, err = await asyncio.to_thread(_dashscope_voice_design, text, instruct)
317
  if err:
318
  return Response(err, status_code=502)
web/tts.js CHANGED
@@ -3,14 +3,15 @@
3
  // reader that speaks sentence-by-sentence so a war diary can narrate itself while the
4
  // LLM is still writing. Panels + the TTS bar import only from here.
5
  import { engine as kokoro } from '/web/ttsKokoro.js'
6
- import { engine as qwen3 } from '/web/ttsQwen3.js'
7
  import { engine as kitten } from '/web/ttsKitten.js'
8
  import { engine as webspeech } from '/web/ttsWebSpeech.js'
9
  import { playSamples, stopAudio } from '/web/ttsAudio.js'
10
  import { ensurePersistentStorage } from '/web/storage.js'
11
 
12
- const ENGINES = [kokoro, qwen3, kitten, webspeech]
13
- let activeId = 'kokoro'
 
14
 
15
  // Qwen3-TTS designs a voice from a free-form description (the persona's `voice`).
16
  // Panels set it before narrating; previewVoice() plays a one-off sample.
@@ -26,7 +27,7 @@ const voiceSel = {} // engineId -> chosen voice id
26
  const eng = () => ENGINES.find((e) => e.id === activeId) || ENGINES[0]
27
 
28
  export const listTtsEngines = () =>
29
- ENGINES.map((e) => ({ id: e.id, label: e.label, available: e.available(), experimental: !!e.experimental }))
30
  export const getTtsEngineId = () => activeId
31
  export function setTtsEngine(id) { if (ENGINES.some((e) => e.id === id)) activeId = id }
32
 
 
3
  // reader that speaks sentence-by-sentence so a war diary can narrate itself while the
4
  // LLM is still writing. Panels + the TTS bar import only from here.
5
  import { engine as kokoro } from '/web/ttsKokoro.js'
6
+ import { engine as qwen3, engineLocal as qwen3local, isLocalhost } from '/web/ttsQwen3.js'
7
  import { engine as kitten } from '/web/ttsKitten.js'
8
  import { engine as webspeech } from '/web/ttsWebSpeech.js'
9
  import { playSamples, stopAudio } from '/web/ttsAudio.js'
10
  import { ensurePersistentStorage } from '/web/storage.js'
11
 
12
+ const ENGINES = [kokoro, qwen3local, qwen3, kitten, webspeech]
13
+ // On localhost the local-GPU Qwen3-TTS is the default; in prod it's Kokoro (local-first).
14
+ let activeId = isLocalhost() ? 'qwen3local' : 'kokoro'
15
 
16
  // Qwen3-TTS designs a voice from a free-form description (the persona's `voice`).
17
  // Panels set it before narrating; previewVoice() plays a one-off sample.
 
27
  const eng = () => ENGINES.find((e) => e.id === activeId) || ENGINES[0]
28
 
29
  export const listTtsEngines = () =>
30
+ ENGINES.map((e) => ({ id: e.id, label: e.label, available: e.available(), experimental: !!e.experimental, note: e.note || '' }))
31
  export const getTtsEngineId = () => activeId
32
  export function setTtsEngine(id) { if (ENGINES.some((e) => e.id === id)) activeId = id }
33
 
web/ttsBar.js CHANGED
@@ -35,7 +35,7 @@ export function mountTtsBar(host, { onChange } = {}) {
35
 
36
  engSel.replaceChildren(...listTtsEngines().map((e) =>
37
  el('option', { value: e.id, ...(e.available ? {} : { disabled: 'disabled' }) },
38
- `${e.label}${e.available ? '' : ' · n/a'}`)))
39
  engSel.value = getTtsEngineId()
40
 
41
  function renderVoices() {
 
35
 
36
  engSel.replaceChildren(...listTtsEngines().map((e) =>
37
  el('option', { value: e.id, ...(e.available ? {} : { disabled: 'disabled' }) },
38
+ `${e.label}${e.available ? '' : ' · ' + (e.note || 'n/a')}`)))
39
  engSel.value = getTtsEngineId()
40
 
41
  function renderVoices() {
web/ttsQwen3.js CHANGED
@@ -31,9 +31,14 @@ const VOICES = [
31
  ]
32
  const get = (id) => VOICES.find((v) => v.id === id) || VOICES[0]
33
 
34
- async function synth(text, voiceId) {
 
 
 
 
 
35
  const instruct = (get(voiceId).desc() || '').trim()
36
- const resp = await fetch(`${ttsBase()}/qwen-tts`, {
37
  method: 'POST', headers: { 'Content-Type': 'application/json' },
38
  body: JSON.stringify({ text, instruct, language: 'English' }),
39
  })
@@ -41,17 +46,31 @@ async function synth(text, voiceId) {
41
  return decodeAudio(await resp.arrayBuffer())
42
  }
43
 
 
 
 
 
 
 
 
 
44
  export const engine = {
 
45
  id: 'qwen3',
46
  label: 'Qwen3-TTS · Voice Design (cloud)',
47
- mode: 'pcm',
48
- needsDownload: false,
49
- networked: true,
50
  available: () => true,
51
- listVoices: () => VOICES,
52
- defaultVoice: 'persona',
53
- ensure: async () => { /* nothing to load — server-side */ },
54
- synth,
55
  backendLabel: () => { const b = ttsBase(); try { return b ? '🖥 ' + new URL(b).host : '☁ DashScope' } catch { return '☁ DashScope' } },
56
- setDesc(d) { _desc = (d || '').trim() },
 
 
 
 
 
 
 
 
 
 
 
57
  }
 
31
  ]
32
  const get = (id) => VOICES.find((v) => v.id === id) || VOICES[0]
33
 
34
+ export const isLocalhost = () => {
35
+ try { return /^(localhost|127\.0\.0\.1|\[?::1\]?|0\.0\.0\.0)$/i.test(location.hostname) } catch { return false }
36
+ }
37
+
38
+ // POST to `${base}/qwen-tts` → WAV → samples. base '' = same-origin.
39
+ async function postSynth(base, text, voiceId) {
40
  const instruct = (get(voiceId).desc() || '').trim()
41
+ const resp = await fetch(`${base}/qwen-tts`, {
42
  method: 'POST', headers: { 'Content-Type': 'application/json' },
43
  body: JSON.stringify({ text, instruct, language: 'English' }),
44
  })
 
46
  return decodeAudio(await resp.arrayBuffer())
47
  }
48
 
49
+ const common = {
50
+ mode: 'pcm', needsDownload: false, networked: true,
51
+ listVoices: () => VOICES, defaultVoice: 'persona',
52
+ ensure: async () => { /* nothing to load — server-side */ },
53
+ setDesc(d) { _desc = (d || '').trim() }, // shared _desc across both variants
54
+ }
55
+
56
+ // CLOUD: the hosted backend (/qwen-tts → DashScope). `?tts=` can still bridge it.
57
  export const engine = {
58
+ ...common,
59
  id: 'qwen3',
60
  label: 'Qwen3-TTS · Voice Design (cloud)',
 
 
 
61
  available: () => true,
62
+ synth: (text, voiceId) => postSynth(ttsBase(), text, voiceId),
 
 
 
63
  backendLabel: () => { const b = ttsBase(); try { return b ? '🖥 ' + new URL(b).host : '☁ DashScope' } catch { return '☁ DashScope' } },
64
+ }
65
+
66
+ // LOCAL: same-origin /qwen-tts on a locally-run app.py (TINY_TTS_MODE=local → the open
67
+ // weights on your GPU). Only offered on localhost; disabled with a note in prod.
68
+ export const engineLocal = {
69
+ ...common,
70
+ id: 'qwen3local',
71
+ label: 'Qwen3-TTS · local (your GPU)',
72
+ available: () => isLocalhost(),
73
+ note: 'run the project locally',
74
+ synth: (text, voiceId) => postSynth('', text, voiceId),
75
+ backendLabel: () => '🖥 local model',
76
  }