Spaces:
Running
Running
Voice settings: add "Qwen3-TTS local" engine — localhost-only, default on localhost
Browse files
- New TTS engine 'qwen3local' (Qwen3-TTS · local (your GPU)) that POSTs to same-origin
/qwen-tts. available() = isLocalhost(), so it's enabled on localhost and shown
DISABLED with a "run the project locally" note in prod.
- Default TTS engine is now qwen3local on localhost, Kokoro in prod (local-first).
- ttsBar shows each disabled engine's note instead of "n/a".
- app.py: TINY_TTS_MODE=local runs the open weights in-process (lazy qwen-tts, GPU),
so the local engine's same-origin /qwen-tts serves the model — the LeLab pattern
(one origin, no CORS/cert). Unset on the Space → DashScope as before.
Co-Authored-By: Claude Opus 4.8 (1M context) <noreply@anthropic.com>
- app.py +35 -2
- web/tts.js +5 -4
- web/ttsBar.js +1 -1
- web/ttsQwen3.js +29 -10
app.py
CHANGED
|
@@ -242,6 +242,32 @@ DASHSCOPE_KEY = os.environ.get("DASHSCOPE_API_KEY", "")
|
|
| 242 |
_DASHSCOPE_BASE = os.environ.get("DASHSCOPE_BASE", "https://dashscope-intl.aliyuncs.com")
|
| 243 |
_DASHSCOPE_URL = _DASHSCOPE_BASE + "/api/v1/services/audio/tts/customization"
|
| 244 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 245 |
|
| 246 |
def _dashscope_voice_design(text, instruct):
|
| 247 |
payload = _json.dumps({
|
|
@@ -273,13 +299,20 @@ def _dashscope_voice_design(text, instruct):
|
|
| 273 |
|
| 274 |
@fastapi_app.post("/qwen-tts")
|
| 275 |
async def qwen_tts(request: Request):
|
| 276 |
-
if not DASHSCOPE_KEY:
|
| 277 |
-
return Response("DASHSCOPE_API_KEY not set", status_code=503)
|
| 278 |
body = await request.json()
|
| 279 |
text = (body.get("text") or "").strip()
|
| 280 |
instruct = (body.get("instruct") or "").strip()
|
|
|
|
| 281 |
if not text:
|
| 282 |
return Response("text required", status_code=400)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 283 |
wav, err = await asyncio.to_thread(_dashscope_voice_design, text, instruct)
|
| 284 |
if err:
|
| 285 |
return Response(err, status_code=502)
|
|
|
|
| 242 |
_DASHSCOPE_BASE = os.environ.get("DASHSCOPE_BASE", "https://dashscope-intl.aliyuncs.com")
|
| 243 |
_DASHSCOPE_URL = _DASHSCOPE_BASE + "/api/v1/services/audio/tts/customization"
|
| 244 |
|
| 245 |
+
# TINY_TTS_MODE=local switches /qwen-tts to the open weights, run in-process on the
# local GPU and served from the same origin (so no CORS/cert setup — the LeLab pattern).
# Requires `pip install qwen-tts torch soundfile`. The model is loaded lazily on first
# use; the hosted Space (cpu-basic) leaves the variable unset and keeps using DashScope.
TTS_MODE = os.environ.get("TINY_TTS_MODE", "").strip().lower()
_local_tts = None  # cached model instance, created on the first local synthesis
_local_tts_lock = threading.Lock()  # guards both the lazy load and every generate call


def _local_voice_design(text, instruct, language="English"):
    """Synthesize ``text`` with the in-process Qwen3-TTS voice-design model.

    Args:
        text: The text to speak.
        instruct: Free-form voice description; when empty a neutral default
            description is used instead.
        language: Language name forwarded to the model (default "English").

    Returns:
        The generated audio encoded as WAV, as ``bytes``.

    Any exception from the lazy imports, the model load, or generation
    propagates to the caller unchanged.
    """
    global _local_tts
    import io

    voice_prompt = instruct if instruct else "A clear, natural voice at a moderate pace."
    with _local_tts_lock:  # one GPU model can't decode in parallel
        if _local_tts is None:
            # Heavy imports are deferred so the module still loads without them.
            import torch
            from qwen_tts import Qwen3TTSModel

            model_id = os.environ.get("QWEN_TTS_MODEL", "Qwen/Qwen3-TTS-12Hz-1.7B-VoiceDesign")
            if torch.cuda.is_available():
                device, dtype = "cuda:0", torch.bfloat16
            else:
                device, dtype = "cpu", torch.float32
            _local_tts = Qwen3TTSModel.from_pretrained(model_id, device_map=device, dtype=dtype)
        import soundfile as sf

        wavs, sr = _local_tts.generate_voice_design(
            text=text, language=language, instruct=voice_prompt)
        # Only the first returned waveform is encoded and sent back.
        buffer = io.BytesIO()
        sf.write(buffer, wavs[0], sr, format="WAV")
        return buffer.getvalue()
|
| 270 |
+
|
| 271 |
|
| 272 |
def _dashscope_voice_design(text, instruct):
|
| 273 |
payload = _json.dumps({
|
|
|
|
| 299 |
|
| 300 |
@fastapi_app.post("/qwen-tts")
|
| 301 |
async def qwen_tts(request: Request):
|
|
|
|
|
|
|
| 302 |
body = await request.json()
|
| 303 |
text = (body.get("text") or "").strip()
|
| 304 |
instruct = (body.get("instruct") or "").strip()
|
| 305 |
+
language = body.get("language") or "English"
|
| 306 |
if not text:
|
| 307 |
return Response("text required", status_code=400)
|
| 308 |
+
if TTS_MODE == "local": # in-process open weights (dev)
|
| 309 |
+
try:
|
| 310 |
+
wav = await asyncio.to_thread(_local_voice_design, text, instruct, language)
|
| 311 |
+
except Exception as e: # noqa: BLE001 — surface a clear setup hint
|
| 312 |
+
return Response(f"local TTS error (pip install qwen-tts torch soundfile?): {e}", status_code=500)
|
| 313 |
+
return Response(wav, media_type="audio/wav", headers={"Cache-Control": "no-store"})
|
| 314 |
+
if not DASHSCOPE_KEY:
|
| 315 |
+
return Response("DASHSCOPE_API_KEY not set (or run with TINY_TTS_MODE=local)", status_code=503)
|
| 316 |
wav, err = await asyncio.to_thread(_dashscope_voice_design, text, instruct)
|
| 317 |
if err:
|
| 318 |
return Response(err, status_code=502)
|
web/tts.js
CHANGED
|
@@ -3,14 +3,15 @@
|
|
| 3 |
// reader that speaks sentence-by-sentence so a war diary can narrate itself while the
|
| 4 |
// LLM is still writing. Panels + the TTS bar import only from here.
|
| 5 |
import { engine as kokoro } from '/web/ttsKokoro.js'
|
| 6 |
-
import { engine as qwen3 } from '/web/ttsQwen3.js'
|
| 7 |
import { engine as kitten } from '/web/ttsKitten.js'
|
| 8 |
import { engine as webspeech } from '/web/ttsWebSpeech.js'
|
| 9 |
import { playSamples, stopAudio } from '/web/ttsAudio.js'
|
| 10 |
import { ensurePersistentStorage } from '/web/storage.js'
|
| 11 |
|
| 12 |
-
const ENGINES = [kokoro, qwen3, kitten, webspeech]
|
| 13 |
-
|
|
|
|
| 14 |
|
| 15 |
// Qwen3-TTS designs a voice from a free-form description (the persona's `voice`).
|
| 16 |
// Panels set it before narrating; previewVoice() plays a one-off sample.
|
|
@@ -26,7 +27,7 @@ const voiceSel = {} // engineId -> chosen voice id
|
|
| 26 |
const eng = () => ENGINES.find((e) => e.id === activeId) || ENGINES[0]
|
| 27 |
|
| 28 |
export const listTtsEngines = () =>
|
| 29 |
-
ENGINES.map((e) => ({ id: e.id, label: e.label, available: e.available(), experimental: !!e.experimental }))
|
| 30 |
export const getTtsEngineId = () => activeId
|
| 31 |
export function setTtsEngine(id) { if (ENGINES.some((e) => e.id === id)) activeId = id }
|
| 32 |
|
|
|
|
| 3 |
// reader that speaks sentence-by-sentence so a war diary can narrate itself while the
|
| 4 |
// LLM is still writing. Panels + the TTS bar import only from here.
|
| 5 |
import { engine as kokoro } from '/web/ttsKokoro.js'
|
| 6 |
+
import { engine as qwen3, engineLocal as qwen3local, isLocalhost } from '/web/ttsQwen3.js'
|
| 7 |
import { engine as kitten } from '/web/ttsKitten.js'
|
| 8 |
import { engine as webspeech } from '/web/ttsWebSpeech.js'
|
| 9 |
import { playSamples, stopAudio } from '/web/ttsAudio.js'
|
| 10 |
import { ensurePersistentStorage } from '/web/storage.js'
|
| 11 |
|
| 12 |
+
// Registry of every TTS engine, in the order the TTS bar lists them. The first entry
// is also the fallback when the active id matches nothing.
const ENGINES = [
  kokoro,
  qwen3local,
  qwen3,
  kitten,
  webspeech,
]
// Default engine: Kokoro (local-first) in prod, the local-GPU Qwen3-TTS on localhost.
let activeId = 'kokoro'
if (isLocalhost()) activeId = 'qwen3local'
|
| 15 |
|
| 16 |
// Qwen3-TTS designs a voice from a free-form description (the persona's `voice`).
|
| 17 |
// Panels set it before narrating; previewVoice() plays a one-off sample.
|
|
|
|
| 27 |
// Resolve the active engine object; falls back to the first registered engine when
// the active id matches none of them.
const eng = () => {
  const match = ENGINES.find((candidate) => candidate.id === activeId)
  return match || ENGINES[0]
}

// Plain-data summary of every engine for the UI: id, label, current availability,
// experimental flag, and the note shown next to a disabled engine ('' when none).
export const listTtsEngines = () =>
  ENGINES.map((candidate) => {
    return {
      id: candidate.id,
      label: candidate.label,
      available: candidate.available(),
      experimental: Boolean(candidate.experimental),
      note: candidate.note || '',
    }
  })

// Id of the currently selected engine.
export const getTtsEngineId = () => activeId

// Select an engine by id; unknown ids are ignored and the selection is unchanged.
export function setTtsEngine(id) {
  const known = ENGINES.some((candidate) => candidate.id === id)
  if (known) activeId = id
}
|
| 33 |
|
web/ttsBar.js
CHANGED
|
@@ -35,7 +35,7 @@ export function mountTtsBar(host, { onChange } = {}) {
|
|
| 35 |
|
| 36 |
engSel.replaceChildren(...listTtsEngines().map((e) =>
|
| 37 |
el('option', { value: e.id, ...(e.available ? {} : { disabled: 'disabled' }) },
|
| 38 |
-
`${e.label}${e.available ? '' : ' · n/a'}`)))
|
| 39 |
engSel.value = getTtsEngineId()
|
| 40 |
|
| 41 |
function renderVoices() {
|
|
|
|
| 35 |
|
| 36 |
engSel.replaceChildren(...listTtsEngines().map((e) =>
|
| 37 |
el('option', { value: e.id, ...(e.available ? {} : { disabled: 'disabled' }) },
|
| 38 |
+
`${e.label}${e.available ? '' : ' · ' + (e.note || 'n/a')}`)))
|
| 39 |
engSel.value = getTtsEngineId()
|
| 40 |
|
| 41 |
function renderVoices() {
|
web/ttsQwen3.js
CHANGED
|
@@ -31,9 +31,14 @@ const VOICES = [
|
|
| 31 |
]
|
| 32 |
const get = (id) => VOICES.find((v) => v.id === id) || VOICES[0]
|
| 33 |
|
| 34 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 35 |
const instruct = (get(voiceId).desc() || '').trim()
|
| 36 |
-
const resp = await fetch(`${
|
| 37 |
method: 'POST', headers: { 'Content-Type': 'application/json' },
|
| 38 |
body: JSON.stringify({ text, instruct, language: 'English' }),
|
| 39 |
})
|
|
@@ -41,17 +46,31 @@ async function synth(text, voiceId) {
|
|
| 41 |
return decodeAudio(await resp.arrayBuffer())
|
| 42 |
}
|
| 43 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 44 |
export const engine = {
|
|
|
|
| 45 |
id: 'qwen3',
|
| 46 |
label: 'Qwen3-TTS · Voice Design (cloud)',
|
| 47 |
-
mode: 'pcm',
|
| 48 |
-
needsDownload: false,
|
| 49 |
-
networked: true,
|
| 50 |
available: () => true,
|
| 51 |
-
|
| 52 |
-
defaultVoice: 'persona',
|
| 53 |
-
ensure: async () => { /* nothing to load — server-side */ },
|
| 54 |
-
synth,
|
| 55 |
backendLabel: () => { const b = ttsBase(); try { return b ? '🖥 ' + new URL(b).host : '☁ DashScope' } catch { return '☁ DashScope' } },
|
| 56 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 57 |
}
|
|
|
|
| 31 |
]
|
| 32 |
const get = (id) => VOICES.find((v) => v.id === id) || VOICES[0]
|
| 33 |
|
| 34 |
+
// Hostnames that count as "this machine":
//   - `localhost`, any `*.localhost` name, and the trailing-dot FQDN form
//   - the whole IPv4 loopback block 127.0.0.0/8 (not just 127.0.0.1)
//   - IPv6 loopback `::1`, with or without the brackets URL hostnames carry
//   - `0.0.0.0`, which dev servers commonly bind to
const LOOPBACK_HOST_RE =
  /^(?:(?:[a-z0-9-]+\.)*localhost\.?|127(?:\.\d{1,3}){3}|\[?::1\]?|0\.0\.0\.0)$/i

/**
 * True when the page is served from a loopback / local-dev hostname.
 *
 * Reads the global `location.hostname`. Returns false instead of throwing when
 * `location` is unavailable (e.g. outside a browser).
 */
export const isLocalhost = () => {
  try {
    return LOOPBACK_HOST_RE.test(String(location.hostname))
  } catch {
    return false
  }
}
|
| 37 |
+
|
| 38 |
+
// POST to `${base}/qwen-tts` → WAV → samples. base '' = same-origin.
|
| 39 |
+
async function postSynth(base, text, voiceId) {
|
| 40 |
const instruct = (get(voiceId).desc() || '').trim()
|
| 41 |
+
const resp = await fetch(`${base}/qwen-tts`, {
|
| 42 |
method: 'POST', headers: { 'Content-Type': 'application/json' },
|
| 43 |
body: JSON.stringify({ text, instruct, language: 'English' }),
|
| 44 |
})
|
|
|
|
| 46 |
return decodeAudio(await resp.arrayBuffer())
|
| 47 |
}
|
| 48 |
|
| 49 |
+
// Fields shared by the cloud and local Qwen3-TTS engine variants; each variant spreads
// this object and adds its own id, label, availability check, synth and backend label.
const common = {
  mode: 'pcm',
  needsDownload: false,
  networked: true,
  listVoices() { return VOICES },
  defaultVoice: 'persona',
  // Nothing to load on the client — synthesis happens server-side.
  async ensure() {},
  // Stores the trimmed voice description in the module-level `_desc`, which both
  // variants share.
  setDesc(d) {
    const text = d ? d : ''
    _desc = text.trim()
  },
}
|
| 55 |
+
|
| 56 |
+
// CLOUD: the hosted backend (/qwen-tts → DashScope). `?tts=` can still bridge it.
|
| 57 |
// Cloud variant: posts to `${ttsBase()}/qwen-tts` (the hosted backend → DashScope).
// `?tts=` can still bridge it.
export const engine = {
  ...common,
  id: 'qwen3',
  label: 'Qwen3-TTS · Voice Design (cloud)',
  available() { return true },
  synth(text, voiceId) {
    return postSynth(ttsBase(), text, voiceId)
  },
  // Shows the host of the configured TTS base, or the DashScope label when no base is
  // set or the base is not a parseable URL.
  backendLabel() {
    const base = ttsBase()
    if (!base) return '☁ DashScope'
    try {
      return '🖥 ' + new URL(base).host
    } catch {
      return '☁ DashScope'
    }
  },
}
|
| 65 |
+
|
| 66 |
+
// LOCAL: same-origin /qwen-tts on a locally-run app.py (TINY_TTS_MODE=local → the open
|
| 67 |
+
// weights on your GPU). Only offered on localhost; disabled with a note in prod.
|
| 68 |
+
// Local variant: posts to the same-origin /qwen-tts of a locally-run app.py
// (TINY_TTS_MODE=local → open weights on your GPU). Available only on localhost;
// elsewhere it is listed as disabled with `note` as the explanation.
export const engineLocal = {
  ...common,
  id: 'qwen3local',
  label: 'Qwen3-TTS · local (your GPU)',
  available() { return isLocalhost() },
  note: 'run the project locally',
  // Empty base → request goes to the page's own origin.
  synth(text, voiceId) {
    return postSynth('', text, voiceId)
  },
  backendLabel() { return '🖥 local model' },
}
|