Spaces:
Running
Running
| """Local Qwen3-TTS Voice Design server — the LeLab-style bridge. | |
| Runs the OPEN WEIGHTS on YOUR machine's GPU; the hosted Tiny Army UI calls it via a | |
| `?tts=` override, so voices are designed locally and off the grid (no DashScope key/cost). | |
| Quick start (on a CUDA box; MPS/CPU work but are slow): | |
| pip install qwen-tts soundfile "fastapi[standard]" uvicorn torch | |
| python tts_server.py # serves http://localhost:8800/qwen-tts | |
| Then open the app pointed at this server: | |
| http://localhost:7860/?tts=http://localhost:8800 # local UI + local TTS | |
| https://tinyarmy.noods.cc/?tts=http://localhost:8800 # hosted UI + your GPU | |
| (browsers block https→http://localhost by default; run Chrome with | |
| --unsafely-treat-insecure-origin-as-secure=http://localhost:8800 or serve TLS) | |
| Smoke-test the bridge WITHOUT a GPU (returns a short tone instead of speech): | |
| QWEN_TTS_STUB=1 python tts_server.py | |
| Env: PORT (8800), QWEN_TTS_MODEL (Qwen/Qwen3-TTS-12Hz-1.7B-VoiceDesign), QWEN_TTS_STUB. | |
| """ | |
| import asyncio | |
| import io | |
| import math | |
| import os | |
| import struct | |
| from fastapi import FastAPI, Request | |
| from fastapi.responses import Response | |
| from fastapi.middleware.cors import CORSMiddleware | |
# --- Configuration, read once from the environment at import time ---
MODEL_ID = os.getenv("QWEN_TTS_MODEL", "Qwen/Qwen3-TTS-12Hz-1.7B-VoiceDesign")
# Stub mode is on for any value except these "off" spellings (case-sensitive).
STUB = os.getenv("QWEN_TTS_STUB", "") not in ("", "0", "false", "False")
PORT = int(os.getenv("PORT", "8800"))

app = FastAPI()

# The hosted UI is a different origin — allow it (and any localhost dev port).
app.add_middleware(
    CORSMiddleware,
    allow_origins=["*"],
    allow_methods=["*"],
    allow_headers=["*"],
    allow_credentials=False,
)

# Cached model handle; None until _load_model() has populated it.
_model = None
# Held by the request handler around synthesis so only one request drives the
# model at a time.
# NOTE(review): this lock is constructed at import time, before the server's
# event loop exists. Python versions older than 3.10 bind asyncio primitives
# to the loop current at construction — confirm the minimum supported Python.
_load_lock = asyncio.Lock()
def _load_model():
    """Return the shared Qwen3-TTS model, loading it on the first call.

    The instance is cached in the module-level ``_model``; once set, later
    calls return it without touching ``torch`` or ``qwen_tts`` (both are
    imported inside this function, not at module import).

    Device preference: CUDA (``cuda:0``, bfloat16), then Apple MPS (float32),
    then CPU (float32).
    """
    global _model
    if _model is None:
        import torch
        from qwen_tts import Qwen3TTSModel

        # Older torch builds have no ``backends.mps`` attribute at all.
        mps_backend = getattr(torch.backends, "mps", None)
        if torch.cuda.is_available():
            dev, dtype = "cuda:0", torch.bfloat16
        elif mps_backend and mps_backend.is_available():
            dev, dtype = "mps", torch.float32
        else:
            dev, dtype = "cpu", torch.float32

        print(f"[tts] loading {MODEL_ID} on {dev} ({dtype})…", flush=True)
        _model = Qwen3TTSModel.from_pretrained(MODEL_ID, device_map=dev, dtype=dtype)
        print("[tts] model ready", flush=True)
    return _model
def _stub_wav(text, sr=24000):
    """Build a mono 16-bit PCM WAV containing a 440 Hz (A4) sine tone.

    Used to prove the bridge end-to-end without loading the model. The tone
    lasts ``len(text) / 18`` seconds, clamped to the range 0.6–4.0 s, at a
    quarter of full-scale amplitude.

    Args:
        text: The text that would have been spoken; only its length is used.
        sr: Sample rate in Hz.

    Returns:
        The complete WAV file as ``bytes`` (44-byte header plus samples).
    """
    duration = min(4.0, max(0.6, len(text) / 18.0))
    frame_count = int(sr * duration)

    peak = 0.25 * 32767
    omega = 2 * math.pi * 440
    samples = [int(peak * math.sin(omega * i / sr)) for i in range(frame_count)]
    pcm = struct.pack(f"<{frame_count}h", *samples)

    # Canonical 44-byte RIFF/WAVE header: PCM (format tag 1), one channel,
    # 2 bytes per frame, 16 bits per sample.
    header = struct.pack(
        "<4sI4s4sIHHIIHH4sI",
        b"RIFF", 36 + len(pcm), b"WAVE",
        b"fmt ", 16, 1, 1, sr, sr * 2, 2, 16,
        b"data", len(pcm),
    )
    return header + pcm
def _synth(text, instruct, language):
    """Turn ``text`` into WAV bytes.

    In stub mode this returns the placeholder tone from ``_stub_wav`` and
    ignores ``instruct`` and ``language``. Otherwise it asks the model's
    ``generate_voice_design`` for audio and encodes the first returned
    waveform as WAV via ``soundfile``.

    Args:
        text: Text to speak.
        instruct: Voice-design instruction passed through to the model.
        language: Language passed through to the model.

    Returns:
        The encoded WAV file as ``bytes``.
    """
    if not STUB:
        import soundfile as sf

        model = _load_model()
        wavs, sr = model.generate_voice_design(text=text, language=language, instruct=instruct)
        sink = io.BytesIO()
        sf.write(sink, wavs[0], sr, format="WAV")
        return sink.getvalue()
    return _stub_wav(text)
# Register the handler on the app; without a route decorator it is never
# reachable over HTTP.
# NOTE(review): the path is inferred from the handler name — confirm it
# matches whatever the UI probes.
@app.get("/health")
def health():
    """Report server status as a JSON-serialisable dict.

    Returns:
        A dict with ``ok`` (always True), ``model`` (the configured model id),
        ``stub`` (whether stub mode is on) and ``loaded`` (whether the model
        has been loaded into memory yet).
    """
    return {"ok": True, "model": MODEL_ID, "stub": STUB, "loaded": _model is not None}
# Mount the handler at the path advertised in the module docstring and the
# startup banner; POST because the request carries a JSON body.
@app.post("/qwen-tts")
async def qwen_tts(request: Request):
    """Synthesise speech for a JSON request and return it as ``audio/wav``.

    Expected body: a JSON object with ``text`` (required), plus optional
    ``instruct`` (voice description) and ``language`` (defaults to
    ``"English"``). Missing or falsy optional fields fall back to defaults.

    Returns:
        200 with WAV bytes on success; 400 with a plain-text reason when the
        body is not valid JSON, is not a JSON object, carries a non-string
        field, or has no usable ``text``.
    """
    try:
        body = await request.json()
    except ValueError:  # JSONDecodeError and UnicodeDecodeError both derive from it
        return Response("invalid JSON body", status_code=400)
    if not isinstance(body, dict):
        return Response("JSON object required", status_code=400)

    # Reject non-string values up front instead of crashing on ``.strip()``.
    fields = {}
    for key in ("text", "instruct", "language"):
        value = body.get(key) or ""
        if not isinstance(value, str):
            return Response(f"{key} must be a string", status_code=400)
        fields[key] = value

    text = fields["text"].strip()
    instruct = fields["instruct"].strip() or "A clear, natural voice at a moderate pace."
    language = fields["language"] or "English"
    if not text:
        return Response("text required", status_code=400)

    if not STUB:
        async with _load_lock:  # one CPU/GPU model can't decode in parallel
            # Run the blocking synthesis off the event loop thread.
            wav = await asyncio.to_thread(_synth, text, instruct, language)
    else:
        wav = _synth(text, instruct, language)
    return Response(wav, media_type="audio/wav", headers={"Cache-Control": "no-store"})
if __name__ == "__main__":
    import uvicorn

    # The bind address is configurable via QWEN_TTS_HOST. The default keeps
    # the original all-interfaces behaviour; set QWEN_TTS_HOST=127.0.0.1 to
    # accept only local connections (the endpoints are unauthenticated and
    # CORS allows every origin).
    host = os.environ.get("QWEN_TTS_HOST", "0.0.0.0")
    print(f"[tts] http://localhost:{PORT}/qwen-tts (stub={STUB}, model={MODEL_ID})", flush=True)
    uvicorn.run(app, host=host, port=PORT)