Spaces:
Running
Running
Add VoxCPM ZeroGPU voice provider
Browse files- app.py +26 -0
- web/tts.js +12 -7
- web/ttsVoxcpm.js +41 -0
app.py
CHANGED
|
@@ -261,6 +261,7 @@ _DASHSCOPE_URL = _DASHSCOPE_BASE + "/api/v1/services/audio/tts/customization"
|
|
| 261 |
# origin so no CORS/cert dance — the LeLab pattern). Needs `pip install qwen-tts torch
|
| 262 |
# soundfile`. Lazy-loaded; the Space (cpu-basic) leaves this unset and uses DashScope.
|
| 263 |
TTS_MODE = os.environ.get("TINY_TTS_MODE", "").strip().lower()
|
|
|
|
| 264 |
_local_tts = None # VoiceDesign model
|
| 265 |
_local_clone = None # Base model (voice clone) — lazy, only if a clone is requested
|
| 266 |
_local_tts_lock = threading.Lock()
|
|
@@ -411,6 +412,31 @@ async def qwen_tts(request: Request):
|
|
| 411 |
return Response(wav, media_type="audio/wav", headers={"Cache-Control": "no-store"})
|
| 412 |
|
| 413 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 414 |
# ── Persona portraits (image generation) ─────────────────────────────────────
|
| 415 |
# Mirrors the voice path: TINY_IMAGE_MODE=local runs the OPEN WEIGHTS on your GPU
|
| 416 |
# (Z-Image-Turbo, 6B, ~12 GB bf16 — coexists with the TTS model on a 24 GB card);
|
|
|
|
| 261 |
# origin so no CORS/cert dance — the LeLab pattern). Needs `pip install qwen-tts torch
|
| 262 |
# soundfile`. Lazy-loaded; the Space (cpu-basic) leaves this unset and uses DashScope.
|
| 263 |
TTS_MODE = os.environ.get("TINY_TTS_MODE", "").strip().lower()
|
| 264 |
+
VOXCPM_SPACE = os.environ.get("TINY_VOXCPM_SPACE", "").strip()
|
| 265 |
_local_tts = None # VoiceDesign model
|
| 266 |
_local_clone = None # Base model (voice clone) — lazy, only if a clone is requested
|
| 267 |
_local_tts_lock = threading.Lock()
|
|
|
|
| 412 |
return Response(wav, media_type="audio/wav", headers={"Cache-Control": "no-store"})
|
| 413 |
|
| 414 |
|
| 415 |
+
def _voxcpm_tts(text, instruct):
    """Synthesize `text` through the VoxCPM Space and return the audio file's bytes.

    Blocking call. `instruct` is the free-form voice description; when it is
    empty a generic description is sent instead. Any exception raised by the
    client or by reading the result file propagates to the caller.
    """
    # Function-scope import: gradio_client is loaded on first call rather than
    # when this module is imported.
    from gradio_client import Client

    fallback_instruct = "A clear, natural voice at a moderate pace."
    space = Client(VOXCPM_SPACE, token=HF_TOKEN or None)
    output = space.predict(text, instruct or fallback_instruct, api_name="/synthesize")

    # The result is either a single path or a sequence whose first item is the path.
    if isinstance(output, (tuple, list)):
        audio_path = output[0]
    else:
        audio_path = output

    with open(os.fspath(audio_path), "rb") as audio_file:
        return audio_file.read()
|
| 422 |
+
|
| 423 |
+
|
| 424 |
+
@fastapi_app.post("/voxcpm-tts")
async def voxcpm_tts(request: Request):
    """Proxy a text-to-speech request to the configured VoxCPM Space.

    Expects a JSON object with a required string `text` and an optional string
    `instruct` (the voice description). Responds with:

    * 200 and `audio/wav` bytes on success,
    * 400 when the body is not a JSON object or `text` is missing/blank,
    * 503 when `TINY_VOXCPM_SPACE` is not configured,
    * 502 when the upstream synthesis call fails.
    """
    # A malformed body is a client error; without this guard the decode error
    # would escape the handler and surface as a 500.
    try:
        body = await request.json()
    except ValueError:  # json.JSONDecodeError and UnicodeDecodeError both derive from it
        return Response("invalid JSON body", status_code=400)
    if not isinstance(body, dict):
        return Response("JSON object required", status_code=400)

    # Non-string values (numbers, lists, null) are treated as absent rather
    # than crashing on `.strip()`.
    raw_text = body.get("text")
    raw_instruct = body.get("instruct")
    text = raw_text.strip() if isinstance(raw_text, str) else ""
    instruct = raw_instruct.strip() if isinstance(raw_instruct, str) else ""

    if not text:
        return Response("text required", status_code=400)
    if not VOXCPM_SPACE:
        return Response("TINY_VOXCPM_SPACE not set", status_code=503)

    try:
        # The synthesis helper blocks on network I/O, so run it off the event loop.
        wav = await asyncio.to_thread(_voxcpm_tts, text, instruct)
    except Exception as e:  # noqa: BLE001
        return Response(f"VoxCPM error: {e}", status_code=502)
    return Response(wav, media_type="audio/wav", headers={"Cache-Control": "no-store"})
|
| 438 |
+
|
| 439 |
+
|
| 440 |
# ── Persona portraits (image generation) ─────────────────────────────────────
|
| 441 |
# Mirrors the voice path: TINY_IMAGE_MODE=local runs the OPEN WEIGHTS on your GPU
|
| 442 |
# (Z-Image-Turbo, 6B, ~12 GB bf16 — coexists with the TTS model on a 24 GB card);
|
web/tts.js
CHANGED
|
@@ -4,12 +4,13 @@
|
|
| 4 |
// LLM is still writing. Panels + the TTS bar import only from here.
|
| 5 |
import { engine as kokoro } from '/web/ttsKokoro.js'
|
| 6 |
import { engine as qwen3, engineLocal as qwen3local, isLocalhost } from '/web/ttsQwen3.js'
|
|
|
|
| 7 |
import { engine as kitten } from '/web/ttsKitten.js'
|
| 8 |
import { engine as webspeech } from '/web/ttsWebSpeech.js'
|
| 9 |
import { playSamples, stopAudio, decodeAudio, encodeWav } from '/web/ttsAudio.js'
|
| 10 |
import { ensurePersistentStorage } from '/web/storage.js'
|
| 11 |
|
| 12 |
-
const ENGINES = [kokoro, qwen3local, qwen3, kitten, webspeech]
|
| 13 |
// Default voice provider: local-GPU Qwen3-TTS on localhost (your GPU designs voices),
|
| 14 |
// in-browser Kokoro in prod (runs on the device — no exhaustible cloud quota). Cloud
|
| 15 |
// Qwen3-TTS and the others remain selectable in Settings. Persisted across refreshes.
|
|
@@ -23,10 +24,11 @@ let activeId = (() => {
|
|
| 23 |
|
| 24 |
// Qwen3-TTS designs a voice from a free-form description (the persona's `voice`).
|
| 25 |
// Panels set it before narrating; previewVoice() plays a one-off sample.
|
| 26 |
-
export function setVoiceDescription(desc) { qwen3.setDesc(desc) }
|
| 27 |
export async function previewVoice(desc, text) {
|
| 28 |
-
|
| 29 |
-
|
|
|
|
| 30 |
return playSamples(audio, sampleRate)
|
| 31 |
}
|
| 32 |
export const stopPreview = () => stopAudio()
|
|
@@ -34,13 +36,16 @@ export const stopPreview = () => stopAudio()
|
|
| 34 |
// Create a persona's voice FILE: synth the line in the designed voice and return the
|
| 35 |
// raw WAV (ArrayBuffer) so it can be cached + replayed verbatim. Caller plays it.
|
| 36 |
export async function createVoiceWav(desc, text) {
|
| 37 |
-
|
| 38 |
-
|
|
|
|
| 39 |
}
|
| 40 |
// Clone `text` from a reference voice file (keep timbre, change words). `desc` is the
|
| 41 |
// voice design — a fallback so prod (no clone model) can re-design instead. Returns WAV.
|
| 42 |
export async function cloneVoiceWav(refArrayBuffer, refText, text, desc) {
|
| 43 |
-
|
|
|
|
|
|
|
| 44 |
}
|
| 45 |
export async function playWav(arrayBuffer) {
|
| 46 |
const { audio, sampleRate } = await decodeAudio(arrayBuffer)
|
|
|
|
| 4 |
// LLM is still writing. Panels + the TTS bar import only from here.
|
| 5 |
import { engine as kokoro } from '/web/ttsKokoro.js'
|
| 6 |
import { engine as qwen3, engineLocal as qwen3local, isLocalhost } from '/web/ttsQwen3.js'
|
| 7 |
+
import { engine as voxcpm } from '/web/ttsVoxcpm.js'
|
| 8 |
import { engine as kitten } from '/web/ttsKitten.js'
|
| 9 |
import { engine as webspeech } from '/web/ttsWebSpeech.js'
|
| 10 |
import { playSamples, stopAudio, decodeAudio, encodeWav } from '/web/ttsAudio.js'
|
| 11 |
import { ensurePersistentStorage } from '/web/storage.js'
|
| 12 |
|
| 13 |
+
const ENGINES = [kokoro, qwen3local, qwen3, voxcpm, kitten, webspeech]
|
| 14 |
// Default voice provider: local-GPU Qwen3-TTS on localhost (your GPU designs voices),
|
| 15 |
// in-browser Kokoro in prod (runs on the device — no exhaustible cloud quota). Cloud
|
| 16 |
// Qwen3-TTS and the others remain selectable in Settings. Persisted across refreshes.
|
|
|
|
| 24 |
|
| 25 |
// Qwen3-TTS and VoxCPM design a voice from a free-form description (the persona's `voice`).
|
| 26 |
// Panels set it before narrating; previewVoice() plays a one-off sample.
|
| 27 |
+
// Hand the persona's voice description to each voice-design engine, in order.
export function setVoiceDescription(desc) {
  for (const designer of [qwen3, voxcpm]) designer.setDesc(desc)
}
|
| 28 |
// Play a one-off sample of `text` on the active engine using the 'persona' voice.
export async function previewVoice(desc, text) {
  const active = eng()
  // Only engines that expose setDesc take a voice description.
  if (active.setDesc) active.setDesc(desc)
  const clip = await active.synth(text, 'persona')
  return playSamples(clip.audio, clip.sampleRate)
}
|
| 34 |
// Stop whatever preview audio is currently playing.
export function stopPreview() {
  return stopAudio()
}
|
|
|
|
| 36 |
// Create a persona's voice FILE: synth the line in the designed voice and return the
|
| 37 |
// raw WAV (ArrayBuffer) so it can be cached + replayed verbatim. Caller plays it.
|
| 38 |
export async function createVoiceWav(desc, text) {
  const active = eng()
  // Only engines that expose setDesc take a voice description.
  if (active.setDesc) active.setDesc(desc)
  // NOTE(review): assumes the active engine implements synthWav — confirm for every engine.
  const wav = await active.synthWav(text, 'persona')
  return wav
}
|
| 43 |
// Clone `text` from a reference voice file (keep timbre, change words). `desc` is the
|
| 44 |
// voice design — a fallback so prod (no clone model) can re-design instead. Returns WAV.
|
| 45 |
export async function cloneVoiceWav(refArrayBuffer, refText, text, desc) {
  const active = eng()
  // Only engines that expose setDesc take a voice description.
  if (active.setDesc) active.setDesc(desc)
  // NOTE(review): assumes the active engine implements cloneWav — confirm for every engine.
  const wav = await active.cloneWav(text, refArrayBuffer, refText, desc)
  return wav
}
|
| 50 |
export async function playWav(arrayBuffer) {
|
| 51 |
const { audio, sampleRate } = await decodeAudio(arrayBuffer)
|
web/ttsVoxcpm.js
ADDED
|
@@ -0,0 +1,41 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
// TTS engine: VoxCPM2 via a ZeroGPU sidecar, proxied through /voxcpm-tts so
|
| 2 |
+
// tokens and sidecar details stay server-side. Like Qwen3, this designs a voice
|
| 3 |
+
// from each hero's free-form voice description.
|
| 4 |
+
import { decodeAudio } from '/web/ttsAudio.js'
|
| 5 |
+
|
| 6 |
+
// Most recent persona description pushed in through engine.setDesc(); the
// 'persona' voice reads it lazily at request time.
let _desc = ''

// Selectable voices. Each `desc` is a thunk so 'persona' always reflects the
// current value of _desc rather than the value at module load.
const VOICES = [
  { id: 'persona', label: '✨ Persona voice (designed)', desc: () => _desc },
  { id: 'veteran', label: 'Gruff veteran', desc: () => 'A gravelly, battle-worn male baritone - slow, deliberate, weary, with a wry edge.' },
  { id: 'herald', label: 'Bright herald', desc: () => 'A clear, bright young male voice - brisk, energetic, projecting and confident.' },
  { id: 'medic', label: 'Steady medic', desc: () => 'A calm, warm female voice - measured pace, clear articulation, reassuring.' },
  { id: 'rogue', label: 'Sly rogue', desc: () => 'A low, smooth voice with a sly, amused lilt - unhurried, with a dangerous edge.' },
]

// Look a voice up by id; unknown ids fall back to the first entry ('persona').
const get = (id) => VOICES.find((v) => v.id === id) || VOICES[0]

// POST `text` to the /voxcpm-tts proxy and resolve with the raw WAV ArrayBuffer.
// `instructOverride` (optional) replaces the selected voice's description when it
// is a non-blank string; otherwise the description comes from `voiceId`.
// Throws an Error carrying the status and the first 140 chars of the response
// body when the server answers with a non-2xx status.
async function postSynthWav(text, voiceId, instructOverride) {
  const override = typeof instructOverride === 'string' ? instructOverride.trim() : ''
  const instruct = override || (get(voiceId).desc() || '').trim()
  const resp = await fetch('/voxcpm-tts', {
    method: 'POST', headers: { 'Content-Type': 'application/json' },
    body: JSON.stringify({ text, instruct, language: 'English' }),
  })
  if (!resp.ok) throw new Error(`VoxCPM ${resp.status}: ${(await resp.text()).slice(0, 140)}`)
  return resp.arrayBuffer()
}

// Same request as postSynthWav, with the WAV passed through decodeAudio.
const postSynth = async (text, voiceId) => decodeAudio(await postSynthWav(text, voiceId))

export const engine = {
  mode: 'pcm', needsDownload: false, networked: true, design: true,
  id: 'voxcpm',
  label: 'VoxCPM2 · Voice Design (ZeroGPU)',
  experimental: true,
  available: () => true,
  listVoices: () => VOICES,
  defaultVoice: 'persona',
  ensure: async () => {},
  setDesc(d) { _desc = (d || '').trim() },
  synth: (text, voiceId) => postSynth(text, voiceId),
  synthWav: (text, voiceId) => postSynthWav(text, voiceId),
  // The reference audio and its transcript are not sent; the line is synthesized
  // from a description instead. The caller's `instruct` is honoured when given
  // and falls back to the last setDesc() value when blank, so callers that
  // skip setDesc() no longer get a stale or empty description.
  cloneWav: (text, refAb, refText, instruct) => postSynthWav(text, 'persona', instruct),
  backendLabel: () => 'ZeroGPU VoxCPM2',
}
|