polats committed on
Commit
3aafe4e
·
1 Parent(s): 8e1c8c5

Add VoxCPM ZeroGPU voice provider

Browse files
Files changed (3) hide show
  1. app.py +26 -0
  2. web/tts.js +12 -7
  3. web/ttsVoxcpm.js +41 -0
app.py CHANGED
@@ -261,6 +261,7 @@ _DASHSCOPE_URL = _DASHSCOPE_BASE + "/api/v1/services/audio/tts/customization"
261
  # origin so no CORS/cert dance — the LeLab pattern). Needs `pip install qwen-tts torch
262
  # soundfile`. Lazy-loaded; the Space (cpu-basic) leaves this unset and uses DashScope.
263
  TTS_MODE = os.environ.get("TINY_TTS_MODE", "").strip().lower()
 
264
  _local_tts = None # VoiceDesign model
265
  _local_clone = None # Base model (voice clone) — lazy, only if a clone is requested
266
  _local_tts_lock = threading.Lock()
@@ -411,6 +412,31 @@ async def qwen_tts(request: Request):
411
  return Response(wav, media_type="audio/wav", headers={"Cache-Control": "no-store"})
412
 
413
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
414
  # ── Persona portraits (image generation) ─────────────────────────────────────
415
  # Mirrors the voice path: TINY_IMAGE_MODE=local runs the OPEN WEIGHTS on your GPU
416
  # (Z-Image-Turbo, 6B, ~12 GB bf16 — coexists with the TTS model on a 24 GB card);
 
261
  # origin so no CORS/cert dance — the LeLab pattern). Needs `pip install qwen-tts torch
262
  # soundfile`. Lazy-loaded; the Space (cpu-basic) leaves this unset and uses DashScope.
263
  TTS_MODE = os.environ.get("TINY_TTS_MODE", "").strip().lower()
264
+ VOXCPM_SPACE = os.environ.get("TINY_VOXCPM_SPACE", "").strip()
265
  _local_tts = None # VoiceDesign model
266
  _local_clone = None # Base model (voice clone) — lazy, only if a clone is requested
267
  _local_tts_lock = threading.Lock()
 
412
  return Response(wav, media_type="audio/wav", headers={"Cache-Control": "no-store"})
413
 
414
 
415
def _voxcpm_tts(text, instruct):
    """Call the configured VoxCPM Space and return the resulting file's bytes.

    Blocking: performs a remote ``/synthesize`` call and a file read.

    Args:
        text: The line to speak.
        instruct: Free-form voice description; a neutral default is sent when
            it is empty.

    Returns:
        The raw bytes of the file the Space call produced.
    """
    # Imported lazily inside the function, as in the original.
    from gradio_client import Client

    voice_prompt = instruct or "A clear, natural voice at a moderate pace."
    space = Client(VOXCPM_SPACE, token=HF_TOKEN or None)
    output = space.predict(text, voice_prompt, api_name="/synthesize")

    # The endpoint may hand back a sequence; the first item is taken as the
    # audio file path (presumably a local download — confirm against the Space).
    if isinstance(output, (tuple, list)):
        output = output[0]

    with open(os.fspath(output), "rb") as audio_file:
        return audio_file.read()
422
+
423
+
424
@fastapi_app.post("/voxcpm-tts")
async def voxcpm_tts(request: Request):
    """Proxy a synthesis request to the VoxCPM Space.

    Expects a JSON object body with ``text`` (required) and ``instruct``
    (optional free-form voice description).

    Returns:
        200 with media type ``audio/wav`` on success;
        400 when the body is not a JSON object or ``text`` is missing/blank;
        503 when ``TINY_VOXCPM_SPACE`` is not configured;
        502 when the upstream call raises.
    """
    try:
        body = await request.json()
    except ValueError:
        # JSONDecodeError / UnicodeDecodeError are client mistakes: answer 400
        # instead of letting them surface as an unhandled 500.
        return Response("invalid JSON body", status_code=400)
    if not isinstance(body, dict):
        return Response("JSON object required", status_code=400)

    # Non-string values are treated like missing ones rather than crashing
    # on ``.strip()``.
    raw_text = body.get("text")
    text = raw_text.strip() if isinstance(raw_text, str) else ""
    raw_instruct = body.get("instruct")
    instruct = raw_instruct.strip() if isinstance(raw_instruct, str) else ""

    if not text:
        return Response("text required", status_code=400)
    if not VOXCPM_SPACE:
        return Response("TINY_VOXCPM_SPACE not set", status_code=503)
    try:
        # The client call is blocking, so run it off the event loop.
        wav = await asyncio.to_thread(_voxcpm_tts, text, instruct)
    except Exception as e:  # noqa: BLE001
        return Response(f"VoxCPM error: {e}", status_code=502)
    return Response(wav, media_type="audio/wav", headers={"Cache-Control": "no-store"})
438
+
439
+
440
  # ── Persona portraits (image generation) ─────────────────────────────────────
441
  # Mirrors the voice path: TINY_IMAGE_MODE=local runs the OPEN WEIGHTS on your GPU
442
  # (Z-Image-Turbo, 6B, ~12 GB bf16 — coexists with the TTS model on a 24 GB card);
web/tts.js CHANGED
@@ -4,12 +4,13 @@
4
  // LLM is still writing. Panels + the TTS bar import only from here.
5
  import { engine as kokoro } from '/web/ttsKokoro.js'
6
  import { engine as qwen3, engineLocal as qwen3local, isLocalhost } from '/web/ttsQwen3.js'
 
7
  import { engine as kitten } from '/web/ttsKitten.js'
8
  import { engine as webspeech } from '/web/ttsWebSpeech.js'
9
  import { playSamples, stopAudio, decodeAudio, encodeWav } from '/web/ttsAudio.js'
10
  import { ensurePersistentStorage } from '/web/storage.js'
11
 
12
- const ENGINES = [kokoro, qwen3local, qwen3, kitten, webspeech]
13
  // Default voice provider: local-GPU Qwen3-TTS on localhost (your GPU designs voices),
14
  // in-browser Kokoro in prod (runs on the device — no exhaustible cloud quota). Cloud
15
  // Qwen3-TTS and the others remain selectable in Settings. Persisted across refreshes.
@@ -23,10 +24,11 @@ let activeId = (() => {
23
 
24
  // Qwen3-TTS designs a voice from a free-form description (the persona's `voice`).
25
  // Panels set it before narrating; previewVoice() plays a one-off sample.
26
- export function setVoiceDescription(desc) { qwen3.setDesc(desc) }
27
  export async function previewVoice(desc, text) {
28
- qwen3.setDesc(desc)
29
- const { audio, sampleRate } = await qwen3.synth(text, 'persona')
 
30
  return playSamples(audio, sampleRate)
31
  }
32
  export const stopPreview = () => stopAudio()
@@ -34,13 +36,16 @@ export const stopPreview = () => stopAudio()
34
  // Create a persona's voice FILE: synth the line in the designed voice and return the
35
  // raw WAV (ArrayBuffer) so it can be cached + replayed verbatim. Caller plays it.
36
  export async function createVoiceWav(desc, text) {
37
- qwen3.setDesc(desc)
38
- return qwen3.synthWav(text, 'persona')
 
39
  }
40
  // Clone `text` from a reference voice file (keep timbre, change words). `desc` is the
41
  // voice design — a fallback so prod (no clone model) can re-design instead. Returns WAV.
42
  export async function cloneVoiceWav(refArrayBuffer, refText, text, desc) {
43
- return qwen3.cloneWav(text, refArrayBuffer, refText, desc)
 
 
44
  }
45
  export async function playWav(arrayBuffer) {
46
  const { audio, sampleRate } = await decodeAudio(arrayBuffer)
 
4
  // LLM is still writing. Panels + the TTS bar import only from here.
5
  import { engine as kokoro } from '/web/ttsKokoro.js'
6
  import { engine as qwen3, engineLocal as qwen3local, isLocalhost } from '/web/ttsQwen3.js'
7
+ import { engine as voxcpm } from '/web/ttsVoxcpm.js'
8
  import { engine as kitten } from '/web/ttsKitten.js'
9
  import { engine as webspeech } from '/web/ttsWebSpeech.js'
10
  import { playSamples, stopAudio, decodeAudio, encodeWav } from '/web/ttsAudio.js'
11
  import { ensurePersistentStorage } from '/web/storage.js'
12
 
13
+ const ENGINES = [kokoro, qwen3local, qwen3, voxcpm, kitten, webspeech]
14
  // Default voice provider: local-GPU Qwen3-TTS on localhost (your GPU designs voices),
15
  // in-browser Kokoro in prod (runs on the device — no exhaustible cloud quota). Cloud
16
  // Qwen3-TTS and the others remain selectable in Settings. Persisted across refreshes.
 
24
 
25
  // Qwen3-TTS designs a voice from a free-form description (the persona's `voice`).
26
  // Panels set it before narrating; previewVoice() plays a one-off sample.
27
// Hand the persona's voice description to both voice-design engines
// (Qwen3-TTS first, then VoxCPM).
export function setVoiceDescription(desc) {
  for (const designer of [qwen3, voxcpm]) designer.setDesc(desc)
}
28
  export async function previewVoice(desc, text) {
29
+ const e = eng()
30
+ if (e.setDesc) e.setDesc(desc)
31
+ const { audio, sampleRate } = await e.synth(text, 'persona')
32
  return playSamples(audio, sampleRate)
33
  }
34
  export const stopPreview = () => stopAudio()
 
36
  // Create a persona's voice FILE: synth the line in the designed voice and return the
37
  // raw WAV (ArrayBuffer) so it can be cached + replayed verbatim. Caller plays it.
38
  export async function createVoiceWav(desc, text) {
39
+ const e = eng()
40
+ if (e.setDesc) e.setDesc(desc)
41
+ return e.synthWav(text, 'persona')
42
  }
43
  // Clone `text` from a reference voice file (keep timbre, change words). `desc` is the
44
  // voice design — a fallback so prod (no clone model) can re-design instead. Returns WAV.
45
  export async function cloneVoiceWav(refArrayBuffer, refText, text, desc) {
46
+ const e = eng()
47
+ if (e.setDesc) e.setDesc(desc)
48
+ return e.cloneWav(text, refArrayBuffer, refText, desc)
49
  }
50
  export async function playWav(arrayBuffer) {
51
  const { audio, sampleRate } = await decodeAudio(arrayBuffer)
web/ttsVoxcpm.js ADDED
@@ -0,0 +1,41 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ // TTS engine: VoxCPM2 via a ZeroGPU sidecar, proxied through /voxcpm-tts so
2
+ // tokens and sidecar details stay server-side. Like Qwen3, this designs a voice
3
+ // from each hero's free-form voice description.
4
+ import { decodeAudio } from '/web/ttsAudio.js'
5
+
6
// Current persona description; read lazily by the 'persona' voice below and
// updated through engine.setDesc.
let _desc = ''

// Build a preset entry whose description is a fixed string.
const fixedVoice = (id, label, text) => ({ id, label, desc: () => text })

// Selectable voices. 'persona' resolves its description at call time so it
// always reflects the latest _desc; the rest are canned descriptions.
const VOICES = [
  { id: 'persona', label: '✨ Persona voice (designed)', desc: () => _desc },
  fixedVoice('veteran', 'Gruff veteran', 'A gravelly, battle-worn male baritone - slow, deliberate, weary, with a wry edge.'),
  fixedVoice('herald', 'Bright herald', 'A clear, bright young male voice - brisk, energetic, projecting and confident.'),
  fixedVoice('medic', 'Steady medic', 'A calm, warm female voice - measured pace, clear articulation, reassuring.'),
  fixedVoice('rogue', 'Sly rogue', 'A low, smooth voice with a sly, amused lilt - unhurried, with a dangerous edge.'),
]

// Look a voice up by id; unknown ids fall back to the first entry ('persona').
const get = (id) => {
  const match = VOICES.find((voice) => voice.id === id)
  return match || VOICES[0]
}
15
+
16
// POST `text` plus the chosen voice's description to /voxcpm-tts and resolve
// to the response body as an ArrayBuffer. Throws an Error carrying the HTTP
// status and the first 140 characters of the response text on a non-OK reply.
async function postSynthWav(text, voiceId) {
  const voice = get(voiceId)
  const instruct = (voice.desc() || '').trim()
  const payload = JSON.stringify({ text, instruct, language: 'English' })

  const resp = await fetch('/voxcpm-tts', {
    method: 'POST',
    headers: { 'Content-Type': 'application/json' },
    body: payload,
  })

  if (resp.ok) return resp.arrayBuffer()

  const detail = (await resp.text()).slice(0, 140)
  throw new Error(`VoxCPM ${resp.status}: ${detail}`)
}
25
// Same request as postSynthWav, but the returned bytes are passed through
// decodeAudio before being handed back.
const postSynth = async (text, voiceId) => {
  const wavBytes = await postSynthWav(text, voiceId)
  return decodeAudio(wavBytes)
}
26
+
27
// Engine descriptor for VoxCPM2, served through the /voxcpm-tts endpoint.
export const engine = {
  // Capability flags.
  mode: 'pcm',
  needsDownload: false,
  networked: true,
  design: true,

  // Identity.
  id: 'voxcpm',
  label: 'VoxCPM2 · Voice Design (ZeroGPU)',
  experimental: true,

  available() { return true },
  listVoices() { return VOICES },
  defaultVoice: 'persona',

  // Nothing to load or warm up on the client.
  async ensure() {},

  // Store the persona description used by the 'persona' voice.
  setDesc(d) { _desc = (d || '').trim() },

  // Decoded audio for playback.
  synth(text, voiceId) { return postSynth(text, voiceId) },
  // Raw response bytes.
  synthWav(text, voiceId) { return postSynthWav(text, voiceId) },
  // The reference audio, its transcript and `instruct` are not used here:
  // the line is synthesized with the current persona description instead.
  cloneWav(text, refAb, refText, instruct) { return postSynthWav(text, 'persona') },

  backendLabel() { return 'ZeroGPU VoxCPM2' },
}