polats committed on
Commit
8000b92
·
1 Parent(s): 3aafe4e

Add VoxCPM reference audio cloning

Browse files
Files changed (2) hide show
  1. app.py +35 -0
  2. web/ttsVoxcpm.js +15 -1
app.py CHANGED
@@ -421,6 +421,21 @@ def _voxcpm_tts(text, instruct):
421
  return f.read()
422
 
423
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
424
  @fastapi_app.post("/voxcpm-tts")
425
  async def voxcpm_tts(request: Request):
426
  body = await request.json()
@@ -437,6 +452,26 @@ async def voxcpm_tts(request: Request):
437
  return Response(wav, media_type="audio/wav", headers={"Cache-Control": "no-store"})
438
 
439
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
440
  # ── Persona portraits (image generation) ─────────────────────────────────────
441
  # Mirrors the voice path: TINY_IMAGE_MODE=local runs the OPEN WEIGHTS on your GPU
442
  # (Z-Image-Turbo, 6B, ~12 GB bf16 — coexists with the TTS model on a 24 GB card);
 
421
  return f.read()
422
 
423
 
424
def _voxcpm_clone(text, ref_audio_b64, ref_text, instruct):
    """Call the ``/clone`` endpoint of the VoxCPM Gradio Space and return the audio bytes.

    Blocking helper: it performs a remote call and then reads a file.

    Args:
        text: Text to synthesize.
        ref_audio_b64: Reference audio, forwarded to the Space unchanged.
            NOTE(review): the name suggests a base64 string; confirm the
            Space's ``/clone`` endpoint accepts that form rather than a file.
        ref_text: Transcript of the reference audio; falsy values become "".
        instruct: Style instruction; falsy values become "".

    Returns:
        The raw bytes of the file the Space returned.
    """
    # Imported lazily so gradio_client is only needed when this path runs.
    from gradio_client import Client

    space = Client(VOXCPM_SPACE, token=HF_TOKEN or None)
    outcome = space.predict(
        text,
        ref_audio_b64,
        ref_text or "",
        instruct or "",
        api_name="/clone",
    )

    # The endpoint may hand back either a single path or a sequence whose
    # first element is the path.
    wav_path = outcome
    if isinstance(outcome, (tuple, list)):
        wav_path = outcome[0]

    with open(os.fspath(wav_path), "rb") as wav_file:
        return wav_file.read()
437
+
438
+
439
  @fastapi_app.post("/voxcpm-tts")
440
  async def voxcpm_tts(request: Request):
441
  body = await request.json()
 
452
  return Response(wav, media_type="audio/wav", headers={"Cache-Control": "no-store"})
453
 
454
 
455
@fastapi_app.post("/voxcpm-clone")
async def voxcpm_clone(request: Request):
    """Synthesize ``text`` in the voice of a reference clip via the VoxCPM Space.

    Expects a JSON object with string fields ``text`` (required),
    ``ref_audio`` (required), ``ref_text`` and ``instruct`` (both optional).

    Returns:
        200 with the audio bytes as ``audio/wav`` on success;
        400 when the body is not a valid JSON object, a field is not a
        string, or ``text`` / ``ref_audio`` is empty;
        503 when ``VOXCPM_SPACE`` is not configured;
        502 when the call to the Space fails.
    """
    # A malformed body must be reported as a client error, not a 500.
    try:
        body = await request.json()
    except ValueError:  # json.JSONDecodeError and UnicodeDecodeError both subclass it
        return Response("invalid JSON body", status_code=400)
    if not isinstance(body, dict):
        return Response("JSON object required", status_code=400)

    fields = {
        key: body.get(key) or ""
        for key in ("text", "ref_audio", "ref_text", "instruct")
    }
    # Non-string values would otherwise blow up on .strip() below.
    if not all(isinstance(value, str) for value in fields.values()):
        return Response(
            "text, ref_audio, ref_text and instruct must be strings",
            status_code=400,
        )

    text = fields["text"].strip()
    ref_audio = fields["ref_audio"]
    ref_text = fields["ref_text"]
    instruct = fields["instruct"].strip()

    if not text:
        return Response("text required", status_code=400)
    if not ref_audio:
        return Response("ref_audio required", status_code=400)
    if not VOXCPM_SPACE:
        return Response("TINY_VOXCPM_SPACE not set", status_code=503)

    try:
        # _voxcpm_clone blocks, so run it in a worker thread to keep the
        # event loop free.
        wav = await asyncio.to_thread(_voxcpm_clone, text, ref_audio, ref_text, instruct)
    except Exception as e:  # noqa: BLE001
        return Response(f"VoxCPM clone error: {e}", status_code=502)
    return Response(wav, media_type="audio/wav", headers={"Cache-Control": "no-store"})
473
+
474
+
475
  # ── Persona portraits (image generation) ─────────────────────────────────────
476
  # Mirrors the voice path: TINY_IMAGE_MODE=local runs the OPEN WEIGHTS on your GPU
477
  # (Z-Image-Turbo, 6B, ~12 GB bf16 — coexists with the TTS model on a 24 GB card);
web/ttsVoxcpm.js CHANGED
@@ -24,6 +24,20 @@ async function postSynthWav(text, voiceId) {
24
  }
25
  const postSynth = async (text, voiceId) => decodeAudio(await postSynthWav(text, voiceId))
26
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
27
  export const engine = {
28
  mode: 'pcm', needsDownload: false, networked: true, design: true,
29
  id: 'voxcpm',
@@ -36,6 +50,6 @@ export const engine = {
36
  setDesc(d) { _desc = (d || '').trim() },
37
  synth: (text, voiceId) => postSynth(text, voiceId),
38
  synthWav: (text, voiceId) => postSynthWav(text, voiceId),
39
- cloneWav: (text, refAb, refText, instruct) => postSynthWav(text, 'persona'),
40
  backendLabel: () => 'ZeroGPU VoxCPM2',
41
  }
 
24
  }
25
  const postSynth = async (text, voiceId) => decodeAudio(await postSynthWav(text, voiceId))
26
 
27
// Base64-encode the bytes of `ab`. The bytes are converted to a binary string
// in 0x8000-byte slices so that no single String.fromCharCode call receives
// an oversized argument list.
function abToB64(ab) {
  const bytes = new Uint8Array(ab)
  const CHUNK = 0x8000
  const pieces = []
  let offset = 0
  while (offset < bytes.length) {
    pieces.push(String.fromCharCode.apply(null, bytes.subarray(offset, offset + CHUNK)))
    offset += CHUNK
  }
  return btoa(pieces.join(''))
}
32
// POST a clone request to /voxcpm-clone and resolve with the response body as
// an ArrayBuffer. `refAb` is sent base64-encoded; a missing refText or
// instruct is sent as ''. Throws an Error carrying the HTTP status and the
// first 140 characters of the response text when the response is not ok.
async function postCloneWav(text, refAb, refText, instruct) {
  const payload = {
    text,
    ref_audio: abToB64(refAb),
    ref_text: refText || '',
    instruct: instruct || '',
    language: 'English',
  }
  const resp = await fetch('/voxcpm-clone', {
    method: 'POST',
    headers: { 'Content-Type': 'application/json' },
    body: JSON.stringify(payload),
  })
  if (!resp.ok) {
    const detail = (await resp.text()).slice(0, 140)
    throw new Error(`VoxCPM clone ${resp.status}: ${detail}`)
  }
  return resp.arrayBuffer()
}
40
+
41
  export const engine = {
42
  mode: 'pcm', needsDownload: false, networked: true, design: true,
43
  id: 'voxcpm',
 
50
  setDesc(d) { _desc = (d || '').trim() },
51
  synth: (text, voiceId) => postSynth(text, voiceId),
52
  synthWav: (text, voiceId) => postSynthWav(text, voiceId),
53
+ cloneWav: (text, refAb, refText, instruct) => postCloneWav(text, refAb, refText, instruct),
54
  backendLabel: () => 'ZeroGPU VoxCPM2',
55
  }