Spaces:

build-small-hackathon
/

tiny-army

Running

App Files Files Community

polats committed 3 days ago

Commit

8000b92

1 Parent(s): 3aafe4e

Add VoxCPM reference audio cloning

Browse files

Files changed (2) hide show

app.py +35 -0
web/ttsVoxcpm.js +15 -1

app.py CHANGED Viewed

@@ -421,6 +421,21 @@ def _voxcpm_tts(text, instruct):
         return f.read()
 @fastapi_app.post("/voxcpm-tts")
 async def voxcpm_tts(request: Request):
     body = await request.json()
@@ -437,6 +452,26 @@ async def voxcpm_tts(request: Request):
     return Response(wav, media_type="audio/wav", headers={"Cache-Control": "no-store"})
 # ── Persona portraits (image generation) ─────────────────────────────────────
 # Mirrors the voice path: TINY_IMAGE_MODE=local runs the OPEN WEIGHTS on your GPU
 # (Z-Image-Turbo, 6B, ~12 GB bf16 — coexists with the TTS model on a 24 GB card);

         return f.read()
+def _voxcpm_clone(text, ref_audio_b64, ref_text, instruct):
+    from gradio_client import Client
+    client = Client(VOXCPM_SPACE, token=HF_TOKEN or None)
+    result = client.predict(
+        text,
+        ref_audio_b64,
+        ref_text or "",
+        instruct or "",
+        api_name="/clone",
+    )
+    path = result[0] if isinstance(result, (tuple, list)) else result
+    with open(os.fspath(path), "rb") as f:
+        return f.read()
 @fastapi_app.post("/voxcpm-tts")
 async def voxcpm_tts(request: Request):
     body = await request.json()
     return Response(wav, media_type="audio/wav", headers={"Cache-Control": "no-store"})
+@fastapi_app.post("/voxcpm-clone")
+async def voxcpm_clone(request: Request):
+    body = await request.json()
+    text = (body.get("text") or "").strip()
+    ref_audio = body.get("ref_audio") or ""
+    ref_text = body.get("ref_text") or ""
+    instruct = (body.get("instruct") or "").strip()
+    if not text:
+        return Response("text required", status_code=400)
+    if not ref_audio:
+        return Response("ref_audio required", status_code=400)
+    if not VOXCPM_SPACE:
+        return Response("TINY_VOXCPM_SPACE not set", status_code=503)
+    try:
+        wav = await asyncio.to_thread(_voxcpm_clone, text, ref_audio, ref_text, instruct)
+    except Exception as e:  # noqa: BLE001
+        return Response(f"VoxCPM clone error: {e}", status_code=502)
+    return Response(wav, media_type="audio/wav", headers={"Cache-Control": "no-store"})
 # ── Persona portraits (image generation) ─────────────────────────────────────
 # Mirrors the voice path: TINY_IMAGE_MODE=local runs the OPEN WEIGHTS on your GPU
 # (Z-Image-Turbo, 6B, ~12 GB bf16 — coexists with the TTS model on a 24 GB card);

web/ttsVoxcpm.js CHANGED Viewed

@@ -24,6 +24,20 @@ async function postSynthWav(text, voiceId) {
 }
 const postSynth = async (text, voiceId) => decodeAudio(await postSynthWav(text, voiceId))
 export const engine = {
   mode: 'pcm', needsDownload: false, networked: true, design: true,
   id: 'voxcpm',
@@ -36,6 +50,6 @@ export const engine = {
   setDesc(d) { _desc = (d || '').trim() },
   synth: (text, voiceId) => postSynth(text, voiceId),
   synthWav: (text, voiceId) => postSynthWav(text, voiceId),
-  cloneWav: (text, refAb, refText, instruct) => postSynthWav(text, 'persona'),
   backendLabel: () => 'ZeroGPU VoxCPM2',
 }

 }
 const postSynth = async (text, voiceId) => decodeAudio(await postSynthWav(text, voiceId))
+function abToB64(ab) {
+  let s = ''; const u = new Uint8Array(ab); const C = 0x8000
+  for (let i = 0; i < u.length; i += C) s += String.fromCharCode.apply(null, u.subarray(i, i + C))
+  return btoa(s)
+}
+async function postCloneWav(text, refAb, refText, instruct) {
+  const resp = await fetch('/voxcpm-clone', {
+    method: 'POST', headers: { 'Content-Type': 'application/json' },
+    body: JSON.stringify({ text, ref_audio: abToB64(refAb), ref_text: refText || '', instruct: instruct || '', language: 'English' }),
+  })
+  if (!resp.ok) throw new Error(`VoxCPM clone ${resp.status}: ${(await resp.text()).slice(0, 140)}`)
+  return resp.arrayBuffer()
+}
 export const engine = {
   mode: 'pcm', needsDownload: false, networked: true, design: true,
   id: 'voxcpm',
   setDesc(d) { _desc = (d || '').trim() },
   synth: (text, voiceId) => postSynth(text, voiceId),
   synthWav: (text, voiceId) => postSynthWav(text, voiceId),
+  cloneWav: (text, refAb, refText, instruct) => postCloneWav(text, refAb, refText, instruct),
   backendLabel: () => 'ZeroGPU VoxCPM2',
 }