Spaces:
Running
Running
Add VoxCPM reference audio cloning
Browse files
- app.py +35 -0
- web/ttsVoxcpm.js +15 -1
app.py
CHANGED
|
@@ -421,6 +421,21 @@ def _voxcpm_tts(text, instruct):
|
|
| 421 |
return f.read()
|
| 422 |
|
| 423 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 424 |
@fastapi_app.post("/voxcpm-tts")
|
| 425 |
async def voxcpm_tts(request: Request):
|
| 426 |
body = await request.json()
|
|
@@ -437,6 +452,26 @@ async def voxcpm_tts(request: Request):
|
|
| 437 |
return Response(wav, media_type="audio/wav", headers={"Cache-Control": "no-store"})
|
| 438 |
|
| 439 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 440 |
# ── Persona portraits (image generation) ─────────────────────────────────────
|
| 441 |
# Mirrors the voice path: TINY_IMAGE_MODE=local runs the OPEN WEIGHTS on your GPU
|
| 442 |
# (Z-Image-Turbo, 6B, ~12 GB bf16 — coexists with the TTS model on a 24 GB card);
|
|
|
|
| 421 |
return f.read()
|
| 422 |
|
| 423 |
|
| 424 |
+
def _voxcpm_clone(text, ref_audio_b64, ref_text, instruct):
    """Call the VoxCPM Space's ``/clone`` API and return the result file's bytes.

    Blocking helper; the ``/voxcpm-clone`` handler runs it via
    ``asyncio.to_thread``.

    Args:
        text: Text to synthesize; forwarded unchanged.
        ref_audio_b64: Reference audio, forwarded unchanged as the second
            positional input (the HTTP handler passes the client's base64
            string here).
        ref_text: Forwarded as ``""`` when falsy.
        instruct: Forwarded as ``""`` when falsy.

    Returns:
        The raw bytes of the file the Space call points at.
    """
    from gradio_client import Client

    space = Client(VOXCPM_SPACE, token=HF_TOKEN or None)
    prediction = space.predict(
        text,
        ref_audio_b64,
        ref_text or "",
        instruct or "",
        api_name="/clone",
    )

    # The call may return a bare path or a sequence whose first item is the path.
    if isinstance(prediction, (tuple, list)):
        wav_path = prediction[0]
    else:
        wav_path = prediction

    with open(os.fspath(wav_path), "rb") as fh:
        audio_bytes = fh.read()
    return audio_bytes
|
| 437 |
+
|
| 438 |
+
|
| 439 |
@fastapi_app.post("/voxcpm-tts")
|
| 440 |
async def voxcpm_tts(request: Request):
|
| 441 |
body = await request.json()
|
|
|
|
| 452 |
return Response(wav, media_type="audio/wav", headers={"Cache-Control": "no-store"})
|
| 453 |
|
| 454 |
|
| 455 |
+
@fastapi_app.post("/voxcpm-clone")
async def voxcpm_clone(request: Request):
    """Synthesize ``text`` with VoxCPM using a caller-supplied reference clip.

    Expects a JSON object with string fields ``text`` (required),
    ``ref_audio`` (required), ``ref_text`` (optional) and ``instruct``
    (optional). Other keys are ignored.

    Responses:
        200: body is whatever ``_voxcpm_clone`` returned, served as ``audio/wav``.
        400: body is not a JSON object, a field is not a string, or
             ``text`` / ``ref_audio`` is empty.
        503: ``VOXCPM_SPACE`` is not configured.
        502: the upstream call raised.
    """
    # Reject malformed input as a client error rather than letting it
    # surface as an unhandled 500.
    try:
        body = await request.json()
    except ValueError:  # JSONDecodeError and UnicodeDecodeError both derive from it
        return Response("invalid JSON body", status_code=400)
    if not isinstance(body, dict):
        return Response("JSON object required", status_code=400)

    fields = {
        key: body.get(key) or ""
        for key in ("text", "ref_audio", "ref_text", "instruct")
    }
    # Non-string values would otherwise crash on .strip() or be sent upstream as-is.
    if not all(isinstance(value, str) for value in fields.values()):
        return Response(
            "text, ref_audio, ref_text and instruct must be strings",
            status_code=400,
        )

    text = fields["text"].strip()
    ref_audio = fields["ref_audio"]
    ref_text = fields["ref_text"]
    instruct = fields["instruct"].strip()

    if not text:
        return Response("text required", status_code=400)
    if not ref_audio:
        return Response("ref_audio required", status_code=400)
    if not VOXCPM_SPACE:
        return Response("TINY_VOXCPM_SPACE not set", status_code=503)

    try:
        # _voxcpm_clone is blocking, so run it off the event loop.
        wav = await asyncio.to_thread(_voxcpm_clone, text, ref_audio, ref_text, instruct)
    except Exception as e:  # noqa: BLE001
        return Response(f"VoxCPM clone error: {e}", status_code=502)
    return Response(wav, media_type="audio/wav", headers={"Cache-Control": "no-store"})
|
| 473 |
+
|
| 474 |
+
|
| 475 |
# ── Persona portraits (image generation) ─────────────────────────────────────
|
| 476 |
# Mirrors the voice path: TINY_IMAGE_MODE=local runs the OPEN WEIGHTS on your GPU
|
| 477 |
# (Z-Image-Turbo, 6B, ~12 GB bf16 — coexists with the TTS model on a 24 GB card);
|
web/ttsVoxcpm.js
CHANGED
|
@@ -24,6 +24,20 @@ async function postSynthWav(text, voiceId) {
|
|
| 24 |
}
|
| 25 |
const postSynth = async (text, voiceId) => decodeAudio(await postSynthWav(text, voiceId))
|
| 26 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 27 |
export const engine = {
|
| 28 |
mode: 'pcm', needsDownload: false, networked: true, design: true,
|
| 29 |
id: 'voxcpm',
|
|
@@ -36,6 +50,6 @@ export const engine = {
|
|
| 36 |
setDesc(d) { _desc = (d || '').trim() },
|
| 37 |
synth: (text, voiceId) => postSynth(text, voiceId),
|
| 38 |
synthWav: (text, voiceId) => postSynthWav(text, voiceId),
|
| 39 |
-
cloneWav: (text, refAb, refText, instruct) =>
|
| 40 |
backendLabel: () => 'ZeroGPU VoxCPM2',
|
| 41 |
}
|
|
|
|
| 24 |
}
|
| 25 |
// Fetch the synthesized WAV via postSynthWav, then hand the result to decodeAudio.
const postSynth = async (text, voiceId) => decodeAudio(await postSynthWav(text, voiceId))
|
| 26 |
|
| 27 |
+
// Base64-encode the bytes of an ArrayBuffer.
// Each byte becomes one character of a binary string, which is then passed to btoa.
function abToB64(ab) {
  const bytes = new Uint8Array(ab)
  const CHUNK = 0x8000
  let binary = ''
  // Convert in 0x8000-byte slices (presumably to bound the argument list given to apply).
  for (let offset = 0; offset < bytes.length; offset += CHUNK) {
    const slice = bytes.subarray(offset, offset + CHUNK)
    binary += String.fromCharCode.apply(null, slice)
  }
  return btoa(binary)
}
|
| 32 |
+
// POST a clone request to /voxcpm-clone and resolve with the response body as an ArrayBuffer.
//   text     - text to synthesize
//   refAb    - reference audio as an ArrayBuffer; sent base64-encoded in `ref_audio`
//   refText  - sent as `ref_text` ('' when falsy)
//   instruct - sent as `instruct` ('' when falsy)
// Throws an Error carrying the status and the first 140 characters of the body on a non-OK response.
async function postCloneWav(text, refAb, refText, instruct) {
  const payload = {
    text,
    ref_audio: abToB64(refAb),
    ref_text: refText || '',
    instruct: instruct || '',
    language: 'English',
  }
  const resp = await fetch('/voxcpm-clone', {
    method: 'POST',
    headers: { 'Content-Type': 'application/json' },
    body: JSON.stringify(payload),
  })
  if (resp.ok) return resp.arrayBuffer()
  const detail = (await resp.text()).slice(0, 140)
  throw new Error(`VoxCPM clone ${resp.status}: ${detail}`)
}
|
| 40 |
+
|
| 41 |
export const engine = {
|
| 42 |
mode: 'pcm', needsDownload: false, networked: true, design: true,
|
| 43 |
id: 'voxcpm',
|
|
|
|
| 50 |
setDesc(d) { _desc = (d || '').trim() },
|
| 51 |
synth: (text, voiceId) => postSynth(text, voiceId),
|
| 52 |
synthWav: (text, voiceId) => postSynthWav(text, voiceId),
|
| 53 |
+
cloneWav: (text, refAb, refText, instruct) => postCloneWav(text, refAb, refText, instruct),
|
| 54 |
backendLabel: () => 'ZeroGPU VoxCPM2',
|
| 55 |
}
|