Spaces:

build-small-hackathon
/

tiny-army

Running

App Files Files Community

polats committed 4 days ago

Commit

3aafe4e

1 Parent(s): 8e1c8c5

Add VoxCPM ZeroGPU voice provider

Browse files

Files changed (3) hide show

app.py +26 -0
web/tts.js +12 -7
web/ttsVoxcpm.js +41 -0

app.py CHANGED Viewed

@@ -261,6 +261,7 @@ _DASHSCOPE_URL = _DASHSCOPE_BASE + "/api/v1/services/audio/tts/customization"
 # origin so no CORS/cert dance — the LeLab pattern). Needs `pip install qwen-tts torch
 # soundfile`. Lazy-loaded; the Space (cpu-basic) leaves this unset and uses DashScope.
 TTS_MODE = os.environ.get("TINY_TTS_MODE", "").strip().lower()
 _local_tts = None       # VoiceDesign model
 _local_clone = None     # Base model (voice clone) — lazy, only if a clone is requested
 _local_tts_lock = threading.Lock()
@@ -411,6 +412,31 @@ async def qwen_tts(request: Request):
     return Response(wav, media_type="audio/wav", headers={"Cache-Control": "no-store"})
 # ── Persona portraits (image generation) ─────────────────────────────────────
 # Mirrors the voice path: TINY_IMAGE_MODE=local runs the OPEN WEIGHTS on your GPU
 # (Z-Image-Turbo, 6B, ~12 GB bf16 — coexists with the TTS model on a 24 GB card);

 # origin so no CORS/cert dance — the LeLab pattern). Needs `pip install qwen-tts torch
 # soundfile`. Lazy-loaded; the Space (cpu-basic) leaves this unset and uses DashScope.
 TTS_MODE = os.environ.get("TINY_TTS_MODE", "").strip().lower()
+VOXCPM_SPACE = os.environ.get("TINY_VOXCPM_SPACE", "").strip()  # target given to gradio_client.Client in _voxcpm_tts; empty => /voxcpm-tts answers 503
 _local_tts = None       # VoiceDesign model
 _local_clone = None     # Base model (voice clone) — lazy, only if a clone is requested
 _local_tts_lock = threading.Lock()
     return Response(wav, media_type="audio/wav", headers={"Cache-Control": "no-store"})
+def _voxcpm_tts(text, instruct):
+    from gradio_client import Client
+    client = Client(VOXCPM_SPACE, token=HF_TOKEN or None)
+    result = client.predict(text, instruct or "A clear, natural voice at a moderate pace.", api_name="/synthesize")
+    path = result[0] if isinstance(result, (tuple, list)) else result
+    with open(os.fspath(path), "rb") as f:
+        return f.read()
+@fastapi_app.post("/voxcpm-tts")
+async def voxcpm_tts(request: Request):
+    body = await request.json()
+    text = (body.get("text") or "").strip()
+    instruct = (body.get("instruct") or "").strip()
+    if not text:
+        return Response("text required", status_code=400)
+    if not VOXCPM_SPACE:
+        return Response("TINY_VOXCPM_SPACE not set", status_code=503)
+    try:
+        wav = await asyncio.to_thread(_voxcpm_tts, text, instruct)
+    except Exception as e:  # noqa: BLE001
+        return Response(f"VoxCPM error: {e}", status_code=502)
+    return Response(wav, media_type="audio/wav", headers={"Cache-Control": "no-store"})
 # ── Persona portraits (image generation) ─────────────────────────────────────
 # Mirrors the voice path: TINY_IMAGE_MODE=local runs the OPEN WEIGHTS on your GPU
 # (Z-Image-Turbo, 6B, ~12 GB bf16 — coexists with the TTS model on a 24 GB card);

web/tts.js CHANGED Viewed

@@ -4,12 +4,13 @@
 // LLM is still writing. Panels + the TTS bar import only from here.
 import { engine as kokoro } from '/web/ttsKokoro.js'
 import { engine as qwen3, engineLocal as qwen3local, isLocalhost } from '/web/ttsQwen3.js'
 import { engine as kitten } from '/web/ttsKitten.js'
 import { engine as webspeech } from '/web/ttsWebSpeech.js'
 import { playSamples, stopAudio, decodeAudio, encodeWav } from '/web/ttsAudio.js'
 import { ensurePersistentStorage } from '/web/storage.js'
-const ENGINES = [kokoro, qwen3local, qwen3, kitten, webspeech]
 // Default voice provider: local-GPU Qwen3-TTS on localhost (your GPU designs voices),
 // in-browser Kokoro in prod (runs on the device — no exhaustible cloud quota). Cloud
 // Qwen3-TTS and the others remain selectable in Settings. Persisted across refreshes.
@@ -23,10 +24,11 @@ let activeId = (() => {
 // Qwen3-TTS designs a voice from a free-form description (the persona's `voice`).
 // Panels set it before narrating; previewVoice() plays a one-off sample.
-export function setVoiceDescription(desc) { qwen3.setDesc(desc) }
 export async function previewVoice(desc, text) {
-  qwen3.setDesc(desc)
-  const { audio, sampleRate } = await qwen3.synth(text, 'persona')
   return playSamples(audio, sampleRate)
 }
 export const stopPreview = () => stopAudio()
@@ -34,13 +36,16 @@ export const stopPreview = () => stopAudio()
 // Create a persona's voice FILE: synth the line in the designed voice and return the
 // raw WAV (ArrayBuffer) so it can be cached + replayed verbatim. Caller plays it.
 export async function createVoiceWav(desc, text) {
-  qwen3.setDesc(desc)
-  return qwen3.synthWav(text, 'persona')
 }
 // Clone `text` from a reference voice file (keep timbre, change words). `desc` is the
 // voice design — a fallback so prod (no clone model) can re-design instead. Returns WAV.
 export async function cloneVoiceWav(refArrayBuffer, refText, text, desc) {
-  return qwen3.cloneWav(text, refArrayBuffer, refText, desc)
 }
 export async function playWav(arrayBuffer) {
   const { audio, sampleRate } = await decodeAudio(arrayBuffer)

 // LLM is still writing. Panels + the TTS bar import only from here.
 import { engine as kokoro } from '/web/ttsKokoro.js'
 import { engine as qwen3, engineLocal as qwen3local, isLocalhost } from '/web/ttsQwen3.js'
+import { engine as voxcpm } from '/web/ttsVoxcpm.js'
 import { engine as kitten } from '/web/ttsKitten.js'
 import { engine as webspeech } from '/web/ttsWebSpeech.js'
 import { playSamples, stopAudio, decodeAudio, encodeWav } from '/web/ttsAudio.js'
 import { ensurePersistentStorage } from '/web/storage.js'
+const ENGINES = [kokoro, qwen3local, qwen3, voxcpm, kitten, webspeech]
 // Default voice provider: local-GPU Qwen3-TTS on localhost (your GPU designs voices),
 // in-browser Kokoro in prod (runs on the device — no exhaustible cloud quota). Cloud
 // Qwen3-TTS and the others remain selectable in Settings. Persisted across refreshes.
 // Qwen3-TTS designs a voice from a free-form description (the persona's `voice`).
 // Panels set it before narrating; previewVoice() plays a one-off sample.
+export function setVoiceDescription(desc) { qwen3.setDesc(desc); voxcpm.setDesc(desc) }
 export async function previewVoice(desc, text) {
+  const e = eng()
+  if (e.setDesc) e.setDesc(desc)
+  const { audio, sampleRate } = await e.synth(text, 'persona')
   return playSamples(audio, sampleRate)
 }
 export const stopPreview = () => stopAudio()
 // Create a persona's voice FILE: synth the line in the designed voice and return the
 // raw WAV (ArrayBuffer) so it can be cached + replayed verbatim. Caller plays it.
 export async function createVoiceWav(desc, text) {
+  const e = eng()
+  if (e.setDesc) e.setDesc(desc)
+  return e.synthWav(text, 'persona')
 }
 // Clone `text` from a reference voice file (keep timbre, change words). `desc` is the
 // voice design — a fallback so prod (no clone model) can re-design instead. Returns WAV.
 export async function cloneVoiceWav(refArrayBuffer, refText, text, desc) {
+  const e = eng()
+  if (e.setDesc) e.setDesc(desc)
+  return e.cloneWav(text, refArrayBuffer, refText, desc)
 }
 export async function playWav(arrayBuffer) {
   const { audio, sampleRate } = await decodeAudio(arrayBuffer)

web/ttsVoxcpm.js ADDED Viewed

	@@ -0,0 +1,41 @@

+// TTS engine: VoxCPM2 via a ZeroGPU sidecar, proxied through /voxcpm-tts so
+// tokens and sidecar details stay server-side. Like Qwen3, this designs a voice
+// from each hero's free-form voice description.
+import { decodeAudio } from '/web/ttsAudio.js'
+let _desc = ''  // current persona voice description; written by engine.setDesc()
+const VOICES = [  // selectable voices; each desc() yields the instruct text sent to /voxcpm-tts
+  { id: 'persona', label: '✨ Persona voice (designed)', desc: () => _desc },  // reads _desc at call time, not at definition
+  { id: 'veteran', label: 'Gruff veteran', desc: () => 'A gravelly, battle-worn male baritone - slow, deliberate, weary, with a wry edge.' },
+  { id: 'herald', label: 'Bright herald', desc: () => 'A clear, bright young male voice - brisk, energetic, projecting and confident.' },
+  { id: 'medic', label: 'Steady medic', desc: () => 'A calm, warm female voice - measured pace, clear articulation, reassuring.' },
+  { id: 'rogue', label: 'Sly rogue', desc: () => 'A low, smooth voice with a sly, amused lilt - unhurried, with a dangerous edge.' },
+]
+const get = (id) => VOICES.find((v) => v.id === id) || VOICES[0]  // unknown ids fall back to the first entry ('persona')
+async function postSynthWav(text, voiceId) {
+  const instruct = (get(voiceId).desc() || '').trim()
+  const resp = await fetch('/voxcpm-tts', {
+    method: 'POST', headers: { 'Content-Type': 'application/json' },
+    body: JSON.stringify({ text, instruct, language: 'English' }),
+  })
+  if (!resp.ok) throw new Error(`VoxCPM ${resp.status}: ${(await resp.text()).slice(0, 140)}`)
+  return resp.arrayBuffer()
+}
+const postSynth = async (text, voiceId) => decodeAudio(await postSynthWav(text, voiceId))
+export const engine = {
+  mode: 'pcm', needsDownload: false, networked: true, design: true,
+  id: 'voxcpm',
+  label: 'VoxCPM2 · Voice Design (ZeroGPU)',
+  experimental: true,
+  available: () => true,
+  listVoices: () => VOICES,
+  defaultVoice: 'persona',
+  ensure: async () => {},
+  setDesc(d) { _desc = (d || '').trim() },
+  synth: (text, voiceId) => postSynth(text, voiceId),
+  synthWav: (text, voiceId) => postSynthWav(text, voiceId),
+  cloneWav: (text, refAb, refText, instruct) => postSynthWav(text, 'persona'),
+  backendLabel: () => 'ZeroGPU VoxCPM2',
+}