"""Bark-based text-to-speech helpers: lazy model loading, synthesis, warmup."""

from typing import Optional, Tuple

# NOTE(review): `spaces` is deliberately kept as the first third-party import,
# ahead of torch, exactly as in the original ordering. ZeroGPU presumably needs
# to be imported before CUDA is touched -- confirm before re-sorting imports.
import spaces
import torch
from transformers import AutoProcessor, BarkModel
import librosa
import numpy as np

VOICE_MODEL_ID = "suno/bark-small"

# Maps the application's voice keys to Bark voice presets.
VOICE_DESIGN = {
    "sml": "v2/en_speaker_6",
    "chop": "v2/en_speaker_2",
    "agressor": "v2/en_speaker_1",
}

# Preset used when `voice_key` is not found in VOICE_DESIGN.
_DEFAULT_VOICE_PRESET = "v2/en_speaker_6"
# Sample rate (Hz) reported alongside the generated waveform.
_SAMPLE_RATE = 24000
# Time-stretch factor applied to the generated audio; > 1 shortens it.
_SPEED_UP_RATE = 1.25

# Lazily-populated singletons; filled in by get_model() on first use.
_processor = None
_model = None


def get_model():
    """Return the shared ``(processor, model)`` pair, loading it on first call.

    The first call downloads/loads ``VOICE_MODEL_ID`` and moves the model to
    the ``"cuda"`` device; later calls return the cached objects.

    Returns:
        A ``(processor, model)`` tuple.
    """
    global _model, _processor
    if _model is None:
        print(f"[voice] loading {VOICE_MODEL_ID} ...", flush=True)
        _processor = AutoProcessor.from_pretrained(VOICE_MODEL_ID)
        # Bark runs natively in standard transformers. Using float16 to save VRAM.
        _model = BarkModel.from_pretrained(
            VOICE_MODEL_ID,
            torch_dtype=torch.float16,
        ).to("cuda")
        print("[voice] model is ready.", flush=True)
    return _processor, _model


@spaces.GPU(duration=50)
def synthesize(text: str, voice_key: str) -> Optional[Tuple[int, np.ndarray]]:
    """Generate speech for *text* using the voice selected by *voice_key*.

    Args:
        text: Prompt to speak. Falsy or whitespace-only text produces no audio.
        voice_key: Key into ``VOICE_DESIGN``; unknown keys fall back to
            ``_DEFAULT_VOICE_PRESET``.

    Returns:
        ``None`` when there is nothing to speak, otherwise a
        ``(sample_rate, waveform)`` tuple where ``waveform`` is a float32
        NumPy array sped up by ``_SPEED_UP_RATE``.
    """
    # Skip generation for empty prompts; whitespace-only strings carry no
    # speakable content either, so they are not worth a GPU call.
    if not text or (isinstance(text, str) and not text.strip()):
        return None

    processor, model = get_model()
    voice_preset = VOICE_DESIGN.get(voice_key, _DEFAULT_VOICE_PRESET)
    print(f"[voice] generating voice from {text}", flush=True)

    # Named `inputs` (not `input`) to avoid shadowing the builtin.
    inputs = processor(
        text,
        voice_preset=voice_preset,
        return_tensors="pt",
    ).to("cuda")
    with torch.no_grad():
        speech_output = model.generate(**inputs)

    wav = speech_output[0].cpu().numpy()
    # Convert the float16 array to float32 so Numba can process it
    wav = wav.astype(np.float32)
    # Speed up the voice to reduce duration without altering pitch
    wav = librosa.effects.time_stretch(wav, rate=_SPEED_UP_RATE)
    return (_SAMPLE_RATE, wav)


@spaces.GPU(duration=150)
def warmup():
    """Load the model ahead of time so the first synthesis call is not slowed by loading."""
    get_model()
    print("[voice] warmup complete.", flush=True)