Voinal / tts_model.py
GovIndLok
fix: cast audio output to float32 and adjust server binding based on platform environment
552a97b
Raw
History Blame Contribute Delete
1.63 kB
import spaces
import torch
from transformers import AutoProcessor, BarkModel
import librosa
import numpy as np
# Identifier of the Bark checkpoint handed to ``from_pretrained`` in get_model().
VOICE_MODEL_ID = "suno/bark-small"

# Application voice key -> Bark speaker preset name.
# Keys are looked up verbatim by synthesize(), so their spelling is part of
# the calling contract and must not be "corrected" here.
VOICE_DESIGN = {
    "sml": "v2/en_speaker_6",
    "chop": "v2/en_speaker_2",
    "agressor": "v2/en_speaker_1",
}

# Module-level cache for the processor/model pair; both start out unset and
# are filled in by get_model() the first time it runs.
_processor = _model = None
def get_model():
    """Return the shared ``(processor, model)`` pair, loading it on first use.

    On the first call the processor and the Bark model for ``VOICE_MODEL_ID``
    are loaded, the model is moved to the ``"cuda"`` device, and both objects
    are stored in the module-level ``_processor`` / ``_model`` globals.
    Later calls return the cached objects without reloading.

    Returns:
        A ``(processor, model)`` tuple.
    """
    global _model, _processor

    # Fast path: everything was already loaded by an earlier call.
    if _model is not None:
        return _processor, _model

    print(f"[voice] loading {VOICE_MODEL_ID} ...", flush=True)
    _processor = AutoProcessor.from_pretrained(VOICE_MODEL_ID)

    # Bark runs natively in standard transformers. Half precision (float16)
    # is requested to save VRAM.
    half_precision_model = BarkModel.from_pretrained(
        VOICE_MODEL_ID, torch_dtype=torch.float16
    )
    # Only publish the model to the cache once it is on the GPU, so a failed
    # transfer leaves ``_model`` unset and the next call retries the load.
    _model = half_precision_model.to("cuda")
    print("[voice] model is ready.")

    return _processor, _model
@spaces.GPU(duration=50)
def synthesize(text: str, voice_key: str):
    """Generate speech for *text* using the voice selected by *voice_key*.

    Args:
        text: The text to speak. Empty or whitespace-only text yields no
            audio.
        voice_key: Key into ``VOICE_DESIGN``. Unknown keys fall back to the
            ``"v2/en_speaker_6"`` preset.

    Returns:
        ``None`` when there is nothing to speak; otherwise a
        ``(sample_rate, waveform)`` tuple, where ``sample_rate`` is ``24000``
        and ``waveform`` is the generated audio as a NumPy array after being
        converted to float32 and time-stretched by a factor of 1.25.
    """
    # Nothing to pronounce: treat whitespace-only text like empty text rather
    # than spending GPU time generating audio for it.
    if not text or text.isspace():
        return None

    processor, model = get_model()
    voice_preset = VOICE_DESIGN.get(voice_key, "v2/en_speaker_6")
    print(f"[voice] generating voice from {text}", flush=True)

    # Named ``inputs`` (not ``input``) so the builtin is not shadowed.
    inputs = processor(
        text, voice_preset=voice_preset, return_tensors="pt"
    ).to("cuda")

    # Inference only; no gradients are needed.
    with torch.no_grad():
        speech_output = model.generate(**inputs)

    # Take the first generated sequence and bring it back to host memory.
    wav = speech_output[0].cpu().numpy()

    # Convert the float16 array to float32 so Numba (used by librosa) can
    # process it.
    wav = wav.astype(np.float32)

    # Speed up the voice by 1.25x to reduce duration without altering pitch.
    wav = librosa.effects.time_stretch(wav, rate=1.25)

    return (24000, wav)
@spaces.GPU(duration=150)
def warmup() -> None:
    """Load the voice model ahead of time so later requests find it cached.

    Calls ``get_model()`` purely for its caching side effect and then logs
    that warmup has finished.
    """
    # The returned (processor, model) pair is intentionally discarded; the
    # call matters only because it populates the module-level cache.
    _ = get_model()
    print("[voice] warmup complete.", flush=True)