Spaces:
Running on Zero
Running on Zero
GovIndLok
fix: cast audio output to float32 and adjust server binding based on platform environment
552a97b | import spaces | |
| import torch | |
| from transformers import AutoProcessor, BarkModel | |
| import librosa | |
| import numpy as np | |
# Hugging Face Hub id of the Bark checkpoint used for speech synthesis.
VOICE_MODEL_ID = "suno/bark-small"

# Maps an application-level voice key to the Bark speaker preset it uses.
VOICE_DESIGN = {
    "sml": "v2/en_speaker_6",
    "chop": "v2/en_speaker_2",
    "agressor": "v2/en_speaker_1",
}

# Lazily-populated singletons; both stay None until get_model() first runs.
_processor = None
_model = None
def get_model():
    """Return the shared ``(processor, model)`` pair, loading it on first use.

    On the first call the processor and the Bark model named by
    ``VOICE_MODEL_ID`` are loaded, the model is moved to CUDA, and both
    are stored in the module-level ``_processor`` / ``_model`` globals.
    Every later call returns those cached objects without reloading.
    """
    global _model, _processor

    # Fast path: the model is already loaded.
    if _model is not None:
        return _processor, _model

    print(f"[voice] loading {VOICE_MODEL_ID} ...", flush=True)
    # Bark is supported by stock transformers; half precision keeps VRAM
    # usage down.
    _processor = AutoProcessor.from_pretrained(VOICE_MODEL_ID)
    half_precision_model = BarkModel.from_pretrained(
        VOICE_MODEL_ID, torch_dtype=torch.float16
    )
    _model = half_precision_model.to("cuda")
    print("[voice] model is ready.")
    return _processor, _model
def synthesize(text: str, voice_key: str, *, speed: float = 1.25):
    """Generate speech for *text* with the Bark voice selected by *voice_key*.

    Args:
        text: The sentence(s) to speak. Empty, ``None`` or whitespace-only
            text produces no audio.
        voice_key: Key into ``VOICE_DESIGN``. Unknown keys fall back to the
            ``"v2/en_speaker_6"`` preset.
        speed: Time-stretch factor passed to ``librosa.effects.time_stretch``
            (values above 1 shorten the clip without altering pitch).
            Defaults to 1.25, the value previously hard-coded here.

    Returns:
        ``None`` when there is nothing to speak, otherwise a
        ``(sample_rate, waveform)`` tuple where ``waveform`` is a float32
        NumPy array.
    """
    # Nothing to say: skip loading the model and running generation.
    if not text or not text.strip():
        return None

    processor, model = get_model()
    voice_preset = VOICE_DESIGN.get(voice_key, "v2/en_speaker_6")
    print(f"[voice] generating voice from {text}", flush=True)

    # Named `inputs` (not `input`) so the builtin is not shadowed.
    inputs = processor(text, voice_preset=voice_preset, return_tensors="pt").to("cuda")
    with torch.no_grad():
        speech_output = model.generate(**inputs)

    # The model runs in float16; cast to float32 so librosa/Numba can
    # process the waveform.
    wav = speech_output[0].cpu().numpy().astype(np.float32)

    # Speed up the voice to reduce duration without altering pitch.
    wav = librosa.effects.time_stretch(wav, rate=speed)

    # Prefer the rate advertised by the checkpoint; fall back to the
    # previously hard-coded 24 kHz if the config does not carry it.
    sample_rate = getattr(model.generation_config, "sample_rate", 24000)
    return (sample_rate, wav)
def warmup():
    """Load the voice model ahead of time, then log that warmup finished."""
    # Return value intentionally discarded: the call is made only so that
    # get_model() performs its one-time load now.
    get_model()
    print("[voice] warmup complete.", flush=True)