Spaces:

build-small-hackathon
/

Voinal

Running on Zero

Voinal / tts_model.py

GovIndLok

fix: cast audio output to float32 and adjust server binding based on platform environment

552a97b 19 days ago

1.63 kB

	import spaces
	import torch
	from transformers import AutoProcessor, BarkModel
	import librosa
	import numpy as np

	# Hugging Face checkpoint id that get_model() passes to from_pretrained().
	VOICE_MODEL_ID = "suno/bark-small"

	# Module-level cache filled by get_model(); both stay None until the first
	# successful load.
	_model = None
	_processor = None

	# Voice key -> Bark speaker preset, looked up by synthesize().
	# NOTE(review): "agressor" is misspelled, but it is a runtime lookup key, so
	# it is kept as-is; rename only together with every caller that passes it.
	VOICE_DESIGN = {
	    "sml": "v2/en_speaker_6",
	    "chop": "v2/en_speaker_2",
	    "agressor": "v2/en_speaker_1",
	}


	def get_model():
	    """Return the shared ``(processor, model)`` pair, loading it on first use.

	    On the first call the checkpoint named by ``VOICE_MODEL_ID`` is loaded
	    (model weights in float16 to save VRAM), moved to ``"cuda"`` and stored
	    in the module-level ``_processor`` / ``_model`` globals. Later calls
	    return the cached objects without reloading.

	    Returns:
	        tuple: ``(processor, model)``, in that order.
	    """
	    global _model, _processor
	    if _model is None:
	        print(f"[voice] loading {VOICE_MODEL_ID} ...", flush=True)
	        # Load into locals and publish both globals together, so a failure
	        # while loading the model never leaves a processor cached without
	        # its model.
	        processor = AutoProcessor.from_pretrained(VOICE_MODEL_ID)
	        # Bark runs natively in standard transformers; float16 saves VRAM.
	        model = BarkModel.from_pretrained(
	            VOICE_MODEL_ID, torch_dtype=torch.float16
	        ).to("cuda")
	        _processor, _model = processor, model
	        # Flushed like every other [voice] log line in this module.
	        print("[voice] model is ready.", flush=True)
	    return _processor, _model


	@spaces.GPU(duration=50)
	def synthesize(text: str, voice_key: str) -> "tuple[int, np.ndarray] | None":
	    """Generate speech for *text* using the voice selected by *voice_key*.

	    Args:
	        text: Text to speak. Empty or whitespace-only text produces no audio.
	        voice_key: Key into ``VOICE_DESIGN``; an unknown key falls back to
	            the ``"v2/en_speaker_6"`` preset.

	    Returns:
	        ``(sample_rate, waveform)``, where ``waveform`` is a float32 NumPy
	        array time-stretched by 1.25x, or ``None`` when there is nothing to
	        speak.
	    """
	    # Whitespace-only text carries nothing to speak; treat it like empty
	    # text instead of sending it to the model.
	    if not text or not text.strip():
	        return None
	    processor, model = get_model()
	    voice_preset = VOICE_DESIGN.get(voice_key, "v2/en_speaker_6")

	    print(f"[voice] generating voice from {text}", flush=True)
	    # Named ``inputs`` so the ``input`` builtin is not shadowed.
	    inputs = processor(
	        text, voice_preset=voice_preset, return_tensors="pt"
	    ).to("cuda")

	    with torch.no_grad():
	        speech_output = model.generate(**inputs)

	    # The model is loaded in float16; cast the first generated waveform to
	    # float32 before handing it to librosa (Numba cannot process float16).
	    wav = speech_output[0].cpu().numpy().astype(np.float32)

	    # Speed up the voice by 1.25x to reduce duration without altering pitch.
	    wav = librosa.effects.time_stretch(wav, rate=1.25)

	    # Report the rate the loaded model actually generates at, so the value
	    # stays right if VOICE_MODEL_ID changes; 24000 is the previous
	    # hard-coded value, kept as the fallback.
	    sample_rate = int(getattr(model.generation_config, "sample_rate", 24000))
	    return (sample_rate, wav)


	@spaces.GPU(duration=150)
	def warmup() -> None:
	    """Call ``get_model()`` so the voice model is loaded, then log completion."""
	    # Only the loading side effect matters; the returned pair is not used.
	    get_model()
	    print("[voice] warmup complete.", flush=True)