|
|
""" |
|
|
Custom Inference Handler for Amebo Premium Voice |
|
|
Enables HuggingFace Inference API and Dedicated Endpoints |
|
|
""" |
|
|
import torch |
|
|
import numpy as np |
|
|
from transformers import VitsModel, AutoTokenizer |
|
|
from scipy import signal |
|
|
from scipy.ndimage import uniform_filter1d |
|
|
import base64 |
|
|
import io |
|
|
import soundfile as sf |
|
|
|
|
|
class EndpointHandler:
    """Inference handler for the Amebo premium Hausa voice.

    Implements the HuggingFace custom-handler protocol (Inference API /
    Dedicated Endpoints): ``__init__`` loads facebook/mms-tts-hau once,
    ``__call__`` serves one text-to-speech request and applies a light
    mastering chain (warmth/presence EQ, soft compression, smoothing)
    to the generated waveform.
    """

    def __init__(self, path="."):
        """Load the VITS model and tokenizer, preferring GPU.

        Args:
            path: Repository path supplied by the endpoint runtime
                (unused here; weights are pulled from the Hub).
        """
        self.device = "cuda" if torch.cuda.is_available() else "cpu"
        # MMS-TTS models generate audio at 16 kHz.
        self.sample_rate = 16000

        self.model = VitsModel.from_pretrained("facebook/mms-tts-hau").to(self.device)
        self.tokenizer = AutoTokenizer.from_pretrained("facebook/mms-tts-hau")
        self.model.eval()

    def _safe_filtfilt(self, b, a, audio):
        """Zero-phase filter that degrades gracefully on short signals.

        ``scipy.signal.filtfilt`` raises when the signal is not longer
        than its default edge padding (3 * max(len(a), len(b)) samples);
        for such short clips, return the input unchanged instead of
        failing the whole request.
        """
        padlen = 3 * max(len(a), len(b))
        if audio.size <= padlen:
            return audio
        return signal.filtfilt(b, a, audio)

    def add_warmth(self, audio, warmth=0.3, presence=0.2):
        """Apply a light mastering chain to a mono waveform.

        Steps: peak-normalize, boost lows (< 800 Hz, "warmth") and mids
        (2-4 kHz, "presence"), soft-compress peaks above 0.5 at 3:1,
        smooth with a 3-tap moving average, then re-normalize to a
        0.95 peak for clipping headroom.

        Args:
            audio: 1-D waveform, any numeric dtype.
            warmth: Low-band boost amount; 0 disables the stage.
            presence: Mid-band boost amount; 0 disables the stage.

        Returns:
            Processed float32 waveform of the same length.
        """
        audio = np.asarray(audio, dtype=np.float32)
        if audio.size == 0:
            # Nothing to process; avoids max()/filter errors on empty input.
            return audio

        max_val = np.abs(audio).max()
        if max_val > 0:
            audio = audio / max_val

        nyquist = self.sample_rate / 2

        # Warmth: mix a low-passed copy (< 800 Hz) back into the signal.
        if warmth > 0:
            b_low, a_low = signal.butter(2, 800 / nyquist, btype='low')
            low_content = self._safe_filtfilt(b_low, a_low, audio)
            audio = audio + warmth * 0.3 * low_content

        # Presence: mix a band-passed copy (2-4 kHz) back in.
        if presence > 0:
            b_mid, a_mid = signal.butter(2, [2000 / nyquist, 4000 / nyquist],
                                         btype='band')
            mid_content = self._safe_filtfilt(b_mid, a_mid, audio)
            audio = audio + presence * 0.2 * mid_content

        # Static compressor: 3:1 ratio on samples above a 0.5 threshold.
        threshold = 0.5
        ratio = 3.0
        audio_abs = np.abs(audio)
        mask = audio_abs > threshold
        if np.any(mask):
            gain_reduction = np.ones_like(audio)
            gain_reduction[mask] = threshold + (audio_abs[mask] - threshold) / ratio
            gain_reduction[mask] = gain_reduction[mask] / audio_abs[mask]
            audio = audio * gain_reduction

        # 3-tap moving average knocks down residual harshness.
        audio = uniform_filter1d(audio, size=3)

        # Final normalization with headroom to avoid clipping.
        max_val = np.abs(audio).max()
        if max_val > 0:
            audio = audio / max_val * 0.95

        return audio.astype(np.float32)

    def __call__(self, data):
        """Process one inference request.

        Args:
            data: dict with 'inputs' (text to synthesize) and optional
                'parameters' supporting 'warmth' (float), 'presence'
                (float), and 'format' ('base64' for WAV, anything else
                for a raw sample list).

        Returns:
            On success, a dict with the audio (base64 WAV or sample
            list) and 'sample_rate'; on missing input, {'error': ...}.
        """
        inputs = data.get("inputs", "")
        if not inputs:
            return {"error": "No input text provided"}

        params = data.get("parameters", {})
        warmth = params.get("warmth", 0.3)
        presence = params.get("presence", 0.2)
        return_format = params.get("format", "base64")

        tokens = self.tokenizer(inputs, return_tensors="pt").to(self.device)

        # Inference only; no gradients needed.
        with torch.no_grad():
            output = self.model(**tokens).waveform

        audio = output.squeeze().cpu().numpy()
        audio = self.add_warmth(audio, warmth=warmth, presence=presence)

        if return_format == "base64":
            buffer = io.BytesIO()
            sf.write(buffer, audio, self.sample_rate, format="WAV")
            buffer.seek(0)
            audio_base64 = base64.b64encode(buffer.read()).decode("utf-8")
            return {
                "audio": audio_base64,
                "sample_rate": self.sample_rate,
                "format": "wav",
                "encoding": "base64",
            }
        else:
            return {
                "audio": audio.tolist(),
                "sample_rate": self.sample_rate,
            }
|
|
|