amebo-premium-voice / handler.py
"""
Custom Inference Handler for Amebo Premium Voice
Enables HuggingFace Inference API and Dedicated Endpoints
"""
import torch
import numpy as np
from transformers import VitsModel, AutoTokenizer
from scipy import signal
from scipy.ndimage import uniform_filter1d
import base64
import io
import soundfile as sf
class EndpointHandler:
def __init__(self, path="."):
self.device = "cuda" if torch.cuda.is_available() else "cpu"
        self.sample_rate = 16000  # facebook/mms-tts-hau generates 16 kHz audio
# Load MMS-TTS Hausa
self.model = VitsModel.from_pretrained("facebook/mms-tts-hau").to(self.device)
self.tokenizer = AutoTokenizer.from_pretrained("facebook/mms-tts-hau")
self.model.eval()
def add_warmth(self, audio, warmth=0.3, presence=0.2):
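        """
        Post-process the raw model waveform: boost low-mids (warmth) and the
        2-4 kHz band (presence), apply gentle compression above 0.5 with a
        3:1 ratio, smooth transients, and normalize the peak to 0.95.
        """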
audio = audio.astype(np.float32)
max_val = np.abs(audio).max()
if max_val > 0:
audio = audio / max_val
# Low-mid boost for warmth
if warmth > 0:
b_low, a_low = signal.butter(2, 800 / (self.sample_rate / 2), btype='low')
low_content = signal.filtfilt(b_low, a_low, audio)
audio = audio + warmth * 0.3 * low_content
# Presence boost for clarity
if presence > 0:
b_mid, a_mid = signal.butter(2, [2000 / (self.sample_rate / 2),
4000 / (self.sample_rate / 2)], btype='band')
mid_content = signal.filtfilt(b_mid, a_mid, audio)
audio = audio + presence * 0.2 * mid_content
# Gentle compression
threshold = 0.5
ratio = 3.0
audio_abs = np.abs(audio)
mask = audio_abs > threshold
if np.any(mask):
gain_reduction = np.ones_like(audio)
gain_reduction[mask] = threshold + (audio_abs[mask] - threshold) / ratio
gain_reduction[mask] = gain_reduction[mask] / audio_abs[mask]
audio = audio * gain_reduction
# Smooth transients
audio = uniform_filter1d(audio, size=3)
# Normalize
max_val = np.abs(audio).max()
if max_val > 0:
audio = audio / max_val * 0.95
return audio.astype(np.float32)
def __call__(self, data):
"""
Process inference request
Args:
data: dict with 'inputs' (text) and optional 'parameters'
Returns:
Audio as base64 encoded WAV or raw bytes
"""
# Get input text
inputs = data.get("inputs", "")
if not inputs:
return {"error": "No input text provided"}
# Get parameters
params = data.get("parameters", {})
warmth = params.get("warmth", 0.3)
presence = params.get("presence", 0.2)
return_format = params.get("format", "base64")
# Tokenize
tokens = self.tokenizer(inputs, return_tensors="pt").to(self.device)
# Generate audio
with torch.no_grad():
output = self.model(**tokens).waveform
audio = output.squeeze().cpu().numpy()
# Apply warmth
audio = self.add_warmth(audio, warmth=warmth, presence=presence)
        # Encode as WAV and return base64 (default), otherwise return raw samples
if return_format == "base64":
buffer = io.BytesIO()
sf.write(buffer, audio, self.sample_rate, format="WAV")
buffer.seek(0)
audio_base64 = base64.b64encode(buffer.read()).decode("utf-8")
return {
"audio": audio_base64,
"sample_rate": self.sample_rate,
"format": "wav",
"encoding": "base64"
}
else:
return {
"audio": audio.tolist(),
"sample_rate": self.sample_rate
}
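

# --- Example usage: a minimal local sketch, not part of the endpoint contract ---
# Running this file directly instantiates the handler and synthesizes a short
# Hausa phrase. It assumes the facebook/mms-tts-hau weights can be downloaded
# and that the current directory is writable; the output path "sample.wav" and
# the example text are placeholders.
if __name__ == "__main__":
    handler = EndpointHandler()
    result = handler({
        "inputs": "Sannu",  # example Hausa text ("hello")
        "parameters": {"warmth": 0.3, "presence": 0.2, "format": "base64"},
    })
    # Decode the base64 WAV payload and write it to disk
    wav_bytes = base64.b64decode(result["audio"])
    with open("sample.wav", "wb") as f:
        f.write(wav_bytes)
    print(f"Wrote sample.wav at {result['sample_rate']} Hz")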