# Amebo AI
# Initial upload: Amebo Premium Voice - Hausa TTS
# 498dce5
"""
Amebo Premium Voice - Hausa TTS with Warmth Processing
Built on Meta's MMS-TTS Hausa model
"""
import torch
import numpy as np
from transformers import VitsModel, AutoTokenizer
from scipy import signal
from scipy.ndimage import uniform_filter1d
class AmeboPremiumVoice:
    """
    Amebo Premium Voice - Natural Nigerian Hausa TTS.

    Wraps the ``facebook/mms-tts-hau`` VITS checkpoint and runs its raw
    waveform through a small post-processing chain (see ``add_warmth``).

    Features (figures are the original author's claims, not measured here):
    - Native Hausa pronunciation (Meta MMS-TTS)
    - Warmth post-processing for natural sound
    - Fast inference (~100ms latency)
    - Lightweight (36MB model)

    Instances are callable: ``voice(text, **kwargs)`` forwards to
    ``voice.generate(text, **kwargs)``.
    """

    # Hugging Face Hub id of the checkpoint loaded for model and tokenizer.
    _MODEL_ID = 'facebook/mms-tts-hau'
    # Used when the loaded model config does not expose a sampling rate.
    _DEFAULT_SAMPLE_RATE = 16000
    # Static compressor: magnitudes above the threshold are scaled down so
    # that the excess over the threshold is divided by the ratio.
    _COMP_THRESHOLD = 0.5
    _COMP_RATIO = 3.0
    # Peak magnitude of the returned signal (leaves a little headroom).
    _OUTPUT_PEAK = 0.95

    def __init__(self, device='cuda' if torch.cuda.is_available() else 'cpu'):
        """
        Load the MMS-TTS Hausa model and tokenizer.

        Args:
            device: torch device string the model is moved to. Defaults to
                'cuda' when CUDA is available at import time, else 'cpu'.
        """
        self.device = device
        # Load base MMS-TTS Hausa model
        self.model = VitsModel.from_pretrained(self._MODEL_ID).to(device)
        self.tokenizer = AutoTokenizer.from_pretrained(self._MODEL_ID)
        self.model.eval()
        # Take the output rate from the checkpoint itself so the filter
        # corner frequencies and the reported 'sample_rate' cannot drift
        # from what the model actually produces.
        self.sample_rate = int(
            getattr(self.model.config, 'sampling_rate', None)
            or self._DEFAULT_SAMPLE_RATE
        )

    @staticmethod
    def _zero_phase(b, a, x):
        """
        Apply ``signal.filtfilt`` without failing on short inputs.

        filtfilt pads each end with ``3 * max(len(a), len(b))`` samples by
        default and raises ValueError when the input is not longer than
        that. Clamping the pad length to ``len(x) - 1`` keeps the default
        behaviour for normal-length audio and degrades gracefully for very
        short clips.
        """
        default_pad = 3 * max(len(a), len(b))
        padlen = min(default_pad, x.shape[-1] - 1)
        return signal.filtfilt(b, a, x, padlen=padlen)

    def add_warmth(self, audio, warmth=0.3, presence=0.2):
        """
        Add warmth and presence to audio.

        Processing chain: peak-normalize, add scaled low-passed content,
        add scaled 2-4 kHz content, compress peaks, 3-tap smoothing,
        peak-normalize to 0.95.

        Args:
            audio: array-like of audio samples (a float32 copy is made; the
                input is not modified)
            warmth: 0.0-1.0, amount of low-frequency boost (values <= 0
                skip the stage)
            presence: 0.0-1.0, amount of 2-4 kHz clarity boost (values <= 0
                skip the stage)

        Returns:
            float32 numpy array with the same shape as the input. An empty
            input is returned as an empty array; an all-zero input stays
            all-zero.
        """
        # Always work on a private float32 copy.
        audio = np.array(audio, dtype=np.float32)
        if audio.size == 0:
            # Nothing to process; max() of an empty array would raise.
            return audio

        # Normalize input so the fixed compressor threshold is meaningful.
        peak = np.abs(audio).max()
        if peak > 0:
            audio = audio / peak

        nyquist = self.sample_rate / 2

        # 1. Warmth: mix in a scaled copy of everything below ~800 Hz.
        #    This acts like a shelf boost of the low end.
        if warmth > 0:
            b_low, a_low = signal.butter(2, 800 / nyquist, btype='low')
            low_content = self._zero_phase(b_low, a_low, audio)
            audio = audio + warmth * 0.3 * low_content

        # 2. Presence: mix in a scaled copy of the 2-4 kHz band for clarity.
        if presence > 0:
            b_mid, a_mid = signal.butter(
                2, [2000 / nyquist, 4000 / nyquist], btype='band'
            )
            mid_content = self._zero_phase(b_mid, a_mid, audio)
            audio = audio + presence * 0.2 * mid_content

        # 3. Gentle compression for consistency: per-sample gain that maps
        #    |x| > threshold to threshold + (|x| - threshold) / ratio.
        threshold = self._COMP_THRESHOLD
        ratio = self._COMP_RATIO
        magnitude = np.abs(audio)
        loud = magnitude > threshold
        if np.any(loud):
            gain = np.ones_like(audio)
            gain[loud] = (
                threshold + (magnitude[loud] - threshold) / ratio
            ) / magnitude[loud]
            audio = audio * gain

        # 4. Smooth any harsh transients with a 3-sample moving average.
        audio = uniform_filter1d(audio, size=3)

        # Normalize output to a fixed peak.
        peak = np.abs(audio).max()
        if peak > 0:
            audio = audio / peak * self._OUTPUT_PEAK
        return audio.astype(np.float32)

    def generate(self, text, warmth=0.3, presence=0.2):
        """
        Generate speech from Hausa text.

        Args:
            text: Hausa text to synthesize
            warmth: 0.0-1.0, voice warmth level (passed to add_warmth)
            presence: 0.0-1.0, voice clarity level (passed to add_warmth)

        Returns:
            dict with 'audio' (float32 numpy array) and 'sample_rate' (int)
        """
        # Tokenize and move the tensors next to the model.
        inputs = self.tokenizer(text, return_tensors='pt').to(self.device)

        # Inference only: no autograd bookkeeping needed.
        with torch.no_grad():
            waveform = self.model(**inputs).waveform

        # Drop singleton dimensions and hand the samples to numpy.
        audio = waveform.squeeze().cpu().numpy()

        # Apply warmth processing
        audio = self.add_warmth(audio, warmth=warmth, presence=presence)
        return {
            'audio': audio,
            'sample_rate': self.sample_rate,
        }

    def __call__(self, text, **kwargs):
        """Shorthand for ``generate(text, **kwargs)``."""
        return self.generate(text, **kwargs)