"""
Amebo Premium Voice - Hausa TTS with Warmth Processing

Built on Meta's MMS-TTS Hausa model.
"""

import numpy as np
import torch
from scipy import signal
from scipy.ndimage import uniform_filter1d

try:
    from transformers import AutoTokenizer, VitsModel
except ImportError:
    # transformers is only needed to load the model in ``__init__``. Keeping
    # the module importable without it lets the pure signal-processing path
    # (``add_warmth``) be used and tested on its own; the missing dependency
    # is reported when a voice is actually constructed.
    AutoTokenizer = None
    VitsModel = None


class AmeboPremiumVoice:
    """
    Amebo Premium Voice - Natural Nigerian Hausa TTS

    Features:
    - Native Hausa pronunciation (Meta MMS-TTS)
    - Warmth post-processing for natural sound
    - Fast inference (~100ms latency)
    - Lightweight (36MB model)

    Instances are callable: ``voice(text, **kwargs)`` is the same as
    ``voice.generate(text, **kwargs)``.
    """

    # Hugging Face checkpoint used for both the model and its tokenizer.
    MODEL_ID = 'facebook/mms-tts-hau'
    # Used when the loaded model config does not expose ``sampling_rate``.
    DEFAULT_SAMPLE_RATE = 16000

    # --- add_warmth tuning constants -------------------------------------
    FILTER_ORDER = 2                    # Butterworth order for both filters
    WARMTH_CUTOFF_HZ = 800.0            # low-pass corner of the warmth mix
    PRESENCE_BAND_HZ = (2000.0, 4000.0)  # band-pass edges of the presence mix
    WARMTH_MIX = 0.3                    # scale applied to ``warmth``
    PRESENCE_MIX = 0.2                  # scale applied to ``presence``
    COMP_THRESHOLD = 0.5                # compressor knee (normalized amplitude)
    COMP_RATIO = 3.0                    # compressor ratio above the knee
    SMOOTH_SIZE = 3                     # moving-average window, in samples
    OUTPUT_PEAK = 0.95                  # peak level of the returned audio

    def __init__(self, device='cuda' if torch.cuda.is_available() else 'cpu'):
        """
        Load the MMS-TTS Hausa model and tokenizer.

        Args:
            device: torch device string the model is moved to. The default
                is chosen once, when the class is defined: 'cuda' if
                available, otherwise 'cpu'.

        Raises:
            ImportError: if the ``transformers`` package is not installed.
        """
        if VitsModel is None or AutoTokenizer is None:
            raise ImportError(
                "The 'transformers' package is required to load "
                + self.MODEL_ID
            )

        self.device = device

        # Load base MMS-TTS Hausa model
        self.model = VitsModel.from_pretrained(self.MODEL_ID).to(device)
        self.tokenizer = AutoTokenizer.from_pretrained(self.MODEL_ID)
        self.model.eval()

        # Take the rate from the checkpoint so the filter design below and
        # the value reported to callers always match the generated audio.
        self.sample_rate = int(
            getattr(self.model.config, 'sampling_rate',
                    self.DEFAULT_SAMPLE_RATE)
        )

    @staticmethod
    def _zero_phase_filter(b, a, x):
        """Run ``filtfilt`` with its edge padding clamped to fit short input."""
        # filtfilt's default padlen is 3 * max(len(a), len(b)) and it raises
        # ValueError unless the signal is strictly longer than that. Clamping
        # keeps very short clips working; for any longer signal the value
        # equals the default, so the output is unchanged.
        default_padlen = 3 * max(len(a), len(b))
        padlen = min(default_padlen, x.shape[-1] - 1)
        return signal.filtfilt(b, a, x, padlen=padlen)

    def add_warmth(self, audio, warmth=0.3, presence=0.2):
        """
        Add warmth and presence to audio.

        Processing chain, in order: peak-normalize, mix in low-passed
        content, mix in band-passed content, static compression, 3-sample
        moving average, peak-normalize to 0.95.

        Args:
            audio: array-like of audio samples; filtering runs along the
                last axis. The input is not modified.
            warmth: amount of low-frequency boost (content below 800 Hz).
                Intended range 0.0-1.0; values <= 0 disable the stage.
                Values are not clamped.
            presence: amount of 2-4 kHz boost for clarity. Intended range
                0.0-1.0; values <= 0 disable the stage. Values are not
                clamped.

        Returns:
            float32 numpy array of processed audio with the same shape as
            the input (a scalar input becomes a length-1 array). Empty
            input is returned as an empty float32 array; all-zero input
            stays all-zero.
        """
        # Work on a float32 copy; atleast_1d guards against a 0-d array,
        # which the filters below cannot process.
        audio = np.atleast_1d(np.array(audio, dtype=np.float32))
        if audio.size == 0:
            # max() of an empty array raises; there is nothing to process.
            return audio

        # Normalize input so the fixed compressor threshold is meaningful.
        peak = np.abs(audio).max()
        if peak > 0:
            audio = audio / peak

        nyquist = self.sample_rate / 2

        # 1. Warmth: add a scaled low-passed copy of the signal, which
        #    raises everything below the cutoff relative to the rest.
        if warmth > 0:
            b_low, a_low = signal.butter(
                self.FILTER_ORDER,
                self.WARMTH_CUTOFF_HZ / nyquist,
                btype='low',
            )
            low_content = self._zero_phase_filter(b_low, a_low, audio)
            audio = audio + warmth * self.WARMTH_MIX * low_content

        # 2. Presence: add a scaled band-passed copy (2-4 kHz) for clarity.
        #    Note this stage sees the output of the warmth stage.
        if presence > 0:
            band = [edge / nyquist for edge in self.PRESENCE_BAND_HZ]
            b_mid, a_mid = signal.butter(
                self.FILTER_ORDER, band, btype='band'
            )
            mid_content = self._zero_phase_filter(b_mid, a_mid, audio)
            audio = audio + presence * self.PRESENCE_MIX * mid_content

        # 3. Gentle compression for consistency. This is a static,
        #    per-sample curve: magnitude above the threshold is divided by
        #    the ratio, and the sign is preserved via a gain multiplier.
        magnitude = np.abs(audio)
        loud = magnitude > self.COMP_THRESHOLD
        if np.any(loud):
            gain = np.ones_like(audio)
            compressed = (
                self.COMP_THRESHOLD
                + (magnitude[loud] - self.COMP_THRESHOLD) / self.COMP_RATIO
            )
            gain[loud] = compressed / magnitude[loud]
            audio = audio * gain

        # 4. Smooth any harsh transients with a short moving average.
        audio = uniform_filter1d(audio, size=self.SMOOTH_SIZE)

        # Normalize output, leaving a little headroom below full scale.
        peak = np.abs(audio).max()
        if peak > 0:
            audio = audio / peak * self.OUTPUT_PEAK

        return audio.astype(np.float32)

    def generate(self, text, warmth=0.3, presence=0.2):
        """
        Generate speech from Hausa text.

        Args:
            text: Hausa text to synthesize
            warmth: 0.0-1.0, voice warmth level (see ``add_warmth``)
            presence: 0.0-1.0, voice clarity level (see ``add_warmth``)

        Returns:
            dict with 'audio' (float32 numpy array) and 'sample_rate' (int)
        """
        # Tokenize and move the encoded batch to the model's device.
        inputs = self.tokenizer(text, return_tensors='pt').to(self.device)

        # Inference only: no gradients needed.
        with torch.no_grad():
            waveform = self.model(**inputs).waveform

        # Drop singleton dimensions and hand the samples to numpy.
        audio = waveform.squeeze().cpu().numpy()

        # Apply warmth processing
        audio = self.add_warmth(audio, warmth=warmth, presence=presence)

        return {
            'audio': audio,
            'sample_rate': self.sample_rate,
        }

    def __call__(self, text, **kwargs):
        """Shorthand for ``generate(text, **kwargs)``."""
        return self.generate(text, **kwargs)