# File size: 3,863 Bytes
#
# 498dce5

"""
Amebo Premium Voice - Hausa TTS with Warmth Processing
Built on Meta's MMS-TTS Hausa model
"""
import torch
import numpy as np
from transformers import VitsModel, AutoTokenizer
from scipy import signal
from scipy.ndimage import uniform_filter1d

class AmeboPremiumVoice:
    """
    Amebo Premium Voice - Hausa TTS with warmth post-processing.

    Wraps the ``facebook/mms-tts-hau`` VITS checkpoint and passes the
    synthesized waveform through a small post-processing chain: low-end
    boost, presence boost, static compression, light smoothing and peak
    normalisation.

    Features (as described by the original author; the latency and size
    figures are not measured anywhere in this code):
    - Native Hausa pronunciation (Meta MMS-TTS)
    - Warmth post-processing for natural sound
    - Fast inference (~100ms latency)
    - Lightweight (36MB model)
    """

    # Sample rate assumed when the model config does not expose one.
    DEFAULT_SAMPLE_RATE = 16000

    def __init__(self, device='cuda' if torch.cuda.is_available() else 'cpu'):
        """
        Load the model and tokenizer and move the model to ``device``.

        Args:
            device: torch device string; defaults to ``'cuda'`` when
                available (evaluated once, at class-definition time),
                otherwise ``'cpu'``.
        """
        self.device = device

        # Load base MMS-TTS Hausa model
        self.model = VitsModel.from_pretrained('facebook/mms-tts-hau').to(device)
        self.tokenizer = AutoTokenizer.from_pretrained('facebook/mms-tts-hau')
        self.model.eval()

        # Take the sample rate from the checkpoint so the filter cutoffs and
        # the value reported to callers always match the generated waveform.
        config = getattr(self.model, 'config', None)
        self.sample_rate = int(
            getattr(config, 'sampling_rate', None) or self.DEFAULT_SAMPLE_RATE
        )

    @staticmethod
    def _zero_phase_filter(b, a, audio):
        """Apply ``filtfilt`` with edge padding shrunk to fit short signals."""
        # filtfilt requires len(audio) > padlen; its default padlen is
        # 3 * max(len(a), len(b)). For signals at least that long this is
        # exactly the default, so normal-length audio is filtered as before.
        default_padlen = 3 * max(len(a), len(b))
        padlen = min(default_padlen, audio.shape[-1] - 1)
        return signal.filtfilt(b, a, audio, padlen=padlen)

    def add_warmth(self, audio, warmth=0.3, presence=0.2):
        """
        Add warmth and presence to audio.

        Processing steps, in order: peak-normalise the input, mix in a
        low-passed copy (below ~800 Hz), mix in a band-passed copy
        (2-4 kHz), compress samples above a fixed threshold, smooth with a
        3-sample moving average, then peak-normalise to 0.95.

        Args:
            audio: array-like of audio samples (filtered along the last axis)
            warmth: 0.0-1.0, amount of low-end boost; values <= 0 skip the stage
            presence: 0.0-1.0, amount of high-mid clarity; values <= 0 skip
                the stage

        Returns:
            float32 numpy array of processed audio with the same length as
            the input. Empty input yields an empty array; all-zero input
            stays all-zero.
        """
        # Accept lists / 0-d arrays as well as ndarrays.
        audio = np.atleast_1d(np.asarray(audio, dtype=np.float32))
        if audio.size == 0:
            # Nothing to process; max() on an empty array would raise.
            return audio.astype(np.float32)

        # Normalize input
        max_val = np.abs(audio).max()
        if max_val > 0:
            audio = audio / max_val

        nyquist = self.sample_rate / 2

        # 1. Warmth: add a scaled low-passed copy of the signal, which lifts
        #    everything below ~800 Hz. Skipped if the cutoff is not below
        #    Nyquist (butter would reject it).
        if warmth > 0 and 800 < nyquist:
            b_low, a_low = signal.butter(2, 800 / nyquist, btype='low')
            low_content = self._zero_phase_filter(b_low, a_low, audio)
            audio = audio + warmth * 0.3 * low_content

        # 2. Presence boost (2-4 kHz) for clarity; same Nyquist guard.
        if presence > 0 and 4000 < nyquist:
            b_mid, a_mid = signal.butter(
                2, [2000 / nyquist, 4000 / nyquist], btype='band'
            )
            mid_content = self._zero_phase_filter(b_mid, a_mid, audio)
            audio = audio + presence * 0.2 * mid_content

        # 3. Gentle compression for consistency: the part of each sample's
        #    magnitude above the threshold is divided by the ratio.
        threshold = 0.5
        ratio = 3.0
        audio_abs = np.abs(audio)
        mask = audio_abs > threshold
        if np.any(mask):
            gain_reduction = np.ones_like(audio)
            gain_reduction[mask] = threshold + (audio_abs[mask] - threshold) / ratio
            gain_reduction[mask] = gain_reduction[mask] / audio_abs[mask]
            audio = audio * gain_reduction

        # 4. Smooth any harsh transients (3-sample moving average)
        audio = uniform_filter1d(audio, size=3)

        # Normalize output to a 0.95 peak, leaving a little headroom
        max_val = np.abs(audio).max()
        if max_val > 0:
            audio = audio / max_val * 0.95

        return audio.astype(np.float32)

    def generate(self, text, warmth=0.3, presence=0.2):
        """
        Generate speech from Hausa text.

        Args:
            text: Hausa text to synthesize
            warmth: 0.0-1.0, voice warmth level (forwarded to ``add_warmth``)
            presence: 0.0-1.0, voice clarity level (forwarded to ``add_warmth``)

        Returns:
            dict with 'audio' (float32 numpy array) and 'sample_rate' (int)
        """
        # Tokenize
        inputs = self.tokenizer(text, return_tensors='pt').to(self.device)

        # Generate without tracking gradients (inference only)
        with torch.no_grad():
            output = self.model(**inputs).waveform

        # Drop singleton dimensions and move to host memory
        audio = output.squeeze().cpu().numpy()

        # Apply warmth processing
        audio = self.add_warmth(audio, warmth=warmth, presence=presence)

        return {
            'audio': audio,
            'sample_rate': self.sample_rate
        }

    def __call__(self, text, **kwargs):
        """Shorthand for ``generate(text, **kwargs)``."""
        return self.generate(text, **kwargs)