|
|
""" |
|
|
Amebo Premium Voice - Hausa TTS with Warmth Processing |
|
|
Built on Meta's MMS-TTS Hausa model |
|
|
""" |
|
|
import torch |
|
|
import numpy as np |
|
|
from transformers import VitsModel, AutoTokenizer |
|
|
from scipy import signal |
|
|
from scipy.ndimage import uniform_filter1d |
|
|
|
|
|
class AmeboPremiumVoice:
    """
    Amebo Premium Voice - Natural Nigerian Hausa TTS.

    Features:
    - Native Hausa pronunciation (Meta MMS-TTS)
    - Warmth post-processing for natural sound
    - Fast inference (~100ms latency)
    - Lightweight (36MB model)
    """

    def __init__(self, device='cuda' if torch.cuda.is_available() else 'cpu'):
        """
        Load the MMS-TTS Hausa model and tokenizer.

        Args:
            device: torch device string; defaults to CUDA when available.
        """
        self.device = device

        self.model = VitsModel.from_pretrained('facebook/mms-tts-hau').to(device)
        self.tokenizer = AutoTokenizer.from_pretrained('facebook/mms-tts-hau')
        self.model.eval()  # inference only; disables dropout etc.

        # Read the rate from the checkpoint config rather than hard-coding it;
        # MMS-TTS ships at 16 kHz, which is kept as the fallback.
        self.sample_rate = getattr(self.model.config, 'sampling_rate', 16000)

    def add_warmth(self, audio, warmth=0.3, presence=0.2):
        """
        Add warmth and presence to audio.

        Processing chain: peak-normalize -> low-band blend (warmth) ->
        2-4 kHz band blend (presence) -> 3:1 compressor above 0.5 ->
        3-sample smoothing -> re-normalize to 0.95 peak.

        Args:
            audio: numpy array of mono audio samples
            warmth: 0.0-1.0, amount of low-mid boost (content below 800 Hz)
            presence: 0.0-1.0, amount of high-mid clarity (2-4 kHz band)

        Returns:
            Processed float32 audio with warmth, peak-normalized to 0.95.
            An empty input is returned unchanged.
        """
        audio = np.asarray(audio, dtype=np.float32)

        # Guard: np.abs(audio).max() raises on a zero-length array.
        if audio.size == 0:
            return audio

        # Normalize to unit peak so the filter blend amounts are predictable.
        peak = np.abs(audio).max()
        if peak > 0:
            audio = audio / peak

        nyquist = self.sample_rate / 2

        # filtfilt's default padlen for these order-2 filters is 9, and it
        # requires the signal to be longer than that — skip EQ on tiny inputs.
        can_filter = audio.size > 9

        # Warmth: blend a low-passed (<800 Hz) copy back in to thicken lows.
        if warmth > 0 and can_filter:
            b_low, a_low = signal.butter(2, 800 / nyquist, btype='low')
            low_content = signal.filtfilt(b_low, a_low, audio)
            audio = audio + warmth * 0.3 * low_content

        # Presence: blend a 2-4 kHz band-passed copy back in for clarity.
        if presence > 0 and can_filter:
            b_mid, a_mid = signal.butter(
                2, [2000 / nyquist, 4000 / nyquist], btype='band')
            mid_content = signal.filtfilt(b_mid, a_mid, audio)
            audio = audio + presence * 0.2 * mid_content

        # Hard-knee compressor: 3:1 ratio above 0.5 tames peaks the EQ
        # blends may have pushed past the original normalized level.
        threshold = 0.5
        ratio = 3.0
        audio_abs = np.abs(audio)
        mask = audio_abs > threshold
        if np.any(mask):
            gain_reduction = np.ones_like(audio)
            gain_reduction[mask] = threshold + (audio_abs[mask] - threshold) / ratio
            gain_reduction[mask] = gain_reduction[mask] / audio_abs[mask]
            audio = audio * gain_reduction

        # Light 3-sample moving average to soften residual harshness.
        audio = uniform_filter1d(audio, size=3)

        # Final peak normalization with 0.05 of headroom against clipping.
        peak = np.abs(audio).max()
        if peak > 0:
            audio = audio / peak * 0.95

        return audio.astype(np.float32)

    def generate(self, text, warmth=0.3, presence=0.2):
        """
        Generate speech from Hausa text.

        Args:
            text: Hausa text to synthesize
            warmth: 0.0-1.0, voice warmth level
            presence: 0.0-1.0, voice clarity level

        Returns:
            dict with 'audio' (float32 numpy array) and 'sample_rate' (int)
        """
        inputs = self.tokenizer(text, return_tensors='pt').to(self.device)

        # Inference only — no gradients needed.
        with torch.no_grad():
            output = self.model(**inputs).waveform

        # (1, samples) tensor -> 1-D numpy array on CPU.
        audio = output.squeeze().cpu().numpy()

        audio = self.add_warmth(audio, warmth=warmth, presence=presence)

        return {
            'audio': audio,
            'sample_rate': self.sample_rate
        }

    def __call__(self, text, **kwargs):
        """Convenience alias: instance(text, ...) == instance.generate(text, ...)."""
        return self.generate(text, **kwargs)
|
|
|