|
|
""" |
|
|
Amebo Premium Voice - Hausa TTS with Warmth Processing |
|
|
Built on Meta's MMS-TTS Hausa model |
|
|
""" |
|
|
import torch |
|
|
import numpy as np |
|
|
from transformers import VitsModel, AutoTokenizer |
|
|
from scipy import signal |
|
|
from scipy.ndimage import uniform_filter1d |
|
|
|
|
|
class AmeboPremiumVoice:
    """
    Amebo Premium Voice - Natural Nigerian Hausa TTS.

    Features:
    - Native Hausa pronunciation (Meta MMS-TTS)
    - Warmth post-processing for natural sound
    - Fast inference (~100ms latency)
    - Lightweight (36MB model)
    """

    def __init__(self, device='cuda' if torch.cuda.is_available() else 'cpu'):
        """
        Load the MMS-TTS Hausa model and tokenizer.

        Args:
            device: torch device string; defaults to CUDA when available.
        """
        self.device = device

        self.model = VitsModel.from_pretrained('facebook/mms-tts-hau').to(device)
        self.tokenizer = AutoTokenizer.from_pretrained('facebook/mms-tts-hau')
        self.model.eval()  # inference only; disables dropout etc.

        # Read the rate from the checkpoint config rather than hard-coding it;
        # MMS-TTS ships at 16 kHz, which is kept as the fallback.
        self.sample_rate = getattr(self.model.config, 'sampling_rate', 16000)

    def add_warmth(self, audio, warmth=0.3, presence=0.2):
        """
        Add warmth and presence to audio.

        Processing chain: peak-normalize -> low-band blend (warmth) ->
        2-4 kHz band blend (presence) -> 3:1 compressor above 0.5 ->
        3-sample smoothing -> re-normalize to 0.95 peak.

        Args:
            audio: numpy array of mono audio samples
            warmth: 0.0-1.0, amount of low-mid boost (content below 800 Hz)
            presence: 0.0-1.0, amount of high-mid clarity (2-4 kHz band)

        Returns:
            Processed float32 audio with warmth, peak-normalized to 0.95.
            An empty input is returned unchanged.
        """
        audio = np.asarray(audio, dtype=np.float32)

        # Guard: np.abs(audio).max() raises on a zero-length array.
        if audio.size == 0:
            return audio

        # Normalize to unit peak so the filter blend amounts are predictable.
        peak = np.abs(audio).max()
        if peak > 0:
            audio = audio / peak

        nyquist = self.sample_rate / 2

        # filtfilt's default padlen for these order-2 filters is 9, and it
        # requires the signal to be longer than that — skip EQ on tiny inputs.
        can_filter = audio.size > 9

        # Warmth: blend a low-passed (<800 Hz) copy back in to thicken lows.
        if warmth > 0 and can_filter:
            b_low, a_low = signal.butter(2, 800 / nyquist, btype='low')
            low_content = signal.filtfilt(b_low, a_low, audio)
            audio = audio + warmth * 0.3 * low_content

        # Presence: blend a 2-4 kHz band-passed copy back in for clarity.
        if presence > 0 and can_filter:
            b_mid, a_mid = signal.butter(
                2, [2000 / nyquist, 4000 / nyquist], btype='band')
            mid_content = signal.filtfilt(b_mid, a_mid, audio)
            audio = audio + presence * 0.2 * mid_content

        # Hard-knee compressor: 3:1 ratio above 0.5 tames peaks the EQ
        # blends may have pushed past the original normalized level.
        threshold = 0.5
        ratio = 3.0
        audio_abs = np.abs(audio)
        mask = audio_abs > threshold
        if np.any(mask):
            gain_reduction = np.ones_like(audio)
            gain_reduction[mask] = threshold + (audio_abs[mask] - threshold) / ratio
            gain_reduction[mask] = gain_reduction[mask] / audio_abs[mask]
            audio = audio * gain_reduction

        # Light 3-sample moving average to soften residual harshness.
        audio = uniform_filter1d(audio, size=3)

        # Final peak normalization with 0.05 of headroom against clipping.
        peak = np.abs(audio).max()
        if peak > 0:
            audio = audio / peak * 0.95

        return audio.astype(np.float32)

    def generate(self, text, warmth=0.3, presence=0.2):
        """
        Generate speech from Hausa text.

        Args:
            text: Hausa text to synthesize
            warmth: 0.0-1.0, voice warmth level
            presence: 0.0-1.0, voice clarity level

        Returns:
            dict with 'audio' (float32 numpy array) and 'sample_rate' (int)
        """
        inputs = self.tokenizer(text, return_tensors='pt').to(self.device)

        # Inference only — no gradients needed.
        with torch.no_grad():
            output = self.model(**inputs).waveform

        # (1, samples) tensor -> 1-D numpy array on CPU.
        audio = output.squeeze().cpu().numpy()

        audio = self.add_warmth(audio, warmth=warmth, presence=presence)

        return {
            'audio': audio,
            'sample_rate': self.sample_rate
        }

    def __call__(self, text, **kwargs):
        """Convenience alias: instance(text, ...) == instance.generate(text, ...)."""
        return self.generate(text, **kwargs)
|
|
|