Amebo AI committed on
Commit ·
498dce5
1
Parent(s): 730ba68
Initial upload: Amebo Premium Voice - Hausa TTS
Browse files- README.md +110 -0
- amebo_warm_01.wav +0 -0
- amebo_warm_02.wav +0 -0
- amebo_warm_03.wav +0 -0
- handler.py +113 -0
- model.py +115 -0
- requirements.txt +5 -0
README.md
ADDED
|
@@ -0,0 +1,110 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
---
|
| 2 |
+
language:
|
| 3 |
+
- ha
|
| 4 |
+
license: apache-2.0
|
| 5 |
+
library_name: transformers
|
| 6 |
+
pipeline_tag: text-to-speech
|
| 7 |
+
tags:
|
| 8 |
+
- hausa
|
| 9 |
+
- tts
|
| 10 |
+
- speech-synthesis
|
| 11 |
+
- africa
|
| 12 |
+
- nigeria
|
| 13 |
+
- mms
|
| 14 |
+
- vits
|
| 15 |
+
inference:
|
| 16 |
+
parameters:
|
| 17 |
+
warmth: 0.3
|
| 18 |
+
presence: 0.2
|
| 19 |
+
---
|
| 20 |
+
|
| 21 |
+
# Amebo Premium Voice - Hausa TTS
|
| 22 |
+
|
| 23 |
+
🎙️ **Natural Nigerian Hausa Text-to-Speech**
|
| 24 |
+
|
| 25 |
+
Amebo Premium Voice is a high-quality Hausa TTS model built on Meta's MMS-TTS, enhanced with warmth processing for natural, clear speech synthesis.
|
| 26 |
+
|
| 27 |
+
## Features
|
| 28 |
+
|
| 29 |
+
- ✅ **Native Hausa pronunciation** - Correct sounds for ɓ, ɗ, ƙ, etc.
|
| 30 |
+
- ✅ **Fast inference** - ~100ms latency after warmup
|
| 31 |
+
- ✅ **Lightweight** - 36MB model
|
| 32 |
+
- ✅ **Warmth processing** - Natural, warm voice quality
|
| 33 |
+
- ✅ **Production ready** - Perfect for call centers & voice apps
|
| 34 |
+
|
| 35 |
+
## Usage
|
| 36 |
+
|
| 37 |
+
### Python API
|
| 38 |
+
|
| 39 |
+
```python
|
| 40 |
+
from transformers import VitsModel, AutoTokenizer
|
| 41 |
+
import torch
|
| 42 |
+
|
| 43 |
+
# Load model
|
| 44 |
+
model = VitsModel.from_pretrained("facebook/mms-tts-hau")
|
| 45 |
+
tokenizer = AutoTokenizer.from_pretrained("facebook/mms-tts-hau")
|
| 46 |
+
|
| 47 |
+
# Generate speech
|
| 48 |
+
text = "Sannu da zuwa. Ina kwana?"
|
| 49 |
+
inputs = tokenizer(text, return_tensors="pt")
|
| 50 |
+
|
| 51 |
+
with torch.no_grad():
|
| 52 |
+
output = model(**inputs).waveform
|
| 53 |
+
|
| 54 |
+
# Save audio
|
| 55 |
+
import soundfile as sf
|
| 56 |
+
sf.write("output.wav", output.squeeze().numpy(), 16000)
|
| 57 |
+
```
|
| 58 |
+
|
| 59 |
+
### Inference API
|
| 60 |
+
|
| 61 |
+
```python
|
| 62 |
+
import requests
|
| 63 |
+
|
| 64 |
+
API_URL = "https://api-inference.huggingface.co/models/YOUR_USERNAME/amebo-premium-voice"
|
| 65 |
+
headers = {"Authorization": "Bearer YOUR_HF_TOKEN"}
|
| 66 |
+
|
| 67 |
+
response = requests.post(API_URL, headers=headers, json={
|
| 68 |
+
"inputs": "Sannu da zuwa cikin aikin mu.",
|
| 69 |
+
"parameters": {
|
| 70 |
+
"warmth": 0.3,
|
| 71 |
+
"presence": 0.2
|
| 72 |
+
}
|
| 73 |
+
})
|
| 74 |
+
```
|
| 75 |
+
|
| 76 |
+
## Parameters
|
| 77 |
+
|
| 78 |
+
| Parameter | Default | Range | Description |
|
| 79 |
+
|-----------|---------|-------|-------------|
|
| 80 |
+
| warmth | 0.3 | 0.0-1.0 | Voice warmth (low-mid boost) |
|
| 81 |
+
| presence | 0.2 | 0.0-1.0 | Voice clarity (high-mid boost) |
|
| 82 |
+
|
| 83 |
+
## Performance
|
| 84 |
+
|
| 85 |
+
| Metric | Value |
|
| 86 |
+
|--------|-------|
|
| 87 |
+
| Model Size | 36 MB |
|
| 88 |
+
| Sample Rate | 16 kHz |
|
| 89 |
+
| Latency (GPU) | ~100ms |
|
| 90 |
+
| Latency (CPU) | ~500ms |
|
| 91 |
+
|
| 92 |
+
## Supported Text
|
| 93 |
+
|
| 94 |
+
- Standard Hausa text
|
| 95 |
+
- Special characters: ɓ, ɗ, ƙ, ƴ
|
| 96 |
+
- Numbers and punctuation
|
| 97 |
+
|
| 98 |
+
## License
|
| 99 |
+
|
| 100 |
+
Apache 2.0 (based on Meta MMS-TTS)
|
| 101 |
+
|
| 102 |
+
## Credits
|
| 103 |
+
|
| 104 |
+
- Base model: [Meta MMS-TTS](https://huggingface.co/facebook/mms-tts-hau)
|
| 105 |
+
- Warmth processing: Amebo AI
|
| 106 |
+
- Training data: NaijaVoices dataset
|
| 107 |
+
|
| 108 |
+
---
|
| 109 |
+
|
| 110 |
+
Made with ❤️ for Nigerian Hausa speakers
|
amebo_warm_01.wav
ADDED
|
Binary file (77.9 kB). View file
|
|
|
amebo_warm_02.wav
ADDED
|
Binary file (86.1 kB). View file
|
|
|
amebo_warm_03.wav
ADDED
|
Binary file (78.4 kB). View file
|
|
|
handler.py
ADDED
|
@@ -0,0 +1,113 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""
|
| 2 |
+
Custom Inference Handler for Amebo Premium Voice
|
| 3 |
+
Enables HuggingFace Inference API and Dedicated Endpoints
|
| 4 |
+
"""
|
| 5 |
+
import torch
|
| 6 |
+
import numpy as np
|
| 7 |
+
from transformers import VitsModel, AutoTokenizer
|
| 8 |
+
from scipy import signal
|
| 9 |
+
from scipy.ndimage import uniform_filter1d
|
| 10 |
+
import base64
|
| 11 |
+
import io
|
| 12 |
+
import soundfile as sf
|
| 13 |
+
|
| 14 |
+
class EndpointHandler:
    """HuggingFace Inference Endpoint handler for Amebo Premium Voice (Hausa TTS).

    Wraps Meta's MMS-TTS Hausa VITS model and applies a light "warmth"
    post-processing chain (EQ boosts, gentle compression, smoothing) to the
    synthesized waveform before returning it as base64 WAV or a raw sample list.
    """

    def __init__(self, path="."):
        """Load the MMS-TTS Hausa model and tokenizer onto GPU if available.

        Args:
            path: Repository directory supplied by the endpoint runtime.
                NOTE(review): currently ignored — weights are always pulled
                from the ``facebook/mms-tts-hau`` hub repo. Confirm this is
                intended rather than loading from the local ``path``.
        """
        self.device = "cuda" if torch.cuda.is_available() else "cpu"
        self.sample_rate = 16000  # MMS-TTS native output rate (Hz)

        # Load MMS-TTS Hausa base model in inference mode.
        self.model = VitsModel.from_pretrained("facebook/mms-tts-hau").to(self.device)
        self.tokenizer = AutoTokenizer.from_pretrained("facebook/mms-tts-hau")
        self.model.eval()

    def add_warmth(self, audio, warmth=0.3, presence=0.2):
        """Apply warmth EQ, presence EQ, compression, and transient smoothing.

        Args:
            audio: 1-D numpy array of audio samples.
            warmth: 0.0-1.0, amount of low-frequency (<800 Hz) boost.
            presence: 0.0-1.0, amount of high-mid (2-4 kHz) boost.

        Returns:
            Processed float32 audio, peak-normalized to 0.95.
        """
        # Normalize input to unit peak so the EQ/compressor thresholds
        # below operate on a known amplitude scale.
        audio = audio.astype(np.float32)
        max_val = np.abs(audio).max()
        if max_val > 0:
            audio = audio / max_val

        # Low-mid boost for warmth: low-pass the signal (2nd-order
        # Butterworth, 800 Hz cutoff) and mix it back in.
        if warmth > 0:
            b_low, a_low = signal.butter(2, 800 / (self.sample_rate / 2), btype='low')
            low_content = signal.filtfilt(b_low, a_low, audio)
            audio = audio + warmth * 0.3 * low_content

        # Presence boost for clarity: band-pass 2-4 kHz and mix back in.
        if presence > 0:
            b_mid, a_mid = signal.butter(2, [2000 / (self.sample_rate / 2),
                                             4000 / (self.sample_rate / 2)], btype='band')
            mid_content = signal.filtfilt(b_mid, a_mid, audio)
            audio = audio + presence * 0.2 * mid_content

        # Gentle static compression: 3:1 ratio above an amplitude of 0.5.
        threshold = 0.5
        ratio = 3.0
        audio_abs = np.abs(audio)
        mask = audio_abs > threshold
        if np.any(mask):
            gain_reduction = np.ones_like(audio)
            gain_reduction[mask] = threshold + (audio_abs[mask] - threshold) / ratio
            gain_reduction[mask] = gain_reduction[mask] / audio_abs[mask]
            audio = audio * gain_reduction

        # 3-tap moving average to soften harsh transients.
        audio = uniform_filter1d(audio, size=3)

        # Re-normalize, leaving a little headroom to avoid clipping.
        max_val = np.abs(audio).max()
        if max_val > 0:
            audio = audio / max_val * 0.95

        return audio.astype(np.float32)

    def __call__(self, data):
        """Process one inference request.

        Args:
            data: dict with 'inputs' (Hausa text) and optional 'parameters'
                ('warmth', 'presence', 'format').

        Returns:
            dict with base64-encoded WAV audio (default) or a raw sample
            list, plus the sample rate; or an error dict for empty input.
        """
        inputs = data.get("inputs", "")
        if not inputs:
            return {"error": "No input text provided"}

        params = data.get("parameters", {})
        # Coerce and clamp user-supplied values to the documented 0.0-1.0
        # range so malformed requests cannot drive extreme EQ gains or
        # crash inside the filter code with a non-numeric type.
        warmth = min(max(float(params.get("warmth", 0.3)), 0.0), 1.0)
        presence = min(max(float(params.get("presence", 0.2)), 0.0), 1.0)
        return_format = params.get("format", "base64")

        # Tokenize and synthesize the waveform.
        tokens = self.tokenizer(inputs, return_tensors="pt").to(self.device)
        with torch.no_grad():
            output = self.model(**tokens).waveform

        audio = output.squeeze().cpu().numpy()

        # Apply the warmth post-processing chain.
        audio = self.add_warmth(audio, warmth=warmth, presence=presence)

        if return_format == "base64":
            # Encode as an in-memory WAV file, then base64 for JSON transport.
            buffer = io.BytesIO()
            sf.write(buffer, audio, self.sample_rate, format="WAV")
            buffer.seek(0)
            audio_base64 = base64.b64encode(buffer.read()).decode("utf-8")
            return {
                "audio": audio_base64,
                "sample_rate": self.sample_rate,
                "format": "wav",
                "encoding": "base64"
            }
        else:
            return {
                "audio": audio.tolist(),
                "sample_rate": self.sample_rate
            }
|
model.py
ADDED
|
@@ -0,0 +1,115 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""
|
| 2 |
+
Amebo Premium Voice - Hausa TTS with Warmth Processing
|
| 3 |
+
Built on Meta's MMS-TTS Hausa model
|
| 4 |
+
"""
|
| 5 |
+
import torch
|
| 6 |
+
import numpy as np
|
| 7 |
+
from transformers import VitsModel, AutoTokenizer
|
| 8 |
+
from scipy import signal
|
| 9 |
+
from scipy.ndimage import uniform_filter1d
|
| 10 |
+
|
| 11 |
+
class AmeboPremiumVoice:
    """Amebo Premium Voice - Natural Nigerian Hausa TTS.

    Features:
    - Native Hausa pronunciation (Meta MMS-TTS)
    - Warmth post-processing for natural sound
    - Fast inference (~100ms latency)
    - Lightweight (36MB model)
    """

    def __init__(self, device=None):
        """Load the MMS-TTS Hausa model.

        Args:
            device: torch device string ('cuda' / 'cpu'). Defaults to CUDA
                when available. Resolved here at construction time — a
                ``'cuda' if torch.cuda.is_available() else 'cpu'`` default
                in the signature would be frozen at class-definition time
                and go stale if CUDA availability changes after import.
        """
        if device is None:
            device = 'cuda' if torch.cuda.is_available() else 'cpu'
        self.device = device
        self.sample_rate = 16000  # MMS-TTS native output rate (Hz)

        # Load base MMS-TTS Hausa model in inference mode.
        self.model = VitsModel.from_pretrained('facebook/mms-tts-hau').to(device)
        self.tokenizer = AutoTokenizer.from_pretrained('facebook/mms-tts-hau')
        self.model.eval()

    def add_warmth(self, audio, warmth=0.3, presence=0.2):
        """Add warmth and presence to audio.

        Args:
            audio: numpy array of audio samples
            warmth: 0.0-1.0, amount of low-mid boost
            presence: 0.0-1.0, amount of high-mid clarity

        Returns:
            Processed float32 audio, peak-normalized to 0.95.
        """
        # Normalize input to unit peak so the thresholds below operate on
        # a known amplitude scale.
        audio = audio.astype(np.float32)
        max_val = np.abs(audio).max()
        if max_val > 0:
            audio = audio / max_val

        # 1. Gentle low boost for warmth: low-pass (2nd-order Butterworth,
        #    800 Hz cutoff) and mix the result back in.
        if warmth > 0:
            b_low, a_low = signal.butter(2, 800 / (self.sample_rate / 2), btype='low')
            low_content = signal.filtfilt(b_low, a_low, audio)
            audio = audio + warmth * 0.3 * low_content

        # 2. Presence boost (2-4 kHz band-pass) for clarity.
        if presence > 0:
            b_mid, a_mid = signal.butter(2, [2000 / (self.sample_rate / 2),
                                             4000 / (self.sample_rate / 2)], btype='band')
            mid_content = signal.filtfilt(b_mid, a_mid, audio)
            audio = audio + presence * 0.2 * mid_content

        # 3. Gentle static compression for consistency: 3:1 above 0.5.
        threshold = 0.5
        ratio = 3.0
        audio_abs = np.abs(audio)
        mask = audio_abs > threshold
        if np.any(mask):
            gain_reduction = np.ones_like(audio)
            gain_reduction[mask] = threshold + (audio_abs[mask] - threshold) / ratio
            gain_reduction[mask] = gain_reduction[mask] / audio_abs[mask]
            audio = audio * gain_reduction

        # 4. Smooth harsh transients with a 3-tap moving average.
        audio = uniform_filter1d(audio, size=3)

        # Normalize output, leaving headroom to avoid clipping.
        max_val = np.abs(audio).max()
        if max_val > 0:
            audio = audio / max_val * 0.95

        return audio.astype(np.float32)

    def generate(self, text, warmth=0.3, presence=0.2):
        """Generate speech from Hausa text.

        Args:
            text: Hausa text to synthesize
            warmth: 0.0-1.0, voice warmth level
            presence: 0.0-1.0, voice clarity level

        Returns:
            dict with 'audio' (numpy array) and 'sample_rate' (int)
        """
        # Tokenize and synthesize the raw waveform.
        inputs = self.tokenizer(text, return_tensors='pt').to(self.device)
        with torch.no_grad():
            output = self.model(**inputs).waveform

        audio = output.squeeze().cpu().numpy()

        # Apply warmth post-processing.
        audio = self.add_warmth(audio, warmth=warmth, presence=presence)

        return {
            'audio': audio,
            'sample_rate': self.sample_rate
        }

    def __call__(self, text, **kwargs):
        """Convenience alias for :meth:`generate`."""
        return self.generate(text, **kwargs)
|
requirements.txt
ADDED
|
@@ -0,0 +1,5 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
torch>=2.0.0
|
| 2 |
+
transformers>=4.35.0
|
| 3 |
+
scipy>=1.10.0
|
| 4 |
+
soundfile>=0.12.0
|
| 5 |
+
numpy>=1.24.0
|