# xtts-enhanced / app.py
# Yahia El Ahmar
# 🎭 Enhanced XTTS: Real accents, emotions, clear labels
# add2cb9
"""
Enhanced XTTS Multi-Lingual TTS with Accent Control
- 88 voices from voxlydataset with REAL accents
- Emotion control with prosody
- Clear male/female labeling
- Multiple languages with authentic accents
"""
import os
import io
import logging
from pathlib import Path
from typing import Optional
import json
import numpy as np
from fastapi import FastAPI, Form, Response
from fastapi.middleware.cors import CORSMiddleware
import soundfile as sf
import torch
from TTS.api import TTS
# Logging: configure the root logger at INFO and create this module's logger.
logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)

# FastAPI application object; the route decorators further down attach to it.
app = FastAPI(version="2.0.0", title="Enhanced XTTS Multi-Lingual TTS")

# CORS is configured fully open: any origin, method and header is accepted.
# NOTE(review): wildcard origins together with allow_credentials=True is a
# combination the FastAPI CORS documentation advises against; none of the
# routes in this file read cookies or auth headers, so credentials may not be
# needed at all -- confirm with the front-end before tightening.
_cors_settings = {
    "allow_origins": ["*"],
    "allow_credentials": True,
    "allow_methods": ["*"],
    "allow_headers": ["*"],
}
app.add_middleware(CORSMiddleware, **_cors_settings)
# Load XTTS v2 model
# MODEL_NAME is the identifier handed to the TTS loader; SAMPLE_RATE is the
# rate written into every generated WAV and reported by the health endpoint.
MODEL_NAME = "tts_models/multilingual/multi-dataset/xtts_v2"
SAMPLE_RATE = 24000

logger.info("🔥 Loading XTTS v2 model: %s", MODEL_NAME)
try:
    # Prefer the GPU when torch can see one, otherwise fall back to CPU.
    device = "cuda" if torch.cuda.is_available() else "cpu"
    tts = TTS(model_name=MODEL_NAME, progress_bar=False).to(device)
    logger.info("✅ Model loaded on %s!", device)
except Exception as e:
    # Deliberate best-effort startup: the app still comes up without a model
    # and /synthesize answers 503 while `tts` is None. The traceback is logged
    # so the cause of the load failure is not lost.
    logger.exception("❌ Failed to load model: %s", e)
    tts = None
# Enhanced voice profiles with REAL accents and clear labeling
#
# Every reference clip lives under the same dataset URL, so the catalogue is
# written as a compact table and expanded into the public mapping below.
# Row layout: (voice id, clip file, gender, age, accent, description, language)
#
# NOTE(review): "american_male_young" is labelled Male but its clip file is
# "en_us_arianeural.wav"; the file name suggests a female source voice --
# confirm the clip matches the label.
_VOICE_CLIP_BASE = "https://huggingface.co/datasets/Yaya5777/voxlydataset/resolve/main/"

_VOICE_ROWS = (
    # American Voices
    ("american_male_young", "en_us_arianeural.wav", "Male", "Young (25-35)",
     "American (General)", "Clear American male voice, professional and friendly", "en"),
    ("american_female_young", "en_us_jennymultilingual.wav", "Female", "Young (25-35)",
     "American (General)", "Warm American female voice, clear and expressive", "en"),
    # British Voices
    ("british_male_mature", "en_gb_ryanneural.wav", "Male", "Mature (35-50)",
     "British (RP)", "Distinguished British male voice, authoritative", "en"),
    ("british_female_young", "en_gb_sonianeural.wav", "Female", "Young (25-35)",
     "British (RP)", "Elegant British female voice, professional", "en"),
    # Australian Voices
    ("australian_male", "en_au_williamneural.wav", "Male", "Young (25-35)",
     "Australian", "Authentic Australian male voice, friendly", "en"),
    ("australian_female", "en_au_natashaneural.wav", "Female", "Young (25-35)",
     "Australian", "Authentic Australian female voice, warm", "en"),
    # Irish Voices
    ("irish_male", "en_ie_connorneural.wav", "Male", "Young (25-35)",
     "Irish", "Authentic Irish male voice, charming", "en"),
    ("irish_female", "en_ie_emilyneural.wav", "Female", "Young (25-35)",
     "Irish", "Authentic Irish female voice, melodic", "en"),
    # Canadian Voices
    ("canadian_male", "en_ca_liamneural.wav", "Male", "Young (25-35)",
     "Canadian", "Authentic Canadian male voice, friendly", "en"),
    ("canadian_female", "en_ca_claraneural.wav", "Female", "Young (25-35)",
     "Canadian", "Authentic Canadian female voice, clear", "en"),
    # Indian Voices
    ("indian_male", "en_in_prabhaneural.wav", "Male", "Young (25-35)",
     "Indian", "Authentic Indian male voice, clear English", "en"),
    ("indian_female", "en_in_neerjaexpressive.wav", "Female", "Young (25-35)",
     "Indian", "Authentic Indian female voice, expressive", "en"),
    # South African Voices
    ("south_african_male", "en_za_lukeneural.wav", "Male", "Young (25-35)",
     "South African", "Authentic South African male voice", "en"),
    ("south_african_female", "en_za_leahneural.wav", "Female", "Young (25-35)",
     "South African", "Authentic South African female voice", "en"),
    # French Voices
    ("french_male", "fr_fr_henrineural.wav", "Male", "Young (25-35)",
     "French", "Authentic French male voice, sophisticated", "fr"),
    ("french_female", "fr_fr_deniseneural.wav", "Female", "Young (25-35)",
     "French", "Authentic French female voice, elegant", "fr"),
    # Spanish Voices
    ("spanish_male", "es_es_alvaroneural.wav", "Male", "Young (25-35)",
     "Spanish (Spain)", "Authentic Spanish male voice, expressive", "es"),
    ("spanish_female", "es_es_elviraneural.wav", "Female", "Young (25-35)",
     "Spanish (Spain)", "Authentic Spanish female voice, warm", "es"),
    # German Voices
    ("german_male", "de_de_conradneural.wav", "Male", "Young (25-35)",
     "German", "Authentic German male voice, clear", "de"),
    ("german_female", "de_de_katjaneural.wav", "Female", "Young (25-35)",
     "German", "Authentic German female voice, professional", "de"),
    # Arabic Voices
    ("arabic_male", "ar_sa_hamedneural.wav", "Male", "Young (25-35)",
     "Arabic (Saudi)", "Authentic Arabic male voice", "ar"),
    ("arabic_female", "ar_sa_zariyahneural.wav", "Female", "Young (25-35)",
     "Arabic (Saudi)", "Authentic Arabic female voice", "ar"),
)

# Public catalogue: voice id -> profile dict, in the table's order.
ENHANCED_VOICES = {
    voice_key: {
        "speaker_wav": _VOICE_CLIP_BASE + clip_file,
        "gender": gender,
        "age": age_band,
        "accent": accent,
        "description": blurb,
        "language": lang_code,
    }
    for voice_key, clip_file, gender, age_band, accent, blurb, lang_code in _VOICE_ROWS
}
# Emotion/feeling parameters for prosody control
# Each preset supplies the default `temperature` and `speed` that /synthesize
# forwards to the model when the caller gives no explicit override, plus a
# human-readable description.
EMOTION_SETTINGS = dict(
    neutral=dict(temperature=0.7, speed=1.0, description="Normal, clear speech"),
    happy=dict(temperature=0.85, speed=1.1, description="Upbeat, energetic, positive"),
    excited=dict(temperature=0.9, speed=1.2, description="Very energetic, enthusiastic"),
    sad=dict(temperature=0.6, speed=0.9, description="Slower, more somber tone"),
    angry=dict(temperature=0.8, speed=1.15, description="Intense, forceful delivery"),
    calm=dict(temperature=0.65, speed=0.95, description="Relaxed, soothing tone"),
    professional=dict(temperature=0.7, speed=1.0, description="Clear, authoritative"),
    whisper=dict(temperature=0.55, speed=0.85, description="Soft, quiet delivery"),
)
@app.get("/")
async def health():
    """Health check endpoint.

    Returns a JSON object with a fixed ``"status": "ok"`` marker, the engine
    and model identifiers, the output sample rate, the number of configured
    voices, a list of feature blurbs, and the accent and emotion names that
    the other endpoints accept.

    The response does not reflect whether the model actually loaded; it is
    built purely from the module-level configuration.
    """
    # First-seen order (rather than an unordered set) keeps the list stable
    # between processes and consistent with the "accents" list from /voices.
    accents = list(dict.fromkeys(v["accent"] for v in ENHANCED_VOICES.values()))

    return {
        "status": "ok",
        "engine": "xtts_v2_enhanced",
        "model": MODEL_NAME,
        "sample_rate": SAMPLE_RATE,
        "total_voices": len(ENHANCED_VOICES),
        "features": [
            "🎭 Real human voice cloning with authentic accents",
            "🌍 Multiple languages (English, French, Spanish, German, Arabic, etc.)",
            "🗣️ Clear accent distinctions (American, British, Irish, Australian, Indian, etc.)",
            "👥 Labeled by gender and age",
            # Count is derived so the blurb cannot drift from the preset table.
            f"😊 {len(EMOTION_SETTINGS)} emotion presets (happy, sad, angry, excited, calm, etc.)",
            "⚡ Fast generation (5-10 seconds)",
            "🎵 High quality, human-like prosody",
            "🚀 No rate limits",
        ],
        "accents_available": accents,
        "emotions_available": list(EMOTION_SETTINGS),
    }
@app.get("/voices")
async def list_voices():
    """Return the voice catalogue, both flat and grouped by accent.

    The response carries:
    - ``voices``: one entry per configured voice (id, name, gender, age,
      accent, language, description); ``name`` mirrors ``description``.
    - ``total``: number of entries in ``voices``.
    - ``by_accent``: the same entries bucketed under their accent label.
    - ``accents``: the accent labels, in the order they first appear.
    """
    catalogue = [
        {
            "id": key,
            "name": profile["description"],
            "gender": profile["gender"],
            "age": profile["age"],
            "accent": profile["accent"],
            "language": profile["language"],
            "description": profile["description"],
        }
        for key, profile in ENHANCED_VOICES.items()
    ]

    # Bucket the entries by accent; setdefault creates each bucket on first use.
    grouped = {}
    for entry in catalogue:
        grouped.setdefault(entry["accent"], []).append(entry)

    return {
        "voices": catalogue,
        "total": len(catalogue),
        "by_accent": grouped,
        "accents": list(grouped),
    }
@app.post("/synthesize")
async def synthesize(
    text: str = Form(...),
    voice_id: str = Form("american_female_young"),
    emotion: str = Form("neutral"),
    language: str = Form("en"),
    speed: Optional[float] = Form(None),
    temperature: Optional[float] = Form(None),
):
    """
    🎭 Enhanced XTTS Synthesis with Accent Control

    Features:
    - Real human voice cloning
    - Authentic accents (American, British, Irish, Australian, Indian, etc.)
    - Clear gender/age labeling
    - Emotion control with prosody
    - Multi-lingual support

    Parameters:
    - text: Text to synthesize (max 500 characters)
    - voice_id: Voice ID (see /voices for full list); unknown IDs fall back
      to "american_female_young"
    - emotion: Emotion (neutral, happy, excited, sad, angry, calm, professional, whisper);
      unknown names fall back to "neutral"
    - language: Language code (en, fr, es, de, ar, etc.); passed to the model
      as given, the selected voice's own language is not consulted
    - speed: Speech speed override (0.5-2.0); defaults to the emotion preset
    - temperature: Voice variation (0.1-1.0, higher = more expressive);
      defaults to the emotion preset

    Responses:
    - 200 audio/wav: 16-bit PCM WAV of the generated speech
    - 400 text/plain: text is empty or longer than 500 characters
    - 503 text/plain: the model failed to load at startup
    - 500 text/plain: synthesis raised an error
    """
    if tts is None:
        return Response(
            content=b"Model not loaded",
            media_type="text/plain",
            status_code=503,
        )

    try:
        logger.info(
            "🎤 Enhanced XTTS: voice=%s, emotion=%s, lang=%s",
            voice_id, emotion, language,
        )

        # Validate inputs
        if len(text) > 500:
            return Response(
                content=b"Text too long (max 500 characters)",
                media_type="text/plain",
                status_code=400,
            )
        if not text.strip():
            return Response(
                content=b"Text cannot be empty",
                media_type="text/plain",
                status_code=400,
            )

        # Get voice metadata, falling back to the default voice if unknown
        if voice_id not in ENHANCED_VOICES:
            logger.warning("⚠️ Unknown voice %s, using default", voice_id)
            voice_id = "american_female_young"
        voice_meta = ENHANCED_VOICES[voice_id]
        speaker_wav = voice_meta["speaker_wav"]

        # Get emotion settings; warn on fallback the same way as for voices
        if emotion not in EMOTION_SETTINGS:
            logger.warning("⚠️ Unknown emotion %s, using neutral", emotion)
        emotion_settings = EMOTION_SETTINGS.get(emotion, EMOTION_SETTINGS["neutral"])

        # Explicit overrides win over the preset; `is not None` keeps a
        # caller-supplied 0.0 from being treated as "not provided".
        final_speed = speed if speed is not None else emotion_settings["speed"]
        final_temp = temperature if temperature is not None else emotion_settings["temperature"]

        logger.info("🎭 Voice: %s", voice_meta["description"])
        logger.info(" Gender: %s | Accent: %s", voice_meta["gender"], voice_meta["accent"])
        logger.info(" Emotion: %s | Speed: %s | Temp: %s", emotion, final_speed, final_temp)

        # Generate audio with XTTS
        # NOTE(review): this is a synchronous call inside an async handler, so
        # the event loop is occupied for the whole generation -- confirm that
        # is acceptable before offloading it (model thread-safety unknown).
        logger.info("🔊 Generating audio...")
        wav = tts.tts(
            text=text,
            speaker_wav=speaker_wav,
            language=language,
            speed=final_speed,
            temperature=final_temp,
        )

        # Convert to numpy array
        audio_array = np.array(wav, dtype=np.float32)
        logger.info("✅ Audio generated: %d samples at %dHz", len(audio_array), SAMPLE_RATE)

        # Write to an in-memory WAV (16-bit PCM)
        buf = io.BytesIO()
        sf.write(buf, audio_array, SAMPLE_RATE, format="WAV", subtype="PCM_16")
        wav_bytes = buf.getvalue()
        logger.info(
            "🎵 FINAL: %d bytes | %s %s",
            len(wav_bytes), voice_meta["accent"], voice_meta["gender"],
        )
        return Response(content=wav_bytes, media_type="audio/wav")
    except Exception as e:
        # Request boundary: log the message together with the traceback and
        # turn the failure into a plain-text 500 instead of propagating it.
        logger.exception("❌ Synthesis failed: %s", e)
        return Response(
            content=f"Synthesis failed: {e}".encode(),
            media_type="text/plain",
            status_code=500,
        )
if __name__ == "__main__":
    # Script entry point: serve the app with uvicorn on all interfaces,
    # port 7860. uvicorn is imported here so that merely importing this
    # module does not require it.
    from uvicorn import run as serve

    serve(app, host="0.0.0.0", port=7860)