Spaces:
Sleeping
Sleeping
| """ | |
| Enhanced XTTS Multi-Lingual TTS with Accent Control | |
| - 22 voice profiles (from the 88-voice voxlydataset) with REAL accents | |
| - Emotion control with prosody | |
| - Clear male/female labeling | |
| - Multiple languages with authentic accents | |
| """ | |
| import os | |
| import io | |
| import logging | |
| from pathlib import Path | |
| from typing import Optional | |
| import json | |
| import numpy as np | |
| from fastapi import FastAPI, Form, Response | |
| from fastapi.middleware.cors import CORSMiddleware | |
| import soundfile as sf | |
| import torch | |
| from TTS.api import TTS | |
# Setup logging
logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)

# Initialize FastAPI
app = FastAPI(title="Enhanced XTTS Multi-Lingual TTS", version="2.0.0")

# NOTE(review): wildcard origins are combined with allow_credentials=True.
# The FastAPI/Starlette CORS documentation says credentials require explicit
# origins, methods and headers; confirm whether credentialed cross-origin
# requests are actually needed, and if not, set allow_credentials=False.
app.add_middleware(
    CORSMiddleware,
    allow_origins=["*"],
    allow_credentials=True,
    allow_methods=["*"],
    allow_headers=["*"],
)

# Load XTTS v2 model
MODEL_NAME = "tts_models/multilingual/multi-dataset/xtts_v2"
# Rate written into the WAV header of every synthesized response.
SAMPLE_RATE = 24000

logger.info("Loading XTTS v2 model: %s", MODEL_NAME)
try:
    device = "cuda" if torch.cuda.is_available() else "cpu"
    tts = TTS(model_name=MODEL_NAME, progress_bar=False).to(device)
except Exception as e:
    # Best-effort start-up: keep the process alive with ``tts = None`` so the
    # rest of the module can still be imported. logger.exception records the
    # full traceback, which the previous str(e)-only message discarded.
    logger.exception("Failed to load model: %s", e)
    tts = None
else:
    logger.info("Model loaded on %s!", device)
# Enhanced voice profiles with REAL accents and clear labeling
_VOXLY_BASE_URL = "https://huggingface.co/datasets/Yaya5777/voxlydataset/resolve/main/"


def _voice_profile(clip, gender, accent, description, language, age="Young (25-35)"):
    """Build one voice record; *clip* is the file name under the dataset URL."""
    return {
        "speaker_wav": _VOXLY_BASE_URL + clip,
        "gender": gender,
        "age": age,
        "accent": accent,
        "description": description,
        "language": language,
    }


ENHANCED_VOICES = {
    # American Voices
    # NOTE(review): this clip is named "arianeural" while the profile is
    # labelled Male -- verify the reference clip matches the gender label.
    "american_male_young": _voice_profile(
        "en_us_arianeural.wav", "Male", "American (General)",
        "Clear American male voice, professional and friendly", "en",
    ),
    "american_female_young": _voice_profile(
        "en_us_jennymultilingual.wav", "Female", "American (General)",
        "Warm American female voice, clear and expressive", "en",
    ),
    # British Voices
    "british_male_mature": _voice_profile(
        "en_gb_ryanneural.wav", "Male", "British (RP)",
        "Distinguished British male voice, authoritative", "en",
        age="Mature (35-50)",
    ),
    "british_female_young": _voice_profile(
        "en_gb_sonianeural.wav", "Female", "British (RP)",
        "Elegant British female voice, professional", "en",
    ),
    # Australian Voices
    "australian_male": _voice_profile(
        "en_au_williamneural.wav", "Male", "Australian",
        "Authentic Australian male voice, friendly", "en",
    ),
    "australian_female": _voice_profile(
        "en_au_natashaneural.wav", "Female", "Australian",
        "Authentic Australian female voice, warm", "en",
    ),
    # Irish Voices
    "irish_male": _voice_profile(
        "en_ie_connorneural.wav", "Male", "Irish",
        "Authentic Irish male voice, charming", "en",
    ),
    "irish_female": _voice_profile(
        "en_ie_emilyneural.wav", "Female", "Irish",
        "Authentic Irish female voice, melodic", "en",
    ),
    # Canadian Voices
    "canadian_male": _voice_profile(
        "en_ca_liamneural.wav", "Male", "Canadian",
        "Authentic Canadian male voice, friendly", "en",
    ),
    "canadian_female": _voice_profile(
        "en_ca_claraneural.wav", "Female", "Canadian",
        "Authentic Canadian female voice, clear", "en",
    ),
    # Indian Voices
    "indian_male": _voice_profile(
        "en_in_prabhaneural.wav", "Male", "Indian",
        "Authentic Indian male voice, clear English", "en",
    ),
    "indian_female": _voice_profile(
        "en_in_neerjaexpressive.wav", "Female", "Indian",
        "Authentic Indian female voice, expressive", "en",
    ),
    # South African Voices
    "south_african_male": _voice_profile(
        "en_za_lukeneural.wav", "Male", "South African",
        "Authentic South African male voice", "en",
    ),
    "south_african_female": _voice_profile(
        "en_za_leahneural.wav", "Female", "South African",
        "Authentic South African female voice", "en",
    ),
    # French Voices
    "french_male": _voice_profile(
        "fr_fr_henrineural.wav", "Male", "French",
        "Authentic French male voice, sophisticated", "fr",
    ),
    "french_female": _voice_profile(
        "fr_fr_deniseneural.wav", "Female", "French",
        "Authentic French female voice, elegant", "fr",
    ),
    # Spanish Voices
    "spanish_male": _voice_profile(
        "es_es_alvaroneural.wav", "Male", "Spanish (Spain)",
        "Authentic Spanish male voice, expressive", "es",
    ),
    "spanish_female": _voice_profile(
        "es_es_elviraneural.wav", "Female", "Spanish (Spain)",
        "Authentic Spanish female voice, warm", "es",
    ),
    # German Voices
    "german_male": _voice_profile(
        "de_de_conradneural.wav", "Male", "German",
        "Authentic German male voice, clear", "de",
    ),
    "german_female": _voice_profile(
        "de_de_katjaneural.wav", "Female", "German",
        "Authentic German female voice, professional", "de",
    ),
    # Arabic Voices
    "arabic_male": _voice_profile(
        "ar_sa_hamedneural.wav", "Male", "Arabic (Saudi)",
        "Authentic Arabic male voice", "ar",
    ),
    "arabic_female": _voice_profile(
        "ar_sa_zariyahneural.wav", "Female", "Arabic (Saudi)",
        "Authentic Arabic female voice", "ar",
    ),
}
# Emotion/feeling parameters for prosody control
# Each row: (emotion name, sampling temperature, speed multiplier, description).
_EMOTION_ROWS = (
    ("neutral", 0.7, 1.0, "Normal, clear speech"),
    ("happy", 0.85, 1.1, "Upbeat, energetic, positive"),
    ("excited", 0.9, 1.2, "Very energetic, enthusiastic"),
    ("sad", 0.6, 0.9, "Slower, more somber tone"),
    ("angry", 0.8, 1.15, "Intense, forceful delivery"),
    ("calm", 0.65, 0.95, "Relaxed, soothing tone"),
    ("professional", 0.7, 1.0, "Clear, authoritative"),
    ("whisper", 0.55, 0.85, "Soft, quiet delivery"),
)

EMOTION_SETTINGS = {
    label: {"temperature": temp, "speed": rate, "description": blurb}
    for label, temp, rate, blurb in _EMOTION_ROWS
}
# NOTE(review): no route decorator is visible above this coroutine in the
# reviewed text; confirm it is registered on ``app`` (presumably as a GET).
async def health():
    """Health check endpoint.

    Returns a JSON-serializable dict describing the engine, the configured
    model and sample rate, the number of voice profiles, a feature list, and
    the available accents and emotion presets.

    ``status`` is ``"ok"`` only when the XTTS model loaded at start-up; when
    ``tts`` is ``None`` it is ``"degraded"``, matching the 503 that synthesis
    returns in that state. ``model_loaded`` exposes the same fact as a bool.
    """
    model_loaded = tts is not None
    return {
        "status": "ok" if model_loaded else "degraded",
        "model_loaded": model_loaded,
        "engine": "xtts_v2_enhanced",
        "model": MODEL_NAME,
        "sample_rate": SAMPLE_RATE,
        "total_voices": len(ENHANCED_VOICES),
        "features": [
            "Real human voice cloning with authentic accents",
            "Multiple languages (English, French, Spanish, German, Arabic, etc.)",
            "Clear accent distinctions (American, British, Irish, Australian, Indian, etc.)",
            "Labeled by gender and age",
            # Count is derived so it cannot drift from EMOTION_SETTINGS.
            f"{len(EMOTION_SETTINGS)} emotion presets (happy, sad, angry, excited, calm, etc.)",
            "Fast generation (5-10 seconds)",
            "High quality, human-like prosody",
            "No rate limits",
        ],
        # Sorted so the response is stable across processes; a bare set has
        # no defined iteration order.
        "accents_available": sorted({v["accent"] for v in ENHANCED_VOICES.values()}),
        "emotions_available": list(EMOTION_SETTINGS),
    }
# NOTE(review): no route decorator is visible above this coroutine in the
# reviewed text; the synthesis docstring refers to "/voices" -- confirm the
# registration on ``app``.
async def list_voices():
    """Return every voice profile plus an index grouped by accent.

    The result holds ``voices`` (one flat record per entry of
    ``ENHANCED_VOICES``, in definition order), ``total`` (their count),
    ``by_accent`` (accent label -> list of those same records) and
    ``accents`` (accent labels in first-seen order). Each record's ``name``
    and ``description`` both carry the profile's description text.
    """
    catalog = [
        {
            "id": key,
            "name": profile["description"],
            "gender": profile["gender"],
            "age": profile["age"],
            "accent": profile["accent"],
            "language": profile["language"],
            "description": profile["description"],
        }
        for key, profile in ENHANCED_VOICES.items()
    ]

    # Group by accent
    grouped = {}
    for entry in catalog:
        grouped.setdefault(entry["accent"], []).append(entry)

    return {
        "voices": catalog,
        "total": len(catalog),
        "by_accent": grouped,
        "accents": list(grouped),
    }
_MAX_TEXT_CHARS = 500
_SPEED_RANGE = (0.5, 2.0)
_TEMPERATURE_RANGE = (0.1, 1.0)
_DEFAULT_VOICE_ID = "american_female_young"


def _plain_text_response(message: str, status_code: int) -> Response:
    """Return *message* as a UTF-8 ``text/plain`` response with *status_code*."""
    return Response(
        content=message.encode(),
        media_type="text/plain",
        status_code=status_code,
    )


def _out_of_range(value: Optional[float], bounds) -> bool:
    """Return True when an explicit override lies outside the inclusive *bounds*."""
    low, high = bounds
    # Written as ``not (low <= value <= high)`` so NaN is rejected as well.
    return value is not None and not (low <= value <= high)


# NOTE(review): no route decorator is visible above this coroutine in the
# reviewed text; confirm it is registered on ``app`` (the Form parameters
# imply a POST route).
async def synthesize(
    text: str = Form(...),
    voice_id: str = Form(_DEFAULT_VOICE_ID),
    emotion: str = Form("neutral"),
    language: str = Form("en"),
    speed: Optional[float] = Form(None),
    temperature: Optional[float] = Form(None),
):
    """
    Enhanced XTTS Synthesis with Accent Control

    Features:
    - Real human voice cloning
    - Authentic accents (American, British, Irish, Australian, Indian, etc.)
    - Clear gender/age labeling
    - Emotion control with prosody
    - Multi-lingual support

    Parameters:
    - text: Text to synthesize (max 500 characters, must not be blank)
    - voice_id: Voice ID (see /voices for full list); unknown IDs fall back
      to the default voice
    - emotion: Emotion (neutral, happy, excited, sad, angry, calm,
      professional, whisper); unknown values fall back to neutral
    - language: Language code (en, fr, es, de, ar, etc.)
    - speed: Speech speed override (0.5-2.0); defaults to the emotion preset
    - temperature: Voice variation override (0.1-1.0, higher = more
      expressive); defaults to the emotion preset

    Returns:
    - 200 with ``audio/wav`` (16-bit PCM at SAMPLE_RATE) on success
    - 400 ``text/plain`` when text is too long or blank, or an override is
      outside its documented range
    - 503 ``text/plain`` when the model failed to load at start-up
    - 500 ``text/plain`` when generation or encoding raises
    """
    if tts is None:
        return _plain_text_response("Model not loaded", 503)

    logger.info(
        "Enhanced XTTS: voice=%s, emotion=%s, lang=%s", voice_id, emotion, language
    )

    # Validate inputs
    if len(text) > _MAX_TEXT_CHARS:
        return _plain_text_response(
            f"Text too long (max {_MAX_TEXT_CHARS} characters)", 400
        )
    if not text.strip():
        return _plain_text_response("Text cannot be empty", 400)
    # Enforce the documented override ranges instead of forwarding arbitrary
    # values to the model.
    if _out_of_range(speed, _SPEED_RANGE):
        return _plain_text_response(
            f"speed must be between {_SPEED_RANGE[0]} and {_SPEED_RANGE[1]}", 400
        )
    if _out_of_range(temperature, _TEMPERATURE_RANGE):
        return _plain_text_response(
            f"temperature must be between {_TEMPERATURE_RANGE[0]} "
            f"and {_TEMPERATURE_RANGE[1]}",
            400,
        )

    # Get voice metadata; an unknown ID deliberately falls back to the default.
    if voice_id not in ENHANCED_VOICES:
        logger.warning("Unknown voice %s, using default", voice_id)
        voice_id = _DEFAULT_VOICE_ID
    voice_meta = ENHANCED_VOICES[voice_id]

    # Get emotion settings; explicit overrides win over the preset.
    emotion_settings = EMOTION_SETTINGS.get(emotion, EMOTION_SETTINGS["neutral"])
    final_speed = speed if speed is not None else emotion_settings["speed"]
    final_temp = (
        temperature if temperature is not None else emotion_settings["temperature"]
    )

    logger.info("Voice: %s", voice_meta["description"])
    logger.info(
        "  Gender: %s | Accent: %s", voice_meta["gender"], voice_meta["accent"]
    )
    logger.info(
        "  Emotion: %s | Speed: %s | Temp: %s", emotion, final_speed, final_temp
    )

    try:
        # Generate audio with XTTS
        # NOTE(review): tts.tts is called synchronously inside this coroutine,
        # so no other request is served while audio is being generated.
        logger.info("Generating audio...")
        wav = tts.tts(
            text=text,
            speaker_wav=voice_meta["speaker_wav"],
            language=language,
            speed=final_speed,
            temperature=final_temp,
        )

        # Convert to numpy array
        audio_array = np.array(wav, dtype=np.float32)
        logger.info(
            "Audio generated: %d samples at %dHz", len(audio_array), SAMPLE_RATE
        )

        # Write to WAV
        buf = io.BytesIO()
        sf.write(buf, audio_array, SAMPLE_RATE, format="WAV", subtype="PCM_16")
        wav_bytes = buf.getvalue()
    except Exception as e:
        # Request boundary: log the full traceback and answer with a 500
        # rather than letting the exception propagate.
        logger.exception("Synthesis failed: %s", e)
        return _plain_text_response(f"Synthesis failed: {e}", 500)

    logger.info(
        "FINAL: %d bytes | %s %s",
        len(wav_bytes),
        voice_meta["accent"],
        voice_meta["gender"],
    )
    return Response(content=wav_bytes, media_type="audio/wav")
if __name__ == "__main__":
    # uvicorn is imported only on the script path, so importing this module
    # elsewhere does not pull it in.
    from uvicorn import run as serve

    # Listen on every interface, port 7860.
    serve(app, host="0.0.0.0", port=7860)