Spaces:

Successmove
/

XTTS

Runtime error

File size: 4,318 Bytes

08a0d1e

import gradio as gr
import torch
import tempfile
import os
from TTS.api import TTS

# Pick the compute device: the GPU when CUDA is available, otherwise the CPU.
if torch.cuda.is_available():
    device = "cuda"
else:
    device = "cpu"
print(f"Using device: {device}")

# Load the multilingual XTTS v2 model once at import time and move it to the
# chosen device; every request reuses this single instance.
# NOTE(review): Coqui may ask for licence acceptance the first time this model
# is downloaded — confirm how a non-interactive deployment handles that.
tts = TTS("tts_models/multilingual/multi-dataset/xtts_v2").to(device)

# Language codes offered in the UI's language dropdown.
supported_languages = [
    "en",
    "es",
    "fr",
    "de",
    "it",
    "pt",
    "pl",
    "tr",
    "ru",
    "nl",
    "cs",
    "ar",
    "zh-cn",
    "ja",
    "hu",
    "ko",
]

def generate_speech(
    text,
    language,
    speaker_wav=None,
    voice_preset=None,
    speed=1.0,
    temperature=0.7
):
    """
    Generate speech from text using the XTTS model.

    Args:
        text: Text to synthesize; must contain non-whitespace characters.
        language: Language code forwarded to the model (e.g. "en").
        speaker_wav: Optional path to a reference recording used for voice
            cloning. Takes precedence over ``voice_preset``.
        voice_preset: Optional name of a built-in speaker, forwarded to the
            model as ``speaker`` when no reference recording is given.
            Non-string or blank values are ignored.
        speed: Speed value forwarded to the model.
        temperature: Temperature value forwarded to the model.

    Returns:
        Path of the temporary ``.wav`` file holding the generated audio.

    Raises:
        gr.Error: If ``text`` is empty or if synthesis fails.
    """
    # Reject empty input up front so no temporary file is created for it.
    if not text or not text.strip():
        raise gr.Error("Please enter some text to convert to speech.")

    synthesis_kwargs = {
        "text": text,
        "language": language,
        "speed": speed,
        "temperature": temperature,
    }
    if speaker_wav is not None:
        # A reference recording means voice cloning.
        synthesis_kwargs["speaker_wav"] = speaker_wav
    elif isinstance(voice_preset, str) and voice_preset.strip():
        # Without a recording, fall back to the named built-in speaker.
        synthesis_kwargs["speaker"] = voice_preset

    # Reserve an output path; delete=False keeps the file after the handle
    # closes so the model can write to it and the caller can read it.
    with tempfile.NamedTemporaryFile(suffix=".wav", delete=False) as tmp_file:
        output_path = tmp_file.name
    synthesis_kwargs["file_path"] = output_path

    try:
        tts.tts_to_file(**synthesis_kwargs)
    except Exception as e:
        # Best-effort cleanup: a failure to remove the temporary file must not
        # hide the synthesis error that is reported to the user.
        try:
            os.unlink(output_path)
        except OSError:
            pass
        raise gr.Error(f"Error generating speech: {str(e)}") from e

    return output_path

def _generate_from_ui(text, language, speaker_wav, speed, temperature):
    """
    Adapt the five UI inputs to generate_speech.

    Speed and temperature are passed by keyword so they reach the parameters
    of the same name; passed positionally they would shift into
    ``voice_preset`` and ``speed``.
    """
    return generate_speech(
        text,
        language,
        speaker_wav=speaker_wav,
        speed=speed,
        temperature=temperature,
    )


# Create Gradio interface
with gr.Blocks(title="XTTS Text-to-Speech") as demo:
    gr.Markdown("# XTTS Text-to-Speech Generator")
    gr.Markdown("Generate speech from text with voice cloning capabilities using XTTS v2")

    with gr.Row():
        # Left column: all inputs and the trigger button.
        with gr.Column():
            text_input = gr.Textbox(
                label="Input Text",
                placeholder="Enter text to convert to speech...",
                lines=3
            )

            language_input = gr.Dropdown(
                label="Language",
                choices=supported_languages,
                value="en",
                info="Select the language for synthesis"
            )

            speaker_wav_input = gr.Audio(
                label="Reference Voice (Optional)",
                type="filepath"
            )
            # gr.Audio has no `info` argument, so the hint is shown as Markdown.
            gr.Markdown("Upload a 3-10 second audio sample for voice cloning")

            with gr.Accordion("Advanced Settings", open=False):
                speed_input = gr.Slider(
                    label="Speed",
                    minimum=0.5,
                    maximum=2.0,
                    value=1.0,
                    step=0.1,
                    info="Speech speed (0.5 = slow, 2.0 = fast)"
                )

                temperature_input = gr.Slider(
                    label="Temperature",
                    minimum=0.1,
                    maximum=1.0,
                    value=0.7,
                    step=0.1,
                    info="Voice variability (lower = more deterministic)"
                )

            generate_btn = gr.Button("Generate Speech", variant="primary")

        # Right column: the synthesized audio.
        with gr.Column():
            audio_output = gr.Audio(
                label="Generated Speech",
                type="filepath"
            )

    # Examples only fill in the text and language fields. They are not cached:
    # caching would run synthesis at startup with no reference voice, and a
    # failure there would stop the app from starting.
    gr.Examples(
        examples=[
            ["Hello, world! This is a sample text to speech generation.", "en"],
            ["Bonjour, comment allez-vous aujourd'hui?", "fr"],
            ["Hola, ¿cómo estás?", "es"],
        ],
        inputs=[text_input, language_input],
        cache_examples=False
    )

    generate_btn.click(
        fn=_generate_from_ui,
        inputs=[
            text_input,
            language_input,
            speaker_wav_input,
            speed_input,
            temperature_input
        ],
        outputs=audio_output
    )

if __name__ == "__main__":
    # When run as a script, serve the app on all network interfaces, port 7860.
    demo.launch(server_port=7860, server_name="0.0.0.0")