File size: 4,318 Bytes
08a0d1e
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
import gradio as gr
import torch
import tempfile
import os
from TTS.api import TTS

# Choose the compute device: CUDA when torch reports it available, CPU otherwise.
if torch.cuda.is_available():
    device = "cuda"
else:
    device = "cpu"
print(f"Using device: {device}")

# Load the multilingual XTTS v2 model once at import time and move it to the
# selected device.
tts = TTS("tts_models/multilingual/multi-dataset/xtts_v2").to(device)

# Language codes offered to the user for synthesis.
supported_languages = [
    "en",
    "es",
    "fr",
    "de",
    "it",
    "pt",
    "pl",
    "tr",
    "ru",
    "nl",
    "cs",
    "ar",
    "zh-cn",
    "ja",
    "hu",
    "ko",
]

def generate_speech(
    text, 
    language, 
    speaker_wav=None, 
    voice_preset=None,
    speed=1.0,
    temperature=0.7
):
    """
    Generate speech from text using XTTS model

    Args:
        text: Text to synthesize; must contain non-whitespace characters.
        language: Language code forwarded to the model (e.g. "en").
        speaker_wav: Optional path to a reference recording used for voice
            cloning. When given it takes precedence over ``voice_preset``.
        voice_preset: Optional name of a built-in speaker, used only when no
            reference recording is supplied. Non-string values are ignored.
        speed: Speed value forwarded to ``tts.tts_to_file``.
        temperature: Temperature value forwarded to ``tts.tts_to_file``.

    Returns:
        Path of a temporary ``.wav`` file containing the generated audio.
        The file is created with ``delete=False``, so the caller owns it.

    Raises:
        gr.Error: If the text is empty, no voice can be resolved, or the
            model fails during synthesis.
    """
    # Reject empty input up front instead of surfacing an opaque model error.
    if not text or not text.strip():
        raise gr.Error("Error generating speech: input text is empty")

    synthesis_args = {
        "text": text,
        "language": language,
        "speed": speed,
        "temperature": temperature,
    }

    if speaker_wav is not None:
        # A reference recording was provided: clone that voice.
        synthesis_args["speaker_wav"] = speaker_wav
    else:
        # The multi-speaker model needs either a reference recording or a
        # named speaker; calling it with neither is rejected by the TTS API.
        speaker_name = None
        if isinstance(voice_preset, str) and voice_preset.strip():
            speaker_name = voice_preset
        else:
            try:
                available_speakers = list(getattr(tts, "speakers", None) or [])
            except Exception:
                # Speaker listing is best-effort; fall through to the
                # explicit error below when it is unavailable.
                available_speakers = []
            if available_speakers:
                speaker_name = available_speakers[0]

        if speaker_name is None:
            raise gr.Error(
                "Error generating speech: no reference voice was provided "
                "and the model exposes no built-in speaker"
            )
        synthesis_args["speaker"] = speaker_name

    # Reserve an output path only after validation so a rejected request
    # does not leave an empty file behind.
    with tempfile.NamedTemporaryFile(suffix=".wav", delete=False) as tmp_file:
        output_path = tmp_file.name

    try:
        tts.tts_to_file(file_path=output_path, **synthesis_args)
        return output_path
    except Exception as e:
        # Clean up temporary file if error occurs
        if os.path.exists(output_path):
            os.unlink(output_path)
        raise gr.Error(f"Error generating speech: {str(e)}") from e

def _generate_from_ui(text, language, speaker_wav, speed, temperature):
    """Adapt the five UI inputs to ``generate_speech``.

    ``generate_speech`` has a ``voice_preset`` parameter between
    ``speaker_wav`` and ``speed`` that the UI does not expose, so the slider
    values are passed by keyword to keep them from shifting one position.
    """
    return generate_speech(
        text,
        language,
        speaker_wav=speaker_wav,
        speed=speed,
        temperature=temperature,
    )


# Create Gradio interface
with gr.Blocks(title="XTTS Text-to-Speech") as demo:
    gr.Markdown("# XTTS Text-to-Speech Generator")
    gr.Markdown("Generate speech from text with voice cloning capabilities using XTTS v2")
    
    with gr.Row():
        # Left column: all inputs and the trigger button.
        with gr.Column():
            text_input = gr.Textbox(
                label="Input Text",
                placeholder="Enter text to convert to speech...",
                lines=3
            )
            
            language_input = gr.Dropdown(
                label="Language",
                # Display name and value are the same language code.
                choices=list(supported_languages),
                value="en",
                info="Select the language for synthesis"
            )
            
            # gr.Audio has no `info` argument, so the hint is rendered as a
            # separate Markdown line below the component.
            speaker_wav_input = gr.Audio(
                label="Reference Voice (Optional)",
                type="filepath"
            )
            gr.Markdown("Upload a 3-10 second audio sample for voice cloning")
            
            with gr.Accordion("Advanced Settings", open=False):
                speed_input = gr.Slider(
                    label="Speed",
                    minimum=0.5,
                    maximum=2.0,
                    value=1.0,
                    step=0.1,
                    info="Speech speed (0.5 = slow, 2.0 = fast)"
                )
                
                temperature_input = gr.Slider(
                    label="Temperature",
                    minimum=0.1,
                    maximum=1.0,
                    value=0.7,
                    step=0.1,
                    info="Voice variability (lower = more deterministic)"
                )
            
            generate_btn = gr.Button("Generate Speech", variant="primary")
        
        # Right column: playback of the generated file.
        with gr.Column():
            audio_output = gr.Audio(
                label="Generated Speech",
                type="filepath"
            )
    
    # Examples supply only text and language; the remaining parameters of
    # generate_speech keep their defaults.
    gr.Examples(
        examples=[
            ["Hello, world! This is a sample text to speech generation.", "en"],
            ["Bonjour, comment allez-vous aujourd'hui?", "fr"],
            ["Hola, ¿cómo estás?", "es"],
        ],
        inputs=[text_input, language_input],
        outputs=audio_output,
        fn=generate_speech,
        cache_examples=True
    )
    
    generate_btn.click(
        fn=_generate_from_ui,
        inputs=[
            text_input, 
            language_input, 
            speaker_wav_input, 
            speed_input, 
            temperature_input
        ],
        outputs=audio_output
    )

if __name__ == "__main__":
    # Start the web server only when run as a script: listen on every
    # network interface, on port 7860.
    launch_options = {"server_name": "0.0.0.0", "server_port": 7860}
    demo.launch(**launch_options)