"""Gradio web UI for text-to-speech synthesis with the Coqui XTTS v2 model."""

import os
import tempfile

import gradio as gr
import torch
from TTS.api import TTS

# Use the GPU when torch can see one, otherwise fall back to the CPU.
device = "cuda" if torch.cuda.is_available() else "cpu"
print(f"Using device: {device}")

# Load the XTTS model once at import time so every request reuses it.
tts = TTS("tts_models/multilingual/multi-dataset/xtts_v2").to(device)

# Language codes offered in the UI dropdown.
supported_languages = [
    "en", "es", "fr", "de", "it", "pt", "pl", "tr",
    "ru", "nl", "cs", "ar", "zh-cn", "ja", "hu", "ko",
]

# Built-in speaker used when the caller supplies neither a reference clip
# nor a voice preset.
# NOTE(review): "Ana Florence" is the speaker name used in the Coqui XTTS
# examples; confirm it is present in `tts.speakers` for the installed model.
DEFAULT_SPEAKER = "Ana Florence"


def generate_speech(
    text,
    language,
    speaker_wav=None,
    voice_preset=None,
    speed=1.0,
    temperature=0.7,
):
    """Synthesize ``text`` to a WAV file and return the file's path.

    Args:
        text: Text to speak. Must contain at least one non-blank character.
        language: Language code passed to the model (see
            ``supported_languages``).
        speaker_wav: Optional path to a reference recording; when given, the
            voice is cloned from it.
        voice_preset: Optional name of a built-in speaker, used only when
            ``speaker_wav`` is not given. Falls back to ``DEFAULT_SPEAKER``.
        speed: Speed value forwarded to ``tts.tts_to_file``.
        temperature: Temperature value forwarded to ``tts.tts_to_file``.

    Returns:
        Path of the temporary ``.wav`` file holding the generated audio. The
        file is not deleted here; the caller owns it.

    Raises:
        gr.Error: If ``text`` is blank or the model fails to synthesize.
    """
    if not text or not text.strip():
        raise gr.Error("Please enter some text to convert to speech.")

    # The model needs a voice: either a reference clip to clone or the name
    # of a built-in speaker. Calling it with neither fails.
    if speaker_wav:
        voice_kwargs = {"speaker_wav": speaker_wav}
    else:
        voice_kwargs = {"speaker": voice_preset or DEFAULT_SPEAKER}

    # Reserve an output path; delete=False keeps the file after the handle
    # closes so it can be written by the model and then served.
    with tempfile.NamedTemporaryFile(suffix=".wav", delete=False) as tmp_file:
        output_path = tmp_file.name

    try:
        tts.tts_to_file(
            text=text,
            file_path=output_path,
            language=language,
            speed=speed,
            temperature=temperature,
            **voice_kwargs,
        )
    except Exception as e:
        # Do not leave an empty/partial file behind on failure.
        if os.path.exists(output_path):
            os.unlink(output_path)
        raise gr.Error(f"Error generating speech: {str(e)}") from e

    return output_path


def _generate_from_ui(text, language, speaker_wav, speed, temperature):
    """Map the Generate button's five inputs onto ``generate_speech`` keywords.

    The UI has no voice-preset control, so passing the inputs positionally
    would shift the speed slider into ``voice_preset`` and the temperature
    slider into ``speed``.
    """
    return generate_speech(
        text,
        language,
        speaker_wav=speaker_wav,
        speed=speed,
        temperature=temperature,
    )


# Create Gradio interface
with gr.Blocks(title="XTTS Text-to-Speech") as demo:
    gr.Markdown("# XTTS Text-to-Speech Generator")
    gr.Markdown(
        "Generate speech from text with voice cloning capabilities using XTTS v2"
    )

    with gr.Row():
        with gr.Column():
            text_input = gr.Textbox(
                label="Input Text",
                placeholder="Enter text to convert to speech...",
                lines=3,
            )
            language_input = gr.Dropdown(
                label="Language",
                choices=[(lang, lang) for lang in supported_languages],
                value="en",
                info="Select the language for synthesis",
            )
            speaker_wav_input = gr.Audio(
                label="Reference Voice (Optional)",
                type="filepath",
            )
            # gr.Audio does not take an `info` argument, so the hint is
            # rendered as its own Markdown element under the component.
            gr.Markdown("Upload a 3-10 second audio sample for voice cloning")

            with gr.Accordion("Advanced Settings", open=False):
                speed_input = gr.Slider(
                    label="Speed",
                    minimum=0.5,
                    maximum=2.0,
                    value=1.0,
                    step=0.1,
                    info="Speech speed (0.5 = slow, 2.0 = fast)",
                )
                temperature_input = gr.Slider(
                    label="Temperature",
                    minimum=0.1,
                    maximum=1.0,
                    value=0.7,
                    step=0.1,
                    info="Voice variability (lower = more deterministic)",
                )

            generate_btn = gr.Button("Generate Speech", variant="primary")

        with gr.Column():
            audio_output = gr.Audio(
                label="Generated Speech",
                type="filepath",
            )

    # Examples supply only text and language; generate_speech's defaults
    # cover the remaining parameters.
    gr.Examples(
        examples=[
            ["Hello, world! This is a sample text to speech generation.", "en"],
            ["Bonjour, comment allez-vous aujourd'hui?", "fr"],
            ["Hola, ¿cómo estás?", "es"],
        ],
        inputs=[text_input, language_input],
        outputs=audio_output,
        fn=generate_speech,
        cache_examples=True,
    )

    generate_btn.click(
        fn=_generate_from_ui,
        inputs=[
            text_input,
            language_input,
            speaker_wav_input,
            speed_input,
            temperature_input,
        ],
        outputs=audio_output,
    )

if __name__ == "__main__":
    # NOTE: 0.0.0.0 binds every network interface, so the app is reachable
    # from other machines that can reach this host.
    demo.launch(server_name="0.0.0.0", server_port=7860)