# Captured from a Hugging Face Spaces file view.
# Space status at capture time: Runtime error
# File size: 4,318 bytes; revision 08a0d1e
# (The viewer's line-number gutter, 1-141, was removed; it is not part of the program.)
import gradio as gr
import torch
import tempfile
import os
from TTS.api import TTS
# Run on the GPU when torch can see one; otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = "cuda"
else:
    device = "cpu"
print(f"Using device: {device}")

# Load the XTTS v2 checkpoint once at import time and move it to `device`.
tts = TTS("tts_models/multilingual/multi-dataset/xtts_v2").to(device)

# Language codes offered by the UI dropdown. This list is hard-coded here,
# not queried from the loaded model.
supported_languages = [
    "en",
    "es",
    "fr",
    "de",
    "it",
    "pt",
    "pl",
    "tr",
    "ru",
    "nl",
    "cs",
    "ar",
    "zh-cn",
    "ja",
    "hu",
    "ko",
]
def generate_speech(
    text,
    language,
    speaker_wav=None,
    voice_preset=None,
    speed=1.0,
    temperature=0.7
):
    """Synthesize ``text`` with the module-level XTTS model into a WAV file.

    Args:
        text: Text to speak; must contain at least one non-whitespace
            character.
        language: Language code forwarded to the model (e.g. ``"en"``).
        speaker_wav: Optional path to a reference recording. When given, it
            is passed to the model as ``speaker_wav`` for voice cloning.
        voice_preset: Optional name of a built-in speaker. Only consulted
            when ``speaker_wav`` is not supplied; non-string values are
            ignored.
        speed: Forwarded unchanged to ``tts.tts_to_file``.
        temperature: Forwarded unchanged to ``tts.tts_to_file``.

    Returns:
        Path of a newly created temporary ``.wav`` file. The file is not
        deleted by this function on success.

    Raises:
        gr.Error: If ``text`` is empty or if synthesis fails for any reason.
    """
    # Reject empty input up front so no temp file is created for it.
    if not text or not text.strip():
        raise gr.Error("Please enter some text to convert to speech.")

    # Reserve a unique output path; delete=False keeps the file after the
    # handle is closed so the model can write to it and the caller can read it.
    with tempfile.NamedTemporaryFile(suffix=".wav", delete=False) as tmp_file:
        output_path = tmp_file.name

    try:
        synthesis_kwargs = {
            "text": text,
            "file_path": output_path,
            "language": language,
            "speed": speed,
            "temperature": temperature,
        }

        if speaker_wav:
            # Voice cloning from the supplied reference recording.
            synthesis_kwargs["speaker_wav"] = speaker_wav
        else:
            # Without a reference clip, a multi-speaker model needs a named
            # speaker. Prefer the caller's preset, else the first built-in
            # voice the model reports.
            # NOTE(review): assumes `tts.speakers` lists built-in voice names
            # for XTTS v2 - confirm against the installed TTS version.
            speaker = None
            if isinstance(voice_preset, str) and voice_preset.strip():
                speaker = voice_preset
            else:
                builtin_speakers = list(getattr(tts, "speakers", None) or [])
                if builtin_speakers:
                    speaker = builtin_speakers[0]
            if speaker is not None:
                synthesis_kwargs["speaker"] = speaker

        tts.tts_to_file(**synthesis_kwargs)
        return output_path
    except Exception as e:
        # Do not leave an orphaned temp file behind when synthesis fails.
        if os.path.exists(output_path):
            os.unlink(output_path)
        raise gr.Error(f"Error generating speech: {str(e)}") from e
def _generate_from_ui(text, language, speaker_wav, speed, temperature):
    """Map the five UI inputs onto ``generate_speech`` by keyword.

    ``generate_speech`` has a ``voice_preset`` parameter between
    ``speaker_wav`` and ``speed``; passing the UI values positionally would
    shift speed and temperature into the wrong parameters.
    """
    return generate_speech(
        text,
        language,
        speaker_wav=speaker_wav,
        speed=speed,
        temperature=temperature,
    )


# Create Gradio interface
with gr.Blocks(title="XTTS Text-to-Speech") as demo:
    gr.Markdown("# XTTS Text-to-Speech Generator")
    gr.Markdown("Generate speech from text with voice cloning capabilities using XTTS v2")

    with gr.Row():
        # Left column: all inputs and the trigger button.
        with gr.Column():
            text_input = gr.Textbox(
                label="Input Text",
                placeholder="Enter text to convert to speech...",
                lines=3
            )
            language_input = gr.Dropdown(
                label="Language",
                choices=[(lang, lang) for lang in supported_languages],
                value="en",
                info="Select the language for synthesis"
            )
            # The hint is rendered as Markdown below the component rather
            # than passed as an `info=` keyword to gr.Audio.
            speaker_wav_input = gr.Audio(
                label="Reference Voice (Optional)",
                type="filepath"
            )
            gr.Markdown("Upload a 3-10 second audio sample for voice cloning")

            with gr.Accordion("Advanced Settings", open=False):
                speed_input = gr.Slider(
                    label="Speed",
                    minimum=0.5,
                    maximum=2.0,
                    value=1.0,
                    step=0.1,
                    info="Speech speed (0.5 = slow, 2.0 = fast)"
                )
                temperature_input = gr.Slider(
                    label="Temperature",
                    minimum=0.1,
                    maximum=1.0,
                    value=0.7,
                    step=0.1,
                    info="Voice variability (lower = more deterministic)"
                )

            generate_btn = gr.Button("Generate Speech", variant="primary")

        # Right column: the synthesized result.
        with gr.Column():
            audio_output = gr.Audio(
                label="Generated Speech",
                type="filepath"
            )

    # Examples are run on click instead of being cached eagerly: eager
    # caching calls the model for every example at startup, so a single
    # synthesis failure would prevent the app from launching.
    gr.Examples(
        examples=[
            ["Hello, world! This is a sample text to speech generation.", "en"],
            ["Bonjour, comment allez-vous aujourd'hui?", "fr"],
            ["Hola, ¿cómo estás?", "es"],
        ],
        inputs=[text_input, language_input],
        outputs=audio_output,
        fn=generate_speech,
        cache_examples=False,
        run_on_click=True
    )

    generate_btn.click(
        fn=_generate_from_ui,
        inputs=[
            text_input,
            language_input,
            speaker_wav_input,
            speed_input,
            temperature_input
        ],
        outputs=audio_output
    )
# Launch the web server only when run as a script; binds to all interfaces
# on port 7860.
if __name__ == "__main__":
    demo.launch(server_name="0.0.0.0", server_port=7860)