import tempfile

import gradio as gr
import soundfile as sf
import torch
from tortoise.api import TextToSpeech
from tortoise.utils.audio import load_audio
# Build the synthesizer once at import time so every request handled by
# generate_speech() reuses the same instance instead of constructing a new one.
tts = TextToSpeech()
def generate_speech(reference_audio_path, text):
    """Synthesize ``text`` in the voice of a reference recording.

    Args:
        reference_audio_path: Filesystem path to the reference voice clip.
            It is read via ``load_audio(reference_audio_path, 22050)``.
        text: The string to synthesize.

    Returns:
        Path to a temporary ``.wav`` file written with a 24 kHz sample rate.
        The file is created with ``delete=False``, so it is not removed
        automatically.

    Raises:
        ValueError: If no reference clip path was given or ``text`` is
            empty / whitespace-only.
    """
    # The UI can trigger this callback before a clip is uploaded or any text
    # is typed; fail early with a clear message instead of an opaque error
    # from deep inside the audio loader or the model.
    if not reference_audio_path:
        raise ValueError("Please upload a reference voice clip.")
    if not text or not text.strip():
        raise ValueError("Please enter some text to synthesize.")

    ref_waveform = load_audio(reference_audio_path, 22050)

    output_tensor = tts.tts_with_preset(
        text,
        voice_samples=[ref_waveform],
        preset="fast",
    )

    # Drop singleton dimensions and move to host memory for soundfile.
    wav_np = output_tensor.squeeze().cpu().numpy()

    # Only the unique path is needed: close the handle before soundfile
    # reopens the file, so no descriptor is leaked per request and the write
    # also works on platforms that refuse to reopen an already-open temp file.
    with tempfile.NamedTemporaryFile(suffix=".wav", delete=False) as tmp:
        out_path = tmp.name
    sf.write(out_path, wav_np, samplerate=24000)
    return out_path
# Gradio UI: reference clip + text in, synthesized audio out.
# NOTE(review): the original indentation was lost, so the nesting below is
# reconstructed from the blank-line grouping (the two inputs share the Row;
# the button and the output player sit beneath it) -- confirm against the
# intended layout.
# NOTE(review): the emoji in the labels were mis-encoded in the original; they
# are restored from the surviving bytes. The glyph on the button could not be
# recovered, so a speaker icon is used there -- adjust if another was intended.
with gr.Blocks(title="Tortoise Voice Cloning TTS") as app:
    gr.Markdown("## 🗣️ Voice Cloning with Tortoise TTS")
    gr.Markdown(
        "Upload a ~10 sec WAV clip (22 050 Hz), enter English text, "
        "and hear it spoken back in **your** voice!"
    )

    with gr.Row():
        # type="filepath" makes the callback receive a path string.
        voice_sample = gr.Audio(
            type="filepath",
            label="🎙️ Upload Reference Voice (22 050 Hz WAV)",
        )
        text_input = gr.Textbox(
            label="💬 Text to Synthesize",
            placeholder="e.g., Hello, world!",
        )

    generate_btn = gr.Button("🔊 Generate Speech")
    output_audio = gr.Audio(
        label="📢 Cloned Speech Output (24 kHz)",
        interactive=False,
    )

    # Wire the button to the synthesis callback; its returned file path is
    # handed to the output audio component.
    generate_btn.click(
        fn=generate_speech,
        inputs=[voice_sample, text_input],
        outputs=output_audio,
    )
# Start the web server only when executed as a script, not when imported.
if __name__ == "__main__":
    app.launch()