| import gradio as gr |
| import torch |
| import numpy as np |
|
|
| |
| from encoder import inference as encoder |
| from synthesizer.inference import Synthesizer |
| from vocoder import inference as vocoder |
|
|
| |
# Checkpoint locations for the three pretrained models in the pipeline.
ENCODER_WEIGHTS = "encoder/saved_models/pretrained.pt"
SYNTHESIZER_WEIGHTS = "synthesizer/saved_models/pretrained/pretrained.pt"
VOCODER_WEIGHTS = "vocoder/saved_models/pretrained/pretrained.pt"

# Load everything once at import time so each request reuses the warm models.
encoder.load_model(ENCODER_WEIGHTS)
synthesizer = Synthesizer(SYNTHESIZER_WEIGHTS)
vocoder.load_model(VOCODER_WEIGHTS)
|
|
def clone_voice(reference_audio, text):
    """Clone the voice in *reference_audio* and speak *text* with it.

    Args:
        reference_audio: ``(sample_rate, audio)`` tuple as produced by a
            ``gr.Audio(type="numpy")`` input; ``audio`` is typically an
            int16 PCM numpy array, mono or stereo.
        text: Sentence to synthesize in the cloned voice.

    Returns:
        ``(sample_rate, waveform)`` tuple consumable by a ``gr.Audio``
        output, where ``waveform`` is a float numpy array.
    """
    sample_rate, audio = reference_audio

    # Gradio delivers integer PCM; the encoder pipeline works on float
    # waveforms in [-1, 1], so normalize by the dtype's full-scale value.
    if np.issubdtype(audio.dtype, np.integer):
        audio = audio.astype(np.float32) / np.iinfo(audio.dtype).max

    # Stereo uploads: collapse channels to mono before embedding.
    # NOTE(review): assumes channel axis is last, as Gradio emits — confirm.
    if audio.ndim > 1:
        audio = audio.mean(axis=1)

    # BUG FIX: the original discarded the recording's sample rate, so any
    # reference clip not already at the encoder's native rate was consumed
    # at the wrong rate. Pass it so preprocess_wav can resample correctly.
    preprocessed_wav = encoder.preprocess_wav(audio, source_sr=sample_rate)

    # Derive the speaker embedding from the reference utterance.
    embed = encoder.embed_utterance(preprocessed_wav)

    # Synthesize a mel spectrogram for the text, conditioned on the embedding.
    specs = synthesizer.synthesize_spectrograms([text], [embed])

    # Vocode the spectrogram into an audible waveform.
    generated_wav = vocoder.infer_waveform(specs[0])

    # Append one second of silence so the tail isn't clipped by playback.
    generated_wav = np.pad(generated_wav, (0, synthesizer.sample_rate), mode="constant")

    return (synthesizer.sample_rate, generated_wav)
|
|
| |
# Web UI: upload a reference clip, enter a sentence, and hear it spoken
# back in the cloned voice.
reference_input = gr.Audio(source="upload", type="numpy", label="Voz de Referencia")
text_input = gr.Textbox(label="Texto a Clonar")
cloned_output = gr.Audio(label="Voz Clonada")

demo = gr.Interface(
    fn=clone_voice,
    inputs=[reference_input, text_input],
    outputs=cloned_output,
)

if __name__ == "__main__":
    demo.launch()
|
|