| | import gradio as gr |
| | import numpy as np |
| | import os |
| | from pathlib import Path |
| | from synthesizer.inference import Synthesizer |
| | from encoder import inference as encoder |
| | from vocoder import inference as vocoder |
| | from pydub import AudioSegment |
| |
|
| | |
# Name of the project checkout; it is used as a relative path, so the
# checkpoints below are resolved against the current working directory.
project_name = "Real-Time-Voice-Cloning"
_models_root = Path(project_name)

# Load the three pretrained models once, at import time: the speaker encoder
# and the vocoder keep their weights as module state, while the synthesizer
# is an object held in ``synthesizer``.
encoder.load_model(_models_root.joinpath("encoder/saved_models/pretrained.pt"))
synthesizer = Synthesizer(
    _models_root.joinpath("synthesizer/saved_models/pretrained/pretrained.pt")
)
vocoder.load_model(
    _models_root.joinpath("vocoder/saved_models/pretrained/pretrained.pt")
)
| |
|
def clone_voice(text, reference_audio):
    """Synthesize ``text`` in the voice heard in ``reference_audio``.

    Args:
        text: Sentence to speak. Must contain at least one non-whitespace
            character.
        reference_audio: Recording of the target speaker. Accepted forms:
            a filesystem path (``str`` or ``os.PathLike``), an object with
            an ``export`` method (e.g. a pydub ``AudioSegment``), or an
            open file object exposing its path as ``.name`` (such as a
            temporary-file wrapper).

    Returns:
        Path of the WAV file containing the generated speech.

    Raises:
        ValueError: If ``text`` is empty/blank or ``reference_audio`` is
            ``None``.
    """
    if text is None or not str(text).strip():
        raise ValueError("Text must not be empty.")
    if reference_audio is None:
        raise ValueError("A reference audio sample is required.")

    # Resolve the reference clip to a path on disk. Path-likes are checked
    # first because ``pathlib.Path.name`` is only the basename, not the path.
    if isinstance(reference_audio, (str, os.PathLike)):
        audio_path = os.fspath(reference_audio)
    elif hasattr(reference_audio, "export"):
        audio_path = "reference_audio.wav"
        reference_audio.export(audio_path, format="wav")
    else:
        audio_path = reference_audio.name

    # Speaker embedding from the reference recording.
    audio = encoder.preprocess_wav(audio_path)
    embedding = encoder.embed_utterance(audio)

    # Text + embedding -> spectrogram -> waveform.
    specs = synthesizer.synthesize_spectrograms([text], [embedding])
    generated_wav = vocoder.infer_waveform(specs[0])

    # Append ``sample_rate`` zero samples (one second of silence) at the end.
    sample_rate = synthesizer.sample_rate
    generated_wav = np.pad(generated_wav, (0, sample_rate), mode="constant")

    # AudioSegment with sample_width=2 needs raw little-endian 16-bit PCM
    # bytes, not a float array. NOTE(review): assumes the vocoder returns
    # float samples nominally in [-1, 1] -- confirm; clipping guards against
    # integer wrap-around for any overshoot.
    pcm16 = (np.clip(generated_wav, -1.0, 1.0) * 32767).astype("<i2")

    # NOTE: fixed output name, so concurrent requests overwrite each other.
    output_path = "output.wav"
    AudioSegment(
        pcm16.tobytes(),
        frame_rate=sample_rate,
        sample_width=2,
        channels=1,
    ).export(output_path, format="wav")
    return output_path
| |
|
# Web front end: a text box and a reference recording go in, the generated
# speech comes out.
iface = gr.Interface(
    fn=clone_voice,
    inputs=[
        gr.Textbox(label="Text"),
        # "filepath" makes the component pass the uploaded clip as a path
        # string; the older "file" value is deprecated in Gradio.
        gr.Audio(label="Reference Audio", type="filepath"),
    ],
    outputs=gr.Audio(label="Generated Audio"),
    title="Real-Time Voice Cloning",
    description="Generate new speech using a reference audio sample and provided text.",
)


# Start the web server only when run as a script, not when imported.
if __name__ == "__main__":
    iface.launch()
| |
|