|
|
import gradio as gr
import torch
import sounddevice as sd
from transformers import VitsModel, VitsTokenizer

# BUG FIX: the original imported VITSTokenizer / VITSForConditionalGeneration,
# which do not exist in transformers (the VITS classes are VitsTokenizer and
# VitsModel), and pointed at "google/vit-base-patch16-224" — a *Vision*
# Transformer image-classification checkpoint, not a text-to-speech model.
# Use a real VITS TTS checkpoint instead (Meta's MMS English voice).
model_name = "facebook/mms-tts-eng"

tokenizer = VitsTokenizer.from_pretrained(model_name)
model = VitsModel.from_pretrained(model_name)
|
|
|
|
|
|
|
|
def record_voice_sample(duration=5, sample_rate=44100):
    """Record a mono voice sample from the default input device.

    Parameters
    ----------
    duration : int, optional
        Recording length in seconds (default 5). Defaults keep the
        function callable with no arguments, as Gradio does.
    sample_rate : int, optional
        Sampling rate in Hz (default 44100).

    Returns
    -------
    tuple[int, numpy.ndarray]
        ``(sample_rate, samples)`` — the pair Gradio's Audio output
        component expects for numpy audio. (The original returned the
        bare array, which the Audio component cannot interpret.)
    """
    print("Recording...")
    audio_data = sd.rec(int(duration * sample_rate), samplerate=sample_rate, channels=1)
    sd.wait()  # block until the capture buffer is completely filled
    print("Recording finished.")
    # Transpose the (n, 1) capture buffer and pair it with its rate so
    # the Gradio Audio component can play it back.
    return sample_rate, audio_data.T
|
|
|
|
|
|
|
|
def perform_voice_cloning(audio_data, text_to_clone):
    """Placeholder for the voice-cloning stage.

    Real cloning is not implemented yet: the recorded sample is handed
    back untouched and ``text_to_clone`` is currently ignored.
    """
    # TODO: replace this pass-through with an actual cloning model.
    return audio_data
|
|
|
|
|
def generate_speech(text_to_generate):
    """Synthesize speech for ``text_to_generate`` with the VITS model.

    Parameters
    ----------
    text_to_generate : str
        Text to convert to speech (truncated to 200 tokens).

    Returns
    -------
    tuple[int, numpy.ndarray]
        ``(sampling_rate, waveform)`` — the numpy-audio format Gradio's
        Audio output component expects.
    """
    inputs = tokenizer(text_to_generate, return_tensors="pt", padding=True, truncation=True, max_length=200)
    # BUG FIX: VITS is not an autoregressive seq2seq model, so
    # model.generate() would yield token ids rather than audio. The
    # forward pass returns the synthesized waveform directly.
    with torch.no_grad():
        waveform = model(**inputs).waveform
    generated_audio = waveform[0].cpu().numpy()
    # Pair the samples with the model's sampling rate for playback.
    return model.config.sampling_rate, generated_audio
|
|
|
|
|
# Recording UI. NOTE: gr.outputs.* was removed in Gradio 3.x; components
# now live at the top level (gr.Audio, gr.Textbox, ...).
voice_sample_interface = gr.Interface(
    fn=record_voice_sample,
    inputs=None,
    outputs=gr.Audio(),
    live=True,
    title="Voice Sample Recording",
    description="Click 'Play' to record a voice sample.",
)
|
|
|
|
|
# Cloning UI. NOTE: the gr.inputs.* / gr.outputs.* namespaces were
# removed in Gradio 3.x; use the top-level component classes instead.
voice_cloning_interface = gr.Interface(
    fn=perform_voice_cloning,
    inputs=[gr.Audio(), gr.Textbox()],
    outputs=gr.Audio(),
    live=True,
    title="Voice Cloning",
    description="Clone the recorded voice sample.",
)
|
|
|
|
|
# TTS UI. NOTE: the gr.inputs.* / gr.outputs.* namespaces were removed
# in Gradio 3.x; use the top-level component classes instead.
tts_interface = gr.Interface(
    fn=generate_speech,
    inputs=gr.Textbox(),
    outputs=gr.Audio(),
    live=True,
    title="Text-to-Speech (TTS) using VITS",
    description="Enter text, and the VITS model will generate speech.",
)
|
|
|
|
|
# BUG FIX: launch() blocks the main thread when run as a script, so the
# second and third apps would never start. prevent_thread_lock keeps the
# first two non-blocking; the final (shared) launch holds the process open.
voice_sample_interface.launch(prevent_thread_lock=True)
voice_cloning_interface.launch(prevent_thread_lock=True)
tts_interface.launch(share=True)