# test_03 / app.py
# Shanuka01's Space — commit "Update app.py" (be8c738)
import gradio as gr
import torch
import sounddevice as sd
# NOTE(review): the original imported VITSTokenizer / VITSForConditionalGeneration,
# which do not exist in transformers; the VITS text-to-speech classes are
# VitsTokenizer and VitsModel.
from transformers import VitsModel, VitsTokenizer

# Load a pre-trained VITS text-to-speech checkpoint and its tokenizer.
# The original used "google/vit-base-patch16-224", which is a Vision
# Transformer *image classifier* (ViT), not a TTS model — loading it with
# TTS classes would fail. MMS-TTS English is an actual VITS checkpoint.
model_name = "facebook/mms-tts-eng"
tokenizer = VitsTokenizer.from_pretrained(model_name)
model = VitsModel.from_pretrained(model_name)
# Function to record a voice sample from the default input device.
def record_voice_sample(duration=5, sample_rate=44100):
    """Record a mono voice sample and return it as a numpy array.

    Args:
        duration: Recording length in seconds (default 5, as before).
        sample_rate: Sampling rate in Hz (default 44100, CD quality).

    Returns:
        The captured audio transposed to shape (channels, samples).
    """
    print("Recording...")
    # sd.rec starts an asynchronous capture; sd.wait blocks until it finishes.
    audio_data = sd.rec(int(duration * sample_rate), samplerate=sample_rate, channels=1)
    sd.wait()
    print("Recording finished.")
    return audio_data.T
# Placeholder — a real implementation still needs to be wired in.
def perform_voice_cloning(audio_data, text_to_clone):
    """Stub for the voice-cloning step.

    A real implementation would condition a cloning model on
    ``audio_data`` and synthesize ``text_to_clone`` in that voice.
    Until then, the recorded audio is echoed back unchanged.
    """
    return audio_data  # dummy result: no cloning actually performed
def generate_speech(text_to_generate):
    """Tokenize *text_to_generate* and run the module-level model on it.

    Returns the first element of the model output, moved to CPU and
    converted to a numpy array.
    """
    # Tokenize; inputs are padded and truncated to at most 200 tokens.
    inputs = tokenizer(text_to_generate, return_tensors="pt", padding=True, truncation=True, max_length=200)
    with torch.no_grad():
        # NOTE(review): transformers VITS models expose no .generate() API —
        # speech is produced via model(**inputs).waveform. As written this
        # call will fail at runtime; confirm which model class is intended.
        output = model.generate(**inputs)
    # Convert tensor to suitable format if needed
    generated_audio = output[0].cpu().numpy()
    return generated_audio
# Recording tab. NOTE: gr.outputs.* was deprecated in Gradio 3.x and removed
# in 4.x — components are passed directly (gr.Audio()).
voice_sample_interface = gr.Interface(
    fn=record_voice_sample,
    inputs=None,
    outputs=gr.Audio(),
    live=True,
    title="Voice Sample Recording",
    description="Click 'Play' to record a voice sample.",
)
# Cloning tab. NOTE: gr.inputs.*/gr.outputs.* were deprecated in Gradio 3.x
# and removed in 4.x — components are passed directly.
voice_cloning_interface = gr.Interface(
    fn=perform_voice_cloning,
    inputs=[gr.Audio(), gr.Textbox()],
    outputs=gr.Audio(),
    live=True,
    title="Voice Cloning",
    description="Clone the recorded voice sample.",
)
# TTS tab. NOTE: gr.inputs.*/gr.outputs.* were deprecated in Gradio 3.x and
# removed in 4.x — components are passed directly.
tts_interface = gr.Interface(
    fn=generate_speech,
    inputs=gr.Textbox(),
    outputs=gr.Audio(),
    live=True,
    title="Text-to-Speech (TTS) using VITS",
    description="Enter text, and the VITS model will generate speech.",
)
# Launch all three tools as one tabbed app. Calling .launch() on each
# interface in sequence (as the original did) blocks on the first call when
# run as a script, so the second and third interfaces were never reached.
gr.TabbedInterface(
    [voice_sample_interface, voice_cloning_interface, tts_interface],
    ["Voice Sample Recording", "Voice Cloning", "Text-to-Speech"],
).launch(share=True)