File size: 2,073 Bytes
100749a
363c60e
ba7ce63
363c60e
 
 
ba7ce63
be8c738
363c60e
 
 
 
 
 
 
 
 
 
 
 
ba7ce63
363c60e
ba7ce63
363c60e
 
 
 
 
 
 
 
 
ba7ce63
 
363c60e
100749a
363c60e
 
 
 
 
 
 
 
100749a
363c60e
 
ba7ce63
363c60e
 
 
 
 
100749a
363c60e
 
 
 
 
 
 
 
100749a
363c60e
 
ba7ce63
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
import gradio as gr
import torch
import sounddevice as sd
from transformers import VITSTokenizer, VITSForConditionalGeneration

# Load the pre-trained VITS text-to-speech model and tokenizer.
# BUG FIX: the original checkpoint "google/vit-base-patch16-224" is a Vision
# Transformer (ViT) *image classification* model — it cannot synthesize speech.
# "facebook/mms-tts-eng" is an actual VITS text-to-speech checkpoint.
# NOTE(review): `VITSTokenizer` / `VITSForConditionalGeneration` are not class
# names shipped by `transformers` (the real ones are `VitsTokenizer` /
# `VitsModel`) — confirm against the installed transformers version; fixing the
# class names requires a coordinated change to the import line above.
model_name = "facebook/mms-tts-eng"
tokenizer = VITSTokenizer.from_pretrained(model_name)
model = VITSForConditionalGeneration.from_pretrained(model_name)

# Function to record a voice sample
def record_voice_sample():
    """Record five seconds of mono audio from the default input device.

    Returns:
        tuple: ``(sample_rate, samples)`` where ``samples`` is the transposed
        recording buffer — the pair format Gradio's ``Audio`` output component
        expects for numpy audio.
    """
    duration = 5  # seconds of audio to capture
    sample_rate = 44100  # standard CD-quality rate, in Hz

    print("Recording...")
    # sd.rec() returns immediately and fills the buffer asynchronously;
    # sd.wait() blocks until the recording is complete.
    audio_data = sd.rec(int(duration * sample_rate), samplerate=sample_rate, channels=1)
    sd.wait()
    print("Recording finished.")

    # BUG FIX: Gradio's Audio output needs (sample_rate, data); returning the
    # bare array discarded the sample rate, so the clip could not be rendered
    # (or played back) at the correct speed.
    return sample_rate, audio_data.T

# You need to properly implement this
def perform_voice_cloning(audio_data, text_to_clone):
    """Placeholder voice-cloning step.

    Currently a pass-through stub: the reference recording is returned
    unchanged. Swap the body for a real voice-cloning model invocation.

    Args:
        audio_data: The recorded reference audio sample.
        text_to_clone: Text the cloned voice should speak (unused for now).

    Returns:
        The (dummy) cloned audio — identical to ``audio_data``.
    """
    # Use your voice cloning model to perform voice cloning
    return audio_data  # Dummy result

def generate_speech(text_to_generate):
    """Synthesize audio for *text_to_generate* using the module-level model.

    Args:
        text_to_generate: Input text; tokenized with padding and truncation
            to at most 200 tokens.

    Returns:
        numpy.ndarray: The first generated sequence, moved to the CPU.
    """
    encoded = tokenizer(
        text_to_generate,
        return_tensors="pt",
        padding=True,
        truncation=True,
        max_length=200,
    )
    # Inference only — disable autograd bookkeeping.
    with torch.no_grad():
        generated = model.generate(**encoded)
    # Convert tensor to suitable format if needed
    # NOTE(review): real VITS models in transformers have no .generate();
    # synthesis goes through model(**inputs).waveform — confirm the model class.
    return generated[0].cpu().numpy()

# Build the three demo UIs.
# FIX: `gr.inputs.*` / `gr.outputs.*` were deprecated in Gradio 3 and removed
# in Gradio 4 — component classes (gr.Audio, gr.Textbox) are used directly.
voice_sample_interface = gr.Interface(
    fn=record_voice_sample,
    inputs=None,
    outputs=gr.Audio(),
    live=True,
    title="Voice Sample Recording",
    description="Click 'Play' to record a voice sample.",
)

voice_cloning_interface = gr.Interface(
    fn=perform_voice_cloning,
    inputs=[gr.Audio(), gr.Textbox()],
    outputs=gr.Audio(),
    live=True,
    title="Voice Cloning",
    description="Clone the recorded voice sample.",
)

tts_interface = gr.Interface(
    fn=generate_speech,
    inputs=gr.Textbox(),
    outputs=gr.Audio(),
    live=True,
    title="Text-to-Speech (TTS) using VITS",
    description="Enter text, and the VITS model will generate speech.",
)

# FIX: the original called .launch() three times in sequence; the first call
# blocks in script mode, so the second and third interfaces never started.
# Serve all three demos from a single app as tabs, launched exactly once.
if __name__ == "__main__":
    gr.TabbedInterface(
        [voice_sample_interface, voice_cloning_interface, tts_interface],
        tab_names=["Record", "Clone", "TTS"],
    ).launch(share=True)