Shanuka01 committed on
Commit
363c60e
·
1 Parent(s): 100749a

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +75 -28
app.py CHANGED
@@ -1,37 +1,84 @@
1
- import os
2
- import torch
3
- from TTS.api import TTS
4
  import gradio as gr
5
- from TTS.tts.configs.xtts_config import XttsConfig
6
- from TTS.tts.models.xtts import Xtts
7
- from scipy.io.wavfile import write
8
- import numpy as np
9
- from transformers import pipeline
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
10
 
11
 
 
 
 
 
 
 
 
 
 
12
 
13
- def tts_generate(text, speaker_wav="model2.mp3"):
14
- # Get device
15
- device = device = "cuda" if torch.cuda.is_available() else "cpu"
16
 
17
- # Run TTS
 
 
 
 
 
 
 
18
 
19
- asr = pipeline("Text-to-Speech", model="coqui/XTTS-v1")
20
 
21
- # generate speech by cloning a voice using default settings
22
- tts.tts_to_file(text=text, speaker_wav="voice_models/" + speaker_wav, language="en", file_path="output.wav")
23
- return "output.wav"
24
-
25
- def greet(name):
26
- return "Hello " + name + "!!"
 
 
27
 
28
- iface = gr.Interface(fn=tts_generate,
29
- inputs=["text", "text"],
30
- outputs=["audio"],
31
- examples=[
32
- ["Hello Jhon. Welcome to our group.", "model1.wav"],
33
- ["Hello Jhon. Welcome to our group.", "model2.mp3"]]
34
- )
35
- iface.launch(share=True, debug=True)
36
 
37
- # tts_generate("Hello Jhon. Welcome to our group.", "model2.mp3")
 
 
 
 
 
 
 
1
  import gradio as gr
2
+ import torch
3
+ from transformers import VITSTokenizer, VITSForConditionalGeneration
4
+
5
+
6
+ # Load the pre-trained VITS model and tokenizer
7
+ model_name = "user/vits-large-melgan-ljspeech" # Replace with your desired VITS model
8
+ tokenizer = VITSTokenizer.from_pretrained(model_name)
9
+ model = VITSForConditionalGeneration.from_pretrained(model_name)
10
+
11
+
12
+ # Function to record a voice sample
13
+ def record_voice_sample():
14
+ duration = 5 # Record for 5 seconds
15
+ sample_rate = 44100 # Standard sample rate
16
+
17
+
18
+ print("Recording...")
19
+
20
+
21
+ audio_data = sd.rec(int(duration * sample_rate), samplerate=sample_rate, channels=1)
22
+ sd.wait()
23
+
24
+
25
+ print("Recording finished.")
26
+
27
+
28
+ return audio_data
29
+
30
+
31
+ # Function to perform voice cloning (replace with your actual voice cloning model)
32
+ def perform_voice_cloning(audio_data, text_to_clone):
33
+ # Use your voice cloning model to perform voice cloning
34
+ # Replace this code with your actual voice cloning model
35
+ cloned_audio = audio_data # Dummy result
36
+
37
+
38
+ return cloned_audio
39
+
40
+
41
+ # Function to perform text-to-speech (TTS) using the VITS model
42
+ def generate_speech(text_to_generate):
43
+ inputs = tokenizer(text_to_generate, return_tensors="pt", padding=True, truncation=True, max_length=200)
44
+ with torch.no_grad():
45
+ output = model.generate(**inputs)
46
+ generated_audio = output[0].numpy()
47
+ return generated_audio
48
 
49
 
50
+ # Create Gradio interfaces for each step
51
+ voice_sample_interface = gr.Interface(
52
+ fn=record_voice_sample,
53
+ inputs=None,
54
+ outputs=gr.outputs.Audio(),
55
+ live=True,
56
+ title="Voice Sample Recording",
57
+ description="Click 'Play' to record a voice sample.",
58
+ )
59
 
 
 
 
60
 
61
+ voice_cloning_interface = gr.Interface(
62
+ fn=perform_voice_cloning,
63
+ inputs=gr.inputs.Audio(),
64
+ outputs=gr.outputs.Audio(),
65
+ live=True,
66
+ title="Voice Cloning",
67
+ description="Clone the recorded voice sample.",
68
+ )
69
 
 
70
 
71
+ tts_interface = gr.Interface(
72
+ fn=generate_speech,
73
+ inputs=gr.inputs.Textbox(),
74
+ outputs=gr.outputs.Audio(),
75
+ live=True,
76
+ title="Text-to-Speech (TTS) using VITS",
77
+ description="Enter text, and the VITS model will generate speech.",
78
+ )
79
 
 
 
 
 
 
 
 
 
80
 
81
+ # Launch Gradio interfaces
82
+ voice_sample_interface.launch()
83
+ voice_cloning_interface.launch()
84
+ tts_interface.launch()