optigesr committed on
Commit
00e8925
·
1 Parent(s): 7b1dda1

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +151 -1
app.py CHANGED
@@ -1,2 +1,152 @@
1
  import os
2
- os.system("python setup.py install")
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
  import os
2
+ os.system("python setup.py install")
3
+ import gradio as gr
4
+ import torchaudio
5
+ import time
6
+ from datetime import datetime
7
+ os.system("git clone https://github.com/neonbjb/tortoise-tts.git")
8
+ os.system("pip install -r ./tortoise-tts/requirements.txt")
9
+ os.system("python ./tortoise-tts/setup.py install")
10
+ from tortoise.api import TextToSpeech
11
+ from tortoise.utils.audio import load_audio, load_voice, load_voices
12
+
13
# Pseudo-voices offered in the voice dropdowns in addition to the voice
# folders found on disk.
VOICE_OPTIONS = [
    "random",  # special option for random voice
    "custom_voice",  # special option for custom voice
    "disabled",  # special option for disabled voice
]
# NOTE: the tortoise-tts clone / requirements install / setup.py install
# commands used to be repeated here. They already run before the
# `from tortoise...` imports, so the second pass only re-ran a failing
# `git clone` (target directory exists) and a redundant reinstall on every
# start-up; it has been dropped.
21
+
22
def inference(text, emotion, prompt, voice, mic_audio, voice_b, voice_c, preset, seed):
    """Synthesize three candidate clips for ``text`` and return them for the UI.

    Args:
        text: Text to speak.
        emotion: Selected emotion; any value other than "None/Custom" is
            prepended to the text as a bracketed cue.
        prompt: Free-form cue, used only when ``emotion`` is "None/Custom"
            and the prompt is not blank.
        voice: Primary voice name, or "custom_voice" to use ``mic_audio``.
        mic_audio: File path of the recorded clip (required for custom voice).
        voice_b: Optional second voice; "disabled" to skip it.
        voice_c: Optional third voice; "disabled" to skip it.
        preset: Preset name forwarded to ``tts.tts_with_preset``.
        seed: Value forwarded as ``use_deterministic_seed``.

    Returns:
        A 4-tuple: the first reference sample as ``(22050, ndarray)`` (or
        ``None`` when no raw reference clip is available), followed by the
        three generated candidates as ``(24000, ndarray)``.

    Raises:
        gr.Error: If "custom_voice" is selected but nothing was recorded.
    """
    use_custom = voice == "custom_voice"

    # Named voices to load from disk; the custom recording is handled apart.
    voices = [] if use_custom else [voice]
    voices.extend(extra for extra in (voice_b, voice_c) if extra != "disabled")

    if emotion != "None/Custom":
        text = f"[I am really {emotion.lower()},] {text}"
    elif prompt.strip() != "":
        text = f"[{prompt},] {text}"

    custom_clip = None
    if use_custom:
        if mic_audio is None:
            raise gr.Error("Please provide audio from mic when choosing custom voice")
        custom_clip = load_audio(mic_audio, 22050)

    if not voices:
        # Only reachable with custom_voice and both extra voices disabled.
        voice_samples, conditioning_latents = [custom_clip], None
    elif len(voices) == 1 and not use_custom:
        voice_samples, conditioning_latents = load_voice(voice)
    else:
        # Also covers custom_voice + exactly one extra voice, which used to
        # fall into the custom-only branch and silently drop the extra voice.
        voice_samples, conditioning_latents = load_voices(voices)
        if use_custom:
            # The loader may hand back no raw clips (None); start a fresh
            # list instead of calling .extend() on None.
            # NOTE(review): if the loader returned latents only, both the
            # recording and those latents are forwarded - confirm how
            # tts_with_preset prioritises them.
            voice_samples = list(voice_samples or []) + [custom_clip]

    # Truthiness check instead of len(): voice_samples can be None.
    sample_voice = voice_samples[0] if voice_samples else None

    start_time = time.time()
    gen, _ = tts.tts_with_preset(
        text,
        voice_samples=voice_samples,
        conditioning_latents=conditioning_latents,
        preset=preset,
        use_deterministic_seed=seed,
        return_deterministic_state=True,
        k=3,
    )
    elapsed = time.time() - start_time

    with open("Tortoise_TTS_Runs.log", "a") as f:
        f.write(
            f"{datetime.now()} | Voice: {','.join(voices)} | Text: {text} | Quality: {preset} | Time Taken (s): {elapsed} | Seed: {seed}\n"
        )

    # No reference clip (e.g. a voice without raw samples): leave that
    # output empty instead of crashing on None.squeeze().
    reference = None
    if sample_voice is not None:
        reference = (22050, sample_voice.squeeze().cpu().numpy())

    return (
        reference,
        (24000, gen[0].squeeze().cpu().numpy()),
        (24000, gen[1].squeeze().cpu().numpy()),
        (24000, gen[2].squeeze().cpu().numpy()),
    )
79
+
80
+
81
def main():
    """Build the Gradio interface around ``inference`` and launch it.

    The interface is launched with ``share=True``.
    """
    # os.listdir returns entries in arbitrary order; scan the directory once
    # and sort so all three dropdowns show the same, stable list.
    voice_choices = sorted(os.listdir(os.path.join("tortoise", "voices"))) + VOICE_OPTIONS

    def voice_dropdown(value, label):
        # Each dropdown gets its own copy of the choices list.
        return gr.Dropdown(
            list(voice_choices),
            value=value,
            label=label,
            type="value",
        )

    text = gr.Textbox(lines=4, label="Text:")
    emotion = gr.Radio(
        ["None/Custom", "Happy", "Sad", "Angry", "Disgusted", "Arrogant"],
        value="None/Custom",
        label="Select emotion:",
        type="value",
    )
    prompt = gr.Textbox(lines=1, label="Enter prompt if [Custom] emotion:")
    preset = gr.Radio(
        ["ultra_fast", "fast", "standard", "high_quality"],
        value="fast",
        label="Preset mode (determines quality with tradeoff over speed):",
        type="value",
    )
    voice = voice_dropdown("angie", "Select voice:")
    mic_audio = gr.Audio(
        label="Record voice (when selected custom_voice):",
        source="microphone",
        type="filepath",
    )
    voice_b = voice_dropdown("disabled", "(Optional) Select second voice:")
    voice_c = voice_dropdown("disabled", "(Optional) Select third voice:")
    seed = gr.Number(value=0, precision=0, label="Seed (for reproducibility):")

    selected_voice = gr.Audio(label="Sample of selected voice (first):")
    output_audio_1 = gr.Audio(label="Output [Candidate 1]:")
    output_audio_2 = gr.Audio(label="Output [Candidate 2]:")
    output_audio_3 = gr.Audio(label="Output [Candidate 3]:")

    # Input order must match the positional parameters of inference().
    interface = gr.Interface(
        fn=inference,
        inputs=[
            text,
            emotion,
            prompt,
            voice,
            mic_audio,
            voice_b,
            voice_c,
            preset,
            seed,
        ],
        outputs=[selected_voice, output_audio_1, output_audio_2, output_audio_3],
    )
    interface.launch(share=True)
142
+
143
+
144
if __name__ == "__main__":
    # Module-level model instance; inference() reads it as the global `tts`.
    tts = TextToSpeech()

    # Mark the start of this session in the shared run log before serving.
    session_banner = f"\n\n-------------------------Tortoise TTS Logs, {datetime.now()}-------------------------\n"
    with open("Tortoise_TTS_Runs.log", "a") as log_file:
        log_file.write(session_banner)

    main()