NeuralFalcon committed on
Commit
1445f36
·
verified ·
1 Parent(s): 9ba6be8

Create app.py

Browse files
Files changed (1) hide show
  1. app.py +225 -0
app.py ADDED
@@ -0,0 +1,225 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import gradio as gr
2
+ import os
3
+ import re
4
+ import uuid
5
+ import scipy.io.wavfile
6
+ import torch
7
+ from pocket_tts import TTSModel
8
+
9
# Load the TTS model once at import time; generate_speech() reuses this
# module-level instance for every request.
#
# The name is bound to None before the attempt so that it always exists: if
# loading fails, the error is reported below and later code can detect the
# missing model with an `is None` check instead of hitting a NameError.
tts_model = None
print("Loading TTS Model...")
try:
    tts_model = TTSModel.load_model()
    print("Model loaded successfully.")
except Exception as e:
    # Best-effort start-up: report the problem and keep going so the rest of
    # the module (including the UI definition) is still evaluated.
    print(f"Error loading model: {e}")
15
+
16
def get_tts_file_name(text, language="en"):
    """Build a unique WAV file path for a piece of synthesized text.

    The file name has the form ``<slug>_<language>_<uid>.wav`` where:

    * ``slug`` is derived from ``text``: ASCII letters only, lower-cased,
      words joined by single underscores, at most 20 characters, falling back
      to ``"audio"`` when nothing usable remains;
    * ``language`` is the lower-cased language tag reduced to
      ``[a-z0-9_-]`` (falling back to ``"en"``);
    * ``uid`` is 8 upper-case hex characters from a random UUID.

    The output directory ``./ai_tts_voice/`` is created if it does not exist.

    Args:
        text: Source text; ``None`` or an empty string is allowed.
        language: Language tag embedded in the file name.

    Returns:
        The path of the (not yet written) WAV file inside ``./ai_tts_voice/``.
    """
    temp_audio_dir = "./ai_tts_voice/"
    os.makedirs(temp_audio_dir, exist_ok=True)

    letters_only = re.sub(r"[^a-zA-Z\s]", "", text or "")
    # str.split() with no argument splits on *any* whitespace run, so newlines
    # and tabs from a multi-line text box never reach the file name and
    # repeated spaces do not produce repeated underscores.
    slug = "_".join(letters_only.lower().split())
    # Truncation can cut right after a separator; drop it to avoid "__".
    slug = slug[:20].strip("_") or "audio"

    uid = uuid.uuid4().hex[:8].upper()

    # The tag is interpolated into a path, so keep only characters that are
    # safe in a file name (this also removes any path separators).
    language = re.sub(r"[^a-z0-9_-]", "", (language or "en").lower()) or "en"

    return os.path.join(temp_audio_dir, f"{slug}_{language}_{uid}.wav")
30
+
31
# Names of the preset voices offered in the "Select Voice" dropdown; the
# chosen name is handed to the model as the audio prompt in
# "Default Voices" mode.
DEFAULT_VOICES = [
    "alba",
    "marius",
    "javert",
    "jean",
    "fantine",
    "cosette",
    "eponine",
    "azelma",
]
35
+
36
def generate_speech(text, mode, preset_voice, clone_audio_path):
    """Synthesize ``text`` to a WAV file and return the file's path.

    Args:
        text: Text to speak; must contain at least one non-whitespace
            character.
        mode: ``"Default Voices"`` uses ``preset_voice``; any other value is
            treated as voice cloning and uses ``clone_audio_path``.
        preset_voice: Name of the preset voice (used in default mode only).
        clone_audio_path: Path of the reference audio (used in clone mode
            only).

    Returns:
        Path of the generated WAV file, as produced by
        ``get_tts_file_name``.

    Raises:
        gr.Error: If the input is missing, the model is unavailable, the
            voice prompt cannot be loaded, or generation/writing fails.
    """
    if not text or not text.strip():
        raise gr.Error("Please enter text to generate speech.")

    # Module start-up swallows load failures, so the global may be unbound or
    # None. Look it up defensively to give the user a clear message instead
    # of a NameError (or a misleading "invalid WAV" error in clone mode).
    model = globals().get("tts_model")
    if model is None:
        raise gr.Error("TTS model is not loaded. Check the server logs and restart the app.")

    if mode == "Default Voices":
        # The dropdown can be empty (e.g. after selecting the cloning example
        # and switching the mode back), so validate before calling the model.
        if not preset_voice:
            raise gr.Error("Please select a voice.")

        print(f"Using preset voice: {preset_voice}")
        try:
            state = model.get_state_for_audio_prompt(preset_voice)
        except Exception as e:
            error_msg = f"Error loading preset voice '{preset_voice}': {str(e)}"
            print(error_msg)
            raise gr.Error(error_msg) from e
    else:
        if not clone_audio_path:
            raise gr.Error("Please upload a reference audio file for cloning.")

        print(f"Cloning voice from: {clone_audio_path}")
        try:
            state = model.get_state_for_audio_prompt(clone_audio_path)
        except Exception as e:
            error_msg = f"Error loading reference audio: {str(e)}. Please upload a valid WAV file."
            print(error_msg)
            raise gr.Error(error_msg) from e

    try:
        audio_tensor = model.generate_audio(state, text)

        output_filename = get_tts_file_name(text)
        # detach()/cpu() make the conversion safe even if the tensor tracks
        # gradients or lives on an accelerator; both are no-ops otherwise.
        samples = audio_tensor.detach().cpu().numpy()
        scipy.io.wavfile.write(output_filename, model.sample_rate, samples)

        return output_filename
    except Exception as e:
        raise gr.Error(f"Generation failed: {str(e)}") from e
67
+
68
def toggle_inputs(mode):
    """Return visibility updates for the two voice inputs.

    The first update targets the preset-voice selector and the second the
    reference-audio upload: exactly one of them is visible, the selector when
    ``mode`` is ``"Default Voices"`` and the upload otherwise.
    """
    show_presets = mode == "Default Voices"
    return gr.update(visible=show_presets), gr.update(visible=not show_presets)
73
+
74
+
75
# Extra stylesheet passed to gr.Blocks(css=...): page font, centered header,
# logo hover effect and link styling for the header links.
CUSTOM_CSS = """
.gradio-container {
    font-family: 'SF Pro Display', -apple-system, BlinkMacSystemFont, sans-serif;
}
.header-container {
    text-align: center;
    margin-bottom: 20px;
}
.logo-img {
    margin: 0 auto;
    display: block;
    max-width: 100%;
    /* The hover rule changes both transform and opacity, so animate both;
       otherwise the opacity change snaps while the scale eases. */
    transition: transform 0.2s, opacity 0.2s;
}
.logo-img:hover {
    transform: scale(1.02);
    opacity: 0.9;
}
.links-container a {
    text-decoration: none;
    color: #4a90e2;
    font-weight: 500;
}
.links-container a:hover {
    text-decoration: underline;
}
"""
102
+
103
# Static header rendered with gr.HTML at the top of the page: project logo,
# links to the repository, model card and Colab notebook, and a disclaimer.
#
# Every link opens in a new tab, so each carries rel="noopener noreferrer" to
# keep the opened page from reaching back through window.opener, and the logo
# has alt text for screen readers and for when the image fails to load.
HEADER_HTML = """
<div class="header-container" style="text-align:center;">

    <a href="https://kyutai.org/tts" target="_blank" rel="noopener noreferrer" title="Visit Kyutai TTS">
        <img src="https://raw.githubusercontent.com/kyutai-labs/pocket-tts/refs/heads/main/docs/logo.png"
             alt="Pocket TTS logo"
             class="logo-img" width="200">
    </a>

    <div class="links-container"
         style="
            margin-top: 18px;
            display: flex;
            justify-content: center;
            align-items: center;
            gap: 14px;
            flex-wrap: wrap;
         ">

        <a href="https://github.com/kyutai-labs/pocket-tts"
           target="_blank"
           rel="noopener noreferrer"
           style="text-decoration:none;">
            🐱 GitHub Repository
        </a>

        <span style="color: gray;">|</span>

        <a href="https://huggingface.co/kyutai/pocket-tts"
           target="_blank"
           rel="noopener noreferrer"
           style="text-decoration:none;">
            🤗 Hugging Face Model Card
        </a>

        <span style="color: gray;">|</span>

        <a href="https://colab.research.google.com/github/NeuralFalconYT/Voice-Clone/blob/main/Pocket_TTS_Colab.ipynb"
           target="_blank"
           rel="noopener noreferrer"
           style="
              display: inline-flex;
              align-items: center;
           ">
            <img src="https://colab.research.google.com/assets/colab-badge.svg"
                 alt="Open in Colab"
                 height="26">
        </a>

    </div>

    <p style="font-size: 0.8em; color: gray; margin-top: 10px;">
        <i>Note: This is not an official demo from Kyutai Labs</i>
    </p>

</div>
"""
156
+
157
+
158
+
159
# Build the Gradio UI: a header, an input column (text, mode, voice inputs,
# generate button, examples) and an output column with the generated audio.
#
# NOTE(review): the source this block was recovered from had lost its
# indentation, so the nesting below is reconstructed from statement order --
# confirm it against the intended layout.
with gr.Blocks(theme='JohnSmith9982/small_and_pretty', css=CUSTOM_CSS) as demo:
    gr.HTML(HEADER_HTML)

    with gr.Row():
        with gr.Column():
            # Pre-filled so the demo can be run without typing anything.
            text_input = gr.Textbox(
                label="Text Input",
                placeholder="Hi, how are you?",
                lines=3,
                value="Hi, how are you?"
            )

            # These two labels are the `mode` values seen by toggle_inputs()
            # and generate_speech().
            mode_radio = gr.Radio(
                choices=["Default Voices", "Voice Clone"],
                value="Default Voices",
                label="TTS Mode"
            )

            with gr.Group():
                # Shown initially, matching the default "Default Voices" mode.
                dropdown_input = gr.Dropdown(
                    choices=DEFAULT_VOICES,
                    value="alba",
                    label="Select Voice",
                    visible=True
                )

                # Hidden initially; toggle_inputs() reveals it for the other
                # mode. type="filepath" hands a path string to the callback.
                audio_upload = gr.Audio(
                    label="Upload Reference Audio (WAV recommended)",
                    type="filepath",
                    visible=False
                )

            generate_btn = gr.Button("Generate Audio", variant="primary")

            # Remote reference clip used by the voice-cloning example below.
            example_audio_url = "https://huggingface.co/kyutai/tts-voices/resolve/main/alba-mackenna/casual.wav"

            # Each row supplies values for `inputs`, in the same order:
            # text, mode, preset voice, reference audio.
            gr.Examples(
                examples=[
                    ["Hello, I am Fantine. Nice to meet you.", "Default Voices", "fantine", None],
                    ["I am Cosette, and the weather is lovely.", "Default Voices", "cosette", None],
                    ["Hey there, Eponine here.", "Default Voices", "eponine", None],
                    ["Greetings from Azelma.", "Default Voices", "azelma", None],
                    ["This is a voice cloning test using the uploaded reference audio.", "Voice Clone", None, example_audio_url],
                ],
                inputs=[text_input, mode_radio, dropdown_input, audio_upload],
                label="Click on an Example to Try"
            )

        with gr.Column():
            output_audio = gr.Audio(label="Generated Speech", type="filepath")


    # Swap which voice input is visible whenever the mode selection changes.
    mode_radio.change(
        fn=toggle_inputs,
        inputs=[mode_radio],
        outputs=[dropdown_input, audio_upload]
    )

    # Run synthesis on click; the returned file path feeds the output player.
    generate_btn.click(
        fn=generate_speech,
        inputs=[text_input, mode_radio, dropdown_input, audio_upload],
        outputs=[output_audio]
    )
222
+
223
# Start the app only when this file is executed as a script, not on import.
if __name__ == "__main__":
    # Launch with queuing enabled; no public share link and no debug mode.
    demo.queue().launch(share=False, debug=False)
225
+