vedaco committed
Commit decc960 · verified · parent: bf4b051

Update app.py

Files changed (1): app.py (+667, -57)
app.py CHANGED
@@ -1,70 +1,680 @@
 
 
  import gradio as gr
- from huggingface_hub import InferenceClient
-
-
- def respond(
-     message,
-     history: list[dict[str, str]],
-     system_message,
-     max_tokens,
-     temperature,
-     top_p,
-     hf_token: gr.OAuthToken,
- ):
-     """
-     For more information on `huggingface_hub` Inference API support, please check the docs: https://huggingface.co/docs/huggingface_hub/v0.22.2/en/guides/inference
-     """
-     client = InferenceClient(token=hf_token.token, model="openai/gpt-oss-20b")
-
-     messages = [{"role": "system", "content": system_message}]
-
-     messages.extend(history)
-
-     messages.append({"role": "user", "content": message})
-
-     response = ""
-
-     for message in client.chat_completion(
-         messages,
-         max_tokens=max_tokens,
-         stream=True,
-         temperature=temperature,
-         top_p=top_p,
-     ):
-         choices = message.choices
-         token = ""
-         if len(choices) and choices[0].delta.content:
-             token = choices[0].delta.content
-
-         response += token
-         yield response
-
-
  """
- For information on how to customize the ChatInterface, peruse the gradio docs: https://www.gradio.app/docs/chatinterface
- """
- chatbot = gr.ChatInterface(
-     respond,
-     type="messages",
-     additional_inputs=[
-         gr.Textbox(value="You are a friendly Chatbot.", label="System message"),
-         gr.Slider(minimum=1, maximum=2048, value=512, step=1, label="Max new tokens"),
-         gr.Slider(minimum=0.1, maximum=4.0, value=0.7, step=0.1, label="Temperature"),
-         gr.Slider(
-             minimum=0.1,
-             maximum=1.0,
-             value=0.95,
-             step=0.05,
-             label="Top-p (nucleus sampling)",
-         ),
-     ],
- )
-
- with gr.Blocks() as demo:
-     with gr.Sidebar():
-         gr.LoginButton()
-     chatbot.render()
-
-
  if __name__ == "__main__":
-     demo.launch()
+ import tensorflow as tf
+ import numpy as np
  import gradio as gr
+ import scipy.signal as signal
+ from scipy.io import wavfile
+ import io
+ import os
+
+ # ============================================
+ # VEDES TTS - Text-to-Speech Model from Scratch
+ # ============================================
+
+ # Audio Configuration
+ class VedesConfig:
+     """Configuration for Vedes TTS Model"""
+     # Audio parameters
+     sample_rate = 22050
+     n_fft = 1024
+     hop_length = 256
+     win_length = 1024
+     n_mels = 80
+     fmin = 0
+     fmax = 8000
+
+     # Model parameters
+     embedding_dim = 256
+     encoder_dim = 256
+     decoder_dim = 512
+     attention_dim = 128
+     prenet_dim = 256
+     postnet_dim = 512
+     postnet_layers = 5
+     max_decoder_steps = 1000
+
+     # Text parameters
+     vocab = "abcdefghijklmnopqrstuvwxyz .,!?'-"
+     vocab_size = len(vocab) + 1  # +1 for padding
+
+ config = VedesConfig()
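
A few quantities follow directly from these values; as a quick illustrative check (plain Python over the config above, not part of the committed file):

    frames_per_second = config.sample_rate / config.hop_length    # 22050 / 256 ≈ 86.1 mel frames/s
    frame_duration_ms = 1000 * config.hop_length / config.sample_rate  # ≈ 11.6 ms per decoder step
    max_seconds = config.max_decoder_steps / frames_per_second    # 1000 steps ≈ 11.6 s of audio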
+
+ # ============================================
+ # TEXT PROCESSING
+ # ============================================
+
+ class TextProcessor:
+     """Text to sequence converter"""
+
+     def __init__(self, vocab):
+         self.vocab = vocab
+         self.char_to_idx = {char: idx + 1 for idx, char in enumerate(vocab)}
+         self.idx_to_char = {idx + 1: char for idx, char in enumerate(vocab)}
+         self.idx_to_char[0] = '<pad>'
+
+     def text_to_sequence(self, text):
+         """Convert text to sequence of integers"""
+         text = text.lower()
+         sequence = []
+         for char in text:
+             if char in self.char_to_idx:
+                 sequence.append(self.char_to_idx[char])
+         return np.array(sequence, dtype=np.int32)
+
+     def sequence_to_text(self, sequence):
+         """Convert sequence back to text"""
+         return ''.join([self.idx_to_char.get(idx, '') for idx in sequence])
+
+ text_processor = TextProcessor(config.vocab)
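
Index 0 is reserved for padding, so characters map to 1..33 in vocab order ('a'→1 … 'z'→26, then space and punctuation). A small illustrative round trip:

    seq = text_processor.text_to_sequence("Hi there!")
    # lowercased and mapped: h→8, i→9, ' '→27, t→20, h→8, e→5, r→18, e→5, '!'→30
    assert text_processor.sequence_to_text(seq) == "hi there!"
    # characters outside the vocab are silently dropped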
 
+
+ # ============================================
+ # VEDES TTS MODEL COMPONENTS
+ # ============================================
+
+ class VedesPrenet(tf.keras.layers.Layer):
+     """Prenet: 2-layer FC with dropout"""
+
+     def __init__(self, units, **kwargs):
+         super().__init__(**kwargs)
+         self.dense1 = tf.keras.layers.Dense(units, activation='relu')
+         self.dense2 = tf.keras.layers.Dense(units, activation='relu')
+         self.dropout1 = tf.keras.layers.Dropout(0.5)
+         self.dropout2 = tf.keras.layers.Dropout(0.5)
+
+     def call(self, inputs, training=True):
+         x = self.dropout1(self.dense1(inputs), training=training)
+         x = self.dropout2(self.dense2(x), training=training)
+         return x
+
+
+ class VedesEncoder(tf.keras.layers.Layer):
+     """Encoder: Embedding + Conv layers + BiLSTM"""
+
+     def __init__(self, vocab_size, embed_dim, encoder_dim, **kwargs):
+         super().__init__(**kwargs)
+         self.embedding = tf.keras.layers.Embedding(vocab_size, embed_dim)
+
+         # 3 Conv layers with batch norm
+         self.conv_layers = []
+         self.batch_norms = []
+         for i in range(3):
+             self.conv_layers.append(
+                 tf.keras.layers.Conv1D(encoder_dim, 5, padding='same', activation='relu')
+             )
+             self.batch_norms.append(tf.keras.layers.BatchNormalization())
+
+         self.dropout = tf.keras.layers.Dropout(0.5)
+
+         # Bidirectional LSTM
+         self.bilstm = tf.keras.layers.Bidirectional(
+             tf.keras.layers.LSTM(encoder_dim // 2, return_sequences=True)
+         )
+
+     def call(self, inputs, training=True):
+         x = self.embedding(inputs)
+
+         for conv, bn in zip(self.conv_layers, self.batch_norms):
+             x = conv(x)
+             x = bn(x, training=training)
+             x = self.dropout(x, training=training)
+
+         x = self.bilstm(x)
+         return x
+
+
+ class VedesAttention(tf.keras.layers.Layer):
+     """Location-Sensitive Attention"""
+
+     def __init__(self, attention_dim, **kwargs):
+         super().__init__(**kwargs)
+         self.attention_dim = attention_dim
+         self.query_layer = tf.keras.layers.Dense(attention_dim, use_bias=False)
+         self.memory_layer = tf.keras.layers.Dense(attention_dim, use_bias=False)
+         self.location_conv = tf.keras.layers.Conv1D(32, 31, padding='same')
+         self.location_dense = tf.keras.layers.Dense(attention_dim, use_bias=False)
+         self.v = tf.keras.layers.Dense(1, use_bias=False)
+
+     def call(self, query, memory, prev_attention):
+         """
+         query: decoder hidden state [batch, decoder_dim]
+         memory: encoder outputs [batch, seq_len, encoder_dim]
+         prev_attention: previous attention weights [batch, seq_len]
+         """
+         # Process query
+         processed_query = self.query_layer(tf.expand_dims(query, 1))
+
+         # Process memory
+         processed_memory = self.memory_layer(memory)
+
+         # Process location
+         prev_attention_expanded = tf.expand_dims(prev_attention, -1)
+         location_features = self.location_conv(prev_attention_expanded)
+         processed_location = self.location_dense(location_features)
+
+         # Compute attention scores
+         scores = self.v(tf.nn.tanh(
+             processed_query + processed_memory + processed_location
+         ))
+         scores = tf.squeeze(scores, -1)
+
+         # Softmax to get attention weights
+         attention_weights = tf.nn.softmax(scores, axis=-1)
+
+         # Compute context vector
+         context = tf.reduce_sum(
+             tf.expand_dims(attention_weights, -1) * memory, axis=1
+         )
+
+         return context, attention_weights
+
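The scoring here is the standard additive (Bahdanau-style) energy plus a location term derived from the previous step's weights. In symbols, with query q_t, encoder outputs m_j, and location features f_{t,j} (the Conv1D over the previous weights):

    e_{t,j} = v^T tanh(W q_t + V m_j + U f_{t,j})
    alpha_t = softmax(e_t)
    c_t     = sum_j alpha_{t,j} * m_j

where W, V, U correspond to query_layer, memory_layer, and location_dense above, and v to the final 1-unit projection.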
+
+ class VedesDecoder(tf.keras.layers.Layer):
+     """Autoregressive Decoder"""
+
+     def __init__(self, decoder_dim, n_mels, prenet_dim, attention_dim, **kwargs):
+         super().__init__(**kwargs)
+         self.n_mels = n_mels
+         self.decoder_dim = decoder_dim
+
+         self.prenet = VedesPrenet(prenet_dim)
+         self.attention = VedesAttention(attention_dim)
+
+         # Decoder LSTM cells
+         self.lstm1 = tf.keras.layers.LSTMCell(decoder_dim)
+         self.lstm2 = tf.keras.layers.LSTMCell(decoder_dim)
+
+         # Output projections
+         self.mel_dense = tf.keras.layers.Dense(n_mels)
+         self.stop_dense = tf.keras.layers.Dense(1)
+
+     def get_initial_state(self, batch_size, encoder_outputs):
+         """Initialize decoder states, sized from the encoder outputs"""
+         encoder_seq_len = tf.shape(encoder_outputs)[1]
+         # The context vector is a weighted sum of encoder outputs, so its
+         # width must match the encoder output dimension; otherwise the
+         # LSTM input size changes after the first step and the cell breaks.
+         context_dim = tf.shape(encoder_outputs)[2]
+         return {
+             'lstm1_state': [
+                 tf.zeros([batch_size, self.decoder_dim]),
+                 tf.zeros([batch_size, self.decoder_dim])
+             ],
+             'lstm2_state': [
+                 tf.zeros([batch_size, self.decoder_dim]),
+                 tf.zeros([batch_size, self.decoder_dim])
+             ],
+             'attention_weights': tf.zeros([batch_size, encoder_seq_len]),
+             'context': tf.zeros([batch_size, context_dim])
+         }
+
+     def decode_step(self, decoder_input, encoder_outputs, state, training=True):
+         """Single decoder step"""
+         # Prenet
+         prenet_out = self.prenet(decoder_input, training=training)
+
+         # Concatenate with context
+         lstm1_input = tf.concat([prenet_out, state['context']], axis=-1)
+
+         # First LSTM
+         lstm1_out, new_lstm1_state = self.lstm1(lstm1_input, state['lstm1_state'])
+
+         # Attention
+         context, attention_weights = self.attention(
+             lstm1_out, encoder_outputs, state['attention_weights']
+         )
+
+         # Second LSTM
+         lstm2_input = tf.concat([lstm1_out, context], axis=-1)
+         lstm2_out, new_lstm2_state = self.lstm2(lstm2_input, state['lstm2_state'])
+
+         # Output projections
+         decoder_output = tf.concat([lstm2_out, context], axis=-1)
+         mel_output = self.mel_dense(decoder_output)
+         stop_output = self.stop_dense(decoder_output)
+
+         # Update state
+         new_state = {
+             'lstm1_state': list(new_lstm1_state),
+             'lstm2_state': list(new_lstm2_state),
+             'attention_weights': attention_weights,
+             'context': context
+         }
+
+         return mel_output, stop_output, new_state
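
For orientation, the shapes threaded through decode_step (with batch B and encoder length S):

    # decoder_input: [B, n_mels]  ->  mel_output: [B, n_mels], stop_output: [B, 1]
    # state['lstm1_state'], state['lstm2_state']: pairs of [B, decoder_dim] (h, c)
    # state['attention_weights']: [B, S]    state['context']: [B, encoder output dim]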
+
+
+ class VedesPostnet(tf.keras.layers.Layer):
+     """Postnet: 5 Conv layers to refine mel spectrogram"""
+
+     def __init__(self, n_mels, postnet_dim, num_layers=5, **kwargs):
+         super().__init__(**kwargs)
+         self.conv_layers = []
+         self.batch_norms = []
+
+         for i in range(num_layers):
+             # Conv1D infers its input channels; only the output width varies.
+             out_channels = n_mels if i == num_layers - 1 else postnet_dim
+             activation = None if i == num_layers - 1 else 'tanh'
+
+             self.conv_layers.append(
+                 tf.keras.layers.Conv1D(out_channels, 5, padding='same', activation=activation)
+             )
+             self.batch_norms.append(tf.keras.layers.BatchNormalization())
+
+         self.dropout = tf.keras.layers.Dropout(0.5)
+
+     def call(self, inputs, training=True):
+         x = inputs
+         for conv, bn in zip(self.conv_layers, self.batch_norms):
+             x = conv(x)
+             x = bn(x, training=training)
+             x = self.dropout(x, training=training)
+         return inputs + x
+
+
+ class VedesTTS(tf.keras.Model):
+     """Complete Vedes TTS Model"""
+
+     def __init__(self, config, **kwargs):
+         super().__init__(**kwargs)
+         self.config = config
+
+         self.encoder = VedesEncoder(
+             config.vocab_size,
+             config.embedding_dim,
+             config.encoder_dim
+         )
+
+         self.decoder = VedesDecoder(
+             config.decoder_dim,
+             config.n_mels,
+             config.prenet_dim,
+             config.attention_dim
+         )
+
+         self.postnet = VedesPostnet(
+             config.n_mels,
+             config.postnet_dim,
+             config.postnet_layers
+         )
+
+     def call(self, inputs, training=True):
+         """Forward pass for training"""
+         text_inputs, mel_targets = inputs
+         batch_size = tf.shape(text_inputs)[0]
+
+         # Encode
+         encoder_outputs = self.encoder(text_inputs, training=training)
+
+         # Initialize decoder state from the encoder outputs
+         state = self.decoder.get_initial_state(batch_size, encoder_outputs)
+
+         # Teacher forcing
+         mel_outputs = []
+         stop_outputs = []
+
+         # Start with zeros
+         decoder_input = tf.zeros([batch_size, self.config.n_mels])
+
+         for t in range(tf.shape(mel_targets)[1]):
+             mel_out, stop_out, state = self.decoder.decode_step(
+                 decoder_input, encoder_outputs, state, training=training
+             )
+             mel_outputs.append(mel_out)
+             stop_outputs.append(stop_out)
+
+             # Teacher forcing
+             decoder_input = mel_targets[:, t, :]
+
+         mel_outputs = tf.stack(mel_outputs, axis=1)
+         stop_outputs = tf.stack(stop_outputs, axis=1)
+
+         # Postnet
+         mel_outputs_postnet = self.postnet(mel_outputs, training=training)
+
+         return mel_outputs, mel_outputs_postnet, stop_outputs
+
+     @tf.function(reduce_retracing=True)
+     def inference(self, text_sequence, max_steps=1000):
+         """Inference mode - autoregressive generation"""
+         text_sequence = tf.expand_dims(text_sequence, 0)
+         batch_size = 1
+
+         # Encode
+         encoder_outputs = self.encoder(text_sequence, training=False)
+
+         # Initialize decoder state from the encoder outputs
+         state = self.decoder.get_initial_state(batch_size, encoder_outputs)
+         decoder_input = tf.zeros([batch_size, self.config.n_mels])
+
+         mel_outputs = tf.TensorArray(dtype=tf.float32, size=0, dynamic_size=True)
+
+         for step in tf.range(max_steps):
+             mel_out, stop_out, state = self.decoder.decode_step(
+                 decoder_input, encoder_outputs, state, training=False
+             )
+
+             mel_outputs = mel_outputs.write(step, mel_out[0])
+             decoder_input = mel_out
+
+             # Check stop token
+             if tf.nn.sigmoid(stop_out[0, 0]) > 0.5:
+                 break
+
+         mel_outputs = mel_outputs.stack()
+         mel_outputs = tf.expand_dims(mel_outputs, 0)
+
+         # Postnet refinement
+         mel_outputs = self.postnet(mel_outputs, training=False)
+
+         return mel_outputs[0]
+
+
+ # ============================================
+ # GRIFFIN-LIM VOCODER
+ # ============================================
+
+ class GriffinLimVocoder:
+     """Griffin-Lim algorithm for mel spectrogram to audio"""
+
+     def __init__(self, config):
+         self.config = config
+         self.mel_basis = self._create_mel_filterbank()
+
+     def _create_mel_filterbank(self):
+         """Create mel filterbank matrix"""
+         n_fft = self.config.n_fft
+         n_mels = self.config.n_mels
+         sample_rate = self.config.sample_rate
+         fmin = self.config.fmin
+         fmax = self.config.fmax
+
+         # Mel frequencies
+         mel_fmin = self._hz_to_mel(fmin)
+         mel_fmax = self._hz_to_mel(fmax)
+         mel_points = np.linspace(mel_fmin, mel_fmax, n_mels + 2)
+         hz_points = self._mel_to_hz(mel_points)
+
+         # FFT bins
+         bin_points = np.floor((n_fft + 1) * hz_points / sample_rate).astype(int)
+
+         # Create filterbank
+         filterbank = np.zeros((n_mels, n_fft // 2 + 1))
+
+         for i in range(n_mels):
+             left = bin_points[i]
+             center = bin_points[i + 1]
+             right = bin_points[i + 2]
+
+             for j in range(left, center):
+                 if center != left:
+                     filterbank[i, j] = (j - left) / (center - left)
+
+             for j in range(center, right):
+                 if right != center:
+                     filterbank[i, j] = (right - j) / (right - center)
+
+         return filterbank
+
+     def _hz_to_mel(self, hz):
+         return 2595 * np.log10(1 + hz / 700)
+
+     def _mel_to_hz(self, mel):
+         return 700 * (10 ** (mel / 2595) - 1)
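
These two helpers are the standard HTK-style mel scale and its inverse:

    m = 2595 * log10(1 + f / 700)        f = 700 * (10^(m / 2595) - 1)

The filterbank above places n_mels + 2 points evenly on this scale between fmin and fmax, then builds the usual triangular filters between adjacent points.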
+
+     def mel_to_linear(self, mel_spec):
+         """Convert mel spectrogram to linear spectrogram"""
+         mel_basis_pinv = np.linalg.pinv(self.mel_basis)
+         linear = np.maximum(1e-10, np.dot(mel_spec, mel_basis_pinv.T))
+         return linear
+
+     def griffin_lim(self, spectrogram, n_iter=60):
+         """Griffin-Lim algorithm"""
+         # Denormalize (the model output is treated as a log-magnitude mel)
+         spectrogram = np.exp(spectrogram)
+
+         # Convert mel to linear
+         linear_spec = self.mel_to_linear(spectrogram)
+
+         # Initialize phase randomly
+         angles = np.exp(2j * np.pi * np.random.rand(*linear_spec.shape))
+         complex_spec = linear_spec * angles
+
+         # Iterate
+         for _ in range(n_iter):
+             # Inverse STFT
+             audio = self._istft(complex_spec)
+
+             # Forward STFT
+             complex_spec = self._stft(audio)
+
+             # Keep magnitude, update phase
+             angles = np.exp(1j * np.angle(complex_spec))
+             complex_spec = linear_spec * angles
+
+         # Final inverse STFT
+         audio = self._istft(complex_spec)
+
+         # Normalize
+         audio = audio / (np.max(np.abs(audio)) + 1e-8)
+
+         return audio.astype(np.float32)
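
Each pass of the loop is the classic Griffin-Lim fixed-point update: keep the known magnitude |S| and take only the phase from the STFT of the current signal estimate,

    x_k = iSTFT(S_k),    S_{k+1} = |S| * exp(i * angle(STFT(x_k)))

which gradually makes the magnitude spectrogram self-consistent with a real time-domain signal.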
+
+     def _stft(self, audio):
+         """Short-time Fourier transform"""
+         return np.array([
+             np.fft.rfft(
+                 audio[i:i + self.config.win_length] *
+                 np.hanning(self.config.win_length)
+             )
+             for i in range(0, len(audio) - self.config.win_length, self.config.hop_length)
+         ])
+
+     def _istft(self, complex_spec):
+         """Inverse short-time Fourier transform (windowed overlap-add)"""
+         n_frames = complex_spec.shape[0]
+         expected_len = self.config.hop_length * n_frames + self.config.win_length
+         audio = np.zeros(expected_len)
+         window = np.hanning(self.config.win_length)
+
+         for i, frame in enumerate(complex_spec):
+             start = i * self.config.hop_length
+             audio[start:start + self.config.win_length] += np.real(
+                 np.fft.irfft(frame, self.config.win_length)
+             ) * window
+
+         return audio
+
+
+ # ============================================
+ # INITIALIZE MODEL AND VOCODER
+ # ============================================
+
+ print("Initializing Vedes TTS Model...")
+ model = VedesTTS(config)
+ vocoder = GriffinLimVocoder(config)
+
+ # Build model with dummy input
+ dummy_text = tf.zeros([1, 10], dtype=tf.int32)
+ dummy_mel = tf.zeros([1, 50, config.n_mels])
+ _ = model([dummy_text, dummy_mel], training=False)
+
+ print("Model initialized successfully!")
+ print(f"Total parameters: {model.count_params():,}")
+
+
+ # ============================================
+ # SYNTHESIS FUNCTION
+ # ============================================
+
+ def synthesize_speech(text, speaking_rate=1.0, pitch_shift=0):
+     """
+     Convert text to speech
+
+     Args:
+         text: Input text string
+         speaking_rate: Speed of speech (0.5 - 2.0)
+         pitch_shift: Pitch adjustment in semitones (-5 to 5)
+
+     Returns:
+         tuple: (sample_rate, audio_array)
+     """
+     if not text or len(text.strip()) == 0:
+         return None
+
+     try:
+         # Clean and process text
+         text = text.strip().lower()
+
+         # Convert text to sequence
+         text_sequence = text_processor.text_to_sequence(text)
+
+         if len(text_sequence) == 0:
+             return None
+
+         text_tensor = tf.constant(text_sequence, dtype=tf.int32)
+
+         # Generate mel spectrogram
+         max_steps = int(len(text_sequence) * 20 / speaking_rate)
+         max_steps = min(max_steps, config.max_decoder_steps)
+
+         mel_spectrogram = model.inference(text_tensor, max_steps=max_steps)
+         mel_spectrogram = mel_spectrogram.numpy()
+
+         # "Pitch shift": a crude global scaling of the mel magnitudes,
+         # not a true frequency shift
+         if pitch_shift != 0:
+             shift_factor = 2 ** (pitch_shift / 12)
+             mel_spectrogram = mel_spectrogram * shift_factor
+
+         # Convert to audio using Griffin-Lim
+         audio = vocoder.griffin_lim(mel_spectrogram)
+
+         # Resample for speaking rate
+         if speaking_rate != 1.0:
+             target_length = int(len(audio) / speaking_rate)
+             audio = signal.resample(audio, target_length)
+
+         # Ensure audio is in correct format
+         audio = np.clip(audio, -1, 1)
+         audio = (audio * 32767).astype(np.int16)
+
+         return (config.sample_rate, audio)
+
+     except Exception as e:
+         print(f"Error during synthesis: {e}")
+         return None
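
Called directly, outside the UI, the function returns a (sample_rate, int16 array) tuple that can be written straight to disk. A minimal sketch, with "out.wav" as an arbitrary file name (and note the weights are freshly initialized here, since app.py never loads a trained checkpoint):

    result = synthesize_speech("hello world", speaking_rate=1.0, pitch_shift=0)
    if result is not None:
        sr, wav = result
        wavfile.write("out.wav", sr, wav)  # scipy.io.wavfile, imported at the top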
+
+
+ # ============================================
+ # GRADIO INTERFACE
+ # ============================================
+
+ # Custom CSS for better styling
+ custom_css = """
+ .gradio-container {
+     font-family: 'Segoe UI', Tahoma, Geneva, Verdana, sans-serif;
+ }
+ .title {
+     text-align: center;
+     color: #2c3e50;
+ }
+ .description {
+     text-align: center;
+     color: #7f8c8d;
+ }
  """
+
+ # Create Gradio interface
+ with gr.Blocks(css=custom_css, title="Vedes TTS") as demo:
+     gr.Markdown(
+         """
+         # 🎙️ Vedes TTS - Text-to-Speech Synthesis
+         ### Built from scratch with TensorFlow
+
+         Enter any text below and convert it to natural-sounding speech!
+         """
+     )
+
+     with gr.Row():
+         with gr.Column(scale=2):
+             text_input = gr.Textbox(
+                 label="Input Text",
+                 placeholder="Enter the text you want to convert to speech...",
+                 lines=3,
+                 max_lines=10
+             )
+
+             with gr.Row():
+                 speaking_rate = gr.Slider(
+                     minimum=0.5,
+                     maximum=2.0,
+                     value=1.0,
+                     step=0.1,
+                     label="Speaking Rate",
+                     info="Adjust speech speed"
+                 )
+
+                 pitch_shift = gr.Slider(
+                     minimum=-5,
+                     maximum=5,
+                     value=0,
+                     step=1,
+                     label="Pitch Shift (semitones)",
+                     info="Adjust voice pitch"
+                 )
+
+             synthesize_btn = gr.Button("🔊 Synthesize Speech", variant="primary")
+
+         with gr.Column(scale=1):
+             audio_output = gr.Audio(
+                 label="Generated Speech",
+                 type="numpy"
+             )
+
+     # Example texts
+     gr.Examples(
+         examples=[
+             ["Hello, welcome to Vedes text to speech system!"],
+             ["The quick brown fox jumps over the lazy dog."],
+             ["Artificial intelligence is transforming the world."],
+             ["Good morning! How are you doing today?"],
+             ["This is a demonstration of neural text to speech."],
+         ],
+         inputs=text_input
+     )
+
+     gr.Markdown(
+         """
+         ---
+         ### About Vedes TTS
+
+         **Architecture:**
+         - **Encoder:** 3 Conv1D layers + Bidirectional LSTM
+         - **Attention:** Location-sensitive attention mechanism
+         - **Decoder:** Autoregressive LSTM with prenet
+         - **Postnet:** 5 Conv1D layers for mel refinement
+         - **Vocoder:** Griffin-Lim algorithm
+
+         **Features:**
+         - Character-level text processing
+         - Adjustable speaking rate
+         - Pitch shifting capability
+         - Real-time synthesis
+
+         Built with ❤️ using TensorFlow and Gradio
+         """
+     )
+
+     # Event handlers
+     synthesize_btn.click(
+         fn=synthesize_speech,
+         inputs=[text_input, speaking_rate, pitch_shift],
+         outputs=audio_output
+     )
+
+     text_input.submit(
+         fn=synthesize_speech,
+         inputs=[text_input, speaking_rate, pitch_shift],
+         outputs=audio_output
+     )


+ # Launch
  if __name__ == "__main__":
+     demo.launch()