vedaco committed on
Commit a8983e0 · verified · 1 Parent(s): c60acbd

Update app.py

Files changed (1)
  1. app.py +620 -534
app.py CHANGED
@@ -1,540 +1,633 @@
- import tensorflow as tf
  import numpy as np
  import gradio as gr
- import scipy.signal as signal
  from scipy.io import wavfile
  import io
- import os
-
- # Disable GPU warnings
- os.environ['TF_CPP_MIN_LOG_LEVEL'] = '2'

  # ============================================
- # VEDES TTS - Text-to-Speech Model from Scratch
  # ============================================

  class VedesConfig:
-     """Configuration for Vedes TTS Model"""
-     # Audio parameters
      sample_rate = 22050
-     n_fft = 1024
-     hop_length = 256
-     win_length = 1024
-     n_mels = 80
-     fmin = 0
-     fmax = 8000
-
-     # Model parameters
-     embedding_dim = 256
-     encoder_dim = 256
-     decoder_dim = 256
-     attention_dim = 128
-     prenet_dim = 128
-     postnet_dim = 256
-     postnet_layers = 5
-     max_decoder_steps = 500

-     # Text parameters
-     vocab = "abcdefghijklmnopqrstuvwxyz .,!?'-"
-     vocab_size = len(vocab) + 1

  config = VedesConfig()

  # ============================================
- # TEXT PROCESSING
  # ============================================

- class TextProcessor:
-     """Text to sequence converter"""
-
-     def __init__(self, vocab):
-         self.vocab = vocab
-         self.char_to_idx = {char: idx + 1 for idx, char in enumerate(vocab)}
-         self.idx_to_char = {idx + 1: char for idx, char in enumerate(vocab)}
-         self.idx_to_char[0] = '<pad>'
-
-     def text_to_sequence(self, text):
-         """Convert text to sequence of integers"""
-         text = text.lower()
-         sequence = []
-         for char in text:
-             if char in self.char_to_idx:
-                 sequence.append(self.char_to_idx[char])
-         return np.array(sequence, dtype=np.int32)
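For context on what the encoder consumed, a minimal usage sketch of the removed TextProcessor; the expected indices follow directly from the vocab string in VedesConfig above:

```python
# Minimal sketch: indices are 1-based positions in config.vocab; 0 is <pad>,
# and characters outside the vocab are silently dropped.
tp = TextProcessor("abcdefghijklmnopqrstuvwxyz .,!?'-")
print(tp.text_to_sequence("Hi!"))  # -> [ 8  9 30] ('h'=8, 'i'=9, '!'=30)
```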

- text_processor = TextProcessor(config.vocab)

  # ============================================
- # MODEL LAYERS
  # ============================================

- class Prenet(tf.keras.layers.Layer):
-     """Prenet with dropout"""
-
-     def __init__(self, units, **kwargs):
-         super().__init__(**kwargs)
-         self.units = units
-
-     def build(self, input_shape):
-         self.dense1 = tf.keras.layers.Dense(self.units, activation='relu')
-         self.dense2 = tf.keras.layers.Dense(self.units, activation='relu')
-         super().build(input_shape)
-
-     def call(self, inputs, training=True):
-         x = self.dense1(inputs)
-         x = tf.nn.dropout(x, rate=0.5) if training else x * 0.5
-         x = self.dense2(x)
-         x = tf.nn.dropout(x, rate=0.5) if training else x * 0.5
-         return x
-

- class Encoder(tf.keras.layers.Layer):
-     """Text Encoder"""
-
-     def __init__(self, vocab_size, embed_dim, encoder_dim, **kwargs):
-         super().__init__(**kwargs)
-         self.vocab_size = vocab_size
-         self.embed_dim = embed_dim
-         self.encoder_dim = encoder_dim
-
-     def build(self, input_shape):
-         self.embedding = tf.keras.layers.Embedding(self.vocab_size, self.embed_dim)
-
-         self.conv1 = tf.keras.layers.Conv1D(self.encoder_dim, 5, padding='same', activation='relu')
-         self.bn1 = tf.keras.layers.BatchNormalization()
-         self.conv2 = tf.keras.layers.Conv1D(self.encoder_dim, 5, padding='same', activation='relu')
-         self.bn2 = tf.keras.layers.BatchNormalization()
-         self.conv3 = tf.keras.layers.Conv1D(self.encoder_dim, 5, padding='same', activation='relu')
-         self.bn3 = tf.keras.layers.BatchNormalization()
-
-         self.bilstm = tf.keras.layers.Bidirectional(
-             tf.keras.layers.LSTM(self.encoder_dim // 2, return_sequences=True),
-             merge_mode='concat'
-         )
-         super().build(input_shape)
-
-     def call(self, inputs, training=True):
-         x = self.embedding(inputs)
-
-         x = self.conv1(x)
-         x = self.bn1(x, training=training)
-         x = self.conv2(x)
-         x = self.bn2(x, training=training)
-         x = self.conv3(x)
-         x = self.bn3(x, training=training)
-
-         x = self.bilstm(x)
-         return x

- class Attention(tf.keras.layers.Layer):
-     """Bahdanau-style Attention"""
-
-     def __init__(self, attention_dim, **kwargs):
-         super().__init__(**kwargs)
-         self.attention_dim = attention_dim
-
-     def build(self, input_shape):
-         self.W_query = tf.keras.layers.Dense(self.attention_dim)
-         self.W_keys = tf.keras.layers.Dense(self.attention_dim)
-         self.V = tf.keras.layers.Dense(1)
-         super().build(input_shape)
-
-     def call(self, query, keys):
-         """
-         query: [batch, decoder_dim]
-         keys: [batch, seq_len, encoder_dim]
-         """
-         query_expanded = tf.expand_dims(query, 1)
-
-         scores = self.V(tf.nn.tanh(
-             self.W_query(query_expanded) + self.W_keys(keys)
-         ))
-         scores = tf.squeeze(scores, -1)
-
-         weights = tf.nn.softmax(scores, axis=-1)
-         context = tf.reduce_sum(tf.expand_dims(weights, -1) * keys, axis=1)
-
-         return context, weights
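For reference, the additive (Bahdanau) scoring this removed layer implements can be written out in plain NumPy; the weight matrices below are illustrative stand-ins for the layer's Dense kernels:

```python
import numpy as np

def additive_attention(query, keys, W_q, W_k, v):
    """query: [decoder_dim], keys: [seq_len, encoder_dim]."""
    scores = np.tanh(query @ W_q + keys @ W_k) @ v  # [seq_len]
    weights = np.exp(scores - scores.max())
    weights /= weights.sum()                        # softmax over seq_len
    context = weights @ keys                        # [encoder_dim]
    return context, weights

rng = np.random.default_rng(0)
q, K = rng.normal(size=3), rng.normal(size=(5, 4))
W_q, W_k, v = rng.normal(size=(3, 2)), rng.normal(size=(4, 2)), rng.normal(size=2)
ctx, w = additive_attention(q, K, W_q, W_k, v)      # w sums to 1, ctx has shape (4,)
```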
-
-
- class Decoder(tf.keras.layers.Layer):
-     """Autoregressive Decoder"""
-
-     def __init__(self, decoder_dim, n_mels, prenet_dim, attention_dim, encoder_dim, **kwargs):
-         super().__init__(**kwargs)
-         self.decoder_dim = decoder_dim
-         self.n_mels = n_mels
-         self.prenet_dim = prenet_dim
-         self.attention_dim = attention_dim
-         self.encoder_dim = encoder_dim

-     def build(self, input_shape):
-         self.prenet = Prenet(self.prenet_dim)
-         self.attention = Attention(self.attention_dim)

-         # Single GRU layer (simpler than LSTM for this case)
-         self.gru = tf.keras.layers.GRUCell(self.decoder_dim)

-         # Output projections
-         self.mel_dense = tf.keras.layers.Dense(self.n_mels)
-         self.stop_dense = tf.keras.layers.Dense(1)

-         # Build prenet
-         self.prenet.build([None, self.n_mels])

-         super().build(input_shape)
-
-     def get_initial_state(self, batch_size):
-         return tf.zeros([batch_size, self.decoder_dim])

-     def step(self, decoder_input, encoder_outputs, state, training=True):
-         """Single decoder step"""
-         # Prenet
-         prenet_out = self.prenet(decoder_input, training=training)

-         # Attention
-         context, attention_weights = self.attention(state, encoder_outputs)

-         # GRU input
-         gru_input = tf.concat([prenet_out, context], axis=-1)

-         # GRU
-         gru_out, new_states = self.gru(gru_input, [state])
-         new_state = new_states[0] if isinstance(new_states, list) else new_states

-         # Outputs
-         output_concat = tf.concat([gru_out, context], axis=-1)
-         mel_output = self.mel_dense(output_concat)
-         stop_output = self.stop_dense(output_concat)

-         return mel_output, stop_output, new_state, attention_weights
-
-
- class Postnet(tf.keras.layers.Layer):
-     """Postnet for mel refinement"""
-
-     def __init__(self, n_mels, postnet_dim, num_layers=5, **kwargs):
-         super().__init__(**kwargs)
-         self.n_mels = n_mels
-         self.postnet_dim = postnet_dim
-         self.num_layers = num_layers
-
-     def build(self, input_shape):
-         self.convs = []
-         self.bns = []
-
-         for i in range(self.num_layers):
-             out_dim = self.n_mels if i == self.num_layers - 1 else self.postnet_dim
-             self.convs.append(tf.keras.layers.Conv1D(out_dim, 5, padding='same'))
-             self.bns.append(tf.keras.layers.BatchNormalization())
-
-         super().build(input_shape)
-
-     def call(self, inputs, training=True):
-         x = inputs
-         for i, (conv, bn) in enumerate(zip(self.convs, self.bns)):
-             x = conv(x)
-             x = bn(x, training=training)
-             if i < self.num_layers - 1:
-                 x = tf.nn.tanh(x)
-         return inputs + x
-
-
- # ============================================
- # VEDES TTS MODEL
- # ============================================
-
- class VedesTTS(tf.keras.Model):
-     """Complete Vedes TTS Model"""
-
-     def __init__(self, config, **kwargs):
-         super().__init__(**kwargs)
-         self.config = config
-
-     def build(self, input_shape):
-         self.encoder = Encoder(
-             self.config.vocab_size,
-             self.config.embedding_dim,
-             self.config.encoder_dim
-         )
-
-         self.decoder = Decoder(
-             self.config.decoder_dim,
-             self.config.n_mels,
-             self.config.prenet_dim,
-             self.config.attention_dim,
-             self.config.encoder_dim
-         )
-
-         self.postnet = Postnet(
-             self.config.n_mels,
-             self.config.postnet_dim,
-             self.config.postnet_layers
-         )
-
-         super().build(input_shape)
-
-     def call(self, inputs, training=True):
-         """Forward pass"""
-         if isinstance(inputs, (list, tuple)):
-             text_inputs = inputs[0]
-         else:
-             text_inputs = inputs
-
-         return self.inference_eager(text_inputs, self.config.max_decoder_steps)
-
-     def inference_eager(self, text_sequence, max_steps=500):
-         """Eager mode inference"""
-         if len(text_sequence.shape) == 1:
-             text_sequence = tf.expand_dims(text_sequence, 0)

-         batch_size = tf.shape(text_sequence)[0]

-         # Encode
-         encoder_outputs = self.encoder(text_sequence, training=False)

-         # Initialize decoder
-         state = self.decoder.get_initial_state(batch_size)
-         decoder_input = tf.zeros([batch_size, self.config.n_mels])

-         mel_outputs = []

-         for step in range(max_steps):
-             mel_out, stop_out, state, _ = self.decoder.step(
-                 decoder_input, encoder_outputs, state, training=False
-             )

-             mel_outputs.append(mel_out)
-             decoder_input = mel_out

-             if tf.nn.sigmoid(stop_out[0, 0]) > 0.5 and step > 10:
-                 break
-
-         if len(mel_outputs) == 0:
-             return tf.zeros([1, 1, self.config.n_mels])
-
-         mel_outputs = tf.stack(mel_outputs, axis=1)
-         mel_outputs = self.postnet(mel_outputs, training=False)

-         return mel_outputs
-
-
- # ============================================
- # GRIFFIN-LIM VOCODER
- # ============================================
-
- class GriffinLimVocoder:
-     """Griffin-Lim algorithm for mel to audio"""
-
-     def __init__(self, config):
-         self.config = config
-         self.mel_basis = self._create_mel_filterbank()
-
-     def _hz_to_mel(self, hz):
-         return 2595 * np.log10(1 + hz / 700)
-
-     def _mel_to_hz(self, mel):
-         return 700 * (10 ** (mel / 2595) - 1)

-     def _create_mel_filterbank(self):
-         n_fft = self.config.n_fft
-         n_mels = self.config.n_mels
-         sample_rate = self.config.sample_rate

-         mel_fmin = self._hz_to_mel(self.config.fmin)
-         mel_fmax = self._hz_to_mel(self.config.fmax)
-         mel_points = np.linspace(mel_fmin, mel_fmax, n_mels + 2)
-         hz_points = self._mel_to_hz(mel_points)

-         bin_points = np.floor((n_fft + 1) * hz_points / sample_rate).astype(int)

-         filterbank = np.zeros((n_mels, n_fft // 2 + 1))

-         for i in range(n_mels):
-             left, center, right = bin_points[i], bin_points[i + 1], bin_points[i + 2]
-
-             for j in range(left, center):
-                 if center != left:
-                     filterbank[i, j] = (j - left) / (center - left)
-             for j in range(center, right):
-                 if right != center:
-                     filterbank[i, j] = (right - j) / (right - center)
-
-         return filterbank
-
-     def mel_to_linear(self, mel_spec):
-         mel_basis_pinv = np.linalg.pinv(self.mel_basis)
-         return np.maximum(1e-10, np.dot(mel_spec, mel_basis_pinv.T))

-     def griffin_lim(self, mel_spectrogram, n_iter=32):
-         # Denormalize
-         spectrogram = np.exp(np.clip(mel_spectrogram, -10, 10))

-         # Mel to linear
-         linear_spec = self.mel_to_linear(spectrogram)

-         # Initialize phase
-         phase = np.exp(2j * np.pi * np.random.rand(*linear_spec.shape))
-         complex_spec = linear_spec * phase

-         for _ in range(n_iter):
-             audio = self._istft(complex_spec)
-             complex_spec = self._stft(audio)
-             phase = np.exp(1j * np.angle(complex_spec))
-             complex_spec = linear_spec * phase

-         audio = self._istft(complex_spec)

-         # Normalize
-         max_val = np.max(np.abs(audio))
-         if max_val > 0:
-             audio = audio / max_val

-         return audio.astype(np.float32)
-
-     def _stft(self, audio):
-         frames = []
-         window = np.hanning(self.config.win_length)
-
-         for i in range(0, max(1, len(audio) - self.config.win_length), self.config.hop_length):
-             frame = audio[i:i + self.config.win_length]
-             if len(frame) < self.config.win_length:
-                 frame = np.pad(frame, (0, self.config.win_length - len(frame)))
-             frames.append(np.fft.rfft(frame * window))

-         return np.array(frames) if frames else np.zeros((1, self.config.n_fft // 2 + 1), dtype=complex)

-     def _istft(self, complex_spec):
-         n_frames = len(complex_spec)
-         expected_len = self.config.hop_length * n_frames + self.config.win_length
-         audio = np.zeros(expected_len)
-         window = np.hanning(self.config.win_length)
-         window_sum = np.zeros(expected_len)
-
-         for i, frame in enumerate(complex_spec):
-             start = i * self.config.hop_length
-             end = start + self.config.win_length
-             audio[start:end] += np.real(np.fft.irfft(frame, self.config.win_length)) * window
-             window_sum[start:end] += window ** 2
-
-         # Normalize by window sum
-         window_sum = np.maximum(window_sum, 1e-8)
-         audio = audio / window_sum

          return audio
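The removed class hand-rolls its STFT and ISTFT; for comparison, a sketch of the same Griffin-Lim iteration using scipy.signal.stft/istft, with window and hop values assumed to match VedesConfig (win_length=1024, hop_length=256):

```python
import numpy as np
from scipy import signal

def griffin_lim_scipy(mag, n_iter=32, win=1024, hop=256, sr=22050):
    """mag: linear magnitude spectrogram, shape [win // 2 + 1, n_frames]."""
    phase = np.exp(2j * np.pi * np.random.rand(*mag.shape))
    for _ in range(n_iter):
        _, audio = signal.istft(mag * phase, fs=sr, nperseg=win, noverlap=win - hop)
        _, _, spec = signal.stft(audio, fs=sr, nperseg=win, noverlap=win - hop)
        # Pad or trim so the frame count matches the target magnitude
        if spec.shape[1] < mag.shape[1]:
            spec = np.pad(spec, ((0, 0), (0, mag.shape[1] - spec.shape[1])))
        spec = spec[:, :mag.shape[1]]
        phase = np.exp(1j * np.angle(spec))
    _, audio = signal.istft(mag * phase, fs=sr, nperseg=win, noverlap=win - hop)
    return audio / (np.max(np.abs(audio)) + 1e-8)
```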

  # ============================================
- # SIMPLE SYNTHESIZER (Fallback)
  # ============================================

- class SimpleSynthesizer:
-     """Simple formant-based synthesizer as fallback"""

      def __init__(self, sample_rate=22050):
          self.sample_rate = sample_rate
-
-         # Basic phoneme frequencies
-         self.phonemes = {
-             'a': {'f1': 730, 'f2': 1090},
-             'e': {'f1': 530, 'f2': 1840},
-             'i': {'f1': 270, 'f2': 2290},
-             'o': {'f1': 570, 'f2': 840},
-             'u': {'f1': 300, 'f2': 870},
-         }
-
-         self.default_formant = {'f1': 500, 'f2': 1500}

-     def synthesize(self, text, duration_per_char=0.08):
-         """Generate speech-like audio from text"""
-         audio = np.array([], dtype=np.float32)
-
-         text = text.lower()
-
-         for char in text:
-             if char == ' ':
-                 # Silence for space
-                 silence = np.zeros(int(self.sample_rate * 0.1))
-                 audio = np.concatenate([audio, silence])
-             elif char in 'aeiou':
-                 # Vowel
-                 segment = self._generate_vowel(char, duration_per_char)
-                 audio = np.concatenate([audio, segment])
-             elif char in 'bcdfghjklmnpqrstvwxyz':
-                 # Consonant (simplified)
-                 segment = self._generate_consonant(char, duration_per_char * 0.5)
-                 audio = np.concatenate([audio, segment])
-             elif char in '.,!?':
-                 # Punctuation pause
-                 silence = np.zeros(int(self.sample_rate * 0.15))
-                 audio = np.concatenate([audio, silence])

-         # Apply envelope
-         if len(audio) > 0:
-             audio = self._apply_envelope(audio)
-             audio = audio / (np.max(np.abs(audio)) + 1e-8)
-
-         return audio.astype(np.float32)
-
-     def _generate_vowel(self, char, duration):
-         """Generate a vowel sound"""
-         t = np.linspace(0, duration, int(self.sample_rate * duration))

-         formant = self.phonemes.get(char, self.default_formant)
-         f0 = 120  # Fundamental frequency

-         # Generate harmonics
-         signal = np.zeros_like(t)
-         for harmonic in range(1, 8):
-             freq = f0 * harmonic
-             amp = 1.0 / harmonic
-             signal += amp * np.sin(2 * np.pi * freq * t)
-
-         # Add formants
-         f1_signal = np.sin(2 * np.pi * formant['f1'] * t) * 0.3
-         f2_signal = np.sin(2 * np.pi * formant['f2'] * t) * 0.2
-
-         signal = signal + f1_signal + f2_signal
-
-         # Apply envelope
-         envelope = np.ones_like(t)
-         attack = int(len(t) * 0.1)
-         release = int(len(t) * 0.2)
-         envelope[:attack] = np.linspace(0, 1, attack)
-         envelope[-release:] = np.linspace(1, 0, release)
-
-         return (signal * envelope).astype(np.float32)
-
-     def _generate_consonant(self, char, duration):
-         """Generate consonant-like noise"""
-         n_samples = int(self.sample_rate * duration)
-
-         # Noise-based consonants
-         if char in 'sfhx':
-             noise = np.random.randn(n_samples) * 0.3
-         elif char in 'bp':
-             # Plosive
-             noise = np.random.randn(n_samples) * 0.5
-             noise[:n_samples//4] = 0
-         else:
-             # Default consonant
-             noise = np.random.randn(n_samples) * 0.2
-
-         # Envelope
-         envelope = np.ones(n_samples)
-         fade = min(n_samples // 4, 100)
-         envelope[:fade] = np.linspace(0, 1, fade)
-         envelope[-fade:] = np.linspace(1, 0, fade)
-
-         return (noise * envelope).astype(np.float32)
-
-     def _apply_envelope(self, audio):
-         """Apply overall envelope to audio"""
-         fade_len = min(len(audio) // 10, 1000)
-         if fade_len > 0:
-             audio[:fade_len] *= np.linspace(0, 1, fade_len)
-             audio[-fade_len:] *= np.linspace(1, 0, fade_len)
          return audio

@@ -543,26 +636,12 @@ class SimpleSynthesizer:
  # ============================================

  print("=" * 50)
- print("Initializing Vedes TTS Model...")
  print("=" * 50)

- # Create model and vocoder
- model = VedesTTS(config)
- vocoder = GriffinLimVocoder(config)
- simple_synth = SimpleSynthesizer(config.sample_rate)
-
- # Build model
- try:
-     dummy_input = tf.zeros([1, 10], dtype=tf.int32)
-     model.build(input_shape=[None, None])
-     _ = model(dummy_input, training=False)
-     print("Neural model initialized successfully!")
-     USE_NEURAL = True
- except Exception as e:
-     print(f"Neural model init warning: {e}")
-     print("Using simple synthesizer as fallback")
-     USE_NEURAL = False

  print("=" * 50)

@@ -570,42 +649,28 @@ print("=" * 50)
  # SYNTHESIS FUNCTION
  # ============================================

- def synthesize_speech(text, speaking_rate=1.0, pitch_shift=0):
-     """Convert text to speech"""
      if not text or len(text.strip()) == 0:
          return None

-     text = text.strip()[:500]  # Limit length

      try:
-         if USE_NEURAL:
-             # Use neural model
-             text_sequence = text_processor.text_to_sequence(text)
-             if len(text_sequence) == 0:
-                 return None
-
-             text_tensor = tf.constant(text_sequence, dtype=tf.int32)
-
-             # Generate mel spectrogram
-             mel_output = model.inference_eager(text_tensor, max_steps=config.max_decoder_steps)
-             mel_spectrogram = mel_output[0].numpy()
-
-             # Apply pitch shift
-             if pitch_shift != 0:
-                 mel_spectrogram = mel_spectrogram * (2 ** (pitch_shift / 12))
-
-             # Convert to audio
-             audio = vocoder.griffin_lim(mel_spectrogram)
-         else:
-             # Use simple synthesizer
-             audio = simple_synth.synthesize(text)

-         # Adjust speaking rate
-         if speaking_rate != 1.0 and len(audio) > 100:
-             target_length = int(len(audio) / speaking_rate)
-             audio = signal.resample(audio, target_length)

-         # Normalize and convert to int16
          audio = np.clip(audio, -1, 1)
          audio_int16 = (audio * 32767).astype(np.int16)

@@ -613,34 +678,32 @@ def synthesize_speech(text, speaking_rate=1.0, pitch_shift=0):

      except Exception as e:
          print(f"Synthesis error: {e}")
-         # Fallback to simple synthesizer
-         try:
-             audio = simple_synth.synthesize(text)
-             audio_int16 = (np.clip(audio, -1, 1) * 32767).astype(np.int16)
-             return (config.sample_rate, audio_int16)
-         except:
-             return None


  # ============================================
  # GRADIO INTERFACE
  # ============================================

- with gr.Blocks(title="Vedes TTS", theme=gr.themes.Soft()) as demo:
      gr.Markdown(
          """
          # 🎙️ Vedes TTS - Text-to-Speech Synthesis
-         ### Built from scratch with TensorFlow

-         Enter text below to convert it to speech!
          """
      )

      with gr.Row():
          with gr.Column(scale=2):
              text_input = gr.Textbox(
-                 label="📝 Input Text",
-                 placeholder="Type or paste your text here...",
                  lines=4,
                  max_lines=10
              )
@@ -651,18 +714,30 @@ with gr.Blocks(title="Vedes TTS", theme=gr.themes.Soft()) as demo:
                  maximum=2.0,
                  value=1.0,
                  step=0.1,
-                 label="🎚️ Speaking Rate"
              )

              pitch_shift = gr.Slider(
-                 minimum=-5,
-                 maximum=5,
                  value=0,
                  step=1,
-                 label="🎵 Pitch Shift"
              )

-             synthesize_btn = gr.Button("🔊 Synthesize Speech", variant="primary", size="lg")

          with gr.Column(scale=1):
              audio_output = gr.Audio(
@@ -672,14 +747,17 @@ with gr.Blocks(title="Vedes TTS", theme=gr.themes.Soft()) as demo:

      gr.Examples(
          examples=[
-             ["Hello world!"],
-             ["Welcome to Vedes text to speech."],
              ["The quick brown fox jumps over the lazy dog."],
              ["How are you doing today?"],
              ["This is a test of the speech synthesis system."],
          ],
          inputs=text_input,
-         label="📚 Example Texts"
      )

      gr.Markdown(
@@ -687,30 +765,38 @@ with gr.Blocks(title="Vedes TTS", theme=gr.themes.Soft()) as demo:
          ---
          ### ℹ️ About Vedes TTS

-         **Architecture:**
-         - Encoder: Embedding + Conv1D + BiLSTM
-         - Attention: Bahdanau-style attention
-         - Decoder: GRU with Prenet
-         - Postnet: Conv1D refinement
-         - Vocoder: Griffin-Lim

-         Built with TensorFlow 2.x and Gradio
          """
      )

      # Event handlers
      synthesize_btn.click(
          fn=synthesize_speech,
-         inputs=[text_input, speaking_rate, pitch_shift],
          outputs=audio_output
      )

      text_input.submit(
          fn=synthesize_speech,
-         inputs=[text_input, speaking_rate, pitch_shift],
          outputs=audio_output
      )

  # Launch
  if __name__ == "__main__":
      demo.launch()

  import numpy as np
  import gradio as gr
+ from scipy import signal
  from scipy.io import wavfile
  import io
+ import re

  # ============================================
+ # VEDES TTS - Formant-Based Speech Synthesizer
  # ============================================

  class VedesConfig:
+     """Configuration"""
      sample_rate = 22050

  config = VedesConfig()

  # ============================================
+ # PHONEME DEFINITIONS
  # ============================================

+ # Phoneme to formant mapping (F1, F2, F3, duration_ms, is_voiced)
+ PHONEMES = {
+     # Vowels (voiced)
+     'AA': (710, 1100, 2540, 120, True),   # father
+     'AE': (660, 1720, 2410, 120, True),   # cat
+     'AH': (520, 1190, 2390, 100, True),   # but
+     'AO': (570, 840, 2410, 120, True),    # dog
+     'AW': (630, 1200, 2550, 150, True),   # how
+     'AY': (710, 1100, 2540, 150, True),   # my
+     'EH': (530, 1840, 2480, 100, True),   # bed
+     'ER': (490, 1350, 1690, 120, True),   # bird
+     'EY': (450, 2100, 2680, 140, True),   # say
+     'IH': (400, 1920, 2560, 80, True),    # bit
+     'IY': (270, 2290, 3010, 120, True),   # see
+     'OW': (450, 850, 2500, 140, True),    # go
+     'OY': (490, 1350, 2480, 160, True),   # boy
+     'UH': (440, 1020, 2240, 100, True),   # book
+     'UW': (300, 870, 2240, 120, True),    # too
+
+     # Consonants - Stops
+     'B': (200, 1100, 2150, 60, True),
+     'D': (200, 1600, 2600, 50, True),
+     'G': (200, 1990, 2850, 50, True),
+     'P': (200, 800, 2000, 80, False),
+     'T': (200, 1600, 2600, 70, False),
+     'K': (200, 1990, 2850, 80, False),
+
+     # Consonants - Fricatives
+     'F': (175, 900, 2400, 100, False),
+     'V': (175, 1100, 2400, 80, True),
+     'TH': (200, 1400, 2200, 80, False),
+     'DH': (200, 1600, 2400, 60, True),
+     'S': (200, 1800, 4000, 100, False),
+     'Z': (200, 1600, 3500, 80, True),
+     'SH': (200, 1800, 2600, 100, False),
+     'ZH': (200, 1800, 2600, 80, True),
+     'HH': (280, 1200, 2400, 80, False),
+
+     # Consonants - Nasals
+     'M': (280, 900, 2200, 80, True),
+     'N': (280, 1700, 2600, 70, True),
+     'NG': (280, 2300, 2750, 80, True),
+
+     # Consonants - Liquids
+     'L': (350, 1100, 2700, 70, True),
+     'R': (420, 1300, 1600, 70, True),
+
+     # Consonants - Glides
+     'W': (300, 870, 2240, 60, True),
+     'Y': (280, 2250, 3000, 50, True),
+
+     # Special
+     'CH': (200, 1800, 2600, 100, False),
+     'JH': (200, 1800, 2600, 80, True),
+
+     # Silence
+     'SIL': (0, 0, 0, 100, False),
+     'PAU': (0, 0, 0, 150, False),
+ }
+
+ # Letter to phoneme mapping (simplified)
+ LETTER_TO_PHONEME = {
+     'a': ['AE'],
+     'b': ['B'],
+     'c': ['K'],
+     'd': ['D'],
+     'e': ['EH'],
+     'f': ['F'],
+     'g': ['G'],
+     'h': ['HH'],
+     'i': ['IH'],
+     'j': ['JH'],
+     'k': ['K'],
+     'l': ['L'],
+     'm': ['M'],
+     'n': ['N'],
+     'o': ['AA'],
+     'p': ['P'],
+     'q': ['K', 'W'],
+     'r': ['R'],
+     's': ['S'],
+     't': ['T'],
+     'u': ['AH'],
+     'v': ['V'],
+     'w': ['W'],
+     'x': ['K', 'S'],
+     'y': ['Y'],
+     'z': ['Z'],
+     ' ': ['SIL'],
+     '.': ['PAU'],
+     ',': ['PAU'],
+     '!': ['PAU'],
+     '?': ['PAU'],
+     '-': ['SIL'],
+     "'": [],
+ }
+
+ # Common word pronunciations
+ WORD_PRONUNCIATIONS = {
+     'the': ['DH', 'AH'],
+     'a': ['AH'],
+     'an': ['AE', 'N'],
+     'is': ['IH', 'Z'],
+     'are': ['AA', 'R'],
+     'was': ['W', 'AA', 'Z'],
+     'were': ['W', 'ER'],
+     'be': ['B', 'IY'],
+     'been': ['B', 'IH', 'N'],
+     'have': ['HH', 'AE', 'V'],
+     'has': ['HH', 'AE', 'Z'],
+     'had': ['HH', 'AE', 'D'],
+     'do': ['D', 'UW'],
+     'does': ['D', 'AH', 'Z'],
+     'did': ['D', 'IH', 'D'],
+     'will': ['W', 'IH', 'L'],
+     'would': ['W', 'UH', 'D'],
+     'could': ['K', 'UH', 'D'],
+     'should': ['SH', 'UH', 'D'],
+     'can': ['K', 'AE', 'N'],
+     'may': ['M', 'EY'],
+     'might': ['M', 'AY', 'T'],
+     'must': ['M', 'AH', 'S', 'T'],
+     'i': ['AY'],
+     'you': ['Y', 'UW'],
+     'he': ['HH', 'IY'],
+     'she': ['SH', 'IY'],
+     'it': ['IH', 'T'],
+     'we': ['W', 'IY'],
+     'they': ['DH', 'EY'],
+     'this': ['DH', 'IH', 'S'],
+     'that': ['DH', 'AE', 'T'],
+     'what': ['W', 'AH', 'T'],
+     'which': ['W', 'IH', 'CH'],
+     'who': ['HH', 'UW'],
+     'how': ['HH', 'AW'],
+     'when': ['W', 'EH', 'N'],
+     'where': ['W', 'EH', 'R'],
+     'why': ['W', 'AY'],
+     'all': ['AO', 'L'],
+     'each': ['IY', 'CH'],
+     'every': ['EH', 'V', 'R', 'IY'],
+     'both': ['B', 'OW', 'TH'],
+     'few': ['F', 'Y', 'UW'],
+     'more': ['M', 'AO', 'R'],
+     'most': ['M', 'OW', 'S', 'T'],
+     'other': ['AH', 'DH', 'ER'],
+     'some': ['S', 'AH', 'M'],
+     'such': ['S', 'AH', 'CH'],
+     'no': ['N', 'OW'],
+     'not': ['N', 'AA', 'T'],
+     'only': ['OW', 'N', 'L', 'IY'],
+     'same': ['S', 'EY', 'M'],
+     'so': ['S', 'OW'],
+     'than': ['DH', 'AE', 'N'],
+     'too': ['T', 'UW'],
+     'very': ['V', 'EH', 'R', 'IY'],
+     'just': ['JH', 'AH', 'S', 'T'],
+     'hello': ['HH', 'EH', 'L', 'OW'],
+     'hi': ['HH', 'AY'],
+     'welcome': ['W', 'EH', 'L', 'K', 'AH', 'M'],
+     'to': ['T', 'UW'],
+     'world': ['W', 'ER', 'L', 'D'],
+     'speech': ['S', 'P', 'IY', 'CH'],
+     'text': ['T', 'EH', 'K', 'S', 'T'],
+     'voice': ['V', 'OY', 'S'],
+     'sound': ['S', 'AW', 'N', 'D'],
+     'good': ['G', 'UH', 'D'],
+     'great': ['G', 'R', 'EY', 'T'],
+     'nice': ['N', 'AY', 'S'],
+     'thank': ['TH', 'AE', 'NG', 'K'],
+     'thanks': ['TH', 'AE', 'NG', 'K', 'S'],
+     'please': ['P', 'L', 'IY', 'Z'],
+     'yes': ['Y', 'EH', 'S'],
+     'yeah': ['Y', 'AE'],
+     'ok': ['OW', 'K', 'EY'],
+     'okay': ['OW', 'K', 'EY'],
+     'and': ['AE', 'N', 'D'],
+     'or': ['AO', 'R'],
+     'but': ['B', 'AH', 'T'],
+     'if': ['IH', 'F'],
+     'then': ['DH', 'EH', 'N'],
+     'because': ['B', 'IH', 'K', 'AO', 'Z'],
+     'as': ['AE', 'Z'],
+     'until': ['AH', 'N', 'T', 'IH', 'L'],
+     'while': ['W', 'AY', 'L'],
+     'of': ['AH', 'V'],
+     'at': ['AE', 'T'],
+     'by': ['B', 'AY'],
+     'for': ['F', 'AO', 'R'],
+     'with': ['W', 'IH', 'TH'],
+     'about': ['AH', 'B', 'AW', 'T'],
+     'into': ['IH', 'N', 'T', 'UW'],
+     'through': ['TH', 'R', 'UW'],
+     'during': ['D', 'UH', 'R', 'IH', 'NG'],
+     'before': ['B', 'IH', 'F', 'AO', 'R'],
+     'after': ['AE', 'F', 'T', 'ER'],
+     'above': ['AH', 'B', 'AH', 'V'],
+     'below': ['B', 'IH', 'L', 'OW'],
+     'from': ['F', 'R', 'AH', 'M'],
+     'up': ['AH', 'P'],
+     'down': ['D', 'AW', 'N'],
+     'in': ['IH', 'N'],
+     'out': ['AW', 'T'],
+     'on': ['AA', 'N'],
+     'off': ['AO', 'F'],
+     'over': ['OW', 'V', 'ER'],
+     'under': ['AH', 'N', 'D', 'ER'],
+     'again': ['AH', 'G', 'EH', 'N'],
+     'there': ['DH', 'EH', 'R'],
+     'here': ['HH', 'IY', 'R'],
+     'today': ['T', 'AH', 'D', 'EY'],
+     'now': ['N', 'AW'],
+     'my': ['M', 'AY'],
+     'your': ['Y', 'AO', 'R'],
+     'his': ['HH', 'IH', 'Z'],
+     'her': ['HH', 'ER'],
+     'our': ['AW', 'ER'],
+     'their': ['DH', 'EH', 'R'],
+     'test': ['T', 'EH', 'S', 'T'],
+     'testing': ['T', 'EH', 'S', 'T', 'IH', 'NG'],
+     'one': ['W', 'AH', 'N'],
+     'two': ['T', 'UW'],
+     'three': ['TH', 'R', 'IY'],
+     'four': ['F', 'AO', 'R'],
+     'five': ['F', 'AY', 'V'],
+     'name': ['N', 'EY', 'M'],
+     'vedes': ['V', 'IY', 'D', 'EH', 'S'],
+     'synthesis': ['S', 'IH', 'N', 'TH', 'AH', 'S', 'IH', 'S'],
+     'system': ['S', 'IH', 'S', 'T', 'AH', 'M'],
+ }
+
+ # Common letter patterns
+ PATTERNS = [
+     (r'tion', ['SH', 'AH', 'N']),
+     (r'sion', ['ZH', 'AH', 'N']),
+     (r'ough', ['AH', 'F']),
+     (r'ight', ['AY', 'T']),
+     (r'ould', ['UH', 'D']),
+     (r'th', ['TH']),
+     (r'ch', ['CH']),
+     (r'sh', ['SH']),
+     (r'ph', ['F']),
+     (r'wh', ['W']),
+     (r'ck', ['K']),
+     (r'ng', ['NG']),
+     (r'qu', ['K', 'W']),
+     (r'ee', ['IY']),
+     (r'ea', ['IY']),
+     (r'oo', ['UW']),
+     (r'ou', ['AW']),
+     (r'ow', ['OW']),
+     (r'ai', ['EY']),
+     (r'ay', ['EY']),
+     (r'oy', ['OY']),
+     (r'oi', ['OY']),
+     (r'au', ['AO']),
+     (r'aw', ['AO']),
+     (r'ie', ['IY']),
+     (r'ei', ['EY']),
+     (r'ue', ['UW']),
+     (r'ew', ['UW']),
+ ]

  # ============================================
+ # TEXT TO PHONEME CONVERTER
  # ============================================

+ class TextToPhoneme:
+     """Convert text to phoneme sequence"""
+
+     def __init__(self):
+         self.word_dict = WORD_PRONUNCIATIONS
+         self.letter_map = LETTER_TO_PHONEME
+         self.patterns = PATTERNS
+
+     def convert(self, text):
+         """Convert text to phoneme list"""
+         text = text.lower().strip()
+         words = re.findall(r"[\w']+|[.,!?;:\-]|\s+", text)
+
+         phonemes = []
+
+         for word in words:
+             # Whitespace tokens become silence; check this before strip(),
+             # which would otherwise reduce them to '' and skip them
+             if word.isspace():
+                 phonemes.append('SIL')
+                 continue
+
+             word = word.strip()
+             if not word:
+                 continue
+
+             if word in self.word_dict:
+                 phonemes.extend(self.word_dict[word])
+             elif word in '.,!?;:':
+                 phonemes.append('PAU')
+             else:
+                 # Convert letter by letter with pattern matching
+                 phonemes.extend(self._convert_word(word))
+
+         return phonemes
+
+     def _convert_word(self, word):
+         """Convert a single word to phonemes"""
+         phonemes = []
+         i = 0
+         word = word.lower()
+
+         while i < len(word):
+             matched = False
+
+             # Try pattern matching (longer patterns first)
+             for pattern, phon_list in sorted(self.patterns, key=lambda x: -len(x[0])):
+                 if word[i:].startswith(pattern):
+                     phonemes.extend(phon_list)
+                     i += len(pattern)
+                     matched = True
+                     break
+
+             if not matched:
+                 # Single letter conversion; always advance, even for
+                 # characters with no mapping, so the loop cannot stall
+                 char = word[i]
+                 if char in self.letter_map:
+                     phonemes.extend(self.letter_map[char])
+                 i += 1
+
+         return phonemes
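A quick sanity check of the converter; the output follows from the tables above (dictionary words hit WORD_PRONUNCIATIONS, and the space between them becomes SIL):

```python
t2p = TextToPhoneme()
print(t2p.convert("hello world"))
# -> ['HH', 'EH', 'L', 'OW', 'SIL', 'W', 'ER', 'L', 'D']
```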

+ # ============================================
+ # FORMANT SYNTHESIZER
+ # ============================================

+ class FormantSynthesizer:
+     """Klatt-style formant synthesizer"""

+     def __init__(self, sample_rate=22050):
+         self.sample_rate = sample_rate
+         self.base_f0 = 120  # Base fundamental frequency

+     def synthesize(self, phonemes, speaking_rate=1.0, pitch_shift=0):
+         """Synthesize audio from phoneme sequence"""
+         if not phonemes:
+             return np.zeros(1000, dtype=np.float32)

+         # Adjust pitch
+         f0 = self.base_f0 * (2 ** (pitch_shift / 12))

+         audio_segments = []

+         for i, phoneme in enumerate(phonemes):
+             if phoneme not in PHONEMES:
+                 continue
+
+             f1, f2, f3, duration_ms, is_voiced = PHONEMES[phoneme]
+
+             # Adjust duration for speaking rate
+             duration_ms = int(duration_ms / speaking_rate)
+             duration_ms = max(30, min(duration_ms, 300))
+
+             # Generate phoneme audio
+             segment = self._generate_phoneme(
+                 f0, f1, f2, f3, duration_ms, is_voiced, phoneme
+             )
+
+             audio_segments.append(segment)

+         if not audio_segments:
+             return np.zeros(1000, dtype=np.float32)

+         # Concatenate with smoothing
+         audio = self._concatenate_smooth(audio_segments)

+         # Apply overall envelope and normalization
+         audio = self._apply_envelope(audio)
+         audio = audio / (np.max(np.abs(audio)) + 1e-8)

+         return audio.astype(np.float32)
 
 
 

+     def _generate_phoneme(self, f0, f1, f2, f3, duration_ms, is_voiced, phoneme):
+         """Generate audio for a single phoneme"""
+         n_samples = int(self.sample_rate * duration_ms / 1000)
+         t = np.linspace(0, duration_ms / 1000, n_samples)

+         if phoneme in ['SIL', 'PAU']:
+             return np.zeros(n_samples, dtype=np.float32)

+         if is_voiced:
+             # Generate glottal pulse train
+             source = self._generate_voice_source(t, f0)
+         else:
+             # Generate noise for unvoiced
+             source = np.random.randn(n_samples) * 0.3

+         # Apply formant filtering
+         if f1 > 0:
+             audio = self._apply_formants(source, [f1, f2, f3])
+         else:
+             audio = source

+         # Apply consonant characteristics
+         audio = self._apply_consonant_shape(audio, phoneme)

+         # Apply envelope
+         audio = self._apply_phoneme_envelope(audio, phoneme)

+         return audio.astype(np.float32)
+
+     def _generate_voice_source(self, t, f0):
+         """Generate glottal source with harmonics"""
+         source = np.zeros_like(t)

+         # Add harmonics with decreasing amplitude
+         for harmonic in range(1, 12):
+             freq = f0 * harmonic
+             if freq > self.sample_rate / 2:
+                 break
+             amp = 1.0 / (harmonic ** 1.2)
+             # Add slight vibrato
+             vibrato = 1 + 0.01 * np.sin(2 * np.pi * 5 * t)
+             source += amp * np.sin(2 * np.pi * freq * vibrato * t)

+         # Add some noise for naturalness
+         source += np.random.randn(len(t)) * 0.02

+         return source
+
+     def _apply_formants(self, source, formants):
+         """Apply formant filtering using resonators"""
+         audio = source.copy()

+         for i, f in enumerate(formants):
+             if f <= 0 or f >= self.sample_rate / 2:
+                 continue

+             # Bandwidth increases with formant number
+             bandwidth = 60 + i * 40

+             # Design bandpass filter
+             try:
+                 low = max(20, f - bandwidth)
+                 high = min(self.sample_rate / 2 - 100, f + bandwidth)
+
+                 if low >= high:
+                     continue
+
+                 b, a = signal.butter(
+                     2,
+                     [low / (self.sample_rate / 2), high / (self.sample_rate / 2)],
+                     btype='band'
+                 )
+
+                 filtered = signal.filtfilt(b, a, source)
+
+                 # Weight formants (F1 strongest)
+                 weight = 1.0 / (i + 1)
+                 audio = audio + filtered * weight
+
+             except Exception:
+                 pass

+         return audio
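A Butterworth bandpass per formant is one option; the classic Klatt formulation instead uses a two-pole resonator whose pole radius encodes the bandwidth. A minimal sketch of that alternative, with illustrative centre frequencies and bandwidths:

```python
import numpy as np
from scipy import signal

def resonator(source, freq, bw, sr=22050):
    """Two-pole Klatt-style resonator: freq and bw in Hz."""
    r = np.exp(-np.pi * bw / sr)                # pole radius from bandwidth
    theta = 2 * np.pi * freq / sr               # pole angle from centre frequency
    a = [1.0, -2 * r * np.cos(theta), r * r]
    b = [1 - 2 * r * np.cos(theta) + r * r]     # Klatt's gain term (unity gain at DC)
    return signal.lfilter(b, a, source)

# Rough /a/-like colouring of a noise source (formant values are illustrative)
vowel_ish = resonator(resonator(np.random.randn(2048), 730, 90), 1090, 110)
```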

+     def _apply_consonant_shape(self, audio, phoneme):
+         """Apply consonant-specific characteristics"""
+         n = len(audio)

+         # Plosives: silence then burst
+         if phoneme in ['P', 'T', 'K', 'B', 'D', 'G']:
+             silence_len = n // 3
+             audio[:silence_len] = 0
+             burst = np.random.randn(n // 6) * 0.5
+             audio[silence_len:silence_len + len(burst)] += burst

+         # Fricatives: add more noise
+         elif phoneme in ['F', 'S', 'SH', 'TH', 'HH']:
+             noise = np.random.randn(n) * 0.3
+
+             # High-pass for 's' and 'sh'
+             if phoneme in ['S', 'SH']:
+                 try:
+                     b, a = signal.butter(2, 3000 / (self.sample_rate / 2), btype='high')
+                     noise = signal.filtfilt(b, a, noise)
+                 except Exception:
+                     pass
+
+             audio = audio * 0.3 + noise * 0.7

+         # Nasals: add low frequency resonance
+         elif phoneme in ['M', 'N', 'NG']:
+             try:
+                 b, a = signal.butter(2, 500 / (self.sample_rate / 2), btype='low')
+                 low_comp = signal.filtfilt(b, a, audio)
+                 audio = audio * 0.5 + low_comp * 0.5
+             except Exception:
+                 pass

+         return audio

+     def _apply_phoneme_envelope(self, audio, phoneme):
+         """Apply amplitude envelope to phoneme"""
+         n = len(audio)
+         if n < 4:
+             return audio
+
+         envelope = np.ones(n)
+
+         # Attack and release times depend on phoneme type
+         if phoneme in ['P', 'T', 'K', 'B', 'D', 'G']:
+             # Plosives: sharp attack
+             attack = max(1, n // 8)
+             release = max(1, n // 4)
+         elif phoneme in ['F', 'S', 'SH', 'V', 'Z', 'ZH', 'TH', 'DH']:
+             # Fricatives: gradual
+             attack = max(1, n // 4)
+             release = max(1, n // 4)
+         else:
+             # Vowels and sonorants
+             attack = max(1, n // 5)
+             release = max(1, n // 5)

+         envelope[:attack] = np.linspace(0, 1, attack)
+         envelope[-release:] = np.linspace(1, 0, release)

+         return audio * envelope
+
+     def _concatenate_smooth(self, segments):
+         """Concatenate segments with crossfade"""
+         if len(segments) == 0:
+             return np.zeros(1000, dtype=np.float32)

+         if len(segments) == 1:
+             return segments[0]

+         # Calculate total length with overlap
+         overlap = 64
+         total_length = sum(len(s) for s in segments) - overlap * (len(segments) - 1)
+         total_length = max(total_length, 1)

+         audio = np.zeros(total_length, dtype=np.float32)

+         pos = 0
+         for i, segment in enumerate(segments):
+             if len(segment) == 0:
+                 continue
+
+             end_pos = min(pos + len(segment), total_length)
+             seg_len = end_pos - pos
+
+             if seg_len <= 0:
+                 break
+
+             # Crossfade with previous segment
+             if i > 0 and pos > 0:
+                 fade_len = min(overlap, seg_len, pos)
+                 if fade_len > 0:
+                     fade_in = np.linspace(0, 1, fade_len)
+                     fade_out = np.linspace(1, 0, fade_len)
+
+                     audio[pos:pos + fade_len] *= fade_out
+                     segment_copy = segment[:seg_len].copy()
+                     segment_copy[:fade_len] *= fade_in
+                     audio[pos:end_pos] += segment_copy
+                 else:
+                     audio[pos:end_pos] = segment[:seg_len]
+             else:
+                 audio[pos:end_pos] = segment[:seg_len]
+
+             pos = end_pos - overlap
+             pos = max(0, pos)

+         return audio

+     def _apply_envelope(self, audio):
+         """Apply overall envelope"""
+         n = len(audio)
+         if n < 100:
+             return audio
+
+         fade_len = min(n // 20, 500)
+         audio[:fade_len] *= np.linspace(0, 1, fade_len)
+         audio[-fade_len:] *= np.linspace(1, 0, fade_len)

          return audio

  # ============================================
+ # VEDES TTS MAIN CLASS
  # ============================================

+ class VedesTTS:
+     """Main TTS class"""

      def __init__(self, sample_rate=22050):
          self.sample_rate = sample_rate
+         self.text_to_phoneme = TextToPhoneme()
+         self.synthesizer = FormantSynthesizer(sample_rate)

+     def synthesize(self, text, speaking_rate=1.0, pitch_shift=0):
+         """Convert text to speech"""
+         # Text to phonemes
+         phonemes = self.text_to_phoneme.convert(text)

+         if not phonemes:
+             return np.zeros(self.sample_rate, dtype=np.float32)

+         # Phonemes to audio
+         audio = self.synthesizer.synthesize(phonemes, speaking_rate, pitch_shift)

          return audio
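An end-to-end usage sketch outside Gradio: synthesize a phrase and write it to disk with the wavfile import from the top of the file (the output filename is illustrative):

```python
tts_demo = VedesTTS(sample_rate=22050)
audio = tts_demo.synthesize("hello world", speaking_rate=1.0, pitch_shift=0)
pcm = (np.clip(audio, -1, 1) * 32767).astype(np.int16)
wavfile.write("vedes_demo.wav", 22050, pcm)
```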


  # ============================================

  print("=" * 50)
+ print("🎙️ Initializing Vedes TTS...")
  print("=" * 50)

+ tts = VedesTTS(config.sample_rate)

+ print("✅ Vedes TTS initialized successfully!")
  print("=" * 50)

  # SYNTHESIS FUNCTION
  # ============================================

+ def synthesize_speech(text, speaking_rate=1.0, pitch_shift=0, voice_type="neutral"):
+     """Main synthesis function for Gradio"""
      if not text or len(text.strip()) == 0:
          return None

+     text = text.strip()[:1000]  # Limit length

      try:
+         # Adjust base pitch for voice type
+         pitch_adjust = pitch_shift
+         if voice_type == "high":
+             pitch_adjust += 5
+         elif voice_type == "low":
+             pitch_adjust -= 5

+         # Synthesize
+         audio = tts.synthesize(text, speaking_rate, pitch_adjust)

+         if len(audio) < 100:
+             return None
+
+         # Convert to int16
          audio = np.clip(audio, -1, 1)
          audio_int16 = (audio * 32767).astype(np.int16)

      except Exception as e:
          print(f"Synthesis error: {e}")
+         return None


  # ============================================
  # GRADIO INTERFACE
  # ============================================

+ with gr.Blocks(
+     title="Vedes TTS",
+     theme=gr.themes.Soft(primary_hue="blue", secondary_hue="purple")
+ ) as demo:
+
      gr.Markdown(
          """
          # 🎙️ Vedes TTS - Text-to-Speech Synthesis
+         ### A formant-based speech synthesizer built from scratch

+         Type any text below and hear it spoken!
          """
      )

      with gr.Row():
          with gr.Column(scale=2):
              text_input = gr.Textbox(
+                 label="📝 Enter Text",
+                 placeholder="Type something to synthesize... (e.g., 'Hello, welcome to Vedes!')",
                  lines=4,
                  max_lines=10
              )
                  maximum=2.0,
                  value=1.0,
                  step=0.1,
+                 label="🎚️ Speaking Rate",
+                 info="Slower ← → Faster"
              )

              pitch_shift = gr.Slider(
+                 minimum=-10,
+                 maximum=10,
                  value=0,
                  step=1,
+                 label="🎵 Pitch Shift",
+                 info="Lower ← → Higher"
              )

+             voice_type = gr.Radio(
+                 choices=["neutral", "high", "low"],
+                 value="neutral",
+                 label="🗣️ Voice Type"
+             )
+
+             synthesize_btn = gr.Button(
+                 "🔊 Synthesize Speech",
+                 variant="primary",
+                 size="lg"
+             )

          with gr.Column(scale=1):
              audio_output = gr.Audio(

      gr.Examples(
          examples=[
+             ["Hello, welcome to Vedes text to speech!"],
              ["The quick brown fox jumps over the lazy dog."],
              ["How are you doing today?"],
              ["This is a test of the speech synthesis system."],
+             ["Good morning! Nice to meet you."],
+             ["One, two, three, four, five."],
+             ["Please say hello to my friend."],
+             ["What is your name?"],
          ],
          inputs=text_input,
+         label="📚 Try These Examples"
      )

      gr.Markdown(
          ---
          ### ℹ️ About Vedes TTS

+         **How it works:**
+         1. **Text Processing** - Converts text to phonemes using pronunciation rules
+         2. **Formant Synthesis** - Generates speech using formant frequencies (F1, F2, F3)
+         3. **Source-Filter Model** - Combines glottal source with vocal tract filtering

+         **Features:**
+         - 🔤 Letter-to-phoneme conversion with common word dictionary
+         - 🎵 Adjustable pitch and speaking rate
+         - 🗣️ Multiple voice types (neutral, high, low pitch)
+         - ⚡ Real-time synthesis - no neural network required!
+
+         **Supported:** English text with basic punctuation
+
+         ---
+         *Built with Python, NumPy, SciPy, and Gradio* ❤️
          """
      )

      # Event handlers
      synthesize_btn.click(
          fn=synthesize_speech,
+         inputs=[text_input, speaking_rate, pitch_shift, voice_type],
          outputs=audio_output
      )

      text_input.submit(
          fn=synthesize_speech,
+         inputs=[text_input, speaking_rate, pitch_shift, voice_type],
          outputs=audio_output
      )

+
  # Launch
  if __name__ == "__main__":
      demo.launch()