vedaco committed on
Commit
ee1f09c
Β·
verified Β·
1 Parent(s): fec6ba4

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +288 -736
app.py CHANGED
@@ -1,802 +1,354 @@
1
- import numpy as np
2
  import gradio as gr
3
- from scipy import signal
 
 
 
 
4
  from scipy.io import wavfile
 
5
  import io
6
- import re
7
 
8
  # ============================================
9
- # VEDES TTS - Formant-Based Speech Synthesizer
10
  # ============================================
11
 
12
- class VedesConfig:
13
- """Configuration"""
14
- sample_rate = 22050
15
-
16
-
17
- config = VedesConfig()
18
-
19
- # ============================================
20
- # PHONEME DEFINITIONS
21
- # ============================================
22
-
23
- # Phoneme to formant mapping (F1, F2, F3, duration_ms, is_voiced)
24
- PHONEMES = {
25
- # Vowels (voiced)
26
- 'AA': (710, 1100, 2540, 120, True), # father
27
- 'AE': (660, 1720, 2410, 120, True), # cat
28
- 'AH': (520, 1190, 2390, 100, True), # but
29
- 'AO': (570, 840, 2410, 120, True), # dog
30
- 'AW': (630, 1200, 2550, 150, True), # how
31
- 'AY': (710, 1100, 2540, 150, True), # my
32
- 'EH': (530, 1840, 2480, 100, True), # bed
33
- 'ER': (490, 1350, 1690, 120, True), # bird
34
- 'EY': (450, 2100, 2680, 140, True), # say
35
- 'IH': (400, 1920, 2560, 80, True), # bit
36
- 'IY': (270, 2290, 3010, 120, True), # see
37
- 'OW': (450, 850, 2500, 140, True), # go
38
- 'OY': (490, 1350, 2480, 160, True), # boy
39
- 'UH': (440, 1020, 2240, 100, True), # book
40
- 'UW': (300, 870, 2240, 120, True), # too
41
-
42
- # Consonants - Stops
43
- 'B': (200, 1100, 2150, 60, True),
44
- 'D': (200, 1600, 2600, 50, True),
45
- 'G': (200, 1990, 2850, 50, True),
46
- 'P': (200, 800, 2000, 80, False),
47
- 'T': (200, 1600, 2600, 70, False),
48
- 'K': (200, 1990, 2850, 80, False),
49
-
50
- # Consonants - Fricatives
51
- 'F': (175, 900, 2400, 100, False),
52
- 'V': (175, 1100, 2400, 80, True),
53
- 'TH': (200, 1400, 2200, 80, False),
54
- 'DH': (200, 1600, 2400, 60, True),
55
- 'S': (200, 1800, 4000, 100, False),
56
- 'Z': (200, 1600, 3500, 80, True),
57
- 'SH': (200, 1800, 2600, 100, False),
58
- 'ZH': (200, 1800, 2600, 80, True),
59
- 'HH': (280, 1200, 2400, 80, False),
60
-
61
- # Consonants - Nasals
62
- 'M': (280, 900, 2200, 80, True),
63
- 'N': (280, 1700, 2600, 70, True),
64
- 'NG': (280, 2300, 2750, 80, True),
65
-
66
- # Consonants - Liquids
67
- 'L': (350, 1100, 2700, 70, True),
68
- 'R': (420, 1300, 1600, 70, True),
69
-
70
- # Consonants - Glides
71
- 'W': (300, 870, 2240, 60, True),
72
- 'Y': (280, 2250, 3000, 50, True),
73
-
74
- # Special
75
- 'CH': (200, 1800, 2600, 100, False),
76
- 'JH': (200, 1800, 2600, 80, True),
77
-
78
- # Silence
79
- 'SIL': (0, 0, 0, 100, False),
80
- 'PAU': (0, 0, 0, 150, False),
81
- }
82
-
83
- # Letter to phoneme mapping (simplified)
84
- LETTER_TO_PHONEME = {
85
- 'a': ['AE'],
86
- 'b': ['B'],
87
- 'c': ['K'],
88
- 'd': ['D'],
89
- 'e': ['EH'],
90
- 'f': ['F'],
91
- 'g': ['G'],
92
- 'h': ['HH'],
93
- 'i': ['IH'],
94
- 'j': ['JH'],
95
- 'k': ['K'],
96
- 'l': ['L'],
97
- 'm': ['M'],
98
- 'n': ['N'],
99
- 'o': ['AA'],
100
- 'p': ['P'],
101
- 'q': ['K', 'W'],
102
- 'r': ['R'],
103
- 's': ['S'],
104
- 't': ['T'],
105
- 'u': ['AH'],
106
- 'v': ['V'],
107
- 'w': ['W'],
108
- 'x': ['K', 'S'],
109
- 'y': ['Y'],
110
- 'z': ['Z'],
111
- ' ': ['SIL'],
112
- '.': ['PAU'],
113
- ',': ['PAU'],
114
- '!': ['PAU'],
115
- '?': ['PAU'],
116
- '-': ['SIL'],
117
- "'": [],
118
- }
119
 
120
- # Common word pronunciations
121
- WORD_PRONUNCIATIONS = {
122
- 'the': ['DH', 'AH'],
123
- 'a': ['AH'],
124
- 'an': ['AE', 'N'],
125
- 'is': ['IH', 'Z'],
126
- 'are': ['AA', 'R'],
127
- 'was': ['W', 'AA', 'Z'],
128
- 'were': ['W', 'ER'],
129
- 'be': ['B', 'IY'],
130
- 'been': ['B', 'IH', 'N'],
131
- 'have': ['HH', 'AE', 'V'],
132
- 'has': ['HH', 'AE', 'Z'],
133
- 'had': ['HH', 'AE', 'D'],
134
- 'do': ['D', 'UW'],
135
- 'does': ['D', 'AH', 'Z'],
136
- 'did': ['D', 'IH', 'D'],
137
- 'will': ['W', 'IH', 'L'],
138
- 'would': ['W', 'UH', 'D'],
139
- 'could': ['K', 'UH', 'D'],
140
- 'should': ['SH', 'UH', 'D'],
141
- 'can': ['K', 'AE', 'N'],
142
- 'may': ['M', 'EY'],
143
- 'might': ['M', 'AY', 'T'],
144
- 'must': ['M', 'AH', 'S', 'T'],
145
- 'i': ['AY'],
146
- 'you': ['Y', 'UW'],
147
- 'he': ['HH', 'IY'],
148
- 'she': ['SH', 'IY'],
149
- 'it': ['IH', 'T'],
150
- 'we': ['W', 'IY'],
151
- 'they': ['DH', 'EY'],
152
- 'this': ['DH', 'IH', 'S'],
153
- 'that': ['DH', 'AE', 'T'],
154
- 'what': ['W', 'AH', 'T'],
155
- 'which': ['W', 'IH', 'CH'],
156
- 'who': ['HH', 'UW'],
157
- 'how': ['HH', 'AW'],
158
- 'when': ['W', 'EH', 'N'],
159
- 'where': ['W', 'EH', 'R'],
160
- 'why': ['W', 'AY'],
161
- 'all': ['AO', 'L'],
162
- 'each': ['IY', 'CH'],
163
- 'every': ['EH', 'V', 'R', 'IY'],
164
- 'both': ['B', 'OW', 'TH'],
165
- 'few': ['F', 'Y', 'UW'],
166
- 'more': ['M', 'AO', 'R'],
167
- 'most': ['M', 'OW', 'S', 'T'],
168
- 'other': ['AH', 'DH', 'ER'],
169
- 'some': ['S', 'AH', 'M'],
170
- 'such': ['S', 'AH', 'CH'],
171
- 'no': ['N', 'OW'],
172
- 'not': ['N', 'AA', 'T'],
173
- 'only': ['OW', 'N', 'L', 'IY'],
174
- 'same': ['S', 'EY', 'M'],
175
- 'so': ['S', 'OW'],
176
- 'than': ['DH', 'AE', 'N'],
177
- 'too': ['T', 'UW'],
178
- 'very': ['V', 'EH', 'R', 'IY'],
179
- 'just': ['JH', 'AH', 'S', 'T'],
180
- 'hello': ['HH', 'EH', 'L', 'OW'],
181
- 'hi': ['HH', 'AY'],
182
- 'welcome': ['W', 'EH', 'L', 'K', 'AH', 'M'],
183
- 'to': ['T', 'UW'],
184
- 'world': ['W', 'ER', 'L', 'D'],
185
- 'speech': ['S', 'P', 'IY', 'CH'],
186
- 'text': ['T', 'EH', 'K', 'S', 'T'],
187
- 'voice': ['V', 'OY', 'S'],
188
- 'sound': ['S', 'AW', 'N', 'D'],
189
- 'good': ['G', 'UH', 'D'],
190
- 'great': ['G', 'R', 'EY', 'T'],
191
- 'nice': ['N', 'AY', 'S'],
192
- 'thank': ['TH', 'AE', 'NG', 'K'],
193
- 'thanks': ['TH', 'AE', 'NG', 'K', 'S'],
194
- 'please': ['P', 'L', 'IY', 'Z'],
195
- 'yes': ['Y', 'EH', 'S'],
196
- 'yeah': ['Y', 'AE'],
197
- 'ok': ['OW', 'K', 'EY'],
198
- 'okay': ['OW', 'K', 'EY'],
199
- 'and': ['AE', 'N', 'D'],
200
- 'or': ['AO', 'R'],
201
- 'but': ['B', 'AH', 'T'],
202
- 'if': ['IH', 'F'],
203
- 'then': ['DH', 'EH', 'N'],
204
- 'because': ['B', 'IH', 'K', 'AO', 'Z'],
205
- 'as': ['AE', 'Z'],
206
- 'until': ['AH', 'N', 'T', 'IH', 'L'],
207
- 'while': ['W', 'AY', 'L'],
208
- 'of': ['AH', 'V'],
209
- 'at': ['AE', 'T'],
210
- 'by': ['B', 'AY'],
211
- 'for': ['F', 'AO', 'R'],
212
- 'with': ['W', 'IH', 'TH'],
213
- 'about': ['AH', 'B', 'AW', 'T'],
214
- 'into': ['IH', 'N', 'T', 'UW'],
215
- 'through': ['TH', 'R', 'UW'],
216
- 'during': ['D', 'UH', 'R', 'IH', 'NG'],
217
- 'before': ['B', 'IH', 'F', 'AO', 'R'],
218
- 'after': ['AE', 'F', 'T', 'ER'],
219
- 'above': ['AH', 'B', 'AH', 'V'],
220
- 'below': ['B', 'IH', 'L', 'OW'],
221
- 'from': ['F', 'R', 'AH', 'M'],
222
- 'up': ['AH', 'P'],
223
- 'down': ['D', 'AW', 'N'],
224
- 'in': ['IH', 'N'],
225
- 'out': ['AW', 'T'],
226
- 'on': ['AA', 'N'],
227
- 'off': ['AO', 'F'],
228
- 'over': ['OW', 'V', 'ER'],
229
- 'under': ['AH', 'N', 'D', 'ER'],
230
- 'again': ['AH', 'G', 'EH', 'N'],
231
- 'there': ['DH', 'EH', 'R'],
232
- 'here': ['HH', 'IY', 'R'],
233
- 'today': ['T', 'AH', 'D', 'EY'],
234
- 'now': ['N', 'AW'],
235
- 'my': ['M', 'AY'],
236
- 'your': ['Y', 'AO', 'R'],
237
- 'his': ['HH', 'IH', 'Z'],
238
- 'her': ['HH', 'ER'],
239
- 'our': ['AW', 'ER'],
240
- 'their': ['DH', 'EH', 'R'],
241
- 'test': ['T', 'EH', 'S', 'T'],
242
- 'testing': ['T', 'EH', 'S', 'T', 'IH', 'NG'],
243
- 'one': ['W', 'AH', 'N'],
244
- 'two': ['T', 'UW'],
245
- 'three': ['TH', 'R', 'IY'],
246
- 'four': ['F', 'AO', 'R'],
247
- 'five': ['F', 'AY', 'V'],
248
- 'name': ['N', 'EY', 'M'],
249
- 'vedes': ['V', 'IY', 'D', 'EH', 'S'],
250
- 'synthesis': ['S', 'IH', 'N', 'TH', 'AH', 'S', 'IH', 'S'],
251
- 'system': ['S', 'IH', 'S', 'T', 'AH', 'M'],
252
  }
253
 
254
- # Common letter patterns
255
- PATTERNS = [
256
- (r'tion', ['SH', 'AH', 'N']),
257
- (r'sion', ['ZH', 'AH', 'N']),
258
- (r'ough', ['AH', 'F']),
259
- (r'ight', ['AY', 'T']),
260
- (r'ould', ['UH', 'D']),
261
- (r'tion', ['SH', 'AH', 'N']),
262
- (r'th', ['TH']),
263
- (r'ch', ['CH']),
264
- (r'sh', ['SH']),
265
- (r'ph', ['F']),
266
- (r'wh', ['W']),
267
- (r'ck', ['K']),
268
- (r'ng', ['NG']),
269
- (r'qu', ['K', 'W']),
270
- (r'ee', ['IY']),
271
- (r'ea', ['IY']),
272
- (r'oo', ['UW']),
273
- (r'ou', ['AW']),
274
- (r'ow', ['OW']),
275
- (r'ai', ['EY']),
276
- (r'ay', ['EY']),
277
- (r'oy', ['OY']),
278
- (r'oi', ['OY']),
279
- (r'au', ['AO']),
280
- (r'aw', ['AO']),
281
- (r'ie', ['IY']),
282
- (r'ei', ['EY']),
283
- (r'ue', ['UW']),
284
- (r'ew', ['UW']),
285
- ]
286
-
287
 
288
- # ============================================
289
- # TEXT TO PHONEME CONVERTER
290
- # ============================================
291
 
292
- class TextToPhoneme:
293
- """Convert text to phoneme sequence"""
294
 
295
- def __init__(self):
296
- self.word_dict = WORD_PRONUNCIATIONS
297
- self.letter_map = LETTER_TO_PHONEME
298
- self.patterns = PATTERNS
299
 
300
- def convert(self, text):
301
- """Convert text to phoneme list"""
302
- text = text.lower().strip()
303
- words = re.findall(r"[\w']+|[.,!?;:\-]|\s+", text)
304
-
305
- phonemes = []
306
-
307
- for word in words:
308
- word = word.strip()
309
- if not word:
310
- continue
311
-
312
- if word in self.word_dict:
313
- phonemes.extend(self.word_dict[word])
314
- elif word.isspace():
315
- phonemes.append('SIL')
316
- elif word in '.,!?;:':
317
- phonemes.append('PAU')
318
- else:
319
- # Convert letter by letter with pattern matching
320
- phonemes.extend(self._convert_word(word))
321
-
322
- return phonemes
323
 
324
- def _convert_word(self, word):
325
- """Convert a single word to phonemes"""
326
- phonemes = []
327
- i = 0
328
- word = word.lower()
329
-
330
- while i < len(word):
331
- matched = False
332
-
333
- # Try pattern matching (longer patterns first)
334
- for pattern, phon_list in sorted(self.patterns, key=lambda x: -len(x[0])):
335
- if word[i:].startswith(pattern):
336
- phonemes.extend(phon_list)
337
- i += len(pattern)
338
- matched = True
339
- break
340
-
341
- if not matched:
342
- # Single letter conversion
343
- char = word[i]
344
- if char in self.letter_map:
345
- phonemes.extend(self.letter_map[char])
346
- i += 1
347
-
348
- return phonemes
349
 
350
 
351
- # ============================================
352
- # FORMANT SYNTHESIZER
353
- # ============================================
354
-
355
- class FormantSynthesizer:
356
- """Klatt-style formant synthesizer"""
357
 
358
- def __init__(self, sample_rate=22050):
359
- self.sample_rate = sample_rate
360
- self.base_f0 = 120 # Base fundamental frequency
 
 
361
 
362
- def synthesize(self, phonemes, speaking_rate=1.0, pitch_shift=0):
363
- """Synthesize audio from phoneme sequence"""
364
- if not phonemes:
365
- return np.zeros(1000, dtype=np.float32)
366
-
367
- # Adjust pitch
368
- f0 = self.base_f0 * (2 ** (pitch_shift / 12))
369
-
370
- audio_segments = []
371
-
372
- for i, phoneme in enumerate(phonemes):
373
- if phoneme not in PHONEMES:
374
- continue
375
-
376
- f1, f2, f3, duration_ms, is_voiced = PHONEMES[phoneme]
377
-
378
- # Adjust duration for speaking rate
379
- duration_ms = int(duration_ms / speaking_rate)
380
- duration_ms = max(30, min(duration_ms, 300))
381
-
382
- # Generate phoneme audio
383
- segment = self._generate_phoneme(
384
- f0, f1, f2, f3, duration_ms, is_voiced, phoneme
385
- )
386
-
387
- audio_segments.append(segment)
388
-
389
- if not audio_segments:
390
- return np.zeros(1000, dtype=np.float32)
391
-
392
- # Concatenate with smoothing
393
- audio = self._concatenate_smooth(audio_segments)
394
-
395
- # Apply overall envelope and normalization
396
- audio = self._apply_envelope(audio)
397
- audio = audio / (np.max(np.abs(audio)) + 1e-8)
398
-
399
- return audio.astype(np.float32)
400
 
401
- def _generate_phoneme(self, f0, f1, f2, f3, duration_ms, is_voiced, phoneme):
402
- """Generate audio for a single phoneme"""
403
- n_samples = int(self.sample_rate * duration_ms / 1000)
404
- t = np.linspace(0, duration_ms / 1000, n_samples)
405
-
406
- if phoneme in ['SIL', 'PAU']:
407
- return np.zeros(n_samples, dtype=np.float32)
408
-
409
- if is_voiced:
410
- # Generate glottal pulse train
411
- source = self._generate_voice_source(t, f0)
412
- else:
413
- # Generate noise for unvoiced
414
- source = np.random.randn(n_samples) * 0.3
415
-
416
- # Apply formant filtering
417
- if f1 > 0:
418
- audio = self._apply_formants(source, [f1, f2, f3])
419
- else:
420
- audio = source
421
-
422
- # Apply consonant characteristics
423
- audio = self._apply_consonant_shape(audio, phoneme)
424
-
425
- # Apply envelope
426
- audio = self._apply_phoneme_envelope(audio, phoneme)
427
-
428
- return audio.astype(np.float32)
429
 
430
- def _generate_voice_source(self, t, f0):
431
- """Generate glottal source with harmonics"""
432
- source = np.zeros_like(t)
433
-
434
- # Add harmonics with decreasing amplitude
435
- for harmonic in range(1, 12):
436
- freq = f0 * harmonic
437
- if freq > self.sample_rate / 2:
438
- break
439
- amp = 1.0 / (harmonic ** 1.2)
440
- # Add slight vibrato
441
- vibrato = 1 + 0.01 * np.sin(2 * np.pi * 5 * t)
442
- source += amp * np.sin(2 * np.pi * freq * vibrato * t)
443
-
444
- # Add some noise for naturalness
445
- source += np.random.randn(len(t)) * 0.02
446
-
447
- return source
448
 
449
- def _apply_formants(self, source, formants):
450
- """Apply formant filtering using resonators"""
451
- audio = source.copy()
452
-
453
- for i, f in enumerate(formants):
454
- if f <= 0 or f >= self.sample_rate / 2:
455
- continue
456
-
457
- # Bandwidth increases with formant number
458
- bandwidth = 60 + i * 40
459
-
460
- # Design bandpass filter
461
- try:
462
- low = max(20, f - bandwidth)
463
- high = min(self.sample_rate / 2 - 100, f + bandwidth)
464
-
465
- if low >= high:
466
- continue
467
-
468
- b, a = signal.butter(
469
- 2,
470
- [low / (self.sample_rate / 2), high / (self.sample_rate / 2)],
471
- btype='band'
472
- )
473
-
474
- filtered = signal.filtfilt(b, a, source)
475
-
476
- # Weight formants (F1 strongest)
477
- weight = 1.0 / (i + 1)
478
- audio = audio + filtered * weight
479
-
480
- except Exception:
481
- pass
482
-
483
- return audio
484
 
485
- def _apply_consonant_shape(self, audio, phoneme):
486
- """Apply consonant-specific characteristics"""
487
- n = len(audio)
488
-
489
- # Plosives: silence then burst
490
- if phoneme in ['P', 'T', 'K', 'B', 'D', 'G']:
491
- silence_len = n // 3
492
- audio[:silence_len] = 0
493
- burst = np.random.randn(n // 6) * 0.5
494
- audio[silence_len:silence_len + len(burst)] += burst
495
-
496
- # Fricatives: add more noise
497
- elif phoneme in ['F', 'S', 'SH', 'TH', 'HH']:
498
- noise = np.random.randn(n) * 0.3
499
-
500
- # High-pass for 's' and 'sh'
501
- if phoneme in ['S', 'SH']:
502
- try:
503
- b, a = signal.butter(2, 3000 / (self.sample_rate / 2), btype='high')
504
- noise = signal.filtfilt(b, a, noise)
505
- except:
506
- pass
507
-
508
- audio = audio * 0.3 + noise * 0.7
509
-
510
- # Nasals: add low frequency resonance
511
- elif phoneme in ['M', 'N', 'NG']:
512
- try:
513
- b, a = signal.butter(2, 500 / (self.sample_rate / 2), btype='low')
514
- low_comp = signal.filtfilt(b, a, audio)
515
- audio = audio * 0.5 + low_comp * 0.5
516
- except:
517
- pass
518
-
519
- return audio
520
-
521
- def _apply_phoneme_envelope(self, audio, phoneme):
522
- """Apply amplitude envelope to phoneme"""
523
- n = len(audio)
524
- if n < 4:
525
- return audio
526
-
527
- envelope = np.ones(n)
528
-
529
- # Attack and release times depend on phoneme type
530
- if phoneme in ['P', 'T', 'K', 'B', 'D', 'G']:
531
- # Plosives: sharp attack
532
- attack = max(1, n // 8)
533
- release = max(1, n // 4)
534
- elif phoneme in ['F', 'S', 'SH', 'V', 'Z', 'ZH', 'TH', 'DH']:
535
- # Fricatives: gradual
536
- attack = max(1, n // 4)
537
- release = max(1, n // 4)
538
- else:
539
- # Vowels and sonorants
540
- attack = max(1, n // 5)
541
- release = max(1, n // 5)
542
-
543
- envelope[:attack] = np.linspace(0, 1, attack)
544
- envelope[-release:] = np.linspace(1, 0, release)
545
-
546
- return audio * envelope
547
 
548
- def _concatenate_smooth(self, segments):
549
- """Concatenate segments with crossfade"""
550
- if len(segments) == 0:
551
- return np.zeros(1000, dtype=np.float32)
552
-
553
- if len(segments) == 1:
554
- return segments[0]
555
-
556
- # Calculate total length with overlap
557
- overlap = 64
558
- total_length = sum(len(s) for s in segments) - overlap * (len(segments) - 1)
559
- total_length = max(total_length, 1)
560
-
561
- audio = np.zeros(total_length, dtype=np.float32)
562
-
563
- pos = 0
564
- for i, segment in enumerate(segments):
565
- if len(segment) == 0:
566
- continue
567
-
568
- end_pos = min(pos + len(segment), total_length)
569
- seg_len = end_pos - pos
570
-
571
- if seg_len <= 0:
572
- break
573
-
574
- # Crossfade with previous segment
575
- if i > 0 and pos > 0:
576
- fade_len = min(overlap, seg_len, pos)
577
- if fade_len > 0:
578
- fade_in = np.linspace(0, 1, fade_len)
579
- fade_out = np.linspace(1, 0, fade_len)
580
-
581
- audio[pos:pos + fade_len] *= fade_out
582
- segment_copy = segment[:seg_len].copy()
583
- segment_copy[:fade_len] *= fade_in
584
- audio[pos:end_pos] += segment_copy
585
- else:
586
- audio[pos:end_pos] = segment[:seg_len]
587
- else:
588
- audio[pos:end_pos] = segment[:seg_len]
589
-
590
- pos = end_pos - overlap
591
- pos = max(0, pos)
592
-
593
- return audio
594
 
595
- def _apply_envelope(self, audio):
596
- """Apply overall envelope"""
597
- n = len(audio)
598
- if n < 100:
599
- return audio
600
-
601
- fade_len = min(n // 20, 500)
602
- audio[:fade_len] *= np.linspace(0, 1, fade_len)
603
- audio[-fade_len:] *= np.linspace(1, 0, fade_len)
604
-
605
- return audio
606
-
607
 
608
- # ============================================
609
- # VEDES TTS MAIN CLASS
610
- # ============================================
611
 
612
- class VedesTTS:
613
- """Main TTS class"""
 
 
614
 
615
- def __init__(self, sample_rate=22050):
616
- self.sample_rate = sample_rate
617
- self.text_to_phoneme = TextToPhoneme()
618
- self.synthesizer = FormantSynthesizer(sample_rate)
619
 
620
- def synthesize(self, text, speaking_rate=1.0, pitch_shift=0):
621
- """Convert text to speech"""
622
- # Text to phonemes
623
- phonemes = self.text_to_phoneme.convert(text)
624
-
625
- if not phonemes:
626
- return np.zeros(self.sample_rate, dtype=np.float32)
627
-
628
- # Phonemes to audio
629
- audio = self.synthesizer.synthesize(phonemes, speaking_rate, pitch_shift)
630
-
631
- return audio
632
-
633
-
634
- # ============================================
635
- # INITIALIZE
636
- # ============================================
637
-
638
- print("=" * 50)
639
- print("πŸŽ™οΈ Initializing Vedes TTS...")
640
- print("=" * 50)
641
-
642
- tts = VedesTTS(config.sample_rate)
643
-
644
- print("βœ… Vedes TTS initialized successfully!")
645
- print("=" * 50)
646
-
647
-
648
- # ============================================
649
- # SYNTHESIS FUNCTION
650
- # ============================================
651
-
652
- def synthesize_speech(text, speaking_rate=1.0, pitch_shift=0, voice_type="neutral"):
653
- """Main synthesis function for Gradio"""
654
- if not text or len(text.strip()) == 0:
655
- return None
656
 
657
- text = text.strip()[:1000] # Limit length
658
-
659
- try:
660
- # Adjust base pitch for voice type
661
- pitch_adjust = pitch_shift
662
- if voice_type == "high":
663
- pitch_adjust += 5
664
- elif voice_type == "low":
665
- pitch_adjust -= 5
666
-
667
- # Synthesize
668
- audio = tts.synthesize(text, speaking_rate, pitch_adjust)
669
-
670
- if len(audio) < 100:
671
- return None
672
-
673
- # Convert to int16
674
- audio = np.clip(audio, -1, 1)
675
- audio_int16 = (audio * 32767).astype(np.int16)
676
-
677
- return (config.sample_rate, audio_int16)
678
 
679
- except Exception as e:
680
- print(f"Synthesis error: {e}")
681
- return None
 
 
 
 
682
 
683
 
684
  # ============================================
685
  # GRADIO INTERFACE
686
  # ============================================
687
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
688
  with gr.Blocks(
689
  title="Vedes TTS",
690
- theme=gr.themes.Soft(primary_hue="blue", secondary_hue="purple")
 
 
 
 
691
  ) as demo:
692
 
693
- gr.Markdown(
694
- """
695
- # πŸŽ™οΈ Vedes TTS - Text-to-Speech Synthesis
696
- ### A formant-based speech synthesizer built from scratch
697
-
698
- Type any text below and hear it spoken!
699
- """
700
- )
701
 
702
- with gr.Row():
703
- with gr.Column(scale=2):
704
- text_input = gr.Textbox(
705
- label="πŸ“ Enter Text",
706
- placeholder="Type something to synthesize... (e.g., 'Hello, welcome to Vedes!')",
707
- lines=4,
708
- max_lines=10
709
- )
710
-
711
  with gr.Row():
712
- speaking_rate = gr.Slider(
713
- minimum=0.5,
714
- maximum=2.0,
715
- value=1.0,
716
- step=0.1,
717
- label="🎚️ Speaking Rate",
718
- info="Slower ← β†’ Faster"
719
- )
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
720
 
721
- pitch_shift = gr.Slider(
722
- minimum=-10,
723
- maximum=10,
724
- value=0,
725
- step=1,
726
- label="🎡 Pitch Shift",
727
- info="Lower ← β†’ Higher"
728
- )
 
 
 
 
 
 
 
 
 
729
 
730
- voice_type = gr.Radio(
731
- choices=["neutral", "high", "low"],
732
- value="neutral",
733
- label="πŸ—£οΈ Voice Type"
734
- )
 
 
 
 
 
735
 
736
- synthesize_btn = gr.Button(
737
- "πŸ”Š Synthesize Speech",
738
- variant="primary",
739
- size="lg"
740
  )
741
 
742
- with gr.Column(scale=1):
743
- audio_output = gr.Audio(
744
- label="🎧 Generated Speech",
745
- type="numpy"
746
- )
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
747
 
748
- gr.Examples(
749
- examples=[
750
- ["Hello, welcome to Vedes text to speech!"],
751
- ["The quick brown fox jumps over the lazy dog."],
752
- ["How are you doing today?"],
753
- ["This is a test of the speech synthesis system."],
754
- ["Good morning! Nice to meet you."],
755
- ["One, two, three, four, five."],
756
- ["Please say hello to my friend."],
757
- ["What is your name?"],
758
- ],
759
- inputs=text_input,
760
- label="πŸ“š Try These Examples"
761
- )
762
 
763
- gr.Markdown(
764
- """
765
- ---
766
- ### ℹ️ About Vedes TTS
767
-
768
- **How it works:**
769
- 1. **Text Processing** - Converts text to phonemes using pronunciation rules
770
- 2. **Formant Synthesis** - Generates speech using formant frequencies (F1, F2, F3)
771
- 3. **Source-Filter Model** - Combines glottal source with vocal tract filtering
772
-
773
- **Features:**
774
- - πŸ”€ Letter-to-phoneme conversion with common word dictionary
775
- - 🎡 Adjustable pitch and speaking rate
776
- - πŸ—£οΈ Multiple voice types (neutral, high, low pitch)
777
- - ⚑ Real-time synthesis - no neural network required!
778
-
779
- **Supported:** English text with basic punctuation
780
-
781
- ---
782
- *Built with Python, NumPy, SciPy, and Gradio* ❀️
783
- """
784
  )
785
 
786
- # Event handlers
787
  synthesize_btn.click(
788
  fn=synthesize_speech,
789
- inputs=[text_input, speaking_rate, pitch_shift, voice_type],
790
  outputs=audio_output
791
  )
792
 
793
  text_input.submit(
794
  fn=synthesize_speech,
795
- inputs=[text_input, speaking_rate, pitch_shift, voice_type],
796
  outputs=audio_output
797
  )
798
 
799
 
800
  # Launch
 
 
 
801
  if __name__ == "__main__":
802
  demo.launch()
 
 
1
  import gradio as gr
2
+ import numpy as np
3
+ import asyncio
4
+ import edge_tts
5
+ import tempfile
6
+ import os
7
  from scipy.io import wavfile
8
+ from scipy import signal
9
  import io
 
10
 
11
  # ============================================
12
+ # VEDES TTS - Text-to-Speech System
13
  # ============================================
14
 
15
+ print("=" * 50)
16
+ print("πŸŽ™οΈ Initializing Vedes TTS...")
17
+ print("=" * 50)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
18
 
19
+ # Available voices
20
+ VOICES = {
21
+ "Emma (US Female)": "en-US-EmmaNeural",
22
+ "Jenny (US Female)": "en-US-JennyNeural",
23
+ "Aria (US Female)": "en-US-AriaNeural",
24
+ "Guy (US Male)": "en-US-GuyNeural",
25
+ "Eric (US Male)": "en-US-EricNeural",
26
+ "Ryan (UK Male)": "en-GB-RyanNeural",
27
+ "Sonia (UK Female)": "en-GB-SoniaNeural",
28
+ "Natasha (AU Female)": "en-AU-NatashaNeural",
29
+ "William (AU Male)": "en-AU-WilliamNeural",
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
30
  }
31
 
32
+ DEFAULT_VOICE = "en-US-EmmaNeural"
33
+ SAMPLE_RATE = 24000
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
34
 
 
 
 
35
 
36
async def synthesize_async(text, voice, rate, pitch):
    """Run edge-tts and return the path of a temporary MP3 file.

    Args:
        text: Text to speak.
        voice: Edge neural voice identifier (e.g. "en-US-EmmaNeural").
        rate: Speed delta as a whole percentage (10 -> "+10%").
        pitch: Pitch delta in whole Hz (-5 -> "-5Hz").

    Returns:
        Filesystem path of the generated MP3. The file is created with
        delete=False so it survives this call (Gradio serves it from disk);
        the caller/OS is responsible for eventual cleanup.
    """
    # edge-tts wants explicitly signed strings ("+10%", "-5Hz");
    # the "+d" format spec produces the sign for zero/positive values too.
    rate_str = f"{int(rate):+d}%"
    pitch_str = f"{int(pitch):+d}Hz"

    communicate = edge_tts.Communicate(
        text=text,
        voice=voice,
        rate=rate_str,
        pitch=pitch_str,
    )

    # Reserve a unique path on disk, then let edge-tts write the MP3 into it.
    with tempfile.NamedTemporaryFile(delete=False, suffix=".mp3") as handle:
        out_path = handle.name

    await communicate.save(out_path)

    return out_path
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
57
 
58
 
59
def synthesize_speech(text, voice_name, speaking_rate, pitch_shift):
    """
    Main synthesis function.

    Args:
        text: Input text to synthesize (trimmed, capped at 5000 chars).
        voice_name: Display name of the voice; unknown names fall back
            to DEFAULT_VOICE.
        speaking_rate: Speed multiplier (0.5-2.0 from the UI slider),
            converted to a percentage delta for edge-tts (1.0 -> 0%).
        pitch_shift: Pitch slider value (-2.0..+2.0 from the UI),
            scaled by 10 into whole Hz for edge-tts.

    Returns:
        Path to the generated audio file, or None on empty input or failure.
    """
    if not text or not text.strip():
        return None

    text = text.strip()[:5000]  # Limit text length

    # Resolve display name to an edge-tts voice ID.
    voice = VOICES.get(voice_name, DEFAULT_VOICE)

    # Convert speaking rate to a percentage delta: 1.0x -> 0%, 1.5x -> +50%.
    rate = int((speaking_rate - 1.0) * 100)

    # Convert pitch slider units to Hz: +2.0 -> +20Hz.
    pitch = int(pitch_shift * 10)

    try:
        # asyncio.run creates, runs and ALWAYS closes a fresh event loop.
        # The previous new_event_loop/run_until_complete/close sequence
        # leaked the loop (and left it installed as the thread's current
        # loop) whenever synthesize_async raised before loop.close().
        return asyncio.run(synthesize_async(text, voice, rate, pitch))

    except Exception as e:
        print(f"Synthesis error: {e}")
        return None
 
 
 
 
 
 
 
 
 
100
 
 
 
 
101
 
102
def text_analysis(text):
    """Return a small Markdown report with character, word and sentence
    counts plus a rough duration estimate; empty string for empty input."""
    if not text:
        return ""

    char_count = len(text)
    word_count = len(text.split())

    # Treat '!', '?' and '.' uniformly as sentence terminators, then count
    # the non-blank pieces between them.
    normalized = text.replace('!', '.').replace('?', '.')
    sentence_count = sum(1 for piece in normalized.split('.') if piece.strip())

    # Estimate duration assuming an average reading speed of 150 words/minute.
    est_duration = word_count / 150 * 60

    return f"""
πŸ“Š **Text Analysis:**
- Characters: {char_count}
- Words: {word_count}
- Sentences: {sentence_count}
- Estimated Duration: {est_duration:.1f} seconds
"""
125
 
126
 
127
  # ============================================
128
  # GRADIO INTERFACE
129
  # ============================================
130
 
131
# Custom CSS: cap the page width and style the gradient title / subtitle.
custom_css = """
.gradio-container {
    max-width: 900px !important;
}
.title-text {
    text-align: center;
    background: linear-gradient(90deg, #667eea 0%, #764ba2 100%);
    -webkit-background-clip: text;
    -webkit-text-fill-color: transparent;
    font-size: 2.5rem;
    font-weight: bold;
}
.subtitle-text {
    text-align: center;
    color: #666;
}
"""

with gr.Blocks(
    title="Vedes TTS",
    css=custom_css,
    theme=gr.themes.Soft(
        primary_hue="purple",
        secondary_hue="blue",
    )
) as demo:

    # Header
    gr.HTML("""
    <div style="text-align: center; padding: 20px;">
        <h1 class="title-text">πŸŽ™οΈ Vedes TTS</h1>
        <p class="subtitle-text">High-Quality Text-to-Speech Synthesis</p>
    </div>
    """)

    with gr.Tabs():
        # Main TTS Tab: input, voice/rate/pitch controls, output player.
        with gr.TabItem("πŸ”Š Text to Speech"):
            with gr.Row():
                with gr.Column(scale=2):
                    text_input = gr.Textbox(
                        label="πŸ“ Enter Text",
                        placeholder="Type or paste your text here...\n\nExample: Hello! Welcome to Vedes, a high-quality text-to-speech system. I can read any text you provide with natural-sounding speech.",
                        lines=6,
                        max_lines=15
                    )

                    # Live statistics panel, refreshed by text_input.change below.
                    text_stats = gr.Markdown("")

                    with gr.Row():
                        voice_select = gr.Dropdown(
                            choices=list(VOICES.keys()),
                            value="Emma (US Female)",
                            label="πŸ—£οΈ Select Voice",
                            interactive=True
                        )

                    with gr.Row():
                        speaking_rate = gr.Slider(
                            minimum=0.5,
                            maximum=2.0,
                            value=1.0,
                            step=0.1,
                            label="⏱️ Speaking Rate",
                            info="0.5x = Slow, 1.0x = Normal, 2.0x = Fast"
                        )

                        pitch_shift = gr.Slider(
                            minimum=-2.0,
                            maximum=2.0,
                            value=0.0,
                            step=0.1,
                            label="🎡 Pitch Adjustment",
                            info="Adjust voice pitch"
                        )

                    synthesize_btn = gr.Button(
                        "πŸ”Š Generate Speech",
                        variant="primary",
                        size="lg"
                    )

                with gr.Column(scale=1):
                    # filepath type: synthesize_speech returns a path on disk.
                    audio_output = gr.Audio(
                        label="🎧 Generated Speech",
                        type="filepath"
                    )

                    gr.Markdown("""
                    ### πŸ’‘ Tips:
                    - Use punctuation for natural pauses
                    - Add commas for short pauses
                    - Add periods for longer pauses
                    - Use "!" and "?" for expression
                    """)

        # Examples Tab: clickable sample sentences that fill text_input.
        with gr.TabItem("πŸ“š Examples"):
            gr.Markdown("### Click any example to try it:")

            examples = [
                ["Hello! Welcome to Vedes text-to-speech. I hope you're having a wonderful day!"],
                ["The quick brown fox jumps over the lazy dog. This sentence contains every letter of the alphabet."],
                ["In a world where technology advances rapidly, artificial intelligence continues to reshape how we live and work."],
                ["Once upon a time, in a land far away, there lived a wise old wizard who knew the secrets of the universe."],
                ["Breaking news: Scientists have discovered a new species of butterfly in the Amazon rainforest."],
                ["To be, or not to be, that is the question. Whether 'tis nobler in the mind to suffer the slings and arrows of outrageous fortune."],
                ["Good morning! Today's weather forecast predicts sunny skies with a high of 75 degrees Fahrenheit."],
                ["Thank you for using Vedes TTS. We appreciate your interest in our text-to-speech technology!"],
            ]

            gr.Examples(
                examples=examples,
                inputs=text_input,
                label=""
            )

        # Voices Tab: static reference table of the available voices.
        with gr.TabItem("🎭 Voice Gallery"):
            gr.Markdown("""
            ### Available Voices:

            | Voice | Gender | Accent | Best For |
            |-------|--------|--------|----------|
            | Emma | Female | US English | General, Friendly |
            | Jenny | Female | US English | Professional, Clear |
            | Aria | Female | US English | Conversational |
            | Guy | Male | US English | Narration, Calm |
            | Eric | Male | US English | News, Formal |
            | Ryan | Male | UK English | British content |
            | Sonia | Female | UK English | British content |
            | Natasha | Female | AU English | Australian content |
            | William | Male | AU English | Australian content |

            ---

            ### 🎯 Voice Selection Tips:

            - **For storytelling:** Try Emma or Guy
            - **For news/formal:** Try Jenny or Eric
            - **For casual content:** Try Aria
            - **For British accent:** Try Ryan or Sonia
            - **For Australian accent:** Try Natasha or William
            """)

        # About Tab: static documentation for end users.
        with gr.TabItem("ℹ️ About"):
            gr.Markdown("""
            ## πŸŽ™οΈ About Vedes TTS

            **Vedes** is a text-to-speech application that converts written text into natural-sounding speech.

            ### ✨ Features:

            - πŸ—£οΈ **9 High-Quality Voices** - Male and female voices with different accents
            - 🌍 **Multiple Accents** - US, UK, and Australian English
            - ⏱️ **Adjustable Speed** - From 0.5x to 2.0x speaking rate
            - 🎡 **Pitch Control** - Fine-tune the voice pitch
            - πŸ“± **Easy to Use** - Simple, intuitive interface
            - ⚑ **Fast Generation** - Quick audio synthesis

            ### πŸ”§ How It Works:

            1. **Enter Text** - Type or paste your text
            2. **Select Voice** - Choose from 9 available voices
            3. **Adjust Settings** - Modify speed and pitch if needed
            4. **Generate** - Click the button to create speech
            5. **Listen & Download** - Play or save the audio

            ### πŸ“– Best Practices:

            - Use proper punctuation for natural speech rhythm
            - Break long texts into paragraphs
            - Use commas for short pauses, periods for longer ones
            - Add question marks and exclamation points for expression

            ---

            ### πŸ› οΈ Technical Details:

            - **Engine:** Neural TTS
            - **Audio Format:** MP3
            - **Sample Rate:** 24kHz
            - **Max Text Length:** 5000 characters

            ---

            *Built with ❀️ using Python and Gradio*
            """)

    # Footer
    gr.HTML("""
    <div style="text-align: center; padding: 20px; color: #888;">
        <p>Vedes TTS Β© 2024 | Powered by Neural Speech Synthesis</p>
    </div>
    """)

    # Event Handlers
    # Live stats refresh on every edit of the text box.
    text_input.change(
        fn=text_analysis,
        inputs=text_input,
        outputs=text_stats
    )

    synthesize_btn.click(
        fn=synthesize_speech,
        inputs=[text_input, voice_select, speaking_rate, pitch_shift],
        outputs=audio_output
    )

    # Pressing Enter in the textbox triggers the same synthesis path.
    text_input.submit(
        fn=synthesize_speech,
        inputs=[text_input, voice_select, speaking_rate, pitch_shift],
        outputs=audio_output
    )


# Launch
print("βœ… Vedes TTS Ready!")
print("=" * 50)

if __name__ == "__main__":
    demo.launch()