vedaco commited on
Commit
b67fedc
Β·
verified Β·
1 Parent(s): de1505b

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +842 -730
app.py CHANGED
@@ -4,48 +4,144 @@ from scipy import signal
4
  from scipy.io import wavfile
5
  import tempfile
6
  import re
 
 
7
 
8
  # ============================================
9
- # VEDES TTS - CLEAR SPEECH VERSION
10
  # 100% From Scratch - No APIs
11
  # ============================================
12
 
13
  SAMPLE_RATE = 22050
14
 
15
  # ============================================
16
- # PHONEME DATA - OPTIMIZED FOR CLARITY
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
17
  # ============================================
18
 
19
- # Format: F1, F2, F3, duration_ms, amplitude, is_voiced
20
  VOWELS = {
21
- 'IY': (280, 2250, 2890, 150, 1.0, True), # bee
22
- 'IH': (400, 1920, 2550, 120, 0.9, True), # bit
23
- 'EH': (550, 1770, 2490, 130, 0.95, True), # bet
24
- 'AE': (690, 1660, 2490, 140, 1.0, True), # bat
25
- 'AA': (710, 1100, 2540, 150, 1.0, True), # father
26
- 'AO': (590, 880, 2540, 140, 0.95, True), # bought
27
- 'UH': (470, 1100, 2540, 120, 0.9, True), # book
28
- 'UW': (310, 870, 2250, 150, 1.0, True), # boot
29
- 'AH': (640, 1200, 2400, 100, 0.85, True), # but
30
- 'AX': (500, 1500, 2500, 80, 0.7, True), # about (schwa)
31
- 'ER': (500, 1350, 1700, 140, 0.9, True), # bird
32
- 'EY': (500, 1900, 2600, 160, 1.0, True), # bay
33
- 'AY': (700, 1200, 2600, 180, 1.0, True), # buy
34
- 'OY': (500, 900, 2500, 180, 1.0, True), # boy
35
- 'AW': (700, 1100, 2600, 180, 1.0, True), # now
36
- 'OW': (500, 900, 2500, 160, 1.0, True), # go
37
  }
38
 
39
  CONSONANTS = {
40
- # Stops: closure_ms, burst_ms, voice_bar, burst_freq
41
  'P': {'type': 'stop', 'closure': 80, 'burst': 30, 'voiced': False, 'burst_freq': 800, 'amp': 0.6},
42
  'B': {'type': 'stop', 'closure': 50, 'burst': 25, 'voiced': True, 'burst_freq': 800, 'amp': 0.7},
43
  'T': {'type': 'stop', 'closure': 70, 'burst': 30, 'voiced': False, 'burst_freq': 3500, 'amp': 0.7},
44
  'D': {'type': 'stop', 'closure': 40, 'burst': 25, 'voiced': True, 'burst_freq': 3500, 'amp': 0.7},
45
  'K': {'type': 'stop', 'closure': 80, 'burst': 40, 'voiced': False, 'burst_freq': 1500, 'amp': 0.7},
46
  'G': {'type': 'stop', 'closure': 50, 'burst': 30, 'voiced': True, 'burst_freq': 1500, 'amp': 0.7},
47
-
48
- # Fricatives: duration, freq_low, freq_high, voiced
49
  'F': {'type': 'fric', 'dur': 120, 'freq_low': 1500, 'freq_high': 8000, 'voiced': False, 'amp': 0.4},
50
  'V': {'type': 'fric', 'dur': 80, 'freq_low': 1500, 'freq_high': 8000, 'voiced': True, 'amp': 0.5},
51
  'TH': {'type': 'fric', 'dur': 100, 'freq_low': 1400, 'freq_high': 6000, 'voiced': False, 'amp': 0.3},
@@ -55,530 +151,209 @@ CONSONANTS = {
55
  'SH': {'type': 'fric', 'dur': 120, 'freq_low': 2000, 'freq_high': 6000, 'voiced': False, 'amp': 0.5},
56
  'ZH': {'type': 'fric', 'dur': 80, 'freq_low': 2000, 'freq_high': 6000, 'voiced': True, 'amp': 0.5},
57
  'HH': {'type': 'fric', 'dur': 80, 'freq_low': 500, 'freq_high': 2000, 'voiced': False, 'amp': 0.3},
58
-
59
- # Affricates
60
  'CH': {'type': 'affric', 'closure': 60, 'fric': 80, 'freq_low': 2000, 'freq_high': 6000, 'voiced': False, 'amp': 0.6},
61
  'JH': {'type': 'affric', 'closure': 40, 'fric': 60, 'freq_low': 2000, 'freq_high': 6000, 'voiced': True, 'amp': 0.6},
62
-
63
- # Nasals: F1, F2, F3, duration
64
  'M': {'type': 'nasal', 'f1': 280, 'f2': 1000, 'f3': 2200, 'dur': 100, 'amp': 0.8},
65
  'N': {'type': 'nasal', 'f1': 280, 'f2': 1700, 'f3': 2500, 'dur': 90, 'amp': 0.8},
66
  'NG': {'type': 'nasal', 'f1': 300, 'f2': 2000, 'f3': 2700, 'dur': 100, 'amp': 0.8},
67
-
68
- # Liquids
69
  'L': {'type': 'liquid', 'f1': 380, 'f2': 1000, 'f3': 2700, 'dur': 90, 'amp': 0.85},
70
  'R': {'type': 'liquid', 'f1': 350, 'f2': 1300, 'f3': 1700, 'dur': 90, 'amp': 0.85},
71
-
72
- # Glides
73
  'W': {'type': 'glide', 'f1': 300, 'f2': 700, 'f3': 2200, 'dur': 80, 'amp': 0.8},
74
  'Y': {'type': 'glide', 'f1': 280, 'f2': 2200, 'f3': 2900, 'dur': 70, 'amp': 0.8},
75
  }
76
 
77
- # Silence
78
- SILENCE = {
79
- 'SIL': 60, # Short pause between words
80
- 'PAU': 200, # Long pause (punctuation)
81
- }
82
-
83
 
84
  # ============================================
85
- # COMPREHENSIVE PRONUNCIATION DICTIONARY
86
  # ============================================
87
 
88
  DICTIONARY = {
89
- # ===== FUNCTION WORDS =====
90
- 'a': ['AX'],
91
- 'an': ['AE', 'N'],
92
- 'the': ['DH', 'AX'],
93
- 'and': ['AE', 'N', 'D'],
94
- 'or': ['AO', 'R'],
95
- 'but': ['B', 'AH', 'T'],
96
- 'if': ['IH', 'F'],
97
- 'of': ['AH', 'V'],
98
- 'to': ['T', 'UW'],
99
- 'in': ['IH', 'N'],
100
- 'on': ['AA', 'N'],
101
- 'at': ['AE', 'T'],
102
- 'by': ['B', 'AY'],
103
- 'for': ['F', 'AO', 'R'],
104
- 'with': ['W', 'IH', 'TH'],
105
- 'from': ['F', 'R', 'AH', 'M'],
106
- 'up': ['AH', 'P'],
107
- 'out': ['AW', 'T'],
108
- 'as': ['AE', 'Z'],
109
- 'so': ['S', 'OW'],
110
- 'not': ['N', 'AA', 'T'],
111
 
112
- # ===== PRONOUNS =====
113
- 'i': ['AY'],
114
- 'me': ['M', 'IY'],
115
- 'my': ['M', 'AY'],
116
- 'mine': ['M', 'AY', 'N'],
117
- 'you': ['Y', 'UW'],
118
- 'your': ['Y', 'AO', 'R'],
119
- 'yours': ['Y', 'AO', 'R', 'Z'],
120
- 'he': ['HH', 'IY'],
121
- 'him': ['HH', 'IH', 'M'],
122
- 'his': ['HH', 'IH', 'Z'],
123
- 'she': ['SH', 'IY'],
124
- 'her': ['HH', 'ER'],
125
- 'hers': ['HH', 'ER', 'Z'],
126
- 'it': ['IH', 'T'],
127
- 'its': ['IH', 'T', 'S'],
128
- 'we': ['W', 'IY'],
129
- 'us': ['AH', 'S'],
130
- 'our': ['AW', 'ER'],
131
- 'they': ['DH', 'EY'],
132
- 'them': ['DH', 'EH', 'M'],
133
- 'their': ['DH', 'EH', 'R'],
134
- 'this': ['DH', 'IH', 'S'],
135
- 'that': ['DH', 'AE', 'T'],
136
- 'these': ['DH', 'IY', 'Z'],
137
- 'those': ['DH', 'OW', 'Z'],
138
- 'what': ['W', 'AH', 'T'],
139
- 'who': ['HH', 'UW'],
140
- 'where': ['W', 'EH', 'R'],
141
- 'when': ['W', 'EH', 'N'],
142
- 'why': ['W', 'AY'],
143
- 'how': ['HH', 'AW'],
144
- 'which': ['W', 'IH', 'CH'],
145
 
146
- # ===== BE VERBS =====
147
- 'am': ['AE', 'M'],
148
- 'is': ['IH', 'Z'],
149
- 'are': ['AA', 'R'],
150
- 'was': ['W', 'AA', 'Z'],
151
- 'were': ['W', 'ER'],
152
- 'be': ['B', 'IY'],
153
- 'been': ['B', 'IH', 'N'],
154
- 'being': ['B', 'IY', 'IH', 'NG'],
155
 
156
- # ===== HAVE VERBS =====
157
- 'have': ['HH', 'AE', 'V'],
158
- 'has': ['HH', 'AE', 'Z'],
159
- 'had': ['HH', 'AE', 'D'],
160
- 'having': ['HH', 'AE', 'V', 'IH', 'NG'],
161
 
162
- # ===== DO VERBS =====
163
- 'do': ['D', 'UW'],
164
- 'does': ['D', 'AH', 'Z'],
165
- 'did': ['D', 'IH', 'D'],
166
- 'done': ['D', 'AH', 'N'],
167
- 'doing': ['D', 'UW', 'IH', 'NG'],
168
 
169
- # ===== MODAL VERBS =====
170
- 'will': ['W', 'IH', 'L'],
171
- 'would': ['W', 'UH', 'D'],
172
- 'can': ['K', 'AE', 'N'],
173
- 'could': ['K', 'UH', 'D'],
174
- 'should': ['SH', 'UH', 'D'],
175
- 'shall': ['SH', 'AE', 'L'],
176
- 'may': ['M', 'EY'],
177
- 'might': ['M', 'AY', 'T'],
178
- 'must': ['M', 'AH', 'S', 'T'],
179
 
180
- # ===== COMMON VERBS =====
181
- 'go': ['G', 'OW'],
182
- 'goes': ['G', 'OW', 'Z'],
183
- 'going': ['G', 'OW', 'IH', 'NG'],
184
- 'went': ['W', 'EH', 'N', 'T'],
185
- 'gone': ['G', 'AO', 'N'],
186
- 'come': ['K', 'AH', 'M'],
187
- 'comes': ['K', 'AH', 'M', 'Z'],
188
- 'coming': ['K', 'AH', 'M', 'IH', 'NG'],
189
- 'came': ['K', 'EY', 'M'],
190
- 'get': ['G', 'EH', 'T'],
191
- 'gets': ['G', 'EH', 'T', 'S'],
192
- 'getting': ['G', 'EH', 'T', 'IH', 'NG'],
193
- 'got': ['G', 'AA', 'T'],
194
- 'make': ['M', 'EY', 'K'],
195
- 'makes': ['M', 'EY', 'K', 'S'],
196
- 'making': ['M', 'EY', 'K', 'IH', 'NG'],
197
- 'made': ['M', 'EY', 'D'],
198
- 'take': ['T', 'EY', 'K'],
199
- 'takes': ['T', 'EY', 'K', 'S'],
200
- 'taking': ['T', 'EY', 'K', 'IH', 'NG'],
201
- 'took': ['T', 'UH', 'K'],
202
- 'taken': ['T', 'EY', 'K', 'AX', 'N'],
203
- 'see': ['S', 'IY'],
204
- 'sees': ['S', 'IY', 'Z'],
205
- 'seeing': ['S', 'IY', 'IH', 'NG'],
206
- 'saw': ['S', 'AO'],
207
- 'seen': ['S', 'IY', 'N'],
208
- 'say': ['S', 'EY'],
209
- 'says': ['S', 'EH', 'Z'],
210
- 'saying': ['S', 'EY', 'IH', 'NG'],
211
- 'said': ['S', 'EH', 'D'],
212
- 'know': ['N', 'OW'],
213
- 'knows': ['N', 'OW', 'Z'],
214
- 'knowing': ['N', 'OW', 'IH', 'NG'],
215
- 'knew': ['N', 'UW'],
216
- 'known': ['N', 'OW', 'N'],
217
- 'think': ['TH', 'IH', 'NG', 'K'],
218
- 'thinks': ['TH', 'IH', 'NG', 'K', 'S'],
219
- 'thinking': ['TH', 'IH', 'NG', 'K', 'IH', 'NG'],
220
- 'thought': ['TH', 'AO', 'T'],
221
- 'want': ['W', 'AA', 'N', 'T'],
222
- 'wants': ['W', 'AA', 'N', 'T', 'S'],
223
- 'wanted': ['W', 'AA', 'N', 'T', 'IH', 'D'],
224
- 'wanting': ['W', 'AA', 'N', 'T', 'IH', 'NG'],
225
- 'give': ['G', 'IH', 'V'],
226
- 'gives': ['G', 'IH', 'V', 'Z'],
227
- 'giving': ['G', 'IH', 'V', 'IH', 'NG'],
228
- 'gave': ['G', 'EY', 'V'],
229
- 'given': ['G', 'IH', 'V', 'AX', 'N'],
230
- 'use': ['Y', 'UW', 'Z'],
231
- 'uses': ['Y', 'UW', 'Z', 'IH', 'Z'],
232
- 'using': ['Y', 'UW', 'Z', 'IH', 'NG'],
233
- 'used': ['Y', 'UW', 'Z', 'D'],
234
- 'find': ['F', 'AY', 'N', 'D'],
235
- 'finds': ['F', 'AY', 'N', 'D', 'Z'],
236
- 'finding': ['F', 'AY', 'N', 'D', 'IH', 'NG'],
237
- 'found': ['F', 'AW', 'N', 'D'],
238
- 'tell': ['T', 'EH', 'L'],
239
- 'tells': ['T', 'EH', 'L', 'Z'],
240
- 'telling': ['T', 'EH', 'L', 'IH', 'NG'],
241
- 'told': ['T', 'OW', 'L', 'D'],
242
- 'ask': ['AE', 'S', 'K'],
243
- 'asks': ['AE', 'S', 'K', 'S'],
244
- 'asking': ['AE', 'S', 'K', 'IH', 'NG'],
245
- 'asked': ['AE', 'S', 'K', 'T'],
246
- 'work': ['W', 'ER', 'K'],
247
- 'works': ['W', 'ER', 'K', 'S'],
248
- 'working': ['W', 'ER', 'K', 'IH', 'NG'],
249
- 'worked': ['W', 'ER', 'K', 'T'],
250
- 'try': ['T', 'R', 'AY'],
251
- 'tries': ['T', 'R', 'AY', 'Z'],
252
- 'trying': ['T', 'R', 'AY', 'IH', 'NG'],
253
- 'tried': ['T', 'R', 'AY', 'D'],
254
- 'call': ['K', 'AO', 'L'],
255
- 'calls': ['K', 'AO', 'L', 'Z'],
256
- 'calling': ['K', 'AO', 'L', 'IH', 'NG'],
257
- 'called': ['K', 'AO', 'L', 'D'],
258
- 'need': ['N', 'IY', 'D'],
259
- 'needs': ['N', 'IY', 'D', 'Z'],
260
- 'needing': ['N', 'IY', 'D', 'IH', 'NG'],
261
- 'needed': ['N', 'IY', 'D', 'IH', 'D'],
262
- 'feel': ['F', 'IY', 'L'],
263
- 'feels': ['F', 'IY', 'L', 'Z'],
264
- 'feeling': ['F', 'IY', 'L', 'IH', 'NG'],
265
- 'felt': ['F', 'EH', 'L', 'T'],
266
- 'put': ['P', 'UH', 'T'],
267
- 'puts': ['P', 'UH', 'T', 'S'],
268
- 'putting': ['P', 'UH', 'T', 'IH', 'NG'],
269
- 'keep': ['K', 'IY', 'P'],
270
- 'keeps': ['K', 'IY', 'P', 'S'],
271
- 'keeping': ['K', 'IY', 'P', 'IH', 'NG'],
272
- 'kept': ['K', 'EH', 'P', 'T'],
273
- 'let': ['L', 'EH', 'T'],
274
- 'lets': ['L', 'EH', 'T', 'S'],
275
- 'letting': ['L', 'EH', 'T', 'IH', 'NG'],
276
- 'begin': ['B', 'IH', 'G', 'IH', 'N'],
277
- 'begins': ['B', 'IH', 'G', 'IH', 'N', 'Z'],
278
- 'beginning': ['B', 'IH', 'G', 'IH', 'N', 'IH', 'NG'],
279
- 'began': ['B', 'IH', 'G', 'AE', 'N'],
280
- 'seem': ['S', 'IY', 'M'],
281
- 'seems': ['S', 'IY', 'M', 'Z'],
282
- 'seeming': ['S', 'IY', 'M', 'IH', 'NG'],
283
- 'seemed': ['S', 'IY', 'M', 'D'],
284
- 'help': ['HH', 'EH', 'L', 'P'],
285
- 'helps': ['HH', 'EH', 'L', 'P', 'S'],
286
- 'helping': ['HH', 'EH', 'L', 'P', 'IH', 'NG'],
287
- 'helped': ['HH', 'EH', 'L', 'P', 'T'],
288
- 'show': ['SH', 'OW'],
289
- 'shows': ['SH', 'OW', 'Z'],
290
- 'showing': ['SH', 'OW', 'IH', 'NG'],
291
- 'showed': ['SH', 'OW', 'D'],
292
- 'shown': ['SH', 'OW', 'N'],
293
- 'hear': ['HH', 'IY', 'R'],
294
- 'hears': ['HH', 'IY', 'R', 'Z'],
295
- 'hearing': ['HH', 'IY', 'R', 'IH', 'NG'],
296
- 'heard': ['HH', 'ER', 'D'],
297
- 'play': ['P', 'L', 'EY'],
298
- 'plays': ['P', 'L', 'EY', 'Z'],
299
- 'playing': ['P', 'L', 'EY', 'IH', 'NG'],
300
- 'played': ['P', 'L', 'EY', 'D'],
301
- 'run': ['R', 'AH', 'N'],
302
- 'runs': ['R', 'AH', 'N', 'Z'],
303
- 'running': ['R', 'AH', 'N', 'IH', 'NG'],
304
- 'ran': ['R', 'AE', 'N'],
305
- 'move': ['M', 'UW', 'V'],
306
- 'moves': ['M', 'UW', 'V', 'Z'],
307
- 'moving': ['M', 'UW', 'V', 'IH', 'NG'],
308
- 'moved': ['M', 'UW', 'V', 'D'],
309
- 'live': ['L', 'IH', 'V'],
310
- 'lives': ['L', 'IH', 'V', 'Z'],
311
- 'living': ['L', 'IH', 'V', 'IH', 'NG'],
312
- 'lived': ['L', 'IH', 'V', 'D'],
313
  'believe': ['B', 'IH', 'L', 'IY', 'V'],
314
- 'believes': ['B', 'IH', 'L', 'IY', 'V', 'Z'],
315
- 'believed': ['B', 'IH', 'L', 'IY', 'V', 'D'],
316
- 'read': ['R', 'IY', 'D'],
317
- 'reads': ['R', 'IY', 'D', 'Z'],
318
- 'reading': ['R', 'IY', 'D', 'IH', 'NG'],
319
- 'write': ['R', 'AY', 'T'],
320
- 'writes': ['R', 'AY', 'T', 'S'],
321
- 'writing': ['R', 'AY', 'T', 'IH', 'NG'],
322
- 'wrote': ['R', 'OW', 'T'],
323
- 'written': ['R', 'IH', 'T', 'AX', 'N'],
324
- 'speak': ['S', 'P', 'IY', 'K'],
325
- 'speaks': ['S', 'P', 'IY', 'K', 'S'],
326
- 'speaking': ['S', 'P', 'IY', 'K', 'IH', 'NG'],
327
- 'spoke': ['S', 'P', 'OW', 'K'],
328
- 'spoken': ['S', 'P', 'OW', 'K', 'AX', 'N'],
329
- 'learn': ['L', 'ER', 'N'],
330
- 'learns': ['L', 'ER', 'N', 'Z'],
331
- 'learning': ['L', 'ER', 'N', 'IH', 'NG'],
332
- 'learned': ['L', 'ER', 'N', 'D'],
333
- 'like': ['L', 'AY', 'K'],
334
- 'likes': ['L', 'AY', 'K', 'S'],
335
- 'liking': ['L', 'AY', 'K', 'IH', 'NG'],
336
- 'liked': ['L', 'AY', 'K', 'T'],
337
- 'look': ['L', 'UH', 'K'],
338
- 'looks': ['L', 'UH', 'K', 'S'],
339
- 'looking': ['L', 'UH', 'K', 'IH', 'NG'],
340
- 'looked': ['L', 'UH', 'K', 'T'],
341
- 'love': ['L', 'AH', 'V'],
342
- 'loves': ['L', 'AH', 'V', 'Z'],
343
- 'loving': ['L', 'AH', 'V', 'IH', 'NG'],
344
- 'loved': ['L', 'AH', 'V', 'D'],
345
 
346
- # ===== ADJECTIVES =====
347
- 'good': ['G', 'UH', 'D'],
348
- 'better': ['B', 'EH', 'T', 'ER'],
349
- 'best': ['B', 'EH', 'S', 'T'],
350
- 'bad': ['B', 'AE', 'D'],
351
- 'worse': ['W', 'ER', 'S'],
352
- 'worst': ['W', 'ER', 'S', 'T'],
353
- 'new': ['N', 'UW'],
354
- 'old': ['OW', 'L', 'D'],
355
- 'young': ['Y', 'AH', 'NG'],
356
- 'big': ['B', 'IH', 'G'],
357
- 'small': ['S', 'M', 'AO', 'L'],
358
- 'long': ['L', 'AO', 'NG'],
359
- 'short': ['SH', 'AO', 'R', 'T'],
360
- 'high': ['HH', 'AY'],
361
- 'low': ['L', 'OW'],
362
- 'great': ['G', 'R', 'EY', 'T'],
363
- 'little': ['L', 'IH', 'T', 'AX', 'L'],
364
- 'right': ['R', 'AY', 'T'],
365
- 'wrong': ['R', 'AO', 'NG'],
366
- 'first': ['F', 'ER', 'S', 'T'],
367
- 'last': ['L', 'AE', 'S', 'T'],
368
- 'next': ['N', 'EH', 'K', 'S', 'T'],
369
- 'same': ['S', 'EY', 'M'],
370
- 'different': ['D', 'IH', 'F', 'R', 'AX', 'N', 'T'],
371
- 'other': ['AH', 'DH', 'ER'],
372
- 'own': ['OW', 'N'],
373
- 'important': ['IH', 'M', 'P', 'AO', 'R', 'T', 'AX', 'N', 'T'],
374
- 'real': ['R', 'IY', 'L'],
375
- 'sure': ['SH', 'UH', 'R'],
376
- 'true': ['T', 'R', 'UW'],
377
- 'happy': ['HH', 'AE', 'P', 'IY'],
378
- 'nice': ['N', 'AY', 'S'],
379
- 'easy': ['IY', 'Z', 'IY'],
380
- 'hard': ['HH', 'AA', 'R', 'D'],
381
- 'fine': ['F', 'AY', 'N'],
382
- 'clear': ['K', 'L', 'IY', 'R'],
383
- 'free': ['F', 'R', 'IY'],
384
- 'full': ['F', 'UH', 'L'],
385
- 'open': ['OW', 'P', 'AX', 'N'],
386
- 'simple': ['S', 'IH', 'M', 'P', 'AX', 'L'],
387
- 'ready': ['R', 'EH', 'D', 'IY'],
388
- 'able': ['EY', 'B', 'AX', 'L'],
389
- 'possible': ['P', 'AA', 'S', 'AX', 'B', 'AX', 'L'],
390
 
391
- # ===== ADVERBS =====
392
- 'very': ['V', 'EH', 'R', 'IY'],
393
- 'really': ['R', 'IY', 'L', 'IY'],
394
- 'just': ['JH', 'AH', 'S', 'T'],
395
- 'only': ['OW', 'N', 'L', 'IY'],
396
- 'also': ['AO', 'L', 'S', 'OW'],
397
- 'well': ['W', 'EH', 'L'],
398
- 'now': ['N', 'AW'],
399
- 'then': ['DH', 'EH', 'N'],
400
- 'here': ['HH', 'IY', 'R'],
401
- 'there': ['DH', 'EH', 'R'],
402
- 'still': ['S', 'T', 'IH', 'L'],
403
- 'even': ['IY', 'V', 'AX', 'N'],
404
- 'back': ['B', 'AE', 'K'],
405
- 'again': ['AX', 'G', 'EH', 'N'],
406
- 'always': ['AO', 'L', 'W', 'EY', 'Z'],
407
- 'never': ['N', 'EH', 'V', 'ER'],
408
- 'ever': ['EH', 'V', 'ER'],
409
- 'often': ['AO', 'F', 'AX', 'N'],
410
- 'sometimes': ['S', 'AH', 'M', 'T', 'AY', 'M', 'Z'],
411
- 'today': ['T', 'AX', 'D', 'EY'],
412
- 'maybe': ['M', 'EY', 'B', 'IY'],
413
- 'too': ['T', 'UW'],
414
- 'much': ['M', 'AH', 'CH'],
415
- 'more': ['M', 'AO', 'R'],
416
- 'most': ['M', 'OW', 'S', 'T'],
417
- 'less': ['L', 'EH', 'S'],
418
- 'away': ['AX', 'W', 'EY'],
419
- 'together': ['T', 'AX', 'G', 'EH', 'DH', 'ER'],
420
 
421
- # ===== NOUNS =====
422
- 'time': ['T', 'AY', 'M'],
423
- 'year': ['Y', 'IY', 'R'],
424
- 'day': ['D', 'EY'],
425
- 'way': ['W', 'EY'],
426
- 'man': ['M', 'AE', 'N'],
427
- 'men': ['M', 'EH', 'N'],
428
- 'woman': ['W', 'UH', 'M', 'AX', 'N'],
429
- 'women': ['W', 'IH', 'M', 'IH', 'N'],
430
- 'child': ['CH', 'AY', 'L', 'D'],
431
- 'children': ['CH', 'IH', 'L', 'D', 'R', 'AX', 'N'],
432
- 'world': ['W', 'ER', 'L', 'D'],
433
- 'life': ['L', 'AY', 'F'],
434
- 'hand': ['HH', 'AE', 'N', 'D'],
435
- 'part': ['P', 'AA', 'R', 'T'],
436
- 'place': ['P', 'L', 'EY', 'S'],
437
- 'thing': ['TH', 'IH', 'NG'],
438
- 'things': ['TH', 'IH', 'NG', 'Z'],
439
- 'people': ['P', 'IY', 'P', 'AX', 'L'],
440
- 'person': ['P', 'ER', 'S', 'AX', 'N'],
441
- 'home': ['HH', 'OW', 'M'],
442
- 'house': ['HH', 'AW', 'S'],
443
- 'room': ['R', 'UW', 'M'],
444
- 'word': ['W', 'ER', 'D'],
445
- 'words': ['W', 'ER', 'D', 'Z'],
446
- 'name': ['N', 'EY', 'M'],
447
- 'number': ['N', 'AH', 'M', 'B', 'ER'],
448
- 'water': ['W', 'AO', 'T', 'ER'],
449
- 'money': ['M', 'AH', 'N', 'IY'],
450
  'family': ['F', 'AE', 'M', 'AX', 'L', 'IY'],
451
- 'friend': ['F', 'R', 'EH', 'N', 'D'],
452
- 'friends': ['F', 'R', 'EH', 'N', 'D', 'Z'],
453
- 'mother': ['M', 'AH', 'DH', 'ER'],
454
- 'father': ['F', 'AA', 'DH', 'ER'],
455
- 'boy': ['B', 'OY'],
456
- 'girl': ['G', 'ER', 'L'],
457
- 'head': ['HH', 'EH', 'D'],
458
- 'face': ['F', 'EY', 'S'],
459
- 'eye': ['AY'],
460
- 'eyes': ['AY', 'Z'],
461
- 'body': ['B', 'AA', 'D', 'IY'],
462
- 'heart': ['HH', 'AA', 'R', 'T'],
463
- 'mind': ['M', 'AY', 'N', 'D'],
464
- 'voice': ['V', 'OY', 'S'],
465
- 'night': ['N', 'AY', 'T'],
466
  'morning': ['M', 'AO', 'R', 'N', 'IH', 'NG'],
467
- 'week': ['W', 'IY', 'K'],
468
- 'month': ['M', 'AH', 'N', 'TH'],
469
- 'hour': ['AW', 'ER'],
470
- 'minute': ['M', 'IH', 'N', 'IH', 'T'],
471
- 'second': ['S', 'EH', 'K', 'AX', 'N', 'D'],
472
- 'school': ['S', 'K', 'UW', 'L'],
473
- 'book': ['B', 'UH', 'K'],
474
  'story': ['S', 'T', 'AO', 'R', 'IY'],
475
  'question': ['K', 'W', 'EH', 'S', 'CH', 'AX', 'N'],
476
  'answer': ['AE', 'N', 'S', 'ER'],
477
- 'problem': ['P', 'R', 'AA', 'B', 'L', 'AX', 'M'],
478
- 'idea': ['AY', 'D', 'IY', 'AX'],
479
- 'fact': ['F', 'AE', 'K', 'T'],
480
- 'reason': ['R', 'IY', 'Z', 'AX', 'N'],
481
- 'example': ['IH', 'G', 'Z', 'AE', 'M', 'P', 'AX', 'L'],
482
- 'point': ['P', 'OY', 'N', 'T'],
483
- 'end': ['EH', 'N', 'D'],
484
- 'side': ['S', 'AY', 'D'],
485
- 'kind': ['K', 'AY', 'N', 'D'],
486
- 'case': ['K', 'EY', 'S'],
487
- 'line': ['L', 'AY', 'N'],
488
- 'car': ['K', 'AA', 'R'],
489
- 'city': ['S', 'IH', 'T', 'IY'],
490
- 'country': ['K', 'AH', 'N', 'T', 'R', 'IY'],
491
- 'door': ['D', 'AO', 'R'],
492
- 'job': ['JH', 'AA', 'B'],
493
- 'team': ['T', 'IY', 'M'],
494
- 'game': ['G', 'EY', 'M'],
495
- 'food': ['F', 'UW', 'D'],
496
- 'music': ['M', 'Y', 'UW', 'Z', 'IH', 'K'],
497
- 'art': ['AA', 'R', 'T'],
498
 
499
- # ===== NUMBERS =====
500
- 'zero': ['Z', 'IY', 'R', 'OW'],
501
- 'one': ['W', 'AH', 'N'],
502
- 'two': ['T', 'UW'],
503
- 'three': ['TH', 'R', 'IY'],
504
- 'four': ['F', 'AO', 'R'],
505
- 'five': ['F', 'AY', 'V'],
506
- 'six': ['S', 'IH', 'K', 'S'],
507
- 'seven': ['S', 'EH', 'V', 'AX', 'N'],
508
- 'eight': ['EY', 'T'],
509
- 'nine': ['N', 'AY', 'N'],
510
- 'ten': ['T', 'EH', 'N'],
511
- 'hundred': ['HH', 'AH', 'N', 'D', 'R', 'AX', 'D'],
512
- 'thousand': ['TH', 'AW', 'Z', 'AX', 'N', 'D'],
513
 
514
- # ===== GREETINGS & EXPRESSIONS =====
515
- 'hello': ['HH', 'AX', 'L', 'OW'],
516
- 'hi': ['HH', 'AY'],
517
- 'hey': ['HH', 'EY'],
518
- 'goodbye': ['G', 'UH', 'D', 'B', 'AY'],
519
- 'bye': ['B', 'AY'],
520
- 'welcome': ['W', 'EH', 'L', 'K', 'AX', 'M'],
521
- 'please': ['P', 'L', 'IY', 'Z'],
522
- 'thank': ['TH', 'AE', 'NG', 'K'],
523
- 'thanks': ['TH', 'AE', 'NG', 'K', 'S'],
524
  'sorry': ['S', 'AA', 'R', 'IY'],
525
- 'yes': ['Y', 'EH', 'S'],
526
- 'yeah': ['Y', 'AE'],
527
- 'no': ['N', 'OW'],
528
- 'ok': ['OW', 'K', 'EY'],
529
- 'okay': ['OW', 'K', 'EY'],
530
 
531
- # ===== TECH & TTS =====
532
- 'text': ['T', 'EH', 'K', 'S', 'T'],
533
- 'speech': ['S', 'P', 'IY', 'CH'],
534
- 'sound': ['S', 'AW', 'N', 'D'],
535
- 'audio': ['AO', 'D', 'IY', 'OW'],
536
- 'voice': ['V', 'OY', 'S'],
537
- 'test': ['T', 'EH', 'S', 'T'],
538
- 'testing': ['T', 'EH', 'S', 'T', 'IH', 'NG'],
539
  'computer': ['K', 'AX', 'M', 'P', 'Y', 'UW', 'T', 'ER'],
540
  'vedes': ['V', 'EY', 'D', 'EH', 'S'],
541
  'system': ['S', 'IH', 'S', 'T', 'AX', 'M'],
 
542
  }
543
 
544
- # Letter patterns for unknown words
545
  PATTERNS = [
546
- ('tion', ['SH', 'AX', 'N']),
547
- ('sion', ['ZH', 'AX', 'N']),
548
- ('ness', ['N', 'AX', 'S']),
549
- ('ment', ['M', 'AX', 'N', 'T']),
550
- ('able', ['AX', 'B', 'AX', 'L']),
551
- ('ible', ['AX', 'B', 'AX', 'L']),
552
- ('ful', ['F', 'AX', 'L']),
553
- ('less', ['L', 'AX', 'S']),
554
- ('ing', ['IH', 'NG']),
555
- ('ight', ['AY', 'T']),
556
- ('ough', ['AO']),
557
- ('ould', ['UH', 'D']),
558
- ('th', ['TH']),
559
- ('sh', ['SH']),
560
- ('ch', ['CH']),
561
- ('wh', ['W']),
562
- ('ph', ['F']),
563
- ('ck', ['K']),
564
- ('ng', ['NG']),
565
- ('qu', ['K', 'W']),
566
- ('ee', ['IY']),
567
- ('ea', ['IY']),
568
- ('oo', ['UW']),
569
- ('ou', ['AW']),
570
- ('ow', ['OW']),
571
- ('ai', ['EY']),
572
- ('ay', ['EY']),
573
- ('ey', ['IY']),
574
- ('oy', ['OY']),
575
- ('oi', ['OY']),
576
- ('ie', ['IY']),
577
- ('er', ['ER']),
578
- ('ir', ['ER']),
579
- ('ur', ['ER']),
580
- ('ar', ['AA', 'R']),
581
- ('or', ['AO', 'R']),
582
  ]
583
 
584
  LETTERS = {
@@ -590,6 +365,155 @@ LETTERS = {
590
  }
591
 
592
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
593
  # ============================================
594
  # TEXT TO PHONEME CONVERTER
595
  # ============================================
@@ -645,26 +569,42 @@ class TextToPhoneme:
645
 
646
 
647
  # ============================================
648
- # IMPROVED FORMANT SYNTHESIZER
649
  # ============================================
650
 
651
- class FormantSynthesizer:
652
  def __init__(self, sample_rate=22050):
653
  self.sr = sample_rate
654
- self.f0 = 130 # Base pitch
655
 
656
- def synthesize(self, phonemes, rate=1.0, pitch=1.0):
657
  if not phonemes:
658
  return np.zeros(int(self.sr * 0.5), dtype=np.float32)
659
 
660
- f0 = self.f0 * pitch
 
 
 
 
 
 
 
 
 
661
  segments = []
662
 
663
  for i, phon in enumerate(phonemes):
664
  prev_phon = phonemes[i - 1] if i > 0 else None
665
  next_phon = phonemes[i + 1] if i < len(phonemes) - 1 else None
666
 
667
- seg = self._synth_phoneme(phon, f0, rate, prev_phon, next_phon)
 
 
 
 
 
 
 
668
  segments.append(seg)
669
 
670
  audio = self._smooth_concat(segments)
@@ -672,135 +612,104 @@ class FormantSynthesizer:
672
 
673
  return audio.astype(np.float32)
674
 
675
- def _synth_phoneme(self, phon, f0, rate, prev_phon, next_phon):
676
- # Handle silence
677
  if phon in SILENCE:
678
- dur = int(self.sr * SILENCE[phon] / 1000 / rate)
679
  return np.zeros(dur, dtype=np.float32)
680
 
681
- # Handle vowels
682
  if phon in VOWELS:
683
- return self._synth_vowel(phon, f0, rate, prev_phon, next_phon)
 
684
 
685
- # Handle consonants
686
  if phon in CONSONANTS:
687
- return self._synth_consonant(phon, f0, rate)
688
 
689
  return np.zeros(100, dtype=np.float32)
690
 
691
- def _synth_vowel(self, phon, f0, rate, prev_phon, next_phon):
692
- """Synthesize vowel with formant transitions"""
693
  params = VOWELS[phon]
694
  f1, f2, f3, dur_ms, amp, voiced = params
695
 
696
- dur_ms = dur_ms / rate
 
 
 
 
 
 
 
 
 
697
  n = int(self.sr * dur_ms / 1000)
698
  n = max(n, 100)
699
  t = np.arange(n) / self.sr
700
 
701
- # Generate glottal source
702
- source = self._glottal_source(t, f0)
703
 
704
- # Apply formants with transitions
705
- audio = self._apply_formants_smooth(source, f1, f2, f3, prev_phon, next_phon)
706
 
707
- # Apply amplitude envelope
708
  envelope = self._vowel_envelope(n)
709
  audio = audio * envelope * amp
710
 
711
  return audio
712
 
713
- def _synth_consonant(self, phon, f0, rate):
714
- """Synthesize consonant"""
715
  params = CONSONANTS[phon]
716
  ctype = params['type']
717
 
718
  if ctype == 'stop':
719
- return self._synth_stop(phon, params, f0, rate)
720
  elif ctype == 'fric':
721
- return self._synth_fricative(phon, params, f0, rate)
722
  elif ctype == 'affric':
723
- return self._synth_affricate(phon, params, f0, rate)
724
  elif ctype == 'nasal':
725
- return self._synth_nasal(phon, params, f0, rate)
726
  elif ctype == 'liquid':
727
- return self._synth_liquid(phon, params, f0, rate)
728
  elif ctype == 'glide':
729
- return self._synth_glide(phon, params, f0, rate)
730
 
731
  return np.zeros(100, dtype=np.float32)
732
 
733
- def _glottal_source(self, t, f0):
734
- """Generate glottal pulse train"""
735
- # Use Rosenberg C model
736
  T0 = 1.0 / f0
737
  phase = (t % T0) / T0
738
 
739
- # Open phase (40%)
740
  glottal = np.zeros_like(t)
741
  mask1 = phase < 0.4
742
  glottal[mask1] = 0.5 * (1 - np.cos(np.pi * phase[mask1] / 0.4))
743
 
744
- # Closing phase (20%)
745
  mask2 = (phase >= 0.4) & (phase < 0.6)
746
  glottal[mask2] = np.cos(np.pi * (phase[mask2] - 0.4) / 0.4)
747
 
748
  # Add breathiness
749
- glottal += np.random.randn(len(t)) * 0.03
750
 
751
- # Add shimmer (amplitude variation)
752
  shimmer = 1 + 0.02 * np.sin(2 * np.pi * 5 * t)
753
  glottal *= shimmer
754
 
755
  return glottal
756
 
757
- def _apply_formants_smooth(self, source, f1, f2, f3, prev_phon, next_phon):
758
- """Apply formant filtering with smooth transitions"""
759
- n = len(source)
760
-
761
- # Get target formants
762
  formants = [(f1, 90), (f2, 110), (f3, 130)]
763
-
764
- # Get transition formants from neighbors
765
- if prev_phon and prev_phon in VOWELS:
766
- pf1, pf2, pf3 = VOWELS[prev_phon][0:3]
767
- else:
768
- pf1, pf2, pf3 = f1, f2, f3
769
-
770
- if next_phon and next_phon in VOWELS:
771
- nf1, nf2, nf3 = VOWELS[next_phon][0:3]
772
- else:
773
- nf1, nf2, nf3 = f1, f2, f3
774
-
775
  result = np.zeros_like(source)
776
- trans_len = min(n // 4, 500)
777
-
778
- # Process each formant
779
- for i, (freq, bw) in enumerate(formants):
780
- if i == 0:
781
- pf, nf = pf1, nf1
782
- elif i == 1:
783
- pf, nf = pf2, nf2
784
- else:
785
- pf, nf = pf3, nf3
786
-
787
- # Create frequency trajectory
788
- freq_traj = np.ones(n) * freq
789
- freq_traj[:trans_len] = np.linspace(pf * 0.7 + freq * 0.3, freq, trans_len)
790
- freq_traj[-trans_len:] = np.linspace(freq, nf * 0.7 + freq * 0.3, trans_len)
791
-
792
- # Apply time-varying filter (simplified)
793
- filtered = self._resonator_fixed(source, freq, bw)
794
- result += filtered * (1.0 / (i + 1))
795
 
796
  return result
797
 
798
- def _resonator_fixed(self, sig, freq, bw):
799
- """Fixed frequency resonator"""
800
  if freq <= 0 or freq >= self.sr / 2:
801
  return sig
802
 
803
- # Calculate coefficients
804
  r = np.exp(-np.pi * bw / self.sr)
805
  theta = 2 * np.pi * freq / self.sr
806
 
@@ -808,7 +717,6 @@ class FormantSynthesizer:
808
  a2 = r * r
809
  b0 = 1 - r
810
 
811
- # Apply IIR filter
812
  y = np.zeros_like(sig)
813
  for i in range(2, len(sig)):
814
  y[i] = b0 * sig[i] - a1 * y[i-1] - a2 * y[i-2]
@@ -816,23 +724,28 @@ class FormantSynthesizer:
816
  return y
817
 
818
  def _vowel_envelope(self, n):
819
- """Create smooth vowel envelope"""
820
  env = np.ones(n)
821
-
822
- # Attack (10%)
823
  attack = max(1, n // 10)
824
- env[:attack] = np.sin(np.linspace(0, np.pi/2, attack)) ** 2
825
-
826
- # Release (15%)
827
  release = max(1, int(n * 0.15))
 
 
828
  env[-release:] = np.cos(np.linspace(0, np.pi/2, release)) ** 2
829
 
830
  return env
831
 
832
- def _synth_stop(self, phon, params, f0, rate):
833
- """Synthesize stop consonant"""
834
- closure_ms = params['closure'] / rate
835
- burst_ms = params['burst'] / rate
 
 
 
 
 
 
 
 
 
836
 
837
  closure_n = int(self.sr * closure_ms / 1000)
838
  burst_n = int(self.sr * burst_ms / 1000)
@@ -840,17 +753,14 @@ class FormantSynthesizer:
840
 
841
  audio = np.zeros(total_n, dtype=np.float32)
842
 
843
- # Voice bar for voiced stops
844
  if params['voiced']:
845
  t = np.arange(closure_n) / self.sr
846
  voice_bar = np.sin(2 * np.pi * f0 * 0.8 * t) * 0.15
847
  audio[:closure_n] = voice_bar
848
 
849
- # Burst
850
  burst = np.random.randn(burst_n)
 
851
 
852
- # Filter burst
853
- burst_freq = params['burst_freq']
854
  try:
855
  if burst_freq < self.sr / 2 - 100:
856
  b, a = signal.butter(2, burst_freq / (self.sr / 2), 'low')
@@ -858,7 +768,6 @@ class FormantSynthesizer:
858
  except:
859
  pass
860
 
861
- # Burst envelope
862
  burst_env = np.exp(-np.linspace(0, 5, burst_n))
863
  burst *= burst_env * params['amp']
864
 
@@ -866,15 +775,12 @@ class FormantSynthesizer:
866
 
867
  return audio
868
 
869
- def _synth_fricative(self, phon, params, f0, rate):
870
- """Synthesize fricative consonant"""
871
- dur_ms = params['dur'] / rate
872
  n = int(self.sr * dur_ms / 1000)
873
 
874
- # Generate noise
875
  noise = np.random.randn(n)
876
 
877
- # Bandpass filter
878
  low = params['freq_low']
879
  high = min(params['freq_high'], self.sr / 2 - 100)
880
 
@@ -887,34 +793,28 @@ class FormantSynthesizer:
887
 
888
  audio = noise * params['amp']
889
 
890
- # Add voicing for voiced fricatives
891
  if params['voiced']:
892
  t = np.arange(n) / self.sr
893
- voice = self._glottal_source(t, f0) * 0.3
894
  audio = audio + voice
895
 
896
- # Apply envelope
897
- env = self._consonant_envelope(n)
898
- audio *= env
899
 
900
  return audio.astype(np.float32)
901
 
902
- def _synth_affricate(self, phon, params, f0, rate):
903
- """Synthesize affricate"""
904
- closure_ms = params['closure'] / rate
905
- fric_ms = params['fric'] / rate
906
 
907
  closure_n = int(self.sr * closure_ms / 1000)
908
  fric_n = int(self.sr * fric_ms / 1000)
909
 
910
  audio = np.zeros(closure_n + fric_n, dtype=np.float32)
911
 
912
- # Closure (silence or voice bar)
913
  if params['voiced']:
914
  t = np.arange(closure_n) / self.sr
915
  audio[:closure_n] = np.sin(2 * np.pi * f0 * 0.8 * t) * 0.1
916
 
917
- # Frication
918
  fric = np.random.randn(fric_n)
919
  low = params['freq_low']
920
  high = min(params['freq_high'], self.sr / 2 - 100)
@@ -927,7 +827,6 @@ class FormantSynthesizer:
927
 
928
  fric *= params['amp']
929
 
930
- # Envelope
931
  fric_env = np.ones(fric_n)
932
  attack = fric_n // 6
933
  release = fric_n // 3
@@ -938,32 +837,22 @@ class FormantSynthesizer:
938
 
939
  return audio
940
 
941
- def _synth_nasal(self, phon, params, f0, rate):
942
- """Synthesize nasal consonant"""
943
- dur_ms = params['dur'] / rate
944
  n = int(self.sr * dur_ms / 1000)
945
  t = np.arange(n) / self.sr
946
 
947
- # Generate voiced source
948
- source = self._glottal_source(t, f0)
949
 
950
- # Apply nasal formants
951
- audio = np.zeros_like(source)
 
952
 
953
- formants = [
954
- (params['f1'], 80),
955
- (params['f2'], 100),
956
- (params['f3'], 120),
957
- ]
958
 
959
- for freq, bw in formants:
960
- audio += self._resonator_fixed(source, freq, bw)
961
-
962
- # Add low nasal resonance
963
- nasal_pole = self._resonator_fixed(source, 250, 100) * 0.4
964
  audio += nasal_pole
965
 
966
- # Anti-resonance (nasal zero)
967
  try:
968
  b, a = signal.butter(2, 800 / (self.sr / 2), 'low')
969
  audio = signal.filtfilt(b, a, audio)
@@ -974,72 +863,46 @@ class FormantSynthesizer:
974
 
975
  return audio.astype(np.float32)
976
 
977
- def _synth_liquid(self, phon, params, f0, rate):
978
- """Synthesize liquid (L, R)"""
979
- dur_ms = params['dur'] / rate
980
  n = int(self.sr * dur_ms / 1000)
981
  t = np.arange(n) / self.sr
982
 
983
- source = self._glottal_source(t, f0)
984
 
985
- audio = np.zeros_like(source)
986
- formants = [
987
- (params['f1'], 70),
988
- (params['f2'], 90),
989
- (params['f3'], 110),
990
- ]
991
-
992
- for freq, bw in formants:
993
- audio += self._resonator_fixed(source, freq, bw)
994
 
 
995
  audio *= params['amp'] * self._consonant_envelope(n)
996
 
997
  return audio.astype(np.float32)
998
 
999
- def _synth_glide(self, phon, params, f0, rate):
1000
- """Synthesize glide (W, Y)"""
1001
- dur_ms = params['dur'] / rate
1002
  n = int(self.sr * dur_ms / 1000)
1003
  t = np.arange(n) / self.sr
1004
 
1005
- source = self._glottal_source(t, f0)
1006
-
1007
- # Formant transitions
1008
- audio = np.zeros_like(source)
1009
 
1010
- f1_start, f1_end = params['f1'], params['f1'] * 1.5
1011
- f2_start, f2_end = params['f2'], params['f2'] * 1.3
1012
-
1013
- # Time-varying formants (simplified)
1014
- for i, (freq, bw) in enumerate([(params['f1'], 70), (params['f2'], 90), (params['f3'], 110)]):
1015
- audio += self._resonator_fixed(source, freq, bw) / (i + 1)
1016
 
 
1017
  audio *= params['amp'] * self._consonant_envelope(n)
1018
 
1019
  return audio.astype(np.float32)
1020
 
1021
- def _consonant_envelope(self, n):
1022
- """Create consonant envelope"""
1023
- env = np.ones(n)
1024
- attack = max(1, n // 8)
1025
- release = max(1, n // 6)
1026
-
1027
- env[:attack] = np.linspace(0.1, 1, attack)
1028
- env[-release:] = np.linspace(1, 0.1, release)
1029
-
1030
- return env
1031
-
1032
  def _smooth_concat(self, segments):
1033
- """Concatenate with crossfade"""
1034
  if not segments:
1035
  return np.zeros(1000, dtype=np.float32)
1036
 
1037
  if len(segments) == 1:
1038
  return segments[0]
1039
 
1040
- # Calculate overlap
1041
  overlap = 64
1042
-
1043
  total_len = sum(len(s) for s in segments) - overlap * (len(segments) - 1)
1044
  total_len = max(total_len, 100)
1045
 
@@ -1059,7 +922,6 @@ class FormantSynthesizer:
1059
  seg_to_add = seg[:seg_len]
1060
 
1061
  if i > 0 and pos > overlap:
1062
- # Crossfade
1063
  fade_len = min(overlap, seg_len)
1064
  fade_in = np.linspace(0, 1, fade_len) ** 0.5
1065
  fade_out = np.linspace(1, 0, fade_len) ** 0.5
@@ -1075,19 +937,14 @@ class FormantSynthesizer:
1075
  return audio
1076
 
1077
  def _normalize(self, audio):
1078
- """Normalize and apply final envelope"""
1079
  if len(audio) < 100:
1080
  return audio
1081
 
1082
- # Remove DC
1083
  audio = audio - np.mean(audio)
1084
-
1085
- # Normalize
1086
  max_val = np.max(np.abs(audio))
1087
  if max_val > 0:
1088
  audio = audio / max_val * 0.9
1089
 
1090
- # Final fade
1091
  fade = min(len(audio) // 40, 200)
1092
  audio[:fade] *= np.linspace(0, 1, fade)
1093
  audio[-fade:] *= np.linspace(1, 0, fade)
@@ -1103,20 +960,61 @@ class VedesTTS:
1103
  def __init__(self, sample_rate=22050):
1104
  self.sr = sample_rate
1105
  self.text_to_phoneme = TextToPhoneme()
1106
- self.synthesizer = FormantSynthesizer(sample_rate)
 
 
 
 
 
 
 
 
1107
 
1108
- def speak(self, text, rate=1.0, pitch=1.0):
1109
  if not text or not text.strip():
1110
  return np.zeros(self.sr, dtype=np.float32)
1111
 
 
 
 
1112
  phonemes = self.text_to_phoneme.convert(text)
1113
 
1114
  if not phonemes:
1115
  return np.zeros(self.sr, dtype=np.float32)
1116
 
1117
- audio = self.synthesizer.synthesize(phonemes, rate, pitch)
1118
 
1119
  return audio
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1120
 
1121
 
1122
  # ============================================
@@ -1124,13 +1022,13 @@ class VedesTTS:
1124
  # ============================================
1125
 
1126
  print("=" * 50)
1127
- print("πŸŽ™οΈ VEDES TTS - Clear Speech Version")
1128
- print("100% From Scratch")
1129
  print("=" * 50)
1130
 
1131
  tts = VedesTTS(SAMPLE_RATE)
1132
 
1133
  print("βœ… Ready!")
 
1134
  print("=" * 50)
1135
 
1136
 
@@ -1138,7 +1036,7 @@ print("=" * 50)
1138
  # GRADIO INTERFACE
1139
  # ============================================
1140
 
1141
- def synthesize(text, rate, pitch):
1142
  if not text or not text.strip():
1143
  return None
1144
 
@@ -1146,6 +1044,16 @@ def synthesize(text, rate, pitch):
1146
 
1147
  try:
1148
  pitch_mult = 2 ** (pitch / 12)
 
 
 
 
 
 
 
 
 
 
1149
  audio = tts.speak(text, rate=rate, pitch=pitch_mult)
1150
 
1151
  if len(audio) < 100:
@@ -1161,66 +1069,270 @@ def synthesize(text, rate, pitch):
1161
  return None
1162
 
1163
 
1164
- with gr.Blocks(title="Vedes TTS", theme=gr.themes.Soft()) as demo:
 
 
1165
 
1166
- gr.Markdown("""
1167
- # πŸŽ™οΈ Vedes TTS
1168
- ### Clear Speech Synthesis - 100% From Scratch
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1169
 
1170
- **No APIs. No pre-trained models. Pure Python.**
 
 
 
 
1171
 
1172
- Uses Klatt formant synthesis with:
1173
- - Glottal source modeling
1174
- - Formant transitions for clarity
1175
- - Proper consonant synthesis
1176
  """)
1177
 
1178
- with gr.Row():
1179
- with gr.Column(scale=2):
1180
- text_input = gr.Textbox(
1181
- label="πŸ“ Text to Speak",
1182
- placeholder="Type here... (e.g., Hello, how are you?)",
1183
- lines=3
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1184
  )
 
 
 
 
 
 
 
 
 
 
 
 
 
1185
 
1186
  with gr.Row():
1187
- rate = gr.Slider(0.6, 1.5, 0.9, step=0.1, label="⏱️ Speed")
1188
- pitch = gr.Slider(-4, 4, 0, step=1, label="🎡 Pitch")
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1189
 
1190
- btn = gr.Button("πŸ”Š Speak", variant="primary", size="lg")
1191
-
1192
- with gr.Column(scale=1):
1193
- audio_out = gr.Audio(label="🎧 Audio", type="numpy")
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1194
 
1195
- gr.Examples(
1196
- examples=[
1197
- ["Hello."],
1198
- ["How are you?"],
1199
- ["Good morning."],
1200
- ["Thank you."],
1201
- ["Yes, I can help."],
1202
- ["My name is Vedes."],
1203
- ["What is your name?"],
1204
- ["Have a nice day."],
1205
- ["This is a test."],
1206
- ["I am fine."],
1207
- ],
1208
- inputs=text_input,
1209
- label="πŸ“š Try These"
1210
- )
1211
 
1212
- gr.Markdown("""
1213
- ---
1214
- ### πŸ’‘ Tips for Better Results
 
 
1215
 
1216
- - **Keep sentences short** - 3-6 words works best
1217
- - **Use simple words** - Common words have better pronunciation
1218
- - **Slow speed** - Set rate to 0.7-0.8 for clarity
1219
- - **Add periods** - Creates natural pauses
1220
- """)
 
1221
 
1222
- btn.click(synthesize, [text_input, rate, pitch], audio_out)
1223
- text_input.submit(synthesize, [text_input, rate, pitch], audio_out)
 
 
 
 
 
1224
 
1225
 
1226
  if __name__ == "__main__":
 
4
  from scipy.io import wavfile
5
  import tempfile
6
  import re
7
+ import json
8
+ import os
9
 
10
  # ============================================
11
+ # VEDES TTS - WITH VOICE TRAINING
12
  # 100% From Scratch - No APIs
13
  # ============================================
14
 
15
  SAMPLE_RATE = 22050
16
 
17
  # ============================================
18
+ # VOICE PROFILES - Pre-defined Voices
19
+ # ============================================
20
+
21
+ VOICE_PROFILES = {
22
+ "Emma (Female)": {
23
+ "name": "Emma",
24
+ "gender": "female",
25
+ "f0": 210, # Higher pitch
26
+ "f0_variation": 30,
27
+ "formant_shift": 1.15, # Shift formants up
28
+ "breathiness": 0.04,
29
+ "speed": 1.0,
30
+ "brightness": 1.1,
31
+ "description": "Friendly female voice"
32
+ },
33
+ "James (Male)": {
34
+ "name": "James",
35
+ "gender": "male",
36
+ "f0": 110, # Lower pitch
37
+ "f0_variation": 20,
38
+ "formant_shift": 0.9, # Shift formants down
39
+ "breathiness": 0.02,
40
+ "speed": 0.95,
41
+ "brightness": 0.95,
42
+ "description": "Professional male voice"
43
+ },
44
+ "Sophie (Child)": {
45
+ "name": "Sophie",
46
+ "gender": "child",
47
+ "f0": 280, # High pitch
48
+ "f0_variation": 40,
49
+ "formant_shift": 1.25,
50
+ "breathiness": 0.03,
51
+ "speed": 1.1,
52
+ "brightness": 1.2,
53
+ "description": "Young child voice"
54
+ },
55
+ "David (Deep Male)": {
56
+ "name": "David",
57
+ "gender": "male",
58
+ "f0": 85, # Very deep
59
+ "f0_variation": 15,
60
+ "formant_shift": 0.82,
61
+ "breathiness": 0.02,
62
+ "speed": 0.9,
63
+ "brightness": 0.85,
64
+ "description": "Deep bass voice"
65
+ },
66
+ "Lisa (Bright Female)": {
67
+ "name": "Lisa",
68
+ "gender": "female",
69
+ "f0": 240,
70
+ "f0_variation": 35,
71
+ "formant_shift": 1.2,
72
+ "breathiness": 0.05,
73
+ "speed": 1.05,
74
+ "brightness": 1.15,
75
+ "description": "Bright, energetic female"
76
+ },
77
+ "Robert (Elderly Male)": {
78
+ "name": "Robert",
79
+ "gender": "male",
80
+ "f0": 95,
81
+ "f0_variation": 12,
82
+ "formant_shift": 0.88,
83
+ "breathiness": 0.06,
84
+ "speed": 0.85,
85
+ "brightness": 0.9,
86
+ "description": "Mature elderly voice"
87
+ },
88
+ "Anna (Soft Female)": {
89
+ "name": "Anna",
90
+ "gender": "female",
91
+ "f0": 195,
92
+ "f0_variation": 25,
93
+ "formant_shift": 1.1,
94
+ "breathiness": 0.07,
95
+ "speed": 0.92,
96
+ "brightness": 1.0,
97
+ "description": "Soft, gentle female"
98
+ },
99
+ "Mike (Energetic Male)": {
100
+ "name": "Mike",
101
+ "gender": "male",
102
+ "f0": 130,
103
+ "f0_variation": 30,
104
+ "formant_shift": 0.95,
105
+ "breathiness": 0.02,
106
+ "speed": 1.1,
107
+ "brightness": 1.05,
108
+ "description": "Energetic young male"
109
+ },
110
+ }
111
+
112
+ # Custom voices storage
113
+ custom_voices = {}
114
+
115
+ # ============================================
116
+ # PHONEME DATA
117
  # ============================================
118
 
 
119
  VOWELS = {
120
+ 'IY': (280, 2250, 2890, 150, 1.0, True),
121
+ 'IH': (400, 1920, 2550, 120, 0.9, True),
122
+ 'EH': (550, 1770, 2490, 130, 0.95, True),
123
+ 'AE': (690, 1660, 2490, 140, 1.0, True),
124
+ 'AA': (710, 1100, 2540, 150, 1.0, True),
125
+ 'AO': (590, 880, 2540, 140, 0.95, True),
126
+ 'UH': (470, 1100, 2540, 120, 0.9, True),
127
+ 'UW': (310, 870, 2250, 150, 1.0, True),
128
+ 'AH': (640, 1200, 2400, 100, 0.85, True),
129
+ 'AX': (500, 1500, 2500, 80, 0.7, True),
130
+ 'ER': (500, 1350, 1700, 140, 0.9, True),
131
+ 'EY': (500, 1900, 2600, 160, 1.0, True),
132
+ 'AY': (700, 1200, 2600, 180, 1.0, True),
133
+ 'OY': (500, 900, 2500, 180, 1.0, True),
134
+ 'AW': (700, 1100, 2600, 180, 1.0, True),
135
+ 'OW': (500, 900, 2500, 160, 1.0, True),
136
  }
137
 
138
  CONSONANTS = {
 
139
  'P': {'type': 'stop', 'closure': 80, 'burst': 30, 'voiced': False, 'burst_freq': 800, 'amp': 0.6},
140
  'B': {'type': 'stop', 'closure': 50, 'burst': 25, 'voiced': True, 'burst_freq': 800, 'amp': 0.7},
141
  'T': {'type': 'stop', 'closure': 70, 'burst': 30, 'voiced': False, 'burst_freq': 3500, 'amp': 0.7},
142
  'D': {'type': 'stop', 'closure': 40, 'burst': 25, 'voiced': True, 'burst_freq': 3500, 'amp': 0.7},
143
  'K': {'type': 'stop', 'closure': 80, 'burst': 40, 'voiced': False, 'burst_freq': 1500, 'amp': 0.7},
144
  'G': {'type': 'stop', 'closure': 50, 'burst': 30, 'voiced': True, 'burst_freq': 1500, 'amp': 0.7},
 
 
145
  'F': {'type': 'fric', 'dur': 120, 'freq_low': 1500, 'freq_high': 8000, 'voiced': False, 'amp': 0.4},
146
  'V': {'type': 'fric', 'dur': 80, 'freq_low': 1500, 'freq_high': 8000, 'voiced': True, 'amp': 0.5},
147
  'TH': {'type': 'fric', 'dur': 100, 'freq_low': 1400, 'freq_high': 6000, 'voiced': False, 'amp': 0.3},
 
151
  'SH': {'type': 'fric', 'dur': 120, 'freq_low': 2000, 'freq_high': 6000, 'voiced': False, 'amp': 0.5},
152
  'ZH': {'type': 'fric', 'dur': 80, 'freq_low': 2000, 'freq_high': 6000, 'voiced': True, 'amp': 0.5},
153
  'HH': {'type': 'fric', 'dur': 80, 'freq_low': 500, 'freq_high': 2000, 'voiced': False, 'amp': 0.3},
 
 
154
  'CH': {'type': 'affric', 'closure': 60, 'fric': 80, 'freq_low': 2000, 'freq_high': 6000, 'voiced': False, 'amp': 0.6},
155
  'JH': {'type': 'affric', 'closure': 40, 'fric': 60, 'freq_low': 2000, 'freq_high': 6000, 'voiced': True, 'amp': 0.6},
 
 
156
  'M': {'type': 'nasal', 'f1': 280, 'f2': 1000, 'f3': 2200, 'dur': 100, 'amp': 0.8},
157
  'N': {'type': 'nasal', 'f1': 280, 'f2': 1700, 'f3': 2500, 'dur': 90, 'amp': 0.8},
158
  'NG': {'type': 'nasal', 'f1': 300, 'f2': 2000, 'f3': 2700, 'dur': 100, 'amp': 0.8},
 
 
159
  'L': {'type': 'liquid', 'f1': 380, 'f2': 1000, 'f3': 2700, 'dur': 90, 'amp': 0.85},
160
  'R': {'type': 'liquid', 'f1': 350, 'f2': 1300, 'f3': 1700, 'dur': 90, 'amp': 0.85},
 
 
161
  'W': {'type': 'glide', 'f1': 300, 'f2': 700, 'f3': 2200, 'dur': 80, 'amp': 0.8},
162
  'Y': {'type': 'glide', 'f1': 280, 'f2': 2200, 'f3': 2900, 'dur': 70, 'amp': 0.8},
163
  }
164
 
165
+ SILENCE = {'SIL': 60, 'PAU': 200}
 
 
 
 
 
166
 
167
  # ============================================
168
+ # PRONUNCIATION DICTIONARY
169
  # ============================================
170
 
171
  DICTIONARY = {
172
+ # Function words
173
+ 'a': ['AX'], 'an': ['AE', 'N'], 'the': ['DH', 'AX'],
174
+ 'and': ['AE', 'N', 'D'], 'or': ['AO', 'R'], 'but': ['B', 'AH', 'T'],
175
+ 'if': ['IH', 'F'], 'of': ['AH', 'V'], 'to': ['T', 'UW'],
176
+ 'in': ['IH', 'N'], 'on': ['AA', 'N'], 'at': ['AE', 'T'],
177
+ 'by': ['B', 'AY'], 'for': ['F', 'AO', 'R'], 'with': ['W', 'IH', 'TH'],
178
+ 'from': ['F', 'R', 'AH', 'M'], 'up': ['AH', 'P'], 'out': ['AW', 'T'],
179
+ 'as': ['AE', 'Z'], 'so': ['S', 'OW'], 'not': ['N', 'AA', 'T'],
 
 
 
 
 
 
 
 
 
 
 
 
 
 
180
 
181
+ # Pronouns
182
+ 'i': ['AY'], 'me': ['M', 'IY'], 'my': ['M', 'AY'],
183
+ 'you': ['Y', 'UW'], 'your': ['Y', 'AO', 'R'],
184
+ 'he': ['HH', 'IY'], 'him': ['HH', 'IH', 'M'], 'his': ['HH', 'IH', 'Z'],
185
+ 'she': ['SH', 'IY'], 'her': ['HH', 'ER'],
186
+ 'it': ['IH', 'T'], 'its': ['IH', 'T', 'S'],
187
+ 'we': ['W', 'IY'], 'us': ['AH', 'S'], 'our': ['AW', 'ER'],
188
+ 'they': ['DH', 'EY'], 'them': ['DH', 'EH', 'M'], 'their': ['DH', 'EH', 'R'],
189
+ 'this': ['DH', 'IH', 'S'], 'that': ['DH', 'AE', 'T'],
190
+ 'what': ['W', 'AH', 'T'], 'who': ['HH', 'UW'],
191
+ 'where': ['W', 'EH', 'R'], 'when': ['W', 'EH', 'N'],
192
+ 'why': ['W', 'AY'], 'how': ['HH', 'AW'], 'which': ['W', 'IH', 'CH'],
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
193
 
194
+ # Be verbs
195
+ 'am': ['AE', 'M'], 'is': ['IH', 'Z'], 'are': ['AA', 'R'],
196
+ 'was': ['W', 'AA', 'Z'], 'were': ['W', 'ER'],
197
+ 'be': ['B', 'IY'], 'been': ['B', 'IH', 'N'], 'being': ['B', 'IY', 'IH', 'NG'],
 
 
 
 
 
198
 
199
+ # Have verbs
200
+ 'have': ['HH', 'AE', 'V'], 'has': ['HH', 'AE', 'Z'],
201
+ 'had': ['HH', 'AE', 'D'], 'having': ['HH', 'AE', 'V', 'IH', 'NG'],
 
 
202
 
203
+ # Do verbs
204
+ 'do': ['D', 'UW'], 'does': ['D', 'AH', 'Z'],
205
+ 'did': ['D', 'IH', 'D'], 'done': ['D', 'AH', 'N'],
 
 
 
206
 
207
+ # Modal verbs
208
+ 'will': ['W', 'IH', 'L'], 'would': ['W', 'UH', 'D'],
209
+ 'can': ['K', 'AE', 'N'], 'could': ['K', 'UH', 'D'],
210
+ 'should': ['SH', 'UH', 'D'], 'may': ['M', 'EY'],
211
+ 'might': ['M', 'AY', 'T'], 'must': ['M', 'AH', 'S', 'T'],
 
 
 
 
 
212
 
213
+ # Common verbs
214
+ 'go': ['G', 'OW'], 'goes': ['G', 'OW', 'Z'], 'going': ['G', 'OW', 'IH', 'NG'],
215
+ 'went': ['W', 'EH', 'N', 'T'], 'gone': ['G', 'AO', 'N'],
216
+ 'come': ['K', 'AH', 'M'], 'comes': ['K', 'AH', 'M', 'Z'],
217
+ 'coming': ['K', 'AH', 'M', 'IH', 'NG'], 'came': ['K', 'EY', 'M'],
218
+ 'get': ['G', 'EH', 'T'], 'gets': ['G', 'EH', 'T', 'S'],
219
+ 'getting': ['G', 'EH', 'T', 'IH', 'NG'], 'got': ['G', 'AA', 'T'],
220
+ 'make': ['M', 'EY', 'K'], 'makes': ['M', 'EY', 'K', 'S'],
221
+ 'making': ['M', 'EY', 'K', 'IH', 'NG'], 'made': ['M', 'EY', 'D'],
222
+ 'take': ['T', 'EY', 'K'], 'takes': ['T', 'EY', 'K', 'S'],
223
+ 'took': ['T', 'UH', 'K'], 'taken': ['T', 'EY', 'K', 'AX', 'N'],
224
+ 'see': ['S', 'IY'], 'sees': ['S', 'IY', 'Z'],
225
+ 'saw': ['S', 'AO'], 'seen': ['S', 'IY', 'N'],
226
+ 'say': ['S', 'EY'], 'says': ['S', 'EH', 'Z'], 'said': ['S', 'EH', 'D'],
227
+ 'know': ['N', 'OW'], 'knows': ['N', 'OW', 'Z'],
228
+ 'knew': ['N', 'UW'], 'known': ['N', 'OW', 'N'],
229
+ 'think': ['TH', 'IH', 'NG', 'K'], 'thought': ['TH', 'AO', 'T'],
230
+ 'want': ['W', 'AA', 'N', 'T'], 'wants': ['W', 'AA', 'N', 'T', 'S'],
231
+ 'give': ['G', 'IH', 'V'], 'gives': ['G', 'IH', 'V', 'Z'],
232
+ 'gave': ['G', 'EY', 'V'], 'given': ['G', 'IH', 'V', 'AX', 'N'],
233
+ 'tell': ['T', 'EH', 'L'], 'told': ['T', 'OW', 'L', 'D'],
234
+ 'ask': ['AE', 'S', 'K'], 'asked': ['AE', 'S', 'K', 'T'],
235
+ 'use': ['Y', 'UW', 'Z'], 'used': ['Y', 'UW', 'Z', 'D'],
236
+ 'find': ['F', 'AY', 'N', 'D'], 'found': ['F', 'AW', 'N', 'D'],
237
+ 'work': ['W', 'ER', 'K'], 'works': ['W', 'ER', 'K', 'S'],
238
+ 'call': ['K', 'AO', 'L'], 'called': ['K', 'AO', 'L', 'D'],
239
+ 'try': ['T', 'R', 'AY'], 'tried': ['T', 'R', 'AY', 'D'],
240
+ 'need': ['N', 'IY', 'D'], 'needs': ['N', 'IY', 'D', 'Z'],
241
+ 'feel': ['F', 'IY', 'L'], 'feels': ['F', 'IY', 'L', 'Z'],
242
+ 'help': ['HH', 'EH', 'L', 'P'], 'helps': ['HH', 'EH', 'L', 'P', 'S'],
243
+ 'keep': ['K', 'IY', 'P'], 'kept': ['K', 'EH', 'P', 'T'],
244
+ 'let': ['L', 'EH', 'T'], 'put': ['P', 'UH', 'T'],
245
+ 'seem': ['S', 'IY', 'M'], 'leave': ['L', 'IY', 'V'],
246
+ 'show': ['SH', 'OW'], 'hear': ['HH', 'IY', 'R'],
247
+ 'play': ['P', 'L', 'EY'], 'run': ['R', 'AH', 'N'],
248
+ 'move': ['M', 'UW', 'V'], 'live': ['L', 'IH', 'V'],
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
249
  'believe': ['B', 'IH', 'L', 'IY', 'V'],
250
+ 'read': ['R', 'IY', 'D'], 'write': ['R', 'AY', 'T'],
251
+ 'learn': ['L', 'ER', 'N'], 'speak': ['S', 'P', 'IY', 'K'],
252
+ 'look': ['L', 'UH', 'K'], 'like': ['L', 'AY', 'K'],
253
+ 'love': ['L', 'AH', 'V'], 'start': ['S', 'T', 'AA', 'R', 'T'],
254
+ 'stop': ['S', 'T', 'AA', 'P'],
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
255
 
256
+ # Adjectives
257
+ 'good': ['G', 'UH', 'D'], 'better': ['B', 'EH', 'T', 'ER'],
258
+ 'best': ['B', 'EH', 'S', 'T'], 'bad': ['B', 'AE', 'D'],
259
+ 'new': ['N', 'UW'], 'old': ['OW', 'L', 'D'],
260
+ 'big': ['B', 'IH', 'G'], 'small': ['S', 'M', 'AO', 'L'],
261
+ 'long': ['L', 'AO', 'NG'], 'short': ['SH', 'AO', 'R', 'T'],
262
+ 'high': ['HH', 'AY'], 'low': ['L', 'OW'],
263
+ 'great': ['G', 'R', 'EY', 'T'], 'little': ['L', 'IH', 'T', 'AX', 'L'],
264
+ 'right': ['R', 'AY', 'T'], 'wrong': ['R', 'AO', 'NG'],
265
+ 'first': ['F', 'ER', 'S', 'T'], 'last': ['L', 'AE', 'S', 'T'],
266
+ 'same': ['S', 'EY', 'M'], 'different': ['D', 'IH', 'F', 'R', 'AX', 'N', 'T'],
267
+ 'own': ['OW', 'N'], 'other': ['AH', 'DH', 'ER'],
268
+ 'nice': ['N', 'AY', 'S'], 'happy': ['HH', 'AE', 'P', 'IY'],
269
+ 'sure': ['SH', 'UH', 'R'], 'true': ['T', 'R', 'UW'],
270
+ 'real': ['R', 'IY', 'L'], 'clear': ['K', 'L', 'IY', 'R'],
271
+ 'fine': ['F', 'AY', 'N'], 'free': ['F', 'R', 'IY'],
272
+ 'easy': ['IY', 'Z', 'IY'], 'hard': ['HH', 'AA', 'R', 'D'],
273
+ 'young': ['Y', 'AH', 'NG'], 'beautiful': ['B', 'Y', 'UW', 'T', 'IH', 'F', 'AX', 'L'],
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
274
 
275
+ # Adverbs
276
+ 'very': ['V', 'EH', 'R', 'IY'], 'really': ['R', 'IY', 'L', 'IY'],
277
+ 'just': ['JH', 'AH', 'S', 'T'], 'only': ['OW', 'N', 'L', 'IY'],
278
+ 'also': ['AO', 'L', 'S', 'OW'], 'well': ['W', 'EH', 'L'],
279
+ 'now': ['N', 'AW'], 'then': ['DH', 'EH', 'N'],
280
+ 'here': ['HH', 'IY', 'R'], 'there': ['DH', 'EH', 'R'],
281
+ 'still': ['S', 'T', 'IH', 'L'], 'even': ['IY', 'V', 'AX', 'N'],
282
+ 'back': ['B', 'AE', 'K'], 'again': ['AX', 'G', 'EH', 'N'],
283
+ 'always': ['AO', 'L', 'W', 'EY', 'Z'], 'never': ['N', 'EH', 'V', 'ER'],
284
+ 'today': ['T', 'AX', 'D', 'EY'], 'maybe': ['M', 'EY', 'B', 'IY'],
285
+ 'too': ['T', 'UW'], 'much': ['M', 'AH', 'CH'],
286
+ 'more': ['M', 'AO', 'R'], 'most': ['M', 'OW', 'S', 'T'],
287
+ 'please': ['P', 'L', 'IY', 'Z'],
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
288
 
289
+ # Nouns
290
+ 'time': ['T', 'AY', 'M'], 'year': ['Y', 'IY', 'R'],
291
+ 'day': ['D', 'EY'], 'way': ['W', 'EY'],
292
+ 'man': ['M', 'AE', 'N'], 'woman': ['W', 'UH', 'M', 'AX', 'N'],
293
+ 'child': ['CH', 'AY', 'L', 'D'], 'world': ['W', 'ER', 'L', 'D'],
294
+ 'life': ['L', 'AY', 'F'], 'hand': ['HH', 'AE', 'N', 'D'],
295
+ 'part': ['P', 'AA', 'R', 'T'], 'place': ['P', 'L', 'EY', 'S'],
296
+ 'thing': ['TH', 'IH', 'NG'], 'things': ['TH', 'IH', 'NG', 'Z'],
297
+ 'people': ['P', 'IY', 'P', 'AX', 'L'], 'person': ['P', 'ER', 'S', 'AX', 'N'],
298
+ 'home': ['HH', 'OW', 'M'], 'house': ['HH', 'AW', 'S'],
299
+ 'word': ['W', 'ER', 'D'], 'name': ['N', 'EY', 'M'],
300
+ 'water': ['W', 'AO', 'T', 'ER'], 'money': ['M', 'AH', 'N', 'IY'],
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
301
  'family': ['F', 'AE', 'M', 'AX', 'L', 'IY'],
302
+ 'friend': ['F', 'R', 'EH', 'N', 'D'], 'friends': ['F', 'R', 'EH', 'N', 'D', 'Z'],
303
+ 'mother': ['M', 'AH', 'DH', 'ER'], 'father': ['F', 'AA', 'DH', 'ER'],
304
+ 'boy': ['B', 'OY'], 'girl': ['G', 'ER', 'L'],
305
+ 'head': ['HH', 'EH', 'D'], 'face': ['F', 'EY', 'S'],
306
+ 'eye': ['AY'], 'eyes': ['AY', 'Z'],
307
+ 'voice': ['V', 'OY', 'S'], 'night': ['N', 'AY', 'T'],
 
 
 
 
 
 
 
 
 
308
  'morning': ['M', 'AO', 'R', 'N', 'IH', 'NG'],
309
+ 'week': ['W', 'IY', 'K'], 'month': ['M', 'AH', 'N', 'TH'],
310
+ 'school': ['S', 'K', 'UW', 'L'], 'book': ['B', 'UH', 'K'],
 
 
 
 
 
311
  'story': ['S', 'T', 'AO', 'R', 'IY'],
312
  'question': ['K', 'W', 'EH', 'S', 'CH', 'AX', 'N'],
313
  'answer': ['AE', 'N', 'S', 'ER'],
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
314
 
315
+ # Numbers
316
+ 'zero': ['Z', 'IY', 'R', 'OW'], 'one': ['W', 'AH', 'N'],
317
+ 'two': ['T', 'UW'], 'three': ['TH', 'R', 'IY'],
318
+ 'four': ['F', 'AO', 'R'], 'five': ['F', 'AY', 'V'],
319
+ 'six': ['S', 'IH', 'K', 'S'], 'seven': ['S', 'EH', 'V', 'AX', 'N'],
320
+ 'eight': ['EY', 'T'], 'nine': ['N', 'AY', 'N'], 'ten': ['T', 'EH', 'N'],
 
 
 
 
 
 
 
 
321
 
322
+ # Greetings
323
+ 'hello': ['HH', 'AX', 'L', 'OW'], 'hi': ['HH', 'AY'],
324
+ 'hey': ['HH', 'EY'], 'goodbye': ['G', 'UH', 'D', 'B', 'AY'],
325
+ 'bye': ['B', 'AY'], 'welcome': ['W', 'EH', 'L', 'K', 'AX', 'M'],
326
+ 'thank': ['TH', 'AE', 'NG', 'K'], 'thanks': ['TH', 'AE', 'NG', 'K', 'S'],
 
 
 
 
 
327
  'sorry': ['S', 'AA', 'R', 'IY'],
328
+ 'yes': ['Y', 'EH', 'S'], 'yeah': ['Y', 'AE'], 'no': ['N', 'OW'],
329
+ 'ok': ['OW', 'K', 'EY'], 'okay': ['OW', 'K', 'EY'],
 
 
 
330
 
331
+ # Tech/TTS
332
+ 'text': ['T', 'EH', 'K', 'S', 'T'], 'speech': ['S', 'P', 'IY', 'CH'],
333
+ 'sound': ['S', 'AW', 'N', 'D'], 'audio': ['AO', 'D', 'IY', 'OW'],
334
+ 'test': ['T', 'EH', 'S', 'T'], 'testing': ['T', 'EH', 'S', 'T', 'IH', 'NG'],
 
 
 
 
335
  'computer': ['K', 'AX', 'M', 'P', 'Y', 'UW', 'T', 'ER'],
336
  'vedes': ['V', 'EY', 'D', 'EH', 'S'],
337
  'system': ['S', 'IH', 'S', 'T', 'AX', 'M'],
338
+ 'train': ['T', 'R', 'EY', 'N'], 'training': ['T', 'R', 'EY', 'N', 'IH', 'NG'],
339
  }
340
 
341
+ # Letter patterns
342
  PATTERNS = [
343
+ ('tion', ['SH', 'AX', 'N']), ('sion', ['ZH', 'AX', 'N']),
344
+ ('ness', ['N', 'AX', 'S']), ('ment', ['M', 'AX', 'N', 'T']),
345
+ ('able', ['AX', 'B', 'AX', 'L']), ('ible', ['AX', 'B', 'AX', 'L']),
346
+ ('ful', ['F', 'AX', 'L']), ('less', ['L', 'AX', 'S']),
347
+ ('ing', ['IH', 'NG']), ('ight', ['AY', 'T']),
348
+ ('ough', ['AO']), ('ould', ['UH', 'D']),
349
+ ('th', ['TH']), ('sh', ['SH']), ('ch', ['CH']),
350
+ ('wh', ['W']), ('ph', ['F']), ('ck', ['K']), ('ng', ['NG']),
351
+ ('qu', ['K', 'W']), ('ee', ['IY']), ('ea', ['IY']),
352
+ ('oo', ['UW']), ('ou', ['AW']), ('ow', ['OW']),
353
+ ('ai', ['EY']), ('ay', ['EY']), ('ey', ['IY']),
354
+ ('oy', ['OY']), ('oi', ['OY']), ('ie', ['IY']),
355
+ ('er', ['ER']), ('ir', ['ER']), ('ur', ['ER']),
356
+ ('ar', ['AA', 'R']), ('or', ['AO', 'R']),
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
357
  ]
358
 
359
  LETTERS = {
 
365
  }
366
 
367
 
368
+ # ============================================
369
+ # VOICE ANALYZER - Extract Voice Features
370
+ # ============================================
371
+
372
+ class VoiceAnalyzer:
373
+ """Analyze audio to extract voice characteristics"""
374
+
375
+ def __init__(self, sample_rate=22050):
376
+ self.sr = sample_rate
377
+
378
+ def analyze(self, audio):
379
+ """Extract voice features from audio sample"""
380
+ if len(audio) < self.sr * 0.5:
381
+ return None
382
+
383
+ # Normalize
384
+ audio = audio.astype(np.float32)
385
+ audio = audio / (np.max(np.abs(audio)) + 1e-8)
386
+
387
+ # Extract features
388
+ f0 = self._estimate_pitch(audio)
389
+ formants = self._estimate_formants(audio)
390
+ breathiness = self._estimate_breathiness(audio)
391
+
392
+ # Create voice profile
393
+ profile = {
394
+ "name": "Custom Voice",
395
+ "gender": "custom",
396
+ "f0": f0,
397
+ "f0_variation": self._estimate_f0_variation(audio, f0),
398
+ "formant_shift": formants.get('shift', 1.0),
399
+ "breathiness": breathiness,
400
+ "speed": 1.0,
401
+ "brightness": formants.get('brightness', 1.0),
402
+ "description": "Voice extracted from audio sample"
403
+ }
404
+
405
+ return profile
406
+
407
+ def _estimate_pitch(self, audio):
408
+ """Estimate fundamental frequency (F0)"""
409
+ # Use autocorrelation
410
+ frame_size = int(self.sr * 0.03) # 30ms frames
411
+
412
+ pitches = []
413
+ for i in range(0, len(audio) - frame_size, frame_size):
414
+ frame = audio[i:i + frame_size]
415
+
416
+ # Autocorrelation
417
+ corr = np.correlate(frame, frame, mode='full')
418
+ corr = corr[len(corr)//2:]
419
+
420
+ # Find first peak after initial decline
421
+ d = np.diff(corr)
422
+ start = np.where(d > 0)[0]
423
+
424
+ if len(start) > 0:
425
+ start = start[0]
426
+ peak = start + np.argmax(corr[start:start + int(self.sr / 80)])
427
+
428
+ if peak > 0:
429
+ f0 = self.sr / peak
430
+ if 60 < f0 < 400:
431
+ pitches.append(f0)
432
+
433
+ if pitches:
434
+ return np.median(pitches)
435
+ return 130 # Default
436
+
437
+ def _estimate_f0_variation(self, audio, base_f0):
438
+ """Estimate pitch variation"""
439
+ frame_size = int(self.sr * 0.03)
440
+ pitches = []
441
+
442
+ for i in range(0, len(audio) - frame_size, frame_size):
443
+ frame = audio[i:i + frame_size]
444
+ corr = np.correlate(frame, frame, mode='full')
445
+ corr = corr[len(corr)//2:]
446
+
447
+ d = np.diff(corr)
448
+ start = np.where(d > 0)[0]
449
+
450
+ if len(start) > 0:
451
+ start = start[0]
452
+ peak = start + np.argmax(corr[start:start + int(self.sr / 80)])
453
+ if peak > 0:
454
+ f0 = self.sr / peak
455
+ if 60 < f0 < 400:
456
+ pitches.append(f0)
457
+
458
+ if len(pitches) > 2:
459
+ return min(np.std(pitches), 50)
460
+ return 20
461
+
462
+ def _estimate_formants(self, audio):
463
+ """Estimate formant characteristics"""
464
+ # Simple spectral analysis
465
+ frame_size = 2048
466
+
467
+ if len(audio) < frame_size:
468
+ return {'shift': 1.0, 'brightness': 1.0}
469
+
470
+ # Get spectrum
471
+ spectrum = np.abs(np.fft.rfft(audio[:frame_size] * np.hanning(frame_size)))
472
+ freqs = np.fft.rfftfreq(frame_size, 1/self.sr)
473
+
474
+ # Find spectral centroid
475
+ centroid = np.sum(freqs * spectrum) / (np.sum(spectrum) + 1e-8)
476
+
477
+ # Estimate formant shift based on centroid
478
+ # Average male ~1200Hz, female ~1400Hz
479
+ if centroid > 1600:
480
+ shift = 1.2
481
+ brightness = 1.15
482
+ elif centroid > 1400:
483
+ shift = 1.1
484
+ brightness = 1.05
485
+ elif centroid > 1200:
486
+ shift = 1.0
487
+ brightness = 1.0
488
+ elif centroid > 1000:
489
+ shift = 0.9
490
+ brightness = 0.95
491
+ else:
492
+ shift = 0.85
493
+ brightness = 0.9
494
+
495
+ return {'shift': shift, 'brightness': brightness}
496
+
497
+ def _estimate_breathiness(self, audio):
498
+ """Estimate breathiness/aspiration"""
499
+ frame_size = 2048
500
+
501
+ if len(audio) < frame_size:
502
+ return 0.03
503
+
504
+ spectrum = np.abs(np.fft.rfft(audio[:frame_size]))
505
+ freqs = np.fft.rfftfreq(frame_size, 1/self.sr)
506
+
507
+ # High frequency energy ratio (breathiness indicator)
508
+ low_energy = np.sum(spectrum[freqs < 1000])
509
+ high_energy = np.sum(spectrum[(freqs > 2000) & (freqs < 5000)])
510
+
511
+ ratio = high_energy / (low_energy + 1e-8)
512
+ breathiness = np.clip(ratio * 0.1, 0.02, 0.1)
513
+
514
+ return breathiness
515
+
516
+
517
  # ============================================
518
  # TEXT TO PHONEME CONVERTER
519
  # ============================================
 
569
 
570
 
571
  # ============================================
572
+ # VOICE-AWARE FORMANT SYNTHESIZER
573
  # ============================================
574
 
575
+ class VoiceSynthesizer:
576
  def __init__(self, sample_rate=22050):
577
  self.sr = sample_rate
578
+ self.default_voice = VOICE_PROFILES["Emma (Female)"]
579
 
580
+ def synthesize(self, phonemes, voice_profile=None, rate=1.0, pitch=1.0):
581
  if not phonemes:
582
  return np.zeros(int(self.sr * 0.5), dtype=np.float32)
583
 
584
+ voice = voice_profile or self.default_voice
585
+
586
+ # Get voice parameters
587
+ f0 = voice.get('f0', 130) * pitch
588
+ f0_var = voice.get('f0_variation', 20)
589
+ formant_shift = voice.get('formant_shift', 1.0)
590
+ breathiness = voice.get('breathiness', 0.03)
591
+ voice_speed = voice.get('speed', 1.0) * rate
592
+ brightness = voice.get('brightness', 1.0)
593
+
594
  segments = []
595
 
596
  for i, phon in enumerate(phonemes):
597
  prev_phon = phonemes[i - 1] if i > 0 else None
598
  next_phon = phonemes[i + 1] if i < len(phonemes) - 1 else None
599
 
600
+ # Add pitch variation
601
+ phrase_pos = i / max(len(phonemes), 1)
602
+ f0_current = f0 + f0_var * np.sin(phrase_pos * np.pi) * 0.5
603
+
604
+ seg = self._synth_phoneme(
605
+ phon, f0_current, voice_speed, formant_shift,
606
+ breathiness, brightness, prev_phon, next_phon
607
+ )
608
  segments.append(seg)
609
 
610
  audio = self._smooth_concat(segments)
 
612
 
613
  return audio.astype(np.float32)
614
 
615
+ def _synth_phoneme(self, phon, f0, speed, formant_shift, breathiness,
616
+ brightness, prev_phon, next_phon):
617
  if phon in SILENCE:
618
+ dur = int(self.sr * SILENCE[phon] / 1000 / speed)
619
  return np.zeros(dur, dtype=np.float32)
620
 
 
621
  if phon in VOWELS:
622
+ return self._synth_vowel(phon, f0, speed, formant_shift,
623
+ breathiness, brightness, prev_phon, next_phon)
624
 
 
625
  if phon in CONSONANTS:
626
+ return self._synth_consonant(phon, f0, speed, formant_shift, breathiness)
627
 
628
  return np.zeros(100, dtype=np.float32)
629
 
630
+ def _synth_vowel(self, phon, f0, speed, formant_shift, breathiness,
631
+ brightness, prev_phon, next_phon):
632
  params = VOWELS[phon]
633
  f1, f2, f3, dur_ms, amp, voiced = params
634
 
635
+ # Apply formant shift
636
+ f1 = f1 * formant_shift
637
+ f2 = f2 * formant_shift
638
+ f3 = f3 * formant_shift
639
+
640
+ # Apply brightness
641
+ f2 = f2 * brightness
642
+ f3 = f3 * brightness
643
+
644
+ dur_ms = dur_ms / speed
645
  n = int(self.sr * dur_ms / 1000)
646
  n = max(n, 100)
647
  t = np.arange(n) / self.sr
648
 
649
+ # Generate glottal source with voice characteristics
650
+ source = self._glottal_source(t, f0, breathiness)
651
 
652
+ # Apply formants
653
+ audio = self._apply_formants(source, f1, f2, f3)
654
 
655
+ # Apply envelope
656
  envelope = self._vowel_envelope(n)
657
  audio = audio * envelope * amp
658
 
659
  return audio
660
 
661
+ def _synth_consonant(self, phon, f0, speed, formant_shift, breathiness):
 
662
  params = CONSONANTS[phon]
663
  ctype = params['type']
664
 
665
  if ctype == 'stop':
666
+ return self._synth_stop(phon, params, f0, speed, formant_shift)
667
  elif ctype == 'fric':
668
+ return self._synth_fricative(phon, params, f0, speed)
669
  elif ctype == 'affric':
670
+ return self._synth_affricate(phon, params, f0, speed)
671
  elif ctype == 'nasal':
672
+ return self._synth_nasal(phon, params, f0, speed, formant_shift, breathiness)
673
  elif ctype == 'liquid':
674
+ return self._synth_liquid(phon, params, f0, speed, formant_shift, breathiness)
675
  elif ctype == 'glide':
676
+ return self._synth_glide(phon, params, f0, speed, formant_shift, breathiness)
677
 
678
  return np.zeros(100, dtype=np.float32)
679
 
680
+ def _glottal_source(self, t, f0, breathiness):
 
 
681
  T0 = 1.0 / f0
682
  phase = (t % T0) / T0
683
 
 
684
  glottal = np.zeros_like(t)
685
  mask1 = phase < 0.4
686
  glottal[mask1] = 0.5 * (1 - np.cos(np.pi * phase[mask1] / 0.4))
687
 
 
688
  mask2 = (phase >= 0.4) & (phase < 0.6)
689
  glottal[mask2] = np.cos(np.pi * (phase[mask2] - 0.4) / 0.4)
690
 
691
  # Add breathiness
692
+ glottal += np.random.randn(len(t)) * breathiness
693
 
694
+ # Add shimmer
695
  shimmer = 1 + 0.02 * np.sin(2 * np.pi * 5 * t)
696
  glottal *= shimmer
697
 
698
  return glottal
699
 
700
+ def _apply_formants(self, source, f1, f2, f3):
 
 
 
 
701
  formants = [(f1, 90), (f2, 110), (f3, 130)]
 
 
 
 
 
 
 
 
 
 
 
 
702
  result = np.zeros_like(source)
703
+
704
+ for freq, bw in formants:
705
+ result += self._resonator(source, freq, bw)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
706
 
707
  return result
708
 
709
+ def _resonator(self, sig, freq, bw):
 
710
  if freq <= 0 or freq >= self.sr / 2:
711
  return sig
712
 
 
713
  r = np.exp(-np.pi * bw / self.sr)
714
  theta = 2 * np.pi * freq / self.sr
715
 
 
717
  a2 = r * r
718
  b0 = 1 - r
719
 
 
720
  y = np.zeros_like(sig)
721
  for i in range(2, len(sig)):
722
  y[i] = b0 * sig[i] - a1 * y[i-1] - a2 * y[i-2]
 
724
  return y
725
 
726
  def _vowel_envelope(self, n):
 
727
  env = np.ones(n)
 
 
728
  attack = max(1, n // 10)
 
 
 
729
  release = max(1, int(n * 0.15))
730
+
731
+ env[:attack] = np.sin(np.linspace(0, np.pi/2, attack)) ** 2
732
  env[-release:] = np.cos(np.linspace(0, np.pi/2, release)) ** 2
733
 
734
  return env
735
 
736
+ def _consonant_envelope(self, n):
737
+ env = np.ones(n)
738
+ attack = max(1, n // 8)
739
+ release = max(1, n // 6)
740
+
741
+ env[:attack] = np.linspace(0.1, 1, attack)
742
+ env[-release:] = np.linspace(1, 0.1, release)
743
+
744
+ return env
745
+
746
+ def _synth_stop(self, phon, params, f0, speed, formant_shift):
747
+ closure_ms = params['closure'] / speed
748
+ burst_ms = params['burst'] / speed
749
 
750
  closure_n = int(self.sr * closure_ms / 1000)
751
  burst_n = int(self.sr * burst_ms / 1000)
 
753
 
754
  audio = np.zeros(total_n, dtype=np.float32)
755
 
 
756
  if params['voiced']:
757
  t = np.arange(closure_n) / self.sr
758
  voice_bar = np.sin(2 * np.pi * f0 * 0.8 * t) * 0.15
759
  audio[:closure_n] = voice_bar
760
 
 
761
  burst = np.random.randn(burst_n)
762
+ burst_freq = params['burst_freq'] * formant_shift
763
 
 
 
764
  try:
765
  if burst_freq < self.sr / 2 - 100:
766
  b, a = signal.butter(2, burst_freq / (self.sr / 2), 'low')
 
768
  except:
769
  pass
770
 
 
771
  burst_env = np.exp(-np.linspace(0, 5, burst_n))
772
  burst *= burst_env * params['amp']
773
 
 
775
 
776
  return audio
777
 
778
+ def _synth_fricative(self, phon, params, f0, speed):
779
+ dur_ms = params['dur'] / speed
 
780
  n = int(self.sr * dur_ms / 1000)
781
 
 
782
  noise = np.random.randn(n)
783
 
 
784
  low = params['freq_low']
785
  high = min(params['freq_high'], self.sr / 2 - 100)
786
 
 
793
 
794
  audio = noise * params['amp']
795
 
 
796
  if params['voiced']:
797
  t = np.arange(n) / self.sr
798
+ voice = self._glottal_source(t, f0, 0.03) * 0.3
799
  audio = audio + voice
800
 
801
+ audio *= self._consonant_envelope(n)
 
 
802
 
803
  return audio.astype(np.float32)
804
 
805
+ def _synth_affricate(self, phon, params, f0, speed):
806
+ closure_ms = params['closure'] / speed
807
+ fric_ms = params['fric'] / speed
 
808
 
809
  closure_n = int(self.sr * closure_ms / 1000)
810
  fric_n = int(self.sr * fric_ms / 1000)
811
 
812
  audio = np.zeros(closure_n + fric_n, dtype=np.float32)
813
 
 
814
  if params['voiced']:
815
  t = np.arange(closure_n) / self.sr
816
  audio[:closure_n] = np.sin(2 * np.pi * f0 * 0.8 * t) * 0.1
817
 
 
818
  fric = np.random.randn(fric_n)
819
  low = params['freq_low']
820
  high = min(params['freq_high'], self.sr / 2 - 100)
 
827
 
828
  fric *= params['amp']
829
 
 
830
  fric_env = np.ones(fric_n)
831
  attack = fric_n // 6
832
  release = fric_n // 3
 
837
 
838
  return audio
839
 
840
+ def _synth_nasal(self, phon, params, f0, speed, formant_shift, breathiness):
841
+ dur_ms = params['dur'] / speed
 
842
  n = int(self.sr * dur_ms / 1000)
843
  t = np.arange(n) / self.sr
844
 
845
+ source = self._glottal_source(t, f0, breathiness)
 
846
 
847
+ f1 = params['f1'] * formant_shift
848
+ f2 = params['f2'] * formant_shift
849
+ f3 = params['f3'] * formant_shift
850
 
851
+ audio = self._apply_formants(source, f1, f2, f3)
 
 
 
 
852
 
853
+ nasal_pole = self._resonator(source, 250, 100) * 0.4
 
 
 
 
854
  audio += nasal_pole
855
 
 
856
  try:
857
  b, a = signal.butter(2, 800 / (self.sr / 2), 'low')
858
  audio = signal.filtfilt(b, a, audio)
 
863
 
864
  return audio.astype(np.float32)
865
 
866
+ def _synth_liquid(self, phon, params, f0, speed, formant_shift, breathiness):
867
+ dur_ms = params['dur'] / speed
 
868
  n = int(self.sr * dur_ms / 1000)
869
  t = np.arange(n) / self.sr
870
 
871
+ source = self._glottal_source(t, f0, breathiness)
872
 
873
+ f1 = params['f1'] * formant_shift
874
+ f2 = params['f2'] * formant_shift
875
+ f3 = params['f3'] * formant_shift
 
 
 
 
 
 
876
 
877
+ audio = self._apply_formants(source, f1, f2, f3)
878
  audio *= params['amp'] * self._consonant_envelope(n)
879
 
880
  return audio.astype(np.float32)
881
 
882
+ def _synth_glide(self, phon, params, f0, speed, formant_shift, breathiness):
883
+ dur_ms = params['dur'] / speed
 
884
  n = int(self.sr * dur_ms / 1000)
885
  t = np.arange(n) / self.sr
886
 
887
+ source = self._glottal_source(t, f0, breathiness)
 
 
 
888
 
889
+ f1 = params['f1'] * formant_shift
890
+ f2 = params['f2'] * formant_shift
891
+ f3 = params['f3'] * formant_shift
 
 
 
892
 
893
+ audio = self._apply_formants(source, f1, f2, f3)
894
  audio *= params['amp'] * self._consonant_envelope(n)
895
 
896
  return audio.astype(np.float32)
897
 
 
 
 
 
 
 
 
 
 
 
 
898
  def _smooth_concat(self, segments):
 
899
  if not segments:
900
  return np.zeros(1000, dtype=np.float32)
901
 
902
  if len(segments) == 1:
903
  return segments[0]
904
 
 
905
  overlap = 64
 
906
  total_len = sum(len(s) for s in segments) - overlap * (len(segments) - 1)
907
  total_len = max(total_len, 100)
908
 
 
922
  seg_to_add = seg[:seg_len]
923
 
924
  if i > 0 and pos > overlap:
 
925
  fade_len = min(overlap, seg_len)
926
  fade_in = np.linspace(0, 1, fade_len) ** 0.5
927
  fade_out = np.linspace(1, 0, fade_len) ** 0.5
 
937
  return audio
938
 
939
  def _normalize(self, audio):
 
940
  if len(audio) < 100:
941
  return audio
942
 
 
943
  audio = audio - np.mean(audio)
 
 
944
  max_val = np.max(np.abs(audio))
945
  if max_val > 0:
946
  audio = audio / max_val * 0.9
947
 
 
948
  fade = min(len(audio) // 40, 200)
949
  audio[:fade] *= np.linspace(0, 1, fade)
950
  audio[-fade:] *= np.linspace(1, 0, fade)
 
960
    def __init__(self, sample_rate=22050):
        """Assemble the TTS pipeline.

        Args:
            sample_rate: output sample rate in Hz (default 22050).
        """
        self.sr = sample_rate
        # Grapheme-to-phoneme converter (text front-end).
        self.text_to_phoneme = TextToPhoneme()
        # Formant synthesizer that renders phoneme sequences to audio.
        self.synthesizer = VoiceSynthesizer(sample_rate)
        # Extracts voice parameters from recorded samples (voice training).
        self.voice_analyzer = VoiceAnalyzer(sample_rate)
        # Active voice profile; defaults to the built-in Emma voice.
        self.current_voice = VOICE_PROFILES["Emma (Female)"]
967
+ def set_voice(self, voice_name):
968
+ if voice_name in VOICE_PROFILES:
969
+ self.current_voice = VOICE_PROFILES[voice_name]
970
+ elif voice_name in custom_voices:
971
+ self.current_voice = custom_voices[voice_name]
972
 
973
+ def speak(self, text, rate=1.0, pitch=1.0, voice_name=None):
974
  if not text or not text.strip():
975
  return np.zeros(self.sr, dtype=np.float32)
976
 
977
+ if voice_name:
978
+ self.set_voice(voice_name)
979
+
980
  phonemes = self.text_to_phoneme.convert(text)
981
 
982
  if not phonemes:
983
  return np.zeros(self.sr, dtype=np.float32)
984
 
985
+ audio = self.synthesizer.synthesize(phonemes, self.current_voice, rate, pitch)
986
 
987
  return audio
988
+
989
+ def train_voice(self, audio_data, voice_name="My Voice"):
990
+ """Train a new voice from audio sample"""
991
+ if audio_data is None:
992
+ return None
993
+
994
+ # Handle different input formats
995
+ if isinstance(audio_data, tuple):
996
+ sr, audio = audio_data
997
+ audio = audio.astype(np.float32)
998
+ if sr != self.sr:
999
+ # Resample
1000
+ duration = len(audio) / sr
1001
+ new_length = int(duration * self.sr)
1002
+ audio = signal.resample(audio, new_length)
1003
+ else:
1004
+ audio = audio_data.astype(np.float32)
1005
+
1006
+ # Normalize
1007
+ audio = audio / (np.max(np.abs(audio)) + 1e-8)
1008
+
1009
+ # Analyze voice
1010
+ profile = self.voice_analyzer.analyze(audio)
1011
+
1012
+ if profile:
1013
+ profile['name'] = voice_name
1014
+ custom_voices[voice_name] = profile
1015
+ return profile
1016
+
1017
+ return None
1018
 
1019
 
1020
  # ============================================
 
1022
  # ============================================
1023
 
1024
  print("=" * 50)
1025
+ print("πŸŽ™οΈ VEDES TTS - With Voice Training")
 
1026
  print("=" * 50)
1027
 
1028
  tts = VedesTTS(SAMPLE_RATE)
1029
 
1030
  print("βœ… Ready!")
1031
+ print(f"πŸ“’ Available voices: {len(VOICE_PROFILES)}")
1032
  print("=" * 50)
1033
 
1034
 
 
1036
  # GRADIO INTERFACE
1037
  # ============================================
1038
 
1039
+ def synthesize(text, voice_name, rate, pitch):
1040
  if not text or not text.strip():
1041
  return None
1042
 
 
1044
 
1045
  try:
1046
  pitch_mult = 2 ** (pitch / 12)
1047
+
1048
+ # Check custom voices first
1049
+ if voice_name in custom_voices:
1050
+ voice = custom_voices[voice_name]
1051
+ elif voice_name in VOICE_PROFILES:
1052
+ voice = VOICE_PROFILES[voice_name]
1053
+ else:
1054
+ voice = VOICE_PROFILES["Emma (Female)"]
1055
+
1056
+ tts.current_voice = voice
1057
  audio = tts.speak(text, rate=rate, pitch=pitch_mult)
1058
 
1059
  if len(audio) < 100:
 
1069
  return None
1070
 
1071
 
1072
def train_voice(audio, voice_name):
    """Gradio callback: build a custom voice profile from a recording.

    Returns a (status_markdown, dropdown_update) pair; the dropdown update
    always refreshes the choices so a newly trained voice appears.
    """
    if audio is None:
        return "❌ No audio provided", gr.update(choices=get_all_voices())

    # Fall back to a default label, then trim and cap the length.
    if not voice_name or not voice_name.strip():
        voice_name = "My Voice"
    voice_name = voice_name.strip()[:30]

    try:
        profile = tts.train_voice(audio, voice_name)

        if not profile:
            return "❌ Could not analyze voice. Try a longer sample.", gr.update(choices=get_all_voices())

        details = f"""
βœ… Voice "{voice_name}" created successfully!

**Voice Parameters:**
- Pitch (F0): {profile['f0']:.1f} Hz
- Pitch Variation: {profile['f0_variation']:.1f} Hz
- Formant Shift: {profile['formant_shift']:.2f}
- Breathiness: {profile['breathiness']:.3f}
- Brightness: {profile['brightness']:.2f}
"""
        return details, gr.update(choices=get_all_voices(), value=voice_name)

    except Exception as e:
        return f"❌ Error: {str(e)}", gr.update(choices=get_all_voices())
1102
+
1103
def get_all_voices():
    """Return every selectable voice name: built-ins first, then trained."""
    return [*VOICE_PROFILES, *custom_voices]
1107
+
1108
def get_voice_info(voice_name):
    """Markdown summary for the selected voice, or a prompt if unknown."""
    v = VOICE_PROFILES.get(voice_name)
    if v is None:
        v = custom_voices.get(voice_name)
    if v is None:
        return "Select a voice"

    return f"""
**{v.get('name', voice_name)}**
- Type: {v.get('gender', 'unknown').title()}
- Pitch: {v.get('f0', 130):.0f} Hz
- {v.get('description', '')}
"""
1123
+
1124
def create_custom_voice(name, pitch, formant, breathiness, speed, brightness):
    """Register a manually parameterized voice profile.

    Args:
        name: display/storage name; trimmed and capped at 30 characters,
            consistent with the voice-training path.
        pitch: base F0 in Hz.
        formant: vocal-tract formant scale factor.
        breathiness: slider value (1-10), stored as a 0.01-0.10 fraction.
        speed: natural speaking-rate multiplier.
        brightness: upper-formant emphasis factor.

    Returns:
        (status_message, dropdown_update) for the Gradio UI.
    """
    if not name or not name.strip():
        return "❌ Please enter a voice name", gr.update(choices=get_all_voices())

    # Cap the name length like the trained-voice path does.
    name = name.strip()[:30]

    profile = {
        "name": name,
        "gender": "custom",
        "f0": pitch,
        "f0_variation": 25,
        "formant_shift": formant,
        "breathiness": breathiness / 100,
        "speed": speed,
        "brightness": brightness,
        "description": f"Custom voice (pitch={pitch}Hz)"
    }

    custom_voices[name] = profile

    return f"βœ… Voice '{name}' created!", gr.update(choices=get_all_voices(), value=name)
1146
+
1147
+ # Build interface
1148
+ with gr.Blocks(title="Vedes TTS", theme=gr.themes.Soft()) as demo:
1149
 
1150
+ gr.Markdown("""
1151
+ # πŸŽ™οΈ Vedes TTS - Voice Training Edition
1152
+ ### Create and Use Custom Voices - 100% From Scratch
 
1153
  """)
1154
 
1155
+ with gr.Tabs():
1156
+ # ===== SPEAK TAB =====
1157
+ with gr.TabItem("πŸ”Š Speak"):
1158
+ with gr.Row():
1159
+ with gr.Column(scale=2):
1160
+ text_input = gr.Textbox(
1161
+ label="πŸ“ Text to Speak",
1162
+ placeholder="Type here...",
1163
+ lines=3
1164
+ )
1165
+
1166
+ voice_select = gr.Dropdown(
1167
+ choices=get_all_voices(),
1168
+ value="Emma (Female)",
1169
+ label="πŸ—£οΈ Voice"
1170
+ )
1171
+
1172
+ voice_info = gr.Markdown("Select a voice")
1173
+
1174
+ with gr.Row():
1175
+ rate = gr.Slider(0.6, 1.5, 0.9, step=0.1, label="⏱️ Speed")
1176
+ pitch = gr.Slider(-6, 6, 0, step=1, label="🎡 Pitch")
1177
+
1178
+ speak_btn = gr.Button("πŸ”Š Speak", variant="primary", size="lg")
1179
+
1180
+ with gr.Column(scale=1):
1181
+ audio_out = gr.Audio(label="🎧 Output", type="numpy")
1182
+
1183
+ gr.Examples(
1184
+ examples=[
1185
+ ["Hello, how are you?"],
1186
+ ["Good morning!"],
1187
+ ["My name is Vedes."],
1188
+ ["Thank you very much."],
1189
+ ["Have a nice day."],
1190
+ ],
1191
+ inputs=text_input,
1192
+ label="πŸ“š Examples"
1193
  )
1194
+
1195
+ # ===== TRAIN VOICE TAB =====
1196
+ with gr.TabItem("🎀 Train Voice"):
1197
+ gr.Markdown("""
1198
+ ### Train a New Voice from Audio
1199
+
1200
+ Record or upload an audio sample, and Vedes will extract the voice characteristics.
1201
+
1202
+ **Tips for best results:**
1203
+ - Record 5-10 seconds of speech
1204
+ - Speak clearly and naturally
1205
+ - Avoid background noise
1206
+ """)
1207
 
1208
  with gr.Row():
1209
+ with gr.Column():
1210
+ audio_input = gr.Audio(
1211
+ label="🎀 Record or Upload Audio",
1212
+ sources=["microphone", "upload"],
1213
+ type="numpy"
1214
+ )
1215
+
1216
+ voice_name_input = gr.Textbox(
1217
+ label="Voice Name",
1218
+ placeholder="My Voice",
1219
+ value="My Voice"
1220
+ )
1221
+
1222
+ train_btn = gr.Button("🧠 Train Voice", variant="primary")
1223
+
1224
+ with gr.Column():
1225
+ train_result = gr.Markdown("Upload audio and click Train")
1226
+ trained_voice_select = gr.Dropdown(
1227
+ choices=get_all_voices(),
1228
+ label="Use Trained Voice"
1229
+ )
1230
+
1231
+ # ===== CREATE VOICE TAB =====
1232
+ with gr.TabItem("βš™οΈ Create Voice"):
1233
+ gr.Markdown("""
1234
+ ### Create Custom Voice Manually
1235
 
1236
+ Adjust the parameters to create your own voice:
1237
+ """)
1238
+
1239
+ with gr.Row():
1240
+ with gr.Column():
1241
+ custom_name = gr.Textbox(
1242
+ label="Voice Name",
1243
+ placeholder="My Custom Voice"
1244
+ )
1245
+
1246
+ custom_pitch = gr.Slider(
1247
+ 60, 300, 150,
1248
+ label="Pitch (Hz)",
1249
+ info="80-130 = Male, 150-250 = Female, 250+ = Child"
1250
+ )
1251
+
1252
+ custom_formant = gr.Slider(
1253
+ 0.7, 1.4, 1.0, step=0.05,
1254
+ label="Formant Shift",
1255
+ info="<1.0 = Larger vocal tract (male), >1.0 = Smaller (female)"
1256
+ )
1257
+
1258
+ custom_breathiness = gr.Slider(
1259
+ 1, 10, 3,
1260
+ label="Breathiness",
1261
+ info="Higher = more breathy/airy voice"
1262
+ )
1263
+
1264
+ custom_speed = gr.Slider(
1265
+ 0.7, 1.3, 1.0, step=0.05,
1266
+ label="Natural Speed"
1267
+ )
1268
+
1269
+ custom_brightness = gr.Slider(
1270
+ 0.8, 1.3, 1.0, step=0.05,
1271
+ label="Brightness",
1272
+ info="Higher = brighter, more forward voice"
1273
+ )
1274
+
1275
+ create_btn = gr.Button("✨ Create Voice", variant="primary")
1276
+
1277
+ with gr.Column():
1278
+ create_result = gr.Markdown("")
1279
+ created_voice_select = gr.Dropdown(
1280
+ choices=get_all_voices(),
1281
+ label="Created Voices"
1282
+ )
1283
+
1284
+ gr.Markdown("""
1285
+ ### Voice Parameter Guide
1286
+
1287
+ | Parameter | Male | Female | Child |
1288
+ |-----------|------|--------|-------|
1289
+ | Pitch | 80-130 Hz | 150-250 Hz | 250-350 Hz |
1290
+ | Formant | 0.85-0.95 | 1.05-1.20 | 1.20-1.35 |
1291
+ | Breathiness | 2-4 | 3-6 | 2-4 |
1292
+ | Brightness | 0.9-1.0 | 1.0-1.15 | 1.1-1.25 |
1293
+ """)
1294
+
1295
+ # ===== VOICES TAB =====
1296
+ with gr.TabItem("πŸ‘₯ All Voices"):
1297
+ gr.Markdown("### Available Voices")
1298
+
1299
+ voice_cards = ""
1300
+ for name, v in VOICE_PROFILES.items():
1301
+ voice_cards += f"""
1302
+ **{name}**
1303
+ - Type: {v['gender'].title()}
1304
+ - Pitch: {v['f0']} Hz
1305
+ - {v['description']}
1306
+
1307
+ ---
1308
+ """
1309
+ gr.Markdown(voice_cards)
1310
 
1311
+ # Event handlers
1312
+ voice_select.change(get_voice_info, voice_select, voice_info)
1313
+ speak_btn.click(synthesize, [text_input, voice_select, rate, pitch], audio_out)
1314
+ text_input.submit(synthesize, [text_input, voice_select, rate, pitch], audio_out)
 
 
 
 
 
 
 
 
 
 
 
 
1315
 
1316
+ train_btn.click(
1317
+ train_voice,
1318
+ [audio_input, voice_name_input],
1319
+ [train_result, trained_voice_select]
1320
+ )
1321
 
1322
+ create_btn.click(
1323
+ create_custom_voice,
1324
+ [custom_name, custom_pitch, custom_formant, custom_breathiness,
1325
+ custom_speed, custom_brightness],
1326
+ [create_result, created_voice_select]
1327
+ )
1328
 
1329
+ # Update voice selectors when new voices are created
1330
+ trained_voice_select.change(
1331
+ lambda x: x, trained_voice_select, voice_select
1332
+ )
1333
+ created_voice_select.change(
1334
+ lambda x: x, created_voice_select, voice_select
1335
+ )
1336
 
1337
 
1338
  if __name__ == "__main__":