vedaco commited on
Commit
b67fedc
Β·
verified Β·
1 Parent(s): de1505b

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +842 -730
app.py CHANGED
@@ -4,48 +4,144 @@ from scipy import signal
4
  from scipy.io import wavfile
5
  import tempfile
6
  import re
 
 
7
 
8
  # ============================================
9
- # VEDES TTS - CLEAR SPEECH VERSION
10
  # 100% From Scratch - No APIs
11
  # ============================================
12
 
13
  SAMPLE_RATE = 22050
14
 
15
  # ============================================
16
- # PHONEME DATA - OPTIMIZED FOR CLARITY
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
17
  # ============================================
18
 
19
- # Format: F1, F2, F3, duration_ms, amplitude, is_voiced
20
  VOWELS = {
21
- 'IY': (280, 2250, 2890, 150, 1.0, True), # bee
22
- 'IH': (400, 1920, 2550, 120, 0.9, True), # bit
23
- 'EH': (550, 1770, 2490, 130, 0.95, True), # bet
24
- 'AE': (690, 1660, 2490, 140, 1.0, True), # bat
25
- 'AA': (710, 1100, 2540, 150, 1.0, True), # father
26
- 'AO': (590, 880, 2540, 140, 0.95, True), # bought
27
- 'UH': (470, 1100, 2540, 120, 0.9, True), # book
28
- 'UW': (310, 870, 2250, 150, 1.0, True), # boot
29
- 'AH': (640, 1200, 2400, 100, 0.85, True), # but
30
- 'AX': (500, 1500, 2500, 80, 0.7, True), # about (schwa)
31
- 'ER': (500, 1350, 1700, 140, 0.9, True), # bird
32
- 'EY': (500, 1900, 2600, 160, 1.0, True), # bay
33
- 'AY': (700, 1200, 2600, 180, 1.0, True), # buy
34
- 'OY': (500, 900, 2500, 180, 1.0, True), # boy
35
- 'AW': (700, 1100, 2600, 180, 1.0, True), # now
36
- 'OW': (500, 900, 2500, 160, 1.0, True), # go
37
  }
38
 
39
  CONSONANTS = {
40
- # Stops: closure_ms, burst_ms, voice_bar, burst_freq
41
  'P': {'type': 'stop', 'closure': 80, 'burst': 30, 'voiced': False, 'burst_freq': 800, 'amp': 0.6},
42
  'B': {'type': 'stop', 'closure': 50, 'burst': 25, 'voiced': True, 'burst_freq': 800, 'amp': 0.7},
43
  'T': {'type': 'stop', 'closure': 70, 'burst': 30, 'voiced': False, 'burst_freq': 3500, 'amp': 0.7},
44
  'D': {'type': 'stop', 'closure': 40, 'burst': 25, 'voiced': True, 'burst_freq': 3500, 'amp': 0.7},
45
  'K': {'type': 'stop', 'closure': 80, 'burst': 40, 'voiced': False, 'burst_freq': 1500, 'amp': 0.7},
46
  'G': {'type': 'stop', 'closure': 50, 'burst': 30, 'voiced': True, 'burst_freq': 1500, 'amp': 0.7},
47
-
48
- # Fricatives: duration, freq_low, freq_high, voiced
49
  'F': {'type': 'fric', 'dur': 120, 'freq_low': 1500, 'freq_high': 8000, 'voiced': False, 'amp': 0.4},
50
  'V': {'type': 'fric', 'dur': 80, 'freq_low': 1500, 'freq_high': 8000, 'voiced': True, 'amp': 0.5},
51
  'TH': {'type': 'fric', 'dur': 100, 'freq_low': 1400, 'freq_high': 6000, 'voiced': False, 'amp': 0.3},
@@ -55,530 +151,209 @@ CONSONANTS = {
55
  'SH': {'type': 'fric', 'dur': 120, 'freq_low': 2000, 'freq_high': 6000, 'voiced': False, 'amp': 0.5},
56
  'ZH': {'type': 'fric', 'dur': 80, 'freq_low': 2000, 'freq_high': 6000, 'voiced': True, 'amp': 0.5},
57
  'HH': {'type': 'fric', 'dur': 80, 'freq_low': 500, 'freq_high': 2000, 'voiced': False, 'amp': 0.3},
58
-
59
- # Affricates
60
  'CH': {'type': 'affric', 'closure': 60, 'fric': 80, 'freq_low': 2000, 'freq_high': 6000, 'voiced': False, 'amp': 0.6},
61
  'JH': {'type': 'affric', 'closure': 40, 'fric': 60, 'freq_low': 2000, 'freq_high': 6000, 'voiced': True, 'amp': 0.6},
62
-
63
- # Nasals: F1, F2, F3, duration
64
  'M': {'type': 'nasal', 'f1': 280, 'f2': 1000, 'f3': 2200, 'dur': 100, 'amp': 0.8},
65
  'N': {'type': 'nasal', 'f1': 280, 'f2': 1700, 'f3': 2500, 'dur': 90, 'amp': 0.8},
66
  'NG': {'type': 'nasal', 'f1': 300, 'f2': 2000, 'f3': 2700, 'dur': 100, 'amp': 0.8},
67
-
68
- # Liquids
69
  'L': {'type': 'liquid', 'f1': 380, 'f2': 1000, 'f3': 2700, 'dur': 90, 'amp': 0.85},
70
  'R': {'type': 'liquid', 'f1': 350, 'f2': 1300, 'f3': 1700, 'dur': 90, 'amp': 0.85},
71
-
72
- # Glides
73
  'W': {'type': 'glide', 'f1': 300, 'f2': 700, 'f3': 2200, 'dur': 80, 'amp': 0.8},
74
  'Y': {'type': 'glide', 'f1': 280, 'f2': 2200, 'f3': 2900, 'dur': 70, 'amp': 0.8},
75
  }
76
 
77
- # Silence
78
- SILENCE = {
79
- 'SIL': 60, # Short pause between words
80
- 'PAU': 200, # Long pause (punctuation)
81
- }
82
-
83
 
84
  # ============================================
85
- # COMPREHENSIVE PRONUNCIATION DICTIONARY
86
  # ============================================
87
 
88
  DICTIONARY = {
89
- # ===== FUNCTION WORDS =====
90
- 'a': ['AX'],
91
- 'an': ['AE', 'N'],
92
- 'the': ['DH', 'AX'],
93
- 'and': ['AE', 'N', 'D'],
94
- 'or': ['AO', 'R'],
95
- 'but': ['B', 'AH', 'T'],
96
- 'if': ['IH', 'F'],
97
- 'of': ['AH', 'V'],
98
- 'to': ['T', 'UW'],
99
- 'in': ['IH', 'N'],
100
- 'on': ['AA', 'N'],
101
- 'at': ['AE', 'T'],
102
- 'by': ['B', 'AY'],
103
- 'for': ['F', 'AO', 'R'],
104
- 'with': ['W', 'IH', 'TH'],
105
- 'from': ['F', 'R', 'AH', 'M'],
106
- 'up': ['AH', 'P'],
107
- 'out': ['AW', 'T'],
108
- 'as': ['AE', 'Z'],
109
- 'so': ['S', 'OW'],
110
- 'not': ['N', 'AA', 'T'],
111
 
112
- # ===== PRONOUNS =====
113
- 'i': ['AY'],
114
- 'me': ['M', 'IY'],
115
- 'my': ['M', 'AY'],
116
- 'mine': ['M', 'AY', 'N'],
117
- 'you': ['Y', 'UW'],
118
- 'your': ['Y', 'AO', 'R'],
119
- 'yours': ['Y', 'AO', 'R', 'Z'],
120
- 'he': ['HH', 'IY'],
121
- 'him': ['HH', 'IH', 'M'],
122
- 'his': ['HH', 'IH', 'Z'],
123
- 'she': ['SH', 'IY'],
124
- 'her': ['HH', 'ER'],
125
- 'hers': ['HH', 'ER', 'Z'],
126
- 'it': ['IH', 'T'],
127
- 'its': ['IH', 'T', 'S'],
128
- 'we': ['W', 'IY'],
129
- 'us': ['AH', 'S'],
130
- 'our': ['AW', 'ER'],
131
- 'they': ['DH', 'EY'],
132
- 'them': ['DH', 'EH', 'M'],
133
- 'their': ['DH', 'EH', 'R'],
134
- 'this': ['DH', 'IH', 'S'],
135
- 'that': ['DH', 'AE', 'T'],
136
- 'these': ['DH', 'IY', 'Z'],
137
- 'those': ['DH', 'OW', 'Z'],
138
- 'what': ['W', 'AH', 'T'],
139
- 'who': ['HH', 'UW'],
140
- 'where': ['W', 'EH', 'R'],
141
- 'when': ['W', 'EH', 'N'],
142
- 'why': ['W', 'AY'],
143
- 'how': ['HH', 'AW'],
144
- 'which': ['W', 'IH', 'CH'],
145
 
146
- # ===== BE VERBS =====
147
- 'am': ['AE', 'M'],
148
- 'is': ['IH', 'Z'],
149
- 'are': ['AA', 'R'],
150
- 'was': ['W', 'AA', 'Z'],
151
- 'were': ['W', 'ER'],
152
- 'be': ['B', 'IY'],
153
- 'been': ['B', 'IH', 'N'],
154
- 'being': ['B', 'IY', 'IH', 'NG'],
155
 
156
- # ===== HAVE VERBS =====
157
- 'have': ['HH', 'AE', 'V'],
158
- 'has': ['HH', 'AE', 'Z'],
159
- 'had': ['HH', 'AE', 'D'],
160
- 'having': ['HH', 'AE', 'V', 'IH', 'NG'],
161
 
162
- # ===== DO VERBS =====
163
- 'do': ['D', 'UW'],
164
- 'does': ['D', 'AH', 'Z'],
165
- 'did': ['D', 'IH', 'D'],
166
- 'done': ['D', 'AH', 'N'],
167
- 'doing': ['D', 'UW', 'IH', 'NG'],
168
 
169
- # ===== MODAL VERBS =====
170
- 'will': ['W', 'IH', 'L'],
171
- 'would': ['W', 'UH', 'D'],
172
- 'can': ['K', 'AE', 'N'],
173
- 'could': ['K', 'UH', 'D'],
174
- 'should': ['SH', 'UH', 'D'],
175
- 'shall': ['SH', 'AE', 'L'],
176
- 'may': ['M', 'EY'],
177
- 'might': ['M', 'AY', 'T'],
178
- 'must': ['M', 'AH', 'S', 'T'],
179
 
180
- # ===== COMMON VERBS =====
181
- 'go': ['G', 'OW'],
182
- 'goes': ['G', 'OW', 'Z'],
183
- 'going': ['G', 'OW', 'IH', 'NG'],
184
- 'went': ['W', 'EH', 'N', 'T'],
185
- 'gone': ['G', 'AO', 'N'],
186
- 'come': ['K', 'AH', 'M'],
187
- 'comes': ['K', 'AH', 'M', 'Z'],
188
- 'coming': ['K', 'AH', 'M', 'IH', 'NG'],
189
- 'came': ['K', 'EY', 'M'],
190
- 'get': ['G', 'EH', 'T'],
191
- 'gets': ['G', 'EH', 'T', 'S'],
192
- 'getting': ['G', 'EH', 'T', 'IH', 'NG'],
193
- 'got': ['G', 'AA', 'T'],
194
- 'make': ['M', 'EY', 'K'],
195
- 'makes': ['M', 'EY', 'K', 'S'],
196
- 'making': ['M', 'EY', 'K', 'IH', 'NG'],
197
- 'made': ['M', 'EY', 'D'],
198
- 'take': ['T', 'EY', 'K'],
199
- 'takes': ['T', 'EY', 'K', 'S'],
200
- 'taking': ['T', 'EY', 'K', 'IH', 'NG'],
201
- 'took': ['T', 'UH', 'K'],
202
- 'taken': ['T', 'EY', 'K', 'AX', 'N'],
203
- 'see': ['S', 'IY'],
204
- 'sees': ['S', 'IY', 'Z'],
205
- 'seeing': ['S', 'IY', 'IH', 'NG'],
206
- 'saw': ['S', 'AO'],
207
- 'seen': ['S', 'IY', 'N'],
208
- 'say': ['S', 'EY'],
209
- 'says': ['S', 'EH', 'Z'],
210
- 'saying': ['S', 'EY', 'IH', 'NG'],
211
- 'said': ['S', 'EH', 'D'],
212
- 'know': ['N', 'OW'],
213
- 'knows': ['N', 'OW', 'Z'],
214
- 'knowing': ['N', 'OW', 'IH', 'NG'],
215
- 'knew': ['N', 'UW'],
216
- 'known': ['N', 'OW', 'N'],
217
- 'think': ['TH', 'IH', 'NG', 'K'],
218
- 'thinks': ['TH', 'IH', 'NG', 'K', 'S'],
219
- 'thinking': ['TH', 'IH', 'NG', 'K', 'IH', 'NG'],
220
- 'thought': ['TH', 'AO', 'T'],
221
- 'want': ['W', 'AA', 'N', 'T'],
222
- 'wants': ['W', 'AA', 'N', 'T', 'S'],
223
- 'wanted': ['W', 'AA', 'N', 'T', 'IH', 'D'],
224
- 'wanting': ['W', 'AA', 'N', 'T', 'IH', 'NG'],
225
- 'give': ['G', 'IH', 'V'],
226
- 'gives': ['G', 'IH', 'V', 'Z'],
227
- 'giving': ['G', 'IH', 'V', 'IH', 'NG'],
228
- 'gave': ['G', 'EY', 'V'],
229
- 'given': ['G', 'IH', 'V', 'AX', 'N'],
230
- 'use': ['Y', 'UW', 'Z'],
231
- 'uses': ['Y', 'UW', 'Z', 'IH', 'Z'],
232
- 'using': ['Y', 'UW', 'Z', 'IH', 'NG'],
233
- 'used': ['Y', 'UW', 'Z', 'D'],
234
- 'find': ['F', 'AY', 'N', 'D'],
235
- 'finds': ['F', 'AY', 'N', 'D', 'Z'],
236
- 'finding': ['F', 'AY', 'N', 'D', 'IH', 'NG'],
237
- 'found': ['F', 'AW', 'N', 'D'],
238
- 'tell': ['T', 'EH', 'L'],
239
- 'tells': ['T', 'EH', 'L', 'Z'],
240
- 'telling': ['T', 'EH', 'L', 'IH', 'NG'],
241
- 'told': ['T', 'OW', 'L', 'D'],
242
- 'ask': ['AE', 'S', 'K'],
243
- 'asks': ['AE', 'S', 'K', 'S'],
244
- 'asking': ['AE', 'S', 'K', 'IH', 'NG'],
245
- 'asked': ['AE', 'S', 'K', 'T'],
246
- 'work': ['W', 'ER', 'K'],
247
- 'works': ['W', 'ER', 'K', 'S'],
248
- 'working': ['W', 'ER', 'K', 'IH', 'NG'],
249
- 'worked': ['W', 'ER', 'K', 'T'],
250
- 'try': ['T', 'R', 'AY'],
251
- 'tries': ['T', 'R', 'AY', 'Z'],
252
- 'trying': ['T', 'R', 'AY', 'IH', 'NG'],
253
- 'tried': ['T', 'R', 'AY', 'D'],
254
- 'call': ['K', 'AO', 'L'],
255
- 'calls': ['K', 'AO', 'L', 'Z'],
256
- 'calling': ['K', 'AO', 'L', 'IH', 'NG'],
257
- 'called': ['K', 'AO', 'L', 'D'],
258
- 'need': ['N', 'IY', 'D'],
259
- 'needs': ['N', 'IY', 'D', 'Z'],
260
- 'needing': ['N', 'IY', 'D', 'IH', 'NG'],
261
- 'needed': ['N', 'IY', 'D', 'IH', 'D'],
262
- 'feel': ['F', 'IY', 'L'],
263
- 'feels': ['F', 'IY', 'L', 'Z'],
264
- 'feeling': ['F', 'IY', 'L', 'IH', 'NG'],
265
- 'felt': ['F', 'EH', 'L', 'T'],
266
- 'put': ['P', 'UH', 'T'],
267
- 'puts': ['P', 'UH', 'T', 'S'],
268
- 'putting': ['P', 'UH', 'T', 'IH', 'NG'],
269
- 'keep': ['K', 'IY', 'P'],
270
- 'keeps': ['K', 'IY', 'P', 'S'],
271
- 'keeping': ['K', 'IY', 'P', 'IH', 'NG'],
272
- 'kept': ['K', 'EH', 'P', 'T'],
273
- 'let': ['L', 'EH', 'T'],
274
- 'lets': ['L', 'EH', 'T', 'S'],
275
- 'letting': ['L', 'EH', 'T', 'IH', 'NG'],
276
- 'begin': ['B', 'IH', 'G', 'IH', 'N'],
277
- 'begins': ['B', 'IH', 'G', 'IH', 'N', 'Z'],
278
- 'beginning': ['B', 'IH', 'G', 'IH', 'N', 'IH', 'NG'],
279
- 'began': ['B', 'IH', 'G', 'AE', 'N'],
280
- 'seem': ['S', 'IY', 'M'],
281
- 'seems': ['S', 'IY', 'M', 'Z'],
282
- 'seeming': ['S', 'IY', 'M', 'IH', 'NG'],
283
- 'seemed': ['S', 'IY', 'M', 'D'],
284
- 'help': ['HH', 'EH', 'L', 'P'],
285
- 'helps': ['HH', 'EH', 'L', 'P', 'S'],
286
- 'helping': ['HH', 'EH', 'L', 'P', 'IH', 'NG'],
287
- 'helped': ['HH', 'EH', 'L', 'P', 'T'],
288
- 'show': ['SH', 'OW'],
289
- 'shows': ['SH', 'OW', 'Z'],
290
- 'showing': ['SH', 'OW', 'IH', 'NG'],
291
- 'showed': ['SH', 'OW', 'D'],
292
- 'shown': ['SH', 'OW', 'N'],
293
- 'hear': ['HH', 'IY', 'R'],
294
- 'hears': ['HH', 'IY', 'R', 'Z'],
295
- 'hearing': ['HH', 'IY', 'R', 'IH', 'NG'],
296
- 'heard': ['HH', 'ER', 'D'],
297
- 'play': ['P', 'L', 'EY'],
298
- 'plays': ['P', 'L', 'EY', 'Z'],
299
- 'playing': ['P', 'L', 'EY', 'IH', 'NG'],
300
- 'played': ['P', 'L', 'EY', 'D'],
301
- 'run': ['R', 'AH', 'N'],
302
- 'runs': ['R', 'AH', 'N', 'Z'],
303
- 'running': ['R', 'AH', 'N', 'IH', 'NG'],
304
- 'ran': ['R', 'AE', 'N'],
305
- 'move': ['M', 'UW', 'V'],
306
- 'moves': ['M', 'UW', 'V', 'Z'],
307
- 'moving': ['M', 'UW', 'V', 'IH', 'NG'],
308
- 'moved': ['M', 'UW', 'V', 'D'],
309
- 'live': ['L', 'IH', 'V'],
310
- 'lives': ['L', 'IH', 'V', 'Z'],
311
- 'living': ['L', 'IH', 'V', 'IH', 'NG'],
312
- 'lived': ['L', 'IH', 'V', 'D'],
313
  'believe': ['B', 'IH', 'L', 'IY', 'V'],
314
- 'believes': ['B', 'IH', 'L', 'IY', 'V', 'Z'],
315
- 'believed': ['B', 'IH', 'L', 'IY', 'V', 'D'],
316
- 'read': ['R', 'IY', 'D'],
317
- 'reads': ['R', 'IY', 'D', 'Z'],
318
- 'reading': ['R', 'IY', 'D', 'IH', 'NG'],
319
- 'write': ['R', 'AY', 'T'],
320
- 'writes': ['R', 'AY', 'T', 'S'],
321
- 'writing': ['R', 'AY', 'T', 'IH', 'NG'],
322
- 'wrote': ['R', 'OW', 'T'],
323
- 'written': ['R', 'IH', 'T', 'AX', 'N'],
324
- 'speak': ['S', 'P', 'IY', 'K'],
325
- 'speaks': ['S', 'P', 'IY', 'K', 'S'],
326
- 'speaking': ['S', 'P', 'IY', 'K', 'IH', 'NG'],
327
- 'spoke': ['S', 'P', 'OW', 'K'],
328
- 'spoken': ['S', 'P', 'OW', 'K', 'AX', 'N'],
329
- 'learn': ['L', 'ER', 'N'],
330
- 'learns': ['L', 'ER', 'N', 'Z'],
331
- 'learning': ['L', 'ER', 'N', 'IH', 'NG'],
332
- 'learned': ['L', 'ER', 'N', 'D'],
333
- 'like': ['L', 'AY', 'K'],
334
- 'likes': ['L', 'AY', 'K', 'S'],
335
- 'liking': ['L', 'AY', 'K', 'IH', 'NG'],
336
- 'liked': ['L', 'AY', 'K', 'T'],
337
- 'look': ['L', 'UH', 'K'],
338
- 'looks': ['L', 'UH', 'K', 'S'],
339
- 'looking': ['L', 'UH', 'K', 'IH', 'NG'],
340
- 'looked': ['L', 'UH', 'K', 'T'],
341
- 'love': ['L', 'AH', 'V'],
342
- 'loves': ['L', 'AH', 'V', 'Z'],
343
- 'loving': ['L', 'AH', 'V', 'IH', 'NG'],
344
- 'loved': ['L', 'AH', 'V', 'D'],
345
 
346
- # ===== ADJECTIVES =====
347
- 'good': ['G', 'UH', 'D'],
348
- 'better': ['B', 'EH', 'T', 'ER'],
349
- 'best': ['B', 'EH', 'S', 'T'],
350
- 'bad': ['B', 'AE', 'D'],
351
- 'worse': ['W', 'ER', 'S'],
352
- 'worst': ['W', 'ER', 'S', 'T'],
353
- 'new': ['N', 'UW'],
354
- 'old': ['OW', 'L', 'D'],
355
- 'young': ['Y', 'AH', 'NG'],
356
- 'big': ['B', 'IH', 'G'],
357
- 'small': ['S', 'M', 'AO', 'L'],
358
- 'long': ['L', 'AO', 'NG'],
359
- 'short': ['SH', 'AO', 'R', 'T'],
360
- 'high': ['HH', 'AY'],
361
- 'low': ['L', 'OW'],
362
- 'great': ['G', 'R', 'EY', 'T'],
363
- 'little': ['L', 'IH', 'T', 'AX', 'L'],
364
- 'right': ['R', 'AY', 'T'],
365
- 'wrong': ['R', 'AO', 'NG'],
366
- 'first': ['F', 'ER', 'S', 'T'],
367
- 'last': ['L', 'AE', 'S', 'T'],
368
- 'next': ['N', 'EH', 'K', 'S', 'T'],
369
- 'same': ['S', 'EY', 'M'],
370
- 'different': ['D', 'IH', 'F', 'R', 'AX', 'N', 'T'],
371
- 'other': ['AH', 'DH', 'ER'],
372
- 'own': ['OW', 'N'],
373
- 'important': ['IH', 'M', 'P', 'AO', 'R', 'T', 'AX', 'N', 'T'],
374
- 'real': ['R', 'IY', 'L'],
375
- 'sure': ['SH', 'UH', 'R'],
376
- 'true': ['T', 'R', 'UW'],
377
- 'happy': ['HH', 'AE', 'P', 'IY'],
378
- 'nice': ['N', 'AY', 'S'],
379
- 'easy': ['IY', 'Z', 'IY'],
380
- 'hard': ['HH', 'AA', 'R', 'D'],
381
- 'fine': ['F', 'AY', 'N'],
382
- 'clear': ['K', 'L', 'IY', 'R'],
383
- 'free': ['F', 'R', 'IY'],
384
- 'full': ['F', 'UH', 'L'],
385
- 'open': ['OW', 'P', 'AX', 'N'],
386
- 'simple': ['S', 'IH', 'M', 'P', 'AX', 'L'],
387
- 'ready': ['R', 'EH', 'D', 'IY'],
388
- 'able': ['EY', 'B', 'AX', 'L'],
389
- 'possible': ['P', 'AA', 'S', 'AX', 'B', 'AX', 'L'],
390
 
391
- # ===== ADVERBS =====
392
- 'very': ['V', 'EH', 'R', 'IY'],
393
- 'really': ['R', 'IY', 'L', 'IY'],
394
- 'just': ['JH', 'AH', 'S', 'T'],
395
- 'only': ['OW', 'N', 'L', 'IY'],
396
- 'also': ['AO', 'L', 'S', 'OW'],
397
- 'well': ['W', 'EH', 'L'],
398
- 'now': ['N', 'AW'],
399
- 'then': ['DH', 'EH', 'N'],
400
- 'here': ['HH', 'IY', 'R'],
401
- 'there': ['DH', 'EH', 'R'],
402
- 'still': ['S', 'T', 'IH', 'L'],
403
- 'even': ['IY', 'V', 'AX', 'N'],
404
- 'back': ['B', 'AE', 'K'],
405
- 'again': ['AX', 'G', 'EH', 'N'],
406
- 'always': ['AO', 'L', 'W', 'EY', 'Z'],
407
- 'never': ['N', 'EH', 'V', 'ER'],
408
- 'ever': ['EH', 'V', 'ER'],
409
- 'often': ['AO', 'F', 'AX', 'N'],
410
- 'sometimes': ['S', 'AH', 'M', 'T', 'AY', 'M', 'Z'],
411
- 'today': ['T', 'AX', 'D', 'EY'],
412
- 'maybe': ['M', 'EY', 'B', 'IY'],
413
- 'too': ['T', 'UW'],
414
- 'much': ['M', 'AH', 'CH'],
415
- 'more': ['M', 'AO', 'R'],
416
- 'most': ['M', 'OW', 'S', 'T'],
417
- 'less': ['L', 'EH', 'S'],
418
- 'away': ['AX', 'W', 'EY'],
419
- 'together': ['T', 'AX', 'G', 'EH', 'DH', 'ER'],
420
 
421
- # ===== NOUNS =====
422
- 'time': ['T', 'AY', 'M'],
423
- 'year': ['Y', 'IY', 'R'],
424
- 'day': ['D', 'EY'],
425
- 'way': ['W', 'EY'],
426
- 'man': ['M', 'AE', 'N'],
427
- 'men': ['M', 'EH', 'N'],
428
- 'woman': ['W', 'UH', 'M', 'AX', 'N'],
429
- 'women': ['W', 'IH', 'M', 'IH', 'N'],
430
- 'child': ['CH', 'AY', 'L', 'D'],
431
- 'children': ['CH', 'IH', 'L', 'D', 'R', 'AX', 'N'],
432
- 'world': ['W', 'ER', 'L', 'D'],
433
- 'life': ['L', 'AY', 'F'],
434
- 'hand': ['HH', 'AE', 'N', 'D'],
435
- 'part': ['P', 'AA', 'R', 'T'],
436
- 'place': ['P', 'L', 'EY', 'S'],
437
- 'thing': ['TH', 'IH', 'NG'],
438
- 'things': ['TH', 'IH', 'NG', 'Z'],
439
- 'people': ['P', 'IY', 'P', 'AX', 'L'],
440
- 'person': ['P', 'ER', 'S', 'AX', 'N'],
441
- 'home': ['HH', 'OW', 'M'],
442
- 'house': ['HH', 'AW', 'S'],
443
- 'room': ['R', 'UW', 'M'],
444
- 'word': ['W', 'ER', 'D'],
445
- 'words': ['W', 'ER', 'D', 'Z'],
446
- 'name': ['N', 'EY', 'M'],
447
- 'number': ['N', 'AH', 'M', 'B', 'ER'],
448
- 'water': ['W', 'AO', 'T', 'ER'],
449
- 'money': ['M', 'AH', 'N', 'IY'],
450
  'family': ['F', 'AE', 'M', 'AX', 'L', 'IY'],
451
- 'friend': ['F', 'R', 'EH', 'N', 'D'],
452
- 'friends': ['F', 'R', 'EH', 'N', 'D', 'Z'],
453
- 'mother': ['M', 'AH', 'DH', 'ER'],
454
- 'father': ['F', 'AA', 'DH', 'ER'],
455
- 'boy': ['B', 'OY'],
456
- 'girl': ['G', 'ER', 'L'],
457
- 'head': ['HH', 'EH', 'D'],
458
- 'face': ['F', 'EY', 'S'],
459
- 'eye': ['AY'],
460
- 'eyes': ['AY', 'Z'],
461
- 'body': ['B', 'AA', 'D', 'IY'],
462
- 'heart': ['HH', 'AA', 'R', 'T'],
463
- 'mind': ['M', 'AY', 'N', 'D'],
464
- 'voice': ['V', 'OY', 'S'],
465
- 'night': ['N', 'AY', 'T'],
466
  'morning': ['M', 'AO', 'R', 'N', 'IH', 'NG'],
467
- 'week': ['W', 'IY', 'K'],
468
- 'month': ['M', 'AH', 'N', 'TH'],
469
- 'hour': ['AW', 'ER'],
470
- 'minute': ['M', 'IH', 'N', 'IH', 'T'],
471
- 'second': ['S', 'EH', 'K', 'AX', 'N', 'D'],
472
- 'school': ['S', 'K', 'UW', 'L'],
473
- 'book': ['B', 'UH', 'K'],
474
  'story': ['S', 'T', 'AO', 'R', 'IY'],
475
  'question': ['K', 'W', 'EH', 'S', 'CH', 'AX', 'N'],
476
  'answer': ['AE', 'N', 'S', 'ER'],
477
- 'problem': ['P', 'R', 'AA', 'B', 'L', 'AX', 'M'],
478
- 'idea': ['AY', 'D', 'IY', 'AX'],
479
- 'fact': ['F', 'AE', 'K', 'T'],
480
- 'reason': ['R', 'IY', 'Z', 'AX', 'N'],
481
- 'example': ['IH', 'G', 'Z', 'AE', 'M', 'P', 'AX', 'L'],
482
- 'point': ['P', 'OY', 'N', 'T'],
483
- 'end': ['EH', 'N', 'D'],
484
- 'side': ['S', 'AY', 'D'],
485
- 'kind': ['K', 'AY', 'N', 'D'],
486
- 'case': ['K', 'EY', 'S'],
487
- 'line': ['L', 'AY', 'N'],
488
- 'car': ['K', 'AA', 'R'],
489
- 'city': ['S', 'IH', 'T', 'IY'],
490
- 'country': ['K', 'AH', 'N', 'T', 'R', 'IY'],
491
- 'door': ['D', 'AO', 'R'],
492
- 'job': ['JH', 'AA', 'B'],
493
- 'team': ['T', 'IY', 'M'],
494
- 'game': ['G', 'EY', 'M'],
495
- 'food': ['F', 'UW', 'D'],
496
- 'music': ['M', 'Y', 'UW', 'Z', 'IH', 'K'],
497
- 'art': ['AA', 'R', 'T'],
498
 
499
- # ===== NUMBERS =====
500
- 'zero': ['Z', 'IY', 'R', 'OW'],
501
- 'one': ['W', 'AH', 'N'],
502
- 'two': ['T', 'UW'],
503
- 'three': ['TH', 'R', 'IY'],
504
- 'four': ['F', 'AO', 'R'],
505
- 'five': ['F', 'AY', 'V'],
506
- 'six': ['S', 'IH', 'K', 'S'],
507
- 'seven': ['S', 'EH', 'V', 'AX', 'N'],
508
- 'eight': ['EY', 'T'],
509
- 'nine': ['N', 'AY', 'N'],
510
- 'ten': ['T', 'EH', 'N'],
511
- 'hundred': ['HH', 'AH', 'N', 'D', 'R', 'AX', 'D'],
512
- 'thousand': ['TH', 'AW', 'Z', 'AX', 'N', 'D'],
513
 
514
- # ===== GREETINGS & EXPRESSIONS =====
515
- 'hello': ['HH', 'AX', 'L', 'OW'],
516
- 'hi': ['HH', 'AY'],
517
- 'hey': ['HH', 'EY'],
518
- 'goodbye': ['G', 'UH', 'D', 'B', 'AY'],
519
- 'bye': ['B', 'AY'],
520
- 'welcome': ['W', 'EH', 'L', 'K', 'AX', 'M'],
521
- 'please': ['P', 'L', 'IY', 'Z'],
522
- 'thank': ['TH', 'AE', 'NG', 'K'],
523
- 'thanks': ['TH', 'AE', 'NG', 'K', 'S'],
524
  'sorry': ['S', 'AA', 'R', 'IY'],
525
- 'yes': ['Y', 'EH', 'S'],
526
- 'yeah': ['Y', 'AE'],
527
- 'no': ['N', 'OW'],
528
- 'ok': ['OW', 'K', 'EY'],
529
- 'okay': ['OW', 'K', 'EY'],
530
 
531
- # ===== TECH & TTS =====
532
- 'text': ['T', 'EH', 'K', 'S', 'T'],
533
- 'speech': ['S', 'P', 'IY', 'CH'],
534
- 'sound': ['S', 'AW', 'N', 'D'],
535
- 'audio': ['AO', 'D', 'IY', 'OW'],
536
- 'voice': ['V', 'OY', 'S'],
537
- 'test': ['T', 'EH', 'S', 'T'],
538
- 'testing': ['T', 'EH', 'S', 'T', 'IH', 'NG'],
539
  'computer': ['K', 'AX', 'M', 'P', 'Y', 'UW', 'T', 'ER'],
540
  'vedes': ['V', 'EY', 'D', 'EH', 'S'],
541
  'system': ['S', 'IH', 'S', 'T', 'AX', 'M'],
 
542
  }
543
 
544
- # Letter patterns for unknown words
545
  PATTERNS = [
546
- ('tion', ['SH', 'AX', 'N']),
547
- ('sion', ['ZH', 'AX', 'N']),
548
- ('ness', ['N', 'AX', 'S']),
549
- ('ment', ['M', 'AX', 'N', 'T']),
550
- ('able', ['AX', 'B', 'AX', 'L']),
551
- ('ible', ['AX', 'B', 'AX', 'L']),
552
- ('ful', ['F', 'AX', 'L']),
553
- ('less', ['L', 'AX', 'S']),
554
- ('ing', ['IH', 'NG']),
555
- ('ight', ['AY', 'T']),
556
- ('ough', ['AO']),
557
- ('ould', ['UH', 'D']),
558
- ('th', ['TH']),
559
- ('sh', ['SH']),
560
- ('ch', ['CH']),
561
- ('wh', ['W']),
562
- ('ph', ['F']),
563
- ('ck', ['K']),
564
- ('ng', ['NG']),
565
- ('qu', ['K', 'W']),
566
- ('ee', ['IY']),
567
- ('ea', ['IY']),
568
- ('oo', ['UW']),
569
- ('ou', ['AW']),
570
- ('ow', ['OW']),
571
- ('ai', ['EY']),
572
- ('ay', ['EY']),
573
- ('ey', ['IY']),
574
- ('oy', ['OY']),
575
- ('oi', ['OY']),
576
- ('ie', ['IY']),
577
- ('er', ['ER']),
578
- ('ir', ['ER']),
579
- ('ur', ['ER']),
580
- ('ar', ['AA', 'R']),
581
- ('or', ['AO', 'R']),
582
  ]
583
 
584
  LETTERS = {
@@ -590,6 +365,155 @@ LETTERS = {
590
  }
591
 
592
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
593
  # ============================================
594
  # TEXT TO PHONEME CONVERTER
595
  # ============================================
@@ -645,26 +569,42 @@ class TextToPhoneme:
645
 
646
 
647
  # ============================================
648
- # IMPROVED FORMANT SYNTHESIZER
649
  # ============================================
650
 
651
- class FormantSynthesizer:
652
  def __init__(self, sample_rate=22050):
653
  self.sr = sample_rate
654
- self.f0 = 130 # Base pitch
655
 
656
- def synthesize(self, phonemes, rate=1.0, pitch=1.0):
657
  if not phonemes:
658
  return np.zeros(int(self.sr * 0.5), dtype=np.float32)
659
 
660
- f0 = self.f0 * pitch
 
 
 
 
 
 
 
 
 
661
  segments = []
662
 
663
  for i, phon in enumerate(phonemes):
664
  prev_phon = phonemes[i - 1] if i > 0 else None
665
  next_phon = phonemes[i + 1] if i < len(phonemes) - 1 else None
666
 
667
- seg = self._synth_phoneme(phon, f0, rate, prev_phon, next_phon)
 
 
 
 
 
 
 
668
  segments.append(seg)
669
 
670
  audio = self._smooth_concat(segments)
@@ -672,135 +612,104 @@ class FormantSynthesizer:
672
 
673
  return audio.astype(np.float32)
674
 
675
- def _synth_phoneme(self, phon, f0, rate, prev_phon, next_phon):
676
- # Handle silence
677
  if phon in SILENCE:
678
- dur = int(self.sr * SILENCE[phon] / 1000 / rate)
679
  return np.zeros(dur, dtype=np.float32)
680
 
681
- # Handle vowels
682
  if phon in VOWELS:
683
- return self._synth_vowel(phon, f0, rate, prev_phon, next_phon)
 
684
 
685
- # Handle consonants
686
  if phon in CONSONANTS:
687
- return self._synth_consonant(phon, f0, rate)
688
 
689
  return np.zeros(100, dtype=np.float32)
690
 
691
- def _synth_vowel(self, phon, f0, rate, prev_phon, next_phon):
692
- """Synthesize vowel with formant transitions"""
693
  params = VOWELS[phon]
694
  f1, f2, f3, dur_ms, amp, voiced = params
695
 
696
- dur_ms = dur_ms / rate
 
 
 
 
 
 
 
 
 
697
  n = int(self.sr * dur_ms / 1000)
698
  n = max(n, 100)
699
  t = np.arange(n) / self.sr
700
 
701
- # Generate glottal source
702
- source = self._glottal_source(t, f0)
703
 
704
- # Apply formants with transitions
705
- audio = self._apply_formants_smooth(source, f1, f2, f3, prev_phon, next_phon)
706
 
707
- # Apply amplitude envelope
708
  envelope = self._vowel_envelope(n)
709
  audio = audio * envelope * amp
710
 
711
  return audio
712
 
713
- def _synth_consonant(self, phon, f0, rate):
714
- """Synthesize consonant"""
715
  params = CONSONANTS[phon]
716
  ctype = params['type']
717
 
718
  if ctype == 'stop':
719
- return self._synth_stop(phon, params, f0, rate)
720
  elif ctype == 'fric':
721
- return self._synth_fricative(phon, params, f0, rate)
722
  elif ctype == 'affric':
723
- return self._synth_affricate(phon, params, f0, rate)
724
  elif ctype == 'nasal':
725
- return self._synth_nasal(phon, params, f0, rate)
726
  elif ctype == 'liquid':
727
- return self._synth_liquid(phon, params, f0, rate)
728
  elif ctype == 'glide':
729
- return self._synth_glide(phon, params, f0, rate)
730
 
731
  return np.zeros(100, dtype=np.float32)
732
 
733
- def _glottal_source(self, t, f0):
734
- """Generate glottal pulse train"""
735
- # Use Rosenberg C model
736
  T0 = 1.0 / f0
737
  phase = (t % T0) / T0
738
 
739
- # Open phase (40%)
740
  glottal = np.zeros_like(t)
741
  mask1 = phase < 0.4
742
  glottal[mask1] = 0.5 * (1 - np.cos(np.pi * phase[mask1] / 0.4))
743
 
744
- # Closing phase (20%)
745
  mask2 = (phase >= 0.4) & (phase < 0.6)
746
  glottal[mask2] = np.cos(np.pi * (phase[mask2] - 0.4) / 0.4)
747
 
748
  # Add breathiness
749
- glottal += np.random.randn(len(t)) * 0.03
750
 
751
- # Add shimmer (amplitude variation)
752
  shimmer = 1 + 0.02 * np.sin(2 * np.pi * 5 * t)
753
  glottal *= shimmer
754
 
755
  return glottal
756
 
757
- def _apply_formants_smooth(self, source, f1, f2, f3, prev_phon, next_phon):
758
- """Apply formant filtering with smooth transitions"""
759
- n = len(source)
760
-
761
- # Get target formants
762
  formants = [(f1, 90), (f2, 110), (f3, 130)]
763
-
764
- # Get transition formants from neighbors
765
- if prev_phon and prev_phon in VOWELS:
766
- pf1, pf2, pf3 = VOWELS[prev_phon][0:3]
767
- else:
768
- pf1, pf2, pf3 = f1, f2, f3
769
-
770
- if next_phon and next_phon in VOWELS:
771
- nf1, nf2, nf3 = VOWELS[next_phon][0:3]
772
- else:
773
- nf1, nf2, nf3 = f1, f2, f3
774
-
775
  result = np.zeros_like(source)
776
- trans_len = min(n // 4, 500)
777
-
778
- # Process each formant
779
- for i, (freq, bw) in enumerate(formants):
780
- if i == 0:
781
- pf, nf = pf1, nf1
782
- elif i == 1:
783
- pf, nf = pf2, nf2
784
- else:
785
- pf, nf = pf3, nf3
786
-
787
- # Create frequency trajectory
788
- freq_traj = np.ones(n) * freq
789
- freq_traj[:trans_len] = np.linspace(pf * 0.7 + freq * 0.3, freq, trans_len)
790
- freq_traj[-trans_len:] = np.linspace(freq, nf * 0.7 + freq * 0.3, trans_len)
791
-
792
- # Apply time-varying filter (simplified)
793
- filtered = self._resonator_fixed(source, freq, bw)
794
- result += filtered * (1.0 / (i + 1))
795
 
796
  return result
797
 
798
- def _resonator_fixed(self, sig, freq, bw):
799
- """Fixed frequency resonator"""
800
  if freq <= 0 or freq >= self.sr / 2:
801
  return sig
802
 
803
- # Calculate coefficients
804
  r = np.exp(-np.pi * bw / self.sr)
805
  theta = 2 * np.pi * freq / self.sr
806
 
@@ -808,7 +717,6 @@ class FormantSynthesizer:
808
  a2 = r * r
809
  b0 = 1 - r
810
 
811
- # Apply IIR filter
812
  y = np.zeros_like(sig)
813
  for i in range(2, len(sig)):
814
  y[i] = b0 * sig[i] - a1 * y[i-1] - a2 * y[i-2]
@@ -816,23 +724,28 @@ class FormantSynthesizer:
816
  return y
817
 
818
  def _vowel_envelope(self, n):
819
- """Create smooth vowel envelope"""
820
  env = np.ones(n)
821
-
822
- # Attack (10%)
823
  attack = max(1, n // 10)
824
- env[:attack] = np.sin(np.linspace(0, np.pi/2, attack)) ** 2
825
-
826
- # Release (15%)
827
  release = max(1, int(n * 0.15))
 
 
828
  env[-release:] = np.cos(np.linspace(0, np.pi/2, release)) ** 2
829
 
830
  return env
831
 
832
- def _synth_stop(self, phon, params, f0, rate):
833
- """Synthesize stop consonant"""
834
- closure_ms = params['closure'] / rate
835
- burst_ms = params['burst'] / rate
 
 
 
 
 
 
 
 
 
836
 
837
  closure_n = int(self.sr * closure_ms / 1000)
838
  burst_n = int(self.sr * burst_ms / 1000)
@@ -840,17 +753,14 @@ class FormantSynthesizer:
840
 
841
  audio = np.zeros(total_n, dtype=np.float32)
842
 
843
- # Voice bar for voiced stops
844
  if params['voiced']:
845
  t = np.arange(closure_n) / self.sr
846
  voice_bar = np.sin(2 * np.pi * f0 * 0.8 * t) * 0.15
847
  audio[:closure_n] = voice_bar
848
 
849
- # Burst
850
  burst = np.random.randn(burst_n)
 
851
 
852
- # Filter burst
853
- burst_freq = params['burst_freq']
854
  try:
855
  if burst_freq < self.sr / 2 - 100:
856
  b, a = signal.butter(2, burst_freq / (self.sr / 2), 'low')
@@ -858,7 +768,6 @@ class FormantSynthesizer:
858
  except:
859
  pass
860
 
861
- # Burst envelope
862
  burst_env = np.exp(-np.linspace(0, 5, burst_n))
863
  burst *= burst_env * params['amp']
864
 
@@ -866,15 +775,12 @@ class FormantSynthesizer:
866
 
867
  return audio
868
 
869
- def _synth_fricative(self, phon, params, f0, rate):
870
- """Synthesize fricative consonant"""
871
- dur_ms = params['dur'] / rate
872
  n = int(self.sr * dur_ms / 1000)
873
 
874
- # Generate noise
875
  noise = np.random.randn(n)
876
 
877
- # Bandpass filter
878
  low = params['freq_low']
879
  high = min(params['freq_high'], self.sr / 2 - 100)
880
 
@@ -887,34 +793,28 @@ class FormantSynthesizer:
887
 
888
  audio = noise * params['amp']
889
 
890
- # Add voicing for voiced fricatives
891
  if params['voiced']:
892
  t = np.arange(n) / self.sr
893
- voice = self._glottal_source(t, f0) * 0.3
894
  audio = audio + voice
895
 
896
- # Apply envelope
897
- env = self._consonant_envelope(n)
898
- audio *= env
899
 
900
  return audio.astype(np.float32)
901
 
902
- def _synth_affricate(self, phon, params, f0, rate):
903
- """Synthesize affricate"""
904
- closure_ms = params['closure'] / rate
905
- fric_ms = params['fric'] / rate
906
 
907
  closure_n = int(self.sr * closure_ms / 1000)
908
  fric_n = int(self.sr * fric_ms / 1000)
909
 
910
  audio = np.zeros(closure_n + fric_n, dtype=np.float32)
911
 
912
- # Closure (silence or voice bar)
913
  if params['voiced']:
914
  t = np.arange(closure_n) / self.sr
915
  audio[:closure_n] = np.sin(2 * np.pi * f0 * 0.8 * t) * 0.1
916
 
917
- # Frication
918
  fric = np.random.randn(fric_n)
919
  low = params['freq_low']
920
  high = min(params['freq_high'], self.sr / 2 - 100)
@@ -927,7 +827,6 @@ class FormantSynthesizer:
927
 
928
  fric *= params['amp']
929
 
930
- # Envelope
931
  fric_env = np.ones(fric_n)
932
  attack = fric_n // 6
933
  release = fric_n // 3
@@ -938,32 +837,22 @@ class FormantSynthesizer:
938
 
939
  return audio
940
 
941
- def _synth_nasal(self, phon, params, f0, rate):
942
- """Synthesize nasal consonant"""
943
- dur_ms = params['dur'] / rate
944
  n = int(self.sr * dur_ms / 1000)
945
  t = np.arange(n) / self.sr
946
 
947
- # Generate voiced source
948
- source = self._glottal_source(t, f0)
949
 
950
- # Apply nasal formants
951
- audio = np.zeros_like(source)
 
952
 
953
- formants = [
954
- (params['f1'], 80),
955
- (params['f2'], 100),
956
- (params['f3'], 120),
957
- ]
958
 
959
- for freq, bw in formants:
960
- audio += self._resonator_fixed(source, freq, bw)
961
-
962
- # Add low nasal resonance
963
- nasal_pole = self._resonator_fixed(source, 250, 100) * 0.4
964
  audio += nasal_pole
965
 
966
- # Anti-resonance (nasal zero)
967
  try:
968
  b, a = signal.butter(2, 800 / (self.sr / 2), 'low')
969
  audio = signal.filtfilt(b, a, audio)
@@ -974,72 +863,46 @@ class FormantSynthesizer:
974
 
975
  return audio.astype(np.float32)
976
 
977
- def _synth_liquid(self, phon, params, f0, rate):
978
- """Synthesize liquid (L, R)"""
979
- dur_ms = params['dur'] / rate
980
  n = int(self.sr * dur_ms / 1000)
981
  t = np.arange(n) / self.sr
982
 
983
- source = self._glottal_source(t, f0)
984
 
985
- audio = np.zeros_like(source)
986
- formants = [
987
- (params['f1'], 70),
988
- (params['f2'], 90),
989
- (params['f3'], 110),
990
- ]
991
-
992
- for freq, bw in formants:
993
- audio += self._resonator_fixed(source, freq, bw)
994
 
 
995
  audio *= params['amp'] * self._consonant_envelope(n)
996
 
997
  return audio.astype(np.float32)
998
 
999
- def _synth_glide(self, phon, params, f0, rate):
1000
- """Synthesize glide (W, Y)"""
1001
- dur_ms = params['dur'] / rate
1002
  n = int(self.sr * dur_ms / 1000)
1003
  t = np.arange(n) / self.sr
1004
 
1005
- source = self._glottal_source(t, f0)
1006
-
1007
- # Formant transitions
1008
- audio = np.zeros_like(source)
1009
 
1010
- f1_start, f1_end = params['f1'], params['f1'] * 1.5
1011
- f2_start, f2_end = params['f2'], params['f2'] * 1.3
1012
-
1013
- # Time-varying formants (simplified)
1014
- for i, (freq, bw) in enumerate([(params['f1'], 70), (params['f2'], 90), (params['f3'], 110)]):
1015
- audio += self._resonator_fixed(source, freq, bw) / (i + 1)
1016
 
 
1017
  audio *= params['amp'] * self._consonant_envelope(n)
1018
 
1019
  return audio.astype(np.float32)
1020
 
1021
- def _consonant_envelope(self, n):
1022
- """Create consonant envelope"""
1023
- env = np.ones(n)
1024
- attack = max(1, n // 8)
1025
- release = max(1, n // 6)
1026
-
1027
- env[:attack] = np.linspace(0.1, 1, attack)
1028
- env[-release:] = np.linspace(1, 0.1, release)
1029
-
1030
- return env
1031
-
1032
  def _smooth_concat(self, segments):
1033
- """Concatenate with crossfade"""
1034
  if not segments:
1035
  return np.zeros(1000, dtype=np.float32)
1036
 
1037
  if len(segments) == 1:
1038
  return segments[0]
1039
 
1040
- # Calculate overlap
1041
  overlap = 64
1042
-
1043
  total_len = sum(len(s) for s in segments) - overlap * (len(segments) - 1)
1044
  total_len = max(total_len, 100)
1045
 
@@ -1059,7 +922,6 @@ class FormantSynthesizer:
1059
  seg_to_add = seg[:seg_len]
1060
 
1061
  if i > 0 and pos > overlap:
1062
- # Crossfade
1063
  fade_len = min(overlap, seg_len)
1064
  fade_in = np.linspace(0, 1, fade_len) ** 0.5
1065
  fade_out = np.linspace(1, 0, fade_len) ** 0.5
@@ -1075,19 +937,14 @@ class FormantSynthesizer:
1075
  return audio
1076
 
1077
  def _normalize(self, audio):
1078
- """Normalize and apply final envelope"""
1079
  if len(audio) < 100:
1080
  return audio
1081
 
1082
- # Remove DC
1083
  audio = audio - np.mean(audio)
1084
-
1085
- # Normalize
1086
  max_val = np.max(np.abs(audio))
1087
  if max_val > 0:
1088
  audio = audio / max_val * 0.9
1089
 
1090
- # Final fade
1091
  fade = min(len(audio) // 40, 200)
1092
  audio[:fade] *= np.linspace(0, 1, fade)
1093
  audio[-fade:] *= np.linspace(1, 0, fade)
@@ -1103,20 +960,61 @@ class VedesTTS:
1103
  def __init__(self, sample_rate=22050):
1104
  self.sr = sample_rate
1105
  self.text_to_phoneme = TextToPhoneme()
1106
- self.synthesizer = FormantSynthesizer(sample_rate)
 
 
 
 
 
 
 
 
1107
 
1108
- def speak(self, text, rate=1.0, pitch=1.0):
1109
  if not text or not text.strip():
1110
  return np.zeros(self.sr, dtype=np.float32)
1111
 
 
 
 
1112
  phonemes = self.text_to_phoneme.convert(text)
1113
 
1114
  if not phonemes:
1115
  return np.zeros(self.sr, dtype=np.float32)
1116
 
1117
- audio = self.synthesizer.synthesize(phonemes, rate, pitch)
1118
 
1119
  return audio
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1120
 
1121
 
1122
  # ============================================
@@ -1124,13 +1022,13 @@ class VedesTTS:
1124
  # ============================================
1125
 
1126
  print("=" * 50)
1127
- print("πŸŽ™οΈ VEDES TTS - Clear Speech Version")
1128
- print("100% From Scratch")
1129
  print("=" * 50)
1130
 
1131
  tts = VedesTTS(SAMPLE_RATE)
1132
 
1133
  print("βœ… Ready!")
 
1134
  print("=" * 50)
1135
 
1136
 
@@ -1138,7 +1036,7 @@ print("=" * 50)
1138
  # GRADIO INTERFACE
1139
  # ============================================
1140
 
1141
- def synthesize(text, rate, pitch):
1142
  if not text or not text.strip():
1143
  return None
1144
 
@@ -1146,6 +1044,16 @@ def synthesize(text, rate, pitch):
1146
 
1147
  try:
1148
  pitch_mult = 2 ** (pitch / 12)
 
 
 
 
 
 
 
 
 
 
1149
  audio = tts.speak(text, rate=rate, pitch=pitch_mult)
1150
 
1151
  if len(audio) < 100:
@@ -1161,66 +1069,270 @@ def synthesize(text, rate, pitch):
1161
  return None
1162
 
1163
 
1164
- with gr.Blocks(title="Vedes TTS", theme=gr.themes.Soft()) as demo:
 
 
1165
 
1166
- gr.Markdown("""
1167
- # πŸŽ™οΈ Vedes TTS
1168
- ### Clear Speech Synthesis - 100% From Scratch
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1169
 
1170
- **No APIs. No pre-trained models. Pure Python.**
 
 
 
 
1171
 
1172
- Uses Klatt formant synthesis with:
1173
- - Glottal source modeling
1174
- - Formant transitions for clarity
1175
- - Proper consonant synthesis
1176
  """)
1177
 
1178
- with gr.Row():
1179
- with gr.Column(scale=2):
1180
- text_input = gr.Textbox(
1181
- label="πŸ“ Text to Speak",
1182
- placeholder="Type here... (e.g., Hello, how are you?)",
1183
- lines=3
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1184
  )
 
 
 
 
 
 
 
 
 
 
 
 
 
1185
 
1186
  with gr.Row():
1187
- rate = gr.Slider(0.6, 1.5, 0.9, step=0.1, label="⏱️ Speed")
1188
- pitch = gr.Slider(-4, 4, 0, step=1, label="🎡 Pitch")
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1189
 
1190
- btn = gr.Button("πŸ”Š Speak", variant="primary", size="lg")
1191
-
1192
- with gr.Column(scale=1):
1193
- audio_out = gr.Audio(label="🎧 Audio", type="numpy")
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1194
 
1195
- gr.Examples(
1196
- examples=[
1197
- ["Hello."],
1198
- ["How are you?"],
1199
- ["Good morning."],
1200
- ["Thank you."],
1201
- ["Yes, I can help."],
1202
- ["My name is Vedes."],
1203
- ["What is your name?"],
1204
- ["Have a nice day."],
1205
- ["This is a test."],
1206
- ["I am fine."],
1207
- ],
1208
- inputs=text_input,
1209
- label="πŸ“š Try These"
1210
- )
1211
 
1212
- gr.Markdown("""
1213
- ---
1214
- ### πŸ’‘ Tips for Better Results
 
 
1215
 
1216
- - **Keep sentences short** - 3-6 words works best
1217
- - **Use simple words** - Common words have better pronunciation
1218
- - **Slow speed** - Set rate to 0.7-0.8 for clarity
1219
- - **Add periods** - Creates natural pauses
1220
- """)
 
1221
 
1222
- btn.click(synthesize, [text_input, rate, pitch], audio_out)
1223
- text_input.submit(synthesize, [text_input, rate, pitch], audio_out)
 
 
 
 
 
1224
 
1225
 
1226
  if __name__ == "__main__":
 
4
  from scipy.io import wavfile
5
  import tempfile
6
  import re
7
+ import json
8
+ import os
9
 
10
  # ============================================
11
+ # VEDES TTS - WITH VOICE TRAINING
12
  # 100% From Scratch - No APIs
13
  # ============================================
14
 
15
  SAMPLE_RATE = 22050
16
 
17
  # ============================================
18
+ # VOICE PROFILES - Pre-defined Voices
19
+ # ============================================
20
+
21
+ VOICE_PROFILES = {
22
+ "Emma (Female)": {
23
+ "name": "Emma",
24
+ "gender": "female",
25
+ "f0": 210, # Higher pitch
26
+ "f0_variation": 30,
27
+ "formant_shift": 1.15, # Shift formants up
28
+ "breathiness": 0.04,
29
+ "speed": 1.0,
30
+ "brightness": 1.1,
31
+ "description": "Friendly female voice"
32
+ },
33
+ "James (Male)": {
34
+ "name": "James",
35
+ "gender": "male",
36
+ "f0": 110, # Lower pitch
37
+ "f0_variation": 20,
38
+ "formant_shift": 0.9, # Shift formants down
39
+ "breathiness": 0.02,
40
+ "speed": 0.95,
41
+ "brightness": 0.95,
42
+ "description": "Professional male voice"
43
+ },
44
+ "Sophie (Child)": {
45
+ "name": "Sophie",
46
+ "gender": "child",
47
+ "f0": 280, # High pitch
48
+ "f0_variation": 40,
49
+ "formant_shift": 1.25,
50
+ "breathiness": 0.03,
51
+ "speed": 1.1,
52
+ "brightness": 1.2,
53
+ "description": "Young child voice"
54
+ },
55
+ "David (Deep Male)": {
56
+ "name": "David",
57
+ "gender": "male",
58
+ "f0": 85, # Very deep
59
+ "f0_variation": 15,
60
+ "formant_shift": 0.82,
61
+ "breathiness": 0.02,
62
+ "speed": 0.9,
63
+ "brightness": 0.85,
64
+ "description": "Deep bass voice"
65
+ },
66
+ "Lisa (Bright Female)": {
67
+ "name": "Lisa",
68
+ "gender": "female",
69
+ "f0": 240,
70
+ "f0_variation": 35,
71
+ "formant_shift": 1.2,
72
+ "breathiness": 0.05,
73
+ "speed": 1.05,
74
+ "brightness": 1.15,
75
+ "description": "Bright, energetic female"
76
+ },
77
+ "Robert (Elderly Male)": {
78
+ "name": "Robert",
79
+ "gender": "male",
80
+ "f0": 95,
81
+ "f0_variation": 12,
82
+ "formant_shift": 0.88,
83
+ "breathiness": 0.06,
84
+ "speed": 0.85,
85
+ "brightness": 0.9,
86
+ "description": "Mature elderly voice"
87
+ },
88
+ "Anna (Soft Female)": {
89
+ "name": "Anna",
90
+ "gender": "female",
91
+ "f0": 195,
92
+ "f0_variation": 25,
93
+ "formant_shift": 1.1,
94
+ "breathiness": 0.07,
95
+ "speed": 0.92,
96
+ "brightness": 1.0,
97
+ "description": "Soft, gentle female"
98
+ },
99
+ "Mike (Energetic Male)": {
100
+ "name": "Mike",
101
+ "gender": "male",
102
+ "f0": 130,
103
+ "f0_variation": 30,
104
+ "formant_shift": 0.95,
105
+ "breathiness": 0.02,
106
+ "speed": 1.1,
107
+ "brightness": 1.05,
108
+ "description": "Energetic young male"
109
+ },
110
+ }
111
+
112
+ # Custom voices storage
113
+ custom_voices = {}
114
+
115
+ # ============================================
116
+ # PHONEME DATA
117
  # ============================================
118
 
 
119
  VOWELS = {
120
+ 'IY': (280, 2250, 2890, 150, 1.0, True),
121
+ 'IH': (400, 1920, 2550, 120, 0.9, True),
122
+ 'EH': (550, 1770, 2490, 130, 0.95, True),
123
+ 'AE': (690, 1660, 2490, 140, 1.0, True),
124
+ 'AA': (710, 1100, 2540, 150, 1.0, True),
125
+ 'AO': (590, 880, 2540, 140, 0.95, True),
126
+ 'UH': (470, 1100, 2540, 120, 0.9, True),
127
+ 'UW': (310, 870, 2250, 150, 1.0, True),
128
+ 'AH': (640, 1200, 2400, 100, 0.85, True),
129
+ 'AX': (500, 1500, 2500, 80, 0.7, True),
130
+ 'ER': (500, 1350, 1700, 140, 0.9, True),
131
+ 'EY': (500, 1900, 2600, 160, 1.0, True),
132
+ 'AY': (700, 1200, 2600, 180, 1.0, True),
133
+ 'OY': (500, 900, 2500, 180, 1.0, True),
134
+ 'AW': (700, 1100, 2600, 180, 1.0, True),
135
+ 'OW': (500, 900, 2500, 160, 1.0, True),
136
  }
137
 
138
  CONSONANTS = {
 
139
  'P': {'type': 'stop', 'closure': 80, 'burst': 30, 'voiced': False, 'burst_freq': 800, 'amp': 0.6},
140
  'B': {'type': 'stop', 'closure': 50, 'burst': 25, 'voiced': True, 'burst_freq': 800, 'amp': 0.7},
141
  'T': {'type': 'stop', 'closure': 70, 'burst': 30, 'voiced': False, 'burst_freq': 3500, 'amp': 0.7},
142
  'D': {'type': 'stop', 'closure': 40, 'burst': 25, 'voiced': True, 'burst_freq': 3500, 'amp': 0.7},
143
  'K': {'type': 'stop', 'closure': 80, 'burst': 40, 'voiced': False, 'burst_freq': 1500, 'amp': 0.7},
144
  'G': {'type': 'stop', 'closure': 50, 'burst': 30, 'voiced': True, 'burst_freq': 1500, 'amp': 0.7},
 
 
145
  'F': {'type': 'fric', 'dur': 120, 'freq_low': 1500, 'freq_high': 8000, 'voiced': False, 'amp': 0.4},
146
  'V': {'type': 'fric', 'dur': 80, 'freq_low': 1500, 'freq_high': 8000, 'voiced': True, 'amp': 0.5},
147
  'TH': {'type': 'fric', 'dur': 100, 'freq_low': 1400, 'freq_high': 6000, 'voiced': False, 'amp': 0.3},
 
151
  'SH': {'type': 'fric', 'dur': 120, 'freq_low': 2000, 'freq_high': 6000, 'voiced': False, 'amp': 0.5},
152
  'ZH': {'type': 'fric', 'dur': 80, 'freq_low': 2000, 'freq_high': 6000, 'voiced': True, 'amp': 0.5},
153
  'HH': {'type': 'fric', 'dur': 80, 'freq_low': 500, 'freq_high': 2000, 'voiced': False, 'amp': 0.3},
 
 
154
  'CH': {'type': 'affric', 'closure': 60, 'fric': 80, 'freq_low': 2000, 'freq_high': 6000, 'voiced': False, 'amp': 0.6},
155
  'JH': {'type': 'affric', 'closure': 40, 'fric': 60, 'freq_low': 2000, 'freq_high': 6000, 'voiced': True, 'amp': 0.6},
 
 
156
  'M': {'type': 'nasal', 'f1': 280, 'f2': 1000, 'f3': 2200, 'dur': 100, 'amp': 0.8},
157
  'N': {'type': 'nasal', 'f1': 280, 'f2': 1700, 'f3': 2500, 'dur': 90, 'amp': 0.8},
158
  'NG': {'type': 'nasal', 'f1': 300, 'f2': 2000, 'f3': 2700, 'dur': 100, 'amp': 0.8},
 
 
159
  'L': {'type': 'liquid', 'f1': 380, 'f2': 1000, 'f3': 2700, 'dur': 90, 'amp': 0.85},
160
  'R': {'type': 'liquid', 'f1': 350, 'f2': 1300, 'f3': 1700, 'dur': 90, 'amp': 0.85},
 
 
161
  'W': {'type': 'glide', 'f1': 300, 'f2': 700, 'f3': 2200, 'dur': 80, 'amp': 0.8},
162
  'Y': {'type': 'glide', 'f1': 280, 'f2': 2200, 'f3': 2900, 'dur': 70, 'amp': 0.8},
163
  }
164
 
165
+ SILENCE = {'SIL': 60, 'PAU': 200}
 
 
 
 
 
166
 
167
  # ============================================
168
+ # PRONUNCIATION DICTIONARY
169
  # ============================================
170
 
171
  DICTIONARY = {
172
+ # Function words
173
+ 'a': ['AX'], 'an': ['AE', 'N'], 'the': ['DH', 'AX'],
174
+ 'and': ['AE', 'N', 'D'], 'or': ['AO', 'R'], 'but': ['B', 'AH', 'T'],
175
+ 'if': ['IH', 'F'], 'of': ['AH', 'V'], 'to': ['T', 'UW'],
176
+ 'in': ['IH', 'N'], 'on': ['AA', 'N'], 'at': ['AE', 'T'],
177
+ 'by': ['B', 'AY'], 'for': ['F', 'AO', 'R'], 'with': ['W', 'IH', 'TH'],
178
+ 'from': ['F', 'R', 'AH', 'M'], 'up': ['AH', 'P'], 'out': ['AW', 'T'],
179
+ 'as': ['AE', 'Z'], 'so': ['S', 'OW'], 'not': ['N', 'AA', 'T'],
 
 
 
 
 
 
 
 
 
 
 
 
 
 
180
 
181
+ # Pronouns
182
+ 'i': ['AY'], 'me': ['M', 'IY'], 'my': ['M', 'AY'],
183
+ 'you': ['Y', 'UW'], 'your': ['Y', 'AO', 'R'],
184
+ 'he': ['HH', 'IY'], 'him': ['HH', 'IH', 'M'], 'his': ['HH', 'IH', 'Z'],
185
+ 'she': ['SH', 'IY'], 'her': ['HH', 'ER'],
186
+ 'it': ['IH', 'T'], 'its': ['IH', 'T', 'S'],
187
+ 'we': ['W', 'IY'], 'us': ['AH', 'S'], 'our': ['AW', 'ER'],
188
+ 'they': ['DH', 'EY'], 'them': ['DH', 'EH', 'M'], 'their': ['DH', 'EH', 'R'],
189
+ 'this': ['DH', 'IH', 'S'], 'that': ['DH', 'AE', 'T'],
190
+ 'what': ['W', 'AH', 'T'], 'who': ['HH', 'UW'],
191
+ 'where': ['W', 'EH', 'R'], 'when': ['W', 'EH', 'N'],
192
+ 'why': ['W', 'AY'], 'how': ['HH', 'AW'], 'which': ['W', 'IH', 'CH'],
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
193
 
194
+ # Be verbs
195
+ 'am': ['AE', 'M'], 'is': ['IH', 'Z'], 'are': ['AA', 'R'],
196
+ 'was': ['W', 'AA', 'Z'], 'were': ['W', 'ER'],
197
+ 'be': ['B', 'IY'], 'been': ['B', 'IH', 'N'], 'being': ['B', 'IY', 'IH', 'NG'],
 
 
 
 
 
198
 
199
+ # Have verbs
200
+ 'have': ['HH', 'AE', 'V'], 'has': ['HH', 'AE', 'Z'],
201
+ 'had': ['HH', 'AE', 'D'], 'having': ['HH', 'AE', 'V', 'IH', 'NG'],
 
 
202
 
203
+ # Do verbs
204
+ 'do': ['D', 'UW'], 'does': ['D', 'AH', 'Z'],
205
+ 'did': ['D', 'IH', 'D'], 'done': ['D', 'AH', 'N'],
 
 
 
206
 
207
+ # Modal verbs
208
+ 'will': ['W', 'IH', 'L'], 'would': ['W', 'UH', 'D'],
209
+ 'can': ['K', 'AE', 'N'], 'could': ['K', 'UH', 'D'],
210
+ 'should': ['SH', 'UH', 'D'], 'may': ['M', 'EY'],
211
+ 'might': ['M', 'AY', 'T'], 'must': ['M', 'AH', 'S', 'T'],
 
 
 
 
 
212
 
213
+ # Common verbs
214
+ 'go': ['G', 'OW'], 'goes': ['G', 'OW', 'Z'], 'going': ['G', 'OW', 'IH', 'NG'],
215
+ 'went': ['W', 'EH', 'N', 'T'], 'gone': ['G', 'AO', 'N'],
216
+ 'come': ['K', 'AH', 'M'], 'comes': ['K', 'AH', 'M', 'Z'],
217
+ 'coming': ['K', 'AH', 'M', 'IH', 'NG'], 'came': ['K', 'EY', 'M'],
218
+ 'get': ['G', 'EH', 'T'], 'gets': ['G', 'EH', 'T', 'S'],
219
+ 'getting': ['G', 'EH', 'T', 'IH', 'NG'], 'got': ['G', 'AA', 'T'],
220
+ 'make': ['M', 'EY', 'K'], 'makes': ['M', 'EY', 'K', 'S'],
221
+ 'making': ['M', 'EY', 'K', 'IH', 'NG'], 'made': ['M', 'EY', 'D'],
222
+ 'take': ['T', 'EY', 'K'], 'takes': ['T', 'EY', 'K', 'S'],
223
+ 'took': ['T', 'UH', 'K'], 'taken': ['T', 'EY', 'K', 'AX', 'N'],
224
+ 'see': ['S', 'IY'], 'sees': ['S', 'IY', 'Z'],
225
+ 'saw': ['S', 'AO'], 'seen': ['S', 'IY', 'N'],
226
+ 'say': ['S', 'EY'], 'says': ['S', 'EH', 'Z'], 'said': ['S', 'EH', 'D'],
227
+ 'know': ['N', 'OW'], 'knows': ['N', 'OW', 'Z'],
228
+ 'knew': ['N', 'UW'], 'known': ['N', 'OW', 'N'],
229
+ 'think': ['TH', 'IH', 'NG', 'K'], 'thought': ['TH', 'AO', 'T'],
230
+ 'want': ['W', 'AA', 'N', 'T'], 'wants': ['W', 'AA', 'N', 'T', 'S'],
231
+ 'give': ['G', 'IH', 'V'], 'gives': ['G', 'IH', 'V', 'Z'],
232
+ 'gave': ['G', 'EY', 'V'], 'given': ['G', 'IH', 'V', 'AX', 'N'],
233
+ 'tell': ['T', 'EH', 'L'], 'told': ['T', 'OW', 'L', 'D'],
234
+ 'ask': ['AE', 'S', 'K'], 'asked': ['AE', 'S', 'K', 'T'],
235
+ 'use': ['Y', 'UW', 'Z'], 'used': ['Y', 'UW', 'Z', 'D'],
236
+ 'find': ['F', 'AY', 'N', 'D'], 'found': ['F', 'AW', 'N', 'D'],
237
+ 'work': ['W', 'ER', 'K'], 'works': ['W', 'ER', 'K', 'S'],
238
+ 'call': ['K', 'AO', 'L'], 'called': ['K', 'AO', 'L', 'D'],
239
+ 'try': ['T', 'R', 'AY'], 'tried': ['T', 'R', 'AY', 'D'],
240
+ 'need': ['N', 'IY', 'D'], 'needs': ['N', 'IY', 'D', 'Z'],
241
+ 'feel': ['F', 'IY', 'L'], 'feels': ['F', 'IY', 'L', 'Z'],
242
+ 'help': ['HH', 'EH', 'L', 'P'], 'helps': ['HH', 'EH', 'L', 'P', 'S'],
243
+ 'keep': ['K', 'IY', 'P'], 'kept': ['K', 'EH', 'P', 'T'],
244
+ 'let': ['L', 'EH', 'T'], 'put': ['P', 'UH', 'T'],
245
+ 'seem': ['S', 'IY', 'M'], 'leave': ['L', 'IY', 'V'],
246
+ 'show': ['SH', 'OW'], 'hear': ['HH', 'IY', 'R'],
247
+ 'play': ['P', 'L', 'EY'], 'run': ['R', 'AH', 'N'],
248
+ 'move': ['M', 'UW', 'V'], 'live': ['L', 'IH', 'V'],
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
249
  'believe': ['B', 'IH', 'L', 'IY', 'V'],
250
+ 'read': ['R', 'IY', 'D'], 'write': ['R', 'AY', 'T'],
251
+ 'learn': ['L', 'ER', 'N'], 'speak': ['S', 'P', 'IY', 'K'],
252
+ 'look': ['L', 'UH', 'K'], 'like': ['L', 'AY', 'K'],
253
+ 'love': ['L', 'AH', 'V'], 'start': ['S', 'T', 'AA', 'R', 'T'],
254
+ 'stop': ['S', 'T', 'AA', 'P'],
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
255
 
256
+ # Adjectives
257
+ 'good': ['G', 'UH', 'D'], 'better': ['B', 'EH', 'T', 'ER'],
258
+ 'best': ['B', 'EH', 'S', 'T'], 'bad': ['B', 'AE', 'D'],
259
+ 'new': ['N', 'UW'], 'old': ['OW', 'L', 'D'],
260
+ 'big': ['B', 'IH', 'G'], 'small': ['S', 'M', 'AO', 'L'],
261
+ 'long': ['L', 'AO', 'NG'], 'short': ['SH', 'AO', 'R', 'T'],
262
+ 'high': ['HH', 'AY'], 'low': ['L', 'OW'],
263
+ 'great': ['G', 'R', 'EY', 'T'], 'little': ['L', 'IH', 'T', 'AX', 'L'],
264
+ 'right': ['R', 'AY', 'T'], 'wrong': ['R', 'AO', 'NG'],
265
+ 'first': ['F', 'ER', 'S', 'T'], 'last': ['L', 'AE', 'S', 'T'],
266
+ 'same': ['S', 'EY', 'M'], 'different': ['D', 'IH', 'F', 'R', 'AX', 'N', 'T'],
267
+ 'own': ['OW', 'N'], 'other': ['AH', 'DH', 'ER'],
268
+ 'nice': ['N', 'AY', 'S'], 'happy': ['HH', 'AE', 'P', 'IY'],
269
+ 'sure': ['SH', 'UH', 'R'], 'true': ['T', 'R', 'UW'],
270
+ 'real': ['R', 'IY', 'L'], 'clear': ['K', 'L', 'IY', 'R'],
271
+ 'fine': ['F', 'AY', 'N'], 'free': ['F', 'R', 'IY'],
272
+ 'easy': ['IY', 'Z', 'IY'], 'hard': ['HH', 'AA', 'R', 'D'],
273
+ 'young': ['Y', 'AH', 'NG'], 'beautiful': ['B', 'Y', 'UW', 'T', 'IH', 'F', 'AX', 'L'],
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
274
 
275
+ # Adverbs
276
+ 'very': ['V', 'EH', 'R', 'IY'], 'really': ['R', 'IY', 'L', 'IY'],
277
+ 'just': ['JH', 'AH', 'S', 'T'], 'only': ['OW', 'N', 'L', 'IY'],
278
+ 'also': ['AO', 'L', 'S', 'OW'], 'well': ['W', 'EH', 'L'],
279
+ 'now': ['N', 'AW'], 'then': ['DH', 'EH', 'N'],
280
+ 'here': ['HH', 'IY', 'R'], 'there': ['DH', 'EH', 'R'],
281
+ 'still': ['S', 'T', 'IH', 'L'], 'even': ['IY', 'V', 'AX', 'N'],
282
+ 'back': ['B', 'AE', 'K'], 'again': ['AX', 'G', 'EH', 'N'],
283
+ 'always': ['AO', 'L', 'W', 'EY', 'Z'], 'never': ['N', 'EH', 'V', 'ER'],
284
+ 'today': ['T', 'AX', 'D', 'EY'], 'maybe': ['M', 'EY', 'B', 'IY'],
285
+ 'too': ['T', 'UW'], 'much': ['M', 'AH', 'CH'],
286
+ 'more': ['M', 'AO', 'R'], 'most': ['M', 'OW', 'S', 'T'],
287
+ 'please': ['P', 'L', 'IY', 'Z'],
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
288
 
289
+ # Nouns
290
+ 'time': ['T', 'AY', 'M'], 'year': ['Y', 'IY', 'R'],
291
+ 'day': ['D', 'EY'], 'way': ['W', 'EY'],
292
+ 'man': ['M', 'AE', 'N'], 'woman': ['W', 'UH', 'M', 'AX', 'N'],
293
+ 'child': ['CH', 'AY', 'L', 'D'], 'world': ['W', 'ER', 'L', 'D'],
294
+ 'life': ['L', 'AY', 'F'], 'hand': ['HH', 'AE', 'N', 'D'],
295
+ 'part': ['P', 'AA', 'R', 'T'], 'place': ['P', 'L', 'EY', 'S'],
296
+ 'thing': ['TH', 'IH', 'NG'], 'things': ['TH', 'IH', 'NG', 'Z'],
297
+ 'people': ['P', 'IY', 'P', 'AX', 'L'], 'person': ['P', 'ER', 'S', 'AX', 'N'],
298
+ 'home': ['HH', 'OW', 'M'], 'house': ['HH', 'AW', 'S'],
299
+ 'word': ['W', 'ER', 'D'], 'name': ['N', 'EY', 'M'],
300
+ 'water': ['W', 'AO', 'T', 'ER'], 'money': ['M', 'AH', 'N', 'IY'],
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
301
  'family': ['F', 'AE', 'M', 'AX', 'L', 'IY'],
302
+ 'friend': ['F', 'R', 'EH', 'N', 'D'], 'friends': ['F', 'R', 'EH', 'N', 'D', 'Z'],
303
+ 'mother': ['M', 'AH', 'DH', 'ER'], 'father': ['F', 'AA', 'DH', 'ER'],
304
+ 'boy': ['B', 'OY'], 'girl': ['G', 'ER', 'L'],
305
+ 'head': ['HH', 'EH', 'D'], 'face': ['F', 'EY', 'S'],
306
+ 'eye': ['AY'], 'eyes': ['AY', 'Z'],
307
+ 'voice': ['V', 'OY', 'S'], 'night': ['N', 'AY', 'T'],
 
 
 
 
 
 
 
 
 
308
  'morning': ['M', 'AO', 'R', 'N', 'IH', 'NG'],
309
+ 'week': ['W', 'IY', 'K'], 'month': ['M', 'AH', 'N', 'TH'],
310
+ 'school': ['S', 'K', 'UW', 'L'], 'book': ['B', 'UH', 'K'],
 
 
 
 
 
311
  'story': ['S', 'T', 'AO', 'R', 'IY'],
312
  'question': ['K', 'W', 'EH', 'S', 'CH', 'AX', 'N'],
313
  'answer': ['AE', 'N', 'S', 'ER'],
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
314
 
315
+ # Numbers
316
+ 'zero': ['Z', 'IY', 'R', 'OW'], 'one': ['W', 'AH', 'N'],
317
+ 'two': ['T', 'UW'], 'three': ['TH', 'R', 'IY'],
318
+ 'four': ['F', 'AO', 'R'], 'five': ['F', 'AY', 'V'],
319
+ 'six': ['S', 'IH', 'K', 'S'], 'seven': ['S', 'EH', 'V', 'AX', 'N'],
320
+ 'eight': ['EY', 'T'], 'nine': ['N', 'AY', 'N'], 'ten': ['T', 'EH', 'N'],
 
 
 
 
 
 
 
 
321
 
322
+ # Greetings
323
+ 'hello': ['HH', 'AX', 'L', 'OW'], 'hi': ['HH', 'AY'],
324
+ 'hey': ['HH', 'EY'], 'goodbye': ['G', 'UH', 'D', 'B', 'AY'],
325
+ 'bye': ['B', 'AY'], 'welcome': ['W', 'EH', 'L', 'K', 'AX', 'M'],
326
+ 'thank': ['TH', 'AE', 'NG', 'K'], 'thanks': ['TH', 'AE', 'NG', 'K', 'S'],
 
 
 
 
 
327
  'sorry': ['S', 'AA', 'R', 'IY'],
328
+ 'yes': ['Y', 'EH', 'S'], 'yeah': ['Y', 'AE'], 'no': ['N', 'OW'],
329
+ 'ok': ['OW', 'K', 'EY'], 'okay': ['OW', 'K', 'EY'],
 
 
 
330
 
331
+ # Tech/TTS
332
+ 'text': ['T', 'EH', 'K', 'S', 'T'], 'speech': ['S', 'P', 'IY', 'CH'],
333
+ 'sound': ['S', 'AW', 'N', 'D'], 'audio': ['AO', 'D', 'IY', 'OW'],
334
+ 'test': ['T', 'EH', 'S', 'T'], 'testing': ['T', 'EH', 'S', 'T', 'IH', 'NG'],
 
 
 
 
335
  'computer': ['K', 'AX', 'M', 'P', 'Y', 'UW', 'T', 'ER'],
336
  'vedes': ['V', 'EY', 'D', 'EH', 'S'],
337
  'system': ['S', 'IH', 'S', 'T', 'AX', 'M'],
338
+ 'train': ['T', 'R', 'EY', 'N'], 'training': ['T', 'R', 'EY', 'N', 'IH', 'NG'],
339
  }
340
 
341
+ # Letter patterns
342
  PATTERNS = [
343
+ ('tion', ['SH', 'AX', 'N']), ('sion', ['ZH', 'AX', 'N']),
344
+ ('ness', ['N', 'AX', 'S']), ('ment', ['M', 'AX', 'N', 'T']),
345
+ ('able', ['AX', 'B', 'AX', 'L']), ('ible', ['AX', 'B', 'AX', 'L']),
346
+ ('ful', ['F', 'AX', 'L']), ('less', ['L', 'AX', 'S']),
347
+ ('ing', ['IH', 'NG']), ('ight', ['AY', 'T']),
348
+ ('ough', ['AO']), ('ould', ['UH', 'D']),
349
+ ('th', ['TH']), ('sh', ['SH']), ('ch', ['CH']),
350
+ ('wh', ['W']), ('ph', ['F']), ('ck', ['K']), ('ng', ['NG']),
351
+ ('qu', ['K', 'W']), ('ee', ['IY']), ('ea', ['IY']),
352
+ ('oo', ['UW']), ('ou', ['AW']), ('ow', ['OW']),
353
+ ('ai', ['EY']), ('ay', ['EY']), ('ey', ['IY']),
354
+ ('oy', ['OY']), ('oi', ['OY']), ('ie', ['IY']),
355
+ ('er', ['ER']), ('ir', ['ER']), ('ur', ['ER']),
356
+ ('ar', ['AA', 'R']), ('or', ['AO', 'R']),
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
357
  ]
358
 
359
  LETTERS = {
 
365
  }
366
 
367
 
368
+ # ============================================
369
+ # VOICE ANALYZER - Extract Voice Features
370
+ # ============================================
371
+
372
+ class VoiceAnalyzer:
373
+ """Analyze audio to extract voice characteristics"""
374
+
375
+ def __init__(self, sample_rate=22050):
376
+ self.sr = sample_rate
377
+
378
+ def analyze(self, audio):
379
+ """Extract voice features from audio sample"""
380
+ if len(audio) < self.sr * 0.5:
381
+ return None
382
+
383
+ # Normalize
384
+ audio = audio.astype(np.float32)
385
+ audio = audio / (np.max(np.abs(audio)) + 1e-8)
386
+
387
+ # Extract features
388
+ f0 = self._estimate_pitch(audio)
389
+ formants = self._estimate_formants(audio)
390
+ breathiness = self._estimate_breathiness(audio)
391
+
392
+ # Create voice profile
393
+ profile = {
394
+ "name": "Custom Voice",
395
+ "gender": "custom",
396
+ "f0": f0,
397
+ "f0_variation": self._estimate_f0_variation(audio, f0),
398
+ "formant_shift": formants.get('shift', 1.0),
399
+ "breathiness": breathiness,
400
+ "speed": 1.0,
401
+ "brightness": formants.get('brightness', 1.0),
402
+ "description": "Voice extracted from audio sample"
403
+ }
404
+
405
+ return profile
406
+
407
+ def _estimate_pitch(self, audio):
408
+ """Estimate fundamental frequency (F0)"""
409
+ # Use autocorrelation
410
+ frame_size = int(self.sr * 0.03) # 30ms frames
411
+
412
+ pitches = []
413
+ for i in range(0, len(audio) - frame_size, frame_size):
414
+ frame = audio[i:i + frame_size]
415
+
416
+ # Autocorrelation
417
+ corr = np.correlate(frame, frame, mode='full')
418
+ corr = corr[len(corr)//2:]
419
+
420
+ # Find first peak after initial decline
421
+ d = np.diff(corr)
422
+ start = np.where(d > 0)[0]
423
+
424
+ if len(start) > 0:
425
+ start = start[0]
426
+ peak = start + np.argmax(corr[start:start + int(self.sr / 80)])
427
+
428
+ if peak > 0:
429
+ f0 = self.sr / peak
430
+ if 60 < f0 < 400:
431
+ pitches.append(f0)
432
+
433
+ if pitches:
434
+ return np.median(pitches)
435
+ return 130 # Default
436
+
437
+ def _estimate_f0_variation(self, audio, base_f0):
438
+ """Estimate pitch variation"""
439
+ frame_size = int(self.sr * 0.03)
440
+ pitches = []
441
+
442
+ for i in range(0, len(audio) - frame_size, frame_size):
443
+ frame = audio[i:i + frame_size]
444
+ corr = np.correlate(frame, frame, mode='full')
445
+ corr = corr[len(corr)//2:]
446
+
447
+ d = np.diff(corr)
448
+ start = np.where(d > 0)[0]
449
+
450
+ if len(start) > 0:
451
+ start = start[0]
452
+ peak = start + np.argmax(corr[start:start + int(self.sr / 80)])
453
+ if peak > 0:
454
+ f0 = self.sr / peak
455
+ if 60 < f0 < 400:
456
+ pitches.append(f0)
457
+
458
+ if len(pitches) > 2:
459
+ return min(np.std(pitches), 50)
460
+ return 20
461
+
462
+ def _estimate_formants(self, audio):
463
+ """Estimate formant characteristics"""
464
+ # Simple spectral analysis
465
+ frame_size = 2048
466
+
467
+ if len(audio) < frame_size:
468
+ return {'shift': 1.0, 'brightness': 1.0}
469
+
470
+ # Get spectrum
471
+ spectrum = np.abs(np.fft.rfft(audio[:frame_size] * np.hanning(frame_size)))
472
+ freqs = np.fft.rfftfreq(frame_size, 1/self.sr)
473
+
474
+ # Find spectral centroid
475
+ centroid = np.sum(freqs * spectrum) / (np.sum(spectrum) + 1e-8)
476
+
477
+ # Estimate formant shift based on centroid
478
+ # Average male ~1200Hz, female ~1400Hz
479
+ if centroid > 1600:
480
+ shift = 1.2
481
+ brightness = 1.15
482
+ elif centroid > 1400:
483
+ shift = 1.1
484
+ brightness = 1.05
485
+ elif centroid > 1200:
486
+ shift = 1.0
487
+ brightness = 1.0
488
+ elif centroid > 1000:
489
+ shift = 0.9
490
+ brightness = 0.95
491
+ else:
492
+ shift = 0.85
493
+ brightness = 0.9
494
+
495
+ return {'shift': shift, 'brightness': brightness}
496
+
497
+ def _estimate_breathiness(self, audio):
498
+ """Estimate breathiness/aspiration"""
499
+ frame_size = 2048
500
+
501
+ if len(audio) < frame_size:
502
+ return 0.03
503
+
504
+ spectrum = np.abs(np.fft.rfft(audio[:frame_size]))
505
+ freqs = np.fft.rfftfreq(frame_size, 1/self.sr)
506
+
507
+ # High frequency energy ratio (breathiness indicator)
508
+ low_energy = np.sum(spectrum[freqs < 1000])
509
+ high_energy = np.sum(spectrum[(freqs > 2000) & (freqs < 5000)])
510
+
511
+ ratio = high_energy / (low_energy + 1e-8)
512
+ breathiness = np.clip(ratio * 0.1, 0.02, 0.1)
513
+
514
+ return breathiness
515
+
516
+
517
  # ============================================
518
  # TEXT TO PHONEME CONVERTER
519
  # ============================================
 
569
 
570
 
571
  # ============================================
572
+ # VOICE-AWARE FORMANT SYNTHESIZER
573
  # ============================================
574
 
575
+ class VoiceSynthesizer:
576
  def __init__(self, sample_rate=22050):
577
  self.sr = sample_rate
578
+ self.default_voice = VOICE_PROFILES["Emma (Female)"]
579
 
580
+ def synthesize(self, phonemes, voice_profile=None, rate=1.0, pitch=1.0):
581
  if not phonemes:
582
  return np.zeros(int(self.sr * 0.5), dtype=np.float32)
583
 
584
+ voice = voice_profile or self.default_voice
585
+
586
+ # Get voice parameters
587
+ f0 = voice.get('f0', 130) * pitch
588
+ f0_var = voice.get('f0_variation', 20)
589
+ formant_shift = voice.get('formant_shift', 1.0)
590
+ breathiness = voice.get('breathiness', 0.03)
591
+ voice_speed = voice.get('speed', 1.0) * rate
592
+ brightness = voice.get('brightness', 1.0)
593
+
594
  segments = []
595
 
596
  for i, phon in enumerate(phonemes):
597
  prev_phon = phonemes[i - 1] if i > 0 else None
598
  next_phon = phonemes[i + 1] if i < len(phonemes) - 1 else None
599
 
600
+ # Add pitch variation
601
+ phrase_pos = i / max(len(phonemes), 1)
602
+ f0_current = f0 + f0_var * np.sin(phrase_pos * np.pi) * 0.5
603
+
604
+ seg = self._synth_phoneme(
605
+ phon, f0_current, voice_speed, formant_shift,
606
+ breathiness, brightness, prev_phon, next_phon
607
+ )
608
  segments.append(seg)
609
 
610
  audio = self._smooth_concat(segments)
 
612
 
613
  return audio.astype(np.float32)
614
 
615
+ def _synth_phoneme(self, phon, f0, speed, formant_shift, breathiness,
616
+ brightness, prev_phon, next_phon):
617
  if phon in SILENCE:
618
+ dur = int(self.sr * SILENCE[phon] / 1000 / speed)
619
  return np.zeros(dur, dtype=np.float32)
620
 
 
621
  if phon in VOWELS:
622
+ return self._synth_vowel(phon, f0, speed, formant_shift,
623
+ breathiness, brightness, prev_phon, next_phon)
624
 
 
625
  if phon in CONSONANTS:
626
+ return self._synth_consonant(phon, f0, speed, formant_shift, breathiness)
627
 
628
  return np.zeros(100, dtype=np.float32)
629
 
630
+ def _synth_vowel(self, phon, f0, speed, formant_shift, breathiness,
631
+ brightness, prev_phon, next_phon):
632
  params = VOWELS[phon]
633
  f1, f2, f3, dur_ms, amp, voiced = params
634
 
635
+ # Apply formant shift
636
+ f1 = f1 * formant_shift
637
+ f2 = f2 * formant_shift
638
+ f3 = f3 * formant_shift
639
+
640
+ # Apply brightness
641
+ f2 = f2 * brightness
642
+ f3 = f3 * brightness
643
+
644
+ dur_ms = dur_ms / speed
645
  n = int(self.sr * dur_ms / 1000)
646
  n = max(n, 100)
647
  t = np.arange(n) / self.sr
648
 
649
+ # Generate glottal source with voice characteristics
650
+ source = self._glottal_source(t, f0, breathiness)
651
 
652
+ # Apply formants
653
+ audio = self._apply_formants(source, f1, f2, f3)
654
 
655
+ # Apply envelope
656
  envelope = self._vowel_envelope(n)
657
  audio = audio * envelope * amp
658
 
659
  return audio
660
 
661
+ def _synth_consonant(self, phon, f0, speed, formant_shift, breathiness):
 
662
  params = CONSONANTS[phon]
663
  ctype = params['type']
664
 
665
  if ctype == 'stop':
666
+ return self._synth_stop(phon, params, f0, speed, formant_shift)
667
  elif ctype == 'fric':
668
+ return self._synth_fricative(phon, params, f0, speed)
669
  elif ctype == 'affric':
670
+ return self._synth_affricate(phon, params, f0, speed)
671
  elif ctype == 'nasal':
672
+ return self._synth_nasal(phon, params, f0, speed, formant_shift, breathiness)
673
  elif ctype == 'liquid':
674
+ return self._synth_liquid(phon, params, f0, speed, formant_shift, breathiness)
675
  elif ctype == 'glide':
676
+ return self._synth_glide(phon, params, f0, speed, formant_shift, breathiness)
677
 
678
  return np.zeros(100, dtype=np.float32)
679
 
680
+ def _glottal_source(self, t, f0, breathiness):
 
 
681
  T0 = 1.0 / f0
682
  phase = (t % T0) / T0
683
 
 
684
  glottal = np.zeros_like(t)
685
  mask1 = phase < 0.4
686
  glottal[mask1] = 0.5 * (1 - np.cos(np.pi * phase[mask1] / 0.4))
687
 
 
688
  mask2 = (phase >= 0.4) & (phase < 0.6)
689
  glottal[mask2] = np.cos(np.pi * (phase[mask2] - 0.4) / 0.4)
690
 
691
  # Add breathiness
692
+ glottal += np.random.randn(len(t)) * breathiness
693
 
694
+ # Add shimmer
695
  shimmer = 1 + 0.02 * np.sin(2 * np.pi * 5 * t)
696
  glottal *= shimmer
697
 
698
  return glottal
699
 
700
+ def _apply_formants(self, source, f1, f2, f3):
 
 
 
 
701
  formants = [(f1, 90), (f2, 110), (f3, 130)]
 
 
 
 
 
 
 
 
 
 
 
 
702
  result = np.zeros_like(source)
703
+
704
+ for freq, bw in formants:
705
+ result += self._resonator(source, freq, bw)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
706
 
707
  return result
708
 
709
+ def _resonator(self, sig, freq, bw):
 
710
  if freq <= 0 or freq >= self.sr / 2:
711
  return sig
712
 
 
713
  r = np.exp(-np.pi * bw / self.sr)
714
  theta = 2 * np.pi * freq / self.sr
715
 
 
717
  a2 = r * r
718
  b0 = 1 - r
719
 
 
720
  y = np.zeros_like(sig)
721
  for i in range(2, len(sig)):
722
  y[i] = b0 * sig[i] - a1 * y[i-1] - a2 * y[i-2]
 
724
  return y
725
 
726
  def _vowel_envelope(self, n):
 
727
  env = np.ones(n)
 
 
728
  attack = max(1, n // 10)
 
 
 
729
  release = max(1, int(n * 0.15))
730
+
731
+ env[:attack] = np.sin(np.linspace(0, np.pi/2, attack)) ** 2
732
  env[-release:] = np.cos(np.linspace(0, np.pi/2, release)) ** 2
733
 
734
  return env
735
 
736
+ def _consonant_envelope(self, n):
737
+ env = np.ones(n)
738
+ attack = max(1, n // 8)
739
+ release = max(1, n // 6)
740
+
741
+ env[:attack] = np.linspace(0.1, 1, attack)
742
+ env[-release:] = np.linspace(1, 0.1, release)
743
+
744
+ return env
745
+
746
+ def _synth_stop(self, phon, params, f0, speed, formant_shift):
747
+ closure_ms = params['closure'] / speed
748
+ burst_ms = params['burst'] / speed
749
 
750
  closure_n = int(self.sr * closure_ms / 1000)
751
  burst_n = int(self.sr * burst_ms / 1000)
 
753
 
754
  audio = np.zeros(total_n, dtype=np.float32)
755
 
 
756
  if params['voiced']:
757
  t = np.arange(closure_n) / self.sr
758
  voice_bar = np.sin(2 * np.pi * f0 * 0.8 * t) * 0.15
759
  audio[:closure_n] = voice_bar
760
 
 
761
  burst = np.random.randn(burst_n)
762
+ burst_freq = params['burst_freq'] * formant_shift
763
 
 
 
764
  try:
765
  if burst_freq < self.sr / 2 - 100:
766
  b, a = signal.butter(2, burst_freq / (self.sr / 2), 'low')
 
768
  except:
769
  pass
770
 
 
771
  burst_env = np.exp(-np.linspace(0, 5, burst_n))
772
  burst *= burst_env * params['amp']
773
 
 
775
 
776
  return audio
777
 
778
+ def _synth_fricative(self, phon, params, f0, speed):
779
+ dur_ms = params['dur'] / speed
 
780
  n = int(self.sr * dur_ms / 1000)
781
 
 
782
  noise = np.random.randn(n)
783
 
 
784
  low = params['freq_low']
785
  high = min(params['freq_high'], self.sr / 2 - 100)
786
 
 
793
 
794
  audio = noise * params['amp']
795
 
 
796
  if params['voiced']:
797
  t = np.arange(n) / self.sr
798
+ voice = self._glottal_source(t, f0, 0.03) * 0.3
799
  audio = audio + voice
800
 
801
+ audio *= self._consonant_envelope(n)
 
 
802
 
803
  return audio.astype(np.float32)
804
 
805
+ def _synth_affricate(self, phon, params, f0, speed):
806
+ closure_ms = params['closure'] / speed
807
+ fric_ms = params['fric'] / speed
 
808
 
809
  closure_n = int(self.sr * closure_ms / 1000)
810
  fric_n = int(self.sr * fric_ms / 1000)
811
 
812
  audio = np.zeros(closure_n + fric_n, dtype=np.float32)
813
 
 
814
  if params['voiced']:
815
  t = np.arange(closure_n) / self.sr
816
  audio[:closure_n] = np.sin(2 * np.pi * f0 * 0.8 * t) * 0.1
817
 
 
818
  fric = np.random.randn(fric_n)
819
  low = params['freq_low']
820
  high = min(params['freq_high'], self.sr / 2 - 100)
 
827
 
828
  fric *= params['amp']
829
 
 
830
  fric_env = np.ones(fric_n)
831
  attack = fric_n // 6
832
  release = fric_n // 3
 
837
 
838
  return audio
839
 
840
+ def _synth_nasal(self, phon, params, f0, speed, formant_shift, breathiness):
841
+ dur_ms = params['dur'] / speed
 
842
  n = int(self.sr * dur_ms / 1000)
843
  t = np.arange(n) / self.sr
844
 
845
+ source = self._glottal_source(t, f0, breathiness)
 
846
 
847
+ f1 = params['f1'] * formant_shift
848
+ f2 = params['f2'] * formant_shift
849
+ f3 = params['f3'] * formant_shift
850
 
851
+ audio = self._apply_formants(source, f1, f2, f3)
 
 
 
 
852
 
853
+ nasal_pole = self._resonator(source, 250, 100) * 0.4
 
 
 
 
854
  audio += nasal_pole
855
 
 
856
  try:
857
  b, a = signal.butter(2, 800 / (self.sr / 2), 'low')
858
  audio = signal.filtfilt(b, a, audio)
 
863
 
864
  return audio.astype(np.float32)
865
 
866
+ def _synth_liquid(self, phon, params, f0, speed, formant_shift, breathiness):
867
+ dur_ms = params['dur'] / speed
 
868
  n = int(self.sr * dur_ms / 1000)
869
  t = np.arange(n) / self.sr
870
 
871
+ source = self._glottal_source(t, f0, breathiness)
872
 
873
+ f1 = params['f1'] * formant_shift
874
+ f2 = params['f2'] * formant_shift
875
+ f3 = params['f3'] * formant_shift
 
 
 
 
 
 
876
 
877
+ audio = self._apply_formants(source, f1, f2, f3)
878
  audio *= params['amp'] * self._consonant_envelope(n)
879
 
880
  return audio.astype(np.float32)
881
 
882
+ def _synth_glide(self, phon, params, f0, speed, formant_shift, breathiness):
883
+ dur_ms = params['dur'] / speed
 
884
  n = int(self.sr * dur_ms / 1000)
885
  t = np.arange(n) / self.sr
886
 
887
+ source = self._glottal_source(t, f0, breathiness)
 
 
 
888
 
889
+ f1 = params['f1'] * formant_shift
890
+ f2 = params['f2'] * formant_shift
891
+ f3 = params['f3'] * formant_shift
 
 
 
892
 
893
+ audio = self._apply_formants(source, f1, f2, f3)
894
  audio *= params['amp'] * self._consonant_envelope(n)
895
 
896
  return audio.astype(np.float32)
897
 
 
 
 
 
 
 
 
 
 
 
 
898
  def _smooth_concat(self, segments):
 
899
  if not segments:
900
  return np.zeros(1000, dtype=np.float32)
901
 
902
  if len(segments) == 1:
903
  return segments[0]
904
 
 
905
  overlap = 64
 
906
  total_len = sum(len(s) for s in segments) - overlap * (len(segments) - 1)
907
  total_len = max(total_len, 100)
908
 
 
922
  seg_to_add = seg[:seg_len]
923
 
924
  if i > 0 and pos > overlap:
 
925
  fade_len = min(overlap, seg_len)
926
  fade_in = np.linspace(0, 1, fade_len) ** 0.5
927
  fade_out = np.linspace(1, 0, fade_len) ** 0.5
 
937
  return audio
938
 
939
  def _normalize(self, audio):
 
940
  if len(audio) < 100:
941
  return audio
942
 
 
943
  audio = audio - np.mean(audio)
 
 
944
  max_val = np.max(np.abs(audio))
945
  if max_val > 0:
946
  audio = audio / max_val * 0.9
947
 
 
948
  fade = min(len(audio) // 40, 200)
949
  audio[:fade] *= np.linspace(0, 1, fade)
950
  audio[-fade:] *= np.linspace(1, 0, fade)
 
960
    def __init__(self, sample_rate=22050):
        """Assemble the TTS pipeline.

        Args:
            sample_rate: output sample rate in Hz (default 22050).
        """
        self.sr = sample_rate
        # Grapheme-to-phoneme converter (text front-end).
        self.text_to_phoneme = TextToPhoneme()
        # Formant synthesizer that renders phoneme sequences to audio.
        self.synthesizer = VoiceSynthesizer(sample_rate)
        # Extracts voice parameters from recorded samples (voice training).
        self.voice_analyzer = VoiceAnalyzer(sample_rate)
        # Active voice profile; defaults to the built-in Emma voice.
        self.current_voice = VOICE_PROFILES["Emma (Female)"]
967
+ def set_voice(self, voice_name):
968
+ if voice_name in VOICE_PROFILES:
969
+ self.current_voice = VOICE_PROFILES[voice_name]
970
+ elif voice_name in custom_voices:
971
+ self.current_voice = custom_voices[voice_name]
972
 
973
+ def speak(self, text, rate=1.0, pitch=1.0, voice_name=None):
974
  if not text or not text.strip():
975
  return np.zeros(self.sr, dtype=np.float32)
976
 
977
+ if voice_name:
978
+ self.set_voice(voice_name)
979
+
980
  phonemes = self.text_to_phoneme.convert(text)
981
 
982
  if not phonemes:
983
  return np.zeros(self.sr, dtype=np.float32)
984
 
985
+ audio = self.synthesizer.synthesize(phonemes, self.current_voice, rate, pitch)
986
 
987
  return audio
988
+
989
+ def train_voice(self, audio_data, voice_name="My Voice"):
990
+ """Train a new voice from audio sample"""
991
+ if audio_data is None:
992
+ return None
993
+
994
+ # Handle different input formats
995
+ if isinstance(audio_data, tuple):
996
+ sr, audio = audio_data
997
+ audio = audio.astype(np.float32)
998
+ if sr != self.sr:
999
+ # Resample
1000
+ duration = len(audio) / sr
1001
+ new_length = int(duration * self.sr)
1002
+ audio = signal.resample(audio, new_length)
1003
+ else:
1004
+ audio = audio_data.astype(np.float32)
1005
+
1006
+ # Normalize
1007
+ audio = audio / (np.max(np.abs(audio)) + 1e-8)
1008
+
1009
+ # Analyze voice
1010
+ profile = self.voice_analyzer.analyze(audio)
1011
+
1012
+ if profile:
1013
+ profile['name'] = voice_name
1014
+ custom_voices[voice_name] = profile
1015
+ return profile
1016
+
1017
+ return None
1018
 
1019
 
1020
  # ============================================
 
1022
  # ============================================
1023
 
1024
  print("=" * 50)
1025
+ print("πŸŽ™οΈ VEDES TTS - With Voice Training")
 
1026
  print("=" * 50)
1027
 
1028
  tts = VedesTTS(SAMPLE_RATE)
1029
 
1030
  print("βœ… Ready!")
1031
+ print(f"πŸ“’ Available voices: {len(VOICE_PROFILES)}")
1032
  print("=" * 50)
1033
 
1034
 
 
1036
  # GRADIO INTERFACE
1037
  # ============================================
1038
 
1039
+ def synthesize(text, voice_name, rate, pitch):
1040
  if not text or not text.strip():
1041
  return None
1042
 
 
1044
 
1045
  try:
1046
  pitch_mult = 2 ** (pitch / 12)
1047
+
1048
+ # Check custom voices first
1049
+ if voice_name in custom_voices:
1050
+ voice = custom_voices[voice_name]
1051
+ elif voice_name in VOICE_PROFILES:
1052
+ voice = VOICE_PROFILES[voice_name]
1053
+ else:
1054
+ voice = VOICE_PROFILES["Emma (Female)"]
1055
+
1056
+ tts.current_voice = voice
1057
  audio = tts.speak(text, rate=rate, pitch=pitch_mult)
1058
 
1059
  if len(audio) < 100:
 
1069
  return None
1070
 
1071
 
1072
def train_voice(audio, voice_name):
    """Gradio callback: build a custom voice profile from a recording.

    Returns a (status_markdown, dropdown_update) pair; the dropdown update
    always refreshes the choices so a newly trained voice appears.
    """
    if audio is None:
        return "❌ No audio provided", gr.update(choices=get_all_voices())

    # Fall back to a default label, then trim and cap the length.
    if not voice_name or not voice_name.strip():
        voice_name = "My Voice"
    voice_name = voice_name.strip()[:30]

    try:
        profile = tts.train_voice(audio, voice_name)

        if not profile:
            return "❌ Could not analyze voice. Try a longer sample.", gr.update(choices=get_all_voices())

        details = f"""
βœ… Voice "{voice_name}" created successfully!

**Voice Parameters:**
- Pitch (F0): {profile['f0']:.1f} Hz
- Pitch Variation: {profile['f0_variation']:.1f} Hz
- Formant Shift: {profile['formant_shift']:.2f}
- Breathiness: {profile['breathiness']:.3f}
- Brightness: {profile['brightness']:.2f}
"""
        return details, gr.update(choices=get_all_voices(), value=voice_name)

    except Exception as e:
        return f"❌ Error: {str(e)}", gr.update(choices=get_all_voices())
1102
+
1103
def get_all_voices():
    """Return every selectable voice name: built-ins first, then trained."""
    return [*VOICE_PROFILES, *custom_voices]
1107
+
1108
def get_voice_info(voice_name):
    """Markdown summary for the selected voice, or a prompt if unknown."""
    v = VOICE_PROFILES.get(voice_name)
    if v is None:
        v = custom_voices.get(voice_name)
    if v is None:
        return "Select a voice"

    return f"""
**{v.get('name', voice_name)}**
- Type: {v.get('gender', 'unknown').title()}
- Pitch: {v.get('f0', 130):.0f} Hz
- {v.get('description', '')}
"""
1123
+
1124
def create_custom_voice(name, pitch, formant, breathiness, speed, brightness):
    """Register a manually parameterized voice profile.

    Args:
        name: display/storage name; trimmed and capped at 30 characters,
            consistent with the voice-training path.
        pitch: base F0 in Hz.
        formant: vocal-tract formant scale factor.
        breathiness: slider value (1-10), stored as a 0.01-0.10 fraction.
        speed: natural speaking-rate multiplier.
        brightness: upper-formant emphasis factor.

    Returns:
        (status_message, dropdown_update) for the Gradio UI.
    """
    if not name or not name.strip():
        return "❌ Please enter a voice name", gr.update(choices=get_all_voices())

    # Cap the name length like the trained-voice path does.
    name = name.strip()[:30]

    profile = {
        "name": name,
        "gender": "custom",
        "f0": pitch,
        "f0_variation": 25,
        "formant_shift": formant,
        "breathiness": breathiness / 100,
        "speed": speed,
        "brightness": brightness,
        "description": f"Custom voice (pitch={pitch}Hz)"
    }

    custom_voices[name] = profile

    return f"βœ… Voice '{name}' created!", gr.update(choices=get_all_voices(), value=name)
1146
+
1147
+ # Build interface
1148
+ with gr.Blocks(title="Vedes TTS", theme=gr.themes.Soft()) as demo:
1149
 
1150
+ gr.Markdown("""
1151
+ # πŸŽ™οΈ Vedes TTS - Voice Training Edition
1152
+ ### Create and Use Custom Voices - 100% From Scratch
 
1153
  """)
1154
 
1155
+ with gr.Tabs():
1156
+ # ===== SPEAK TAB =====
1157
+ with gr.TabItem("πŸ”Š Speak"):
1158
+ with gr.Row():
1159
+ with gr.Column(scale=2):
1160
+ text_input = gr.Textbox(
1161
+ label="πŸ“ Text to Speak",
1162
+ placeholder="Type here...",
1163
+ lines=3
1164
+ )
1165
+
1166
+ voice_select = gr.Dropdown(
1167
+ choices=get_all_voices(),
1168
+ value="Emma (Female)",
1169
+ label="πŸ—£οΈ Voice"
1170
+ )
1171
+
1172
+ voice_info = gr.Markdown("Select a voice")
1173
+
1174
+ with gr.Row():
1175
+ rate = gr.Slider(0.6, 1.5, 0.9, step=0.1, label="⏱️ Speed")
1176
+ pitch = gr.Slider(-6, 6, 0, step=1, label="🎡 Pitch")
1177
+
1178
+ speak_btn = gr.Button("πŸ”Š Speak", variant="primary", size="lg")
1179
+
1180
+ with gr.Column(scale=1):
1181
+ audio_out = gr.Audio(label="🎧 Output", type="numpy")
1182
+
1183
+ gr.Examples(
1184
+ examples=[
1185
+ ["Hello, how are you?"],
1186
+ ["Good morning!"],
1187
+ ["My name is Vedes."],
1188
+ ["Thank you very much."],
1189
+ ["Have a nice day."],
1190
+ ],
1191
+ inputs=text_input,
1192
+ label="πŸ“š Examples"
1193
  )
1194
+
1195
+ # ===== TRAIN VOICE TAB =====
1196
+ with gr.TabItem("🎀 Train Voice"):
1197
+ gr.Markdown("""
1198
+ ### Train a New Voice from Audio
1199
+
1200
+ Record or upload an audio sample, and Vedes will extract the voice characteristics.
1201
+
1202
+ **Tips for best results:**
1203
+ - Record 5-10 seconds of speech
1204
+ - Speak clearly and naturally
1205
+ - Avoid background noise
1206
+ """)
1207
 
1208
  with gr.Row():
1209
+ with gr.Column():
1210
+ audio_input = gr.Audio(
1211
+ label="🎀 Record or Upload Audio",
1212
+ sources=["microphone", "upload"],
1213
+ type="numpy"
1214
+ )
1215
+
1216
+ voice_name_input = gr.Textbox(
1217
+ label="Voice Name",
1218
+ placeholder="My Voice",
1219
+ value="My Voice"
1220
+ )
1221
+
1222
+ train_btn = gr.Button("🧠 Train Voice", variant="primary")
1223
+
1224
+ with gr.Column():
1225
+ train_result = gr.Markdown("Upload audio and click Train")
1226
+ trained_voice_select = gr.Dropdown(
1227
+ choices=get_all_voices(),
1228
+ label="Use Trained Voice"
1229
+ )
1230
+
1231
+ # ===== CREATE VOICE TAB =====
1232
+ with gr.TabItem("βš™οΈ Create Voice"):
1233
+ gr.Markdown("""
1234
+ ### Create Custom Voice Manually
1235
 
1236
+ Adjust the parameters to create your own voice:
1237
+ """)
1238
+
1239
+ with gr.Row():
1240
+ with gr.Column():
1241
+ custom_name = gr.Textbox(
1242
+ label="Voice Name",
1243
+ placeholder="My Custom Voice"
1244
+ )
1245
+
1246
+ custom_pitch = gr.Slider(
1247
+ 60, 300, 150,
1248
+ label="Pitch (Hz)",
1249
+ info="80-130 = Male, 150-250 = Female, 250+ = Child"
1250
+ )
1251
+
1252
+ custom_formant = gr.Slider(
1253
+ 0.7, 1.4, 1.0, step=0.05,
1254
+ label="Formant Shift",
1255
+ info="<1.0 = Larger vocal tract (male), >1.0 = Smaller (female)"
1256
+ )
1257
+
1258
+ custom_breathiness = gr.Slider(
1259
+ 1, 10, 3,
1260
+ label="Breathiness",
1261
+ info="Higher = more breathy/airy voice"
1262
+ )
1263
+
1264
+ custom_speed = gr.Slider(
1265
+ 0.7, 1.3, 1.0, step=0.05,
1266
+ label="Natural Speed"
1267
+ )
1268
+
1269
+ custom_brightness = gr.Slider(
1270
+ 0.8, 1.3, 1.0, step=0.05,
1271
+ label="Brightness",
1272
+ info="Higher = brighter, more forward voice"
1273
+ )
1274
+
1275
+ create_btn = gr.Button("✨ Create Voice", variant="primary")
1276
+
1277
+ with gr.Column():
1278
+ create_result = gr.Markdown("")
1279
+ created_voice_select = gr.Dropdown(
1280
+ choices=get_all_voices(),
1281
+ label="Created Voices"
1282
+ )
1283
+
1284
+ gr.Markdown("""
1285
+ ### Voice Parameter Guide
1286
+
1287
+ | Parameter | Male | Female | Child |
1288
+ |-----------|------|--------|-------|
1289
+ | Pitch | 80-130 Hz | 150-250 Hz | 250-350 Hz |
1290
+ | Formant | 0.85-0.95 | 1.05-1.20 | 1.20-1.35 |
1291
+ | Breathiness | 2-4 | 3-6 | 2-4 |
1292
+ | Brightness | 0.9-1.0 | 1.0-1.15 | 1.1-1.25 |
1293
+ """)
1294
+
1295
+ # ===== VOICES TAB =====
1296
+ with gr.TabItem("πŸ‘₯ All Voices"):
1297
+ gr.Markdown("### Available Voices")
1298
+
1299
+ voice_cards = ""
1300
+ for name, v in VOICE_PROFILES.items():
1301
+ voice_cards += f"""
1302
+ **{name}**
1303
+ - Type: {v['gender'].title()}
1304
+ - Pitch: {v['f0']} Hz
1305
+ - {v['description']}
1306
+
1307
+ ---
1308
+ """
1309
+ gr.Markdown(voice_cards)
1310
 
1311
+ # Event handlers
1312
+ voice_select.change(get_voice_info, voice_select, voice_info)
1313
+ speak_btn.click(synthesize, [text_input, voice_select, rate, pitch], audio_out)
1314
+ text_input.submit(synthesize, [text_input, voice_select, rate, pitch], audio_out)
 
 
 
 
 
 
 
 
 
 
 
 
1315
 
1316
+ train_btn.click(
1317
+ train_voice,
1318
+ [audio_input, voice_name_input],
1319
+ [train_result, trained_voice_select]
1320
+ )
1321
 
1322
+ create_btn.click(
1323
+ create_custom_voice,
1324
+ [custom_name, custom_pitch, custom_formant, custom_breathiness,
1325
+ custom_speed, custom_brightness],
1326
+ [create_result, created_voice_select]
1327
+ )
1328
 
1329
+ # Update voice selectors when new voices are created
1330
+ trained_voice_select.change(
1331
+ lambda x: x, trained_voice_select, voice_select
1332
+ )
1333
+ created_voice_select.change(
1334
+ lambda x: x, created_voice_select, voice_select
1335
+ )
1336
 
1337
 
1338
  if __name__ == "__main__":