import numpy as np
import gradio as gr
from scipy import signal
from scipy.io import wavfile
import tempfile
import re
import json
import os

# ============================================
# VEDES TTS - WITH VOICE TRAINING (FIXED)
# 100% From Scratch - No APIs
# ============================================

SAMPLE_RATE = 22050

# ============================================
# VOICE PROFILES - Pre-defined Voices
# ============================================

VOICE_PROFILES = {
    "Emma (Female)": {
        "name": "Emma",
        "gender": "female",
        "f0": 210,
        "f0_variation": 30,
        "formant_shift": 1.15,
        "breathiness": 0.04,
        "speed": 1.0,
        "brightness": 1.1,
        "description": "Friendly female voice"
    },
    "James (Male)": {
        "name": "James", 
        "gender": "male",
        "f0": 110,
        "f0_variation": 20,
        "formant_shift": 0.9,
        "breathiness": 0.02,
        "speed": 0.95,
        "brightness": 0.95,
        "description": "Professional male voice"
    },
    "Sophie (Child)": {
        "name": "Sophie",
        "gender": "child",
        "f0": 280,
        "f0_variation": 40,
        "formant_shift": 1.25,
        "breathiness": 0.03,
        "speed": 1.1,
        "brightness": 1.2,
        "description": "Young child voice"
    },
    "David (Deep Male)": {
        "name": "David",
        "gender": "male",
        "f0": 85,
        "f0_variation": 15,
        "formant_shift": 0.82,
        "breathiness": 0.02,
        "speed": 0.9,
        "brightness": 0.85,
        "description": "Deep bass voice"
    },
    "Lisa (Bright Female)": {
        "name": "Lisa",
        "gender": "female",
        "f0": 240,
        "f0_variation": 35,
        "formant_shift": 1.2,
        "breathiness": 0.05,
        "speed": 1.05,
        "brightness": 1.15,
        "description": "Bright, energetic female"
    },
    "Robert (Elderly Male)": {
        "name": "Robert",
        "gender": "male",
        "f0": 95,
        "f0_variation": 12,
        "formant_shift": 0.88,
        "breathiness": 0.06,
        "speed": 0.85,
        "brightness": 0.9,
        "description": "Mature elderly voice"
    },
    "Anna (Soft Female)": {
        "name": "Anna",
        "gender": "female",
        "f0": 195,
        "f0_variation": 25,
        "formant_shift": 1.1,
        "breathiness": 0.07,
        "speed": 0.92,
        "brightness": 1.0,
        "description": "Soft, gentle female"
    },
    "Mike (Energetic Male)": {
        "name": "Mike",
        "gender": "male",
        "f0": 130,
        "f0_variation": 30,
        "formant_shift": 0.95,
        "breathiness": 0.02,
        "speed": 1.1,
        "brightness": 1.05,
        "description": "Energetic young male"
    },
}

# Custom voices storage (global)
custom_voices = {}

# ============================================
# PHONEME DATA
# ============================================

VOWELS = {
    'IY': (280, 2250, 2890, 150, 1.0, True),
    'IH': (400, 1920, 2550, 120, 0.9, True),
    'EH': (550, 1770, 2490, 130, 0.95, True),
    'AE': (690, 1660, 2490, 140, 1.0, True),
    'AA': (710, 1100, 2540, 150, 1.0, True),
    'AO': (590, 880, 2540, 140, 0.95, True),
    'UH': (470, 1100, 2540, 120, 0.9, True),
    'UW': (310, 870, 2250, 150, 1.0, True),
    'AH': (640, 1200, 2400, 100, 0.85, True),
    'AX': (500, 1500, 2500, 80, 0.7, True),
    'ER': (500, 1350, 1700, 140, 0.9, True),
    'EY': (500, 1900, 2600, 160, 1.0, True),
    'AY': (700, 1200, 2600, 180, 1.0, True),
    'OY': (500, 900, 2500, 180, 1.0, True),
    'AW': (700, 1100, 2600, 180, 1.0, True),
    'OW': (500, 900, 2500, 160, 1.0, True),
}

CONSONANTS = {
    'P': {'type': 'stop', 'closure': 80, 'burst': 30, 'voiced': False, 'burst_freq': 800, 'amp': 0.6},
    'B': {'type': 'stop', 'closure': 50, 'burst': 25, 'voiced': True, 'burst_freq': 800, 'amp': 0.7},
    'T': {'type': 'stop', 'closure': 70, 'burst': 30, 'voiced': False, 'burst_freq': 3500, 'amp': 0.7},
    'D': {'type': 'stop', 'closure': 40, 'burst': 25, 'voiced': True, 'burst_freq': 3500, 'amp': 0.7},
    'K': {'type': 'stop', 'closure': 80, 'burst': 40, 'voiced': False, 'burst_freq': 1500, 'amp': 0.7},
    'G': {'type': 'stop', 'closure': 50, 'burst': 30, 'voiced': True, 'burst_freq': 1500, 'amp': 0.7},
    'F': {'type': 'fric', 'dur': 120, 'freq_low': 1500, 'freq_high': 8000, 'voiced': False, 'amp': 0.4},
    'V': {'type': 'fric', 'dur': 80, 'freq_low': 1500, 'freq_high': 8000, 'voiced': True, 'amp': 0.5},
    'TH': {'type': 'fric', 'dur': 100, 'freq_low': 1400, 'freq_high': 6000, 'voiced': False, 'amp': 0.3},
    'DH': {'type': 'fric', 'dur': 60, 'freq_low': 1400, 'freq_high': 6000, 'voiced': True, 'amp': 0.5},
    'S': {'type': 'fric', 'dur': 120, 'freq_low': 4000, 'freq_high': 9000, 'voiced': False, 'amp': 0.5},
    'Z': {'type': 'fric', 'dur': 90, 'freq_low': 4000, 'freq_high': 9000, 'voiced': True, 'amp': 0.5},
    'SH': {'type': 'fric', 'dur': 120, 'freq_low': 2000, 'freq_high': 6000, 'voiced': False, 'amp': 0.5},
    'ZH': {'type': 'fric', 'dur': 80, 'freq_low': 2000, 'freq_high': 6000, 'voiced': True, 'amp': 0.5},
    'HH': {'type': 'fric', 'dur': 80, 'freq_low': 500, 'freq_high': 2000, 'voiced': False, 'amp': 0.3},
    'CH': {'type': 'affric', 'closure': 60, 'fric': 80, 'freq_low': 2000, 'freq_high': 6000, 'voiced': False, 'amp': 0.6},
    'JH': {'type': 'affric', 'closure': 40, 'fric': 60, 'freq_low': 2000, 'freq_high': 6000, 'voiced': True, 'amp': 0.6},
    'M': {'type': 'nasal', 'f1': 280, 'f2': 1000, 'f3': 2200, 'dur': 100, 'amp': 0.8},
    'N': {'type': 'nasal', 'f1': 280, 'f2': 1700, 'f3': 2500, 'dur': 90, 'amp': 0.8},
    'NG': {'type': 'nasal', 'f1': 300, 'f2': 2000, 'f3': 2700, 'dur': 100, 'amp': 0.8},
    'L': {'type': 'liquid', 'f1': 380, 'f2': 1000, 'f3': 2700, 'dur': 90, 'amp': 0.85},
    'R': {'type': 'liquid', 'f1': 350, 'f2': 1300, 'f3': 1700, 'dur': 90, 'amp': 0.85},
    'W': {'type': 'glide', 'f1': 300, 'f2': 700, 'f3': 2200, 'dur': 80, 'amp': 0.8},
    'Y': {'type': 'glide', 'f1': 280, 'f2': 2200, 'f3': 2900, 'dur': 70, 'amp': 0.8},
}

SILENCE = {'SIL': 60, 'PAU': 200}

# ============================================
# PRONUNCIATION DICTIONARY
# ============================================

DICTIONARY = {
    # Function words
    'a': ['AX'], 'an': ['AE', 'N'], 'the': ['DH', 'AX'],
    'and': ['AE', 'N', 'D'], 'or': ['AO', 'R'], 'but': ['B', 'AH', 'T'],
    'if': ['IH', 'F'], 'of': ['AH', 'V'], 'to': ['T', 'UW'],
    'in': ['IH', 'N'], 'on': ['AA', 'N'], 'at': ['AE', 'T'],
    'by': ['B', 'AY'], 'for': ['F', 'AO', 'R'], 'with': ['W', 'IH', 'TH'],
    'from': ['F', 'R', 'AH', 'M'], 'up': ['AH', 'P'], 'out': ['AW', 'T'],
    'as': ['AE', 'Z'], 'so': ['S', 'OW'], 'not': ['N', 'AA', 'T'],
    
    # Pronouns
    'i': ['AY'], 'me': ['M', 'IY'], 'my': ['M', 'AY'],
    'you': ['Y', 'UW'], 'your': ['Y', 'AO', 'R'],
    'he': ['HH', 'IY'], 'him': ['HH', 'IH', 'M'], 'his': ['HH', 'IH', 'Z'],
    'she': ['SH', 'IY'], 'her': ['HH', 'ER'],
    'it': ['IH', 'T'], 'its': ['IH', 'T', 'S'],
    'we': ['W', 'IY'], 'us': ['AH', 'S'], 'our': ['AW', 'ER'],
    'they': ['DH', 'EY'], 'them': ['DH', 'EH', 'M'], 'their': ['DH', 'EH', 'R'],
    'this': ['DH', 'IH', 'S'], 'that': ['DH', 'AE', 'T'],
    'what': ['W', 'AH', 'T'], 'who': ['HH', 'UW'],
    'where': ['W', 'EH', 'R'], 'when': ['W', 'EH', 'N'],
    'why': ['W', 'AY'], 'how': ['HH', 'AW'], 'which': ['W', 'IH', 'CH'],
    
    # Be verbs
    'am': ['AE', 'M'], 'is': ['IH', 'Z'], 'are': ['AA', 'R'],
    'was': ['W', 'AA', 'Z'], 'were': ['W', 'ER'],
    'be': ['B', 'IY'], 'been': ['B', 'IH', 'N'], 'being': ['B', 'IY', 'IH', 'NG'],
    
    # Have verbs
    'have': ['HH', 'AE', 'V'], 'has': ['HH', 'AE', 'Z'],
    'had': ['HH', 'AE', 'D'], 'having': ['HH', 'AE', 'V', 'IH', 'NG'],
    
    # Do verbs
    'do': ['D', 'UW'], 'does': ['D', 'AH', 'Z'],
    'did': ['D', 'IH', 'D'], 'done': ['D', 'AH', 'N'],
    
    # Modal verbs
    'will': ['W', 'IH', 'L'], 'would': ['W', 'UH', 'D'],
    'can': ['K', 'AE', 'N'], 'could': ['K', 'UH', 'D'],
    'should': ['SH', 'UH', 'D'], 'may': ['M', 'EY'],
    'might': ['M', 'AY', 'T'], 'must': ['M', 'AH', 'S', 'T'],
    
    # Common verbs
    'go': ['G', 'OW'], 'goes': ['G', 'OW', 'Z'], 'going': ['G', 'OW', 'IH', 'NG'],
    'went': ['W', 'EH', 'N', 'T'], 'gone': ['G', 'AO', 'N'],
    'come': ['K', 'AH', 'M'], 'comes': ['K', 'AH', 'M', 'Z'],
    'coming': ['K', 'AH', 'M', 'IH', 'NG'], 'came': ['K', 'EY', 'M'],
    'get': ['G', 'EH', 'T'], 'gets': ['G', 'EH', 'T', 'S'],
    'getting': ['G', 'EH', 'T', 'IH', 'NG'], 'got': ['G', 'AA', 'T'],
    'make': ['M', 'EY', 'K'], 'makes': ['M', 'EY', 'K', 'S'],
    'making': ['M', 'EY', 'K', 'IH', 'NG'], 'made': ['M', 'EY', 'D'],
    'take': ['T', 'EY', 'K'], 'takes': ['T', 'EY', 'K', 'S'],
    'took': ['T', 'UH', 'K'], 'taken': ['T', 'EY', 'K', 'AX', 'N'],
    'see': ['S', 'IY'], 'sees': ['S', 'IY', 'Z'],
    'saw': ['S', 'AO'], 'seen': ['S', 'IY', 'N'],
    'say': ['S', 'EY'], 'says': ['S', 'EH', 'Z'], 'said': ['S', 'EH', 'D'],
    'know': ['N', 'OW'], 'knows': ['N', 'OW', 'Z'],
    'knew': ['N', 'UW'], 'known': ['N', 'OW', 'N'],
    'think': ['TH', 'IH', 'NG', 'K'], 'thought': ['TH', 'AO', 'T'],
    'want': ['W', 'AA', 'N', 'T'], 'wants': ['W', 'AA', 'N', 'T', 'S'],
    'give': ['G', 'IH', 'V'], 'gives': ['G', 'IH', 'V', 'Z'],
    'gave': ['G', 'EY', 'V'], 'given': ['G', 'IH', 'V', 'AX', 'N'],
    'tell': ['T', 'EH', 'L'], 'told': ['T', 'OW', 'L', 'D'],
    'ask': ['AE', 'S', 'K'], 'asked': ['AE', 'S', 'K', 'T'],
    'use': ['Y', 'UW', 'Z'], 'used': ['Y', 'UW', 'Z', 'D'],
    'find': ['F', 'AY', 'N', 'D'], 'found': ['F', 'AW', 'N', 'D'],
    'work': ['W', 'ER', 'K'], 'works': ['W', 'ER', 'K', 'S'],
    'call': ['K', 'AO', 'L'], 'called': ['K', 'AO', 'L', 'D'],
    'try': ['T', 'R', 'AY'], 'tried': ['T', 'R', 'AY', 'D'],
    'need': ['N', 'IY', 'D'], 'needs': ['N', 'IY', 'D', 'Z'],
    'feel': ['F', 'IY', 'L'], 'feels': ['F', 'IY', 'L', 'Z'],
    'help': ['HH', 'EH', 'L', 'P'], 'helps': ['HH', 'EH', 'L', 'P', 'S'],
    'keep': ['K', 'IY', 'P'], 'kept': ['K', 'EH', 'P', 'T'],
    'let': ['L', 'EH', 'T'], 'put': ['P', 'UH', 'T'],
    'seem': ['S', 'IY', 'M'], 'leave': ['L', 'IY', 'V'],
    'show': ['SH', 'OW'], 'hear': ['HH', 'IY', 'R'],
    'play': ['P', 'L', 'EY'], 'run': ['R', 'AH', 'N'],
    'move': ['M', 'UW', 'V'], 'live': ['L', 'IH', 'V'],
    'believe': ['B', 'IH', 'L', 'IY', 'V'],
    'read': ['R', 'IY', 'D'], 'write': ['R', 'AY', 'T'],
    'learn': ['L', 'ER', 'N'], 'speak': ['S', 'P', 'IY', 'K'],
    'look': ['L', 'UH', 'K'], 'like': ['L', 'AY', 'K'],
    'love': ['L', 'AH', 'V'], 'start': ['S', 'T', 'AA', 'R', 'T'],
    'stop': ['S', 'T', 'AA', 'P'],
    
    # Adjectives
    'good': ['G', 'UH', 'D'], 'better': ['B', 'EH', 'T', 'ER'],
    'best': ['B', 'EH', 'S', 'T'], 'bad': ['B', 'AE', 'D'],
    'new': ['N', 'UW'], 'old': ['OW', 'L', 'D'],
    'big': ['B', 'IH', 'G'], 'small': ['S', 'M', 'AO', 'L'],
    'long': ['L', 'AO', 'NG'], 'short': ['SH', 'AO', 'R', 'T'],
    'high': ['HH', 'AY'], 'low': ['L', 'OW'],
    'great': ['G', 'R', 'EY', 'T'], 'little': ['L', 'IH', 'T', 'AX', 'L'],
    'right': ['R', 'AY', 'T'], 'wrong': ['R', 'AO', 'NG'],
    'first': ['F', 'ER', 'S', 'T'], 'last': ['L', 'AE', 'S', 'T'],
    'same': ['S', 'EY', 'M'], 'different': ['D', 'IH', 'F', 'R', 'AX', 'N', 'T'],
    'own': ['OW', 'N'], 'other': ['AH', 'DH', 'ER'],
    'nice': ['N', 'AY', 'S'], 'happy': ['HH', 'AE', 'P', 'IY'],
    'sure': ['SH', 'UH', 'R'], 'true': ['T', 'R', 'UW'],
    'real': ['R', 'IY', 'L'], 'clear': ['K', 'L', 'IY', 'R'],
    'fine': ['F', 'AY', 'N'], 'free': ['F', 'R', 'IY'],
    'easy': ['IY', 'Z', 'IY'], 'hard': ['HH', 'AA', 'R', 'D'],
    'young': ['Y', 'AH', 'NG'], 'beautiful': ['B', 'Y', 'UW', 'T', 'IH', 'F', 'AX', 'L'],
    
    # Adverbs
    'very': ['V', 'EH', 'R', 'IY'], 'really': ['R', 'IY', 'L', 'IY'],
    'just': ['JH', 'AH', 'S', 'T'], 'only': ['OW', 'N', 'L', 'IY'],
    'also': ['AO', 'L', 'S', 'OW'], 'well': ['W', 'EH', 'L'],
    'now': ['N', 'AW'], 'then': ['DH', 'EH', 'N'],
    'here': ['HH', 'IY', 'R'], 'there': ['DH', 'EH', 'R'],
    'still': ['S', 'T', 'IH', 'L'], 'even': ['IY', 'V', 'AX', 'N'],
    'back': ['B', 'AE', 'K'], 'again': ['AX', 'G', 'EH', 'N'],
    'always': ['AO', 'L', 'W', 'EY', 'Z'], 'never': ['N', 'EH', 'V', 'ER'],
    'today': ['T', 'AX', 'D', 'EY'], 'maybe': ['M', 'EY', 'B', 'IY'],
    'too': ['T', 'UW'], 'much': ['M', 'AH', 'CH'],
    'more': ['M', 'AO', 'R'], 'most': ['M', 'OW', 'S', 'T'],
    'please': ['P', 'L', 'IY', 'Z'],
    
    # Nouns
    'time': ['T', 'AY', 'M'], 'year': ['Y', 'IY', 'R'],
    'day': ['D', 'EY'], 'way': ['W', 'EY'],
    'man': ['M', 'AE', 'N'], 'woman': ['W', 'UH', 'M', 'AX', 'N'],
    'child': ['CH', 'AY', 'L', 'D'], 'world': ['W', 'ER', 'L', 'D'],
    'life': ['L', 'AY', 'F'], 'hand': ['HH', 'AE', 'N', 'D'],
    'part': ['P', 'AA', 'R', 'T'], 'place': ['P', 'L', 'EY', 'S'],
    'thing': ['TH', 'IH', 'NG'], 'things': ['TH', 'IH', 'NG', 'Z'],
    'people': ['P', 'IY', 'P', 'AX', 'L'], 'person': ['P', 'ER', 'S', 'AX', 'N'],
    'home': ['HH', 'OW', 'M'], 'house': ['HH', 'AW', 'S'],
    'word': ['W', 'ER', 'D'], 'name': ['N', 'EY', 'M'],
    'water': ['W', 'AO', 'T', 'ER'], 'money': ['M', 'AH', 'N', 'IY'],
    'family': ['F', 'AE', 'M', 'AX', 'L', 'IY'],
    'friend': ['F', 'R', 'EH', 'N', 'D'], 'friends': ['F', 'R', 'EH', 'N', 'D', 'Z'],
    'mother': ['M', 'AH', 'DH', 'ER'], 'father': ['F', 'AA', 'DH', 'ER'],
    'boy': ['B', 'OY'], 'girl': ['G', 'ER', 'L'],
    'head': ['HH', 'EH', 'D'], 'face': ['F', 'EY', 'S'],
    'eye': ['AY'], 'eyes': ['AY', 'Z'],
    'voice': ['V', 'OY', 'S'], 'night': ['N', 'AY', 'T'],
    'morning': ['M', 'AO', 'R', 'N', 'IH', 'NG'],
    'week': ['W', 'IY', 'K'], 'month': ['M', 'AH', 'N', 'TH'],
    'school': ['S', 'K', 'UW', 'L'], 'book': ['B', 'UH', 'K'],
    'story': ['S', 'T', 'AO', 'R', 'IY'],
    'question': ['K', 'W', 'EH', 'S', 'CH', 'AX', 'N'],
    'answer': ['AE', 'N', 'S', 'ER'],
    
    # Numbers
    'zero': ['Z', 'IY', 'R', 'OW'], 'one': ['W', 'AH', 'N'],
    'two': ['T', 'UW'], 'three': ['TH', 'R', 'IY'],
    'four': ['F', 'AO', 'R'], 'five': ['F', 'AY', 'V'],
    'six': ['S', 'IH', 'K', 'S'], 'seven': ['S', 'EH', 'V', 'AX', 'N'],
    'eight': ['EY', 'T'], 'nine': ['N', 'AY', 'N'], 'ten': ['T', 'EH', 'N'],
    
    # Greetings
    'hello': ['HH', 'AX', 'L', 'OW'], 'hi': ['HH', 'AY'],
    'hey': ['HH', 'EY'], 'goodbye': ['G', 'UH', 'D', 'B', 'AY'],
    'bye': ['B', 'AY'], 'welcome': ['W', 'EH', 'L', 'K', 'AX', 'M'],
    'thank': ['TH', 'AE', 'NG', 'K'], 'thanks': ['TH', 'AE', 'NG', 'K', 'S'],
    'sorry': ['S', 'AA', 'R', 'IY'],
    'yes': ['Y', 'EH', 'S'], 'yeah': ['Y', 'AE'], 'no': ['N', 'OW'],
    'ok': ['OW', 'K', 'EY'], 'okay': ['OW', 'K', 'EY'],
    
    # Tech/TTS
    'text': ['T', 'EH', 'K', 'S', 'T'], 'speech': ['S', 'P', 'IY', 'CH'],
    'sound': ['S', 'AW', 'N', 'D'], 'audio': ['AO', 'D', 'IY', 'OW'],
    'test': ['T', 'EH', 'S', 'T'], 'testing': ['T', 'EH', 'S', 'T', 'IH', 'NG'],
    'computer': ['K', 'AX', 'M', 'P', 'Y', 'UW', 'T', 'ER'],
    'vedes': ['V', 'EY', 'D', 'EH', 'S'],
    'system': ['S', 'IH', 'S', 'T', 'AX', 'M'],
    'train': ['T', 'R', 'EY', 'N'], 'training': ['T', 'R', 'EY', 'N', 'IH', 'NG'],
}

# Letter patterns
PATTERNS = [
    ('tion', ['SH', 'AX', 'N']), ('sion', ['ZH', 'AX', 'N']),
    ('ness', ['N', 'AX', 'S']), ('ment', ['M', 'AX', 'N', 'T']),
    ('able', ['AX', 'B', 'AX', 'L']), ('ible', ['AX', 'B', 'AX', 'L']),
    ('ful', ['F', 'AX', 'L']), ('less', ['L', 'AX', 'S']),
    ('ing', ['IH', 'NG']), ('ight', ['AY', 'T']),
    ('ough', ['AO']), ('ould', ['UH', 'D']),
    ('th', ['TH']), ('sh', ['SH']), ('ch', ['CH']),
    ('wh', ['W']), ('ph', ['F']), ('ck', ['K']), ('ng', ['NG']),
    ('qu', ['K', 'W']), ('ee', ['IY']), ('ea', ['IY']),
    ('oo', ['UW']), ('ou', ['AW']), ('ow', ['OW']),
    ('ai', ['EY']), ('ay', ['EY']), ('ey', ['IY']),
    ('oy', ['OY']), ('oi', ['OY']), ('ie', ['IY']),
    ('er', ['ER']), ('ir', ['ER']), ('ur', ['ER']),
    ('ar', ['AA', 'R']), ('or', ['AO', 'R']),
]

LETTERS = {
    'a': 'AE', 'b': 'B', 'c': 'K', 'd': 'D', 'e': 'EH',
    'f': 'F', 'g': 'G', 'h': 'HH', 'i': 'IH', 'j': 'JH',
    'k': 'K', 'l': 'L', 'm': 'M', 'n': 'N', 'o': 'AA',
    'p': 'P', 'q': 'K', 'r': 'R', 's': 'S', 't': 'T',
    'u': 'AH', 'v': 'V', 'w': 'W', 'x': 'K', 'y': 'IY', 'z': 'Z',
}


# ============================================
# VOICE ANALYZER
# ============================================

class VoiceAnalyzer:
    """Analyze audio to extract voice characteristics"""
    
    def __init__(self, sample_rate=22050):
        self.sr = sample_rate
    
    def analyze(self, audio):
        """Extract voice features from audio sample"""
        if len(audio) < self.sr * 0.3:
            return None
        
        audio = audio.astype(np.float32)
        max_val = np.max(np.abs(audio))
        if max_val > 0:
            audio = audio / max_val
        
        f0 = self._estimate_pitch(audio)
        formants = self._estimate_formants(audio)
        breathiness = self._estimate_breathiness(audio)
        
        profile = {
            "name": "Custom Voice",
            "gender": "custom",
            "f0": f0,
            "f0_variation": self._estimate_f0_variation(audio, f0),
            "formant_shift": formants.get('shift', 1.0),
            "breathiness": breathiness,
            "speed": 1.0,
            "brightness": formants.get('brightness', 1.0),
            "description": f"Custom voice (F0={f0:.0f}Hz)"
        }
        
        return profile
    
    def _estimate_pitch(self, audio):
        """Estimate fundamental frequency using autocorrelation"""
        frame_size = int(self.sr * 0.03)
        pitches = []
        
        for i in range(0, len(audio) - frame_size, frame_size):
            frame = audio[i:i + frame_size]
            
            # Remove DC
            frame = frame - np.mean(frame)
            
            # Autocorrelation
            corr = np.correlate(frame, frame, mode='full')
            corr = corr[len(corr)//2:]
            
            # Find peaks
            d = np.diff(corr)
            start_indices = np.where(d > 0)[0]
            
            if len(start_indices) > 0:
                start = start_indices[0]
                search_end = min(start + int(self.sr / 60), len(corr))
                
                if search_end > start:
                    peak = start + np.argmax(corr[start:search_end])
                    
                    if peak > 0:
                        f0 = self.sr / peak
                        if 60 < f0 < 400:
                            pitches.append(f0)
        
        if pitches:
            return float(np.median(pitches))
        return 130.0
    
    def _estimate_f0_variation(self, audio, base_f0):
        """Estimate pitch variation"""
        frame_size = int(self.sr * 0.03)
        pitches = []
        
        for i in range(0, len(audio) - frame_size, frame_size):
            frame = audio[i:i + frame_size]
            frame = frame - np.mean(frame)
            
            corr = np.correlate(frame, frame, mode='full')
            corr = corr[len(corr)//2:]
            
            d = np.diff(corr)
            start_indices = np.where(d > 0)[0]
            
            if len(start_indices) > 0:
                start = start_indices[0]
                search_end = min(start + int(self.sr / 60), len(corr))
                
                if search_end > start:
                    peak = start + np.argmax(corr[start:search_end])
                    if peak > 0:
                        f0 = self.sr / peak
                        if 60 < f0 < 400:
                            pitches.append(f0)
        
        if len(pitches) > 2:
            return min(float(np.std(pitches)), 50.0)
        return 20.0
    
    def _estimate_formants(self, audio):
        """Estimate formant characteristics"""
        frame_size = 2048
        
        if len(audio) < frame_size:
            return {'shift': 1.0, 'brightness': 1.0}
        
        spectrum = np.abs(np.fft.rfft(audio[:frame_size] * np.hanning(frame_size)))
        freqs = np.fft.rfftfreq(frame_size, 1/self.sr)
        
        total_energy = np.sum(spectrum) + 1e-8
        centroid = np.sum(freqs * spectrum) / total_energy
        
        if centroid > 1600:
            shift = 1.2
            brightness = 1.15
        elif centroid > 1400:
            shift = 1.1
            brightness = 1.05
        elif centroid > 1200:
            shift = 1.0
            brightness = 1.0
        elif centroid > 1000:
            shift = 0.9
            brightness = 0.95
        else:
            shift = 0.85
            brightness = 0.9
        
        return {'shift': shift, 'brightness': brightness}
    
    def _estimate_breathiness(self, audio):
        """Estimate breathiness"""
        frame_size = 2048
        
        if len(audio) < frame_size:
            return 0.03
        
        spectrum = np.abs(np.fft.rfft(audio[:frame_size]))
        freqs = np.fft.rfftfreq(frame_size, 1/self.sr)
        
        low_mask = freqs < 1000
        high_mask = (freqs > 2000) & (freqs < 5000)
        
        low_energy = np.sum(spectrum[low_mask]) + 1e-8
        high_energy = np.sum(spectrum[high_mask])
        
        ratio = high_energy / low_energy
        breathiness = float(np.clip(ratio * 0.1, 0.02, 0.1))
        
        return breathiness


# ============================================
# TEXT TO PHONEME CONVERTER
# ============================================

class TextToPhoneme:
    def __init__(self):
        self.dictionary = DICTIONARY
        self.patterns = sorted(PATTERNS, key=lambda x: -len(x[0]))
    
    def convert(self, text):
        text = text.lower().strip()
        text = re.sub(r"[^\w\s.,!?']", '', text)
        
        tokens = re.findall(r"[\w']+|[.,!?]", text)
        phonemes = []
        
        for i, token in enumerate(tokens):
            if token in '.,!?':
                phonemes.append('PAU')
            elif token in self.dictionary:
                phonemes.extend(self.dictionary[token])
                if i < len(tokens) - 1 and tokens[i + 1] not in '.,!?':
                    phonemes.append('SIL')
            else:
                phons = self._convert_word(token)
                phonemes.extend(phons)
                if i < len(tokens) - 1 and tokens[i + 1] not in '.,!?':
                    phonemes.append('SIL')
        
        return phonemes
    
    def _convert_word(self, word):
        phonemes = []
        i = 0
        
        while i < len(word):
            matched = False
            
            for pattern, phons in self.patterns:
                if word[i:].startswith(pattern):
                    phonemes.extend(phons)
                    i += len(pattern)
                    matched = True
                    break
            
            if not matched:
                char = word[i]
                if char in LETTERS:
                    phonemes.append(LETTERS[char])
                i += 1
        
        return phonemes


# ============================================
# VOICE SYNTHESIZER
# ============================================

class VoiceSynthesizer:
    def __init__(self, sample_rate=22050):
        self.sr = sample_rate
        self.default_voice = VOICE_PROFILES["Emma (Female)"]
    
    def synthesize(self, phonemes, voice_profile=None, rate=1.0, pitch=1.0):
        if not phonemes:
            return np.zeros(int(self.sr * 0.5), dtype=np.float32)
        
        voice = voice_profile or self.default_voice
        
        f0 = voice.get('f0', 130) * pitch
        f0_var = voice.get('f0_variation', 20)
        formant_shift = voice.get('formant_shift', 1.0)
        breathiness = voice.get('breathiness', 0.03)
        voice_speed = voice.get('speed', 1.0) * rate
        brightness = voice.get('brightness', 1.0)
        
        segments = []
        
        for i, phon in enumerate(phonemes):
            prev_phon = phonemes[i - 1] if i > 0 else None
            next_phon = phonemes[i + 1] if i < len(phonemes) - 1 else None
            
            phrase_pos = i / max(len(phonemes), 1)
            f0_current = f0 + f0_var * np.sin(phrase_pos * np.pi) * 0.5
            
            seg = self._synth_phoneme(
                phon, f0_current, voice_speed, formant_shift, 
                breathiness, brightness, prev_phon, next_phon
            )
            segments.append(seg)
        
        audio = self._smooth_concat(segments)
        audio = self._normalize(audio)
        
        return audio.astype(np.float32)
    
    def _synth_phoneme(self, phon, f0, speed, formant_shift, breathiness, 
                       brightness, prev_phon, next_phon):
        if phon in SILENCE:
            dur = int(self.sr * SILENCE[phon] / 1000 / speed)
            return np.zeros(dur, dtype=np.float32)
        
        if phon in VOWELS:
            return self._synth_vowel(phon, f0, speed, formant_shift, 
                                     breathiness, brightness)
        
        if phon in CONSONANTS:
            return self._synth_consonant(phon, f0, speed, formant_shift, breathiness)
        
        return np.zeros(100, dtype=np.float32)
    
    def _synth_vowel(self, phon, f0, speed, formant_shift, breathiness, brightness):
        params = VOWELS[phon]
        f1, f2, f3, dur_ms, amp, voiced = params
        
        f1 = f1 * formant_shift
        f2 = f2 * formant_shift * brightness
        f3 = f3 * formant_shift * brightness
        
        dur_ms = dur_ms / speed
        n = int(self.sr * dur_ms / 1000)
        n = max(n, 100)
        t = np.arange(n) / self.sr
        
        source = self._glottal_source(t, f0, breathiness)
        audio = self._apply_formants(source, f1, f2, f3)
        envelope = self._vowel_envelope(n)
        audio = audio * envelope * amp
        
        return audio
    
    def _synth_consonant(self, phon, f0, speed, formant_shift, breathiness):
        params = CONSONANTS[phon]
        ctype = params['type']
        
        if ctype == 'stop':
            return self._synth_stop(params, f0, speed, formant_shift)
        elif ctype == 'fric':
            return self._synth_fricative(params, f0, speed)
        elif ctype == 'affric':
            return self._synth_affricate(params, f0, speed)
        elif ctype == 'nasal':
            return self._synth_nasal(params, f0, speed, formant_shift, breathiness)
        elif ctype == 'liquid':
            return self._synth_liquid(params, f0, speed, formant_shift, breathiness)
        elif ctype == 'glide':
            return self._synth_glide(params, f0, speed, formant_shift, breathiness)
        
        return np.zeros(100, dtype=np.float32)
    
    def _glottal_source(self, t, f0, breathiness):
        T0 = 1.0 / f0
        phase = (t % T0) / T0
        
        glottal = np.zeros_like(t)
        mask1 = phase < 0.4
        glottal[mask1] = 0.5 * (1 - np.cos(np.pi * phase[mask1] / 0.4))
        
        mask2 = (phase >= 0.4) & (phase < 0.6)
        glottal[mask2] = np.cos(np.pi * (phase[mask2] - 0.4) / 0.4)
        
        glottal += np.random.randn(len(t)) * breathiness
        shimmer = 1 + 0.02 * np.sin(2 * np.pi * 5 * t)
        glottal *= shimmer
        
        return glottal
    
    def _apply_formants(self, source, f1, f2, f3):
        formants = [(f1, 90), (f2, 110), (f3, 130)]
        result = np.zeros_like(source)
        
        for freq, bw in formants:
            result += self._resonator(source, freq, bw)
        
        return result
    
    def _resonator(self, sig, freq, bw):
        if freq <= 0 or freq >= self.sr / 2:
            return sig
        
        r = np.exp(-np.pi * bw / self.sr)
        theta = 2 * np.pi * freq / self.sr
        
        a1 = -2 * r * np.cos(theta)
        a2 = r * r
        b0 = 1 - r
        
        y = np.zeros_like(sig)
        for i in range(2, len(sig)):
            y[i] = b0 * sig[i] - a1 * y[i-1] - a2 * y[i-2]
        
        return y
    
    def _vowel_envelope(self, n):
        env = np.ones(n)
        attack = max(1, n // 10)
        release = max(1, int(n * 0.15))
        
        env[:attack] = np.sin(np.linspace(0, np.pi/2, attack)) ** 2
        env[-release:] = np.cos(np.linspace(0, np.pi/2, release)) ** 2
        
        return env
    
    def _consonant_envelope(self, n):
        env = np.ones(n)
        attack = max(1, n // 8)
        release = max(1, n // 6)
        
        env[:attack] = np.linspace(0.1, 1, attack)
        env[-release:] = np.linspace(1, 0.1, release)
        
        return env
    
    def _synth_stop(self, params, f0, speed, formant_shift):
        closure_ms = params['closure'] / speed
        burst_ms = params['burst'] / speed
        
        closure_n = int(self.sr * closure_ms / 1000)
        burst_n = int(self.sr * burst_ms / 1000)
        total_n = closure_n + burst_n
        
        audio = np.zeros(total_n, dtype=np.float32)
        
        if params['voiced']:
            t = np.arange(closure_n) / self.sr
            voice_bar = np.sin(2 * np.pi * f0 * 0.8 * t) * 0.15
            audio[:closure_n] = voice_bar
        
        burst = np.random.randn(burst_n)
        burst_freq = params['burst_freq'] * formant_shift
        
        try:
            if burst_freq < self.sr / 2 - 100:
                b, a = signal.butter(2, burst_freq / (self.sr / 2), 'low')
                burst = signal.filtfilt(b, a, burst)
        except:
            pass
        
        burst_env = np.exp(-np.linspace(0, 5, burst_n))
        burst *= burst_env * params['amp']
        
        audio[closure_n:] = burst
        
        return audio
    
    def _synth_fricative(self, params, f0, speed):
        dur_ms = params['dur'] / speed
        n = int(self.sr * dur_ms / 1000)
        
        noise = np.random.randn(n)
        
        low = params['freq_low']
        high = min(params['freq_high'], self.sr / 2 - 100)
        
        try:
            if low < high:
                b, a = signal.butter(4, [low / (self.sr / 2), high / (self.sr / 2)], 'band')
                noise = signal.filtfilt(b, a, noise)
        except:
            pass
        
        audio = noise * params['amp']
        
        if params['voiced']:
            t = np.arange(n) / self.sr
            voice = self._glottal_source(t, f0, 0.03) * 0.3
            audio = audio + voice
        
        audio *= self._consonant_envelope(n)
        
        return audio.astype(np.float32)
    
    def _synth_affricate(self, params, f0, speed):
        closure_ms = params['closure'] / speed
        fric_ms = params['fric'] / speed
        
        closure_n = int(self.sr * closure_ms / 1000)
        fric_n = int(self.sr * fric_ms / 1000)
        
        audio = np.zeros(closure_n + fric_n, dtype=np.float32)
        
        if params['voiced']:
            t = np.arange(closure_n) / self.sr
            audio[:closure_n] = np.sin(2 * np.pi * f0 * 0.8 * t) * 0.1
        
        fric = np.random.randn(fric_n)
        low = params['freq_low']
        high = min(params['freq_high'], self.sr / 2 - 100)
        
        try:
            b, a = signal.butter(3, [low / (self.sr / 2), high / (self.sr / 2)], 'band')
            fric = signal.filtfilt(b, a, fric)
        except:
            pass
        
        fric *= params['amp']
        
        fric_env = np.ones(fric_n)
        attack = fric_n // 6
        release = fric_n // 3
        fric_env[:attack] = np.linspace(0, 1, attack)
        fric_env[-release:] = np.linspace(1, 0, release)
        
        audio[closure_n:] = fric * fric_env
        
        return audio
    
    def _synth_nasal(self, params, f0, speed, formant_shift, breathiness):
        dur_ms = params['dur'] / speed
        n = int(self.sr * dur_ms / 1000)
        t = np.arange(n) / self.sr
        
        source = self._glottal_source(t, f0, breathiness)
        
        f1 = params['f1'] * formant_shift
        f2 = params['f2'] * formant_shift
        f3 = params['f3'] * formant_shift
        
        audio = self._apply_formants(source, f1, f2, f3)
        
        nasal_pole = self._resonator(source, 250, 100) * 0.4
        audio += nasal_pole
        
        try:
            b, a = signal.butter(2, 800 / (self.sr / 2), 'low')
            audio = signal.filtfilt(b, a, audio)
        except:
            pass
        
        audio *= params['amp'] * self._consonant_envelope(n)
        
        return audio.astype(np.float32)
    
    def _synth_liquid(self, params, f0, speed, formant_shift, breathiness):
        dur_ms = params['dur'] / speed
        n = int(self.sr * dur_ms / 1000)
        t = np.arange(n) / self.sr
        
        source = self._glottal_source(t, f0, breathiness)
        
        f1 = params['f1'] * formant_shift
        f2 = params['f2'] * formant_shift
        f3 = params['f3'] * formant_shift
        
        audio = self._apply_formants(source, f1, f2, f3)
        audio *= params['amp'] * self._consonant_envelope(n)
        
        return audio.astype(np.float32)
    
    def _synth_glide(self, params, f0, speed, formant_shift, breathiness):
        dur_ms = params['dur'] / speed
        n = int(self.sr * dur_ms / 1000)
        t = np.arange(n) / self.sr
        
        source = self._glottal_source(t, f0, breathiness)
        
        f1 = params['f1'] * formant_shift
        f2 = params['f2'] * formant_shift
        f3 = params['f3'] * formant_shift
        
        audio = self._apply_formants(source, f1, f2, f3)
        audio *= params['amp'] * self._consonant_envelope(n)
        
        return audio.astype(np.float32)
    
    def _smooth_concat(self, segments):
        if not segments:
            return np.zeros(1000, dtype=np.float32)
        
        if len(segments) == 1:
            return segments[0]
        
        overlap = 64
        total_len = sum(len(s) for s in segments) - overlap * (len(segments) - 1)
        total_len = max(total_len, 100)
        
        audio = np.zeros(total_len, dtype=np.float32)
        pos = 0
        
        for i, seg in enumerate(segments):
            if len(seg) == 0:
                continue
            
            end_pos = min(pos + len(seg), total_len)
            seg_len = end_pos - pos
            
            if seg_len <= 0:
                break
            
            seg_to_add = seg[:seg_len].copy()
            
            if i > 0 and pos > overlap:
                fade_len = min(overlap, seg_len)
                fade_in = np.linspace(0, 1, fade_len) ** 0.5
                fade_out = np.linspace(1, 0, fade_len) ** 0.5
                
                audio[pos:pos + fade_len] *= fade_out
                seg_to_add[:fade_len] *= fade_in
            
            audio[pos:end_pos] += seg_to_add
            pos = end_pos - overlap // 2
            pos = max(0, pos)
        
        return audio
    
    def _normalize(self, audio):
        if len(audio) < 100:
            return audio
        
        audio = audio - np.mean(audio)
        max_val = np.max(np.abs(audio))
        if max_val > 0:
            audio = audio / max_val * 0.9
        
        fade = min(len(audio) // 40, 200)
        audio[:fade] *= np.linspace(0, 1, fade)
        audio[-fade:] *= np.linspace(1, 0, fade)
        
        return audio


# ============================================
# MAIN TTS CLASS
# ============================================

class VedesTTS:
    def __init__(self, sample_rate=22050):
        self.sr = sample_rate
        self.text_to_phoneme = TextToPhoneme()
        self.synthesizer = VoiceSynthesizer(sample_rate)
        self.voice_analyzer = VoiceAnalyzer(sample_rate)
        self.current_voice = VOICE_PROFILES["Emma (Female)"]
    
    def get_voice(self, voice_name):
        if voice_name in VOICE_PROFILES:
            return VOICE_PROFILES[voice_name]
        elif voice_name in custom_voices:
            return custom_voices[voice_name]
        return self.current_voice
    
    def speak(self, text, rate=1.0, pitch=1.0, voice_name=None):
        if not text or not text.strip():
            return np.zeros(self.sr, dtype=np.float32)
        
        voice = self.get_voice(voice_name) if voice_name else self.current_voice
        phonemes = self.text_to_phoneme.convert(text)
        
        if not phonemes:
            return np.zeros(self.sr, dtype=np.float32)
        
        audio = self.synthesizer.synthesize(phonemes, voice, rate, pitch)
        
        return audio
    
    def train_voice(self, audio_data, voice_name="My Voice"):
        """Train a new voice from audio sample"""
        global custom_voices
        
        if audio_data is None:
            return None
        
        # Handle tuple format (sample_rate, audio)
        if isinstance(audio_data, tuple):
            sr, audio = audio_data
            audio = audio.astype(np.float32)
            
            # Handle stereo
            if len(audio.shape) > 1:
                audio = audio.mean(axis=1)
            
            # Resample if needed
            if sr != self.sr:
                duration = len(audio) / sr
                new_length = int(duration * self.sr)
                audio = signal.resample(audio, new_length)
        else:
            audio = audio_data.astype(np.float32)
        
        # Normalize
        max_val = np.max(np.abs(audio))
        if max_val > 0:
            audio = audio / max_val
        
        # Analyze
        profile = self.voice_analyzer.analyze(audio)
        
        if profile:
            profile['name'] = voice_name
            profile['description'] = f"Trained voice (F0={profile['f0']:.0f}Hz)"
            custom_voices[voice_name] = profile
            return profile
        
        return None


# ============================================
# INITIALIZE
# ============================================

print("=" * 50)
print("🎙️ VEDES TTS - With Voice Training")
print("=" * 50)

tts = VedesTTS(SAMPLE_RATE)

print("✅ Ready!")
print(f"📢 Available voices: {len(VOICE_PROFILES)}")
print("=" * 50)


# ============================================
# HELPER FUNCTIONS
# ============================================

def get_all_voices():
    """Get list of all available voices"""
    voices = list(VOICE_PROFILES.keys()) + list(custom_voices.keys())
    return voices


def get_voice_info(voice_name):
    """Get info about a voice"""
    if voice_name in VOICE_PROFILES:
        v = VOICE_PROFILES[voice_name]
    elif voice_name in custom_voices:
        v = custom_voices[voice_name]
    else:
        return "Select a voice"
    
    return f"""
**{v.get('name', voice_name)}**
- Type: {v.get('gender', 'unknown').title()}
- Pitch: {v.get('f0', 130):.0f} Hz
- {v.get('description', '')}
"""


# ============================================
# GRADIO FUNCTIONS
# ============================================

def synthesize(text, voice_name, rate, pitch):
    """Synthesize speech"""
    if not text or not text.strip():
        return None
    
    text = text.strip()[:300]
    
    try:
        pitch_mult = 2 ** (pitch / 12)
        audio = tts.speak(text, rate=rate, pitch=pitch_mult, voice_name=voice_name)
        
        if len(audio) < 100:
            return None
        
        audio = np.clip(audio, -1, 1)
        audio_int16 = (audio * 32767).astype(np.int16)
        
        return (SAMPLE_RATE, audio_int16)
    
    except Exception as e:
        print(f"Synthesis error: {e}")
        return None


def train_voice(audio, voice_name):
    """Train a new voice from audio"""
    global custom_voices
    
    if audio is None:
        return "❌ Please record or upload audio first.", get_all_voices()
    
    if not voice_name or not voice_name.strip():
        voice_name = f"Custom Voice {len(custom_voices) + 1}"
    
    voice_name = voice_name.strip()[:30]
    
    # Check if name already exists
    if voice_name in VOICE_PROFILES:
        voice_name = f"{voice_name} (custom)"
    
    try:
        profile = tts.train_voice(audio, voice_name)
        
        if profile:
            result = f"""
✅ **Voice "{voice_name}" created!**

**Detected Parameters:**
- Pitch (F0): {profile['f0']:.1f} Hz
- Pitch Variation: {profile['f0_variation']:.1f} Hz
- Formant Shift: {profile['formant_shift']:.2f}
- Breathiness: {profile['breathiness']:.3f}
- Brightness: {profile['brightness']:.2f}

You can now select this voice in the Speak tab!
"""
            return result, get_all_voices()
        else:
            return "❌ Could not analyze voice. Try a longer/clearer sample.", get_all_voices()
    
    except Exception as e:
        return f"❌ Error: {str(e)}", get_all_voices()


def create_custom_voice(name, pitch, formant, breathiness, speed, brightness):
    """Create a custom voice from parameters"""
    global custom_voices
    
    if not name or not name.strip():
        return "❌ Please enter a voice name.", get_all_voices()
    
    name = name.strip()[:30]
    
    if name in VOICE_PROFILES:
        name = f"{name} (custom)"
    
    profile = {
        "name": name,
        "gender": "custom",
        "f0": pitch,
        "f0_variation": 25,
        "formant_shift": formant,
        "breathiness": breathiness / 100,
        "speed": speed,
        "brightness": brightness,
        "description": f"Custom voice (F0={pitch}Hz)"
    }
    
    custom_voices[name] = profile
    
    return f"✅ Voice **{name}** created! Select it in the Speak tab.", get_all_voices()


def refresh_voices():
    """Refresh the voice list"""
    return gr.update(choices=get_all_voices())


# ============================================
# GRADIO INTERFACE
# ============================================

with gr.Blocks(title="Vedes TTS", theme=gr.themes.Soft()) as demo:
    
    gr.Markdown("""
    # 🎙️ Vedes TTS - Voice Training Edition
    ### Create and Use Custom Voices - 100% From Scratch
    """)
    
    with gr.Tabs():
        # ===== SPEAK TAB =====
        with gr.TabItem("🔊 Speak"):
            with gr.Row():
                with gr.Column(scale=2):
                    text_input = gr.Textbox(
                        label="📝 Text to Speak",
                        placeholder="Type something...",
                        lines=3
                    )
                    
                    with gr.Row():
                        voice_select = gr.Dropdown(
                            choices=get_all_voices(),
                            value="Emma (Female)",
                            label="🗣️ Voice",
                            interactive=True
                        )
                        refresh_btn = gr.Button("🔄", size="sm")
                    
                    voice_info = gr.Markdown("Select a voice")
                    
                    with gr.Row():
                        rate = gr.Slider(0.6, 1.5, 0.9, step=0.1, label="⏱️ Speed")
                        pitch = gr.Slider(-6, 6, 0, step=1, label="🎵 Pitch")
                    
                    speak_btn = gr.Button("🔊 Speak", variant="primary", size="lg")
                
                with gr.Column(scale=1):
                    audio_out = gr.Audio(label="🎧 Output", type="numpy")
            
            gr.Examples(
                examples=[
                    ["Hello, how are you?"],
                    ["Good morning!"],
                    ["My name is Vedes."],
                    ["Thank you very much."],
                    ["Have a nice day."],
                ],
                inputs=text_input,
                label="📚 Examples"
            )
        
        # ===== TRAIN VOICE TAB =====
        with gr.TabItem("🎤 Train Voice"):
            gr.Markdown("""
            ### Train a New Voice from Audio
            
            Record or upload 3-10 seconds of clear speech.
            
            **Tips:**
            - Speak naturally and clearly
            - Avoid background noise
            - Read a few sentences
            """)
            
            with gr.Row():
                with gr.Column():
                    audio_input = gr.Audio(
                        label="🎤 Record or Upload",
                        sources=["microphone", "upload"],
                        type="numpy"
                    )
                    
                    voice_name_input = gr.Textbox(
                        label="Voice Name",
                        placeholder="e.g., My Voice",
                        value=""
                    )
                    
                    train_btn = gr.Button("🧠 Train Voice", variant="primary")
                
                with gr.Column():
                    train_result = gr.Markdown("Record audio and click Train")
                    
                    gr.Markdown("""
                    ### What Gets Analyzed:
                    - **Pitch (F0)**: How high/low the voice is
                    - **Formants**: Voice quality/timbre
                    - **Breathiness**: Air in the voice
                    """)
        
        # ===== CREATE VOICE TAB =====
        with gr.TabItem("⚙️ Create Voice"):
            gr.Markdown("### Create Custom Voice Manually")
            
            with gr.Row():
                with gr.Column():
                    custom_name = gr.Textbox(
                        label="Voice Name",
                        placeholder="My Custom Voice"
                    )
                    
                    custom_pitch = gr.Slider(
                        60, 300, 150,
                        label="Pitch (Hz)",
                        info="60-130=Male, 150-250=Female, 250+=Child"
                    )
                    
                    custom_formant = gr.Slider(
                        0.7, 1.4, 1.0, step=0.05,
                        label="Formant Shift",
                        info="<1.0=Male, >1.0=Female/Child"
                    )
                    
                    custom_breathiness = gr.Slider(
                        1, 10, 3,
                        label="Breathiness (%)"
                    )
                    
                    custom_speed = gr.Slider(
                        0.7, 1.3, 1.0, step=0.05,
                        label="Natural Speed"
                    )
                    
                    custom_brightness = gr.Slider(
                        0.8, 1.3, 1.0, step=0.05,
                        label="Brightness"
                    )
                    
                    create_btn = gr.Button("✨ Create Voice", variant="primary")
                
                with gr.Column():
                    create_result = gr.Markdown("")
                    
                    gr.Markdown("""
                    ### Quick Presets:
                    
                    | Type | Pitch | Formant |
                    |------|-------|---------|
                    | Deep Male | 85 | 0.85 |
                    | Male | 120 | 0.92 |
                    | Female | 200 | 1.12 |
                    | High Female | 240 | 1.20 |
                    | Child | 280 | 1.25 |
                    """)
        
        # ===== ALL VOICES TAB =====
        with gr.TabItem("👥 All Voices"):
            gr.Markdown("### Pre-built Voices")
            
            voice_info_md = ""
            for name, v in VOICE_PROFILES.items():
                voice_info_md += f"""
**{name}**
- Type: {v['gender'].title()} | Pitch: {v['f0']} Hz
- {v['description']}

"""
            gr.Markdown(voice_info_md)
            
            gr.Markdown("### Custom Voices")
            custom_voices_display = gr.Markdown("*No custom voices yet*")
    
    # ===== EVENT HANDLERS =====
    
    # Speak tab
    voice_select.change(get_voice_info, voice_select, voice_info)
    refresh_btn.click(refresh_voices, outputs=voice_select)
    speak_btn.click(synthesize, [text_input, voice_select, rate, pitch], audio_out)
    text_input.submit(synthesize, [text_input, voice_select, rate, pitch], audio_out)
    
    # Train tab - Fixed: update choices first, then set value separately
    def train_and_update(audio, name):
        result, voices = train_voice(audio, name)
        # Return result and updated dropdown with new choices
        return result, gr.update(choices=voices)
    
    train_btn.click(
        train_and_update,
        [audio_input, voice_name_input],
        [train_result, voice_select]
    )
    
    # Create tab - Fixed similarly
    def create_and_update(name, pitch, formant, breathiness, speed, brightness):
        result, voices = create_custom_voice(name, pitch, formant, breathiness, speed, brightness)
        return result, gr.update(choices=voices)
    
    create_btn.click(
        create_and_update,
        [custom_name, custom_pitch, custom_formant, custom_breathiness, 
         custom_speed, custom_brightness],
        [create_result, voice_select]
    )


# ============================================
# LAUNCH
# ============================================

if __name__ == "__main__":
    demo.launch()