import json
import os
import re
import tempfile

import gradio as gr
import numpy as np
from scipy import signal
from scipy.io import wavfile


# Shared sample rate (Hz) for analysis and synthesis.
SAMPLE_RATE = 22050


# Built-in voice presets. Each field parameterizes the synthesizer:
# f0 / f0_variation in Hz, formant_shift and brightness as multipliers,
# breathiness as an additive noise level, speed as a rate multiplier (>1 is faster).
VOICE_PROFILES = {
    "Emma (Female)": {
        "name": "Emma", "gender": "female", "f0": 210, "f0_variation": 30,
        "formant_shift": 1.15, "breathiness": 0.04, "speed": 1.0,
        "brightness": 1.1, "description": "Friendly female voice",
    },
    "James (Male)": {
        "name": "James", "gender": "male", "f0": 110, "f0_variation": 20,
        "formant_shift": 0.9, "breathiness": 0.02, "speed": 0.95,
        "brightness": 0.95, "description": "Professional male voice",
    },
    "Sophie (Child)": {
        "name": "Sophie", "gender": "child", "f0": 280, "f0_variation": 40,
        "formant_shift": 1.25, "breathiness": 0.03, "speed": 1.1,
        "brightness": 1.2, "description": "Young child voice",
    },
    "David (Deep Male)": {
        "name": "David", "gender": "male", "f0": 85, "f0_variation": 15,
        "formant_shift": 0.82, "breathiness": 0.02, "speed": 0.9,
        "brightness": 0.85, "description": "Deep bass voice",
    },
    "Lisa (Bright Female)": {
        "name": "Lisa", "gender": "female", "f0": 240, "f0_variation": 35,
        "formant_shift": 1.2, "breathiness": 0.05, "speed": 1.05,
        "brightness": 1.15, "description": "Bright, energetic female",
    },
    "Robert (Elderly Male)": {
        "name": "Robert", "gender": "male", "f0": 95, "f0_variation": 12,
        "formant_shift": 0.88, "breathiness": 0.06, "speed": 0.85,
        "brightness": 0.9, "description": "Mature elderly voice",
    },
    "Anna (Soft Female)": {
        "name": "Anna", "gender": "female", "f0": 195, "f0_variation": 25,
        "formant_shift": 1.1, "breathiness": 0.07, "speed": 0.92,
        "brightness": 1.0, "description": "Soft, gentle female",
    },
    "Mike (Energetic Male)": {
        "name": "Mike", "gender": "male", "f0": 130, "f0_variation": 30,
        "formant_shift": 0.95, "breathiness": 0.02, "speed": 1.1,
        "brightness": 1.05, "description": "Energetic young male",
    },
}

# Voices created at runtime (trained from audio or built manually), keyed by display name.
custom_voices = {}


# Vowel table: phoneme -> (F1, F2, F3 in Hz, duration in ms, amplitude, voiced).
VOWELS = {
    'IY': (280, 2250, 2890, 150, 1.0, True),
    'IH': (400, 1920, 2550, 120, 0.9, True),
    'EH': (550, 1770, 2490, 130, 0.95, True),
    'AE': (690, 1660, 2490, 140, 1.0, True),
    'AA': (710, 1100, 2540, 150, 1.0, True),
    'AO': (590, 880, 2540, 140, 0.95, True),
    'UH': (470, 1100, 2540, 120, 0.9, True),
    'UW': (310, 870, 2250, 150, 1.0, True),
    'AH': (640, 1200, 2400, 100, 0.85, True),
    'AX': (500, 1500, 2500, 80, 0.7, True),
    'ER': (500, 1350, 1700, 140, 0.9, True),
    'EY': (500, 1900, 2600, 160, 1.0, True),
    'AY': (700, 1200, 2600, 180, 1.0, True),
    'OY': (500, 900, 2500, 180, 1.0, True),
    'AW': (700, 1100, 2600, 180, 1.0, True),
    'OW': (500, 900, 2500, 160, 1.0, True),
}


# Consonant table: per-phoneme synthesis parameters.
# Durations ('closure', 'burst', 'dur', 'fric') are in ms; frequencies in Hz.
CONSONANTS = {
    'P': {'type': 'stop', 'closure': 80, 'burst': 30, 'voiced': False, 'burst_freq': 800, 'amp': 0.6},
    'B': {'type': 'stop', 'closure': 50, 'burst': 25, 'voiced': True, 'burst_freq': 800, 'amp': 0.7},
    'T': {'type': 'stop', 'closure': 70, 'burst': 30, 'voiced': False, 'burst_freq': 3500, 'amp': 0.7},
    'D': {'type': 'stop', 'closure': 40, 'burst': 25, 'voiced': True, 'burst_freq': 3500, 'amp': 0.7},
    'K': {'type': 'stop', 'closure': 80, 'burst': 40, 'voiced': False, 'burst_freq': 1500, 'amp': 0.7},
    'G': {'type': 'stop', 'closure': 50, 'burst': 30, 'voiced': True, 'burst_freq': 1500, 'amp': 0.7},
    'F': {'type': 'fric', 'dur': 120, 'freq_low': 1500, 'freq_high': 8000, 'voiced': False, 'amp': 0.4},
    'V': {'type': 'fric', 'dur': 80, 'freq_low': 1500, 'freq_high': 8000, 'voiced': True, 'amp': 0.5},
    'TH': {'type': 'fric', 'dur': 100, 'freq_low': 1400, 'freq_high': 6000, 'voiced': False, 'amp': 0.3},
    'DH': {'type': 'fric', 'dur': 60, 'freq_low': 1400, 'freq_high': 6000, 'voiced': True, 'amp': 0.5},
    'S': {'type': 'fric', 'dur': 120, 'freq_low': 4000, 'freq_high': 9000, 'voiced': False, 'amp': 0.5},
    'Z': {'type': 'fric', 'dur': 90, 'freq_low': 4000, 'freq_high': 9000, 'voiced': True, 'amp': 0.5},
    'SH': {'type': 'fric', 'dur': 120, 'freq_low': 2000, 'freq_high': 6000, 'voiced': False, 'amp': 0.5},
    'ZH': {'type': 'fric', 'dur': 80, 'freq_low': 2000, 'freq_high': 6000, 'voiced': True, 'amp': 0.5},
    'HH': {'type': 'fric', 'dur': 80, 'freq_low': 500, 'freq_high': 2000, 'voiced': False, 'amp': 0.3},
    'CH': {'type': 'affric', 'closure': 60, 'fric': 80, 'freq_low': 2000, 'freq_high': 6000, 'voiced': False, 'amp': 0.6},
    'JH': {'type': 'affric', 'closure': 40, 'fric': 60, 'freq_low': 2000, 'freq_high': 6000, 'voiced': True, 'amp': 0.6},
    'M': {'type': 'nasal', 'f1': 280, 'f2': 1000, 'f3': 2200, 'dur': 100, 'amp': 0.8},
    'N': {'type': 'nasal', 'f1': 280, 'f2': 1700, 'f3': 2500, 'dur': 90, 'amp': 0.8},
    'NG': {'type': 'nasal', 'f1': 300, 'f2': 2000, 'f3': 2700, 'dur': 100, 'amp': 0.8},
    'L': {'type': 'liquid', 'f1': 380, 'f2': 1000, 'f3': 2700, 'dur': 90, 'amp': 0.85},
    'R': {'type': 'liquid', 'f1': 350, 'f2': 1300, 'f3': 1700, 'dur': 90, 'amp': 0.85},
    'W': {'type': 'glide', 'f1': 300, 'f2': 700, 'f3': 2200, 'dur': 80, 'amp': 0.8},
    'Y': {'type': 'glide', 'f1': 280, 'f2': 2200, 'f3': 2900, 'dur': 70, 'amp': 0.8},
}

# Silence durations in ms: a short inter-word gap and a longer punctuation pause.
SILENCE = {'SIL': 60, 'PAU': 200}


# Hand-written pronunciation lexicon (word -> phoneme list).
# Words not found here fall back to the spelling PATTERNS and LETTERS rules below.
DICTIONARY = {
    # Articles, conjunctions, prepositions
    'a': ['AX'], 'an': ['AE', 'N'], 'the': ['DH', 'AX'],
    'and': ['AE', 'N', 'D'], 'or': ['AO', 'R'], 'but': ['B', 'AH', 'T'],
    'if': ['IH', 'F'], 'of': ['AH', 'V'], 'to': ['T', 'UW'],
    'in': ['IH', 'N'], 'on': ['AA', 'N'], 'at': ['AE', 'T'],
    'by': ['B', 'AY'], 'for': ['F', 'AO', 'R'], 'with': ['W', 'IH', 'TH'],
    'from': ['F', 'R', 'AH', 'M'], 'up': ['AH', 'P'], 'out': ['AW', 'T'],
    'as': ['AE', 'Z'], 'so': ['S', 'OW'], 'not': ['N', 'AA', 'T'],

    # Pronouns and question words
    'i': ['AY'], 'me': ['M', 'IY'], 'my': ['M', 'AY'],
    'you': ['Y', 'UW'], 'your': ['Y', 'AO', 'R'],
    'he': ['HH', 'IY'], 'him': ['HH', 'IH', 'M'], 'his': ['HH', 'IH', 'Z'],
    'she': ['SH', 'IY'], 'her': ['HH', 'ER'],
    'it': ['IH', 'T'], 'its': ['IH', 'T', 'S'],
    'we': ['W', 'IY'], 'us': ['AH', 'S'], 'our': ['AW', 'ER'],
    'they': ['DH', 'EY'], 'them': ['DH', 'EH', 'M'], 'their': ['DH', 'EH', 'R'],
    'this': ['DH', 'IH', 'S'], 'that': ['DH', 'AE', 'T'],
    'what': ['W', 'AH', 'T'], 'who': ['HH', 'UW'],
    'where': ['W', 'EH', 'R'], 'when': ['W', 'EH', 'N'],
    'why': ['W', 'AY'], 'how': ['HH', 'AW'], 'which': ['W', 'IH', 'CH'],

    # Forms of "be"
    'am': ['AE', 'M'], 'is': ['IH', 'Z'], 'are': ['AA', 'R'],
    'was': ['W', 'AA', 'Z'], 'were': ['W', 'ER'],
    'be': ['B', 'IY'], 'been': ['B', 'IH', 'N'], 'being': ['B', 'IY', 'IH', 'NG'],

    # Forms of "have"
    'have': ['HH', 'AE', 'V'], 'has': ['HH', 'AE', 'Z'],
    'had': ['HH', 'AE', 'D'], 'having': ['HH', 'AE', 'V', 'IH', 'NG'],

    # Forms of "do"
    'do': ['D', 'UW'], 'does': ['D', 'AH', 'Z'],
    'did': ['D', 'IH', 'D'], 'done': ['D', 'AH', 'N'],

    # Modal verbs
    'will': ['W', 'IH', 'L'], 'would': ['W', 'UH', 'D'],
    'can': ['K', 'AE', 'N'], 'could': ['K', 'UH', 'D'],
    'should': ['SH', 'UH', 'D'], 'may': ['M', 'EY'],
    'might': ['M', 'AY', 'T'], 'must': ['M', 'AH', 'S', 'T'],

    # Common verbs
    'go': ['G', 'OW'], 'goes': ['G', 'OW', 'Z'], 'going': ['G', 'OW', 'IH', 'NG'],
    'went': ['W', 'EH', 'N', 'T'], 'gone': ['G', 'AO', 'N'],
    'come': ['K', 'AH', 'M'], 'comes': ['K', 'AH', 'M', 'Z'],
    'coming': ['K', 'AH', 'M', 'IH', 'NG'], 'came': ['K', 'EY', 'M'],
    'get': ['G', 'EH', 'T'], 'gets': ['G', 'EH', 'T', 'S'],
    'getting': ['G', 'EH', 'T', 'IH', 'NG'], 'got': ['G', 'AA', 'T'],
    'make': ['M', 'EY', 'K'], 'makes': ['M', 'EY', 'K', 'S'],
    'making': ['M', 'EY', 'K', 'IH', 'NG'], 'made': ['M', 'EY', 'D'],
    'take': ['T', 'EY', 'K'], 'takes': ['T', 'EY', 'K', 'S'],
    'took': ['T', 'UH', 'K'], 'taken': ['T', 'EY', 'K', 'AX', 'N'],
    'see': ['S', 'IY'], 'sees': ['S', 'IY', 'Z'],
    'saw': ['S', 'AO'], 'seen': ['S', 'IY', 'N'],
    'say': ['S', 'EY'], 'says': ['S', 'EH', 'Z'], 'said': ['S', 'EH', 'D'],
    'know': ['N', 'OW'], 'knows': ['N', 'OW', 'Z'],
    'knew': ['N', 'UW'], 'known': ['N', 'OW', 'N'],
    'think': ['TH', 'IH', 'NG', 'K'], 'thought': ['TH', 'AO', 'T'],
    'want': ['W', 'AA', 'N', 'T'], 'wants': ['W', 'AA', 'N', 'T', 'S'],
    'give': ['G', 'IH', 'V'], 'gives': ['G', 'IH', 'V', 'Z'],
    'gave': ['G', 'EY', 'V'], 'given': ['G', 'IH', 'V', 'AX', 'N'],
    'tell': ['T', 'EH', 'L'], 'told': ['T', 'OW', 'L', 'D'],
    'ask': ['AE', 'S', 'K'], 'asked': ['AE', 'S', 'K', 'T'],
    'use': ['Y', 'UW', 'Z'], 'used': ['Y', 'UW', 'Z', 'D'],
    'find': ['F', 'AY', 'N', 'D'], 'found': ['F', 'AW', 'N', 'D'],
    'work': ['W', 'ER', 'K'], 'works': ['W', 'ER', 'K', 'S'],
    'call': ['K', 'AO', 'L'], 'called': ['K', 'AO', 'L', 'D'],
    'try': ['T', 'R', 'AY'], 'tried': ['T', 'R', 'AY', 'D'],
    'need': ['N', 'IY', 'D'], 'needs': ['N', 'IY', 'D', 'Z'],
    'feel': ['F', 'IY', 'L'], 'feels': ['F', 'IY', 'L', 'Z'],
    'help': ['HH', 'EH', 'L', 'P'], 'helps': ['HH', 'EH', 'L', 'P', 'S'],
    'keep': ['K', 'IY', 'P'], 'kept': ['K', 'EH', 'P', 'T'],
    'let': ['L', 'EH', 'T'], 'put': ['P', 'UH', 'T'],
    'seem': ['S', 'IY', 'M'], 'leave': ['L', 'IY', 'V'],
    'show': ['SH', 'OW'], 'hear': ['HH', 'IY', 'R'],
    'play': ['P', 'L', 'EY'], 'run': ['R', 'AH', 'N'],
    'move': ['M', 'UW', 'V'], 'live': ['L', 'IH', 'V'],
    'believe': ['B', 'IH', 'L', 'IY', 'V'],
    'read': ['R', 'IY', 'D'], 'write': ['R', 'AY', 'T'],
    'learn': ['L', 'ER', 'N'], 'speak': ['S', 'P', 'IY', 'K'],
    'look': ['L', 'UH', 'K'], 'like': ['L', 'AY', 'K'],
    'love': ['L', 'AH', 'V'], 'start': ['S', 'T', 'AA', 'R', 'T'],
    'stop': ['S', 'T', 'AA', 'P'],

    # Adjectives
    'good': ['G', 'UH', 'D'], 'better': ['B', 'EH', 'T', 'ER'],
    'best': ['B', 'EH', 'S', 'T'], 'bad': ['B', 'AE', 'D'],
    'new': ['N', 'UW'], 'old': ['OW', 'L', 'D'],
    'big': ['B', 'IH', 'G'], 'small': ['S', 'M', 'AO', 'L'],
    'long': ['L', 'AO', 'NG'], 'short': ['SH', 'AO', 'R', 'T'],
    'high': ['HH', 'AY'], 'low': ['L', 'OW'],
    'great': ['G', 'R', 'EY', 'T'], 'little': ['L', 'IH', 'T', 'AX', 'L'],
    'right': ['R', 'AY', 'T'], 'wrong': ['R', 'AO', 'NG'],
    'first': ['F', 'ER', 'S', 'T'], 'last': ['L', 'AE', 'S', 'T'],
    'same': ['S', 'EY', 'M'], 'different': ['D', 'IH', 'F', 'R', 'AX', 'N', 'T'],
    'own': ['OW', 'N'], 'other': ['AH', 'DH', 'ER'],
    'nice': ['N', 'AY', 'S'], 'happy': ['HH', 'AE', 'P', 'IY'],
    'sure': ['SH', 'UH', 'R'], 'true': ['T', 'R', 'UW'],
    'real': ['R', 'IY', 'L'], 'clear': ['K', 'L', 'IY', 'R'],
    'fine': ['F', 'AY', 'N'], 'free': ['F', 'R', 'IY'],
    'easy': ['IY', 'Z', 'IY'], 'hard': ['HH', 'AA', 'R', 'D'],
    'young': ['Y', 'AH', 'NG'], 'beautiful': ['B', 'Y', 'UW', 'T', 'IH', 'F', 'AX', 'L'],

    # Adverbs and intensifiers
    'very': ['V', 'EH', 'R', 'IY'], 'really': ['R', 'IY', 'L', 'IY'],
    'just': ['JH', 'AH', 'S', 'T'], 'only': ['OW', 'N', 'L', 'IY'],
    'also': ['AO', 'L', 'S', 'OW'], 'well': ['W', 'EH', 'L'],
    'now': ['N', 'AW'], 'then': ['DH', 'EH', 'N'],
    'here': ['HH', 'IY', 'R'], 'there': ['DH', 'EH', 'R'],
    'still': ['S', 'T', 'IH', 'L'], 'even': ['IY', 'V', 'AX', 'N'],
    'back': ['B', 'AE', 'K'], 'again': ['AX', 'G', 'EH', 'N'],
    'always': ['AO', 'L', 'W', 'EY', 'Z'], 'never': ['N', 'EH', 'V', 'ER'],
    'today': ['T', 'AX', 'D', 'EY'], 'maybe': ['M', 'EY', 'B', 'IY'],
    'too': ['T', 'UW'], 'much': ['M', 'AH', 'CH'],
    'more': ['M', 'AO', 'R'], 'most': ['M', 'OW', 'S', 'T'],
    'please': ['P', 'L', 'IY', 'Z'],

    # Common nouns
    'time': ['T', 'AY', 'M'], 'year': ['Y', 'IY', 'R'],
    'day': ['D', 'EY'], 'way': ['W', 'EY'],
    'man': ['M', 'AE', 'N'], 'woman': ['W', 'UH', 'M', 'AX', 'N'],
    'child': ['CH', 'AY', 'L', 'D'], 'world': ['W', 'ER', 'L', 'D'],
    'life': ['L', 'AY', 'F'], 'hand': ['HH', 'AE', 'N', 'D'],
    'part': ['P', 'AA', 'R', 'T'], 'place': ['P', 'L', 'EY', 'S'],
    'thing': ['TH', 'IH', 'NG'], 'things': ['TH', 'IH', 'NG', 'Z'],
    'people': ['P', 'IY', 'P', 'AX', 'L'], 'person': ['P', 'ER', 'S', 'AX', 'N'],
    'home': ['HH', 'OW', 'M'], 'house': ['HH', 'AW', 'S'],
    'word': ['W', 'ER', 'D'], 'name': ['N', 'EY', 'M'],
    'water': ['W', 'AO', 'T', 'ER'], 'money': ['M', 'AH', 'N', 'IY'],
    'family': ['F', 'AE', 'M', 'AX', 'L', 'IY'],
    'friend': ['F', 'R', 'EH', 'N', 'D'], 'friends': ['F', 'R', 'EH', 'N', 'D', 'Z'],
    'mother': ['M', 'AH', 'DH', 'ER'], 'father': ['F', 'AA', 'DH', 'ER'],
    'boy': ['B', 'OY'], 'girl': ['G', 'ER', 'L'],
    'head': ['HH', 'EH', 'D'], 'face': ['F', 'EY', 'S'],
    'eye': ['AY'], 'eyes': ['AY', 'Z'],
    'voice': ['V', 'OY', 'S'], 'night': ['N', 'AY', 'T'],
    'morning': ['M', 'AO', 'R', 'N', 'IH', 'NG'],
    'week': ['W', 'IY', 'K'], 'month': ['M', 'AH', 'N', 'TH'],
    'school': ['S', 'K', 'UW', 'L'], 'book': ['B', 'UH', 'K'],
    'story': ['S', 'T', 'AO', 'R', 'IY'],
    'question': ['K', 'W', 'EH', 'S', 'CH', 'AX', 'N'],
    'answer': ['AE', 'N', 'S', 'ER'],

    # Numbers
    'zero': ['Z', 'IY', 'R', 'OW'], 'one': ['W', 'AH', 'N'],
    'two': ['T', 'UW'], 'three': ['TH', 'R', 'IY'],
    'four': ['F', 'AO', 'R'], 'five': ['F', 'AY', 'V'],
    'six': ['S', 'IH', 'K', 'S'], 'seven': ['S', 'EH', 'V', 'AX', 'N'],
    'eight': ['EY', 'T'], 'nine': ['N', 'AY', 'N'], 'ten': ['T', 'EH', 'N'],

    # Greetings and interjections
    'hello': ['HH', 'AX', 'L', 'OW'], 'hi': ['HH', 'AY'],
    'hey': ['HH', 'EY'], 'goodbye': ['G', 'UH', 'D', 'B', 'AY'],
    'bye': ['B', 'AY'], 'welcome': ['W', 'EH', 'L', 'K', 'AX', 'M'],
    'thank': ['TH', 'AE', 'NG', 'K'], 'thanks': ['TH', 'AE', 'NG', 'K', 'S'],
    'sorry': ['S', 'AA', 'R', 'IY'],
    'yes': ['Y', 'EH', 'S'], 'yeah': ['Y', 'AE'], 'no': ['N', 'OW'],
    'ok': ['OW', 'K', 'EY'], 'okay': ['OW', 'K', 'EY'],

    # Domain words
    'text': ['T', 'EH', 'K', 'S', 'T'], 'speech': ['S', 'P', 'IY', 'CH'],
    'sound': ['S', 'AW', 'N', 'D'], 'audio': ['AO', 'D', 'IY', 'OW'],
    'test': ['T', 'EH', 'S', 'T'], 'testing': ['T', 'EH', 'S', 'T', 'IH', 'NG'],
    'computer': ['K', 'AX', 'M', 'P', 'Y', 'UW', 'T', 'ER'],
    'vedes': ['V', 'EY', 'D', 'EH', 'S'],
    'system': ['S', 'IH', 'S', 'T', 'AX', 'M'],
    'train': ['T', 'R', 'EY', 'N'], 'training': ['T', 'R', 'EY', 'N', 'IH', 'NG'],
}


# Spelling-to-phoneme patterns for out-of-dictionary words.
# TextToPhoneme sorts these by pattern length so the longest match wins.
PATTERNS = [
    ('tion', ['SH', 'AX', 'N']), ('sion', ['ZH', 'AX', 'N']),
    ('ness', ['N', 'AX', 'S']), ('ment', ['M', 'AX', 'N', 'T']),
    ('able', ['AX', 'B', 'AX', 'L']), ('ible', ['AX', 'B', 'AX', 'L']),
    ('ful', ['F', 'AX', 'L']), ('less', ['L', 'AX', 'S']),
    ('ing', ['IH', 'NG']), ('ight', ['AY', 'T']),
    ('ough', ['AO']), ('ould', ['UH', 'D']),
    ('th', ['TH']), ('sh', ['SH']), ('ch', ['CH']),
    ('wh', ['W']), ('ph', ['F']), ('ck', ['K']), ('ng', ['NG']),
    ('qu', ['K', 'W']), ('ee', ['IY']), ('ea', ['IY']),
    ('oo', ['UW']), ('ou', ['AW']), ('ow', ['OW']),
    ('ai', ['EY']), ('ay', ['EY']), ('ey', ['IY']),
    ('oy', ['OY']), ('oi', ['OY']), ('ie', ['IY']),
    ('er', ['ER']), ('ir', ['ER']), ('ur', ['ER']),
    ('ar', ['AA', 'R']), ('or', ['AO', 'R']),
]


LETTERS = {
    'a': 'AE', 'b': 'B', 'c': 'K', 'd': 'D', 'e': 'EH',
    'f': 'F', 'g': 'G', 'h': 'HH', 'i': 'IH', 'j': 'JH',
    'k': 'K', 'l': 'L', 'm': 'M', 'n': 'N', 'o': 'AA',
    'p': 'P', 'q': 'K', 'r': 'R', 's': 'S', 't': 'T',
    'u': 'AH', 'v': 'V', 'w': 'W', 'x': 'K', 'y': 'IY', 'z': 'Z',
}
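

# How the three tables above combine (implemented by TextToPhoneme further below):
# a word is first looked up in DICTIONARY; misses are scanned left-to-right against
# PATTERNS (longest pattern first), and any leftover character falls back to LETTERS.
# For example, "speaking" (not in DICTIONARY) resolves roughly as
# 's' -> S, 'p' -> P, 'ea' -> IY, 'k' -> K, 'ing' -> IH NG.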


class VoiceAnalyzer:
    """Analyze audio to extract voice characteristics."""

    def __init__(self, sample_rate=22050):
        self.sr = sample_rate

    def analyze(self, audio):
        """Extract voice features from an audio sample and return a voice profile dict."""
        # Require at least ~0.3 s of audio to get a usable estimate.
        if len(audio) < self.sr * 0.3:
            return None

        audio = audio.astype(np.float32)
        max_val = np.max(np.abs(audio))
        if max_val > 0:
            audio = audio / max_val

        f0 = self._estimate_pitch(audio)
        formants = self._estimate_formants(audio)
        breathiness = self._estimate_breathiness(audio)

        profile = {
            "name": "Custom Voice",
            "gender": "custom",
            "f0": f0,
            "f0_variation": self._estimate_f0_variation(audio, f0),
            "formant_shift": formants.get('shift', 1.0),
            "breathiness": breathiness,
            "speed": 1.0,
            "brightness": formants.get('brightness', 1.0),
            "description": f"Custom voice (F0={f0:.0f}Hz)"
        }

        return profile

    def _estimate_pitch(self, audio):
        """Estimate the fundamental frequency using frame-wise autocorrelation."""
        frame_size = int(self.sr * 0.03)
        pitches = []

        for i in range(0, len(audio) - frame_size, frame_size):
            frame = audio[i:i + frame_size]
            frame = frame - np.mean(frame)

            # Autocorrelation of the frame (non-negative lags only).
            corr = np.correlate(frame, frame, mode='full')
            corr = corr[len(corr)//2:]

            # Skip the initial falling slope, then take the strongest peak within
            # lags corresponding to F0 >= 60 Hz.
            d = np.diff(corr)
            start_indices = np.where(d > 0)[0]

            if len(start_indices) > 0:
                start = start_indices[0]
                search_end = min(start + int(self.sr / 60), len(corr))

                if search_end > start:
                    peak = start + np.argmax(corr[start:search_end])

                    if peak > 0:
                        f0 = self.sr / peak
                        if 60 < f0 < 400:
                            pitches.append(f0)

        if pitches:
            return float(np.median(pitches))
        return 130.0

    def _estimate_f0_variation(self, audio, base_f0):
        """Estimate pitch variation (same frame-wise autocorrelation as _estimate_pitch)."""
        frame_size = int(self.sr * 0.03)
        pitches = []

        for i in range(0, len(audio) - frame_size, frame_size):
            frame = audio[i:i + frame_size]
            frame = frame - np.mean(frame)

            corr = np.correlate(frame, frame, mode='full')
            corr = corr[len(corr)//2:]

            d = np.diff(corr)
            start_indices = np.where(d > 0)[0]

            if len(start_indices) > 0:
                start = start_indices[0]
                search_end = min(start + int(self.sr / 60), len(corr))

                if search_end > start:
                    peak = start + np.argmax(corr[start:search_end])
                    if peak > 0:
                        f0 = self.sr / peak
                        if 60 < f0 < 400:
                            pitches.append(f0)

        if len(pitches) > 2:
            return min(float(np.std(pitches)), 50.0)
        return 20.0

    def _estimate_formants(self, audio):
        """Estimate formant character from the spectral centroid of the first frame."""
        frame_size = 2048

        if len(audio) < frame_size:
            return {'shift': 1.0, 'brightness': 1.0}

        spectrum = np.abs(np.fft.rfft(audio[:frame_size] * np.hanning(frame_size)))
        freqs = np.fft.rfftfreq(frame_size, 1/self.sr)

        total_energy = np.sum(spectrum) + 1e-8
        centroid = np.sum(freqs * spectrum) / total_energy

        # Map the centroid onto coarse formant-shift / brightness bands.
        if centroid > 1600:
            shift = 1.2
            brightness = 1.15
        elif centroid > 1400:
            shift = 1.1
            brightness = 1.05
        elif centroid > 1200:
            shift = 1.0
            brightness = 1.0
        elif centroid > 1000:
            shift = 0.9
            brightness = 0.95
        else:
            shift = 0.85
            brightness = 0.9

        return {'shift': shift, 'brightness': brightness}

    def _estimate_breathiness(self, audio):
        """Estimate breathiness from the high-band to low-band energy ratio."""
        frame_size = 2048

        if len(audio) < frame_size:
            return 0.03

        spectrum = np.abs(np.fft.rfft(audio[:frame_size]))
        freqs = np.fft.rfftfreq(frame_size, 1/self.sr)

        low_mask = freqs < 1000
        high_mask = (freqs > 2000) & (freqs < 5000)

        low_energy = np.sum(spectrum[low_mask]) + 1e-8
        high_energy = np.sum(spectrum[high_mask])

        ratio = high_energy / low_energy
        breathiness = float(np.clip(ratio * 0.1, 0.02, 0.1))

        return breathiness
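

# Usage sketch for VoiceAnalyzer (illustration only; "sample.wav" is a
# hypothetical mono recording of at least ~0.3 s):
#
#     sr, clip = wavfile.read("sample.wav")
#     profile = VoiceAnalyzer(sample_rate=sr).analyze(clip.astype(np.float32))
#     if profile:
#         print(profile["f0"], profile["formant_shift"], profile["breathiness"])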


class TextToPhoneme:
    """Rule-based grapheme-to-phoneme converter (dictionary, then patterns, then letters)."""

    def __init__(self):
        self.dictionary = DICTIONARY
        # Try longer spelling patterns before shorter ones.
        self.patterns = sorted(PATTERNS, key=lambda x: -len(x[0]))

    def convert(self, text):
        text = text.lower().strip()
        text = re.sub(r"[^\w\s.,!?']", '', text)

        tokens = re.findall(r"[\w']+|[.,!?]", text)
        phonemes = []

        for i, token in enumerate(tokens):
            if token in '.,!?':
                phonemes.append('PAU')
            elif token in self.dictionary:
                phonemes.extend(self.dictionary[token])
                if i < len(tokens) - 1 and tokens[i + 1] not in '.,!?':
                    phonemes.append('SIL')
            else:
                phons = self._convert_word(token)
                phonemes.extend(phons)
                if i < len(tokens) - 1 and tokens[i + 1] not in '.,!?':
                    phonemes.append('SIL')

        return phonemes

    def _convert_word(self, word):
        phonemes = []
        i = 0

        while i < len(word):
            matched = False

            for pattern, phons in self.patterns:
                if word[i:].startswith(pattern):
                    phonemes.extend(phons)
                    i += len(pattern)
                    matched = True
                    break

            if not matched:
                char = word[i]
                if char in LETTERS:
                    phonemes.append(LETTERS[char])
                i += 1

        return phonemes
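

# A quick trace of TextToPhoneme.convert() against the tables above:
#
#     >>> TextToPhoneme().convert("hello world")
#     ['HH', 'AX', 'L', 'OW', 'SIL', 'W', 'ER', 'L', 'D']
#
# Both words are DICTIONARY hits here; punctuation becomes 'PAU' and word
# boundaries become 'SIL'.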


class VoiceSynthesizer:
    """Formant-based phoneme synthesizer driven by a voice profile dict."""

    def __init__(self, sample_rate=22050):
        self.sr = sample_rate
        self.default_voice = VOICE_PROFILES["Emma (Female)"]

    def synthesize(self, phonemes, voice_profile=None, rate=1.0, pitch=1.0):
        if not phonemes:
            return np.zeros(int(self.sr * 0.5), dtype=np.float32)

        voice = voice_profile or self.default_voice

        f0 = voice.get('f0', 130) * pitch
        f0_var = voice.get('f0_variation', 20)
        formant_shift = voice.get('formant_shift', 1.0)
        breathiness = voice.get('breathiness', 0.03)
        voice_speed = voice.get('speed', 1.0) * rate
        brightness = voice.get('brightness', 1.0)

        segments = []

        for i, phon in enumerate(phonemes):
            prev_phon = phonemes[i - 1] if i > 0 else None
            next_phon = phonemes[i + 1] if i < len(phonemes) - 1 else None

            # Simple phrase-level intonation: F0 rises then falls across the utterance.
            phrase_pos = i / max(len(phonemes), 1)
            f0_current = f0 + f0_var * np.sin(phrase_pos * np.pi) * 0.5

            seg = self._synth_phoneme(
                phon, f0_current, voice_speed, formant_shift,
                breathiness, brightness, prev_phon, next_phon
            )
            segments.append(seg)

        audio = self._smooth_concat(segments)
        audio = self._normalize(audio)

        return audio.astype(np.float32)

    def _synth_phoneme(self, phon, f0, speed, formant_shift, breathiness,
                       brightness, prev_phon, next_phon):
        # prev_phon / next_phon are accepted for context but currently unused.
        if phon in SILENCE:
            dur = int(self.sr * SILENCE[phon] / 1000 / speed)
            return np.zeros(dur, dtype=np.float32)

        if phon in VOWELS:
            return self._synth_vowel(phon, f0, speed, formant_shift,
                                     breathiness, brightness)

        if phon in CONSONANTS:
            return self._synth_consonant(phon, f0, speed, formant_shift, breathiness)

        return np.zeros(100, dtype=np.float32)

    def _synth_vowel(self, phon, f0, speed, formant_shift, breathiness, brightness):
        params = VOWELS[phon]
        f1, f2, f3, dur_ms, amp, voiced = params

        f1 = f1 * formant_shift
        f2 = f2 * formant_shift * brightness
        f3 = f3 * formant_shift * brightness

        dur_ms = dur_ms / speed
        n = int(self.sr * dur_ms / 1000)
        n = max(n, 100)
        t = np.arange(n) / self.sr

        source = self._glottal_source(t, f0, breathiness)
        audio = self._apply_formants(source, f1, f2, f3)
        envelope = self._vowel_envelope(n)
        audio = audio * envelope * amp

        return audio

    def _synth_consonant(self, phon, f0, speed, formant_shift, breathiness):
        params = CONSONANTS[phon]
        ctype = params['type']

        if ctype == 'stop':
            return self._synth_stop(params, f0, speed, formant_shift)
        elif ctype == 'fric':
            return self._synth_fricative(params, f0, speed)
        elif ctype == 'affric':
            return self._synth_affricate(params, f0, speed)
        elif ctype == 'nasal':
            return self._synth_nasal(params, f0, speed, formant_shift, breathiness)
        elif ctype == 'liquid':
            return self._synth_liquid(params, f0, speed, formant_shift, breathiness)
        elif ctype == 'glide':
            return self._synth_glide(params, f0, speed, formant_shift, breathiness)

        return np.zeros(100, dtype=np.float32)

    def _glottal_source(self, t, f0, breathiness):
        """Simplified glottal pulse train: raised-cosine opening, cosine closing, plus noise."""
        T0 = 1.0 / f0
        phase = (t % T0) / T0

        glottal = np.zeros_like(t)
        mask1 = phase < 0.4
        glottal[mask1] = 0.5 * (1 - np.cos(np.pi * phase[mask1] / 0.4))

        mask2 = (phase >= 0.4) & (phase < 0.6)
        glottal[mask2] = np.cos(np.pi * (phase[mask2] - 0.4) / 0.4)

        # Breath noise plus a slow 5 Hz amplitude wobble to keep the timbre from sounding static.
        glottal += np.random.randn(len(t)) * breathiness
        shimmer = 1 + 0.02 * np.sin(2 * np.pi * 5 * t)
        glottal *= shimmer

        return glottal

    def _apply_formants(self, source, f1, f2, f3):
        # Parallel bank of three resonators with progressively wider bandwidths.
        formants = [(f1, 90), (f2, 110), (f3, 130)]
        result = np.zeros_like(source)

        for freq, bw in formants:
            result += self._resonator(source, freq, bw)

        return result

    def _resonator(self, sig, freq, bw):
        """Two-pole resonator (formant filter) centered at `freq` Hz with bandwidth `bw` Hz."""
        if freq <= 0 or freq >= self.sr / 2:
            return sig

        r = np.exp(-np.pi * bw / self.sr)
        theta = 2 * np.pi * freq / self.sr

        a1 = -2 * r * np.cos(theta)
        a2 = r * r
        b0 = 1 - r

        y = np.zeros_like(sig)
        for i in range(2, len(sig)):
            y[i] = b0 * sig[i] - a1 * y[i-1] - a2 * y[i-2]

        return y
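
    # Note: the per-sample loop above is the slowest part of synthesis. An
    # equivalent vectorized form (a sketch, not wired in) would be:
    #
    #     y = signal.lfilter([b0], [1.0, a1, a2], sig)
    #
    # which implements the same difference equation; the only difference is that
    # the loop leaves the first two output samples at zero.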

    def _vowel_envelope(self, n):
        env = np.ones(n)
        attack = max(1, n // 10)
        release = max(1, int(n * 0.15))

        env[:attack] = np.sin(np.linspace(0, np.pi/2, attack)) ** 2
        env[-release:] = np.cos(np.linspace(0, np.pi/2, release)) ** 2

        return env

    def _consonant_envelope(self, n):
        env = np.ones(n)
        attack = max(1, n // 8)
        release = max(1, n // 6)

        env[:attack] = np.linspace(0.1, 1, attack)
        env[-release:] = np.linspace(1, 0.1, release)

        return env

    def _synth_stop(self, params, f0, speed, formant_shift):
        closure_ms = params['closure'] / speed
        burst_ms = params['burst'] / speed

        closure_n = int(self.sr * closure_ms / 1000)
        burst_n = int(self.sr * burst_ms / 1000)
        total_n = closure_n + burst_n

        audio = np.zeros(total_n, dtype=np.float32)

        # Voiced stops get a low-level "voice bar" during the closure.
        if params['voiced']:
            t = np.arange(closure_n) / self.sr
            voice_bar = np.sin(2 * np.pi * f0 * 0.8 * t) * 0.15
            audio[:closure_n] = voice_bar

        # Release burst: low-passed noise with an exponential decay.
        burst = np.random.randn(burst_n)
        burst_freq = params['burst_freq'] * formant_shift

        try:
            if burst_freq < self.sr / 2 - 100:
                b, a = signal.butter(2, burst_freq / (self.sr / 2), 'low')
                burst = signal.filtfilt(b, a, burst)
        except Exception:
            pass

        burst_env = np.exp(-np.linspace(0, 5, burst_n))
        burst *= burst_env * params['amp']

        audio[closure_n:] = burst

        return audio

    def _synth_fricative(self, params, f0, speed):
        dur_ms = params['dur'] / speed
        n = int(self.sr * dur_ms / 1000)

        # Band-limited noise shaped by the consonant's frequency range.
        noise = np.random.randn(n)

        low = params['freq_low']
        high = min(params['freq_high'], self.sr / 2 - 100)

        try:
            if low < high:
                b, a = signal.butter(4, [low / (self.sr / 2), high / (self.sr / 2)], 'band')
                noise = signal.filtfilt(b, a, noise)
        except Exception:
            pass

        audio = noise * params['amp']

        # Voiced fricatives mix in a weak glottal component.
        if params['voiced']:
            t = np.arange(n) / self.sr
            voice = self._glottal_source(t, f0, 0.03) * 0.3
            audio = audio + voice

        audio *= self._consonant_envelope(n)

        return audio.astype(np.float32)

    def _synth_affricate(self, params, f0, speed):
        closure_ms = params['closure'] / speed
        fric_ms = params['fric'] / speed

        closure_n = int(self.sr * closure_ms / 1000)
        fric_n = int(self.sr * fric_ms / 1000)

        audio = np.zeros(closure_n + fric_n, dtype=np.float32)

        if params['voiced']:
            t = np.arange(closure_n) / self.sr
            audio[:closure_n] = np.sin(2 * np.pi * f0 * 0.8 * t) * 0.1

        fric = np.random.randn(fric_n)
        low = params['freq_low']
        high = min(params['freq_high'], self.sr / 2 - 100)

        try:
            b, a = signal.butter(3, [low / (self.sr / 2), high / (self.sr / 2)], 'band')
            fric = signal.filtfilt(b, a, fric)
        except Exception:
            pass

        fric *= params['amp']

        fric_env = np.ones(fric_n)
        attack = fric_n // 6
        release = fric_n // 3
        fric_env[:attack] = np.linspace(0, 1, attack)
        fric_env[-release:] = np.linspace(1, 0, release)

        audio[closure_n:] = fric * fric_env

        return audio

    def _synth_nasal(self, params, f0, speed, formant_shift, breathiness):
        dur_ms = params['dur'] / speed
        n = int(self.sr * dur_ms / 1000)
        t = np.arange(n) / self.sr

        source = self._glottal_source(t, f0, breathiness)

        f1 = params['f1'] * formant_shift
        f2 = params['f2'] * formant_shift
        f3 = params['f3'] * formant_shift

        audio = self._apply_formants(source, f1, f2, f3)

        # Extra low nasal resonance, then a low-pass for the muffled nasal quality.
        nasal_pole = self._resonator(source, 250, 100) * 0.4
        audio += nasal_pole

        try:
            b, a = signal.butter(2, 800 / (self.sr / 2), 'low')
            audio = signal.filtfilt(b, a, audio)
        except Exception:
            pass

        audio *= params['amp'] * self._consonant_envelope(n)

        return audio.astype(np.float32)

    def _synth_liquid(self, params, f0, speed, formant_shift, breathiness):
        dur_ms = params['dur'] / speed
        n = int(self.sr * dur_ms / 1000)
        t = np.arange(n) / self.sr

        source = self._glottal_source(t, f0, breathiness)

        f1 = params['f1'] * formant_shift
        f2 = params['f2'] * formant_shift
        f3 = params['f3'] * formant_shift

        audio = self._apply_formants(source, f1, f2, f3)
        audio *= params['amp'] * self._consonant_envelope(n)

        return audio.astype(np.float32)

    def _synth_glide(self, params, f0, speed, formant_shift, breathiness):
        dur_ms = params['dur'] / speed
        n = int(self.sr * dur_ms / 1000)
        t = np.arange(n) / self.sr

        source = self._glottal_source(t, f0, breathiness)

        f1 = params['f1'] * formant_shift
        f2 = params['f2'] * formant_shift
        f3 = params['f3'] * formant_shift

        audio = self._apply_formants(source, f1, f2, f3)
        audio *= params['amp'] * self._consonant_envelope(n)

        return audio.astype(np.float32)

    def _smooth_concat(self, segments):
        """Concatenate segments with a short square-root cross-fade at each joint."""
        if not segments:
            return np.zeros(1000, dtype=np.float32)

        if len(segments) == 1:
            return segments[0]

        overlap = 64
        # Each joint advances the write position by overlap // 2 samples, so size the
        # buffer accordingly (sizing with the full overlap would clip the tail of long
        # utterances).
        total_len = sum(len(s) for s in segments) - (overlap // 2) * (len(segments) - 1)
        total_len = max(total_len, 100)

        audio = np.zeros(total_len, dtype=np.float32)
        pos = 0

        for i, seg in enumerate(segments):
            if len(seg) == 0:
                continue

            end_pos = min(pos + len(seg), total_len)
            seg_len = end_pos - pos

            if seg_len <= 0:
                break

            seg_to_add = seg[:seg_len].copy()

            if i > 0 and pos > overlap:
                fade_len = min(overlap, seg_len)
                fade_in = np.linspace(0, 1, fade_len) ** 0.5
                fade_out = np.linspace(1, 0, fade_len) ** 0.5

                audio[pos:pos + fade_len] *= fade_out
                seg_to_add[:fade_len] *= fade_in

            audio[pos:end_pos] += seg_to_add
            pos = end_pos - overlap // 2
            pos = max(0, pos)

        return audio

    def _normalize(self, audio):
        if len(audio) < 100:
            return audio

        # Remove DC, peak-normalize to 0.9, and fade the edges to avoid clicks.
        audio = audio - np.mean(audio)
        max_val = np.max(np.abs(audio))
        if max_val > 0:
            audio = audio / max_val * 0.9

        fade = min(len(audio) // 40, 200)
        audio[:fade] *= np.linspace(0, 1, fade)
        audio[-fade:] *= np.linspace(1, 0, fade)

        return audio


class VedesTTS:
    """Facade tying together text-to-phoneme conversion, synthesis, and voice analysis."""

    def __init__(self, sample_rate=22050):
        self.sr = sample_rate
        self.text_to_phoneme = TextToPhoneme()
        self.synthesizer = VoiceSynthesizer(sample_rate)
        self.voice_analyzer = VoiceAnalyzer(sample_rate)
        self.current_voice = VOICE_PROFILES["Emma (Female)"]

    def get_voice(self, voice_name):
        if voice_name in VOICE_PROFILES:
            return VOICE_PROFILES[voice_name]
        elif voice_name in custom_voices:
            return custom_voices[voice_name]
        return self.current_voice

    def speak(self, text, rate=1.0, pitch=1.0, voice_name=None):
        if not text or not text.strip():
            return np.zeros(self.sr, dtype=np.float32)

        voice = self.get_voice(voice_name) if voice_name else self.current_voice
        phonemes = self.text_to_phoneme.convert(text)

        if not phonemes:
            return np.zeros(self.sr, dtype=np.float32)

        audio = self.synthesizer.synthesize(phonemes, voice, rate, pitch)

        return audio

    def train_voice(self, audio_data, voice_name="My Voice"):
        """Train a new voice from an audio sample and register it in custom_voices."""
        global custom_voices

        if audio_data is None:
            return None

        # Gradio microphone/upload input arrives as (sample_rate, samples).
        if isinstance(audio_data, tuple):
            sr, audio = audio_data
            audio = audio.astype(np.float32)

            # Mix stereo down to mono.
            if len(audio.shape) > 1:
                audio = audio.mean(axis=1)

            # Resample to the synthesizer's rate if necessary.
            if sr != self.sr:
                duration = len(audio) / sr
                new_length = int(duration * self.sr)
                audio = signal.resample(audio, new_length)
        else:
            audio = audio_data.astype(np.float32)

        max_val = np.max(np.abs(audio))
        if max_val > 0:
            audio = audio / max_val

        profile = self.voice_analyzer.analyze(audio)

        if profile:
            profile['name'] = voice_name
            profile['description'] = f"Trained voice (F0={profile['f0']:.0f}Hz)"
            custom_voices[voice_name] = profile
            return profile

        return None
print("=" * 50) |
|
|
print("🎙️ VEDES TTS - With Voice Training") |
|
|
print("=" * 50) |
|
|
|
|
|
tts = VedesTTS(SAMPLE_RATE) |
|
|
|
|
|
print("✅ Ready!") |
|
|
print(f"📢 Available voices: {len(VOICE_PROFILES)}") |
|
|
print("=" * 50) |


def get_all_voices():
    """Get list of all available voices"""
    voices = list(VOICE_PROFILES.keys()) + list(custom_voices.keys())
    return voices


def get_voice_info(voice_name):
    """Get info about a voice"""
    if voice_name in VOICE_PROFILES:
        v = VOICE_PROFILES[voice_name]
    elif voice_name in custom_voices:
        v = custom_voices[voice_name]
    else:
        return "Select a voice"

    return f"""
**{v.get('name', voice_name)}**
- Type: {v.get('gender', 'unknown').title()}
- Pitch: {v.get('f0', 130):.0f} Hz
- {v.get('description', '')}
"""


def synthesize(text, voice_name, rate, pitch):
    """Synthesize speech for the Gradio UI."""
    if not text or not text.strip():
        return None

    text = text.strip()[:300]

    try:
        # The pitch slider is in semitones; convert to a frequency multiplier.
        pitch_mult = 2 ** (pitch / 12)
        audio = tts.speak(text, rate=rate, pitch=pitch_mult, voice_name=voice_name)

        if len(audio) < 100:
            return None

        audio = np.clip(audio, -1, 1)
        audio_int16 = (audio * 32767).astype(np.int16)

        return (SAMPLE_RATE, audio_int16)

    except Exception as e:
        print(f"Synthesis error: {e}")
        return None


def train_voice(audio, voice_name):
    """Train a new voice from audio"""
    global custom_voices

    if audio is None:
        return "❌ Please record or upload audio first.", get_all_voices()

    if not voice_name or not voice_name.strip():
        voice_name = f"Custom Voice {len(custom_voices) + 1}"

    voice_name = voice_name.strip()[:30]

    # Avoid overwriting a built-in profile of the same name.
    if voice_name in VOICE_PROFILES:
        voice_name = f"{voice_name} (custom)"

    try:
        profile = tts.train_voice(audio, voice_name)

        if profile:
            result = f"""
✅ **Voice "{voice_name}" created!**

**Detected Parameters:**
- Pitch (F0): {profile['f0']:.1f} Hz
- Pitch Variation: {profile['f0_variation']:.1f} Hz
- Formant Shift: {profile['formant_shift']:.2f}
- Breathiness: {profile['breathiness']:.3f}
- Brightness: {profile['brightness']:.2f}

You can now select this voice in the Speak tab!
"""
            return result, get_all_voices()
        else:
            return "❌ Could not analyze the voice. Try a longer or clearer sample.", get_all_voices()

    except Exception as e:
        return f"❌ Error: {str(e)}", get_all_voices()


def create_custom_voice(name, pitch, formant, breathiness, speed, brightness):
    """Create a custom voice from parameters"""
    global custom_voices

    if not name or not name.strip():
        return "❌ Please enter a voice name.", get_all_voices()

    name = name.strip()[:30]

    if name in VOICE_PROFILES:
        name = f"{name} (custom)"

    profile = {
        "name": name,
        "gender": "custom",
        "f0": pitch,
        "f0_variation": 25,
        "formant_shift": formant,
        "breathiness": breathiness / 100,  # the slider value is a percentage
        "speed": speed,
        "brightness": brightness,
        "description": f"Custom voice (F0={pitch}Hz)"
    }

    custom_voices[name] = profile

    return f"✅ Voice **{name}** created! Select it in the Speak tab.", get_all_voices()


def refresh_voices():
    """Refresh the voice list"""
    return gr.update(choices=get_all_voices())
with gr.Blocks(title="Vedes TTS", theme=gr.themes.Soft()) as demo: |
|
|
|
|
|
gr.Markdown(""" |
|
|
# 🎙️ Vedes TTS - Voice Training Edition |
|
|
### Create and Use Custom Voices - 100% From Scratch |
|
|
""") |
|
|
|
|
|
with gr.Tabs(): |
|
|
|
|
|
with gr.TabItem("🔊 Speak"): |
|
|
with gr.Row(): |
|
|
with gr.Column(scale=2): |
|
|
text_input = gr.Textbox( |
|
|
label="📝 Text to Speak", |
|
|
placeholder="Type something...", |
|
|
lines=3 |
|
|
) |
|
|
|
|
|
with gr.Row(): |
|
|
voice_select = gr.Dropdown( |
|
|
choices=get_all_voices(), |
|
|
value="Emma (Female)", |
|
|
label="🗣️ Voice", |
|
|
interactive=True |
|
|
) |
|
|
refresh_btn = gr.Button("🔄", size="sm") |
|
|
|
|
|
voice_info = gr.Markdown("Select a voice") |
|
|
|
|
|
with gr.Row(): |
|
|
rate = gr.Slider(0.6, 1.5, 0.9, step=0.1, label="⏱️ Speed") |
|
|
pitch = gr.Slider(-6, 6, 0, step=1, label="🎵 Pitch") |
|
|
|
|
|
speak_btn = gr.Button("🔊 Speak", variant="primary", size="lg") |
|
|
|
|
|
with gr.Column(scale=1): |
|
|
audio_out = gr.Audio(label="🎧 Output", type="numpy") |
|
|
|
|
|
gr.Examples( |
|
|
examples=[ |
|
|
["Hello, how are you?"], |
|
|
["Good morning!"], |
|
|
["My name is Vedes."], |
|
|
["Thank you very much."], |
|
|
["Have a nice day."], |
|
|
], |
|
|
inputs=text_input, |
|
|
label="📚 Examples" |
|
|
) |
|
|
|
|
|
|
|
|
with gr.TabItem("🎤 Train Voice"): |
|
|
gr.Markdown(""" |
|
|
### Train a New Voice from Audio |
|
|
|
|
|
Record or upload 3-10 seconds of clear speech. |
|
|
|
|
|
**Tips:** |
|
|
- Speak naturally and clearly |
|
|
- Avoid background noise |
|
|
- Read a few sentences |
|
|
""") |
|
|
|
|
|
with gr.Row(): |
|
|
with gr.Column(): |
|
|
audio_input = gr.Audio( |
|
|
label="🎤 Record or Upload", |
|
|
sources=["microphone", "upload"], |
|
|
type="numpy" |
|
|
) |
|
|
|
|
|
voice_name_input = gr.Textbox( |
|
|
label="Voice Name", |
|
|
placeholder="e.g., My Voice", |
|
|
value="" |
|
|
) |
|
|
|
|
|
train_btn = gr.Button("🧠 Train Voice", variant="primary") |
|
|
|
|
|
with gr.Column(): |
|
|
train_result = gr.Markdown("Record audio and click Train") |
|
|
|
|
|
gr.Markdown(""" |
|
|
### What Gets Analyzed: |
|
|
- **Pitch (F0)**: How high/low the voice is |
|
|
- **Formants**: Voice quality/timbre |
|
|
- **Breathiness**: Air in the voice |
|
|
""") |
|
|
|
|
|
|
|
|
with gr.TabItem("⚙️ Create Voice"): |
|
|
gr.Markdown("### Create Custom Voice Manually") |
|
|
|
|
|
with gr.Row(): |
|
|
with gr.Column(): |
|
|
custom_name = gr.Textbox( |
|
|
label="Voice Name", |
|
|
placeholder="My Custom Voice" |
|
|
) |
|
|
|
|
|
custom_pitch = gr.Slider( |
|
|
60, 300, 150, |
|
|
label="Pitch (Hz)", |
|
|
info="60-130=Male, 150-250=Female, 250+=Child" |
|
|
) |
|
|
|
|
|
custom_formant = gr.Slider( |
|
|
0.7, 1.4, 1.0, step=0.05, |
|
|
label="Formant Shift", |
|
|
info="<1.0=Male, >1.0=Female/Child" |
|
|
) |
|
|
|
|
|
custom_breathiness = gr.Slider( |
|
|
1, 10, 3, |
|
|
label="Breathiness (%)" |
|
|
) |
|
|
|
|
|
custom_speed = gr.Slider( |
|
|
0.7, 1.3, 1.0, step=0.05, |
|
|
label="Natural Speed" |
|
|
) |
|
|
|
|
|
custom_brightness = gr.Slider( |
|
|
0.8, 1.3, 1.0, step=0.05, |
|
|
label="Brightness" |
|
|
) |
|
|
|
|
|
create_btn = gr.Button("✨ Create Voice", variant="primary") |
|
|
|
|
|
with gr.Column(): |
|
|
create_result = gr.Markdown("") |
|
|
|
|
|
gr.Markdown(""" |
|
|
### Quick Presets: |
|
|
|
|
|
| Type | Pitch | Formant | |
|
|
|------|-------|---------| |
|
|
| Deep Male | 85 | 0.85 | |
|
|
| Male | 120 | 0.92 | |
|
|
| Female | 200 | 1.12 | |
|
|
| High Female | 240 | 1.20 | |
|
|
| Child | 280 | 1.25 | |
|
|
""") |
|
|
|
|
|
|
|
|
with gr.TabItem("👥 All Voices"): |
|
|
gr.Markdown("### Pre-built Voices") |
|
|
|
|
|
voice_info_md = "" |
|
|
for name, v in VOICE_PROFILES.items(): |
|
|
voice_info_md += f""" |
|
|
**{name}** |
|
|
- Type: {v['gender'].title()} | Pitch: {v['f0']} Hz |
|
|
- {v['description']} |
|
|
|
|
|
""" |
|
|
gr.Markdown(voice_info_md) |
|
|
|
|
|
gr.Markdown("### Custom Voices") |
|
|
custom_voices_display = gr.Markdown("*No custom voices yet*") |
|
|
|
|
|
|
|
|
|
|
|
|
|
|

    # Event wiring
    voice_select.change(get_voice_info, voice_select, voice_info)
    refresh_btn.click(refresh_voices, outputs=voice_select)
    speak_btn.click(synthesize, [text_input, voice_select, rate, pitch], audio_out)
    text_input.submit(synthesize, [text_input, voice_select, rate, pitch], audio_out)

    def train_and_update(audio, name):
        result, voices = train_voice(audio, name)
        return result, gr.update(choices=voices)

    train_btn.click(
        train_and_update,
        [audio_input, voice_name_input],
        [train_result, voice_select]
    )

    def create_and_update(name, pitch, formant, breathiness, speed, brightness):
        result, voices = create_custom_voice(name, pitch, formant, breathiness, speed, brightness)
        return result, gr.update(choices=voices)

    create_btn.click(
        create_and_update,
        [custom_name, custom_pitch, custom_formant, custom_breathiness,
         custom_speed, custom_brightness],
        [create_result, voice_select]
    )


if __name__ == "__main__":
    demo.launch()