# Vedes TTS - app.py (voice-training edition)
import numpy as np
import gradio as gr
from scipy import signal
from scipy.io import wavfile
import tempfile
import re
import json
import os
# ============================================
# VEDES TTS - WITH VOICE TRAINING (FIXED)
# 100% From Scratch - No APIs
# ============================================
SAMPLE_RATE = 22050
# ============================================
# VOICE PROFILES - Pre-defined Voices
# ============================================
# Built-in voice presets. Field meanings (consumed by VoiceSynthesizer):
#   f0            - base fundamental frequency in Hz
#   f0_variation  - intonation depth in Hz (phrase-level pitch arc)
#   formant_shift - multiplier applied to formant frequencies
#   breathiness   - aspiration-noise level mixed into the glottal source
#   speed         - speaking-rate multiplier (durations are divided by it)
#   brightness    - extra multiplier on the upper formants (F2/F3)
VOICE_PROFILES = {
    "Emma (Female)": {
        "name": "Emma",
        "gender": "female",
        "f0": 210,
        "f0_variation": 30,
        "formant_shift": 1.15,
        "breathiness": 0.04,
        "speed": 1.0,
        "brightness": 1.1,
        "description": "Friendly female voice"
    },
    "James (Male)": {
        "name": "James",
        "gender": "male",
        "f0": 110,
        "f0_variation": 20,
        "formant_shift": 0.9,
        "breathiness": 0.02,
        "speed": 0.95,
        "brightness": 0.95,
        "description": "Professional male voice"
    },
    "Sophie (Child)": {
        "name": "Sophie",
        "gender": "child",
        "f0": 280,
        "f0_variation": 40,
        "formant_shift": 1.25,
        "breathiness": 0.03,
        "speed": 1.1,
        "brightness": 1.2,
        "description": "Young child voice"
    },
    "David (Deep Male)": {
        "name": "David",
        "gender": "male",
        "f0": 85,
        "f0_variation": 15,
        "formant_shift": 0.82,
        "breathiness": 0.02,
        "speed": 0.9,
        "brightness": 0.85,
        "description": "Deep bass voice"
    },
    "Lisa (Bright Female)": {
        "name": "Lisa",
        "gender": "female",
        "f0": 240,
        "f0_variation": 35,
        "formant_shift": 1.2,
        "breathiness": 0.05,
        "speed": 1.05,
        "brightness": 1.15,
        "description": "Bright, energetic female"
    },
    "Robert (Elderly Male)": {
        "name": "Robert",
        "gender": "male",
        "f0": 95,
        "f0_variation": 12,
        "formant_shift": 0.88,
        "breathiness": 0.06,
        "speed": 0.85,
        "brightness": 0.9,
        "description": "Mature elderly voice"
    },
    "Anna (Soft Female)": {
        "name": "Anna",
        "gender": "female",
        "f0": 195,
        "f0_variation": 25,
        "formant_shift": 1.1,
        "breathiness": 0.07,
        "speed": 0.92,
        "brightness": 1.0,
        "description": "Soft, gentle female"
    },
    "Mike (Energetic Male)": {
        "name": "Mike",
        "gender": "male",
        "f0": 130,
        "f0_variation": 30,
        "formant_shift": 0.95,
        "breathiness": 0.02,
        "speed": 1.1,
        "brightness": 1.05,
        "description": "Energetic young male"
    },
}
# Custom voices storage (global): name -> profile dict in the same shape as
# VOICE_PROFILES entries; filled by voice training / manual creation below.
custom_voices = {}
# ============================================
# PHONEME DATA
# ============================================
# Vowel table, ARPAbet-style symbols.
# Tuple layout: (F1 Hz, F2 Hz, F3 Hz, duration_ms, amplitude, voiced)
VOWELS = {
    'IY': (280, 2250, 2890, 150, 1.0, True),
    'IH': (400, 1920, 2550, 120, 0.9, True),
    'EH': (550, 1770, 2490, 130, 0.95, True),
    'AE': (690, 1660, 2490, 140, 1.0, True),
    'AA': (710, 1100, 2540, 150, 1.0, True),
    'AO': (590, 880, 2540, 140, 0.95, True),
    'UH': (470, 1100, 2540, 120, 0.9, True),
    'UW': (310, 870, 2250, 150, 1.0, True),
    'AH': (640, 1200, 2400, 100, 0.85, True),
    'AX': (500, 1500, 2500, 80, 0.7, True),
    'ER': (500, 1350, 1700, 140, 0.9, True),
    'EY': (500, 1900, 2600, 160, 1.0, True),
    'AY': (700, 1200, 2600, 180, 1.0, True),
    'OY': (500, 900, 2500, 180, 1.0, True),
    'AW': (700, 1100, 2600, 180, 1.0, True),
    'OW': (500, 900, 2500, 160, 1.0, True),
}
# Consonant table keyed by ARPAbet symbol. The 'type' field selects the
# synthesis routine; remaining keys are that routine's parameters
# (durations in ms, frequencies in Hz, 'amp' a linear gain).
CONSONANTS = {
    'P': {'type': 'stop', 'closure': 80, 'burst': 30, 'voiced': False, 'burst_freq': 800, 'amp': 0.6},
    'B': {'type': 'stop', 'closure': 50, 'burst': 25, 'voiced': True, 'burst_freq': 800, 'amp': 0.7},
    'T': {'type': 'stop', 'closure': 70, 'burst': 30, 'voiced': False, 'burst_freq': 3500, 'amp': 0.7},
    'D': {'type': 'stop', 'closure': 40, 'burst': 25, 'voiced': True, 'burst_freq': 3500, 'amp': 0.7},
    'K': {'type': 'stop', 'closure': 80, 'burst': 40, 'voiced': False, 'burst_freq': 1500, 'amp': 0.7},
    'G': {'type': 'stop', 'closure': 50, 'burst': 30, 'voiced': True, 'burst_freq': 1500, 'amp': 0.7},
    'F': {'type': 'fric', 'dur': 120, 'freq_low': 1500, 'freq_high': 8000, 'voiced': False, 'amp': 0.4},
    'V': {'type': 'fric', 'dur': 80, 'freq_low': 1500, 'freq_high': 8000, 'voiced': True, 'amp': 0.5},
    'TH': {'type': 'fric', 'dur': 100, 'freq_low': 1400, 'freq_high': 6000, 'voiced': False, 'amp': 0.3},
    'DH': {'type': 'fric', 'dur': 60, 'freq_low': 1400, 'freq_high': 6000, 'voiced': True, 'amp': 0.5},
    'S': {'type': 'fric', 'dur': 120, 'freq_low': 4000, 'freq_high': 9000, 'voiced': False, 'amp': 0.5},
    'Z': {'type': 'fric', 'dur': 90, 'freq_low': 4000, 'freq_high': 9000, 'voiced': True, 'amp': 0.5},
    'SH': {'type': 'fric', 'dur': 120, 'freq_low': 2000, 'freq_high': 6000, 'voiced': False, 'amp': 0.5},
    'ZH': {'type': 'fric', 'dur': 80, 'freq_low': 2000, 'freq_high': 6000, 'voiced': True, 'amp': 0.5},
    'HH': {'type': 'fric', 'dur': 80, 'freq_low': 500, 'freq_high': 2000, 'voiced': False, 'amp': 0.3},
    'CH': {'type': 'affric', 'closure': 60, 'fric': 80, 'freq_low': 2000, 'freq_high': 6000, 'voiced': False, 'amp': 0.6},
    'JH': {'type': 'affric', 'closure': 40, 'fric': 60, 'freq_low': 2000, 'freq_high': 6000, 'voiced': True, 'amp': 0.6},
    'M': {'type': 'nasal', 'f1': 280, 'f2': 1000, 'f3': 2200, 'dur': 100, 'amp': 0.8},
    'N': {'type': 'nasal', 'f1': 280, 'f2': 1700, 'f3': 2500, 'dur': 90, 'amp': 0.8},
    'NG': {'type': 'nasal', 'f1': 300, 'f2': 2000, 'f3': 2700, 'dur': 100, 'amp': 0.8},
    'L': {'type': 'liquid', 'f1': 380, 'f2': 1000, 'f3': 2700, 'dur': 90, 'amp': 0.85},
    'R': {'type': 'liquid', 'f1': 350, 'f2': 1300, 'f3': 1700, 'dur': 90, 'amp': 0.85},
    'W': {'type': 'glide', 'f1': 300, 'f2': 700, 'f3': 2200, 'dur': 80, 'amp': 0.8},
    'Y': {'type': 'glide', 'f1': 280, 'f2': 2200, 'f3': 2900, 'dur': 70, 'amp': 0.8},
}
# Pause symbols -> duration in ms: SIL between words, PAU at punctuation.
SILENCE = {'SIL': 60, 'PAU': 200}
# ============================================
# PRONUNCIATION DICTIONARY
# ============================================
# Hand-built pronunciation dictionary: lowercase word -> phoneme list, using
# the ARPAbet-like symbols defined in VOWELS/CONSONANTS above. Words missing
# here fall back to the PATTERNS/LETTERS rules in TextToPhoneme._convert_word.
DICTIONARY = {
    # Function words
    'a': ['AX'], 'an': ['AE', 'N'], 'the': ['DH', 'AX'],
    'and': ['AE', 'N', 'D'], 'or': ['AO', 'R'], 'but': ['B', 'AH', 'T'],
    'if': ['IH', 'F'], 'of': ['AH', 'V'], 'to': ['T', 'UW'],
    'in': ['IH', 'N'], 'on': ['AA', 'N'], 'at': ['AE', 'T'],
    'by': ['B', 'AY'], 'for': ['F', 'AO', 'R'], 'with': ['W', 'IH', 'TH'],
    'from': ['F', 'R', 'AH', 'M'], 'up': ['AH', 'P'], 'out': ['AW', 'T'],
    'as': ['AE', 'Z'], 'so': ['S', 'OW'], 'not': ['N', 'AA', 'T'],
    # Pronouns
    'i': ['AY'], 'me': ['M', 'IY'], 'my': ['M', 'AY'],
    'you': ['Y', 'UW'], 'your': ['Y', 'AO', 'R'],
    'he': ['HH', 'IY'], 'him': ['HH', 'IH', 'M'], 'his': ['HH', 'IH', 'Z'],
    'she': ['SH', 'IY'], 'her': ['HH', 'ER'],
    'it': ['IH', 'T'], 'its': ['IH', 'T', 'S'],
    'we': ['W', 'IY'], 'us': ['AH', 'S'], 'our': ['AW', 'ER'],
    'they': ['DH', 'EY'], 'them': ['DH', 'EH', 'M'], 'their': ['DH', 'EH', 'R'],
    'this': ['DH', 'IH', 'S'], 'that': ['DH', 'AE', 'T'],
    'what': ['W', 'AH', 'T'], 'who': ['HH', 'UW'],
    'where': ['W', 'EH', 'R'], 'when': ['W', 'EH', 'N'],
    'why': ['W', 'AY'], 'how': ['HH', 'AW'], 'which': ['W', 'IH', 'CH'],
    # Be verbs
    'am': ['AE', 'M'], 'is': ['IH', 'Z'], 'are': ['AA', 'R'],
    'was': ['W', 'AA', 'Z'], 'were': ['W', 'ER'],
    'be': ['B', 'IY'], 'been': ['B', 'IH', 'N'], 'being': ['B', 'IY', 'IH', 'NG'],
    # Have verbs
    'have': ['HH', 'AE', 'V'], 'has': ['HH', 'AE', 'Z'],
    'had': ['HH', 'AE', 'D'], 'having': ['HH', 'AE', 'V', 'IH', 'NG'],
    # Do verbs
    'do': ['D', 'UW'], 'does': ['D', 'AH', 'Z'],
    'did': ['D', 'IH', 'D'], 'done': ['D', 'AH', 'N'],
    # Modal verbs
    'will': ['W', 'IH', 'L'], 'would': ['W', 'UH', 'D'],
    'can': ['K', 'AE', 'N'], 'could': ['K', 'UH', 'D'],
    'should': ['SH', 'UH', 'D'], 'may': ['M', 'EY'],
    'might': ['M', 'AY', 'T'], 'must': ['M', 'AH', 'S', 'T'],
    # Common verbs
    'go': ['G', 'OW'], 'goes': ['G', 'OW', 'Z'], 'going': ['G', 'OW', 'IH', 'NG'],
    'went': ['W', 'EH', 'N', 'T'], 'gone': ['G', 'AO', 'N'],
    'come': ['K', 'AH', 'M'], 'comes': ['K', 'AH', 'M', 'Z'],
    'coming': ['K', 'AH', 'M', 'IH', 'NG'], 'came': ['K', 'EY', 'M'],
    'get': ['G', 'EH', 'T'], 'gets': ['G', 'EH', 'T', 'S'],
    'getting': ['G', 'EH', 'T', 'IH', 'NG'], 'got': ['G', 'AA', 'T'],
    'make': ['M', 'EY', 'K'], 'makes': ['M', 'EY', 'K', 'S'],
    'making': ['M', 'EY', 'K', 'IH', 'NG'], 'made': ['M', 'EY', 'D'],
    'take': ['T', 'EY', 'K'], 'takes': ['T', 'EY', 'K', 'S'],
    'took': ['T', 'UH', 'K'], 'taken': ['T', 'EY', 'K', 'AX', 'N'],
    'see': ['S', 'IY'], 'sees': ['S', 'IY', 'Z'],
    'saw': ['S', 'AO'], 'seen': ['S', 'IY', 'N'],
    'say': ['S', 'EY'], 'says': ['S', 'EH', 'Z'], 'said': ['S', 'EH', 'D'],
    'know': ['N', 'OW'], 'knows': ['N', 'OW', 'Z'],
    'knew': ['N', 'UW'], 'known': ['N', 'OW', 'N'],
    'think': ['TH', 'IH', 'NG', 'K'], 'thought': ['TH', 'AO', 'T'],
    'want': ['W', 'AA', 'N', 'T'], 'wants': ['W', 'AA', 'N', 'T', 'S'],
    'give': ['G', 'IH', 'V'], 'gives': ['G', 'IH', 'V', 'Z'],
    'gave': ['G', 'EY', 'V'], 'given': ['G', 'IH', 'V', 'AX', 'N'],
    'tell': ['T', 'EH', 'L'], 'told': ['T', 'OW', 'L', 'D'],
    'ask': ['AE', 'S', 'K'], 'asked': ['AE', 'S', 'K', 'T'],
    'use': ['Y', 'UW', 'Z'], 'used': ['Y', 'UW', 'Z', 'D'],
    'find': ['F', 'AY', 'N', 'D'], 'found': ['F', 'AW', 'N', 'D'],
    'work': ['W', 'ER', 'K'], 'works': ['W', 'ER', 'K', 'S'],
    'call': ['K', 'AO', 'L'], 'called': ['K', 'AO', 'L', 'D'],
    'try': ['T', 'R', 'AY'], 'tried': ['T', 'R', 'AY', 'D'],
    'need': ['N', 'IY', 'D'], 'needs': ['N', 'IY', 'D', 'Z'],
    'feel': ['F', 'IY', 'L'], 'feels': ['F', 'IY', 'L', 'Z'],
    'help': ['HH', 'EH', 'L', 'P'], 'helps': ['HH', 'EH', 'L', 'P', 'S'],
    'keep': ['K', 'IY', 'P'], 'kept': ['K', 'EH', 'P', 'T'],
    'let': ['L', 'EH', 'T'], 'put': ['P', 'UH', 'T'],
    'seem': ['S', 'IY', 'M'], 'leave': ['L', 'IY', 'V'],
    'show': ['SH', 'OW'], 'hear': ['HH', 'IY', 'R'],
    'play': ['P', 'L', 'EY'], 'run': ['R', 'AH', 'N'],
    'move': ['M', 'UW', 'V'], 'live': ['L', 'IH', 'V'],
    'believe': ['B', 'IH', 'L', 'IY', 'V'],
    'read': ['R', 'IY', 'D'], 'write': ['R', 'AY', 'T'],
    'learn': ['L', 'ER', 'N'], 'speak': ['S', 'P', 'IY', 'K'],
    'look': ['L', 'UH', 'K'], 'like': ['L', 'AY', 'K'],
    'love': ['L', 'AH', 'V'], 'start': ['S', 'T', 'AA', 'R', 'T'],
    'stop': ['S', 'T', 'AA', 'P'],
    # Adjectives
    'good': ['G', 'UH', 'D'], 'better': ['B', 'EH', 'T', 'ER'],
    'best': ['B', 'EH', 'S', 'T'], 'bad': ['B', 'AE', 'D'],
    'new': ['N', 'UW'], 'old': ['OW', 'L', 'D'],
    'big': ['B', 'IH', 'G'], 'small': ['S', 'M', 'AO', 'L'],
    'long': ['L', 'AO', 'NG'], 'short': ['SH', 'AO', 'R', 'T'],
    'high': ['HH', 'AY'], 'low': ['L', 'OW'],
    'great': ['G', 'R', 'EY', 'T'], 'little': ['L', 'IH', 'T', 'AX', 'L'],
    'right': ['R', 'AY', 'T'], 'wrong': ['R', 'AO', 'NG'],
    'first': ['F', 'ER', 'S', 'T'], 'last': ['L', 'AE', 'S', 'T'],
    'same': ['S', 'EY', 'M'], 'different': ['D', 'IH', 'F', 'R', 'AX', 'N', 'T'],
    'own': ['OW', 'N'], 'other': ['AH', 'DH', 'ER'],
    'nice': ['N', 'AY', 'S'], 'happy': ['HH', 'AE', 'P', 'IY'],
    'sure': ['SH', 'UH', 'R'], 'true': ['T', 'R', 'UW'],
    'real': ['R', 'IY', 'L'], 'clear': ['K', 'L', 'IY', 'R'],
    'fine': ['F', 'AY', 'N'], 'free': ['F', 'R', 'IY'],
    'easy': ['IY', 'Z', 'IY'], 'hard': ['HH', 'AA', 'R', 'D'],
    'young': ['Y', 'AH', 'NG'], 'beautiful': ['B', 'Y', 'UW', 'T', 'IH', 'F', 'AX', 'L'],
    # Adverbs
    'very': ['V', 'EH', 'R', 'IY'], 'really': ['R', 'IY', 'L', 'IY'],
    'just': ['JH', 'AH', 'S', 'T'], 'only': ['OW', 'N', 'L', 'IY'],
    'also': ['AO', 'L', 'S', 'OW'], 'well': ['W', 'EH', 'L'],
    'now': ['N', 'AW'], 'then': ['DH', 'EH', 'N'],
    'here': ['HH', 'IY', 'R'], 'there': ['DH', 'EH', 'R'],
    'still': ['S', 'T', 'IH', 'L'], 'even': ['IY', 'V', 'AX', 'N'],
    'back': ['B', 'AE', 'K'], 'again': ['AX', 'G', 'EH', 'N'],
    'always': ['AO', 'L', 'W', 'EY', 'Z'], 'never': ['N', 'EH', 'V', 'ER'],
    'today': ['T', 'AX', 'D', 'EY'], 'maybe': ['M', 'EY', 'B', 'IY'],
    'too': ['T', 'UW'], 'much': ['M', 'AH', 'CH'],
    'more': ['M', 'AO', 'R'], 'most': ['M', 'OW', 'S', 'T'],
    'please': ['P', 'L', 'IY', 'Z'],
    # Nouns
    'time': ['T', 'AY', 'M'], 'year': ['Y', 'IY', 'R'],
    'day': ['D', 'EY'], 'way': ['W', 'EY'],
    'man': ['M', 'AE', 'N'], 'woman': ['W', 'UH', 'M', 'AX', 'N'],
    'child': ['CH', 'AY', 'L', 'D'], 'world': ['W', 'ER', 'L', 'D'],
    'life': ['L', 'AY', 'F'], 'hand': ['HH', 'AE', 'N', 'D'],
    'part': ['P', 'AA', 'R', 'T'], 'place': ['P', 'L', 'EY', 'S'],
    'thing': ['TH', 'IH', 'NG'], 'things': ['TH', 'IH', 'NG', 'Z'],
    'people': ['P', 'IY', 'P', 'AX', 'L'], 'person': ['P', 'ER', 'S', 'AX', 'N'],
    'home': ['HH', 'OW', 'M'], 'house': ['HH', 'AW', 'S'],
    'word': ['W', 'ER', 'D'], 'name': ['N', 'EY', 'M'],
    'water': ['W', 'AO', 'T', 'ER'], 'money': ['M', 'AH', 'N', 'IY'],
    'family': ['F', 'AE', 'M', 'AX', 'L', 'IY'],
    'friend': ['F', 'R', 'EH', 'N', 'D'], 'friends': ['F', 'R', 'EH', 'N', 'D', 'Z'],
    'mother': ['M', 'AH', 'DH', 'ER'], 'father': ['F', 'AA', 'DH', 'ER'],
    'boy': ['B', 'OY'], 'girl': ['G', 'ER', 'L'],
    'head': ['HH', 'EH', 'D'], 'face': ['F', 'EY', 'S'],
    'eye': ['AY'], 'eyes': ['AY', 'Z'],
    'voice': ['V', 'OY', 'S'], 'night': ['N', 'AY', 'T'],
    'morning': ['M', 'AO', 'R', 'N', 'IH', 'NG'],
    'week': ['W', 'IY', 'K'], 'month': ['M', 'AH', 'N', 'TH'],
    'school': ['S', 'K', 'UW', 'L'], 'book': ['B', 'UH', 'K'],
    'story': ['S', 'T', 'AO', 'R', 'IY'],
    'question': ['K', 'W', 'EH', 'S', 'CH', 'AX', 'N'],
    'answer': ['AE', 'N', 'S', 'ER'],
    # Numbers
    'zero': ['Z', 'IY', 'R', 'OW'], 'one': ['W', 'AH', 'N'],
    'two': ['T', 'UW'], 'three': ['TH', 'R', 'IY'],
    'four': ['F', 'AO', 'R'], 'five': ['F', 'AY', 'V'],
    'six': ['S', 'IH', 'K', 'S'], 'seven': ['S', 'EH', 'V', 'AX', 'N'],
    'eight': ['EY', 'T'], 'nine': ['N', 'AY', 'N'], 'ten': ['T', 'EH', 'N'],
    # Greetings
    'hello': ['HH', 'AX', 'L', 'OW'], 'hi': ['HH', 'AY'],
    'hey': ['HH', 'EY'], 'goodbye': ['G', 'UH', 'D', 'B', 'AY'],
    'bye': ['B', 'AY'], 'welcome': ['W', 'EH', 'L', 'K', 'AX', 'M'],
    'thank': ['TH', 'AE', 'NG', 'K'], 'thanks': ['TH', 'AE', 'NG', 'K', 'S'],
    'sorry': ['S', 'AA', 'R', 'IY'],
    'yes': ['Y', 'EH', 'S'], 'yeah': ['Y', 'AE'], 'no': ['N', 'OW'],
    'ok': ['OW', 'K', 'EY'], 'okay': ['OW', 'K', 'EY'],
    # Tech/TTS
    'text': ['T', 'EH', 'K', 'S', 'T'], 'speech': ['S', 'P', 'IY', 'CH'],
    'sound': ['S', 'AW', 'N', 'D'], 'audio': ['AO', 'D', 'IY', 'OW'],
    'test': ['T', 'EH', 'S', 'T'], 'testing': ['T', 'EH', 'S', 'T', 'IH', 'NG'],
    'computer': ['K', 'AX', 'M', 'P', 'Y', 'UW', 'T', 'ER'],
    'vedes': ['V', 'EY', 'D', 'EH', 'S'],
    'system': ['S', 'IH', 'S', 'T', 'AX', 'M'],
    'train': ['T', 'R', 'EY', 'N'], 'training': ['T', 'R', 'EY', 'N', 'IH', 'NG'],
}
# Letter patterns: fallback grapheme-to-phoneme rules for words not in
# DICTIONARY. Matched greedily (longest pattern first, see TextToPhoneme).
PATTERNS = [
    ('tion', ['SH', 'AX', 'N']), ('sion', ['ZH', 'AX', 'N']),
    ('ness', ['N', 'AX', 'S']), ('ment', ['M', 'AX', 'N', 'T']),
    ('able', ['AX', 'B', 'AX', 'L']), ('ible', ['AX', 'B', 'AX', 'L']),
    ('ful', ['F', 'AX', 'L']), ('less', ['L', 'AX', 'S']),
    ('ing', ['IH', 'NG']), ('ight', ['AY', 'T']),
    ('ough', ['AO']), ('ould', ['UH', 'D']),
    ('th', ['TH']), ('sh', ['SH']), ('ch', ['CH']),
    ('wh', ['W']), ('ph', ['F']), ('ck', ['K']), ('ng', ['NG']),
    ('qu', ['K', 'W']), ('ee', ['IY']), ('ea', ['IY']),
    ('oo', ['UW']), ('ou', ['AW']), ('ow', ['OW']),
    ('ai', ['EY']), ('ay', ['EY']), ('ey', ['IY']),
    ('oy', ['OY']), ('oi', ['OY']), ('ie', ['IY']),
    ('er', ['ER']), ('ir', ['ER']), ('ur', ['ER']),
    ('ar', ['AA', 'R']), ('or', ['AO', 'R']),
]
# Last-resort mapping: one phoneme per single letter.
LETTERS = {
    'a': 'AE', 'b': 'B', 'c': 'K', 'd': 'D', 'e': 'EH',
    'f': 'F', 'g': 'G', 'h': 'HH', 'i': 'IH', 'j': 'JH',
    'k': 'K', 'l': 'L', 'm': 'M', 'n': 'N', 'o': 'AA',
    'p': 'P', 'q': 'K', 'r': 'R', 's': 'S', 't': 'T',
    'u': 'AH', 'v': 'V', 'w': 'W', 'x': 'K', 'y': 'IY', 'z': 'Z',
}
# ============================================
# VOICE ANALYZER
# ============================================
class VoiceAnalyzer:
    """Analyze a mono audio sample and derive a voice-profile dict.

    The returned profile uses the same keys as VOICE_PROFILES entries
    (f0, f0_variation, formant_shift, breathiness, speed, brightness),
    so it can be fed directly to the synthesizer.
    """

    def __init__(self, sample_rate=22050):
        self.sr = sample_rate

    def analyze(self, audio):
        """Extract voice features from an audio sample.

        Returns a profile dict, or None when the sample is shorter than
        ~0.3 s (too little signal for pitch statistics).
        """
        if len(audio) < self.sr * 0.3:
            return None
        audio = audio.astype(np.float32)
        # Peak-normalize so the spectral measures are level-independent.
        max_val = np.max(np.abs(audio))
        if max_val > 0:
            audio = audio / max_val
        f0 = self._estimate_pitch(audio)
        formants = self._estimate_formants(audio)
        breathiness = self._estimate_breathiness(audio)
        profile = {
            "name": "Custom Voice",
            "gender": "custom",
            "f0": f0,
            "f0_variation": self._estimate_f0_variation(audio, f0),
            "formant_shift": formants.get('shift', 1.0),
            "breathiness": breathiness,
            "speed": 1.0,
            "brightness": formants.get('brightness', 1.0),
            "description": f"Custom voice (F0={f0:.0f}Hz)"
        }
        return profile

    def _frame_pitches(self, audio):
        """Per-frame F0 estimates (Hz) via autocorrelation on 30 ms frames.

        Shared by _estimate_pitch and _estimate_f0_variation (previously
        duplicated verbatim in both). Only estimates in the plausible
        speech range (60-400 Hz) are kept.
        """
        frame_size = int(self.sr * 0.03)
        pitches = []
        for i in range(0, len(audio) - frame_size, frame_size):
            frame = audio[i:i + frame_size]
            frame = frame - np.mean(frame)          # remove DC offset
            # Autocorrelation, non-negative lags only.
            corr = np.correlate(frame, frame, mode='full')
            corr = corr[len(corr) // 2:]
            # First rising slope marks that we are past the lag-0 peak.
            d = np.diff(corr)
            start_indices = np.where(d > 0)[0]
            if len(start_indices) == 0:
                continue
            start = start_indices[0]
            # Search no further than the lag of a 60 Hz fundamental.
            search_end = min(start + int(self.sr / 60), len(corr))
            if search_end <= start:
                continue
            peak = start + np.argmax(corr[start:search_end])
            if peak > 0:
                f0 = self.sr / peak
                if 60 < f0 < 400:
                    pitches.append(f0)
        return pitches

    def _estimate_pitch(self, audio):
        """Median frame-level F0, or a 130 Hz fallback when none is found."""
        pitches = self._frame_pitches(audio)
        if pitches:
            return float(np.median(pitches))
        return 130.0

    def _estimate_f0_variation(self, audio, base_f0):
        """Std-dev of frame-level F0, capped at 50 Hz; 20 Hz fallback.

        base_f0 is accepted for interface compatibility but unused.
        """
        pitches = self._frame_pitches(audio)
        if len(pitches) > 2:
            return min(float(np.std(pitches)), 50.0)
        return 20.0

    def _estimate_formants(self, audio):
        """Map the spectral centroid of the first 2048 samples to coarse
        formant-shift / brightness factors (piecewise thresholds)."""
        frame_size = 2048
        if len(audio) < frame_size:
            return {'shift': 1.0, 'brightness': 1.0}
        spectrum = np.abs(np.fft.rfft(audio[:frame_size] * np.hanning(frame_size)))
        freqs = np.fft.rfftfreq(frame_size, 1 / self.sr)
        total_energy = np.sum(spectrum) + 1e-8
        centroid = np.sum(freqs * spectrum) / total_energy
        # Brighter spectrum (higher centroid) -> higher shift/brightness.
        if centroid > 1600:
            return {'shift': 1.2, 'brightness': 1.15}
        if centroid > 1400:
            return {'shift': 1.1, 'brightness': 1.05}
        if centroid > 1200:
            return {'shift': 1.0, 'brightness': 1.0}
        if centroid > 1000:
            return {'shift': 0.9, 'brightness': 0.95}
        return {'shift': 0.85, 'brightness': 0.9}

    def _estimate_breathiness(self, audio):
        """High-band (2-5 kHz) to low-band (<1 kHz) energy ratio of the
        first 2048 samples, scaled and clipped to [0.02, 0.1]."""
        frame_size = 2048
        if len(audio) < frame_size:
            return 0.03
        spectrum = np.abs(np.fft.rfft(audio[:frame_size]))
        freqs = np.fft.rfftfreq(frame_size, 1 / self.sr)
        low_mask = freqs < 1000
        high_mask = (freqs > 2000) & (freqs < 5000)
        low_energy = np.sum(spectrum[low_mask]) + 1e-8   # avoid divide-by-zero
        high_energy = np.sum(spectrum[high_mask])
        ratio = high_energy / low_energy
        return float(np.clip(ratio * 0.1, 0.02, 0.1))
# ============================================
# TEXT TO PHONEME CONVERTER
# ============================================
class TextToPhoneme:
    """Grapheme-to-phoneme converter.

    Lookup order per word: DICTIONARY first, then greedy letter-pattern
    rules (PATTERNS), then single-letter fallback (LETTERS).
    """

    def __init__(self):
        self.dictionary = DICTIONARY
        # Longest patterns first so e.g. 'tion' wins over 'ti' + 'on'.
        self.patterns = sorted(PATTERNS, key=lambda x: -len(x[0]))

    def convert(self, text):
        """Convert text to a flat list of phoneme symbols.

        Punctuation (.,!?) becomes 'PAU'; a short 'SIL' separator is
        inserted between consecutive words (but not before punctuation).
        """
        text = text.lower().strip()
        text = re.sub(r"[^\w\s.,!?']", '', text)
        tokens = re.findall(r"[\w']+|[.,!?]", text)
        phonemes = []
        for i, token in enumerate(tokens):
            if token in '.,!?':
                phonemes.append('PAU')
                continue
            if token in self.dictionary:
                phonemes.extend(self.dictionary[token])
            else:
                phonemes.extend(self._convert_word(token))
            # Word separator, unless this is the last token or punctuation
            # follows (previously duplicated in both branches above).
            if i < len(tokens) - 1 and tokens[i + 1] not in '.,!?':
                phonemes.append('SIL')
        return phonemes

    def _convert_word(self, word):
        """Rule-based conversion for out-of-dictionary words."""
        phonemes = []
        i = 0
        while i < len(word):
            matched = False
            for pattern, phons in self.patterns:
                if word[i:].startswith(pattern):
                    phonemes.extend(phons)
                    i += len(pattern)
                    matched = True
                    break
            if not matched:
                # Single-letter fallback; unknown characters are skipped.
                char = word[i]
                if char in LETTERS:
                    phonemes.append(LETTERS[char])
                i += 1
        return phonemes
# ============================================
# VOICE SYNTHESIZER
# ============================================
class VoiceSynthesizer:
    """Formant-synthesis engine: renders phoneme sequences to audio.

    Voiced sounds use a glottal-pulse source shaped by three two-pole
    formant resonators; stops and fricatives use filtered noise bursts.
    Durations in the phoneme tables are milliseconds and are divided by
    the voice's speed factor.
    """

    def __init__(self, sample_rate=22050):
        self.sr = sample_rate
        self.default_voice = VOICE_PROFILES["Emma (Female)"]

    def synthesize(self, phonemes, voice_profile=None, rate=1.0, pitch=1.0):
        """Render `phonemes` with `voice_profile` (default: Emma).

        `rate` multiplies the voice's speaking speed and `pitch` its base
        F0. Returns normalized float32 audio at self.sr.
        """
        if not phonemes:
            return np.zeros(int(self.sr * 0.5), dtype=np.float32)
        voice = voice_profile or self.default_voice
        f0 = voice.get('f0', 130) * pitch
        f0_var = voice.get('f0_variation', 20)
        formant_shift = voice.get('formant_shift', 1.0)
        breathiness = voice.get('breathiness', 0.03)
        voice_speed = voice.get('speed', 1.0) * rate
        brightness = voice.get('brightness', 1.0)
        segments = []
        for i, phon in enumerate(phonemes):
            prev_phon = phonemes[i - 1] if i > 0 else None
            next_phon = phonemes[i + 1] if i < len(phonemes) - 1 else None
            # Phrase-level intonation: half-sine contour over the utterance
            # (F0 peaks near the middle, falls at the end).
            phrase_pos = i / max(len(phonemes), 1)
            f0_current = f0 + f0_var * np.sin(phrase_pos * np.pi) * 0.5
            seg = self._synth_phoneme(
                phon, f0_current, voice_speed, formant_shift,
                breathiness, brightness, prev_phon, next_phon
            )
            segments.append(seg)
        audio = self._smooth_concat(segments)
        audio = self._normalize(audio)
        return audio.astype(np.float32)

    def _synth_phoneme(self, phon, f0, speed, formant_shift, breathiness,
                       brightness, prev_phon, next_phon):
        """Dispatch one phoneme to the matching synthesis routine.

        prev_phon/next_phon are accepted for context but currently unused.
        """
        if phon in SILENCE:
            dur = int(self.sr * SILENCE[phon] / 1000 / speed)
            return np.zeros(dur, dtype=np.float32)
        if phon in VOWELS:
            return self._synth_vowel(phon, f0, speed, formant_shift,
                                     breathiness, brightness)
        if phon in CONSONANTS:
            return self._synth_consonant(phon, f0, speed, formant_shift, breathiness)
        # Unknown symbol: emit a short gap instead of raising.
        return np.zeros(100, dtype=np.float32)

    def _synth_vowel(self, phon, f0, speed, formant_shift, breathiness, brightness):
        """Voiced vowel: glottal source through three formant resonators."""
        params = VOWELS[phon]
        f1, f2, f3, dur_ms, amp, voiced = params
        # Shift formants for the voice; brightness also boosts F2/F3.
        f1 = f1 * formant_shift
        f2 = f2 * formant_shift * brightness
        f3 = f3 * formant_shift * brightness
        dur_ms = dur_ms / speed
        n = int(self.sr * dur_ms / 1000)
        n = max(n, 100)  # floor so envelopes always have room
        t = np.arange(n) / self.sr
        source = self._glottal_source(t, f0, breathiness)
        audio = self._apply_formants(source, f1, f2, f3)
        envelope = self._vowel_envelope(n)
        audio = audio * envelope * amp
        return audio

    def _synth_consonant(self, phon, f0, speed, formant_shift, breathiness):
        """Route a consonant to its type-specific routine (see CONSONANTS)."""
        params = CONSONANTS[phon]
        ctype = params['type']
        if ctype == 'stop':
            return self._synth_stop(params, f0, speed, formant_shift)
        elif ctype == 'fric':
            return self._synth_fricative(params, f0, speed)
        elif ctype == 'affric':
            return self._synth_affricate(params, f0, speed)
        elif ctype == 'nasal':
            return self._synth_nasal(params, f0, speed, formant_shift, breathiness)
        elif ctype == 'liquid':
            return self._synth_liquid(params, f0, speed, formant_shift, breathiness)
        elif ctype == 'glide':
            return self._synth_glide(params, f0, speed, formant_shift, breathiness)
        return np.zeros(100, dtype=np.float32)

    def _glottal_source(self, t, f0, breathiness):
        """Glottal pulse train plus aspiration noise.

        Each pitch period: raised-cosine opening for the first 40% of the
        cycle, cosine closing for the next 20%, zero for the remainder.
        """
        T0 = 1.0 / f0
        phase = (t % T0) / T0
        glottal = np.zeros_like(t)
        mask1 = phase < 0.4
        glottal[mask1] = 0.5 * (1 - np.cos(np.pi * phase[mask1] / 0.4))
        mask2 = (phase >= 0.4) & (phase < 0.6)
        glottal[mask2] = np.cos(np.pi * (phase[mask2] - 0.4) / 0.4)
        glottal += np.random.randn(len(t)) * breathiness  # breath noise
        # Slow 5 Hz amplitude wobble for a less static timbre.
        shimmer = 1 + 0.02 * np.sin(2 * np.pi * 5 * t)
        glottal *= shimmer
        return glottal

    def _apply_formants(self, source, f1, f2, f3):
        """Sum of three resonators (bandwidths 90/110/130 Hz)."""
        formants = [(f1, 90), (f2, 110), (f3, 130)]
        result = np.zeros_like(source)
        for freq, bw in formants:
            result += self._resonator(source, freq, bw)
        return result

    def _resonator(self, sig, freq, bw):
        """Two-pole resonator at `freq` Hz with bandwidth `bw` Hz.

        Frequencies outside (0, Nyquist) pass the signal through unchanged.
        Pure-Python sample loop: each output depends on the previous two.
        """
        if freq <= 0 or freq >= self.sr / 2:
            return sig
        r = np.exp(-np.pi * bw / self.sr)
        theta = 2 * np.pi * freq / self.sr
        a1 = -2 * r * np.cos(theta)
        a2 = r * r
        b0 = 1 - r
        y = np.zeros_like(sig)
        for i in range(2, len(sig)):
            y[i] = b0 * sig[i] - a1 * y[i-1] - a2 * y[i-2]
        return y

    def _vowel_envelope(self, n):
        """Raised-cosine attack (first 10%) and release (last 15%)."""
        env = np.ones(n)
        attack = max(1, n // 10)
        release = max(1, int(n * 0.15))
        env[:attack] = np.sin(np.linspace(0, np.pi/2, attack)) ** 2
        env[-release:] = np.cos(np.linspace(0, np.pi/2, release)) ** 2
        return env

    def _consonant_envelope(self, n):
        """Linear fade in/out with a small floor (0.1) at the edges."""
        env = np.ones(n)
        attack = max(1, n // 8)
        release = max(1, n // 6)
        env[:attack] = np.linspace(0.1, 1, attack)
        env[-release:] = np.linspace(1, 0.1, release)
        return env

    def _synth_stop(self, params, f0, speed, formant_shift):
        """Plosive: silent (or voice-bar) closure followed by a noise burst."""
        closure_ms = params['closure'] / speed
        burst_ms = params['burst'] / speed
        closure_n = int(self.sr * closure_ms / 1000)
        burst_n = int(self.sr * burst_ms / 1000)
        total_n = closure_n + burst_n
        audio = np.zeros(total_n, dtype=np.float32)
        if params['voiced']:
            # Low-level "voice bar" during closure for voiced stops (B/D/G).
            t = np.arange(closure_n) / self.sr
            voice_bar = np.sin(2 * np.pi * f0 * 0.8 * t) * 0.15
            audio[:closure_n] = voice_bar
        burst = np.random.randn(burst_n)
        burst_freq = params['burst_freq'] * formant_shift
        try:
            if burst_freq < self.sr / 2 - 100:
                b, a = signal.butter(2, burst_freq / (self.sr / 2), 'low')
                burst = signal.filtfilt(b, a, burst)
        except:
            pass  # best effort: fall back to unfiltered noise on filter failure
        burst_env = np.exp(-np.linspace(0, 5, burst_n))  # fast exponential decay
        burst *= burst_env * params['amp']
        audio[closure_n:] = burst
        return audio

    def _synth_fricative(self, params, f0, speed):
        """Fricative: band-passed noise, plus a weak glottal tone if voiced."""
        dur_ms = params['dur'] / speed
        n = int(self.sr * dur_ms / 1000)
        noise = np.random.randn(n)
        low = params['freq_low']
        high = min(params['freq_high'], self.sr / 2 - 100)  # stay below Nyquist
        try:
            if low < high:
                b, a = signal.butter(4, [low / (self.sr / 2), high / (self.sr / 2)], 'band')
                noise = signal.filtfilt(b, a, noise)
        except:
            pass  # best effort: keep white noise if the filter fails
        audio = noise * params['amp']
        if params['voiced']:
            t = np.arange(n) / self.sr
            voice = self._glottal_source(t, f0, 0.03) * 0.3
            audio = audio + voice
        audio *= self._consonant_envelope(n)
        return audio.astype(np.float32)

    def _synth_affricate(self, params, f0, speed):
        """Affricate (CH/JH): stop-like closure then a fricative release."""
        closure_ms = params['closure'] / speed
        fric_ms = params['fric'] / speed
        closure_n = int(self.sr * closure_ms / 1000)
        fric_n = int(self.sr * fric_ms / 1000)
        audio = np.zeros(closure_n + fric_n, dtype=np.float32)
        if params['voiced']:
            t = np.arange(closure_n) / self.sr
            audio[:closure_n] = np.sin(2 * np.pi * f0 * 0.8 * t) * 0.1
        fric = np.random.randn(fric_n)
        low = params['freq_low']
        high = min(params['freq_high'], self.sr / 2 - 100)
        try:
            b, a = signal.butter(3, [low / (self.sr / 2), high / (self.sr / 2)], 'band')
            fric = signal.filtfilt(b, a, fric)
        except:
            pass  # best effort: unfiltered noise on filter failure
        fric *= params['amp']
        # Quick attack, longer release on the frication segment.
        fric_env = np.ones(fric_n)
        attack = fric_n // 6
        release = fric_n // 3
        fric_env[:attack] = np.linspace(0, 1, attack)
        fric_env[-release:] = np.linspace(1, 0, release)
        audio[closure_n:] = fric * fric_env
        return audio

    def _synth_nasal(self, params, f0, speed, formant_shift, breathiness):
        """Nasal (M/N/NG): voiced formants plus a 250 Hz nasal resonance,
        low-passed at 800 Hz for the characteristic muffled quality."""
        dur_ms = params['dur'] / speed
        n = int(self.sr * dur_ms / 1000)
        t = np.arange(n) / self.sr
        source = self._glottal_source(t, f0, breathiness)
        f1 = params['f1'] * formant_shift
        f2 = params['f2'] * formant_shift
        f3 = params['f3'] * formant_shift
        audio = self._apply_formants(source, f1, f2, f3)
        nasal_pole = self._resonator(source, 250, 100) * 0.4
        audio += nasal_pole
        try:
            b, a = signal.butter(2, 800 / (self.sr / 2), 'low')
            audio = signal.filtfilt(b, a, audio)
        except:
            pass  # best effort: skip the low-pass on failure
        audio *= params['amp'] * self._consonant_envelope(n)
        return audio.astype(np.float32)

    def _synth_liquid(self, params, f0, speed, formant_shift, breathiness):
        """Liquid (L/R): plain voiced formant synthesis at fixed targets."""
        dur_ms = params['dur'] / speed
        n = int(self.sr * dur_ms / 1000)
        t = np.arange(n) / self.sr
        source = self._glottal_source(t, f0, breathiness)
        f1 = params['f1'] * formant_shift
        f2 = params['f2'] * formant_shift
        f3 = params['f3'] * formant_shift
        audio = self._apply_formants(source, f1, f2, f3)
        audio *= params['amp'] * self._consonant_envelope(n)
        return audio.astype(np.float32)

    def _synth_glide(self, params, f0, speed, formant_shift, breathiness):
        """Glide (W/Y): same voiced formant synthesis as liquids."""
        dur_ms = params['dur'] / speed
        n = int(self.sr * dur_ms / 1000)
        t = np.arange(n) / self.sr
        source = self._glottal_source(t, f0, breathiness)
        f1 = params['f1'] * formant_shift
        f2 = params['f2'] * formant_shift
        f3 = params['f3'] * formant_shift
        audio = self._apply_formants(source, f1, f2, f3)
        audio *= params['amp'] * self._consonant_envelope(n)
        return audio.astype(np.float32)

    def _smooth_concat(self, segments):
        """Concatenate segments with a short (64-sample) sqrt crossfade."""
        if not segments:
            return np.zeros(1000, dtype=np.float32)
        if len(segments) == 1:
            return segments[0]
        overlap = 64
        total_len = sum(len(s) for s in segments) - overlap * (len(segments) - 1)
        total_len = max(total_len, 100)
        audio = np.zeros(total_len, dtype=np.float32)
        pos = 0
        for i, seg in enumerate(segments):
            if len(seg) == 0:
                continue
            end_pos = min(pos + len(seg), total_len)
            seg_len = end_pos - pos
            if seg_len <= 0:
                break  # buffer full; remaining segments would not fit
            seg_to_add = seg[:seg_len].copy()
            if i > 0 and pos > overlap:
                # Equal-power (sqrt) crossfade with the tail already written.
                fade_len = min(overlap, seg_len)
                fade_in = np.linspace(0, 1, fade_len) ** 0.5
                fade_out = np.linspace(1, 0, fade_len) ** 0.5
                audio[pos:pos + fade_len] *= fade_out
                seg_to_add[:fade_len] *= fade_in
            audio[pos:end_pos] += seg_to_add
            # Next segment starts half an overlap inside this one's tail.
            pos = end_pos - overlap // 2
            pos = max(0, pos)
        return audio

    def _normalize(self, audio):
        """Remove DC offset, peak-normalize to 0.9, apply short edge fades."""
        if len(audio) < 100:
            return audio
        audio = audio - np.mean(audio)
        max_val = np.max(np.abs(audio))
        if max_val > 0:
            audio = audio / max_val * 0.9
        fade = min(len(audio) // 40, 200)  # up to ~9 ms at 22.05 kHz
        audio[:fade] *= np.linspace(0, 1, fade)
        audio[-fade:] *= np.linspace(1, 0, fade)
        return audio
# ============================================
# MAIN TTS CLASS
# ============================================
class VedesTTS:
    """Top-level engine: text -> phonemes -> waveform, plus voice training."""

    def __init__(self, sample_rate=22050):
        self.sr = sample_rate  # output sample rate (Hz)
        self.text_to_phoneme = TextToPhoneme()
        self.synthesizer = VoiceSynthesizer(sample_rate)
        self.voice_analyzer = VoiceAnalyzer(sample_rate)
        self.current_voice = VOICE_PROFILES["Emma (Female)"]

    def get_voice(self, voice_name):
        """Resolve a voice name to its profile dict.

        Built-in profiles take precedence over custom ones; unknown names
        fall back to the currently selected voice.
        """
        if voice_name in VOICE_PROFILES:
            return VOICE_PROFILES[voice_name]
        elif voice_name in custom_voices:
            return custom_voices[voice_name]
        return self.current_voice

    def speak(self, text, rate=1.0, pitch=1.0, voice_name=None):
        """Synthesize `text` to float32 audio.

        Returns one second of silence for empty/whitespace text or when
        no phonemes could be produced.
        """
        if not text or not text.strip():
            return np.zeros(self.sr, dtype=np.float32)
        voice = self.get_voice(voice_name) if voice_name else self.current_voice
        phonemes = self.text_to_phoneme.convert(text)
        if not phonemes:
            return np.zeros(self.sr, dtype=np.float32)
        audio = self.synthesizer.synthesize(phonemes, voice, rate, pitch)
        return audio

    def train_voice(self, audio_data, voice_name="My Voice"):
        """Train a new voice from audio sample"""
        global custom_voices
        if audio_data is None:
            return None
        # Handle tuple format (sample_rate, audio) — the shape gr.Audio emits
        if isinstance(audio_data, tuple):
            sr, audio = audio_data
            audio = audio.astype(np.float32)
            # Handle stereo: average channels down to mono
            if len(audio.shape) > 1:
                audio = audio.mean(axis=1)
            # Resample if needed
            if sr != self.sr:
                duration = len(audio) / sr
                new_length = int(duration * self.sr)
                audio = signal.resample(audio, new_length)
        else:
            audio = audio_data.astype(np.float32)
        # Normalize to [-1, 1] before analysis
        max_val = np.max(np.abs(audio))
        if max_val > 0:
            audio = audio / max_val
        # Analyze and, on success, register the profile in the global store
        profile = self.voice_analyzer.analyze(audio)
        if profile:
            profile['name'] = voice_name
            profile['description'] = f"Trained voice (F0={profile['f0']:.0f}Hz)"
            custom_voices[voice_name] = profile
            return profile
        return None
# ============================================
# INITIALIZE
# ============================================
print("=" * 50)
print("🎙️ VEDES TTS - With Voice Training")
print("=" * 50)
# Module-level singleton used by all Gradio callbacks below.
tts = VedesTTS(SAMPLE_RATE)
print("✅ Ready!")
print(f"📢 Available voices: {len(VOICE_PROFILES)}")
print("=" * 50)
# ============================================
# HELPER FUNCTIONS
# ============================================
def get_all_voices():
    """Return the names of every selectable voice: built-ins first, then custom."""
    return [*VOICE_PROFILES, *custom_voices]
def get_voice_info(voice_name):
    """Return a markdown summary for a built-in or custom voice name."""
    # Built-in profiles win over custom ones with the same name.
    profile = VOICE_PROFILES.get(voice_name) or custom_voices.get(voice_name)
    if profile is None:
        return "Select a voice"
    return f"""
**{profile.get('name', voice_name)}**
- Type: {profile.get('gender', 'unknown').title()}
- Pitch: {profile.get('f0', 130):.0f} Hz
- {profile.get('description', '')}
"""
# ============================================
# GRADIO FUNCTIONS
# ============================================
def synthesize(text, voice_name, rate, pitch):
    """Gradio callback: render speech.

    Returns (sample_rate, int16 array) for gr.Audio, or None on empty
    input / failure.
    """
    if not text or not text.strip():
        return None
    snippet = text.strip()[:300]  # cap input length
    try:
        # Pitch slider is in semitones; convert to a multiplicative factor.
        semitone_factor = 2 ** (pitch / 12)
        wave = tts.speak(snippet, rate=rate, pitch=semitone_factor, voice_name=voice_name)
        if len(wave) < 100:
            return None
        wave = np.clip(wave, -1, 1)
        return (SAMPLE_RATE, (wave * 32767).astype(np.int16))
    except Exception as e:
        print(f"Synthesis error: {e}")
        return None
def train_voice(audio, voice_name):
    """Analyze a recorded/uploaded sample and register it as a custom voice.

    Returns a (markdown status message, voice-name list) pair for the UI.
    """
    global custom_voices
    # Nothing recorded or uploaded yet.
    if audio is None:
        return "❌ Please record or upload audio first.", get_all_voices()
    # Fall back to an auto-generated name, then cap at 30 characters.
    label = (voice_name or "").strip()
    if not label:
        label = f"Custom Voice {len(custom_voices) + 1}"
    label = label[:30]
    # Never shadow a built-in profile name.
    if label in VOICE_PROFILES:
        label = f"{label} (custom)"
    try:
        profile = tts.train_voice(audio, label)
        if not profile:
            return "❌ Could not analyze voice. Try a longer/clearer sample.", get_all_voices()
        summary = f"""
✅ **Voice "{label}" created!**
**Detected Parameters:**
- Pitch (F0): {profile['f0']:.1f} Hz
- Pitch Variation: {profile['f0_variation']:.1f} Hz
- Formant Shift: {profile['formant_shift']:.2f}
- Breathiness: {profile['breathiness']:.3f}
- Brightness: {profile['brightness']:.2f}
You can now select this voice in the Speak tab!
"""
        return summary, get_all_voices()
    except Exception as e:
        return f"❌ Error: {str(e)}", get_all_voices()
def create_custom_voice(name, pitch, formant, breathiness, speed, brightness):
    """Register a custom voice built from manually-chosen parameters.

    Returns a (markdown status message, voice-name list) pair for the UI.
    """
    global custom_voices
    if not name or not name.strip():
        return "❌ Please enter a voice name.", get_all_voices()
    label = name.strip()[:30]
    # Never shadow a built-in profile name.
    if label in VOICE_PROFILES:
        label = f"{label} (custom)"
    custom_voices[label] = {
        "name": label,
        "gender": "custom",
        "f0": pitch,
        "f0_variation": 25,  # fixed default; not exposed as a slider
        "formant_shift": formant,
        "breathiness": breathiness / 100,  # UI slider is in percent
        "speed": speed,
        "brightness": brightness,
        "description": f"Custom voice (F0={pitch}Hz)",
    }
    return f"✅ Voice **{label}** created! Select it in the Speak tab.", get_all_voices()
def refresh_voices():
    """Re-sync the voice dropdown's choices with the current registry."""
    names = get_all_voices()
    return gr.update(choices=names)
# ============================================
# GRADIO INTERFACE
# ============================================
# Declarative UI layout; component creation order matters because the event
# handlers below reference the components by the names bound here.
with gr.Blocks(title="Vedes TTS", theme=gr.themes.Soft()) as demo:
    gr.Markdown("""
# 🎙️ Vedes TTS - Voice Training Edition
### Create and Use Custom Voices - 100% From Scratch
""")
    with gr.Tabs():
        # ===== SPEAK TAB =====
        with gr.TabItem("🔊 Speak"):
            with gr.Row():
                with gr.Column(scale=2):
                    text_input = gr.Textbox(
                        label="📝 Text to Speak",
                        placeholder="Type something...",
                        lines=3
                    )
                    with gr.Row():
                        # Choices are snapshotted at build time; refresh_btn
                        # re-queries them after a voice is trained/created.
                        voice_select = gr.Dropdown(
                            choices=get_all_voices(),
                            value="Emma (Female)",
                            label="🗣️ Voice",
                            interactive=True
                        )
                        refresh_btn = gr.Button("🔄", size="sm")
                    voice_info = gr.Markdown("Select a voice")
                    with gr.Row():
                        # Speed is a playback-rate multiplier; pitch is in semitones.
                        rate = gr.Slider(0.6, 1.5, 0.9, step=0.1, label="⏱️ Speed")
                        pitch = gr.Slider(-6, 6, 0, step=1, label="🎵 Pitch")
                    speak_btn = gr.Button("🔊 Speak", variant="primary", size="lg")
                with gr.Column(scale=1):
                    audio_out = gr.Audio(label="🎧 Output", type="numpy")
            gr.Examples(
                examples=[
                    ["Hello, how are you?"],
                    ["Good morning!"],
                    ["My name is Vedes."],
                    ["Thank you very much."],
                    ["Have a nice day."],
                ],
                inputs=text_input,
                label="📚 Examples"
            )
        # ===== TRAIN VOICE TAB =====
        with gr.TabItem("🎤 Train Voice"):
            gr.Markdown("""
### Train a New Voice from Audio
Record or upload 3-10 seconds of clear speech.
**Tips:**
- Speak naturally and clearly
- Avoid background noise
- Read a few sentences
""")
            with gr.Row():
                with gr.Column():
                    audio_input = gr.Audio(
                        label="🎤 Record or Upload",
                        sources=["microphone", "upload"],
                        type="numpy"
                    )
                    voice_name_input = gr.Textbox(
                        label="Voice Name",
                        placeholder="e.g., My Voice",
                        value=""
                    )
                    train_btn = gr.Button("🧠 Train Voice", variant="primary")
                with gr.Column():
                    train_result = gr.Markdown("Record audio and click Train")
                    gr.Markdown("""
### What Gets Analyzed:
- **Pitch (F0)**: How high/low the voice is
- **Formants**: Voice quality/timbre
- **Breathiness**: Air in the voice
""")
        # ===== CREATE VOICE TAB =====
        with gr.TabItem("⚙️ Create Voice"):
            gr.Markdown("### Create Custom Voice Manually")
            with gr.Row():
                with gr.Column():
                    custom_name = gr.Textbox(
                        label="Voice Name",
                        placeholder="My Custom Voice"
                    )
                    custom_pitch = gr.Slider(
                        60, 300, 150,
                        label="Pitch (Hz)",
                        info="60-130=Male, 150-250=Female, 250+=Child"
                    )
                    custom_formant = gr.Slider(
                        0.7, 1.4, 1.0, step=0.05,
                        label="Formant Shift",
                        info="<1.0=Male, >1.0=Female/Child"
                    )
                    custom_breathiness = gr.Slider(
                        1, 10, 3,
                        label="Breathiness (%)"
                    )
                    custom_speed = gr.Slider(
                        0.7, 1.3, 1.0, step=0.05,
                        label="Natural Speed"
                    )
                    custom_brightness = gr.Slider(
                        0.8, 1.3, 1.0, step=0.05,
                        label="Brightness"
                    )
                    create_btn = gr.Button("✨ Create Voice", variant="primary")
                with gr.Column():
                    create_result = gr.Markdown("")
                    gr.Markdown("""
### Quick Presets:
| Type | Pitch | Formant |
|------|-------|---------|
| Deep Male | 85 | 0.85 |
| Male | 120 | 0.92 |
| Female | 200 | 1.12 |
| High Female | 240 | 1.20 |
| Child | 280 | 1.25 |
""")
        # ===== ALL VOICES TAB =====
        with gr.TabItem("👥 All Voices"):
            gr.Markdown("### Pre-built Voices")
            # Static listing rendered once at build time from VOICE_PROFILES.
            voice_info_md = ""
            for name, v in VOICE_PROFILES.items():
                voice_info_md += f"""
**{name}**
- Type: {v['gender'].title()} | Pitch: {v['f0']} Hz
- {v['description']}
"""
            gr.Markdown(voice_info_md)
            gr.Markdown("### Custom Voices")
            # NOTE(review): this placeholder is never wired to an event handler
            # below, so it stays "*No custom voices yet*" — confirm if intended.
            custom_voices_display = gr.Markdown("*No custom voices yet*")
    # ===== EVENT HANDLERS =====
    # Speak tab
    voice_select.change(get_voice_info, voice_select, voice_info)
    refresh_btn.click(refresh_voices, outputs=voice_select)
    speak_btn.click(synthesize, [text_input, voice_select, rate, pitch], audio_out)
    text_input.submit(synthesize, [text_input, voice_select, rate, pitch], audio_out)
    # Train tab - Fixed: update choices first, then set value separately
    def train_and_update(audio, name):
        # Wrapper so the dropdown receives a gr.update (choices refresh)
        # rather than a plain list.
        result, voices = train_voice(audio, name)
        # Return result and updated dropdown with new choices
        return result, gr.update(choices=voices)
    train_btn.click(
        train_and_update,
        [audio_input, voice_name_input],
        [train_result, voice_select]
    )
    # Create tab - Fixed similarly
    def create_and_update(name, pitch, formant, breathiness, speed, brightness):
        result, voices = create_custom_voice(name, pitch, formant, breathiness, speed, brightness)
        return result, gr.update(choices=voices)
    create_btn.click(
        create_and_update,
        [custom_name, custom_pitch, custom_formant, custom_breathiness,
         custom_speed, custom_brightness],
        [create_result, voice_select]
    )
# ============================================
# LAUNCH
# ============================================
if __name__ == "__main__":
    # Start the Gradio server only when executed directly (not on import).
    demo.launch()