import numpy as np import gradio as gr from scipy import signal from scipy.io import wavfile import tempfile import re import json import os # ============================================ # VEDES TTS - WITH VOICE TRAINING (FIXED) # 100% From Scratch - No APIs # ============================================ SAMPLE_RATE = 22050 # ============================================ # VOICE PROFILES - Pre-defined Voices # ============================================ VOICE_PROFILES = { "Emma (Female)": { "name": "Emma", "gender": "female", "f0": 210, "f0_variation": 30, "formant_shift": 1.15, "breathiness": 0.04, "speed": 1.0, "brightness": 1.1, "description": "Friendly female voice" }, "James (Male)": { "name": "James", "gender": "male", "f0": 110, "f0_variation": 20, "formant_shift": 0.9, "breathiness": 0.02, "speed": 0.95, "brightness": 0.95, "description": "Professional male voice" }, "Sophie (Child)": { "name": "Sophie", "gender": "child", "f0": 280, "f0_variation": 40, "formant_shift": 1.25, "breathiness": 0.03, "speed": 1.1, "brightness": 1.2, "description": "Young child voice" }, "David (Deep Male)": { "name": "David", "gender": "male", "f0": 85, "f0_variation": 15, "formant_shift": 0.82, "breathiness": 0.02, "speed": 0.9, "brightness": 0.85, "description": "Deep bass voice" }, "Lisa (Bright Female)": { "name": "Lisa", "gender": "female", "f0": 240, "f0_variation": 35, "formant_shift": 1.2, "breathiness": 0.05, "speed": 1.05, "brightness": 1.15, "description": "Bright, energetic female" }, "Robert (Elderly Male)": { "name": "Robert", "gender": "male", "f0": 95, "f0_variation": 12, "formant_shift": 0.88, "breathiness": 0.06, "speed": 0.85, "brightness": 0.9, "description": "Mature elderly voice" }, "Anna (Soft Female)": { "name": "Anna", "gender": "female", "f0": 195, "f0_variation": 25, "formant_shift": 1.1, "breathiness": 0.07, "speed": 0.92, "brightness": 1.0, "description": "Soft, gentle female" }, "Mike (Energetic Male)": { "name": "Mike", "gender": "male", "f0": 130, "f0_variation": 30, "formant_shift": 0.95, "breathiness": 0.02, "speed": 1.1, "brightness": 1.05, "description": "Energetic young male" }, } # Custom voices storage (global) custom_voices = {} # ============================================ # PHONEME DATA # ============================================ VOWELS = { 'IY': (280, 2250, 2890, 150, 1.0, True), 'IH': (400, 1920, 2550, 120, 0.9, True), 'EH': (550, 1770, 2490, 130, 0.95, True), 'AE': (690, 1660, 2490, 140, 1.0, True), 'AA': (710, 1100, 2540, 150, 1.0, True), 'AO': (590, 880, 2540, 140, 0.95, True), 'UH': (470, 1100, 2540, 120, 0.9, True), 'UW': (310, 870, 2250, 150, 1.0, True), 'AH': (640, 1200, 2400, 100, 0.85, True), 'AX': (500, 1500, 2500, 80, 0.7, True), 'ER': (500, 1350, 1700, 140, 0.9, True), 'EY': (500, 1900, 2600, 160, 1.0, True), 'AY': (700, 1200, 2600, 180, 1.0, True), 'OY': (500, 900, 2500, 180, 1.0, True), 'AW': (700, 1100, 2600, 180, 1.0, True), 'OW': (500, 900, 2500, 160, 1.0, True), } CONSONANTS = { 'P': {'type': 'stop', 'closure': 80, 'burst': 30, 'voiced': False, 'burst_freq': 800, 'amp': 0.6}, 'B': {'type': 'stop', 'closure': 50, 'burst': 25, 'voiced': True, 'burst_freq': 800, 'amp': 0.7}, 'T': {'type': 'stop', 'closure': 70, 'burst': 30, 'voiced': False, 'burst_freq': 3500, 'amp': 0.7}, 'D': {'type': 'stop', 'closure': 40, 'burst': 25, 'voiced': True, 'burst_freq': 3500, 'amp': 0.7}, 'K': {'type': 'stop', 'closure': 80, 'burst': 40, 'voiced': False, 'burst_freq': 1500, 'amp': 0.7}, 'G': {'type': 'stop', 'closure': 50, 'burst': 30, 'voiced': True, 'burst_freq': 1500, 'amp': 0.7}, 'F': {'type': 'fric', 'dur': 120, 'freq_low': 1500, 'freq_high': 8000, 'voiced': False, 'amp': 0.4}, 'V': {'type': 'fric', 'dur': 80, 'freq_low': 1500, 'freq_high': 8000, 'voiced': True, 'amp': 0.5}, 'TH': {'type': 'fric', 'dur': 100, 'freq_low': 1400, 'freq_high': 6000, 'voiced': False, 'amp': 0.3}, 'DH': {'type': 'fric', 'dur': 60, 'freq_low': 1400, 'freq_high': 6000, 'voiced': True, 'amp': 0.5}, 'S': {'type': 'fric', 'dur': 120, 'freq_low': 4000, 'freq_high': 9000, 'voiced': False, 'amp': 0.5}, 'Z': {'type': 'fric', 'dur': 90, 'freq_low': 4000, 'freq_high': 9000, 'voiced': True, 'amp': 0.5}, 'SH': {'type': 'fric', 'dur': 120, 'freq_low': 2000, 'freq_high': 6000, 'voiced': False, 'amp': 0.5}, 'ZH': {'type': 'fric', 'dur': 80, 'freq_low': 2000, 'freq_high': 6000, 'voiced': True, 'amp': 0.5}, 'HH': {'type': 'fric', 'dur': 80, 'freq_low': 500, 'freq_high': 2000, 'voiced': False, 'amp': 0.3}, 'CH': {'type': 'affric', 'closure': 60, 'fric': 80, 'freq_low': 2000, 'freq_high': 6000, 'voiced': False, 'amp': 0.6}, 'JH': {'type': 'affric', 'closure': 40, 'fric': 60, 'freq_low': 2000, 'freq_high': 6000, 'voiced': True, 'amp': 0.6}, 'M': {'type': 'nasal', 'f1': 280, 'f2': 1000, 'f3': 2200, 'dur': 100, 'amp': 0.8}, 'N': {'type': 'nasal', 'f1': 280, 'f2': 1700, 'f3': 2500, 'dur': 90, 'amp': 0.8}, 'NG': {'type': 'nasal', 'f1': 300, 'f2': 2000, 'f3': 2700, 'dur': 100, 'amp': 0.8}, 'L': {'type': 'liquid', 'f1': 380, 'f2': 1000, 'f3': 2700, 'dur': 90, 'amp': 0.85}, 'R': {'type': 'liquid', 'f1': 350, 'f2': 1300, 'f3': 1700, 'dur': 90, 'amp': 0.85}, 'W': {'type': 'glide', 'f1': 300, 'f2': 700, 'f3': 2200, 'dur': 80, 'amp': 0.8}, 'Y': {'type': 'glide', 'f1': 280, 'f2': 2200, 'f3': 2900, 'dur': 70, 'amp': 0.8}, } SILENCE = {'SIL': 60, 'PAU': 200} # ============================================ # PRONUNCIATION DICTIONARY # ============================================ DICTIONARY = { # Function words 'a': ['AX'], 'an': ['AE', 'N'], 'the': ['DH', 'AX'], 'and': ['AE', 'N', 'D'], 'or': ['AO', 'R'], 'but': ['B', 'AH', 'T'], 'if': ['IH', 'F'], 'of': ['AH', 'V'], 'to': ['T', 'UW'], 'in': ['IH', 'N'], 'on': ['AA', 'N'], 'at': ['AE', 'T'], 'by': ['B', 'AY'], 'for': ['F', 'AO', 'R'], 'with': ['W', 'IH', 'TH'], 'from': ['F', 'R', 'AH', 'M'], 'up': ['AH', 'P'], 'out': ['AW', 'T'], 'as': ['AE', 'Z'], 'so': ['S', 'OW'], 'not': ['N', 'AA', 'T'], # Pronouns 'i': ['AY'], 'me': ['M', 'IY'], 'my': ['M', 'AY'], 'you': ['Y', 'UW'], 'your': ['Y', 'AO', 'R'], 'he': ['HH', 'IY'], 'him': ['HH', 'IH', 'M'], 'his': ['HH', 'IH', 'Z'], 'she': ['SH', 'IY'], 'her': ['HH', 'ER'], 'it': ['IH', 'T'], 'its': ['IH', 'T', 'S'], 'we': ['W', 'IY'], 'us': ['AH', 'S'], 'our': ['AW', 'ER'], 'they': ['DH', 'EY'], 'them': ['DH', 'EH', 'M'], 'their': ['DH', 'EH', 'R'], 'this': ['DH', 'IH', 'S'], 'that': ['DH', 'AE', 'T'], 'what': ['W', 'AH', 'T'], 'who': ['HH', 'UW'], 'where': ['W', 'EH', 'R'], 'when': ['W', 'EH', 'N'], 'why': ['W', 'AY'], 'how': ['HH', 'AW'], 'which': ['W', 'IH', 'CH'], # Be verbs 'am': ['AE', 'M'], 'is': ['IH', 'Z'], 'are': ['AA', 'R'], 'was': ['W', 'AA', 'Z'], 'were': ['W', 'ER'], 'be': ['B', 'IY'], 'been': ['B', 'IH', 'N'], 'being': ['B', 'IY', 'IH', 'NG'], # Have verbs 'have': ['HH', 'AE', 'V'], 'has': ['HH', 'AE', 'Z'], 'had': ['HH', 'AE', 'D'], 'having': ['HH', 'AE', 'V', 'IH', 'NG'], # Do verbs 'do': ['D', 'UW'], 'does': ['D', 'AH', 'Z'], 'did': ['D', 'IH', 'D'], 'done': ['D', 'AH', 'N'], # Modal verbs 'will': ['W', 'IH', 'L'], 'would': ['W', 'UH', 'D'], 'can': ['K', 'AE', 'N'], 'could': ['K', 'UH', 'D'], 'should': ['SH', 'UH', 'D'], 'may': ['M', 'EY'], 'might': ['M', 'AY', 'T'], 'must': ['M', 'AH', 'S', 'T'], # Common verbs 'go': ['G', 'OW'], 'goes': ['G', 'OW', 'Z'], 'going': ['G', 'OW', 'IH', 'NG'], 'went': ['W', 'EH', 'N', 'T'], 'gone': ['G', 'AO', 'N'], 'come': ['K', 'AH', 'M'], 'comes': ['K', 'AH', 'M', 'Z'], 'coming': ['K', 'AH', 'M', 'IH', 'NG'], 'came': ['K', 'EY', 'M'], 'get': ['G', 'EH', 'T'], 'gets': ['G', 'EH', 'T', 'S'], 'getting': ['G', 'EH', 'T', 'IH', 'NG'], 'got': ['G', 'AA', 'T'], 'make': ['M', 'EY', 'K'], 'makes': ['M', 'EY', 'K', 'S'], 'making': ['M', 'EY', 'K', 'IH', 'NG'], 'made': ['M', 'EY', 'D'], 'take': ['T', 'EY', 'K'], 'takes': ['T', 'EY', 'K', 'S'], 'took': ['T', 'UH', 'K'], 'taken': ['T', 'EY', 'K', 'AX', 'N'], 'see': ['S', 'IY'], 'sees': ['S', 'IY', 'Z'], 'saw': ['S', 'AO'], 'seen': ['S', 'IY', 'N'], 'say': ['S', 'EY'], 'says': ['S', 'EH', 'Z'], 'said': ['S', 'EH', 'D'], 'know': ['N', 'OW'], 'knows': ['N', 'OW', 'Z'], 'knew': ['N', 'UW'], 'known': ['N', 'OW', 'N'], 'think': ['TH', 'IH', 'NG', 'K'], 'thought': ['TH', 'AO', 'T'], 'want': ['W', 'AA', 'N', 'T'], 'wants': ['W', 'AA', 'N', 'T', 'S'], 'give': ['G', 'IH', 'V'], 'gives': ['G', 'IH', 'V', 'Z'], 'gave': ['G', 'EY', 'V'], 'given': ['G', 'IH', 'V', 'AX', 'N'], 'tell': ['T', 'EH', 'L'], 'told': ['T', 'OW', 'L', 'D'], 'ask': ['AE', 'S', 'K'], 'asked': ['AE', 'S', 'K', 'T'], 'use': ['Y', 'UW', 'Z'], 'used': ['Y', 'UW', 'Z', 'D'], 'find': ['F', 'AY', 'N', 'D'], 'found': ['F', 'AW', 'N', 'D'], 'work': ['W', 'ER', 'K'], 'works': ['W', 'ER', 'K', 'S'], 'call': ['K', 'AO', 'L'], 'called': ['K', 'AO', 'L', 'D'], 'try': ['T', 'R', 'AY'], 'tried': ['T', 'R', 'AY', 'D'], 'need': ['N', 'IY', 'D'], 'needs': ['N', 'IY', 'D', 'Z'], 'feel': ['F', 'IY', 'L'], 'feels': ['F', 'IY', 'L', 'Z'], 'help': ['HH', 'EH', 'L', 'P'], 'helps': ['HH', 'EH', 'L', 'P', 'S'], 'keep': ['K', 'IY', 'P'], 'kept': ['K', 'EH', 'P', 'T'], 'let': ['L', 'EH', 'T'], 'put': ['P', 'UH', 'T'], 'seem': ['S', 'IY', 'M'], 'leave': ['L', 'IY', 'V'], 'show': ['SH', 'OW'], 'hear': ['HH', 'IY', 'R'], 'play': ['P', 'L', 'EY'], 'run': ['R', 'AH', 'N'], 'move': ['M', 'UW', 'V'], 'live': ['L', 'IH', 'V'], 'believe': ['B', 'IH', 'L', 'IY', 'V'], 'read': ['R', 'IY', 'D'], 'write': ['R', 'AY', 'T'], 'learn': ['L', 'ER', 'N'], 'speak': ['S', 'P', 'IY', 'K'], 'look': ['L', 'UH', 'K'], 'like': ['L', 'AY', 'K'], 'love': ['L', 'AH', 'V'], 'start': ['S', 'T', 'AA', 'R', 'T'], 'stop': ['S', 'T', 'AA', 'P'], # Adjectives 'good': ['G', 'UH', 'D'], 'better': ['B', 'EH', 'T', 'ER'], 'best': ['B', 'EH', 'S', 'T'], 'bad': ['B', 'AE', 'D'], 'new': ['N', 'UW'], 'old': ['OW', 'L', 'D'], 'big': ['B', 'IH', 'G'], 'small': ['S', 'M', 'AO', 'L'], 'long': ['L', 'AO', 'NG'], 'short': ['SH', 'AO', 'R', 'T'], 'high': ['HH', 'AY'], 'low': ['L', 'OW'], 'great': ['G', 'R', 'EY', 'T'], 'little': ['L', 'IH', 'T', 'AX', 'L'], 'right': ['R', 'AY', 'T'], 'wrong': ['R', 'AO', 'NG'], 'first': ['F', 'ER', 'S', 'T'], 'last': ['L', 'AE', 'S', 'T'], 'same': ['S', 'EY', 'M'], 'different': ['D', 'IH', 'F', 'R', 'AX', 'N', 'T'], 'own': ['OW', 'N'], 'other': ['AH', 'DH', 'ER'], 'nice': ['N', 'AY', 'S'], 'happy': ['HH', 'AE', 'P', 'IY'], 'sure': ['SH', 'UH', 'R'], 'true': ['T', 'R', 'UW'], 'real': ['R', 'IY', 'L'], 'clear': ['K', 'L', 'IY', 'R'], 'fine': ['F', 'AY', 'N'], 'free': ['F', 'R', 'IY'], 'easy': ['IY', 'Z', 'IY'], 'hard': ['HH', 'AA', 'R', 'D'], 'young': ['Y', 'AH', 'NG'], 'beautiful': ['B', 'Y', 'UW', 'T', 'IH', 'F', 'AX', 'L'], # Adverbs 'very': ['V', 'EH', 'R', 'IY'], 'really': ['R', 'IY', 'L', 'IY'], 'just': ['JH', 'AH', 'S', 'T'], 'only': ['OW', 'N', 'L', 'IY'], 'also': ['AO', 'L', 'S', 'OW'], 'well': ['W', 'EH', 'L'], 'now': ['N', 'AW'], 'then': ['DH', 'EH', 'N'], 'here': ['HH', 'IY', 'R'], 'there': ['DH', 'EH', 'R'], 'still': ['S', 'T', 'IH', 'L'], 'even': ['IY', 'V', 'AX', 'N'], 'back': ['B', 'AE', 'K'], 'again': ['AX', 'G', 'EH', 'N'], 'always': ['AO', 'L', 'W', 'EY', 'Z'], 'never': ['N', 'EH', 'V', 'ER'], 'today': ['T', 'AX', 'D', 'EY'], 'maybe': ['M', 'EY', 'B', 'IY'], 'too': ['T', 'UW'], 'much': ['M', 'AH', 'CH'], 'more': ['M', 'AO', 'R'], 'most': ['M', 'OW', 'S', 'T'], 'please': ['P', 'L', 'IY', 'Z'], # Nouns 'time': ['T', 'AY', 'M'], 'year': ['Y', 'IY', 'R'], 'day': ['D', 'EY'], 'way': ['W', 'EY'], 'man': ['M', 'AE', 'N'], 'woman': ['W', 'UH', 'M', 'AX', 'N'], 'child': ['CH', 'AY', 'L', 'D'], 'world': ['W', 'ER', 'L', 'D'], 'life': ['L', 'AY', 'F'], 'hand': ['HH', 'AE', 'N', 'D'], 'part': ['P', 'AA', 'R', 'T'], 'place': ['P', 'L', 'EY', 'S'], 'thing': ['TH', 'IH', 'NG'], 'things': ['TH', 'IH', 'NG', 'Z'], 'people': ['P', 'IY', 'P', 'AX', 'L'], 'person': ['P', 'ER', 'S', 'AX', 'N'], 'home': ['HH', 'OW', 'M'], 'house': ['HH', 'AW', 'S'], 'word': ['W', 'ER', 'D'], 'name': ['N', 'EY', 'M'], 'water': ['W', 'AO', 'T', 'ER'], 'money': ['M', 'AH', 'N', 'IY'], 'family': ['F', 'AE', 'M', 'AX', 'L', 'IY'], 'friend': ['F', 'R', 'EH', 'N', 'D'], 'friends': ['F', 'R', 'EH', 'N', 'D', 'Z'], 'mother': ['M', 'AH', 'DH', 'ER'], 'father': ['F', 'AA', 'DH', 'ER'], 'boy': ['B', 'OY'], 'girl': ['G', 'ER', 'L'], 'head': ['HH', 'EH', 'D'], 'face': ['F', 'EY', 'S'], 'eye': ['AY'], 'eyes': ['AY', 'Z'], 'voice': ['V', 'OY', 'S'], 'night': ['N', 'AY', 'T'], 'morning': ['M', 'AO', 'R', 'N', 'IH', 'NG'], 'week': ['W', 'IY', 'K'], 'month': ['M', 'AH', 'N', 'TH'], 'school': ['S', 'K', 'UW', 'L'], 'book': ['B', 'UH', 'K'], 'story': ['S', 'T', 'AO', 'R', 'IY'], 'question': ['K', 'W', 'EH', 'S', 'CH', 'AX', 'N'], 'answer': ['AE', 'N', 'S', 'ER'], # Numbers 'zero': ['Z', 'IY', 'R', 'OW'], 'one': ['W', 'AH', 'N'], 'two': ['T', 'UW'], 'three': ['TH', 'R', 'IY'], 'four': ['F', 'AO', 'R'], 'five': ['F', 'AY', 'V'], 'six': ['S', 'IH', 'K', 'S'], 'seven': ['S', 'EH', 'V', 'AX', 'N'], 'eight': ['EY', 'T'], 'nine': ['N', 'AY', 'N'], 'ten': ['T', 'EH', 'N'], # Greetings 'hello': ['HH', 'AX', 'L', 'OW'], 'hi': ['HH', 'AY'], 'hey': ['HH', 'EY'], 'goodbye': ['G', 'UH', 'D', 'B', 'AY'], 'bye': ['B', 'AY'], 'welcome': ['W', 'EH', 'L', 'K', 'AX', 'M'], 'thank': ['TH', 'AE', 'NG', 'K'], 'thanks': ['TH', 'AE', 'NG', 'K', 'S'], 'sorry': ['S', 'AA', 'R', 'IY'], 'yes': ['Y', 'EH', 'S'], 'yeah': ['Y', 'AE'], 'no': ['N', 'OW'], 'ok': ['OW', 'K', 'EY'], 'okay': ['OW', 'K', 'EY'], # Tech/TTS 'text': ['T', 'EH', 'K', 'S', 'T'], 'speech': ['S', 'P', 'IY', 'CH'], 'sound': ['S', 'AW', 'N', 'D'], 'audio': ['AO', 'D', 'IY', 'OW'], 'test': ['T', 'EH', 'S', 'T'], 'testing': ['T', 'EH', 'S', 'T', 'IH', 'NG'], 'computer': ['K', 'AX', 'M', 'P', 'Y', 'UW', 'T', 'ER'], 'vedes': ['V', 'EY', 'D', 'EH', 'S'], 'system': ['S', 'IH', 'S', 'T', 'AX', 'M'], 'train': ['T', 'R', 'EY', 'N'], 'training': ['T', 'R', 'EY', 'N', 'IH', 'NG'], } # Letter patterns PATTERNS = [ ('tion', ['SH', 'AX', 'N']), ('sion', ['ZH', 'AX', 'N']), ('ness', ['N', 'AX', 'S']), ('ment', ['M', 'AX', 'N', 'T']), ('able', ['AX', 'B', 'AX', 'L']), ('ible', ['AX', 'B', 'AX', 'L']), ('ful', ['F', 'AX', 'L']), ('less', ['L', 'AX', 'S']), ('ing', ['IH', 'NG']), ('ight', ['AY', 'T']), ('ough', ['AO']), ('ould', ['UH', 'D']), ('th', ['TH']), ('sh', ['SH']), ('ch', ['CH']), ('wh', ['W']), ('ph', ['F']), ('ck', ['K']), ('ng', ['NG']), ('qu', ['K', 'W']), ('ee', ['IY']), ('ea', ['IY']), ('oo', ['UW']), ('ou', ['AW']), ('ow', ['OW']), ('ai', ['EY']), ('ay', ['EY']), ('ey', ['IY']), ('oy', ['OY']), ('oi', ['OY']), ('ie', ['IY']), ('er', ['ER']), ('ir', ['ER']), ('ur', ['ER']), ('ar', ['AA', 'R']), ('or', ['AO', 'R']), ] LETTERS = { 'a': 'AE', 'b': 'B', 'c': 'K', 'd': 'D', 'e': 'EH', 'f': 'F', 'g': 'G', 'h': 'HH', 'i': 'IH', 'j': 'JH', 'k': 'K', 'l': 'L', 'm': 'M', 'n': 'N', 'o': 'AA', 'p': 'P', 'q': 'K', 'r': 'R', 's': 'S', 't': 'T', 'u': 'AH', 'v': 'V', 'w': 'W', 'x': 'K', 'y': 'IY', 'z': 'Z', } # ============================================ # VOICE ANALYZER # ============================================ class VoiceAnalyzer: """Analyze audio to extract voice characteristics""" def __init__(self, sample_rate=22050): self.sr = sample_rate def analyze(self, audio): """Extract voice features from audio sample""" if len(audio) < self.sr * 0.3: return None audio = audio.astype(np.float32) max_val = np.max(np.abs(audio)) if max_val > 0: audio = audio / max_val f0 = self._estimate_pitch(audio) formants = self._estimate_formants(audio) breathiness = self._estimate_breathiness(audio) profile = { "name": "Custom Voice", "gender": "custom", "f0": f0, "f0_variation": self._estimate_f0_variation(audio, f0), "formant_shift": formants.get('shift', 1.0), "breathiness": breathiness, "speed": 1.0, "brightness": formants.get('brightness', 1.0), "description": f"Custom voice (F0={f0:.0f}Hz)" } return profile def _estimate_pitch(self, audio): """Estimate fundamental frequency using autocorrelation""" frame_size = int(self.sr * 0.03) pitches = [] for i in range(0, len(audio) - frame_size, frame_size): frame = audio[i:i + frame_size] # Remove DC frame = frame - np.mean(frame) # Autocorrelation corr = np.correlate(frame, frame, mode='full') corr = corr[len(corr)//2:] # Find peaks d = np.diff(corr) start_indices = np.where(d > 0)[0] if len(start_indices) > 0: start = start_indices[0] search_end = min(start + int(self.sr / 60), len(corr)) if search_end > start: peak = start + np.argmax(corr[start:search_end]) if peak > 0: f0 = self.sr / peak if 60 < f0 < 400: pitches.append(f0) if pitches: return float(np.median(pitches)) return 130.0 def _estimate_f0_variation(self, audio, base_f0): """Estimate pitch variation""" frame_size = int(self.sr * 0.03) pitches = [] for i in range(0, len(audio) - frame_size, frame_size): frame = audio[i:i + frame_size] frame = frame - np.mean(frame) corr = np.correlate(frame, frame, mode='full') corr = corr[len(corr)//2:] d = np.diff(corr) start_indices = np.where(d > 0)[0] if len(start_indices) > 0: start = start_indices[0] search_end = min(start + int(self.sr / 60), len(corr)) if search_end > start: peak = start + np.argmax(corr[start:search_end]) if peak > 0: f0 = self.sr / peak if 60 < f0 < 400: pitches.append(f0) if len(pitches) > 2: return min(float(np.std(pitches)), 50.0) return 20.0 def _estimate_formants(self, audio): """Estimate formant characteristics""" frame_size = 2048 if len(audio) < frame_size: return {'shift': 1.0, 'brightness': 1.0} spectrum = np.abs(np.fft.rfft(audio[:frame_size] * np.hanning(frame_size))) freqs = np.fft.rfftfreq(frame_size, 1/self.sr) total_energy = np.sum(spectrum) + 1e-8 centroid = np.sum(freqs * spectrum) / total_energy if centroid > 1600: shift = 1.2 brightness = 1.15 elif centroid > 1400: shift = 1.1 brightness = 1.05 elif centroid > 1200: shift = 1.0 brightness = 1.0 elif centroid > 1000: shift = 0.9 brightness = 0.95 else: shift = 0.85 brightness = 0.9 return {'shift': shift, 'brightness': brightness} def _estimate_breathiness(self, audio): """Estimate breathiness""" frame_size = 2048 if len(audio) < frame_size: return 0.03 spectrum = np.abs(np.fft.rfft(audio[:frame_size])) freqs = np.fft.rfftfreq(frame_size, 1/self.sr) low_mask = freqs < 1000 high_mask = (freqs > 2000) & (freqs < 5000) low_energy = np.sum(spectrum[low_mask]) + 1e-8 high_energy = np.sum(spectrum[high_mask]) ratio = high_energy / low_energy breathiness = float(np.clip(ratio * 0.1, 0.02, 0.1)) return breathiness # ============================================ # TEXT TO PHONEME CONVERTER # ============================================ class TextToPhoneme: def __init__(self): self.dictionary = DICTIONARY self.patterns = sorted(PATTERNS, key=lambda x: -len(x[0])) def convert(self, text): text = text.lower().strip() text = re.sub(r"[^\w\s.,!?']", '', text) tokens = re.findall(r"[\w']+|[.,!?]", text) phonemes = [] for i, token in enumerate(tokens): if token in '.,!?': phonemes.append('PAU') elif token in self.dictionary: phonemes.extend(self.dictionary[token]) if i < len(tokens) - 1 and tokens[i + 1] not in '.,!?': phonemes.append('SIL') else: phons = self._convert_word(token) phonemes.extend(phons) if i < len(tokens) - 1 and tokens[i + 1] not in '.,!?': phonemes.append('SIL') return phonemes def _convert_word(self, word): phonemes = [] i = 0 while i < len(word): matched = False for pattern, phons in self.patterns: if word[i:].startswith(pattern): phonemes.extend(phons) i += len(pattern) matched = True break if not matched: char = word[i] if char in LETTERS: phonemes.append(LETTERS[char]) i += 1 return phonemes # ============================================ # VOICE SYNTHESIZER # ============================================ class VoiceSynthesizer: def __init__(self, sample_rate=22050): self.sr = sample_rate self.default_voice = VOICE_PROFILES["Emma (Female)"] def synthesize(self, phonemes, voice_profile=None, rate=1.0, pitch=1.0): if not phonemes: return np.zeros(int(self.sr * 0.5), dtype=np.float32) voice = voice_profile or self.default_voice f0 = voice.get('f0', 130) * pitch f0_var = voice.get('f0_variation', 20) formant_shift = voice.get('formant_shift', 1.0) breathiness = voice.get('breathiness', 0.03) voice_speed = voice.get('speed', 1.0) * rate brightness = voice.get('brightness', 1.0) segments = [] for i, phon in enumerate(phonemes): prev_phon = phonemes[i - 1] if i > 0 else None next_phon = phonemes[i + 1] if i < len(phonemes) - 1 else None phrase_pos = i / max(len(phonemes), 1) f0_current = f0 + f0_var * np.sin(phrase_pos * np.pi) * 0.5 seg = self._synth_phoneme( phon, f0_current, voice_speed, formant_shift, breathiness, brightness, prev_phon, next_phon ) segments.append(seg) audio = self._smooth_concat(segments) audio = self._normalize(audio) return audio.astype(np.float32) def _synth_phoneme(self, phon, f0, speed, formant_shift, breathiness, brightness, prev_phon, next_phon): if phon in SILENCE: dur = int(self.sr * SILENCE[phon] / 1000 / speed) return np.zeros(dur, dtype=np.float32) if phon in VOWELS: return self._synth_vowel(phon, f0, speed, formant_shift, breathiness, brightness) if phon in CONSONANTS: return self._synth_consonant(phon, f0, speed, formant_shift, breathiness) return np.zeros(100, dtype=np.float32) def _synth_vowel(self, phon, f0, speed, formant_shift, breathiness, brightness): params = VOWELS[phon] f1, f2, f3, dur_ms, amp, voiced = params f1 = f1 * formant_shift f2 = f2 * formant_shift * brightness f3 = f3 * formant_shift * brightness dur_ms = dur_ms / speed n = int(self.sr * dur_ms / 1000) n = max(n, 100) t = np.arange(n) / self.sr source = self._glottal_source(t, f0, breathiness) audio = self._apply_formants(source, f1, f2, f3) envelope = self._vowel_envelope(n) audio = audio * envelope * amp return audio def _synth_consonant(self, phon, f0, speed, formant_shift, breathiness): params = CONSONANTS[phon] ctype = params['type'] if ctype == 'stop': return self._synth_stop(params, f0, speed, formant_shift) elif ctype == 'fric': return self._synth_fricative(params, f0, speed) elif ctype == 'affric': return self._synth_affricate(params, f0, speed) elif ctype == 'nasal': return self._synth_nasal(params, f0, speed, formant_shift, breathiness) elif ctype == 'liquid': return self._synth_liquid(params, f0, speed, formant_shift, breathiness) elif ctype == 'glide': return self._synth_glide(params, f0, speed, formant_shift, breathiness) return np.zeros(100, dtype=np.float32) def _glottal_source(self, t, f0, breathiness): T0 = 1.0 / f0 phase = (t % T0) / T0 glottal = np.zeros_like(t) mask1 = phase < 0.4 glottal[mask1] = 0.5 * (1 - np.cos(np.pi * phase[mask1] / 0.4)) mask2 = (phase >= 0.4) & (phase < 0.6) glottal[mask2] = np.cos(np.pi * (phase[mask2] - 0.4) / 0.4) glottal += np.random.randn(len(t)) * breathiness shimmer = 1 + 0.02 * np.sin(2 * np.pi * 5 * t) glottal *= shimmer return glottal def _apply_formants(self, source, f1, f2, f3): formants = [(f1, 90), (f2, 110), (f3, 130)] result = np.zeros_like(source) for freq, bw in formants: result += self._resonator(source, freq, bw) return result def _resonator(self, sig, freq, bw): if freq <= 0 or freq >= self.sr / 2: return sig r = np.exp(-np.pi * bw / self.sr) theta = 2 * np.pi * freq / self.sr a1 = -2 * r * np.cos(theta) a2 = r * r b0 = 1 - r y = np.zeros_like(sig) for i in range(2, len(sig)): y[i] = b0 * sig[i] - a1 * y[i-1] - a2 * y[i-2] return y def _vowel_envelope(self, n): env = np.ones(n) attack = max(1, n // 10) release = max(1, int(n * 0.15)) env[:attack] = np.sin(np.linspace(0, np.pi/2, attack)) ** 2 env[-release:] = np.cos(np.linspace(0, np.pi/2, release)) ** 2 return env def _consonant_envelope(self, n): env = np.ones(n) attack = max(1, n // 8) release = max(1, n // 6) env[:attack] = np.linspace(0.1, 1, attack) env[-release:] = np.linspace(1, 0.1, release) return env def _synth_stop(self, params, f0, speed, formant_shift): closure_ms = params['closure'] / speed burst_ms = params['burst'] / speed closure_n = int(self.sr * closure_ms / 1000) burst_n = int(self.sr * burst_ms / 1000) total_n = closure_n + burst_n audio = np.zeros(total_n, dtype=np.float32) if params['voiced']: t = np.arange(closure_n) / self.sr voice_bar = np.sin(2 * np.pi * f0 * 0.8 * t) * 0.15 audio[:closure_n] = voice_bar burst = np.random.randn(burst_n) burst_freq = params['burst_freq'] * formant_shift try: if burst_freq < self.sr / 2 - 100: b, a = signal.butter(2, burst_freq / (self.sr / 2), 'low') burst = signal.filtfilt(b, a, burst) except: pass burst_env = np.exp(-np.linspace(0, 5, burst_n)) burst *= burst_env * params['amp'] audio[closure_n:] = burst return audio def _synth_fricative(self, params, f0, speed): dur_ms = params['dur'] / speed n = int(self.sr * dur_ms / 1000) noise = np.random.randn(n) low = params['freq_low'] high = min(params['freq_high'], self.sr / 2 - 100) try: if low < high: b, a = signal.butter(4, [low / (self.sr / 2), high / (self.sr / 2)], 'band') noise = signal.filtfilt(b, a, noise) except: pass audio = noise * params['amp'] if params['voiced']: t = np.arange(n) / self.sr voice = self._glottal_source(t, f0, 0.03) * 0.3 audio = audio + voice audio *= self._consonant_envelope(n) return audio.astype(np.float32) def _synth_affricate(self, params, f0, speed): closure_ms = params['closure'] / speed fric_ms = params['fric'] / speed closure_n = int(self.sr * closure_ms / 1000) fric_n = int(self.sr * fric_ms / 1000) audio = np.zeros(closure_n + fric_n, dtype=np.float32) if params['voiced']: t = np.arange(closure_n) / self.sr audio[:closure_n] = np.sin(2 * np.pi * f0 * 0.8 * t) * 0.1 fric = np.random.randn(fric_n) low = params['freq_low'] high = min(params['freq_high'], self.sr / 2 - 100) try: b, a = signal.butter(3, [low / (self.sr / 2), high / (self.sr / 2)], 'band') fric = signal.filtfilt(b, a, fric) except: pass fric *= params['amp'] fric_env = np.ones(fric_n) attack = fric_n // 6 release = fric_n // 3 fric_env[:attack] = np.linspace(0, 1, attack) fric_env[-release:] = np.linspace(1, 0, release) audio[closure_n:] = fric * fric_env return audio def _synth_nasal(self, params, f0, speed, formant_shift, breathiness): dur_ms = params['dur'] / speed n = int(self.sr * dur_ms / 1000) t = np.arange(n) / self.sr source = self._glottal_source(t, f0, breathiness) f1 = params['f1'] * formant_shift f2 = params['f2'] * formant_shift f3 = params['f3'] * formant_shift audio = self._apply_formants(source, f1, f2, f3) nasal_pole = self._resonator(source, 250, 100) * 0.4 audio += nasal_pole try: b, a = signal.butter(2, 800 / (self.sr / 2), 'low') audio = signal.filtfilt(b, a, audio) except: pass audio *= params['amp'] * self._consonant_envelope(n) return audio.astype(np.float32) def _synth_liquid(self, params, f0, speed, formant_shift, breathiness): dur_ms = params['dur'] / speed n = int(self.sr * dur_ms / 1000) t = np.arange(n) / self.sr source = self._glottal_source(t, f0, breathiness) f1 = params['f1'] * formant_shift f2 = params['f2'] * formant_shift f3 = params['f3'] * formant_shift audio = self._apply_formants(source, f1, f2, f3) audio *= params['amp'] * self._consonant_envelope(n) return audio.astype(np.float32) def _synth_glide(self, params, f0, speed, formant_shift, breathiness): dur_ms = params['dur'] / speed n = int(self.sr * dur_ms / 1000) t = np.arange(n) / self.sr source = self._glottal_source(t, f0, breathiness) f1 = params['f1'] * formant_shift f2 = params['f2'] * formant_shift f3 = params['f3'] * formant_shift audio = self._apply_formants(source, f1, f2, f3) audio *= params['amp'] * self._consonant_envelope(n) return audio.astype(np.float32) def _smooth_concat(self, segments): if not segments: return np.zeros(1000, dtype=np.float32) if len(segments) == 1: return segments[0] overlap = 64 total_len = sum(len(s) for s in segments) - overlap * (len(segments) - 1) total_len = max(total_len, 100) audio = np.zeros(total_len, dtype=np.float32) pos = 0 for i, seg in enumerate(segments): if len(seg) == 0: continue end_pos = min(pos + len(seg), total_len) seg_len = end_pos - pos if seg_len <= 0: break seg_to_add = seg[:seg_len].copy() if i > 0 and pos > overlap: fade_len = min(overlap, seg_len) fade_in = np.linspace(0, 1, fade_len) ** 0.5 fade_out = np.linspace(1, 0, fade_len) ** 0.5 audio[pos:pos + fade_len] *= fade_out seg_to_add[:fade_len] *= fade_in audio[pos:end_pos] += seg_to_add pos = end_pos - overlap // 2 pos = max(0, pos) return audio def _normalize(self, audio): if len(audio) < 100: return audio audio = audio - np.mean(audio) max_val = np.max(np.abs(audio)) if max_val > 0: audio = audio / max_val * 0.9 fade = min(len(audio) // 40, 200) audio[:fade] *= np.linspace(0, 1, fade) audio[-fade:] *= np.linspace(1, 0, fade) return audio # ============================================ # MAIN TTS CLASS # ============================================ class VedesTTS: def __init__(self, sample_rate=22050): self.sr = sample_rate self.text_to_phoneme = TextToPhoneme() self.synthesizer = VoiceSynthesizer(sample_rate) self.voice_analyzer = VoiceAnalyzer(sample_rate) self.current_voice = VOICE_PROFILES["Emma (Female)"] def get_voice(self, voice_name): if voice_name in VOICE_PROFILES: return VOICE_PROFILES[voice_name] elif voice_name in custom_voices: return custom_voices[voice_name] return self.current_voice def speak(self, text, rate=1.0, pitch=1.0, voice_name=None): if not text or not text.strip(): return np.zeros(self.sr, dtype=np.float32) voice = self.get_voice(voice_name) if voice_name else self.current_voice phonemes = self.text_to_phoneme.convert(text) if not phonemes: return np.zeros(self.sr, dtype=np.float32) audio = self.synthesizer.synthesize(phonemes, voice, rate, pitch) return audio def train_voice(self, audio_data, voice_name="My Voice"): """Train a new voice from audio sample""" global custom_voices if audio_data is None: return None # Handle tuple format (sample_rate, audio) if isinstance(audio_data, tuple): sr, audio = audio_data audio = audio.astype(np.float32) # Handle stereo if len(audio.shape) > 1: audio = audio.mean(axis=1) # Resample if needed if sr != self.sr: duration = len(audio) / sr new_length = int(duration * self.sr) audio = signal.resample(audio, new_length) else: audio = audio_data.astype(np.float32) # Normalize max_val = np.max(np.abs(audio)) if max_val > 0: audio = audio / max_val # Analyze profile = self.voice_analyzer.analyze(audio) if profile: profile['name'] = voice_name profile['description'] = f"Trained voice (F0={profile['f0']:.0f}Hz)" custom_voices[voice_name] = profile return profile return None # ============================================ # INITIALIZE # ============================================ print("=" * 50) print("🎙️ VEDES TTS - With Voice Training") print("=" * 50) tts = VedesTTS(SAMPLE_RATE) print("✅ Ready!") print(f"📢 Available voices: {len(VOICE_PROFILES)}") print("=" * 50) # ============================================ # HELPER FUNCTIONS # ============================================ def get_all_voices(): """Get list of all available voices""" voices = list(VOICE_PROFILES.keys()) + list(custom_voices.keys()) return voices def get_voice_info(voice_name): """Get info about a voice""" if voice_name in VOICE_PROFILES: v = VOICE_PROFILES[voice_name] elif voice_name in custom_voices: v = custom_voices[voice_name] else: return "Select a voice" return f""" **{v.get('name', voice_name)}** - Type: {v.get('gender', 'unknown').title()} - Pitch: {v.get('f0', 130):.0f} Hz - {v.get('description', '')} """ # ============================================ # GRADIO FUNCTIONS # ============================================ def synthesize(text, voice_name, rate, pitch): """Synthesize speech""" if not text or not text.strip(): return None text = text.strip()[:300] try: pitch_mult = 2 ** (pitch / 12) audio = tts.speak(text, rate=rate, pitch=pitch_mult, voice_name=voice_name) if len(audio) < 100: return None audio = np.clip(audio, -1, 1) audio_int16 = (audio * 32767).astype(np.int16) return (SAMPLE_RATE, audio_int16) except Exception as e: print(f"Synthesis error: {e}") return None def train_voice(audio, voice_name): """Train a new voice from audio""" global custom_voices if audio is None: return "❌ Please record or upload audio first.", get_all_voices() if not voice_name or not voice_name.strip(): voice_name = f"Custom Voice {len(custom_voices) + 1}" voice_name = voice_name.strip()[:30] # Check if name already exists if voice_name in VOICE_PROFILES: voice_name = f"{voice_name} (custom)" try: profile = tts.train_voice(audio, voice_name) if profile: result = f""" ✅ **Voice "{voice_name}" created!** **Detected Parameters:** - Pitch (F0): {profile['f0']:.1f} Hz - Pitch Variation: {profile['f0_variation']:.1f} Hz - Formant Shift: {profile['formant_shift']:.2f} - Breathiness: {profile['breathiness']:.3f} - Brightness: {profile['brightness']:.2f} You can now select this voice in the Speak tab! """ return result, get_all_voices() else: return "❌ Could not analyze voice. Try a longer/clearer sample.", get_all_voices() except Exception as e: return f"❌ Error: {str(e)}", get_all_voices() def create_custom_voice(name, pitch, formant, breathiness, speed, brightness): """Create a custom voice from parameters""" global custom_voices if not name or not name.strip(): return "❌ Please enter a voice name.", get_all_voices() name = name.strip()[:30] if name in VOICE_PROFILES: name = f"{name} (custom)" profile = { "name": name, "gender": "custom", "f0": pitch, "f0_variation": 25, "formant_shift": formant, "breathiness": breathiness / 100, "speed": speed, "brightness": brightness, "description": f"Custom voice (F0={pitch}Hz)" } custom_voices[name] = profile return f"✅ Voice **{name}** created! Select it in the Speak tab.", get_all_voices() def refresh_voices(): """Refresh the voice list""" return gr.update(choices=get_all_voices()) # ============================================ # GRADIO INTERFACE # ============================================ with gr.Blocks(title="Vedes TTS", theme=gr.themes.Soft()) as demo: gr.Markdown(""" # 🎙️ Vedes TTS - Voice Training Edition ### Create and Use Custom Voices - 100% From Scratch """) with gr.Tabs(): # ===== SPEAK TAB ===== with gr.TabItem("🔊 Speak"): with gr.Row(): with gr.Column(scale=2): text_input = gr.Textbox( label="📝 Text to Speak", placeholder="Type something...", lines=3 ) with gr.Row(): voice_select = gr.Dropdown( choices=get_all_voices(), value="Emma (Female)", label="🗣️ Voice", interactive=True ) refresh_btn = gr.Button("🔄", size="sm") voice_info = gr.Markdown("Select a voice") with gr.Row(): rate = gr.Slider(0.6, 1.5, 0.9, step=0.1, label="⏱️ Speed") pitch = gr.Slider(-6, 6, 0, step=1, label="🎵 Pitch") speak_btn = gr.Button("🔊 Speak", variant="primary", size="lg") with gr.Column(scale=1): audio_out = gr.Audio(label="🎧 Output", type="numpy") gr.Examples( examples=[ ["Hello, how are you?"], ["Good morning!"], ["My name is Vedes."], ["Thank you very much."], ["Have a nice day."], ], inputs=text_input, label="📚 Examples" ) # ===== TRAIN VOICE TAB ===== with gr.TabItem("🎤 Train Voice"): gr.Markdown(""" ### Train a New Voice from Audio Record or upload 3-10 seconds of clear speech. **Tips:** - Speak naturally and clearly - Avoid background noise - Read a few sentences """) with gr.Row(): with gr.Column(): audio_input = gr.Audio( label="🎤 Record or Upload", sources=["microphone", "upload"], type="numpy" ) voice_name_input = gr.Textbox( label="Voice Name", placeholder="e.g., My Voice", value="" ) train_btn = gr.Button("🧠 Train Voice", variant="primary") with gr.Column(): train_result = gr.Markdown("Record audio and click Train") gr.Markdown(""" ### What Gets Analyzed: - **Pitch (F0)**: How high/low the voice is - **Formants**: Voice quality/timbre - **Breathiness**: Air in the voice """) # ===== CREATE VOICE TAB ===== with gr.TabItem("⚙️ Create Voice"): gr.Markdown("### Create Custom Voice Manually") with gr.Row(): with gr.Column(): custom_name = gr.Textbox( label="Voice Name", placeholder="My Custom Voice" ) custom_pitch = gr.Slider( 60, 300, 150, label="Pitch (Hz)", info="60-130=Male, 150-250=Female, 250+=Child" ) custom_formant = gr.Slider( 0.7, 1.4, 1.0, step=0.05, label="Formant Shift", info="<1.0=Male, >1.0=Female/Child" ) custom_breathiness = gr.Slider( 1, 10, 3, label="Breathiness (%)" ) custom_speed = gr.Slider( 0.7, 1.3, 1.0, step=0.05, label="Natural Speed" ) custom_brightness = gr.Slider( 0.8, 1.3, 1.0, step=0.05, label="Brightness" ) create_btn = gr.Button("✨ Create Voice", variant="primary") with gr.Column(): create_result = gr.Markdown("") gr.Markdown(""" ### Quick Presets: | Type | Pitch | Formant | |------|-------|---------| | Deep Male | 85 | 0.85 | | Male | 120 | 0.92 | | Female | 200 | 1.12 | | High Female | 240 | 1.20 | | Child | 280 | 1.25 | """) # ===== ALL VOICES TAB ===== with gr.TabItem("👥 All Voices"): gr.Markdown("### Pre-built Voices") voice_info_md = "" for name, v in VOICE_PROFILES.items(): voice_info_md += f""" **{name}** - Type: {v['gender'].title()} | Pitch: {v['f0']} Hz - {v['description']} """ gr.Markdown(voice_info_md) gr.Markdown("### Custom Voices") custom_voices_display = gr.Markdown("*No custom voices yet*") # ===== EVENT HANDLERS ===== # Speak tab voice_select.change(get_voice_info, voice_select, voice_info) refresh_btn.click(refresh_voices, outputs=voice_select) speak_btn.click(synthesize, [text_input, voice_select, rate, pitch], audio_out) text_input.submit(synthesize, [text_input, voice_select, rate, pitch], audio_out) # Train tab - Fixed: update choices first, then set value separately def train_and_update(audio, name): result, voices = train_voice(audio, name) # Return result and updated dropdown with new choices return result, gr.update(choices=voices) train_btn.click( train_and_update, [audio_input, voice_name_input], [train_result, voice_select] ) # Create tab - Fixed similarly def create_and_update(name, pitch, formant, breathiness, speed, brightness): result, voices = create_custom_voice(name, pitch, formant, breathiness, speed, brightness) return result, gr.update(choices=voices) create_btn.click( create_and_update, [custom_name, custom_pitch, custom_formant, custom_breathiness, custom_speed, custom_brightness], [create_result, voice_select] ) # ============================================ # LAUNCH # ============================================ if __name__ == "__main__": demo.launch()