import json
import os
import re
import tempfile

import gradio as gr
import numpy as np
from scipy import signal
from scipy.io import wavfile


# Shared sample rate (Hz) for analysis and synthesis.
SAMPLE_RATE = 22050


# Built-in voice presets. Each field parameterizes the synthesizer:
# f0 / f0_variation in Hz, formant_shift and brightness as multipliers,
# breathiness as an additive noise level, speed as a rate multiplier (>1 is faster).
VOICE_PROFILES = {
    "Emma (Female)": {
        "name": "Emma", "gender": "female", "f0": 210, "f0_variation": 30,
        "formant_shift": 1.15, "breathiness": 0.04, "speed": 1.0,
        "brightness": 1.1, "description": "Friendly female voice",
    },
    "James (Male)": {
        "name": "James", "gender": "male", "f0": 110, "f0_variation": 20,
        "formant_shift": 0.9, "breathiness": 0.02, "speed": 0.95,
        "brightness": 0.95, "description": "Professional male voice",
    },
    "Sophie (Child)": {
        "name": "Sophie", "gender": "child", "f0": 280, "f0_variation": 40,
        "formant_shift": 1.25, "breathiness": 0.03, "speed": 1.1,
        "brightness": 1.2, "description": "Young child voice",
    },
    "David (Deep Male)": {
        "name": "David", "gender": "male", "f0": 85, "f0_variation": 15,
        "formant_shift": 0.82, "breathiness": 0.02, "speed": 0.9,
        "brightness": 0.85, "description": "Deep bass voice",
    },
    "Lisa (Bright Female)": {
        "name": "Lisa", "gender": "female", "f0": 240, "f0_variation": 35,
        "formant_shift": 1.2, "breathiness": 0.05, "speed": 1.05,
        "brightness": 1.15, "description": "Bright, energetic female",
    },
    "Robert (Elderly Male)": {
        "name": "Robert", "gender": "male", "f0": 95, "f0_variation": 12,
        "formant_shift": 0.88, "breathiness": 0.06, "speed": 0.85,
        "brightness": 0.9, "description": "Mature elderly voice",
    },
    "Anna (Soft Female)": {
        "name": "Anna", "gender": "female", "f0": 195, "f0_variation": 25,
        "formant_shift": 1.1, "breathiness": 0.07, "speed": 0.92,
        "brightness": 1.0, "description": "Soft, gentle female",
    },
    "Mike (Energetic Male)": {
        "name": "Mike", "gender": "male", "f0": 130, "f0_variation": 30,
        "formant_shift": 0.95, "breathiness": 0.02, "speed": 1.1,
        "brightness": 1.05, "description": "Energetic young male",
    },
}

# Voices created at runtime (trained from audio or built manually), keyed by display name.
custom_voices = {}


# Vowel table: phoneme -> (F1, F2, F3 in Hz, duration in ms, amplitude, voiced).
VOWELS = {
    'IY': (280, 2250, 2890, 150, 1.0, True),
    'IH': (400, 1920, 2550, 120, 0.9, True),
    'EH': (550, 1770, 2490, 130, 0.95, True),
    'AE': (690, 1660, 2490, 140, 1.0, True),
    'AA': (710, 1100, 2540, 150, 1.0, True),
    'AO': (590, 880, 2540, 140, 0.95, True),
    'UH': (470, 1100, 2540, 120, 0.9, True),
    'UW': (310, 870, 2250, 150, 1.0, True),
    'AH': (640, 1200, 2400, 100, 0.85, True),
    'AX': (500, 1500, 2500, 80, 0.7, True),
    'ER': (500, 1350, 1700, 140, 0.9, True),
    'EY': (500, 1900, 2600, 160, 1.0, True),
    'AY': (700, 1200, 2600, 180, 1.0, True),
    'OY': (500, 900, 2500, 180, 1.0, True),
    'AW': (700, 1100, 2600, 180, 1.0, True),
    'OW': (500, 900, 2500, 160, 1.0, True),
}


# Consonant table: per-phoneme synthesis parameters.
# Durations ('closure', 'burst', 'dur', 'fric') are in ms; frequencies in Hz.
CONSONANTS = {
    'P': {'type': 'stop', 'closure': 80, 'burst': 30, 'voiced': False, 'burst_freq': 800, 'amp': 0.6},
    'B': {'type': 'stop', 'closure': 50, 'burst': 25, 'voiced': True, 'burst_freq': 800, 'amp': 0.7},
    'T': {'type': 'stop', 'closure': 70, 'burst': 30, 'voiced': False, 'burst_freq': 3500, 'amp': 0.7},
    'D': {'type': 'stop', 'closure': 40, 'burst': 25, 'voiced': True, 'burst_freq': 3500, 'amp': 0.7},
    'K': {'type': 'stop', 'closure': 80, 'burst': 40, 'voiced': False, 'burst_freq': 1500, 'amp': 0.7},
    'G': {'type': 'stop', 'closure': 50, 'burst': 30, 'voiced': True, 'burst_freq': 1500, 'amp': 0.7},
    'F': {'type': 'fric', 'dur': 120, 'freq_low': 1500, 'freq_high': 8000, 'voiced': False, 'amp': 0.4},
    'V': {'type': 'fric', 'dur': 80, 'freq_low': 1500, 'freq_high': 8000, 'voiced': True, 'amp': 0.5},
    'TH': {'type': 'fric', 'dur': 100, 'freq_low': 1400, 'freq_high': 6000, 'voiced': False, 'amp': 0.3},
    'DH': {'type': 'fric', 'dur': 60, 'freq_low': 1400, 'freq_high': 6000, 'voiced': True, 'amp': 0.5},
    'S': {'type': 'fric', 'dur': 120, 'freq_low': 4000, 'freq_high': 9000, 'voiced': False, 'amp': 0.5},
    'Z': {'type': 'fric', 'dur': 90, 'freq_low': 4000, 'freq_high': 9000, 'voiced': True, 'amp': 0.5},
    'SH': {'type': 'fric', 'dur': 120, 'freq_low': 2000, 'freq_high': 6000, 'voiced': False, 'amp': 0.5},
    'ZH': {'type': 'fric', 'dur': 80, 'freq_low': 2000, 'freq_high': 6000, 'voiced': True, 'amp': 0.5},
    'HH': {'type': 'fric', 'dur': 80, 'freq_low': 500, 'freq_high': 2000, 'voiced': False, 'amp': 0.3},
    'CH': {'type': 'affric', 'closure': 60, 'fric': 80, 'freq_low': 2000, 'freq_high': 6000, 'voiced': False, 'amp': 0.6},
    'JH': {'type': 'affric', 'closure': 40, 'fric': 60, 'freq_low': 2000, 'freq_high': 6000, 'voiced': True, 'amp': 0.6},
    'M': {'type': 'nasal', 'f1': 280, 'f2': 1000, 'f3': 2200, 'dur': 100, 'amp': 0.8},
    'N': {'type': 'nasal', 'f1': 280, 'f2': 1700, 'f3': 2500, 'dur': 90, 'amp': 0.8},
    'NG': {'type': 'nasal', 'f1': 300, 'f2': 2000, 'f3': 2700, 'dur': 100, 'amp': 0.8},
    'L': {'type': 'liquid', 'f1': 380, 'f2': 1000, 'f3': 2700, 'dur': 90, 'amp': 0.85},
    'R': {'type': 'liquid', 'f1': 350, 'f2': 1300, 'f3': 1700, 'dur': 90, 'amp': 0.85},
    'W': {'type': 'glide', 'f1': 300, 'f2': 700, 'f3': 2200, 'dur': 80, 'amp': 0.8},
    'Y': {'type': 'glide', 'f1': 280, 'f2': 2200, 'f3': 2900, 'dur': 70, 'amp': 0.8},
}

# Silence durations in ms: a short inter-word gap and a longer punctuation pause.
SILENCE = {'SIL': 60, 'PAU': 200}


# Hand-written pronunciation lexicon (word -> phoneme list).
# Words not found here fall back to the spelling PATTERNS and LETTERS rules below.
DICTIONARY = {
    # Articles, conjunctions, prepositions
    'a': ['AX'], 'an': ['AE', 'N'], 'the': ['DH', 'AX'],
    'and': ['AE', 'N', 'D'], 'or': ['AO', 'R'], 'but': ['B', 'AH', 'T'],
    'if': ['IH', 'F'], 'of': ['AH', 'V'], 'to': ['T', 'UW'],
    'in': ['IH', 'N'], 'on': ['AA', 'N'], 'at': ['AE', 'T'],
    'by': ['B', 'AY'], 'for': ['F', 'AO', 'R'], 'with': ['W', 'IH', 'TH'],
    'from': ['F', 'R', 'AH', 'M'], 'up': ['AH', 'P'], 'out': ['AW', 'T'],
    'as': ['AE', 'Z'], 'so': ['S', 'OW'], 'not': ['N', 'AA', 'T'],

    # Pronouns and question words
    'i': ['AY'], 'me': ['M', 'IY'], 'my': ['M', 'AY'],
    'you': ['Y', 'UW'], 'your': ['Y', 'AO', 'R'],
    'he': ['HH', 'IY'], 'him': ['HH', 'IH', 'M'], 'his': ['HH', 'IH', 'Z'],
    'she': ['SH', 'IY'], 'her': ['HH', 'ER'],
    'it': ['IH', 'T'], 'its': ['IH', 'T', 'S'],
    'we': ['W', 'IY'], 'us': ['AH', 'S'], 'our': ['AW', 'ER'],
    'they': ['DH', 'EY'], 'them': ['DH', 'EH', 'M'], 'their': ['DH', 'EH', 'R'],
    'this': ['DH', 'IH', 'S'], 'that': ['DH', 'AE', 'T'],
    'what': ['W', 'AH', 'T'], 'who': ['HH', 'UW'],
    'where': ['W', 'EH', 'R'], 'when': ['W', 'EH', 'N'],
    'why': ['W', 'AY'], 'how': ['HH', 'AW'], 'which': ['W', 'IH', 'CH'],

    # Forms of "be"
    'am': ['AE', 'M'], 'is': ['IH', 'Z'], 'are': ['AA', 'R'],
    'was': ['W', 'AA', 'Z'], 'were': ['W', 'ER'],
    'be': ['B', 'IY'], 'been': ['B', 'IH', 'N'], 'being': ['B', 'IY', 'IH', 'NG'],

    # Forms of "have"
    'have': ['HH', 'AE', 'V'], 'has': ['HH', 'AE', 'Z'],
    'had': ['HH', 'AE', 'D'], 'having': ['HH', 'AE', 'V', 'IH', 'NG'],

    # Forms of "do"
    'do': ['D', 'UW'], 'does': ['D', 'AH', 'Z'],
    'did': ['D', 'IH', 'D'], 'done': ['D', 'AH', 'N'],

    # Modal verbs
    'will': ['W', 'IH', 'L'], 'would': ['W', 'UH', 'D'],
    'can': ['K', 'AE', 'N'], 'could': ['K', 'UH', 'D'],
    'should': ['SH', 'UH', 'D'], 'may': ['M', 'EY'],
    'might': ['M', 'AY', 'T'], 'must': ['M', 'AH', 'S', 'T'],

    # Common verbs
    'go': ['G', 'OW'], 'goes': ['G', 'OW', 'Z'], 'going': ['G', 'OW', 'IH', 'NG'],
    'went': ['W', 'EH', 'N', 'T'], 'gone': ['G', 'AO', 'N'],
    'come': ['K', 'AH', 'M'], 'comes': ['K', 'AH', 'M', 'Z'],
    'coming': ['K', 'AH', 'M', 'IH', 'NG'], 'came': ['K', 'EY', 'M'],
    'get': ['G', 'EH', 'T'], 'gets': ['G', 'EH', 'T', 'S'],
    'getting': ['G', 'EH', 'T', 'IH', 'NG'], 'got': ['G', 'AA', 'T'],
    'make': ['M', 'EY', 'K'], 'makes': ['M', 'EY', 'K', 'S'],
    'making': ['M', 'EY', 'K', 'IH', 'NG'], 'made': ['M', 'EY', 'D'],
    'take': ['T', 'EY', 'K'], 'takes': ['T', 'EY', 'K', 'S'],
    'took': ['T', 'UH', 'K'], 'taken': ['T', 'EY', 'K', 'AX', 'N'],
    'see': ['S', 'IY'], 'sees': ['S', 'IY', 'Z'],
    'saw': ['S', 'AO'], 'seen': ['S', 'IY', 'N'],
    'say': ['S', 'EY'], 'says': ['S', 'EH', 'Z'], 'said': ['S', 'EH', 'D'],
    'know': ['N', 'OW'], 'knows': ['N', 'OW', 'Z'],
    'knew': ['N', 'UW'], 'known': ['N', 'OW', 'N'],
    'think': ['TH', 'IH', 'NG', 'K'], 'thought': ['TH', 'AO', 'T'],
    'want': ['W', 'AA', 'N', 'T'], 'wants': ['W', 'AA', 'N', 'T', 'S'],
    'give': ['G', 'IH', 'V'], 'gives': ['G', 'IH', 'V', 'Z'],
    'gave': ['G', 'EY', 'V'], 'given': ['G', 'IH', 'V', 'AX', 'N'],
    'tell': ['T', 'EH', 'L'], 'told': ['T', 'OW', 'L', 'D'],
    'ask': ['AE', 'S', 'K'], 'asked': ['AE', 'S', 'K', 'T'],
    'use': ['Y', 'UW', 'Z'], 'used': ['Y', 'UW', 'Z', 'D'],
    'find': ['F', 'AY', 'N', 'D'], 'found': ['F', 'AW', 'N', 'D'],
    'work': ['W', 'ER', 'K'], 'works': ['W', 'ER', 'K', 'S'],
    'call': ['K', 'AO', 'L'], 'called': ['K', 'AO', 'L', 'D'],
    'try': ['T', 'R', 'AY'], 'tried': ['T', 'R', 'AY', 'D'],
    'need': ['N', 'IY', 'D'], 'needs': ['N', 'IY', 'D', 'Z'],
    'feel': ['F', 'IY', 'L'], 'feels': ['F', 'IY', 'L', 'Z'],
    'help': ['HH', 'EH', 'L', 'P'], 'helps': ['HH', 'EH', 'L', 'P', 'S'],
    'keep': ['K', 'IY', 'P'], 'kept': ['K', 'EH', 'P', 'T'],
    'let': ['L', 'EH', 'T'], 'put': ['P', 'UH', 'T'],
    'seem': ['S', 'IY', 'M'], 'leave': ['L', 'IY', 'V'],
    'show': ['SH', 'OW'], 'hear': ['HH', 'IY', 'R'],
    'play': ['P', 'L', 'EY'], 'run': ['R', 'AH', 'N'],
    'move': ['M', 'UW', 'V'], 'live': ['L', 'IH', 'V'],
    'believe': ['B', 'IH', 'L', 'IY', 'V'],
    'read': ['R', 'IY', 'D'], 'write': ['R', 'AY', 'T'],
    'learn': ['L', 'ER', 'N'], 'speak': ['S', 'P', 'IY', 'K'],
    'look': ['L', 'UH', 'K'], 'like': ['L', 'AY', 'K'],
    'love': ['L', 'AH', 'V'], 'start': ['S', 'T', 'AA', 'R', 'T'],
    'stop': ['S', 'T', 'AA', 'P'],

    # Adjectives
    'good': ['G', 'UH', 'D'], 'better': ['B', 'EH', 'T', 'ER'],
    'best': ['B', 'EH', 'S', 'T'], 'bad': ['B', 'AE', 'D'],
    'new': ['N', 'UW'], 'old': ['OW', 'L', 'D'],
    'big': ['B', 'IH', 'G'], 'small': ['S', 'M', 'AO', 'L'],
    'long': ['L', 'AO', 'NG'], 'short': ['SH', 'AO', 'R', 'T'],
    'high': ['HH', 'AY'], 'low': ['L', 'OW'],
    'great': ['G', 'R', 'EY', 'T'], 'little': ['L', 'IH', 'T', 'AX', 'L'],
    'right': ['R', 'AY', 'T'], 'wrong': ['R', 'AO', 'NG'],
    'first': ['F', 'ER', 'S', 'T'], 'last': ['L', 'AE', 'S', 'T'],
    'same': ['S', 'EY', 'M'], 'different': ['D', 'IH', 'F', 'R', 'AX', 'N', 'T'],
    'own': ['OW', 'N'], 'other': ['AH', 'DH', 'ER'],
    'nice': ['N', 'AY', 'S'], 'happy': ['HH', 'AE', 'P', 'IY'],
    'sure': ['SH', 'UH', 'R'], 'true': ['T', 'R', 'UW'],
    'real': ['R', 'IY', 'L'], 'clear': ['K', 'L', 'IY', 'R'],
    'fine': ['F', 'AY', 'N'], 'free': ['F', 'R', 'IY'],
    'easy': ['IY', 'Z', 'IY'], 'hard': ['HH', 'AA', 'R', 'D'],
    'young': ['Y', 'AH', 'NG'], 'beautiful': ['B', 'Y', 'UW', 'T', 'IH', 'F', 'AX', 'L'],

    # Adverbs and intensifiers
    'very': ['V', 'EH', 'R', 'IY'], 'really': ['R', 'IY', 'L', 'IY'],
    'just': ['JH', 'AH', 'S', 'T'], 'only': ['OW', 'N', 'L', 'IY'],
    'also': ['AO', 'L', 'S', 'OW'], 'well': ['W', 'EH', 'L'],
    'now': ['N', 'AW'], 'then': ['DH', 'EH', 'N'],
    'here': ['HH', 'IY', 'R'], 'there': ['DH', 'EH', 'R'],
    'still': ['S', 'T', 'IH', 'L'], 'even': ['IY', 'V', 'AX', 'N'],
    'back': ['B', 'AE', 'K'], 'again': ['AX', 'G', 'EH', 'N'],
    'always': ['AO', 'L', 'W', 'EY', 'Z'], 'never': ['N', 'EH', 'V', 'ER'],
    'today': ['T', 'AX', 'D', 'EY'], 'maybe': ['M', 'EY', 'B', 'IY'],
    'too': ['T', 'UW'], 'much': ['M', 'AH', 'CH'],
    'more': ['M', 'AO', 'R'], 'most': ['M', 'OW', 'S', 'T'],
    'please': ['P', 'L', 'IY', 'Z'],

    # Common nouns
    'time': ['T', 'AY', 'M'], 'year': ['Y', 'IY', 'R'],
    'day': ['D', 'EY'], 'way': ['W', 'EY'],
    'man': ['M', 'AE', 'N'], 'woman': ['W', 'UH', 'M', 'AX', 'N'],
    'child': ['CH', 'AY', 'L', 'D'], 'world': ['W', 'ER', 'L', 'D'],
    'life': ['L', 'AY', 'F'], 'hand': ['HH', 'AE', 'N', 'D'],
    'part': ['P', 'AA', 'R', 'T'], 'place': ['P', 'L', 'EY', 'S'],
    'thing': ['TH', 'IH', 'NG'], 'things': ['TH', 'IH', 'NG', 'Z'],
    'people': ['P', 'IY', 'P', 'AX', 'L'], 'person': ['P', 'ER', 'S', 'AX', 'N'],
    'home': ['HH', 'OW', 'M'], 'house': ['HH', 'AW', 'S'],
    'word': ['W', 'ER', 'D'], 'name': ['N', 'EY', 'M'],
    'water': ['W', 'AO', 'T', 'ER'], 'money': ['M', 'AH', 'N', 'IY'],
    'family': ['F', 'AE', 'M', 'AX', 'L', 'IY'],
    'friend': ['F', 'R', 'EH', 'N', 'D'], 'friends': ['F', 'R', 'EH', 'N', 'D', 'Z'],
    'mother': ['M', 'AH', 'DH', 'ER'], 'father': ['F', 'AA', 'DH', 'ER'],
    'boy': ['B', 'OY'], 'girl': ['G', 'ER', 'L'],
    'head': ['HH', 'EH', 'D'], 'face': ['F', 'EY', 'S'],
    'eye': ['AY'], 'eyes': ['AY', 'Z'],
    'voice': ['V', 'OY', 'S'], 'night': ['N', 'AY', 'T'],
    'morning': ['M', 'AO', 'R', 'N', 'IH', 'NG'],
    'week': ['W', 'IY', 'K'], 'month': ['M', 'AH', 'N', 'TH'],
    'school': ['S', 'K', 'UW', 'L'], 'book': ['B', 'UH', 'K'],
    'story': ['S', 'T', 'AO', 'R', 'IY'],
    'question': ['K', 'W', 'EH', 'S', 'CH', 'AX', 'N'],
    'answer': ['AE', 'N', 'S', 'ER'],

    # Numbers
    'zero': ['Z', 'IY', 'R', 'OW'], 'one': ['W', 'AH', 'N'],
    'two': ['T', 'UW'], 'three': ['TH', 'R', 'IY'],
    'four': ['F', 'AO', 'R'], 'five': ['F', 'AY', 'V'],
    'six': ['S', 'IH', 'K', 'S'], 'seven': ['S', 'EH', 'V', 'AX', 'N'],
    'eight': ['EY', 'T'], 'nine': ['N', 'AY', 'N'], 'ten': ['T', 'EH', 'N'],

    # Greetings and interjections
    'hello': ['HH', 'AX', 'L', 'OW'], 'hi': ['HH', 'AY'],
    'hey': ['HH', 'EY'], 'goodbye': ['G', 'UH', 'D', 'B', 'AY'],
    'bye': ['B', 'AY'], 'welcome': ['W', 'EH', 'L', 'K', 'AX', 'M'],
    'thank': ['TH', 'AE', 'NG', 'K'], 'thanks': ['TH', 'AE', 'NG', 'K', 'S'],
    'sorry': ['S', 'AA', 'R', 'IY'],
    'yes': ['Y', 'EH', 'S'], 'yeah': ['Y', 'AE'], 'no': ['N', 'OW'],
    'ok': ['OW', 'K', 'EY'], 'okay': ['OW', 'K', 'EY'],

    # Domain words
    'text': ['T', 'EH', 'K', 'S', 'T'], 'speech': ['S', 'P', 'IY', 'CH'],
    'sound': ['S', 'AW', 'N', 'D'], 'audio': ['AO', 'D', 'IY', 'OW'],
    'test': ['T', 'EH', 'S', 'T'], 'testing': ['T', 'EH', 'S', 'T', 'IH', 'NG'],
    'computer': ['K', 'AX', 'M', 'P', 'Y', 'UW', 'T', 'ER'],
    'vedes': ['V', 'EY', 'D', 'EH', 'S'],
    'system': ['S', 'IH', 'S', 'T', 'AX', 'M'],
    'train': ['T', 'R', 'EY', 'N'], 'training': ['T', 'R', 'EY', 'N', 'IH', 'NG'],
}


# Spelling-to-phoneme patterns for out-of-dictionary words.
# TextToPhoneme sorts these by pattern length so the longest match wins.
PATTERNS = [
    ('tion', ['SH', 'AX', 'N']), ('sion', ['ZH', 'AX', 'N']),
    ('ness', ['N', 'AX', 'S']), ('ment', ['M', 'AX', 'N', 'T']),
    ('able', ['AX', 'B', 'AX', 'L']), ('ible', ['AX', 'B', 'AX', 'L']),
    ('ful', ['F', 'AX', 'L']), ('less', ['L', 'AX', 'S']),
    ('ing', ['IH', 'NG']), ('ight', ['AY', 'T']),
    ('ough', ['AO']), ('ould', ['UH', 'D']),
    ('th', ['TH']), ('sh', ['SH']), ('ch', ['CH']),
    ('wh', ['W']), ('ph', ['F']), ('ck', ['K']), ('ng', ['NG']),
    ('qu', ['K', 'W']), ('ee', ['IY']), ('ea', ['IY']),
    ('oo', ['UW']), ('ou', ['AW']), ('ow', ['OW']),
    ('ai', ['EY']), ('ay', ['EY']), ('ey', ['IY']),
    ('oy', ['OY']), ('oi', ['OY']), ('ie', ['IY']),
    ('er', ['ER']), ('ir', ['ER']), ('ur', ['ER']),
    ('ar', ['AA', 'R']), ('or', ['AO', 'R']),
]


LETTERS = {
    'a': 'AE', 'b': 'B', 'c': 'K', 'd': 'D', 'e': 'EH',
    'f': 'F', 'g': 'G', 'h': 'HH', 'i': 'IH', 'j': 'JH',
    'k': 'K', 'l': 'L', 'm': 'M', 'n': 'N', 'o': 'AA',
    'p': 'P', 'q': 'K', 'r': 'R', 's': 'S', 't': 'T',
    'u': 'AH', 'v': 'V', 'w': 'W', 'x': 'K', 'y': 'IY', 'z': 'Z',
}
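

# How the three tables above combine (implemented by TextToPhoneme further below):
# a word is first looked up in DICTIONARY; misses are scanned left-to-right against
# PATTERNS (longest pattern first), and any leftover character falls back to LETTERS.
# For example, "speaking" (not in DICTIONARY) resolves roughly as
# 's' -> S, 'p' -> P, 'ea' -> IY, 'k' -> K, 'ing' -> IH NG.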


class VoiceAnalyzer:
    """Analyze audio to extract voice characteristics."""

    def __init__(self, sample_rate=22050):
        self.sr = sample_rate

    def analyze(self, audio):
        """Extract voice features from an audio sample and return a voice profile dict."""
        # Require at least ~0.3 s of audio to get a usable estimate.
        if len(audio) < self.sr * 0.3:
            return None

        audio = audio.astype(np.float32)
        max_val = np.max(np.abs(audio))
        if max_val > 0:
            audio = audio / max_val

        f0 = self._estimate_pitch(audio)
        formants = self._estimate_formants(audio)
        breathiness = self._estimate_breathiness(audio)

        profile = {
            "name": "Custom Voice",
            "gender": "custom",
            "f0": f0,
            "f0_variation": self._estimate_f0_variation(audio, f0),
            "formant_shift": formants.get('shift', 1.0),
            "breathiness": breathiness,
            "speed": 1.0,
            "brightness": formants.get('brightness', 1.0),
            "description": f"Custom voice (F0={f0:.0f}Hz)"
        }

        return profile

    def _estimate_pitch(self, audio):
        """Estimate the fundamental frequency using frame-wise autocorrelation."""
        frame_size = int(self.sr * 0.03)
        pitches = []

        for i in range(0, len(audio) - frame_size, frame_size):
            frame = audio[i:i + frame_size]
            frame = frame - np.mean(frame)

            # Autocorrelation of the frame (non-negative lags only).
            corr = np.correlate(frame, frame, mode='full')
            corr = corr[len(corr)//2:]

            # Skip the initial falling slope, then take the strongest peak within
            # lags corresponding to F0 >= 60 Hz.
            d = np.diff(corr)
            start_indices = np.where(d > 0)[0]

            if len(start_indices) > 0:
                start = start_indices[0]
                search_end = min(start + int(self.sr / 60), len(corr))

                if search_end > start:
                    peak = start + np.argmax(corr[start:search_end])

                    if peak > 0:
                        f0 = self.sr / peak
                        if 60 < f0 < 400:
                            pitches.append(f0)

        if pitches:
            return float(np.median(pitches))
        return 130.0

    def _estimate_f0_variation(self, audio, base_f0):
        """Estimate pitch variation (same frame-wise autocorrelation as _estimate_pitch)."""
        frame_size = int(self.sr * 0.03)
        pitches = []

        for i in range(0, len(audio) - frame_size, frame_size):
            frame = audio[i:i + frame_size]
            frame = frame - np.mean(frame)

            corr = np.correlate(frame, frame, mode='full')
            corr = corr[len(corr)//2:]

            d = np.diff(corr)
            start_indices = np.where(d > 0)[0]

            if len(start_indices) > 0:
                start = start_indices[0]
                search_end = min(start + int(self.sr / 60), len(corr))

                if search_end > start:
                    peak = start + np.argmax(corr[start:search_end])
                    if peak > 0:
                        f0 = self.sr / peak
                        if 60 < f0 < 400:
                            pitches.append(f0)

        if len(pitches) > 2:
            return min(float(np.std(pitches)), 50.0)
        return 20.0

    def _estimate_formants(self, audio):
        """Estimate formant character from the spectral centroid of the first frame."""
        frame_size = 2048

        if len(audio) < frame_size:
            return {'shift': 1.0, 'brightness': 1.0}

        spectrum = np.abs(np.fft.rfft(audio[:frame_size] * np.hanning(frame_size)))
        freqs = np.fft.rfftfreq(frame_size, 1/self.sr)

        total_energy = np.sum(spectrum) + 1e-8
        centroid = np.sum(freqs * spectrum) / total_energy

        # Map the centroid onto coarse formant-shift / brightness bands.
        if centroid > 1600:
            shift = 1.2
            brightness = 1.15
        elif centroid > 1400:
            shift = 1.1
            brightness = 1.05
        elif centroid > 1200:
            shift = 1.0
            brightness = 1.0
        elif centroid > 1000:
            shift = 0.9
            brightness = 0.95
        else:
            shift = 0.85
            brightness = 0.9

        return {'shift': shift, 'brightness': brightness}

    def _estimate_breathiness(self, audio):
        """Estimate breathiness from the high-band to low-band energy ratio."""
        frame_size = 2048

        if len(audio) < frame_size:
            return 0.03

        spectrum = np.abs(np.fft.rfft(audio[:frame_size]))
        freqs = np.fft.rfftfreq(frame_size, 1/self.sr)

        low_mask = freqs < 1000
        high_mask = (freqs > 2000) & (freqs < 5000)

        low_energy = np.sum(spectrum[low_mask]) + 1e-8
        high_energy = np.sum(spectrum[high_mask])

        ratio = high_energy / low_energy
        breathiness = float(np.clip(ratio * 0.1, 0.02, 0.1))

        return breathiness
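

# Usage sketch for VoiceAnalyzer (illustration only; "sample.wav" is a
# hypothetical mono recording of at least ~0.3 s):
#
#     sr, clip = wavfile.read("sample.wav")
#     profile = VoiceAnalyzer(sample_rate=sr).analyze(clip.astype(np.float32))
#     if profile:
#         print(profile["f0"], profile["formant_shift"], profile["breathiness"])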


class TextToPhoneme:
    """Rule-based grapheme-to-phoneme converter (dictionary, then patterns, then letters)."""

    def __init__(self):
        self.dictionary = DICTIONARY
        # Try longer spelling patterns before shorter ones.
        self.patterns = sorted(PATTERNS, key=lambda x: -len(x[0]))

    def convert(self, text):
        text = text.lower().strip()
        text = re.sub(r"[^\w\s.,!?']", '', text)

        tokens = re.findall(r"[\w']+|[.,!?]", text)
        phonemes = []

        for i, token in enumerate(tokens):
            if token in '.,!?':
                phonemes.append('PAU')
            elif token in self.dictionary:
                phonemes.extend(self.dictionary[token])
                if i < len(tokens) - 1 and tokens[i + 1] not in '.,!?':
                    phonemes.append('SIL')
            else:
                phons = self._convert_word(token)
                phonemes.extend(phons)
                if i < len(tokens) - 1 and tokens[i + 1] not in '.,!?':
                    phonemes.append('SIL')

        return phonemes

    def _convert_word(self, word):
        phonemes = []
        i = 0

        while i < len(word):
            matched = False

            for pattern, phons in self.patterns:
                if word[i:].startswith(pattern):
                    phonemes.extend(phons)
                    i += len(pattern)
                    matched = True
                    break

            if not matched:
                char = word[i]
                if char in LETTERS:
                    phonemes.append(LETTERS[char])
                i += 1

        return phonemes
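

# A quick trace of TextToPhoneme.convert() against the tables above:
#
#     >>> TextToPhoneme().convert("hello world")
#     ['HH', 'AX', 'L', 'OW', 'SIL', 'W', 'ER', 'L', 'D']
#
# Both words are DICTIONARY hits here; punctuation becomes 'PAU' and word
# boundaries become 'SIL'.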


class VoiceSynthesizer:
    """Formant-based phoneme synthesizer driven by a voice profile dict."""

    def __init__(self, sample_rate=22050):
        self.sr = sample_rate
        self.default_voice = VOICE_PROFILES["Emma (Female)"]

    def synthesize(self, phonemes, voice_profile=None, rate=1.0, pitch=1.0):
        if not phonemes:
            return np.zeros(int(self.sr * 0.5), dtype=np.float32)

        voice = voice_profile or self.default_voice

        f0 = voice.get('f0', 130) * pitch
        f0_var = voice.get('f0_variation', 20)
        formant_shift = voice.get('formant_shift', 1.0)
        breathiness = voice.get('breathiness', 0.03)
        voice_speed = voice.get('speed', 1.0) * rate
        brightness = voice.get('brightness', 1.0)

        segments = []

        for i, phon in enumerate(phonemes):
            prev_phon = phonemes[i - 1] if i > 0 else None
            next_phon = phonemes[i + 1] if i < len(phonemes) - 1 else None

            # Simple phrase-level intonation: F0 rises then falls across the utterance.
            phrase_pos = i / max(len(phonemes), 1)
            f0_current = f0 + f0_var * np.sin(phrase_pos * np.pi) * 0.5

            seg = self._synth_phoneme(
                phon, f0_current, voice_speed, formant_shift,
                breathiness, brightness, prev_phon, next_phon
            )
            segments.append(seg)

        audio = self._smooth_concat(segments)
        audio = self._normalize(audio)

        return audio.astype(np.float32)

    def _synth_phoneme(self, phon, f0, speed, formant_shift, breathiness,
                       brightness, prev_phon, next_phon):
        # prev_phon / next_phon are accepted for context but currently unused.
        if phon in SILENCE:
            dur = int(self.sr * SILENCE[phon] / 1000 / speed)
            return np.zeros(dur, dtype=np.float32)

        if phon in VOWELS:
            return self._synth_vowel(phon, f0, speed, formant_shift,
                                     breathiness, brightness)

        if phon in CONSONANTS:
            return self._synth_consonant(phon, f0, speed, formant_shift, breathiness)

        return np.zeros(100, dtype=np.float32)

    def _synth_vowel(self, phon, f0, speed, formant_shift, breathiness, brightness):
        params = VOWELS[phon]
        f1, f2, f3, dur_ms, amp, voiced = params

        f1 = f1 * formant_shift
        f2 = f2 * formant_shift * brightness
        f3 = f3 * formant_shift * brightness

        dur_ms = dur_ms / speed
        n = int(self.sr * dur_ms / 1000)
        n = max(n, 100)
        t = np.arange(n) / self.sr

        source = self._glottal_source(t, f0, breathiness)
        audio = self._apply_formants(source, f1, f2, f3)
        envelope = self._vowel_envelope(n)
        audio = audio * envelope * amp

        return audio

    def _synth_consonant(self, phon, f0, speed, formant_shift, breathiness):
        params = CONSONANTS[phon]
        ctype = params['type']

        if ctype == 'stop':
            return self._synth_stop(params, f0, speed, formant_shift)
        elif ctype == 'fric':
            return self._synth_fricative(params, f0, speed)
        elif ctype == 'affric':
            return self._synth_affricate(params, f0, speed)
        elif ctype == 'nasal':
            return self._synth_nasal(params, f0, speed, formant_shift, breathiness)
        elif ctype == 'liquid':
            return self._synth_liquid(params, f0, speed, formant_shift, breathiness)
        elif ctype == 'glide':
            return self._synth_glide(params, f0, speed, formant_shift, breathiness)

        return np.zeros(100, dtype=np.float32)

    def _glottal_source(self, t, f0, breathiness):
        """Simplified glottal pulse train: raised-cosine opening, cosine closing, plus noise."""
        T0 = 1.0 / f0
        phase = (t % T0) / T0

        glottal = np.zeros_like(t)
        mask1 = phase < 0.4
        glottal[mask1] = 0.5 * (1 - np.cos(np.pi * phase[mask1] / 0.4))

        mask2 = (phase >= 0.4) & (phase < 0.6)
        glottal[mask2] = np.cos(np.pi * (phase[mask2] - 0.4) / 0.4)

        # Breath noise plus a slow 5 Hz amplitude wobble to keep the timbre from sounding static.
        glottal += np.random.randn(len(t)) * breathiness
        shimmer = 1 + 0.02 * np.sin(2 * np.pi * 5 * t)
        glottal *= shimmer

        return glottal

    def _apply_formants(self, source, f1, f2, f3):
        # Parallel bank of three resonators with progressively wider bandwidths.
        formants = [(f1, 90), (f2, 110), (f3, 130)]
        result = np.zeros_like(source)

        for freq, bw in formants:
            result += self._resonator(source, freq, bw)

        return result

    def _resonator(self, sig, freq, bw):
        """Two-pole resonator (formant filter) centered at `freq` Hz with bandwidth `bw` Hz."""
        if freq <= 0 or freq >= self.sr / 2:
            return sig

        r = np.exp(-np.pi * bw / self.sr)
        theta = 2 * np.pi * freq / self.sr

        a1 = -2 * r * np.cos(theta)
        a2 = r * r
        b0 = 1 - r

        y = np.zeros_like(sig)
        for i in range(2, len(sig)):
            y[i] = b0 * sig[i] - a1 * y[i-1] - a2 * y[i-2]

        return y
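
    # Note: the per-sample loop above is the slowest part of synthesis. An
    # equivalent vectorized form (a sketch, not wired in) would be:
    #
    #     y = signal.lfilter([b0], [1.0, a1, a2], sig)
    #
    # which implements the same difference equation; the only difference is that
    # the loop leaves the first two output samples at zero.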

    def _vowel_envelope(self, n):
        env = np.ones(n)
        attack = max(1, n // 10)
        release = max(1, int(n * 0.15))

        env[:attack] = np.sin(np.linspace(0, np.pi/2, attack)) ** 2
        env[-release:] = np.cos(np.linspace(0, np.pi/2, release)) ** 2

        return env

    def _consonant_envelope(self, n):
        env = np.ones(n)
        attack = max(1, n // 8)
        release = max(1, n // 6)

        env[:attack] = np.linspace(0.1, 1, attack)
        env[-release:] = np.linspace(1, 0.1, release)

        return env

    def _synth_stop(self, params, f0, speed, formant_shift):
        closure_ms = params['closure'] / speed
        burst_ms = params['burst'] / speed

        closure_n = int(self.sr * closure_ms / 1000)
        burst_n = int(self.sr * burst_ms / 1000)
        total_n = closure_n + burst_n

        audio = np.zeros(total_n, dtype=np.float32)

        # Voiced stops get a low-level "voice bar" during the closure.
        if params['voiced']:
            t = np.arange(closure_n) / self.sr
            voice_bar = np.sin(2 * np.pi * f0 * 0.8 * t) * 0.15
            audio[:closure_n] = voice_bar

        # Release burst: low-passed noise with an exponential decay.
        burst = np.random.randn(burst_n)
        burst_freq = params['burst_freq'] * formant_shift

        try:
            if burst_freq < self.sr / 2 - 100:
                b, a = signal.butter(2, burst_freq / (self.sr / 2), 'low')
                burst = signal.filtfilt(b, a, burst)
        except Exception:
            pass

        burst_env = np.exp(-np.linspace(0, 5, burst_n))
        burst *= burst_env * params['amp']

        audio[closure_n:] = burst

        return audio

    def _synth_fricative(self, params, f0, speed):
        dur_ms = params['dur'] / speed
        n = int(self.sr * dur_ms / 1000)

        # Band-limited noise shaped by the consonant's frequency range.
        noise = np.random.randn(n)

        low = params['freq_low']
        high = min(params['freq_high'], self.sr / 2 - 100)

        try:
            if low < high:
                b, a = signal.butter(4, [low / (self.sr / 2), high / (self.sr / 2)], 'band')
                noise = signal.filtfilt(b, a, noise)
        except Exception:
            pass

        audio = noise * params['amp']

        # Voiced fricatives mix in a weak glottal component.
        if params['voiced']:
            t = np.arange(n) / self.sr
            voice = self._glottal_source(t, f0, 0.03) * 0.3
            audio = audio + voice

        audio *= self._consonant_envelope(n)

        return audio.astype(np.float32)

    def _synth_affricate(self, params, f0, speed):
        closure_ms = params['closure'] / speed
        fric_ms = params['fric'] / speed

        closure_n = int(self.sr * closure_ms / 1000)
        fric_n = int(self.sr * fric_ms / 1000)

        audio = np.zeros(closure_n + fric_n, dtype=np.float32)

        if params['voiced']:
            t = np.arange(closure_n) / self.sr
            audio[:closure_n] = np.sin(2 * np.pi * f0 * 0.8 * t) * 0.1

        fric = np.random.randn(fric_n)
        low = params['freq_low']
        high = min(params['freq_high'], self.sr / 2 - 100)

        try:
            b, a = signal.butter(3, [low / (self.sr / 2), high / (self.sr / 2)], 'band')
            fric = signal.filtfilt(b, a, fric)
        except Exception:
            pass

        fric *= params['amp']

        fric_env = np.ones(fric_n)
        attack = fric_n // 6
        release = fric_n // 3
        fric_env[:attack] = np.linspace(0, 1, attack)
        fric_env[-release:] = np.linspace(1, 0, release)

        audio[closure_n:] = fric * fric_env

        return audio

    def _synth_nasal(self, params, f0, speed, formant_shift, breathiness):
        dur_ms = params['dur'] / speed
        n = int(self.sr * dur_ms / 1000)
        t = np.arange(n) / self.sr

        source = self._glottal_source(t, f0, breathiness)

        f1 = params['f1'] * formant_shift
        f2 = params['f2'] * formant_shift
        f3 = params['f3'] * formant_shift

        audio = self._apply_formants(source, f1, f2, f3)

        # Extra low nasal resonance, then a low-pass for the muffled nasal quality.
        nasal_pole = self._resonator(source, 250, 100) * 0.4
        audio += nasal_pole

        try:
            b, a = signal.butter(2, 800 / (self.sr / 2), 'low')
            audio = signal.filtfilt(b, a, audio)
        except Exception:
            pass

        audio *= params['amp'] * self._consonant_envelope(n)

        return audio.astype(np.float32)

    def _synth_liquid(self, params, f0, speed, formant_shift, breathiness):
        dur_ms = params['dur'] / speed
        n = int(self.sr * dur_ms / 1000)
        t = np.arange(n) / self.sr

        source = self._glottal_source(t, f0, breathiness)

        f1 = params['f1'] * formant_shift
        f2 = params['f2'] * formant_shift
        f3 = params['f3'] * formant_shift

        audio = self._apply_formants(source, f1, f2, f3)
        audio *= params['amp'] * self._consonant_envelope(n)

        return audio.astype(np.float32)

    def _synth_glide(self, params, f0, speed, formant_shift, breathiness):
        dur_ms = params['dur'] / speed
        n = int(self.sr * dur_ms / 1000)
        t = np.arange(n) / self.sr

        source = self._glottal_source(t, f0, breathiness)

        f1 = params['f1'] * formant_shift
        f2 = params['f2'] * formant_shift
        f3 = params['f3'] * formant_shift

        audio = self._apply_formants(source, f1, f2, f3)
        audio *= params['amp'] * self._consonant_envelope(n)

        return audio.astype(np.float32)

    def _smooth_concat(self, segments):
        """Concatenate segments with a short square-root cross-fade at each joint."""
        if not segments:
            return np.zeros(1000, dtype=np.float32)

        if len(segments) == 1:
            return segments[0]

        overlap = 64
        # Each joint advances the write position by overlap // 2 samples, so size the
        # buffer accordingly (sizing with the full overlap would clip the tail of long
        # utterances).
        total_len = sum(len(s) for s in segments) - (overlap // 2) * (len(segments) - 1)
        total_len = max(total_len, 100)

        audio = np.zeros(total_len, dtype=np.float32)
        pos = 0

        for i, seg in enumerate(segments):
            if len(seg) == 0:
                continue

            end_pos = min(pos + len(seg), total_len)
            seg_len = end_pos - pos

            if seg_len <= 0:
                break

            seg_to_add = seg[:seg_len].copy()

            if i > 0 and pos > overlap:
                fade_len = min(overlap, seg_len)
                fade_in = np.linspace(0, 1, fade_len) ** 0.5
                fade_out = np.linspace(1, 0, fade_len) ** 0.5

                audio[pos:pos + fade_len] *= fade_out
                seg_to_add[:fade_len] *= fade_in

            audio[pos:end_pos] += seg_to_add
            pos = end_pos - overlap // 2
            pos = max(0, pos)

        return audio

    def _normalize(self, audio):
        if len(audio) < 100:
            return audio

        # Remove DC, peak-normalize to 0.9, and fade the edges to avoid clicks.
        audio = audio - np.mean(audio)
        max_val = np.max(np.abs(audio))
        if max_val > 0:
            audio = audio / max_val * 0.9

        fade = min(len(audio) // 40, 200)
        audio[:fade] *= np.linspace(0, 1, fade)
        audio[-fade:] *= np.linspace(1, 0, fade)

        return audio


class VedesTTS:
    """Facade tying together text-to-phoneme conversion, synthesis, and voice analysis."""

    def __init__(self, sample_rate=22050):
        self.sr = sample_rate
        self.text_to_phoneme = TextToPhoneme()
        self.synthesizer = VoiceSynthesizer(sample_rate)
        self.voice_analyzer = VoiceAnalyzer(sample_rate)
        self.current_voice = VOICE_PROFILES["Emma (Female)"]

    def get_voice(self, voice_name):
        if voice_name in VOICE_PROFILES:
            return VOICE_PROFILES[voice_name]
        elif voice_name in custom_voices:
            return custom_voices[voice_name]
        return self.current_voice

    def speak(self, text, rate=1.0, pitch=1.0, voice_name=None):
        if not text or not text.strip():
            return np.zeros(self.sr, dtype=np.float32)

        voice = self.get_voice(voice_name) if voice_name else self.current_voice
        phonemes = self.text_to_phoneme.convert(text)

        if not phonemes:
            return np.zeros(self.sr, dtype=np.float32)

        audio = self.synthesizer.synthesize(phonemes, voice, rate, pitch)

        return audio

    def train_voice(self, audio_data, voice_name="My Voice"):
        """Train a new voice from an audio sample and register it in custom_voices."""
        global custom_voices

        if audio_data is None:
            return None

        # Gradio microphone/upload input arrives as (sample_rate, samples).
        if isinstance(audio_data, tuple):
            sr, audio = audio_data
            audio = audio.astype(np.float32)

            # Mix stereo down to mono.
            if len(audio.shape) > 1:
                audio = audio.mean(axis=1)

            # Resample to the synthesizer's rate if necessary.
            if sr != self.sr:
                duration = len(audio) / sr
                new_length = int(duration * self.sr)
                audio = signal.resample(audio, new_length)
        else:
            audio = audio_data.astype(np.float32)

        max_val = np.max(np.abs(audio))
        if max_val > 0:
            audio = audio / max_val

        profile = self.voice_analyzer.analyze(audio)

        if profile:
            profile['name'] = voice_name
            profile['description'] = f"Trained voice (F0={profile['f0']:.0f}Hz)"
            custom_voices[voice_name] = profile
            return profile

        return None
print("=" * 50) |
|
|
print("🎙️ VEDES TTS - With Voice Training") |
|
|
print("=" * 50) |
|
|
|
|
|
tts = VedesTTS(SAMPLE_RATE) |
|
|
|
|
|
print("✅ Ready!") |
|
|
print(f"📢 Available voices: {len(VOICE_PROFILES)}") |
|
|
print("=" * 50) |


def get_all_voices():
    """Get list of all available voices"""
    voices = list(VOICE_PROFILES.keys()) + list(custom_voices.keys())
    return voices


def get_voice_info(voice_name):
    """Get info about a voice"""
    if voice_name in VOICE_PROFILES:
        v = VOICE_PROFILES[voice_name]
    elif voice_name in custom_voices:
        v = custom_voices[voice_name]
    else:
        return "Select a voice"

    return f"""
**{v.get('name', voice_name)}**
- Type: {v.get('gender', 'unknown').title()}
- Pitch: {v.get('f0', 130):.0f} Hz
- {v.get('description', '')}
"""


def synthesize(text, voice_name, rate, pitch):
    """Synthesize speech for the Gradio UI."""
    if not text or not text.strip():
        return None

    text = text.strip()[:300]

    try:
        # The pitch slider is in semitones; convert to a frequency multiplier.
        pitch_mult = 2 ** (pitch / 12)
        audio = tts.speak(text, rate=rate, pitch=pitch_mult, voice_name=voice_name)

        if len(audio) < 100:
            return None

        audio = np.clip(audio, -1, 1)
        audio_int16 = (audio * 32767).astype(np.int16)

        return (SAMPLE_RATE, audio_int16)

    except Exception as e:
        print(f"Synthesis error: {e}")
        return None


def train_voice(audio, voice_name):
    """Train a new voice from audio"""
    global custom_voices

    if audio is None:
        return "❌ Please record or upload audio first.", get_all_voices()

    if not voice_name or not voice_name.strip():
        voice_name = f"Custom Voice {len(custom_voices) + 1}"

    voice_name = voice_name.strip()[:30]

    # Avoid overwriting a built-in profile of the same name.
    if voice_name in VOICE_PROFILES:
        voice_name = f"{voice_name} (custom)"

    try:
        profile = tts.train_voice(audio, voice_name)

        if profile:
            result = f"""
✅ **Voice "{voice_name}" created!**

**Detected Parameters:**
- Pitch (F0): {profile['f0']:.1f} Hz
- Pitch Variation: {profile['f0_variation']:.1f} Hz
- Formant Shift: {profile['formant_shift']:.2f}
- Breathiness: {profile['breathiness']:.3f}
- Brightness: {profile['brightness']:.2f}

You can now select this voice in the Speak tab!
"""
            return result, get_all_voices()
        else:
            return "❌ Could not analyze the voice. Try a longer or clearer sample.", get_all_voices()

    except Exception as e:
        return f"❌ Error: {str(e)}", get_all_voices()


def create_custom_voice(name, pitch, formant, breathiness, speed, brightness):
    """Create a custom voice from parameters"""
    global custom_voices

    if not name or not name.strip():
        return "❌ Please enter a voice name.", get_all_voices()

    name = name.strip()[:30]

    if name in VOICE_PROFILES:
        name = f"{name} (custom)"

    profile = {
        "name": name,
        "gender": "custom",
        "f0": pitch,
        "f0_variation": 25,
        "formant_shift": formant,
        "breathiness": breathiness / 100,  # the slider value is a percentage
        "speed": speed,
        "brightness": brightness,
        "description": f"Custom voice (F0={pitch}Hz)"
    }

    custom_voices[name] = profile

    return f"✅ Voice **{name}** created! Select it in the Speak tab.", get_all_voices()


def refresh_voices():
    """Refresh the voice list"""
    return gr.update(choices=get_all_voices())
with gr.Blocks(title="Vedes TTS", theme=gr.themes.Soft()) as demo: |
|
|
|
|
|
gr.Markdown(""" |
|
|
# 🎙️ Vedes TTS - Voice Training Edition |
|
|
### Create and Use Custom Voices - 100% From Scratch |
|
|
""") |
|
|
|
|
|
with gr.Tabs(): |
|
|
|
|
|
with gr.TabItem("🔊 Speak"): |
|
|
with gr.Row(): |
|
|
with gr.Column(scale=2): |
|
|
text_input = gr.Textbox( |
|
|
label="📝 Text to Speak", |
|
|
placeholder="Type something...", |
|
|
lines=3 |
|
|
) |
|
|
|
|
|
with gr.Row(): |
|
|
voice_select = gr.Dropdown( |
|
|
choices=get_all_voices(), |
|
|
value="Emma (Female)", |
|
|
label="🗣️ Voice", |
|
|
interactive=True |
|
|
) |
|
|
refresh_btn = gr.Button("🔄", size="sm") |
|
|
|
|
|
voice_info = gr.Markdown("Select a voice") |
|
|
|
|
|
with gr.Row(): |
|
|
rate = gr.Slider(0.6, 1.5, 0.9, step=0.1, label="⏱️ Speed") |
|
|
pitch = gr.Slider(-6, 6, 0, step=1, label="🎵 Pitch") |
|
|
|
|
|
speak_btn = gr.Button("🔊 Speak", variant="primary", size="lg") |
|
|
|
|
|
with gr.Column(scale=1): |
|
|
audio_out = gr.Audio(label="🎧 Output", type="numpy") |
|
|
|
|
|
gr.Examples( |
|
|
examples=[ |
|
|
["Hello, how are you?"], |
|
|
["Good morning!"], |
|
|
["My name is Vedes."], |
|
|
["Thank you very much."], |
|
|
["Have a nice day."], |
|
|
], |
|
|
inputs=text_input, |
|
|
label="📚 Examples" |
|
|
) |
|
|
|
|
|
|
|
|
with gr.TabItem("🎤 Train Voice"): |
|
|
gr.Markdown(""" |
|
|
### Train a New Voice from Audio |
|
|
|
|
|
Record or upload 3-10 seconds of clear speech. |
|
|
|
|
|
**Tips:** |
|
|
- Speak naturally and clearly |
|
|
- Avoid background noise |
|
|
- Read a few sentences |
|
|
""") |
|
|
|
|
|
with gr.Row(): |
|
|
with gr.Column(): |
|
|
audio_input = gr.Audio( |
|
|
label="🎤 Record or Upload", |
|
|
sources=["microphone", "upload"], |
|
|
type="numpy" |
|
|
) |
|
|
|
|
|
voice_name_input = gr.Textbox( |
|
|
label="Voice Name", |
|
|
placeholder="e.g., My Voice", |
|
|
value="" |
|
|
) |
|
|
|
|
|
train_btn = gr.Button("🧠 Train Voice", variant="primary") |
|
|
|
|
|
with gr.Column(): |
|
|
train_result = gr.Markdown("Record audio and click Train") |
|
|
|
|
|
gr.Markdown(""" |
|
|
### What Gets Analyzed: |
|
|
- **Pitch (F0)**: How high/low the voice is |
|
|
- **Formants**: Voice quality/timbre |
|
|
- **Breathiness**: Air in the voice |
|
|
""") |
|
|
|
|
|
|
|
|
with gr.TabItem("⚙️ Create Voice"): |
|
|
gr.Markdown("### Create Custom Voice Manually") |
|
|
|
|
|
with gr.Row(): |
|
|
with gr.Column(): |
|
|
custom_name = gr.Textbox( |
|
|
label="Voice Name", |
|
|
placeholder="My Custom Voice" |
|
|
) |
|
|
|
|
|
custom_pitch = gr.Slider( |
|
|
60, 300, 150, |
|
|
label="Pitch (Hz)", |
|
|
info="60-130=Male, 150-250=Female, 250+=Child" |
|
|
) |
|
|
|
|
|
custom_formant = gr.Slider( |
|
|
0.7, 1.4, 1.0, step=0.05, |
|
|
label="Formant Shift", |
|
|
info="<1.0=Male, >1.0=Female/Child" |
|
|
) |
|
|
|
|
|
custom_breathiness = gr.Slider( |
|
|
1, 10, 3, |
|
|
label="Breathiness (%)" |
|
|
) |
|
|
|
|
|
custom_speed = gr.Slider( |
|
|
0.7, 1.3, 1.0, step=0.05, |
|
|
label="Natural Speed" |
|
|
) |
|
|
|
|
|
custom_brightness = gr.Slider( |
|
|
0.8, 1.3, 1.0, step=0.05, |
|
|
label="Brightness" |
|
|
) |
|
|
|
|
|
create_btn = gr.Button("✨ Create Voice", variant="primary") |
|
|
|
|
|
with gr.Column(): |
|
|
create_result = gr.Markdown("") |
|
|
|
|
|
gr.Markdown(""" |
|
|
### Quick Presets: |
|
|
|
|
|
| Type | Pitch | Formant | |
|
|
|------|-------|---------| |
|
|
| Deep Male | 85 | 0.85 | |
|
|
| Male | 120 | 0.92 | |
|
|
| Female | 200 | 1.12 | |
|
|
| High Female | 240 | 1.20 | |
|
|
| Child | 280 | 1.25 | |
|
|
""") |
|
|
|
|
|
|
|
|
with gr.TabItem("👥 All Voices"): |
|
|
gr.Markdown("### Pre-built Voices") |
|
|
|
|
|
voice_info_md = "" |
|
|
for name, v in VOICE_PROFILES.items(): |
|
|
voice_info_md += f""" |
|
|
**{name}** |
|
|
- Type: {v['gender'].title()} | Pitch: {v['f0']} Hz |
|
|
- {v['description']} |
|
|
|
|
|
""" |
|
|
gr.Markdown(voice_info_md) |
|
|
|
|
|
gr.Markdown("### Custom Voices") |
|
|
custom_voices_display = gr.Markdown("*No custom voices yet*") |
|
|
|
|
|
|
|
|
|
|
|
|
|
|

    # Event wiring
    voice_select.change(get_voice_info, voice_select, voice_info)
    refresh_btn.click(refresh_voices, outputs=voice_select)
    speak_btn.click(synthesize, [text_input, voice_select, rate, pitch], audio_out)
    text_input.submit(synthesize, [text_input, voice_select, rate, pitch], audio_out)

    def train_and_update(audio, name):
        result, voices = train_voice(audio, name)
        return result, gr.update(choices=voices)

    train_btn.click(
        train_and_update,
        [audio_input, voice_name_input],
        [train_result, voice_select]
    )

    def create_and_update(name, pitch, formant, breathiness, speed, brightness):
        result, voices = create_custom_voice(name, pitch, formant, breathiness, speed, brightness)
        return result, gr.update(choices=voices)

    create_btn.click(
        create_and_update,
        [custom_name, custom_pitch, custom_formant, custom_breathiness,
         custom_speed, custom_brightness],
        [create_result, voice_select]
    )


if __name__ == "__main__":
    demo.launch()