vedaco committed (verified)
Commit 9ee9e3a · Parent(s): b67fedc

Update app.py

Files changed (1): app.py (+234 -207)

app.py CHANGED
--- app.py (before)
@@ -8,7 +8,7 @@ import json
  import os

  # ============================================
- # VEDES TTS - WITH VOICE TRAINING
  # 100% From Scratch - No APIs
  # ============================================

@@ -22,9 +22,9 @@ VOICE_PROFILES = {
  "Emma (Female)": {
  "name": "Emma",
  "gender": "female",
- "f0": 210, # Higher pitch
  "f0_variation": 30,
- "formant_shift": 1.15, # Shift formants up
  "breathiness": 0.04,
  "speed": 1.0,
  "brightness": 1.1,
@@ -33,9 +33,9 @@ VOICE_PROFILES = {
  "James (Male)": {
  "name": "James",
  "gender": "male",
- "f0": 110, # Lower pitch
  "f0_variation": 20,
- "formant_shift": 0.9, # Shift formants down
  "breathiness": 0.02,
  "speed": 0.95,
  "brightness": 0.95,
@@ -44,7 +44,7 @@ VOICE_PROFILES = {
  "Sophie (Child)": {
  "name": "Sophie",
  "gender": "child",
- "f0": 280, # High pitch
  "f0_variation": 40,
  "formant_shift": 1.25,
  "breathiness": 0.03,
@@ -55,7 +55,7 @@ VOICE_PROFILES = {
  "David (Deep Male)": {
  "name": "David",
  "gender": "male",
- "f0": 85, # Very deep
  "f0_variation": 15,
  "formant_shift": 0.82,
  "breathiness": 0.02,
@@ -109,7 +109,7 @@ VOICE_PROFILES = {
  },
  }

- # Custom voices storage
  custom_voices = {}

  # ============================================
@@ -366,7 +366,7 @@ LETTERS = {


  # ============================================
- # VOICE ANALYZER - Extract Voice Features
  # ============================================

  class VoiceAnalyzer:
@@ -377,19 +377,18 @@ class VoiceAnalyzer:

  def analyze(self, audio):
  """Extract voice features from audio sample"""
- if len(audio) < self.sr * 0.5:
  return None

- # Normalize
  audio = audio.astype(np.float32)
- audio = audio / (np.max(np.abs(audio)) + 1e-8)

- # Extract features
  f0 = self._estimate_pitch(audio)
  formants = self._estimate_formants(audio)
  breathiness = self._estimate_breathiness(audio)

- # Create voice profile
  profile = {
  "name": "Custom Voice",
  "gender": "custom",
@@ -399,40 +398,45 @@ class VoiceAnalyzer:
  "breathiness": breathiness,
  "speed": 1.0,
  "brightness": formants.get('brightness', 1.0),
- "description": "Voice extracted from audio sample"
  }

  return profile

  def _estimate_pitch(self, audio):
- """Estimate fundamental frequency (F0)"""
- # Use autocorrelation
- frame_size = int(self.sr * 0.03) # 30ms frames
-
  pitches = []
  for i in range(0, len(audio) - frame_size, frame_size):
  frame = audio[i:i + frame_size]

  # Autocorrelation
  corr = np.correlate(frame, frame, mode='full')
  corr = corr[len(corr)//2:]

- # Find first peak after initial decline
  d = np.diff(corr)
- start = np.where(d > 0)[0]

- if len(start) > 0:
- start = start[0]
- peak = start + np.argmax(corr[start:start + int(self.sr / 80)])

- if peak > 0:
- f0 = self.sr / peak
- if 60 < f0 < 400:
- pitches.append(f0)

  if pitches:
- return np.median(pitches)
- return 130 # Default

  def _estimate_f0_variation(self, audio, base_f0):
  """Estimate pitch variation"""
@@ -441,41 +445,42 @@ class VoiceAnalyzer:

  for i in range(0, len(audio) - frame_size, frame_size):
  frame = audio[i:i + frame_size]

  corr = np.correlate(frame, frame, mode='full')
  corr = corr[len(corr)//2:]

  d = np.diff(corr)
- start = np.where(d > 0)[0]

- if len(start) > 0:
- start = start[0]
- peak = start + np.argmax(corr[start:start + int(self.sr / 80)])
- if peak > 0:
- f0 = self.sr / peak
- if 60 < f0 < 400:
- pitches.append(f0)

  if len(pitches) > 2:
- return min(np.std(pitches), 50)
- return 20

  def _estimate_formants(self, audio):
  """Estimate formant characteristics"""
- # Simple spectral analysis
  frame_size = 2048

  if len(audio) < frame_size:
  return {'shift': 1.0, 'brightness': 1.0}

- # Get spectrum
  spectrum = np.abs(np.fft.rfft(audio[:frame_size] * np.hanning(frame_size)))
  freqs = np.fft.rfftfreq(frame_size, 1/self.sr)

- # Find spectral centroid
- centroid = np.sum(freqs * spectrum) / (np.sum(spectrum) + 1e-8)

- # Estimate formant shift based on centroid
- # Average male ~1200Hz, female ~1400Hz
  if centroid > 1600:
  shift = 1.2
  brightness = 1.15
@@ -495,7 +500,7 @@
  return {'shift': shift, 'brightness': brightness}

  def _estimate_breathiness(self, audio):
- """Estimate breathiness/aspiration"""
  frame_size = 2048

  if len(audio) < frame_size:
@@ -504,12 +509,14 @@
  spectrum = np.abs(np.fft.rfft(audio[:frame_size]))
  freqs = np.fft.rfftfreq(frame_size, 1/self.sr)

- # High frequency energy ratio (breathiness indicator)
- low_energy = np.sum(spectrum[freqs < 1000])
- high_energy = np.sum(spectrum[(freqs > 2000) & (freqs < 5000)])

- ratio = high_energy / (low_energy + 1e-8)
- breathiness = np.clip(ratio * 0.1, 0.02, 0.1)

  return breathiness

@@ -569,7 +576,7 @@ class TextToPhoneme:


  # ============================================
- # VOICE-AWARE FORMANT SYNTHESIZER
  # ============================================

  class VoiceSynthesizer:
@@ -583,7 +590,6 @@

  voice = voice_profile or self.default_voice

- # Get voice parameters
  f0 = voice.get('f0', 130) * pitch
  f0_var = voice.get('f0_variation', 20)
  formant_shift = voice.get('formant_shift', 1.0)
@@ -597,7 +603,6 @@
  prev_phon = phonemes[i - 1] if i > 0 else None
  next_phon = phonemes[i + 1] if i < len(phonemes) - 1 else None

- # Add pitch variation
  phrase_pos = i / max(len(phonemes), 1)
  f0_current = f0 + f0_var * np.sin(phrase_pos * np.pi) * 0.5

@@ -620,39 +625,28 @@

  if phon in VOWELS:
  return self._synth_vowel(phon, f0, speed, formant_shift,
- breathiness, brightness, prev_phon, next_phon)

  if phon in CONSONANTS:
  return self._synth_consonant(phon, f0, speed, formant_shift, breathiness)

  return np.zeros(100, dtype=np.float32)

- def _synth_vowel(self, phon, f0, speed, formant_shift, breathiness,
- brightness, prev_phon, next_phon):
  params = VOWELS[phon]
  f1, f2, f3, dur_ms, amp, voiced = params

- # Apply formant shift
  f1 = f1 * formant_shift
- f2 = f2 * formant_shift
- f3 = f3 * formant_shift
-
- # Apply brightness
- f2 = f2 * brightness
- f3 = f3 * brightness

  dur_ms = dur_ms / speed
  n = int(self.sr * dur_ms / 1000)
  n = max(n, 100)
  t = np.arange(n) / self.sr

- # Generate glottal source with voice characteristics
  source = self._glottal_source(t, f0, breathiness)
-
- # Apply formants
  audio = self._apply_formants(source, f1, f2, f3)
-
- # Apply envelope
  envelope = self._vowel_envelope(n)
  audio = audio * envelope * amp

@@ -663,17 +657,17 @@
  ctype = params['type']

  if ctype == 'stop':
- return self._synth_stop(phon, params, f0, speed, formant_shift)
  elif ctype == 'fric':
- return self._synth_fricative(phon, params, f0, speed)
  elif ctype == 'affric':
- return self._synth_affricate(phon, params, f0, speed)
  elif ctype == 'nasal':
- return self._synth_nasal(phon, params, f0, speed, formant_shift, breathiness)
  elif ctype == 'liquid':
- return self._synth_liquid(phon, params, f0, speed, formant_shift, breathiness)
  elif ctype == 'glide':
- return self._synth_glide(phon, params, f0, speed, formant_shift, breathiness)

  return np.zeros(100, dtype=np.float32)

@@ -688,10 +682,7 @@
  mask2 = (phase >= 0.4) & (phase < 0.6)
  glottal[mask2] = np.cos(np.pi * (phase[mask2] - 0.4) / 0.4)

- # Add breathiness
  glottal += np.random.randn(len(t)) * breathiness
-
- # Add shimmer
  shimmer = 1 + 0.02 * np.sin(2 * np.pi * 5 * t)
  glottal *= shimmer

@@ -743,7 +734,7 @@

  return env

- def _synth_stop(self, phon, params, f0, speed, formant_shift):
  closure_ms = params['closure'] / speed
  burst_ms = params['burst'] / speed

@@ -775,7 +766,7 @@

  return audio

- def _synth_fricative(self, phon, params, f0, speed):
  dur_ms = params['dur'] / speed
  n = int(self.sr * dur_ms / 1000)

@@ -802,7 +793,7 @@

  return audio.astype(np.float32)

- def _synth_affricate(self, phon, params, f0, speed):
  closure_ms = params['closure'] / speed
  fric_ms = params['fric'] / speed

@@ -837,7 +828,7 @@

  return audio

- def _synth_nasal(self, phon, params, f0, speed, formant_shift, breathiness):
  dur_ms = params['dur'] / speed
  n = int(self.sr * dur_ms / 1000)
  t = np.arange(n) / self.sr
@@ -863,7 +854,7 @@

  return audio.astype(np.float32)

- def _synth_liquid(self, phon, params, f0, speed, formant_shift, breathiness):
  dur_ms = params['dur'] / speed
  n = int(self.sr * dur_ms / 1000)
  t = np.arange(n) / self.sr
@@ -879,7 +870,7 @@

  return audio.astype(np.float32)

- def _synth_glide(self, phon, params, f0, speed, formant_shift, breathiness):
  dur_ms = params['dur'] / speed
  n = int(self.sr * dur_ms / 1000)
  t = np.arange(n) / self.sr
@@ -919,7 +910,7 @@
  if seg_len <= 0:
  break

- seg_to_add = seg[:seg_len]

  if i > 0 and pos > overlap:
  fade_len = min(overlap, seg_len)
@@ -927,7 +918,6 @@
  fade_out = np.linspace(1, 0, fade_len) ** 0.5

  audio[pos:pos + fade_len] *= fade_out
- seg_to_add = seg_to_add.copy()
  seg_to_add[:fade_len] *= fade_in

  audio[pos:end_pos] += seg_to_add
@@ -964,39 +954,45 @@ class VedesTTS:
  self.voice_analyzer = VoiceAnalyzer(sample_rate)
  self.current_voice = VOICE_PROFILES["Emma (Female)"]

- def set_voice(self, voice_name):
  if voice_name in VOICE_PROFILES:
- self.current_voice = VOICE_PROFILES[voice_name]
  elif voice_name in custom_voices:
- self.current_voice = custom_voices[voice_name]

  def speak(self, text, rate=1.0, pitch=1.0, voice_name=None):
  if not text or not text.strip():
  return np.zeros(self.sr, dtype=np.float32)

- if voice_name:
- self.set_voice(voice_name)
-
  phonemes = self.text_to_phoneme.convert(text)

  if not phonemes:
  return np.zeros(self.sr, dtype=np.float32)

- audio = self.synthesizer.synthesize(phonemes, self.current_voice, rate, pitch)

  return audio

  def train_voice(self, audio_data, voice_name="My Voice"):
  """Train a new voice from audio sample"""

  if audio_data is None:
  return None

- # Handle different input formats
  if isinstance(audio_data, tuple):
  sr, audio = audio_data
  audio = audio.astype(np.float32)

  if sr != self.sr:
- # Resample
  duration = len(audio) / sr
  new_length = int(duration * self.sr)
  audio = signal.resample(audio, new_length)
@@ -1004,13 +1000,16 @@
  audio = audio_data.astype(np.float32)

  # Normalize
- audio = audio / (np.max(np.abs(audio)) + 1e-8)

- # Analyze voice
  profile = self.voice_analyzer.analyze(audio)

  if profile:
  profile['name'] = voice_name
  custom_voices[voice_name] = profile
  return profile

@@ -1033,10 +1032,38 @@ print("=" * 50)


  # ============================================
- # GRADIO INTERFACE
  # ============================================

  def synthesize(text, voice_name, rate, pitch):
  if not text or not text.strip():
  return None

@@ -1044,17 +1071,7 @@ def synthesize(text, voice_name, rate, pitch):

  try:
  pitch_mult = 2 ** (pitch / 12)
-
- # Check custom voices first
- if voice_name in custom_voices:
- voice = custom_voices[voice_name]
- elif voice_name in VOICE_PROFILES:
- voice = VOICE_PROFILES[voice_name]
- else:
- voice = VOICE_PROFILES["Emma (Female)"]
-
- tts.current_voice = voice
- audio = tts.speak(text, rate=rate, pitch=pitch_mult)

  if len(audio) < 100:
  return None
@@ -1065,67 +1082,61 @@
  return (SAMPLE_RATE, audio_int16)

  except Exception as e:
- print(f"Error: {e}")
  return None


  def train_voice(audio, voice_name):
  if audio is None:
- return "❌ No audio provided", gr.update(choices=get_all_voices())

  if not voice_name or not voice_name.strip():
- voice_name = "My Voice"

  voice_name = voice_name.strip()[:30]

  try:
  profile = tts.train_voice(audio, voice_name)

  if profile:
- details = f"""
- ✅ Voice "{voice_name}" created successfully!

- **Voice Parameters:**
  - Pitch (F0): {profile['f0']:.1f} Hz
  - Pitch Variation: {profile['f0_variation']:.1f} Hz
  - Formant Shift: {profile['formant_shift']:.2f}
  - Breathiness: {profile['breathiness']:.3f}
  - Brightness: {profile['brightness']:.2f}
  """
- return details, gr.update(choices=get_all_voices(), value=voice_name)
  else:
- return "❌ Could not analyze voice. Try a longer sample.", gr.update(choices=get_all_voices())

  except Exception as e:
- return f"❌ Error: {str(e)}", gr.update(choices=get_all_voices())
-
-
- def get_all_voices():
- voices = list(VOICE_PROFILES.keys()) + list(custom_voices.keys())
- return voices
-
-
- def get_voice_info(voice_name):
- if voice_name in VOICE_PROFILES:
- v = VOICE_PROFILES[voice_name]
- elif voice_name in custom_voices:
- v = custom_voices[voice_name]
- else:
- return "Select a voice"
-
- return f"""
- **{v.get('name', voice_name)}**
- - Type: {v.get('gender', 'unknown').title()}
- - Pitch: {v.get('f0', 130):.0f} Hz
- - {v.get('description', '')}
- """


  def create_custom_voice(name, pitch, formant, breathiness, speed, brightness):
  if not name or not name.strip():
- return "❌ Please enter a voice name", gr.update(choices=get_all_voices())

- name = name.strip()

  profile = {
  "name": name,
@@ -1136,15 +1147,23 @@ def create_custom_voice(name, pitch, formant, breathiness, speed, brightness):
  "breathiness": breathiness / 100,
  "speed": speed,
  "brightness": brightness,
- "description": f"Custom voice (pitch={pitch}Hz)"
  }

  custom_voices[name] = profile

- return f"✅ Voice '{name}' created!", gr.update(choices=get_all_voices(), value=name)


- # Build interface
  with gr.Blocks(title="Vedes TTS", theme=gr.themes.Soft()) as demo:

  gr.Markdown("""
@@ -1159,15 +1178,18 @@ with gr.Blocks(title="Vedes TTS", theme=gr.themes.Soft()) as demo:
  with gr.Column(scale=2):
  text_input = gr.Textbox(
  label="📝 Text to Speak",
- placeholder="Type here...",
  lines=3
  )

- voice_select = gr.Dropdown(
- choices=get_all_voices(),
- value="Emma (Female)",
- label="🗣️ Voice"
- )

  voice_info = gr.Markdown("Select a voice")

@@ -1197,44 +1219,43 @@ with gr.Blocks(title="Vedes TTS", theme=gr.themes.Soft()) as demo:
  gr.Markdown("""
  ### Train a New Voice from Audio

- Record or upload an audio sample, and Vedes will extract the voice characteristics.

- **Tips for best results:**
- - Record 5-10 seconds of speech
- - Speak clearly and naturally
  - Avoid background noise
  """)

  with gr.Row():
  with gr.Column():
  audio_input = gr.Audio(
- label="🎤 Record or Upload Audio",
  sources=["microphone", "upload"],
  type="numpy"
  )

  voice_name_input = gr.Textbox(
  label="Voice Name",
- placeholder="My Voice",
- value="My Voice"
  )

  train_btn = gr.Button("🧠 Train Voice", variant="primary")

  with gr.Column():
- train_result = gr.Markdown("Upload audio and click Train")
- trained_voice_select = gr.Dropdown(
- choices=get_all_voices(),
- label="Use Trained Voice"
- )

  # ===== CREATE VOICE TAB =====
  with gr.TabItem("⚙️ Create Voice"):
- gr.Markdown("""
- ### Create Custom Voice Manually
-
- Adjust the parameters to create your own voice:
- """)

  with gr.Row():
  with gr.Column():
@@ -1246,19 +1267,18 @@ with gr.Blocks(title="Vedes TTS", theme=gr.themes.Soft()) as demo:
  custom_pitch = gr.Slider(
  60, 300, 150,
  label="Pitch (Hz)",
- info="80-130 = Male, 150-250 = Female, 250+ = Child"
  )

  custom_formant = gr.Slider(
  0.7, 1.4, 1.0, step=0.05,
  label="Formant Shift",
- info="<1.0 = Larger vocal tract (male), >1.0 = Smaller (female)"
  )

  custom_breathiness = gr.Slider(
  1, 10, 3,
- label="Breathiness",
- info="Higher = more breathy/airy voice"
  )

  custom_speed = gr.Slider(
@@ -1268,72 +1288,79 @@ with gr.Blocks(title="Vedes TTS", theme=gr.themes.Soft()) as demo:

  custom_brightness = gr.Slider(
  0.8, 1.3, 1.0, step=0.05,
- label="Brightness",
- info="Higher = brighter, more forward voice"
  )

  create_btn = gr.Button("✨ Create Voice", variant="primary")

  with gr.Column():
  create_result = gr.Markdown("")
- created_voice_select = gr.Dropdown(
- choices=get_all_voices(),
- label="Created Voices"
- )

  gr.Markdown("""
- ### Voice Parameter Guide

- | Parameter | Male | Female | Child |
- |-----------|------|--------|-------|
- | Pitch | 80-130 Hz | 150-250 Hz | 250-350 Hz |
- | Formant | 0.85-0.95 | 1.05-1.20 | 1.20-1.35 |
- | Breathiness | 2-4 | 3-6 | 2-4 |
- | Brightness | 0.9-1.0 | 1.0-1.15 | 1.1-1.25 |
  """)

- # ===== VOICES TAB =====
  with gr.TabItem("👥 All Voices"):
- gr.Markdown("### Available Voices")

- voice_cards = ""
  for name, v in VOICE_PROFILES.items():
- voice_cards += f"""
  **{name}**
- - Type: {v['gender'].title()}
- - Pitch: {v['f0']} Hz
  - {v['description']}

- ---
  """
- gr.Markdown(voice_cards)

- # Event handlers
  voice_select.change(get_voice_info, voice_select, voice_info)
  speak_btn.click(synthesize, [text_input, voice_select, rate, pitch], audio_out)
  text_input.submit(synthesize, [text_input, voice_select, rate, pitch], audio_out)

  train_btn.click(
- train_voice,
- [audio_input, voice_name_input],
- [train_result, trained_voice_select]
  )

  create_btn.click(
- create_custom_voice,
  [custom_name, custom_pitch, custom_formant, custom_breathiness,
  custom_speed, custom_brightness],
- [create_result, created_voice_select]
- )
-
- # Update voice selectors when new voices are created
- trained_voice_select.change(
- lambda x: x, trained_voice_select, voice_select
- )
- created_voice_select.change(
- lambda x: x, created_voice_select, voice_select
- )


  if __name__ == "__main__":
  demo.launch()
+++ app.py (after)
  import os

  # ============================================
+ # VEDES TTS - WITH VOICE TRAINING (FIXED)
  # 100% From Scratch - No APIs
  # ============================================

  "Emma (Female)": {
  "name": "Emma",
  "gender": "female",
+ "f0": 210,
  "f0_variation": 30,
+ "formant_shift": 1.15,
  "breathiness": 0.04,
  "speed": 1.0,
  "brightness": 1.1,

  "James (Male)": {
  "name": "James",
  "gender": "male",
+ "f0": 110,
  "f0_variation": 20,
+ "formant_shift": 0.9,
  "breathiness": 0.02,
  "speed": 0.95,
  "brightness": 0.95,

  "Sophie (Child)": {
  "name": "Sophie",
  "gender": "child",
+ "f0": 280,
  "f0_variation": 40,
  "formant_shift": 1.25,
  "breathiness": 0.03,

  "David (Deep Male)": {
  "name": "David",
  "gender": "male",
+ "f0": 85,
  "f0_variation": 15,
  "formant_shift": 0.82,
  "breathiness": 0.02,

  },
  }

+ # Custom voices storage (global)
  custom_voices = {}

  # ============================================

  # ============================================
+ # VOICE ANALYZER
  # ============================================

  class VoiceAnalyzer:

  def analyze(self, audio):
  """Extract voice features from audio sample"""
+ if len(audio) < self.sr * 0.3:
  return None

  audio = audio.astype(np.float32)
+ max_val = np.max(np.abs(audio))
+ if max_val > 0:
+ audio = audio / max_val

  f0 = self._estimate_pitch(audio)
  formants = self._estimate_formants(audio)
  breathiness = self._estimate_breathiness(audio)

  profile = {
  "name": "Custom Voice",
  "gender": "custom",

  "breathiness": breathiness,
  "speed": 1.0,
  "brightness": formants.get('brightness', 1.0),
+ "description": f"Custom voice (F0={f0:.0f}Hz)"
  }

  return profile

  def _estimate_pitch(self, audio):
+ """Estimate fundamental frequency using autocorrelation"""
+ frame_size = int(self.sr * 0.03)
  pitches = []
+
  for i in range(0, len(audio) - frame_size, frame_size):
  frame = audio[i:i + frame_size]

+ # Remove DC
+ frame = frame - np.mean(frame)
+
  # Autocorrelation
  corr = np.correlate(frame, frame, mode='full')
  corr = corr[len(corr)//2:]

+ # Find peaks
  d = np.diff(corr)
+ start_indices = np.where(d > 0)[0]

+ if len(start_indices) > 0:
+ start = start_indices[0]
+ search_end = min(start + int(self.sr / 60), len(corr))

+ if search_end > start:
+ peak = start + np.argmax(corr[start:search_end])
+
+ if peak > 0:
+ f0 = self.sr / peak
+ if 60 < f0 < 400:
+ pitches.append(f0)

  if pitches:
+ return float(np.median(pitches))
+ return 130.0

  def _estimate_f0_variation(self, audio, base_f0):
  """Estimate pitch variation"""

  for i in range(0, len(audio) - frame_size, frame_size):
  frame = audio[i:i + frame_size]
+ frame = frame - np.mean(frame)
+
  corr = np.correlate(frame, frame, mode='full')
  corr = corr[len(corr)//2:]

  d = np.diff(corr)
+ start_indices = np.where(d > 0)[0]

+ if len(start_indices) > 0:
+ start = start_indices[0]
+ search_end = min(start + int(self.sr / 60), len(corr))
+
+ if search_end > start:
+ peak = start + np.argmax(corr[start:search_end])
+ if peak > 0:
+ f0 = self.sr / peak
+ if 60 < f0 < 400:
+ pitches.append(f0)

  if len(pitches) > 2:
+ return min(float(np.std(pitches)), 50.0)
+ return 20.0
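The widened search window is the substance of this fix: the old `int(self.sr / 80)` capped the detectable pitch floor at 80 Hz (below David's 85 Hz with no margin), while `int(self.sr / 60)` reaches lags down to 60 Hz, and the `min(..., len(corr))` clamp keeps the slice in bounds on short frames. A standalone sketch of the same estimator, assuming a 22050 Hz sample rate (the `estimate_f0` helper is illustrative, not part of app.py):

```python
import numpy as np

def estimate_f0(frame, sr=22050):
    """Autocorrelation pitch estimate over one frame (illustrative sketch)."""
    frame = frame - np.mean(frame)                 # remove DC offset
    corr = np.correlate(frame, frame, mode='full')
    corr = corr[len(corr) // 2:]                   # keep non-negative lags
    rising = np.where(np.diff(corr) > 0)[0]        # end of the lag-0 decline
    if len(rising) == 0:
        return None
    start = rising[0]
    search_end = min(start + int(sr / 60), len(corr))  # lags down to ~60 Hz
    peak = start + np.argmax(corr[start:search_end])
    if peak == 0:
        return None
    f0 = sr / peak
    return f0 if 60 < f0 < 400 else None

sr = 22050
t = np.arange(int(sr * 0.03)) / sr                 # one 30 ms frame
print(estimate_f0(np.sin(2 * np.pi * 200 * t), sr))  # ~200.5 Hz
```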
 
  def _estimate_formants(self, audio):
  """Estimate formant characteristics"""
  frame_size = 2048

  if len(audio) < frame_size:
  return {'shift': 1.0, 'brightness': 1.0}

  spectrum = np.abs(np.fft.rfft(audio[:frame_size] * np.hanning(frame_size)))
  freqs = np.fft.rfftfreq(frame_size, 1/self.sr)

+ total_energy = np.sum(spectrum) + 1e-8
+ centroid = np.sum(freqs * spectrum) / total_energy

  if centroid > 1600:
  shift = 1.2
  brightness = 1.15

  return {'shift': shift, 'brightness': brightness}

  def _estimate_breathiness(self, audio):
+ """Estimate breathiness"""
  frame_size = 2048

  if len(audio) < frame_size:

  spectrum = np.abs(np.fft.rfft(audio[:frame_size]))
  freqs = np.fft.rfftfreq(frame_size, 1/self.sr)

+ low_mask = freqs < 1000
+ high_mask = (freqs > 2000) & (freqs < 5000)

+ low_energy = np.sum(spectrum[low_mask]) + 1e-8
+ high_energy = np.sum(spectrum[high_mask])
+
+ ratio = high_energy / low_energy
+ breathiness = float(np.clip(ratio * 0.1, 0.02, 0.1))

  return breathiness
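The breathiness measure is a band-energy ratio: aspiration noise concentrates above 2 kHz while voiced energy sits below 1 kHz, so their ratio, scaled and clipped to the 0.02-0.1 range the synthesizer expects, tracks how breathy the sample is. The same computation in isolation (the `estimate_breathiness` function here is an illustrative sketch):

```python
import numpy as np

def estimate_breathiness(audio, sr=22050, frame_size=2048):
    """Band-energy ratio breathiness estimate (illustrative sketch)."""
    spectrum = np.abs(np.fft.rfft(audio[:frame_size]))
    freqs = np.fft.rfftfreq(frame_size, 1 / sr)
    low = np.sum(spectrum[freqs < 1000]) + 1e-8               # voiced band
    high = np.sum(spectrum[(freqs > 2000) & (freqs < 5000)])  # aspiration band
    return float(np.clip(high / low * 0.1, 0.02, 0.1))

rng = np.random.default_rng(0)
print(estimate_breathiness(rng.standard_normal(2048)))  # white noise -> near the 0.1 cap
```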
 
 


  # ============================================
+ # VOICE SYNTHESIZER
  # ============================================

  class VoiceSynthesizer:

  voice = voice_profile or self.default_voice

  f0 = voice.get('f0', 130) * pitch
  f0_var = voice.get('f0_variation', 20)
  formant_shift = voice.get('formant_shift', 1.0)

  prev_phon = phonemes[i - 1] if i > 0 else None
  next_phon = phonemes[i + 1] if i < len(phonemes) - 1 else None

  phrase_pos = i / max(len(phonemes), 1)
  f0_current = f0 + f0_var * np.sin(phrase_pos * np.pi) * 0.5

  if phon in VOWELS:
  return self._synth_vowel(phon, f0, speed, formant_shift,
+ breathiness, brightness)

  if phon in CONSONANTS:
  return self._synth_consonant(phon, f0, speed, formant_shift, breathiness)

  return np.zeros(100, dtype=np.float32)

+ def _synth_vowel(self, phon, f0, speed, formant_shift, breathiness, brightness):
  params = VOWELS[phon]
  f1, f2, f3, dur_ms, amp, voiced = params

  f1 = f1 * formant_shift
+ f2 = f2 * formant_shift * brightness
+ f3 = f3 * formant_shift * brightness

  dur_ms = dur_ms / speed
  n = int(self.sr * dur_ms / 1000)
  n = max(n, 100)
  t = np.arange(n) / self.sr

  source = self._glottal_source(t, f0, breathiness)
  audio = self._apply_formants(source, f1, f2, f3)
  envelope = self._vowel_envelope(n)
  audio = audio * envelope * amp

  ctype = params['type']

  if ctype == 'stop':
+ return self._synth_stop(params, f0, speed, formant_shift)
  elif ctype == 'fric':
+ return self._synth_fricative(params, f0, speed)
  elif ctype == 'affric':
+ return self._synth_affricate(params, f0, speed)
  elif ctype == 'nasal':
+ return self._synth_nasal(params, f0, speed, formant_shift, breathiness)
  elif ctype == 'liquid':
+ return self._synth_liquid(params, f0, speed, formant_shift, breathiness)
  elif ctype == 'glide':
+ return self._synth_glide(params, f0, speed, formant_shift, breathiness)

  return np.zeros(100, dtype=np.float32)

  mask2 = (phase >= 0.4) & (phase < 0.6)
  glottal[mask2] = np.cos(np.pi * (phase[mask2] - 0.4) / 0.4)

  glottal += np.random.randn(len(t)) * breathiness
  shimmer = 1 + 0.02 * np.sin(2 * np.pi * 5 * t)
  glottal *= shimmer

  return env

+ def _synth_stop(self, params, f0, speed, formant_shift):
  closure_ms = params['closure'] / speed
  burst_ms = params['burst'] / speed

  return audio

+ def _synth_fricative(self, params, f0, speed):
  dur_ms = params['dur'] / speed
  n = int(self.sr * dur_ms / 1000)

  return audio.astype(np.float32)

+ def _synth_affricate(self, params, f0, speed):
  closure_ms = params['closure'] / speed
  fric_ms = params['fric'] / speed

  return audio

+ def _synth_nasal(self, params, f0, speed, formant_shift, breathiness):
  dur_ms = params['dur'] / speed
  n = int(self.sr * dur_ms / 1000)
  t = np.arange(n) / self.sr

  return audio.astype(np.float32)

+ def _synth_liquid(self, params, f0, speed, formant_shift, breathiness):
  dur_ms = params['dur'] / speed
  n = int(self.sr * dur_ms / 1000)
  t = np.arange(n) / self.sr

  return audio.astype(np.float32)

+ def _synth_glide(self, params, f0, speed, formant_shift, breathiness):
  dur_ms = params['dur'] / speed
  n = int(self.sr * dur_ms / 1000)
  t = np.arange(n) / self.sr

  if seg_len <= 0:
  break

+ seg_to_add = seg[:seg_len].copy()

  if i > 0 and pos > overlap:
  fade_len = min(overlap, seg_len)

  fade_out = np.linspace(1, 0, fade_len) ** 0.5

  audio[pos:pos + fade_len] *= fade_out
  seg_to_add[:fade_len] *= fade_in

  audio[pos:end_pos] += seg_to_add
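Two details in this hunk are easy to miss: `seg[:seg_len]` is a NumPy view, so the added `.copy()` keeps `seg_to_add[:fade_len] *= fade_in` from mutating the underlying segment array, and the `** 0.5` on the linear ramps makes the overlap roughly equal-power. A minimal sketch of that overlap-add join (the `crossfade` helper is illustrative):

```python
import numpy as np

def crossfade(a, b, overlap):
    """Equal-power overlap-add of two segments (illustrative sketch)."""
    out = np.zeros(len(a) + len(b) - overlap, dtype=np.float32)
    out[:len(a)] = a
    fade_out = np.linspace(1, 0, overlap) ** 0.5  # sqrt ramps: fade_in^2 +
    fade_in = np.linspace(0, 1, overlap) ** 0.5   # fade_out^2 = 1 (constant power)
    out[len(a) - overlap:len(a)] *= fade_out
    b = b.copy()                                  # don't mutate the caller's array
    b[:overlap] *= fade_in
    out[len(a) - overlap:] += b
    return out

x = crossfade(np.ones(200, np.float32), np.ones(300, np.float32), 50)
print(len(x))  # 450
```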
 
  self.voice_analyzer = VoiceAnalyzer(sample_rate)
  self.current_voice = VOICE_PROFILES["Emma (Female)"]

+ def get_voice(self, voice_name):
  if voice_name in VOICE_PROFILES:
+ return VOICE_PROFILES[voice_name]
  elif voice_name in custom_voices:
+ return custom_voices[voice_name]
+ return self.current_voice

  def speak(self, text, rate=1.0, pitch=1.0, voice_name=None):
  if not text or not text.strip():
  return np.zeros(self.sr, dtype=np.float32)

+ voice = self.get_voice(voice_name) if voice_name else self.current_voice

  phonemes = self.text_to_phoneme.convert(text)

  if not phonemes:
  return np.zeros(self.sr, dtype=np.float32)

+ audio = self.synthesizer.synthesize(phonemes, voice, rate, pitch)

  return audio

  def train_voice(self, audio_data, voice_name="My Voice"):
  """Train a new voice from audio sample"""
+ global custom_voices
+
  if audio_data is None:
  return None

+ # Handle tuple format (sample_rate, audio)
  if isinstance(audio_data, tuple):
  sr, audio = audio_data
  audio = audio.astype(np.float32)
+
+ # Handle stereo
+ if len(audio.shape) > 1:
+ audio = audio.mean(axis=1)
+
+ # Resample if needed
  if sr != self.sr:
  duration = len(audio) / sr
  new_length = int(duration * self.sr)
  audio = signal.resample(audio, new_length)

  audio = audio_data.astype(np.float32)

  # Normalize
+ max_val = np.max(np.abs(audio))
+ if max_val > 0:
+ audio = audio / max_val

+ # Analyze
  profile = self.voice_analyzer.analyze(audio)

  if profile:
  profile['name'] = voice_name
+ profile['description'] = f"Trained voice (F0={profile['f0']:.0f}Hz)"
  custom_voices[voice_name] = profile
  return profile
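Gradio's `type="numpy"` audio component hands back a `(sample_rate, data)` tuple, and stereo recordings arrive with shape `(n, 2)`, which is why the stereo downmix and resampling now happen before analysis. The same preprocessing in isolation (the `prepare_sample` helper is illustrative, with 22050 Hz assumed as the synthesizer rate):

```python
import numpy as np
from scipy import signal

def prepare_sample(sr, audio, target_sr=22050):
    """Mono-downmix, resample, and peak-normalize a raw recording (sketch)."""
    audio = audio.astype(np.float32)
    if audio.ndim > 1:                  # stereo (n, 2) -> mono (n,)
        audio = audio.mean(axis=1)
    if sr != target_sr:                 # match the synthesizer's rate
        audio = signal.resample(audio, int(len(audio) / sr * target_sr))
    peak = np.max(np.abs(audio))
    return audio / peak if peak > 0 else audio

mono = prepare_sample(48000, np.zeros((48000, 2), dtype=np.int16))
print(mono.shape)  # (22050,)
```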
 
 

  # ============================================
+ # HELPER FUNCTIONS
+ # ============================================
+
+ def get_all_voices():
+ """Get list of all available voices"""
+ voices = list(VOICE_PROFILES.keys()) + list(custom_voices.keys())
+ return voices
+
+
+ def get_voice_info(voice_name):
+ """Get info about a voice"""
+ if voice_name in VOICE_PROFILES:
+ v = VOICE_PROFILES[voice_name]
+ elif voice_name in custom_voices:
+ v = custom_voices[voice_name]
+ else:
+ return "Select a voice"
+
+ return f"""
+ **{v.get('name', voice_name)}**
+ - Type: {v.get('gender', 'unknown').title()}
+ - Pitch: {v.get('f0', 130):.0f} Hz
+ - {v.get('description', '')}
+ """
+
+
+ # ============================================
+ # GRADIO FUNCTIONS
  # ============================================

  def synthesize(text, voice_name, rate, pitch):
+ """Synthesize speech"""
  if not text or not text.strip():
  return None

  try:
  pitch_mult = 2 ** (pitch / 12)
+ audio = tts.speak(text, rate=rate, pitch=pitch_mult, voice_name=voice_name)

  if len(audio) < 100:
  return None

  return (SAMPLE_RATE, audio_int16)

  except Exception as e:
+ print(f"Synthesis error: {e}")
  return None

  def train_voice(audio, voice_name):
+ """Train a new voice from audio"""
+ global custom_voices
+
  if audio is None:
+ return "❌ Please record or upload audio first.", get_all_voices()

  if not voice_name or not voice_name.strip():
+ voice_name = f"Custom Voice {len(custom_voices) + 1}"

  voice_name = voice_name.strip()[:30]

+ # Check if name already exists
+ if voice_name in VOICE_PROFILES:
+ voice_name = f"{voice_name} (custom)"
+
  try:
  profile = tts.train_voice(audio, voice_name)

  if profile:
+ result = f"""
+ **Voice "{voice_name}" created!**

+ **Detected Parameters:**
  - Pitch (F0): {profile['f0']:.1f} Hz
  - Pitch Variation: {profile['f0_variation']:.1f} Hz
  - Formant Shift: {profile['formant_shift']:.2f}
  - Breathiness: {profile['breathiness']:.3f}
  - Brightness: {profile['brightness']:.2f}
+
+ You can now select this voice in the Speak tab!
  """
+ return result, get_all_voices()
  else:
+ return "❌ Could not analyze voice. Try a longer/clearer sample.", get_all_voices()

  except Exception as e:
+ return f"❌ Error: {str(e)}", get_all_voices()

  def create_custom_voice(name, pitch, formant, breathiness, speed, brightness):
+ """Create a custom voice from parameters"""
+ global custom_voices
+
  if not name or not name.strip():
+ return "❌ Please enter a voice name.", get_all_voices()
+
+ name = name.strip()[:30]

+ if name in VOICE_PROFILES:
+ name = f"{name} (custom)"

  profile = {
  "name": name,

  "breathiness": breathiness / 100,
  "speed": speed,
  "brightness": brightness,
+ "description": f"Custom voice (F0={pitch}Hz)"
  }

  custom_voices[name] = profile

+ return f"✅ Voice **{name}** created! Select it in the Speak tab.", get_all_voices()
+

+ def refresh_voices():
+ """Refresh the voice list"""
+ return gr.update(choices=get_all_voices())
+
+
+ # ============================================
+ # GRADIO INTERFACE
+ # ============================================
 
  with gr.Blocks(title="Vedes TTS", theme=gr.themes.Soft()) as demo:

  gr.Markdown("""

  with gr.Column(scale=2):
  text_input = gr.Textbox(
  label="📝 Text to Speak",
+ placeholder="Type something...",
  lines=3
  )

+ with gr.Row():
+ voice_select = gr.Dropdown(
+ choices=get_all_voices(),
+ value="Emma (Female)",
+ label="🗣️ Voice",
+ interactive=True
+ )
+ refresh_btn = gr.Button("🔄", size="sm")

  voice_info = gr.Markdown("Select a voice")

  gr.Markdown("""
  ### Train a New Voice from Audio

+ Record or upload 3-10 seconds of clear speech.

+ **Tips:**
+ - Speak naturally and clearly
  - Avoid background noise
+ - Read a few sentences
  """)

  with gr.Row():
  with gr.Column():
  audio_input = gr.Audio(
+ label="🎤 Record or Upload",
  sources=["microphone", "upload"],
  type="numpy"
  )

  voice_name_input = gr.Textbox(
  label="Voice Name",
+ placeholder="e.g., My Voice",
+ value=""
  )

  train_btn = gr.Button("🧠 Train Voice", variant="primary")

  with gr.Column():
+ train_result = gr.Markdown("Record audio and click Train")
+
+ gr.Markdown("""
+ ### What Gets Analyzed:
+ - **Pitch (F0)**: How high/low the voice is
+ - **Formants**: Voice quality/timbre
+ - **Breathiness**: Air in the voice
+ """)

  # ===== CREATE VOICE TAB =====
  with gr.TabItem("⚙️ Create Voice"):
+ gr.Markdown("### Create Custom Voice Manually")

  with gr.Row():
  with gr.Column():

  custom_pitch = gr.Slider(
  60, 300, 150,
  label="Pitch (Hz)",
+ info="60-130=Male, 150-250=Female, 250+=Child"
  )

  custom_formant = gr.Slider(
  0.7, 1.4, 1.0, step=0.05,
  label="Formant Shift",
+ info="<1.0=Male, >1.0=Female/Child"
  )

  custom_breathiness = gr.Slider(
  1, 10, 3,
+ label="Breathiness (%)"
  )

  custom_speed = gr.Slider(

  custom_brightness = gr.Slider(
  0.8, 1.3, 1.0, step=0.05,
+ label="Brightness"
  )

  create_btn = gr.Button("✨ Create Voice", variant="primary")

  with gr.Column():
  create_result = gr.Markdown("")

  gr.Markdown("""
+ ### Quick Presets:

+ | Type | Pitch | Formant |
+ |------|-------|---------|
+ | Deep Male | 85 | 0.85 |
+ | Male | 120 | 0.92 |
+ | Female | 200 | 1.12 |
+ | High Female | 240 | 1.20 |
+ | Child | 280 | 1.25 |
+ """)

+ # ===== ALL VOICES TAB =====
  with gr.TabItem("👥 All Voices"):
+ gr.Markdown("### Pre-built Voices")

+ voice_info_md = ""
  for name, v in VOICE_PROFILES.items():
+ voice_info_md += f"""
  **{name}**
+ - Type: {v['gender'].title()} | Pitch: {v['f0']} Hz
  - {v['description']}
  """
+ gr.Markdown(voice_info_md)
+
+ gr.Markdown("### Custom Voices")
+ custom_voices_display = gr.Markdown("*No custom voices yet*")

+ # ===== EVENT HANDLERS =====
+
+ # Speak tab
  voice_select.change(get_voice_info, voice_select, voice_info)
+ refresh_btn.click(refresh_voices, outputs=voice_select)
  speak_btn.click(synthesize, [text_input, voice_select, rate, pitch], audio_out)
  text_input.submit(synthesize, [text_input, voice_select, rate, pitch], audio_out)

+ # Train tab - Fixed: update choices first, then set value separately
+ def train_and_update(audio, name):
+ result, voices = train_voice(audio, name)
+ # Return result and updated dropdown with new choices
+ return result, gr.update(choices=voices)
+
  train_btn.click(
+ train_and_update,
+ [audio_input, voice_name_input],
+ [train_result, voice_select]
  )

+ # Create tab - Fixed similarly
+ def create_and_update(name, pitch, formant, breathiness, speed, brightness):
+ result, voices = create_custom_voice(name, pitch, formant, breathiness, speed, brightness)
+ return result, gr.update(choices=voices)
+
  create_btn.click(
+ create_and_update,
  [custom_name, custom_pitch, custom_formant, custom_breathiness,
  custom_speed, custom_brightness],
+ [create_result, voice_select]
  )
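This is the core of the commit: the old handlers wrote into separate, dead-end dropdowns, while the wrappers above route every handler's dropdown output to the single `voice_select` component and return `gr.update(choices=...)`, the Gradio pattern for changing a component's choices after the app has launched. The pattern in miniature (a standalone illustrative demo, not part of app.py):

```python
import gradio as gr

choices = ["Emma (Female)", "James (Male)"]

def add_voice(name):
    choices.append(name)
    # gr.update re-renders the dropdown with new choices and selects the new entry
    return gr.update(choices=choices, value=name)

with gr.Blocks() as mini:
    name_box = gr.Textbox(label="New voice name")
    dropdown = gr.Dropdown(choices=choices, label="Voice")
    gr.Button("Add").click(add_voice, name_box, dropdown)
```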


+ # ============================================
+ # LAUNCH
+ # ============================================
+
  if __name__ == "__main__":
  demo.launch()