Spaces:

vedaco
/

Vedes

Sleeping

App Files Community

vedaco committed on Jan 7

Commit

9ee9e3a

verified ·

1 Parent(s): b67fedc

Update app.py

Browse files

Files changed (1) hide show

app.py +234 -207

app.py CHANGED Viewed

@@ -8,7 +8,7 @@ import json
 import os
 # ============================================
-# VEDES TTS - WITH VOICE TRAINING
 # 100% From Scratch - No APIs
 # ============================================
@@ -22,9 +22,9 @@ VOICE_PROFILES = {
     "Emma (Female)": {
         "name": "Emma",
         "gender": "female",
-        "f0": 210,  # Higher pitch
         "f0_variation": 30,
-        "formant_shift": 1.15,  # Shift formants up
         "breathiness": 0.04,
         "speed": 1.0,
         "brightness": 1.1,
@@ -33,9 +33,9 @@ VOICE_PROFILES = {
     "James (Male)": {
         "name": "James",
         "gender": "male",
-        "f0": 110,  # Lower pitch
         "f0_variation": 20,
-        "formant_shift": 0.9,  # Shift formants down
         "breathiness": 0.02,
         "speed": 0.95,
         "brightness": 0.95,
@@ -44,7 +44,7 @@ VOICE_PROFILES = {
     "Sophie (Child)": {
         "name": "Sophie",
         "gender": "child",
-        "f0": 280,  # High pitch
         "f0_variation": 40,
         "formant_shift": 1.25,
         "breathiness": 0.03,
@@ -55,7 +55,7 @@ VOICE_PROFILES = {
     "David (Deep Male)": {
         "name": "David",
         "gender": "male",
-        "f0": 85,  # Very deep
         "f0_variation": 15,
         "formant_shift": 0.82,
         "breathiness": 0.02,
@@ -109,7 +109,7 @@ VOICE_PROFILES = {
     },
 }
-# Custom voices storage
 custom_voices = {}
 # ============================================
@@ -366,7 +366,7 @@ LETTERS = {
 # ============================================
-# VOICE ANALYZER - Extract Voice Features
 # ============================================
 class VoiceAnalyzer:
@@ -377,19 +377,18 @@ class VoiceAnalyzer:
     def analyze(self, audio):
         """Extract voice features from audio sample"""
-        if len(audio) < self.sr * 0.5:
             return None
-        # Normalize
         audio = audio.astype(np.float32)
-        audio = audio / (np.max(np.abs(audio)) + 1e-8)
-        # Extract features
         f0 = self._estimate_pitch(audio)
         formants = self._estimate_formants(audio)
         breathiness = self._estimate_breathiness(audio)
-        # Create voice profile
         profile = {
             "name": "Custom Voice",
             "gender": "custom",
@@ -399,40 +398,45 @@ class VoiceAnalyzer:
             "breathiness": breathiness,
             "speed": 1.0,
             "brightness": formants.get('brightness', 1.0),
-            "description": "Voice extracted from audio sample"
         }
         return profile
     def _estimate_pitch(self, audio):
-        """Estimate fundamental frequency (F0)"""
-        # Use autocorrelation
-        frame_size = int(self.sr * 0.03)  # 30ms frames
         pitches = []
         for i in range(0, len(audio) - frame_size, frame_size):
             frame = audio[i:i + frame_size]
             # Autocorrelation
             corr = np.correlate(frame, frame, mode='full')
             corr = corr[len(corr)//2:]
-            # Find first peak after initial decline
             d = np.diff(corr)
-            start = np.where(d > 0)[0]
-            if len(start) > 0:
-                start = start[0]
-                peak = start + np.argmax(corr[start:start + int(self.sr / 80)])
-                if peak > 0:
-                    f0 = self.sr / peak
-                    if 60 < f0 < 400:
-                        pitches.append(f0)
         if pitches:
-            return np.median(pitches)
-        return 130  # Default
     def _estimate_f0_variation(self, audio, base_f0):
         """Estimate pitch variation"""
@@ -441,41 +445,42 @@ class VoiceAnalyzer:
         for i in range(0, len(audio) - frame_size, frame_size):
             frame = audio[i:i + frame_size]
             corr = np.correlate(frame, frame, mode='full')
             corr = corr[len(corr)//2:]
             d = np.diff(corr)
-            start = np.where(d > 0)[0]
-            if len(start) > 0:
-                start = start[0]
-                peak = start + np.argmax(corr[start:start + int(self.sr / 80)])
-                if peak > 0:
-                    f0 = self.sr / peak
-                    if 60 < f0 < 400:
-                        pitches.append(f0)
         if len(pitches) > 2:
-            return min(np.std(pitches), 50)
-        return 20
     def _estimate_formants(self, audio):
         """Estimate formant characteristics"""
-        # Simple spectral analysis
         frame_size = 2048
         if len(audio) < frame_size:
             return {'shift': 1.0, 'brightness': 1.0}
-        # Get spectrum
         spectrum = np.abs(np.fft.rfft(audio[:frame_size] * np.hanning(frame_size)))
         freqs = np.fft.rfftfreq(frame_size, 1/self.sr)
-        # Find spectral centroid
-        centroid = np.sum(freqs * spectrum) / (np.sum(spectrum) + 1e-8)
-        # Estimate formant shift based on centroid
-        # Average male ~1200Hz, female ~1400Hz
         if centroid > 1600:
             shift = 1.2
             brightness = 1.15
@@ -495,7 +500,7 @@ class VoiceAnalyzer:
         return {'shift': shift, 'brightness': brightness}
     def _estimate_breathiness(self, audio):
-        """Estimate breathiness/aspiration"""
         frame_size = 2048
         if len(audio) < frame_size:
@@ -504,12 +509,14 @@ class VoiceAnalyzer:
         spectrum = np.abs(np.fft.rfft(audio[:frame_size]))
         freqs = np.fft.rfftfreq(frame_size, 1/self.sr)
-        # High frequency energy ratio (breathiness indicator)
-        low_energy = np.sum(spectrum[freqs < 1000])
-        high_energy = np.sum(spectrum[(freqs > 2000) & (freqs < 5000)])
-        ratio = high_energy / (low_energy + 1e-8)
-        breathiness = np.clip(ratio * 0.1, 0.02, 0.1)
         return breathiness
@@ -569,7 +576,7 @@ class TextToPhoneme:
 # ============================================
-# VOICE-AWARE FORMANT SYNTHESIZER
 # ============================================
 class VoiceSynthesizer:
@@ -583,7 +590,6 @@ class VoiceSynthesizer:
         voice = voice_profile or self.default_voice
-        # Get voice parameters
         f0 = voice.get('f0', 130) * pitch
         f0_var = voice.get('f0_variation', 20)
         formant_shift = voice.get('formant_shift', 1.0)
@@ -597,7 +603,6 @@ class VoiceSynthesizer:
             prev_phon = phonemes[i - 1] if i > 0 else None
             next_phon = phonemes[i + 1] if i < len(phonemes) - 1 else None
-            # Add pitch variation
             phrase_pos = i / max(len(phonemes), 1)
             f0_current = f0 + f0_var * np.sin(phrase_pos * np.pi) * 0.5
@@ -620,39 +625,28 @@ class VoiceSynthesizer:
         if phon in VOWELS:
             return self._synth_vowel(phon, f0, speed, formant_shift,
-                                     breathiness, brightness, prev_phon, next_phon)
         if phon in CONSONANTS:
             return self._synth_consonant(phon, f0, speed, formant_shift, breathiness)
         return np.zeros(100, dtype=np.float32)
-    def _synth_vowel(self, phon, f0, speed, formant_shift, breathiness,
-                     brightness, prev_phon, next_phon):
         params = VOWELS[phon]
         f1, f2, f3, dur_ms, amp, voiced = params
-        # Apply formant shift
         f1 = f1 * formant_shift
-        f2 = f2 * formant_shift
-        f3 = f3 * formant_shift
-        # Apply brightness
-        f2 = f2 * brightness
-        f3 = f3 * brightness
         dur_ms = dur_ms / speed
         n = int(self.sr * dur_ms / 1000)
         n = max(n, 100)
         t = np.arange(n) / self.sr
-        # Generate glottal source with voice characteristics
         source = self._glottal_source(t, f0, breathiness)
-        # Apply formants
         audio = self._apply_formants(source, f1, f2, f3)
-        # Apply envelope
         envelope = self._vowel_envelope(n)
         audio = audio * envelope * amp
@@ -663,17 +657,17 @@ class VoiceSynthesizer:
         ctype = params['type']
         if ctype == 'stop':
-            return self._synth_stop(phon, params, f0, speed, formant_shift)
         elif ctype == 'fric':
-            return self._synth_fricative(phon, params, f0, speed)
         elif ctype == 'affric':
-            return self._synth_affricate(phon, params, f0, speed)
         elif ctype == 'nasal':
-            return self._synth_nasal(phon, params, f0, speed, formant_shift, breathiness)
         elif ctype == 'liquid':
-            return self._synth_liquid(phon, params, f0, speed, formant_shift, breathiness)
         elif ctype == 'glide':
-            return self._synth_glide(phon, params, f0, speed, formant_shift, breathiness)
         return np.zeros(100, dtype=np.float32)
@@ -688,10 +682,7 @@ class VoiceSynthesizer:
         mask2 = (phase >= 0.4) & (phase < 0.6)
         glottal[mask2] = np.cos(np.pi * (phase[mask2] - 0.4) / 0.4)
-        # Add breathiness
         glottal += np.random.randn(len(t)) * breathiness
-        # Add shimmer
         shimmer = 1 + 0.02 * np.sin(2 * np.pi * 5 * t)
         glottal *= shimmer
@@ -743,7 +734,7 @@ class VoiceSynthesizer:
         return env
-    def _synth_stop(self, phon, params, f0, speed, formant_shift):
         closure_ms = params['closure'] / speed
         burst_ms = params['burst'] / speed
@@ -775,7 +766,7 @@ class VoiceSynthesizer:
         return audio
-    def _synth_fricative(self, phon, params, f0, speed):
         dur_ms = params['dur'] / speed
         n = int(self.sr * dur_ms / 1000)
@@ -802,7 +793,7 @@ class VoiceSynthesizer:
         return audio.astype(np.float32)
-    def _synth_affricate(self, phon, params, f0, speed):
         closure_ms = params['closure'] / speed
         fric_ms = params['fric'] / speed
@@ -837,7 +828,7 @@ class VoiceSynthesizer:
         return audio
-    def _synth_nasal(self, phon, params, f0, speed, formant_shift, breathiness):
         dur_ms = params['dur'] / speed
         n = int(self.sr * dur_ms / 1000)
         t = np.arange(n) / self.sr
@@ -863,7 +854,7 @@ class VoiceSynthesizer:
         return audio.astype(np.float32)
-    def _synth_liquid(self, phon, params, f0, speed, formant_shift, breathiness):
         dur_ms = params['dur'] / speed
         n = int(self.sr * dur_ms / 1000)
         t = np.arange(n) / self.sr
@@ -879,7 +870,7 @@ class VoiceSynthesizer:
         return audio.astype(np.float32)
-    def _synth_glide(self, phon, params, f0, speed, formant_shift, breathiness):
         dur_ms = params['dur'] / speed
         n = int(self.sr * dur_ms / 1000)
         t = np.arange(n) / self.sr
@@ -919,7 +910,7 @@ class VoiceSynthesizer:
             if seg_len <= 0:
                 break
-            seg_to_add = seg[:seg_len]
             if i > 0 and pos > overlap:
                 fade_len = min(overlap, seg_len)
@@ -927,7 +918,6 @@ class VoiceSynthesizer:
                 fade_out = np.linspace(1, 0, fade_len) ** 0.5
                 audio[pos:pos + fade_len] *= fade_out
-                seg_to_add = seg_to_add.copy()
                 seg_to_add[:fade_len] *= fade_in
             audio[pos:end_pos] += seg_to_add
@@ -964,39 +954,45 @@ class VedesTTS:
         self.voice_analyzer = VoiceAnalyzer(sample_rate)
         self.current_voice = VOICE_PROFILES["Emma (Female)"]
-    def set_voice(self, voice_name):
         if voice_name in VOICE_PROFILES:
-            self.current_voice = VOICE_PROFILES[voice_name]
         elif voice_name in custom_voices:
-            self.current_voice = custom_voices[voice_name]
     def speak(self, text, rate=1.0, pitch=1.0, voice_name=None):
         if not text or not text.strip():
             return np.zeros(self.sr, dtype=np.float32)
-        if voice_name:
-            self.set_voice(voice_name)
         phonemes = self.text_to_phoneme.convert(text)
         if not phonemes:
             return np.zeros(self.sr, dtype=np.float32)
-        audio = self.synthesizer.synthesize(phonemes, self.current_voice, rate, pitch)
         return audio
     def train_voice(self, audio_data, voice_name="My Voice"):
         """Train a new voice from audio sample"""
         if audio_data is None:
             return None
-        # Handle different input formats
         if isinstance(audio_data, tuple):
             sr, audio = audio_data
             audio = audio.astype(np.float32)
             if sr != self.sr:
-                # Resample
                 duration = len(audio) / sr
                 new_length = int(duration * self.sr)
                 audio = signal.resample(audio, new_length)
@@ -1004,13 +1000,16 @@ class VedesTTS:
             audio = audio_data.astype(np.float32)
         # Normalize
-        audio = audio / (np.max(np.abs(audio)) + 1e-8)
-        # Analyze voice
         profile = self.voice_analyzer.analyze(audio)
         if profile:
             profile['name'] = voice_name
             custom_voices[voice_name] = profile
             return profile
@@ -1033,10 +1032,38 @@ print("=" * 50)
 # ============================================
-# GRADIO INTERFACE
 # ============================================
 def synthesize(text, voice_name, rate, pitch):
     if not text or not text.strip():
         return None
@@ -1044,17 +1071,7 @@ def synthesize(text, voice_name, rate, pitch):
     try:
         pitch_mult = 2 ** (pitch / 12)
-        # Check custom voices first
-        if voice_name in custom_voices:
-            voice = custom_voices[voice_name]
-        elif voice_name in VOICE_PROFILES:
-            voice = VOICE_PROFILES[voice_name]
-        else:
-            voice = VOICE_PROFILES["Emma (Female)"]
-        tts.current_voice = voice
-        audio = tts.speak(text, rate=rate, pitch=pitch_mult)
         if len(audio) < 100:
             return None
@@ -1065,67 +1082,61 @@ def synthesize(text, voice_name, rate, pitch):
         return (SAMPLE_RATE, audio_int16)
     except Exception as e:
-        print(f"Error: {e}")
         return None
 def train_voice(audio, voice_name):
     if audio is None:
-        return "❌ No audio provided", gr.update(choices=get_all_voices())
     if not voice_name or not voice_name.strip():
-        voice_name = "My Voice"
     voice_name = voice_name.strip()[:30]
     try:
         profile = tts.train_voice(audio, voice_name)
         if profile:
-            details = f"""
-✅ Voice "{voice_name}" created successfully!
-**Voice Parameters:**
 - Pitch (F0): {profile['f0']:.1f} Hz
 - Pitch Variation: {profile['f0_variation']:.1f} Hz
 - Formant Shift: {profile['formant_shift']:.2f}
 - Breathiness: {profile['breathiness']:.3f}
 - Brightness: {profile['brightness']:.2f}
 """
-            return details, gr.update(choices=get_all_voices(), value=voice_name)
         else:
-            return "❌ Could not analyze voice. Try a longer sample.", gr.update(choices=get_all_voices())
     except Exception as e:
-        return f"❌ Error: {str(e)}", gr.update(choices=get_all_voices())
-def get_all_voices():
-    voices = list(VOICE_PROFILES.keys()) + list(custom_voices.keys())
-    return voices
-def get_voice_info(voice_name):
-    if voice_name in VOICE_PROFILES:
-        v = VOICE_PROFILES[voice_name]
-    elif voice_name in custom_voices:
-        v = custom_voices[voice_name]
-    else:
-        return "Select a voice"
-    return f"""
-**{v.get('name', voice_name)}**
-- Type: {v.get('gender', 'unknown').title()}
-- Pitch: {v.get('f0', 130):.0f} Hz
-- {v.get('description', '')}
-"""
 def create_custom_voice(name, pitch, formant, breathiness, speed, brightness):
     if not name or not name.strip():
-        return "❌ Please enter a voice name", gr.update(choices=get_all_voices())
-    name = name.strip()
     profile = {
         "name": name,
@@ -1136,15 +1147,23 @@ def create_custom_voice(name, pitch, formant, breathiness, speed, brightness):
         "breathiness": breathiness / 100,
         "speed": speed,
         "brightness": brightness,
-        "description": f"Custom voice (pitch={pitch}Hz)"
     }
     custom_voices[name] = profile
-    return f"✅ Voice '{name}' created!", gr.update(choices=get_all_voices(), value=name)
-# Build interface
 with gr.Blocks(title="Vedes TTS", theme=gr.themes.Soft()) as demo:
     gr.Markdown("""
@@ -1159,15 +1178,18 @@ with gr.Blocks(title="Vedes TTS", theme=gr.themes.Soft()) as demo:
                 with gr.Column(scale=2):
                     text_input = gr.Textbox(
                         label="📝 Text to Speak",
-                        placeholder="Type here...",
                         lines=3
                     )
-                    voice_select = gr.Dropdown(
-                        choices=get_all_voices(),
-                        value="Emma (Female)",
-                        label="🗣️ Voice"
-                    )
                     voice_info = gr.Markdown("Select a voice")
@@ -1197,44 +1219,43 @@ with gr.Blocks(title="Vedes TTS", theme=gr.themes.Soft()) as demo:
             gr.Markdown("""
             ### Train a New Voice from Audio
-            Record or upload an audio sample, and Vedes will extract the voice characteristics.
-            **Tips for best results:**
-            - Record 5-10 seconds of speech
-            - Speak clearly and naturally
             - Avoid background noise
             """)
             with gr.Row():
                 with gr.Column():
                     audio_input = gr.Audio(
-                        label="🎤 Record or Upload Audio",
                         sources=["microphone", "upload"],
                         type="numpy"
                     )
                     voice_name_input = gr.Textbox(
                         label="Voice Name",
-                        placeholder="My Voice",
-                        value="My Voice"
                     )
                     train_btn = gr.Button("🧠 Train Voice", variant="primary")
                 with gr.Column():
-                    train_result = gr.Markdown("Upload audio and click Train")
-                    trained_voice_select = gr.Dropdown(
-                        choices=get_all_voices(),
-                        label="Use Trained Voice"
-                    )
         # ===== CREATE VOICE TAB =====
         with gr.TabItem("⚙️ Create Voice"):
-            gr.Markdown("""
-            ### Create Custom Voice Manually
-            Adjust the parameters to create your own voice:
-            """)
             with gr.Row():
                 with gr.Column():
@@ -1246,19 +1267,18 @@ with gr.Blocks(title="Vedes TTS", theme=gr.themes.Soft()) as demo:
                     custom_pitch = gr.Slider(
                         60, 300, 150,
                         label="Pitch (Hz)",
-                        info="80-130 = Male, 150-250 = Female, 250+ = Child"
                     )
                     custom_formant = gr.Slider(
                         0.7, 1.4, 1.0, step=0.05,
                         label="Formant Shift",
-                        info="<1.0 = Larger vocal tract (male), >1.0 = Smaller (female)"
                     )
                     custom_breathiness = gr.Slider(
                         1, 10, 3,
-                        label="Breathiness",
-                        info="Higher = more breathy/airy voice"
                     )
                     custom_speed = gr.Slider(
@@ -1268,72 +1288,79 @@ with gr.Blocks(title="Vedes TTS", theme=gr.themes.Soft()) as demo:
                     custom_brightness = gr.Slider(
                         0.8, 1.3, 1.0, step=0.05,
-                        label="Brightness",
-                        info="Higher = brighter, more forward voice"
                     )
                     create_btn = gr.Button("✨ Create Voice", variant="primary")
                 with gr.Column():
                     create_result = gr.Markdown("")
-                    created_voice_select = gr.Dropdown(
-                        choices=get_all_voices(),
-                        label="Created Voices"
-                    )
                     gr.Markdown("""
-                    ### Voice Parameter Guide
-                    | Parameter | Male | Female | Child |
-                    |-----------|------|--------|-------|
-                    | Pitch | 80-130 Hz | 150-250 Hz | 250-350 Hz |
-                    | Formant | 0.85-0.95 | 1.05-1.20 | 1.20-1.35 |
-                    | Breathiness | 2-4 | 3-6 | 2-4 |
-                    | Brightness | 0.9-1.0 | 1.0-1.15 | 1.1-1.25 |
                     """)
-        # ===== VOICES TAB =====
         with gr.TabItem("👥 All Voices"):
-            gr.Markdown("### Available Voices")
-            voice_cards = ""
             for name, v in VOICE_PROFILES.items():
-                voice_cards += f"""
 **{name}**
-- Type: {v['gender'].title()}
-- Pitch: {v['f0']} Hz
 - {v['description']}
----
 """
-            gr.Markdown(voice_cards)
-    # Event handlers
     voice_select.change(get_voice_info, voice_select, voice_info)
     speak_btn.click(synthesize, [text_input, voice_select, rate, pitch], audio_out)
     text_input.submit(synthesize, [text_input, voice_select, rate, pitch], audio_out)
     train_btn.click(
-        train_voice,
-        [audio_input, voice_name_input],
-        [train_result, trained_voice_select]
     )
     create_btn.click(
-        create_custom_voice,
         [custom_name, custom_pitch, custom_formant, custom_breathiness,
          custom_speed, custom_brightness],
-        [create_result, created_voice_select]
-    )
-    # Update voice selectors when new voices are created
-    trained_voice_select.change(
-        lambda x: x, trained_voice_select, voice_select
-    )
-    created_voice_select.change(
-        lambda x: x, created_voice_select, voice_select
     )
 if __name__ == "__main__":
     demo.launch()

 import os
 # ============================================
+# VEDES TTS - WITH VOICE TRAINING (FIXED)
 # 100% From Scratch - No APIs
 # ============================================
     "Emma (Female)": {
         "name": "Emma",
         "gender": "female",
+        "f0": 210,
         "f0_variation": 30,
+        "formant_shift": 1.15,
         "breathiness": 0.04,
         "speed": 1.0,
         "brightness": 1.1,
     "James (Male)": {
         "name": "James",
         "gender": "male",
+        "f0": 110,
         "f0_variation": 20,
+        "formant_shift": 0.9,
         "breathiness": 0.02,
         "speed": 0.95,
         "brightness": 0.95,
     "Sophie (Child)": {
         "name": "Sophie",
         "gender": "child",
+        "f0": 280,
         "f0_variation": 40,
         "formant_shift": 1.25,
         "breathiness": 0.03,
     "David (Deep Male)": {
         "name": "David",
         "gender": "male",
+        "f0": 85,
         "f0_variation": 15,
         "formant_shift": 0.82,
         "breathiness": 0.02,
     },
 }
+# Custom voices storage (global)
 custom_voices = {}
 # ============================================
 # ============================================
+# VOICE ANALYZER
 # ============================================
 class VoiceAnalyzer:
     def analyze(self, audio):
         """Extract voice features from audio sample"""
+        if len(audio) < self.sr * 0.3:
             return None
         audio = audio.astype(np.float32)
+        max_val = np.max(np.abs(audio))
+        if max_val > 0:
+            audio = audio / max_val
         f0 = self._estimate_pitch(audio)
         formants = self._estimate_formants(audio)
         breathiness = self._estimate_breathiness(audio)
         profile = {
             "name": "Custom Voice",
             "gender": "custom",
             "breathiness": breathiness,
             "speed": 1.0,
             "brightness": formants.get('brightness', 1.0),
+            "description": f"Custom voice (F0={f0:.0f}Hz)"
         }
         return profile
     def _estimate_pitch(self, audio):
+        """Estimate fundamental frequency using autocorrelation"""
+        frame_size = int(self.sr * 0.03)
         pitches = []
         for i in range(0, len(audio) - frame_size, frame_size):
             frame = audio[i:i + frame_size]
+            # Remove DC
+            frame = frame - np.mean(frame)
             # Autocorrelation
             corr = np.correlate(frame, frame, mode='full')
             corr = corr[len(corr)//2:]
+            # Find peaks
             d = np.diff(corr)
+            start_indices = np.where(d > 0)[0]
+            if len(start_indices) > 0:
+                start = start_indices[0]
+                search_end = min(start + int(self.sr / 60), len(corr))
+                if search_end > start:
+                    peak = start + np.argmax(corr[start:search_end])
+                    if peak > 0:
+                        f0 = self.sr / peak
+                        if 60 < f0 < 400:
+                            pitches.append(f0)
         if pitches:
+            return float(np.median(pitches))
+        return 130.0
     def _estimate_f0_variation(self, audio, base_f0):
         """Estimate pitch variation"""
         for i in range(0, len(audio) - frame_size, frame_size):
             frame = audio[i:i + frame_size]
+            frame = frame - np.mean(frame)
             corr = np.correlate(frame, frame, mode='full')
             corr = corr[len(corr)//2:]
             d = np.diff(corr)
+            start_indices = np.where(d > 0)[0]
+            if len(start_indices) > 0:
+                start = start_indices[0]
+                search_end = min(start + int(self.sr / 60), len(corr))
+                if search_end > start:
+                    peak = start + np.argmax(corr[start:search_end])
+                    if peak > 0:
+                        f0 = self.sr / peak
+                        if 60 < f0 < 400:
+                            pitches.append(f0)
         if len(pitches) > 2:
+            return min(float(np.std(pitches)), 50.0)
+        return 20.0
     def _estimate_formants(self, audio):
         """Estimate formant characteristics"""
         frame_size = 2048
         if len(audio) < frame_size:
             return {'shift': 1.0, 'brightness': 1.0}
         spectrum = np.abs(np.fft.rfft(audio[:frame_size] * np.hanning(frame_size)))
         freqs = np.fft.rfftfreq(frame_size, 1/self.sr)
+        total_energy = np.sum(spectrum) + 1e-8
+        centroid = np.sum(freqs * spectrum) / total_energy
         if centroid > 1600:
             shift = 1.2
             brightness = 1.15
         return {'shift': shift, 'brightness': brightness}
     def _estimate_breathiness(self, audio):
+        """Estimate breathiness"""
         frame_size = 2048
         if len(audio) < frame_size:
         spectrum = np.abs(np.fft.rfft(audio[:frame_size]))
         freqs = np.fft.rfftfreq(frame_size, 1/self.sr)
+        low_mask = freqs < 1000
+        high_mask = (freqs > 2000) & (freqs < 5000)
+        low_energy = np.sum(spectrum[low_mask]) + 1e-8
+        high_energy = np.sum(spectrum[high_mask])
+        ratio = high_energy / low_energy
+        breathiness = float(np.clip(ratio * 0.1, 0.02, 0.1))
         return breathiness
 # ============================================
+# VOICE SYNTHESIZER
 # ============================================
 class VoiceSynthesizer:
         voice = voice_profile or self.default_voice
         f0 = voice.get('f0', 130) * pitch
         f0_var = voice.get('f0_variation', 20)
         formant_shift = voice.get('formant_shift', 1.0)
             prev_phon = phonemes[i - 1] if i > 0 else None
             next_phon = phonemes[i + 1] if i < len(phonemes) - 1 else None
             phrase_pos = i / max(len(phonemes), 1)
             f0_current = f0 + f0_var * np.sin(phrase_pos * np.pi) * 0.5
         if phon in VOWELS:
             return self._synth_vowel(phon, f0, speed, formant_shift,
+                                     breathiness, brightness)
         if phon in CONSONANTS:
             return self._synth_consonant(phon, f0, speed, formant_shift, breathiness)
         return np.zeros(100, dtype=np.float32)
+    def _synth_vowel(self, phon, f0, speed, formant_shift, breathiness, brightness):
         params = VOWELS[phon]
         f1, f2, f3, dur_ms, amp, voiced = params
         f1 = f1 * formant_shift
+        f2 = f2 * formant_shift * brightness
+        f3 = f3 * formant_shift * brightness
         dur_ms = dur_ms / speed
         n = int(self.sr * dur_ms / 1000)
         n = max(n, 100)
         t = np.arange(n) / self.sr
         source = self._glottal_source(t, f0, breathiness)
         audio = self._apply_formants(source, f1, f2, f3)
         envelope = self._vowel_envelope(n)
         audio = audio * envelope * amp
         ctype = params['type']
         if ctype == 'stop':
+            return self._synth_stop(params, f0, speed, formant_shift)
         elif ctype == 'fric':
+            return self._synth_fricative(params, f0, speed)
         elif ctype == 'affric':
+            return self._synth_affricate(params, f0, speed)
         elif ctype == 'nasal':
+            return self._synth_nasal(params, f0, speed, formant_shift, breathiness)
         elif ctype == 'liquid':
+            return self._synth_liquid(params, f0, speed, formant_shift, breathiness)
         elif ctype == 'glide':
+            return self._synth_glide(params, f0, speed, formant_shift, breathiness)
         return np.zeros(100, dtype=np.float32)
         mask2 = (phase >= 0.4) & (phase < 0.6)
         glottal[mask2] = np.cos(np.pi * (phase[mask2] - 0.4) / 0.4)
         glottal += np.random.randn(len(t)) * breathiness
         shimmer = 1 + 0.02 * np.sin(2 * np.pi * 5 * t)
         glottal *= shimmer
         return env
+    def _synth_stop(self, params, f0, speed, formant_shift):
         closure_ms = params['closure'] / speed
         burst_ms = params['burst'] / speed
         return audio
+    def _synth_fricative(self, params, f0, speed):
         dur_ms = params['dur'] / speed
         n = int(self.sr * dur_ms / 1000)
         return audio.astype(np.float32)
+    def _synth_affricate(self, params, f0, speed):
         closure_ms = params['closure'] / speed
         fric_ms = params['fric'] / speed
         return audio
+    def _synth_nasal(self, params, f0, speed, formant_shift, breathiness):
         dur_ms = params['dur'] / speed
         n = int(self.sr * dur_ms / 1000)
         t = np.arange(n) / self.sr
         return audio.astype(np.float32)
+    def _synth_liquid(self, params, f0, speed, formant_shift, breathiness):
         dur_ms = params['dur'] / speed
         n = int(self.sr * dur_ms / 1000)
         t = np.arange(n) / self.sr
         return audio.astype(np.float32)
+    def _synth_glide(self, params, f0, speed, formant_shift, breathiness):
         dur_ms = params['dur'] / speed
         n = int(self.sr * dur_ms / 1000)
         t = np.arange(n) / self.sr
             if seg_len <= 0:
                 break
+            seg_to_add = seg[:seg_len].copy()
             if i > 0 and pos > overlap:
                 fade_len = min(overlap, seg_len)
                 fade_out = np.linspace(1, 0, fade_len) ** 0.5
                 audio[pos:pos + fade_len] *= fade_out
                 seg_to_add[:fade_len] *= fade_in
             audio[pos:end_pos] += seg_to_add
         self.voice_analyzer = VoiceAnalyzer(sample_rate)
         self.current_voice = VOICE_PROFILES["Emma (Female)"]
+    def get_voice(self, voice_name):
         if voice_name in VOICE_PROFILES:
+            return VOICE_PROFILES[voice_name]
         elif voice_name in custom_voices:
+            return custom_voices[voice_name]
+        return self.current_voice
     def speak(self, text, rate=1.0, pitch=1.0, voice_name=None):
         # Convert text to phonemes and hand them to the synthesizer together
         # with the selected voice, rate and pitch; returns the synthesizer's
         # output unchanged.
         # Empty or whitespace-only text yields a float32 np.zeros(self.sr)
         # buffer (one second of silence, assuming self.sr is samples/second).
         if not text or not text.strip():
             return np.zeros(self.sr, dtype=np.float32)
         # A falsy voice_name (None or "") keeps self.current_voice.
+        voice = self.get_voice(voice_name) if voice_name else self.current_voice
         phonemes = self.text_to_phoneme.convert(text)
         # Nothing pronounceable: return the same silent buffer as above.
         if not phonemes:
             return np.zeros(self.sr, dtype=np.float32)
+        audio = self.synthesizer.synthesize(phonemes, voice, rate, pitch)
         return audio
     def train_voice(self, audio_data, voice_name="My Voice"):
         """Train a new voice from audio sample"""
         # audio_data may be None (returns None), a (sample_rate, samples)
         # tuple, or an object exposing .astype.
         # `global` is not strictly needed here: custom_voices is only mutated
         # below, never rebound.
+        global custom_voices
         if audio_data is None:
             return None
+        # Handle tuple format (sample_rate, audio)
         if isinstance(audio_data, tuple):
             sr, audio = audio_data
             audio = audio.astype(np.float32)
+            # Handle stereo
             # Averages over axis 1 (assumes samples are shaped
             # (n_samples, n_channels) -- TODO confirm).
+            if len(audio.shape) > 1:
+                audio = audio.mean(axis=1)
+            # Resample if needed
             if sr != self.sr:
                 # Keep the duration constant while changing the sample count.
                 duration = len(audio) / sr
                 new_length = int(duration * self.sr)
                 audio = signal.resample(audio, new_length)
             # NOTE(review): as shown here this line sits inside the tuple
             # branch, where audio_data is a tuple and has no .astype; an
             # `else:` line is presumably elided above -- confirm against the
             # full file.
             audio = audio_data.astype(np.float32)
         # Normalize
         # Peak-normalize; skipped when the peak is 0 to avoid dividing by zero.
+        max_val = np.max(np.abs(audio))
+        if max_val > 0:
+            audio = audio / max_val
+        # Analyze
         profile = self.voice_analyzer.analyze(audio)
         if profile:
             profile['name'] = voice_name
+            profile['description'] = f"Trained voice (F0={profile['f0']:.0f}Hz)"
             # Registers the profile under voice_name, replacing any custom
             # voice already stored under that key.
             custom_voices[voice_name] = profile
             return profile
 # ============================================
+# HELPER FUNCTIONS
+# ============================================
+def get_all_voices():
+    """Get list of all available voices"""
     # Built-in profile names come first, followed by custom voice names.
+    voices = list(VOICE_PROFILES.keys()) + list(custom_voices.keys())
+    return voices
+def get_voice_info(voice_name):
+    """Get info about a voice"""
     # Look the name up in the built-in profiles first, then in custom voices.
+    if voice_name in VOICE_PROFILES:
+        v = VOICE_PROFILES[voice_name]
+    elif voice_name in custom_voices:
+        v = custom_voices[voice_name]
+    else:
         # Name matches neither table: return the placeholder text.
+        return "Select a voice"
     # Build a Markdown summary; missing keys fall back to defaults
     # (voice_name for the name, 'unknown' gender, 130 for f0, '' description).
+    return f"""
+**{v.get('name', voice_name)}**
+- Type: {v.get('gender', 'unknown').title()}
+- Pitch: {v.get('f0', 130):.0f} Hz
+- {v.get('description', '')}
+"""
+# ============================================
+# GRADIO FUNCTIONS
 # ============================================
 def synthesize(text, voice_name, rate, pitch):
+    """Synthesize speech"""
     # Returns (SAMPLE_RATE, samples) on success and None when there is
     # nothing to play or synthesis fails.
     if not text or not text.strip():
         return None
     try:
         # `pitch` is treated as a semitone offset: 2 ** (pitch / 12) turns it
         # into the frequency multiplier passed to tts.speak.
         pitch_mult = 2 ** (pitch / 12)
+        audio = tts.speak(text, rate=rate, pitch=pitch_mult, voice_name=voice_name)
         # Outputs shorter than 100 samples are discarded.
         if len(audio) < 100:
             return None
         # NOTE(review): audio_int16 is not assigned in the lines visible here;
         # its conversion from `audio` is presumably elided -- confirm.
         return (SAMPLE_RATE, audio_int16)
     except Exception as e:
         # Any failure is printed and reported to the caller as None.
+        print(f"Synthesis error: {e}")
         return None
 def train_voice(audio, voice_name):
+    """Train a new voice from audio"""
     # Every path returns a (markdown_message, list_of_voice_names) pair.
+    global custom_voices
     if audio is None:
+        return "❌ Please record or upload audio first.", get_all_voices()
     # Blank names get a numbered default based on the current custom count.
     if not voice_name or not voice_name.strip():
+        voice_name = f"Custom Voice {len(custom_voices) + 1}"
     # Trim surrounding whitespace and cap the name at 30 characters.
     voice_name = voice_name.strip()[:30]
+    # Check if name already exists
     # Only collisions with built-in profiles are renamed; collisions with
     # existing custom voices are not checked here.
+    if voice_name in VOICE_PROFILES:
+        voice_name = f"{voice_name} (custom)"
     try:
         profile = tts.train_voice(audio, voice_name)
         if profile:
+            result = f"""
+✅ **Voice "{voice_name}" created!**
+**Detected Parameters:**
 - Pitch (F0): {profile['f0']:.1f} Hz
 - Pitch Variation: {profile['f0_variation']:.1f} Hz
 - Formant Shift: {profile['formant_shift']:.2f}
 - Breathiness: {profile['breathiness']:.3f}
 - Brightness: {profile['brightness']:.2f}
+You can now select this voice in the Speak tab!
 """
+            return result, get_all_voices()
         else:
             # A falsy profile means tts.train_voice produced nothing usable.
+            return "❌ Could not analyze voice. Try a longer/clearer sample.", get_all_voices()
     except Exception as e:
         # Any exception is surfaced to the user as an error message.
+        return f"❌ Error: {str(e)}", get_all_voices()
 def create_custom_voice(name, pitch, formant, breathiness, speed, brightness):
+    """Create a custom voice from parameters"""
     # Every path returns a (markdown_message, list_of_voice_names) pair.
+    global custom_voices
     if not name or not name.strip():
+        return "❌ Please enter a voice name.", get_all_voices()
     # Trim surrounding whitespace and cap the name at 30 characters.
+    name = name.strip()[:30]
     # Avoid shadowing a built-in profile; an existing custom voice with the
     # same name is overwritten below.
+    if name in VOICE_PROFILES:
+        name = f"{name} (custom)"
     # NOTE(review): in the lines visible here `pitch` is only used in the
     # description and `formant` is unused; the corresponding dict entries
     # are presumably elided -- confirm against the full file.
     profile = {
         "name": name,
         # breathiness is divided by 100 before being stored.
         "breathiness": breathiness / 100,
         "speed": speed,
         "brightness": brightness,
+        "description": f"Custom voice (F0={pitch}Hz)"
     }
     custom_voices[name] = profile
+    return f"✅ Voice **{name}** created! Select it in the Speak tab.", get_all_voices()
+def refresh_voices():
+    """Refresh the voice list"""
     # Returns a gr.update carrying the current voice names as `choices`.
+    return gr.update(choices=get_all_voices())
+# ============================================
+# GRADIO INTERFACE
+# ============================================
 with gr.Blocks(title="Vedes TTS", theme=gr.themes.Soft()) as demo:
     gr.Markdown("""
                 with gr.Column(scale=2):
                     text_input = gr.Textbox(
                         label="📝 Text to Speak",
+                        placeholder="Type something...",
                         lines=3
                     )
+                    with gr.Row():
+                        voice_select = gr.Dropdown(
+                            choices=get_all_voices(),
+                            value="Emma (Female)",
+                            label="🗣️ Voice",
+                            interactive=True
+                        )
+                        refresh_btn = gr.Button("🔄", size="sm")
                     voice_info = gr.Markdown("Select a voice")
             gr.Markdown("""
             ### Train a New Voice from Audio
+            Record or upload 3-10 seconds of clear speech.
+            **Tips:**
+            - Speak naturally and clearly
             - Avoid background noise
+            - Read a few sentences
             """)
             with gr.Row():
                 with gr.Column():
                     audio_input = gr.Audio(
+                        label="🎤 Record or Upload",
                         sources=["microphone", "upload"],
                         type="numpy"
                     )
                     voice_name_input = gr.Textbox(
                         label="Voice Name",
+                        placeholder="e.g., My Voice",
+                        value=""
                     )
                     train_btn = gr.Button("🧠 Train Voice", variant="primary")
                 with gr.Column():
+                    train_result = gr.Markdown("Record audio and click Train")
+                    gr.Markdown("""
+                    ### What Gets Analyzed:
+                    - **Pitch (F0)**: How high/low the voice is
+                    - **Formants**: Voice quality/timbre
+                    - **Breathiness**: Air in the voice
+                    """)
         # ===== CREATE VOICE TAB =====
         with gr.TabItem("⚙️ Create Voice"):
+            gr.Markdown("### Create Custom Voice Manually")
             with gr.Row():
                 with gr.Column():
                     custom_pitch = gr.Slider(
                         60, 300, 150,
                         label="Pitch (Hz)",
+                        info="60-130=Male, 150-250=Female, 250+=Child"
                     )
                     custom_formant = gr.Slider(
                         0.7, 1.4, 1.0, step=0.05,
                         label="Formant Shift",
+                        info="<1.0=Male, >1.0=Female/Child"
                     )
                     custom_breathiness = gr.Slider(
                         1, 10, 3,
+                        label="Breathiness (%)"
                     )
                     custom_speed = gr.Slider(
                     custom_brightness = gr.Slider(
                         0.8, 1.3, 1.0, step=0.05,
+                        label="Brightness"
                     )
                     create_btn = gr.Button("✨ Create Voice", variant="primary")
                 with gr.Column():
                     create_result = gr.Markdown("")
                     gr.Markdown("""
+                    ### Quick Presets:
+                    | Type | Pitch | Formant |
+                    |------|-------|---------|
+                    | Deep Male | 85 | 0.85 |
+                    | Male | 120 | 0.92 |
+                    | Female | 200 | 1.12 |
+                    | High Female | 240 | 1.20 |
+                    | Child | 280 | 1.25 |
                     """)
+        # ===== ALL VOICES TAB =====
         with gr.TabItem("👥 All Voices"):
+            gr.Markdown("### Pre-built Voices")
+            voice_info_md = ""
             for name, v in VOICE_PROFILES.items():
+                voice_info_md += f"""
 **{name}**
+- Type: {v['gender'].title()} | Pitch: {v['f0']} Hz
 - {v['description']}
 """
+            gr.Markdown(voice_info_md)
+            gr.Markdown("### Custom Voices")
+            custom_voices_display = gr.Markdown("*No custom voices yet*")
+    # ===== EVENT HANDLERS =====
+    # Speak tab
     voice_select.change(get_voice_info, voice_select, voice_info)
+    refresh_btn.click(refresh_voices, outputs=voice_select)
     speak_btn.click(synthesize, [text_input, voice_select, rate, pitch], audio_out)
     text_input.submit(synthesize, [text_input, voice_select, rate, pitch], audio_out)
+    # Train tab - Fixed: update choices first, then set value separately
+    def train_and_update(audio, name):
         # Adapts train_voice's (message, voice_names) pair to the two
         # outputs this handler is wired to: the message is passed through
         # and the names become new dropdown choices.
+        result, voices = train_voice(audio, name)
+        # Return result and updated dropdown with new choices
+        return result, gr.update(choices=voices)
     train_btn.click(
+        train_and_update,
+        [audio_input, voice_name_input],
+        [train_result, voice_select]
     )
+    # Create tab - Fixed similarly
+    def create_and_update(name, pitch, formant, breathiness, speed, brightness):
         # Adapts create_custom_voice's (message, voice_names) pair: the
         # message is passed through and the names become new dropdown choices.
+        result, voices = create_custom_voice(name, pitch, formant, breathiness, speed, brightness)
+        return result, gr.update(choices=voices)
     create_btn.click(
+        create_and_update,
         [custom_name, custom_pitch, custom_formant, custom_breathiness,
          custom_speed, custom_brightness],
+        [create_result, voice_select]
     )
+# ============================================
+# LAUNCH
+# ============================================
 if __name__ == "__main__":
     demo.launch()