Spaces:

vedaco
/

Vedes

Sleeping

App Files Files Community

vedaco committed 17 days ago

Commit

3194e3d

verified ·

1 Parent(s): 29be232

Update app.py

Browse files

Files changed (1) hide show

app.py +905 -296

app.py CHANGED Viewed

@@ -1,354 +1,963 @@
-import gradio as gr
 import numpy as np
-import asyncio
-import edge_tts
-import tempfile
-import os
-from scipy.io import wavfile
 from scipy import signal
-import io
 # ============================================
-# VEDES TTS - Text-to-Speech System
 # ============================================
-print("=" * 50)
-print("🎙️ Initializing Vedes TTS...")
-print("=" * 50)
-# Available voices
-VOICES = {
-    "Emma (US Female)": "en-US-EmmaNeural",
-    "Jenny (US Female)": "en-US-JennyNeural",
-    "Aria (US Female)": "en-US-AriaNeural",
-    "Guy (US Male)": "en-US-GuyNeural",
-    "Eric (US Male)": "en-US-EricNeural",
-    "Ryan (UK Male)": "en-GB-RyanNeural",
-    "Sonia (UK Female)": "en-GB-SoniaNeural",
-    "Natasha (AU Female)": "en-AU-NatashaNeural",
-    "William (AU Male)": "en-AU-WilliamNeural",
 }
-DEFAULT_VOICE = "en-US-EmmaNeural"
-SAMPLE_RATE = 24000
-async def synthesize_async(text, voice, rate, pitch):
-    """Async TTS synthesis using edge-tts"""
-    # Format rate and pitch for edge-tts
-    rate_str = f"{'+' if rate >= 0 else ''}{int(rate)}%"
-    pitch_str = f"{'+' if pitch >= 0 else ''}{int(pitch)}Hz"
-    communicate = edge_tts.Communicate(
-        text=text,
-        voice=voice,
-        rate=rate_str,
-        pitch=pitch_str
-    )
-    # Save to temporary file
-    with tempfile.NamedTemporaryFile(delete=False, suffix=".mp3") as tmp_file:
-        tmp_path = tmp_file.name
-    await communicate.save(tmp_path)
-    return tmp_path
-def synthesize_speech(text, voice_name, speaking_rate, pitch_shift):
-    """
-    Main synthesis function
-    Args:
-        text: Input text to synthesize
-        voice_name: Selected voice
-        speaking_rate: Speed adjustment (-50 to +50)
-        pitch_shift: Pitch adjustment in Hz (-20 to +20)
-    Returns:
-        Path to generated audio file
-    """
-    if not text or len(text.strip()) == 0:
-        return None
-    text = text.strip()[:5000]  # Limit text length
-    # Get voice ID
-    voice = VOICES.get(voice_name, DEFAULT_VOICE)
-    # Convert speaking rate to percentage
-    rate = int((speaking_rate - 1.0) * 100)
-    # Convert pitch shift
-    pitch = int(pitch_shift * 10)
-    try:
-        # Run async synthesis
-        loop = asyncio.new_event_loop()
-        asyncio.set_event_loop(loop)
-        audio_path = loop.run_until_complete(
-            synthesize_async(text, voice, rate, pitch)
-        )
-        loop.close()
-        return audio_path
-    except Exception as e:
-        print(f"Synthesis error: {e}")
-        return None
-def text_analysis(text):
-    """Analyze text and return statistics"""
-    if not text:
-        return ""
-    words = text.split()
-    sentences = text.replace('!', '.').replace('?', '.').split('.')
-    sentences = [s.strip() for s in sentences if s.strip()]
-    char_count = len(text)
-    word_count = len(words)
-    sentence_count = len(sentences)
-    # Estimate duration (average 150 words per minute)
-    est_duration = word_count / 150 * 60
-    return f"""
-    📊 **Text Analysis:**
-    - Characters: {char_count}
-    - Words: {word_count}
-    - Sentences: {sentence_count}
-    - Estimated Duration: {est_duration:.1f} seconds
-    """
 # ============================================
-# GRADIO INTERFACE
 # ============================================
-# Custom CSS
-custom_css = """
-.gradio-container {
-    max-width: 900px !important;
-}
-.title-text {
-    text-align: center;
-    background: linear-gradient(90deg, #667eea 0%, #764ba2 100%);
-    -webkit-background-clip: text;
-    -webkit-text-fill-color: transparent;
-    font-size: 2.5rem;
-    font-weight: bold;
-}
-.subtitle-text {
-    text-align: center;
-    color: #666;
-}
-"""
-with gr.Blocks(
-    title="Vedes TTS",
-    css=custom_css,
-    theme=gr.themes.Soft(
-        primary_hue="purple",
-        secondary_hue="blue",
-    )
-) as demo:
-    # Header
-    gr.HTML("""
-        <div style="text-align: center; padding: 20px;">
-            <h1 class="title-text">🎙️ Vedes TTS</h1>
-            <p class="subtitle-text">High-Quality Text-to-Speech Synthesis</p>
-        </div>
-    """)
-    with gr.Tabs():
-        # Main TTS Tab
-        with gr.TabItem("🔊 Text to Speech"):
-            with gr.Row():
-                with gr.Column(scale=2):
-                    text_input = gr.Textbox(
-                        label="📝 Enter Text",
-                        placeholder="Type or paste your text here...\n\nExample: Hello! Welcome to Vedes, a high-quality text-to-speech system. I can read any text you provide with natural-sounding speech.",
-                        lines=6,
-                        max_lines=15
-                    )
-                    text_stats = gr.Markdown("")
-                    with gr.Row():
-                        voice_select = gr.Dropdown(
-                            choices=list(VOICES.keys()),
-                            value="Emma (US Female)",
-                            label="🗣️ Select Voice",
-                            interactive=True
-                        )
-                    with gr.Row():
-                        speaking_rate = gr.Slider(
-                            minimum=0.5,
-                            maximum=2.0,
-                            value=1.0,
-                            step=0.1,
-                            label="⏱️ Speaking Rate",
-                            info="0.5x = Slow, 1.0x = Normal, 2.0x = Fast"
-                        )
-                        pitch_shift = gr.Slider(
-                            minimum=-2.0,
-                            maximum=2.0,
-                            value=0.0,
-                            step=0.1,
-                            label="🎵 Pitch Adjustment",
-                            info="Adjust voice pitch"
-                        )
-                    synthesize_btn = gr.Button(
-                        "🔊 Generate Speech",
-                        variant="primary",
-                        size="lg"
-                    )
-                with gr.Column(scale=1):
-                    audio_output = gr.Audio(
-                        label="🎧 Generated Speech",
-                        type="filepath"
-                    )
-                    gr.Markdown("""
-                    ### 💡 Tips:
-                    - Use punctuation for natural pauses
-                    - Add commas for short pauses
-                    - Add periods for longer pauses
-                    - Use "!" and "?" for expression
-                    """)
-        # Examples Tab
-        with gr.TabItem("📚 Examples"):
-            gr.Markdown("### Click any example to try it:")
-            examples = [
-                ["Hello! Welcome to Vedes text-to-speech. I hope you're having a wonderful day!"],
-                ["The quick brown fox jumps over the lazy dog. This sentence contains every letter of the alphabet."],
-                ["In a world where technology advances rapidly, artificial intelligence continues to reshape how we live and work."],
-                ["Once upon a time, in a land far away, there lived a wise old wizard who knew the secrets of the universe."],
-                ["Breaking news: Scientists have discovered a new species of butterfly in the Amazon rainforest."],
-                ["To be, or not to be, that is the question. Whether 'tis nobler in the mind to suffer the slings and arrows of outrageous fortune."],
-                ["Good morning! Today's weather forecast predicts sunny skies with a high of 75 degrees Fahrenheit."],
-                ["Thank you for using Vedes TTS. We appreciate your interest in our text-to-speech technology!"],
-            ]
-            gr.Examples(
-                examples=examples,
-                inputs=text_input,
-                label=""
-            )
-        # Voices Tab
-        with gr.TabItem("🎭 Voice Gallery"):
-            gr.Markdown("""
-            ### Available Voices:
-            | Voice | Gender | Accent | Best For |
-            |-------|--------|--------|----------|
-            | Emma | Female | US English | General, Friendly |
-            | Jenny | Female | US English | Professional, Clear |
-            | Aria | Female | US English | Conversational |
-            | Guy | Male | US English | Narration, Calm |
-            | Eric | Male | US English | News, Formal |
-            | Ryan | Male | UK English | British content |
-            | Sonia | Female | UK English | British content |
-            | Natasha | Female | AU English | Australian content |
-            | William | Male | AU English | Australian content |
-            ---
-            ### 🎯 Voice Selection Tips:
-            - **For storytelling:** Try Emma or Guy
-            - **For news/formal:** Try Jenny or Eric
-            - **For casual content:** Try Aria
-            - **For British accent:** Try Ryan or Sonia
-            - **For Australian accent:** Try Natasha or William
-            """)
-        # About Tab
-        with gr.TabItem("ℹ️ About"):
-            gr.Markdown("""
-            ## 🎙️ About Vedes TTS
-            **Vedes** is a text-to-speech application that converts written text into natural-sounding speech.
-            ### ✨ Features:
-            - 🗣️ **9 High-Quality Voices** - Male and female voices with different accents
-            - 🌍 **Multiple Accents** - US, UK, and Australian English
-            - ⏱️ **Adjustable Speed** - From 0.5x to 2.0x speaking rate
-            - 🎵 **Pitch Control** - Fine-tune the voice pitch
-            - 📱 **Easy to Use** - Simple, intuitive interface
-            - ⚡ **Fast Generation** - Quick audio synthesis
-            ### 🔧 How It Works:
-            1. **Enter Text** - Type or paste your text
-            2. **Select Voice** - Choose from 9 available voices
-            3. **Adjust Settings** - Modify speed and pitch if needed
-            4. **Generate** - Click the button to create speech
-            5. **Listen & Download** - Play or save the audio
-            ### 📖 Best Practices:
-            - Use proper punctuation for natural speech rhythm
-            - Break long texts into paragraphs
-            - Use commas for short pauses, periods for longer ones
-            - Add question marks and exclamation points for expression
-            ---
-            ### 🛠️ Technical Details:
-            - **Engine:** Neural TTS
-            - **Audio Format:** MP3
-            - **Sample Rate:** 24kHz
-            - **Max Text Length:** 5000 characters
-            ---
-            *Built with ❤️ using Python and Gradio*
-            """)
-    # Footer
-    gr.HTML("""
-        <div style="text-align: center; padding: 20px; color: #888;">
-            <p>Vedes TTS © 2024 | Powered by Neural Speech Synthesis</p>
-        </div>
-    """)
-    # Event Handlers
-    text_input.change(
-        fn=text_analysis,
         inputs=text_input,
-        outputs=text_stats
     )
-    synthesize_btn.click(
         fn=synthesize_speech,
-        inputs=[text_input, voice_select, speaking_rate, pitch_shift],
         outputs=audio_output
     )
     text_input.submit(
         fn=synthesize_speech,
-        inputs=[text_input, voice_select, speaking_rate, pitch_shift],
         outputs=audio_output
     )
-# Launch
-print("✅ Vedes TTS Ready!")
-print("=" * 50)
 if __name__ == "__main__":
     demo.launch()

 import numpy as np
+import gradio as gr
 from scipy import signal
+from scipy.io import wavfile
+import tempfile
+import re
 # ============================================
+# VEDES TTS - 100% FROM SCRATCH
+# No APIs, No Pre-trained Models
 # ============================================
+SAMPLE_RATE = 22050
+# ============================================
+# PHONEME DATABASE WITH ACCURATE FORMANTS
+# Based on linguistic research data
+# ============================================
+PHONEMES = {
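+    # Each entry is a dict: formants 'f1'-'f4', 'dur' (duration in ms), a 'voiced' flag,
+    # plus optional class flags/extras ('stop', 'fricative', 'nasal', 'burst_freq', ...)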
+    # Formant values based on Peterson & Barney (1952) research
+    # Front vowels
+    'IY': {'f1': 270, 'f2': 2290, 'f3': 3010, 'f4': 3300, 'dur': 80, 'voiced': True},   # beat
+    'IH': {'f1': 390, 'f2': 1990, 'f3': 2550, 'f4': 3300, 'dur': 60, 'voiced': True},   # bit
+    'EH': {'f1': 530, 'f2': 1840, 'f3': 2480, 'f4': 3300, 'dur': 70, 'voiced': True},   # bet
+    'AE': {'f1': 660, 'f2': 1720, 'f3': 2410, 'f4': 3300, 'dur': 90, 'voiced': True},   # bat
+    # Back vowels
+    'AA': {'f1': 730, 'f2': 1090, 'f3': 2440, 'f4': 3300, 'dur': 100, 'voiced': True},  # father
+    'AO': {'f1': 570, 'f2': 840, 'f3': 2410, 'f4': 3300, 'dur': 100, 'voiced': True},   # bought
+    'UH': {'f1': 440, 'f2': 1020, 'f3': 2240, 'f4': 3300, 'dur': 70, 'voiced': True},   # book
+    'UW': {'f1': 300, 'f2': 870, 'f3': 2240, 'f4': 3300, 'dur': 90, 'voiced': True},    # boot
+    # Central vowels
+    'AH': {'f1': 520, 'f2': 1190, 'f3': 2390, 'f4': 3300, 'dur': 60, 'voiced': True},   # but
+    'ER': {'f1': 490, 'f2': 1350, 'f3': 1690, 'f4': 3300, 'dur': 90, 'voiced': True},   # bird
+    'AX': {'f1': 500, 'f2': 1500, 'f3': 2500, 'f4': 3300, 'dur': 40, 'voiced': True},   # about (schwa)
+    # Diphthongs
+    'EY': {'f1': 450, 'f2': 2000, 'f3': 2600, 'f4': 3300, 'dur': 120, 'voiced': True},  # bait
+    'AY': {'f1': 650, 'f2': 1200, 'f3': 2500, 'f4': 3300, 'dur': 130, 'voiced': True},  # bite
+    'OY': {'f1': 500, 'f2': 900, 'f3': 2500, 'f4': 3300, 'dur': 140, 'voiced': True},   # boy
+    'AW': {'f1': 650, 'f2': 1100, 'f3': 2500, 'f4': 3300, 'dur': 130, 'voiced': True},  # bout
+    'OW': {'f1': 450, 'f2': 850, 'f3': 2500, 'f4': 3300, 'dur': 120, 'voiced': True},   # boat
+    # Stops (plosives)
+    'P': {'f1': 300, 'f2': 1000, 'f3': 2500, 'f4': 3300, 'dur': 80, 'voiced': False, 'stop': True, 'burst_freq': 800},
+    'B': {'f1': 300, 'f2': 1000, 'f3': 2500, 'f4': 3300, 'dur': 60, 'voiced': True, 'stop': True, 'burst_freq': 800},
+    'T': {'f1': 300, 'f2': 1800, 'f3': 2500, 'f4': 3300, 'dur': 70, 'voiced': False, 'stop': True, 'burst_freq': 3000},
+    'D': {'f1': 300, 'f2': 1800, 'f3': 2500, 'f4': 3300, 'dur': 50, 'voiced': True, 'stop': True, 'burst_freq': 3000},
+    'K': {'f1': 300, 'f2': 2000, 'f3': 2500, 'f4': 3300, 'dur': 80, 'voiced': False, 'stop': True, 'burst_freq': 1500},
+    'G': {'f1': 300, 'f2': 2000, 'f3': 2500, 'f4': 3300, 'dur': 50, 'voiced': True, 'stop': True, 'burst_freq': 1500},
+    # Fricatives
+    'F': {'f1': 300, 'f2': 1100, 'f3': 2500, 'f4': 3300, 'dur': 90, 'voiced': False, 'fricative': True, 'fric_freq': 7000},
+    'V': {'f1': 300, 'f2': 1100, 'f3': 2500, 'f4': 3300, 'dur': 60, 'voiced': True, 'fricative': True, 'fric_freq': 7000},
+    'TH': {'f1': 300, 'f2': 1400, 'f3': 2500, 'f4': 3300, 'dur': 90, 'voiced': False, 'fricative': True, 'fric_freq': 5000},
+    'DH': {'f1': 300, 'f2': 1400, 'f3': 2500, 'f4': 3300, 'dur': 50, 'voiced': True, 'fricative': True, 'fric_freq': 5000},
+    'S': {'f1': 300, 'f2': 1800, 'f3': 2500, 'f4': 3300, 'dur': 100, 'voiced': False, 'fricative': True, 'fric_freq': 6000},
+    'Z': {'f1': 300, 'f2': 1800, 'f3': 2500, 'f4': 3300, 'dur': 70, 'voiced': True, 'fricative': True, 'fric_freq': 6000},
+    'SH': {'f1': 300, 'f2': 1900, 'f3': 2500, 'f4': 3300, 'dur': 100, 'voiced': False, 'fricative': True, 'fric_freq': 3500},
+    'ZH': {'f1': 300, 'f2': 1900, 'f3': 2500, 'f4': 3300, 'dur': 70, 'voiced': True, 'fricative': True, 'fric_freq': 3500},
+    'HH': {'f1': 500, 'f2': 1500, 'f3': 2500, 'f4': 3300, 'dur': 60, 'voiced': False, 'fricative': True, 'fric_freq': 1500},
+    # Affricates
+    'CH': {'f1': 300, 'f2': 1900, 'f3': 2500, 'f4': 3300, 'dur': 110, 'voiced': False, 'affricate': True},
+    'JH': {'f1': 300, 'f2': 1900, 'f3': 2500, 'f4': 3300, 'dur': 80, 'voiced': True, 'affricate': True},
+    # Nasals
+    'M': {'f1': 280, 'f2': 900, 'f3': 2200, 'f4': 3300, 'dur': 70, 'voiced': True, 'nasal': True},
+    'N': {'f1': 280, 'f2': 1700, 'f3': 2600, 'f4': 3300, 'dur': 60, 'voiced': True, 'nasal': True},
+    'NG': {'f1': 280, 'f2': 2300, 'f3': 2750, 'f4': 3300, 'dur': 70, 'voiced': True, 'nasal': True},
+    # Liquids
+    'L': {'f1': 350, 'f2': 1100, 'f3': 2700, 'f4': 3300, 'dur': 60, 'voiced': True, 'liquid': True},
+    'R': {'f1': 420, 'f2': 1300, 'f3': 1600, 'f4': 3300, 'dur': 60, 'voiced': True, 'liquid': True},
+    # Glides
+    'W': {'f1': 300, 'f2': 700, 'f3': 2200, 'f4': 3300, 'dur': 50, 'voiced': True, 'glide': True},
+    'Y': {'f1': 280, 'f2': 2200, 'f3': 2960, 'f4': 3300, 'dur': 50, 'voiced': True, 'glide': True},
+    # Silence
+    'SIL': {'f1': 0, 'f2': 0, 'f3': 0, 'f4': 0, 'dur': 80, 'voiced': False, 'silence': True},
+    'PAU': {'f1': 0, 'f2': 0, 'f3': 0, 'f4': 0, 'dur': 150, 'voiced': False, 'silence': True},
 }
+# ============================================
+# PRONUNCIATION DICTIONARY
+# ============================================
+DICTIONARY = {
+    # Common words
+    'a': ['AX'], 'the': ['DH', 'AX'], 'an': ['AE', 'N'],
+    'i': ['AY'], 'you': ['Y', 'UW'], 'he': ['HH', 'IY'],
+    'she': ['SH', 'IY'], 'it': ['IH', 'T'], 'we': ['W', 'IY'],
+    'they': ['DH', 'EY'], 'me': ['M', 'IY'], 'him': ['HH', 'IH', 'M'],
+    'her': ['HH', 'ER'], 'us': ['AH', 'S'], 'them': ['DH', 'EH', 'M'],
+    # Be verbs
+    'is': ['IH', 'Z'], 'are': ['AA', 'R'], 'was': ['W', 'AA', 'Z'],
+    'were': ['W', 'ER'], 'be': ['B', 'IY'], 'been': ['B', 'IH', 'N'],
+    'being': ['B', 'IY', 'IH', 'NG'], 'am': ['AE', 'M'],
+    # Have verbs
+    'have': ['HH', 'AE', 'V'], 'has': ['HH', 'AE', 'Z'],
+    'had': ['HH', 'AE', 'D'], 'having': ['HH', 'AE', 'V', 'IH', 'NG'],
+    # Do verbs
+    'do': ['D', 'UW'], 'does': ['D', 'AH', 'Z'], 'did': ['D', 'IH', 'D'],
+    'doing': ['D', 'UW', 'IH', 'NG'], 'done': ['D', 'AH', 'N'],
+    # Modal verbs
+    'will': ['W', 'IH', 'L'], 'would': ['W', 'UH', 'D'],
+    'can': ['K', 'AE', 'N'], 'could': ['K', 'UH', 'D'],
+    'should': ['SH', 'UH', 'D'], 'may': ['M', 'EY'],
+    'might': ['M', 'AY', 'T'], 'must': ['M', 'AH', 'S', 'T'],
+    # Common verbs
+    'go': ['G', 'OW'], 'goes': ['G', 'OW', 'Z'], 'going': ['G', 'OW', 'IH', 'NG'],
+    'went': ['W', 'EH', 'N', 'T'], 'gone': ['G', 'AO', 'N'],
+    'come': ['K', 'AH', 'M'], 'comes': ['K', 'AH', 'M', 'Z'],
+    'coming': ['K', 'AH', 'M', 'IH', 'NG'], 'came': ['K', 'EY', 'M'],
+    'get': ['G', 'EH', 'T'], 'gets': ['G', 'EH', 'T', 'S'],
+    'getting': ['G', 'EH', 'T', 'IH', 'NG'], 'got': ['G', 'AA', 'T'],
+    'make': ['M', 'EY', 'K'], 'makes': ['M', 'EY', 'K', 'S'],
+    'making': ['M', 'EY', 'K', 'IH', 'NG'], 'made': ['M', 'EY', 'D'],
+    'say': ['S', 'EY'], 'says': ['S', 'EH', 'Z'], 'said': ['S', 'EH', 'D'],
+    'saying': ['S', 'EY', 'IH', 'NG'],
+    'know': ['N', 'OW'], 'knows': ['N', 'OW', 'Z'], 'knew': ['N', 'UW'],
+    'known': ['N', 'OW', 'N'], 'knowing': ['N', 'OW', 'IH', 'NG'],
+    'think': ['TH', 'IH', 'NG', 'K'], 'thinks': ['TH', 'IH', 'NG', 'K', 'S'],
+    'thought': ['TH', 'AO', 'T'], 'thinking': ['TH', 'IH', 'NG', 'K', 'IH', 'NG'],
+    'take': ['T', 'EY', 'K'], 'takes': ['T', 'EY', 'K', 'S'],
+    'took': ['T', 'UH', 'K'], 'taken': ['T', 'EY', 'K', 'AX', 'N'],
+    'see': ['S', 'IY'], 'sees': ['S', 'IY', 'Z'], 'saw': ['S', 'AO'],
+    'seen': ['S', 'IY', 'N'], 'seeing': ['S', 'IY', 'IH', 'NG'],
+    'want': ['W', 'AA', 'N', 'T'], 'wants': ['W', 'AA', 'N', 'T', 'S'],
+    'wanted': ['W', 'AA', 'N', 'T', 'IH', 'D'],
+    'give': ['G', 'IH', 'V'], 'gives': ['G', 'IH', 'V', 'Z'],
+    'gave': ['G', 'EY', 'V'], 'given': ['G', 'IH', 'V', 'AX', 'N'],
+    'use': ['Y', 'UW', 'Z'], 'uses': ['Y', 'UW', 'Z', 'IH', 'Z'],
+    'used': ['Y', 'UW', 'Z', 'D'], 'using': ['Y', 'UW', 'Z', 'IH', 'NG'],
+    'find': ['F', 'AY', 'N', 'D'], 'found': ['F', 'AW', 'N', 'D'],
+    'tell': ['T', 'EH', 'L'], 'told': ['T', 'OW', 'L', 'D'],
+    'ask': ['AE', 'S', 'K'], 'asked': ['AE', 'S', 'K', 'T'],
+    'work': ['W', 'ER', 'K'], 'works': ['W', 'ER', 'K', 'S'],
+    'worked': ['W', 'ER', 'K', 'T'], 'working': ['W', 'ER', 'K', 'IH', 'NG'],
+    'try': ['T', 'R', 'AY'], 'tried': ['T', 'R', 'AY', 'D'],
+    'call': ['K', 'AO', 'L'], 'called': ['K', 'AO', 'L', 'D'],
+    'need': ['N', 'IY', 'D'], 'needed': ['N', 'IY', 'D', 'IH', 'D'],
+    'feel': ['F', 'IY', 'L'], 'feels': ['F', 'IY', 'L', 'Z'],
+    'become': ['B', 'IH', 'K', 'AH', 'M'],
+    'leave': ['L', 'IY', 'V'], 'left': ['L', 'EH', 'F', 'T'],
+    'put': ['P', 'UH', 'T'], 'keep': ['K', 'IY', 'P'],
+    'let': ['L', 'EH', 'T'], 'begin': ['B', 'IH', 'G', 'IH', 'N'],
+    'seem': ['S', 'IY', 'M'], 'help': ['HH', 'EH', 'L', 'P'],
+    'show': ['SH', 'OW'], 'hear': ['HH', 'IY', 'R'],
+    'play': ['P', 'L', 'EY'], 'run': ['R', 'AH', 'N'],
+    'move': ['M', 'UW', 'V'], 'live': ['L', 'IH', 'V'],
+    'believe': ['B', 'IH', 'L', 'IY', 'V'],
+    # Question words
+    'what': ['W', 'AH', 'T'], 'where': ['W', 'EH', 'R'],
+    'when': ['W', 'EH', 'N'], 'why': ['W', 'AY'],
+    'how': ['HH', 'AW'], 'who': ['HH', 'UW'],
+    'which': ['W', 'IH', 'CH'],
+    # Conjunctions
+    'and': ['AE', 'N', 'D'], 'or': ['AO', 'R'],
+    'but': ['B', 'AH', 'T'], 'if': ['IH', 'F'],
+    'then': ['DH', 'EH', 'N'], 'because': ['B', 'IH', 'K', 'AO', 'Z'],
+    'so': ['S', 'OW'], 'than': ['DH', 'AE', 'N'],
+    # Prepositions
+    'of': ['AH', 'V'], 'to': ['T', 'UW'], 'in': ['IH', 'N'],
+    'for': ['F', 'AO', 'R'], 'on': ['AA', 'N'], 'with': ['W', 'IH', 'TH'],
+    'at': ['AE', 'T'], 'by': ['B', 'AY'], 'from': ['F', 'R', 'AH', 'M'],
+    'up': ['AH', 'P'], 'about': ['AX', 'B', 'AW', 'T'],
+    'into': ['IH', 'N', 'T', 'UW'], 'over': ['OW', 'V', 'ER'],
+    'after': ['AE', 'F', 'T', 'ER'], 'out': ['AW', 'T'],
+    'down': ['D', 'AW', 'N'], 'off': ['AO', 'F'],
+    'under': ['AH', 'N', 'D', 'ER'], 'again': ['AX', 'G', 'EH', 'N'],
+    'there': ['DH', 'EH', 'R'], 'here': ['HH', 'IY', 'R'],
+    # Articles/Determiners
+    'this': ['DH', 'IH', 'S'], 'that': ['DH', 'AE', 'T'],
+    'these': ['DH', 'IY', 'Z'], 'those': ['DH', 'OW', 'Z'],
+    'my': ['M', 'AY'], 'your': ['Y', 'AO', 'R'],
+    'his': ['HH', 'IH', 'Z'], 'its': ['IH', 'T', 'S'],
+    'our': ['AW', 'ER'], 'their': ['DH', 'EH', 'R'],
+    'some': ['S', 'AH', 'M'], 'any': ['EH', 'N', 'IY'],
+    'no': ['N', 'OW'], 'all': ['AO', 'L'],
+    'each': ['IY', 'CH'], 'every': ['EH', 'V', 'R', 'IY'],
+    'both': ['B', 'OW', 'TH'], 'few': ['F', 'Y', 'UW'],
+    'more': ['M', 'AO', 'R'], 'most': ['M', 'OW', 'S', 'T'],
+    'other': ['AH', 'DH', 'ER'], 'such': ['S', 'AH', 'CH'],
+    # Adjectives
+    'good': ['G', 'UH', 'D'], 'new': ['N', 'UW'],
+    'first': ['F', 'ER', 'S', 'T'], 'last': ['L', 'AE', 'S', 'T'],
+    'long': ['L', 'AO', 'NG'], 'great': ['G', 'R', 'EY', 'T'],
+    'little': ['L', 'IH', 'T', 'AX', 'L'], 'own': ['OW', 'N'],
+    'old': ['OW', 'L', 'D'], 'right': ['R', 'AY', 'T'],
+    'big': ['B', 'IH', 'G'], 'high': ['HH', 'AY'],
+    'different': ['D', 'IH', 'F', 'ER', 'AX', 'N', 'T'],
+    'small': ['S', 'M', 'AO', 'L'], 'large': ['L', 'AA', 'R', 'JH'],
+    'next': ['N', 'EH', 'K', 'S', 'T'], 'early': ['ER', 'L', 'IY'],
+    'young': ['Y', 'AH', 'NG'], 'important': ['IH', 'M', 'P', 'AO', 'R', 'T', 'AX', 'N', 'T'],
+    'public': ['P', 'AH', 'B', 'L', 'IH', 'K'],
+    'bad': ['B', 'AE', 'D'], 'same': ['S', 'EY', 'M'],
+    # Adverbs
+    'now': ['N', 'AW'], 'just': ['JH', 'AH', 'S', 'T'],
+    'only': ['OW', 'N', 'L', 'IY'], 'very': ['V', 'EH', 'R', 'IY'],
+    'also': ['AO', 'L', 'S', 'OW'], 'well': ['W', 'EH', 'L'],
+    'back': ['B', 'AE', 'K'], 'even': ['IY', 'V', 'AX', 'N'],
+    'still': ['S', 'T', 'IH', 'L'], 'too': ['T', 'UW'],
+    'here': ['HH', 'IY', 'R'], 'much': ['M', 'AH', 'CH'],
+    'really': ['R', 'IY', 'L', 'IY'], 'always': ['AO', 'L', 'W', 'EY', 'Z'],
+    'never': ['N', 'EH', 'V', 'ER'], 'today': ['T', 'AX', 'D', 'EY'],
+    # Nouns
+    'time': ['T', 'AY', 'M'], 'year': ['Y', 'IY', 'R'],
+    'people': ['P', 'IY', 'P', 'AX', 'L'], 'way': ['W', 'EY'],
+    'day': ['D', 'EY'], 'man': ['M', 'AE', 'N'],
+    'thing': ['TH', 'IH', 'NG'], 'woman': ['W', 'UH', 'M', 'AX', 'N'],
+    'life': ['L', 'AY', 'F'], 'child': ['CH', 'AY', 'L', 'D'],
+    'world': ['W', 'ER', 'L', 'D'], 'school': ['S', 'K', 'UW', 'L'],
+    'state': ['S', 'T', 'EY', 'T'], 'family': ['F', 'AE', 'M', 'AX', 'L', 'IY'],
+    'student': ['S', 'T', 'UW', 'D', 'AX', 'N', 'T'],
+    'group': ['G', 'R', 'UW', 'P'], 'country': ['K', 'AH', 'N', 'T', 'R', 'IY'],
+    'problem': ['P', 'R', 'AA', 'B', 'L', 'AX', 'M'],
+    'hand': ['HH', 'AE', 'N', 'D'], 'part': ['P', 'AA', 'R', 'T'],
+    'place': ['P', 'L', 'EY', 'S'], 'case': ['K', 'EY', 'S'],
+    'week': ['W', 'IY', 'K'], 'company': ['K', 'AH', 'M', 'P', 'AX', 'N', 'IY'],
+    'system': ['S', 'IH', 'S', 'T', 'AX', 'M'],
+    'program': ['P', 'R', 'OW', 'G', 'R', 'AE', 'M'],
+    'question': ['K', 'W', 'EH', 'S', 'CH', 'AX', 'N'],
+    'government': ['G', 'AH', 'V', 'ER', 'N', 'M', 'AX', 'N', 'T'],
+    'number': ['N', 'AH', 'M', 'B', 'ER'],
+    'night': ['N', 'AY', 'T'], 'point': ['P', 'OY', 'N', 'T'],
+    'home': ['HH', 'OW', 'M'], 'water': ['W', 'AO', 'T', 'ER'],
+    'room': ['R', 'UW', 'M'], 'mother': ['M', 'AH', 'DH', 'ER'],
+    'area': ['EH', 'R', 'IY', 'AX'], 'money': ['M', 'AH', 'N', 'IY'],
+    'story': ['S', 'T', 'AO', 'R', 'IY'], 'fact': ['F', 'AE', 'K', 'T'],
+    'month': ['M', 'AH', 'N', 'TH'], 'lot': ['L', 'AA', 'T'],
+    'study': ['S', 'T', 'AH', 'D', 'IY'], 'book': ['B', 'UH', 'K'],
+    'eye': ['AY'], 'job': ['JH', 'AA', 'B'],
+    'word': ['W', 'ER', 'D'], 'business': ['B', 'IH', 'Z', 'N', 'IH', 'S'],
+    'issue': ['IH', 'SH', 'UW'], 'side': ['S', 'AY', 'D'],
+    'kind': ['K', 'AY', 'N', 'D'], 'head': ['HH', 'EH', 'D'],
+    'house': ['HH', 'AW', 'S'], 'friend': ['F', 'R', 'EH', 'N', 'D'],
+    'father': ['F', 'AA', 'DH', 'ER'], 'power': ['P', 'AW', 'ER'],
+    'hour': ['AW', 'ER'], 'game': ['G', 'EY', 'M'],
+    'line': ['L', 'AY', 'N'], 'end': ['EH', 'N', 'D'],
+    'member': ['M', 'EH', 'M', 'B', 'ER'], 'law': ['L', 'AO'],
+    'car': ['K', 'AA', 'R'], 'city': ['S', 'IH', 'T', 'IY'],
+    'name': ['N', 'EY', 'M'], 'team': ['T', 'IY', 'M'],
+    'minute': ['M', 'IH', 'N', 'IH', 'T'], 'idea': ['AY', 'D', 'IY', 'AX'],
+    'body': ['B', 'AA', 'D', 'IY'], 'information': ['IH', 'N', 'F', 'ER', 'M', 'EY', 'SH', 'AX', 'N'],
+    'face': ['F', 'EY', 'S'], 'others': ['AH', 'DH', 'ER', 'Z'],
+    'level': ['L', 'EH', 'V', 'AX', 'L'], 'office': ['AO', 'F', 'IH', 'S'],
+    'door': ['D', 'AO', 'R'], 'health': ['HH', 'EH', 'L', 'TH'],
+    'person': ['P', 'ER', 'S', 'AX', 'N'], 'art': ['AA', 'R', 'T'],
+    'war': ['W', 'AO', 'R'], 'history': ['HH', 'IH', 'S', 'T', 'ER', 'IY'],
+    'party': ['P', 'AA', 'R', 'T', 'IY'], 'result': ['R', 'IH', 'Z', 'AH', 'L', 'T'],
+    'change': ['CH', 'EY', 'N', 'JH'], 'morning': ['M', 'AO', 'R', 'N', 'IH', 'NG'],
+    'reason': ['R', 'IY', 'Z', 'AX', 'N'], 'research': ['R', 'IY', 'S', 'ER', 'CH'],
+    'girl': ['G', 'ER', 'L'], 'guy': ['G', 'AY'],
+    'food': ['F', 'UW', 'D'], 'moment': ['M', 'OW', 'M', 'AX', 'N', 'T'],
+    'teacher': ['T', 'IY', 'CH', 'ER'], 'force': ['F', 'AO', 'R', 'S'],
+    'education': ['EH', 'JH', 'AX', 'K', 'EY', 'SH', 'AX', 'N'],
+    # Numbers
+    'one': ['W', 'AH', 'N'], 'two': ['T', 'UW'],
+    'three': ['TH', 'R', 'IY'], 'four': ['F', 'AO', 'R'],
+    'five': ['F', 'AY', 'V'], 'six': ['S', 'IH', 'K', 'S'],
+    'seven': ['S', 'EH', 'V', 'AX', 'N'], 'eight': ['EY', 'T'],
+    'nine': ['N', 'AY', 'N'], 'ten': ['T', 'EH', 'N'],
+    'zero': ['Z', 'IY', 'R', 'OW'],
+    # Greetings
+    'hello': ['HH', 'AX', 'L', 'OW'], 'hi': ['HH', 'AY'],
+    'hey': ['HH', 'EY'], 'welcome': ['W', 'EH', 'L', 'K', 'AX', 'M'],
+    'goodbye': ['G', 'UH', 'D', 'B', 'AY'], 'bye': ['B', 'AY'],
+    'thanks': ['TH', 'AE', 'NG', 'K', 'S'], 'thank': ['TH', 'AE', 'NG', 'K'],
+    'please': ['P', 'L', 'IY', 'Z'], 'sorry': ['S', 'AA', 'R', 'IY'],
+    'yes': ['Y', 'EH', 'S'], 'yeah': ['Y', 'AE'],
+    'no': ['N', 'OW'], 'not': ['N', 'AA', 'T'],
+    'ok': ['OW', 'K', 'EY'], 'okay': ['OW', 'K', 'EY'],
+    # TTS related
+    'text': ['T', 'EH', 'K', 'S', 'T'],
+    'speech': ['S', 'P', 'IY', 'CH'],
+    'voice': ['V', 'OY', 'S'],
+    'sound': ['S', 'AW', 'N', 'D'],
+    'audio': ['AO', 'D', 'IY', 'OW'],
+    'vedes': ['V', 'IY', 'D', 'EH', 'S'],
+    'synthesis': ['S', 'IH', 'N', 'TH', 'AX', 'S', 'IH', 'S'],
+    'synthesize': ['S', 'IH', 'N', 'TH', 'AX', 'S', 'AY', 'Z'],
+    'generate': ['JH', 'EH', 'N', 'ER', 'EY', 'T'],
+    'computer': ['K', 'AX', 'M', 'P', 'Y', 'UW', 'T', 'ER'],
+    'technology': ['T', 'EH', 'K', 'N', 'AA', 'L', 'AX', 'JH', 'IY'],
+}
+# Letter patterns for unknown words
+PATTERNS = [
+    ('tion', ['SH', 'AX', 'N']),
+    ('sion', ['ZH', 'AX', 'N']),
+    ('ight', ['AY', 'T']),
+    ('ough', ['AO']),
+    ('ould', ['UH', 'D']),
+    ('ious', ['IY', 'AX', 'S']),
+    ('eous', ['IY', 'AX', 'S']),
+    ('ness', ['N', 'AX', 'S']),
+    ('ment', ['M', 'AX', 'N', 'T']),
+    ('able', ['AX', 'B', 'AX', 'L']),
+    ('ible', ['AX', 'B', 'AX', 'L']),
+    ('ally', ['AX', 'L', 'IY']),
+    ('ful', ['F', 'AX', 'L']),
+    ('less', ['L', 'AX', 'S']),
+    ('ing', ['IH', 'NG']),
+    ('ck', ['K']),
+    ('th', ['TH']),
+    ('sh', ['SH']),
+    ('ch', ['CH']),
+    ('wh', ['W']),
+    ('ph', ['F']),
+    ('gh', []),
+    ('ng', ['NG']),
+    ('qu', ['K', 'W']),
+    ('ee', ['IY']),
+    ('ea', ['IY']),
+    ('oo', ['UW']),
+    ('ou', ['AW']),
+    ('ow', ['OW']),
+    ('ai', ['EY']),
+    ('ay', ['EY']),
+    ('ey', ['IY']),
+    ('oy', ['OY']),
+    ('oi', ['OY']),
+    ('au', ['AO']),
+    ('aw', ['AO']),
+    ('ie', ['IY']),
+    ('ue', ['UW']),
+    ('ew', ['UW']),
+    ('er', ['ER']),
+    ('ir', ['ER']),
+    ('ur', ['ER']),
+    ('or', ['AO', 'R']),
+    ('ar', ['AA', 'R']),
+]
+LETTER_PHONEMES = {
+    'a': 'AE', 'b': 'B', 'c': 'K', 'd': 'D', 'e': 'EH',
+    'f': 'F', 'g': 'G', 'h': 'HH', 'i': 'IH', 'j': 'JH',
+    'k': 'K', 'l': 'L', 'm': 'M', 'n': 'N', 'o': 'AA',
+    'p': 'P', 'q': 'K', 'r': 'R', 's': 'S', 't': 'T',
+    'u': 'AH', 'v': 'V', 'w': 'W', 'x': 'K', 'y': 'Y', 'z': 'Z'
+}
 # ============================================
+# TEXT TO PHONEME CONVERTER
 # ============================================
class TextToPhoneme:
    """Rule-based text-to-phoneme converter.

    Words found in the pronunciation dictionary are emitted verbatim. Any
    other word is spelled out greedily: multi-letter patterns are tried
    first (longest pattern wins), then single letters, then digits.
    """

    # Tokens that produce a long pause ('PAU') instead of a spoken word.
    _PUNCTUATION = frozenset('.,!?')

    # Digits survive tokenization (``\w`` matches them) but have no entry in
    # the letter table, so they used to vanish silently. They are now read
    # out one at a time via the dictionary entry of their English name.
    _DIGIT_NAMES = {
        '0': 'zero', '1': 'one', '2': 'two', '3': 'three', '4': 'four',
        '5': 'five', '6': 'six', '7': 'seven', '8': 'eight', '9': 'nine',
    }

    def __init__(self):
        self.dictionary = DICTIONARY
        # Longest patterns first so that e.g. 'ough' is matched before 'ou'.
        self.patterns = sorted(PATTERNS, key=lambda x: -len(x[0]))
        self.letters = LETTER_PHONEMES

    def convert(self, text):
        """Convert text to a flat list of phoneme symbols.

        Args:
            text: Input string; it is lower-cased and stripped of every
                character other than word characters, whitespace and
                ``. , ! ? ' -`` before tokenization.

        Returns:
            List of phoneme symbols. Each punctuation token becomes 'PAU';
            every non-punctuation token except the last is followed by 'SIL'.
        """
        text = text.lower().strip()
        text = re.sub(r'[^\w\s.,!?\'-]', '', text)
        words = re.findall(r"[\w']+|[.,!?]", text)

        last_index = len(words) - 1
        phonemes = []
        for i, word in enumerate(words):
            is_punctuation = word in self._PUNCTUATION
            if is_punctuation:
                phonemes.append('PAU')
            elif word in self.dictionary:
                phonemes.extend(self.dictionary[word])
            else:
                phonemes.extend(self._convert_unknown(word))

            # Add short pause between words
            if not is_punctuation and i < last_index:
                phonemes.append('SIL')
        return phonemes

    def _convert_unknown(self, word):
        """Spell out a word that is not in the dictionary.

        Args:
            word: A single token (case-insensitive).

        Returns:
            List of phoneme symbols. Characters that are neither covered by
            a pattern, nor a known letter, nor an ASCII digit (for example
            apostrophes or underscores) contribute nothing.
        """
        phonemes = []
        word = word.lower()
        i = 0
        while i < len(word):
            for pattern, phons in self.patterns:
                # startswith(prefix, start) avoids slicing the word on
                # every pattern probe.
                if word.startswith(pattern, i):
                    phonemes.extend(phons)
                    i += len(pattern)
                    break
            else:
                # No multi-letter pattern matched at this position.
                char = word[i]
                if char in self.letters:
                    phonemes.append(self.letters[char])
                elif char in self._DIGIT_NAMES:
                    # Missing dictionary entry -> digit stays silent rather
                    # than raising.
                    phonemes.extend(
                        self.dictionary.get(self._DIGIT_NAMES[char], [])
                    )
                i += 1
        return phonemes
+# ============================================
+# KLATT FORMANT SYNTHESIZER
+# ============================================
+class KlattSynthesizer:
+    """Klatt-style formant synthesizer - 100% from scratch.
+    Each phoneme is rendered from a glottal-pulse or white-noise source and
+    shaped according to its entry in the module-level PHONEMES table; the
+    rendered segments are then crossfaded into a single waveform.
+    """
+    def __init__(self, sample_rate=22050):
+        """Store the output sample rate (Hz) and the base pitch."""
+        self.sample_rate = sample_rate
+        # Fundamental frequency in Hz before the pitch multiplier is applied
+        self.base_f0 = 120
+    def synthesize(self, phonemes, rate=1.0, pitch=1.0):
+        """Synthesize audio from phonemes.
+        Args:
+            phonemes: Sequence of phoneme symbols; symbols that are not keys
+                of PHONEMES are skipped.
+            rate: Speed multiplier (> 0); each phoneme duration is divided by it.
+            pitch: Multiplier (> 0) applied to the base fundamental frequency.
+        Returns:
+            float32 array, peak-normalised to [-1, 1]. Half a second of
+            silence is returned when there is nothing to synthesize.
+        Raises:
+            ValueError: If rate or pitch is not positive.
+        """
+        silence = np.zeros(int(self.sample_rate * 0.5), dtype=np.float32)
+        if not phonemes:
+            return silence
+        if rate <= 0 or pitch <= 0:
+            raise ValueError("rate and pitch must be positive")
+        f0 = self.base_f0 * pitch
+        last = len(phonemes) - 1
+        audio_segments = []
+        for i, phoneme in enumerate(phonemes):
+            if phoneme not in PHONEMES:
+                continue
+            params = PHONEMES[phoneme]
+            # Clamp to 20-300 ms whatever the requested rate
+            duration_ms = max(20, min(params['dur'] / rate, 300))
+            # Get neighboring phonemes for coarticulation
+            prev_phon = phonemes[i - 1] if i > 0 else None
+            next_phon = phonemes[i + 1] if i < last else None
+            audio_segments.append(self._synthesize_phoneme(
+                phoneme, params, f0, duration_ms, prev_phon, next_phon
+            ))
+        if not audio_segments:
+            return silence
+        # Concatenate with overlap
+        audio = self._concatenate(audio_segments)
+        # Final processing
+        audio = self._apply_final_envelope(audio)
+        # Peak-normalise; the epsilon avoids 0/0 on an all-silent result
+        audio = audio / (np.max(np.abs(audio)) + 1e-8)
+        return audio.astype(np.float32)
+    def _synthesize_phoneme(self, phoneme, params, f0, duration_ms, prev_phon, next_phon):
+        """Render one phoneme as an audio segment.
+        Args:
+            phoneme: Phoneme symbol.
+            params: Its PHONEMES entry; reads 'voiced' and the optional type
+                flags 'silence', 'stop', 'fricative', 'affricate', 'nasal'.
+            f0: Fundamental frequency in Hz.
+            duration_ms: Segment length in milliseconds.
+            prev_phon: Previous symbol, or None at the start.
+            next_phon: Next symbol, or None at the end.
+        Returns:
+            Array of at least 10 samples.
+        """
+        n_samples = max(int(self.sample_rate * duration_ms / 1000), 10)
+        if params.get('silence'):
+            return np.zeros(n_samples, dtype=np.float32)
+        t = np.arange(n_samples) / self.sample_rate
+        # Generate source signal
+        if params['voiced']:
+            source = self._generate_glottal_source(t, f0)
+        else:
+            source = self._generate_noise(n_samples)
+        # Handle different phoneme types
+        if params.get('stop'):
+            audio = self._synthesize_stop(source, params, n_samples, t)
+        elif params.get('fricative'):
+            audio = self._synthesize_fricative(source, params, n_samples, t, f0)
+        elif params.get('affricate'):
+            audio = self._synthesize_affricate(source, params, n_samples, t, f0)
+        elif params.get('nasal'):
+            audio = self._synthesize_nasal(source, params, n_samples, t, f0)
+        else:
+            # Vowels and approximants
+            audio = self._apply_formants(source, params)
+        audio = self._apply_envelope(audio, phoneme, params)
+        audio = self._apply_coarticulation(audio, phoneme, prev_phon, next_phon)
+        return audio
+    def _generate_glottal_source(self, t, f0):
+        """Generate a glottal pulse train (Rosenberg-style approximation).
+        Args:
+            t: Sample times in seconds.
+            f0: Pulse rate in Hz; must be non-zero.
+        Returns:
+            Array shaped like t. Each period rises along a raised cosine
+            (first 40 %), falls along a quarter cosine (next 20 %) and is
+            zero for the rest; random amplitude perturbation and a little
+            noise are added, so the output is not deterministic.
+        """
+        T0 = 1.0 / f0
+        phase = (t % T0) / T0
+        glottal = np.zeros_like(t)
+        # Opening phase (0 to 0.4)
+        opening = phase < 0.4
+        glottal[opening] = 0.5 * (1 - np.cos(np.pi * phase[opening] / 0.4))
+        # Closing phase (0.4 to 0.6)
+        closing = (phase >= 0.4) & (phase < 0.6)
+        glottal[closing] = np.cos(np.pi * (phase[closing] - 0.4) / 0.4)
+        # Closed phase (0.6 to 1.0) stays at zero
+        # Shimmer (amplitude perturbation); no frequency jitter is applied
+        glottal = glottal * (1 + 0.03 * np.random.randn(len(t)))
+        # Add aspiration noise
+        glottal = glottal + np.random.randn(len(t)) * 0.02
+        return glottal
+    def _generate_noise(self, n_samples):
+        """Generate white (Gaussian) noise of n_samples samples."""
+        return np.random.randn(n_samples)
+    def _apply_formants(self, source, params):
+        """Apply formant filtering with parallel resonators.
+        The source is passed through one resonator per formant ('f1'..'f4'
+        in params) and the outputs are summed. Formants at or below 0 Hz or
+        at or above the Nyquist frequency are skipped.
+        """
+        formants = [
+            (params['f1'], 80),   # F1 with bandwidth
+            (params['f2'], 100),  # F2
+            (params['f3'], 120),  # F3
+            (params['f4'], 150),  # F4
+        ]
+        result = np.zeros_like(source)
+        for freq, bw in formants:
+            if freq <= 0 or freq >= self.sample_rate / 2:
+                continue
+            result += self._resonator(source, freq, bw)
+        return result
+    def _resonator(self, signal, freq, bandwidth):
+        """Second-order resonator (formant filter).
+        Args:
+            signal: Input samples.
+            freq: Centre frequency in Hz.
+            bandwidth: Bandwidth in Hz.
+        Returns:
+            Filtered samples, or the input unchanged when freq lies outside
+            (0, Nyquist).
+        """
+        # The parameter name shadows the scipy.signal module imported at
+        # file level, hence the local import.
+        from scipy.signal import lfilter
+        if freq <= 0 or freq >= self.sample_rate / 2:
+            return signal
+        # Pole radius and angle of the two-pole filter
+        r = np.exp(-np.pi * bandwidth / self.sample_rate)
+        theta = 2 * np.pi * freq / self.sample_rate
+        # y[n] = b0*x[n] - a1*y[n-1] - a2*y[n-2], evaluated by lfilter instead
+        # of a per-sample Python loop (which also left y[0] and y[1] at zero)
+        return lfilter([1 - r], [1.0, -2 * r * np.cos(theta), r * r], signal)
+    def _butter_filtfilt(self, data, cutoff_hz, btype, order=2):
+        """Zero-phase Butterworth filter; returns None when it cannot be applied.
+        scipy raises ValueError for a cutoff outside (0, Nyquist) and for
+        input that is too short to pad; callers then keep the raw signal.
+        """
+        nyquist = self.sample_rate / 2
+        try:
+            b, a = signal.butter(order, np.asarray(cutoff_hz) / nyquist, btype)
+            return signal.filtfilt(b, a, data)
+        except ValueError:
+            return None
+    def _synthesize_stop(self, source, params, n_samples, t):
+        """Synthesize stop consonant.
+        The first half is the closure (silent, or a quiet 100 Hz voice bar
+        when params['voiced'] is set); the second half is a low-passed noise
+        burst. source is accepted for signature parity and is not used.
+        """
+        audio = np.zeros(n_samples)
+        closure_len = n_samples // 2
+        burst = np.random.randn(n_samples - closure_len) * 0.5
+        burst_freq = params.get('burst_freq', 1500)
+        if burst_freq < self.sample_rate / 2:
+            filtered = self._butter_filtfilt(burst, burst_freq, 'low')
+            if filtered is not None:
+                burst = filtered
+        audio[closure_len:] = burst
+        # Add voice bar for voiced stops
+        if params['voiced']:
+            audio[:closure_len] = self._generate_glottal_source(t[:closure_len], 100) * 0.3
+        return audio
+    def _synthesize_fricative(self, source, params, n_samples, t, f0):
+        """Synthesize fricative consonant.
+        Filtered noise (high-pass when params['fric_freq'] > 3000, otherwise
+        a band around it), plus a formant-filtered voiced component when
+        params['voiced'] is set. source is not used.
+        """
+        noise = np.random.randn(n_samples)
+        fric_freq = params.get('fric_freq', 4000)
+        if fric_freq > 3000:
+            # High-pass for /s/, /f/
+            filtered = self._butter_filtfilt(noise, 2000, 'high', order=4)
+        else:
+            # Band-pass for /sh/
+            low = max(100, fric_freq - 1000)
+            high = min(fric_freq + 1000, self.sample_rate / 2 - 100)
+            filtered = self._butter_filtfilt(noise, [low, high], 'band')
+        if filtered is not None:
+            noise = filtered
+        audio = noise * 0.4
+        # Add voicing for voiced fricatives
+        if params['voiced']:
+            voiced = self._generate_glottal_source(t, f0)
+            audio = audio + self._apply_formants(voiced, params) * 0.3
+        return audio
+    def _synthesize_affricate(self, source, params, n_samples, t, f0):
+        """Synthesize affricate (stop + fricative).
+        The first third is silent; the rest is high-passed noise. source,
+        params, t and f0 are not used.
+        """
+        stop_len = n_samples // 3
+        audio = np.zeros(n_samples)
+        fric = np.random.randn(n_samples - stop_len) * 0.4
+        filtered = self._butter_filtfilt(fric, 2500, 'high')
+        if filtered is not None:
+            fric = filtered
+        audio[stop_len:] = fric
+        return audio
+    def _synthesize_nasal(self, source, params, n_samples, t, f0):
+        """Synthesize nasal consonant.
+        A fresh voiced source is formant-filtered, a low-passed (400 Hz) copy
+        is mixed in, and the sum is attenuated. source is not used.
+        """
+        voiced = self._generate_glottal_source(t, f0)
+        audio = self._apply_formants(voiced, params)
+        # Add nasal resonance; skipped when the low-pass cannot be applied
+        nasal = self._butter_filtfilt(voiced, 400, 'low')
+        if nasal is not None:
+            audio = audio + nasal * 0.5
+        return audio * 0.7
+    def _apply_envelope(self, audio, phoneme, params):
+        """Apply a linear attack/release amplitude envelope.
+        Stops ramp up over 1/10 and down over 1/4 of the segment, fricatives
+        over 1/5 each, everything else over 1/6 each. Segments shorter than
+        four samples are returned unchanged. phoneme is not used.
+        """
+        n = len(audio)
+        if n < 4:
+            return audio
+        if params.get('stop'):
+            # Sharp attack for stops
+            attack_div, release_div = 10, 4
+        elif params.get('fricative'):
+            attack_div, release_div = 5, 5
+        else:
+            # Smooth envelope for vowels
+            attack_div, release_div = 6, 6
+        attack = max(1, n // attack_div)
+        release = max(1, n // release_div)
+        envelope = np.ones(n)
+        envelope[:attack] = np.linspace(0.01, 1, attack)
+        envelope[-release:] = np.linspace(1, 0.01, release)
+        return audio * envelope
+    def _apply_coarticulation(self, audio, current, prev_phon, next_phon):
+        """Apply coarticulation effects.
+        Dips the segment edges to 70 % towards neighbours that are real
+        sounds (not 'SIL'/'PAU'). audio is modified in place and returned;
+        segments shorter than 20 samples are left alone. current is not used.
+        """
+        n = len(audio)
+        if n < 20:
+            return audio
+        transition_len = min(n // 4, 50)
+        pauses = ('SIL', 'PAU')
+        # Fade in from previous phoneme
+        if prev_phon and prev_phon not in pauses:
+            audio[:transition_len] *= np.linspace(0.7, 1.0, transition_len)
+        # Fade out to next phoneme
+        if next_phon and next_phon not in pauses:
+            audio[-transition_len:] *= np.linspace(1.0, 0.7, transition_len)
+        return audio
+    def _concatenate(self, segments):
+        """Concatenate segments with a linear crossfade.
+        Neighbouring segments overlap by up to 32 samples, but never by more
+        than half of either segment, so short segments are shortened rather
+        than swallowed. Empty segments are ignored.
+        Returns:
+            The joined float64 waveform; 1000 zeros when there is nothing to
+            join; the segment itself when there is exactly one.
+        """
+        segments = [seg for seg in segments if len(seg) > 0]
+        if not segments:
+            return np.zeros(1000)
+        if len(segments) == 1:
+            return segments[0]
+        overlap = 32
+        # Upper bound on the length; trimmed to the used part on return
+        audio = np.zeros(sum(len(seg) for seg in segments))
+        end = prev_len = len(segments[0])
+        audio[:end] = segments[0]
+        for seg in segments[1:]:
+            fade_len = min(overlap, prev_len // 2, len(seg) // 2)
+            start = end - fade_len
+            faded = np.array(seg, dtype=np.float64)
+            faded[:fade_len] *= np.linspace(0, 1, fade_len)
+            audio[start:end] *= np.linspace(1, 0, fade_len)
+            audio[start:start + len(seg)] += faded
+            end = start + len(seg)
+            prev_len = len(seg)
+        return audio[:end]
+    def _apply_final_envelope(self, audio):
+        """Apply final envelope to entire audio.
+        Fades the first and last n // 30 samples (at most 300) linearly
+        from/to zero. audio is modified in place and returned; input shorter
+        than 100 samples is returned unchanged.
+        """
+        n = len(audio)
+        if n < 100:
+            return audio
+        fade_len = min(n // 30, 300)
+        audio[:fade_len] *= np.linspace(0, 1, fade_len)
+        audio[-fade_len:] *= np.linspace(1, 0, fade_len)
+        return audio
+# ============================================
+# MAIN TTS CLASS
+# ============================================
+class VedesTTS:
+    """Text-to-speech front end: text to phonemes to formant-synthesized audio."""
+    def __init__(self, sample_rate=22050):
+        """Create the synthesizer for sample_rate Hz and the phoneme converter."""
+        self.sample_rate = sample_rate
+        self.synthesizer = KlattSynthesizer(sample_rate)
+        self.text_to_phoneme = TextToPhoneme()
+    def synthesize(self, text, rate=1.0, pitch=1.0):
+        """Convert text to speech.
+        Returns the synthesizer output for the text, or one second of
+        float32 silence when the text is empty, blank, or yields no phonemes.
+        """
+        has_text = bool(text) and bool(text.strip())
+        phonemes = self.text_to_phoneme.convert(text) if has_text else []
+        if phonemes:
+            return self.synthesizer.synthesize(phonemes, rate, pitch)
+        # Nothing to say: one second of silence
+        return np.zeros(self.sample_rate, dtype=np.float32)
+# ============================================
+# INITIALIZE
+# ============================================
+# Startup banner; printed when the module is executed or imported.
+print("=" * 50)
+print("🎙️ VEDES TTS - 100% From Scratch")
+print("No APIs, No Pre-trained Models")
+print("=" * 50)
+# Module-level engine instance used by synthesize_speech().
+# NOTE(review): SAMPLE_RATE is not defined in this part of the file;
+# presumably a module-level constant declared earlier - confirm.
+tts = VedesTTS(SAMPLE_RATE)
+print("✅ Initialized successfully!")
+print("=" * 50)
+# ============================================
+# GRADIO INTERFACE
+# ============================================
+def synthesize_speech(text, speaking_rate, pitch_shift):
+    """Gradio callback: turn the textbox content into playable audio.
+    Args:
+        text: Raw user text; only the first 500 characters (after stripping
+            surrounding whitespace) are spoken.
+        speaking_rate: Speed multiplier forwarded to the engine.
+        pitch_shift: Pitch offset in semitones.
+    Returns:
+        (SAMPLE_RATE, int16 samples) on success; None for blank input, for
+        output shorter than 100 samples, or when synthesis raises.
+    """
+    cleaned = text.strip()[:500] if text else ""
+    if not cleaned:
+        return None
+    try:
+        # One semitone is a factor of 2 ** (1/12) in frequency
+        semitone_ratio = 2 ** (pitch_shift / 12)
+        waveform = tts.synthesize(cleaned, rate=speaking_rate, pitch=semitone_ratio)
+        if len(waveform) < 100:
+            return None
+        # Clip to [-1, 1] and scale to 16-bit PCM
+        pcm = (np.clip(waveform, -1, 1) * 32767).astype(np.int16)
+        return (SAMPLE_RATE, pcm)
+    except Exception as e:
+        print(f"Error: {e}")
+        return None
+# Create Gradio interface
+# Build the UI: components and event handlers are declared inside the
+# Blocks context and the finished app is bound to `demo`.
+with gr.Blocks(
+    title="Vedes TTS",
+    theme=gr.themes.Soft(primary_hue="indigo")
+) as demo:
+    gr.Markdown("""
+    # 🎙️ Vedes TTS - From Scratch
+    ### 100% Custom Built - No APIs, No Pre-trained Models
+    This TTS uses **Klatt formant synthesis** - the same technique used in early
+    speech synthesizers. It converts text to phonemes, then generates audio using
+    digital resonators that simulate the human vocal tract.
+    """)
+    # First column: text box, sliders and button; second column: audio output
+    with gr.Row():
+        with gr.Column(scale=2):
+            text_input = gr.Textbox(
+                label="📝 Enter Text",
+                placeholder="Type something... (e.g., Hello, how are you today?)",
+                lines=4
+            )
+            with gr.Row():
+                # Passed to synthesize_speech as speaking_rate (speed multiplier)
+                rate_slider = gr.Slider(
+                    minimum=0.5, maximum=2.0, value=1.0, step=0.1,
+                    label="⏱️ Speaking Rate"
+                )
+                # Passed as pitch_shift; synthesize_speech converts the
+                # semitone value to a frequency ratio with 2 ** (x / 12)
+                pitch_slider = gr.Slider(
+                    minimum=-6, maximum=6, value=0, step=1,
+                    label="🎵 Pitch (semitones)"
+                )
+            synth_btn = gr.Button("🔊 Synthesize", variant="primary", size="lg")
+        with gr.Column(scale=1):
+            # synthesize_speech returns a (sample_rate, int16 array) tuple or None
+            audio_output = gr.Audio(label="🎧 Output", type="numpy")
+    # Example sentences; each one fills only the text box
+    gr.Examples(
+        examples=[
+            ["Hello, welcome to Vedes."],
+            ["How are you today?"],
+            ["This is a test."],
+            ["The quick brown fox."],
+            ["Good morning!"],
+            ["Thank you very much."],
+            ["I am fine."],
+            ["What is your name?"],
+            ["Nice to meet you."],
+            ["Have a good day."],
+        ],
         inputs=text_input,
+        label="📚 Examples"
     )
+    gr.Markdown("""
+    ---
+    ### 🔧 How It Works
+    1. **Text → Phonemes**: Converts words to speech sounds using a dictionary
+    2. **Glottal Source**: Generates vocal cord vibrations mathematically
+    3. **Formant Filters**: Shapes sound using resonators (F1, F2, F3, F4)
+    4. **Coarticulation**: Smooths transitions between sounds
+    ### ⚠️ Limitations
+    This is **educational/demonstration** quality - not production TTS.
+    Real TTS systems use neural networks trained on thousands of hours of speech.
+    ---
+    *Built from scratch with NumPy and SciPy - No external TTS APIs!*
+    """)
+    # The button click and the textbox submit event run the same callback
+    # with the same inputs and output.
+    synth_btn.click(
         fn=synthesize_speech,
+        inputs=[text_input, rate_slider, pitch_slider],
         outputs=audio_output
     )
     text_input.submit(
         fn=synthesize_speech,
+        inputs=[text_input, rate_slider, pitch_slider],
         outputs=audio_output
     )
+# Start the app only when this file is run as a script, not when imported
 if __name__ == "__main__":
     demo.launch()