Spaces:

vedaco
/

Vedes

Sleeping

App Files Files Community

vedaco committed 17 days ago

Commit

a8983e0

verified ·

1 Parent(s): c60acbd

Update app.py

Browse files

Files changed (1) hide show

app.py +620 -534

app.py CHANGED Viewed

@@ -1,540 +1,633 @@
-import tensorflow as tf
 import numpy as np
 import gradio as gr
-import scipy.signal as signal
 from scipy.io import wavfile
 import io
-import os
-# Disable GPU warnings
-os.environ['TF_CPP_MIN_LOG_LEVEL'] = '2'
 # ============================================
-# VEDES TTS - Text-to-Speech Model from Scratch
 # ============================================
 class VedesConfig:
-    """Configuration for Vedes TTS Model"""
-    # Audio parameters
     sample_rate = 22050
-    n_fft = 1024
-    hop_length = 256
-    win_length = 1024
-    n_mels = 80
-    fmin = 0
-    fmax = 8000
-    # Model parameters
-    embedding_dim = 256
-    encoder_dim = 256
-    decoder_dim = 256
-    attention_dim = 128
-    prenet_dim = 128
-    postnet_dim = 256
-    postnet_layers = 5
-    max_decoder_steps = 500
-    # Text parameters
-    vocab = "abcdefghijklmnopqrstuvwxyz .,!?'-"
-    vocab_size = len(vocab) + 1
 config = VedesConfig()
 # ============================================
-# TEXT PROCESSING
 # ============================================
-class TextProcessor:
-    """Text to sequence converter"""
-    def __init__(self, vocab):
-        self.vocab = vocab
-        self.char_to_idx = {char: idx + 1 for idx, char in enumerate(vocab)}
-        self.idx_to_char = {idx + 1: char for idx, char in enumerate(vocab)}
-        self.idx_to_char[0] = '<pad>'
-    def text_to_sequence(self, text):
-        """Convert text to sequence of integers"""
-        text = text.lower()
-        sequence = []
-        for char in text:
-            if char in self.char_to_idx:
-                sequence.append(self.char_to_idx[char])
-        return np.array(sequence, dtype=np.int32)
-text_processor = TextProcessor(config.vocab)
 # ============================================
-# MODEL LAYERS
 # ============================================
-class Prenet(tf.keras.layers.Layer):
-    """Prenet with dropout"""
-    def __init__(self, units, **kwargs):
-        super().__init__(**kwargs)
-        self.units = units
-    def build(self, input_shape):
-        self.dense1 = tf.keras.layers.Dense(self.units, activation='relu')
-        self.dense2 = tf.keras.layers.Dense(self.units, activation='relu')
-        super().build(input_shape)
-    def call(self, inputs, training=True):
-        x = self.dense1(inputs)
-        x = tf.nn.dropout(x, rate=0.5) if training else x * 0.5
-        x = self.dense2(x)
-        x = tf.nn.dropout(x, rate=0.5) if training else x * 0.5
-        return x
-class Encoder(tf.keras.layers.Layer):
-    """Text Encoder"""
-    def __init__(self, vocab_size, embed_dim, encoder_dim, **kwargs):
-        super().__init__(**kwargs)
-        self.vocab_size = vocab_size
-        self.embed_dim = embed_dim
-        self.encoder_dim = encoder_dim
-    def build(self, input_shape):
-        self.embedding = tf.keras.layers.Embedding(self.vocab_size, self.embed_dim)
-        self.conv1 = tf.keras.layers.Conv1D(self.encoder_dim, 5, padding='same', activation='relu')
-        self.bn1 = tf.keras.layers.BatchNormalization()
-        self.conv2 = tf.keras.layers.Conv1D(self.encoder_dim, 5, padding='same', activation='relu')
-        self.bn2 = tf.keras.layers.BatchNormalization()
-        self.conv3 = tf.keras.layers.Conv1D(self.encoder_dim, 5, padding='same', activation='relu')
-        self.bn3 = tf.keras.layers.BatchNormalization()
-        self.bilstm = tf.keras.layers.Bidirectional(
-            tf.keras.layers.LSTM(self.encoder_dim // 2, return_sequences=True),
-            merge_mode='concat'
-        )
-        super().build(input_shape)
-    def call(self, inputs, training=True):
-        x = self.embedding(inputs)
-        x = self.conv1(x)
-        x = self.bn1(x, training=training)
-        x = self.conv2(x)
-        x = self.bn2(x, training=training)
-        x = self.conv3(x)
-        x = self.bn3(x, training=training)
-        x = self.bilstm(x)
-        return x
-class Attention(tf.keras.layers.Layer):
-    """Bahdanau-style Attention"""
-    def __init__(self, attention_dim, **kwargs):
-        super().__init__(**kwargs)
-        self.attention_dim = attention_dim
-    def build(self, input_shape):
-        self.W_query = tf.keras.layers.Dense(self.attention_dim)
-        self.W_keys = tf.keras.layers.Dense(self.attention_dim)
-        self.V = tf.keras.layers.Dense(1)
-        super().build(input_shape)
-    def call(self, query, keys):
-        """
-        query: [batch, decoder_dim]
-        keys: [batch, seq_len, encoder_dim]
-        """
-        query_expanded = tf.expand_dims(query, 1)
-        scores = self.V(tf.nn.tanh(
-            self.W_query(query_expanded) + self.W_keys(keys)
-        ))
-        scores = tf.squeeze(scores, -1)
-        weights = tf.nn.softmax(scores, axis=-1)
-        context = tf.reduce_sum(tf.expand_dims(weights, -1) * keys, axis=1)
-        return context, weights
-class Decoder(tf.keras.layers.Layer):
-    """Autoregressive Decoder"""
-    def __init__(self, decoder_dim, n_mels, prenet_dim, attention_dim, encoder_dim, **kwargs):
-        super().__init__(**kwargs)
-        self.decoder_dim = decoder_dim
-        self.n_mels = n_mels
-        self.prenet_dim = prenet_dim
-        self.attention_dim = attention_dim
-        self.encoder_dim = encoder_dim
-    def build(self, input_shape):
-        self.prenet = Prenet(self.prenet_dim)
-        self.attention = Attention(self.attention_dim)
-        # Single GRU layer (simpler than LSTM for this case)
-        self.gru = tf.keras.layers.GRUCell(self.decoder_dim)
-        # Output projections
-        self.mel_dense = tf.keras.layers.Dense(self.n_mels)
-        self.stop_dense = tf.keras.layers.Dense(1)
-        # Build prenet
-        self.prenet.build([None, self.n_mels])
-        super().build(input_shape)
-    def get_initial_state(self, batch_size):
-        return tf.zeros([batch_size, self.decoder_dim])
-    def step(self, decoder_input, encoder_outputs, state, training=True):
-        """Single decoder step"""
-        # Prenet
-        prenet_out = self.prenet(decoder_input, training=training)
-        # Attention
-        context, attention_weights = self.attention(state, encoder_outputs)
-        # GRU input
-        gru_input = tf.concat([prenet_out, context], axis=-1)
-        # GRU
-        gru_out, new_states = self.gru(gru_input, [state])
-        new_state = new_states[0] if isinstance(new_states, list) else new_states
-        # Outputs
-        output_concat = tf.concat([gru_out, context], axis=-1)
-        mel_output = self.mel_dense(output_concat)
-        stop_output = self.stop_dense(output_concat)
-        return mel_output, stop_output, new_state, attention_weights
-class Postnet(tf.keras.layers.Layer):
-    """Postnet for mel refinement"""
-    def __init__(self, n_mels, postnet_dim, num_layers=5, **kwargs):
-        super().__init__(**kwargs)
-        self.n_mels = n_mels
-        self.postnet_dim = postnet_dim
-        self.num_layers = num_layers
-    def build(self, input_shape):
-        self.convs = []
-        self.bns = []
-        for i in range(self.num_layers):
-            out_dim = self.n_mels if i == self.num_layers - 1 else self.postnet_dim
-            self.convs.append(tf.keras.layers.Conv1D(out_dim, 5, padding='same'))
-            self.bns.append(tf.keras.layers.BatchNormalization())
-        super().build(input_shape)
-    def call(self, inputs, training=True):
-        x = inputs
-        for i, (conv, bn) in enumerate(zip(self.convs, self.bns)):
-            x = conv(x)
-            x = bn(x, training=training)
-            if i < self.num_layers - 1:
-                x = tf.nn.tanh(x)
-        return inputs + x
-# ============================================
-# VEDES TTS MODEL
-# ============================================
-class VedesTTS(tf.keras.Model):
-    """Complete Vedes TTS Model"""
-    def __init__(self, config, **kwargs):
-        super().__init__(**kwargs)
-        self.config = config
-    def build(self, input_shape):
-        self.encoder = Encoder(
-            self.config.vocab_size,
-            self.config.embedding_dim,
-            self.config.encoder_dim
-        )
-        self.decoder = Decoder(
-            self.config.decoder_dim,
-            self.config.n_mels,
-            self.config.prenet_dim,
-            self.config.attention_dim,
-            self.config.encoder_dim
-        )
-        self.postnet = Postnet(
-            self.config.n_mels,
-            self.config.postnet_dim,
-            self.config.postnet_layers
-        )
-        super().build(input_shape)
-    def call(self, inputs, training=True):
-        """Forward pass"""
-        if isinstance(inputs, (list, tuple)):
-            text_inputs = inputs[0]
-        else:
-            text_inputs = inputs
-        return self.inference_eager(text_inputs, self.config.max_decoder_steps)
-    def inference_eager(self, text_sequence, max_steps=500):
-        """Eager mode inference"""
-        if len(text_sequence.shape) == 1:
-            text_sequence = tf.expand_dims(text_sequence, 0)
-        batch_size = tf.shape(text_sequence)[0]
-        # Encode
-        encoder_outputs = self.encoder(text_sequence, training=False)
-        # Initialize decoder
-        state = self.decoder.get_initial_state(batch_size)
-        decoder_input = tf.zeros([batch_size, self.config.n_mels])
-        mel_outputs = []
-        for step in range(max_steps):
-            mel_out, stop_out, state, _ = self.decoder.step(
-                decoder_input, encoder_outputs, state, training=False
-            )
-            mel_outputs.append(mel_out)
-            decoder_input = mel_out
-            if tf.nn.sigmoid(stop_out[0, 0]) > 0.5 and step > 10:
-                break
-        if len(mel_outputs) == 0:
-            return tf.zeros([1, 1, self.config.n_mels])
-        mel_outputs = tf.stack(mel_outputs, axis=1)
-        mel_outputs = self.postnet(mel_outputs, training=False)
-        return mel_outputs
-# ============================================
-# GRIFFIN-LIM VOCODER
-# ============================================
-class GriffinLimVocoder:
-    """Griffin-Lim algorithm for mel to audio"""
-    def __init__(self, config):
-        self.config = config
-        self.mel_basis = self._create_mel_filterbank()
-    def _hz_to_mel(self, hz):
-        return 2595 * np.log10(1 + hz / 700)
-    def _mel_to_hz(self, mel):
-        return 700 * (10 ** (mel / 2595) - 1)
-    def _create_mel_filterbank(self):
-        n_fft = self.config.n_fft
-        n_mels = self.config.n_mels
-        sample_rate = self.config.sample_rate
-        mel_fmin = self._hz_to_mel(self.config.fmin)
-        mel_fmax = self._hz_to_mel(self.config.fmax)
-        mel_points = np.linspace(mel_fmin, mel_fmax, n_mels + 2)
-        hz_points = self._mel_to_hz(mel_points)
-        bin_points = np.floor((n_fft + 1) * hz_points / sample_rate).astype(int)
-        filterbank = np.zeros((n_mels, n_fft // 2 + 1))
-        for i in range(n_mels):
-            left, center, right = bin_points[i], bin_points[i + 1], bin_points[i + 2]
-            for j in range(left, center):
-                if center != left:
-                    filterbank[i, j] = (j - left) / (center - left)
-            for j in range(center, right):
-                if right != center:
-                    filterbank[i, j] = (right - j) / (right - center)
-        return filterbank
-    def mel_to_linear(self, mel_spec):
-        mel_basis_pinv = np.linalg.pinv(self.mel_basis)
-        return np.maximum(1e-10, np.dot(mel_spec, mel_basis_pinv.T))
-    def griffin_lim(self, mel_spectrogram, n_iter=32):
-        # Denormalize
-        spectrogram = np.exp(np.clip(mel_spectrogram, -10, 10))
-        # Mel to linear
-        linear_spec = self.mel_to_linear(spectrogram)
-        # Initialize phase
-        phase = np.exp(2j * np.pi * np.random.rand(*linear_spec.shape))
-        complex_spec = linear_spec * phase
-        for _ in range(n_iter):
-            audio = self._istft(complex_spec)
-            complex_spec = self._stft(audio)
-            phase = np.exp(1j * np.angle(complex_spec))
-            complex_spec = linear_spec * phase
-        audio = self._istft(complex_spec)
-        # Normalize
-        max_val = np.max(np.abs(audio))
-        if max_val > 0:
-            audio = audio / max_val
-        return audio.astype(np.float32)
-    def _stft(self, audio):
-        frames = []
-        window = np.hanning(self.config.win_length)
-        for i in range(0, max(1, len(audio) - self.config.win_length), self.config.hop_length):
-            frame = audio[i:i + self.config.win_length]
-            if len(frame) < self.config.win_length:
-                frame = np.pad(frame, (0, self.config.win_length - len(frame)))
-            frames.append(np.fft.rfft(frame * window))
-        return np.array(frames) if frames else np.zeros((1, self.config.n_fft // 2 + 1), dtype=complex)
-    def _istft(self, complex_spec):
-        n_frames = len(complex_spec)
-        expected_len = self.config.hop_length * n_frames + self.config.win_length
-        audio = np.zeros(expected_len)
-        window = np.hanning(self.config.win_length)
-        window_sum = np.zeros(expected_len)
-        for i, frame in enumerate(complex_spec):
-            start = i * self.config.hop_length
-            end = start + self.config.win_length
-            audio[start:end] += np.real(np.fft.irfft(frame, self.config.win_length)) * window
-            window_sum[start:end] += window ** 2
-        # Normalize by window sum
-        window_sum = np.maximum(window_sum, 1e-8)
-        audio = audio / window_sum
         return audio
 # ============================================
-# SIMPLE SYNTHESIZER (Fallback)
 # ============================================
-class SimpleSynthesizer:
-    """Simple formant-based synthesizer as fallback"""
     def __init__(self, sample_rate=22050):
         self.sample_rate = sample_rate
-        # Basic phoneme frequencies
-        self.phonemes = {
-            'a': {'f1': 730, 'f2': 1090},
-            'e': {'f1': 530, 'f2': 1840},
-            'i': {'f1': 270, 'f2': 2290},
-            'o': {'f1': 570, 'f2': 840},
-            'u': {'f1': 300, 'f2': 870},
-        }
-        self.default_formant = {'f1': 500, 'f2': 1500}
-    def synthesize(self, text, duration_per_char=0.08):
-        """Generate speech-like audio from text"""
-        audio = np.array([], dtype=np.float32)
-        text = text.lower()
-        for char in text:
-            if char == ' ':
-                # Silence for space
-                silence = np.zeros(int(self.sample_rate * 0.1))
-                audio = np.concatenate([audio, silence])
-            elif char in 'aeiou':
-                # Vowel
-                segment = self._generate_vowel(char, duration_per_char)
-                audio = np.concatenate([audio, segment])
-            elif char in 'bcdfghjklmnpqrstvwxyz':
-                # Consonant (simplified)
-                segment = self._generate_consonant(char, duration_per_char * 0.5)
-                audio = np.concatenate([audio, segment])
-            elif char in '.,!?':
-                # Punctuation pause
-                silence = np.zeros(int(self.sample_rate * 0.15))
-                audio = np.concatenate([audio, silence])
-        # Apply envelope
-        if len(audio) > 0:
-            audio = self._apply_envelope(audio)
-            audio = audio / (np.max(np.abs(audio)) + 1e-8)
-        return audio.astype(np.float32)
-    def _generate_vowel(self, char, duration):
-        """Generate a vowel sound"""
-        t = np.linspace(0, duration, int(self.sample_rate * duration))
-        formant = self.phonemes.get(char, self.default_formant)
-        f0 = 120  # Fundamental frequency
-        # Generate harmonics
-        signal = np.zeros_like(t)
-        for harmonic in range(1, 8):
-            freq = f0 * harmonic
-            amp = 1.0 / harmonic
-            signal += amp * np.sin(2 * np.pi * freq * t)
-        # Add formants
-        f1_signal = np.sin(2 * np.pi * formant['f1'] * t) * 0.3
-        f2_signal = np.sin(2 * np.pi * formant['f2'] * t) * 0.2
-        signal = signal + f1_signal + f2_signal
-        # Apply envelope
-        envelope = np.ones_like(t)
-        attack = int(len(t) * 0.1)
-        release = int(len(t) * 0.2)
-        envelope[:attack] = np.linspace(0, 1, attack)
-        envelope[-release:] = np.linspace(1, 0, release)
-        return (signal * envelope).astype(np.float32)
-    def _generate_consonant(self, char, duration):
-        """Generate consonant-like noise"""
-        n_samples = int(self.sample_rate * duration)
-        # Noise-based consonants
-        if char in 'sfhx':
-            noise = np.random.randn(n_samples) * 0.3
-        elif char in 'bp':
-            # Plosive
-            noise = np.random.randn(n_samples) * 0.5
-            noise[:n_samples//4] = 0
-        else:
-            # Default consonant
-            noise = np.random.randn(n_samples) * 0.2
-        # Envelope
-        envelope = np.ones(n_samples)
-        fade = min(n_samples // 4, 100)
-        envelope[:fade] = np.linspace(0, 1, fade)
-        envelope[-fade:] = np.linspace(1, 0, fade)
-        return (noise * envelope).astype(np.float32)
-    def _apply_envelope(self, audio):
-        """Apply overall envelope to audio"""
-        fade_len = min(len(audio) // 10, 1000)
-        if fade_len > 0:
-            audio[:fade_len] *= np.linspace(0, 1, fade_len)
-            audio[-fade_len:] *= np.linspace(1, 0, fade_len)
         return audio
@@ -543,26 +636,12 @@ class SimpleSynthesizer:
 # ============================================
 print("=" * 50)
-print("Initializing Vedes TTS Model...")
 print("=" * 50)
-# Create model and vocoder
-model = VedesTTS(config)
-vocoder = GriffinLimVocoder(config)
-simple_synth = SimpleSynthesizer(config.sample_rate)
-# Build model
-try:
-    dummy_input = tf.zeros([1, 10], dtype=tf.int32)
-    model.build(input_shape=[None, None])
-    _ = model(dummy_input, training=False)
-    print("Neural model initialized successfully!")
-    USE_NEURAL = True
-except Exception as e:
-    print(f"Neural model init warning: {e}")
-    print("Using simple synthesizer as fallback")
-    USE_NEURAL = False
 print("=" * 50)
@@ -570,42 +649,28 @@ print("=" * 50)
 # SYNTHESIS FUNCTION
 # ============================================
-def synthesize_speech(text, speaking_rate=1.0, pitch_shift=0):
-    """Convert text to speech"""
     if not text or len(text.strip()) == 0:
         return None
-    text = text.strip()[:500]  # Limit length
     try:
-        if USE_NEURAL:
-            # Use neural model
-            text_sequence = text_processor.text_to_sequence(text)
-            if len(text_sequence) == 0:
-                return None
-            text_tensor = tf.constant(text_sequence, dtype=tf.int32)
-            # Generate mel spectrogram
-            mel_output = model.inference_eager(text_tensor, max_steps=config.max_decoder_steps)
-            mel_spectrogram = mel_output[0].numpy()
-            # Apply pitch shift
-            if pitch_shift != 0:
-                mel_spectrogram = mel_spectrogram * (2 ** (pitch_shift / 12))
-            # Convert to audio
-            audio = vocoder.griffin_lim(mel_spectrogram)
-        else:
-            # Use simple synthesizer
-            audio = simple_synth.synthesize(text)
-        # Adjust speaking rate
-        if speaking_rate != 1.0 and len(audio) > 100:
-            target_length = int(len(audio) / speaking_rate)
-            audio = signal.resample(audio, target_length)
-        # Normalize and convert to int16
         audio = np.clip(audio, -1, 1)
         audio_int16 = (audio * 32767).astype(np.int16)
@@ -613,34 +678,32 @@ def synthesize_speech(text, speaking_rate=1.0, pitch_shift=0):
     except Exception as e:
         print(f"Synthesis error: {e}")
-        # Fallback to simple synthesizer
-        try:
-            audio = simple_synth.synthesize(text)
-            audio_int16 = (np.clip(audio, -1, 1) * 32767).astype(np.int16)
-            return (config.sample_rate, audio_int16)
-        except:
-            return None
 # ============================================
 # GRADIO INTERFACE
 # ============================================
-with gr.Blocks(title="Vedes TTS", theme=gr.themes.Soft()) as demo:
     gr.Markdown(
         """
         # 🎙️ Vedes TTS - Text-to-Speech Synthesis
-        ### Built from scratch with TensorFlow
-        Enter text below to convert it to speech!
         """
     )
     with gr.Row():
         with gr.Column(scale=2):
             text_input = gr.Textbox(
-                label="📝 Input Text",
-                placeholder="Type or paste your text here...",
                 lines=4,
                 max_lines=10
             )
@@ -651,18 +714,30 @@ with gr.Blocks(title="Vedes TTS", theme=gr.themes.Soft()) as demo:
                     maximum=2.0,
                     value=1.0,
                     step=0.1,
-                    label="🎚️ Speaking Rate"
                 )
                 pitch_shift = gr.Slider(
-                    minimum=-5,
-                    maximum=5,
                     value=0,
                     step=1,
-                    label="🎵 Pitch Shift"
                 )
-            synthesize_btn = gr.Button("🔊 Synthesize Speech", variant="primary", size="lg")
         with gr.Column(scale=1):
             audio_output = gr.Audio(
@@ -672,14 +747,17 @@ with gr.Blocks(title="Vedes TTS", theme=gr.themes.Soft()) as demo:
     gr.Examples(
         examples=[
-            ["Hello world!"],
-            ["Welcome to Vedes text to speech."],
             ["The quick brown fox jumps over the lazy dog."],
             ["How are you doing today?"],
             ["This is a test of the speech synthesis system."],
         ],
         inputs=text_input,
-        label="📚 Example Texts"
     )
     gr.Markdown(
@@ -687,30 +765,38 @@ with gr.Blocks(title="Vedes TTS", theme=gr.themes.Soft()) as demo:
         ---
         ### ℹ️ About Vedes TTS
-        **Architecture:**
-        - Encoder: Embedding + Conv1D + BiLSTM
-        - Attention: Bahdanau-style attention
-        - Decoder: GRU with Prenet
-        - Postnet: Conv1D refinement
-        - Vocoder: Griffin-Lim
-        Built with TensorFlow 2.x and Gradio
         """
     )
     # Event handlers
     synthesize_btn.click(
         fn=synthesize_speech,
-        inputs=[text_input, speaking_rate, pitch_shift],
         outputs=audio_output
     )
     text_input.submit(
         fn=synthesize_speech,
-        inputs=[text_input, speaking_rate, pitch_shift],
         outputs=audio_output
     )
 # Launch
 if __name__ == "__main__":
     demo.launch()

 import numpy as np
 import gradio as gr
+from scipy import signal
 from scipy.io import wavfile
 import io
+import re
 # ============================================
+# VEDES TTS - Formant-Based Speech Synthesizer
 # ============================================
 class VedesConfig:
+    """Configuration"""
     sample_rate = 22050
 config = VedesConfig()
 # ============================================
+# PHONEME DEFINITIONS
 # ============================================
+# Phoneme to formant mapping (F1, F2, F3, duration_ms, is_voiced)
+PHONEMES = {
+    # Vowels (voiced)
+    'AA': (710, 1100, 2540, 120, True),   # father
+    'AE': (660, 1720, 2410, 120, True),   # cat
+    'AH': (520, 1190, 2390, 100, True),   # but
+    'AO': (570, 840, 2410, 120, True),    # dog
+    'AW': (630, 1200, 2550, 150, True),   # how
+    'AY': (710, 1100, 2540, 150, True),   # my
+    'EH': (530, 1840, 2480, 100, True),   # bed
+    'ER': (490, 1350, 1690, 120, True),   # bird
+    'EY': (450, 2100, 2680, 140, True),   # say
+    'IH': (400, 1920, 2560, 80, True),    # bit
+    'IY': (270, 2290, 3010, 120, True),   # see
+    'OW': (450, 850, 2500, 140, True),    # go
+    'OY': (490, 1350, 2480, 160, True),   # boy
+    'UH': (440, 1020, 2240, 100, True),   # book
+    'UW': (300, 870, 2240, 120, True),    # too
+    # Consonants - Stops
+    'B': (200, 1100, 2150, 60, True),
+    'D': (200, 1600, 2600, 50, True),
+    'G': (200, 1990, 2850, 50, True),
+    'P': (200, 800, 2000, 80, False),
+    'T': (200, 1600, 2600, 70, False),
+    'K': (200, 1990, 2850, 80, False),
+    # Consonants - Fricatives
+    'F': (175, 900, 2400, 100, False),
+    'V': (175, 1100, 2400, 80, True),
+    'TH': (200, 1400, 2200, 80, False),
+    'DH': (200, 1600, 2400, 60, True),
+    'S': (200, 1800, 4000, 100, False),
+    'Z': (200, 1600, 3500, 80, True),
+    'SH': (200, 1800, 2600, 100, False),
+    'ZH': (200, 1800, 2600, 80, True),
+    'HH': (280, 1200, 2400, 80, False),
+    # Consonants - Nasals
+    'M': (280, 900, 2200, 80, True),
+    'N': (280, 1700, 2600, 70, True),
+    'NG': (280, 2300, 2750, 80, True),
+    # Consonants - Liquids
+    'L': (350, 1100, 2700, 70, True),
+    'R': (420, 1300, 1600, 70, True),
+    # Consonants - Glides
+    'W': (300, 870, 2240, 60, True),
+    'Y': (280, 2250, 3000, 50, True),
+    # Special
+    'CH': (200, 1800, 2600, 100, False),
+    'JH': (200, 1800, 2600, 80, True),
+    # Silence
+    'SIL': (0, 0, 0, 100, False),
+    'PAU': (0, 0, 0, 150, False),
+}
+# Letter to phoneme mapping (simplified)
+LETTER_TO_PHONEME = {
+    'a': ['AE'],
+    'b': ['B'],
+    'c': ['K'],
+    'd': ['D'],
+    'e': ['EH'],
+    'f': ['F'],
+    'g': ['G'],
+    'h': ['HH'],
+    'i': ['IH'],
+    'j': ['JH'],
+    'k': ['K'],
+    'l': ['L'],
+    'm': ['M'],
+    'n': ['N'],
+    'o': ['AA'],
+    'p': ['P'],
+    'q': ['K', 'W'],
+    'r': ['R'],
+    's': ['S'],
+    't': ['T'],
+    'u': ['AH'],
+    'v': ['V'],
+    'w': ['W'],
+    'x': ['K', 'S'],
+    'y': ['Y'],
+    'z': ['Z'],
+    ' ': ['SIL'],
+    '.': ['PAU'],
+    ',': ['PAU'],
+    '!': ['PAU'],
+    '?': ['PAU'],
+    '-': ['SIL'],
+    "'": [],
+}
+# Common word pronunciations
+WORD_PRONUNCIATIONS = {
+    'the': ['DH', 'AH'],
+    'a': ['AH'],
+    'an': ['AE', 'N'],
+    'is': ['IH', 'Z'],
+    'are': ['AA', 'R'],
+    'was': ['W', 'AA', 'Z'],
+    'were': ['W', 'ER'],
+    'be': ['B', 'IY'],
+    'been': ['B', 'IH', 'N'],
+    'have': ['HH', 'AE', 'V'],
+    'has': ['HH', 'AE', 'Z'],
+    'had': ['HH', 'AE', 'D'],
+    'do': ['D', 'UW'],
+    'does': ['D', 'AH', 'Z'],
+    'did': ['D', 'IH', 'D'],
+    'will': ['W', 'IH', 'L'],
+    'would': ['W', 'UH', 'D'],
+    'could': ['K', 'UH', 'D'],
+    'should': ['SH', 'UH', 'D'],
+    'can': ['K', 'AE', 'N'],
+    'may': ['M', 'EY'],
+    'might': ['M', 'AY', 'T'],
+    'must': ['M', 'AH', 'S', 'T'],
+    'i': ['AY'],
+    'you': ['Y', 'UW'],
+    'he': ['HH', 'IY'],
+    'she': ['SH', 'IY'],
+    'it': ['IH', 'T'],
+    'we': ['W', 'IY'],
+    'they': ['DH', 'EY'],
+    'this': ['DH', 'IH', 'S'],
+    'that': ['DH', 'AE', 'T'],
+    'what': ['W', 'AH', 'T'],
+    'which': ['W', 'IH', 'CH'],
+    'who': ['HH', 'UW'],
+    'how': ['HH', 'AW'],
+    'when': ['W', 'EH', 'N'],
+    'where': ['W', 'EH', 'R'],
+    'why': ['W', 'AY'],
+    'all': ['AO', 'L'],
+    'each': ['IY', 'CH'],
+    'every': ['EH', 'V', 'R', 'IY'],
+    'both': ['B', 'OW', 'TH'],
+    'few': ['F', 'Y', 'UW'],
+    'more': ['M', 'AO', 'R'],
+    'most': ['M', 'OW', 'S', 'T'],
+    'other': ['AH', 'DH', 'ER'],
+    'some': ['S', 'AH', 'M'],
+    'such': ['S', 'AH', 'CH'],
+    'no': ['N', 'OW'],
+    'not': ['N', 'AA', 'T'],
+    'only': ['OW', 'N', 'L', 'IY'],
+    'same': ['S', 'EY', 'M'],
+    'so': ['S', 'OW'],
+    'than': ['DH', 'AE', 'N'],
+    'too': ['T', 'UW'],
+    'very': ['V', 'EH', 'R', 'IY'],
+    'just': ['JH', 'AH', 'S', 'T'],
+    'hello': ['HH', 'EH', 'L', 'OW'],
+    'hi': ['HH', 'AY'],
+    'welcome': ['W', 'EH', 'L', 'K', 'AH', 'M'],
+    'to': ['T', 'UW'],
+    'world': ['W', 'ER', 'L', 'D'],
+    'speech': ['S', 'P', 'IY', 'CH'],
+    'text': ['T', 'EH', 'K', 'S', 'T'],
+    'voice': ['V', 'OY', 'S'],
+    'sound': ['S', 'AW', 'N', 'D'],
+    'good': ['G', 'UH', 'D'],
+    'great': ['G', 'R', 'EY', 'T'],
+    'nice': ['N', 'AY', 'S'],
+    'thank': ['TH', 'AE', 'NG', 'K'],
+    'thanks': ['TH', 'AE', 'NG', 'K', 'S'],
+    'please': ['P', 'L', 'IY', 'Z'],
+    'yes': ['Y', 'EH', 'S'],
+    'yeah': ['Y', 'AE'],
+    'ok': ['OW', 'K', 'EY'],
+    'okay': ['OW', 'K', 'EY'],
+    'and': ['AE', 'N', 'D'],
+    'or': ['AO', 'R'],
+    'but': ['B', 'AH', 'T'],
+    'if': ['IH', 'F'],
+    'then': ['DH', 'EH', 'N'],
+    'because': ['B', 'IH', 'K', 'AO', 'Z'],
+    'as': ['AE', 'Z'],
+    'until': ['AH', 'N', 'T', 'IH', 'L'],
+    'while': ['W', 'AY', 'L'],
+    'of': ['AH', 'V'],
+    'at': ['AE', 'T'],
+    'by': ['B', 'AY'],
+    'for': ['F', 'AO', 'R'],
+    'with': ['W', 'IH', 'TH'],
+    'about': ['AH', 'B', 'AW', 'T'],
+    'into': ['IH', 'N', 'T', 'UW'],
+    'through': ['TH', 'R', 'UW'],
+    'during': ['D', 'UH', 'R', 'IH', 'NG'],
+    'before': ['B', 'IH', 'F', 'AO', 'R'],
+    'after': ['AE', 'F', 'T', 'ER'],
+    'above': ['AH', 'B', 'AH', 'V'],
+    'below': ['B', 'IH', 'L', 'OW'],
+    'from': ['F', 'R', 'AH', 'M'],
+    'up': ['AH', 'P'],
+    'down': ['D', 'AW', 'N'],
+    'in': ['IH', 'N'],
+    'out': ['AW', 'T'],
+    'on': ['AA', 'N'],
+    'off': ['AO', 'F'],
+    'over': ['OW', 'V', 'ER'],
+    'under': ['AH', 'N', 'D', 'ER'],
+    'again': ['AH', 'G', 'EH', 'N'],
+    'there': ['DH', 'EH', 'R'],
+    'here': ['HH', 'IY', 'R'],
+    'today': ['T', 'AH', 'D', 'EY'],
+    'now': ['N', 'AW'],
+    'my': ['M', 'AY'],
+    'your': ['Y', 'AO', 'R'],
+    'his': ['HH', 'IH', 'Z'],
+    'her': ['HH', 'ER'],
+    'our': ['AW', 'ER'],
+    'their': ['DH', 'EH', 'R'],
+    'test': ['T', 'EH', 'S', 'T'],
+    'testing': ['T', 'EH', 'S', 'T', 'IH', 'NG'],
+    'one': ['W', 'AH', 'N'],
+    'two': ['T', 'UW'],
+    'three': ['TH', 'R', 'IY'],
+    'four': ['F', 'AO', 'R'],
+    'five': ['F', 'AY', 'V'],
+    'name': ['N', 'EY', 'M'],
+    'vedes': ['V', 'IY', 'D', 'EH', 'S'],
+    'synthesis': ['S', 'IH', 'N', 'TH', 'AH', 'S', 'IH', 'S'],
+    'system': ['S', 'IH', 'S', 'T', 'AH', 'M'],
+}
+# Common letter patterns: (letter sequence, phonemes it maps to).
+# Each letter sequence appears once; the converter tries longer sequences
+# before shorter ones, and list order breaks ties between equal lengths.
+PATTERNS = [
+    (r'tion', ['SH', 'AH', 'N']),
+    (r'sion', ['ZH', 'AH', 'N']),
+    (r'ough', ['AH', 'F']),
+    (r'ight', ['AY', 'T']),
+    (r'ould', ['UH', 'D']),
+    (r'th', ['TH']),
+    (r'ch', ['CH']),
+    (r'sh', ['SH']),
+    (r'ph', ['F']),
+    (r'wh', ['W']),
+    (r'ck', ['K']),
+    (r'ng', ['NG']),
+    (r'qu', ['K', 'W']),
+    (r'ee', ['IY']),
+    (r'ea', ['IY']),
+    (r'oo', ['UW']),
+    (r'ou', ['AW']),
+    (r'ow', ['OW']),
+    (r'ai', ['EY']),
+    (r'ay', ['EY']),
+    (r'oy', ['OY']),
+    (r'oi', ['OY']),
+    (r'au', ['AO']),
+    (r'aw', ['AO']),
+    (r'ie', ['IY']),
+    (r'ei', ['EY']),
+    (r'ue', ['UW']),
+    (r'ew', ['UW']),
+]
 # ============================================
+# TEXT TO PHONEME CONVERTER
 # ============================================
+class TextToPhoneme:
+    """Convert text to a flat list of phoneme symbols.
+    Each token is resolved in this order: whitespace -> 'SIL', whole-word
+    dictionary lookup, sentence punctuation -> 'PAU', and finally
+    multi-letter patterns with a single-letter fallback.
+    """
+    def __init__(self):
+        self.word_dict = WORD_PRONUNCIATIONS
+        self.letter_map = LETTER_TO_PHONEME
+        self.patterns = PATTERNS
+    def convert(self, text):
+        """Convert text to phoneme list.
+        Args:
+            text: Input string; it is lower-cased and stripped of leading
+                and trailing whitespace before tokenizing.
+        Returns:
+            List of phoneme symbol strings. Each run of whitespace between
+            tokens contributes one 'SIL'.
+        """
+        text = text.lower().strip()
+        tokens = re.findall(r"[\w']+|[.,!?;:\-]|\s+", text)
+        phonemes = []
+        for token in tokens:
+            # Test for whitespace BEFORE any stripping: stripping first
+            # turns the token into '' and the inter-word pause is lost.
+            if token.isspace():
+                phonemes.append('SIL')
+            elif token in self.word_dict:
+                phonemes.extend(self.word_dict[token])
+            elif token in '.,!?;:':
+                phonemes.append('PAU')
+            else:
+                # Convert letter by letter with pattern matching
+                phonemes.extend(self._convert_word(token))
+        return phonemes
+    def _convert_word(self, word):
+        """Convert a single word to phonemes.
+        At each position the longest matching pattern is used; if none
+        matches, the single character is looked up in the letter map.
+        Characters missing from the letter map are skipped.
+        """
+        word = word.lower()
+        # Sort once per word instead of once per character. sorted() is
+        # stable, so equal-length patterns keep their list order. Empty
+        # patterns are dropped because they would never advance the index.
+        ordered = sorted(
+            (entry for entry in self.patterns if entry[0]),
+            key=lambda entry: -len(entry[0]),
+        )
+        phonemes = []
+        i = 0
+        while i < len(word):
+            for pattern, phon_list in ordered:
+                if word.startswith(pattern, i):
+                    phonemes.extend(phon_list)
+                    i += len(pattern)
+                    break
+            else:
+                # No pattern matched here: single letter conversion
+                phonemes.extend(self.letter_map.get(word[i], []))
+                i += 1
+        return phonemes
+# ============================================
+# FORMANT SYNTHESIZER
+# ============================================
+class FormantSynthesizer:
+    """Klatt-style formant synthesizer"""
+    def __init__(self, sample_rate=22050):
+        self.sample_rate = sample_rate
+        self.base_f0 = 120  # Base fundamental frequency
+    def synthesize(self, phonemes, speaking_rate=1.0, pitch_shift=0):
+        """Synthesize audio from phoneme sequence"""
+        if not phonemes:
+            return np.zeros(1000, dtype=np.float32)
+        # Adjust pitch
+        f0 = self.base_f0 * (2 ** (pitch_shift / 12))
+        audio_segments = []
+        for i, phoneme in enumerate(phonemes):
+            if phoneme not in PHONEMES:
+                continue
+            f1, f2, f3, duration_ms, is_voiced = PHONEMES[phoneme]
+            # Adjust duration for speaking rate
+            duration_ms = int(duration_ms / speaking_rate)
+            duration_ms = max(30, min(duration_ms, 300))
+            # Generate phoneme audio
+            segment = self._generate_phoneme(
+                f0, f1, f2, f3, duration_ms, is_voiced, phoneme
+            )
+            audio_segments.append(segment)
+        if not audio_segments:
+            return np.zeros(1000, dtype=np.float32)
+        # Concatenate with smoothing
+        audio = self._concatenate_smooth(audio_segments)
+        # Apply overall envelope and normalization
+        audio = self._apply_envelope(audio)
+        audio = audio / (np.max(np.abs(audio)) + 1e-8)
+        return audio.astype(np.float32)
+    def _generate_phoneme(self, f0, f1, f2, f3, duration_ms, is_voiced, phoneme):
+        """Generate audio for a single phoneme"""
+        n_samples = int(self.sample_rate * duration_ms / 1000)
+        t = np.linspace(0, duration_ms / 1000, n_samples)
+        if phoneme in ['SIL', 'PAU']:
+            return np.zeros(n_samples, dtype=np.float32)
+        if is_voiced:
+            # Generate glottal pulse train
+            source = self._generate_voice_source(t, f0)
+        else:
+            # Generate noise for unvoiced
+            source = np.random.randn(n_samples) * 0.3
+        # Apply formant filtering
+        if f1 > 0:
+            audio = self._apply_formants(source, [f1, f2, f3])
+        else:
+            audio = source
+        # Apply consonant characteristics
+        audio = self._apply_consonant_shape(audio, phoneme)
+        # Apply envelope
+        audio = self._apply_phoneme_envelope(audio, phoneme)
+        return audio.astype(np.float32)
+    def _generate_voice_source(self, t, f0):
+        """Generate glottal source with harmonics"""
+        source = np.zeros_like(t)
+        # Add harmonics with decreasing amplitude
+        for harmonic in range(1, 12):
+            freq = f0 * harmonic
+            if freq > self.sample_rate / 2:
+                break
+            amp = 1.0 / (harmonic ** 1.2)
+            # Add slight vibrato
+            vibrato = 1 + 0.01 * np.sin(2 * np.pi * 5 * t)
+            source += amp * np.sin(2 * np.pi * freq * vibrato * t)
+        # Add some noise for naturalness
+        source += np.random.randn(len(t)) * 0.02
+        return source
+    def _apply_formants(self, source, formants):
+        """Apply formant filtering using resonators"""
+        audio = source.copy()
+        for i, f in enumerate(formants):
+            if f <= 0 or f >= self.sample_rate / 2:
+                continue
+            # Bandwidth increases with formant number
+            bandwidth = 60 + i * 40
+            # Design bandpass filter
+            try:
+                low = max(20, f - bandwidth)
+                high = min(self.sample_rate / 2 - 100, f + bandwidth)
+                if low >= high:
+                    continue
+                b, a = signal.butter(
+                    2,
+                    [low / (self.sample_rate / 2), high / (self.sample_rate / 2)],
+                    btype='band'
+                )
+                filtered = signal.filtfilt(b, a, source)
+                # Weight formants (F1 strongest)
+                weight = 1.0 / (i + 1)
+                audio = audio + filtered * weight
+            except Exception:
+                pass
+        return audio
+    def _apply_consonant_shape(self, audio, phoneme):
+        """Apply consonant-specific characteristics"""
+        n = len(audio)
+        # Plosives: silence then burst
+        if phoneme in ['P', 'T', 'K', 'B', 'D', 'G']:
+            silence_len = n // 3
+            audio[:silence_len] = 0
+            burst = np.random.randn(n // 6) * 0.5
+            audio[silence_len:silence_len + len(burst)] += burst
+        # Fricatives: add more noise
+        elif phoneme in ['F', 'S', 'SH', 'TH', 'HH']:
+            noise = np.random.randn(n) * 0.3
+            # High-pass for 's' and 'sh'
+            if phoneme in ['S', 'SH']:
+                try:
+                    b, a = signal.butter(2, 3000 / (self.sample_rate / 2), btype='high')
+                    noise = signal.filtfilt(b, a, noise)
+                except:
+                    pass
+            audio = audio * 0.3 + noise * 0.7
+        # Nasals: add low frequency resonance
+        elif phoneme in ['M', 'N', 'NG']:
+            try:
+                b, a = signal.butter(2, 500 / (self.sample_rate / 2), btype='low')
+                low_comp = signal.filtfilt(b, a, audio)
+                audio = audio * 0.5 + low_comp * 0.5
+            except:
+                pass
+        return audio
+    def _apply_phoneme_envelope(self, audio, phoneme):
+        """Apply amplitude envelope to phoneme"""
+        n = len(audio)
+        if n < 4:
+            return audio
+        envelope = np.ones(n)
+        # Attack and release times depend on phoneme type
+        if phoneme in ['P', 'T', 'K', 'B', 'D', 'G']:
+            # Plosives: sharp attack
+            attack = max(1, n // 8)
+            release = max(1, n // 4)
+        elif phoneme in ['F', 'S', 'SH', 'V', 'Z', 'ZH', 'TH', 'DH']:
+            # Fricatives: gradual
+            attack = max(1, n // 4)
+            release = max(1, n // 4)
+        else:
+            # Vowels and sonorants
+            attack = max(1, n // 5)
+            release = max(1, n // 5)
+        envelope[:attack] = np.linspace(0, 1, attack)
+        envelope[-release:] = np.linspace(1, 0, release)
+        return audio * envelope
+    def _concatenate_smooth(self, segments):
+        """Concatenate segments with crossfade"""
+        if len(segments) == 0:
+            return np.zeros(1000, dtype=np.float32)
+        if len(segments) == 1:
+            return segments[0]
+        # Calculate total length with overlap
+        overlap = 64
+        total_length = sum(len(s) for s in segments) - overlap * (len(segments) - 1)
+        total_length = max(total_length, 1)
+        audio = np.zeros(total_length, dtype=np.float32)
+        pos = 0
+        for i, segment in enumerate(segments):
+            if len(segment) == 0:
+                continue
+            end_pos = min(pos + len(segment), total_length)
+            seg_len = end_pos - pos
+            if seg_len <= 0:
+                break
+            # Crossfade with previous segment
+            if i > 0 and pos > 0:
+                fade_len = min(overlap, seg_len, pos)
+                if fade_len > 0:
+                    fade_in = np.linspace(0, 1, fade_len)
+                    fade_out = np.linspace(1, 0, fade_len)
+                    audio[pos:pos + fade_len] *= fade_out
+                    segment_copy = segment[:seg_len].copy()
+                    segment_copy[:fade_len] *= fade_in
+                    audio[pos:end_pos] += segment_copy
+                else:
+                    audio[pos:end_pos] = segment[:seg_len]
+            else:
+                audio[pos:end_pos] = segment[:seg_len]
+            pos = end_pos - overlap
+            pos = max(0, pos)
+        return audio
+    def _apply_envelope(self, audio):
+        """Apply overall envelope"""
+        n = len(audio)
+        if n < 100:
+            return audio
+        fade_len = min(n // 20, 500)
+        audio[:fade_len] *= np.linspace(0, 1, fade_len)
+        audio[-fade_len:] *= np.linspace(1, 0, fade_len)
         return audio
 # ============================================
+# VEDES TTS MAIN CLASS
 # ============================================
class VedesTTS:
    """Main TTS class: text -> phonemes -> waveform."""

    def __init__(self, sample_rate=22050):
        self.sample_rate = sample_rate
        self.text_to_phoneme = TextToPhoneme()
        self.synthesizer = FormantSynthesizer(sample_rate)

    def synthesize(self, text, speaking_rate=1.0, pitch_shift=0):
        """Convert text to speech.

        Args:
            text: Text to speak.  ``None`` and blank strings are accepted.
            speaking_rate: Passed through to the synthesizer.
            pitch_shift: Passed through to the synthesizer.

        Returns:
            The synthesizer's audio array, or ``sample_rate`` samples (one
            second) of float32 silence when there is nothing to speak.
        """
        # Nothing to say: answer with silence instead of failing on None.
        if text is None or not text.strip():
            return np.zeros(self.sample_rate, dtype=np.float32)

        # Text to phonemes
        phonemes = self.text_to_phoneme.convert(text)
        if not phonemes:
            return np.zeros(self.sample_rate, dtype=np.float32)

        # Phonemes to audio
        return self.synthesizer.synthesize(phonemes, speaking_rate, pitch_shift)
 # ============================================
 # Create the module-level VedesTTS engine (the `tts` used by synthesize_speech), printing console banners around it.
 print("=" * 50)
+print("🎙️ Initializing Vedes TTS...")
 print("=" * 50)
+tts = VedesTTS(config.sample_rate)
+print("✅ Vedes TTS initialized successfully!")
 print("=" * 50)
 # SYNTHESIS FUNCTION
 # ============================================
def synthesize_speech(text, speaking_rate=1.0, pitch_shift=0, voice_type="neutral"):
    """Main synthesis function for Gradio.

    Args:
        text: Text to speak; only the first 1000 characters are used.
        speaking_rate: Speaking-rate factor forwarded to the engine.
        pitch_shift: Pitch offset forwarded to the engine.
        voice_type: "high" adds 5 to the pitch offset, "low" subtracts 5;
            any other value leaves it unchanged.

    Returns:
        ``(sample_rate, samples)`` with ``samples`` as an int16 array, which
        is the tuple form a Gradio audio output accepts, or ``None`` when the
        text is blank, the result is shorter than 100 samples, or synthesis
        fails.
    """
    if not text or not text.strip():
        return None

    text = text.strip()[:1000]  # Limit length

    # Adjust base pitch for voice type
    pitch_adjust = pitch_shift + {"high": 5, "low": -5}.get(voice_type, 0)

    try:
        audio = tts.synthesize(text, speaking_rate, pitch_adjust)
        if len(audio) < 100:
            return None

        # Convert to int16 and hand the result back to the UI.
        audio_int16 = (np.clip(audio, -1, 1) * 32767).astype(np.int16)
        return tts.sample_rate, audio_int16
    except Exception as e:
        # UI boundary: report the failure and show no audio rather than
        # surfacing a traceback in the interface.
        print(f"Synthesis error: {e}")
        return None
 # ============================================
 # GRADIO INTERFACE
 # ============================================
+with gr.Blocks(
+    title="Vedes TTS",
+    theme=gr.themes.Soft(primary_hue="blue", secondary_hue="purple")
+) as demo:
     # Page header, written in Markdown.
     gr.Markdown(
         """
         # 🎙️ Vedes TTS - Text-to-Speech Synthesis
+        ### A formant-based speech synthesizer built from scratch
+        Type any text below and hear it spoken!
         """
     )
     # One row with two columns: the wider one (scale=2) holds the input controls,
     # the narrower one (scale=1) holds the audio output.
     with gr.Row():
         with gr.Column(scale=2):
             text_input = gr.Textbox(
+                label="📝 Enter Text",
+                placeholder="Type something to synthesize... (e.g., 'Hello, welcome to Vedes!')",
                 lines=4,
                 max_lines=10
             )
                     maximum=2.0,
                     value=1.0,
                     step=0.1,
+                    label="🎚️ Speaking Rate",
+                    info="Slower ← → Faster"
                 )
                 # Pitch offset passed to synthesize_speech; FormantSynthesizer scales the
                 # base frequency by 2 ** (shift / 12), i.e. the value is in semitones.
                 pitch_shift = gr.Slider(
+                    minimum=-10,
+                    maximum=10,
                     value=0,
                     step=1,
+                    label="🎵 Pitch Shift",
+                    info="Lower ← → Higher"
                 )
             # Voice preset: synthesize_speech adds 5 to the pitch shift for "high" and subtracts 5 for "low".
+            voice_type = gr.Radio(
+                choices=["neutral", "high", "low"],
+                value="neutral",
+                label="🗣️ Voice Type"
+            )
+            synthesize_btn = gr.Button(
+                "🔊 Synthesize Speech",
+                variant="primary",
+                size="lg"
+            )
         with gr.Column(scale=1):
             audio_output = gr.Audio(
     # Sample sentences bound to the text_input box.
     gr.Examples(
         examples=[
+            ["Hello, welcome to Vedes text to speech!"],
             ["The quick brown fox jumps over the lazy dog."],
             ["How are you doing today?"],
             ["This is a test of the speech synthesis system."],
+            ["Good morning! Nice to meet you."],
+            ["One, two, three, four, five."],
+            ["Please say hello to my friend."],
+            ["What is your name?"],
         ],
         inputs=text_input,
+        label="📚 Try These Examples"
     )
     gr.Markdown(
         ---
         ### ℹ️ About Vedes TTS
+        **How it works:**
+        1. **Text Processing** - Converts text to phonemes using pronunciation rules
+        2. **Formant Synthesis** - Generates speech using formant frequencies (F1, F2, F3)
+        3. **Source-Filter Model** - Combines glottal source with vocal tract filtering
+        **Features:**
+        - 🔤 Letter-to-phoneme conversion with common word dictionary
+        - 🎵 Adjustable pitch and speaking rate
+        - 🗣️ Multiple voice types (neutral, high, low pitch)
+        - ⚡ Real-time synthesis - no neural network required!
+        **Supported:** English text with basic punctuation
+        ---
+        *Built with Python, NumPy, SciPy, and Gradio* ❤️
         """
     )
     # Event handlers
     synthesize_btn.click(
         fn=synthesize_speech,
+        inputs=[text_input, speaking_rate, pitch_shift, voice_type],
         outputs=audio_output
     )
     text_input.submit(
         fn=synthesize_speech,
+        inputs=[text_input, speaking_rate, pitch_shift, voice_type],
         outputs=audio_output
     )
 # Launch the Gradio app only when this file is executed as a script, not when it is imported.
 if __name__ == "__main__":
     demo.launch()