Update app.py
Browse files
app.py
CHANGED
@@ -1,70 +1,680 @@
Removed: the previous app.py was a streaming chat demo built on `huggingface_hub`'s InferenceClient. Among the deleted lines are the chat handler's parameters and body:

    message,
    history: list[dict[str, str]],
    system_message,
    max_tokens,
    temperature,
    top_p,
    hf_token: gr.OAuthToken,
):
    """
    For more information on `huggingface_hub` Inference API support, please check the docs: https://huggingface.co/docs/huggingface_hub/v0.22.2/en/guides/inference
    """
    client = InferenceClient(token=hf_token.token, model="openai/gpt-oss-20b")
        yield response
    demo.launch()

together with the rest of the old demo definition. Only `import gradio as gr` and the `if __name__ == "__main__":` guard carry over into the new file below.
Added: the new app.py defines the Vedes TTS model from scratch in TensorFlow (encoder, location-sensitive attention, autoregressive decoder, postnet, Griffin-Lim vocoder) and wraps it in a Gradio UI:

import tensorflow as tf
import numpy as np
import gradio as gr
import scipy.signal as signal
from scipy.io import wavfile
import io
import os


# ============================================
# VEDES TTS - Text-to-Speech Model from Scratch
# ============================================

# Audio Configuration
class VedesConfig:
    """Configuration for Vedes TTS Model"""
    # Audio parameters
    sample_rate = 22050
    n_fft = 1024
    hop_length = 256
    win_length = 1024
    n_mels = 80
    fmin = 0
    fmax = 8000

    # Model parameters
    embedding_dim = 256
    encoder_dim = 256
    decoder_dim = 512
    attention_dim = 128
    prenet_dim = 256
    postnet_dim = 512
    postnet_layers = 5
    max_decoder_steps = 1000

    # Text parameters
    vocab = "abcdefghijklmnopqrstuvwxyz .,!?'-"
    vocab_size = len(vocab) + 1  # +1 for padding

config = VedesConfig()
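For orientation, hop_length fixes the time resolution of the mel frames the decoder emits; a small illustrative calculation (not part of the file itself):

    frame_seconds = config.hop_length / config.sample_rate      # 256 / 22050 ≈ 0.0116 s per mel frame
    frames_per_second = config.sample_rate / config.hop_length  # ≈ 86 mel frames per second of audio
    # synthesize_speech below budgets 20 decoder steps per input character,
    # i.e. roughly 20 * 0.0116 ≈ 0.23 s of audio per character.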
# ============================================
# TEXT PROCESSING
# ============================================

class TextProcessor:
    """Text to sequence converter"""

    def __init__(self, vocab):
        self.vocab = vocab
        self.char_to_idx = {char: idx + 1 for idx, char in enumerate(vocab)}
        self.idx_to_char = {idx + 1: char for idx, char in enumerate(vocab)}
        self.idx_to_char[0] = '<pad>'

    def text_to_sequence(self, text):
        """Convert text to sequence of integers"""
        text = text.lower()
        sequence = []
        for char in text:
            if char in self.char_to_idx:
                sequence.append(self.char_to_idx[char])
        return np.array(sequence, dtype=np.int32)

    def sequence_to_text(self, sequence):
        """Convert sequence back to text"""
        return ''.join([self.idx_to_char.get(idx, '') for idx in sequence])

text_processor = TextProcessor(config.vocab)
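A quick sketch of how this mapping behaves (illustrative strings, not part of the committed file); characters outside the 33-symbol vocabulary are silently dropped, so anything beyond lowercased letters and basic punctuation simply disappears from the sequence:

    seq = text_processor.text_to_sequence("Hello, world!")  # array of ints in [1, 33]
    text_processor.sequence_to_text(seq)                    # -> "hello, world!"
    text_processor.text_to_sequence("42")                   # -> empty array: digits are not in the vocab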
# ============================================
# VEDES TTS MODEL COMPONENTS
# ============================================

class VedesPrenet(tf.keras.layers.Layer):
    """Prenet: 2-layer FC with dropout"""

    def __init__(self, units, **kwargs):
        super().__init__(**kwargs)
        self.dense1 = tf.keras.layers.Dense(units, activation='relu')
        self.dense2 = tf.keras.layers.Dense(units, activation='relu')
        self.dropout1 = tf.keras.layers.Dropout(0.5)
        self.dropout2 = tf.keras.layers.Dropout(0.5)

    def call(self, inputs, training=True):
        x = self.dropout1(self.dense1(inputs), training=training)
        x = self.dropout2(self.dense2(x), training=training)
        return x


class VedesEncoder(tf.keras.layers.Layer):
    """Encoder: Embedding + Conv layers + BiLSTM"""

    def __init__(self, vocab_size, embed_dim, encoder_dim, **kwargs):
        super().__init__(**kwargs)
        self.embedding = tf.keras.layers.Embedding(vocab_size, embed_dim)

        # 3 Conv layers with batch norm
        self.conv_layers = []
        self.batch_norms = []
        for i in range(3):
            self.conv_layers.append(
                tf.keras.layers.Conv1D(encoder_dim, 5, padding='same', activation='relu')
            )
            self.batch_norms.append(tf.keras.layers.BatchNormalization())

        self.dropout = tf.keras.layers.Dropout(0.5)

        # Bidirectional LSTM
        self.bilstm = tf.keras.layers.Bidirectional(
            tf.keras.layers.LSTM(encoder_dim // 2, return_sequences=True)
        )

    def call(self, inputs, training=True):
        x = self.embedding(inputs)

        for conv, bn in zip(self.conv_layers, self.batch_norms):
            x = conv(x)
            x = bn(x, training=training)
            x = self.dropout(x, training=training)

        x = self.bilstm(x)
        return x


class VedesAttention(tf.keras.layers.Layer):
    """Location-Sensitive Attention"""

    def __init__(self, attention_dim, **kwargs):
        super().__init__(**kwargs)
        self.attention_dim = attention_dim
        self.query_layer = tf.keras.layers.Dense(attention_dim, use_bias=False)
        self.memory_layer = tf.keras.layers.Dense(attention_dim, use_bias=False)
        self.location_conv = tf.keras.layers.Conv1D(32, 31, padding='same')
        self.location_dense = tf.keras.layers.Dense(attention_dim, use_bias=False)
        self.v = tf.keras.layers.Dense(1, use_bias=False)

    def call(self, query, memory, prev_attention):
        """
        query: decoder hidden state [batch, decoder_dim]
        memory: encoder outputs [batch, seq_len, encoder_dim]
        prev_attention: previous attention weights [batch, seq_len]
        """
        # Process query
        processed_query = self.query_layer(tf.expand_dims(query, 1))

        # Process memory
        processed_memory = self.memory_layer(memory)

        # Process location
        prev_attention_expanded = tf.expand_dims(prev_attention, -1)
        location_features = self.location_conv(prev_attention_expanded)
        processed_location = self.location_dense(location_features)

        # Compute attention scores
        scores = self.v(tf.nn.tanh(
            processed_query + processed_memory + processed_location
        ))
        scores = tf.squeeze(scores, -1)

        # Softmax to get attention weights
        attention_weights = tf.nn.softmax(scores, axis=-1)

        # Compute context vector
        context = tf.reduce_sum(
            tf.expand_dims(attention_weights, -1) * memory, axis=1
        )

        return context, attention_weights
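The attention block is easiest to read through its shapes; a minimal zero-tensor check (illustrative only, dimensions taken from VedesConfig):

    attn = VedesAttention(config.attention_dim)
    query = tf.zeros([2, config.decoder_dim])       # decoder LSTM output, [batch, 512]
    memory = tf.zeros([2, 37, config.encoder_dim])  # encoder outputs, [batch, chars, 256]
    prev = tf.zeros([2, 37])                        # previous attention weights, [batch, chars]
    context, weights = attn(query, memory, prev)
    # context.shape == (2, 256): per-example weighted sum over encoder steps
    # weights.shape == (2, 37):  one softmax-normalized weight per input character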
class VedesDecoder(tf.keras.layers.Layer):
    """Autoregressive Decoder"""

    def __init__(self, decoder_dim, n_mels, prenet_dim, attention_dim, **kwargs):
        super().__init__(**kwargs)
        self.n_mels = n_mels
        self.decoder_dim = decoder_dim

        self.prenet = VedesPrenet(prenet_dim)
        self.attention = VedesAttention(attention_dim)

        # Decoder LSTM cells
        self.lstm1 = tf.keras.layers.LSTMCell(decoder_dim)
        self.lstm2 = tf.keras.layers.LSTMCell(decoder_dim)

        # Output projections
        self.mel_dense = tf.keras.layers.Dense(n_mels)
        self.stop_dense = tf.keras.layers.Dense(1)

    def get_initial_state(self, batch_size, encoder_seq_len):
        """Initialize decoder states"""
        return {
            'lstm1_state': [
                tf.zeros([batch_size, self.decoder_dim]),
                tf.zeros([batch_size, self.decoder_dim])
            ],
            'lstm2_state': [
                tf.zeros([batch_size, self.decoder_dim]),
                tf.zeros([batch_size, self.decoder_dim])
            ],
            'attention_weights': tf.zeros([batch_size, encoder_seq_len]),
            # Initial attention context. Its width must equal the encoder output
            # size (encoder_dim, i.e. decoder_dim // 2 with the settings above),
            # otherwise the first decode step builds the LSTMs with a different
            # input width than every later step.
            'context': tf.zeros([batch_size, self.decoder_dim // 2])
        }

    def decode_step(self, decoder_input, encoder_outputs, state, training=True):
        """Single decoder step"""
        # Prenet
        prenet_out = self.prenet(decoder_input, training=training)

        # Concatenate with context
        lstm1_input = tf.concat([prenet_out, state['context']], axis=-1)

        # First LSTM
        lstm1_out, new_lstm1_state = self.lstm1(lstm1_input, state['lstm1_state'])

        # Attention
        context, attention_weights = self.attention(
            lstm1_out, encoder_outputs, state['attention_weights']
        )

        # Second LSTM
        lstm2_input = tf.concat([lstm1_out, context], axis=-1)
        lstm2_out, new_lstm2_state = self.lstm2(lstm2_input, state['lstm2_state'])

        # Output projections
        decoder_output = tf.concat([lstm2_out, context], axis=-1)
        mel_output = self.mel_dense(decoder_output)
        stop_output = self.stop_dense(decoder_output)

        # Update state
        new_state = {
            'lstm1_state': list(new_lstm1_state),
            'lstm2_state': list(new_lstm2_state),
            'attention_weights': attention_weights,
            'context': context
        }

        return mel_output, stop_output, new_state
class VedesPostnet(tf.keras.layers.Layer):
    """Postnet: 5 Conv layers to refine mel spectrogram"""

    def __init__(self, n_mels, postnet_dim, num_layers=5, **kwargs):
        super().__init__(**kwargs)
        self.conv_layers = []
        self.batch_norms = []

        for i in range(num_layers):
            in_channels = n_mels if i == 0 else postnet_dim
            out_channels = n_mels if i == num_layers - 1 else postnet_dim
            activation = None if i == num_layers - 1 else 'tanh'

            self.conv_layers.append(
                tf.keras.layers.Conv1D(out_channels, 5, padding='same', activation=activation)
            )
            self.batch_norms.append(tf.keras.layers.BatchNormalization())

        self.dropout = tf.keras.layers.Dropout(0.5)

    def call(self, inputs, training=True):
        x = inputs
        for conv, bn in zip(self.conv_layers, self.batch_norms):
            x = conv(x)
            x = bn(x, training=training)
            x = self.dropout(x, training=training)
        return inputs + x


class VedesTTS(tf.keras.Model):
    """Complete Vedes TTS Model"""

    def __init__(self, config, **kwargs):
        super().__init__(**kwargs)
        self.config = config

        self.encoder = VedesEncoder(
            config.vocab_size,
            config.embedding_dim,
            config.encoder_dim
        )

        self.decoder = VedesDecoder(
            config.decoder_dim,
            config.n_mels,
            config.prenet_dim,
            config.attention_dim
        )

        self.postnet = VedesPostnet(
            config.n_mels,
            config.postnet_dim,
            config.postnet_layers
        )

    def call(self, inputs, training=True):
        """Forward pass for training"""
        text_inputs, mel_targets = inputs
        batch_size = tf.shape(text_inputs)[0]

        # Encode
        encoder_outputs = self.encoder(text_inputs, training=training)
        encoder_seq_len = tf.shape(encoder_outputs)[1]

        # Initialize decoder
        state = self.decoder.get_initial_state(batch_size, encoder_seq_len)

        # Teacher forcing
        mel_outputs = []
        stop_outputs = []

        # Start with zeros
        decoder_input = tf.zeros([batch_size, self.config.n_mels])

        for t in range(tf.shape(mel_targets)[1]):
            mel_out, stop_out, state = self.decoder.decode_step(
                decoder_input, encoder_outputs, state, training=training
            )
            mel_outputs.append(mel_out)
            stop_outputs.append(stop_out)

            # Teacher forcing
            decoder_input = mel_targets[:, t, :]

        mel_outputs = tf.stack(mel_outputs, axis=1)
        stop_outputs = tf.stack(stop_outputs, axis=1)

        # Postnet
        mel_outputs_postnet = self.postnet(mel_outputs, training=training)

        return mel_outputs, mel_outputs_postnet, stop_outputs

    @tf.function(reduce_retracing=True)
    def inference(self, text_sequence, max_steps=1000):
        """Inference mode - autoregressive generation"""
        text_sequence = tf.expand_dims(text_sequence, 0)
        batch_size = 1

        # Encode
        encoder_outputs = self.encoder(text_sequence, training=False)
        encoder_seq_len = tf.shape(encoder_outputs)[1]

        # Initialize
        state = self.decoder.get_initial_state(batch_size, encoder_seq_len)
        decoder_input = tf.zeros([batch_size, self.config.n_mels])

        mel_outputs = tf.TensorArray(dtype=tf.float32, size=0, dynamic_size=True)

        for step in tf.range(max_steps):
            mel_out, stop_out, state = self.decoder.decode_step(
                decoder_input, encoder_outputs, state, training=False
            )

            mel_outputs = mel_outputs.write(step, mel_out[0])
            decoder_input = mel_out

            # Check stop token
            if tf.nn.sigmoid(stop_out[0, 0]) > 0.5:
                break

        mel_outputs = mel_outputs.stack()
        mel_outputs = tf.expand_dims(mel_outputs, 0)

        # Postnet refinement
        mel_outputs = self.postnet(mel_outputs, training=False)

        return mel_outputs[0]
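Before moving on to the vocoder, a shape sketch of the teacher-forced training path (illustrative only; the instance here is freshly initialized and untrained):

    tts = VedesTTS(config)
    texts = tf.zeros([2, 12], dtype=tf.int32)   # a padded batch of character IDs
    mels = tf.zeros([2, 40, config.n_mels])     # ground-truth mel targets, 40 frames each
    mel_pre, mel_post, stop_logits = tts([texts, mels], training=True)
    # mel_pre.shape     == (2, 40, 80): decoder output before the postnet
    # mel_post.shape    == (2, 40, 80): after the residual postnet refinement
    # stop_logits.shape == (2, 40, 1):  per-frame stop-token logits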
# ============================================
# GRIFFIN-LIM VOCODER
# ============================================

class GriffinLimVocoder:
    """Griffin-Lim algorithm for mel spectrogram to audio"""

    def __init__(self, config):
        self.config = config
        self.mel_basis = self._create_mel_filterbank()

    def _create_mel_filterbank(self):
        """Create mel filterbank matrix"""
        n_fft = self.config.n_fft
        n_mels = self.config.n_mels
        sample_rate = self.config.sample_rate
        fmin = self.config.fmin
        fmax = self.config.fmax

        # Mel frequencies
        mel_fmin = self._hz_to_mel(fmin)
        mel_fmax = self._hz_to_mel(fmax)
        mel_points = np.linspace(mel_fmin, mel_fmax, n_mels + 2)
        hz_points = self._mel_to_hz(mel_points)

        # FFT bins
        bin_points = np.floor((n_fft + 1) * hz_points / sample_rate).astype(int)

        # Create filterbank
        filterbank = np.zeros((n_mels, n_fft // 2 + 1))

        for i in range(n_mels):
            left = bin_points[i]
            center = bin_points[i + 1]
            right = bin_points[i + 2]

            for j in range(left, center):
                if center != left:
                    filterbank[i, j] = (j - left) / (center - left)

            for j in range(center, right):
                if right != center:
                    filterbank[i, j] = (right - j) / (right - center)

        return filterbank

    def _hz_to_mel(self, hz):
        return 2595 * np.log10(1 + hz / 700)

    def _mel_to_hz(self, mel):
        return 700 * (10 ** (mel / 2595) - 1)

    def mel_to_linear(self, mel_spec):
        """Convert mel spectrogram to linear spectrogram"""
        mel_basis_pinv = np.linalg.pinv(self.mel_basis)
        linear = np.maximum(1e-10, np.dot(mel_spec, mel_basis_pinv.T))
        return linear

    def griffin_lim(self, spectrogram, n_iter=60):
        """Griffin-Lim algorithm"""
        # Denormalize
        spectrogram = np.exp(spectrogram)

        # Convert mel to linear
        linear_spec = self.mel_to_linear(spectrogram)

        # Initialize phase randomly
        angles = np.exp(2j * np.pi * np.random.rand(*linear_spec.shape))
        complex_spec = linear_spec * angles

        # Iterate
        for _ in range(n_iter):
            # Inverse STFT
            audio = self._istft(complex_spec)

            # Forward STFT
            complex_spec = self._stft(audio)

            # Keep magnitude, update phase
            angles = np.exp(1j * np.angle(complex_spec))
            complex_spec = linear_spec * angles

        # Final inverse STFT
        audio = self._istft(complex_spec)

        # Normalize
        audio = audio / (np.max(np.abs(audio)) + 1e-8)

        return audio.astype(np.float32)

    def _stft(self, audio):
        """Short-time Fourier transform"""
        return np.array([
            np.fft.rfft(
                audio[i:i + self.config.win_length] *
                np.hanning(self.config.win_length)
            )
            for i in range(0, len(audio) - self.config.win_length, self.config.hop_length)
        ])

    def _istft(self, complex_spec):
        """Inverse short-time Fourier transform"""
        n_frames = complex_spec.shape[0]
        expected_len = self.config.hop_length * n_frames + self.config.win_length
        audio = np.zeros(expected_len)
        window = np.hanning(self.config.win_length)

        for i, frame in enumerate(complex_spec):
            start = i * self.config.hop_length
            audio[start:start + self.config.win_length] += np.real(
                np.fft.irfft(frame, self.config.win_length)
            ) * window

        return audio


# ============================================
# INITIALIZE MODEL AND VOCODER
# ============================================

print("Initializing Vedes TTS Model...")
model = VedesTTS(config)
vocoder = GriffinLimVocoder(config)

# Build model with dummy input
dummy_text = tf.zeros([1, 10], dtype=tf.int32)
dummy_mel = tf.zeros([1, 50, config.n_mels])
_ = model([dummy_text, dummy_mel], training=False)

print("Model initialized successfully!")
print(f"Total parameters: {model.count_params():,}")
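With untrained weights the result is only noise-like audio, but the plumbing can be exercised end to end; a sketch of what synthesize_speech below does under the hood (illustrative text and step count):

    seq = tf.constant(text_processor.text_to_sequence("hello world"), dtype=tf.int32)
    mel = model.inference(seq, max_steps=100)   # [frames, 80] mel spectrogram
    wave = vocoder.griffin_lim(mel.numpy())     # float32 waveform in [-1, 1] at 22050 Hz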
# ============================================
# SYNTHESIS FUNCTION
# ============================================

def synthesize_speech(text, speaking_rate=1.0, pitch_shift=0):
    """
    Convert text to speech

    Args:
        text: Input text string
        speaking_rate: Speed of speech (0.5 - 2.0)
        pitch_shift: Pitch adjustment in semitones (-5 to 5)

    Returns:
        tuple: (sample_rate, audio_array)
    """
    if not text or len(text.strip()) == 0:
        return None

    try:
        # Clean and process text
        text = text.strip().lower()

        # Convert text to sequence
        text_sequence = text_processor.text_to_sequence(text)

        if len(text_sequence) == 0:
            return None

        text_tensor = tf.constant(text_sequence, dtype=tf.int32)

        # Generate mel spectrogram
        max_steps = int(len(text_sequence) * 20 / speaking_rate)
        max_steps = min(max_steps, config.max_decoder_steps)

        mel_spectrogram = model.inference(text_tensor, max_steps=max_steps)
        mel_spectrogram = mel_spectrogram.numpy()

        # Apply pitch shift (simple frequency scaling)
        if pitch_shift != 0:
            shift_factor = 2 ** (pitch_shift / 12)
            mel_spectrogram = mel_spectrogram * shift_factor

        # Convert to audio using Griffin-Lim
        audio = vocoder.griffin_lim(mel_spectrogram)

        # Resample for speaking rate
        if speaking_rate != 1.0:
            target_length = int(len(audio) / speaking_rate)
            audio = signal.resample(audio, target_length)

        # Ensure audio is in correct format
        audio = np.clip(audio, -1, 1)
        audio = (audio * 32767).astype(np.int16)

        return (config.sample_rate, audio)

    except Exception as e:
        print(f"Error during synthesis: {e}")
        return None
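The handler can also be called directly, which is convenient for quick checks outside the UI (illustrative arguments and file name):

    result = synthesize_speech("testing one two three", speaking_rate=1.2, pitch_shift=-2)
    if result is not None:
        sr, pcm = result                          # 22050, int16 numpy array
        wavfile.write("vedes_test.wav", sr, pcm)  # wavfile is already imported at the top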
# ============================================
# GRADIO INTERFACE
# ============================================

# Custom CSS for better styling
custom_css = """
.gradio-container {
    font-family: 'Segoe UI', Tahoma, Geneva, Verdana, sans-serif;
}
.title {
    text-align: center;
    color: #2c3e50;
}
.description {
    text-align: center;
    color: #7f8c8d;
}
"""

# Create Gradio interface
with gr.Blocks(css=custom_css, title="Vedes TTS") as demo:
    gr.Markdown(
        """
        # 🎙️ Vedes TTS - Text-to-Speech Synthesis
        ### Built from scratch with TensorFlow

        Enter any text below and convert it to natural-sounding speech!
        """
    )

    with gr.Row():
        with gr.Column(scale=2):
            text_input = gr.Textbox(
                label="Input Text",
                placeholder="Enter the text you want to convert to speech...",
                lines=3,
                max_lines=10
            )

            with gr.Row():
                speaking_rate = gr.Slider(
                    minimum=0.5,
                    maximum=2.0,
                    value=1.0,
                    step=0.1,
                    label="Speaking Rate",
                    info="Adjust speech speed"
                )

                pitch_shift = gr.Slider(
                    minimum=-5,
                    maximum=5,
                    value=0,
                    step=1,
                    label="Pitch Shift (semitones)",
                    info="Adjust voice pitch"
                )

            synthesize_btn = gr.Button("🔊 Synthesize Speech", variant="primary")

        with gr.Column(scale=1):
            audio_output = gr.Audio(
                label="Generated Speech",
                type="numpy"
            )

    # Example texts
    gr.Examples(
        examples=[
            ["Hello, welcome to Vedes text to speech system!"],
            ["The quick brown fox jumps over the lazy dog."],
            ["Artificial intelligence is transforming the world."],
            ["Good morning! How are you doing today?"],
            ["This is a demonstration of neural text to speech."],
        ],
        inputs=text_input
    )

    gr.Markdown(
        """
        ---
        ### About Vedes TTS

        **Architecture:**
        - **Encoder:** 3 Conv1D layers + Bidirectional LSTM
        - **Attention:** Location-sensitive attention mechanism
        - **Decoder:** Autoregressive LSTM with prenet
        - **Postnet:** 5 Conv1D layers for mel refinement
        - **Vocoder:** Griffin-Lim algorithm

        **Features:**
        - Character-level text processing
        - Adjustable speaking rate
        - Pitch shifting capability
        - Real-time synthesis

        Built with ❤️ using TensorFlow and Gradio
        """
    )

    # Event handlers
    synthesize_btn.click(
        fn=synthesize_speech,
        inputs=[text_input, speaking_rate, pitch_shift],
        outputs=audio_output
    )

    text_input.submit(
        fn=synthesize_speech,
        inputs=[text_input, speaking_rate, pitch_shift],
        outputs=audio_output
    )


# Launch
if __name__ == "__main__":
    demo.launch()
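Run as a Space this launches automatically; locally, demo.launch() starts a server and prints a local URL. The script only needs the packages it imports (tensorflow, numpy, scipy, gradio); with those installed it can be started with:

    python app.py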