import streamlit as st
import pandas as pd
import numpy as np
import tensorflow as tf
import pickle
from tensorflow.keras.models import Model, load_model
from tensorflow.keras.layers import Input, Concatenate
from tensorflow.keras.preprocessing.sequence import pad_sequences
from huggingface_hub import hf_hub_download

# ------------------------------------------------
# Page configuration
# ------------------------------------------------
st.set_page_config(
    page_title="English → Hindi Seq2Seq Translator",
    layout="centered",
    page_icon="🧠",
)

st.title("🧠 English → Hindi Translation (Seq2Seq + Self-Attention)")
st.markdown("A demonstration of an **LSTM Encoder–Decoder** with **Multi-Head Self-Attention** for translation tasks.")

# ------------------------------------------------
# About Section
# ------------------------------------------------
with st.expander("ℹ️ About This Project"):
    st.markdown("""
    This demo showcases a **Seq2Seq translation model enhanced with self-attention**.
    It demonstrates how attention helps the decoder focus on relevant parts of the input during translation.

    **Key Highlights:**
    - Encoder–Decoder with LSTMs
    - Integrated Multi-Head Self- and Cross-Attention
    - Two model sizes: **12M** and **42M** parameters
    - Designed for educational visualization and experimentation
    """)

# ------------------------------------------------
# Purpose Section
# ------------------------------------------------
with st.expander("🎯 Purpose"):
    st.markdown("""
    This Space is designed for:
    - Demonstration and educational purposes
    - Understanding **Seq2Seq + Attention mechanisms**
    - Translating English sentences to Hindi
    - Exploring how encoder outputs can serve as **context embeddings** for downstream NLP tasks
    """)

# ------------------------------------------------
# Load models and tokenizers
# ------------------------------------------------
@st.cache_resource
def load_model_and_tokenizer(model_file, tokenizer_file):
    model_path = hf_hub_download(repo_id="Daksh0505/Seq2Seq-LSTM-MultiHeadAttention", filename=model_file)
    tokenizer_path = hf_hub_download(repo_id="Daksh0505/Seq2Seq-LSTM-MultiHeadAttention", filename=tokenizer_file)
    model = load_model(model_path)
    with open(tokenizer_path, "rb") as f:
        tokenizer = pickle.load(f)
    return model, tokenizer['english'], tokenizer['hindi']

model_12M, tokenizer_en_12, tokenizer_hi_12 = load_model_and_tokenizer(
    "seq2seq-lstm-multiheadattention-12.3.keras", "seq2seq-tokenizers-12.3M.pkl"
)
model_42M, tokenizer_en_42, tokenizer_hi_42 = load_model_and_tokenizer(
    "seq2seq-lstm-multiheadattention-42.keras", "seq2seq-tokenizers-42M.pkl"
)

# ------------------------------------------------
# Load dataset
# ------------------------------------------------
@st.cache_data
def load_data():
    # Parallel corpus with "english" and "hindi" columns; only the first 5000 rows are used
    return pd.read_csv("translation.csv").head(5000)

data = load_data()

# ------------------------------------------------
# Model selection
# ------------------------------------------------
st.subheader("⚙️ Select Model Size")
model_choice = st.radio("Choose a model:", ["12M parameters", "42M parameters"], index=0, horizontal=True)

if model_choice == "12M parameters":
    model = model_12M
    tokenizer_en = tokenizer_en_12
    tokenizer_hi = tokenizer_hi_12
    max_seq_len = 40
else:
    model = model_42M
    tokenizer_en = tokenizer_en_42
    tokenizer_hi = tokenizer_hi_42
    max_seq_len = 50

word2idx_en = tokenizer_en.word_index
word2idx_hi = tokenizer_hi.word_index
idx2word_hi = tokenizer_hi.index_word
max_vocab_en = len(word2idx_en) + 1
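# The next two blocks rebuild the trained network as separate inference-time models:
# an encoder that returns the full output sequence plus its final LSTM states, and a
# single-step decoder that attends over those encoder outputs while generating tokens.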
# ------------------------------------------------
# Safe layer extraction helper
# ------------------------------------------------
def get_layer_safe(model, possible_names):
    """Return the first layer whose name matches one of possible_names."""
    for name in possible_names:
        try:
            return model.get_layer(name)
        except ValueError:
            continue
    raise ValueError(f"No matching layer found among {possible_names}")

# ------------------------------------------------
# Build encoder & decoder inference models
# ------------------------------------------------
@st.cache_resource
def build_inference_models(model):
    # Encoder
    encoder_input = model.input[0]
    emb_layer = get_layer_safe(model, ['embedding', 'embedding_0'])
    norm_layer = get_layer_safe(model, ['layer_normalization', 'layer_normalization_0'])
    drop_layer = get_layer_safe(model, ['dropout', 'dropout_1'])
    lstm_layer = get_layer_safe(model, ['bidirectional'])

    enc_emb = drop_layer(norm_layer(emb_layer(encoder_input), training=False))
    enc_out, fh, fc, bh, bc = lstm_layer(enc_emb)
    state_h = Concatenate()([fh, bh])
    state_c = Concatenate()([fc, bc])
    encoder_model = Model(encoder_input, [enc_out, state_h, state_c])

    # Decoder
    decoder_input = Input(shape=(1,))
    decoder_lstm = get_layer_safe(model, ['lstm_1', 'lstm'])
    decoder_emb = get_layer_safe(model, ['embedding_1', 'embedding_2'])
    decoder_norm = get_layer_safe(model, ['layer_normalization_1', 'layer_normalization_2'])
    decoder_drop = get_layer_safe(model, ['dropout_2', 'dropout_1'])
    decoder_dense = get_layer_safe(model, ['dense'])
    attention_layer = get_layer_safe(model, ['multi_head_attention'])

    decoder_state_input_h = Input(shape=(decoder_lstm.units,))
    decoder_state_input_c = Input(shape=(decoder_lstm.units,))
    encoder_outputs_input = Input(shape=(None, decoder_lstm.units))

    dec_emb = decoder_drop(decoder_norm(decoder_emb(decoder_input), training=False))
    dec_out, dec_h, dec_c = decoder_lstm(dec_emb, initial_state=[decoder_state_input_h, decoder_state_input_c])
    context = attention_layer(query=dec_out, key=encoder_outputs_input, value=encoder_outputs_input)
    dec_combined = Concatenate(axis=-1)([context, dec_out])
    dec_final = decoder_dense(dec_combined)

    decoder_model = Model(
        [decoder_input, decoder_state_input_h, decoder_state_input_c, encoder_outputs_input],
        [dec_final, dec_h, dec_c]
    )
    return encoder_model, decoder_model

encoder_model, decoder_model = build_inference_models(model)

# ------------------------------------------------
# Helper functions
# ------------------------------------------------
def preprocess_input_sentence(sentence):
    oov_idx = word2idx_en.get('', 1)  # fall back to index 1, the Keras Tokenizer OOV slot
    seq = [word2idx_en.get(w.lower(), oov_idx) for w in sentence.split()]
    seq = [idx if idx < max_vocab_en else oov_idx for idx in seq]
    return pad_sequences([seq], maxlen=max_seq_len, padding='post')

def decode_sequence(input_seq):
    # Look up the Hindi tokenizer's start / end sequence markers
    start_token = word2idx_hi['']
    end_token = word2idx_hi['']
    enc_outs, h, c = encoder_model.predict(input_seq, verbose=0)
    target_seq = np.array([[start_token]])
    decoded_sentence = []
    for _ in range(max_seq_len):
        output_tokens, h, c = decoder_model.predict([target_seq, h, c, enc_outs], verbose=0)
        sampled_idx = np.argmax(output_tokens[0, 0, :])
        if sampled_idx == end_token:
            break
        if sampled_idx > 0:
            decoded_sentence.append(idx2word_hi.get(sampled_idx, ''))
        target_seq[0, 0] = sampled_idx
    return " ".join(decoded_sentence)
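# Standalone usage sketch (illustrative only, using the helpers defined above):
#   seq = preprocess_input_sentence("how are you")  # padded id sequence of shape (1, max_seq_len)
#   hindi = decode_sequence(seq)                    # greedy-decoded Hindi string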
# ------------------------------------------------
# Input selection with session_state caching
# ------------------------------------------------
st.subheader("📝 Try a Sample Translation")

# Initialize selected text and translation in session_state
if "selected_text" not in st.session_state:
    st.session_state.selected_text = data["english"].iloc[0]
if "translation" not in st.session_state:
    st.session_state.translation = ""

# Dropdown for English sentence selection
selected_text = st.selectbox(
    "Select an English sentence:",
    data["english"].tolist(),
    index=data["english"].tolist().index(st.session_state.selected_text)
)

# Update session_state if selection changed
if selected_text != st.session_state.selected_text:
    st.session_state.selected_text = selected_text
    st.session_state.translation = ""  # Reset translation for new selection

original_hindi = data.loc[data["english"] == selected_text, "hindi"].values[0]
st.write("**Original English:**", selected_text)
st.write("**Reference Hindi:**", original_hindi)

# Translate button
if st.button("🚀 Translate"):
    with st.spinner("Generating translation..."):
        preprocessed = preprocess_input_sentence(selected_text)
        translation = decode_sequence(preprocessed)
        st.session_state.translation = translation

# Show cached translation if it exists
if st.session_state.translation:
    st.success(f"✅ **Predicted Hindi Translation:** {st.session_state.translation}")

# ------------------------------------------------
# Learning Header
# ------------------------------------------------
st.subheader("Understanding the Model")

# ------------------------------------------------
# Self Attention Section
# ------------------------------------------------
with st.expander("🔹 Self-Attention Mechanism"):
    st.markdown("""
    Self-Attention is a mechanism where each token in a sequence attends to **other tokens in the same sequence** to capture dependencies.

    **Key points:**
    - Helps the model focus on relevant words within the same sentence.
    - Computes attention scores between all pairs of positions in the input.
    - Often implemented as **Multi-Head Self-Attention** to capture different types of relationships simultaneously.

    **Example:**
    In the sentence *"The cat sat on the mat"*, self-attention allows the model to understand that *"cat"* is related to *"sat"* and *"mat"*.
    """)

# ------------------------------------------------
# Cross Attention Section
# ------------------------------------------------
with st.expander("🔹 Cross-Attention Mechanism"):
    st.markdown("""
    Cross-Attention is used in encoder–decoder architectures, where the **decoder attends to encoder outputs**.

    **Key points:**
    - The decoder queries encoder outputs to focus on relevant parts of the input sentence.
    - Crucial for translation, summarization, or any sequence-to-sequence task.

    **Example:**
    Translating *"I am hungry"* to Hindi: when generating the Hindi word *"भूखा"*, cross-attention helps the decoder focus on *"hungry"* in the English input.
    """)
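# Illustrative sketch (num_heads, key_dim, and tensor shapes are arbitrary, not the
# trained model's configuration): how a Keras MultiHeadAttention layer lets one decoder
# timestep attend over the encoder's output sequence, mirroring this app's decoder call.
with st.expander("🔹 Cross-Attention with Keras MultiHeadAttention (illustrative)"):
    st.code(
        '''
import tensorflow as tf

mha = tf.keras.layers.MultiHeadAttention(num_heads=4, key_dim=64)
decoder_step = tf.random.normal((1, 1, 256))      # query: one decoder timestep
encoder_outputs = tf.random.normal((1, 12, 256))  # keys/values: full source sequence
context = mha(query=decoder_step, key=encoder_outputs, value=encoder_outputs)
print(context.shape)  # (1, 1, 256)
''',
        language="python",
    )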
# ------------------------------------------------
# Multi-Head Attention Section
# ------------------------------------------------
with st.expander("🔹 Multi-Head Attention"):
    st.markdown("""
    Multi-Head Attention is an extension of the attention mechanism that allows the model to **capture information from different representation subspaces simultaneously**.

    **Key Points:**
    - Instead of using a single attention function, we use **multiple attention heads**.
    - Each head learns to focus on **different parts or relationships** of the input.
    - The outputs from all heads are **concatenated and linearly projected** to form the final context vector.
    - Improves the model’s ability to understand complex dependencies in sequences.

    **Example:** In translating *"The cat sat on the mat"*:
    - Head 1 may focus on subject-verb relations (*cat ↔ sat*).
    - Head 2 may focus on verb-object relations (*sat ↔ mat*).
    - Head 3 may focus on positional or syntactic patterns.
    - Combining all heads gives a richer context for the decoder.

    **In this Seq2Seq model**, Multi-Head Attention can be used as:
    - **Self-Attention** in encoder/decoder layers
    - **Cross-Attention** between encoder outputs and decoder hidden states
    """)

# ------------------------------------------------
# Seq2Seq Task Explanation Section
# ------------------------------------------------
with st.expander("🔹 Sequence-to-Sequence (Seq2Seq) Task"):
    st.markdown("""
    Seq2Seq models map an **input sequence** to an **output sequence**, often of **different lengths**.

    **Examples:**
    - Machine Translation: English → Hindi
    - Text Summarization
    - Chatbots / Dialogue Systems

    **Characteristics:**
    - Handles variable-length input and output sequences.
    - Uses an encoder to process the input and a decoder to generate the output.
    - Can integrate attention mechanisms to improve alignment between input and output tokens.
    """)

# ------------------------------------------------
# Seq2Seq Task: Fixed-Length vs Variable-Length Section
# ------------------------------------------------
with st.expander("🔹 Fixed-Length vs Variable-Length Tasks"):
    st.markdown("""
    **Fixed-Length Tasks:**
    - Input and output sequences have the **same length**.
    - Examples: time series forecasting with a fixed number of steps, classification tasks.

    **Variable-Length Tasks:**
    - Input and output sequences can **differ in length**.
    - Examples: machine translation, summarization, speech recognition.
    - Seq2Seq models are designed to handle this flexibility.
    """)

# ------------------------------------------------
# Mathematics Expanders (Advanced / Optional)
# ------------------------------------------------
st.subheader("🧮 Mathematics Behind the Model")

with st.expander("🔹 Self-Attention Equations", expanded=False):
    st.markdown(r"""
    The attention function is computed as:

    $$
    \text{Attention}(Q, K, V) = \text{softmax}\left(\frac{Q K^T}{\sqrt{d_k}}\right) V
    $$

    Where:
    - $Q$ = Query matrix
    - $K$ = Key matrix
    - $V$ = Value matrix
    - $d_k$ = Dimension of the key vectors

    This allows the model to compute a weighted sum of the values based on relevance.
    """)

with st.expander("🔹 Multi-Head Attention Equations", expanded=False):
    st.markdown(r"""
    Multi-Head Attention combines multiple attention heads:

    $$
    \text{MultiHead}(Q, K, V) = \text{Concat}(\text{head}_1, ..., \text{head}_h) W^O
    $$

    Each head:

    $$
    \text{head}_i = \text{Attention}(Q W_i^Q, K W_i^K, V W_i^V)
    $$

    Where $W_i^Q, W_i^K, W_i^V, W^O$ are learnable projection matrices.
    """)
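# Minimal NumPy sketch of the scaled dot-product attention equation above, shown in the
# app for reference; illustrative only, not the trained model's implementation.
with st.expander("🔹 Scaled Dot-Product Attention in NumPy (illustrative)", expanded=False):
    st.code(
        '''
import numpy as np

def scaled_dot_product_attention(Q, K, V):
    """softmax(Q K^T / sqrt(d_k)) V for a single attention head."""
    d_k = K.shape[-1]
    scores = Q @ K.T / np.sqrt(d_k)                          # (len_q, len_k) similarity scores
    weights = np.exp(scores - scores.max(axis=-1, keepdims=True))
    weights = weights / weights.sum(axis=-1, keepdims=True)  # row-wise softmax
    return weights @ V                                       # weighted sum of value vectors

# Toy example: 3 query positions attending over 4 key/value positions, dimension 8
rng = np.random.default_rng(0)
Q, K, V = rng.normal(size=(3, 8)), rng.normal(size=(4, 8)), rng.normal(size=(4, 8))
print(scaled_dot_product_attention(Q, K, V).shape)  # (3, 8)
''',
        language="python",
    )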
""") with st.expander("🔹 Cross-Attention / Encoder-Decoder Attention", expanded=False): st.markdown(r""" Cross-Attention computes attention using decoder queries and encoder outputs: \[ \text{Context}_t = \text{Attention}(Q_t, K_{enc}, V_{enc}) \] - \(Q_t\) = decoder hidden state at timestep \(t\) - \(K_{enc}, V_{enc}\) = encoder outputs """) with st.expander("🔹 Seq2Seq Decoder Step", expanded=False): st.markdown(r""" At each decoder timestep: \[ s_t, c_t = \text{LSTM}(y_{t-1}, s_{t-1}, c_{t-1}) \] \[ \text{Output}_t = \text{Dense}(\text{Concat}(s_t, \text{Context}_t)) \] """) # ------------------------------------------------ # Show model architecture # ------------------------------------------------ st.markdown("---") show_arch = st.checkbox("🧩 Show Model Architecture") if show_arch: layer_info = [] for i, layer in enumerate(model.layers): try: out_shape = layer.output_shape except: try: out_shape = layer.output.shape except: out_shape = "N/A" layer_info.append([i, layer.name, str(out_shape)]) df_layers = pd.DataFrame(layer_info, columns=["Index", "Layer Name", "Output Shape"]) st.subheader("Model Architecture Overview") st.dataframe(df_layers, width='stretch') # ------------------------------------------------ # Footer # ------------------------------------------------ st.markdown("---") st.subheader("🔗 Resources") st.markdown(""" - 🧩 **Model Repository:** [Daksh0505/Seq2Seq-LSTM-MultiHeadAttention](https://huggingface.co/Daksh0505/Seq2Seq-LSTM-MultiHeadAttention) - 📘 **Dataset:** English–Hindi Parallel Corpus- IIT Bombay - 🧠 **Framework:** TensorFlow / Keras """) st.caption("© 2025 Daksh Bhardwaj | For educational and research purposes.")