# Streamlit demo: English → Hindi translation with a Seq2Seq LSTM + Multi-Head Attention model.

import streamlit as st
import pandas as pd
import numpy as np
import tensorflow as tf
import pickle
from tensorflow.keras.models import Model, load_model
from tensorflow.keras.layers import Input, Concatenate
from tensorflow.keras.preprocessing.sequence import pad_sequences
from huggingface_hub import hf_hub_download


st.set_page_config(
    page_title="English → Hindi Seq2Seq Translator",
    layout="centered",
    page_icon="🧠",
)

st.title("🧠 English → Hindi Translation using Seq2Seq + Multi-Head Attention")
st.markdown("A demonstration of an **LSTM Encoder–Decoder** with **Multi-Head Self-Attention** for translation tasks.")

with st.expander("ℹ️ About This Project"):
    st.markdown("""
This demo showcases a **Seq2Seq translation model enhanced with self-attention**.
It demonstrates how attention helps the decoder focus on relevant parts of the input during translation.

**Key Highlights:**
- Encoder–Decoder with LSTMs
- Integrated Multi-Head Self- and Cross-Attention
- Two model sizes: **12M** and **42M** parameters
- Designed for educational visualization and experimentation
""")

with st.expander("🎯 Purpose"):
    st.markdown("""
This Space is designed for:
- Demonstration and educational purposes
- Understanding **Seq2Seq + Attention mechanisms**
- Translating English sentences to Hindi
- Exploring how encoder outputs can serve as **context embeddings** for downstream NLP tasks (see the sketch below)
""")
# Download the trained models and tokenizers from the Hugging Face Hub (cached across reruns).
@st.cache_resource
def load_model_and_tokenizer(model_file, tokenizer_file):
    model_path = hf_hub_download(repo_id="Daksh0505/Seq2Seq-LSTM-MultiHeadAttention", filename=model_file)
    tokenizer_path = hf_hub_download(repo_id="Daksh0505/Seq2Seq-LSTM-MultiHeadAttention", filename=tokenizer_file)

    model = load_model(model_path)
    with open(tokenizer_path, "rb") as f:
        tokenizer = pickle.load(f)
    return model, tokenizer['english'], tokenizer['hindi']


model_12M, tokenizer_en_12, tokenizer_hi_12 = load_model_and_tokenizer(
    "seq2seq-lstm-multiheadattention-12.3.keras", "seq2seq-tokenizers-12.3M.pkl"
)
model_42M, tokenizer_en_42, tokenizer_hi_42 = load_model_and_tokenizer(
    "seq2seq-lstm-multiheadattention-42.keras", "seq2seq-tokenizers-42M.pkl"
)


@st.cache_data
def load_data():
    # Sample parallel sentences used to populate the demo dropdown.
    return pd.read_csv("translation.csv").head(5000)


data = load_data()


st.subheader("⚙️ Select Model Size")
model_choice = st.radio("Choose a model:", ["12M parameters", "42M parameters"], index=0, horizontal=True)

if model_choice == "12M parameters":
    model = model_12M
    tokenizer_en = tokenizer_en_12
    tokenizer_hi = tokenizer_hi_12
    max_seq_len = 40
else:
    model = model_42M
    tokenizer_en = tokenizer_en_42
    tokenizer_hi = tokenizer_hi_42
    max_seq_len = 50

word2idx_en = tokenizer_en.word_index
word2idx_hi = tokenizer_hi.word_index
idx2word_hi = tokenizer_hi.index_word
max_vocab_en = len(word2idx_en) + 1


def get_layer_safe(model, possible_names):
    """Return the first layer whose name matches one of `possible_names`.

    Layer names differ between the 12M and 42M checkpoints, so several
    candidates are tried before giving up.
    """
    for name in possible_names:
        try:
            return model.get_layer(name)
        except ValueError:
            continue
    raise ValueError(f"No matching layer found among {possible_names}")


@st.cache_resource
def build_inference_models(model_key, _model):
    """Split the trained model into separate encoder and decoder models for stepwise inference.

    `_model` carries a leading underscore so Streamlit does not try to hash the
    Keras model itself; `model_key` (the selected model size) acts as the cache key.
    """
    # --- Encoder: embed the source sentence and run the bidirectional LSTM ---
    encoder_input = _model.input[0]
    emb_layer = get_layer_safe(_model, ['embedding', 'embedding_0'])
    norm_layer = get_layer_safe(_model, ['layer_normalization', 'layer_normalization_0'])
    drop_layer = get_layer_safe(_model, ['dropout', 'dropout_1'])
    lstm_layer = get_layer_safe(_model, ['bidirectional'])

    enc_emb = drop_layer(norm_layer(emb_layer(encoder_input), training=False))
    enc_out, fh, fc, bh, bc = lstm_layer(enc_emb)
    state_h = Concatenate()([fh, bh])  # merge forward and backward hidden states
    state_c = Concatenate()([fc, bc])  # merge forward and backward cell states
    encoder_model = Model(encoder_input, [enc_out, state_h, state_c])

    # --- Decoder: one target token in, one probability distribution out ---
    decoder_input = Input(shape=(1,))
    decoder_lstm = get_layer_safe(_model, ['lstm_1', 'lstm'])
    decoder_emb = get_layer_safe(_model, ['embedding_1', 'embedding_2'])
    decoder_norm = get_layer_safe(_model, ['layer_normalization_1', 'layer_normalization_2'])
    decoder_drop = get_layer_safe(_model, ['dropout_2', 'dropout_1'])
    decoder_dense = get_layer_safe(_model, ['dense'])
    attention_layer = get_layer_safe(_model, ['multi_head_attention'])

    decoder_state_input_h = Input(shape=(decoder_lstm.units,))
    decoder_state_input_c = Input(shape=(decoder_lstm.units,))
    encoder_outputs_input = Input(shape=(None, decoder_lstm.units))

    dec_emb = decoder_drop(decoder_norm(decoder_emb(decoder_input), training=False))
    dec_out, dec_h, dec_c = decoder_lstm(dec_emb, initial_state=[decoder_state_input_h, decoder_state_input_c])
    # Cross-attention: each decoder step queries the encoder outputs for context.
    context = attention_layer(query=dec_out, key=encoder_outputs_input, value=encoder_outputs_input)
    dec_combined = Concatenate(axis=-1)([context, dec_out])
    dec_final = decoder_dense(dec_combined)

    decoder_model = Model(
        [decoder_input, decoder_state_input_h, decoder_state_input_c, encoder_outputs_input],
        [dec_final, dec_h, dec_c]
    )

    return encoder_model, decoder_model


encoder_model, decoder_model = build_inference_models(model_choice, model)


def preprocess_input_sentence(sentence):
    """Tokenize an English sentence and pad it to the model's expected length."""
    oov_idx = word2idx_en.get('<OOV>', 1)
    seq = [word2idx_en.get(w.lower(), oov_idx) for w in sentence.split()]
    seq = [idx if idx < max_vocab_en else oov_idx for idx in seq]
    return pad_sequences([seq], maxlen=max_seq_len, padding='post')


def decode_sequence(input_seq):
    """Greedily decode a padded input sequence into a Hindi sentence, one token at a time."""
    start_token = word2idx_hi['<start>']
    end_token = word2idx_hi['<end>']
    enc_outs, h, c = encoder_model.predict(input_seq, verbose=0)
    target_seq = np.array([[start_token]])
    decoded_sentence = []
    for _ in range(max_seq_len):
        output_tokens, h, c = decoder_model.predict([target_seq, h, c, enc_outs], verbose=0)
        sampled_idx = np.argmax(output_tokens[0, 0, :])  # greedy decoding: pick the most probable token
        if sampled_idx == end_token:
            break
        if sampled_idx > 0:
            decoded_sentence.append(idx2word_hi.get(sampled_idx, ''))
        target_seq[0, 0] = sampled_idx  # feed the prediction back in as the next decoder input
    return " ".join(decoded_sentence)


st.subheader("📝 Try a Sample Translation")

# Keep the current selection and its translation across Streamlit reruns.
if "selected_text" not in st.session_state:
    st.session_state.selected_text = data["english"].iloc[0]
if "translation" not in st.session_state:
    st.session_state.translation = ""

selected_text = st.selectbox(
    "Select an English sentence:",
    data["english"].tolist(),
    index=data["english"].tolist().index(st.session_state.selected_text)
)

# Clear the previous translation when a new sentence is picked.
if selected_text != st.session_state.selected_text:
    st.session_state.selected_text = selected_text
    st.session_state.translation = ""

original_hindi = data.loc[data["english"] == selected_text, "hindi"].values[0]
st.write("**Original English:**", selected_text)
st.write("**Reference Hindi:**", original_hindi)

if st.button("🚀 Translate"):
    with st.spinner("Generating translation..."):
        preprocessed = preprocess_input_sentence(selected_text)
        translation = decode_sequence(preprocessed)
        st.session_state.translation = translation

if st.session_state.translation:
    st.success(f"✅ **Predicted Hindi Translation:** {st.session_state.translation}")


st.subheader("📚 Learning How It Works")

with st.expander("🔹 Self-Attention Mechanism"):
    st.markdown("""
Self-Attention is a mechanism where each token in a sequence attends to **other tokens in the same sequence** to capture dependencies.

**Key points:**
- Helps the model focus on relevant words within the same sentence.
- Computes attention scores between all pairs of positions in the input.
- Often implemented as **Multi-Head Self-Attention** to capture different types of relationships simultaneously.

**Example:**
In the sentence *"The cat sat on the mat"*, self-attention allows the model to understand that *"cat"* is related to *"sat"* and *"mat"*.
""")
with st.expander("🔹 Cross-Attention Mechanism"):
    st.markdown("""
Cross-Attention is used in encoder–decoder architectures, where the **decoder attends to encoder outputs**.

**Key points:**
- The decoder queries encoder outputs to focus on relevant parts of the input sentence.
- Crucial for translation, summarization, or any sequence-to-sequence task.

**Example:**
Translating *"I am hungry"* to Hindi: when generating the Hindi word *"भूखा"*, cross-attention helps the decoder focus on *"hungry"* in the English input.
""")
with st.expander("🔹 Multi-Head Attention"):
    st.markdown("""
Multi-Head Attention is an extension of the attention mechanism that allows the model to **capture information from different representation subspaces simultaneously**.

**Key Points:**
- Instead of using a single attention function, the model uses **multiple attention heads**.
- Each head learns to focus on **different parts or relationships** of the input.
- The outputs from all heads are **concatenated and linearly projected** to form the final context vector.
- Improves the model's ability to understand complex dependencies in sequences.

**Example:**
- In translating *"The cat sat on the mat"*:
    - Head 1 may focus on subject-verb relations (*cat ↔ sat*).
    - Head 2 may focus on verb-object relations (*sat ↔ mat*).
    - Head 3 may focus on positional or syntactic patterns.
- Combining all heads gives a richer context for the decoder.

**In this Seq2Seq model:**
- Multi-Head Attention can be used as:
    - **Self-Attention** in encoder/decoder layers
    - **Cross-Attention** between encoder outputs and decoder hidden states
""")
with st.expander("🔹 Sequence-to-Sequence (Seq2Seq) Task"):
    st.markdown("""
Seq2Seq models map an **input sequence** to an **output sequence**, often with **different lengths**.

**Examples:**
- Machine Translation: English → Hindi
- Text Summarization
- Chatbots / Dialogue Systems

**Characteristics:**
- Handles variable-length input and output sequences.
- Uses an encoder to process the input and a decoder to generate the output.
- Can integrate attention mechanisms to improve alignment between input and output tokens.
""")
with st.expander("🔹 Fixed-Length vs Variable-Length Tasks"):
    st.markdown("""
**Fixed-Length Tasks:**
- Input and output sequences have the **same length**.
- Example: time series forecasting with fixed steps, classification tasks.

**Variable-Length Tasks:**
- Input and output sequences can **differ in length**.
- Example: machine translation, summarization, speech recognition.
- Seq2Seq models are designed to handle this flexibility.
""")
st.markdown("---")
show_arch = st.checkbox("🧩 Show Model Architecture")

if show_arch:
    # Collect each layer's name and output shape; the available attribute differs across Keras versions.
    layer_info = []
    for i, layer in enumerate(model.layers):
        try:
            out_shape = layer.output_shape
        except Exception:
            try:
                out_shape = layer.output.shape
            except Exception:
                out_shape = "N/A"
        layer_info.append([i, layer.name, str(out_shape)])
    df_layers = pd.DataFrame(layer_info, columns=["Index", "Layer Name", "Output Shape"])
    st.subheader("Model Architecture Overview")
    st.dataframe(df_layers, width="stretch")


st.markdown("---")
st.subheader("🔗 Resources")
st.markdown("""
- 🧩 **Model Repository:** [Daksh0505/Seq2Seq-LSTM-MultiHeadAttention](https://huggingface.co/Daksh0505/Seq2Seq-LSTM-MultiHeadAttention)
- 📘 **Dataset:** English–Hindi Parallel Corpus (IIT Bombay)
- 🧠 **Framework:** TensorFlow / Keras
""")

st.caption("© 2025 Daksh Bhardwaj | For educational and research purposes.")