# Streamlit demo: English → Hindi translation with a Seq2Seq LSTM + Multi-Head Attention model.

import streamlit as st
import pandas as pd
import numpy as np
import tensorflow as tf
import pickle
from tensorflow.keras.models import Model, load_model
from tensorflow.keras.layers import Input, Concatenate
from tensorflow.keras.preprocessing.sequence import pad_sequences
from huggingface_hub import hf_hub_download


st.set_page_config(
    page_title="English → Hindi Seq2Seq Translator",
    layout="centered",
    page_icon="🧠",
)

st.title("🧠 English → Hindi Translation using Seq2Seq + Multi-Head Attention")
st.markdown("A demonstration of an **LSTM Encoder–Decoder** with **Multi-Head Self-Attention** for translation tasks.")

with st.expander("ℹ️ About This Project"):
    st.markdown("""
This demo showcases a **Seq2Seq translation model enhanced with self-attention**.
It demonstrates how attention helps the decoder focus on relevant parts of the input during translation.

**Key Highlights:**
- Encoder–Decoder with LSTMs
- Integrated Multi-Head Self- and Cross-Attention
- Two model sizes: **12M** and **42M** parameters
- Designed for educational visualization and experimentation
""")

with st.expander("🎯 Purpose"):
    st.markdown("""
This Space is designed for:
- Demonstration and educational purposes
- Understanding **Seq2Seq + Attention mechanisms**
- Translating English sentences to Hindi
- Exploring how encoder outputs can serve as **context embeddings** for downstream NLP tasks (see the sketch below)
""")
# Download the trained models and tokenizers from the Hugging Face Hub (cached across reruns).
@st.cache_resource
def load_model_and_tokenizer(model_file, tokenizer_file):
    model_path = hf_hub_download(repo_id="Daksh0505/Seq2Seq-LSTM-MultiHeadAttention", filename=model_file)
    tokenizer_path = hf_hub_download(repo_id="Daksh0505/Seq2Seq-LSTM-MultiHeadAttention", filename=tokenizer_file)

    model = load_model(model_path)
    with open(tokenizer_path, "rb") as f:
        tokenizer = pickle.load(f)
    return model, tokenizer['english'], tokenizer['hindi']


model_12M, tokenizer_en_12, tokenizer_hi_12 = load_model_and_tokenizer(
    "seq2seq-lstm-multiheadattention-12.3.keras", "seq2seq-tokenizers-12.3M.pkl"
)
model_42M, tokenizer_en_42, tokenizer_hi_42 = load_model_and_tokenizer(
    "seq2seq-lstm-multiheadattention-42.keras", "seq2seq-tokenizers-42M.pkl"
)


@st.cache_data
def load_data():
    # Sample parallel sentences used to populate the demo dropdown.
    return pd.read_csv("translation.csv").head(5000)


data = load_data()


st.subheader("⚙️ Select Model Size")
model_choice = st.radio("Choose a model:", ["12M parameters", "42M parameters"], index=0, horizontal=True)

if model_choice == "12M parameters":
    model = model_12M
    tokenizer_en = tokenizer_en_12
    tokenizer_hi = tokenizer_hi_12
    max_seq_len = 40
else:
    model = model_42M
    tokenizer_en = tokenizer_en_42
    tokenizer_hi = tokenizer_hi_42
    max_seq_len = 50

word2idx_en = tokenizer_en.word_index
word2idx_hi = tokenizer_hi.word_index
idx2word_hi = tokenizer_hi.index_word
max_vocab_en = len(word2idx_en) + 1


def get_layer_safe(model, possible_names):
    """Return the first layer whose name matches one of `possible_names`.

    Layer names differ between the 12M and 42M checkpoints, so several
    candidates are tried before giving up.
    """
    for name in possible_names:
        try:
            return model.get_layer(name)
        except ValueError:
            continue
    raise ValueError(f"No matching layer found among {possible_names}")


@st.cache_resource
def build_inference_models(model_key, _model):
    """Split the trained model into separate encoder and decoder models for stepwise inference.

    `_model` carries a leading underscore so Streamlit does not try to hash the
    Keras model itself; `model_key` (the selected model size) acts as the cache key.
    """
    # --- Encoder: embed the source sentence and run the bidirectional LSTM ---
    encoder_input = _model.input[0]
    emb_layer = get_layer_safe(_model, ['embedding', 'embedding_0'])
    norm_layer = get_layer_safe(_model, ['layer_normalization', 'layer_normalization_0'])
    drop_layer = get_layer_safe(_model, ['dropout', 'dropout_1'])
    lstm_layer = get_layer_safe(_model, ['bidirectional'])

    enc_emb = drop_layer(norm_layer(emb_layer(encoder_input), training=False))
    enc_out, fh, fc, bh, bc = lstm_layer(enc_emb)
    state_h = Concatenate()([fh, bh])  # merge forward and backward hidden states
    state_c = Concatenate()([fc, bc])  # merge forward and backward cell states
    encoder_model = Model(encoder_input, [enc_out, state_h, state_c])

    # --- Decoder: one target token in, one probability distribution out ---
    decoder_input = Input(shape=(1,))
    decoder_lstm = get_layer_safe(_model, ['lstm_1', 'lstm'])
    decoder_emb = get_layer_safe(_model, ['embedding_1', 'embedding_2'])
    decoder_norm = get_layer_safe(_model, ['layer_normalization_1', 'layer_normalization_2'])
    decoder_drop = get_layer_safe(_model, ['dropout_2', 'dropout_1'])
    decoder_dense = get_layer_safe(_model, ['dense'])
    attention_layer = get_layer_safe(_model, ['multi_head_attention'])

    decoder_state_input_h = Input(shape=(decoder_lstm.units,))
    decoder_state_input_c = Input(shape=(decoder_lstm.units,))
    encoder_outputs_input = Input(shape=(None, decoder_lstm.units))

    dec_emb = decoder_drop(decoder_norm(decoder_emb(decoder_input), training=False))
    dec_out, dec_h, dec_c = decoder_lstm(dec_emb, initial_state=[decoder_state_input_h, decoder_state_input_c])
    # Cross-attention: each decoder step queries the encoder outputs for context.
    context = attention_layer(query=dec_out, key=encoder_outputs_input, value=encoder_outputs_input)
    dec_combined = Concatenate(axis=-1)([context, dec_out])
    dec_final = decoder_dense(dec_combined)

    decoder_model = Model(
        [decoder_input, decoder_state_input_h, decoder_state_input_c, encoder_outputs_input],
        [dec_final, dec_h, dec_c]
    )

    return encoder_model, decoder_model


encoder_model, decoder_model = build_inference_models(model_choice, model)


def preprocess_input_sentence(sentence):
    """Tokenize an English sentence and pad it to the model's expected length."""
    oov_idx = word2idx_en.get('<OOV>', 1)
    seq = [word2idx_en.get(w.lower(), oov_idx) for w in sentence.split()]
    seq = [idx if idx < max_vocab_en else oov_idx for idx in seq]
    return pad_sequences([seq], maxlen=max_seq_len, padding='post')


def decode_sequence(input_seq):
    """Greedily decode a padded input sequence into a Hindi sentence, one token at a time."""
    start_token = word2idx_hi['<start>']
    end_token = word2idx_hi['<end>']
    enc_outs, h, c = encoder_model.predict(input_seq, verbose=0)
    target_seq = np.array([[start_token]])
    decoded_sentence = []
    for _ in range(max_seq_len):
        output_tokens, h, c = decoder_model.predict([target_seq, h, c, enc_outs], verbose=0)
        sampled_idx = np.argmax(output_tokens[0, 0, :])  # greedy decoding: pick the most probable token
        if sampled_idx == end_token:
            break
        if sampled_idx > 0:
            decoded_sentence.append(idx2word_hi.get(sampled_idx, ''))
        target_seq[0, 0] = sampled_idx  # feed the prediction back in as the next decoder input
    return " ".join(decoded_sentence)


st.subheader("📝 Try a Sample Translation")

# Keep the current selection and its translation across Streamlit reruns.
if "selected_text" not in st.session_state:
    st.session_state.selected_text = data["english"].iloc[0]
if "translation" not in st.session_state:
    st.session_state.translation = ""

selected_text = st.selectbox(
    "Select an English sentence:",
    data["english"].tolist(),
    index=data["english"].tolist().index(st.session_state.selected_text)
)

# Clear the previous translation when a new sentence is picked.
if selected_text != st.session_state.selected_text:
    st.session_state.selected_text = selected_text
    st.session_state.translation = ""

original_hindi = data.loc[data["english"] == selected_text, "hindi"].values[0]
st.write("**Original English:**", selected_text)
st.write("**Reference Hindi:**", original_hindi)

if st.button("🚀 Translate"):
    with st.spinner("Generating translation..."):
        preprocessed = preprocess_input_sentence(selected_text)
        translation = decode_sequence(preprocessed)
        st.session_state.translation = translation

if st.session_state.translation:
    st.success(f"✅ **Predicted Hindi Translation:** {st.session_state.translation}")


st.subheader("📚 Learning How It Works")

with st.expander("🔹 Self-Attention Mechanism"):
    st.markdown("""
Self-Attention is a mechanism where each token in a sequence attends to **other tokens in the same sequence** to capture dependencies.

**Key points:**
- Helps the model focus on relevant words within the same sentence.
- Computes attention scores between all pairs of positions in the input.
- Often implemented as **Multi-Head Self-Attention** to capture different types of relationships simultaneously.

**Example:**
In the sentence *"The cat sat on the mat"*, self-attention allows the model to understand that *"cat"* is related to *"sat"* and *"mat"*.
""")
with st.expander("🔹 Cross-Attention Mechanism"):
    st.markdown("""
Cross-Attention is used in encoder–decoder architectures, where the **decoder attends to encoder outputs**.

**Key points:**
- The decoder queries encoder outputs to focus on relevant parts of the input sentence.
- Crucial for translation, summarization, or any sequence-to-sequence task.

**Example:**
Translating *"I am hungry"* to Hindi: when generating the Hindi word *"भूखा"*, cross-attention helps the decoder focus on *"hungry"* in the English input.
""")
with st.expander("🔹 Multi-Head Attention"):
    st.markdown("""
Multi-Head Attention is an extension of the attention mechanism that allows the model to **capture information from different representation subspaces simultaneously**.

**Key Points:**
- Instead of using a single attention function, the model uses **multiple attention heads**.
- Each head learns to focus on **different parts or relationships** of the input.
- The outputs from all heads are **concatenated and linearly projected** to form the final context vector.
- Improves the model's ability to understand complex dependencies in sequences.

**Example:**
- In translating *"The cat sat on the mat"*:
    - Head 1 may focus on subject-verb relations (*cat ↔ sat*).
    - Head 2 may focus on verb-object relations (*sat ↔ mat*).
    - Head 3 may focus on positional or syntactic patterns.
- Combining all heads gives a richer context for the decoder.

**In this Seq2Seq model:**
- Multi-Head Attention can be used as:
    - **Self-Attention** in encoder/decoder layers
    - **Cross-Attention** between encoder outputs and decoder hidden states
""")
with st.expander("🔹 Sequence-to-Sequence (Seq2Seq) Task"):
    st.markdown("""
Seq2Seq models map an **input sequence** to an **output sequence**, often with **different lengths**.

**Examples:**
- Machine Translation: English → Hindi
- Text Summarization
- Chatbots / Dialogue Systems

**Characteristics:**
- Handles variable-length input and output sequences.
- Uses an encoder to process the input and a decoder to generate the output.
- Can integrate attention mechanisms to improve alignment between input and output tokens.
""")
with st.expander("🔹 Fixed-Length vs Variable-Length Tasks"):
    st.markdown("""
**Fixed-Length Tasks:**
- Input and output sequences have the **same length**.
- Example: time series forecasting with fixed steps, classification tasks.

**Variable-Length Tasks:**
- Input and output sequences can **differ in length**.
- Example: machine translation, summarization, speech recognition.
- Seq2Seq models are designed to handle this flexibility.
""")
st.markdown("---")
show_arch = st.checkbox("🧩 Show Model Architecture")

if show_arch:
    # Collect each layer's name and output shape; the available attribute differs across Keras versions.
    layer_info = []
    for i, layer in enumerate(model.layers):
        try:
            out_shape = layer.output_shape
        except Exception:
            try:
                out_shape = layer.output.shape
            except Exception:
                out_shape = "N/A"
        layer_info.append([i, layer.name, str(out_shape)])
    df_layers = pd.DataFrame(layer_info, columns=["Index", "Layer Name", "Output Shape"])
    st.subheader("Model Architecture Overview")
    st.dataframe(df_layers, width="stretch")


st.markdown("---")
st.subheader("🔗 Resources")
st.markdown("""
- 🧩 **Model Repository:** [Daksh0505/Seq2Seq-LSTM-MultiHeadAttention](https://huggingface.co/Daksh0505/Seq2Seq-LSTM-MultiHeadAttention)
- 📘 **Dataset:** English–Hindi Parallel Corpus (IIT Bombay)
- 🧠 **Framework:** TensorFlow / Keras
""")

st.caption("© 2025 Daksh Bhardwaj | For educational and research purposes.")