import streamlit as st
import pandas as pd
import numpy as np
import tensorflow as tf
import pickle
from keras.models import load_model
from tensorflow.keras.models import Model
from tensorflow.keras.layers import Input, Concatenate
from tensorflow.keras.preprocessing.sequence import pad_sequences
from huggingface_hub import hf_hub_download

# ------------------------------------------------
# Page configuration
# ------------------------------------------------
st.set_page_config(
    page_title="English → Hindi Seq2Seq Translator",
    layout="centered",
    page_icon="🧠",
)

st.title("🧠 English → Hindi Translation (Seq2Seq + Self-Attention)")
st.markdown("A demonstration of **LSTM Encoder–Decoder** with **Multi-Head Self-Attention** for translation tasks.")

# ------------------------------------------------
# About Section
# ------------------------------------------------
with st.expander("ℹ️ About This Project"):
    st.markdown("""
    This demo showcases a **Seq2Seq translation model enhanced with self-attention**.  
    It demonstrates how attention helps the decoder focus on relevant parts of the input during translation.

    **Key Highlights:**
    - Encoder–Decoder with LSTMs  
    - Integrated Multi-Head Self and Cross-Attention  
    - Two model sizes: **12M** and **42M** parameters  
    - Designed for educational visualization and experimentation  
    """)

# ------------------------------------------------
# Purpose Section
# ------------------------------------------------
with st.expander("🎯 Purpose"):
    st.markdown("""
    This Space is designed for:
    - Demonstration and educational purposes  
    - Understanding **Seq2Seq + Attention mechanisms**  
    - Translating English sentences to Hindi  
    - Exploring how encoder outputs can serve as **context embeddings** for downstream NLP tasks  
    """)

# ------------------------------------------------
# Load models and tokenizers
# ------------------------------------------------
@st.cache_resource
def load_model_and_tokenizer(model_file, tokenizer_file):
    model_path = hf_hub_download(repo_id="Daksh0505/Seq2Seq-LSTM-MultiHeadAttention", filename=model_file)
    tokenizer_path = hf_hub_download(repo_id="Daksh0505/Seq2Seq-LSTM-MultiHeadAttention", filename=tokenizer_file)

    model = load_model(model_path)
    with open(tokenizer_path, "rb") as f:
        tokenizer = pickle.load(f)
    return model, tokenizer['english'], tokenizer['hindi']

model_12M, tokenizer_en_12, tokenizer_hi_12 = load_model_and_tokenizer(
    "seq2seq-lstm-multiheadattention-12.3.keras", "seq2seq-tokenizers-12.3M.pkl"
)
model_42M, tokenizer_en_42, tokenizer_hi_42 = load_model_and_tokenizer(
    "seq2seq-lstm-multiheadattention-42.keras", "seq2seq-tokenizers-42M.pkl"
)

# ------------------------------------------------
# Load dataset
# ------------------------------------------------
@st.cache_data
def load_data():
    return pd.read_csv("translation.csv").head(5000)

data = load_data()

# ------------------------------------------------
# Model selection
# ------------------------------------------------
st.subheader("⚙️ Select Model Size")
model_choice = st.radio("Choose a model:", ["12M parameters", "42M parameters"], index=0, horizontal=True)

if model_choice == "12M parameters":
    model = model_12M
    tokenizer_en = tokenizer_en_12
    tokenizer_hi = tokenizer_hi_12
    max_seq_len = 40
else:
    model = model_42M
    tokenizer_en = tokenizer_en_42
    tokenizer_hi = tokenizer_hi_42
    max_seq_len = 50

word2idx_en = tokenizer_en.word_index
word2idx_hi = tokenizer_hi.word_index
idx2word_hi = tokenizer_hi.index_word
max_vocab_en = len(word2idx_en) + 1

# ------------------------------------------------
# Safe layer extraction helper
# ------------------------------------------------
def get_layer_safe(model, possible_names):
    for name in possible_names:
        try:
            return model.get_layer(name)
        except:
            continue
    raise ValueError(f"No matching layer found among {possible_names}")

# ------------------------------------------------
# Build encoder & decoder inference
# ------------------------------------------------
@st.cache_resource
def build_inference_models(model):
    encoder_input = model.input[0]
    emb_layer = get_layer_safe(model, ['embedding', 'embedding_0'])
    norm_layer = get_layer_safe(model, ['layer_normalization', 'layer_normalization_0'])
    drop_layer = get_layer_safe(model, ['dropout', 'dropout_1'])
    lstm_layer = get_layer_safe(model, ['bidirectional'])

    enc_emb = drop_layer(norm_layer(emb_layer(encoder_input), training=False))
    enc_out, fh, fc, bh, bc = lstm_layer(enc_emb)
    state_h = Concatenate()([fh, bh])
    state_c = Concatenate()([fc, bc])
    encoder_model = Model(encoder_input, [enc_out, state_h, state_c])

    # Decoder
    decoder_input = Input(shape=(1,))
    decoder_lstm = get_layer_safe(model, ['lstm_1', 'lstm'])
    decoder_emb = get_layer_safe(model, ['embedding_1', 'embedding_2'])
    decoder_norm = get_layer_safe(model, ['layer_normalization_1', 'layer_normalization_2'])
    decoder_drop = get_layer_safe(model, ['dropout_2', 'dropout_1'])
    decoder_dense = get_layer_safe(model, ['dense'])
    attention_layer = get_layer_safe(model, ['multi_head_attention'])

    decoder_state_input_h = Input(shape=(decoder_lstm.units,))
    decoder_state_input_c = Input(shape=(decoder_lstm.units,))
    encoder_outputs_input = Input(shape=(None, decoder_lstm.units))

    dec_emb = decoder_drop(decoder_norm(decoder_emb(decoder_input), training=False))
    dec_out, dec_h, dec_c = decoder_lstm(dec_emb, initial_state=[decoder_state_input_h, decoder_state_input_c])
    context = attention_layer(query=dec_out, key=encoder_outputs_input, value=encoder_outputs_input)
    dec_combined = Concatenate(axis=-1)([context, dec_out])
    dec_final = decoder_dense(dec_combined)

    decoder_model = Model(
        [decoder_input, decoder_state_input_h, decoder_state_input_c, encoder_outputs_input],
        [dec_final, dec_h, dec_c]
    )

    return encoder_model, decoder_model

encoder_model, decoder_model = build_inference_models(model)

# ------------------------------------------------
# Helper functions
# ------------------------------------------------
def preprocess_input_sentence(sentence):
    oov_idx = word2idx_en.get('<OOV>', 1)
    seq = [word2idx_en.get(w.lower(), oov_idx) for w in sentence.split()]
    seq = [idx if idx < max_vocab_en else oov_idx for idx in seq]
    return pad_sequences([seq], maxlen=max_seq_len, padding='post')

def decode_sequence(input_seq):
    start_token = word2idx_hi['<start>']
    end_token = word2idx_hi['<end>']
    enc_outs, h, c = encoder_model.predict(input_seq, verbose=0)
    target_seq = np.array([[start_token]])
    decoded_sentence = []
    for _ in range(max_seq_len):
        output_tokens, h, c = decoder_model.predict([target_seq, h, c, enc_outs], verbose=0)
        sampled_idx = np.argmax(output_tokens[0, 0, :])
        if sampled_idx == end_token:
            break
        if sampled_idx > 0:
            decoded_sentence.append(idx2word_hi.get(sampled_idx, ''))
        target_seq[0, 0] = sampled_idx
    return " ".join(decoded_sentence)

# ------------------------------------------------
# Input selection with session_state caching
# ------------------------------------------------
st.subheader("📝 Try a Sample Translation")

# Initialize selected text in session_state
if "selected_text" not in st.session_state:
    st.session_state.selected_text = data["english"].iloc[0]
if "translation" not in st.session_state:
    st.session_state.translation = ""

# Dropdown for English sentence selection
selected_text = st.selectbox(
    "Select an English sentence:",
    data["english"].tolist(),
    index=data["english"].tolist().index(st.session_state.selected_text)
)

# Update session_state if selection changed
if selected_text != st.session_state.selected_text:
    st.session_state.selected_text = selected_text
    st.session_state.translation = ""  # Reset translation for new selection

original_hindi = data.loc[data["english"] == selected_text, "hindi"].values[0]
st.write("**Original English:**", selected_text)
st.write("**Reference Hindi:**", original_hindi)

# Translate button
if st.button("🚀 Translate"):
    with st.spinner("Generating translation..."):
        preprocessed = preprocess_input_sentence(selected_text)
        translation = decode_sequence(preprocessed)
    st.session_state.translation = translation

# Show cached translation if exists
if st.session_state.translation:
    st.success(f"✅ **Predicted Hindi Translation:** {st.session_state.translation}")

# ------------------------------------------------
# Learning Header
# ------------------------------------------------
st.subheader(" Understanding the Model")

# ------------------------------------------------
# Self Attention Section
# ------------------------------------------------
with st.expander("🔹 Self-Attention Mechanism"):
    st.markdown("""
    Self-Attention is a mechanism where each token in a sequence attends to **other tokens in the same sequence** to capture dependencies.
    
    **Key points:**
    - Helps the model focus on relevant words within the same sentence.
    - Computes attention scores between all pairs of positions in the input.
    - Often implemented as **Multi-Head Self-Attention** to capture different types of relationships simultaneously.
    
    **Example:**  
    In the sentence *"The cat sat on the mat"*, self-attention allows the model to understand that *"cat"* is related to *"sat"* and *"mat"*.
    """)
    
# ------------------------------------------------
# Cross Attention Section
# ------------------------------------------------
with st.expander("🔹 Cross-Attention Mechanism"):
    st.markdown("""
    Cross-Attention is used in encoder-decoder architectures where the **decoder attends to encoder outputs**.
    
    **Key points:**
    - Decoder queries encoder outputs to focus on relevant parts of the input sentence.
    - Crucial for translation, summarization, or any sequence-to-sequence task.
    
    **Example:**  
    Translating *"I am hungry"* to Hindi: when generating the Hindi word *"भूखा"*, cross-attention helps the decoder focus on *"hungry"* in the English input.
    """)

# ------------------------------------------------
# Multi-Head Attention Section
# ------------------------------------------------
with st.expander("🔹 Multi-Head Attention"):
    st.markdown("""
    Multi-Head Attention is an extension of the attention mechanism that allows the model to **capture information from different representation subspaces simultaneously**.

    **Key Points:**
    - Instead of using a single attention function, we use **multiple attention heads**.
    - Each head learns to focus on **different parts or relationships** of the input.
    - The outputs from all heads are **concatenated and linearly projected** to form the final context vector.
    - Improves the model’s ability to understand complex dependencies in sequences.

    **Example:**
    - In translating *"The cat sat on the mat"*:
        - Head 1 may focus on subject-verb relations (*cat ↔ sat*).
        - Head 2 may focus on verb-object relations (*sat ↔ mat*).
        - Head 3 may focus on positional or syntactic patterns.
    - Combining all heads gives a richer context for the decoder.
    
    **In your Seq2Seq Model:**
    - Multi-Head Attention can be used as:
        - **Self-Attention** in encoder/decoder layers
        - **Cross-Attention** between encoder outputs and decoder hidden states
    """)

# ------------------------------------------------
# Seq2Seq task Explaining Section
# ------------------------------------------------
with st.expander("🔹 Sequence-to-Sequence (Seq2Seq) Task"):
    st.markdown("""
    Seq2Seq models map an **input sequence** to an **output sequence**, often with **different lengths**.
    
    **Examples:**
    - Machine Translation: English → Hindi
    - Text Summarization
    - Chatbots / Dialogue Systems
    
    **Characteristics:**
    - Handles variable-length input and output sequences.
    - Uses encoder to process input, decoder to generate output.
    - Can integrate attention mechanisms to improve alignment between input and output tokens.
    """)

# ------------------------------------------------
# Seq2Seq Task- Fixed-Length vs Variable-Length Section
# ------------------------------------------------
with st.expander("🔹 Fixed-Length vs Variable-Length Tasks"):
    st.markdown("""
    **Fixed-Length Tasks:**
    - Input and output sequences have the **same length**.
    - Example: Time series forecasting with fixed steps, classification tasks.

    **Variable-Length Tasks:**
    - Input and output sequences can **differ in length**.
    - Example: Machine translation, summarization, speech recognition.
    - Seq2Seq models are designed to handle this flexibility.
    """)

# ------------------------------------------------
# Mathematics Expanders (Advanced / Optional)
# ------------------------------------------------
st.subheader("🧮 Mathematics Behind the Model")

with st.expander("🔹 Self-Attention Equations", expanded=False):
    st.markdown(r"""
    The attention function is computed as:

    \[
    \text{Attention}(Q,K,V) = \text{softmax}\left(\frac{Q K^T}{\sqrt{d_k}}\right) V
    \]

    Where:
    - \(Q\) = Query matrix  
    - \(K\) = Key matrix  
    - \(V\) = Value matrix  
    - \(d_k\) = Dimension of key vectors  

    This allows the model to compute a weighted sum of values based on relevance.
    """)

with st.expander("🔹 Multi-Head Attention Equations", expanded=False):
    st.markdown(r"""
    Multi-Head Attention combines multiple self-attention heads:

    \[
    \text{MultiHead}(Q,K,V) = \text{Concat}(\text{head}_1, ..., \text{head}_h) W^O
    \]

    Each head: 
    \[
    \text{head}_i = \text{Attention}(Q W_i^Q, K W_i^K, V W_i^V)
    \]

    Where \(W_i^Q, W_i^K, W_i^V, W^O\) are learnable projection matrices.
    """)

with st.expander("🔹 Cross-Attention / Encoder-Decoder Attention", expanded=False):
    st.markdown(r"""
    Cross-Attention computes attention using decoder queries and encoder outputs:

    \[
    \text{Context}_t = \text{Attention}(Q_t, K_{enc}, V_{enc})
    \]

    - \(Q_t\) = decoder hidden state at timestep \(t\)  
    - \(K_{enc}, V_{enc}\) = encoder outputs
    """)

with st.expander("🔹 Seq2Seq Decoder Step", expanded=False):
    st.markdown(r"""
    At each decoder timestep:

    \[
    s_t, c_t = \text{LSTM}(y_{t-1}, s_{t-1}, c_{t-1})
    \]

    \[
    \text{Output}_t = \text{Dense}(\text{Concat}(s_t, \text{Context}_t))
    \]
    """)

# ------------------------------------------------
# Show model architecture
# ------------------------------------------------
st.markdown("---")
show_arch = st.checkbox("🧩 Show Model Architecture")

if show_arch:
    layer_info = []
    for i, layer in enumerate(model.layers):
        try:
            out_shape = layer.output_shape
        except:
            try:
                out_shape = layer.output.shape
            except:
                out_shape = "N/A"
        layer_info.append([i, layer.name, str(out_shape)])
    df_layers = pd.DataFrame(layer_info, columns=["Index", "Layer Name", "Output Shape"])
    st.subheader("Model Architecture Overview")
    st.dataframe(df_layers, width='stretch')

# ------------------------------------------------
# Footer
# ------------------------------------------------
st.markdown("---")
st.subheader("🔗 Resources")
st.markdown("""
- 🧩 **Model Repository:** [Daksh0505/Seq2Seq-LSTM-MultiHeadAttention](https://huggingface.co/Daksh0505/Seq2Seq-LSTM-MultiHeadAttention)  
- 📘 **Dataset:** English–Hindi Parallel Corpus- IIT Bombay 
- 🧠 **Framework:** TensorFlow / Keras  
""")

st.caption("© 2025 Daksh Bhardwaj | For educational and research purposes.")