"""Streamlit demo: token embeddings + sinusoidal positional encodings.

Walks through tokenization (GPT-2 BPE), embedding lookup, positional
encoding, their sum, and approximate inversion of each step, with the
corresponding code and theory shown alongside every stage.
"""

import os

# Turn off Streamlit's automatic file-watching BEFORE importing streamlit:
# the watcher walks module __path__ attributes and chokes on torch.classes.
os.environ["STREAMLIT_SERVER_ENABLE_FILE_WATCHER"] = "false"

import sys
import types

import matplotlib.pyplot as plt
import numpy as np
import streamlit as st
import torch
from transformers import GPT2TokenizerFast

# Prevent Streamlit from trying to walk torch.classes' non-standard __path__
if isinstance(getattr(sys.modules.get("torch"), "classes", None), types.ModuleType):
    torch.classes.__path__ = []

# --- Setup ---
st.set_page_config(page_title="Text to Embedding Visualizer", layout="wide")
st.title("🔍 Token Embedding & Positional Encoding Coding Demo")

# --- Input UI ---
sentence = st.text_input("Enter your sentence", "Learning is fun")
embedding_dim = st.slider("Embedding Dimension (even only)", min_value=4, max_value=64, value=8, step=2)

# --- Load tokenizer ---
# Load tokenizer from bundled local files only (works offline, e.g. on Spaces
# where the default HF cache directory is not writable).
tokenizer = GPT2TokenizerFast.from_pretrained("./assets/tokenizer", local_files_only=True)

input_ids = tokenizer.encode(sentence, return_tensors="pt")[0]
tokens = tokenizer.convert_ids_to_tokens(input_ids)

st.markdown("### 1️⃣ Tokenization")
with st.expander("Token IDs and Subwords"):
    st.write("**Tokens:**", tokens)
    st.write("**Token IDs:**", input_ids.tolist())

with st.expander("📜 Show Code: Tokenization"):
    st.code("""
tokenizer = GPT2TokenizerFast.from_pretrained("gpt2")
input_ids = tokenizer.encode(sentence, return_tensors="pt")[0]
tokens = tokenizer.convert_ids_to_tokens(input_ids)
""", language="python")

# --- Embedding Matrix ---
torch.manual_seed(0)  # Reproducibility: same random embedding weights every run
embedding_matrix = torch.nn.Embedding(tokenizer.vocab_size, embedding_dim)
embedded = embedding_matrix(input_ids)

st.markdown("### 2️⃣ Embedding")
with st.expander("Show Token Embeddings"):
    st.write("Shape:", embedded.shape)
    st.write(embedded)

with st.expander("📜 Show Code: Embedding"):
    st.code(f"""
embedding_matrix = torch.nn.Embedding(tokenizer.vocab_size, {embedding_dim})
embedded = embedding_matrix(input_ids)
""", language="python")


# --- Positional Encoding ---
def get_positional_encoding(seq_len, dim):
    """Return the sinusoidal positional-encoding table, shape (seq_len, dim).

    PE(p, 2i)   = sin(p / 10000^(2i/dim))
    PE(p, 2i+1) = cos(p / 10000^(2i/dim))

    `dim` must be even (sin/cos pairs fill alternating columns; the UI
    slider enforces this with step=2).
    """
    pe = torch.zeros(seq_len, dim)
    position = torch.arange(0, seq_len, dtype=torch.float32).unsqueeze(1)
    div_term = torch.exp(torch.arange(0, dim, 2).float() * (-np.log(10000.0) / dim))
    pe[:, 0::2] = torch.sin(position * div_term)
    pe[:, 1::2] = torch.cos(position * div_term)
    return pe


pos_enc = get_positional_encoding(len(input_ids), embedding_dim)

st.markdown("### 3️⃣ Positional Encoding")
with st.expander("Show Positional Encoding"):
    st.write("Shape:", pos_enc.shape)
    st.write(pos_enc)

with st.expander("📜 Show Code: Positional Encoding"):
    st.code(f'''
def get_positional_encoding(seq_len, dim):
    pe = torch.zeros(seq_len, dim)
    position = torch.arange(0, seq_len).unsqueeze(1).float()
    div_term = torch.exp(torch.arange(0, dim, 2).float() * (-np.log(10000.0) / dim))
    pe[:, 0::2] = torch.sin(position * div_term)
    pe[:, 1::2] = torch.cos(position * div_term)
    return pe

pos_enc = get_positional_encoding(len(input_ids), {embedding_dim})
''', language="python")

# --- Combined Embedding + Position ---
embedded_with_pos = embedded + pos_enc

st.markdown("### 4️⃣ Embedding + Positional Encoding")
with st.expander("Show Combined Embedding"):
    st.write(embedded_with_pos)

with st.expander("📜 Show Code: Add Positional Encoding"):
    st.code("""
embedded_with_pos = embedded + pos_enc
""", language="python")


# --- Approximate Reverse to Token IDs ---
def find_closest_token(vec, emb_matrix):
    """Return the vocab ID whose embedding is most cosine-similar to `vec`."""
    sims = torch.nn.functional.cosine_similarity(vec.unsqueeze(0), emb_matrix.weight, dim=1)
    return torch.argmax(sims).item()


recovered_ids = [find_closest_token(vec, embedding_matrix) for vec in embedded]
recovered_tokens = tokenizer.convert_ids_to_tokens(recovered_ids)  # subword (BPE) pieces
recovered_text = tokenizer.decode(recovered_ids)                   # final string

st.markdown("### 5️⃣ Approximate Reverse")
with st.expander("Recovered Tokens and Text"):
    st.write("**Recovered Token IDs:**", recovered_ids)
    st.write("**Recovered Subword Tokens (BPE):**", recovered_tokens)
    st.write("**Recovered Sentence:**", recovered_text)

with st.expander("📜 Show Code: Recover Token IDs and Text"):
    st.code("""
def find_closest_token(vec, emb_matrix):
    sims = torch.nn.functional.cosine_similarity(vec.unsqueeze(0), emb_matrix.weight, dim=1)
    return torch.argmax(sims).item()

recovered_ids = [find_closest_token(vec, embedding_matrix) for vec in embedded]
recovered_tokens = tokenizer.convert_ids_to_tokens(recovered_ids)
recovered_text = tokenizer.decode(recovered_ids)
""", language="python")

# --- Recover Position (Approx) ---
# Subtracting the raw embedding isolates the positional component exactly,
# so position_error should be ~0 (up to float round-off).
recovered_pos = embedded_with_pos - embedded
position_error = pos_enc - recovered_pos

st.markdown("### 6️⃣ Recovered Positional Encoding")
with st.expander("Compare Recovered vs Original"):
    st.write("**Recovered Positional Encoding:**")
    st.write(recovered_pos)
    st.write("**Difference from Original (should be ~0):**")
    st.write(position_error)

with st.expander("📜 Show Code: Recovered Positional Encoding"):
    st.code("""
recovered_pos = embedded_with_pos - embedded
position_error = pos_enc - recovered_pos
""", language="python")


# Estimate position from positional encoding using cosine similarity
def estimate_position_from_encoding(pe_row, full_table):
    """Return the row index of `full_table` most cosine-similar to `pe_row`."""
    sims = torch.nn.functional.cosine_similarity(pe_row.unsqueeze(0), full_table, dim=1)
    return torch.argmax(sims).item()


# Build reference table of known encodings for positions 0 to N
reference_pos_table = get_positional_encoding(seq_len=len(input_ids), dim=embedding_dim)

# Now estimate each token's position
estimated_positions = [estimate_position_from_encoding(row, reference_pos_table) for row in recovered_pos]

st.markdown("### 7️⃣ Estimate Position from Positional Encoding")
with st.expander("Recovered Positions"):
    st.write("**Estimated Token Positions:**", estimated_positions)
    st.write("**Original True Positions:**", list(range(len(input_ids))))

with st.expander("📜 Show Code: Estimate Positions"):
    st.code("""
def estimate_position_from_encoding(pe_row, full_table):
    sims = torch.nn.functional.cosine_similarity(pe_row.unsqueeze(0), full_table, dim=1)
    return torch.argmax(sims).item()

reference_pos_table = get_positional_encoding(seq_len=len(input_ids), dim=embedding_dim)
estimated_positions = [estimate_position_from_encoding(row, reference_pos_table) for row in recovered_pos]
""", language="python")

st.markdown("### 📘 Final Notes: Theory & Formulas")
with st.expander("🧠 Theory and Formulas"):
    st.markdown(r"""
### 1️⃣ Tokenization (BPE)

We use **Byte Pair Encoding (BPE)** to break text into subword units. For example:

    "Learning is fun" → ["Learning", "Ġis", "Ġfun"]

Note: The "Ġ" indicates a **space** before the token.

---

### 2️⃣ Embedding

Each token ID $t_i \in \mathbb{Z}$ is mapped to a dense vector:

$$
\text{Embedding}(t_i) = \mathbf{e}_i \in \mathbb{R}^d
$$

Where:
- $t_i$: token ID
- $\mathbf{e}_i$: embedding vector of dimension $d$

---

### 3️⃣ Sinusoidal Positional Encoding

Used to encode the **position $p$** of a token without learnable parameters:

$$
\text{PE}(p, 2i) = \sin\left(\frac{p}{10000^{\frac{2i}{d}}}\right)
$$

$$
\text{PE}(p, 2i+1) = \cos\left(\frac{p}{10000^{\frac{2i}{d}}}\right)
$$

Where:
- $p$: position index (0, 1, 2, …)
- $i$: dimension index
- $d$: total embedding dimension

This gives a positional vector $\text{PE}(p) \in \mathbb{R}^d$

---

### 4️⃣ Add Embedding and Positional Encoding

We add the embedding and positional encoding element-wise:

$$
\mathbf{z}_i = \mathbf{e}_i + \text{PE}(p_i)
$$

Where:
- $\mathbf{z}_i$: final input to the transformer

---

### 5️⃣ Reverse Lookup (Approximate)

We find the nearest embedding using cosine similarity:

$$
\hat{t}_i = \underset{j}{\arg\max} \left( \frac{ \mathbf{z}_i \cdot \mathbf{e}_j }{ \| \mathbf{z}_i \| \, \| \mathbf{e}_j \| } \right)
$$

---

### 6️⃣ Recover Position from Embedding + PE

To isolate positional encoding:

$$
\text{Recovered PE}_i = \mathbf{z}_i - \mathbf{e}_i
$$

We then compare this with reference positional encodings to estimate token position.

---

### 🌟 Summary Table

| Step | What Happens |
|------|--------------|
| **Tokenization** | Sentence → Subwords → Token IDs |
| **Embedding** | Token IDs → Vectors |
| **Pos Encoding** | Position Index → Sin/Cos Vector |
| **Sum** | Embedding + PE = Input to Transformer |
| **Reverse** | Approximate token ID from vector |
| **PE Recovery** | Recover position using similarity |
""", unsafe_allow_html=True)

st.markdown("### 🤖 Transformer Internals: Key Concepts")

with st.expander("🔁 Multi-Head Attention: Q, K, V Projections"):
    st.markdown(r"""
Each token embedding $\mathbf{x}_i$ is linearly projected into:

- Query vector: $Q_i = \mathbf{x}_i W^Q$
- Key vector: $K_i = \mathbf{x}_i W^K$
- Value vector: $V_i = \mathbf{x}_i W^V$

All of shape: $\mathbb{R}^{d_{model} \times d_{head}}$

Multiple such projections (heads) run in parallel:

$$
\text{MultiHead}(X) = \text{Concat}(\text{head}_1, ..., \text{head}_h) W^O
$$

Each head does:

$$
\text{Attention}(Q, K, V) = \text{softmax}\left( \frac{Q K^\top}{\sqrt{d_k}} \right) V
$$
""", unsafe_allow_html=True)

with st.expander("🧠 Contextualized Representations"):
    st.markdown(r"""
The attention mechanism lets each token **attend to others**, allowing the output for each token to contain **context**.

For example:
- Token "fun" gets influenced by "is" and "learning"
- The output is no longer static but dynamic, depending on sentence context

This is what makes Transformers powerful for understanding relationships between tokens.
""")

with st.expander("🛠 Feed-Forward Neural Network (FFN)"):
    st.markdown(r"""
After attention, each token’s vector goes through a two-layer feed-forward network applied independently:

$$
\text{FFN}(x) = \max(0, x W_1 + b_1) W_2 + b_2
$$

This allows deeper transformations on each token representation.
""")

with st.expander("📊 Softmax Over Vocabulary"):
    st.markdown(r"""
The final output layer transforms each token representation to **logits** for the full vocabulary.

Then, softmax is applied to convert them into probabilities:

$$
P(w_i \mid \text{context}) = \frac{\exp(\text{logit}_i)}{\sum_j \exp(\text{logit}_j)}
$$

The token with the highest probability is typically selected as the **predicted next word**.
""")

with st.expander("🔮 Predicted Next Token"):
    st.markdown(r"""
By chaining all steps (embedding → attention → FFN → softmax), the model predicts the **next token**:

E.g., Input: `"Learning is"` Predicted next token: `"fun"`

This is how autoregressive models like GPT-2 **generate text** one token at a time.
""")

st.markdown("### 🎨 Visualizations: Transformer Internals")

# ---- 1. Attention Heatmap ----
with st.expander("🔁 Multi-Head Attention Score Heatmap (QKᵀ / √d)"):
    st.markdown("""
This heatmap shows how the attention mechanism scores each query against all keys.
Brighter color = higher attention weight.

$$
\\text{Attention}(Q, K, V) = \\text{softmax}\\left( \\frac{QK^T}{\\sqrt{d_k}} \\right)V
$$
""", unsafe_allow_html=True)

    # Toy 2-d Q/K vectors for a fixed 3-token example (independent of the
    # user's sentence; renamed so it doesn't shadow the real `tokens` above).
    demo_tokens = ["Learning", "is", "fun"]
    Q = np.array([[1, 0], [0.5, 0.5], [0, 1]])
    K = np.array([[1, 0], [0.5, 0.5], [0, 1]])
    scores = np.dot(Q, K.T) / np.sqrt(2)  # scaled dot-product, d_k = 2
    softmax_scores = np.exp(scores) / np.sum(np.exp(scores), axis=1, keepdims=True)

    fig1, ax1 = plt.subplots()
    cax = ax1.matshow(softmax_scores, cmap="Blues")
    fig1.colorbar(cax)
    ax1.set_xticks(np.arange(len(demo_tokens)))
    ax1.set_xticklabels(demo_tokens)
    ax1.set_yticks(np.arange(len(demo_tokens)))
    ax1.set_yticklabels(demo_tokens)
    ax1.set_xlabel("Key Tokens (K)")
    ax1.set_ylabel("Query Tokens (Q)")
    ax1.set_title("Attention Score Heatmap")
    st.pyplot(fig1)

# ---- 2. Softmax Curve ----
with st.expander("📊 Softmax Curve for Vocabulary Logits"):
    st.markdown("""
This curve shows how softmax converts logits into probabilities.

Higher logits result in higher predicted probabilities:

$$
\\text{Softmax}(x_i) = \\frac{e^{x_i}}{\\sum_j e^{x_j}}
$$
""", unsafe_allow_html=True)

    # Three shifted logit curves, normalized column-wise (axis=0) so that at
    # each x the three token probabilities sum to 1.
    x = np.linspace(-4, 4, 100)
    logits = np.vstack([x, x + 1, x - 1])
    exps = np.exp(logits)
    softmax = exps / np.sum(exps, axis=0)

    fig2, ax2 = plt.subplots()
    ax2.plot(x, softmax[0], label='Token A')
    ax2.plot(x, softmax[1], label='Token B')
    ax2.plot(x, softmax[2], label='Token C')
    ax2.set_title("Softmax Output vs Logit Value")
    ax2.set_xlabel("Logit")
    ax2.set_ylabel("Probability")
    ax2.legend()
    st.pyplot(fig2)