embedding-positional

Sleeping

App Files Files Community

schoginitoys committed on May 27, 2025

Commit

316297e

verified ·

1 Parent(s): 0ed5ac0

Update src/streamlit_app.py

Browse files

Files changed (1) hide show

src/streamlit_app.py +120 -678

src/streamlit_app.py CHANGED Viewed

@@ -1,694 +1,136 @@
-# ONCE
-# from transformers import GPT2TokenizerFast, GPT2Model
-# import os
-# # Load from local offline folder
-# model = GPT2Model.from_pretrained("./models")
-# tokenizer = GPT2TokenizerFast.from_pretrained("./models")
-# from transformers import GPT2Model, GPT2TokenizerFast
-# model = GPT2Model.from_pretrained("gpt2")
-# tokenizer = GPT2TokenizerFast.from_pretrained("gpt2")
-# model.save_pretrained("./models")
-# tokenizer.save_pretrained("./models")
-# model = GPT2Model.from_pretrained("openai-community/gpt2")
-# tokenizer = GPT2TokenizerFast.from_pretrained("openai-community/gpt2")
-# model.save_pretrained("models")
-# tokenizer.save_pretrained("models")
-# from transformers import GPT2Tokenizer, GPT2Model
-# model_id = "gpt2"
-# GPT2Model.from_pretrained(model_id).save_pretrained("models")
-# GPT2Tokenizer.from_pretrained(model_id).save_pretrained("models")
-# print("✅ Downloaded and saved GPT-2 to models")
 import streamlit as st
-st.set_page_config(page_title="GPT-2 Attention Explorer", layout="wide")
-import torch
 import numpy as np
 from transformers import GPT2TokenizerFast, GPT2Model
-import seaborn as sns
-import matplotlib.pyplot as plt
-import pandas as pd
 @st.cache_resource
-def load_model():
-    tokenizer = GPT2TokenizerFast.from_pretrained("./models")
-    model = GPT2Model.from_pretrained("./models", output_attentions=True, attn_implementation="eager")
-    model.eval()
-    return tokenizer, model
-tokenizer, model = load_model()
-st.title("🧠 GPT-2 Token Inspector + Self-Attention Visualizer")
-with st.expander("📊 GPT-2 Model Architecture Summary"):
-    st.markdown("""
-    - **Vocabulary size (V):** `50257`
-    - **Embedding dimension (d):** `768`
-    - **Max Position Length (L):** `1024`
-      - This is sometimes also called:
-          - n_positions in config
-          - max sequence length
-          - context length
-          - max context window
-    - **Transformer Layers:** `12`
-    - **Attention Heads per Layer:** `12`
-    - **Per-head Dimension (dₖ):** `64`
-    - **Feedforward Hidden Layer Size:** `3072`
-    - **Total Parameters:** ~117 million
----
-## Question: Transformer Layers: 12 means each layer has 12 Attention Heads?
-## 🧠 Quick Answer:
-> ✅ **No**, 12 Transformer Layers ≠ 12 Heads per Layer
-> 🔁 But in **GPT-2 (small)**, both happen to be **12** — **by design coincidence**, not definition.
----
-## 🔍 Breakdown of GPT-2’s Architecture
-| Component                     | GPT-2 (small) default |
-| ----------------------------- | --------------------- |
-| Embedding size (`d_model`)    | 768                   |
-| **Transformer layers**        | 12                    |
-| **Attention heads per layer** | 12                    |
-| Hidden feedforward size       | 3072                  |
-| Max position embeddings       | 1024                  |
----
-### ✅ So in GPT-2:
-* Each of the **12 transformer layers** has:
-  * **Multi-head attention**
-  * With **12 heads per layer**
-  * Each head has `64` dimensions (`768 ÷ 12 = 64`)
----
-## 📌 Why this Confusion Happens
-The number of **layers** and **heads per layer** are:
-* Configured independently in the model
-* But **coincidentally** both set to 12 in GPT-2 small
-In other models:
-| Model        | Layers | Heads per Layer |
-| ------------ | ------ | --------------- |
-| GPT-2 Medium | 24     | 16              |
-| GPT-2 Large  | 36     | 20              |
-| GPT-3        | 96     | 96              |
-| LLaMA 2 7B   | 32     | 32              |
-So again:
-> 🔁 **12 layers ≠ 12 heads** in general — it's just a choice in GPT-2 small.
----
-## 💡 Want a table in your app to explain this too?
-I can give you a section like:
-> "🧩 Layers vs Heads — What's the Difference?"
-Let me know and I’ll drop in that Streamlit code too.
-    """)
-sentence = st.text_input("Enter a sentence:", "The cat sat on the mat")
-if st.button("Analyze & Visualize") and sentence.strip():
-    inputs = tokenizer(sentence, return_tensors='pt', return_offsets_mapping=True, return_special_tokens_mask=True)
-    token_ids = inputs['input_ids'][0]
-    tokens = tokenizer.convert_ids_to_tokens(token_ids)
-    position_ids = torch.arange(token_ids.shape[0]).unsqueeze(0)
-    inputs.pop("special_tokens_mask", None)
-    inputs.pop("offset_mapping", None)
-    with torch.no_grad():
-        outputs = model(**inputs, position_ids=position_ids)
-    attentions = outputs.attentions
-    embeddings = outputs.last_hidden_state[0].numpy()
-    pos_embedding_layer = model.wpe
-    pos_embeddings = pos_embedding_layer(position_ids).squeeze(0).detach().numpy()
-    word_embedding_layer = model.wte
-    word_embeddings = word_embedding_layer(token_ids).detach().numpy()
-    final_input = word_embeddings + pos_embeddings
-    # 1. BPE Tokens
-    st.subheader("🧾 Byte Pair Encoded Tokens (BPE)")
-    st.markdown("GPT-2 uses **Byte Pair Encoding (BPE)** to split input text into subword units.")
-    st.code(" ".join(tokens))
-    # 2. Token IDs
-    st.subheader("🔢 Token IDs")
-    st.markdown("Each token is mapped to an integer ID using the GPT-2 vocabulary.")
-    st.code(token_ids.tolist())
-    # 3. Word Embeddings
-    st.subheader("💎 Raw Word Embeddings (first 5 tokens)")
-    st.markdown("Each token ID is used to lookup a learnable word embedding vector:")
-    st.latex(r"\text{Embedding}(t_i) = \mathbf{E}[t_i]")
-    st.markdown(r"Where $\mathbf{E} \in \mathbb{R}^{V \times d}$ with $V$ = vocab size and $d = 768$.")
-    df_word_embed = pd.DataFrame(word_embeddings[:5])
-    df_word_embed.index = [f"{i}: {tok}" for i, tok in enumerate(tokens[:5])]
-    st.dataframe(df_word_embed.style.format(precision=4))
-    # 4. Positional Encodings
-    st.subheader("🧭 Positional Encodings (first 5 tokens)")
-    st.markdown("GPT-2 adds learned positional vectors from a table indexed by position:")
-    st.latex(r"\text{PosEnc}(i) = \mathbf{P}[i]")
-    st.markdown("Example (first 5 positions, first 5 dimensions):")
-    df_pos_example = pd.DataFrame(pos_embeddings[:5, :5],
-                                  columns=[f"dim {i}" for i in range(5)],
-                                  index=[f"{i}: {tok}" for i, tok in enumerate(tokens[:5])])
-    st.dataframe(df_pos_example.style.format(precision=5))
-    st.markdown(r"Where $\mathbf{P} \in \mathbb{R}^{L \times d}$ is learned and not sinusoidal in GPT-2.")
-    # 5. Final Input Vectors
-    st.subheader("🧮 Final Input = Word Embedding + Positional Encoding")
-    st.markdown("These are the actual vectors passed into the first transformer block:")
-    st.latex(r"\mathbf{X}_i = \text{Embedding}(t_i) + \text{PosEnc}(i)")
-    st.markdown("Let's confirm this by showing:")
-    st.code("final_input[i][j] ≈ word_embedding[i][j] + pos_embedding[i][j]")
-    for i in range(2):  # for first 2 tokens
-        df_sum_example = pd.DataFrame({
-            'Word': word_embeddings[i, :5],
-            'PosEnc': pos_embeddings[i, :5],
-            'Final Input': final_input[i, :5],
-            'Word + Pos': word_embeddings[i, :5] + pos_embeddings[i, :5]
-        })
-        df_sum_example.index = [f"dim {j}" for j in range(5)]
-        st.markdown(f"**Token {i}: `{tokens[i]}`**")
-        st.dataframe(df_sum_example.style.format(precision=5))
-    # 6. Output Embeddings
-    st.subheader("📐 Output Embedding Vectors (first 5 tokens)")
-    st.markdown("These are the final hidden states after passing through all transformer layers:")
-    st.latex(r"\text{Output}_i = \text{TransformerLayers}(\mathbf{X}_i)")
-    df_embed_example = pd.DataFrame(embeddings[:5, :5],
-                                     columns=[f"dim {j}" for j in range(5)],
-                                     index=[f"{i}: {tok}" for i, tok in enumerate(tokens[:5])])
-    st.dataframe(df_embed_example.style.format(precision=5))
-    st.markdown("📌 These are **not** equal to the input vectors—they are fully context-aware representations!")
-    # 🔄 Move sliders here just above heatmap
-    layer_num = st.slider("Select Transformer Layer", 0, model.config.n_layer - 1, 0)
-    head_num = st.slider("Select Attention Head", 0, model.config.n_head - 1, 0)
-    attn = attentions[layer_num][0, head_num].numpy()
-    # 7. Attention Heatmap
-    st.subheader(f"🎯 Attention Heatmap — Layer {layer_num+1}, Head {head_num+1}")
-    st.markdown("This shows how each token attends to others in the sequence:")
-    st.latex(r"\text{Attention}(Q, K, V) = \text{softmax} \left( \frac{QK^\top}{\sqrt{d_k}} \right) V")
-    fig, ax = plt.subplots(figsize=(8, 6))
-    sns.heatmap(attn, xticklabels=tokens, yticklabels=tokens, cmap="YlOrRd", annot=True, fmt=".2f", ax=ax)
-    ax.set_xlabel("Key Tokens")
-    ax.set_ylabel("Query Tokens")
-    st.pyplot(fig)
-    # 8. Attention Head Breakdown (for token 0)
-    st.subheader("🔍 Attention Head Breakdown (1 Token)")
-    st.markdown("Let's inspect how **GPT-2 computes attention for a single token** (first token in the sequence).")
-    # Fetch weight matrix for Q, K, V from the model's first block
-    # block = model.transformer.h[0]  # Use layer 0
-    block = model.h[0]  # ✅ Correct for GPT2Model
-    # W_qkv = block.attn.c_attn.weight.detach().numpy().T  # shape (768, 3*768)
-    W_qkv = block.attn.c_attn.weight.detach().numpy()  # ✅ shape (2304, 768)
-    b_qkv = block.attn.c_attn.bias.detach().numpy()      # shape (3*768,)
-    # Final input for token 0
-    x0 = final_input[0]  # shape (768,)
-    # Linear projection for Q, K, V
-    qkv = x0 @ W_qkv + b_qkv  # shape (3*768,)
-    Q, K, V = np.split(qkv, 3)
-    # Show Q, K, V for head 0
-    Q0 = Q[:64]
-    K0_all = K.reshape(12, 64)  # For all heads
-    V0_all = V.reshape(12, 64)
-    K0 = K0_all[0]
-    V0 = V0_all[0]
-    # Dot product and softmax
-    score = Q0 @ K0.T  # scalar
-    scaled_score = score / np.sqrt(64)
-    softmax_weight = np.exp(scaled_score) / np.sum(np.exp(scaled_score))
-    attn_output = softmax_weight * V0  # simulated for 1 token self-attending to itself
-    st.markdown("### Formula Recap")
-    st.latex(r"Q = x W^Q,\quad K = x W^K,\quad V = x W^V")
-    st.latex(r"\text{Attention}(Q, K, V) = \text{softmax}\left(\frac{QK^\top}{\sqrt{d_k}}\right)V")
-    # Show Q0, K0, softmax and V0
-    df_breakdown = pd.DataFrame({
-        "Q₀": Q0,
-        "K₀": K0,
-        "Q₀·K₀": Q0 * K0,
-        "V₀": V0,
-        "AttnOut": attn_output
-    })
-    df_breakdown.index = [f"dim {i}" for i in range(64)]
-    st.dataframe(df_breakdown.style.format(precision=5))
-    st.markdown("### 🧮 Self-Attention Matrix Shape Annotations")
-    st.markdown("""
-    **Key tensor dimensions involved in attention computation:**
-    - `W_qkv`: **(2304, 768)** – learned projection matrix for Q, K, V combined
-    - `b_qkv`: **(2304,)** – bias vector
-    - `X`: **(5, 768)** – input vectors for 5 tokens
-    - `qkv_all = X @ W_qkv + b_qkv`: → **(5, 2304)**
-    - `Q_all, K_all, V_all = np.split(qkv_all, 3)`: → each **(5, 768)**
-    - `Q0, K0, V0 = [:, :64]`: head 0 slice → **(5, 64)**
-    - `q0 @ K0.T`: **(1, 64) × (64, 5)** → **(1, 5)**
-    - `softmax_weights`: **(1, 5)**
-    - `attn_output = softmax_weights @ V0`: **(1, 64)**
-    """)
-    # 9. Matrix-Level Self-Attention (Token 0 → All)
-    st.subheader("🔬 Matrix-Level Self-Attention (Token 0 → All)")
-    st.markdown("""
-    This section shows how **Token 0** attends to all other tokens using matrix-level self-attention.
-    We compute the dot products, apply softmax, and produce the output for head 0 in layer 0.
-    """)
-    # Use same block
-    block = model.h[0]
-    W_qkv = block.attn.c_attn.weight.detach().numpy()  # (2304, 768)
-    b_qkv = block.attn.c_attn.bias.detach().numpy()    # (2304,)
-    X = final_input[:5]  # (5, 768)
-    # Compute Q, K, V for all 5 tokens
-    # qkv_all = X @ W_qkv.T + b_qkv  # shape (5, 2304)
-    qkv_all = X @ W_qkv + b_qkv  # ✅ (5 × 768) @ (768 × 2304)
-    Q_all, K_all, V_all = np.split(qkv_all, 3, axis=1)
-    # Head 0 slices
-    Q0 = Q_all[:, :64]   # (5, 64)
-    K0 = K_all[:, :64]   # (5, 64)
-    V0 = V_all[:, :64]   # (5, 64)
-    # Compute raw attention scores for token 0
-    q0 = Q0[0].reshape(1, 64)        # (1, 64)
-    attn_scores = q0 @ K0.T          # (1, 5)
-    scaled_scores = attn_scores / np.sqrt(64)
-    softmax_weights = np.exp(scaled_scores)
-    softmax_weights /= softmax_weights.sum(axis=-1, keepdims=True)  # shape (1, 5)
-    # Weighted sum of V0 rows
-    attn_output_0 = softmax_weights @ V0  # (1, 64)
-    # Display matrices
-    st.markdown("### Raw Scaled Attention Scores (Q₀Kᵀ / √dₖ):")
-    df_scores = pd.DataFrame(scaled_scores[0], columns=["Score"], index=[f"Token {i}" for i in range(5)])
-    st.dataframe(df_scores.style.format(precision=5))
-    st.markdown("### Softmax Attention Weights αᵢ:")
-    df_weights = pd.DataFrame(softmax_weights[0], columns=["Weight αᵢ"], index=[f"Token {i}" for i in range(5)])
-    st.dataframe(df_weights.style.format(precision=5))
-    st.markdown("### Value Vᵢ vectors (Head 0, first 5 dims):")
-    df_values = pd.DataFrame(V0[:, :5], columns=[f"dim {i}" for i in range(5)],
-                             index=[f"Token {i}" for i in range(5)])
-    st.dataframe(df_values.style.format(precision=5))
-    st.markdown("### Final Attention Output (weighted sum of Vᵢ):")
-    df_attn_out = pd.DataFrame(attn_output_0[:, :5], columns=[f"dim {i}" for i in range(5)],
-                               index=["AttnOut₀"])
-    st.dataframe(df_attn_out.style.format(precision=5))
-    # 10. Per-Head Projection Matrices
-    st.subheader("🧬 Per-Head Projection Matrices (Wq, Wk, Wv)")
-    st.markdown("""
-    In GPT-2, each attention **head has its own set of projection weights** to compute Queries (Q), Keys (K), and Values (V) from the input vector.
-    The full `W_qkv` layer maps from **(768,) → (2304,)** and is split into 3 parts:
-    - `Wq` = first 768 columns → shape `(768, 768)`
-    - `Wk` = next 768 columns  → shape `(768, 768)`
-    - `Wv` = last 768 columns  → shape `(768, 768)`
-    Each head receives a unique slice from each projection:
-    - 12 heads × 64 dimensions = 768
-    - So head 0 → `Wq[:, :64]`, head 1 → `Wq[:, 64:128]`, etc.
-    """)
-    block = model.h[0]
-    W_qkv_full = block.attn.c_attn.weight.detach().numpy().T  # shape (768, 2304)
-    W_q, W_k, W_v = np.split(W_qkv_full, 3, axis=1)  # each: (768, 768)
-    # Show Wq head 0 and 1
-    Wq_head0 = W_q[:, :64]
-    Wq_head1 = W_q[:, 64:128]
-    df_q = pd.DataFrame({
-        "Wq_head0": Wq_head0[:5, 0],
-        "Wq_head1": Wq_head1[:5, 0]
-    }, index=[f"dim {i}" for i in range(5)])
-    st.markdown("### Wq projection weights for head 0 vs head 1 (first 5 input dims → output dim 0):")
-    st.dataframe(df_q.style.format(precision=5))
-    # Show Wk and Wv for head 0
-    Wk_head0 = W_k[:, :64]
-    Wv_head0 = W_v[:, :64]
-    df_kv = pd.DataFrame({
-        "Wk_head0": Wk_head0[:5, 0],
-        "Wv_head0": Wv_head0[:5, 0]
-    }, index=[f"dim {i}" for i in range(5)])
-    st.markdown("### Wk and Wv projection weights for head 0 (first 5 input dims → output dim 0):")
-    st.dataframe(df_kv.style.format(precision=5))
-    st.markdown("""
-    ✅ This confirms that each head has **distinct projections** for Q, K, and V.
-    The same input `x` is transformed differently per head, allowing GPT-2 to learn different attention perspectives.
-    """)
-    # 11 · 📐 How W_qkv Projects an Input Vector into Q, K, V
-    st.subheader("📐 How W_qkv Projects an Input Vector → Q, K, V")
-    st.markdown("""
-    In GPT-2, the combined projection layer `c_attn` maps a single input embedding
-    into a concatenated vector that contains **Q, K, and V**.
-    Each of these is 768-dimensional, so the full output is 768 × 3 = 2304.
-    """)
-    st.latex(r"x \in \mathbb{R}^{768} \quad \rightarrow \quad [Q \;|\; K \;|\; V] \in \mathbb{R}^{2304}")
-    st.markdown("---")
-    st.markdown("### 🧪 Mini GPT Example (3D → 6D Projection)")
-    st.markdown("Imagine a tiny model:")
-    st.markdown("""
-    - Input vector `x ∈ ℝ³`
-    - Q, K, V are each 2D → total output = 6D
-    - Thus:
-    """)
-    st.latex(r"W_{\text{qkv}} \in \mathbb{R}^{6 \times 3}, \quad b_{\text{qkv}} \in \mathbb{R}^6")
-    # Miniature input vector and projection weights
-    mini_x = np.array([1.0, 2.0, 3.0])                # (3,)
-    mini_W = np.array(                                 # (6, 3)
-        [
-            [0.1, 0.2, 0.3],   # → Q₁
-            [0.4, 0.5, 0.6],   # → Q₂
-            [0.7, 0.8, 0.9],   # → K₁
-            [1.0, 1.1, 1.2],   # → K₂
-            [1.3, 1.4, 1.5],   # → V₁
-            [1.6, 1.7, 1.8],   # → V₂
-        ]
     )
-    mini_b = np.array([0.01, 0.02, 0.03, 0.04, 0.05, 0.06])  # (6,)
-    mini_out = mini_W @ mini_x + mini_b                      # (6,)
-    Qm, Km, Vm = np.split(mini_out, 3)                       # each (2,)
-    st.code("Input vector x = [1.0, 2.0, 3.0]   # shape (3,)")
-    st.code("W_qkv shape = (6, 3)   # maps 3 → 6")
-    st.code(f"Output = W_qkv @ x + b = {mini_out.round(2).tolist()}")
-    df_mini = pd.DataFrame(
-        {
-            "Q": Qm.round(2),
-            "K": Km.round(2),
-            "V": Vm.round(2)
-        },
-        index=["dim 1", "dim 2"]
     )
-    st.markdown("**Split into Q, K, V (each 2D):**")
-    st.dataframe(df_mini.style.format(precision=2))
-    st.markdown("---")
-    st.markdown("### 📏 Real GPT-2 Projection Shapes")
-    df_shapes = pd.DataFrame({
-        "Tensor": [
-            "Input x",
-            "W_qkv (linear layer)",
-            "b_qkv (bias)",
-            "Output = x @ W_qkv + b",
-            "Q / K / V each",
-            "Head reshaping"
-        ],
-        "Shape": [
-            "(768,)",
-            "(2304, 768)",
-            "(2304,)",
-            "(2304,)",
-            "(768,)",
-            "12 heads × 64 dims = 768"
-        ]
-    })
-    st.dataframe(df_shapes)
-    st.markdown("""
-    Each attention **head** gets its own slice:
-    - Q_head₀ = Q[:, :64]
-    - K_head₀ = K[:, :64]
-    - V_head₀ = V[:, :64]
-    That’s how one input vector creates multi-headed Q, K, and V for scaled dot-product attention.
-    """)
-    st.subheader("Additional notes:")
-    st.markdown(
-        """
----
-## 🧠 What Does `Ġ` Mean?
-The character `Ġ` (U+0120: Latin Capital Letter G with dot above) is used to:
-> **Represent a leading space** before the token.
----
-### ✅ Example:
-Let’s look at a sentence:
-```
-"The cat sat on the mat"
-```
-When tokenized using GPT-2 tokenizer (`GPT2TokenizerFast`), it becomes:
-```
-['The', 'Ġcat', 'Ġsat', 'Ġon', 'Ġthe', 'Ġmat']
-```
-* `'The'` → First word, no leading space.
-* `'Ġcat'` → Space + "cat"
-* `'Ġsat'` → Space + "sat"
-* etc.
-So `Ġ` means:
-> "This token starts after a space."
----
-### ⚠️ Why Not Just Use `" "`?
-Because GPT-2 uses a **vocabulary of subword units** (BPE). These tokens are strings, not raw characters or bytes. Including space as a separate token would have complicated the merge process. So:
-* `Ġ` = internal marker used in the vocabulary file
-* It's not a space character but tells the tokenizer "insert space before decoding this."
----
-### ✅ When Detokenizing
-The tokenizer **removes the `Ġ` and adds a space** during decoding:
-```python
-from transformers import GPT2TokenizerFast
-tokenizer = GPT2TokenizerFast.from_pretrained("gpt2")
-tokens = tokenizer.tokenize("The cat sat on the mat")
-print(tokens)
-# ['The', 'Ġcat', 'Ġsat', 'Ġon', 'Ġthe', 'Ġmat']
-ids = tokenizer.convert_tokens_to_ids(tokens)
-decoded = tokenizer.decode(ids)
-print(decoded)
-# 'The cat sat on the mat'
-```
----
-## ✅ Summary
-| Token    | Interprets As             |
-| -------- | ------------------------- |
-| `'The'`  | `'The'` (no space before) |
-| `'Ġcat'` | `' cat'`                  |
-| `'Ġsat'` | `' sat'`                  |
-| `'Ġon'`  | `' on'`                   |
-| `'Ġthe'` | `' the'`                  |
-| `'Ġmat'` | `' mat'`                  |
----
-## ✅ What is `@` in Python?
-In Python 3.5+, the `@` operator means:
-> **Matrix multiplication** (also called **dot product** or **tensor contraction** depending on context)
----
-### ✅ Equivalent to:
-```python
-A @ B    ⟺    np.matmul(A, B)
-```
-Or if both are 1D/2D NumPy arrays:
-```python
-A @ B    ⟺    np.dot(A, B)
-```
----
-## 🔍 In your case:
-```python
-Output = W_qkv @ x + b
-```
-### Let’s say:
-* `x` = shape **(3,)**
-* `W_qkv` = shape **(6, 3)**
-* `b` = shape **(6,)**
----
-### Then:
-* `W_qkv @ x` → matrix–vector multiplication
-  → shape: **(6,)**
-* Adding `b` → element-wise vector addition
-  → final shape: **(6,)**
----
-### So this line:
-```python
-Output = W_qkv @ x + b
-```
-Means:
-1. Multiply the **input vector `x`** with the **projection matrix `W_qkv`**
-2. Add a **bias vector `b`**
-3. Result = combined **\[Q | K | V]** output
----
-## ✅ Example:
-```python
-x = np.array([1, 2, 3])
-W_qkv = np.array([
-  [0.1, 0.2, 0.3],  # Q1
-  [0.4, 0.5, 0.6],  # Q2
-  [0.7, 0.8, 0.9],  # K1
-  [1.0, 1.1, 1.2],  # K2
-  [1.3, 1.4, 1.5],  # V1
-  [1.6, 1.7, 1.8],  # V2
-])
-b = np.array([0.01, 0.02, 0.03, 0.04, 0.05, 0.06])
-output = W_qkv @ x + b
-```
-Manually:
-* `W_qkv @ x` = `[1.4, 3.2, 5.0, 6.8, 8.6, 10.4]`
-* After adding `b` → `[1.41, 3.22, 5.03, 6.84, 8.65, 10.46]`
----
-## ✅ Summary
-| Expression    | Meaning                       |
-| ------------- | ----------------------------- |
-| `@`           | Matrix multiplication (`dot`) |
-| `W @ x + b`   | Linear transformation         |
-| Shape `W @ x` | `(m, n) @ (n,) = (m,)`        |
-Would you like to include this in your Streamlit visualizer as an expandable note or equation section?
-        """)

 import streamlit as st
 import numpy as np
 from transformers import GPT2TokenizerFast, GPT2Model
+# 1. Load tokenizer and model
 @st.cache_resource
+def load_resources():
+    # tokenizer = GPT2TokenizerFast.from_pretrained("gpt2")
+    tokenizer = GPT2TokenizerFast.from_pretrained("./assets/tokenizer", local_files_only=True)
+    # model = GPT2Model.from_pretrained("gpt2")
+    model = GPT2Model.from_pretrained("./assets/model", local_files_only=True)
+    # from transformers import GPT2TokenizerFast
+    # # Load tokenizer from bundled local files only
+    #
+    return tokenizer, model
+# Initialize resources
+tokenizer, model = load_resources()
+# 2. Helper to get the full embedding matrix
+@st.cache_resource
+def get_embedding_matrix():
+    return model.get_input_embeddings().weight.detach().cpu().numpy()
+# 3. Initialize session state
+for key in ["tokens", "token_ids", "embeddings", "current_id"]:
+    if key not in st.session_state:
+        if key in ["tokens", "token_ids"]:
+            st.session_state[key] = []
+        else:
+            st.session_state[key] = {} if key == "embeddings" else None
+st.title("🔍 Embedding & Positional Encoding Explorer")
+# 4. Sentence input & BPE tokenize
+sentence = st.text_input("Enter a sentence to tokenize:")
+if st.button("BPE Tokenize"):
+    ids = tokenizer.encode(sentence, add_special_tokens=False)
+    toks = tokenizer.convert_ids_to_tokens(ids)
+    st.session_state.tokens = toks
+    st.session_state.token_ids = ids
+# 5. Display tokens + IDs with embedding buttons
+if st.session_state.tokens:
+    st.subheader("Tokens and IDs")
+    cols = st.columns([4, 1])
+    for i, (tok, tid) in enumerate(zip(st.session_state.tokens, st.session_state.token_ids)):
+        cols[0].write(f"{i+1}. **{tok}** → ID {tid}")
+        if cols[1].button(f"Create Embedding for {tid}", key=f"embed_{tid}"):
+            vec = model.get_input_embeddings().weight[tid].detach().cpu().numpy()
+            st.session_state.embeddings[tid] = vec.copy()
+            st.session_state.current_id = tid
+# 6. Show & edit embedding sliders for selected token
+if st.session_state.current_id is not None:
+    tok_id = st.session_state.current_id
+    emb_vec = st.session_state.embeddings[tok_id]
+    st.subheader(f"Embedding for token ID {tok_id}")
+    for dim in range(len(emb_vec)):
+        emb_vec[dim] = st.slider(
+            f"Emb Dim {dim}", -5.0, 5.0, float(emb_vec[dim]), step=0.01,
+            key=f"slider_{tok_id}_{dim}"
+        )
+    st.session_state.embeddings[tok_id] = emb_vec
+    # 7. Similarity search on current embedding
+    # if st.button("Similarity Search", key="sim_search"):
+    #     matrix = get_embedding_matrix()
+    #     query = emb_vec
+    #     dot = matrix.dot(query)
+    #     mat_norm = np.linalg.norm(matrix, axis=1)
+    #     q_norm = np.linalg.norm(query)
+    #     sims = dot / (mat_norm * q_norm + 1e-12)
+    #     topk = (-sims).argsort()[1:21]
+    #     st.write("**Top 20 similar tokens:**")
+    #     for idx in topk:
+    #         token_str = tokenizer.convert_ids_to_tokens([idx])[0]
+    #         st.write(f"ID {idx} ({token_str}): {sims[idx]:.4f}")
+    # 8. Positional Encoding inputs
+    st.subheader("Positional Encoding")
+    # Show formula in LaTeX
+    st.markdown(r"""
+**Positional Encoding Formula**
+For position $p$ and dimension $d$ (where $D$ is the embedding size):
+$$
+PE(p,d) = \begin{cases}
+\sin\bigl(\frac{p}{10000^{d / D}}\bigr), & \text{if } d \text{ is even} \\
+\cos\bigl(\frac{p}{10000^{(d-1) / D}}\bigr), & \text{if } d \text{ is odd}
+\end{cases}
+$$
+""")
+    pos = st.number_input("Position (p)", min_value=0, format="%d")
+    dim = st.number_input(
+        "Dimension index (0-based)", min_value=0, max_value=len(emb_vec)-1, format="%d"
     )
+    emb_dim = st.number_input(
+        "Embedding Dimension (vector length)", value=len(emb_vec), format="%d"
     )
+    # 9. Add Pos Encoding
+    if st.button("Compute and Add Pos Encoding to the Embedding"):
+        p, d, D = int(pos), int(dim), int(emb_dim)
+        if 0 <= d < D:
+            if d % 2 == 0:
+                pe = np.sin(p / (10000 ** (d / D)))
+            else:
+                pe = np.cos(p / (10000 ** ((d - 1) / D)))
+            emb_vec[d] += pe
+            st.session_state.embeddings[tok_id] = emb_vec
+        else:
+            st.error("Dimension index out of range.")
+    # 10. Similarity search with positional encoding
+    if st.button("Similarity Search (Using the Embedding)", key="sim_search_pos"):
+        matrix = get_embedding_matrix()
+        query = st.session_state.embeddings[tok_id]
+        dot = matrix.dot(query)
+        mat_norm = np.linalg.norm(matrix, axis=1)
+        q_norm = np.linalg.norm(query)
+        sims = dot / (mat_norm * q_norm + 1e-12)
+        topk = (-sims).argsort()[1:21]
+        st.write("**Top 20 similar tokens after PosEnc:**")
+        for idx in topk:
+            token_str = tokenizer.convert_ids_to_tokens([idx])[0]
+            st.write(f"ID {idx} ({token_str}): {sims[idx]:.4f}")