Update src/streamlit_app.py

src/streamlit_app.py  CHANGED  (+120 -678)

@@ -1,694 +1,136 @@
-
-# ONCE
-# from transformers import GPT2TokenizerFast, GPT2Model
-# import os
-
-# # Load from local offline folder
-# model = GPT2Model.from_pretrained("./models")
-# tokenizer = GPT2TokenizerFast.from_pretrained("./models")
-
-# from transformers import GPT2Model, GPT2TokenizerFast
-
-# model = GPT2Model.from_pretrained("gpt2")
-# tokenizer = GPT2TokenizerFast.from_pretrained("gpt2")
-
-# model.save_pretrained("./models")
-# tokenizer.save_pretrained("./models")
-
-# model = GPT2Model.from_pretrained("openai-community/gpt2")
-# tokenizer = GPT2TokenizerFast.from_pretrained("openai-community/gpt2")
-# model.save_pretrained("models")
-# tokenizer.save_pretrained("models")
-
-
-# from transformers import GPT2Tokenizer, GPT2Model
-
-# model_id = "gpt2"
-# GPT2Model.from_pretrained(model_id).save_pretrained("models")
-# GPT2Tokenizer.from_pretrained(model_id).save_pretrained("models")
-
-# print("✅ Downloaded and saved GPT-2 to models")
-
-
 import streamlit as st
-st.set_page_config(page_title="GPT-2 Attention Explorer", layout="wide")
-
-import torch
 import numpy as np
 from transformers import GPT2TokenizerFast, GPT2Model
-import seaborn as sns
-import matplotlib.pyplot as plt
-import pandas as pd
 
 @st.cache_resource
-def load_model():
-    tokenizer = GPT2TokenizerFast.from_pretrained("gpt2")  # checkpoint name assumed; the original line is truncated in this view
-    model = GPT2Model.from_pretrained("gpt2", output_attentions=True)  # assumed: outputs.attentions is read below
-    model.eval()
-    return tokenizer, model
-
-tokenizer, model = load_model()
-
-st.title("🧠 GPT-2 Token Inspector + Self-Attention Visualizer")
-
-with st.expander("📊 GPT-2 Model Architecture Summary"):
-    st.markdown("""
-- **Vocabulary size (V):** `50257`
-- **Embedding dimension (d):** `768`
-- **Max Position Length (L):** `1024`
-  - This is sometimes also called:
-    - n_positions in the config
-    - max sequence length
-    - context length
-    - max context window
-- **Transformer Layers:** `12`
-- **Attention Heads per Layer:** `12`
-- **Per-head Dimension (dₖ):** `64`
-- **Feedforward Hidden Layer Size:** `3072`
-- **Total Parameters:** ~117 million
-
----
-
-## Question: Does "Transformer Layers: 12" mean each layer has 12 Attention Heads?
-
-## 🧠 Quick Answer:
-
-> ✅ **No**, 12 Transformer Layers ≠ 12 Heads per Layer
-> 🔁 But in **GPT-2 (small)**, both happen to be **12** — **a coincidence of the chosen configuration**, not a definition.
-
----
-
-## 🔍 Breakdown of GPT-2’s Architecture
-
-| Component                     | GPT-2 (small) default |
-| ----------------------------- | --------------------- |
-| Embedding size (`d_model`)    | 768                   |
-| **Transformer layers**        | 12                    |
-| **Attention heads per layer** | 12                    |
-| Hidden feedforward size       | 3072                  |
-| Max position embeddings       | 1024                  |
-
----
-
-### ✅ So in GPT-2:
-
-* Each of the **12 transformer layers** has:
-
-  * **Multi-head attention**
-  * With **12 heads per layer**
-  * Each head has `64` dimensions (`768 ÷ 12 = 64`)
-
----
-
-## 📌 Why this Confusion Happens
-
-The number of **layers** and **heads per layer** are:
-
-* Configured independently in the model
-* But **coincidentally** both set to 12 in GPT-2 small
-
-In other models:
-
-| Model        | Layers | Heads per Layer |
-| ------------ | ------ | --------------- |
-| GPT-2 Medium | 24     | 16              |
-| GPT-2 Large  | 36     | 20              |
-| GPT-3        | 96     | 96              |
-| LLaMA 2 7B   | 32     | 32              |
-
-So again:
-
-> 🔁 **12 layers ≠ 12 heads** in general — it's just a choice in GPT-2 small.
-
----
-
-## 💡 Want a table in your app to explain this too?
-
-I can give you a section like:
-
-> "🧩 Layers vs Heads — What's the Difference?"
-
-Let me know and I’ll drop in that Streamlit code too.
-
-
-
-    """)
-
-
-sentence = st.text_input("Enter a sentence:", "The cat sat on the mat")
-
-if st.button("Analyze & Visualize") and sentence.strip():
-
-    inputs = tokenizer(sentence, return_tensors='pt', return_offsets_mapping=True, return_special_tokens_mask=True)
-    token_ids = inputs['input_ids'][0]
-    tokens = tokenizer.convert_ids_to_tokens(token_ids)
-    position_ids = torch.arange(token_ids.shape[0]).unsqueeze(0)
-
-    inputs.pop("special_tokens_mask", None)
-    inputs.pop("offset_mapping", None)
-
-    with torch.no_grad():
-        outputs = model(**inputs, position_ids=position_ids)
-
-    attentions = outputs.attentions
-    embeddings = outputs.last_hidden_state[0].numpy()
-
-    pos_embedding_layer = model.wpe
-    pos_embeddings = pos_embedding_layer(position_ids).squeeze(0).detach().numpy()
-
-    word_embedding_layer = model.wte
-    word_embeddings = word_embedding_layer(token_ids).detach().numpy()
-
-    final_input = word_embeddings + pos_embeddings
-
-    # 1. BPE Tokens
-    st.subheader("🧾 Byte Pair Encoded Tokens (BPE)")
-    st.markdown("GPT-2 uses **Byte Pair Encoding (BPE)** to split input text into subword units.")
-    st.code(" ".join(tokens))
-
-    # 2. Token IDs
-    st.subheader("🔢 Token IDs")
-    st.markdown("Each token is mapped to an integer ID using the GPT-2 vocabulary.")
-    st.code(token_ids.tolist())
-
-    # 3. Word Embeddings
-    st.subheader("💎 Raw Word Embeddings (first 5 tokens)")
-    st.markdown("Each token ID is used to look up a learnable word embedding vector:")
-    st.latex(r"\text{Embedding}(t_i) = \mathbf{E}[t_i]")
-    st.markdown(r"Where $\mathbf{E} \in \mathbb{R}^{V \times d}$ with $V$ = vocab size and $d = 768$.")
-    df_word_embed = pd.DataFrame(word_embeddings[:5])
-    df_word_embed.index = [f"{i}: {tok}" for i, tok in enumerate(tokens[:5])]
-    st.dataframe(df_word_embed.style.format(precision=4))
-
-    # 4. Positional Encodings
-    st.subheader("🧭 Positional Encodings (first 5 tokens)")
-    st.markdown("GPT-2 adds learned positional vectors from a table indexed by position:")
-    st.latex(r"\text{PosEnc}(i) = \mathbf{P}[i]")
-
-    st.markdown("Example (first 5 positions, first 5 dimensions):")
-    df_pos_example = pd.DataFrame(pos_embeddings[:5, :5],
-                                  columns=[f"dim {i}" for i in range(5)],
-                                  index=[f"{i}: {tok}" for i, tok in enumerate(tokens[:5])])
-    st.dataframe(df_pos_example.style.format(precision=5))
-
-    st.markdown(r"Where $\mathbf{P} \in \mathbb{R}^{L \times d}$ is learned and not sinusoidal in GPT-2.")
-
-    # 5. Final Input Vectors
-    st.subheader("🧮 Final Input = Word Embedding + Positional Encoding")
-    st.markdown("These are the actual vectors passed into the first transformer block:")
-    st.latex(r"\mathbf{X}_i = \text{Embedding}(t_i) + \text{PosEnc}(i)")
-
-    st.markdown("Let's confirm this by showing:")
-    st.code("final_input[i][j] ≈ word_embedding[i][j] + pos_embedding[i][j]")
-
-    for i in range(2):  # first 2 tokens
-        df_sum_example = pd.DataFrame({
-            'Word': word_embeddings[i, :5],
-            'PosEnc': pos_embeddings[i, :5],
-            'Final Input': final_input[i, :5],
-            'Word + Pos': word_embeddings[i, :5] + pos_embeddings[i, :5]
-        })
-        df_sum_example.index = [f"dim {j}" for j in range(5)]
-        st.markdown(f"**Token {i}: `{tokens[i]}`**")
-        st.dataframe(df_sum_example.style.format(precision=5))
-
-    # 6. Output Embeddings
-    st.subheader("📐 Output Embedding Vectors (first 5 tokens)")
-    st.markdown("These are the final hidden states after passing through all transformer layers:")
-    st.latex(r"\text{Output}_i = \text{TransformerLayers}(\mathbf{X}_i)")
-
-    df_embed_example = pd.DataFrame(embeddings[:5, :5],
-                                    columns=[f"dim {j}" for j in range(5)],
-                                    index=[f"{i}: {tok}" for i, tok in enumerate(tokens[:5])])
-    st.dataframe(df_embed_example.style.format(precision=5))
-
-    st.markdown("📌 These are **not** equal to the input vectors—they are fully context-aware representations!")
-
-    # 🔄 Move sliders here just above heatmap
-    layer_num = st.slider("Select Transformer Layer", 0, model.config.n_layer - 1, 0)
-    head_num = st.slider("Select Attention Head", 0, model.config.n_head - 1, 0)
-    attn = attentions[layer_num][0, head_num].numpy()
-
-    # 7. Attention Heatmap
-    st.subheader(f"🎯 Attention Heatmap — Layer {layer_num+1}, Head {head_num+1}")
-    st.markdown("This shows how each token attends to others in the sequence:")
-    st.latex(r"\text{Attention}(Q, K, V) = \text{softmax} \left( \frac{QK^\top}{\sqrt{d_k}} \right) V")
-    fig, ax = plt.subplots(figsize=(8, 6))
-    sns.heatmap(attn, xticklabels=tokens, yticklabels=tokens, cmap="YlOrRd", annot=True, fmt=".2f", ax=ax)
-    ax.set_xlabel("Key Tokens")
-    ax.set_ylabel("Query Tokens")
-    st.pyplot(fig)
-
-    # 8. Attention Head Breakdown (for token 0)
-    st.subheader("🔍 Attention Head Breakdown (1 Token)")
-
-    st.markdown("Let's inspect how **GPT-2 computes attention for a single token** (first token in the sequence).")
-
-    # Fetch weight matrix for Q, K, V from the model's first block
-    # block = model.transformer.h[0]  # Use layer 0
-    block = model.h[0]  # ✅ Correct for GPT2Model
-
-    # W_qkv = block.attn.c_attn.weight.detach().numpy().T  # shape (2304, 768)
-    W_qkv = block.attn.c_attn.weight.detach().numpy()  # shape (768, 2304); Conv1D stores (in_features, out_features)
-
-    b_qkv = block.attn.c_attn.bias.detach().numpy()  # shape (3*768,)
-
-
-    # Final input for token 0
-    x0 = final_input[0]  # shape (768,)
-
-    # Linear projection for Q, K, V
-    qkv = x0 @ W_qkv + b_qkv  # shape (3*768,)
-    Q, K, V = np.split(qkv, 3)
-
-    # Show Q, K, V for head 0
-    Q0 = Q[:64]
-    K0_all = K.reshape(12, 64)  # For all heads
-    V0_all = V.reshape(12, 64)
-
-    K0 = K0_all[0]
-    V0 = V0_all[0]
-
-    # Dot product and softmax
-    score = Q0 @ K0.T  # scalar
-    scaled_score = score / np.sqrt(64)
-    softmax_weight = np.exp(scaled_score) / np.sum(np.exp(scaled_score))
-
-    attn_output = softmax_weight * V0  # simulated for 1 token self-attending to itself
-
-    st.markdown("### Formula Recap")
-
-    st.latex(r"Q = x W^Q,\quad K = x W^K,\quad V = x W^V")
-
-    st.latex(r"\text{Attention}(Q, K, V) = \text{softmax}\left(\frac{QK^\top}{\sqrt{d_k}}\right)V")
-
-
-    # Show Q0, K0, softmax and V0
-    df_breakdown = pd.DataFrame({
-        "Q₀": Q0,
-        "K₀": K0,
-        "Q₀·K₀": Q0 * K0,
-        "V₀": V0,
-        "AttnOut": attn_output
-    })
-    df_breakdown.index = [f"dim {i}" for i in range(64)]
-    st.dataframe(df_breakdown.style.format(precision=5))
-
-
-    st.markdown("### 🧮 Self-Attention Matrix Shape Annotations")
-
-    st.markdown("""
-**Key tensor dimensions involved in the attention computation:**
-
-- `W_qkv`: **(768, 2304)** – learned projection matrix for Q, K, V combined
-- `b_qkv`: **(2304,)** – bias vector
-- `X`: **(5, 768)** – input vectors for 5 tokens
-- `qkv_all = X @ W_qkv + b_qkv`: → **(5, 2304)**
-- `Q_all, K_all, V_all = np.split(qkv_all, 3, axis=1)`: → each **(5, 768)**
-- `Q0, K0, V0 = [:, :64]`: head 0 slice → **(5, 64)**
-- `q0 @ K0.T`: **(1, 64) × (64, 5)** → **(1, 5)**
-- `softmax_weights`: **(1, 5)**
-- `attn_output = softmax_weights @ V0`: **(1, 64)**
-""")
-
-
-
-    # 9. Matrix-Level Self-Attention (Token 0 → All)
-    st.subheader("🔬 Matrix-Level Self-Attention (Token 0 → All)")
-
-    st.markdown("""
-This section shows how **Token 0** attends to all other tokens using matrix-level self-attention.
-We compute the dot products, apply softmax, and produce the output for head 0 in layer 0.
-""")
-
-    # Use same block
-    block = model.h[0]
-    W_qkv = block.attn.c_attn.weight.detach().numpy()  # (768, 2304)
-    b_qkv = block.attn.c_attn.bias.detach().numpy()    # (2304,)
-
-    X = final_input[:5]  # (5, 768)
-
-    # Compute Q, K, V for all 5 tokens
-    # qkv_all = X @ W_qkv.T + b_qkv  # shape (5, 2304)
-    qkv_all = X @ W_qkv + b_qkv  # ✅ (5 × 768) @ (768 × 2304) → (5, 2304)
-
-    Q_all, K_all, V_all = np.split(qkv_all, 3, axis=1)
-
-    # Head 0 slices
-    Q0 = Q_all[:, :64]  # (5, 64)
-    K0 = K_all[:, :64]  # (5, 64)
-    V0 = V_all[:, :64]  # (5, 64)
 
-    # Token 0's query against all keys (reconstructed; these lines are truncated in this view)
-    q0 = Q0[0:1]              # (1, 64)
-    attn_scores = q0 @ K0.T   # (1, 5)
-    scaled_scores = attn_scores / np.sqrt(64)
-    softmax_weights = np.exp(scaled_scores)
-    softmax_weights /= softmax_weights.sum(axis=-1, keepdims=True)  # shape (1, 5)
 
-    # Weighted sum of V0 rows
-    attn_output_0 = softmax_weights @ V0  # (1, 64)
 
-
-    st.markdown("### Raw Scaled Attention Scores (Q₀Kᵀ / √dₖ):")
-    df_scores = pd.DataFrame(scaled_scores[0], columns=["Score"], index=[f"Token {i}" for i in range(5)])
-    st.dataframe(df_scores.style.format(precision=5))
-
-    st.markdown("### Softmax Attention Weights αᵢ:")
-    df_weights = pd.DataFrame(softmax_weights[0], columns=["Weight αᵢ"], index=[f"Token {i}" for i in range(5)])
-    st.dataframe(df_weights.style.format(precision=5))
-
-    st.markdown("### Value Vᵢ vectors (Head 0, first 5 dims):")
-    df_values = pd.DataFrame(V0[:, :5], columns=[f"dim {i}" for i in range(5)],
-                             index=[f"Token {i}" for i in range(5)])
-    st.dataframe(df_values.style.format(precision=5))
-
-    st.markdown("### Final Attention Output (weighted sum of Vᵢ):")
-    df_attn_out = pd.DataFrame(attn_output_0[:, :5], columns=[f"dim {i}" for i in range(5)],
-                               index=["AttnOut₀"])
-    st.dataframe(df_attn_out.style.format(precision=5))
-
-
-    # 10. Per-Head Projection Matrices
-    st.subheader("🧬 Per-Head Projection Matrices (Wq, Wk, Wv)")
-
-    st.markdown("""
-In GPT-2, each attention **head has its own set of projection weights** to compute Queries (Q), Keys (K), and Values (V) from the input vector.
-
-The full `W_qkv` layer maps from **(768,) → (2304,)** and is split into 3 parts:
-- `Wq` = first 768 columns → shape `(768, 768)`
-- `Wk` = next 768 columns → shape `(768, 768)`
-- `Wv` = last 768 columns → shape `(768, 768)`
-
-Each head receives a unique slice from each projection:
-- 12 heads × 64 dimensions = 768
-- So head 0 → `Wq[:, :64]`, head 1 → `Wq[:, 64:128]`, etc.
-""")
-
-    block = model.h[0]
-    W_qkv_full = block.attn.c_attn.weight.detach().numpy()  # shape (768, 2304); no transpose needed for Conv1D
-    W_q, W_k, W_v = np.split(W_qkv_full, 3, axis=1)  # each: (768, 768)
-
-    # Show Wq head 0 and 1
-    Wq_head0 = W_q[:, :64]
-    Wq_head1 = W_q[:, 64:128]
-
-    df_q = pd.DataFrame({
-        "Wq_head0": Wq_head0[:5, 0],
-        "Wq_head1": Wq_head1[:5, 0]
-    }, index=[f"dim {i}" for i in range(5)])
-    st.markdown("### Wq projection weights for head 0 vs head 1 (first 5 input dims → output dim 0):")
-    st.dataframe(df_q.style.format(precision=5))
-
-    # Show Wk and Wv for head 0
-    Wk_head0 = W_k[:, :64]
-    Wv_head0 = W_v[:, :64]
-
-    df_kv = pd.DataFrame({
-        "Wk_head0": Wk_head0[:5, 0],
-        "Wv_head0": Wv_head0[:5, 0]
-    }, index=[f"dim {i}" for i in range(5)])
-    st.markdown("### Wk and Wv projection weights for head 0 (first 5 input dims → output dim 0):")
-    st.dataframe(df_kv.style.format(precision=5))
-
-    st.markdown("""
-✅ This confirms that each head has **distinct projections** for Q, K, and V.
-The same input `x` is transformed differently per head, allowing GPT-2 to learn different attention perspectives.
-""")
-
-
-    # 11 · 📐 How W_qkv Projects an Input Vector into Q, K, V
-    st.subheader("📐 How W_qkv Projects an Input Vector → Q, K, V")
-
-    st.markdown("""
-In GPT-2, the combined projection layer `c_attn` maps a single input embedding
-into a concatenated vector that contains **Q, K, and V**.
-
-Each of these is 768-dimensional, so the full output is 768 × 3 = 2304.
-""")
-
-    st.latex(r"x \in \mathbb{R}^{768} \quad \rightarrow \quad [Q \;|\; K \;|\; V] \in \mathbb{R}^{2304}")
-
-    st.markdown("---")
-
-    st.markdown("### 🧪 Mini GPT Example (3D → 6D Projection)")
-
-    st.markdown("Imagine a tiny model:")
-
-    st.markdown("""
-- Input vector `x ∈ ℝ³`
-- Q, K, V are each 2D → total output = 6D
-- Thus:
-""")
 
-    # Tiny worked example (values reconstructed from the worked example in the notes
-    # further below; the original lines are garbled in this diff view)
-    mini_x = np.array([1.0, 2.0, 3.0])                        # input vector, shape (3,)
-    mini_W = np.array([
-        [0.1, 0.2, 0.3],   # Q, output dim 1
-        [0.4, 0.5, 0.6],   # Q, output dim 2
-        [0.7, 0.8, 0.9],   # K, output dim 1
-        [1.0, 1.1, 1.2],   # K, output dim 2
-        [1.3, 1.4, 1.5],   # V, output dim 1
-        [1.6, 1.7, 1.8],   # V, output dim 2
-    ])                                                         # shape (6, 3)
-    mini_b = np.array([0.01, 0.02, 0.03, 0.04, 0.05, 0.06])   # shape (6,)
-
-    mini_out = mini_W @ mini_x + mini_b   # (6,)
-    Qm, Km, Vm = np.split(mini_out, 3)    # each (2,)
-
-    st.code("Input vector x = [1.0, 2.0, 3.0]  # shape (3,)")
-    st.code("W_qkv shape = (6, 3)  # maps 3 → 6")
-
-    st.code(f"Output = W_qkv @ x + b = {mini_out.round(2).tolist()}")
-
-    df_mini = pd.DataFrame(
-        {
-            "Q": Qm.round(2),
-            "K": Km.round(2),
-            "V": Vm.round(2)
-        },
-        index=["dim 1", "dim 2"]
-    )
 
-
|
| 478 |
-
st.
|
| 479 |
-
|
| 480 |
-
|
| 481 |
-
|
| 482 |
-
|
| 483 |
-
|
| 484 |
-
|
| 485 |
-
|
| 486 |
-
|
| 487 |
-
|
| 488 |
-
"
|
| 489 |
-
|
| 490 |
-
|
| 491 |
-
|
| 492 |
-
|
| 493 |
-
|
| 494 |
-
|
| 495 |
-
|
| 496 |
-
|
| 497 |
-
|
| 498 |
-
|
| 499 |
-
|
| 500 |
-
|
| 501 |
-
|
| 502 |
-
|
| 503 |
-
|
| 504 |
-
st.markdown("""
|
| 505 |
-
Each attention **head** gets its own slice:
|
| 506 |
-
- Q_head₀ = Q[:, :64]
|
| 507 |
-
- K_head₀ = K[:, :64]
|
| 508 |
-
- V_head₀ = V[:, :64]
|
| 509 |
-
|
| 510 |
-
That’s how one input vector creates multi-headed Q, K, and V for scaled dot-product attention.
|
| 511 |
-
""")
|
| 512 |
-
|
| 513 |
-
|
-    st.subheader("Additional notes:")
-    st.markdown(
-        """
----
-
-## 🧠 What Does `Ġ` Mean?
-
-The character `Ġ` (U+0120: Latin Capital Letter G with dot above) is used to:
-
-> **Represent a leading space** before the token.
-
----
-
-### ✅ Example:
-
-Let’s look at a sentence:
-
-```
-"The cat sat on the mat"
-```
-
-When tokenized with the GPT-2 tokenizer (`GPT2TokenizerFast`), it becomes:
-
-```
-['The', 'Ġcat', 'Ġsat', 'Ġon', 'Ġthe', 'Ġmat']
-```
-
-* `'The'` → First word, no leading space.
-* `'Ġcat'` → Space + "cat"
-* `'Ġsat'` → Space + "sat"
-* etc.
-
-So `Ġ` means:
-
-> "This token starts after a space."
-
----
-
-### ⚠️ Why Not Just Use `" "`?
-
-Because GPT-2 uses a **vocabulary of subword units** (BPE). These tokens are strings, not raw characters or bytes. Including the space as a separate token would have complicated the merge process. So:
-
-* `Ġ` = internal marker used in the vocabulary file
-* It's not a literal space character; it tells the tokenizer "insert a space before this token when decoding."
-
----
-
-### ✅ When Detokenizing
-
-The tokenizer **removes the `Ġ` and adds a space** during decoding:
-
-```python
-from transformers import GPT2TokenizerFast
-
-tokenizer = GPT2TokenizerFast.from_pretrained("gpt2")
-
-tokens = tokenizer.tokenize("The cat sat on the mat")
-print(tokens)
-# ['The', 'Ġcat', 'Ġsat', 'Ġon', 'Ġthe', 'Ġmat']
-
-ids = tokenizer.convert_tokens_to_ids(tokens)
-decoded = tokenizer.decode(ids)
-print(decoded)
-# 'The cat sat on the mat'
-```
-
----
-
-## ✅ Summary
-
-| Token    | Interpreted As            |
-| -------- | ------------------------- |
-| `'The'`  | `'The'` (no space before) |
-| `'Ġcat'` | `' cat'`                  |
-| `'Ġsat'` | `' sat'`                  |
-| `'Ġon'`  | `' on'`                   |
-| `'Ġthe'` | `' the'`                  |
-| `'Ġmat'` | `' mat'`                  |
-
-
----
-
-## ✅ What is `@` in Python?
-
-In Python 3.5+, the `@` operator means:
-
-> **Matrix multiplication** (also called **dot product** or **tensor contraction** depending on context)
-
----
-
-### ✅ Equivalent to:
-
-```python
-A @ B ⟺ np.matmul(A, B)
-```
-
-Or if both are 1D/2D NumPy arrays:
-
-```python
-A @ B ⟺ np.dot(A, B)
-```
-
----
-
-## 🔍 In your case:
-
-```python
-Output = W_qkv @ x + b
-```
-
-### Let’s say:
-
-* `x` = shape **(3,)**
-* `W_qkv` = shape **(6, 3)**
-* `b` = shape **(6,)**
-
----
-
-### Then:
-
-* `W_qkv @ x` → matrix–vector multiplication
-  → shape: **(6,)**
-
-* Adding `b` → element-wise vector addition
-  → final shape: **(6,)**
-
----
-
-### So this line:
-
-```python
-Output = W_qkv @ x + b
-```
-
-Means:
-
-1. Multiply the **input vector `x`** with the **projection matrix `W_qkv`**
-2. Add a **bias vector `b`**
-3. Result = combined **\[Q | K | V]** output
-
----
-
-## ✅ Example:
-
-```python
-x = np.array([1, 2, 3])
-W_qkv = np.array([
-    [0.1, 0.2, 0.3],  # Q1
-    [0.4, 0.5, 0.6],  # Q2
-    [0.7, 0.8, 0.9],  # K1
-    [1.0, 1.1, 1.2],  # K2
-    [1.3, 1.4, 1.5],  # V1
-    [1.6, 1.7, 1.8],  # V2
-])
-b = np.array([0.01, 0.02, 0.03, 0.04, 0.05, 0.06])
-
-output = W_qkv @ x + b
-```
-
-Manually:
-
-* `W_qkv @ x` = `[1.4, 3.2, 5.0, 6.8, 8.6, 10.4]`
-* After adding `b` → `[1.41, 3.22, 5.03, 6.84, 8.65, 10.46]`
-
----
-
-## ✅ Summary
-
-| Expression    | Meaning                       |
-| ------------- | ----------------------------- |
-| `@`           | Matrix multiplication (`dot`) |
-| `W @ x + b`   | Linear transformation         |
-| Shape `W @ x` | `(m, n) @ (n,) = (m,)`        |
-
-Would you like to include this in your Streamlit visualizer as an expandable note or equation section?
-
-
 
 
-        """)
 
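For reference, the per-head attention arithmetic that the removed walkthrough above steps through (sections 8–9) can be reproduced standalone. This is a minimal sketch under stated assumptions: the shapes follow the shape-annotation list above, but the arrays are random placeholders, not GPT-2's actual `c_attn` weights.

```python
# Minimal NumPy sketch of the head-0 attention arithmetic described above.
# Random arrays stand in for GPT-2 weights; only the shapes follow the walkthrough.
import numpy as np

rng = np.random.default_rng(0)
X = rng.standard_normal((5, 768))          # 5 token input vectors
W_qkv = rng.standard_normal((768, 2304))   # combined Q|K|V projection
b_qkv = rng.standard_normal(2304)

qkv_all = X @ W_qkv + b_qkv                          # (5, 2304)
Q_all, K_all, V_all = np.split(qkv_all, 3, axis=1)   # each (5, 768)

# Head 0 takes the first 64 columns of each projection
Q0, K0, V0 = Q_all[:, :64], K_all[:, :64], V_all[:, :64]   # (5, 64)

q0 = Q0[0:1]                               # token 0's query, (1, 64)
scores = q0 @ K0.T / np.sqrt(64)           # (1, 5)
weights = np.exp(scores)
weights /= weights.sum(axis=-1, keepdims=True)   # softmax over the 5 keys
attn_out = weights @ V0                    # (1, 64)

print(weights.round(3), attn_out.shape)
```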
 import streamlit as st
 import numpy as np
 from transformers import GPT2TokenizerFast, GPT2Model
 
+# 1. Load tokenizer and model
 @st.cache_resource
+def load_resources():
+    # tokenizer = GPT2TokenizerFast.from_pretrained("gpt2")
+    tokenizer = GPT2TokenizerFast.from_pretrained("./assets/tokenizer", local_files_only=True)
+    # model = GPT2Model.from_pretrained("gpt2")
+    model = GPT2Model.from_pretrained("./assets/model", local_files_only=True)
 
+    # from transformers import GPT2TokenizerFast
+    # # Load tokenizer from bundled local files only
+    #
 
 
+    return tokenizer, model
 
+# Initialize resources
+tokenizer, model = load_resources()
 
+# 2. Helper to get the full embedding matrix
+@st.cache_resource
+def get_embedding_matrix():
+    return model.get_input_embeddings().weight.detach().cpu().numpy()
+
+# 3. Initialize session state
+for key in ["tokens", "token_ids", "embeddings", "current_id"]:
+    if key not in st.session_state:
+        if key in ["tokens", "token_ids"]:
+            st.session_state[key] = []
+        else:
+            st.session_state[key] = {} if key == "embeddings" else None
+
+st.title("🔍 Embedding & Positional Encoding Explorer")
+
+# 4. Sentence input & BPE tokenize
+sentence = st.text_input("Enter a sentence to tokenize:")
+if st.button("BPE Tokenize"):
+    ids = tokenizer.encode(sentence, add_special_tokens=False)
+    toks = tokenizer.convert_ids_to_tokens(ids)
+    st.session_state.tokens = toks
+    st.session_state.token_ids = ids
+
+# 5. Display tokens + IDs with embedding buttons
+if st.session_state.tokens:
+    st.subheader("Tokens and IDs")
+    cols = st.columns([4, 1])
+    for i, (tok, tid) in enumerate(zip(st.session_state.tokens, st.session_state.token_ids)):
+        cols[0].write(f"{i+1}. **{tok}** → ID {tid}")
+        if cols[1].button(f"Create Embedding for {tid}", key=f"embed_{tid}"):
+            vec = model.get_input_embeddings().weight[tid].detach().cpu().numpy()
+            st.session_state.embeddings[tid] = vec.copy()
+            st.session_state.current_id = tid
+
+# 6. Show & edit embedding sliders for selected token
+if st.session_state.current_id is not None:
+    tok_id = st.session_state.current_id
+    emb_vec = st.session_state.embeddings[tok_id]
+    st.subheader(f"Embedding for token ID {tok_id}")
+    for dim in range(len(emb_vec)):
+        emb_vec[dim] = st.slider(
+            f"Emb Dim {dim}", -5.0, 5.0, float(emb_vec[dim]), step=0.01,
+            key=f"slider_{tok_id}_{dim}"
+        )
+    st.session_state.embeddings[tok_id] = emb_vec
+
+    # 7. Similarity search on current embedding
+    # if st.button("Similarity Search", key="sim_search"):
+    #     matrix = get_embedding_matrix()
+    #     query = emb_vec
+    #     dot = matrix.dot(query)
+    #     mat_norm = np.linalg.norm(matrix, axis=1)
+    #     q_norm = np.linalg.norm(query)
+    #     sims = dot / (mat_norm * q_norm + 1e-12)
+    #     topk = (-sims).argsort()[1:21]
+    #     st.write("**Top 20 similar tokens:**")
+    #     for idx in topk:
+    #         token_str = tokenizer.convert_ids_to_tokens([idx])[0]
+    #         st.write(f"ID {idx} ({token_str}): {sims[idx]:.4f}")
+
+    # 8. Positional Encoding inputs
+    st.subheader("Positional Encoding")
+
+    # Show formula in LaTeX
+    st.markdown(r"""
+**Positional Encoding Formula**
+
+For position $p$ and dimension $d$ (where $D$ is the embedding size):
+
+$$
+PE(p,d) = \begin{cases}
+\sin\bigl(\frac{p}{10000^{d / D}}\bigr), & \text{if } d \text{ is even} \\
+\cos\bigl(\frac{p}{10000^{(d-1) / D}}\bigr), & \text{if } d \text{ is odd}
+\end{cases}
+$$
+""")
+
+    pos = st.number_input("Position (p)", min_value=0, format="%d")
+    dim = st.number_input(
+        "Dimension index (0-based)", min_value=0, max_value=len(emb_vec)-1, format="%d"
     )
+    emb_dim = st.number_input(
+        "Embedding Dimension (vector length)", value=len(emb_vec), format="%d"
     )
 
+    # 9. Add Pos Encoding
+    if st.button("Compute and Add Pos Encoding to the Embedding"):
+        p, d, D = int(pos), int(dim), int(emb_dim)
+        if 0 <= d < D:
+            if d % 2 == 0:
+                pe = np.sin(p / (10000 ** (d / D)))
+            else:
+                pe = np.cos(p / (10000 ** ((d - 1) / D)))
+            emb_vec[d] += pe
+            st.session_state.embeddings[tok_id] = emb_vec
+        else:
+            st.error("Dimension index out of range.")
+
+    # 10. Similarity search with positional encoding
+    if st.button("Similarity Search (Using the Embedding)", key="sim_search_pos"):
+        matrix = get_embedding_matrix()
+        query = st.session_state.embeddings[tok_id]
+        dot = matrix.dot(query)
+        mat_norm = np.linalg.norm(matrix, axis=1)
+        q_norm = np.linalg.norm(query)
+        sims = dot / (mat_norm * q_norm + 1e-12)
+        topk = (-sims).argsort()[1:21]
+        st.write("**Top 20 similar tokens after PosEnc:**")
+        for idx in topk:
+            token_str = tokenizer.convert_ids_to_tokens([idx])[0]
+            st.write(f"ID {idx} ({token_str}): {sims[idx]:.4f}")
 
 
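To close, a minimal non-Streamlit sketch of what the added app computes: tokenize, look up one token's embedding, add a single sinusoidal positional-encoding value, then run a cosine-similarity search over the whole vocabulary. It assumes the public "gpt2" checkpoint can be downloaded from the Hugging Face Hub; the Space itself loads bundled copies from ./assets instead, and the position/dimension values here (p=3, d=10) are arbitrary examples.

```python
# Standalone sketch of the new app's computation (assumes network access to the gpt2 checkpoint).
import numpy as np
from transformers import GPT2TokenizerFast, GPT2Model

tokenizer = GPT2TokenizerFast.from_pretrained("gpt2")
model = GPT2Model.from_pretrained("gpt2")

matrix = model.get_input_embeddings().weight.detach().cpu().numpy()   # (50257, 768)
tid = tokenizer.encode("cat", add_special_tokens=False)[0]
vec = matrix[tid].copy()

# Sinusoidal positional-encoding value for position p, dimension d (same formula as the app)
p, d, D = 3, 10, vec.shape[0]
pe = np.sin(p / 10000 ** (d / D)) if d % 2 == 0 else np.cos(p / 10000 ** ((d - 1) / D))
vec[d] += pe

# Cosine similarity against every embedding row, then the top 20 neighbours
sims = matrix @ vec / (np.linalg.norm(matrix, axis=1) * np.linalg.norm(vec) + 1e-12)
for idx in (-sims).argsort()[1:21]:
    print(tokenizer.convert_ids_to_tokens([int(idx)])[0], round(float(sims[idx]), 4))
```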