schoginitoys committed on
Commit
316297e
·
verified ·
1 Parent(s): 0ed5ac0

Update src/streamlit_app.py

Browse files
Files changed (1) hide show
  1. src/streamlit_app.py +120 -678
src/streamlit_app.py CHANGED
@@ -1,694 +1,136 @@
1
-
2
- # ONCE
3
- # from transformers import GPT2TokenizerFast, GPT2Model
4
- # import os
5
-
6
- # # Load from local offline folder
7
- # model = GPT2Model.from_pretrained("./models")
8
- # tokenizer = GPT2TokenizerFast.from_pretrained("./models")
9
-
10
- # from transformers import GPT2Model, GPT2TokenizerFast
11
-
12
- # model = GPT2Model.from_pretrained("gpt2")
13
- # tokenizer = GPT2TokenizerFast.from_pretrained("gpt2")
14
-
15
- # model.save_pretrained("./models")
16
- # tokenizer.save_pretrained("./models")
17
-
18
- # model = GPT2Model.from_pretrained("openai-community/gpt2")
19
- # tokenizer = GPT2TokenizerFast.from_pretrained("openai-community/gpt2")
20
- # model.save_pretrained("models")
21
- # tokenizer.save_pretrained("models")
22
-
23
-
24
- # from transformers import GPT2Tokenizer, GPT2Model
25
-
26
- # model_id = "gpt2"
27
- # GPT2Model.from_pretrained(model_id).save_pretrained("models")
28
- # GPT2Tokenizer.from_pretrained(model_id).save_pretrained("models")
29
-
30
- # print("✅ Downloaded and saved GPT-2 to models")
31
-
32
-
33
  import streamlit as st
34
- st.set_page_config(page_title="GPT-2 Attention Explorer", layout="wide")
35
-
36
- import torch
37
  import numpy as np
38
  from transformers import GPT2TokenizerFast, GPT2Model
39
- import seaborn as sns
40
- import matplotlib.pyplot as plt
41
- import pandas as pd
42
 
 
43
  @st.cache_resource
44
- def load_model():
45
- tokenizer = GPT2TokenizerFast.from_pretrained("./models")
46
- model = GPT2Model.from_pretrained("./models", output_attentions=True, attn_implementation="eager")
47
- model.eval()
48
- return tokenizer, model
49
-
50
- tokenizer, model = load_model()
51
-
52
- st.title("🧠 GPT-2 Token Inspector + Self-Attention Visualizer")
53
-
54
- with st.expander("📊 GPT-2 Model Architecture Summary"):
55
- st.markdown("""
56
- - **Vocabulary size (V):** `50257`
57
- - **Embedding dimension (d):** `768`
58
- - **Max Position Length (L):** `1024`
59
- - This is sometimes also called:
60
- - n_positions in config
61
- - max sequence length
62
- - context length
63
- - max context window
64
- - **Transformer Layers:** `12`
65
- - **Attention Heads per Layer:** `12`
66
- - **Per-head Dimension (dₖ):** `64`
67
- - **Feedforward Hidden Layer Size:** `3072`
68
- - **Total Parameters:** ~117 million
69
-
70
- ---
71
-
72
- ## Question: Transformer Layers: 12 means each layer has 12 Attention Heads?
73
-
74
- ## 🧠 Quick Answer:
75
-
76
- > ✅ **No**, 12 Transformer Layers ≠ 12 Heads per Layer
77
- > 🔁 But in **GPT-2 (small)**, both happen to be **12** — **by design coincidence**, not definition.
78
-
79
- ---
80
-
81
- ## 🔍 Breakdown of GPT-2’s Architecture
82
-
83
- | Component | GPT-2 (small) default |
84
- | ----------------------------- | --------------------- |
85
- | Embedding size (`d_model`) | 768 |
86
- | **Transformer layers** | 12 |
87
- | **Attention heads per layer** | 12 |
88
- | Hidden feedforward size | 3072 |
89
- | Max position embeddings | 1024 |
90
-
91
- ---
92
-
93
- ### ✅ So in GPT-2:
94
-
95
- * Each of the **12 transformer layers** has:
96
-
97
- * **Multi-head attention**
98
- * With **12 heads per layer**
99
- * Each head has `64` dimensions (`768 ÷ 12 = 64`)
100
-
101
- ---
102
-
103
- ## 📌 Why this Confusion Happens
104
-
105
- The number of **layers** and **heads per layer** are:
106
-
107
- * Configured independently in the model
108
- * But **coincidentally** both set to 12 in GPT-2 small
109
-
110
- In other models:
111
-
112
- | Model | Layers | Heads per Layer |
113
- | ------------ | ------ | --------------- |
114
- | GPT-2 Medium | 24 | 16 |
115
- | GPT-2 Large | 36 | 20 |
116
- | GPT-3 | 96 | 96 |
117
- | LLaMA 2 7B | 32 | 32 |
118
-
119
- So again:
120
-
121
- > 🔁 **12 layers ≠ 12 heads** in general — it's just a choice in GPT-2 small.
122
-
123
- ---
124
-
125
- ## 💡 Want a table in your app to explain this too?
126
-
127
- I can give you a section like:
128
-
129
- > "🧩 Layers vs Heads — What's the Difference?"
130
-
131
- Let me know and I’ll drop in that Streamlit code too.
132
-
133
-
134
-
135
- """)
136
-
137
-
138
- sentence = st.text_input("Enter a sentence:", "The cat sat on the mat")
139
-
140
- if st.button("Analyze & Visualize") and sentence.strip():
141
-
142
- inputs = tokenizer(sentence, return_tensors='pt', return_offsets_mapping=True, return_special_tokens_mask=True)
143
- token_ids = inputs['input_ids'][0]
144
- tokens = tokenizer.convert_ids_to_tokens(token_ids)
145
- position_ids = torch.arange(token_ids.shape[0]).unsqueeze(0)
146
-
147
- inputs.pop("special_tokens_mask", None)
148
- inputs.pop("offset_mapping", None)
149
-
150
- with torch.no_grad():
151
- outputs = model(**inputs, position_ids=position_ids)
152
-
153
- attentions = outputs.attentions
154
- embeddings = outputs.last_hidden_state[0].numpy()
155
-
156
- pos_embedding_layer = model.wpe
157
- pos_embeddings = pos_embedding_layer(position_ids).squeeze(0).detach().numpy()
158
-
159
- word_embedding_layer = model.wte
160
- word_embeddings = word_embedding_layer(token_ids).detach().numpy()
161
-
162
- final_input = word_embeddings + pos_embeddings
163
-
164
- # 1. BPE Tokens
165
- st.subheader("🧾 Byte Pair Encoded Tokens (BPE)")
166
- st.markdown("GPT-2 uses **Byte Pair Encoding (BPE)** to split input text into subword units.")
167
- st.code(" ".join(tokens))
168
-
169
- # 2. Token IDs
170
- st.subheader("🔢 Token IDs")
171
- st.markdown("Each token is mapped to an integer ID using the GPT-2 vocabulary.")
172
- st.code(token_ids.tolist())
173
-
174
- # 3. Word Embeddings
175
- st.subheader("💎 Raw Word Embeddings (first 5 tokens)")
176
- st.markdown("Each token ID is used to lookup a learnable word embedding vector:")
177
- st.latex(r"\text{Embedding}(t_i) = \mathbf{E}[t_i]")
178
- st.markdown(r"Where $\mathbf{E} \in \mathbb{R}^{V \times d}$ with $V$ = vocab size and $d = 768$.")
179
- df_word_embed = pd.DataFrame(word_embeddings[:5])
180
- df_word_embed.index = [f"{i}: {tok}" for i, tok in enumerate(tokens[:5])]
181
- st.dataframe(df_word_embed.style.format(precision=4))
182
-
183
- # 4. Positional Encodings
184
- st.subheader("🧭 Positional Encodings (first 5 tokens)")
185
- st.markdown("GPT-2 adds learned positional vectors from a table indexed by position:")
186
- st.latex(r"\text{PosEnc}(i) = \mathbf{P}[i]")
187
-
188
- st.markdown("Example (first 5 positions, first 5 dimensions):")
189
- df_pos_example = pd.DataFrame(pos_embeddings[:5, :5],
190
- columns=[f"dim {i}" for i in range(5)],
191
- index=[f"{i}: {tok}" for i, tok in enumerate(tokens[:5])])
192
- st.dataframe(df_pos_example.style.format(precision=5))
193
-
194
- st.markdown(r"Where $\mathbf{P} \in \mathbb{R}^{L \times d}$ is learned and not sinusoidal in GPT-2.")
195
-
196
- # 5. Final Input Vectors
197
- st.subheader("🧮 Final Input = Word Embedding + Positional Encoding")
198
- st.markdown("These are the actual vectors passed into the first transformer block:")
199
- st.latex(r"\mathbf{X}_i = \text{Embedding}(t_i) + \text{PosEnc}(i)")
200
-
201
- st.markdown("Let's confirm this by showing:")
202
- st.code("final_input[i][j] ≈ word_embedding[i][j] + pos_embedding[i][j]")
203
-
204
- for i in range(2): # for first 2 tokens
205
- df_sum_example = pd.DataFrame({
206
- 'Word': word_embeddings[i, :5],
207
- 'PosEnc': pos_embeddings[i, :5],
208
- 'Final Input': final_input[i, :5],
209
- 'Word + Pos': word_embeddings[i, :5] + pos_embeddings[i, :5]
210
- })
211
- df_sum_example.index = [f"dim {j}" for j in range(5)]
212
- st.markdown(f"**Token {i}: `{tokens[i]}`**")
213
- st.dataframe(df_sum_example.style.format(precision=5))
214
-
215
- # 6. Output Embeddings
216
- st.subheader("📐 Output Embedding Vectors (first 5 tokens)")
217
- st.markdown("These are the final hidden states after passing through all transformer layers:")
218
- st.latex(r"\text{Output}_i = \text{TransformerLayers}(\mathbf{X}_i)")
219
-
220
- df_embed_example = pd.DataFrame(embeddings[:5, :5],
221
- columns=[f"dim {j}" for j in range(5)],
222
- index=[f"{i}: {tok}" for i, tok in enumerate(tokens[:5])])
223
- st.dataframe(df_embed_example.style.format(precision=5))
224
-
225
- st.markdown("📌 These are **not** equal to the input vectors—they are fully context-aware representations!")
226
-
227
- # 🔄 Move sliders here just above heatmap
228
- layer_num = st.slider("Select Transformer Layer", 0, model.config.n_layer - 1, 0)
229
- head_num = st.slider("Select Attention Head", 0, model.config.n_head - 1, 0)
230
- attn = attentions[layer_num][0, head_num].numpy()
231
-
232
- # 7. Attention Heatmap
233
- st.subheader(f"🎯 Attention Heatmap — Layer {layer_num+1}, Head {head_num+1}")
234
- st.markdown("This shows how each token attends to others in the sequence:")
235
- st.latex(r"\text{Attention}(Q, K, V) = \text{softmax} \left( \frac{QK^\top}{\sqrt{d_k}} \right) V")
236
- fig, ax = plt.subplots(figsize=(8, 6))
237
- sns.heatmap(attn, xticklabels=tokens, yticklabels=tokens, cmap="YlOrRd", annot=True, fmt=".2f", ax=ax)
238
- ax.set_xlabel("Key Tokens")
239
- ax.set_ylabel("Query Tokens")
240
- st.pyplot(fig)
241
-
242
- # 8. Attention Head Breakdown (for token 0)
243
- st.subheader("🔍 Attention Head Breakdown (1 Token)")
244
-
245
- st.markdown("Let's inspect how **GPT-2 computes attention for a single token** (first token in the sequence).")
246
-
247
- # Fetch weight matrix for Q, K, V from the model's first block
248
- # block = model.transformer.h[0] # Use layer 0
249
- block = model.h[0] # ✅ Correct for GPT2Model
250
-
251
- # W_qkv = block.attn.c_attn.weight.detach().numpy().T # shape (768, 3*768)
252
- W_qkv = block.attn.c_attn.weight.detach().numpy() # ✅ shape (2304, 768)
253
-
254
- b_qkv = block.attn.c_attn.bias.detach().numpy() # shape (3*768,)
255
-
256
-
257
- # Final input for token 0
258
- x0 = final_input[0] # shape (768,)
259
-
260
- # Linear projection for Q, K, V
261
- qkv = x0 @ W_qkv + b_qkv # shape (3*768,)
262
- Q, K, V = np.split(qkv, 3)
263
-
264
- # Show Q, K, V for head 0
265
- Q0 = Q[:64]
266
- K0_all = K.reshape(12, 64) # For all heads
267
- V0_all = V.reshape(12, 64)
268
-
269
- K0 = K0_all[0]
270
- V0 = V0_all[0]
271
-
272
- # Dot product and softmax
273
- score = Q0 @ K0.T # scalar
274
- scaled_score = score / np.sqrt(64)
275
- softmax_weight = np.exp(scaled_score) / np.sum(np.exp(scaled_score))
276
-
277
- attn_output = softmax_weight * V0 # simulated for 1 token self-attending to itself
278
-
279
- st.markdown("### Formula Recap")
280
-
281
- st.latex(r"Q = x W^Q,\quad K = x W^K,\quad V = x W^V")
282
-
283
- st.latex(r"\text{Attention}(Q, K, V) = \text{softmax}\left(\frac{QK^\top}{\sqrt{d_k}}\right)V")
284
-
285
-
286
- # Show Q0, K0, softmax and V0
287
- df_breakdown = pd.DataFrame({
288
- "Q₀": Q0,
289
- "K₀": K0,
290
- "Q₀·K₀": Q0 * K0,
291
- "V₀": V0,
292
- "AttnOut": attn_output
293
- })
294
- df_breakdown.index = [f"dim {i}" for i in range(64)]
295
- st.dataframe(df_breakdown.style.format(precision=5))
296
-
297
-
298
- st.markdown("### 🧮 Self-Attention Matrix Shape Annotations")
299
-
300
- st.markdown("""
301
- **Key tensor dimensions involved in attention computation:**
302
-
303
- - `W_qkv`: **(768, 2304)** – learned projection matrix for Q, K, V combined
304
- - `b_qkv`: **(2304,)** – bias vector
305
- - `X`: **(5, 768)** – input vectors for 5 tokens
306
- - `qkv_all = X @ W_qkv + b_qkv`: → **(5, 2304)**
307
- - `Q_all, K_all, V_all = np.split(qkv_all, 3)`: → each **(5, 768)**
308
- - `Q0, K0, V0 = [:, :64]`: head 0 slice → **(5, 64)**
309
- - `q0 @ K0.T`: **(1, 64) × (64, 5)** → **(1, 5)**
310
- - `softmax_weights`: **(1, 5)**
311
- - `attn_output = softmax_weights @ V0`: **(1, 64)**
312
- """)
313
-
314
-
315
-
316
- # 9. Matrix-Level Self-Attention (Token 0 → All)
317
- st.subheader("🔬 Matrix-Level Self-Attention (Token 0 → All)")
318
-
319
- st.markdown("""
320
- This section shows how **Token 0** attends to all other tokens using matrix-level self-attention.
321
- We compute the dot products, apply softmax, and produce the output for head 0 in layer 0.
322
- """)
323
-
324
- # Use same block
325
- block = model.h[0]
326
- W_qkv = block.attn.c_attn.weight.detach().numpy() # (2304, 768)
327
- b_qkv = block.attn.c_attn.bias.detach().numpy() # (2304,)
328
-
329
- X = final_input[:5] # (5, 768)
330
-
331
- # Compute Q, K, V for all 5 tokens
332
- # qkv_all = X @ W_qkv.T + b_qkv # shape (5, 2304)
333
- qkv_all = X @ W_qkv + b_qkv # ✅ (5 × 768) @ (768 × 2304)
334
-
335
- Q_all, K_all, V_all = np.split(qkv_all, 3, axis=1)
336
-
337
- # Head 0 slices
338
- Q0 = Q_all[:, :64] # (5, 64)
339
- K0 = K_all[:, :64] # (5, 64)
340
- V0 = V_all[:, :64] # (5, 64)
341
 
342
- # Compute raw attention scores for token 0
343
- q0 = Q0[0].reshape(1, 64) # (1, 64)
344
- attn_scores = q0 @ K0.T # (1, 5)
345
- scaled_scores = attn_scores / np.sqrt(64)
346
- softmax_weights = np.exp(scaled_scores)
347
- softmax_weights /= softmax_weights.sum(axis=-1, keepdims=True) # shape (1, 5)
348
 
349
- # Weighted sum of V0 rows
350
- attn_output_0 = softmax_weights @ V0 # (1, 64)
351
 
352
- # Display matrices
353
- st.markdown("### Raw Scaled Attention Scores (Q₀Kᵀ / √dₖ):")
354
- df_scores = pd.DataFrame(scaled_scores[0], columns=["Score"], index=[f"Token {i}" for i in range(5)])
355
- st.dataframe(df_scores.style.format(precision=5))
356
-
357
- st.markdown("### Softmax Attention Weights αᵢ:")
358
- df_weights = pd.DataFrame(softmax_weights[0], columns=["Weight αᵢ"], index=[f"Token {i}" for i in range(5)])
359
- st.dataframe(df_weights.style.format(precision=5))
360
-
361
- st.markdown("### Value Vᵢ vectors (Head 0, first 5 dims):")
362
- df_values = pd.DataFrame(V0[:, :5], columns=[f"dim {i}" for i in range(5)],
363
- index=[f"Token {i}" for i in range(5)])
364
- st.dataframe(df_values.style.format(precision=5))
365
-
366
- st.markdown("### Final Attention Output (weighted sum of Vᵢ):")
367
- df_attn_out = pd.DataFrame(attn_output_0[:, :5], columns=[f"dim {i}" for i in range(5)],
368
- index=["AttnOut₀"])
369
- st.dataframe(df_attn_out.style.format(precision=5))
370
-
371
-
372
- # 10. Per-Head Projection Matrices
373
- st.subheader("🧬 Per-Head Projection Matrices (Wq, Wk, Wv)")
374
-
375
- st.markdown("""
376
- In GPT-2, each attention **head has its own set of projection weights** to compute Queries (Q), Keys (K), and Values (V) from the input vector.
377
-
378
- The full `W_qkv` layer maps from **(768,) → (2304,)** and is split into 3 parts:
379
- - `Wq` = first 768 columns → shape `(768, 768)`
380
- - `Wk` = next 768 columns → shape `(768, 768)`
381
- - `Wv` = last 768 columns → shape `(768, 768)`
382
-
383
- Each head receives a unique slice from each projection:
384
- - 12 heads × 64 dimensions = 768
385
- - So head 0 → `Wq[:, :64]`, head 1 → `Wq[:, 64:128]`, etc.
386
- """)
387
-
388
- block = model.h[0]
389
- W_qkv_full = block.attn.c_attn.weight.detach().numpy().T # shape (768, 2304)
390
- W_q, W_k, W_v = np.split(W_qkv_full, 3, axis=1) # each: (768, 768)
391
-
392
- # Show Wq head 0 and 1
393
- Wq_head0 = W_q[:, :64]
394
- Wq_head1 = W_q[:, 64:128]
395
-
396
- df_q = pd.DataFrame({
397
- "Wq_head0": Wq_head0[:5, 0],
398
- "Wq_head1": Wq_head1[:5, 0]
399
- }, index=[f"dim {i}" for i in range(5)])
400
- st.markdown("### Wq projection weights for head 0 vs head 1 (first 5 input dims → output dim 0):")
401
- st.dataframe(df_q.style.format(precision=5))
402
-
403
- # Show Wk and Wv for head 0
404
- Wk_head0 = W_k[:, :64]
405
- Wv_head0 = W_v[:, :64]
406
-
407
- df_kv = pd.DataFrame({
408
- "Wk_head0": Wk_head0[:5, 0],
409
- "Wv_head0": Wv_head0[:5, 0]
410
- }, index=[f"dim {i}" for i in range(5)])
411
- st.markdown("### Wk and Wv projection weights for head 0 (first 5 input dims → output dim 0):")
412
- st.dataframe(df_kv.style.format(precision=5))
413
-
414
- st.markdown("""
415
- ✅ This confirms that each head has **distinct projections** for Q, K, and V.
416
- The same input `x` is transformed differently per head, allowing GPT-2 to learn different attention perspectives.
417
- """)
418
-
419
-
420
- # 11 · 📐 How W_qkv Projects an Input Vector into Q, K, V
421
- st.subheader("📐 How W_qkv Projects an Input Vector → Q, K, V")
422
-
423
- st.markdown("""
424
- In GPT-2, the combined projection layer `c_attn` maps a single input embedding
425
- into a concatenated vector that contains **Q, K, and V**.
426
-
427
- Each of these is 768-dimensional, so the full output is 768 × 3 = 2304.
428
- """)
429
-
430
- st.latex(r"x \in \mathbb{R}^{768} \quad \rightarrow \quad [Q \;|\; K \;|\; V] \in \mathbb{R}^{2304}")
431
-
432
- st.markdown("---")
433
-
434
- st.markdown("### 🧪 Mini GPT Example (3D → 6D Projection)")
435
-
436
- st.markdown("Imagine a tiny model:")
437
-
438
- st.markdown("""
439
- - Input vector `x ∈ ℝ³`
440
- - Q, K, V are each 2D → total output = 6D
441
- - Thus:
442
- """)
443
 
444
- st.latex(r"W_{\text{qkv}} \in \mathbb{R}^{6 \times 3}, \quad b_{\text{qkv}} \in \mathbb{R}^6")
 
445
 
446
- # Miniature input vector and projection weights
447
- mini_x = np.array([1.0, 2.0, 3.0]) # (3,)
448
- mini_W = np.array( # (6, 3)
449
- [
450
- [0.1, 0.2, 0.3], # → Q₁
451
- [0.4, 0.5, 0.6], # Q₂
452
- [0.7, 0.8, 0.9], # → K₁
453
- [1.0, 1.1, 1.2], # K₂
454
- [1.3, 1.4, 1.5], # → V₁
455
- [1.6, 1.7, 1.8], # → V₂
456
- ]
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
457
  )
458
- mini_b = np.array([0.01, 0.02, 0.03, 0.04, 0.05, 0.06]) # (6,)
459
-
460
- mini_out = mini_W @ mini_x + mini_b # (6,)
461
- Qm, Km, Vm = np.split(mini_out, 3) # each (2,)
462
-
463
- st.code("Input vector x = [1.0, 2.0, 3.0] # shape (3,)")
464
- st.code("W_qkv shape = (6, 3) # maps 3 → 6")
465
-
466
- st.code(f"Output = W_qkv @ x + b = {mini_out.round(2).tolist()}")
467
-
468
- df_mini = pd.DataFrame(
469
- {
470
- "Q": Qm.round(2),
471
- "K": Km.round(2),
472
- "V": Vm.round(2)
473
- },
474
- index=["dim 1", "dim 2"]
475
  )
476
 
477
- st.markdown("**Split into Q, K, V (each 2D):**")
478
- st.dataframe(df_mini.style.format(precision=2))
479
-
480
- st.markdown("---")
481
-
482
- st.markdown("### 📏 Real GPT-2 Projection Shapes")
483
-
484
- df_shapes = pd.DataFrame({
485
- "Tensor": [
486
- "Input x",
487
- "W_qkv (linear layer)",
488
- "b_qkv (bias)",
489
- "Output = x @ W_qkv + b",
490
- "Q / K / V each",
491
- "Head reshaping"
492
- ],
493
- "Shape": [
494
- "(768,)",
495
- "(2304, 768)",
496
- "(2304,)",
497
- "(2304,)",
498
- "(768,)",
499
- "12 heads × 64 dims = 768"
500
- ]
501
- })
502
- st.dataframe(df_shapes)
503
-
504
- st.markdown("""
505
- Each attention **head** gets its own slice:
506
- - Q_head₀ = Q[:, :64]
507
- - K_head₀ = K[:, :64]
508
- - V_head₀ = V[:, :64]
509
-
510
- That’s how one input vector creates multi-headed Q, K, and V for scaled dot-product attention.
511
- """)
512
-
513
-
514
- st.subheader("Additional notes:")
515
- st.markdown(
516
- """
517
- ---
518
-
519
- ## 🧠 What Does `Ġ` Mean?
520
-
521
- The character `Ġ` (U+0120: Latin Capital Letter G with dot above) is used to:
522
-
523
- > **Represent a leading space** before the token.
524
-
525
- ---
526
-
527
- ### ✅ Example:
528
-
529
- Let’s look at a sentence:
530
-
531
- ```
532
- "The cat sat on the mat"
533
- ```
534
-
535
- When tokenized using GPT-2 tokenizer (`GPT2TokenizerFast`), it becomes:
536
-
537
- ```
538
- ['The', 'Ġcat', 'Ġsat', 'Ġon', 'Ġthe', 'Ġmat']
539
- ```
540
-
541
- * `'The'` → First word, no leading space.
542
- * `'Ġcat'` → Space + "cat"
543
- * `'Ġsat'` → Space + "sat"
544
- * etc.
545
-
546
- So `Ġ` means:
547
-
548
- > "This token starts after a space."
549
-
550
- ---
551
-
552
- ### ⚠️ Why Not Just Use `" "`?
553
-
554
- Because GPT-2 uses a **vocabulary of subword units** (BPE). These tokens are strings, not raw characters or bytes. Including space as a separate token would have complicated the merge process. So:
555
-
556
- * `Ġ` = internal marker used in the vocabulary file
557
- * It's not a space character but tells the tokenizer "insert space before decoding this."
558
-
559
- ---
560
-
561
- ### ✅ When Detokenizing
562
-
563
- The tokenizer **removes the `Ġ` and adds a space** during decoding:
564
-
565
- ```python
566
- from transformers import GPT2TokenizerFast
567
-
568
- tokenizer = GPT2TokenizerFast.from_pretrained("gpt2")
569
-
570
- tokens = tokenizer.tokenize("The cat sat on the mat")
571
- print(tokens)
572
- # ['The', 'Ġcat', 'Ġsat', 'Ġon', 'Ġthe', 'Ġmat']
573
-
574
- ids = tokenizer.convert_tokens_to_ids(tokens)
575
- decoded = tokenizer.decode(ids)
576
- print(decoded)
577
- # 'The cat sat on the mat'
578
- ```
579
-
580
- ---
581
-
582
- ## ✅ Summary
583
-
584
- | Token | Interprets As |
585
- | -------- | ------------------------- |
586
- | `'The'` | `'The'` (no space before) |
587
- | `'Ġcat'` | `' cat'` |
588
- | `'Ġsat'` | `' sat'` |
589
- | `'Ġon'` | `' on'` |
590
- | `'Ġthe'` | `' the'` |
591
- | `'Ġmat'` | `' mat'` |
592
-
593
-
594
- ---
595
-
596
- ## ✅ What is `@` in Python?
597
-
598
- In Python 3.5+, the `@` operator means:
599
-
600
- > **Matrix multiplication** (also called **dot product** or **tensor contraction** depending on context)
601
-
602
- ---
603
-
604
- ### ✅ Equivalent to:
605
-
606
- ```python
607
- A @ B ⟺ np.matmul(A, B)
608
- ```
609
-
610
- Or if both are 1D/2D NumPy arrays:
611
-
612
- ```python
613
- A @ B ⟺ np.dot(A, B)
614
- ```
615
-
616
- ---
617
-
618
- ## 🔍 In your case:
619
-
620
- ```python
621
- Output = W_qkv @ x + b
622
- ```
623
-
624
- ### Let’s say:
625
-
626
- * `x` = shape **(3,)**
627
- * `W_qkv` = shape **(6, 3)**
628
- * `b` = shape **(6,)**
629
-
630
- ---
631
-
632
- ### Then:
633
-
634
- * `W_qkv @ x` → matrix–vector multiplication
635
- → shape: **(6,)**
636
-
637
- * Adding `b` → element-wise vector addition
638
- → final shape: **(6,)**
639
-
640
- ---
641
-
642
- ### So this line:
643
-
644
- ```python
645
- Output = W_qkv @ x + b
646
- ```
647
-
648
- Means:
649
-
650
- 1. Multiply the **input vector `x`** with the **projection matrix `W_qkv`**
651
- 2. Add a **bias vector `b`**
652
- 3. Result = combined **\[Q | K | V]** output
653
-
654
- ---
655
-
656
- ## ✅ Example:
657
-
658
- ```python
659
- x = np.array([1, 2, 3])
660
- W_qkv = np.array([
661
- [0.1, 0.2, 0.3], # Q1
662
- [0.4, 0.5, 0.6], # Q2
663
- [0.7, 0.8, 0.9], # K1
664
- [1.0, 1.1, 1.2], # K2
665
- [1.3, 1.4, 1.5], # V1
666
- [1.6, 1.7, 1.8], # V2
667
- ])
668
- b = np.array([0.01, 0.02, 0.03, 0.04, 0.05, 0.06])
669
-
670
- output = W_qkv @ x + b
671
- ```
672
-
673
- Manually:
674
-
675
- * `W_qkv @ x` = `[1.4, 3.2, 5.0, 6.8, 8.6, 10.4]`
676
- * After adding `b` → `[1.41, 3.22, 5.03, 6.84, 8.65, 10.46]`
677
-
678
- ---
679
-
680
- ## ✅ Summary
681
-
682
- | Expression | Meaning |
683
- | ------------- | ----------------------------- |
684
- | `@` | Matrix multiplication (`dot`) |
685
- | `W @ x + b` | Linear transformation |
686
- | Shape `W @ x` | `(m, n) @ (n,) = (m,)` |
687
-
688
- Would you like to include this in your Streamlit visualizer as an expandable note or equation section?
689
-
690
-
691
 
692
 
693
- """)
694
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
  import streamlit as st
 
 
 
2
  import numpy as np
3
  from transformers import GPT2TokenizerFast, GPT2Model
 
 
 
4
 
5
+ # 1. Load tokenizer and model
6
@st.cache_resource
def load_resources():
    """Load the GPT-2 tokenizer and model from the bundled asset folders.

    Both loads pass ``local_files_only=True``, so the files must already be
    present under ``./assets/tokenizer`` and ``./assets/model`` (resolved
    against the current working directory). The function is wrapped in
    ``st.cache_resource`` so the loaded pair is shared rather than rebuilt.

    Returns:
        tuple: ``(tokenizer, model)`` — a ``GPT2TokenizerFast`` and a
        ``GPT2Model``.
    """
    tokenizer_dir = "./assets/tokenizer"
    model_dir = "./assets/model"
    # Tuple elements evaluate left to right: tokenizer first, then model.
    return (
        GPT2TokenizerFast.from_pretrained(tokenizer_dir, local_files_only=True),
        GPT2Model.from_pretrained(model_dir, local_files_only=True),
    )


# Initialize resources
tokenizer, model = load_resources()
22
 
23
+ # 2. Helper to get the full embedding matrix
24
@st.cache_resource
def get_embedding_matrix():
    """Return the model's input-embedding weight table as a NumPy array.

    Reads the module-level ``model``; the tensor is detached and moved to
    CPU before conversion. One row per vocabulary entry is assumed by the
    callers that index it with token IDs.
    """
    embedding_layer = model.get_input_embeddings()
    weights = embedding_layer.weight
    return weights.detach().cpu().numpy()
27
+
28
+ # 3. Initialize session state
29
# Seed each session-state slot once. Factories (not shared literals) are used
# so every session gets its own fresh list/dict.
for state_key, make_default in (
    ("tokens", list),
    ("token_ids", list),
    ("embeddings", dict),
    ("current_id", lambda: None),
):
    if state_key not in st.session_state:
        st.session_state[state_key] = make_default()
35
+
36
st.title("🔍 Embedding & Positional Encoding Explorer")

# 4. Sentence input & BPE tokenize
# The token strings and their IDs are stored in session state so they
# survive the reruns triggered by later widget interactions.
sentence = st.text_input("Enter a sentence to tokenize:")
tokenize_clicked = st.button("BPE Tokenize")
if tokenize_clicked:
    encoded_ids = tokenizer.encode(sentence, add_special_tokens=False)
    st.session_state.tokens = tokenizer.convert_ids_to_tokens(encoded_ids)
    st.session_state.token_ids = encoded_ids
45
+
46
+ # 5. Display tokens + IDs with embedding buttons
47
# 5. Display tokens + IDs with embedding buttons
if st.session_state.tokens:
    st.subheader("Tokens and IDs")
    label_col, button_col = st.columns([4, 1])
    # Look the embedding table up once instead of on every button click path.
    embedding_table = model.get_input_embeddings().weight
    token_rows = zip(st.session_state.tokens, st.session_state.token_ids)
    for i, (tok, tid) in enumerate(token_rows):
        label_col.write(f"{i+1}. **{tok}** → ID {tid}")
        # The widget key includes the position as well as the token ID:
        # a sentence can contain the same token more than once, and two
        # widgets sharing one key make Streamlit raise a duplicate-key error.
        if button_col.button(f"Create Embedding for {tid}", key=f"embed_{i}_{tid}"):
            vec = embedding_table[tid].detach().cpu().numpy()
            # Copy so later in-place edits never touch the model weights.
            st.session_state.embeddings[tid] = vec.copy()
            st.session_state.current_id = tid
56
+
57
+ # 6. Show & edit embedding sliders for selected token
58
_SLIDER_MIN = -5.0
_SLIDER_MAX = 5.0

_PE_FORMULA_MD = r"""
**Positional Encoding Formula**

For position $p$ and dimension $d$ (where $D$ is the embedding size):

$$
PE(p,d) = \begin{cases}
\sin\bigl(\frac{p}{10000^{d / D}}\bigr), & \text{if } d \text{ is even} \\
\cos\bigl(\frac{p}{10000^{(d-1) / D}}\bigr), & \text{if } d \text{ is odd}
\end{cases}
$$
"""


def _positional_component(position, index, width):
    """Return the sinusoidal positional-encoding value for one coordinate.

    Even ``index`` uses ``sin(p / 10000**(d / D))``; odd ``index`` uses
    ``cos(p / 10000**((d - 1) / D))`` — the same formula shown in the UI.
    ``width`` must be positive (callers guarantee ``0 <= index < width``).
    """
    if index % 2 == 0:
        return np.sin(position / (10000 ** (index / width)))
    return np.cos(position / (10000 ** ((index - 1) / width)))


def _rank_by_cosine(matrix, query, exclude_id, limit=20):
    """Rank rows of ``matrix`` by cosine similarity to ``query``.

    Args:
        matrix: 2-D array, one embedding per row.
        query: 1-D vector with the same width as ``matrix``.
        exclude_id: row index left out of the ranking (the query's own token).
        limit: maximum number of row indices to return.

    Returns:
        tuple: ``(ranked_ids, sims)`` where ``ranked_ids`` holds up to
        ``limit`` row indices in descending similarity and ``sims`` is the
        full similarity vector for every row.
    """
    dot = matrix.dot(query)
    mat_norm = np.linalg.norm(matrix, axis=1)
    q_norm = np.linalg.norm(query)
    # Small epsilon keeps an all-zero row or query from dividing by zero.
    sims = dot / (mat_norm * q_norm + 1e-12)
    order = (-sims).argsort()
    # Drop the token itself by ID instead of assuming it is ranked first;
    # once the vector has been edited it may no longer be its own best match.
    order = order[order != exclude_id]
    return order[:limit], sims


# 6. Show & edit embedding sliders for selected token
if st.session_state.current_id is not None:
    tok_id = st.session_state.current_id
    emb_vec = st.session_state.embeddings[tok_id]
    st.subheader(f"Embedding for token ID {tok_id}")
    for dim_index, current in enumerate(emb_vec.tolist()):
        # Widen the bounds when the stored value has drifted outside the
        # default range (e.g. after positional encoding was added several
        # times); st.slider rejects a default outside [min_value, max_value].
        emb_vec[dim_index] = st.slider(
            f"Emb Dim {dim_index}",
            min(_SLIDER_MIN, current),
            max(_SLIDER_MAX, current),
            current,
            step=0.01,
            key=f"slider_{tok_id}_{dim_index}",
        )
    st.session_state.embeddings[tok_id] = emb_vec

    # Everything below reads emb_vec / tok_id, so it stays under the same
    # "a token is selected" guard.

    # 8. Positional Encoding inputs
    st.subheader("Positional Encoding")

    # Show formula in LaTeX
    st.markdown(_PE_FORMULA_MD)

    pos = st.number_input("Position (p)", min_value=0, format="%d")
    dim = st.number_input(
        "Dimension index (0-based)", min_value=0, max_value=len(emb_vec)-1, format="%d"
    )
    emb_dim = st.number_input(
        "Embedding Dimension (vector length)", value=len(emb_vec), format="%d"
    )

    # 9. Add Pos Encoding
    if st.button("Compute and Add Pos Encoding to the Embedding"):
        p, d, D = int(pos), int(dim), int(emb_dim)
        if 0 <= d < D:
            emb_vec[d] += _positional_component(p, d, D)
            st.session_state.embeddings[tok_id] = emb_vec
        else:
            st.error("Dimension index out of range.")

    # 10. Similarity search with positional encoding
    if st.button("Similarity Search (Using the Embedding)", key="sim_search_pos"):
        topk, sims = _rank_by_cosine(
            get_embedding_matrix(), st.session_state.embeddings[tok_id], tok_id
        )
        st.write("**Top 20 similar tokens after PosEnc:**")
        for idx in topk:
            token_str = tokenizer.convert_ids_to_tokens([idx])[0]
            st.write(f"ID {idx} ({token_str}): {sims[idx]:.4f}")
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
134
 
135
 
 
136