File size: 8,773 Bytes
1e9b3ac
5ebdf85
9942d88
0ed5ac0
8366831
1e9b3ac
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
5ebdf85
1e9b3ac
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
5ebdf85
1e9b3ac
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
5ebdf85
1e9b3ac
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
# transformer_demo_persistent.py
import streamlit as st
import numpy as np

# ─── Config ───────────────────────────────────────────────────────── 
VOCAB_SIZE = 50257  # GPT-2 BPE vocabulary size; token IDs are reduced mod this
SEQ_LEN    = 12     # max number of words taken from the input sentence
D_MODEL    = 64     # demo dimensionality (embedding & hidden size)
NUM_HEADS  = 4      # number of parallel attention heads
D_HEAD     = D_MODEL // NUM_HEADS  # per-head dimension d_k = d_model / H

# Fixed seed so the toy embedding table below is identical on every run.
np.random.seed(0)
# Toy embedding table: one small random vector per vocabulary entry
# (stands in for a learned embedding matrix).
embedding_matrix = np.random.randn(VOCAB_SIZE, D_MODEL) * 0.01

# ─── Helpers ────────────────────────────────────────────────────────
def get_positional_encoding(seq_len, d_model):
    """Return the (seq_len, d_model) sinusoidal positional-encoding matrix.

    Even columns hold sin(pos / 10000^(2k/d_model)) and odd columns the
    matching cos, as in "Attention Is All You Need".
    """
    positions = np.arange(seq_len).reshape(-1, 1)
    dims = np.arange(d_model).reshape(1, -1)
    # Each column pair (2k, 2k+1) shares one angular frequency.
    angle_rates = 1 / (10000 ** (2 * (dims // 2) / d_model))
    angles = positions * angle_rates
    encoding = np.zeros((seq_len, d_model))
    encoding[:, 0::2] = np.sin(angles[:, 0::2])
    encoding[:, 1::2] = np.cos(angles[:, 1::2])
    return encoding

def gelu(x):
    """Tanh approximation of the Gaussian Error Linear Unit.

    Smoothly gates inputs: near-identity for large positive x, near-zero
    for large negative x.
    """
    inner = np.sqrt(2 / np.pi) * (x + 0.044715 * np.power(x, 3))
    return x * 0.5 * (1.0 + np.tanh(inner))

def softmax(x, axis=-1):
    """Numerically stable softmax along `axis`.

    Subtracting the per-slice max before exponentiating prevents overflow
    without changing the result (softmax is shift-invariant).
    """
    shifted = np.exp(x - np.max(x, axis=axis, keepdims=True))
    return shifted / shifted.sum(axis=axis, keepdims=True)

def self_attention(Q, K, V):
    """Single-head scaled dot-product attention.

    Parameters
    ----------
    Q, K : arrays of shape (seq_len, d_k) — query/key projections.
    V    : array of shape (seq_len, d_v) — value projection.

    Returns
    -------
    (context, weights): context is (seq_len, d_v); weights is the
    (seq_len, seq_len) row-stochastic attention matrix.

    The scale is inferred from K's last dimension instead of the
    module-level D_HEAD constant, so the helper works for any head size
    (backward compatible: every caller in this file passes D_HEAD-wide K).
    """
    d_k = K.shape[-1]
    # Divide by sqrt(d_k) so dot-product magnitude stays O(1) as d_k grows.
    scores = Q @ K.T / np.sqrt(d_k)
    weights = softmax(scores, axis=-1)
    context = weights @ V
    return context, weights

# ─── Streamlit UI ───────────────────────────────────────────────────
st.title("πŸ” Transformer Step-by-Step Demo (Persistent Outputs)")

# Display model/demo dimensions with explanations
st.markdown("#### Model & Demo Dimensions")
st.write(f"- **Vocabulary size (|V|):** {VOCAB_SIZE} (total tokens)")
st.write(f"- **Sequence length (SEQ_LEN):** {SEQ_LEN} (max tokens processed)")
st.write(f"- **Model dimension (d_model):** {D_MODEL} (embedding & hidden size)")
st.write(f"- **Number of heads (H):** {NUM_HEADS} (parallel attention heads)")
st.write(f"- **Head dimension (d_k):** {D_HEAD} (d_model / H)")


# Whitespace-split tokenization; words beyond SEQ_LEN are silently dropped.
sentence = st.text_input("Input (up to 12 words):", 
                         "The quick brown fox jumps over the lazy dog")
words = sentence.split()[:SEQ_LEN]

# Initialize session state flags.
# Each stage_i flag persists across Streamlit reruns, so a stage's output
# stays visible once its button has been pressed.
for i in range(1, 8):
    st.session_state.setdefault(f"stage_{i}", False)

# Buttons to toggle each stage
cols = st.columns(7)
labels = [
    "1️⃣ Tokenize","2️⃣ Embed","3️⃣ Pos-Enc",
    "4️⃣ Self-Attn","5️⃣ Multi-Head","6️⃣ Add&Norm+FFN",
    "7️⃣ Logitsβ†’Next"
]
for i, col in enumerate(cols, start=1):
    if col.button(labels[i-1], key=f"btn_{i}"):
        st.session_state[f"stage_{i}"] = True

# Precompute shared values
import hashlib

def _stable_token_id(word):
    """Deterministic pseudo token ID for a word.

    The builtin hash() is salted per interpreter run (PYTHONHASHSEED), so
    the previous `abs(hash(w)) % VOCAB_SIZE` produced different token IDs
    every time the app restarted. An MD5 digest gives stable, well-spread
    IDs instead (not a real BPE tokenizer — demo only).
    """
    digest = hashlib.md5(word.encode("utf-8")).digest()
    return int.from_bytes(digest[:8], "big") % VOCAB_SIZE

token_ids = [_stable_token_id(w) for w in words]
embeds   = embedding_matrix[token_ids]
pe       = get_positional_encoding(len(words), D_MODEL)
x        = embeds + pe  # position-aware embeddings

# ─── 1️⃣ Tokenization ───────────────────────────────────────────────
# Rendered only after its button has been pressed; the session-state flag
# keeps the section visible on later reruns.
if st.session_state.stage_1:
    st.markdown("### 1️⃣ Tokenization")
    st.write("**What:** Map each word/subword β†’ unique integer ID using a vocab of size 50257.")
    st.write("**Why:** Neural networks require numeric inputs, not raw text.")
    st.write("**Output (token IDs):**", token_ids)

# ─── 2️⃣ Embedding ──────────────────────────────────────────────────
# Shows the raw embedding rows looked up for the token IDs (before the
# positional encoding is added).
if st.session_state.stage_2:
    st.markdown("### 2️⃣ Embedding")
    st.write("**What:** Replace each token ID with a learned dense vector of size d_model.")
    st.write("**Why:** Embeddings capture semantic relationshipsβ€”similar words lie close in vector space.")
    st.write(f"Shape: {embeds.shape}")
    st.write(embeds)

# ─── 3️⃣ Positional Encoding ────────────────────────────────────────
# Displays the sin/cos formula and one example position-aware vector
# (x = embeds + pe, computed above).
if st.session_state.stage_3:
    st.markdown("### 3️⃣ Positional Encoding")
    st.write("**What:** Add sinusoidal vectors so model knows each token’s position in the sequence:")
    st.latex(r"""
      \mathrm{PE}_{(pos,2k)}=\sin\bigl(\tfrac{pos}{10000^{2k/d_{model}}}\bigr),\quad
      \mathrm{PE}_{(pos,2k+1)}=\cos\bigl(\tfrac{pos}{10000^{2k/d_{model}}}\bigr).
    """)
    st.write("**Why:** Self-attention itself is permutation-invariantβ€”positions must be encoded separately.")
    st.write("**Example vector at position 0:**", x[0])

# ─── 4️⃣ Self-Attention (single head) ───────────────────────────────
# Explains Q/K/V and runs one head of scaled dot-product attention on the
# position-aware embeddings with untrained random projections.
if st.session_state["stage_4"]:
    st.markdown("### 4️⃣ Scaled Dot-Product Self-Attention (Single Head)")
    st.write("**What:** For each token, compute:")
    st.write("- **Query (Q):** Projection that asks β€œwhat am I looking for?”")
    st.write("- **Key (K):** Projection that asks β€œwhat do you offer?”")
    st.write("- **Value (V):** Projection holding actual information to pass forward")
    st.write("Mathematically:")
    st.latex(r"""
      Q = XW^Q,\quad
      K = XW^K,\quad
      V = XW^V,
      \quad
      \alpha_{ij}=\frac{Q_i\cdot K_j}{\sqrt{d_k}},\quad
      \beta_{ij}=\mathrm{softmax}_j(\alpha_{ij}),\quad
      C_i=\sum_j \beta_{ij}\,V_j.
    """)
    # Untrained random projections are enough to illustrate the mechanics
    # (drawn in Q, K, V order, matching the original RNG consumption).
    W_query, W_key, W_value = (np.random.randn(D_MODEL, D_HEAD) for _ in range(3))
    context, weights = self_attention(x @ W_query, x @ W_key, x @ W_value)
    st.write("Attention weights for token 0 (softmax across all positions):")
    st.write(weights[0])
    st.write("Resulting context vector for token 0:")
    st.write(context[0])

# ─── 5️⃣ Multi-Head Attention ───────────────────────────────────────
# Runs NUM_HEADS independent demo heads and concatenates their outputs,
# recovering a (seq_len, d_model) tensor.
if st.session_state["stage_5"]:
    st.markdown("### 5️⃣ Multi-Head Attention")
    st.write("**What:** Run H parallel self-attention heads, each with its own (W^Q,W^K,W^V).")
    st.write("**Why:** Different heads can focus on different relationsβ€”syntax, semantics, coreference, etc.")
    head_outputs = []
    for _ in range(NUM_HEADS):
        # Fresh random projections per head, drawn in Q, K, V order.
        proj_q = np.random.randn(D_MODEL, D_HEAD)
        proj_k = np.random.randn(D_MODEL, D_HEAD)
        proj_v = np.random.randn(D_MODEL, D_HEAD)
        ctx, _weights = self_attention(x @ proj_q, x @ proj_k, x @ proj_v)
        head_outputs.append(ctx)
    multi_c = np.concatenate(head_outputs, axis=-1)
    st.write(f"Concatenated output shape: {multi_c.shape}")

# ─── 6️⃣ Add & Norm + Feed-Forward ──────────────────────────────────
# Simulates one post-attention sub-layer: residual + LayerNorm, then a
# position-wise FFN with GELU, then a second residual + LayerNorm.
if st.session_state["stage_6"]:
    st.markdown("### 6️⃣ Add & Norm + Position-wise Feed-Forward")
    st.write("**Residual & LayerNorm:** Stabilize training by adding input back:")
    st.write("Z1 = LayerNorm(MHA(X) + X)")
    st.write("**Feed-Forward Network:**")
    st.latex(r"""
      \mathrm{FFN}(Z)=W_2\bigl[\mathrm{GELU}(W_1Z + b_1)\bigr] + b_2,
    """)
    st.write("GELU(x)β‰ˆxΒ·0.5Β·(1+tanh(√(2/Ο€)(x+0.0447xΒ³))) soft-gates small values.")

    def _layer_norm(t):
        # Simplified LayerNorm: no learned gain/bias; eps keeps it finite.
        return (t - t.mean(-1, keepdims=True)) / (t.std(-1, keepdims=True) + 1e-5)

    # Stand-in for the attention output plus residual: input plus small noise.
    attn_plus_residual = x + np.random.randn(*x.shape) * 0.01
    normed = _layer_norm(attn_plus_residual)
    # Random (untrained) FFN weights with the usual 4x expansion.
    W1 = np.random.randn(D_MODEL, 4 * D_MODEL)
    b1 = np.zeros(4 * D_MODEL)
    W2 = np.random.randn(4 * D_MODEL, D_MODEL)
    b2 = np.zeros(D_MODEL)
    ffn_out = gelu(normed @ W1 + b1) @ W2 + b2
    output = _layer_norm(ffn_out + normed)
    st.write("Output vector at token 0 after FFN & Add+Norm:")
    st.write(output[0])

# ─── 7️⃣ Final Projection & Softmax ─────────────────────────────────
# Projects (simulated) final hidden states to vocabulary logits and picks
# the argmax of the last position's distribution as the "next token".
if st.session_state["stage_7"]:
    st.markdown("### 7️⃣ Project to Vocab + Softmax β†’ Next Token")
    st.write("**What:** Map each final vector back to logits over 50K-plus vocab.")
    st.write("**Why:** Softmax(logits) gives a probability distribution; highest-prob token is chosen.")
    # Stand-in final hidden states (a real model would use the encoder output).
    final_states = np.random.randn(len(words), D_MODEL) * 0.01
    W_out = np.random.randn(D_MODEL, VOCAB_SIZE)
    bias_out = np.zeros(VOCAB_SIZE)
    probs = softmax(final_states @ W_out + bias_out, axis=-1)
    next_id = np.argmax(probs[-1])
    st.write("Predicted next token ID:", next_id)
    st.info("In a full model, you’d map that ID β†’ word via the vocab dictionary.")