# transformer_demo_persistent.py
import streamlit as st
import numpy as np
# ─── Config ──────────────────────────────────────────────────────────
VOCAB_SIZE = 50257
SEQ_LEN = 12
D_MODEL = 64 # demo dimensionality
NUM_HEADS = 4
D_HEAD = D_MODEL // NUM_HEADS
np.random.seed(0)
embedding_matrix = np.random.randn(VOCAB_SIZE, D_MODEL) * 0.01
# ─── Helpers ─────────────────────────────────────────────────────────
def get_positional_encoding(seq_len, d_model):
    pos = np.arange(seq_len)[:, None]
    dim = np.arange(d_model)[None, :]
    angle_rates = 1 / (10000 ** (2 * (dim // 2) / d_model))
    angles = pos * angle_rates
    pe = np.zeros((seq_len, d_model))
    pe[:, 0::2] = np.sin(angles[:, 0::2])
    pe[:, 1::2] = np.cos(angles[:, 1::2])
    return pe
def gelu(x):
    # Gaussian Error Linear Unit for non-linearity (tanh approximation)
    return x * 0.5 * (1.0 + np.tanh(np.sqrt(2 / np.pi) * (x + 0.044715 * np.power(x, 3))))
def softmax(x, axis=-1):
    e = np.exp(x - np.max(x, axis=axis, keepdims=True))  # subtract max for numerical stability
    return e / e.sum(axis=axis, keepdims=True)
def self_attention(Q, K, V):
    # Scaled dot-product attention
    scores = Q @ K.T / np.sqrt(D_HEAD)
    weights = softmax(scores, axis=-1)
    context = weights @ V
    return context, weights
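# Illustrative sketch, not wired into the staged UI below: decoder-style
# models (GPT and friends) additionally apply a causal mask so position i
# attends only to positions j <= i. `causal_self_attention` is a hypothetical
# helper added here for clarity; the demo stages use the unmasked version above.
def causal_self_attention(Q, K, V):
    scores = Q @ K.T / np.sqrt(D_HEAD)
    future = np.triu(np.ones_like(scores, dtype=bool), k=1)  # True strictly above the diagonal
    scores = np.where(future, -1e9, scores)  # large negative -> ~0 weight after softmax
    weights = softmax(scores, axis=-1)
    return weights @ V, weights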
# ─── Streamlit UI ────────────────────────────────────────────────────
st.title("Transformer Step-by-Step Demo (Persistent Outputs)")
# Display model/demo dimensions with explanations
st.markdown("#### Model & Demo Dimensions")
st.write(f"- **Vocabulary size (|V|):** {VOCAB_SIZE} (distinct tokens in the vocabulary)")
st.write(f"- **Sequence length (SEQ_LEN):** {SEQ_LEN} (max tokens processed)")
st.write(f"- **Model dimension (d_model):** {D_MODEL} (embedding & hidden size)")
st.write(f"- **Number of heads (H):** {NUM_HEADS} (parallel attention heads)")
st.write(f"- **Head dimension (d_k):** {D_HEAD} (d_model / H)")
sentence = st.text_input("Input (up to 12 words):",
                         "The quick brown fox jumps over the lazy dog")
words = sentence.split()[:SEQ_LEN]
# Initialize session state flags
for i in range(1, 8):
    st.session_state.setdefault(f"stage_{i}", False)
# Buttons to toggle each stage
cols = st.columns(7)
labels = [
    "1️⃣ Tokenize", "2️⃣ Embed", "3️⃣ Pos-Enc",
    "4️⃣ Self-Attn", "5️⃣ Multi-Head", "6️⃣ Add&Norm+FFN",
    "7️⃣ Logits→Next",
]
for i, col in enumerate(cols, start=1):
    if col.button(labels[i-1], key=f"btn_{i}"):
        st.session_state[f"stage_{i}"] = True
# Precompute shared values
token_ids = [abs(hash(w)) % VOCAB_SIZE for w in words]
embeds = embedding_matrix[token_ids]
pe = get_positional_encoding(len(words), D_MODEL)
x = embeds + pe # position-aware embeddings
# ─── 1️⃣ Tokenization ────────────────────────────────────────────────
if st.session_state.stage_1:
    st.markdown("### 1️⃣ Tokenization")
    st.write(f"**What:** Map each word/subword → a unique integer ID from a vocabulary of size {VOCAB_SIZE}.")
    st.write("**Why:** Neural networks require numeric inputs, not raw text.")
    st.write("**Output (token IDs):**", token_ids)
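    # Note (a sketch under stated assumptions): the hash-based IDs above are a
    # stand-in, not a real tokenizer. With the optional `tiktoken` package
    # (an assumption; it is not a dependency of this demo), GPT-2-style
    # byte-pair encoding would look like:
    #   import tiktoken
    #   enc = tiktoken.get_encoding("gpt2")
    #   token_ids = enc.encode(sentence)
    st.caption("Here IDs come from Python's hash(); real models use a learned subword tokenizer.")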
# ─── 2️⃣ Embedding ───────────────────────────────────────────────────
if st.session_state.stage_2:
    st.markdown("### 2️⃣ Embedding")
    st.write("**What:** Replace each token ID with a learned dense vector of size d_model.")
    st.write("**Why:** Embeddings capture semantic relationships: similar words lie close in vector space.")
    st.write(f"Shape: {embeds.shape}")
    st.write(embeds)
# ─── 3️⃣ Positional Encoding ─────────────────────────────────────────
if st.session_state.stage_3:
    st.markdown("### 3️⃣ Positional Encoding")
    st.write("**What:** Add sinusoidal vectors so the model knows each token's position in the sequence:")
    st.latex(r"""
    \mathrm{PE}_{(pos,2k)}=\sin\bigl(\tfrac{pos}{10000^{2k/d_{model}}}\bigr),\quad
    \mathrm{PE}_{(pos,2k+1)}=\cos\bigl(\tfrac{pos}{10000^{2k/d_{model}}}\bigr).
    """)
    st.write("**Why:** Self-attention itself is permutation-invariant; positions must be encoded separately.")
    st.write("**Example vector at position 0 (embedding + PE):**", x[0])
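    # Quick sanity check (illustrative): sinusoidal encodings stay in [-1, 1]
    # and vary with position, so they inject order information without
    # swamping the 0.01-scale embeddings.
    st.write("PE value range:", float(pe.min()), "to", float(pe.max()))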
# ─── 4️⃣ Self-Attention (single head) ────────────────────────────────
if st.session_state.stage_4:
    st.markdown("### 4️⃣ Scaled Dot-Product Self-Attention (Single Head)")
    st.write("**What:** For each token, compute:")
    st.write("- **Query (Q):** Projection that asks 'what am I looking for?'")
    st.write("- **Key (K):** Projection that asks 'what do you offer?'")
    st.write("- **Value (V):** Projection holding the actual information to pass forward")
    st.write("Mathematically:")
    st.latex(r"""
    Q = XW^Q,\quad
    K = XW^K,\quad
    V = XW^V,
    \quad
    \alpha_{ij}=\frac{Q_i\cdot K_j}{\sqrt{d_k}},\quad
    \beta_{ij}=\mathrm{softmax}_j(\alpha_{ij}),\quad
    C_i=\sum_j \beta_{ij}\,V_j.
    """)
    # random projections for the demo; a trained model learns W^Q, W^K, W^V
    Wq = np.random.randn(D_MODEL, D_HEAD)
    Wk = np.random.randn(D_MODEL, D_HEAD)
    Wv = np.random.randn(D_MODEL, D_HEAD)
    Q, K, V = x @ Wq, x @ Wk, x @ Wv
    context, weights = self_attention(Q, K, V)
    st.write("Attention weights for token 0 (softmax across all positions):")
    st.write(weights[0])
    st.write("Resulting context vector for token 0:")
    st.write(context[0])
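    # Sanity check (illustrative): each row of the attention matrix is a
    # probability distribution over positions, so it should sum to ~1.0.
    st.write("Sum of token 0's attention weights:", float(weights[0].sum()))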
# ─── 5️⃣ Multi-Head Attention ────────────────────────────────────────
if st.session_state.stage_5:
    st.markdown("### 5️⃣ Multi-Head Attention")
    st.write("**What:** Run H parallel self-attention heads, each with its own (W^Q, W^K, W^V).")
    st.write("**Why:** Different heads can focus on different relations: syntax, semantics, coreference, etc.")
    heads = []
    for h in range(NUM_HEADS):
        Wq = np.random.randn(D_MODEL, D_HEAD)
        Wk = np.random.randn(D_MODEL, D_HEAD)
        Wv = np.random.randn(D_MODEL, D_HEAD)
        c, _ = self_attention(x @ Wq, x @ Wk, x @ Wv)
        heads.append(c)
    multi_c = np.concatenate(heads, axis=-1)
    st.write(f"Concatenated output shape: {multi_c.shape}")
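    # Sketch of the step this demo omits (an assumption beyond the code above):
    # a full transformer mixes the concatenated heads with a learned output
    # projection W^O (d_model x d_model) before the residual connection.
    Wo = np.random.randn(D_MODEL, D_MODEL)  # hypothetical W^O, random for illustration
    mha_out = multi_c @ Wo
    st.write(f"Shape after the W^O projection: {mha_out.shape}")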
# ─── 6️⃣ Add & Norm + Feed-Forward ───────────────────────────────────
if st.session_state.stage_6:
    st.markdown("### 6️⃣ Add & Norm + Position-wise Feed-Forward")
    st.write("**Residual & LayerNorm:** Stabilize training by adding the input back:")
    st.write("Z1 = LayerNorm(MHA(X) + X)")
    st.write("**Feed-Forward Network:**")
    st.latex(r"""
    \mathrm{FFN}(Z)=W_2\,\mathrm{GELU}(W_1 Z + b_1) + b_2.
    """)
    st.write("GELU(x) ≈ x·0.5·(1 + tanh(√(2/π)(x + 0.0447x³))); it soft-gates small values.")
    # simulate the MHA output with small noise, then apply residual + LayerNorm
    x1 = x + np.random.randn(*x.shape) * 0.01
    ln1 = (x1 - x1.mean(-1, keepdims=True)) / (x1.std(-1, keepdims=True) + 1e-5)
    W1 = np.random.randn(D_MODEL, 4 * D_MODEL)
    b1 = np.zeros(4 * D_MODEL)
    W2 = np.random.randn(4 * D_MODEL, D_MODEL)
    b2 = np.zeros(D_MODEL)
    y = gelu(ln1 @ W1 + b1) @ W2 + b2
    ln2 = (y + ln1 - (y + ln1).mean(-1, keepdims=True)) / ((y + ln1).std(-1, keepdims=True) + 1e-5)
    st.write("Output vector at token 0 after FFN & Add+Norm:")
    st.write(ln2[0])
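    # Illustrative shape note: the FFN widens d_model -> 4*d_model -> d_model,
    # matching this demo's W1/W2 shapes (the 4x ratio follows the original
    # Transformer; other models pick different ratios).
    st.write(f"FFN widths: {D_MODEL} -> {4 * D_MODEL} -> {D_MODEL}")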
# ─── 7️⃣ Final Projection & Softmax ──────────────────────────────────
if st.session_state.stage_7:
    st.markdown("### 7️⃣ Project to Vocab + Softmax → Next Token")
    st.write(f"**What:** Map each final vector back to logits over the {VOCAB_SIZE}-token vocabulary.")
    st.write("**Why:** Softmax(logits) gives a probability distribution; the highest-probability token is chosen.")
    x_final = np.random.randn(len(words), D_MODEL) * 0.01  # stand-in for the final hidden states
    Wout = np.random.randn(D_MODEL, VOCAB_SIZE)
    b_out = np.zeros(VOCAB_SIZE)
    logits = x_final @ Wout + b_out
    probs = softmax(logits, axis=-1)
    next_id = np.argmax(probs[-1])
    st.write("Predicted next token ID:", next_id)
    st.info("In a full model, you'd map that ID → word via the vocab dictionary.")
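    # Sketch (an assumption, not part of the staged demo): generation usually
    # samples from a temperature-scaled distribution instead of taking argmax.
    temperature = 0.8  # hypothetical setting for illustration
    sampled_id = np.random.choice(VOCAB_SIZE, p=softmax(logits[-1] / temperature))
    st.write("Temperature-sampled next token ID (illustrative):", int(sampled_id))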