Spaces:
Sleeping
Sleeping
| # transformer_demo_persistent.py | |
import zlib

import numpy as np
import streamlit as st
# ─── Config ─────────────────────────────────────────────────────────
# Demo hyper-parameters. D_HEAD is derived from D_MODEL and NUM_HEADS.
VOCAB_SIZE = 50257
SEQ_LEN = 12
D_MODEL = 64  # demo dimensionality
NUM_HEADS = 4
D_HEAD = D_MODEL // NUM_HEADS

# Seed the global NumPy RNG, then draw the (VOCAB_SIZE, D_MODEL) embedding
# table from it, scaled down by 0.01.
np.random.seed(0)
embedding_matrix = 0.01 * np.random.randn(VOCAB_SIZE, D_MODEL)
# ─── Helpers ────────────────────────────────────────────────────────
def get_positional_encoding(seq_len, d_model):
    """Build the sinusoidal positional-encoding table.

    Even-indexed columns hold sines and odd-indexed columns hold cosines.
    Columns 2k and 2k+1 share the angular rate 1 / 10000**(2k / d_model).

    Args:
        seq_len: Number of positions (rows of the table).
        d_model: Encoding width (columns of the table).

    Returns:
        Float array of shape (seq_len, d_model).
    """
    positions = np.arange(seq_len).reshape(-1, 1)
    columns = np.arange(d_model).reshape(1, -1)
    pair_index = columns // 2  # columns 2k and 2k+1 both map to k
    inverse_freq = 1 / (10000 ** (2 * pair_index / d_model))
    phase = positions * inverse_freq

    table = np.zeros((seq_len, d_model))
    evens, odds = slice(0, None, 2), slice(1, None, 2)
    table[:, evens] = np.sin(phase[:, evens])
    table[:, odds] = np.cos(phase[:, odds])
    return table
def gelu(x):
    """Gaussian Error Linear Unit, tanh approximation.

    Evaluates x * 0.5 * (1 + tanh(sqrt(2/pi) * (x + 0.044715 * x**3)))
    element-wise.

    Args:
        x: Scalar or NumPy array.

    Returns:
        The activation, with the same shape as ``x``.
    """
    cubic_term = x + 0.044715 * np.power(x, 3)
    gate = 1.0 + np.tanh(np.sqrt(2 / np.pi) * cubic_term)
    return x * 0.5 * gate
def softmax(x, axis=-1):
    """Softmax along ``axis``.

    The maximum along ``axis`` is subtracted before exponentiating, so the
    largest exponent is exp(0) and large inputs do not overflow.

    Args:
        x: Array of scores.
        axis: Axis to normalize over (default: last).

    Returns:
        Array shaped like ``x`` whose entries sum to 1 along ``axis``.
    """
    shifted = x - np.max(x, axis=axis, keepdims=True)
    exps = np.exp(shifted)
    return exps / np.sum(exps, axis=axis, keepdims=True)
def self_attention(Q, K, V):
    """Scaled dot-product attention for one head.

    Computes ``softmax(Q @ K.T / sqrt(d_k)) @ V``. The scale ``d_k`` is the
    last-axis width of ``K``, so it always matches the vectors being dotted
    instead of relying on a module-level constant.

    Args:
        Q: Query array; its last axis must match the last axis of ``K``.
        K: Key array with one row per position.
        V: Value array with one row per position.

    Returns:
        Tuple ``(context, weights)``: ``weights`` is the row-wise softmax of
        the scaled scores and ``context`` is ``weights @ V``.
    """
    d_k = K.shape[-1]
    scores = Q @ K.T / np.sqrt(d_k)
    weights = softmax(scores, axis=-1)
    context = weights @ V
    return context, weights
# ─── Streamlit UI ───────────────────────────────────────────────────
# NOTE(review): the leading "π" in the title looks like a mis-decoded emoji.
# The intended character cannot be recovered from this file, so the string is
# left as found. Confirm and restore.
st.title("π Transformer Step-by-Step Demo (Persistent Outputs)")

# Display model/demo dimensions with explanations
st.markdown("#### Model & Demo Dimensions")
st.write(f"- **Vocabulary size (|V|):** {VOCAB_SIZE} (total tokens)")
st.write(f"- **Sequence length (SEQ_LEN):** {SEQ_LEN} (max tokens processed)")
st.write(f"- **Model dimension (d_model):** {D_MODEL} (embedding & hidden size)")
st.write(f"- **Number of heads (H):** {NUM_HEADS} (parallel attention heads)")
st.write(f"- **Head dimension (d_k):** {D_HEAD} (d_model / H)")

# The word limit in the label is taken from SEQ_LEN so it cannot drift from
# the truncation applied on the next statement.
sentence = st.text_input(
    f"Input (up to {SEQ_LEN} words):",
    "The quick brown fox jumps over the lazy dog",
)
# Whitespace-split the input and keep at most SEQ_LEN words.
words = sentence.split()[:SEQ_LEN]
# One label per stage; the number of stages is derived from this list.
labels = [
    "1️⃣ Tokenize", "2️⃣ Embed", "3️⃣ Pos-Enc",
    "4️⃣ Self-Attn", "5️⃣ Multi-Head", "6️⃣ Add&Norm+FFN",
    "7️⃣ Logits→Next",
]

# Initialize session state flags (stage_1 .. stage_N). setdefault leaves a flag
# that is already present untouched, so it survives Streamlit reruns.
for i in range(1, len(labels) + 1):
    st.session_state.setdefault(f"stage_{i}", False)

# One button per stage, laid out side by side. A click sets the stage's flag
# to True; nothing in this block sets a flag back to False.
cols = st.columns(len(labels))
for i, (col, label) in enumerate(zip(cols, labels), start=1):
    if col.button(label, key=f"btn_{i}"):
        st.session_state[f"stage_{i}"] = True
# Precompute shared values
if not words:
    # With no words every array below has zero rows, and the stage sections
    # that index row 0 / row -1 would raise. Stop this run with a hint instead.
    st.warning("Please enter at least one word.")
    st.stop()

# Pseudo-tokenizer: map each word to a stable ID in [0, VOCAB_SIZE).
# zlib.crc32 yields the same value on every run, whereas the built-in hash()
# of a str is salted per interpreter process and would change the IDs (and
# therefore the embeddings) whenever the app restarts.
token_ids = [zlib.crc32(w.encode("utf-8")) % VOCAB_SIZE for w in words]
embeds = embedding_matrix[token_ids]  # one embedding row per word
pe = get_positional_encoding(len(words), D_MODEL)
x = embeds + pe  # position-aware embeddings
# ─── 1️⃣ Tokenization ────────────────────────────────────────────────
# Shown once the stage_1 flag is set in session state.
if st.session_state.stage_1:
    st.markdown("### 1️⃣ Tokenization")
    # The vocab size is interpolated from VOCAB_SIZE so the text tracks the config.
    st.write(
        "**What:** Map each word/subword → unique integer ID "
        f"using a vocab of size {VOCAB_SIZE}."
    )
    st.write("**Why:** Neural networks require numeric inputs, not raw text.")
    st.write("**Output (token IDs):**", token_ids)
# ─── 2️⃣ Embedding ───────────────────────────────────────────────────
# Shown once the stage_2 flag is set in session state.
if st.session_state.stage_2:
    st.markdown("### 2️⃣ Embedding")
    st.write("**What:** Replace each token ID with a learned dense vector of size d_model.")
    st.write(
        "**Why:** Embeddings capture semantic relationships—"
        "similar words lie close in vector space."
    )
    # embeds holds one row of embedding_matrix per input word.
    st.write(f"Shape: {embeds.shape}")
    st.write(embeds)
# ─── 3️⃣ Positional Encoding ─────────────────────────────────────────
# Shown once the stage_3 flag is set in session state.
if st.session_state.stage_3:
    st.markdown("### 3️⃣ Positional Encoding")
    st.write(
        "**What:** Add sinusoidal vectors so model knows "
        "each token’s position in the sequence:"
    )
    st.latex(r"""
    \mathrm{PE}_{(pos,2k)}=\sin\bigl(\tfrac{pos}{10000^{2k/d_{model}}}\bigr),\quad
    \mathrm{PE}_{(pos,2k+1)}=\cos\bigl(\tfrac{pos}{10000^{2k/d_{model}}}\bigr).
    """)
    st.write(
        "**Why:** Self-attention itself is permutation-invariant—"
        "positions must be encoded separately."
    )
    # x[0] is the first token's embedding with its positional encoding added.
    st.write("**Example vector at position 0:**", x[0])
# ─── 4️⃣ Self-Attention (single head) ────────────────────────────────
# Shown once the stage_4 flag is set in session state.
if st.session_state.stage_4:
    st.markdown("### 4️⃣ Scaled Dot-Product Self-Attention (Single Head)")
    st.write("**What:** For each token, compute:")
    st.write("- **Query (Q):** Projection that asks “what am I looking for?”")
    st.write("- **Key (K):** Projection that asks “what do you offer?”")
    st.write("- **Value (V):** Projection holding actual information to pass forward")
    st.write("Mathematically:")
    st.latex(r"""
    Q = XW^Q,\quad
    K = XW^K,\quad
    V = XW^V,
    \quad
    \alpha_{ij}=\frac{Q_i\cdot K_j}{\sqrt{d_k}},\quad
    \beta_{ij}=\mathrm{softmax}_j(\alpha_{ij}),\quad
    C_i=\sum_j \beta_{ij}\,V_j.
    """)
    # Random projections for the demo. A local, fixed-seed generator is used so
    # that showing this stage does not advance the global NumPy RNG that other
    # parts of the script draw from.
    rng = np.random.default_rng(0)
    Wq, Wk, Wv = (rng.standard_normal((D_MODEL, D_HEAD)) for _ in range(3))
    Q, K, V = x @ Wq, x @ Wk, x @ Wv
    context, weights = self_attention(Q, K, V)
    st.write("Attention weights for token 0 (softmax across all positions):")
    st.write(weights[0])
    st.write("Resulting context vector for token 0:")
    st.write(context[0])
# ─── 5️⃣ Multi-Head Attention ────────────────────────────────────────
# Shown once the stage_5 flag is set in session state.
if st.session_state.stage_5:
    st.markdown("### 5️⃣ Multi-Head Attention")
    st.write("**What:** Run H parallel self-attention heads, each with its own (W^Q,W^K,W^V).")
    st.write(
        "**Why:** Different heads can focus on different relations—"
        "syntax, semantics, coreference, etc."
    )
    # A local, fixed-seed generator supplies the per-head random projections so
    # this stage does not advance the global NumPy RNG used elsewhere in the script.
    rng = np.random.default_rng(0)
    heads = []
    for _ in range(NUM_HEADS):
        Wq, Wk, Wv = (rng.standard_normal((D_MODEL, D_HEAD)) for _ in range(3))
        head_context, _ = self_attention(x @ Wq, x @ Wk, x @ Wv)
        heads.append(head_context)
    # Join the per-head contexts along the feature axis.
    multi_c = np.concatenate(heads, axis=-1)
    st.write(f"Concatenated output shape: {multi_c.shape}")
# ─── 6️⃣ Add & Norm + Feed-Forward ───────────────────────────────────
# Shown once the stage_6 flag is set in session state.
if st.session_state.stage_6:
    st.markdown("### 6️⃣ Add & Norm + Position-wise Feed-Forward")
    st.write("**Residual & LayerNorm:** Stabilize training by adding input back:")
    st.write("Z1 = LayerNorm(MHA(X) + X)")
    st.write("**Feed-Forward Network:**")
    st.latex(r"""
    \mathrm{FFN}(Z)=W_2\bigl[\mathrm{GELU}(W_1Z + b_1)\bigr] + b_2,
    """)
    # The constant in the text matches the one used by gelu().
    st.write("GELU(x)≈x·0.5·(1+tanh(√(2/π)(x+0.044715x³))) soft-gates small values.")

    # A local, fixed-seed generator makes the numbers shown here independent of
    # how much of the global NumPy RNG stream other stages consumed in this run.
    rng = np.random.default_rng(0)

    # simulate MHA + residual: the attention output is stood in for by small noise
    x1 = x + rng.standard_normal(x.shape) * 0.01
    ln1 = (x1 - x1.mean(-1, keepdims=True)) / (x1.std(-1, keepdims=True) + 1e-5)

    # Random feed-forward weights with a 4x hidden expansion; biases are zero.
    W1 = rng.standard_normal((D_MODEL, 4 * D_MODEL))
    b1 = np.zeros(4 * D_MODEL)
    W2 = rng.standard_normal((4 * D_MODEL, D_MODEL))
    b2 = np.zeros(D_MODEL)
    y = gelu(ln1 @ W1 + b1) @ W2 + b2

    # Second residual connection followed by the same normalization.
    residual = y + ln1
    ln2 = (residual - residual.mean(-1, keepdims=True)) / (
        residual.std(-1, keepdims=True) + 1e-5
    )
    st.write("Output vector at token 0 after FFN & Add+Norm:")
    st.write(ln2[0])
# ─── 7️⃣ Final Projection & Softmax ──────────────────────────────────
# Shown once the stage_7 flag is set in session state.
if st.session_state.stage_7:
    st.markdown("### 7️⃣ Project to Vocab + Softmax → Next Token")
    st.write("**What:** Map each final vector back to logits over 50K-plus vocab.")
    st.write("**Why:** Softmax(logits) gives a probability distribution; highest-prob token is chosen.")

    # A local, fixed-seed generator makes the prediction shown here independent
    # of how much of the global NumPy RNG stream other stages consumed in this run.
    rng = np.random.default_rng(0)

    # x_final is a random stand-in for the final hidden states; it is not
    # computed from the outputs of the earlier stages.
    x_final = rng.standard_normal((len(words), D_MODEL)) * 0.01
    Wout = rng.standard_normal((D_MODEL, VOCAB_SIZE))
    b_out = np.zeros(VOCAB_SIZE)
    logits = x_final @ Wout + b_out
    probs = softmax(logits, axis=-1)

    # The last position's distribution picks the next token. The result is cast
    # to a plain int so it is displayed as an ordinary number.
    next_id = int(np.argmax(probs[-1]))
    st.write("Predicted next token ID:", next_id)
    st.info("In a full model, you’d map that ID → word via the vocab dictionary.")