transformer-introduction

Sleeping

App Files Files Community

transformer-introduction / src /streamlit_app.py

schoginitoys

Update src/streamlit_app.py

8366831 verified 9 months ago

raw

history blame contribute delete

8.77 kB

	# transformer_demo_persistent.py
	import zlib

	import numpy as np
	import streamlit as st

	# ─── Config ─────────────────────────────────────────────────────────
	# Demo hyper-parameters shared by the helpers and the UI below.
	VOCAB_SIZE = 50257
	SEQ_LEN = 12  # at most this many whitespace-separated words are kept
	D_MODEL = 64  # demo dimensionality
	NUM_HEADS = 4
	D_HEAD = D_MODEL // NUM_HEADS  # width of a single attention head

	# Fixed seed: the random stand-in weights are reproducible from run to run.
	np.random.seed(0)
	# Stand-in embedding table: one small random row per vocabulary entry.
	embedding_matrix = 0.01 * np.random.standard_normal((VOCAB_SIZE, D_MODEL))

	# ─── Helpers ────────────────────────────────────────────────────────
	def get_positional_encoding(seq_len, d_model):
	    """Return the sinusoidal positional-encoding table.

	    Args:
	        seq_len: number of positions (rows) to encode.
	        d_model: number of feature columns.

	    Returns:
	        A float array of shape (seq_len, d_model). Even columns hold
	        sin(pos / 10000**(2k / d_model)) and odd columns hold the matching
	        cosine, where k = column // 2.
	    """
	    pos = np.arange(seq_len)[:, None]  # column vector of positions
	    dim = np.arange(d_model)[None, :]  # row vector of feature indices
	    # Columns 2k and 2k+1 share one frequency, hence the integer division.
	    angle_rates = 1 / (10000 ** (2 * (dim // 2) / d_model))
	    angles = pos * angle_rates
	    pe = np.zeros((seq_len, d_model))
	    pe[:, 0::2] = np.sin(angles[:, 0::2])
	    pe[:, 1::2] = np.cos(angles[:, 1::2])
	    return pe

	def gelu(x):
	    """Gaussian Error Linear Unit (tanh approximation), applied element-wise.

	    Computes x * 0.5 * (1 + tanh(sqrt(2/pi) * (x + 0.044715 * x**3))).
	    Accepts a scalar or a NumPy array and returns the same shape.
	    """
	    # The two explicit multiplications below were missing in the original
	    # text, which made the expression a call on a float / invalid literal.
	    inner = np.sqrt(2 / np.pi) * (x + 0.044715 * np.power(x, 3))
	    return x * 0.5 * (1.0 + np.tanh(inner))

	def softmax(x, axis=-1):
	    """Return the softmax of ``x`` along ``axis``.

	    The per-slice maximum is subtracted before exponentiating; this leaves
	    the result unchanged but keeps ``exp`` from overflowing on large inputs.
	    """
	    e = np.exp(x - np.max(x, axis=axis, keepdims=True))
	    return e / e.sum(axis=axis, keepdims=True)

	def self_attention(Q, K, V):
	    """Scaled dot-product attention for 2-D query/key/value matrices.

	    Args:
	        Q: queries, one row per token.
	        K: keys, one row per token, same width as ``Q``.
	        V: values, one row per token.

	    Returns:
	        (context, weights): ``weights`` is the row-wise softmax of the
	        scaled scores and ``context`` is ``weights @ V``.
	    """
	    # Scale by the actual query/key width rather than a module constant so
	    # the helper stays correct for projections of any size.
	    d_k = Q.shape[-1]
	    scores = Q @ K.T / np.sqrt(d_k)
	    weights = softmax(scores, axis=-1)
	    context = weights @ V
	    return context, weights

	# ─── Streamlit UI ───────────────────────────────────────────────────
	st.title("🔍 Transformer Step-by-Step Demo (Persistent Outputs)")

	# Display model/demo dimensions with explanations
	st.markdown("#### Model & Demo Dimensions")
	# "\\|" yields the same backslash-pipe text as before without relying on
	# an invalid Python escape sequence.
	st.write(f"- Vocabulary size (\\|V\\|): {VOCAB_SIZE} (total tokens)")
	st.write(f"- Sequence length (SEQ_LEN): {SEQ_LEN} (max tokens processed)")
	st.write(f"- Model dimension (d_model): {D_MODEL} (embedding & hidden size)")
	st.write(f"- Number of heads (H): {NUM_HEADS} (parallel attention heads)")
	st.write(f"- Head dimension (d_k): {D_HEAD} (d_model / H)")



	# Free-text prompt. The word limit in the label is derived from SEQ_LEN so
	# the label cannot drift from the truncation applied just below.
	sentence = st.text_input(
	    f"Input (up to {SEQ_LEN} words):",
	    "The quick brown fox jumps over the lazy dog",
	)
	# Whitespace split; anything past the first SEQ_LEN words is dropped.
	words = sentence.split()[:SEQ_LEN]

	# Initialize session state flags (stage_1 .. stage_7) to False on first run;
	# setdefault leaves flags that are already set untouched.
	for i in range(1, 8):
	    st.session_state.setdefault(f"stage_{i}", False)

	# Buttons to toggle each stage
	labels = [
	    "1️⃣ Tokenize","2️⃣ Embed","3️⃣ Pos-Enc",
	    "4️⃣ Self-Attn","5️⃣ Multi-Head","6️⃣ Add&Norm+FFN",
	    "7️⃣ Logits→Next"
	]
	# One column per label, so the two sequences cannot fall out of step.
	cols = st.columns(len(labels))
	for i, (col, label) in enumerate(zip(cols, labels), start=1):
	    # A click only ever sets the flag to True; nothing here resets it.
	    if col.button(label, key=f"btn_{i}"):
	        st.session_state[f"stage_{i}"] = True

	# Precompute shared values
	if not words:
	    # Several stages below index position 0 or -1 of the sequence, which
	    # raises on an empty input; stop this run with a hint instead.
	    st.warning("Please enter at least one word.")
	    st.stop()
	# Built-in hash() of a str is salted per interpreter process, so the IDs
	# would differ after every restart. CRC32 gives stable demo IDs.
	token_ids = [zlib.crc32(w.encode("utf-8")) % VOCAB_SIZE for w in words]
	embeds = embedding_matrix[token_ids]
	pe = get_positional_encoding(len(words), D_MODEL)
	x = embeds + pe  # position-aware embeddings

	# ─── 1️⃣ Tokenization ───────────────────────────────────────────────
	# Shown once the stage-1 flag has been set in session state.
	if st.session_state.stage_1:
	    st.markdown("### 1️⃣ Tokenization")
	    st.write("What: Map each word/subword → unique integer ID using a vocab of size 50257.")
	    st.write("Why: Neural networks require numeric inputs, not raw text.")
	    st.write("Output (token IDs):", token_ids)

	# ─── 2️⃣ Embedding ──────────────────────────────────────────────────
	# Shown once the stage-2 flag has been set in session state.
	if st.session_state.stage_2:
	    st.markdown("### 2️⃣ Embedding")
	    st.write("What: Replace each token ID with a learned dense vector of size d_model.")
	    st.write("Why: Embeddings capture semantic relationships—similar words lie close in vector space.")
	    st.write(f"Shape: {embeds.shape}")
	    st.write(embeds)

	# ─── 3️⃣ Positional Encoding ────────────────────────────────────────
	# Shown once the stage-3 flag has been set in session state.
	if st.session_state.stage_3:
	    st.markdown("### 3️⃣ Positional Encoding")
	    st.write("What: Add sinusoidal vectors so model knows each token’s position in the sequence:")
	    st.latex(r"""
	\mathrm{PE}_{(pos,2k)}=\sin\bigl(\tfrac{pos}{10000^{2k/d_{model}}}\bigr),\quad
	\mathrm{PE}_{(pos,2k+1)}=\cos\bigl(\tfrac{pos}{10000^{2k/d_{model}}}\bigr).
	""")
	    st.write("Why: Self-attention itself is permutation-invariant—positions must be encoded separately.")
	    # x is embedding + positional encoding, so this is the combined vector.
	    st.write("Example vector at position 0:", x[0])

	# ─── 4️⃣ Self-Attention (single head) ───────────────────────────────
	# Shown once the stage-4 flag has been set in session state.
	if st.session_state.stage_4:
	    st.markdown("### 4️⃣ Scaled Dot-Product Self-Attention (Single Head)")
	    st.write("What: For each token, compute:")
	    st.write("- Query (Q): Projection that asks “what am I looking for?”")
	    st.write("- Key (K): Projection that asks “what do you offer?”")
	    st.write("- Value (V): Projection holding actual information to pass forward")
	    st.write("Mathematically:")
	    st.latex(r"""
	Q = XW^Q,\quad
	K = XW^K,\quad
	V = XW^V,
	\quad
	\alpha_{ij}=\frac{Q_i\cdot K_j}{\sqrt{d_k}},\quad
	\beta_{ij}=\mathrm{softmax}_j(\alpha_{ij}),\quad
	C_i=\sum_j \beta_{ij}\,V_j.
	""")
	    # random projection for demo (untrained weights, one head of width D_HEAD)
	    Wq = np.random.randn(D_MODEL, D_HEAD)
	    Wk = np.random.randn(D_MODEL, D_HEAD)
	    Wv = np.random.randn(D_MODEL, D_HEAD)
	    Q, K, V = x @ Wq, x @ Wk, x @ Wv
	    context, weights = self_attention(Q, K, V)
	    st.write("Attention weights for token 0 (softmax across all positions):")
	    st.write(weights[0])
	    st.write("Resulting context vector for token 0:")
	    st.write(context[0])

	# ─── 5️⃣ Multi-Head Attention ───────────────────────────────────────
	# Shown once the stage-5 flag has been set in session state.
	if st.session_state.stage_5:
	    st.markdown("### 5️⃣ Multi-Head Attention")
	    st.write("What: Run H parallel self-attention heads, each with its own (W^Q,W^K,W^V).")
	    st.write("Why: Different heads can focus on different relations—syntax, semantics, coreference, etc.")
	    heads = []
	    for _ in range(NUM_HEADS):
	        # Fresh random projections per head; only the context is kept.
	        Wq = np.random.randn(D_MODEL, D_HEAD)
	        Wk = np.random.randn(D_MODEL, D_HEAD)
	        Wv = np.random.randn(D_MODEL, D_HEAD)
	        c, _weights = self_attention(x @ Wq, x @ Wk, x @ Wv)
	        heads.append(c)
	    # Joining NUM_HEADS blocks of width D_HEAD along the feature axis.
	    multi_c = np.concatenate(heads, axis=-1)
	    st.write(f"Concatenated output shape: {multi_c.shape}")

	# ─── 6️⃣ Add & Norm + Feed-Forward ──────────────────────────────────
	# Shown once the stage-6 flag has been set in session state.
	if st.session_state.stage_6:
	    st.markdown("### 6️⃣ Add & Norm + Position-wise Feed-Forward")
	    st.write("Residual & LayerNorm: Stabilize training by adding input back:")
	    st.write("Z1 = LayerNorm(MHA(X) + X)")
	    st.write("Feed-Forward Network:")
	    st.latex(r"""
	\mathrm{FFN}(Z)=W_2\bigl[\mathrm{GELU}(W_1Z + b_1)\bigr] + b_2,
	""")
	    st.write("GELU(x)≈x·0.5·(1+tanh(√(2/π)(x+0.0447x³))) soft-gates small values.")
	    # simulate MHA + residual: small random noise stands in for the
	    # attention output. randn takes the dimensions as separate arguments.
	    x1 = x + np.random.randn(*x.shape) * 0.01
	    # Layer norm over the feature axis (no learned scale/shift in this demo).
	    ln1 = (x1 - x1.mean(-1, keepdims=True)) / (x1.std(-1, keepdims=True) + 1e-5)
	    # Feed-forward with a 4x wider hidden layer and zero biases.
	    W1 = np.random.randn(D_MODEL, 4 * D_MODEL)
	    b1 = np.zeros(4 * D_MODEL)
	    W2 = np.random.randn(4 * D_MODEL, D_MODEL)
	    b2 = np.zeros(D_MODEL)
	    y = gelu(ln1 @ W1 + b1) @ W2 + b2
	    # Second residual connection followed by the same normalisation.
	    resid = y + ln1
	    ln2 = (resid - resid.mean(-1, keepdims=True)) / (resid.std(-1, keepdims=True) + 1e-5)
	    st.write("Output vector at token 0 after FFN & Add+Norm:")
	    st.write(ln2[0])

	# ─── 7️⃣ Final Projection & Softmax ─────────────────────────────────
	# Shown once the stage-7 flag has been set in session state.
	if st.session_state.stage_7:
	    st.markdown("### 7️⃣ Project to Vocab + Softmax → Next Token")
	    st.write("What: Map each final vector back to logits over 50K-plus vocab.")
	    st.write("Why: Softmax(logits) gives a probability distribution; highest-prob token is chosen.")
	    # Random stand-ins: the final hidden states here are not taken from the
	    # earlier stages, and the output projection is untrained.
	    x_final = np.random.randn(len(words), D_MODEL) * 0.01
	    Wout = np.random.randn(D_MODEL, VOCAB_SIZE)
	    b_out = np.zeros(VOCAB_SIZE)
	    logits = x_final @ Wout + b_out
	    probs = softmax(logits, axis=-1)
	    # The last position's distribution decides the "next" token.
	    next_id = np.argmax(probs[-1])
	    st.write("Predicted next token ID:", next_id)
	    st.info("In a full model, you’d map that ID → word via the vocab dictionary.")