# LLM Token & Attention Explorer with Streamlit
# Features: Tokenization, OpenAI Embeddings, Positional Encoding, Final Tensor,
#           Multi-Head Attention Simulation

import streamlit as st
import numpy as np
import tiktoken
import os
from openai import OpenAI
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.decomposition import PCA  # NOTE(review): unused — kept to preserve the file's import surface

st.set_page_config(page_title="LLM Token Explorer", layout="centered")
st.title("๐Ÿง  LLM Attention Explorer: Tokens, Embeddings, Positional Encoding, and Multi-Head Visualization")

# ---------------------------------------------------------------------------
# Introductory explanations
# ---------------------------------------------------------------------------
with st.expander("โ„น๏ธ About This App", expanded=True):
    st.markdown("""
This interactive app lets you explore how Large Language Models (LLMs) like GPT-3/4 work internally.
You'll learn about tokenization, embeddings, positional encoding, and multi-head self-attention
through real-time visualizations and simulations.
""")

with st.expander("๐Ÿงพ What is a Token?"):
    st.markdown("""
A token is a basic unit of text. It could be as small as a character or as large as a word depending on the tokenizer.
GPT models use subword tokenization (like Byte-Pair Encoding), meaning common patterns get their own token.

For example:
- "apple" โ†’ might be 1 token
- "unhappiness" โ†’ might be split into ["un", "happiness"]
""")

with st.expander("๐Ÿ“Œ What Are Embeddings?"):
    st.markdown("""
Embeddings are high-dimensional vectors that represent the meaning of each token.
Similar tokens (like 'cat' and 'dog') have embeddings that are close in space.
They're used by the model to perform mathematical operations on language.
""")

with st.expander("๐Ÿ“ Why Positional Encoding?"):
    st.markdown("""
Since transformers process all tokens in parallel and not sequentially, they need to know token positions.
Positional encodings are added to token embeddings to give each token a unique place in the sequence.
""")

with st.expander("๐Ÿง  What is Self-Attention?"):
    st.markdown("""
Self-attention allows the model to weigh the importance of each token in a sentence when encoding a specific token.
For example, in "The cat sat because it was tired", attention helps "it" focus more on "cat" than other words.
""")

with st.expander("๐Ÿ” Understanding Multi-Head Attention"):
    st.markdown("""
Each attention head learns different aspects of language. For example:
- One head might learn grammar structure.
- Another might learn long-distance relationships.

Heads run in parallel and their outputs are concatenated to form a rich representation of each token.
""")

# OpenAI client: key comes from the environment. A missing key is surfaced
# both here and via explicit error handling around the embeddings call below.
client = OpenAI(api_key=os.getenv("OPENAI_API_KEY"))
st.text(f"OpenAI key found: {'Yes' if os.getenv('OPENAI_API_KEY') else 'No'}")

st.header("โœ๏ธ Input Text")
input_text = st.text_area("Enter your text:", height=150)
tokenizer_name = st.selectbox("Choose tokenizer:", ["cl100k_base", "p50k_base", "r50k_base", "gpt2"])


def get_positional_encoding(seq_len: int, dim: int) -> np.ndarray:
    """Return the sinusoidal positional-encoding matrix, shape (seq_len, dim).

    Even columns hold sin(pos * 10000^(-i/dim)) and odd columns the matching
    cos, as in "Attention Is All You Need". Fully vectorized — the original
    Python double loop was O(seq_len * dim) interpreted iterations, which is
    slow for a 1536-dim embedding.
    """
    positions = np.arange(seq_len)[:, np.newaxis]         # (seq_len, 1)
    even_idx = np.arange(0, dim, 2)                       # i = 0, 2, 4, ...
    div_term = np.exp(even_idx * -np.log(10000.0) / dim)  # (ceil(dim/2),)
    pe = np.zeros((seq_len, dim))
    pe[:, 0::2] = np.sin(positions * div_term)
    # Odd columns reuse the div_term of the preceding even index; the slice
    # handles odd `dim` (one fewer cos column than sin columns).
    pe[:, 1::2] = np.cos(positions * div_term[: dim // 2])
    return pe


if input_text:
    # ------------------------------------------------------------------
    # Tokenization
    # ------------------------------------------------------------------
    st.subheader("๐Ÿ”ค Tokenization")
    enc = tiktoken.get_encoding(tokenizer_name)
    tokens = enc.encode(input_text)
    token_strings = [enc.decode([t]) for t in tokens]

    with st.expander("๐Ÿงพ Token IDs", expanded=True):
        st.write(tokens)
    with st.expander("๐Ÿ“– Decoded Tokens", expanded=True):
        st.write(token_strings)
    st.info(f"Token count: {len(tokens)}")

    fig, ax = plt.subplots()
    ax.bar(range(len(tokens)), tokens, tick_label=token_strings)
    ax.set_xlabel("Token")
    ax.set_ylabel("Token ID")
    ax.set_title("Token IDs for Input Text")
    # Rotate on the owned Axes rather than via global pyplot state.
    plt.setp(ax.get_xticklabels(), rotation=45, ha='right')
    st.pyplot(fig)
    plt.close(fig)  # free the figure; Streamlit already rendered a copy

    # ------------------------------------------------------------------
    # Embeddings — one batched API request instead of one call per token.
    # ------------------------------------------------------------------
    st.subheader("๐Ÿ”— OpenAI Token Embeddings")
    try:
        response = client.embeddings.create(
            input=token_strings, model="text-embedding-ada-002"
        )
    except Exception as exc:
        # Show auth/network failures in the UI instead of crashing mid-render.
        st.error(f"Embedding request failed: {exc}")
        st.stop()
    # The API returns `data` in the same order as the input list.
    embeddings = [item.embedding for item in response.data]

    for tok, embedding in zip(token_strings, embeddings):
        with st.expander(f"๐Ÿ”ธ '{tok}' Embedding", expanded=True):
            st.write(embedding)
            fig, ax = plt.subplots(figsize=(8, 1))
            sns.heatmap(np.array(embedding).reshape(1, -1), cmap="viridis", cbar=True, ax=ax)
            ax.set_title("Embedding Heatmap")
            ax.axis('off')
            st.pyplot(fig)
            plt.close(fig)  # avoid accumulating one open figure per token per rerun
    st.success("Generated embeddings for all tokens.")

    # ------------------------------------------------------------------
    # Positional encoding and the combined input tensor
    # ------------------------------------------------------------------
    st.subheader("๐Ÿ“ Positional Encoding")
    dim = len(embeddings[0])
    PE = get_positional_encoding(len(tokens), dim)
    with st.expander("๐Ÿ“ Positional Encoding Matrix", expanded=True):
        st.write(PE)

    st.subheader("๐Ÿงฎ Final Input Tensor (Embedding + PE)")
    embedded = np.array(embeddings)
    combined = embedded + PE
    with st.expander("๐Ÿงพ Final Tensor", expanded=True):
        st.write(combined)

    # ------------------------------------------------------------------
    # Toy multi-head self-attention. NOTE: it runs on RANDOM 32-dim vectors
    # and random projection weights, NOT on `combined` above — it illustrates
    # the mechanics of attention, not the real model's attention pattern.
    # ------------------------------------------------------------------
    st.subheader("๐Ÿง  Simulated Multi-Head Self-Attention")
    if st.button("Simulate Attention"):
        embed_dim = 32
        num_heads = 4
        head_dim = embed_dim // num_heads
        x = np.random.randn(len(tokens), embed_dim)
        W_q, W_k, W_v = [np.random.randn(embed_dim, embed_dim) for _ in range(3)]
        Q = x @ W_q
        K = x @ W_k
        V = x @ W_v

        def split_heads(t):
            # (tokens, embed_dim) -> (heads, tokens, head_dim)
            return t.reshape(len(tokens), num_heads, head_dim).transpose(1, 0, 2)

        Qh, Kh, Vh = split_heads(Q), split_heads(K), split_heads(V)

        def attention(q, k, v):
            # Scaled dot-product attention with a numerically stable softmax
            # (max-subtraction before exp).
            scores = q @ k.T / np.sqrt(k.shape[-1])
            weights = np.exp(scores - np.max(scores, axis=-1, keepdims=True))
            weights /= np.sum(weights, axis=-1, keepdims=True)
            return weights @ v, weights

        outputs = []
        for i in range(num_heads):
            out, weights = attention(Qh[i], Kh[i], Vh[i])
            with st.expander(f"Head {i+1}"):
                st.write("Q:", Qh[i])
                st.write("K:", Kh[i])
                st.write("V:", Vh[i])
                st.write("Attention Weights:", weights)
                fig, ax = plt.subplots()
                sns.heatmap(weights, cmap="Blues", ax=ax)
                ax.set_title("Attention Weights Heatmap")
                st.pyplot(fig)
                plt.close(fig)
            outputs.append(out)

        # Concatenate the per-head outputs back to (tokens, embed_dim).
        final = np.concatenate(outputs, axis=-1)
        with st.expander("๐Ÿงฉ Concatenated Output"):
            st.write(final)

    with st.expander("๐Ÿ“Š Transformer and GPT Model Component Comparison (Table)", expanded=True):
        st.markdown("""
| Parameter | Original Transformer (2017) | GPT-2 (2019) | GPT-3 (2020) | GPT-4 (2023, est.) |
|----------------------------------|------------------------------|-------------------------|--------------------------|----------------------------|
| **Max Context Length (tokens)** | 512 | 1024 | 2048 | 8192 / 32,768 |
| **Vocab Size** | ~37,000 (BPE) | 50,257 | 50,257 | ~100,000 (multimodal-aware) |
| **Embedding Dimension (D)** | 512 | 768 โ€“ 1600 | 12,288 | 12,288+ |
| **Layers / Transformer Blocks** | 6 (base), 12 (large) | 12 โ€“ 48 (XL) | 96 | ~120 โ€“ 160 (est.) |
| **Self-Attention Heads** | 8 | 12 โ€“ 25 | 96 | 120 โ€“ 128+ (est.) |
| **Dim per Attention Head** | 64 | 64 | 128 | ~128 |
| **Batch Size (training)** | ~25k tokens | ~512 โ€“ 2048 tokens | ~3.2M tokens | Multi-million tokens (est.) |
| **Tensor Shape** | [Batch, Tokens, Dim] | Same | Same | Same |
| **Parameters (Total)** | ~65M | 124M โ€“ 1.5B | 175B | ~500B โ€“ 1T+ (speculative) |

**Explanations:**
- **Context Length**: Max number of tokens the model can see at once.
- **Embedding Dim**: Size of token vectors.
- **Layers**: Depth of the network (attention + FFN).
- **Heads**: Parallel attention mechanisms.
- **Dim per Head**: Each head gets a slice of the full embedding.
- **Tensor Shape**: Internal model shape: [Batch, Tokens, Embedding].
""")