"""Streamlit demo: token embeddings + sinusoidal positional encodings.

Walks through tokenization (GPT-2 BPE), embedding lookup, positional
encoding, their sum, and approximate inversion of each step, with the
corresponding code and theory shown alongside every stage.
"""

import os

# Turn off Streamlit's automatic file-watching BEFORE importing streamlit:
# the watcher walks module __path__ attributes and chokes on torch.classes.
os.environ["STREAMLIT_SERVER_ENABLE_FILE_WATCHER"] = "false"

import sys
import types

import matplotlib.pyplot as plt
import numpy as np
import streamlit as st
import torch
from transformers import GPT2TokenizerFast

# Prevent Streamlit from trying to walk torch.classes' non-standard __path__
if isinstance(getattr(sys.modules.get("torch"), "classes", None), types.ModuleType):
    torch.classes.__path__ = []

# --- Setup ---
st.set_page_config(page_title="Text to Embedding Visualizer", layout="wide")
st.title("🔍 Token Embedding & Positional Encoding Coding Demo")

# --- Input UI ---
sentence = st.text_input("Enter your sentence", "Learning is fun")
embedding_dim = st.slider("Embedding Dimension (even only)", min_value=4, max_value=64, value=8, step=2)

# --- Load tokenizer ---
# Load tokenizer from bundled local files only (works offline, e.g. on Spaces
# where the default HF cache directory is not writable).
tokenizer = GPT2TokenizerFast.from_pretrained("./assets/tokenizer", local_files_only=True)

input_ids = tokenizer.encode(sentence, return_tensors="pt")[0]
tokens = tokenizer.convert_ids_to_tokens(input_ids)

st.markdown("### 1️⃣ Tokenization")
with st.expander("Token IDs and Subwords"):
    st.write("**Tokens:**", tokens)
    st.write("**Token IDs:**", input_ids.tolist())

with st.expander("📜 Show Code: Tokenization"):
    st.code("""
tokenizer = GPT2TokenizerFast.from_pretrained("gpt2")
input_ids = tokenizer.encode(sentence, return_tensors="pt")[0]
tokens = tokenizer.convert_ids_to_tokens(input_ids)
""", language="python")

# --- Embedding Matrix ---
torch.manual_seed(0)  # Reproducibility: same random embedding weights every run
embedding_matrix = torch.nn.Embedding(tokenizer.vocab_size, embedding_dim)
embedded = embedding_matrix(input_ids)

st.markdown("### 2️⃣ Embedding")
with st.expander("Show Token Embeddings"):
    st.write("Shape:", embedded.shape)
    st.write(embedded)

with st.expander("📜 Show Code: Embedding"):
    st.code(f"""
embedding_matrix = torch.nn.Embedding(tokenizer.vocab_size, {embedding_dim})
embedded = embedding_matrix(input_ids)
""", language="python")


# --- Positional Encoding ---
def get_positional_encoding(seq_len, dim):
    """Return the sinusoidal positional-encoding table, shape (seq_len, dim).

    PE(p, 2i)   = sin(p / 10000^(2i/dim))
    PE(p, 2i+1) = cos(p / 10000^(2i/dim))

    `dim` must be even (sin/cos pairs fill alternating columns; the UI
    slider enforces this with step=2).
    """
    pe = torch.zeros(seq_len, dim)
    position = torch.arange(0, seq_len, dtype=torch.float32).unsqueeze(1)
    div_term = torch.exp(torch.arange(0, dim, 2).float() * (-np.log(10000.0) / dim))
    pe[:, 0::2] = torch.sin(position * div_term)
    pe[:, 1::2] = torch.cos(position * div_term)
    return pe


pos_enc = get_positional_encoding(len(input_ids), embedding_dim)

st.markdown("### 3️⃣ Positional Encoding")
with st.expander("Show Positional Encoding"):
    st.write("Shape:", pos_enc.shape)
    st.write(pos_enc)

with st.expander("📜 Show Code: Positional Encoding"):
    st.code(f'''
def get_positional_encoding(seq_len, dim):
    pe = torch.zeros(seq_len, dim)
    position = torch.arange(0, seq_len).unsqueeze(1).float()
    div_term = torch.exp(torch.arange(0, dim, 2).float() * (-np.log(10000.0) / dim))
    pe[:, 0::2] = torch.sin(position * div_term)
    pe[:, 1::2] = torch.cos(position * div_term)
    return pe

pos_enc = get_positional_encoding(len(input_ids), {embedding_dim})
''', language="python")

# --- Combined Embedding + Position ---
embedded_with_pos = embedded + pos_enc

st.markdown("### 4️⃣ Embedding + Positional Encoding")
with st.expander("Show Combined Embedding"):
    st.write(embedded_with_pos)

with st.expander("📜 Show Code: Add Positional Encoding"):
    st.code("""
embedded_with_pos = embedded + pos_enc
""", language="python")


# --- Approximate Reverse to Token IDs ---
def find_closest_token(vec, emb_matrix):
    """Return the vocab ID whose embedding is most cosine-similar to `vec`."""
    sims = torch.nn.functional.cosine_similarity(vec.unsqueeze(0), emb_matrix.weight, dim=1)
    return torch.argmax(sims).item()


recovered_ids = [find_closest_token(vec, embedding_matrix) for vec in embedded]
recovered_tokens = tokenizer.convert_ids_to_tokens(recovered_ids)  # subword (BPE) pieces
recovered_text = tokenizer.decode(recovered_ids)                   # final string

st.markdown("### 5️⃣ Approximate Reverse")
with st.expander("Recovered Tokens and Text"):
    st.write("**Recovered Token IDs:**", recovered_ids)
    st.write("**Recovered Subword Tokens (BPE):**", recovered_tokens)
    st.write("**Recovered Sentence:**", recovered_text)

with st.expander("📜 Show Code: Recover Token IDs and Text"):
    st.code("""
def find_closest_token(vec, emb_matrix):
    sims = torch.nn.functional.cosine_similarity(vec.unsqueeze(0), emb_matrix.weight, dim=1)
    return torch.argmax(sims).item()

recovered_ids = [find_closest_token(vec, embedding_matrix) for vec in embedded]
recovered_tokens = tokenizer.convert_ids_to_tokens(recovered_ids)
recovered_text = tokenizer.decode(recovered_ids)
""", language="python")

# --- Recover Position (Approx) ---
# Subtracting the raw embedding isolates the positional component exactly,
# so position_error should be ~0 (up to float round-off).
recovered_pos = embedded_with_pos - embedded
position_error = pos_enc - recovered_pos

st.markdown("### 6️⃣ Recovered Positional Encoding")
with st.expander("Compare Recovered vs Original"):
    st.write("**Recovered Positional Encoding:**")
    st.write(recovered_pos)
    st.write("**Difference from Original (should be ~0):**")
    st.write(position_error)

with st.expander("📜 Show Code: Recovered Positional Encoding"):
    st.code("""
recovered_pos = embedded_with_pos - embedded
position_error = pos_enc - recovered_pos
""", language="python")


# Estimate position from positional encoding using cosine similarity
def estimate_position_from_encoding(pe_row, full_table):
    """Return the row index of `full_table` most cosine-similar to `pe_row`."""
    sims = torch.nn.functional.cosine_similarity(pe_row.unsqueeze(0), full_table, dim=1)
    return torch.argmax(sims).item()


# Build reference table of known encodings for positions 0 to N
reference_pos_table = get_positional_encoding(seq_len=len(input_ids), dim=embedding_dim)

# Now estimate each token's position
estimated_positions = [estimate_position_from_encoding(row, reference_pos_table) for row in recovered_pos]

st.markdown("### 7️⃣ Estimate Position from Positional Encoding")
with st.expander("Recovered Positions"):
    st.write("**Estimated Token Positions:**", estimated_positions)
    st.write("**Original True Positions:**", list(range(len(input_ids))))

with st.expander("📜 Show Code: Estimate Positions"):
    st.code("""
def estimate_position_from_encoding(pe_row, full_table):
    sims = torch.nn.functional.cosine_similarity(pe_row.unsqueeze(0), full_table, dim=1)
    return torch.argmax(sims).item()

reference_pos_table = get_positional_encoding(seq_len=len(input_ids), dim=embedding_dim)
estimated_positions = [estimate_position_from_encoding(row, reference_pos_table) for row in recovered_pos]
""", language="python")

st.markdown("### 📘 Final Notes: Theory & Formulas")
with st.expander("🧠 Theory and Formulas"):
    st.markdown(r"""
### 1️⃣ Tokenization (BPE)

We use **Byte Pair Encoding (BPE)** to break text into subword units. For example:

    "Learning is fun" → ["Learning", "Ġis", "Ġfun"]

Note: The "Ġ" indicates a **space** before the token.

---

### 2️⃣ Embedding

Each token ID $t_i \in \mathbb{Z}$ is mapped to a dense vector:

$$
\text{Embedding}(t_i) = \mathbf{e}_i \in \mathbb{R}^d
$$

Where:
- $t_i$: token ID
- $\mathbf{e}_i$: embedding vector of dimension $d$

---

### 3️⃣ Sinusoidal Positional Encoding

Used to encode the **position $p$** of a token without learnable parameters:

$$
\text{PE}(p, 2i) = \sin\left(\frac{p}{10000^{\frac{2i}{d}}}\right)
$$

$$
\text{PE}(p, 2i+1) = \cos\left(\frac{p}{10000^{\frac{2i}{d}}}\right)
$$

Where:
- $p$: position index (0, 1, 2, …)
- $i$: dimension index
- $d$: total embedding dimension

This gives a positional vector $\text{PE}(p) \in \mathbb{R}^d$

---

### 4️⃣ Add Embedding and Positional Encoding

We add the embedding and positional encoding element-wise:

$$
\mathbf{z}_i = \mathbf{e}_i + \text{PE}(p_i)
$$

Where:
- $\mathbf{z}_i$: final input to the transformer

---

### 5️⃣ Reverse Lookup (Approximate)

We find the nearest embedding using cosine similarity:

$$
\hat{t}_i = \underset{j}{\arg\max} \left( \frac{ \mathbf{z}_i \cdot \mathbf{e}_j }{ \| \mathbf{z}_i \| \, \| \mathbf{e}_j \| } \right)
$$

---

### 6️⃣ Recover Position from Embedding + PE

To isolate positional encoding:

$$
\text{Recovered PE}_i = \mathbf{z}_i - \mathbf{e}_i
$$

We then compare this with reference positional encodings to estimate token position.

---

### 🌟 Summary Table

| Step | What Happens |
|------|--------------|
| **Tokenization** | Sentence → Subwords → Token IDs |
| **Embedding** | Token IDs → Vectors |
| **Pos Encoding** | Position Index → Sin/Cos Vector |
| **Sum** | Embedding + PE = Input to Transformer |
| **Reverse** | Approximate token ID from vector |
| **PE Recovery** | Recover position using similarity |
""", unsafe_allow_html=True)

st.markdown("### 🤖 Transformer Internals: Key Concepts")

with st.expander("🔁 Multi-Head Attention: Q, K, V Projections"):
    st.markdown(r"""
Each token embedding $\mathbf{x}_i$ is linearly projected into:

- Query vector: $Q_i = \mathbf{x}_i W^Q$
- Key vector: $K_i = \mathbf{x}_i W^K$
- Value vector: $V_i = \mathbf{x}_i W^V$

All of shape: $\mathbb{R}^{d_{model} \times d_{head}}$

Multiple such projections (heads) run in parallel:

$$
\text{MultiHead}(X) = \text{Concat}(\text{head}_1, ..., \text{head}_h) W^O
$$

Each head does:

$$
\text{Attention}(Q, K, V) = \text{softmax}\left( \frac{Q K^\top}{\sqrt{d_k}} \right) V
$$
""", unsafe_allow_html=True)

with st.expander("🧠 Contextualized Representations"):
    st.markdown(r"""
The attention mechanism lets each token **attend to others**, allowing the output for each token to contain **context**.

For example:
- Token "fun" gets influenced by "is" and "learning"
- The output is no longer static but dynamic, depending on sentence context

This is what makes Transformers powerful for understanding relationships between tokens.
""")

with st.expander("🛠 Feed-Forward Neural Network (FFN)"):
    st.markdown(r"""
After attention, each token’s vector goes through a two-layer feed-forward network applied independently:

$$
\text{FFN}(x) = \max(0, x W_1 + b_1) W_2 + b_2
$$

This allows deeper transformations on each token representation.
""")

with st.expander("📊 Softmax Over Vocabulary"):
    st.markdown(r"""
The final output layer transforms each token representation to **logits** for the full vocabulary.

Then, softmax is applied to convert them into probabilities:

$$
P(w_i \mid \text{context}) = \frac{\exp(\text{logit}_i)}{\sum_j \exp(\text{logit}_j)}
$$

The token with the highest probability is typically selected as the **predicted next word**.
""")

with st.expander("🔮 Predicted Next Token"):
    st.markdown(r"""
By chaining all steps (embedding → attention → FFN → softmax), the model predicts the **next token**:

E.g., Input: `"Learning is"` Predicted next token: `"fun"`

This is how autoregressive models like GPT-2 **generate text** one token at a time.
""")

st.markdown("### 🎨 Visualizations: Transformer Internals")

# ---- 1. Attention Heatmap ----
with st.expander("🔁 Multi-Head Attention Score Heatmap (QKᵀ / √d)"):
    st.markdown("""
This heatmap shows how the attention mechanism scores each query against all keys.
Brighter color = higher attention weight.

$$
\\text{Attention}(Q, K, V) = \\text{softmax}\\left( \\frac{QK^T}{\\sqrt{d_k}} \\right)V
$$
""", unsafe_allow_html=True)

    # Toy 2-d Q/K vectors for a fixed 3-token example (independent of the
    # user's sentence; renamed so it doesn't shadow the real `tokens` above).
    demo_tokens = ["Learning", "is", "fun"]
    Q = np.array([[1, 0], [0.5, 0.5], [0, 1]])
    K = np.array([[1, 0], [0.5, 0.5], [0, 1]])
    scores = np.dot(Q, K.T) / np.sqrt(2)  # scaled dot-product, d_k = 2
    softmax_scores = np.exp(scores) / np.sum(np.exp(scores), axis=1, keepdims=True)

    fig1, ax1 = plt.subplots()
    cax = ax1.matshow(softmax_scores, cmap="Blues")
    fig1.colorbar(cax)
    ax1.set_xticks(np.arange(len(demo_tokens)))
    ax1.set_xticklabels(demo_tokens)
    ax1.set_yticks(np.arange(len(demo_tokens)))
    ax1.set_yticklabels(demo_tokens)
    ax1.set_xlabel("Key Tokens (K)")
    ax1.set_ylabel("Query Tokens (Q)")
    ax1.set_title("Attention Score Heatmap")
    st.pyplot(fig1)

# ---- 2. Softmax Curve ----
with st.expander("📊 Softmax Curve for Vocabulary Logits"):
    st.markdown("""
This curve shows how softmax converts logits into probabilities.

Higher logits result in higher predicted probabilities:

$$
\\text{Softmax}(x_i) = \\frac{e^{x_i}}{\\sum_j e^{x_j}}
$$
""", unsafe_allow_html=True)

    # Three shifted logit curves, normalized column-wise (axis=0) so that at
    # each x the three token probabilities sum to 1.
    x = np.linspace(-4, 4, 100)
    logits = np.vstack([x, x + 1, x - 1])
    exps = np.exp(logits)
    softmax = exps / np.sum(exps, axis=0)

    fig2, ax2 = plt.subplots()
    ax2.plot(x, softmax[0], label='Token A')
    ax2.plot(x, softmax[1], label='Token B')
    ax2.plot(x, softmax[2], label='Token C')
    ax2.set_title("Softmax Output vs Logit Value")
    ax2.set_xlabel("Logit")
    ax2.set_ylabel("Probability")
    ax2.legend()
    st.pyplot(fig2)