# Hugging Face Spaces app — Token Embedding & Positional Encoding demo.
# NOTE(review): the "Spaces: / Runtime error" lines previously here were
# Hugging Face Spaces UI banners captured with the page, not program text.
import os

# Disable Streamlit's automatic file watcher BEFORE importing streamlit/torch:
# the watcher walks module __path__ attributes and crashes on torch.classes.
os.environ["STREAMLIT_SERVER_ENABLE_FILE_WATCHER"] = "false"

import sys
import types

import matplotlib.pyplot as plt
import numpy as np
import streamlit as st
import torch  # safe to import now that the file watcher is off
from transformers import GPT2TokenizerFast

# Prevent Streamlit from trying to walk torch.classes' non-standard __path__
# (it is a lazy module whose __path__ raises on iteration).
if isinstance(getattr(sys.modules.get("torch"), "classes", None), types.ModuleType):
    torch.classes.__path__ = []
# --- Page setup ---
st.set_page_config(page_title="Text to Embedding Visualizer", layout="wide")
st.title("🔍 Token Embedding & Positional Encoding Coding Demo")

# --- User inputs: the sentence to visualize and the embedding width ---
sentence = st.text_input("Enter your sentence", "Learning is fun")
embedding_dim = st.slider(
    "Embedding Dimension (even only)",
    min_value=4,
    max_value=64,
    value=8,
    step=2,  # even values only, so sin/cos columns pair up
)
# --- Load tokenizer ---
# The tokenizer files are bundled with the app, so no network access or
# writable cache directory is needed (the Spaces cache workaround is gone).
# GPT2TokenizerFast is already imported at the top of the file.
tokenizer = GPT2TokenizerFast.from_pretrained("./assets/tokenizer", local_files_only=True)

# Tokenize the sentence: ids as a 1-D tensor plus the BPE subword strings.
input_ids = tokenizer.encode(sentence, return_tensors="pt")[0]
tokens = tokenizer.convert_ids_to_tokens(input_ids)
| # st.markdown("### 1️⃣ Tokenization") | |
| # with st.expander("Show Token IDs"): | |
| # st.write("**Tokens:**", tokens) | |
| # st.write("**Token IDs:**", input_ids.tolist()) | |
| st.markdown("### 1️⃣ Tokenization") | |
| with st.expander("Token IDs and Subwords"): | |
| st.write("**Tokens:**", tokens) | |
| st.write("**Token IDs:**", input_ids.tolist()) | |
| with st.expander("📜 Show Code: Tokenization"): | |
| st.code(""" | |
| tokenizer = GPT2TokenizerFast.from_pretrained("gpt2") | |
| input_ids = tokenizer.encode(sentence, return_tensors="pt")[0] | |
| tokens = tokenizer.convert_ids_to_tokens(input_ids) | |
| """, language="python") | |
# --- Embedding Matrix ---
st.markdown("### 2️⃣ Embedding")

torch.manual_seed(0)  # fixed seed so the random embedding table is reproducible
embedding_matrix = torch.nn.Embedding(tokenizer.vocab_size, embedding_dim)
embedded = embedding_matrix(input_ids)

with st.expander("Show Token Embeddings"):
    st.write("Shape:", embedded.shape)
    st.write(embedded)

with st.expander("📜 Show Code: Embedding"):
    st.code(f"""
embedding_matrix = torch.nn.Embedding(tokenizer.vocab_size, {embedding_dim})
embedded = embedding_matrix(input_ids)
""", language="python")
# --- Positional Encoding ---
def get_positional_encoding(seq_len, dim):
    """Return the (seq_len, dim) sinusoidal positional-encoding table.

    Even columns hold sin terms and odd columns cos terms, following the
    "Attention Is All You Need" formulation; `dim` is assumed even.
    """
    positions = torch.arange(0, seq_len, dtype=torch.float32).unsqueeze(1)
    # Frequency per sin/cos column pair: 10000^(-2i/dim).
    freqs = torch.exp(torch.arange(0, dim, 2).float() * (-np.log(10000.0) / dim))
    table = torch.zeros(seq_len, dim)
    table[:, 0::2] = torch.sin(positions * freqs)
    table[:, 1::2] = torch.cos(positions * freqs)
    return table
pos_enc = get_positional_encoding(len(input_ids), embedding_dim)

st.markdown("### 3️⃣ Positional Encoding")
with st.expander("Show Positional Encoding"):
    st.write("Shape:", pos_enc.shape)
    st.write(pos_enc)

with st.expander("📜 Show Code: Positional Encoding"):
    st.code(f'''
def get_positional_encoding(seq_len, dim):
    pe = torch.zeros(seq_len, dim)
    position = torch.arange(0, seq_len).unsqueeze(1).float()
    div_term = torch.exp(torch.arange(0, dim, 2).float() * (-np.log(10000.0) / dim))
    pe[:, 0::2] = torch.sin(position * div_term)
    pe[:, 1::2] = torch.cos(position * div_term)
    return pe

pos_enc = get_positional_encoding(len(input_ids), {embedding_dim})
''', language="python")

# --- Combined Embedding + Position ---
embedded_with_pos = embedded + pos_enc

st.markdown("### 4️⃣ Embedding + Positional Encoding")
with st.expander("Show Combined Embedding"):
    st.write(embedded_with_pos)

with st.expander("📜 Show Code: Add Positional Encoding"):
    st.code("""
embedded_with_pos = embedded + pos_enc
""", language="python")
# --- Approximate Reverse to Token IDs ---
def find_closest_token(vec, emb_matrix):
    """Return the vocab id whose embedding row is most cosine-similar to ``vec``."""
    similarities = torch.nn.functional.cosine_similarity(
        vec.unsqueeze(0), emb_matrix.weight, dim=1
    )
    return int(torch.argmax(similarities))
recovered_ids = [find_closest_token(vec, embedding_matrix) for vec in embedded]
recovered_tokens = tokenizer.convert_ids_to_tokens(recovered_ids)  # BPE subwords
recovered_text = tokenizer.decode(recovered_ids)  # detokenized string

st.markdown("### 5️⃣ Approximate Reverse")
with st.expander("Recovered Tokens and Text"):
    st.write("**Recovered Token IDs:**", recovered_ids)
    st.write("**Recovered Subword Tokens (BPE):**", recovered_tokens)
    st.write("**Recovered Sentence:**", recovered_text)

with st.expander("📜 Show Code: Recover Token IDs and Text"):
    st.code("""
def find_closest_token(vec, emb_matrix):
    sims = torch.nn.functional.cosine_similarity(vec.unsqueeze(0), emb_matrix.weight, dim=1)
    return torch.argmax(sims).item()

recovered_ids = [find_closest_token(vec, embedding_matrix) for vec in embedded]
recovered_tokens = tokenizer.convert_ids_to_tokens(recovered_ids)
recovered_text = tokenizer.decode(recovered_ids)
""", language="python")
# --- Recover Position (Approx) ---
# Subtracting the raw embedding from the combined vector isolates the PE part.
recovered_pos = embedded_with_pos - embedded
position_error = pos_enc - recovered_pos  # should be ~0 up to float error

st.markdown("### 6️⃣ Recovered Positional Encoding")
with st.expander("Compare Recovered vs Original"):
    st.write("**Recovered Positional Encoding:**")
    st.write(recovered_pos)
    st.write("**Difference from Original (should be ~0):**")
    st.write(position_error)

with st.expander("📜 Show Code: Recovered Positional Encoding"):
    st.code("""
recovered_pos = embedded_with_pos - embedded
position_error = pos_enc - recovered_pos
""", language="python")
# Estimate a token's position by matching its positional-encoding row against
# every row of a reference table via cosine similarity.
def estimate_position_from_encoding(pe_row, full_table):
    """Return the index of the ``full_table`` row most similar to ``pe_row``."""
    scores = torch.nn.functional.cosine_similarity(
        pe_row.unsqueeze(0), full_table, dim=1
    )
    return int(torch.argmax(scores))
# Reference table of known encodings for positions 0..N-1.
reference_pos_table = get_positional_encoding(seq_len=len(input_ids), dim=embedding_dim)

# Estimate each token's position from its recovered encoding.
estimated_positions = [
    estimate_position_from_encoding(row, reference_pos_table) for row in recovered_pos
]

st.markdown("### 7️⃣ Estimate Position from Positional Encoding")
with st.expander("Recovered Positions"):
    st.write("**Estimated Token Positions:**", estimated_positions)
    st.write("**Original True Positions:**", list(range(len(input_ids))))

with st.expander("📜 Show Code: Estimate Positions"):
    st.code("""
def estimate_position_from_encoding(pe_row, full_table):
    sims = torch.nn.functional.cosine_similarity(pe_row.unsqueeze(0), full_table, dim=1)
    return torch.argmax(sims).item()

reference_pos_table = get_positional_encoding(seq_len=len(input_ids), dim=embedding_dim)
estimated_positions = [estimate_position_from_encoding(row, reference_pos_table) for row in recovered_pos]
""", language="python")
| st.markdown("### 📘 Final Notes: Theory & Formulas") | |
| with st.expander("🧠 Theory and Formulas"): | |
| st.markdown(r""" | |
| ### 1️⃣ Tokenization (BPE) | |
| We use **Byte Pair Encoding (BPE)** to break text into subword units. | |
| For example: | |
| "Learning is fun" → ["Learning", "Ġis", "Ġfun"] | |
| Note: The "Ġ" indicates a **space** before the token. | |
| --- | |
| ### 2️⃣ Embedding | |
| Each token ID $t_i \in \mathbb{Z}$ is mapped to a dense vector: | |
| $$ | |
| \text{Embedding}(t_i) = \mathbf{e}_i \in \mathbb{R}^d | |
| $$ | |
| Where: | |
| - $t_i$: token ID | |
| - $\mathbf{e}_i$: embedding vector of dimension $d$ | |
| --- | |
| ### 3️⃣ Sinusoidal Positional Encoding | |
| Used to encode the **position $p$** of a token without learnable parameters: | |
| $$ | |
| \text{PE}(p, 2i) = \sin\left(\frac{p}{10000^{\frac{2i}{d}}}\right) | |
| $$ | |
| $$ | |
| \text{PE}(p, 2i+1) = \cos\left(\frac{p}{10000^{\frac{2i}{d}}}\right) | |
| $$ | |
| Where: | |
| - $p$: position index (0, 1, 2, …) | |
| - $i$: dimension index | |
| - $d$: total embedding dimension | |
| This gives a positional vector $\text{PE}(p) \in \mathbb{R}^d$ | |
| --- | |
| ### 4️⃣ Add Embedding and Positional Encoding | |
| We add the embedding and positional encoding element-wise: | |
| $$ | |
| \mathbf{z}_i = \mathbf{e}_i + \text{PE}(p_i) | |
| $$ | |
| Where: | |
| - $\mathbf{z}_i$: final input to the transformer | |
| --- | |
| ### 5️⃣ Reverse Lookup (Approximate) | |
| We find the nearest embedding using cosine similarity: | |
| $$ | |
| \hat{t}_i = \underset{j}{\arg\max} \left( \frac{ \mathbf{z}_i \cdot \mathbf{e}_j }{ \| \mathbf{z}_i \| \, \| \mathbf{e}_j \| } \right) | |
| $$ | |
| --- | |
| ### 6️⃣ Recover Position from Embedding + PE | |
| To isolate positional encoding: | |
| $$ | |
| \text{Recovered PE}_i = \mathbf{z}_i - \mathbf{e}_i | |
| $$ | |
| We then compare this with reference positional encodings to estimate token position. | |
| --- | |
| ### 🌟 Summary Table | |
| | Step | What Happens | | |
| |------|--------------| | |
| | **Tokenization** | Sentence → Subwords → Token IDs | | |
| | **Embedding** | Token IDs → Vectors | | |
| | **Pos Encoding** | Position Index → Sin/Cos Vector | | |
| | **Sum** | Embedding + PE = Input to Transformer | | |
| | **Reverse** | Approximate token ID from vector | | |
| | **PE Recovery** | Recover position using similarity | | |
| """, unsafe_allow_html=True) | |
| st.markdown("### 🤖 Transformer Internals: Key Concepts") | |
| with st.expander("🔁 Multi-Head Attention: Q, K, V Projections"): | |
| st.markdown(r""" | |
| Each token embedding $\mathbf{x}_i$ is linearly projected into: | |
| - Query vector: $Q_i = \mathbf{x}_i W^Q$ | |
| - Key vector: $K_i = \mathbf{x}_i W^K$ | |
| - Value vector: $V_i = \mathbf{x}_i W^V$ | |
| All of shape: $\mathbb{R}^{d_{model} \times d_{head}}$ | |
| Multiple such projections (heads) run in parallel: | |
| $$ | |
| \text{MultiHead}(X) = \text{Concat}(\text{head}_1, ..., \text{head}_h) W^O | |
| $$ | |
| Each head does: | |
| $$ | |
| \text{Attention}(Q, K, V) = \text{softmax}\left( \frac{Q K^\top}{\sqrt{d_k}} \right) V | |
| $$ | |
| """, unsafe_allow_html=True) | |
| with st.expander("🧠 Contextualized Representations"): | |
| st.markdown(r""" | |
| The attention mechanism lets each token **attend to others**, allowing the output for each token to contain **context**. | |
| For example: | |
| - Token "fun" gets influenced by "is" and "learning" | |
| - The output is no longer static but dynamic, depending on sentence context | |
| This is what makes Transformers powerful for understanding relationships between tokens. | |
| """) | |
| with st.expander("🛠 Feed-Forward Neural Network (FFN)"): | |
| st.markdown(r""" | |
| After attention, each token’s vector goes through a two-layer feed-forward network applied independently: | |
| $$ | |
| \text{FFN}(x) = \max(0, x W_1 + b_1) W_2 + b_2 | |
| $$ | |
| This allows deeper transformations on each token representation. | |
| """) | |
| with st.expander("📊 Softmax Over Vocabulary"): | |
| st.markdown(r""" | |
| The final output layer transforms each token representation to **logits** for the full vocabulary. | |
| Then, softmax is applied to convert them into probabilities: | |
| $$ | |
| P(w_i \mid \text{context}) = \frac{\exp(\text{logit}_i)}{\sum_j \exp(\text{logit}_j)} | |
| $$ | |
| The token with the highest probability is typically selected as the **predicted next word**. | |
| """) | |
| with st.expander("🔮 Predicted Next Token"): | |
| st.markdown(r""" | |
| By chaining all steps (embedding → attention → FFN → softmax), the model predicts the **next token**: | |
| E.g., | |
| Input: `"Learning is"` | |
| Predicted next token: `"fun"` | |
| This is how autoregressive models like GPT-2 **generate text** one token at a time. | |
| """) | |
| st.markdown("### 🎨 Visualizations: Transformer Internals") | |
| # ---- 1. Attention Heatmap ---- | |
| with st.expander("🔁 Multi-Head Attention Score Heatmap (QKᵀ / √d)"): | |
| st.markdown(""" | |
| This heatmap shows how the attention mechanism scores each query against all keys. | |
| Brighter color = higher attention weight. | |
| $$ | |
| \\text{Attention}(Q, K, V) = \\text{softmax}\\left( \\frac{QK^T}{\\sqrt{d_k}} \\right)V | |
| $$ | |
| """, unsafe_allow_html=True) | |
| tokens = ["Learning", "is", "fun"] | |
| Q = np.array([[1, 0], [0.5, 0.5], [0, 1]]) | |
| K = np.array([[1, 0], [0.5, 0.5], [0, 1]]) | |
| scores = np.dot(Q, K.T) / np.sqrt(2) | |
| softmax_scores = np.exp(scores) / np.sum(np.exp(scores), axis=1, keepdims=True) | |
| fig1, ax1 = plt.subplots() | |
| cax = ax1.matshow(softmax_scores, cmap="Blues") | |
| fig1.colorbar(cax) | |
| ax1.set_xticks(np.arange(len(tokens))) | |
| ax1.set_xticklabels(tokens) | |
| ax1.set_yticks(np.arange(len(tokens))) | |
| ax1.set_yticklabels(tokens) | |
| ax1.set_xlabel("Key Tokens (K)") | |
| ax1.set_ylabel("Query Tokens (Q)") | |
| ax1.set_title("Attention Score Heatmap") | |
| st.pyplot(fig1) | |
# ---- 2. Softmax Curve ----
with st.expander("📊 Softmax Curve for Vocabulary Logits"):
    st.markdown("""
This curve shows how softmax converts logits into probabilities.
Higher logits result in higher predicted probabilities:
$$
\\text{Softmax}(x_i) = \\frac{e^{x_i}}{\\sum_j e^{x_j}}
$$
""", unsafe_allow_html=True)

    # Three logit curves shifted by ±1, normalized against each other
    # point-wise (softmax over axis 0, i.e. across the three "tokens").
    xs = np.linspace(-4, 4, 100)
    logit_rows = np.vstack([xs, xs + 1, xs - 1])
    exp_rows = np.exp(logit_rows)
    probs = exp_rows / np.sum(exp_rows, axis=0)

    fig2, ax2 = plt.subplots()
    ax2.plot(xs, probs[0], label='Token A')
    ax2.plot(xs, probs[1], label='Token B')
    ax2.plot(xs, probs[2], label='Token C')
    ax2.set_title("Softmax Output vs Logit Value")
    ax2.set_xlabel("Logit")
    ax2.set_ylabel("Probability")
    ax2.legend()
    st.pyplot(fig2)