"""Streamlit demo: token embeddings and sinusoidal positional encodings.

Walks a sentence through tokenization → embedding → positional encoding →
sum, then approximately inverts each step (nearest-embedding lookup and
nearest-positional-row lookup via cosine similarity).
"""

import os

# Disable Streamlit's automatic file watching *before* streamlit is imported.
# NOTE(review): the config key is server.fileWatcherType, whose env-var form is
# STREAMLIT_SERVER_FILE_WATCHER_TYPE ("none" turns watching off). The previous
# name STREAMLIT_SERVER_ENABLE_FILE_WATCHER is not a recognized Streamlit
# setting and had no effect.
os.environ["STREAMLIT_SERVER_FILE_WATCHER_TYPE"] = "none"

# Point the Hugging Face cache at a writable directory (needed on Spaces).
# Must be set before `transformers` is imported to take effect.
os.environ["TRANSFORMERS_CACHE"] = "./hf_cache"

import sys
import types

import numpy as np
import torch

# Streamlit's module inspector chokes on torch.classes' non-standard __path__;
# blank it out so the walker skips it instead of raising.
if isinstance(getattr(sys.modules.get("torch"), "classes", None), types.ModuleType):
    torch.classes.__path__ = []

import streamlit as st
from transformers import GPT2TokenizerFast

# --- Page setup ---
st.set_page_config(page_title="Text to Embedding Visualizer", layout="wide")
st.title("🔍 Token Embedding & Positional Encoding Coding Demo")

# --- Input UI ---
sentence = st.text_input("Enter your sentence", "Learning is fun")
embedding_dim = st.slider(
    "Embedding Dimension (even only)", min_value=4, max_value=64, value=8, step=2
)

# --- Tokenizer ---
# Load from the bundled local files only — no network access required.
tokenizer = GPT2TokenizerFast.from_pretrained("./assets/tokenizer", local_files_only=True)

input_ids = tokenizer.encode(sentence, return_tensors="pt")[0]
tokens = tokenizer.convert_ids_to_tokens(input_ids)

st.markdown("### 1️⃣ Tokenization")
with st.expander("Token IDs and Subwords"):
    st.write("**Tokens:**", tokens)
    st.write("**Token IDs:**", input_ids.tolist())

with st.expander("📜 Show Code: Tokenization"):
    # Snippet mirrors the actual local-files load above (the old snippet showed
    # a network download of "gpt2", which is not what this app does).
    st.code(
        """
tokenizer = GPT2TokenizerFast.from_pretrained("./assets/tokenizer", local_files_only=True)
input_ids = tokenizer.encode(sentence, return_tensors="pt")[0]
tokens = tokenizer.convert_ids_to_tokens(input_ids)
""",
        language="python",
    )

# --- Embedding ---
torch.manual_seed(0)  # reproducible random embedding table across reruns
embedding_matrix = torch.nn.Embedding(tokenizer.vocab_size, embedding_dim)
with torch.no_grad():  # pure visualization — keep grad_fn out of the displays
    embedded = embedding_matrix(input_ids)

st.markdown("### 2️⃣ Embedding")
with st.expander("Show Token Embeddings"):
    st.write("Shape:", embedded.shape)
    st.write(embedded)

with st.expander("📜 Show Code: Embedding"):
    st.code(
        f"""
embedding_matrix = torch.nn.Embedding(tokenizer.vocab_size, {embedding_dim})
embedded = embedding_matrix(input_ids)
""",
        language="python",
    )


# --- Positional encoding ---
def get_positional_encoding(seq_len: int, dim: int) -> torch.Tensor:
    """Return the sinusoidal positional-encoding table, shape (seq_len, dim).

    PE(p, 2i)   = sin(p / 10000^(2i/dim))
    PE(p, 2i+1) = cos(p / 10000^(2i/dim))

    `dim` must be even (the UI slider enforces step=2, min=4).
    """
    pe = torch.zeros(seq_len, dim)
    position = torch.arange(0, seq_len, dtype=torch.float32).unsqueeze(1)
    div_term = torch.exp(torch.arange(0, dim, 2).float() * (-np.log(10000.0) / dim))
    pe[:, 0::2] = torch.sin(position * div_term)
    pe[:, 1::2] = torch.cos(position * div_term)
    return pe


pos_enc = get_positional_encoding(len(input_ids), embedding_dim)

st.markdown("### 3️⃣ Positional Encoding")
with st.expander("Show Positional Encoding"):
    st.write("Shape:", pos_enc.shape)
    st.write(pos_enc)

with st.expander("📜 Show Code: Positional Encoding"):
    st.code(
        f'''
def get_positional_encoding(seq_len, dim):
    pe = torch.zeros(seq_len, dim)
    position = torch.arange(0, seq_len).unsqueeze(1).float()
    div_term = torch.exp(torch.arange(0, dim, 2).float() * (-np.log(10000.0) / dim))
    pe[:, 0::2] = torch.sin(position * div_term)
    pe[:, 1::2] = torch.cos(position * div_term)
    return pe

pos_enc = get_positional_encoding(len(input_ids), {embedding_dim})
''',
        language="python",
    )

# --- Combined embedding + position ---
embedded_with_pos = embedded + pos_enc

st.markdown("### 4️⃣ Embedding + Positional Encoding")
with st.expander("Show Combined Embedding"):
    st.write(embedded_with_pos)

with st.expander("📜 Show Code: Add Positional Encoding"):
    st.code(
        """
embedded_with_pos = embedded + pos_enc
""",
        language="python",
    )


# --- Approximate reverse: embedding vector -> token id ---
def find_closest_token(vec: torch.Tensor, emb_matrix: torch.nn.Embedding) -> int:
    """Return the vocab id whose embedding row is most cosine-similar to `vec`."""
    sims = torch.nn.functional.cosine_similarity(vec.unsqueeze(0), emb_matrix.weight, dim=1)
    return torch.argmax(sims).item()


recovered_ids = [find_closest_token(vec, embedding_matrix) for vec in embedded]
recovered_tokens = tokenizer.convert_ids_to_tokens(recovered_ids)  # subword strings
recovered_text = tokenizer.decode(recovered_ids)  # detokenized sentence

st.markdown("### 5️⃣ Approximate Reverse")
with st.expander("Recovered Tokens and Text"):
    st.write("**Recovered Token IDs:**", recovered_ids)
    st.write("**Recovered Subword Tokens (BPE):**", recovered_tokens)
    st.write("**Recovered Sentence:**", recovered_text)

with st.expander("📜 Show Code: Recover Token IDs and Text"):
    st.code(
        """
def find_closest_token(vec, emb_matrix):
    sims = torch.nn.functional.cosine_similarity(vec.unsqueeze(0), emb_matrix.weight, dim=1)
    return torch.argmax(sims).item()

recovered_ids = [find_closest_token(vec, embedding_matrix) for vec in embedded]
recovered_tokens = tokenizer.convert_ids_to_tokens(recovered_ids)
recovered_text = tokenizer.decode(recovered_ids)
""",
        language="python",
    )

# --- Recover the positional encoding by subtraction ---
recovered_pos = embedded_with_pos - embedded
position_error = pos_enc - recovered_pos  # exact subtraction → should be ~0

st.markdown("### 6️⃣ Recovered Positional Encoding")
with st.expander("Compare Recovered vs Original"):
    st.write("**Recovered Positional Encoding:**")
    st.write(recovered_pos)
    st.write("**Difference from Original (should be ~0):**")
    st.write(position_error)

with st.expander("📜 Show Code: Recovered Positional Encoding"):
    st.code(
        """
recovered_pos = embedded_with_pos - embedded
position_error = pos_enc - recovered_pos
""",
        language="python",
    )


# --- Estimate the position index from a recovered PE row ---
def estimate_position_from_encoding(pe_row: torch.Tensor, full_table: torch.Tensor) -> int:
    """Return the row index of `full_table` most cosine-similar to `pe_row`."""
    sims = torch.nn.functional.cosine_similarity(pe_row.unsqueeze(0), full_table, dim=1)
    return torch.argmax(sims).item()


# Reference table of known encodings for positions 0..len(input_ids)-1.
reference_pos_table = get_positional_encoding(seq_len=len(input_ids), dim=embedding_dim)
estimated_positions = [
    estimate_position_from_encoding(row, reference_pos_table) for row in recovered_pos
]

st.markdown("### 7️⃣ Estimate Position from Positional Encoding")
with st.expander("Recovered Positions"):
    st.write("**Estimated Token Positions:**", estimated_positions)
    st.write("**Original True Positions:**", list(range(len(input_ids))))

with st.expander("📜 Show Code: Estimate Positions"):
    st.code(
        """
def estimate_position_from_encoding(pe_row, full_table):
    sims = torch.nn.functional.cosine_similarity(pe_row.unsqueeze(0), full_table, dim=1)
    return torch.argmax(sims).item()

reference_pos_table = get_positional_encoding(seq_len=len(input_ids), dim=embedding_dim)
estimated_positions = [estimate_position_from_encoding(row, reference_pos_table) for row in recovered_pos]
""",
        language="python",
    )

st.markdown("### 📘 Final Notes: Theory & Formulas")
with st.expander("🧠 Theory and Formulas"):
    st.markdown(
        r"""
### 1️⃣ Tokenization (BPE)

We use **Byte Pair Encoding (BPE)** to break text into subword units.
For example:

    "Learning is fun" → ["Learning", "Ġis", "Ġfun"]

Note: The "Ġ" indicates a **space** before the token.

---

### 2️⃣ Embedding

Each token ID $t_i \in \mathbb{Z}$ is mapped to a dense vector:

$$
\text{Embedding}(t_i) = \mathbf{e}_i \in \mathbb{R}^d
$$

Where:
- $t_i$: token ID
- $\mathbf{e}_i$: embedding vector of dimension $d$

---

### 3️⃣ Sinusoidal Positional Encoding

Used to encode the **position $p$** of a token without learnable parameters:

$$
\text{PE}(p, 2i) = \sin\left(\frac{p}{10000^{\frac{2i}{d}}}\right)
$$

$$
\text{PE}(p, 2i+1) = \cos\left(\frac{p}{10000^{\frac{2i}{d}}}\right)
$$

Where:
- $p$: position index (0, 1, 2, …)
- $i$: dimension index
- $d$: total embedding dimension

This gives a positional vector $\text{PE}(p) \in \mathbb{R}^d$

---

### 4️⃣ Add Embedding and Positional Encoding

We add the embedding and positional encoding element-wise:

$$
\mathbf{z}_i = \mathbf{e}_i + \text{PE}(p_i)
$$

Where:
- $\mathbf{z}_i$: final input to the transformer

---

### 5️⃣ Reverse Lookup (Approximate)

We find the nearest embedding using cosine similarity:

$$
\hat{t}_i = \underset{j}{\arg\max} \left( \frac{ \mathbf{z}_i \cdot \mathbf{e}_j }{ \| \mathbf{z}_i \| \, \| \mathbf{e}_j \| } \right)
$$

---

### 6️⃣ Recover Position from Embedding + PE

To isolate positional encoding:

$$
\text{Recovered PE}_i = \mathbf{z}_i - \mathbf{e}_i
$$

We then compare this with reference positional encodings to estimate token position.

---

### 🌟 Summary Table

| Step | What Happens |
|------|--------------|
| **Tokenization** | Sentence → Subwords → Token IDs |
| **Embedding** | Token IDs → Vectors |
| **Pos Encoding** | Position Index → Sin/Cos Vector |
| **Sum** | Embedding + PE = Input to Transformer |
| **Reverse** | Approximate token ID from vector |
| **PE Recovery** | Recover position using similarity |
""",
        unsafe_allow_html=True,
    )