import os

# Turn off Streamlit's automatic file-watching before importing streamlit;
# the watcher trips over torch.classes' non-standard __path__ on Spaces.
os.environ["STREAMLIT_SERVER_FILE_WATCHER_TYPE"] = "none"

import sys
import types

import torch  # now safe to import
import numpy as np
import streamlit as st
from transformers import GPT2TokenizerFast

# Prevent Streamlit from trying to walk torch.classes' non-standard __path__
if isinstance(getattr(sys.modules.get("torch"), "classes", None), types.ModuleType):
    torch.classes.__path__ = []

# --- Setup ---
st.set_page_config(page_title="Text to Embedding Visualizer", layout="wide")
st.title("🔍 Token Embedding & Positional Encoding Coding Demo")

# --- Input UI ---
sentence = st.text_input("Enter your sentence", "Learning is fun")
embedding_dim = st.slider("Embedding Dimension (even only)", min_value=4, max_value=64, value=8, step=2)

# --- Load tokenizer ---
# Set a custom cache directory inside the app's working directory (writable on Spaces)
os.environ["TRANSFORMERS_CACHE"] = "./hf_cache"

# Load the tokenizer from files bundled with the app (no Hub download needed)
tokenizer = GPT2TokenizerFast.from_pretrained("./assets/tokenizer", local_files_only=True)
# Alternatively, download from the Hub into the custom cache path:
# tokenizer = GPT2TokenizerFast.from_pretrained("gpt2", cache_dir="./hf_cache")

input_ids = tokenizer.encode(sentence, return_tensors="pt")[0]
tokens = tokenizer.convert_ids_to_tokens(input_ids)

st.markdown("### 1️⃣ Tokenization")
with st.expander("Token IDs and Subwords"):
    st.write("**Tokens:**", tokens)
    st.write("**Token IDs:**", input_ids.tolist())
with st.expander("📜 Show Code: Tokenization"):
    st.code("""
tokenizer = GPT2TokenizerFast.from_pretrained("gpt2")
input_ids = tokenizer.encode(sentence, return_tensors="pt")[0]
tokens = tokenizer.convert_ids_to_tokens(input_ids)
""", language="python")

# --- Embedding Matrix ---
torch.manual_seed(0)  # reproducibility
embedding_matrix = torch.nn.Embedding(tokenizer.vocab_size, embedding_dim)
embedded = embedding_matrix(input_ids)
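
# Illustrative check (added for clarity, not part of the original demo): the lookup
# yields one randomly initialized d-dimensional vector per token; these are NOT
# pretrained GPT-2 weights, just rows of a freshly seeded embedding table.
assert embedded.shape == (len(input_ids), embedding_dim)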

st.markdown("### 2️⃣ Embedding")
with st.expander("Show Token Embeddings"):
    st.write("Shape:", embedded.shape)
    st.write(embedded)
with st.expander("📜 Show Code: Embedding"):
    st.code(f"""
embedding_matrix = torch.nn.Embedding(tokenizer.vocab_size, {embedding_dim})
embedded = embedding_matrix(input_ids)
""", language="python")

# --- Positional Encoding ---
def get_positional_encoding(seq_len, dim):
    pe = torch.zeros(seq_len, dim)
    position = torch.arange(0, seq_len, dtype=torch.float32).unsqueeze(1)
    div_term = torch.exp(torch.arange(0, dim, 2).float() * (-np.log(10000.0) / dim))
    pe[:, 0::2] = torch.sin(position * div_term)
    pe[:, 1::2] = torch.cos(position * div_term)
    return pe

pos_enc = get_positional_encoding(len(input_ids), embedding_dim)
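
# Quick sanity check (added for illustration; assumes the get_positional_encoding
# implementation above): at position 0 the sine entries are sin(0) = 0 and the
# cosine entries are cos(0) = 1, so the first row alternates 0 and 1.
assert torch.allclose(pos_enc[0, 0::2], torch.zeros(embedding_dim // 2))
assert torch.allclose(pos_enc[0, 1::2], torch.ones(embedding_dim // 2))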

st.markdown("### 3️⃣ Positional Encoding")
with st.expander("Show Positional Encoding"):
    st.write("Shape:", pos_enc.shape)
    st.write(pos_enc)
with st.expander("📜 Show Code: Positional Encoding"):
    st.code(f'''
def get_positional_encoding(seq_len, dim):
    pe = torch.zeros(seq_len, dim)
    position = torch.arange(0, seq_len).unsqueeze(1).float()
    div_term = torch.exp(torch.arange(0, dim, 2).float() * (-np.log(10000.0) / dim))
    pe[:, 0::2] = torch.sin(position * div_term)
    pe[:, 1::2] = torch.cos(position * div_term)
    return pe

pos_enc = get_positional_encoding(len(input_ids), {embedding_dim})
''', language="python")

# --- Combined Embedding + Position ---
embedded_with_pos = embedded + pos_enc

st.markdown("### 4️⃣ Embedding + Positional Encoding")
with st.expander("Show Combined Embedding"):
    st.write(embedded_with_pos)
with st.expander("📜 Show Code: Add Positional Encoding"):
    st.code("""
embedded_with_pos = embedded + pos_enc
""", language="python")

# --- Approximate Reverse to Token IDs ---
def find_closest_token(vec, emb_matrix):
    sims = torch.nn.functional.cosine_similarity(vec.unsqueeze(0), emb_matrix.weight, dim=1)
    return torch.argmax(sims).item()

recovered_ids = [find_closest_token(vec, embedding_matrix) for vec in embedded]
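
# Illustrative note (added; assumption: randomly initialized embedding rows are not
# exactly collinear, which holds in practice): each row of `embedded` is literally a
# row of the embedding matrix, so this lookup recovers the original IDs exactly.
assert recovered_ids == input_ids.tolist()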

recovered_tokens = tokenizer.convert_ids_to_tokens(recovered_ids)  # subword tokens
recovered_text = tokenizer.decode(recovered_ids)                   # final string

st.markdown("### 5️⃣ Approximate Reverse")
with st.expander("Recovered Tokens and Text"):
    st.write("**Recovered Token IDs:**", recovered_ids)
    st.write("**Recovered Subword Tokens (BPE):**", recovered_tokens)
    st.write("**Recovered Sentence:**", recovered_text)
with st.expander("📜 Show Code: Recover Token IDs and Text"):
    st.code("""
def find_closest_token(vec, emb_matrix):
    sims = torch.nn.functional.cosine_similarity(vec.unsqueeze(0), emb_matrix.weight, dim=1)
    return torch.argmax(sims).item()

recovered_ids = [find_closest_token(vec, embedding_matrix) for vec in embedded]
recovered_tokens = tokenizer.convert_ids_to_tokens(recovered_ids)
recovered_text = tokenizer.decode(recovered_ids)
""", language="python")

# --- Recover Position (Approx) ---
recovered_pos = embedded_with_pos - embedded
position_error = pos_enc - recovered_pos

st.markdown("### 6️⃣ Recovered Positional Encoding")
with st.expander("Compare Recovered vs Original"):
    st.write("**Recovered Positional Encoding:**")
    st.write(recovered_pos)
    st.write("**Difference from Original (should be ~0):**")
    st.write(position_error)
with st.expander("📜 Show Code: Recovered Positional Encoding"):
    st.code("""
recovered_pos = embedded_with_pos - embedded
position_error = pos_enc - recovered_pos
""", language="python")

# Estimate position from positional encoding using cosine similarity
def estimate_position_from_encoding(pe_row, full_table):
    sims = torch.nn.functional.cosine_similarity(pe_row.unsqueeze(0), full_table, dim=1)
    return torch.argmax(sims).item()

# Build a reference table of known encodings for positions 0 to N-1
reference_pos_table = get_positional_encoding(seq_len=len(input_ids), dim=embedding_dim)

# Now estimate each token's position
estimated_positions = [estimate_position_from_encoding(row, reference_pos_table) for row in recovered_pos]
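
# Illustrative note (added): recovered_pos equals pos_enc up to floating-point error,
# so the nearest-row lookup is expected to return 0, 1, 2, ... in order; the expander
# below compares the estimates against the true positions.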

st.markdown("### 7️⃣ Estimate Position from Positional Encoding")
with st.expander("Recovered Positions"):
    st.write("**Estimated Token Positions:**", estimated_positions)
    st.write("**Original True Positions:**", list(range(len(input_ids))))
with st.expander("📜 Show Code: Estimate Positions"):
    st.code("""
def estimate_position_from_encoding(pe_row, full_table):
    sims = torch.nn.functional.cosine_similarity(pe_row.unsqueeze(0), full_table, dim=1)
    return torch.argmax(sims).item()

reference_pos_table = get_positional_encoding(seq_len=len(input_ids), dim=embedding_dim)
estimated_positions = [estimate_position_from_encoding(row, reference_pos_table) for row in recovered_pos]
""", language="python")

st.markdown("### 📘 Final Notes: Theory & Formulas")
with st.expander("🧠 Theory and Formulas"):
    st.markdown(r"""
### 1️⃣ Tokenization (BPE)

We use **Byte Pair Encoding (BPE)** to break text into subword units. For example:

`"Learning is fun"` → `["Learning", "Ġis", "Ġfun"]`

Note: The "Ġ" indicates a **space** before the token.

---

### 2️⃣ Embedding

Each token ID $t_i \in \mathbb{Z}$ is mapped to a dense vector:

$$
\text{Embedding}(t_i) = \mathbf{e}_i \in \mathbb{R}^d
$$

Where:
- $t_i$: token ID
- $\mathbf{e}_i$: embedding vector of dimension $d$

---

### 3️⃣ Sinusoidal Positional Encoding

Used to encode the **position $p$** of a token without learnable parameters:

$$
\text{PE}(p, 2i) = \sin\left(\frac{p}{10000^{\frac{2i}{d}}}\right)
$$

$$
\text{PE}(p, 2i+1) = \cos\left(\frac{p}{10000^{\frac{2i}{d}}}\right)
$$

Where:
- $p$: position index (0, 1, 2, …)
- $i$: dimension index
- $d$: total embedding dimension

This gives a positional vector $\text{PE}(p) \in \mathbb{R}^d$.

---

### 4️⃣ Add Embedding and Positional Encoding

We add the embedding and positional encoding element-wise:

$$
\mathbf{z}_i = \mathbf{e}_i + \text{PE}(p_i)
$$

Where:
- $\mathbf{z}_i$: final input to the transformer

---

### 5️⃣ Reverse Lookup (Approximate)

We find the nearest row of the embedding matrix using cosine similarity:

$$
\hat{t}_i = \underset{j}{\arg\max} \left( \frac{ \mathbf{z}_i \cdot \mathbf{e}_j }{ \| \mathbf{z}_i \| \, \| \mathbf{e}_j \| } \right)
$$

(In this demo the lookup is applied to the raw embeddings $\mathbf{e}_i$ rather than $\mathbf{z}_i$, so the token IDs are recovered exactly.)

---

### 6️⃣ Recover Position from Embedding + PE

To isolate the positional encoding:

$$
\text{Recovered PE}_i = \mathbf{z}_i - \mathbf{e}_i
$$

We then compare this with reference positional encodings to estimate token position.

---

### 🌟 Summary Table

| Step | What Happens |
|------|--------------|
| **Tokenization** | Sentence → Subwords → Token IDs |
| **Embedding** | Token IDs → Vectors |
| **Pos Encoding** | Position Index → Sin/Cos Vector |
| **Sum** | Embedding + PE = Input to Transformer |
| **Reverse** | Approximate token ID from vector |
| **PE Recovery** | Recover position using similarity |
""", unsafe_allow_html=True)