schoginitoys committed on
Commit
ca69551
·
verified ·
1 Parent(s): 32fc4b5

Update src/streamlit_app.py

Browse files
Files changed (1) hide show
  1. src/streamlit_app.py +267 -125
src/streamlit_app.py CHANGED
@@ -12,132 +12,274 @@ import numpy as np
12
  if isinstance(getattr(sys.modules.get("torch"), "classes", None), types.ModuleType):
13
  torch.classes.__path__ = []
14
 
15
- # pip install tiktoken transformers
16
- import tiktoken
 
17
  from transformers import GPT2TokenizerFast
18
 
19
- st.set_page_config(page_title="Embedding Dimension Visualizer", layout="wide")
20
- st.title("🔍 Embedding Dimension Visualizer")
21
-
22
- # ---- THEORY EXPANDER ----
23
- with st.expander("📖 Theory: Tokenization, BPE & Positional Encoding"):
24
- st.markdown("""
25
- **1️⃣ Tokenization**
26
- Splits raw text into atomic units (“tokens”).
27
-
28
- **2️⃣ Byte-Pair Encoding (BPE)**
29
- Iteratively merges the most frequent pair of symbols to build a subword vocabulary.
30
- E.g. "embedding" ["em", "bed", "ding"]
31
-
32
- **3️⃣ Positional Encoding**
33
- We add a deterministic sinusoidal vector to each token embedding so the model knows position.
34
- """)
35
- st.markdown("For embedding dimension \(d\), position \(pos\) and channel index \(i\):")
36
- st.latex(r"""\mathrm{PE}_{(pos,\,2i)} = \sin\!\Bigl(\frac{pos}{10000^{2i/d}}\Bigr)""")
37
- st.latex(r"""\mathrm{PE}_{(pos,\,2i+1)} = \cos\!\Bigl(\frac{pos}{10000^{2i/d}}\Bigr)""")
38
- st.markdown("""
39
- - \(pos\) starts at 0 for the first token
40
- - Even channels use \(\sin\), odd channels use \(\cos\)
41
- - This injects unique, smoothly varying positional signals into each embedding
42
- """)
43
-
44
-
45
- # ---- Sidebar ----
46
- with st.sidebar:
47
- st.header("Settings")
48
- input_text = st.text_input("Enter text to embed", value="Hello world!")
49
- dim = st.number_input(
50
- "Embedding dimensions",
51
- min_value=2,
52
- max_value=1536,
53
- value=3,
54
- step=1,
55
- help="Choose 2, 3, 512, 768, 1536, etc."
56
- )
57
- tokenizer_choice = st.selectbox(
58
- "Choose tokenizer",
59
- ["tiktoken", "openai", "huggingface"],
60
- help="Which tokenization scheme to demo."
61
- )
62
- generate = st.button("Generate / Reset Embedding")
63
-
64
- if not generate:
65
- st.info("Adjust the settings in the sidebar and click **Generate / Reset Embedding** to see the tokens and sliders.")
66
- st.stop()
67
-
68
- # ---- Tokenize ----
69
- if tokenizer_choice in ("tiktoken", "openai"):
70
- model_name = "gpt2" if tokenizer_choice=="tiktoken" else "gpt-3.5-turbo"
71
- enc = tiktoken.encoding_for_model(model_name)
72
- token_ids = enc.encode(input_text)
73
- token_strs = [enc.decode([tid]) for tid in token_ids]
74
- else:
75
- hf_tokenizer = GPT2TokenizerFast.from_pretrained("gpt2")
76
- token_ids = hf_tokenizer.encode(input_text)
77
- token_strs = hf_tokenizer.convert_ids_to_tokens(token_ids)
78
-
79
- st.subheader("🪶 Tokens and IDs")
80
- for i, (tok, tid) in enumerate(zip(token_strs, token_ids), start=1):
81
- st.write(f"**{i}.** `{tok}` → ID **{tid}**")
82
-
83
- st.write("---")
84
- st.subheader("📊 Embedding + Positional Encoding per Token")
85
- st.write(f"Input: `{input_text}` | Tokenizer: **{tokenizer_choice}** | Dims per token: **{dim}**")
86
- if dim > 20:
87
- st.warning("Showing >20 sliders per block may be unwieldy; consider smaller dims for teaching.")
88
-
89
- # helper for sinusoidal positional encoding
90
- def get_positional_encoding(position: int, d_model: int) -> np.ndarray:
91
- pe = np.zeros(d_model, dtype=float)
92
- for i in range(d_model):
93
- angle = position / np.power(10000, (2 * (i // 2)) / d_model)
94
- pe[i] = np.sin(angle) if (i % 2 == 0) else np.cos(angle)
95
  return pe
96
 
97
- # ---- For each token, three slider‐blocks ----
98
- for t_idx, tok in enumerate(token_strs, start=1):
99
- emb = np.random.uniform(-1.0, 1.0, size=dim)
100
- pe = get_positional_encoding(t_idx - 1, dim)
101
- combined = emb + pe
102
-
103
- with st.expander(f"Token {t_idx}: `{tok}`"):
104
- st.markdown("**1️⃣ Embedding**")
105
- for d in range(dim):
106
- st.slider(
107
- label=f"Emb Dim {d+1}",
108
- min_value=-1.0, max_value=1.0,
109
- value=float(emb[d]),
110
- key=f"t{t_idx}_emb{d+1}",
111
- disabled=True
112
- )
113
-
114
- st.markdown("**2️⃣ Positional Encoding (sin / cos)**")
115
- for d in range(dim):
116
- st.slider(
117
- label=f"PE Dim {d+1}",
118
- min_value=-1.0, max_value=1.0,
119
- value=float(pe[d]),
120
- key=f"t{t_idx}_pe{d+1}",
121
- disabled=True
122
- )
123
-
124
- st.markdown("**3️⃣ Embedding + Positional Encoding**")
125
- for d in range(dim):
126
- st.slider(
127
- label=f"Sum Dim {d+1}",
128
- min_value=-2.0, max_value=2.0,
129
- value=float(combined[d]),
130
- key=f"t{t_idx}_sum{d+1}",
131
- disabled=True
132
- )
133
-
134
- # ---- NEW FINAL SECTION ----
135
- st.write("---")
136
- st.subheader("Final Input Embedding Plus Positional Encoding Ready to Send to ATtention Heads")
137
-
138
- for t_idx, tid in enumerate(token_ids, start=1):
139
- with st.expander(f"Token ID {tid}"):
140
- for d in range(1, dim+1):
141
- # pull the “sum” value out of session state
142
- val = st.session_state.get(f"t{t_idx}_sum{d}", None)
143
- st.write(f"Dim {d}: {val:.4f}" if val is not None else f"Dim {d}: N/A")
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
12
  if isinstance(getattr(sys.modules.get("torch"), "classes", None), types.ModuleType):
13
  torch.classes.__path__ = []
14
 
15
+ import torch
16
+ import numpy as np
17
+ import streamlit as st
18
  from transformers import GPT2TokenizerFast
19
 
20
+ # --- Setup ---
21
+ st.set_page_config(page_title="Text to Embedding Visualizer", layout="wide")
22
+ st.title("🔍 Token Embedding & Positional Encoding Coding Demo")
23
+
24
+ # --- Input UI ---
25
+ sentence = st.text_input("Enter your sentence", "Learning is fun")
26
+ embedding_dim = st.slider("Embedding Dimension (even only)", min_value=4, max_value=64, value=8, step=2)
27
+
28
+ # --- Load tokenizer ---
29
+ tokenizer = GPT2TokenizerFast.from_pretrained("gpt2")
30
+ input_ids = tokenizer.encode(sentence, return_tensors="pt")[0]
31
+ tokens = tokenizer.convert_ids_to_tokens(input_ids)
32
+
33
+ # st.markdown("### 1️⃣ Tokenization")
34
+ # with st.expander("Show Token IDs"):
35
+ # st.write("**Tokens:**", tokens)
36
+ # st.write("**Token IDs:**", input_ids.tolist())
37
+
38
+ st.markdown("### 1️⃣ Tokenization")
39
+ with st.expander("Token IDs and Subwords"):
40
+ st.write("**Tokens:**", tokens)
41
+ st.write("**Token IDs:**", input_ids.tolist())
42
+
43
+ with st.expander("📜 Show Code: Tokenization"):
44
+ st.code("""
45
+ tokenizer = GPT2TokenizerFast.from_pretrained("gpt2")
46
+ input_ids = tokenizer.encode(sentence, return_tensors="pt")[0]
47
+ tokens = tokenizer.convert_ids_to_tokens(input_ids)
48
+ """, language="python")
49
+
50
+
51
+ # --- Embedding Matrix ---
52
+ torch.manual_seed(0) # Reproducibility
53
+ embedding_matrix = torch.nn.Embedding(tokenizer.vocab_size, embedding_dim)
54
+ embedded = embedding_matrix(input_ids)
55
+
56
+ st.markdown("### 2️⃣ Embedding")
57
+ with st.expander("Show Token Embeddings"):
58
+ st.write("Shape:", embedded.shape)
59
+ st.write(embedded)
60
+
61
+ with st.expander("📜 Show Code: Embedding"):
62
+ st.code(f"""
63
+ embedding_matrix = torch.nn.Embedding(tokenizer.vocab_size, {embedding_dim})
64
+ embedded = embedding_matrix(input_ids)
65
+ """, language="python")
66
+
67
+ # --- Positional Encoding ---
68
def get_positional_encoding(seq_len, dim):
    """Build the sinusoidal positional-encoding table from "Attention Is All You Need".

    Args:
        seq_len: number of positions (rows) to generate.
        dim: embedding dimension (columns). Both even and odd dims are supported.

    Returns:
        A ``(seq_len, dim)`` float tensor where even channels hold
        ``sin(pos / 10000^(2i/dim))`` and odd channels the matching ``cos``.
    """
    pe = torch.zeros(seq_len, dim)
    position = torch.arange(0, seq_len, dtype=torch.float32).unsqueeze(1)
    # One frequency per sin/cos channel pair: 10000^(-2i/dim), computed in log space.
    div_term = torch.exp(torch.arange(0, dim, 2).float() * (-np.log(10000.0) / dim))
    pe[:, 0::2] = torch.sin(position * div_term)
    # Slice to dim // 2 frequencies so odd `dim` works too: there are only
    # dim // 2 odd (cos) channels, while div_term holds ceil(dim / 2) entries.
    # The original assignment crashed with a shape mismatch for odd dims;
    # for even dims the slice is a no-op, so behavior there is unchanged.
    pe[:, 1::2] = torch.cos(position * div_term[: dim // 2])
    return pe
75
 
76
+ pos_enc = get_positional_encoding(len(input_ids), embedding_dim)
77
+
78
+ st.markdown("### 3️⃣ Positional Encoding")
79
+ with st.expander("Show Positional Encoding"):
80
+ st.write("Shape:", pos_enc.shape)
81
+ st.write(pos_enc)
82
+
83
+ with st.expander("📜 Show Code: Positional Encoding"):
84
+ st.code(f'''
85
+ def get_positional_encoding(seq_len, dim):
86
+ pe = torch.zeros(seq_len, dim)
87
+ position = torch.arange(0, seq_len).unsqueeze(1).float()
88
+ div_term = torch.exp(torch.arange(0, dim, 2).float() * (-np.log(10000.0) / dim))
89
+ pe[:, 0::2] = torch.sin(position * div_term)
90
+ pe[:, 1::2] = torch.cos(position * div_term)
91
+ return pe
92
+
93
+ pos_enc = get_positional_encoding(len(input_ids), {embedding_dim})
94
+ ''', language="python")
95
+
96
+ # --- Combined Embedding + Position ---
97
+ embedded_with_pos = embedded + pos_enc
98
+
99
+ st.markdown("### 4️⃣ Embedding + Positional Encoding")
100
+ with st.expander("Show Combined Embedding"):
101
+ st.write(embedded_with_pos)
102
+
103
+ with st.expander("📜 Show Code: Add Positional Encoding"):
104
+ st.code("""
105
+ embedded_with_pos = embedded + pos_enc
106
+ """, language="python")
107
+
108
+ # --- Approximate Reverse to Token IDs ---
109
def find_closest_token(vec, emb_matrix):
    """Map an embedding-space vector back to the nearest vocabulary id.

    Scores `vec` against every row of `emb_matrix.weight` using cosine
    similarity and returns the index of the best-matching row as a plain int.
    """
    vocab_table = emb_matrix.weight
    query = vec.unsqueeze(0)  # (1, dim) so it broadcasts against (vocab, dim)
    similarity = torch.nn.functional.cosine_similarity(query, vocab_table, dim=1)
    return int(similarity.argmax())
112
+
113
+ recovered_ids = [find_closest_token(vec, embedding_matrix) for vec in embedded]
114
+ #recovered_text = tokenizer.decode(recovered_ids)
115
+
116
+ #st.markdown("### 5️⃣ Approximate Reverse")
117
+ #with st.expander("Recovered Tokens"):
118
+ # st.write("**Recovered IDs:**", recovered_ids)
119
+ # st.write("**Recovered Text:**", recovered_text)
120
+
121
+ recovered_tokens = tokenizer.convert_ids_to_tokens(recovered_ids) # ← Subwords
122
+ recovered_text = tokenizer.decode(recovered_ids) # Final string
123
+
124
+ st.markdown("### 5️⃣ Approximate Reverse")
125
+ with st.expander("Recovered Tokens and Text"):
126
+ st.write("**Recovered Token IDs:**", recovered_ids)
127
+ st.write("**Recovered Subword Tokens (BPE):**", recovered_tokens)
128
+ st.write("**Recovered Sentence:**", recovered_text)
129
+
130
+ with st.expander("📜 Show Code: Recover Token IDs and Text"):
131
+ st.code("""
132
+ def find_closest_token(vec, emb_matrix):
133
+ sims = torch.nn.functional.cosine_similarity(vec.unsqueeze(0), emb_matrix.weight, dim=1)
134
+ return torch.argmax(sims).item()
135
+
136
+ recovered_ids = [find_closest_token(vec, embedding_matrix) for vec in embedded]
137
+ recovered_tokens = tokenizer.convert_ids_to_tokens(recovered_ids)
138
+ recovered_text = tokenizer.decode(recovered_ids)
139
+ """, language="python")
140
+
141
+ # --- Recover Position (Approx) ---
142
+ recovered_pos = embedded_with_pos - embedded
143
+ position_error = pos_enc - recovered_pos
144
+
145
+ st.markdown("### 6️⃣ Recovered Positional Encoding")
146
+ with st.expander("Compare Recovered vs Original"):
147
+ st.write("**Recovered Positional Encoding:**")
148
+ st.write(recovered_pos)
149
+ st.write("**Difference from Original (should be ~0):**")
150
+ st.write(position_error)
151
+
152
+ with st.expander("📜 Show Code: Recovered Positional Encoding"):
153
+ st.code("""
154
+ recovered_pos = embedded_with_pos - embedded
155
+ position_error = pos_enc - recovered_pos
156
+ """, language="python")
157
+
158
+ # Estimate position from positional encoding using cosine similarity
159
def estimate_position_from_encoding(pe_row, full_table):
    """Estimate which sequence position produced `pe_row`.

    Compares `pe_row` against every row of the reference positional-encoding
    `full_table` via cosine similarity; the index of the highest-scoring row
    is returned as the estimated position.
    """
    scores = torch.nn.functional.cosine_similarity(
        pe_row.unsqueeze(0), full_table, dim=1
    )
    return int(scores.argmax())
162
+
163
+ # Build reference table of known encodings for positions 0 to N
164
+ reference_pos_table = get_positional_encoding(seq_len=len(input_ids), dim=embedding_dim)
165
+
166
+ # Now estimate each token's position
167
+ estimated_positions = [estimate_position_from_encoding(row, reference_pos_table) for row in recovered_pos]
168
+
169
+ st.markdown("### 7️⃣ Estimate Position from Positional Encoding")
170
+ with st.expander("Recovered Positions"):
171
+ st.write("**Estimated Token Positions:**", estimated_positions)
172
+ st.write("**Original True Positions:**", list(range(len(input_ids))))
173
+
174
+ with st.expander("📜 Show Code: Estimate Positions"):
175
+ st.code("""
176
+ def estimate_position_from_encoding(pe_row, full_table):
177
+ sims = torch.nn.functional.cosine_similarity(pe_row.unsqueeze(0), full_table, dim=1)
178
+ return torch.argmax(sims).item()
179
+
180
+ reference_pos_table = get_positional_encoding(seq_len=len(input_ids), dim=embedding_dim)
181
+ estimated_positions = [estimate_position_from_encoding(row, reference_pos_table) for row in recovered_pos]
182
+ """, language="python")
183
+
184
+
185
+ st.markdown("### 📘 Final Notes: Theory & Formulas")
186
+
187
+ with st.expander("🧠 Theory and Formulas"):
188
+ st.markdown(r"""
189
+ ### 1️⃣ Tokenization (BPE)
190
+
191
+ We use **Byte Pair Encoding (BPE)** to break text into subword units.
192
+ For example:
193
+
194
+ "Learning is fun" → ["Learning", "Ġis", "Ġfun"]
195
+
196
+
197
+ Note: The "Ġ" indicates a **space** before the token.
198
+
199
+ ---
200
+
201
+ ### 2️⃣ Embedding
202
+
203
+ Each token ID $t_i \in \mathbb{Z}$ is mapped to a dense vector:
204
+
205
+ $$
206
+ \text{Embedding}(t_i) = \mathbf{e}_i \in \mathbb{R}^d
207
+ $$
208
+
209
+ Where:
210
+
211
+ - $t_i$: token ID
212
+ - $\mathbf{e}_i$: embedding vector of dimension $d$
213
+
214
+ ---
215
+
216
+ ### 3️⃣ Sinusoidal Positional Encoding
217
+
218
+ Used to encode the **position $p$** of a token without learnable parameters:
219
+
220
+ $$
221
+ \text{PE}(p, 2i) = \sin\left(\frac{p}{10000^{\frac{2i}{d}}}\right)
222
+ $$
223
+
224
+ $$
225
+ \text{PE}(p, 2i+1) = \cos\left(\frac{p}{10000^{\frac{2i}{d}}}\right)
226
+ $$
227
+
228
+ Where:
229
+
230
+ - $p$: position index (0, 1, 2, …)
231
+ - $i$: dimension index
232
+ - $d$: total embedding dimension
233
+
234
+ This gives a positional vector $\text{PE}(p) \in \mathbb{R}^d$
235
+
236
+ ---
237
+
238
+ ### 4️⃣ Add Embedding and Positional Encoding
239
+
240
+ We add the embedding and positional encoding element-wise:
241
+
242
+ $$
243
+ \mathbf{z}_i = \mathbf{e}_i + \text{PE}(p_i)
244
+ $$
245
+
246
+ Where:
247
+
248
+ - $\mathbf{z}_i$: final input to the transformer
249
+
250
+ ---
251
+
252
+ ### 5️⃣ Reverse Lookup (Approximate)
253
+
254
+ We find the nearest embedding using cosine similarity:
255
+
256
+ $$
257
+ \hat{t}_i = \underset{j}{\arg\max} \left( \frac{ \mathbf{z}_i \cdot \mathbf{e}_j }{ \| \mathbf{z}_i \| \, \| \mathbf{e}_j \| } \right)
258
+ $$
259
+
260
+ ---
261
+
262
+ ### 6️⃣ Recover Position from Embedding + PE
263
+
264
+ To isolate positional encoding:
265
+
266
+ $$
267
+ \text{Recovered PE}_i = \mathbf{z}_i - \mathbf{e}_i
268
+ $$
269
+
270
+ We then compare this with reference positional encodings to estimate token position.
271
+
272
+ ---
273
+
274
+ ### 🌟 Summary Table
275
+
276
+ | Step | What Happens |
277
+ |------|--------------|
278
+ | **Tokenization** | Sentence → Subwords → Token IDs |
279
+ | **Embedding** | Token IDs → Vectors |
280
+ | **Pos Encoding** | Position Index → Sin/Cos Vector |
281
+ | **Sum** | Embedding + PE = Input to Transformer |
282
+ | **Reverse** | Approximate token ID from vector |
283
+ | **PE Recovery** | Recover position using similarity |
284
+
285
+ """, unsafe_allow_html=True)