Spaces:
Sleeping
Sleeping
| import os | |
| # turn off Streamlit’s automatic file-watching | |
| os.environ["STREAMLIT_SERVER_ENABLE_FILE_WATCHER"] = "false" | |
| import sys | |
| import types | |
| import torch # now safe to import | |
| import streamlit as st | |
| import numpy as np | |
# Keep Streamlit from trying to walk torch.classes' non-standard __path__:
# when torch is loaded and exposes `classes` as a module object, give that
# object an empty path list instead.
_torch_classes = getattr(sys.modules.get("torch"), "classes", None)
if isinstance(_torch_classes, types.ModuleType):
    torch.classes.__path__ = []
| # pip install tiktoken transformers | |
| import tiktoken | |
| from transformers import GPT2TokenizerFast | |
# Page-level configuration and title.
st.set_page_config(page_title="Embedding Dimension Visualizer", layout="wide")
st.title("🔍 Embedding Dimension Visualizer")

# ---- THEORY EXPANDER ----
# Collapsible background notes shown above the interactive part of the page.
with st.expander("📖 Theory: Tokenization, BPE & Positional Encoding"):
    st.markdown("""
**1️⃣ Tokenization**
Splits raw text into atomic units (“tokens”).
**2️⃣ Byte-Pair Encoding (BPE)**
Iteratively merges the most frequent pair of symbols to build a subword vocabulary.
E.g. "embedding" → ["em", "bed", "ding"]
**3️⃣ Positional Encoding**
We add a deterministic sinusoidal vector to each token embedding so the model knows position.
""")
    # Inline math uses $...$, the delimiter st.markdown recognises for LaTeX.
    # The previous \( ... \) form was both an invalid Python escape sequence
    # and rendered as plain parentheses instead of math.
    st.markdown("For embedding dimension $d$, position $pos$ and channel index $i$:")
    st.latex(r"""\mathrm{PE}_{(pos,\,2i)} = \sin\!\Bigl(\frac{pos}{10000^{2i/d}}\Bigr)""")
    st.latex(r"""\mathrm{PE}_{(pos,\,2i+1)} = \cos\!\Bigl(\frac{pos}{10000^{2i/d}}\Bigr)""")
    # Raw string so the backslashes in \sin / \cos reach the renderer intact.
    st.markdown(r"""
- $pos$ starts at 0 for the first token
- Even channels use $\sin$, odd channels use $\cos$
- This injects unique, smoothly varying positional signals into each embedding
""")
# ---- Sidebar ----
# Collects the three user settings (text, embedding size, tokenizer) and the
# trigger button.  Nothing below this block runs unless the button was
# pressed on the current script run.
with st.sidebar:
    st.header("Settings")
    input_text = st.text_input(label="Enter text to embed", value="Hello world!")
    dim = st.number_input(
        label="Embedding dimensions",
        min_value=2,
        max_value=1536,
        value=3,
        step=1,
        help="Choose 2, 3, 512, 768, 1536, etc.",
    )
    tokenizer_options = ["tiktoken", "openai", "huggingface"]
    tokenizer_choice = st.selectbox(
        label="Choose tokenizer",
        options=tokenizer_options,
        help="Which tokenization scheme to demo.",
    )
    generate = st.button(label="Generate / Reset Embedding")

if not generate:
    # Show a hint and halt the script here until the button is clicked.
    hint = "Adjust the settings in the sidebar and click **Generate / Reset Embedding** to see the tokens and sliders."
    st.info(hint)
    st.stop()
# ---- Tokenize ----
# Builds two parallel lists used by the rest of the script:
#   token_ids  - the ids returned by the selected tokenizer's encode()
#   token_strs - one display string per id
if tokenizer_choice not in ("tiktoken", "openai"):
    # Remaining option: the Hugging Face GPT-2 fast tokenizer.
    hf_tokenizer = GPT2TokenizerFast.from_pretrained("gpt2")
    token_ids = hf_tokenizer.encode(input_text)
    token_strs = hf_tokenizer.convert_ids_to_tokens(token_ids)
else:
    # Both of these options go through tiktoken; they differ only in the
    # model name used to look up the encoding.
    if tokenizer_choice == "tiktoken":
        model_name = "gpt2"
    else:
        model_name = "gpt-3.5-turbo"
    enc = tiktoken.encoding_for_model(model_name)
    token_ids = enc.encode(input_text)
    # Decode every id on its own so each token gets a separate string.
    token_strs = [enc.decode([single_id]) for single_id in token_ids]
# List every token with its id, numbered from 1 for display.
st.subheader("🪶 Tokens and IDs")
for rank, pair in enumerate(zip(token_strs, token_ids), start=1):
    piece, piece_id = pair
    st.write(f"**{rank}.** `{piece}` → ID **{piece_id}**")

st.write("---")

# Header and one-line recap of the chosen settings for the slider section.
st.subheader("📊 Embedding + Positional Encoding per Token")
settings_recap = f"Input: `{input_text}` | Tokenizer: **{tokenizer_choice}** | Dims per token: **{dim}**"
st.write(settings_recap)

# Each token gets three groups of `dim` sliders, so warn on large sizes.
too_many_sliders = dim > 20
if too_many_sliders:
    st.warning("Showing >20 sliders per block may be unwieldy; consider smaller dims for teaching.")
# helper for sinusoidal positional encoding
def get_positional_encoding(position: int, d_model: int) -> np.ndarray:
    """Return the sinusoidal positional-encoding vector for one position.

    Channels are handled in (even, odd) pairs that share one frequency:
    the even channel holds the sine and the odd channel the cosine of
    ``position / 10000 ** (2 * (channel // 2) / d_model)``.

    Args:
        position: Index of the token in the sequence.
        d_model: Length of the returned vector.

    Returns:
        A float array of shape ``(d_model,)``.
    """
    encoding = np.zeros(d_model, dtype=float)
    for channel in range(d_model):
        pair_index = channel // 2
        # Both channels of a pair divide the position by the same scale.
        scale = np.power(10000, (2 * pair_index) / d_model)
        phase = position / scale
        if channel % 2:
            encoding[channel] = np.cos(phase)
        else:
            encoding[channel] = np.sin(phase)
    return encoding
# ---- For each token, three slider blocks ----
# Every token gets a random vector drawn uniformly from [-1, 1), the
# positional encoding of its 0-based position, and their element-wise sum.
# Each of the three vectors is shown as a group of read-only sliders.
for t_idx, tok in enumerate(token_strs, start=1):
    emb = np.random.uniform(-1.0, 1.0, size=dim)
    pe = get_positional_encoding(t_idx - 1, dim)
    combined = emb + pe

    # (heading, label stem, key stem, values, slider min, slider max)
    slider_groups = (
        ("**1️⃣ Embedding**", "Emb", "emb", emb, -1.0, 1.0),
        ("**2️⃣ Positional Encoding (sin / cos)**", "PE", "pe", pe, -1.0, 1.0),
        ("**3️⃣ Embedding + Positional Encoding**", "Sum", "sum", combined, -2.0, 2.0),
    )

    with st.expander(f"Token {t_idx}: `{tok}`"):
        for heading, label_stem, key_stem, vector, low, high in slider_groups:
            st.markdown(heading)
            # Dimensions are numbered from 1 in both labels and widget keys;
            # the "sum" keys are read back later via st.session_state.
            for channel, component in enumerate(vector, start=1):
                st.slider(
                    label=f"{label_stem} Dim {channel}",
                    min_value=low,
                    max_value=high,
                    value=float(component),
                    key=f"t{t_idx}_{key_stem}{channel}",
                    disabled=True,
                )
# ---- NEW FINAL SECTION ----
# Text recap of the summed (embedding + positional encoding) vector for each
# token, one expander per token id.
st.write("---")
st.subheader("Final Input Embedding Plus Positional Encoding Ready to Send to Attention Heads")
for t_idx, tid in enumerate(token_ids, start=1):
    with st.expander(f"Token ID {tid}"):
        for d in range(1, dim + 1):
            # The "sum" sliders were created with keys of this form, so their
            # values can be pulled back out of session state.
            val = st.session_state.get(f"t{t_idx}_sum{d}")
            if val is None:
                st.write(f"Dim {d}: N/A")
            else:
                st.write(f"Dim {d}: {val:.4f}")