"""Streamlit app: explore how an LLM tokenizes text, embeds tokens, and
encodes token positions.

Flow: the user enters text and picks a `tiktoken` encoding; the app then
offers token inspection, per-token OpenAI embeddings (with heatmaps and an
optional PCA scatter), and a sinusoidal positional-encoding matrix.
Requires the OPENAI_API_KEY environment variable for the embedding section.
"""

import os

import matplotlib.pyplot as plt
import numpy as np
import seaborn as sns
import streamlit as st
import tiktoken
from openai import OpenAI
from sklearn.decomposition import PCA

# ---------- Setup ----------
st.set_page_config(page_title="LLM Token Explorer", layout="centered")
st.title("๐Ÿง  LLM Token & Embedding Explorer")

client = OpenAI(api_key=os.getenv("OPENAI_API_KEY"))

# Debug key check (does not reveal the key itself, only its presence).
st.text(f"OpenAI key found: {'Yes' if os.getenv('OPENAI_API_KEY') else 'No'}")


def get_positional_encoding(seq_len, dim):
    """Return the sinusoidal positional-encoding matrix, shape (seq_len, dim).

    Same values as the classic Transformer formulation: even columns get
    sin(pos / 10000^(i/dim)), odd columns get the matching cos. Vectorized
    with numpy instead of a double Python loop; handles odd `dim` (one more
    sin column than cos columns), though the UI slider only offers even dims.
    """
    positions = np.arange(seq_len)[:, np.newaxis]           # (seq_len, 1)
    # div_terms[k] = exp(2k * -ln(10000) / dim), one entry per sin column.
    div_terms = np.exp(np.arange(0, dim, 2) * -np.log(10000.0) / dim)
    pe = np.zeros((seq_len, dim))
    pe[:, 0::2] = np.sin(positions * div_terms)
    pe[:, 1::2] = np.cos(positions * div_terms[: dim // 2])
    return pe


# ---------- Input Section ----------
st.header("โœ๏ธ Input Text")
st.markdown("Enter any short sentence or phrase you'd like to explore. We'll break it down into tokens and explore their structure and meaning.")
input_text = st.text_area("Enter your text:", height=150)

# ---------- Tokenizer Selection ----------
st.header("๐Ÿ”ง Tokenizer Choice")
st.markdown("Choose a tokenizer from the available ones in `tiktoken`. Different models use different tokenization strategies.")
tokenizer_name = st.selectbox("Choose tokenizer:", ["cl100k_base", "p50k_base", "r50k_base", "gpt2"])

if input_text:
    # Tokenize ONCE, up front, so every section below can use the result.
    # (Previously each button branch re-derived these, and the chart button
    # crashed with a NameError because `tokens` was only defined inside the
    # separate "Show Token Details" branch.)
    enc = tiktoken.get_encoding(tokenizer_name)
    tokens = enc.encode(input_text)
    token_strings = [enc.decode([t]) for t in tokens]

    # ---------- Tokenization Info ----------
    st.subheader("๐Ÿ”ค Token Information")
    st.markdown("This shows how your input text is broken down into tokens. Each token is a subword unit that the model processes individually.")

    if st.button("๐Ÿ” Show Token Details"):
        with st.expander("๐Ÿงพ Token IDs"):
            st.write(tokens)
        with st.expander("๐Ÿ“– Decoded Tokens"):
            st.write(token_strings)
        st.info(f"Token count: {len(tokens)}")

    if st.button("๐Ÿ“Š Show Token ID Chart"):
        fig, ax = plt.subplots()
        ax.bar(range(len(tokens)), tokens, tick_label=token_strings)
        ax.set_xlabel("Token")
        ax.set_ylabel("Token ID")
        ax.set_title("Token IDs for Input Text")
        plt.xticks(rotation=45, ha='right')
        st.pyplot(fig)

    # ---------- Embedding Section ----------
    st.subheader("๐Ÿ”— Token Embeddings (OpenAI)")
    st.markdown("""
Each token is mapped to a high-dimensional vector called an **embedding**.
These vectors capture the contextual meaning of words and are the foundation
of how language models understand text.

We use the `text-embedding-ada-002` model from OpenAI to generate embeddings
for each token.
""")

    if st.button("๐Ÿ“ก Generate Embeddings"):
        with st.spinner("Generating embedding for each token..."):
            try:
                # One batched API call instead of one request per token:
                # the embeddings endpoint accepts a list of inputs and
                # returns `data` in the same order.
                response = client.embeddings.create(
                    input=token_strings,
                    model="text-embedding-ada-002"
                )
                # Cache in session_state so the results (and the PCA
                # checkbox below) survive the rerun that any later widget
                # interaction triggers. Inside the button branch they would
                # vanish, because a button reads False on the next rerun.
                st.session_state["token_embeddings"] = (
                    token_strings,
                    [item.embedding for item in response.data],
                )
            except Exception as e:
                st.error(f"OpenAI Error: {str(e)}")

    if "token_embeddings" in st.session_state:
        emb_tokens, all_embeddings = st.session_state["token_embeddings"]

        for i, (token_text, embedding) in enumerate(zip(emb_tokens, all_embeddings)):
            with st.expander(f"๐Ÿ”ธ Token {i+1}: '{token_text}'"):
                st.write(embedding)
                st.caption(f"Embedding dimension: {len(embedding)}")

                # Per-token embedding heatmap: one row, `dim` columns.
                fig, ax = plt.subplots(figsize=(8, 1))
                sns.heatmap(np.array(embedding).reshape(1, -1), cmap="viridis", cbar=True, ax=ax)
                ax.set_title("Embedding Heatmap")
                ax.axis('off')
                st.pyplot(fig)

        st.success(f"Successfully generated embeddings for {len(emb_tokens)} tokens.")

        # Optional PCA Visualization
        if st.checkbox("๐Ÿงญ Visualize all embeddings in 2D (PCA)"):
            pca = PCA(n_components=2)
            reduced = pca.fit_transform(np.array(all_embeddings))
            fig, ax = plt.subplots()
            ax.scatter(reduced[:, 0], reduced[:, 1])
            for i, label in enumerate(emb_tokens):
                ax.text(reduced[i, 0], reduced[i, 1], label, fontsize=9)
            ax.set_title("Token Embeddings (PCA 2D)")
            st.pyplot(fig)

    # ---------- Positional Encoding Section ----------
    st.subheader("๐Ÿ“ Positional Encoding")
    st.markdown("""
Transformers have no built-in notion of order, so **positional encoding**
adds a signal to each token to tell the model where it occurs in the
sequence.

We use sinusoidal positional encoding similar to what was introduced in the
original Transformer paper.
""")

    # The slider must live OUTSIDE the button branch: a widget created
    # inside `if st.button(...)` disappears on the very rerun its own
    # interaction triggers (the button reads False again).
    dim = st.slider("Select positional encoding dimension:", 16, 512, 64, step=16)

    if st.button("๐ŸŒ€ Generate Positional Encoding"):
        st.session_state["pos_encoding"] = get_positional_encoding(len(tokens), dim)

    if "pos_encoding" in st.session_state:
        PE = st.session_state["pos_encoding"]

        with st.expander("๐Ÿ“ Positional Encoding Matrix"):
            st.write(PE)
            st.caption(f"Shape: {PE.shape}")

        if st.checkbox("๐Ÿ”ฌ Show Positional Encoding Heatmap"):
            seq_len = PE.shape[0]
            fig, ax = plt.subplots(figsize=(10, seq_len // 2 + 1))
            sns.heatmap(PE, cmap="coolwarm", cbar=True, ax=ax)
            ax.set_title("Positional Encoding Heatmap")
            st.pyplot(fig)