llm-token-explorer / src /streamlit_app.py
schoginitoys's picture
Update src/streamlit_app.py
4329495 verified
raw
history blame
6.06 kB
import streamlit as st
import numpy as np
import tiktoken
import os
from openai import OpenAI
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.decomposition import PCA
# ---------- App setup ----------
# Page chrome plus the OpenAI client used later for embedding calls.
st.set_page_config(page_title="LLM Token Explorer", layout="centered")
st.title("🧠 LLM Token & Embedding Explorer")

# NOTE(review): the client is constructed even when the key is absent; the
# embeddings request will then fail at call time with an auth error.
api_key = os.getenv("OPENAI_API_KEY")
client = OpenAI(api_key=api_key)

# Debug aid: surface whether the key was picked up from the environment.
st.text(f"OpenAI key found: {'Yes' if api_key else 'No'}")

# ---------- Input Section ----------
st.header("✍️ Input Text")
st.markdown("Enter any short sentence or phrase you'd like to explore. We'll break it down into tokens and explore their structure and meaning.")
input_text = st.text_area("Enter your text:", height=150)

# ---------- Tokenizer Selection ----------
st.header("πŸ”§ Tokenizer Choice")
st.markdown("Choose a tokenizer from the available ones in `tiktoken`. Different models use different tokenization strategies.")
TOKENIZER_OPTIONS = ["cl100k_base", "p50k_base", "r50k_base", "gpt2"]
tokenizer_name = st.selectbox("Choose tokenizer:", TOKENIZER_OPTIONS)
if input_text:
    # ---------- Tokenization Info ----------
    st.subheader("πŸ”€ Token Information")
    st.markdown("This shows how your input text is broken down into tokens. Each token is a subword unit that the model processes individually.")

    # Tokenize once, up front, so both buttons below share the same result.
    # FIX: previously `tokens`/`token_strings` were assigned only inside the
    # "Show Token Details" branch. Streamlit reruns the whole script on each
    # interaction and at most one button returns True per run, so clicking
    # the chart button on its own raised NameError.
    enc = tiktoken.get_encoding(tokenizer_name)
    tokens = enc.encode(input_text)
    token_strings = [enc.decode([t]) for t in tokens]

    if st.button("πŸ” Show Token Details"):
        with st.expander("🧾 Token IDs"):
            st.write(tokens)
        with st.expander("πŸ“– Decoded Tokens"):
            st.write(token_strings)
        st.info(f"Token count: {len(tokens)}")

    if st.button("πŸ“Š Show Token ID Chart"):
        # Bar chart of raw token ids, one bar per token, labelled with the
        # decoded token text.
        fig, ax = plt.subplots()
        ax.bar(range(len(tokens)), tokens, tick_label=token_strings)
        ax.set_xlabel("Token")
        ax.set_ylabel("Token ID")
        ax.set_title("Token IDs for Input Text")
        plt.xticks(rotation=45, ha='right')
        st.pyplot(fig)
if input_text:
    # ---------- Embedding Section ----------
    st.subheader("πŸ”— Token Embeddings (OpenAI)")
    st.markdown("""
Each token is mapped to a high-dimensional vector called an **embedding**. These vectors capture the contextual meaning of words and are the foundation of how language models understand text.
We use the `text-embedding-ada-002` model from OpenAI to generate embeddings for each token.
""")

    if st.button("πŸ“‘ Generate Embeddings"):
        with st.spinner("Generating embedding for each token..."):
            try:
                # Re-tokenize here so this section works standalone on any
                # rerun, independent of the other sections.
                enc = tiktoken.get_encoding(tokenizer_name)
                tokens = enc.encode(input_text)
                token_strings = [enc.decode([t]) for t in tokens]

                all_embeddings = []
                for i, token_text in enumerate(token_strings):
                    # One API call per token: each token is embedded in
                    # isolation, not contextually within the sentence.
                    response = client.embeddings.create(
                        input=[token_text],
                        model="text-embedding-ada-002"
                    )
                    embedding = response.data[0].embedding
                    all_embeddings.append(embedding)
                    with st.expander(f"πŸ”Έ Token {i+1}: '{token_text}'"):
                        st.write(embedding)
                        st.caption(f"Embedding dimension: {len(embedding)}")
                        # Embedding Heatmap: one row, one column per dimension.
                        fig, ax = plt.subplots(figsize=(8, 1))
                        sns.heatmap(np.array(embedding).reshape(1, -1), cmap="viridis", cbar=True, ax=ax)
                        ax.set_title("Embedding Heatmap")
                        ax.axis('off')
                        st.pyplot(fig)

                st.success(f"Successfully generated embeddings for {len(token_strings)} tokens.")

                # FIX: persist the results across reruns. The PCA checkbox
                # below used to live inside this button branch; toggling it
                # reruns the script, the button reads False again, and the
                # branch (and `all_embeddings`) no longer existed — the PCA
                # view was unreachable.
                st.session_state["token_embeddings"] = all_embeddings
                st.session_state["embedded_token_strings"] = token_strings
            except Exception as e:
                st.error(f"OpenAI Error: {str(e)}")

    # Optional PCA Visualization — uses the most recently generated embeddings.
    if st.checkbox("🧭 Visualize all embeddings in 2D (PCA)"):
        if "token_embeddings" in st.session_state:
            all_embeddings = st.session_state["token_embeddings"]
            token_strings = st.session_state["embedded_token_strings"]
            # NOTE(review): PCA(n_components=2) needs at least 2 tokens;
            # a single-token input will raise inside scikit-learn.
            pca = PCA(n_components=2)
            reduced = pca.fit_transform(np.array(all_embeddings))
            fig, ax = plt.subplots()
            ax.scatter(reduced[:, 0], reduced[:, 1])
            for i, label in enumerate(token_strings):
                ax.text(reduced[i, 0], reduced[i, 1], label, fontsize=9)
            ax.set_title("Token Embeddings (PCA 2D)")
            st.pyplot(fig)
        else:
            st.info("Generate embeddings first, then enable this view.")
if input_text:
    # ---------- Positional Encoding Section ----------
    st.subheader("πŸ“ Positional Encoding")
    st.markdown("""
Transformers have no built-in notion of order, so **positional encoding** adds a signal to each token to tell the model where it occurs in the sequence.
We use sinusoidal positional encoding similar to what was introduced in the original Transformer paper.
""")

    def get_positional_encoding(seq_len, dim):
        """Return a (seq_len, dim) sinusoidal positional-encoding matrix.

        Even columns hold sine terms and odd columns cosine terms, with
        frequencies following the 10000**(-i/dim) schedule of the original
        Transformer paper ("Attention Is All You Need").
        """
        PE = np.zeros((seq_len, dim))
        for pos in range(seq_len):
            for i in range(0, dim, 2):
                div_term = np.exp(i * -np.log(10000.0) / dim)
                PE[pos, i] = np.sin(pos * div_term)
                if i + 1 < dim:  # guard the cosine column when dim is odd
                    PE[pos, i + 1] = np.cos(pos * div_term)
        return PE

    # FIX: the slider used to be created inside the button branch; dragging
    # it triggered a rerun in which the button read False, so the slider and
    # all output vanished. Keep the slider — and the cheap PE computation —
    # outside the branch so the display survives reruns.
    dim = st.slider("Select positional encoding dimension:", 16, 512, 64, step=16)
    enc = tiktoken.get_encoding(tokenizer_name)
    seq_len = len(enc.encode(input_text))
    PE = get_positional_encoding(seq_len, dim)

    if st.button("πŸŒ€ Generate Positional Encoding"):
        with st.expander("πŸ“ Positional Encoding Matrix"):
            st.write(PE)
            st.caption(f"Shape: {PE.shape}")

    # FIX: the checkbox was also inside the button branch and suffered the
    # same rerun problem; as a top-level widget it now toggles reliably.
    if st.checkbox("πŸ”¬ Show Positional Encoding Heatmap"):
        fig, ax = plt.subplots(figsize=(10, seq_len // 2 + 1))
        sns.heatmap(PE, cmap="coolwarm", cbar=True, ax=ax)
        ax.set_title("Positional Encoding Heatmap")
        st.pyplot(fig)