import streamlit as st
import numpy as np
import tiktoken
import os
from openai import OpenAI
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.decomposition import PCA
# Setup
# set_page_config must be the FIRST Streamlit command executed in the script;
# calling any other st.* function before it raises a StreamlitAPIException.
st.set_page_config(page_title="LLM Token Explorer", layout="centered")
st.title("π§ LLM Token & Embedding Explorer")
# OpenAI client; api_key is None when OPENAI_API_KEY is unset, in which case
# every embeddings call below will fail (surfaced by the debug line and the
# try/except around the API calls).
client = OpenAI(api_key=os.getenv("OPENAI_API_KEY"))
# Debug Key Check — reports only presence/absence of the key, never its value.
st.text(f"OpenAI key found: {'Yes' if os.getenv('OPENAI_API_KEY') else 'No'}")
# ---------- Input Section ----------
# Free-text prompt that drives every section below; empty input disables them.
_INPUT_BLURB = "Enter any short sentence or phrase you'd like to explore. We'll break it down into tokens and explore their structure and meaning."
st.header("βοΈ Input Text")
st.markdown(_INPUT_BLURB)
input_text = st.text_area("Enter your text:", height=150)

# ---------- Tokenizer Selection ----------
# Encodings bundled with tiktoken; each corresponds to a family of OpenAI models.
_TOKENIZER_CHOICES = ["cl100k_base", "p50k_base", "r50k_base", "gpt2"]
st.header("π§ Tokenizer Choice")
st.markdown("Choose a tokenizer from the available ones in `tiktoken`. Different models use different tokenization strategies.")
tokenizer_name = st.selectbox("Choose tokenizer:", _TOKENIZER_CHOICES)
if input_text:
    # ---------- Tokenization Info ----------
    st.subheader("π€ Token Information")
    st.markdown("This shows how your input text is broken down into tokens. Each token is a subword unit that the model processes individually.")
    # Tokenize up front. Streamlit reruns the whole script on every widget
    # interaction, and each st.button() is True only on the run triggered by
    # its own click. The previous version defined `tokens`/`token_strings`
    # inside the first button's branch, so clicking the chart button alone
    # raised a NameError. Encoding is cheap and purely local, so doing it
    # unconditionally is the simplest correct fix.
    enc = tiktoken.get_encoding(tokenizer_name)
    tokens = enc.encode(input_text)
    token_strings = [enc.decode([t]) for t in tokens]
    if st.button("π Show Token Details"):
        with st.expander("π§Ύ Token IDs"):
            st.write(tokens)
        with st.expander("π Decoded Tokens"):
            st.write(token_strings)
        st.info(f"Token count: {len(tokens)}")
    if st.button("π Show Token ID Chart"):
        # Bar chart of raw token IDs, x-axis labelled with the decoded text.
        fig, ax = plt.subplots()
        ax.bar(range(len(tokens)), tokens, tick_label=token_strings)
        ax.set_xlabel("Token")
        ax.set_ylabel("Token ID")
        ax.set_title("Token IDs for Input Text")
        plt.xticks(rotation=45, ha='right')
        st.pyplot(fig)
# ---------- Embedding Section ----------
st.subheader("π Token Embeddings (OpenAI)")
st.markdown("""
Each token is mapped to a high-dimensional vector called an **embedding**. These vectors capture the contextual meaning of words and are the foundation of how language models understand text.
We use the `text-embedding-ada-002` model from OpenAI to generate embeddings for each token.
""")
if st.button("π‘ Generate Embeddings"):
with st.spinner("Generating embedding for each token..."):
try:
enc = tiktoken.get_encoding(tokenizer_name)
tokens = enc.encode(input_text)
token_strings = [enc.decode([t]) for t in tokens]
all_embeddings = []
for i, token_text in enumerate(token_strings):
response = client.embeddings.create(
input=[token_text],
model="text-embedding-ada-002"
)
embedding = response.data[0].embedding
all_embeddings.append(embedding)
with st.expander(f"πΈ Token {i+1}: '{token_text}'"):
st.write(embedding)
st.caption(f"Embedding dimension: {len(embedding)}")
# Embedding Heatmap
fig, ax = plt.subplots(figsize=(8, 1))
sns.heatmap(np.array(embedding).reshape(1, -1), cmap="viridis", cbar=True, ax=ax)
ax.set_title("Embedding Heatmap")
ax.axis('off')
st.pyplot(fig)
st.success(f"Successfully generated embeddings for {len(token_strings)} tokens.")
# Optional PCA Visualization
if st.checkbox("π§ Visualize all embeddings in 2D (PCA)"):
pca = PCA(n_components=2)
reduced = pca.fit_transform(np.array(all_embeddings))
fig, ax = plt.subplots()
ax.scatter(reduced[:, 0], reduced[:, 1])
for i, label in enumerate(token_strings):
ax.text(reduced[i, 0], reduced[i, 1], label, fontsize=9)
ax.set_title("Token Embeddings (PCA 2D)")
st.pyplot(fig)
except Exception as e:
st.error(f"OpenAI Error: {str(e)}")
# ---------- Positional Encoding Section ----------
st.subheader("π Positional Encoding")
st.markdown("""
Transformers have no built-in notion of order, so **positional encoding** adds a signal to each token to tell the model where it occurs in the sequence.
We use sinusoidal positional encoding similar to what was introduced in the original Transformer paper.
""")
if st.button("π Generate Positional Encoding"):
enc = tiktoken.get_encoding(tokenizer_name)
tokens = enc.encode(input_text)
seq_len = len(tokens)
dim = st.slider("Select positional encoding dimension:", 16, 512, 64, step=16)
def get_positional_encoding(seq_len, dim):
PE = np.zeros((seq_len, dim))
for pos in range(seq_len):
for i in range(0, dim, 2):
div_term = np.exp(i * -np.log(10000.0) / dim)
PE[pos, i] = np.sin(pos * div_term)
if i+1 < dim:
PE[pos, i+1] = np.cos(pos * div_term)
return PE
PE = get_positional_encoding(seq_len, dim)
with st.expander("π Positional Encoding Matrix"):
st.write(PE)
st.caption(f"Shape: {PE.shape}")
if st.checkbox("π¬ Show Positional Encoding Heatmap"):
fig, ax = plt.subplots(figsize=(10, seq_len // 2 + 1))
sns.heatmap(PE, cmap="coolwarm", cbar=True, ax=ax)
ax.set_title("Positional Encoding Heatmap")
st.pyplot(fig)
|