llm-token-explorer / src /streamlit_app.py
schoginitoys's picture
Update src/streamlit_app.py
4329495 verified
raw
history blame
6.06 kB
import streamlit as st
import numpy as np
import tiktoken
import os
from openai import OpenAI
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.decomposition import PCA
# ---------- App setup ----------
# Page chrome plus the OpenAI client used later for embedding calls.
st.set_page_config(page_title="LLM Token Explorer", layout="centered")
st.title("🧠 LLM Token & Embedding Explorer")

# NOTE(review): the client is constructed even when the key is absent; the
# embeddings request will then fail at call time with an auth error.
api_key = os.getenv("OPENAI_API_KEY")
client = OpenAI(api_key=api_key)

# Debug aid: surface whether the key was picked up from the environment.
st.text(f"OpenAI key found: {'Yes' if api_key else 'No'}")

# ---------- Input Section ----------
st.header("✍️ Input Text")
st.markdown("Enter any short sentence or phrase you'd like to explore. We'll break it down into tokens and explore their structure and meaning.")
input_text = st.text_area("Enter your text:", height=150)

# ---------- Tokenizer Selection ----------
st.header("πŸ”§ Tokenizer Choice")
st.markdown("Choose a tokenizer from the available ones in `tiktoken`. Different models use different tokenization strategies.")
TOKENIZER_OPTIONS = ["cl100k_base", "p50k_base", "r50k_base", "gpt2"]
tokenizer_name = st.selectbox("Choose tokenizer:", TOKENIZER_OPTIONS)
if input_text:
    # ---------- Tokenization Info ----------
    st.subheader("πŸ”€ Token Information")
    st.markdown("This shows how your input text is broken down into tokens. Each token is a subword unit that the model processes individually.")

    # Tokenize once, up front, so both buttons below share the same result.
    # FIX: previously `tokens`/`token_strings` were assigned only inside the
    # "Show Token Details" branch. Streamlit reruns the whole script on each
    # interaction and at most one button returns True per run, so clicking
    # the chart button on its own raised NameError.
    enc = tiktoken.get_encoding(tokenizer_name)
    tokens = enc.encode(input_text)
    token_strings = [enc.decode([t]) for t in tokens]

    if st.button("πŸ” Show Token Details"):
        with st.expander("🧾 Token IDs"):
            st.write(tokens)
        with st.expander("πŸ“– Decoded Tokens"):
            st.write(token_strings)
        st.info(f"Token count: {len(tokens)}")

    if st.button("πŸ“Š Show Token ID Chart"):
        # Bar chart of raw token ids, one bar per token, labelled with the
        # decoded token text.
        fig, ax = plt.subplots()
        ax.bar(range(len(tokens)), tokens, tick_label=token_strings)
        ax.set_xlabel("Token")
        ax.set_ylabel("Token ID")
        ax.set_title("Token IDs for Input Text")
        plt.xticks(rotation=45, ha='right')
        st.pyplot(fig)
if input_text:
    # ---------- Embedding Section ----------
    st.subheader("πŸ”— Token Embeddings (OpenAI)")
    st.markdown("""
Each token is mapped to a high-dimensional vector called an **embedding**. These vectors capture the contextual meaning of words and are the foundation of how language models understand text.
We use the `text-embedding-ada-002` model from OpenAI to generate embeddings for each token.
""")

    if st.button("πŸ“‘ Generate Embeddings"):
        with st.spinner("Generating embedding for each token..."):
            try:
                # Re-tokenize here so this section works standalone on any
                # rerun, independent of the other sections.
                enc = tiktoken.get_encoding(tokenizer_name)
                tokens = enc.encode(input_text)
                token_strings = [enc.decode([t]) for t in tokens]

                all_embeddings = []
                for i, token_text in enumerate(token_strings):
                    # One API call per token: each token is embedded in
                    # isolation, not contextually within the sentence.
                    response = client.embeddings.create(
                        input=[token_text],
                        model="text-embedding-ada-002"
                    )
                    embedding = response.data[0].embedding
                    all_embeddings.append(embedding)
                    with st.expander(f"πŸ”Έ Token {i+1}: '{token_text}'"):
                        st.write(embedding)
                        st.caption(f"Embedding dimension: {len(embedding)}")
                        # Embedding Heatmap: one row, one column per dimension.
                        fig, ax = plt.subplots(figsize=(8, 1))
                        sns.heatmap(np.array(embedding).reshape(1, -1), cmap="viridis", cbar=True, ax=ax)
                        ax.set_title("Embedding Heatmap")
                        ax.axis('off')
                        st.pyplot(fig)

                st.success(f"Successfully generated embeddings for {len(token_strings)} tokens.")

                # FIX: persist the results across reruns. The PCA checkbox
                # below used to live inside this button branch; toggling it
                # reruns the script, the button reads False again, and the
                # branch (and `all_embeddings`) no longer existed — the PCA
                # view was unreachable.
                st.session_state["token_embeddings"] = all_embeddings
                st.session_state["embedded_token_strings"] = token_strings
            except Exception as e:
                st.error(f"OpenAI Error: {str(e)}")

    # Optional PCA Visualization — uses the most recently generated embeddings.
    if st.checkbox("🧭 Visualize all embeddings in 2D (PCA)"):
        if "token_embeddings" in st.session_state:
            all_embeddings = st.session_state["token_embeddings"]
            token_strings = st.session_state["embedded_token_strings"]
            # NOTE(review): PCA(n_components=2) needs at least 2 tokens;
            # a single-token input will raise inside scikit-learn.
            pca = PCA(n_components=2)
            reduced = pca.fit_transform(np.array(all_embeddings))
            fig, ax = plt.subplots()
            ax.scatter(reduced[:, 0], reduced[:, 1])
            for i, label in enumerate(token_strings):
                ax.text(reduced[i, 0], reduced[i, 1], label, fontsize=9)
            ax.set_title("Token Embeddings (PCA 2D)")
            st.pyplot(fig)
        else:
            st.info("Generate embeddings first, then enable this view.")
if input_text:
    # ---------- Positional Encoding Section ----------
    st.subheader("πŸ“ Positional Encoding")
    st.markdown("""
Transformers have no built-in notion of order, so **positional encoding** adds a signal to each token to tell the model where it occurs in the sequence.
We use sinusoidal positional encoding similar to what was introduced in the original Transformer paper.
""")

    def get_positional_encoding(seq_len, dim):
        """Return a (seq_len, dim) sinusoidal positional-encoding matrix.

        Even columns hold sine terms and odd columns cosine terms, with
        frequencies following the 10000**(-i/dim) schedule of the original
        Transformer paper ("Attention Is All You Need").
        """
        PE = np.zeros((seq_len, dim))
        for pos in range(seq_len):
            for i in range(0, dim, 2):
                div_term = np.exp(i * -np.log(10000.0) / dim)
                PE[pos, i] = np.sin(pos * div_term)
                if i + 1 < dim:  # guard the cosine column when dim is odd
                    PE[pos, i + 1] = np.cos(pos * div_term)
        return PE

    # FIX: the slider used to be created inside the button branch; dragging
    # it triggered a rerun in which the button read False, so the slider and
    # all output vanished. Keep the slider — and the cheap PE computation —
    # outside the branch so the display survives reruns.
    dim = st.slider("Select positional encoding dimension:", 16, 512, 64, step=16)
    enc = tiktoken.get_encoding(tokenizer_name)
    seq_len = len(enc.encode(input_text))
    PE = get_positional_encoding(seq_len, dim)

    if st.button("πŸŒ€ Generate Positional Encoding"):
        with st.expander("πŸ“ Positional Encoding Matrix"):
            st.write(PE)
            st.caption(f"Shape: {PE.shape}")

    # FIX: the checkbox was also inside the button branch and suffered the
    # same rerun problem; as a top-level widget it now toggles reliably.
    if st.checkbox("πŸ”¬ Show Positional Encoding Heatmap"):
        fig, ax = plt.subplots(figsize=(10, seq_len // 2 + 1))
        sns.heatmap(PE, cmap="coolwarm", cbar=True, ax=ax)
        ax.set_title("Positional Encoding Heatmap")
        st.pyplot(fig)