Hugging Face Spaces app source (recovered from a failing-build paste; the pipe markers and mojibake emoji below are extraction artifacts, not part of the original code).
| import streamlit as st | |
| import numpy as np | |
| import tiktoken | |
| import os | |
| from openai import OpenAI | |
| import matplotlib.pyplot as plt | |
| import seaborn as sns | |
| from sklearn.decomposition import PCA | |
# ---------- App setup ----------
# Page chrome first: Streamlit requires set_page_config to be the first
# st.* call of the script.
st.set_page_config(page_title="LLM Token Explorer", layout="centered")
st.title("π§ LLM Token & Embedding Explorer")

# Read the key once; it is supplied via the environment, never hard-coded.
api_key = os.getenv("OPENAI_API_KEY")
client = OpenAI(api_key=api_key)

# Debug Key Check -- report whether a key was found without leaking it.
st.text(f"OpenAI key found: {'Yes' if api_key else 'No'}")

# ---------- Input Section ----------
st.header("βοΈ Input Text")
st.markdown("Enter any short sentence or phrase you'd like to explore. We'll break it down into tokens and explore their structure and meaning.")
input_text = st.text_area("Enter your text:", height=150)

# ---------- Tokenizer Selection ----------
st.header("π§ Tokenizer Choice")
st.markdown("Choose a tokenizer from the available ones in `tiktoken`. Different models use different tokenization strategies.")
tokenizer_name = st.selectbox("Choose tokenizer:", ["cl100k_base", "p50k_base", "r50k_base", "gpt2"])
if input_text:

    def _tokenize(text, encoding_name):
        """Tokenize *text* with the chosen tiktoken encoding.

        Returns (token_ids, decoded_token_strings). Centralized here because
        Streamlit reruns the whole script on every interaction, so each
        button branch must re-derive the tokens it needs.
        """
        enc = tiktoken.get_encoding(encoding_name)
        token_ids = enc.encode(text)
        return token_ids, [enc.decode([t]) for t in token_ids]

    # ---------- Tokenization Info ----------
    st.subheader("π€ Token Information")
    st.markdown("This shows how your input text is broken down into tokens. Each token is a subword unit that the model processes individually.")

    if st.button("π Show Token Details"):
        tokens, token_strings = _tokenize(input_text, tokenizer_name)
        with st.expander("π§Ύ Token IDs"):
            st.write(tokens)
        with st.expander("π Decoded Tokens"):
            st.write(token_strings)
        st.info(f"Token count: {len(tokens)}")

    if st.button("π Show Token ID Chart"):
        # BUG FIX: re-tokenize here. `tokens`/`token_strings` were only
        # defined in the *other* button's branch; since at most one button
        # is True per rerun, clicking this button alone raised NameError.
        tokens, token_strings = _tokenize(input_text, tokenizer_name)
        fig, ax = plt.subplots()
        ax.bar(range(len(tokens)), tokens, tick_label=token_strings)
        ax.set_xlabel("Token")
        ax.set_ylabel("Token ID")
        ax.set_title("Token IDs for Input Text")
        plt.xticks(rotation=45, ha='right')
        st.pyplot(fig)

    # ---------- Embedding Section ----------
    st.subheader("π Token Embeddings (OpenAI)")
    st.markdown("""
Each token is mapped to a high-dimensional vector called an **embedding**. These vectors capture the contextual meaning of words and are the foundation of how language models understand text.
We use the `text-embedding-ada-002` model from OpenAI to generate embeddings for each token.
""")

    if st.button("π‘ Generate Embeddings"):
        with st.spinner("Generating embedding for each token..."):
            try:
                tokens, token_strings = _tokenize(input_text, tokenizer_name)
                # One batched request instead of one request per token: the
                # embeddings endpoint accepts a list and embeds each string
                # independently, so results are identical with a single
                # network round-trip.
                response = client.embeddings.create(
                    input=token_strings,
                    model="text-embedding-ada-002",
                )
                all_embeddings = [item.embedding for item in response.data]
                # Cache the result: a button is only True on the rerun in
                # which it was clicked, so without this, toggling the PCA
                # checkbox below would make the embeddings vanish.
                st.session_state["token_embeddings"] = (token_strings, all_embeddings)
            except Exception as e:
                st.error(f"OpenAI Error: {str(e)}")

    if "token_embeddings" in st.session_state:
        token_strings, all_embeddings = st.session_state["token_embeddings"]
        for i, (token_text, embedding) in enumerate(zip(token_strings, all_embeddings)):
            with st.expander(f"πΈ Token {i+1}: '{token_text}'"):
                st.write(embedding)
                st.caption(f"Embedding dimension: {len(embedding)}")
                # Embedding Heatmap: one thin row, one cell per dimension.
                fig, ax = plt.subplots(figsize=(8, 1))
                sns.heatmap(np.array(embedding).reshape(1, -1), cmap="viridis", cbar=True, ax=ax)
                ax.set_title("Embedding Heatmap")
                ax.axis('off')
                st.pyplot(fig)
        st.success(f"Successfully generated embeddings for {len(token_strings)} tokens.")
        # Optional PCA Visualization. BUG FIX: this checkbox used to live
        # inside the button branch, so ticking it triggered a rerun on which
        # the button was False and the plot could never appear. It now renders
        # from the cached embeddings on every rerun.
        if st.checkbox("π§ Visualize all embeddings in 2D (PCA)"):
            pca = PCA(n_components=2)
            reduced = pca.fit_transform(np.array(all_embeddings))
            fig, ax = plt.subplots()
            ax.scatter(reduced[:, 0], reduced[:, 1])
            for i, label in enumerate(token_strings):
                ax.text(reduced[i, 0], reduced[i, 1], label, fontsize=9)
            ax.set_title("Token Embeddings (PCA 2D)")
            st.pyplot(fig)

    # ---------- Positional Encoding Section ----------
    st.subheader("π Positional Encoding")
    st.markdown("""
Transformers have no built-in notion of order, so **positional encoding** adds a signal to each token to tell the model where it occurs in the sequence.
We use sinusoidal positional encoding similar to what was introduced in the original Transformer paper.
""")

    # BUG FIX: the slider used to be created *inside* the button branch, so
    # moving it re-ran the script with the button False (section vanished)
    # and the click run always used the slider default. Render it first.
    dim = st.slider("Select positional encoding dimension:", 16, 512, 64, step=16)

    def get_positional_encoding(seq_len, dim):
        """Sinusoidal positional encoding (Vaswani et al., 2017).

        PE[pos, 2i]   = sin(pos / 10000^(2i/dim))
        PE[pos, 2i+1] = cos(pos / 10000^(2i/dim))

        Returns a numpy array of shape (seq_len, dim); the odd-column guard
        handles odd `dim` (cannot occur with the 16-step slider, kept for
        safety if the helper is reused).
        """
        PE = np.zeros((seq_len, dim))
        for pos in range(seq_len):
            for i in range(0, dim, 2):
                div_term = np.exp(i * -np.log(10000.0) / dim)
                PE[pos, i] = np.sin(pos * div_term)
                if i + 1 < dim:
                    PE[pos, i + 1] = np.cos(pos * div_term)
        return PE

    if st.button("π Generate Positional Encoding"):
        tokens, _ = _tokenize(input_text, tokenizer_name)
        # Cache so the heatmap checkbox below survives reruns (same rerun
        # pitfall as the PCA checkbox above).
        st.session_state["pos_encoding"] = get_positional_encoding(len(tokens), dim)

    if "pos_encoding" in st.session_state:
        PE = st.session_state["pos_encoding"]
        with st.expander("π Positional Encoding Matrix"):
            st.write(PE)
            st.caption(f"Shape: {PE.shape}")
        if st.checkbox("π¬ Show Positional Encoding Heatmap"):
            seq_len = PE.shape[0]
            fig, ax = plt.subplots(figsize=(10, seq_len // 2 + 1))
            sns.heatmap(PE, cmap="coolwarm", cbar=True, ax=ax)
            ax.set_title("Positional Encoding Heatmap")
            st.pyplot(fig)