File size: 6,061 Bytes
2a5ba2a
f8b7cc9
 
 
81a22fa
4329495
 
 
81a22fa
f8b7cc9
 
ef2d6c6
f8b7cc9
584603b
f8b7cc9
4329495
3da4ee6
 
ef2d6c6
 
 
f8b7cc9
 
ef2d6c6
 
 
f8b7cc9
 
 
ef2d6c6
 
 
 
 
f8b7cc9
 
 
 
ef2d6c6
f8b7cc9
ef2d6c6
 
f8b7cc9
ef2d6c6
f8b7cc9
 
4329495
 
 
 
 
 
 
 
 
ef2d6c6
 
 
 
 
 
 
4329495
ef2d6c6
 
f8b7cc9
ef2d6c6
 
 
 
4329495
 
ef2d6c6
 
 
 
 
 
4329495
ef2d6c6
 
 
 
 
4329495
 
 
 
 
 
 
ef2d6c6
 
4329495
 
 
 
 
 
 
 
 
 
 
f8b7cc9
 
 
ef2d6c6
 
 
 
 
 
 
4329495
ef2d6c6
f8b7cc9
 
 
ef2d6c6
f8b7cc9
 
 
 
 
 
 
 
 
 
2a5ba2a
f8b7cc9
ef2d6c6
 
f8b7cc9
ef2d6c6
4329495
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
import streamlit as st
import numpy as np
import tiktoken
import os
from openai import OpenAI
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.decomposition import PCA

# Setup
st.set_page_config(page_title="LLM Token Explorer", layout="centered")
st.title("🧠 LLM Token & Embedding Explorer")

# Read the key once.  The OpenAI() constructor raises when it resolves no
# API key, which previously crashed the app at startup — before the debug
# line below could ever report "No".  Keep the client as None instead and
# let the API-calling sections surface the problem at call time.
_api_key = os.getenv("OPENAI_API_KEY")
client = OpenAI(api_key=_api_key) if _api_key else None

# Debug Key Check
st.text(f"OpenAI key found: {'Yes' if _api_key else 'No'}")

# ---------- Input Section ----------
st.header("✍️ Input Text")
st.markdown("Enter any short sentence or phrase you'd like to explore. We'll break it down into tokens and explore their structure and meaning.")
input_text = st.text_area("Enter your text:", height=150)

# ---------- Tokenizer Selection ----------
st.header("🔧 Tokenizer Choice")
st.markdown("Choose a tokenizer from the available ones in `tiktoken`. Different models use different tokenization strategies.")
tokenizer_name = st.selectbox("Choose tokenizer:", ["cl100k_base", "p50k_base", "r50k_base", "gpt2"])

if input_text:
    # ---------- Tokenization Info ----------
    st.subheader("🔤 Token Information")
    st.markdown("This shows how your input text is broken down into tokens. Each token is a subword unit that the model processes individually.")

    # Checkboxes, not buttons: a button is True only during the single rerun
    # triggered by its own click, so a second button nested inside the first
    # could never be reached (its click reruns the script and the outer
    # button reads False again).  Checkbox state persists across reruns.
    if st.checkbox("🔍 Show Token Details"):
        enc = tiktoken.get_encoding(tokenizer_name)
        tokens = enc.encode(input_text)
        # Decode each token id individually to show the subword it maps to.
        token_strings = [enc.decode([t]) for t in tokens]

        with st.expander("🧾 Token IDs"):
            st.write(tokens)

        with st.expander("📖 Decoded Tokens"):
            st.write(token_strings)

        st.info(f"Token count: {len(tokens)}")

        if st.checkbox("📊 Show Token ID Chart"):
            fig, ax = plt.subplots()
            ax.bar(range(len(tokens)), tokens, tick_label=token_strings)
            ax.set_xlabel("Token")
            ax.set_ylabel("Token ID")
            ax.set_title("Token IDs for Input Text")
            plt.xticks(rotation=45, ha='right')
            st.pyplot(fig)

    # ---------- Embedding Section ----------
    st.subheader("🔗 Token Embeddings (OpenAI)")
    st.markdown("""
    Each token is mapped to a high-dimensional vector called an **embedding**. These vectors capture the contextual meaning of words and are the foundation of how language models understand text.
    
    We use the `text-embedding-ada-002` model from OpenAI to generate embeddings for each token.
    """)

    if st.button("📡 Generate Embeddings"):
        with st.spinner("Generating embedding for each token..."):
            try:
                if client is None:
                    st.error("OpenAI Error: no API key configured (set OPENAI_API_KEY).")
                else:
                    enc = tiktoken.get_encoding(tokenizer_name)
                    tokens = enc.encode(input_text)
                    token_strings = [enc.decode([t]) for t in tokens]

                    # One batched request: the embeddings endpoint accepts a
                    # list of inputs and returns vectors in the same order,
                    # so N per-token round trips collapse into a single call.
                    response = client.embeddings.create(
                        input=token_strings,
                        model="text-embedding-ada-002"
                    )
                    all_embeddings = [item.embedding for item in response.data]

                    # Persist across reruns: a button reads True only on the
                    # click's own rerun, so any widget below (expanders are
                    # fine, but the PCA checkbox is not) would otherwise lose
                    # the data the moment it is interacted with.
                    st.session_state["token_embeddings"] = (token_strings, all_embeddings)
                    st.success(f"Successfully generated embeddings for {len(token_strings)} tokens.")
            except Exception as e:
                st.error(f"OpenAI Error: {str(e)}")

    # Render from session state so the display (and the PCA toggle) survives
    # reruns after the generate button has gone back to False.
    if "token_embeddings" in st.session_state:
        token_strings, all_embeddings = st.session_state["token_embeddings"]

        for i, (token_text, embedding) in enumerate(zip(token_strings, all_embeddings)):
            with st.expander(f"🔸 Token {i+1}: '{token_text}'"):
                st.write(embedding)
                st.caption(f"Embedding dimension: {len(embedding)}")

                # Embedding Heatmap — one row, one cell per dimension.
                fig, ax = plt.subplots(figsize=(8, 1))
                sns.heatmap(np.array(embedding).reshape(1, -1), cmap="viridis", cbar=True, ax=ax)
                ax.set_title("Embedding Heatmap")
                ax.axis('off')
                st.pyplot(fig)

        # Optional PCA Visualization (now outside the button gate, so the
        # checkbox actually works).
        if st.checkbox("🧭 Visualize all embeddings in 2D (PCA)"):
            pca = PCA(n_components=2)
            reduced = pca.fit_transform(np.array(all_embeddings))
            fig, ax = plt.subplots()
            ax.scatter(reduced[:, 0], reduced[:, 1])
            for i, label in enumerate(token_strings):
                ax.text(reduced[i, 0], reduced[i, 1], label, fontsize=9)
            ax.set_title("Token Embeddings (PCA 2D)")
            st.pyplot(fig)

    # ---------- Positional Encoding Section ----------
    st.subheader("📍 Positional Encoding")
    st.markdown("""
    Transformers have no built-in notion of order, so **positional encoding** adds a signal to each token to tell the model where it occurs in the sequence.
    
    We use sinusoidal positional encoding similar to what was introduced in the original Transformer paper.
    """)

    # The slider must live OUTSIDE any click-gated block: a widget created
    # inside `if st.button(...)` vanishes on the very next rerun — including
    # the rerun caused by moving the slider itself — taking the output with
    # it.  A checkbox gate keeps its state across reruns.
    dim = st.slider("Select positional encoding dimension:", 16, 512, 64, step=16)

    if st.checkbox("🌀 Generate Positional Encoding"):
        enc = tiktoken.get_encoding(tokenizer_name)
        seq_len = len(enc.encode(input_text))

        def get_positional_encoding(seq_len, dim):
            """Sinusoidal positional encoding (Vaswani et al., 2017), vectorized.

            Returns an array of shape (seq_len, dim): even columns hold
            sin(pos / 10000^(i/dim)), odd columns the matching cos — the
            same values as the naive double loop, computed in one pass.
            """
            pos = np.arange(seq_len)[:, None]              # (seq_len, 1)
            i = np.arange(0, dim, 2)                       # even dimension indices
            div_term = np.exp(i * -np.log(10000.0) / dim)  # (ceil(dim/2),)
            PE = np.zeros((seq_len, dim))
            PE[:, 0::2] = np.sin(pos * div_term)
            # Odd dim would leave one fewer cos column; slice guards that case.
            PE[:, 1::2] = np.cos(pos * div_term[: dim // 2])
            return PE

        PE = get_positional_encoding(seq_len, dim)

        with st.expander("📍 Positional Encoding Matrix"):
            st.write(PE)
            st.caption(f"Shape: {PE.shape}")

        if st.checkbox("🔬 Show Positional Encoding Heatmap"):
            fig, ax = plt.subplots(figsize=(10, seq_len // 2 + 1))
            sns.heatmap(PE, cmap="coolwarm", cbar=True, ax=ax)
            ax.set_title("Positional Encoding Heatmap")
            st.pyplot(fig)