Update src/streamlit_app.py

src/streamlit_app.py (+35 -41)
@@ -30,28 +30,27 @@ if input_text:
     # ---------- Tokenization Info ----------
     st.subheader("🔤 Token Information")
     st.markdown("This shows how your input text is broken down into tokens. Each token is a subword unit that the model processes individually.")
-
-    if st.button("🔍 Show Token Details"):
-        enc = tiktoken.get_encoding(tokenizer_name)
-        tokens = enc.encode(input_text)
-        token_strings = [enc.decode([t]) for t in tokens]
 
-        # … (deleted lines not captured in the page)
+    enc = tiktoken.get_encoding(tokenizer_name)
+    tokens = enc.encode(input_text)
+    token_strings = [enc.decode([t]) for t in tokens]
+
+    with st.expander("🧾 Token IDs", expanded=True):
+        st.write(tokens)
 
-        # …
+    with st.expander("📜 Decoded Tokens", expanded=True):
+        st.write(token_strings)
 
-        # …
+    st.info(f"Token count: {len(tokens)}")
 
-        # …
+    # ✅ Always show token ID chart
+    fig, ax = plt.subplots()
+    ax.bar(range(len(tokens)), tokens, tick_label=token_strings)
+    ax.set_xlabel("Token")
+    ax.set_ylabel("Token ID")
+    ax.set_title("Token IDs for Input Text")
+    plt.xticks(rotation=45, ha='right')
+    st.pyplot(fig)
 
     # ---------- Embedding Section ----------
     st.subheader("🔗 Token Embeddings (OpenAI)")
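
Note: the hunk above reads tokenizer_name and input_text, which are defined before line 30 and therefore outside this diff. A minimal sketch of the kind of setup it assumes — the widget labels and encoding list below are guesses for illustration, not the file's actual code:

import matplotlib.pyplot as plt  # the hunk's chart code needs this
import streamlit as st
import tiktoken

# Hypothetical widgets; the real app defines these somewhere in lines 1-29.
tokenizer_name = st.selectbox("Choose a tokenizer:", ["cl100k_base", "p50k_base", "gpt2"])
input_text = st.text_area("Enter text to analyze:")
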
@@ -64,10 +63,6 @@ if input_text:
     if st.button("📡 Generate Embeddings"):
         with st.spinner("Generating embedding for each token..."):
             try:
-                enc = tiktoken.get_encoding(tokenizer_name)
-                tokens = enc.encode(input_text)
-                token_strings = [enc.decode([t]) for t in tokens]
-
                 all_embeddings = []
 
                 for i, token_text in enumerate(token_strings):
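
The request that fills `response` sits between this hunk and the next, so it is not visible here. With the openai>=1.0 Python client, a per-token call consistent with the response.data[0].embedding access below would plausibly look like this; the client variable and the model name are assumptions, not taken from the file:

from openai import OpenAI

client = OpenAI()  # reads OPENAI_API_KEY from the environment

# Inside the loop over token_strings; "text-embedding-3-small" is an assumed choice.
response = client.embeddings.create(
    model="text-embedding-3-small",
    input=token_text,
)
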
@@ -78,7 +73,7 @@ if input_text:
                     embedding = response.data[0].embedding
                     all_embeddings.append(embedding)
 
-                    with st.expander(f"🔸 Token {i+1}: '{token_text}'"):
+                    with st.expander(f"🔸 Token {i+1}: '{token_text}'", expanded=True):
                         st.write(embedding)
                         st.caption(f"Embedding dimension: {len(embedding)}")
 
@@ -91,16 +86,16 @@ if input_text:
 
                 st.success(f"Successfully generated embeddings for {len(token_strings)} tokens.")
 
-                # … (deleted lines not captured in the page)
+                # ✅ PCA Visualization ON by default
+                st.subheader("🧠 Token Embeddings in 2D (PCA)")
+                pca = PCA(n_components=2)
+                reduced = pca.fit_transform(np.array(all_embeddings))
+                fig, ax = plt.subplots()
+                ax.scatter(reduced[:, 0], reduced[:, 1])
+                for i, label in enumerate(token_strings):
+                    ax.text(reduced[i, 0], reduced[i, 1], label, fontsize=9)
+                ax.set_title("Token Embeddings (PCA 2D)")
+                st.pyplot(fig)
 
             except Exception as e:
                 st.error(f"OpenAI Error: {str(e)}")
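
One review caveat on the PCA block above: sklearn's PCA(n_components=2) raises a ValueError when it receives fewer than two samples, so a single-token input would crash this section. A small guard, not part of the commit, avoids that; it reuses the hunk's own names (PCA, np, all_embeddings, st):

if len(all_embeddings) >= 2:
    reduced = PCA(n_components=2).fit_transform(np.array(all_embeddings))
else:
    st.warning("Need at least two tokens for a 2D PCA plot.")
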
@@ -114,8 +109,6 @@ if input_text:
     """)
 
     if st.button("📍 Generate Positional Encoding"):
-        enc = tiktoken.get_encoding(tokenizer_name)
-        tokens = enc.encode(input_text)
         seq_len = len(tokens)
         dim = st.slider("Select positional encoding dimension:", 16, 512, 64, step=16)
 
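
A second review note on the hunk above: Streamlit reruns the whole script on every widget interaction, and st.button returns True only on the run triggered by the click, so the st.slider nested inside this button branch disappears (and discards its value) as soon as the user drags it. A common fix, not part of this commit, is to hoist the slider out of the branch, keeping the hunk's own names:

dim = st.slider("Select positional encoding dimension:", 16, 512, 64, step=16)
if st.button("📍 Generate Positional Encoding"):
    seq_len = len(tokens)
    PE = get_positional_encoding(seq_len, dim)
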
@@ -131,12 +124,13 @@ if input_text:
 
         PE = get_positional_encoding(seq_len, dim)
 
-        with st.expander("📊 Positional Encoding Matrix"):
+        with st.expander("📊 Positional Encoding Matrix", expanded=True):
             st.write(PE)
             st.caption(f"Shape: {PE.shape}")
 
-        # … (deleted lines not captured in the page)
+        # ✅ Default show heatmap ON
+        st.subheader("🔬 Positional Encoding Heatmap")
+        fig, ax = plt.subplots(figsize=(10, seq_len // 2 + 1))
+        sns.heatmap(PE, cmap="coolwarm", cbar=True, ax=ax)
+        ax.set_title("Positional Encoding Heatmap")
+        st.pyplot(fig)
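
get_positional_encoding itself is defined between the last two hunks and is not shown in this diff. A sketch of the standard sinusoidal formulation, consistent with the call site PE = get_positional_encoding(seq_len, dim) and the PE.shape and heatmap usage above — an assumption about the app's actual definition, not a copy of it:

import numpy as np

def get_positional_encoding(seq_len, dim):
    # PE[pos, 2i] = sin(pos / 10000^(2i/dim)); PE[pos, 2i+1] = cos(same angle).
    # Assumes dim is even, which the slider (16..512, step 16) guarantees.
    positions = np.arange(seq_len)[:, None]      # shape (seq_len, 1)
    even_dims = np.arange(0, dim, 2)[None, :]    # shape (1, dim // 2)
    angles = positions / np.power(10000.0, even_dims / dim)
    PE = np.zeros((seq_len, dim))
    PE[:, 0::2] = np.sin(angles)
    PE[:, 1::2] = np.cos(angles)
    return PE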