schoginitoys committed on
Commit
ef2d6c6
·
verified ·
1 Parent(s): bea0fbe

Update src/streamlit_app.py

Browse files
Files changed (1) hide show
  1. src/streamlit_app.py +58 -34
src/streamlit_app.py CHANGED
@@ -1,66 +1,89 @@
1
  import streamlit as st
2
  import numpy as np
3
  import tiktoken
4
- # import openai
5
  import os
6
-
7
  from openai import OpenAI
 
8
 
9
  # Setup
10
  st.set_page_config(page_title="LLM Token Explorer", layout="centered")
11
- st.title("LLM Token & Embedding Explorer")
12
 
13
- # OpenAI key from environment
14
- # openai.api_key = os.getenv("OPENAI_API_KEY")
15
-
16
- from dotenv import load_dotenv
17
  load_dotenv()
 
18
 
19
- client = OpenAI() # Uses OPENAI_API_KEY from env automatically
20
-
21
-
22
-
23
- # Input text
24
  input_text = st.text_area("Enter your text:", height=150)
25
 
26
- # Tokenizer selection
 
 
27
  tokenizer_name = st.selectbox("Choose tokenizer:", ["cl100k_base", "p50k_base", "r50k_base", "gpt2"])
28
 
29
  if input_text:
30
- # Tokenization
31
- if st.button("General Token Info"):
 
 
 
32
  enc = tiktoken.get_encoding(tokenizer_name)
33
  tokens = enc.encode(input_text)
34
  token_strings = [enc.decode([t]) for t in tokens]
35
 
36
- with st.expander("Token IDs"):
37
  st.write(tokens)
38
- with st.expander("Decoded Tokens"):
 
39
  st.write(token_strings)
 
40
  st.info(f"Token count: {len(tokens)}")
41
 
42
- # OpenAI Embedding
43
- if st.button("Generate Embedding using OpenAI"):
44
- with st.spinner("Calling OpenAI..."):
 
 
 
 
 
 
 
45
  try:
46
- response = client.embeddings.create(
47
- input=[input_text],
48
- model="text-embedding-ada-002"
49
- )
50
- embedding = response.data[0].embedding
51
-
52
- with st.expander("Embedding Vector"):
53
- st.write(embedding)
54
- st.info(f"Embedding dimension: {len(embedding)}")
 
 
 
 
 
 
 
 
55
  except Exception as e:
56
  st.error(f"OpenAI Error: {str(e)}")
57
 
58
- # Positional Encoding
59
- if st.button("Generate Positional Encoding"):
 
 
 
 
 
 
 
60
  enc = tiktoken.get_encoding(tokenizer_name)
61
  tokens = enc.encode(input_text)
62
  seq_len = len(tokens)
63
- dim = st.slider("Encoding dimension:", 16, 512, 64, step=16)
64
 
65
  def get_positional_encoding(seq_len, dim):
66
  PE = np.zeros((seq_len, dim))
@@ -73,6 +96,7 @@ if input_text:
73
  return PE
74
 
75
  PE = get_positional_encoding(seq_len, dim)
76
- with st.expander("Positional Encoding Matrix"):
 
77
  st.write(PE)
78
- st.info(f"Shape: {PE.shape}")
 
1
  import streamlit as st
2
  import numpy as np
3
  import tiktoken
 
4
  import os
 
5
  from openai import OpenAI
6
+ from dotenv import load_dotenv
7
 
8
  # Setup
9
  st.set_page_config(page_title="LLM Token Explorer", layout="centered")
10
+ st.title("🧠 LLM Token & Embedding Explorer")
11
 
 
 
 
 
12
  load_dotenv()
13
+ client = OpenAI() # Automatically uses OPENAI_API_KEY from .env
14
 
15
+ # ---------- Input Section ----------
16
+ st.header("✍️ Input Text")
17
+ st.markdown("Enter any short sentence or phrase you'd like to explore. We'll break it down into tokens and explore their structure and meaning.")
 
 
18
  input_text = st.text_area("Enter your text:", height=150)
19
 
20
+ # ---------- Tokenizer Selection ----------
21
+ st.header("πŸ”§ Tokenizer Choice")
22
+ st.markdown("Choose a tokenizer from the available ones in `tiktoken`. Different models use different tokenization strategies.")
23
  tokenizer_name = st.selectbox("Choose tokenizer:", ["cl100k_base", "p50k_base", "r50k_base", "gpt2"])
24
 
25
  if input_text:
26
+ # ---------- Tokenization Info ----------
27
+ st.subheader("πŸ”€ Token Information")
28
+ st.markdown("This shows how your input text is broken down into tokens. Each token is a subword unit that the model processes individually.")
29
+
30
+ if st.button("πŸ” Show Token Details"):
31
  enc = tiktoken.get_encoding(tokenizer_name)
32
  tokens = enc.encode(input_text)
33
  token_strings = [enc.decode([t]) for t in tokens]
34
 
35
+ with st.expander("🧾 Token IDs"):
36
  st.write(tokens)
37
+
38
+ with st.expander("πŸ“– Decoded Tokens"):
39
  st.write(token_strings)
40
+
41
  st.info(f"Token count: {len(tokens)}")
42
 
43
+ # ---------- Embedding Section ----------
44
+ st.subheader("πŸ”— Token Embeddings (OpenAI)")
45
+ st.markdown("""
46
+ Each token is mapped to a high-dimensional vector called an **embedding**. These vectors capture the contextual meaning of words and are the foundation of how language models understand text.
47
+
48
+ We use the `text-embedding-ada-002` model from OpenAI to generate embeddings for each token.
49
+ """)
50
+
51
+ if st.button("πŸ“‘ Generate Embeddings"):
52
+ with st.spinner("Generating embedding for each token..."):
53
  try:
54
+ enc = tiktoken.get_encoding(tokenizer_name)
55
+ tokens = enc.encode(input_text)
56
+ token_strings = [enc.decode([t]) for t in tokens]
57
+
58
+ for i, token_text in enumerate(token_strings):
59
+ response = client.embeddings.create(
60
+ input=[token_text],
61
+ model="text-embedding-ada-002"
62
+ )
63
+ embedding = response.data[0].embedding
64
+
65
+ with st.expander(f"πŸ”Έ Token {i+1}: '{token_text}'"):
66
+ st.write(embedding)
67
+ st.caption(f"Embedding dimension: {len(embedding)}")
68
+
69
+ st.success(f"Successfully generated embeddings for {len(token_strings)} tokens.")
70
+
71
  except Exception as e:
72
  st.error(f"OpenAI Error: {str(e)}")
73
 
74
+ # ---------- Positional Encoding Section ----------
75
+ st.subheader("πŸ“ Positional Encoding")
76
+ st.markdown("""
77
+ Transformers have no built-in notion of order, so **positional encoding** adds a signal to each token to tell the model where it occurs in the sequence.
78
+
79
+ We use sinusoidal positional encoding similar to what was introduced in the original Transformer paper.
80
+ """)
81
+
82
+ if st.button("πŸŒ€ Generate Positional Encoding"):
83
  enc = tiktoken.get_encoding(tokenizer_name)
84
  tokens = enc.encode(input_text)
85
  seq_len = len(tokens)
86
+ dim = st.slider("Select positional encoding dimension:", 16, 512, 64, step=16)
87
 
88
  def get_positional_encoding(seq_len, dim):
89
  PE = np.zeros((seq_len, dim))
 
96
  return PE
97
 
98
  PE = get_positional_encoding(seq_len, dim)
99
+
100
+ with st.expander("πŸ“ Positional Encoding Matrix"):
101
  st.write(PE)
102
+ st.caption(f"Shape: {PE.shape}")