import streamlit as st
import numpy as np
from transformers import GPT2TokenizerFast, GPT2Model
# 1. Load tokenizer and model (cached across Streamlit reruns)
@st.cache_resource
def load_resources():
    # Load from the bundled local files; to download from the Hugging Face Hub
    # instead, use from_pretrained("gpt2") for both.
    tokenizer = GPT2TokenizerFast.from_pretrained("./assets/tokenizer", local_files_only=True)
    model = GPT2Model.from_pretrained("./assets/model", local_files_only=True)
    return tokenizer, model
# Initialize resources
tokenizer, model = load_resources()
# 2. Helper to get the full embedding matrix
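# For GPT-2 this is a (50257, 768) NumPy array: one 768-dim row per vocabulary entry.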
@st.cache_resource
def get_embedding_matrix():
    return model.get_input_embeddings().weight.detach().cpu().numpy()
# 3. Initialize session state
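# Streamlit reruns this script on every widget interaction; session_state
# carries these values across reruns.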
defaults = {"tokens": [], "token_ids": [], "embeddings": {}, "current_id": None}
for key, default in defaults.items():
    if key not in st.session_state:
        st.session_state[key] = default
st.title("🔍 Embedding & Positional Encoding Explorer")
# 4. Sentence input & BPE tokenize
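# GPT-2 uses byte-level BPE; a leading space shows up as "Ġ" in the token strings.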
sentence = st.text_input("Enter a sentence to tokenize:")
if st.button("BPE Tokenize"):
ids = tokenizer.encode(sentence, add_special_tokens=False)
toks = tokenizer.convert_ids_to_tokens(ids)
st.session_state.tokens = toks
st.session_state.token_ids = ids
# 5. Display tokens + IDs with embedding buttons
if st.session_state.tokens:
    st.subheader("Tokens and IDs")
    cols = st.columns([4, 1])
    for i, (tok, tid) in enumerate(zip(st.session_state.tokens, st.session_state.token_ids)):
        cols[0].write(f"{i+1}. **{tok}** → ID {tid}")
        # Key on the position as well as the ID so repeated tokens don't collide.
        if cols[1].button(f"Create Embedding for {tid}", key=f"embed_{i}_{tid}"):
            vec = model.get_input_embeddings().weight[tid].detach().cpu().numpy()
            st.session_state.embeddings[tid] = vec.copy()
            st.session_state.current_id = tid
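# The .copy() above stores a vector independent of the model weights, so the
# slider edits below never mutate the model itself.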
# 6. Show & edit embedding sliders for selected token
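# GPT-2's embedding size is 768, so this draws one slider per dimension.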
if st.session_state.current_id is not None:
    tok_id = st.session_state.current_id
    emb_vec = st.session_state.embeddings[tok_id]
    st.subheader(f"Embedding for token ID {tok_id}")
    for dim in range(len(emb_vec)):
        emb_vec[dim] = st.slider(
            f"Emb Dim {dim}", -5.0, 5.0, float(emb_vec[dim]), step=0.01,
            key=f"slider_{tok_id}_{dim}",
        )
    st.session_state.embeddings[tok_id] = emb_vec
    # 7. Positional encoding inputs
    st.subheader("Positional Encoding")
    # Show the formula in LaTeX
    st.markdown(r"""
**Positional Encoding Formula**

For position $p$ and dimension $d$ (where $D$ is the embedding size):

$$
PE(p,d) = \begin{cases}
\sin\bigl(\frac{p}{10000^{d / D}}\bigr), & \text{if } d \text{ is even} \\
\cos\bigl(\frac{p}{10000^{(d-1) / D}}\bigr), & \text{if } d \text{ is odd}
\end{cases}
$$
""")
    pos = st.number_input("Position (p)", min_value=0, format="%d")
    dim = st.number_input(
        "Dimension index (0-based)", min_value=0, max_value=len(emb_vec) - 1, format="%d"
    )
    emb_dim = st.number_input(
        "Embedding Dimension (vector length)", value=len(emb_vec), format="%d"
    )
    # 8. Compute and add the positional encoding to one dimension
    if st.button("Compute and Add Pos Encoding to the Embedding"):
        p, d, D = int(pos), int(dim), int(emb_dim)
        if 0 <= d < D:
            if d % 2 == 0:
                pe = np.sin(p / (10000 ** (d / D)))
            else:
                pe = np.cos(p / (10000 ** ((d - 1) / D)))
            emb_vec[d] += pe
            st.session_state.embeddings[tok_id] = emb_vec
        else:
            st.error("Dimension index out of range.")
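    # Note: the sliders above re-apply their stored values on each rerun, so
    # this addition does not persist past the next widget interaction.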
    # 9. Cosine-similarity search with the (possibly modified) embedding
    if st.button("Similarity Search (Using the Embedding)", key="sim_search_pos"):
        matrix = get_embedding_matrix()
        query = st.session_state.embeddings[tok_id]
        # Cosine similarity of the query against every row of the embedding matrix.
        dot = matrix.dot(query)
        mat_norm = np.linalg.norm(matrix, axis=1)
        q_norm = np.linalg.norm(query)
        sims = dot / (mat_norm * q_norm + 1e-12)
        # Sort descending and skip index 0, which is usually the token itself.
        topk = (-sims).argsort()[1:21]
        st.write("**Top 20 similar tokens after PosEnc:**")
        for idx in topk:
            token_str = tokenizer.convert_ids_to_tokens([idx])[0]
            st.write(f"ID {idx} ({token_str}): {sims[idx]:.4f}")