Spaces:

schogini
/

embed-and-pos-encode

Sleeping

App Files Files Community

embed-and-pos-encode / src /streamlit_app.py

schoginitoys

Update src/streamlit_app.py

a53f381 verified 8 months ago

raw

history blame contribute delete

5.33 kB

	import os
	# turn off Streamlit’s automatic file-watching
	os.environ["STREAMLIT_SERVER_ENABLE_FILE_WATCHER"] = "false"

	import sys
	import types
	import torch # now safe to import
	import streamlit as st
	import numpy as np

	# Prevent Streamlit from trying to walk torch.classes' non-standard __path__.
	# The guard only patches when torch is already loaded and its ``classes``
	# attribute is a module object; otherwise the statement is skipped.
	if isinstance(getattr(sys.modules.get("torch"), "classes", None), types.ModuleType):
	    torch.classes.__path__ = []

	# pip install tiktoken transformers
	import tiktoken
	from transformers import GPT2TokenizerFast

	st.set_page_config(page_title="Embedding Dimension Visualizer", layout="wide")
	st.title("🔍 Embedding Dimension Visualizer")

	# ---- THEORY EXPANDER ----
	# Primer on the three concepts the page visualises. Inline math is wrapped in
	# $...$ (the delimiter st.markdown renders as LaTeX) and the strings that
	# contain backslashes are raw, so \sin / \cos are not read as escape sequences.
	with st.expander("📖 Theory: Tokenization, BPE & Positional Encoding"):
	    st.markdown("""
	    1️⃣ Tokenization
	    Splits raw text into atomic units (“tokens”).

	    2️⃣ Byte-Pair Encoding (BPE)
	    Iteratively merges the most frequent pair of symbols to build a subword vocabulary.
	    E.g. "embedding" → ["em", "bed", "ding"]

	    3️⃣ Positional Encoding
	    We add a deterministic sinusoidal vector to each token embedding so the model knows position.
	    """)
	    st.markdown(r"For embedding dimension $d$, position $pos$ and channel index $i$:")
	    st.latex(r"""\mathrm{PE}_{(pos,\,2i)} = \sin\!\Bigl(\frac{pos}{10000^{2i/d}}\Bigr)""")
	    st.latex(r"""\mathrm{PE}_{(pos,\,2i+1)} = \cos\!\Bigl(\frac{pos}{10000^{2i/d}}\Bigr)""")
	    st.markdown(r"""
	    - $pos$ starts at 0 for the first token
	    - Even channels use $\sin$, odd channels use $\cos$
	    - This injects unique, smoothly varying positional signals into each embedding
	    """)


	# ---- Sidebar ----
	# Collects the three inputs used further down (text, embedding width,
	# tokenizer) plus the button that triggers rendering.
	with st.sidebar:
	    st.header("Settings")
	    input_text = st.text_input("Enter text to embed", value="Hello world!")
	    dim = st.number_input(
	        "Embedding dimensions",
	        min_value=2,
	        max_value=1536,
	        value=3,
	        step=1,
	        help="Choose 2, 3, 512, 768, 1536, etc.",
	    )
	    tokenizer_choice = st.selectbox(
	        "Choose tokenizer",
	        ["tiktoken", "openai", "huggingface"],
	        help="Which tokenization scheme to demo.",
	    )
	    generate = st.button("Generate / Reset Embedding")

	# Gate: until the button has been clicked, show a hint and halt the script
	# here so none of the tokenization / slider sections below are rendered.
	if not generate:
	    st.info("Adjust the settings in the sidebar and click Generate / Reset Embedding to see the tokens and sliders.")
	    st.stop()

	# ---- Tokenize ----
	# Produces two parallel lists: ``token_ids`` (integers) and ``token_strs``
	# (a display string per token). "tiktoken" and "openai" both go through
	# tiktoken and differ only in the model name used to pick the encoding;
	# any other choice uses the Hugging Face GPT-2 fast tokenizer.
	if tokenizer_choice in ("tiktoken", "openai"):
	    model_name = "gpt2" if tokenizer_choice == "tiktoken" else "gpt-3.5-turbo"
	    enc = tiktoken.encoding_for_model(model_name)
	    token_ids = enc.encode(input_text)
	    # Decode one id at a time so each token gets its own display string.
	    token_strs = [enc.decode([tid]) for tid in token_ids]
	else:
	    hf_tokenizer = GPT2TokenizerFast.from_pretrained("gpt2")
	    token_ids = hf_tokenizer.encode(input_text)
	    token_strs = hf_tokenizer.convert_ids_to_tokens(token_ids)

	# List every token next to its id, numbered from 1.
	st.subheader("🪶 Tokens and IDs")
	for i, (tok, tid) in enumerate(zip(token_strs, token_ids), start=1):
	    st.write(f"{i}. `{tok}` → ID {tid}")

	st.write("---")
	st.subheader("📊 Embedding + Positional Encoding per Token")
	# ``\\|`` emits a literal backslash + pipe (same runtime text as before)
	# without relying on the invalid ``\|`` escape sequence.
	st.write(f"Input: `{input_text}` \\| Tokenizer: {tokenizer_choice} \\| Dims per token: {dim}")
	if dim > 20:
	    st.warning("Showing >20 sliders per block may be unwieldy; consider smaller dims for teaching.")

	# helper for sinusoidal positional encoding
	def get_positional_encoding(position: int, d_model: int) -> np.ndarray:
	    """Return the sinusoidal positional-encoding vector for one position.

	    Channel ``c`` uses the angle
	    ``position / 10000 ** (2 * (c // 2) / d_model)``; even channels store its
	    sine and odd channels its cosine, so each (even, odd) pair of channels
	    shares one frequency.

	    Args:
	        position: Position of the token in the sequence.
	        d_model: Number of channels in the returned vector.

	    Returns:
	        A float array of shape ``(d_model,)``.

	    Raises:
	        ValueError: If ``d_model`` is negative (raised by ``np.zeros``).
	    """
	    pe = np.zeros(d_model, dtype=float)
	    if d_model == 0:
	        # Nothing to fill; returning early also avoids dividing by zero below.
	        return pe

	    channels = np.arange(d_model)
	    # ``channels // 2`` maps channels 2i and 2i+1 to the same exponent.
	    angles = position / np.power(10000.0, (2 * (channels // 2)) / d_model)
	    pe[0::2] = np.sin(angles[0::2])
	    pe[1::2] = np.cos(angles[1::2])
	    return pe

	# ---- For each token, three slider-blocks ----
	def _render_slider_block(heading, label_prefix, key_prefix, values, bound):
	    """Render one disabled slider per entry of *values* under *heading*.

	    Args:
	        heading: Markdown text written above the sliders.
	        label_prefix: Text placed before ``" Dim <n>"`` in each slider label.
	        key_prefix: Text placed before the 1-based dimension number in each
	            widget key.
	        values: Sequence of numbers, one per slider.
	        bound: Sliders span ``[-bound, bound]``.
	    """
	    st.markdown(heading)
	    for d, value in enumerate(values, start=1):
	        st.slider(
	            label=f"{label_prefix} Dim {d}",
	            min_value=-bound,
	            max_value=bound,
	            value=float(value),
	            key=f"{key_prefix}{d}",
	            disabled=True,
	        )


	for t_idx, tok in enumerate(token_strs, start=1):
	    # The embedding is a fresh uniform sample in [-1, 1) on every run; the
	    # positional encoding uses the zero-based token position.
	    emb = np.random.uniform(-1.0, 1.0, size=dim)
	    pe = get_positional_encoding(t_idx - 1, dim)
	    combined = emb + pe

	    with st.expander(f"Token {t_idx}: `{tok}`"):
	        _render_slider_block("1️⃣ Embedding", "Emb", f"t{t_idx}_emb", emb, 1.0)
	        _render_slider_block(
	            "2️⃣ Positional Encoding (sin / cos)", "PE", f"t{t_idx}_pe", pe, 1.0
	        )
	        # The sum of two values in [-1, 1] needs the wider [-2, 2] range.
	        _render_slider_block(
	            "3️⃣ Embedding + Positional Encoding", "Sum", f"t{t_idx}_sum", combined, 2.0
	        )

	# ---- NEW FINAL SECTION ----
	# Summarises, per token id, the values stored under the "sum" slider keys.
	st.write("---")
	st.subheader("Final Input Embedding Plus Positional Encoding Ready to Send to Attention Heads")

	for t_idx, tid in enumerate(token_ids, start=1):
	    with st.expander(f"Token ID {tid}"):
	        for d in range(1, dim + 1):
	            # Pull the "sum" value out of session state; the key matches the
	            # one given to the corresponding "Sum Dim" slider.
	            val = st.session_state.get(f"t{t_idx}_sum{d}")
	            if val is None:
	                st.write(f"Dim {d}: N/A")
	            else:
	                st.write(f"Dim {d}: {val:.4f}")