# embed-and-pos-encode-code / src/streamlit_app.py
# NOTE(review): the lines below were HuggingFace Spaces file-viewer chrome
# (uploader "schoginitoys", commit 9628d55 "Update src/streamlit_app.py",
# raw/history/blame links, 9.66 kB size) — kept as comments so the file parses.
"""Streamlit demo: token embeddings and sinusoidal positional encodings.

Walks a sentence through tokenization, embedding, positional encoding,
their sum, and an approximate reversal of each step.
"""
import os

# Turn off Streamlit's automatic file-watching BEFORE streamlit/torch are
# imported; the watcher crashes while walking torch's non-standard modules.
os.environ["STREAMLIT_SERVER_ENABLE_FILE_WATCHER"] = "false"

import sys
import types

import torch  # now safe to import
import streamlit as st
import numpy as np

# Prevent Streamlit from trying to walk torch.classes' non-standard __path__
if isinstance(getattr(sys.modules.get("torch"), "classes", None), types.ModuleType):
    torch.classes.__path__ = []

from transformers import GPT2TokenizerFast
# --- Setup ---
st.set_page_config(page_title="Text to Embedding Visualizer", layout="wide")
st.title("🔍 Token Embedding & Positional Encoding Coding Demo")

# --- Input UI ---
sentence = st.text_input("Enter your sentence", "Learning is fun")
embedding_dim = st.slider("Embedding Dimension (even only)", min_value=4, max_value=64, value=8, step=2)

# --- Load tokenizer ---
# Custom HF cache inside the app's working directory (writable on Spaces).
# NOTE(review): unused in practice since the tokenizer is loaded
# local-files-only below; kept for backward compatibility.
os.environ['TRANSFORMERS_CACHE'] = './hf_cache'

# Load tokenizer from bundled local files only (no network access needed).
tokenizer = GPT2TokenizerFast.from_pretrained("./assets/tokenizer", local_files_only=True)

input_ids = tokenizer.encode(sentence, return_tensors="pt")[0]  # 1-D LongTensor of token IDs
tokens = tokenizer.convert_ids_to_tokens(input_ids)  # BPE subword strings

st.markdown("### 1️⃣ Tokenization")
with st.expander("Token IDs and Subwords"):
    st.write("**Tokens:**", tokens)
    st.write("**Token IDs:**", input_ids.tolist())
with st.expander("📜 Show Code: Tokenization"):
    st.code("""
tokenizer = GPT2TokenizerFast.from_pretrained("gpt2")
input_ids = tokenizer.encode(sentence, return_tensors="pt")[0]
tokens = tokenizer.convert_ids_to_tokens(input_ids)
""", language="python")
# --- Embedding Matrix ---
torch.manual_seed(0)  # Reproducibility: same random embedding table on every rerun
# Fresh, untrained lookup table: one embedding_dim-vector per vocab entry.
embedding_matrix = torch.nn.Embedding(tokenizer.vocab_size, embedding_dim)
embedded = embedding_matrix(input_ids)  # shape: (num_tokens, embedding_dim)

st.markdown("### 2️⃣ Embedding")
with st.expander("Show Token Embeddings"):
    st.write("Shape:", embedded.shape)
    st.write(embedded)
with st.expander("📜 Show Code: Embedding"):
    # f-string interpolates the slider's current dimension into the snippet
    st.code(f"""
embedding_matrix = torch.nn.Embedding(tokenizer.vocab_size, {embedding_dim})
embedded = embedding_matrix(input_ids)
""", language="python")
# --- Positional Encoding ---
def get_positional_encoding(seq_len, dim):
    """Return the sinusoidal positional-encoding table, shape (seq_len, dim).

    PE(p, 2i)   = sin(p / 10000^(2i/dim))
    PE(p, 2i+1) = cos(p / 10000^(2i/dim))
    """
    pe = torch.zeros(seq_len, dim)
    position = torch.arange(0, seq_len, dtype=torch.float32).unsqueeze(1)
    # Frequency factors 10000^(-2i/dim), computed in log space for stability.
    div_term = torch.exp(torch.arange(0, dim, 2).float() * (-np.log(10000.0) / dim))
    pe[:, 0::2] = torch.sin(position * div_term)
    # Slice guards odd dim: cos yields ceil(dim/2) columns but only
    # floor(dim/2) odd slots exist (the UI slider only offers even dims).
    pe[:, 1::2] = torch.cos(position * div_term)[:, : dim // 2]
    return pe
# One encoding row per token in the input sentence.
pos_enc = get_positional_encoding(len(input_ids), embedding_dim)

st.markdown("### 3️⃣ Positional Encoding")
with st.expander("Show Positional Encoding"):
    st.write("Shape:", pos_enc.shape)
    st.write(pos_enc)
with st.expander("📜 Show Code: Positional Encoding"):
    st.code(f'''
def get_positional_encoding(seq_len, dim):
    pe = torch.zeros(seq_len, dim)
    position = torch.arange(0, seq_len).unsqueeze(1).float()
    div_term = torch.exp(torch.arange(0, dim, 2).float() * (-np.log(10000.0) / dim))
    pe[:, 0::2] = torch.sin(position * div_term)
    pe[:, 1::2] = torch.cos(position * div_term)
    return pe

pos_enc = get_positional_encoding(len(input_ids), {embedding_dim})
''', language="python")

# --- Combined Embedding + Position ---
# Element-wise sum: this is the actual input a transformer layer would see.
embedded_with_pos = embedded + pos_enc

st.markdown("### 4️⃣ Embedding + Positional Encoding")
with st.expander("Show Combined Embedding"):
    st.write(embedded_with_pos)
with st.expander("📜 Show Code: Add Positional Encoding"):
    st.code("""
embedded_with_pos = embedded + pos_enc
""", language="python")
# --- Approximate Reverse to Token IDs ---
def find_closest_token(vec, emb_matrix):
    """Return the vocab ID whose embedding row is most cosine-similar to *vec*."""
    candidates = emb_matrix.weight
    scores = torch.nn.functional.cosine_similarity(vec.unsqueeze(0), candidates, dim=1)
    best_id = int(scores.argmax())
    return best_id
# Nearest-neighbor lookup of each *pure* embedding (no positional term) back
# to a vocab ID; with an untrained table each row maps back to its own ID.
recovered_ids = [find_closest_token(vec, embedding_matrix) for vec in embedded]
recovered_tokens = tokenizer.convert_ids_to_tokens(recovered_ids)  # ← Subwords
recovered_text = tokenizer.decode(recovered_ids)  # ← Final string

st.markdown("### 5️⃣ Approximate Reverse")
with st.expander("Recovered Tokens and Text"):
    st.write("**Recovered Token IDs:**", recovered_ids)
    st.write("**Recovered Subword Tokens (BPE):**", recovered_tokens)
    st.write("**Recovered Sentence:**", recovered_text)
with st.expander("📜 Show Code: Recover Token IDs and Text"):
    st.code("""
def find_closest_token(vec, emb_matrix):
    sims = torch.nn.functional.cosine_similarity(vec.unsqueeze(0), emb_matrix.weight, dim=1)
    return torch.argmax(sims).item()

recovered_ids = [find_closest_token(vec, embedding_matrix) for vec in embedded]
recovered_tokens = tokenizer.convert_ids_to_tokens(recovered_ids)
recovered_text = tokenizer.decode(recovered_ids)
""", language="python")
# --- Recover Position (Approx) ---
# Subtracting the pure embedding isolates the positional component exactly,
# so the difference from the original encoding should be ~0 (float error only).
recovered_pos = embedded_with_pos - embedded
position_error = pos_enc - recovered_pos

st.markdown("### 6️⃣ Recovered Positional Encoding")
with st.expander("Compare Recovered vs Original"):
    st.write("**Recovered Positional Encoding:**")
    st.write(recovered_pos)
    st.write("**Difference from Original (should be ~0):**")
    st.write(position_error)
with st.expander("📜 Show Code: Recovered Positional Encoding"):
    st.code("""
recovered_pos = embedded_with_pos - embedded
position_error = pos_enc - recovered_pos
""", language="python")
# Estimate position from positional encoding using cosine similarity
def estimate_position_from_encoding(pe_row, full_table):
    """Return the row index of *full_table* most cosine-similar to *pe_row*."""
    similarity = torch.nn.functional.cosine_similarity(
        pe_row.unsqueeze(0), full_table, dim=1
    )
    return int(torch.argmax(similarity))
# Build reference table of known encodings for positions 0..len(input_ids)-1
reference_pos_table = get_positional_encoding(seq_len=len(input_ids), dim=embedding_dim)
# Match each recovered row against the reference table to estimate its position.
estimated_positions = [estimate_position_from_encoding(row, reference_pos_table) for row in recovered_pos]

st.markdown("### 7️⃣ Estimate Position from Positional Encoding")
with st.expander("Recovered Positions"):
    st.write("**Estimated Token Positions:**", estimated_positions)
    st.write("**Original True Positions:**", list(range(len(input_ids))))
with st.expander("📜 Show Code: Estimate Positions"):
    st.code("""
def estimate_position_from_encoding(pe_row, full_table):
    sims = torch.nn.functional.cosine_similarity(pe_row.unsqueeze(0), full_table, dim=1)
    return torch.argmax(sims).item()

reference_pos_table = get_positional_encoding(seq_len=len(input_ids), dim=embedding_dim)
estimated_positions = [estimate_position_from_encoding(row, reference_pos_table) for row in recovered_pos]
""", language="python")
# --- Theory recap: static markdown/LaTeX rendered by st.markdown ---
st.markdown("### 📘 Final Notes: Theory & Formulas")
with st.expander("🧠 Theory and Formulas"):
    # Raw string so the LaTeX backslashes survive; unsafe_allow_html lets the
    # "---" rules and table render as intended.
    st.markdown(r"""
### 1️⃣ Tokenization (BPE)
We use **Byte Pair Encoding (BPE)** to break text into subword units.
For example:
"Learning is fun" → ["Learning", "Ġis", "Ġfun"]
Note: The "Ġ" indicates a **space** before the token.
---
### 2️⃣ Embedding
Each token ID $t_i \in \mathbb{Z}$ is mapped to a dense vector:
$$
\text{Embedding}(t_i) = \mathbf{e}_i \in \mathbb{R}^d
$$
Where:
- $t_i$: token ID
- $\mathbf{e}_i$: embedding vector of dimension $d$
---
### 3️⃣ Sinusoidal Positional Encoding
Used to encode the **position $p$** of a token without learnable parameters:
$$
\text{PE}(p, 2i) = \sin\left(\frac{p}{10000^{\frac{2i}{d}}}\right)
$$
$$
\text{PE}(p, 2i+1) = \cos\left(\frac{p}{10000^{\frac{2i}{d}}}\right)
$$
Where:
- $p$: position index (0, 1, 2, …)
- $i$: dimension index
- $d$: total embedding dimension
This gives a positional vector $\text{PE}(p) \in \mathbb{R}^d$
---
### 4️⃣ Add Embedding and Positional Encoding
We add the embedding and positional encoding element-wise:
$$
\mathbf{z}_i = \mathbf{e}_i + \text{PE}(p_i)
$$
Where:
- $\mathbf{z}_i$: final input to the transformer
---
### 5️⃣ Reverse Lookup (Approximate)
We find the nearest embedding using cosine similarity:
$$
\hat{t}_i = \underset{j}{\arg\max} \left( \frac{ \mathbf{z}_i \cdot \mathbf{e}_j }{ \| \mathbf{z}_i \| \, \| \mathbf{e}_j \| } \right)
$$
---
### 6️⃣ Recover Position from Embedding + PE
To isolate positional encoding:
$$
\text{Recovered PE}_i = \mathbf{z}_i - \mathbf{e}_i
$$
We then compare this with reference positional encodings to estimate token position.
---
### 🌟 Summary Table
| Step | What Happens |
|------|--------------|
| **Tokenization** | Sentence → Subwords → Token IDs |
| **Embedding** | Token IDs → Vectors |
| **Pos Encoding** | Position Index → Sin/Cos Vector |
| **Sum** | Embedding + PE = Input to Transformer |
| **Reverse** | Approximate token ID from vector |
| **PE Recovery** | Recover position using similarity |
""", unsafe_allow_html=True)