Spaces:

chyams
/

embedding-explorer

Running

App Files Files Community

embedding-explorer / app.py

chyams

Embedding Explorer: new examples, auto-execute, neighbors dropdown, zoom

c18f969 18 days ago

raw

history blame contribute delete

50.2 kB

	"""
	Embedding Explorer — Interactive word vector visualization
	Responsible AI: Technology, Power, and Justice
	Huston-Tillotson University

	Enter words or vector expressions (comma-separated). Each item becomes
	an arrow in 3D. Click an item to see its nearest neighbors.

	Configuration (HuggingFace Space environment variables):
	EXAMPLES — JSON list of example inputs (words or expressions)
	N_NEIGHBORS — number of neighbors to show on click (default 8)
	"""

	import os
	import json
	import re
	import warnings
	import urllib.parse
	import subprocess

	import pandas # noqa: F401 — import before plotly to avoid circular import
	import numpy as np
	import plotly.graph_objects as go
	import gradio as gr

	warnings.filterwarnings("ignore", category=FutureWarning, module="sklearn")

	# ── Configuration (all changeable via HF Space env vars) ─────

	EXAMPLES = json.loads(os.environ.get("EXAMPLES", json.dumps([
	"dog cat fish car truck",
	"paris france berlin germany tokyo japan",
	"man woman king queen prince princess",
	"man - woman, uncle - aunt, man woman uncle aunt",
	"aunt - woman + man, man woman uncle aunt",
	"nephew - man + woman, man woman nephew niece",
	"king - man + woman, man woman king queen",
	"paris - france + italy, paris france italy rome",
	"sushi - japan + germany, sushi japan germany bratwurst",
	"hitler - germany + italy, germany italy hitler mussolini",
	])))

	N_NEIGHBORS = int(os.environ.get("N_NEIGHBORS", "4"))

	# ── Share URL infrastructure ─────────────────────────────────

	REBRANDLY_API_KEY = os.environ.get("REBRANDLY_API_KEY", "")
	_SPACE_ID = os.environ.get("SPACE_ID", "")
	if _SPACE_ID:
	_owner, _name = _SPACE_ID.split("/")
	_BASE_URL = f"https://{_owner}-{_name}.hf.space/"
	else:
	_BASE_URL = "http://localhost:7860/"

	# ── Course design system colors ──────────────────────────────

	PURPLE = "#63348d"
	PURPLE_LIGHT = "#ded9f4"
	PURPLE_DARK = "#301848"
	GOLD = "#f0c040"
	PINK = "#de95a0"
	DARK = "#1a1a2e"
	GRAY = "#888888"
	BG = "#fafafa"

	# Categorical palette — darkest shade from each design-system color family
	# 10 shades per family from color_palette.md (darkest → lightest)
	_SHADES = {
	"purple": ["#64348d", "#6f3ba4", "#7942bb", "#8455c5", "#906acd",
	"#9d80d6", "#ab95de", "#baabe5", "#cbc1ec", "#ddd7f4"],
	"blue": ["#344b8d", "#3b58a4", "#4266bb", "#5579c5", "#6a8dcd",
	"#809fd6", "#95b2de", "#abc3e5", "#c1d4ec", "#d7e5f4"],
	"green": ["#348d64", "#3ba470", "#42bb7c", "#55c588", "#6acd95",
	"#80d6a2", "#95deb0", "#abe5bf", "#c1eccf", "#d7f4e0"],
	"red": ["#8d3437", "#a43b41", "#bb424a", "#c5555f", "#cd6a75",
	"#d6808a", "#de95a0", "#e5abb4", "#ecc1c9", "#f4d7dd"],
	"yellow": ["#8d7734", "#a48b3b", "#bb9f42", "#c5ac55", "#cdb86a",
	"#d6c480", "#decf95", "#e5daab", "#ece5c1", "#f4efd7"],
	"orange": ["#8d5534", "#a4633b", "#bb7242", "#c58355", "#cd946a",
	"#d6a580", "#deb695", "#e5c6ab", "#ecd6c1", "#f4e5d7"],
	}

	# Darkest shade from each family — up to 12 items cycle through these 6
	PALETTE = [
	_SHADES["purple"][0],
	_SHADES["blue"][0],
	_SHADES["red"][0],
	_SHADES["yellow"][0],
	_SHADES["green"][0],
	_SHADES["orange"][0],
	]

	# Map each darkest color to its shade family for lookup
	_COLOR_FAMILY = {shades[0]: shades for shades in _SHADES.values()}


	def lighten(hex_color, amount=0.3):
	"""Lighten a color using the design-system palette shades.

	Maps amount (0.0–1.0) to palette shade index. Falls back to
	arithmetic blending for colors not in the palette.
	"""
	base = hex_color.lower()
	if base in _COLOR_FAMILY:
	shades = _COLOR_FAMILY[base]
	idx = min(int(amount * len(shades)), len(shades) - 1)
	return shades[idx]
	# Fallback for non-palette colors (e.g., GOLD)
	h = hex_color.lstrip("#")
	r, g, b = int(h[0:2], 16), int(h[2:4], 16), int(h[4:6], 16)
	r = int(r + (255 - r) * amount)
	g = int(g + (255 - g) * amount)
	b = int(b + (255 - b) * amount)
	return f"#{r:02x}{g:02x}{b:02x}"

	# ── Load GloVe embeddings on startup ─────────────────────────

	import time
	import gensim.downloader as api
	from gensim.models import KeyedVectors

	# Native binary cache — loads ~10x faster than gensim's text format
	_CACHE_DIR = os.path.join(os.path.dirname(os.path.abspath(__file__)), ".cache")
	_CACHE_PATH = os.path.join(_CACHE_DIR, "glove-wiki-gigaword-300.kv")


	def load_model(name="glove-wiki-gigaword-300", retries=5):
	"""Load GloVe vectors. Uses native binary cache for fast startup after first run."""
	# Fast path: load from native binary cache (memory-mapped)
	if os.path.exists(_CACHE_PATH):
	print("=" * 60)
	print("Loading GloVe vectors from cache (memory-mapped)...")
	print("=" * 60)
	t0 = time.time()
	m = KeyedVectors.load(_CACHE_PATH, mmap="r")
	print(f"Loaded in {time.time() - t0:.1f}s")
	return m

	# Slow path: download via gensim, then save native cache
	for attempt in range(1, retries + 1):
	try:
	print("=" * 60)
	print(f"Downloading GloVe word vectors ({name})...")
	if attempt == 1:
	print("First run only — ~376 MB download. Will cache for fast startup.")
	else:
	print(f"Retry {attempt}/{retries}...")
	print("=" * 60)
	m = api.load(name)
	# Save native binary cache for next time
	os.makedirs(_CACHE_DIR, exist_ok=True)
	m.save(_CACHE_PATH)
	print(f"Cached to {_CACHE_DIR} for fast startup next time.")
	return m
	except Exception as e:
	print(f"Attempt {attempt} failed: {e}")
	if attempt < retries:
	wait = 2 ** attempt
	print(f"Retrying in {wait}s...")
	time.sleep(wait)
	else:
	raise


	model = load_model()
	VOCAB = set(model.key_to_index.keys())
	DIMS = model.vector_size

	print(f"Ready: {len(VOCAB):,} words, {DIMS} dimensions each")


	# ── Helpers ──────────────────────────────────────────────────

	def parse_expression(expr):
	"""Parse 'king - man + woman' → (positives, negatives, ordered).

	ordered is [(word, sign, coeff), ...] for display formatting.
	Supports fractional coefficients: '0.5 king - 0.3 man + 1.5 woman'.
	"""
	tokens = re.findall(r"\d*\.?\d+\|[a-z']+\|[+-]", expr.lower())
	pos, neg, ordered = [], [], []
	sign = "+"
	coeff = 1.0
	for t in tokens:
	if t in "+-":
	sign = t
	coeff = 1.0
	elif re.match(r"^\d*\.?\d+$", t):
	coeff = float(t)
	elif t in VOCAB:
	(pos if sign == "+" else neg).append((t, coeff))
	ordered.append((t, sign, coeff))
	coeff = 1.0
	return pos, neg, ordered


	def _coeff_str(c):
	"""Format coefficient for display. Returns '' for 1.0, '0.5\u00b7' otherwise."""
	if c == 1.0:
	return ""
	if c == int(c):
	return f"{int(c)}\u00b7"
	return f"{c:g}\u00b7"


	def parse_items(text):
	"""Parse comma-separated input into items (words or vector expressions).

	Returns (items, bad_words) where each item is:
	(label, vector, is_expr, operand_words, ordered_ops_or_None)
	"""
	if not text or not text.strip():
	return [], []

	items = []
	bad_words = []
	seen_labels = set()

	# Split by comma → parts
	parts = [p.strip() for p in text.split(",")]

	for part in parts:
	if not part:
	continue

	# Detect expression: contains + or - between word characters or digits
	if re.search(r"(?:[a-z']\|\d)\s[+\-]\s(?:[a-z']\|\d)", part.lower()):
	# It's an arithmetic expression
	pos, neg, ordered = parse_expression(part)
	if len(pos) + len(neg) < 2:
	# Check for bad words
	all_tokens = re.findall(r"[a-z']+", part.lower())
	bad_words.extend(t for t in all_tokens if t not in VOCAB)
	continue
	# Compute result vector
	vec = np.zeros(DIMS)
	for w, c in pos:
	vec += c * model[w]
	for w, c in neg:
	vec -= c * model[w]
	# Build label
	label_parts = []
	for w, s, c in ordered:
	cstr = _coeff_str(c)
	if not label_parts:
	label_parts.append(f"{cstr}{w}")
	elif s == "+":
	label_parts.append(f"+ {cstr}{w}")
	else:
	label_parts.append(f"− {cstr}{w}")
	label = " ".join(label_parts)
	if label not in seen_labels:
	seen_labels.add(label)
	operand_words = set(w for w, c in pos + neg)
	items.append((label, vec, True, operand_words, list(ordered)))
	else:
	# Plain words — each word is a separate item
	words = re.split(r"\s+", part.lower().strip())
	for w in words:
	w = w.strip()
	if not w:
	continue
	if w not in VOCAB:
	bad_words.append(w)
	continue
	if w not in seen_labels:
	seen_labels.add(w)
	items.append((w, model[w], False, {w}, None))

	return items, bad_words


	def reduce_3d(vectors):
	"""MDS (cosine distance) → 3D. Normalizes to [-1, 1] for consistent label sizing."""
	from sklearn.manifold import MDS
	from sklearn.metrics.pairwise import cosine_distances
	n = len(vectors)
	if n < 2:
	return np.zeros((n, 3))
	dist = cosine_distances(vectors)
	nc = min(3, n)
	mds = MDS(n_components=nc, dissimilarity="precomputed",
	random_state=42, normalized_stress="auto", max_iter=300)
	coords = mds.fit_transform(dist)
	if nc < 3:
	coords = np.hstack([coords, np.zeros((n, 3 - nc))])
	# Normalize to [-1, 1] so axis ranges and label sizes are consistent
	max_abs = np.abs(coords).max()
	if max_abs > 1e-8:
	coords = coords / max_abs
	return coords


	def _axis(title=""):
	"""3D axis — minimal, no built-in grid (we draw our own floor grid)."""
	return dict(
	showgrid=False,
	zeroline=False,
	showticklabels=False,
	title=title,
	showspikes=False,
	showbackground=False,
	)


	def layout_3d(axis_range=1.3, camera=None):
	"""Shared Plotly 3D layout. Uses uirevision to preserve camera across updates."""
	ax_x, ax_y, ax_z = _axis(), _axis(), _axis()
	fixed = [-axis_range, axis_range]
	ax_x["range"] = fixed
	ax_y["range"] = fixed
	ax_z["range"] = fixed
	default_camera = dict(eye=dict(x=1.0, y=1.0, z=0.8))
	return dict(
	scene=dict(
	xaxis=ax_x,
	yaxis=ax_y,
	zaxis=ax_z,
	bgcolor="white",
	camera=camera or default_camera,
	aspectmode="cube",
	),
	paper_bgcolor="white",
	margin=dict(l=0, r=0, t=10, b=10),
	showlegend=True,
	legend=dict(
	yanchor="top", y=0.99, xanchor="right", x=0.99,
	bgcolor="rgba(255,255,255,0.85)",
	font=dict(family="Inter, sans-serif", size=12),
	),
	font=dict(family="Inter, sans-serif"),
	autosize=True,
	uirevision="keep",
	)


	ARROW_WIDTH = 10 # line width in pixels
	ARROW_HEAD_LENGTH = 0.08 # arrowhead length in coordinate units
	ARROW_HEAD_WIDTH = 0.03 # arrowhead half-width in coordinate units


	def add_arrow(fig, px, py, pz, color, width=ARROW_WIDTH,
	head_length=ARROW_HEAD_LENGTH, head_width=ARROW_HEAD_WIDTH,
	sx=0, sy=0, sz=0, dash=None):
	"""Draw a vector arrow from (sx,sy,sz) to (px,py,pz) with a flat arrowhead."""
	start = np.array([sx, sy, sz])
	tip = np.array([px, py, pz])
	vec = tip - start
	length = np.linalg.norm(vec)
	if length < 1e-8:
	return
	d = vec / length # unit direction

	# Shorten line so it doesn't overlap arrowhead
	shorten = min(head_length, length * 0.3)
	end = tip - d * shorten

	# Vector line
	fig.add_trace(go.Scatter3d(
	x=[start[0], end[0]], y=[start[1], end[1]], z=[start[2], end[2]],
	mode="lines", line=dict(color=color, width=width, dash=dash),
	showlegend=False, hoverinfo="none",
	))

	# Flat arrowhead using Mesh3d (diamond-cross pyramid, visible from all angles)
	up = np.array([0, 0, 1]) if abs(d[2]) < 0.9 else np.array([0, 1, 0])
	p1 = np.cross(d, up)
	p1 = p1 / np.linalg.norm(p1)
	p2 = np.cross(d, p1)

	base = tip - d * head_length
	w1 = base + p1 * head_width
	w2 = base - p1 * head_width
	w3 = base + p2 * head_width
	w4 = base - p2 * head_width

	# 5 vertices: tip + 4 base points
	vx = [tip[0], w1[0], w2[0], w3[0], w4[0]]
	vy = [tip[1], w1[1], w2[1], w3[1], w4[1]]
	vz = [tip[2], w1[2], w2[2], w3[2], w4[2]]

	# 4 triangular faces forming the pyramid
	fig.add_trace(go.Mesh3d(
	x=vx, y=vy, z=vz,
	i=[0, 0, 0, 0], j=[1, 3, 2, 4], k=[3, 2, 4, 1],
	color=color, opacity=1.0,
	flatshading=True,
	lighting=dict(ambient=1, diffuse=0, specular=0, fresnel=0),
	showlegend=False, hoverinfo="none",
	))


	def add_floor_grid(fig, range_val=1.3, step=0.25):
	"""Draw a grid on the z=0 plane and axis lines through origin (3B1B style)."""
	grid_color = "rgba(99,52,141,0.18)"
	axis_color = "rgba(99,52,141,0.60)"

	# Grid lines on z=0 plane — batched with None separators for efficiency
	vals = np.arange(-range_val, range_val + step / 2, step)

	# Lines parallel to Y axis (varying x)
	xs, ys, zs = [], [], []
	for v in vals:
	xs.extend([v, v, None])
	ys.extend([-range_val, range_val, None])
	zs.extend([0, 0, None])
	fig.add_trace(go.Scatter3d(
	x=xs, y=ys, z=zs,
	mode="lines", line=dict(color=grid_color, width=1),
	showlegend=False, hoverinfo="none",
	))

	# Lines parallel to X axis (varying y)
	xs, ys, zs = [], [], []
	for v in vals:
	xs.extend([-range_val, range_val, None])
	ys.extend([v, v, None])
	zs.extend([0, 0, None])
	fig.add_trace(go.Scatter3d(
	x=xs, y=ys, z=zs,
	mode="lines", line=dict(color=grid_color, width=1),
	showlegend=False, hoverinfo="none",
	))

	# Three axis lines through origin
	for ax in [
	([-range_val, range_val], [0, 0], [0, 0]), # X
	([0, 0], [-range_val, range_val], [0, 0]), # Y
	([0, 0], [0, 0], [-range_val, range_val]), # Z
	]:
	fig.add_trace(go.Scatter3d(
	x=list(ax[0]), y=list(ax[1]), z=list(ax[2]),
	mode="lines", line=dict(color=axis_color, width=2),
	showlegend=False, hoverinfo="none",
	))


	def blank(msg):
	"""Empty placeholder figure with a centered message."""
	fig = go.Figure()
	fig.add_annotation(
	text=msg, xref="paper", yref="paper", x=0.5, y=0.5,
	showarrow=False,
	font=dict(size=16, color=GRAY, family="Inter, sans-serif"),
	)
	fig.update_layout(
	xaxis_visible=False, yaxis_visible=False,
	height=560, paper_bgcolor="white", plot_bgcolor="white",
	margin=dict(l=0, r=0, t=0, b=0),
	)
	return fig


	# ── Share URL helpers ────────────────────────────────────────

	def _shorten_url(long_url):
	"""Shorten a URL via Rebrandly API (using curl). Falls back to long URL."""
	if not REBRANDLY_API_KEY or "localhost" in long_url:
	return long_url
	try:
	payload = json.dumps({
	"destination": long_url,
	"domain": {"fullName": "go.ropavieja.org"},
	})
	result = subprocess.run(
	[
	"curl", "-s", "-X", "POST",
	"https://api.rebrandly.com/v1/links",
	"-H", "Content-Type: application/json",
	"-H", f"apikey: {REBRANDLY_API_KEY}",
	"-d", payload,
	],
	capture_output=True, text=True, timeout=10,
	)
	if result.returncode != 0 or not result.stdout.strip():
	return long_url
	data = json.loads(result.stdout)
	return f"https://{data['shortUrl']}"
	except (subprocess.TimeoutExpired, KeyError, json.JSONDecodeError, OSError) as exc:
	print(f"[share] Rebrandly error: {exc}")
	return long_url


	def _parse_camera(cam_str):
	"""Parse compact camera string (ex,ey,ez[,cx,cy,cz,ux,uy,uz]) to Plotly camera dict."""
	if not cam_str:
	return None
	try:
	vals = [float(v) for v in cam_str.split(",")]
	if len(vals) >= 3:
	camera = dict(eye=dict(x=vals[0], y=vals[1], z=vals[2]))
	if len(vals) >= 6:
	camera["center"] = dict(x=vals[3], y=vals[4], z=vals[5])
	if len(vals) >= 9:
	camera["up"] = dict(x=vals[6], y=vals[7], z=vals[8])
	return camera
	except (ValueError, IndexError):
	pass
	return None


	def _encode_camera(camera_json):
	"""Encode Plotly camera JSON to compact string for URL params."""
	if not camera_json:
	return ""
	try:
	cam = json.loads(camera_json)
	eye = cam.get("eye", {})
	center = cam.get("center", {})
	up = cam.get("up", {})
	vals = [
	eye.get("x", 1.5), eye.get("y", 1.5), eye.get("z", 1.2),
	center.get("x", 0), center.get("y", 0), center.get("z", 0),
	up.get("x", 0), up.get("y", 0), up.get("z", 1),
	]
	return ",".join(f"{v:.2f}" for v in vals)
	except (json.JSONDecodeError, TypeError):
	return ""


	# ── Main visualization ───────────────────────────────────────

	def explore(input_text, selected, hidden=None, camera=None, n_neighbors=None):
	"""Unified 3D visualization of words and vector expressions.

	Args:
	input_text: Comma-separated words and/or expressions.
	selected: Currently selected item for neighbor display (or None).
	hidden: Set of labels to hide from rendering (MDS still uses all items).
	camera: Plotly camera dict to set initial view.
	n_neighbors: Number of nearest neighbors to show (default N_NEIGHBORS).

	Returns:
	(fig, status_md, radio_update, all_labels)
	"""

	if not input_text or not input_text.strip():
	return (
	blank("Enter words or expressions above to visualize in 3D"),
	"",
	gr.update(choices=[], value=None, visible=False),
	[],
	)

	items, bad = parse_items(input_text)

	if not items:
	msg = "No valid items found."
	if bad:
	msg += f"<br>Not in vocabulary: {', '.join(bad)}"
	return blank(msg), "", gr.update(choices=[], value=None, visible=False), []

	items = items[:12]
	labels = [item[0] for item in items]
	hidden = hidden or set()

	# Visible labels for radio choices (exclude hidden)
	visible_labels = [l for l in labels if l not in hidden]

	# No auto-select — user clicks radio to see neighbors
	if selected and selected != "(clear)" and selected in visible_labels:
	sel_idx = labels.index(selected)
	else:
	selected = None
	sel_idx = None

	# ── Collect all unique operand words for MDS ──
	all_words = []
	word_set = set()
	for _, _, _, ops, _ in items:
	for w in ops:
	if w not in word_set:
	word_set.add(w)
	all_words.append(w)

	# Find nearest high-D word for each expression (used in status + MDS padding)
	expr_nearest = {} # label -> (word, similarity)
	for label, vec, is_expr, ops, ordered in items:
	if not is_expr:
	continue
	nearest = model.similar_by_vector(vec, topn=len(ops) + 5)
	for w, s in nearest:
	if w not in ops:
	expr_nearest[label] = (w, s)
	break

	# Pad with neighbor words if < 3 unique words (breaks MDS collinearity)
	helper_words = set()
	if len(all_words) < 3:
	# Try expression nearest-words first
	for label in expr_nearest:
	nw = expr_nearest[label][0]
	if nw not in word_set:
	word_set.add(nw)
	all_words.append(nw)
	helper_words.add(nw)
	if len(all_words) >= 3:
	break
	# Then try neighbors of any plain word
	if len(all_words) < 3:
	for w in list(all_words):
	if w in helper_words:
	continue
	for nw, _ in model.most_similar(w, topn=5):
	if nw not in word_set:
	word_set.add(nw)
	all_words.append(nw)
	helper_words.add(nw)
	if len(all_words) >= 3:
	break
	if len(all_words) >= 3:
	break

	# Gather neighbors if something is selected (and not hidden)
	nn = n_neighbors if n_neighbors is not None else N_NEIGHBORS
	nbr_data = []
	if selected is not None:
	sel_item = items[sel_idx]
	if sel_item[2]: # expression
	raw = model.similar_by_vector(sel_item[1], topn=nn + 20)
	else:
	raw = model.most_similar(sel_item[0], topn=nn + 20)
	all_op_words = set()
	for _, _, _, ops, _ in items:
	all_op_words.update(ops)
	label_set = set(labels)
	nbr_data = [(w, s) for w, s in raw
	if w not in all_op_words and w not in label_set
	][:nn]

	# ── MDS on all operand words + neighbors ──
	mds_words = all_words + [w for w, _ in nbr_data]
	if not mds_words:
	return blank("No valid words found."), "", gr.update(
	choices=[], value=None, visible=False), []

	mds_vecs = np.array([model[w] for w in mds_words])
	mds_coords = reduce_3d(mds_vecs)

	word_3d = {w: mds_coords[i] for i, w in enumerate(all_words)}
	nbr_coords = mds_coords[len(all_words):] if nbr_data else None

	# ── Compute expression results in 3D ──
	extra_points = [] # for dynamic axis range
	expr_info = {} # label -> visualization data

	for label, vec, is_expr, ops, ordered in items:
	if not is_expr:
	continue
	pos_words = [w for w, s, c in ordered if s == "+"]
	neg_words = [w for w, s, c in ordered if s == "-"]
	pos_coeffs = [c for w, s, c in ordered if s == "+"]
	neg_coeffs = [c for w, s, c in ordered if s == "-"]

	if len(neg_words) == 0 and len(pos_words) == 2:
	# Simple addition: chain tip-to-tail + gold result from origin
	a_3d = word_3d[pos_words[0]]
	b_3d = word_3d[pos_words[1]]
	result_3d = pos_coeffs[0] * a_3d + pos_coeffs[1] * b_3d
	extra_points.append(result_3d)
	expr_info[label] = ('add', pos_words[0], pos_words[1],
	pos_coeffs[0], pos_coeffs[1], result_3d)
	else:
	# General: chain through operands + gold result from origin
	cursor = np.zeros(3)
	chain = []
	for w, s, coeff in ordered:
	prev = cursor.copy()
	c = word_3d[w]
	cursor = cursor + coeff * c if s == "+" else cursor - coeff * c
	chain.append((prev.copy(), cursor.copy(), w, s))
	extra_points.append(cursor.copy())
	expr_info[label] = ('chain', chain, cursor.copy())

	# ── Dynamic axis range (computed from ALL items, not just visible) ──
	all_rendered = [word_3d[w] for w in all_words] + extra_points
	if nbr_coords is not None:
	all_rendered.extend(nbr_coords)
	if all_rendered:
	max_abs = np.abs(np.array(all_rendered)).max()
	axis_range = max(1.3, max_abs * 1.15)
	else:
	axis_range = 1.3

	# ── Colors ──
	item_colors = [PALETTE[i % len(PALETTE)] for i in range(len(items))]

	# ── Build figure ──
	fig = go.Figure()
	add_floor_grid(fig, range_val=axis_range)
	annotations = []

	def add_label(x, y, z, text, size=16, color=DARK, opacity=1.0,
	outward=0.07):
	"""Add a 3D text label, shifted outward from origin to avoid overlapping vectors."""
	pt = np.array([x, y, z])
	norm = np.linalg.norm(pt)
	if norm > 1e-8 and outward > 0:
	pt = pt + (pt / norm) * outward
	annotations.append(dict(
	x=float(pt[0]), y=float(pt[1]), z=float(pt[2]),
	text=text, showarrow=False,
	font=dict(size=size, color=color, family="Inter, sans-serif"),
	opacity=opacity, yshift=12,
	))

	for idx, (label, vec, is_expr, ops, ordered) in enumerate(items):
	# Skip hidden items
	if label in hidden:
	continue

	color = item_colors[idx]
	is_sel = (sel_idx is not None and idx == sel_idx)
	is_dim = (sel_idx is not None and idx != sel_idx)

	# Selection-aware styling
	arr_color = lighten(color, 0.5) if is_dim else color
	arr_width = 4 if is_dim else (12 if is_sel else ARROW_WIDTH)
	lbl_color = lighten(color, 0.5) if is_dim else color
	lbl_opacity = 0.7 if is_dim else 1.0
	lbl_size = 18 if is_sel else (15 if is_dim else 16)
	gold = lighten(GOLD, 0.3) if is_dim else GOLD
	gold_width = 6 if is_dim else 12

	if not is_expr:
	# ── Plain word: arrow from origin ──
	c = word_3d[label]
	add_arrow(fig, c[0], c[1], c[2], arr_color, width=arr_width)
	txt = f"<b>{label}</b>" if is_sel else label
	add_label(c[0], c[1], c[2], txt, size=lbl_size,
	color=lbl_color, opacity=lbl_opacity)
	else:
	info = expr_info[label]

	# ── Operand arrows from origin (full length — shows where the word IS) ──
	for w, s, coeff in ordered:
	c = word_3d[w]
	add_arrow(fig, c[0], c[1], c[2], arr_color, width=arr_width)
	txt = f"<b>{w}</b>" if is_sel else w
	add_label(c[0], c[1], c[2], txt, size=lbl_size,
	color=lbl_color, opacity=lbl_opacity)

	# ── Construction arrows with expression labels beside midpoint ──
	gold_lbl = f"<i>{label}</i>"
	gold_lbl_size = 14 if is_dim else 15
	gold_lbl_color = "#b08820" if is_dim else "#8a6a10"

	def _label_beside(start, end):
	"""Place label at midpoint of start→end, offset perpendicular
	to the arrow (on the side farther from origin)."""
	mid = (start + end) / 2
	d = end - start
	length = np.linalg.norm(d)
	if length < 1e-8:
	return mid
	d = d / length
	up = np.array([0., 0., 1.]) if abs(d[2]) < 0.9 \
	else np.array([0., 1., 0.])
	perp = np.cross(d, up)
	pn = np.linalg.norm(perp)
	if pn < 1e-8:
	return mid
	perp = perp / pn * 0.12
	# Pick side farther from origin (pushes label outward)
	if np.dot(mid + perp, mid + perp) >= np.dot(mid - perp, mid - perp):
	return mid + perp
	return mid - perp

	if info[0] == 'add':
	# Second operand drawn from tip of first (chain)
	# info = ('add', word_a, word_b, coeff_a, coeff_b, result_3d)
	a = word_3d[info[1]]
	coeff_a = info[3]
	coeff_b = info[4]
	result_3d = info[5]
	chain_start = coeff_a * a
	chain_color = lighten(color, 0.3) if is_dim else lighten(color, 0.2)
	add_arrow(fig, result_3d[0], result_3d[1], result_3d[2],
	chain_color, width=arr_width,
	sx=chain_start[0], sy=chain_start[1], sz=chain_start[2],
	dash="dot")
	# Gold result from origin
	add_arrow(fig, result_3d[0], result_3d[1], result_3d[2],
	gold, width=gold_width)
	origin = np.zeros(3)
	lpt = _label_beside(origin, result_3d)
	add_label(lpt[0], lpt[1], lpt[2], gold_lbl,
	size=gold_lbl_size, color=gold_lbl_color,
	opacity=lbl_opacity, outward=0)

	else: # chain
	chain_steps = info[1]
	result_3d = info[2]
	chain_color = lighten(color, 0.3) if is_dim else lighten(color, 0.2)
	for i, (start, end, w, s) in enumerate(chain_steps):
	if i == 0:
	continue # first step overlaps operand arrow
	add_arrow(fig, end[0], end[1], end[2],
	chain_color, width=arr_width,
	sx=start[0], sy=start[1], sz=start[2], dash="dot")
	# Gold result from origin
	add_arrow(fig, result_3d[0], result_3d[1], result_3d[2],
	gold, width=gold_width)
	origin = np.zeros(3)
	lpt = _label_beside(origin, result_3d)
	add_label(lpt[0], lpt[1], lpt[2], gold_lbl,
	size=gold_lbl_size, color=gold_lbl_color,
	opacity=lbl_opacity, outward=0)

	# (helper_words are invisible — only present for MDS geometry)

	# ── Neighbors ──
	if selected is not None and nbr_data and nbr_coords is not None:
	sel_color = item_colors[sel_idx]
	nbr_color = lighten(sel_color, 0.3)

	for i, (w, s) in enumerate(nbr_data):
	add_arrow(fig,
	nbr_coords[i, 0], nbr_coords[i, 1], nbr_coords[i, 2],
	nbr_color, width=ARROW_WIDTH,
	sx=0, sy=0, sz=0, dash="dot")
	add_label(nbr_coords[i, 0], nbr_coords[i, 1], nbr_coords[i, 2],
	w, size=16, color=DARK)

	fig.update_layout(**layout_3d(axis_range=axis_range, camera=camera),
	scene_annotations=annotations)

	# ── Status text ──
	n_visible = sum(1 for l, _, ie, _, _ in items if l not in hidden)
	n_hidden = len(hidden & set(labels))
	n_words = sum(1 for l, _, ie, _, _ in items if not ie and l not in hidden)
	n_expr = sum(1 for l, _, ie, _, _ in items if ie and l not in hidden)
	parts = []
	if n_words:
	parts.append(f"{n_words} word{'s' if n_words != 1 else ''}")
	if n_expr:
	parts.append(f"{n_expr} expression{'s' if n_expr != 1 else ''}")
	status = " + ".join(parts) + " in 3D" if parts else "Nothing visible"
	if n_hidden:
	status += f" · {n_hidden} hidden"
	if bad:
	status += f" · Not found: {', '.join(bad)}"
	for label in expr_nearest:
	if label not in hidden:
	w, s = expr_nearest[label]
	status += f" · {label} ≈ {w} ({s:.3f})"
	if nbr_data:
	status += f" · {len(nbr_data)} neighbors of {selected}"

	choices = ["(clear)"] + visible_labels
	return (
	fig,
	status,
	gr.update(choices=choices, value=selected or "(clear)", visible=True),
	labels,
	)


	# ── Gradio UI ────────────────────────────────────────────────

	CSS = """
	.gradio-container { max-width: 100% !important; padding: 0 2rem !important; }
	h1 { color: #63348d !important; }

	/* Example buttons — dark purple outline */
	.purple-examples td {
	border: 2px solid #63348d !important;
	border-radius: 6px !important;
	color: #301848 !important;
	cursor: pointer !important;
	}
	.purple-examples td:hover {
	background: #ded9f4 !important;
	}

	/* Radio neighbor selector — purple text, white-on-purple when selected */
	.nbr-radio label {
	color: #63348d !important;
	border: 1px solid #63348d !important;
	border-radius: 6px !important;
	}
	.nbr-radio label.selected {
	background: #63348d !important;
	color: #ffffff !important;
	}
	.nbr-radio label.selected * {
	color: #ffffff !important;
	}

	/* Visibility checkboxes — compact */
	.vis-cbg label {
	color: #63348d !important;
	border: 1px solid #63348d !important;
	border-radius: 6px !important;
	}
	.vis-cbg label.selected {
	background: #63348d !important;
	color: #ffffff !important;
	}
	.vis-cbg label.selected * {
	color: #ffffff !important;
	}

	/* 3D viewport — full-width 16:9 with border */
	.plot-viewport {
	border: 2px solid #ded9f4 !important;
	border-radius: 8px !important;
	}
	.plot-viewport .plot-container {
	aspect-ratio: 16 / 9 !important;
	width: 100% !important;
	}
	.plot-viewport .js-plotly-plot, .plot-viewport .plotly {
	width: 100% !important;
	height: 100% !important;
	}

	/* Neighbors dropdown — compact */
	.nn-dropdown {
	max-width: 100px !important;
	}
	.nn-dropdown select {
	color: #63348d !important;
	}

	/* Hidden camera state textbox (visible=False prevents DOM rendering in Gradio 6) */
	.camera-hidden { display: none !important; }

	/* Input fields — white for contrast */
	textarea, input[type="text"] {
	background: #ffffff !important;
	}
	"""

	FORCE_LIGHT = """
	<script>
	if(!location.search.includes("__theme=light")){
	const u=new URL(location);u.searchParams.set("__theme","light");location.replace(u);
	}
	</script>
	<script>
	// Camera tracker — polls Plotly camera into hidden textbox for Gradio to read
	(function() {
	console.log('[cam] Camera tracker script loaded');
	var attempts = 0;
	var interval = setInterval(function() {
	attempts++;
	var plots = document.querySelectorAll('.js-plotly-plot');
	if (plots.length === 0) {
	if (attempts % 20 === 0) console.log('[cam] waiting for plot...', attempts);
	return;
	}
	var plot = plots[0];
	if (!plot._fullLayout \|\| !plot._fullLayout.scene \|\| !plot._fullLayout.scene._scene) {
	if (attempts % 20 === 0) console.log('[cam] plot found but no scene yet');
	return;
	}
	try {
	var cam = plot._fullLayout.scene._scene.getCamera();
	var el = document.querySelector('#camera_txt textarea, #camera_txt input');
	if (!el) {
	console.log('[cam] cannot find #camera_txt element');
	return;
	}
	var val = JSON.stringify(cam);
	if (el.value !== val) {
	el.value = val;
	var nativeInputValueSetter = Object.getOwnPropertyDescriptor(
	window.HTMLTextAreaElement.prototype, 'value'
	) \|\| Object.getOwnPropertyDescriptor(
	window.HTMLInputElement.prototype, 'value'
	);
	if (nativeInputValueSetter && nativeInputValueSetter.set) {
	nativeInputValueSetter.set.call(el, val);
	}
	el.dispatchEvent(new Event('input', {bubbles: true}));
	el.dispatchEvent(new Event('change', {bubbles: true}));
	}
	} catch(e) {
	console.log('[cam] error:', e);
	}
	}, 500);
	})();
	</script>
	<script>
	// Clear share params from URL after load (so refresh doesn't re-apply)
	if (new URL(location).searchParams.has('q')) {
	var _clearId = setInterval(function() {
	if (document.querySelector('.js-plotly-plot')) {
	var clean = new URL(location.pathname, location.origin);
	clean.searchParams.set('__theme', 'light');
	history.replaceState(null, '', clean.toString());
	clearInterval(_clearId);
	}
	}, 500);
	}
	</script>
	"""

	_LIGHT = {
	"button_primary_background_fill": "#63348d",
	"button_primary_background_fill_hover": "#4a2769",
	"button_primary_text_color": "#ffffff",
	"block_background_fill": "#f3f0f7",
	"block_border_color": "#ded9f4",
	"body_background_fill": "#ffffff",
	"body_text_color": "#1a1a2e",
	"block_label_text_color": "#63348d",
	"block_title_text_color": "#63348d",
	"background_fill_primary": "#ffffff",
	"background_fill_secondary": "#f3f0f7",
	"input_background_fill": "#ffffff",
	"input_background_fill_focus": "#ffffff",
	"input_border_color": "#ded9f4",
	"input_border_color_focus": "#63348d",
	"input_placeholder_color": "#888888",
	"border_color_primary": "#ded9f4",
	"border_color_accent": "#63348d",
	"panel_background_fill": "#f3f0f7",
	}
	# Mirror every light value into _dark so dark mode looks identical
	_ALL = {}
	for k, v in _LIGHT.items():
	_ALL[k] = v
	_ALL[k + "_dark"] = v

	THEME = gr.themes.Soft(
	primary_hue="purple",
	font=gr.themes.GoogleFont("Inter"),
	).set(**_ALL)

	with gr.Blocks(title="Embedding Explorer") as demo:

	# ── State ──
	all_labels_state = gr.State([])
	loading_share = gr.State(False) # suppress cascading events during share load
	camera_txt = gr.Textbox(elem_id="camera_txt", elem_classes=["camera-hidden"])
	share_params = gr.State({})

	# Force light mode fallback (head param covers most cases, this catches HF Spaces)
	gr.HTML('<script>if(!location.search.includes("__theme=light"))'
	'{const u=new URL(location);u.searchParams.set("__theme","light");'
	'location.replace(u)}</script>')

	gr.Markdown(
	"# Embedding Explorer\n"
	"Words in AI models are stored as vectors — long lists of numbers "
	"that encode meaning. Similar words have similar vectors. "
	"You can even do vector math: king − man + woman ≈ queen. "
	"This tool lets you explore these representations in 3D using "
	"[GloVe](https://nlp.stanford.edu/projects/glove/) word vectors "
	f"({len(VOCAB):,} words, {DIMS} dimensions)."
	)
	gr.Markdown(
	"*Enter words to see them in 3D, or try vector arithmetic "
	"with `+` and `−`. Separate groups with commas. "
	"Click an item below the plot to see its nearest neighbors.*"
	)

	with gr.Row():
	with gr.Column(scale=2):
	exp_in = gr.Textbox(
	label="Words or expressions (comma-separated)",
	placeholder="dog cat fish or king - man + woman",
	lines=1,
	)
	with gr.Column(scale=1):
	with gr.Row():
	exp_btn = gr.Button("Explore", variant="primary")
	share_btn = gr.Button("Share", variant="secondary",
	scale=0, min_width=80)
	share_url = gr.Textbox(label="Share URL", visible=False,
	interactive=False, buttons=["copy"])
	with gr.Column(elem_classes=["purple-examples"]):
	gr.Examples(
	examples=[[e] for e in EXAMPLES],
	inputs=[exp_in],
	label="Try these",
	)
	exp_plot = gr.Plot(label="Embedding Space", elem_classes=["plot-viewport"])
	exp_status = gr.Markdown("")
	vis_cbg = gr.CheckboxGroup(
	label="Visible items (uncheck to hide)",
	choices=[], value=[],
	visible=False, interactive=True,
	elem_classes=["vis-cbg"],
	)
	with gr.Row():
	exp_radio = gr.Radio(
	label="Click to see nearest neighbors",
	choices=[], value=None,
	visible=False, interactive=True,
	elem_classes=["nbr-radio"],
	)
	nn_dropdown = gr.Dropdown(
	label="Neighbors",
	choices=[str(i) for i in range(3, 13)],
	value=str(N_NEIGHBORS),
	interactive=True,
	scale=0,
	min_width=90,
	elem_classes=["nn-dropdown"],
	)

	# ── Event handlers ──

	def _parse_camera_json(camera_json):
	"""Parse camera JSON string (from JS bridge) into Plotly camera dict."""
	if not camera_json:
	return None
	try:
	return json.loads(camera_json)
	except (json.JSONDecodeError, TypeError):
	return None

	def _get_nn(nn_val):
	"""Parse neighbor count from dropdown value."""
	try:
	return int(nn_val)
	except (TypeError, ValueError):
	return N_NEIGHBORS

	def on_explore(input_text, nn_val=None):
	"""Fresh explore — compute MDS, show all items, reset checkboxes.

	Supports @word syntax to auto-select a word for neighbors:
	dog cat fish @dog → plots all 3, shows dog's neighbors
	"""
	nn = _get_nn(nn_val)
	selected = None
	if input_text and "@" in input_text:
	match = re.search(r"@(\S+)", input_text)
	if match:
	selected = match.group(1).lower()
	input_text = re.sub(r"\s*@\S+", "", input_text).strip()
	fig, status, radio, labels = explore(input_text, selected, n_neighbors=nn)
	cbg = gr.update(choices=labels, value=labels, visible=bool(labels))
	return fig, status, radio, labels, cbg, gr.update(value=input_text)

	def on_radio(input_text, selected, all_labels, visible, camera_json, is_loading, nn_val):
	"""Neighbor selection — re-render with current visibility + camera."""
	if is_loading:
	return gr.update(), gr.update(), gr.update(), False
	nn = _get_nn(nn_val)
	hidden = set(all_labels) - set(visible) if all_labels and visible else set()
	camera = _parse_camera_json(camera_json)
	fig, status, radio, _ = explore(input_text, selected, hidden=hidden or None, camera=camera, n_neighbors=nn)
	return fig, status, radio, False

	def on_visibility(input_text, selected, all_labels, visible, camera_json, is_loading, nn_val):
	"""Visibility toggle — re-render with updated hidden set + camera."""
	if is_loading:
	return gr.update(), gr.update(), gr.update(), False
	nn = _get_nn(nn_val)
	hidden = set(all_labels) - set(visible) if all_labels else set()
	# If selected item is now hidden, clear selection
	if selected and selected != "(clear)" and selected in hidden:
	selected = None
	camera = _parse_camera_json(camera_json)
	fig, status, radio, _ = explore(input_text, selected, hidden=hidden or None, camera=camera, n_neighbors=nn)
	return fig, status, radio, False

	def on_nn_change(input_text, selected, all_labels, visible, camera_json, is_loading, nn_val):
	"""Neighbor count changed — re-render if a word is selected."""
	if is_loading:
	return gr.update(), gr.update(), gr.update(), False
	if not selected or selected == "(clear)":
	return gr.update(), gr.update(), gr.update(), False
	nn = _get_nn(nn_val)
	hidden = set(all_labels) - set(visible) if all_labels and visible else set()
	camera = _parse_camera_json(camera_json)
	fig, status, radio, _ = explore(input_text, selected, hidden=hidden or None, camera=camera, n_neighbors=nn)
	return fig, status, radio, False

	def on_share(input_text, selected, visible, camera_json, nn_val, request: gr.Request):
	"""Build share URL encoding current state."""
	params = {}
	if input_text and input_text.strip():
	params["q"] = input_text.strip()
	if selected and selected != "(clear)":
	params["sel"] = selected
	# Only encode visibility if some items are hidden
	if visible is not None and isinstance(visible, list):
	params["vis"] = ",".join(visible)
	if camera_json:
	encoded = _encode_camera(camera_json)
	if encoded:
	params["cam"] = encoded
	nn = _get_nn(nn_val)
	if nn != N_NEIGHBORS:
	params["nn"] = str(nn)
	if not params.get("q"):
	return gr.update(value="Nothing to share", visible=True)
	# Build base URL from request (gets correct port for local dev)
	base_url = _BASE_URL
	if request:
	host = request.headers.get("host", "")
	if host:
	scheme = "https" if _SPACE_ID else "http"
	base_url = f"{scheme}://{host}/"
	long_url = base_url + "?" + urllib.parse.urlencode(params)
	# On localhost, just return the full URL (Rebrandly rejects non-public URLs)
	if "localhost" in long_url or "127.0.0.1" in long_url:
	return gr.update(value=long_url, visible=True)
	short = _shorten_url(long_url)
	return gr.update(value=short, visible=True)

	# ── Wire up events ──

	_EXAMPLE_SET = set(EXAMPLES)

	def on_input_change(input_text, nn_val):
	"""Auto-explore when input matches an example (set by gr.Examples click)."""
	if input_text and input_text.strip() in _EXAMPLE_SET:
	return on_explore(input_text, nn_val)
	return gr.update(), gr.update(), gr.update(), gr.update(), gr.update(), gr.update()

	exp_in.change(
	on_input_change,
	inputs=[exp_in, nn_dropdown],
	outputs=[exp_plot, exp_status, exp_radio, all_labels_state, vis_cbg, exp_in],
	)
	exp_btn.click(
	on_explore,
	inputs=[exp_in, nn_dropdown],
	outputs=[exp_plot, exp_status, exp_radio, all_labels_state, vis_cbg, exp_in],
	)
	exp_in.submit(
	on_explore,
	inputs=[exp_in, nn_dropdown],
	outputs=[exp_plot, exp_status, exp_radio, all_labels_state, vis_cbg, exp_in],
	)
	# Radio + visibility + nn: camera_txt is kept up-to-date by polling script
	exp_radio.change(
	on_radio,
	inputs=[exp_in, exp_radio, all_labels_state, vis_cbg, camera_txt, loading_share, nn_dropdown],
	outputs=[exp_plot, exp_status, exp_radio, loading_share],
	)
	vis_cbg.change(
	on_visibility,
	inputs=[exp_in, exp_radio, all_labels_state, vis_cbg, camera_txt, loading_share, nn_dropdown],
	outputs=[exp_plot, exp_status, exp_radio, loading_share],
	)
	nn_dropdown.change(
	on_nn_change,
	inputs=[exp_in, exp_radio, all_labels_state, vis_cbg, camera_txt, loading_share, nn_dropdown],
	outputs=[exp_plot, exp_status, exp_radio, loading_share],
	)

	# Share: camera_txt kept up-to-date by polling script
	share_btn.click(
	fn=on_share,
	inputs=[exp_in, exp_radio, vis_cbg, camera_txt, nn_dropdown],
	outputs=[share_url],
	)

	# ── Share URL loading ──

	def load_share_params(request: gr.Request):
	"""Step 1: Parse query params from URL."""
	qp = dict(request.query_params) if request else {}
	return qp

	def apply_share_params(params):
	"""Step 2: Apply share params — set input, run explore, apply visibility + camera + nn."""
	if not params or "q" not in params:
	# Check if nn param is present even without q
	nn_str = params.get("nn") if params else None
	nn_update = gr.update(value=nn_str) if nn_str else gr.update()
	return (
	gr.update(), # exp_in
	gr.update(), # exp_plot
	gr.update(), # exp_status
	gr.update(), # exp_radio
	gr.update(), # vis_cbg
	[], # all_labels_state
	gr.update(), # camera_txt
	False, # loading_share
	nn_update, # nn_dropdown
	)

	input_text = params.get("q", "")
	selected = params.get("sel")
	if selected == "":
	selected = None
	vis_str = params.get("vis")
	cam_str = params.get("cam")
	nn_str = params.get("nn")

	camera = _parse_camera(cam_str)
	nn = int(nn_str) if nn_str and nn_str.isdigit() else None

	# First explore with all items visible to get labels
	_, _, _, labels = explore(input_text, None, camera=camera, n_neighbors=nn)

	# Apply visibility
	if vis_str:
	visible = [v.strip() for v in vis_str.split(",")]
	hidden = set(labels) - set(visible)
	else:
	visible = labels
	hidden = set()

	fig, status, radio, _ = explore(
	input_text, selected, hidden=hidden or None, camera=camera, n_neighbors=nn
	)

	cbg = gr.update(
	choices=labels,
	value=visible,
	visible=bool(labels),
	)

	# Pre-populate camera_txt so subsequent re-renders preserve camera
	camera_json = json.dumps(camera) if camera else ""

	nn_update = gr.update(value=str(nn)) if nn else gr.update()

	return (
	gr.update(value=input_text),
	fig,
	status,
	radio,
	cbg,
	labels,
	gr.update(value=camera_json),
	True, # loading_share — suppress cascading events
	nn_update,
	)

	demo.load(
	fn=load_share_params,
	outputs=[share_params],
	).then(
	fn=apply_share_params,
	inputs=[share_params],
	outputs=[exp_in, exp_plot, exp_status, exp_radio, vis_cbg, all_labels_state, camera_txt, loading_share, nn_dropdown],
	)

	demo.launch(theme=THEME, css=CSS, head=FORCE_LIGHT)