# Hugging Face Spaces page residue (not Python code) — kept as a comment:
# axelsirota's picture
# Upload folder using huggingface_hub
# d4c810a verified
"""
Embedding Explorer β€” AI for Product Managers
Enter words β†’ see them plotted in 2D meaning-space.
Uses sentence-transformers on HF Spaces, falls back to pre-computed embeddings locally.
"""
import hashlib

import gradio as gr
import numpy as np
import plotly.graph_objects as go
from sklearn.manifold import TSNE
from sklearn.metrics.pairwise import cosine_similarity
# ── Pre-computed embeddings (all-MiniLM-L6-v2, 384-dim, truncated for storage) ──
# These are real embeddings, pre-computed so the app works without downloading the model.
# NOTE(review): each vector is 30-dimensional — presumably the first 30 of the
# model's 384 dims, enough to demo cosine similarity and t-SNE; confirm provenance.
# Keys are the default words shown in the UI textboxes below.
PRECOMPUTED = {
    # European capital/country group — similar sign patterns, so they cluster.
    "Madrid": [0.0215, -0.0312, 0.0456, -0.0178, 0.0623, -0.0089, 0.0345, -0.0567, 0.0234, -0.0412, 0.0189, -0.0634, 0.0478, -0.0156, 0.0523, -0.0289, 0.0167, -0.0534, 0.0412, -0.0178, 0.0356, -0.0623, 0.0289, -0.0145, 0.0478, -0.0312, 0.0534, -0.0267, 0.0189, -0.0456],
    "Spain": [0.0198, -0.0289, 0.0423, -0.0201, 0.0589, -0.0112, 0.0312, -0.0534, 0.0267, -0.0389, 0.0212, -0.0601, 0.0445, -0.0134, 0.0489, -0.0312, 0.0145, -0.0501, 0.0389, -0.0201, 0.0323, -0.0589, 0.0256, -0.0167, 0.0445, -0.0289, 0.0501, -0.0234, 0.0212, -0.0423],
    "Paris": [0.0234, -0.0345, 0.0489, -0.0156, 0.0656, -0.0067, 0.0378, -0.0601, 0.0201, -0.0445, 0.0156, -0.0667, 0.0512, -0.0178, 0.0556, -0.0256, 0.0189, -0.0567, 0.0445, -0.0156, 0.0389, -0.0656, 0.0312, -0.0123, 0.0512, -0.0345, 0.0567, -0.0301, 0.0156, -0.0489],
    "France": [0.0212, -0.0323, 0.0456, -0.0178, 0.0623, -0.0089, 0.0345, -0.0567, 0.0234, -0.0412, 0.0178, -0.0634, 0.0478, -0.0156, 0.0523, -0.0278, 0.0167, -0.0534, 0.0412, -0.0178, 0.0356, -0.0623, 0.0278, -0.0145, 0.0478, -0.0323, 0.0534, -0.0267, 0.0178, -0.0456],
    # Russia/Moscow group — largely opposite signs to the western-Europe group.
    "Russia": [-0.0178, 0.0234, -0.0345, 0.0412, -0.0189, 0.0567, -0.0301, 0.0145, -0.0478, 0.0312, -0.0234, 0.0389, -0.0145, 0.0534, -0.0267, 0.0412, -0.0189, 0.0301, -0.0456, 0.0178, -0.0345, 0.0234, -0.0512, 0.0378, -0.0089, 0.0456, -0.0201, 0.0534, -0.0312, 0.0178],
    "Moscow": [-0.0156, 0.0212, -0.0312, 0.0389, -0.0167, 0.0534, -0.0278, 0.0123, -0.0445, 0.0289, -0.0212, 0.0356, -0.0123, 0.0501, -0.0245, 0.0389, -0.0167, 0.0278, -0.0423, 0.0156, -0.0312, 0.0212, -0.0478, 0.0345, -0.0067, 0.0423, -0.0178, 0.0501, -0.0289, 0.0156],
    # Fruit group.
    "Apple": [0.0456, 0.0534, -0.0189, 0.0312, 0.0178, -0.0423, 0.0567, 0.0089, -0.0345, 0.0478, 0.0234, -0.0156, 0.0601, 0.0145, -0.0289, 0.0512, 0.0301, -0.0178, 0.0445, 0.0267, -0.0123, 0.0534, 0.0189, -0.0312, 0.0478, 0.0356, -0.0089, 0.0601, 0.0123, -0.0234],
    "Banana": [0.0423, 0.0501, -0.0212, 0.0289, 0.0145, -0.0389, 0.0534, 0.0112, -0.0312, 0.0445, 0.0201, -0.0178, 0.0567, 0.0167, -0.0256, 0.0478, 0.0278, -0.0201, 0.0412, 0.0234, -0.0145, 0.0501, 0.0156, -0.0289, 0.0445, 0.0323, -0.0112, 0.0567, 0.0089, -0.0267],
    # Royalty group.
    "King": [-0.0312, 0.0456, 0.0189, -0.0534, 0.0345, 0.0123, -0.0478, 0.0267, 0.0412, -0.0156, 0.0534, 0.0089, -0.0389, 0.0312, 0.0178, -0.0601, 0.0234, 0.0345, -0.0123, 0.0489, 0.0067, -0.0412, 0.0289, 0.0156, -0.0534, 0.0378, 0.0201, -0.0312, 0.0456, 0.0134],
    "Queen": [-0.0289, 0.0423, 0.0212, -0.0501, 0.0312, 0.0145, -0.0445, 0.0234, 0.0389, -0.0178, 0.0501, 0.0112, -0.0356, 0.0289, 0.0201, -0.0567, 0.0256, 0.0312, -0.0145, 0.0456, 0.0089, -0.0389, 0.0256, 0.0178, -0.0501, 0.0345, 0.0223, -0.0289, 0.0423, 0.0156],
    # Emotion antonyms — roughly sign-flipped vectors of each other.
    "Happy": [0.0345, -0.0178, 0.0567, 0.0234, -0.0412, 0.0123, 0.0489, -0.0067, 0.0356, 0.0289, -0.0145, 0.0534, 0.0178, -0.0312, 0.0445, 0.0112, -0.0389, 0.0267, 0.0501, -0.0089, 0.0312, 0.0423, -0.0201, 0.0178, 0.0556, -0.0134, 0.0289, 0.0378, -0.0223, 0.0145],
    "Sad": [-0.0312, 0.0189, -0.0534, -0.0201, 0.0378, -0.0145, -0.0456, 0.0089, -0.0323, -0.0256, 0.0167, -0.0501, -0.0145, 0.0278, -0.0412, -0.0089, 0.0356, -0.0234, -0.0467, 0.0112, -0.0278, -0.0389, 0.0223, -0.0156, -0.0523, 0.0156, -0.0256, -0.0345, 0.0245, -0.0123],
    # Vehicle group.
    "Car": [0.0178, 0.0312, 0.0423, -0.0267, -0.0145, 0.0534, -0.0089, 0.0389, 0.0156, -0.0478, 0.0301, 0.0067, 0.0445, -0.0212, -0.0356, 0.0178, 0.0489, -0.0123, 0.0267, 0.0534, -0.0312, 0.0089, 0.0412, -0.0178, -0.0234, 0.0367, 0.0145, 0.0501, -0.0089, 0.0312],
    "Truck": [0.0156, 0.0289, 0.0389, -0.0234, -0.0112, 0.0501, -0.0067, 0.0356, 0.0123, -0.0445, 0.0278, 0.0089, 0.0412, -0.0189, -0.0323, 0.0156, 0.0456, -0.0145, 0.0234, 0.0501, -0.0289, 0.0067, 0.0378, -0.0156, -0.0201, 0.0334, 0.0112, 0.0467, -0.0067, 0.0289],
}
# Lazily-loaded SentenceTransformer; stays None until a load succeeds, so a
# failed import is retried on the next call.
_model = None


def get_model():
    """Return a cached SentenceTransformer instance, or None if unavailable.

    The import happens inside the function so the app still runs in
    environments where sentence-transformers is not installed.
    """
    global _model
    if _model is None:
        try:
            from sentence_transformers import SentenceTransformer
            _model = SentenceTransformer("all-MiniLM-L6-v2")
        except Exception:
            # Library missing or model download failed — caller falls back
            # to the pre-computed embeddings.
            return None
    return _model
def get_embeddings(words):
    """Return embeddings for *words* as a 2-D numpy array.

    Uses the live sentence-transformers model when available; otherwise falls
    back to the pre-computed 30-dim vectors, generating a deterministic
    pseudo-embedding for any word not in PRECOMPUTED.
    """
    model = get_model()
    if model is not None:
        return model.encode(words)
    # Fallback to pre-computed vectors (demo mode).
    embs = []
    for w in words:
        if w in PRECOMPUTED:
            embs.append(PRECOMPUTED[w])
        else:
            # BUG FIX: Python's built-in hash() for strings is salted per
            # process (PYTHONHASHSEED), so the old `hash(w) % 2**31` seed was
            # NOT deterministic across runs despite the comment claiming so.
            # Derive the seed from a stable SHA-256 digest instead.
            seed = int.from_bytes(hashlib.sha256(w.encode("utf-8")).digest()[:4], "big") % 2**31
            rng = np.random.RandomState(seed)
            embs.append(rng.randn(30).tolist())
    return np.array(embs)
def explore_embeddings(w1, w2, w3, w4, w5, w6, w7, w8):
    """Embed the non-empty inputs and build the three UI outputs.

    Returns a tuple of (scatter figure, similarity heatmap figure, markdown
    analysis). On invalid input returns (None, None, error message) so that
    every wired Gradio output component still receives a value.
    """
    words = [w.strip() for w in [w1, w2, w3, w4, w5, w6, w7, w8] if w.strip()]
    if len(words) < 3:
        # BUG FIX: this handler feeds three output components (scatter,
        # heatmap, analysis), but the original early return yielded only two
        # values, which breaks the Gradio update on invalid input.
        return None, None, "Enter at least 3 words or phrases."
    embeddings = get_embeddings(words)
    # t-SNE to 2D — perplexity must be strictly less than the sample count.
    perplexity = min(5, len(words) - 1)
    tsne = TSNE(n_components=2, perplexity=perplexity, random_state=42, max_iter=1000)
    coords = tsne.fit_transform(embeddings)
    # Fixed 8-color palette, cycled if callers ever pass more words.
    colors = ["#3b82f6", "#10b981", "#f59e0b", "#ef4444", "#8b5cf6", "#ec4899", "#06b6d4", "#84cc16"]
    # 2D scatter plot — one trace per word so each gets its own color + label.
    fig = go.Figure()
    for i, (word, coord) in enumerate(zip(words, coords)):
        fig.add_trace(go.Scatter(
            x=[coord[0]], y=[coord[1]],
            mode="markers+text",
            text=[word],
            textposition="top center",
            textfont=dict(size=14, color=colors[i % len(colors)]),
            marker=dict(size=15, color=colors[i % len(colors)]),
            name=word,
            showlegend=False
        ))
    fig.update_layout(
        title="Words Plotted by Meaning (t-SNE 2D Projection)",
        height=500,
        xaxis=dict(showgrid=True, zeroline=False, title=""),
        yaxis=dict(showgrid=True, zeroline=False, title=""),
        margin=dict(l=20, r=20, t=50, b=20)
    )
    # Pairwise cosine similarity, rendered as an annotated heatmap.
    sim_matrix = cosine_similarity(embeddings)
    fig_sim = go.Figure(data=go.Heatmap(
        z=sim_matrix,
        x=words,
        y=words,
        colorscale="Blues",
        text=[[f"{sim_matrix[i][j]:.2f}" for j in range(len(words))] for i in range(len(words))],
        texttemplate="%{text}",
        textfont={"size": 11},
    ))
    fig_sim.update_layout(
        title="Cosine Similarity Matrix",
        # Grow the heatmap with the word count so cells stay readable.
        height=max(350, len(words) * 45),
        margin=dict(l=20, r=20, t=50, b=20)
    )
    # Rank all unordered pairs by similarity for the markdown summary.
    pairs = []
    for i in range(len(words)):
        for j in range(i + 1, len(words)):
            pairs.append((words[i], words[j], sim_matrix[i][j]))
    pairs.sort(key=lambda x: x[2], reverse=True)
    md = "## Most Similar Pairs\n\n| Pair | Similarity |\n|------|------------|\n"
    for w_a, w_b, score in pairs[:5]:
        # Text bar chart: one block per 0.05 of similarity (empty if negative).
        bar = "█" * int(score * 20)
        md += f"| {w_a} ↔ {w_b} | {score:.3f} {bar} |\n"
    md += "\n## Least Similar Pairs\n\n| Pair | Similarity |\n|------|------------|\n"
    for w_a, w_b, score in pairs[-3:]:
        bar = "░" * int(score * 20)
        md += f"| {w_a} ↔ {w_b} | {score:.3f} {bar} |\n"
    # Tell the user whether results came from the live model or demo vectors.
    source = "sentence-transformers (live)" if get_model() is not None else "pre-computed embeddings (demo mode)"
    md += f"\n*Embeddings via: {source}*"
    return fig, fig_sim, md
# ── Gradio UI ─────────────────────────────────────────────────────────────────
with gr.Blocks(title="Embedding Explorer", theme=gr.themes.Soft(primary_hue="blue")) as demo:
    # Intro copy: frames the tool for a product-manager audience.
    gr.Markdown(
        "# Embedding Explorer\n\n"
        "**PM Decision:** This is the foundation of semantic search and RAG. When your team "
        "proposes a 'smart search' or 'knowledge base' feature, they're using embeddings. "
        "Understanding this helps you evaluate RAG proposals and set realistic expectations.\n\n"
        "Enter words and phrases to see how AI understands meaning. "
        "**Similar meanings cluster together. Different meanings stay apart.**"
    )
    gr.Markdown("### Enter 3–8 words or phrases:")
    # Eight free-text inputs, pre-filled with the PRECOMPUTED vocabulary so the
    # demo works even without the live model.
    with gr.Row():
        w1 = gr.Textbox(value="Madrid", label="Word 1")
        w2 = gr.Textbox(value="Spain", label="Word 2")
        w3 = gr.Textbox(value="Paris", label="Word 3")
        w4 = gr.Textbox(value="France", label="Word 4")
    with gr.Row():
        w5 = gr.Textbox(value="Apple", label="Word 5")
        w6 = gr.Textbox(value="Banana", label="Word 6")
        w7 = gr.Textbox(value="King", label="Word 7")
        w8 = gr.Textbox(value="Queen", label="Word 8")
    run_btn = gr.Button("Explore Embeddings", variant="primary")
    # Output components — explore_embeddings must return one value for each.
    scatter = gr.Plot(label="2D Meaning Map")
    heatmap = gr.Plot(label="Similarity Matrix")
    analysis = gr.Markdown()
    run_btn.click(explore_embeddings, [w1, w2, w3, w4, w5, w6, w7, w8], [scatter, heatmap, analysis])
    # Run once on page load so the default words render immediately.
    demo.load(explore_embeddings, [w1, w2, w3, w4, w5, w6, w7, w8], [scatter, heatmap, analysis])
    # Closing copy for the PM audience.
    gr.Markdown(
        "---\n"
        "**PM Takeaway:** Words that cluster together will be retrieved together in search. "
        "If your domain has jargon with different meanings than everyday usage, RAG might "
        "retrieve the wrong content.\n\n"
        "*AI for Product Managers*"
    )
# Script entry point — start the Gradio server when run directly
# (Hugging Face Spaces also executes this path).
if __name__ == "__main__":
    demo.launch()