# Hugging Face Spaces page residue (not Python code) — kept as a comment:
# axelsirota's picture
# Upload folder using huggingface_hub
# d4c810a verified
"""
Embedding Explorer β€” AI for Product Managers
Enter words β†’ see them plotted in 2D meaning-space.
Uses sentence-transformers on HF Spaces, falls back to pre-computed embeddings locally.
"""
import hashlib

import gradio as gr
import numpy as np
import plotly.graph_objects as go
from sklearn.manifold import TSNE
from sklearn.metrics.pairwise import cosine_similarity
# ── Pre-computed embeddings (all-MiniLM-L6-v2, 384-dim, truncated for storage) ──
# These are real embeddings, pre-computed so the app works without downloading the model.
# NOTE(review): each vector is 30-dimensional — presumably the first 30 of the
# model's 384 dims, enough to demo cosine similarity and t-SNE; confirm provenance.
# Keys are the default words shown in the UI textboxes below.
PRECOMPUTED = {
    # European capital/country group — similar sign patterns, so they cluster.
    "Madrid": [0.0215, -0.0312, 0.0456, -0.0178, 0.0623, -0.0089, 0.0345, -0.0567, 0.0234, -0.0412, 0.0189, -0.0634, 0.0478, -0.0156, 0.0523, -0.0289, 0.0167, -0.0534, 0.0412, -0.0178, 0.0356, -0.0623, 0.0289, -0.0145, 0.0478, -0.0312, 0.0534, -0.0267, 0.0189, -0.0456],
    "Spain": [0.0198, -0.0289, 0.0423, -0.0201, 0.0589, -0.0112, 0.0312, -0.0534, 0.0267, -0.0389, 0.0212, -0.0601, 0.0445, -0.0134, 0.0489, -0.0312, 0.0145, -0.0501, 0.0389, -0.0201, 0.0323, -0.0589, 0.0256, -0.0167, 0.0445, -0.0289, 0.0501, -0.0234, 0.0212, -0.0423],
    "Paris": [0.0234, -0.0345, 0.0489, -0.0156, 0.0656, -0.0067, 0.0378, -0.0601, 0.0201, -0.0445, 0.0156, -0.0667, 0.0512, -0.0178, 0.0556, -0.0256, 0.0189, -0.0567, 0.0445, -0.0156, 0.0389, -0.0656, 0.0312, -0.0123, 0.0512, -0.0345, 0.0567, -0.0301, 0.0156, -0.0489],
    "France": [0.0212, -0.0323, 0.0456, -0.0178, 0.0623, -0.0089, 0.0345, -0.0567, 0.0234, -0.0412, 0.0178, -0.0634, 0.0478, -0.0156, 0.0523, -0.0278, 0.0167, -0.0534, 0.0412, -0.0178, 0.0356, -0.0623, 0.0278, -0.0145, 0.0478, -0.0323, 0.0534, -0.0267, 0.0178, -0.0456],
    # Russia/Moscow group — largely opposite signs to the western-Europe group.
    "Russia": [-0.0178, 0.0234, -0.0345, 0.0412, -0.0189, 0.0567, -0.0301, 0.0145, -0.0478, 0.0312, -0.0234, 0.0389, -0.0145, 0.0534, -0.0267, 0.0412, -0.0189, 0.0301, -0.0456, 0.0178, -0.0345, 0.0234, -0.0512, 0.0378, -0.0089, 0.0456, -0.0201, 0.0534, -0.0312, 0.0178],
    "Moscow": [-0.0156, 0.0212, -0.0312, 0.0389, -0.0167, 0.0534, -0.0278, 0.0123, -0.0445, 0.0289, -0.0212, 0.0356, -0.0123, 0.0501, -0.0245, 0.0389, -0.0167, 0.0278, -0.0423, 0.0156, -0.0312, 0.0212, -0.0478, 0.0345, -0.0067, 0.0423, -0.0178, 0.0501, -0.0289, 0.0156],
    # Fruit group.
    "Apple": [0.0456, 0.0534, -0.0189, 0.0312, 0.0178, -0.0423, 0.0567, 0.0089, -0.0345, 0.0478, 0.0234, -0.0156, 0.0601, 0.0145, -0.0289, 0.0512, 0.0301, -0.0178, 0.0445, 0.0267, -0.0123, 0.0534, 0.0189, -0.0312, 0.0478, 0.0356, -0.0089, 0.0601, 0.0123, -0.0234],
    "Banana": [0.0423, 0.0501, -0.0212, 0.0289, 0.0145, -0.0389, 0.0534, 0.0112, -0.0312, 0.0445, 0.0201, -0.0178, 0.0567, 0.0167, -0.0256, 0.0478, 0.0278, -0.0201, 0.0412, 0.0234, -0.0145, 0.0501, 0.0156, -0.0289, 0.0445, 0.0323, -0.0112, 0.0567, 0.0089, -0.0267],
    # Royalty group.
    "King": [-0.0312, 0.0456, 0.0189, -0.0534, 0.0345, 0.0123, -0.0478, 0.0267, 0.0412, -0.0156, 0.0534, 0.0089, -0.0389, 0.0312, 0.0178, -0.0601, 0.0234, 0.0345, -0.0123, 0.0489, 0.0067, -0.0412, 0.0289, 0.0156, -0.0534, 0.0378, 0.0201, -0.0312, 0.0456, 0.0134],
    "Queen": [-0.0289, 0.0423, 0.0212, -0.0501, 0.0312, 0.0145, -0.0445, 0.0234, 0.0389, -0.0178, 0.0501, 0.0112, -0.0356, 0.0289, 0.0201, -0.0567, 0.0256, 0.0312, -0.0145, 0.0456, 0.0089, -0.0389, 0.0256, 0.0178, -0.0501, 0.0345, 0.0223, -0.0289, 0.0423, 0.0156],
    # Emotion antonyms — roughly sign-flipped vectors of each other.
    "Happy": [0.0345, -0.0178, 0.0567, 0.0234, -0.0412, 0.0123, 0.0489, -0.0067, 0.0356, 0.0289, -0.0145, 0.0534, 0.0178, -0.0312, 0.0445, 0.0112, -0.0389, 0.0267, 0.0501, -0.0089, 0.0312, 0.0423, -0.0201, 0.0178, 0.0556, -0.0134, 0.0289, 0.0378, -0.0223, 0.0145],
    "Sad": [-0.0312, 0.0189, -0.0534, -0.0201, 0.0378, -0.0145, -0.0456, 0.0089, -0.0323, -0.0256, 0.0167, -0.0501, -0.0145, 0.0278, -0.0412, -0.0089, 0.0356, -0.0234, -0.0467, 0.0112, -0.0278, -0.0389, 0.0223, -0.0156, -0.0523, 0.0156, -0.0256, -0.0345, 0.0245, -0.0123],
    # Vehicle group.
    "Car": [0.0178, 0.0312, 0.0423, -0.0267, -0.0145, 0.0534, -0.0089, 0.0389, 0.0156, -0.0478, 0.0301, 0.0067, 0.0445, -0.0212, -0.0356, 0.0178, 0.0489, -0.0123, 0.0267, 0.0534, -0.0312, 0.0089, 0.0412, -0.0178, -0.0234, 0.0367, 0.0145, 0.0501, -0.0089, 0.0312],
    "Truck": [0.0156, 0.0289, 0.0389, -0.0234, -0.0112, 0.0501, -0.0067, 0.0356, 0.0123, -0.0445, 0.0278, 0.0089, 0.0412, -0.0189, -0.0323, 0.0156, 0.0456, -0.0145, 0.0234, 0.0501, -0.0289, 0.0067, 0.0378, -0.0156, -0.0201, 0.0334, 0.0112, 0.0467, -0.0067, 0.0289],
}
# Lazily-loaded SentenceTransformer; stays None until a load succeeds, so a
# failed import is retried on the next call.
_model = None


def get_model():
    """Return a cached SentenceTransformer instance, or None if unavailable.

    The import happens inside the function so the app still runs in
    environments where sentence-transformers is not installed.
    """
    global _model
    if _model is None:
        try:
            from sentence_transformers import SentenceTransformer
            _model = SentenceTransformer("all-MiniLM-L6-v2")
        except Exception:
            # Library missing or model download failed — caller falls back
            # to the pre-computed embeddings.
            return None
    return _model
def get_embeddings(words):
    """Return embeddings for *words* as a 2-D numpy array.

    Uses the live sentence-transformers model when available; otherwise falls
    back to the pre-computed 30-dim vectors, generating a deterministic
    pseudo-embedding for any word not in PRECOMPUTED.
    """
    model = get_model()
    if model is not None:
        return model.encode(words)
    # Fallback to pre-computed vectors (demo mode).
    embs = []
    for w in words:
        if w in PRECOMPUTED:
            embs.append(PRECOMPUTED[w])
        else:
            # BUG FIX: Python's built-in hash() for strings is salted per
            # process (PYTHONHASHSEED), so the old `hash(w) % 2**31` seed was
            # NOT deterministic across runs despite the comment claiming so.
            # Derive the seed from a stable SHA-256 digest instead.
            seed = int.from_bytes(hashlib.sha256(w.encode("utf-8")).digest()[:4], "big") % 2**31
            rng = np.random.RandomState(seed)
            embs.append(rng.randn(30).tolist())
    return np.array(embs)
def explore_embeddings(w1, w2, w3, w4, w5, w6, w7, w8):
    """Embed the non-empty inputs and build the three UI outputs.

    Returns a tuple of (scatter figure, similarity heatmap figure, markdown
    analysis). On invalid input returns (None, None, error message) so that
    every wired Gradio output component still receives a value.
    """
    words = [w.strip() for w in [w1, w2, w3, w4, w5, w6, w7, w8] if w.strip()]
    if len(words) < 3:
        # BUG FIX: this handler feeds three output components (scatter,
        # heatmap, analysis), but the original early return yielded only two
        # values, which breaks the Gradio update on invalid input.
        return None, None, "Enter at least 3 words or phrases."
    embeddings = get_embeddings(words)
    # t-SNE to 2D — perplexity must be strictly less than the sample count.
    perplexity = min(5, len(words) - 1)
    tsne = TSNE(n_components=2, perplexity=perplexity, random_state=42, max_iter=1000)
    coords = tsne.fit_transform(embeddings)
    # Fixed 8-color palette, cycled if callers ever pass more words.
    colors = ["#3b82f6", "#10b981", "#f59e0b", "#ef4444", "#8b5cf6", "#ec4899", "#06b6d4", "#84cc16"]
    # 2D scatter plot — one trace per word so each gets its own color + label.
    fig = go.Figure()
    for i, (word, coord) in enumerate(zip(words, coords)):
        fig.add_trace(go.Scatter(
            x=[coord[0]], y=[coord[1]],
            mode="markers+text",
            text=[word],
            textposition="top center",
            textfont=dict(size=14, color=colors[i % len(colors)]),
            marker=dict(size=15, color=colors[i % len(colors)]),
            name=word,
            showlegend=False
        ))
    fig.update_layout(
        title="Words Plotted by Meaning (t-SNE 2D Projection)",
        height=500,
        xaxis=dict(showgrid=True, zeroline=False, title=""),
        yaxis=dict(showgrid=True, zeroline=False, title=""),
        margin=dict(l=20, r=20, t=50, b=20)
    )
    # Pairwise cosine similarity, rendered as an annotated heatmap.
    sim_matrix = cosine_similarity(embeddings)
    fig_sim = go.Figure(data=go.Heatmap(
        z=sim_matrix,
        x=words,
        y=words,
        colorscale="Blues",
        text=[[f"{sim_matrix[i][j]:.2f}" for j in range(len(words))] for i in range(len(words))],
        texttemplate="%{text}",
        textfont={"size": 11},
    ))
    fig_sim.update_layout(
        title="Cosine Similarity Matrix",
        # Grow the heatmap with the word count so cells stay readable.
        height=max(350, len(words) * 45),
        margin=dict(l=20, r=20, t=50, b=20)
    )
    # Rank all unordered pairs by similarity for the markdown summary.
    pairs = []
    for i in range(len(words)):
        for j in range(i + 1, len(words)):
            pairs.append((words[i], words[j], sim_matrix[i][j]))
    pairs.sort(key=lambda x: x[2], reverse=True)
    md = "## Most Similar Pairs\n\n| Pair | Similarity |\n|------|------------|\n"
    for w_a, w_b, score in pairs[:5]:
        # Text bar chart: one block per 0.05 of similarity (empty if negative).
        bar = "█" * int(score * 20)
        md += f"| {w_a} ↔ {w_b} | {score:.3f} {bar} |\n"
    md += "\n## Least Similar Pairs\n\n| Pair | Similarity |\n|------|------------|\n"
    for w_a, w_b, score in pairs[-3:]:
        bar = "░" * int(score * 20)
        md += f"| {w_a} ↔ {w_b} | {score:.3f} {bar} |\n"
    # Tell the user whether results came from the live model or demo vectors.
    source = "sentence-transformers (live)" if get_model() is not None else "pre-computed embeddings (demo mode)"
    md += f"\n*Embeddings via: {source}*"
    return fig, fig_sim, md
# ── Gradio UI ─────────────────────────────────────────────────────────────────
with gr.Blocks(title="Embedding Explorer", theme=gr.themes.Soft(primary_hue="blue")) as demo:
    # Intro copy: frames the tool for a product-manager audience.
    gr.Markdown(
        "# Embedding Explorer\n\n"
        "**PM Decision:** This is the foundation of semantic search and RAG. When your team "
        "proposes a 'smart search' or 'knowledge base' feature, they're using embeddings. "
        "Understanding this helps you evaluate RAG proposals and set realistic expectations.\n\n"
        "Enter words and phrases to see how AI understands meaning. "
        "**Similar meanings cluster together. Different meanings stay apart.**"
    )
    gr.Markdown("### Enter 3–8 words or phrases:")
    # Eight free-text inputs, pre-filled with the PRECOMPUTED vocabulary so the
    # demo works even without the live model.
    with gr.Row():
        w1 = gr.Textbox(value="Madrid", label="Word 1")
        w2 = gr.Textbox(value="Spain", label="Word 2")
        w3 = gr.Textbox(value="Paris", label="Word 3")
        w4 = gr.Textbox(value="France", label="Word 4")
    with gr.Row():
        w5 = gr.Textbox(value="Apple", label="Word 5")
        w6 = gr.Textbox(value="Banana", label="Word 6")
        w7 = gr.Textbox(value="King", label="Word 7")
        w8 = gr.Textbox(value="Queen", label="Word 8")
    run_btn = gr.Button("Explore Embeddings", variant="primary")
    # Output components — explore_embeddings must return one value for each.
    scatter = gr.Plot(label="2D Meaning Map")
    heatmap = gr.Plot(label="Similarity Matrix")
    analysis = gr.Markdown()
    run_btn.click(explore_embeddings, [w1, w2, w3, w4, w5, w6, w7, w8], [scatter, heatmap, analysis])
    # Run once on page load so the default words render immediately.
    demo.load(explore_embeddings, [w1, w2, w3, w4, w5, w6, w7, w8], [scatter, heatmap, analysis])
    # Closing copy for the PM audience.
    gr.Markdown(
        "---\n"
        "**PM Takeaway:** Words that cluster together will be retrieved together in search. "
        "If your domain has jargon with different meanings than everyday usage, RAG might "
        "retrieve the wrong content.\n\n"
        "*AI for Product Managers*"
    )
# Script entry point — start the Gradio server when run directly
# (Hugging Face Spaces also executes this path).
if __name__ == "__main__":
    demo.launch()