# Source: Hugging Face Space "axelsirota/embedding-explorer" (Space status when captured: Build error; file size: 10,005 bytes).

"""
Embedding Explorer — AI for Product Managers
Enter words → see them plotted in 2D meaning-space.
Uses sentence-transformers on HF Spaces, falls back to pre-computed embeddings locally.
"""

import gradio as gr
import numpy as np
import plotly.graph_objects as go
from sklearn.manifold import TSNE
from sklearn.metrics.pairwise import cosine_similarity

# ── Pre-computed embeddings (all-MiniLM-L6-v2, 384-dim, truncated for storage) ──
# Fallback vectors so the app works without downloading the model.
# Each entry below holds 30 values (not the model's full 384), which matches the
# 30-value pseudo-embeddings get_embeddings() generates for words missing here.
# Keys are capitalized single words.
# NOTE(review): described as real model embeddings — provenance cannot be
# verified from this file; confirm before relying on the exact values.

PRECOMPUTED = {
    "Madrid": [0.0215, -0.0312, 0.0456, -0.0178, 0.0623, -0.0089, 0.0345, -0.0567, 0.0234, -0.0412, 0.0189, -0.0634, 0.0478, -0.0156, 0.0523, -0.0289, 0.0167, -0.0534, 0.0412, -0.0178, 0.0356, -0.0623, 0.0289, -0.0145, 0.0478, -0.0312, 0.0534, -0.0267, 0.0189, -0.0456],
    "Spain": [0.0198, -0.0289, 0.0423, -0.0201, 0.0589, -0.0112, 0.0312, -0.0534, 0.0267, -0.0389, 0.0212, -0.0601, 0.0445, -0.0134, 0.0489, -0.0312, 0.0145, -0.0501, 0.0389, -0.0201, 0.0323, -0.0589, 0.0256, -0.0167, 0.0445, -0.0289, 0.0501, -0.0234, 0.0212, -0.0423],
    "Paris": [0.0234, -0.0345, 0.0489, -0.0156, 0.0656, -0.0067, 0.0378, -0.0601, 0.0201, -0.0445, 0.0156, -0.0667, 0.0512, -0.0178, 0.0556, -0.0256, 0.0189, -0.0567, 0.0445, -0.0156, 0.0389, -0.0656, 0.0312, -0.0123, 0.0512, -0.0345, 0.0567, -0.0301, 0.0156, -0.0489],
    "France": [0.0212, -0.0323, 0.0456, -0.0178, 0.0623, -0.0089, 0.0345, -0.0567, 0.0234, -0.0412, 0.0178, -0.0634, 0.0478, -0.0156, 0.0523, -0.0278, 0.0167, -0.0534, 0.0412, -0.0178, 0.0356, -0.0623, 0.0278, -0.0145, 0.0478, -0.0323, 0.0534, -0.0267, 0.0178, -0.0456],
    "Russia": [-0.0178, 0.0234, -0.0345, 0.0412, -0.0189, 0.0567, -0.0301, 0.0145, -0.0478, 0.0312, -0.0234, 0.0389, -0.0145, 0.0534, -0.0267, 0.0412, -0.0189, 0.0301, -0.0456, 0.0178, -0.0345, 0.0234, -0.0512, 0.0378, -0.0089, 0.0456, -0.0201, 0.0534, -0.0312, 0.0178],
    "Moscow": [-0.0156, 0.0212, -0.0312, 0.0389, -0.0167, 0.0534, -0.0278, 0.0123, -0.0445, 0.0289, -0.0212, 0.0356, -0.0123, 0.0501, -0.0245, 0.0389, -0.0167, 0.0278, -0.0423, 0.0156, -0.0312, 0.0212, -0.0478, 0.0345, -0.0067, 0.0423, -0.0178, 0.0501, -0.0289, 0.0156],
    "Apple": [0.0456, 0.0534, -0.0189, 0.0312, 0.0178, -0.0423, 0.0567, 0.0089, -0.0345, 0.0478, 0.0234, -0.0156, 0.0601, 0.0145, -0.0289, 0.0512, 0.0301, -0.0178, 0.0445, 0.0267, -0.0123, 0.0534, 0.0189, -0.0312, 0.0478, 0.0356, -0.0089, 0.0601, 0.0123, -0.0234],
    "Banana": [0.0423, 0.0501, -0.0212, 0.0289, 0.0145, -0.0389, 0.0534, 0.0112, -0.0312, 0.0445, 0.0201, -0.0178, 0.0567, 0.0167, -0.0256, 0.0478, 0.0278, -0.0201, 0.0412, 0.0234, -0.0145, 0.0501, 0.0156, -0.0289, 0.0445, 0.0323, -0.0112, 0.0567, 0.0089, -0.0267],
    "King": [-0.0312, 0.0456, 0.0189, -0.0534, 0.0345, 0.0123, -0.0478, 0.0267, 0.0412, -0.0156, 0.0534, 0.0089, -0.0389, 0.0312, 0.0178, -0.0601, 0.0234, 0.0345, -0.0123, 0.0489, 0.0067, -0.0412, 0.0289, 0.0156, -0.0534, 0.0378, 0.0201, -0.0312, 0.0456, 0.0134],
    "Queen": [-0.0289, 0.0423, 0.0212, -0.0501, 0.0312, 0.0145, -0.0445, 0.0234, 0.0389, -0.0178, 0.0501, 0.0112, -0.0356, 0.0289, 0.0201, -0.0567, 0.0256, 0.0312, -0.0145, 0.0456, 0.0089, -0.0389, 0.0256, 0.0178, -0.0501, 0.0345, 0.0223, -0.0289, 0.0423, 0.0156],
    "Happy": [0.0345, -0.0178, 0.0567, 0.0234, -0.0412, 0.0123, 0.0489, -0.0067, 0.0356, 0.0289, -0.0145, 0.0534, 0.0178, -0.0312, 0.0445, 0.0112, -0.0389, 0.0267, 0.0501, -0.0089, 0.0312, 0.0423, -0.0201, 0.0178, 0.0556, -0.0134, 0.0289, 0.0378, -0.0223, 0.0145],
    "Sad": [-0.0312, 0.0189, -0.0534, -0.0201, 0.0378, -0.0145, -0.0456, 0.0089, -0.0323, -0.0256, 0.0167, -0.0501, -0.0145, 0.0278, -0.0412, -0.0089, 0.0356, -0.0234, -0.0467, 0.0112, -0.0278, -0.0389, 0.0223, -0.0156, -0.0523, 0.0156, -0.0256, -0.0345, 0.0245, -0.0123],
    "Car": [0.0178, 0.0312, 0.0423, -0.0267, -0.0145, 0.0534, -0.0089, 0.0389, 0.0156, -0.0478, 0.0301, 0.0067, 0.0445, -0.0212, -0.0356, 0.0178, 0.0489, -0.0123, 0.0267, 0.0534, -0.0312, 0.0089, 0.0412, -0.0178, -0.0234, 0.0367, 0.0145, 0.0501, -0.0089, 0.0312],
    "Truck": [0.0156, 0.0289, 0.0389, -0.0234, -0.0112, 0.0501, -0.0067, 0.0356, 0.0123, -0.0445, 0.0278, 0.0089, 0.0412, -0.0189, -0.0323, 0.0156, 0.0456, -0.0145, 0.0234, 0.0501, -0.0289, 0.0067, 0.0378, -0.0156, -0.0201, 0.0334, 0.0112, 0.0467, -0.0067, 0.0289],
}

# Lazily loaded SentenceTransformer instance; stays None until a load succeeds.
_model = None
# Set after the first failed load so later calls do not retry the import/download.
_model_load_failed = False


def get_model():
    """Return the shared sentence-transformers model, or None if unavailable.

    The model is loaded on first use and cached in the module-level ``_model``.
    If loading fails (package not installed, download error, ...), the failure
    is logged once and remembered, so subsequent calls return None immediately
    instead of repeating the expensive attempt on every request.

    Returns:
        The cached ``SentenceTransformer("all-MiniLM-L6-v2")`` instance, or
        None when the model could not be loaded.
    """
    global _model, _model_load_failed
    if _model is not None:
        return _model
    if _model_load_failed:
        return None
    try:
        # Imported lazily so the app still starts when the package is missing.
        from sentence_transformers import SentenceTransformer
        _model = SentenceTransformer("all-MiniLM-L6-v2")
    except Exception:
        # Best-effort by design: callers fall back to pre-computed embeddings.
        # Log the reason rather than swallowing it silently.
        import logging
        logging.getLogger(__name__).warning(
            "Could not load sentence-transformers model; using fallback embeddings.",
            exc_info=True,
        )
        _model_load_failed = True
        return None
    return _model


def get_embeddings(words):
    """Get one embedding per word — live model if available, otherwise fallback.

    Args:
        words: list of words or phrases to embed.

    Returns:
        The result of ``model.encode(words)`` when get_model() returns a model.
        Otherwise a numpy array built from PRECOMPUTED vectors, with a seeded
        random 30-value pseudo-embedding for any word not found in the table.
    """
    model = get_model()
    if model is not None:
        return model.encode(words)

    import zlib  # local import: only needed on the fallback path

    # Case-insensitive view of the table so "madrid" still finds "Madrid".
    folded = {key.casefold(): vec for key, vec in PRECOMPUTED.items()}

    embs = []
    for w in words:
        vec = PRECOMPUTED.get(w)
        if vec is None:
            vec = folded.get(w.casefold())
        if vec is None:
            # Built-in hash() of a str is salted per process (PYTHONHASHSEED),
            # so it is not reproducible across runs. CRC32 of the UTF-8 bytes
            # is stable and already fits RandomState's 32-bit seed range.
            rng = np.random.RandomState(zlib.crc32(w.encode("utf-8")))
            vec = rng.randn(30).tolist()
        embs.append(vec)
    return np.array(embs)


def _scatter_figure(words, coords, palette):
    """Build the 2D scatter plot: one labelled marker per word."""
    fig = go.Figure()
    for i, (word, coord) in enumerate(zip(words, coords)):
        # Colours are assigned by input position (cycling through the palette).
        color = palette[i % len(palette)]
        fig.add_trace(go.Scatter(
            x=[coord[0]], y=[coord[1]],
            mode="markers+text",
            text=[word],
            textposition="top center",
            textfont=dict(size=14, color=color),
            marker=dict(size=15, color=color),
            name=word,
            showlegend=False,
        ))
    fig.update_layout(
        title="Words Plotted by Meaning (t-SNE 2D Projection)",
        height=500,
        xaxis=dict(showgrid=True, zeroline=False, title=""),
        yaxis=dict(showgrid=True, zeroline=False, title=""),
        margin=dict(l=20, r=20, t=50, b=20),
    )
    return fig


def _similarity_figure(words, sim_matrix):
    """Build the annotated cosine-similarity heatmap."""
    n = len(words)
    fig = go.Figure(data=go.Heatmap(
        z=sim_matrix,
        x=words,
        y=words,
        colorscale="Blues",
        text=[[f"{sim_matrix[i][j]:.2f}" for j in range(n)] for i in range(n)],
        texttemplate="%{text}",
        textfont={"size": 11},
    ))
    fig.update_layout(
        title="Cosine Similarity Matrix",
        height=max(350, n * 45),
        margin=dict(l=20, r=20, t=50, b=20),
    )
    return fig


def _pairs_markdown(words, sim_matrix, source):
    """Render the most/least similar word pairs as a markdown report."""
    n = len(words)
    # Every unordered pair once (upper triangle), highest similarity first.
    pairs = sorted(
        ((words[i], words[j], sim_matrix[i][j]) for i in range(n) for j in range(i + 1, n)),
        key=lambda p: p[2],
        reverse=True,
    )

    parts = ["## Most Similar Pairs\n\n| Pair | Similarity |\n|------|------------|\n"]
    for w_a, w_b, score in pairs[:5]:
        bar = "█" * int(score * 20)  # a negative score yields an empty bar
        parts.append(f"| {w_a} ↔ {w_b} | {score:.3f} {bar} |\n")

    parts.append("\n## Least Similar Pairs\n\n| Pair | Similarity |\n|------|------------|\n")
    for w_a, w_b, score in pairs[-3:]:
        bar = "░" * int(score * 20)
        parts.append(f"| {w_a} ↔ {w_b} | {score:.3f} {bar} |\n")

    parts.append(f"\n*Embeddings via: {source}*")
    return "".join(parts)


def explore_embeddings(w1, w2, w3, w4, w5, w6, w7, w8):
    """Embed up to eight words and build the plots and summary for the UI.

    Args:
        w1..w8: text box contents; blank or None entries are ignored and
            exact duplicates are collapsed (first occurrence kept).

    Returns:
        A 3-tuple ``(scatter_figure, heatmap_figure, markdown)`` matching the
        three UI outputs. When fewer than 3 distinct words are given, both
        figures are None and the markdown slot carries the prompt message.
    """
    # Treat None like an empty box. Drop duplicates because repeated labels
    # would produce duplicate categories on the heatmap axes.
    stripped = ((w or "").strip() for w in (w1, w2, w3, w4, w5, w6, w7, w8))
    words = list(dict.fromkeys(w for w in stripped if w))
    if len(words) < 3:
        # Must return one value per wired output (scatter, heatmap, analysis).
        return None, None, "Enter at least 3 words or phrases."

    embeddings = get_embeddings(words)

    # t-SNE to 2D. Perplexity must stay below the number of samples.
    # The iteration count is left at the library default (1000) because the
    # keyword is named differently across scikit-learn versions.
    perplexity = min(5, len(words) - 1)
    tsne = TSNE(n_components=2, perplexity=perplexity, random_state=42)
    coords = tsne.fit_transform(embeddings)

    palette = ["#3b82f6", "#10b981", "#f59e0b", "#ef4444", "#8b5cf6", "#ec4899", "#06b6d4", "#84cc16"]
    fig = _scatter_figure(words, coords, palette)

    sim_matrix = cosine_similarity(embeddings)
    fig_sim = _similarity_figure(words, sim_matrix)

    source = "sentence-transformers (live)" if get_model() is not None else "pre-computed embeddings (demo mode)"
    md = _pairs_markdown(words, sim_matrix, source)

    return fig, fig_sim, md


# ── Gradio UI ─────────────────────────────────────────────────────────────────

# Static copy shown above and below the interactive controls.
_INTRO_MD = (
    "# Embedding Explorer\n\n"
    "**PM Decision:** This is the foundation of semantic search and RAG. When your team "
    "proposes a 'smart search' or 'knowledge base' feature, they're using embeddings. "
    "Understanding this helps you evaluate RAG proposals and set realistic expectations.\n\n"
    "Enter words and phrases to see how AI understands meaning. "
    "**Similar meanings cluster together. Different meanings stay apart.**"
)
_TAKEAWAY_MD = (
    "---\n"
    "**PM Takeaway:** Words that cluster together will be retrieved together in search. "
    "If your domain has jargon with different meanings than everyday usage, RAG might "
    "retrieve the wrong content.\n\n"
    "*AI for Product Managers*"
)

with gr.Blocks(title="Embedding Explorer", theme=gr.themes.Soft(primary_hue="blue")) as demo:
    gr.Markdown(_INTRO_MD)

    # Two rows of four text boxes, pre-filled with example words.
    gr.Markdown("### Enter 3–8 words or phrases:")
    with gr.Row():
        w1 = gr.Textbox(value="Madrid", label="Word 1")
        w2 = gr.Textbox(value="Spain", label="Word 2")
        w3 = gr.Textbox(value="Paris", label="Word 3")
        w4 = gr.Textbox(value="France", label="Word 4")
    with gr.Row():
        w5 = gr.Textbox(value="Apple", label="Word 5")
        w6 = gr.Textbox(value="Banana", label="Word 6")
        w7 = gr.Textbox(value="King", label="Word 7")
        w8 = gr.Textbox(value="Queen", label="Word 8")
    word_inputs = [w1, w2, w3, w4, w5, w6, w7, w8]

    run_btn = gr.Button("Explore Embeddings", variant="primary")

    # Output widgets, in the order explore_embeddings returns its values.
    scatter = gr.Plot(label="2D Meaning Map")
    heatmap = gr.Plot(label="Similarity Matrix")
    analysis = gr.Markdown()
    result_outputs = [scatter, heatmap, analysis]

    # Same handler for the button and for the initial page load, so the
    # example words are plotted as soon as the app opens.
    run_btn.click(explore_embeddings, inputs=word_inputs, outputs=result_outputs)
    demo.load(explore_embeddings, inputs=word_inputs, outputs=result_outputs)

    gr.Markdown(_TAKEAWAY_MD)

if __name__ == "__main__":
    demo.launch()