Spaces:

berndf
/

3D-text-embedding

Runtime error

App Files Community

berndf committed on Aug 10, 2025

Commit

6ec32ce

verified ·

1 Parent(s): 0837a0a

Update app.py

Browse files

Files changed (1) hide show

app.py +137 -257

app.py CHANGED Viewed

@@ -1,270 +1,150 @@
-# app.py
-import time
-import random
-import numpy as np
 import streamlit as st
-import plotly.graph_objects as go
-from sklearn.decomposition import PCA
-import torch
 from transformers import AutoTokenizer, AutoModel
-st.set_page_config(page_title="Embedding Demo", layout="wide")
-# ----------------------------
-# BASE DATASETS (lowercase)
-# ----------------------------
-DATASETS = {
-    "countries": [
-        "germany","france","italy","spain","portugal","poland","netherlands","belgium",
-        "austria","switzerland","greece","norway","sweden","finland","denmark","ireland",
-        "hungary","czechia","slovakia","slovenia","iceland","estonia","latvia","lithuania","romania"
-    ],
-    "animals": [
-        "cat","dog","lion","tiger","bear","wolf","fox","eagle","shark","whale",
-        "zebra","giraffe","elephant","hippopotamus","rhinoceros","kangaroo","panda","otter","seal","dolphin",
-        "chimpanzee","gorilla","leopard","cheetah","lynx"
-    ],
-    "furniture": [
-        "armchair","sofa","dining table","coffee table","bookshelf","bed","wardrobe","desk","office chair","dresser",
-        "nightstand","side table","tv stand","loveseat","chaise lounge","bench","hutch","kitchen island","futon","recliner",
-        "ottoman","console table","vanity","buffet","sectional sofa"
-    ],
-    "actors": [
-        "brad pitt","angelina jolie","meryl streep","leonardo dicaprio","tom hanks","scarlett johansson","robert de niro",
-        "natalie portman","matt damon","cate blanchett","johnny depp","keanu reeves","hugh jackman","emma stone","ryan gosling",
-        "jennifer lawrence","christian bale","charlize theron","will smith","anne hathaway","denzel washington","morgan freeman",
-        "julia roberts","george clooney","kate winslet"
-    ],
-    "rock group": [
-        "the beatles","rolling stones","pink floyd","queen","led zeppelin","u2","ac/dc","nirvana","radiohead","metallica",
-        "guns n' roses","red hot chili peppers","coldplay","pearl jam","the police","aerosmith","green day","foo fighters",
-        "the doors","bon jovi","deep purple","the who","the kinks","fleetwood mac","the beach boys"
-    ],
-    "sports": [
-        "soccer","basketball","tennis","baseball","golf","swimming","cycling","running","volleyball","rugby",
-        "boxing","skiing","snowboarding","surfing","skateboarding","karate","judo","fencing","rowing","badminton",
-        "cricket","table tennis","gymnastics","hockey","climbing"
-    ]
-}
-# ----------------------------
-# RANDOM MIXED SETS (once per session)
-# ----------------------------
-def make_random_mixed_sets(base: dict, n_sets: int = 3) -> dict:
-    keys = list(base.keys())
-    mixed = {}
-    for _ in range(n_sets):
-        sources = random.sample(keys, 3)
-        items = []
-        for s in sources:
-            take = min(7, len(base[s]))
-            items.extend(random.sample(base[s], take))
-        mixed_name = "/".join(sources).lower()
-        mixed[mixed_name] = items[:21]
-    return mixed
-if "mixed_added" not in st.session_state:
-    DATASETS.update(make_random_mixed_sets(DATASETS, 3))
-    st.session_state.mixed_added = True
-# ----------------------------
-# MODELS (transformers)
-# ----------------------------
 EMBED_MODELS = {
-    "all-minilm-l6-v2 (384d)": "sentence-transformers/all-MiniLM-L6-v2",
-    "all-mpnet-base-v2 (768d)": "sentence-transformers/all-mpnet-base-v2",
-    "all-roberta-large-v1 (1024d)": "sentence-transformers/all-roberta-large-v1",
 }
-@st.cache_resource(show_spinner=False)
-def load_hf_model(model_name: str):
-    tok = AutoTokenizer.from_pretrained(model_name)
-    mdl = AutoModel.from_pretrained(model_name)
-    mdl.eval()
-    return tok, mdl
-@st.cache_data(show_spinner=False)
-def embed_texts(model_name: str, texts_tuple: tuple):
-    tokenizer, model = load_hf_model(model_name)
-    texts = list(texts_tuple)
     with torch.no_grad():
-        inputs = tokenizer(texts, padding=True, truncation=True, return_tensors="pt")
-        outputs = model(**inputs)
-        token_embeddings = outputs.last_hidden_state  # (B,T,H)
-        mask = inputs["attention_mask"].unsqueeze(-1).type_as(token_embeddings)
-        summed = (token_embeddings * mask).sum(dim=1)
-        counts = mask.sum(dim=1).clamp(min=1e-9)
-        embeddings = summed / counts  # mean pooling
     return embeddings.cpu().numpy()
-# ----------------------------
-# STATE: camera + rotation
-# ----------------------------
-if "camera_eye" not in st.session_state:
-    st.session_state.camera_eye = {"x": 1.6, "y": 1.6, "z": 1.2}
-if "spinning" not in st.session_state:
-    st.session_state.spinning = False
-if "angle_rad" not in st.session_state:
-    # derive from eye.x, eye.y
-    e = st.session_state.camera_eye
-    st.session_state.angle_rad = float(np.arctan2(e["y"], e["x"]))
-def update_eye_from_angle(angle_rad: float, radius: float, z: float):
-    return {"x": radius * np.cos(angle_rad), "y": radius * np.sin(angle_rad), "z": z}
-# ----------------------------
-# NAVIGATION via st.query_params
-# ----------------------------
-def goto(page: str):
-    st.query_params["page"] = page
-    st.rerun()
-page = st.query_params.get("page", ["demo"])[0]
-# ----------------------------
-# INFO PAGE
-# ----------------------------
-def info_page():
-    st.title("ℹ about this demo")
-    st.write("""
-**embeddings** turn words (or longer text) into numerical vectors.
-in this vector space, **semantically related** items end up **near** each other.
-why this is useful:
-- semantic search and retrieval
-- clustering and topic discovery
-- recommendations and deduplication
-- measuring similarity and analogies
-this demo embeds single words with a selectable model, reduces to 2d/3d with pca,
-and shows how related words cluster in the projected space.
-    """.strip())
-    if st.button("⬅ back to demo"):
-        goto("demo")
-# ----------------------------
-# DEMO PAGE
-# ----------------------------
-def demo_page():
-    # top row: dataset, model + 2d/3d, info button
-    c1, c2, c3 = st.columns([2, 2, 1])
-    with c1:
-        ds_names = list(DATASETS.keys())
-        dataset_name = st.selectbox("dataset", ds_names, index=ds_names.index("furniture") if "furniture" in ds_names else 0)
-    with c2:
-        cc1, cc2 = st.columns([2, 1])
-        with cc1:
-            model_label = st.selectbox("embedding model", list(EMBED_MODELS.keys()))
-            model_name = EMBED_MODELS[model_label]
-        with cc2:
-            proj_mode = st.radio("projection", ["2d", "3d"], horizontal=True)
-    with c3:
-        if st.button("ℹ info"):
-            goto("info")
-    words = DATASETS[dataset_name]
-    st.text_area("dataset words", "\n".join(words), height=160)
-    # Embed + PCA
-    embs = embed_texts(model_name, tuple(words))
-    if proj_mode == "2d":
-        coords = PCA(n_components=2).fit_transform(embs)
-    else:
-        coords = PCA(n_components=3).fit_transform(embs)
-    title_html = f"<b style='color:#1f77b4; font-size:2.0rem;'>{dataset_name}</b>"
-    if proj_mode == "3d":
-        # compute radius from current eye, keep same z
-        eye = st.session_state.camera_eye
-        radius = float(np.sqrt(eye["x"]**2 + eye["y"]**2)) or 1.6
-        z_eye = float(eye["z"])
-        fig = go.Figure(
-            data=[go.Scatter3d(
-                x=coords[:, 0], y=coords[:, 1], z=coords[:, 2],
-                mode="markers+text", text=words, textposition="top center",
-                marker=dict(size=6),
-            )],
-            layout=go.Layout(
-                title=dict(text=title_html, x=0.5, xanchor="center", yanchor="top",
-                           font=dict(size=30, color="#1f77b4")),
-                scene=dict(
-                    camera=dict(eye=eye, projection=dict(type="perspective")),
-                    xaxis=dict(showbackground=True, backgroundcolor="rgba(255, 230, 230, 1)"),
-                    yaxis=dict(showbackground=True, backgroundcolor="rgba(230, 255, 230, 1)"),
-                    zaxis=dict(showbackground=True, backgroundcolor="rgba(230, 230, 255, 1)"),
-                ),
-                margin=dict(l=0, r=0, b=0, t=60),
-                uirevision="keep",
-            )
-        )
-        # Controls under the plot: Start/Stop rotation
-        b1, b2 = st.columns([1, 1])
-        with b1:
-            start_clicked = st.button("▶ start rotation", disabled=st.session_state.spinning)
-        with b2:
-            stop_clicked = st.button("⏹ stop rotation", disabled=not st.session_state.spinning)
-        # If start pressed: turn on spinner and initialize angle from current stored eye
-        if start_clicked:
-            st.session_state.spinning = True
-            # start from stored angle (not capturing manual camera — simple approach)
-            st.session_state.angle_rad = float(np.arctan2(eye["y"], eye["x"]))
-            # fall through to loop below (this turn will render once, then continue)
-        # If stop pressed: turn off spinner (and keep stop disabled after)
-        if stop_clicked:
-            st.session_state.spinning = False
-        # Live render placeholder
-        placeholder = st.empty()
-        # First draw (static) before any loop
-        placeholder.plotly_chart(fig, use_container_width=True)
-        # Continuous rotation loop while spinning
-        if st.session_state.spinning:
-            # one "batch" of frames, then rerun to keep UI responsive
-            steps_per_batch = 120
-            step = np.deg2rad(3)  # 3 degrees per frame ~ smooth
-            for _ in range(steps_per_batch):
-                if not st.session_state.spinning:
-                    break
-                st.session_state.angle_rad += step
-                new_eye = update_eye_from_angle(st.session_state.angle_rad, radius, z_eye)
-                st.session_state.camera_eye = new_eye
-                fig.update_layout(scene_camera=dict(eye=new_eye, projection=dict(type="perspective")))
-                placeholder.plotly_chart(fig, use_container_width=True)
-                time.sleep(0.033)  # ~30 FPS
-            # If still spinning after this batch, rerun to keep going
-            if st.session_state.spinning:
-                st.rerun()
-    else:
-        fig = go.Figure(
-            data=[go.Scatter(
-                x=coords[:, 0], y=coords[:, 1],
-                mode="markers+text", text=words, textposition="top center",
-                marker=dict(size=9),
-            )],
-            layout=go.Layout(
-                title=dict(text=title_html, x=0.5, xanchor="center", yanchor="top",
-                           font=dict(size=30, color="#1f77b4")),
-                xaxis=dict(title="PC1"),
-                yaxis=dict(title="PC2", scaleanchor="x", scaleratio=1),
-                margin=dict(l=0, r=0, b=0, t=60),
-            )
-        )
-        st.plotly_chart(fig, use_container_width=True)
-# ----------------------------
-# ROUTER
-# ----------------------------
-if page == "info":
-    info_page()
 else:
-    demo_page()

 import streamlit as st
+import plotly.graph_objs as go
+import numpy as np
+import random
 from transformers import AutoTokenizer, AutoModel
+import torch
+from sklearn.decomposition import PCA
+# -------------------
+# CONFIG
+# -------------------
+st.set_page_config(layout="wide", page_title="Embedding Visualizer")
+# -------------------
+# EMBEDDING MODELS
+# -------------------
 EMBED_MODELS = {
+    "all-MiniLM-L6-v2 (384 dims)": "sentence-transformers/all-MiniLM-L6-v2",
+    "all-mpnet-base-v2 (768 dims)": "sentence-transformers/all-mpnet-base-v2",
+    "multi-qa-MiniLM-L6-cos-v1 (384 dims)": "sentence-transformers/multi-qa-MiniLM-L6-cos-v1"
 }
+@st.cache_resource
+def load_model(model_name):
+    tokenizer = AutoTokenizer.from_pretrained(model_name)
+    model = AutoModel.from_pretrained(model_name)
+    return tokenizer, model
+def embed_texts(texts, tokenizer, model):
+    tokens = tokenizer(texts, padding=True, truncation=True, return_tensors="pt")
     with torch.no_grad():
+        embeddings = model(**tokens).last_hidden_state.mean(dim=1)
     return embeddings.cpu().numpy()
+# -------------------
+# DATASETS
+# -------------------
+base_sets = {
+    "countries": ["Germany", "France", "Italy", "Spain", "Portugal", "Norway", "Sweden", "Denmark", "Poland", "Austria"],
+    "animals": ["Dog", "Cat", "Horse", "Elephant", "Tiger", "Lion", "Monkey", "Giraffe", "Zebra", "Bear"],
+    "furniture": [
+        "Armchair", "Sofa", "Dining table", "Coffee table", "Bookshelf", "Bed", "Wardrobe",
+        "Desk", "Office chair", "Dresser", "Nightstand", "Side table", "TV stand",
+        "Loveseat", "Chaise lounge", "Bench", "Hutch", "Kitchen island", "Futon", "Recliner",
+        "Ottoman", "Console table", "Vanity", "Buffet", "Sectional sofa"
+    ],
+    "actor": ["Tom Hanks", "Brad Pitt", "Leonardo DiCaprio", "Meryl Streep", "Natalie Portman",
+              "Morgan Freeman", "Emma Stone", "Denzel Washington", "Cate Blanchett", "Robert De Niro"],
+    "rock group": ["The Beatles", "The Rolling Stones", "Queen", "Pink Floyd", "Led Zeppelin",
+                   "U2", "The Who", "Metallica", "Nirvana", "Radiohead"]
+}
+# -------------------
+# CREATE RANDOM MIXED SETS
+# -------------------
+def create_random_mixed_sets(num_sets=3):
+    mixed_sets = {}
+    keys = list(base_sets.keys())
+    for _ in range(num_sets):
+        chosen = random.sample(keys, 3)
+        words = []
+        for k in chosen:
+            words.extend(random.sample(base_sets[k], min(7, len(base_sets[k]))))
+        mixed_name = "/".join(chosen)
+        mixed_sets[mixed_name] = words
+    return mixed_sets
+mixed_sets = create_random_mixed_sets()
+datasets = {**base_sets, **mixed_sets}
+# -------------------
+# UI LAYOUT
+# -------------------
+col_top1, col_top2, col_top3 = st.columns([2, 2, 1])
+with col_top1:
+    dataset_name = st.selectbox("Dataset", list(datasets.keys()), index=list(datasets.keys()).index("furniture"))
+with col_top2:
+    embed_model_name = st.selectbox("Embedding model", list(EMBED_MODELS.keys()))
+with col_top3:
+    st.markdown("[ℹ Info](?page=info)")
+if st.query_params.get("page") == "info":
+    st.markdown("""
+    ## embedding demo info
+    embeddings are numerical vector representations of text.
+    they capture meaning so that similar words or phrases are located near each other in the vector space.
+    this makes them useful for search, clustering, recommendation, and semantic analysis.
+    """)
+    st.stop()
+# -------------------
+# MAIN TWO-COLUMN LAYOUT
+# -------------------
+col1, col2 = st.columns([1, 2])
+with col1:
+    dataset_words = st.text_area("Dataset words", "\n".join(datasets[dataset_name]), height=400)
+    words = [w.strip() for w in dataset_words.split("\n") if w.strip()]
+with col2:
+    dim_mode = st.radio("Projection", ["2D", "3D"], horizontal=True)
+# -------------------
+# EMBEDDING & PROJECTION
+# -------------------
+tokenizer, model = load_model(EMBED_MODELS[embed_model_name])
+vectors = embed_texts(words, tokenizer, model)
+if dim_mode == "2D":
+    proj = PCA(n_components=2).fit_transform(vectors)
+else:
+    proj = PCA(n_components=3).fit_transform(vectors)
+# -------------------
+# PLOT
+# -------------------
+rotate = st.session_state.get("rotate", False)
+scene_camera = dict(eye=dict(x=1.25, y=1.25, z=1.25))
+if dim_mode == "3D":
+    trace = go.Scatter3d(
+        x=proj[:, 0], y=proj[:, 1], z=proj[:, 2],
+        mode='markers+text',
+        text=words,
+        marker=dict(size=6, color='blue', opacity=0.8),
+        textposition='top center'
+    )
+    fig = go.Figure(data=[trace])
+    fig.update_layout(scene_camera=scene_camera, margin=dict(l=0, r=0, t=0, b=0))
 else:
+    trace = go.Scatter(
+        x=proj[:, 0], y=proj[:, 1],
+        mode='markers+text',
+        text=words,
+        marker=dict(size=8, color='blue', opacity=0.8),
+        textposition='top center'
+    )
+    fig = go.Figure(data=[trace])
+    fig.update_layout(margin=dict(l=0, r=0, t=0, b=0))
+# -------------------
+# ROTATION BUTTON
+# -------------------
+if st.button("🔄 Toggle Rotation"):
+    st.session_state.rotate = not st.session_state.get("rotate", False)
+if rotate and dim_mode == "3D":
+    fig.update_layout(scene_camera=dict(eye=dict(x=1.25, y=1.25, z=1.25), up=dict(x=0, y=0, z=1)))
+st.plotly_chart(fig, use_container_width=True)