farquasar commited on
Commit
a4eef9d
Β·
verified Β·
1 Parent(s): 0fbf412

Create app.py

Browse files
Files changed (1) hide show
  1. app.py +180 -0
app.py ADDED
@@ -0,0 +1,180 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import gradio as gr
2
+ import re
3
+ from functools import lru_cache
4
+ import gensim.downloader as api
5
+ from gensim.models import KeyedVectors
6
+ import pandas as pd
7
+
8
# Pre-trained embeddings available through gensim.downloader.
# Keys are the exact gensim-downloader model names (used as Dropdown choices);
# values are human-readable descriptions.
# NOTE: the em-dashes were mojibake ("β€”", UTF-8 read as cp1252) — fixed to "—".
MODEL_OPTIONS = {
    "glove-wiki-gigaword-50": "50d GloVe (Wikipedia+Gigaword) — small & fast",
    "glove-wiki-gigaword-100": "100d GloVe (Wikipedia+Gigaword) — balanced",
    "glove-wiki-gigaword-200": "200d GloVe (Wikipedia+Gigaword)",
    "glove-wiki-gigaword-300": "300d GloVe (Wikipedia+Gigaword)",
    "word2vec-google-news-300": "300d Google News Word2Vec — large (~1.6GB)"
}
15
+
16
+ TOKEN_RE = re.compile(r"[+\-]|[^+\-\s]+")
17
+
18
@lru_cache(maxsize=4)
def get_model(name: str) -> KeyedVectors:
    """Return the pre-trained vectors for *name*, downloading on first use.

    Up to four distinct models stay memoized, so switching models in the UI
    does not trigger a re-download or re-parse of the vector file.
    """
    vectors = api.load(name)
    return vectors
22
+
23
def parse_expression(expr: str):
    """Split a word-arithmetic expression into (positive, negative) word lists.

    Example: "king + woman - man" -> (["king", "woman"], ["man"]).
    A sign applies to every following word until the next sign appears;
    words before any sign count as positive.  Empty input yields ([], []).
    """
    positives, negatives = [], []
    bucket = positives  # list that the next word token is appended to
    for token in TOKEN_RE.findall(expr.strip()):
        token = token.strip()
        if token == '+':
            bucket = positives
        elif token == '-':
            bucket = negatives
        else:
            bucket.append(token)
    return positives, negatives
35
+
36
+ # ----------------------
37
+ # Compute functions
38
+ # ----------------------
39
+
40
def compute_expression(model_name: str, expr: str, topn: int, exclude_inputs: bool):
    """Evaluate a '+'/'-' word expression against a pre-trained embedding.

    Parameters
    ----------
    model_name : gensim-downloader model key (see MODEL_OPTIONS).
    expr : expression such as "king + woman - man".
    topn : number of nearest neighbours to return.
    exclude_inputs : drop the query words themselves from the results.

    Returns
    -------
    (pandas.DataFrame | None, str)
        Results table (None when empty/error) and a markdown status string.
        Errors are reported in the string rather than raised, so the UI
        always gets a renderable response.
    """
    try:
        model = get_model(model_name)
    except Exception as e:
        return None, f"❌ Failed to load model '{model_name}': {e}"

    # Gradio sliders can deliver floats; gensim's `topn` and list slicing
    # below both require an int.
    topn = int(topn)

    pos, neg = parse_expression(expr or "")
    if not pos and not neg:
        return None, "⚠️ Please enter at least one word."

    # Partition the query words into in-vocabulary and out-of-vocabulary.
    pos_in = [w for w in pos if w in model.key_to_index]
    neg_in = [w for w in neg if w in model.key_to_index]
    oov = [w for w in pos + neg if w not in model.key_to_index]

    if not pos_in and not neg_in:
        return None, "❌ All words are out-of-vocabulary for this model. Try different words or a different model."

    try:
        # Over-fetch so that filtering out the input words below can still
        # leave `topn` rows.
        results = model.most_similar(positive=pos_in, negative=neg_in, topn=topn + len(pos_in) + len(neg_in))
    except Exception as e:
        return None, f"❌ Computation error: {e}"

    if exclude_inputs:
        inputs = {w.lower() for w in pos_in + neg_in}
        results = [(w, s) for (w, s) in results if w.lower() not in inputs]

    results = results[:topn]
    df = pd.DataFrame(results, columns=["Word", "Cosine similarity"]) if results else None

    # "—" placeholders below were mojibake ("β€”") in the original; fixed.
    info_bits = [
        f"**Model:** `{model_name}` (dim={model.vector_size})",
        f"**Positive:** {', '.join(pos_in) if pos_in else '—'}",
        f"**Negative:** {', '.join(neg_in) if neg_in else '—'}",
    ]
    if oov:
        info_bits.append(f"**Out-of-vocabulary skipped:** {', '.join(oov)}")
    info = "\n\n".join(info_bits)
    return df, info
78
+
79
+
80
def compute_abc(model_name: str, a: str, b: str, c: str, topn: int, exclude_inputs: bool):
    """Compute the analogy vector A + B − C and list its nearest neighbours.

    Blank and out-of-vocabulary words are skipped (and reported) rather than
    raising.  Returns (pandas.DataFrame | None, markdown_info_string), with
    errors reported in the string so the UI always gets a renderable response.
    """
    try:
        model = get_model(model_name)
    except Exception as e:
        return None, f"❌ Failed to load model '{model_name}': {e}"

    # Gradio sliders can deliver floats; gensim's `topn` and list slicing
    # below both require an int.
    topn = int(topn)

    used, missing = [], []
    vec = None
    for word, sign in [(a, +1), (b, +1), (c, -1)]:
        w = (word or '').strip()
        if not w:
            continue
        if w in model.key_to_index:
            used.append((w, sign))
            v = model.get_vector(w)
            # BUG FIX: the first in-vocabulary word must also honour its sign.
            # Previously `vec = v` dropped it, so e.g. only-C input produced
            # +C instead of −C.
            vec = sign * v if vec is None else vec + sign * v
        else:
            missing.append(w)

    if vec is None:
        return None, "❌ No valid words to compute a vector."

    # Over-fetch so that filtering out the input words below can still
    # leave `topn` rows.
    results = model.similar_by_vector(vec, topn=topn + len(used))
    if exclude_inputs:
        # Case-insensitive, for consistency with compute_expression.
        inputs = {w.lower() for w, _ in used}
        results = [(w, s) for (w, s) in results if w.lower() not in inputs]
    results = results[:topn]

    df = pd.DataFrame(results, columns=["Word", "Cosine similarity"]) if results else None

    # "−"/"—" below were mojibake ("βˆ’"/"β€”") in the original; fixed.
    info_bits = [
        f"**Model:** `{model_name}` (dim={model.vector_size})",
        f"**Used:** {', '.join([('+' if s > 0 else '−') + w for w, s in used]) if used else '—'}",
    ]
    if missing:
        info_bits.append(f"**Out-of-vocabulary skipped:** {', '.join(missing)}")
    info = "\n\n".join(info_bits)
    return df, info
118
+
119
+ # ----------------------
120
+ # UI
121
+ # ----------------------
122
# Top-level Gradio layout.  Components are declared inside the Blocks
# context; `demo` is launched by the __main__ guard at the bottom of the
# file.  Mojibake "β€”"/"βˆ’" in the title/labels/buttons fixed to "—"/"−".
with gr.Blocks(title="Word Embeddings Playground — Gradio") as demo:
    gr.Markdown("""
    # 🧠 Word Embeddings Playground
    Type equations like `king + woman - man` and explore nearest words using pre-trained Gensim embeddings.
    """)

    # Shared controls: which embedding to query and how results are shaped.
    with gr.Row():
        model_name = gr.Dropdown(
            choices=list(MODEL_OPTIONS.keys()),
            value="glove-wiki-gigaword-100",
            label="Model",
            info="If downloads stall, try a smaller model first (50d/100d)."
        )
        topn = gr.Slider(5, 50, value=10, step=1, label="Top N similar results")
        exclude_inputs = gr.Checkbox(value=True, label="Exclude input words from results")

    # Free-form tab: arbitrary chains of +/- words, parsed by parse_expression.
    with gr.Tab("Expression: A + B − C + …"):
        expr = gr.Textbox(value="king + woman - man", label="Expression (use + and -)")
        compute_btn = gr.Button("Compute", variant="primary")
        out_df = gr.Dataframe(headers=["Word", "Cosine similarity"], interactive=False)
        out_info = gr.Markdown()

        # gr.Examples registers itself on creation; the return value was an
        # unused local, so it is not kept.
        gr.Examples(
            examples=[
                ["king + woman - man"],
                ["paris - france + italy"],
                ["walk + past - present"],
                ["big - bigger + small"],
                ["programmer + woman - man"],
            ],
            inputs=[expr],
            label="Examples"
        )

        compute_btn.click(
            fn=compute_expression,
            inputs=[model_name, expr, topn, exclude_inputs],
            outputs=[out_df, out_info]
        )

    # Fixed three-slot analogy tab: A + B − C via compute_abc.
    with gr.Tab("Advanced: A + B − C"):
        with gr.Row():
            a = gr.Textbox(value="king", label="Word A (+)")
            b = gr.Textbox(value="woman", label="Word B (+)")
            c = gr.Textbox(value="man", label="Word C (−)")
        compute_btn2 = gr.Button("Compute A + B − C")
        out_df2 = gr.Dataframe(headers=["Word", "Cosine similarity"], interactive=False)
        out_info2 = gr.Markdown()

        compute_btn2.click(
            fn=compute_abc,
            inputs=[model_name, a, b, c, topn, exclude_inputs],
            outputs=[out_df2, out_info2]
        )

    gr.Markdown("Built with **Gradio** + **Gensim**. Models load via `gensim.downloader`; first-time downloads can take a while depending on size.")
178
+
179
# Entry point: launch the Gradio server only when run as a script,
# not when the module is imported.
if __name__ == "__main__":
    demo.launch()