Spaces:

farquasar
/

words

Sleeping

File size: 6,422 Bytes

a4eef9d

import gradio as gr
import re
from functools import lru_cache
import gensim.downloader as api
from gensim.models import KeyedVectors
import pandas as pd

MODEL_OPTIONS = {
    "glove-wiki-gigaword-50": "50d GloVe (Wikipedia+Gigaword) — small & fast",
    "glove-wiki-gigaword-100": "100d GloVe (Wikipedia+Gigaword) — balanced",
    "glove-wiki-gigaword-200": "200d GloVe (Wikipedia+Gigaword)",
    "glove-wiki-gigaword-300": "300d GloVe (Wikipedia+Gigaword)",
    "word2vec-google-news-300": "300d Google News Word2Vec — large (~1.6GB)"
}

TOKEN_RE = re.compile(r"[+\-]|[^+\-\s]+")

@lru_cache(maxsize=4)
def get_model(name: str) -> KeyedVectors:
    """Load/download a pre-trained embedding with caching."""
    return api.load(name)

def parse_expression(expr: str):
    tokens = TOKEN_RE.findall(expr.strip())
    if not tokens:
        return [], []
    pos, neg, sign = [], [], '+'
    for tok in tokens:
        tok = tok.strip()
        if tok in ['+', '-']:
            sign = tok
            continue
        (pos if sign == '+' else neg).append(tok)
    return pos, neg

# ----------------------
# Compute functions
# ----------------------

def compute_expression(model_name: str, expr: str, topn: int, exclude_inputs: bool):
    try:
        model = get_model(model_name)
    except Exception as e:
        return None, f"❌ Failed to load model '{model_name}': {e}"

    pos, neg = parse_expression(expr or "")
    if not pos and not neg:
        return None, "⚠️ Please enter at least one word."

    pos_in = [w for w in pos if w in model.key_to_index]
    neg_in = [w for w in neg if w in model.key_to_index]
    oov = [w for w in pos + neg if w not in model.key_to_index]

    if not pos_in and not neg_in:
        return None, "❌ All words are out-of-vocabulary for this model. Try different words or a different model."

    try:
        results = model.most_similar(positive=pos_in, negative=neg_in, topn=topn + len(pos_in) + len(neg_in))
    except Exception as e:
        return None, f"❌ Computation error: {e}"

    if exclude_inputs:
        inputs = {w.lower() for w in pos_in + neg_in}
        results = [(w, s) for (w, s) in results if w.lower() not in inputs]

    results = results[:topn]
    df = pd.DataFrame(results, columns=["Word", "Cosine similarity"]) if results else None

    info_bits = [
        f"**Model:** `{model_name}` (dim={model.vector_size})",
        f"**Positive:** {', '.join(pos_in) if pos_in else '—'}",
        f"**Negative:** {', '.join(neg_in) if neg_in else '—'}",
    ]
    if oov:
        info_bits.append(f"**Out-of-vocabulary skipped:** {', '.join(oov)}")
    info = "\n\n".join(info_bits)
    return df, info


def compute_abc(model_name: str, a: str, b: str, c: str, topn: int, exclude_inputs: bool):
    try:
        model = get_model(model_name)
    except Exception as e:
        return None, f"❌ Failed to load model '{model_name}': {e}"

    used, missing = [], []
    vec = None
    for word, sign in [(a, +1), (b, +1), (c, -1)]:
        w = (word or '').strip()
        if not w:
            continue
        if w in model.key_to_index:
            used.append((w, sign))
            v = model.get_vector(w)
            vec = (v if vec is None else vec + sign * v)
        else:
            missing.append(w)

    if vec is None:
        return None, "❌ No valid words to compute a vector."

    results = model.similar_by_vector(vec, topn=topn + len(used))
    if exclude_inputs:
        inputs = {w for w, _ in used}
        results = [(w, s) for (w, s) in results if w not in inputs]
    results = results[:topn]

    df = pd.DataFrame(results, columns=["Word", "Cosine similarity"]) if results else None

    info_bits = [
        f"**Model:** `{model_name}` (dim={model.vector_size})",
        f"**Used:** {', '.join([('+' if s>0 else '−') + w for w,s in used]) if used else '—'}",
    ]
    if missing:
        info_bits.append(f"**Out-of-vocabulary skipped:** {', '.join(missing)}")
    info = "\n\n".join(info_bits)
    return df, info

# ----------------------
# UI
# ----------------------
with gr.Blocks(title="Word Embeddings Playground — Gradio") as demo:
    gr.Markdown("""
    # 🧠 Word Embeddings Playground
    Type equations like `king + woman - man` and explore nearest words using pre-trained Gensim embeddings.
    """)

    with gr.Row():
        model_name = gr.Dropdown(
            choices=list(MODEL_OPTIONS.keys()),
            value="glove-wiki-gigaword-100",
            label="Model",
            info="If downloads stall, try a smaller model first (50d/100d)."
        )
        topn = gr.Slider(5, 50, value=10, step=1, label="Top N similar results")
        exclude_inputs = gr.Checkbox(value=True, label="Exclude input words from results")

    with gr.Tab("Expression: A + B − C + …"):
        expr = gr.Textbox(value="king + woman - man", label="Expression (use + and -)")
        compute_btn = gr.Button("Compute", variant="primary")
        out_df = gr.Dataframe(headers=["Word", "Cosine similarity"], interactive=False)
        out_info = gr.Markdown()

        examples = gr.Examples(
            examples=[
                ["king + woman - man"],
                ["paris - france + italy"],
                ["walk + past - present"],
                ["big - bigger + small"],
                ["programmer + woman - man"],
            ],
            inputs=[expr],
            label="Examples"
        )

        compute_btn.click(
            fn=compute_expression,
            inputs=[model_name, expr, topn, exclude_inputs],
            outputs=[out_df, out_info]
        )

    with gr.Tab("Advanced: A + B − C"):
        with gr.Row():
            a = gr.Textbox(value="king", label="Word A (+)")
            b = gr.Textbox(value="woman", label="Word B (+)")
            c = gr.Textbox(value="man", label="Word C (−)")
        compute_btn2 = gr.Button("Compute A + B − C")
        out_df2 = gr.Dataframe(headers=["Word", "Cosine similarity"], interactive=False)
        out_info2 = gr.Markdown()

        compute_btn2.click(
            fn=compute_abc,
            inputs=[model_name, a, b, c, topn, exclude_inputs],
            outputs=[out_df2, out_info2]
        )

    gr.Markdown("Built with **Gradio** + **Gensim**. Models load via `gensim.downloader`; first-time downloads can take a while depending on size.")

if __name__ == "__main__":
    demo.launch()