"""Word Embeddings Playground.

A Gradio UI for exploring word-vector arithmetic (e.g. ``king + woman - man``)
over pre-trained Gensim embeddings downloaded via ``gensim.downloader``.
"""

import re
from functools import lru_cache

import gensim.downloader as api
import gradio as gr
import pandas as pd
from gensim.models import KeyedVectors

# Pre-trained models offered in the UI, mapped to a human-readable description.
MODEL_OPTIONS = {
    "glove-wiki-gigaword-50": "50d GloVe (Wikipedia+Gigaword) — small & fast",
    "glove-wiki-gigaword-100": "100d GloVe (Wikipedia+Gigaword) — balanced",
    "glove-wiki-gigaword-200": "200d GloVe (Wikipedia+Gigaword)",
    "glove-wiki-gigaword-300": "300d GloVe (Wikipedia+Gigaword)",
    "word2vec-google-news-300": "300d Google News Word2Vec — large (~1.6GB)",
}

# Tokenizes an expression into sign operators ('+'/'-') and word chunks
# (maximal runs of characters that are neither signs nor whitespace).
TOKEN_RE = re.compile(r"[+\-]|[^+\-\s]+")


@lru_cache(maxsize=4)
def get_model(name: str) -> KeyedVectors:
    """Load/download a pre-trained embedding with caching."""
    return api.load(name)


def parse_expression(expr: str):
    """Split an expression like ``"king + woman - man"`` into word lists.

    A sign applies to every subsequent word until the next sign appears;
    the implicit initial sign is ``+``.

    Returns:
        Tuple ``(positive_words, negative_words)``; both empty if ``expr``
        contains no tokens.
    """
    tokens = TOKEN_RE.findall(expr.strip())
    if not tokens:
        return [], []
    pos, neg, sign = [], [], '+'
    for tok in tokens:
        tok = tok.strip()
        if tok in ['+', '-']:
            sign = tok
            continue
        (pos if sign == '+' else neg).append(tok)
    return pos, neg


# ----------------------
# Compute functions
# ----------------------

def compute_expression(model_name: str, expr: str, topn: int, exclude_inputs: bool):
    """Evaluate a +/- word expression and return nearest neighbours.

    Args:
        model_name: Key from ``MODEL_OPTIONS`` naming the embedding to use.
        expr: Expression such as ``"king + woman - man"``.
        topn: Number of neighbours to return.
        exclude_inputs: Drop input words (case-insensitively) from results.

    Returns:
        ``(df, info)`` where ``df`` is a pandas DataFrame of
        (word, cosine similarity) rows or ``None``, and ``info`` is a
        Markdown status string.
    """
    try:
        model = get_model(model_name)
    except Exception as e:
        return None, f"❌ Failed to load model '{model_name}': {e}"

    pos, neg = parse_expression(expr or "")
    if not pos and not neg:
        return None, "⚠️ Please enter at least one word."

    # Partition into in-vocabulary / out-of-vocabulary words.
    pos_in = [w for w in pos if w in model.key_to_index]
    neg_in = [w for w in neg if w in model.key_to_index]
    oov = [w for w in pos + neg if w not in model.key_to_index]
    if not pos_in and not neg_in:
        return None, "❌ All words are out-of-vocabulary for this model. Try different words or a different model."

    # Gradio sliders may deliver floats; gensim expects an int topn.
    topn = int(topn)
    try:
        # Over-fetch so that filtering out the inputs still leaves `topn` rows.
        results = model.most_similar(
            positive=pos_in, negative=neg_in, topn=topn + len(pos_in) + len(neg_in)
        )
    except Exception as e:
        return None, f"❌ Computation error: {e}"

    if exclude_inputs:
        inputs = {w.lower() for w in pos_in + neg_in}
        results = [(w, s) for (w, s) in results if w.lower() not in inputs]
    results = results[:topn]

    df = pd.DataFrame(results, columns=["Word", "Cosine similarity"]) if results else None
    info_bits = [
        f"**Model:** `{model_name}` (dim={model.vector_size})",
        f"**Positive:** {', '.join(pos_in) if pos_in else '—'}",
        f"**Negative:** {', '.join(neg_in) if neg_in else '—'}",
    ]
    if oov:
        info_bits.append(f"**Out-of-vocabulary skipped:** {', '.join(oov)}")
    info = "\n\n".join(info_bits)
    return df, info


def compute_abc(model_name: str, a: str, b: str, c: str, topn: int, exclude_inputs: bool):
    """Compute the A + B − C analogy vector and return nearest neighbours.

    Args:
        model_name: Key from ``MODEL_OPTIONS`` naming the embedding to use.
        a, b: Words added to the query vector (blank words are skipped).
        c: Word subtracted from the query vector.
        topn: Number of neighbours to return.
        exclude_inputs: Drop the input words themselves from results.

    Returns:
        ``(df, info)`` — same shape as :func:`compute_expression`.
    """
    try:
        model = get_model(model_name)
    except Exception as e:
        return None, f"❌ Failed to load model '{model_name}': {e}"

    used, missing = [], []
    vec = None
    for word, sign in [(a, +1), (b, +1), (c, -1)]:
        w = (word or '').strip()
        if not w:
            continue
        if w in model.key_to_index:
            used.append((w, sign))
            v = model.get_vector(w)
            # BUGFIX: the first valid word must also honour its sign — the
            # original seeded `vec` with `v` unconditionally, so a lone
            # negative word produced +v instead of -v.
            vec = sign * v if vec is None else vec + sign * v
        else:
            missing.append(w)

    if vec is None:
        return None, "❌ No valid words to compute a vector."

    topn = int(topn)
    # Over-fetch so that filtering out the inputs still leaves `topn` rows.
    results = model.similar_by_vector(vec, topn=topn + len(used))
    if exclude_inputs:
        inputs = {w for w, _ in used}
        results = [(w, s) for (w, s) in results if w not in inputs]
    results = results[:topn]

    df = pd.DataFrame(results, columns=["Word", "Cosine similarity"]) if results else None
    info_bits = [
        f"**Model:** `{model_name}` (dim={model.vector_size})",
        f"**Used:** {', '.join([('+' if s > 0 else '−') + w for w, s in used]) if used else '—'}",
    ]
    if missing:
        info_bits.append(f"**Out-of-vocabulary skipped:** {', '.join(missing)}")
    info = "\n\n".join(info_bits)
    return df, info


# ----------------------
# UI
# ----------------------

with gr.Blocks(title="Word Embeddings Playground — Gradio") as demo:
    gr.Markdown("""
# 🧠 Word Embeddings Playground
Type equations like `king + woman - man` and explore nearest words using pre-trained Gensim embeddings.
""")

    with gr.Row():
        model_name = gr.Dropdown(
            choices=list(MODEL_OPTIONS.keys()),
            value="glove-wiki-gigaword-100",
            label="Model",
            info="If downloads stall, try a smaller model first (50d/100d).",
        )
        topn = gr.Slider(5, 50, value=10, step=1, label="Top N similar results")
        exclude_inputs = gr.Checkbox(value=True, label="Exclude input words from results")

    with gr.Tab("Expression: A + B − C + …"):
        expr = gr.Textbox(value="king + woman - man", label="Expression (use + and -)")
        compute_btn = gr.Button("Compute", variant="primary")
        out_df = gr.Dataframe(headers=["Word", "Cosine similarity"], interactive=False)
        out_info = gr.Markdown()
        gr.Examples(
            examples=[
                ["king + woman - man"],
                ["paris - france + italy"],
                ["walk + past - present"],
                ["big - bigger + small"],
                ["programmer + woman - man"],
            ],
            inputs=[expr],
            label="Examples",
        )
        compute_btn.click(
            fn=compute_expression,
            inputs=[model_name, expr, topn, exclude_inputs],
            outputs=[out_df, out_info],
        )

    with gr.Tab("Advanced: A + B − C"):
        with gr.Row():
            a = gr.Textbox(value="king", label="Word A (+)")
            b = gr.Textbox(value="woman", label="Word B (+)")
            c = gr.Textbox(value="man", label="Word C (−)")
        compute_btn2 = gr.Button("Compute A + B − C")
        out_df2 = gr.Dataframe(headers=["Word", "Cosine similarity"], interactive=False)
        out_info2 = gr.Markdown()
        compute_btn2.click(
            fn=compute_abc,
            inputs=[model_name, a, b, c, topn, exclude_inputs],
            outputs=[out_df2, out_info2],
        )

    gr.Markdown("Built with **Gradio** + **Gensim**. Models load via `gensim.downloader`; first-time downloads can take a while depending on size.")

if __name__ == "__main__":
    demo.launch()