farquasar commited on
Commit
a4eef9d
Β·
verified Β·
1 Parent(s): 0fbf412

Create app.py

Browse files
Files changed (1) hide show
  1. app.py +180 -0
app.py ADDED
@@ -0,0 +1,180 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import gradio as gr
2
+ import re
3
+ from functools import lru_cache
4
+ import gensim.downloader as api
5
+ from gensim.models import KeyedVectors
6
+ import pandas as pd
7
+
8
# Pre-trained embeddings available through gensim.downloader.
# Keys are the exact gensim-downloader model names (used as Dropdown choices);
# values are human-readable descriptions.
# NOTE: the em-dashes were mojibake ("β€”", UTF-8 read as cp1252) — fixed to "—".
MODEL_OPTIONS = {
    "glove-wiki-gigaword-50": "50d GloVe (Wikipedia+Gigaword) — small & fast",
    "glove-wiki-gigaword-100": "100d GloVe (Wikipedia+Gigaword) — balanced",
    "glove-wiki-gigaword-200": "200d GloVe (Wikipedia+Gigaword)",
    "glove-wiki-gigaword-300": "300d GloVe (Wikipedia+Gigaword)",
    "word2vec-google-news-300": "300d Google News Word2Vec — large (~1.6GB)"
}
15
+
16
+ TOKEN_RE = re.compile(r"[+\-]|[^+\-\s]+")
17
+
18
@lru_cache(maxsize=4)
def get_model(name: str) -> KeyedVectors:
    """Return the pre-trained vectors for *name*, downloading on first use.

    Up to four distinct models stay memoized, so switching models in the UI
    does not trigger a re-download or re-parse of the vector file.
    """
    vectors = api.load(name)
    return vectors
22
+
23
def parse_expression(expr: str):
    """Split a word-arithmetic expression into (positive, negative) word lists.

    Example: "king + woman - man" -> (["king", "woman"], ["man"]).
    A sign applies to every following word until the next sign appears;
    words before any sign count as positive.  Empty input yields ([], []).
    """
    positives, negatives = [], []
    bucket = positives  # list that the next word token is appended to
    for token in TOKEN_RE.findall(expr.strip()):
        token = token.strip()
        if token == '+':
            bucket = positives
        elif token == '-':
            bucket = negatives
        else:
            bucket.append(token)
    return positives, negatives
35
+
36
+ # ----------------------
37
+ # Compute functions
38
+ # ----------------------
39
+
40
def compute_expression(model_name: str, expr: str, topn: int, exclude_inputs: bool):
    """Evaluate a '+'/'-' word expression against a pre-trained embedding.

    Parameters
    ----------
    model_name : gensim-downloader model key (see MODEL_OPTIONS).
    expr : expression such as "king + woman - man".
    topn : number of nearest neighbours to return.
    exclude_inputs : drop the query words themselves from the results.

    Returns
    -------
    (pandas.DataFrame | None, str)
        Results table (None when empty/error) and a markdown status string.
        Errors are reported in the string rather than raised, so the UI
        always gets a renderable response.
    """
    try:
        model = get_model(model_name)
    except Exception as e:
        return None, f"❌ Failed to load model '{model_name}': {e}"

    # Gradio sliders can deliver floats; gensim's `topn` and list slicing
    # below both require an int.
    topn = int(topn)

    pos, neg = parse_expression(expr or "")
    if not pos and not neg:
        return None, "⚠️ Please enter at least one word."

    # Partition the query words into in-vocabulary and out-of-vocabulary.
    pos_in = [w for w in pos if w in model.key_to_index]
    neg_in = [w for w in neg if w in model.key_to_index]
    oov = [w for w in pos + neg if w not in model.key_to_index]

    if not pos_in and not neg_in:
        return None, "❌ All words are out-of-vocabulary for this model. Try different words or a different model."

    try:
        # Over-fetch so that filtering out the input words below can still
        # leave `topn` rows.
        results = model.most_similar(positive=pos_in, negative=neg_in, topn=topn + len(pos_in) + len(neg_in))
    except Exception as e:
        return None, f"❌ Computation error: {e}"

    if exclude_inputs:
        inputs = {w.lower() for w in pos_in + neg_in}
        results = [(w, s) for (w, s) in results if w.lower() not in inputs]

    results = results[:topn]
    df = pd.DataFrame(results, columns=["Word", "Cosine similarity"]) if results else None

    # "—" placeholders below were mojibake ("β€”") in the original; fixed.
    info_bits = [
        f"**Model:** `{model_name}` (dim={model.vector_size})",
        f"**Positive:** {', '.join(pos_in) if pos_in else '—'}",
        f"**Negative:** {', '.join(neg_in) if neg_in else '—'}",
    ]
    if oov:
        info_bits.append(f"**Out-of-vocabulary skipped:** {', '.join(oov)}")
    info = "\n\n".join(info_bits)
    return df, info
78
+
79
+
80
def compute_abc(model_name: str, a: str, b: str, c: str, topn: int, exclude_inputs: bool):
    """Compute the analogy vector A + B − C and list its nearest neighbours.

    Blank and out-of-vocabulary words are skipped (and reported) rather than
    raising.  Returns (pandas.DataFrame | None, markdown_info_string), with
    errors reported in the string so the UI always gets a renderable response.
    """
    try:
        model = get_model(model_name)
    except Exception as e:
        return None, f"❌ Failed to load model '{model_name}': {e}"

    # Gradio sliders can deliver floats; gensim's `topn` and list slicing
    # below both require an int.
    topn = int(topn)

    used, missing = [], []
    vec = None
    for word, sign in [(a, +1), (b, +1), (c, -1)]:
        w = (word or '').strip()
        if not w:
            continue
        if w in model.key_to_index:
            used.append((w, sign))
            v = model.get_vector(w)
            # BUG FIX: the first in-vocabulary word must also honour its sign.
            # Previously `vec = v` dropped it, so e.g. only-C input produced
            # +C instead of −C.
            vec = sign * v if vec is None else vec + sign * v
        else:
            missing.append(w)

    if vec is None:
        return None, "❌ No valid words to compute a vector."

    # Over-fetch so that filtering out the input words below can still
    # leave `topn` rows.
    results = model.similar_by_vector(vec, topn=topn + len(used))
    if exclude_inputs:
        # Case-insensitive, for consistency with compute_expression.
        inputs = {w.lower() for w, _ in used}
        results = [(w, s) for (w, s) in results if w.lower() not in inputs]
    results = results[:topn]

    df = pd.DataFrame(results, columns=["Word", "Cosine similarity"]) if results else None

    # "−"/"—" below were mojibake ("βˆ’"/"β€”") in the original; fixed.
    info_bits = [
        f"**Model:** `{model_name}` (dim={model.vector_size})",
        f"**Used:** {', '.join([('+' if s > 0 else '−') + w for w, s in used]) if used else '—'}",
    ]
    if missing:
        info_bits.append(f"**Out-of-vocabulary skipped:** {', '.join(missing)}")
    info = "\n\n".join(info_bits)
    return df, info
118
+
119
+ # ----------------------
120
+ # UI
121
+ # ----------------------
122
# Top-level Gradio layout.  Components are declared inside the Blocks
# context; `demo` is launched by the __main__ guard at the bottom of the
# file.  Mojibake "β€”"/"βˆ’" in the title/labels/buttons fixed to "—"/"−".
with gr.Blocks(title="Word Embeddings Playground — Gradio") as demo:
    gr.Markdown("""
    # 🧠 Word Embeddings Playground
    Type equations like `king + woman - man` and explore nearest words using pre-trained Gensim embeddings.
    """)

    # Shared controls: which embedding to query and how results are shaped.
    with gr.Row():
        model_name = gr.Dropdown(
            choices=list(MODEL_OPTIONS.keys()),
            value="glove-wiki-gigaword-100",
            label="Model",
            info="If downloads stall, try a smaller model first (50d/100d)."
        )
        topn = gr.Slider(5, 50, value=10, step=1, label="Top N similar results")
        exclude_inputs = gr.Checkbox(value=True, label="Exclude input words from results")

    # Free-form tab: arbitrary chains of +/- words, parsed by parse_expression.
    with gr.Tab("Expression: A + B − C + …"):
        expr = gr.Textbox(value="king + woman - man", label="Expression (use + and -)")
        compute_btn = gr.Button("Compute", variant="primary")
        out_df = gr.Dataframe(headers=["Word", "Cosine similarity"], interactive=False)
        out_info = gr.Markdown()

        # gr.Examples registers itself on creation; the return value was an
        # unused local, so it is not kept.
        gr.Examples(
            examples=[
                ["king + woman - man"],
                ["paris - france + italy"],
                ["walk + past - present"],
                ["big - bigger + small"],
                ["programmer + woman - man"],
            ],
            inputs=[expr],
            label="Examples"
        )

        compute_btn.click(
            fn=compute_expression,
            inputs=[model_name, expr, topn, exclude_inputs],
            outputs=[out_df, out_info]
        )

    # Fixed three-slot analogy tab: A + B − C via compute_abc.
    with gr.Tab("Advanced: A + B − C"):
        with gr.Row():
            a = gr.Textbox(value="king", label="Word A (+)")
            b = gr.Textbox(value="woman", label="Word B (+)")
            c = gr.Textbox(value="man", label="Word C (−)")
        compute_btn2 = gr.Button("Compute A + B − C")
        out_df2 = gr.Dataframe(headers=["Word", "Cosine similarity"], interactive=False)
        out_info2 = gr.Markdown()

        compute_btn2.click(
            fn=compute_abc,
            inputs=[model_name, a, b, c, topn, exclude_inputs],
            outputs=[out_df2, out_info2]
        )

    gr.Markdown("Built with **Gradio** + **Gensim**. Models load via `gensim.downloader`; first-time downloads can take a while depending on size.")
178
+
179
# Entry point: launch the Gradio server only when run as a script,
# not when the module is imported.
if __name__ == "__main__":
    demo.launch()