shreyask committed on
Commit
451cbf8
·
verified ·
1 Parent(s): ac7577a

fix: LR schedule, analogy examples, remove old pretrain.py

Browse files
Files changed (4) hide show
  1. app.py +1 -1
  2. microembeddings.py +4 -4
  3. pretrain.py +0 -24
  4. pretrain_gensim.py +60 -0
app.py CHANGED
@@ -215,7 +215,7 @@ with gr.Blocks(title="microembeddings", theme=gr.themes.Soft()) as demo:
215
  analogy_btn = gr.Button("Solve", variant="primary")
216
  gr.Examples(
217
  [["man", "king", "woman"], ["france", "paris", "germany"],
218
- ["bigger", "big", "small"]],
219
  inputs=[a_input, b_input, c_input]
220
  )
221
  analogy_text = gr.Textbox(label="Results", interactive=False, lines=6)
 
215
  analogy_btn = gr.Button("Solve", variant="primary")
216
  gr.Examples(
217
  [["man", "king", "woman"], ["france", "paris", "germany"],
218
+ ["big", "bigger", "small"]],
219
  inputs=[a_input, b_input, c_input]
220
  )
221
  analogy_text = gr.Textbox(label="Results", interactive=False, lines=6)
microembeddings.py CHANGED
@@ -80,9 +80,9 @@ def train(corpus, vocab_size, neg_dist, epochs=EPOCHS, embed_dim=EMBED_DIM,
80
  W = (np.random.randn(vocab_size, embed_dim) * scale).astype(np.float32)
81
  C = np.zeros((vocab_size, embed_dim), dtype=np.float32)
82
 
83
- # Each corpus position generates ~window context pairs on average
84
- # (random window from 1..window, mean = (window+1)/2, times 2 sides)
85
- total_steps = epochs * len(corpus) * window
86
  step = 0
87
  losses = []
88
 
@@ -186,7 +186,7 @@ if __name__ == "__main__":
186
  print(f"\n{word}: {', '.join(f'{w} ({s:.3f})' for w, s in neighbors[:5])}")
187
 
188
  print("\n--- Analogies ---")
189
- for a, b, c in [("king", "man", "woman"), ("paris", "france", "germany"),
190
  ("big", "bigger", "small")]:
191
  results = analogy(a, b, c, W_norm, word2idx, idx2word)
192
  ans = results[0][0] if results else "?"
 
80
  W = (np.random.randn(vocab_size, embed_dim) * scale).astype(np.float32)
81
  C = np.zeros((vocab_size, embed_dim), dtype=np.float32)
82
 
83
+ # Each position draws actual_window ~ uniform(1..window), generating
84
+ # 2*actual_window context pairs. Expected pairs = 2 * E[uniform(1..w)] = w+1
85
+ total_steps = epochs * len(corpus) * (window + 1)
86
  step = 0
87
  losses = []
88
 
 
186
  print(f"\n{word}: {', '.join(f'{w} ({s:.3f})' for w, s in neighbors[:5])}")
187
 
188
  print("\n--- Analogies ---")
189
+ for a, b, c in [("man", "king", "woman"), ("france", "paris", "germany"),
190
  ("big", "bigger", "small")]:
191
  results = analogy(a, b, c, W_norm, word2idx, idx2word)
192
  ans = results[0][0] if results else "?"
pretrain.py DELETED
@@ -1,24 +0,0 @@
1
- """Pre-train embeddings and save for the Gradio app."""
2
- import json
3
- import numpy as np
4
- from microembeddings import (
5
- load_text8, build_vocab, prepare_corpus, build_neg_table, train
6
- )
7
-
8
- words = load_text8(3000000)
9
- word2idx, idx2word, freqs = build_vocab(words)
10
- corpus = prepare_corpus(words, word2idx, freqs)
11
- neg_dist = build_neg_table(freqs)
12
-
13
- def progress(epoch, i, total, loss):
14
- pct = i / total * 100
15
- print(f" Epoch {epoch+1}: {pct:.0f}%, loss={loss:.4f}", end="\r")
16
-
17
- W, losses = train(corpus, len(word2idx), neg_dist, epochs=3, callback=progress)
18
-
19
- # Save embeddings as raw binary .npy (no pickle), vocab/losses as JSON
20
- np.save("pretrained_W.npy", W)
21
- vocab_list = [idx2word[i] for i in range(len(idx2word))]
22
- with open("pretrained_vocab.json", "w") as f:
23
- json.dump({"vocab": vocab_list, "losses": [float(x) for x in losses]}, f)
24
- print(f"\nSaved: {W.shape[0]} words x {W.shape[1]} dims")
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
pretrain_gensim.py ADDED
@@ -0,0 +1,60 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """Pre-train embeddings using gensim's optimized Word2Vec on full text8.
2
+
3
+ This is the canonical script for generating pretrained_W.npy and
4
+ pretrained_vocab.json. Run locally (not on HF Space):
5
+
6
+ pip install gensim numpy
7
+ python pretrain_gensim.py
8
+
9
+ gensim is a dev-only dependency — the Space runtime uses only numpy,
10
+ gradio, plotly, and scikit-learn (see requirements.txt).
11
+ """
12
+ import json
13
+ import numpy as np
14
+ from gensim.models import Word2Vec
15
+ from microembeddings import load_text8
16
+
17
+ # Load full text8 corpus (17M words)
18
+ words = load_text8(max_words=17_000_000)
19
+ # gensim expects list of sentences — split into ~1000-word chunks
20
+ sentences = [words[i:i+1000] for i in range(0, len(words), 1000)]
21
+
22
+ print(f"Training on {len(words)} words, {len(sentences)} sentences...")
23
+ model = Word2Vec(
24
+ sentences,
25
+ vector_size=50,
26
+ window=5,
27
+ min_count=5,
28
+ sg=1, # skip-gram
29
+ negative=5,
30
+ epochs=5,
31
+ workers=4,
32
+ max_final_vocab=10000,
33
+ )
34
+
35
+ # Export to our format
36
+ vocab_list = list(model.wv.index_to_key)
37
+ W = np.array([model.wv[w] for w in vocab_list], dtype=np.float32)
38
+
39
+ np.save("pretrained_W.npy", W)
40
+ with open("pretrained_vocab.json", "w") as f:
41
+ json.dump({"vocab": vocab_list, "losses": []}, f)
42
+
43
+ print(f"Saved: {W.shape[0]} words x {W.shape[1]} dims")
44
+
45
+ # Quick quality check
46
+ from microembeddings import normalize, most_similar, analogy
47
+ W_norm = normalize(W)
48
+ word2idx = {w: i for i, w in enumerate(vocab_list)}
49
+ idx2word = {i: w for i, w in enumerate(vocab_list)}
50
+
51
+ print("\n--- Nearest Neighbors ---")
52
+ for word in ["king", "france", "dog", "computer"]:
53
+ neighbors = most_similar(word, W_norm, word2idx, idx2word, topn=5)
54
+ print(f"{word}: {', '.join(f'{w} ({s:.3f})' for w, s in neighbors)}")
55
+
56
+ print("\n--- Analogies ---")
57
+ for a, b, c in [("man", "king", "woman"), ("france", "paris", "germany"), ("big", "bigger", "small")]:
58
+ results = analogy(a, b, c, W_norm, word2idx, idx2word)
59
+ ans = results[0][0] if results else "?"
60
+ print(f"{a} : {b} :: {c} : {ans}")