Spaces:
Sleeping
Sleeping
fix: LR schedule, analogy examples, remove old pretrain.py
Browse files
- app.py +1 -1
- microembeddings.py +4 -4
- pretrain.py +0 -24
- pretrain_gensim.py +60 -0
app.py
CHANGED
|
@@ -215,7 +215,7 @@ with gr.Blocks(title="microembeddings", theme=gr.themes.Soft()) as demo:
|
|
| 215 |
analogy_btn = gr.Button("Solve", variant="primary")
|
| 216 |
gr.Examples(
|
| 217 |
[["man", "king", "woman"], ["france", "paris", "germany"],
|
| 218 |
-
["
|
| 219 |
inputs=[a_input, b_input, c_input]
|
| 220 |
)
|
| 221 |
analogy_text = gr.Textbox(label="Results", interactive=False, lines=6)
|
|
|
|
| 215 |
analogy_btn = gr.Button("Solve", variant="primary")
|
| 216 |
gr.Examples(
|
| 217 |
[["man", "king", "woman"], ["france", "paris", "germany"],
|
| 218 |
+
["big", "bigger", "small"]],
|
| 219 |
inputs=[a_input, b_input, c_input]
|
| 220 |
)
|
| 221 |
analogy_text = gr.Textbox(label="Results", interactive=False, lines=6)
|
microembeddings.py
CHANGED
|
@@ -80,9 +80,9 @@ def train(corpus, vocab_size, neg_dist, epochs=EPOCHS, embed_dim=EMBED_DIM,
|
|
| 80 |
W = (np.random.randn(vocab_size, embed_dim) * scale).astype(np.float32)
|
| 81 |
C = np.zeros((vocab_size, embed_dim), dtype=np.float32)
|
| 82 |
|
| 83 |
-
# Each
|
| 84 |
-
#
|
| 85 |
-
total_steps = epochs * len(corpus) * window
|
| 86 |
step = 0
|
| 87 |
losses = []
|
| 88 |
|
|
@@ -186,7 +186,7 @@ if __name__ == "__main__":
|
|
| 186 |
print(f"\n{word}: {', '.join(f'{w} ({s:.3f})' for w, s in neighbors[:5])}")
|
| 187 |
|
| 188 |
print("\n--- Analogies ---")
|
| 189 |
-
for a, b, c in [("
|
| 190 |
("big", "bigger", "small")]:
|
| 191 |
results = analogy(a, b, c, W_norm, word2idx, idx2word)
|
| 192 |
ans = results[0][0] if results else "?"
|
|
|
|
| 80 |
W = (np.random.randn(vocab_size, embed_dim) * scale).astype(np.float32)
|
| 81 |
C = np.zeros((vocab_size, embed_dim), dtype=np.float32)
|
| 82 |
|
| 83 |
+
# Each position draws actual_window ~ uniform(1..window), generating
|
| 84 |
+
# 2*actual_window context pairs. Expected pairs = 2 * E[uniform(1..w)] = w+1
|
| 85 |
+
total_steps = epochs * len(corpus) * (window + 1)
|
| 86 |
step = 0
|
| 87 |
losses = []
|
| 88 |
|
|
|
|
| 186 |
print(f"\n{word}: {', '.join(f'{w} ({s:.3f})' for w, s in neighbors[:5])}")
|
| 187 |
|
| 188 |
print("\n--- Analogies ---")
|
| 189 |
+
for a, b, c in [("man", "king", "woman"), ("france", "paris", "germany"),
|
| 190 |
("big", "bigger", "small")]:
|
| 191 |
results = analogy(a, b, c, W_norm, word2idx, idx2word)
|
| 192 |
ans = results[0][0] if results else "?"
|
pretrain.py
DELETED
|
@@ -1,24 +0,0 @@
|
|
| 1 |
-
"""Pre-train embeddings and save for the Gradio app."""
|
| 2 |
-
import json
|
| 3 |
-
import numpy as np
|
| 4 |
-
from microembeddings import (
|
| 5 |
-
load_text8, build_vocab, prepare_corpus, build_neg_table, train
|
| 6 |
-
)
|
| 7 |
-
|
| 8 |
-
words = load_text8(3000000)
|
| 9 |
-
word2idx, idx2word, freqs = build_vocab(words)
|
| 10 |
-
corpus = prepare_corpus(words, word2idx, freqs)
|
| 11 |
-
neg_dist = build_neg_table(freqs)
|
| 12 |
-
|
| 13 |
-
def progress(epoch, i, total, loss):
|
| 14 |
-
pct = i / total * 100
|
| 15 |
-
print(f" Epoch {epoch+1}: {pct:.0f}%, loss={loss:.4f}", end="\r")
|
| 16 |
-
|
| 17 |
-
W, losses = train(corpus, len(word2idx), neg_dist, epochs=3, callback=progress)
|
| 18 |
-
|
| 19 |
-
# Save embeddings as raw binary .npy (no pickle), vocab/losses as JSON
|
| 20 |
-
np.save("pretrained_W.npy", W)
|
| 21 |
-
vocab_list = [idx2word[i] for i in range(len(idx2word))]
|
| 22 |
-
with open("pretrained_vocab.json", "w") as f:
|
| 23 |
-
json.dump({"vocab": vocab_list, "losses": [float(x) for x in losses]}, f)
|
| 24 |
-
print(f"\nSaved: {W.shape[0]} words x {W.shape[1]} dims")
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
pretrain_gensim.py
ADDED
|
@@ -0,0 +1,60 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""Pre-train embeddings using gensim's optimized Word2Vec on full text8.
|
| 2 |
+
|
| 3 |
+
This is the canonical script for generating pretrained_W.npy and
|
| 4 |
+
pretrained_vocab.json. Run locally (not on HF Space):
|
| 5 |
+
|
| 6 |
+
pip install gensim numpy
|
| 7 |
+
python pretrain_gensim.py
|
| 8 |
+
|
| 9 |
+
gensim is a dev-only dependency — the Space runtime uses only numpy,
|
| 10 |
+
gradio, plotly, and scikit-learn (see requirements.txt).
|
| 11 |
+
"""
|
| 12 |
+
import json
|
| 13 |
+
import numpy as np
|
| 14 |
+
from gensim.models import Word2Vec
|
| 15 |
+
from microembeddings import load_text8
|
| 16 |
+
|
| 17 |
+
# Load full text8 corpus (17M words)
|
| 18 |
+
words = load_text8(max_words=17_000_000)
|
| 19 |
+
# gensim expects list of sentences — split into ~1000-word chunks
|
| 20 |
+
sentences = [words[i:i+1000] for i in range(0, len(words), 1000)]
|
| 21 |
+
|
| 22 |
+
print(f"Training on {len(words)} words, {len(sentences)} sentences...")
|
| 23 |
+
model = Word2Vec(
|
| 24 |
+
sentences,
|
| 25 |
+
vector_size=50,
|
| 26 |
+
window=5,
|
| 27 |
+
min_count=5,
|
| 28 |
+
sg=1, # skip-gram
|
| 29 |
+
negative=5,
|
| 30 |
+
epochs=5,
|
| 31 |
+
workers=4,
|
| 32 |
+
max_final_vocab=10000,
|
| 33 |
+
)
|
| 34 |
+
|
| 35 |
+
# Export to our format
|
| 36 |
+
vocab_list = list(model.wv.index_to_key)
|
| 37 |
+
W = np.array([model.wv[w] for w in vocab_list], dtype=np.float32)
|
| 38 |
+
|
| 39 |
+
np.save("pretrained_W.npy", W)
|
| 40 |
+
with open("pretrained_vocab.json", "w") as f:
|
| 41 |
+
json.dump({"vocab": vocab_list, "losses": []}, f)
|
| 42 |
+
|
| 43 |
+
print(f"Saved: {W.shape[0]} words x {W.shape[1]} dims")
|
| 44 |
+
|
| 45 |
+
# Quick quality check
|
| 46 |
+
from microembeddings import normalize, most_similar, analogy
|
| 47 |
+
W_norm = normalize(W)
|
| 48 |
+
word2idx = {w: i for i, w in enumerate(vocab_list)}
|
| 49 |
+
idx2word = {i: w for i, w in enumerate(vocab_list)}
|
| 50 |
+
|
| 51 |
+
print("\n--- Nearest Neighbors ---")
|
| 52 |
+
for word in ["king", "france", "dog", "computer"]:
|
| 53 |
+
neighbors = most_similar(word, W_norm, word2idx, idx2word, topn=5)
|
| 54 |
+
print(f"{word}: {', '.join(f'{w} ({s:.3f})' for w, s in neighbors)}")
|
| 55 |
+
|
| 56 |
+
print("\n--- Analogies ---")
|
| 57 |
+
for a, b, c in [("man", "king", "woman"), ("france", "paris", "germany"), ("big", "bigger", "small")]:
|
| 58 |
+
results = analogy(a, b, c, W_norm, word2idx, idx2word)
|
| 59 |
+
ans = results[0][0] if results else "?"
|
| 60 |
+
print(f"{a} : {b} :: {c} : {ans}")
|