shreyask committed on
Commit
43fd8a7
·
verified ·
1 Parent(s): 451cbf8

fix: robust text8 loading, gensim attribution in UI, training error handling

Browse files
Files changed (2) hide show
  1. app.py +44 -28
  2. microembeddings.py +31 -9
app.py CHANGED
@@ -6,7 +6,7 @@ from sklearn.decomposition import PCA
6
  from sklearn.manifold import TSNE
7
  from microembeddings import (
8
  load_text8, build_vocab, prepare_corpus, build_neg_table,
9
- train, normalize, most_similar, analogy
10
  )
11
 
12
  # --- Global state ---
@@ -17,49 +17,57 @@ def load_pretrained():
17
  """Load pre-trained embeddings if available."""
18
  try:
19
  W = np.load("pretrained_W.npy")
20
- meta = json.load(open("pretrained_vocab.json"))
 
21
  vocab = meta["vocab"]
22
  state["W"] = W
23
  state["W_norm"] = normalize(W)
24
  state["word2idx"] = {w: i for i, w in enumerate(vocab)}
25
  state["idx2word"] = {i: w for i, w in enumerate(vocab)}
26
  state["losses"] = meta.get("losses", [])
27
- return f"Loaded pre-trained: {W.shape[0]} words x {W.shape[1]} dims"
 
 
 
28
  except FileNotFoundError:
29
  return "No pre-trained embeddings found. Train from scratch!"
30
 
31
 
32
  # --- Tab 1: Train ---
33
  def run_training(embed_dim, window_size, learning_rate, num_neg, progress=gr.Progress()):
34
- progress(0, desc="Loading text8...")
35
- words = load_text8(500000)
36
- word2idx, idx2word, freqs = build_vocab(words)
37
- corpus = prepare_corpus(words, word2idx, freqs)
38
- neg_dist = build_neg_table(freqs)
 
 
39
 
40
- state["word2idx"] = word2idx
41
- state["idx2word"] = idx2word
42
- losses = []
43
 
44
- def callback(epoch, i, total, loss):
45
- pct = i / total
46
- progress(pct, desc=f"Epoch {epoch+1}: loss={loss:.4f}")
47
- losses.append(loss)
48
 
49
- W, _ = train(corpus, len(word2idx), neg_dist,
50
- epochs=3, embed_dim=int(embed_dim), lr=learning_rate,
51
- window=int(window_size), num_neg=int(num_neg), callback=callback)
52
 
53
- state["W"] = W
54
- state["W_norm"] = normalize(W)
55
- state["losses"] = losses
56
 
57
- fig = go.Figure()
58
- fig.add_trace(go.Scatter(y=losses, mode="lines", name="Loss",
59
- line=dict(color="#4F46E5")))
60
- fig.update_layout(title="Training Loss", xaxis_title="Step", yaxis_title="Loss",
61
- template="plotly_white")
62
- return fig, f"Done! {W.shape[0]} words x {W.shape[1]} dims"
 
 
63
 
64
 
65
  # --- Tab 2: Explore ---
@@ -153,6 +161,7 @@ def find_neighbors(word):
153
 
154
  # --- Build UI ---
155
  load_msg = load_pretrained()
 
156
 
157
  with gr.Blocks(title="microembeddings", theme=gr.themes.Soft()) as demo:
158
  gr.Markdown(
@@ -163,10 +172,17 @@ with gr.Blocks(title="microembeddings", theme=gr.themes.Soft()) as demo:
163
  "(https://kshreyas.dev/post/microembeddings/)"
164
  )
165
  gr.Markdown(f"*{load_msg}*")
 
 
 
 
166
 
167
  with gr.Tabs():
168
  with gr.Tab("Train"):
169
- gr.Markdown("Train word embeddings from scratch on text8 (cleaned Wikipedia).")
 
 
 
170
  with gr.Row():
171
  dim_slider = gr.Slider(25, 100, value=50, step=25,
172
  label="Embedding dimension")
 
6
  from sklearn.manifold import TSNE
7
  from microembeddings import (
8
  load_text8, build_vocab, prepare_corpus, build_neg_table,
9
+ train, normalize, most_similar, analogy, describe_text8_source
10
  )
11
 
12
  # --- Global state ---
 
17
  """Load pre-trained embeddings if available."""
18
  try:
19
  W = np.load("pretrained_W.npy")
20
+ with open("pretrained_vocab.json") as f:
21
+ meta = json.load(f)
22
  vocab = meta["vocab"]
23
  state["W"] = W
24
  state["W_norm"] = normalize(W)
25
  state["word2idx"] = {w: i for i, w in enumerate(vocab)}
26
  state["idx2word"] = {i: w for i, w in enumerate(vocab)}
27
  state["losses"] = meta.get("losses", [])
28
+ return (
29
+ "Loaded pre-trained full-text8 gensim vectors: "
30
+ f"{W.shape[0]} words x {W.shape[1]} dims"
31
+ )
32
  except FileNotFoundError:
33
  return "No pre-trained embeddings found. Train from scratch!"
34
 
35
 
36
  # --- Tab 1: Train ---
37
  def run_training(embed_dim, window_size, learning_rate, num_neg, progress=gr.Progress()):
38
+ fig = go.Figure()
39
+ try:
40
+ progress(0, desc="Loading text8...")
41
+ words = load_text8(500000)
42
+ word2idx, idx2word, freqs = build_vocab(words)
43
+ corpus = prepare_corpus(words, word2idx, freqs)
44
+ neg_dist = build_neg_table(freqs)
45
 
46
+ state["word2idx"] = word2idx
47
+ state["idx2word"] = idx2word
48
+ losses = []
49
 
50
+ def callback(epoch, i, total, loss):
51
+ pct = i / total
52
+ progress(pct, desc=f"Epoch {epoch+1}: loss={loss:.4f}")
53
+ losses.append(loss)
54
 
55
+ W, _ = train(corpus, len(word2idx), neg_dist,
56
+ epochs=3, embed_dim=int(embed_dim), lr=learning_rate,
57
+ window=int(window_size), num_neg=int(num_neg), callback=callback)
58
 
59
+ state["W"] = W
60
+ state["W_norm"] = normalize(W)
61
+ state["losses"] = losses
62
 
63
+ fig.add_trace(go.Scatter(y=losses, mode="lines", name="Loss",
64
+ line=dict(color="#4F46E5")))
65
+ fig.update_layout(title="Training Loss", xaxis_title="Step", yaxis_title="Loss",
66
+ template="plotly_white")
67
+ return fig, f"Done! {W.shape[0]} words x {W.shape[1]} dims"
68
+ except Exception as exc:
69
+ fig.update_layout(title="Training unavailable", template="plotly_white")
70
+ return fig, f"Training failed: {exc}"
71
 
72
 
73
  # --- Tab 2: Explore ---
 
161
 
162
  # --- Build UI ---
163
  load_msg = load_pretrained()
164
+ corpus_msg = describe_text8_source()
165
 
166
  with gr.Blocks(title="microembeddings", theme=gr.themes.Soft()) as demo:
167
  gr.Markdown(
 
172
  "(https://kshreyas.dev/post/microembeddings/)"
173
  )
174
  gr.Markdown(f"*{load_msg}*")
175
+ gr.Markdown(
176
+ "*Preloaded vectors use gensim Word2Vec on the full 17M-word text8 corpus.* "
177
+ "*The Train tab reruns the NumPy implementation on a 500k-word subset so it stays interactive.*"
178
+ )
179
 
180
  with gr.Tabs():
181
  with gr.Tab("Train"):
182
+ gr.Markdown(
183
+ "Train word embeddings from scratch on text8 (cleaned Wikipedia).\n\n"
184
+ f"{corpus_msg}"
185
+ )
186
  with gr.Row():
187
  dim_slider = gr.Slider(25, 100, value=50, step=25,
188
  label="Embedding dimension")
microembeddings.py CHANGED
@@ -18,19 +18,41 @@ EPOCHS = 3
18
  MIN_COUNT = 5
19
  MAX_VOCAB = 10000
20
  SUBSAMPLE_THRESHOLD = 1e-4
 
 
 
 
 
 
 
 
 
 
 
 
21
 
22
 
23
  def load_text8(max_words=500000):
24
  """Download text8 and return list of words."""
25
- fname = "text8"
26
- if not os.path.exists(fname):
27
- url = "http://mattmahoney.net/dc/text8.zip"
28
- print("Downloading text8...")
29
- urllib.request.urlretrieve(url, "text8.zip")
30
- with zipfile.ZipFile("text8.zip") as z:
31
- z.extractall()
32
- os.remove("text8.zip")
33
- with open(fname) as f:
 
 
 
 
 
 
 
 
 
 
34
  words = f.read().split()[:max_words]
35
  print(f"Loaded {len(words)} words")
36
  return words
 
18
  MIN_COUNT = 5
19
  MAX_VOCAB = 10000
20
  SUBSAMPLE_THRESHOLD = 1e-4
21
+ TEXT8_FILE = "text8"
22
+ TEXT8_ZIP = "text8.zip"
23
+ TEXT8_URL = "http://mattmahoney.net/dc/text8.zip"
24
+
25
+
26
+ def describe_text8_source():
27
+ """Summarize how training data will be loaded."""
28
+ if os.path.exists(TEXT8_FILE):
29
+ return "Local text8 corpus found."
30
+ if os.path.exists(TEXT8_ZIP):
31
+ return "Local text8.zip found; it will be extracted on first train."
32
+ return "text8 is not bundled; Train will download it on first run."
33
 
34
 
35
  def load_text8(max_words=500000):
36
  """Download text8 and return list of words."""
37
+ downloaded = False
38
+ if not os.path.exists(TEXT8_FILE):
39
+ if not os.path.exists(TEXT8_ZIP):
40
+ print("Downloading text8...")
41
+ try:
42
+ urllib.request.urlretrieve(TEXT8_URL, TEXT8_ZIP)
43
+ except OSError as exc:
44
+ raise RuntimeError(
45
+ "Could not load text8. Add a local text8/text8.zip file or allow outbound download."
46
+ ) from exc
47
+ downloaded = True
48
+ try:
49
+ with zipfile.ZipFile(TEXT8_ZIP) as z:
50
+ z.extractall()
51
+ except (OSError, zipfile.BadZipFile) as exc:
52
+ raise RuntimeError("text8.zip is missing or invalid.") from exc
53
+ if downloaded:
54
+ os.remove(TEXT8_ZIP)
55
+ with open(TEXT8_FILE) as f:
56
  words = f.read().split()[:max_words]
57
  print(f"Loaded {len(words)} words")
58
  return words