fast-bulk

Sleeping

App Files Files Community

koaning committed on Feb 27

Commit

c1bba49

1 Parent(s): a2cb3cf

fast-bulk

Browse files

Files changed (1) hide show

app.py +76 -138

app.py CHANGED Viewed

@@ -10,6 +10,7 @@
 #     "scikit-learn==1.6.1",
 #     "numpy==2.1.3",
 #     "mohtml==0.1.2",
 # ]
 # ///
@@ -66,124 +67,105 @@ def _(mo, pl, should_stop, uploaded_file, use_default_switch):
 @app.cell
-def _(SentenceTransformer, mo, texts):
     with mo.status.spinner(subtitle="Creating embeddings ...") as _spinner:
-        tfm = SentenceTransformer("all-MiniLM-L6-v2")
         X = tfm.encode(texts)
-    return X, tfm
 @app.cell
-def _(X, mo):
-    with mo.status.spinner(subtitle="Running UMAP ...") as _spinner:
-        from umap import UMAP
-        umap_tfm = UMAP()
-        X_tfm = umap_tfm.fit_transform(X)
-    return UMAP, X_tfm, umap_tfm
 @app.cell
-def _(add_label, mo, neg_label, pos_label, undo):
-    btn_spam = mo.ui.button(label=f"Annotate {neg_label.value}", on_click=lambda d: add_label(neg_label.value))
-    btn_ham = mo.ui.button(label=f"Annotate {pos_label.value}", on_click=lambda d: add_label(pos_label.value))
-    btn_undo = mo.ui.button(label="Undo", on_click=lambda d: undo())
-    return btn_ham, btn_spam, btn_undo
 @app.cell
-def _(chart, get_label, neg_label, pos_label, set_label):
-    def add_label(lab):
         current_labels = get_label()
-        if lab == neg_label.value:
-            new_ham = list(set(current_labels[pos_label.value]).difference(chart.value["index"]))
-            new_spam = list(set(current_labels[neg_label.value]).union(chart.value["index"]))
-        if lab == pos_label.value:
-            new_ham = list(set(current_labels[pos_label.value]).union(chart.value["index"]))
-            new_spam = list(set(current_labels[neg_label.value]).difference(chart.value["index"]))
-        set_label({neg_label.value: new_spam, pos_label.value: new_ham})
     return (add_label,)
 @app.cell
-def _(
-    br,
-    btn_ham,
-    btn_spam,
-    btn_undo,
-    chart,
-    form,
-    json_download,
-    mo,
-    neg_label,
-    pos_label,
-    switch,
-):
-    mo.vstack([
-        mo.md("Assign label names"),
-        mo.hstack([pos_label, neg_label]),
-        mo.md("Explore the data"),
-        mo.hstack([btn_ham, btn_spam, btn_undo, switch, json_download]),
-        br(),
-        form if switch.value else "",
-        br() if switch.value else "",
-        chart
-    ])
-    return
 @app.cell
-def _(chart):
-    chart.value["text"]
-    return
 @app.cell
-def _(chart, get_label, neg_label, pos_label, set_label):
-    def undo():
-        current_labels = get_label()
-        new_spam = set(current_labels[neg_label.value]).difference(chart.value["index"])
-        new_ham = set(current_labels[pos_label.value]).difference(chart.value["index"])
-        set_label({neg_label.value: list(new_spam), pos_label.value: list(new_ham)})
-    return (undo,)
 @app.cell
-def _():
-    from mohtml import br
-    return (br,)
 @app.cell
-def _(mo, neg_label, pos_label):
-    get_label, set_label = mo.state({pos_label.value: [], neg_label.value: []})
-    return get_label, set_label
 @app.cell
 def _(mo):
     text_input = mo.ui.text_area(label="Reference sentences")
     form = mo.md("""{text_input}""").batch(text_input=text_input).form()
     return form, text_input
 @app.cell
-def _(df_emb, labels, mo):
-    from collections import Counter
-    with mo.status.spinner(subtitle="Starting UI ...") as _spinner:
-        df_emb
-    Counter(labels)
-    return (Counter,)
-@app.cell
-def _(df_emb, mo, pl):
     import json
-    data = df_emb.filter(pl.col("label") != "unlabeled").select("text", "label").to_dicts()
     json_download = mo.download(
         data=json.dumps(data).encode("utf-8"),
@@ -195,47 +177,9 @@ def _(df_emb, mo, pl):
 @app.cell
-def _(df_emb, mo, scatter):
-    chart = mo.ui.altair_chart(scatter(df_emb))
-    return (chart,)
-@app.cell
-def _(mo):
-    switch = mo.ui.switch(False, label="Use search")
-    return (switch,)
-@app.cell
-def _(alt, neg_label, pos_label, switch):
-    def scatter(df):
-        return (alt.Chart(df)
-        .mark_circle()
-        .encode(
-            x=alt.X("x:Q"),
-            y=alt.Y("y:Q"),
-            color=alt.Color("sim:Q") if switch.value else alt.Color("label:N", scale=alt.Scale(
-               domain=['unlabeled', pos_label.value, neg_label.value],
-               range=['steelblue', 'green', 'red']
-            ))
-        ).properties(width=500, height=500))
-    return (scatter,)
-@app.cell
-def _(
-    X,
-    X_tfm,
-    cosine_similarity,
-    form,
-    get_label,
-    neg_label,
-    np,
-    pl,
-    pos_label,
-    texts,
-    tfm,
-):
     df_emb = (
         pl.DataFrame({
             "x": X_tfm[:, 0],
@@ -245,25 +189,18 @@ def _(
         }).with_columns(sim=pl.lit(1))
     )
-    if form.value:
-        query = tfm.encode([form.value["text_input"]])
-        similarity = cosine_similarity(query, X)[0]
-        df_emb = df_emb.with_columns(sim=similarity)
-    spam = set(get_label()[neg_label.value])
-    ham = set(get_label()[pos_label.value])
-    labels = []
-    for i in range(df_emb.shape[0]):
-        if i in spam:
-            labels.append(neg_label.value)
-        elif i in ham:
-            labels.append(pos_label.value)
-        else:
-            labels.append("unlabeled")
-    df_emb = df_emb.with_columns(label=np.array(labels))
-    return df_emb, ham, i, labels, query, similarity, spam
 @app.cell
@@ -274,14 +211,15 @@ def _(mo):
         import numpy as np
         from sklearn.metrics.pairwise import cosine_similarity
         from sklearn.linear_model import LogisticRegression
-    return LogisticRegression, alt, cosine_similarity, np, pl
 @app.cell
 def _(mo):
-    with mo.status.spinner(subtitle="Loading SBERT ...") as _spinner:
-        from sentence_transformers import SentenceTransformer
-    return (SentenceTransformer,)
 @app.cell

 #     "scikit-learn==1.6.1",
 #     "numpy==2.1.3",
 #     "mohtml==0.1.2",
+#     "model2vec==0.4.0",
 # ]
 # ///
 @app.cell
+def _(StaticModel, mo):
+    with mo.status.spinner(subtitle="Loading model ...") as _spinner:
+        tfm = StaticModel.from_pretrained("minishlab/potion-retrieval-32M")
+    return (tfm,)
+@app.cell
+def _(mo, texts, tfm):
     with mo.status.spinner(subtitle="Creating embeddings ...") as _spinner:
         X = tfm.encode(texts)
+    return (X,)
 @app.cell
+def _(PCA, X, mo):
+    with mo.status.spinner(subtitle="Running PCA ...") as _spinner:
+        pca_tfm = PCA()
+        X_tfm = pca_tfm.fit_transform(X)
+    return X_tfm, pca_tfm
 @app.cell
+def _(add_label, get_example, mo, neg_label, pos_label):
+    btn_spam = mo.ui.button(
+        label=f"Annotate {neg_label.value}",
+        on_click=lambda d: add_label(get_example(), neg_label.value)
+    )
+    btn_ham = mo.ui.button(
+        label=f"Annotate {pos_label.value}",
+        on_click=lambda d: add_label(get_example(), pos_label.value)
+    )
+    return btn_ham, btn_spam
 @app.cell
+def _(gen, get_label, set_example, set_label):
+    def add_label(text, lab):
         current_labels = get_label()
+        set_label(current_labels + [{"text": text, "label": lab}])
+        set_example(next(gen))
     return (add_label,)
 @app.cell
+def _():
+    from mohtml import br
+    return (br,)
 @app.cell
+def _(mo):
+    get_label, set_label = mo.state([])
+    return get_label, set_label
 @app.cell
+def _(gen, mo):
+    get_example, set_example = mo.state(next(gen))
+    return get_example, set_example
 @app.cell
+def _(div, get_example, p):
+    div(
+        p(get_example()),
+        klass="bg-gray-100 p-4 rounded-lg"
+    )
+    return
 @app.cell
+def _(btn_ham, btn_spam, mo):
+    mo.hstack([
+        btn_ham, btn_spam
+    ])
+    return
+@app.cell
+def _():
+    from mohtml import tailwind_css, div, p
+    tailwind_css()
+    return div, p, tailwind_css
 @app.cell
 def _(mo):
     text_input = mo.ui.text_area(label="Reference sentences")
     form = mo.md("""{text_input}""").batch(text_input=text_input).form()
+    form
     return form, text_input
 @app.cell
+def _(get_label, mo):
     import json
+    data = get_label()
     json_download = mo.download(
         data=json.dumps(data).encode("utf-8"),
 @app.cell
+def _(X, X_tfm, cosine_similarity, form, mo, pl, texts, tfm):
+    mo.stop(not form.value["text_input"], "Need a text input to fetch example")
     df_emb = (
         pl.DataFrame({
             "x": X_tfm[:, 0],
         }).with_columns(sim=pl.lit(1))
     )
+    query = tfm.encode([form.value["text_input"]])
+    similarity = cosine_similarity(query, X)[0]
+    df_emb = df_emb.with_columns(sim=similarity).sort(pl.col("sim"), descending=True)
+    gen = (_["text"] for _ in df_emb.head(100).to_dicts())
+    return df_emb, gen, query, similarity
+@app.cell
+def _(get_label, pl):
+    pl.DataFrame(get_label())
+    return
 @app.cell
         import numpy as np
         from sklearn.metrics.pairwise import cosine_similarity
         from sklearn.linear_model import LogisticRegression
+        from sklearn.decomposition import PCA
+    return LogisticRegression, PCA, alt, cosine_similarity, np, pl
 @app.cell
 def _(mo):
+    with mo.status.spinner(subtitle="Loading model2vec ...") as _spinner:
+        from model2vec import StaticModel
+    return (StaticModel,)
 @app.cell