Spaces:

marimo-team
/

motherduck-embeddings-visualizer

Sleeping

App Files Community

mylessss committed on Dec 10, 2024

Commit

5be1c1d

1 Parent(s): 1505be8

update

Browse files

Files changed (1) hide show

app.py +23 -17

app.py CHANGED Viewed

@@ -23,6 +23,7 @@ app = marimo.App(width="medium")
 @app.cell
 def __():
     import marimo as mo
     return (mo,)
@@ -115,24 +116,23 @@ def __(mo):
         Text embeddings typically have hundreds of dimensions (512 in our case), making them impossible to visualize directly. We'll use two techniques to make them interpretable:
-        1. **Dimensionality Reduction**: UMAP will convert our 512D vectors into 2D points while preserving relationships between texts
-        2. **Clustering**: HDBSCAN will group similar texts together
         """
     )
     return
 @app.cell(hide_code=True)
-def __(cluster_points, mo, umap_reduce):
     def md_help(cls):
         import inspect
         return f"def {cls.__name__} {inspect.signature(cls)}:\n    {cls.__doc__}"
     mo.accordion(
         {
-            "`umap_reduce`": md_help(umap_reduce),
             "`cluster_points`": md_help(cluster_points),
         }
     )
@@ -141,11 +141,12 @@ def __(cluster_points, mo, umap_reduce):
 @app.cell
 def __(np):
-    def umap_reduce(np_array, metric="cosine"):
         """
-        Reduce the dimensionality of the embeddings to 2D using
-        UMAP algorithm. UMAP preserves both local and global structure
-        of the high-dimensional data.
         """
         import umap
@@ -157,12 +158,13 @@ def __(np):
         )
         return reducer.fit_transform(np_array)
     def cluster_points(np_array, min_cluster_size=4, max_cluster_size=50):
         """
-        Cluster the embeddings using HDBSCAN algorithm.
-        We first reduce dimensionality to 50D with PCA to speed up clustering,
-        while still preserving most of the important information.
         """
         import hdbscan
         from sklearn.decomposition import PCA
@@ -179,7 +181,8 @@ def __(np):
         return np.where(
             hdb.labels_ == -1, "outlier", "cluster_" + hdb.labels_.astype(str)
         )
-    return cluster_points, umap_reduce
 @app.cell
@@ -207,7 +210,7 @@ def __(mo):
         r"""
         ## Processing the Data
-        Now we'll transform our high-dimensional embeddings into something we can visualize, using `umap_reduce` and `cluster_points`. More details on this step [in the blog](https://motherduck.com/blog/MotherDuck-Visualize-Embeddings-Marimo/).
         """
     )
     return
@@ -220,7 +223,7 @@ def __(
     embeddings,
     metric_dropdown,
     mo,
-    umap_reduce,
 ):
     with mo.status.spinner("Clustering points...") as _s:
         import numba
@@ -232,7 +235,9 @@ def __(
             max_cluster_size=cluster_size_slider.value[1],
         )
         _s.update("Reducing dimensionality...")
-        embeddings_2d = umap_reduce(embeddings_array, metric=metric_dropdown.value)
     mo.show_code()
     return embeddings_2d, embeddings_array, hdb_labels, numba
@@ -326,6 +331,7 @@ def __():
     # ML tools for dimensionality reduction and clustering
     import numpy as np
     return alt, duckdb, np, pl, pyarrow

 @app.cell
 def __():
     import marimo as mo
     return (mo,)
         Text embeddings typically have hundreds of dimensions (512 in our case), making them impossible to visualize directly. We'll use two techniques to make them interpretable:
+        1. **Dimensionality Reduction**: Convert our 512D vectors into 2D points while preserving relationships between texts
+        2. **Clustering**: Group similar texts together into clusters
         """
     )
     return
 @app.cell(hide_code=True)
+def __(cluster_points, mo, reduce_dimensions):
     def md_help(cls):
         import inspect
         return f"def {cls.__name__} {inspect.signature(cls)}:\n    {cls.__doc__}"
     mo.accordion(
         {
+            "`reduce_dimensions`": md_help(reduce_dimensions),
             "`cluster_points`": md_help(cluster_points),
         }
     )
 @app.cell
 def __(np):
+    def reduce_dimensions(np_array, metric="cosine"):
         """
+        Reduce the dimensions of embeddings to a 2D space.
+        Here we use the UMAP algorithm. UMAP preserves both local and
+        global structure of the high-dimensional data.
         """
         import umap
         )
         return reducer.fit_transform(np_array)
     def cluster_points(np_array, min_cluster_size=4, max_cluster_size=50):
         """
+        Cluster the embeddings.
+        Here we use the HDBSCAN algorithm. We first reduce dimensionality to 50D with
+        PCA to speed up clustering, while still preserving most of the important information.
         """
         import hdbscan
         from sklearn.decomposition import PCA
         return np.where(
             hdb.labels_ == -1, "outlier", "cluster_" + hdb.labels_.astype(str)
         )
+    return cluster_points, reduce_dimensions
 @app.cell
         r"""
         ## Processing the Data
+        Now we'll transform our high-dimensional embeddings into something we can visualize, using `reduce_dimensions` and `cluster_points`. More details on this step [in the blog](https://motherduck.com/blog/MotherDuck-Visualize-Embeddings-Marimo/).
         """
     )
     return
     embeddings,
     metric_dropdown,
     mo,
+    reduce_dimensions,
 ):
     with mo.status.spinner("Clustering points...") as _s:
         import numba
             max_cluster_size=cluster_size_slider.value[1],
         )
         _s.update("Reducing dimensionality...")
+        embeddings_2d = reduce_dimensions(
+            embeddings_array, metric=metric_dropdown.value
+        )
     mo.show_code()
     return embeddings_2d, embeddings_array, hdb_labels, numba
     # ML tools for dimensionality reduction and clustering
     import numpy as np
     return alt, duckdb, np, pl, pyarrow