update
Browse files
app.py
CHANGED
|
@@ -23,6 +23,7 @@ app = marimo.App(width="medium")
|
|
| 23 |
@app.cell
|
| 24 |
def __():
|
| 25 |
import marimo as mo
|
|
|
|
| 26 |
return (mo,)
|
| 27 |
|
| 28 |
|
|
@@ -115,24 +116,23 @@ def __(mo):
|
|
| 115 |
|
| 116 |
Text embeddings typically have hundreds of dimensions (512 in our case), making them impossible to visualize directly. We'll use two techniques to make them interpretable:
|
| 117 |
|
| 118 |
-
1. **Dimensionality Reduction**:
|
| 119 |
-
2. **Clustering**:
|
| 120 |
"""
|
| 121 |
)
|
| 122 |
return
|
| 123 |
|
| 124 |
|
| 125 |
@app.cell(hide_code=True)
|
| 126 |
-
def __(cluster_points, mo,
|
| 127 |
def md_help(cls):
|
| 128 |
import inspect
|
| 129 |
|
| 130 |
return f"def {cls.__name__} {inspect.signature(cls)}:\n {cls.__doc__}"
|
| 131 |
|
| 132 |
-
|
| 133 |
mo.accordion(
|
| 134 |
{
|
| 135 |
-
"`
|
| 136 |
"`cluster_points`": md_help(cluster_points),
|
| 137 |
}
|
| 138 |
)
|
|
@@ -141,11 +141,12 @@ def __(cluster_points, mo, umap_reduce):
|
|
| 141 |
|
| 142 |
@app.cell
|
| 143 |
def __(np):
|
| 144 |
-
def
|
| 145 |
"""
|
| 146 |
-
Reduce the
|
| 147 |
-
|
| 148 |
-
|
|
|
|
| 149 |
"""
|
| 150 |
import umap
|
| 151 |
|
|
@@ -157,12 +158,13 @@ def __(np):
|
|
| 157 |
)
|
| 158 |
return reducer.fit_transform(np_array)
|
| 159 |
|
| 160 |
-
|
| 161 |
def cluster_points(np_array, min_cluster_size=4, max_cluster_size=50):
|
| 162 |
"""
|
| 163 |
-
Cluster the embeddings
|
| 164 |
-
|
| 165 |
-
|
|
|
|
|
|
|
| 166 |
"""
|
| 167 |
import hdbscan
|
| 168 |
from sklearn.decomposition import PCA
|
|
@@ -179,7 +181,8 @@ def __(np):
|
|
| 179 |
return np.where(
|
| 180 |
hdb.labels_ == -1, "outlier", "cluster_" + hdb.labels_.astype(str)
|
| 181 |
)
|
| 182 |
-
|
|
|
|
| 183 |
|
| 184 |
|
| 185 |
@app.cell
|
|
@@ -207,7 +210,7 @@ def __(mo):
|
|
| 207 |
r"""
|
| 208 |
## Processing the Data
|
| 209 |
|
| 210 |
-
Now we'll transform our high-dimensional embeddings into something we can visualize, using `
|
| 211 |
"""
|
| 212 |
)
|
| 213 |
return
|
|
@@ -220,7 +223,7 @@ def __(
|
|
| 220 |
embeddings,
|
| 221 |
metric_dropdown,
|
| 222 |
mo,
|
| 223 |
-
|
| 224 |
):
|
| 225 |
with mo.status.spinner("Clustering points...") as _s:
|
| 226 |
import numba
|
|
@@ -232,7 +235,9 @@ def __(
|
|
| 232 |
max_cluster_size=cluster_size_slider.value[1],
|
| 233 |
)
|
| 234 |
_s.update("Reducing dimensionality...")
|
| 235 |
-
embeddings_2d =
|
|
|
|
|
|
|
| 236 |
mo.show_code()
|
| 237 |
return embeddings_2d, embeddings_array, hdb_labels, numba
|
| 238 |
|
|
@@ -326,6 +331,7 @@ def __():
|
|
| 326 |
|
| 327 |
# ML tools for dimensionality reduction and clustering
|
| 328 |
import numpy as np
|
|
|
|
| 329 |
return alt, duckdb, np, pl, pyarrow
|
| 330 |
|
| 331 |
|
|
|
|
| 23 |
@app.cell
|
| 24 |
def __():
|
| 25 |
import marimo as mo
|
| 26 |
+
|
| 27 |
return (mo,)
|
| 28 |
|
| 29 |
|
|
|
|
| 116 |
|
| 117 |
Text embeddings typically have hundreds of dimensions (512 in our case), making them impossible to visualize directly. We'll use two techniques to make them interpretable:
|
| 118 |
|
| 119 |
+
1. **Dimensionality Reduction**: Convert our 512D vectors into 2D points while preserving relationships between texts
|
| 120 |
+
2. **Clustering**: Group similar texts together into clusters
|
| 121 |
"""
|
| 122 |
)
|
| 123 |
return
|
| 124 |
|
| 125 |
|
| 126 |
@app.cell(hide_code=True)
|
| 127 |
+
def __(cluster_points, mo, reduce_dimensions):
|
| 128 |
def md_help(cls):
|
| 129 |
import inspect
|
| 130 |
|
| 131 |
return f"def {cls.__name__} {inspect.signature(cls)}:\n {cls.__doc__}"
|
| 132 |
|
|
|
|
| 133 |
mo.accordion(
|
| 134 |
{
|
| 135 |
+
"`reduce_dimensions`": md_help(reduce_dimensions),
|
| 136 |
"`cluster_points`": md_help(cluster_points),
|
| 137 |
}
|
| 138 |
)
|
|
|
|
| 141 |
|
| 142 |
@app.cell
|
| 143 |
def __(np):
|
| 144 |
+
def reduce_dimensions(np_array, metric="cosine"):
|
| 145 |
"""
|
| 146 |
+
Reduce the dimensions of embeddings to a 2D space.
|
| 147 |
+
|
| 148 |
+
Here we use the UMAP algorithm. UMAP preserves both local and
|
| 149 |
+
global structure of the high-dimensional data.
|
| 150 |
"""
|
| 151 |
import umap
|
| 152 |
|
|
|
|
| 158 |
)
|
| 159 |
return reducer.fit_transform(np_array)
|
| 160 |
|
|
|
|
| 161 |
def cluster_points(np_array, min_cluster_size=4, max_cluster_size=50):
|
| 162 |
"""
|
| 163 |
+
Cluster the embeddings.
|
| 164 |
+
|
| 165 |
+
|
| 166 |
+
Here we use the HDBSCAN algorithm. We first reduce dimensionality to 50D with
|
| 167 |
+
PCA to speed up clustering, while still preserving most of the important information.
|
| 168 |
"""
|
| 169 |
import hdbscan
|
| 170 |
from sklearn.decomposition import PCA
|
|
|
|
| 181 |
return np.where(
|
| 182 |
hdb.labels_ == -1, "outlier", "cluster_" + hdb.labels_.astype(str)
|
| 183 |
)
|
| 184 |
+
|
| 185 |
+
return cluster_points, reduce_dimensions
|
| 186 |
|
| 187 |
|
| 188 |
@app.cell
|
|
|
|
| 210 |
r"""
|
| 211 |
## Processing the Data
|
| 212 |
|
| 213 |
+
Now we'll transform our high-dimensional embeddings into something we can visualize, using `reduce_dimensions` and `cluster_points`. More details on this step [in the blog](https://motherduck.com/blog/MotherDuck-Visualize-Embeddings-Marimo/).
|
| 214 |
"""
|
| 215 |
)
|
| 216 |
return
|
|
|
|
| 223 |
embeddings,
|
| 224 |
metric_dropdown,
|
| 225 |
mo,
|
| 226 |
+
reduce_dimensions,
|
| 227 |
):
|
| 228 |
with mo.status.spinner("Clustering points...") as _s:
|
| 229 |
import numba
|
|
|
|
| 235 |
max_cluster_size=cluster_size_slider.value[1],
|
| 236 |
)
|
| 237 |
_s.update("Reducing dimensionality...")
|
| 238 |
+
embeddings_2d = reduce_dimensions(
|
| 239 |
+
embeddings_array, metric=metric_dropdown.value
|
| 240 |
+
)
|
| 241 |
mo.show_code()
|
| 242 |
return embeddings_2d, embeddings_array, hdb_labels, numba
|
| 243 |
|
|
|
|
| 331 |
|
| 332 |
# ML tools for dimensionality reduction and clustering
|
| 333 |
import numpy as np
|
| 334 |
+
|
| 335 |
return alt, duckdb, np, pl, pyarrow
|
| 336 |
|
| 337 |
|