Spaces:
Sleeping
Sleeping
Deploy weighted clustering presets with 11 physics perspectives
Browse files- app.py +106 -224
- material_universe_cache/plotly_studio_export.csv +2 -2
- material_universe_cache/umap_2d_balanced.npy +3 -0
- material_universe_cache/umap_2d_chemical.npy +3 -0
- material_universe_cache/umap_2d_coord_energy.npy +3 -0
- material_universe_cache/umap_2d_electronic.npy +3 -0
- material_universe_cache/umap_2d_esen_only.npy +3 -0
- material_universe_cache/umap_2d_mechanochem.npy +3 -0
- material_universe_cache/umap_2d_mm_only.npy +3 -0
- material_universe_cache/umap_2d_ofm_only.npy +3 -0
- material_universe_cache/umap_2d_orb_only.npy +3 -0
- material_universe_cache/umap_2d_stability.npy +3 -0
- material_universe_cache/umap_2d_structural.npy +3 -0
app.py
CHANGED
|
@@ -16,7 +16,6 @@ import argparse
|
|
| 16 |
import json
|
| 17 |
import os
|
| 18 |
import sys
|
| 19 |
-
from functools import lru_cache
|
| 20 |
from pathlib import Path
|
| 21 |
|
| 22 |
from dotenv import load_dotenv
|
|
@@ -56,8 +55,8 @@ DASH_I18N = {
|
|
| 56 |
),
|
| 57 |
"c1_title": "Band gap distribution over material families",
|
| 58 |
"c2_title": "Compare band gap distribution of material types",
|
| 59 |
-
"c3_title": "
|
| 60 |
-
"c3_desc": "
|
| 61 |
"c4_title": "Look up materials by chemical family",
|
| 62 |
"c5_title": "Band gap by material family and material type",
|
| 63 |
"c6_title": "Show the top N material families",
|
|
@@ -101,9 +100,8 @@ DASH_I18N = {
|
|
| 101 |
"chart_type_avg_cluster": "{type} — Avg Band Gap by Cluster",
|
| 102 |
"chart_pca_2d": "PCA 2D Projection",
|
| 103 |
"chart_pca_3d": "PCA 3D Projection",
|
| 104 |
-
"
|
| 105 |
-
"
|
| 106 |
-
"lbl_weighted_pc3": "Weighted PC3 (Physics-Adaptive)",
|
| 107 |
"chart_variance": "Explained Variance (%)",
|
| 108 |
"chart_component": "Component",
|
| 109 |
"chart_variance_pct": "Variance (%)",
|
|
@@ -159,8 +157,8 @@ DASH_I18N = {
|
|
| 159 |
),
|
| 160 |
"c1_title": "材料ファミリーごとのバンドギャップ分布",
|
| 161 |
"c2_title": "材料タイプ別バンドギャップ分布の比較",
|
| 162 |
-
"c3_title": "
|
| 163 |
-
"c3_desc": "
|
| 164 |
"c4_title": "化学ファミリーで材料を検索",
|
| 165 |
"c5_title": "材料ファミリーと材料タイプ別バンドギャップ",
|
| 166 |
"c6_title": "上位N材料ファミリーを表示",
|
|
@@ -204,9 +202,8 @@ DASH_I18N = {
|
|
| 204 |
"chart_type_avg_cluster": "{type} — クラスター別平均バンドギャップ",
|
| 205 |
"chart_pca_2d": "PCA 2D射影",
|
| 206 |
"chart_pca_3d": "PCA 3D射影",
|
| 207 |
-
"
|
| 208 |
-
"
|
| 209 |
-
"lbl_weighted_pc3": "重み付きPC3(物理適応軸)",
|
| 210 |
"chart_variance": "寄与率(%)",
|
| 211 |
"chart_component": "成分",
|
| 212 |
"chart_variance_pct": "寄与率(%)",
|
|
@@ -292,35 +289,7 @@ df = pd.read_csv(CSV)
|
|
| 292 |
DIM_COLS = [f"dim_{i}" for i in range(32)]
|
| 293 |
ALL_TYPES = ["Metallic", "Semiconductor", "Insulator"]
|
| 294 |
|
| 295 |
-
# Load raw embedding vectors for runtime weighted PCA
|
| 296 |
-
print("Loading raw embedding vectors for weighted PCA...")
|
| 297 |
-
VECTORS_PATH = CACHE / "vectors.npy"
|
| 298 |
-
ESEN_PATH = CACHE / "esen_vectors.npy"
|
| 299 |
-
|
| 300 |
-
if VECTORS_PATH.exists() and ESEN_PATH.exists():
|
| 301 |
-
vectors = np.load(VECTORS_PATH).astype(np.float32) # (33973, 2738)
|
| 302 |
-
esen = np.load(ESEN_PATH).astype(np.float32) # (33973, 128)
|
| 303 |
-
|
| 304 |
-
# Split into embedding spaces
|
| 305 |
-
RAW_EMBEDDINGS = {
|
| 306 |
-
"orb": vectors[:, :1792], # (33973, 1792)
|
| 307 |
-
"mm": vectors[:, 1792:2550], # (33973, 758)
|
| 308 |
-
"ofm": vectors[:, 2550:2738], # (33973, 188)
|
| 309 |
-
"esen": esen, # (33973, 128)
|
| 310 |
-
}
|
| 311 |
-
|
| 312 |
-
mem_mb = sum(v.nbytes for v in RAW_EMBEDDINGS.values()) / (1024 * 1024)
|
| 313 |
-
print(f" Loaded raw vectors: {mem_mb:.1f} MB in memory")
|
| 314 |
-
print(f" orb: {RAW_EMBEDDINGS['orb'].shape}")
|
| 315 |
-
print(f" mm: {RAW_EMBEDDINGS['mm'].shape}")
|
| 316 |
-
print(f" ofm: {RAW_EMBEDDINGS['ofm'].shape}")
|
| 317 |
-
print(f" esen: {RAW_EMBEDDINGS['esen'].shape}")
|
| 318 |
-
else:
|
| 319 |
-
print(" WARNING: Raw vectors not found. Runtime PCA disabled.")
|
| 320 |
-
RAW_EMBEDDINGS = None
|
| 321 |
-
|
| 322 |
# Compute PCA on balanced 32D UMAP for variance bar chart
|
| 323 |
-
# Chart 3 scatter will use runtime weighted PCA instead
|
| 324 |
X = StandardScaler().fit_transform(df[DIM_COLS].values)
|
| 325 |
pca_full = PCA(n_components=20, random_state=42).fit(X)
|
| 326 |
pc_all = pca_full.transform(X) # (33973, 20)
|
|
@@ -418,76 +387,20 @@ if S_ORB is not None and S_MM is not None and S_OFM is not None and S_ESEN is no
|
|
| 418 |
else:
|
| 419 |
print(" WARNING: Centroid similarity matrices not loaded. Custom mode unavailable.")
|
| 420 |
|
| 421 |
-
# ──
|
| 422 |
-
|
| 423 |
-
|
| 424 |
-
|
| 425 |
-
|
| 426 |
-
|
| 427 |
-
|
| 428 |
-
embedding vectors, then compute PCA on the weighted space. Guarantees
|
| 429 |
-
that PC0, PC1 capture maximum variance in the weighted combination.
|
| 430 |
-
|
| 431 |
-
Args:
|
| 432 |
-
weights: dict with keys 'orb', 'mm', 'ofm', 'esen' (normalized to sum=1)
|
| 433 |
-
n_dims: int, 2 for 2D scatter, 3 for 3D scatter
|
| 434 |
-
|
| 435 |
-
Returns:
|
| 436 |
-
np.ndarray of shape (33973, n_dims) - PCA coordinates
|
| 437 |
-
|
| 438 |
-
Performance: ~500ms (StandardScaler + PCA on 33,973 × 2,866 matrix)
|
| 439 |
-
"""
|
| 440 |
-
if RAW_EMBEDDINGS is None:
|
| 441 |
-
# Fallback: return zeros if raw vectors not loaded
|
| 442 |
-
print("WARNING: RAW_EMBEDDINGS not available, returning zeros")
|
| 443 |
-
return np.zeros((len(df), n_dims))
|
| 444 |
-
|
| 445 |
-
# Weight-concatenate raw embedding vectors
|
| 446 |
-
# Scale by sqrt(dim) to give equal importance per dimension, not per space
|
| 447 |
-
weighted_vectors = np.concatenate([
|
| 448 |
-
weights["orb"] * RAW_EMBEDDINGS["orb"] / np.sqrt(1792),
|
| 449 |
-
weights["mm"] * RAW_EMBEDDINGS["mm"] / np.sqrt(758),
|
| 450 |
-
weights["ofm"] * RAW_EMBEDDINGS["ofm"] / np.sqrt(188),
|
| 451 |
-
weights["esen"] * RAW_EMBEDDINGS["esen"] / np.sqrt(128),
|
| 452 |
-
], axis=1) # Shape: (33973, 2866)
|
| 453 |
-
|
| 454 |
-
# Standardize and compute PCA
|
| 455 |
-
# StandardScaler ensures each dimension has mean=0, std=1
|
| 456 |
-
scaler = StandardScaler()
|
| 457 |
-
scaled = scaler.fit_transform(weighted_vectors)
|
| 458 |
-
|
| 459 |
-
# PCA to extract top n_dims components
|
| 460 |
-
pca = PCA(n_components=n_dims, random_state=42)
|
| 461 |
-
pca_coords = pca.fit_transform(scaled) # Shape: (33973, n_dims)
|
| 462 |
-
|
| 463 |
-
# Log explained variance for debugging
|
| 464 |
-
explained_var = pca.explained_variance_ratio_.sum() * 100
|
| 465 |
-
print(f" Weighted PCA: {explained_var:.1f}% variance explained (n_dims={n_dims})")
|
| 466 |
-
|
| 467 |
-
return pca_coords
|
| 468 |
-
|
| 469 |
-
|
| 470 |
-
@lru_cache(maxsize=128)
|
| 471 |
-
def compute_weighted_pca_cached(weights_tuple, n_dims):
|
| 472 |
-
"""
|
| 473 |
-
Cached wrapper for compute_weighted_pca.
|
| 474 |
-
|
| 475 |
-
Args:
|
| 476 |
-
weights_tuple: tuple (w_orb, w_mm, w_ofm, w_esen) - must be hashable
|
| 477 |
-
n_dims: int
|
| 478 |
|
| 479 |
-
|
| 480 |
-
|
| 481 |
-
|
| 482 |
-
|
| 483 |
-
weights = {
|
| 484 |
-
"orb": weights_tuple[0],
|
| 485 |
-
"mm": weights_tuple[1],
|
| 486 |
-
"ofm": weights_tuple[2],
|
| 487 |
-
"esen": weights_tuple[3],
|
| 488 |
-
}
|
| 489 |
|
| 490 |
-
|
| 491 |
|
| 492 |
# ── Label resolution ───────────────────────────────────────────────────
|
| 493 |
|
|
@@ -700,10 +613,6 @@ app.layout = html.Div(
|
|
| 700 |
dcc.Store(id="active-labels",
|
| 701 |
data={"mode": "preset", "key": "balanced"}),
|
| 702 |
|
| 703 |
-
# ── Debouncing for weight sliders ──────────────────────────────
|
| 704 |
-
dcc.Store(id="weight-buffer", data=None),
|
| 705 |
-
dcc.Interval(id="debounce-interval", interval=300, max_intervals=0),
|
| 706 |
-
|
| 707 |
# ── Clustering control panel ────────────────────────────────────
|
| 708 |
html.Div(style={**CARD, "borderLeft": "4px solid #1976d2"}, children=[
|
| 709 |
html.H3(T["lbl_cluster_mode"],
|
|
@@ -769,11 +678,21 @@ app.layout = html.Div(
|
|
| 769 |
html.Div(style={"display": "flex", "gap": "16px",
|
| 770 |
"flexWrap": "wrap", "marginBottom": "12px"}, children=[
|
| 771 |
html.Div([
|
| 772 |
-
html.Div(T["
|
| 773 |
-
dcc.Dropdown(
|
| 774 |
-
|
| 775 |
-
|
| 776 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 777 |
html.Div([
|
| 778 |
html.Div(T["lbl_color_by"], style=LABEL),
|
| 779 |
dcc.Dropdown(id="c3-color", options=color_by_options,
|
|
@@ -806,20 +725,12 @@ app.layout = html.Div(
|
|
| 806 |
]),
|
| 807 |
html.Div(style={"display": "flex", "gap": "16px"}, children=[
|
| 808 |
html.Div(
|
| 809 |
-
dcc.
|
| 810 |
-
id="loading-c3-scatter",
|
| 811 |
-
type="circle",
|
| 812 |
-
children=dcc.Graph(id="c3-scatter")
|
| 813 |
-
),
|
| 814 |
id="c3-scatter-container",
|
| 815 |
style={"flex": "1", "minWidth": 0}
|
| 816 |
),
|
| 817 |
html.Div(
|
| 818 |
-
dcc.
|
| 819 |
-
id="loading-c3-variance",
|
| 820 |
-
type="circle",
|
| 821 |
-
children=dcc.Graph(id="c3-variance")
|
| 822 |
-
),
|
| 823 |
id="c3-variance-container",
|
| 824 |
style={"flex": "1", "minWidth": 0, "display": "none"}
|
| 825 |
),
|
|
@@ -1059,50 +970,6 @@ app.layout = html.Div(
|
|
| 1059 |
# CALLBACKS
|
| 1060 |
# ══════════════════════════════════════════════════════════════════════════
|
| 1061 |
|
| 1062 |
-
# ── Buffer weight changes (instant, no PCA) ──────────────────────────
|
| 1063 |
-
@callback(
|
| 1064 |
-
Output("weight-buffer", "data"),
|
| 1065 |
-
Output("debounce-interval", "max_intervals"),
|
| 1066 |
-
Input("cw-orb", "value"),
|
| 1067 |
-
Input("cw-mm", "value"),
|
| 1068 |
-
Input("cw-ofm", "value"),
|
| 1069 |
-
Input("cw-esen", "value"),
|
| 1070 |
-
State("debounce-interval", "n_intervals"),
|
| 1071 |
-
)
|
| 1072 |
-
def buffer_weights(w_orb, w_mm, w_ofm, w_esen, n_intervals):
|
| 1073 |
-
"""Store weight changes immediately without triggering expensive PCA."""
|
| 1074 |
-
import time
|
| 1075 |
-
weights = [w_orb or 0.25, w_mm or 0.25, w_ofm or 0.25, w_esen or 0.25]
|
| 1076 |
-
timestamp = time.time()
|
| 1077 |
-
|
| 1078 |
-
# Reset interval counter to restart debounce timer
|
| 1079 |
-
# max_intervals=1 means interval will fire once after 300ms
|
| 1080 |
-
return {"weights": weights, "timestamp": timestamp}, 1
|
| 1081 |
-
|
| 1082 |
-
|
| 1083 |
-
# ── Apply debounced weights (after 300ms of no changes) ─────────────
|
| 1084 |
-
@callback(
|
| 1085 |
-
Output("active-labels", "data", allow_duplicate=True),
|
| 1086 |
-
Input("debounce-interval", "n_intervals"),
|
| 1087 |
-
State("weight-buffer", "data"),
|
| 1088 |
-
State("cluster-mode", "value"),
|
| 1089 |
-
prevent_initial_call=True,
|
| 1090 |
-
)
|
| 1091 |
-
def apply_debounced_weights(n_intervals, buffer, mode):
|
| 1092 |
-
"""Apply weights after debounce delay (300ms of no slider changes)."""
|
| 1093 |
-
if mode != "custom" or not buffer:
|
| 1094 |
-
from dash.exceptions import PreventUpdate
|
| 1095 |
-
raise PreventUpdate
|
| 1096 |
-
|
| 1097 |
-
w = buffer["weights"]
|
| 1098 |
-
total = sum(w)
|
| 1099 |
-
if total > 0:
|
| 1100 |
-
w = [x / total for x in w]
|
| 1101 |
-
else:
|
| 1102 |
-
w = [0.25, 0.25, 0.25, 0.25]
|
| 1103 |
-
|
| 1104 |
-
return {"mode": "custom", "weights": w}
|
| 1105 |
-
|
| 1106 |
|
| 1107 |
# ── Active labels computation (preset mode or initial) ────────────────
|
| 1108 |
@callback(
|
|
@@ -1216,40 +1083,22 @@ def chart3(ndim, color_by, filter_type, topn_str, active_data):
|
|
| 1216 |
clusters, families, displays = resolve_labels(active_data)
|
| 1217 |
work = df.assign(Cluster=clusters, Family=families, FamilyDisplay=displays)
|
| 1218 |
|
| 1219 |
-
# =====
|
| 1220 |
-
|
| 1221 |
-
|
| 1222 |
-
preset_key =
|
| 1223 |
-
|
| 1224 |
-
|
| 1225 |
-
|
| 1226 |
-
|
| 1227 |
-
# Fallback to balanced
|
| 1228 |
-
weights_tuple = (0.25, 0.25, 0.25, 0.25)
|
| 1229 |
-
|
| 1230 |
-
# Normalize weights
|
| 1231 |
-
w_sum = sum(weights_tuple)
|
| 1232 |
-
if w_sum > 0:
|
| 1233 |
-
weights = {
|
| 1234 |
-
"orb": weights_tuple[0] / w_sum,
|
| 1235 |
-
"mm": weights_tuple[1] / w_sum,
|
| 1236 |
-
"ofm": weights_tuple[2] / w_sum,
|
| 1237 |
-
"esen": weights_tuple[3] / w_sum,
|
| 1238 |
-
}
|
| 1239 |
-
else:
|
| 1240 |
-
weights = {"orb": 0.25, "mm": 0.25, "ofm": 0.25, "esen": 0.25}
|
| 1241 |
|
| 1242 |
-
|
| 1243 |
-
|
| 1244 |
-
weights_normalized = (weights["orb"], weights["mm"], weights["ofm"], weights["esen"])
|
| 1245 |
-
pca_coords = compute_weighted_pca_cached(weights_normalized, n_dims=n_dims)
|
| 1246 |
|
| 1247 |
-
|
| 1248 |
-
|
| 1249 |
-
|
| 1250 |
-
|
| 1251 |
-
if n_dims == 3:
|
| 1252 |
-
work["PC3_dynamic"] = pca_coords[:, 2]
|
| 1253 |
|
| 1254 |
sub = work if filter_type == "All" else work[work["Type"] == filter_type]
|
| 1255 |
|
|
@@ -1265,28 +1114,40 @@ def chart3(ndim, color_by, filter_type, topn_str, active_data):
|
|
| 1265 |
sub["Cluster"] = sub["Cluster"].astype(str)
|
| 1266 |
color = "Cluster"
|
| 1267 |
|
| 1268 |
-
|
| 1269 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1270 |
color=color, template="plotly_white",
|
| 1271 |
title=T["chart_pca_3d"],
|
| 1272 |
opacity=0.6, height=600)
|
| 1273 |
fig1.update_traces(marker_size=2)
|
| 1274 |
-
|
| 1275 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1276 |
color=color, template="plotly_white",
|
| 1277 |
title=T["chart_pca_2d"],
|
| 1278 |
render_mode="webgl", opacity=0.6, height=600)
|
| 1279 |
fig1.update_traces(marker_size=3)
|
| 1280 |
-
|
| 1281 |
-
|
| 1282 |
-
|
| 1283 |
-
|
| 1284 |
-
if ndim == "3D":
|
| 1285 |
-
fig1.update_layout(scene=dict(
|
| 1286 |
-
xaxis_title=T.get("lbl_weighted_pc1", "Weighted PC1"),
|
| 1287 |
-
yaxis_title=T.get("lbl_weighted_pc2", "Weighted PC2"),
|
| 1288 |
-
zaxis_title=T.get("lbl_weighted_pc3", "Weighted PC3")
|
| 1289 |
-
))
|
| 1290 |
|
| 1291 |
fig1.update_layout(legend=dict(font=dict(size=9)))
|
| 1292 |
|
|
@@ -1307,20 +1168,41 @@ def chart3(ndim, color_by, filter_type, topn_str, active_data):
|
|
| 1307 |
Output("c3-variance-container", "style"),
|
| 1308 |
Output("c3-var-toggle", "children"),
|
| 1309 |
Input("c3-var-toggle", "n_clicks"),
|
| 1310 |
-
|
| 1311 |
)
|
| 1312 |
-
def toggle_variance(n):
|
| 1313 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1314 |
return (
|
| 1315 |
{"flex": "2", "minWidth": 0},
|
| 1316 |
{"flex": "1", "minWidth": 0},
|
| 1317 |
T["chart_variance"] + " \u25c0",
|
| 1318 |
)
|
| 1319 |
-
|
| 1320 |
-
|
| 1321 |
-
|
| 1322 |
-
|
| 1323 |
-
|
|
|
|
| 1324 |
|
| 1325 |
|
| 1326 |
# ── 4 Lookup table ────────────────────────────────────────────────────────
|
|
|
|
| 16 |
import json
|
| 17 |
import os
|
| 18 |
import sys
|
|
|
|
| 19 |
from pathlib import Path
|
| 20 |
|
| 21 |
from dotenv import load_dotenv
|
|
|
|
| 55 |
),
|
| 56 |
"c1_title": "Band gap distribution over material families",
|
| 57 |
"c2_title": "Compare band gap distribution of material types",
|
| 58 |
+
"c3_title": "Material Embedding Space Visualization",
|
| 59 |
+
"c3_desc": "Visualize via supervised UMAP (preset mode) or PCA (custom/fallback).",
|
| 60 |
"c4_title": "Look up materials by chemical family",
|
| 61 |
"c5_title": "Band gap by material family and material type",
|
| 62 |
"c6_title": "Show the top N material families",
|
|
|
|
| 100 |
"chart_type_avg_cluster": "{type} — Avg Band Gap by Cluster",
|
| 101 |
"chart_pca_2d": "PCA 2D Projection",
|
| 102 |
"chart_pca_3d": "PCA 3D Projection",
|
| 103 |
+
"chart_umap_2d": "UMAP 2D Projection (Supervised)",
|
| 104 |
+
"lbl_projection": "Projection Method",
|
|
|
|
| 105 |
"chart_variance": "Explained Variance (%)",
|
| 106 |
"chart_component": "Component",
|
| 107 |
"chart_variance_pct": "Variance (%)",
|
|
|
|
| 157 |
),
|
| 158 |
"c1_title": "材料ファミリーごとのバンドギャップ分布",
|
| 159 |
"c2_title": "材料タイプ別バンドギャップ分布の比較",
|
| 160 |
+
"c3_title": "材料埋め込み空間の可視化",
|
| 161 |
+
"c3_desc": "教師付きUMAP(プリセット)またはPCA(カスタム/フォールバック)で可視化。",
|
| 162 |
"c4_title": "化学ファミリーで材料を検索",
|
| 163 |
"c5_title": "材料ファミリーと材料タイプ別バンドギャップ",
|
| 164 |
"c6_title": "上位N材料ファミリーを表示",
|
|
|
|
| 202 |
"chart_type_avg_cluster": "{type} — クラスター別平均バンドギャップ",
|
| 203 |
"chart_pca_2d": "PCA 2D射影",
|
| 204 |
"chart_pca_3d": "PCA 3D射影",
|
| 205 |
+
"chart_umap_2d": "UMAP 2D射影(教師付き)",
|
| 206 |
+
"lbl_projection": "射影法",
|
|
|
|
| 207 |
"chart_variance": "寄与率(%)",
|
| 208 |
"chart_component": "成分",
|
| 209 |
"chart_variance_pct": "寄与率(%)",
|
|
|
|
| 289 |
DIM_COLS = [f"dim_{i}" for i in range(32)]
|
| 290 |
ALL_TYPES = ["Metallic", "Semiconductor", "Insulator"]
|
| 291 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 292 |
# Compute PCA on balanced 32D UMAP for variance bar chart
|
|
|
|
| 293 |
X = StandardScaler().fit_transform(df[DIM_COLS].values)
|
| 294 |
pca_full = PCA(n_components=20, random_state=42).fit(X)
|
| 295 |
pc_all = pca_full.transform(X) # (33973, 20)
|
|
|
|
| 387 |
else:
|
| 388 |
print(" WARNING: Centroid similarity matrices not loaded. Custom mode unavailable.")
|
| 389 |
|
| 390 |
+
# ── Load per-preset UMAP 2D coordinates ────────────────────────────────
|
| 391 |
+
print("Loading per-preset UMAP 2D projections...")
|
| 392 |
+
PRESET_UMAP_2D = {}
|
| 393 |
+
for key in PRESET_KEYS:
|
| 394 |
+
umap_path = CACHE / f"umap_2d_{key}.npy"
|
| 395 |
+
if umap_path.exists():
|
| 396 |
+
PRESET_UMAP_2D[key] = np.load(umap_path).astype(np.float32)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 397 |
|
| 398 |
+
if PRESET_UMAP_2D:
|
| 399 |
+
print(f" Loaded {len(PRESET_UMAP_2D)} UMAP 2D projections (~{sum(v.nbytes for v in PRESET_UMAP_2D.values()) / (1024*1024):.1f} MB)")
|
| 400 |
+
else:
|
| 401 |
+
print(" WARNING: No UMAP 2D files found. Will use PCA fallback.")
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 402 |
|
| 403 |
+
HAS_UMAP = bool(PRESET_UMAP_2D)
|
| 404 |
|
| 405 |
# ── Label resolution ───────────────────────────────────────────────────
|
| 406 |
|
|
|
|
| 613 |
dcc.Store(id="active-labels",
|
| 614 |
data={"mode": "preset", "key": "balanced"}),
|
| 615 |
|
|
|
|
|
|
|
|
|
|
|
|
|
| 616 |
# ── Clustering control panel ────────────────────────────────────
|
| 617 |
html.Div(style={**CARD, "borderLeft": "4px solid #1976d2"}, children=[
|
| 618 |
html.H3(T["lbl_cluster_mode"],
|
|
|
|
| 678 |
html.Div(style={"display": "flex", "gap": "16px",
|
| 679 |
"flexWrap": "wrap", "marginBottom": "12px"}, children=[
|
| 680 |
html.Div([
|
| 681 |
+
html.Div(T["lbl_projection"], style=LABEL),
|
| 682 |
+
dcc.Dropdown(
|
| 683 |
+
id="c3-ndim",
|
| 684 |
+
options=(
|
| 685 |
+
[{"label": "UMAP 2D", "value": "UMAP_2D"}]
|
| 686 |
+
if HAS_UMAP else []
|
| 687 |
+
) + [
|
| 688 |
+
{"label": "PCA 2D", "value": "PCA_2D"},
|
| 689 |
+
{"label": "PCA 3D", "value": "PCA_3D"},
|
| 690 |
+
],
|
| 691 |
+
value="UMAP_2D" if HAS_UMAP else "PCA_2D",
|
| 692 |
+
clearable=False,
|
| 693 |
+
style={"width": "150px"}
|
| 694 |
+
),
|
| 695 |
+
], style={"display": "flex", "gap": "8px", "alignItems": "center"}),
|
| 696 |
html.Div([
|
| 697 |
html.Div(T["lbl_color_by"], style=LABEL),
|
| 698 |
dcc.Dropdown(id="c3-color", options=color_by_options,
|
|
|
|
| 725 |
]),
|
| 726 |
html.Div(style={"display": "flex", "gap": "16px"}, children=[
|
| 727 |
html.Div(
|
| 728 |
+
dcc.Graph(id="c3-scatter"),
|
|
|
|
|
|
|
|
|
|
|
|
|
| 729 |
id="c3-scatter-container",
|
| 730 |
style={"flex": "1", "minWidth": 0}
|
| 731 |
),
|
| 732 |
html.Div(
|
| 733 |
+
dcc.Graph(id="c3-variance"),
|
|
|
|
|
|
|
|
|
|
|
|
|
| 734 |
id="c3-variance-container",
|
| 735 |
style={"flex": "1", "minWidth": 0, "display": "none"}
|
| 736 |
),
|
|
|
|
| 970 |
# CALLBACKS
|
| 971 |
# ══════════════════════════════════════════════════════════════════════════
|
| 972 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 973 |
|
| 974 |
# ── Active labels computation (preset mode or initial) ────────────────
|
| 975 |
@callback(
|
|
|
|
| 1083 |
clusters, families, displays = resolve_labels(active_data)
|
| 1084 |
work = df.assign(Cluster=clusters, Family=families, FamilyDisplay=displays)
|
| 1085 |
|
| 1086 |
+
# ===== Assign UMAP 2D coordinates if using UMAP mode =====
|
| 1087 |
+
if ndim == "UMAP_2D" and HAS_UMAP:
|
| 1088 |
+
# Determine which preset's UMAP to use
|
| 1089 |
+
preset_key = "balanced" # Default fallback
|
| 1090 |
+
|
| 1091 |
+
if active_data and active_data.get("mode") == "preset":
|
| 1092 |
+
preset_key = active_data.get("key", "balanced")
|
| 1093 |
+
# Custom mode: use balanced UMAP for fixed layout
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1094 |
|
| 1095 |
+
# Load the appropriate UMAP coordinates
|
| 1096 |
+
umap_2d = PRESET_UMAP_2D.get(preset_key, PRESET_UMAP_2D.get("balanced"))
|
|
|
|
|
|
|
| 1097 |
|
| 1098 |
+
# Assign to DataFrame
|
| 1099 |
+
work = work.copy()
|
| 1100 |
+
work["UMAP_X"] = umap_2d[:, 0]
|
| 1101 |
+
work["UMAP_Y"] = umap_2d[:, 1]
|
|
|
|
|
|
|
| 1102 |
|
| 1103 |
sub = work if filter_type == "All" else work[work["Type"] == filter_type]
|
| 1104 |
|
|
|
|
| 1114 |
sub["Cluster"] = sub["Cluster"].astype(str)
|
| 1115 |
color = "Cluster"
|
| 1116 |
|
| 1117 |
+
# ===== Generate scatter plot based on projection mode =====
|
| 1118 |
+
if ndim == "UMAP_2D":
|
| 1119 |
+
fig1 = px.scatter(
|
| 1120 |
+
sub, x="UMAP_X", y="UMAP_Y",
|
| 1121 |
+
color=color, template="plotly_white",
|
| 1122 |
+
title=T["chart_umap_2d"],
|
| 1123 |
+
render_mode="webgl", opacity=0.6, height=600
|
| 1124 |
+
)
|
| 1125 |
+
fig1.update_traces(marker_size=3)
|
| 1126 |
+
fig1.update_layout(
|
| 1127 |
+
xaxis_title="UMAP 1",
|
| 1128 |
+
yaxis_title="UMAP 2"
|
| 1129 |
+
)
|
| 1130 |
+
elif ndim == "PCA_3D":
|
| 1131 |
+
fig1 = px.scatter_3d(sub, x="PC1", y="PC2", z="PC3",
|
| 1132 |
color=color, template="plotly_white",
|
| 1133 |
title=T["chart_pca_3d"],
|
| 1134 |
opacity=0.6, height=600)
|
| 1135 |
fig1.update_traces(marker_size=2)
|
| 1136 |
+
fig1.update_layout(scene=dict(
|
| 1137 |
+
xaxis_title="PC1",
|
| 1138 |
+
yaxis_title="PC2",
|
| 1139 |
+
zaxis_title="PC3"
|
| 1140 |
+
))
|
| 1141 |
+
else: # PCA_2D
|
| 1142 |
+
fig1 = px.scatter(sub, x="PC1", y="PC2",
|
| 1143 |
color=color, template="plotly_white",
|
| 1144 |
title=T["chart_pca_2d"],
|
| 1145 |
render_mode="webgl", opacity=0.6, height=600)
|
| 1146 |
fig1.update_traces(marker_size=3)
|
| 1147 |
+
fig1.update_layout(
|
| 1148 |
+
xaxis_title="PC1",
|
| 1149 |
+
yaxis_title="PC2"
|
| 1150 |
+
)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1151 |
|
| 1152 |
fig1.update_layout(legend=dict(font=dict(size=9)))
|
| 1153 |
|
|
|
|
| 1168 |
Output("c3-variance-container", "style"),
|
| 1169 |
Output("c3-var-toggle", "children"),
|
| 1170 |
Input("c3-var-toggle", "n_clicks"),
|
| 1171 |
+
Input("c3-ndim", "value"),
|
| 1172 |
)
|
| 1173 |
+
def toggle_variance(n, ndim):
|
| 1174 |
+
# Hide variance in UMAP mode (not meaningful for UMAP)
|
| 1175 |
+
if ndim == "UMAP_2D":
|
| 1176 |
+
return (
|
| 1177 |
+
{"flex": "1", "minWidth": 0},
|
| 1178 |
+
{"flex": "1", "minWidth": 0, "display": "none"},
|
| 1179 |
+
T["chart_variance"] + " \u25b6",
|
| 1180 |
+
)
|
| 1181 |
+
|
| 1182 |
+
# Existing toggle logic for PCA modes
|
| 1183 |
+
ctx = dash.callback_context
|
| 1184 |
+
if not ctx.triggered or ctx.triggered[0]["prop_id"] == "c3-ndim.value":
|
| 1185 |
+
# Initial state or mode change: hide variance
|
| 1186 |
+
return (
|
| 1187 |
+
{"flex": "1", "minWidth": 0},
|
| 1188 |
+
{"flex": "1", "minWidth": 0, "display": "none"},
|
| 1189 |
+
T["chart_variance"] + " \u25b6",
|
| 1190 |
+
)
|
| 1191 |
+
|
| 1192 |
+
# Toggle on click
|
| 1193 |
+
is_hidden = (n or 0) % 2 == 0
|
| 1194 |
+
if is_hidden:
|
| 1195 |
return (
|
| 1196 |
{"flex": "2", "minWidth": 0},
|
| 1197 |
{"flex": "1", "minWidth": 0},
|
| 1198 |
T["chart_variance"] + " \u25c0",
|
| 1199 |
)
|
| 1200 |
+
else:
|
| 1201 |
+
return (
|
| 1202 |
+
{"flex": "1", "minWidth": 0},
|
| 1203 |
+
{"flex": "1", "minWidth": 0, "display": "none"},
|
| 1204 |
+
T["chart_variance"] + " \u25b6",
|
| 1205 |
+
)
|
| 1206 |
|
| 1207 |
|
| 1208 |
# ── 4 Lookup table ────────────────────────────────────────────────────────
|
material_universe_cache/plotly_studio_export.csv
CHANGED
|
@@ -1,3 +1,3 @@
|
|
| 1 |
version https://git-lfs.github.com/spec/v1
|
| 2 |
-
oid sha256:
|
| 3 |
-
size
|
|
|
|
| 1 |
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:28d5a2616fc8dcd9469c0303f64b4eb61bb8c9bb07c8f614281076a93e87113e
|
| 3 |
+
size 13630877
|
material_universe_cache/umap_2d_balanced.npy
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:677b4fcdc6bd77071d7d8cc0608690f07cfbd34fceb44b75cf149a1ee375c5e9
|
| 3 |
+
size 271912
|
material_universe_cache/umap_2d_chemical.npy
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:3d1f19ef7c3de86c290a95a9795e3da53ceaec3ac7376ceb8b18953e80d2f952
|
| 3 |
+
size 271912
|
material_universe_cache/umap_2d_coord_energy.npy
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:69df0550f20b0afff1f02c81fa216dc07b808ea22b9a2e98e4d441db4a692b89
|
| 3 |
+
size 271912
|
material_universe_cache/umap_2d_electronic.npy
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:473c16cea222156875cc5200b74fc21f835b7438c79b605901a6b6b023047b95
|
| 3 |
+
size 271912
|
material_universe_cache/umap_2d_esen_only.npy
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:ccd018765d5116fe982881e69fde569cabd1ca72ab5e418948e6dbb703d142fa
|
| 3 |
+
size 271912
|
material_universe_cache/umap_2d_mechanochem.npy
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:607a5d46e9bb74c38cdf1e87648e4bf2414811a6d56a9cc63d318ca73b10e93a
|
| 3 |
+
size 271912
|
material_universe_cache/umap_2d_mm_only.npy
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:990370651b0f798c0271428535b6dc09003138cd1ff7685146f65202aaecab40
|
| 3 |
+
size 271912
|
material_universe_cache/umap_2d_ofm_only.npy
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:80133c0680f7028b2007745f1da99cb47cfceaee942f3e2b076a016ee2cdf0d9
|
| 3 |
+
size 271912
|
material_universe_cache/umap_2d_orb_only.npy
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:65be8b7d5e99d94f6f7b07bb9c5df689a5a9ccb2e18c58f2f6e52e2e0b471f39
|
| 3 |
+
size 271912
|
material_universe_cache/umap_2d_stability.npy
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:4653177a621333abe5bf0d219c915f1e6552ab9718da0e0921a74839346c991d
|
| 3 |
+
size 271912
|
material_universe_cache/umap_2d_structural.npy
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:32874494dd779ca1674029c00db116144748118b44c14faab82686f93e518b87
|
| 3 |
+
size 271912
|