Hafnium49 committed on
Commit
2e8f8a3
·
verified ·
1 Parent(s): 80dcc5d

Deploy weighted-clustering-presets: 11 presets, custom weights, C7 search, 32D PCA

Browse files
Dockerfile CHANGED
@@ -7,7 +7,11 @@ RUN pip install --no-cache-dir -r requirements-hf.txt
7
 
8
  COPY app.py .
9
  COPY scripts/apply_labels_to_map.py scripts/
 
10
  COPY material_universe_cache/plotly_studio_export.csv material_universe_cache/
 
 
 
11
 
12
  ENV HOST=0.0.0.0
13
  ENV PORT=7860
 
7
 
8
  COPY app.py .
9
  COPY scripts/apply_labels_to_map.py scripts/
10
+ COPY search/ search/
11
  COPY material_universe_cache/plotly_studio_export.csv material_universe_cache/
12
+ COPY material_universe_cache/cluster_labels_*.npy material_universe_cache/
13
+ COPY material_universe_cache/centroid_sim_*.npy material_universe_cache/
14
+ COPY docs/cluster_definitions_presets.json docs/
15
 
16
  ENV HOST=0.0.0.0
17
  ENV PORT=7860
app.py CHANGED
@@ -1,7 +1,9 @@
1
  """
2
  Materials Database Explorer — Dash 4.0
3
 
4
- Reproduces the Plotly-Studio-generated dashboard with 6 interactive charts.
 
 
5
  Supports English and Japanese localization.
6
 
7
  Usage:
@@ -11,12 +13,18 @@ Usage:
11
  """
12
 
13
  import argparse
 
14
  import os
15
  import sys
 
 
 
 
16
 
 
17
  import pandas as pd
18
  import plotly.express as px
19
- from dash import Dash, dcc, html, Input, Output, callback
20
  from sklearn.decomposition import PCA
21
  from sklearn.preprocessing import StandardScaler
22
 
@@ -48,7 +56,7 @@ DASH_I18N = {
48
  "c1_title": "Band gap distribution over material families",
49
  "c2_title": "Compare band gap distribution of material types",
50
  "c3_title": "Principal Component Analysis (PCA) for high-dimensional material embeddings",
51
- "c3_desc": "Reduces 64-dimensional material embeddings to 2D/3D to reveal clustering patterns.",
52
  "c4_title": "Look up materials by chemical family",
53
  "c5_title": "Band gap by material family and material type",
54
  "c6_title": "Show the top N material families",
@@ -97,6 +105,46 @@ DASH_I18N = {
97
  "chart_variance_pct": "Variance (%)",
98
  "chart_top_n": "Top {n} Families by {metric}",
99
  "colorscale": "Colorscale: ",
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
100
  },
101
  "ja": {
102
  "app_title": "材料データベースエクスプローラー",
@@ -108,7 +156,7 @@ DASH_I18N = {
108
  "c1_title": "材料ファミリーごとのバンドギャップ分布",
109
  "c2_title": "材料タイプ別バンドギャップ分布の比較",
110
  "c3_title": "高次元材料埋め込みの主成分分析(PCA)",
111
- "c3_desc": "64次元の材料埋め込みを2D/3Dに次元削減し、クラスタリングパターンを可視化。",
112
  "c4_title": "化学ファミリーで材料を検索",
113
  "c5_title": "材料ファミリーと材料タイプ別バンドギャップ",
114
  "c6_title": "上位N材料ファミリーを表示",
@@ -157,6 +205,45 @@ DASH_I18N = {
157
  "chart_variance_pct": "寄与率(%)",
158
  "chart_top_n": "{metric}上位{n}ファミリー",
159
  "colorscale": "カラースケール: ",
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
160
  },
161
  }
162
 
@@ -169,7 +256,6 @@ TYPE_DISPLAY = {
169
  "Semiconductor": MT["type_semiconductor"],
170
  "Insulator": MT["type_insulator"],
171
  }
172
- FAMILY_DISPLAY = FAMILY_TRANSLATIONS_JA if LANG == "ja" else {}
173
 
174
  # Aggregation display mapping (label shown in chart titles)
175
  AGG_DISPLAY = {
@@ -187,21 +273,16 @@ METRIC_DISPLAY = {
187
  }
188
 
189
 
190
- def family_label(en_name):
191
- """Translate family name using FAMILY_TRANSLATIONS_JA (verbatim)."""
192
- return FAMILY_DISPLAY.get(en_name, en_name)
193
-
194
-
195
  def type_label(en_name):
196
  """Translate electronic type using MAP_TRANSLATIONS (verbatim)."""
197
  return TYPE_DISPLAY.get(en_name, en_name)
198
 
199
 
200
  # ── data ──────────────────────────────────────────────────────────────────
201
- CSV = "material_universe_cache/plotly_studio_export.csv"
 
202
  df = pd.read_csv(CSV)
203
- DIM_COLS = [f"dim_{i}" for i in range(64)]
204
- ALL_FAMILIES = sorted(df["Family"].unique())
205
  ALL_TYPES = ["Metallic", "Semiconductor", "Insulator"]
206
 
207
  # Pre-compute PCA (expensive, do once)
@@ -214,10 +295,156 @@ df["PC2"] = pc_all[:, 1]
214
  df["PC3"] = pc_all[:, 2]
215
  print("PCA done.")
216
 
217
- # Build display columns for charts
218
- df["FamilyDisplay"] = df["Family_JA"] if LANG == "ja" else df["Family"]
219
  df["TypeDisplay"] = df["Type"].map(TYPE_DISPLAY)
220
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
221
  # ── CSS ───────────────────────────────────────────────────────────────────
222
  CARD = {
223
  "background": "white",
@@ -233,7 +460,7 @@ BODY_FONT = (
233
  '"Hiragino Sans", "Noto Sans JP", sans-serif'
234
  )
235
 
236
- # ── helpers ───────────────────────────────────────────────────────────────
237
  AGG_MAP = {
238
  "Average": "mean", "Median": "median", "Max": "max",
239
  "Min": "min", "Sum": "sum", "Count": "count",
@@ -258,7 +485,19 @@ def value_to_color(val, vmin, vmax):
258
 
259
 
260
  # ── dropdown option builders ─────────────────────────────────────────────
261
- family_options = [{"label": family_label(f), "value": f} for f in ALL_FAMILIES]
 
 
 
 
 
 
 
 
 
 
 
 
262
  type_options = [{"label": type_label(t), "value": t} for t in ALL_TYPES]
263
 
264
  agg_options = [
@@ -337,6 +576,67 @@ app.layout = html.Div(
337
  style={"color": "#666", "maxWidth": "900px",
338
  "marginBottom": "32px"}),
339
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
340
  # ── 1 PCA ───────────────────────────────────────────────────────
341
  html.Div(style=CARD, children=[
342
  html.H3(T["c3_title"]),
@@ -410,7 +710,7 @@ app.layout = html.Div(
410
  html.Div([
411
  html.Div(T["lbl_families"], style=LABEL),
412
  dcc.Dropdown(id="c1-families", options=family_options,
413
- value=ALL_FAMILIES, multi=True,
414
  style={"width": "500px"}),
415
  ]),
416
  ]),
@@ -513,7 +813,7 @@ app.layout = html.Div(
513
  dcc.Graph(id="c6-graph"),
514
  ]),
515
 
516
- # ── 6 Lookup table ──────────────────────────────────────────────
517
  html.Div(style=CARD, children=[
518
  html.H3(T["c4_title"]),
519
  html.Div(style={"display": "flex", "gap": "16px",
@@ -547,6 +847,75 @@ app.layout = html.Div(
547
  ]),
548
  html.Div(id="c4-table"),
549
  ]),
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
550
  ],
551
  )
552
 
@@ -554,20 +923,73 @@ app.layout = html.Div(
554
  # CALLBACKS
555
  # ══════════════════════════════════════════════════════════════════════════
556
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
557
  # ── 1 Band gap distribution ──────────────────────────────────────────────
558
  @callback(
559
  Output("c1-graph", "figure"),
560
  Input("c1-agg", "value"),
561
  Input("c1-type", "value"),
562
  Input("c1-families", "value"),
 
563
  )
564
- def chart1(agg, chart_type, families):
565
- sub = df[df["Family"].isin(families)] if families else df
 
 
566
  grouped = sub.groupby("Family")["BandGap"].agg(AGG_MAP[agg]).reset_index()
567
  grouped.columns = ["Family", "BandGap"]
568
  grouped = grouped.sort_values("BandGap")
569
- # Translate family names for display
570
- grouped["FamilyDisplay"] = grouped["Family"].map(family_label)
 
571
  fn = px.bar if chart_type == "Bar" else px.line
572
  agg_display = AGG_DISPLAY.get(agg, agg)
573
  fig = fn(grouped, x="FamilyDisplay", y="BandGap",
@@ -585,12 +1007,15 @@ def chart1(agg, chart_type, families):
585
  Input("c2-left-type", "value"),
586
  Input("c2-right-type", "value"),
587
  Input("c2-yaxis", "value"),
 
588
  )
589
- def chart2(left_type, right_type, yaxis_mode):
 
 
590
  figs = []
591
  y_max = 0
592
  for mat_type in [left_type, right_type]:
593
- sub = df[df["Type"] == mat_type]
594
  grouped = (sub.groupby("Cluster")["BandGap"].mean()
595
  .reset_index().sort_values("Cluster"))
596
  type_disp = type_label(mat_type)
@@ -613,9 +1038,12 @@ def chart2(left_type, right_type, yaxis_mode):
613
  Input("c3-color", "value"),
614
  Input("c3-filter", "value"),
615
  Input("c3-topn", "value"),
 
616
  )
617
- def chart3(ndim, color_by, filter_type, topn_str):
618
- sub = df if filter_type == "All" else df[df["Type"] == filter_type]
 
 
619
 
620
  # Determine color column — use display columns for translated labels
621
  if color_by == "None":
@@ -694,9 +1122,11 @@ COL_HEADERS = {
694
  Input("c4-type", "value"),
695
  Input("c4-sort", "value"),
696
  Input("c4-limit", "value"),
 
697
  )
698
- def chart4(family, mat_type, sort_col, limit):
699
- sub = df.copy()
 
700
  if family != "All":
701
  sub = sub[sub["Family"] == family]
702
  if mat_type != "All":
@@ -720,7 +1150,8 @@ def chart4(family, mat_type, sort_col, limit):
720
  if c == "BandGap":
721
  val = f"{val:.3f}"
722
  elif c == "Family":
723
- val = family_label(str(val))
 
724
  elif c == "Type":
725
  val = type_label(str(val))
726
  else:
@@ -743,11 +1174,18 @@ def chart4(family, mat_type, sort_col, limit):
743
  Input("c5-agg", "value"),
744
  Input("c5-color", "value"),
745
  Input("c5-sort", "value"),
 
746
  )
747
- def chart5(row_dim, col_dim, val_col, agg_name, color_mode, sort_mode):
 
 
 
 
 
 
748
  agg_fn = AGG_MAP[agg_name]
749
- pivot = df.pivot_table(index=row_dim, columns=col_dim,
750
- values=val_col, aggfunc=agg_fn)
751
 
752
  # Sort
753
  if "Rows" in sort_mode:
@@ -780,12 +1218,11 @@ def chart5(row_dim, col_dim, val_col, agg_name, color_mode, sort_mode):
780
  })
781
  )
782
 
783
- # Translate column headers
784
  def translate_pivot_label(val, dim):
785
- """Translate pivot row/column labels."""
786
  s = str(val)
787
  if dim == "Family":
788
- return family_label(s)
789
  if dim == "Type":
790
  return type_label(s)
791
  return s
@@ -852,18 +1289,21 @@ def chart5(row_dim, col_dim, val_col, agg_name, color_mode, sort_mode):
852
  Output("c6-graph", "figure"),
853
  Input("c6-n", "value"),
854
  Input("c6-metric", "value"),
 
855
  )
856
- def chart6(n, metric):
 
 
 
857
  if metric == "Count":
858
- grouped = df.groupby("Family").size().reset_index(name="Value")
859
  elif metric == "Average BandGap":
860
- grouped = df.groupby("Family")["BandGap"].mean().reset_index(name="Value")
861
  else:
862
- grouped = df.groupby("Family")["BandGap"].max().reset_index(name="Value")
863
  grouped = grouped.nlargest(n, "Value")
864
  grouped = grouped.sort_values("Value")
865
- # Translate family names for display
866
- grouped["FamilyDisplay"] = grouped["Family"].map(family_label)
867
  metric_display = METRIC_DISPLAY.get(metric, metric)
868
  fig = px.bar(grouped, y="FamilyDisplay", x="Value", orientation="h",
869
  title=T["chart_top_n"].format(n=n, metric=metric_display),
@@ -873,6 +1313,119 @@ def chart6(n, metric):
873
  return fig
874
 
875
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
876
  # ── run ───────────────────────────────────────────────────────────────────
877
  if __name__ == "__main__":
878
  host = os.environ.get("HOST", "127.0.0.1")
 
1
  """
2
  Materials Database Explorer — Dash 4.0
3
 
4
+ Reproduces the Plotly-Studio-generated dashboard with 7 interactive charts
5
+ plus weighted clustering presets (Centroid Similarity Decomposition).
6
+
7
  Supports English and Japanese localization.
8
 
9
  Usage:
 
13
  """
14
 
15
  import argparse
16
+ import json
17
  import os
18
  import sys
19
+ from pathlib import Path
20
+
21
+ from dotenv import load_dotenv
22
+ load_dotenv()
23
 
24
+ import numpy as np
25
  import pandas as pd
26
  import plotly.express as px
27
+ from dash import Dash, dcc, html, Input, Output, State, callback
28
  from sklearn.decomposition import PCA
29
  from sklearn.preprocessing import StandardScaler
30
 
 
56
  "c1_title": "Band gap distribution over material families",
57
  "c2_title": "Compare band gap distribution of material types",
58
  "c3_title": "Principal Component Analysis (PCA) for high-dimensional material embeddings",
59
+ "c3_desc": "Reduces 32-dimensional material embeddings to 2D/3D to reveal clustering patterns.",
60
  "c4_title": "Look up materials by chemical family",
61
  "c5_title": "Band gap by material family and material type",
62
  "c6_title": "Show the top N material families",
 
105
  "chart_variance_pct": "Variance (%)",
106
  "chart_top_n": "Top {n} Families by {metric}",
107
  "colorscale": "Colorscale: ",
108
+ "c7_title": "Similar Materials Search",
109
+ "c7_desc": (
110
+ "Find materials with similar physical behavior using weighted "
111
+ "cosine similarity fusion across 4 embedding spaces "
112
+ "(Orb-v3, MEGNet, OFM, eSEN)."
113
+ ),
114
+ "c7_lbl_material": "Query material",
115
+ "c7_lbl_material_ph": "Search by formula or MP_ID...",
116
+ "c7_lbl_topk": "Results",
117
+ "c7_lbl_weights": "Vector weights",
118
+ "c7_btn_search": "Search",
119
+ "c7_status_ready": "Select a material and click Search",
120
+ "c7_status_no_qdrant": "Vector database unavailable: {error}",
121
+ "c7_status_not_found": "Material not found in vector database",
122
+ "c7_col_rank": "Rank",
123
+ "c7_col_formula": "Formula",
124
+ "c7_col_mpid": "MP_ID",
125
+ "c7_col_bandgap": "BandGap (eV)",
126
+ "c7_col_orb": "Orb-v3",
127
+ "c7_col_lmm": "MEGNet",
128
+ "c7_col_lofm": "OFM",
129
+ "c7_col_esen": "eSEN",
130
+ "c7_col_weighted": "Weighted",
131
+ # Clustering mode (B5)
132
+ "lbl_cluster_mode": "Clustering Mode",
133
+ "mode_preset": "Named Presets",
134
+ "mode_custom": "Custom Weights",
135
+ "lbl_perspective": "Clustering Perspective",
136
+ "lbl_cluster_weights": "Embedding space weights",
137
+ "preset_balanced": "Balanced (Equal Weights)",
138
+ "preset_orb_only": "Orb-Only (Force Field)",
139
+ "preset_mm_only": "MEGNet-Only (Coordination)",
140
+ "preset_ofm_only": "OFM-Only (Orbital)",
141
+ "preset_esen_only": "eSEN-Only (Energy)",
142
+ "preset_stability": "Stability Focus",
143
+ "preset_electronic": "Electronic Focus",
144
+ "preset_structural": "Structural Focus",
145
+ "preset_chemical": "Chemical Focus",
146
+ "preset_coord_energy": "Coordination Energy",
147
+ "preset_mechanochem": "Mechanochemical",
148
  },
149
  "ja": {
150
  "app_title": "材料データベースエクスプローラー",
 
156
  "c1_title": "材料ファミリーごとのバンドギャップ分布",
157
  "c2_title": "材料タイプ別バンドギャップ分布の比較",
158
  "c3_title": "高次元材料埋め込みの主成分分析(PCA)",
159
+ "c3_desc": "32次元の材料埋め込みを2D/3Dに次元削減し、クラスタリングパターンを可視化。",
160
  "c4_title": "化学ファミリーで材料を検索",
161
  "c5_title": "材料ファミリーと材料タイプ別バンドギャップ",
162
  "c6_title": "上位N材料ファミリーを表示",
 
205
  "chart_variance_pct": "寄与率(%)",
206
  "chart_top_n": "{metric}上位{n}ファミリー",
207
  "colorscale": "カラースケール: ",
208
+ "c7_title": "類似材料検索",
209
+ "c7_desc": (
210
+ "4つの埋め込み空間(Orb-v3、MEGNet、OFM、eSEN)を用いた"
211
+ "重み付きコサイン類似度融合により、物理的挙動が類似した材料を検索。"
212
+ ),
213
+ "c7_lbl_material": "クエリ材料",
214
+ "c7_lbl_material_ph": "化学式またはMP_IDで検索...",
215
+ "c7_lbl_topk": "結果数",
216
+ "c7_lbl_weights": "ベクトル重み",
217
+ "c7_btn_search": "検索",
218
+ "c7_status_ready": "材料を選択し検索をクリック",
219
+ "c7_status_no_qdrant": "ベクトルDB接続不可: {error}",
220
+ "c7_status_not_found": "ベクトルDBに材料が見つかりません",
221
+ "c7_col_rank": "順位",
222
+ "c7_col_formula": "化学式",
223
+ "c7_col_mpid": "MP_ID",
224
+ "c7_col_bandgap": "BandGap (eV)",
225
+ "c7_col_orb": "Orb-v3",
226
+ "c7_col_lmm": "MEGNet",
227
+ "c7_col_lofm": "OFM",
228
+ "c7_col_esen": "eSEN",
229
+ "c7_col_weighted": "重み付き",
230
+ # Clustering mode (B5)
231
+ "lbl_cluster_mode": "クラスタリングモード",
232
+ "mode_preset": "名前付きプリセット",
233
+ "mode_custom": "カスタム重み",
234
+ "lbl_perspective": "クラスタリング視点",
235
+ "lbl_cluster_weights": "埋め込み空間の重み",
236
+ "preset_balanced": "バランス(均等重み)",
237
+ "preset_orb_only": "Orb単独(力場)",
238
+ "preset_mm_only": "MEGNet単独(配位)",
239
+ "preset_ofm_only": "OFM単独(軌道)",
240
+ "preset_esen_only": "eSEN単独(エネルギー)",
241
+ "preset_stability": "安定性重視",
242
+ "preset_electronic": "電子構造重視",
243
+ "preset_structural": "構造重視",
244
+ "preset_chemical": "化学重視",
245
+ "preset_coord_energy": "配位エネルギー",
246
+ "preset_mechanochem": "力学化学",
247
  },
248
  }
249
 
 
256
  "Semiconductor": MT["type_semiconductor"],
257
  "Insulator": MT["type_insulator"],
258
  }
 
259
 
260
  # Aggregation display mapping (label shown in chart titles)
261
  AGG_DISPLAY = {
 
273
  }
274
 
275
 
 
 
 
 
 
276
  def type_label(en_name):
277
  """Translate electronic type using MAP_TRANSLATIONS (verbatim)."""
278
  return TYPE_DISPLAY.get(en_name, en_name)
279
 
280
 
281
  # ── data ──────────────────────────────────────────────────────────────────
282
+ CACHE = Path("material_universe_cache")
283
+ CSV = CACHE / "plotly_studio_export.csv"
284
  df = pd.read_csv(CSV)
285
+ DIM_COLS = [f"dim_{i}" for i in range(32)]
 
286
  ALL_TYPES = ["Metallic", "Semiconductor", "Insulator"]
287
 
288
  # Pre-compute PCA (expensive, do once)
 
295
  df["PC3"] = pc_all[:, 2]
296
  print("PCA done.")
297
 
298
+ # Build TypeDisplay column (static, never changes with clustering)
 
299
  df["TypeDisplay"] = df["Type"].map(TYPE_DISPLAY)
300
 
301
# C7 material dropdown options (formula + mp_id for display, mp_id as value).
# Built by zipping the two columns instead of DataFrame.iterrows(), which
# allocates a Series per row and is slow for a large catalog at import time.
# NOTE(review): assumes MP_ID and Formula are plain string columns, so the
# zipped values match what iterrows() produced — confirm against the CSV.
_c7_pairs = df[["MP_ID", "Formula"]].drop_duplicates()
c7_material_options = sorted(
    (
        {"label": f"{formula} ({mp_id})", "value": mp_id}
        for mp_id, formula in zip(_c7_pairs["MP_ID"], _c7_pairs["Formula"])
    ),
    key=lambda opt: opt["label"],
)
306
+
307
# ── Preset label data ──────────────────────────────────────────────────
# Keys of the clustering presets; each key maps to an optional
# cluster_labels_<key>.npy file in the cache directory.
PRESET_KEYS = [
    "balanced", "orb_only", "mm_only", "ofm_only", "esen_only",
    "stability", "electronic", "structural", "chemical",
    "coord_energy", "mechanochem",
]

# Per-preset label arrays and cluster-id -> family-name mappings (EN / JA).
PRESET_LABELS = {}
PRESET_FAMILIES_EN = {}
PRESET_FAMILIES_JA = {}

# Family name definitions; a missing JSON file simply yields empty mappings.
_families_path = Path("docs/cluster_definitions_presets.json")
if _families_path.exists():
    _families_data = json.loads(_families_path.read_text(encoding="utf-8"))
else:
    _families_data = {}

for preset_key in PRESET_KEYS:
    preset_file = CACHE / f"cluster_labels_{preset_key}.npy"
    if not preset_file.exists():
        # A preset without a label file is skipped, not fatal.
        print(f" WARNING: {preset_file} not found, preset '{preset_key}' unavailable")
        continue
    PRESET_LABELS[preset_key] = np.load(preset_file)
    PRESET_FAMILIES_EN[preset_key] = _families_data.get(preset_key, {})
    PRESET_FAMILIES_JA[preset_key] = _families_data.get(f"{preset_key}_ja", {})

# Balanced family maps (used as stable reference for custom mode)
BALANCED_FAMILY_EN = PRESET_FAMILIES_EN.get("balanced", {})
BALANCED_FAMILY_JA = PRESET_FAMILIES_JA.get("balanced", {})

# With no presets at all, label resolution falls back to the CSV columns.
if len(PRESET_LABELS) == 0:
    print(" WARNING: No preset labels loaded. Using static CSV clustering.")
342
+
343
# ── Centroid similarity matrices (for custom mode) ─────────────────────
# Load whichever centroid_sim_<space>.npy files exist; custom mode is only
# enabled when all four embedding spaces are present.
_sim_by_space = {}
for space in ("orb", "mm", "ofm", "esen"):
    matrix_file = CACHE / f"centroid_sim_{space}.npy"
    if matrix_file.exists():
        _sim_by_space[space] = np.load(matrix_file)

S_ORB = _sim_by_space.get("orb")
S_MM = _sim_by_space.get("mm")
S_OFM = _sim_by_space.get("ofm")
S_ESEN = _sim_by_space.get("esen")
CLUSTER_IDS = []
_sim_loaded = all(m is not None for m in (S_ORB, S_MM, S_OFM, S_ESEN))

if _sim_loaded:
    if "balanced" in PRESET_LABELS:
        # Infer cluster IDs from balanced labels; -1 is excluded.
        CLUSTER_IDS = sorted(set(PRESET_LABELS["balanced"]) - {-1})
    else:
        # No balanced labels: use one id per similarity-matrix column.
        CLUSTER_IDS = list(range(S_ORB.shape[1]))
    print(f"Centroid similarity loaded: {S_ORB.shape}, {len(CLUSTER_IDS)} clusters")
else:
    print(" WARNING: Centroid similarity matrices not loaded. Custom mode unavailable.")
371
+
372
+ # ── Label resolution ───────────────────────────────────────────────────
373
+
374
def _static_csv_labels():
    """Return (clusters, families, displays) from the CSV's own columns."""
    if "FamilyDisplay" in df.columns:
        displays = df["FamilyDisplay"]
    elif LANG == "ja" and "Family_JA" in df.columns:
        # NOTE(review): Family_JA is the column the previous revision used for
        # Japanese display names — confirm it is still shipped in the CSV.
        displays = df["Family_JA"]
    else:
        displays = df["Family"]
    return df["Cluster"], df["Family"], displays


def resolve_labels(active_data):
    """Resolve current clustering labels from the active-labels store data.

    Args:
        active_data: The ``active-labels`` store payload, either
            ``{"mode": "preset", "key": ...}`` or
            ``{"mode": "custom", "weights": [orb, mm, ofm, esen]}``;
            ``None`` selects the CSV fallback.

    Returns:
        ``(clusters, families, displays)`` as pd.Series aligned to df.index:
        - clusters: integer cluster IDs
        - families: English family names ("Unclassified" when unmapped)
        - displays: display family names (JA if LANG == "ja", else EN)

    The static CSV columns are returned when no preset labels are loaded,
    when neither the requested preset nor "balanced" is available, or when
    custom mode is requested without all similarity matrices.
    """
    if active_data is None or not PRESET_LABELS:
        return _static_csv_labels()

    mode = active_data.get("mode", "preset")

    if mode == "preset":
        key = active_data.get("key", "balanced")
        if key not in PRESET_LABELS:
            key = "balanced"
        if key not in PRESET_LABELS:
            # Other presets loaded but "balanced" itself is missing: fall back
            # to the CSV instead of raising KeyError.
            return _static_csv_labels()
        labels = PRESET_LABELS[key]
        fam_en = PRESET_FAMILIES_EN.get(key, {})
        fam_ja = PRESET_FAMILIES_JA.get(key, {})
    else:
        # Custom: weighted centroid similarity
        if not _sim_loaded:
            return _static_csv_labels()
        w = active_data.get("weights", [0.25, 0.25, 0.25, 0.25])
        S = w[0] * S_ORB + w[1] * S_MM + w[2] * S_OFM + w[3] * S_ESEN
        # Each row takes the cluster whose fused similarity is highest.
        label_indices = S.argmax(axis=1)
        labels = np.array([CLUSTER_IDS[i] for i in label_indices])
        # Custom mode reuses the balanced preset's family names.
        fam_en = BALANCED_FAMILY_EN
        fam_ja = BALANCED_FAMILY_JA

    clusters = pd.Series(labels, index=df.index)
    # Family maps are looked up by the string form of the cluster id.
    families = clusters.astype(str).map(fam_en).fillna("Unclassified")
    if LANG == "ja":
        # Missing Japanese names fall back to the English family name.
        displays = clusters.astype(str).map(fam_ja).fillna(families)
    else:
        displays = families

    return clusters, families, displays
414
+
415
+
416
def get_all_families(active_data):
    """Return the distinct family names of the current clustering, sorted."""
    family_series = resolve_labels(active_data)[1]
    names = list(family_series.unique())
    names.sort()
    return names
420
+
421
+
422
# ── Lazy Qdrant client (only connected when C7 is used) ─────────────────
# Module-level cache: at most one of the two is ever set.
_qdrant_client = None
_qdrant_error = None


def get_qdrant_client():
    """Lazy-init Qdrant client. Returns (client, error_message).

    Returns:
        ``(client, None)`` once a client has been created and the
        "crystal-chroma-fusion" collection lookup succeeded, otherwise
        ``(None, error_message)``. Both outcomes are cached, so a failure
        is not retried on later calls.
    """
    global _qdrant_client, _qdrant_error
    if _qdrant_client is not None:
        return _qdrant_client, None
    if _qdrant_error is not None:
        return None, _qdrant_error
    try:
        from qdrant_client import QdrantClient
        url = os.getenv("QDRANT_URL")
        key = os.getenv("QDRANT_API_KEY")
        if not url or not key:
            _qdrant_error = "QDRANT_URL / QDRANT_API_KEY not set"
            return None, _qdrant_error
        # Keep the client local until the probe below succeeds. Caching it
        # first would make later calls return an unverified client with no
        # error after a failed probe.
        client = QdrantClient(url=url, api_key=key, timeout=30)
        client.get_collection("crystal-chroma-fusion")
    except Exception as e:
        # Fall back to the exception type so the cached error is never empty.
        _qdrant_error = str(e) or type(e).__name__
        return None, _qdrant_error
    _qdrant_client = client
    return _qdrant_client, None
447
+
448
  # ── CSS ───────────────────────────────────────────────────────────────────
449
  CARD = {
450
  "background": "white",
 
460
  '"Hiragino Sans", "Noto Sans JP", sans-serif'
461
  )
462
 
463
+ # ── helpers ───────────────────────────────────────────────────────────
464
  AGG_MAP = {
465
  "Average": "mean", "Median": "median", "Max": "max",
466
  "Min": "min", "Sum": "sum", "Count": "count",
 
485
 
486
 
487
  # ── dropdown option builders ─────────────────────────────────────────────
488
+
489
# Preset dropdown options: one entry per preset whose labels were loaded,
# or a lone "balanced" entry when none were.
preset_options = [
    dict(label=T["preset_" + preset_key], value=preset_key)
    for preset_key in PRESET_KEYS
    if preset_key in PRESET_LABELS
] or [dict(label=T["preset_balanced"], value="balanced")]

# Initial family list from balanced preset (will be dynamically updated)
_init_families = get_all_families(dict(mode="preset", key="balanced"))
family_options = [dict(label=fam, value=fam) for fam in _init_families]
501
  type_options = [{"label": type_label(t), "value": t} for t in ALL_TYPES]
502
 
503
  agg_options = [
 
576
  style={"color": "#666", "maxWidth": "900px",
577
  "marginBottom": "32px"}),
578
 
579
+ # ── Active labels store ─────────────────────────────────────────
580
+ dcc.Store(id="active-labels",
581
+ data={"mode": "preset", "key": "balanced"}),
582
+
583
+ # ── Clustering control panel ────────────────────────────────────
584
+ html.Div(style={**CARD, "borderLeft": "4px solid #1976d2"}, children=[
585
+ html.H3(T["lbl_cluster_mode"],
586
+ style={"marginTop": "0", "marginBottom": "12px"}),
587
+ html.Div(style={"display": "flex", "gap": "24px",
588
+ "flexWrap": "wrap", "alignItems": "flex-start"},
589
+ children=[
590
+ # Mode selector
591
+ html.Div([
592
+ dcc.RadioItems(
593
+ id="cluster-mode",
594
+ options=[
595
+ {"label": T["mode_preset"], "value": "preset"},
596
+ {"label": T["mode_custom"], "value": "custom"},
597
+ ],
598
+ value="preset",
599
+ inline=True,
600
+ style={"fontSize": "14px"},
601
+ inputStyle={"marginRight": "6px"},
602
+ labelStyle={"marginRight": "20px"},
603
+ ),
604
+ ], style={"marginBottom": "8px"}),
605
+ ]),
606
+ # Preset dropdown
607
+ html.Div(id="preset-container", children=[
608
+ html.Div(T["lbl_perspective"], style=LABEL),
609
+ dcc.Dropdown(
610
+ id="perspective",
611
+ options=preset_options,
612
+ value="balanced",
613
+ clearable=False,
614
+ style={"width": "360px"},
615
+ ),
616
+ ], style={"marginTop": "8px"}),
617
+ # Custom weight sliders
618
+ html.Div(id="custom-container", children=[
619
+ html.Div(T["lbl_cluster_weights"],
620
+ style={**LABEL, "marginTop": "4px"}),
621
+ html.Div(style={"display": "flex", "gap": "24px",
622
+ "flexWrap": "wrap"}, children=[
623
+ html.Div([
624
+ html.Div(label, style={**LABEL, "fontSize": "12px"}),
625
+ dcc.Slider(
626
+ id=f"cw-{sid}", min=0, max=1, step=0.05,
627
+ value=0.25,
628
+ marks={0: "0", 0.5: "0.5", 1: "1"},
629
+ tooltip={"placement": "bottom"},
630
+ ),
631
+ ], style={"width": "180px"})
632
+ for sid, label in [
633
+ ("orb", "Orb-v3"), ("mm", "l-MM"),
634
+ ("ofm", "l-OFM"), ("esen", "eSEN"),
635
+ ]
636
+ ]),
637
+ ], style={"marginTop": "8px", "display": "none"}),
638
+ ]),
639
+
640
  # ── 1 PCA ───────────────────────────────────────────────────────
641
  html.Div(style=CARD, children=[
642
  html.H3(T["c3_title"]),
 
710
  html.Div([
711
  html.Div(T["lbl_families"], style=LABEL),
712
  dcc.Dropdown(id="c1-families", options=family_options,
713
+ value=_init_families, multi=True,
714
  style={"width": "500px"}),
715
  ]),
716
  ]),
 
813
  dcc.Graph(id="c6-graph"),
814
  ]),
815
 
816
+ # ── 7 Lookup table ──────────────────────────────────────────────
817
  html.Div(style=CARD, children=[
818
  html.H3(T["c4_title"]),
819
  html.Div(style={"display": "flex", "gap": "16px",
 
847
  ]),
848
  html.Div(id="c4-table"),
849
  ]),
850
+
851
+ # ── 8 Similar Materials Search ────────────────────────────────
852
+ html.Div(style=CARD, children=[
853
+ html.H3(T["c7_title"]),
854
+ html.P(T["c7_desc"],
855
+ style={"color": "#666", "fontSize": "14px"}),
856
+ # Row 1: material selector + top-K + search button
857
+ html.Div(style={"display": "flex", "gap": "16px",
858
+ "flexWrap": "wrap", "marginBottom": "12px",
859
+ "alignItems": "flex-end"}, children=[
860
+ html.Div([
861
+ html.Div(T["c7_lbl_material"], style=LABEL),
862
+ dcc.Dropdown(
863
+ id="c7-material",
864
+ options=c7_material_options,
865
+ placeholder=T["c7_lbl_material_ph"],
866
+ style={"width": "360px"},
867
+ searchable=True,
868
+ ),
869
+ ]),
870
+ html.Div([
871
+ html.Div(T["c7_lbl_topk"], style=LABEL),
872
+ dcc.Dropdown(
873
+ id="c7-topk",
874
+ options=[5, 10, 20, 50],
875
+ value=10,
876
+ clearable=False,
877
+ style={"width": "90px"},
878
+ ),
879
+ ]),
880
+ html.Button(
881
+ T["c7_btn_search"], id="c7-search-btn", n_clicks=0,
882
+ style={
883
+ "padding": "8px 24px", "background": "#1976d2",
884
+ "color": "white", "border": "none",
885
+ "borderRadius": "4px", "cursor": "pointer",
886
+ "fontWeight": "600", "fontSize": "14px",
887
+ "height": "36px",
888
+ },
889
+ ),
890
+ ]),
891
+ # Row 2: weight sliders
892
+ html.Div([
893
+ html.Div(T["c7_lbl_weights"], style=LABEL),
894
+ ], style={"marginBottom": "4px"}),
895
+ html.Div(style={"display": "flex", "gap": "24px",
896
+ "flexWrap": "wrap", "marginBottom": "16px"},
897
+ children=[
898
+ html.Div([
899
+ html.Div(label, style={**LABEL, "fontSize": "12px"}),
900
+ dcc.Slider(
901
+ id=f"c7-w-{sid}", min=0, max=1, step=0.05,
902
+ value=0.25,
903
+ marks={0: "0", 0.5: "0.5", 1: "1"},
904
+ tooltip={"placement": "bottom"},
905
+ ),
906
+ ], style={"width": "180px"})
907
+ for sid, label in [
908
+ ("orb", "Orb-v3 (1792d)"), ("lmm", "MEGNet (758d)"),
909
+ ("lofm", "OFM (188d)"), ("esen", "eSEN (128d)"),
910
+ ]
911
+ ]),
912
+ # Results area
913
+ html.Div(id="c7-status",
914
+ style={"color": "#666", "marginBottom": "8px",
915
+ "fontSize": "13px"},
916
+ children=T["c7_status_ready"]),
917
+ html.Div(id="c7-results"),
918
+ ]),
919
  ],
920
  )
921
 
 
923
  # CALLBACKS
924
  # ══════════════════════════════════════════════════════════════════════════
925
 
926
+ # ── Active labels computation ──────────────────────────────────────────
927
@callback(
    Output("active-labels", "data"),
    Input("cluster-mode", "value"),
    Input("perspective", "value"),
    Input("cw-orb", "value"),
    Input("cw-mm", "value"),
    Input("cw-ofm", "value"),
    Input("cw-esen", "value"),
)
def compute_labels(mode, preset, w_orb, w_mm, w_ofm, w_esen):
    """Build the ``active-labels`` store payload from the clustering controls.

    Args:
        mode: Value of the ``cluster-mode`` radio ("preset" or anything else,
            which is treated as custom).
        preset: Selected preset key; "balanced" is used when it is empty.
        w_orb, w_mm, w_ofm, w_esen: Custom-mode slider values.

    Returns:
        ``{"mode": "preset", "key": ...}`` in preset mode, otherwise
        ``{"mode": "custom", "weights": [orb, mm, ofm, esen]}`` with the
        weights normalized to sum to 1.
    """
    if mode == "preset":
        return {"mode": "preset", "key": preset or "balanced"}
    # Only a missing (None) slider value takes the 0.25 default. An explicit
    # 0 is kept so an embedding space can be switched off; ``w or 0.25``
    # silently turned 0 into 0.25.
    w = [0.25 if x is None else x for x in (w_orb, w_mm, w_ofm, w_esen)]
    total = sum(w)
    if total > 0:
        w = [x / total for x in w]
    else:
        # All sliders at 0: fall back to equal weights.
        w = [0.25, 0.25, 0.25, 0.25]
    return {"mode": "custom", "weights": w}
947
+
948
+
949
+ # ── Clustering mode toggle (show/hide preset vs custom controls) ───────
950
@callback(
    Output("preset-container", "style"),
    Output("custom-container", "style"),
    Input("cluster-mode", "value"),
)
def toggle_cluster_mode(mode):
    """Show the preset controls in preset mode, else the custom controls.

    Returns the (preset-container, custom-container) style dicts; the
    container that is not active gets ``display: none``.
    """
    shown = {"marginTop": "8px"}
    hidden = {"marginTop": "8px", "display": "none"}
    return (shown, hidden) if mode == "preset" else (hidden, shown)
959
+
960
+
961
+ # ── Dynamic family dropdown options (B4) ──────────────────────────────
962
@callback(
    Output("c1-families", "options"),
    Output("c1-families", "value"),
    Output("c4-family", "options"),
    Input("active-labels", "data"),
)
def update_family_options(active_data):
    """Refresh the family dropdowns whenever the active clustering changes.

    Returns the ``c1-families`` options, the ``c1-families`` value (every
    family selected), and the ``c4-family`` options, which are the same
    entries preceded by an "All" entry.
    """
    names = get_all_families(active_data)
    options = [dict(label=name, value=name) for name in names]
    all_entry = {"label": T["opt_all"], "value": "All"}
    return options, names, [all_entry, *options]
973
+
974
+
975
  # ── 1 Band gap distribution ──────────────────────────────────────────────
976
  @callback(
977
  Output("c1-graph", "figure"),
978
  Input("c1-agg", "value"),
979
  Input("c1-type", "value"),
980
  Input("c1-families", "value"),
981
+ Input("active-labels", "data"),
982
  )
983
+ def chart1(agg, chart_type, families, active_data):
984
+ _, fam_series, display_series = resolve_labels(active_data)
985
+ work = df.assign(Family=fam_series, FamilyDisplay=display_series)
986
+ sub = work[work["Family"].isin(families)] if families else work
987
  grouped = sub.groupby("Family")["BandGap"].agg(AGG_MAP[agg]).reset_index()
988
  grouped.columns = ["Family", "BandGap"]
989
  grouped = grouped.sort_values("BandGap")
990
+ # Build display mapping from the current labels
991
+ fam_to_display = dict(zip(work["Family"], work["FamilyDisplay"]))
992
+ grouped["FamilyDisplay"] = grouped["Family"].map(fam_to_display)
993
  fn = px.bar if chart_type == "Bar" else px.line
994
  agg_display = AGG_DISPLAY.get(agg, agg)
995
  fig = fn(grouped, x="FamilyDisplay", y="BandGap",
 
1007
  Input("c2-left-type", "value"),
1008
  Input("c2-right-type", "value"),
1009
  Input("c2-yaxis", "value"),
1010
+ Input("active-labels", "data"),
1011
  )
1012
+ def chart2(left_type, right_type, yaxis_mode, active_data):
1013
+ clusters, _, _ = resolve_labels(active_data)
1014
+ work = df.assign(Cluster=clusters)
1015
  figs = []
1016
  y_max = 0
1017
  for mat_type in [left_type, right_type]:
1018
+ sub = work[work["Type"] == mat_type]
1019
  grouped = (sub.groupby("Cluster")["BandGap"].mean()
1020
  .reset_index().sort_values("Cluster"))
1021
  type_disp = type_label(mat_type)
 
1038
  Input("c3-color", "value"),
1039
  Input("c3-filter", "value"),
1040
  Input("c3-topn", "value"),
1041
+ Input("active-labels", "data"),
1042
  )
1043
+ def chart3(ndim, color_by, filter_type, topn_str, active_data):
1044
+ clusters, families, displays = resolve_labels(active_data)
1045
+ work = df.assign(Cluster=clusters, Family=families, FamilyDisplay=displays)
1046
+ sub = work if filter_type == "All" else work[work["Type"] == filter_type]
1047
 
1048
  # Determine color column — use display columns for translated labels
1049
  if color_by == "None":
 
1122
  Input("c4-type", "value"),
1123
  Input("c4-sort", "value"),
1124
  Input("c4-limit", "value"),
1125
+ Input("active-labels", "data"),
1126
  )
1127
+ def chart4(family, mat_type, sort_col, limit, active_data):
1128
+ clusters, families, displays = resolve_labels(active_data)
1129
+ sub = df.assign(Cluster=clusters, Family=families, FamilyDisplay=displays)
1130
  if family != "All":
1131
  sub = sub[sub["Family"] == family]
1132
  if mat_type != "All":
 
1150
  if c == "BandGap":
1151
  val = f"{val:.3f}"
1152
  elif c == "Family":
1153
+ # Use display name from FamilyDisplay
1154
+ val = sub.loc[r.name, "FamilyDisplay"] if r.name in sub.index else str(val)
1155
  elif c == "Type":
1156
  val = type_label(str(val))
1157
  else:
 
1174
  Input("c5-agg", "value"),
1175
  Input("c5-color", "value"),
1176
  Input("c5-sort", "value"),
1177
+ Input("active-labels", "data"),
1178
  )
1179
+ def chart5(row_dim, col_dim, val_col, agg_name, color_mode, sort_mode, active_data):
1180
+ clusters, families, displays = resolve_labels(active_data)
1181
+ work = df.assign(Cluster=clusters, Family=families, FamilyDisplay=displays)
1182
+
1183
+ # Build display mapping for pivot labels
1184
+ fam_to_display = dict(zip(work["Family"], work["FamilyDisplay"]))
1185
+
1186
  agg_fn = AGG_MAP[agg_name]
1187
+ pivot = work.pivot_table(index=row_dim, columns=col_dim,
1188
+ values=val_col, aggfunc=agg_fn)
1189
 
1190
  # Sort
1191
  if "Rows" in sort_mode:
 
1218
  })
1219
  )
1220
 
1221
+ # Translate pivot labels
1222
  def translate_pivot_label(val, dim):
 
1223
  s = str(val)
1224
  if dim == "Family":
1225
+ return fam_to_display.get(s, s)
1226
  if dim == "Type":
1227
  return type_label(s)
1228
  return s
 
1289
  Output("c6-graph", "figure"),
1290
  Input("c6-n", "value"),
1291
  Input("c6-metric", "value"),
1292
+ Input("active-labels", "data"),
1293
  )
1294
+ def chart6(n, metric, active_data):
1295
+ _, families, displays = resolve_labels(active_data)
1296
+ work = df.assign(Family=families, FamilyDisplay=displays)
1297
+ fam_to_display = dict(zip(work["Family"], work["FamilyDisplay"]))
1298
  if metric == "Count":
1299
+ grouped = work.groupby("Family").size().reset_index(name="Value")
1300
  elif metric == "Average BandGap":
1301
+ grouped = work.groupby("Family")["BandGap"].mean().reset_index(name="Value")
1302
  else:
1303
+ grouped = work.groupby("Family")["BandGap"].max().reset_index(name="Value")
1304
  grouped = grouped.nlargest(n, "Value")
1305
  grouped = grouped.sort_values("Value")
1306
+ grouped["FamilyDisplay"] = grouped["Family"].map(fam_to_display)
 
1307
  metric_display = METRIC_DISPLAY.get(metric, metric)
1308
  fig = px.bar(grouped, y="FamilyDisplay", x="Value", orientation="h",
1309
  title=T["chart_top_n"].format(n=n, metric=metric_display),
 
1313
  return fig
1314
 
1315
 
1316
+ # ── 7 Similar Materials Search ────────────────────────────────────────────
1317
+
1318
def _render_c7_table(results, query_mp_id):
    """Build the HTML table listing weighted cosine similarity hits.

    Args:
        results: Iterable of result dicts with the keys ``mp_id``,
            ``formula``, ``band_gap``, ``scores`` (one entry per vector
            name) and ``weighted_score``.
        query_mp_id: mp_id of the queried material; the row whose ``mp_id``
            equals it is highlighted and its formula is set in bold.

    Returns:
        An ``html.Table`` with a header row and one body row per result,
        ranked starting at 1 in the order given.
    """
    vector_keys = ("orb", "l_mm", "l_ofm", "esen")
    headings = (
        T["c7_col_rank"], T["c7_col_formula"], T["c7_col_mpid"],
        T["c7_col_bandgap"],
        T["c7_col_orb"], T["c7_col_lmm"], T["c7_col_lofm"], T["c7_col_esen"],
        T["c7_col_weighted"],
    )

    # The first three columns (rank, formula, mp_id) are text and left-aligned;
    # every later column is numeric and right-aligned.
    header_cells = []
    for position, heading in enumerate(headings):
        alignment = "left" if position < 3 else "right"
        header_cells.append(html.Th(heading, style={
            "padding": "6px 10px", "fontWeight": "700",
            "fontFamily": MONO, "fontSize": "12px",
            "borderBottom": "2px solid #aaa",
            "textAlign": alignment,
        }))
    header_row = html.Tr(header_cells)

    text_cell = {"padding": "4px 10px", "fontFamily": MONO,
                 "fontSize": "12px", "borderBottom": "1px solid #eee"}
    number_cell = {**text_cell, "textAlign": "right"}
    total_cell = {**text_cell, "textAlign": "right", "fontWeight": "700"}

    body_rows = []
    for position, hit in enumerate(results, start=1):
        is_query = hit["mp_id"] == query_mp_id
        formula_weight = "600" if is_query else "400"
        cells = [
            html.Td(str(position), style=text_cell),
            html.Td(hit["formula"],
                    style={**text_cell, "fontWeight": formula_weight}),
            html.Td(hit["mp_id"], style=text_cell),
            html.Td(f"{hit['band_gap']:.4f}", style=number_cell),
        ]
        cells.extend(
            html.Td(f"{hit['scores'][key]:.4f}", style=number_cell)
            for key in vector_keys
        )
        cells.append(html.Td(f"{hit['weighted_score']:.4f}", style=total_cell))

        # Tint the queried material's own row so it stands out in the list.
        row_style = {"background": "#e3f2fd"} if is_query else {}
        body_rows.append(html.Tr(cells, style=row_style))

    return html.Table(
        [html.Thead(header_row), html.Tbody(body_rows)],
        style={"width": "100%", "borderCollapse": "collapse"},
    )
1362
+
1363
+
1364
@callback(
    Output("c7-results", "children"),
    Output("c7-status", "children"),
    Input("c7-search-btn", "n_clicks"),
    State("c7-material", "value"),
    State("c7-topk", "value"),
    State("c7-w-orb", "value"),
    State("c7-w-lmm", "value"),
    State("c7-w-lofm", "value"),
    State("c7-w-esen", "value"),
    prevent_initial_call=True,
)
def chart7(n_clicks, mp_id, top_k, w_orb, w_lmm, w_lofm, w_esen):
    """Run the similar-materials search when the search button is clicked.

    Args:
        n_clicks: Click counter of the search button; only used as trigger.
        mp_id: Selected material id to search around.
        top_k: Number of results requested; ``None`` (cleared input) falls
            back to 10, the default of ``weighted_cosine_search``.
        w_orb, w_lmm, w_lofm, w_esen: Raw per-vector weights; missing values
            count as 0 and the set is normalized to sum to 1.

    Returns:
        ``(table, status)``: the rendered results table (or ``None``) and a
        status message. Failures of the Qdrant lookup or the search are
        reported through the status message instead of raising.
    """
    if not mp_id:
        return None, T["c7_status_ready"]

    # 1. Lazy Qdrant client
    client, error = get_qdrant_client()
    if error:
        return None, T["c7_status_no_qdrant"].format(error=error)

    # A cleared numeric input arrives as None; use the search default rather
    # than surfacing a raw TypeError to the user.
    if top_k is None:
        top_k = 10

    # 2. Build & normalize weights
    weights = {"orb": w_orb or 0, "l_mm": w_lmm or 0,
               "l_ofm": w_lofm or 0, "esen": w_esen or 0}
    total = sum(weights.values())
    if total > 0:
        weights = {k: v / total for k, v in weights.items()}
    else:
        # No usable weights supplied: weight all four vectors equally.
        weights = {k: 0.25 for k in weights}

    from qdrant_client.models import Filter, FieldCondition, MatchValue
    from search.fusion import weighted_cosine_search

    # Both remote steps share one guard: this is the UI boundary, so any
    # Qdrant/network failure becomes a status message, not a callback error.
    try:
        # 3. Resolve mp_id → Qdrant point ID
        scroll_result = client.scroll(
            collection_name="crystal-chroma-fusion",
            scroll_filter=Filter(
                must=[FieldCondition(key="mp_id", match=MatchValue(value=mp_id))]
            ),
            limit=1, with_vectors=False,
        )
        if not scroll_result[0]:
            return None, T["c7_status_not_found"]
        query_point_id = scroll_result[0][0].id

        # 4. Run weighted cosine search
        results = weighted_cosine_search(
            client=client,
            collection="crystal-chroma-fusion",
            query_point_id=query_point_id,
            weights=weights,
            top_k=top_k,
            prefetch_k=max(top_k * 5, 50),
        )
    except Exception as e:
        return None, f"Search error: {e}"

    if not results:
        return None, T["c7_status_not_found"]

    w_str = ", ".join(f"{k}={v:.2f}" for k, v in weights.items())
    status = f"{len(results)} results | {w_str}"
    return _render_c7_table(results, mp_id), status
1427
+
1428
+
1429
  # ── run ───────────────────────────────────────────────────────────────────
1430
  if __name__ == "__main__":
1431
  host = os.environ.get("HOST", "127.0.0.1")
docs/cluster_definitions_presets.json ADDED
@@ -0,0 +1,400 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "balanced": {
3
+ "-1": "Unclassified / Noise",
4
+ "0": "Heavy Transition Metal Intermetallics",
5
+ "1": "Alkali Fluorides",
6
+ "2": "Rare Earth Carbonitrides",
7
+ "3": "Alkali Hydroxides",
8
+ "4": "Alkali Chalcogenides",
9
+ "5": "Alkali Chlorides",
10
+ "6": "Alkali Heavy Halides",
11
+ "7": "Alkali Phosphates",
12
+ "8": "Alkali Chalcogenide Oxysalts",
13
+ "9": "Transition Metal Pnictides",
14
+ "10": "Transition Metal Silicides",
15
+ "11": "Heavy Transition Metal Borides",
16
+ "12": "p-Block Metal Intermetallics",
17
+ "13": "Rare Earth Intermetallics",
18
+ "14": "Heavy Transition Metal Oxides (Ba/Ru)",
19
+ "15": "Alkali Oxides",
20
+ "16": "Heavy Transition Metal Oxides (Ta/Ba)"
21
+ },
22
+ "balanced_ja": {
23
+ "-1": "未分類 / ノイズ",
24
+ "0": "重遷移金属金属間化合物",
25
+ "1": "アルカリ金属フッ化物",
26
+ "2": "希土類炭窒化物",
27
+ "3": "アルカリ金属水酸化物",
28
+ "4": "アルカリ金属カルコゲナイド",
29
+ "5": "アルカリ金属塩化物",
30
+ "6": "アルカリ金属重ハロゲン化物",
31
+ "7": "アルカリ金属リン酸塩",
32
+ "8": "アルカリ金属カルコゲン酸塩",
33
+ "9": "遷移金属プニクタイド",
34
+ "10": "遷移金属ケイ化物",
35
+ "11": "重遷移金属ホウ化物",
36
+ "12": "p-ブロック金属金属間化合物",
37
+ "13": "希土類金属間化合物",
38
+ "14": "重遷移金属酸化物 (Ba/Ru)",
39
+ "15": "アルカリ金属酸化物",
40
+ "16": "重遷移金属酸化物 (Ta/Ba)"
41
+ },
42
+ "orb_only": {
43
+ "-1": "Unclassified / Noise",
44
+ "0": "Alkali Fluorides",
45
+ "1": "Alkaline Earth Carbonitrides",
46
+ "2": "Alkali Chalcogenide Oxysalts",
47
+ "3": "Rare Earth Intermetallics (Hg/Pm)",
48
+ "4": "Alkali Chlorides",
49
+ "5": "Heavy Transition Metal Intermetallics (Pt/Rh)",
50
+ "6": "p-Block Metal Intermetallics",
51
+ "7": "Transition Metal Pnictides",
52
+ "8": "Heavy Transition Metal Borides",
53
+ "9": "Transition Metal Intermetallics (Si/B)",
54
+ "10": "Transition Metal Intermetallics (Ga/Zn)",
55
+ "11": "Rare Earth Intermetallics (Si/Ge)",
56
+ "12": "Heavy Transition Metal Intermetallics (In/Au)",
57
+ "13": "Alkaline Earth Pnictides",
58
+ "14": "Alkali Chalcogenides (Cu/K)",
59
+ "15": "Rare Earth Chalcogenides (Sb/As)",
60
+ "16": "Rare Earth Chalcogenides (La/Si)",
61
+ "17": "Alkali Chalcogenides (P/K)"
62
+ },
63
+ "orb_only_ja": {
64
+ "-1": "未分類 / ノイズ",
65
+ "0": "アルカリ金属フッ化物",
66
+ "1": "アルカリ土類炭窒化物",
67
+ "2": "アルカリ金属カルコゲン酸塩",
68
+ "3": "希土類金属間化合物 (Hg/Pm)",
69
+ "4": "アルカリ金属塩化物",
70
+ "5": "重遷移金属金属間化合物 (Pt/Rh)",
71
+ "6": "p-ブロック金属金属間化合物",
72
+ "7": "遷移金属プニクタイド",
73
+ "8": "重遷移金属ホウ化物",
74
+ "9": "遷移金属金属間化合物 (Si/B)",
75
+ "10": "遷移金属金属間化合物 (Ga/Zn)",
76
+ "11": "希土類金属間化合物 (Si/Ge)",
77
+ "12": "重遷移金属金属間化合物 (In/Au)",
78
+ "13": "アルカリ土類プニクタイド",
79
+ "14": "アルカリ金属カルコゲナイド (Cu/K)",
80
+ "15": "希土類カルコゲナイド (Sb/As)",
81
+ "16": "希土類カルコゲナイド (La/Si)",
82
+ "17": "アルカリ金属カルコゲナイド (P/K)"
83
+ },
84
+ "mm_only": {
85
+ "-1": "Unclassified / Noise",
86
+ "0": "Heavy Transition Metal Intermetallics",
87
+ "1": "Alkali Fluorides",
88
+ "2": "Alkali Chalcogenide Oxysalts",
89
+ "3": "Alkaline Earth Nitrides",
90
+ "4": "Rare Earth Chalcogenides"
91
+ },
92
+ "mm_only_ja": {
93
+ "-1": "未分類 / ノイズ",
94
+ "0": "重遷移金属金属間化合物",
95
+ "1": "アルカリ金属フッ化物",
96
+ "2": "アルカリ金属カルコゲン酸塩",
97
+ "3": "アルカリ土類窒化物",
98
+ "4": "希土類カルコゲナイド"
99
+ },
100
+ "ofm_only": {
101
+ "-1": "Unclassified / Noise",
102
+ "0": "Alkali Phosphates",
103
+ "1": "Transition Metal Nitrides",
104
+ "2": "Alkali Fluorides",
105
+ "3": "Rare Earth Intermetallics",
106
+ "4": "Heavy Transition Metal Intermetallics (Pt/Si)",
107
+ "5": "Heavy Transition Metal Intermetallics (Si/Ge)",
108
+ "6": "Transition Metal Chalcogenide Oxysalts (Mo/Mn)",
109
+ "7": "Transition Metal Chalcogenide Oxysalts (Ti/Zr)",
110
+ "8": "Transition Metal Chalcogenide Oxysalts (V/Ta)",
111
+ "9": "p-Block Metal Borates",
112
+ "10": "Alkali Silicates",
113
+ "11": "Alkali Chalcogenide Oxysalts",
114
+ "12": "Rare Earth Chalcogenide Oxysalts",
115
+ "13": "Heavy Transition Metal Chalcogenide Oxysalts"
116
+ },
117
+ "ofm_only_ja": {
118
+ "-1": "未分類 / ノイズ",
119
+ "0": "アルカリ金属リン酸塩",
120
+ "1": "遷移金属窒化物",
121
+ "2": "アルカリ金属フッ化物",
122
+ "3": "希土類金属間化合物",
123
+ "4": "重遷移金属金属間化合物 (Pt/Si)",
124
+ "5": "重遷移金属金属間化合物 (Si/Ge)",
125
+ "6": "遷移金属カルコゲン酸塩 (Mo/Mn)",
126
+ "7": "遷移金属カルコゲン酸塩 (Ti/Zr)",
127
+ "8": "遷移金属カルコゲン酸塩 (V/Ta)",
128
+ "9": "p-ブロック金属ホウ酸塩",
129
+ "10": "アルカリ金属ケイ酸塩",
130
+ "11": "アルカリ金属カルコゲン酸塩",
131
+ "12": "希土類カルコゲン酸塩",
132
+ "13": "重遷移金属カルコゲン酸塩"
133
+ },
134
+ "esen_only": {
135
+ "-1": "Unclassified / Noise",
136
+ "0": "Alkali Oxides",
137
+ "1": "Rare Earth Oxides",
138
+ "2": "Heavy Transition Metal Intermetallics",
139
+ "3": "Rare Earth Chalcogenides",
140
+ "4": "Alkali Pnictides",
141
+ "5": "Alkali Chalcogenides",
142
+ "6": "Alkali Chlorides"
143
+ },
144
+ "esen_only_ja": {
145
+ "-1": "未分類 / ノイズ",
146
+ "0": "アルカリ金属酸化物",
147
+ "1": "希土類酸化物",
148
+ "2": "重遷移金属金属間化合物",
149
+ "3": "希土類カルコゲナイド",
150
+ "4": "アルカリ金属プニクタイド",
151
+ "5": "アルカリ金属カルコゲナイド",
152
+ "6": "アルカリ金属塩化物"
153
+ },
154
+ "stability": {
155
+ "-1": "Unclassified / Noise",
156
+ "0": "Heavy Transition Metal Intermetallics (Li/Mg)",
157
+ "1": "Alkali Fluorides",
158
+ "2": "Alkali Hydroxides",
159
+ "3": "Rare Earth Carbonitrides",
160
+ "4": "Alkali Chalcogenides",
161
+ "5": "Alkali Phosphates",
162
+ "6": "Alkali Chalcogenide Oxysalts",
163
+ "7": "Alkali Chlorides",
164
+ "8": "Alkali Heavy Halides",
165
+ "9": "Heavy Transition Metal Oxides (Ba/Ru)",
166
+ "10": "Heavy Transition Metal Borides",
167
+ "11": "Transition Metal Pnictides (Ni/P)",
168
+ "12": "Transition Metal Pnictides (Si/Co)",
169
+ "13": "Transition Metal Oxides",
170
+ "14": "Rare Earth Intermetallics (Sn/Ge)",
171
+ "15": "Heavy Transition Metal Oxides (Ta/Ba)",
172
+ "16": "Alkali Oxides",
173
+ "17": "Alkaline Earth Silicates",
174
+ "18": "p-Block Metal Intermetallics",
175
+ "19": "Rare Earth Intermetallics (Pd/Ni)",
176
+ "20": "Heavy Transition Metal Intermetallics (Au/In)"
177
+ },
178
+ "stability_ja": {
179
+ "-1": "未分類 / ノイズ",
180
+ "0": "重遷移金属金属間化合物 (Li/Mg)",
181
+ "1": "アルカリ金属フッ化物",
182
+ "2": "アルカリ金属水酸化物",
183
+ "3": "希土類炭窒化物",
184
+ "4": "アルカリ金属カルコゲナイド",
185
+ "5": "アルカリ金属リン酸塩",
186
+ "6": "アルカリ金属カルコゲン酸塩",
187
+ "7": "アルカリ金属塩化物",
188
+ "8": "アルカリ金属重ハロゲン化物",
189
+ "9": "重遷移金属酸化物 (Ba/Ru)",
190
+ "10": "重遷移金属ホウ化物",
191
+ "11": "遷移金属プニクタイド (Ni/P)",
192
+ "12": "遷移金属プニクタイド (Si/Co)",
193
+ "13": "遷移金属酸化物",
194
+ "14": "希土類金属間化合物 (Sn/Ge)",
195
+ "15": "重遷移金属酸化物 (Ta/Ba)",
196
+ "16": "アルカリ金属酸化物",
197
+ "17": "アルカリ土類ケイ酸塩",
198
+ "18": "p-ブロック金属金属間化合物",
199
+ "19": "希土類金属間化合物 (Pd/Ni)",
200
+ "20": "重遷移金属金属間化合物 (Au/In)"
201
+ },
202
+ "electronic": {
203
+ "-1": "Unclassified / Noise",
204
+ "0": "Heavy Transition Metal Intermetallics (Li/Mg)",
205
+ "1": "Alkali Fluorides",
206
+ "2": "Alkaline Earth Carbonitrides",
207
+ "3": "Alkali Hydroxides",
208
+ "4": "Alkali Chalcogenides",
209
+ "5": "Heavy Transition Metal Intermetallics (Si/Ni)",
210
+ "6": "Alkali Phosphates",
211
+ "7": "Alkali Chlorides",
212
+ "8": "Alkali Heavy Halides",
213
+ "9": "Transition Metal Chalcogenide Oxysalts",
214
+ "10": "Transition Metal Oxides (Mo/V)",
215
+ "11": "Alkali Oxides",
216
+ "12": "Heavy Transition Metal Oxides",
217
+ "13": "Transition Metal Oxides (Mn/Fe)"
218
+ },
219
+ "electronic_ja": {
220
+ "-1": "未分類 / ノイズ",
221
+ "0": "重遷移金属金属間化合物 (Li/Mg)",
222
+ "1": "アルカリ金属フッ化物",
223
+ "2": "アルカリ土類炭窒化物",
224
+ "3": "アルカリ金属水酸化物",
225
+ "4": "アルカリ金属カルコゲナイド",
226
+ "5": "重遷移金属金属間化合物 (Si/Ni)",
227
+ "6": "アルカリ金属リン酸塩",
228
+ "7": "アルカリ金属塩化物",
229
+ "8": "アルカリ金属重ハロゲン化物",
230
+ "9": "遷移金属カルコゲン酸塩",
231
+ "10": "遷移金属酸化物 (Mo/V)",
232
+ "11": "アルカリ金属酸化物",
233
+ "12": "重遷移金属酸化物",
234
+ "13": "遷移金属酸化物 (Mn/Fe)"
235
+ },
236
+ "structural": {
237
+ "-1": "Unclassified / Noise",
238
+ "0": "Heavy Transition Metal Intermetallics",
239
+ "1": "Alkali Fluorides",
240
+ "2": "Rare Earth Carbonitrides",
241
+ "3": "Alkali Hydroxides",
242
+ "4": "Alkali Chalcogenides",
243
+ "5": "Alkali Phosphates",
244
+ "6": "Alkali Chlorides",
245
+ "7": "Alkali Heavy Halides",
246
+ "8": "Heavy Transition Metal Oxides (Ba/Ru)",
247
+ "9": "Transition Metal Pnictides (Ni/P)",
248
+ "10": "Alkali Chalcogenide Oxysalts",
249
+ "11": "Transition Metal Oxides (Mo/V)",
250
+ "12": "Heavy Transition Metal Borides",
251
+ "13": "Transition Metal Pnictides (Si/Co)",
252
+ "14": "p-Block Metal Intermetallics",
253
+ "15": "Rare Earth Intermetallics",
254
+ "16": "Alkali Silicates",
255
+ "17": "Heavy Transition Metal Oxides (Ba/Ta)",
256
+ "18": "Transition Metal Oxides (Mn/Fe)"
257
+ },
258
+ "structural_ja": {
259
+ "-1": "未分類 / ノイズ",
260
+ "0": "重遷移金属金属間化合物",
261
+ "1": "アルカリ金属フッ化物",
262
+ "2": "希土類炭窒化物",
263
+ "3": "アルカリ金属水酸化物",
264
+ "4": "アルカリ金属カルコゲナイド",
265
+ "5": "アルカリ金属リン酸塩",
266
+ "6": "アルカリ金属塩化物",
267
+ "7": "アルカリ金属重ハロゲン化物",
268
+ "8": "重遷移金属酸化物 (Ba/Ru)",
269
+ "9": "遷移金属プニクタイド (Ni/P)",
270
+ "10": "アルカリ金属カルコゲン酸塩",
271
+ "11": "遷移金属酸化物 (Mo/V)",
272
+ "12": "重遷移金属ホウ化物",
273
+ "13": "遷移金属プニクタイド (Si/Co)",
274
+ "14": "p-ブロック金属金属間化合物",
275
+ "15": "希土類金属間化合物",
276
+ "16": "アルカリ金属ケイ酸塩",
277
+ "17": "重遷移金属酸化物 (Ba/Ta)",
278
+ "18": "遷移金属酸化物 (Mn/Fe)"
279
+ },
280
+ "chemical": {
281
+ "-1": "Unclassified / Noise",
282
+ "0": "Heavy Transition Metal Intermetallics (Li/Mg)",
283
+ "1": "Alkali Fluorides",
284
+ "2": "Alkaline Earth Carbonitrides",
285
+ "3": "Alkali Hydroxides",
286
+ "4": "Alkali Phosphates",
287
+ "5": "Alkali Chalcogenides",
288
+ "6": "Alkali Heavy Halides",
289
+ "7": "Alkali Chlorides",
290
+ "8": "Transition Metal Pnictides",
291
+ "9": "Heavy Transition Metal Oxides (Ba/Ru)",
292
+ "10": "Heavy Transition Metal Borides",
293
+ "11": "Transition Metal Oxides (Mo/V)",
294
+ "12": "Alkali Chalcogenide Oxysalts",
295
+ "13": "Transition Metal Silicides",
296
+ "14": "Heavy Transition Metal Oxides (Ta/Ba)",
297
+ "15": "Transition Metal Oxides (Mn/Fe)",
298
+ "16": "Alkali Oxides",
299
+ "17": "Alkali Silicates",
300
+ "18": "Heavy Transition Metal Intermetallics (Pd/Ni)",
301
+ "19": "Heavy Transition Metal Intermetallics (Au/In)",
302
+ "20": "p-Block Metal Intermetallics"
303
+ },
304
+ "chemical_ja": {
305
+ "-1": "未分類 / ノイズ",
306
+ "0": "重遷移金属金属間化合物 (Li/Mg)",
307
+ "1": "アルカリ金属フッ化物",
308
+ "2": "アルカリ土類炭窒化物",
309
+ "3": "アルカリ金属水酸化物",
310
+ "4": "アルカリ金属リン酸塩",
311
+ "5": "アルカリ金属カルコゲナイド",
312
+ "6": "アルカリ金属重ハロゲン化物",
313
+ "7": "アルカリ金属塩化物",
314
+ "8": "遷移金属プニクタイド",
315
+ "9": "重遷移金属酸化物 (Ba/Ru)",
316
+ "10": "重遷移金属ホウ化物",
317
+ "11": "遷移金属酸化物 (Mo/V)",
318
+ "12": "アルカリ金属カルコゲン酸塩",
319
+ "13": "遷移金属ケイ化物",
320
+ "14": "重遷移金属酸化物 (Ta/Ba)",
321
+ "15": "遷移金属酸化物 (Mn/Fe)",
322
+ "16": "アルカリ金属酸化物",
323
+ "17": "アルカリ金属ケイ酸塩",
324
+ "18": "重遷移金属金属間化合物 (Pd/Ni)",
325
+ "19": "重遷移金属金属間化合物 (Au/In)",
326
+ "20": "p-ブロック金属金属間化合物"
327
+ },
328
+ "coord_energy": {
329
+ "-1": "Unclassified / Noise",
330
+ "0": "Heavy Transition Metal Intermetallics",
331
+ "1": "Alkali Fluorides",
332
+ "2": "Alkali Hydroxides",
333
+ "3": "Alkali Oxides",
334
+ "4": "Alkali Heavy Halides",
335
+ "5": "Alkaline Earth Carbonitrides",
336
+ "6": "Alkali Chalcogenides",
337
+ "7": "p-Block Metal Intermetallics",
338
+ "8": "Rare Earth Intermetallics",
339
+ "9": "Transition Metal Pnictides",
340
+ "10": "Transition Metal Silicides",
341
+ "11": "Heavy Transition Metal Borides"
342
+ },
343
+ "coord_energy_ja": {
344
+ "-1": "未分類 / ノイズ",
345
+ "0": "重遷移金属金属間化合物",
346
+ "1": "アルカリ金属フッ化物",
347
+ "2": "アルカリ金属水酸化物",
348
+ "3": "アルカリ金属酸化物",
349
+ "4": "アルカリ金属重ハロゲン化物",
350
+ "5": "アルカリ土類炭窒化物",
351
+ "6": "アルカリ金属カルコゲナイド",
352
+ "7": "p-ブロック金属金属間化合物",
353
+ "8": "希土類金属間化合物",
354
+ "9": "遷移金属プニクタイド",
355
+ "10": "遷移金属ケイ化物",
356
+ "11": "重遷移金属ホウ化物"
357
+ },
358
+ "mechanochem": {
359
+ "-1": "Unclassified / Noise",
360
+ "0": "Heavy Transition Metal Intermetallics",
361
+ "1": "Alkali Fluorides",
362
+ "2": "Alkaline Earth Carbonitrides",
363
+ "3": "Alkali Hydroxides",
364
+ "4": "Alkali Chlorides",
365
+ "5": "Alkali Heavy Halides",
366
+ "6": "Alkali Chalcogenides",
367
+ "7": "Alkali Phosphates",
368
+ "8": "Rare Earth Intermetallics",
369
+ "9": "Transition Metal Pnictides",
370
+ "10": "Transition Metal Silicides",
371
+ "11": "Heavy Transition Metal Borides",
372
+ "12": "Alkali Chalcogenide Oxysalts",
373
+ "13": "Transition Metal Chalcogenide Oxysalts",
374
+ "14": "Heavy Transition Metal Oxides",
375
+ "15": "Alkali Oxides",
376
+ "16": "Transition Metal Oxides",
377
+ "17": "Rare Earth Oxides"
378
+ },
379
+ "mechanochem_ja": {
380
+ "-1": "未分類 / ノイズ",
381
+ "0": "重遷移金属金属間化合物",
382
+ "1": "アルカリ金属フッ化物",
383
+ "2": "アルカリ土類炭窒化物",
384
+ "3": "アルカリ金属水酸化物",
385
+ "4": "アルカリ金属塩化物",
386
+ "5": "アルカリ金属重ハロゲン化物",
387
+ "6": "アルカリ金属カルコゲナイド",
388
+ "7": "アルカリ金属リン酸塩",
389
+ "8": "希土類金属間化合物",
390
+ "9": "遷移金属プニクタイド",
391
+ "10": "遷移金属ケイ化物",
392
+ "11": "重遷移金属ホウ化物",
393
+ "12": "アルカリ金属カルコゲン酸塩",
394
+ "13": "遷移金属カルコゲン酸塩",
395
+ "14": "重遷移金属酸化物",
396
+ "15": "アルカリ金属酸化物",
397
+ "16": "遷移金属酸化物",
398
+ "17": "希土類酸化物"
399
+ }
400
+ }
material_universe_cache/centroid_sim_esen.npy ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:3c98ab7da174c3f0937f88c701d24c5ed1ecad53ddb52ae778003746130e63de
3
+ size 2310292
material_universe_cache/centroid_sim_mm.npy ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:c8d7bf14dd86264aa1b71f5047772cfd19e83d6c0f51d8331d8397b871908212
3
+ size 2310292
material_universe_cache/centroid_sim_ofm.npy ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:b4bd40e74b7fa5d9bd089ffb1c7cf9a092999aaa67508d232117d1ed966c0ed1
3
+ size 2310292
material_universe_cache/centroid_sim_orb.npy ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:46c44c25a72dfed5554fbaff263af45440b005c803e5b3f287e03630aff012a3
3
+ size 2310292
material_universe_cache/cluster_labels_balanced.npy ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:ad2e2ca00f631643c5a461a88a0afd171560613de5bfe7157c5f167c6035f1dc
3
+ size 271912
material_universe_cache/cluster_labels_chemical.npy ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:2dda41b9b2c73b7efac137e9c6cc80ad850f0c01a046a518f68cc9f9ff6b163a
3
+ size 271912
material_universe_cache/cluster_labels_coord_energy.npy ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:8f7b60ddd865245ff7f913f87fb9251a9798c2f8fdb41ee23af53fa353469cf2
3
+ size 271912
material_universe_cache/cluster_labels_electronic.npy ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:11af565fe692d26483eef032089c25cf782f049d411af5fdc07e41f9c67792df
3
+ size 271912
material_universe_cache/cluster_labels_esen_only.npy ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:712e9ade496dca83a1d5d78edbe6706609aaa41a8507271f28005850d21c167f
3
+ size 271912
material_universe_cache/cluster_labels_mechanochem.npy ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:26a450c477dcfa3e53415d5ded631b1a14debbb759fe4cf6ef68d20d4831f2f7
3
+ size 271912
material_universe_cache/cluster_labels_mm_only.npy ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:3877c1e53ed00194261b6cecd42873141c5210029d142f0eb777a886cec6d2e7
3
+ size 271912
material_universe_cache/cluster_labels_ofm_only.npy ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:0612cd6f5b93b42985ed1b87d17848d028b15fae74b6871e4f47aed8c0cdb8ee
3
+ size 271912
material_universe_cache/cluster_labels_orb_only.npy ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:611c644dbfbbc3559821e1c6dfe53771eeadfbca68666c29d199295ffca3575d
3
+ size 271912
material_universe_cache/cluster_labels_stability.npy ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:a43025c1822ff6be232dec18acfa2d860763538ce7899f6b438296b7513bbd74
3
+ size 271912
material_universe_cache/cluster_labels_structural.npy ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:38d85e5e8e03aefe0c4d6d6124541e8a3bc61bc4aa30e5663c58501b04001c45
3
+ size 271912
material_universe_cache/plotly_studio_export.csv CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:0c09907db415aeb8b77eee2c1efdb6a0d44446a9c401a0e09b1731b64c68d61e
3
- size 24754864
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:28d5a2616fc8dcd9469c0303f64b4eb61bb8c9bb07c8f614281076a93e87113e
3
+ size 13630877
requirements-hf.txt CHANGED
@@ -2,3 +2,6 @@ dash>=4.0.0
2
  plotly>=6.0
3
  pandas>=2.0
4
  scikit-learn>=1.0
 
 
 
 
2
  plotly>=6.0
3
  pandas>=2.0
4
  scikit-learn>=1.0
5
+ qdrant-client>=1.7.0
6
+ python-dotenv>=1.0
7
+ numpy>=2.0
search/__init__.py ADDED
File without changes
search/fusion.py ADDED
@@ -0,0 +1,129 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ Weighted cosine similarity fusion for multi-vector search.
3
+
4
+ Searches across all 4 named vector spaces in the crystal-chroma-fusion
5
+ Qdrant collection and re-ranks candidates by a weighted sum of cosine
6
+ similarities.
7
+
8
+ Used by:
9
+ - app.py (C7 Similar Materials Search card)
10
+ - tests/test_quad_fusion.py (Test 6)
11
+ """
12
+
13
+ import numpy as np
14
+ from qdrant_client import QdrantClient, models
15
+
16
# Name of the Qdrant collection that holds the four named vector spaces.
COLLECTION = "crystal-chroma-fusion"

# Named vectors that are searched and fused, in scoring order.
VECTOR_NAMES = ["orb", "l_mm", "l_ofm", "esen"]

# Dimensionality and distance label per named vector.
# NOTE(review): presumably mirrors the collection's vector configuration;
# confirm against the collection setup.
VECTOR_SPEC = {
    "orb": {"dim": 1792, "distance": "Cosine"},
    "l_mm": {"dim": 758, "distance": "Euclid"},
    "l_ofm": {"dim": 188, "distance": "Euclid"},
    "esen": {"dim": 128, "distance": "Cosine"},
}

# Equal weight for every named vector, used when the caller passes none.
DEFAULT_WEIGHTS = dict.fromkeys(VECTOR_NAMES, 1.0 / len(VECTOR_NAMES))
28
+
29
+
30
def _search_params():
    """Return search params that rescore quantized hits with 2x oversampling.

    NOTE(review): intended for the collection's INT8 quantization per the
    original author; the quantization type is not visible here.
    """
    quantization = models.QuantizationSearchParams(
        rescore=True,
        oversampling=2.0,
    )
    return models.SearchParams(quantization=quantization)
38
+
39
+
40
def cosine_similarity(a, b):
    """Return the cosine similarity of two vectors as a Python float.

    Both inputs are converted to float64 arrays. If either vector has zero
    norm the similarity is defined as 0.0 instead of dividing by zero.
    """
    left = np.asarray(a, dtype=np.float64)
    right = np.asarray(b, dtype=np.float64)
    inner = np.dot(left, right)
    left_norm, right_norm = np.linalg.norm(left), np.linalg.norm(right)
    if not (left_norm and right_norm):
        return 0.0
    return float(inner / (left_norm * right_norm))
50
+
51
+
52
def weighted_cosine_search(
    client: QdrantClient,
    collection: str,
    query_point_id: int,
    weights: dict[str, float] | None = None,
    top_k: int = 10,
    prefetch_k: int = 50,
    vector_names: list[str] | None = None,
) -> list[dict]:
    """Rank candidates by a weighted sum of per-vector cosine similarities.

    Steps:
        1. Retrieve the query point's named vectors.
        2. Query the top ``prefetch_k`` neighbours in each vector space and
           pool the candidate ids.
        3. Batch-retrieve the full vectors of all pooled candidates.
        4. Compute per-vector cosine similarity and the weighted sum, then
           return the ``top_k`` best.

    Args:
        client: Qdrant client used for all requests.
        collection: Name of the collection to search.
        query_point_id: Point id of the query material.
        weights: Weight per vector name; names absent from the mapping count
            as 0. Defaults to ``DEFAULT_WEIGHTS``.
        top_k: Number of results to return.
        prefetch_k: Neighbours fetched per vector space before re-ranking.
        vector_names: Vector spaces to use. Defaults to ``VECTOR_NAMES``.

    Returns:
        List of dicts sorted by descending ``weighted_score``:
        ``{"id", "mp_id", "formula", "band_gap", "scores": {...},
        "weighted_score"}``. Empty if the query point does not exist.
        A candidate lacking one of the named vectors scores 0.0 for that
        vector, and a missing payload or band gap yields the field defaults,
        so one incomplete point cannot abort the whole search.

    Raises:
        KeyError: If the query point itself lacks one of ``vector_names``.
    """
    if vector_names is None:
        vector_names = VECTOR_NAMES
    if weights is None:
        weights = DEFAULT_WEIGHTS

    # 1. Retrieve query vectors
    pts = client.retrieve(collection, ids=[query_point_id], with_vectors=True)
    if not pts:
        return []
    qpoint = pts[0]
    query_vecs = {name: np.array(qpoint.vector[name]) for name in vector_names}

    # 2. Gather candidates from each vector space
    sp = _search_params()
    candidate_ids = set()
    for name in vector_names:
        results = client.query_points(
            collection_name=collection,
            query=qpoint.vector[name],
            using=name,
            limit=prefetch_k,
            search_params=sp,
        )
        candidate_ids.update(h.id for h in results.points)

    # 3. Batch-retrieve full vectors for all candidates
    candidate_list = sorted(candidate_ids)
    all_candidates = {}
    batch_size = 100
    for i in range(0, len(candidate_list), batch_size):
        batch_ids = candidate_list[i:i + batch_size]
        batch_pts = client.retrieve(collection, ids=batch_ids, with_vectors=True)
        for p in batch_pts:
            all_candidates[p.id] = p

    # 4. Compute weighted cosine similarity
    scored = []
    for cid, cpoint in all_candidates.items():
        # Points may come back without a payload or without every named
        # vector; fall back to empty mappings instead of failing the search.
        vectors = cpoint.vector if cpoint.vector is not None else {}
        payload = cpoint.payload or {}

        per_vec = {}
        for name in vector_names:
            cvec = vectors.get(name)
            per_vec[name] = (
                0.0 if cvec is None
                else cosine_similarity(query_vecs[name], cvec)
            )

        weighted = sum(weights.get(name, 0.0) * per_vec[name] for name in vector_names)

        # A stored null band gap would break numeric formatting downstream.
        band_gap = payload.get("band_gap")
        scored.append({
            "id": cid,
            "mp_id": payload.get("mp_id", ""),
            "formula": payload.get("formula", ""),
            "band_gap": 0.0 if band_gap is None else band_gap,
            "scores": per_vec,
            "weighted_score": weighted,
        })

    scored.sort(key=lambda x: x["weighted_score"], reverse=True)
    return scored[:top_k]