material-universe / scripts /apply_labels_to_map.py
Hafnium49's picture
Initial deployment: Materials Database Explorer (Japanese)
412d7e5 verified
"""
Apply Scientific Labels to Material Universe Map
Reads cluster definitions and regenerates the visualization with
human-readable family names instead of cluster IDs, plus a summary table.
Usage:
uv run python scripts/apply_labels_to_map.py [--lang ja]
Reads from:
- material_universe_cache/materials_clustered.csv
- docs/cluster_definitions.json
Outputs:
- docs/material_universe_map_labeled.html (English)
- docs/material_universe_map_labeled_ja.html (Japanese, with --lang ja)
"""
import argparse
import html
import json
import os

import pandas as pd
import plotly.express as px
# Input locations. Both are relative paths, so they resolve against the
# current working directory of the process that runs this script.
# Clustered materials table read by regenerate_map() via pandas.read_csv.
INPUT_CSV = "material_universe_cache/materials_clustered.csv"
# JSON object mapping cluster ID (as a string key) to a family name.
INPUT_LABELS = "docs/cluster_definitions.json"
# Localization strings, keyed by language code ("en" / "ja").
#
# Where each entry is used:
#   title                 -> page <title> and <h1>
#   subtitle              -> paragraph under the <h1>
#   chart_title           -> Plotly figure title
#   family_summary        -> <h2> above the summary table
#   legend_title          -> Plotly legend title
#   col_*                 -> summary table column headers
#   type_*                -> labels returned by get_electronic_type()
#   output_file           -> path the generated HTML page is written to
#
# NOTE(review): "subtitle" and "chart_title" hard-code the counts 33,973 and
# 21, while regenerate_map() computes the real counts from the CSV at run
# time; confirm these strings still match the data whenever it is re-clustered.
TRANSLATIONS = {
    "en": {
        "title": "The Material Universe",
        "subtitle": "33,973 stable crystals clustered into 21 material families using Tri-Fusion embeddings (Orb-v3 + MEGNet + OFM)",
        "chart_title": "The Material Universe: 33,973 Crystals in 21 Families",
        "family_summary": "Family Summary",
        "legend_title": "Material Family",
        "col_id": "ID",
        "col_family": "Material Family",
        "col_count": "Count",
        "col_percent": "%",
        "col_bandgap": "Avg Band Gap",
        "col_type": "Type",
        "type_metallic": "Metallic",
        "type_semiconductor": "Semiconductor",
        "type_insulator": "Insulator",
        "output_file": "docs/material_universe_map_labeled.html",
    },
    "ja": {
        "title": "マテリアル・ユニバース",
        "subtitle": "Tri-Fusion埋め込み(Orb-v3 + MEGNet + OFM)を用いて21の材料ファミリーにクラスタリングされた33,973の安定結晶",
        "chart_title": "マテリアル・ユニバース: 33,973結晶、21ファミリー",
        "family_summary": "ファミリー概要",
        "legend_title": "材料ファミリー",
        "col_id": "ID",
        "col_family": "材料ファミリー",
        "col_count": "数",
        "col_percent": "%",
        "col_bandgap": "平均バンドギャップ",
        "col_type": "タイプ",
        "type_metallic": "金属",
        "type_semiconductor": "半導体",
        "type_insulator": "絶縁体",
        "output_file": "docs/material_universe_map_labeled_ja.html",
    },
}
# Japanese translations for family names.
#
# Keys are the English family names as they appear in the cluster label file;
# translate_family() returns the name unchanged when a key is missing, so an
# absent entry shows up in English on the Japanese page.
FAMILY_TRANSLATIONS_JA = {
    # Fallback label regenerate_map() assigns to clusters that are missing
    # from the label file; without this entry it would stay untranslated.
    "Unclassified": "未分類",
    "Unclassified / Noise": "未分類 / ノイズ",
    "Intermetallic Alloys (Li/Mg/Rare Earth)": "金属間化合物(Li/Mg/希土類)",
    "Alkali Metal Fluorides": "アルカリ金属フッ化物",
    "Molybdates & Vanadates": "モリブデン酸塩 & バナジン酸塩",
    "Chalcogenide Oxysalts (Sulfates, Selenates, Tellurates)": "カルコゲン酸塩(硫酸塩、セレン酸塩、テルル酸塩)",
    "Phosphates & Arsenates": "リン酸塩 & ヒ酸塩",
    "Hydrides & Hydroxides": "水素化物 & 水酸化物",
    "Zintl Phases (Pnictides)": "ジントル相(プニクタイド)",
    "Chalcogenides (Sulfides, Selenides, Tellurides)": "カルコゲナイド(硫化物、セレン化物、テルル化物)",
    "Intermetallics & Zintl Compounds (Al/Ga/Ge/Si)": "金属間化合物 & ジントル化合物(Al/Ga/Ge/Si)",
    "Perovskite Oxides (Ru, Ir, Pt Group)": "ペロブスカイト酸化物(Ru, Ir, Pt族)",
    "Nickel Intermetallics": "ニッケル金属間化合物",
    "Nitrides & Carbonitrides": "窒化物 & 炭窒化物",
    "Heavy Halides (Bromides & Iodides)": "重ハロゲン化物(臭化物 & ヨウ化物)",
    "Chlorides": "塩化物",
    "Borides & Silicides": "ホウ化物 & ケイ化物",
    "Platinum Group Metal Borides": "白金族金属ホウ化物",
    "Transition Metal Oxides (Mn, Fe, Co, Ni)": "遷移金属酸化物(Mn, Fe, Co, Ni)",
    "Perovskite-type Tantalates, Niobates & Titanates": "ペロブスカイト型タンタル酸塩、ニオブ酸塩 & チタン酸塩",
    "Alkali Metal Oxides": "アルカリ金属酸化物",
    "Silicates & Aluminosilicates": "ケイ酸塩 & アルミノケイ酸塩",
    "Borates": "ホウ酸塩",
}
def get_electronic_type(avg_gap, lang="en"):
    """Return the localized electronic-type label for an average band gap.

    Args:
        avg_gap: Average band gap; the summary table displays it in eV.
        lang: Key into TRANSLATIONS selecting the label language.

    Returns:
        The "type_metallic" label when avg_gap is below 0.1, the
        "type_insulator" label when it is above 3.0, and the
        "type_semiconductor" label otherwise. A NaN gap fails both
        comparisons and therefore also yields the semiconductor label.

    Raises:
        KeyError: If lang is not a key of TRANSLATIONS.
    """
    labels = TRANSLATIONS[lang]
    if avg_gap < 0.1:
        key = "type_metallic"
    elif avg_gap > 3.0:
        key = "type_insulator"
    else:
        key = "type_semiconductor"
    return labels[key]
def translate_family(family, lang):
    """Return the display name of a family in the requested language.

    Only Japanese ("ja") has a translation table; for every other language
    code the name is returned as given. A family without a Japanese entry is
    also returned unchanged.
    """
    if lang != "ja":
        return family
    # Fall back to the original name when no translation is registered.
    return FAMILY_TRANSLATIONS_JA.get(family, family)
def generate_summary_table(df, sorted_families, lang="en"):
    """Generate HTML summary table of all families.

    Args:
        df: DataFrame with one row per material; the columns "Family",
            "BandGap" and "ClusterID" are read.
        sorted_families: Untranslated family names (values of df["Family"])
            in the order their rows should appear in the table.
        lang: Key into TRANSLATIONS selecting headers and type labels.

    Returns:
        An HTML string containing a <table class="summary-table"> with one
        row per family: cluster ID, family name, material count, share of
        all rows in df, mean band gap and electronic type.
    """
    t = TRANSLATIONS[lang]
    rows = []
    total = len(df)
    for family in sorted_families:
        subset = df[df["Family"] == family]
        count = len(subset)
        pct = count / total * 100
        avg_gap = subset["BandGap"].mean()
        e_type = get_electronic_type(avg_gap, lang)
        # Get cluster ID (first one if multiple map to same family)
        cluster_id = subset["ClusterID"].iloc[0]
        # Translate family name for display
        family_display = translate_family(family, lang)
        # Family names come from the label file and contain characters that
        # are special in HTML (e.g. "&" in "Molybdates & Vanadates"), so
        # escape data-derived text before placing it into the markup.
        id_cell = html.escape(str(cluster_id))
        family_cell = html.escape(str(family_display))
        rows.append(
            f"<tr><td>{id_cell}</td><td>{family_cell}</td><td>{count:,}</td>"
            f"<td>{pct:.1f}%</td><td>{avg_gap:.2f} eV</td><td>{e_type}</td></tr>"
        )
    table_html = f"""
<table class="summary-table">
<thead>
<tr>
<th>{t["col_id"]}</th>
<th>{t["col_family"]}</th>
<th>{t["col_count"]}</th>
<th>{t["col_percent"]}</th>
<th>{t["col_bandgap"]}</th>
<th>{t["col_type"]}</th>
</tr>
</thead>
<tbody>
{"".join(rows)}
</tbody>
</table>
"""
    return table_html
def _render_page(lang, t, chart_html, summary_table):
    """Assemble the standalone HTML page around the chart and summary table."""
    return f"""<!DOCTYPE html>
<html lang="{lang}">
<head>
<meta charset="utf-8">
<title>{t["title"]}</title>
<style>
body {{
font-family: -apple-system, BlinkMacSystemFont, "Segoe UI", Roboto, "Hiragino Sans", "Noto Sans JP", sans-serif;
margin: 0;
padding: 20px;
background: #fafafa;
}}
h1 {{
color: #333;
margin-bottom: 10px;
}}
.subtitle {{
color: #666;
margin-bottom: 20px;
}}
.chart-container {{
background: white;
border-radius: 8px;
box-shadow: 0 2px 4px rgba(0,0,0,0.1);
padding: 20px;
margin-bottom: 30px;
}}
.summary-container {{
background: white;
border-radius: 8px;
box-shadow: 0 2px 4px rgba(0,0,0,0.1);
padding: 20px;
max-width: 1000px;
}}
h2 {{
color: #333;
margin-top: 0;
}}
.summary-table {{
width: 100%;
border-collapse: collapse;
font-size: 14px;
}}
.summary-table th {{
background: #f5f5f5;
padding: 12px 8px;
text-align: left;
border-bottom: 2px solid #ddd;
font-weight: 600;
}}
.summary-table td {{
padding: 10px 8px;
border-bottom: 1px solid #eee;
}}
.summary-table tr:hover {{
background: #f9f9f9;
}}
.summary-table tr:nth-child(even) {{
background: #fafafa;
}}
.summary-table tr:nth-child(even):hover {{
background: #f5f5f5;
}}
</style>
</head>
<body>
<h1>{t["title"]}</h1>
<p class="subtitle">{t["subtitle"]}</p>
<div class="chart-container">
{chart_html}
</div>
<div class="summary-container">
<h2>{t["family_summary"]}</h2>
{summary_table}
</div>
</body>
</html>
"""


def regenerate_map(lang="en"):
    """Generate the labeled map in the specified language.

    Reads INPUT_CSV and INPUT_LABELS, maps every cluster ID to a family name,
    draws a Plotly scatter plot colored by family, appends a per-family
    summary table and writes the page to the "output_file" path configured
    for the language in TRANSLATIONS. Progress and a family distribution are
    printed to stdout.

    Args:
        lang: Key into TRANSLATIONS ("en" or "ja").

    Returns:
        None. If either input file is missing, an error message is printed
        and the function returns without writing anything.

    Raises:
        KeyError: If lang is not a key of TRANSLATIONS.
    """
    t = TRANSLATIONS[lang]
    output_html = t["output_file"]

    if not os.path.exists(INPUT_LABELS):
        print(f"Error: {INPUT_LABELS} not found. Run Phase 2 analysis first.")
        return
    if not os.path.exists(INPUT_CSV):
        print(f"Error: {INPUT_CSV} not found. Run clustering script first.")
        return

    print(f"Loading data and labels (lang={lang})...")
    df = pd.read_csv(INPUT_CSV)
    # Read the label file as UTF-8 explicitly instead of relying on the
    # platform's default encoding, which is not UTF-8 on every system.
    with open(INPUT_LABELS, "r", encoding="utf-8") as f:
        label_map = json.load(f)

    # Map IDs to Scientific Names
    # Convert Cluster column to string to match JSON keys
    df["ClusterID"] = df["Cluster"].astype(str)
    # Apply mapping; Default to "Unclassified" if not found
    df["Family"] = df["ClusterID"].map(label_map).fillna("Unclassified")

    # Count materials per family for sorting
    family_counts = df["Family"].value_counts()
    df["FamilySize"] = df["Family"].map(family_counts)
    # Sort by family size (largest first), then by family name for consistent colors
    df = df.sort_values(["FamilySize", "Family"], ascending=[False, True])

    # Take the family order straight from the sorted frame: first occurrences
    # are already ordered by size (descending) and then by name, so families
    # of equal size get a deterministic order in the legend and the table.
    sorted_families = df["Family"].drop_duplicates().tolist()

    print(f" Materials: {len(df):,}")
    print(f" Families: {len(sorted_families)}")

    # Translate family names for display in chart
    if lang != "en":
        df["FamilyDisplay"] = df["Family"].apply(lambda x: translate_family(x, lang))
        sorted_families_display = [translate_family(f, lang) for f in sorted_families]
        color_col = "FamilyDisplay"
    else:
        sorted_families_display = sorted_families
        color_col = "Family"

    print("\nGenerating Labeled Map...")
    fig = px.scatter(
        df,
        x="x",
        y="y",
        color=color_col,
        category_orders={color_col: sorted_families_display},  # Sort legend by size
        hover_data=["Formula", "MP_ID", "BandGap", "ClusterID"],
        title=t["chart_title"],
        template="plotly_white",
        render_mode="webgl",
        width=1400,
        height=900,
    )
    # Update layout for better visualization
    fig.update_traces(marker=dict(size=3, opacity=0.7))
    fig.update_layout(
        legend=dict(
            orientation="v",
            yanchor="top",
            y=1,
            xanchor="left",
            x=1.02,
            title=t["legend_title"],
            font=dict(size=10),
        ),
        margin=dict(r=280 if lang == "ja" else 250),  # More space for Japanese
    )

    # Generate summary table (uses the untranslated names to select rows)
    summary_table = generate_summary_table(df, sorted_families, lang)
    # Get Plotly chart as HTML div; plotly.js itself is loaded from the CDN
    chart_html = fig.to_html(full_html=False, include_plotlyjs="cdn")
    # Create full HTML with chart and summary table
    full_html = _render_page(lang, t, chart_html, summary_table)

    # Ensure output directory exists; a bare file name has no directory part
    # and os.makedirs("") would raise.
    output_dir = os.path.dirname(output_html)
    if output_dir:
        os.makedirs(output_dir, exist_ok=True)
    with open(output_html, "w", encoding="utf-8") as f:
        f.write(full_html)

    file_size_mb = os.path.getsize(output_html) / (1024 * 1024)
    print(f"\nLabeled Map saved to: {output_html}")
    print(f" File size: {file_size_mb:.1f} MB")

    # Print family distribution (ten largest families only)
    print("\nFamily Distribution:")
    for family in sorted_families[:10]:
        count = family_counts[family]
        pct = count / len(df) * 100
        print(f" {family}: {count:,} ({pct:.1f}%)")
    if len(sorted_families) > 10:
        print(f" ... and {len(sorted_families) - 10} more families")
if __name__ == "__main__":
    # Command-line entry point: choose the output language, then build the map.
    cli = argparse.ArgumentParser(
        description="Generate labeled Material Universe map"
    )
    cli.add_argument(
        "--lang",
        default="en",
        choices=["en", "ja"],
        help="Language for labels (en=English, ja=Japanese)",
    )
    options = cli.parse_args()
    regenerate_map(lang=options.lang)