""" Apply Scientific Labels to Material Universe Map Reads cluster definitions and regenerates the visualization with human-readable family names instead of cluster IDs, plus a summary table. Usage: uv run python scripts/apply_labels_to_map.py [--lang ja] Reads from: - material_universe_cache/materials_clustered.csv - docs/cluster_definitions.json Outputs: - docs/material_universe_map_labeled.html (English) - docs/material_universe_map_labeled_ja.html (Japanese, with --lang ja) """ import pandas as pd import json import plotly.express as px import os import argparse INPUT_CSV = "material_universe_cache/materials_clustered.csv" INPUT_LABELS = "docs/cluster_definitions.json" # Localization strings TRANSLATIONS = { "en": { "title": "The Material Universe", "subtitle": "33,973 stable crystals clustered into 21 material families using Tri-Fusion embeddings (Orb-v3 + MEGNet + OFM)", "chart_title": "The Material Universe: 33,973 Crystals in 21 Families", "family_summary": "Family Summary", "legend_title": "Material Family", "col_id": "ID", "col_family": "Material Family", "col_count": "Count", "col_percent": "%", "col_bandgap": "Avg Band Gap", "col_type": "Type", "type_metallic": "Metallic", "type_semiconductor": "Semiconductor", "type_insulator": "Insulator", "output_file": "docs/material_universe_map_labeled.html", }, "ja": { "title": "マテリアル・ユニバース", "subtitle": "Tri-Fusion埋め込み(Orb-v3 + MEGNet + OFM)を用いて21の材料ファミリーにクラスタリングされた33,973の安定結晶", "chart_title": "マテリアル・ユニバース: 33,973結晶、21ファミリー", "family_summary": "ファミリー概要", "legend_title": "材料ファミリー", "col_id": "ID", "col_family": "材料ファミリー", "col_count": "数", "col_percent": "%", "col_bandgap": "平均バンドギャップ", "col_type": "タイプ", "type_metallic": "金属", "type_semiconductor": "半導体", "type_insulator": "絶縁体", "output_file": "docs/material_universe_map_labeled_ja.html", }, } # Japanese translations for family names FAMILY_TRANSLATIONS_JA = { "Unclassified / Noise": "未分類 / ノイズ", "Intermetallic Alloys (Li/Mg/Rare Earth)": "金属間化合物(Li/Mg/希土類)", "Alkali Metal Fluorides": "アルカリ金属フッ化物", "Molybdates & Vanadates": "モリブデン酸塩 & バナジン酸塩", "Chalcogenide Oxysalts (Sulfates, Selenates, Tellurates)": "カルコゲン酸塩(硫酸塩、セレン酸塩、テルル酸塩)", "Phosphates & Arsenates": "リン酸塩 & ヒ酸塩", "Hydrides & Hydroxides": "水素化物 & 水酸化物", "Zintl Phases (Pnictides)": "ジントル相(プニクタイド)", "Chalcogenides (Sulfides, Selenides, Tellurides)": "カルコゲナイド(硫化物、セレン化物、テルル化物)", "Intermetallics & Zintl Compounds (Al/Ga/Ge/Si)": "金属間化合物 & ジントル化合物(Al/Ga/Ge/Si)", "Perovskite Oxides (Ru, Ir, Pt Group)": "ペロブスカイト酸化物(Ru, Ir, Pt族)", "Nickel Intermetallics": "ニッケル金属間化合物", "Nitrides & Carbonitrides": "窒化物 & 炭窒化物", "Heavy Halides (Bromides & Iodides)": "重ハロゲン化物(臭化物 & ヨウ化物)", "Chlorides": "塩化物", "Borides & Silicides": "ホウ化物 & ケイ化物", "Platinum Group Metal Borides": "白金族金属ホウ化物", "Transition Metal Oxides (Mn, Fe, Co, Ni)": "遷移金属酸化物(Mn, Fe, Co, Ni)", "Perovskite-type Tantalates, Niobates & Titanates": "ペロブスカイト型タンタル酸塩、ニオブ酸塩 & チタン酸塩", "Alkali Metal Oxides": "アルカリ金属酸化物", "Silicates & Aluminosilicates": "ケイ酸塩 & アルミノケイ酸塩", "Borates": "ホウ酸塩", } def get_electronic_type(avg_gap, lang="en"): """Classify electronic type based on average band gap.""" t = TRANSLATIONS[lang] if avg_gap < 0.1: return t["type_metallic"] elif avg_gap > 3.0: return t["type_insulator"] else: return t["type_semiconductor"] def translate_family(family, lang): """Translate family name to target language.""" if lang == "ja": return FAMILY_TRANSLATIONS_JA.get(family, family) return family def generate_summary_table(df, sorted_families, lang="en"): """Generate HTML summary table of all families.""" t = TRANSLATIONS[lang] rows = [] total = len(df) for family in sorted_families: subset = df[df["Family"] == family] count = len(subset) pct = count / total * 100 avg_gap = subset["BandGap"].mean() e_type = get_electronic_type(avg_gap, lang) # Get cluster ID (first one if multiple map to same family) cluster_id = subset["ClusterID"].iloc[0] # Translate family name for display family_display = translate_family(family, lang) rows.append( f"{cluster_id}{family_display}{count:,}" f"{pct:.1f}%{avg_gap:.2f} eV{e_type}" ) table_html = f""" {"".join(rows)}
{t["col_id"]} {t["col_family"]} {t["col_count"]} {t["col_percent"]} {t["col_bandgap"]} {t["col_type"]}
""" return table_html def regenerate_map(lang="en"): """Generate the labeled map in the specified language.""" t = TRANSLATIONS[lang] output_html = t["output_file"] if not os.path.exists(INPUT_LABELS): print(f"Error: {INPUT_LABELS} not found. Run Phase 2 analysis first.") return if not os.path.exists(INPUT_CSV): print(f"Error: {INPUT_CSV} not found. Run clustering script first.") return print(f"Loading data and labels (lang={lang})...") df = pd.read_csv(INPUT_CSV) with open(INPUT_LABELS, "r") as f: label_map = json.load(f) # Map IDs to Scientific Names # Convert Cluster column to string to match JSON keys df["ClusterID"] = df["Cluster"].astype(str) # Apply mapping; Default to "Unclassified" if not found df["Family"] = df["ClusterID"].map(label_map).fillna("Unclassified") # Count materials per family for sorting family_counts = df["Family"].value_counts() df["FamilySize"] = df["Family"].map(family_counts) # Sort by family size (largest first), then by family name for consistent colors df = df.sort_values(["FamilySize", "Family"], ascending=[False, True]) # Get unique families sorted by size sorted_families = ( df.groupby("Family")["FamilySize"] .first() .sort_values(ascending=False) .index.tolist() ) print(f" Materials: {len(df):,}") print(f" Families: {len(sorted_families)}") # Translate family names for display in chart if lang != "en": df["FamilyDisplay"] = df["Family"].apply(lambda x: translate_family(x, lang)) sorted_families_display = [translate_family(f, lang) for f in sorted_families] color_col = "FamilyDisplay" else: sorted_families_display = sorted_families color_col = "Family" print("\nGenerating Labeled Map...") fig = px.scatter( df, x="x", y="y", color=color_col, category_orders={color_col: sorted_families_display}, # Sort legend by size hover_data=["Formula", "MP_ID", "BandGap", "ClusterID"], title=t["chart_title"], template="plotly_white", render_mode="webgl", width=1400, height=900, ) # Update layout for better visualization fig.update_traces(marker=dict(size=3, opacity=0.7)) fig.update_layout( legend=dict( orientation="v", yanchor="top", y=1, xanchor="left", x=1.02, title=t["legend_title"], font=dict(size=10), ), margin=dict(r=280 if lang == "ja" else 250), # More space for Japanese ) # Generate summary table summary_table = generate_summary_table(df, sorted_families, lang) # Get Plotly chart as HTML div chart_html = fig.to_html(full_html=False, include_plotlyjs="cdn") # Create full HTML with chart and summary table full_html = f""" {t["title"]}

{t["title"]}

{t["subtitle"]}

{chart_html}

{t["family_summary"]}

{summary_table}
""" # Ensure output directory exists os.makedirs(os.path.dirname(output_html), exist_ok=True) with open(output_html, "w", encoding="utf-8") as f: f.write(full_html) file_size_mb = os.path.getsize(output_html) / (1024 * 1024) print(f"\nLabeled Map saved to: {output_html}") print(f" File size: {file_size_mb:.1f} MB") # Print family distribution print("\nFamily Distribution:") for family in sorted_families[:10]: count = family_counts[family] pct = count / len(df) * 100 print(f" {family}: {count:,} ({pct:.1f}%)") if len(sorted_families) > 10: print(f" ... and {len(sorted_families) - 10} more families") if __name__ == "__main__": parser = argparse.ArgumentParser(description="Generate labeled Material Universe map") parser.add_argument( "--lang", choices=["en", "ja"], default="en", help="Language for labels (en=English, ja=Japanese)", ) args = parser.parse_args() regenerate_map(lang=args.lang)