"""
Apply Scientific Labels to Material Universe Map

Reads cluster definitions and regenerates the visualization with
human-readable family names instead of cluster IDs, plus a summary table.

Usage:
    uv run python scripts/apply_labels_to_map.py [--lang ja]

Reads from:
    - material_universe_cache/materials_clustered.csv
    - docs/cluster_definitions.json

Outputs:
    - docs/material_universe_map_labeled.html (English)
    - docs/material_universe_map_labeled_ja.html (Japanese, with --lang ja)
"""

import argparse
import json
import os

import pandas as pd
import plotly.express as px

INPUT_CSV = "material_universe_cache/materials_clustered.csv"
INPUT_LABELS = "docs/cluster_definitions.json"

# Localization strings, keyed by language code and then by UI string id.
TRANSLATIONS = {
    "en": {
        "title": "The Material Universe",
        "subtitle": "33,973 stable crystals clustered into 21 material families using Tri-Fusion embeddings (Orb-v3 + MEGNet + OFM)",
        "chart_title": "The Material Universe: 33,973 Crystals in 21 Families",
        "family_summary": "Family Summary",
        "legend_title": "Material Family",
        "col_id": "ID",
        "col_family": "Material Family",
        "col_count": "Count",
        "col_percent": "%",
        "col_bandgap": "Avg Band Gap",
        "col_type": "Type",
        "type_metallic": "Metallic",
        "type_semiconductor": "Semiconductor",
        "type_insulator": "Insulator",
        # Shown when a family has no usable band-gap data (mean is NaN).
        "type_unknown": "Unknown",
        "output_file": "docs/material_universe_map_labeled.html",
    },
    "ja": {
        "title": "マテリアル・ユニバース",
        "subtitle": "Tri-Fusion埋め込み(Orb-v3 + MEGNet + OFM)を用いて21の材料ファミリーにクラスタリングされた33,973の安定結晶",
        "chart_title": "マテリアル・ユニバース: 33,973結晶、21ファミリー",
        "family_summary": "ファミリー概要",
        "legend_title": "材料ファミリー",
        "col_id": "ID",
        "col_family": "材料ファミリー",
        "col_count": "数",
        "col_percent": "%",
        "col_bandgap": "平均バンドギャップ",
        "col_type": "タイプ",
        "type_metallic": "金属",
        "type_semiconductor": "半導体",
        "type_insulator": "絶縁体",
        # Shown when a family has no usable band-gap data (mean is NaN).
        "type_unknown": "不明",
        "output_file": "docs/material_universe_map_labeled_ja.html",
    },
}

# Japanese translations for family names
FAMILY_TRANSLATIONS_JA = {
    "Unclassified / Noise": "未分類 / ノイズ",
    "Intermetallic Alloys (Li/Mg/Rare Earth)": "金属間化合物(Li/Mg/希土類)",
    "Alkali Metal Fluorides": "アルカリ金属フッ化物",
    "Molybdates & Vanadates": "モリブデン酸塩 & バナジン酸塩",
    "Chalcogenide Oxysalts (Sulfates, Selenates, Tellurates)": "カルコゲン酸塩(硫酸塩、セレン酸塩、テルル酸塩)",
    "Phosphates & Arsenates": "リン酸塩 & ヒ酸塩",
    "Hydrides & Hydroxides": "水素化物 & 水酸化物",
    "Zintl Phases (Pnictides)": "ジントル相(プニクタイド)",
    "Chalcogenides (Sulfides, Selenides, Tellurides)": "カルコゲナイド(硫化物、セレン化物、テルル化物)",
    "Intermetallics & Zintl Compounds (Al/Ga/Ge/Si)": "金属間化合物 & ジントル化合物(Al/Ga/Ge/Si)",
    "Perovskite Oxides (Ru, Ir, Pt Group)": "ペロブスカイト酸化物(Ru, Ir, Pt族)",
    "Nickel Intermetallics": "ニッケル金属間化合物",
    "Nitrides & Carbonitrides": "窒化物 & 炭窒化物",
    "Heavy Halides (Bromides & Iodides)": "重ハロゲン化物(臭化物 & ヨウ化物)",
    "Chlorides": "塩化物",
    "Borides & Silicides": "ホウ化物 & ケイ化物",
    "Platinum Group Metal Borides": "白金族金属ホウ化物",
    "Transition Metal Oxides (Mn, Fe, Co, Ni)": "遷移金属酸化物(Mn, Fe, Co, Ni)",
    "Perovskite-type Tantalates, Niobates & Titanates": "ペロブスカイト型タンタル酸塩、ニオブ酸塩 & チタン酸塩",
    "Alkali Metal Oxides": "アルカリ金属酸化物",
    "Silicates & Aluminosilicates": "ケイ酸塩 & アルミノケイ酸塩",
    "Borates": "ホウ酸塩",
}


def get_electronic_type(avg_gap, lang="en", metallic_below=0.1, insulator_above=3.0):
    """Classify electronic type based on average band gap.

    Args:
        avg_gap: Average band gap of a family. May be NaN/None when the
            family has no band-gap data (e.g. the mean of an empty or
            all-NaN column).
        lang: Key into ``TRANSLATIONS`` selecting the label language.
        metallic_below: Gaps strictly below this value are labelled metallic.
            Expressed in the same units as ``avg_gap`` (presumably eV --
            confirm against the input CSV).
        insulator_above: Gaps strictly above this value are labelled
            insulator; everything in between is a semiconductor.

    Returns:
        The localized label string for the electronic type.

    Raises:
        KeyError: If ``lang`` is not a key of ``TRANSLATIONS``.
    """
    t = TRANSLATIONS[lang]

    # NaN compares False against both thresholds, so without this guard a
    # family with no band-gap data would be silently labelled a semiconductor.
    if pd.isna(avg_gap):
        return t["type_unknown"]

    if avg_gap < metallic_below:
        return t["type_metallic"]
    if avg_gap > insulator_above:
        return t["type_insulator"]
    return t["type_semiconductor"]


def translate_family(family, lang):
    """Translate family name to target language.

    Only ``lang == "ja"`` triggers a lookup (in ``FAMILY_TRANSLATIONS_JA``);
    names without a Japanese entry, and every other language, are returned
    unchanged.
    """
    if lang == "ja":
        return FAMILY_TRANSLATIONS_JA.get(family, family)
    return family


def generate_summary_table(df, sorted_families, lang="en"):
    """Generate HTML summary table of all families."""
    t = TRANSLATIONS[lang]
    rows = []
    total = len(df)
    for family in sorted_families:
        subset = df[df["Family"] == family]
        count = len(subset)
        pct = count / total * 100
        avg_gap = subset["BandGap"].mean()
        e_type = get_electronic_type(avg_gap, lang)
        # Get cluster ID (first one if multiple map to same family)
        cluster_id = subset["ClusterID"].iloc[0]
        # Translate family name for display
        family_display = translate_family(family, lang)
rows.append( f"
| {t["col_id"]} | {t["col_family"]} | {t["col_count"]} | {t["col_percent"]} | {t["col_bandgap"]} | {t["col_type"]} |
|---|
{t["subtitle"]}