Spaces:
Sleeping
Sleeping
"""
Apply Scientific Labels to Material Universe Map

Reads the cluster definitions and regenerates the visualization with
human-readable family names instead of cluster IDs, plus a summary table.

Usage:
    uv run python scripts/apply_labels_to_map.py [--lang ja]

Reads from:
    - material_universe_cache/materials_clustered.csv
    - docs/cluster_definitions.json

Outputs:
    - docs/material_universe_map_labeled.html (English)
    - docs/material_universe_map_labeled_ja.html (Japanese, with --lang ja)
"""
| import pandas as pd | |
| import json | |
| import plotly.express as px | |
| import os | |
| import argparse | |
# Input artifacts read by regenerate_map(); both are relative paths.
# Clustered materials table, loaded with pandas.
INPUT_CSV = "material_universe_cache/materials_clustered.csv"
# JSON object mapping cluster IDs (as strings) to family names.
INPUT_LABELS = "docs/cluster_definitions.json"
# Localization strings.
# One table per supported language code; both tables expose the same keys.
# "output_file" is the path the rendered page is written to for that language.
_STRINGS_EN = {
    "title": "The Material Universe",
    "subtitle": "33,973 stable crystals clustered into 21 material families using Tri-Fusion embeddings (Orb-v3 + MEGNet + OFM)",
    "chart_title": "The Material Universe: 33,973 Crystals in 21 Families",
    "family_summary": "Family Summary",
    "legend_title": "Material Family",
    # Summary-table column headers
    "col_id": "ID",
    "col_family": "Material Family",
    "col_count": "Count",
    "col_percent": "%",
    "col_bandgap": "Avg Band Gap",
    "col_type": "Type",
    # Electronic-type labels returned by get_electronic_type()
    "type_metallic": "Metallic",
    "type_semiconductor": "Semiconductor",
    "type_insulator": "Insulator",
    "output_file": "docs/material_universe_map_labeled.html",
}

_STRINGS_JA = {
    "title": "マテリアル・ユニバース",
    "subtitle": "Tri-Fusion埋め込み(Orb-v3 + MEGNet + OFM)を用いて21の材料ファミリーにクラスタリングされた33,973の安定結晶",
    "chart_title": "マテリアル・ユニバース: 33,973結晶、21ファミリー",
    "family_summary": "ファミリー概要",
    "legend_title": "材料ファミリー",
    # Summary-table column headers
    "col_id": "ID",
    "col_family": "材料ファミリー",
    "col_count": "数",
    "col_percent": "%",
    "col_bandgap": "平均バンドギャップ",
    "col_type": "タイプ",
    # Electronic-type labels returned by get_electronic_type()
    "type_metallic": "金属",
    "type_semiconductor": "半導体",
    "type_insulator": "絶縁体",
    "output_file": "docs/material_universe_map_labeled_ja.html",
}

TRANSLATIONS = {
    "en": _STRINGS_EN,
    "ja": _STRINGS_JA,
}
# Japanese translations for family names.
# Keys are the English family names exactly as they appear in the cluster
# definitions; translate_family() falls back to the English name for any key
# that is missing here.
FAMILY_TRANSLATIONS_JA = {
    # "Unclassified" is the fallback label regenerate_map() assigns to cluster
    # IDs that have no entry in the label file, so it needs its own
    # translation in addition to the explicit "Unclassified / Noise" family.
    "Unclassified": "未分類",
    "Unclassified / Noise": "未分類 / ノイズ",
    "Intermetallic Alloys (Li/Mg/Rare Earth)": "金属間化合物(Li/Mg/希土類)",
    "Alkali Metal Fluorides": "アルカリ金属フッ化物",
    "Molybdates & Vanadates": "モリブデン酸塩 & バナジン酸塩",
    "Chalcogenide Oxysalts (Sulfates, Selenates, Tellurates)": "カルコゲン酸塩(硫酸塩、セレン酸塩、テルル酸塩)",
    "Phosphates & Arsenates": "リン酸塩 & ヒ酸塩",
    "Hydrides & Hydroxides": "水素化物 & 水酸化物",
    "Zintl Phases (Pnictides)": "ジントル相(プニクタイド)",
    "Chalcogenides (Sulfides, Selenides, Tellurides)": "カルコゲナイド(硫化物、セレン化物、テルル化物)",
    "Intermetallics & Zintl Compounds (Al/Ga/Ge/Si)": "金属間化合物 & ジントル化合物(Al/Ga/Ge/Si)",
    "Perovskite Oxides (Ru, Ir, Pt Group)": "ペロブスカイト酸化物(Ru, Ir, Pt族)",
    "Nickel Intermetallics": "ニッケル金属間化合物",
    "Nitrides & Carbonitrides": "窒化物 & 炭窒化物",
    "Heavy Halides (Bromides & Iodides)": "重ハロゲン化物(臭化物 & ヨウ化物)",
    "Chlorides": "塩化物",
    "Borides & Silicides": "ホウ化物 & ケイ化物",
    "Platinum Group Metal Borides": "白金族金属ホウ化物",
    "Transition Metal Oxides (Mn, Fe, Co, Ni)": "遷移金属酸化物(Mn, Fe, Co, Ni)",
    "Perovskite-type Tantalates, Niobates & Titanates": "ペロブスカイト型タンタル酸塩、ニオブ酸塩 & チタン酸塩",
    "Alkali Metal Oxides": "アルカリ金属酸化物",
    "Silicates & Aluminosilicates": "ケイ酸塩 & アルミノケイ酸塩",
    "Borates": "ホウ酸塩",
}
def get_electronic_type(avg_gap, lang="en"):
    """Return the localized electronic-type label for an average band gap.

    Args:
        avg_gap: Average band gap of a family (numeric).
        lang: Language code used to index TRANSLATIONS.

    Returns:
        The "type_metallic" string when avg_gap < 0.1, the "type_insulator"
        string when avg_gap > 3.0, and the "type_semiconductor" string
        otherwise. A NaN gap fails both comparisons and therefore also maps
        to the semiconductor label.

    Raises:
        KeyError: If lang is not a key of TRANSLATIONS.
    """
    strings = TRANSLATIONS[lang]
    if avg_gap < 0.1:
        label_key = "type_metallic"
    elif avg_gap > 3.0:
        label_key = "type_insulator"
    else:
        label_key = "type_semiconductor"
    return strings[label_key]
def translate_family(family, lang):
    """Return the display name of a family in the requested language.

    Only "ja" has a translation table; every other language code returns the
    family name unchanged. A Japanese lookup that finds no entry also falls
    back to the original name.
    """
    if lang != "ja":
        return family
    return FAMILY_TRANSLATIONS_JA.get(family, family)
def generate_summary_table(df, sorted_families, lang="en"):
    """Generate the HTML summary table of all families.

    Args:
        df: DataFrame with at least the columns "Family", "BandGap" and
            "ClusterID" (one row per material).
        sorted_families: Family names (untranslated, as stored in
            df["Family"]) in the order the rows should appear.
        lang: Language code used for the column headers, the electronic-type
            label and the displayed family name.

    Returns:
        An HTML string containing a <table class="summary-table"> with one
        row per family: cluster ID, family name, material count, share of all
        rows in df, mean band gap and electronic type.

    Raises:
        KeyError: If lang is not a key of TRANSLATIONS.
    """
    # Local import keeps this block self-contained; the module-level imports
    # do not include the html package.
    from html import escape

    t = TRANSLATIONS[lang]
    total = len(df)
    rows = []
    for family in sorted_families:
        subset = df[df["Family"] == family]
        count = len(subset)
        if count == 0:
            # Nothing to report for a family without rows; without this guard
            # the .iloc[0] lookup below would raise IndexError.
            continue
        pct = count / total * 100
        avg_gap = subset["BandGap"].mean()
        e_type = get_electronic_type(avg_gap, lang)
        # Get cluster ID (first one if multiple clusters map to the same family)
        cluster_id = subset["ClusterID"].iloc[0]
        # Translate family name for display
        family_display = translate_family(family, lang)
        # Family names and IDs originate from data files and routinely contain
        # "&"; escape them so they are emitted as text rather than markup.
        rows.append(
            f"<tr><td>{escape(str(cluster_id))}</td>"
            f"<td>{escape(str(family_display))}</td><td>{count:,}</td>"
            f"<td>{pct:.1f}%</td><td>{avg_gap:.2f} eV</td>"
            f"<td>{escape(str(e_type))}</td></tr>"
        )

    column_keys = (
        "col_id",
        "col_family",
        "col_count",
        "col_percent",
        "col_bandgap",
        "col_type",
    )
    header_cells = "\n".join(f"<th>{t[key]}</th>" for key in column_keys)
    body_rows = "".join(rows)
    table_html = f"""
<table class="summary-table">
<thead>
<tr>
{header_cells}
</tr>
</thead>
<tbody>
{body_rows}
</tbody>
</table>
"""
    return table_html
def _render_page(lang, t, chart_html, summary_table):
    """Wrap the chart div and the summary table in a standalone HTML page."""
    title = t["title"]
    subtitle = t["subtitle"]
    summary_heading = t["family_summary"]
    return f"""<!DOCTYPE html>
<html lang="{lang}">
<head>
<meta charset="utf-8">
<title>{title}</title>
<style>
body {{
    font-family: -apple-system, BlinkMacSystemFont, "Segoe UI", Roboto, "Hiragino Sans", "Noto Sans JP", sans-serif;
    margin: 0;
    padding: 20px;
    background: #fafafa;
}}
h1 {{
    color: #333;
    margin-bottom: 10px;
}}
.subtitle {{
    color: #666;
    margin-bottom: 20px;
}}
.chart-container {{
    background: white;
    border-radius: 8px;
    box-shadow: 0 2px 4px rgba(0,0,0,0.1);
    padding: 20px;
    margin-bottom: 30px;
}}
.summary-container {{
    background: white;
    border-radius: 8px;
    box-shadow: 0 2px 4px rgba(0,0,0,0.1);
    padding: 20px;
    max-width: 1000px;
}}
h2 {{
    color: #333;
    margin-top: 0;
}}
.summary-table {{
    width: 100%;
    border-collapse: collapse;
    font-size: 14px;
}}
.summary-table th {{
    background: #f5f5f5;
    padding: 12px 8px;
    text-align: left;
    border-bottom: 2px solid #ddd;
    font-weight: 600;
}}
.summary-table td {{
    padding: 10px 8px;
    border-bottom: 1px solid #eee;
}}
.summary-table tr:hover {{
    background: #f9f9f9;
}}
.summary-table tr:nth-child(even) {{
    background: #fafafa;
}}
.summary-table tr:nth-child(even):hover {{
    background: #f5f5f5;
}}
</style>
</head>
<body>
<h1>{title}</h1>
<p class="subtitle">{subtitle}</p>
<div class="chart-container">
{chart_html}
</div>
<div class="summary-container">
<h2>{summary_heading}</h2>
{summary_table}
</div>
</body>
</html>
"""


def regenerate_map(lang="en"):
    """Generate the labeled map in the specified language.

    Loads the clustered materials CSV and the cluster-ID -> family-name JSON,
    attaches a family name to every row, renders a Plotly scatter plot
    coloured by family together with a summary table, and writes the result
    as a single HTML page to the language's "output_file" path.

    Args:
        lang: Language code; must be a key of TRANSLATIONS.

    Returns:
        None. If either input file is missing, an error is printed and the
        function returns without writing anything.

    Raises:
        KeyError: If lang is not a key of TRANSLATIONS.
    """
    t = TRANSLATIONS[lang]
    output_html = t["output_file"]

    if not os.path.exists(INPUT_LABELS):
        print(f"Error: {INPUT_LABELS} not found. Run Phase 2 analysis first.")
        return
    if not os.path.exists(INPUT_CSV):
        print(f"Error: {INPUT_CSV} not found. Run clustering script first.")
        return

    print(f"Loading data and labels (lang={lang})...")
    df = pd.read_csv(INPUT_CSV)
    # Decode explicitly as UTF-8: the platform default encoding is not
    # guaranteed to handle non-ASCII family names in the label file.
    with open(INPUT_LABELS, "r", encoding="utf-8") as f:
        label_map = json.load(f)

    # Map IDs to scientific names.
    # JSON object keys are always strings, so compare against a string copy
    # of the Cluster column.
    df["ClusterID"] = df["Cluster"].astype(str)
    # Apply mapping; default to "Unclassified" if the ID has no label
    df["Family"] = df["ClusterID"].map(label_map).fillna("Unclassified")

    # Count materials per family for sorting
    family_counts = df["Family"].value_counts()
    df["FamilySize"] = df["Family"].map(family_counts)
    # Sort by family size (largest first), then by family name for consistent colors
    df = df.sort_values(["FamilySize", "Family"], ascending=[False, True])

    # Unique families ordered by size (largest first)
    sorted_families = (
        df.groupby("Family")["FamilySize"]
        .first()
        .sort_values(ascending=False)
        .index.tolist()
    )
    print(f" Materials: {len(df):,}")
    print(f" Families: {len(sorted_families)}")

    # Translate family names for display in the chart; English uses the
    # "Family" column directly.
    if lang != "en":
        df["FamilyDisplay"] = df["Family"].apply(lambda x: translate_family(x, lang))
        sorted_families_display = [translate_family(f, lang) for f in sorted_families]
        color_col = "FamilyDisplay"
    else:
        sorted_families_display = sorted_families
        color_col = "Family"

    print("\nGenerating Labeled Map...")
    fig = px.scatter(
        df,
        x="x",
        y="y",
        color=color_col,
        category_orders={color_col: sorted_families_display},  # Sort legend by size
        hover_data=["Formula", "MP_ID", "BandGap", "ClusterID"],
        title=t["chart_title"],
        template="plotly_white",
        render_mode="webgl",
        width=1400,
        height=900,
    )
    # Small, semi-transparent markers keep dense regions readable
    fig.update_traces(marker=dict(size=3, opacity=0.7))
    fig.update_layout(
        legend=dict(
            orientation="v",
            yanchor="top",
            y=1,
            xanchor="left",
            x=1.02,
            title=t["legend_title"],
            font=dict(size=10),
        ),
        margin=dict(r=280 if lang == "ja" else 250),  # More space for Japanese
    )

    # The summary table is keyed by the untranslated family names
    summary_table = generate_summary_table(df, sorted_families, lang)
    # Get the Plotly chart as an HTML div (plotly.js is loaded from the CDN)
    chart_html = fig.to_html(full_html=False, include_plotlyjs="cdn")
    full_html = _render_page(lang, t, chart_html, summary_table)

    # Ensure the output directory exists; os.makedirs("") raises, so skip it
    # when the output path has no directory component.
    output_dir = os.path.dirname(output_html)
    if output_dir:
        os.makedirs(output_dir, exist_ok=True)
    with open(output_html, "w", encoding="utf-8") as f:
        f.write(full_html)

    file_size_mb = os.path.getsize(output_html) / (1024 * 1024)
    print(f"\nLabeled Map saved to: {output_html}")
    print(f" File size: {file_size_mb:.1f} MB")

    # Print the ten largest families
    print("\nFamily Distribution:")
    for family in sorted_families[:10]:
        count = family_counts[family]
        pct = count / len(df) * 100
        print(f" {family}: {count:,} ({pct:.1f}%)")
    if len(sorted_families) > 10:
        print(f" ... and {len(sorted_families) - 10} more families")
if __name__ == "__main__":
    # Command-line entry point: choose the label language, then build the map.
    cli = argparse.ArgumentParser(
        description="Generate labeled Material Universe map"
    )
    cli.add_argument(
        "--lang",
        default="en",
        choices=["en", "ja"],
        help="Language for labels (en=English, ja=Japanese)",
    )
    options = cli.parse_args()
    regenerate_map(lang=options.lang)