"""
Apply Scientific Labels to Material Universe Map

Reads cluster definitions and regenerates the visualization with
human-readable family names instead of cluster IDs, plus a summary table.

Usage:
    uv run python scripts/apply_labels_to_map.py [--lang ja]

Reads from:
    - material_universe_cache/materials_clustered.csv
    - docs/cluster_definitions.json

Outputs:
    - docs/material_universe_map_labeled.html (English)
    - docs/material_universe_map_labeled_ja.html (Japanese, with --lang ja)
"""

import argparse
import html
import json
import os

import pandas as pd
import plotly.express as px

# Clustered materials table (CSV) loaded by regenerate_map().
INPUT_CSV = "material_universe_cache/materials_clustered.csv"
# JSON file loaded by regenerate_map() and used to map cluster IDs
# (as string keys) to family names.
INPUT_LABELS = "docs/cluster_definitions.json"

# Localization strings, keyed by language code ("en", "ja"). Both languages
# provide the same set of keys:
#   title, subtitle  - page <title>/<h1> text and the paragraph below it
#   chart_title      - title passed to the Plotly scatter figure
#   family_summary   - heading above the summary table
#   legend_title     - title of the chart legend
#   col_*            - summary table column headers
#   type_*           - labels returned by get_electronic_type()
#   output_file      - path the generated HTML page is written to
# NOTE: the crystal and family counts inside "subtitle" and "chart_title" are
# literal text; they are not computed from the loaded data.
TRANSLATIONS = {
    "en": {
        "title": "The Material Universe",
        "subtitle": "33,973 stable crystals clustered into 21 material families using Tri-Fusion embeddings (Orb-v3 + MEGNet + OFM)",
        "chart_title": "The Material Universe: 33,973 Crystals in 21 Families",
        "family_summary": "Family Summary",
        "legend_title": "Material Family",
        "col_id": "ID",
        "col_family": "Material Family",
        "col_count": "Count",
        "col_percent": "%",
        "col_bandgap": "Avg Band Gap",
        "col_type": "Type",
        "type_metallic": "Metallic",
        "type_semiconductor": "Semiconductor",
        "type_insulator": "Insulator",
        "output_file": "docs/material_universe_map_labeled.html",
    },
    "ja": {
        "title": "マテリアル・ユニバース",
        "subtitle": "Tri-Fusion埋め込み（Orb-v3 + MEGNet + OFM）を用いて21の材料ファミリーにクラスタリングされた33,973の安定結晶",
        "chart_title": "マテリアル・ユニバース: 33,973結晶、21ファミリー",
        "family_summary": "ファミリー概要",
        "legend_title": "材料ファミリー",
        "col_id": "ID",
        "col_family": "材料ファミリー",
        "col_count": "数",
        "col_percent": "%",
        "col_bandgap": "平均バンドギャップ",
        "col_type": "タイプ",
        "type_metallic": "金属",
        "type_semiconductor": "半導体",
        "type_insulator": "絶縁体",
        "output_file": "docs/material_universe_map_labeled_ja.html",
    },
}

# Japanese translations for family names, keyed by the English family name.
# translate_family() returns the English name unchanged for any family that
# is missing from this table.
FAMILY_TRANSLATIONS_JA = {
    # Fallback label that regenerate_map() assigns to clusters without an
    # entry in the label file; without this key it would stay in English.
    "Unclassified": "未分類",
    "Unclassified / Noise": "未分類 / ノイズ",
    "Intermetallic Alloys (Li/Mg/Rare Earth)": "金属間化合物（Li/Mg/希土類）",
    "Alkali Metal Fluorides": "アルカリ金属フッ化物",
    "Molybdates & Vanadates": "モリブデン酸塩 & バナジン酸塩",
    "Chalcogenide Oxysalts (Sulfates, Selenates, Tellurates)": "カルコゲン酸塩（硫酸塩、セレン酸塩、テルル酸塩）",
    "Phosphates & Arsenates": "リン酸塩 & ヒ酸塩",
    "Hydrides & Hydroxides": "水素化物 & 水酸化物",
    "Zintl Phases (Pnictides)": "ジントル相（プニクタイド）",
    "Chalcogenides (Sulfides, Selenides, Tellurides)": "カルコゲナイド（硫化物、セレン化物、テルル化物）",
    "Intermetallics & Zintl Compounds (Al/Ga/Ge/Si)": "金属間化合物 & ジントル化合物（Al/Ga/Ge/Si）",
    "Perovskite Oxides (Ru, Ir, Pt Group)": "ペロブスカイト酸化物（Ru, Ir, Pt族）",
    "Nickel Intermetallics": "ニッケル金属間化合物",
    "Nitrides & Carbonitrides": "窒化物 & 炭窒化物",
    "Heavy Halides (Bromides & Iodides)": "重ハロゲン化物（臭化物 & ヨウ化物）",
    "Chlorides": "塩化物",
    "Borides & Silicides": "ホウ化物 & ケイ化物",
    "Platinum Group Metal Borides": "白金族金属ホウ化物",
    "Transition Metal Oxides (Mn, Fe, Co, Ni)": "遷移金属酸化物（Mn, Fe, Co, Ni）",
    "Perovskite-type Tantalates, Niobates & Titanates": "ペロブスカイト型タンタル酸塩、ニオブ酸塩 & チタン酸塩",
    "Alkali Metal Oxides": "アルカリ金属酸化物",
    "Silicates & Aluminosilicates": "ケイ酸塩 & アルミノケイ酸塩",
    "Borates": "ホウ酸塩",
}


def get_electronic_type(avg_gap, lang="en"):
    """Return the localized electronic-type label for an average band gap.

    Args:
        avg_gap: Average band gap to classify.
        lang: Key into ``TRANSLATIONS`` selecting the label language.

    Returns:
        The "type_metallic" label when ``avg_gap`` is below 0.1, the
        "type_insulator" label when it is above 3.0, and the
        "type_semiconductor" label otherwise. A NaN gap fails both
        comparisons and is therefore labelled as a semiconductor.
    """
    labels = TRANSLATIONS[lang]

    if avg_gap < 0.1:
        label_key = "type_metallic"
    elif avg_gap > 3.0:
        label_key = "type_insulator"
    else:
        label_key = "type_semiconductor"

    return labels[label_key]


def translate_family(family, lang):
    """Return the display name of *family* for language *lang*.

    Only Japanese ("ja") has a translation table. Any other language code
    returns the name unchanged, as does a family missing from the table.
    """
    if lang != "ja":
        return family

    translated = FAMILY_TRANSLATIONS_JA.get(family, family)
    return translated


def generate_summary_table(df, sorted_families, lang="en"):
    """Generate the HTML summary table of all families.

    Args:
        df: DataFrame with at least the columns "Family", "BandGap" and
            "ClusterID". Percentages are relative to ``len(df)``.
        sorted_families: Family names (untranslated, as stored in
            ``df["Family"]``) in the order their rows should appear.
        lang: Key into ``TRANSLATIONS`` selecting the header and type
            labels; also passed to ``translate_family`` for family names.

    Returns:
        An HTML ``<table class="summary-table">`` fragment as a string.
        Families that have no rows in ``df`` are omitted.
    """
    t = TRANSLATIONS[lang]
    rows = []
    total = len(df)

    for family in sorted_families:
        subset = df[df["Family"] == family]
        if subset.empty:
            # No rows means no cluster ID or band gap to report; skipping
            # also avoids an IndexError from .iloc[0] below.
            continue

        count = len(subset)
        pct = count / total * 100
        avg_gap = subset["BandGap"].mean()
        e_type = get_electronic_type(avg_gap, lang)

        # Get cluster ID (first one if multiple map to same family).
        # Escaped because it originates from the input CSV.
        cluster_id = html.escape(str(subset["ClusterID"].iloc[0]))

        # Translate the family name for display. Names come from the label
        # file and contain characters such as "&", so escape them before
        # placing them into markup.
        family_display = html.escape(str(translate_family(family, lang)))

        rows.append(
            f"<tr><td>{cluster_id}</td><td>{family_display}</td><td>{count:,}</td>"
            f"<td>{pct:.1f}%</td><td>{avg_gap:.2f} eV</td><td>{e_type}</td></tr>"
        )

    table_html = f"""
    <table class="summary-table">
        <thead>
            <tr>
                <th>{t["col_id"]}</th>
                <th>{t["col_family"]}</th>
                <th>{t["col_count"]}</th>
                <th>{t["col_percent"]}</th>
                <th>{t["col_bandgap"]}</th>
                <th>{t["col_type"]}</th>
            </tr>
        </thead>
        <tbody>
            {"".join(rows)}
        </tbody>
    </table>
    """
    return table_html


def regenerate_map(lang="en"):
    """Generate the labeled map in the specified language.

    Loads ``INPUT_CSV`` and ``INPUT_LABELS``, maps each row's cluster ID to a
    family name, draws a Plotly scatter plot coloured by family, and writes
    an HTML page (chart plus summary table) to the "output_file" path of the
    selected language. Progress and a short family distribution are printed
    to stdout.

    If either input file is missing, an error message is printed and the
    function returns without writing anything.

    Args:
        lang: Key into ``TRANSLATIONS`` ("en" or "ja").

    Returns:
        None.
    """
    t = TRANSLATIONS[lang]
    output_html = t["output_file"]

    if not os.path.exists(INPUT_LABELS):
        print(f"Error: {INPUT_LABELS} not found. Run Phase 2 analysis first.")
        return

    if not os.path.exists(INPUT_CSV):
        print(f"Error: {INPUT_CSV} not found. Run clustering script first.")
        return

    print(f"Loading data and labels (lang={lang})...")
    df = pd.read_csv(INPUT_CSV)
    # Read the labels as UTF-8 explicitly: relying on the platform default
    # encoding would mis-decode non-ASCII family names on some systems.
    with open(INPUT_LABELS, "r", encoding="utf-8") as f:
        label_map = json.load(f)

    # Map IDs to Scientific Names
    # Convert Cluster column to string to match JSON keys
    df["ClusterID"] = df["Cluster"].astype(str)

    # Apply mapping; Default to "Unclassified" if not found
    df["Family"] = df["ClusterID"].map(label_map).fillna("Unclassified")

    # Count materials per family for sorting
    family_counts = df["Family"].value_counts()
    df["FamilySize"] = df["Family"].map(family_counts)

    # Sort by family size (largest first), then by family name for consistent colors
    df = df.sort_values(["FamilySize", "Family"], ascending=[False, True])

    # Get unique families sorted by size
    sorted_families = (
        df.groupby("Family")["FamilySize"]
        .first()
        .sort_values(ascending=False)
        .index.tolist()
    )

    print(f"  Materials: {len(df):,}")
    print(f"  Families: {len(sorted_families)}")

    # Translate family names for display in chart
    if lang != "en":
        df["FamilyDisplay"] = df["Family"].apply(lambda x: translate_family(x, lang))
        sorted_families_display = [translate_family(f, lang) for f in sorted_families]
        color_col = "FamilyDisplay"
    else:
        sorted_families_display = sorted_families
        color_col = "Family"

    print("\nGenerating Labeled Map...")
    fig = px.scatter(
        df,
        x="x",
        y="y",
        color=color_col,
        category_orders={color_col: sorted_families_display},  # Sort legend by size
        hover_data=["Formula", "MP_ID", "BandGap", "ClusterID"],
        title=t["chart_title"],
        template="plotly_white",
        render_mode="webgl",
        width=1400,
        height=900,
    )

    # Update layout for better visualization
    fig.update_traces(marker=dict(size=3, opacity=0.7))
    fig.update_layout(
        legend=dict(
            orientation="v",
            yanchor="top",
            y=1,
            xanchor="left",
            x=1.02,
            title=t["legend_title"],
            font=dict(size=10),
        ),
        margin=dict(r=280 if lang == "ja" else 250),  # More space for Japanese
    )

    # Generate summary table (uses the untranslated family names; the table
    # builder translates them itself)
    summary_table = generate_summary_table(df, sorted_families, lang)

    # Get Plotly chart as HTML div; plotly.js itself is referenced from a CDN
    chart_html = fig.to_html(full_html=False, include_plotlyjs="cdn")

    # Create full HTML with chart and summary table
    full_html = f"""<!DOCTYPE html>
<html lang="{lang}">
<head>
    <meta charset="utf-8">
    <title>{t["title"]}</title>
    <style>
        body {{
            font-family: -apple-system, BlinkMacSystemFont, "Segoe UI", Roboto, "Hiragino Sans", "Noto Sans JP", sans-serif;
            margin: 0;
            padding: 20px;
            background: #fafafa;
        }}
        h1 {{
            color: #333;
            margin-bottom: 10px;
        }}
        .subtitle {{
            color: #666;
            margin-bottom: 20px;
        }}
        .chart-container {{
            background: white;
            border-radius: 8px;
            box-shadow: 0 2px 4px rgba(0,0,0,0.1);
            padding: 20px;
            margin-bottom: 30px;
        }}
        .summary-container {{
            background: white;
            border-radius: 8px;
            box-shadow: 0 2px 4px rgba(0,0,0,0.1);
            padding: 20px;
            max-width: 1000px;
        }}
        h2 {{
            color: #333;
            margin-top: 0;
        }}
        .summary-table {{
            width: 100%;
            border-collapse: collapse;
            font-size: 14px;
        }}
        .summary-table th {{
            background: #f5f5f5;
            padding: 12px 8px;
            text-align: left;
            border-bottom: 2px solid #ddd;
            font-weight: 600;
        }}
        .summary-table td {{
            padding: 10px 8px;
            border-bottom: 1px solid #eee;
        }}
        .summary-table tr:hover {{
            background: #f9f9f9;
        }}
        .summary-table tr:nth-child(even) {{
            background: #fafafa;
        }}
        .summary-table tr:nth-child(even):hover {{
            background: #f5f5f5;
        }}
    </style>
</head>
<body>
    <h1>{t["title"]}</h1>
    <p class="subtitle">{t["subtitle"]}</p>

    <div class="chart-container">
        {chart_html}
    </div>

    <div class="summary-container">
        <h2>{t["family_summary"]}</h2>
        {summary_table}
    </div>
</body>
</html>
"""

    # Ensure output directory exists
    os.makedirs(os.path.dirname(output_html), exist_ok=True)

    with open(output_html, "w", encoding="utf-8") as f:
        f.write(full_html)

    file_size_mb = os.path.getsize(output_html) / (1024 * 1024)
    print(f"\nLabeled Map saved to: {output_html}")
    print(f"  File size: {file_size_mb:.1f} MB")

    # Print family distribution (ten largest families only)
    print("\nFamily Distribution:")
    for family in sorted_families[:10]:
        count = family_counts[family]
        pct = count / len(df) * 100
        print(f"  {family}: {count:,} ({pct:.1f}%)")
    if len(sorted_families) > 10:
        print(f"  ... and {len(sorted_families) - 10} more families")


if __name__ == "__main__":
    # Command-line entry point: choose the output language, then build the map.
    cli = argparse.ArgumentParser(description="Generate labeled Material Universe map")
    cli.add_argument(
        "--lang",
        default="en",
        choices=["en", "ja"],
        help="Language for labels (en=English, ja=Japanese)",
    )
    selected_lang = cli.parse_args().lang
    regenerate_map(lang=selected_lang)