import gradio as gr
import pandas as pd
import json
import plotly.graph_objects as go
from plotly.subplots import make_subplots
import numpy as np
import os
import datetime
# Import submission handling functions
from submission import add_new_submission
# Optional imports with fallbacks
try:
    from content import format_error, format_warning, format_log
except ImportError:
    # Minimal stand-ins so the app still renders status messages when the
    # project-local `content` module is unavailable.
    # FIX: the original `format_log` f-string literal was split across two
    # physical lines (a syntax error); all three are restored as single-line
    # Markdown formatters with their status emoji reconstructed.
    def format_error(msg):
        """Return *msg* as a Markdown error banner."""
        return f"❌ **Error:** {msg}"

    def format_warning(msg):
        """Return *msg* as a Markdown warning banner."""
        return f"⚠️ **Warning:** {msg}"

    def format_log(msg):
        """Return *msg* as a Markdown success/log line."""
        return f"✅ {msg}"
# Configuration
# Auth token for Hugging Face API calls: HF_TOKEN takes precedence, then TOKEN.
TOKEN = os.environ.get("HF_TOKEN") or os.environ.get("TOKEN", None)
OWNER = "Pettingllms"  # HF organisation hosting the benchmark assets
GROUNDTRUTH_PATH = f"{OWNER}/AMA-bench"  # HF repo id for the groundtruth dataset
LOCAL_DEBUG = True  # NOTE(review): not referenced in this chunk — confirm use before removing
# ---------------------------------------------------------------------------
# Data loading
# ---------------------------------------------------------------------------
def load_jsonl_data(path):
    """Load JSONL data from file.

    Returns a list of parsed records; missing files yield an empty list
    (with a console warning) instead of raising.
    """
    if not os.path.exists(path):
        print(f"Warning: {path} not found, returning empty list")
        return []
    records = []
    with open(path, "r", encoding="utf-8") as fh:
        for raw in fh:
            stripped = raw.strip()
            if not stripped:
                continue  # skip blank lines between records
            records.append(json.loads(stripped))
    return records
def load_qa_distribution():
    """Load QA distribution data.

    Returns the parsed JSON object from ``data/qa_distribution.json`` or
    ``None`` when the file does not exist.
    """
    path = "data/qa_distribution.json"
    if not os.path.exists(path):
        return None
    with open(path, "r", encoding="utf-8") as fh:
        return json.load(fh)
def convert_jsonl_to_dict(jsonl_data, is_agent=False):
    """
    Convert JSONL data to the dictionary format used by visualization functions.

    Args:
        jsonl_data: List of dictionaries from JSONL file. Each entry is
            expected to carry a name ("agent_name" or "model"), optional
            "model_family"/"verified" fields, and a "Score" mapping of
            domain -> list of single-key dicts like [{"A": 0.5}, {"B": 0.6}].
        is_agent: Boolean indicating if this is agent data (selects which
            name field is read).

    Returns:
        Three dictionaries: capability_dict ({capability: {name: scores}}),
        domain_dict ({domain: {name: scores}}), and verified_dict
        ({name: bool}).

    NOTE: reads the module-level global QA_DISTRIBUTION for weighting; the
    module assigns it before this function is first called.
    """
    capability_dict = {
        "Recall": {},
        "Causal Inference": {},
        "State Updating": {},
        "State Abstraction": {}
    }
    domain_dict = {
        "TEXT2SQL": {},
        "SOFTWARE": {},
        "WEB": {},
        "GAME": {},
        "EMBODIED_AI": {},
        "OPENWORLD_QA": {}
    }
    # Store verified status for each model/agent
    verified_dict = {}
    # Letter codes used in the raw score dicts -> human-readable capability names.
    capability_mapping = {
        "A": "Recall",
        "B": "Causal Inference",
        "C": "State Updating",
        "D": "State Abstraction"
    }
    for entry in jsonl_data:
        name = entry.get("agent_name") if is_agent else entry.get("model")
        if not name:
            # Entries without a usable name cannot be keyed; skip them.
            continue
        model_family = entry.get("model_family", "")
        verified = entry.get("verified", False)
        scores = entry.get("Score", {})
        # Store verified status
        verified_dict[name] = verified
        # Process each domain
        for domain, domain_scores in scores.items():
            # domain_scores is a list like [{"A": 0.5}, {"B": 0.6}, {"C": 0.7}, {"D": 0.8}]
            if domain not in domain_dict:
                # Unknown domains are silently ignored.
                continue
            # Flatten the list of single-key dicts into {letter: score}.
            capability_scores_for_domain = {}
            for score_dict in domain_scores:
                for cap_letter, score_value in score_dict.items():
                    capability_scores_for_domain[cap_letter] = score_value
            # Calculate weighted average score for this domain using ratio_in_domain
            avg_domain_score = 0
            if QA_DISTRIBUTION and domain in QA_DISTRIBUTION.get("domain_distribution", {}):
                domain_info = QA_DISTRIBUTION["domain_distribution"][domain]
                problem_types = domain_info.get("problem_types", {})
                weighted_sum = 0
                weight_total = 0
                for cap_letter, score_value in capability_scores_for_domain.items():
                    if cap_letter in problem_types:
                        weight = problem_types[cap_letter].get("ratio_in_domain", 0.0)
                        weighted_sum += score_value * weight
                        weight_total += weight
                avg_domain_score = weighted_sum / weight_total if weight_total > 0 else 0
            else:
                # Fallback to simple average if no distribution data
                domain_score_values = list(capability_scores_for_domain.values())
                avg_domain_score = sum(domain_score_values) / len(domain_score_values) if domain_score_values else 0
            # Store in domain_dict
            domain_dict[domain][name] = {
                "accuracy": avg_domain_score,
                "model_family": model_family,
                "f1": avg_domain_score  # For now, use same value for f1
            }
            # Store in capability_dict with ratio_overall for later weighted averaging
            for cap_letter, score_value in capability_scores_for_domain.items():
                capability_name = capability_mapping.get(cap_letter)
                if capability_name and capability_name in capability_dict:
                    if name not in capability_dict[capability_name]:
                        # Accumulators; normalized by "weight_sum" at the end.
                        capability_dict[capability_name][name] = {
                            "accuracy": 0,
                            "model_family": model_family,
                            "f1": 0,
                            "weight_sum": 0
                        }
                    # Use ratio_overall as weight for this capability score
                    weight = 0
                    if QA_DISTRIBUTION and domain in QA_DISTRIBUTION.get("domain_distribution", {}):
                        domain_info = QA_DISTRIBUTION["domain_distribution"][domain]
                        problem_types = domain_info.get("problem_types", {})
                        if cap_letter in problem_types:
                            weight = problem_types[cap_letter].get("ratio_overall", 0.0)
                    else:
                        # Fallback: equal weight across domains
                        weight = 1.0 / 6  # 6 domains
                    capability_dict[capability_name][name]["accuracy"] += score_value * weight
                    capability_dict[capability_name][name]["f1"] += score_value * weight
                    capability_dict[capability_name][name]["weight_sum"] += weight
    # Calculate weighted averages for capability scores
    for capability_name, models in capability_dict.items():
        for model_name, model_data in models.items():
            weight_sum = model_data.get("weight_sum", 1)
            model_data["accuracy"] = model_data["accuracy"] / weight_sum if weight_sum > 0 else 0
            model_data["f1"] = model_data["f1"] / weight_sum if weight_sum > 0 else 0
            # Drop the internal accumulator before handing data to the UI.
            del model_data["weight_sum"]
    return capability_dict, domain_dict, verified_dict
# Load all data files at import time so the UI can render immediately.
AGENT_DATA = load_jsonl_data("data/agent.jsonl")
MODEL_DATA = load_jsonl_data("data/model.jsonl")
# Must be assigned before convert_jsonl_to_dict runs (it reads this global).
QA_DISTRIBUTION = load_qa_distribution()
# Convert to dictionary format for visualization
AGENT_CAPABILITY, AGENT_DOMAIN, AGENT_VERIFIED = convert_jsonl_to_dict(AGENT_DATA, is_agent=True)
MODEL_CAPABILITY, MODEL_DOMAIN, MODEL_VERIFIED = convert_jsonl_to_dict(MODEL_DATA, is_agent=False)
METRICS = ["Recall", "Causal Inference", "State Updating", "State Abstraction"]
# Weighted ratios (from benchmark data distribution)
# Use QA distribution if available, otherwise use hardcoded values
if QA_DISTRIBUTION:
    domain_dist = QA_DISTRIBUTION.get("domain_distribution", {})
    # Share of all QA pairs per domain.
    DOMAIN_RATIO = {
        key: value.get("qa_ratio", 0) for key, value in domain_dist.items()
    }
    problem_types = QA_DISTRIBUTION.get("overall_distribution", {}).get("problem_types", {})
    # Share of all QA pairs per problem type (A/B/C/D letter codes).
    PROBLEM_TYPE_RATIO = {
        "RECALL": problem_types.get("A", {}).get("ratio", 0.336),
        "CAUSAL_INFERENCE": problem_types.get("B", {}).get("ratio", 0.239),
        "STATE_UPDATING": problem_types.get("C", {}).get("ratio", 0.259),
        "STATE_ABSTRACTION": problem_types.get("D", {}).get("ratio", 0.166),
    }
else:
    # Fallback to hardcoded values (counts out of the 2,496-question benchmark).
    DOMAIN_RATIO = {
        "TEXT2SQL": 612 / 2496,
        "SOFTWARE": 432 / 2496,
        "WEB": 372 / 2496,
        "GAME": 360 / 2496,
        "EMBODIED_AI": 360 / 2496,
        "OPENWORLD_QA": 360 / 2496,
    }
    PROBLEM_TYPE_RATIO = {
        "RECALL": 839 / 2496,
        "CAUSAL_INFERENCE": 596 / 2496,
        "STATE_UPDATING": 647 / 2496,
        "STATE_ABSTRACTION": 414 / 2496,
    }
def _normalize_category_key(name: str) -> str:
"""Normalize category key to uppercase snake-style for matching."""
return str(name).strip().upper().replace(" ", "_").replace("-", "_")
def get_category_weights(categories):
    """Return normalized per-category weights based on configured ratios.

    Detects whether *categories* are domains or problem types by counting
    matches against each ratio table, then normalizes the matched ratios to
    sum to 1. Falls back to uniform weights when nothing matches.
    """
    if not categories:
        return {}
    normalized = [_normalize_category_key(c) for c in categories]
    # Pick the ratio table with the most key matches (ties go to domains).
    domain_hits = sum(key in DOMAIN_RATIO for key in normalized)
    type_hits = sum(key in PROBLEM_TYPE_RATIO for key in normalized)
    ratio_table = DOMAIN_RATIO if domain_hits >= type_hits else PROBLEM_TYPE_RATIO
    weights = {
        original: ratio_table.get(key, 0.0)
        for original, key in zip(categories, normalized)
    }
    total = sum(weights.values())
    if total <= 0:
        uniform = 1.0 / len(categories)
        return {c: uniform for c in categories}
    return {c: w / total for c, w in weights.items()}
def get_ratio_overall_weights():
    """
    Get weights based on ratio_overall from qa_distribution.json.
    Returns a nested dict: {domain: {capability: ratio_overall}}
    """
    if not QA_DISTRIBUTION:
        return {}
    letter_to_capability = {
        "A": "Recall",
        "B": "Causal Inference",
        "C": "State Updating",
        "D": "State Abstraction",
    }
    weights = {}
    for domain, domain_data in QA_DISTRIBUTION.get("domain_distribution", {}).items():
        per_capability = {}
        for letter, cap_data in domain_data.get("problem_types", {}).items():
            cap_name = letter_to_capability.get(letter)
            if cap_name:  # ignore unknown letter codes
                per_capability[cap_name] = cap_data.get("ratio_overall", 0.0)
        weights[domain] = per_capability
    return weights
def filter_data_by_items(data_dict, allowed_items):
    """Filter nested score dict to only keep specified items for each category.

    Every category key is preserved; only the inner item entries are pruned.
    """
    keep = set(allowed_items)
    result = {}
    for category, entries in data_dict.items():
        kept = {}
        for item_name, item_data in entries.items():
            if item_name in keep:
                kept[item_name] = item_data
        result[category] = kept
    return result
# Color palette: Distinct colors for better differentiation.
# Each entry is an rgba string with alpha 0.5; the radar chart derives its
# fill color by string-replacing '0.5' with '0.15', so keep that exact format.
COLORS = [
    'rgba(135, 160, 220, 0.5)',  # Light Blue
    'rgba(230, 150, 120, 0.5)',  # Orange
    'rgba(180, 180, 180, 0.5)',  # Gray
    'rgba(255, 215, 100, 0.5)',  # Yellow
    'rgba(140, 180, 220, 0.5)',  # Sky Blue
    'rgba(140, 200, 150, 0.5)',  # Green
    'rgba(200, 160, 140, 0.5)',  # Brown
    'rgba(130, 140, 200, 0.5)',  # Purple-Blue
    'rgba(255, 180, 150, 0.5)',  # Coral
    'rgba(150, 220, 180, 0.5)',  # Mint Green
]
# ---------------------------------------------------------------------------
# Visualization functions
# ---------------------------------------------------------------------------
def create_radar_chart_from_dict(data_dict, title="Performance Radar Chart", top_n=10):
    """
    Create radar chart from dictionary data showing top N entries.

    Args:
        data_dict: Dictionary with structure {category: {item_name: {accuracy: x, f1: y}}}
        title: Chart title
        top_n: Number of top entries to display (default 10)

    Returns:
        Plotly Figure with radar chart (showing only accuracy)
    """
    if not data_dict:
        fig = go.Figure()
        fig.update_layout(title="No data available")
        return fig
    # Extract categories and the union of all items across categories.
    categories = list(data_dict.keys())
    all_items = set()
    for category_data in data_dict.values():
        all_items.update(category_data.keys())
    # Calculate weighted average accuracy for each item to determine top N.
    category_weights = get_category_weights(categories)
    item_avg_scores = {}
    for item in all_items:
        weighted_sum = 0.0
        weight_sum = 0.0
        for category in categories:
            item_data = data_dict[category].get(item, {})
            # Entries may be plain numbers or {"accuracy": ...} dicts.
            accuracy = item_data.get('accuracy', 0) if isinstance(item_data, dict) else item_data
            weight = category_weights.get(category, 0.0)
            weighted_sum += accuracy * weight
            weight_sum += weight
        item_avg_scores[item] = (weighted_sum / weight_sum) if weight_sum > 0 else 0
    # Get top N items by weighted average accuracy.
    sorted_items = sorted(item_avg_scores.items(), key=lambda x: x[1], reverse=True)
    top_items = [name for name, _ in sorted_items[:top_n]]
    fig = go.Figure()
    # Add one polar trace per top item.
    for idx, item in enumerate(top_items):
        values = []
        for category in categories:
            item_data = data_dict[category].get(item, {})
            accuracy = item_data.get('accuracy', 0) if isinstance(item_data, dict) else item_data
            values.append(accuracy * 100)  # Convert to percentage
        # Close the polygon by repeating the first point.
        values_closed = values + [values[0]]
        categories_closed = categories + [categories[0]]
        color = COLORS[idx % len(COLORS)]
        fig.add_trace(go.Scatterpolar(
            r=values_closed,
            theta=categories_closed,
            mode='lines+markers',
            fill='toself',
            name=item,
            line=dict(color=color, width=2),
            marker=dict(color=color, size=8),
            # Lighten the trace color for the fill (relies on the '0.5' alpha in COLORS).
            fillcolor=color.replace('0.5', '0.15'),
            # FIX: this template was split across two physical lines (invalid
            # string literal); restored as one literal with an HTML line break.
            hovertemplate='%{fullData.name}<br>%{theta}: %{r:.2f}%'
        ))
    # Update layout
    fig.update_layout(
        title=dict(
            text=title,
            x=0.5,
            xanchor='center',
            font=dict(size=20, color='#2c3e50')
        ),
        polar=dict(
            radialaxis=dict(
                visible=True,
                range=[0, 100],
                ticksuffix='%',
                tickfont=dict(size=11),
                gridcolor='rgba(200, 200, 200, 0.3)',
                gridwidth=1
            ),
            angularaxis=dict(
                tickfont=dict(size=13, weight='bold', color='#2c3e50')
            ),
            bgcolor='rgba(245, 245, 245, 0.5)'
        ),
        legend=dict(
            font=dict(size=11),
            title=dict(text="Items", font=dict(size=13)),
            x=1.02,
            y=1,
            xanchor='left',
            yanchor='top',
            bgcolor='rgba(255,255,255,0.8)',
            bordercolor='rgba(100,100,100,0.3)',
            borderwidth=1,
            itemclick="toggleothers",
            itemdoubleclick="toggle"
        ),
        height=600,
        margin=dict(l=80, r=250, t=100, b=80),
        paper_bgcolor='white',
        font=dict(color='#2c3e50')
    )
    return fig
def create_capability_subplots(data_dict, title="Capability Performance", top_n=10):
    """
    Create 2x2 subplot layout with one bar chart per capability, showing top N entries.
    Optimized for responsive sizing with equal spacing across all subplots.

    Args:
        data_dict: Dictionary with structure {capability: {item_name: {accuracy: x, f1: y}}}
        title: Overall chart title
        top_n: Number of top entries to display per subplot (default 10)

    Returns:
        Plotly Figure with 2x2 subplots (showing only accuracy)
    """
    if not data_dict:
        fig = go.Figure()
        fig.update_layout(title="No data available")
        return fig
    # Extract capabilities (only the first four fit the 2x2 grid).
    capabilities = list(data_dict.keys())
    fig = make_subplots(
        rows=2, cols=2,
        subplot_titles=capabilities[:4],
        vertical_spacing=0.15,
        horizontal_spacing=0.12,
        specs=[[{"secondary_y": False}, {"secondary_y": False}],
               [{"secondary_y": False}, {"secondary_y": False}]]
    )
    # Position mapping for 2x2 grid
    positions = [(1, 1), (1, 2), (2, 1), (2, 2)]
    # Collect all unique items for consistent coloring across subplots.
    all_items = set()
    for capability_data in data_dict.values():
        all_items.update(capability_data.keys())
    all_items = sorted(all_items)
    # IMPROVEMENT: O(1) color-index lookup instead of list.index per bar.
    color_index = {name: i for i, name in enumerate(all_items)}
    # Create a bar chart for each capability
    for idx, capability in enumerate(capabilities[:4]):
        row, col = positions[idx]
        capability_data = data_dict[capability]
        # Sort items by accuracy for this capability and keep the top N.
        sorted_items = sorted(
            capability_data.items(),
            key=lambda x: x[1].get('accuracy', 0) if isinstance(x[1], dict) else x[1],
            reverse=True
        )[:top_n]
        item_names = [item[0] for item in sorted_items]
        item_scores = [
            (item[1].get('accuracy', 0) if isinstance(item[1], dict) else item[1]) * 100
            for item in sorted_items
        ]
        # Assign colors based on the item's global index so each item keeps
        # the same color in every subplot.
        colors = [COLORS[color_index[name] % len(COLORS)] for name in item_names]
        fig.add_trace(
            go.Bar(
                x=item_names,
                y=item_scores,
                marker=dict(
                    color=colors,
                    line=dict(color='rgba(50, 50, 50, 0.5)', width=1)
                ),
                showlegend=False,
                # FIX: this template was split across two physical lines (invalid
                # string literal); restored as one literal with an HTML line break.
                hovertemplate='%{x}<br>Score: %{y:.2f}%',
                width=0.7
            ),
            row=row, col=col
        )
        # Update axes with consistent styling
        fig.update_xaxes(
            tickangle=-45,
            tickfont=dict(size=9),
            tickmode='linear',
            row=row, col=col,
            showgrid=False,
            showline=True,
            linewidth=1,
            linecolor='rgba(200, 200, 200, 0.5)'
        )
        fig.update_yaxes(
            range=[0, 100],
            title_text="Performance (%)",
            title_font=dict(size=12),
            tickfont=dict(size=10),
            gridcolor='rgba(200, 200, 200, 0.3)',
            row=row, col=col,
            showline=True,
            linewidth=1,
            linecolor='rgba(200, 200, 200, 0.5)'
        )
    # Update overall layout with fully responsive sizing
    fig.update_layout(
        title=dict(
            text=title,
            x=0.5,
            xanchor='center',
            font=dict(size=20, color='#2c3e50')
        ),
        height=900,
        autosize=True,
        showlegend=False,
        plot_bgcolor='rgba(245, 245, 245, 0.5)',
        paper_bgcolor='white',
        font=dict(color='#2c3e50', family="Arial, sans-serif"),
        margin=dict(l=80, r=80, t=100, b=120),
        hovermode='closest'
    )
    # Update subplot titles styling
    for annotation in fig['layout']['annotations']:
        annotation['font'] = dict(size=14, color='#2c3e50')
        annotation['xanchor'] = 'center'
        annotation['showarrow'] = False
    return fig
def _rank_prefix(i):
medals = ["đĨ", "đĨ", "đĨ"]
return f"{medals[i]} {i+1}" if i < 3 else str(i + 1)
def _fmt(v):
return f"{v * 100:.2f}%"
def _build_rows_sorted(items, verified_dict, score_fn, type_name):
    """
    Build rows for verified entries only (verified=True).
    Unverified submissions are excluded from the leaderboard display.
    """
    verified_rows = [
        score_fn(item, True, type_name)
        for item in sorted(items)
        if verified_dict.get(item, False)
    ]
    # Highest score first; score_fn stores the raw value under "_sort".
    verified_rows.sort(key=lambda row: row["_sort"], reverse=True)
    for idx, row in enumerate(verified_rows):
        row["Rank"] = _rank_prefix(idx)
    return verified_rows
def create_capability_table(capability_dict, domain_dict, verified_dict, type_name="Agent"):
    """
    Summary table grouped by capability (A/B/C/D).

    Only verified entries (verified=True) appear: `_build_rows_sorted` skips
    unverified items entirely and ranks the remainder by weighted average
    score. Returns a pandas DataFrame with Rank first and the internal
    "_sort" key removed.
    """
    # Union of every item present in at least one domain.
    items = set()
    for d in domain_dict.values():
        items.update(d.keys())
    if not items:
        return pd.DataFrame()
    # Capability name -> display column label.
    cap_cols = {
        "Recall": "Recall (A)",
        "Causal Inference": "Causal Inf. (B)",
        "State Updating": "State Upd. (C)",
        "State Abstraction": "State Abs. (D)",
    }
    # Per-capability weights from the overall problem-type distribution (if loaded).
    cap_weights = {}
    if QA_DISTRIBUTION:
        pt = QA_DISTRIBUTION.get("overall_distribution", {}).get("problem_types", {})
        letter_to_cap = {"A": "Recall", "B": "Causal Inference",
                         "C": "State Updating", "D": "State Abstraction"}
        for letter, info in pt.items():
            cap_weights[letter_to_cap.get(letter, "")] = info.get("ratio", 0.0)

    def score_fn(item, is_verified, type_name):
        # Take the first non-empty model_family found across capability entries.
        model_family = ""
        for cd in capability_dict.values():
            if item in cd and isinstance(cd[item], dict):
                model_family = cd[item].get("model_family", "")
                if model_family:
                    break
        cap_scores = {}
        for cap_name in cap_cols:
            d = capability_dict.get(cap_name, {}).get(item, {})
            cap_scores[cap_name] = d.get("accuracy", 0.0) if isinstance(d, dict) else 0.0
        # Weighted average; falls back to the plain mean when no weights exist.
        w_sum = sum(cap_scores[c] * cap_weights.get(c, 0.0) for c in cap_cols)
        w_tot = sum(cap_weights.get(c, 0.0) for c in cap_cols)
        avg = w_sum / w_tot if w_tot > 0 else sum(cap_scores.values()) / len(cap_scores)
        row = {
            type_name: f"{item} {'â' if is_verified else 'â'}",
            "Model Family": model_family,
            "Avg Score": _fmt(avg),
            "_sort": avg,  # raw value used only for ordering; stripped below
        }
        for cap_name, col_label in cap_cols.items():
            row[f"{col_label}_score"] = _fmt(cap_scores[cap_name])
        return row

    rows = _build_rows_sorted(items, verified_dict, score_fn, type_name)
    # Rebuild rows with Rank first and the private "_sort" key dropped.
    return pd.DataFrame([
        {"Rank": r["Rank"], **{k: v for k, v in r.items() if k not in ("Rank", "_sort")}}
        for r in rows
    ])
def create_domain_table(capability_dict, domain_dict, verified_dict, type_name="Agent"):
    """
    Summary table grouped by domain.

    Only verified entries (verified=True) appear: `_build_rows_sorted` skips
    unverified items entirely and ranks the remainder by weighted average
    score. Returns a pandas DataFrame with Rank first and the internal
    "_sort" key removed.
    """
    # Union of every item present in at least one domain.
    items = set()
    for d in domain_dict.values():
        items.update(d.keys())
    if not items:
        return pd.DataFrame()
    # Fixed column order for the six benchmark domains.
    domain_order = ["TEXT2SQL", "SOFTWARE", "WEB", "GAME", "EMBODIED_AI", "OPENWORLD_QA"]
    # Per-domain weights (qa_ratio) from the distribution file, if loaded.
    domain_weights = {}
    if QA_DISTRIBUTION:
        for dom, info in QA_DISTRIBUTION.get("domain_distribution", {}).items():
            domain_weights[dom] = info.get("qa_ratio", 0.0)

    def score_fn(item, is_verified, type_name):
        # Take the first non-empty model_family found across capability entries.
        model_family = ""
        for cd in capability_dict.values():
            if item in cd and isinstance(cd[item], dict):
                model_family = cd[item].get("model_family", "")
                if model_family:
                    break
        dom_scores = {}
        for dom in domain_order:
            d = domain_dict.get(dom, {}).get(item, {})
            dom_scores[dom] = d.get("accuracy", 0.0) if isinstance(d, dict) else 0.0
        # Weighted average; falls back to the plain mean when no weights exist.
        w_sum = sum(dom_scores[d] * domain_weights.get(d, 0.0) for d in domain_order)
        w_tot = sum(domain_weights.get(d, 0.0) for d in domain_order)
        avg = w_sum / w_tot if w_tot > 0 else sum(dom_scores.values()) / len(dom_scores)
        row = {
            type_name: f"{item} {'â' if is_verified else 'â'}",
            "Model Family": model_family,
            "Avg Score": _fmt(avg),
            "_sort": avg,  # raw value used only for ordering; stripped below
        }
        for dom in domain_order:
            row[f"{dom}_score"] = _fmt(dom_scores[dom])
        return row

    rows = _build_rows_sorted(items, verified_dict, score_fn, type_name)
    # Rebuild rows with Rank first and the private "_sort" key dropped.
    return pd.DataFrame([{"Rank": r["Rank"], **{k: v for k, v in r.items() if k != "Rank" and k != "_sort"}}
                         for r in rows])
def create_summary_table(capability_dict, domain_dict, verified_dict, type_name="Agent"):
    """
    Create summary table showing rank, average accuracy and F1 scores.
    Uses ratio from qa_distribution.json's overall problem-type distribution
    for weighting (equal weights when the distribution is absent).

    Args:
        capability_dict: Dictionary with capability scores
        domain_dict: Dictionary with domain scores
        verified_dict: Dictionary mapping item names to verified status
        type_name: "Agent" or "Model"

    Returns:
        pandas DataFrame with rank, verified status, accuracy and F1 columns.
        Note: F1 reuses the accuracy value (see comments below).
    """
    if not capability_dict and not domain_dict:
        return pd.DataFrame()
    # Union of every item present in at least one domain.
    items = set()
    for category_data in domain_dict.values():
        items.update(category_data.keys())
    rows = []
    for item in sorted(items):
        weighted_accuracy_sum = 0.0
        weighted_f1_sum = 0.0
        total_weight = 0.0
        model_family = ""
        # Get model family from the first capability entry containing this item.
        for cap_data in capability_dict.values():
            if item in cap_data:
                item_data = cap_data[item]
                if isinstance(item_data, dict) and not model_family:
                    model_family = item_data.get('model_family', '')
                break
        # Collect per-capability accuracy scores for this item.
        capability_scores = {}
        for capability, cap_data in capability_dict.items():
            if item in cap_data:
                item_data = cap_data[item]
                if isinstance(item_data, dict):
                    capability_scores[capability] = item_data.get('accuracy', 0)
        # Calculate weighted average using ratio from overall problem type distribution
        if QA_DISTRIBUTION:
            problem_types = QA_DISTRIBUTION.get("overall_distribution", {}).get("problem_types", {})
            capability_to_letter = {
                "Recall": "A",
                "Causal Inference": "B",
                "State Updating": "C",
                "State Abstraction": "D"
            }
            for capability, score in capability_scores.items():
                letter = capability_to_letter.get(capability)
                if letter and letter in problem_types:
                    weight = problem_types[letter].get("ratio", 0)
                    weighted_accuracy_sum += score * weight
                    weighted_f1_sum += score * weight  # Using same value for f1
                    total_weight += weight
        else:
            # Fallback: equal weights
            for score in capability_scores.values():
                weighted_accuracy_sum += score
                weighted_f1_sum += score
                total_weight += 1
        avg_accuracy = (weighted_accuracy_sum / total_weight) if total_weight > 0 else 0
        avg_f1 = (weighted_f1_sum / total_weight) if total_weight > 0 else 0
        # Get verified status and append the status icon to the display name.
        is_verified = verified_dict.get(item, False)
        verified_icon = " â" if is_verified else " â"
        display_name = f"{item}{verified_icon}"
        rows.append({
            type_name: display_name,
            "Model Family": model_family,
            "Avg Accuracy": avg_accuracy,
            "Avg F1": avg_f1,
            "_acc_sort": avg_accuracy,  # raw value for sorting; dropped below
            "_verified": is_verified
        })
    df = pd.DataFrame(rows)
    df = df.sort_values(by="_acc_sort", ascending=False).reset_index(drop=True)
    # Add rank column with medals for top 3
    medals = ["đĨ", "đĨ", "đĨ"]
    ranks = []
    for i in range(len(df)):
        if i < 3:
            ranks.append(f"{medals[i]} {i+1}")
        else:
            ranks.append(str(i+1))
    df.insert(0, "Rank", ranks)
    # Format accuracy and F1 as percentages
    df["Avg Accuracy"] = df["Avg Accuracy"].apply(lambda x: f"{x * 100:.2f}%")
    df["Avg F1"] = df["Avg F1"].apply(lambda x: f"{x * 100:.2f}%")
    # Drop internal sorting/status columns before display.
    df = df.drop(columns=["_acc_sort", "_verified"])
    return df
# ---------------------------------------------------------------------------
# Build Gradio interface
# ---------------------------------------------------------------------------
def build_app():
"""Build the Gradio application."""
CSS = """
.markdown-text {
font-size: 16px !important;
}
.intro-box {
background: linear-gradient(135deg, rgba(26, 188, 156, 0.1) 0%, rgba(52, 152, 219, 0.1) 100%);
padding: 25px;
border-radius: 10px;
margin: 20px 0;
border-left: 4px solid #1abc9c;
}
"""
# Keep Model Domain view strictly model-only
model_items = set()
for capability_data in MODEL_CAPABILITY.values():
model_items.update(capability_data.keys())
model_domain_filtered = filter_data_by_items(MODEL_DOMAIN, model_items)
if not any(len(category_data) > 0 for category_data in model_domain_filtered.values()):
model_domain_filtered = {}
import base64, pathlib
_logo_path = pathlib.Path("assets/ama_logo.jpg")
if _logo_path.exists():
_logo_b64 = base64.b64encode(_logo_path.read_bytes()).decode()
_logo_tag = (
'
'
)
else:
_logo_tag = "đ¤ "
with gr.Blocks(title="AMA-Bench Leaderboard", theme=gr.themes.Soft()) as demo:
# Header
gr.HTML(
"""
"""
+ _logo_tag
+ """
AMA-Bench: Leaderboard
Agent Memory Assessment Benchmark - Performance Visualization
"""
)
# Links bar
gr.HTML("""
""")
# Welcome Banner
gr.HTML("""
đ¯ Welcome to AMA-Bench!
Evaluate agent memory itself, not just dialogue.
Built from real agent environment streams and scalable long-horizon trajectories across
representative domains, AMA-Bench tests whether LLM agents can recall,
perform causal inference, update state, and
abstract state information over long runs.
đ Paper: https://arxiv.org/abs/2602.22769
""")
with gr.Tabs():
# ============================================================
# Tab 1: Agent Performance
# ============================================================
with gr.Tab("đ¤ Agent Performance"):
gr.Markdown("""
### Agent Performance Analysis
Explore agent performance across different domains and capabilities.
""")
with gr.Tabs():
# Domain Sub-tab (Radar Chart)
with gr.Tab("đ¯ Domain Performance"):
gr.Markdown("""
**Radar chart** showing agent performance across different domains.
Click legend items to isolate specific agents.
""")
with gr.Row():
agent_domain_top_n = gr.Slider(
minimum=1,
maximum=10,
value=8,
step=1,
label="Show Top N Agents",
info="Select how many top agents to display (1-10)"
)
agent_domain_chart = gr.Plot(
value=create_radar_chart_from_dict(
AGENT_DOMAIN,
"Agent Performance Across Domains",
top_n=8
)
)
with gr.Accordion("đ Summary Statistics", open=True):
gr.Markdown("""
**Verification Status:** Only officially verified entries (â) are shown. User-submitted results (â) will appear after weekly LLM-as-Judge evaluation.
""")
agent_domain_table = gr.Dataframe(
value=create_domain_table(AGENT_CAPABILITY, AGENT_DOMAIN, AGENT_VERIFIED, "Agent"),
label="Scores by Domain"
)
# Update chart when slider changes
agent_domain_top_n.change(
fn=lambda n: create_radar_chart_from_dict(
AGENT_DOMAIN,
"Agent Performance Across Domains",
top_n=int(n)
),
inputs=[agent_domain_top_n],
outputs=[agent_domain_chart]
)
# Capability Sub-tab (Bar Chart)
with gr.Tab("⥠Capability Performance"):
gr.Markdown("""
Showing agent performance for each capability.
Each subplot represents one capability with comparative performance across all agents.
""")
with gr.Row():
agent_capability_top_n = gr.Slider(
minimum=1,
maximum=10,
value=8,
step=1,
label="Show Top N Agents",
info="Select how many top agents to display per capability (1-10)"
)
agent_capability_chart = gr.Plot(
value=create_capability_subplots(
AGENT_CAPABILITY,
"Agent Performance by Capability",
top_n=8
)
)
with gr.Accordion("đ Summary Statistics", open=True):
gr.Markdown("""
**Verification Status:** Only officially verified entries (â) are shown. User-submitted results (â) will appear after weekly LLM-as-Judge evaluation.
""")
agent_capability_table = gr.Dataframe(
value=create_capability_table(AGENT_CAPABILITY, AGENT_DOMAIN, AGENT_VERIFIED, "Agent"),
label="Scores by Capability"
)
# Update chart when slider changes
agent_capability_top_n.change(
fn=lambda n: create_capability_subplots(
AGENT_CAPABILITY,
"Agent Performance by Capability",
top_n=int(n)
),
inputs=[agent_capability_top_n],
outputs=[agent_capability_chart]
)
# ============================================================
# Tab 2: Model Performance
# ============================================================
with gr.Tab("đŦ Model Performance"):
gr.Markdown("""
### Model Performance Analysis
Explore model performance across different domains and capabilities.
""")
with gr.Tabs():
# Domain Sub-tab (Radar Chart)
with gr.Tab("đ¯ Domain Performance"):
gr.Markdown("""
**Radar chart** showing model performance across different domains.
Click legend items to isolate specific models.
""")
with gr.Row():
model_domain_top_n = gr.Slider(
minimum=1,
maximum=10,
value=8,
step=1,
label="Show Top N Models",
info="Select how many top models to display (1-10)"
)
model_domain_chart = gr.Plot(
value=create_radar_chart_from_dict(
model_domain_filtered,
"Model Performance Across Domains",
top_n=8
)
)
with gr.Accordion("đ Summary Statistics", open=True):
gr.Markdown("""
**Verification Status:** Only officially verified entries (â) are shown. User-submitted results (â) will appear after weekly LLM-as-Judge evaluation.
""")
model_domain_table = gr.Dataframe(
value=create_domain_table(MODEL_CAPABILITY, model_domain_filtered, MODEL_VERIFIED, "Model"),
label="Scores by Domain"
)
# Update chart when slider changes
model_domain_top_n.change(
fn=lambda n: create_radar_chart_from_dict(
model_domain_filtered,
"Model Performance Across Domains",
top_n=int(n)
),
inputs=[model_domain_top_n],
outputs=[model_domain_chart]
)
# Capability Sub-tab (Bar Chart)
with gr.Tab("⥠Capability Performance"):
gr.Markdown("""
Show model performance for each capability.
Each subplot represents one capability with comparative performance across all models.
""")
with gr.Row():
model_capability_top_n = gr.Slider(
minimum=1,
maximum=10,
value=8,
step=1,
label="Show Top N Models",
info="Select how many top models to display per capability (1-10)"
)
model_capability_chart = gr.Plot(
value=create_capability_subplots(
MODEL_CAPABILITY,
"Model Performance by Capability",
top_n=8
)
)
with gr.Accordion("đ Summary Statistics", open=True):
gr.Markdown("""
**Verification Status:** Only officially verified entries (â) are shown. User-submitted results (â) will appear after weekly LLM-as-Judge evaluation.
""")
model_capability_table = gr.Dataframe(
value=create_capability_table(MODEL_CAPABILITY, MODEL_DOMAIN, MODEL_VERIFIED, "Model"),
label="Scores by Capability"
)
# Update chart when slider changes
model_capability_top_n.change(
fn=lambda n: create_capability_subplots(
MODEL_CAPABILITY,
"Model Performance by Capability",
top_n=int(n)
),
inputs=[model_capability_top_n],
outputs=[model_capability_chart]
)
# ============================================================
# Tab 3: Submit
# ============================================================
with gr.Tab("đ¤ Submit"):
gr.Markdown("""
### Submit Your Model/Agent for Evaluation
Submit your model or agent predictions to be evaluated on AMA-Bench.
Your results will be reviewed and scored weekly by our LLM-as-Judge system.
**â° Submission Policy:**
- Each user can submit **once per week**
- Submissions are evaluated **weekly** using our LLM-as-Judge system
- Official scores (`verified=true`) are computed by our evaluation system
- You can also run your own evaluation if you have access to the groundtruth data
""")
# ------------------------------------------------------------------
# Submission form: collects model/agent metadata plus a JSONL
# predictions file, then wires the Submit button to the handler
# imported from submission.add_new_submission.
# ------------------------------------------------------------------
with gr.Row():
with gr.Column():
# Left column: identity of the submitted system.
model_name_textbox = gr.Textbox(
label="Model/Agent Name",
placeholder="e.g., GPT-4 or MyAgent-v2"
)
# "Model" vs "Agent" decides which leaderboard the entry joins.
submission_type = gr.Radio(
choices=["Model", "Agent"],
label="Submission Type",
value="Model"
)
url_textbox = gr.Textbox(
label="URL to Model/Agent Information",
placeholder="https://..."
)
with gr.Column():
# Right column: contact / provenance metadata.
organisation = gr.Textbox(
label="Organisation",
placeholder="e.g., OpenAI, Anthropic"
)
model_family_textbox = gr.Textbox(
label="Model Family",
placeholder="e.g., GPT-4, Claude-3, Qwen3-32B"
)
mail = gr.Textbox(
label="Contact Email",
placeholder="your.email@example.com"
)
# Predictions upload; restricted to .jsonl so the backend only
# ever receives line-delimited JSON.
file_upload = gr.File(
label="Submission File (JSONL format)",
file_types=[".jsonl"]
)
# Inline format reference shown next to the form (runtime Markdown text).
gr.Markdown("""
**đ Submission Format:**
Your JSONL file should contain one line per episode:
```json
{
"episode_id": "trajectory_id",
"question_uuid_list": ["uuid-1", "uuid-2", "uuid-3"],
"answer_list": ["The agent moved right.", "..."],
"llm_as_judge_score_list": [true, false, true]
}
```
**Field Descriptions:**
- `episode_id` *(required)*: The episode identifier â used to automatically look up the domain
- `question_uuid_list` *(required)*: UUIDs of the benchmark questions in the same order as `answer_list` â used to look up each question's capability (A/B/C/D).
- `answer_list` *(required)*: Your model/agent's answers, one per question
- `llm_as_judge_score_list` *(required)*: `true`/`false` per answer â your self-evaluated correctness scores used for leaderboard ranking.
**Important Notes:**
- `question_uuid_list`, `answer_list`, and `llm_as_judge_score_list` must all be the same length
- Domain is resolved automatically from `episode_id`; capability (A/B/C/D) is resolved from `question_uuid_list` â no need to supply them manually
- All submissions start as `verified=false` and become `verified=true` after official LLM-as-Judge evaluation
""")
with gr.Row():
submit_button = gr.Button("Submit", variant="primary", size="lg")
# HTML output area for the success/error message returned by the handler.
submission_result = gr.HTML()
# Three-step event chain:
#   1) disable the button and show a progress label,
#   2) run the actual submission handler,
#   3) re-enable the button.
# NOTE(review): step 3 is expected to fire even when step 2 reports an
# error (Gradio `.then()` runs after the previous step regardless of its
# outcome) â confirm against the Gradio version in use.
submit_button.click(
fn=lambda: gr.update(interactive=False, value="âŗ Submitting..."),
inputs=[],
outputs=[submit_button],
).then(
fn=add_new_submission,
inputs=[
model_name_textbox,
submission_type,
url_textbox,
file_upload,
organisation,
mail,
model_family_textbox,
],
outputs=[submission_result],
).then(
fn=lambda: gr.update(interactive=True, value="Submit"),
inputs=[],
outputs=[submit_button],
)
# ============================================================
# Tab 4: About
# ============================================================
with gr.Tab("âšī¸ About"):
# Static documentation page: benchmark overview, leaderboard tab
# guide, metrics, question-type distribution, and the full
# submission rules. The body below is a single runtime Markdown
# string and must be kept verbatim.
gr.Markdown("""
## AMA-Bench: Agent Memory Assessment Benchmark
AMA-Bench evaluates memory capabilities of LLMs and memory-augmented agents across four cognitive dimensions:
**Recall** (retrieving stored info), **Causal Inference** (cause-and-effect reasoning),
**State Updating** (tracking evolving states), and **State Abstraction** (forming higher-level representations).
### Benchmarks
We evaluate on two complementary subsets:
1. **Real-world Subset:** 2,496 QA pairs from real agent environment streams
2. **Synthetic Subset:** 1,200 QA pairs stratified across five trajectory lengths (8K, 16K, 32K, 64K, and 128K tokens)
### Leaderboard Tabs
- **Agent Performance**: Compares RAG and Agent Memory methods
- Domain Performance: Radar charts across 6 domains (GAME, Embodied AI, Web, Text2SQL, Openworld QA, Software Engineer)
- Capability Performance: showing performance on 4 capabilities
- **Top N Selection**: Choose to display top 1-10 performers
- **Model Performance**: Compares LLM models directly
- Domain Performance: Radar charts showing performance across different application domains
- Capability Performance: showing performance on each cognitive capability
- **Top N Selection**: Choose to display top 1-10 performers
### Metrics
Results are reported as **Accuracy** and **F1 Score**:
- Charts display **Accuracy** only for clarity
- Summary statistics tables show both **Avg Accuracy** and **Avg F1**
- Tables include **Rank** with đĨđĨđĨ medals for top 3 performers
### Problem Type Distribution
- **Type A (Recall)**: 33.6% - 839 questions
- **Type B (Causal Inference)**: 23.9% - 596 questions
- **Type C (State Updating)**: 25.9% - 647 questions
- **Type D (State Abstraction)**: 16.6% - 414 questions
### Submission Rules
**đ File Format**
- Submissions must be in **JSONL format** (`.jsonl`), one line per episode
- Each line must be a valid JSON object containing the required fields below
- `question_uuid_list`, `answer_list`, and `llm_as_judge_score_list` must all be the **same length**
- Files containing duplicate `episode_id` entries will be rejected
**đ Required Fields**
| Field | Type | Description |
|---|---|---|
| `episode_id` | string | Episode identifier, used to automatically resolve domain |
| `question_uuid_list` | list[string] | UUIDs mapping each answer to a benchmark question, used to resolve capability (A/B/C/D) |
| `answer_list` | list[string] | Your model/agent's free-text answers, in the same order as `question_uuid_list` |
| `llm_as_judge_score_list` | list[bool] | Self-evaluated correctness (`true`/`false`) per answer |
**â
Verification & Scoring**
- All submissions initially appear as `verified=false` (self-reported preview)
- The score shown immediately after submission is based on your `llm_as_judge_score_list`
- Official scores (`verified=true`) are recomputed weekly by our **LLM-as-Judge** evaluation system
- Only `verified=true` entries are displayed on the public leaderboard
**â ī¸ Important Notes**
- Domain is resolved automatically from `episode_id` â no need to supply it manually
- Capability (A/B/C/D) is resolved automatically from each `question_uuid` â no need to supply it manually
- Official scores may differ from your self-reported preview after LLM-as-Judge re-evaluation
- We reserve the right to remove submissions that appear to contain fabricated or manipulated scores
---
**Paper:** [https://arxiv.org/abs/2602.22769](https://arxiv.org/abs/2602.22769)
*For questions or submissions, please open a discussion in the Community tab.*
""")
return demo
if __name__ == "__main__":
    # Build the Gradio app and serve it; debug mode keeps the process in
    # the foreground and show_error surfaces handler tracebacks in the UI.
    build_app().launch(debug=True, show_error=True)