# Spaces: HeshamHaroon/Arabic-Function-Calling-Leaderboard (Running)
# File size: 9,717 Bytes
# 566d03e

"""
Visualization Charts
====================

Plotly-based visualizations for the Arabic Function Calling Leaderboard.
"""

import plotly.graph_objects as go
import plotly.express as px
from typing import Dict, List, Optional
import pandas as pd


# Maps internal category keys to the Arabic label displayed on charts.
# The chart builders look labels up with `.get(key, key)`, so a key that is
# missing here is shown unchanged instead of raising.
CATEGORY_NAMES_AR = {
    "simple": "بسيط",
    "multiple": "متعدد",
    "parallel": "متوازي",
    "parallel_multiple": "متوازي متعدد",
    "irrelevance": "اللا صلة",
    "dialect_handling": "اللهجات",
    "multi_turn": "متعدد الأدوار",
    "native_arabic": "العربي الأصلي",
    "java": "جافا",
    "javascript": "جافاسكريبت",
    "rest": "REST",
    "sql": "SQL"
}

# Per-model trace colors (hex). The chart builders index this list with
# `i % len(MODEL_COLORS)`, so colors repeat once there are more traces than
# entries.
MODEL_COLORS = [
    "#1f77b4", "#ff7f0e", "#2ca02c", "#d62728", "#9467bd",
    "#8c564b", "#e377c2", "#7f7f7f", "#bcbd22", "#17becf"
]


def create_radar_chart(
    model_scores: Dict[str, Dict[str, float]],
    categories: Optional[List[str]] = None,
    use_arabic: bool = True,
    title: str = "Model Comparison"
) -> go.Figure:
    """
    Create a radar/spider chart comparing models across categories.

    One filled polygon is drawn per model. A category missing from a
    model's scores is plotted as 0. An empty ``categories`` sequence yields
    traces with no points instead of raising.

    Args:
        model_scores: Dict mapping model names to category scores
        categories: Categories to include (defaults to main evaluation
            categories); any sequence of category keys is accepted
        use_arabic: Whether to use Arabic labels and an Arabic-capable font
        title: Chart title

    Returns:
        Plotly Figure object
    """
    if categories is None:
        categories = ["simple", "multiple", "parallel", "parallel_multiple",
                      "irrelevance", "dialect_handling"]

    # Prepare category labels (always a fresh list so that concatenation
    # below works for tuples and never aliases the caller's sequence).
    if use_arabic:
        labels = [CATEGORY_NAMES_AR.get(cat, cat) for cat in categories]
    else:
        labels = list(categories)

    # Repeat the first point at the end so the polygon closes. Slicing
    # (rather than indexing) keeps an empty category list empty.
    labels_closed = labels + labels[:1]

    fig = go.Figure()

    for i, (model_name, scores) in enumerate(model_scores.items()):
        values = [scores.get(cat, 0) for cat in categories]

        fig.add_trace(go.Scatterpolar(
            r=values + values[:1],
            theta=labels_closed,
            fill='toself',
            name=model_name,
            # Cycle through the palette when there are more models than colors.
            line_color=MODEL_COLORS[i % len(MODEL_COLORS)],
            opacity=0.7
        ))

    fig.update_layout(
        polar=dict(
            radialaxis=dict(
                visible=True,
                range=[0, 100]
            )
        ),
        showlegend=True,
        title=dict(
            text=title,
            font=dict(size=16)
        ),
        font=dict(
            family="Noto Kufi Arabic, Arial" if use_arabic else "Arial"
        )
    )

    return fig


def _score_color(score: float) -> str:
    """Return the bar color (hex) for the band that ``score`` falls in."""
    if score >= 80:
        return '#2ca02c'  # Green
    if score >= 60:
        return '#1f77b4'  # Blue
    if score >= 40:
        return '#ff7f0e'  # Orange
    return '#d62728'  # Red


def create_bar_chart(
    leaderboard_data: List[Dict],
    metric: str = "overall",
    top_n: int = 10,
    use_arabic: bool = True,
    title: str = "Top Models"
) -> go.Figure:
    """
    Create a horizontal bar chart of top models.

    Entries are ranked by ``metric`` in descending order and the first
    ``top_n`` are kept. A metric that is missing or stored as ``None`` is
    treated as 0 so that incomplete entries do not break sorting or
    formatting. Bars are colored by score band (>=80, >=60, >=40, below).

    Args:
        leaderboard_data: List of model entries with scores; the display
            name is read from the 'model' key, falling back to 'name' and
            then to 'Unknown'
        metric: Metric to display (default: 'overall')
        top_n: Number of top models to show
        use_arabic: Whether to use an Arabic axis title and font
        title: Chart title

    Returns:
        Plotly Figure object
    """
    def _metric_value(entry: Dict) -> float:
        # `.get(metric, 0)` alone would still return None for an explicit
        # None value, which cannot be compared or formatted.
        value = entry.get(metric)
        return 0 if value is None else value

    # Sort and get top N
    ranked = sorted(leaderboard_data, key=_metric_value, reverse=True)[:top_n]

    # A horizontal bar chart draws the first item at the bottom, so reverse
    # to put the best model at the top.
    ranked.reverse()

    models = [d.get('model', d.get('name', 'Unknown')) for d in ranked]
    scores = [_metric_value(d) for d in ranked]
    colors = [_score_color(score) for score in scores]

    fig = go.Figure(go.Bar(
        x=scores,
        y=models,
        orientation='h',
        marker_color=colors,
        text=[f"{s:.1f}%" for s in scores],
        textposition='outside'
    ))

    fig.update_layout(
        title=dict(
            text=title,
            font=dict(size=16)
        ),
        xaxis=dict(
            title="الدقة (%)" if use_arabic else "Accuracy (%)",
            # Upper bound above 100 leaves room for the outside text labels.
            range=[0, 105]
        ),
        yaxis=dict(
            title=""
        ),
        font=dict(
            family="Noto Kufi Arabic, Arial" if use_arabic else "Arial"
        ),
        # Grow the figure with the number of bars, never below 400px.
        height=max(400, len(models) * 40)
    )

    return fig


def create_category_comparison(
    leaderboard_data: List[Dict],
    models: Optional[List[str]] = None,
    use_arabic: bool = True,
    title: str = "Category Performance Comparison"
) -> go.Figure:
    """
    Build a grouped bar chart with one bar group per category and one bar
    per model inside each group.

    Args:
        leaderboard_data: Model entries holding per-category scores
        models: Names of the models to plot; when None, the five entries
            with the highest 'overall' value are used
        use_arabic: Whether to use Arabic labels
        title: Chart title

    Returns:
        Plotly Figure object
    """
    category_keys = ["simple", "multiple", "parallel", "parallel_multiple",
                     "irrelevance", "dialect_handling"]

    def _entry_name(entry):
        # Display name: 'model' key, then 'name', then a fixed placeholder.
        return entry.get('model', entry.get('name', 'Unknown'))

    # Default selection: top five by overall score.
    if models is None:
        by_overall = sorted(
            leaderboard_data,
            key=lambda entry: entry.get('overall', 0),
            reverse=True
        )
        models = [_entry_name(entry) for entry in by_overall[:5]]

    # Index the selected entries by name; a later duplicate replaces an
    # earlier one.
    entries_by_name = {}
    for entry in leaderboard_data:
        entry_name = _entry_name(entry)
        if entry_name in models:
            entries_by_name[entry_name] = entry

    # Axis tick labels.
    if use_arabic:
        tick_labels = [CATEGORY_NAMES_AR.get(key, key) for key in category_keys]
    else:
        tick_labels = category_keys

    fig = go.Figure()

    for position, model_name in enumerate(models):
        if model_name not in entries_by_name:
            # Requested model has no entry; its palette slot is skipped.
            continue
        entry = entries_by_name[model_name]
        fig.add_trace(go.Bar(
            name=model_name,
            x=tick_labels,
            y=[entry.get(key, 0) for key in category_keys],
            marker_color=MODEL_COLORS[position % len(MODEL_COLORS)]
        ))

    x_axis = dict(
        title="الفئة" if use_arabic else "Category",
        tickangle=-45 if use_arabic else 0
    )
    y_axis = dict(
        title="الدقة (%)" if use_arabic else "Accuracy (%)",
        range=[0, 105]
    )
    # Horizontal legend anchored above the plot area, right-aligned.
    legend_box = dict(
        orientation="h",
        yanchor="bottom",
        y=1.02,
        xanchor="right",
        x=1
    )

    fig.update_layout(
        barmode='group',
        title=dict(text=title, font=dict(size=16)),
        xaxis=x_axis,
        yaxis=y_axis,
        font=dict(family="Noto Kufi Arabic, Arial" if use_arabic else "Arial"),
        legend=legend_box,
        height=500
    )

    return fig


def create_dialect_breakdown(
    model_scores: Dict[str, Dict[str, float]],
    use_arabic: bool = True,
    title: str = "Dialect Performance"
) -> go.Figure:
    """
    Build a grouped bar chart of per-dialect scores, one bar series per model.

    Args:
        model_scores: Dict mapping model names to dialect scores, keyed by
            'msa', 'egyptian', 'gulf' and 'levantine'; a missing key is
            plotted as 0
        use_arabic: Whether to use Arabic labels
        title: Chart title

    Returns:
        Plotly Figure object
    """
    # (score key, Arabic label, English label) in display order.
    dialect_table = (
        ("msa", "الفصحى", "MSA"),
        ("egyptian", "المصري", "Egyptian"),
        ("gulf", "الخليجي", "Gulf"),
        ("levantine", "الشامي", "Levantine"),
    )
    dialect_keys = [key for key, _, _ in dialect_table]
    # The labels do not depend on the model, so build them once.
    tick_labels = [
        arabic if use_arabic else english
        for _, arabic, english in dialect_table
    ]

    fig = go.Figure()

    for position, model_name in enumerate(model_scores):
        per_dialect = model_scores[model_name]
        fig.add_trace(go.Bar(
            name=model_name,
            x=tick_labels,
            y=[per_dialect.get(key, 0) for key in dialect_keys],
            marker_color=MODEL_COLORS[position % len(MODEL_COLORS)]
        ))

    if use_arabic:
        x_title = "اللهجة"
        y_title = "الدقة (%)"
        font_family = "Noto Kufi Arabic, Arial"
    else:
        x_title = "Dialect"
        y_title = "Accuracy (%)"
        font_family = "Arial"

    fig.update_layout(
        barmode='group',
        title=dict(text=title, font=dict(size=16)),
        xaxis=dict(title=x_title),
        yaxis=dict(title=y_title, range=[0, 105]),
        font=dict(family=font_family),
        height=400
    )

    return fig


def create_progress_over_time(
    history_data: List[Dict],
    models: Optional[List[str]] = None,
    title: str = "Performance Over Time"
) -> go.Figure:
    """
    Create a line chart showing model performance over time.

    Each snapshot is expected to provide 'date', 'overall' and 'model'
    keys, which become the x axis, y axis and line color respectively.
    Rows are ordered chronologically before plotting when every 'date'
    value can be parsed as a date; otherwise the input order is kept.

    Args:
        history_data: List of evaluation snapshots with dates
        models: Models to include (all models when None)
        title: Chart title

    Returns:
        Plotly Figure object (an empty titled figure when there is no data)
    """
    if not history_data:
        # Return empty figure
        fig = go.Figure()
        fig.update_layout(title=title)
        return fig

    df = pd.DataFrame(history_data)

    if models is not None:
        df = df[df['model'].isin(models)]

    # Line traces connect points in row order, so unsorted snapshots would
    # draw lines that double back. Sort only when the dates are
    # unambiguous, so an already-ordered series in a format pandas cannot
    # parse is never scrambled.
    if 'date' in df.columns:
        try:
            parsed_dates = pd.to_datetime(df['date'], errors='coerce')
        except (ValueError, TypeError):
            parsed_dates = None
        if parsed_dates is not None and parsed_dates.notna().all():
            # Stable sort keeps the input order of same-date snapshots.
            df = df.iloc[parsed_dates.to_numpy().argsort(kind='stable')]

    fig = px.line(
        df,
        x='date',
        y='overall',
        color='model',
        title=title,
        labels={'overall': 'Overall Score (%)', 'date': 'Date', 'model': 'Model'}
    )

    fig.update_layout(
        yaxis=dict(range=[0, 100]),
        height=400
    )

    return fig