|
|
""" |
|
|
Visualization Charts |
|
|
==================== |
|
|
|
|
|
Plotly-based visualizations for the Arabic Function Calling Leaderboard. |
|
|
""" |
|
|
|
|
|
import plotly.graph_objects as go |
|
|
import plotly.express as px |
|
|
from typing import Dict, List, Optional |
|
|
import pandas as pd |
|
|
|
|
|
|
|
|
|
|
|
# Arabic display labels keyed by category identifier.  The chart builders in
# this module look labels up with ``.get(cat, cat)``, so an identifier that
# has no entry here is shown as-is.
CATEGORY_NAMES_AR: Dict[str, str] = {
    # Categories the radar and grouped-bar charts plot by default.
    "simple": "بسيط",
    "multiple": "متعدد",
    "parallel": "متوازي",
    "parallel_multiple": "متوازي متعدد",
    "irrelevance": "اللا صلة",
    "dialect_handling": "اللهجات",
    # Remaining categories (not part of the default chart selection).
    "multi_turn": "متعدد الأدوار",
    "native_arabic": "العربي الأصلي",
    "java": "جافا",
    "javascript": "جافاسكريبت",
    "rest": "REST",
    "sql": "SQL",
}
|
|
|
|
|
|
|
|
# Per-model trace colors.  Callers index this list modulo its length, so
# colors repeat once there are more models than entries.
MODEL_COLORS: List[str] = [
    "#1f77b4",
    "#ff7f0e",
    "#2ca02c",
    "#d62728",
    "#9467bd",
    "#8c564b",
    "#e377c2",
    "#7f7f7f",
    "#bcbd22",
    "#17becf",
]
|
|
|
|
|
|
|
|
def create_radar_chart(
    model_scores: Dict[str, Dict[str, float]],
    categories: Optional[List[str]] = None,
    use_arabic: bool = True,
    title: str = "Model Comparison"
) -> go.Figure:
    """
    Create a radar/spider chart comparing models across categories.

    One filled ``Scatterpolar`` trace is added per model.  A category that is
    missing from a model's score dict is plotted as 0.  The radial axis is
    fixed to the range 0-100.

    Args:
        model_scores: Dict mapping model names to category scores
        categories: Categories to include (defaults to main evaluation
            categories).  Any sequence is accepted; an empty one produces
            traces without points instead of raising.
        use_arabic: Whether to use Arabic labels (falls back to the raw
            category id when no Arabic label is defined)
        title: Chart title

    Returns:
        Plotly Figure object
    """
    if categories is None:
        categories = ["simple", "multiple", "parallel", "parallel_multiple",
                      "irrelevance", "dialect_handling"]
    else:
        # Normalise to a list so tuples (or other sequences) can be
        # concatenated below without a TypeError.
        categories = list(categories)

    if use_arabic:
        labels = [CATEGORY_NAMES_AR.get(cat, cat) for cat in categories]
    else:
        labels = list(categories)

    # Repeat the first point at the end so the polygon outline is closed.
    # Slicing with [:1] (rather than indexing [0]) keeps this safe when no
    # categories were supplied.  The labels are identical for every model,
    # so the closed ring is built once.
    labels_closed = labels + labels[:1]

    fig = go.Figure()

    for i, (model_name, scores) in enumerate(model_scores.items()):
        values = [scores.get(cat, 0) for cat in categories]
        values_closed = values + values[:1]

        fig.add_trace(go.Scatterpolar(
            r=values_closed,
            theta=labels_closed,
            fill='toself',
            name=model_name,
            # Cycle through the palette when there are more models than colors.
            line_color=MODEL_COLORS[i % len(MODEL_COLORS)],
            opacity=0.7
        ))

    fig.update_layout(
        polar=dict(
            radialaxis=dict(
                visible=True,
                range=[0, 100]
            )
        ),
        showlegend=True,
        title=dict(
            text=title,
            font=dict(size=16)
        ),
        font=dict(
            family="Noto Kufi Arabic, Arial" if use_arabic else "Arial"
        )
    )

    return fig
|
|
|
|
|
|
|
|
def create_bar_chart(
    leaderboard_data: List[Dict],
    metric: str = "overall",
    top_n: int = 10,
    use_arabic: bool = True,
    title: str = "Top Models"
) -> go.Figure:
    """
    Create a horizontal bar chart of top models.

    Entries are ranked by ``metric`` in descending order, truncated to
    ``top_n`` and drawn with the best model at the top.  Bars are colored by
    score band (>= 80, >= 60, >= 40, below 40).  A metric that is missing or
    explicitly ``None`` is treated as 0.

    Args:
        leaderboard_data: List of model entries with scores.  The display
            name is read from the 'model' key, then 'name', else 'Unknown'.
        metric: Metric to display (default: 'overall')
        top_n: Number of top models to show
        use_arabic: Whether to use Arabic labels
        title: Chart title

    Returns:
        Plotly Figure object
    """
    def _score(entry: Dict) -> float:
        # A key that is present but None would otherwise break the sort,
        # the color thresholds and the "%.1f" text formatting.
        value = entry.get(metric, 0)
        return 0 if value is None else value

    def _band_color(score: float) -> str:
        if score >= 80:
            return '#2ca02c'
        if score >= 60:
            return '#1f77b4'
        if score >= 40:
            return '#ff7f0e'
        return '#d62728'

    ranked = sorted(leaderboard_data, key=_score, reverse=True)[:top_n]

    # Horizontal bars are drawn bottom-up, so reverse to put the highest
    # score at the top of the chart.
    ranked = ranked[::-1]

    models = [d.get('model', d.get('name', 'Unknown')) for d in ranked]
    scores = [_score(d) for d in ranked]
    colors = [_band_color(s) for s in scores]

    fig = go.Figure(go.Bar(
        x=scores,
        y=models,
        orientation='h',
        marker_color=colors,
        text=[f"{s:.1f}%" for s in scores],
        textposition='outside'
    ))

    fig.update_layout(
        title=dict(
            text=title,
            font=dict(size=16)
        ),
        xaxis=dict(
            title="الدقة (%)" if use_arabic else "Accuracy (%)",
            # Upper bound above 100 leaves room for the outside text labels.
            range=[0, 105]
        ),
        yaxis=dict(
            title=""
        ),
        font=dict(
            family="Noto Kufi Arabic, Arial" if use_arabic else "Arial"
        ),
        # Grow the figure with the number of bars, never below 400px.
        height=max(400, len(models) * 40)
    )

    return fig
|
|
|
|
|
|
|
|
def create_category_comparison(
    leaderboard_data: List[Dict],
    models: Optional[List[str]] = None,
    use_arabic: bool = True,
    title: str = "Category Performance Comparison",
    categories: Optional[List[str]] = None
) -> go.Figure:
    """
    Create a grouped bar chart comparing models across categories.

    One bar trace is added per requested model that is present in
    ``leaderboard_data``; requested models that are absent are skipped.  A
    category missing from a model's entry is plotted as 0.

    Args:
        leaderboard_data: List of model entries with category scores.  The
            model name is read from the 'model' key, then 'name', else
            'Unknown'.  If several entries share a name, the last one wins.
        models: List of model names to include (default: top 5 by 'overall')
        use_arabic: Whether to use Arabic labels
        title: Chart title
        categories: Categories to plot (defaults to the six main evaluation
            categories)

    Returns:
        Plotly Figure object
    """
    if categories is None:
        categories = ["simple", "multiple", "parallel", "parallel_multiple",
                      "irrelevance", "dialect_handling"]
    else:
        categories = list(categories)

    def _name(entry: Dict) -> str:
        return entry.get('model', entry.get('name', 'Unknown'))

    if models is None:
        top_entries = sorted(
            leaderboard_data,
            key=lambda x: x.get('overall', 0),
            reverse=True
        )[:5]
        models = [_name(d) for d in top_entries]
    else:
        models = list(models)

    # Set membership avoids rescanning the model list for every entry.
    wanted = set(models)
    model_data = {
        _name(d): d
        for d in leaderboard_data
        if _name(d) in wanted
    }

    if use_arabic:
        cat_labels = [CATEGORY_NAMES_AR.get(cat, cat) for cat in categories]
    else:
        cat_labels = categories

    fig = go.Figure()

    for i, model in enumerate(models):
        entry = model_data.get(model)
        if entry is None:
            # Requested model has no leaderboard entry; nothing to draw.
            continue
        fig.add_trace(go.Bar(
            name=model,
            x=cat_labels,
            y=[entry.get(cat, 0) for cat in categories],
            # Color follows the model's position in ``models`` (skipped
            # models still consume an index), cycling through the palette.
            marker_color=MODEL_COLORS[i % len(MODEL_COLORS)]
        ))

    fig.update_layout(
        barmode='group',
        title=dict(
            text=title,
            font=dict(size=16)
        ),
        xaxis=dict(
            title="الفئة" if use_arabic else "Category",
            tickangle=-45 if use_arabic else 0
        ),
        yaxis=dict(
            title="الدقة (%)" if use_arabic else "Accuracy (%)",
            range=[0, 105]
        ),
        font=dict(
            family="Noto Kufi Arabic, Arial" if use_arabic else "Arial"
        ),
        # Horizontal legend anchored above the top-right of the plot area.
        legend=dict(
            orientation="h",
            yanchor="bottom",
            y=1.02,
            xanchor="right",
            x=1
        ),
        height=500
    )

    return fig
|
|
|
|
|
|
|
|
def create_dialect_breakdown(
    model_scores: Dict[str, Dict[str, float]],
    use_arabic: bool = True,
    title: str = "Dialect Performance"
) -> go.Figure:
    """
    Build a grouped bar chart of per-dialect scores, one bar group per dialect.

    The dialect keys read from each model's scores are "msa", "egyptian",
    "gulf" and "levantine"; a missing key is plotted as 0.

    Args:
        model_scores: Dict mapping model names to dialect scores
        use_arabic: Whether to use Arabic labels
        title: Chart title

    Returns:
        Plotly Figure object
    """
    dialects = ["msa", "egyptian", "gulf", "levantine"]

    # Pick every localized string up front.
    if use_arabic:
        dialect_labels = {
            "msa": "الفصحى",
            "egyptian": "المصري",
            "gulf": "الخليجي",
            "levantine": "الشامي",
        }
        x_title = "اللهجة"
        y_title = "الدقة (%)"
        font_family = "Noto Kufi Arabic, Arial"
    else:
        dialect_labels = {
            "msa": "MSA",
            "egyptian": "Egyptian",
            "gulf": "Gulf",
            "levantine": "Levantine",
        }
        x_title = "Dialect"
        y_title = "Accuracy (%)"
        font_family = "Arial"

    # The x-axis labels are the same for every model.
    tick_labels = [dialect_labels[key] for key in dialects]

    fig = go.Figure()

    for position, model_name in enumerate(model_scores):
        per_dialect = model_scores[model_name]
        bar = go.Bar(
            name=model_name,
            x=tick_labels,
            y=[per_dialect.get(key, 0) for key in dialects],
            # Cycle through the palette when there are more models than colors.
            marker_color=MODEL_COLORS[position % len(MODEL_COLORS)]
        )
        fig.add_trace(bar)

    fig.update_layout(
        barmode='group',
        title=dict(text=title, font=dict(size=16)),
        xaxis=dict(title=x_title),
        yaxis=dict(title=y_title, range=[0, 105]),
        font=dict(family=font_family),
        height=400
    )

    return fig
|
|
|
|
|
|
|
|
def create_progress_over_time(
    history_data: List[Dict],
    models: Optional[List[str]] = None,
    title: str = "Performance Over Time"
) -> go.Figure:
    """
    Create a line chart showing model performance over time.

    The snapshots are loaded into a DataFrame and plotted with one line per
    model, using the 'date' column for x, 'overall' for y and 'model' for
    color.  Rows are ordered chronologically (best effort) before plotting so
    that each line runs left to right even when snapshots were recorded out
    of order.

    Args:
        history_data: List of evaluation snapshots with dates
        models: Models to include (default: all models in ``history_data``)
        title: Chart title

    Returns:
        Plotly Figure object.  When ``history_data`` is empty, a figure that
        carries only the title.
    """
    if not history_data:
        # Nothing to plot: return a blank, titled figure.
        fig = go.Figure()
        fig.update_layout(title=title)
        return fig

    df = pd.DataFrame(history_data)

    if models is not None:
        df = df[df['model'].isin(models)]

    # Remember the order in which models first appear so that sorting by
    # date below does not change legend order or color assignment.
    category_orders = {}
    if 'model' in df.columns:
        category_orders['model'] = (
            df['model'].dropna().drop_duplicates().tolist()
        )

    # px.line connects points in row order, so out-of-order snapshots would
    # draw a zig-zag.  Sort on the parsed date with a stable sort; values
    # that cannot be parsed become NaT and keep their relative order at the
    # end.  A missing 'date' column is left for px.line to report.
    if 'date' in df.columns:
        try:
            parsed = pd.to_datetime(df['date'], errors='coerce')
            df = df.loc[parsed.sort_values(kind='mergesort').index]
        except (ValueError, TypeError):
            # Dates that pandas cannot interpret or compare: keep the
            # caller's row order rather than failing the whole chart.
            pass

    fig = px.line(
        df,
        x='date',
        y='overall',
        color='model',
        title=title,
        labels={'overall': 'Overall Score (%)', 'date': 'Date', 'model': 'Model'},
        category_orders=category_orders
    )

    fig.update_layout(
        yaxis=dict(range=[0, 100]),
        height=400
    )

    return fig
|
|
|