HeshamHaroon's picture
Initial release: Arabic Function Calling Leaderboard
566d03e verified
"""
Visualization Charts
====================
Plotly-based visualizations for the Arabic Function Calling Leaderboard.
"""
import plotly.graph_objects as go
import plotly.express as px
from typing import Dict, List, Optional
import pandas as pd
# Arabic display names for the evaluation categories, keyed by the internal
# category identifier. "rest" and "sql" keep their Latin-script acronyms.
CATEGORY_NAMES_AR: Dict[str, str] = {
    # Function-calling categories
    "simple": "بسيط",
    "multiple": "متعدد",
    "parallel": "متوازي",
    "parallel_multiple": "متوازي متعدد",
    "irrelevance": "اللا صلة",
    # Arabic-specific and conversational categories
    "dialect_handling": "اللهجات",
    "multi_turn": "متعدد الأدوار",
    "native_arabic": "العربي الأصلي",
    # Language / API categories
    "java": "جافا",
    "javascript": "جافاسكريبت",
    "rest": "REST",
    "sql": "SQL",
}
# Per-model trace colours. Chart builders index this list modulo its length,
# so colours repeat once there are more models than entries.
MODEL_COLORS = [
    "#1f77b4",  # blue
    "#ff7f0e",  # orange
    "#2ca02c",  # green
    "#d62728",  # red
    "#9467bd",  # purple
    "#8c564b",  # brown
    "#e377c2",  # pink
    "#7f7f7f",  # gray
    "#bcbd22",  # olive
    "#17becf",  # cyan
]
def create_radar_chart(
    model_scores: Dict[str, Dict[str, float]],
    categories: Optional[List[str]] = None,
    use_arabic: bool = True,
    title: str = "Model Comparison"
) -> go.Figure:
    """
    Create a radar/spider chart comparing models across categories.

    Each model is drawn as one filled polygon whose vertices are its scores
    in the requested categories. A category missing from a model's scores is
    plotted as 0. The radial axis is fixed to the 0-100 range.

    Args:
        model_scores: Dict mapping model names to category scores
        categories: Categories to include (defaults to main evaluation
            categories). An empty list produces traces without points
            instead of raising.
        use_arabic: Whether to use Arabic labels
        title: Chart title

    Returns:
        Plotly Figure object
    """
    if categories is None:
        categories = ["simple", "multiple", "parallel", "parallel_multiple",
                      "irrelevance", "dialect_handling"]

    # Prepare category labels; unknown categories fall back to their raw id.
    if use_arabic:
        labels = [CATEGORY_NAMES_AR.get(cat, cat) for cat in categories]
    else:
        labels = list(categories)

    # Repeat the first label so every polygon is closed. Slicing (rather than
    # indexing [0]) keeps this safe when no categories were requested, and the
    # result is identical for all models, so it is built once.
    labels_closed = labels + labels[:1]

    fig = go.Figure()
    for i, (model_name, scores) in enumerate(model_scores.items()):
        values = [scores.get(cat, 0) for cat in categories]
        fig.add_trace(go.Scatterpolar(
            # Close the radar polygon by returning to the first value.
            r=values + values[:1],
            theta=labels_closed,
            fill='toself',
            name=model_name,
            # Colours repeat when there are more models than palette entries.
            line_color=MODEL_COLORS[i % len(MODEL_COLORS)],
            opacity=0.7
        ))

    fig.update_layout(
        polar=dict(
            radialaxis=dict(
                visible=True,
                range=[0, 100]
            )
        ),
        showlegend=True,
        title=dict(
            text=title,
            font=dict(size=16)
        ),
        font=dict(
            family="Noto Kufi Arabic, Arial" if use_arabic else "Arial"
        )
    )
    return fig
def create_bar_chart(
    leaderboard_data: List[Dict],
    metric: str = "overall",
    top_n: int = 10,
    use_arabic: bool = True,
    title: str = "Top Models"
) -> go.Figure:
    """
    Create a horizontal bar chart of top models.

    Entries are ranked by ``metric`` in descending order and the best
    ``top_n`` are drawn, best model at the top. A missing or ``None`` metric
    is treated as 0. The display name is taken from the entry's 'model' key,
    then 'name', then falls back to 'Unknown'.

    Bars are coloured by score: >= 80 green, >= 60 blue, >= 40 orange,
    otherwise red.

    Args:
        leaderboard_data: List of model entries with scores
        metric: Metric to display (default: 'overall')
        top_n: Number of top models to show
        use_arabic: Whether to use Arabic labels
        title: Chart title

    Returns:
        Plotly Figure object
    """
    def _score(entry: Dict) -> float:
        # A null score must not reach the sort comparison or the ':.1f'
        # format below; both would raise on None.
        value = entry.get(metric, 0)
        return 0 if value is None else value

    def _color(score: float) -> str:
        if score >= 80:
            return '#2ca02c'  # Green
        if score >= 60:
            return '#1f77b4'  # Blue
        if score >= 40:
            return '#ff7f0e'  # Orange
        return '#d62728'  # Red

    # Sort and get top N
    ranked = sorted(leaderboard_data, key=_score, reverse=True)[:top_n]
    # Reverse for horizontal bar chart (top at top)
    ranked.reverse()

    models = [d.get('model', d.get('name', 'Unknown')) for d in ranked]
    scores = [_score(d) for d in ranked]
    colors = [_color(s) for s in scores]

    fig = go.Figure(go.Bar(
        x=scores,
        y=models,
        orientation='h',
        marker_color=colors,
        text=[f"{s:.1f}%" for s in scores],
        textposition='outside'
    ))

    fig.update_layout(
        title=dict(
            text=title,
            font=dict(size=16)
        ),
        xaxis=dict(
            title="الدقة (%)" if use_arabic else "Accuracy (%)",
            # 105 rather than 100: presumably headroom for the 'outside'
            # text labels of high-scoring bars.
            range=[0, 105]
        ),
        yaxis=dict(
            title=""
        ),
        font=dict(
            family="Noto Kufi Arabic, Arial" if use_arabic else "Arial"
        ),
        # Grow with the number of bars, but never below 400.
        height=max(400, len(models) * 40)
    )
    return fig
def create_category_comparison(
    leaderboard_data: List[Dict],
    models: Optional[List[str]] = None,
    use_arabic: bool = True,
    title: str = "Category Performance Comparison"
) -> go.Figure:
    """
    Build a grouped bar chart with one bar group per evaluation category.

    One trace is added per selected model that is present in
    ``leaderboard_data``; selected names with no matching entry are skipped.
    A category missing from an entry is plotted as 0.

    Args:
        leaderboard_data: Model entries holding per-category scores; the
            model name is read from 'model', then 'name', else 'Unknown'
        models: Names of the models to draw (default: the five entries with
            the highest 'overall' score)
        use_arabic: Whether axis titles and category labels are in Arabic
        title: Chart title

    Returns:
        Plotly Figure object
    """
    def _entry_name(entry):
        return entry.get('model', entry.get('name', 'Unknown'))

    # Fixed set of categories shown on the x axis
    categories = ["simple", "multiple", "parallel", "parallel_multiple",
                  "irrelevance", "dialect_handling"]

    # Default selection: best five by overall score
    if models is None:
        by_overall = sorted(
            leaderboard_data,
            key=lambda entry: entry.get('overall', 0),
            reverse=True
        )
        models = [_entry_name(entry) for entry in by_overall[:5]]

    # Index the selected entries by name (a later duplicate replaces an
    # earlier one)
    entries_by_name = {}
    for entry in leaderboard_data:
        entry_name = _entry_name(entry)
        if entry_name in models:
            entries_by_name[entry_name] = entry

    # Language-dependent presentation settings
    if use_arabic:
        cat_labels = [CATEGORY_NAMES_AR.get(cat, cat) for cat in categories]
        x_title = "الفئة"
        y_title = "الدقة (%)"
        font_family = "Noto Kufi Arabic, Arial"
        tick_angle = -45
    else:
        cat_labels = categories
        x_title = "Category"
        y_title = "Accuracy (%)"
        font_family = "Arial"
        tick_angle = 0

    fig = go.Figure()
    for position, model in enumerate(models):
        if model not in entries_by_name:
            continue
        entry = entries_by_name[model]
        fig.add_trace(go.Bar(
            name=model,
            x=cat_labels,
            y=[entry.get(cat, 0) for cat in categories],
            # Colour follows the position in `models`, skipped names included
            marker_color=MODEL_COLORS[position % len(MODEL_COLORS)]
        ))

    fig.update_layout(
        barmode='group',
        title={"text": title, "font": {"size": 16}},
        xaxis={"title": x_title, "tickangle": tick_angle},
        yaxis={"title": y_title, "range": [0, 105]},
        font={"family": font_family},
        legend={
            "orientation": "h",
            "yanchor": "bottom",
            "y": 1.02,
            "xanchor": "right",
            "x": 1,
        },
        height=500
    )
    return fig
def create_dialect_breakdown(
    model_scores: Dict[str, Dict[str, float]],
    use_arabic: bool = True,
    title: str = "Dialect Performance"
) -> go.Figure:
    """
    Create a chart showing performance across Arabic dialects.

    Draws a grouped bar chart with one trace per model and one bar group per
    dialect. The dialect keys read from each model's scores are 'msa',
    'egyptian', 'gulf' and 'levantine'; a missing key is plotted as 0.

    Args:
        model_scores: Dict mapping model names to dialect scores
        use_arabic: Whether to use Arabic labels
        title: Chart title

    Returns:
        Plotly Figure object
    """
    dialects = ["msa", "egyptian", "gulf", "levantine"]
    dialect_labels = {
        "msa": "الفصحى" if use_arabic else "MSA",
        "egyptian": "المصري" if use_arabic else "Egyptian",
        "gulf": "الخليجي" if use_arabic else "Gulf",
        "levantine": "الشامي" if use_arabic else "Levantine"
    }
    # The x-axis labels are the same for every model, so build them once
    # instead of on each loop iteration.
    labels = [dialect_labels[d] for d in dialects]

    fig = go.Figure()
    for i, (model_name, scores) in enumerate(model_scores.items()):
        fig.add_trace(go.Bar(
            name=model_name,
            x=labels,
            y=[scores.get(d, 0) for d in dialects],
            # Colours repeat when there are more models than palette entries.
            marker_color=MODEL_COLORS[i % len(MODEL_COLORS)]
        ))

    fig.update_layout(
        barmode='group',
        title=dict(
            text=title,
            font=dict(size=16)
        ),
        xaxis=dict(title="اللهجة" if use_arabic else "Dialect"),
        yaxis=dict(
            title="الدقة (%)" if use_arabic else "Accuracy (%)",
            range=[0, 105]
        ),
        font=dict(
            family="Noto Kufi Arabic, Arial" if use_arabic else "Arial"
        ),
        height=400
    )
    return fig
def create_progress_over_time(
    history_data: List[Dict],
    models: Optional[List[str]] = None,
    title: str = "Performance Over Time"
) -> go.Figure:
    """
    Plot each model's overall score over time as a line chart.

    The snapshots are loaded into a DataFrame and drawn with one line per
    model, using the 'date' column for x, 'overall' for y and 'model' for
    the line colour. The y axis is fixed to the 0-100 range.

    Args:
        history_data: Evaluation snapshots; each one is expected to provide
            'date', 'overall' and 'model'
        models: If given, only snapshots whose 'model' is in this list are
            plotted
        title: Chart title

    Returns:
        Plotly Figure object; a figure carrying only the title when
        ``history_data`` is empty
    """
    # Nothing to plot: hand back a titled, otherwise empty figure
    if not history_data:
        placeholder = go.Figure()
        placeholder.update_layout(title=title)
        return placeholder

    frame = pd.DataFrame(history_data)

    # Optional restriction to the requested models
    if models is not None:
        wanted = frame['model'].isin(models)
        frame = frame[wanted]

    axis_labels = {
        'overall': 'Overall Score (%)',
        'date': 'Date',
        'model': 'Model',
    }
    chart = px.line(
        frame,
        x='date',
        y='overall',
        color='model',
        title=title,
        labels=axis_labels
    )
    chart.update_layout(
        yaxis={"range": [0, 100]},
        height=400
    )
    return chart