"""
Visualization Charts
====================

Plotly-based visualizations for the Arabic Function Calling Leaderboard.
"""

import plotly.graph_objects as go
import plotly.express as px
from typing import Dict, List, Optional
import pandas as pd

# Arabic category names mapping
CATEGORY_NAMES_AR = {
    "simple": "بسيط",
    "multiple": "متعدد",
    "parallel": "متوازي",
    "parallel_multiple": "متوازي متعدد",
    "irrelevance": "اللا صلة",
    "dialect_handling": "اللهجات",
    "multi_turn": "متعدد الأدوار",
    "native_arabic": "العربي الأصلي",
    "java": "جافا",
    "javascript": "جافاسكريبت",
    "rest": "REST",
    "sql": "SQL"
}

# Color palette for models
MODEL_COLORS = [
    "#1f77b4", "#ff7f0e", "#2ca02c", "#d62728", "#9467bd",
    "#8c564b", "#e377c2", "#7f7f7f", "#bcbd22", "#17becf"
]

# Categories plotted by default in the radar and grouped-bar charts.
_DEFAULT_CATEGORIES = (
    "simple", "multiple", "parallel", "parallel_multiple",
    "irrelevance", "dialect_handling"
)


def _font_family(use_arabic: bool) -> str:
    """Return the font stack used for Arabic or Latin chart text."""
    return "Noto Kufi Arabic, Arial" if use_arabic else "Arial"


def _model_name(entry: Dict) -> str:
    """Return an entry's display name: 'model', else 'name', else 'Unknown'."""
    return entry.get('model', entry.get('name', 'Unknown'))


def _model_color(index: int) -> str:
    """Return the palette color for the index-th model, cycling when exhausted."""
    return MODEL_COLORS[index % len(MODEL_COLORS)]


def _numeric_score(entry: Dict, key: str) -> float:
    """Return entry[key], treating a missing key or a None value as 0."""
    value = entry.get(key)
    return 0 if value is None else value


def _score_color(score: float) -> str:
    """Map a score to its traffic-light bar color."""
    if score >= 80:
        return '#2ca02c'  # Green
    if score >= 60:
        return '#1f77b4'  # Blue
    if score >= 40:
        return '#ff7f0e'  # Orange
    return '#d62728'  # Red


def create_radar_chart(
    model_scores: Dict[str, Dict[str, float]],
    categories: Optional[List[str]] = None,
    use_arabic: bool = True,
    title: str = "Model Comparison"
) -> go.Figure:
    """
    Create a radar/spider chart comparing models across categories.

    Args:
        model_scores: Dict mapping model names to category scores
        categories: Categories to include (defaults to main evaluation categories)
        use_arabic: Whether to use Arabic labels
        title: Chart title

    Returns:
        Plotly Figure object
    """
    if categories is None:
        categories = list(_DEFAULT_CATEGORIES)

    # Prepare category labels; unknown categories fall back to their raw key
    if use_arabic:
        labels = [CATEGORY_NAMES_AR.get(cat, cat) for cat in categories]
    else:
        labels = list(categories)

    # Close the polygon by repeating the first point. Slicing (rather than
    # indexing [0]) keeps this safe when `categories` is empty.
    labels_closed = labels + labels[:1]

    fig = go.Figure()

    for i, (model_name, scores) in enumerate(model_scores.items()):
        # Categories a model has no score for are plotted as 0
        values = [scores.get(cat, 0) for cat in categories]

        fig.add_trace(go.Scatterpolar(
            r=values + values[:1],
            theta=labels_closed,
            fill='toself',
            name=model_name,
            line_color=_model_color(i),
            opacity=0.7
        ))

    fig.update_layout(
        polar=dict(
            radialaxis=dict(
                visible=True,
                range=[0, 100]
            )
        ),
        showlegend=True,
        title=dict(
            text=title,
            font=dict(size=16)
        ),
        font=dict(
            family=_font_family(use_arabic)
        )
    )

    return fig


def create_bar_chart(
    leaderboard_data: List[Dict],
    metric: str = "overall",
    top_n: int = 10,
    use_arabic: bool = True,
    title: str = "Top Models"
) -> go.Figure:
    """
    Create a horizontal bar chart of top models.

    Entries whose metric is missing or None are ranked and drawn as 0.

    Args:
        leaderboard_data: List of model entries with scores
        metric: Metric to display (default: 'overall')
        top_n: Number of top models to show
        use_arabic: Whether to use Arabic labels
        title: Chart title

    Returns:
        Plotly Figure object
    """
    # Sort and get top N
    top_entries = sorted(
        leaderboard_data,
        key=lambda entry: _numeric_score(entry, metric),
        reverse=True
    )[:top_n]

    # Reverse for horizontal bar chart (top at top)
    top_entries.reverse()

    models = [_model_name(entry) for entry in top_entries]
    scores = [_numeric_score(entry, metric) for entry in top_entries]

    fig = go.Figure(go.Bar(
        x=scores,
        y=models,
        orientation='h',
        # Color based on score ranges
        marker_color=[_score_color(score) for score in scores],
        text=[f"{s:.1f}%" for s in scores],
        textposition='outside'
    ))

    fig.update_layout(
        title=dict(
            text=title,
            font=dict(size=16)
        ),
        xaxis=dict(
            title="الدقة (%)" if use_arabic else "Accuracy (%)",
            # Upper bound above 100 leaves room for the outside text labels
            range=[0, 105]
        ),
        yaxis=dict(
            title=""
        ),
        font=dict(
            family=_font_family(use_arabic)
        ),
        # Grow with the number of bars, but never below 400px
        height=max(400, len(models) * 40)
    )

    return fig


def create_category_comparison(
    leaderboard_data: List[Dict],
    models: Optional[List[str]] = None,
    use_arabic: bool = True,
    title: str = "Category Performance Comparison"
) -> go.Figure:
    """
    Create a grouped bar chart comparing models across categories.

    Args:
        leaderboard_data: List of model entries with category scores
        models: List of model names to include (default: top 5)
        use_arabic: Whether to use Arabic labels
        title: Chart title

    Returns:
        Plotly Figure object
    """
    # Categories to show
    categories = list(_DEFAULT_CATEGORIES)

    # Get models to compare: default to the five best by 'overall'
    if models is None:
        ranked = sorted(
            leaderboard_data,
            key=lambda entry: _numeric_score(entry, 'overall'),
            reverse=True
        )
        models = [_model_name(entry) for entry in ranked[:5]]

    # Filter data for selected models (set gives O(1) membership checks)
    wanted = set(models)
    model_data = {
        _model_name(entry): entry
        for entry in leaderboard_data
        if _model_name(entry) in wanted
    }

    # Prepare labels
    if use_arabic:
        cat_labels = [CATEGORY_NAMES_AR.get(cat, cat) for cat in categories]
    else:
        cat_labels = categories

    fig = go.Figure()

    # Iterate `models` (not `model_data`) so trace order and colors follow
    # the requested order; names with no matching entry are skipped.
    for i, model in enumerate(models):
        if model not in model_data:
            continue
        entry = model_data[model]
        fig.add_trace(go.Bar(
            name=model,
            x=cat_labels,
            y=[entry.get(cat, 0) for cat in categories],
            marker_color=_model_color(i)
        ))

    fig.update_layout(
        barmode='group',
        title=dict(
            text=title,
            font=dict(size=16)
        ),
        xaxis=dict(
            title="الفئة" if use_arabic else "Category",
            tickangle=-45 if use_arabic else 0
        ),
        yaxis=dict(
            title="الدقة (%)" if use_arabic else "Accuracy (%)",
            range=[0, 105]
        ),
        font=dict(
            family=_font_family(use_arabic)
        ),
        legend=dict(
            orientation="h",
            yanchor="bottom",
            y=1.02,
            xanchor="right",
            x=1
        ),
        height=500
    )

    return fig


def create_dialect_breakdown(
    model_scores: Dict[str, Dict[str, float]],
    use_arabic: bool = True,
    title: str = "Dialect Performance"
) -> go.Figure:
    """
    Create a chart showing performance across Arabic dialects.

    Args:
        model_scores: Dict mapping model names to dialect scores
        use_arabic: Whether to use Arabic labels
        title: Chart title

    Returns:
        Plotly Figure object
    """
    dialects = ["msa", "egyptian", "gulf", "levantine"]
    dialect_labels = {
        "msa": "الفصحى" if use_arabic else "MSA",
        "egyptian": "المصري" if use_arabic else "Egyptian",
        "gulf": "الخليجي" if use_arabic else "Gulf",
        "levantine": "الشامي" if use_arabic else "Levantine"
    }
    # The axis labels are the same for every model, so build them once
    labels = [dialect_labels[d] for d in dialects]

    fig = go.Figure()

    for i, (model_name, scores) in enumerate(model_scores.items()):
        fig.add_trace(go.Bar(
            name=model_name,
            x=labels,
            # Dialects a model has no score for are plotted as 0
            y=[scores.get(d, 0) for d in dialects],
            marker_color=_model_color(i)
        ))

    fig.update_layout(
        barmode='group',
        title=dict(
            text=title,
            font=dict(size=16)
        ),
        xaxis=dict(title="اللهجة" if use_arabic else "Dialect"),
        yaxis=dict(
            title="الدقة (%)" if use_arabic else "Accuracy (%)",
            range=[0, 105]
        ),
        font=dict(
            family=_font_family(use_arabic)
        ),
        height=400
    )

    return fig


def create_progress_over_time(
    history_data: List[Dict],
    models: Optional[List[str]] = None,
    title: str = "Performance Over Time"
) -> go.Figure:
    """
    Create a line chart showing model performance over time.

    Each snapshot is read through its 'date', 'overall' and 'model' fields.

    Args:
        history_data: List of evaluation snapshots with dates
        models: Models to include
        title: Chart title

    Returns:
        Plotly Figure object
    """
    if not history_data:
        # Return empty figure
        fig = go.Figure()
        fig.update_layout(title=title)
        return fig

    df = pd.DataFrame(history_data)

    # Restrict to the requested models; None means "keep everything"
    if models is not None:
        df = df[df['model'].isin(models)]

    fig = px.line(
        df,
        x='date',
        y='overall',
        color='model',
        title=title,
        labels={'overall': 'Overall Score (%)', 'date': 'Date', 'model': 'Model'}
    )

    fig.update_layout(
        yaxis=dict(range=[0, 100]),
        height=400
    )

    return fig