import plotly.graph_objects as go
import pandas as pd
import numpy as np
from typing import Dict, List


class EvaluationVisualizer:
    """Render plotly figures for agent-evaluation results.

    A single-evaluation ``scores`` dict maps metric keys (the keys of
    ``metric_colors``) to floats, plotted on a [0, 1] axis.  A batch
    ``results`` list holds dicts with at least ``agent_name``, ``task_id``
    and a nested ``scores`` dict — TODO confirm this schema against callers.
    """

    def __init__(self):
        # One fixed color per metric key; the keys also serve as the
        # canonical metric list for the aggregate charts below.
        self.metric_colors = {
            'instruction_following': '#667eea',
            'hallucination_score': '#48bb78',
            'assumption_control': '#f6ad55',
            'coherence': '#63b3ed',
            'accuracy': '#fc8181',
            'overall_score': '#764ba2',
        }

    @staticmethod
    def _empty_figure(title: str) -> go.Figure:
        """Placeholder figure returned whenever there is no data to plot."""
        return go.Figure().update_layout(title=title)

    # Spider chart with multi-dimensional scores for a single evaluation
    def create_spider_chart(self, scores: Dict, agent_name: str = "Agent") -> go.Figure:
        """Radar chart of the five per-metric scores; missing metrics plot as 0."""
        # BUG FIX: plotly tick labels are HTML-like — '\n' is not rendered
        # as a line break, '<br>' is.
        metrics = ['Instruction<br>Following', 'Hallucination<br>Control',
                   'Assumption<br>Control', 'Coherence', 'Accuracy']
        values = [
            scores.get('instruction_following', 0),
            scores.get('hallucination_score', 0),
            scores.get('assumption_control', 0),
            scores.get('coherence', 0),
            scores.get('accuracy', 0),
        ]
        fig = go.Figure()
        fig.add_trace(go.Scatterpolar(
            r=values, theta=metrics, fill='toself', name=agent_name,
            line=dict(color=self.metric_colors['instruction_following'])))
        fig.update_layout(
            polar=dict(radialaxis=dict(visible=True, range=[0, 1])),
            title=f"{agent_name} - Performance Spider Chart",
            template='plotly_white')
        return fig

    # Horizontal bar chart showing scores for a single evaluation
    def create_score_bars(self, scores: Dict, agent_name: str = "Agent") -> go.Figure:
        """Horizontal bars for whichever known metrics appear in ``scores``."""
        metric_map = {
            'overall_score': 'Overall Score',
            'instruction_following': 'Instruction Following',
            'hallucination_score': 'Hallucination Control',
            'assumption_control': 'Assumption Control',
            'coherence': 'Coherence',
            'accuracy': 'Accuracy',
        }
        # Compute the present-key list once instead of filtering three times.
        present = [key for key in metric_map if key in scores]
        metrics = [metric_map[key] for key in present]
        values = [scores[key] for key in present]
        colors = [self.metric_colors.get(key, '#667eea') for key in present]
        fig = go.Figure(go.Bar(
            y=metrics, x=values, orientation='h',
            marker=dict(color=colors),
            text=[f'{v:.2f}' for v in values], textposition='auto'))
        fig.update_layout(
            title=f"{agent_name} - Score Breakdown",
            xaxis=dict(range=[0, 1]), template='plotly_white',
            showlegend=False)
        return fig

    # Heatmap showing evaluation scores across agents and tasks
    def create_evaluation_heatmap(self, results: List[Dict]) -> go.Figure:
        """Agents-by-tasks heatmap of overall scores.

        BUG FIX: ``DataFrame.pivot`` raises ``ValueError`` when the same
        (agent, task) pair appears more than once; ``pivot_table`` averages
        duplicates and is identical to ``pivot`` when pairs are unique.
        """
        if not results:
            return self._empty_figure("No data for heatmap")
        df = pd.DataFrame([
            {'agent': r['agent_name'],
             'task': r['task_id'],
             'score': r['scores'].get('overall_score', 0)}
            for r in results])
        pivot_df = df.pivot_table(index='agent', columns='task',
                                  values='score', aggfunc='mean')
        fig = go.Figure(data=go.Heatmap(
            z=pivot_df.values, x=pivot_df.columns, y=pivot_df.index,
            colorscale='Viridis', colorbar=dict(title="Score")))
        fig.update_layout(title="Agent Performance Heatmap",
                          xaxis_title="Tasks", yaxis_title="Agents",
                          template='plotly_white')
        return fig

    # Violin plots for spread in scores across agents
    def create_score_distribution(self, results: List[Dict]) -> go.Figure:
        """One violin per metric showing the spread of scores across results."""
        if not results:
            return self._empty_figure("No data for distribution plot")
        df_data = []
        for r in results:
            entry = {'Agent': r.get('agent_name', 'Unknown')}
            entry.update(r['scores'])
            df_data.append(entry)
        df = pd.DataFrame(df_data)
        # BUG FIX: melt raises KeyError for value_vars absent from the frame
        # (a metric that never appears in any result), so only melt the
        # metric columns that actually exist.
        value_vars = [m for m in self.metric_colors if m in df.columns]
        df = df.melt(id_vars=['Agent'], value_vars=value_vars,
                     var_name='Metric', value_name='Score')
        df['Metric'] = df['Metric'].map(
            {k: k.replace('_', ' ').title() for k in self.metric_colors})
        fig = go.Figure()
        for metric in df['Metric'].unique():
            fig.add_trace(go.Violin(
                y=df[df['Metric'] == metric]['Score'], name=metric,
                box_visible=True, meanline_visible=True))
        fig.update_layout(title="Score Distribution Analysis",
                          yaxis_title="Score", template='plotly_white',
                          showlegend=False)
        return fig

    # Bar chart showing average overall scores of each agent
    def create_performance_trends(self, results: List[Dict]) -> go.Figure:
        """Bar chart of each agent's mean overall score."""
        if not results:
            return self._empty_figure("No data for average performance plot")
        # Group overall scores by agent (setdefault replaces the manual
        # membership check of the original).
        agent_scores: Dict[str, List[float]] = {}
        for r in results:
            agent_scores.setdefault(r['agent_name'], []).append(
                r['scores'].get('overall_score', 0))
        avg_scores = {agent: np.mean(scores)
                      for agent, scores in agent_scores.items()}
        fig = go.Figure(go.Bar(
            x=list(avg_scores.keys()),
            y=list(avg_scores.values()),
            text=[f"{v:.2f}" for v in avg_scores.values()],
            textposition="auto",
            marker=dict(color="#667eea")))
        fig.update_layout(
            title="Average Overall Scores by Agent",
            xaxis_title="Agents",
            yaxis_title="Average Overall Score",
            template="plotly_white")
        return fig

    # Comparison chart between two agents
    def create_agent_comparison(self, agent1_results: List[Dict],
                                agent2_results: List[Dict]) -> go.Figure:
        """Grouped bars comparing two agents' mean score per metric."""
        # BUG FIX: the original indexed results[0] unconditionally and raised
        # IndexError on an empty list; fall back to a titled empty figure,
        # matching the other methods' no-data convention.
        if not agent1_results or not agent2_results:
            return self._empty_figure("No data for agent comparison")
        metrics = list(self.metric_colors)
        agent1_name = agent1_results[0].get('agent_name', 'Agent 1')
        agent2_name = agent2_results[0].get('agent_name', 'Agent 2')

        def get_avg_scores(results):
            # Mean of each metric across a result list; missing metrics count as 0.
            return {m: np.mean([r['scores'].get(m, 0) for r in results])
                    for m in metrics}

        avg1 = get_avg_scores(agent1_results)
        avg2 = get_avg_scores(agent2_results)
        metric_labels = [m.replace('_', ' ').title() for m in metrics]
        fig = go.Figure(data=[
            go.Bar(name=agent1_name, x=metric_labels,
                   y=[avg1[m] for m in metrics]),
            go.Bar(name=agent2_name, x=metric_labels,
                   y=[avg2[m] for m in metrics]),
        ])
        fig.update_layout(barmode='group',
                          title="Agent Performance Comparison",
                          yaxis_title="Average Score",
                          template='plotly_white')
        return fig

    # Spider chart comparing two agents
    def create_radar_comparison(self, agent1_results: List[Dict],
                                agent2_results: List[Dict]) -> go.Figure:
        """Overlaid radar traces of two agents' mean per-metric scores."""
        # BUG FIX: guard empty inputs (original raised IndexError on [0]).
        if not agent1_results or not agent2_results:
            return self._empty_figure("No data for radar comparison")
        metrics = ['instruction_following', 'hallucination_score',
                   'assumption_control', 'coherence', 'accuracy']
        metric_labels = [m.replace('_', ' ').title() for m in metrics]

        def get_avg_scores(results):
            # Per-metric means in the fixed order of `metrics`.
            return [np.mean([r['scores'].get(m, 0) for r in results])
                    for m in metrics]

        agent1_values = get_avg_scores(agent1_results)
        agent2_values = get_avg_scores(agent2_results)
        fig = go.Figure()
        fig.add_trace(go.Scatterpolar(
            r=agent1_values, theta=metric_labels, fill='toself',
            name=agent1_results[0].get('agent_name', 'Agent 1')))
        fig.add_trace(go.Scatterpolar(
            r=agent2_values, theta=metric_labels, fill='toself',
            name=agent2_results[0].get('agent_name', 'Agent 2')))
        fig.update_layout(
            polar=dict(radialaxis=dict(visible=True, range=[0, 1])),
            title="Agent Comparison - Radar Chart",
            template='plotly_white')
        return fig

    # Performance gap between two agents across metrics
    def create_performance_delta(self, agent1_results: List[Dict],
                                 agent2_results: List[Dict]) -> go.Figure:
        """Signed bar chart of (agent 2 mean − agent 1 mean) per metric.

        Positive deltas (agent 2 ahead) are green, negative red.
        """
        # BUG FIX: guard empty inputs (np.mean over an empty list yields
        # NaN with a runtime warning in the original).
        if not agent1_results or not agent2_results:
            return self._empty_figure("No data for performance delta")
        metrics = list(self.metric_colors)

        def get_avg_scores(results):
            return {m: np.mean([r['scores'].get(m, 0) for r in results])
                    for m in metrics}

        avg1 = get_avg_scores(agent1_results)
        avg2 = get_avg_scores(agent2_results)
        deltas = [avg2[m] - avg1[m] for m in metrics]
        colors = ['#48bb78' if d >= 0 else '#fc8181' for d in deltas]
        metric_labels = [m.replace('_', ' ').title() for m in metrics]
        fig = go.Figure(go.Bar(
            x=metric_labels, y=deltas, marker_color=colors,
            text=[f'{d:+.2f}' for d in deltas]))
        fig.update_layout(title="Performance Delta (Agent 2 vs Agent 1)",
                          yaxis_title="Score Difference",
                          template='plotly_white')
        return fig