Spaces:
Sleeping
Sleeping
File size: 8,449 Bytes
e3a1a9e |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 |
import plotly.graph_objects as go
import pandas as pd
import numpy as np
from typing import Dict, List
class EvaluationVisualizer:
    """Builds Plotly figures for agent-evaluation results.

    Each result dict is expected to carry ``'agent_name'``, ``'task_id'`` and a
    ``'scores'`` dict keyed by the metric names listed in ``metric_colors``.
    Missing scores are treated as 0 throughout. Every method returns a
    ``go.Figure``; list-taking methods return a placeholder figure titled
    "No data ..." when given empty input instead of raising.
    """

    def __init__(self):
        # One fixed color per metric so all charts render consistently.
        self.metric_colors = {
            'instruction_following': '#667eea', 'hallucination_score': '#48bb78',
            'assumption_control': '#f6ad55', 'coherence': '#63b3ed',
            'accuracy': '#fc8181', 'overall_score': '#764ba2'
        }

    def _avg_metric_scores(self, results: List[Dict], metrics: List[str]) -> Dict[str, float]:
        """Mean of each metric in *metrics* over *results* (missing scores count as 0)."""
        return {m: float(np.mean([r['scores'].get(m, 0) for r in results])) for m in metrics}

    # Spider chart with multi-dimensional scores for a single evaluation
    def create_spider_chart(self, scores: Dict, agent_name: str = "Agent") -> go.Figure:
        """Radar (spider) chart of the five per-metric scores for one evaluation."""
        # Plotly renders line breaks in tick labels with '<br>', not '\n',
        # so use the HTML break to actually wrap the long labels.
        metrics = ['Instruction<br>Following', 'Hallucination<br>Control',
                   'Assumption<br>Control', 'Coherence', 'Accuracy']
        values = [
            scores.get('instruction_following', 0), scores.get('hallucination_score', 0),
            scores.get('assumption_control', 0), scores.get('coherence', 0),
            scores.get('accuracy', 0)
        ]
        fig = go.Figure()
        fig.add_trace(go.Scatterpolar(
            r=values, theta=metrics, fill='toself', name=agent_name,
            line=dict(color=self.metric_colors['instruction_following'])))
        fig.update_layout(
            polar=dict(radialaxis=dict(visible=True, range=[0, 1])),
            title=f"{agent_name} - Performance Spider Chart", template='plotly_white')
        return fig

    # Horizontal bar chart showing scores for a single evaluation
    def create_score_bars(self, scores: Dict, agent_name: str = "Agent") -> go.Figure:
        """Horizontal bar chart of every metric present in *scores* (x-range [0, 1])."""
        metric_map = {
            'overall_score': 'Overall Score', 'instruction_following': 'Instruction Following',
            'hallucination_score': 'Hallucination Control', 'assumption_control': 'Assumption Control',
            'coherence': 'Coherence', 'accuracy': 'Accuracy'
        }
        # Compute the present keys once so labels, values and colors stay aligned.
        present = [key for key in metric_map if key in scores]
        metrics = [metric_map[key] for key in present]
        values = [scores[key] for key in present]
        colors = [self.metric_colors.get(key, '#667eea') for key in present]
        fig = go.Figure(go.Bar(
            y=metrics, x=values, orientation='h', marker=dict(color=colors),
            text=[f'{v:.2f}' for v in values], textposition='auto'))
        fig.update_layout(title=f"{agent_name} - Score Breakdown",
                          xaxis=dict(range=[0, 1]), template='plotly_white', showlegend=False)
        return fig

    # Heatmap showing evaluation scores across agents and tasks
    def create_evaluation_heatmap(self, results: List[Dict]) -> go.Figure:
        """Heatmap of overall scores: agents on the y-axis, tasks on the x-axis."""
        if not results:
            return go.Figure().update_layout(title="No data for heatmap")
        df = pd.DataFrame([
            {'agent': r['agent_name'], 'task': r['task_id'],
             'score': r['scores'].get('overall_score', 0)}
            for r in results
        ])
        # pivot_table averages repeated (agent, task) pairs, where df.pivot
        # would raise ValueError on duplicates.
        pivot_df = df.pivot_table(index='agent', columns='task', values='score', aggfunc='mean')
        fig = go.Figure(data=go.Heatmap(
            z=pivot_df.values, x=pivot_df.columns, y=pivot_df.index,
            colorscale='Viridis', colorbar=dict(title="Score")))
        fig.update_layout(title="Agent Performance Heatmap", xaxis_title="Tasks",
                          yaxis_title="Agents", template='plotly_white')
        return fig

    # Violin plots for spread in scores across agents
    def create_score_distribution(self, results: List[Dict]) -> go.Figure:
        """One violin plot per metric showing the score spread across all results."""
        if not results:
            return go.Figure().update_layout(title="No data for distribution plot")
        rows = []
        for r in results:
            entry = {'Agent': r.get('agent_name', 'Unknown')}
            entry.update(r['scores'])
            rows.append(entry)
        df = pd.DataFrame(rows)
        # Only melt metric columns that actually appear in the data;
        # melt raises KeyError for value_vars absent from the frame.
        present = [m for m in self.metric_colors if m in df.columns]
        df = df.melt(id_vars=['Agent'], value_vars=present,
                     var_name='Metric', value_name='Score')
        df['Metric'] = df['Metric'].map({m: m.replace('_', ' ').title() for m in present})
        fig = go.Figure()
        for metric in df['Metric'].unique():
            fig.add_trace(go.Violin(y=df[df['Metric'] == metric]['Score'], name=metric,
                                    box_visible=True, meanline_visible=True))
        fig.update_layout(title="Score Distribution Analysis", yaxis_title="Score",
                          template='plotly_white', showlegend=False)
        return fig

    # Bar chart showing average overall scores of each agent
    def create_performance_trends(self, results: List[Dict]) -> go.Figure:
        """Bar chart of each agent's mean overall score.

        NOTE: kept under its historical name for callers; this is an
        average-per-agent chart, not a time-series trend.
        """
        if not results:
            return go.Figure().update_layout(title="No data for average performance plot")
        agent_scores: Dict[str, List[float]] = {}
        for r in results:
            agent_scores.setdefault(r['agent_name'], []).append(
                r['scores'].get('overall_score', 0))
        avg_scores = {agent: np.mean(vals) for agent, vals in agent_scores.items()}
        fig = go.Figure(go.Bar(
            x=list(avg_scores.keys()),
            y=list(avg_scores.values()),
            text=[f"{v:.2f}" for v in avg_scores.values()],
            textposition="auto",
            marker=dict(color="#667eea")
        ))
        fig.update_layout(
            title="Average Overall Scores by Agent",
            xaxis_title="Agents",
            yaxis_title="Average Overall Score",
            template="plotly_white"
        )
        return fig

    # Comparison chart between two agents
    def create_agent_comparison(self, agent1_results: List[Dict], agent2_results: List[Dict]) -> go.Figure:
        """Grouped bar chart comparing two agents' average scores on every metric."""
        if not agent1_results or not agent2_results:
            # Match the other methods' empty-input behavior instead of IndexError.
            return go.Figure().update_layout(title="No data for agent comparison")
        metrics = list(self.metric_colors.keys())
        agent1_name = agent1_results[0].get('agent_name', 'Agent 1')
        agent2_name = agent2_results[0].get('agent_name', 'Agent 2')
        avg1 = self._avg_metric_scores(agent1_results, metrics)
        avg2 = self._avg_metric_scores(agent2_results, metrics)
        metric_labels = [m.replace('_', ' ').title() for m in metrics]
        fig = go.Figure(data=[
            go.Bar(name=agent1_name, x=metric_labels, y=[avg1[m] for m in metrics]),
            go.Bar(name=agent2_name, x=metric_labels, y=[avg2[m] for m in metrics])
        ])
        fig.update_layout(barmode='group', title="Agent Performance Comparison",
                          yaxis_title="Average Score", template='plotly_white')
        return fig

    # Spider chart comparing two agents
    def create_radar_comparison(self, agent1_results: List[Dict], agent2_results: List[Dict]) -> go.Figure:
        """Overlaid radar chart of two agents' average scores on the five core metrics."""
        if not agent1_results or not agent2_results:
            return go.Figure().update_layout(title="No data for radar comparison")
        metrics = ['instruction_following', 'hallucination_score',
                   'assumption_control', 'coherence', 'accuracy']
        metric_labels = [m.replace('_', ' ').title() for m in metrics]
        avg1 = self._avg_metric_scores(agent1_results, metrics)
        avg2 = self._avg_metric_scores(agent2_results, metrics)
        fig = go.Figure()
        fig.add_trace(go.Scatterpolar(
            r=[avg1[m] for m in metrics], theta=metric_labels, fill='toself',
            name=agent1_results[0].get('agent_name', 'Agent 1')))
        fig.add_trace(go.Scatterpolar(
            r=[avg2[m] for m in metrics], theta=metric_labels, fill='toself',
            name=agent2_results[0].get('agent_name', 'Agent 2')))
        fig.update_layout(polar=dict(radialaxis=dict(visible=True, range=[0, 1])),
                          title="Agent Comparison - Radar Chart", template='plotly_white')
        return fig

    # Performance gap between two agents across metrics
    def create_performance_delta(self, agent1_results: List[Dict], agent2_results: List[Dict]) -> go.Figure:
        """Bar chart of (agent 2 − agent 1) average score per metric.

        Bars are green where agent 2 is ahead (delta >= 0), red otherwise.
        """
        if not agent1_results or not agent2_results:
            return go.Figure().update_layout(title="No data for performance delta")
        metrics = list(self.metric_colors.keys())
        avg1 = self._avg_metric_scores(agent1_results, metrics)
        avg2 = self._avg_metric_scores(agent2_results, metrics)
        deltas = [avg2[m] - avg1[m] for m in metrics]
        colors = ['#48bb78' if d >= 0 else '#fc8181' for d in deltas]
        metric_labels = [m.replace('_', ' ').title() for m in metrics]
        fig = go.Figure(go.Bar(x=metric_labels, y=deltas, marker_color=colors,
                               text=[f'{d:+.2f}' for d in deltas]))
        fig.update_layout(title="Performance Delta (Agent 2 vs Agent 1)",
                          yaxis_title="Score Difference", template='plotly_white')
        return fig