# visualizer_module.py — Plotly figure builders for agent evaluation results.
import plotly.graph_objects as go
import pandas as pd
import numpy as np
from typing import Dict, List
class EvaluationVisualizer:
    """Builds Plotly figures for visualizing agent evaluation scores."""

    def __init__(self):
        # Fixed hex-color palette keyed by metric name; read by the chart builders.
        self.metric_colors = {
            'instruction_following': '#667eea',
            'hallucination_score': '#48bb78',
            'assumption_control': '#f6ad55',
            'coherence': '#63b3ed',
            'accuracy': '#fc8181',
            'overall_score': '#764ba2',
        }
#Spider chart with multi dimensional scores for single evaluation
def create_spider_chart(self, scores: Dict, agent_name: str = "Agent") -> go.Figure:
metrics = ['Instruction\nFollowing', 'Hallucination\nControl', 'Assumption\nControl', 'Coherence', 'Accuracy']
values = [
scores.get('instruction_following', 0), scores.get('hallucination_score', 0),
scores.get('assumption_control', 0), scores.get('coherence', 0),
scores.get('accuracy', 0)
]
fig = go.Figure()
fig.add_trace(go.Scatterpolar(r=values, theta=metrics, fill='toself', name=agent_name, line=dict(color=self.metric_colors['instruction_following'])))
fig.update_layout(polar=dict(radialaxis=dict(visible=True, range=[0, 1])), title=f"{agent_name} - Performance Spider Chart", template='plotly_white')
return fig
# Horizontal bar chart showing scores for single evaluation
def create_score_bars(self, scores: Dict, agent_name: str = "Agent") -> go.Figure:
metric_map = {
'overall_score': 'Overall Score', 'instruction_following': 'Instruction Following',
'hallucination_score': 'Hallucination Control', 'assumption_control': 'Assumption Control',
'coherence': 'Coherence', 'accuracy': 'Accuracy'
}
metrics = [label for key, label in metric_map.items() if key in scores]
values = [scores[key] for key in metric_map if key in scores]
colors = [self.metric_colors.get(key, '#667eea') for key in metric_map if key in scores]
fig = go.Figure(go.Bar(y=metrics, x=values, orientation='h', marker=dict(color=colors), text=[f'{v:.2f}' for v in values], textposition='auto'))
fig.update_layout(title=f"{agent_name} - Score Breakdown", xaxis=dict(range=[0, 1]), template='plotly_white', showlegend=False)
return fig
# Heatmap showing evaluation scores across agents and tasks
def create_evaluation_heatmap(self, results: List[Dict]) -> go.Figure:
if not results: return go.Figure().update_layout(title="No data for heatmap")
df_data = [{'agent': r['agent_name'], 'task': r['task_id'], 'score': r['scores'].get('overall_score', 0)} for r in results]
df = pd.DataFrame(df_data)
pivot_df = df.pivot(index='agent', columns='task', values='score')
fig = go.Figure(data=go.Heatmap(z=pivot_df.values, x=pivot_df.columns, y=pivot_df.index, colorscale='Viridis', colorbar=dict(title="Score")))
fig.update_layout(title="Agent Performance Heatmap", xaxis_title="Tasks", yaxis_title="Agents", template='plotly_white')
return fig
# Violin plots for spread in scores across Agents
def create_score_distribution(self, results: List[Dict]) -> go.Figure:
if not results: return go.Figure().update_layout(title="No data for distribution plot")
df_data = []
for r in results:
entry = {'Agent': r.get('agent_name', 'Unknown')}
entry.update(r['scores'])
df_data.append(entry)
df = pd.DataFrame(df_data).melt(id_vars=['Agent'], value_vars=self.metric_colors.keys(), var_name='Metric', value_name='Score')
metric_map = {k: k.replace('_', ' ').title() for k in self.metric_colors.keys()}
df['Metric'] = df['Metric'].map(metric_map)
fig = go.Figure()
for metric in df['Metric'].unique():
fig.add_trace(go.Violin(y=df[df['Metric'] == metric]['Score'], name=metric, box_visible=True, meanline_visible=True))
fig.update_layout(title="Score Distribution Analysis", yaxis_title="Score", template='plotly_white', showlegend=False)
return fig
# Bar chart showing average overall scores of each agent
def create_performance_trends(self, results: List[Dict]) -> go.Figure:
if not results:
return go.Figure().update_layout(title="No data for average performance plot")
agent_scores = {}
for r in results:
agent = r['agent_name']
if agent not in agent_scores:
agent_scores[agent] = []
agent_scores[agent].append(r['scores'].get('overall_score', 0))
# Compute averages
avg_scores = {agent: np.mean(scores) for agent, scores in agent_scores.items()}
fig = go.Figure(go.Bar(
x=list(avg_scores.keys()),
y=list(avg_scores.values()),
text=[f"{v:.2f}" for v in avg_scores.values()],
textposition="auto",
marker=dict(color="#667eea")
))
fig.update_layout(
title="Average Overall Scores by Agent",
xaxis_title="Agents",
yaxis_title="Average Overall Score",
template="plotly_white"
)
return fig
# Comparison chart between two agents
def create_agent_comparison(self, agent1_results: List[Dict], agent2_results: List[Dict]) -> go.Figure:
metrics = list(self.metric_colors.keys())
agent1_name = agent1_results[0].get('agent_name', 'Agent 1')
agent2_name = agent2_results[0].get('agent_name', 'Agent 2')
def get_avg_scores(results):
return {m: np.mean([r['scores'].get(m, 0) for r in results]) for m in metrics}
avg1 = get_avg_scores(agent1_results)
avg2 = get_avg_scores(agent2_results)
metric_labels = [m.replace('_', ' ').title() for m in metrics]
fig = go.Figure(data=[
go.Bar(name=agent1_name, x=metric_labels, y=[avg1[m] for m in metrics]),
go.Bar(name=agent2_name, x=metric_labels, y=[avg2[m] for m in metrics])
])
fig.update_layout(barmode='group', title="Agent Performance Comparison", yaxis_title="Average Score", template='plotly_white')
return fig
# Spider chart comparing two agents
def create_radar_comparison(self, agent1_results: List[Dict], agent2_results: List[Dict]) -> go.Figure:
metrics = ['instruction_following', 'hallucination_score', 'assumption_control', 'coherence', 'accuracy']
metric_labels = [m.replace('_', ' ').title() for m in metrics]
def get_avg_scores(results):
return [np.mean([r['scores'].get(m, 0) for r in results]) for m in metrics]
agent1_values = get_avg_scores(agent1_results)
agent2_values = get_avg_scores(agent2_results)
fig = go.Figure()
fig.add_trace(go.Scatterpolar(r=agent1_values, theta=metric_labels, fill='toself', name=agent1_results[0].get('agent_name', 'Agent 1')))
fig.add_trace(go.Scatterpolar(r=agent2_values, theta=metric_labels, fill='toself', name=agent2_results[0].get('agent_name', 'Agent 2')))
fig.update_layout(polar=dict(radialaxis=dict(visible=True, range=[0, 1])), title="Agent Comparison - Radar Chart", template='plotly_white')
return fig
#performance gap between two agents across metrics
def create_performance_delta(self, agent1_results: List[Dict], agent2_results: List[Dict]) -> go.Figure:
metrics = list(self.metric_colors.keys())
def get_avg_scores(results):
return {m: np.mean([r['scores'].get(m, 0) for r in results]) for m in metrics}
avg1 = get_avg_scores(agent1_results)
avg2 = get_avg_scores(agent2_results)
deltas = [avg2[m] - avg1[m] for m in metrics]
colors = ['#48bb78' if d >= 0 else '#fc8181' for d in deltas]
metric_labels = [m.replace('_', ' ').title() for m in metrics]
fig = go.Figure(go.Bar(x=metric_labels, y=deltas, marker_color=colors, text=[f'{d:+.2f}' for d in deltas]))
fig.update_layout(title="Performance Delta (Agent 2 vs Agent 1)", yaxis_title="Score Difference", template='plotly_white')
return fig