# Plotly-based visualization helpers for agent evaluation results.
from typing import Dict, List

import numpy as np
import pandas as pd
import plotly.graph_objects as go
class EvaluationVisualizer:
    """Builds Plotly figures for visualizing agent evaluation results."""

    def __init__(self):
        # One fixed color per metric. Insertion order doubles as display
        # order in the bar/delta charts, so do not reorder these keys.
        self.metric_colors = {
            'instruction_following': '#667eea',
            'hallucination_score': '#48bb78',
            'assumption_control': '#f6ad55',
            'coherence': '#63b3ed',
            'accuracy': '#fc8181',
            'overall_score': '#764ba2',
        }
| #Spider chart with multi dimensional scores for single evaluation | |
| def create_spider_chart(self, scores: Dict, agent_name: str = "Agent") -> go.Figure: | |
| metrics = ['Instruction\nFollowing', 'Hallucination\nControl', 'Assumption\nControl', 'Coherence', 'Accuracy'] | |
| values = [ | |
| scores.get('instruction_following', 0), scores.get('hallucination_score', 0), | |
| scores.get('assumption_control', 0), scores.get('coherence', 0), | |
| scores.get('accuracy', 0) | |
| ] | |
| fig = go.Figure() | |
| fig.add_trace(go.Scatterpolar(r=values, theta=metrics, fill='toself', name=agent_name, line=dict(color=self.metric_colors['instruction_following']))) | |
| fig.update_layout(polar=dict(radialaxis=dict(visible=True, range=[0, 1])), title=f"{agent_name} - Performance Spider Chart", template='plotly_white') | |
| return fig | |
| # Horizontal bar chart showing scores for single evaluation | |
| def create_score_bars(self, scores: Dict, agent_name: str = "Agent") -> go.Figure: | |
| metric_map = { | |
| 'overall_score': 'Overall Score', 'instruction_following': 'Instruction Following', | |
| 'hallucination_score': 'Hallucination Control', 'assumption_control': 'Assumption Control', | |
| 'coherence': 'Coherence', 'accuracy': 'Accuracy' | |
| } | |
| metrics = [label for key, label in metric_map.items() if key in scores] | |
| values = [scores[key] for key in metric_map if key in scores] | |
| colors = [self.metric_colors.get(key, '#667eea') for key in metric_map if key in scores] | |
| fig = go.Figure(go.Bar(y=metrics, x=values, orientation='h', marker=dict(color=colors), text=[f'{v:.2f}' for v in values], textposition='auto')) | |
| fig.update_layout(title=f"{agent_name} - Score Breakdown", xaxis=dict(range=[0, 1]), template='plotly_white', showlegend=False) | |
| return fig | |
| # Heatmap showing evaluation scores across agents and tasks | |
| def create_evaluation_heatmap(self, results: List[Dict]) -> go.Figure: | |
| if not results: return go.Figure().update_layout(title="No data for heatmap") | |
| df_data = [{'agent': r['agent_name'], 'task': r['task_id'], 'score': r['scores'].get('overall_score', 0)} for r in results] | |
| df = pd.DataFrame(df_data) | |
| pivot_df = df.pivot(index='agent', columns='task', values='score') | |
| fig = go.Figure(data=go.Heatmap(z=pivot_df.values, x=pivot_df.columns, y=pivot_df.index, colorscale='Viridis', colorbar=dict(title="Score"))) | |
| fig.update_layout(title="Agent Performance Heatmap", xaxis_title="Tasks", yaxis_title="Agents", template='plotly_white') | |
| return fig | |
| # Violin plots for spread in scores across Agents | |
| def create_score_distribution(self, results: List[Dict]) -> go.Figure: | |
| if not results: return go.Figure().update_layout(title="No data for distribution plot") | |
| df_data = [] | |
| for r in results: | |
| entry = {'Agent': r.get('agent_name', 'Unknown')} | |
| entry.update(r['scores']) | |
| df_data.append(entry) | |
| df = pd.DataFrame(df_data).melt(id_vars=['Agent'], value_vars=self.metric_colors.keys(), var_name='Metric', value_name='Score') | |
| metric_map = {k: k.replace('_', ' ').title() for k in self.metric_colors.keys()} | |
| df['Metric'] = df['Metric'].map(metric_map) | |
| fig = go.Figure() | |
| for metric in df['Metric'].unique(): | |
| fig.add_trace(go.Violin(y=df[df['Metric'] == metric]['Score'], name=metric, box_visible=True, meanline_visible=True)) | |
| fig.update_layout(title="Score Distribution Analysis", yaxis_title="Score", template='plotly_white', showlegend=False) | |
| return fig | |
| # Bar chart showing average overall scores of each agent | |
| def create_performance_trends(self, results: List[Dict]) -> go.Figure: | |
| if not results: | |
| return go.Figure().update_layout(title="No data for average performance plot") | |
| agent_scores = {} | |
| for r in results: | |
| agent = r['agent_name'] | |
| if agent not in agent_scores: | |
| agent_scores[agent] = [] | |
| agent_scores[agent].append(r['scores'].get('overall_score', 0)) | |
| # Compute averages | |
| avg_scores = {agent: np.mean(scores) for agent, scores in agent_scores.items()} | |
| fig = go.Figure(go.Bar( | |
| x=list(avg_scores.keys()), | |
| y=list(avg_scores.values()), | |
| text=[f"{v:.2f}" for v in avg_scores.values()], | |
| textposition="auto", | |
| marker=dict(color="#667eea") | |
| )) | |
| fig.update_layout( | |
| title="Average Overall Scores by Agent", | |
| xaxis_title="Agents", | |
| yaxis_title="Average Overall Score", | |
| template="plotly_white" | |
| ) | |
| return fig | |
| # Comparison chart between two agents | |
| def create_agent_comparison(self, agent1_results: List[Dict], agent2_results: List[Dict]) -> go.Figure: | |
| metrics = list(self.metric_colors.keys()) | |
| agent1_name = agent1_results[0].get('agent_name', 'Agent 1') | |
| agent2_name = agent2_results[0].get('agent_name', 'Agent 2') | |
| def get_avg_scores(results): | |
| return {m: np.mean([r['scores'].get(m, 0) for r in results]) for m in metrics} | |
| avg1 = get_avg_scores(agent1_results) | |
| avg2 = get_avg_scores(agent2_results) | |
| metric_labels = [m.replace('_', ' ').title() for m in metrics] | |
| fig = go.Figure(data=[ | |
| go.Bar(name=agent1_name, x=metric_labels, y=[avg1[m] for m in metrics]), | |
| go.Bar(name=agent2_name, x=metric_labels, y=[avg2[m] for m in metrics]) | |
| ]) | |
| fig.update_layout(barmode='group', title="Agent Performance Comparison", yaxis_title="Average Score", template='plotly_white') | |
| return fig | |
| # Spider chart comparing two agents | |
| def create_radar_comparison(self, agent1_results: List[Dict], agent2_results: List[Dict]) -> go.Figure: | |
| metrics = ['instruction_following', 'hallucination_score', 'assumption_control', 'coherence', 'accuracy'] | |
| metric_labels = [m.replace('_', ' ').title() for m in metrics] | |
| def get_avg_scores(results): | |
| return [np.mean([r['scores'].get(m, 0) for r in results]) for m in metrics] | |
| agent1_values = get_avg_scores(agent1_results) | |
| agent2_values = get_avg_scores(agent2_results) | |
| fig = go.Figure() | |
| fig.add_trace(go.Scatterpolar(r=agent1_values, theta=metric_labels, fill='toself', name=agent1_results[0].get('agent_name', 'Agent 1'))) | |
| fig.add_trace(go.Scatterpolar(r=agent2_values, theta=metric_labels, fill='toself', name=agent2_results[0].get('agent_name', 'Agent 2'))) | |
| fig.update_layout(polar=dict(radialaxis=dict(visible=True, range=[0, 1])), title="Agent Comparison - Radar Chart", template='plotly_white') | |
| return fig | |
| #performance gap between two agents across metrics | |
| def create_performance_delta(self, agent1_results: List[Dict], agent2_results: List[Dict]) -> go.Figure: | |
| metrics = list(self.metric_colors.keys()) | |
| def get_avg_scores(results): | |
| return {m: np.mean([r['scores'].get(m, 0) for r in results]) for m in metrics} | |
| avg1 = get_avg_scores(agent1_results) | |
| avg2 = get_avg_scores(agent2_results) | |
| deltas = [avg2[m] - avg1[m] for m in metrics] | |
| colors = ['#48bb78' if d >= 0 else '#fc8181' for d in deltas] | |
| metric_labels = [m.replace('_', ' ').title() for m in metrics] | |
| fig = go.Figure(go.Bar(x=metric_labels, y=deltas, marker_color=colors, text=[f'{d:+.2f}' for d in deltas])) | |
| fig.update_layout(title="Performance Delta (Agent 2 vs Agent 1)", yaxis_title="Score Difference", template='plotly_white') | |
| return fig |