"""
Visualization module for AMA-Bench leaderboard

Adapted from lmGAME_bench patterns with AMA-specific customizations
"""

import plotly.graph_objects as go
import numpy as np
import pandas as pd
import json
import os
from typing import Dict, List, Optional, Tuple

# Constants
METRICS = ["Recall", "Causal Inference", "State Updating", "State Abstraction"]
ALL_METRICS = METRICS + ["Average"]


def load_model_colors(filepath: str = "assets/model_colors.json") -> Tuple[Dict[str, str], str]:
    """
    Load color scheme for models and methods from JSON file.

    Args:
        filepath: Path to color configuration JSON

    Returns:
        Tuple of:
        - Dictionary mapping model/method names to hex colors
        - Fallback hex color to use for names without an entry
    """
    try:
        with open(filepath, 'r', encoding='utf-8') as f:
            color_data = json.load(f)

        # Merge models and methods into single dictionary
        colors = {}
        if 'models' in color_data:
            colors.update(color_data['models'])
        if 'methods' in color_data:
            colors.update(color_data['methods'])

        # Store fallback color
        fallback = color_data.get('fallback', '#808080')

        return colors, fallback
    except Exception as e:
        # Best-effort loading: a missing/corrupt palette degrades to
        # an empty mapping with a neutral gray fallback.
        print(f"Warning: Could not load colors from {filepath}: {e}")
        return {}, '#808080'


def normalize_scores(values: List[float], mean: float, std: float) -> List[float]:
    """
    Normalize scores using z-score and scale to 0-100 range.

    Adapted from lmGAME_bench's normalize_values() function.

    Args:
        values: List of accuracy values (0-1 range)
        mean: Mean value for normalization
        std: Standard deviation for normalization

    Returns:
        List of normalized scores (0-100 range)

    Formula:
        z_score = (value - mean) / std
        normalized = clamp((z_score * 30) + 35, 0, 100)
    """
    # Handle zero std case (all values are the same)
    if std < 0.05:  # Minimum std threshold to prevent extreme values
        std = 0.05

    normalized = []
    for v in values:
        z_score = (v - mean) / std
        scaled = (z_score * 30) + 35
        clamped = max(0, min(100, scaled))
        normalized.append(clamped)

    return normalized


def filter_by_category(data: Dict, category: str) -> Dict:
    """
    Filter method data by category.

    Args:
        data: Full dataset with entries
        category: "All", "RAG", or "Agent Memory"

    Returns:
        Filtered data dictionary
    """
    if category == "All":
        return data

    # Shallow copy is sufficient: only the 'entries' key is replaced.
    filtered_data = data.copy()
    filtered_data['entries'] = [
        entry for entry in data['entries']
        if entry.get('category') == category
    ]

    return filtered_data


def prepare_dataframe_for_visualization(
    data: Dict,
    top_n: Optional[int] = None,
    category_filter: str = "All",
    selected_metrics: Optional[List[str]] = None
) -> pd.DataFrame:
    """
    Build DataFrame with both raw and normalized scores.

    Args:
        data: Raw data from model_data.json or method_data.json
        top_n: Number of top entries to include (None = all)
        category_filter: "All", "RAG", or "Agent Memory" (for methods only)
        selected_metrics: List of metrics to include (None = all)

    Returns:
        DataFrame with columns:
        - Name (method/model name)
        - Category (if applicable)
        - {Metric} (raw accuracy 0-1) for each metric
        - norm_{Metric} (normalized 0-100) for each metric
        - Avg Normalized Score (mean of normalized scores)
    """
    # Filter by category first
    if category_filter != "All":
        data = filter_by_category(data, category_filter)
        if not data['entries']:
            # Return empty DataFrame if no entries
            return pd.DataFrame()

    # Use all metrics if none specified
    if selected_metrics is None:
        selected_metrics = METRICS

    # Build basic DataFrame
    rows = []
    for entry in data['entries']:
        row = {
            'Name': entry['method'],
        }

        # Add category if present
        if entry.get('category') is not None:
            row['Category'] = entry['category']

        # Add raw scores (missing metrics default to 0.0 accuracy)
        for metric in selected_metrics:
            score_data = entry['scores'].get(metric, {})
            row[metric] = score_data.get('accuracy', 0.0)

        # Add average
        row['Average'] = entry['scores'].get('Average', {}).get('accuracy', 0.0)

        rows.append(row)

    df = pd.DataFrame(rows)

    # Sort by average accuracy (descending)
    df = df.sort_values(by='Average', ascending=False)

    # Calculate normalization parameters from FULL dataset (before limiting),
    # so a top_n cut does not shift the normalization baseline.
    norm_params = {}
    for metric in selected_metrics:
        values = df[metric].values
        mean = values.mean()
        std = values.std()
        norm_params[metric] = (mean, std)

    # Apply top_n limit if specified
    if top_n is not None and top_n > 0:
        df = df.head(top_n)

    # Add normalized scores
    for metric in selected_metrics:
        mean, std = norm_params[metric]
        values = df[metric].values
        df[f'norm_{metric}'] = normalize_scores(values.tolist(), mean, std)

    # Calculate average normalized score
    norm_cols = [f'norm_{metric}' for metric in selected_metrics]
    df['Avg Normalized Score'] = df[norm_cols].mean(axis=1)

    # Reset index
    df = df.reset_index(drop=True)

    return df


def hex_to_rgba(hex_color: str, alpha: float = 0.2) -> str:
    """
    Convert hex color to RGBA with specified alpha.

    Args:
        hex_color: Hex color code (e.g., "#FF0000")
        alpha: Alpha value (0-1)

    Returns:
        RGBA color string
    """
    hex_color = hex_color.lstrip('#')
    r = int(hex_color[0:2], 16)
    g = int(hex_color[2:4], 16)
    b = int(hex_color[4:6], 16)
    return f'rgba({r}, {g}, {b}, {alpha})'


def create_radar_chart(
    df: pd.DataFrame,
    selected_metrics: List[str],
    title: str = "Performance Across Metrics",
    color_map: Optional[Dict[str, str]] = None
) -> go.Figure:
    """
    Create radar chart with normalized scores.

    Adapted from lmGAME_bench's create_single_radar_chart().

    Args:
        df: DataFrame from prepare_dataframe_for_visualization()
        selected_metrics: List of metric names to include as axes
        title: Chart title
        color_map: Dictionary mapping names to colors

    Returns:
        Plotly Figure with radar chart

    Features:
        - Each axis = one metric
        - Each trace = one model/method
        - Range: 0-100 (normalized)
        - Interactive legend (click to isolate, double-click to toggle)
    """
    if df.empty:
        fig = go.Figure()
        fig.update_layout(title="No data available")
        return fig

    # Load colors if not provided
    if color_map is None:
        color_map, fallback_color = load_model_colors()
    else:
        fallback_color = '#808080'

    # Check if we have normalized columns
    norm_cols = [f'norm_{metric}' for metric in selected_metrics]
    if not all(col in df.columns for col in norm_cols):
        fig = go.Figure()
        fig.update_layout(title="Missing normalized data")
        return fig

    fig = go.Figure()

    # Add trace for each model/method
    for _, row in df.iterrows():
        name = row['Name']

        # Get normalized values for selected metrics
        r = [row[f'norm_{metric}'] for metric in selected_metrics]

        # Get color
        color = color_map.get(name, fallback_color)
        fillcolor = hex_to_rgba(color, 0.2)

        # Add trace
        fig.add_trace(go.Scatterpolar(
            r=r + [r[0]],  # Close the polygon
            theta=selected_metrics + [selected_metrics[0]],
            mode='lines+markers',
            fill='toself',
            name=name.lower(),  # Lowercase for legend
            line=dict(color=color, width=2),
            marker=dict(color=color, size=6),
            fillcolor=fillcolor,
            opacity=0.7,
            # NOTE(review): line break restored as Plotly's <br> markup —
            # presumed stripped from the original source; verify rendering.
            hovertemplate='%{fullData.name}<br>%{theta}: %{r:.1f}'
        ))

    # Update layout
    fig.update_layout(
        title=dict(
            text=title,
            x=0.5,
            xanchor='center',
            font=dict(size=18)
        ),
        polar=dict(
            radialaxis=dict(
                visible=True,
                range=[0, 100],
                tickfont=dict(size=11),
                gridcolor='lightgray',
                gridwidth=1
            ),
            angularaxis=dict(
                tickfont=dict(size=12, weight='bold')
            )
        ),
        legend=dict(
            font=dict(size=11),
            title=dict(text="Models/Methods 💡", font=dict(size=12)),
            itemsizing='trace',
            x=1.05,
            y=1,
            xanchor='left',
            yanchor='top',
            bgcolor='rgba(255,255,255,0.6)',
            bordercolor='gray',
            borderwidth=1,
            itemclick="toggleothers",
            itemdoubleclick="toggle"
        ),
        height=550,
        margin=dict(l=80, r=200, t=80, b=80)
    )

    return fig


def create_group_bar_chart(
    df: pd.DataFrame,
    selected_metrics: List[str],
    top_n: int = 5,
    color_map: Optional[Dict[str, str]] = None
) -> go.Figure:
    """
    Create grouped bar chart showing top N performers per metric.

    Adapted from lmGAME_bench's create_group_bar_chart().

    Args:
        df: DataFrame with normalized scores
        selected_metrics: List of metrics to display
        top_n: Number of top performers to show per metric
        color_map: Dictionary mapping names to colors

    Returns:
        Plotly Figure with grouped bar chart

    Structure:
        - X-axis: Metrics with rank positions (e.g., "Recall #1", "Recall #2")
        - Y-axis: Normalized score (0-100)
        - Bars: Grouped by model/method
    """
    if df.empty:
        fig = go.Figure()
        fig.update_layout(title="No data available")
        return fig

    # Load colors if not provided
    if color_map is None:
        color_map, fallback_color = load_model_colors()
    else:
        fallback_color = '#808080'

    # Check for normalized columns
    norm_cols = [f'norm_{metric}' for metric in selected_metrics]
    if not all(col in df.columns for col in norm_cols):
        fig = go.Figure()
        fig.update_layout(title="Missing normalized data")
        return fig

    # Build x-axis categories and data structure
    all_x_categories = []
    all_names = set()
    metric_rankings = {}

    for metric in selected_metrics:
        norm_col = f'norm_{metric}'

        # Get top N for this metric
        metric_df = df[df[norm_col].notna()].copy()
        metric_df = metric_df.sort_values(by=norm_col, ascending=False).head(top_n)

        metric_rankings[metric] = []
        for rank, (_, row) in enumerate(metric_df.iterrows(), 1):
            name = row['Name']
            score = row[norm_col]
            # NOTE(review): line break restored as <br> (Plotly tick-label
            # markup) — presumed stripped from the original source.
            x_category = f"{metric}<br>#{rank}"

            metric_rankings[metric].append({
                'name': name,
                'score': score,
                'x_category': x_category,
                'rank': rank
            })

            all_x_categories.append(x_category)
            all_names.add(name)

    # Create traces for each model/method
    fig = go.Figure()

    for name in sorted(all_names):
        x_vals = []
        y_vals = []

        for metric in selected_metrics:
            # Find this model/method's data for this metric
            for data in metric_rankings[metric]:
                if data['name'] == name:
                    x_vals.append(data['x_category'])
                    y_vals.append(data['score'])
                    break

        if x_vals:  # Only add if has data
            color = color_map.get(name, fallback_color)

            fig.add_trace(go.Bar(
                name=name,
                x=x_vals,
                y=y_vals,
                marker_color=color,
                hovertemplate="%{fullData.name}<br>Score: %{y:.1f}"
            ))

    # Update layout
    fig.update_layout(
        title=dict(
            text=f"Top {top_n} Performers by Metric",
            x=0.5,
            xanchor='center',
            font=dict(size=18)
        ),
        xaxis_title="Metrics (Ranked by Performance)",
        yaxis_title="Normalized Score",
        xaxis=dict(
            categoryorder='array',
            categoryarray=all_x_categories,
            tickangle=0
        ),
        yaxis=dict(range=[0, 100]),
        barmode='group',
        bargap=0.15,
        bargroupgap=0.1,
        height=550,
        margin=dict(l=60, r=200, t=80, b=80),
        legend=dict(
            font=dict(size=11),
            title=dict(text="Models/Methods 💡", font=dict(size=12)),
            itemsizing='trace',
            x=1.05,
            y=1,
            xanchor='left',
            yanchor='top',
            bgcolor='rgba(255,255,255,0.6)',
            bordercolor='gray',
            borderwidth=1
        )
    )

    return fig


def create_horizontal_bar_chart(
    df: pd.DataFrame,
    metric: str,
    color_map: Optional[Dict[str, str]] = None
) -> go.Figure:
    """
    Create horizontal bar chart for single metric details view.

    Adapted from lmGAME_bench's create_horizontal_bar_chart().

    Args:
        df: DataFrame with scores
        metric: Metric name (e.g., "Recall")
        color_map: Dictionary mapping names to colors

    Returns:
        Plotly Figure with horizontal bar chart

    Features:
        - Y-axis: Model/method names (sorted by score, descending)
        - X-axis: Raw accuracy score (0-1 range)
        - Uses raw scores, not normalized
    """
    if df.empty or metric not in df.columns:
        fig = go.Figure()
        fig.update_layout(title=f"No data available for {metric}")
        return fig

    # Load colors if not provided
    if color_map is None:
        color_map, fallback_color = load_model_colors()
    else:
        fallback_color = '#808080'

    # Filter and sort
    metric_df = df[df[metric].notna()].copy()
    metric_df = metric_df.sort_values(by=metric, ascending=True)  # Lowest at top

    if metric_df.empty:
        fig = go.Figure()
        fig.update_layout(title=f"No valid data for {metric}")
        return fig

    # Create bar chart
    colors = [color_map.get(name, fallback_color) for name in metric_df['Name']]

    fig = go.Figure(
        go.Bar(
            y=metric_df['Name'],
            x=metric_df[metric],
            orientation='h',
            marker=dict(
                color=colors,
                line=dict(color='#2c3e50', width=1)
            ),
            hovertemplate='%{y}<br>Accuracy: %{x:.4f}'
        )
    )

    # Update layout
    fig.update_layout(
        title=dict(
            text=f'{metric} - Detailed Rankings',
            x=0.5,
            xanchor='center',
            font=dict(size=18)
        ),
        xaxis_title="Accuracy",
        yaxis_title="Model/Method",
        xaxis=dict(
            range=[0, 1],
            gridcolor='#e0e0e0'
        ),
        plot_bgcolor='rgba(0,0,0,0)',
        paper_bgcolor='rgba(0,0,0,0)',
        font=dict(color='#2c3e50'),
        height=max(400, len(metric_df) * 30),  # Dynamic height based on entries
        margin=dict(l=200, r=40, t=80, b=60),
        showlegend=False
    )

    return fig


def create_multi_metric_bar_chart(
    df: pd.DataFrame,
    selected_metrics: List[str],
    color_map: Optional[Dict[str, str]] = None
) -> go.Figure:
    """
    Create grouped horizontal bar chart showing multiple metrics for each model/method.

    Args:
        df: DataFrame with scores
        selected_metrics: List of metrics to display (e.g., ["Recall", "Causal Inference"])
        color_map: Dictionary mapping names to colors

    Returns:
        Plotly Figure with grouped horizontal bar chart

    Features:
        - Y-axis: Model/method names
        - X-axis: Raw accuracy score (0-1 range)
        - Multiple bars per model/method (one per selected metric)
        - Sorted by average score across selected metrics
    """
    if df.empty or not selected_metrics:
        fig = go.Figure()
        fig.update_layout(title="No data available")
        return fig

    # Check if all selected metrics exist
    missing_metrics = [m for m in selected_metrics if m not in df.columns]
    if missing_metrics:
        fig = go.Figure()
        fig.update_layout(title=f"Missing metrics: {', '.join(missing_metrics)}")
        return fig

    # Filter to entries that have at least one selected metric
    metric_df = df.copy()
    metric_df = metric_df[metric_df[selected_metrics].notna().any(axis=1)]

    if metric_df.empty:
        fig = go.Figure()
        fig.update_layout(title="No valid data for selected metrics")
        return fig

    # Calculate average score across selected metrics for sorting
    metric_df['avg_score'] = metric_df[selected_metrics].mean(axis=1)
    metric_df = metric_df.sort_values(by='avg_score', ascending=True)  # Lowest at top

    # Use single base color with gradient based on capability
    base_color = "#636EFA"  # Blue color

    # Normalize avg_score to create gradient (0.3 to 1.0 range for visibility)
    min_score = metric_df['avg_score'].min()
    max_score = metric_df['avg_score'].max()
    score_range = max_score - min_score if max_score > min_score else 1

    # Create color gradient based on model capability (higher score = deeper color)
    def get_gradient_color(score, min_val, max_val, score_range):
        """Generate color with gradient based on score"""
        # Normalize to 0-1 range, then scale to 0.3-1.0 for better visibility
        normalized = (score - min_val) / score_range if score_range > 0 else 0.5
        intensity = 0.3 + (normalized * 0.7)  # Range: 0.3 (light) to 1.0 (deep)

        # Convert base color to RGB and apply intensity with 50% opacity
        hex_color = base_color.lstrip('#')
        r = int(hex_color[0:2], 16)
        g = int(hex_color[2:4], 16)
        b = int(hex_color[4:6], 16)

        # Apply intensity to RGB values (blend toward white as intensity drops)
        r = int(255 - (255 - r) * intensity)
        g = int(255 - (255 - g) * intensity)
        b = int(255 - (255 - b) * intensity)

        return f'rgba({r}, {g}, {b}, 0.5)'  # 50% transparency

    # Create grouped bar chart
    fig = go.Figure()

    for metric in selected_metrics:
        # Create color array for each model based on their avg_score
        colors = [
            get_gradient_color(row['avg_score'], min_score, max_score, score_range)
            for _, row in metric_df.iterrows()
        ]

        fig.add_trace(go.Bar(
            name=metric,
            y=metric_df['Name'],
            x=metric_df[metric],
            orientation='h',
            marker=dict(
                color=colors,
                line=dict(color='#2c3e50', width=0.5)
            ),
            hovertemplate=f'%{{y}}<br>{metric}: %{{x:.4f}}'
        ))

    # Update layout
    fig.update_layout(
        title=dict(
            text=f'Detailed Comparison - {", ".join(selected_metrics)}',
            x=0.5,
            xanchor='center',
            font=dict(size=18)
        ),
        xaxis_title="Accuracy",
        yaxis_title="Model/Method",
        xaxis=dict(
            range=[0, 1],
            gridcolor='#e0e0e0'
        ),
        barmode='group',
        plot_bgcolor='rgba(0,0,0,0)',
        paper_bgcolor='rgba(0,0,0,0)',
        font=dict(color='#2c3e50'),
        height=max(500, len(metric_df) * 40),  # Dynamic height
        margin=dict(l=200, r=40, t=80, b=80),
        legend=dict(
            orientation="h",
            yanchor="bottom",
            y=1.02,
            xanchor="center",
            x=0.5,
            font=dict(size=12)
        )
    )

    return fig