import plotly.graph_objects as go
from plotly.subplots import make_subplots
import numpy as np

# 30 distinct colors - assigned by Avg AUP rank
COLOR_PALETTE = [
    "#E91E63", "#4A90E2", "#00BFA5", "#FF6B35", "#8E24AA",
    "#4CAF50", "#FF4081", "#303F9F", "#FFD166", "#00796B",
    "#C2185B", "#7B1FA2", "#26A69A", "#1A4C7C", "#FF8C42",
    "#009688", "#673AB7", "#F44336", "#3F51B5", "#795548",
    "#607D8B", "#9C27B0", "#2196F3", "#CDDC39", "#FF9800",
    "#00BCD4", "#E64A19", "#5D4037", "#455A64", "#AD1457",
]

# Color used when a method has no palette entry.
_FALLBACK_COLOR = "#808080"

# Number of subplot columns in the accuracy-parallelism curve grid.
_CURVE_GRID_COLS = 3


def _methods_by_rank(df):
    """Return the "Method" column ordered by descending "Avg_AUP"."""
    return df.sort_values("Avg_AUP", ascending=False)["Method"].tolist()


def _is_missing(value):
    """Return True for None or a float NaN (NaN is the only float != itself)."""
    return value is None or (isinstance(value, float) and value != value)


def get_model_colors(df):
    """Assign colors to methods by Avg AUP rank (descending).

    The palette wraps around when there are more methods than colors.

    Args:
        df: DataFrame with "Method" and "Avg_AUP" columns.

    Returns:
        Dict mapping method name to a hex color string.
    """
    return {
        model: COLOR_PALETTE[i % len(COLOR_PALETTE)]
        for i, model in enumerate(_methods_by_rank(df))
    }


def get_model_ranks(df):
    """Get the 1-based rank of each method by Avg AUP (descending).

    Args:
        df: DataFrame with "Method" and "Avg_AUP" columns.

    Returns:
        Dict mapping method name to its rank (best method has rank 1).
    """
    return {model: i + 1 for i, model in enumerate(_methods_by_rank(df))}


def hex_to_rgba(hex_color, alpha=0.25):
    """Convert a "#RRGGBB" hex color to a CSS "rgba(r,g,b,alpha)" string.

    Args:
        hex_color: Six-digit hex color, with or without leading "#".
        alpha: Opacity written verbatim into the result.

    Returns:
        The color formatted as "rgba(r,g,b,alpha)".
    """
    hex_color = hex_color.lstrip('#')
    r, g, b = (int(hex_color[i:i + 2], 16) for i in (0, 2, 4))
    return f'rgba({r},{g},{b},{alpha})'


def create_radar_chart(df, tasks, top_n=15):
    """Create radar chart for top N methods showing original AUP scores.

    Each axis is normalized independently (min/max over the displayed rows)
    so that axes with different score ranges remain comparable; the hover
    text still reports the original, un-normalized AUP value.

    Args:
        df: DataFrame with "Method", "Avg_AUP" and one "<task>_AUP" column
            per task. The first ``top_n`` rows are plotted; colors and ranks
            are computed over the whole frame.
        tasks: Task names; each must have a matching "<task>_AUP" column.
        top_n: Number of leading rows of ``df`` to plot.

    Returns:
        A ``plotly.graph_objects.Figure`` with one polar trace per method.
    """
    df_top = df.head(top_n).copy()
    model_colors = get_model_colors(df)
    model_ranks = get_model_ranks(df)

    all_cols = [f"{t}_AUP" for t in tasks] + ["Avg_AUP"]
    categories = [t.replace("-", "\n") for t in tasks] + ["Avg\nAUP"]

    # Compute min/max per column for normalization (for radar display only)
    col_stats = {}
    for col in all_cols:
        vals = df_top[col].dropna().astype(float)
        has_vals = len(vals) > 0
        col_stats[col] = {
            'min': vals.min() if has_vals else 0,
            'max': vals.max() if has_vals else 100,
        }

    fig = go.Figure()

    for _, row in df_top.iterrows():
        method = row["Method"]
        rank = model_ranks.get(method, 0)
        color = model_colors.get(method, _FALLBACK_COLOR)
        display_name = f"#{rank} {method}"

        # Original AUP values for hover display
        original_vals = [row.get(col, 0) or 0 for col in all_cols]

        # Normalized values for radar shape (0-100 scale per axis)
        normalized = []
        for col, val in zip(all_cols, original_vals):
            stats = col_stats[col]
            range_val = stats['max'] - stats['min']
            if range_val > 0:
                # Scale to 10-90 so the best/worst points stay off the
                # center and the outer edge.
                norm = ((val - stats['min']) / range_val) * 80 + 10
            else:
                # Degenerate axis (all values equal): draw mid-radius.
                norm = 50
            normalized.append(norm)

        # Custom hover text showing original AUP scores; "<br>" is the line
        # break Plotly understands inside hover labels.
        hover_texts = [
            f"{display_name}<br>{cat}: {val:.1f}"
            for cat, val in zip(categories, original_vals)
        ]

        # Repeat the first point at the end so the polygon is closed.
        fig.add_trace(go.Scatterpolar(
            r=normalized + [normalized[0]],
            theta=categories + [categories[0]],
            mode='lines+markers',
            fill='toself',
            name=display_name,
            line=dict(color=color, width=2),
            marker=dict(color=color, size=6),
            fillcolor=hex_to_rgba(color, 0.15),
            opacity=0.9,
            text=hover_texts + [hover_texts[0]],
            hovertemplate='%{text}'
        ))

    fig.update_layout(
        height=600,
        margin=dict(l=100, r=250, t=80, b=60),
        title=dict(text=f"🎯 Top {top_n} Methods: AUP Scores in Radar Chart",
                   x=0.5, font=dict(size=18)),
        # Radial ticks are hidden because the radius is a per-axis
        # normalized value, not an AUP score.
        polar=dict(radialaxis=dict(visible=True, range=[0, 100],
                                   tickfont=dict(size=11), tickvals=[],
                                   showticklabels=False)),
        legend=dict(font=dict(size=12), x=1.05, y=1,
                    bgcolor='rgba(255,255,255,0.95)',
                    bordercolor='#ddd', borderwidth=1,
                    title=dict(text="Methods (sorted by Avg AUP)",
                               font=dict(size=13))),
        hoverlabel=dict(bgcolor="white", font_size=14, font_family="Arial",
                        bordercolor="#333")
    )
    return fig


def create_group_bar_chart(df, tasks, top_n=15):
    """Create grouped bar chart with Avg AUP included and rank numbers.

    Args:
        df: DataFrame with "Method", "Avg_AUP" and "<task>_AUP" columns.
            The first ``top_n`` rows are plotted.
        tasks: Task names used both as x-axis categories and to look up
            the "<task>_AUP" columns.
        top_n: Number of leading rows of ``df`` to plot.

    Returns:
        A ``plotly.graph_objects.Figure`` with one bar trace per method;
        benchmarks whose score is missing (None/NaN) are skipped.
    """
    df_top = df.head(top_n).copy()
    model_colors = get_model_colors(df)
    model_ranks = get_model_ranks(df)

    # (x-axis label, source column) for every benchmark plus the average.
    benchmarks = [(t, f"{t}_AUP") for t in tasks] + [("Avg AUP", "Avg_AUP")]

    fig = go.Figure()

    for _, row in df_top.iterrows():
        method = row["Method"]
        color = model_colors.get(method, _FALLBACK_COLOR)
        rank = model_ranks.get(method, 0)
        display_name = f"#{rank} {method}"

        x_vals, y_vals = [], []
        for label, column in benchmarks:
            aup = row.get(column)
            if _is_missing(aup):
                continue
            x_vals.append(label)
            y_vals.append(aup)

        if y_vals:
            fig.add_trace(go.Bar(
                name=display_name,
                x=x_vals,
                y=y_vals,
                marker_color=color,
                hovertemplate=f"{display_name}<br>%{{x}}: %{{y:.1f}}"
            ))

    fig.update_layout(
        height=550,
        margin=dict(l=60, r=250, t=80, b=100),
        title=dict(text=f"📊 Top {top_n} Methods: AUP Scores in Bar Chart",
                   x=0.5, font=dict(size=18)),
        xaxis_title="Benchmark",
        yaxis_title="AUP Score",
        barmode='group',
        bargap=0.2,
        bargroupgap=0.05,
        legend=dict(font=dict(size=11), x=1.02, y=1,
                    bgcolor='rgba(255,255,255,0.95)',
                    bordercolor='#ddd', borderwidth=1,
                    title=dict(text="Methods (sorted by Avg AUP)",
                               font=dict(size=12))),
        hoverlabel=dict(bgcolor="white", font_size=14, font_family="Arial")
    )
    return fig


def create_aup_curve_chart(raw_data, tasks, df, top_n=15):
    """Create a subplot grid of AUP curves with quadratic fitting.

    One subplot is drawn per task plus a final "Average" subplot, laid out
    three per row (a 2x3 grid for five tasks). The original author notes the
    quadratic fit is the same as in plot_lines.py.

    Args:
        raw_data: Mapping ``{task: {method: [(tpf, acc), ...]}}``; only the
            first two items of each point are used.
        tasks: Task names, in subplot order.
        df: DataFrame with "Method" and "Avg_AUP" columns. The first
            ``top_n`` rows select which methods are drawn.
        top_n: Number of leading rows of ``df`` to plot.

    Returns:
        A ``plotly.graph_objects.Figure`` containing, per method and
        subplot, a fitted line trace and a marker trace for the raw points.
    """
    df_top = df.head(top_n).copy()
    model_colors = get_model_colors(df)
    model_ranks = get_model_ranks(df)
    top_methods = df_top["Method"].tolist()
    methods_to_show = set(top_methods)

    # Build per-task data: {task: {method: [(rho, y), ...]}}
    task_data = {t: {} for t in tasks}
    for task in tasks:
        for method, pairs in raw_data.get(task, {}).items():
            if method in methods_to_show:
                task_data[task][method] = [(p[0], p[1]) for p in pairs]

    # Compute average data: average TPF and Acc by index across the tasks
    # that actually have data for the method. Both coordinates use the same
    # divisor, and the shortest series bounds the index so tasks with
    # differing point counts cannot raise IndexError.
    avg_data = {}
    for method in methods_to_show:
        per_task = [task_data[t][method] for t in tasks
                    if task_data[t].get(method)]
        if not per_task:
            continue
        n_points = min(len(points) for points in per_task)
        avg_data[method] = [
            (np.mean([points[i][0] for points in per_task]),
             np.mean([points[i][1] for points in per_task]))
            for i in range(n_points)
        ]

    # One subplot per task + 1 Average, three per row.
    n_cols = _CURVE_GRID_COLS
    n_panels = len(tasks) + 1
    n_rows = -(-n_panels // n_cols)  # ceiling division
    titles = list(tasks) + ["Average"]
    fig = make_subplots(rows=n_rows, cols=n_cols, subplot_titles=titles,
                        horizontal_spacing=0.08, vertical_spacing=0.15)

    # Track which methods have been added to legend
    legend_added = set()

    def get_pos(idx):
        """Map a 0-based panel index to a 1-based (row, col) grid cell."""
        grid_row, grid_col = divmod(idx, n_cols)
        return (grid_row + 1, grid_col + 1)

    def draw_curve(pairs, method, row, col):
        """Add the fitted curve and raw-point markers for one method."""
        if not pairs:
            return

        color = model_colors.get(method, _FALLBACK_COLOR)
        rank = model_ranks.get(method, 0)
        display_name = f"#{rank} {method}"

        # Only the first trace of each method gets a legend entry; the
        # shared legendgroup makes that entry toggle every subplot.
        show_legend = method not in legend_added
        if show_legend:
            legend_added.add(method)

        rho, y = zip(*sorted(pairs, key=lambda x: x[0]))
        rho, y = np.array(rho), np.array(y)

        # Generate smooth curve (quadratic fitting, same as plot_lines.py)
        if len(rho) >= 3:
            z = np.polyfit(rho, y, 2)
            p = np.poly1d(z)
            x_smooth = np.linspace(rho.min(), rho.max(), 300)
            y_smooth = p(x_smooth)
        elif len(rho) == 2:
            x_smooth = np.linspace(rho.min(), rho.max(), 300)
            if rho[1] != rho[0]:
                # Parabola through both points with its vertex at the first.
                a = (y[1] - y[0]) / ((rho[1] - rho[0]) ** 2)
                y_smooth = a * (x_smooth - rho[0]) ** 2 + y[0]
            else:
                y_smooth = np.linspace(y[0], y[1], 300)
        else:
            x_smooth, y_smooth = rho, y

        # Add fitted curve
        fig.add_trace(go.Scatter(
            x=x_smooth,
            y=y_smooth,
            mode='lines',
            name=display_name,
            line=dict(color=color, width=2.5),
            opacity=0.85,
            showlegend=show_legend,
            legendgroup=method,
            hoverinfo='skip'
        ), row=row, col=col)

        # Add markers at original data points
        fig.add_trace(go.Scatter(
            x=rho,
            y=y,
            mode='markers',
            name=display_name,
            marker=dict(color='white', size=8,
                        line=dict(color=color, width=2)),
            showlegend=False,
            legendgroup=method,
            hovertemplate=(f"{display_name}<br>"
                           f"TPF: %{{x:.2f}}<br>"
                           f"Acc: %{{y:.1f}}")
        ), row=row, col=col)

    # Draw task subplots
    for idx, task in enumerate(tasks):
        row, col = get_pos(idx)
        data = task_data.get(task, {})
        for method in top_methods:
            if method in data:
                draw_curve(data[method], method, row, col)

    # Draw Average subplot in the cell right after the last task
    avg_row, avg_col = get_pos(len(tasks))
    for method in top_methods:
        if method in avg_data:
            draw_curve(avg_data[method], method, avg_row, avg_col)

    fig.update_layout(
        height=550,
        margin=dict(l=60, r=250, t=80, b=60),
        title=dict(
            text=f"📈 Top {top_n} Methods: Accuracy-Parallelism Curves",
            x=0.5, font=dict(size=18)),
        legend=dict(font=dict(size=11), x=1.02, y=1,
                    bgcolor='rgba(255,255,255,0.95)',
                    bordercolor='#ddd', borderwidth=1,
                    title=dict(text="Methods (sorted by Avg AUP)",
                               font=dict(size=12)),
                    tracegroupgap=1, itemsizing='constant'),
        hoverlabel=dict(bgcolor="white", font_size=14, font_family="Arial")
    )

    # Update axes labels: x title only on the bottom-most panel of each
    # column, y title only on the first column.
    for idx in range(n_panels):
        row, col = get_pos(idx)
        is_bottom = idx + n_cols >= n_panels
        fig.update_xaxes(
            title_text="TPF (Tokens per Forward)" if is_bottom else "",
            row=row, col=col)
        fig.update_yaxes(title_text="Acc (%)" if col == 1 else "",
                         row=row, col=col)

    return fig