# dLLM_Leaderboard/src/display/visualization.py
# d3LLM-Data-LLaDA - Initial commit (d473371)
import plotly.graph_objects as go
from plotly.subplots import make_subplots
import numpy as np
# Palette of 30 distinct hex colours. Methods are mapped onto it in order of
# their Avg AUP rank (see get_model_colors).
COLOR_PALETTE = """
    #E91E63 #4A90E2 #00BFA5 #FF6B35 #8E24AA
    #4CAF50 #FF4081 #303F9F #FFD166 #00796B
    #C2185B #7B1FA2 #26A69A #1A4C7C #FF8C42
    #009688 #673AB7 #F44336 #3F51B5 #795548
    #607D8B #9C27B0 #2196F3 #CDDC39 #FF9800
    #00BCD4 #E64A19 #5D4037 #455A64 #AD1457
""".split()
def get_model_colors(df):
    """Map each method to a palette colour according to its Avg AUP rank.

    Rows are ordered by descending ``Avg_AUP``; the i-th method receives the
    i-th entry of ``COLOR_PALETTE``, wrapping around when there are more
    methods than colours.

    Returns:
        dict mapping the ``Method`` value to a hex colour string.
    """
    ranked = df.sort_values("Avg_AUP", ascending=False)
    palette_size = len(COLOR_PALETTE)
    colors = {}
    for position, method in enumerate(ranked["Method"].tolist()):
        colors[method] = COLOR_PALETTE[position % palette_size]
    return colors
def get_model_ranks(df):
    """Return a ``{method: rank}`` mapping ordered by descending Avg AUP.

    The method with the highest ``Avg_AUP`` gets rank 1, the next one rank 2,
    and so on.
    """
    ranked = df.sort_values("Avg_AUP", ascending=False)
    ranks = {}
    for rank, method in enumerate(ranked["Method"].tolist(), start=1):
        ranks[method] = rank
    return ranks
def hex_to_rgba(hex_color, alpha=0.25):
    """Convert a six-digit hex colour (``#`` prefix optional) to a CSS ``rgba()`` string.

    Args:
        hex_color: Colour such as ``"#E91E63"``.
        alpha: Opacity inserted verbatim as the fourth component.

    Returns:
        A string of the form ``"rgba(r,g,b,alpha)"``.
    """
    digits = hex_color.lstrip('#')
    # Each channel is one pair of hex digits: RR, GG, BB.
    red, green, blue = (int(digits[start:start + 2], 16) for start in (0, 2, 4))
    return f'rgba({red},{green},{blue},{alpha})'
def _radar_score(value):
    """Coerce a table cell to a float score; missing or invalid cells become 0.0."""
    try:
        score = float(value)
    except (TypeError, ValueError):
        return 0.0
    # NaN is truthy, so a plain ``value or 0`` would let it through.
    return 0.0 if np.isnan(score) else score


def create_radar_chart(df, tasks, top_n=15):
    """Create a radar chart of per-task AUP scores for the first ``top_n`` rows.

    Every axis (one per task plus "Avg AUP") is rescaled independently: the
    smallest value among the displayed rows maps to 10 and the largest to 90
    on a hidden 0-100 radial axis. The hover text shows the original scores.

    Args:
        df: DataFrame with a ``Method`` column, an ``Avg_AUP`` column and one
            ``<task>_AUP`` column per entry of ``tasks``. The displayed rows
            are ``df.head(top_n)``, so the frame is presumably already sorted
            by Avg AUP - verify against the caller.
        tasks: List of task names.
        top_n: Number of leading rows to draw; also used in the title.

    Returns:
        A ``plotly.graph_objects.Figure`` with one ``Scatterpolar`` trace per row.
    """
    df_top = df.head(top_n).copy()
    model_colors = get_model_colors(df)
    model_ranks = get_model_ranks(df)
    all_cols = [f"{t}_AUP" for t in tasks] + ["Avg_AUP"]
    categories = [t.replace("-", "\n") for t in tasks] + ["Avg\nAUP"]

    # Per-column min/max over the displayed rows, used only to shape the radar.
    col_stats = {}
    for col in all_cols:
        vals = df_top[col].dropna().astype(float)
        col_stats[col] = {
            'min': vals.min() if len(vals) > 0 else 0,
            'max': vals.max() if len(vals) > 0 else 100,
        }

    fig = go.Figure()
    for _, row in df_top.iterrows():
        method = row["Method"]
        rank = model_ranks.get(method, 0)
        color = model_colors.get(method, "#808080")
        display_name = f"#{rank} {method}"

        # Original AUP values for hover display (missing/NaN cells count as 0).
        original_vals = [_radar_score(row.get(col)) for col in all_cols]

        # Normalized values for the radar shape (10-90 band of the 0-100 axis).
        normalized = []
        for col, val in zip(all_cols, original_vals):
            stats = col_stats[col]
            range_val = stats['max'] - stats['min']
            if range_val > 0:
                norm = ((val - stats['min']) / range_val) * 80 + 10
                # A substituted 0 can fall outside the column's observed range;
                # keep the point inside the radial axis range.
                norm = min(max(norm, 0), 100)
            else:
                norm = 50
            normalized.append(norm)

        hover_texts = [
            f"<b>{display_name}</b><br>{cat}: <b>{val:.1f}</b>"
            for cat, val in zip(categories, original_vals)
        ]

        # Repeat the first point so the polygon closes.
        fig.add_trace(go.Scatterpolar(
            r=normalized + [normalized[0]],
            theta=categories + [categories[0]],
            mode='lines+markers', fill='toself', name=display_name,
            line=dict(color=color, width=2), marker=dict(color=color, size=6),
            fillcolor=hex_to_rgba(color, 0.15), opacity=0.9,
            text=hover_texts + [hover_texts[0]],
            hovertemplate='%{text}<extra></extra>'
        ))

    fig.update_layout(
        height=600, margin=dict(l=100, r=250, t=80, b=60),
        title=dict(text=f"🎯 Top {top_n} Methods: AUP Scores in Radar Chart", x=0.5, font=dict(size=18)),
        # Radial ticks are hidden because each axis has its own scale.
        polar=dict(radialaxis=dict(visible=True, range=[0, 100], tickfont=dict(size=11),
                                   tickvals=[], showticklabels=False)),
        legend=dict(font=dict(size=12), x=1.05, y=1, bgcolor='rgba(255,255,255,0.95)',
                    bordercolor='#ddd', borderwidth=1,
                    title=dict(text="Methods (sorted by Avg AUP)", font=dict(size=13))),
        hoverlabel=dict(bgcolor="white", font_size=14, font_family="Arial", bordercolor="#333")
    )
    return fig
def create_group_bar_chart(df, tasks, top_n=15):
    """Create a grouped bar chart of AUP scores for the first ``top_n`` rows.

    One bar trace is added per row, with one bar per task plus a final
    "Avg AUP" bar. Cells that are ``None`` or NaN are skipped, and a row
    with no usable value produces no trace.

    Args:
        df: DataFrame with a ``Method`` column, an ``Avg_AUP`` column and one
            ``<task>_AUP`` column per entry of ``tasks``.
        tasks: List of task (benchmark) names.
        top_n: Number of leading rows to draw; also used in the title.

    Returns:
        A ``plotly.graph_objects.Figure`` in grouped-bar mode.
    """
    df_top = df.head(top_n).copy()
    model_colors = get_model_colors(df)
    model_ranks = get_model_ranks(df)

    # (column to read, x-axis label) for every bar group, Avg AUP last.
    benchmarks = [(f"{task}_AUP", task) for task in tasks] + [("Avg_AUP", "Avg AUP")]

    fig = go.Figure()
    # Iterate the rows directly instead of re-filtering the frame per method:
    # linear instead of quadratic, and rows sharing a method name keep their
    # own values.
    for _, row in df_top.iterrows():
        method = row["Method"]
        color = model_colors.get(method, "#808080")
        rank = model_ranks.get(method, 0)
        display_name = f"#{rank} {method}"

        x_vals, y_vals = [], []
        for column, label in benchmarks:
            aup = row.get(column)
            if aup is None:
                continue
            # Covers Python floats as well as NumPy floats of any width.
            if isinstance(aup, (float, np.floating)) and np.isnan(aup):
                continue
            x_vals.append(label)
            y_vals.append(aup)

        if y_vals:
            fig.add_trace(go.Bar(
                name=display_name, x=x_vals, y=y_vals, marker_color=color,
                hovertemplate=f"<b>{display_name}</b><br>%{{x}}: %{{y:.1f}}<extra></extra>"
            ))

    fig.update_layout(
        height=550, margin=dict(l=60, r=250, t=80, b=100),
        title=dict(text=f"📊 Top {top_n} Methods: AUP Scores in Bar Chart", x=0.5, font=dict(size=18)),
        xaxis_title="Benchmark", yaxis_title="AUP Score",
        barmode='group', bargap=0.2, bargroupgap=0.05,
        legend=dict(font=dict(size=11), x=1.02, y=1, bgcolor='rgba(255,255,255,0.95)',
                    bordercolor='#ddd', borderwidth=1,
                    title=dict(text="Methods (sorted by Avg AUP)", font=dict(size=12))),
        hoverlabel=dict(bgcolor="white", font_size=14, font_family="Arial")
    )
    return fig
def create_aup_curve_chart(raw_data, tasks, df, top_n=15):
    """Create a 2x3 grid of accuracy-vs-TPF curves with quadratic fitting.

    The first subplots show one task each; the cell at row 2, column 3 shows
    the per-index average across tasks. The grid is fixed at 2x3, so the
    layout is presumably meant for exactly five tasks - verify against the
    caller.

    Args:
        raw_data: Mapping ``{task: {method: [(tpf, acc, ...), ...]}}``; only
            the first two items of every point are used.
        tasks: List of task names, used as subplot titles.
        df: DataFrame with ``Method`` and ``Avg_AUP`` columns; methods in
            ``df.head(top_n)`` are drawn, in that row order.
        top_n: Number of leading rows to draw; also used in the title.

    Returns:
        A ``plotly.graph_objects.Figure`` with, per method and subplot, a
        fitted line trace and a marker trace for the original points.
    """
    df_top = df.head(top_n).copy()
    model_colors = get_model_colors(df)
    model_ranks = get_model_ranks(df)
    top_methods = df_top["Method"].tolist()
    methods_to_show = set(top_methods)

    # Per-task data restricted to displayed methods: {task: {method: [(rho, y), ...]}}
    task_data = {
        task: {
            method: [(p[0], p[1]) for p in pairs]
            for method, pairs in raw_data.get(task, {}).items()
            if method in methods_to_show
        }
        for task in tasks
    }

    # Average curve: points are combined by index across tasks.
    n_tasks = len(tasks)
    avg_data = {}
    for method in methods_to_show:
        per_task = [task_data[t][method] for t in tasks if task_data[t].get(method)]
        if not per_task:
            continue
        # Use the shortest series so unequal lengths cannot raise IndexError.
        n_points = min(len(points) for points in per_task)
        # TPF is averaged over the tasks that have data; accuracy is divided
        # by the number of requested tasks, so a task without data counts as
        # zero accuracy.
        # NOTE(review): confirm this asymmetry is intended.
        avg_data[method] = [
            (
                np.mean([points[i][0] for points in per_task]),
                sum(points[i][1] for points in per_task) / n_tasks,
            )
            for i in range(n_points)
        ]

    titles = list(tasks) + ["Average"]
    fig = make_subplots(rows=2, cols=3, subplot_titles=titles,
                        horizontal_spacing=0.08, vertical_spacing=0.15)

    # Methods that already own a legend entry (one entry per method overall).
    legend_added = set()

    def get_pos(idx):
        """Map a flat subplot index to its 1-based (row, col) in a 3-wide grid."""
        grid_row, grid_col = divmod(idx, 3)
        return (grid_row + 1, grid_col + 1)

    def draw_curve(pairs, method, row, col):
        """Add the fitted line and the raw-point markers for one method."""
        if not pairs:
            return
        color = model_colors.get(method, "#808080")
        rank = model_ranks.get(method, 0)
        display_name = f"#{rank} {method}"
        show_legend = method not in legend_added
        if show_legend:
            legend_added.add(method)

        rho, y = zip(*sorted(pairs, key=lambda point: point[0]))
        rho, y = np.array(rho), np.array(y)

        if len(rho) >= 3:
            # Least-squares quadratic through all points.
            fitted = np.poly1d(np.polyfit(rho, y, 2))
            x_smooth = np.linspace(rho.min(), rho.max(), 300)
            y_smooth = fitted(x_smooth)
        elif len(rho) == 2:
            x_smooth = np.linspace(rho.min(), rho.max(), 300)
            if rho[1] != rho[0]:
                # Parabola with its vertex at the first point, passing through the second.
                a = (y[1] - y[0]) / ((rho[1] - rho[0]) ** 2)
                y_smooth = a * (x_smooth - rho[0]) ** 2 + y[0]
            else:
                y_smooth = np.linspace(y[0], y[1], 300)
        else:
            x_smooth, y_smooth = rho, y

        # Fitted curve (carries the legend entry; hover is left to the markers).
        fig.add_trace(go.Scatter(
            x=x_smooth, y=y_smooth, mode='lines', name=display_name,
            line=dict(color=color, width=2.5), opacity=0.85,
            showlegend=show_legend, legendgroup=method,
            hoverinfo='skip'
        ), row=row, col=col)
        # Markers at the original data points.
        fig.add_trace(go.Scatter(
            x=rho, y=y, mode='markers', name=display_name,
            marker=dict(color='white', size=8, line=dict(color=color, width=2)),
            showlegend=False, legendgroup=method,
            hovertemplate=f"<b>{display_name}</b><br>TPF: %{{x:.2f}}<br>Acc: %{{y:.1f}}<extra></extra>"
        ), row=row, col=col)

    # One subplot per task, methods drawn in df row order.
    for idx, task in enumerate(tasks):
        row, col = get_pos(idx)
        data = task_data.get(task, {})
        for method in top_methods:
            if method in data:
                draw_curve(data[method], method, row, col)

    # Average subplot at (2, 3).
    for method in top_methods:
        if method in avg_data:
            draw_curve(avg_data[method], method, 2, 3)

    fig.update_layout(
        height=550, margin=dict(l=60, r=250, t=80, b=60),
        title=dict(text=f"📈 Top {top_n} Methods: Accuracy-Parallelism Curves", x=0.5, font=dict(size=18)),
        legend=dict(font=dict(size=11), x=1.02, y=1, bgcolor='rgba(255,255,255,0.95)',
                    bordercolor='#ddd', borderwidth=1,
                    title=dict(text="Methods (sorted by Avg AUP)", font=dict(size=12)),
                    tracegroupgap=1, itemsizing='constant'),
        hoverlabel=dict(bgcolor="white", font_size=14, font_family="Arial")
    )

    # Axis titles: x only on the bottom row, y only on the first column.
    for idx in range(6):
        row, col = get_pos(idx)
        fig.update_xaxes(title_text="TPF (Tokens per Forward)" if idx >= 3 else "", row=row, col=col)
        fig.update_yaxes(title_text="Acc (%)" if col == 1 else "", row=row, col=col)
    return fig