Spaces:

d3LLM
/

dLLM_Leaderboard

Running

App Files Files Community

dLLM_Leaderboard / src /display /visualization.py

d3LLM-Data-LLaDA

Initial commit

d473371 4 days ago

raw

history blame contribute delete

10.9 kB

	import plotly.graph_objects as go
	from plotly.subplots import make_subplots
	import numpy as np

	# Palette of 30 distinct hex colors. get_model_colors() hands them out in
	# order of Avg AUP rank, so the first entry goes to the best-ranked method.
	COLOR_PALETTE = [
	    "#E91E63", "#4A90E2", "#00BFA5", "#FF6B35", "#8E24AA", "#4CAF50",
	    "#FF4081", "#303F9F", "#FFD166", "#00796B", "#C2185B", "#7B1FA2",
	    "#26A69A", "#1A4C7C", "#FF8C42", "#009688", "#673AB7", "#F44336",
	    "#3F51B5", "#795548", "#607D8B", "#9C27B0", "#2196F3", "#CDDC39",
	    "#FF9800", "#00BCD4", "#E64A19", "#5D4037", "#455A64", "#AD1457",
	]

	def get_model_colors(df):
	    """Map each method name to a palette color, best Avg AUP first.

	    Methods are ordered by the "Avg_AUP" column (highest first). The method
	    at position i receives COLOR_PALETTE[i], wrapping around when there are
	    more methods than palette entries.
	    """
	    ranked = df.sort_values("Avg_AUP", ascending=False)["Method"].tolist()
	    palette_size = len(COLOR_PALETTE)
	    colors = {}
	    for position, name in enumerate(ranked):
	        colors[name] = COLOR_PALETTE[position % palette_size]
	    return colors

	def get_model_ranks(df):
	    """Return a {method: rank} dict where rank 1 has the highest Avg AUP."""
	    ranked = df.sort_values("Avg_AUP", ascending=False)["Method"].tolist()
	    ranks = {}
	    for position, name in enumerate(ranked, start=1):
	        ranks[name] = position
	    return ranks

	def hex_to_rgba(hex_color, alpha=0.25):
	    """Convert a hex color string to a CSS-style ``rgba(r,g,b,a)`` string.

	    Args:
	        hex_color: Color as "#RRGGBB" or shorthand "#RGB"; leading "#"
	            characters are stripped, so they are optional.
	        alpha: Opacity, embedded verbatim as the fourth rgba component.

	    Returns:
	        A string such as "rgba(233,30,99,0.25)".

	    Raises:
	        ValueError: If a channel is missing or is not valid hexadecimal.
	    """
	    digits = hex_color.lstrip('#')
	    # Expand shorthand ("abc" -> "aabbcc"). Without this the blue channel
	    # slice is empty and int('', 16) raises ValueError.
	    if len(digits) == 3:
	        digits = ''.join(ch * 2 for ch in digits)
	    r, g, b = (int(digits[i:i + 2], 16) for i in (0, 2, 4))
	    return f'rgba({r},{g},{b},{alpha})'

	def create_radar_chart(df, tasks, top_n=15):
	    """Create a radar chart of AUP scores for the first ``top_n`` methods.

	    Every axis (one per task plus "Avg AUP") is rescaled independently to a
	    10-90 band using the min/max of the plotted rows, so only the shape is
	    comparable across axes. Hover text shows the original AUP values.

	    Args:
	        df: DataFrame with "Method", "Avg_AUP" and one "<task>_AUP" column
	            per task. Only ``df.head(top_n)`` is plotted, so the frame is
	            presumably already sorted by Avg AUP -- verify against caller.
	        tasks: Sequence of task names; "-" in a name becomes a line break
	            in the axis label.
	        top_n: Number of leading rows to plot (also shown in the title).

	    Returns:
	        A plotly ``go.Figure`` with one closed ``Scatterpolar`` trace per
	        plotted row.
	    """
	    df_top = df.head(top_n).copy()
	    model_colors = get_model_colors(df)
	    model_ranks = get_model_ranks(df)

	    all_cols = [f"{t}_AUP" for t in tasks] + ["Avg_AUP"]
	    categories = [t.replace("-", "\n") for t in tasks] + ["Avg\nAUP"]

	    # Per-column min/max over the plotted rows; used for display scaling only.
	    col_stats = {}
	    for col in all_cols:
	        vals = df_top[col].dropna().astype(float)
	        has_vals = len(vals) > 0
	        col_stats[col] = {'min': vals.min() if has_vals else 0,
	                          'max': vals.max() if has_vals else 100}

	    def _score_or_zero(value):
	        # NaN is truthy, so a plain `value or 0` lets missing scores through
	        # as NaN; `value != value` is True only for NaN.
	        if value is None or value != value:
	            return 0
	        return value

	    fig = go.Figure()

	    for _, row in df_top.iterrows():
	        method = row["Method"]
	        rank = model_ranks.get(method, 0)
	        color = model_colors.get(method, "#808080")
	        display_name = f"#{rank} {method}"

	        # Original AUP values for hover display (missing scores shown as 0).
	        original_vals = [_score_or_zero(row.get(col, 0)) for col in all_cols]

	        # Normalized values for the radar shape, one independent scale per axis.
	        normalized = []
	        for col, val in zip(all_cols, original_vals):
	            stats = col_stats[col]
	            range_val = stats['max'] - stats['min']
	            if range_val > 0:
	                norm = ((val - stats['min']) / range_val) * 80 + 10  # Scale to 10-90
	                # A missing score coerced to 0 can fall outside the column's
	                # min/max; keep the point inside the 0-100 radial axis.
	                norm = min(max(norm, 0), 100)
	            else:
	                # All plotted values are equal (or absent): draw mid-axis.
	                norm = 50
	            normalized.append(norm)

	        # Custom hover text showing original AUP scores.
	        hover_texts = [f"<b>{display_name}</b><br>{cat}: <b>{val:.1f}</b>"
	                       for cat, val in zip(categories, original_vals)]

	        # Repeat the first point at the end so the polygon outline closes.
	        fig.add_trace(go.Scatterpolar(
	            r=normalized + [normalized[0]],
	            theta=categories + [categories[0]],
	            mode='lines+markers', fill='toself', name=display_name,
	            line=dict(color=color, width=2), marker=dict(color=color, size=6),
	            fillcolor=hex_to_rgba(color, 0.15), opacity=0.9,
	            text=hover_texts + [hover_texts[0]],
	            hovertemplate='%{text}<extra></extra>'
	        ))

	    fig.update_layout(
	        height=600, margin=dict(l=100, r=250, t=80, b=60),
	        title=dict(text=f"🎯 Top {top_n} Methods: AUP Scores in Radar Chart", x=0.5, font=dict(size=18)),
	        # Radial ticks are hidden because each axis has its own scale.
	        polar=dict(radialaxis=dict(visible=True, range=[0, 100], tickfont=dict(size=11),
	                                   tickvals=[], showticklabels=False)),
	        legend=dict(font=dict(size=12), x=1.05, y=1, bgcolor='rgba(255,255,255,0.95)',
	                    bordercolor='#ddd', borderwidth=1, title=dict(text="Methods (sorted by Avg AUP)", font=dict(size=13))),
	        hoverlabel=dict(bgcolor="white", font_size=14, font_family="Arial", bordercolor="#333")
	    )
	    return fig

	def create_group_bar_chart(df, tasks, top_n=15):
	    """Create a grouped bar chart of per-task and average AUP scores.

	    One bar trace is added per plotted row, named "#<rank> <method>". A
	    benchmark whose score is None or NaN is left out of that method's
	    bars, and a method with no scores at all gets no trace.

	    Args:
	        df: DataFrame with "Method", "Avg_AUP" and "<task>_AUP" columns.
	            Only ``df.head(top_n)`` is plotted, so the frame is presumably
	            already sorted by Avg AUP -- verify against caller.
	        tasks: Sequence of task names, used as x-axis categories.
	        top_n: Number of leading rows to plot (also shown in the title).

	    Returns:
	        A plotly ``go.Figure`` in grouped bar mode.
	    """
	    df_top = df.head(top_n)
	    model_colors = get_model_colors(df)
	    model_ranks = get_model_ranks(df)

	    # (column to read, x-axis label) per benchmark. Built once because it
	    # is the same for every method.
	    benchmarks = [
	        ("Avg_AUP", "Avg AUP") if bench == "Avg_AUP" else (f"{bench}_AUP", bench)
	        for bench in list(tasks) + ["Avg_AUP"]
	    ]
	    fig = go.Figure()

	    # Iterate rows directly instead of re-filtering the frame by method
	    # name: one pass, and repeated method names keep their own rows.
	    for _, row in df_top.iterrows():
	        method = row["Method"]
	        color = model_colors.get(method, "#808080")
	        rank = model_ranks.get(method, 0)
	        display_name = f"#{rank} {method}"

	        x_vals, y_vals = [], []
	        for column, label in benchmarks:
	            aup = row.get(column)
	            if aup is None:
	                continue
	            # `aup != aup` is True only for NaN.
	            if isinstance(aup, (float, np.floating)) and aup != aup:
	                continue
	            x_vals.append(label)
	            y_vals.append(aup)

	        if y_vals:
	            fig.add_trace(go.Bar(
	                name=display_name, x=x_vals, y=y_vals, marker_color=color,
	                hovertemplate=f"<b>{display_name}</b><br>%{{x}}: %{{y:.1f}}<extra></extra>"
	            ))

	    fig.update_layout(
	        height=550, margin=dict(l=60, r=250, t=80, b=100),
	        title=dict(text=f"📊 Top {top_n} Methods: AUP Scores in Bar Chart", x=0.5, font=dict(size=18)),
	        xaxis_title="Benchmark", yaxis_title="AUP Score",
	        barmode='group', bargap=0.2, bargroupgap=0.05,
	        legend=dict(font=dict(size=11), x=1.02, y=1, bgcolor='rgba(255,255,255,0.95)',
	                    bordercolor='#ddd', borderwidth=1, title=dict(text="Methods (sorted by Avg AUP)", font=dict(size=12))),
	        hoverlabel=dict(bgcolor="white", font_size=14, font_family="Arial")
	    )
	    return fig

	def create_aup_curve_chart(raw_data, tasks, df, top_n=15):
	    """Create a 2x3 subplot grid of accuracy-vs-TPF curves with fitted lines.

	    Each task gets its own subplot and an "Average" panel is drawn at
	    row 2, column 3. For every plotted method the raw points are shown as
	    markers and a quadratic fit through them is drawn as a smooth line.

	    Args:
	        raw_data: Mapping ``{task: {method: pairs}}``. The first two items
	            of every pair are used as (x, y), labelled TPF and Acc.
	        tasks: Sequence of task names. The fixed 2x3 grid has room for at
	            most five tasks next to the Average panel.
	        df: DataFrame with "Method" and "Avg_AUP" columns. Only methods in
	            ``df.head(top_n)`` are drawn, so the frame is presumably
	            already sorted by Avg AUP -- verify against caller.
	        top_n: Number of leading rows to plot (also shown in the title).

	    Returns:
	        A plotly figure built with ``make_subplots``.
	    """
	    df_top = df.head(top_n)
	    top_methods = df_top["Method"].tolist()
	    model_colors = get_model_colors(df)
	    model_ranks = get_model_ranks(df)
	    methods_to_show = set(top_methods)
	    tasks = list(tasks)

	    # Build per-task data: {task: {method: [(rho, y), ...]}}
	    task_data = {t: {} for t in tasks}
	    for task in tasks:
	        for method, pairs in raw_data.get(task, {}).items():
	            if method in methods_to_show:
	                task_data[task][method] = [(p[0], p[1]) for p in pairs]

	    # Average TPF and Acc point-by-point (by index) over the tasks that
	    # actually have data for the method.
	    avg_data = {}
	    for method in methods_to_show:
	        curves = [task_data[t][method] for t in tasks if task_data[t].get(method)]
	        if not curves:
	            continue
	        # Use the shortest curve so a task with fewer points cannot cause
	        # an IndexError.
	        n_points = min(len(curve) for curve in curves)
	        # Both coordinates are averaged over the same set of tasks; a fixed
	        # divisor would skew Acc whenever a task is missing.
	        avg_data[method] = [
	            (float(np.mean([curve[i][0] for curve in curves])),
	             float(np.mean([curve[i][1] for curve in curves])))
	            for i in range(n_points)
	        ]

	    # 6 subplots: up to 5 tasks + 1 Average at (2,3)
	    titles = tasks + ["Average"]
	    fig = make_subplots(rows=2, cols=3, subplot_titles=titles,
	                        horizontal_spacing=0.08, vertical_spacing=0.15)

	    # Methods that already own a legend entry. Every other trace of the
	    # same method joins its legendgroup without a second entry.
	    legend_added = set()

	    def get_pos(idx):
	        """Map a 0-based panel index to a 1-based (row, col) in a 3-wide grid."""
	        grid_row, grid_col = divmod(idx, 3)
	        return (grid_row + 1, grid_col + 1)

	    def draw_curve(pairs, method, row, col):
	        """Add the fitted line and the raw-point markers for one method."""
	        if not pairs:
	            return
	        color = model_colors.get(method, "#808080")
	        rank = model_ranks.get(method, 0)
	        display_name = f"#{rank} {method}"
	        show_legend = method not in legend_added
	        if show_legend:
	            legend_added.add(method)

	        rho, y = zip(*sorted(pairs, key=lambda x: x[0]))
	        rho, y = np.array(rho), np.array(y)

	        # Generate smooth curve (quadratic fitting)
	        if len(rho) >= 3:
	            p = np.poly1d(np.polyfit(rho, y, 2))
	            x_smooth = np.linspace(rho.min(), rho.max(), 300)
	            y_smooth = p(x_smooth)
	        elif len(rho) == 2:
	            x_smooth = np.linspace(rho.min(), rho.max(), 300)
	            if rho[1] != rho[0]:
	                # Parabola with its vertex at the first point, passing
	                # through the second.
	                a = (y[1] - y[0]) / ((rho[1] - rho[0]) ** 2)
	                y_smooth = a * (x_smooth - rho[0]) ** 2 + y[0]
	            else:
	                y_smooth = np.linspace(y[0], y[1], 300)
	        else:
	            # A single point: nothing to fit.
	            x_smooth, y_smooth = rho, y

	        # Add fitted curve
	        fig.add_trace(go.Scatter(
	            x=x_smooth, y=y_smooth, mode='lines', name=display_name,
	            line=dict(color=color, width=2.5), opacity=0.85,
	            showlegend=show_legend, legendgroup=method,
	            hoverinfo='skip'
	        ), row=row, col=col)

	        # Add markers at original data points
	        fig.add_trace(go.Scatter(
	            x=rho, y=y, mode='markers', name=display_name,
	            marker=dict(color='white', size=8, line=dict(color=color, width=2)),
	            showlegend=False, legendgroup=method,
	            hovertemplate=f"<b>{display_name}</b><br>TPF: %{{x:.2f}}<br>Acc: %{{y:.1f}}<extra></extra>"
	        ), row=row, col=col)

	    # Draw the task subplots, methods in df order.
	    for idx, task in enumerate(tasks):
	        row, col = get_pos(idx)
	        data = task_data.get(task, {})
	        for method in top_methods:
	            if method in data:
	                draw_curve(data[method], method, row, col)

	    # Draw Average subplot at (2, 3)
	    for method in top_methods:
	        if method in avg_data:
	            draw_curve(avg_data[method], method, 2, 3)

	    fig.update_layout(
	        height=550, margin=dict(l=60, r=250, t=80, b=60),
	        title=dict(text=f"📈 Top {top_n} Methods: Accuracy-Parallelism Curves", x=0.5, font=dict(size=18)),
	        legend=dict(font=dict(size=11), x=1.02, y=1, bgcolor='rgba(255,255,255,0.95)',
	                    bordercolor='#ddd', borderwidth=1, title=dict(text="Methods (sorted by Avg AUP)", font=dict(size=12)),
	                    tracegroupgap=1, itemsizing='constant'),
	        hoverlabel=dict(bgcolor="white", font_size=14, font_family="Arial")
	    )

	    # Axis labels: x title on the bottom row only, y title on the first
	    # column only.
	    for idx in range(6):
	        row, col = get_pos(idx)
	        fig.update_xaxes(title_text="TPF (Tokens per Forward)" if idx >= 3 else "", row=row, col=col)
	        fig.update_yaxes(title_text="Acc (%)" if col == 1 else "", row=row, col=col)

	    return fig