Spaces:

atharv6f
/

flash-attention-explorer

Sleeping

flash-attention-explorer / src /benchmark.py

a0y0346

fix: Add fallback SDPA benchmark when attention layer fails

685194e about 1 month ago

37.3 kB

	"""
	Benchmark module for FlashAttention Explorer.
	GPU benchmark functions for comparing attention backends using real HuggingFace models.
	"""

	import torch
	import torch.nn.functional as F
	import numpy as np
	import plotly.graph_objects as go
	from plotly.subplots import make_subplots

	from .constants import GPU_SPECS, ATTENTION_BACKENDS, MODEL_CONFIGS, DEFAULT_GPU, DEFAULT_MODEL
	from .models import load_model, clear_model_cache
	from .attention_utils import (
	extract_attention_layer,
	create_attention_inputs,
	benchmark_attention_layer,
	get_model_attention_info,
	)


	def detect_gpu() -> dict:
	    """
	    Detect the active CUDA GPU (device 0) and return its specs.

	    The lower-cased device name is matched against a list of known GPUs, in
	    order, so more specific substrings must come first (e.g. "a100" before
	    "a10", "l40" before "l4"). Unknown GPUs get estimated specs derived from
	    the SM count, compute capability and memory size.

	    Returns:
	        Dict with at least "name", "detected", "tflops_fp16", "bandwidth_gbps",
	        "memory_gb" and "sram_kb" for inline/estimated entries; entries backed
	        by GPU_SPECS carry whatever keys that table defines. Detected GPUs also
	        have "detected_name" (raw device name). Estimated entries additionally
	        have "estimated": True and "compute_capability".
	    """
	    if not torch.cuda.is_available():
	        return {"name": "CPU (No GPU)", "detected": False, **GPU_SPECS[DEFAULT_GPU]}

	    gpu_name_raw = torch.cuda.get_device_name(0)
	    gpu_name = gpu_name_raw.lower()

	    # Query device properties once: they provide the memory size and, for
	    # unknown GPUs, the SM count used by the estimation path.
	    try:
	        props = torch.cuda.get_device_properties(0)
	        mem_gb = props.total_memory / (1024**3)
	    except Exception:
	        props = None
	        mem_gb = 24  # fallback

	    base = {"detected": True, "detected_name": gpu_name_raw}

	    # Known GPUs, checked in order (newest first, specific before generic).
	    # Each row: (name substring, GPU_SPECS key or None,
	    #            inline (name, tflops_fp16, bandwidth_gbps, sram_kb) or None)
	    known_gpus = (
	        # H200: same compute as H100, HBM3e at 4.8 TB/s
	        ("h200", None, ("NVIDIA H200", 989, 4800, 256)),
	        ("h100", "H100", None),
	        ("a100", "A100_80GB", None),
	        ("a10", "A10G", None),
	        ("l40", None, ("NVIDIA L40S", 362, 864, 192)),
	        ("l4", None, ("NVIDIA L4", 121, 300, 96)),
	        ("t4", None, ("NVIDIA T4", 65, 320, 64)),
	        ("v100", None, ("NVIDIA V100", 125, 900, 128)),
	        ("4090", None, ("NVIDIA RTX 4090", 330, 1008, 128)),
	    )

	    for needle, catalog_key, inline_spec in known_gpus:
	        if needle not in gpu_name:
	            continue
	        if catalog_key is not None:
	            return {**base, **GPU_SPECS[catalog_key]}
	        display_name, tflops_fp16, bandwidth_gbps, sram_kb = inline_spec
	        return {
	            **base,
	            "name": display_name,
	            "tflops_fp16": tflops_fp16,
	            "bandwidth_gbps": bandwidth_gbps,
	            "memory_gb": round(mem_gb),
	            "sram_kb": sram_kb,
	        }

	    # Unknown GPU - estimate specs using compute capability and SM count.
	    # These are the best indicators of performance we can query.
	    compute_capability = "unknown"
	    try:
	        if props is None:
	            raise RuntimeError("device properties unavailable")
	        sm_count = props.multi_processor_count
	        major, minor = torch.cuda.get_device_capability(0)
	        compute_capability = f"{major}.{minor}"

	        # Rough per-architecture constants:
	        # FP16 ops per SM per cycle, typical clock (GHz), and memory
	        # bandwidth per GB of capacity (GB/s per GB).
	        if major >= 9:  # Hopper and newer
	            flops_per_sm, clock_ghz, bw_per_gb_mem = 512, 1.8, 50
	        elif major >= 8:  # Ampere (Ada reports 8.9 and also lands here)
	            flops_per_sm, clock_ghz, bw_per_gb_mem = 256, 1.5, 25
	        elif major >= 7:  # Volta/Turing
	            flops_per_sm, clock_ghz, bw_per_gb_mem = 128, 1.4, 28
	        else:  # Older
	            flops_per_sm, clock_ghz, bw_per_gb_mem = 64, 1.2, 20

	        # Estimate TFLOPS: SMs x FLOPs/SM/cycle x clock (GHz) x 2 (FMA), G -> T
	        est_tflops = (sm_count * flops_per_sm * clock_ghz * 2) / 1000
	        est_bw = mem_gb * bw_per_gb_mem
	    except Exception:
	        # Fallback if the properties/capability query fails
	        est_tflops = 125
	        est_bw = 600

	    return {
	        **base,
	        "name": gpu_name_raw,
	        "tflops_fp16": round(est_tflops),
	        "bandwidth_gbps": round(est_bw),
	        "memory_gb": round(mem_gb),
	        "sram_kb": 128,
	        "estimated": True,  # Flag that these are estimated from compute capability
	        "compute_capability": compute_capability,
	    }


	# (result key, enable_math, enable_flash, enable_mem_efficient)
	_SDPA_BACKEND_FLAGS = (
	    ("math", True, False, False),
	    ("flash", False, True, False),
	    ("mem_efficient", False, False, True),
	)


	def _time_sdpa_backends(Q, K, V, num_iterations: int, warmup_iterations: int) -> dict:
	    """Time F.scaled_dot_product_attention on Q/K/V once per SDPA backend."""
	    timings = {}
	    for backend_name, enable_math, enable_flash, enable_mem_efficient in _SDPA_BACKEND_FLAGS:
	        try:
	            torch.cuda.reset_peak_memory_stats()
	            torch.cuda.synchronize()

	            # NOTE(review): newer PyTorch releases deprecate sdp_kernel in favour
	            # of torch.nn.attention.sdpa_kernel; kept as-is for compatibility.
	            with torch.backends.cuda.sdp_kernel(
	                enable_flash=enable_flash,
	                enable_math=enable_math,
	                enable_mem_efficient=enable_mem_efficient,
	            ):
	                # Warmup
	                for _ in range(warmup_iterations):
	                    _ = F.scaled_dot_product_attention(Q, K, V)
	                torch.cuda.synchronize()

	                # Timed runs, bracketed by CUDA events
	                start = torch.cuda.Event(enable_timing=True)
	                end = torch.cuda.Event(enable_timing=True)

	                start.record()
	                for _ in range(num_iterations):
	                    _ = F.scaled_dot_product_attention(Q, K, V)
	                end.record()
	                torch.cuda.synchronize()

	                time_ms = start.elapsed_time(end) / num_iterations
	                memory_mb = torch.cuda.max_memory_allocated() / 1e6

	            timings[backend_name] = {
	                "time_ms": round(time_ms, 3),
	                "memory_mb": round(memory_mb, 1),
	                "status": "success",
	            }
	        except Exception as e:
	            # A backend that cannot run on this GPU/input is reported, not fatal
	            timings[backend_name] = {
	                "time_ms": None,
	                "memory_mb": None,
	                "status": f"error: {str(e)[:50]}",
	            }
	    return timings


	def _benchmark_random_qkv(
	    batch_size, num_heads, seq_len, head_dim, device, dtype,
	    num_iterations, warmup_iterations,
	) -> dict:
	    """Benchmark all SDPA backends on random Q/K/V tensors, always freeing them."""
	    Q = K = V = None
	    try:
	        shape = (batch_size, num_heads, seq_len, head_dim)
	        Q = torch.randn(*shape, device=device, dtype=dtype)
	        K = torch.randn(*shape, device=device, dtype=dtype)
	        V = torch.randn(*shape, device=device, dtype=dtype)
	        return _time_sdpa_backends(Q, K, V, num_iterations, warmup_iterations)
	    finally:
	        del Q, K, V
	        torch.cuda.empty_cache()


	def _attach_speedups(results: dict) -> None:
	    """Add a "speedup" (math time / backend time) to every backend with a timing."""
	    base_time = results.get("math", {}).get("time_ms")
	    if not base_time:
	        return
	    for backend, *_ in _SDPA_BACKEND_FLAGS:
	        entry = results.get(backend, {})
	        if entry.get("time_ms"):
	            entry["speedup"] = round(base_time / entry["time_ms"], 2)


	def run_attention_benchmark(
	    model_name: str = None,
	    seq_len: int = 1024,
	    batch_size: int = 1,
	    num_iterations: int = 10,
	    warmup_iterations: int = 3,
	    # Legacy parameters (used if model_name is None)
	    num_heads: int = 16,
	    head_dim: int = 64,
	) -> dict:
	    """
	    Benchmark three SDPA backends (math, flash, mem_efficient).

	    Modes:
	    - Real model: when model_name is a key of MODEL_CONFIGS, the model is
	      loaded and its first attention layer is benchmarked. If that layer
	      cannot be run, the benchmark falls back to raw SDPA on random tensors
	      shaped with the model's head count and head dim ("fallback_mode").
	    - Legacy: when model_name is None (or not in MODEL_CONFIGS), raw SDPA is
	      run on random tensors shaped by num_heads / head_dim.

	    Args:
	        model_name: Name of the model from MODEL_CONFIGS (e.g., "SmolLM2-360M")
	        seq_len: Sequence length (number of tokens)
	        batch_size: Batch size
	        num_iterations: Number of timed iterations
	        warmup_iterations: Number of warmup iterations
	        num_heads: (Legacy) Number of attention heads
	        head_dim: (Legacy) Dimension per head

	    Returns:
	        Dict with per-backend results under "math", "flash" and
	        "mem_efficient" plus mode flags, or {"error": ...} when CUDA is
	        unavailable or the real-model path raises.
	    """
	    if not torch.cuda.is_available():
	        return {"error": "CUDA not available"}

	    device = torch.device("cuda")
	    dtype = torch.float16

	    # If model_name is provided, use the real model for benchmarking
	    if model_name is not None and model_name in MODEL_CONFIGS:
	        try:
	            model = load_model(model_name)
	            attn_info = get_model_attention_info(model)

	            # Dimensions of the real model, used by the SDPA fallback
	            model_num_heads = attn_info["num_attention_heads"]
	            model_head_dim = attn_info["head_dim"]

	            results = {"model_name": model_name, "using_real_model": True}
	            results["model_info"] = attn_info

	            # First try: probe the actual attention layer with a short run
	            attention_layer = None
	            attention_layer_works = False
	            try:
	                attention_layer = extract_attention_layer(model, layer_idx=0)
	                hidden_states, position_ids = create_attention_inputs(
	                    model, batch_size, seq_len, device, dtype
	                )

	                test_result = benchmark_attention_layer(
	                    attention_layer=attention_layer,
	                    hidden_states=hidden_states,
	                    position_ids=position_ids,
	                    backend="flash",
	                    num_iterations=2,
	                    warmup_iterations=1,
	                )

	                if test_result.get("time_ms") is not None:
	                    attention_layer_works = True

	                del hidden_states, position_ids
	                torch.cuda.empty_cache()

	            except Exception as layer_error:
	                print(f"[run_attention_benchmark] Attention layer extraction failed: {layer_error}")
	                attention_layer_works = False

	            if attention_layer_works:
	                # Use the actual attention layer for every backend
	                hidden_states, position_ids = create_attention_inputs(
	                    model, batch_size, seq_len, device, dtype
	                )
	                try:
	                    for backend, *_ in _SDPA_BACKEND_FLAGS:
	                        results[backend] = benchmark_attention_layer(
	                            attention_layer=attention_layer,
	                            hidden_states=hidden_states,
	                            position_ids=position_ids,
	                            backend=backend,
	                            num_iterations=num_iterations,
	                            warmup_iterations=warmup_iterations,
	                        )
	                finally:
	                    # Release inputs even if one of the runs raises
	                    del hidden_states, position_ids
	                    torch.cuda.empty_cache()
	            else:
	                # Fallback: raw SDPA on random tensors with real model dimensions
	                print("[run_attention_benchmark] Falling back to SDPA with model dimensions")
	                results["fallback_mode"] = True
	                results.update(
	                    _benchmark_random_qkv(
	                        batch_size, model_num_heads, seq_len, model_head_dim,
	                        device, dtype, num_iterations, warmup_iterations,
	                    )
	                )

	            _attach_speedups(results)
	            return results

	        except Exception as e:
	            return {"error": f"Failed to load model: {str(e)[:100]}"}

	    # Legacy mode: raw SDPA with random tensors
	    results = {"using_real_model": False}
	    results.update(
	        _benchmark_random_qkv(
	            batch_size, num_heads, seq_len, head_dim,
	            device, dtype, num_iterations, warmup_iterations,
	        )
	    )
	    _attach_speedups(results)
	    return results


	def run_scaling_benchmark(
	    model_name: str = None,
	    seq_lengths: list = None,
	    batch_size: int = 1,
	    # Legacy parameters (used if model_name is None)
	    num_heads: int = 16,
	    head_dim: int = 64,
	) -> dict:
	    """
	    Benchmark attention backends across multiple sequence lengths.

	    Calls run_attention_benchmark once per sequence length (5 timed and 2
	    warmup iterations) and collects the per-backend results into parallel
	    lists aligned with seq_lengths.

	    Args:
	        model_name: Name of the model from MODEL_CONFIGS (e.g., "SmolLM2-360M")
	        seq_lengths: Sequence lengths to test (default: 512, 1024, 2048, 4096)
	        batch_size: Batch size
	        num_heads: (Legacy) Number of attention heads if model_name is None
	        head_dim: (Legacy) Dimension per head if model_name is None

	    Returns:
	        Dict with "seq_lengths", "model_name" and, per backend, "time_ms" and
	        "memory_mb" lists (None where a run produced no timing). If any
	        per-length run returned an error, an "errors" dict mapping sequence
	        length to the error message is included as well. Returns
	        {"error": ...} when CUDA is unavailable.
	    """
	    if seq_lengths is None:
	        seq_lengths = [512, 1024, 2048, 4096]

	    if not torch.cuda.is_available():
	        return {"error": "CUDA not available"}

	    backends = ("math", "flash", "mem_efficient")
	    results = {
	        "seq_lengths": seq_lengths,
	        "model_name": model_name,
	        **{backend: {"time_ms": [], "memory_mb": []} for backend in backends},
	    }
	    errors = {}

	    for seq_len in seq_lengths:
	        bench_result = run_attention_benchmark(
	            model_name=model_name,
	            seq_len=seq_len,
	            batch_size=batch_size,
	            num_iterations=5,  # Fewer iterations for scaling test
	            warmup_iterations=2,
	            # Legacy params (ignored if model_name is set)
	            num_heads=num_heads,
	            head_dim=head_dim,
	        )

	        # Keep whole-run failures visible instead of silently charting gaps
	        if "error" in bench_result:
	            errors[seq_len] = bench_result["error"]

	        for backend in backends:
	            entry = bench_result.get(backend, {})
	            has_timing = bool(entry.get("time_ms"))
	            results[backend]["time_ms"].append(entry["time_ms"] if has_timing else None)
	            results[backend]["memory_mb"].append(entry["memory_mb"] if has_timing else None)

	    if errors:
	        results["errors"] = errors

	    return results


	def create_benchmark_results_table(results: dict) -> str:
	    """
	    Create a Markdown table from benchmark results.

	    Args:
	        results: Dict as returned by run_attention_benchmark.

	    Returns:
	        "Error: <message>" if results carries an "error" key; otherwise a
	        Markdown table with one row per backend present in results. Missing
	        values (time, memory, speedup) are rendered as "N/A".
	    """
	    if "error" in results:
	        return f"Error: {results['error']}"

	    # Plain pipes: escaping them would put literal backslashes in the output
	    rows = [
	        "| Backend | Time (ms) | Memory (MB) | Speedup |",
	        "|---------|-----------|-------------|---------|",
	    ]

	    for backend in ("math", "flash", "mem_efficient"):
	        if backend not in results:
	            continue
	        entry = results[backend]
	        label = ATTENTION_BACKENDS.get(backend, backend)

	        time_ms = entry.get("time_ms")
	        memory_mb = entry.get("memory_mb")
	        speedup = entry.get("speedup")

	        time_str = f"{time_ms:.2f}" if time_ms is not None else "N/A"
	        mem_str = f"{memory_mb:.0f}" if memory_mb is not None else "N/A"
	        # A backend without a computed speedup has no meaningful ratio to show
	        speedup_str = f"{speedup:.1f}×" if speedup is not None else "N/A"

	        rows.append(f"| {label} | {time_str} | {mem_str} | {speedup_str} |")

	    return "\n".join(rows)


	def create_benchmark_insight(results: dict) -> str:
	    """
	    Create insight text comparing the flash backend against the math backend.

	    Args:
	        results: Dict as returned by run_attention_benchmark.

	    Returns:
	        "" if results carries an "error" key; a short note if either backend
	        has no timing; otherwise a multi-line message with the measured
	        speedup and memory reduction. When memory figures are missing or the
	        flash figure is not positive, the memory reduction is reported as 1.
	    """
	    if "error" in results:
	        return ""

	    flash_stats = results.get("flash", {})
	    math_stats = results.get("math", {})

	    if not flash_stats.get("time_ms") or not math_stats.get("time_ms"):
	        return "Note: Some backends may not be available on this GPU."

	    speedup = math_stats["time_ms"] / flash_stats["time_ms"]

	    # Memory figures may be None/missing even when a timing exists
	    flash_mem = flash_stats.get("memory_mb")
	    math_mem = math_stats.get("memory_mb")
	    if flash_mem is not None and math_mem is not None and flash_mem > 0:
	        mem_reduction = math_mem / flash_mem
	    else:
	        mem_reduction = 1

	    return "\n".join([
	        "Key Insight:",
	        f"FlashAttention is {speedup:.1f}× faster and uses {mem_reduction:.1f}× less memory!",
	        "",
	        "This improvement comes from:",
	        "- Tiling attention into SRAM-sized blocks",
	        "- Never materializing the full N×N attention matrix in HBM",
	        "- Fused kernel avoiding multiple HBM round-trips",
	    ])


	def create_scaling_chart(results: dict) -> go.Figure:
	    """
	    Build a two-panel chart of execution time and peak memory vs sequence length.

	    Args:
	        results: Dict as returned by run_scaling_benchmark.

	    Returns:
	        A figure with one line per backend in each panel (None entries are
	        skipped), or a figure holding only a red error annotation when
	        results carries an "error" key.
	    """
	    if "error" in results:
	        error_fig = go.Figure()
	        error_fig.add_annotation(
	            x=0.5, y=0.5,
	            text=f"Error: {results['error']}",
	            showarrow=False,
	            font=dict(size=16, color="red")
	        )
	        return error_fig

	    lengths = results["seq_lengths"]

	    fig = make_subplots(
	        rows=1, cols=2,
	        subplot_titles=("Execution Time", "Peak Memory"),
	        horizontal_spacing=0.12,
	    )

	    palette = {
	        "math": "rgba(239, 68, 68, 0.8)",  # Red
	        "flash": "rgba(34, 197, 94, 0.8)",  # Green
	        "mem_efficient": "rgba(59, 130, 246, 0.8)",  # Blue
	    }

	    # (result key, subplot column, extra Scatter kwargs).
	    # Only the time panel contributes legend entries; the memory panel's
	    # traces share the legend group and stay hidden from the legend.
	    panels = (
	        ("time_ms", 1, {}),
	        ("memory_mb", 2, {"showlegend": False}),
	    )

	    for metric, column, extra in panels:
	        for backend in ("math", "flash", "mem_efficient"):
	            series = results[backend][metric]
	            label = ATTENTION_BACKENDS.get(backend, backend)

	            # Drop sequence lengths for which this backend has no value
	            kept = [(n, value) for n, value in zip(lengths, series) if value is not None]
	            if not kept:
	                continue

	            xs, ys = zip(*kept)
	            fig.add_trace(
	                go.Scatter(
	                    x=list(xs),
	                    y=list(ys),
	                    mode="lines+markers",
	                    name=label,
	                    line=dict(color=palette[backend], width=2),
	                    marker=dict(size=8),
	                    legendgroup=backend,
	                    **extra,
	                ),
	                row=1, col=column
	            )

	    for column in (1, 2):
	        fig.update_xaxes(title_text="Sequence Length", row=1, col=column)
	    fig.update_yaxes(title_text="Time (ms)", row=1, col=1)
	    fig.update_yaxes(title_text="Memory (MB)", row=1, col=2)

	    fig.update_layout(
	        height=350,
	        margin=dict(l=50, r=50, t=50, b=50),
	        legend=dict(
	            orientation="h",
	            yanchor="bottom",
	            y=-0.3,
	            xanchor="center",
	            x=0.5
	        ),
	    )

	    return fig


	def calculate_attention_flops(seq_len: int, num_heads: int, head_dim: int, batch_size: int = 1) -> float:
	    """
	    Count the FLOPs of one scaled dot-product attention call.

	    With E = batch * heads * seq * seq (entries of the score tensor):
	    - Q @ K^T: 2 * E * head_dim
	    - Softmax: 5 * E (approximate: exp, sum, div)
	    - P @ V:   2 * E * head_dim

	    Returns:
	        The sum of the three terms above.
	    """
	    score_entries = batch_size * num_heads * seq_len * seq_len
	    # Both matmuls (Q @ K^T and P @ V) cost the same
	    matmul_flops = 2 * score_entries * head_dim
	    return matmul_flops + 5 * score_entries + matmul_flops


	def calculate_memory_traffic(
	    seq_len: int,
	    num_heads: int,
	    head_dim: int,
	    batch_size: int = 1,
	    is_flash: bool = False,
	    dtype_bytes: int = 2,  # FP16
	) -> float:
	    """
	    Estimate the memory traffic of attention, in bytes.

	    Both variants read Q, K and V and write O, i.e. four tensors of
	    batch * heads * seq * head_dim elements.

	    FlashAttention (is_flash=True): nothing else is counted, since the
	    seq x seq score matrix is not charged as memory traffic.

	    Standard attention (is_flash=False): additionally charges three passes
	    over the batch * heads * seq * seq score matrix.

	    NOTE(review): a step-by-step count (write S, read S, write P, read P)
	    would give four passes over the score matrix; this model charges
	    three. Confirm which is intended before relying on absolute values.
	    """
	    per_tensor_bytes = batch_size * num_heads * seq_len * head_dim * dtype_bytes
	    # Q, K, V reads plus the O write
	    streamed_bytes = 3 * per_tensor_bytes + per_tensor_bytes

	    if is_flash:
	        return streamed_bytes

	    score_matrix_bytes = batch_size * num_heads * seq_len * seq_len * dtype_bytes
	    return streamed_bytes + 3 * score_matrix_bytes


	def calculate_roofline_metrics(
	    results: dict,
	    seq_len: int,
	    num_heads: int,
	    head_dim: int,
	    batch_size: int = 1,
	) -> dict:
	    """
	    Calculate arithmetic intensity and achieved TFLOPS from benchmark results.

	    Args:
	        results: Dict as returned by run_attention_benchmark.
	        seq_len, num_heads, head_dim, batch_size: Problem size used to
	            compute the FLOP count and the modelled memory traffic.

	    Returns:
	        Dict keyed by backend with "flops", "memory_bytes", "time_ms",
	        "achieved_tflops" and "arith_intensity". Backends that are absent,
	        have no positive timing, or have no modelled memory traffic are
	        omitted.
	    """
	    flops = calculate_attention_flops(seq_len, num_heads, head_dim, batch_size)

	    metrics = {}

	    for backend in ("math", "flash", "mem_efficient"):
	        entry = results.get(backend)
	        if not isinstance(entry, dict):
	            continue

	        time_ms = entry.get("time_ms")
	        # A missing or non-positive time cannot be turned into a throughput
	        if time_ms is None or time_ms <= 0:
	            continue

	        # Memory traffic (approximation): flash and mem_efficient are both
	        # modelled as not writing the attention matrix to memory
	        is_flash = backend in ("flash", "mem_efficient")
	        memory_bytes = calculate_memory_traffic(
	            seq_len, num_heads, head_dim, batch_size, is_flash=is_flash
	        )
	        if memory_bytes <= 0:
	            # Degenerate problem size; intensity would be undefined
	            continue

	        time_s = time_ms / 1000.0

	        metrics[backend] = {
	            "flops": flops,
	            "memory_bytes": memory_bytes,
	            "time_ms": time_ms,
	            "achieved_tflops": (flops / time_s) / 1e12,
	            # Arithmetic intensity = FLOPs / bytes
	            "arith_intensity": flops / memory_bytes,
	        }

	    return metrics


	def create_roofline_chart(
	    results: dict,
	    gpu_specs: dict = None,
	    benchmark_metrics: dict = None,
	) -> go.Figure:
	    """
	    Create a roofline chart showing where different attention implementations fall.

	    The roofline model shows:
	    - X-axis: Arithmetic intensity (FLOPs per byte of memory traffic), log scale
	    - Y-axis: Performance (TFLOPS)
	    - The roofline is min(peak_compute, bandwidth * intensity)

	    The x-axis covers 1..1000 FLOPs/byte by default and is widened only when
	    the ridge point or a measured point would otherwise fall outside it.

	    Args:
	        results: Benchmark results dict (can be empty; not read by this function)
	        gpu_specs: GPU specifications dict (from detect_gpu() or GPU_SPECS);
	            defaults to GPU_SPECS[DEFAULT_GPU]
	        benchmark_metrics: Roofline metrics from calculate_roofline_metrics()

	    If benchmark_metrics is provided and non-empty, plots MEASURED values.
	    Otherwise, plots theoretical approximations.
	    """
	    gpu = GPU_SPECS[DEFAULT_GPU] if gpu_specs is None else gpu_specs

	    peak_tflops = gpu["tflops_fp16"]
	    bandwidth_gbps = gpu["bandwidth_gbps"]

	    # Ridge point: where memory-bound meets compute-bound
	    ridge_point = (peak_tflops * 1e12) / (bandwidth_gbps * 1e9)

	    # Determine if we have measured data or should use theoretical
	    use_measured = benchmark_metrics is not None and len(benchmark_metrics) > 0

	    # (metrics key, label, color, annotation ax, annotation ay)
	    measured_styles = (
	        ("math", "Math", "#dc2626", 0, -40),
	        ("flash", "Flash", "#16a34a", 0, -40),
	        ("mem_efficient", "MemEff", "#2563eb", 30, -30),  # offset to avoid overlap
	    )

	    # Pick the x-axis extent (as log10 exponents). Start from the default
	    # decade range and widen it only if something would be cut off.
	    intensities = [ridge_point]
	    if use_measured:
	        intensities.extend(
	            benchmark_metrics[key]["arith_intensity"]
	            for key, *_ in measured_styles
	            if key in benchmark_metrics
	        )
	    positive = [value for value in intensities if value > 0]
	    x_min_exp, x_max_exp = 0.0, 3.0
	    if positive and min(positive) < 1:
	        x_min_exp = float(np.floor(np.log10(min(positive) / 1.5)))
	    if positive and max(positive) > 1000:
	        x_max_exp = float(np.ceil(np.log10(max(positive) * 1.5)))
	    x_lo = 10.0 ** x_min_exp
	    x_hi = 10.0 ** x_max_exp

	    fig = go.Figure()

	    # Roofline curve
	    x_range = np.logspace(x_min_exp, x_max_exp, 100)
	    y_roofline = np.minimum(
	        peak_tflops,
	        bandwidth_gbps * x_range / 1000
	    )

	    fig.add_trace(go.Scatter(
	        x=x_range,
	        y=y_roofline,
	        mode="lines",
	        name="Roofline",
	        line=dict(color="rgba(0, 0, 0, 0.6)", width=2),
	    ))

	    # Memory-bound region (dashed)
	    fig.add_trace(go.Scatter(
	        x=[x_lo, ridge_point],
	        y=[bandwidth_gbps * x_lo / 1000, peak_tflops],
	        mode="lines",
	        name="Memory Bound",
	        line=dict(color="rgba(239, 68, 68, 0.5)", width=3, dash="dash"),
	    ))

	    # Compute-bound region (dashed)
	    fig.add_trace(go.Scatter(
	        x=[ridge_point, x_hi],
	        y=[peak_tflops, peak_tflops],
	        mode="lines",
	        name="Compute Bound",
	        line=dict(color="rgba(34, 197, 94, 0.5)", width=3, dash="dash"),
	    ))

	    if use_measured:
	        # Plot MEASURED data points
	        title_suffix = " (Measured)"

	        for key, label, color, offset_x, offset_y in measured_styles:
	            if key not in benchmark_metrics:
	                continue
	            m = benchmark_metrics[key]

	            fig.add_trace(go.Scatter(
	                x=[m["arith_intensity"]],
	                y=[m["achieved_tflops"]],
	                mode="markers",
	                name=f"{label} ({m['achieved_tflops']:.1f} TFLOPS, {m['time_ms']:.1f}ms)",
	                marker=dict(size=16, color=color, symbol="circle",
	                            line=dict(color="white", width=2)),
	            ))
	            # Label as annotation for better visibility; on a log axis the
	            # annotation x is given as log10 of the data value
	            fig.add_annotation(
	                x=np.log10(m["arith_intensity"]),
	                y=m["achieved_tflops"],
	                text=f"<b>{label}</b><br>{m['time_ms']:.1f}ms",
	                showarrow=True,
	                arrowhead=2,
	                arrowsize=1,
	                arrowwidth=1,
	                arrowcolor=color,
	                ax=offset_x,
	                ay=offset_y,
	                font=dict(size=10, color=color),
	                bgcolor="rgba(255, 255, 255, 0.95)",
	                bordercolor=color,
	                borderwidth=1,
	                borderpad=3,
	            )
	    else:
	        # Plot THEORETICAL approximations
	        title_suffix = " (Theoretical)"

	        # (trace name, annotation text, intensity, fraction of peak,
	        #  marker color, font color, border color)
	        theoretical_points = (
	            # Standard attention - memory bound
	            ("Standard (Theoretical)", "<b>Standard</b><br>(theoretical)", 10, 0.15,
	             "rgba(220, 38, 38, 0.6)", "#dc2626", "rgba(220, 38, 38, 0.5)"),
	            # FlashAttention - compute bound
	            ("Flash (Theoretical)", "<b>FlashAttention</b><br>(theoretical)", 200, 0.7,
	             "rgba(22, 163, 74, 0.6)", "#16a34a", "rgba(22, 163, 74, 0.5)"),
	        )

	        for trace_name, label_text, intensity, peak_fraction, marker_color, font_color, border_color in theoretical_points:
	            achieved = min(peak_tflops * peak_fraction, bandwidth_gbps * intensity / 1000)

	            fig.add_trace(go.Scatter(
	                x=[intensity],
	                y=[achieved],
	                mode="markers",
	                name=trace_name,
	                marker=dict(size=15, color=marker_color, symbol="circle-open",
	                            line=dict(width=2)),
	            ))
	            fig.add_annotation(
	                x=np.log10(intensity),
	                y=achieved,
	                text=label_text,
	                showarrow=True,
	                arrowhead=2,
	                ax=0,
	                ay=-35,
	                font=dict(size=10, color=font_color),
	                bgcolor="rgba(255, 255, 255, 0.9)",
	                bordercolor=border_color,
	                borderwidth=1,
	                borderpad=3,
	            )

	    # Add ridge point marker
	    fig.add_trace(go.Scatter(
	        x=[ridge_point],
	        y=[peak_tflops],
	        mode="markers",
	        name=f"Ridge Point ({ridge_point:.0f} FLOPs/byte)",
	        marker=dict(size=10, color="rgba(0, 0, 0, 0.6)", symbol="diamond"),
	    ))

	    # Region labels with a white background for visibility
	    fig.add_annotation(
	        x=np.log10(5),
	        y=peak_tflops * 0.1,
	        text="<b>Memory Bound</b><br>(limited by bandwidth)",
	        showarrow=False,
	        font=dict(size=11, color="#dc2626"),  # Solid red
	        bgcolor="rgba(255, 255, 255, 0.9)",
	        bordercolor="#dc2626",
	        borderwidth=1,
	        borderpad=4,
	    )

	    fig.add_annotation(
	        x=np.log10(300),
	        y=peak_tflops * 0.65,
	        text="<b>Compute Bound</b><br>(limited by TFLOPS)",
	        showarrow=False,
	        font=dict(size=11, color="#16a34a"),  # Solid green
	        bgcolor="rgba(255, 255, 255, 0.9)",
	        bordercolor="#16a34a",
	        borderwidth=1,
	        borderpad=4,
	    )

	    # Use detected_name if available, otherwise use name
	    display_name = gpu.get("detected_name", gpu.get("name", "GPU"))

	    # Add estimated indicator if specs were estimated
	    estimated_note = " (estimated specs)" if gpu.get("estimated") else ""

	    fig.update_layout(
	        title=dict(
	            text=f"Roofline Model: {display_name}{title_suffix}{estimated_note}<br>"
	                 f"<span style='font-size:12px;color:#666'>"
	                 f"Peak: {peak_tflops} TFLOPS | Bandwidth: {bandwidth_gbps} GB/s</span>",
	            x=0.5,
	            font=dict(size=14),
	        ),
	        xaxis=dict(
	            title="Arithmetic Intensity (FLOPs/byte)",
	            type="log",
	            range=[x_min_exp, x_max_exp],  # log10 exponents
	        ),
	        yaxis=dict(
	            title="Performance (TFLOPS)",
	            range=[0, peak_tflops * 1.2],  # More headroom for text
	        ),
	        height=420,
	        margin=dict(l=60, r=40, t=80, b=80),  # More room for title and legend
	        legend=dict(
	            orientation="h",
	            yanchor="bottom",
	            y=-0.30,
	            xanchor="center",
	            x=0.5,
	            font=dict(size=10),
	        ),
	        showlegend=True,
	    )

	    return fig


	def get_roofline_insight(benchmark_metrics: dict = None) -> str:
	    """
	    Return insight text for the roofline chart.

	    Args:
	        benchmark_metrics: Roofline metrics from calculate_roofline_metrics().

	    Returns:
	        The general explanation, followed either by a measured-results
	        section (when usable "math" and "flash" metrics are present) or by
	        a prompt to run a benchmark.
	    """
	    base_insight = "\n".join([
	        "Why FlashAttention is Faster:",
	        "",
	        "The roofline model reveals the key insight:",
	        "",
	        "1. Standard Attention sits in the memory-bound region (left of ridge point)",
	        "- Limited by HBM bandwidth, not compute",
	        "- Reading/writing the N×N attention matrix dominates runtime",
	        "",
	        "2. FlashAttention moves to the compute-bound region (right of ridge point)",
	        "- By never materializing the full attention matrix",
	        "- Arithmetic intensity increases ~20-50×",
	        "- Can now utilize most of the GPU's TFLOPS",
	        "",
	        "The same FLOPs, but 10× less memory traffic = faster execution!",
	    ])

	    math_m = benchmark_metrics.get("math") if benchmark_metrics else None
	    flash_m = benchmark_metrics.get("flash") if benchmark_metrics else None

	    # Both ratios below divide by these values, so they must be positive
	    ratios_defined = (
	        bool(math_m)
	        and bool(flash_m)
	        and (flash_m.get("time_ms") or 0) > 0
	        and (math_m.get("arith_intensity") or 0) > 0
	    )

	    if ratios_defined:
	        speedup = math_m["time_ms"] / flash_m["time_ms"]
	        intensity_ratio = flash_m["arith_intensity"] / math_m["arith_intensity"]

	        measured_insight = "\n".join([
	            "",
	            "",
	            "---",
	            "",
	            "📊 Measured Results:",
	            f"- Math backend: {math_m['achieved_tflops']:.1f} TFLOPS @ {math_m['arith_intensity']:.0f} FLOPs/byte",
	            f"- Flash backend: {flash_m['achieved_tflops']:.1f} TFLOPS @ {flash_m['arith_intensity']:.0f} FLOPs/byte",
	            f"- Speedup: {speedup:.1f}× faster",
	            f"- Intensity increase: {intensity_ratio:.0f}× higher arithmetic intensity",
	        ])

	        return base_insight + measured_insight

	    return base_insight + "\n\nRun a benchmark to see measured values on the chart!"