# a0y0346
# Fix GQA scaling chart x-axis: use log scale to space tick labels properly
# 287497c
"""
GQA/MQA comparison module.
Demonstrates Grouped-Query Attention (GQA) and Multi-Query Attention (MQA)
using REAL HuggingFace model configurations and benchmarks.
Key concepts:
- MHA (Multi-Head Attention): Each query head has its own K,V heads (ratio 1:1)
- GQA (Grouped-Query Attention): Multiple query heads share K,V heads (ratio N:1)
- MQA (Multi-Query Attention): All query heads share one K,V head (ratio N:1 where N=num_heads)
"""
import torch
import torch.nn.functional as F
import numpy as np
import plotly.graph_objects as go
from plotly.subplots import make_subplots
from .models import load_model
def get_gqa_config_from_model(model_name: str) -> dict:
    """
    Load model and extract REAL GQA configuration from model.config.

    Args:
        model_name: Model name to load

    Returns:
        Dict with GQA configuration from actual model: head counts,
        head_dim, layer count, GQA ratio, and a human-readable
        attention-type label (MHA / GQA / MQA).
    """
    model = load_model(model_name)
    config = model.config
    num_heads = config.num_attention_heads
    # Some configs omit num_key_value_heads or set it to None; both mean
    # "no KV sharing", i.e. classic MHA with num_kv_heads == num_heads.
    num_kv_heads = getattr(config, 'num_key_value_heads', None) or num_heads
    # Prefer an explicit head_dim when the config declares one — some models
    # (e.g. Gemma) use head_dim != hidden_size // num_heads.
    head_dim = getattr(config, 'head_dim', None) or config.hidden_size // num_heads
    num_layers = config.num_hidden_layers
    # Calculate GQA ratio (how many Q heads share each KV head)
    gqa_ratio = num_heads // num_kv_heads if num_kv_heads > 0 else 1
    # Determine attention type
    if num_kv_heads == num_heads:
        attention_type = "MHA"
        attention_description = "Multi-Head Attention (each Q head has own K,V)"
    elif num_kv_heads == 1:
        attention_type = "MQA"
        attention_description = "Multi-Query Attention (all Q heads share 1 K,V)"
    else:
        attention_type = "GQA"
        attention_description = f"Grouped-Query Attention ({gqa_ratio} Q heads share 1 K,V pair)"
    return {
        "model_name": model_name,
        "num_heads": num_heads,
        "num_kv_heads": num_kv_heads,
        "head_dim": head_dim,
        "num_layers": num_layers,
        "hidden_size": config.hidden_size,
        "gqa_ratio": gqa_ratio,
        "attention_type": attention_type,
        "attention_description": attention_description,
        "is_mha": num_heads == num_kv_heads,
        "is_mqa": num_kv_heads == 1,
        "is_gqa": 1 < num_kv_heads < num_heads,
    }
def calculate_kv_cache_memory(
    num_kv_heads: int,
    head_dim: int,
    num_layers: int,
    seq_len: int,
    batch_size: int = 1,
    dtype_bytes: int = 2,  # FP16
) -> float:
    """
    Compute the KV cache footprint in megabytes.

    The cache holds one K and one V tensor per layer, hence the factor of 2:
    2 x num_kv_heads x head_dim x seq_len x num_layers x batch_size x dtype_bytes
    """
    elems_per_layer = 2 * num_kv_heads * head_dim * seq_len
    total_bytes = elems_per_layer * num_layers * batch_size * dtype_bytes
    return total_bytes / (1 << 20)
def compare_attention_memory(model_name: str, seq_len: int) -> dict:
    """
    Compare KV cache memory for MHA vs actual GQA configuration.
    Uses REAL model config to show memory savings.
    """
    cfg = get_gqa_config_from_model(model_name)
    head_dim = cfg["head_dim"]
    num_layers = cfg["num_layers"]

    def _cache_mb(kv_heads: int) -> float:
        # KV cache footprint for a hypothetical KV head count, everything
        # else taken from the real model config.
        return calculate_kv_cache_memory(
            num_kv_heads=kv_heads,
            head_dim=head_dim,
            num_layers=num_layers,
            seq_len=seq_len,
        )

    gqa_mb = _cache_mb(cfg["num_kv_heads"])  # actual configuration
    mha_mb = _cache_mb(cfg["num_heads"])     # as if every Q head had its own K,V
    mqa_mb = _cache_mb(1)                    # single K,V head shared by all

    ratio = mha_mb / gqa_mb if gqa_mb > 0 else 1
    return {
        "model_name": model_name,
        "seq_len": seq_len,
        "gqa_config": cfg,
        "mha_memory_mb": round(mha_mb, 2),
        "gqa_memory_mb": round(gqa_mb, 2),
        "mqa_memory_mb": round(mqa_mb, 2),
        "savings_mb": round(mha_mb - gqa_mb, 2),
        "savings_ratio": round(ratio, 1),
    }
def benchmark_gqa_attention(
    model_name: str,
    seq_len: int,
    batch_size: int = 1,
    num_iterations: int = 10,
    use_flash: bool = True,
) -> dict:
    """
    Benchmark attention with real model's GQA configuration.

    Uses actual num_kv_heads from model.config, then expands K,V to the
    query head count before SDPA (the standard GQA execution pattern).

    Args:
        model_name: Model whose attention geometry to benchmark.
        seq_len: Sequence length of the synthetic Q/K/V tensors.
        batch_size: Batch dimension of the synthetic tensors.
        num_iterations: Timed iterations to average over.
        use_flash: Restrict SDPA to the flash backend; else math backend.

    Returns:
        Dict with mean latency (ms), peak CUDA memory (MB), config, and status.
    """
    if not torch.cuda.is_available():
        return {"time_ms": 0, "memory_mb": 0, "status": "error: CUDA not available"}
    device = torch.device("cuda")
    dtype = torch.float16
    try:
        gqa_config = get_gqa_config_from_model(model_name)
        num_heads = gqa_config["num_heads"]
        num_kv_heads = gqa_config["num_kv_heads"]
        head_dim = gqa_config["head_dim"]
        gqa_ratio = gqa_config["gqa_ratio"]
        # Create Q tensor: [batch, num_heads, seq_len, head_dim]
        Q = torch.randn(batch_size, num_heads, seq_len, head_dim, dtype=dtype, device=device)
        # Create K,V with the model's real (possibly smaller) KV head count
        K = torch.randn(batch_size, num_kv_heads, seq_len, head_dim, dtype=dtype, device=device)
        V = torch.randn(batch_size, num_kv_heads, seq_len, head_dim, dtype=dtype, device=device)
        # Expand K,V to match Q head count (this is what happens in GQA)
        if num_kv_heads < num_heads:
            K = K.repeat_interleave(gqa_ratio, dim=1)
            V = V.repeat_interleave(gqa_ratio, dim=1)
        # Backend selection: flash-only or math-only (mem-efficient always off).
        # Enter the backend context ONCE per phase instead of per iteration,
        # so context-manager entry/exit overhead is not part of the timing.
        with torch.backends.cuda.sdp_kernel(
            enable_flash=use_flash,
            enable_math=not use_flash,
            enable_mem_efficient=False,
        ):
            # Warmup so kernel selection/caching doesn't pollute the timing
            for _ in range(3):
                _ = F.scaled_dot_product_attention(Q, K, V, is_causal=True)
            torch.cuda.synchronize()
            torch.cuda.reset_peak_memory_stats()
            # Timed runs bracketed by CUDA events
            start = torch.cuda.Event(enable_timing=True)
            end = torch.cuda.Event(enable_timing=True)
            start.record()
            for _ in range(num_iterations):
                output = F.scaled_dot_product_attention(Q, K, V, is_causal=True)
            end.record()
            torch.cuda.synchronize()
        time_ms = start.elapsed_time(end) / num_iterations
        memory_mb = torch.cuda.max_memory_allocated() / (1024 * 1024)
        del Q, K, V, output
        torch.cuda.empty_cache()
        return {
            "time_ms": round(time_ms, 3),
            "memory_mb": round(memory_mb, 1),
            "config": gqa_config,
            "attention_type": gqa_config["attention_type"],
            "gqa_ratio": gqa_ratio,
            "status": "success",
        }
    except Exception as e:
        # Best-effort benchmark: report a truncated error instead of raising.
        return {
            "time_ms": 0,
            "memory_mb": 0,
            "status": f"error: {str(e)[:100]}",
        }
def benchmark_mha_attention(
    model_name: str,
    seq_len: int,
    batch_size: int = 1,
    num_iterations: int = 10,
    use_flash: bool = True,
) -> dict:
    """
    Benchmark attention as if model used MHA (for comparison).

    Uses num_heads for both Q and KV to simulate MHA, regardless of the
    model's real num_kv_heads. Same contract as benchmark_gqa_attention.

    Returns:
        Dict with mean latency (ms), peak CUDA memory (MB), and status.
    """
    if not torch.cuda.is_available():
        return {"time_ms": 0, "memory_mb": 0, "status": "error: CUDA not available"}
    device = torch.device("cuda")
    dtype = torch.float16
    try:
        gqa_config = get_gqa_config_from_model(model_name)
        num_heads = gqa_config["num_heads"]
        head_dim = gqa_config["head_dim"]
        # For MHA simulation: K and V get a full set of heads (num_kv_heads == num_heads)
        Q = torch.randn(batch_size, num_heads, seq_len, head_dim, dtype=dtype, device=device)
        K = torch.randn(batch_size, num_heads, seq_len, head_dim, dtype=dtype, device=device)
        V = torch.randn(batch_size, num_heads, seq_len, head_dim, dtype=dtype, device=device)
        # Backend selection: flash-only or math-only (mem-efficient always off).
        # Enter the backend context ONCE per phase instead of per iteration,
        # so context-manager entry/exit overhead is not part of the timing.
        with torch.backends.cuda.sdp_kernel(
            enable_flash=use_flash,
            enable_math=not use_flash,
            enable_mem_efficient=False,
        ):
            # Warmup so kernel selection/caching doesn't pollute the timing
            for _ in range(3):
                _ = F.scaled_dot_product_attention(Q, K, V, is_causal=True)
            torch.cuda.synchronize()
            torch.cuda.reset_peak_memory_stats()
            # Timed runs bracketed by CUDA events
            start = torch.cuda.Event(enable_timing=True)
            end = torch.cuda.Event(enable_timing=True)
            start.record()
            for _ in range(num_iterations):
                output = F.scaled_dot_product_attention(Q, K, V, is_causal=True)
            end.record()
            torch.cuda.synchronize()
        time_ms = start.elapsed_time(end) / num_iterations
        memory_mb = torch.cuda.max_memory_allocated() / (1024 * 1024)
        del Q, K, V, output
        torch.cuda.empty_cache()
        return {
            "time_ms": round(time_ms, 3),
            "memory_mb": round(memory_mb, 1),
            "attention_type": "MHA (simulated)",
            "num_kv_heads": num_heads,
            "status": "success",
        }
    except Exception as e:
        # Best-effort benchmark: report a truncated error instead of raising.
        return {
            "time_ms": 0,
            "memory_mb": 0,
            "status": f"error: {str(e)[:100]}",
        }
def create_head_sharing_diagram(model_name: str) -> go.Figure:
    """
    Create visual diagram showing how Q heads share K,V heads.
    Uses REAL model config for accurate visualization.

    Draws three side-by-side panels: MHA (1:1 Q-to-KV), the model's actual
    configuration, and MQA (all Q heads -> one KV head). In each panel,
    Q heads are blue circles on the left (x=0), KV heads green squares on
    the right (x=2), with thin lines showing which Q heads read which KV head.
    """
    gqa_config = get_gqa_config_from_model(model_name)
    num_heads = gqa_config["num_heads"]
    num_kv_heads = gqa_config["num_kv_heads"]
    gqa_ratio = gqa_config["gqa_ratio"]
    attention_type = gqa_config["attention_type"]
    fig = make_subplots(
        rows=1, cols=3,
        subplot_titles=(
            f"<b>MHA</b> ({num_heads}:{num_heads})",
            f"<b>{attention_type}</b> ({num_heads}:{num_kv_heads})",
            f"<b>MQA</b> ({num_heads}:1)",
        ),
        horizontal_spacing=0.08,
    )
    # Limit display for visual clarity (at most 8 markers per column)
    display_heads = min(num_heads, 8)
    display_kv = min(num_kv_heads, 8)
    # Colors
    q_color = "#3b82f6"  # Blue for Q heads
    kv_color = "#22c55e"  # Green for KV heads
    # MHA panel: 1:1 mapping — every Q head gets its own KV head at the same height.
    for i in range(display_heads):
        # Q head (top-to-bottom ordering: Q0 at the top)
        fig.add_trace(go.Scatter(
            x=[0], y=[display_heads - 1 - i],
            mode="markers+text",
            marker=dict(size=25, color=q_color, symbol="circle"),
            text=[f"Q{i}"],
            textposition="middle center",
            textfont=dict(color="white", size=9),
            showlegend=False,
            hoverinfo="skip",
        ), row=1, col=1)
        # KV head, aligned with its Q head
        fig.add_trace(go.Scatter(
            x=[2], y=[display_heads - 1 - i],
            mode="markers+text",
            marker=dict(size=25, color=kv_color, symbol="square"),
            text=[f"KV{i}"],
            textposition="middle center",
            textfont=dict(color="white", size=9),
            showlegend=False,
            hoverinfo="skip",
        ), row=1, col=1)
        # Connection line (horizontal, since the pairing is 1:1)
        fig.add_trace(go.Scatter(
            x=[0.3, 1.7], y=[display_heads - 1 - i, display_heads - 1 - i],
            mode="lines",
            line=dict(color="rgba(0,0,0,0.3)", width=1),
            showlegend=False,
            hoverinfo="skip",
        ), row=1, col=1)
    # GQA panel: the model's actual configuration — draw all Q heads first.
    for i in range(display_heads):
        # Q head
        fig.add_trace(go.Scatter(
            x=[0], y=[display_heads - 1 - i],
            mode="markers+text",
            marker=dict(size=25, color=q_color, symbol="circle"),
            text=[f"Q{i}"],
            textposition="middle center",
            textfont=dict(color="white", size=9),
            showlegend=False,
            hoverinfo="skip",
        ), row=1, col=2)
    # KV heads for GQA, spread evenly over the panel height
    for j in range(display_kv):
        # Single KV head sits at mid-height; multiple KV heads are spaced
        # linearly from bottom (j=0 maps to y=0) to top.
        kv_y = (display_heads - 1) * (j / max(display_kv - 1, 1)) if display_kv > 1 else (display_heads - 1) / 2
        fig.add_trace(go.Scatter(
            x=[2], y=[kv_y],
            mode="markers+text",
            marker=dict(size=30, color=kv_color, symbol="square"),
            text=[f"KV{j}"],
            textposition="middle center",
            textfont=dict(color="white", size=9),
            showlegend=False,
            hoverinfo="skip",
        ), row=1, col=2)
        # Draw connections from Q heads to their shared KV head; the min()
        # clamps the fan-out when fewer than gqa_ratio Q heads remain visible.
        heads_per_kv = gqa_ratio
        for k in range(min(heads_per_kv, display_heads - j * heads_per_kv)):
            q_idx = j * heads_per_kv + k
            if q_idx < display_heads:
                fig.add_trace(go.Scatter(
                    x=[0.3, 1.7], y=[display_heads - 1 - q_idx, kv_y],
                    mode="lines",
                    line=dict(color="rgba(0,0,0,0.2)", width=1),
                    showlegend=False,
                    hoverinfo="skip",
                ), row=1, col=2)
    # MQA panel: all Q heads share 1 KV head
    for i in range(display_heads):
        fig.add_trace(go.Scatter(
            x=[0], y=[display_heads - 1 - i],
            mode="markers+text",
            marker=dict(size=25, color=q_color, symbol="circle"),
            text=[f"Q{i}"],
            textposition="middle center",
            textfont=dict(color="white", size=9),
            showlegend=False,
            hoverinfo="skip",
        ), row=1, col=3)
        # Connection to the single KV head at mid-height
        fig.add_trace(go.Scatter(
            x=[0.3, 1.7], y=[display_heads - 1 - i, (display_heads - 1) / 2],
            mode="lines",
            line=dict(color="rgba(0,0,0,0.2)", width=1),
            showlegend=False,
            hoverinfo="skip",
        ), row=1, col=3)
    # Single KV marker for MQA (slightly larger to stand out)
    fig.add_trace(go.Scatter(
        x=[2], y=[(display_heads - 1) / 2],
        mode="markers+text",
        marker=dict(size=35, color=kv_color, symbol="square"),
        text=["KV0"],
        textposition="middle center",
        textfont=dict(color="white", size=10),
        showlegend=False,
        hoverinfo="skip",
    ), row=1, col=3)
    fig.update_layout(
        title=dict(
            text=f"Head Sharing Pattern: {model_name}",
            x=0.5,
        ),
        height=350,
        showlegend=False,
        margin=dict(l=20, r=20, t=60, b=40),
    )
    # Hide axes in all subplots — the panels are pure diagrams, not plots
    for col in [1, 2, 3]:
        fig.update_xaxes(
            showgrid=False, zeroline=False, showticklabels=False,
            range=[-0.5, 2.5], row=1, col=col
        )
        fig.update_yaxes(
            showgrid=False, zeroline=False, showticklabels=False,
            range=[-0.5, display_heads - 0.5], row=1, col=col
        )
    return fig
def create_memory_comparison_chart(model_name: str, seq_len: int) -> go.Figure:
    """
    Build a bar chart comparing KV cache memory for MHA, the model's actual
    attention layout, and MQA, using values derived from the real model config.
    """
    data = compare_attention_memory(model_name, seq_len)
    cfg = data["gqa_config"]
    labels = [
        "MHA<br>(Full)",
        f"{cfg['attention_type']}<br>(Actual)",
        "MQA<br>(Minimal)",
    ]
    values = [
        data["mha_memory_mb"],
        data["gqa_memory_mb"],
        data["mqa_memory_mb"],
    ]
    bar_colors = ["#ef4444", "#22c55e", "#3b82f6"]  # red / green / blue
    fig = go.Figure()
    fig.add_trace(go.Bar(
        x=labels,
        y=values,
        marker_color=bar_colors,
        text=[f"{v:.1f} MB" for v in values],
        textposition="outside",
        textfont=dict(size=12),
    ))
    # Annotate the savings factor, pointing at the actual-config (middle) bar.
    ratio = data["savings_ratio"]
    fig.add_annotation(
        x=1.5, y=values[1] * 1.5,
        text=f"<b>{ratio:.1f}× smaller</b>",
        showarrow=True,
        arrowhead=2,
        arrowsize=1,
        arrowwidth=1,
        arrowcolor="#22c55e",
        ax=30,
        ay=-20,
        font=dict(size=11, color="#22c55e"),
    )
    # Extend the y-range past the tallest bar so outside labels stay visible.
    ceiling = max(values) * 1.25
    title_html = (
        f"KV Cache Memory at {seq_len} tokens"
        f"<br><sub>{cfg['num_heads']} Q heads, {cfg['num_kv_heads']} KV heads, "
        f"{cfg['num_layers']} layers</sub>"
    )
    fig.update_layout(
        title=dict(text=title_html, x=0.5),
        yaxis_title="Memory (MB)",
        height=400,
        margin=dict(l=50, r=50, t=90, b=50),
        yaxis=dict(rangemode='tozero', range=[0, ceiling]),
    )
    return fig
def create_scaling_chart(model_name: str) -> go.Figure:
    """
    Plot how the KV cache grows with sequence length for MHA, the model's
    actual attention type, and MQA.
    """
    cfg = get_gqa_config_from_model(model_name)
    seq_lengths = [512, 1024, 2048, 4096, 8192, 16384]
    mha_curve, gqa_curve, mqa_curve = [], [], []
    for length in seq_lengths:
        snapshot = compare_attention_memory(model_name, length)
        mha_curve.append(snapshot["mha_memory_mb"])
        gqa_curve.append(snapshot["gqa_memory_mb"])
        mqa_curve.append(snapshot["mqa_memory_mb"])
    fig = go.Figure()
    # One trace per attention variant; the actual config is drawn heaviest.
    trace_specs = [
        (mha_curve, "MHA (baseline)",
         dict(color="#ef4444", width=2), dict(size=8)),
        (gqa_curve, f"{cfg['attention_type']} (actual)",
         dict(color="#22c55e", width=3), dict(size=10)),
        (mqa_curve, "MQA (minimal)",
         dict(color="#3b82f6", width=2, dash="dash"), dict(size=8)),
    ]
    for ys, label, line_style, marker_style in trace_specs:
        fig.add_trace(go.Scatter(
            x=seq_lengths, y=ys,
            mode="lines+markers",
            name=label,
            line=line_style,
            marker=marker_style,
        ))
    fig.update_layout(
        title=dict(
            text=f"KV Cache Scaling: {model_name}<br><sub>GQA Ratio {cfg['gqa_ratio']}:1</sub>",
            x=0.5,
        ),
        xaxis_title="Sequence Length",
        yaxis_title="KV Cache Memory (MB)",
        height=420,
        margin=dict(l=60, r=50, t=80, b=100),
        legend=dict(
            orientation="h",
            yanchor="top",
            y=-0.18,
            xanchor="center",
            x=0.5,
        ),
        # Log x-axis with explicit ticks: the doubling sequence lengths are
        # evenly spaced and labelled 512, 1K, 2K, ..., 16K.
        xaxis=dict(
            type="log",
            tickmode="array",
            tickvals=seq_lengths,
            ticktext=[f"{s//1000}K" if s >= 1000 else str(s) for s in seq_lengths],
            tickangle=0,
        ),
        yaxis=dict(rangemode='tozero'),
    )
    return fig
def run_gqa_analysis(model_name: str, seq_len: int) -> tuple:
    """
    Run the full GQA analysis pipeline for one model.

    Returns a 5-tuple: (config dict, head-sharing diagram, memory bar
    chart, scaling chart, markdown insight text).
    """
    # Real config and memory comparison from the actual model
    cfg = get_gqa_config_from_model(model_name)
    mem = compare_attention_memory(model_name, seq_len)
    # Build the three figures
    diagram = create_head_sharing_diagram(model_name)
    bars = create_memory_comparison_chart(model_name, seq_len)
    scaling = create_scaling_chart(model_name)
    # Markdown summary shown alongside the charts
    mqa_factor = mem['mha_memory_mb'] / mem['mqa_memory_mb']
    insight = f"""### {model_name} Attention Configuration
**Type:** {cfg['attention_type']} ({cfg['attention_description']})
**Architecture (from model.config):**
- Query heads: **{cfg['num_heads']}**
- KV heads: **{cfg['num_kv_heads']}**
- Head dimension: **{cfg['head_dim']}**
- Layers: **{cfg['num_layers']}**
- GQA ratio: **{cfg['gqa_ratio']}:1**
---
### KV Cache Memory at {seq_len} tokens
| Configuration | Memory | vs MHA |
|--------------|--------|--------|
| MHA (baseline) | {mem['mha_memory_mb']:.1f} MB | 1.0× |
| **{cfg['attention_type']}** | **{mem['gqa_memory_mb']:.1f} MB** | **{mem['savings_ratio']:.1f}× smaller** |
| MQA (minimal) | {mem['mqa_memory_mb']:.1f} MB | {mqa_factor:.1f}× smaller |
---
### Why GQA?
- **Memory efficiency:** Reduces KV cache by {mem['savings_ratio']:.1f}×
- **Longer contexts:** Same GPU memory supports {mem['savings_ratio']:.1f}× longer sequences
- **Quality preservation:** Better than MQA for maintaining attention quality
- **Compute savings:** Fewer KV projections during generation
"""
    return cfg, diagram, bars, scaling, insight