Upload folder using huggingface_hub

ccef021 verified 25 days ago

7.77 kB

	# /// script
	# dependencies = [
	# "numpy",
	# "torch",
	# "kernels"
	# ]
	# ///
	"""
	Flash-MLA (Multi-head Latent Attention) Example

	This script demonstrates the usage of the Flash-MLA kernel for efficient
	attention computation on Hopper (SM90) GPUs.

	Flash-MLA is optimized for DeepSeek-style MLA attention patterns.
	"""
	import math
	import torch
	from kernels import get_kernel, get_local_kernel
	from pathlib import Path

	# Setup
	torch.manual_seed(42)

	# Fail fast with a readable message: everything below (kernel loading,
	# capability query, tensor allocation) needs a CUDA device.
	if not torch.cuda.is_available():
	    raise SystemExit("Flash-MLA example requires a CUDA-capable GPU, but none is available.")

	# Load the Flash-MLA kernel by repository id (presumably fetched from the
	# Hugging Face Hub by the `kernels` package - confirm against its docs).
	flash_mla = get_kernel("drbh/tmp-kernel-123")
	# Alternative for a locally built kernel:
	# flash_mla = get_local_kernel(Path("build"), "flash-mla")
	device = torch.device("cuda")

	# Check GPU architecture
	cc_major, cc_minor = torch.cuda.get_device_capability()
	print(f"GPU Compute Capability: {cc_major}.{cc_minor}")
	if cc_major != 9:
	    # Not fatal: the script continues, but the user is told what to expect.
	    print("Warning: Flash-MLA dense decoding is optimized for SM90 (Hopper) GPUs.")
	    print("Some features may not work on other architectures.")

	def cdiv(a: int, b: int) -> int:
	    """Return ``a`` divided by ``b``, rounded up (ceiling division).

	    Implemented as a negated floor division, which is exact for arbitrarily
	    large integers and, unlike ``(a + b - 1) // b``, also rounds correctly
	    when ``b`` is negative.

	    Args:
	        a: Dividend.
	        b: Divisor; must be non-zero.

	    Returns:
	        The smallest integer ``n`` such that ``n >= a / b``.

	    Raises:
	        ZeroDivisionError: If ``b`` is zero.
	    """
	    return -(-a // b)


	# =============================================================================
	# Test 1: Dense MLA Decoding (SM90)
	# =============================================================================
	print("\n" + 60 * "=")
	print("Test 1: Dense MLA Decoding")
	print(60 * "=")

	# Problem sizes (chosen by the original author to match DeepSeek V3)
	batch_size = 2
	seq_len_q = 1  # Typically 1 for decoding
	seq_len_k = 256  # KV cache sequence length
	num_heads_q = 64  # Number of query heads (must be 64 or 128)
	num_heads_k = 1  # MLA uses single KV head
	head_dim = 576  # Q/K head dimension (576 or 512)
	head_dim_v = 512  # V head dimension (must be 512)
	page_block_size = 64  # Page block size (must be 64)

	# Pages needed per sequence, and across the whole batch
	max_num_blocks = cdiv(seq_len_k, page_block_size)
	total_blocks = batch_size * max_num_blocks

	# Queries: small random values, clipped in place to [-1, 1]
	q = torch.randn(
	    (batch_size, seq_len_q, num_heads_q, head_dim),
	    dtype=torch.bfloat16,
	    device=device,
	).div(10)
	q.clamp_(-1.0, 1.0)

	# Paged KV cache laid out as [num_blocks, page_block_size, num_heads_k, head_dim]
	blocked_k = torch.randn(
	    (total_blocks, page_block_size, num_heads_k, head_dim),
	    dtype=torch.bfloat16,
	    device=device,
	).div(10)
	blocked_k.clamp_(-1.0, 1.0)

	# Row i of the block table lists the cache pages owned by batch element i;
	# here pages are simply handed out consecutively.
	block_table = torch.arange(
	    total_blocks, dtype=torch.int32, device=device
	).reshape(batch_size, max_num_blocks)

	# Every batch element holds the same number of cached tokens
	cache_seqlens = torch.full(
	    (batch_size,), seq_len_k, dtype=torch.int32, device=device
	)

	# Scheduler metadata is passed through to flash_mla_with_kvcache below
	tile_scheduler_metadata, _ = flash_mla.get_mla_metadata()

	print(f"Query shape: {q.shape}")
	print(f"KV cache shape: {blocked_k.shape}")
	print(f"Block table shape: {block_table.shape}")
	print(f"Cache seqlens: {cache_seqlens}")

	# Run the dense decoding kernel without autograd bookkeeping
	with torch.inference_mode():
	    out, lse = flash_mla.flash_mla_with_kvcache(
	        q=q,
	        k_cache=blocked_k,
	        block_table=block_table,
	        cache_seqlens=cache_seqlens,
	        head_dim_v=head_dim_v,
	        tile_scheduler_metadata=tile_scheduler_metadata,
	        num_splits=None,
	        causal=False,  # no causal masking in this test
	    )

	print(f"Output shape: {out.shape}")  # [batch_size, seq_len_q, num_heads_q, head_dim_v]
	print(f"LSE shape: {lse.shape}")  # [batch_size, num_heads_q, seq_len_q]
	print("Dense MLA decoding: SUCCESS")


	# =============================================================================
	# Test 2: Reference comparison for correctness
	# =============================================================================
	# Section banner: a blank line followed by a 60-character rule around the title
	print("\n" + 60 * "=")
	print("Test 2: Correctness Check vs PyTorch Reference")
	print(60 * "=")

	def reference_attention(
	    q: torch.Tensor,
	    blocked_k: torch.Tensor,
	    block_table: torch.Tensor,
	    cache_seqlens: torch.Tensor,
	    dv: int,
	    is_causal: bool = False,
	    softmax_scale: "float | None" = None,
	) -> "tuple[torch.Tensor, torch.Tensor]":
	    """Compute paged-KV attention with plain PyTorch ops, for verification.

	    The value vectors are the first ``dv`` channels of the cached key
	    vectors. All math is done in float32, one batch element at a time.

	    Args:
	        q: Queries of shape ``[b, s_q, h_q, d]``.
	        blocked_k: Paged cache of shape ``[num_blocks, block_size, h_kv, d]``.
	        block_table: Per-batch-element list of cache block ids; row ``i``
	            must hold at least ``ceil(cache_seqlens[i] / block_size)`` ids.
	        cache_seqlens: Number of valid cached tokens per batch element.
	        dv: Number of leading channels of each cached vector used as V.
	        is_causal: Apply a bottom-right-aligned causal mask (only has an
	            effect when ``s_q > 1``).
	        softmax_scale: Multiplier applied to ``Q @ K^T`` before the softmax.
	            Defaults to ``1 / sqrt(d)``.

	    Returns:
	        ``(out, lse)`` where ``out`` has shape ``[b, s_q, h_q, dv]`` in
	        ``q.dtype`` and ``lse`` (log-sum-exp of the scaled scores) has shape
	        ``[b, h_q, s_q]`` in float32.

	    Raises:
	        ValueError: If ``h_q`` is not a multiple of ``h_kv``.
	    """
	    b, s_q, h_q, d = q.size()
	    block_size = blocked_k.size(1)
	    h_kv = blocked_k.size(2)

	    if h_q % h_kv != 0:
	        raise ValueError(
	            f"number of query heads ({h_q}) must be a multiple of the "
	            f"number of KV heads ({h_kv})"
	        )
	    if softmax_scale is None:
	        softmax_scale = 1.0 / math.sqrt(d)

	    out_ref = torch.empty(b, s_q, h_q, dv, dtype=torch.float32, device=q.device)
	    lse_ref = torch.empty(b, h_q, s_q, dtype=torch.float32, device=q.device)

	    # Move all lengths to the host in one transfer instead of one per row.
	    seqlens = cache_seqlens.tolist()

	    for i in range(b):
	        cur_len = int(seqlens[i])
	        # Ceiling division: number of cache blocks that hold cur_len tokens.
	        cur_num_blocks = -(-cur_len // block_size)
	        # int64 ids are accepted as an index tensor by every PyTorch version.
	        cur_block_indices = block_table[i][:cur_num_blocks].long()

	        # Gather this sequence's blocks, flatten them to a token axis and
	        # drop the unused tail of the last block.
	        cur_kv = blocked_k[cur_block_indices].reshape(-1, h_kv, d)[:cur_len]

	        query = q[i].transpose(0, 1).float()  # [h_q, s_q, d]
	        kv = cur_kv.transpose(0, 1).float()  # [h_kv, s_k, d]

	        # Share each KV head across its group of query heads.
	        if h_kv != h_q:
	            kv = kv.repeat_interleave(h_q // h_kv, dim=0)

	        # Scaled Q @ K^T. Scaling happens before masking so that a zero
	        # scale can never turn a masked -inf into NaN.
	        attn_weight = (query @ kv.transpose(-2, -1)) * softmax_scale

	        s_k = kv.size(1)
	        if is_causal and s_q > 1:
	            # Query row r may attend to keys 0 .. r + (s_k - s_q).
	            mask = torch.ones(s_q, s_k, dtype=torch.bool, device=q.device).tril(
	                diagonal=s_k - s_q
	            )
	            attn_weight.masked_fill_(~mask, float("-inf"))

	        lse = attn_weight.logsumexp(dim=-1)  # [h_q, s_q]
	        attn_weight = torch.softmax(attn_weight, dim=-1)

	        # Attention @ V, with V taken from the first dv channels of the cache.
	        output = attn_weight @ kv[..., :dv]  # [h_q, s_q, dv]

	        out_ref[i] = output.transpose(0, 1)
	        lse_ref[i] = lse

	    return out_ref.to(q.dtype), lse_ref

	# Run the PyTorch reference on the same inputs as Test 1
	out_ref, lse_ref = reference_attention(
	    q, blocked_k, block_table, cache_seqlens, head_dim_v, is_causal=False
	)

	# Element-wise comparison in float32; the LSE tolerance is the tighter one
	out_close = torch.allclose(out.float(), out_ref.float(), rtol=1e-2, atol=1e-3)
	lse_close = torch.allclose(lse.float(), lse_ref.float(), rtol=1e-3, atol=1e-4)

	print(f"Output close to reference: {out_close}")
	print(f"LSE close to reference: {lse_close}")

	if not (out_close and lse_close):
	    # Report the worst-case absolute deviation to help judge the mismatch
	    max_out_diff = torch.max(torch.abs(out.float() - out_ref.float())).item()
	    max_lse_diff = torch.max(torch.abs(lse.float() - lse_ref.float())).item()
	    print(f"Max output diff: {max_out_diff}")
	    print(f"Max LSE diff: {max_lse_diff}")
	    print("Correctness check: Check differences above")
	else:
	    print("Correctness check: PASSED")


	# =============================================================================
	# Test 3: Different configurations
	# =============================================================================
	print("\n" + 60 * "=")
	print("Test 3: Testing different configurations")
	print(60 * "=")

	# Each entry varies batch size, query length, query-head count and KV length
	configs = [
	    {"batch": 1, "seq_q": 1, "heads_q": 64, "seq_k": 128},
	    {"batch": 4, "seq_q": 1, "heads_q": 128, "seq_k": 512},
	    {"batch": 8, "seq_q": 2, "heads_q": 64, "seq_k": 1024},
	]

	for cfg in configs:
	    b, s_q, h_q, s_k = (cfg[key] for key in ("batch", "seq_q", "heads_q", "seq_k"))

	    # Cache pages per sequence and in total
	    max_blocks = cdiv(s_k, page_block_size)
	    total_blks = b * max_blocks

	    # Random queries and paged cache, consecutive page assignment, full lengths
	    q_test = torch.randn(
	        (b, s_q, h_q, head_dim), dtype=torch.bfloat16, device=device
	    ).div(10)
	    k_test = torch.randn(
	        (total_blks, page_block_size, num_heads_k, head_dim),
	        dtype=torch.bfloat16,
	        device=device,
	    ).div(10)
	    bt_test = torch.arange(
	        total_blks, dtype=torch.int32, device=device
	    ).reshape(b, max_blocks)
	    sl_test = torch.full((b,), s_k, dtype=torch.int32, device=device)

	    # Fresh scheduler metadata is requested for every configuration
	    sched_meta, _ = flash_mla.get_mla_metadata()

	    with torch.inference_mode():
	        out_test, lse_test = flash_mla.flash_mla_with_kvcache(
	            q=q_test,
	            k_cache=k_test,
	            block_table=bt_test,
	            cache_seqlens=sl_test,
	            head_dim_v=head_dim_v,
	            tile_scheduler_metadata=sched_meta,
	        )

	    print(f"Config: batch={b}, seq_q={s_q}, heads_q={h_q}, seq_k={s_k} -> Output: {out_test.shape} SUCCESS")


	print("\n" + "=" * 60)
	# Only claim overall success when the Test 2 comparison against the PyTorch
	# reference actually passed; otherwise the banner would contradict the
	# "Check differences above" message printed earlier.
	if out_close and lse_close:
	    print("All tests completed successfully!")
	else:
	    print("All tests completed, but the correctness check did not pass.")
	print("=" * 60)