# flash-mla / tests / test_flash_mla_sparse_decoding.py
# Uploaded via huggingface_hub by medmekk (commit ccef021, verified)
# /// script
# dependencies = [
# "numpy",
# "torch",
# "kernels",
# "triton",
# "rich",
# ]
# ///
import time
import dataclasses
from typing import Tuple, List, Dict, Optional
import copy
import rich.console
import rich.table
import torch
import kernelkit as kk
# import flash_mla
from kernels import get_kernel, get_local_kernel
flash_mla = get_kernel("drbh/tmp-kernel-123")
import lib
from lib import TestParam
from lib import RawTestParamForDecode as RawTestParam
import ref
"""
Generate testcase for unit test
"""
def gen_testcase() -> List[RawTestParam]:
    """Build the full suite of decode testcases.

    Returns, in order:
      1. correctness sweeps over (d_qk, extra-K, topk-length, h_q) feature combos,
      2. corner cases (all-invalid indices / zero-length K / attn-sink),
      3. performance cases (production configs + peak-perf configs).
    """
    correctness_cases = []
    corner_cases = []
    # The extra-K path (extra_s_k / extra_topk) only exists for d_qk == 512;
    # for d_qk == 576 the inner feature toggles collapse to [False].
    for d_qk in [576, 512]:
        for have_extra_k in ([False, True] if d_qk == 512 else [False]):
            for have_extra_topk_len in ([False, True] if have_extra_k else [False]):
                for have_topk_len in ([False, True] if d_qk == 512 else [False]):
                    for h_q in [64, 128]:
                        # Correctness-only cases: check_correctness=True, num_runs=0.
                        cur_correctness_cases = [
                            RawTestParam(b, h_q, s_q, 1, s_k, is_varlen, topk,
                                         have_topk_length=have_topk_len,
                                         enable_attn_sink=True,
                                         extra_s_k=extra_s_k,
                                         extra_topk=extra_topk,
                                         block_size=block_size,
                                         extra_block_size=extra_block_size,
                                         have_extra_topk_length=have_extra_topk_len,
                                         d_qk=d_qk,
                                         check_correctness=True,
                                         num_runs=0)
                            for (s_k, topk, block_size) in [
                                (512, 64, 2),
                                (512, 64, 64),
                                (512, 64, 69),
                                (1024, 576, 2),
                                (1024, 576, 61),
                                (2046, 2048, 2),
                                (2046, 2048, 64),
                                (2046, 2048, 576)
                            ]
                            # Extra-K shape sweep mirrors the primary sweep; a
                            # (None, None, None) placeholder disables the path.
                            for (extra_s_k, extra_topk, extra_block_size) in ([
                                (512, 64, 2),
                                (512, 64, 64),
                                (512, 64, 69),
                                (1024, 576, 2),
                                (1024, 576, 61),
                                (2046, 2048, 2),
                                (2046, 2048, 64),
                                (2046, 2048, 576)
                            ] if have_extra_k else [(None, None, None)])
                            for b in [4, 74, 321]
                            for s_q in [1, 3]
                            # Non-varlen only sampled at b == 74 and only when
                            # neither topk-length feature is active.
                            for is_varlen in ([True, False] if (b == 74 and not have_topk_len and not have_extra_topk_len) else [True])
                        ]
                        correctness_cases.extend(cur_correctness_cases)
                        # Corner cases: at least one of the three degenerate
                        # flags must be set (see the trailing `if`).
                        cur_corner_cases = [
                            RawTestParam(b, h_q, s_q, 1, s_k, is_varlen, topk,
                                         is_all_indices_invalid=is_all_indices_invalid,
                                         have_zero_seqlen_k=have_zero_seqlen_k,
                                         have_topk_length=have_topk_len,
                                         enable_attn_sink=enable_attn_sink,
                                         extra_s_k=extra_s_k,
                                         extra_topk=extra_topk,
                                         block_size=block_size,
                                         extra_block_size=extra_block_size,
                                         have_extra_topk_length=have_extra_topk_len,
                                         d_qk=d_qk,
                                         check_correctness=True,
                                         num_runs=0,
                                         )
                            for (s_k, topk, block_size) in [
                                (512, 64, 61),
                                (650, 576, 53),
                            ]
                            for (extra_s_k, extra_topk, extra_block_size) in ([
                                (512, 64, 61),
                                (650, 576, 53),
                            ] if have_extra_k else [(None, None, None)])
                            for b in [4, 74, 321]
                            for s_q in [3]
                            for is_varlen in ([True, False] if (b == 74 and not have_topk_len and not have_extra_topk_len) else [True])
                            for is_all_indices_invalid in [True, False]
                            for have_zero_seqlen_k in [True, False]
                            for enable_attn_sink in [True, False]
                            if (is_all_indices_invalid or have_zero_seqlen_k or enable_attn_sink)
                        ]
                        corner_cases.extend(cur_corner_cases)
    # Production configs: each base is replicated across a list of batch sizes
    # (b=0 in the base is a placeholder overwritten by dataclasses.replace).
    base_and_bszs = [
        # V3.2
        (RawTestParam(0, 128, 2, 1, 32768, True, topk=2048, d_qk=576), [2, 64, 74, 128]),
        # MODEL1 CONFIG1
        (RawTestParam(0, 64, 2, 1, 16384, True, topk=128, d_qk=512, extra_s_k=16384, extra_topk=512, block_size=256, extra_block_size=64), [2, 64, 74, 128, 74*2, 256]),
        # MODEL1 CONFIG2
        (RawTestParam(0, 128, 2, 1, 16384, True, topk=128, d_qk=512, extra_s_k=16384, extra_topk=1024, block_size=256, extra_block_size=64), [2, 64, 74, 128, 74*2, 256]),
        # MODEL1 CONFIG3
        (RawTestParam(0, 64, 2, 1, 16384, True, topk=128, d_qk=512, extra_s_k=16384, extra_topk=1024, block_size=256, extra_block_size=2, have_extra_topk_length=True), [2, 64, 74, 128, 74*2, 256]),
        # MODEL1 CONFIG4
        (RawTestParam(0, 128, 2, 1, 16384, True, topk=128, d_qk=512, extra_s_k=16384, extra_topk=1024, block_size=256, extra_block_size=2, have_extra_topk_length=True), [2, 64, 74, 128, 74*2, 256]),
    ]
    performance_cases = [
        # Production cases
        dataclasses.replace(base, b=b)
        for base, bszs in base_and_bszs
        for b in bszs
    ] + [
        # Peak perf cases
        RawTestParam(74*2, h_q, 2, 1, 32768, True, topk=16384, d_qk=d_qk)
        for h_q in [64, 128]
        for d_qk in [512, 576]
    ]
    return correctness_cases + corner_cases + performance_cases
@dataclasses.dataclass
class Result:
    """Outcome of one testcase run.

    The performance fields are all 0.0 when the testcase only checked
    correctness (num_runs == 0).
    """
    is_correct: bool                 # True when correctness passed (or was skipped)
    compute_memory_ratio: float      # theoretical FLOP / memory-byte ratio
    time_usage_per_us: float         # end-to-end time per run, microseconds
    splitkv_time_usage_us: float     # split-KV kernel time, microseconds (0.0 if absent)
    combine_time_usage_us: float     # combine kernel time, microseconds (0.0 if absent)
    achieved_tflops: float           # measured TFLOP/s over the e2e time
    achieved_gBps: float             # measured GB/s over the e2e time
# Module-level counter used to derive a fresh per-testcase seed when p.seed == -1.
_counter = kk.Counter()
@torch.inference_mode()
def test_flash_mla(p: TestParam) -> Result:
    """Run a single decode testcase.

    Depending on the flags in ``p`` this (1) runs the kernel once and compares
    output/LSE against the reference implementation (``p.check_correctness``),
    and/or (2) benchmarks the kernel for ``p.num_runs`` iterations.

    Returns a Result; the performance fields are all zero when
    ``p.num_runs == 0``, and ``is_correct`` is True when the correctness
    check is skipped.
    """
    if p.seed == -1:
        # Derive a fresh, deterministic seed per testcase.
        global _counter
        p.seed = _counter.next()
    assert p.decode
    print("================")
    print(f"Running on {p}")
    torch.cuda.empty_cache()
    t = lib.generate_testcase_for_decode(p)
    # NOTE(review): called with no arguments — confirm this kernel build's
    # get_mla_metadata() really takes none (upstream FlashMLA's version
    # requires cache-seqlen/head arguments).
    tile_scheduler_metadata, _ = flash_mla.get_mla_metadata()
    def run_decode():
        return lib.run_flash_mla_decode(p, t, tile_scheduler_metadata, None)
    # We first run the kernel once to generate output data for the correctness test.
    # We must do this first, otherwise when allocating tensors for storing answers,
    # it may re-use memory that contains the correct answer, leading to false positives.
    if p.check_correctness:
        torch.cuda.synchronize()
        out_ans, lse_ans = run_decode()
        torch.cuda.synchronize()
    # torch.set_printoptions(profile='full')
    # print(tile_scheduler_metadata.tile_scheduler_metadata[:, :7])
    # We run the performance test before generating the answer for the
    # correctness test to avoid interference.
    # (Fix: the original built this all-zero Result twice — once here and once
    # in a redundant `if p.num_runs == 0` branch.)
    performance_result = Result(True, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0)
    if p.num_runs != 0:
        result = kk.bench_kineto(run_decode, p.num_runs)
        splitkv_kernel_name = "flash_fwd_splitkv_mla_fp8_sparse_kernel"
        combine_kernel_name = "flash_fwd_mla_combine_kernel"
        # Get individual kernel time usages (None = kernel did not appear in trace)
        kernel_time_usages_us: Dict[str, Optional[float]] = {}
        def pick_kernel_time_usage(kernel_name: str):
            # (Fix: renamed the match list from `t`, which shadowed the
            # testcase data `t` in the enclosing scope.)
            matches = [kernel_name in s for s in result.get_kernel_names()]
            if any(matches):
                # Substring must identify exactly one kernel in the trace.
                assert sum(matches) == 1
                kernel_time_usages_us[kernel_name] = result.get_kernel_time(kernel_name) * 1e6
            else:
                kernel_time_usages_us[kernel_name] = None
        pick_kernel_time_usage(splitkv_kernel_name)
        pick_kernel_time_usage(combine_kernel_name)
        # Get E2E time usages
        def have_kernel(name: str):
            return kernel_time_usages_us[name] is not None
        if kk.is_using_profiling_tools():
            # Timings are meaningless under an external profiler; use a dummy.
            e2e_time_usage_us = 1e6
        else:
            assert have_kernel(splitkv_kernel_name)
            if have_kernel(combine_kernel_name):
                e2e_time_usage_us = result.get_e2e_time(splitkv_kernel_name, combine_kernel_name) * 1e6
            else:
                # No combine kernel launched: splitkv time is the e2e time.
                e2e_time_usage_us = kernel_time_usages_us[splitkv_kernel_name]
        assert e2e_time_usage_us is not None
        flops_and_mem_vol = lib.count_flop_and_mem_vol_for_decode(p, t)
        e2e_time_usage_s = e2e_time_usage_us / 1e6
        # (Fix: typo `theoritical` -> `theoretical`; local name only.)
        theoretical_compute_memory_ratio = flops_and_mem_vol.flop / flops_and_mem_vol.mem_vol
        achieved_tflops = flops_and_mem_vol.flop / e2e_time_usage_s / 1e12
        achieved_gBps = flops_and_mem_vol.mem_vol / e2e_time_usage_s / 1e9
        def print_kernel_time_usage(name: str, short_name: str):
            if kernel_time_usages_us[name] is not None:
                print(f'{short_name} time: {kernel_time_usages_us[name]:.1f} us')
        print(f'Compute/Memory: {theoretical_compute_memory_ratio:.2f}')
        print(f'Time (per): {e2e_time_usage_us:.1f} us')
        print_kernel_time_usage(splitkv_kernel_name, "Splitkv")
        print_kernel_time_usage(combine_kernel_name, "Combine")
        print(f'TFlops: {achieved_tflops:.1f}')
        print(f'GB/s: {achieved_gBps:.0f}')
        performance_result = Result(True, theoretical_compute_memory_ratio, e2e_time_usage_us, kernel_time_usages_us[splitkv_kernel_name] or 0.0, kernel_time_usages_us[combine_kernel_name] or 0.0, achieved_tflops, achieved_gBps)
    is_correct = True
    if p.check_correctness:
        torch.cuda.synchronize()
        with torch.profiler.record_function("reference_flash_mla"):
            out_ref, lse_ref = ref.ref_sparse_attn_decode(p, t)
        is_out_correct = kk.check_is_allclose("out", out_ans, out_ref, abs_tol=1e-3, rel_tol=2.01/128, cos_diff_tol=5e-6)
        is_lse_correct = kk.check_is_allclose("lse", lse_ans, lse_ref, abs_tol=1e-6, rel_tol=8.01/65536)
        is_correct &= is_out_correct and is_lse_correct
    performance_result.is_correct = is_correct
    return performance_result
def main():
    """Run every generated testcase, print a rich summary table and a TFlops
    geomean, and exit non-zero if any correctness case failed."""
    import sys
    dtype = torch.bfloat16
    device = torch.device("cuda:0")
    torch.set_default_dtype(dtype)
    torch.set_default_device(device)
    torch.cuda.set_device(device)
    torch.set_float32_matmul_precision('high')
    torch.set_num_threads(32)
    raw_testcases = gen_testcase()
    testcases = [t.to_test_param() for t in raw_testcases]
    print(f"{kk.colors['CYAN_BG']}{len(testcases)} testcases to run{kk.colors['CLEAR']}")
    is_no_cooldown = lib.is_no_cooldown()
    num_testcases_len = len(str(len(testcases)))
    failed_cases = []
    results: List[Tuple[TestParam, Result]] = []
    for testcase_idx, testcase in enumerate(testcases):
        # (Fix: skip the cooldown by position; the original compared
        # `testcase != testcases[0]`, which also skipped the cooldown for any
        # later duplicate of the first testcase.)
        if testcase_idx > 0 and testcase.num_runs > 0 and not is_no_cooldown:
            time.sleep(0.3)  # Cooldown
        print(f"[{testcase_idx+1:{num_testcases_len}d}/{len(testcases)}, {testcase_idx/len(testcases)*100:3.0f}%] ", end='')
        result = test_flash_mla(testcase)
        results.append((testcase, result))
        if not result.is_correct:
            # (Fix: the original called sys.exit(1) here, aborting on the first
            # failure — which made the summary table and the failed-case report
            # below unreachable. Record the failure and keep going; the
            # non-zero exit happens after the report.)
            failed_cases.append(testcase)
    console = rich.console.Console(width=120)
    table = rich.table.Table(show_header=True, header_style="bold cyan")
    table.add_column("topk")
    table.add_column("Bsz")
    table.add_column("h_q&k")
    table.add_column("sq")
    table.add_column("sk")
    table.add_column("d_qk")
    table.add_column("Feats")
    table.add_column("C/M")
    table.add_column("TFlops")
    table.add_column("GBps")
    table.add_column("us")
    table.add_column(" ")
    for testcase, result in results:
        assert testcase.decode
        topk_str = f"{testcase.topk}" if testcase.decode.extra_topk is None else f"{testcase.topk}+{testcase.decode.extra_topk}"
        table.add_row(
            topk_str,
            str(testcase.decode.b),
            f"{testcase.h_q:3d} {testcase.h_kv}",
            str(testcase.s_q),
            str(testcase.s_kv),
            str(testcase.d_qk),
            # Feature flags: V=varlen, L=topk-length, E=extra-topk-length
            " V"[testcase.decode.is_varlen] + " L"[testcase.have_topk_length] + " E"[testcase.decode.have_extra_topk_length],
            f"{result.compute_memory_ratio:3.0f}",
            f"{result.achieved_tflops:3.0f}",
            f"{result.achieved_gBps:4.0f}",
            f"{result.time_usage_per_us:4.1f}",
            "" if result.is_correct else "X"
        )
    console.print(table)
    def geomean(l) -> float:
        import numpy
        return numpy.exp(numpy.mean(numpy.log(l)))
    num_correct_testcases = [result.is_correct for t, result in results if t.check_correctness].count(True)
    num_correctness_cases = sum([1 for t in testcases if t.check_correctness])
    if num_correct_testcases == num_correctness_cases:
        print(f"{kk.colors['GREEN_BG']}{num_correct_testcases}/{num_correctness_cases} correctness cases passed{kk.colors['CLEAR']}")
    else:
        print(f"{kk.colors['RED_BG']}{num_correct_testcases}/{num_correctness_cases} correctness cases passed{kk.colors['CLEAR']}")
        for t in failed_cases:
            print(f"\t{t},")
    valid_achieved_tflops = [result.achieved_tflops for _, result in results if result.achieved_tflops > 0.1]
    if len(valid_achieved_tflops) > 0:
        achieved_tflops_geomean = geomean(valid_achieved_tflops)  # > 0.1 to prune out correctness cases
        print(f"TFlops geomean: {achieved_tflops_geomean:.1f}")
    if failed_cases:
        sys.exit(1)
if __name__ == "__main__":
main()