diff --git "a/autotune_cache.json" "b/autotune_cache.json" new file mode 100644--- /dev/null +++ "b/autotune_cache.json" @@ -0,0 +1,24360 @@ +{ + "*": { + "flex_gemm.kernels.triton.spconv.sparse_submanifold_conv_bwd_implicit_gemm.sparse_submanifold_conv_bwd_input_implicit_gemm_kernel": { + "(7, 1024, 1024, 27, True, 'torch.float16', 'torch.float16', 'torch.uint32', 'torch.float16')": { + "kwargs": { + "B1": 64, + "B2": 64, + "BK": 64 + }, + "num_warps": 2, + "num_ctas": 1, + "num_stages": 4, + "num_buffers_warp_spec": 0, + "num_consumer_groups": 0, + "reg_dec_producer": 0, + "reg_inc_consumer": 0, + "maxnreg": null, + "pre_hook": null + }, + "(9, 1024, 1024, 27, True, 'torch.float16', 'torch.float16', 'torch.uint32', 'torch.float16')": { + "kwargs": { + "B1": 64, + "B2": 64, + "BK": 64 + }, + "num_warps": 2, + "num_ctas": 1, + "num_stages": 4, + "num_buffers_warp_spec": 0, + "num_consumer_groups": 0, + "reg_dec_producer": 0, + "reg_inc_consumer": 0, + "maxnreg": null, + "pre_hook": null + }, + "(11, 1024, 1024, 27, True, 'torch.float16', 'torch.float16', 'torch.uint32', 'torch.float16')": { + "kwargs": { + "B1": 64, + "B2": 256, + "BK": 64 + }, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 4, + "num_buffers_warp_spec": 0, + "num_consumer_groups": 0, + "reg_dec_producer": 0, + "reg_inc_consumer": 0, + "maxnreg": null, + "pre_hook": null + }, + "(13, 1024, 1024, 27, True, 'torch.float16', 'torch.float16', 'torch.uint32', 'torch.float16')": { + "kwargs": { + "B1": 128, + "B2": 128, + "BK": 64 + }, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 4, + "num_buffers_warp_spec": 0, + "num_consumer_groups": 0, + "reg_dec_producer": 0, + "reg_inc_consumer": 0, + "maxnreg": null, + "pre_hook": null + }, + "(15, 512, 512, 27, True, 'torch.float16', 'torch.float16', 'torch.uint32', 'torch.float16')": { + "kwargs": { + "B1": 128, + "B2": 128, + "BK": 64 + }, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 4, + "num_buffers_warp_spec": 0, + "num_consumer_groups": 0, + "reg_dec_producer": 0, + "reg_inc_consumer": 0, + "maxnreg": null, + "pre_hook": null + }, + "(17, 256, 256, 27, True, 'torch.float16', 'torch.float16', 'torch.uint32', 'torch.float16')": { + "kwargs": { + "B1": 128, + "B2": 128, + "BK": 64 + }, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 4, + "num_buffers_warp_spec": 0, + "num_consumer_groups": 0, + "reg_dec_producer": 0, + "reg_inc_consumer": 0, + "maxnreg": null, + "pre_hook": null + }, + "(7, 1024, 1024, 27, True, 'torch.float32', 'torch.float32', 'torch.uint32', 'torch.float32')": { + "kwargs": { + "B1": 64, + "B2": 64, + "BK": 64 + }, + "num_warps": 2, + "num_ctas": 1, + "num_stages": 4, + "num_buffers_warp_spec": 0, + "num_consumer_groups": 0, + "reg_dec_producer": 0, + "reg_inc_consumer": 0, + "maxnreg": null, + "pre_hook": null + }, + "(9, 1024, 1024, 27, True, 'torch.float32', 'torch.float32', 'torch.uint32', 'torch.float32')": { + "kwargs": { + "B1": 128, + "B2": 128, + "BK": 32 + }, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 4, + "num_buffers_warp_spec": 0, + "num_consumer_groups": 0, + "reg_dec_producer": 0, + "reg_inc_consumer": 0, + "maxnreg": null, + "pre_hook": null + }, + "(11, 1024, 1024, 27, True, 'torch.float32', 'torch.float32', 'torch.uint32', 'torch.float32')": { + "kwargs": { + "B1": 128, + "B2": 128, + "BK": 32 + }, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 4, + "num_buffers_warp_spec": 0, + "num_consumer_groups": 0, + "reg_dec_producer": 0, + "reg_inc_consumer": 0, + "maxnreg": null, + "pre_hook": null + }, + "(13, 1024, 1024, 27, True, 'torch.float32', 'torch.float32', 'torch.uint32', 'torch.float32')": { + "kwargs": { + "B1": 128, + "B2": 128, + "BK": 32 + }, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 4, + "num_buffers_warp_spec": 0, + "num_consumer_groups": 0, + "reg_dec_producer": 0, + "reg_inc_consumer": 0, + "maxnreg": null, + "pre_hook": null + }, + "(15, 512, 512, 27, True, 'torch.float32', 'torch.float32', 'torch.uint32', 'torch.float32')": { + "kwargs": { + "B1": 128, + "B2": 128, + "BK": 32 + }, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 4, + "num_buffers_warp_spec": 0, + "num_consumer_groups": 0, + "reg_dec_producer": 0, + "reg_inc_consumer": 0, + "maxnreg": null, + "pre_hook": null + }, + "(17, 256, 256, 27, True, 'torch.float32', 'torch.float32', 'torch.uint32', 'torch.float32')": { + "kwargs": { + "B1": 128, + "B2": 128, + "BK": 32 + }, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 4, + "num_buffers_warp_spec": 0, + "num_consumer_groups": 0, + "reg_dec_producer": 0, + "reg_inc_consumer": 0, + "maxnreg": null, + "pre_hook": null + } + }, + "flex_gemm.kernels.triton.spconv.sparse_submanifold_conv_bwd_implicit_gemm.sparse_submanifold_conv_bwd_weight_implicit_gemm_kernel": { + "(7, 1024, 1024, 27, True, 'torch.float16', 'torch.float16', 'torch.uint32', 'torch.float16')": { + "kwargs": { + "B1": 128, + "B2": 128, + "BK": 32 + }, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 4, + "num_buffers_warp_spec": 0, + "num_consumer_groups": 0, + "reg_dec_producer": 0, + "reg_inc_consumer": 0, + "maxnreg": null, + "pre_hook": null + }, + "(9, 1024, 1024, 27, True, 'torch.float16', 'torch.float16', 'torch.uint32', 'torch.float16')": { + "kwargs": { + "B1": 256, + "B2": 64, + "BK": 32 + }, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 4, + "num_buffers_warp_spec": 0, + "num_consumer_groups": 0, + "reg_dec_producer": 0, + "reg_inc_consumer": 0, + "maxnreg": null, + "pre_hook": null + }, + "(11, 1024, 1024, 27, True, 'torch.float16', 'torch.float16', 'torch.uint32', 'torch.float16')": { + "kwargs": { + "B1": 256, + "B2": 64, + "BK": 32 + }, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 4, + "num_buffers_warp_spec": 0, + "num_consumer_groups": 0, + "reg_dec_producer": 0, + "reg_inc_consumer": 0, + "maxnreg": null, + "pre_hook": null + }, + "(13, 1024, 1024, 27, True, 'torch.float16', 'torch.float16', 'torch.uint32', 'torch.float16')": { + "kwargs": { + "B1": 256, + "B2": 128, + "BK": 64 + }, + "num_warps": 8, + "num_ctas": 1, + "num_stages": 4, + "num_buffers_warp_spec": 0, + "num_consumer_groups": 0, + "reg_dec_producer": 0, + "reg_inc_consumer": 0, + "maxnreg": null, + "pre_hook": null + }, + "(15, 512, 512, 27, True, 'torch.float16', 'torch.float16', 'torch.uint32', 'torch.float16')": { + "kwargs": { + "B1": 256, + "B2": 64, + "BK": 32 + }, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 4, + "num_buffers_warp_spec": 0, + "num_consumer_groups": 0, + "reg_dec_producer": 0, + "reg_inc_consumer": 0, + "maxnreg": null, + "pre_hook": null + }, + "(17, 256, 256, 27, True, 'torch.float16', 'torch.float16', 'torch.uint32', 'torch.float16')": { + "kwargs": { + "B1": 128, + "B2": 64, + "BK": 32 + }, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 4, + "num_buffers_warp_spec": 0, + "num_consumer_groups": 0, + "reg_dec_producer": 0, + "reg_inc_consumer": 0, + "maxnreg": null, + "pre_hook": null + }, + "(7, 1024, 1024, 27, True, 'torch.float32', 'torch.float32', 'torch.uint32', 'torch.float32')": { + "kwargs": { + "B1": 128, + "B2": 128, + "BK": 32 + }, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 4, + "num_buffers_warp_spec": 0, + "num_consumer_groups": 0, + "reg_dec_producer": 0, + "reg_inc_consumer": 0, + "maxnreg": null, + "pre_hook": null + }, + "(9, 1024, 1024, 27, True, 'torch.float32', 'torch.float32', 'torch.uint32', 'torch.float32')": { + "kwargs": { + "B1": 256, + "B2": 128, + "BK": 32 + }, + "num_warps": 8, + "num_ctas": 1, + "num_stages": 4, + "num_buffers_warp_spec": 0, + "num_consumer_groups": 0, + "reg_dec_producer": 0, + "reg_inc_consumer": 0, + "maxnreg": null, + "pre_hook": null + }, + "(11, 1024, 1024, 27, True, 'torch.float32', 'torch.float32', 'torch.uint32', 'torch.float32')": { + "kwargs": { + "B1": 256, + "B2": 128, + "BK": 32 + }, + "num_warps": 8, + "num_ctas": 1, + "num_stages": 4, + "num_buffers_warp_spec": 0, + "num_consumer_groups": 0, + "reg_dec_producer": 0, + "reg_inc_consumer": 0, + "maxnreg": null, + "pre_hook": null + }, + "(13, 1024, 1024, 27, True, 'torch.float32', 'torch.float32', 'torch.uint32', 'torch.float32')": { + "kwargs": { + "B1": 256, + "B2": 128, + "BK": 32 + }, + "num_warps": 8, + "num_ctas": 1, + "num_stages": 4, + "num_buffers_warp_spec": 0, + "num_consumer_groups": 0, + "reg_dec_producer": 0, + "reg_inc_consumer": 0, + "maxnreg": null, + "pre_hook": null + }, + "(15, 512, 512, 27, True, 'torch.float32', 'torch.float32', 'torch.uint32', 'torch.float32')": { + "kwargs": { + "B1": 256, + "B2": 128, + "BK": 32 + }, + "num_warps": 8, + "num_ctas": 1, + "num_stages": 4, + "num_buffers_warp_spec": 0, + "num_consumer_groups": 0, + "reg_dec_producer": 0, + "reg_inc_consumer": 0, + "maxnreg": null, + "pre_hook": null + }, + "(17, 256, 256, 27, True, 'torch.float32', 'torch.float32', 'torch.uint32', 'torch.float32')": { + "kwargs": { + "B1": 256, + "B2": 64, + "BK": 32 + }, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 4, + "num_buffers_warp_spec": 0, + "num_consumer_groups": 0, + "reg_dec_producer": 0, + "reg_inc_consumer": 0, + "maxnreg": null, + "pre_hook": null + } + }, + "flex_gemm.kernels.triton.spconv.sparse_submanifold_conv_bwd_implicit_gemm_splitk.sparse_submanifold_conv_bwd_input_implicit_gemm_kernel": { + "(7, 1024, 1024, 27, True, 'torch.float16', 'torch.float16', 'torch.uint32', 'torch.float16')": { + "kwargs": { + "B1": 64, + "B2": 64, + "BK": 64 + }, + "num_warps": 2, + "num_ctas": 1, + "num_stages": 4, + "num_buffers_warp_spec": 0, + "num_consumer_groups": 0, + "reg_dec_producer": 0, + "reg_inc_consumer": 0, + "maxnreg": null, + "pre_hook": null + }, + "(9, 1024, 1024, 27, True, 'torch.float16', 'torch.float16', 'torch.uint32', 'torch.float16')": { + "kwargs": { + "B1": 64, + "B2": 64, + "BK": 64 + }, + "num_warps": 2, + "num_ctas": 1, + "num_stages": 4, + "num_buffers_warp_spec": 0, + "num_consumer_groups": 0, + "reg_dec_producer": 0, + "reg_inc_consumer": 0, + "maxnreg": null, + "pre_hook": null + }, + "(11, 1024, 1024, 27, True, 'torch.float16', 'torch.float16', 'torch.uint32', 'torch.float16')": { + "kwargs": { + "B1": 64, + "B2": 256, + "BK": 64 + }, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 4, + "num_buffers_warp_spec": 0, + "num_consumer_groups": 0, + "reg_dec_producer": 0, + "reg_inc_consumer": 0, + "maxnreg": null, + "pre_hook": null + }, + "(13, 1024, 1024, 27, True, 'torch.float16', 'torch.float16', 'torch.uint32', 'torch.float16')": { + "kwargs": { + "B1": 128, + "B2": 128, + "BK": 64 + }, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 4, + "num_buffers_warp_spec": 0, + "num_consumer_groups": 0, + "reg_dec_producer": 0, + "reg_inc_consumer": 0, + "maxnreg": null, + "pre_hook": null + }, + "(15, 512, 512, 27, True, 'torch.float16', 'torch.float16', 'torch.uint32', 'torch.float16')": { + "kwargs": { + "B1": 128, + "B2": 128, + "BK": 64 + }, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 4, + "num_buffers_warp_spec": 0, + "num_consumer_groups": 0, + "reg_dec_producer": 0, + "reg_inc_consumer": 0, + "maxnreg": null, + "pre_hook": null + }, + "(17, 256, 256, 27, True, 'torch.float16', 'torch.float16', 'torch.uint32', 'torch.float16')": { + "kwargs": { + "B1": 128, + "B2": 128, + "BK": 64 + }, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 4, + "num_buffers_warp_spec": 0, + "num_consumer_groups": 0, + "reg_dec_producer": 0, + "reg_inc_consumer": 0, + "maxnreg": null, + "pre_hook": null + }, + "(7, 1024, 1024, 27, True, 'torch.float32', 'torch.float32', 'torch.uint32', 'torch.float32')": { + "kwargs": { + "B1": 64, + "B2": 64, + "BK": 64 + }, + "num_warps": 2, + "num_ctas": 1, + "num_stages": 4, + "num_buffers_warp_spec": 0, + "num_consumer_groups": 0, + "reg_dec_producer": 0, + "reg_inc_consumer": 0, + "maxnreg": null, + "pre_hook": null + }, + "(9, 1024, 1024, 27, True, 'torch.float32', 'torch.float32', 'torch.uint32', 'torch.float32')": { + "kwargs": { + "B1": 128, + "B2": 128, + "BK": 32 + }, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 4, + "num_buffers_warp_spec": 0, + "num_consumer_groups": 0, + "reg_dec_producer": 0, + "reg_inc_consumer": 0, + "maxnreg": null, + "pre_hook": null + }, + "(11, 1024, 1024, 27, True, 'torch.float32', 'torch.float32', 'torch.uint32', 'torch.float32')": { + "kwargs": { + "B1": 128, + "B2": 128, + "BK": 32 + }, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 4, + "num_buffers_warp_spec": 0, + "num_consumer_groups": 0, + "reg_dec_producer": 0, + "reg_inc_consumer": 0, + "maxnreg": null, + "pre_hook": null + }, + "(13, 1024, 1024, 27, True, 'torch.float32', 'torch.float32', 'torch.uint32', 'torch.float32')": { + "kwargs": { + "B1": 128, + "B2": 128, + "BK": 32 + }, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 4, + "num_buffers_warp_spec": 0, + "num_consumer_groups": 0, + "reg_dec_producer": 0, + "reg_inc_consumer": 0, + "maxnreg": null, + "pre_hook": null + }, + "(15, 512, 512, 27, True, 'torch.float32', 'torch.float32', 'torch.uint32', 'torch.float32')": { + "kwargs": { + "B1": 128, + "B2": 128, + "BK": 32 + }, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 4, + "num_buffers_warp_spec": 0, + "num_consumer_groups": 0, + "reg_dec_producer": 0, + "reg_inc_consumer": 0, + "maxnreg": null, + "pre_hook": null + }, + "(17, 256, 256, 27, True, 'torch.float32', 'torch.float32', 'torch.uint32', 'torch.float32')": { + "kwargs": { + "B1": 128, + "B2": 128, + "BK": 32 + }, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 4, + "num_buffers_warp_spec": 0, + "num_consumer_groups": 0, + "reg_dec_producer": 0, + "reg_inc_consumer": 0, + "maxnreg": null, + "pre_hook": null + } + }, + "flex_gemm.kernels.triton.spconv.sparse_submanifold_conv_bwd_implicit_gemm_splitk.sparse_submanifold_conv_bwd_weight_implicit_gemm_kernel": { + "(7, 1024, 1024, 27, True, 'torch.float16', 'torch.float16', 'torch.uint32', 'torch.float16')": { + "kwargs": { + "B1": 128, + "B2": 128, + "BK": 32 + }, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 4, + "num_buffers_warp_spec": 0, + "num_consumer_groups": 0, + "reg_dec_producer": 0, + "reg_inc_consumer": 0, + "maxnreg": null, + "pre_hook": null + }, + "(9, 1024, 1024, 27, True, 'torch.float16', 'torch.float16', 'torch.uint32', 'torch.float16')": { + "kwargs": { + "B1": 256, + "B2": 64, + "BK": 32 + }, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 4, + "num_buffers_warp_spec": 0, + "num_consumer_groups": 0, + "reg_dec_producer": 0, + "reg_inc_consumer": 0, + "maxnreg": null, + "pre_hook": null + }, + "(11, 1024, 1024, 27, True, 'torch.float16', 'torch.float16', 'torch.uint32', 'torch.float16')": { + "kwargs": { + "B1": 256, + "B2": 64, + "BK": 32 + }, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 4, + "num_buffers_warp_spec": 0, + "num_consumer_groups": 0, + "reg_dec_producer": 0, + "reg_inc_consumer": 0, + "maxnreg": null, + "pre_hook": null + }, + "(13, 1024, 1024, 27, True, 'torch.float16', 'torch.float16', 'torch.uint32', 'torch.float16')": { + "kwargs": { + "B1": 256, + "B2": 128, + "BK": 64 + }, + "num_warps": 8, + "num_ctas": 1, + "num_stages": 4, + "num_buffers_warp_spec": 0, + "num_consumer_groups": 0, + "reg_dec_producer": 0, + "reg_inc_consumer": 0, + "maxnreg": null, + "pre_hook": null + }, + "(15, 512, 512, 27, True, 'torch.float16', 'torch.float16', 'torch.uint32', 'torch.float16')": { + "kwargs": { + "B1": 256, + "B2": 64, + "BK": 32 + }, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 4, + "num_buffers_warp_spec": 0, + "num_consumer_groups": 0, + "reg_dec_producer": 0, + "reg_inc_consumer": 0, + "maxnreg": null, + "pre_hook": null + }, + "(17, 256, 256, 27, True, 'torch.float16', 'torch.float16', 'torch.uint32', 'torch.float16')": { + "kwargs": { + "B1": 128, + "B2": 64, + "BK": 32 + }, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 4, + "num_buffers_warp_spec": 0, + "num_consumer_groups": 0, + "reg_dec_producer": 0, + "reg_inc_consumer": 0, + "maxnreg": null, + "pre_hook": null + }, + "(7, 1024, 1024, 27, True, 'torch.float32', 'torch.float32', 'torch.uint32', 'torch.float32')": { + "kwargs": { + "B1": 128, + "B2": 128, + "BK": 32 + }, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 4, + "num_buffers_warp_spec": 0, + "num_consumer_groups": 0, + "reg_dec_producer": 0, + "reg_inc_consumer": 0, + "maxnreg": null, + "pre_hook": null + }, + "(9, 1024, 1024, 27, True, 'torch.float32', 'torch.float32', 'torch.uint32', 'torch.float32')": { + "kwargs": { + "B1": 256, + "B2": 128, + "BK": 32 + }, + "num_warps": 8, + "num_ctas": 1, + "num_stages": 4, + "num_buffers_warp_spec": 0, + "num_consumer_groups": 0, + "reg_dec_producer": 0, + "reg_inc_consumer": 0, + "maxnreg": null, + "pre_hook": null + }, + "(11, 1024, 1024, 27, True, 'torch.float32', 'torch.float32', 'torch.uint32', 'torch.float32')": { + "kwargs": { + "B1": 256, + "B2": 128, + "BK": 32 + }, + "num_warps": 8, + "num_ctas": 1, + "num_stages": 4, + "num_buffers_warp_spec": 0, + "num_consumer_groups": 0, + "reg_dec_producer": 0, + "reg_inc_consumer": 0, + "maxnreg": null, + "pre_hook": null + }, + "(13, 1024, 1024, 27, True, 'torch.float32', 'torch.float32', 'torch.uint32', 'torch.float32')": { + "kwargs": { + "B1": 256, + "B2": 128, + "BK": 32 + }, + "num_warps": 8, + "num_ctas": 1, + "num_stages": 4, + "num_buffers_warp_spec": 0, + "num_consumer_groups": 0, + "reg_dec_producer": 0, + "reg_inc_consumer": 0, + "maxnreg": null, + "pre_hook": null + }, + "(15, 512, 512, 27, True, 'torch.float32', 'torch.float32', 'torch.uint32', 'torch.float32')": { + "kwargs": { + "B1": 256, + "B2": 128, + "BK": 32 + }, + "num_warps": 8, + "num_ctas": 1, + "num_stages": 4, + "num_buffers_warp_spec": 0, + "num_consumer_groups": 0, + "reg_dec_producer": 0, + "reg_inc_consumer": 0, + "maxnreg": null, + "pre_hook": null + }, + "(17, 256, 256, 27, True, 'torch.float32', 'torch.float32', 'torch.uint32', 'torch.float32')": { + "kwargs": { + "B1": 256, + "B2": 64, + "BK": 32 + }, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 4, + "num_buffers_warp_spec": 0, + "num_consumer_groups": 0, + "reg_dec_producer": 0, + "reg_inc_consumer": 0, + "maxnreg": null, + "pre_hook": null + } + }, + "flex_gemm.kernels.triton.spconv.sparse_submanifold_conv_bwd_implicit_gemm_splitk.sparse_submanifold_conv_bwd_input_implicit_gemm_splitk_kernel": { + "(7, 1024, 1024, 27, 4, True, 'torch.float16', 'torch.float16', 'torch.uint32', 'torch.float32')": { + "kwargs": { + "B1": 64, + "B2": 64, + "BK": 64 + }, + "num_warps": 2, + "num_ctas": 1, + "num_stages": 4, + "num_buffers_warp_spec": 0, + "num_consumer_groups": 0, + "reg_dec_producer": 0, + "reg_inc_consumer": 0, + "maxnreg": null, + "pre_hook": null + }, + "(7, 1024, 1024, 27, 8, True, 'torch.float16', 'torch.float16', 'torch.uint32', 'torch.float32')": { + "kwargs": { + "B1": 64, + "B2": 64, + "BK": 64 + }, + "num_warps": 2, + "num_ctas": 1, + "num_stages": 4, + "num_buffers_warp_spec": 0, + "num_consumer_groups": 0, + "reg_dec_producer": 0, + "reg_inc_consumer": 0, + "maxnreg": null, + "pre_hook": null + }, + "(7, 1024, 1024, 27, 16, True, 'torch.float16', 'torch.float16', 'torch.uint32', 'torch.float32')": { + "kwargs": { + "B1": 64, + "B2": 256, + "BK": 64 + }, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 4, + "num_buffers_warp_spec": 0, + "num_consumer_groups": 0, + "reg_dec_producer": 0, + "reg_inc_consumer": 0, + "maxnreg": null, + "pre_hook": null + }, + "(7, 1024, 1024, 27, 32, True, 'torch.float16', 'torch.float16', 'torch.uint32', 'torch.float32')": { + "kwargs": { + "B1": 64, + "B2": 256, + "BK": 64 + }, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 4, + "num_buffers_warp_spec": 0, + "num_consumer_groups": 0, + "reg_dec_producer": 0, + "reg_inc_consumer": 0, + "maxnreg": null, + "pre_hook": null + }, + "(7, 1024, 1024, 27, 64, True, 'torch.float16', 'torch.float16', 'torch.uint32', 'torch.float32')": { + "kwargs": { + "B1": 64, + "B2": 256, + "BK": 64 + }, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 4, + "num_buffers_warp_spec": 0, + "num_consumer_groups": 0, + "reg_dec_producer": 0, + "reg_inc_consumer": 0, + "maxnreg": null, + "pre_hook": null + }, + "(7, 1024, 1024, 27, 128, True, 'torch.float16', 'torch.float16', 'torch.uint32', 'torch.float32')": { + "kwargs": { + "B1": 64, + "B2": 256, + "BK": 64 + }, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 4, + "num_buffers_warp_spec": 0, + "num_consumer_groups": 0, + "reg_dec_producer": 0, + "reg_inc_consumer": 0, + "maxnreg": null, + "pre_hook": null + }, + "(9, 1024, 1024, 27, 2, True, 'torch.float16', 'torch.float16', 'torch.uint32', 'torch.float32')": { + "kwargs": { + "B1": 64, + "B2": 128, + "BK": 32 + }, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 4, + "num_buffers_warp_spec": 0, + "num_consumer_groups": 0, + "reg_dec_producer": 0, + "reg_inc_consumer": 0, + "maxnreg": null, + "pre_hook": null + }, + "(9, 1024, 1024, 27, 4, True, 'torch.float16', 'torch.float16', 'torch.uint32', 'torch.float32')": { + "kwargs": { + "B1": 128, + "B2": 128, + "BK": 64 + }, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 4, + "num_buffers_warp_spec": 0, + "num_consumer_groups": 0, + "reg_dec_producer": 0, + "reg_inc_consumer": 0, + "maxnreg": null, + "pre_hook": null + }, + "(9, 1024, 1024, 27, 8, True, 'torch.float16', 'torch.float16', 'torch.uint32', 'torch.float32')": { + "kwargs": { + "B1": 128, + "B2": 128, + "BK": 64 + }, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 4, + "num_buffers_warp_spec": 0, + "num_consumer_groups": 0, + "reg_dec_producer": 0, + "reg_inc_consumer": 0, + "maxnreg": null, + "pre_hook": null + }, + "(9, 1024, 1024, 27, 16, True, 'torch.float16', 'torch.float16', 'torch.uint32', 'torch.float32')": { + "kwargs": { + "B1": 128, + "B2": 128, + "BK": 64 + }, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 4, + "num_buffers_warp_spec": 0, + "num_consumer_groups": 0, + "reg_dec_producer": 0, + "reg_inc_consumer": 0, + "maxnreg": null, + "pre_hook": null + }, + "(9, 1024, 1024, 27, 32, True, 'torch.float16', 'torch.float16', 'torch.uint32', 'torch.float32')": { + "kwargs": { + "B1": 128, + "B2": 128, + "BK": 64 + }, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 4, + "num_buffers_warp_spec": 0, + "num_consumer_groups": 0, + "reg_dec_producer": 0, + "reg_inc_consumer": 0, + "maxnreg": null, + "pre_hook": null + }, + "(11, 1024, 1024, 27, 2, True, 'torch.float16', 'torch.float16', 'torch.uint32', 'torch.float32')": { + "kwargs": { + "B1": 128, + "B2": 128, + "BK": 64 + }, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 4, + "num_buffers_warp_spec": 0, + "num_consumer_groups": 0, + "reg_dec_producer": 0, + "reg_inc_consumer": 0, + "maxnreg": null, + "pre_hook": null + }, + "(11, 1024, 1024, 27, 4, True, 'torch.float16', 'torch.float16', 'torch.uint32', 'torch.float32')": { + "kwargs": { + "B1": 128, + "B2": 128, + "BK": 64 + }, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 4, + "num_buffers_warp_spec": 0, + "num_consumer_groups": 0, + "reg_dec_producer": 0, + "reg_inc_consumer": 0, + "maxnreg": null, + "pre_hook": null + }, + "(11, 1024, 1024, 27, 8, True, 'torch.float16', 'torch.float16', 'torch.uint32', 'torch.float32')": { + "kwargs": { + "B1": 128, + "B2": 128, + "BK": 64 + }, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 4, + "num_buffers_warp_spec": 0, + "num_consumer_groups": 0, + "reg_dec_producer": 0, + "reg_inc_consumer": 0, + "maxnreg": null, + "pre_hook": null + }, + "(13, 1024, 1024, 27, 2, True, 'torch.float16', 'torch.float16', 'torch.uint32', 'torch.float32')": { + "kwargs": { + "B1": 128, + "B2": 256, + "BK": 64 + }, + "num_warps": 8, + "num_ctas": 1, + "num_stages": 4, + "num_buffers_warp_spec": 0, + "num_consumer_groups": 0, + "reg_dec_producer": 0, + "reg_inc_consumer": 0, + "maxnreg": null, + "pre_hook": null + }, + "(7, 1024, 1024, 27, 8, True, 'torch.float32', 'torch.float32', 'torch.uint32', 'torch.float32')": { + "kwargs": { + "B1": 64, + "B2": 128, + "BK": 32 + }, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 4, + "num_buffers_warp_spec": 0, + "num_consumer_groups": 0, + "reg_dec_producer": 0, + "reg_inc_consumer": 0, + "maxnreg": null, + "pre_hook": null + }, + "(9, 1024, 1024, 27, 8, True, 'torch.float32', 'torch.float32', 'torch.uint32', 'torch.float32')": { + "kwargs": { + "B1": 128, + "B2": 128, + "BK": 32 + }, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 4, + "num_buffers_warp_spec": 0, + "num_consumer_groups": 0, + "reg_dec_producer": 0, + "reg_inc_consumer": 0, + "maxnreg": null, + "pre_hook": null + }, + "(11, 1024, 1024, 27, 4, True, 'torch.float32', 'torch.float32', 'torch.uint32', 'torch.float32')": { + "kwargs": { + "B1": 64, + "B2": 64, + "BK": 64 + }, + "num_warps": 2, + "num_ctas": 1, + "num_stages": 4, + "num_buffers_warp_spec": 0, + "num_consumer_groups": 0, + "reg_dec_producer": 0, + "reg_inc_consumer": 0, + "maxnreg": null, + "pre_hook": null + } + }, + "flex_gemm.kernels.triton.spconv.sparse_submanifold_conv_bwd_implicit_gemm_splitk.sparse_submanifold_conv_bwd_weight_implicit_gemm_splitk_kernel": { + "(7, 1024, 1024, 27, 2, True, 'torch.float16', 'torch.float16', 'torch.uint32', 'torch.float32')": { + "kwargs": { + "B1": 128, + "B2": 64, + "BK": 32 + }, + "num_warps": 2, + "num_ctas": 1, + "num_stages": 4, + "num_buffers_warp_spec": 0, + "num_consumer_groups": 0, + "reg_dec_producer": 0, + "reg_inc_consumer": 0, + "maxnreg": null, + "pre_hook": null + }, + "(9, 1024, 1024, 27, 2, True, 'torch.float16', 'torch.float16', 'torch.uint32', 'torch.float32')": { + "kwargs": { + "B1": 256, + "B2": 64, + "BK": 32 + }, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 4, + "num_buffers_warp_spec": 0, + "num_consumer_groups": 0, + "reg_dec_producer": 0, + "reg_inc_consumer": 0, + "maxnreg": null, + "pre_hook": null + }, + "(11, 1024, 1024, 27, 2, True, 'torch.float16', 'torch.float16', 'torch.uint32', 'torch.float32')": { + "kwargs": { + "B1": 256, + "B2": 64, + "BK": 32 + }, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 4, + "num_buffers_warp_spec": 0, + "num_consumer_groups": 0, + "reg_dec_producer": 0, + "reg_inc_consumer": 0, + "maxnreg": null, + "pre_hook": null + }, + "(13, 1024, 1024, 27, 2, True, 'torch.float16', 'torch.float16', 'torch.uint32', 'torch.float32')": { + "kwargs": { + "B1": 64, + "B2": 64, + "BK": 32 + }, + "num_warps": 2, + "num_ctas": 1, + "num_stages": 4, + "num_buffers_warp_spec": 0, + "num_consumer_groups": 0, + "reg_dec_producer": 0, + "reg_inc_consumer": 0, + "maxnreg": null, + "pre_hook": null + }, + "(15, 512, 512, 27, 2, True, 'torch.float16', 'torch.float16', 'torch.uint32', 'torch.float32')": { + "kwargs": { + "B1": 256, + "B2": 64, + "BK": 32 + }, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 4, + "num_buffers_warp_spec": 0, + "num_consumer_groups": 0, + "reg_dec_producer": 0, + "reg_inc_consumer": 0, + "maxnreg": null, + "pre_hook": null + }, + "(15, 512, 512, 27, 4, True, 'torch.float16', 'torch.float16', 'torch.uint32', 'torch.float32')": { + "kwargs": { + "B1": 256, + "B2": 64, + "BK": 32 + }, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 4, + "num_buffers_warp_spec": 0, + "num_consumer_groups": 0, + "reg_dec_producer": 0, + "reg_inc_consumer": 0, + "maxnreg": null, + "pre_hook": null + }, + "(15, 512, 512, 27, 8, True, 'torch.float16', 'torch.float16', 'torch.uint32', 'torch.float32')": { + "kwargs": { + "B1": 256, + "B2": 128, + "BK": 64 + }, + "num_warps": 8, + "num_ctas": 1, + "num_stages": 4, + "num_buffers_warp_spec": 0, + "num_consumer_groups": 0, + "reg_dec_producer": 0, + "reg_inc_consumer": 0, + "maxnreg": null, + "pre_hook": null + }, + "(17, 256, 256, 27, 2, True, 'torch.float16', 'torch.float16', 'torch.uint32', 'torch.float32')": { + "kwargs": { + "B1": 128, + "B2": 128, + "BK": 32 + }, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 4, + "num_buffers_warp_spec": 0, + "num_consumer_groups": 0, + "reg_dec_producer": 0, + "reg_inc_consumer": 0, + "maxnreg": null, + "pre_hook": null + }, + "(17, 256, 256, 27, 4, True, 'torch.float16', 'torch.float16', 'torch.uint32', 'torch.float32')": { + "kwargs": { + "B1": 128, + "B2": 128, + "BK": 64 + }, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 4, + "num_buffers_warp_spec": 0, + "num_consumer_groups": 0, + "reg_dec_producer": 0, + "reg_inc_consumer": 0, + "maxnreg": null, + "pre_hook": null + }, + "(17, 256, 256, 27, 8, True, 'torch.float16', 'torch.float16', 'torch.uint32', 'torch.float32')": { + "kwargs": { + "B1": 128, + "B2": 128, + "BK": 32 + }, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 4, + "num_buffers_warp_spec": 0, + "num_consumer_groups": 0, + "reg_dec_producer": 0, + "reg_inc_consumer": 0, + "maxnreg": null, + "pre_hook": null + }, + "(17, 256, 256, 27, 16, True, 'torch.float16', 'torch.float16', 'torch.uint32', 'torch.float32')": { + "kwargs": { + "B1": 256, + "B2": 128, + "BK": 32 + }, + "num_warps": 8, + "num_ctas": 1, + "num_stages": 4, + "num_buffers_warp_spec": 0, + "num_consumer_groups": 0, + "reg_dec_producer": 0, + "reg_inc_consumer": 0, + "maxnreg": null, + "pre_hook": null + }, + "(17, 256, 256, 27, 32, True, 'torch.float16', 'torch.float16', 'torch.uint32', 'torch.float32')": { + "kwargs": { + "B1": 128, + "B2": 64, + "BK": 32 + }, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 4, + "num_buffers_warp_spec": 0, + "num_consumer_groups": 0, + "reg_dec_producer": 0, + "reg_inc_consumer": 0, + "maxnreg": null, + "pre_hook": null + }, + "(17, 256, 256, 27, 8, True, 'torch.float32', 'torch.float32', 'torch.uint32', 'torch.float32')": { + "kwargs": { + "B1": 256, + "B2": 128, + "BK": 32 + }, + "num_warps": 8, + "num_ctas": 1, + "num_stages": 4, + "num_buffers_warp_spec": 0, + "num_consumer_groups": 0, + "reg_dec_producer": 0, + "reg_inc_consumer": 0, + "maxnreg": null, + "pre_hook": null + } + }, + "flex_gemm.kernels.triton.spconv.sparse_submanifold_conv_bwd_implicit_gemm_splitk.sparse_submanifold_conv_bwd_input_implicit_gemm_splitk": { + "(2^7, 1024, 1024, 27)": { + "SPLITK": 8 + }, + "(2^9, 1024, 1024, 27)": { + "SPLITK": 8 + }, + "(2^11, 1024, 1024, 27)": { + "SPLITK": 4 + }, + "(2^13, 1024, 1024, 27)": { + "SPLITK": 1 + }, + "(2^15, 512, 512, 27)": { + "SPLITK": 1 + }, + "(2^17, 256, 256, 27)": { + "SPLITK": 1 + } + }, + "flex_gemm.kernels.triton.spconv.sparse_submanifold_conv_bwd_implicit_gemm_splitk.sparse_submanifold_conv_bwd_weight_implicit_gemm_splitk": { + "(2^7, 1024, 1024, 27)": { + "SPLITK": 1 + }, + "(2^9, 1024, 1024, 27)": { + "SPLITK": 1 + }, + "(2^11, 1024, 1024, 27)": { + "SPLITK": 1 + }, + "(2^13, 1024, 1024, 27)": { + "SPLITK": 1 + }, + "(2^15, 512, 512, 27)": { + "SPLITK": 1 + }, + "(2^17, 256, 256, 27)": { + "SPLITK": 8 + } + }, + "flex_gemm.kernels.triton.spconv.sparse_submanifold_conv_bwd_masked_implicit_gemm.sparse_submanifold_conv_bwd_input_masked_implicit_gemm_kernel": { + "(18, 64, 64, 27, True, 'torch.float16', 'torch.float16', 'torch.uint32', 'torch.int64', 'torch.float16')": { + "kwargs": { + "B1": 256, + "B2": 64, + "BK": 32 + }, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 4, + "num_buffers_warp_spec": 0, + "num_consumer_groups": 0, + "reg_dec_producer": 0, + "reg_inc_consumer": 0, + "maxnreg": null, + "pre_hook": null + }, + "(16, 128, 512, 27, True, 'torch.float16', 'torch.float16', 'torch.uint32', 'torch.int64', 'torch.float16')": { + "kwargs": { + "B1": 128, + "B2": 128, + "BK": 64 + }, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 4, + "num_buffers_warp_spec": 0, + "num_consumer_groups": 0, + "reg_dec_producer": 0, + "reg_inc_consumer": 0, + "maxnreg": null, + "pre_hook": null + }, + "(16, 128, 128, 27, True, 'torch.float16', 'torch.float16', 'torch.uint32', 'torch.int64', 'torch.float16')": { + "kwargs": { + "B1": 128, + "B2": 128, + "BK": 64 + }, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 4, + "num_buffers_warp_spec": 0, + "num_consumer_groups": 0, + "reg_dec_producer": 0, + "reg_inc_consumer": 0, + "maxnreg": null, + "pre_hook": null + }, + "(14, 256, 1024, 27, True, 'torch.float16', 'torch.float16', 'torch.uint32', 'torch.int64', 'torch.float16')": { + "kwargs": { + "B1": 64, + "B2": 256, + "BK": 64 + }, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 4, + "num_buffers_warp_spec": 0, + "num_consumer_groups": 0, + "reg_dec_producer": 0, + "reg_inc_consumer": 0, + "maxnreg": null, + "pre_hook": null + }, + "(14, 256, 256, 27, True, 'torch.float16', 'torch.float16', 'torch.uint32', 'torch.int64', 'torch.float16')": { + "kwargs": { + "B1": 128, + "B2": 128, + "BK": 64 + }, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 4, + "num_buffers_warp_spec": 0, + "num_consumer_groups": 0, + "reg_dec_producer": 0, + "reg_inc_consumer": 0, + "maxnreg": null, + "pre_hook": null + }, + "(12, 512, 2048, 27, True, 'torch.float16', 'torch.float16', 'torch.uint32', 'torch.int64', 'torch.float16')": { + "kwargs": { + "B1": 128, + "B2": 128, + "BK": 64 + }, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 4, + "num_buffers_warp_spec": 0, + "num_consumer_groups": 0, + "reg_dec_producer": 0, + "reg_inc_consumer": 0, + "maxnreg": null, + "pre_hook": null + }, + "(12, 512, 512, 27, True, 'torch.float16', 'torch.float16', 'torch.uint32', 'torch.int64', 'torch.float16')": { + "kwargs": { + "B1": 128, + "B2": 128, + "BK": 64 + }, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 4, + "num_buffers_warp_spec": 0, + "num_consumer_groups": 0, + "reg_dec_producer": 0, + "reg_inc_consumer": 0, + "maxnreg": null, + "pre_hook": null + }, + "(10, 1024, 4096, 27, True, 'torch.float16', 'torch.float16', 'torch.uint32', 'torch.int64', 'torch.float16')": { + "kwargs": { + "B1": 128, + "B2": 128, + "BK": 64 + }, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 4, + "num_buffers_warp_spec": 0, + "num_consumer_groups": 0, + "reg_dec_producer": 0, + "reg_inc_consumer": 0, + "maxnreg": null, + "pre_hook": null + }, + "(10, 1024, 1024, 27, True, 'torch.float16', 'torch.float16', 'torch.uint32', 'torch.int64', 'torch.float16')": { + "kwargs": { + "B1": 64, + "B2": 64, + "BK": 64 + }, + "num_warps": 2, + "num_ctas": 1, + "num_stages": 4, + "num_buffers_warp_spec": 0, + "num_consumer_groups": 0, + "reg_dec_producer": 0, + "reg_inc_consumer": 0, + "maxnreg": null, + "pre_hook": null + }, + "(12, 512, 128, 27, True, 'torch.float16', 'torch.float16', 'torch.uint32', 'torch.int64', 'torch.float16')": { + "kwargs": { + "B1": 128, + "B2": 128, + "BK": 32 + }, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 4, + "num_buffers_warp_spec": 0, + "num_consumer_groups": 0, + "reg_dec_producer": 0, + "reg_inc_consumer": 0, + "maxnreg": null, + "pre_hook": null + }, + "(14, 256, 64, 27, True, 'torch.float16', 'torch.float16', 'torch.uint32', 'torch.int64', 'torch.float16')": { + "kwargs": { + "B1": 128, + "B2": 128, + "BK": 64 + }, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 4, + "num_buffers_warp_spec": 0, + "num_consumer_groups": 0, + "reg_dec_producer": 0, + "reg_inc_consumer": 0, + "maxnreg": null, + "pre_hook": null + }, + "(16, 128, 32, 27, True, 'torch.float16', 'torch.float16', 'torch.uint32', 'torch.int64', 'torch.float16')": { + "kwargs": { + "B1": 128, + "B2": 128, + "BK": 32 + }, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 4, + "num_buffers_warp_spec": 0, + "num_consumer_groups": 0, + "reg_dec_producer": 0, + "reg_inc_consumer": 0, + "maxnreg": null, + "pre_hook": null + }, + "(18, 64, 16, 27, True, 'torch.float16', 'torch.float16', 'torch.uint32', 'torch.int64', 'torch.float16')": { + "kwargs": { + "B1": 256, + "B2": 64, + "BK": 32 + }, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 4, + "num_buffers_warp_spec": 0, + "num_consumer_groups": 0, + "reg_dec_producer": 0, + "reg_inc_consumer": 0, + "maxnreg": null, + "pre_hook": null + }, + "(11, 512, 2048, 27, True, 'torch.float16', 'torch.float16', 'torch.uint32', 'torch.int64', 'torch.float16')": { + "kwargs": { + "B1": 64, + "B2": 256, + "BK": 64 + }, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 4, + "num_buffers_warp_spec": 0, + "num_consumer_groups": 0, + "reg_dec_producer": 0, + "reg_inc_consumer": 0, + "maxnreg": null, + "pre_hook": null + }, + "(11, 512, 512, 27, True, 'torch.float16', 'torch.float16', 'torch.uint32', 'torch.int64', 'torch.float16')": { + "kwargs": { + "B1": 64, + "B2": 256, + "BK": 64 + }, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 4, + "num_buffers_warp_spec": 0, + "num_consumer_groups": 0, + "reg_dec_producer": 0, + "reg_inc_consumer": 0, + "maxnreg": null, + "pre_hook": null + }, + "(9, 1024, 4096, 27, True, 'torch.float16', 'torch.float16', 'torch.uint32', 'torch.int64', 'torch.float16')": { + "kwargs": { + "B1": 64, + "B2": 64, + "BK": 64 + }, + "num_warps": 2, + "num_ctas": 1, + "num_stages": 4, + "num_buffers_warp_spec": 0, + "num_consumer_groups": 0, + "reg_dec_producer": 0, + "reg_inc_consumer": 0, + "maxnreg": null, + "pre_hook": null + }, + "(9, 1024, 1024, 27, True, 'torch.float16', 'torch.float16', 'torch.uint32', 'torch.int64', 'torch.float16')": { + "kwargs": { + "B1": 64, + "B2": 64, + "BK": 64 + }, + "num_warps": 2, + "num_ctas": 1, + "num_stages": 4, + "num_buffers_warp_spec": 0, + "num_consumer_groups": 0, + "reg_dec_producer": 0, + "reg_inc_consumer": 0, + "maxnreg": null, + "pre_hook": null + }, + "(11, 512, 128, 27, True, 'torch.float16', 'torch.float16', 'torch.uint32', 'torch.int64', 'torch.float16')": { + "kwargs": { + "B1": 64, + "B2": 256, + "BK": 64 + }, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 4, + "num_buffers_warp_spec": 0, + "num_consumer_groups": 0, + "reg_dec_producer": 0, + "reg_inc_consumer": 0, + "maxnreg": null, + "pre_hook": null + }, + "(13, 256, 1024, 27, True, 'torch.float16', 'torch.float16', 'torch.uint32', 'torch.int64', 'torch.float16')": { + "kwargs": { + "B1": 64, + "B2": 256, + "BK": 64 + }, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 4, + "num_buffers_warp_spec": 0, + "num_consumer_groups": 0, + "reg_dec_producer": 0, + "reg_inc_consumer": 0, + "maxnreg": null, + "pre_hook": null + }, + "(13, 256, 256, 27, True, 'torch.float16', 'torch.float16', 'torch.uint32', 'torch.int64', 'torch.float16')": { + "kwargs": { + "B1": 64, + "B2": 64, + "BK": 64 + }, + "num_warps": 2, + "num_ctas": 1, + "num_stages": 4, + "num_buffers_warp_spec": 0, + "num_consumer_groups": 0, + "reg_dec_producer": 0, + "reg_inc_consumer": 0, + "maxnreg": null, + "pre_hook": null + }, + "(13, 256, 64, 27, True, 'torch.float16', 'torch.float16', 'torch.uint32', 'torch.int64', 'torch.float16')": { + "kwargs": { + "B1": 128, + "B2": 128, + "BK": 64 + }, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 4, + "num_buffers_warp_spec": 0, + "num_consumer_groups": 0, + "reg_dec_producer": 0, + "reg_inc_consumer": 0, + "maxnreg": null, + "pre_hook": null + }, + "(17, 64, 64, 27, True, 'torch.float16', 'torch.float16', 'torch.uint32', 'torch.int64', 'torch.float16')": { + "kwargs": { + "B1": 256, + "B2": 64, + "BK": 32 + }, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 4, + "num_buffers_warp_spec": 0, + "num_consumer_groups": 0, + "reg_dec_producer": 0, + "reg_inc_consumer": 0, + "maxnreg": null, + "pre_hook": null + }, + "(15, 128, 512, 27, True, 'torch.float16', 'torch.float16', 'torch.uint32', 'torch.int64', 'torch.float16')": { + "kwargs": { + "B1": 128, + "B2": 128, + "BK": 64 + }, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 4, + "num_buffers_warp_spec": 0, + "num_consumer_groups": 0, + "reg_dec_producer": 0, + "reg_inc_consumer": 0, + "maxnreg": null, + "pre_hook": null + }, + "(15, 128, 128, 27, True, 'torch.float16', 'torch.float16', 'torch.uint32', 'torch.int64', 'torch.float16')": { + "kwargs": { + "B1": 128, + "B2": 128, + "BK": 64 + }, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 4, + "num_buffers_warp_spec": 0, + "num_consumer_groups": 0, + "reg_dec_producer": 0, + "reg_inc_consumer": 0, + "maxnreg": null, + "pre_hook": null + }, + "(15, 128, 32, 27, True, 'torch.float16', 'torch.float16', 'torch.uint32', 'torch.int64', 'torch.float16')": { + "kwargs": { + "B1": 128, + "B2": 128, + "BK": 32 + }, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 4, + "num_buffers_warp_spec": 0, + "num_consumer_groups": 0, + "reg_dec_producer": 0, + "reg_inc_consumer": 0, + "maxnreg": null, + "pre_hook": null + }, + "(17, 64, 16, 27, True, 'torch.float16', 'torch.float16', 'torch.uint32', 'torch.int64', 'torch.float16')": { + "kwargs": { + "B1": 128, + "B2": 64, + "BK": 32 + }, + "num_warps": 2, + "num_ctas": 1, + "num_stages": 4, + "num_buffers_warp_spec": 0, + "num_consumer_groups": 0, + "reg_dec_producer": 0, + "reg_inc_consumer": 0, + "maxnreg": null, + "pre_hook": null + }, + "(19, 64, 64, 27, True, 'torch.float16', 'torch.float16', 'torch.uint32', 'torch.int64', 'torch.float16')": { + "kwargs": { + "B1": 256, + "B2": 64, + "BK": 32 + }, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 4, + "num_buffers_warp_spec": 0, + "num_consumer_groups": 0, + "reg_dec_producer": 0, + "reg_inc_consumer": 0, + "maxnreg": null, + "pre_hook": null + }, + "(17, 128, 512, 27, True, 'torch.float16', 'torch.float16', 'torch.uint32', 'torch.int64', 'torch.float16')": { + "kwargs": { + "B1": 128, + "B2": 128, + "BK": 64 + }, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 4, + "num_buffers_warp_spec": 0, + "num_consumer_groups": 0, + "reg_dec_producer": 0, + "reg_inc_consumer": 0, + "maxnreg": null, + "pre_hook": null + }, + "(17, 128, 128, 27, True, 'torch.float16', 'torch.float16', 'torch.uint32', 'torch.int64', 'torch.float16')": { + "kwargs": { + "B1": 128, + "B2": 128, + "BK": 64 + }, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 4, + "num_buffers_warp_spec": 0, + "num_consumer_groups": 0, + "reg_dec_producer": 0, + "reg_inc_consumer": 0, + "maxnreg": null, + "pre_hook": null + }, + "(15, 256, 1024, 27, True, 'torch.float16', 'torch.float16', 'torch.uint32', 'torch.int64', 'torch.float16')": { + "kwargs": { + "B1": 64, + "B2": 256, + "BK": 64 + }, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 4, + "num_buffers_warp_spec": 0, + "num_consumer_groups": 0, + "reg_dec_producer": 0, + "reg_inc_consumer": 0, + "maxnreg": null, + "pre_hook": null + }, + "(15, 256, 256, 27, True, 'torch.float16', 'torch.float16', 'torch.uint32', 'torch.int64', 'torch.float16')": { + "kwargs": { + "B1": 128, + "B2": 128, + "BK": 64 + }, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 4, + "num_buffers_warp_spec": 0, + "num_consumer_groups": 0, + "reg_dec_producer": 0, + "reg_inc_consumer": 0, + "maxnreg": null, + "pre_hook": null + }, + "(13, 512, 2048, 27, True, 'torch.float16', 'torch.float16', 'torch.uint32', 'torch.int64', 'torch.float16')": { + "kwargs": { + "B1": 128, + "B2": 128, + "BK": 64 + }, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 4, + "num_buffers_warp_spec": 0, + "num_consumer_groups": 0, + "reg_dec_producer": 0, + "reg_inc_consumer": 0, + "maxnreg": null, + "pre_hook": null + }, + "(13, 512, 512, 27, True, 'torch.float16', 'torch.float16', 'torch.uint32', 'torch.int64', 'torch.float16')": { + "kwargs": { + "B1": 128, + "B2": 128, + "BK": 64 + }, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 4, + "num_buffers_warp_spec": 0, + "num_consumer_groups": 0, + "reg_dec_producer": 0, + "reg_inc_consumer": 0, + "maxnreg": null, + "pre_hook": null + }, + "(13, 512, 128, 27, True, 'torch.float16', 'torch.float16', 'torch.uint32', 'torch.int64', 'torch.float16')": { + "kwargs": { + "B1": 128, + "B2": 128, + "BK": 64 + }, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 4, + "num_buffers_warp_spec": 0, + "num_consumer_groups": 0, + "reg_dec_producer": 0, + "reg_inc_consumer": 0, + "maxnreg": null, + "pre_hook": null + }, + "(15, 256, 64, 27, True, 'torch.float16', 'torch.float16', 'torch.uint32', 'torch.int64', 'torch.float16')": { + "kwargs": { + "B1": 128, + "B2": 128, + "BK": 64 + }, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 4, + "num_buffers_warp_spec": 0, + "num_consumer_groups": 0, + "reg_dec_producer": 0, + "reg_inc_consumer": 0, + "maxnreg": null, + "pre_hook": null + }, + "(17, 128, 32, 27, True, 'torch.float16', 'torch.float16', 'torch.uint32', 'torch.int64', 'torch.float16')": { + "kwargs": { + "B1": 128, + "B2": 128, + "BK": 32 + }, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 4, + "num_buffers_warp_spec": 0, + "num_consumer_groups": 0, + "reg_dec_producer": 0, + "reg_inc_consumer": 0, + "maxnreg": null, + "pre_hook": null + }, + "(19, 64, 16, 27, True, 'torch.float16', 'torch.float16', 'torch.uint32', 'torch.int64', 'torch.float16')": { + "kwargs": { + "B1": 256, + "B2": 64, + "BK": 32 + }, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 4, + "num_buffers_warp_spec": 0, + "num_consumer_groups": 0, + "reg_dec_producer": 0, + "reg_inc_consumer": 0, + "maxnreg": null, + "pre_hook": null + }, + "(11, 1024, 4096, 27, True, 'torch.float16', 'torch.float16', 'torch.uint32', 'torch.int64', 'torch.float16')": { + "kwargs": { + "B1": 64, + "B2": 256, + "BK": 64 + }, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 4, + "num_buffers_warp_spec": 0, + "num_consumer_groups": 0, + "reg_dec_producer": 0, + "reg_inc_consumer": 0, + "maxnreg": null, + "pre_hook": null + }, + "(11, 1024, 1024, 27, True, 'torch.float16', 'torch.float16', 'torch.uint32', 'torch.int64', 'torch.float16')": { + "kwargs": { + "B1": 64, + "B2": 256, + "BK": 64 + }, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 4, + "num_buffers_warp_spec": 0, + "num_consumer_groups": 0, + "reg_dec_producer": 0, + "reg_inc_consumer": 0, + "maxnreg": null, + "pre_hook": null + }, + "(10, 512, 2048, 27, True, 'torch.float16', 'torch.float16', 'torch.uint32', 'torch.int64', 'torch.float16')": { + "kwargs": { + "B1": 64, + "B2": 64, + "BK": 64 + }, + "num_warps": 2, + "num_ctas": 1, + "num_stages": 4, + "num_buffers_warp_spec": 0, + "num_consumer_groups": 0, + "reg_dec_producer": 0, + "reg_inc_consumer": 0, + "maxnreg": null, + "pre_hook": null + }, + "(10, 512, 512, 27, True, 'torch.float16', 'torch.float16', 'torch.uint32', 'torch.int64', 'torch.float16')": { + "kwargs": { + "B1": 64, + "B2": 64, + "BK": 64 + }, + "num_warps": 2, + "num_ctas": 1, + "num_stages": 4, + "num_buffers_warp_spec": 0, + "num_consumer_groups": 0, + "reg_dec_producer": 0, + "reg_inc_consumer": 0, + "maxnreg": null, + "pre_hook": null + }, + "(10, 512, 128, 27, True, 'torch.float16', 'torch.float16', 'torch.uint32', 'torch.int64', 'torch.float16')": { + "kwargs": { + "B1": 64, + "B2": 64, + "BK": 64 + }, + "num_warps": 2, + "num_ctas": 1, + "num_stages": 4, + "num_buffers_warp_spec": 0, + "num_consumer_groups": 0, + "reg_dec_producer": 0, + "reg_inc_consumer": 0, + "maxnreg": null, + "pre_hook": null + }, + "(20, 64, 64, 27, True, 'torch.float16', 'torch.float16', 'torch.uint32', 'torch.int64', 'torch.float16')": { + "kwargs": { + "B1": 256, + "B2": 64, + "BK": 32 + }, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 4, + "num_buffers_warp_spec": 0, + "num_consumer_groups": 0, + "reg_dec_producer": 0, + "reg_inc_consumer": 0, + "maxnreg": null, + "pre_hook": null + }, + "(20, 64, 16, 27, True, 'torch.float16', 'torch.float16', 'torch.uint32', 'torch.int64', 'torch.float16')": { + "kwargs": { + "B1": 256, + "B2": 64, + "BK": 32 + }, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 4, + "num_buffers_warp_spec": 0, + "num_consumer_groups": 0, + "reg_dec_producer": 0, + "reg_inc_consumer": 0, + "maxnreg": null, + "pre_hook": null + }, + "(18, 128, 512, 27, True, 'torch.float16', 'torch.float16', 'torch.uint32', 'torch.int64', 'torch.float16')": { + "kwargs": { + "B1": 128, + "B2": 128, + "BK": 64 + }, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 4, + "num_buffers_warp_spec": 0, + "num_consumer_groups": 0, + "reg_dec_producer": 0, + "reg_inc_consumer": 0, + "maxnreg": null, + "pre_hook": null + }, + "(18, 128, 128, 27, True, 'torch.float16', 'torch.float16', 'torch.uint32', 'torch.int64', 'torch.float16')": { + "kwargs": { + "B1": 128, + "B2": 128, + "BK": 64 + }, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 4, + "num_buffers_warp_spec": 0, + "num_consumer_groups": 0, + "reg_dec_producer": 0, + "reg_inc_consumer": 0, + "maxnreg": null, + "pre_hook": null + }, + "(18, 128, 32, 27, True, 'torch.float16', 'torch.float16', 'torch.uint32', 'torch.int64', 'torch.float16')": { + "kwargs": { + "B1": 128, + "B2": 128, + "BK": 32 + }, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 4, + "num_buffers_warp_spec": 0, + "num_consumer_groups": 0, + "reg_dec_producer": 0, + "reg_inc_consumer": 0, + "maxnreg": null, + "pre_hook": null + }, + "(16, 256, 1024, 27, True, 'torch.float16', 'torch.float16', 'torch.uint32', 'torch.int64', 'torch.float16')": { + "kwargs": { + "B1": 128, + "B2": 256, + "BK": 64 + }, + "num_warps": 8, + "num_ctas": 1, + "num_stages": 4, + "num_buffers_warp_spec": 0, + "num_consumer_groups": 0, + "reg_dec_producer": 0, + "reg_inc_consumer": 0, + "maxnreg": null, + "pre_hook": null + }, + "(16, 256, 256, 27, True, 'torch.float16', 'torch.float16', 'torch.uint32', 'torch.int64', 'torch.float16')": { + "kwargs": { + "B1": 128, + "B2": 256, + "BK": 64 + }, + "num_warps": 8, + "num_ctas": 1, + "num_stages": 4, + "num_buffers_warp_spec": 0, + "num_consumer_groups": 0, + "reg_dec_producer": 0, + "reg_inc_consumer": 0, + "maxnreg": null, + "pre_hook": null + }, + "(16, 256, 64, 27, True, 'torch.float16', 'torch.float16', 'torch.uint32', 'torch.int64', 'torch.float16')": { + "kwargs": { + "B1": 128, + "B2": 128, + "BK": 64 + }, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 4, + "num_buffers_warp_spec": 0, + "num_consumer_groups": 0, + "reg_dec_producer": 0, + "reg_inc_consumer": 0, + "maxnreg": null, + "pre_hook": null + }, + "(12, 256, 1024, 27, True, 'torch.float16', 'torch.float16', 'torch.uint32', 'torch.int64', 'torch.float16')": { + "kwargs": { + "B1": 128, + "B2": 128, + "BK": 64 + }, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 4, + "num_buffers_warp_spec": 0, + "num_consumer_groups": 0, + "reg_dec_producer": 0, + "reg_inc_consumer": 0, + "maxnreg": null, + "pre_hook": null + }, + "(12, 256, 256, 27, True, 'torch.float16', 'torch.float16', 'torch.uint32', 'torch.int64', 'torch.float16')": { + "kwargs": { + "B1": 128, + "B2": 128, + "BK": 64 + }, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 4, + "num_buffers_warp_spec": 0, + "num_consumer_groups": 0, + "reg_dec_producer": 0, + "reg_inc_consumer": 0, + "maxnreg": null, + "pre_hook": null + }, + "(12, 256, 64, 27, True, 'torch.float16', 'torch.float16', 'torch.uint32', 'torch.int64', 'torch.float16')": { + "kwargs": { + "B1": 128, + "B2": 128, + "BK": 64 + }, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 4, + "num_buffers_warp_spec": 0, + "num_consumer_groups": 0, + "reg_dec_producer": 0, + "reg_inc_consumer": 0, + "maxnreg": null, + "pre_hook": null + }, + "(21, 64, 64, 27, True, 'torch.float16', 'torch.float16', 'torch.uint32', 'torch.int64', 'torch.float16')": { + "kwargs": { + "B1": 256, + "B2": 64, + "BK": 32 + }, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 4, + "num_buffers_warp_spec": 0, + "num_consumer_groups": 0, + "reg_dec_producer": 0, + "reg_inc_consumer": 0, + "maxnreg": null, + "pre_hook": null + }, + "(21, 64, 16, 27, True, 'torch.float16', 'torch.float16', 'torch.uint32', 'torch.int64', 'torch.float16')": { + "kwargs": { + "B1": 256, + "B2": 64, + "BK": 32 + }, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 4, + "num_buffers_warp_spec": 0, + "num_consumer_groups": 0, + "reg_dec_producer": 0, + "reg_inc_consumer": 0, + "maxnreg": null, + "pre_hook": null + }, + "(19, 128, 512, 27, True, 'torch.float16', 'torch.float16', 'torch.uint32', 'torch.int64', 'torch.float16')": { + "kwargs": { + "B1": 128, + "B2": 128, + "BK": 64 + }, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 4, + "num_buffers_warp_spec": 0, + "num_consumer_groups": 0, + "reg_dec_producer": 0, + "reg_inc_consumer": 0, + "maxnreg": null, + "pre_hook": null + }, + "(19, 128, 128, 27, True, 'torch.float16', 'torch.float16', 'torch.uint32', 'torch.int64', 'torch.float16')": { + "kwargs": { + "B1": 128, + "B2": 128, + "BK": 64 + }, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 4, + "num_buffers_warp_spec": 0, + "num_consumer_groups": 0, + "reg_dec_producer": 0, + "reg_inc_consumer": 0, + "maxnreg": null, + "pre_hook": null + }, + "(17, 256, 1024, 27, True, 'torch.float16', 'torch.float16', 'torch.uint32', 'torch.int64', 'torch.float16')": { + "kwargs": { + "B1": 128, + "B2": 256, + "BK": 64 + }, + "num_warps": 8, + "num_ctas": 1, + "num_stages": 4, + "num_buffers_warp_spec": 0, + "num_consumer_groups": 0, + "reg_dec_producer": 0, + "reg_inc_consumer": 0, + "maxnreg": null, + "pre_hook": null + }, + "(17, 256, 256, 27, True, 'torch.float16', 'torch.float16', 'torch.uint32', 'torch.int64', 'torch.float16')": { + "kwargs": { + "B1": 128, + "B2": 256, + "BK": 64 + }, + "num_warps": 8, + "num_ctas": 1, + "num_stages": 4, + "num_buffers_warp_spec": 0, + "num_consumer_groups": 0, + "reg_dec_producer": 0, + "reg_inc_consumer": 0, + "maxnreg": null, + "pre_hook": null + }, + "(14, 512, 2048, 27, True, 'torch.float16', 'torch.float16', 'torch.uint32', 'torch.int64', 'torch.float16')": { + "kwargs": { + "B1": 128, + "B2": 256, + "BK": 64 + }, + "num_warps": 8, + "num_ctas": 1, + "num_stages": 4, + "num_buffers_warp_spec": 0, + "num_consumer_groups": 0, + "reg_dec_producer": 0, + "reg_inc_consumer": 0, + "maxnreg": null, + "pre_hook": null + }, + "(14, 512, 512, 27, True, 'torch.float16', 'torch.float16', 'torch.uint32', 'torch.int64', 'torch.float16')": { + "kwargs": { + "B1": 128, + "B2": 128, + "BK": 64 + }, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 4, + "num_buffers_warp_spec": 0, + "num_consumer_groups": 0, + "reg_dec_producer": 0, + "reg_inc_consumer": 0, + "maxnreg": null, + "pre_hook": null + }, + "(14, 512, 128, 27, True, 'torch.float16', 'torch.float16', 'torch.uint32', 'torch.int64', 'torch.float16')": { + "kwargs": { + "B1": 128, + "B2": 128, + "BK": 64 + }, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 4, + "num_buffers_warp_spec": 0, + "num_consumer_groups": 0, + "reg_dec_producer": 0, + "reg_inc_consumer": 0, + "maxnreg": null, + "pre_hook": null + }, + "(17, 256, 64, 27, True, 'torch.float16', 'torch.float16', 'torch.uint32', 'torch.int64', 'torch.float16')": { + "kwargs": { + "B1": 128, + "B2": 128, + "BK": 64 + }, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 4, + "num_buffers_warp_spec": 0, + "num_consumer_groups": 0, + "reg_dec_producer": 0, + "reg_inc_consumer": 0, + "maxnreg": null, + "pre_hook": null + }, + "(19, 128, 32, 27, True, 'torch.float16', 'torch.float16', 'torch.uint32', 'torch.int64', 'torch.float16')": { + "kwargs": { + "B1": 128, + "B2": 128, + "BK": 32 + }, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 4, + "num_buffers_warp_spec": 0, + "num_consumer_groups": 0, + "reg_dec_producer": 0, + "reg_inc_consumer": 0, + "maxnreg": null, + "pre_hook": null + }, + "(14, 128, 512, 27, True, 'torch.float16', 'torch.float16', 'torch.uint32', 'torch.int64', 'torch.float16')": { + "kwargs": { + "B1": 128, + "B2": 128, + "BK": 64 + }, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 4, + "num_buffers_warp_spec": 0, + "num_consumer_groups": 0, + "reg_dec_producer": 0, + "reg_inc_consumer": 0, + "maxnreg": null, + "pre_hook": null + }, + "(14, 128, 128, 27, True, 'torch.float16', 'torch.float16', 'torch.uint32', 'torch.int64', 'torch.float16')": { + "kwargs": { + "B1": 128, + "B2": 128, + "BK": 64 + }, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 4, + "num_buffers_warp_spec": 0, + "num_consumer_groups": 0, + "reg_dec_producer": 0, + "reg_inc_consumer": 0, + "maxnreg": null, + "pre_hook": null + }, + "(14, 128, 32, 27, True, 'torch.float16', 'torch.float16', 'torch.uint32', 'torch.int64', 'torch.float16')": { + "kwargs": { + "B1": 128, + "B2": 128, + "BK": 32 + }, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 4, + "num_buffers_warp_spec": 0, + "num_consumer_groups": 0, + "reg_dec_producer": 0, + "reg_inc_consumer": 0, + "maxnreg": null, + "pre_hook": null + }, + "(16, 64, 64, 27, True, 'torch.float16', 'torch.float16', 'torch.uint32', 'torch.int64', 'torch.float16')": { + "kwargs": { + "B1": 64, + "B2": 64, + "BK": 64 + }, + "num_warps": 2, + "num_ctas": 1, + "num_stages": 4, + "num_buffers_warp_spec": 0, + "num_consumer_groups": 0, + "reg_dec_producer": 0, + "reg_inc_consumer": 0, + "maxnreg": null, + "pre_hook": null + }, + "(16, 64, 16, 27, True, 'torch.float16', 'torch.float16', 'torch.uint32', 'torch.int64', 'torch.float16')": { + "kwargs": { + "B1": 128, + "B2": 64, + "BK": 32 + }, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 4, + "num_buffers_warp_spec": 0, + "num_consumer_groups": 0, + "reg_dec_producer": 0, + "reg_inc_consumer": 0, + "maxnreg": null, + "pre_hook": null + }, + "(22, 64, 64, 27, True, 'torch.float16', 'torch.float16', 'torch.uint32', 'torch.int64', 'torch.float16')": { + "kwargs": { + "B1": 256, + "B2": 64, + "BK": 32 + }, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 4, + "num_buffers_warp_spec": 0, + "num_consumer_groups": 0, + "reg_dec_producer": 0, + "reg_inc_consumer": 0, + "maxnreg": null, + "pre_hook": null + }, + "(22, 64, 16, 27, True, 'torch.float16', 'torch.float16', 'torch.uint32', 'torch.int64', 'torch.float16')": { + "kwargs": { + "B1": 256, + "B2": 64, + "BK": 32 + }, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 4, + "num_buffers_warp_spec": 0, + "num_consumer_groups": 0, + "reg_dec_producer": 0, + "reg_inc_consumer": 0, + "maxnreg": null, + "pre_hook": null + }, + "(7, 1024, 1024, 27, True, 'torch.float16', 'torch.float16', 'torch.uint32', 'torch.int64', 'torch.float16')": { + "kwargs": { + "B1": 64, + "B2": 64, + "BK": 64 + }, + "num_warps": 2, + "num_ctas": 1, + "num_stages": 4, + "num_buffers_warp_spec": 0, + "num_consumer_groups": 0, + "reg_dec_producer": 0, + "reg_inc_consumer": 0, + "maxnreg": null, + "pre_hook": null + }, + "(13, 1024, 1024, 27, True, 'torch.float16', 'torch.float16', 'torch.uint32', 'torch.int64', 'torch.float16')": { + "kwargs": { + "B1": 256, + "B2": 128, + "BK": 64 + }, + "num_warps": 8, + "num_ctas": 1, + "num_stages": 4, + "num_buffers_warp_spec": 0, + "num_consumer_groups": 0, + "reg_dec_producer": 0, + "reg_inc_consumer": 0, + "maxnreg": null, + "pre_hook": null + }, + "(15, 512, 512, 27, True, 'torch.float16', 'torch.float16', 'torch.uint32', 'torch.int64', 'torch.float16')": { + "kwargs": { + "B1": 128, + "B2": 128, + "BK": 32 + }, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 4, + "num_buffers_warp_spec": 0, + "num_consumer_groups": 0, + "reg_dec_producer": 0, + "reg_inc_consumer": 0, + "maxnreg": null, + "pre_hook": null + }, + "(7, 1024, 1024, 27, True, 'torch.float32', 'torch.float32', 'torch.uint32', 'torch.int64', 'torch.float32')": { + "kwargs": { + "B1": 64, + "B2": 64, + "BK": 64 + }, + "num_warps": 2, + "num_ctas": 1, + "num_stages": 4, + "num_buffers_warp_spec": 0, + "num_consumer_groups": 0, + "reg_dec_producer": 0, + "reg_inc_consumer": 0, + "maxnreg": null, + "pre_hook": null + }, + "(9, 1024, 1024, 27, True, 'torch.float32', 'torch.float32', 'torch.uint32', 'torch.int64', 'torch.float32')": { + "kwargs": { + "B1": 64, + "B2": 128, + "BK": 32 + }, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 4, + "num_buffers_warp_spec": 0, + "num_consumer_groups": 0, + "reg_dec_producer": 0, + "reg_inc_consumer": 0, + "maxnreg": null, + "pre_hook": null + }, + "(11, 1024, 1024, 27, True, 'torch.float32', 'torch.float32', 'torch.uint32', 'torch.int64', 'torch.float32')": { + "kwargs": { + "B1": 64, + "B2": 256, + "BK": 32 + }, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 4, + "num_buffers_warp_spec": 0, + "num_consumer_groups": 0, + "reg_dec_producer": 0, + "reg_inc_consumer": 0, + "maxnreg": null, + "pre_hook": null + }, + "(13, 1024, 1024, 27, True, 'torch.float32', 'torch.float32', 'torch.uint32', 'torch.int64', 'torch.float32')": { + "kwargs": { + "B1": 128, + "B2": 128, + "BK": 64 + }, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 4, + "num_buffers_warp_spec": 0, + "num_consumer_groups": 0, + "reg_dec_producer": 0, + "reg_inc_consumer": 0, + "maxnreg": null, + "pre_hook": null + }, + "(15, 512, 512, 27, True, 'torch.float32', 'torch.float32', 'torch.uint32', 'torch.int64', 'torch.float32')": { + "kwargs": { + "B1": 128, + "B2": 128, + "BK": 32 + }, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 4, + "num_buffers_warp_spec": 0, + "num_consumer_groups": 0, + "reg_dec_producer": 0, + "reg_inc_consumer": 0, + "maxnreg": null, + "pre_hook": null + }, + "(17, 256, 256, 27, True, 'torch.float32', 'torch.float32', 'torch.uint32', 'torch.int64', 'torch.float32')": { + "kwargs": { + "B1": 64, + "B2": 256, + "BK": 32 + }, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 4, + "num_buffers_warp_spec": 0, + "num_consumer_groups": 0, + "reg_dec_producer": 0, + "reg_inc_consumer": 0, + "maxnreg": null, + "pre_hook": null + } + }, + "flex_gemm.kernels.triton.spconv.sparse_submanifold_conv_bwd_masked_implicit_gemm.sparse_submanifold_conv_bwd_weight_masked_implicit_gemm_kernel": { + "(16, 128, 512, 27, True, 'torch.float16', 'torch.float16', 'torch.uint32', 'torch.uint32', 'torch.uint32', 'torch.float16')": { + "kwargs": { + "B1": 128, + "B2": 128, + "BK": 64 + }, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 4, + "num_buffers_warp_spec": 0, + "num_consumer_groups": 0, + "reg_dec_producer": 0, + "reg_inc_consumer": 0, + "maxnreg": null, + "pre_hook": null + }, + "(14, 256, 1024, 27, True, 'torch.float16', 'torch.float16', 'torch.uint32', 'torch.uint32', 'torch.uint32', 'torch.float16')": { + "kwargs": { + "B1": 128, + "B2": 128, + "BK": 64 + }, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 4, + "num_buffers_warp_spec": 0, + "num_consumer_groups": 0, + "reg_dec_producer": 0, + "reg_inc_consumer": 0, + "maxnreg": null, + "pre_hook": null + }, + "(14, 256, 256, 27, True, 'torch.float16', 'torch.float16', 'torch.uint32', 'torch.uint32', 'torch.uint32', 'torch.float16')": { + "kwargs": { + "B1": 128, + "B2": 128, + "BK": 64 + }, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 4, + "num_buffers_warp_spec": 0, + "num_consumer_groups": 0, + "reg_dec_producer": 0, + "reg_inc_consumer": 0, + "maxnreg": null, + "pre_hook": null + }, + "(12, 512, 2048, 27, True, 'torch.float16', 'torch.float16', 'torch.uint32', 'torch.uint32', 'torch.uint32', 'torch.float16')": { + "kwargs": { + "B1": 128, + "B2": 128, + "BK": 64 + }, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 4, + "num_buffers_warp_spec": 0, + "num_consumer_groups": 0, + "reg_dec_producer": 0, + "reg_inc_consumer": 0, + "maxnreg": null, + "pre_hook": null + }, + "(12, 512, 512, 27, True, 'torch.float16', 'torch.float16', 'torch.uint32', 'torch.uint32', 'torch.uint32', 'torch.float16')": { + "kwargs": { + "B1": 128, + "B2": 128, + "BK": 64 + }, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 4, + "num_buffers_warp_spec": 0, + "num_consumer_groups": 0, + "reg_dec_producer": 0, + "reg_inc_consumer": 0, + "maxnreg": null, + "pre_hook": null + }, + "(10, 1024, 4096, 27, True, 'torch.float16', 'torch.float16', 'torch.uint32', 'torch.uint32', 'torch.uint32', 'torch.float16')": { + "kwargs": { + "B1": 128, + "B2": 128, + "BK": 64 + }, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 4, + "num_buffers_warp_spec": 0, + "num_consumer_groups": 0, + "reg_dec_producer": 0, + "reg_inc_consumer": 0, + "maxnreg": null, + "pre_hook": null + }, + "(10, 1024, 1024, 27, True, 'torch.float16', 'torch.float16', 'torch.uint32', 'torch.uint32', 'torch.uint32', 'torch.float16')": { + "kwargs": { + "B1": 128, + "B2": 128, + "BK": 32 + }, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 4, + "num_buffers_warp_spec": 0, + "num_consumer_groups": 0, + "reg_dec_producer": 0, + "reg_inc_consumer": 0, + "maxnreg": null, + "pre_hook": null + }, + "(12, 512, 128, 27, True, 'torch.float16', 'torch.float16', 'torch.uint32', 'torch.uint32', 'torch.uint32', 'torch.float16')": { + "kwargs": { + "B1": 128, + "B2": 128, + "BK": 64 + }, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 4, + "num_buffers_warp_spec": 0, + "num_consumer_groups": 0, + "reg_dec_producer": 0, + "reg_inc_consumer": 0, + "maxnreg": null, + "pre_hook": null + }, + "(11, 512, 2048, 27, True, 'torch.float16', 'torch.float16', 'torch.uint32', 'torch.uint32', 'torch.uint32', 'torch.float16')": { + "kwargs": { + "B1": 128, + "B2": 128, + "BK": 64 + }, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 4, + "num_buffers_warp_spec": 0, + "num_consumer_groups": 0, + "reg_dec_producer": 0, + "reg_inc_consumer": 0, + "maxnreg": null, + "pre_hook": null + }, + "(11, 512, 512, 27, True, 'torch.float16', 'torch.float16', 'torch.uint32', 'torch.uint32', 'torch.uint32', 'torch.float16')": { + "kwargs": { + "B1": 128, + "B2": 128, + "BK": 32 + }, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 4, + "num_buffers_warp_spec": 0, + "num_consumer_groups": 0, + "reg_dec_producer": 0, + "reg_inc_consumer": 0, + "maxnreg": null, + "pre_hook": null + }, + "(9, 1024, 4096, 27, True, 'torch.float16', 'torch.float16', 'torch.uint32', 'torch.uint32', 'torch.uint32', 'torch.float16')": { + "kwargs": { + "B1": 128, + "B2": 128, + "BK": 32 + }, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 4, + "num_buffers_warp_spec": 0, + "num_consumer_groups": 0, + "reg_dec_producer": 0, + "reg_inc_consumer": 0, + "maxnreg": null, + "pre_hook": null + }, + "(9, 1024, 1024, 27, True, 'torch.float16', 'torch.float16', 'torch.uint32', 'torch.uint32', 'torch.uint32', 'torch.float16')": { + "kwargs": { + "B1": 128, + "B2": 128, + "BK": 32 + }, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 4, + "num_buffers_warp_spec": 0, + "num_consumer_groups": 0, + "reg_dec_producer": 0, + "reg_inc_consumer": 0, + "maxnreg": null, + "pre_hook": null + }, + "(11, 512, 128, 27, True, 'torch.float16', 'torch.float16', 'torch.uint32', 'torch.uint32', 'torch.uint32', 'torch.float16')": { + "kwargs": { + "B1": 128, + "B2": 128, + "BK": 64 + }, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 4, + "num_buffers_warp_spec": 0, + "num_consumer_groups": 0, + "reg_dec_producer": 0, + "reg_inc_consumer": 0, + "maxnreg": null, + "pre_hook": null + }, + "(13, 256, 1024, 27, True, 'torch.float16', 'torch.float16', 'torch.uint32', 'torch.uint32', 'torch.uint32', 'torch.float16')": { + "kwargs": { + "B1": 128, + "B2": 128, + "BK": 64 + }, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 4, + "num_buffers_warp_spec": 0, + "num_consumer_groups": 0, + "reg_dec_producer": 0, + "reg_inc_consumer": 0, + "maxnreg": null, + "pre_hook": null + }, + "(13, 256, 256, 27, True, 'torch.float16', 'torch.float16', 'torch.uint32', 'torch.uint32', 'torch.uint32', 'torch.float16')": { + "kwargs": { + "B1": 128, + "B2": 128, + "BK": 64 + }, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 4, + "num_buffers_warp_spec": 0, + "num_consumer_groups": 0, + "reg_dec_producer": 0, + "reg_inc_consumer": 0, + "maxnreg": null, + "pre_hook": null + }, + "(15, 128, 512, 27, True, 'torch.float16', 'torch.float16', 'torch.uint32', 'torch.uint32', 'torch.uint32', 'torch.float16')": { + "kwargs": { + "B1": 128, + "B2": 128, + "BK": 64 + }, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 4, + "num_buffers_warp_spec": 0, + "num_consumer_groups": 0, + "reg_dec_producer": 0, + "reg_inc_consumer": 0, + "maxnreg": null, + "pre_hook": null + }, + "(8, 1024, 4096, 27, True, 'torch.float16', 'torch.float16', 'torch.uint32', 'torch.uint32', 'torch.uint32', 'torch.float16')": { + "kwargs": { + "B1": 128, + "B2": 128, + "BK": 32 + }, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 4, + "num_buffers_warp_spec": 0, + "num_consumer_groups": 0, + "reg_dec_producer": 0, + "reg_inc_consumer": 0, + "maxnreg": null, + "pre_hook": null + }, + "(8, 1024, 1024, 27, True, 'torch.float16', 'torch.float16', 'torch.uint32', 'torch.uint32', 'torch.uint32', 'torch.float16')": { + "kwargs": { + "B1": 128, + "B2": 128, + "BK": 32 + }, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 4, + "num_buffers_warp_spec": 0, + "num_consumer_groups": 0, + "reg_dec_producer": 0, + "reg_inc_consumer": 0, + "maxnreg": null, + "pre_hook": null + }, + "(17, 128, 512, 27, True, 'torch.float16', 'torch.float16', 'torch.uint32', 'torch.uint32', 'torch.uint32', 'torch.float16')": { + "kwargs": { + "B1": 128, + "B2": 128, + "BK": 64 + }, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 4, + "num_buffers_warp_spec": 0, + "num_consumer_groups": 0, + "reg_dec_producer": 0, + "reg_inc_consumer": 0, + "maxnreg": null, + "pre_hook": null + }, + "(15, 256, 1024, 27, True, 'torch.float16', 'torch.float16', 'torch.uint32', 'torch.uint32', 'torch.uint32', 'torch.float16')": { + "kwargs": { + "B1": 128, + "B2": 128, + "BK": 64 + }, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 4, + "num_buffers_warp_spec": 0, + "num_consumer_groups": 0, + "reg_dec_producer": 0, + "reg_inc_consumer": 0, + "maxnreg": null, + "pre_hook": null + }, + "(15, 256, 256, 27, True, 'torch.float16', 'torch.float16', 'torch.uint32', 'torch.uint32', 'torch.uint32', 'torch.float16')": { + "kwargs": { + "B1": 64, + "B2": 64, + "BK": 64 + }, + "num_warps": 2, + "num_ctas": 1, + "num_stages": 4, + "num_buffers_warp_spec": 0, + "num_consumer_groups": 0, + "reg_dec_producer": 0, + "reg_inc_consumer": 0, + "maxnreg": null, + "pre_hook": null + }, + "(13, 512, 2048, 27, True, 'torch.float16', 'torch.float16', 'torch.uint32', 'torch.uint32', 'torch.uint32', 'torch.float16')": { + "kwargs": { + "B1": 128, + "B2": 128, + "BK": 64 + }, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 4, + "num_buffers_warp_spec": 0, + "num_consumer_groups": 0, + "reg_dec_producer": 0, + "reg_inc_consumer": 0, + "maxnreg": null, + "pre_hook": null + }, + "(13, 512, 512, 27, True, 'torch.float16', 'torch.float16', 'torch.uint32', 'torch.uint32', 'torch.uint32', 'torch.float16')": { + "kwargs": { + "B1": 128, + "B2": 128, + "BK": 64 + }, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 4, + "num_buffers_warp_spec": 0, + "num_consumer_groups": 0, + "reg_dec_producer": 0, + "reg_inc_consumer": 0, + "maxnreg": null, + "pre_hook": null + }, + "(13, 512, 128, 27, True, 'torch.float16', 'torch.float16', 'torch.uint32', 'torch.uint32', 'torch.uint32', 'torch.float16')": { + "kwargs": { + "B1": 128, + "B2": 128, + "BK": 64 + }, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 4, + "num_buffers_warp_spec": 0, + "num_consumer_groups": 0, + "reg_dec_producer": 0, + "reg_inc_consumer": 0, + "maxnreg": null, + "pre_hook": null + }, + "(11, 1024, 4096, 27, True, 'torch.float16', 'torch.float16', 'torch.uint32', 'torch.uint32', 'torch.uint32', 'torch.float16')": { + "kwargs": { + "B1": 128, + "B2": 128, + "BK": 64 + }, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 4, + "num_buffers_warp_spec": 0, + "num_consumer_groups": 0, + "reg_dec_producer": 0, + "reg_inc_consumer": 0, + "maxnreg": null, + "pre_hook": null + }, + "(11, 1024, 1024, 27, True, 'torch.float16', 'torch.float16', 'torch.uint32', 'torch.uint32', 'torch.uint32', 'torch.float16')": { + "kwargs": { + "B1": 128, + "B2": 128, + "BK": 32 + }, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 4, + "num_buffers_warp_spec": 0, + "num_consumer_groups": 0, + "reg_dec_producer": 0, + "reg_inc_consumer": 0, + "maxnreg": null, + "pre_hook": null + }, + "(10, 512, 2048, 27, True, 'torch.float16', 'torch.float16', 'torch.uint32', 'torch.uint32', 'torch.uint32', 'torch.float16')": { + "kwargs": { + "B1": 128, + "B2": 128, + "BK": 32 + }, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 4, + "num_buffers_warp_spec": 0, + "num_consumer_groups": 0, + "reg_dec_producer": 0, + "reg_inc_consumer": 0, + "maxnreg": null, + "pre_hook": null + }, + "(10, 512, 512, 27, True, 'torch.float16', 'torch.float16', 'torch.uint32', 'torch.uint32', 'torch.uint32', 'torch.float16')": { + "kwargs": { + "B1": 128, + "B2": 128, + "BK": 32 + }, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 4, + "num_buffers_warp_spec": 0, + "num_consumer_groups": 0, + "reg_dec_producer": 0, + "reg_inc_consumer": 0, + "maxnreg": null, + "pre_hook": null + }, + "(10, 512, 128, 27, True, 'torch.float16', 'torch.float16', 'torch.uint32', 'torch.uint32', 'torch.uint32', 'torch.float16')": { + "kwargs": { + "B1": 128, + "B2": 128, + "BK": 64 + }, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 4, + "num_buffers_warp_spec": 0, + "num_consumer_groups": 0, + "reg_dec_producer": 0, + "reg_inc_consumer": 0, + "maxnreg": null, + "pre_hook": null + }, + "(18, 128, 512, 27, True, 'torch.float16', 'torch.float16', 'torch.uint32', 'torch.uint32', 'torch.uint32', 'torch.float16')": { + "kwargs": { + "B1": 128, + "B2": 128, + "BK": 64 + }, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 4, + "num_buffers_warp_spec": 0, + "num_consumer_groups": 0, + "reg_dec_producer": 0, + "reg_inc_consumer": 0, + "maxnreg": null, + "pre_hook": null + }, + "(16, 256, 1024, 27, True, 'torch.float16', 'torch.float16', 'torch.uint32', 'torch.uint32', 'torch.uint32', 'torch.float16')": { + "kwargs": { + "B1": 256, + "B2": 128, + "BK": 64 + }, + "num_warps": 8, + "num_ctas": 1, + "num_stages": 4, + "num_buffers_warp_spec": 0, + "num_consumer_groups": 0, + "reg_dec_producer": 0, + "reg_inc_consumer": 0, + "maxnreg": null, + "pre_hook": null + }, + "(16, 256, 256, 27, True, 'torch.float16', 'torch.float16', 'torch.uint32', 'torch.uint32', 'torch.uint32', 'torch.float16')": { + "kwargs": { + "B1": 128, + "B2": 128, + "BK": 64 + }, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 4, + "num_buffers_warp_spec": 0, + "num_consumer_groups": 0, + "reg_dec_producer": 0, + "reg_inc_consumer": 0, + "maxnreg": null, + "pre_hook": null + }, + "(12, 256, 1024, 27, True, 'torch.float16', 'torch.float16', 'torch.uint32', 'torch.uint32', 'torch.uint32', 'torch.float16')": { + "kwargs": { + "B1": 128, + "B2": 128, + "BK": 64 + }, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 4, + "num_buffers_warp_spec": 0, + "num_consumer_groups": 0, + "reg_dec_producer": 0, + "reg_inc_consumer": 0, + "maxnreg": null, + "pre_hook": null + }, + "(12, 256, 256, 27, True, 'torch.float16', 'torch.float16', 'torch.uint32', 'torch.uint32', 'torch.uint32', 'torch.float16')": { + "kwargs": { + "B1": 128, + "B2": 128, + "BK": 64 + }, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 4, + "num_buffers_warp_spec": 0, + "num_consumer_groups": 0, + "reg_dec_producer": 0, + "reg_inc_consumer": 0, + "maxnreg": null, + "pre_hook": null + }, + "(19, 128, 512, 27, True, 'torch.float16', 'torch.float16', 'torch.uint32', 'torch.uint32', 'torch.uint32', 'torch.float16')": { + "kwargs": { + "B1": 128, + "B2": 128, + "BK": 64 + }, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 4, + "num_buffers_warp_spec": 0, + "num_consumer_groups": 0, + "reg_dec_producer": 0, + "reg_inc_consumer": 0, + "maxnreg": null, + "pre_hook": null + }, + "(17, 256, 1024, 27, True, 'torch.float16', 'torch.float16', 'torch.uint32', 'torch.uint32', 'torch.uint32', 'torch.float16')": { + "kwargs": { + "B1": 256, + "B2": 128, + "BK": 64 + }, + "num_warps": 8, + "num_ctas": 1, + "num_stages": 4, + "num_buffers_warp_spec": 0, + "num_consumer_groups": 0, + "reg_dec_producer": 0, + "reg_inc_consumer": 0, + "maxnreg": null, + "pre_hook": null + }, + "(17, 256, 256, 27, True, 'torch.float16', 'torch.float16', 'torch.uint32', 'torch.uint32', 'torch.uint32', 'torch.float16')": { + "kwargs": { + "B1": 128, + "B2": 128, + "BK": 64 + }, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 4, + "num_buffers_warp_spec": 0, + "num_consumer_groups": 0, + "reg_dec_producer": 0, + "reg_inc_consumer": 0, + "maxnreg": null, + "pre_hook": null + }, + "(14, 512, 2048, 27, True, 'torch.float16', 'torch.float16', 'torch.uint32', 'torch.uint32', 'torch.uint32', 'torch.float16')": { + "kwargs": { + "B1": 128, + "B2": 256, + "BK": 64 + }, + "num_warps": 8, + "num_ctas": 1, + "num_stages": 4, + "num_buffers_warp_spec": 0, + "num_consumer_groups": 0, + "reg_dec_producer": 0, + "reg_inc_consumer": 0, + "maxnreg": null, + "pre_hook": null + }, + "(14, 512, 512, 27, True, 'torch.float16', 'torch.float16', 'torch.uint32', 'torch.uint32', 'torch.uint32', 'torch.float16')": { + "kwargs": { + "B1": 128, + "B2": 128, + "BK": 64 + }, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 4, + "num_buffers_warp_spec": 0, + "num_consumer_groups": 0, + "reg_dec_producer": 0, + "reg_inc_consumer": 0, + "maxnreg": null, + "pre_hook": null + }, + "(14, 512, 128, 27, True, 'torch.float16', 'torch.float16', 'torch.uint32', 'torch.uint32', 'torch.uint32', 'torch.float16')": { + "kwargs": { + "B1": 128, + "B2": 128, + "BK": 64 + }, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 4, + "num_buffers_warp_spec": 0, + "num_consumer_groups": 0, + "reg_dec_producer": 0, + "reg_inc_consumer": 0, + "maxnreg": null, + "pre_hook": null + }, + "(7, 1024, 4096, 27, True, 'torch.float16', 'torch.float16', 'torch.uint32', 'torch.uint32', 'torch.uint32', 'torch.float16')": { + "kwargs": { + "B1": 128, + "B2": 128, + "BK": 32 + }, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 4, + "num_buffers_warp_spec": 0, + "num_consumer_groups": 0, + "reg_dec_producer": 0, + "reg_inc_consumer": 0, + "maxnreg": null, + "pre_hook": null + }, + "(7, 1024, 1024, 27, True, 'torch.float16', 'torch.float16', 'torch.uint32', 'torch.uint32', 'torch.uint32', 'torch.float16')": { + "kwargs": { + "B1": 128, + "B2": 128, + "BK": 32 + }, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 4, + "num_buffers_warp_spec": 0, + "num_consumer_groups": 0, + "reg_dec_producer": 0, + "reg_inc_consumer": 0, + "maxnreg": null, + "pre_hook": null + }, + "(14, 128, 512, 27, True, 'torch.float16', 'torch.float16', 'torch.uint32', 'torch.uint32', 'torch.uint32', 'torch.float16')": { + "kwargs": { + "B1": 128, + "B2": 128, + "BK": 64 + }, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 4, + "num_buffers_warp_spec": 0, + "num_consumer_groups": 0, + "reg_dec_producer": 0, + "reg_inc_consumer": 0, + "maxnreg": null, + "pre_hook": null + }, + "(13, 1024, 1024, 27, True, 'torch.float16', 'torch.float16', 'torch.uint32', 'torch.uint32', 'torch.uint32', 'torch.float16')": { + "kwargs": { + "B1": 128, + "B2": 128, + "BK": 64 + }, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 4, + "num_buffers_warp_spec": 0, + "num_consumer_groups": 0, + "reg_dec_producer": 0, + "reg_inc_consumer": 0, + "maxnreg": null, + "pre_hook": null + }, + "(15, 512, 512, 27, True, 'torch.float16', 'torch.float16', 'torch.uint32', 'torch.uint32', 'torch.uint32', 'torch.float16')": { + "kwargs": { + "B1": 128, + "B2": 128, + "BK": 64 + }, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 4, + "num_buffers_warp_spec": 0, + "num_consumer_groups": 0, + "reg_dec_producer": 0, + "reg_inc_consumer": 0, + "maxnreg": null, + "pre_hook": null + }, + "(7, 1024, 1024, 27, True, 'torch.float32', 'torch.float32', 'torch.uint32', 'torch.uint32', 'torch.uint32', 'torch.float32')": { + "kwargs": { + "B1": 128, + "B2": 128, + "BK": 32 + }, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 4, + "num_buffers_warp_spec": 0, + "num_consumer_groups": 0, + "reg_dec_producer": 0, + "reg_inc_consumer": 0, + "maxnreg": null, + "pre_hook": null + }, + "(9, 1024, 1024, 27, True, 'torch.float32', 'torch.float32', 'torch.uint32', 'torch.uint32', 'torch.uint32', 'torch.float32')": { + "kwargs": { + "B1": 128, + "B2": 128, + "BK": 32 + }, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 4, + "num_buffers_warp_spec": 0, + "num_consumer_groups": 0, + "reg_dec_producer": 0, + "reg_inc_consumer": 0, + "maxnreg": null, + "pre_hook": null + }, + "(11, 1024, 1024, 27, True, 'torch.float32', 'torch.float32', 'torch.uint32', 'torch.uint32', 'torch.uint32', 'torch.float32')": { + "kwargs": { + "B1": 128, + "B2": 128, + "BK": 32 + }, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 4, + "num_buffers_warp_spec": 0, + "num_consumer_groups": 0, + "reg_dec_producer": 0, + "reg_inc_consumer": 0, + "maxnreg": null, + "pre_hook": null + }, + "(13, 1024, 1024, 27, True, 'torch.float32', 'torch.float32', 'torch.uint32', 'torch.uint32', 'torch.uint32', 'torch.float32')": { + "kwargs": { + "B1": 128, + "B2": 64, + "BK": 32 + }, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 4, + "num_buffers_warp_spec": 0, + "num_consumer_groups": 0, + "reg_dec_producer": 0, + "reg_inc_consumer": 0, + "maxnreg": null, + "pre_hook": null + }, + "(15, 512, 512, 27, True, 'torch.float32', 'torch.float32', 'torch.uint32', 'torch.uint32', 'torch.uint32', 'torch.float32')": { + "kwargs": { + "B1": 256, + "B2": 128, + "BK": 32 + }, + "num_warps": 8, + "num_ctas": 1, + "num_stages": 4, + "num_buffers_warp_spec": 0, + "num_consumer_groups": 0, + "reg_dec_producer": 0, + "reg_inc_consumer": 0, + "maxnreg": null, + "pre_hook": null + }, + "(17, 256, 256, 27, True, 'torch.float32', 'torch.float32', 'torch.uint32', 'torch.uint32', 'torch.uint32', 'torch.float32')": { + "kwargs": { + "B1": 128, + "B2": 64, + "BK": 32 + }, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 4, + "num_buffers_warp_spec": 0, + "num_consumer_groups": 0, + "reg_dec_producer": 0, + "reg_inc_consumer": 0, + "maxnreg": null, + "pre_hook": null + } + }, + "flex_gemm.kernels.triton.spconv.sparse_submanifold_conv_bwd_masked_implicit_gemm_splitk.sparse_submanifold_conv_bwd_input_masked_implicit_gemm_kernel": { + "(18, 64, 64, 27, True, 'torch.float16', 'torch.float16', 'torch.uint32', 'torch.int64', 'torch.float16')": { + "kwargs": { + "B1": 256, + "B2": 64, + "BK": 32 + }, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 4, + "num_buffers_warp_spec": 0, + "num_consumer_groups": 0, + "reg_dec_producer": 0, + "reg_inc_consumer": 0, + "maxnreg": null, + "pre_hook": null + }, + "(16, 128, 512, 27, True, 'torch.float16', 'torch.float16', 'torch.uint32', 'torch.int64', 'torch.float16')": { + "kwargs": { + "B1": 128, + "B2": 128, + "BK": 64 + }, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 4, + "num_buffers_warp_spec": 0, + "num_consumer_groups": 0, + "reg_dec_producer": 0, + "reg_inc_consumer": 0, + "maxnreg": null, + "pre_hook": null + }, + "(16, 128, 128, 27, True, 'torch.float16', 'torch.float16', 'torch.uint32', 'torch.int64', 'torch.float16')": { + "kwargs": { + "B1": 128, + "B2": 128, + "BK": 64 + }, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 4, + "num_buffers_warp_spec": 0, + "num_consumer_groups": 0, + "reg_dec_producer": 0, + "reg_inc_consumer": 0, + "maxnreg": null, + "pre_hook": null + }, + "(14, 256, 1024, 27, True, 'torch.float16', 'torch.float16', 'torch.uint32', 'torch.int64', 'torch.float16')": { + "kwargs": { + "B1": 64, + "B2": 256, + "BK": 64 + }, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 4, + "num_buffers_warp_spec": 0, + "num_consumer_groups": 0, + "reg_dec_producer": 0, + "reg_inc_consumer": 0, + "maxnreg": null, + "pre_hook": null + }, + "(14, 256, 256, 27, True, 'torch.float16', 'torch.float16', 'torch.uint32', 'torch.int64', 'torch.float16')": { + "kwargs": { + "B1": 128, + "B2": 128, + "BK": 64 + }, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 4, + "num_buffers_warp_spec": 0, + "num_consumer_groups": 0, + "reg_dec_producer": 0, + "reg_inc_consumer": 0, + "maxnreg": null, + "pre_hook": null + }, + "(12, 512, 2048, 27, True, 'torch.float16', 'torch.float16', 'torch.uint32', 'torch.int64', 'torch.float16')": { + "kwargs": { + "B1": 128, + "B2": 128, + "BK": 64 + }, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 4, + "num_buffers_warp_spec": 0, + "num_consumer_groups": 0, + "reg_dec_producer": 0, + "reg_inc_consumer": 0, + "maxnreg": null, + "pre_hook": null + }, + "(12, 512, 512, 27, True, 'torch.float16', 'torch.float16', 'torch.uint32', 'torch.int64', 'torch.float16')": { + "kwargs": { + "B1": 128, + "B2": 128, + "BK": 64 + }, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 4, + "num_buffers_warp_spec": 0, + "num_consumer_groups": 0, + "reg_dec_producer": 0, + "reg_inc_consumer": 0, + "maxnreg": null, + "pre_hook": null + }, + "(10, 1024, 4096, 27, True, 'torch.float16', 'torch.float16', 'torch.uint32', 'torch.int64', 'torch.float16')": { + "kwargs": { + "B1": 128, + "B2": 128, + "BK": 64 + }, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 4, + "num_buffers_warp_spec": 0, + "num_consumer_groups": 0, + "reg_dec_producer": 0, + "reg_inc_consumer": 0, + "maxnreg": null, + "pre_hook": null + }, + "(10, 1024, 1024, 27, True, 'torch.float16', 'torch.float16', 'torch.uint32', 'torch.int64', 'torch.float16')": { + "kwargs": { + "B1": 64, + "B2": 64, + "BK": 64 + }, + "num_warps": 2, + "num_ctas": 1, + "num_stages": 4, + "num_buffers_warp_spec": 0, + "num_consumer_groups": 0, + "reg_dec_producer": 0, + "reg_inc_consumer": 0, + "maxnreg": null, + "pre_hook": null + }, + "(12, 512, 128, 27, True, 'torch.float16', 'torch.float16', 'torch.uint32', 'torch.int64', 'torch.float16')": { + "kwargs": { + "B1": 128, + "B2": 128, + "BK": 32 + }, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 4, + "num_buffers_warp_spec": 0, + "num_consumer_groups": 0, + "reg_dec_producer": 0, + "reg_inc_consumer": 0, + "maxnreg": null, + "pre_hook": null + }, + "(14, 256, 64, 27, True, 'torch.float16', 'torch.float16', 'torch.uint32', 'torch.int64', 'torch.float16')": { + "kwargs": { + "B1": 128, + "B2": 128, + "BK": 64 + }, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 4, + "num_buffers_warp_spec": 0, + "num_consumer_groups": 0, + "reg_dec_producer": 0, + "reg_inc_consumer": 0, + "maxnreg": null, + "pre_hook": null + }, + "(16, 128, 32, 27, True, 'torch.float16', 'torch.float16', 'torch.uint32', 'torch.int64', 'torch.float16')": { + "kwargs": { + "B1": 128, + "B2": 128, + "BK": 32 + }, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 4, + "num_buffers_warp_spec": 0, + "num_consumer_groups": 0, + "reg_dec_producer": 0, + "reg_inc_consumer": 0, + "maxnreg": null, + "pre_hook": null + }, + "(18, 64, 16, 27, True, 'torch.float16', 'torch.float16', 'torch.uint32', 'torch.int64', 'torch.float16')": { + "kwargs": { + "B1": 256, + "B2": 64, + "BK": 32 + }, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 4, + "num_buffers_warp_spec": 0, + "num_consumer_groups": 0, + "reg_dec_producer": 0, + "reg_inc_consumer": 0, + "maxnreg": null, + "pre_hook": null + }, + "(11, 512, 2048, 27, True, 'torch.float16', 'torch.float16', 'torch.uint32', 'torch.int64', 'torch.float16')": { + "kwargs": { + "B1": 64, + "B2": 256, + "BK": 64 + }, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 4, + "num_buffers_warp_spec": 0, + "num_consumer_groups": 0, + "reg_dec_producer": 0, + "reg_inc_consumer": 0, + "maxnreg": null, + "pre_hook": null + }, + "(11, 512, 512, 27, True, 'torch.float16', 'torch.float16', 'torch.uint32', 'torch.int64', 'torch.float16')": { + "kwargs": { + "B1": 64, + "B2": 256, + "BK": 64 + }, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 4, + "num_buffers_warp_spec": 0, + "num_consumer_groups": 0, + "reg_dec_producer": 0, + "reg_inc_consumer": 0, + "maxnreg": null, + "pre_hook": null + }, + "(9, 1024, 4096, 27, True, 'torch.float16', 'torch.float16', 'torch.uint32', 'torch.int64', 'torch.float16')": { + "kwargs": { + "B1": 64, + "B2": 64, + "BK": 64 + }, + "num_warps": 2, + "num_ctas": 1, + "num_stages": 4, + "num_buffers_warp_spec": 0, + "num_consumer_groups": 0, + "reg_dec_producer": 0, + "reg_inc_consumer": 0, + "maxnreg": null, + "pre_hook": null + }, + "(9, 1024, 1024, 27, True, 'torch.float16', 'torch.float16', 'torch.uint32', 'torch.int64', 'torch.float16')": { + "kwargs": { + "B1": 64, + "B2": 64, + "BK": 64 + }, + "num_warps": 2, + "num_ctas": 1, + "num_stages": 4, + "num_buffers_warp_spec": 0, + "num_consumer_groups": 0, + "reg_dec_producer": 0, + "reg_inc_consumer": 0, + "maxnreg": null, + "pre_hook": null + }, + "(11, 512, 128, 27, True, 'torch.float16', 'torch.float16', 'torch.uint32', 'torch.int64', 'torch.float16')": { + "kwargs": { + "B1": 64, + "B2": 256, + "BK": 64 + }, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 4, + "num_buffers_warp_spec": 0, + "num_consumer_groups": 0, + "reg_dec_producer": 0, + "reg_inc_consumer": 0, + "maxnreg": null, + "pre_hook": null + }, + "(13, 256, 1024, 27, True, 'torch.float16', 'torch.float16', 'torch.uint32', 'torch.int64', 'torch.float16')": { + "kwargs": { + "B1": 64, + "B2": 256, + "BK": 64 + }, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 4, + "num_buffers_warp_spec": 0, + "num_consumer_groups": 0, + "reg_dec_producer": 0, + "reg_inc_consumer": 0, + "maxnreg": null, + "pre_hook": null + }, + "(13, 256, 256, 27, True, 'torch.float16', 'torch.float16', 'torch.uint32', 'torch.int64', 'torch.float16')": { + "kwargs": { + "B1": 64, + "B2": 64, + "BK": 64 + }, + "num_warps": 2, + "num_ctas": 1, + "num_stages": 4, + "num_buffers_warp_spec": 0, + "num_consumer_groups": 0, + "reg_dec_producer": 0, + "reg_inc_consumer": 0, + "maxnreg": null, + "pre_hook": null + }, + "(13, 256, 64, 27, True, 'torch.float16', 'torch.float16', 'torch.uint32', 'torch.int64', 'torch.float16')": { + "kwargs": { + "B1": 128, + "B2": 128, + "BK": 64 + }, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 4, + "num_buffers_warp_spec": 0, + "num_consumer_groups": 0, + "reg_dec_producer": 0, + "reg_inc_consumer": 0, + "maxnreg": null, + "pre_hook": null + }, + "(17, 64, 64, 27, True, 'torch.float16', 'torch.float16', 'torch.uint32', 'torch.int64', 'torch.float16')": { + "kwargs": { + "B1": 256, + "B2": 64, + "BK": 32 + }, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 4, + "num_buffers_warp_spec": 0, + "num_consumer_groups": 0, + "reg_dec_producer": 0, + "reg_inc_consumer": 0, + "maxnreg": null, + "pre_hook": null + }, + "(15, 128, 512, 27, True, 'torch.float16', 'torch.float16', 'torch.uint32', 'torch.int64', 'torch.float16')": { + "kwargs": { + "B1": 128, + "B2": 128, + "BK": 64 + }, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 4, + "num_buffers_warp_spec": 0, + "num_consumer_groups": 0, + "reg_dec_producer": 0, + "reg_inc_consumer": 0, + "maxnreg": null, + "pre_hook": null + }, + "(15, 128, 128, 27, True, 'torch.float16', 'torch.float16', 'torch.uint32', 'torch.int64', 'torch.float16')": { + "kwargs": { + "B1": 128, + "B2": 128, + "BK": 64 + }, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 4, + "num_buffers_warp_spec": 0, + "num_consumer_groups": 0, + "reg_dec_producer": 0, + "reg_inc_consumer": 0, + "maxnreg": null, + "pre_hook": null + }, + "(15, 128, 32, 27, True, 'torch.float16', 'torch.float16', 'torch.uint32', 'torch.int64', 'torch.float16')": { + "kwargs": { + "B1": 128, + "B2": 128, + "BK": 32 + }, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 4, + "num_buffers_warp_spec": 0, + "num_consumer_groups": 0, + "reg_dec_producer": 0, + "reg_inc_consumer": 0, + "maxnreg": null, + "pre_hook": null + }, + "(17, 64, 16, 27, True, 'torch.float16', 'torch.float16', 'torch.uint32', 'torch.int64', 'torch.float16')": { + "kwargs": { + "B1": 128, + "B2": 64, + "BK": 32 + }, + "num_warps": 2, + "num_ctas": 1, + "num_stages": 4, + "num_buffers_warp_spec": 0, + "num_consumer_groups": 0, + "reg_dec_producer": 0, + "reg_inc_consumer": 0, + "maxnreg": null, + "pre_hook": null + }, + "(19, 64, 64, 27, True, 'torch.float16', 'torch.float16', 'torch.uint32', 'torch.int64', 'torch.float16')": { + "kwargs": { + "B1": 256, + "B2": 64, + "BK": 32 + }, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 4, + "num_buffers_warp_spec": 0, + "num_consumer_groups": 0, + "reg_dec_producer": 0, + "reg_inc_consumer": 0, + "maxnreg": null, + "pre_hook": null + }, + "(17, 128, 512, 27, True, 'torch.float16', 'torch.float16', 'torch.uint32', 'torch.int64', 'torch.float16')": { + "kwargs": { + "B1": 128, + "B2": 128, + "BK": 64 + }, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 4, + "num_buffers_warp_spec": 0, + "num_consumer_groups": 0, + "reg_dec_producer": 0, + "reg_inc_consumer": 0, + "maxnreg": null, + "pre_hook": null + }, + "(17, 128, 128, 27, True, 'torch.float16', 'torch.float16', 'torch.uint32', 'torch.int64', 'torch.float16')": { + "kwargs": { + "B1": 128, + "B2": 128, + "BK": 64 + }, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 4, + "num_buffers_warp_spec": 0, + "num_consumer_groups": 0, + "reg_dec_producer": 0, + "reg_inc_consumer": 0, + "maxnreg": null, + "pre_hook": null + }, + "(15, 256, 1024, 27, True, 'torch.float16', 'torch.float16', 'torch.uint32', 'torch.int64', 'torch.float16')": { + "kwargs": { + "B1": 64, + "B2": 256, + "BK": 64 + }, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 4, + "num_buffers_warp_spec": 0, + "num_consumer_groups": 0, + "reg_dec_producer": 0, + "reg_inc_consumer": 0, + "maxnreg": null, + "pre_hook": null + }, + "(15, 256, 256, 27, True, 'torch.float16', 'torch.float16', 'torch.uint32', 'torch.int64', 'torch.float16')": { + "kwargs": { + "B1": 128, + "B2": 128, + "BK": 64 + }, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 4, + "num_buffers_warp_spec": 0, + "num_consumer_groups": 0, + "reg_dec_producer": 0, + "reg_inc_consumer": 0, + "maxnreg": null, + "pre_hook": null + }, + "(13, 512, 2048, 27, True, 'torch.float16', 'torch.float16', 'torch.uint32', 'torch.int64', 'torch.float16')": { + "kwargs": { + "B1": 128, + "B2": 128, + "BK": 64 + }, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 4, + "num_buffers_warp_spec": 0, + "num_consumer_groups": 0, + "reg_dec_producer": 0, + "reg_inc_consumer": 0, + "maxnreg": null, + "pre_hook": null + }, + "(13, 512, 512, 27, True, 'torch.float16', 'torch.float16', 'torch.uint32', 'torch.int64', 'torch.float16')": { + "kwargs": { + "B1": 128, + "B2": 128, + "BK": 64 + }, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 4, + "num_buffers_warp_spec": 0, + "num_consumer_groups": 0, + "reg_dec_producer": 0, + "reg_inc_consumer": 0, + "maxnreg": null, + "pre_hook": null + }, + "(13, 512, 128, 27, True, 'torch.float16', 'torch.float16', 'torch.uint32', 'torch.int64', 'torch.float16')": { + "kwargs": { + "B1": 128, + "B2": 128, + "BK": 64 + }, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 4, + "num_buffers_warp_spec": 0, + "num_consumer_groups": 0, + "reg_dec_producer": 0, + "reg_inc_consumer": 0, + "maxnreg": null, + "pre_hook": null + }, + "(15, 256, 64, 27, True, 'torch.float16', 'torch.float16', 'torch.uint32', 'torch.int64', 'torch.float16')": { + "kwargs": { + "B1": 128, + "B2": 128, + "BK": 64 + }, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 4, + "num_buffers_warp_spec": 0, + "num_consumer_groups": 0, + "reg_dec_producer": 0, + "reg_inc_consumer": 0, + "maxnreg": null, + "pre_hook": null + }, + "(17, 128, 32, 27, True, 'torch.float16', 'torch.float16', 'torch.uint32', 'torch.int64', 'torch.float16')": { + "kwargs": { + "B1": 128, + "B2": 128, + "BK": 32 + }, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 4, + "num_buffers_warp_spec": 0, + "num_consumer_groups": 0, + "reg_dec_producer": 0, + "reg_inc_consumer": 0, + "maxnreg": null, + "pre_hook": null + }, + "(19, 64, 16, 27, True, 'torch.float16', 'torch.float16', 'torch.uint32', 'torch.int64', 'torch.float16')": { + "kwargs": { + "B1": 256, + "B2": 64, + "BK": 32 + }, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 4, + "num_buffers_warp_spec": 0, + "num_consumer_groups": 0, + "reg_dec_producer": 0, + "reg_inc_consumer": 0, + "maxnreg": null, + "pre_hook": null + }, + "(11, 1024, 4096, 27, True, 'torch.float16', 'torch.float16', 'torch.uint32', 'torch.int64', 'torch.float16')": { + "kwargs": { + "B1": 64, + "B2": 256, + "BK": 64 + }, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 4, + "num_buffers_warp_spec": 0, + "num_consumer_groups": 0, + "reg_dec_producer": 0, + "reg_inc_consumer": 0, + "maxnreg": null, + "pre_hook": null + }, + "(11, 1024, 1024, 27, True, 'torch.float16', 'torch.float16', 'torch.uint32', 'torch.int64', 'torch.float16')": { + "kwargs": { + "B1": 64, + "B2": 256, + "BK": 64 + }, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 4, + "num_buffers_warp_spec": 0, + "num_consumer_groups": 0, + "reg_dec_producer": 0, + "reg_inc_consumer": 0, + "maxnreg": null, + "pre_hook": null + }, + "(10, 512, 2048, 27, True, 'torch.float16', 'torch.float16', 'torch.uint32', 'torch.int64', 'torch.float16')": { + "kwargs": { + "B1": 64, + "B2": 64, + "BK": 64 + }, + "num_warps": 2, + "num_ctas": 1, + "num_stages": 4, + "num_buffers_warp_spec": 0, + "num_consumer_groups": 0, + "reg_dec_producer": 0, + "reg_inc_consumer": 0, + "maxnreg": null, + "pre_hook": null + }, + "(10, 512, 512, 27, True, 'torch.float16', 'torch.float16', 'torch.uint32', 'torch.int64', 'torch.float16')": { + "kwargs": { + "B1": 64, + "B2": 64, + "BK": 64 + }, + "num_warps": 2, + "num_ctas": 1, + "num_stages": 4, + "num_buffers_warp_spec": 0, + "num_consumer_groups": 0, + "reg_dec_producer": 0, + "reg_inc_consumer": 0, + "maxnreg": null, + "pre_hook": null + }, + "(10, 512, 128, 27, True, 'torch.float16', 'torch.float16', 'torch.uint32', 'torch.int64', 'torch.float16')": { + "kwargs": { + "B1": 64, + "B2": 64, + "BK": 64 + }, + "num_warps": 2, + "num_ctas": 1, + "num_stages": 4, + "num_buffers_warp_spec": 0, + "num_consumer_groups": 0, + "reg_dec_producer": 0, + "reg_inc_consumer": 0, + "maxnreg": null, + "pre_hook": null + }, + "(20, 64, 64, 27, True, 'torch.float16', 'torch.float16', 'torch.uint32', 'torch.int64', 'torch.float16')": { + "kwargs": { + "B1": 256, + "B2": 64, + "BK": 32 + }, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 4, + "num_buffers_warp_spec": 0, + "num_consumer_groups": 0, + "reg_dec_producer": 0, + "reg_inc_consumer": 0, + "maxnreg": null, + "pre_hook": null + }, + "(20, 64, 16, 27, True, 'torch.float16', 'torch.float16', 'torch.uint32', 'torch.int64', 'torch.float16')": { + "kwargs": { + "B1": 256, + "B2": 64, + "BK": 32 + }, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 4, + "num_buffers_warp_spec": 0, + "num_consumer_groups": 0, + "reg_dec_producer": 0, + "reg_inc_consumer": 0, + "maxnreg": null, + "pre_hook": null + }, + "(18, 128, 512, 27, True, 'torch.float16', 'torch.float16', 'torch.uint32', 'torch.int64', 'torch.float16')": { + "kwargs": { + "B1": 128, + "B2": 128, + "BK": 64 + }, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 4, + "num_buffers_warp_spec": 0, + "num_consumer_groups": 0, + "reg_dec_producer": 0, + "reg_inc_consumer": 0, + "maxnreg": null, + "pre_hook": null + }, + "(18, 128, 128, 27, True, 'torch.float16', 'torch.float16', 'torch.uint32', 'torch.int64', 'torch.float16')": { + "kwargs": { + "B1": 128, + "B2": 128, + "BK": 64 + }, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 4, + "num_buffers_warp_spec": 0, + "num_consumer_groups": 0, + "reg_dec_producer": 0, + "reg_inc_consumer": 0, + "maxnreg": null, + "pre_hook": null + }, + "(18, 128, 32, 27, True, 'torch.float16', 'torch.float16', 'torch.uint32', 'torch.int64', 'torch.float16')": { + "kwargs": { + "B1": 128, + "B2": 128, + "BK": 32 + }, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 4, + "num_buffers_warp_spec": 0, + "num_consumer_groups": 0, + "reg_dec_producer": 0, + "reg_inc_consumer": 0, + "maxnreg": null, + "pre_hook": null + }, + "(16, 256, 1024, 27, True, 'torch.float16', 'torch.float16', 'torch.uint32', 'torch.int64', 'torch.float16')": { + "kwargs": { + "B1": 128, + "B2": 256, + "BK": 64 + }, + "num_warps": 8, + "num_ctas": 1, + "num_stages": 4, + "num_buffers_warp_spec": 0, + "num_consumer_groups": 0, + "reg_dec_producer": 0, + "reg_inc_consumer": 0, + "maxnreg": null, + "pre_hook": null + }, + "(16, 256, 256, 27, True, 'torch.float16', 'torch.float16', 'torch.uint32', 'torch.int64', 'torch.float16')": { + "kwargs": { + "B1": 128, + "B2": 256, + "BK": 64 + }, + "num_warps": 8, + "num_ctas": 1, + "num_stages": 4, + "num_buffers_warp_spec": 0, + "num_consumer_groups": 0, + "reg_dec_producer": 0, + "reg_inc_consumer": 0, + "maxnreg": null, + "pre_hook": null + }, + "(16, 256, 64, 27, True, 'torch.float16', 'torch.float16', 'torch.uint32', 'torch.int64', 'torch.float16')": { + "kwargs": { + "B1": 128, + "B2": 128, + "BK": 64 + }, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 4, + "num_buffers_warp_spec": 0, + "num_consumer_groups": 0, + "reg_dec_producer": 0, + "reg_inc_consumer": 0, + "maxnreg": null, + "pre_hook": null + }, + "(12, 256, 1024, 27, True, 'torch.float16', 'torch.float16', 'torch.uint32', 'torch.int64', 'torch.float16')": { + "kwargs": { + "B1": 128, + "B2": 128, + "BK": 64 + }, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 4, + "num_buffers_warp_spec": 0, + "num_consumer_groups": 0, + "reg_dec_producer": 0, + "reg_inc_consumer": 0, + "maxnreg": null, + "pre_hook": null + }, + "(12, 256, 256, 27, True, 'torch.float16', 'torch.float16', 'torch.uint32', 'torch.int64', 'torch.float16')": { + "kwargs": { + "B1": 128, + "B2": 128, + "BK": 64 + }, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 4, + "num_buffers_warp_spec": 0, + "num_consumer_groups": 0, + "reg_dec_producer": 0, + "reg_inc_consumer": 0, + "maxnreg": null, + "pre_hook": null + }, + "(12, 256, 64, 27, True, 'torch.float16', 'torch.float16', 'torch.uint32', 'torch.int64', 'torch.float16')": { + "kwargs": { + "B1": 128, + "B2": 128, + "BK": 64 + }, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 4, + "num_buffers_warp_spec": 0, + "num_consumer_groups": 0, + "reg_dec_producer": 0, + "reg_inc_consumer": 0, + "maxnreg": null, + "pre_hook": null + }, + "(21, 64, 64, 27, True, 'torch.float16', 'torch.float16', 'torch.uint32', 'torch.int64', 'torch.float16')": { + "kwargs": { + "B1": 256, + "B2": 64, + "BK": 32 + }, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 4, + "num_buffers_warp_spec": 0, + "num_consumer_groups": 0, + "reg_dec_producer": 0, + "reg_inc_consumer": 0, + "maxnreg": null, + "pre_hook": null + }, + "(21, 64, 16, 27, True, 'torch.float16', 'torch.float16', 'torch.uint32', 'torch.int64', 'torch.float16')": { + "kwargs": { + "B1": 256, + "B2": 64, + "BK": 32 + }, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 4, + "num_buffers_warp_spec": 0, + "num_consumer_groups": 0, + "reg_dec_producer": 0, + "reg_inc_consumer": 0, + "maxnreg": null, + "pre_hook": null + }, + "(19, 128, 512, 27, True, 'torch.float16', 'torch.float16', 'torch.uint32', 'torch.int64', 'torch.float16')": { + "kwargs": { + "B1": 128, + "B2": 128, + "BK": 64 + }, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 4, + "num_buffers_warp_spec": 0, + "num_consumer_groups": 0, + "reg_dec_producer": 0, + "reg_inc_consumer": 0, + "maxnreg": null, + "pre_hook": null + }, + "(19, 128, 128, 27, True, 'torch.float16', 'torch.float16', 'torch.uint32', 'torch.int64', 'torch.float16')": { + "kwargs": { + "B1": 128, + "B2": 128, + "BK": 64 + }, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 4, + "num_buffers_warp_spec": 0, + "num_consumer_groups": 0, + "reg_dec_producer": 0, + "reg_inc_consumer": 0, + "maxnreg": null, + "pre_hook": null + }, + "(17, 256, 1024, 27, True, 'torch.float16', 'torch.float16', 'torch.uint32', 'torch.int64', 'torch.float16')": { + "kwargs": { + "B1": 128, + "B2": 256, + "BK": 64 + }, + "num_warps": 8, + "num_ctas": 1, + "num_stages": 4, + "num_buffers_warp_spec": 0, + "num_consumer_groups": 0, + "reg_dec_producer": 0, + "reg_inc_consumer": 0, + "maxnreg": null, + "pre_hook": null + }, + "(17, 256, 256, 27, True, 'torch.float16', 'torch.float16', 'torch.uint32', 'torch.int64', 'torch.float16')": { + "kwargs": { + "B1": 128, + "B2": 256, + "BK": 64 + }, + "num_warps": 8, + "num_ctas": 1, + "num_stages": 4, + "num_buffers_warp_spec": 0, + "num_consumer_groups": 0, + "reg_dec_producer": 0, + "reg_inc_consumer": 0, + "maxnreg": null, + "pre_hook": null + }, + "(14, 512, 2048, 27, True, 'torch.float16', 'torch.float16', 'torch.uint32', 'torch.int64', 'torch.float16')": { + "kwargs": { + "B1": 128, + "B2": 256, + "BK": 64 + }, + "num_warps": 8, + "num_ctas": 1, + "num_stages": 4, + "num_buffers_warp_spec": 0, + "num_consumer_groups": 0, + "reg_dec_producer": 0, + "reg_inc_consumer": 0, + "maxnreg": null, + "pre_hook": null + }, + "(14, 512, 512, 27, True, 'torch.float16', 'torch.float16', 'torch.uint32', 'torch.int64', 'torch.float16')": { + "kwargs": { + "B1": 128, + "B2": 128, + "BK": 64 + }, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 4, + "num_buffers_warp_spec": 0, + "num_consumer_groups": 0, + "reg_dec_producer": 0, + "reg_inc_consumer": 0, + "maxnreg": null, + "pre_hook": null + }, + "(14, 512, 128, 27, True, 'torch.float16', 'torch.float16', 'torch.uint32', 'torch.int64', 'torch.float16')": { + "kwargs": { + "B1": 128, + "B2": 128, + "BK": 64 + }, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 4, + "num_buffers_warp_spec": 0, + "num_consumer_groups": 0, + "reg_dec_producer": 0, + "reg_inc_consumer": 0, + "maxnreg": null, + "pre_hook": null + }, + "(17, 256, 64, 27, True, 'torch.float16', 'torch.float16', 'torch.uint32', 'torch.int64', 'torch.float16')": { + "kwargs": { + "B1": 128, + "B2": 128, + "BK": 64 + }, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 4, + "num_buffers_warp_spec": 0, + "num_consumer_groups": 0, + "reg_dec_producer": 0, + "reg_inc_consumer": 0, + "maxnreg": null, + "pre_hook": null + }, + "(19, 128, 32, 27, True, 'torch.float16', 'torch.float16', 'torch.uint32', 'torch.int64', 'torch.float16')": { + "kwargs": { + "B1": 128, + "B2": 128, + "BK": 32 + }, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 4, + "num_buffers_warp_spec": 0, + "num_consumer_groups": 0, + "reg_dec_producer": 0, + "reg_inc_consumer": 0, + "maxnreg": null, + "pre_hook": null + }, + "(14, 128, 512, 27, True, 'torch.float16', 'torch.float16', 'torch.uint32', 'torch.int64', 'torch.float16')": { + "kwargs": { + "B1": 128, + "B2": 128, + "BK": 64 + }, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 4, + "num_buffers_warp_spec": 0, + "num_consumer_groups": 0, + "reg_dec_producer": 0, + "reg_inc_consumer": 0, + "maxnreg": null, + "pre_hook": null + }, + "(14, 128, 128, 27, True, 'torch.float16', 'torch.float16', 'torch.uint32', 'torch.int64', 'torch.float16')": { + "kwargs": { + "B1": 128, + "B2": 128, + "BK": 64 + }, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 4, + "num_buffers_warp_spec": 0, + "num_consumer_groups": 0, + "reg_dec_producer": 0, + "reg_inc_consumer": 0, + "maxnreg": null, + "pre_hook": null + }, + "(14, 128, 32, 27, True, 'torch.float16', 'torch.float16', 'torch.uint32', 'torch.int64', 'torch.float16')": { + "kwargs": { + "B1": 128, + "B2": 128, + "BK": 32 + }, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 4, + "num_buffers_warp_spec": 0, + "num_consumer_groups": 0, + "reg_dec_producer": 0, + "reg_inc_consumer": 0, + "maxnreg": null, + "pre_hook": null + }, + "(16, 64, 64, 27, True, 'torch.float16', 'torch.float16', 'torch.uint32', 'torch.int64', 'torch.float16')": { + "kwargs": { + "B1": 64, + "B2": 64, + "BK": 64 + }, + "num_warps": 2, + "num_ctas": 1, + "num_stages": 4, + "num_buffers_warp_spec": 0, + "num_consumer_groups": 0, + "reg_dec_producer": 0, + "reg_inc_consumer": 0, + "maxnreg": null, + "pre_hook": null + }, + "(16, 64, 16, 27, True, 'torch.float16', 'torch.float16', 'torch.uint32', 'torch.int64', 'torch.float16')": { + "kwargs": { + "B1": 128, + "B2": 64, + "BK": 32 + }, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 4, + "num_buffers_warp_spec": 0, + "num_consumer_groups": 0, + "reg_dec_producer": 0, + "reg_inc_consumer": 0, + "maxnreg": null, + "pre_hook": null + }, + "(22, 64, 64, 27, True, 'torch.float16', 'torch.float16', 'torch.uint32', 'torch.int64', 'torch.float16')": { + "kwargs": { + "B1": 256, + "B2": 64, + "BK": 32 + }, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 4, + "num_buffers_warp_spec": 0, + "num_consumer_groups": 0, + "reg_dec_producer": 0, + "reg_inc_consumer": 0, + "maxnreg": null, + "pre_hook": null + }, + "(22, 64, 16, 27, True, 'torch.float16', 'torch.float16', 'torch.uint32', 'torch.int64', 'torch.float16')": { + "kwargs": { + "B1": 256, + "B2": 64, + "BK": 32 + }, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 4, + "num_buffers_warp_spec": 0, + "num_consumer_groups": 0, + "reg_dec_producer": 0, + "reg_inc_consumer": 0, + "maxnreg": null, + "pre_hook": null + }, + "(7, 1024, 1024, 27, True, 'torch.float16', 'torch.float16', 'torch.uint32', 'torch.int64', 'torch.float16')": { + "kwargs": { + "B1": 64, + "B2": 64, + "BK": 64 + }, + "num_warps": 2, + "num_ctas": 1, + "num_stages": 4, + "num_buffers_warp_spec": 0, + "num_consumer_groups": 0, + "reg_dec_producer": 0, + "reg_inc_consumer": 0, + "maxnreg": null, + "pre_hook": null + }, + "(13, 1024, 1024, 27, True, 'torch.float16', 'torch.float16', 'torch.uint32', 'torch.int64', 'torch.float16')": { + "kwargs": { + "B1": 256, + "B2": 128, + "BK": 64 + }, + "num_warps": 8, + "num_ctas": 1, + "num_stages": 4, + "num_buffers_warp_spec": 0, + "num_consumer_groups": 0, + "reg_dec_producer": 0, + "reg_inc_consumer": 0, + "maxnreg": null, + "pre_hook": null + }, + "(15, 512, 512, 27, True, 'torch.float16', 'torch.float16', 'torch.uint32', 'torch.int64', 'torch.float16')": { + "kwargs": { + "B1": 128, + "B2": 128, + "BK": 32 + }, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 4, + "num_buffers_warp_spec": 0, + "num_consumer_groups": 0, + "reg_dec_producer": 0, + "reg_inc_consumer": 0, + "maxnreg": null, + "pre_hook": null + }, + "(7, 1024, 1024, 27, True, 'torch.float32', 'torch.float32', 'torch.uint32', 'torch.int64', 'torch.float32')": { + "kwargs": { + "B1": 64, + "B2": 64, + "BK": 64 + }, + "num_warps": 2, + "num_ctas": 1, + "num_stages": 4, + "num_buffers_warp_spec": 0, + "num_consumer_groups": 0, + "reg_dec_producer": 0, + "reg_inc_consumer": 0, + "maxnreg": null, + "pre_hook": null + }, + "(9, 1024, 1024, 27, True, 'torch.float32', 'torch.float32', 'torch.uint32', 'torch.int64', 'torch.float32')": { + "kwargs": { + "B1": 64, + "B2": 128, + "BK": 32 + }, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 4, + "num_buffers_warp_spec": 0, + "num_consumer_groups": 0, + "reg_dec_producer": 0, + "reg_inc_consumer": 0, + "maxnreg": null, + "pre_hook": null + }, + "(11, 1024, 1024, 27, True, 'torch.float32', 'torch.float32', 'torch.uint32', 'torch.int64', 'torch.float32')": { + "kwargs": { + "B1": 64, + "B2": 256, + "BK": 32 + }, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 4, + "num_buffers_warp_spec": 0, + "num_consumer_groups": 0, + "reg_dec_producer": 0, + "reg_inc_consumer": 0, + "maxnreg": null, + "pre_hook": null + }, + "(13, 1024, 1024, 27, True, 'torch.float32', 'torch.float32', 'torch.uint32', 'torch.int64', 'torch.float32')": { + "kwargs": { + "B1": 128, + "B2": 128, + "BK": 64 + }, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 4, + "num_buffers_warp_spec": 0, + "num_consumer_groups": 0, + "reg_dec_producer": 0, + "reg_inc_consumer": 0, + "maxnreg": null, + "pre_hook": null + }, + "(15, 512, 512, 27, True, 'torch.float32', 'torch.float32', 'torch.uint32', 'torch.int64', 'torch.float32')": { + "kwargs": { + "B1": 128, + "B2": 128, + "BK": 32 + }, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 4, + "num_buffers_warp_spec": 0, + "num_consumer_groups": 0, + "reg_dec_producer": 0, + "reg_inc_consumer": 0, + "maxnreg": null, + "pre_hook": null + }, + "(17, 256, 256, 27, True, 'torch.float32', 'torch.float32', 'torch.uint32', 'torch.int64', 'torch.float32')": { + "kwargs": { + "B1": 64, + "B2": 256, + "BK": 32 + }, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 4, + "num_buffers_warp_spec": 0, + "num_consumer_groups": 0, + "reg_dec_producer": 0, + "reg_inc_consumer": 0, + "maxnreg": null, + "pre_hook": null + } + }, + "flex_gemm.kernels.triton.spconv.sparse_submanifold_conv_bwd_masked_implicit_gemm_splitk.sparse_submanifold_conv_bwd_weight_masked_implicit_gemm_kernel": { + "(16, 128, 512, 27, True, 'torch.float16', 'torch.float16', 'torch.uint32', 'torch.uint32', 'torch.uint32', 'torch.float16')": { + "kwargs": { + "B1": 128, + "B2": 128, + "BK": 64 + }, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 4, + "num_buffers_warp_spec": 0, + "num_consumer_groups": 0, + "reg_dec_producer": 0, + "reg_inc_consumer": 0, + "maxnreg": null, + "pre_hook": null + }, + "(14, 256, 1024, 27, True, 'torch.float16', 'torch.float16', 'torch.uint32', 'torch.uint32', 'torch.uint32', 'torch.float16')": { + "kwargs": { + "B1": 128, + "B2": 128, + "BK": 64 + }, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 4, + "num_buffers_warp_spec": 0, + "num_consumer_groups": 0, + "reg_dec_producer": 0, + "reg_inc_consumer": 0, + "maxnreg": null, + "pre_hook": null + }, + "(14, 256, 256, 27, True, 'torch.float16', 'torch.float16', 'torch.uint32', 'torch.uint32', 'torch.uint32', 'torch.float16')": { + "kwargs": { + "B1": 128, + "B2": 128, + "BK": 64 + }, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 4, + "num_buffers_warp_spec": 0, + "num_consumer_groups": 0, + "reg_dec_producer": 0, + "reg_inc_consumer": 0, + "maxnreg": null, + "pre_hook": null + }, + "(12, 512, 2048, 27, True, 'torch.float16', 'torch.float16', 'torch.uint32', 'torch.uint32', 'torch.uint32', 'torch.float16')": { + "kwargs": { + "B1": 128, + "B2": 128, + "BK": 64 + }, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 4, + "num_buffers_warp_spec": 0, + "num_consumer_groups": 0, + "reg_dec_producer": 0, + "reg_inc_consumer": 0, + "maxnreg": null, + "pre_hook": null + }, + "(12, 512, 512, 27, True, 'torch.float16', 'torch.float16', 'torch.uint32', 'torch.uint32', 'torch.uint32', 'torch.float16')": { + "kwargs": { + "B1": 128, + "B2": 128, + "BK": 64 + }, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 4, + "num_buffers_warp_spec": 0, + "num_consumer_groups": 0, + "reg_dec_producer": 0, + "reg_inc_consumer": 0, + "maxnreg": null, + "pre_hook": null + }, + "(10, 1024, 4096, 27, True, 'torch.float16', 'torch.float16', 'torch.uint32', 'torch.uint32', 'torch.uint32', 'torch.float16')": { + "kwargs": { + "B1": 128, + "B2": 128, + "BK": 64 + }, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 4, + "num_buffers_warp_spec": 0, + "num_consumer_groups": 0, + "reg_dec_producer": 0, + "reg_inc_consumer": 0, + "maxnreg": null, + "pre_hook": null + }, + "(10, 1024, 1024, 27, True, 'torch.float16', 'torch.float16', 'torch.uint32', 'torch.uint32', 'torch.uint32', 'torch.float16')": { + "kwargs": { + "B1": 128, + "B2": 128, + "BK": 32 + }, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 4, + "num_buffers_warp_spec": 0, + "num_consumer_groups": 0, + "reg_dec_producer": 0, + "reg_inc_consumer": 0, + "maxnreg": null, + "pre_hook": null + }, + "(12, 512, 128, 27, True, 'torch.float16', 'torch.float16', 'torch.uint32', 'torch.uint32', 'torch.uint32', 'torch.float16')": { + "kwargs": { + "B1": 128, + "B2": 128, + "BK": 64 + }, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 4, + "num_buffers_warp_spec": 0, + "num_consumer_groups": 0, + "reg_dec_producer": 0, + "reg_inc_consumer": 0, + "maxnreg": null, + "pre_hook": null + }, + "(11, 512, 2048, 27, True, 'torch.float16', 'torch.float16', 'torch.uint32', 'torch.uint32', 'torch.uint32', 'torch.float16')": { + "kwargs": { + "B1": 128, + "B2": 128, + "BK": 64 + }, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 4, + "num_buffers_warp_spec": 0, + "num_consumer_groups": 0, + "reg_dec_producer": 0, + "reg_inc_consumer": 0, + "maxnreg": null, + "pre_hook": null + }, + "(11, 512, 512, 27, True, 'torch.float16', 'torch.float16', 'torch.uint32', 'torch.uint32', 'torch.uint32', 'torch.float16')": { + "kwargs": { + "B1": 128, + "B2": 128, + "BK": 32 + }, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 4, + "num_buffers_warp_spec": 0, + "num_consumer_groups": 0, + "reg_dec_producer": 0, + "reg_inc_consumer": 0, + "maxnreg": null, + "pre_hook": null + }, + "(9, 1024, 4096, 27, True, 'torch.float16', 'torch.float16', 'torch.uint32', 'torch.uint32', 'torch.uint32', 'torch.float16')": { + "kwargs": { + "B1": 128, + "B2": 128, + "BK": 32 + }, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 4, + "num_buffers_warp_spec": 0, + "num_consumer_groups": 0, + "reg_dec_producer": 0, + "reg_inc_consumer": 0, + "maxnreg": null, + "pre_hook": null + }, + "(9, 1024, 1024, 27, True, 'torch.float16', 'torch.float16', 'torch.uint32', 'torch.uint32', 'torch.uint32', 'torch.float16')": { + "kwargs": { + "B1": 128, + "B2": 128, + "BK": 32 + }, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 4, + "num_buffers_warp_spec": 0, + "num_consumer_groups": 0, + "reg_dec_producer": 0, + "reg_inc_consumer": 0, + "maxnreg": null, + "pre_hook": null + }, + "(11, 512, 128, 27, True, 'torch.float16', 'torch.float16', 'torch.uint32', 'torch.uint32', 'torch.uint32', 'torch.float16')": { + "kwargs": { + "B1": 128, + "B2": 128, + "BK": 64 + }, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 4, + "num_buffers_warp_spec": 0, + "num_consumer_groups": 0, + "reg_dec_producer": 0, + "reg_inc_consumer": 0, + "maxnreg": null, + "pre_hook": null + }, + "(13, 256, 1024, 27, True, 'torch.float16', 'torch.float16', 'torch.uint32', 'torch.uint32', 'torch.uint32', 'torch.float16')": { + "kwargs": { + "B1": 128, + "B2": 128, + "BK": 64 + }, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 4, + "num_buffers_warp_spec": 0, + "num_consumer_groups": 0, + "reg_dec_producer": 0, + "reg_inc_consumer": 0, + "maxnreg": null, + "pre_hook": null + }, + "(13, 256, 256, 27, True, 'torch.float16', 'torch.float16', 'torch.uint32', 'torch.uint32', 'torch.uint32', 'torch.float16')": { + "kwargs": { + "B1": 128, + "B2": 128, + "BK": 64 + }, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 4, + "num_buffers_warp_spec": 0, + "num_consumer_groups": 0, + "reg_dec_producer": 0, + "reg_inc_consumer": 0, + "maxnreg": null, + "pre_hook": null + }, + "(15, 128, 512, 27, True, 'torch.float16', 'torch.float16', 'torch.uint32', 'torch.uint32', 'torch.uint32', 'torch.float16')": { + "kwargs": { + "B1": 128, + "B2": 128, + "BK": 64 + }, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 4, + "num_buffers_warp_spec": 0, + "num_consumer_groups": 0, + "reg_dec_producer": 0, + "reg_inc_consumer": 0, + "maxnreg": null, + "pre_hook": null + }, + "(8, 1024, 4096, 27, True, 'torch.float16', 'torch.float16', 'torch.uint32', 'torch.uint32', 'torch.uint32', 'torch.float16')": { + "kwargs": { + "B1": 128, + "B2": 128, + "BK": 32 + }, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 4, + "num_buffers_warp_spec": 0, + "num_consumer_groups": 0, + "reg_dec_producer": 0, + "reg_inc_consumer": 0, + "maxnreg": null, + "pre_hook": null + }, + "(8, 1024, 1024, 27, True, 'torch.float16', 'torch.float16', 'torch.uint32', 'torch.uint32', 'torch.uint32', 'torch.float16')": { + "kwargs": { + "B1": 128, + "B2": 128, + "BK": 32 + }, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 4, + "num_buffers_warp_spec": 0, + "num_consumer_groups": 0, + "reg_dec_producer": 0, + "reg_inc_consumer": 0, + "maxnreg": null, + "pre_hook": null + }, + "(17, 128, 512, 27, True, 'torch.float16', 'torch.float16', 'torch.uint32', 'torch.uint32', 'torch.uint32', 'torch.float16')": { + "kwargs": { + "B1": 128, + "B2": 128, + "BK": 64 + }, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 4, + "num_buffers_warp_spec": 0, + "num_consumer_groups": 0, + "reg_dec_producer": 0, + "reg_inc_consumer": 0, + "maxnreg": null, + "pre_hook": null + }, + "(15, 256, 1024, 27, True, 'torch.float16', 'torch.float16', 'torch.uint32', 'torch.uint32', 'torch.uint32', 'torch.float16')": { + "kwargs": { + "B1": 128, + "B2": 128, + "BK": 64 + }, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 4, + "num_buffers_warp_spec": 0, + "num_consumer_groups": 0, + "reg_dec_producer": 0, + "reg_inc_consumer": 0, + "maxnreg": null, + "pre_hook": null + }, + "(15, 256, 256, 27, True, 'torch.float16', 'torch.float16', 'torch.uint32', 'torch.uint32', 'torch.uint32', 'torch.float16')": { + "kwargs": { + "B1": 64, + "B2": 64, + "BK": 64 + }, + "num_warps": 2, + "num_ctas": 1, + "num_stages": 4, + "num_buffers_warp_spec": 0, + "num_consumer_groups": 0, + "reg_dec_producer": 0, + "reg_inc_consumer": 0, + "maxnreg": null, + "pre_hook": null + }, + "(13, 512, 2048, 27, True, 'torch.float16', 'torch.float16', 'torch.uint32', 'torch.uint32', 'torch.uint32', 'torch.float16')": { + "kwargs": { + "B1": 128, + "B2": 128, + "BK": 64 + }, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 4, + "num_buffers_warp_spec": 0, + "num_consumer_groups": 0, + "reg_dec_producer": 0, + "reg_inc_consumer": 0, + "maxnreg": null, + "pre_hook": null + }, + "(13, 512, 512, 27, True, 'torch.float16', 'torch.float16', 'torch.uint32', 'torch.uint32', 'torch.uint32', 'torch.float16')": { + "kwargs": { + "B1": 128, + "B2": 128, + "BK": 64 + }, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 4, + "num_buffers_warp_spec": 0, + "num_consumer_groups": 0, + "reg_dec_producer": 0, + "reg_inc_consumer": 0, + "maxnreg": null, + "pre_hook": null + }, + "(13, 512, 128, 27, True, 'torch.float16', 'torch.float16', 'torch.uint32', 'torch.uint32', 'torch.uint32', 'torch.float16')": { + "kwargs": { + "B1": 128, + "B2": 128, + "BK": 64 + }, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 4, + "num_buffers_warp_spec": 0, + "num_consumer_groups": 0, + "reg_dec_producer": 0, + "reg_inc_consumer": 0, + "maxnreg": null, + "pre_hook": null + }, + "(11, 1024, 4096, 27, True, 'torch.float16', 'torch.float16', 'torch.uint32', 'torch.uint32', 'torch.uint32', 'torch.float16')": { + "kwargs": { + "B1": 128, + "B2": 128, + "BK": 64 + }, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 4, + "num_buffers_warp_spec": 0, + "num_consumer_groups": 0, + "reg_dec_producer": 0, + "reg_inc_consumer": 0, + "maxnreg": null, + "pre_hook": null + }, + "(11, 1024, 1024, 27, True, 'torch.float16', 'torch.float16', 'torch.uint32', 'torch.uint32', 'torch.uint32', 'torch.float16')": { + "kwargs": { + "B1": 128, + "B2": 128, + "BK": 32 + }, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 4, + "num_buffers_warp_spec": 0, + "num_consumer_groups": 0, + "reg_dec_producer": 0, + "reg_inc_consumer": 0, + "maxnreg": null, + "pre_hook": null + }, + "(10, 512, 2048, 27, True, 'torch.float16', 'torch.float16', 'torch.uint32', 'torch.uint32', 'torch.uint32', 'torch.float16')": { + "kwargs": { + "B1": 128, + "B2": 128, + "BK": 32 + }, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 4, + "num_buffers_warp_spec": 0, + "num_consumer_groups": 0, + "reg_dec_producer": 0, + "reg_inc_consumer": 0, + "maxnreg": null, + "pre_hook": null + }, + "(10, 512, 512, 27, True, 'torch.float16', 'torch.float16', 'torch.uint32', 'torch.uint32', 'torch.uint32', 'torch.float16')": { + "kwargs": { + "B1": 128, + "B2": 128, + "BK": 32 + }, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 4, + "num_buffers_warp_spec": 0, + "num_consumer_groups": 0, + "reg_dec_producer": 0, + "reg_inc_consumer": 0, + "maxnreg": null, + "pre_hook": null + }, + "(10, 512, 128, 27, True, 'torch.float16', 'torch.float16', 'torch.uint32', 'torch.uint32', 'torch.uint32', 'torch.float16')": { + "kwargs": { + "B1": 128, + "B2": 128, + "BK": 64 + }, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 4, + "num_buffers_warp_spec": 0, + "num_consumer_groups": 0, + "reg_dec_producer": 0, + "reg_inc_consumer": 0, + "maxnreg": null, + "pre_hook": null + }, + "(18, 128, 512, 27, True, 'torch.float16', 'torch.float16', 'torch.uint32', 'torch.uint32', 'torch.uint32', 'torch.float16')": { + "kwargs": { + "B1": 128, + "B2": 128, + "BK": 64 + }, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 4, + "num_buffers_warp_spec": 0, + "num_consumer_groups": 0, + "reg_dec_producer": 0, + "reg_inc_consumer": 0, + "maxnreg": null, + "pre_hook": null + }, + "(16, 256, 1024, 27, True, 'torch.float16', 'torch.float16', 'torch.uint32', 'torch.uint32', 'torch.uint32', 'torch.float16')": { + "kwargs": { + "B1": 256, + "B2": 128, + "BK": 64 + }, + "num_warps": 8, + "num_ctas": 1, + "num_stages": 4, + "num_buffers_warp_spec": 0, + "num_consumer_groups": 0, + "reg_dec_producer": 0, + "reg_inc_consumer": 0, + "maxnreg": null, + "pre_hook": null + }, + "(16, 256, 256, 27, True, 'torch.float16', 'torch.float16', 'torch.uint32', 'torch.uint32', 'torch.uint32', 'torch.float16')": { + "kwargs": { + "B1": 128, + "B2": 128, + "BK": 64 + }, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 4, + "num_buffers_warp_spec": 0, + "num_consumer_groups": 0, + "reg_dec_producer": 0, + "reg_inc_consumer": 0, + "maxnreg": null, + "pre_hook": null + }, + "(12, 256, 1024, 27, True, 'torch.float16', 'torch.float16', 'torch.uint32', 'torch.uint32', 'torch.uint32', 'torch.float16')": { + "kwargs": { + "B1": 128, + "B2": 128, + "BK": 64 + }, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 4, + "num_buffers_warp_spec": 0, + "num_consumer_groups": 0, + "reg_dec_producer": 0, + "reg_inc_consumer": 0, + "maxnreg": null, + "pre_hook": null + }, + "(12, 256, 256, 27, True, 'torch.float16', 'torch.float16', 'torch.uint32', 'torch.uint32', 'torch.uint32', 'torch.float16')": { + "kwargs": { + "B1": 128, + "B2": 128, + "BK": 64 + }, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 4, + "num_buffers_warp_spec": 0, + "num_consumer_groups": 0, + "reg_dec_producer": 0, + "reg_inc_consumer": 0, + "maxnreg": null, + "pre_hook": null + }, + "(19, 128, 512, 27, True, 'torch.float16', 'torch.float16', 'torch.uint32', 'torch.uint32', 'torch.uint32', 'torch.float16')": { + "kwargs": { + "B1": 128, + "B2": 128, + "BK": 64 + }, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 4, + "num_buffers_warp_spec": 0, + "num_consumer_groups": 0, + "reg_dec_producer": 0, + "reg_inc_consumer": 0, + "maxnreg": null, + "pre_hook": null + }, + "(17, 256, 1024, 27, True, 'torch.float16', 'torch.float16', 'torch.uint32', 'torch.uint32', 'torch.uint32', 'torch.float16')": { + "kwargs": { + "B1": 256, + "B2": 128, + "BK": 64 + }, + "num_warps": 8, + "num_ctas": 1, + "num_stages": 4, + "num_buffers_warp_spec": 0, + "num_consumer_groups": 0, + "reg_dec_producer": 0, + "reg_inc_consumer": 0, + "maxnreg": null, + "pre_hook": null + }, + "(17, 256, 256, 27, True, 'torch.float16', 'torch.float16', 'torch.uint32', 'torch.uint32', 'torch.uint32', 'torch.float16')": { + "kwargs": { + "B1": 128, + "B2": 128, + "BK": 64 + }, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 4, + "num_buffers_warp_spec": 0, + "num_consumer_groups": 0, + "reg_dec_producer": 0, + "reg_inc_consumer": 0, + "maxnreg": null, + "pre_hook": null + }, + "(14, 512, 2048, 27, True, 'torch.float16', 'torch.float16', 'torch.uint32', 'torch.uint32', 'torch.uint32', 'torch.float16')": { + "kwargs": { + "B1": 128, + "B2": 256, + "BK": 64 + }, + "num_warps": 8, + "num_ctas": 1, + "num_stages": 4, + "num_buffers_warp_spec": 0, + "num_consumer_groups": 0, + "reg_dec_producer": 0, + "reg_inc_consumer": 0, + "maxnreg": null, + "pre_hook": null + }, + "(14, 512, 512, 27, True, 'torch.float16', 'torch.float16', 'torch.uint32', 'torch.uint32', 'torch.uint32', 'torch.float16')": { + "kwargs": { + "B1": 128, + "B2": 128, + "BK": 64 + }, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 4, + "num_buffers_warp_spec": 0, + "num_consumer_groups": 0, + "reg_dec_producer": 0, + "reg_inc_consumer": 0, + "maxnreg": null, + "pre_hook": null + }, + "(14, 512, 128, 27, True, 'torch.float16', 'torch.float16', 'torch.uint32', 'torch.uint32', 'torch.uint32', 'torch.float16')": { + "kwargs": { + "B1": 128, + "B2": 128, + "BK": 64 + }, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 4, + "num_buffers_warp_spec": 0, + "num_consumer_groups": 0, + "reg_dec_producer": 0, + "reg_inc_consumer": 0, + "maxnreg": null, + "pre_hook": null + }, + "(7, 1024, 4096, 27, True, 'torch.float16', 'torch.float16', 'torch.uint32', 'torch.uint32', 'torch.uint32', 'torch.float16')": { + "kwargs": { + "B1": 128, + "B2": 128, + "BK": 32 + }, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 4, + "num_buffers_warp_spec": 0, + "num_consumer_groups": 0, + "reg_dec_producer": 0, + "reg_inc_consumer": 0, + "maxnreg": null, + "pre_hook": null + }, + "(7, 1024, 1024, 27, True, 'torch.float16', 'torch.float16', 'torch.uint32', 'torch.uint32', 'torch.uint32', 'torch.float16')": { + "kwargs": { + "B1": 128, + "B2": 128, + "BK": 32 + }, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 4, + "num_buffers_warp_spec": 0, + "num_consumer_groups": 0, + "reg_dec_producer": 0, + "reg_inc_consumer": 0, + "maxnreg": null, + "pre_hook": null + }, + "(14, 128, 512, 27, True, 'torch.float16', 'torch.float16', 'torch.uint32', 'torch.uint32', 'torch.uint32', 'torch.float16')": { + "kwargs": { + "B1": 128, + "B2": 128, + "BK": 64 + }, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 4, + "num_buffers_warp_spec": 0, + "num_consumer_groups": 0, + "reg_dec_producer": 0, + "reg_inc_consumer": 0, + "maxnreg": null, + "pre_hook": null + }, + "(13, 1024, 1024, 27, True, 'torch.float16', 'torch.float16', 'torch.uint32', 'torch.uint32', 'torch.uint32', 'torch.float16')": { + "kwargs": { + "B1": 128, + "B2": 128, + "BK": 64 + }, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 4, + "num_buffers_warp_spec": 0, + "num_consumer_groups": 0, + "reg_dec_producer": 0, + "reg_inc_consumer": 0, + "maxnreg": null, + "pre_hook": null + }, + "(15, 512, 512, 27, True, 'torch.float16', 'torch.float16', 'torch.uint32', 'torch.uint32', 'torch.uint32', 'torch.float16')": { + "kwargs": { + "B1": 128, + "B2": 128, + "BK": 64 + }, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 4, + "num_buffers_warp_spec": 0, + "num_consumer_groups": 0, + "reg_dec_producer": 0, + "reg_inc_consumer": 0, + "maxnreg": null, + "pre_hook": null + }, + "(7, 1024, 1024, 27, True, 'torch.float32', 'torch.float32', 'torch.uint32', 'torch.uint32', 'torch.uint32', 'torch.float32')": { + "kwargs": { + "B1": 128, + "B2": 128, + "BK": 32 + }, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 4, + "num_buffers_warp_spec": 0, + "num_consumer_groups": 0, + "reg_dec_producer": 0, + "reg_inc_consumer": 0, + "maxnreg": null, + "pre_hook": null + }, + "(9, 1024, 1024, 27, True, 'torch.float32', 'torch.float32', 'torch.uint32', 'torch.uint32', 'torch.uint32', 'torch.float32')": { + "kwargs": { + "B1": 128, + "B2": 128, + "BK": 32 + }, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 4, + "num_buffers_warp_spec": 0, + "num_consumer_groups": 0, + "reg_dec_producer": 0, + "reg_inc_consumer": 0, + "maxnreg": null, + "pre_hook": null + }, + "(11, 1024, 1024, 27, True, 'torch.float32', 'torch.float32', 'torch.uint32', 'torch.uint32', 'torch.uint32', 'torch.float32')": { + "kwargs": { + "B1": 128, + "B2": 128, + "BK": 32 + }, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 4, + "num_buffers_warp_spec": 0, + "num_consumer_groups": 0, + "reg_dec_producer": 0, + "reg_inc_consumer": 0, + "maxnreg": null, + "pre_hook": null + }, + "(13, 1024, 1024, 27, True, 'torch.float32', 'torch.float32', 'torch.uint32', 'torch.uint32', 'torch.uint32', 'torch.float32')": { + "kwargs": { + "B1": 128, + "B2": 64, + "BK": 32 + }, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 4, + "num_buffers_warp_spec": 0, + "num_consumer_groups": 0, + "reg_dec_producer": 0, + "reg_inc_consumer": 0, + "maxnreg": null, + "pre_hook": null + }, + "(15, 512, 512, 27, True, 'torch.float32', 'torch.float32', 'torch.uint32', 'torch.uint32', 'torch.uint32', 'torch.float32')": { + "kwargs": { + "B1": 256, + "B2": 128, + "BK": 32 + }, + "num_warps": 8, + "num_ctas": 1, + "num_stages": 4, + "num_buffers_warp_spec": 0, + "num_consumer_groups": 0, + "reg_dec_producer": 0, + "reg_inc_consumer": 0, + "maxnreg": null, + "pre_hook": null + }, + "(17, 256, 256, 27, True, 'torch.float32', 'torch.float32', 'torch.uint32', 'torch.uint32', 'torch.uint32', 'torch.float32')": { + "kwargs": { + "B1": 128, + "B2": 64, + "BK": 32 + }, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 4, + "num_buffers_warp_spec": 0, + "num_consumer_groups": 0, + "reg_dec_producer": 0, + "reg_inc_consumer": 0, + "maxnreg": null, + "pre_hook": null + } + }, + "flex_gemm.kernels.triton.spconv.sparse_submanifold_conv_bwd_masked_implicit_gemm_splitk.sparse_submanifold_conv_bwd_input_masked_implicit_gemm_splitk_kernel": { + "(16, 128, 512, 27, 2, True, 'torch.float16', 'torch.float16', 'torch.uint32', 'torch.int64', 'torch.float32')": { + "kwargs": { + "B1": 128, + "B2": 128, + "BK": 64 + }, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 4, + "num_buffers_warp_spec": 0, + "num_consumer_groups": 0, + "reg_dec_producer": 0, + "reg_inc_consumer": 0, + "maxnreg": null, + "pre_hook": null + }, + "(16, 128, 128, 27, 2, True, 'torch.float16', 'torch.float16', 'torch.uint32', 'torch.int64', 'torch.float32')": { + "kwargs": { + "B1": 128, + "B2": 128, + "BK": 64 + }, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 4, + "num_buffers_warp_spec": 0, + "num_consumer_groups": 0, + "reg_dec_producer": 0, + "reg_inc_consumer": 0, + "maxnreg": null, + "pre_hook": null + }, + "(14, 256, 1024, 27, 2, True, 'torch.float16', 'torch.float16', 'torch.uint32', 'torch.int64', 'torch.float32')": { + "kwargs": { + "B1": 64, + "B2": 256, + "BK": 64 + }, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 4, + "num_buffers_warp_spec": 0, + "num_consumer_groups": 0, + "reg_dec_producer": 0, + "reg_inc_consumer": 0, + "maxnreg": null, + "pre_hook": null + }, + "(14, 256, 1024, 27, 4, True, 'torch.float16', 'torch.float16', 'torch.uint32', 'torch.int64', 'torch.float32')": { + "kwargs": { + "B1": 128, + "B2": 256, + "BK": 64 + }, + "num_warps": 8, + "num_ctas": 1, + "num_stages": 4, + "num_buffers_warp_spec": 0, + "num_consumer_groups": 0, + "reg_dec_producer": 0, + "reg_inc_consumer": 0, + "maxnreg": null, + "pre_hook": null + }, + "(14, 256, 1024, 27, 8, True, 'torch.float16', 'torch.float16', 'torch.uint32', 'torch.int64', 'torch.float32')": { + "kwargs": { + "B1": 128, + "B2": 128, + "BK": 64 + }, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 4, + "num_buffers_warp_spec": 0, + "num_consumer_groups": 0, + "reg_dec_producer": 0, + "reg_inc_consumer": 0, + "maxnreg": null, + "pre_hook": null + }, + "(14, 256, 256, 27, 2, True, 'torch.float16', 'torch.float16', 'torch.uint32', 'torch.int64', 'torch.float32')": { + "kwargs": { + "B1": 128, + "B2": 128, + "BK": 64 + }, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 4, + "num_buffers_warp_spec": 0, + "num_consumer_groups": 0, + "reg_dec_producer": 0, + "reg_inc_consumer": 0, + "maxnreg": null, + "pre_hook": null + }, + "(14, 256, 256, 27, 4, True, 'torch.float16', 'torch.float16', 'torch.uint32', 'torch.int64', 'torch.float32')": { + "kwargs": { + "B1": 128, + "B2": 128, + "BK": 64 + }, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 4, + "num_buffers_warp_spec": 0, + "num_consumer_groups": 0, + "reg_dec_producer": 0, + "reg_inc_consumer": 0, + "maxnreg": null, + "pre_hook": null + }, + "(14, 256, 256, 27, 8, True, 'torch.float16', 'torch.float16', 'torch.uint32', 'torch.int64', 'torch.float32')": { + "kwargs": { + "B1": 128, + "B2": 128, + "BK": 64 + }, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 4, + "num_buffers_warp_spec": 0, + "num_consumer_groups": 0, + "reg_dec_producer": 0, + "reg_inc_consumer": 0, + "maxnreg": null, + "pre_hook": null + }, + "(12, 512, 2048, 27, 2, True, 'torch.float16', 'torch.float16', 'torch.uint32', 'torch.int64', 'torch.float32')": { + "kwargs": { + "B1": 128, + "B2": 128, + "BK": 64 + }, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 4, + "num_buffers_warp_spec": 0, + "num_consumer_groups": 0, + "reg_dec_producer": 0, + "reg_inc_consumer": 0, + "maxnreg": null, + "pre_hook": null + }, + "(12, 512, 2048, 27, 4, True, 'torch.float16', 'torch.float16', 'torch.uint32', 'torch.int64', 'torch.float32')": { + "kwargs": { + "B1": 128, + "B2": 128, + "BK": 64 + }, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 4, + "num_buffers_warp_spec": 0, + "num_consumer_groups": 0, + "reg_dec_producer": 0, + "reg_inc_consumer": 0, + "maxnreg": null, + "pre_hook": null + }, + "(12, 512, 2048, 27, 8, True, 'torch.float16', 'torch.float16', 'torch.uint32', 'torch.int64', 'torch.float32')": { + "kwargs": { + "B1": 128, + "B2": 128, + "BK": 64 + }, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 4, + "num_buffers_warp_spec": 0, + "num_consumer_groups": 0, + "reg_dec_producer": 0, + "reg_inc_consumer": 0, + "maxnreg": null, + "pre_hook": null + }, + "(12, 512, 2048, 27, 16, True, 'torch.float16', 'torch.float16', 'torch.uint32', 'torch.int64', 'torch.float32')": { + "kwargs": { + "B1": 128, + "B2": 128, + "BK": 64 + }, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 4, + "num_buffers_warp_spec": 0, + "num_consumer_groups": 0, + "reg_dec_producer": 0, + "reg_inc_consumer": 0, + "maxnreg": null, + "pre_hook": null + }, + "(12, 512, 512, 27, 2, True, 'torch.float16', 'torch.float16', 'torch.uint32', 'torch.int64', 'torch.float32')": { + "kwargs": { + "B1": 128, + "B2": 128, + "BK": 64 + }, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 4, + "num_buffers_warp_spec": 0, + "num_consumer_groups": 0, + "reg_dec_producer": 0, + "reg_inc_consumer": 0, + "maxnreg": null, + "pre_hook": null + }, + "(12, 512, 512, 27, 4, True, 'torch.float16', 'torch.float16', 'torch.uint32', 'torch.int64', 'torch.float32')": { + "kwargs": { + "B1": 128, + "B2": 128, + "BK": 64 + }, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 4, + "num_buffers_warp_spec": 0, + "num_consumer_groups": 0, + "reg_dec_producer": 0, + "reg_inc_consumer": 0, + "maxnreg": null, + "pre_hook": null + }, + "(12, 512, 512, 27, 8, True, 'torch.float16', 'torch.float16', 'torch.uint32', 'torch.int64', 'torch.float32')": { + "kwargs": { + "B1": 128, + "B2": 128, + "BK": 64 + }, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 4, + "num_buffers_warp_spec": 0, + "num_consumer_groups": 0, + "reg_dec_producer": 0, + "reg_inc_consumer": 0, + "maxnreg": null, + "pre_hook": null + }, + "(12, 512, 512, 27, 16, True, 'torch.float16', 'torch.float16', 'torch.uint32', 'torch.int64', 'torch.float32')": { + "kwargs": { + "B1": 128, + "B2": 128, + "BK": 64 + }, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 4, + "num_buffers_warp_spec": 0, + "num_consumer_groups": 0, + "reg_dec_producer": 0, + "reg_inc_consumer": 0, + "maxnreg": null, + "pre_hook": null + }, + "(10, 1024, 4096, 27, 2, True, 'torch.float16', 'torch.float16', 'torch.uint32', 'torch.int64', 'torch.float32')": { + "kwargs": { + "B1": 128, + "B2": 256, + "BK": 64 + }, + "num_warps": 8, + "num_ctas": 1, + "num_stages": 4, + "num_buffers_warp_spec": 0, + "num_consumer_groups": 0, + "reg_dec_producer": 0, + "reg_inc_consumer": 0, + "maxnreg": null, + "pre_hook": null + }, + "(10, 1024, 4096, 27, 4, True, 'torch.float16', 'torch.float16', 'torch.uint32', 'torch.int64', 'torch.float32')": { + "kwargs": { + "B1": 128, + "B2": 128, + "BK": 64 + }, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 4, + "num_buffers_warp_spec": 0, + "num_consumer_groups": 0, + "reg_dec_producer": 0, + "reg_inc_consumer": 0, + "maxnreg": null, + "pre_hook": null + }, + "(10, 1024, 4096, 27, 8, True, 'torch.float16', 'torch.float16', 'torch.uint32', 'torch.int64', 'torch.float32')": { + "kwargs": { + "B1": 128, + "B2": 256, + "BK": 64 + }, + "num_warps": 8, + "num_ctas": 1, + "num_stages": 4, + "num_buffers_warp_spec": 0, + "num_consumer_groups": 0, + "reg_dec_producer": 0, + "reg_inc_consumer": 0, + "maxnreg": null, + "pre_hook": null + }, + "(10, 1024, 4096, 27, 16, True, 'torch.float16', 'torch.float16', 'torch.uint32', 'torch.int64', 'torch.float32')": { + "kwargs": { + "B1": 128, + "B2": 128, + "BK": 64 + }, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 4, + "num_buffers_warp_spec": 0, + "num_consumer_groups": 0, + "reg_dec_producer": 0, + "reg_inc_consumer": 0, + "maxnreg": null, + "pre_hook": null + }, + "(10, 1024, 4096, 27, 32, True, 'torch.float16', 'torch.float16', 'torch.uint32', 'torch.int64', 'torch.float32')": { + "kwargs": { + "B1": 128, + "B2": 128, + "BK": 64 + }, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 4, + "num_buffers_warp_spec": 0, + "num_consumer_groups": 0, + "reg_dec_producer": 0, + "reg_inc_consumer": 0, + "maxnreg": null, + "pre_hook": null + }, + "(10, 1024, 1024, 27, 2, True, 'torch.float16', 'torch.float16', 'torch.uint32', 'torch.int64', 'torch.float32')": { + "kwargs": { + "B1": 128, + "B2": 128, + "BK": 64 + }, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 4, + "num_buffers_warp_spec": 0, + "num_consumer_groups": 0, + "reg_dec_producer": 0, + "reg_inc_consumer": 0, + "maxnreg": null, + "pre_hook": null + }, + "(10, 1024, 1024, 27, 4, True, 'torch.float16', 'torch.float16', 'torch.uint32', 'torch.int64', 'torch.float32')": { + "kwargs": { + "B1": 128, + "B2": 128, + "BK": 64 + }, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 4, + "num_buffers_warp_spec": 0, + "num_consumer_groups": 0, + "reg_dec_producer": 0, + "reg_inc_consumer": 0, + "maxnreg": null, + "pre_hook": null + }, + "(10, 1024, 1024, 27, 8, True, 'torch.float16', 'torch.float16', 'torch.uint32', 'torch.int64', 'torch.float32')": { + "kwargs": { + "B1": 128, + "B2": 128, + "BK": 64 + }, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 4, + "num_buffers_warp_spec": 0, + "num_consumer_groups": 0, + "reg_dec_producer": 0, + "reg_inc_consumer": 0, + "maxnreg": null, + "pre_hook": null + }, + "(10, 1024, 1024, 27, 16, True, 'torch.float16', 'torch.float16', 'torch.uint32', 'torch.int64', 'torch.float32')": { + "kwargs": { + "B1": 128, + "B2": 128, + "BK": 64 + }, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 4, + "num_buffers_warp_spec": 0, + "num_consumer_groups": 0, + "reg_dec_producer": 0, + "reg_inc_consumer": 0, + "maxnreg": null, + "pre_hook": null + }, + "(10, 1024, 1024, 27, 32, True, 'torch.float16', 'torch.float16', 'torch.uint32', 'torch.int64', 'torch.float32')": { + "kwargs": { + "B1": 128, + "B2": 128, + "BK": 64 + }, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 4, + "num_buffers_warp_spec": 0, + "num_consumer_groups": 0, + "reg_dec_producer": 0, + "reg_inc_consumer": 0, + "maxnreg": null, + "pre_hook": null + }, + "(12, 512, 128, 27, 2, True, 'torch.float16', 'torch.float16', 'torch.uint32', 'torch.int64', 'torch.float32')": { + "kwargs": { + "B1": 128, + "B2": 128, + "BK": 64 + }, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 4, + "num_buffers_warp_spec": 0, + "num_consumer_groups": 0, + "reg_dec_producer": 0, + "reg_inc_consumer": 0, + "maxnreg": null, + "pre_hook": null + }, + "(12, 512, 128, 27, 4, True, 'torch.float16', 'torch.float16', 'torch.uint32', 'torch.int64', 'torch.float32')": { + "kwargs": { + "B1": 128, + "B2": 128, + "BK": 64 + }, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 4, + "num_buffers_warp_spec": 0, + "num_consumer_groups": 0, + "reg_dec_producer": 0, + "reg_inc_consumer": 0, + "maxnreg": null, + "pre_hook": null + }, + "(12, 512, 128, 27, 8, True, 'torch.float16', 'torch.float16', 'torch.uint32', 'torch.int64', 'torch.float32')": { + "kwargs": { + "B1": 128, + "B2": 128, + "BK": 64 + }, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 4, + "num_buffers_warp_spec": 0, + "num_consumer_groups": 0, + "reg_dec_producer": 0, + "reg_inc_consumer": 0, + "maxnreg": null, + "pre_hook": null + }, + "(12, 512, 128, 27, 16, True, 'torch.float16', 'torch.float16', 'torch.uint32', 'torch.int64', 'torch.float32')": { + "kwargs": { + "B1": 128, + "B2": 128, + "BK": 64 + }, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 4, + "num_buffers_warp_spec": 0, + "num_consumer_groups": 0, + "reg_dec_producer": 0, + "reg_inc_consumer": 0, + "maxnreg": null, + "pre_hook": null + }, + "(14, 256, 64, 27, 2, True, 'torch.float16', 'torch.float16', 'torch.uint32', 'torch.int64', 'torch.float32')": { + "kwargs": { + "B1": 128, + "B2": 128, + "BK": 64 + }, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 4, + "num_buffers_warp_spec": 0, + "num_consumer_groups": 0, + "reg_dec_producer": 0, + "reg_inc_consumer": 0, + "maxnreg": null, + "pre_hook": null + }, + "(14, 256, 64, 27, 4, True, 'torch.float16', 'torch.float16', 'torch.uint32', 'torch.int64', 'torch.float32')": { + "kwargs": { + "B1": 128, + "B2": 128, + "BK": 64 + }, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 4, + "num_buffers_warp_spec": 0, + "num_consumer_groups": 0, + "reg_dec_producer": 0, + "reg_inc_consumer": 0, + "maxnreg": null, + "pre_hook": null + }, + "(14, 256, 64, 27, 8, True, 'torch.float16', 'torch.float16', 'torch.uint32', 'torch.int64', 'torch.float32')": { + "kwargs": { + "B1": 128, + "B2": 128, + "BK": 64 + }, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 4, + "num_buffers_warp_spec": 0, + "num_consumer_groups": 0, + "reg_dec_producer": 0, + "reg_inc_consumer": 0, + "maxnreg": null, + "pre_hook": null + }, + "(16, 128, 32, 27, 2, True, 'torch.float16', 'torch.float16', 'torch.uint32', 'torch.int64', 'torch.float32')": { + "kwargs": { + "B1": 128, + "B2": 128, + "BK": 32 + }, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 4, + "num_buffers_warp_spec": 0, + "num_consumer_groups": 0, + "reg_dec_producer": 0, + "reg_inc_consumer": 0, + "maxnreg": null, + "pre_hook": null + }, + "(11, 512, 2048, 27, 2, True, 'torch.float16', 'torch.float16', 'torch.uint32', 'torch.int64', 'torch.float32')": { + "kwargs": { + "B1": 128, + "B2": 128, + "BK": 64 + }, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 4, + "num_buffers_warp_spec": 0, + "num_consumer_groups": 0, + "reg_dec_producer": 0, + "reg_inc_consumer": 0, + "maxnreg": null, + "pre_hook": null + }, + "(11, 512, 2048, 27, 4, True, 'torch.float16', 'torch.float16', 'torch.uint32', 'torch.int64', 'torch.float32')": { + "kwargs": { + "B1": 128, + "B2": 128, + "BK": 64 + }, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 4, + "num_buffers_warp_spec": 0, + "num_consumer_groups": 0, + "reg_dec_producer": 0, + "reg_inc_consumer": 0, + "maxnreg": null, + "pre_hook": null + }, + "(11, 512, 2048, 27, 8, True, 'torch.float16', 'torch.float16', 'torch.uint32', 'torch.int64', 'torch.float32')": { + "kwargs": { + "B1": 128, + "B2": 128, + "BK": 64 + }, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 4, + "num_buffers_warp_spec": 0, + "num_consumer_groups": 0, + "reg_dec_producer": 0, + "reg_inc_consumer": 0, + "maxnreg": null, + "pre_hook": null + }, + "(11, 512, 2048, 27, 16, True, 'torch.float16', 'torch.float16', 'torch.uint32', 'torch.int64', 'torch.float32')": { + "kwargs": { + "B1": 128, + "B2": 128, + "BK": 64 + }, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 4, + "num_buffers_warp_spec": 0, + "num_consumer_groups": 0, + "reg_dec_producer": 0, + "reg_inc_consumer": 0, + "maxnreg": null, + "pre_hook": null + }, + "(11, 512, 512, 27, 2, True, 'torch.float16', 'torch.float16', 'torch.uint32', 'torch.int64', 'torch.float32')": { + "kwargs": { + "B1": 128, + "B2": 128, + "BK": 64 + }, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 4, + "num_buffers_warp_spec": 0, + "num_consumer_groups": 0, + "reg_dec_producer": 0, + "reg_inc_consumer": 0, + "maxnreg": null, + "pre_hook": null + }, + "(11, 512, 512, 27, 4, True, 'torch.float16', 'torch.float16', 'torch.uint32', 'torch.int64', 'torch.float32')": { + "kwargs": { + "B1": 128, + "B2": 128, + "BK": 64 + }, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 4, + "num_buffers_warp_spec": 0, + "num_consumer_groups": 0, + "reg_dec_producer": 0, + "reg_inc_consumer": 0, + "maxnreg": null, + "pre_hook": null + }, + "(11, 512, 512, 27, 8, True, 'torch.float16', 'torch.float16', 'torch.uint32', 'torch.int64', 'torch.float32')": { + "kwargs": { + "B1": 128, + "B2": 128, + "BK": 64 + }, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 4, + "num_buffers_warp_spec": 0, + "num_consumer_groups": 0, + "reg_dec_producer": 0, + "reg_inc_consumer": 0, + "maxnreg": null, + "pre_hook": null + }, + "(11, 512, 512, 27, 16, True, 'torch.float16', 'torch.float16', 'torch.uint32', 'torch.int64', 'torch.float32')": { + "kwargs": { + "B1": 128, + "B2": 128, + "BK": 64 + }, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 4, + "num_buffers_warp_spec": 0, + "num_consumer_groups": 0, + "reg_dec_producer": 0, + "reg_inc_consumer": 0, + "maxnreg": null, + "pre_hook": null + }, + "(9, 1024, 4096, 27, 2, True, 'torch.float16', 'torch.float16', 'torch.uint32', 'torch.int64', 'torch.float32')": { + "kwargs": { + "B1": 64, + "B2": 256, + "BK": 64 + }, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 4, + "num_buffers_warp_spec": 0, + "num_consumer_groups": 0, + "reg_dec_producer": 0, + "reg_inc_consumer": 0, + "maxnreg": null, + "pre_hook": null + }, + "(9, 1024, 4096, 27, 4, True, 'torch.float16', 'torch.float16', 'torch.uint32', 'torch.int64', 'torch.float32')": { + "kwargs": { + "B1": 64, + "B2": 256, + "BK": 64 + }, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 4, + "num_buffers_warp_spec": 0, + "num_consumer_groups": 0, + "reg_dec_producer": 0, + "reg_inc_consumer": 0, + "maxnreg": null, + "pre_hook": null + }, + "(9, 1024, 4096, 27, 8, True, 'torch.float16', 'torch.float16', 'torch.uint32', 'torch.int64', 'torch.float32')": { + "kwargs": { + "B1": 128, + "B2": 128, + "BK": 64 + }, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 4, + "num_buffers_warp_spec": 0, + "num_consumer_groups": 0, + "reg_dec_producer": 0, + "reg_inc_consumer": 0, + "maxnreg": null, + "pre_hook": null + }, + "(9, 1024, 4096, 27, 16, True, 'torch.float16', 'torch.float16', 'torch.uint32', 'torch.int64', 'torch.float32')": { + "kwargs": { + "B1": 128, + "B2": 128, + "BK": 64 + }, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 4, + "num_buffers_warp_spec": 0, + "num_consumer_groups": 0, + "reg_dec_producer": 0, + "reg_inc_consumer": 0, + "maxnreg": null, + "pre_hook": null + }, + "(9, 1024, 4096, 27, 32, True, 'torch.float16', 'torch.float16', 'torch.uint32', 'torch.int64', 'torch.float32')": { + "kwargs": { + "B1": 128, + "B2": 128, + "BK": 64 + }, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 4, + "num_buffers_warp_spec": 0, + "num_consumer_groups": 0, + "reg_dec_producer": 0, + "reg_inc_consumer": 0, + "maxnreg": null, + "pre_hook": null + }, + "(9, 1024, 1024, 27, 2, True, 'torch.float16', 'torch.float16', 'torch.uint32', 'torch.int64', 'torch.float32')": { + "kwargs": { + "B1": 64, + "B2": 256, + "BK": 64 + }, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 4, + "num_buffers_warp_spec": 0, + "num_consumer_groups": 0, + "reg_dec_producer": 0, + "reg_inc_consumer": 0, + "maxnreg": null, + "pre_hook": null + }, + "(9, 1024, 1024, 27, 4, True, 'torch.float16', 'torch.float16', 'torch.uint32', 'torch.int64', 'torch.float32')": { + "kwargs": { + "B1": 128, + "B2": 128, + "BK": 64 + }, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 4, + "num_buffers_warp_spec": 0, + "num_consumer_groups": 0, + "reg_dec_producer": 0, + "reg_inc_consumer": 0, + "maxnreg": null, + "pre_hook": null + }, + "(9, 1024, 1024, 27, 8, True, 'torch.float16', 'torch.float16', 'torch.uint32', 'torch.int64', 'torch.float32')": { + "kwargs": { + "B1": 128, + "B2": 128, + "BK": 64 + }, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 4, + "num_buffers_warp_spec": 0, + "num_consumer_groups": 0, + "reg_dec_producer": 0, + "reg_inc_consumer": 0, + "maxnreg": null, + "pre_hook": null + }, + "(9, 1024, 1024, 27, 16, True, 'torch.float16', 'torch.float16', 'torch.uint32', 'torch.int64', 'torch.float32')": { + "kwargs": { + "B1": 128, + "B2": 128, + "BK": 64 + }, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 4, + "num_buffers_warp_spec": 0, + "num_consumer_groups": 0, + "reg_dec_producer": 0, + "reg_inc_consumer": 0, + "maxnreg": null, + "pre_hook": null + }, + "(9, 1024, 1024, 27, 32, True, 'torch.float16', 'torch.float16', 'torch.uint32', 'torch.int64', 'torch.float32')": { + "kwargs": { + "B1": 128, + "B2": 128, + "BK": 64 + }, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 4, + "num_buffers_warp_spec": 0, + "num_consumer_groups": 0, + "reg_dec_producer": 0, + "reg_inc_consumer": 0, + "maxnreg": null, + "pre_hook": null + }, + "(11, 512, 128, 27, 2, True, 'torch.float16', 'torch.float16', 'torch.uint32', 'torch.int64', 'torch.float32')": { + "kwargs": { + "B1": 128, + "B2": 128, + "BK": 64 + }, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 4, + "num_buffers_warp_spec": 0, + "num_consumer_groups": 0, + "reg_dec_producer": 0, + "reg_inc_consumer": 0, + "maxnreg": null, + "pre_hook": null + }, + "(11, 512, 128, 27, 4, True, 'torch.float16', 'torch.float16', 'torch.uint32', 'torch.int64', 'torch.float32')": { + "kwargs": { + "B1": 128, + "B2": 128, + "BK": 64 + }, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 4, + "num_buffers_warp_spec": 0, + "num_consumer_groups": 0, + "reg_dec_producer": 0, + "reg_inc_consumer": 0, + "maxnreg": null, + "pre_hook": null + }, + "(11, 512, 128, 27, 8, True, 'torch.float16', 'torch.float16', 'torch.uint32', 'torch.int64', 'torch.float32')": { + "kwargs": { + "B1": 128, + "B2": 128, + "BK": 64 + }, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 4, + "num_buffers_warp_spec": 0, + "num_consumer_groups": 0, + "reg_dec_producer": 0, + "reg_inc_consumer": 0, + "maxnreg": null, + "pre_hook": null + }, + "(11, 512, 128, 27, 16, True, 'torch.float16', 'torch.float16', 'torch.uint32', 'torch.int64', 'torch.float32')": { + "kwargs": { + "B1": 128, + "B2": 128, + "BK": 64 + }, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 4, + "num_buffers_warp_spec": 0, + "num_consumer_groups": 0, + "reg_dec_producer": 0, + "reg_inc_consumer": 0, + "maxnreg": null, + "pre_hook": null + }, + "(13, 256, 1024, 27, 2, True, 'torch.float16', 'torch.float16', 'torch.uint32', 'torch.int64', 'torch.float32')": { + "kwargs": { + "B1": 128, + "B2": 128, + "BK": 64 + }, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 4, + "num_buffers_warp_spec": 0, + "num_consumer_groups": 0, + "reg_dec_producer": 0, + "reg_inc_consumer": 0, + "maxnreg": null, + "pre_hook": null + }, + "(13, 256, 1024, 27, 4, True, 'torch.float16', 'torch.float16', 'torch.uint32', 'torch.int64', 'torch.float32')": { + "kwargs": { + "B1": 128, + "B2": 128, + "BK": 64 + }, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 4, + "num_buffers_warp_spec": 0, + "num_consumer_groups": 0, + "reg_dec_producer": 0, + "reg_inc_consumer": 0, + "maxnreg": null, + "pre_hook": null + }, + "(13, 256, 1024, 27, 8, True, 'torch.float16', 'torch.float16', 'torch.uint32', 'torch.int64', 'torch.float32')": { + "kwargs": { + "B1": 128, + "B2": 128, + "BK": 64 + }, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 4, + "num_buffers_warp_spec": 0, + "num_consumer_groups": 0, + "reg_dec_producer": 0, + "reg_inc_consumer": 0, + "maxnreg": null, + "pre_hook": null + }, + "(13, 256, 256, 27, 2, True, 'torch.float16', 'torch.float16', 'torch.uint32', 'torch.int64', 'torch.float32')": { + "kwargs": { + "B1": 128, + "B2": 128, + "BK": 64 + }, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 4, + "num_buffers_warp_spec": 0, + "num_consumer_groups": 0, + "reg_dec_producer": 0, + "reg_inc_consumer": 0, + "maxnreg": null, + "pre_hook": null + }, + "(13, 256, 256, 27, 4, True, 'torch.float16', 'torch.float16', 'torch.uint32', 'torch.int64', 'torch.float32')": { + "kwargs": { + "B1": 128, + "B2": 128, + "BK": 64 + }, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 4, + "num_buffers_warp_spec": 0, + "num_consumer_groups": 0, + "reg_dec_producer": 0, + "reg_inc_consumer": 0, + "maxnreg": null, + "pre_hook": null + }, + "(13, 256, 256, 27, 8, True, 'torch.float16', 'torch.float16', 'torch.uint32', 'torch.int64', 'torch.float32')": { + "kwargs": { + "B1": 128, + "B2": 128, + "BK": 64 + }, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 4, + "num_buffers_warp_spec": 0, + "num_consumer_groups": 0, + "reg_dec_producer": 0, + "reg_inc_consumer": 0, + "maxnreg": null, + "pre_hook": null + }, + "(13, 256, 64, 27, 2, True, 'torch.float16', 'torch.float16', 'torch.uint32', 'torch.int64', 'torch.float32')": { + "kwargs": { + "B1": 128, + "B2": 128, + "BK": 64 + }, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 4, + "num_buffers_warp_spec": 0, + "num_consumer_groups": 0, + "reg_dec_producer": 0, + "reg_inc_consumer": 0, + "maxnreg": null, + "pre_hook": null + }, + "(13, 256, 64, 27, 4, True, 'torch.float16', 'torch.float16', 'torch.uint32', 'torch.int64', 'torch.float32')": { + "kwargs": { + "B1": 128, + "B2": 128, + "BK": 64 + }, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 4, + "num_buffers_warp_spec": 0, + "num_consumer_groups": 0, + "reg_dec_producer": 0, + "reg_inc_consumer": 0, + "maxnreg": null, + "pre_hook": null + }, + "(13, 256, 64, 27, 8, True, 'torch.float16', 'torch.float16', 'torch.uint32', 'torch.int64', 'torch.float32')": { + "kwargs": { + "B1": 128, + "B2": 128, + "BK": 32 + }, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 4, + "num_buffers_warp_spec": 0, + "num_consumer_groups": 0, + "reg_dec_producer": 0, + "reg_inc_consumer": 0, + "maxnreg": null, + "pre_hook": null + }, + "(15, 128, 512, 27, 2, True, 'torch.float16', 'torch.float16', 'torch.uint32', 'torch.int64', 'torch.float32')": { + "kwargs": { + "B1": 128, + "B2": 128, + "BK": 64 + }, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 4, + "num_buffers_warp_spec": 0, + "num_consumer_groups": 0, + "reg_dec_producer": 0, + "reg_inc_consumer": 0, + "maxnreg": null, + "pre_hook": null + }, + "(15, 128, 512, 27, 4, True, 'torch.float16', 'torch.float16', 'torch.uint32', 'torch.int64', 'torch.float32')": { + "kwargs": { + "B1": 128, + "B2": 128, + "BK": 64 + }, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 4, + "num_buffers_warp_spec": 0, + "num_consumer_groups": 0, + "reg_dec_producer": 0, + "reg_inc_consumer": 0, + "maxnreg": null, + "pre_hook": null + }, + "(15, 128, 128, 27, 2, True, 'torch.float16', 'torch.float16', 'torch.uint32', 'torch.int64', 'torch.float32')": { + "kwargs": { + "B1": 128, + "B2": 128, + "BK": 64 + }, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 4, + "num_buffers_warp_spec": 0, + "num_consumer_groups": 0, + "reg_dec_producer": 0, + "reg_inc_consumer": 0, + "maxnreg": null, + "pre_hook": null + }, + "(15, 128, 128, 27, 4, True, 'torch.float16', 'torch.float16', 'torch.uint32', 'torch.int64', 'torch.float32')": { + "kwargs": { + "B1": 128, + "B2": 128, + "BK": 64 + }, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 4, + "num_buffers_warp_spec": 0, + "num_consumer_groups": 0, + "reg_dec_producer": 0, + "reg_inc_consumer": 0, + "maxnreg": null, + "pre_hook": null + }, + "(15, 128, 32, 27, 2, True, 'torch.float16', 'torch.float16', 'torch.uint32', 'torch.int64', 'torch.float32')": { + "kwargs": { + "B1": 128, + "B2": 128, + "BK": 32 + }, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 4, + "num_buffers_warp_spec": 0, + "num_consumer_groups": 0, + "reg_dec_producer": 0, + "reg_inc_consumer": 0, + "maxnreg": null, + "pre_hook": null + }, + "(15, 128, 32, 27, 4, True, 'torch.float16', 'torch.float16', 'torch.uint32', 'torch.int64', 'torch.float32')": { + "kwargs": { + "B1": 128, + "B2": 128, + "BK": 32 + }, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 4, + "num_buffers_warp_spec": 0, + "num_consumer_groups": 0, + "reg_dec_producer": 0, + "reg_inc_consumer": 0, + "maxnreg": null, + "pre_hook": null + }, + "(8, 1024, 4096, 27, 2, True, 'torch.float16', 'torch.float16', 'torch.uint32', 'torch.int64', 'torch.float32')": { + "kwargs": { + "B1": 64, + "B2": 64, + "BK": 64 + }, + "num_warps": 2, + "num_ctas": 1, + "num_stages": 4, + "num_buffers_warp_spec": 0, + "num_consumer_groups": 0, + "reg_dec_producer": 0, + "reg_inc_consumer": 0, + "maxnreg": null, + "pre_hook": null + }, + "(8, 1024, 4096, 27, 4, True, 'torch.float16', 'torch.float16', 'torch.uint32', 'torch.int64', 'torch.float32')": { + "kwargs": { + "B1": 128, + "B2": 128, + "BK": 64 + }, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 4, + "num_buffers_warp_spec": 0, + "num_consumer_groups": 0, + "reg_dec_producer": 0, + "reg_inc_consumer": 0, + "maxnreg": null, + "pre_hook": null + }, + "(8, 1024, 4096, 27, 8, True, 'torch.float16', 'torch.float16', 'torch.uint32', 'torch.int64', 'torch.float32')": { + "kwargs": { + "B1": 128, + "B2": 128, + "BK": 64 + }, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 4, + "num_buffers_warp_spec": 0, + "num_consumer_groups": 0, + "reg_dec_producer": 0, + "reg_inc_consumer": 0, + "maxnreg": null, + "pre_hook": null + }, + "(8, 1024, 4096, 27, 16, True, 'torch.float16', 'torch.float16', 'torch.uint32', 'torch.int64', 'torch.float32')": { + "kwargs": { + "B1": 128, + "B2": 128, + "BK": 64 + }, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 4, + "num_buffers_warp_spec": 0, + "num_consumer_groups": 0, + "reg_dec_producer": 0, + "reg_inc_consumer": 0, + "maxnreg": null, + "pre_hook": null + }, + "(8, 1024, 4096, 27, 32, True, 'torch.float16', 'torch.float16', 'torch.uint32', 'torch.int64', 'torch.float32')": { + "kwargs": { + "B1": 128, + "B2": 128, + "BK": 64 + }, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 4, + "num_buffers_warp_spec": 0, + "num_consumer_groups": 0, + "reg_dec_producer": 0, + "reg_inc_consumer": 0, + "maxnreg": null, + "pre_hook": null + }, + "(8, 1024, 4096, 27, 64, True, 'torch.float16', 'torch.float16', 'torch.uint32', 'torch.int64', 'torch.float32')": { + "kwargs": { + "B1": 128, + "B2": 128, + "BK": 64 + }, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 4, + "num_buffers_warp_spec": 0, + "num_consumer_groups": 0, + "reg_dec_producer": 0, + "reg_inc_consumer": 0, + "maxnreg": null, + "pre_hook": null + }, + "(8, 1024, 1024, 27, 2, True, 'torch.float16', 'torch.float16', 'torch.uint32', 'torch.int64', 'torch.float32')": { + "kwargs": { + "B1": 64, + "B2": 64, + "BK": 64 + }, + "num_warps": 2, + "num_ctas": 1, + "num_stages": 4, + "num_buffers_warp_spec": 0, + "num_consumer_groups": 0, + "reg_dec_producer": 0, + "reg_inc_consumer": 0, + "maxnreg": null, + "pre_hook": null + }, + "(8, 1024, 1024, 27, 4, True, 'torch.float16', 'torch.float16', 'torch.uint32', 'torch.int64', 'torch.float32')": { + "kwargs": { + "B1": 128, + "B2": 128, + "BK": 64 + }, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 4, + "num_buffers_warp_spec": 0, + "num_consumer_groups": 0, + "reg_dec_producer": 0, + "reg_inc_consumer": 0, + "maxnreg": null, + "pre_hook": null + }, + "(8, 1024, 1024, 27, 8, True, 'torch.float16', 'torch.float16', 'torch.uint32', 'torch.int64', 'torch.float32')": { + "kwargs": { + "B1": 128, + "B2": 128, + "BK": 64 + }, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 4, + "num_buffers_warp_spec": 0, + "num_consumer_groups": 0, + "reg_dec_producer": 0, + "reg_inc_consumer": 0, + "maxnreg": null, + "pre_hook": null + }, + "(8, 1024, 1024, 27, 16, True, 'torch.float16', 'torch.float16', 'torch.uint32', 'torch.int64', 'torch.float32')": { + "kwargs": { + "B1": 128, + "B2": 128, + "BK": 64 + }, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 4, + "num_buffers_warp_spec": 0, + "num_consumer_groups": 0, + "reg_dec_producer": 0, + "reg_inc_consumer": 0, + "maxnreg": null, + "pre_hook": null + }, + "(8, 1024, 1024, 27, 32, True, 'torch.float16', 'torch.float16', 'torch.uint32', 'torch.int64', 'torch.float32')": { + "kwargs": { + "B1": 128, + "B2": 128, + "BK": 64 + }, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 4, + "num_buffers_warp_spec": 0, + "num_consumer_groups": 0, + "reg_dec_producer": 0, + "reg_inc_consumer": 0, + "maxnreg": null, + "pre_hook": null + }, + "(8, 1024, 1024, 27, 64, True, 'torch.float16', 'torch.float16', 'torch.uint32', 'torch.int64', 'torch.float32')": { + "kwargs": { + "B1": 128, + "B2": 128, + "BK": 64 + }, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 4, + "num_buffers_warp_spec": 0, + "num_consumer_groups": 0, + "reg_dec_producer": 0, + "reg_inc_consumer": 0, + "maxnreg": null, + "pre_hook": null + }, + "(17, 128, 512, 27, 2, True, 'torch.float16', 'torch.float16', 'torch.uint32', 'torch.int64', 'torch.float32')": { + "kwargs": { + "B1": 128, + "B2": 128, + "BK": 64 + }, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 4, + "num_buffers_warp_spec": 0, + "num_consumer_groups": 0, + "reg_dec_producer": 0, + "reg_inc_consumer": 0, + "maxnreg": null, + "pre_hook": null + }, + "(17, 128, 128, 27, 2, True, 'torch.float16', 'torch.float16', 'torch.uint32', 'torch.int64', 'torch.float32')": { + "kwargs": { + "B1": 128, + "B2": 128, + "BK": 64 + }, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 4, + "num_buffers_warp_spec": 0, + "num_consumer_groups": 0, + "reg_dec_producer": 0, + "reg_inc_consumer": 0, + "maxnreg": null, + "pre_hook": null + }, + "(15, 256, 1024, 27, 2, True, 'torch.float16', 'torch.float16', 'torch.uint32', 'torch.int64', 'torch.float32')": { + "kwargs": { + "B1": 128, + "B2": 128, + "BK": 64 + }, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 4, + "num_buffers_warp_spec": 0, + "num_consumer_groups": 0, + "reg_dec_producer": 0, + "reg_inc_consumer": 0, + "maxnreg": null, + "pre_hook": null + }, + "(15, 256, 1024, 27, 4, True, 'torch.float16', 'torch.float16', 'torch.uint32', 'torch.int64', 'torch.float32')": { + "kwargs": { + "B1": 128, + "B2": 128, + "BK": 64 + }, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 4, + "num_buffers_warp_spec": 0, + "num_consumer_groups": 0, + "reg_dec_producer": 0, + "reg_inc_consumer": 0, + "maxnreg": null, + "pre_hook": null + }, + "(15, 256, 256, 27, 2, True, 'torch.float16', 'torch.float16', 'torch.uint32', 'torch.int64', 'torch.float32')": { + "kwargs": { + "B1": 128, + "B2": 128, + "BK": 64 + }, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 4, + "num_buffers_warp_spec": 0, + "num_consumer_groups": 0, + "reg_dec_producer": 0, + "reg_inc_consumer": 0, + "maxnreg": null, + "pre_hook": null + }, + "(15, 256, 256, 27, 4, True, 'torch.float16', 'torch.float16', 'torch.uint32', 'torch.int64', 'torch.float32')": { + "kwargs": { + "B1": 128, + "B2": 128, + "BK": 64 + }, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 4, + "num_buffers_warp_spec": 0, + "num_consumer_groups": 0, + "reg_dec_producer": 0, + "reg_inc_consumer": 0, + "maxnreg": null, + "pre_hook": null + }, + "(13, 512, 2048, 27, 2, True, 'torch.float16', 'torch.float16', 'torch.uint32', 'torch.int64', 'torch.float32')": { + "kwargs": { + "B1": 64, + "B2": 256, + "BK": 64 + }, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 4, + "num_buffers_warp_spec": 0, + "num_consumer_groups": 0, + "reg_dec_producer": 0, + "reg_inc_consumer": 0, + "maxnreg": null, + "pre_hook": null + }, + "(13, 512, 2048, 27, 4, True, 'torch.float16', 'torch.float16', 'torch.uint32', 'torch.int64', 'torch.float32')": { + "kwargs": { + "B1": 64, + "B2": 256, + "BK": 64 + }, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 4, + "num_buffers_warp_spec": 0, + "num_consumer_groups": 0, + "reg_dec_producer": 0, + "reg_inc_consumer": 0, + "maxnreg": null, + "pre_hook": null + }, + "(13, 512, 2048, 27, 8, True, 'torch.float16', 'torch.float16', 'torch.uint32', 'torch.int64', 'torch.float32')": { + "kwargs": { + "B1": 128, + "B2": 128, + "BK": 64 + }, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 4, + "num_buffers_warp_spec": 0, + "num_consumer_groups": 0, + "reg_dec_producer": 0, + "reg_inc_consumer": 0, + "maxnreg": null, + "pre_hook": null + }, + "(13, 512, 512, 27, 2, True, 'torch.float16', 'torch.float16', 'torch.uint32', 'torch.int64', 'torch.float32')": { + "kwargs": { + "B1": 128, + "B2": 128, + "BK": 64 + }, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 4, + "num_buffers_warp_spec": 0, + "num_consumer_groups": 0, + "reg_dec_producer": 0, + "reg_inc_consumer": 0, + "maxnreg": null, + "pre_hook": null + }, + "(13, 512, 512, 27, 4, True, 'torch.float16', 'torch.float16', 'torch.uint32', 'torch.int64', 'torch.float32')": { + "kwargs": { + "B1": 128, + "B2": 128, + "BK": 64 + }, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 4, + "num_buffers_warp_spec": 0, + "num_consumer_groups": 0, + "reg_dec_producer": 0, + "reg_inc_consumer": 0, + "maxnreg": null, + "pre_hook": null + }, + "(13, 512, 512, 27, 8, True, 'torch.float16', 'torch.float16', 'torch.uint32', 'torch.int64', 'torch.float32')": { + "kwargs": { + "B1": 128, + "B2": 128, + "BK": 64 + }, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 4, + "num_buffers_warp_spec": 0, + "num_consumer_groups": 0, + "reg_dec_producer": 0, + "reg_inc_consumer": 0, + "maxnreg": null, + "pre_hook": null + }, + "(13, 512, 128, 27, 2, True, 'torch.float16', 'torch.float16', 'torch.uint32', 'torch.int64', 'torch.float32')": { + "kwargs": { + "B1": 128, + "B2": 128, + "BK": 64 + }, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 4, + "num_buffers_warp_spec": 0, + "num_consumer_groups": 0, + "reg_dec_producer": 0, + "reg_inc_consumer": 0, + "maxnreg": null, + "pre_hook": null + }, + "(13, 512, 128, 27, 4, True, 'torch.float16', 'torch.float16', 'torch.uint32', 'torch.int64', 'torch.float32')": { + "kwargs": { + "B1": 128, + "B2": 128, + "BK": 64 + }, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 4, + "num_buffers_warp_spec": 0, + "num_consumer_groups": 0, + "reg_dec_producer": 0, + "reg_inc_consumer": 0, + "maxnreg": null, + "pre_hook": null + }, + "(13, 512, 128, 27, 8, True, 'torch.float16', 'torch.float16', 'torch.uint32', 'torch.int64', 'torch.float32')": { + "kwargs": { + "B1": 128, + "B2": 128, + "BK": 64 + }, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 4, + "num_buffers_warp_spec": 0, + "num_consumer_groups": 0, + "reg_dec_producer": 0, + "reg_inc_consumer": 0, + "maxnreg": null, + "pre_hook": null + }, + "(15, 256, 64, 27, 2, True, 'torch.float16', 'torch.float16', 'torch.uint32', 'torch.int64', 'torch.float32')": { + "kwargs": { + "B1": 128, + "B2": 128, + "BK": 64 + }, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 4, + "num_buffers_warp_spec": 0, + "num_consumer_groups": 0, + "reg_dec_producer": 0, + "reg_inc_consumer": 0, + "maxnreg": null, + "pre_hook": null + }, + "(15, 256, 64, 27, 4, True, 'torch.float16', 'torch.float16', 'torch.uint32', 'torch.int64', 'torch.float32')": { + "kwargs": { + "B1": 128, + "B2": 128, + "BK": 64 + }, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 4, + "num_buffers_warp_spec": 0, + "num_consumer_groups": 0, + "reg_dec_producer": 0, + "reg_inc_consumer": 0, + "maxnreg": null, + "pre_hook": null + }, + "(17, 128, 32, 27, 2, True, 'torch.float16', 'torch.float16', 'torch.uint32', 'torch.int64', 'torch.float32')": { + "kwargs": { + "B1": 128, + "B2": 128, + "BK": 32 + }, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 4, + "num_buffers_warp_spec": 0, + "num_consumer_groups": 0, + "reg_dec_producer": 0, + "reg_inc_consumer": 0, + "maxnreg": null, + "pre_hook": null + }, + "(11, 1024, 4096, 27, 2, True, 'torch.float16', 'torch.float16', 'torch.uint32', 'torch.int64', 'torch.float32')": { + "kwargs": { + "B1": 128, + "B2": 256, + "BK": 64 + }, + "num_warps": 8, + "num_ctas": 1, + "num_stages": 4, + "num_buffers_warp_spec": 0, + "num_consumer_groups": 0, + "reg_dec_producer": 0, + "reg_inc_consumer": 0, + "maxnreg": null, + "pre_hook": null + }, + "(11, 1024, 4096, 27, 4, True, 'torch.float16', 'torch.float16', 'torch.uint32', 'torch.int64', 'torch.float32')": { + "kwargs": { + "B1": 128, + "B2": 256, + "BK": 64 + }, + "num_warps": 8, + "num_ctas": 1, + "num_stages": 4, + "num_buffers_warp_spec": 0, + "num_consumer_groups": 0, + "reg_dec_producer": 0, + "reg_inc_consumer": 0, + "maxnreg": null, + "pre_hook": null + }, + "(11, 1024, 4096, 27, 8, True, 'torch.float16', 'torch.float16', 'torch.uint32', 'torch.int64', 'torch.float32')": { + "kwargs": { + "B1": 128, + "B2": 128, + "BK": 64 + }, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 4, + "num_buffers_warp_spec": 0, + "num_consumer_groups": 0, + "reg_dec_producer": 0, + "reg_inc_consumer": 0, + "maxnreg": null, + "pre_hook": null + }, + "(11, 1024, 4096, 27, 16, True, 'torch.float16', 'torch.float16', 'torch.uint32', 'torch.int64', 'torch.float32')": { + "kwargs": { + "B1": 128, + "B2": 128, + "BK": 64 + }, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 4, + "num_buffers_warp_spec": 0, + "num_consumer_groups": 0, + "reg_dec_producer": 0, + "reg_inc_consumer": 0, + "maxnreg": null, + "pre_hook": null + }, + "(11, 1024, 1024, 27, 2, True, 'torch.float16', 'torch.float16', 'torch.uint32', 'torch.int64', 'torch.float32')": { + "kwargs": { + "B1": 128, + "B2": 128, + "BK": 64 + }, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 4, + "num_buffers_warp_spec": 0, + "num_consumer_groups": 0, + "reg_dec_producer": 0, + "reg_inc_consumer": 0, + "maxnreg": null, + "pre_hook": null + }, + "(11, 1024, 1024, 27, 4, True, 'torch.float16', 'torch.float16', 'torch.uint32', 'torch.int64', 'torch.float32')": { + "kwargs": { + "B1": 128, + "B2": 128, + "BK": 64 + }, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 4, + "num_buffers_warp_spec": 0, + "num_consumer_groups": 0, + "reg_dec_producer": 0, + "reg_inc_consumer": 0, + "maxnreg": null, + "pre_hook": null + }, + "(11, 1024, 1024, 27, 8, True, 'torch.float16', 'torch.float16', 'torch.uint32', 'torch.int64', 'torch.float32')": { + "kwargs": { + "B1": 128, + "B2": 128, + "BK": 64 + }, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 4, + "num_buffers_warp_spec": 0, + "num_consumer_groups": 0, + "reg_dec_producer": 0, + "reg_inc_consumer": 0, + "maxnreg": null, + "pre_hook": null + }, + "(11, 1024, 1024, 27, 16, True, 'torch.float16', 'torch.float16', 'torch.uint32', 'torch.int64', 'torch.float32')": { + "kwargs": { + "B1": 128, + "B2": 128, + "BK": 64 + }, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 4, + "num_buffers_warp_spec": 0, + "num_consumer_groups": 0, + "reg_dec_producer": 0, + "reg_inc_consumer": 0, + "maxnreg": null, + "pre_hook": null + }, + "(10, 512, 2048, 27, 2, True, 'torch.float16', 'torch.float16', 'torch.uint32', 'torch.int64', 'torch.float32')": { + "kwargs": { + "B1": 128, + "B2": 128, + "BK": 64 + }, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 4, + "num_buffers_warp_spec": 0, + "num_consumer_groups": 0, + "reg_dec_producer": 0, + "reg_inc_consumer": 0, + "maxnreg": null, + "pre_hook": null + }, + "(10, 512, 2048, 27, 4, True, 'torch.float16', 'torch.float16', 'torch.uint32', 'torch.int64', 'torch.float32')": { + "kwargs": { + "B1": 128, + "B2": 128, + "BK": 64 + }, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 4, + "num_buffers_warp_spec": 0, + "num_consumer_groups": 0, + "reg_dec_producer": 0, + "reg_inc_consumer": 0, + "maxnreg": null, + "pre_hook": null + }, + "(10, 512, 2048, 27, 8, True, 'torch.float16', 'torch.float16', 'torch.uint32', 'torch.int64', 'torch.float32')": { + "kwargs": { + "B1": 128, + "B2": 128, + "BK": 64 + }, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 4, + "num_buffers_warp_spec": 0, + "num_consumer_groups": 0, + "reg_dec_producer": 0, + "reg_inc_consumer": 0, + "maxnreg": null, + "pre_hook": null + }, + "(10, 512, 2048, 27, 16, True, 'torch.float16', 'torch.float16', 'torch.uint32', 'torch.int64', 'torch.float32')": { + "kwargs": { + "B1": 128, + "B2": 128, + "BK": 64 + }, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 4, + "num_buffers_warp_spec": 0, + "num_consumer_groups": 0, + "reg_dec_producer": 0, + "reg_inc_consumer": 0, + "maxnreg": null, + "pre_hook": null + }, + "(10, 512, 2048, 27, 32, True, 'torch.float16', 'torch.float16', 'torch.uint32', 'torch.int64', 'torch.float32')": { + "kwargs": { + "B1": 128, + "B2": 128, + "BK": 64 + }, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 4, + "num_buffers_warp_spec": 0, + "num_consumer_groups": 0, + "reg_dec_producer": 0, + "reg_inc_consumer": 0, + "maxnreg": null, + "pre_hook": null + }, + "(10, 512, 512, 27, 2, True, 'torch.float16', 'torch.float16', 'torch.uint32', 'torch.int64', 'torch.float32')": { + "kwargs": { + "B1": 128, + "B2": 128, + "BK": 64 + }, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 4, + "num_buffers_warp_spec": 0, + "num_consumer_groups": 0, + "reg_dec_producer": 0, + "reg_inc_consumer": 0, + "maxnreg": null, + "pre_hook": null + }, + "(10, 512, 512, 27, 4, True, 'torch.float16', 'torch.float16', 'torch.uint32', 'torch.int64', 'torch.float32')": { + "kwargs": { + "B1": 128, + "B2": 128, + "BK": 64 + }, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 4, + "num_buffers_warp_spec": 0, + "num_consumer_groups": 0, + "reg_dec_producer": 0, + "reg_inc_consumer": 0, + "maxnreg": null, + "pre_hook": null + }, + "(10, 512, 512, 27, 8, True, 'torch.float16', 'torch.float16', 'torch.uint32', 'torch.int64', 'torch.float32')": { + "kwargs": { + "B1": 128, + "B2": 128, + "BK": 64 + }, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 4, + "num_buffers_warp_spec": 0, + "num_consumer_groups": 0, + "reg_dec_producer": 0, + "reg_inc_consumer": 0, + "maxnreg": null, + "pre_hook": null + }, + "(10, 512, 512, 27, 16, True, 'torch.float16', 'torch.float16', 'torch.uint32', 'torch.int64', 'torch.float32')": { + "kwargs": { + "B1": 128, + "B2": 128, + "BK": 64 + }, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 4, + "num_buffers_warp_spec": 0, + "num_consumer_groups": 0, + "reg_dec_producer": 0, + "reg_inc_consumer": 0, + "maxnreg": null, + "pre_hook": null + }, + "(10, 512, 512, 27, 32, True, 'torch.float16', 'torch.float16', 'torch.uint32', 'torch.int64', 'torch.float32')": { + "kwargs": { + "B1": 128, + "B2": 128, + "BK": 64 + }, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 4, + "num_buffers_warp_spec": 0, + "num_consumer_groups": 0, + "reg_dec_producer": 0, + "reg_inc_consumer": 0, + "maxnreg": null, + "pre_hook": null + }, + "(10, 512, 128, 27, 2, True, 'torch.float16', 'torch.float16', 'torch.uint32', 'torch.int64', 'torch.float32')": { + "kwargs": { + "B1": 64, + "B2": 64, + "BK": 32 + }, + "num_warps": 2, + "num_ctas": 1, + "num_stages": 4, + "num_buffers_warp_spec": 0, + "num_consumer_groups": 0, + "reg_dec_producer": 0, + "reg_inc_consumer": 0, + "maxnreg": null, + "pre_hook": null + }, + "(10, 512, 128, 27, 4, True, 'torch.float16', 'torch.float16', 'torch.uint32', 'torch.int64', 'torch.float32')": { + "kwargs": { + "B1": 128, + "B2": 128, + "BK": 64 + }, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 4, + "num_buffers_warp_spec": 0, + "num_consumer_groups": 0, + "reg_dec_producer": 0, + "reg_inc_consumer": 0, + "maxnreg": null, + "pre_hook": null + }, + "(10, 512, 128, 27, 8, True, 'torch.float16', 'torch.float16', 'torch.uint32', 'torch.int64', 'torch.float32')": { + "kwargs": { + "B1": 128, + "B2": 128, + "BK": 64 + }, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 4, + "num_buffers_warp_spec": 0, + "num_consumer_groups": 0, + "reg_dec_producer": 0, + "reg_inc_consumer": 0, + "maxnreg": null, + "pre_hook": null + }, + "(10, 512, 128, 27, 16, True, 'torch.float16', 'torch.float16', 'torch.uint32', 'torch.int64', 'torch.float32')": { + "kwargs": { + "B1": 128, + "B2": 128, + "BK": 64 + }, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 4, + "num_buffers_warp_spec": 0, + "num_consumer_groups": 0, + "reg_dec_producer": 0, + "reg_inc_consumer": 0, + "maxnreg": null, + "pre_hook": null + }, + "(10, 512, 128, 27, 32, True, 'torch.float16', 'torch.float16', 'torch.uint32', 'torch.int64', 'torch.float32')": { + "kwargs": { + "B1": 128, + "B2": 128, + "BK": 64 + }, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 4, + "num_buffers_warp_spec": 0, + "num_consumer_groups": 0, + "reg_dec_producer": 0, + "reg_inc_consumer": 0, + "maxnreg": null, + "pre_hook": null + }, + "(16, 256, 1024, 27, 2, True, 'torch.float16', 'torch.float16', 'torch.uint32', 'torch.int64', 'torch.float32')": { + "kwargs": { + "B1": 128, + "B2": 256, + "BK": 64 + }, + "num_warps": 8, + "num_ctas": 1, + "num_stages": 4, + "num_buffers_warp_spec": 0, + "num_consumer_groups": 0, + "reg_dec_producer": 0, + "reg_inc_consumer": 0, + "maxnreg": null, + "pre_hook": null + }, + "(16, 256, 256, 27, 2, True, 'torch.float16', 'torch.float16', 'torch.uint32', 'torch.int64', 'torch.float32')": { + "kwargs": { + "B1": 128, + "B2": 128, + "BK": 64 + }, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 4, + "num_buffers_warp_spec": 0, + "num_consumer_groups": 0, + "reg_dec_producer": 0, + "reg_inc_consumer": 0, + "maxnreg": null, + "pre_hook": null + }, + "(16, 256, 64, 27, 2, True, 'torch.float16', 'torch.float16', 'torch.uint32', 'torch.int64', 'torch.float32')": { + "kwargs": { + "B1": 128, + "B2": 128, + "BK": 64 + }, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 4, + "num_buffers_warp_spec": 0, + "num_consumer_groups": 0, + "reg_dec_producer": 0, + "reg_inc_consumer": 0, + "maxnreg": null, + "pre_hook": null + }, + "(12, 256, 1024, 27, 2, True, 'torch.float16', 'torch.float16', 'torch.uint32', 'torch.int64', 'torch.float32')": { + "kwargs": { + "B1": 128, + "B2": 128, + "BK": 64 + }, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 4, + "num_buffers_warp_spec": 0, + "num_consumer_groups": 0, + "reg_dec_producer": 0, + "reg_inc_consumer": 0, + "maxnreg": null, + "pre_hook": null + }, + "(12, 256, 1024, 27, 4, True, 'torch.float16', 'torch.float16', 'torch.uint32', 'torch.int64', 'torch.float32')": { + "kwargs": { + "B1": 128, + "B2": 128, + "BK": 64 + }, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 4, + "num_buffers_warp_spec": 0, + "num_consumer_groups": 0, + "reg_dec_producer": 0, + "reg_inc_consumer": 0, + "maxnreg": null, + "pre_hook": null + }, + "(12, 256, 1024, 27, 8, True, 'torch.float16', 'torch.float16', 'torch.uint32', 'torch.int64', 'torch.float32')": { + "kwargs": { + "B1": 128, + "B2": 128, + "BK": 64 + }, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 4, + "num_buffers_warp_spec": 0, + "num_consumer_groups": 0, + "reg_dec_producer": 0, + "reg_inc_consumer": 0, + "maxnreg": null, + "pre_hook": null + }, + "(12, 256, 1024, 27, 16, True, 'torch.float16', 'torch.float16', 'torch.uint32', 'torch.int64', 'torch.float32')": { + "kwargs": { + "B1": 128, + "B2": 128, + "BK": 64 + }, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 4, + "num_buffers_warp_spec": 0, + "num_consumer_groups": 0, + "reg_dec_producer": 0, + "reg_inc_consumer": 0, + "maxnreg": null, + "pre_hook": null + }, + "(12, 256, 256, 27, 2, True, 'torch.float16', 'torch.float16', 'torch.uint32', 'torch.int64', 'torch.float32')": { + "kwargs": { + "B1": 128, + "B2": 128, + "BK": 64 + }, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 4, + "num_buffers_warp_spec": 0, + "num_consumer_groups": 0, + "reg_dec_producer": 0, + "reg_inc_consumer": 0, + "maxnreg": null, + "pre_hook": null + }, + "(12, 256, 256, 27, 4, True, 'torch.float16', 'torch.float16', 'torch.uint32', 'torch.int64', 'torch.float32')": { + "kwargs": { + "B1": 128, + "B2": 128, + "BK": 64 + }, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 4, + "num_buffers_warp_spec": 0, + "num_consumer_groups": 0, + "reg_dec_producer": 0, + "reg_inc_consumer": 0, + "maxnreg": null, + "pre_hook": null + }, + "(12, 256, 256, 27, 8, True, 'torch.float16', 'torch.float16', 'torch.uint32', 'torch.int64', 'torch.float32')": { + "kwargs": { + "B1": 128, + "B2": 128, + "BK": 64 + }, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 4, + "num_buffers_warp_spec": 0, + "num_consumer_groups": 0, + "reg_dec_producer": 0, + "reg_inc_consumer": 0, + "maxnreg": null, + "pre_hook": null + }, + "(12, 256, 256, 27, 16, True, 'torch.float16', 'torch.float16', 'torch.uint32', 'torch.int64', 'torch.float32')": { + "kwargs": { + "B1": 128, + "B2": 128, + "BK": 64 + }, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 4, + "num_buffers_warp_spec": 0, + "num_consumer_groups": 0, + "reg_dec_producer": 0, + "reg_inc_consumer": 0, + "maxnreg": null, + "pre_hook": null + }, + "(12, 256, 64, 27, 2, True, 'torch.float16', 'torch.float16', 'torch.uint32', 'torch.int64', 'torch.float32')": { + "kwargs": { + "B1": 128, + "B2": 128, + "BK": 64 + }, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 4, + "num_buffers_warp_spec": 0, + "num_consumer_groups": 0, + "reg_dec_producer": 0, + "reg_inc_consumer": 0, + "maxnreg": null, + "pre_hook": null + }, + "(12, 256, 64, 27, 4, True, 'torch.float16', 'torch.float16', 'torch.uint32', 'torch.int64', 'torch.float32')": { + "kwargs": { + "B1": 128, + "B2": 128, + "BK": 64 + }, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 4, + "num_buffers_warp_spec": 0, + "num_consumer_groups": 0, + "reg_dec_producer": 0, + "reg_inc_consumer": 0, + "maxnreg": null, + "pre_hook": null + }, + "(12, 256, 64, 27, 8, True, 'torch.float16', 'torch.float16', 'torch.uint32', 'torch.int64', 'torch.float32')": { + "kwargs": { + "B1": 128, + "B2": 128, + "BK": 64 + }, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 4, + "num_buffers_warp_spec": 0, + "num_consumer_groups": 0, + "reg_dec_producer": 0, + "reg_inc_consumer": 0, + "maxnreg": null, + "pre_hook": null + }, + "(12, 256, 64, 27, 16, True, 'torch.float16', 'torch.float16', 'torch.uint32', 'torch.int64', 'torch.float32')": { + "kwargs": { + "B1": 128, + "B2": 128, + "BK": 64 + }, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 4, + "num_buffers_warp_spec": 0, + "num_consumer_groups": 0, + "reg_dec_producer": 0, + "reg_inc_consumer": 0, + "maxnreg": null, + "pre_hook": null + }, + "(14, 512, 2048, 27, 2, True, 'torch.float16', 'torch.float16', 'torch.uint32', 'torch.int64', 'torch.float32')": { + "kwargs": { + "B1": 128, + "B2": 256, + "BK": 64 + }, + "num_warps": 8, + "num_ctas": 1, + "num_stages": 4, + "num_buffers_warp_spec": 0, + "num_consumer_groups": 0, + "reg_dec_producer": 0, + "reg_inc_consumer": 0, + "maxnreg": null, + "pre_hook": null + }, + "(14, 512, 2048, 27, 4, True, 'torch.float16', 'torch.float16', 'torch.uint32', 'torch.int64', 'torch.float32')": { + "kwargs": { + "B1": 128, + "B2": 256, + "BK": 64 + }, + "num_warps": 8, + "num_ctas": 1, + "num_stages": 4, + "num_buffers_warp_spec": 0, + "num_consumer_groups": 0, + "reg_dec_producer": 0, + "reg_inc_consumer": 0, + "maxnreg": null, + "pre_hook": null + }, + "(14, 512, 512, 27, 2, True, 'torch.float16', 'torch.float16', 'torch.uint32', 'torch.int64', 'torch.float32')": { + "kwargs": { + "B1": 128, + "B2": 128, + "BK": 64 + }, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 4, + "num_buffers_warp_spec": 0, + "num_consumer_groups": 0, + "reg_dec_producer": 0, + "reg_inc_consumer": 0, + "maxnreg": null, + "pre_hook": null + }, + "(14, 512, 512, 27, 4, True, 'torch.float16', 'torch.float16', 'torch.uint32', 'torch.int64', 'torch.float32')": { + "kwargs": { + "B1": 128, + "B2": 128, + "BK": 64 + }, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 4, + "num_buffers_warp_spec": 0, + "num_consumer_groups": 0, + "reg_dec_producer": 0, + "reg_inc_consumer": 0, + "maxnreg": null, + "pre_hook": null + }, + "(14, 512, 128, 27, 2, True, 'torch.float16', 'torch.float16', 'torch.uint32', 'torch.int64', 'torch.float32')": { + "kwargs": { + "B1": 128, + "B2": 128, + "BK": 64 + }, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 4, + "num_buffers_warp_spec": 0, + "num_consumer_groups": 0, + "reg_dec_producer": 0, + "reg_inc_consumer": 0, + "maxnreg": null, + "pre_hook": null + }, + "(14, 512, 128, 27, 4, True, 'torch.float16', 'torch.float16', 'torch.uint32', 'torch.int64', 'torch.float32')": { + "kwargs": { + "B1": 128, + "B2": 128, + "BK": 64 + }, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 4, + "num_buffers_warp_spec": 0, + "num_consumer_groups": 0, + "reg_dec_producer": 0, + "reg_inc_consumer": 0, + "maxnreg": null, + "pre_hook": null + }, + "(7, 1024, 4096, 27, 4, True, 'torch.float16', 'torch.float16', 'torch.uint32', 'torch.int64', 'torch.float32')": { + "kwargs": { + "B1": 64, + "B2": 64, + "BK": 64 + }, + "num_warps": 2, + "num_ctas": 1, + "num_stages": 4, + "num_buffers_warp_spec": 0, + "num_consumer_groups": 0, + "reg_dec_producer": 0, + "reg_inc_consumer": 0, + "maxnreg": null, + "pre_hook": null + }, + "(7, 1024, 4096, 27, 8, True, 'torch.float16', 'torch.float16', 'torch.uint32', 'torch.int64', 'torch.float32')": { + "kwargs": { + "B1": 128, + "B2": 128, + "BK": 64 + }, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 4, + "num_buffers_warp_spec": 0, + "num_consumer_groups": 0, + "reg_dec_producer": 0, + "reg_inc_consumer": 0, + "maxnreg": null, + "pre_hook": null + }, + "(7, 1024, 4096, 27, 16, True, 'torch.float16', 'torch.float16', 'torch.uint32', 'torch.int64', 'torch.float32')": { + "kwargs": { + "B1": 128, + "B2": 128, + "BK": 64 + }, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 4, + "num_buffers_warp_spec": 0, + "num_consumer_groups": 0, + "reg_dec_producer": 0, + "reg_inc_consumer": 0, + "maxnreg": null, + "pre_hook": null + }, + "(7, 1024, 4096, 27, 32, True, 'torch.float16', 'torch.float16', 'torch.uint32', 'torch.int64', 'torch.float32')": { + "kwargs": { + "B1": 128, + "B2": 128, + "BK": 64 + }, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 4, + "num_buffers_warp_spec": 0, + "num_consumer_groups": 0, + "reg_dec_producer": 0, + "reg_inc_consumer": 0, + "maxnreg": null, + "pre_hook": null + }, + "(7, 1024, 4096, 27, 64, True, 'torch.float16', 'torch.float16', 'torch.uint32', 'torch.int64', 'torch.float32')": { + "kwargs": { + "B1": 128, + "B2": 128, + "BK": 64 + }, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 4, + "num_buffers_warp_spec": 0, + "num_consumer_groups": 0, + "reg_dec_producer": 0, + "reg_inc_consumer": 0, + "maxnreg": null, + "pre_hook": null + }, + "(7, 1024, 4096, 27, 128, True, 'torch.float16', 'torch.float16', 'torch.uint32', 'torch.int64', 'torch.float32')": { + "kwargs": { + "B1": 128, + "B2": 128, + "BK": 64 + }, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 4, + "num_buffers_warp_spec": 0, + "num_consumer_groups": 0, + "reg_dec_producer": 0, + "reg_inc_consumer": 0, + "maxnreg": null, + "pre_hook": null + }, + "(7, 1024, 1024, 27, 4, True, 'torch.float16', 'torch.float16', 'torch.uint32', 'torch.int64', 'torch.float32')": { + "kwargs": { + "B1": 64, + "B2": 64, + "BK": 64 + }, + "num_warps": 2, + "num_ctas": 1, + "num_stages": 4, + "num_buffers_warp_spec": 0, + "num_consumer_groups": 0, + "reg_dec_producer": 0, + "reg_inc_consumer": 0, + "maxnreg": null, + "pre_hook": null + }, + "(7, 1024, 1024, 27, 8, True, 'torch.float16', 'torch.float16', 'torch.uint32', 'torch.int64', 'torch.float32')": { + "kwargs": { + "B1": 128, + "B2": 128, + "BK": 64 + }, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 4, + "num_buffers_warp_spec": 0, + "num_consumer_groups": 0, + "reg_dec_producer": 0, + "reg_inc_consumer": 0, + "maxnreg": null, + "pre_hook": null + }, + "(7, 1024, 1024, 27, 16, True, 'torch.float16', 'torch.float16', 'torch.uint32', 'torch.int64', 'torch.float32')": { + "kwargs": { + "B1": 128, + "B2": 128, + "BK": 64 + }, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 4, + "num_buffers_warp_spec": 0, + "num_consumer_groups": 0, + "reg_dec_producer": 0, + "reg_inc_consumer": 0, + "maxnreg": null, + "pre_hook": null + }, + "(7, 1024, 1024, 27, 32, True, 'torch.float16', 'torch.float16', 'torch.uint32', 'torch.int64', 'torch.float32')": { + "kwargs": { + "B1": 128, + "B2": 128, + "BK": 64 + }, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 4, + "num_buffers_warp_spec": 0, + "num_consumer_groups": 0, + "reg_dec_producer": 0, + "reg_inc_consumer": 0, + "maxnreg": null, + "pre_hook": null + }, + "(7, 1024, 1024, 27, 64, True, 'torch.float16', 'torch.float16', 'torch.uint32', 'torch.int64', 'torch.float32')": { + "kwargs": { + "B1": 128, + "B2": 128, + "BK": 64 + }, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 4, + "num_buffers_warp_spec": 0, + "num_consumer_groups": 0, + "reg_dec_producer": 0, + "reg_inc_consumer": 0, + "maxnreg": null, + "pre_hook": null + }, + "(7, 1024, 1024, 27, 128, True, 'torch.float16', 'torch.float16', 'torch.uint32', 'torch.int64', 'torch.float32')": { + "kwargs": { + "B1": 128, + "B2": 128, + "BK": 64 + }, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 4, + "num_buffers_warp_spec": 0, + "num_consumer_groups": 0, + "reg_dec_producer": 0, + "reg_inc_consumer": 0, + "maxnreg": null, + "pre_hook": null + }, + "(14, 128, 512, 27, 2, True, 'torch.float16', 'torch.float16', 'torch.uint32', 'torch.int64', 'torch.float32')": { + "kwargs": { + "B1": 128, + "B2": 128, + "BK": 64 + }, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 4, + "num_buffers_warp_spec": 0, + "num_consumer_groups": 0, + "reg_dec_producer": 0, + "reg_inc_consumer": 0, + "maxnreg": null, + "pre_hook": null + }, + "(14, 128, 512, 27, 4, True, 'torch.float16', 'torch.float16', 'torch.uint32', 'torch.int64', 'torch.float32')": { + "kwargs": { + "B1": 128, + "B2": 128, + "BK": 64 + }, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 4, + "num_buffers_warp_spec": 0, + "num_consumer_groups": 0, + "reg_dec_producer": 0, + "reg_inc_consumer": 0, + "maxnreg": null, + "pre_hook": null + }, + "(14, 128, 512, 27, 8, True, 'torch.float16', 'torch.float16', 'torch.uint32', 'torch.int64', 'torch.float32')": { + "kwargs": { + "B1": 128, + "B2": 128, + "BK": 64 + }, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 4, + "num_buffers_warp_spec": 0, + "num_consumer_groups": 0, + "reg_dec_producer": 0, + "reg_inc_consumer": 0, + "maxnreg": null, + "pre_hook": null + }, + "(14, 128, 128, 27, 2, True, 'torch.float16', 'torch.float16', 'torch.uint32', 'torch.int64', 'torch.float32')": { + "kwargs": { + "B1": 128, + "B2": 128, + "BK": 64 + }, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 4, + "num_buffers_warp_spec": 0, + "num_consumer_groups": 0, + "reg_dec_producer": 0, + "reg_inc_consumer": 0, + "maxnreg": null, + "pre_hook": null + }, + "(14, 128, 128, 27, 4, True, 'torch.float16', 'torch.float16', 'torch.uint32', 'torch.int64', 'torch.float32')": { + "kwargs": { + "B1": 128, + "B2": 128, + "BK": 64 + }, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 4, + "num_buffers_warp_spec": 0, + "num_consumer_groups": 0, + "reg_dec_producer": 0, + "reg_inc_consumer": 0, + "maxnreg": null, + "pre_hook": null + }, + "(14, 128, 128, 27, 8, True, 'torch.float16', 'torch.float16', 'torch.uint32', 'torch.int64', 'torch.float32')": { + "kwargs": { + "B1": 128, + "B2": 128, + "BK": 64 + }, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 4, + "num_buffers_warp_spec": 0, + "num_consumer_groups": 0, + "reg_dec_producer": 0, + "reg_inc_consumer": 0, + "maxnreg": null, + "pre_hook": null + }, + "(14, 128, 32, 27, 2, True, 'torch.float16', 'torch.float16', 'torch.uint32', 'torch.int64', 'torch.float32')": { + "kwargs": { + "B1": 128, + "B2": 128, + "BK": 32 + }, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 4, + "num_buffers_warp_spec": 0, + "num_consumer_groups": 0, + "reg_dec_producer": 0, + "reg_inc_consumer": 0, + "maxnreg": null, + "pre_hook": null + }, + "(14, 128, 32, 27, 4, True, 'torch.float16', 'torch.float16', 'torch.uint32', 'torch.int64', 'torch.float32')": { + "kwargs": { + "B1": 128, + "B2": 128, + "BK": 32 + }, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 4, + "num_buffers_warp_spec": 0, + "num_consumer_groups": 0, + "reg_dec_producer": 0, + "reg_inc_consumer": 0, + "maxnreg": null, + "pre_hook": null + }, + "(14, 128, 32, 27, 8, True, 'torch.float16', 'torch.float16', 'torch.uint32', 'torch.int64', 'torch.float32')": { + "kwargs": { + "B1": 64, + "B2": 128, + "BK": 32 + }, + "num_warps": 2, + "num_ctas": 1, + "num_stages": 4, + "num_buffers_warp_spec": 0, + "num_consumer_groups": 0, + "reg_dec_producer": 0, + "reg_inc_consumer": 0, + "maxnreg": null, + "pre_hook": null + }, + "(16, 64, 64, 27, 2, True, 'torch.float16', 'torch.float16', 'torch.uint32', 'torch.int64', 'torch.float32')": { + "kwargs": { + "B1": 256, + "B2": 64, + "BK": 32 + }, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 4, + "num_buffers_warp_spec": 0, + "num_consumer_groups": 0, + "reg_dec_producer": 0, + "reg_inc_consumer": 0, + "maxnreg": null, + "pre_hook": null + }, + "(16, 64, 16, 27, 2, True, 'torch.float16', 'torch.float16', 'torch.uint32', 'torch.int64', 'torch.float32')": { + "kwargs": { + "B1": 128, + "B2": 64, + "BK": 32 + }, + "num_warps": 2, + "num_ctas": 1, + "num_stages": 4, + "num_buffers_warp_spec": 0, + "num_consumer_groups": 0, + "reg_dec_producer": 0, + "reg_inc_consumer": 0, + "maxnreg": null, + "pre_hook": null + }, + "(13, 1024, 1024, 27, 2, True, 'torch.float16', 'torch.float16', 'torch.uint32', 'torch.int64', 'torch.float32')": { + "kwargs": { + "B1": 256, + "B2": 128, + "BK": 64 + }, + "num_warps": 8, + "num_ctas": 1, + "num_stages": 4, + "num_buffers_warp_spec": 0, + "num_consumer_groups": 0, + "reg_dec_producer": 0, + "reg_inc_consumer": 0, + "maxnreg": null, + "pre_hook": null + }, + "(7, 1024, 1024, 27, 8, True, 'torch.float32', 'torch.float32', 'torch.uint32', 'torch.int64', 'torch.float32')": { + "kwargs": { + "B1": 64, + "B2": 128, + "BK": 32 + }, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 4, + "num_buffers_warp_spec": 0, + "num_consumer_groups": 0, + "reg_dec_producer": 0, + "reg_inc_consumer": 0, + "maxnreg": null, + "pre_hook": null + }, + "(9, 1024, 1024, 27, 8, True, 'torch.float32', 'torch.float32', 'torch.uint32', 'torch.int64', 'torch.float32')": { + "kwargs": { + "B1": 128, + "B2": 128, + "BK": 32 + }, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 4, + "num_buffers_warp_spec": 0, + "num_consumer_groups": 0, + "reg_dec_producer": 0, + "reg_inc_consumer": 0, + "maxnreg": null, + "pre_hook": null + }, + "(11, 1024, 1024, 27, 4, True, 'torch.float32', 'torch.float32', 'torch.uint32', 'torch.int64', 'torch.float32')": { + "kwargs": { + "B1": 64, + "B2": 256, + "BK": 32 + }, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 4, + "num_buffers_warp_spec": 0, + "num_consumer_groups": 0, + "reg_dec_producer": 0, + "reg_inc_consumer": 0, + "maxnreg": null, + "pre_hook": null + } + }, + "flex_gemm.kernels.triton.spconv.sparse_submanifold_conv_bwd_masked_implicit_gemm_splitk.sparse_submanifold_conv_bwd_weight_masked_implicit_gemm_splitk_kernel": { + "(18, 64, 64, 27, 4, True, 'torch.float16', 'torch.float16', 'torch.uint32', 'torch.uint32', 'torch.uint32', 'torch.float32')": { + "kwargs": { + "B1": 64, + "B2": 64, + "BK": 64 + }, + "num_warps": 2, + "num_ctas": 1, + "num_stages": 4, + "num_buffers_warp_spec": 0, + "num_consumer_groups": 0, + "reg_dec_producer": 0, + "reg_inc_consumer": 0, + "maxnreg": null, + "pre_hook": null + }, + "(18, 64, 64, 27, 8, True, 'torch.float16', 'torch.float16', 'torch.uint32', 'torch.uint32', 'torch.uint32', 'torch.float32')": { + "kwargs": { + "B1": 64, + "B2": 64, + "BK": 64 + }, + "num_warps": 2, + "num_ctas": 1, + "num_stages": 4, + "num_buffers_warp_spec": 0, + "num_consumer_groups": 0, + "reg_dec_producer": 0, + "reg_inc_consumer": 0, + "maxnreg": null, + "pre_hook": null + }, + "(18, 64, 64, 27, 16, True, 'torch.float16', 'torch.float16', 'torch.uint32', 'torch.uint32', 'torch.uint32', 'torch.float32')": { + "kwargs": { + "B1": 64, + "B2": 64, + "BK": 64 + }, + "num_warps": 2, + "num_ctas": 1, + "num_stages": 4, + "num_buffers_warp_spec": 0, + "num_consumer_groups": 0, + "reg_dec_producer": 0, + "reg_inc_consumer": 0, + "maxnreg": null, + "pre_hook": null + }, + "(18, 64, 64, 27, 32, True, 'torch.float16', 'torch.float16', 'torch.uint32', 'torch.uint32', 'torch.uint32', 'torch.float32')": { + "kwargs": { + "B1": 64, + "B2": 64, + "BK": 64 + }, + "num_warps": 2, + "num_ctas": 1, + "num_stages": 4, + "num_buffers_warp_spec": 0, + "num_consumer_groups": 0, + "reg_dec_producer": 0, + "reg_inc_consumer": 0, + "maxnreg": null, + "pre_hook": null + }, + "(18, 64, 64, 27, 64, True, 'torch.float16', 'torch.float16', 'torch.uint32', 'torch.uint32', 'torch.uint32', 'torch.float32')": { + "kwargs": { + "B1": 64, + "B2": 64, + "BK": 64 + }, + "num_warps": 2, + "num_ctas": 1, + "num_stages": 4, + "num_buffers_warp_spec": 0, + "num_consumer_groups": 0, + "reg_dec_producer": 0, + "reg_inc_consumer": 0, + "maxnreg": null, + "pre_hook": null + }, + "(18, 64, 64, 27, 128, True, 'torch.float16', 'torch.float16', 'torch.uint32', 'torch.uint32', 'torch.uint32', 'torch.float32')": { + "kwargs": { + "B1": 64, + "B2": 64, + "BK": 64 + }, + "num_warps": 2, + "num_ctas": 1, + "num_stages": 4, + "num_buffers_warp_spec": 0, + "num_consumer_groups": 0, + "reg_dec_producer": 0, + "reg_inc_consumer": 0, + "maxnreg": null, + "pre_hook": null + }, + "(16, 128, 512, 27, 2, True, 'torch.float16', 'torch.float16', 'torch.uint32', 'torch.uint32', 'torch.uint32', 'torch.float32')": { + "kwargs": { + "B1": 128, + "B2": 128, + "BK": 64 + }, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 4, + "num_buffers_warp_spec": 0, + "num_consumer_groups": 0, + "reg_dec_producer": 0, + "reg_inc_consumer": 0, + "maxnreg": null, + "pre_hook": null + }, + "(16, 128, 512, 27, 4, True, 'torch.float16', 'torch.float16', 'torch.uint32', 'torch.uint32', 'torch.uint32', 'torch.float32')": { + "kwargs": { + "B1": 128, + "B2": 128, + "BK": 64 + }, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 4, + "num_buffers_warp_spec": 0, + "num_consumer_groups": 0, + "reg_dec_producer": 0, + "reg_inc_consumer": 0, + "maxnreg": null, + "pre_hook": null + }, + "(16, 128, 512, 27, 8, True, 'torch.float16', 'torch.float16', 'torch.uint32', 'torch.uint32', 'torch.uint32', 'torch.float32')": { + "kwargs": { + "B1": 128, + "B2": 128, + "BK": 64 + }, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 4, + "num_buffers_warp_spec": 0, + "num_consumer_groups": 0, + "reg_dec_producer": 0, + "reg_inc_consumer": 0, + "maxnreg": null, + "pre_hook": null + }, + "(16, 128, 512, 27, 16, True, 'torch.float16', 'torch.float16', 'torch.uint32', 'torch.uint32', 'torch.uint32', 'torch.float32')": { + "kwargs": { + "B1": 128, + "B2": 128, + "BK": 64 + }, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 4, + "num_buffers_warp_spec": 0, + "num_consumer_groups": 0, + "reg_dec_producer": 0, + "reg_inc_consumer": 0, + "maxnreg": null, + "pre_hook": null + }, + "(16, 128, 512, 27, 32, True, 'torch.float16', 'torch.float16', 'torch.uint32', 'torch.uint32', 'torch.uint32', 'torch.float32')": { + "kwargs": { + "B1": 128, + "B2": 128, + "BK": 64 + }, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 4, + "num_buffers_warp_spec": 0, + "num_consumer_groups": 0, + "reg_dec_producer": 0, + "reg_inc_consumer": 0, + "maxnreg": null, + "pre_hook": null + }, + "(16, 128, 128, 27, 4, True, 'torch.float16', 'torch.float16', 'torch.uint32', 'torch.uint32', 'torch.uint32', 'torch.float32')": { + "kwargs": { + "B1": 128, + "B2": 128, + "BK": 64 + }, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 4, + "num_buffers_warp_spec": 0, + "num_consumer_groups": 0, + "reg_dec_producer": 0, + "reg_inc_consumer": 0, + "maxnreg": null, + "pre_hook": null + }, + "(16, 128, 128, 27, 8, True, 'torch.float16', 'torch.float16', 'torch.uint32', 'torch.uint32', 'torch.uint32', 'torch.float32')": { + "kwargs": { + "B1": 128, + "B2": 128, + "BK": 64 + }, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 4, + "num_buffers_warp_spec": 0, + "num_consumer_groups": 0, + "reg_dec_producer": 0, + "reg_inc_consumer": 0, + "maxnreg": null, + "pre_hook": null + }, + "(16, 128, 128, 27, 16, True, 'torch.float16', 'torch.float16', 'torch.uint32', 'torch.uint32', 'torch.uint32', 'torch.float32')": { + "kwargs": { + "B1": 128, + "B2": 128, + "BK": 64 + }, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 4, + "num_buffers_warp_spec": 0, + "num_consumer_groups": 0, + "reg_dec_producer": 0, + "reg_inc_consumer": 0, + "maxnreg": null, + "pre_hook": null + }, + "(16, 128, 128, 27, 32, True, 'torch.float16', 'torch.float16', 'torch.uint32', 'torch.uint32', 'torch.uint32', 'torch.float32')": { + "kwargs": { + "B1": 128, + "B2": 128, + "BK": 64 + }, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 4, + "num_buffers_warp_spec": 0, + "num_consumer_groups": 0, + "reg_dec_producer": 0, + "reg_inc_consumer": 0, + "maxnreg": null, + "pre_hook": null + }, + "(16, 128, 128, 27, 64, True, 'torch.float16', 'torch.float16', 'torch.uint32', 'torch.uint32', 'torch.uint32', 'torch.float32')": { + "kwargs": { + "B1": 128, + "B2": 128, + "BK": 64 + }, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 4, + "num_buffers_warp_spec": 0, + "num_consumer_groups": 0, + "reg_dec_producer": 0, + "reg_inc_consumer": 0, + "maxnreg": null, + "pre_hook": null + }, + "(16, 128, 128, 27, 128, True, 'torch.float16', 'torch.float16', 'torch.uint32', 'torch.uint32', 'torch.uint32', 'torch.float32')": { + "kwargs": { + "B1": 128, + "B2": 128, + "BK": 64 + }, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 4, + "num_buffers_warp_spec": 0, + "num_consumer_groups": 0, + "reg_dec_producer": 0, + "reg_inc_consumer": 0, + "maxnreg": null, + "pre_hook": null + }, + "(14, 256, 1024, 27, 2, True, 'torch.float16', 'torch.float16', 'torch.uint32', 'torch.uint32', 'torch.uint32', 'torch.float32')": { + "kwargs": { + "B1": 128, + "B2": 128, + "BK": 64 + }, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 4, + "num_buffers_warp_spec": 0, + "num_consumer_groups": 0, + "reg_dec_producer": 0, + "reg_inc_consumer": 0, + "maxnreg": null, + "pre_hook": null + }, + "(14, 256, 1024, 27, 4, True, 'torch.float16', 'torch.float16', 'torch.uint32', 'torch.uint32', 'torch.uint32', 'torch.float32')": { + "kwargs": { + "B1": 128, + "B2": 128, + "BK": 64 + }, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 4, + "num_buffers_warp_spec": 0, + "num_consumer_groups": 0, + "reg_dec_producer": 0, + "reg_inc_consumer": 0, + "maxnreg": null, + "pre_hook": null + }, + "(14, 256, 1024, 27, 8, True, 'torch.float16', 'torch.float16', 'torch.uint32', 'torch.uint32', 'torch.uint32', 'torch.float32')": { + "kwargs": { + "B1": 128, + "B2": 128, + "BK": 64 + }, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 4, + "num_buffers_warp_spec": 0, + "num_consumer_groups": 0, + "reg_dec_producer": 0, + "reg_inc_consumer": 0, + "maxnreg": null, + "pre_hook": null + }, + "(14, 256, 256, 27, 2, True, 'torch.float16', 'torch.float16', 'torch.uint32', 'torch.uint32', 'torch.uint32', 'torch.float32')": { + "kwargs": { + "B1": 128, + "B2": 128, + "BK": 64 + }, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 4, + "num_buffers_warp_spec": 0, + "num_consumer_groups": 0, + "reg_dec_producer": 0, + "reg_inc_consumer": 0, + "maxnreg": null, + "pre_hook": null + }, + "(14, 256, 256, 27, 4, True, 'torch.float16', 'torch.float16', 'torch.uint32', 'torch.uint32', 'torch.uint32', 'torch.float32')": { + "kwargs": { + "B1": 128, + "B2": 128, + "BK": 64 + }, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 4, + "num_buffers_warp_spec": 0, + "num_consumer_groups": 0, + "reg_dec_producer": 0, + "reg_inc_consumer": 0, + "maxnreg": null, + "pre_hook": null + }, + "(14, 256, 256, 27, 8, True, 'torch.float16', 'torch.float16', 'torch.uint32', 'torch.uint32', 'torch.uint32', 'torch.float32')": { + "kwargs": { + "B1": 128, + "B2": 128, + "BK": 64 + }, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 4, + "num_buffers_warp_spec": 0, + "num_consumer_groups": 0, + "reg_dec_producer": 0, + "reg_inc_consumer": 0, + "maxnreg": null, + "pre_hook": null + }, + "(14, 256, 256, 27, 16, True, 'torch.float16', 'torch.float16', 'torch.uint32', 'torch.uint32', 'torch.uint32', 'torch.float32')": { + "kwargs": { + "B1": 128, + "B2": 128, + "BK": 64 + }, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 4, + "num_buffers_warp_spec": 0, + "num_consumer_groups": 0, + "reg_dec_producer": 0, + "reg_inc_consumer": 0, + "maxnreg": null, + "pre_hook": null + }, + "(14, 256, 256, 27, 32, True, 'torch.float16', 'torch.float16', 'torch.uint32', 'torch.uint32', 'torch.uint32', 'torch.float32')": { + "kwargs": { + "B1": 128, + "B2": 128, + "BK": 64 + }, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 4, + "num_buffers_warp_spec": 0, + "num_consumer_groups": 0, + "reg_dec_producer": 0, + "reg_inc_consumer": 0, + "maxnreg": null, + "pre_hook": null + }, + "(12, 512, 2048, 27, 2, True, 'torch.float16', 'torch.float16', 'torch.uint32', 'torch.uint32', 'torch.uint32', 'torch.float32')": { + "kwargs": { + "B1": 128, + "B2": 128, + "BK": 64 + }, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 4, + "num_buffers_warp_spec": 0, + "num_consumer_groups": 0, + "reg_dec_producer": 0, + "reg_inc_consumer": 0, + "maxnreg": null, + "pre_hook": null + }, + "(12, 512, 512, 27, 2, True, 'torch.float16', 'torch.float16', 'torch.uint32', 'torch.uint32', 'torch.uint32', 'torch.float32')": { + "kwargs": { + "B1": 128, + "B2": 128, + "BK": 64 + }, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 4, + "num_buffers_warp_spec": 0, + "num_consumer_groups": 0, + "reg_dec_producer": 0, + "reg_inc_consumer": 0, + "maxnreg": null, + "pre_hook": null + }, + "(12, 512, 512, 27, 4, True, 'torch.float16', 'torch.float16', 'torch.uint32', 'torch.uint32', 'torch.uint32', 'torch.float32')": { + "kwargs": { + "B1": 128, + "B2": 128, + "BK": 64 + }, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 4, + "num_buffers_warp_spec": 0, + "num_consumer_groups": 0, + "reg_dec_producer": 0, + "reg_inc_consumer": 0, + "maxnreg": null, + "pre_hook": null + }, + "(12, 512, 512, 27, 8, True, 'torch.float16', 'torch.float16', 'torch.uint32', 'torch.uint32', 'torch.uint32', 'torch.float32')": { + "kwargs": { + "B1": 128, + "B2": 128, + "BK": 64 + }, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 4, + "num_buffers_warp_spec": 0, + "num_consumer_groups": 0, + "reg_dec_producer": 0, + "reg_inc_consumer": 0, + "maxnreg": null, + "pre_hook": null + }, + "(10, 1024, 1024, 27, 2, True, 'torch.float16', 'torch.float16', 'torch.uint32', 'torch.uint32', 'torch.uint32', 'torch.float32')": { + "kwargs": { + "B1": 128, + "B2": 128, + "BK": 64 + }, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 4, + "num_buffers_warp_spec": 0, + "num_consumer_groups": 0, + "reg_dec_producer": 0, + "reg_inc_consumer": 0, + "maxnreg": null, + "pre_hook": null + }, + "(12, 512, 128, 27, 2, True, 'torch.float16', 'torch.float16', 'torch.uint32', 'torch.uint32', 'torch.uint32', 'torch.float32')": { + "kwargs": { + "B1": 128, + "B2": 128, + "BK": 64 + }, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 4, + "num_buffers_warp_spec": 0, + "num_consumer_groups": 0, + "reg_dec_producer": 0, + "reg_inc_consumer": 0, + "maxnreg": null, + "pre_hook": null + }, + "(12, 512, 128, 27, 4, True, 'torch.float16', 'torch.float16', 'torch.uint32', 'torch.uint32', 'torch.uint32', 'torch.float32')": { + "kwargs": { + "B1": 128, + "B2": 128, + "BK": 64 + }, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 4, + "num_buffers_warp_spec": 0, + "num_consumer_groups": 0, + "reg_dec_producer": 0, + "reg_inc_consumer": 0, + "maxnreg": null, + "pre_hook": null + }, + "(12, 512, 128, 27, 8, True, 'torch.float16', 'torch.float16', 'torch.uint32', 'torch.uint32', 'torch.uint32', 'torch.float32')": { + "kwargs": { + "B1": 128, + "B2": 128, + "BK": 64 + }, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 4, + "num_buffers_warp_spec": 0, + "num_consumer_groups": 0, + "reg_dec_producer": 0, + "reg_inc_consumer": 0, + "maxnreg": null, + "pre_hook": null + }, + "(12, 512, 128, 27, 16, True, 'torch.float16', 'torch.float16', 'torch.uint32', 'torch.uint32', 'torch.uint32', 'torch.float32')": { + "kwargs": { + "B1": 128, + "B2": 128, + "BK": 64 + }, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 4, + "num_buffers_warp_spec": 0, + "num_consumer_groups": 0, + "reg_dec_producer": 0, + "reg_inc_consumer": 0, + "maxnreg": null, + "pre_hook": null + }, + "(12, 512, 128, 27, 32, True, 'torch.float16', 'torch.float16', 'torch.uint32', 'torch.uint32', 'torch.uint32', 'torch.float32')": { + "kwargs": { + "B1": 128, + "B2": 128, + "BK": 64 + }, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 4, + "num_buffers_warp_spec": 0, + "num_consumer_groups": 0, + "reg_dec_producer": 0, + "reg_inc_consumer": 0, + "maxnreg": null, + "pre_hook": null + }, + "(14, 256, 64, 27, 2, True, 'torch.float16', 'torch.float16', 'torch.uint32', 'torch.uint32', 'torch.uint32', 'torch.float32')": { + "kwargs": { + "B1": 64, + "B2": 64, + "BK": 64 + }, + "num_warps": 2, + "num_ctas": 1, + "num_stages": 4, + "num_buffers_warp_spec": 0, + "num_consumer_groups": 0, + "reg_dec_producer": 0, + "reg_inc_consumer": 0, + "maxnreg": null, + "pre_hook": null + }, + "(14, 256, 64, 27, 4, True, 'torch.float16', 'torch.float16', 'torch.uint32', 'torch.uint32', 'torch.uint32', 'torch.float32')": { + "kwargs": { + "B1": 64, + "B2": 256, + "BK": 64 + }, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 4, + "num_buffers_warp_spec": 0, + "num_consumer_groups": 0, + "reg_dec_producer": 0, + "reg_inc_consumer": 0, + "maxnreg": null, + "pre_hook": null + }, + "(14, 256, 64, 27, 8, True, 'torch.float16', 'torch.float16', 'torch.uint32', 'torch.uint32', 'torch.uint32', 'torch.float32')": { + "kwargs": { + "B1": 64, + "B2": 256, + "BK": 32 + }, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 4, + "num_buffers_warp_spec": 0, + "num_consumer_groups": 0, + "reg_dec_producer": 0, + "reg_inc_consumer": 0, + "maxnreg": null, + "pre_hook": null + }, + "(14, 256, 64, 27, 16, True, 'torch.float16', 'torch.float16', 'torch.uint32', 'torch.uint32', 'torch.uint32', 'torch.float32')": { + "kwargs": { + "B1": 64, + "B2": 256, + "BK": 32 + }, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 4, + "num_buffers_warp_spec": 0, + "num_consumer_groups": 0, + "reg_dec_producer": 0, + "reg_inc_consumer": 0, + "maxnreg": null, + "pre_hook": null + }, + "(14, 256, 64, 27, 32, True, 'torch.float16', 'torch.float16', 'torch.uint32', 'torch.uint32', 'torch.uint32', 'torch.float32')": { + "kwargs": { + "B1": 64, + "B2": 256, + "BK": 32 + }, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 4, + "num_buffers_warp_spec": 0, + "num_consumer_groups": 0, + "reg_dec_producer": 0, + "reg_inc_consumer": 0, + "maxnreg": null, + "pre_hook": null + }, + "(14, 256, 64, 27, 64, True, 'torch.float16', 'torch.float16', 'torch.uint32', 'torch.uint32', 'torch.uint32', 'torch.float32')": { + "kwargs": { + "B1": 64, + "B2": 128, + "BK": 32 + }, + "num_warps": 2, + "num_ctas": 1, + "num_stages": 4, + "num_buffers_warp_spec": 0, + "num_consumer_groups": 0, + "reg_dec_producer": 0, + "reg_inc_consumer": 0, + "maxnreg": null, + "pre_hook": null + }, + "(16, 128, 32, 27, 4, True, 'torch.float16', 'torch.float16', 'torch.uint32', 'torch.uint32', 'torch.uint32', 'torch.float32')": { + "kwargs": { + "B1": 64, + "B2": 64, + "BK": 64 + }, + "num_warps": 2, + "num_ctas": 1, + "num_stages": 4, + "num_buffers_warp_spec": 0, + "num_consumer_groups": 0, + "reg_dec_producer": 0, + "reg_inc_consumer": 0, + "maxnreg": null, + "pre_hook": null + }, + "(16, 128, 32, 27, 8, True, 'torch.float16', 'torch.float16', 'torch.uint32', 'torch.uint32', 'torch.uint32', 'torch.float32')": { + "kwargs": { + "B1": 64, + "B2": 64, + "BK": 64 + }, + "num_warps": 2, + "num_ctas": 1, + "num_stages": 4, + "num_buffers_warp_spec": 0, + "num_consumer_groups": 0, + "reg_dec_producer": 0, + "reg_inc_consumer": 0, + "maxnreg": null, + "pre_hook": null + }, + "(16, 128, 32, 27, 16, True, 'torch.float16', 'torch.float16', 'torch.uint32', 'torch.uint32', 'torch.uint32', 'torch.float32')": { + "kwargs": { + "B1": 64, + "B2": 64, + "BK": 64 + }, + "num_warps": 2, + "num_ctas": 1, + "num_stages": 4, + "num_buffers_warp_spec": 0, + "num_consumer_groups": 0, + "reg_dec_producer": 0, + "reg_inc_consumer": 0, + "maxnreg": null, + "pre_hook": null + }, + "(16, 128, 32, 27, 32, True, 'torch.float16', 'torch.float16', 'torch.uint32', 'torch.uint32', 'torch.uint32', 'torch.float32')": { + "kwargs": { + "B1": 64, + "B2": 128, + "BK": 32 + }, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 4, + "num_buffers_warp_spec": 0, + "num_consumer_groups": 0, + "reg_dec_producer": 0, + "reg_inc_consumer": 0, + "maxnreg": null, + "pre_hook": null + }, + "(16, 128, 32, 27, 64, True, 'torch.float16', 'torch.float16', 'torch.uint32', 'torch.uint32', 'torch.uint32', 'torch.float32')": { + "kwargs": { + "B1": 64, + "B2": 128, + "BK": 32 + }, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 4, + "num_buffers_warp_spec": 0, + "num_consumer_groups": 0, + "reg_dec_producer": 0, + "reg_inc_consumer": 0, + "maxnreg": null, + "pre_hook": null + }, + "(16, 128, 32, 27, 128, True, 'torch.float16', 'torch.float16', 'torch.uint32', 'torch.uint32', 'torch.uint32', 'torch.float32')": { + "kwargs": { + "B1": 64, + "B2": 128, + "BK": 32 + }, + "num_warps": 2, + "num_ctas": 1, + "num_stages": 4, + "num_buffers_warp_spec": 0, + "num_consumer_groups": 0, + "reg_dec_producer": 0, + "reg_inc_consumer": 0, + "maxnreg": null, + "pre_hook": null + }, + "(18, 64, 16, 27, 4, True, 'torch.float16', 'torch.float16', 'torch.uint32', 'torch.uint32', 'torch.uint32', 'torch.float32')": { + "kwargs": { + "B1": 64, + "B2": 64, + "BK": 64 + }, + "num_warps": 2, + "num_ctas": 1, + "num_stages": 4, + "num_buffers_warp_spec": 0, + "num_consumer_groups": 0, + "reg_dec_producer": 0, + "reg_inc_consumer": 0, + "maxnreg": null, + "pre_hook": null + }, + "(18, 64, 16, 27, 8, True, 'torch.float16', 'torch.float16', 'torch.uint32', 'torch.uint32', 'torch.uint32', 'torch.float32')": { + "kwargs": { + "B1": 64, + "B2": 64, + "BK": 64 + }, + "num_warps": 2, + "num_ctas": 1, + "num_stages": 4, + "num_buffers_warp_spec": 0, + "num_consumer_groups": 0, + "reg_dec_producer": 0, + "reg_inc_consumer": 0, + "maxnreg": null, + "pre_hook": null + }, + "(18, 64, 16, 27, 16, True, 'torch.float16', 'torch.float16', 'torch.uint32', 'torch.uint32', 'torch.uint32', 'torch.float32')": { + "kwargs": { + "B1": 64, + "B2": 64, + "BK": 64 + }, + "num_warps": 2, + "num_ctas": 1, + "num_stages": 4, + "num_buffers_warp_spec": 0, + "num_consumer_groups": 0, + "reg_dec_producer": 0, + "reg_inc_consumer": 0, + "maxnreg": null, + "pre_hook": null + }, + "(18, 64, 16, 27, 32, True, 'torch.float16', 'torch.float16', 'torch.uint32', 'torch.uint32', 'torch.uint32', 'torch.float32')": { + "kwargs": { + "B1": 64, + "B2": 64, + "BK": 64 + }, + "num_warps": 2, + "num_ctas": 1, + "num_stages": 4, + "num_buffers_warp_spec": 0, + "num_consumer_groups": 0, + "reg_dec_producer": 0, + "reg_inc_consumer": 0, + "maxnreg": null, + "pre_hook": null + }, + "(18, 64, 16, 27, 64, True, 'torch.float16', 'torch.float16', 'torch.uint32', 'torch.uint32', 'torch.uint32', 'torch.float32')": { + "kwargs": { + "B1": 64, + "B2": 64, + "BK": 64 + }, + "num_warps": 2, + "num_ctas": 1, + "num_stages": 4, + "num_buffers_warp_spec": 0, + "num_consumer_groups": 0, + "reg_dec_producer": 0, + "reg_inc_consumer": 0, + "maxnreg": null, + "pre_hook": null + }, + "(18, 64, 16, 27, 128, True, 'torch.float16', 'torch.float16', 'torch.uint32', 'torch.uint32', 'torch.uint32', 'torch.float32')": { + "kwargs": { + "B1": 64, + "B2": 64, + "BK": 64 + }, + "num_warps": 2, + "num_ctas": 1, + "num_stages": 4, + "num_buffers_warp_spec": 0, + "num_consumer_groups": 0, + "reg_dec_producer": 0, + "reg_inc_consumer": 0, + "maxnreg": null, + "pre_hook": null + }, + "(11, 512, 2048, 27, 2, True, 'torch.float16', 'torch.float16', 'torch.uint32', 'torch.uint32', 'torch.uint32', 'torch.float32')": { + "kwargs": { + "B1": 128, + "B2": 128, + "BK": 64 + }, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 4, + "num_buffers_warp_spec": 0, + "num_consumer_groups": 0, + "reg_dec_producer": 0, + "reg_inc_consumer": 0, + "maxnreg": null, + "pre_hook": null + }, + "(11, 512, 512, 27, 2, True, 'torch.float16', 'torch.float16', 'torch.uint32', 'torch.uint32', 'torch.uint32', 'torch.float32')": { + "kwargs": { + "B1": 128, + "B2": 128, + "BK": 64 + }, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 4, + "num_buffers_warp_spec": 0, + "num_consumer_groups": 0, + "reg_dec_producer": 0, + "reg_inc_consumer": 0, + "maxnreg": null, + "pre_hook": null + }, + "(11, 512, 512, 27, 4, True, 'torch.float16', 'torch.float16', 'torch.uint32', 'torch.uint32', 'torch.uint32', 'torch.float32')": { + "kwargs": { + "B1": 128, + "B2": 128, + "BK": 64 + }, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 4, + "num_buffers_warp_spec": 0, + "num_consumer_groups": 0, + "reg_dec_producer": 0, + "reg_inc_consumer": 0, + "maxnreg": null, + "pre_hook": null + }, + "(11, 512, 512, 27, 8, True, 'torch.float16', 'torch.float16', 'torch.uint32', 'torch.uint32', 'torch.uint32', 'torch.float32')": { + "kwargs": { + "B1": 128, + "B2": 128, + "BK": 64 + }, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 4, + "num_buffers_warp_spec": 0, + "num_consumer_groups": 0, + "reg_dec_producer": 0, + "reg_inc_consumer": 0, + "maxnreg": null, + "pre_hook": null + }, + "(9, 1024, 1024, 27, 2, True, 'torch.float16', 'torch.float16', 'torch.uint32', 'torch.uint32', 'torch.uint32', 'torch.float32')": { + "kwargs": { + "B1": 128, + "B2": 128, + "BK": 64 + }, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 4, + "num_buffers_warp_spec": 0, + "num_consumer_groups": 0, + "reg_dec_producer": 0, + "reg_inc_consumer": 0, + "maxnreg": null, + "pre_hook": null + }, + "(11, 512, 128, 27, 2, True, 'torch.float16', 'torch.float16', 'torch.uint32', 'torch.uint32', 'torch.uint32', 'torch.float32')": { + "kwargs": { + "B1": 128, + "B2": 128, + "BK": 32 + }, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 4, + "num_buffers_warp_spec": 0, + "num_consumer_groups": 0, + "reg_dec_producer": 0, + "reg_inc_consumer": 0, + "maxnreg": null, + "pre_hook": null + }, + "(11, 512, 128, 27, 4, True, 'torch.float16', 'torch.float16', 'torch.uint32', 'torch.uint32', 'torch.uint32', 'torch.float32')": { + "kwargs": { + "B1": 128, + "B2": 128, + "BK": 64 + }, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 4, + "num_buffers_warp_spec": 0, + "num_consumer_groups": 0, + "reg_dec_producer": 0, + "reg_inc_consumer": 0, + "maxnreg": null, + "pre_hook": null + }, + "(11, 512, 128, 27, 8, True, 'torch.float16', 'torch.float16', 'torch.uint32', 'torch.uint32', 'torch.uint32', 'torch.float32')": { + "kwargs": { + "B1": 128, + "B2": 128, + "BK": 64 + }, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 4, + "num_buffers_warp_spec": 0, + "num_consumer_groups": 0, + "reg_dec_producer": 0, + "reg_inc_consumer": 0, + "maxnreg": null, + "pre_hook": null + }, + "(11, 512, 128, 27, 16, True, 'torch.float16', 'torch.float16', 'torch.uint32', 'torch.uint32', 'torch.uint32', 'torch.float32')": { + "kwargs": { + "B1": 128, + "B2": 128, + "BK": 64 + }, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 4, + "num_buffers_warp_spec": 0, + "num_consumer_groups": 0, + "reg_dec_producer": 0, + "reg_inc_consumer": 0, + "maxnreg": null, + "pre_hook": null + }, + "(11, 512, 128, 27, 32, True, 'torch.float16', 'torch.float16', 'torch.uint32', 'torch.uint32', 'torch.uint32', 'torch.float32')": { + "kwargs": { + "B1": 128, + "B2": 128, + "BK": 64 + }, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 4, + "num_buffers_warp_spec": 0, + "num_consumer_groups": 0, + "reg_dec_producer": 0, + "reg_inc_consumer": 0, + "maxnreg": null, + "pre_hook": null + }, + "(13, 256, 1024, 27, 2, True, 'torch.float16', 'torch.float16', 'torch.uint32', 'torch.uint32', 'torch.uint32', 'torch.float32')": { + "kwargs": { + "B1": 128, + "B2": 128, + "BK": 64 + }, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 4, + "num_buffers_warp_spec": 0, + "num_consumer_groups": 0, + "reg_dec_producer": 0, + "reg_inc_consumer": 0, + "maxnreg": null, + "pre_hook": null + }, + "(13, 256, 1024, 27, 4, True, 'torch.float16', 'torch.float16', 'torch.uint32', 'torch.uint32', 'torch.uint32', 'torch.float32')": { + "kwargs": { + "B1": 128, + "B2": 128, + "BK": 64 + }, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 4, + "num_buffers_warp_spec": 0, + "num_consumer_groups": 0, + "reg_dec_producer": 0, + "reg_inc_consumer": 0, + "maxnreg": null, + "pre_hook": null + }, + "(13, 256, 1024, 27, 8, True, 'torch.float16', 'torch.float16', 'torch.uint32', 'torch.uint32', 'torch.uint32', 'torch.float32')": { + "kwargs": { + "B1": 128, + "B2": 128, + "BK": 64 + }, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 4, + "num_buffers_warp_spec": 0, + "num_consumer_groups": 0, + "reg_dec_producer": 0, + "reg_inc_consumer": 0, + "maxnreg": null, + "pre_hook": null + }, + "(13, 256, 256, 27, 2, True, 'torch.float16', 'torch.float16', 'torch.uint32', 'torch.uint32', 'torch.uint32', 'torch.float32')": { + "kwargs": { + "B1": 128, + "B2": 128, + "BK": 64 + }, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 4, + "num_buffers_warp_spec": 0, + "num_consumer_groups": 0, + "reg_dec_producer": 0, + "reg_inc_consumer": 0, + "maxnreg": null, + "pre_hook": null + }, + "(13, 256, 256, 27, 4, True, 'torch.float16', 'torch.float16', 'torch.uint32', 'torch.uint32', 'torch.uint32', 'torch.float32')": { + "kwargs": { + "B1": 128, + "B2": 128, + "BK": 64 + }, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 4, + "num_buffers_warp_spec": 0, + "num_consumer_groups": 0, + "reg_dec_producer": 0, + "reg_inc_consumer": 0, + "maxnreg": null, + "pre_hook": null + }, + "(13, 256, 256, 27, 8, True, 'torch.float16', 'torch.float16', 'torch.uint32', 'torch.uint32', 'torch.uint32', 'torch.float32')": { + "kwargs": { + "B1": 128, + "B2": 128, + "BK": 64 + }, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 4, + "num_buffers_warp_spec": 0, + "num_consumer_groups": 0, + "reg_dec_producer": 0, + "reg_inc_consumer": 0, + "maxnreg": null, + "pre_hook": null + }, + "(13, 256, 256, 27, 16, True, 'torch.float16', 'torch.float16', 'torch.uint32', 'torch.uint32', 'torch.uint32', 'torch.float32')": { + "kwargs": { + "B1": 128, + "B2": 128, + "BK": 64 + }, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 4, + "num_buffers_warp_spec": 0, + "num_consumer_groups": 0, + "reg_dec_producer": 0, + "reg_inc_consumer": 0, + "maxnreg": null, + "pre_hook": null + }, + "(13, 256, 256, 27, 32, True, 'torch.float16', 'torch.float16', 'torch.uint32', 'torch.uint32', 'torch.uint32', 'torch.float32')": { + "kwargs": { + "B1": 128, + "B2": 128, + "BK": 64 + }, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 4, + "num_buffers_warp_spec": 0, + "num_consumer_groups": 0, + "reg_dec_producer": 0, + "reg_inc_consumer": 0, + "maxnreg": null, + "pre_hook": null + }, + "(13, 256, 64, 27, 2, True, 'torch.float16', 'torch.float16', 'torch.uint32', 'torch.uint32', 'torch.uint32', 'torch.float32')": { + "kwargs": { + "B1": 64, + "B2": 64, + "BK": 64 + }, + "num_warps": 2, + "num_ctas": 1, + "num_stages": 4, + "num_buffers_warp_spec": 0, + "num_consumer_groups": 0, + "reg_dec_producer": 0, + "reg_inc_consumer": 0, + "maxnreg": null, + "pre_hook": null + }, + "(13, 256, 64, 27, 4, True, 'torch.float16', 'torch.float16', 'torch.uint32', 'torch.uint32', 'torch.uint32', 'torch.float32')": { + "kwargs": { + "B1": 64, + "B2": 64, + "BK": 64 + }, + "num_warps": 2, + "num_ctas": 1, + "num_stages": 4, + "num_buffers_warp_spec": 0, + "num_consumer_groups": 0, + "reg_dec_producer": 0, + "reg_inc_consumer": 0, + "maxnreg": null, + "pre_hook": null + }, + "(13, 256, 64, 27, 8, True, 'torch.float16', 'torch.float16', 'torch.uint32', 'torch.uint32', 'torch.uint32', 'torch.float32')": { + "kwargs": { + "B1": 64, + "B2": 128, + "BK": 32 + }, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 4, + "num_buffers_warp_spec": 0, + "num_consumer_groups": 0, + "reg_dec_producer": 0, + "reg_inc_consumer": 0, + "maxnreg": null, + "pre_hook": null + }, + "(13, 256, 64, 27, 16, True, 'torch.float16', 'torch.float16', 'torch.uint32', 'torch.uint32', 'torch.uint32', 'torch.float32')": { + "kwargs": { + "B1": 64, + "B2": 128, + "BK": 32 + }, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 4, + "num_buffers_warp_spec": 0, + "num_consumer_groups": 0, + "reg_dec_producer": 0, + "reg_inc_consumer": 0, + "maxnreg": null, + "pre_hook": null + }, + "(13, 256, 64, 27, 32, True, 'torch.float16', 'torch.float16', 'torch.uint32', 'torch.uint32', 'torch.uint32', 'torch.float32')": { + "kwargs": { + "B1": 64, + "B2": 128, + "BK": 32 + }, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 4, + "num_buffers_warp_spec": 0, + "num_consumer_groups": 0, + "reg_dec_producer": 0, + "reg_inc_consumer": 0, + "maxnreg": null, + "pre_hook": null + }, + "(13, 256, 64, 27, 64, True, 'torch.float16', 'torch.float16', 'torch.uint32', 'torch.uint32', 'torch.uint32', 'torch.float32')": { + "kwargs": { + "B1": 64, + "B2": 128, + "BK": 32 + }, + "num_warps": 2, + "num_ctas": 1, + "num_stages": 4, + "num_buffers_warp_spec": 0, + "num_consumer_groups": 0, + "reg_dec_producer": 0, + "reg_inc_consumer": 0, + "maxnreg": null, + "pre_hook": null + }, + "(17, 64, 64, 27, 4, True, 'torch.float16', 'torch.float16', 'torch.uint32', 'torch.uint32', 'torch.uint32', 'torch.float32')": { + "kwargs": { + "B1": 64, + "B2": 64, + "BK": 64 + }, + "num_warps": 2, + "num_ctas": 1, + "num_stages": 4, + "num_buffers_warp_spec": 0, + "num_consumer_groups": 0, + "reg_dec_producer": 0, + "reg_inc_consumer": 0, + "maxnreg": null, + "pre_hook": null + }, + "(17, 64, 64, 27, 8, True, 'torch.float16', 'torch.float16', 'torch.uint32', 'torch.uint32', 'torch.uint32', 'torch.float32')": { + "kwargs": { + "B1": 64, + "B2": 64, + "BK": 64 + }, + "num_warps": 2, + "num_ctas": 1, + "num_stages": 4, + "num_buffers_warp_spec": 0, + "num_consumer_groups": 0, + "reg_dec_producer": 0, + "reg_inc_consumer": 0, + "maxnreg": null, + "pre_hook": null + }, + "(17, 64, 64, 27, 16, True, 'torch.float16', 'torch.float16', 'torch.uint32', 'torch.uint32', 'torch.uint32', 'torch.float32')": { + "kwargs": { + "B1": 64, + "B2": 64, + "BK": 64 + }, + "num_warps": 2, + "num_ctas": 1, + "num_stages": 4, + "num_buffers_warp_spec": 0, + "num_consumer_groups": 0, + "reg_dec_producer": 0, + "reg_inc_consumer": 0, + "maxnreg": null, + "pre_hook": null + }, + "(17, 64, 64, 27, 32, True, 'torch.float16', 'torch.float16', 'torch.uint32', 'torch.uint32', 'torch.uint32', 'torch.float32')": { + "kwargs": { + "B1": 64, + "B2": 64, + "BK": 64 + }, + "num_warps": 2, + "num_ctas": 1, + "num_stages": 4, + "num_buffers_warp_spec": 0, + "num_consumer_groups": 0, + "reg_dec_producer": 0, + "reg_inc_consumer": 0, + "maxnreg": null, + "pre_hook": null + }, + "(17, 64, 64, 27, 64, True, 'torch.float16', 'torch.float16', 'torch.uint32', 'torch.uint32', 'torch.uint32', 'torch.float32')": { + "kwargs": { + "B1": 64, + "B2": 64, + "BK": 64 + }, + "num_warps": 2, + "num_ctas": 1, + "num_stages": 4, + "num_buffers_warp_spec": 0, + "num_consumer_groups": 0, + "reg_dec_producer": 0, + "reg_inc_consumer": 0, + "maxnreg": null, + "pre_hook": null + }, + "(17, 64, 64, 27, 128, True, 'torch.float16', 'torch.float16', 'torch.uint32', 'torch.uint32', 'torch.uint32', 'torch.float32')": { + "kwargs": { + "B1": 64, + "B2": 64, + "BK": 64 + }, + "num_warps": 2, + "num_ctas": 1, + "num_stages": 4, + "num_buffers_warp_spec": 0, + "num_consumer_groups": 0, + "reg_dec_producer": 0, + "reg_inc_consumer": 0, + "maxnreg": null, + "pre_hook": null + }, + "(15, 128, 512, 27, 2, True, 'torch.float16', 'torch.float16', 'torch.uint32', 'torch.uint32', 'torch.uint32', 'torch.float32')": { + "kwargs": { + "B1": 128, + "B2": 128, + "BK": 64 + }, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 4, + "num_buffers_warp_spec": 0, + "num_consumer_groups": 0, + "reg_dec_producer": 0, + "reg_inc_consumer": 0, + "maxnreg": null, + "pre_hook": null + }, + "(15, 128, 512, 27, 4, True, 'torch.float16', 'torch.float16', 'torch.uint32', 'torch.uint32', 'torch.uint32', 'torch.float32')": { + "kwargs": { + "B1": 128, + "B2": 128, + "BK": 64 + }, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 4, + "num_buffers_warp_spec": 0, + "num_consumer_groups": 0, + "reg_dec_producer": 0, + "reg_inc_consumer": 0, + "maxnreg": null, + "pre_hook": null + }, + "(15, 128, 512, 27, 8, True, 'torch.float16', 'torch.float16', 'torch.uint32', 'torch.uint32', 'torch.uint32', 'torch.float32')": { + "kwargs": { + "B1": 128, + "B2": 128, + "BK": 64 + }, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 4, + "num_buffers_warp_spec": 0, + "num_consumer_groups": 0, + "reg_dec_producer": 0, + "reg_inc_consumer": 0, + "maxnreg": null, + "pre_hook": null + }, + "(15, 128, 512, 27, 16, True, 'torch.float16', 'torch.float16', 'torch.uint32', 'torch.uint32', 'torch.uint32', 'torch.float32')": { + "kwargs": { + "B1": 128, + "B2": 128, + "BK": 64 + }, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 4, + "num_buffers_warp_spec": 0, + "num_consumer_groups": 0, + "reg_dec_producer": 0, + "reg_inc_consumer": 0, + "maxnreg": null, + "pre_hook": null + }, + "(15, 128, 512, 27, 32, True, 'torch.float16', 'torch.float16', 'torch.uint32', 'torch.uint32', 'torch.uint32', 'torch.float32')": { + "kwargs": { + "B1": 128, + "B2": 128, + "BK": 64 + }, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 4, + "num_buffers_warp_spec": 0, + "num_consumer_groups": 0, + "reg_dec_producer": 0, + "reg_inc_consumer": 0, + "maxnreg": null, + "pre_hook": null + }, + "(15, 128, 128, 27, 4, True, 'torch.float16', 'torch.float16', 'torch.uint32', 'torch.uint32', 'torch.uint32', 'torch.float32')": { + "kwargs": { + "B1": 128, + "B2": 128, + "BK": 64 + }, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 4, + "num_buffers_warp_spec": 0, + "num_consumer_groups": 0, + "reg_dec_producer": 0, + "reg_inc_consumer": 0, + "maxnreg": null, + "pre_hook": null + }, + "(15, 128, 128, 27, 8, True, 'torch.float16', 'torch.float16', 'torch.uint32', 'torch.uint32', 'torch.uint32', 'torch.float32')": { + "kwargs": { + "B1": 128, + "B2": 128, + "BK": 64 + }, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 4, + "num_buffers_warp_spec": 0, + "num_consumer_groups": 0, + "reg_dec_producer": 0, + "reg_inc_consumer": 0, + "maxnreg": null, + "pre_hook": null + }, + "(15, 128, 128, 27, 16, True, 'torch.float16', 'torch.float16', 'torch.uint32', 'torch.uint32', 'torch.uint32', 'torch.float32')": { + "kwargs": { + "B1": 128, + "B2": 128, + "BK": 64 + }, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 4, + "num_buffers_warp_spec": 0, + "num_consumer_groups": 0, + "reg_dec_producer": 0, + "reg_inc_consumer": 0, + "maxnreg": null, + "pre_hook": null + }, + "(15, 128, 128, 27, 32, True, 'torch.float16', 'torch.float16', 'torch.uint32', 'torch.uint32', 'torch.uint32', 'torch.float32')": { + "kwargs": { + "B1": 128, + "B2": 128, + "BK": 64 + }, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 4, + "num_buffers_warp_spec": 0, + "num_consumer_groups": 0, + "reg_dec_producer": 0, + "reg_inc_consumer": 0, + "maxnreg": null, + "pre_hook": null + }, + "(15, 128, 128, 27, 64, True, 'torch.float16', 'torch.float16', 'torch.uint32', 'torch.uint32', 'torch.uint32', 'torch.float32')": { + "kwargs": { + "B1": 128, + "B2": 128, + "BK": 64 + }, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 4, + "num_buffers_warp_spec": 0, + "num_consumer_groups": 0, + "reg_dec_producer": 0, + "reg_inc_consumer": 0, + "maxnreg": null, + "pre_hook": null + }, + "(15, 128, 128, 27, 128, True, 'torch.float16', 'torch.float16', 'torch.uint32', 'torch.uint32', 'torch.uint32', 'torch.float32')": { + "kwargs": { + "B1": 128, + "B2": 128, + "BK": 64 + }, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 4, + "num_buffers_warp_spec": 0, + "num_consumer_groups": 0, + "reg_dec_producer": 0, + "reg_inc_consumer": 0, + "maxnreg": null, + "pre_hook": null + }, + "(15, 128, 32, 27, 4, True, 'torch.float16', 'torch.float16', 'torch.uint32', 'torch.uint32', 'torch.uint32', 'torch.float32')": { + "kwargs": { + "B1": 64, + "B2": 64, + "BK": 64 + }, + "num_warps": 2, + "num_ctas": 1, + "num_stages": 4, + "num_buffers_warp_spec": 0, + "num_consumer_groups": 0, + "reg_dec_producer": 0, + "reg_inc_consumer": 0, + "maxnreg": null, + "pre_hook": null + }, + "(15, 128, 32, 27, 8, True, 'torch.float16', 'torch.float16', 'torch.uint32', 'torch.uint32', 'torch.uint32', 'torch.float32')": { + "kwargs": { + "B1": 64, + "B2": 64, + "BK": 64 + }, + "num_warps": 2, + "num_ctas": 1, + "num_stages": 4, + "num_buffers_warp_spec": 0, + "num_consumer_groups": 0, + "reg_dec_producer": 0, + "reg_inc_consumer": 0, + "maxnreg": null, + "pre_hook": null + }, + "(15, 128, 32, 27, 16, True, 'torch.float16', 'torch.float16', 'torch.uint32', 'torch.uint32', 'torch.uint32', 'torch.float32')": { + "kwargs": { + "B1": 64, + "B2": 64, + "BK": 64 + }, + "num_warps": 2, + "num_ctas": 1, + "num_stages": 4, + "num_buffers_warp_spec": 0, + "num_consumer_groups": 0, + "reg_dec_producer": 0, + "reg_inc_consumer": 0, + "maxnreg": null, + "pre_hook": null + }, + "(15, 128, 32, 27, 32, True, 'torch.float16', 'torch.float16', 'torch.uint32', 'torch.uint32', 'torch.uint32', 'torch.float32')": { + "kwargs": { + "B1": 64, + "B2": 128, + "BK": 32 + }, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 4, + "num_buffers_warp_spec": 0, + "num_consumer_groups": 0, + "reg_dec_producer": 0, + "reg_inc_consumer": 0, + "maxnreg": null, + "pre_hook": null + }, + "(15, 128, 32, 27, 64, True, 'torch.float16', 'torch.float16', 'torch.uint32', 'torch.uint32', 'torch.uint32', 'torch.float32')": { + "kwargs": { + "B1": 64, + "B2": 128, + "BK": 32 + }, + "num_warps": 2, + "num_ctas": 1, + "num_stages": 4, + "num_buffers_warp_spec": 0, + "num_consumer_groups": 0, + "reg_dec_producer": 0, + "reg_inc_consumer": 0, + "maxnreg": null, + "pre_hook": null + }, + "(15, 128, 32, 27, 128, True, 'torch.float16', 'torch.float16', 'torch.uint32', 'torch.uint32', 'torch.uint32', 'torch.float32')": { + "kwargs": { + "B1": 64, + "B2": 128, + "BK": 32 + }, + "num_warps": 2, + "num_ctas": 1, + "num_stages": 4, + "num_buffers_warp_spec": 0, + "num_consumer_groups": 0, + "reg_dec_producer": 0, + "reg_inc_consumer": 0, + "maxnreg": null, + "pre_hook": null + }, + "(17, 64, 16, 27, 4, True, 'torch.float16', 'torch.float16', 'torch.uint32', 'torch.uint32', 'torch.uint32', 'torch.float32')": { + "kwargs": { + "B1": 64, + "B2": 64, + "BK": 64 + }, + "num_warps": 2, + "num_ctas": 1, + "num_stages": 4, + "num_buffers_warp_spec": 0, + "num_consumer_groups": 0, + "reg_dec_producer": 0, + "reg_inc_consumer": 0, + "maxnreg": null, + "pre_hook": null + }, + "(17, 64, 16, 27, 8, True, 'torch.float16', 'torch.float16', 'torch.uint32', 'torch.uint32', 'torch.uint32', 'torch.float32')": { + "kwargs": { + "B1": 64, + "B2": 64, + "BK": 64 + }, + "num_warps": 2, + "num_ctas": 1, + "num_stages": 4, + "num_buffers_warp_spec": 0, + "num_consumer_groups": 0, + "reg_dec_producer": 0, + "reg_inc_consumer": 0, + "maxnreg": null, + "pre_hook": null + }, + "(17, 64, 16, 27, 16, True, 'torch.float16', 'torch.float16', 'torch.uint32', 'torch.uint32', 'torch.uint32', 'torch.float32')": { + "kwargs": { + "B1": 64, + "B2": 64, + "BK": 64 + }, + "num_warps": 2, + "num_ctas": 1, + "num_stages": 4, + "num_buffers_warp_spec": 0, + "num_consumer_groups": 0, + "reg_dec_producer": 0, + "reg_inc_consumer": 0, + "maxnreg": null, + "pre_hook": null + }, + "(17, 64, 16, 27, 32, True, 'torch.float16', 'torch.float16', 'torch.uint32', 'torch.uint32', 'torch.uint32', 'torch.float32')": { + "kwargs": { + "B1": 64, + "B2": 64, + "BK": 64 + }, + "num_warps": 2, + "num_ctas": 1, + "num_stages": 4, + "num_buffers_warp_spec": 0, + "num_consumer_groups": 0, + "reg_dec_producer": 0, + "reg_inc_consumer": 0, + "maxnreg": null, + "pre_hook": null + }, + "(17, 64, 16, 27, 64, True, 'torch.float16', 'torch.float16', 'torch.uint32', 'torch.uint32', 'torch.uint32', 'torch.float32')": { + "kwargs": { + "B1": 64, + "B2": 64, + "BK": 64 + }, + "num_warps": 2, + "num_ctas": 1, + "num_stages": 4, + "num_buffers_warp_spec": 0, + "num_consumer_groups": 0, + "reg_dec_producer": 0, + "reg_inc_consumer": 0, + "maxnreg": null, + "pre_hook": null + }, + "(17, 64, 16, 27, 128, True, 'torch.float16', 'torch.float16', 'torch.uint32', 'torch.uint32', 'torch.uint32', 'torch.float32')": { + "kwargs": { + "B1": 64, + "B2": 64, + "BK": 64 + }, + "num_warps": 2, + "num_ctas": 1, + "num_stages": 4, + "num_buffers_warp_spec": 0, + "num_consumer_groups": 0, + "reg_dec_producer": 0, + "reg_inc_consumer": 0, + "maxnreg": null, + "pre_hook": null + }, + "(8, 1024, 1024, 27, 2, True, 'torch.float16', 'torch.float16', 'torch.uint32', 'torch.uint32', 'torch.uint32', 'torch.float32')": { + "kwargs": { + "B1": 128, + "B2": 128, + "BK": 64 + }, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 4, + "num_buffers_warp_spec": 0, + "num_consumer_groups": 0, + "reg_dec_producer": 0, + "reg_inc_consumer": 0, + "maxnreg": null, + "pre_hook": null + }, + "(19, 64, 64, 27, 4, True, 'torch.float16', 'torch.float16', 'torch.uint32', 'torch.uint32', 'torch.uint32', 'torch.float32')": { + "kwargs": { + "B1": 64, + "B2": 64, + "BK": 64 + }, + "num_warps": 2, + "num_ctas": 1, + "num_stages": 4, + "num_buffers_warp_spec": 0, + "num_consumer_groups": 0, + "reg_dec_producer": 0, + "reg_inc_consumer": 0, + "maxnreg": null, + "pre_hook": null + }, + "(19, 64, 64, 27, 8, True, 'torch.float16', 'torch.float16', 'torch.uint32', 'torch.uint32', 'torch.uint32', 'torch.float32')": { + "kwargs": { + "B1": 64, + "B2": 64, + "BK": 64 + }, + "num_warps": 2, + "num_ctas": 1, + "num_stages": 4, + "num_buffers_warp_spec": 0, + "num_consumer_groups": 0, + "reg_dec_producer": 0, + "reg_inc_consumer": 0, + "maxnreg": null, + "pre_hook": null + }, + "(19, 64, 64, 27, 16, True, 'torch.float16', 'torch.float16', 'torch.uint32', 'torch.uint32', 'torch.uint32', 'torch.float32')": { + "kwargs": { + "B1": 64, + "B2": 64, + "BK": 64 + }, + "num_warps": 2, + "num_ctas": 1, + "num_stages": 4, + "num_buffers_warp_spec": 0, + "num_consumer_groups": 0, + "reg_dec_producer": 0, + "reg_inc_consumer": 0, + "maxnreg": null, + "pre_hook": null + }, + "(19, 64, 64, 27, 32, True, 'torch.float16', 'torch.float16', 'torch.uint32', 'torch.uint32', 'torch.uint32', 'torch.float32')": { + "kwargs": { + "B1": 64, + "B2": 64, + "BK": 64 + }, + "num_warps": 2, + "num_ctas": 1, + "num_stages": 4, + "num_buffers_warp_spec": 0, + "num_consumer_groups": 0, + "reg_dec_producer": 0, + "reg_inc_consumer": 0, + "maxnreg": null, + "pre_hook": null + }, + "(19, 64, 64, 27, 64, True, 'torch.float16', 'torch.float16', 'torch.uint32', 'torch.uint32', 'torch.uint32', 'torch.float32')": { + "kwargs": { + "B1": 64, + "B2": 64, + "BK": 64 + }, + "num_warps": 2, + "num_ctas": 1, + "num_stages": 4, + "num_buffers_warp_spec": 0, + "num_consumer_groups": 0, + "reg_dec_producer": 0, + "reg_inc_consumer": 0, + "maxnreg": null, + "pre_hook": null + }, + "(19, 64, 64, 27, 128, True, 'torch.float16', 'torch.float16', 'torch.uint32', 'torch.uint32', 'torch.uint32', 'torch.float32')": { + "kwargs": { + "B1": 64, + "B2": 64, + "BK": 64 + }, + "num_warps": 2, + "num_ctas": 1, + "num_stages": 4, + "num_buffers_warp_spec": 0, + "num_consumer_groups": 0, + "reg_dec_producer": 0, + "reg_inc_consumer": 0, + "maxnreg": null, + "pre_hook": null + }, + "(17, 128, 512, 27, 2, True, 'torch.float16', 'torch.float16', 'torch.uint32', 'torch.uint32', 'torch.uint32', 'torch.float32')": { + "kwargs": { + "B1": 128, + "B2": 64, + "BK": 32 + }, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 4, + "num_buffers_warp_spec": 0, + "num_consumer_groups": 0, + "reg_dec_producer": 0, + "reg_inc_consumer": 0, + "maxnreg": null, + "pre_hook": null + }, + "(17, 128, 512, 27, 4, True, 'torch.float16', 'torch.float16', 'torch.uint32', 'torch.uint32', 'torch.uint32', 'torch.float32')": { + "kwargs": { + "B1": 128, + "B2": 128, + "BK": 64 + }, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 4, + "num_buffers_warp_spec": 0, + "num_consumer_groups": 0, + "reg_dec_producer": 0, + "reg_inc_consumer": 0, + "maxnreg": null, + "pre_hook": null + }, + "(17, 128, 512, 27, 8, True, 'torch.float16', 'torch.float16', 'torch.uint32', 'torch.uint32', 'torch.uint32', 'torch.float32')": { + "kwargs": { + "B1": 128, + "B2": 128, + "BK": 64 + }, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 4, + "num_buffers_warp_spec": 0, + "num_consumer_groups": 0, + "reg_dec_producer": 0, + "reg_inc_consumer": 0, + "maxnreg": null, + "pre_hook": null + }, + "(17, 128, 512, 27, 16, True, 'torch.float16', 'torch.float16', 'torch.uint32', 'torch.uint32', 'torch.uint32', 'torch.float32')": { + "kwargs": { + "B1": 128, + "B2": 128, + "BK": 64 + }, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 4, + "num_buffers_warp_spec": 0, + "num_consumer_groups": 0, + "reg_dec_producer": 0, + "reg_inc_consumer": 0, + "maxnreg": null, + "pre_hook": null + }, + "(17, 128, 512, 27, 32, True, 'torch.float16', 'torch.float16', 'torch.uint32', 'torch.uint32', 'torch.uint32', 'torch.float32')": { + "kwargs": { + "B1": 128, + "B2": 128, + "BK": 64 + }, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 4, + "num_buffers_warp_spec": 0, + "num_consumer_groups": 0, + "reg_dec_producer": 0, + "reg_inc_consumer": 0, + "maxnreg": null, + "pre_hook": null + }, + "(17, 128, 128, 27, 4, True, 'torch.float16', 'torch.float16', 'torch.uint32', 'torch.uint32', 'torch.uint32', 'torch.float32')": { + "kwargs": { + "B1": 64, + "B2": 64, + "BK": 64 + }, + "num_warps": 2, + "num_ctas": 1, + "num_stages": 4, + "num_buffers_warp_spec": 0, + "num_consumer_groups": 0, + "reg_dec_producer": 0, + "reg_inc_consumer": 0, + "maxnreg": null, + "pre_hook": null + }, + "(17, 128, 128, 27, 8, True, 'torch.float16', 'torch.float16', 'torch.uint32', 'torch.uint32', 'torch.uint32', 'torch.float32')": { + "kwargs": { + "B1": 64, + "B2": 64, + "BK": 64 + }, + "num_warps": 2, + "num_ctas": 1, + "num_stages": 4, + "num_buffers_warp_spec": 0, + "num_consumer_groups": 0, + "reg_dec_producer": 0, + "reg_inc_consumer": 0, + "maxnreg": null, + "pre_hook": null + }, + "(17, 128, 128, 27, 16, True, 'torch.float16', 'torch.float16', 'torch.uint32', 'torch.uint32', 'torch.uint32', 'torch.float32')": { + "kwargs": { + "B1": 128, + "B2": 128, + "BK": 64 + }, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 4, + "num_buffers_warp_spec": 0, + "num_consumer_groups": 0, + "reg_dec_producer": 0, + "reg_inc_consumer": 0, + "maxnreg": null, + "pre_hook": null + }, + "(17, 128, 128, 27, 32, True, 'torch.float16', 'torch.float16', 'torch.uint32', 'torch.uint32', 'torch.uint32', 'torch.float32')": { + "kwargs": { + "B1": 128, + "B2": 128, + "BK": 64 + }, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 4, + "num_buffers_warp_spec": 0, + "num_consumer_groups": 0, + "reg_dec_producer": 0, + "reg_inc_consumer": 0, + "maxnreg": null, + "pre_hook": null + }, + "(17, 128, 128, 27, 64, True, 'torch.float16', 'torch.float16', 'torch.uint32', 'torch.uint32', 'torch.uint32', 'torch.float32')": { + "kwargs": { + "B1": 128, + "B2": 128, + "BK": 64 + }, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 4, + "num_buffers_warp_spec": 0, + "num_consumer_groups": 0, + "reg_dec_producer": 0, + "reg_inc_consumer": 0, + "maxnreg": null, + "pre_hook": null + }, + "(17, 128, 128, 27, 128, True, 'torch.float16', 'torch.float16', 'torch.uint32', 'torch.uint32', 'torch.uint32', 'torch.float32')": { + "kwargs": { + "B1": 128, + "B2": 128, + "BK": 64 + }, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 4, + "num_buffers_warp_spec": 0, + "num_consumer_groups": 0, + "reg_dec_producer": 0, + "reg_inc_consumer": 0, + "maxnreg": null, + "pre_hook": null + }, + "(15, 256, 1024, 27, 2, True, 'torch.float16', 'torch.float16', 'torch.uint32', 'torch.uint32', 'torch.uint32', 'torch.float32')": { + "kwargs": { + "B1": 128, + "B2": 128, + "BK": 64 + }, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 4, + "num_buffers_warp_spec": 0, + "num_consumer_groups": 0, + "reg_dec_producer": 0, + "reg_inc_consumer": 0, + "maxnreg": null, + "pre_hook": null + }, + "(15, 256, 1024, 27, 4, True, 'torch.float16', 'torch.float16', 'torch.uint32', 'torch.uint32', 'torch.uint32', 'torch.float32')": { + "kwargs": { + "B1": 128, + "B2": 128, + "BK": 64 + }, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 4, + "num_buffers_warp_spec": 0, + "num_consumer_groups": 0, + "reg_dec_producer": 0, + "reg_inc_consumer": 0, + "maxnreg": null, + "pre_hook": null + }, + "(15, 256, 1024, 27, 8, True, 'torch.float16', 'torch.float16', 'torch.uint32', 'torch.uint32', 'torch.uint32', 'torch.float32')": { + "kwargs": { + "B1": 128, + "B2": 128, + "BK": 64 + }, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 4, + "num_buffers_warp_spec": 0, + "num_consumer_groups": 0, + "reg_dec_producer": 0, + "reg_inc_consumer": 0, + "maxnreg": null, + "pre_hook": null + }, + "(15, 256, 256, 27, 2, True, 'torch.float16', 'torch.float16', 'torch.uint32', 'torch.uint32', 'torch.uint32', 'torch.float32')": { + "kwargs": { + "B1": 128, + "B2": 64, + "BK": 32 + }, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 4, + "num_buffers_warp_spec": 0, + "num_consumer_groups": 0, + "reg_dec_producer": 0, + "reg_inc_consumer": 0, + "maxnreg": null, + "pre_hook": null + }, + "(15, 256, 256, 27, 4, True, 'torch.float16', 'torch.float16', 'torch.uint32', 'torch.uint32', 'torch.uint32', 'torch.float32')": { + "kwargs": { + "B1": 128, + "B2": 128, + "BK": 64 + }, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 4, + "num_buffers_warp_spec": 0, + "num_consumer_groups": 0, + "reg_dec_producer": 0, + "reg_inc_consumer": 0, + "maxnreg": null, + "pre_hook": null + }, + "(15, 256, 256, 27, 8, True, 'torch.float16', 'torch.float16', 'torch.uint32', 'torch.uint32', 'torch.uint32', 'torch.float32')": { + "kwargs": { + "B1": 128, + "B2": 128, + "BK": 64 + }, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 4, + "num_buffers_warp_spec": 0, + "num_consumer_groups": 0, + "reg_dec_producer": 0, + "reg_inc_consumer": 0, + "maxnreg": null, + "pre_hook": null + }, + "(15, 256, 256, 27, 16, True, 'torch.float16', 'torch.float16', 'torch.uint32', 'torch.uint32', 'torch.uint32', 'torch.float32')": { + "kwargs": { + "B1": 128, + "B2": 128, + "BK": 64 + }, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 4, + "num_buffers_warp_spec": 0, + "num_consumer_groups": 0, + "reg_dec_producer": 0, + "reg_inc_consumer": 0, + "maxnreg": null, + "pre_hook": null + }, + "(15, 256, 256, 27, 32, True, 'torch.float16', 'torch.float16', 'torch.uint32', 'torch.uint32', 'torch.uint32', 'torch.float32')": { + "kwargs": { + "B1": 128, + "B2": 128, + "BK": 64 + }, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 4, + "num_buffers_warp_spec": 0, + "num_consumer_groups": 0, + "reg_dec_producer": 0, + "reg_inc_consumer": 0, + "maxnreg": null, + "pre_hook": null + }, + "(13, 512, 2048, 27, 2, True, 'torch.float16', 'torch.float16', 'torch.uint32', 'torch.uint32', 'torch.uint32', 'torch.float32')": { + "kwargs": { + "B1": 128, + "B2": 128, + "BK": 64 + }, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 4, + "num_buffers_warp_spec": 0, + "num_consumer_groups": 0, + "reg_dec_producer": 0, + "reg_inc_consumer": 0, + "maxnreg": null, + "pre_hook": null + }, + "(13, 512, 512, 27, 2, True, 'torch.float16', 'torch.float16', 'torch.uint32', 'torch.uint32', 'torch.uint32', 'torch.float32')": { + "kwargs": { + "B1": 128, + "B2": 128, + "BK": 64 + }, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 4, + "num_buffers_warp_spec": 0, + "num_consumer_groups": 0, + "reg_dec_producer": 0, + "reg_inc_consumer": 0, + "maxnreg": null, + "pre_hook": null + }, + "(13, 512, 512, 27, 4, True, 'torch.float16', 'torch.float16', 'torch.uint32', 'torch.uint32', 'torch.uint32', 'torch.float32')": { + "kwargs": { + "B1": 128, + "B2": 128, + "BK": 64 + }, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 4, + "num_buffers_warp_spec": 0, + "num_consumer_groups": 0, + "reg_dec_producer": 0, + "reg_inc_consumer": 0, + "maxnreg": null, + "pre_hook": null + }, + "(13, 512, 512, 27, 8, True, 'torch.float16', 'torch.float16', 'torch.uint32', 'torch.uint32', 'torch.uint32', 'torch.float32')": { + "kwargs": { + "B1": 128, + "B2": 128, + "BK": 64 + }, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 4, + "num_buffers_warp_spec": 0, + "num_consumer_groups": 0, + "reg_dec_producer": 0, + "reg_inc_consumer": 0, + "maxnreg": null, + "pre_hook": null + }, + "(13, 512, 128, 27, 2, True, 'torch.float16', 'torch.float16', 'torch.uint32', 'torch.uint32', 'torch.uint32', 'torch.float32')": { + "kwargs": { + "B1": 128, + "B2": 64, + "BK": 32 + }, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 4, + "num_buffers_warp_spec": 0, + "num_consumer_groups": 0, + "reg_dec_producer": 0, + "reg_inc_consumer": 0, + "maxnreg": null, + "pre_hook": null + }, + "(13, 512, 128, 27, 4, True, 'torch.float16', 'torch.float16', 'torch.uint32', 'torch.uint32', 'torch.uint32', 'torch.float32')": { + "kwargs": { + "B1": 128, + "B2": 128, + "BK": 64 + }, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 4, + "num_buffers_warp_spec": 0, + "num_consumer_groups": 0, + "reg_dec_producer": 0, + "reg_inc_consumer": 0, + "maxnreg": null, + "pre_hook": null + }, + "(13, 512, 128, 27, 8, True, 'torch.float16', 'torch.float16', 'torch.uint32', 'torch.uint32', 'torch.uint32', 'torch.float32')": { + "kwargs": { + "B1": 128, + "B2": 128, + "BK": 64 + }, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 4, + "num_buffers_warp_spec": 0, + "num_consumer_groups": 0, + "reg_dec_producer": 0, + "reg_inc_consumer": 0, + "maxnreg": null, + "pre_hook": null + }, + "(13, 512, 128, 27, 16, True, 'torch.float16', 'torch.float16', 'torch.uint32', 'torch.uint32', 'torch.uint32', 'torch.float32')": { + "kwargs": { + "B1": 128, + "B2": 128, + "BK": 64 + }, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 4, + "num_buffers_warp_spec": 0, + "num_consumer_groups": 0, + "reg_dec_producer": 0, + "reg_inc_consumer": 0, + "maxnreg": null, + "pre_hook": null + }, + "(13, 512, 128, 27, 32, True, 'torch.float16', 'torch.float16', 'torch.uint32', 'torch.uint32', 'torch.uint32', 'torch.float32')": { + "kwargs": { + "B1": 128, + "B2": 128, + "BK": 64 + }, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 4, + "num_buffers_warp_spec": 0, + "num_consumer_groups": 0, + "reg_dec_producer": 0, + "reg_inc_consumer": 0, + "maxnreg": null, + "pre_hook": null + }, + "(15, 256, 64, 27, 2, True, 'torch.float16', 'torch.float16', 'torch.uint32', 'torch.uint32', 'torch.uint32', 'torch.float32')": { + "kwargs": { + "B1": 64, + "B2": 64, + "BK": 64 + }, + "num_warps": 2, + "num_ctas": 1, + "num_stages": 4, + "num_buffers_warp_spec": 0, + "num_consumer_groups": 0, + "reg_dec_producer": 0, + "reg_inc_consumer": 0, + "maxnreg": null, + "pre_hook": null + }, + "(15, 256, 64, 27, 4, True, 'torch.float16', 'torch.float16', 'torch.uint32', 'torch.uint32', 'torch.uint32', 'torch.float32')": { + "kwargs": { + "B1": 64, + "B2": 64, + "BK": 64 + }, + "num_warps": 2, + "num_ctas": 1, + "num_stages": 4, + "num_buffers_warp_spec": 0, + "num_consumer_groups": 0, + "reg_dec_producer": 0, + "reg_inc_consumer": 0, + "maxnreg": null, + "pre_hook": null + }, + "(15, 256, 64, 27, 8, True, 'torch.float16', 'torch.float16', 'torch.uint32', 'torch.uint32', 'torch.uint32', 'torch.float32')": { + "kwargs": { + "B1": 64, + "B2": 64, + "BK": 64 + }, + "num_warps": 2, + "num_ctas": 1, + "num_stages": 4, + "num_buffers_warp_spec": 0, + "num_consumer_groups": 0, + "reg_dec_producer": 0, + "reg_inc_consumer": 0, + "maxnreg": null, + "pre_hook": null + }, + "(15, 256, 64, 27, 16, True, 'torch.float16', 'torch.float16', 'torch.uint32', 'torch.uint32', 'torch.uint32', 'torch.float32')": { + "kwargs": { + "B1": 64, + "B2": 64, + "BK": 64 + }, + "num_warps": 2, + "num_ctas": 1, + "num_stages": 4, + "num_buffers_warp_spec": 0, + "num_consumer_groups": 0, + "reg_dec_producer": 0, + "reg_inc_consumer": 0, + "maxnreg": null, + "pre_hook": null + }, + "(15, 256, 64, 27, 32, True, 'torch.float16', 'torch.float16', 'torch.uint32', 'torch.uint32', 'torch.uint32', 'torch.float32')": { + "kwargs": { + "B1": 64, + "B2": 256, + "BK": 32 + }, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 4, + "num_buffers_warp_spec": 0, + "num_consumer_groups": 0, + "reg_dec_producer": 0, + "reg_inc_consumer": 0, + "maxnreg": null, + "pre_hook": null + }, + "(15, 256, 64, 27, 64, True, 'torch.float16', 'torch.float16', 'torch.uint32', 'torch.uint32', 'torch.uint32', 'torch.float32')": { + "kwargs": { + "B1": 64, + "B2": 256, + "BK": 32 + }, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 4, + "num_buffers_warp_spec": 0, + "num_consumer_groups": 0, + "reg_dec_producer": 0, + "reg_inc_consumer": 0, + "maxnreg": null, + "pre_hook": null + }, + "(17, 128, 32, 27, 4, True, 'torch.float16', 'torch.float16', 'torch.uint32', 'torch.uint32', 'torch.uint32', 'torch.float32')": { + "kwargs": { + "B1": 64, + "B2": 64, + "BK": 64 + }, + "num_warps": 2, + "num_ctas": 1, + "num_stages": 4, + "num_buffers_warp_spec": 0, + "num_consumer_groups": 0, + "reg_dec_producer": 0, + "reg_inc_consumer": 0, + "maxnreg": null, + "pre_hook": null + }, + "(17, 128, 32, 27, 8, True, 'torch.float16', 'torch.float16', 'torch.uint32', 'torch.uint32', 'torch.uint32', 'torch.float32')": { + "kwargs": { + "B1": 64, + "B2": 64, + "BK": 64 + }, + "num_warps": 2, + "num_ctas": 1, + "num_stages": 4, + "num_buffers_warp_spec": 0, + "num_consumer_groups": 0, + "reg_dec_producer": 0, + "reg_inc_consumer": 0, + "maxnreg": null, + "pre_hook": null + }, + "(17, 128, 32, 27, 16, True, 'torch.float16', 'torch.float16', 'torch.uint32', 'torch.uint32', 'torch.uint32', 'torch.float32')": { + "kwargs": { + "B1": 64, + "B2": 64, + "BK": 64 + }, + "num_warps": 2, + "num_ctas": 1, + "num_stages": 4, + "num_buffers_warp_spec": 0, + "num_consumer_groups": 0, + "reg_dec_producer": 0, + "reg_inc_consumer": 0, + "maxnreg": null, + "pre_hook": null + }, + "(17, 128, 32, 27, 32, True, 'torch.float16', 'torch.float16', 'torch.uint32', 'torch.uint32', 'torch.uint32', 'torch.float32')": { + "kwargs": { + "B1": 64, + "B2": 64, + "BK": 64 + }, + "num_warps": 2, + "num_ctas": 1, + "num_stages": 4, + "num_buffers_warp_spec": 0, + "num_consumer_groups": 0, + "reg_dec_producer": 0, + "reg_inc_consumer": 0, + "maxnreg": null, + "pre_hook": null + }, + "(17, 128, 32, 27, 64, True, 'torch.float16', 'torch.float16', 'torch.uint32', 'torch.uint32', 'torch.uint32', 'torch.float32')": { + "kwargs": { + "B1": 64, + "B2": 128, + "BK": 32 + }, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 4, + "num_buffers_warp_spec": 0, + "num_consumer_groups": 0, + "reg_dec_producer": 0, + "reg_inc_consumer": 0, + "maxnreg": null, + "pre_hook": null + }, + "(17, 128, 32, 27, 128, True, 'torch.float16', 'torch.float16', 'torch.uint32', 'torch.uint32', 'torch.uint32', 'torch.float32')": { + "kwargs": { + "B1": 64, + "B2": 128, + "BK": 32 + }, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 4, + "num_buffers_warp_spec": 0, + "num_consumer_groups": 0, + "reg_dec_producer": 0, + "reg_inc_consumer": 0, + "maxnreg": null, + "pre_hook": null + }, + "(19, 64, 16, 27, 4, True, 'torch.float16', 'torch.float16', 'torch.uint32', 'torch.uint32', 'torch.uint32', 'torch.float32')": { + "kwargs": { + "B1": 64, + "B2": 64, + "BK": 64 + }, + "num_warps": 2, + "num_ctas": 1, + "num_stages": 4, + "num_buffers_warp_spec": 0, + "num_consumer_groups": 0, + "reg_dec_producer": 0, + "reg_inc_consumer": 0, + "maxnreg": null, + "pre_hook": null + }, + "(19, 64, 16, 27, 8, True, 'torch.float16', 'torch.float16', 'torch.uint32', 'torch.uint32', 'torch.uint32', 'torch.float32')": { + "kwargs": { + "B1": 64, + "B2": 64, + "BK": 64 + }, + "num_warps": 2, + "num_ctas": 1, + "num_stages": 4, + "num_buffers_warp_spec": 0, + "num_consumer_groups": 0, + "reg_dec_producer": 0, + "reg_inc_consumer": 0, + "maxnreg": null, + "pre_hook": null + }, + "(19, 64, 16, 27, 16, True, 'torch.float16', 'torch.float16', 'torch.uint32', 'torch.uint32', 'torch.uint32', 'torch.float32')": { + "kwargs": { + "B1": 64, + "B2": 64, + "BK": 64 + }, + "num_warps": 2, + "num_ctas": 1, + "num_stages": 4, + "num_buffers_warp_spec": 0, + "num_consumer_groups": 0, + "reg_dec_producer": 0, + "reg_inc_consumer": 0, + "maxnreg": null, + "pre_hook": null + }, + "(19, 64, 16, 27, 32, True, 'torch.float16', 'torch.float16', 'torch.uint32', 'torch.uint32', 'torch.uint32', 'torch.float32')": { + "kwargs": { + "B1": 64, + "B2": 64, + "BK": 64 + }, + "num_warps": 2, + "num_ctas": 1, + "num_stages": 4, + "num_buffers_warp_spec": 0, + "num_consumer_groups": 0, + "reg_dec_producer": 0, + "reg_inc_consumer": 0, + "maxnreg": null, + "pre_hook": null + }, + "(19, 64, 16, 27, 64, True, 'torch.float16', 'torch.float16', 'torch.uint32', 'torch.uint32', 'torch.uint32', 'torch.float32')": { + "kwargs": { + "B1": 64, + "B2": 64, + "BK": 64 + }, + "num_warps": 2, + "num_ctas": 1, + "num_stages": 4, + "num_buffers_warp_spec": 0, + "num_consumer_groups": 0, + "reg_dec_producer": 0, + "reg_inc_consumer": 0, + "maxnreg": null, + "pre_hook": null + }, + "(19, 64, 16, 27, 128, True, 'torch.float16', 'torch.float16', 'torch.uint32', 'torch.uint32', 'torch.uint32', 'torch.float32')": { + "kwargs": { + "B1": 64, + "B2": 64, + "BK": 64 + }, + "num_warps": 2, + "num_ctas": 1, + "num_stages": 4, + "num_buffers_warp_spec": 0, + "num_consumer_groups": 0, + "reg_dec_producer": 0, + "reg_inc_consumer": 0, + "maxnreg": null, + "pre_hook": null + }, + "(11, 1024, 1024, 27, 2, True, 'torch.float16', 'torch.float16', 'torch.uint32', 'torch.uint32', 'torch.uint32', 'torch.float32')": { + "kwargs": { + "B1": 128, + "B2": 128, + "BK": 64 + }, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 4, + "num_buffers_warp_spec": 0, + "num_consumer_groups": 0, + "reg_dec_producer": 0, + "reg_inc_consumer": 0, + "maxnreg": null, + "pre_hook": null + }, + "(10, 512, 2048, 27, 2, True, 'torch.float16', 'torch.float16', 'torch.uint32', 'torch.uint32', 'torch.uint32', 'torch.float32')": { + "kwargs": { + "B1": 128, + "B2": 128, + "BK": 64 + }, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 4, + "num_buffers_warp_spec": 0, + "num_consumer_groups": 0, + "reg_dec_producer": 0, + "reg_inc_consumer": 0, + "maxnreg": null, + "pre_hook": null + }, + "(10, 512, 512, 27, 2, True, 'torch.float16', 'torch.float16', 'torch.uint32', 'torch.uint32', 'torch.uint32', 'torch.float32')": { + "kwargs": { + "B1": 128, + "B2": 128, + "BK": 64 + }, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 4, + "num_buffers_warp_spec": 0, + "num_consumer_groups": 0, + "reg_dec_producer": 0, + "reg_inc_consumer": 0, + "maxnreg": null, + "pre_hook": null + }, + "(10, 512, 512, 27, 4, True, 'torch.float16', 'torch.float16', 'torch.uint32', 'torch.uint32', 'torch.uint32', 'torch.float32')": { + "kwargs": { + "B1": 128, + "B2": 128, + "BK": 64 + }, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 4, + "num_buffers_warp_spec": 0, + "num_consumer_groups": 0, + "reg_dec_producer": 0, + "reg_inc_consumer": 0, + "maxnreg": null, + "pre_hook": null + }, + "(10, 512, 512, 27, 8, True, 'torch.float16', 'torch.float16', 'torch.uint32', 'torch.uint32', 'torch.uint32', 'torch.float32')": { + "kwargs": { + "B1": 128, + "B2": 128, + "BK": 64 + }, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 4, + "num_buffers_warp_spec": 0, + "num_consumer_groups": 0, + "reg_dec_producer": 0, + "reg_inc_consumer": 0, + "maxnreg": null, + "pre_hook": null + }, + "(10, 512, 128, 27, 2, True, 'torch.float16', 'torch.float16', 'torch.uint32', 'torch.uint32', 'torch.uint32', 'torch.float32')": { + "kwargs": { + "B1": 128, + "B2": 128, + "BK": 32 + }, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 4, + "num_buffers_warp_spec": 0, + "num_consumer_groups": 0, + "reg_dec_producer": 0, + "reg_inc_consumer": 0, + "maxnreg": null, + "pre_hook": null + }, + "(10, 512, 128, 27, 4, True, 'torch.float16', 'torch.float16', 'torch.uint32', 'torch.uint32', 'torch.uint32', 'torch.float32')": { + "kwargs": { + "B1": 128, + "B2": 128, + "BK": 32 + }, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 4, + "num_buffers_warp_spec": 0, + "num_consumer_groups": 0, + "reg_dec_producer": 0, + "reg_inc_consumer": 0, + "maxnreg": null, + "pre_hook": null + }, + "(10, 512, 128, 27, 8, True, 'torch.float16', 'torch.float16', 'torch.uint32', 'torch.uint32', 'torch.uint32', 'torch.float32')": { + "kwargs": { + "B1": 128, + "B2": 128, + "BK": 32 + }, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 4, + "num_buffers_warp_spec": 0, + "num_consumer_groups": 0, + "reg_dec_producer": 0, + "reg_inc_consumer": 0, + "maxnreg": null, + "pre_hook": null + }, + "(10, 512, 128, 27, 16, True, 'torch.float16', 'torch.float16', 'torch.uint32', 'torch.uint32', 'torch.uint32', 'torch.float32')": { + "kwargs": { + "B1": 128, + "B2": 128, + "BK": 64 + }, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 4, + "num_buffers_warp_spec": 0, + "num_consumer_groups": 0, + "reg_dec_producer": 0, + "reg_inc_consumer": 0, + "maxnreg": null, + "pre_hook": null + }, + "(10, 512, 128, 27, 32, True, 'torch.float16', 'torch.float16', 'torch.uint32', 'torch.uint32', 'torch.uint32', 'torch.float32')": { + "kwargs": { + "B1": 64, + "B2": 256, + "BK": 32 + }, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 4, + "num_buffers_warp_spec": 0, + "num_consumer_groups": 0, + "reg_dec_producer": 0, + "reg_inc_consumer": 0, + "maxnreg": null, + "pre_hook": null + }, + "(20, 64, 64, 27, 4, True, 'torch.float16', 'torch.float16', 'torch.uint32', 'torch.uint32', 'torch.uint32', 'torch.float32')": { + "kwargs": { + "B1": 64, + "B2": 64, + "BK": 64 + }, + "num_warps": 2, + "num_ctas": 1, + "num_stages": 4, + "num_buffers_warp_spec": 0, + "num_consumer_groups": 0, + "reg_dec_producer": 0, + "reg_inc_consumer": 0, + "maxnreg": null, + "pre_hook": null + }, + "(20, 64, 64, 27, 8, True, 'torch.float16', 'torch.float16', 'torch.uint32', 'torch.uint32', 'torch.uint32', 'torch.float32')": { + "kwargs": { + "B1": 64, + "B2": 64, + "BK": 64 + }, + "num_warps": 2, + "num_ctas": 1, + "num_stages": 4, + "num_buffers_warp_spec": 0, + "num_consumer_groups": 0, + "reg_dec_producer": 0, + "reg_inc_consumer": 0, + "maxnreg": null, + "pre_hook": null + }, + "(20, 64, 64, 27, 16, True, 'torch.float16', 'torch.float16', 'torch.uint32', 'torch.uint32', 'torch.uint32', 'torch.float32')": { + "kwargs": { + "B1": 64, + "B2": 64, + "BK": 64 + }, + "num_warps": 2, + "num_ctas": 1, + "num_stages": 4, + "num_buffers_warp_spec": 0, + "num_consumer_groups": 0, + "reg_dec_producer": 0, + "reg_inc_consumer": 0, + "maxnreg": null, + "pre_hook": null + }, + "(20, 64, 64, 27, 32, True, 'torch.float16', 'torch.float16', 'torch.uint32', 'torch.uint32', 'torch.uint32', 'torch.float32')": { + "kwargs": { + "B1": 64, + "B2": 64, + "BK": 64 + }, + "num_warps": 2, + "num_ctas": 1, + "num_stages": 4, + "num_buffers_warp_spec": 0, + "num_consumer_groups": 0, + "reg_dec_producer": 0, + "reg_inc_consumer": 0, + "maxnreg": null, + "pre_hook": null + }, + "(20, 64, 64, 27, 64, True, 'torch.float16', 'torch.float16', 'torch.uint32', 'torch.uint32', 'torch.uint32', 'torch.float32')": { + "kwargs": { + "B1": 64, + "B2": 64, + "BK": 64 + }, + "num_warps": 2, + "num_ctas": 1, + "num_stages": 4, + "num_buffers_warp_spec": 0, + "num_consumer_groups": 0, + "reg_dec_producer": 0, + "reg_inc_consumer": 0, + "maxnreg": null, + "pre_hook": null + }, + "(20, 64, 64, 27, 128, True, 'torch.float16', 'torch.float16', 'torch.uint32', 'torch.uint32', 'torch.uint32', 'torch.float32')": { + "kwargs": { + "B1": 64, + "B2": 64, + "BK": 64 + }, + "num_warps": 2, + "num_ctas": 1, + "num_stages": 4, + "num_buffers_warp_spec": 0, + "num_consumer_groups": 0, + "reg_dec_producer": 0, + "reg_inc_consumer": 0, + "maxnreg": null, + "pre_hook": null + }, + "(20, 64, 16, 27, 4, True, 'torch.float16', 'torch.float16', 'torch.uint32', 'torch.uint32', 'torch.uint32', 'torch.float32')": { + "kwargs": { + "B1": 64, + "B2": 64, + "BK": 64 + }, + "num_warps": 2, + "num_ctas": 1, + "num_stages": 4, + "num_buffers_warp_spec": 0, + "num_consumer_groups": 0, + "reg_dec_producer": 0, + "reg_inc_consumer": 0, + "maxnreg": null, + "pre_hook": null + }, + "(20, 64, 16, 27, 8, True, 'torch.float16', 'torch.float16', 'torch.uint32', 'torch.uint32', 'torch.uint32', 'torch.float32')": { + "kwargs": { + "B1": 64, + "B2": 64, + "BK": 64 + }, + "num_warps": 2, + "num_ctas": 1, + "num_stages": 4, + "num_buffers_warp_spec": 0, + "num_consumer_groups": 0, + "reg_dec_producer": 0, + "reg_inc_consumer": 0, + "maxnreg": null, + "pre_hook": null + }, + "(20, 64, 16, 27, 16, True, 'torch.float16', 'torch.float16', 'torch.uint32', 'torch.uint32', 'torch.uint32', 'torch.float32')": { + "kwargs": { + "B1": 64, + "B2": 64, + "BK": 64 + }, + "num_warps": 2, + "num_ctas": 1, + "num_stages": 4, + "num_buffers_warp_spec": 0, + "num_consumer_groups": 0, + "reg_dec_producer": 0, + "reg_inc_consumer": 0, + "maxnreg": null, + "pre_hook": null + }, + "(20, 64, 16, 27, 32, True, 'torch.float16', 'torch.float16', 'torch.uint32', 'torch.uint32', 'torch.uint32', 'torch.float32')": { + "kwargs": { + "B1": 64, + "B2": 64, + "BK": 64 + }, + "num_warps": 2, + "num_ctas": 1, + "num_stages": 4, + "num_buffers_warp_spec": 0, + "num_consumer_groups": 0, + "reg_dec_producer": 0, + "reg_inc_consumer": 0, + "maxnreg": null, + "pre_hook": null + }, + "(20, 64, 16, 27, 64, True, 'torch.float16', 'torch.float16', 'torch.uint32', 'torch.uint32', 'torch.uint32', 'torch.float32')": { + "kwargs": { + "B1": 64, + "B2": 64, + "BK": 64 + }, + "num_warps": 2, + "num_ctas": 1, + "num_stages": 4, + "num_buffers_warp_spec": 0, + "num_consumer_groups": 0, + "reg_dec_producer": 0, + "reg_inc_consumer": 0, + "maxnreg": null, + "pre_hook": null + }, + "(20, 64, 16, 27, 128, True, 'torch.float16', 'torch.float16', 'torch.uint32', 'torch.uint32', 'torch.uint32', 'torch.float32')": { + "kwargs": { + "B1": 64, + "B2": 64, + "BK": 64 + }, + "num_warps": 2, + "num_ctas": 1, + "num_stages": 4, + "num_buffers_warp_spec": 0, + "num_consumer_groups": 0, + "reg_dec_producer": 0, + "reg_inc_consumer": 0, + "maxnreg": null, + "pre_hook": null + }, + "(18, 128, 512, 27, 2, True, 'torch.float16', 'torch.float16', 'torch.uint32', 'torch.uint32', 'torch.uint32', 'torch.float32')": { + "kwargs": { + "B1": 128, + "B2": 128, + "BK": 64 + }, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 4, + "num_buffers_warp_spec": 0, + "num_consumer_groups": 0, + "reg_dec_producer": 0, + "reg_inc_consumer": 0, + "maxnreg": null, + "pre_hook": null + }, + "(18, 128, 512, 27, 4, True, 'torch.float16', 'torch.float16', 'torch.uint32', 'torch.uint32', 'torch.uint32', 'torch.float32')": { + "kwargs": { + "B1": 128, + "B2": 128, + "BK": 64 + }, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 4, + "num_buffers_warp_spec": 0, + "num_consumer_groups": 0, + "reg_dec_producer": 0, + "reg_inc_consumer": 0, + "maxnreg": null, + "pre_hook": null + }, + "(18, 128, 512, 27, 8, True, 'torch.float16', 'torch.float16', 'torch.uint32', 'torch.uint32', 'torch.uint32', 'torch.float32')": { + "kwargs": { + "B1": 128, + "B2": 128, + "BK": 64 + }, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 4, + "num_buffers_warp_spec": 0, + "num_consumer_groups": 0, + "reg_dec_producer": 0, + "reg_inc_consumer": 0, + "maxnreg": null, + "pre_hook": null + }, + "(18, 128, 512, 27, 16, True, 'torch.float16', 'torch.float16', 'torch.uint32', 'torch.uint32', 'torch.uint32', 'torch.float32')": { + "kwargs": { + "B1": 128, + "B2": 128, + "BK": 64 + }, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 4, + "num_buffers_warp_spec": 0, + "num_consumer_groups": 0, + "reg_dec_producer": 0, + "reg_inc_consumer": 0, + "maxnreg": null, + "pre_hook": null + }, + "(18, 128, 512, 27, 32, True, 'torch.float16', 'torch.float16', 'torch.uint32', 'torch.uint32', 'torch.uint32', 'torch.float32')": { + "kwargs": { + "B1": 128, + "B2": 128, + "BK": 64 + }, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 4, + "num_buffers_warp_spec": 0, + "num_consumer_groups": 0, + "reg_dec_producer": 0, + "reg_inc_consumer": 0, + "maxnreg": null, + "pre_hook": null + }, + "(18, 128, 128, 27, 4, True, 'torch.float16', 'torch.float16', 'torch.uint32', 'torch.uint32', 'torch.uint32', 'torch.float32')": { + "kwargs": { + "B1": 128, + "B2": 128, + "BK": 64 + }, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 4, + "num_buffers_warp_spec": 0, + "num_consumer_groups": 0, + "reg_dec_producer": 0, + "reg_inc_consumer": 0, + "maxnreg": null, + "pre_hook": null + }, + "(18, 128, 128, 27, 8, True, 'torch.float16', 'torch.float16', 'torch.uint32', 'torch.uint32', 'torch.uint32', 'torch.float32')": { + "kwargs": { + "B1": 128, + "B2": 128, + "BK": 64 + }, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 4, + "num_buffers_warp_spec": 0, + "num_consumer_groups": 0, + "reg_dec_producer": 0, + "reg_inc_consumer": 0, + "maxnreg": null, + "pre_hook": null + }, + "(18, 128, 128, 27, 16, True, 'torch.float16', 'torch.float16', 'torch.uint32', 'torch.uint32', 'torch.uint32', 'torch.float32')": { + "kwargs": { + "B1": 128, + "B2": 128, + "BK": 64 + }, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 4, + "num_buffers_warp_spec": 0, + "num_consumer_groups": 0, + "reg_dec_producer": 0, + "reg_inc_consumer": 0, + "maxnreg": null, + "pre_hook": null + }, + "(18, 128, 128, 27, 32, True, 'torch.float16', 'torch.float16', 'torch.uint32', 'torch.uint32', 'torch.uint32', 'torch.float32')": { + "kwargs": { + "B1": 128, + "B2": 128, + "BK": 64 + }, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 4, + "num_buffers_warp_spec": 0, + "num_consumer_groups": 0, + "reg_dec_producer": 0, + "reg_inc_consumer": 0, + "maxnreg": null, + "pre_hook": null + }, + "(18, 128, 128, 27, 64, True, 'torch.float16', 'torch.float16', 'torch.uint32', 'torch.uint32', 'torch.uint32', 'torch.float32')": { + "kwargs": { + "B1": 128, + "B2": 128, + "BK": 64 + }, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 4, + "num_buffers_warp_spec": 0, + "num_consumer_groups": 0, + "reg_dec_producer": 0, + "reg_inc_consumer": 0, + "maxnreg": null, + "pre_hook": null + }, + "(18, 128, 128, 27, 128, True, 'torch.float16', 'torch.float16', 'torch.uint32', 'torch.uint32', 'torch.uint32', 'torch.float32')": { + "kwargs": { + "B1": 128, + "B2": 128, + "BK": 64 + }, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 4, + "num_buffers_warp_spec": 0, + "num_consumer_groups": 0, + "reg_dec_producer": 0, + "reg_inc_consumer": 0, + "maxnreg": null, + "pre_hook": null + }, + "(18, 128, 32, 27, 4, True, 'torch.float16', 'torch.float16', 'torch.uint32', 'torch.uint32', 'torch.uint32', 'torch.float32')": { + "kwargs": { + "B1": 64, + "B2": 64, + "BK": 64 + }, + "num_warps": 2, + "num_ctas": 1, + "num_stages": 4, + "num_buffers_warp_spec": 0, + "num_consumer_groups": 0, + "reg_dec_producer": 0, + "reg_inc_consumer": 0, + "maxnreg": null, + "pre_hook": null + }, + "(18, 128, 32, 27, 8, True, 'torch.float16', 'torch.float16', 'torch.uint32', 'torch.uint32', 'torch.uint32', 'torch.float32')": { + "kwargs": { + "B1": 64, + "B2": 64, + "BK": 64 + }, + "num_warps": 2, + "num_ctas": 1, + "num_stages": 4, + "num_buffers_warp_spec": 0, + "num_consumer_groups": 0, + "reg_dec_producer": 0, + "reg_inc_consumer": 0, + "maxnreg": null, + "pre_hook": null + }, + "(18, 128, 32, 27, 16, True, 'torch.float16', 'torch.float16', 'torch.uint32', 'torch.uint32', 'torch.uint32', 'torch.float32')": { + "kwargs": { + "B1": 64, + "B2": 64, + "BK": 64 + }, + "num_warps": 2, + "num_ctas": 1, + "num_stages": 4, + "num_buffers_warp_spec": 0, + "num_consumer_groups": 0, + "reg_dec_producer": 0, + "reg_inc_consumer": 0, + "maxnreg": null, + "pre_hook": null + }, + "(18, 128, 32, 27, 32, True, 'torch.float16', 'torch.float16', 'torch.uint32', 'torch.uint32', 'torch.uint32', 'torch.float32')": { + "kwargs": { + "B1": 64, + "B2": 64, + "BK": 64 + }, + "num_warps": 2, + "num_ctas": 1, + "num_stages": 4, + "num_buffers_warp_spec": 0, + "num_consumer_groups": 0, + "reg_dec_producer": 0, + "reg_inc_consumer": 0, + "maxnreg": null, + "pre_hook": null + }, + "(18, 128, 32, 27, 64, True, 'torch.float16', 'torch.float16', 'torch.uint32', 'torch.uint32', 'torch.uint32', 'torch.float32')": { + "kwargs": { + "B1": 64, + "B2": 128, + "BK": 32 + }, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 4, + "num_buffers_warp_spec": 0, + "num_consumer_groups": 0, + "reg_dec_producer": 0, + "reg_inc_consumer": 0, + "maxnreg": null, + "pre_hook": null + }, + "(18, 128, 32, 27, 128, True, 'torch.float16', 'torch.float16', 'torch.uint32', 'torch.uint32', 'torch.uint32', 'torch.float32')": { + "kwargs": { + "B1": 64, + "B2": 128, + "BK": 32 + }, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 4, + "num_buffers_warp_spec": 0, + "num_consumer_groups": 0, + "reg_dec_producer": 0, + "reg_inc_consumer": 0, + "maxnreg": null, + "pre_hook": null + }, + "(16, 256, 1024, 27, 2, True, 'torch.float16', 'torch.float16', 'torch.uint32', 'torch.uint32', 'torch.uint32', 'torch.float32')": { + "kwargs": { + "B1": 128, + "B2": 128, + "BK": 64 + }, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 4, + "num_buffers_warp_spec": 0, + "num_consumer_groups": 0, + "reg_dec_producer": 0, + "reg_inc_consumer": 0, + "maxnreg": null, + "pre_hook": null + }, + "(16, 256, 1024, 27, 4, True, 'torch.float16', 'torch.float16', 'torch.uint32', 'torch.uint32', 'torch.uint32', 'torch.float32')": { + "kwargs": { + "B1": 128, + "B2": 128, + "BK": 64 + }, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 4, + "num_buffers_warp_spec": 0, + "num_consumer_groups": 0, + "reg_dec_producer": 0, + "reg_inc_consumer": 0, + "maxnreg": null, + "pre_hook": null + }, + "(16, 256, 1024, 27, 8, True, 'torch.float16', 'torch.float16', 'torch.uint32', 'torch.uint32', 'torch.uint32', 'torch.float32')": { + "kwargs": { + "B1": 128, + "B2": 128, + "BK": 64 + }, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 4, + "num_buffers_warp_spec": 0, + "num_consumer_groups": 0, + "reg_dec_producer": 0, + "reg_inc_consumer": 0, + "maxnreg": null, + "pre_hook": null + }, + "(16, 256, 256, 27, 2, True, 'torch.float16', 'torch.float16', 'torch.uint32', 'torch.uint32', 'torch.uint32', 'torch.float32')": { + "kwargs": { + "B1": 128, + "B2": 128, + "BK": 64 + }, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 4, + "num_buffers_warp_spec": 0, + "num_consumer_groups": 0, + "reg_dec_producer": 0, + "reg_inc_consumer": 0, + "maxnreg": null, + "pre_hook": null + }, + "(16, 256, 256, 27, 4, True, 'torch.float16', 'torch.float16', 'torch.uint32', 'torch.uint32', 'torch.uint32', 'torch.float32')": { + "kwargs": { + "B1": 128, + "B2": 128, + "BK": 64 + }, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 4, + "num_buffers_warp_spec": 0, + "num_consumer_groups": 0, + "reg_dec_producer": 0, + "reg_inc_consumer": 0, + "maxnreg": null, + "pre_hook": null + }, + "(16, 256, 256, 27, 8, True, 'torch.float16', 'torch.float16', 'torch.uint32', 'torch.uint32', 'torch.uint32', 'torch.float32')": { + "kwargs": { + "B1": 128, + "B2": 128, + "BK": 64 + }, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 4, + "num_buffers_warp_spec": 0, + "num_consumer_groups": 0, + "reg_dec_producer": 0, + "reg_inc_consumer": 0, + "maxnreg": null, + "pre_hook": null + }, + "(16, 256, 256, 27, 16, True, 'torch.float16', 'torch.float16', 'torch.uint32', 'torch.uint32', 'torch.uint32', 'torch.float32')": { + "kwargs": { + "B1": 128, + "B2": 128, + "BK": 64 + }, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 4, + "num_buffers_warp_spec": 0, + "num_consumer_groups": 0, + "reg_dec_producer": 0, + "reg_inc_consumer": 0, + "maxnreg": null, + "pre_hook": null + }, + "(16, 256, 256, 27, 32, True, 'torch.float16', 'torch.float16', 'torch.uint32', 'torch.uint32', 'torch.uint32', 'torch.float32')": { + "kwargs": { + "B1": 128, + "B2": 128, + "BK": 64 + }, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 4, + "num_buffers_warp_spec": 0, + "num_consumer_groups": 0, + "reg_dec_producer": 0, + "reg_inc_consumer": 0, + "maxnreg": null, + "pre_hook": null + }, + "(16, 256, 64, 27, 2, True, 'torch.float16', 'torch.float16', 'torch.uint32', 'torch.uint32', 'torch.uint32', 'torch.float32')": { + "kwargs": { + "B1": 64, + "B2": 64, + "BK": 64 + }, + "num_warps": 2, + "num_ctas": 1, + "num_stages": 4, + "num_buffers_warp_spec": 0, + "num_consumer_groups": 0, + "reg_dec_producer": 0, + "reg_inc_consumer": 0, + "maxnreg": null, + "pre_hook": null + }, + "(16, 256, 64, 27, 4, True, 'torch.float16', 'torch.float16', 'torch.uint32', 'torch.uint32', 'torch.uint32', 'torch.float32')": { + "kwargs": { + "B1": 64, + "B2": 256, + "BK": 64 + }, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 4, + "num_buffers_warp_spec": 0, + "num_consumer_groups": 0, + "reg_dec_producer": 0, + "reg_inc_consumer": 0, + "maxnreg": null, + "pre_hook": null + }, + "(16, 256, 64, 27, 8, True, 'torch.float16', 'torch.float16', 'torch.uint32', 'torch.uint32', 'torch.uint32', 'torch.float32')": { + "kwargs": { + "B1": 64, + "B2": 256, + "BK": 32 + }, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 4, + "num_buffers_warp_spec": 0, + "num_consumer_groups": 0, + "reg_dec_producer": 0, + "reg_inc_consumer": 0, + "maxnreg": null, + "pre_hook": null + }, + "(16, 256, 64, 27, 16, True, 'torch.float16', 'torch.float16', 'torch.uint32', 'torch.uint32', 'torch.uint32', 'torch.float32')": { + "kwargs": { + "B1": 64, + "B2": 256, + "BK": 32 + }, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 4, + "num_buffers_warp_spec": 0, + "num_consumer_groups": 0, + "reg_dec_producer": 0, + "reg_inc_consumer": 0, + "maxnreg": null, + "pre_hook": null + }, + "(16, 256, 64, 27, 32, True, 'torch.float16', 'torch.float16', 'torch.uint32', 'torch.uint32', 'torch.uint32', 'torch.float32')": { + "kwargs": { + "B1": 64, + "B2": 256, + "BK": 32 + }, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 4, + "num_buffers_warp_spec": 0, + "num_consumer_groups": 0, + "reg_dec_producer": 0, + "reg_inc_consumer": 0, + "maxnreg": null, + "pre_hook": null + }, + "(16, 256, 64, 27, 64, True, 'torch.float16', 'torch.float16', 'torch.uint32', 'torch.uint32', 'torch.uint32', 'torch.float32')": { + "kwargs": { + "B1": 64, + "B2": 256, + "BK": 32 + }, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 4, + "num_buffers_warp_spec": 0, + "num_consumer_groups": 0, + "reg_dec_producer": 0, + "reg_inc_consumer": 0, + "maxnreg": null, + "pre_hook": null + }, + "(12, 256, 1024, 27, 2, True, 'torch.float16', 'torch.float16', 'torch.uint32', 'torch.uint32', 'torch.uint32', 'torch.float32')": { + "kwargs": { + "B1": 128, + "B2": 128, + "BK": 64 + }, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 4, + "num_buffers_warp_spec": 0, + "num_consumer_groups": 0, + "reg_dec_producer": 0, + "reg_inc_consumer": 0, + "maxnreg": null, + "pre_hook": null + }, + "(12, 256, 1024, 27, 4, True, 'torch.float16', 'torch.float16', 'torch.uint32', 'torch.uint32', 'torch.uint32', 'torch.float32')": { + "kwargs": { + "B1": 128, + "B2": 128, + "BK": 64 + }, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 4, + "num_buffers_warp_spec": 0, + "num_consumer_groups": 0, + "reg_dec_producer": 0, + "reg_inc_consumer": 0, + "maxnreg": null, + "pre_hook": null + }, + "(12, 256, 1024, 27, 8, True, 'torch.float16', 'torch.float16', 'torch.uint32', 'torch.uint32', 'torch.uint32', 'torch.float32')": { + "kwargs": { + "B1": 128, + "B2": 128, + "BK": 64 + }, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 4, + "num_buffers_warp_spec": 0, + "num_consumer_groups": 0, + "reg_dec_producer": 0, + "reg_inc_consumer": 0, + "maxnreg": null, + "pre_hook": null + }, + "(12, 256, 256, 27, 2, True, 'torch.float16', 'torch.float16', 'torch.uint32', 'torch.uint32', 'torch.uint32', 'torch.float32')": { + "kwargs": { + "B1": 128, + "B2": 128, + "BK": 64 + }, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 4, + "num_buffers_warp_spec": 0, + "num_consumer_groups": 0, + "reg_dec_producer": 0, + "reg_inc_consumer": 0, + "maxnreg": null, + "pre_hook": null + }, + "(12, 256, 256, 27, 4, True, 'torch.float16', 'torch.float16', 'torch.uint32', 'torch.uint32', 'torch.uint32', 'torch.float32')": { + "kwargs": { + "B1": 128, + "B2": 128, + "BK": 64 + }, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 4, + "num_buffers_warp_spec": 0, + "num_consumer_groups": 0, + "reg_dec_producer": 0, + "reg_inc_consumer": 0, + "maxnreg": null, + "pre_hook": null + }, + "(12, 256, 256, 27, 8, True, 'torch.float16', 'torch.float16', 'torch.uint32', 'torch.uint32', 'torch.uint32', 'torch.float32')": { + "kwargs": { + "B1": 128, + "B2": 128, + "BK": 64 + }, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 4, + "num_buffers_warp_spec": 0, + "num_consumer_groups": 0, + "reg_dec_producer": 0, + "reg_inc_consumer": 0, + "maxnreg": null, + "pre_hook": null + }, + "(12, 256, 256, 27, 16, True, 'torch.float16', 'torch.float16', 'torch.uint32', 'torch.uint32', 'torch.uint32', 'torch.float32')": { + "kwargs": { + "B1": 128, + "B2": 128, + "BK": 64 + }, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 4, + "num_buffers_warp_spec": 0, + "num_consumer_groups": 0, + "reg_dec_producer": 0, + "reg_inc_consumer": 0, + "maxnreg": null, + "pre_hook": null + }, + "(12, 256, 256, 27, 32, True, 'torch.float16', 'torch.float16', 'torch.uint32', 'torch.uint32', 'torch.uint32', 'torch.float32')": { + "kwargs": { + "B1": 128, + "B2": 128, + "BK": 64 + }, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 4, + "num_buffers_warp_spec": 0, + "num_consumer_groups": 0, + "reg_dec_producer": 0, + "reg_inc_consumer": 0, + "maxnreg": null, + "pre_hook": null + }, + "(12, 256, 64, 27, 2, True, 'torch.float16', 'torch.float16', 'torch.uint32', 'torch.uint32', 'torch.uint32', 'torch.float32')": { + "kwargs": { + "B1": 64, + "B2": 64, + "BK": 64 + }, + "num_warps": 2, + "num_ctas": 1, + "num_stages": 4, + "num_buffers_warp_spec": 0, + "num_consumer_groups": 0, + "reg_dec_producer": 0, + "reg_inc_consumer": 0, + "maxnreg": null, + "pre_hook": null + }, + "(12, 256, 64, 27, 4, True, 'torch.float16', 'torch.float16', 'torch.uint32', 'torch.uint32', 'torch.uint32', 'torch.float32')": { + "kwargs": { + "B1": 64, + "B2": 256, + "BK": 64 + }, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 4, + "num_buffers_warp_spec": 0, + "num_consumer_groups": 0, + "reg_dec_producer": 0, + "reg_inc_consumer": 0, + "maxnreg": null, + "pre_hook": null + }, + "(12, 256, 64, 27, 8, True, 'torch.float16', 'torch.float16', 'torch.uint32', 'torch.uint32', 'torch.uint32', 'torch.float32')": { + "kwargs": { + "B1": 64, + "B2": 128, + "BK": 32 + }, + "num_warps": 2, + "num_ctas": 1, + "num_stages": 4, + "num_buffers_warp_spec": 0, + "num_consumer_groups": 0, + "reg_dec_producer": 0, + "reg_inc_consumer": 0, + "maxnreg": null, + "pre_hook": null + }, + "(12, 256, 64, 27, 16, True, 'torch.float16', 'torch.float16', 'torch.uint32', 'torch.uint32', 'torch.uint32', 'torch.float32')": { + "kwargs": { + "B1": 64, + "B2": 128, + "BK": 32 + }, + "num_warps": 2, + "num_ctas": 1, + "num_stages": 4, + "num_buffers_warp_spec": 0, + "num_consumer_groups": 0, + "reg_dec_producer": 0, + "reg_inc_consumer": 0, + "maxnreg": null, + "pre_hook": null + }, + "(12, 256, 64, 27, 32, True, 'torch.float16', 'torch.float16', 'torch.uint32', 'torch.uint32', 'torch.uint32', 'torch.float32')": { + "kwargs": { + "B1": 64, + "B2": 128, + "BK": 32 + }, + "num_warps": 2, + "num_ctas": 1, + "num_stages": 4, + "num_buffers_warp_spec": 0, + "num_consumer_groups": 0, + "reg_dec_producer": 0, + "reg_inc_consumer": 0, + "maxnreg": null, + "pre_hook": null + }, + "(12, 256, 64, 27, 64, True, 'torch.float16', 'torch.float16', 'torch.uint32', 'torch.uint32', 'torch.uint32', 'torch.float32')": { + "kwargs": { + "B1": 64, + "B2": 128, + "BK": 32 + }, + "num_warps": 2, + "num_ctas": 1, + "num_stages": 4, + "num_buffers_warp_spec": 0, + "num_consumer_groups": 0, + "reg_dec_producer": 0, + "reg_inc_consumer": 0, + "maxnreg": null, + "pre_hook": null + }, + "(21, 64, 64, 27, 4, True, 'torch.float16', 'torch.float16', 'torch.uint32', 'torch.uint32', 'torch.uint32', 'torch.float32')": { + "kwargs": { + "B1": 64, + "B2": 64, + "BK": 64 + }, + "num_warps": 2, + "num_ctas": 1, + "num_stages": 4, + "num_buffers_warp_spec": 0, + "num_consumer_groups": 0, + "reg_dec_producer": 0, + "reg_inc_consumer": 0, + "maxnreg": null, + "pre_hook": null + }, + "(21, 64, 64, 27, 8, True, 'torch.float16', 'torch.float16', 'torch.uint32', 'torch.uint32', 'torch.uint32', 'torch.float32')": { + "kwargs": { + "B1": 64, + "B2": 64, + "BK": 64 + }, + "num_warps": 2, + "num_ctas": 1, + "num_stages": 4, + "num_buffers_warp_spec": 0, + "num_consumer_groups": 0, + "reg_dec_producer": 0, + "reg_inc_consumer": 0, + "maxnreg": null, + "pre_hook": null + }, + "(21, 64, 64, 27, 16, True, 'torch.float16', 'torch.float16', 'torch.uint32', 'torch.uint32', 'torch.uint32', 'torch.float32')": { + "kwargs": { + "B1": 64, + "B2": 64, + "BK": 64 + }, + "num_warps": 2, + "num_ctas": 1, + "num_stages": 4, + "num_buffers_warp_spec": 0, + "num_consumer_groups": 0, + "reg_dec_producer": 0, + "reg_inc_consumer": 0, + "maxnreg": null, + "pre_hook": null + }, + "(21, 64, 64, 27, 32, True, 'torch.float16', 'torch.float16', 'torch.uint32', 'torch.uint32', 'torch.uint32', 'torch.float32')": { + "kwargs": { + "B1": 64, + "B2": 64, + "BK": 64 + }, + "num_warps": 2, + "num_ctas": 1, + "num_stages": 4, + "num_buffers_warp_spec": 0, + "num_consumer_groups": 0, + "reg_dec_producer": 0, + "reg_inc_consumer": 0, + "maxnreg": null, + "pre_hook": null + }, + "(21, 64, 64, 27, 64, True, 'torch.float16', 'torch.float16', 'torch.uint32', 'torch.uint32', 'torch.uint32', 'torch.float32')": { + "kwargs": { + "B1": 64, + "B2": 64, + "BK": 64 + }, + "num_warps": 2, + "num_ctas": 1, + "num_stages": 4, + "num_buffers_warp_spec": 0, + "num_consumer_groups": 0, + "reg_dec_producer": 0, + "reg_inc_consumer": 0, + "maxnreg": null, + "pre_hook": null + }, + "(21, 64, 64, 27, 128, True, 'torch.float16', 'torch.float16', 'torch.uint32', 'torch.uint32', 'torch.uint32', 'torch.float32')": { + "kwargs": { + "B1": 64, + "B2": 64, + "BK": 64 + }, + "num_warps": 2, + "num_ctas": 1, + "num_stages": 4, + "num_buffers_warp_spec": 0, + "num_consumer_groups": 0, + "reg_dec_producer": 0, + "reg_inc_consumer": 0, + "maxnreg": null, + "pre_hook": null + }, + "(21, 64, 16, 27, 4, True, 'torch.float16', 'torch.float16', 'torch.uint32', 'torch.uint32', 'torch.uint32', 'torch.float32')": { + "kwargs": { + "B1": 64, + "B2": 64, + "BK": 64 + }, + "num_warps": 2, + "num_ctas": 1, + "num_stages": 4, + "num_buffers_warp_spec": 0, + "num_consumer_groups": 0, + "reg_dec_producer": 0, + "reg_inc_consumer": 0, + "maxnreg": null, + "pre_hook": null + }, + "(21, 64, 16, 27, 8, True, 'torch.float16', 'torch.float16', 'torch.uint32', 'torch.uint32', 'torch.uint32', 'torch.float32')": { + "kwargs": { + "B1": 64, + "B2": 64, + "BK": 64 + }, + "num_warps": 2, + "num_ctas": 1, + "num_stages": 4, + "num_buffers_warp_spec": 0, + "num_consumer_groups": 0, + "reg_dec_producer": 0, + "reg_inc_consumer": 0, + "maxnreg": null, + "pre_hook": null + }, + "(21, 64, 16, 27, 16, True, 'torch.float16', 'torch.float16', 'torch.uint32', 'torch.uint32', 'torch.uint32', 'torch.float32')": { + "kwargs": { + "B1": 64, + "B2": 64, + "BK": 64 + }, + "num_warps": 2, + "num_ctas": 1, + "num_stages": 4, + "num_buffers_warp_spec": 0, + "num_consumer_groups": 0, + "reg_dec_producer": 0, + "reg_inc_consumer": 0, + "maxnreg": null, + "pre_hook": null + }, + "(21, 64, 16, 27, 32, True, 'torch.float16', 'torch.float16', 'torch.uint32', 'torch.uint32', 'torch.uint32', 'torch.float32')": { + "kwargs": { + "B1": 64, + "B2": 64, + "BK": 64 + }, + "num_warps": 2, + "num_ctas": 1, + "num_stages": 4, + "num_buffers_warp_spec": 0, + "num_consumer_groups": 0, + "reg_dec_producer": 0, + "reg_inc_consumer": 0, + "maxnreg": null, + "pre_hook": null + }, + "(21, 64, 16, 27, 64, True, 'torch.float16', 'torch.float16', 'torch.uint32', 'torch.uint32', 'torch.uint32', 'torch.float32')": { + "kwargs": { + "B1": 64, + "B2": 64, + "BK": 64 + }, + "num_warps": 2, + "num_ctas": 1, + "num_stages": 4, + "num_buffers_warp_spec": 0, + "num_consumer_groups": 0, + "reg_dec_producer": 0, + "reg_inc_consumer": 0, + "maxnreg": null, + "pre_hook": null + }, + "(21, 64, 16, 27, 128, True, 'torch.float16', 'torch.float16', 'torch.uint32', 'torch.uint32', 'torch.uint32', 'torch.float32')": { + "kwargs": { + "B1": 64, + "B2": 64, + "BK": 64 + }, + "num_warps": 2, + "num_ctas": 1, + "num_stages": 4, + "num_buffers_warp_spec": 0, + "num_consumer_groups": 0, + "reg_dec_producer": 0, + "reg_inc_consumer": 0, + "maxnreg": null, + "pre_hook": null + }, + "(19, 128, 512, 27, 2, True, 'torch.float16', 'torch.float16', 'torch.uint32', 'torch.uint32', 'torch.uint32', 'torch.float32')": { + "kwargs": { + "B1": 128, + "B2": 128, + "BK": 64 + }, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 4, + "num_buffers_warp_spec": 0, + "num_consumer_groups": 0, + "reg_dec_producer": 0, + "reg_inc_consumer": 0, + "maxnreg": null, + "pre_hook": null + }, + "(19, 128, 512, 27, 4, True, 'torch.float16', 'torch.float16', 'torch.uint32', 'torch.uint32', 'torch.uint32', 'torch.float32')": { + "kwargs": { + "B1": 128, + "B2": 128, + "BK": 64 + }, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 4, + "num_buffers_warp_spec": 0, + "num_consumer_groups": 0, + "reg_dec_producer": 0, + "reg_inc_consumer": 0, + "maxnreg": null, + "pre_hook": null + }, + "(19, 128, 512, 27, 8, True, 'torch.float16', 'torch.float16', 'torch.uint32', 'torch.uint32', 'torch.uint32', 'torch.float32')": { + "kwargs": { + "B1": 256, + "B2": 128, + "BK": 64 + }, + "num_warps": 8, + "num_ctas": 1, + "num_stages": 4, + "num_buffers_warp_spec": 0, + "num_consumer_groups": 0, + "reg_dec_producer": 0, + "reg_inc_consumer": 0, + "maxnreg": null, + "pre_hook": null + }, + "(19, 128, 512, 27, 16, True, 'torch.float16', 'torch.float16', 'torch.uint32', 'torch.uint32', 'torch.uint32', 'torch.float32')": { + "kwargs": { + "B1": 256, + "B2": 128, + "BK": 64 + }, + "num_warps": 8, + "num_ctas": 1, + "num_stages": 4, + "num_buffers_warp_spec": 0, + "num_consumer_groups": 0, + "reg_dec_producer": 0, + "reg_inc_consumer": 0, + "maxnreg": null, + "pre_hook": null + }, + "(19, 128, 512, 27, 32, True, 'torch.float16', 'torch.float16', 'torch.uint32', 'torch.uint32', 'torch.uint32', 'torch.float32')": { + "kwargs": { + "B1": 256, + "B2": 128, + "BK": 64 + }, + "num_warps": 8, + "num_ctas": 1, + "num_stages": 4, + "num_buffers_warp_spec": 0, + "num_consumer_groups": 0, + "reg_dec_producer": 0, + "reg_inc_consumer": 0, + "maxnreg": null, + "pre_hook": null + }, + "(19, 128, 128, 27, 4, True, 'torch.float16', 'torch.float16', 'torch.uint32', 'torch.uint32', 'torch.uint32', 'torch.float32')": { + "kwargs": { + "B1": 128, + "B2": 128, + "BK": 64 + }, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 4, + "num_buffers_warp_spec": 0, + "num_consumer_groups": 0, + "reg_dec_producer": 0, + "reg_inc_consumer": 0, + "maxnreg": null, + "pre_hook": null + }, + "(19, 128, 128, 27, 8, True, 'torch.float16', 'torch.float16', 'torch.uint32', 'torch.uint32', 'torch.uint32', 'torch.float32')": { + "kwargs": { + "B1": 128, + "B2": 128, + "BK": 64 + }, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 4, + "num_buffers_warp_spec": 0, + "num_consumer_groups": 0, + "reg_dec_producer": 0, + "reg_inc_consumer": 0, + "maxnreg": null, + "pre_hook": null + }, + "(19, 128, 128, 27, 16, True, 'torch.float16', 'torch.float16', 'torch.uint32', 'torch.uint32', 'torch.uint32', 'torch.float32')": { + "kwargs": { + "B1": 128, + "B2": 128, + "BK": 64 + }, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 4, + "num_buffers_warp_spec": 0, + "num_consumer_groups": 0, + "reg_dec_producer": 0, + "reg_inc_consumer": 0, + "maxnreg": null, + "pre_hook": null + }, + "(19, 128, 128, 27, 32, True, 'torch.float16', 'torch.float16', 'torch.uint32', 'torch.uint32', 'torch.uint32', 'torch.float32')": { + "kwargs": { + "B1": 128, + "B2": 128, + "BK": 64 + }, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 4, + "num_buffers_warp_spec": 0, + "num_consumer_groups": 0, + "reg_dec_producer": 0, + "reg_inc_consumer": 0, + "maxnreg": null, + "pre_hook": null + }, + "(19, 128, 128, 27, 64, True, 'torch.float16', 'torch.float16', 'torch.uint32', 'torch.uint32', 'torch.uint32', 'torch.float32')": { + "kwargs": { + "B1": 128, + "B2": 128, + "BK": 64 + }, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 4, + "num_buffers_warp_spec": 0, + "num_consumer_groups": 0, + "reg_dec_producer": 0, + "reg_inc_consumer": 0, + "maxnreg": null, + "pre_hook": null + }, + "(19, 128, 128, 27, 128, True, 'torch.float16', 'torch.float16', 'torch.uint32', 'torch.uint32', 'torch.uint32', 'torch.float32')": { + "kwargs": { + "B1": 128, + "B2": 128, + "BK": 64 + }, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 4, + "num_buffers_warp_spec": 0, + "num_consumer_groups": 0, + "reg_dec_producer": 0, + "reg_inc_consumer": 0, + "maxnreg": null, + "pre_hook": null + }, + "(17, 256, 1024, 27, 2, True, 'torch.float16', 'torch.float16', 'torch.uint32', 'torch.uint32', 'torch.uint32', 'torch.float32')": { + "kwargs": { + "B1": 256, + "B2": 128, + "BK": 64 + }, + "num_warps": 8, + "num_ctas": 1, + "num_stages": 4, + "num_buffers_warp_spec": 0, + "num_consumer_groups": 0, + "reg_dec_producer": 0, + "reg_inc_consumer": 0, + "maxnreg": null, + "pre_hook": null + }, + "(17, 256, 1024, 27, 4, True, 'torch.float16', 'torch.float16', 'torch.uint32', 'torch.uint32', 'torch.uint32', 'torch.float32')": { + "kwargs": { + "B1": 256, + "B2": 128, + "BK": 64 + }, + "num_warps": 8, + "num_ctas": 1, + "num_stages": 4, + "num_buffers_warp_spec": 0, + "num_consumer_groups": 0, + "reg_dec_producer": 0, + "reg_inc_consumer": 0, + "maxnreg": null, + "pre_hook": null + }, + "(17, 256, 1024, 27, 8, True, 'torch.float16', 'torch.float16', 'torch.uint32', 'torch.uint32', 'torch.uint32', 'torch.float32')": { + "kwargs": { + "B1": 128, + "B2": 128, + "BK": 64 + }, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 4, + "num_buffers_warp_spec": 0, + "num_consumer_groups": 0, + "reg_dec_producer": 0, + "reg_inc_consumer": 0, + "maxnreg": null, + "pre_hook": null + }, + "(17, 256, 256, 27, 2, True, 'torch.float16', 'torch.float16', 'torch.uint32', 'torch.uint32', 'torch.uint32', 'torch.float32')": { + "kwargs": { + "B1": 128, + "B2": 128, + "BK": 64 + }, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 4, + "num_buffers_warp_spec": 0, + "num_consumer_groups": 0, + "reg_dec_producer": 0, + "reg_inc_consumer": 0, + "maxnreg": null, + "pre_hook": null + }, + "(17, 256, 256, 27, 4, True, 'torch.float16', 'torch.float16', 'torch.uint32', 'torch.uint32', 'torch.uint32', 'torch.float32')": { + "kwargs": { + "B1": 128, + "B2": 128, + "BK": 64 + }, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 4, + "num_buffers_warp_spec": 0, + "num_consumer_groups": 0, + "reg_dec_producer": 0, + "reg_inc_consumer": 0, + "maxnreg": null, + "pre_hook": null + }, + "(17, 256, 256, 27, 8, True, 'torch.float16', 'torch.float16', 'torch.uint32', 'torch.uint32', 'torch.uint32', 'torch.float32')": { + "kwargs": { + "B1": 128, + "B2": 128, + "BK": 64 + }, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 4, + "num_buffers_warp_spec": 0, + "num_consumer_groups": 0, + "reg_dec_producer": 0, + "reg_inc_consumer": 0, + "maxnreg": null, + "pre_hook": null + }, + "(17, 256, 256, 27, 16, True, 'torch.float16', 'torch.float16', 'torch.uint32', 'torch.uint32', 'torch.uint32', 'torch.float32')": { + "kwargs": { + "B1": 128, + "B2": 128, + "BK": 64 + }, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 4, + "num_buffers_warp_spec": 0, + "num_consumer_groups": 0, + "reg_dec_producer": 0, + "reg_inc_consumer": 0, + "maxnreg": null, + "pre_hook": null + }, + "(17, 256, 256, 27, 32, True, 'torch.float16', 'torch.float16', 'torch.uint32', 'torch.uint32', 'torch.uint32', 'torch.float32')": { + "kwargs": { + "B1": 128, + "B2": 128, + "BK": 64 + }, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 4, + "num_buffers_warp_spec": 0, + "num_consumer_groups": 0, + "reg_dec_producer": 0, + "reg_inc_consumer": 0, + "maxnreg": null, + "pre_hook": null + }, + "(14, 512, 2048, 27, 2, True, 'torch.float16', 'torch.float16', 'torch.uint32', 'torch.uint32', 'torch.uint32', 'torch.float32')": { + "kwargs": { + "B1": 128, + "B2": 128, + "BK": 64 + }, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 4, + "num_buffers_warp_spec": 0, + "num_consumer_groups": 0, + "reg_dec_producer": 0, + "reg_inc_consumer": 0, + "maxnreg": null, + "pre_hook": null + }, + "(14, 512, 512, 27, 2, True, 'torch.float16', 'torch.float16', 'torch.uint32', 'torch.uint32', 'torch.uint32', 'torch.float32')": { + "kwargs": { + "B1": 128, + "B2": 128, + "BK": 64 + }, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 4, + "num_buffers_warp_spec": 0, + "num_consumer_groups": 0, + "reg_dec_producer": 0, + "reg_inc_consumer": 0, + "maxnreg": null, + "pre_hook": null + }, + "(14, 512, 512, 27, 4, True, 'torch.float16', 'torch.float16', 'torch.uint32', 'torch.uint32', 'torch.uint32', 'torch.float32')": { + "kwargs": { + "B1": 128, + "B2": 128, + "BK": 64 + }, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 4, + "num_buffers_warp_spec": 0, + "num_consumer_groups": 0, + "reg_dec_producer": 0, + "reg_inc_consumer": 0, + "maxnreg": null, + "pre_hook": null + }, + "(14, 512, 512, 27, 8, True, 'torch.float16', 'torch.float16', 'torch.uint32', 'torch.uint32', 'torch.uint32', 'torch.float32')": { + "kwargs": { + "B1": 128, + "B2": 128, + "BK": 64 + }, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 4, + "num_buffers_warp_spec": 0, + "num_consumer_groups": 0, + "reg_dec_producer": 0, + "reg_inc_consumer": 0, + "maxnreg": null, + "pre_hook": null + }, + "(14, 512, 128, 27, 2, True, 'torch.float16', 'torch.float16', 'torch.uint32', 'torch.uint32', 'torch.uint32', 'torch.float32')": { + "kwargs": { + "B1": 128, + "B2": 128, + "BK": 64 + }, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 4, + "num_buffers_warp_spec": 0, + "num_consumer_groups": 0, + "reg_dec_producer": 0, + "reg_inc_consumer": 0, + "maxnreg": null, + "pre_hook": null + }, + "(14, 512, 128, 27, 4, True, 'torch.float16', 'torch.float16', 'torch.uint32', 'torch.uint32', 'torch.uint32', 'torch.float32')": { + "kwargs": { + "B1": 128, + "B2": 128, + "BK": 64 + }, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 4, + "num_buffers_warp_spec": 0, + "num_consumer_groups": 0, + "reg_dec_producer": 0, + "reg_inc_consumer": 0, + "maxnreg": null, + "pre_hook": null + }, + "(14, 512, 128, 27, 8, True, 'torch.float16', 'torch.float16', 'torch.uint32', 'torch.uint32', 'torch.uint32', 'torch.float32')": { + "kwargs": { + "B1": 128, + "B2": 128, + "BK": 64 + }, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 4, + "num_buffers_warp_spec": 0, + "num_consumer_groups": 0, + "reg_dec_producer": 0, + "reg_inc_consumer": 0, + "maxnreg": null, + "pre_hook": null + }, + "(14, 512, 128, 27, 16, True, 'torch.float16', 'torch.float16', 'torch.uint32', 'torch.uint32', 'torch.uint32', 'torch.float32')": { + "kwargs": { + "B1": 128, + "B2": 128, + "BK": 64 + }, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 4, + "num_buffers_warp_spec": 0, + "num_consumer_groups": 0, + "reg_dec_producer": 0, + "reg_inc_consumer": 0, + "maxnreg": null, + "pre_hook": null + }, + "(14, 512, 128, 27, 32, True, 'torch.float16', 'torch.float16', 'torch.uint32', 'torch.uint32', 'torch.uint32', 'torch.float32')": { + "kwargs": { + "B1": 128, + "B2": 128, + "BK": 64 + }, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 4, + "num_buffers_warp_spec": 0, + "num_consumer_groups": 0, + "reg_dec_producer": 0, + "reg_inc_consumer": 0, + "maxnreg": null, + "pre_hook": null + }, + "(17, 256, 64, 27, 2, True, 'torch.float16', 'torch.float16', 'torch.uint32', 'torch.uint32', 'torch.uint32', 'torch.float32')": { + "kwargs": { + "B1": 64, + "B2": 64, + "BK": 64 + }, + "num_warps": 2, + "num_ctas": 1, + "num_stages": 4, + "num_buffers_warp_spec": 0, + "num_consumer_groups": 0, + "reg_dec_producer": 0, + "reg_inc_consumer": 0, + "maxnreg": null, + "pre_hook": null + }, + "(17, 256, 64, 27, 4, True, 'torch.float16', 'torch.float16', 'torch.uint32', 'torch.uint32', 'torch.uint32', 'torch.float32')": { + "kwargs": { + "B1": 64, + "B2": 256, + "BK": 64 + }, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 4, + "num_buffers_warp_spec": 0, + "num_consumer_groups": 0, + "reg_dec_producer": 0, + "reg_inc_consumer": 0, + "maxnreg": null, + "pre_hook": null + }, + "(17, 256, 64, 27, 8, True, 'torch.float16', 'torch.float16', 'torch.uint32', 'torch.uint32', 'torch.uint32', 'torch.float32')": { + "kwargs": { + "B1": 64, + "B2": 256, + "BK": 64 + }, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 4, + "num_buffers_warp_spec": 0, + "num_consumer_groups": 0, + "reg_dec_producer": 0, + "reg_inc_consumer": 0, + "maxnreg": null, + "pre_hook": null + }, + "(17, 256, 64, 27, 16, True, 'torch.float16', 'torch.float16', 'torch.uint32', 'torch.uint32', 'torch.uint32', 'torch.float32')": { + "kwargs": { + "B1": 64, + "B2": 256, + "BK": 32 + }, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 4, + "num_buffers_warp_spec": 0, + "num_consumer_groups": 0, + "reg_dec_producer": 0, + "reg_inc_consumer": 0, + "maxnreg": null, + "pre_hook": null + }, + "(17, 256, 64, 27, 32, True, 'torch.float16', 'torch.float16', 'torch.uint32', 'torch.uint32', 'torch.uint32', 'torch.float32')": { + "kwargs": { + "B1": 64, + "B2": 256, + "BK": 32 + }, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 4, + "num_buffers_warp_spec": 0, + "num_consumer_groups": 0, + "reg_dec_producer": 0, + "reg_inc_consumer": 0, + "maxnreg": null, + "pre_hook": null + }, + "(17, 256, 64, 27, 64, True, 'torch.float16', 'torch.float16', 'torch.uint32', 'torch.uint32', 'torch.uint32', 'torch.float32')": { + "kwargs": { + "B1": 64, + "B2": 256, + "BK": 32 + }, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 4, + "num_buffers_warp_spec": 0, + "num_consumer_groups": 0, + "reg_dec_producer": 0, + "reg_inc_consumer": 0, + "maxnreg": null, + "pre_hook": null + }, + "(19, 128, 32, 27, 4, True, 'torch.float16', 'torch.float16', 'torch.uint32', 'torch.uint32', 'torch.uint32', 'torch.float32')": { + "kwargs": { + "B1": 64, + "B2": 64, + "BK": 64 + }, + "num_warps": 2, + "num_ctas": 1, + "num_stages": 4, + "num_buffers_warp_spec": 0, + "num_consumer_groups": 0, + "reg_dec_producer": 0, + "reg_inc_consumer": 0, + "maxnreg": null, + "pre_hook": null + }, + "(19, 128, 32, 27, 8, True, 'torch.float16', 'torch.float16', 'torch.uint32', 'torch.uint32', 'torch.uint32', 'torch.float32')": { + "kwargs": { + "B1": 64, + "B2": 64, + "BK": 64 + }, + "num_warps": 2, + "num_ctas": 1, + "num_stages": 4, + "num_buffers_warp_spec": 0, + "num_consumer_groups": 0, + "reg_dec_producer": 0, + "reg_inc_consumer": 0, + "maxnreg": null, + "pre_hook": null + }, + "(19, 128, 32, 27, 16, True, 'torch.float16', 'torch.float16', 'torch.uint32', 'torch.uint32', 'torch.uint32', 'torch.float32')": { + "kwargs": { + "B1": 64, + "B2": 128, + "BK": 32 + }, + "num_warps": 2, + "num_ctas": 1, + "num_stages": 4, + "num_buffers_warp_spec": 0, + "num_consumer_groups": 0, + "reg_dec_producer": 0, + "reg_inc_consumer": 0, + "maxnreg": null, + "pre_hook": null + }, + "(19, 128, 32, 27, 32, True, 'torch.float16', 'torch.float16', 'torch.uint32', 'torch.uint32', 'torch.uint32', 'torch.float32')": { + "kwargs": { + "B1": 64, + "B2": 128, + "BK": 32 + }, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 4, + "num_buffers_warp_spec": 0, + "num_consumer_groups": 0, + "reg_dec_producer": 0, + "reg_inc_consumer": 0, + "maxnreg": null, + "pre_hook": null + }, + "(19, 128, 32, 27, 64, True, 'torch.float16', 'torch.float16', 'torch.uint32', 'torch.uint32', 'torch.uint32', 'torch.float32')": { + "kwargs": { + "B1": 64, + "B2": 128, + "BK": 32 + }, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 4, + "num_buffers_warp_spec": 0, + "num_consumer_groups": 0, + "reg_dec_producer": 0, + "reg_inc_consumer": 0, + "maxnreg": null, + "pre_hook": null + }, + "(19, 128, 32, 27, 128, True, 'torch.float16', 'torch.float16', 'torch.uint32', 'torch.uint32', 'torch.uint32', 'torch.float32')": { + "kwargs": { + "B1": 64, + "B2": 128, + "BK": 32 + }, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 4, + "num_buffers_warp_spec": 0, + "num_consumer_groups": 0, + "reg_dec_producer": 0, + "reg_inc_consumer": 0, + "maxnreg": null, + "pre_hook": null + }, + "(7, 1024, 1024, 27, 2, True, 'torch.float16', 'torch.float16', 'torch.uint32', 'torch.uint32', 'torch.uint32', 'torch.float32')": { + "kwargs": { + "B1": 128, + "B2": 128, + "BK": 64 + }, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 4, + "num_buffers_warp_spec": 0, + "num_consumer_groups": 0, + "reg_dec_producer": 0, + "reg_inc_consumer": 0, + "maxnreg": null, + "pre_hook": null + }, + "(14, 128, 512, 27, 2, True, 'torch.float16', 'torch.float16', 'torch.uint32', 'torch.uint32', 'torch.uint32', 'torch.float32')": { + "kwargs": { + "B1": 128, + "B2": 128, + "BK": 64 + }, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 4, + "num_buffers_warp_spec": 0, + "num_consumer_groups": 0, + "reg_dec_producer": 0, + "reg_inc_consumer": 0, + "maxnreg": null, + "pre_hook": null + }, + "(14, 128, 512, 27, 4, True, 'torch.float16', 'torch.float16', 'torch.uint32', 'torch.uint32', 'torch.uint32', 'torch.float32')": { + "kwargs": { + "B1": 128, + "B2": 128, + "BK": 64 + }, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 4, + "num_buffers_warp_spec": 0, + "num_consumer_groups": 0, + "reg_dec_producer": 0, + "reg_inc_consumer": 0, + "maxnreg": null, + "pre_hook": null + }, + "(14, 128, 512, 27, 8, True, 'torch.float16', 'torch.float16', 'torch.uint32', 'torch.uint32', 'torch.uint32', 'torch.float32')": { + "kwargs": { + "B1": 128, + "B2": 128, + "BK": 64 + }, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 4, + "num_buffers_warp_spec": 0, + "num_consumer_groups": 0, + "reg_dec_producer": 0, + "reg_inc_consumer": 0, + "maxnreg": null, + "pre_hook": null + }, + "(14, 128, 512, 27, 16, True, 'torch.float16', 'torch.float16', 'torch.uint32', 'torch.uint32', 'torch.uint32', 'torch.float32')": { + "kwargs": { + "B1": 128, + "B2": 128, + "BK": 64 + }, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 4, + "num_buffers_warp_spec": 0, + "num_consumer_groups": 0, + "reg_dec_producer": 0, + "reg_inc_consumer": 0, + "maxnreg": null, + "pre_hook": null + }, + "(14, 128, 512, 27, 32, True, 'torch.float16', 'torch.float16', 'torch.uint32', 'torch.uint32', 'torch.uint32', 'torch.float32')": { + "kwargs": { + "B1": 128, + "B2": 128, + "BK": 64 + }, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 4, + "num_buffers_warp_spec": 0, + "num_consumer_groups": 0, + "reg_dec_producer": 0, + "reg_inc_consumer": 0, + "maxnreg": null, + "pre_hook": null + }, + "(14, 128, 128, 27, 4, True, 'torch.float16', 'torch.float16', 'torch.uint32', 'torch.uint32', 'torch.uint32', 'torch.float32')": { + "kwargs": { + "B1": 128, + "B2": 128, + "BK": 64 + }, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 4, + "num_buffers_warp_spec": 0, + "num_consumer_groups": 0, + "reg_dec_producer": 0, + "reg_inc_consumer": 0, + "maxnreg": null, + "pre_hook": null + }, + "(14, 128, 128, 27, 8, True, 'torch.float16', 'torch.float16', 'torch.uint32', 'torch.uint32', 'torch.uint32', 'torch.float32')": { + "kwargs": { + "B1": 128, + "B2": 128, + "BK": 64 + }, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 4, + "num_buffers_warp_spec": 0, + "num_consumer_groups": 0, + "reg_dec_producer": 0, + "reg_inc_consumer": 0, + "maxnreg": null, + "pre_hook": null + }, + "(14, 128, 128, 27, 16, True, 'torch.float16', 'torch.float16', 'torch.uint32', 'torch.uint32', 'torch.uint32', 'torch.float32')": { + "kwargs": { + "B1": 128, + "B2": 128, + "BK": 64 + }, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 4, + "num_buffers_warp_spec": 0, + "num_consumer_groups": 0, + "reg_dec_producer": 0, + "reg_inc_consumer": 0, + "maxnreg": null, + "pre_hook": null + }, + "(14, 128, 128, 27, 32, True, 'torch.float16', 'torch.float16', 'torch.uint32', 'torch.uint32', 'torch.uint32', 'torch.float32')": { + "kwargs": { + "B1": 128, + "B2": 128, + "BK": 64 + }, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 4, + "num_buffers_warp_spec": 0, + "num_consumer_groups": 0, + "reg_dec_producer": 0, + "reg_inc_consumer": 0, + "maxnreg": null, + "pre_hook": null + }, + "(14, 128, 128, 27, 64, True, 'torch.float16', 'torch.float16', 'torch.uint32', 'torch.uint32', 'torch.uint32', 'torch.float32')": { + "kwargs": { + "B1": 128, + "B2": 128, + "BK": 64 + }, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 4, + "num_buffers_warp_spec": 0, + "num_consumer_groups": 0, + "reg_dec_producer": 0, + "reg_inc_consumer": 0, + "maxnreg": null, + "pre_hook": null + }, + "(14, 128, 128, 27, 128, True, 'torch.float16', 'torch.float16', 'torch.uint32', 'torch.uint32', 'torch.uint32', 'torch.float32')": { + "kwargs": { + "B1": 128, + "B2": 128, + "BK": 64 + }, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 4, + "num_buffers_warp_spec": 0, + "num_consumer_groups": 0, + "reg_dec_producer": 0, + "reg_inc_consumer": 0, + "maxnreg": null, + "pre_hook": null + }, + "(14, 128, 32, 27, 4, True, 'torch.float16', 'torch.float16', 'torch.uint32', 'torch.uint32', 'torch.uint32', 'torch.float32')": { + "kwargs": { + "B1": 64, + "B2": 64, + "BK": 64 + }, + "num_warps": 2, + "num_ctas": 1, + "num_stages": 4, + "num_buffers_warp_spec": 0, + "num_consumer_groups": 0, + "reg_dec_producer": 0, + "reg_inc_consumer": 0, + "maxnreg": null, + "pre_hook": null + }, + "(14, 128, 32, 27, 8, True, 'torch.float16', 'torch.float16', 'torch.uint32', 'torch.uint32', 'torch.uint32', 'torch.float32')": { + "kwargs": { + "B1": 64, + "B2": 64, + "BK": 64 + }, + "num_warps": 2, + "num_ctas": 1, + "num_stages": 4, + "num_buffers_warp_spec": 0, + "num_consumer_groups": 0, + "reg_dec_producer": 0, + "reg_inc_consumer": 0, + "maxnreg": null, + "pre_hook": null + }, + "(14, 128, 32, 27, 16, True, 'torch.float16', 'torch.float16', 'torch.uint32', 'torch.uint32', 'torch.uint32', 'torch.float32')": { + "kwargs": { + "B1": 64, + "B2": 128, + "BK": 32 + }, + "num_warps": 2, + "num_ctas": 1, + "num_stages": 4, + "num_buffers_warp_spec": 0, + "num_consumer_groups": 0, + "reg_dec_producer": 0, + "reg_inc_consumer": 0, + "maxnreg": null, + "pre_hook": null + }, + "(14, 128, 32, 27, 32, True, 'torch.float16', 'torch.float16', 'torch.uint32', 'torch.uint32', 'torch.uint32', 'torch.float32')": { + "kwargs": { + "B1": 64, + "B2": 128, + "BK": 32 + }, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 4, + "num_buffers_warp_spec": 0, + "num_consumer_groups": 0, + "reg_dec_producer": 0, + "reg_inc_consumer": 0, + "maxnreg": null, + "pre_hook": null + }, + "(14, 128, 32, 27, 64, True, 'torch.float16', 'torch.float16', 'torch.uint32', 'torch.uint32', 'torch.uint32', 'torch.float32')": { + "kwargs": { + "B1": 64, + "B2": 128, + "BK": 32 + }, + "num_warps": 2, + "num_ctas": 1, + "num_stages": 4, + "num_buffers_warp_spec": 0, + "num_consumer_groups": 0, + "reg_dec_producer": 0, + "reg_inc_consumer": 0, + "maxnreg": null, + "pre_hook": null + }, + "(14, 128, 32, 27, 128, True, 'torch.float16', 'torch.float16', 'torch.uint32', 'torch.uint32', 'torch.uint32', 'torch.float32')": { + "kwargs": { + "B1": 64, + "B2": 128, + "BK": 32 + }, + "num_warps": 2, + "num_ctas": 1, + "num_stages": 4, + "num_buffers_warp_spec": 0, + "num_consumer_groups": 0, + "reg_dec_producer": 0, + "reg_inc_consumer": 0, + "maxnreg": null, + "pre_hook": null + }, + "(16, 64, 64, 27, 4, True, 'torch.float16', 'torch.float16', 'torch.uint32', 'torch.uint32', 'torch.uint32', 'torch.float32')": { + "kwargs": { + "B1": 64, + "B2": 64, + "BK": 64 + }, + "num_warps": 2, + "num_ctas": 1, + "num_stages": 4, + "num_buffers_warp_spec": 0, + "num_consumer_groups": 0, + "reg_dec_producer": 0, + "reg_inc_consumer": 0, + "maxnreg": null, + "pre_hook": null + }, + "(16, 64, 64, 27, 8, True, 'torch.float16', 'torch.float16', 'torch.uint32', 'torch.uint32', 'torch.uint32', 'torch.float32')": { + "kwargs": { + "B1": 64, + "B2": 64, + "BK": 64 + }, + "num_warps": 2, + "num_ctas": 1, + "num_stages": 4, + "num_buffers_warp_spec": 0, + "num_consumer_groups": 0, + "reg_dec_producer": 0, + "reg_inc_consumer": 0, + "maxnreg": null, + "pre_hook": null + }, + "(16, 64, 64, 27, 16, True, 'torch.float16', 'torch.float16', 'torch.uint32', 'torch.uint32', 'torch.uint32', 'torch.float32')": { + "kwargs": { + "B1": 64, + "B2": 64, + "BK": 64 + }, + "num_warps": 2, + "num_ctas": 1, + "num_stages": 4, + "num_buffers_warp_spec": 0, + "num_consumer_groups": 0, + "reg_dec_producer": 0, + "reg_inc_consumer": 0, + "maxnreg": null, + "pre_hook": null + }, + "(16, 64, 64, 27, 32, True, 'torch.float16', 'torch.float16', 'torch.uint32', 'torch.uint32', 'torch.uint32', 'torch.float32')": { + "kwargs": { + "B1": 64, + "B2": 64, + "BK": 64 + }, + "num_warps": 2, + "num_ctas": 1, + "num_stages": 4, + "num_buffers_warp_spec": 0, + "num_consumer_groups": 0, + "reg_dec_producer": 0, + "reg_inc_consumer": 0, + "maxnreg": null, + "pre_hook": null + }, + "(16, 64, 64, 27, 64, True, 'torch.float16', 'torch.float16', 'torch.uint32', 'torch.uint32', 'torch.uint32', 'torch.float32')": { + "kwargs": { + "B1": 64, + "B2": 64, + "BK": 64 + }, + "num_warps": 2, + "num_ctas": 1, + "num_stages": 4, + "num_buffers_warp_spec": 0, + "num_consumer_groups": 0, + "reg_dec_producer": 0, + "reg_inc_consumer": 0, + "maxnreg": null, + "pre_hook": null + }, + "(16, 64, 64, 27, 128, True, 'torch.float16', 'torch.float16', 'torch.uint32', 'torch.uint32', 'torch.uint32', 'torch.float32')": { + "kwargs": { + "B1": 64, + "B2": 64, + "BK": 64 + }, + "num_warps": 2, + "num_ctas": 1, + "num_stages": 4, + "num_buffers_warp_spec": 0, + "num_consumer_groups": 0, + "reg_dec_producer": 0, + "reg_inc_consumer": 0, + "maxnreg": null, + "pre_hook": null + }, + "(16, 64, 16, 27, 4, True, 'torch.float16', 'torch.float16', 'torch.uint32', 'torch.uint32', 'torch.uint32', 'torch.float32')": { + "kwargs": { + "B1": 64, + "B2": 64, + "BK": 64 + }, + "num_warps": 2, + "num_ctas": 1, + "num_stages": 4, + "num_buffers_warp_spec": 0, + "num_consumer_groups": 0, + "reg_dec_producer": 0, + "reg_inc_consumer": 0, + "maxnreg": null, + "pre_hook": null + }, + "(16, 64, 16, 27, 8, True, 'torch.float16', 'torch.float16', 'torch.uint32', 'torch.uint32', 'torch.uint32', 'torch.float32')": { + "kwargs": { + "B1": 64, + "B2": 64, + "BK": 64 + }, + "num_warps": 2, + "num_ctas": 1, + "num_stages": 4, + "num_buffers_warp_spec": 0, + "num_consumer_groups": 0, + "reg_dec_producer": 0, + "reg_inc_consumer": 0, + "maxnreg": null, + "pre_hook": null + }, + "(16, 64, 16, 27, 16, True, 'torch.float16', 'torch.float16', 'torch.uint32', 'torch.uint32', 'torch.uint32', 'torch.float32')": { + "kwargs": { + "B1": 64, + "B2": 64, + "BK": 64 + }, + "num_warps": 2, + "num_ctas": 1, + "num_stages": 4, + "num_buffers_warp_spec": 0, + "num_consumer_groups": 0, + "reg_dec_producer": 0, + "reg_inc_consumer": 0, + "maxnreg": null, + "pre_hook": null + }, + "(16, 64, 16, 27, 32, True, 'torch.float16', 'torch.float16', 'torch.uint32', 'torch.uint32', 'torch.uint32', 'torch.float32')": { + "kwargs": { + "B1": 64, + "B2": 64, + "BK": 64 + }, + "num_warps": 2, + "num_ctas": 1, + "num_stages": 4, + "num_buffers_warp_spec": 0, + "num_consumer_groups": 0, + "reg_dec_producer": 0, + "reg_inc_consumer": 0, + "maxnreg": null, + "pre_hook": null + }, + "(16, 64, 16, 27, 64, True, 'torch.float16', 'torch.float16', 'torch.uint32', 'torch.uint32', 'torch.uint32', 'torch.float32')": { + "kwargs": { + "B1": 64, + "B2": 64, + "BK": 64 + }, + "num_warps": 2, + "num_ctas": 1, + "num_stages": 4, + "num_buffers_warp_spec": 0, + "num_consumer_groups": 0, + "reg_dec_producer": 0, + "reg_inc_consumer": 0, + "maxnreg": null, + "pre_hook": null + }, + "(16, 64, 16, 27, 128, True, 'torch.float16', 'torch.float16', 'torch.uint32', 'torch.uint32', 'torch.uint32', 'torch.float32')": { + "kwargs": { + "B1": 64, + "B2": 64, + "BK": 64 + }, + "num_warps": 2, + "num_ctas": 1, + "num_stages": 4, + "num_buffers_warp_spec": 0, + "num_consumer_groups": 0, + "reg_dec_producer": 0, + "reg_inc_consumer": 0, + "maxnreg": null, + "pre_hook": null + }, + "(22, 64, 64, 27, 4, True, 'torch.float16', 'torch.float16', 'torch.uint32', 'torch.uint32', 'torch.uint32', 'torch.float32')": { + "kwargs": { + "B1": 64, + "B2": 64, + "BK": 64 + }, + "num_warps": 2, + "num_ctas": 1, + "num_stages": 4, + "num_buffers_warp_spec": 0, + "num_consumer_groups": 0, + "reg_dec_producer": 0, + "reg_inc_consumer": 0, + "maxnreg": null, + "pre_hook": null + }, + "(22, 64, 64, 27, 8, True, 'torch.float16', 'torch.float16', 'torch.uint32', 'torch.uint32', 'torch.uint32', 'torch.float32')": { + "kwargs": { + "B1": 64, + "B2": 64, + "BK": 64 + }, + "num_warps": 2, + "num_ctas": 1, + "num_stages": 4, + "num_buffers_warp_spec": 0, + "num_consumer_groups": 0, + "reg_dec_producer": 0, + "reg_inc_consumer": 0, + "maxnreg": null, + "pre_hook": null + }, + "(22, 64, 64, 27, 16, True, 'torch.float16', 'torch.float16', 'torch.uint32', 'torch.uint32', 'torch.uint32', 'torch.float32')": { + "kwargs": { + "B1": 64, + "B2": 64, + "BK": 32 + }, + "num_warps": 2, + "num_ctas": 1, + "num_stages": 4, + "num_buffers_warp_spec": 0, + "num_consumer_groups": 0, + "reg_dec_producer": 0, + "reg_inc_consumer": 0, + "maxnreg": null, + "pre_hook": null + }, + "(22, 64, 64, 27, 32, True, 'torch.float16', 'torch.float16', 'torch.uint32', 'torch.uint32', 'torch.uint32', 'torch.float32')": { + "kwargs": { + "B1": 64, + "B2": 64, + "BK": 64 + }, + "num_warps": 2, + "num_ctas": 1, + "num_stages": 4, + "num_buffers_warp_spec": 0, + "num_consumer_groups": 0, + "reg_dec_producer": 0, + "reg_inc_consumer": 0, + "maxnreg": null, + "pre_hook": null + }, + "(22, 64, 64, 27, 64, True, 'torch.float16', 'torch.float16', 'torch.uint32', 'torch.uint32', 'torch.uint32', 'torch.float32')": { + "kwargs": { + "B1": 64, + "B2": 64, + "BK": 64 + }, + "num_warps": 2, + "num_ctas": 1, + "num_stages": 4, + "num_buffers_warp_spec": 0, + "num_consumer_groups": 0, + "reg_dec_producer": 0, + "reg_inc_consumer": 0, + "maxnreg": null, + "pre_hook": null + }, + "(22, 64, 64, 27, 128, True, 'torch.float16', 'torch.float16', 'torch.uint32', 'torch.uint32', 'torch.uint32', 'torch.float32')": { + "kwargs": { + "B1": 64, + "B2": 64, + "BK": 64 + }, + "num_warps": 2, + "num_ctas": 1, + "num_stages": 4, + "num_buffers_warp_spec": 0, + "num_consumer_groups": 0, + "reg_dec_producer": 0, + "reg_inc_consumer": 0, + "maxnreg": null, + "pre_hook": null + }, + "(22, 64, 16, 27, 4, True, 'torch.float16', 'torch.float16', 'torch.uint32', 'torch.uint32', 'torch.uint32', 'torch.float32')": { + "kwargs": { + "B1": 64, + "B2": 64, + "BK": 64 + }, + "num_warps": 2, + "num_ctas": 1, + "num_stages": 4, + "num_buffers_warp_spec": 0, + "num_consumer_groups": 0, + "reg_dec_producer": 0, + "reg_inc_consumer": 0, + "maxnreg": null, + "pre_hook": null + }, + "(22, 64, 16, 27, 8, True, 'torch.float16', 'torch.float16', 'torch.uint32', 'torch.uint32', 'torch.uint32', 'torch.float32')": { + "kwargs": { + "B1": 64, + "B2": 64, + "BK": 64 + }, + "num_warps": 2, + "num_ctas": 1, + "num_stages": 4, + "num_buffers_warp_spec": 0, + "num_consumer_groups": 0, + "reg_dec_producer": 0, + "reg_inc_consumer": 0, + "maxnreg": null, + "pre_hook": null + }, + "(22, 64, 16, 27, 16, True, 'torch.float16', 'torch.float16', 'torch.uint32', 'torch.uint32', 'torch.uint32', 'torch.float32')": { + "kwargs": { + "B1": 64, + "B2": 64, + "BK": 64 + }, + "num_warps": 2, + "num_ctas": 1, + "num_stages": 4, + "num_buffers_warp_spec": 0, + "num_consumer_groups": 0, + "reg_dec_producer": 0, + "reg_inc_consumer": 0, + "maxnreg": null, + "pre_hook": null + }, + "(22, 64, 16, 27, 32, True, 'torch.float16', 'torch.float16', 'torch.uint32', 'torch.uint32', 'torch.uint32', 'torch.float32')": { + "kwargs": { + "B1": 64, + "B2": 64, + "BK": 64 + }, + "num_warps": 2, + "num_ctas": 1, + "num_stages": 4, + "num_buffers_warp_spec": 0, + "num_consumer_groups": 0, + "reg_dec_producer": 0, + "reg_inc_consumer": 0, + "maxnreg": null, + "pre_hook": null + }, + "(22, 64, 16, 27, 64, True, 'torch.float16', 'torch.float16', 'torch.uint32', 'torch.uint32', 'torch.uint32', 'torch.float32')": { + "kwargs": { + "B1": 64, + "B2": 64, + "BK": 64 + }, + "num_warps": 2, + "num_ctas": 1, + "num_stages": 4, + "num_buffers_warp_spec": 0, + "num_consumer_groups": 0, + "reg_dec_producer": 0, + "reg_inc_consumer": 0, + "maxnreg": null, + "pre_hook": null + }, + "(22, 64, 16, 27, 128, True, 'torch.float16', 'torch.float16', 'torch.uint32', 'torch.uint32', 'torch.uint32', 'torch.float32')": { + "kwargs": { + "B1": 64, + "B2": 64, + "BK": 64 + }, + "num_warps": 2, + "num_ctas": 1, + "num_stages": 4, + "num_buffers_warp_spec": 0, + "num_consumer_groups": 0, + "reg_dec_producer": 0, + "reg_inc_consumer": 0, + "maxnreg": null, + "pre_hook": null + }, + "(13, 1024, 1024, 27, 2, True, 'torch.float16', 'torch.float16', 'torch.uint32', 'torch.uint32', 'torch.uint32', 'torch.float32')": { + "kwargs": { + "B1": 64, + "B2": 128, + "BK": 32 + }, + "num_warps": 2, + "num_ctas": 1, + "num_stages": 4, + "num_buffers_warp_spec": 0, + "num_consumer_groups": 0, + "reg_dec_producer": 0, + "reg_inc_consumer": 0, + "maxnreg": null, + "pre_hook": null + }, + "(15, 512, 512, 27, 2, True, 'torch.float16', 'torch.float16', 'torch.uint32', 'torch.uint32', 'torch.uint32', 'torch.float32')": { + "kwargs": { + "B1": 128, + "B2": 128, + "BK": 64 + }, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 4, + "num_buffers_warp_spec": 0, + "num_consumer_groups": 0, + "reg_dec_producer": 0, + "reg_inc_consumer": 0, + "maxnreg": null, + "pre_hook": null + }, + "(15, 512, 512, 27, 4, True, 'torch.float16', 'torch.float16', 'torch.uint32', 'torch.uint32', 'torch.uint32', 'torch.float32')": { + "kwargs": { + "B1": 128, + "B2": 128, + "BK": 64 + }, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 4, + "num_buffers_warp_spec": 0, + "num_consumer_groups": 0, + "reg_dec_producer": 0, + "reg_inc_consumer": 0, + "maxnreg": null, + "pre_hook": null + }, + "(15, 512, 512, 27, 8, True, 'torch.float16', 'torch.float16', 'torch.uint32', 'torch.uint32', 'torch.uint32', 'torch.float32')": { + "kwargs": { + "B1": 256, + "B2": 128, + "BK": 32 + }, + "num_warps": 8, + "num_ctas": 1, + "num_stages": 4, + "num_buffers_warp_spec": 0, + "num_consumer_groups": 0, + "reg_dec_producer": 0, + "reg_inc_consumer": 0, + "maxnreg": null, + "pre_hook": null + }, + "(17, 256, 256, 27, 8, True, 'torch.float32', 'torch.float32', 'torch.uint32', 'torch.uint32', 'torch.uint32', 'torch.float32')": { + "kwargs": { + "B1": 64, + "B2": 128, + "BK": 32 + }, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 4, + "num_buffers_warp_spec": 0, + "num_consumer_groups": 0, + "reg_dec_producer": 0, + "reg_inc_consumer": 0, + "maxnreg": null, + "pre_hook": null + } + }, + "flex_gemm.kernels.triton.spconv.sparse_submanifold_conv_bwd_masked_implicit_gemm_splitk.sparse_submanifold_conv_bwd_input_masked_implicit_gemm_splitk": { + "(2^18, 64, 64, 27)": { + "SPLITK": 1 + }, + "(2^16, 128, 512, 27)": { + "SPLITK": 1 + }, + "(2^16, 128, 128, 27)": { + "SPLITK": 1 + }, + "(2^14, 256, 1024, 27)": { + "SPLITK": 2 + }, + "(2^14, 256, 256, 27)": { + "SPLITK": 1 + }, + "(2^12, 512, 2048, 27)": { + "SPLITK": 4 + }, + "(2^12, 512, 512, 27)": { + "SPLITK": 4 + }, + "(2^10, 1024, 4096, 27)": { + "SPLITK": 8 + }, + "(2^10, 1024, 1024, 27)": { + "SPLITK": 8 + }, + "(2^12, 512, 128, 27)": { + "SPLITK": 1 + }, + "(2^14, 256, 64, 27)": { + "SPLITK": 1 + }, + "(2^16, 128, 32, 27)": { + "SPLITK": 1 + }, + "(2^18, 64, 16, 27)": { + "SPLITK": 1 + }, + "(2^11, 512, 2048, 27)": { + "SPLITK": 8 + }, + "(2^11, 512, 512, 27)": { + "SPLITK": 4 + }, + "(2^9, 1024, 4096, 27)": { + "SPLITK": 16 + }, + "(2^9, 1024, 1024, 27)": { + "SPLITK": 8 + }, + "(2^11, 512, 128, 27)": { + "SPLITK": 1 + }, + "(2^13, 256, 1024, 27)": { + "SPLITK": 4 + }, + "(2^13, 256, 256, 27)": { + "SPLITK": 1 + }, + "(2^13, 256, 64, 27)": { + "SPLITK": 1 + }, + "(2^17, 64, 64, 27)": { + "SPLITK": 1 + }, + "(2^15, 128, 512, 27)": { + "SPLITK": 1 + }, + "(2^15, 128, 128, 27)": { + "SPLITK": 1 + }, + "(2^15, 128, 32, 27)": { + "SPLITK": 1 + }, + "(2^17, 64, 16, 27)": { + "SPLITK": 1 + }, + "(2^8, 1024, 4096, 27)": { + "SPLITK": 16 + }, + "(2^8, 1024, 1024, 27)": { + "SPLITK": 16 + }, + "(2^19, 64, 64, 27)": { + "SPLITK": 1 + }, + "(2^17, 128, 512, 27)": { + "SPLITK": 1 + }, + "(2^17, 128, 128, 27)": { + "SPLITK": 1 + }, + "(2^15, 256, 1024, 27)": { + "SPLITK": 2 + }, + "(2^15, 256, 256, 27)": { + "SPLITK": 1 + }, + "(2^13, 512, 2048, 27)": { + "SPLITK": 4 + }, + "(2^13, 512, 512, 27)": { + "SPLITK": 1 + }, + "(2^13, 512, 128, 27)": { + "SPLITK": 1 + }, + "(2^15, 256, 64, 27)": { + "SPLITK": 1 + }, + "(2^17, 128, 32, 27)": { + "SPLITK": 1 + }, + "(2^19, 64, 16, 27)": { + "SPLITK": 1 + }, + "(2^11, 1024, 4096, 27)": { + "SPLITK": 16 + }, + "(2^11, 1024, 1024, 27)": { + "SPLITK": 4 + }, + "(2^10, 512, 2048, 27)": { + "SPLITK": 16 + }, + "(2^10, 512, 512, 27)": { + "SPLITK": 4 + }, + "(2^10, 512, 128, 27)": { + "SPLITK": 1 + }, + "(2^20, 64, 64, 27)": { + "SPLITK": 1 + }, + "(2^20, 64, 16, 27)": { + "SPLITK": 1 + }, + "(2^18, 128, 512, 27)": { + "SPLITK": 1 + }, + "(2^18, 128, 128, 27)": { + "SPLITK": 1 + }, + "(2^18, 128, 32, 27)": { + "SPLITK": 1 + }, + "(2^16, 256, 1024, 27)": { + "SPLITK": 1 + }, + "(2^16, 256, 256, 27)": { + "SPLITK": 1 + }, + "(2^16, 256, 64, 27)": { + "SPLITK": 1 + }, + "(2^12, 256, 1024, 27)": { + "SPLITK": 8 + }, + "(2^12, 256, 256, 27)": { + "SPLITK": 1 + }, + "(2^12, 256, 64, 27)": { + "SPLITK": 1 + }, + "(2^21, 64, 64, 27)": { + "SPLITK": 1 + }, + "(2^21, 64, 16, 27)": { + "SPLITK": 1 + }, + "(2^19, 128, 512, 27)": { + "SPLITK": 1 + }, + "(2^19, 128, 128, 27)": { + "SPLITK": 1 + }, + "(2^17, 256, 1024, 27)": { + "SPLITK": 1 + }, + "(2^17, 256, 256, 27)": { + "SPLITK": 1 + }, + "(2^14, 512, 2048, 27)": { + "SPLITK": 4 + }, + "(2^14, 512, 512, 27)": { + "SPLITK": 1 + }, + "(2^14, 512, 128, 27)": { + "SPLITK": 1 + }, + "(2^17, 256, 64, 27)": { + "SPLITK": 1 + }, + "(2^19, 128, 32, 27)": { + "SPLITK": 1 + }, + "(2^7, 1024, 4096, 27)": { + "SPLITK": 32 + }, + "(2^7, 1024, 1024, 27)": { + "SPLITK": 8 + }, + "(2^14, 128, 512, 27)": { + "SPLITK": 4 + }, + "(2^14, 128, 128, 27)": { + "SPLITK": 1 + }, + "(2^14, 128, 32, 27)": { + "SPLITK": 1 + }, + "(2^16, 64, 64, 27)": { + "SPLITK": 1 + }, + "(2^16, 64, 16, 27)": { + "SPLITK": 1 + }, + "(2^22, 64, 64, 27)": { + "SPLITK": 1 + }, + "(2^22, 64, 16, 27)": { + "SPLITK": 1 + }, + "(2^13, 1024, 1024, 27)": { + "SPLITK": 1 + }, + "(2^15, 512, 512, 27)": { + "SPLITK": 1 + } + }, + "flex_gemm.kernels.triton.spconv.sparse_submanifold_conv_bwd_masked_implicit_gemm_splitk.sparse_submanifold_conv_bwd_weight_masked_implicit_gemm_splitk": { + "(2^18, 64, 64, 27)": { + "SPLITK": 128 + }, + "(2^16, 128, 512, 27)": { + "SPLITK": 16 + }, + "(2^16, 128, 128, 27)": { + "SPLITK": 32 + }, + "(2^14, 256, 1024, 27)": { + "SPLITK": 1 + }, + "(2^14, 256, 256, 27)": { + "SPLITK": 4 + }, + "(2^12, 512, 2048, 27)": { + "SPLITK": 1 + }, + "(2^12, 512, 512, 27)": { + "SPLITK": 1 + }, + "(2^10, 1024, 4096, 27)": { + "SPLITK": 1 + }, + "(2^10, 1024, 1024, 27)": { + "SPLITK": 1 + }, + "(2^12, 512, 128, 27)": { + "SPLITK": 1 + }, + "(2^14, 256, 64, 27)": { + "SPLITK": 16 + }, + "(2^16, 128, 32, 27)": { + "SPLITK": 64 + }, + "(2^18, 64, 16, 27)": { + "SPLITK": 128 + }, + "(2^11, 512, 2048, 27)": { + "SPLITK": 1 + }, + "(2^11, 512, 512, 27)": { + "SPLITK": 1 + }, + "(2^9, 1024, 4096, 27)": { + "SPLITK": 1 + }, + "(2^9, 1024, 1024, 27)": { + "SPLITK": 1 + }, + "(2^11, 512, 128, 27)": { + "SPLITK": 1 + }, + "(2^13, 256, 1024, 27)": { + "SPLITK": 1 + }, + "(2^13, 256, 256, 27)": { + "SPLITK": 4 + }, + "(2^13, 256, 64, 27)": { + "SPLITK": 16 + }, + "(2^17, 64, 64, 27)": { + "SPLITK": 64 + }, + "(2^15, 128, 512, 27)": { + "SPLITK": 4 + }, + "(2^15, 128, 128, 27)": { + "SPLITK": 16 + }, + "(2^15, 128, 32, 27)": { + "SPLITK": 32 + }, + "(2^17, 64, 16, 27)": { + "SPLITK": 128 + }, + "(2^8, 1024, 4096, 27)": { + "SPLITK": 1 + }, + "(2^8, 1024, 1024, 27)": { + "SPLITK": 1 + }, + "(2^19, 64, 64, 27)": { + "SPLITK": 128 + }, + "(2^17, 128, 512, 27)": { + "SPLITK": 16 + }, + "(2^17, 128, 128, 27)": { + "SPLITK": 32 + }, + "(2^15, 256, 1024, 27)": { + "SPLITK": 2 + }, + "(2^15, 256, 256, 27)": { + "SPLITK": 8 + }, + "(2^13, 512, 2048, 27)": { + "SPLITK": 1 + }, + "(2^13, 512, 512, 27)": { + "SPLITK": 1 + }, + "(2^13, 512, 128, 27)": { + "SPLITK": 1 + }, + "(2^15, 256, 64, 27)": { + "SPLITK": 8 + }, + "(2^17, 128, 32, 27)": { + "SPLITK": 64 + }, + "(2^19, 64, 16, 27)": { + "SPLITK": 128 + }, + "(2^11, 1024, 4096, 27)": { + "SPLITK": 1 + }, + "(2^11, 1024, 1024, 27)": { + "SPLITK": 1 + }, + "(2^10, 512, 2048, 27)": { + "SPLITK": 1 + }, + "(2^10, 512, 512, 27)": { + "SPLITK": 1 + }, + "(2^10, 512, 128, 27)": { + "SPLITK": 1 + }, + "(2^20, 64, 64, 27)": { + "SPLITK": 128 + }, + "(2^20, 64, 16, 27)": { + "SPLITK": 128 + }, + "(2^18, 128, 512, 27)": { + "SPLITK": 32 + }, + "(2^18, 128, 128, 27)": { + "SPLITK": 64 + }, + "(2^18, 128, 32, 27)": { + "SPLITK": 64 + }, + "(2^16, 256, 1024, 27)": { + "SPLITK": 4 + }, + "(2^16, 256, 256, 27)": { + "SPLITK": 4 + }, + "(2^16, 256, 64, 27)": { + "SPLITK": 8 + }, + "(2^12, 256, 1024, 27)": { + "SPLITK": 1 + }, + "(2^12, 256, 256, 27)": { + "SPLITK": 1 + }, + "(2^12, 256, 64, 27)": { + "SPLITK": 8 + }, + "(2^21, 64, 64, 27)": { + "SPLITK": 128 + }, + "(2^21, 64, 16, 27)": { + "SPLITK": 128 + }, + "(2^19, 128, 512, 27)": { + "SPLITK": 32 + }, + "(2^19, 128, 128, 27)": { + "SPLITK": 128 + }, + "(2^17, 256, 1024, 27)": { + "SPLITK": 8 + }, + "(2^17, 256, 256, 27)": { + "SPLITK": 8 + }, + "(2^14, 512, 2048, 27)": { + "SPLITK": 2 + }, + "(2^14, 512, 512, 27)": { + "SPLITK": 1 + }, + "(2^14, 512, 128, 27)": { + "SPLITK": 4 + }, + "(2^17, 256, 64, 27)": { + "SPLITK": 32 + }, + "(2^19, 128, 32, 27)": { + "SPLITK": 128 + }, + "(2^7, 1024, 4096, 27)": { + "SPLITK": 1 + }, + "(2^7, 1024, 1024, 27)": { + "SPLITK": 1 + }, + "(2^14, 128, 512, 27)": { + "SPLITK": 4 + }, + "(2^14, 128, 128, 27)": { + "SPLITK": 16 + }, + "(2^14, 128, 32, 27)": { + "SPLITK": 32 + }, + "(2^16, 64, 64, 27)": { + "SPLITK": 32 + }, + "(2^16, 64, 16, 27)": { + "SPLITK": 64 + }, + "(2^22, 64, 64, 27)": { + "SPLITK": 8 + }, + "(2^22, 64, 16, 27)": { + "SPLITK": 128 + }, + "(2^13, 1024, 1024, 27)": { + "SPLITK": 1 + }, + "(2^15, 512, 512, 27)": { + "SPLITK": 1 + } + }, + "flex_gemm.kernels.triton.spconv.sparse_submanifold_conv_fwd_implicit_gemm.sparse_submanifold_conv_fwd_implicit_gemm_kernel": { + "(10, 1024, 1024, 27, True, 'torch.float16', 'torch.float16', 'torch.float16', 'torch.uint32', 'torch.float16')": { + "kwargs": { + "B1": 64, + "B2": 256, + "BK": 64 + }, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 4, + "num_buffers_warp_spec": 0, + "num_consumer_groups": 0, + "reg_dec_producer": 0, + "reg_inc_consumer": 0, + "maxnreg": null, + "pre_hook": null + }, + "(12, 1024, 1024, 27, True, 'torch.float16', 'torch.float16', 'torch.float16', 'torch.uint32', 'torch.float16')": { + "kwargs": { + "B1": 64, + "B2": 256, + "BK": 64 + }, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 4, + "num_buffers_warp_spec": 0, + "num_consumer_groups": 0, + "reg_dec_producer": 0, + "reg_inc_consumer": 0, + "maxnreg": null, + "pre_hook": null + }, + "(14, 512, 512, 27, True, 'torch.float16', 'torch.float16', 'torch.float16', 'torch.uint32', 'torch.float16')": { + "kwargs": { + "B1": 128, + "B2": 256, + "BK": 64 + }, + "num_warps": 8, + "num_ctas": 1, + "num_stages": 4, + "num_buffers_warp_spec": 0, + "num_consumer_groups": 0, + "reg_dec_producer": 0, + "reg_inc_consumer": 0, + "maxnreg": null, + "pre_hook": null + }, + "(16, 256, 256, 27, True, 'torch.float16', 'torch.float16', 'torch.float16', 'torch.uint32', 'torch.float16')": { + "kwargs": { + "B1": 128, + "B2": 256, + "BK": 64 + }, + "num_warps": 8, + "num_ctas": 1, + "num_stages": 4, + "num_buffers_warp_spec": 0, + "num_consumer_groups": 0, + "reg_dec_producer": 0, + "reg_inc_consumer": 0, + "maxnreg": null, + "pre_hook": null + }, + "(18, 128, 128, 27, True, 'torch.float16', 'torch.float16', 'torch.float16', 'torch.uint32', 'torch.float16')": { + "kwargs": { + "B1": 128, + "B2": 128, + "BK": 64 + }, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 4, + "num_buffers_warp_spec": 0, + "num_consumer_groups": 0, + "reg_dec_producer": 0, + "reg_inc_consumer": 0, + "maxnreg": null, + "pre_hook": null + }, + "(20, 64, 64, 27, True, 'torch.float16', 'torch.float16', 'torch.float16', 'torch.uint32', 'torch.float16')": { + "kwargs": { + "B1": 256, + "B2": 64, + "BK": 32 + }, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 4, + "num_buffers_warp_spec": 0, + "num_consumer_groups": 0, + "reg_dec_producer": 0, + "reg_inc_consumer": 0, + "maxnreg": null, + "pre_hook": null + }, + "(7, 1024, 1024, 27, True, 'torch.float16', 'torch.float16', 'torch.float16', 'torch.uint32', 'torch.float16')": { + "kwargs": { + "B1": 64, + "B2": 64, + "BK": 64 + }, + "num_warps": 2, + "num_ctas": 1, + "num_stages": 4, + "num_buffers_warp_spec": 0, + "num_consumer_groups": 0, + "reg_dec_producer": 0, + "reg_inc_consumer": 0, + "maxnreg": null, + "pre_hook": null + }, + "(9, 1024, 1024, 27, True, 'torch.float16', 'torch.float16', 'torch.float16', 'torch.uint32', 'torch.float16')": { + "kwargs": { + "B1": 64, + "B2": 256, + "BK": 64 + }, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 4, + "num_buffers_warp_spec": 0, + "num_consumer_groups": 0, + "reg_dec_producer": 0, + "reg_inc_consumer": 0, + "maxnreg": null, + "pre_hook": null + }, + "(11, 1024, 1024, 27, True, 'torch.float16', 'torch.float16', 'torch.float16', 'torch.uint32', 'torch.float16')": { + "kwargs": { + "B1": 64, + "B2": 256, + "BK": 64 + }, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 4, + "num_buffers_warp_spec": 0, + "num_consumer_groups": 0, + "reg_dec_producer": 0, + "reg_inc_consumer": 0, + "maxnreg": null, + "pre_hook": null + }, + "(13, 1024, 1024, 27, True, 'torch.float16', 'torch.float16', 'torch.float16', 'torch.uint32', 'torch.float16')": { + "kwargs": { + "B1": 256, + "B2": 128, + "BK": 64 + }, + "num_warps": 8, + "num_ctas": 1, + "num_stages": 4, + "num_buffers_warp_spec": 0, + "num_consumer_groups": 0, + "reg_dec_producer": 0, + "reg_inc_consumer": 0, + "maxnreg": null, + "pre_hook": null + }, + "(15, 512, 512, 27, True, 'torch.float16', 'torch.float16', 'torch.float16', 'torch.uint32', 'torch.float16')": { + "kwargs": { + "B1": 128, + "B2": 256, + "BK": 64 + }, + "num_warps": 8, + "num_ctas": 1, + "num_stages": 4, + "num_buffers_warp_spec": 0, + "num_consumer_groups": 0, + "reg_dec_producer": 0, + "reg_inc_consumer": 0, + "maxnreg": null, + "pre_hook": null + }, + "(17, 256, 256, 27, True, 'torch.float16', 'torch.float16', 'torch.float16', 'torch.uint32', 'torch.float16')": { + "kwargs": { + "B1": 128, + "B2": 256, + "BK": 64 + }, + "num_warps": 8, + "num_ctas": 1, + "num_stages": 4, + "num_buffers_warp_spec": 0, + "num_consumer_groups": 0, + "reg_dec_producer": 0, + "reg_inc_consumer": 0, + "maxnreg": null, + "pre_hook": null + }, + "(19, 128, 128, 27, True, 'torch.float16', 'torch.float16', 'torch.float16', 'torch.uint32', 'torch.float16')": { + "kwargs": { + "B1": 64, + "B2": 64, + "BK": 64 + }, + "num_warps": 2, + "num_ctas": 1, + "num_stages": 4, + "num_buffers_warp_spec": 0, + "num_consumer_groups": 0, + "reg_dec_producer": 0, + "reg_inc_consumer": 0, + "maxnreg": null, + "pre_hook": null + }, + "(7, 1024, 1024, 27, True, 'torch.float32', 'torch.float32', 'torch.float32', 'torch.uint32', 'torch.float32')": { + "kwargs": { + "B1": 64, + "B2": 64, + "BK": 64 + }, + "num_warps": 2, + "num_ctas": 1, + "num_stages": 4, + "num_buffers_warp_spec": 0, + "num_consumer_groups": 0, + "reg_dec_producer": 0, + "reg_inc_consumer": 0, + "maxnreg": null, + "pre_hook": null + }, + "(9, 1024, 1024, 27, True, 'torch.float32', 'torch.float32', 'torch.float32', 'torch.uint32', 'torch.float32')": { + "kwargs": { + "B1": 64, + "B2": 64, + "BK": 32 + }, + "num_warps": 2, + "num_ctas": 1, + "num_stages": 4, + "num_buffers_warp_spec": 0, + "num_consumer_groups": 0, + "reg_dec_producer": 0, + "reg_inc_consumer": 0, + "maxnreg": null, + "pre_hook": null + }, + "(11, 1024, 1024, 27, True, 'torch.float32', 'torch.float32', 'torch.float32', 'torch.uint32', 'torch.float32')": { + "kwargs": { + "B1": 128, + "B2": 128, + "BK": 32 + }, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 4, + "num_buffers_warp_spec": 0, + "num_consumer_groups": 0, + "reg_dec_producer": 0, + "reg_inc_consumer": 0, + "maxnreg": null, + "pre_hook": null + }, + "(13, 1024, 1024, 27, True, 'torch.float32', 'torch.float32', 'torch.float32', 'torch.uint32', 'torch.float32')": { + "kwargs": { + "B1": 128, + "B2": 128, + "BK": 32 + }, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 4, + "num_buffers_warp_spec": 0, + "num_consumer_groups": 0, + "reg_dec_producer": 0, + "reg_inc_consumer": 0, + "maxnreg": null, + "pre_hook": null + }, + "(15, 512, 512, 27, True, 'torch.float32', 'torch.float32', 'torch.float32', 'torch.uint32', 'torch.float32')": { + "kwargs": { + "B1": 128, + "B2": 128, + "BK": 32 + }, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 4, + "num_buffers_warp_spec": 0, + "num_consumer_groups": 0, + "reg_dec_producer": 0, + "reg_inc_consumer": 0, + "maxnreg": null, + "pre_hook": null + }, + "(17, 256, 256, 27, True, 'torch.float32', 'torch.float32', 'torch.float32', 'torch.uint32', 'torch.float32')": { + "kwargs": { + "B1": 128, + "B2": 128, + "BK": 32 + }, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 4, + "num_buffers_warp_spec": 0, + "num_consumer_groups": 0, + "reg_dec_producer": 0, + "reg_inc_consumer": 0, + "maxnreg": null, + "pre_hook": null + } + }, + "flex_gemm.kernels.triton.spconv.sparse_submanifold_conv_fwd_implicit_gemm_splitk.sparse_submanifold_conv_fwd_implicit_gemm_kernel": { + "(10, 1024, 1024, 27, True, 'torch.float16', 'torch.float16', 'torch.float16', 'torch.uint32', 'torch.float16')": { + "kwargs": { + "B1": 64, + "B2": 256, + "BK": 64 + }, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 4, + "num_buffers_warp_spec": 0, + "num_consumer_groups": 0, + "reg_dec_producer": 0, + "reg_inc_consumer": 0, + "maxnreg": null, + "pre_hook": null + }, + "(12, 1024, 1024, 27, True, 'torch.float16', 'torch.float16', 'torch.float16', 'torch.uint32', 'torch.float16')": { + "kwargs": { + "B1": 64, + "B2": 256, + "BK": 64 + }, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 4, + "num_buffers_warp_spec": 0, + "num_consumer_groups": 0, + "reg_dec_producer": 0, + "reg_inc_consumer": 0, + "maxnreg": null, + "pre_hook": null + }, + "(14, 512, 512, 27, True, 'torch.float16', 'torch.float16', 'torch.float16', 'torch.uint32', 'torch.float16')": { + "kwargs": { + "B1": 128, + "B2": 256, + "BK": 64 + }, + "num_warps": 8, + "num_ctas": 1, + "num_stages": 4, + "num_buffers_warp_spec": 0, + "num_consumer_groups": 0, + "reg_dec_producer": 0, + "reg_inc_consumer": 0, + "maxnreg": null, + "pre_hook": null + }, + "(16, 256, 256, 27, True, 'torch.float16', 'torch.float16', 'torch.float16', 'torch.uint32', 'torch.float16')": { + "kwargs": { + "B1": 128, + "B2": 256, + "BK": 64 + }, + "num_warps": 8, + "num_ctas": 1, + "num_stages": 4, + "num_buffers_warp_spec": 0, + "num_consumer_groups": 0, + "reg_dec_producer": 0, + "reg_inc_consumer": 0, + "maxnreg": null, + "pre_hook": null + }, + "(18, 128, 128, 27, True, 'torch.float16', 'torch.float16', 'torch.float16', 'torch.uint32', 'torch.float16')": { + "kwargs": { + "B1": 128, + "B2": 128, + "BK": 64 + }, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 4, + "num_buffers_warp_spec": 0, + "num_consumer_groups": 0, + "reg_dec_producer": 0, + "reg_inc_consumer": 0, + "maxnreg": null, + "pre_hook": null + }, + "(20, 64, 64, 27, True, 'torch.float16', 'torch.float16', 'torch.float16', 'torch.uint32', 'torch.float16')": { + "kwargs": { + "B1": 256, + "B2": 64, + "BK": 32 + }, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 4, + "num_buffers_warp_spec": 0, + "num_consumer_groups": 0, + "reg_dec_producer": 0, + "reg_inc_consumer": 0, + "maxnreg": null, + "pre_hook": null + }, + "(7, 1024, 1024, 27, True, 'torch.float16', 'torch.float16', 'torch.float16', 'torch.uint32', 'torch.float16')": { + "kwargs": { + "B1": 64, + "B2": 64, + "BK": 64 + }, + "num_warps": 2, + "num_ctas": 1, + "num_stages": 4, + "num_buffers_warp_spec": 0, + "num_consumer_groups": 0, + "reg_dec_producer": 0, + "reg_inc_consumer": 0, + "maxnreg": null, + "pre_hook": null + }, + "(9, 1024, 1024, 27, True, 'torch.float16', 'torch.float16', 'torch.float16', 'torch.uint32', 'torch.float16')": { + "kwargs": { + "B1": 64, + "B2": 256, + "BK": 64 + }, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 4, + "num_buffers_warp_spec": 0, + "num_consumer_groups": 0, + "reg_dec_producer": 0, + "reg_inc_consumer": 0, + "maxnreg": null, + "pre_hook": null + }, + "(11, 1024, 1024, 27, True, 'torch.float16', 'torch.float16', 'torch.float16', 'torch.uint32', 'torch.float16')": { + "kwargs": { + "B1": 64, + "B2": 256, + "BK": 64 + }, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 4, + "num_buffers_warp_spec": 0, + "num_consumer_groups": 0, + "reg_dec_producer": 0, + "reg_inc_consumer": 0, + "maxnreg": null, + "pre_hook": null + }, + "(13, 1024, 1024, 27, True, 'torch.float16', 'torch.float16', 'torch.float16', 'torch.uint32', 'torch.float16')": { + "kwargs": { + "B1": 256, + "B2": 128, + "BK": 64 + }, + "num_warps": 8, + "num_ctas": 1, + "num_stages": 4, + "num_buffers_warp_spec": 0, + "num_consumer_groups": 0, + "reg_dec_producer": 0, + "reg_inc_consumer": 0, + "maxnreg": null, + "pre_hook": null + }, + "(15, 512, 512, 27, True, 'torch.float16', 'torch.float16', 'torch.float16', 'torch.uint32', 'torch.float16')": { + "kwargs": { + "B1": 128, + "B2": 256, + "BK": 64 + }, + "num_warps": 8, + "num_ctas": 1, + "num_stages": 4, + "num_buffers_warp_spec": 0, + "num_consumer_groups": 0, + "reg_dec_producer": 0, + "reg_inc_consumer": 0, + "maxnreg": null, + "pre_hook": null + }, + "(17, 256, 256, 27, True, 'torch.float16', 'torch.float16', 'torch.float16', 'torch.uint32', 'torch.float16')": { + "kwargs": { + "B1": 128, + "B2": 256, + "BK": 64 + }, + "num_warps": 8, + "num_ctas": 1, + "num_stages": 4, + "num_buffers_warp_spec": 0, + "num_consumer_groups": 0, + "reg_dec_producer": 0, + "reg_inc_consumer": 0, + "maxnreg": null, + "pre_hook": null + }, + "(19, 128, 128, 27, True, 'torch.float16', 'torch.float16', 'torch.float16', 'torch.uint32', 'torch.float16')": { + "kwargs": { + "B1": 64, + "B2": 64, + "BK": 64 + }, + "num_warps": 2, + "num_ctas": 1, + "num_stages": 4, + "num_buffers_warp_spec": 0, + "num_consumer_groups": 0, + "reg_dec_producer": 0, + "reg_inc_consumer": 0, + "maxnreg": null, + "pre_hook": null + }, + "(7, 1024, 1024, 27, True, 'torch.float32', 'torch.float32', 'torch.float32', 'torch.uint32', 'torch.float32')": { + "kwargs": { + "B1": 64, + "B2": 64, + "BK": 64 + }, + "num_warps": 2, + "num_ctas": 1, + "num_stages": 4, + "num_buffers_warp_spec": 0, + "num_consumer_groups": 0, + "reg_dec_producer": 0, + "reg_inc_consumer": 0, + "maxnreg": null, + "pre_hook": null + }, + "(9, 1024, 1024, 27, True, 'torch.float32', 'torch.float32', 'torch.float32', 'torch.uint32', 'torch.float32')": { + "kwargs": { + "B1": 64, + "B2": 64, + "BK": 32 + }, + "num_warps": 2, + "num_ctas": 1, + "num_stages": 4, + "num_buffers_warp_spec": 0, + "num_consumer_groups": 0, + "reg_dec_producer": 0, + "reg_inc_consumer": 0, + "maxnreg": null, + "pre_hook": null + }, + "(11, 1024, 1024, 27, True, 'torch.float32', 'torch.float32', 'torch.float32', 'torch.uint32', 'torch.float32')": { + "kwargs": { + "B1": 128, + "B2": 128, + "BK": 32 + }, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 4, + "num_buffers_warp_spec": 0, + "num_consumer_groups": 0, + "reg_dec_producer": 0, + "reg_inc_consumer": 0, + "maxnreg": null, + "pre_hook": null + }, + "(13, 1024, 1024, 27, True, 'torch.float32', 'torch.float32', 'torch.float32', 'torch.uint32', 'torch.float32')": { + "kwargs": { + "B1": 128, + "B2": 128, + "BK": 32 + }, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 4, + "num_buffers_warp_spec": 0, + "num_consumer_groups": 0, + "reg_dec_producer": 0, + "reg_inc_consumer": 0, + "maxnreg": null, + "pre_hook": null + }, + "(15, 512, 512, 27, True, 'torch.float32', 'torch.float32', 'torch.float32', 'torch.uint32', 'torch.float32')": { + "kwargs": { + "B1": 128, + "B2": 128, + "BK": 32 + }, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 4, + "num_buffers_warp_spec": 0, + "num_consumer_groups": 0, + "reg_dec_producer": 0, + "reg_inc_consumer": 0, + "maxnreg": null, + "pre_hook": null + }, + "(17, 256, 256, 27, True, 'torch.float32', 'torch.float32', 'torch.float32', 'torch.uint32', 'torch.float32')": { + "kwargs": { + "B1": 128, + "B2": 128, + "BK": 32 + }, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 4, + "num_buffers_warp_spec": 0, + "num_consumer_groups": 0, + "reg_dec_producer": 0, + "reg_inc_consumer": 0, + "maxnreg": null, + "pre_hook": null + } + }, + "flex_gemm.kernels.triton.spconv.sparse_submanifold_conv_fwd_implicit_gemm_splitk.sparse_submanifold_conv_fwd_implicit_gemm_splitk_kernel": { + "(8, 1024, 1024, 27, 4, True, 'torch.float16', 'torch.float16', 'torch.float16', 'torch.uint32', 'torch.float32')": { + "kwargs": { + "B1": 64, + "B2": 64, + "BK": 64 + }, + "num_warps": 2, + "num_ctas": 1, + "num_stages": 4, + "num_buffers_warp_spec": 0, + "num_consumer_groups": 0, + "reg_dec_producer": 0, + "reg_inc_consumer": 0, + "maxnreg": null, + "pre_hook": null + }, + "(8, 1024, 1024, 27, 8, True, 'torch.float16', 'torch.float16', 'torch.float16', 'torch.uint32', 'torch.float32')": { + "kwargs": { + "B1": 128, + "B2": 128, + "BK": 64 + }, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 4, + "num_buffers_warp_spec": 0, + "num_consumer_groups": 0, + "reg_dec_producer": 0, + "reg_inc_consumer": 0, + "maxnreg": null, + "pre_hook": null + }, + "(8, 1024, 1024, 27, 16, True, 'torch.float16', 'torch.float16', 'torch.float16', 'torch.uint32', 'torch.float32')": { + "kwargs": { + "B1": 128, + "B2": 128, + "BK": 64 + }, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 4, + "num_buffers_warp_spec": 0, + "num_consumer_groups": 0, + "reg_dec_producer": 0, + "reg_inc_consumer": 0, + "maxnreg": null, + "pre_hook": null + }, + "(8, 1024, 1024, 27, 32, True, 'torch.float16', 'torch.float16', 'torch.float16', 'torch.uint32', 'torch.float32')": { + "kwargs": { + "B1": 128, + "B2": 128, + "BK": 64 + }, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 4, + "num_buffers_warp_spec": 0, + "num_consumer_groups": 0, + "reg_dec_producer": 0, + "reg_inc_consumer": 0, + "maxnreg": null, + "pre_hook": null + }, + "(8, 1024, 1024, 27, 64, True, 'torch.float16', 'torch.float16', 'torch.float16', 'torch.uint32', 'torch.float32')": { + "kwargs": { + "B1": 128, + "B2": 128, + "BK": 64 + }, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 4, + "num_buffers_warp_spec": 0, + "num_consumer_groups": 0, + "reg_dec_producer": 0, + "reg_inc_consumer": 0, + "maxnreg": null, + "pre_hook": null + }, + "(8, 1024, 1024, 27, 128, True, 'torch.float16', 'torch.float16', 'torch.float16', 'torch.uint32', 'torch.float32')": { + "kwargs": { + "B1": 128, + "B2": 128, + "BK": 64 + }, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 4, + "num_buffers_warp_spec": 0, + "num_consumer_groups": 0, + "reg_dec_producer": 0, + "reg_inc_consumer": 0, + "maxnreg": null, + "pre_hook": null + }, + "(10, 1024, 1024, 27, 2, True, 'torch.float16', 'torch.float16', 'torch.float16', 'torch.uint32', 'torch.float32')": { + "kwargs": { + "B1": 64, + "B2": 256, + "BK": 64 + }, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 4, + "num_buffers_warp_spec": 0, + "num_consumer_groups": 0, + "reg_dec_producer": 0, + "reg_inc_consumer": 0, + "maxnreg": null, + "pre_hook": null + }, + "(10, 1024, 1024, 27, 4, True, 'torch.float16', 'torch.float16', 'torch.float16', 'torch.uint32', 'torch.float32')": { + "kwargs": { + "B1": 64, + "B2": 256, + "BK": 64 + }, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 4, + "num_buffers_warp_spec": 0, + "num_consumer_groups": 0, + "reg_dec_producer": 0, + "reg_inc_consumer": 0, + "maxnreg": null, + "pre_hook": null + }, + "(10, 1024, 1024, 27, 8, True, 'torch.float16', 'torch.float16', 'torch.float16', 'torch.uint32', 'torch.float32')": { + "kwargs": { + "B1": 64, + "B2": 256, + "BK": 64 + }, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 4, + "num_buffers_warp_spec": 0, + "num_consumer_groups": 0, + "reg_dec_producer": 0, + "reg_inc_consumer": 0, + "maxnreg": null, + "pre_hook": null + }, + "(10, 1024, 1024, 27, 16, True, 'torch.float16', 'torch.float16', 'torch.float16', 'torch.uint32', 'torch.float32')": { + "kwargs": { + "B1": 128, + "B2": 128, + "BK": 64 + }, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 4, + "num_buffers_warp_spec": 0, + "num_consumer_groups": 0, + "reg_dec_producer": 0, + "reg_inc_consumer": 0, + "maxnreg": null, + "pre_hook": null + }, + "(12, 1024, 1024, 27, 2, True, 'torch.float16', 'torch.float16', 'torch.float16', 'torch.uint32', 'torch.float32')": { + "kwargs": { + "B1": 128, + "B2": 128, + "BK": 64 + }, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 4, + "num_buffers_warp_spec": 0, + "num_consumer_groups": 0, + "reg_dec_producer": 0, + "reg_inc_consumer": 0, + "maxnreg": null, + "pre_hook": null + }, + "(12, 1024, 1024, 27, 4, True, 'torch.float16', 'torch.float16', 'torch.float16', 'torch.uint32', 'torch.float32')": { + "kwargs": { + "B1": 128, + "B2": 256, + "BK": 64 + }, + "num_warps": 8, + "num_ctas": 1, + "num_stages": 4, + "num_buffers_warp_spec": 0, + "num_consumer_groups": 0, + "reg_dec_producer": 0, + "reg_inc_consumer": 0, + "maxnreg": null, + "pre_hook": null + }, + "(14, 512, 512, 27, 2, True, 'torch.float16', 'torch.float16', 'torch.float16', 'torch.uint32', 'torch.float32')": { + "kwargs": { + "B1": 128, + "B2": 128, + "BK": 64 + }, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 4, + "num_buffers_warp_spec": 0, + "num_consumer_groups": 0, + "reg_dec_producer": 0, + "reg_inc_consumer": 0, + "maxnreg": null, + "pre_hook": null + }, + "(7, 1024, 1024, 27, 4, True, 'torch.float16', 'torch.float16', 'torch.float16', 'torch.uint32', 'torch.float32')": { + "kwargs": { + "B1": 64, + "B2": 64, + "BK": 64 + }, + "num_warps": 2, + "num_ctas": 1, + "num_stages": 4, + "num_buffers_warp_spec": 0, + "num_consumer_groups": 0, + "reg_dec_producer": 0, + "reg_inc_consumer": 0, + "maxnreg": null, + "pre_hook": null + }, + "(7, 1024, 1024, 27, 8, True, 'torch.float16', 'torch.float16', 'torch.float16', 'torch.uint32', 'torch.float32')": { + "kwargs": { + "B1": 64, + "B2": 64, + "BK": 64 + }, + "num_warps": 2, + "num_ctas": 1, + "num_stages": 4, + "num_buffers_warp_spec": 0, + "num_consumer_groups": 0, + "reg_dec_producer": 0, + "reg_inc_consumer": 0, + "maxnreg": null, + "pre_hook": null + }, + "(7, 1024, 1024, 27, 16, True, 'torch.float16', 'torch.float16', 'torch.float16', 'torch.uint32', 'torch.float32')": { + "kwargs": { + "B1": 64, + "B2": 256, + "BK": 64 + }, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 4, + "num_buffers_warp_spec": 0, + "num_consumer_groups": 0, + "reg_dec_producer": 0, + "reg_inc_consumer": 0, + "maxnreg": null, + "pre_hook": null + }, + "(7, 1024, 1024, 27, 32, True, 'torch.float16', 'torch.float16', 'torch.float16', 'torch.uint32', 'torch.float32')": { + "kwargs": { + "B1": 64, + "B2": 256, + "BK": 64 + }, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 4, + "num_buffers_warp_spec": 0, + "num_consumer_groups": 0, + "reg_dec_producer": 0, + "reg_inc_consumer": 0, + "maxnreg": null, + "pre_hook": null + }, + "(7, 1024, 1024, 27, 64, True, 'torch.float16', 'torch.float16', 'torch.float16', 'torch.uint32', 'torch.float32')": { + "kwargs": { + "B1": 64, + "B2": 256, + "BK": 64 + }, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 4, + "num_buffers_warp_spec": 0, + "num_consumer_groups": 0, + "reg_dec_producer": 0, + "reg_inc_consumer": 0, + "maxnreg": null, + "pre_hook": null + }, + "(7, 1024, 1024, 27, 128, True, 'torch.float16', 'torch.float16', 'torch.float16', 'torch.uint32', 'torch.float32')": { + "kwargs": { + "B1": 64, + "B2": 256, + "BK": 64 + }, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 4, + "num_buffers_warp_spec": 0, + "num_consumer_groups": 0, + "reg_dec_producer": 0, + "reg_inc_consumer": 0, + "maxnreg": null, + "pre_hook": null + }, + "(9, 1024, 1024, 27, 2, True, 'torch.float16', 'torch.float16', 'torch.float16', 'torch.uint32', 'torch.float32')": { + "kwargs": { + "B1": 64, + "B2": 128, + "BK": 32 + }, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 4, + "num_buffers_warp_spec": 0, + "num_consumer_groups": 0, + "reg_dec_producer": 0, + "reg_inc_consumer": 0, + "maxnreg": null, + "pre_hook": null + }, + "(9, 1024, 1024, 27, 4, True, 'torch.float16', 'torch.float16', 'torch.float16', 'torch.uint32', 'torch.float32')": { + "kwargs": { + "B1": 128, + "B2": 128, + "BK": 64 + }, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 4, + "num_buffers_warp_spec": 0, + "num_consumer_groups": 0, + "reg_dec_producer": 0, + "reg_inc_consumer": 0, + "maxnreg": null, + "pre_hook": null + }, + "(9, 1024, 1024, 27, 8, True, 'torch.float16', 'torch.float16', 'torch.float16', 'torch.uint32', 'torch.float32')": { + "kwargs": { + "B1": 128, + "B2": 128, + "BK": 64 + }, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 4, + "num_buffers_warp_spec": 0, + "num_consumer_groups": 0, + "reg_dec_producer": 0, + "reg_inc_consumer": 0, + "maxnreg": null, + "pre_hook": null + }, + "(9, 1024, 1024, 27, 16, True, 'torch.float16', 'torch.float16', 'torch.float16', 'torch.uint32', 'torch.float32')": { + "kwargs": { + "B1": 128, + "B2": 128, + "BK": 64 + }, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 4, + "num_buffers_warp_spec": 0, + "num_consumer_groups": 0, + "reg_dec_producer": 0, + "reg_inc_consumer": 0, + "maxnreg": null, + "pre_hook": null + }, + "(9, 1024, 1024, 27, 32, True, 'torch.float16', 'torch.float16', 'torch.float16', 'torch.uint32', 'torch.float32')": { + "kwargs": { + "B1": 128, + "B2": 128, + "BK": 64 + }, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 4, + "num_buffers_warp_spec": 0, + "num_consumer_groups": 0, + "reg_dec_producer": 0, + "reg_inc_consumer": 0, + "maxnreg": null, + "pre_hook": null + }, + "(11, 1024, 1024, 27, 2, True, 'torch.float16', 'torch.float16', 'torch.float16', 'torch.uint32', 'torch.float32')": { + "kwargs": { + "B1": 64, + "B2": 256, + "BK": 64 + }, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 4, + "num_buffers_warp_spec": 0, + "num_consumer_groups": 0, + "reg_dec_producer": 0, + "reg_inc_consumer": 0, + "maxnreg": null, + "pre_hook": null + }, + "(11, 1024, 1024, 27, 4, True, 'torch.float16', 'torch.float16', 'torch.float16', 'torch.uint32', 'torch.float32')": { + "kwargs": { + "B1": 128, + "B2": 128, + "BK": 64 + }, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 4, + "num_buffers_warp_spec": 0, + "num_consumer_groups": 0, + "reg_dec_producer": 0, + "reg_inc_consumer": 0, + "maxnreg": null, + "pre_hook": null + }, + "(11, 1024, 1024, 27, 8, True, 'torch.float16', 'torch.float16', 'torch.float16', 'torch.uint32', 'torch.float32')": { + "kwargs": { + "B1": 128, + "B2": 128, + "BK": 64 + }, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 4, + "num_buffers_warp_spec": 0, + "num_consumer_groups": 0, + "reg_dec_producer": 0, + "reg_inc_consumer": 0, + "maxnreg": null, + "pre_hook": null + }, + "(13, 1024, 1024, 27, 2, True, 'torch.float16', 'torch.float16', 'torch.float16', 'torch.uint32', 'torch.float32')": { + "kwargs": { + "B1": 128, + "B2": 64, + "BK": 32 + }, + "num_warps": 2, + "num_ctas": 1, + "num_stages": 4, + "num_buffers_warp_spec": 0, + "num_consumer_groups": 0, + "reg_dec_producer": 0, + "reg_inc_consumer": 0, + "maxnreg": null, + "pre_hook": null + }, + "(7, 1024, 1024, 27, 16, True, 'torch.float32', 'torch.float32', 'torch.float32', 'torch.uint32', 'torch.float32')": { + "kwargs": { + "B1": 64, + "B2": 256, + "BK": 32 + }, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 4, + "num_buffers_warp_spec": 0, + "num_consumer_groups": 0, + "reg_dec_producer": 0, + "reg_inc_consumer": 0, + "maxnreg": null, + "pre_hook": null + }, + "(9, 1024, 1024, 27, 8, True, 'torch.float32', 'torch.float32', 'torch.float32', 'torch.uint32', 'torch.float32')": { + "kwargs": { + "B1": 128, + "B2": 128, + "BK": 32 + }, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 4, + "num_buffers_warp_spec": 0, + "num_consumer_groups": 0, + "reg_dec_producer": 0, + "reg_inc_consumer": 0, + "maxnreg": null, + "pre_hook": null + }, + "(11, 1024, 1024, 27, 4, True, 'torch.float32', 'torch.float32', 'torch.float32', 'torch.uint32', 'torch.float32')": { + "kwargs": { + "B1": 128, + "B2": 128, + "BK": 32 + }, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 4, + "num_buffers_warp_spec": 0, + "num_consumer_groups": 0, + "reg_dec_producer": 0, + "reg_inc_consumer": 0, + "maxnreg": null, + "pre_hook": null + } + }, + "flex_gemm.kernels.triton.spconv.sparse_submanifold_conv_fwd_implicit_gemm_splitk.sparse_submanifold_conv_fwd_implicit_gemm_splitk": { + "(2^8, 1024, 1024, 27)": { + "SPLITK": 8 + }, + "(2^10, 1024, 1024, 27)": { + "SPLITK": 2 + }, + "(2^12, 1024, 1024, 27)": { + "SPLITK": 2 + }, + "(2^14, 512, 512, 27)": { + "SPLITK": 1 + }, + "(2^16, 256, 256, 27)": { + "SPLITK": 1 + }, + "(2^18, 128, 128, 27)": { + "SPLITK": 1 + }, + "(2^20, 64, 64, 27)": { + "SPLITK": 1 + }, + "(2^7, 1024, 1024, 27)": { + "SPLITK": 16 + }, + "(2^9, 1024, 1024, 27)": { + "SPLITK": 8 + }, + "(2^11, 1024, 1024, 27)": { + "SPLITK": 4 + }, + "(2^13, 1024, 1024, 27)": { + "SPLITK": 1 + }, + "(2^15, 512, 512, 27)": { + "SPLITK": 1 + }, + "(2^17, 256, 256, 27)": { + "SPLITK": 1 + }, + "(2^19, 128, 128, 27)": { + "SPLITK": 1 + } + }, + "flex_gemm.kernels.triton.spconv.sparse_submanifold_conv_fwd_masked_implicit_gemm.sparse_submanifold_conv_fwd_masked_implicit_gemm_kernel": { + "(18, 64, 64, 27, True, 'torch.float16', 'torch.float16', 'torch.float16', 'torch.uint32', 'torch.int64', 'torch.float16')": { + "kwargs": { + "B1": 256, + "B2": 64, + "BK": 32 + }, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 4, + "num_buffers_warp_spec": 0, + "num_consumer_groups": 0, + "reg_dec_producer": 0, + "reg_inc_consumer": 0, + "maxnreg": null, + "pre_hook": null + }, + "(18, 64, 16, 27, True, 'torch.float16', 'torch.float16', 'torch.float16', 'torch.uint32', 'torch.int64', 'torch.float16')": { + "kwargs": { + "B1": 256, + "B2": 64, + "BK": 32 + }, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 4, + "num_buffers_warp_spec": 0, + "num_consumer_groups": 0, + "reg_dec_producer": 0, + "reg_inc_consumer": 0, + "maxnreg": null, + "pre_hook": null + }, + "(16, 128, 128, 27, True, 'torch.float16', 'torch.float16', 'torch.float16', 'torch.uint32', 'torch.int64', 'torch.float16')": { + "kwargs": { + "B1": 128, + "B2": 128, + "BK": 64 + }, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 4, + "num_buffers_warp_spec": 0, + "num_consumer_groups": 0, + "reg_dec_producer": 0, + "reg_inc_consumer": 0, + "maxnreg": null, + "pre_hook": null + }, + "(16, 128, 32, 27, True, 'torch.float16', 'torch.float16', 'torch.float16', 'torch.uint32', 'torch.int64', 'torch.float16')": { + "kwargs": { + "B1": 64, + "B2": 64, + "BK": 64 + }, + "num_warps": 2, + "num_ctas": 1, + "num_stages": 4, + "num_buffers_warp_spec": 0, + "num_consumer_groups": 0, + "reg_dec_producer": 0, + "reg_inc_consumer": 0, + "maxnreg": null, + "pre_hook": null + }, + "(14, 256, 256, 27, True, 'torch.float16', 'torch.float16', 'torch.float16', 'torch.uint32', 'torch.int64', 'torch.float16')": { + "kwargs": { + "B1": 64, + "B2": 256, + "BK": 64 + }, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 4, + "num_buffers_warp_spec": 0, + "num_consumer_groups": 0, + "reg_dec_producer": 0, + "reg_inc_consumer": 0, + "maxnreg": null, + "pre_hook": null + }, + "(14, 256, 64, 27, True, 'torch.float16', 'torch.float16', 'torch.float16', 'torch.uint32', 'torch.int64', 'torch.float16')": { + "kwargs": { + "B1": 64, + "B2": 64, + "BK": 64 + }, + "num_warps": 2, + "num_ctas": 1, + "num_stages": 4, + "num_buffers_warp_spec": 0, + "num_consumer_groups": 0, + "reg_dec_producer": 0, + "reg_inc_consumer": 0, + "maxnreg": null, + "pre_hook": null + }, + "(12, 512, 512, 27, True, 'torch.float16', 'torch.float16', 'torch.float16', 'torch.uint32', 'torch.int64', 'torch.float16')": { + "kwargs": { + "B1": 64, + "B2": 256, + "BK": 64 + }, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 4, + "num_buffers_warp_spec": 0, + "num_consumer_groups": 0, + "reg_dec_producer": 0, + "reg_inc_consumer": 0, + "maxnreg": null, + "pre_hook": null + }, + "(10, 1024, 1024, 27, True, 'torch.float16', 'torch.float16', 'torch.float16', 'torch.uint32', 'torch.int64', 'torch.float16')": { + "kwargs": { + "B1": 64, + "B2": 64, + "BK": 64 + }, + "num_warps": 2, + "num_ctas": 1, + "num_stages": 4, + "num_buffers_warp_spec": 0, + "num_consumer_groups": 0, + "reg_dec_producer": 0, + "reg_inc_consumer": 0, + "maxnreg": null, + "pre_hook": null + }, + "(10, 1024, 4096, 27, True, 'torch.float16', 'torch.float16', 'torch.float16', 'torch.uint32', 'torch.int64', 'torch.float16')": { + "kwargs": { + "B1": 128, + "B2": 256, + "BK": 64 + }, + "num_warps": 8, + "num_ctas": 1, + "num_stages": 4, + "num_buffers_warp_spec": 0, + "num_consumer_groups": 0, + "reg_dec_producer": 0, + "reg_inc_consumer": 0, + "maxnreg": null, + "pre_hook": null + }, + "(12, 512, 2048, 27, True, 'torch.float16', 'torch.float16', 'torch.float16', 'torch.uint32', 'torch.int64', 'torch.float16')": { + "kwargs": { + "B1": 128, + "B2": 256, + "BK": 64 + }, + "num_warps": 8, + "num_ctas": 1, + "num_stages": 4, + "num_buffers_warp_spec": 0, + "num_consumer_groups": 0, + "reg_dec_producer": 0, + "reg_inc_consumer": 0, + "maxnreg": null, + "pre_hook": null + }, + "(14, 256, 1024, 27, True, 'torch.float16', 'torch.float16', 'torch.float16', 'torch.uint32', 'torch.int64', 'torch.float16')": { + "kwargs": { + "B1": 64, + "B2": 256, + "BK": 64 + }, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 4, + "num_buffers_warp_spec": 0, + "num_consumer_groups": 0, + "reg_dec_producer": 0, + "reg_inc_consumer": 0, + "maxnreg": null, + "pre_hook": null + }, + "(16, 128, 512, 27, True, 'torch.float16', 'torch.float16', 'torch.float16', 'torch.uint32', 'torch.int64', 'torch.float16')": { + "kwargs": { + "B1": 128, + "B2": 128, + "BK": 64 + }, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 4, + "num_buffers_warp_spec": 0, + "num_consumer_groups": 0, + "reg_dec_producer": 0, + "reg_inc_consumer": 0, + "maxnreg": null, + "pre_hook": null + }, + "(19, 64, 64, 27, True, 'torch.float16', 'torch.float16', 'torch.float16', 'torch.uint32', 'torch.int64', 'torch.float16')": { + "kwargs": { + "B1": 256, + "B2": 64, + "BK": 32 + }, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 4, + "num_buffers_warp_spec": 0, + "num_consumer_groups": 0, + "reg_dec_producer": 0, + "reg_inc_consumer": 0, + "maxnreg": null, + "pre_hook": null + }, + "(19, 64, 16, 27, True, 'torch.float16', 'torch.float16', 'torch.float16', 'torch.uint32', 'torch.int64', 'torch.float16')": { + "kwargs": { + "B1": 256, + "B2": 64, + "BK": 32 + }, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 4, + "num_buffers_warp_spec": 0, + "num_consumer_groups": 0, + "reg_dec_producer": 0, + "reg_inc_consumer": 0, + "maxnreg": null, + "pre_hook": null + }, + "(17, 128, 128, 27, True, 'torch.float16', 'torch.float16', 'torch.float16', 'torch.uint32', 'torch.int64', 'torch.float16')": { + "kwargs": { + "B1": 128, + "B2": 128, + "BK": 64 + }, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 4, + "num_buffers_warp_spec": 0, + "num_consumer_groups": 0, + "reg_dec_producer": 0, + "reg_inc_consumer": 0, + "maxnreg": null, + "pre_hook": null + }, + "(17, 128, 32, 27, True, 'torch.float16', 'torch.float16', 'torch.float16', 'torch.uint32', 'torch.int64', 'torch.float16')": { + "kwargs": { + "B1": 256, + "B2": 64, + "BK": 32 + }, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 4, + "num_buffers_warp_spec": 0, + "num_consumer_groups": 0, + "reg_dec_producer": 0, + "reg_inc_consumer": 0, + "maxnreg": null, + "pre_hook": null + }, + "(15, 256, 256, 27, True, 'torch.float16', 'torch.float16', 'torch.float16', 'torch.uint32', 'torch.int64', 'torch.float16')": { + "kwargs": { + "B1": 64, + "B2": 256, + "BK": 64 + }, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 4, + "num_buffers_warp_spec": 0, + "num_consumer_groups": 0, + "reg_dec_producer": 0, + "reg_inc_consumer": 0, + "maxnreg": null, + "pre_hook": null + }, + "(15, 256, 64, 27, True, 'torch.float16', 'torch.float16', 'torch.float16', 'torch.uint32', 'torch.int64', 'torch.float16')": { + "kwargs": { + "B1": 64, + "B2": 64, + "BK": 64 + }, + "num_warps": 2, + "num_ctas": 1, + "num_stages": 4, + "num_buffers_warp_spec": 0, + "num_consumer_groups": 0, + "reg_dec_producer": 0, + "reg_inc_consumer": 0, + "maxnreg": null, + "pre_hook": null + }, + "(13, 512, 512, 27, True, 'torch.float16', 'torch.float16', 'torch.float16', 'torch.uint32', 'torch.int64', 'torch.float16')": { + "kwargs": { + "B1": 128, + "B2": 128, + "BK": 64 + }, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 4, + "num_buffers_warp_spec": 0, + "num_consumer_groups": 0, + "reg_dec_producer": 0, + "reg_inc_consumer": 0, + "maxnreg": null, + "pre_hook": null + }, + "(13, 512, 128, 27, True, 'torch.float16', 'torch.float16', 'torch.float16', 'torch.uint32', 'torch.int64', 'torch.float16')": { + "kwargs": { + "B1": 64, + "B2": 64, + "BK": 64 + }, + "num_warps": 2, + "num_ctas": 1, + "num_stages": 4, + "num_buffers_warp_spec": 0, + "num_consumer_groups": 0, + "reg_dec_producer": 0, + "reg_inc_consumer": 0, + "maxnreg": null, + "pre_hook": null + }, + "(11, 1024, 1024, 27, True, 'torch.float16', 'torch.float16', 'torch.float16', 'torch.uint32', 'torch.int64', 'torch.float16')": { + "kwargs": { + "B1": 64, + "B2": 256, + "BK": 64 + }, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 4, + "num_buffers_warp_spec": 0, + "num_consumer_groups": 0, + "reg_dec_producer": 0, + "reg_inc_consumer": 0, + "maxnreg": null, + "pre_hook": null + }, + "(11, 1024, 4096, 27, True, 'torch.float16', 'torch.float16', 'torch.float16', 'torch.uint32', 'torch.int64', 'torch.float16')": { + "kwargs": { + "B1": 128, + "B2": 256, + "BK": 64 + }, + "num_warps": 8, + "num_ctas": 1, + "num_stages": 4, + "num_buffers_warp_spec": 0, + "num_consumer_groups": 0, + "reg_dec_producer": 0, + "reg_inc_consumer": 0, + "maxnreg": null, + "pre_hook": null + }, + "(13, 512, 2048, 27, True, 'torch.float16', 'torch.float16', 'torch.float16', 'torch.uint32', 'torch.int64', 'torch.float16')": { + "kwargs": { + "B1": 128, + "B2": 256, + "BK": 64 + }, + "num_warps": 8, + "num_ctas": 1, + "num_stages": 4, + "num_buffers_warp_spec": 0, + "num_consumer_groups": 0, + "reg_dec_producer": 0, + "reg_inc_consumer": 0, + "maxnreg": null, + "pre_hook": null + }, + "(15, 256, 1024, 27, True, 'torch.float16', 'torch.float16', 'torch.float16', 'torch.uint32', 'torch.int64', 'torch.float16')": { + "kwargs": { + "B1": 64, + "B2": 256, + "BK": 64 + }, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 4, + "num_buffers_warp_spec": 0, + "num_consumer_groups": 0, + "reg_dec_producer": 0, + "reg_inc_consumer": 0, + "maxnreg": null, + "pre_hook": null + }, + "(17, 128, 512, 27, True, 'torch.float16', 'torch.float16', 'torch.float16', 'torch.uint32', 'torch.int64', 'torch.float16')": { + "kwargs": { + "B1": 64, + "B2": 256, + "BK": 64 + }, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 4, + "num_buffers_warp_spec": 0, + "num_consumer_groups": 0, + "reg_dec_producer": 0, + "reg_inc_consumer": 0, + "maxnreg": null, + "pre_hook": null + }, + "(11, 512, 512, 27, True, 'torch.float16', 'torch.float16', 'torch.float16', 'torch.uint32', 'torch.int64', 'torch.float16')": { + "kwargs": { + "B1": 64, + "B2": 256, + "BK": 64 + }, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 4, + "num_buffers_warp_spec": 0, + "num_consumer_groups": 0, + "reg_dec_producer": 0, + "reg_inc_consumer": 0, + "maxnreg": null, + "pre_hook": null + }, + "(9, 1024, 1024, 27, True, 'torch.float16', 'torch.float16', 'torch.float16', 'torch.uint32', 'torch.int64', 'torch.float16')": { + "kwargs": { + "B1": 64, + "B2": 64, + "BK": 64 + }, + "num_warps": 2, + "num_ctas": 1, + "num_stages": 4, + "num_buffers_warp_spec": 0, + "num_consumer_groups": 0, + "reg_dec_producer": 0, + "reg_inc_consumer": 0, + "maxnreg": null, + "pre_hook": null + }, + "(9, 1024, 4096, 27, True, 'torch.float16', 'torch.float16', 'torch.float16', 'torch.uint32', 'torch.int64', 'torch.float16')": { + "kwargs": { + "B1": 64, + "B2": 256, + "BK": 64 + }, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 4, + "num_buffers_warp_spec": 0, + "num_consumer_groups": 0, + "reg_dec_producer": 0, + "reg_inc_consumer": 0, + "maxnreg": null, + "pre_hook": null + }, + "(11, 512, 2048, 27, True, 'torch.float16', 'torch.float16', 'torch.float16', 'torch.uint32', 'torch.int64', 'torch.float16')": { + "kwargs": { + "B1": 128, + "B2": 128, + "BK": 64 + }, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 4, + "num_buffers_warp_spec": 0, + "num_consumer_groups": 0, + "reg_dec_producer": 0, + "reg_inc_consumer": 0, + "maxnreg": null, + "pre_hook": null + }, + "(13, 256, 256, 27, True, 'torch.float16', 'torch.float16', 'torch.float16', 'torch.uint32', 'torch.int64', 'torch.float16')": { + "kwargs": { + "B1": 64, + "B2": 256, + "BK": 64 + }, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 4, + "num_buffers_warp_spec": 0, + "num_consumer_groups": 0, + "reg_dec_producer": 0, + "reg_inc_consumer": 0, + "maxnreg": null, + "pre_hook": null + }, + "(13, 256, 64, 27, True, 'torch.float16', 'torch.float16', 'torch.float16', 'torch.uint32', 'torch.int64', 'torch.float16')": { + "kwargs": { + "B1": 64, + "B2": 64, + "BK": 64 + }, + "num_warps": 2, + "num_ctas": 1, + "num_stages": 4, + "num_buffers_warp_spec": 0, + "num_consumer_groups": 0, + "reg_dec_producer": 0, + "reg_inc_consumer": 0, + "maxnreg": null, + "pre_hook": null + }, + "(13, 256, 1024, 27, True, 'torch.float16', 'torch.float16', 'torch.float16', 'torch.uint32', 'torch.int64', 'torch.float16')": { + "kwargs": { + "B1": 128, + "B2": 128, + "BK": 64 + }, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 4, + "num_buffers_warp_spec": 0, + "num_consumer_groups": 0, + "reg_dec_producer": 0, + "reg_inc_consumer": 0, + "maxnreg": null, + "pre_hook": null + }, + "(17, 64, 64, 27, True, 'torch.float16', 'torch.float16', 'torch.float16', 'torch.uint32', 'torch.int64', 'torch.float16')": { + "kwargs": { + "B1": 256, + "B2": 64, + "BK": 32 + }, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 4, + "num_buffers_warp_spec": 0, + "num_consumer_groups": 0, + "reg_dec_producer": 0, + "reg_inc_consumer": 0, + "maxnreg": null, + "pre_hook": null + }, + "(17, 64, 16, 27, True, 'torch.float16', 'torch.float16', 'torch.float16', 'torch.uint32', 'torch.int64', 'torch.float16')": { + "kwargs": { + "B1": 256, + "B2": 64, + "BK": 32 + }, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 4, + "num_buffers_warp_spec": 0, + "num_consumer_groups": 0, + "reg_dec_producer": 0, + "reg_inc_consumer": 0, + "maxnreg": null, + "pre_hook": null + }, + "(15, 128, 128, 27, True, 'torch.float16', 'torch.float16', 'torch.float16', 'torch.uint32', 'torch.int64', 'torch.float16')": { + "kwargs": { + "B1": 128, + "B2": 128, + "BK": 64 + }, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 4, + "num_buffers_warp_spec": 0, + "num_consumer_groups": 0, + "reg_dec_producer": 0, + "reg_inc_consumer": 0, + "maxnreg": null, + "pre_hook": null + }, + "(15, 128, 32, 27, True, 'torch.float16', 'torch.float16', 'torch.float16', 'torch.uint32', 'torch.int64', 'torch.float16')": { + "kwargs": { + "B1": 64, + "B2": 64, + "BK": 64 + }, + "num_warps": 2, + "num_ctas": 1, + "num_stages": 4, + "num_buffers_warp_spec": 0, + "num_consumer_groups": 0, + "reg_dec_producer": 0, + "reg_inc_consumer": 0, + "maxnreg": null, + "pre_hook": null + }, + "(15, 128, 512, 27, True, 'torch.float16', 'torch.float16', 'torch.float16', 'torch.uint32', 'torch.int64', 'torch.float16')": { + "kwargs": { + "B1": 64, + "B2": 256, + "BK": 64 + }, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 4, + "num_buffers_warp_spec": 0, + "num_consumer_groups": 0, + "reg_dec_producer": 0, + "reg_inc_consumer": 0, + "maxnreg": null, + "pre_hook": null + }, + "(8, 1024, 4096, 27, True, 'torch.float16', 'torch.float16', 'torch.float16', 'torch.uint32', 'torch.int64', 'torch.float16')": { + "kwargs": { + "B1": 64, + "B2": 256, + "BK": 64 + }, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 4, + "num_buffers_warp_spec": 0, + "num_consumer_groups": 0, + "reg_dec_producer": 0, + "reg_inc_consumer": 0, + "maxnreg": null, + "pre_hook": null + }, + "(10, 512, 512, 27, True, 'torch.float16', 'torch.float16', 'torch.float16', 'torch.uint32', 'torch.int64', 'torch.float16')": { + "kwargs": { + "B1": 64, + "B2": 64, + "BK": 64 + }, + "num_warps": 2, + "num_ctas": 1, + "num_stages": 4, + "num_buffers_warp_spec": 0, + "num_consumer_groups": 0, + "reg_dec_producer": 0, + "reg_inc_consumer": 0, + "maxnreg": null, + "pre_hook": null + }, + "(10, 512, 2048, 27, True, 'torch.float16', 'torch.float16', 'torch.float16', 'torch.uint32', 'torch.int64', 'torch.float16')": { + "kwargs": { + "B1": 64, + "B2": 256, + "BK": 64 + }, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 4, + "num_buffers_warp_spec": 0, + "num_consumer_groups": 0, + "reg_dec_producer": 0, + "reg_inc_consumer": 0, + "maxnreg": null, + "pre_hook": null + }, + "(20, 64, 64, 27, True, 'torch.float16', 'torch.float16', 'torch.float16', 'torch.uint32', 'torch.int64', 'torch.float16')": { + "kwargs": { + "B1": 256, + "B2": 64, + "BK": 32 + }, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 4, + "num_buffers_warp_spec": 0, + "num_consumer_groups": 0, + "reg_dec_producer": 0, + "reg_inc_consumer": 0, + "maxnreg": null, + "pre_hook": null + }, + "(20, 64, 16, 27, True, 'torch.float16', 'torch.float16', 'torch.float16', 'torch.uint32', 'torch.int64', 'torch.float16')": { + "kwargs": { + "B1": 256, + "B2": 64, + "BK": 32 + }, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 4, + "num_buffers_warp_spec": 0, + "num_consumer_groups": 0, + "reg_dec_producer": 0, + "reg_inc_consumer": 0, + "maxnreg": null, + "pre_hook": null + }, + "(18, 128, 128, 27, True, 'torch.float16', 'torch.float16', 'torch.float16', 'torch.uint32', 'torch.int64', 'torch.float16')": { + "kwargs": { + "B1": 128, + "B2": 128, + "BK": 64 + }, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 4, + "num_buffers_warp_spec": 0, + "num_consumer_groups": 0, + "reg_dec_producer": 0, + "reg_inc_consumer": 0, + "maxnreg": null, + "pre_hook": null + }, + "(18, 128, 32, 27, True, 'torch.float16', 'torch.float16', 'torch.float16', 'torch.uint32', 'torch.int64', 'torch.float16')": { + "kwargs": { + "B1": 256, + "B2": 64, + "BK": 32 + }, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 4, + "num_buffers_warp_spec": 0, + "num_consumer_groups": 0, + "reg_dec_producer": 0, + "reg_inc_consumer": 0, + "maxnreg": null, + "pre_hook": null + }, + "(18, 128, 512, 27, True, 'torch.float16', 'torch.float16', 'torch.float16', 'torch.uint32', 'torch.int64', 'torch.float16')": { + "kwargs": { + "B1": 128, + "B2": 256, + "BK": 64 + }, + "num_warps": 8, + "num_ctas": 1, + "num_stages": 4, + "num_buffers_warp_spec": 0, + "num_consumer_groups": 0, + "reg_dec_producer": 0, + "reg_inc_consumer": 0, + "maxnreg": null, + "pre_hook": null + }, + "(16, 256, 256, 27, True, 'torch.float16', 'torch.float16', 'torch.float16', 'torch.uint32', 'torch.int64', 'torch.float16')": { + "kwargs": { + "B1": 128, + "B2": 256, + "BK": 64 + }, + "num_warps": 8, + "num_ctas": 1, + "num_stages": 4, + "num_buffers_warp_spec": 0, + "num_consumer_groups": 0, + "reg_dec_producer": 0, + "reg_inc_consumer": 0, + "maxnreg": null, + "pre_hook": null + }, + "(16, 256, 64, 27, True, 'torch.float16', 'torch.float16', 'torch.float16', 'torch.uint32', 'torch.int64', 'torch.float16')": { + "kwargs": { + "B1": 128, + "B2": 64, + "BK": 32 + }, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 4, + "num_buffers_warp_spec": 0, + "num_consumer_groups": 0, + "reg_dec_producer": 0, + "reg_inc_consumer": 0, + "maxnreg": null, + "pre_hook": null + }, + "(16, 256, 1024, 27, True, 'torch.float16', 'torch.float16', 'torch.float16', 'torch.uint32', 'torch.int64', 'torch.float16')": { + "kwargs": { + "B1": 128, + "B2": 256, + "BK": 64 + }, + "num_warps": 8, + "num_ctas": 1, + "num_stages": 4, + "num_buffers_warp_spec": 0, + "num_consumer_groups": 0, + "reg_dec_producer": 0, + "reg_inc_consumer": 0, + "maxnreg": null, + "pre_hook": null + }, + "(12, 256, 256, 27, True, 'torch.float16', 'torch.float16', 'torch.float16', 'torch.uint32', 'torch.int64', 'torch.float16')": { + "kwargs": { + "B1": 64, + "B2": 256, + "BK": 64 + }, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 4, + "num_buffers_warp_spec": 0, + "num_consumer_groups": 0, + "reg_dec_producer": 0, + "reg_inc_consumer": 0, + "maxnreg": null, + "pre_hook": null + }, + "(12, 256, 64, 27, True, 'torch.float16', 'torch.float16', 'torch.float16', 'torch.uint32', 'torch.int64', 'torch.float16')": { + "kwargs": { + "B1": 64, + "B2": 64, + "BK": 64 + }, + "num_warps": 2, + "num_ctas": 1, + "num_stages": 4, + "num_buffers_warp_spec": 0, + "num_consumer_groups": 0, + "reg_dec_producer": 0, + "reg_inc_consumer": 0, + "maxnreg": null, + "pre_hook": null + }, + "(12, 256, 1024, 27, True, 'torch.float16', 'torch.float16', 'torch.float16', 'torch.uint32', 'torch.int64', 'torch.float16')": { + "kwargs": { + "B1": 64, + "B2": 256, + "BK": 64 + }, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 4, + "num_buffers_warp_spec": 0, + "num_consumer_groups": 0, + "reg_dec_producer": 0, + "reg_inc_consumer": 0, + "maxnreg": null, + "pre_hook": null + }, + "(21, 64, 64, 27, True, 'torch.float16', 'torch.float16', 'torch.float16', 'torch.uint32', 'torch.int64', 'torch.float16')": { + "kwargs": { + "B1": 256, + "B2": 64, + "BK": 32 + }, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 4, + "num_buffers_warp_spec": 0, + "num_consumer_groups": 0, + "reg_dec_producer": 0, + "reg_inc_consumer": 0, + "maxnreg": null, + "pre_hook": null + }, + "(21, 64, 16, 27, True, 'torch.float16', 'torch.float16', 'torch.float16', 'torch.uint32', 'torch.int64', 'torch.float16')": { + "kwargs": { + "B1": 256, + "B2": 64, + "BK": 32 + }, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 4, + "num_buffers_warp_spec": 0, + "num_consumer_groups": 0, + "reg_dec_producer": 0, + "reg_inc_consumer": 0, + "maxnreg": null, + "pre_hook": null + }, + "(19, 128, 128, 27, True, 'torch.float16', 'torch.float16', 'torch.float16', 'torch.uint32', 'torch.int64', 'torch.float16')": { + "kwargs": { + "B1": 128, + "B2": 128, + "BK": 64 + }, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 4, + "num_buffers_warp_spec": 0, + "num_consumer_groups": 0, + "reg_dec_producer": 0, + "reg_inc_consumer": 0, + "maxnreg": null, + "pre_hook": null + }, + "(19, 128, 32, 27, True, 'torch.float16', 'torch.float16', 'torch.float16', 'torch.uint32', 'torch.int64', 'torch.float16')": { + "kwargs": { + "B1": 256, + "B2": 64, + "BK": 32 + }, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 4, + "num_buffers_warp_spec": 0, + "num_consumer_groups": 0, + "reg_dec_producer": 0, + "reg_inc_consumer": 0, + "maxnreg": null, + "pre_hook": null + }, + "(17, 256, 256, 27, True, 'torch.float16', 'torch.float16', 'torch.float16', 'torch.uint32', 'torch.int64', 'torch.float16')": { + "kwargs": { + "B1": 128, + "B2": 256, + "BK": 64 + }, + "num_warps": 8, + "num_ctas": 1, + "num_stages": 4, + "num_buffers_warp_spec": 0, + "num_consumer_groups": 0, + "reg_dec_producer": 0, + "reg_inc_consumer": 0, + "maxnreg": null, + "pre_hook": null + }, + "(17, 256, 64, 27, True, 'torch.float16', 'torch.float16', 'torch.float16', 'torch.uint32', 'torch.int64', 'torch.float16')": { + "kwargs": { + "B1": 256, + "B2": 64, + "BK": 32 + }, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 4, + "num_buffers_warp_spec": 0, + "num_consumer_groups": 0, + "reg_dec_producer": 0, + "reg_inc_consumer": 0, + "maxnreg": null, + "pre_hook": null + }, + "(14, 512, 512, 27, True, 'torch.float16', 'torch.float16', 'torch.float16', 'torch.uint32', 'torch.int64', 'torch.float16')": { + "kwargs": { + "B1": 64, + "B2": 256, + "BK": 64 + }, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 4, + "num_buffers_warp_spec": 0, + "num_consumer_groups": 0, + "reg_dec_producer": 0, + "reg_inc_consumer": 0, + "maxnreg": null, + "pre_hook": null + }, + "(14, 512, 128, 27, True, 'torch.float16', 'torch.float16', 'torch.float16', 'torch.uint32', 'torch.int64', 'torch.float16')": { + "kwargs": { + "B1": 128, + "B2": 128, + "BK": 64 + }, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 4, + "num_buffers_warp_spec": 0, + "num_consumer_groups": 0, + "reg_dec_producer": 0, + "reg_inc_consumer": 0, + "maxnreg": null, + "pre_hook": null + }, + "(14, 512, 2048, 27, True, 'torch.float16', 'torch.float16', 'torch.float16', 'torch.uint32', 'torch.int64', 'torch.float16')": { + "kwargs": { + "B1": 128, + "B2": 256, + "BK": 64 + }, + "num_warps": 8, + "num_ctas": 1, + "num_stages": 4, + "num_buffers_warp_spec": 0, + "num_consumer_groups": 0, + "reg_dec_producer": 0, + "reg_inc_consumer": 0, + "maxnreg": null, + "pre_hook": null + }, + "(17, 256, 1024, 27, True, 'torch.float16', 'torch.float16', 'torch.float16', 'torch.uint32', 'torch.int64', 'torch.float16')": { + "kwargs": { + "B1": 128, + "B2": 256, + "BK": 64 + }, + "num_warps": 8, + "num_ctas": 1, + "num_stages": 4, + "num_buffers_warp_spec": 0, + "num_consumer_groups": 0, + "reg_dec_producer": 0, + "reg_inc_consumer": 0, + "maxnreg": null, + "pre_hook": null + }, + "(19, 128, 512, 27, True, 'torch.float16', 'torch.float16', 'torch.float16', 'torch.uint32', 'torch.int64', 'torch.float16')": { + "kwargs": { + "B1": 128, + "B2": 256, + "BK": 64 + }, + "num_warps": 8, + "num_ctas": 1, + "num_stages": 4, + "num_buffers_warp_spec": 0, + "num_consumer_groups": 0, + "reg_dec_producer": 0, + "reg_inc_consumer": 0, + "maxnreg": null, + "pre_hook": null + }, + "(7, 1024, 4096, 27, True, 'torch.float16', 'torch.float16', 'torch.float16', 'torch.uint32', 'torch.int64', 'torch.float16')": { + "kwargs": { + "B1": 64, + "B2": 64, + "BK": 64 + }, + "num_warps": 2, + "num_ctas": 1, + "num_stages": 4, + "num_buffers_warp_spec": 0, + "num_consumer_groups": 0, + "reg_dec_producer": 0, + "reg_inc_consumer": 0, + "maxnreg": null, + "pre_hook": null + }, + "(14, 128, 128, 27, True, 'torch.float16', 'torch.float16', 'torch.float16', 'torch.uint32', 'torch.int64', 'torch.float16')": { + "kwargs": { + "B1": 128, + "B2": 128, + "BK": 64 + }, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 4, + "num_buffers_warp_spec": 0, + "num_consumer_groups": 0, + "reg_dec_producer": 0, + "reg_inc_consumer": 0, + "maxnreg": null, + "pre_hook": null + }, + "(14, 128, 32, 27, True, 'torch.float16', 'torch.float16', 'torch.float16', 'torch.uint32', 'torch.int64', 'torch.float16')": { + "kwargs": { + "B1": 128, + "B2": 64, + "BK": 32 + }, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 4, + "num_buffers_warp_spec": 0, + "num_consumer_groups": 0, + "reg_dec_producer": 0, + "reg_inc_consumer": 0, + "maxnreg": null, + "pre_hook": null + }, + "(14, 128, 512, 27, True, 'torch.float16', 'torch.float16', 'torch.float16', 'torch.uint32', 'torch.int64', 'torch.float16')": { + "kwargs": { + "B1": 64, + "B2": 256, + "BK": 64 + }, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 4, + "num_buffers_warp_spec": 0, + "num_consumer_groups": 0, + "reg_dec_producer": 0, + "reg_inc_consumer": 0, + "maxnreg": null, + "pre_hook": null + }, + "(16, 64, 64, 27, True, 'torch.float16', 'torch.float16', 'torch.float16', 'torch.uint32', 'torch.int64', 'torch.float16')": { + "kwargs": { + "B1": 256, + "B2": 64, + "BK": 32 + }, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 4, + "num_buffers_warp_spec": 0, + "num_consumer_groups": 0, + "reg_dec_producer": 0, + "reg_inc_consumer": 0, + "maxnreg": null, + "pre_hook": null + }, + "(16, 64, 16, 27, True, 'torch.float16', 'torch.float16', 'torch.float16', 'torch.uint32', 'torch.int64', 'torch.float16')": { + "kwargs": { + "B1": 256, + "B2": 64, + "BK": 32 + }, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 4, + "num_buffers_warp_spec": 0, + "num_consumer_groups": 0, + "reg_dec_producer": 0, + "reg_inc_consumer": 0, + "maxnreg": null, + "pre_hook": null + }, + "(22, 64, 64, 27, True, 'torch.float16', 'torch.float16', 'torch.float16', 'torch.uint32', 'torch.int64', 'torch.float16')": { + "kwargs": { + "B1": 256, + "B2": 64, + "BK": 32 + }, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 4, + "num_buffers_warp_spec": 0, + "num_consumer_groups": 0, + "reg_dec_producer": 0, + "reg_inc_consumer": 0, + "maxnreg": null, + "pre_hook": null + }, + "(22, 64, 16, 27, True, 'torch.float16', 'torch.float16', 'torch.float16', 'torch.uint32', 'torch.int64', 'torch.float16')": { + "kwargs": { + "B1": 256, + "B2": 64, + "BK": 32 + }, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 4, + "num_buffers_warp_spec": 0, + "num_consumer_groups": 0, + "reg_dec_producer": 0, + "reg_inc_consumer": 0, + "maxnreg": null, + "pre_hook": null + }, + "(12, 1024, 1024, 27, True, 'torch.float16', 'torch.float16', 'torch.float16', 'torch.uint32', 'torch.int64', 'torch.float16')": { + "kwargs": { + "B1": 64, + "B2": 256, + "BK": 64 + }, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 4, + "num_buffers_warp_spec": 0, + "num_consumer_groups": 0, + "reg_dec_producer": 0, + "reg_inc_consumer": 0, + "maxnreg": null, + "pre_hook": null + }, + "(12, 1024, 4096, 27, True, 'torch.float16', 'torch.float16', 'torch.float16', 'torch.uint32', 'torch.int64', 'torch.float16')": { + "kwargs": { + "B1": 128, + "B2": 256, + "BK": 64 + }, + "num_warps": 8, + "num_ctas": 1, + "num_stages": 4, + "num_buffers_warp_spec": 0, + "num_consumer_groups": 0, + "reg_dec_producer": 0, + "reg_inc_consumer": 0, + "maxnreg": null, + "pre_hook": null + }, + "(15, 512, 512, 27, True, 'torch.float16', 'torch.float16', 'torch.float16', 'torch.uint32', 'torch.int64', 'torch.float16')": { + "kwargs": { + "B1": 128, + "B2": 256, + "BK": 64 + }, + "num_warps": 8, + "num_ctas": 1, + "num_stages": 4, + "num_buffers_warp_spec": 0, + "num_consumer_groups": 0, + "reg_dec_producer": 0, + "reg_inc_consumer": 0, + "maxnreg": null, + "pre_hook": null + }, + "(15, 512, 128, 27, True, 'torch.float16', 'torch.float16', 'torch.float16', 'torch.uint32', 'torch.int64', 'torch.float16')": { + "kwargs": { + "B1": 128, + "B2": 128, + "BK": 64 + }, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 4, + "num_buffers_warp_spec": 0, + "num_consumer_groups": 0, + "reg_dec_producer": 0, + "reg_inc_consumer": 0, + "maxnreg": null, + "pre_hook": null + }, + "(13, 1024, 1024, 27, True, 'torch.float16', 'torch.float16', 'torch.float16', 'torch.uint32', 'torch.int64', 'torch.float16')": { + "kwargs": { + "B1": 128, + "B2": 256, + "BK": 64 + }, + "num_warps": 8, + "num_ctas": 1, + "num_stages": 4, + "num_buffers_warp_spec": 0, + "num_consumer_groups": 0, + "reg_dec_producer": 0, + "reg_inc_consumer": 0, + "maxnreg": null, + "pre_hook": null + }, + "(13, 1024, 4096, 27, True, 'torch.float16', 'torch.float16', 'torch.float16', 'torch.uint32', 'torch.int64', 'torch.float16')": { + "kwargs": { + "B1": 128, + "B2": 256, + "BK": 64 + }, + "num_warps": 8, + "num_ctas": 1, + "num_stages": 4, + "num_buffers_warp_spec": 0, + "num_consumer_groups": 0, + "reg_dec_producer": 0, + "reg_inc_consumer": 0, + "maxnreg": null, + "pre_hook": null + }, + "(15, 512, 2048, 27, True, 'torch.float16', 'torch.float16', 'torch.float16', 'torch.uint32', 'torch.int64', 'torch.float16')": { + "kwargs": { + "B1": 128, + "B2": 256, + "BK": 64 + }, + "num_warps": 8, + "num_ctas": 1, + "num_stages": 4, + "num_buffers_warp_spec": 0, + "num_consumer_groups": 0, + "reg_dec_producer": 0, + "reg_inc_consumer": 0, + "maxnreg": null, + "pre_hook": null + }, + "(20, 128, 128, 27, True, 'torch.float16', 'torch.float16', 'torch.float16', 'torch.uint32', 'torch.int64', 'torch.float16')": { + "kwargs": { + "B1": 128, + "B2": 128, + "BK": 64 + }, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 4, + "num_buffers_warp_spec": 0, + "num_consumer_groups": 0, + "reg_dec_producer": 0, + "reg_inc_consumer": 0, + "maxnreg": null, + "pre_hook": null + }, + "(20, 128, 32, 27, True, 'torch.float16', 'torch.float16', 'torch.float16', 'torch.uint32', 'torch.int64', 'torch.float16')": { + "kwargs": { + "B1": 256, + "B2": 64, + "BK": 32 + }, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 4, + "num_buffers_warp_spec": 0, + "num_consumer_groups": 0, + "reg_dec_producer": 0, + "reg_inc_consumer": 0, + "maxnreg": null, + "pre_hook": null + }, + "(18, 256, 256, 27, True, 'torch.float16', 'torch.float16', 'torch.float16', 'torch.uint32', 'torch.int64', 'torch.float16')": { + "kwargs": { + "B1": 128, + "B2": 256, + "BK": 64 + }, + "num_warps": 8, + "num_ctas": 1, + "num_stages": 4, + "num_buffers_warp_spec": 0, + "num_consumer_groups": 0, + "reg_dec_producer": 0, + "reg_inc_consumer": 0, + "maxnreg": null, + "pre_hook": null + }, + "(18, 256, 64, 27, True, 'torch.float16', 'torch.float16', 'torch.float16', 'torch.uint32', 'torch.int64', 'torch.float16')": { + "kwargs": { + "B1": 64, + "B2": 64, + "BK": 64 + }, + "num_warps": 2, + "num_ctas": 1, + "num_stages": 4, + "num_buffers_warp_spec": 0, + "num_consumer_groups": 0, + "reg_dec_producer": 0, + "reg_inc_consumer": 0, + "maxnreg": null, + "pre_hook": null + }, + "(18, 256, 1024, 27, True, 'torch.float16', 'torch.float16', 'torch.float16', 'torch.uint32', 'torch.int64', 'torch.float16')": { + "kwargs": { + "B1": 64, + "B2": 256, + "BK": 64 + }, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 4, + "num_buffers_warp_spec": 0, + "num_consumer_groups": 0, + "reg_dec_producer": 0, + "reg_inc_consumer": 0, + "maxnreg": null, + "pre_hook": null + }, + "(20, 128, 512, 27, True, 'torch.float16', 'torch.float16', 'torch.float16', 'torch.uint32', 'torch.int64', 'torch.float16')": { + "kwargs": { + "B1": 128, + "B2": 256, + "BK": 64 + }, + "num_warps": 8, + "num_ctas": 1, + "num_stages": 4, + "num_buffers_warp_spec": 0, + "num_consumer_groups": 0, + "reg_dec_producer": 0, + "reg_inc_consumer": 0, + "maxnreg": null, + "pre_hook": null + }, + "(16, 512, 512, 27, True, 'torch.float16', 'torch.float16', 'torch.float16', 'torch.uint32', 'torch.int64', 'torch.float16')": { + "kwargs": { + "B1": 128, + "B2": 256, + "BK": 64 + }, + "num_warps": 8, + "num_ctas": 1, + "num_stages": 4, + "num_buffers_warp_spec": 0, + "num_consumer_groups": 0, + "reg_dec_producer": 0, + "reg_inc_consumer": 0, + "maxnreg": null, + "pre_hook": null + }, + "(16, 512, 128, 27, True, 'torch.float16', 'torch.float16', 'torch.float16', 'torch.uint32', 'torch.int64', 'torch.float16')": { + "kwargs": { + "B1": 128, + "B2": 128, + "BK": 64 + }, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 4, + "num_buffers_warp_spec": 0, + "num_consumer_groups": 0, + "reg_dec_producer": 0, + "reg_inc_consumer": 0, + "maxnreg": null, + "pre_hook": null + }, + "(14, 1024, 1024, 27, True, 'torch.float16', 'torch.float16', 'torch.float16', 'torch.uint32', 'torch.int64', 'torch.float16')": { + "kwargs": { + "B1": 128, + "B2": 256, + "BK": 64 + }, + "num_warps": 8, + "num_ctas": 1, + "num_stages": 4, + "num_buffers_warp_spec": 0, + "num_consumer_groups": 0, + "reg_dec_producer": 0, + "reg_inc_consumer": 0, + "maxnreg": null, + "pre_hook": null + }, + "(14, 1024, 4096, 27, True, 'torch.float16', 'torch.float16', 'torch.float16', 'torch.uint32', 'torch.int64', 'torch.float16')": { + "kwargs": { + "B1": 128, + "B2": 256, + "BK": 64 + }, + "num_warps": 8, + "num_ctas": 1, + "num_stages": 4, + "num_buffers_warp_spec": 0, + "num_consumer_groups": 0, + "reg_dec_producer": 0, + "reg_inc_consumer": 0, + "maxnreg": null, + "pre_hook": null + }, + "(16, 512, 2048, 27, True, 'torch.float16', 'torch.float16', 'torch.float16', 'torch.uint32', 'torch.int64', 'torch.float16')": { + "kwargs": { + "B1": 128, + "B2": 256, + "BK": 64 + }, + "num_warps": 8, + "num_ctas": 1, + "num_stages": 4, + "num_buffers_warp_spec": 0, + "num_consumer_groups": 0, + "reg_dec_producer": 0, + "reg_inc_consumer": 0, + "maxnreg": null, + "pre_hook": null + }, + "(23, 64, 64, 27, True, 'torch.float16', 'torch.float16', 'torch.float16', 'torch.uint32', 'torch.int64', 'torch.float16')": { + "kwargs": { + "B1": 64, + "B2": 64, + "BK": 64 + }, + "num_warps": 2, + "num_ctas": 1, + "num_stages": 4, + "num_buffers_warp_spec": 0, + "num_consumer_groups": 0, + "reg_dec_producer": 0, + "reg_inc_consumer": 0, + "maxnreg": null, + "pre_hook": null + }, + "(23, 64, 16, 27, True, 'torch.float16', 'torch.float16', 'torch.float16', 'torch.uint32', 'torch.int64', 'torch.float16')": { + "kwargs": { + "B1": 256, + "B2": 64, + "BK": 32 + }, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 4, + "num_buffers_warp_spec": 0, + "num_consumer_groups": 0, + "reg_dec_producer": 0, + "reg_inc_consumer": 0, + "maxnreg": null, + "pre_hook": null + }, + "(21, 128, 128, 27, True, 'torch.float16', 'torch.float16', 'torch.float16', 'torch.uint32', 'torch.int64', 'torch.float16')": { + "kwargs": { + "B1": 128, + "B2": 128, + "BK": 64 + }, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 4, + "num_buffers_warp_spec": 0, + "num_consumer_groups": 0, + "reg_dec_producer": 0, + "reg_inc_consumer": 0, + "maxnreg": null, + "pre_hook": null + }, + "(21, 128, 32, 27, True, 'torch.float16', 'torch.float16', 'torch.float16', 'torch.uint32', 'torch.int64', 'torch.float16')": { + "kwargs": { + "B1": 64, + "B2": 64, + "BK": 64 + }, + "num_warps": 2, + "num_ctas": 1, + "num_stages": 4, + "num_buffers_warp_spec": 0, + "num_consumer_groups": 0, + "reg_dec_producer": 0, + "reg_inc_consumer": 0, + "maxnreg": null, + "pre_hook": null + }, + "(19, 256, 256, 27, True, 'torch.float16', 'torch.float16', 'torch.float16', 'torch.uint32', 'torch.int64', 'torch.float16')": { + "kwargs": { + "B1": 64, + "B2": 256, + "BK": 64 + }, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 4, + "num_buffers_warp_spec": 0, + "num_consumer_groups": 0, + "reg_dec_producer": 0, + "reg_inc_consumer": 0, + "maxnreg": null, + "pre_hook": null + }, + "(19, 256, 64, 27, True, 'torch.float16', 'torch.float16', 'torch.float16', 'torch.uint32', 'torch.int64', 'torch.float16')": { + "kwargs": { + "B1": 64, + "B2": 64, + "BK": 64 + }, + "num_warps": 2, + "num_ctas": 1, + "num_stages": 4, + "num_buffers_warp_spec": 0, + "num_consumer_groups": 0, + "reg_dec_producer": 0, + "reg_inc_consumer": 0, + "maxnreg": null, + "pre_hook": null + }, + "(17, 512, 512, 27, True, 'torch.float16', 'torch.float16', 'torch.float16', 'torch.uint32', 'torch.int64', 'torch.float16')": { + "kwargs": { + "B1": 128, + "B2": 256, + "BK": 64 + }, + "num_warps": 8, + "num_ctas": 1, + "num_stages": 4, + "num_buffers_warp_spec": 0, + "num_consumer_groups": 0, + "reg_dec_producer": 0, + "reg_inc_consumer": 0, + "maxnreg": null, + "pre_hook": null + }, + "(17, 512, 128, 27, True, 'torch.float16', 'torch.float16', 'torch.float16', 'torch.uint32', 'torch.int64', 'torch.float16')": { + "kwargs": { + "B1": 128, + "B2": 128, + "BK": 64 + }, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 4, + "num_buffers_warp_spec": 0, + "num_consumer_groups": 0, + "reg_dec_producer": 0, + "reg_inc_consumer": 0, + "maxnreg": null, + "pre_hook": null + }, + "(15, 1024, 1024, 27, True, 'torch.float16', 'torch.float16', 'torch.float16', 'torch.uint32', 'torch.int64', 'torch.float16')": { + "kwargs": { + "B1": 128, + "B2": 256, + "BK": 64 + }, + "num_warps": 8, + "num_ctas": 1, + "num_stages": 4, + "num_buffers_warp_spec": 0, + "num_consumer_groups": 0, + "reg_dec_producer": 0, + "reg_inc_consumer": 0, + "maxnreg": null, + "pre_hook": null + }, + "(15, 1024, 4096, 27, True, 'torch.float16', 'torch.float16', 'torch.float16', 'torch.uint32', 'torch.int64', 'torch.float16')": { + "kwargs": { + "B1": 128, + "B2": 256, + "BK": 64 + }, + "num_warps": 8, + "num_ctas": 1, + "num_stages": 4, + "num_buffers_warp_spec": 0, + "num_consumer_groups": 0, + "reg_dec_producer": 0, + "reg_inc_consumer": 0, + "maxnreg": null, + "pre_hook": null + }, + "(17, 512, 2048, 27, True, 'torch.float16', 'torch.float16', 'torch.float16', 'torch.uint32', 'torch.int64', 'torch.float16')": { + "kwargs": { + "B1": 128, + "B2": 256, + "BK": 64 + }, + "num_warps": 8, + "num_ctas": 1, + "num_stages": 4, + "num_buffers_warp_spec": 0, + "num_consumer_groups": 0, + "reg_dec_producer": 0, + "reg_inc_consumer": 0, + "maxnreg": null, + "pre_hook": null + }, + "(19, 256, 1024, 27, True, 'torch.float16', 'torch.float16', 'torch.float16', 'torch.uint32', 'torch.int64', 'torch.float16')": { + "kwargs": { + "B1": 128, + "B2": 256, + "BK": 64 + }, + "num_warps": 8, + "num_ctas": 1, + "num_stages": 4, + "num_buffers_warp_spec": 0, + "num_consumer_groups": 0, + "reg_dec_producer": 0, + "reg_inc_consumer": 0, + "maxnreg": null, + "pre_hook": null + }, + "(21, 128, 512, 27, True, 'torch.float16', 'torch.float16', 'torch.float16', 'torch.uint32', 'torch.int64', 'torch.float16')": { + "kwargs": { + "B1": 128, + "B2": 256, + "BK": 64 + }, + "num_warps": 8, + "num_ctas": 1, + "num_stages": 4, + "num_buffers_warp_spec": 0, + "num_consumer_groups": 0, + "reg_dec_producer": 0, + "reg_inc_consumer": 0, + "maxnreg": null, + "pre_hook": null + }, + "(11, 256, 256, 27, True, 'torch.float16', 'torch.float16', 'torch.float16', 'torch.uint32', 'torch.int64', 'torch.float16')": { + "kwargs": { + "B1": 64, + "B2": 64, + "BK": 64 + }, + "num_warps": 2, + "num_ctas": 1, + "num_stages": 4, + "num_buffers_warp_spec": 0, + "num_consumer_groups": 0, + "reg_dec_producer": 0, + "reg_inc_consumer": 0, + "maxnreg": null, + "pre_hook": null + }, + "(14, 64, 64, 27, True, 'torch.float16', 'torch.float16', 'torch.float16', 'torch.uint32', 'torch.int64', 'torch.float16')": { + "kwargs": { + "B1": 64, + "B2": 64, + "BK": 64 + }, + "num_warps": 2, + "num_ctas": 1, + "num_stages": 4, + "num_buffers_warp_spec": 0, + "num_consumer_groups": 0, + "reg_dec_producer": 0, + "reg_inc_consumer": 0, + "maxnreg": null, + "pre_hook": null + }, + "(14, 64, 16, 27, True, 'torch.float16', 'torch.float16', 'torch.float16', 'torch.uint32', 'torch.int64', 'torch.float16')": { + "kwargs": { + "B1": 128, + "B2": 64, + "BK": 32 + }, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 4, + "num_buffers_warp_spec": 0, + "num_consumer_groups": 0, + "reg_dec_producer": 0, + "reg_inc_consumer": 0, + "maxnreg": null, + "pre_hook": null + }, + "(15, 64, 64, 27, True, 'torch.float16', 'torch.float16', 'torch.float16', 'torch.uint32', 'torch.int64', 'torch.float16')": { + "kwargs": { + "B1": 128, + "B2": 64, + "BK": 32 + }, + "num_warps": 2, + "num_ctas": 1, + "num_stages": 4, + "num_buffers_warp_spec": 0, + "num_consumer_groups": 0, + "reg_dec_producer": 0, + "reg_inc_consumer": 0, + "maxnreg": null, + "pre_hook": null + }, + "(15, 64, 16, 27, True, 'torch.float16', 'torch.float16', 'torch.float16', 'torch.uint32', 'torch.int64', 'torch.float16')": { + "kwargs": { + "B1": 256, + "B2": 64, + "BK": 32 + }, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 4, + "num_buffers_warp_spec": 0, + "num_consumer_groups": 0, + "reg_dec_producer": 0, + "reg_inc_consumer": 0, + "maxnreg": null, + "pre_hook": null + }, + "(13, 128, 128, 27, True, 'torch.float16', 'torch.float16', 'torch.float16', 'torch.uint32', 'torch.int64', 'torch.float16')": { + "kwargs": { + "B1": 128, + "B2": 128, + "BK": 64 + }, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 4, + "num_buffers_warp_spec": 0, + "num_consumer_groups": 0, + "reg_dec_producer": 0, + "reg_inc_consumer": 0, + "maxnreg": null, + "pre_hook": null + }, + "(13, 128, 32, 27, True, 'torch.float16', 'torch.float16', 'torch.float16', 'torch.uint32', 'torch.int64', 'torch.float16')": { + "kwargs": { + "B1": 64, + "B2": 64, + "BK": 64 + }, + "num_warps": 2, + "num_ctas": 1, + "num_stages": 4, + "num_buffers_warp_spec": 0, + "num_consumer_groups": 0, + "reg_dec_producer": 0, + "reg_inc_consumer": 0, + "maxnreg": null, + "pre_hook": null + }, + "(13, 64, 64, 27, True, 'torch.float16', 'torch.float16', 'torch.float16', 'torch.uint32', 'torch.int64', 'torch.float16')": { + "kwargs": { + "B1": 64, + "B2": 64, + "BK": 64 + }, + "num_warps": 2, + "num_ctas": 1, + "num_stages": 4, + "num_buffers_warp_spec": 0, + "num_consumer_groups": 0, + "reg_dec_producer": 0, + "reg_inc_consumer": 0, + "maxnreg": null, + "pre_hook": null + }, + "(13, 64, 16, 27, True, 'torch.float16', 'torch.float16', 'torch.float16', 'torch.uint32', 'torch.int64', 'torch.float16')": { + "kwargs": { + "B1": 64, + "B2": 64, + "BK": 64 + }, + "num_warps": 2, + "num_ctas": 1, + "num_stages": 4, + "num_buffers_warp_spec": 0, + "num_consumer_groups": 0, + "reg_dec_producer": 0, + "reg_inc_consumer": 0, + "maxnreg": null, + "pre_hook": null + }, + "(7, 1024, 1024, 27, True, 'torch.float16', 'torch.float16', 'torch.float16', 'torch.uint32', 'torch.int64', 'torch.float16')": { + "kwargs": { + "B1": 64, + "B2": 64, + "BK": 64 + }, + "num_warps": 2, + "num_ctas": 1, + "num_stages": 4, + "num_buffers_warp_spec": 0, + "num_consumer_groups": 0, + "reg_dec_producer": 0, + "reg_inc_consumer": 0, + "maxnreg": null, + "pre_hook": null + }, + "(7, 1024, 1024, 27, True, 'torch.float32', 'torch.float32', 'torch.float32', 'torch.uint32', 'torch.int64', 'torch.float32')": { + "kwargs": { + "B1": 64, + "B2": 64, + "BK": 64 + }, + "num_warps": 2, + "num_ctas": 1, + "num_stages": 4, + "num_buffers_warp_spec": 0, + "num_consumer_groups": 0, + "reg_dec_producer": 0, + "reg_inc_consumer": 0, + "maxnreg": null, + "pre_hook": null + }, + "(9, 1024, 1024, 27, True, 'torch.float32', 'torch.float32', 'torch.float32', 'torch.uint32', 'torch.int64', 'torch.float32')": { + "kwargs": { + "B1": 64, + "B2": 128, + "BK": 32 + }, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 4, + "num_buffers_warp_spec": 0, + "num_consumer_groups": 0, + "reg_dec_producer": 0, + "reg_inc_consumer": 0, + "maxnreg": null, + "pre_hook": null + }, + "(11, 1024, 1024, 27, True, 'torch.float32', 'torch.float32', 'torch.float32', 'torch.uint32', 'torch.int64', 'torch.float32')": { + "kwargs": { + "B1": 128, + "B2": 256, + "BK": 32 + }, + "num_warps": 8, + "num_ctas": 1, + "num_stages": 4, + "num_buffers_warp_spec": 0, + "num_consumer_groups": 0, + "reg_dec_producer": 0, + "reg_inc_consumer": 0, + "maxnreg": null, + "pre_hook": null + }, + "(13, 1024, 1024, 27, True, 'torch.float32', 'torch.float32', 'torch.float32', 'torch.uint32', 'torch.int64', 'torch.float32')": { + "kwargs": { + "B1": 128, + "B2": 256, + "BK": 32 + }, + "num_warps": 8, + "num_ctas": 1, + "num_stages": 4, + "num_buffers_warp_spec": 0, + "num_consumer_groups": 0, + "reg_dec_producer": 0, + "reg_inc_consumer": 0, + "maxnreg": null, + "pre_hook": null + }, + "(15, 512, 512, 27, True, 'torch.float32', 'torch.float32', 'torch.float32', 'torch.uint32', 'torch.int64', 'torch.float32')": { + "kwargs": { + "B1": 128, + "B2": 256, + "BK": 32 + }, + "num_warps": 8, + "num_ctas": 1, + "num_stages": 4, + "num_buffers_warp_spec": 0, + "num_consumer_groups": 0, + "reg_dec_producer": 0, + "reg_inc_consumer": 0, + "maxnreg": null, + "pre_hook": null + }, + "(17, 256, 256, 27, True, 'torch.float32', 'torch.float32', 'torch.float32', 'torch.uint32', 'torch.int64', 'torch.float32')": { + "kwargs": { + "B1": 128, + "B2": 256, + "BK": 32 + }, + "num_warps": 8, + "num_ctas": 1, + "num_stages": 4, + "num_buffers_warp_spec": 0, + "num_consumer_groups": 0, + "reg_dec_producer": 0, + "reg_inc_consumer": 0, + "maxnreg": null, + "pre_hook": null + }, + "(19, 128, 128, 27, True, 'torch.float32', 'torch.float32', 'torch.float32', 'torch.uint32', 'torch.int64', 'torch.float32')": { + "kwargs": { + "B1": 128, + "B2": 64, + "BK": 32 + }, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 4, + "num_buffers_warp_spec": 0, + "num_consumer_groups": 0, + "reg_dec_producer": 0, + "reg_inc_consumer": 0, + "maxnreg": null, + "pre_hook": null + }, + "(18, 512, 512, 27, True, 'torch.float16', 'torch.float16', 'torch.float16', 'torch.uint32', 'torch.int64', 'torch.float16')": { + "kwargs": { + "B1": 128, + "B2": 256, + "BK": 64 + }, + "num_warps": 8, + "num_ctas": 1, + "num_stages": 4, + "num_buffers_warp_spec": 0, + "num_consumer_groups": 0, + "reg_dec_producer": 0, + "reg_inc_consumer": 0, + "maxnreg": null, + "pre_hook": null + }, + "(18, 512, 2048, 27, True, 'torch.float16', 'torch.float16', 'torch.float16', 'torch.uint32', 'torch.int64', 'torch.float16')": { + "kwargs": { + "B1": 64, + "B2": 256, + "BK": 64 + }, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 4, + "num_buffers_warp_spec": 0, + "num_consumer_groups": 0, + "reg_dec_producer": 0, + "reg_inc_consumer": 0, + "maxnreg": null, + "pre_hook": null + }, + "(20, 256, 256, 27, True, 'torch.float16', 'torch.float16', 'torch.float16', 'torch.uint32', 'torch.int64', 'torch.float16')": { + "kwargs": { + "B1": 64, + "B2": 128, + "BK": 32 + }, + "num_warps": 2, + "num_ctas": 1, + "num_stages": 4, + "num_buffers_warp_spec": 0, + "num_consumer_groups": 0, + "reg_dec_producer": 0, + "reg_inc_consumer": 0, + "maxnreg": null, + "pre_hook": null + }, + "(20, 256, 1024, 27, True, 'torch.float16', 'torch.float16', 'torch.float16', 'torch.uint32', 'torch.int64', 'torch.float16')": { + "kwargs": { + "B1": 128, + "B2": 256, + "BK": 64 + }, + "num_warps": 8, + "num_ctas": 1, + "num_stages": 4, + "num_buffers_warp_spec": 0, + "num_consumer_groups": 0, + "reg_dec_producer": 0, + "reg_inc_consumer": 0, + "maxnreg": null, + "pre_hook": null + }, + "(22, 128, 128, 27, True, 'torch.float16', 'torch.float16', 'torch.float16', 'torch.uint32', 'torch.int64', 'torch.float16')": { + "kwargs": { + "B1": 128, + "B2": 128, + "BK": 64 + }, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 4, + "num_buffers_warp_spec": 0, + "num_consumer_groups": 0, + "reg_dec_producer": 0, + "reg_inc_consumer": 0, + "maxnreg": null, + "pre_hook": null + }, + "(22, 128, 512, 27, True, 'torch.float16', 'torch.float16', 'torch.float16', 'torch.uint32', 'torch.int64', 'torch.float16')": { + "kwargs": { + "B1": 64, + "B2": 256, + "BK": 32 + }, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 4, + "num_buffers_warp_spec": 0, + "num_consumer_groups": 0, + "reg_dec_producer": 0, + "reg_inc_consumer": 0, + "maxnreg": null, + "pre_hook": null + }, + "(24, 64, 64, 27, True, 'torch.float16', 'torch.float16', 'torch.float16', 'torch.uint32', 'torch.int64', 'torch.float16')": { + "kwargs": { + "B1": 128, + "B2": 64, + "BK": 32 + }, + "num_warps": 2, + "num_ctas": 1, + "num_stages": 4, + "num_buffers_warp_spec": 0, + "num_consumer_groups": 0, + "reg_dec_producer": 0, + "reg_inc_consumer": 0, + "maxnreg": null, + "pre_hook": null + } + }, + "flex_gemm.kernels.triton.spconv.sparse_submanifold_conv_fwd_masked_implicit_gemm_splitk.sparse_submanifold_conv_fwd_masked_implicit_gemm_kernel": { + "(18, 64, 64, 27, True, 'torch.float16', 'torch.float16', 'torch.float16', 'torch.uint32', 'torch.int64', 'torch.float16')": { + "kwargs": { + "B1": 256, + "B2": 64, + "BK": 32 + }, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 4, + "num_buffers_warp_spec": 0, + "num_consumer_groups": 0, + "reg_dec_producer": 0, + "reg_inc_consumer": 0, + "maxnreg": null, + "pre_hook": null + }, + "(18, 64, 16, 27, True, 'torch.float16', 'torch.float16', 'torch.float16', 'torch.uint32', 'torch.int64', 'torch.float16')": { + "kwargs": { + "B1": 256, + "B2": 64, + "BK": 32 + }, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 4, + "num_buffers_warp_spec": 0, + "num_consumer_groups": 0, + "reg_dec_producer": 0, + "reg_inc_consumer": 0, + "maxnreg": null, + "pre_hook": null + }, + "(16, 128, 128, 27, True, 'torch.float16', 'torch.float16', 'torch.float16', 'torch.uint32', 'torch.int64', 'torch.float16')": { + "kwargs": { + "B1": 128, + "B2": 128, + "BK": 64 + }, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 4, + "num_buffers_warp_spec": 0, + "num_consumer_groups": 0, + "reg_dec_producer": 0, + "reg_inc_consumer": 0, + "maxnreg": null, + "pre_hook": null + }, + "(16, 128, 32, 27, True, 'torch.float16', 'torch.float16', 'torch.float16', 'torch.uint32', 'torch.int64', 'torch.float16')": { + "kwargs": { + "B1": 64, + "B2": 64, + "BK": 64 + }, + "num_warps": 2, + "num_ctas": 1, + "num_stages": 4, + "num_buffers_warp_spec": 0, + "num_consumer_groups": 0, + "reg_dec_producer": 0, + "reg_inc_consumer": 0, + "maxnreg": null, + "pre_hook": null + }, + "(14, 256, 256, 27, True, 'torch.float16', 'torch.float16', 'torch.float16', 'torch.uint32', 'torch.int64', 'torch.float16')": { + "kwargs": { + "B1": 64, + "B2": 256, + "BK": 64 + }, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 4, + "num_buffers_warp_spec": 0, + "num_consumer_groups": 0, + "reg_dec_producer": 0, + "reg_inc_consumer": 0, + "maxnreg": null, + "pre_hook": null + }, + "(14, 256, 64, 27, True, 'torch.float16', 'torch.float16', 'torch.float16', 'torch.uint32', 'torch.int64', 'torch.float16')": { + "kwargs": { + "B1": 64, + "B2": 64, + "BK": 64 + }, + "num_warps": 2, + "num_ctas": 1, + "num_stages": 4, + "num_buffers_warp_spec": 0, + "num_consumer_groups": 0, + "reg_dec_producer": 0, + "reg_inc_consumer": 0, + "maxnreg": null, + "pre_hook": null + }, + "(12, 512, 512, 27, True, 'torch.float16', 'torch.float16', 'torch.float16', 'torch.uint32', 'torch.int64', 'torch.float16')": { + "kwargs": { + "B1": 64, + "B2": 256, + "BK": 64 + }, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 4, + "num_buffers_warp_spec": 0, + "num_consumer_groups": 0, + "reg_dec_producer": 0, + "reg_inc_consumer": 0, + "maxnreg": null, + "pre_hook": null + }, + "(10, 1024, 1024, 27, True, 'torch.float16', 'torch.float16', 'torch.float16', 'torch.uint32', 'torch.int64', 'torch.float16')": { + "kwargs": { + "B1": 64, + "B2": 64, + "BK": 64 + }, + "num_warps": 2, + "num_ctas": 1, + "num_stages": 4, + "num_buffers_warp_spec": 0, + "num_consumer_groups": 0, + "reg_dec_producer": 0, + "reg_inc_consumer": 0, + "maxnreg": null, + "pre_hook": null + }, + "(10, 1024, 4096, 27, True, 'torch.float16', 'torch.float16', 'torch.float16', 'torch.uint32', 'torch.int64', 'torch.float16')": { + "kwargs": { + "B1": 128, + "B2": 256, + "BK": 64 + }, + "num_warps": 8, + "num_ctas": 1, + "num_stages": 4, + "num_buffers_warp_spec": 0, + "num_consumer_groups": 0, + "reg_dec_producer": 0, + "reg_inc_consumer": 0, + "maxnreg": null, + "pre_hook": null + }, + "(12, 512, 2048, 27, True, 'torch.float16', 'torch.float16', 'torch.float16', 'torch.uint32', 'torch.int64', 'torch.float16')": { + "kwargs": { + "B1": 128, + "B2": 256, + "BK": 64 + }, + "num_warps": 8, + "num_ctas": 1, + "num_stages": 4, + "num_buffers_warp_spec": 0, + "num_consumer_groups": 0, + "reg_dec_producer": 0, + "reg_inc_consumer": 0, + "maxnreg": null, + "pre_hook": null + }, + "(14, 256, 1024, 27, True, 'torch.float16', 'torch.float16', 'torch.float16', 'torch.uint32', 'torch.int64', 'torch.float16')": { + "kwargs": { + "B1": 64, + "B2": 256, + "BK": 64 + }, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 4, + "num_buffers_warp_spec": 0, + "num_consumer_groups": 0, + "reg_dec_producer": 0, + "reg_inc_consumer": 0, + "maxnreg": null, + "pre_hook": null + }, + "(16, 128, 512, 27, True, 'torch.float16', 'torch.float16', 'torch.float16', 'torch.uint32', 'torch.int64', 'torch.float16')": { + "kwargs": { + "B1": 128, + "B2": 128, + "BK": 64 + }, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 4, + "num_buffers_warp_spec": 0, + "num_consumer_groups": 0, + "reg_dec_producer": 0, + "reg_inc_consumer": 0, + "maxnreg": null, + "pre_hook": null + }, + "(19, 64, 64, 27, True, 'torch.float16', 'torch.float16', 'torch.float16', 'torch.uint32', 'torch.int64', 'torch.float16')": { + "kwargs": { + "B1": 256, + "B2": 64, + "BK": 32 + }, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 4, + "num_buffers_warp_spec": 0, + "num_consumer_groups": 0, + "reg_dec_producer": 0, + "reg_inc_consumer": 0, + "maxnreg": null, + "pre_hook": null + }, + "(19, 64, 16, 27, True, 'torch.float16', 'torch.float16', 'torch.float16', 'torch.uint32', 'torch.int64', 'torch.float16')": { + "kwargs": { + "B1": 256, + "B2": 64, + "BK": 32 + }, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 4, + "num_buffers_warp_spec": 0, + "num_consumer_groups": 0, + "reg_dec_producer": 0, + "reg_inc_consumer": 0, + "maxnreg": null, + "pre_hook": null + }, + "(17, 128, 128, 27, True, 'torch.float16', 'torch.float16', 'torch.float16', 'torch.uint32', 'torch.int64', 'torch.float16')": { + "kwargs": { + "B1": 128, + "B2": 128, + "BK": 64 + }, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 4, + "num_buffers_warp_spec": 0, + "num_consumer_groups": 0, + "reg_dec_producer": 0, + "reg_inc_consumer": 0, + "maxnreg": null, + "pre_hook": null + }, + "(17, 128, 32, 27, True, 'torch.float16', 'torch.float16', 'torch.float16', 'torch.uint32', 'torch.int64', 'torch.float16')": { + "kwargs": { + "B1": 256, + "B2": 64, + "BK": 32 + }, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 4, + "num_buffers_warp_spec": 0, + "num_consumer_groups": 0, + "reg_dec_producer": 0, + "reg_inc_consumer": 0, + "maxnreg": null, + "pre_hook": null + }, + "(15, 256, 256, 27, True, 'torch.float16', 'torch.float16', 'torch.float16', 'torch.uint32', 'torch.int64', 'torch.float16')": { + "kwargs": { + "B1": 64, + "B2": 256, + "BK": 64 + }, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 4, + "num_buffers_warp_spec": 0, + "num_consumer_groups": 0, + "reg_dec_producer": 0, + "reg_inc_consumer": 0, + "maxnreg": null, + "pre_hook": null + }, + "(15, 256, 64, 27, True, 'torch.float16', 'torch.float16', 'torch.float16', 'torch.uint32', 'torch.int64', 'torch.float16')": { + "kwargs": { + "B1": 64, + "B2": 64, + "BK": 64 + }, + "num_warps": 2, + "num_ctas": 1, + "num_stages": 4, + "num_buffers_warp_spec": 0, + "num_consumer_groups": 0, + "reg_dec_producer": 0, + "reg_inc_consumer": 0, + "maxnreg": null, + "pre_hook": null + }, + "(13, 512, 512, 27, True, 'torch.float16', 'torch.float16', 'torch.float16', 'torch.uint32', 'torch.int64', 'torch.float16')": { + "kwargs": { + "B1": 128, + "B2": 128, + "BK": 64 + }, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 4, + "num_buffers_warp_spec": 0, + "num_consumer_groups": 0, + "reg_dec_producer": 0, + "reg_inc_consumer": 0, + "maxnreg": null, + "pre_hook": null + }, + "(13, 512, 128, 27, True, 'torch.float16', 'torch.float16', 'torch.float16', 'torch.uint32', 'torch.int64', 'torch.float16')": { + "kwargs": { + "B1": 64, + "B2": 64, + "BK": 64 + }, + "num_warps": 2, + "num_ctas": 1, + "num_stages": 4, + "num_buffers_warp_spec": 0, + "num_consumer_groups": 0, + "reg_dec_producer": 0, + "reg_inc_consumer": 0, + "maxnreg": null, + "pre_hook": null + }, + "(11, 1024, 1024, 27, True, 'torch.float16', 'torch.float16', 'torch.float16', 'torch.uint32', 'torch.int64', 'torch.float16')": { + "kwargs": { + "B1": 64, + "B2": 256, + "BK": 64 + }, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 4, + "num_buffers_warp_spec": 0, + "num_consumer_groups": 0, + "reg_dec_producer": 0, + "reg_inc_consumer": 0, + "maxnreg": null, + "pre_hook": null + }, + "(11, 1024, 4096, 27, True, 'torch.float16', 'torch.float16', 'torch.float16', 'torch.uint32', 'torch.int64', 'torch.float16')": { + "kwargs": { + "B1": 128, + "B2": 256, + "BK": 64 + }, + "num_warps": 8, + "num_ctas": 1, + "num_stages": 4, + "num_buffers_warp_spec": 0, + "num_consumer_groups": 0, + "reg_dec_producer": 0, + "reg_inc_consumer": 0, + "maxnreg": null, + "pre_hook": null + }, + "(13, 512, 2048, 27, True, 'torch.float16', 'torch.float16', 'torch.float16', 'torch.uint32', 'torch.int64', 'torch.float16')": { + "kwargs": { + "B1": 128, + "B2": 256, + "BK": 64 + }, + "num_warps": 8, + "num_ctas": 1, + "num_stages": 4, + "num_buffers_warp_spec": 0, + "num_consumer_groups": 0, + "reg_dec_producer": 0, + "reg_inc_consumer": 0, + "maxnreg": null, + "pre_hook": null + }, + "(15, 256, 1024, 27, True, 'torch.float16', 'torch.float16', 'torch.float16', 'torch.uint32', 'torch.int64', 'torch.float16')": { + "kwargs": { + "B1": 64, + "B2": 256, + "BK": 64 + }, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 4, + "num_buffers_warp_spec": 0, + "num_consumer_groups": 0, + "reg_dec_producer": 0, + "reg_inc_consumer": 0, + "maxnreg": null, + "pre_hook": null + }, + "(17, 128, 512, 27, True, 'torch.float16', 'torch.float16', 'torch.float16', 'torch.uint32', 'torch.int64', 'torch.float16')": { + "kwargs": { + "B1": 64, + "B2": 256, + "BK": 64 + }, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 4, + "num_buffers_warp_spec": 0, + "num_consumer_groups": 0, + "reg_dec_producer": 0, + "reg_inc_consumer": 0, + "maxnreg": null, + "pre_hook": null + }, + "(11, 512, 512, 27, True, 'torch.float16', 'torch.float16', 'torch.float16', 'torch.uint32', 'torch.int64', 'torch.float16')": { + "kwargs": { + "B1": 64, + "B2": 256, + "BK": 64 + }, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 4, + "num_buffers_warp_spec": 0, + "num_consumer_groups": 0, + "reg_dec_producer": 0, + "reg_inc_consumer": 0, + "maxnreg": null, + "pre_hook": null + }, + "(9, 1024, 1024, 27, True, 'torch.float16', 'torch.float16', 'torch.float16', 'torch.uint32', 'torch.int64', 'torch.float16')": { + "kwargs": { + "B1": 64, + "B2": 64, + "BK": 64 + }, + "num_warps": 2, + "num_ctas": 1, + "num_stages": 4, + "num_buffers_warp_spec": 0, + "num_consumer_groups": 0, + "reg_dec_producer": 0, + "reg_inc_consumer": 0, + "maxnreg": null, + "pre_hook": null + }, + "(9, 1024, 4096, 27, True, 'torch.float16', 'torch.float16', 'torch.float16', 'torch.uint32', 'torch.int64', 'torch.float16')": { + "kwargs": { + "B1": 64, + "B2": 256, + "BK": 64 + }, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 4, + "num_buffers_warp_spec": 0, + "num_consumer_groups": 0, + "reg_dec_producer": 0, + "reg_inc_consumer": 0, + "maxnreg": null, + "pre_hook": null + }, + "(11, 512, 2048, 27, True, 'torch.float16', 'torch.float16', 'torch.float16', 'torch.uint32', 'torch.int64', 'torch.float16')": { + "kwargs": { + "B1": 128, + "B2": 128, + "BK": 64 + }, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 4, + "num_buffers_warp_spec": 0, + "num_consumer_groups": 0, + "reg_dec_producer": 0, + "reg_inc_consumer": 0, + "maxnreg": null, + "pre_hook": null + }, + "(13, 256, 256, 27, True, 'torch.float16', 'torch.float16', 'torch.float16', 'torch.uint32', 'torch.int64', 'torch.float16')": { + "kwargs": { + "B1": 64, + "B2": 256, + "BK": 64 + }, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 4, + "num_buffers_warp_spec": 0, + "num_consumer_groups": 0, + "reg_dec_producer": 0, + "reg_inc_consumer": 0, + "maxnreg": null, + "pre_hook": null + }, + "(13, 256, 64, 27, True, 'torch.float16', 'torch.float16', 'torch.float16', 'torch.uint32', 'torch.int64', 'torch.float16')": { + "kwargs": { + "B1": 64, + "B2": 64, + "BK": 64 + }, + "num_warps": 2, + "num_ctas": 1, + "num_stages": 4, + "num_buffers_warp_spec": 0, + "num_consumer_groups": 0, + "reg_dec_producer": 0, + "reg_inc_consumer": 0, + "maxnreg": null, + "pre_hook": null + }, + "(13, 256, 1024, 27, True, 'torch.float16', 'torch.float16', 'torch.float16', 'torch.uint32', 'torch.int64', 'torch.float16')": { + "kwargs": { + "B1": 128, + "B2": 128, + "BK": 64 + }, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 4, + "num_buffers_warp_spec": 0, + "num_consumer_groups": 0, + "reg_dec_producer": 0, + "reg_inc_consumer": 0, + "maxnreg": null, + "pre_hook": null + }, + "(17, 64, 64, 27, True, 'torch.float16', 'torch.float16', 'torch.float16', 'torch.uint32', 'torch.int64', 'torch.float16')": { + "kwargs": { + "B1": 256, + "B2": 64, + "BK": 32 + }, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 4, + "num_buffers_warp_spec": 0, + "num_consumer_groups": 0, + "reg_dec_producer": 0, + "reg_inc_consumer": 0, + "maxnreg": null, + "pre_hook": null + }, + "(17, 64, 16, 27, True, 'torch.float16', 'torch.float16', 'torch.float16', 'torch.uint32', 'torch.int64', 'torch.float16')": { + "kwargs": { + "B1": 256, + "B2": 64, + "BK": 32 + }, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 4, + "num_buffers_warp_spec": 0, + "num_consumer_groups": 0, + "reg_dec_producer": 0, + "reg_inc_consumer": 0, + "maxnreg": null, + "pre_hook": null + }, + "(15, 128, 128, 27, True, 'torch.float16', 'torch.float16', 'torch.float16', 'torch.uint32', 'torch.int64', 'torch.float16')": { + "kwargs": { + "B1": 128, + "B2": 128, + "BK": 64 + }, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 4, + "num_buffers_warp_spec": 0, + "num_consumer_groups": 0, + "reg_dec_producer": 0, + "reg_inc_consumer": 0, + "maxnreg": null, + "pre_hook": null + }, + "(15, 128, 32, 27, True, 'torch.float16', 'torch.float16', 'torch.float16', 'torch.uint32', 'torch.int64', 'torch.float16')": { + "kwargs": { + "B1": 64, + "B2": 64, + "BK": 64 + }, + "num_warps": 2, + "num_ctas": 1, + "num_stages": 4, + "num_buffers_warp_spec": 0, + "num_consumer_groups": 0, + "reg_dec_producer": 0, + "reg_inc_consumer": 0, + "maxnreg": null, + "pre_hook": null + }, + "(15, 128, 512, 27, True, 'torch.float16', 'torch.float16', 'torch.float16', 'torch.uint32', 'torch.int64', 'torch.float16')": { + "kwargs": { + "B1": 64, + "B2": 256, + "BK": 64 + }, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 4, + "num_buffers_warp_spec": 0, + "num_consumer_groups": 0, + "reg_dec_producer": 0, + "reg_inc_consumer": 0, + "maxnreg": null, + "pre_hook": null + }, + "(8, 1024, 4096, 27, True, 'torch.float16', 'torch.float16', 'torch.float16', 'torch.uint32', 'torch.int64', 'torch.float16')": { + "kwargs": { + "B1": 64, + "B2": 256, + "BK": 64 + }, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 4, + "num_buffers_warp_spec": 0, + "num_consumer_groups": 0, + "reg_dec_producer": 0, + "reg_inc_consumer": 0, + "maxnreg": null, + "pre_hook": null + }, + "(10, 512, 512, 27, True, 'torch.float16', 'torch.float16', 'torch.float16', 'torch.uint32', 'torch.int64', 'torch.float16')": { + "kwargs": { + "B1": 64, + "B2": 64, + "BK": 64 + }, + "num_warps": 2, + "num_ctas": 1, + "num_stages": 4, + "num_buffers_warp_spec": 0, + "num_consumer_groups": 0, + "reg_dec_producer": 0, + "reg_inc_consumer": 0, + "maxnreg": null, + "pre_hook": null + }, + "(10, 512, 2048, 27, True, 'torch.float16', 'torch.float16', 'torch.float16', 'torch.uint32', 'torch.int64', 'torch.float16')": { + "kwargs": { + "B1": 64, + "B2": 256, + "BK": 64 + }, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 4, + "num_buffers_warp_spec": 0, + "num_consumer_groups": 0, + "reg_dec_producer": 0, + "reg_inc_consumer": 0, + "maxnreg": null, + "pre_hook": null + }, + "(20, 64, 64, 27, True, 'torch.float16', 'torch.float16', 'torch.float16', 'torch.uint32', 'torch.int64', 'torch.float16')": { + "kwargs": { + "B1": 256, + "B2": 64, + "BK": 32 + }, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 4, + "num_buffers_warp_spec": 0, + "num_consumer_groups": 0, + "reg_dec_producer": 0, + "reg_inc_consumer": 0, + "maxnreg": null, + "pre_hook": null + }, + "(20, 64, 16, 27, True, 'torch.float16', 'torch.float16', 'torch.float16', 'torch.uint32', 'torch.int64', 'torch.float16')": { + "kwargs": { + "B1": 256, + "B2": 64, + "BK": 32 + }, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 4, + "num_buffers_warp_spec": 0, + "num_consumer_groups": 0, + "reg_dec_producer": 0, + "reg_inc_consumer": 0, + "maxnreg": null, + "pre_hook": null + }, + "(18, 128, 128, 27, True, 'torch.float16', 'torch.float16', 'torch.float16', 'torch.uint32', 'torch.int64', 'torch.float16')": { + "kwargs": { + "B1": 128, + "B2": 128, + "BK": 64 + }, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 4, + "num_buffers_warp_spec": 0, + "num_consumer_groups": 0, + "reg_dec_producer": 0, + "reg_inc_consumer": 0, + "maxnreg": null, + "pre_hook": null + }, + "(18, 128, 32, 27, True, 'torch.float16', 'torch.float16', 'torch.float16', 'torch.uint32', 'torch.int64', 'torch.float16')": { + "kwargs": { + "B1": 256, + "B2": 64, + "BK": 32 + }, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 4, + "num_buffers_warp_spec": 0, + "num_consumer_groups": 0, + "reg_dec_producer": 0, + "reg_inc_consumer": 0, + "maxnreg": null, + "pre_hook": null + }, + "(18, 128, 512, 27, True, 'torch.float16', 'torch.float16', 'torch.float16', 'torch.uint32', 'torch.int64', 'torch.float16')": { + "kwargs": { + "B1": 128, + "B2": 256, + "BK": 64 + }, + "num_warps": 8, + "num_ctas": 1, + "num_stages": 4, + "num_buffers_warp_spec": 0, + "num_consumer_groups": 0, + "reg_dec_producer": 0, + "reg_inc_consumer": 0, + "maxnreg": null, + "pre_hook": null + }, + "(16, 256, 256, 27, True, 'torch.float16', 'torch.float16', 'torch.float16', 'torch.uint32', 'torch.int64', 'torch.float16')": { + "kwargs": { + "B1": 128, + "B2": 256, + "BK": 64 + }, + "num_warps": 8, + "num_ctas": 1, + "num_stages": 4, + "num_buffers_warp_spec": 0, + "num_consumer_groups": 0, + "reg_dec_producer": 0, + "reg_inc_consumer": 0, + "maxnreg": null, + "pre_hook": null + }, + "(16, 256, 64, 27, True, 'torch.float16', 'torch.float16', 'torch.float16', 'torch.uint32', 'torch.int64', 'torch.float16')": { + "kwargs": { + "B1": 128, + "B2": 64, + "BK": 32 + }, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 4, + "num_buffers_warp_spec": 0, + "num_consumer_groups": 0, + "reg_dec_producer": 0, + "reg_inc_consumer": 0, + "maxnreg": null, + "pre_hook": null + }, + "(16, 256, 1024, 27, True, 'torch.float16', 'torch.float16', 'torch.float16', 'torch.uint32', 'torch.int64', 'torch.float16')": { + "kwargs": { + "B1": 128, + "B2": 256, + "BK": 64 + }, + "num_warps": 8, + "num_ctas": 1, + "num_stages": 4, + "num_buffers_warp_spec": 0, + "num_consumer_groups": 0, + "reg_dec_producer": 0, + "reg_inc_consumer": 0, + "maxnreg": null, + "pre_hook": null + }, + "(12, 256, 256, 27, True, 'torch.float16', 'torch.float16', 'torch.float16', 'torch.uint32', 'torch.int64', 'torch.float16')": { + "kwargs": { + "B1": 64, + "B2": 256, + "BK": 64 + }, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 4, + "num_buffers_warp_spec": 0, + "num_consumer_groups": 0, + "reg_dec_producer": 0, + "reg_inc_consumer": 0, + "maxnreg": null, + "pre_hook": null + }, + "(12, 256, 64, 27, True, 'torch.float16', 'torch.float16', 'torch.float16', 'torch.uint32', 'torch.int64', 'torch.float16')": { + "kwargs": { + "B1": 64, + "B2": 64, + "BK": 64 + }, + "num_warps": 2, + "num_ctas": 1, + "num_stages": 4, + "num_buffers_warp_spec": 0, + "num_consumer_groups": 0, + "reg_dec_producer": 0, + "reg_inc_consumer": 0, + "maxnreg": null, + "pre_hook": null + }, + "(12, 256, 1024, 27, True, 'torch.float16', 'torch.float16', 'torch.float16', 'torch.uint32', 'torch.int64', 'torch.float16')": { + "kwargs": { + "B1": 64, + "B2": 256, + "BK": 64 + }, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 4, + "num_buffers_warp_spec": 0, + "num_consumer_groups": 0, + "reg_dec_producer": 0, + "reg_inc_consumer": 0, + "maxnreg": null, + "pre_hook": null + }, + "(21, 64, 64, 27, True, 'torch.float16', 'torch.float16', 'torch.float16', 'torch.uint32', 'torch.int64', 'torch.float16')": { + "kwargs": { + "B1": 256, + "B2": 64, + "BK": 32 + }, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 4, + "num_buffers_warp_spec": 0, + "num_consumer_groups": 0, + "reg_dec_producer": 0, + "reg_inc_consumer": 0, + "maxnreg": null, + "pre_hook": null + }, + "(21, 64, 16, 27, True, 'torch.float16', 'torch.float16', 'torch.float16', 'torch.uint32', 'torch.int64', 'torch.float16')": { + "kwargs": { + "B1": 256, + "B2": 64, + "BK": 32 + }, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 4, + "num_buffers_warp_spec": 0, + "num_consumer_groups": 0, + "reg_dec_producer": 0, + "reg_inc_consumer": 0, + "maxnreg": null, + "pre_hook": null + }, + "(19, 128, 128, 27, True, 'torch.float16', 'torch.float16', 'torch.float16', 'torch.uint32', 'torch.int64', 'torch.float16')": { + "kwargs": { + "B1": 128, + "B2": 128, + "BK": 64 + }, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 4, + "num_buffers_warp_spec": 0, + "num_consumer_groups": 0, + "reg_dec_producer": 0, + "reg_inc_consumer": 0, + "maxnreg": null, + "pre_hook": null + }, + "(19, 128, 32, 27, True, 'torch.float16', 'torch.float16', 'torch.float16', 'torch.uint32', 'torch.int64', 'torch.float16')": { + "kwargs": { + "B1": 256, + "B2": 64, + "BK": 32 + }, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 4, + "num_buffers_warp_spec": 0, + "num_consumer_groups": 0, + "reg_dec_producer": 0, + "reg_inc_consumer": 0, + "maxnreg": null, + "pre_hook": null + }, + "(17, 256, 256, 27, True, 'torch.float16', 'torch.float16', 'torch.float16', 'torch.uint32', 'torch.int64', 'torch.float16')": { + "kwargs": { + "B1": 128, + "B2": 256, + "BK": 64 + }, + "num_warps": 8, + "num_ctas": 1, + "num_stages": 4, + "num_buffers_warp_spec": 0, + "num_consumer_groups": 0, + "reg_dec_producer": 0, + "reg_inc_consumer": 0, + "maxnreg": null, + "pre_hook": null + }, + "(17, 256, 64, 27, True, 'torch.float16', 'torch.float16', 'torch.float16', 'torch.uint32', 'torch.int64', 'torch.float16')": { + "kwargs": { + "B1": 256, + "B2": 64, + "BK": 32 + }, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 4, + "num_buffers_warp_spec": 0, + "num_consumer_groups": 0, + "reg_dec_producer": 0, + "reg_inc_consumer": 0, + "maxnreg": null, + "pre_hook": null + }, + "(14, 512, 512, 27, True, 'torch.float16', 'torch.float16', 'torch.float16', 'torch.uint32', 'torch.int64', 'torch.float16')": { + "kwargs": { + "B1": 64, + "B2": 256, + "BK": 64 + }, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 4, + "num_buffers_warp_spec": 0, + "num_consumer_groups": 0, + "reg_dec_producer": 0, + "reg_inc_consumer": 0, + "maxnreg": null, + "pre_hook": null + }, + "(14, 512, 128, 27, True, 'torch.float16', 'torch.float16', 'torch.float16', 'torch.uint32', 'torch.int64', 'torch.float16')": { + "kwargs": { + "B1": 128, + "B2": 128, + "BK": 64 + }, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 4, + "num_buffers_warp_spec": 0, + "num_consumer_groups": 0, + "reg_dec_producer": 0, + "reg_inc_consumer": 0, + "maxnreg": null, + "pre_hook": null + }, + "(14, 512, 2048, 27, True, 'torch.float16', 'torch.float16', 'torch.float16', 'torch.uint32', 'torch.int64', 'torch.float16')": { + "kwargs": { + "B1": 128, + "B2": 256, + "BK": 64 + }, + "num_warps": 8, + "num_ctas": 1, + "num_stages": 4, + "num_buffers_warp_spec": 0, + "num_consumer_groups": 0, + "reg_dec_producer": 0, + "reg_inc_consumer": 0, + "maxnreg": null, + "pre_hook": null + }, + "(17, 256, 1024, 27, True, 'torch.float16', 'torch.float16', 'torch.float16', 'torch.uint32', 'torch.int64', 'torch.float16')": { + "kwargs": { + "B1": 128, + "B2": 256, + "BK": 64 + }, + "num_warps": 8, + "num_ctas": 1, + "num_stages": 4, + "num_buffers_warp_spec": 0, + "num_consumer_groups": 0, + "reg_dec_producer": 0, + "reg_inc_consumer": 0, + "maxnreg": null, + "pre_hook": null + }, + "(19, 128, 512, 27, True, 'torch.float16', 'torch.float16', 'torch.float16', 'torch.uint32', 'torch.int64', 'torch.float16')": { + "kwargs": { + "B1": 128, + "B2": 256, + "BK": 64 + }, + "num_warps": 8, + "num_ctas": 1, + "num_stages": 4, + "num_buffers_warp_spec": 0, + "num_consumer_groups": 0, + "reg_dec_producer": 0, + "reg_inc_consumer": 0, + "maxnreg": null, + "pre_hook": null + }, + "(7, 1024, 4096, 27, True, 'torch.float16', 'torch.float16', 'torch.float16', 'torch.uint32', 'torch.int64', 'torch.float16')": { + "kwargs": { + "B1": 64, + "B2": 64, + "BK": 64 + }, + "num_warps": 2, + "num_ctas": 1, + "num_stages": 4, + "num_buffers_warp_spec": 0, + "num_consumer_groups": 0, + "reg_dec_producer": 0, + "reg_inc_consumer": 0, + "maxnreg": null, + "pre_hook": null + }, + "(14, 128, 128, 27, True, 'torch.float16', 'torch.float16', 'torch.float16', 'torch.uint32', 'torch.int64', 'torch.float16')": { + "kwargs": { + "B1": 128, + "B2": 128, + "BK": 64 + }, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 4, + "num_buffers_warp_spec": 0, + "num_consumer_groups": 0, + "reg_dec_producer": 0, + "reg_inc_consumer": 0, + "maxnreg": null, + "pre_hook": null + }, + "(14, 128, 32, 27, True, 'torch.float16', 'torch.float16', 'torch.float16', 'torch.uint32', 'torch.int64', 'torch.float16')": { + "kwargs": { + "B1": 128, + "B2": 64, + "BK": 32 + }, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 4, + "num_buffers_warp_spec": 0, + "num_consumer_groups": 0, + "reg_dec_producer": 0, + "reg_inc_consumer": 0, + "maxnreg": null, + "pre_hook": null + }, + "(14, 128, 512, 27, True, 'torch.float16', 'torch.float16', 'torch.float16', 'torch.uint32', 'torch.int64', 'torch.float16')": { + "kwargs": { + "B1": 64, + "B2": 256, + "BK": 64 + }, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 4, + "num_buffers_warp_spec": 0, + "num_consumer_groups": 0, + "reg_dec_producer": 0, + "reg_inc_consumer": 0, + "maxnreg": null, + "pre_hook": null + }, + "(16, 64, 64, 27, True, 'torch.float16', 'torch.float16', 'torch.float16', 'torch.uint32', 'torch.int64', 'torch.float16')": { + "kwargs": { + "B1": 256, + "B2": 64, + "BK": 32 + }, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 4, + "num_buffers_warp_spec": 0, + "num_consumer_groups": 0, + "reg_dec_producer": 0, + "reg_inc_consumer": 0, + "maxnreg": null, + "pre_hook": null + }, + "(16, 64, 16, 27, True, 'torch.float16', 'torch.float16', 'torch.float16', 'torch.uint32', 'torch.int64', 'torch.float16')": { + "kwargs": { + "B1": 256, + "B2": 64, + "BK": 32 + }, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 4, + "num_buffers_warp_spec": 0, + "num_consumer_groups": 0, + "reg_dec_producer": 0, + "reg_inc_consumer": 0, + "maxnreg": null, + "pre_hook": null + }, + "(22, 64, 64, 27, True, 'torch.float16', 'torch.float16', 'torch.float16', 'torch.uint32', 'torch.int64', 'torch.float16')": { + "kwargs": { + "B1": 256, + "B2": 64, + "BK": 32 + }, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 4, + "num_buffers_warp_spec": 0, + "num_consumer_groups": 0, + "reg_dec_producer": 0, + "reg_inc_consumer": 0, + "maxnreg": null, + "pre_hook": null + }, + "(22, 64, 16, 27, True, 'torch.float16', 'torch.float16', 'torch.float16', 'torch.uint32', 'torch.int64', 'torch.float16')": { + "kwargs": { + "B1": 256, + "B2": 64, + "BK": 32 + }, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 4, + "num_buffers_warp_spec": 0, + "num_consumer_groups": 0, + "reg_dec_producer": 0, + "reg_inc_consumer": 0, + "maxnreg": null, + "pre_hook": null + }, + "(12, 1024, 1024, 27, True, 'torch.float16', 'torch.float16', 'torch.float16', 'torch.uint32', 'torch.int64', 'torch.float16')": { + "kwargs": { + "B1": 64, + "B2": 256, + "BK": 64 + }, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 4, + "num_buffers_warp_spec": 0, + "num_consumer_groups": 0, + "reg_dec_producer": 0, + "reg_inc_consumer": 0, + "maxnreg": null, + "pre_hook": null + }, + "(12, 1024, 4096, 27, True, 'torch.float16', 'torch.float16', 'torch.float16', 'torch.uint32', 'torch.int64', 'torch.float16')": { + "kwargs": { + "B1": 128, + "B2": 256, + "BK": 64 + }, + "num_warps": 8, + "num_ctas": 1, + "num_stages": 4, + "num_buffers_warp_spec": 0, + "num_consumer_groups": 0, + "reg_dec_producer": 0, + "reg_inc_consumer": 0, + "maxnreg": null, + "pre_hook": null + }, + "(15, 512, 512, 27, True, 'torch.float16', 'torch.float16', 'torch.float16', 'torch.uint32', 'torch.int64', 'torch.float16')": { + "kwargs": { + "B1": 128, + "B2": 256, + "BK": 64 + }, + "num_warps": 8, + "num_ctas": 1, + "num_stages": 4, + "num_buffers_warp_spec": 0, + "num_consumer_groups": 0, + "reg_dec_producer": 0, + "reg_inc_consumer": 0, + "maxnreg": null, + "pre_hook": null + }, + "(15, 512, 128, 27, True, 'torch.float16', 'torch.float16', 'torch.float16', 'torch.uint32', 'torch.int64', 'torch.float16')": { + "kwargs": { + "B1": 128, + "B2": 128, + "BK": 64 + }, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 4, + "num_buffers_warp_spec": 0, + "num_consumer_groups": 0, + "reg_dec_producer": 0, + "reg_inc_consumer": 0, + "maxnreg": null, + "pre_hook": null + }, + "(13, 1024, 1024, 27, True, 'torch.float16', 'torch.float16', 'torch.float16', 'torch.uint32', 'torch.int64', 'torch.float16')": { + "kwargs": { + "B1": 128, + "B2": 256, + "BK": 64 + }, + "num_warps": 8, + "num_ctas": 1, + "num_stages": 4, + "num_buffers_warp_spec": 0, + "num_consumer_groups": 0, + "reg_dec_producer": 0, + "reg_inc_consumer": 0, + "maxnreg": null, + "pre_hook": null + }, + "(13, 1024, 4096, 27, True, 'torch.float16', 'torch.float16', 'torch.float16', 'torch.uint32', 'torch.int64', 'torch.float16')": { + "kwargs": { + "B1": 128, + "B2": 256, + "BK": 64 + }, + "num_warps": 8, + "num_ctas": 1, + "num_stages": 4, + "num_buffers_warp_spec": 0, + "num_consumer_groups": 0, + "reg_dec_producer": 0, + "reg_inc_consumer": 0, + "maxnreg": null, + "pre_hook": null + }, + "(15, 512, 2048, 27, True, 'torch.float16', 'torch.float16', 'torch.float16', 'torch.uint32', 'torch.int64', 'torch.float16')": { + "kwargs": { + "B1": 128, + "B2": 256, + "BK": 64 + }, + "num_warps": 8, + "num_ctas": 1, + "num_stages": 4, + "num_buffers_warp_spec": 0, + "num_consumer_groups": 0, + "reg_dec_producer": 0, + "reg_inc_consumer": 0, + "maxnreg": null, + "pre_hook": null + }, + "(20, 128, 128, 27, True, 'torch.float16', 'torch.float16', 'torch.float16', 'torch.uint32', 'torch.int64', 'torch.float16')": { + "kwargs": { + "B1": 128, + "B2": 128, + "BK": 64 + }, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 4, + "num_buffers_warp_spec": 0, + "num_consumer_groups": 0, + "reg_dec_producer": 0, + "reg_inc_consumer": 0, + "maxnreg": null, + "pre_hook": null + }, + "(20, 128, 32, 27, True, 'torch.float16', 'torch.float16', 'torch.float16', 'torch.uint32', 'torch.int64', 'torch.float16')": { + "kwargs": { + "B1": 256, + "B2": 64, + "BK": 32 + }, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 4, + "num_buffers_warp_spec": 0, + "num_consumer_groups": 0, + "reg_dec_producer": 0, + "reg_inc_consumer": 0, + "maxnreg": null, + "pre_hook": null + }, + "(18, 256, 256, 27, True, 'torch.float16', 'torch.float16', 'torch.float16', 'torch.uint32', 'torch.int64', 'torch.float16')": { + "kwargs": { + "B1": 128, + "B2": 256, + "BK": 64 + }, + "num_warps": 8, + "num_ctas": 1, + "num_stages": 4, + "num_buffers_warp_spec": 0, + "num_consumer_groups": 0, + "reg_dec_producer": 0, + "reg_inc_consumer": 0, + "maxnreg": null, + "pre_hook": null + }, + "(18, 256, 64, 27, True, 'torch.float16', 'torch.float16', 'torch.float16', 'torch.uint32', 'torch.int64', 'torch.float16')": { + "kwargs": { + "B1": 64, + "B2": 64, + "BK": 64 + }, + "num_warps": 2, + "num_ctas": 1, + "num_stages": 4, + "num_buffers_warp_spec": 0, + "num_consumer_groups": 0, + "reg_dec_producer": 0, + "reg_inc_consumer": 0, + "maxnreg": null, + "pre_hook": null + }, + "(18, 256, 1024, 27, True, 'torch.float16', 'torch.float16', 'torch.float16', 'torch.uint32', 'torch.int64', 'torch.float16')": { + "kwargs": { + "B1": 64, + "B2": 256, + "BK": 64 + }, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 4, + "num_buffers_warp_spec": 0, + "num_consumer_groups": 0, + "reg_dec_producer": 0, + "reg_inc_consumer": 0, + "maxnreg": null, + "pre_hook": null + }, + "(20, 128, 512, 27, True, 'torch.float16', 'torch.float16', 'torch.float16', 'torch.uint32', 'torch.int64', 'torch.float16')": { + "kwargs": { + "B1": 128, + "B2": 256, + "BK": 64 + }, + "num_warps": 8, + "num_ctas": 1, + "num_stages": 4, + "num_buffers_warp_spec": 0, + "num_consumer_groups": 0, + "reg_dec_producer": 0, + "reg_inc_consumer": 0, + "maxnreg": null, + "pre_hook": null + }, + "(16, 512, 512, 27, True, 'torch.float16', 'torch.float16', 'torch.float16', 'torch.uint32', 'torch.int64', 'torch.float16')": { + "kwargs": { + "B1": 128, + "B2": 256, + "BK": 64 + }, + "num_warps": 8, + "num_ctas": 1, + "num_stages": 4, + "num_buffers_warp_spec": 0, + "num_consumer_groups": 0, + "reg_dec_producer": 0, + "reg_inc_consumer": 0, + "maxnreg": null, + "pre_hook": null + }, + "(16, 512, 128, 27, True, 'torch.float16', 'torch.float16', 'torch.float16', 'torch.uint32', 'torch.int64', 'torch.float16')": { + "kwargs": { + "B1": 128, + "B2": 128, + "BK": 64 + }, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 4, + "num_buffers_warp_spec": 0, + "num_consumer_groups": 0, + "reg_dec_producer": 0, + "reg_inc_consumer": 0, + "maxnreg": null, + "pre_hook": null + }, + "(14, 1024, 1024, 27, True, 'torch.float16', 'torch.float16', 'torch.float16', 'torch.uint32', 'torch.int64', 'torch.float16')": { + "kwargs": { + "B1": 128, + "B2": 256, + "BK": 64 + }, + "num_warps": 8, + "num_ctas": 1, + "num_stages": 4, + "num_buffers_warp_spec": 0, + "num_consumer_groups": 0, + "reg_dec_producer": 0, + "reg_inc_consumer": 0, + "maxnreg": null, + "pre_hook": null + }, + "(14, 1024, 4096, 27, True, 'torch.float16', 'torch.float16', 'torch.float16', 'torch.uint32', 'torch.int64', 'torch.float16')": { + "kwargs": { + "B1": 128, + "B2": 256, + "BK": 64 + }, + "num_warps": 8, + "num_ctas": 1, + "num_stages": 4, + "num_buffers_warp_spec": 0, + "num_consumer_groups": 0, + "reg_dec_producer": 0, + "reg_inc_consumer": 0, + "maxnreg": null, + "pre_hook": null + }, + "(16, 512, 2048, 27, True, 'torch.float16', 'torch.float16', 'torch.float16', 'torch.uint32', 'torch.int64', 'torch.float16')": { + "kwargs": { + "B1": 128, + "B2": 256, + "BK": 64 + }, + "num_warps": 8, + "num_ctas": 1, + "num_stages": 4, + "num_buffers_warp_spec": 0, + "num_consumer_groups": 0, + "reg_dec_producer": 0, + "reg_inc_consumer": 0, + "maxnreg": null, + "pre_hook": null + }, + "(23, 64, 64, 27, True, 'torch.float16', 'torch.float16', 'torch.float16', 'torch.uint32', 'torch.int64', 'torch.float16')": { + "kwargs": { + "B1": 64, + "B2": 64, + "BK": 64 + }, + "num_warps": 2, + "num_ctas": 1, + "num_stages": 4, + "num_buffers_warp_spec": 0, + "num_consumer_groups": 0, + "reg_dec_producer": 0, + "reg_inc_consumer": 0, + "maxnreg": null, + "pre_hook": null + }, + "(23, 64, 16, 27, True, 'torch.float16', 'torch.float16', 'torch.float16', 'torch.uint32', 'torch.int64', 'torch.float16')": { + "kwargs": { + "B1": 256, + "B2": 64, + "BK": 32 + }, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 4, + "num_buffers_warp_spec": 0, + "num_consumer_groups": 0, + "reg_dec_producer": 0, + "reg_inc_consumer": 0, + "maxnreg": null, + "pre_hook": null + }, + "(21, 128, 128, 27, True, 'torch.float16', 'torch.float16', 'torch.float16', 'torch.uint32', 'torch.int64', 'torch.float16')": { + "kwargs": { + "B1": 128, + "B2": 128, + "BK": 64 + }, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 4, + "num_buffers_warp_spec": 0, + "num_consumer_groups": 0, + "reg_dec_producer": 0, + "reg_inc_consumer": 0, + "maxnreg": null, + "pre_hook": null + }, + "(21, 128, 32, 27, True, 'torch.float16', 'torch.float16', 'torch.float16', 'torch.uint32', 'torch.int64', 'torch.float16')": { + "kwargs": { + "B1": 64, + "B2": 64, + "BK": 64 + }, + "num_warps": 2, + "num_ctas": 1, + "num_stages": 4, + "num_buffers_warp_spec": 0, + "num_consumer_groups": 0, + "reg_dec_producer": 0, + "reg_inc_consumer": 0, + "maxnreg": null, + "pre_hook": null + }, + "(19, 256, 256, 27, True, 'torch.float16', 'torch.float16', 'torch.float16', 'torch.uint32', 'torch.int64', 'torch.float16')": { + "kwargs": { + "B1": 64, + "B2": 256, + "BK": 64 + }, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 4, + "num_buffers_warp_spec": 0, + "num_consumer_groups": 0, + "reg_dec_producer": 0, + "reg_inc_consumer": 0, + "maxnreg": null, + "pre_hook": null + }, + "(19, 256, 64, 27, True, 'torch.float16', 'torch.float16', 'torch.float16', 'torch.uint32', 'torch.int64', 'torch.float16')": { + "kwargs": { + "B1": 64, + "B2": 64, + "BK": 64 + }, + "num_warps": 2, + "num_ctas": 1, + "num_stages": 4, + "num_buffers_warp_spec": 0, + "num_consumer_groups": 0, + "reg_dec_producer": 0, + "reg_inc_consumer": 0, + "maxnreg": null, + "pre_hook": null + }, + "(17, 512, 512, 27, True, 'torch.float16', 'torch.float16', 'torch.float16', 'torch.uint32', 'torch.int64', 'torch.float16')": { + "kwargs": { + "B1": 128, + "B2": 256, + "BK": 64 + }, + "num_warps": 8, + "num_ctas": 1, + "num_stages": 4, + "num_buffers_warp_spec": 0, + "num_consumer_groups": 0, + "reg_dec_producer": 0, + "reg_inc_consumer": 0, + "maxnreg": null, + "pre_hook": null + }, + "(17, 512, 128, 27, True, 'torch.float16', 'torch.float16', 'torch.float16', 'torch.uint32', 'torch.int64', 'torch.float16')": { + "kwargs": { + "B1": 128, + "B2": 128, + "BK": 64 + }, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 4, + "num_buffers_warp_spec": 0, + "num_consumer_groups": 0, + "reg_dec_producer": 0, + "reg_inc_consumer": 0, + "maxnreg": null, + "pre_hook": null + }, + "(15, 1024, 1024, 27, True, 'torch.float16', 'torch.float16', 'torch.float16', 'torch.uint32', 'torch.int64', 'torch.float16')": { + "kwargs": { + "B1": 128, + "B2": 256, + "BK": 64 + }, + "num_warps": 8, + "num_ctas": 1, + "num_stages": 4, + "num_buffers_warp_spec": 0, + "num_consumer_groups": 0, + "reg_dec_producer": 0, + "reg_inc_consumer": 0, + "maxnreg": null, + "pre_hook": null + }, + "(15, 1024, 4096, 27, True, 'torch.float16', 'torch.float16', 'torch.float16', 'torch.uint32', 'torch.int64', 'torch.float16')": { + "kwargs": { + "B1": 128, + "B2": 256, + "BK": 64 + }, + "num_warps": 8, + "num_ctas": 1, + "num_stages": 4, + "num_buffers_warp_spec": 0, + "num_consumer_groups": 0, + "reg_dec_producer": 0, + "reg_inc_consumer": 0, + "maxnreg": null, + "pre_hook": null + }, + "(17, 512, 2048, 27, True, 'torch.float16', 'torch.float16', 'torch.float16', 'torch.uint32', 'torch.int64', 'torch.float16')": { + "kwargs": { + "B1": 128, + "B2": 256, + "BK": 64 + }, + "num_warps": 8, + "num_ctas": 1, + "num_stages": 4, + "num_buffers_warp_spec": 0, + "num_consumer_groups": 0, + "reg_dec_producer": 0, + "reg_inc_consumer": 0, + "maxnreg": null, + "pre_hook": null + }, + "(19, 256, 1024, 27, True, 'torch.float16', 'torch.float16', 'torch.float16', 'torch.uint32', 'torch.int64', 'torch.float16')": { + "kwargs": { + "B1": 128, + "B2": 256, + "BK": 64 + }, + "num_warps": 8, + "num_ctas": 1, + "num_stages": 4, + "num_buffers_warp_spec": 0, + "num_consumer_groups": 0, + "reg_dec_producer": 0, + "reg_inc_consumer": 0, + "maxnreg": null, + "pre_hook": null + }, + "(21, 128, 512, 27, True, 'torch.float16', 'torch.float16', 'torch.float16', 'torch.uint32', 'torch.int64', 'torch.float16')": { + "kwargs": { + "B1": 128, + "B2": 256, + "BK": 64 + }, + "num_warps": 8, + "num_ctas": 1, + "num_stages": 4, + "num_buffers_warp_spec": 0, + "num_consumer_groups": 0, + "reg_dec_producer": 0, + "reg_inc_consumer": 0, + "maxnreg": null, + "pre_hook": null + }, + "(11, 256, 256, 27, True, 'torch.float16', 'torch.float16', 'torch.float16', 'torch.uint32', 'torch.int64', 'torch.float16')": { + "kwargs": { + "B1": 64, + "B2": 64, + "BK": 64 + }, + "num_warps": 2, + "num_ctas": 1, + "num_stages": 4, + "num_buffers_warp_spec": 0, + "num_consumer_groups": 0, + "reg_dec_producer": 0, + "reg_inc_consumer": 0, + "maxnreg": null, + "pre_hook": null + }, + "(14, 64, 64, 27, True, 'torch.float16', 'torch.float16', 'torch.float16', 'torch.uint32', 'torch.int64', 'torch.float16')": { + "kwargs": { + "B1": 64, + "B2": 64, + "BK": 64 + }, + "num_warps": 2, + "num_ctas": 1, + "num_stages": 4, + "num_buffers_warp_spec": 0, + "num_consumer_groups": 0, + "reg_dec_producer": 0, + "reg_inc_consumer": 0, + "maxnreg": null, + "pre_hook": null + }, + "(14, 64, 16, 27, True, 'torch.float16', 'torch.float16', 'torch.float16', 'torch.uint32', 'torch.int64', 'torch.float16')": { + "kwargs": { + "B1": 128, + "B2": 64, + "BK": 32 + }, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 4, + "num_buffers_warp_spec": 0, + "num_consumer_groups": 0, + "reg_dec_producer": 0, + "reg_inc_consumer": 0, + "maxnreg": null, + "pre_hook": null + }, + "(15, 64, 64, 27, True, 'torch.float16', 'torch.float16', 'torch.float16', 'torch.uint32', 'torch.int64', 'torch.float16')": { + "kwargs": { + "B1": 128, + "B2": 64, + "BK": 32 + }, + "num_warps": 2, + "num_ctas": 1, + "num_stages": 4, + "num_buffers_warp_spec": 0, + "num_consumer_groups": 0, + "reg_dec_producer": 0, + "reg_inc_consumer": 0, + "maxnreg": null, + "pre_hook": null + }, + "(15, 64, 16, 27, True, 'torch.float16', 'torch.float16', 'torch.float16', 'torch.uint32', 'torch.int64', 'torch.float16')": { + "kwargs": { + "B1": 256, + "B2": 64, + "BK": 32 + }, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 4, + "num_buffers_warp_spec": 0, + "num_consumer_groups": 0, + "reg_dec_producer": 0, + "reg_inc_consumer": 0, + "maxnreg": null, + "pre_hook": null + }, + "(13, 128, 128, 27, True, 'torch.float16', 'torch.float16', 'torch.float16', 'torch.uint32', 'torch.int64', 'torch.float16')": { + "kwargs": { + "B1": 128, + "B2": 128, + "BK": 64 + }, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 4, + "num_buffers_warp_spec": 0, + "num_consumer_groups": 0, + "reg_dec_producer": 0, + "reg_inc_consumer": 0, + "maxnreg": null, + "pre_hook": null + }, + "(13, 128, 32, 27, True, 'torch.float16', 'torch.float16', 'torch.float16', 'torch.uint32', 'torch.int64', 'torch.float16')": { + "kwargs": { + "B1": 64, + "B2": 64, + "BK": 64 + }, + "num_warps": 2, + "num_ctas": 1, + "num_stages": 4, + "num_buffers_warp_spec": 0, + "num_consumer_groups": 0, + "reg_dec_producer": 0, + "reg_inc_consumer": 0, + "maxnreg": null, + "pre_hook": null + }, + "(13, 64, 64, 27, True, 'torch.float16', 'torch.float16', 'torch.float16', 'torch.uint32', 'torch.int64', 'torch.float16')": { + "kwargs": { + "B1": 64, + "B2": 64, + "BK": 64 + }, + "num_warps": 2, + "num_ctas": 1, + "num_stages": 4, + "num_buffers_warp_spec": 0, + "num_consumer_groups": 0, + "reg_dec_producer": 0, + "reg_inc_consumer": 0, + "maxnreg": null, + "pre_hook": null + }, + "(13, 64, 16, 27, True, 'torch.float16', 'torch.float16', 'torch.float16', 'torch.uint32', 'torch.int64', 'torch.float16')": { + "kwargs": { + "B1": 64, + "B2": 64, + "BK": 64 + }, + "num_warps": 2, + "num_ctas": 1, + "num_stages": 4, + "num_buffers_warp_spec": 0, + "num_consumer_groups": 0, + "reg_dec_producer": 0, + "reg_inc_consumer": 0, + "maxnreg": null, + "pre_hook": null + }, + "(7, 1024, 1024, 27, True, 'torch.float16', 'torch.float16', 'torch.float16', 'torch.uint32', 'torch.int64', 'torch.float16')": { + "kwargs": { + "B1": 64, + "B2": 64, + "BK": 64 + }, + "num_warps": 2, + "num_ctas": 1, + "num_stages": 4, + "num_buffers_warp_spec": 0, + "num_consumer_groups": 0, + "reg_dec_producer": 0, + "reg_inc_consumer": 0, + "maxnreg": null, + "pre_hook": null + }, + "(7, 1024, 1024, 27, True, 'torch.float32', 'torch.float32', 'torch.float32', 'torch.uint32', 'torch.int64', 'torch.float32')": { + "kwargs": { + "B1": 64, + "B2": 64, + "BK": 64 + }, + "num_warps": 2, + "num_ctas": 1, + "num_stages": 4, + "num_buffers_warp_spec": 0, + "num_consumer_groups": 0, + "reg_dec_producer": 0, + "reg_inc_consumer": 0, + "maxnreg": null, + "pre_hook": null + }, + "(9, 1024, 1024, 27, True, 'torch.float32', 'torch.float32', 'torch.float32', 'torch.uint32', 'torch.int64', 'torch.float32')": { + "kwargs": { + "B1": 64, + "B2": 128, + "BK": 32 + }, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 4, + "num_buffers_warp_spec": 0, + "num_consumer_groups": 0, + "reg_dec_producer": 0, + "reg_inc_consumer": 0, + "maxnreg": null, + "pre_hook": null + }, + "(11, 1024, 1024, 27, True, 'torch.float32', 'torch.float32', 'torch.float32', 'torch.uint32', 'torch.int64', 'torch.float32')": { + "kwargs": { + "B1": 128, + "B2": 256, + "BK": 32 + }, + "num_warps": 8, + "num_ctas": 1, + "num_stages": 4, + "num_buffers_warp_spec": 0, + "num_consumer_groups": 0, + "reg_dec_producer": 0, + "reg_inc_consumer": 0, + "maxnreg": null, + "pre_hook": null + }, + "(13, 1024, 1024, 27, True, 'torch.float32', 'torch.float32', 'torch.float32', 'torch.uint32', 'torch.int64', 'torch.float32')": { + "kwargs": { + "B1": 128, + "B2": 256, + "BK": 32 + }, + "num_warps": 8, + "num_ctas": 1, + "num_stages": 4, + "num_buffers_warp_spec": 0, + "num_consumer_groups": 0, + "reg_dec_producer": 0, + "reg_inc_consumer": 0, + "maxnreg": null, + "pre_hook": null + }, + "(15, 512, 512, 27, True, 'torch.float32', 'torch.float32', 'torch.float32', 'torch.uint32', 'torch.int64', 'torch.float32')": { + "kwargs": { + "B1": 128, + "B2": 256, + "BK": 32 + }, + "num_warps": 8, + "num_ctas": 1, + "num_stages": 4, + "num_buffers_warp_spec": 0, + "num_consumer_groups": 0, + "reg_dec_producer": 0, + "reg_inc_consumer": 0, + "maxnreg": null, + "pre_hook": null + }, + "(17, 256, 256, 27, True, 'torch.float32', 'torch.float32', 'torch.float32', 'torch.uint32', 'torch.int64', 'torch.float32')": { + "kwargs": { + "B1": 128, + "B2": 256, + "BK": 32 + }, + "num_warps": 8, + "num_ctas": 1, + "num_stages": 4, + "num_buffers_warp_spec": 0, + "num_consumer_groups": 0, + "reg_dec_producer": 0, + "reg_inc_consumer": 0, + "maxnreg": null, + "pre_hook": null + }, + "(19, 128, 128, 27, True, 'torch.float32', 'torch.float32', 'torch.float32', 'torch.uint32', 'torch.int64', 'torch.float32')": { + "kwargs": { + "B1": 128, + "B2": 64, + "BK": 32 + }, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 4, + "num_buffers_warp_spec": 0, + "num_consumer_groups": 0, + "reg_dec_producer": 0, + "reg_inc_consumer": 0, + "maxnreg": null, + "pre_hook": null + }, + "(18, 512, 512, 27, True, 'torch.float16', 'torch.float16', 'torch.float16', 'torch.uint32', 'torch.int64', 'torch.float16')": { + "kwargs": { + "B1": 128, + "B2": 256, + "BK": 64 + }, + "num_warps": 8, + "num_ctas": 1, + "num_stages": 4, + "num_buffers_warp_spec": 0, + "num_consumer_groups": 0, + "reg_dec_producer": 0, + "reg_inc_consumer": 0, + "maxnreg": null, + "pre_hook": null + }, + "(18, 512, 2048, 27, True, 'torch.float16', 'torch.float16', 'torch.float16', 'torch.uint32', 'torch.int64', 'torch.float16')": { + "kwargs": { + "B1": 64, + "B2": 256, + "BK": 64 + }, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 4, + "num_buffers_warp_spec": 0, + "num_consumer_groups": 0, + "reg_dec_producer": 0, + "reg_inc_consumer": 0, + "maxnreg": null, + "pre_hook": null + }, + "(20, 256, 256, 27, True, 'torch.float16', 'torch.float16', 'torch.float16', 'torch.uint32', 'torch.int64', 'torch.float16')": { + "kwargs": { + "B1": 64, + "B2": 128, + "BK": 32 + }, + "num_warps": 2, + "num_ctas": 1, + "num_stages": 4, + "num_buffers_warp_spec": 0, + "num_consumer_groups": 0, + "reg_dec_producer": 0, + "reg_inc_consumer": 0, + "maxnreg": null, + "pre_hook": null + }, + "(20, 256, 1024, 27, True, 'torch.float16', 'torch.float16', 'torch.float16', 'torch.uint32', 'torch.int64', 'torch.float16')": { + "kwargs": { + "B1": 128, + "B2": 256, + "BK": 64 + }, + "num_warps": 8, + "num_ctas": 1, + "num_stages": 4, + "num_buffers_warp_spec": 0, + "num_consumer_groups": 0, + "reg_dec_producer": 0, + "reg_inc_consumer": 0, + "maxnreg": null, + "pre_hook": null + }, + "(22, 128, 128, 27, True, 'torch.float16', 'torch.float16', 'torch.float16', 'torch.uint32', 'torch.int64', 'torch.float16')": { + "kwargs": { + "B1": 128, + "B2": 128, + "BK": 64 + }, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 4, + "num_buffers_warp_spec": 0, + "num_consumer_groups": 0, + "reg_dec_producer": 0, + "reg_inc_consumer": 0, + "maxnreg": null, + "pre_hook": null + }, + "(22, 128, 512, 27, True, 'torch.float16', 'torch.float16', 'torch.float16', 'torch.uint32', 'torch.int64', 'torch.float16')": { + "kwargs": { + "B1": 64, + "B2": 256, + "BK": 32 + }, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 4, + "num_buffers_warp_spec": 0, + "num_consumer_groups": 0, + "reg_dec_producer": 0, + "reg_inc_consumer": 0, + "maxnreg": null, + "pre_hook": null + }, + "(24, 64, 64, 27, True, 'torch.float16', 'torch.float16', 'torch.float16', 'torch.uint32', 'torch.int64', 'torch.float16')": { + "kwargs": { + "B1": 128, + "B2": 64, + "BK": 32 + }, + "num_warps": 2, + "num_ctas": 1, + "num_stages": 4, + "num_buffers_warp_spec": 0, + "num_consumer_groups": 0, + "reg_dec_producer": 0, + "reg_inc_consumer": 0, + "maxnreg": null, + "pre_hook": null + } + }, + "flex_gemm.kernels.triton.spconv.sparse_submanifold_conv_fwd_masked_implicit_gemm_splitk.sparse_submanifold_conv_fwd_masked_implicit_gemm_splitk_kernel": { + "(16, 128, 128, 27, 2, True, 'torch.float16', 'torch.float16', 'torch.float16', 'torch.uint32', 'torch.int64', 'torch.float32')": { + "kwargs": { + "B1": 128, + "B2": 128, + "BK": 64 + }, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 4, + "num_buffers_warp_spec": 0, + "num_consumer_groups": 0, + "reg_dec_producer": 0, + "reg_inc_consumer": 0, + "maxnreg": null, + "pre_hook": null + }, + "(16, 128, 32, 27, 2, True, 'torch.float16', 'torch.float16', 'torch.float16', 'torch.uint32', 'torch.int64', 'torch.float32')": { + "kwargs": { + "B1": 256, + "B2": 64, + "BK": 32 + }, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 4, + "num_buffers_warp_spec": 0, + "num_consumer_groups": 0, + "reg_dec_producer": 0, + "reg_inc_consumer": 0, + "maxnreg": null, + "pre_hook": null + }, + "(14, 256, 256, 27, 2, True, 'torch.float16', 'torch.float16', 'torch.float16', 'torch.uint32', 'torch.int64', 'torch.float32')": { + "kwargs": { + "B1": 128, + "B2": 128, + "BK": 64 + }, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 4, + "num_buffers_warp_spec": 0, + "num_consumer_groups": 0, + "reg_dec_producer": 0, + "reg_inc_consumer": 0, + "maxnreg": null, + "pre_hook": null + }, + "(14, 256, 256, 27, 4, True, 'torch.float16', 'torch.float16', 'torch.float16', 'torch.uint32', 'torch.int64', 'torch.float32')": { + "kwargs": { + "B1": 128, + "B2": 128, + "BK": 64 + }, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 4, + "num_buffers_warp_spec": 0, + "num_consumer_groups": 0, + "reg_dec_producer": 0, + "reg_inc_consumer": 0, + "maxnreg": null, + "pre_hook": null + }, + "(14, 256, 256, 27, 8, True, 'torch.float16', 'torch.float16', 'torch.float16', 'torch.uint32', 'torch.int64', 'torch.float32')": { + "kwargs": { + "B1": 128, + "B2": 128, + "BK": 64 + }, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 4, + "num_buffers_warp_spec": 0, + "num_consumer_groups": 0, + "reg_dec_producer": 0, + "reg_inc_consumer": 0, + "maxnreg": null, + "pre_hook": null + }, + "(14, 256, 64, 27, 2, True, 'torch.float16', 'torch.float16', 'torch.float16', 'torch.uint32', 'torch.int64', 'torch.float32')": { + "kwargs": { + "B1": 64, + "B2": 64, + "BK": 64 + }, + "num_warps": 2, + "num_ctas": 1, + "num_stages": 4, + "num_buffers_warp_spec": 0, + "num_consumer_groups": 0, + "reg_dec_producer": 0, + "reg_inc_consumer": 0, + "maxnreg": null, + "pre_hook": null + }, + "(14, 256, 64, 27, 4, True, 'torch.float16', 'torch.float16', 'torch.float16', 'torch.uint32', 'torch.int64', 'torch.float32')": { + "kwargs": { + "B1": 64, + "B2": 64, + "BK": 64 + }, + "num_warps": 2, + "num_ctas": 1, + "num_stages": 4, + "num_buffers_warp_spec": 0, + "num_consumer_groups": 0, + "reg_dec_producer": 0, + "reg_inc_consumer": 0, + "maxnreg": null, + "pre_hook": null + }, + "(14, 256, 64, 27, 8, True, 'torch.float16', 'torch.float16', 'torch.float16', 'torch.uint32', 'torch.int64', 'torch.float32')": { + "kwargs": { + "B1": 256, + "B2": 64, + "BK": 32 + }, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 4, + "num_buffers_warp_spec": 0, + "num_consumer_groups": 0, + "reg_dec_producer": 0, + "reg_inc_consumer": 0, + "maxnreg": null, + "pre_hook": null + }, + "(14, 256, 64, 27, 16, True, 'torch.float16', 'torch.float16', 'torch.float16', 'torch.uint32', 'torch.int64', 'torch.float32')": { + "kwargs": { + "B1": 256, + "B2": 64, + "BK": 32 + }, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 4, + "num_buffers_warp_spec": 0, + "num_consumer_groups": 0, + "reg_dec_producer": 0, + "reg_inc_consumer": 0, + "maxnreg": null, + "pre_hook": null + }, + "(12, 512, 512, 27, 2, True, 'torch.float16', 'torch.float16', 'torch.float16', 'torch.uint32', 'torch.int64', 'torch.float32')": { + "kwargs": { + "B1": 128, + "B2": 128, + "BK": 64 + }, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 4, + "num_buffers_warp_spec": 0, + "num_consumer_groups": 0, + "reg_dec_producer": 0, + "reg_inc_consumer": 0, + "maxnreg": null, + "pre_hook": null + }, + "(12, 512, 512, 27, 4, True, 'torch.float16', 'torch.float16', 'torch.float16', 'torch.uint32', 'torch.int64', 'torch.float32')": { + "kwargs": { + "B1": 128, + "B2": 128, + "BK": 64 + }, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 4, + "num_buffers_warp_spec": 0, + "num_consumer_groups": 0, + "reg_dec_producer": 0, + "reg_inc_consumer": 0, + "maxnreg": null, + "pre_hook": null + }, + "(12, 512, 512, 27, 8, True, 'torch.float16', 'torch.float16', 'torch.float16', 'torch.uint32', 'torch.int64', 'torch.float32')": { + "kwargs": { + "B1": 128, + "B2": 128, + "BK": 64 + }, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 4, + "num_buffers_warp_spec": 0, + "num_consumer_groups": 0, + "reg_dec_producer": 0, + "reg_inc_consumer": 0, + "maxnreg": null, + "pre_hook": null + }, + "(12, 512, 512, 27, 16, True, 'torch.float16', 'torch.float16', 'torch.float16', 'torch.uint32', 'torch.int64', 'torch.float32')": { + "kwargs": { + "B1": 128, + "B2": 128, + "BK": 64 + }, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 4, + "num_buffers_warp_spec": 0, + "num_consumer_groups": 0, + "reg_dec_producer": 0, + "reg_inc_consumer": 0, + "maxnreg": null, + "pre_hook": null + }, + "(12, 512, 128, 27, 2, True, 'torch.float16', 'torch.float16', 'torch.float16', 'torch.uint32', 'torch.int64', 'torch.float32')": { + "kwargs": { + "B1": 64, + "B2": 64, + "BK": 64 + }, + "num_warps": 2, + "num_ctas": 1, + "num_stages": 4, + "num_buffers_warp_spec": 0, + "num_consumer_groups": 0, + "reg_dec_producer": 0, + "reg_inc_consumer": 0, + "maxnreg": null, + "pre_hook": null + }, + "(12, 512, 128, 27, 4, True, 'torch.float16', 'torch.float16', 'torch.float16', 'torch.uint32', 'torch.int64', 'torch.float32')": { + "kwargs": { + "B1": 128, + "B2": 128, + "BK": 64 + }, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 4, + "num_buffers_warp_spec": 0, + "num_consumer_groups": 0, + "reg_dec_producer": 0, + "reg_inc_consumer": 0, + "maxnreg": null, + "pre_hook": null + }, + "(12, 512, 128, 27, 8, True, 'torch.float16', 'torch.float16', 'torch.float16', 'torch.uint32', 'torch.int64', 'torch.float32')": { + "kwargs": { + "B1": 128, + "B2": 128, + "BK": 64 + }, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 4, + "num_buffers_warp_spec": 0, + "num_consumer_groups": 0, + "reg_dec_producer": 0, + "reg_inc_consumer": 0, + "maxnreg": null, + "pre_hook": null + }, + "(12, 512, 128, 27, 16, True, 'torch.float16', 'torch.float16', 'torch.float16', 'torch.uint32', 'torch.int64', 'torch.float32')": { + "kwargs": { + "B1": 128, + "B2": 128, + "BK": 64 + }, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 4, + "num_buffers_warp_spec": 0, + "num_consumer_groups": 0, + "reg_dec_producer": 0, + "reg_inc_consumer": 0, + "maxnreg": null, + "pre_hook": null + }, + "(12, 512, 128, 27, 32, True, 'torch.float16', 'torch.float16', 'torch.float16', 'torch.uint32', 'torch.int64', 'torch.float32')": { + "kwargs": { + "B1": 128, + "B2": 128, + "BK": 64 + }, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 4, + "num_buffers_warp_spec": 0, + "num_consumer_groups": 0, + "reg_dec_producer": 0, + "reg_inc_consumer": 0, + "maxnreg": null, + "pre_hook": null + }, + "(12, 512, 128, 27, 64, True, 'torch.float16', 'torch.float16', 'torch.float16', 'torch.uint32', 'torch.int64', 'torch.float32')": { + "kwargs": { + "B1": 128, + "B2": 128, + "BK": 64 + }, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 4, + "num_buffers_warp_spec": 0, + "num_consumer_groups": 0, + "reg_dec_producer": 0, + "reg_inc_consumer": 0, + "maxnreg": null, + "pre_hook": null + }, + "(10, 1024, 1024, 27, 2, True, 'torch.float16', 'torch.float16', 'torch.float16', 'torch.uint32', 'torch.int64', 'torch.float32')": { + "kwargs": { + "B1": 128, + "B2": 128, + "BK": 64 + }, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 4, + "num_buffers_warp_spec": 0, + "num_consumer_groups": 0, + "reg_dec_producer": 0, + "reg_inc_consumer": 0, + "maxnreg": null, + "pre_hook": null + }, + "(10, 1024, 1024, 27, 4, True, 'torch.float16', 'torch.float16', 'torch.float16', 'torch.uint32', 'torch.int64', 'torch.float32')": { + "kwargs": { + "B1": 128, + "B2": 128, + "BK": 64 + }, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 4, + "num_buffers_warp_spec": 0, + "num_consumer_groups": 0, + "reg_dec_producer": 0, + "reg_inc_consumer": 0, + "maxnreg": null, + "pre_hook": null + }, + "(10, 1024, 1024, 27, 8, True, 'torch.float16', 'torch.float16', 'torch.float16', 'torch.uint32', 'torch.int64', 'torch.float32')": { + "kwargs": { + "B1": 128, + "B2": 128, + "BK": 64 + }, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 4, + "num_buffers_warp_spec": 0, + "num_consumer_groups": 0, + "reg_dec_producer": 0, + "reg_inc_consumer": 0, + "maxnreg": null, + "pre_hook": null + }, + "(10, 1024, 1024, 27, 16, True, 'torch.float16', 'torch.float16', 'torch.float16', 'torch.uint32', 'torch.int64', 'torch.float32')": { + "kwargs": { + "B1": 128, + "B2": 128, + "BK": 64 + }, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 4, + "num_buffers_warp_spec": 0, + "num_consumer_groups": 0, + "reg_dec_producer": 0, + "reg_inc_consumer": 0, + "maxnreg": null, + "pre_hook": null + }, + "(10, 1024, 1024, 27, 32, True, 'torch.float16', 'torch.float16', 'torch.float16', 'torch.uint32', 'torch.int64', 'torch.float32')": { + "kwargs": { + "B1": 128, + "B2": 128, + "BK": 64 + }, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 4, + "num_buffers_warp_spec": 0, + "num_consumer_groups": 0, + "reg_dec_producer": 0, + "reg_inc_consumer": 0, + "maxnreg": null, + "pre_hook": null + }, + "(10, 1024, 4096, 27, 2, True, 'torch.float16', 'torch.float16', 'torch.float16', 'torch.uint32', 'torch.int64', 'torch.float32')": { + "kwargs": { + "B1": 128, + "B2": 128, + "BK": 64 + }, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 4, + "num_buffers_warp_spec": 0, + "num_consumer_groups": 0, + "reg_dec_producer": 0, + "reg_inc_consumer": 0, + "maxnreg": null, + "pre_hook": null + }, + "(10, 1024, 4096, 27, 4, True, 'torch.float16', 'torch.float16', 'torch.float16', 'torch.uint32', 'torch.int64', 'torch.float32')": { + "kwargs": { + "B1": 128, + "B2": 128, + "BK": 64 + }, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 4, + "num_buffers_warp_spec": 0, + "num_consumer_groups": 0, + "reg_dec_producer": 0, + "reg_inc_consumer": 0, + "maxnreg": null, + "pre_hook": null + }, + "(10, 1024, 4096, 27, 8, True, 'torch.float16', 'torch.float16', 'torch.float16', 'torch.uint32', 'torch.int64', 'torch.float32')": { + "kwargs": { + "B1": 128, + "B2": 128, + "BK": 64 + }, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 4, + "num_buffers_warp_spec": 0, + "num_consumer_groups": 0, + "reg_dec_producer": 0, + "reg_inc_consumer": 0, + "maxnreg": null, + "pre_hook": null + }, + "(12, 512, 2048, 27, 2, True, 'torch.float16', 'torch.float16', 'torch.float16', 'torch.uint32', 'torch.int64', 'torch.float32')": { + "kwargs": { + "B1": 128, + "B2": 128, + "BK": 64 + }, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 4, + "num_buffers_warp_spec": 0, + "num_consumer_groups": 0, + "reg_dec_producer": 0, + "reg_inc_consumer": 0, + "maxnreg": null, + "pre_hook": null + }, + "(12, 512, 2048, 27, 4, True, 'torch.float16', 'torch.float16', 'torch.float16', 'torch.uint32', 'torch.int64', 'torch.float32')": { + "kwargs": { + "B1": 128, + "B2": 128, + "BK": 64 + }, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 4, + "num_buffers_warp_spec": 0, + "num_consumer_groups": 0, + "reg_dec_producer": 0, + "reg_inc_consumer": 0, + "maxnreg": null, + "pre_hook": null + }, + "(14, 256, 1024, 27, 2, True, 'torch.float16', 'torch.float16', 'torch.float16', 'torch.uint32', 'torch.int64', 'torch.float32')": { + "kwargs": { + "B1": 128, + "B2": 128, + "BK": 64 + }, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 4, + "num_buffers_warp_spec": 0, + "num_consumer_groups": 0, + "reg_dec_producer": 0, + "reg_inc_consumer": 0, + "maxnreg": null, + "pre_hook": null + }, + "(17, 128, 128, 27, 2, True, 'torch.float16', 'torch.float16', 'torch.float16', 'torch.uint32', 'torch.int64', 'torch.float32')": { + "kwargs": { + "B1": 128, + "B2": 128, + "BK": 64 + }, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 4, + "num_buffers_warp_spec": 0, + "num_consumer_groups": 0, + "reg_dec_producer": 0, + "reg_inc_consumer": 0, + "maxnreg": null, + "pre_hook": null + }, + "(17, 128, 32, 27, 2, True, 'torch.float16', 'torch.float16', 'torch.float16', 'torch.uint32', 'torch.int64', 'torch.float32')": { + "kwargs": { + "B1": 256, + "B2": 64, + "BK": 32 + }, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 4, + "num_buffers_warp_spec": 0, + "num_consumer_groups": 0, + "reg_dec_producer": 0, + "reg_inc_consumer": 0, + "maxnreg": null, + "pre_hook": null + }, + "(15, 256, 256, 27, 2, True, 'torch.float16', 'torch.float16', 'torch.float16', 'torch.uint32', 'torch.int64', 'torch.float32')": { + "kwargs": { + "B1": 128, + "B2": 128, + "BK": 64 + }, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 4, + "num_buffers_warp_spec": 0, + "num_consumer_groups": 0, + "reg_dec_producer": 0, + "reg_inc_consumer": 0, + "maxnreg": null, + "pre_hook": null + }, + "(15, 256, 256, 27, 4, True, 'torch.float16', 'torch.float16', 'torch.float16', 'torch.uint32', 'torch.int64', 'torch.float32')": { + "kwargs": { + "B1": 128, + "B2": 128, + "BK": 64 + }, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 4, + "num_buffers_warp_spec": 0, + "num_consumer_groups": 0, + "reg_dec_producer": 0, + "reg_inc_consumer": 0, + "maxnreg": null, + "pre_hook": null + }, + "(15, 256, 64, 27, 2, True, 'torch.float16', 'torch.float16', 'torch.float16', 'torch.uint32', 'torch.int64', 'torch.float32')": { + "kwargs": { + "B1": 64, + "B2": 64, + "BK": 64 + }, + "num_warps": 2, + "num_ctas": 1, + "num_stages": 4, + "num_buffers_warp_spec": 0, + "num_consumer_groups": 0, + "reg_dec_producer": 0, + "reg_inc_consumer": 0, + "maxnreg": null, + "pre_hook": null + }, + "(15, 256, 64, 27, 4, True, 'torch.float16', 'torch.float16', 'torch.float16', 'torch.uint32', 'torch.int64', 'torch.float32')": { + "kwargs": { + "B1": 256, + "B2": 64, + "BK": 32 + }, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 4, + "num_buffers_warp_spec": 0, + "num_consumer_groups": 0, + "reg_dec_producer": 0, + "reg_inc_consumer": 0, + "maxnreg": null, + "pre_hook": null + }, + "(15, 256, 64, 27, 8, True, 'torch.float16', 'torch.float16', 'torch.float16', 'torch.uint32', 'torch.int64', 'torch.float32')": { + "kwargs": { + "B1": 256, + "B2": 64, + "BK": 32 + }, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 4, + "num_buffers_warp_spec": 0, + "num_consumer_groups": 0, + "reg_dec_producer": 0, + "reg_inc_consumer": 0, + "maxnreg": null, + "pre_hook": null + }, + "(13, 512, 512, 27, 2, True, 'torch.float16', 'torch.float16', 'torch.float16', 'torch.uint32', 'torch.int64', 'torch.float32')": { + "kwargs": { + "B1": 128, + "B2": 128, + "BK": 64 + }, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 4, + "num_buffers_warp_spec": 0, + "num_consumer_groups": 0, + "reg_dec_producer": 0, + "reg_inc_consumer": 0, + "maxnreg": null, + "pre_hook": null + }, + "(13, 512, 512, 27, 4, True, 'torch.float16', 'torch.float16', 'torch.float16', 'torch.uint32', 'torch.int64', 'torch.float32')": { + "kwargs": { + "B1": 128, + "B2": 128, + "BK": 64 + }, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 4, + "num_buffers_warp_spec": 0, + "num_consumer_groups": 0, + "reg_dec_producer": 0, + "reg_inc_consumer": 0, + "maxnreg": null, + "pre_hook": null + }, + "(13, 512, 512, 27, 8, True, 'torch.float16', 'torch.float16', 'torch.float16', 'torch.uint32', 'torch.int64', 'torch.float32')": { + "kwargs": { + "B1": 128, + "B2": 128, + "BK": 64 + }, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 4, + "num_buffers_warp_spec": 0, + "num_consumer_groups": 0, + "reg_dec_producer": 0, + "reg_inc_consumer": 0, + "maxnreg": null, + "pre_hook": null + }, + "(13, 512, 128, 27, 2, True, 'torch.float16', 'torch.float16', 'torch.float16', 'torch.uint32', 'torch.int64', 'torch.float32')": { + "kwargs": { + "B1": 128, + "B2": 128, + "BK": 64 + }, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 4, + "num_buffers_warp_spec": 0, + "num_consumer_groups": 0, + "reg_dec_producer": 0, + "reg_inc_consumer": 0, + "maxnreg": null, + "pre_hook": null + }, + "(13, 512, 128, 27, 4, True, 'torch.float16', 'torch.float16', 'torch.float16', 'torch.uint32', 'torch.int64', 'torch.float32')": { + "kwargs": { + "B1": 128, + "B2": 128, + "BK": 64 + }, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 4, + "num_buffers_warp_spec": 0, + "num_consumer_groups": 0, + "reg_dec_producer": 0, + "reg_inc_consumer": 0, + "maxnreg": null, + "pre_hook": null + }, + "(13, 512, 128, 27, 8, True, 'torch.float16', 'torch.float16', 'torch.float16', 'torch.uint32', 'torch.int64', 'torch.float32')": { + "kwargs": { + "B1": 128, + "B2": 128, + "BK": 64 + }, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 4, + "num_buffers_warp_spec": 0, + "num_consumer_groups": 0, + "reg_dec_producer": 0, + "reg_inc_consumer": 0, + "maxnreg": null, + "pre_hook": null + }, + "(13, 512, 128, 27, 16, True, 'torch.float16', 'torch.float16', 'torch.float16', 'torch.uint32', 'torch.int64', 'torch.float32')": { + "kwargs": { + "B1": 128, + "B2": 128, + "BK": 64 + }, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 4, + "num_buffers_warp_spec": 0, + "num_consumer_groups": 0, + "reg_dec_producer": 0, + "reg_inc_consumer": 0, + "maxnreg": null, + "pre_hook": null + }, + "(13, 512, 128, 27, 32, True, 'torch.float16', 'torch.float16', 'torch.float16', 'torch.uint32', 'torch.int64', 'torch.float32')": { + "kwargs": { + "B1": 128, + "B2": 128, + "BK": 64 + }, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 4, + "num_buffers_warp_spec": 0, + "num_consumer_groups": 0, + "reg_dec_producer": 0, + "reg_inc_consumer": 0, + "maxnreg": null, + "pre_hook": null + }, + "(11, 1024, 1024, 27, 2, True, 'torch.float16', 'torch.float16', 'torch.float16', 'torch.uint32', 'torch.int64', 'torch.float32')": { + "kwargs": { + "B1": 64, + "B2": 256, + "BK": 64 + }, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 4, + "num_buffers_warp_spec": 0, + "num_consumer_groups": 0, + "reg_dec_producer": 0, + "reg_inc_consumer": 0, + "maxnreg": null, + "pre_hook": null + }, + "(11, 1024, 1024, 27, 4, True, 'torch.float16', 'torch.float16', 'torch.float16', 'torch.uint32', 'torch.int64', 'torch.float32')": { + "kwargs": { + "B1": 128, + "B2": 128, + "BK": 64 + }, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 4, + "num_buffers_warp_spec": 0, + "num_consumer_groups": 0, + "reg_dec_producer": 0, + "reg_inc_consumer": 0, + "maxnreg": null, + "pre_hook": null + }, + "(11, 1024, 1024, 27, 8, True, 'torch.float16', 'torch.float16', 'torch.float16', 'torch.uint32', 'torch.int64', 'torch.float32')": { + "kwargs": { + "B1": 128, + "B2": 128, + "BK": 64 + }, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 4, + "num_buffers_warp_spec": 0, + "num_consumer_groups": 0, + "reg_dec_producer": 0, + "reg_inc_consumer": 0, + "maxnreg": null, + "pre_hook": null + }, + "(11, 1024, 1024, 27, 16, True, 'torch.float16', 'torch.float16', 'torch.float16', 'torch.uint32', 'torch.int64', 'torch.float32')": { + "kwargs": { + "B1": 128, + "B2": 128, + "BK": 64 + }, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 4, + "num_buffers_warp_spec": 0, + "num_consumer_groups": 0, + "reg_dec_producer": 0, + "reg_inc_consumer": 0, + "maxnreg": null, + "pre_hook": null + }, + "(11, 1024, 4096, 27, 2, True, 'torch.float16', 'torch.float16', 'torch.float16', 'torch.uint32', 'torch.int64', 'torch.float32')": { + "kwargs": { + "B1": 128, + "B2": 256, + "BK": 64 + }, + "num_warps": 8, + "num_ctas": 1, + "num_stages": 4, + "num_buffers_warp_spec": 0, + "num_consumer_groups": 0, + "reg_dec_producer": 0, + "reg_inc_consumer": 0, + "maxnreg": null, + "pre_hook": null + }, + "(11, 1024, 4096, 27, 4, True, 'torch.float16', 'torch.float16', 'torch.float16', 'torch.uint32', 'torch.int64', 'torch.float32')": { + "kwargs": { + "B1": 128, + "B2": 256, + "BK": 64 + }, + "num_warps": 8, + "num_ctas": 1, + "num_stages": 4, + "num_buffers_warp_spec": 0, + "num_consumer_groups": 0, + "reg_dec_producer": 0, + "reg_inc_consumer": 0, + "maxnreg": null, + "pre_hook": null + }, + "(13, 512, 2048, 27, 2, True, 'torch.float16', 'torch.float16', 'torch.float16', 'torch.uint32', 'torch.int64', 'torch.float32')": { + "kwargs": { + "B1": 128, + "B2": 128, + "BK": 64 + }, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 4, + "num_buffers_warp_spec": 0, + "num_consumer_groups": 0, + "reg_dec_producer": 0, + "reg_inc_consumer": 0, + "maxnreg": null, + "pre_hook": null + }, + "(11, 512, 512, 27, 2, True, 'torch.float16', 'torch.float16', 'torch.float16', 'torch.uint32', 'torch.int64', 'torch.float32')": { + "kwargs": { + "B1": 128, + "B2": 128, + "BK": 64 + }, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 4, + "num_buffers_warp_spec": 0, + "num_consumer_groups": 0, + "reg_dec_producer": 0, + "reg_inc_consumer": 0, + "maxnreg": null, + "pre_hook": null + }, + "(11, 512, 512, 27, 4, True, 'torch.float16', 'torch.float16', 'torch.float16', 'torch.uint32', 'torch.int64', 'torch.float32')": { + "kwargs": { + "B1": 128, + "B2": 128, + "BK": 64 + }, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 4, + "num_buffers_warp_spec": 0, + "num_consumer_groups": 0, + "reg_dec_producer": 0, + "reg_inc_consumer": 0, + "maxnreg": null, + "pre_hook": null + }, + "(11, 512, 512, 27, 8, True, 'torch.float16', 'torch.float16', 'torch.float16', 'torch.uint32', 'torch.int64', 'torch.float32')": { + "kwargs": { + "B1": 128, + "B2": 128, + "BK": 64 + }, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 4, + "num_buffers_warp_spec": 0, + "num_consumer_groups": 0, + "reg_dec_producer": 0, + "reg_inc_consumer": 0, + "maxnreg": null, + "pre_hook": null + }, + "(11, 512, 512, 27, 16, True, 'torch.float16', 'torch.float16', 'torch.float16', 'torch.uint32', 'torch.int64', 'torch.float32')": { + "kwargs": { + "B1": 128, + "B2": 128, + "BK": 64 + }, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 4, + "num_buffers_warp_spec": 0, + "num_consumer_groups": 0, + "reg_dec_producer": 0, + "reg_inc_consumer": 0, + "maxnreg": null, + "pre_hook": null + }, + "(11, 512, 128, 27, 2, True, 'torch.float16', 'torch.float16', 'torch.float16', 'torch.uint32', 'torch.int64', 'torch.float32')": { + "kwargs": { + "B1": 64, + "B2": 64, + "BK": 64 + }, + "num_warps": 2, + "num_ctas": 1, + "num_stages": 4, + "num_buffers_warp_spec": 0, + "num_consumer_groups": 0, + "reg_dec_producer": 0, + "reg_inc_consumer": 0, + "maxnreg": null, + "pre_hook": null + }, + "(11, 512, 128, 27, 4, True, 'torch.float16', 'torch.float16', 'torch.float16', 'torch.uint32', 'torch.int64', 'torch.float32')": { + "kwargs": { + "B1": 64, + "B2": 128, + "BK": 32 + }, + "num_warps": 2, + "num_ctas": 1, + "num_stages": 4, + "num_buffers_warp_spec": 0, + "num_consumer_groups": 0, + "reg_dec_producer": 0, + "reg_inc_consumer": 0, + "maxnreg": null, + "pre_hook": null + }, + "(11, 512, 128, 27, 8, True, 'torch.float16', 'torch.float16', 'torch.float16', 'torch.uint32', 'torch.int64', 'torch.float32')": { + "kwargs": { + "B1": 128, + "B2": 128, + "BK": 64 + }, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 4, + "num_buffers_warp_spec": 0, + "num_consumer_groups": 0, + "reg_dec_producer": 0, + "reg_inc_consumer": 0, + "maxnreg": null, + "pre_hook": null + }, + "(11, 512, 128, 27, 16, True, 'torch.float16', 'torch.float16', 'torch.float16', 'torch.uint32', 'torch.int64', 'torch.float32')": { + "kwargs": { + "B1": 128, + "B2": 128, + "BK": 64 + }, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 4, + "num_buffers_warp_spec": 0, + "num_consumer_groups": 0, + "reg_dec_producer": 0, + "reg_inc_consumer": 0, + "maxnreg": null, + "pre_hook": null + }, + "(11, 512, 128, 27, 32, True, 'torch.float16', 'torch.float16', 'torch.float16', 'torch.uint32', 'torch.int64', 'torch.float32')": { + "kwargs": { + "B1": 128, + "B2": 128, + "BK": 64 + }, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 4, + "num_buffers_warp_spec": 0, + "num_consumer_groups": 0, + "reg_dec_producer": 0, + "reg_inc_consumer": 0, + "maxnreg": null, + "pre_hook": null + }, + "(11, 512, 128, 27, 64, True, 'torch.float16', 'torch.float16', 'torch.float16', 'torch.uint32', 'torch.int64', 'torch.float32')": { + "kwargs": { + "B1": 128, + "B2": 128, + "BK": 64 + }, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 4, + "num_buffers_warp_spec": 0, + "num_consumer_groups": 0, + "reg_dec_producer": 0, + "reg_inc_consumer": 0, + "maxnreg": null, + "pre_hook": null + }, + "(9, 1024, 1024, 27, 2, True, 'torch.float16', 'torch.float16', 'torch.float16', 'torch.uint32', 'torch.int64', 'torch.float32')": { + "kwargs": { + "B1": 64, + "B2": 256, + "BK": 64 + }, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 4, + "num_buffers_warp_spec": 0, + "num_consumer_groups": 0, + "reg_dec_producer": 0, + "reg_inc_consumer": 0, + "maxnreg": null, + "pre_hook": null + }, + "(9, 1024, 1024, 27, 4, True, 'torch.float16', 'torch.float16', 'torch.float16', 'torch.uint32', 'torch.int64', 'torch.float32')": { + "kwargs": { + "B1": 128, + "B2": 128, + "BK": 64 + }, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 4, + "num_buffers_warp_spec": 0, + "num_consumer_groups": 0, + "reg_dec_producer": 0, + "reg_inc_consumer": 0, + "maxnreg": null, + "pre_hook": null + }, + "(9, 1024, 1024, 27, 8, True, 'torch.float16', 'torch.float16', 'torch.float16', 'torch.uint32', 'torch.int64', 'torch.float32')": { + "kwargs": { + "B1": 128, + "B2": 128, + "BK": 64 + }, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 4, + "num_buffers_warp_spec": 0, + "num_consumer_groups": 0, + "reg_dec_producer": 0, + "reg_inc_consumer": 0, + "maxnreg": null, + "pre_hook": null + }, + "(9, 1024, 1024, 27, 16, True, 'torch.float16', 'torch.float16', 'torch.float16', 'torch.uint32', 'torch.int64', 'torch.float32')": { + "kwargs": { + "B1": 128, + "B2": 128, + "BK": 64 + }, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 4, + "num_buffers_warp_spec": 0, + "num_consumer_groups": 0, + "reg_dec_producer": 0, + "reg_inc_consumer": 0, + "maxnreg": null, + "pre_hook": null + }, + "(9, 1024, 1024, 27, 32, True, 'torch.float16', 'torch.float16', 'torch.float16', 'torch.uint32', 'torch.int64', 'torch.float32')": { + "kwargs": { + "B1": 128, + "B2": 128, + "BK": 64 + }, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 4, + "num_buffers_warp_spec": 0, + "num_consumer_groups": 0, + "reg_dec_producer": 0, + "reg_inc_consumer": 0, + "maxnreg": null, + "pre_hook": null + }, + "(9, 1024, 4096, 27, 2, True, 'torch.float16', 'torch.float16', 'torch.float16', 'torch.uint32', 'torch.int64', 'torch.float32')": { + "kwargs": { + "B1": 64, + "B2": 256, + "BK": 64 + }, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 4, + "num_buffers_warp_spec": 0, + "num_consumer_groups": 0, + "reg_dec_producer": 0, + "reg_inc_consumer": 0, + "maxnreg": null, + "pre_hook": null + }, + "(9, 1024, 4096, 27, 4, True, 'torch.float16', 'torch.float16', 'torch.float16', 'torch.uint32', 'torch.int64', 'torch.float32')": { + "kwargs": { + "B1": 128, + "B2": 128, + "BK": 64 + }, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 4, + "num_buffers_warp_spec": 0, + "num_consumer_groups": 0, + "reg_dec_producer": 0, + "reg_inc_consumer": 0, + "maxnreg": null, + "pre_hook": null + }, + "(9, 1024, 4096, 27, 8, True, 'torch.float16', 'torch.float16', 'torch.float16', 'torch.uint32', 'torch.int64', 'torch.float32')": { + "kwargs": { + "B1": 128, + "B2": 128, + "BK": 64 + }, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 4, + "num_buffers_warp_spec": 0, + "num_consumer_groups": 0, + "reg_dec_producer": 0, + "reg_inc_consumer": 0, + "maxnreg": null, + "pre_hook": null + }, + "(11, 512, 2048, 27, 2, True, 'torch.float16', 'torch.float16', 'torch.float16', 'torch.uint32', 'torch.int64', 'torch.float32')": { + "kwargs": { + "B1": 128, + "B2": 128, + "BK": 64 + }, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 4, + "num_buffers_warp_spec": 0, + "num_consumer_groups": 0, + "reg_dec_producer": 0, + "reg_inc_consumer": 0, + "maxnreg": null, + "pre_hook": null + }, + "(11, 512, 2048, 27, 4, True, 'torch.float16', 'torch.float16', 'torch.float16', 'torch.uint32', 'torch.int64', 'torch.float32')": { + "kwargs": { + "B1": 128, + "B2": 128, + "BK": 64 + }, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 4, + "num_buffers_warp_spec": 0, + "num_consumer_groups": 0, + "reg_dec_producer": 0, + "reg_inc_consumer": 0, + "maxnreg": null, + "pre_hook": null + }, + "(13, 256, 256, 27, 2, True, 'torch.float16', 'torch.float16', 'torch.float16', 'torch.uint32', 'torch.int64', 'torch.float32')": { + "kwargs": { + "B1": 128, + "B2": 128, + "BK": 64 + }, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 4, + "num_buffers_warp_spec": 0, + "num_consumer_groups": 0, + "reg_dec_producer": 0, + "reg_inc_consumer": 0, + "maxnreg": null, + "pre_hook": null + }, + "(13, 256, 256, 27, 4, True, 'torch.float16', 'torch.float16', 'torch.float16', 'torch.uint32', 'torch.int64', 'torch.float32')": { + "kwargs": { + "B1": 128, + "B2": 128, + "BK": 64 + }, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 4, + "num_buffers_warp_spec": 0, + "num_consumer_groups": 0, + "reg_dec_producer": 0, + "reg_inc_consumer": 0, + "maxnreg": null, + "pre_hook": null + }, + "(13, 256, 256, 27, 8, True, 'torch.float16', 'torch.float16', 'torch.float16', 'torch.uint32', 'torch.int64', 'torch.float32')": { + "kwargs": { + "B1": 128, + "B2": 128, + "BK": 64 + }, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 4, + "num_buffers_warp_spec": 0, + "num_consumer_groups": 0, + "reg_dec_producer": 0, + "reg_inc_consumer": 0, + "maxnreg": null, + "pre_hook": null + }, + "(13, 256, 64, 27, 2, True, 'torch.float16', 'torch.float16', 'torch.float16', 'torch.uint32', 'torch.int64', 'torch.float32')": { + "kwargs": { + "B1": 64, + "B2": 64, + "BK": 64 + }, + "num_warps": 2, + "num_ctas": 1, + "num_stages": 4, + "num_buffers_warp_spec": 0, + "num_consumer_groups": 0, + "reg_dec_producer": 0, + "reg_inc_consumer": 0, + "maxnreg": null, + "pre_hook": null + }, + "(13, 256, 64, 27, 4, True, 'torch.float16', 'torch.float16', 'torch.float16', 'torch.uint32', 'torch.int64', 'torch.float32')": { + "kwargs": { + "B1": 64, + "B2": 64, + "BK": 64 + }, + "num_warps": 2, + "num_ctas": 1, + "num_stages": 4, + "num_buffers_warp_spec": 0, + "num_consumer_groups": 0, + "reg_dec_producer": 0, + "reg_inc_consumer": 0, + "maxnreg": null, + "pre_hook": null + }, + "(13, 256, 64, 27, 8, True, 'torch.float16', 'torch.float16', 'torch.float16', 'torch.uint32', 'torch.int64', 'torch.float32')": { + "kwargs": { + "B1": 64, + "B2": 64, + "BK": 64 + }, + "num_warps": 2, + "num_ctas": 1, + "num_stages": 4, + "num_buffers_warp_spec": 0, + "num_consumer_groups": 0, + "reg_dec_producer": 0, + "reg_inc_consumer": 0, + "maxnreg": null, + "pre_hook": null + }, + "(13, 256, 64, 27, 16, True, 'torch.float16', 'torch.float16', 'torch.float16', 'torch.uint32', 'torch.int64', 'torch.float32')": { + "kwargs": { + "B1": 256, + "B2": 64, + "BK": 32 + }, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 4, + "num_buffers_warp_spec": 0, + "num_consumer_groups": 0, + "reg_dec_producer": 0, + "reg_inc_consumer": 0, + "maxnreg": null, + "pre_hook": null + }, + "(13, 256, 1024, 27, 2, True, 'torch.float16', 'torch.float16', 'torch.float16', 'torch.uint32', 'torch.int64', 'torch.float32')": { + "kwargs": { + "B1": 128, + "B2": 128, + "BK": 64 + }, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 4, + "num_buffers_warp_spec": 0, + "num_consumer_groups": 0, + "reg_dec_producer": 0, + "reg_inc_consumer": 0, + "maxnreg": null, + "pre_hook": null + }, + "(15, 128, 128, 27, 2, True, 'torch.float16', 'torch.float16', 'torch.float16', 'torch.uint32', 'torch.int64', 'torch.float32')": { + "kwargs": { + "B1": 128, + "B2": 128, + "BK": 64 + }, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 4, + "num_buffers_warp_spec": 0, + "num_consumer_groups": 0, + "reg_dec_producer": 0, + "reg_inc_consumer": 0, + "maxnreg": null, + "pre_hook": null + }, + "(15, 128, 128, 27, 4, True, 'torch.float16', 'torch.float16', 'torch.float16', 'torch.uint32', 'torch.int64', 'torch.float32')": { + "kwargs": { + "B1": 128, + "B2": 128, + "BK": 64 + }, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 4, + "num_buffers_warp_spec": 0, + "num_consumer_groups": 0, + "reg_dec_producer": 0, + "reg_inc_consumer": 0, + "maxnreg": null, + "pre_hook": null + }, + "(15, 128, 32, 27, 2, True, 'torch.float16', 'torch.float16', 'torch.float16', 'torch.uint32', 'torch.int64', 'torch.float32')": { + "kwargs": { + "B1": 256, + "B2": 64, + "BK": 32 + }, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 4, + "num_buffers_warp_spec": 0, + "num_consumer_groups": 0, + "reg_dec_producer": 0, + "reg_inc_consumer": 0, + "maxnreg": null, + "pre_hook": null + }, + "(15, 128, 32, 27, 4, True, 'torch.float16', 'torch.float16', 'torch.float16', 'torch.uint32', 'torch.int64', 'torch.float32')": { + "kwargs": { + "B1": 256, + "B2": 64, + "BK": 32 + }, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 4, + "num_buffers_warp_spec": 0, + "num_consumer_groups": 0, + "reg_dec_producer": 0, + "reg_inc_consumer": 0, + "maxnreg": null, + "pre_hook": null + }, + "(8, 1024, 1024, 27, 2, True, 'torch.float16', 'torch.float16', 'torch.float16', 'torch.uint32', 'torch.int64', 'torch.float32')": { + "kwargs": { + "B1": 64, + "B2": 64, + "BK": 64 + }, + "num_warps": 2, + "num_ctas": 1, + "num_stages": 4, + "num_buffers_warp_spec": 0, + "num_consumer_groups": 0, + "reg_dec_producer": 0, + "reg_inc_consumer": 0, + "maxnreg": null, + "pre_hook": null + }, + "(8, 1024, 1024, 27, 4, True, 'torch.float16', 'torch.float16', 'torch.float16', 'torch.uint32', 'torch.int64', 'torch.float32')": { + "kwargs": { + "B1": 128, + "B2": 128, + "BK": 64 + }, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 4, + "num_buffers_warp_spec": 0, + "num_consumer_groups": 0, + "reg_dec_producer": 0, + "reg_inc_consumer": 0, + "maxnreg": null, + "pre_hook": null + }, + "(8, 1024, 1024, 27, 8, True, 'torch.float16', 'torch.float16', 'torch.float16', 'torch.uint32', 'torch.int64', 'torch.float32')": { + "kwargs": { + "B1": 128, + "B2": 128, + "BK": 64 + }, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 4, + "num_buffers_warp_spec": 0, + "num_consumer_groups": 0, + "reg_dec_producer": 0, + "reg_inc_consumer": 0, + "maxnreg": null, + "pre_hook": null + }, + "(8, 1024, 1024, 27, 16, True, 'torch.float16', 'torch.float16', 'torch.float16', 'torch.uint32', 'torch.int64', 'torch.float32')": { + "kwargs": { + "B1": 128, + "B2": 128, + "BK": 64 + }, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 4, + "num_buffers_warp_spec": 0, + "num_consumer_groups": 0, + "reg_dec_producer": 0, + "reg_inc_consumer": 0, + "maxnreg": null, + "pre_hook": null + }, + "(8, 1024, 1024, 27, 32, True, 'torch.float16', 'torch.float16', 'torch.float16', 'torch.uint32', 'torch.int64', 'torch.float32')": { + "kwargs": { + "B1": 128, + "B2": 128, + "BK": 64 + }, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 4, + "num_buffers_warp_spec": 0, + "num_consumer_groups": 0, + "reg_dec_producer": 0, + "reg_inc_consumer": 0, + "maxnreg": null, + "pre_hook": null + }, + "(8, 1024, 1024, 27, 64, True, 'torch.float16', 'torch.float16', 'torch.float16', 'torch.uint32', 'torch.int64', 'torch.float32')": { + "kwargs": { + "B1": 128, + "B2": 128, + "BK": 64 + }, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 4, + "num_buffers_warp_spec": 0, + "num_consumer_groups": 0, + "reg_dec_producer": 0, + "reg_inc_consumer": 0, + "maxnreg": null, + "pre_hook": null + }, + "(8, 1024, 4096, 27, 2, True, 'torch.float16', 'torch.float16', 'torch.float16', 'torch.uint32', 'torch.int64', 'torch.float32')": { + "kwargs": { + "B1": 128, + "B2": 128, + "BK": 64 + }, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 4, + "num_buffers_warp_spec": 0, + "num_consumer_groups": 0, + "reg_dec_producer": 0, + "reg_inc_consumer": 0, + "maxnreg": null, + "pre_hook": null + }, + "(8, 1024, 4096, 27, 4, True, 'torch.float16', 'torch.float16', 'torch.float16', 'torch.uint32', 'torch.int64', 'torch.float32')": { + "kwargs": { + "B1": 128, + "B2": 128, + "BK": 64 + }, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 4, + "num_buffers_warp_spec": 0, + "num_consumer_groups": 0, + "reg_dec_producer": 0, + "reg_inc_consumer": 0, + "maxnreg": null, + "pre_hook": null + }, + "(8, 1024, 4096, 27, 8, True, 'torch.float16', 'torch.float16', 'torch.float16', 'torch.uint32', 'torch.int64', 'torch.float32')": { + "kwargs": { + "B1": 128, + "B2": 128, + "BK": 64 + }, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 4, + "num_buffers_warp_spec": 0, + "num_consumer_groups": 0, + "reg_dec_producer": 0, + "reg_inc_consumer": 0, + "maxnreg": null, + "pre_hook": null + }, + "(8, 1024, 4096, 27, 16, True, 'torch.float16', 'torch.float16', 'torch.float16', 'torch.uint32', 'torch.int64', 'torch.float32')": { + "kwargs": { + "B1": 128, + "B2": 128, + "BK": 64 + }, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 4, + "num_buffers_warp_spec": 0, + "num_consumer_groups": 0, + "reg_dec_producer": 0, + "reg_inc_consumer": 0, + "maxnreg": null, + "pre_hook": null + }, + "(10, 512, 512, 27, 2, True, 'torch.float16', 'torch.float16', 'torch.float16', 'torch.uint32', 'torch.int64', 'torch.float32')": { + "kwargs": { + "B1": 128, + "B2": 128, + "BK": 64 + }, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 4, + "num_buffers_warp_spec": 0, + "num_consumer_groups": 0, + "reg_dec_producer": 0, + "reg_inc_consumer": 0, + "maxnreg": null, + "pre_hook": null + }, + "(10, 512, 512, 27, 4, True, 'torch.float16', 'torch.float16', 'torch.float16', 'torch.uint32', 'torch.int64', 'torch.float32')": { + "kwargs": { + "B1": 128, + "B2": 128, + "BK": 64 + }, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 4, + "num_buffers_warp_spec": 0, + "num_consumer_groups": 0, + "reg_dec_producer": 0, + "reg_inc_consumer": 0, + "maxnreg": null, + "pre_hook": null + }, + "(10, 512, 512, 27, 8, True, 'torch.float16', 'torch.float16', 'torch.float16', 'torch.uint32', 'torch.int64', 'torch.float32')": { + "kwargs": { + "B1": 128, + "B2": 128, + "BK": 64 + }, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 4, + "num_buffers_warp_spec": 0, + "num_consumer_groups": 0, + "reg_dec_producer": 0, + "reg_inc_consumer": 0, + "maxnreg": null, + "pre_hook": null + }, + "(10, 512, 512, 27, 16, True, 'torch.float16', 'torch.float16', 'torch.float16', 'torch.uint32', 'torch.int64', 'torch.float32')": { + "kwargs": { + "B1": 128, + "B2": 128, + "BK": 64 + }, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 4, + "num_buffers_warp_spec": 0, + "num_consumer_groups": 0, + "reg_dec_producer": 0, + "reg_inc_consumer": 0, + "maxnreg": null, + "pre_hook": null + }, + "(10, 512, 512, 27, 32, True, 'torch.float16', 'torch.float16', 'torch.float16', 'torch.uint32', 'torch.int64', 'torch.float32')": { + "kwargs": { + "B1": 128, + "B2": 128, + "BK": 64 + }, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 4, + "num_buffers_warp_spec": 0, + "num_consumer_groups": 0, + "reg_dec_producer": 0, + "reg_inc_consumer": 0, + "maxnreg": null, + "pre_hook": null + }, + "(10, 512, 128, 27, 4, True, 'torch.float16', 'torch.float16', 'torch.float16', 'torch.uint32', 'torch.int64', 'torch.float32')": { + "kwargs": { + "B1": 64, + "B2": 64, + "BK": 64 + }, + "num_warps": 2, + "num_ctas": 1, + "num_stages": 4, + "num_buffers_warp_spec": 0, + "num_consumer_groups": 0, + "reg_dec_producer": 0, + "reg_inc_consumer": 0, + "maxnreg": null, + "pre_hook": null + }, + "(10, 512, 128, 27, 8, True, 'torch.float16', 'torch.float16', 'torch.float16', 'torch.uint32', 'torch.int64', 'torch.float32')": { + "kwargs": { + "B1": 64, + "B2": 64, + "BK": 32 + }, + "num_warps": 2, + "num_ctas": 1, + "num_stages": 4, + "num_buffers_warp_spec": 0, + "num_consumer_groups": 0, + "reg_dec_producer": 0, + "reg_inc_consumer": 0, + "maxnreg": null, + "pre_hook": null + }, + "(10, 512, 128, 27, 16, True, 'torch.float16', 'torch.float16', 'torch.float16', 'torch.uint32', 'torch.int64', 'torch.float32')": { + "kwargs": { + "B1": 128, + "B2": 128, + "BK": 32 + }, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 4, + "num_buffers_warp_spec": 0, + "num_consumer_groups": 0, + "reg_dec_producer": 0, + "reg_inc_consumer": 0, + "maxnreg": null, + "pre_hook": null + }, + "(10, 512, 128, 27, 32, True, 'torch.float16', 'torch.float16', 'torch.float16', 'torch.uint32', 'torch.int64', 'torch.float32')": { + "kwargs": { + "B1": 128, + "B2": 128, + "BK": 64 + }, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 4, + "num_buffers_warp_spec": 0, + "num_consumer_groups": 0, + "reg_dec_producer": 0, + "reg_inc_consumer": 0, + "maxnreg": null, + "pre_hook": null + }, + "(10, 512, 128, 27, 64, True, 'torch.float16', 'torch.float16', 'torch.float16', 'torch.uint32', 'torch.int64', 'torch.float32')": { + "kwargs": { + "B1": 128, + "B2": 128, + "BK": 64 + }, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 4, + "num_buffers_warp_spec": 0, + "num_consumer_groups": 0, + "reg_dec_producer": 0, + "reg_inc_consumer": 0, + "maxnreg": null, + "pre_hook": null + }, + "(10, 512, 128, 27, 128, True, 'torch.float16', 'torch.float16', 'torch.float16', 'torch.uint32', 'torch.int64', 'torch.float32')": { + "kwargs": { + "B1": 128, + "B2": 128, + "BK": 64 + }, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 4, + "num_buffers_warp_spec": 0, + "num_consumer_groups": 0, + "reg_dec_producer": 0, + "reg_inc_consumer": 0, + "maxnreg": null, + "pre_hook": null + }, + "(10, 512, 2048, 27, 2, True, 'torch.float16', 'torch.float16', 'torch.float16', 'torch.uint32', 'torch.int64', 'torch.float32')": { + "kwargs": { + "B1": 128, + "B2": 128, + "BK": 64 + }, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 4, + "num_buffers_warp_spec": 0, + "num_consumer_groups": 0, + "reg_dec_producer": 0, + "reg_inc_consumer": 0, + "maxnreg": null, + "pre_hook": null + }, + "(10, 512, 2048, 27, 4, True, 'torch.float16', 'torch.float16', 'torch.float16', 'torch.uint32', 'torch.int64', 'torch.float32')": { + "kwargs": { + "B1": 128, + "B2": 128, + "BK": 64 + }, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 4, + "num_buffers_warp_spec": 0, + "num_consumer_groups": 0, + "reg_dec_producer": 0, + "reg_inc_consumer": 0, + "maxnreg": null, + "pre_hook": null + }, + "(10, 512, 2048, 27, 8, True, 'torch.float16', 'torch.float16', 'torch.float16', 'torch.uint32', 'torch.int64', 'torch.float32')": { + "kwargs": { + "B1": 128, + "B2": 128, + "BK": 64 + }, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 4, + "num_buffers_warp_spec": 0, + "num_consumer_groups": 0, + "reg_dec_producer": 0, + "reg_inc_consumer": 0, + "maxnreg": null, + "pre_hook": null + }, + "(16, 256, 256, 27, 2, True, 'torch.float16', 'torch.float16', 'torch.float16', 'torch.uint32', 'torch.int64', 'torch.float32')": { + "kwargs": { + "B1": 128, + "B2": 128, + "BK": 64 + }, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 4, + "num_buffers_warp_spec": 0, + "num_consumer_groups": 0, + "reg_dec_producer": 0, + "reg_inc_consumer": 0, + "maxnreg": null, + "pre_hook": null + }, + "(16, 256, 64, 27, 2, True, 'torch.float16', 'torch.float16', 'torch.float16', 'torch.uint32', 'torch.int64', 'torch.float32')": { + "kwargs": { + "B1": 256, + "B2": 64, + "BK": 32 + }, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 4, + "num_buffers_warp_spec": 0, + "num_consumer_groups": 0, + "reg_dec_producer": 0, + "reg_inc_consumer": 0, + "maxnreg": null, + "pre_hook": null + }, + "(16, 256, 64, 27, 4, True, 'torch.float16', 'torch.float16', 'torch.float16', 'torch.uint32', 'torch.int64', 'torch.float32')": { + "kwargs": { + "B1": 256, + "B2": 64, + "BK": 32 + }, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 4, + "num_buffers_warp_spec": 0, + "num_consumer_groups": 0, + "reg_dec_producer": 0, + "reg_inc_consumer": 0, + "maxnreg": null, + "pre_hook": null + }, + "(12, 256, 256, 27, 2, True, 'torch.float16', 'torch.float16', 'torch.float16', 'torch.uint32', 'torch.int64', 'torch.float32')": { + "kwargs": { + "B1": 128, + "B2": 128, + "BK": 64 + }, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 4, + "num_buffers_warp_spec": 0, + "num_consumer_groups": 0, + "reg_dec_producer": 0, + "reg_inc_consumer": 0, + "maxnreg": null, + "pre_hook": null + }, + "(12, 256, 256, 27, 4, True, 'torch.float16', 'torch.float16', 'torch.float16', 'torch.uint32', 'torch.int64', 'torch.float32')": { + "kwargs": { + "B1": 128, + "B2": 128, + "BK": 64 + }, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 4, + "num_buffers_warp_spec": 0, + "num_consumer_groups": 0, + "reg_dec_producer": 0, + "reg_inc_consumer": 0, + "maxnreg": null, + "pre_hook": null + }, + "(12, 256, 256, 27, 8, True, 'torch.float16', 'torch.float16', 'torch.float16', 'torch.uint32', 'torch.int64', 'torch.float32')": { + "kwargs": { + "B1": 128, + "B2": 128, + "BK": 64 + }, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 4, + "num_buffers_warp_spec": 0, + "num_consumer_groups": 0, + "reg_dec_producer": 0, + "reg_inc_consumer": 0, + "maxnreg": null, + "pre_hook": null + }, + "(12, 256, 256, 27, 16, True, 'torch.float16', 'torch.float16', 'torch.float16', 'torch.uint32', 'torch.int64', 'torch.float32')": { + "kwargs": { + "B1": 128, + "B2": 128, + "BK": 64 + }, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 4, + "num_buffers_warp_spec": 0, + "num_consumer_groups": 0, + "reg_dec_producer": 0, + "reg_inc_consumer": 0, + "maxnreg": null, + "pre_hook": null + }, + "(12, 256, 64, 27, 2, True, 'torch.float16', 'torch.float16', 'torch.float16', 'torch.uint32', 'torch.int64', 'torch.float32')": { + "kwargs": { + "B1": 64, + "B2": 64, + "BK": 64 + }, + "num_warps": 2, + "num_ctas": 1, + "num_stages": 4, + "num_buffers_warp_spec": 0, + "num_consumer_groups": 0, + "reg_dec_producer": 0, + "reg_inc_consumer": 0, + "maxnreg": null, + "pre_hook": null + }, + "(12, 256, 64, 27, 4, True, 'torch.float16', 'torch.float16', 'torch.float16', 'torch.uint32', 'torch.int64', 'torch.float32')": { + "kwargs": { + "B1": 64, + "B2": 64, + "BK": 32 + }, + "num_warps": 2, + "num_ctas": 1, + "num_stages": 4, + "num_buffers_warp_spec": 0, + "num_consumer_groups": 0, + "reg_dec_producer": 0, + "reg_inc_consumer": 0, + "maxnreg": null, + "pre_hook": null + }, + "(12, 256, 64, 27, 8, True, 'torch.float16', 'torch.float16', 'torch.float16', 'torch.uint32', 'torch.int64', 'torch.float32')": { + "kwargs": { + "B1": 64, + "B2": 64, + "BK": 32 + }, + "num_warps": 2, + "num_ctas": 1, + "num_stages": 4, + "num_buffers_warp_spec": 0, + "num_consumer_groups": 0, + "reg_dec_producer": 0, + "reg_inc_consumer": 0, + "maxnreg": null, + "pre_hook": null + }, + "(12, 256, 64, 27, 16, True, 'torch.float16', 'torch.float16', 'torch.float16', 'torch.uint32', 'torch.int64', 'torch.float32')": { + "kwargs": { + "B1": 64, + "B2": 64, + "BK": 64 + }, + "num_warps": 2, + "num_ctas": 1, + "num_stages": 4, + "num_buffers_warp_spec": 0, + "num_consumer_groups": 0, + "reg_dec_producer": 0, + "reg_inc_consumer": 0, + "maxnreg": null, + "pre_hook": null + }, + "(12, 256, 64, 27, 32, True, 'torch.float16', 'torch.float16', 'torch.float16', 'torch.uint32', 'torch.int64', 'torch.float32')": { + "kwargs": { + "B1": 64, + "B2": 64, + "BK": 32 + }, + "num_warps": 2, + "num_ctas": 1, + "num_stages": 4, + "num_buffers_warp_spec": 0, + "num_consumer_groups": 0, + "reg_dec_producer": 0, + "reg_inc_consumer": 0, + "maxnreg": null, + "pre_hook": null + }, + "(12, 256, 1024, 27, 2, True, 'torch.float16', 'torch.float16', 'torch.float16', 'torch.uint32', 'torch.int64', 'torch.float32')": { + "kwargs": { + "B1": 128, + "B2": 128, + "BK": 64 + }, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 4, + "num_buffers_warp_spec": 0, + "num_consumer_groups": 0, + "reg_dec_producer": 0, + "reg_inc_consumer": 0, + "maxnreg": null, + "pre_hook": null + }, + "(12, 256, 1024, 27, 4, True, 'torch.float16', 'torch.float16', 'torch.float16', 'torch.uint32', 'torch.int64', 'torch.float32')": { + "kwargs": { + "B1": 128, + "B2": 128, + "BK": 64 + }, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 4, + "num_buffers_warp_spec": 0, + "num_consumer_groups": 0, + "reg_dec_producer": 0, + "reg_inc_consumer": 0, + "maxnreg": null, + "pre_hook": null + }, + "(17, 256, 64, 27, 2, True, 'torch.float16', 'torch.float16', 'torch.float16', 'torch.uint32', 'torch.int64', 'torch.float32')": { + "kwargs": { + "B1": 256, + "B2": 64, + "BK": 32 + }, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 4, + "num_buffers_warp_spec": 0, + "num_consumer_groups": 0, + "reg_dec_producer": 0, + "reg_inc_consumer": 0, + "maxnreg": null, + "pre_hook": null + }, + "(14, 512, 512, 27, 2, True, 'torch.float16', 'torch.float16', 'torch.float16', 'torch.uint32', 'torch.int64', 'torch.float32')": { + "kwargs": { + "B1": 128, + "B2": 128, + "BK": 64 + }, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 4, + "num_buffers_warp_spec": 0, + "num_consumer_groups": 0, + "reg_dec_producer": 0, + "reg_inc_consumer": 0, + "maxnreg": null, + "pre_hook": null + }, + "(14, 512, 512, 27, 4, True, 'torch.float16', 'torch.float16', 'torch.float16', 'torch.uint32', 'torch.int64', 'torch.float32')": { + "kwargs": { + "B1": 128, + "B2": 128, + "BK": 64 + }, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 4, + "num_buffers_warp_spec": 0, + "num_consumer_groups": 0, + "reg_dec_producer": 0, + "reg_inc_consumer": 0, + "maxnreg": null, + "pre_hook": null + }, + "(14, 512, 128, 27, 2, True, 'torch.float16', 'torch.float16', 'torch.float16', 'torch.uint32', 'torch.int64', 'torch.float32')": { + "kwargs": { + "B1": 128, + "B2": 128, + "BK": 64 + }, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 4, + "num_buffers_warp_spec": 0, + "num_consumer_groups": 0, + "reg_dec_producer": 0, + "reg_inc_consumer": 0, + "maxnreg": null, + "pre_hook": null + }, + "(14, 512, 128, 27, 4, True, 'torch.float16', 'torch.float16', 'torch.float16', 'torch.uint32', 'torch.int64', 'torch.float32')": { + "kwargs": { + "B1": 128, + "B2": 128, + "BK": 64 + }, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 4, + "num_buffers_warp_spec": 0, + "num_consumer_groups": 0, + "reg_dec_producer": 0, + "reg_inc_consumer": 0, + "maxnreg": null, + "pre_hook": null + }, + "(14, 512, 128, 27, 8, True, 'torch.float16', 'torch.float16', 'torch.float16', 'torch.uint32', 'torch.int64', 'torch.float32')": { + "kwargs": { + "B1": 128, + "B2": 128, + "BK": 64 + }, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 4, + "num_buffers_warp_spec": 0, + "num_consumer_groups": 0, + "reg_dec_producer": 0, + "reg_inc_consumer": 0, + "maxnreg": null, + "pre_hook": null + }, + "(14, 512, 128, 27, 16, True, 'torch.float16', 'torch.float16', 'torch.float16', 'torch.uint32', 'torch.int64', 'torch.float32')": { + "kwargs": { + "B1": 128, + "B2": 128, + "BK": 64 + }, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 4, + "num_buffers_warp_spec": 0, + "num_consumer_groups": 0, + "reg_dec_producer": 0, + "reg_inc_consumer": 0, + "maxnreg": null, + "pre_hook": null + }, + "(7, 1024, 1024, 27, 4, True, 'torch.float16', 'torch.float16', 'torch.float16', 'torch.uint32', 'torch.int64', 'torch.float32')": { + "kwargs": { + "B1": 64, + "B2": 64, + "BK": 64 + }, + "num_warps": 2, + "num_ctas": 1, + "num_stages": 4, + "num_buffers_warp_spec": 0, + "num_consumer_groups": 0, + "reg_dec_producer": 0, + "reg_inc_consumer": 0, + "maxnreg": null, + "pre_hook": null + }, + "(7, 1024, 1024, 27, 8, True, 'torch.float16', 'torch.float16', 'torch.float16', 'torch.uint32', 'torch.int64', 'torch.float32')": { + "kwargs": { + "B1": 128, + "B2": 128, + "BK": 64 + }, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 4, + "num_buffers_warp_spec": 0, + "num_consumer_groups": 0, + "reg_dec_producer": 0, + "reg_inc_consumer": 0, + "maxnreg": null, + "pre_hook": null + }, + "(7, 1024, 1024, 27, 16, True, 'torch.float16', 'torch.float16', 'torch.float16', 'torch.uint32', 'torch.int64', 'torch.float32')": { + "kwargs": { + "B1": 128, + "B2": 128, + "BK": 64 + }, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 4, + "num_buffers_warp_spec": 0, + "num_consumer_groups": 0, + "reg_dec_producer": 0, + "reg_inc_consumer": 0, + "maxnreg": null, + "pre_hook": null + }, + "(7, 1024, 1024, 27, 32, True, 'torch.float16', 'torch.float16', 'torch.float16', 'torch.uint32', 'torch.int64', 'torch.float32')": { + "kwargs": { + "B1": 128, + "B2": 128, + "BK": 64 + }, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 4, + "num_buffers_warp_spec": 0, + "num_consumer_groups": 0, + "reg_dec_producer": 0, + "reg_inc_consumer": 0, + "maxnreg": null, + "pre_hook": null + }, + "(7, 1024, 1024, 27, 64, True, 'torch.float16', 'torch.float16', 'torch.float16', 'torch.uint32', 'torch.int64', 'torch.float32')": { + "kwargs": { + "B1": 128, + "B2": 128, + "BK": 64 + }, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 4, + "num_buffers_warp_spec": 0, + "num_consumer_groups": 0, + "reg_dec_producer": 0, + "reg_inc_consumer": 0, + "maxnreg": null, + "pre_hook": null + }, + "(7, 1024, 1024, 27, 128, True, 'torch.float16', 'torch.float16', 'torch.float16', 'torch.uint32', 'torch.int64', 'torch.float32')": { + "kwargs": { + "B1": 128, + "B2": 128, + "BK": 64 + }, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 4, + "num_buffers_warp_spec": 0, + "num_consumer_groups": 0, + "reg_dec_producer": 0, + "reg_inc_consumer": 0, + "maxnreg": null, + "pre_hook": null + }, + "(7, 1024, 4096, 27, 2, True, 'torch.float16', 'torch.float16', 'torch.float16', 'torch.uint32', 'torch.int64', 'torch.float32')": { + "kwargs": { + "B1": 128, + "B2": 128, + "BK": 64 + }, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 4, + "num_buffers_warp_spec": 0, + "num_consumer_groups": 0, + "reg_dec_producer": 0, + "reg_inc_consumer": 0, + "maxnreg": null, + "pre_hook": null + }, + "(7, 1024, 4096, 27, 4, True, 'torch.float16', 'torch.float16', 'torch.float16', 'torch.uint32', 'torch.int64', 'torch.float32')": { + "kwargs": { + "B1": 128, + "B2": 128, + "BK": 64 + }, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 4, + "num_buffers_warp_spec": 0, + "num_consumer_groups": 0, + "reg_dec_producer": 0, + "reg_inc_consumer": 0, + "maxnreg": null, + "pre_hook": null + }, + "(7, 1024, 4096, 27, 8, True, 'torch.float16', 'torch.float16', 'torch.float16', 'torch.uint32', 'torch.int64', 'torch.float32')": { + "kwargs": { + "B1": 128, + "B2": 128, + "BK": 64 + }, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 4, + "num_buffers_warp_spec": 0, + "num_consumer_groups": 0, + "reg_dec_producer": 0, + "reg_inc_consumer": 0, + "maxnreg": null, + "pre_hook": null + }, + "(7, 1024, 4096, 27, 16, True, 'torch.float16', 'torch.float16', 'torch.float16', 'torch.uint32', 'torch.int64', 'torch.float32')": { + "kwargs": { + "B1": 128, + "B2": 128, + "BK": 64 + }, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 4, + "num_buffers_warp_spec": 0, + "num_consumer_groups": 0, + "reg_dec_producer": 0, + "reg_inc_consumer": 0, + "maxnreg": null, + "pre_hook": null + }, + "(7, 1024, 4096, 27, 32, True, 'torch.float16', 'torch.float16', 'torch.float16', 'torch.uint32', 'torch.int64', 'torch.float32')": { + "kwargs": { + "B1": 128, + "B2": 128, + "BK": 64 + }, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 4, + "num_buffers_warp_spec": 0, + "num_consumer_groups": 0, + "reg_dec_producer": 0, + "reg_inc_consumer": 0, + "maxnreg": null, + "pre_hook": null + }, + "(14, 128, 128, 27, 2, True, 'torch.float16', 'torch.float16', 'torch.float16', 'torch.uint32', 'torch.int64', 'torch.float32')": { + "kwargs": { + "B1": 128, + "B2": 128, + "BK": 64 + }, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 4, + "num_buffers_warp_spec": 0, + "num_consumer_groups": 0, + "reg_dec_producer": 0, + "reg_inc_consumer": 0, + "maxnreg": null, + "pre_hook": null + }, + "(14, 128, 128, 27, 4, True, 'torch.float16', 'torch.float16', 'torch.float16', 'torch.uint32', 'torch.int64', 'torch.float32')": { + "kwargs": { + "B1": 128, + "B2": 128, + "BK": 64 + }, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 4, + "num_buffers_warp_spec": 0, + "num_consumer_groups": 0, + "reg_dec_producer": 0, + "reg_inc_consumer": 0, + "maxnreg": null, + "pre_hook": null + }, + "(14, 128, 128, 27, 8, True, 'torch.float16', 'torch.float16', 'torch.float16', 'torch.uint32', 'torch.int64', 'torch.float32')": { + "kwargs": { + "B1": 128, + "B2": 128, + "BK": 64 + }, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 4, + "num_buffers_warp_spec": 0, + "num_consumer_groups": 0, + "reg_dec_producer": 0, + "reg_inc_consumer": 0, + "maxnreg": null, + "pre_hook": null + }, + "(14, 128, 32, 27, 2, True, 'torch.float16', 'torch.float16', 'torch.float16', 'torch.uint32', 'torch.int64', 'torch.float32')": { + "kwargs": { + "B1": 128, + "B2": 64, + "BK": 32 + }, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 4, + "num_buffers_warp_spec": 0, + "num_consumer_groups": 0, + "reg_dec_producer": 0, + "reg_inc_consumer": 0, + "maxnreg": null, + "pre_hook": null + }, + "(14, 128, 32, 27, 4, True, 'torch.float16', 'torch.float16', 'torch.float16', 'torch.uint32', 'torch.int64', 'torch.float32')": { + "kwargs": { + "B1": 256, + "B2": 64, + "BK": 32 + }, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 4, + "num_buffers_warp_spec": 0, + "num_consumer_groups": 0, + "reg_dec_producer": 0, + "reg_inc_consumer": 0, + "maxnreg": null, + "pre_hook": null + }, + "(14, 128, 32, 27, 8, True, 'torch.float16', 'torch.float16', 'torch.float16', 'torch.uint32', 'torch.int64', 'torch.float32')": { + "kwargs": { + "B1": 256, + "B2": 64, + "BK": 32 + }, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 4, + "num_buffers_warp_spec": 0, + "num_consumer_groups": 0, + "reg_dec_producer": 0, + "reg_inc_consumer": 0, + "maxnreg": null, + "pre_hook": null + }, + "(14, 128, 512, 27, 2, True, 'torch.float16', 'torch.float16', 'torch.float16', 'torch.uint32', 'torch.int64', 'torch.float32')": { + "kwargs": { + "B1": 128, + "B2": 128, + "BK": 64 + }, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 4, + "num_buffers_warp_spec": 0, + "num_consumer_groups": 0, + "reg_dec_producer": 0, + "reg_inc_consumer": 0, + "maxnreg": null, + "pre_hook": null + }, + "(16, 64, 64, 27, 2, True, 'torch.float16', 'torch.float16', 'torch.float16', 'torch.uint32', 'torch.int64', 'torch.float32')": { + "kwargs": { + "B1": 256, + "B2": 64, + "BK": 32 + }, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 4, + "num_buffers_warp_spec": 0, + "num_consumer_groups": 0, + "reg_dec_producer": 0, + "reg_inc_consumer": 0, + "maxnreg": null, + "pre_hook": null + }, + "(16, 64, 16, 27, 2, True, 'torch.float16', 'torch.float16', 'torch.float16', 'torch.uint32', 'torch.int64', 'torch.float32')": { + "kwargs": { + "B1": 256, + "B2": 64, + "BK": 32 + }, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 4, + "num_buffers_warp_spec": 0, + "num_consumer_groups": 0, + "reg_dec_producer": 0, + "reg_inc_consumer": 0, + "maxnreg": null, + "pre_hook": null + }, + "(12, 1024, 1024, 27, 2, True, 'torch.float16', 'torch.float16', 'torch.float16', 'torch.uint32', 'torch.int64', 'torch.float32')": { + "kwargs": { + "B1": 64, + "B2": 256, + "BK": 64 + }, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 4, + "num_buffers_warp_spec": 0, + "num_consumer_groups": 0, + "reg_dec_producer": 0, + "reg_inc_consumer": 0, + "maxnreg": null, + "pre_hook": null + }, + "(12, 1024, 1024, 27, 4, True, 'torch.float16', 'torch.float16', 'torch.float16', 'torch.uint32', 'torch.int64', 'torch.float32')": { + "kwargs": { + "B1": 64, + "B2": 256, + "BK": 64 + }, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 4, + "num_buffers_warp_spec": 0, + "num_consumer_groups": 0, + "reg_dec_producer": 0, + "reg_inc_consumer": 0, + "maxnreg": null, + "pre_hook": null + }, + "(10, 256, 256, 27, 2, True, 'torch.float16', 'torch.float16', 'torch.float16', 'torch.uint32', 'torch.int64', 'torch.float32')": { + "kwargs": { + "B1": 64, + "B2": 64, + "BK": 64 + }, + "num_warps": 2, + "num_ctas": 1, + "num_stages": 4, + "num_buffers_warp_spec": 0, + "num_consumer_groups": 0, + "reg_dec_producer": 0, + "reg_inc_consumer": 0, + "maxnreg": null, + "pre_hook": null + }, + "(10, 256, 256, 27, 4, True, 'torch.float16', 'torch.float16', 'torch.float16', 'torch.uint32', 'torch.int64', 'torch.float32')": { + "kwargs": { + "B1": 64, + "B2": 64, + "BK": 64 + }, + "num_warps": 2, + "num_ctas": 1, + "num_stages": 4, + "num_buffers_warp_spec": 0, + "num_consumer_groups": 0, + "reg_dec_producer": 0, + "reg_inc_consumer": 0, + "maxnreg": null, + "pre_hook": null + }, + "(10, 256, 256, 27, 8, True, 'torch.float16', 'torch.float16', 'torch.float16', 'torch.uint32', 'torch.int64', 'torch.float32')": { + "kwargs": { + "B1": 64, + "B2": 128, + "BK": 32 + }, + "num_warps": 2, + "num_ctas": 1, + "num_stages": 4, + "num_buffers_warp_spec": 0, + "num_consumer_groups": 0, + "reg_dec_producer": 0, + "reg_inc_consumer": 0, + "maxnreg": null, + "pre_hook": null + }, + "(10, 256, 256, 27, 16, True, 'torch.float16', 'torch.float16', 'torch.float16', 'torch.uint32', 'torch.int64', 'torch.float32')": { + "kwargs": { + "B1": 64, + "B2": 128, + "BK": 32 + }, + "num_warps": 2, + "num_ctas": 1, + "num_stages": 4, + "num_buffers_warp_spec": 0, + "num_consumer_groups": 0, + "reg_dec_producer": 0, + "reg_inc_consumer": 0, + "maxnreg": null, + "pre_hook": null + }, + "(10, 256, 256, 27, 32, True, 'torch.float16', 'torch.float16', 'torch.float16', 'torch.uint32', 'torch.int64', 'torch.float32')": { + "kwargs": { + "B1": 128, + "B2": 128, + "BK": 64 + }, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 4, + "num_buffers_warp_spec": 0, + "num_consumer_groups": 0, + "reg_dec_producer": 0, + "reg_inc_consumer": 0, + "maxnreg": null, + "pre_hook": null + }, + "(10, 256, 256, 27, 64, True, 'torch.float16', 'torch.float16', 'torch.float16', 'torch.uint32', 'torch.int64', 'torch.float32')": { + "kwargs": { + "B1": 128, + "B2": 128, + "BK": 64 + }, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 4, + "num_buffers_warp_spec": 0, + "num_consumer_groups": 0, + "reg_dec_producer": 0, + "reg_inc_consumer": 0, + "maxnreg": null, + "pre_hook": null + }, + "(12, 1024, 4096, 27, 2, True, 'torch.float16', 'torch.float16', 'torch.float16', 'torch.uint32', 'torch.int64', 'torch.float32')": { + "kwargs": { + "B1": 128, + "B2": 256, + "BK": 64 + }, + "num_warps": 8, + "num_ctas": 1, + "num_stages": 4, + "num_buffers_warp_spec": 0, + "num_consumer_groups": 0, + "reg_dec_producer": 0, + "reg_inc_consumer": 0, + "maxnreg": null, + "pre_hook": null + }, + "(15, 512, 512, 27, 2, True, 'torch.float16', 'torch.float16', 'torch.float16', 'torch.uint32', 'torch.int64', 'torch.float32')": { + "kwargs": { + "B1": 128, + "B2": 256, + "BK": 64 + }, + "num_warps": 8, + "num_ctas": 1, + "num_stages": 4, + "num_buffers_warp_spec": 0, + "num_consumer_groups": 0, + "reg_dec_producer": 0, + "reg_inc_consumer": 0, + "maxnreg": null, + "pre_hook": null + }, + "(15, 512, 128, 27, 2, True, 'torch.float16', 'torch.float16', 'torch.float16', 'torch.uint32', 'torch.int64', 'torch.float32')": { + "kwargs": { + "B1": 128, + "B2": 128, + "BK": 64 + }, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 4, + "num_buffers_warp_spec": 0, + "num_consumer_groups": 0, + "reg_dec_producer": 0, + "reg_inc_consumer": 0, + "maxnreg": null, + "pre_hook": null + }, + "(15, 512, 128, 27, 4, True, 'torch.float16', 'torch.float16', 'torch.float16', 'torch.uint32', 'torch.int64', 'torch.float32')": { + "kwargs": { + "B1": 128, + "B2": 128, + "BK": 64 + }, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 4, + "num_buffers_warp_spec": 0, + "num_consumer_groups": 0, + "reg_dec_producer": 0, + "reg_inc_consumer": 0, + "maxnreg": null, + "pre_hook": null + }, + "(15, 512, 128, 27, 8, True, 'torch.float16', 'torch.float16', 'torch.float16', 'torch.uint32', 'torch.int64', 'torch.float32')": { + "kwargs": { + "B1": 128, + "B2": 128, + "BK": 64 + }, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 4, + "num_buffers_warp_spec": 0, + "num_consumer_groups": 0, + "reg_dec_producer": 0, + "reg_inc_consumer": 0, + "maxnreg": null, + "pre_hook": null + }, + "(13, 1024, 1024, 27, 2, True, 'torch.float16', 'torch.float16', 'torch.float16', 'torch.uint32', 'torch.int64', 'torch.float32')": { + "kwargs": { + "B1": 128, + "B2": 256, + "BK": 64 + }, + "num_warps": 8, + "num_ctas": 1, + "num_stages": 4, + "num_buffers_warp_spec": 0, + "num_consumer_groups": 0, + "reg_dec_producer": 0, + "reg_inc_consumer": 0, + "maxnreg": null, + "pre_hook": null + }, + "(13, 1024, 1024, 27, 4, True, 'torch.float16', 'torch.float16', 'torch.float16', 'torch.uint32', 'torch.int64', 'torch.float32')": { + "kwargs": { + "B1": 128, + "B2": 128, + "BK": 64 + }, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 4, + "num_buffers_warp_spec": 0, + "num_consumer_groups": 0, + "reg_dec_producer": 0, + "reg_inc_consumer": 0, + "maxnreg": null, + "pre_hook": null + }, + "(16, 512, 128, 27, 2, True, 'torch.float16', 'torch.float16', 'torch.float16', 'torch.uint32', 'torch.int64', 'torch.float32')": { + "kwargs": { + "B1": 128, + "B2": 128, + "BK": 64 + }, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 4, + "num_buffers_warp_spec": 0, + "num_consumer_groups": 0, + "reg_dec_producer": 0, + "reg_inc_consumer": 0, + "maxnreg": null, + "pre_hook": null + }, + "(17, 512, 128, 27, 2, True, 'torch.float16', 'torch.float16', 'torch.float16', 'torch.uint32', 'torch.int64', 'torch.float32')": { + "kwargs": { + "B1": 128, + "B2": 128, + "BK": 64 + }, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 4, + "num_buffers_warp_spec": 0, + "num_consumer_groups": 0, + "reg_dec_producer": 0, + "reg_inc_consumer": 0, + "maxnreg": null, + "pre_hook": null + }, + "(11, 256, 256, 27, 2, True, 'torch.float16', 'torch.float16', 'torch.float16', 'torch.uint32', 'torch.int64', 'torch.float32')": { + "kwargs": { + "B1": 64, + "B2": 64, + "BK": 64 + }, + "num_warps": 2, + "num_ctas": 1, + "num_stages": 4, + "num_buffers_warp_spec": 0, + "num_consumer_groups": 0, + "reg_dec_producer": 0, + "reg_inc_consumer": 0, + "maxnreg": null, + "pre_hook": null + }, + "(11, 256, 256, 27, 4, True, 'torch.float16', 'torch.float16', 'torch.float16', 'torch.uint32', 'torch.int64', 'torch.float32')": { + "kwargs": { + "B1": 64, + "B2": 64, + "BK": 64 + }, + "num_warps": 2, + "num_ctas": 1, + "num_stages": 4, + "num_buffers_warp_spec": 0, + "num_consumer_groups": 0, + "reg_dec_producer": 0, + "reg_inc_consumer": 0, + "maxnreg": null, + "pre_hook": null + }, + "(11, 256, 256, 27, 8, True, 'torch.float16', 'torch.float16', 'torch.float16', 'torch.uint32', 'torch.int64', 'torch.float32')": { + "kwargs": { + "B1": 128, + "B2": 128, + "BK": 64 + }, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 4, + "num_buffers_warp_spec": 0, + "num_consumer_groups": 0, + "reg_dec_producer": 0, + "reg_inc_consumer": 0, + "maxnreg": null, + "pre_hook": null + }, + "(11, 256, 256, 27, 16, True, 'torch.float16', 'torch.float16', 'torch.float16', 'torch.uint32', 'torch.int64', 'torch.float32')": { + "kwargs": { + "B1": 128, + "B2": 128, + "BK": 64 + }, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 4, + "num_buffers_warp_spec": 0, + "num_consumer_groups": 0, + "reg_dec_producer": 0, + "reg_inc_consumer": 0, + "maxnreg": null, + "pre_hook": null + }, + "(11, 256, 256, 27, 32, True, 'torch.float16', 'torch.float16', 'torch.float16', 'torch.uint32', 'torch.int64', 'torch.float32')": { + "kwargs": { + "B1": 128, + "B2": 128, + "BK": 64 + }, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 4, + "num_buffers_warp_spec": 0, + "num_consumer_groups": 0, + "reg_dec_producer": 0, + "reg_inc_consumer": 0, + "maxnreg": null, + "pre_hook": null + }, + "(11, 256, 64, 27, 2, True, 'torch.float16', 'torch.float16', 'torch.float16', 'torch.uint32', 'torch.int64', 'torch.float32')": { + "kwargs": { + "B1": 64, + "B2": 64, + "BK": 64 + }, + "num_warps": 2, + "num_ctas": 1, + "num_stages": 4, + "num_buffers_warp_spec": 0, + "num_consumer_groups": 0, + "reg_dec_producer": 0, + "reg_inc_consumer": 0, + "maxnreg": null, + "pre_hook": null + }, + "(11, 256, 64, 27, 4, True, 'torch.float16', 'torch.float16', 'torch.float16', 'torch.uint32', 'torch.int64', 'torch.float32')": { + "kwargs": { + "B1": 64, + "B2": 64, + "BK": 64 + }, + "num_warps": 2, + "num_ctas": 1, + "num_stages": 4, + "num_buffers_warp_spec": 0, + "num_consumer_groups": 0, + "reg_dec_producer": 0, + "reg_inc_consumer": 0, + "maxnreg": null, + "pre_hook": null + }, + "(11, 256, 64, 27, 8, True, 'torch.float16', 'torch.float16', 'torch.float16', 'torch.uint32', 'torch.int64', 'torch.float32')": { + "kwargs": { + "B1": 64, + "B2": 64, + "BK": 32 + }, + "num_warps": 2, + "num_ctas": 1, + "num_stages": 4, + "num_buffers_warp_spec": 0, + "num_consumer_groups": 0, + "reg_dec_producer": 0, + "reg_inc_consumer": 0, + "maxnreg": null, + "pre_hook": null + }, + "(11, 256, 64, 27, 16, True, 'torch.float16', 'torch.float16', 'torch.float16', 'torch.uint32', 'torch.int64', 'torch.float32')": { + "kwargs": { + "B1": 64, + "B2": 64, + "BK": 64 + }, + "num_warps": 2, + "num_ctas": 1, + "num_stages": 4, + "num_buffers_warp_spec": 0, + "num_consumer_groups": 0, + "reg_dec_producer": 0, + "reg_inc_consumer": 0, + "maxnreg": null, + "pre_hook": null + }, + "(11, 256, 64, 27, 32, True, 'torch.float16', 'torch.float16', 'torch.float16', 'torch.uint32', 'torch.int64', 'torch.float32')": { + "kwargs": { + "B1": 64, + "B2": 64, + "BK": 32 + }, + "num_warps": 2, + "num_ctas": 1, + "num_stages": 4, + "num_buffers_warp_spec": 0, + "num_consumer_groups": 0, + "reg_dec_producer": 0, + "reg_inc_consumer": 0, + "maxnreg": null, + "pre_hook": null + }, + "(11, 256, 64, 27, 64, True, 'torch.float16', 'torch.float16', 'torch.float16', 'torch.uint32', 'torch.int64', 'torch.float32')": { + "kwargs": { + "B1": 64, + "B2": 64, + "BK": 32 + }, + "num_warps": 2, + "num_ctas": 1, + "num_stages": 4, + "num_buffers_warp_spec": 0, + "num_consumer_groups": 0, + "reg_dec_producer": 0, + "reg_inc_consumer": 0, + "maxnreg": null, + "pre_hook": null + }, + "(14, 64, 64, 27, 2, True, 'torch.float16', 'torch.float16', 'torch.float16', 'torch.uint32', 'torch.int64', 'torch.float32')": { + "kwargs": { + "B1": 256, + "B2": 64, + "BK": 32 + }, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 4, + "num_buffers_warp_spec": 0, + "num_consumer_groups": 0, + "reg_dec_producer": 0, + "reg_inc_consumer": 0, + "maxnreg": null, + "pre_hook": null + }, + "(14, 64, 64, 27, 4, True, 'torch.float16', 'torch.float16', 'torch.float16', 'torch.uint32', 'torch.int64', 'torch.float32')": { + "kwargs": { + "B1": 256, + "B2": 64, + "BK": 32 + }, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 4, + "num_buffers_warp_spec": 0, + "num_consumer_groups": 0, + "reg_dec_producer": 0, + "reg_inc_consumer": 0, + "maxnreg": null, + "pre_hook": null + }, + "(14, 64, 64, 27, 8, True, 'torch.float16', 'torch.float16', 'torch.float16', 'torch.uint32', 'torch.int64', 'torch.float32')": { + "kwargs": { + "B1": 64, + "B2": 64, + "BK": 32 + }, + "num_warps": 2, + "num_ctas": 1, + "num_stages": 4, + "num_buffers_warp_spec": 0, + "num_consumer_groups": 0, + "reg_dec_producer": 0, + "reg_inc_consumer": 0, + "maxnreg": null, + "pre_hook": null + }, + "(14, 64, 64, 27, 16, True, 'torch.float16', 'torch.float16', 'torch.float16', 'torch.uint32', 'torch.int64', 'torch.float32')": { + "kwargs": { + "B1": 64, + "B2": 64, + "BK": 32 + }, + "num_warps": 2, + "num_ctas": 1, + "num_stages": 4, + "num_buffers_warp_spec": 0, + "num_consumer_groups": 0, + "reg_dec_producer": 0, + "reg_inc_consumer": 0, + "maxnreg": null, + "pre_hook": null + }, + "(14, 64, 16, 27, 2, True, 'torch.float16', 'torch.float16', 'torch.float16', 'torch.uint32', 'torch.int64', 'torch.float32')": { + "kwargs": { + "B1": 256, + "B2": 64, + "BK": 32 + }, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 4, + "num_buffers_warp_spec": 0, + "num_consumer_groups": 0, + "reg_dec_producer": 0, + "reg_inc_consumer": 0, + "maxnreg": null, + "pre_hook": null + }, + "(14, 64, 16, 27, 4, True, 'torch.float16', 'torch.float16', 'torch.float16', 'torch.uint32', 'torch.int64', 'torch.float32')": { + "kwargs": { + "B1": 256, + "B2": 64, + "BK": 32 + }, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 4, + "num_buffers_warp_spec": 0, + "num_consumer_groups": 0, + "reg_dec_producer": 0, + "reg_inc_consumer": 0, + "maxnreg": null, + "pre_hook": null + }, + "(14, 64, 16, 27, 8, True, 'torch.float16', 'torch.float16', 'torch.float16', 'torch.uint32', 'torch.int64', 'torch.float32')": { + "kwargs": { + "B1": 256, + "B2": 64, + "BK": 32 + }, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 4, + "num_buffers_warp_spec": 0, + "num_consumer_groups": 0, + "reg_dec_producer": 0, + "reg_inc_consumer": 0, + "maxnreg": null, + "pre_hook": null + }, + "(14, 64, 16, 27, 16, True, 'torch.float16', 'torch.float16', 'torch.float16', 'torch.uint32', 'torch.int64', 'torch.float32')": { + "kwargs": { + "B1": 256, + "B2": 64, + "BK": 32 + }, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 4, + "num_buffers_warp_spec": 0, + "num_consumer_groups": 0, + "reg_dec_producer": 0, + "reg_inc_consumer": 0, + "maxnreg": null, + "pre_hook": null + }, + "(12, 128, 128, 27, 2, True, 'torch.float16', 'torch.float16', 'torch.float16', 'torch.uint32', 'torch.int64', 'torch.float32')": { + "kwargs": { + "B1": 64, + "B2": 64, + "BK": 64 + }, + "num_warps": 2, + "num_ctas": 1, + "num_stages": 4, + "num_buffers_warp_spec": 0, + "num_consumer_groups": 0, + "reg_dec_producer": 0, + "reg_inc_consumer": 0, + "maxnreg": null, + "pre_hook": null + }, + "(12, 128, 128, 27, 4, True, 'torch.float16', 'torch.float16', 'torch.float16', 'torch.uint32', 'torch.int64', 'torch.float32')": { + "kwargs": { + "B1": 64, + "B2": 128, + "BK": 32 + }, + "num_warps": 2, + "num_ctas": 1, + "num_stages": 4, + "num_buffers_warp_spec": 0, + "num_consumer_groups": 0, + "reg_dec_producer": 0, + "reg_inc_consumer": 0, + "maxnreg": null, + "pre_hook": null + }, + "(12, 128, 128, 27, 8, True, 'torch.float16', 'torch.float16', 'torch.float16', 'torch.uint32', 'torch.int64', 'torch.float32')": { + "kwargs": { + "B1": 128, + "B2": 128, + "BK": 64 + }, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 4, + "num_buffers_warp_spec": 0, + "num_consumer_groups": 0, + "reg_dec_producer": 0, + "reg_inc_consumer": 0, + "maxnreg": null, + "pre_hook": null + }, + "(12, 128, 128, 27, 16, True, 'torch.float16', 'torch.float16', 'torch.float16', 'torch.uint32', 'torch.int64', 'torch.float32')": { + "kwargs": { + "B1": 128, + "B2": 128, + "BK": 64 + }, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 4, + "num_buffers_warp_spec": 0, + "num_consumer_groups": 0, + "reg_dec_producer": 0, + "reg_inc_consumer": 0, + "maxnreg": null, + "pre_hook": null + }, + "(12, 128, 128, 27, 32, True, 'torch.float16', 'torch.float16', 'torch.float16', 'torch.uint32', 'torch.int64', 'torch.float32')": { + "kwargs": { + "B1": 128, + "B2": 128, + "BK": 64 + }, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 4, + "num_buffers_warp_spec": 0, + "num_consumer_groups": 0, + "reg_dec_producer": 0, + "reg_inc_consumer": 0, + "maxnreg": null, + "pre_hook": null + }, + "(12, 128, 128, 27, 64, True, 'torch.float16', 'torch.float16', 'torch.float16', 'torch.uint32', 'torch.int64', 'torch.float32')": { + "kwargs": { + "B1": 128, + "B2": 128, + "BK": 64 + }, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 4, + "num_buffers_warp_spec": 0, + "num_consumer_groups": 0, + "reg_dec_producer": 0, + "reg_inc_consumer": 0, + "maxnreg": null, + "pre_hook": null + }, + "(12, 128, 32, 27, 2, True, 'torch.float16', 'torch.float16', 'torch.float16', 'torch.uint32', 'torch.int64', 'torch.float32')": { + "kwargs": { + "B1": 64, + "B2": 64, + "BK": 64 + }, + "num_warps": 2, + "num_ctas": 1, + "num_stages": 4, + "num_buffers_warp_spec": 0, + "num_consumer_groups": 0, + "reg_dec_producer": 0, + "reg_inc_consumer": 0, + "maxnreg": null, + "pre_hook": null + }, + "(12, 128, 32, 27, 4, True, 'torch.float16', 'torch.float16', 'torch.float16', 'torch.uint32', 'torch.int64', 'torch.float32')": { + "kwargs": { + "B1": 64, + "B2": 64, + "BK": 64 + }, + "num_warps": 2, + "num_ctas": 1, + "num_stages": 4, + "num_buffers_warp_spec": 0, + "num_consumer_groups": 0, + "reg_dec_producer": 0, + "reg_inc_consumer": 0, + "maxnreg": null, + "pre_hook": null + }, + "(12, 128, 32, 27, 8, True, 'torch.float16', 'torch.float16', 'torch.float16', 'torch.uint32', 'torch.int64', 'torch.float32')": { + "kwargs": { + "B1": 256, + "B2": 64, + "BK": 32 + }, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 4, + "num_buffers_warp_spec": 0, + "num_consumer_groups": 0, + "reg_dec_producer": 0, + "reg_inc_consumer": 0, + "maxnreg": null, + "pre_hook": null + }, + "(12, 128, 32, 27, 16, True, 'torch.float16', 'torch.float16', 'torch.float16', 'torch.uint32', 'torch.int64', 'torch.float32')": { + "kwargs": { + "B1": 64, + "B2": 64, + "BK": 64 + }, + "num_warps": 2, + "num_ctas": 1, + "num_stages": 4, + "num_buffers_warp_spec": 0, + "num_consumer_groups": 0, + "reg_dec_producer": 0, + "reg_inc_consumer": 0, + "maxnreg": null, + "pre_hook": null + }, + "(12, 128, 32, 27, 32, True, 'torch.float16', 'torch.float16', 'torch.float16', 'torch.uint32', 'torch.int64', 'torch.float32')": { + "kwargs": { + "B1": 64, + "B2": 64, + "BK": 32 + }, + "num_warps": 2, + "num_ctas": 1, + "num_stages": 4, + "num_buffers_warp_spec": 0, + "num_consumer_groups": 0, + "reg_dec_producer": 0, + "reg_inc_consumer": 0, + "maxnreg": null, + "pre_hook": null + }, + "(12, 128, 32, 27, 64, True, 'torch.float16', 'torch.float16', 'torch.float16', 'torch.uint32', 'torch.int64', 'torch.float32')": { + "kwargs": { + "B1": 64, + "B2": 64, + "BK": 32 + }, + "num_warps": 2, + "num_ctas": 1, + "num_stages": 4, + "num_buffers_warp_spec": 0, + "num_consumer_groups": 0, + "reg_dec_producer": 0, + "reg_inc_consumer": 0, + "maxnreg": null, + "pre_hook": null + }, + "(10, 256, 64, 27, 8, True, 'torch.float16', 'torch.float16', 'torch.float16', 'torch.uint32', 'torch.int64', 'torch.float32')": { + "kwargs": { + "B1": 64, + "B2": 64, + "BK": 64 + }, + "num_warps": 2, + "num_ctas": 1, + "num_stages": 4, + "num_buffers_warp_spec": 0, + "num_consumer_groups": 0, + "reg_dec_producer": 0, + "reg_inc_consumer": 0, + "maxnreg": null, + "pre_hook": null + }, + "(10, 256, 64, 27, 16, True, 'torch.float16', 'torch.float16', 'torch.float16', 'torch.uint32', 'torch.int64', 'torch.float32')": { + "kwargs": { + "B1": 64, + "B2": 64, + "BK": 64 + }, + "num_warps": 2, + "num_ctas": 1, + "num_stages": 4, + "num_buffers_warp_spec": 0, + "num_consumer_groups": 0, + "reg_dec_producer": 0, + "reg_inc_consumer": 0, + "maxnreg": null, + "pre_hook": null + }, + "(10, 256, 64, 27, 32, True, 'torch.float16', 'torch.float16', 'torch.float16', 'torch.uint32', 'torch.int64', 'torch.float32')": { + "kwargs": { + "B1": 256, + "B2": 64, + "BK": 32 + }, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 4, + "num_buffers_warp_spec": 0, + "num_consumer_groups": 0, + "reg_dec_producer": 0, + "reg_inc_consumer": 0, + "maxnreg": null, + "pre_hook": null + }, + "(10, 256, 64, 27, 64, True, 'torch.float16', 'torch.float16', 'torch.float16', 'torch.uint32', 'torch.int64', 'torch.float32')": { + "kwargs": { + "B1": 64, + "B2": 64, + "BK": 32 + }, + "num_warps": 2, + "num_ctas": 1, + "num_stages": 4, + "num_buffers_warp_spec": 0, + "num_consumer_groups": 0, + "reg_dec_producer": 0, + "reg_inc_consumer": 0, + "maxnreg": null, + "pre_hook": null + }, + "(10, 256, 64, 27, 128, True, 'torch.float16', 'torch.float16', 'torch.float16', 'torch.uint32', 'torch.int64', 'torch.float32')": { + "kwargs": { + "B1": 64, + "B2": 64, + "BK": 32 + }, + "num_warps": 2, + "num_ctas": 1, + "num_stages": 4, + "num_buffers_warp_spec": 0, + "num_consumer_groups": 0, + "reg_dec_producer": 0, + "reg_inc_consumer": 0, + "maxnreg": null, + "pre_hook": null + }, + "(10, 256, 64, 27, 256, True, 'torch.float16', 'torch.float16', 'torch.float16', 'torch.uint32', 'torch.int64', 'torch.float32')": { + "kwargs": { + "B1": 64, + "B2": 64, + "BK": 32 + }, + "num_warps": 2, + "num_ctas": 1, + "num_stages": 4, + "num_buffers_warp_spec": 0, + "num_consumer_groups": 0, + "reg_dec_producer": 0, + "reg_inc_consumer": 0, + "maxnreg": null, + "pre_hook": null + }, + "(8, 512, 512, 27, 8, True, 'torch.float16', 'torch.float16', 'torch.float16', 'torch.uint32', 'torch.int64', 'torch.float32')": { + "kwargs": { + "B1": 64, + "B2": 64, + "BK": 64 + }, + "num_warps": 2, + "num_ctas": 1, + "num_stages": 4, + "num_buffers_warp_spec": 0, + "num_consumer_groups": 0, + "reg_dec_producer": 0, + "reg_inc_consumer": 0, + "maxnreg": null, + "pre_hook": null + }, + "(8, 512, 512, 27, 16, True, 'torch.float16', 'torch.float16', 'torch.float16', 'torch.uint32', 'torch.int64', 'torch.float32')": { + "kwargs": { + "B1": 128, + "B2": 128, + "BK": 64 + }, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 4, + "num_buffers_warp_spec": 0, + "num_consumer_groups": 0, + "reg_dec_producer": 0, + "reg_inc_consumer": 0, + "maxnreg": null, + "pre_hook": null + }, + "(8, 512, 512, 27, 32, True, 'torch.float16', 'torch.float16', 'torch.float16', 'torch.uint32', 'torch.int64', 'torch.float32')": { + "kwargs": { + "B1": 128, + "B2": 128, + "BK": 64 + }, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 4, + "num_buffers_warp_spec": 0, + "num_consumer_groups": 0, + "reg_dec_producer": 0, + "reg_inc_consumer": 0, + "maxnreg": null, + "pre_hook": null + }, + "(8, 512, 512, 27, 64, True, 'torch.float16', 'torch.float16', 'torch.float16', 'torch.uint32', 'torch.int64', 'torch.float32')": { + "kwargs": { + "B1": 128, + "B2": 128, + "BK": 64 + }, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 4, + "num_buffers_warp_spec": 0, + "num_consumer_groups": 0, + "reg_dec_producer": 0, + "reg_inc_consumer": 0, + "maxnreg": null, + "pre_hook": null + }, + "(8, 512, 512, 27, 128, True, 'torch.float16', 'torch.float16', 'torch.float16', 'torch.uint32', 'torch.int64', 'torch.float32')": { + "kwargs": { + "B1": 128, + "B2": 128, + "BK": 64 + }, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 4, + "num_buffers_warp_spec": 0, + "num_consumer_groups": 0, + "reg_dec_producer": 0, + "reg_inc_consumer": 0, + "maxnreg": null, + "pre_hook": null + }, + "(8, 512, 512, 27, 256, True, 'torch.float16', 'torch.float16', 'torch.float16', 'torch.uint32', 'torch.int64', 'torch.float32')": { + "kwargs": { + "B1": 128, + "B2": 128, + "BK": 64 + }, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 4, + "num_buffers_warp_spec": 0, + "num_consumer_groups": 0, + "reg_dec_producer": 0, + "reg_inc_consumer": 0, + "maxnreg": null, + "pre_hook": null + }, + "(8, 512, 128, 27, 32, True, 'torch.float16', 'torch.float16', 'torch.float16', 'torch.uint32', 'torch.int64', 'torch.float32')": { + "kwargs": { + "B1": 64, + "B2": 64, + "BK": 64 + }, + "num_warps": 2, + "num_ctas": 1, + "num_stages": 4, + "num_buffers_warp_spec": 0, + "num_consumer_groups": 0, + "reg_dec_producer": 0, + "reg_inc_consumer": 0, + "maxnreg": null, + "pre_hook": null + }, + "(8, 512, 128, 27, 64, True, 'torch.float16', 'torch.float16', 'torch.float16', 'torch.uint32', 'torch.int64', 'torch.float32')": { + "kwargs": { + "B1": 128, + "B2": 128, + "BK": 64 + }, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 4, + "num_buffers_warp_spec": 0, + "num_consumer_groups": 0, + "reg_dec_producer": 0, + "reg_inc_consumer": 0, + "maxnreg": null, + "pre_hook": null + }, + "(8, 512, 128, 27, 128, True, 'torch.float16', 'torch.float16', 'torch.float16', 'torch.uint32', 'torch.int64', 'torch.float32')": { + "kwargs": { + "B1": 64, + "B2": 128, + "BK": 32 + }, + "num_warps": 2, + "num_ctas": 1, + "num_stages": 4, + "num_buffers_warp_spec": 0, + "num_consumer_groups": 0, + "reg_dec_producer": 0, + "reg_inc_consumer": 0, + "maxnreg": null, + "pre_hook": null + }, + "(8, 512, 128, 27, 256, True, 'torch.float16', 'torch.float16', 'torch.float16', 'torch.uint32', 'torch.int64', 'torch.float32')": { + "kwargs": { + "B1": 64, + "B2": 64, + "BK": 32 + }, + "num_warps": 2, + "num_ctas": 1, + "num_stages": 4, + "num_buffers_warp_spec": 0, + "num_consumer_groups": 0, + "reg_dec_producer": 0, + "reg_inc_consumer": 0, + "maxnreg": null, + "pre_hook": null + }, + "(8, 512, 128, 27, 512, True, 'torch.float16', 'torch.float16', 'torch.float16', 'torch.uint32', 'torch.int64', 'torch.float32')": { + "kwargs": { + "B1": 128, + "B2": 128, + "BK": 64 + }, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 4, + "num_buffers_warp_spec": 0, + "num_consumer_groups": 0, + "reg_dec_producer": 0, + "reg_inc_consumer": 0, + "maxnreg": null, + "pre_hook": null + }, + "(8, 512, 128, 27, 1024, True, 'torch.float16', 'torch.float16', 'torch.float16', 'torch.uint32', 'torch.int64', 'torch.float32')": { + "kwargs": { + "B1": 128, + "B2": 64, + "BK": 32 + }, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 4, + "num_buffers_warp_spec": 0, + "num_consumer_groups": 0, + "reg_dec_producer": 0, + "reg_inc_consumer": 0, + "maxnreg": null, + "pre_hook": null + }, + "(15, 64, 64, 27, 2, True, 'torch.float16', 'torch.float16', 'torch.float16', 'torch.uint32', 'torch.int64', 'torch.float32')": { + "kwargs": { + "B1": 256, + "B2": 64, + "BK": 32 + }, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 4, + "num_buffers_warp_spec": 0, + "num_consumer_groups": 0, + "reg_dec_producer": 0, + "reg_inc_consumer": 0, + "maxnreg": null, + "pre_hook": null + }, + "(15, 64, 64, 27, 4, True, 'torch.float16', 'torch.float16', 'torch.float16', 'torch.uint32', 'torch.int64', 'torch.float32')": { + "kwargs": { + "B1": 256, + "B2": 64, + "BK": 32 + }, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 4, + "num_buffers_warp_spec": 0, + "num_consumer_groups": 0, + "reg_dec_producer": 0, + "reg_inc_consumer": 0, + "maxnreg": null, + "pre_hook": null + }, + "(15, 64, 16, 27, 2, True, 'torch.float16', 'torch.float16', 'torch.float16', 'torch.uint32', 'torch.int64', 'torch.float32')": { + "kwargs": { + "B1": 256, + "B2": 64, + "BK": 32 + }, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 4, + "num_buffers_warp_spec": 0, + "num_consumer_groups": 0, + "reg_dec_producer": 0, + "reg_inc_consumer": 0, + "maxnreg": null, + "pre_hook": null + }, + "(15, 64, 16, 27, 4, True, 'torch.float16', 'torch.float16', 'torch.float16', 'torch.uint32', 'torch.int64', 'torch.float32')": { + "kwargs": { + "B1": 256, + "B2": 64, + "BK": 32 + }, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 4, + "num_buffers_warp_spec": 0, + "num_consumer_groups": 0, + "reg_dec_producer": 0, + "reg_inc_consumer": 0, + "maxnreg": null, + "pre_hook": null + }, + "(13, 128, 128, 27, 2, True, 'torch.float16', 'torch.float16', 'torch.float16', 'torch.uint32', 'torch.int64', 'torch.float32')": { + "kwargs": { + "B1": 128, + "B2": 128, + "BK": 64 + }, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 4, + "num_buffers_warp_spec": 0, + "num_consumer_groups": 0, + "reg_dec_producer": 0, + "reg_inc_consumer": 0, + "maxnreg": null, + "pre_hook": null + }, + "(13, 128, 128, 27, 4, True, 'torch.float16', 'torch.float16', 'torch.float16', 'torch.uint32', 'torch.int64', 'torch.float32')": { + "kwargs": { + "B1": 128, + "B2": 128, + "BK": 64 + }, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 4, + "num_buffers_warp_spec": 0, + "num_consumer_groups": 0, + "reg_dec_producer": 0, + "reg_inc_consumer": 0, + "maxnreg": null, + "pre_hook": null + }, + "(13, 128, 128, 27, 8, True, 'torch.float16', 'torch.float16', 'torch.float16', 'torch.uint32', 'torch.int64', 'torch.float32')": { + "kwargs": { + "B1": 128, + "B2": 128, + "BK": 64 + }, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 4, + "num_buffers_warp_spec": 0, + "num_consumer_groups": 0, + "reg_dec_producer": 0, + "reg_inc_consumer": 0, + "maxnreg": null, + "pre_hook": null + }, + "(13, 128, 128, 27, 16, True, 'torch.float16', 'torch.float16', 'torch.float16', 'torch.uint32', 'torch.int64', 'torch.float32')": { + "kwargs": { + "B1": 128, + "B2": 128, + "BK": 64 + }, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 4, + "num_buffers_warp_spec": 0, + "num_consumer_groups": 0, + "reg_dec_producer": 0, + "reg_inc_consumer": 0, + "maxnreg": null, + "pre_hook": null + }, + "(13, 128, 128, 27, 32, True, 'torch.float16', 'torch.float16', 'torch.float16', 'torch.uint32', 'torch.int64', 'torch.float32')": { + "kwargs": { + "B1": 128, + "B2": 128, + "BK": 64 + }, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 4, + "num_buffers_warp_spec": 0, + "num_consumer_groups": 0, + "reg_dec_producer": 0, + "reg_inc_consumer": 0, + "maxnreg": null, + "pre_hook": null + }, + "(13, 128, 32, 27, 2, True, 'torch.float16', 'torch.float16', 'torch.float16', 'torch.uint32', 'torch.int64', 'torch.float32')": { + "kwargs": { + "B1": 64, + "B2": 64, + "BK": 64 + }, + "num_warps": 2, + "num_ctas": 1, + "num_stages": 4, + "num_buffers_warp_spec": 0, + "num_consumer_groups": 0, + "reg_dec_producer": 0, + "reg_inc_consumer": 0, + "maxnreg": null, + "pre_hook": null + }, + "(13, 128, 32, 27, 4, True, 'torch.float16', 'torch.float16', 'torch.float16', 'torch.uint32', 'torch.int64', 'torch.float32')": { + "kwargs": { + "B1": 256, + "B2": 64, + "BK": 32 + }, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 4, + "num_buffers_warp_spec": 0, + "num_consumer_groups": 0, + "reg_dec_producer": 0, + "reg_inc_consumer": 0, + "maxnreg": null, + "pre_hook": null + }, + "(13, 128, 32, 27, 8, True, 'torch.float16', 'torch.float16', 'torch.float16', 'torch.uint32', 'torch.int64', 'torch.float32')": { + "kwargs": { + "B1": 256, + "B2": 64, + "BK": 32 + }, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 4, + "num_buffers_warp_spec": 0, + "num_consumer_groups": 0, + "reg_dec_producer": 0, + "reg_inc_consumer": 0, + "maxnreg": null, + "pre_hook": null + }, + "(13, 128, 32, 27, 16, True, 'torch.float16', 'torch.float16', 'torch.float16', 'torch.uint32', 'torch.int64', 'torch.float32')": { + "kwargs": { + "B1": 256, + "B2": 64, + "BK": 32 + }, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 4, + "num_buffers_warp_spec": 0, + "num_consumer_groups": 0, + "reg_dec_producer": 0, + "reg_inc_consumer": 0, + "maxnreg": null, + "pre_hook": null + }, + "(13, 128, 32, 27, 32, True, 'torch.float16', 'torch.float16', 'torch.float16', 'torch.uint32', 'torch.int64', 'torch.float32')": { + "kwargs": { + "B1": 128, + "B2": 64, + "BK": 32 + }, + "num_warps": 2, + "num_ctas": 1, + "num_stages": 4, + "num_buffers_warp_spec": 0, + "num_consumer_groups": 0, + "reg_dec_producer": 0, + "reg_inc_consumer": 0, + "maxnreg": null, + "pre_hook": null + }, + "(9, 512, 512, 27, 4, True, 'torch.float16', 'torch.float16', 'torch.float16', 'torch.uint32', 'torch.int64', 'torch.float32')": { + "kwargs": { + "B1": 64, + "B2": 64, + "BK": 64 + }, + "num_warps": 2, + "num_ctas": 1, + "num_stages": 4, + "num_buffers_warp_spec": 0, + "num_consumer_groups": 0, + "reg_dec_producer": 0, + "reg_inc_consumer": 0, + "maxnreg": null, + "pre_hook": null + }, + "(9, 512, 512, 27, 8, True, 'torch.float16', 'torch.float16', 'torch.float16', 'torch.uint32', 'torch.int64', 'torch.float32')": { + "kwargs": { + "B1": 128, + "B2": 128, + "BK": 64 + }, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 4, + "num_buffers_warp_spec": 0, + "num_consumer_groups": 0, + "reg_dec_producer": 0, + "reg_inc_consumer": 0, + "maxnreg": null, + "pre_hook": null + }, + "(9, 512, 512, 27, 16, True, 'torch.float16', 'torch.float16', 'torch.float16', 'torch.uint32', 'torch.int64', 'torch.float32')": { + "kwargs": { + "B1": 128, + "B2": 128, + "BK": 64 + }, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 4, + "num_buffers_warp_spec": 0, + "num_consumer_groups": 0, + "reg_dec_producer": 0, + "reg_inc_consumer": 0, + "maxnreg": null, + "pre_hook": null + }, + "(9, 512, 512, 27, 32, True, 'torch.float16', 'torch.float16', 'torch.float16', 'torch.uint32', 'torch.int64', 'torch.float32')": { + "kwargs": { + "B1": 128, + "B2": 128, + "BK": 64 + }, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 4, + "num_buffers_warp_spec": 0, + "num_consumer_groups": 0, + "reg_dec_producer": 0, + "reg_inc_consumer": 0, + "maxnreg": null, + "pre_hook": null + }, + "(9, 512, 512, 27, 64, True, 'torch.float16', 'torch.float16', 'torch.float16', 'torch.uint32', 'torch.int64', 'torch.float32')": { + "kwargs": { + "B1": 128, + "B2": 128, + "BK": 64 + }, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 4, + "num_buffers_warp_spec": 0, + "num_consumer_groups": 0, + "reg_dec_producer": 0, + "reg_inc_consumer": 0, + "maxnreg": null, + "pre_hook": null + }, + "(9, 512, 512, 27, 128, True, 'torch.float16', 'torch.float16', 'torch.float16', 'torch.uint32', 'torch.int64', 'torch.float32')": { + "kwargs": { + "B1": 128, + "B2": 128, + "BK": 64 + }, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 4, + "num_buffers_warp_spec": 0, + "num_consumer_groups": 0, + "reg_dec_producer": 0, + "reg_inc_consumer": 0, + "maxnreg": null, + "pre_hook": null + }, + "(9, 512, 128, 27, 16, True, 'torch.float16', 'torch.float16', 'torch.float16', 'torch.uint32', 'torch.int64', 'torch.float32')": { + "kwargs": { + "B1": 64, + "B2": 64, + "BK": 64 + }, + "num_warps": 2, + "num_ctas": 1, + "num_stages": 4, + "num_buffers_warp_spec": 0, + "num_consumer_groups": 0, + "reg_dec_producer": 0, + "reg_inc_consumer": 0, + "maxnreg": null, + "pre_hook": null + }, + "(9, 512, 128, 27, 32, True, 'torch.float16', 'torch.float16', 'torch.float16', 'torch.uint32', 'torch.int64', 'torch.float32')": { + "kwargs": { + "B1": 64, + "B2": 128, + "BK": 32 + }, + "num_warps": 2, + "num_ctas": 1, + "num_stages": 4, + "num_buffers_warp_spec": 0, + "num_consumer_groups": 0, + "reg_dec_producer": 0, + "reg_inc_consumer": 0, + "maxnreg": null, + "pre_hook": null + }, + "(9, 512, 128, 27, 64, True, 'torch.float16', 'torch.float16', 'torch.float16', 'torch.uint32', 'torch.int64', 'torch.float32')": { + "kwargs": { + "B1": 128, + "B2": 128, + "BK": 64 + }, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 4, + "num_buffers_warp_spec": 0, + "num_consumer_groups": 0, + "reg_dec_producer": 0, + "reg_inc_consumer": 0, + "maxnreg": null, + "pre_hook": null + }, + "(9, 512, 128, 27, 128, True, 'torch.float16', 'torch.float16', 'torch.float16', 'torch.uint32', 'torch.int64', 'torch.float32')": { + "kwargs": { + "B1": 128, + "B2": 128, + "BK": 64 + }, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 4, + "num_buffers_warp_spec": 0, + "num_consumer_groups": 0, + "reg_dec_producer": 0, + "reg_inc_consumer": 0, + "maxnreg": null, + "pre_hook": null + }, + "(9, 512, 128, 27, 256, True, 'torch.float16', 'torch.float16', 'torch.float16', 'torch.uint32', 'torch.int64', 'torch.float32')": { + "kwargs": { + "B1": 64, + "B2": 64, + "BK": 32 + }, + "num_warps": 2, + "num_ctas": 1, + "num_stages": 4, + "num_buffers_warp_spec": 0, + "num_consumer_groups": 0, + "reg_dec_producer": 0, + "reg_inc_consumer": 0, + "maxnreg": null, + "pre_hook": null + }, + "(9, 512, 128, 27, 512, True, 'torch.float16', 'torch.float16', 'torch.float16', 'torch.uint32', 'torch.int64', 'torch.float32')": { + "kwargs": { + "B1": 64, + "B2": 64, + "BK": 32 + }, + "num_warps": 2, + "num_ctas": 1, + "num_stages": 4, + "num_buffers_warp_spec": 0, + "num_consumer_groups": 0, + "reg_dec_producer": 0, + "reg_inc_consumer": 0, + "maxnreg": null, + "pre_hook": null + }, + "(13, 64, 64, 27, 2, True, 'torch.float16', 'torch.float16', 'torch.float16', 'torch.uint32', 'torch.int64', 'torch.float32')": { + "kwargs": { + "B1": 64, + "B2": 64, + "BK": 64 + }, + "num_warps": 2, + "num_ctas": 1, + "num_stages": 4, + "num_buffers_warp_spec": 0, + "num_consumer_groups": 0, + "reg_dec_producer": 0, + "reg_inc_consumer": 0, + "maxnreg": null, + "pre_hook": null + }, + "(13, 64, 64, 27, 4, True, 'torch.float16', 'torch.float16', 'torch.float16', 'torch.uint32', 'torch.int64', 'torch.float32')": { + "kwargs": { + "B1": 64, + "B2": 64, + "BK": 64 + }, + "num_warps": 2, + "num_ctas": 1, + "num_stages": 4, + "num_buffers_warp_spec": 0, + "num_consumer_groups": 0, + "reg_dec_producer": 0, + "reg_inc_consumer": 0, + "maxnreg": null, + "pre_hook": null + }, + "(13, 64, 64, 27, 8, True, 'torch.float16', 'torch.float16', 'torch.float16', 'torch.uint32', 'torch.int64', 'torch.float32')": { + "kwargs": { + "B1": 64, + "B2": 64, + "BK": 32 + }, + "num_warps": 2, + "num_ctas": 1, + "num_stages": 4, + "num_buffers_warp_spec": 0, + "num_consumer_groups": 0, + "reg_dec_producer": 0, + "reg_inc_consumer": 0, + "maxnreg": null, + "pre_hook": null + }, + "(13, 64, 64, 27, 16, True, 'torch.float16', 'torch.float16', 'torch.float16', 'torch.uint32', 'torch.int64', 'torch.float32')": { + "kwargs": { + "B1": 64, + "B2": 64, + "BK": 32 + }, + "num_warps": 2, + "num_ctas": 1, + "num_stages": 4, + "num_buffers_warp_spec": 0, + "num_consumer_groups": 0, + "reg_dec_producer": 0, + "reg_inc_consumer": 0, + "maxnreg": null, + "pre_hook": null + }, + "(13, 64, 16, 27, 2, True, 'torch.float16', 'torch.float16', 'torch.float16', 'torch.uint32', 'torch.int64', 'torch.float32')": { + "kwargs": { + "B1": 64, + "B2": 64, + "BK": 64 + }, + "num_warps": 2, + "num_ctas": 1, + "num_stages": 4, + "num_buffers_warp_spec": 0, + "num_consumer_groups": 0, + "reg_dec_producer": 0, + "reg_inc_consumer": 0, + "maxnreg": null, + "pre_hook": null + }, + "(13, 64, 16, 27, 4, True, 'torch.float16', 'torch.float16', 'torch.float16', 'torch.uint32', 'torch.int64', 'torch.float32')": { + "kwargs": { + "B1": 256, + "B2": 64, + "BK": 32 + }, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 4, + "num_buffers_warp_spec": 0, + "num_consumer_groups": 0, + "reg_dec_producer": 0, + "reg_inc_consumer": 0, + "maxnreg": null, + "pre_hook": null + }, + "(13, 64, 16, 27, 8, True, 'torch.float16', 'torch.float16', 'torch.float16', 'torch.uint32', 'torch.int64', 'torch.float32')": { + "kwargs": { + "B1": 128, + "B2": 64, + "BK": 32 + }, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 4, + "num_buffers_warp_spec": 0, + "num_consumer_groups": 0, + "reg_dec_producer": 0, + "reg_inc_consumer": 0, + "maxnreg": null, + "pre_hook": null + }, + "(13, 64, 16, 27, 16, True, 'torch.float16', 'torch.float16', 'torch.float16', 'torch.uint32', 'torch.int64', 'torch.float32')": { + "kwargs": { + "B1": 256, + "B2": 64, + "BK": 32 + }, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 4, + "num_buffers_warp_spec": 0, + "num_consumer_groups": 0, + "reg_dec_producer": 0, + "reg_inc_consumer": 0, + "maxnreg": null, + "pre_hook": null + }, + "(9, 256, 256, 27, 4, True, 'torch.float16', 'torch.float16', 'torch.float16', 'torch.uint32', 'torch.int64', 'torch.float32')": { + "kwargs": { + "B1": 64, + "B2": 64, + "BK": 64 + }, + "num_warps": 2, + "num_ctas": 1, + "num_stages": 4, + "num_buffers_warp_spec": 0, + "num_consumer_groups": 0, + "reg_dec_producer": 0, + "reg_inc_consumer": 0, + "maxnreg": null, + "pre_hook": null + }, + "(9, 256, 256, 27, 8, True, 'torch.float16', 'torch.float16', 'torch.float16', 'torch.uint32', 'torch.int64', 'torch.float32')": { + "kwargs": { + "B1": 64, + "B2": 128, + "BK": 32 + }, + "num_warps": 2, + "num_ctas": 1, + "num_stages": 4, + "num_buffers_warp_spec": 0, + "num_consumer_groups": 0, + "reg_dec_producer": 0, + "reg_inc_consumer": 0, + "maxnreg": null, + "pre_hook": null + }, + "(9, 256, 256, 27, 16, True, 'torch.float16', 'torch.float16', 'torch.float16', 'torch.uint32', 'torch.int64', 'torch.float32')": { + "kwargs": { + "B1": 128, + "B2": 128, + "BK": 64 + }, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 4, + "num_buffers_warp_spec": 0, + "num_consumer_groups": 0, + "reg_dec_producer": 0, + "reg_inc_consumer": 0, + "maxnreg": null, + "pre_hook": null + }, + "(9, 256, 256, 27, 32, True, 'torch.float16', 'torch.float16', 'torch.float16', 'torch.uint32', 'torch.int64', 'torch.float32')": { + "kwargs": { + "B1": 128, + "B2": 128, + "BK": 64 + }, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 4, + "num_buffers_warp_spec": 0, + "num_consumer_groups": 0, + "reg_dec_producer": 0, + "reg_inc_consumer": 0, + "maxnreg": null, + "pre_hook": null + }, + "(9, 256, 256, 27, 64, True, 'torch.float16', 'torch.float16', 'torch.float16', 'torch.uint32', 'torch.int64', 'torch.float32')": { + "kwargs": { + "B1": 128, + "B2": 128, + "BK": 64 + }, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 4, + "num_buffers_warp_spec": 0, + "num_consumer_groups": 0, + "reg_dec_producer": 0, + "reg_inc_consumer": 0, + "maxnreg": null, + "pre_hook": null + }, + "(9, 256, 256, 27, 128, True, 'torch.float16', 'torch.float16', 'torch.float16', 'torch.uint32', 'torch.int64', 'torch.float32')": { + "kwargs": { + "B1": 128, + "B2": 128, + "BK": 64 + }, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 4, + "num_buffers_warp_spec": 0, + "num_consumer_groups": 0, + "reg_dec_producer": 0, + "reg_inc_consumer": 0, + "maxnreg": null, + "pre_hook": null + }, + "(9, 256, 64, 27, 8, True, 'torch.float16', 'torch.float16', 'torch.float16', 'torch.uint32', 'torch.int64', 'torch.float32')": { + "kwargs": { + "B1": 64, + "B2": 64, + "BK": 64 + }, + "num_warps": 2, + "num_ctas": 1, + "num_stages": 4, + "num_buffers_warp_spec": 0, + "num_consumer_groups": 0, + "reg_dec_producer": 0, + "reg_inc_consumer": 0, + "maxnreg": null, + "pre_hook": null + }, + "(9, 256, 64, 27, 16, True, 'torch.float16', 'torch.float16', 'torch.float16', 'torch.uint32', 'torch.int64', 'torch.float32')": { + "kwargs": { + "B1": 64, + "B2": 64, + "BK": 64 + }, + "num_warps": 2, + "num_ctas": 1, + "num_stages": 4, + "num_buffers_warp_spec": 0, + "num_consumer_groups": 0, + "reg_dec_producer": 0, + "reg_inc_consumer": 0, + "maxnreg": null, + "pre_hook": null + }, + "(9, 256, 64, 27, 32, True, 'torch.float16', 'torch.float16', 'torch.float16', 'torch.uint32', 'torch.int64', 'torch.float32')": { + "kwargs": { + "B1": 64, + "B2": 64, + "BK": 32 + }, + "num_warps": 2, + "num_ctas": 1, + "num_stages": 4, + "num_buffers_warp_spec": 0, + "num_consumer_groups": 0, + "reg_dec_producer": 0, + "reg_inc_consumer": 0, + "maxnreg": null, + "pre_hook": null + }, + "(9, 256, 64, 27, 64, True, 'torch.float16', 'torch.float16', 'torch.float16', 'torch.uint32', 'torch.int64', 'torch.float32')": { + "kwargs": { + "B1": 64, + "B2": 64, + "BK": 32 + }, + "num_warps": 2, + "num_ctas": 1, + "num_stages": 4, + "num_buffers_warp_spec": 0, + "num_consumer_groups": 0, + "reg_dec_producer": 0, + "reg_inc_consumer": 0, + "maxnreg": null, + "pre_hook": null + }, + "(9, 256, 64, 27, 128, True, 'torch.float16', 'torch.float16', 'torch.float16', 'torch.uint32', 'torch.int64', 'torch.float32')": { + "kwargs": { + "B1": 64, + "B2": 64, + "BK": 32 + }, + "num_warps": 2, + "num_ctas": 1, + "num_stages": 4, + "num_buffers_warp_spec": 0, + "num_consumer_groups": 0, + "reg_dec_producer": 0, + "reg_inc_consumer": 0, + "maxnreg": null, + "pre_hook": null + }, + "(9, 256, 64, 27, 256, True, 'torch.float16', 'torch.float16', 'torch.float16', 'torch.uint32', 'torch.int64', 'torch.float32')": { + "kwargs": { + "B1": 64, + "B2": 64, + "BK": 32 + }, + "num_warps": 2, + "num_ctas": 1, + "num_stages": 4, + "num_buffers_warp_spec": 0, + "num_consumer_groups": 0, + "reg_dec_producer": 0, + "reg_inc_consumer": 0, + "maxnreg": null, + "pre_hook": null + }, + "(6, 1024, 1024, 27, 8, True, 'torch.float16', 'torch.float16', 'torch.float16', 'torch.uint32', 'torch.int64', 'torch.float32')": { + "kwargs": { + "B1": 64, + "B2": 64, + "BK": 64 + }, + "num_warps": 2, + "num_ctas": 1, + "num_stages": 4, + "num_buffers_warp_spec": 0, + "num_consumer_groups": 0, + "reg_dec_producer": 0, + "reg_inc_consumer": 0, + "maxnreg": null, + "pre_hook": null + }, + "(6, 1024, 1024, 27, 16, True, 'torch.float16', 'torch.float16', 'torch.float16', 'torch.uint32', 'torch.int64', 'torch.float32')": { + "kwargs": { + "B1": 128, + "B2": 128, + "BK": 64 + }, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 4, + "num_buffers_warp_spec": 0, + "num_consumer_groups": 0, + "reg_dec_producer": 0, + "reg_inc_consumer": 0, + "maxnreg": null, + "pre_hook": null + }, + "(6, 1024, 1024, 27, 32, True, 'torch.float16', 'torch.float16', 'torch.float16', 'torch.uint32', 'torch.int64', 'torch.float32')": { + "kwargs": { + "B1": 128, + "B2": 128, + "BK": 64 + }, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 4, + "num_buffers_warp_spec": 0, + "num_consumer_groups": 0, + "reg_dec_producer": 0, + "reg_inc_consumer": 0, + "maxnreg": null, + "pre_hook": null + }, + "(6, 1024, 1024, 27, 64, True, 'torch.float16', 'torch.float16', 'torch.float16', 'torch.uint32', 'torch.int64', 'torch.float32')": { + "kwargs": { + "B1": 128, + "B2": 128, + "BK": 64 + }, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 4, + "num_buffers_warp_spec": 0, + "num_consumer_groups": 0, + "reg_dec_producer": 0, + "reg_inc_consumer": 0, + "maxnreg": null, + "pre_hook": null + }, + "(6, 1024, 1024, 27, 128, True, 'torch.float16', 'torch.float16', 'torch.float16', 'torch.uint32', 'torch.int64', 'torch.float32')": { + "kwargs": { + "B1": 128, + "B2": 128, + "BK": 64 + }, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 4, + "num_buffers_warp_spec": 0, + "num_consumer_groups": 0, + "reg_dec_producer": 0, + "reg_inc_consumer": 0, + "maxnreg": null, + "pre_hook": null + }, + "(6, 1024, 1024, 27, 256, True, 'torch.float16', 'torch.float16', 'torch.float16', 'torch.uint32', 'torch.int64', 'torch.float32')": { + "kwargs": { + "B1": 128, + "B2": 128, + "BK": 64 + }, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 4, + "num_buffers_warp_spec": 0, + "num_consumer_groups": 0, + "reg_dec_producer": 0, + "reg_inc_consumer": 0, + "maxnreg": null, + "pre_hook": null + }, + "(7, 1024, 1024, 27, 4, True, 'torch.float32', 'torch.float32', 'torch.float32', 'torch.uint32', 'torch.int64', 'torch.float32')": { + "kwargs": { + "B1": 64, + "B2": 64, + "BK": 64 + }, + "num_warps": 2, + "num_ctas": 1, + "num_stages": 4, + "num_buffers_warp_spec": 0, + "num_consumer_groups": 0, + "reg_dec_producer": 0, + "reg_inc_consumer": 0, + "maxnreg": null, + "pre_hook": null + }, + "(9, 1024, 1024, 27, 8, True, 'torch.float32', 'torch.float32', 'torch.float32', 'torch.uint32', 'torch.int64', 'torch.float32')": { + "kwargs": { + "B1": 128, + "B2": 128, + "BK": 32 + }, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 4, + "num_buffers_warp_spec": 0, + "num_consumer_groups": 0, + "reg_dec_producer": 0, + "reg_inc_consumer": 0, + "maxnreg": null, + "pre_hook": null + }, + "(11, 1024, 1024, 27, 4, True, 'torch.float32', 'torch.float32', 'torch.float32', 'torch.uint32', 'torch.int64', 'torch.float32')": { + "kwargs": { + "B1": 128, + "B2": 128, + "BK": 32 + }, + "num_warps": 4, + "num_ctas": 1, + "num_stages": 4, + "num_buffers_warp_spec": 0, + "num_consumer_groups": 0, + "reg_dec_producer": 0, + "reg_inc_consumer": 0, + "maxnreg": null, + "pre_hook": null + } + }, + "flex_gemm.kernels.triton.spconv.sparse_submanifold_conv_fwd_masked_implicit_gemm_splitk.sparse_submanifold_conv_fwd_masked_implicit_gemm_splitk": { + "(2^18, 64, 64, 27)": { + "SPLITK": 1 + }, + "(2^18, 64, 16, 27)": { + "SPLITK": 1 + }, + "(2^16, 128, 128, 27)": { + "SPLITK": 1 + }, + "(2^16, 128, 32, 27)": { + "SPLITK": 1 + }, + "(2^14, 256, 256, 27)": { + "SPLITK": 1 + }, + "(2^14, 256, 64, 27)": { + "SPLITK": 1 + }, + "(2^12, 512, 512, 27)": { + "SPLITK": 1 + }, + "(2^12, 512, 128, 27)": { + "SPLITK": 16 + }, + "(2^10, 1024, 1024, 27)": { + "SPLITK": 4 + }, + "(2^10, 1024, 4096, 27)": { + "SPLITK": 4 + }, + "(2^12, 512, 2048, 27)": { + "SPLITK": 1 + }, + "(2^14, 256, 1024, 27)": { + "SPLITK": 1 + }, + "(2^16, 128, 512, 27)": { + "SPLITK": 1 + }, + "(2^19, 64, 64, 27)": { + "SPLITK": 1 + }, + "(2^19, 64, 16, 27)": { + "SPLITK": 1 + }, + "(2^17, 128, 128, 27)": { + "SPLITK": 1 + }, + "(2^17, 128, 32, 27)": { + "SPLITK": 1 + }, + "(2^15, 256, 256, 27)": { + "SPLITK": 1 + }, + "(2^15, 256, 64, 27)": { + "SPLITK": 1 + }, + "(2^13, 512, 512, 27)": { + "SPLITK": 2 + }, + "(2^13, 512, 128, 27)": { + "SPLITK": 1 + }, + "(2^11, 1024, 1024, 27)": { + "SPLITK": 4 + }, + "(2^11, 1024, 4096, 27)": { + "SPLITK": 1 + }, + "(2^13, 512, 2048, 27)": { + "SPLITK": 1 + }, + "(2^15, 256, 1024, 27)": { + "SPLITK": 1 + }, + "(2^17, 128, 512, 27)": { + "SPLITK": 1 + }, + "(2^11, 512, 512, 27)": { + "SPLITK": 4 + }, + "(2^11, 512, 128, 27)": { + "SPLITK": 16 + }, + "(2^9, 1024, 1024, 27)": { + "SPLITK": 8 + }, + "(2^9, 1024, 4096, 27)": { + "SPLITK": 8 + }, + "(2^11, 512, 2048, 27)": { + "SPLITK": 1 + }, + "(2^13, 256, 256, 27)": { + "SPLITK": 1 + }, + "(2^13, 256, 64, 27)": { + "SPLITK": 1 + }, + "(2^13, 256, 1024, 27)": { + "SPLITK": 1 + }, + "(2^17, 64, 64, 27)": { + "SPLITK": 1 + }, + "(2^17, 64, 16, 27)": { + "SPLITK": 1 + }, + "(2^15, 128, 128, 27)": { + "SPLITK": 1 + }, + "(2^15, 128, 32, 27)": { + "SPLITK": 1 + }, + "(2^15, 128, 512, 27)": { + "SPLITK": 1 + }, + "(2^8, 1024, 1024, 27)": { + "SPLITK": 16 + }, + "(2^8, 1024, 4096, 27)": { + "SPLITK": 4 + }, + "(2^10, 512, 512, 27)": { + "SPLITK": 1 + }, + "(2^10, 512, 128, 27)": { + "SPLITK": 32 + }, + "(2^10, 512, 2048, 27)": { + "SPLITK": 1 + }, + "(2^20, 64, 64, 27)": { + "SPLITK": 1 + }, + "(2^20, 64, 16, 27)": { + "SPLITK": 1 + }, + "(2^18, 128, 128, 27)": { + "SPLITK": 1 + }, + "(2^18, 128, 32, 27)": { + "SPLITK": 1 + }, + "(2^18, 128, 512, 27)": { + "SPLITK": 1 + }, + "(2^16, 256, 256, 27)": { + "SPLITK": 1 + }, + "(2^16, 256, 64, 27)": { + "SPLITK": 1 + }, + "(2^16, 256, 1024, 27)": { + "SPLITK": 1 + }, + "(2^12, 256, 256, 27)": { + "SPLITK": 1 + }, + "(2^12, 256, 64, 27)": { + "SPLITK": 1 + }, + "(2^12, 256, 1024, 27)": { + "SPLITK": 1 + }, + "(2^21, 64, 64, 27)": { + "SPLITK": 1 + }, + "(2^21, 64, 16, 27)": { + "SPLITK": 1 + }, + "(2^19, 128, 128, 27)": { + "SPLITK": 1 + }, + "(2^19, 128, 32, 27)": { + "SPLITK": 1 + }, + "(2^17, 256, 256, 27)": { + "SPLITK": 1 + }, + "(2^17, 256, 64, 27)": { + "SPLITK": 1 + }, + "(2^14, 512, 512, 27)": { + "SPLITK": 1 + }, + "(2^14, 512, 128, 27)": { + "SPLITK": 4 + }, + "(2^14, 512, 2048, 27)": { + "SPLITK": 1 + }, + "(2^17, 256, 1024, 27)": { + "SPLITK": 1 + }, + "(2^19, 128, 512, 27)": { + "SPLITK": 1 + }, + "(2^7, 1024, 1024, 27)": { + "SPLITK": 4 + }, + "(2^7, 1024, 4096, 27)": { + "SPLITK": 16 + }, + "(2^14, 128, 128, 27)": { + "SPLITK": 1 + }, + "(2^14, 128, 32, 27)": { + "SPLITK": 1 + }, + "(2^14, 128, 512, 27)": { + "SPLITK": 1 + }, + "(2^16, 64, 64, 27)": { + "SPLITK": 1 + }, + "(2^16, 64, 16, 27)": { + "SPLITK": 1 + }, + "(2^22, 64, 64, 27)": { + "SPLITK": 1 + }, + "(2^22, 64, 16, 27)": { + "SPLITK": 1 + }, + "(2^12, 1024, 1024, 27)": { + "SPLITK": 1 + }, + "(2^10, 256, 256, 27)": { + "SPLITK": 16 + }, + "(2^12, 1024, 4096, 27)": { + "SPLITK": 1 + }, + "(2^15, 512, 512, 27)": { + "SPLITK": 1 + }, + "(2^15, 512, 128, 27)": { + "SPLITK": 1 + }, + "(2^13, 1024, 1024, 27)": { + "SPLITK": 1 + }, + "(2^13, 1024, 4096, 27)": { + "SPLITK": 1 + }, + "(2^15, 512, 2048, 27)": { + "SPLITK": 1 + }, + "(2^20, 128, 128, 27)": { + "SPLITK": 1 + }, + "(2^20, 128, 32, 27)": { + "SPLITK": 1 + }, + "(2^18, 256, 256, 27)": { + "SPLITK": 1 + }, + "(2^18, 256, 64, 27)": { + "SPLITK": 1 + }, + "(2^18, 256, 1024, 27)": { + "SPLITK": 1 + }, + "(2^20, 128, 512, 27)": { + "SPLITK": 1 + }, + "(2^16, 512, 512, 27)": { + "SPLITK": 1 + }, + "(2^16, 512, 128, 27)": { + "SPLITK": 1 + }, + "(2^14, 1024, 1024, 27)": { + "SPLITK": 1 + }, + "(2^14, 1024, 4096, 27)": { + "SPLITK": 1 + }, + "(2^16, 512, 2048, 27)": { + "SPLITK": 1 + }, + "(2^23, 64, 64, 27)": { + "SPLITK": 1 + }, + "(2^23, 64, 16, 27)": { + "SPLITK": 1 + }, + "(2^21, 128, 128, 27)": { + "SPLITK": 1 + }, + "(2^21, 128, 32, 27)": { + "SPLITK": 1 + }, + "(2^19, 256, 256, 27)": { + "SPLITK": 1 + }, + "(2^19, 256, 64, 27)": { + "SPLITK": 1 + }, + "(2^17, 512, 512, 27)": { + "SPLITK": 1 + }, + "(2^17, 512, 128, 27)": { + "SPLITK": 1 + }, + "(2^15, 1024, 1024, 27)": { + "SPLITK": 1 + }, + "(2^15, 1024, 4096, 27)": { + "SPLITK": 1 + }, + "(2^17, 512, 2048, 27)": { + "SPLITK": 1 + }, + "(2^19, 256, 1024, 27)": { + "SPLITK": 1 + }, + "(2^21, 128, 512, 27)": { + "SPLITK": 1 + }, + "(2^11, 256, 256, 27)": { + "SPLITK": 1 + }, + "(2^11, 256, 64, 27)": { + "SPLITK": 32 + }, + "(2^14, 64, 64, 27)": { + "SPLITK": 2 + }, + "(2^14, 64, 16, 27)": { + "SPLITK": 4 + }, + "(2^12, 128, 128, 27)": { + "SPLITK": 4 + }, + "(2^12, 128, 32, 27)": { + "SPLITK": 32 + }, + "(2^10, 256, 64, 27)": { + "SPLITK": 128 + }, + "(2^8, 512, 512, 27)": { + "SPLITK": 64 + }, + "(2^8, 512, 128, 27)": { + "SPLITK": 256 + }, + "(2^15, 64, 64, 27)": { + "SPLITK": 1 + }, + "(2^15, 64, 16, 27)": { + "SPLITK": 4 + }, + "(2^13, 128, 128, 27)": { + "SPLITK": 2 + }, + "(2^13, 128, 32, 27)": { + "SPLITK": 32 + }, + "(2^9, 512, 512, 27)": { + "SPLITK": 8 + }, + "(2^9, 512, 128, 27)": { + "SPLITK": 256 + }, + "(2^13, 64, 64, 27)": { + "SPLITK": 1 + }, + "(2^13, 64, 16, 27)": { + "SPLITK": 1 + }, + "(2^9, 256, 256, 27)": { + "SPLITK": 8 + }, + "(2^9, 256, 64, 27)": { + "SPLITK": 32 + }, + "(2^6, 1024, 1024, 27)": { + "SPLITK": 8 + }, + "(2^18, 512, 512, 27)": { + "SPLITK": 1 + }, + "(2^18, 512, 2048, 27)": { + "SPLITK": 1 + }, + "(2^20, 256, 256, 27)": { + "SPLITK": 1 + }, + "(2^20, 256, 1024, 27)": { + "SPLITK": 1 + }, + "(2^22, 128, 128, 27)": { + "SPLITK": 1 + }, + "(2^22, 128, 512, 27)": { + "SPLITK": 1 + }, + "(2^24, 64, 64, 27)": { + "SPLITK": 1 + } + }, + "flex_gemm.triton.gemm.gemm_nn.gemm_nn_kernel": {}, + "flex_gemm.triton.gemm.gemm_nn_splitk.gemm_nn_kernel": {}, + "flex_gemm.triton.gemm.gemm_nn_splitk.gemm_nn_splitk_kernel": {}, + "flex_gemm.triton.gemm.gemm_nn_splitk.gemm_nn_splitk": {}, + "flex_gemm.triton.gemm.gemm_nn_splitk_atomic.gemm_nn_splitk_atomic_kernel": {}, + "flex_gemm.triton.gemm.gemm_nn_splitk_lock.gemm_nn_splitk_lock_kernel": {}, + "flex_gemm.kernels.triton.grid_sample.indice_weighed_sum_bwd.indice_weighed_sum_bwd_input_kernel": {}, + "flex_gemm.kernels.triton.grid_sample.indice_weighed_sum_fwd.indice_weighed_sum_fwd_kernel": { + "(23, 4194304, 6, 8, 'torch.float32', 'torch.uint32', 'torch.float32', 'torch.float32')": { + "kwargs": { + "BM": 16, + "BK": 8 + }, + "num_warps": 2, + "num_ctas": 1, + "num_stages": 2, + "num_buffers_warp_spec": 0, + "num_consumer_groups": 0, + "reg_dec_producer": 0, + "reg_inc_consumer": 0, + "maxnreg": null, + "pre_hook": null + }, + "(22, 4194304, 6, 8, 'torch.float32', 'torch.uint32', 'torch.float32', 'torch.float32')": { + "kwargs": { + "BM": 32, + "BK": 8 + }, + "num_warps": 2, + "num_ctas": 1, + "num_stages": 2, + "num_buffers_warp_spec": 0, + "num_consumer_groups": 0, + "reg_dec_producer": 0, + "reg_inc_consumer": 0, + "maxnreg": null, + "pre_hook": null + }, + "(22, 1941851, 6, 8, 'torch.float32', 'torch.uint32', 'torch.float32', 'torch.float32')": { + "kwargs": { + "BM": 8, + "BK": 8 + }, + "num_warps": 2, + "num_ctas": 1, + "num_stages": 2, + "num_buffers_warp_spec": 0, + "num_consumer_groups": 0, + "reg_dec_producer": 0, + "reg_inc_consumer": 0, + "maxnreg": null, + "pre_hook": null + }, + "(22, 7877533, 6, 8, 'torch.float32', 'torch.uint32', 'torch.float32', 'torch.float32')": { + "kwargs": { + "BM": 16, + "BK": 8 + }, + "num_warps": 2, + "num_ctas": 1, + "num_stages": 2, + "num_buffers_warp_spec": 0, + "num_consumer_groups": 0, + "reg_dec_producer": 0, + "reg_inc_consumer": 0, + "maxnreg": null, + "pre_hook": null + }, + "(23, 2226123, 6, 8, 'torch.float32', 'torch.uint32', 'torch.float32', 'torch.float32')": { + "kwargs": { + "BM": 8, + "BK": 8 + }, + "num_warps": 2, + "num_ctas": 1, + "num_stages": 2, + "num_buffers_warp_spec": 0, + "num_consumer_groups": 0, + "reg_dec_producer": 0, + "reg_inc_consumer": 0, + "maxnreg": null, + "pre_hook": null + }, + "(23, 12748156, 6, 8, 'torch.float32', 'torch.uint32', 'torch.float32', 'torch.float32')": { + "kwargs": { + "BM": 16, + "BK": 8 + }, + "num_warps": 2, + "num_ctas": 1, + "num_stages": 2, + "num_buffers_warp_spec": 0, + "num_consumer_groups": 0, + "reg_dec_producer": 0, + "reg_inc_consumer": 0, + "maxnreg": null, + "pre_hook": null + }, + "(22, 1911611, 6, 8, 'torch.float32', 'torch.uint32', 'torch.float32', 'torch.float32')": { + "kwargs": { + "BM": 16, + "BK": 8 + }, + "num_warps": 2, + "num_ctas": 1, + "num_stages": 2, + "num_buffers_warp_spec": 0, + "num_consumer_groups": 0, + "reg_dec_producer": 0, + "reg_inc_consumer": 0, + "maxnreg": null, + "pre_hook": null + }, + "(22, 5017088, 6, 8, 'torch.float32', 'torch.uint32', 'torch.float32', 'torch.float32')": { + "kwargs": { + "BM": 16, + "BK": 8 + }, + "num_warps": 2, + "num_ctas": 1, + "num_stages": 2, + "num_buffers_warp_spec": 0, + "num_consumer_groups": 0, + "reg_dec_producer": 0, + "reg_inc_consumer": 0, + "maxnreg": null, + "pre_hook": null + }, + "(24, 4194304, 6, 8, 'torch.float32', 'torch.uint32', 'torch.float32', 'torch.float32')": { + "kwargs": { + "BM": 16, + "BK": 8 + }, + "num_warps": 2, + "num_ctas": 1, + "num_stages": 2, + "num_buffers_warp_spec": 0, + "num_consumer_groups": 0, + "reg_dec_producer": 0, + "reg_inc_consumer": 0, + "maxnreg": null, + "pre_hook": null + }, + "(24, 1112821, 6, 8, 'torch.float32', 'torch.uint32', 'torch.float32', 'torch.float32')": { + "kwargs": { + "BM": 8, + "BK": 8 + }, + "num_warps": 2, + "num_ctas": 1, + "num_stages": 2, + "num_buffers_warp_spec": 0, + "num_consumer_groups": 0, + "reg_dec_producer": 0, + "reg_inc_consumer": 0, + "maxnreg": null, + "pre_hook": null + }, + "(24, 33122502, 6, 8, 'torch.float32', 'torch.uint32', 'torch.float32', 'torch.float32')": { + "kwargs": { + "BM": 16, + "BK": 8 + }, + "num_warps": 2, + "num_ctas": 1, + "num_stages": 2, + "num_buffers_warp_spec": 0, + "num_consumer_groups": 0, + "reg_dec_producer": 0, + "reg_inc_consumer": 0, + "maxnreg": null, + "pre_hook": null + }, + "(23, 1284070, 6, 8, 'torch.float32', 'torch.uint32', 'torch.float32', 'torch.float32')": { + "kwargs": { + "BM": 8, + "BK": 8 + }, + "num_warps": 2, + "num_ctas": 1, + "num_stages": 2, + "num_buffers_warp_spec": 0, + "num_consumer_groups": 0, + "reg_dec_producer": 0, + "reg_inc_consumer": 0, + "maxnreg": null, + "pre_hook": null + }, + "(23, 10949861, 6, 8, 'torch.float32', 'torch.uint32', 'torch.float32', 'torch.float32')": { + "kwargs": { + "BM": 16, + "BK": 8 + }, + "num_warps": 2, + "num_ctas": 1, + "num_stages": 2, + "num_buffers_warp_spec": 0, + "num_consumer_groups": 0, + "reg_dec_producer": 0, + "reg_inc_consumer": 0, + "maxnreg": null, + "pre_hook": null + }, + "(23, 1571524, 6, 8, 'torch.float32', 'torch.uint32', 'torch.float32', 'torch.float32')": { + "kwargs": { + "BM": 8, + "BK": 8 + }, + "num_warps": 2, + "num_ctas": 1, + "num_stages": 2, + "num_buffers_warp_spec": 0, + "num_consumer_groups": 0, + "reg_dec_producer": 0, + "reg_inc_consumer": 0, + "maxnreg": null, + "pre_hook": null + }, + "(23, 12857151, 6, 8, 'torch.float32', 'torch.uint32', 'torch.float32', 'torch.float32')": { + "kwargs": { + "BM": 16, + "BK": 16 + }, + "num_warps": 2, + "num_ctas": 1, + "num_stages": 2, + "num_buffers_warp_spec": 0, + "num_consumer_groups": 0, + "reg_dec_producer": 0, + "reg_inc_consumer": 0, + "maxnreg": null, + "pre_hook": null + }, + "(23, 1641261, 6, 8, 'torch.float32', 'torch.uint32', 'torch.float32', 'torch.float32')": { + "kwargs": { + "BM": 8, + "BK": 8 + }, + "num_warps": 2, + "num_ctas": 1, + "num_stages": 2, + "num_buffers_warp_spec": 0, + "num_consumer_groups": 0, + "reg_dec_producer": 0, + "reg_inc_consumer": 0, + "maxnreg": null, + "pre_hook": null + }, + "(23, 15495770, 6, 8, 'torch.float32', 'torch.uint32', 'torch.float32', 'torch.float32')": { + "kwargs": { + "BM": 8, + "BK": 8 + }, + "num_warps": 2, + "num_ctas": 1, + "num_stages": 2, + "num_buffers_warp_spec": 0, + "num_consumer_groups": 0, + "reg_dec_producer": 0, + "reg_inc_consumer": 0, + "maxnreg": null, + "pre_hook": null + }, + "(23, 1808517, 6, 8, 'torch.float32', 'torch.uint32', 'torch.float32', 'torch.float32')": { + "kwargs": { + "BM": 8, + "BK": 8 + }, + "num_warps": 2, + "num_ctas": 1, + "num_stages": 2, + "num_buffers_warp_spec": 0, + "num_consumer_groups": 0, + "reg_dec_producer": 0, + "reg_inc_consumer": 0, + "maxnreg": null, + "pre_hook": null + }, + "(23, 9814710, 6, 8, 'torch.float32', 'torch.uint32', 'torch.float32', 'torch.float32')": { + "kwargs": { + "BM": 32, + "BK": 8 + }, + "num_warps": 2, + "num_ctas": 1, + "num_stages": 2, + "num_buffers_warp_spec": 0, + "num_consumer_groups": 0, + "reg_dec_producer": 0, + "reg_inc_consumer": 0, + "maxnreg": null, + "pre_hook": null + }, + "(23, 1780790, 6, 8, 'torch.float32', 'torch.uint32', 'torch.float32', 'torch.float32')": { + "kwargs": { + "BM": 8, + "BK": 8 + }, + "num_warps": 2, + "num_ctas": 1, + "num_stages": 2, + "num_buffers_warp_spec": 0, + "num_consumer_groups": 0, + "reg_dec_producer": 0, + "reg_inc_consumer": 0, + "maxnreg": null, + "pre_hook": null + }, + "(23, 14310670, 6, 8, 'torch.float32', 'torch.uint32', 'torch.float32', 'torch.float32')": { + "kwargs": { + "BM": 32, + "BK": 8 + }, + "num_warps": 2, + "num_ctas": 1, + "num_stages": 2, + "num_buffers_warp_spec": 0, + "num_consumer_groups": 0, + "reg_dec_producer": 0, + "reg_inc_consumer": 0, + "maxnreg": null, + "pre_hook": null + }, + "(24, 2113485, 6, 8, 'torch.float32', 'torch.uint32', 'torch.float32', 'torch.float32')": { + "kwargs": { + "BM": 8, + "BK": 8 + }, + "num_warps": 2, + "num_ctas": 1, + "num_stages": 2, + "num_buffers_warp_spec": 0, + "num_consumer_groups": 0, + "reg_dec_producer": 0, + "reg_inc_consumer": 0, + "maxnreg": null, + "pre_hook": null + }, + "(24, 21145957, 6, 8, 'torch.float32', 'torch.uint32', 'torch.float32', 'torch.float32')": { + "kwargs": { + "BM": 8, + "BK": 32 + }, + "num_warps": 2, + "num_ctas": 1, + "num_stages": 2, + "num_buffers_warp_spec": 0, + "num_consumer_groups": 0, + "reg_dec_producer": 0, + "reg_inc_consumer": 0, + "maxnreg": null, + "pre_hook": null + }, + "(23, 1782768, 6, 8, 'torch.float32', 'torch.uint32', 'torch.float32', 'torch.float32')": { + "kwargs": { + "BM": 8, + "BK": 8 + }, + "num_warps": 2, + "num_ctas": 1, + "num_stages": 2, + "num_buffers_warp_spec": 0, + "num_consumer_groups": 0, + "reg_dec_producer": 0, + "reg_inc_consumer": 0, + "maxnreg": null, + "pre_hook": null + }, + "(23, 14828658, 6, 8, 'torch.float32', 'torch.uint32', 'torch.float32', 'torch.float32')": { + "kwargs": { + "BM": 16, + "BK": 8 + }, + "num_warps": 2, + "num_ctas": 1, + "num_stages": 2, + "num_buffers_warp_spec": 0, + "num_consumer_groups": 0, + "reg_dec_producer": 0, + "reg_inc_consumer": 0, + "maxnreg": null, + "pre_hook": null + }, + "(24, 1577958, 6, 8, 'torch.float32', 'torch.uint32', 'torch.float32', 'torch.float32')": { + "kwargs": { + "BM": 8, + "BK": 8 + }, + "num_warps": 2, + "num_ctas": 1, + "num_stages": 2, + "num_buffers_warp_spec": 0, + "num_consumer_groups": 0, + "reg_dec_producer": 0, + "reg_inc_consumer": 0, + "maxnreg": null, + "pre_hook": null + }, + "(24, 17012742, 6, 8, 'torch.float32', 'torch.uint32', 'torch.float32', 'torch.float32')": { + "kwargs": { + "BM": 32, + "BK": 8 + }, + "num_warps": 2, + "num_ctas": 1, + "num_stages": 2, + "num_buffers_warp_spec": 0, + "num_consumer_groups": 0, + "reg_dec_producer": 0, + "reg_inc_consumer": 0, + "maxnreg": null, + "pre_hook": null + }, + "(23, 2345014, 6, 8, 'torch.float32', 'torch.uint32', 'torch.float32', 'torch.float32')": { + "kwargs": { + "BM": 8, + "BK": 8 + }, + "num_warps": 2, + "num_ctas": 1, + "num_stages": 2, + "num_buffers_warp_spec": 0, + "num_consumer_groups": 0, + "reg_dec_producer": 0, + "reg_inc_consumer": 0, + "maxnreg": null, + "pre_hook": null + }, + "(23, 12887107, 6, 8, 'torch.float32', 'torch.uint32', 'torch.float32', 'torch.float32')": { + "kwargs": { + "BM": 16, + "BK": 8 + }, + "num_warps": 2, + "num_ctas": 1, + "num_stages": 2, + "num_buffers_warp_spec": 0, + "num_consumer_groups": 0, + "reg_dec_producer": 0, + "reg_inc_consumer": 0, + "maxnreg": null, + "pre_hook": null + }, + "(23, 2168973, 6, 8, 'torch.float32', 'torch.uint32', 'torch.float32', 'torch.float32')": { + "kwargs": { + "BM": 8, + "BK": 8 + }, + "num_warps": 2, + "num_ctas": 1, + "num_stages": 2, + "num_buffers_warp_spec": 0, + "num_consumer_groups": 0, + "reg_dec_producer": 0, + "reg_inc_consumer": 0, + "maxnreg": null, + "pre_hook": null + }, + "(23, 12754203, 6, 8, 'torch.float32', 'torch.uint32', 'torch.float32', 'torch.float32')": { + "kwargs": { + "BM": 16, + "BK": 8 + }, + "num_warps": 2, + "num_ctas": 1, + "num_stages": 2, + "num_buffers_warp_spec": 0, + "num_consumer_groups": 0, + "reg_dec_producer": 0, + "reg_inc_consumer": 0, + "maxnreg": null, + "pre_hook": null + }, + "(23, 1873811, 6, 8, 'torch.float32', 'torch.uint32', 'torch.float32', 'torch.float32')": { + "kwargs": { + "BM": 8, + "BK": 8 + }, + "num_warps": 2, + "num_ctas": 1, + "num_stages": 2, + "num_buffers_warp_spec": 0, + "num_consumer_groups": 0, + "reg_dec_producer": 0, + "reg_inc_consumer": 0, + "maxnreg": null, + "pre_hook": null + }, + "(23, 12506390, 6, 8, 'torch.float32', 'torch.uint32', 'torch.float32', 'torch.float32')": { + "kwargs": { + "BM": 16, + "BK": 8 + }, + "num_warps": 2, + "num_ctas": 1, + "num_stages": 2, + "num_buffers_warp_spec": 0, + "num_consumer_groups": 0, + "reg_dec_producer": 0, + "reg_inc_consumer": 0, + "maxnreg": null, + "pre_hook": null + }, + "(23, 2223717, 6, 8, 'torch.float32', 'torch.uint32', 'torch.float32', 'torch.float32')": { + "kwargs": { + "BM": 8, + "BK": 8 + }, + "num_warps": 2, + "num_ctas": 1, + "num_stages": 2, + "num_buffers_warp_spec": 0, + "num_consumer_groups": 0, + "reg_dec_producer": 0, + "reg_inc_consumer": 0, + "maxnreg": null, + "pre_hook": null + }, + "(23, 14387524, 6, 8, 'torch.float32', 'torch.uint32', 'torch.float32', 'torch.float32')": { + "kwargs": { + "BM": 16, + "BK": 8 + }, + "num_warps": 2, + "num_ctas": 1, + "num_stages": 2, + "num_buffers_warp_spec": 0, + "num_consumer_groups": 0, + "reg_dec_producer": 0, + "reg_inc_consumer": 0, + "maxnreg": null, + "pre_hook": null + } + } + } +} \ No newline at end of file