ArtemisTAO committed
Commit ec78611 · verified · 0 Parent(s)

Duplicate from ArtemisTAO/WIN_21_1

This view is limited to 50 files because the commit contains too many changes.

Files changed (50)
  1. .gitattributes +191 -0
  2. Dockerfile +48 -0
  3. LICENSE +201 -0
  4. README.md +13 -0
  5. assets/Qwen2.5_Omni.pdf +3 -0
  6. cookbooks/=4.41.0 +17 -0
  7. cookbooks/=4.50.0.dev0 +17 -0
  8. cookbooks/=4.51.0.dev0 +17 -0
  9. cookbooks/flash-attention/.github/workflows/publish.yml +218 -0
  10. cookbooks/flash-attention/.gitignore +31 -0
  11. cookbooks/flash-attention/.gitmodules +6 -0
  12. cookbooks/flash-attention/AUTHORS +1 -0
  13. cookbooks/flash-attention/LICENSE +29 -0
  14. cookbooks/flash-attention/MANIFEST.in +12 -0
  15. cookbooks/flash-attention/Makefile +9 -0
  16. cookbooks/flash-attention/README.md +524 -0
  17. cookbooks/flash-attention/assets/flash2_a100_fwd_bwd_benchmark.png +3 -0
  18. cookbooks/flash-attention/assets/flash2_h100_fwd_bwd_benchmark.png +3 -0
  19. cookbooks/flash-attention/assets/flash3_fp16_fwd.png +3 -0
  20. cookbooks/flash-attention/assets/flashattention_logo.png +3 -0
  21. cookbooks/flash-attention/assets/flashattn_banner.jpg +3 -0
  22. cookbooks/flash-attention/assets/flashattn_banner.pdf +3 -0
  23. cookbooks/flash-attention/assets/flashattn_memory.jpg +0 -0
  24. cookbooks/flash-attention/assets/flashattn_speedup.jpg +3 -0
  25. cookbooks/flash-attention/assets/flashattn_speedup_3090.jpg +3 -0
  26. cookbooks/flash-attention/assets/flashattn_speedup_a100_d128.jpg +3 -0
  27. cookbooks/flash-attention/assets/flashattn_speedup_t4.jpg +3 -0
  28. cookbooks/flash-attention/assets/flashattn_speedup_t4_fwd.jpg +3 -0
  29. cookbooks/flash-attention/assets/gpt2_training_curve.jpg +3 -0
  30. cookbooks/flash-attention/assets/gpt2_training_efficiency.jpg +3 -0
  31. cookbooks/flash-attention/assets/gpt3_training_curve.jpg +3 -0
  32. cookbooks/flash-attention/assets/gpt3_training_efficiency.jpg +3 -0
  33. cookbooks/flash-attention/benchmarks/benchmark_alibi.py +275 -0
  34. cookbooks/flash-attention/benchmarks/benchmark_causal.py +225 -0
  35. cookbooks/flash-attention/benchmarks/benchmark_flash_attention.py +180 -0
  36. cookbooks/flash-attention/benchmarks/benchmark_gemm.py +47 -0
  37. cookbooks/flash-attention/csrc/flash_attn/flash_api.cpp +1485 -0
  38. cookbooks/flash-attention/csrc/flash_attn/src/alibi.h +75 -0
  39. cookbooks/flash-attention/csrc/flash_attn/src/block_info.h +49 -0
  40. cookbooks/flash-attention/csrc/flash_attn/src/dropout.h +95 -0
  41. cookbooks/flash-attention/csrc/flash_attn/src/flash.h +194 -0
  42. cookbooks/flash-attention/csrc/flash_attn/src/flash_bwd_hdim128_bf16_causal_sm80.cu +14 -0
  43. cookbooks/flash-attention/csrc/flash_attn/src/flash_bwd_hdim128_bf16_sm80.cu +14 -0
  44. cookbooks/flash-attention/csrc/flash_attn/src/flash_bwd_hdim128_fp16_causal_sm80.cu +14 -0
  45. cookbooks/flash-attention/csrc/flash_attn/src/flash_bwd_hdim128_fp16_sm80.cu +14 -0
  46. cookbooks/flash-attention/csrc/flash_attn/src/flash_bwd_hdim160_bf16_causal_sm80.cu +14 -0
  47. cookbooks/flash-attention/csrc/flash_attn/src/flash_bwd_hdim160_bf16_sm80.cu +14 -0
  48. cookbooks/flash-attention/csrc/flash_attn/src/flash_bwd_hdim160_fp16_causal_sm80.cu +14 -0
  49. cookbooks/flash-attention/csrc/flash_attn/src/flash_bwd_hdim160_fp16_sm80.cu +14 -0
  50. cookbooks/flash-attention/csrc/flash_attn/src/flash_bwd_hdim192_bf16_causal_sm80.cu +14 -0
.gitattributes ADDED
@@ -0,0 +1,191 @@
+ *.7z filter=lfs diff=lfs merge=lfs -text
+ *.arrow filter=lfs diff=lfs merge=lfs -text
+ *.bin filter=lfs diff=lfs merge=lfs -text
+ *.bz2 filter=lfs diff=lfs merge=lfs -text
+ *.ckpt filter=lfs diff=lfs merge=lfs -text
+ *.ftz filter=lfs diff=lfs merge=lfs -text
+ *.gz filter=lfs diff=lfs merge=lfs -text
+ *.h5 filter=lfs diff=lfs merge=lfs -text
+ *.joblib filter=lfs diff=lfs merge=lfs -text
+ *.lfs.* filter=lfs diff=lfs merge=lfs -text
+ *.mlmodel filter=lfs diff=lfs merge=lfs -text
+ *.model filter=lfs diff=lfs merge=lfs -text
+ *.msgpack filter=lfs diff=lfs merge=lfs -text
+ *.npy filter=lfs diff=lfs merge=lfs -text
+ *.npz filter=lfs diff=lfs merge=lfs -text
+ *.onnx filter=lfs diff=lfs merge=lfs -text
+ *.ot filter=lfs diff=lfs merge=lfs -text
+ *.parquet filter=lfs diff=lfs merge=lfs -text
+ *.pb filter=lfs diff=lfs merge=lfs -text
+ *.pickle filter=lfs diff=lfs merge=lfs -text
+ *.pkl filter=lfs diff=lfs merge=lfs -text
+ *.pt filter=lfs diff=lfs merge=lfs -text
+ *.pth filter=lfs diff=lfs merge=lfs -text
+ *.rar filter=lfs diff=lfs merge=lfs -text
+ *.safetensors filter=lfs diff=lfs merge=lfs -text
+ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
+ *.tar.* filter=lfs diff=lfs merge=lfs -text
+ *.tar filter=lfs diff=lfs merge=lfs -text
+ *.tflite filter=lfs diff=lfs merge=lfs -text
+ *.tgz filter=lfs diff=lfs merge=lfs -text
+ *.wasm filter=lfs diff=lfs merge=lfs -text
+ *.xz filter=lfs diff=lfs merge=lfs -text
+ *.zip filter=lfs diff=lfs merge=lfs -text
+ *.zst filter=lfs diff=lfs merge=lfs -text
+ *tfevents* filter=lfs diff=lfs merge=lfs -text
+ assets/Qwen2.5_Omni.pdf filter=lfs diff=lfs merge=lfs -text
+ cookbooks/flash-attention/assets/flash2_a100_fwd_bwd_benchmark.png filter=lfs diff=lfs merge=lfs -text
+ cookbooks/flash-attention/assets/flash2_h100_fwd_bwd_benchmark.png filter=lfs diff=lfs merge=lfs -text
+ cookbooks/flash-attention/assets/flash3_fp16_fwd.png filter=lfs diff=lfs merge=lfs -text
+ cookbooks/flash-attention/assets/flashattention_logo.png filter=lfs diff=lfs merge=lfs -text
+ cookbooks/flash-attention/assets/flashattn_banner.jpg filter=lfs diff=lfs merge=lfs -text
+ cookbooks/flash-attention/assets/flashattn_banner.pdf filter=lfs diff=lfs merge=lfs -text
+ cookbooks/flash-attention/assets/flashattn_speedup.jpg filter=lfs diff=lfs merge=lfs -text
+ cookbooks/flash-attention/assets/flashattn_speedup_3090.jpg filter=lfs diff=lfs merge=lfs -text
+ cookbooks/flash-attention/assets/flashattn_speedup_a100_d128.jpg filter=lfs diff=lfs merge=lfs -text
+ cookbooks/flash-attention/assets/flashattn_speedup_t4.jpg filter=lfs diff=lfs merge=lfs -text
+ cookbooks/flash-attention/assets/flashattn_speedup_t4_fwd.jpg filter=lfs diff=lfs merge=lfs -text
+ cookbooks/flash-attention/assets/gpt2_training_curve.jpg filter=lfs diff=lfs merge=lfs -text
+ cookbooks/flash-attention/assets/gpt2_training_efficiency.jpg filter=lfs diff=lfs merge=lfs -text
+ cookbooks/flash-attention/assets/gpt3_training_curve.jpg filter=lfs diff=lfs merge=lfs -text
+ cookbooks/flash-attention/assets/gpt3_training_efficiency.jpg filter=lfs diff=lfs merge=lfs -text
+ flash-attention/assets/flash2_a100_fwd_bwd_benchmark.png filter=lfs diff=lfs merge=lfs -text
+ flash-attention/assets/flash2_h100_fwd_bwd_benchmark.png filter=lfs diff=lfs merge=lfs -text
+ flash-attention/assets/flash3_fp16_fwd.png filter=lfs diff=lfs merge=lfs -text
+ flash-attention/assets/flashattention_logo.png filter=lfs diff=lfs merge=lfs -text
+ flash-attention/assets/flashattn_banner.jpg filter=lfs diff=lfs merge=lfs -text
+ flash-attention/assets/flashattn_banner.pdf filter=lfs diff=lfs merge=lfs -text
+ flash-attention/assets/flashattn_speedup.jpg filter=lfs diff=lfs merge=lfs -text
+ flash-attention/assets/flashattn_speedup_3090.jpg filter=lfs diff=lfs merge=lfs -text
+ flash-attention/assets/flashattn_speedup_a100_d128.jpg filter=lfs diff=lfs merge=lfs -text
+ flash-attention/assets/flashattn_speedup_t4.jpg filter=lfs diff=lfs merge=lfs -text
+ flash-attention/assets/flashattn_speedup_t4_fwd.jpg filter=lfs diff=lfs merge=lfs -text
+ flash-attention/assets/gpt2_training_curve.jpg filter=lfs diff=lfs merge=lfs -text
+ flash-attention/assets/gpt2_training_efficiency.jpg filter=lfs diff=lfs merge=lfs -text
+ flash-attention/assets/gpt3_training_curve.jpg filter=lfs diff=lfs merge=lfs -text
+ flash-attention/assets/gpt3_training_efficiency.jpg filter=lfs diff=lfs merge=lfs -text
+ flash-attention/csrc/composable_kernel/docs/data/ck_component.png filter=lfs diff=lfs merge=lfs -text
+ flash-attention/csrc/composable_kernel/docs/data/ck_layer.png filter=lfs diff=lfs merge=lfs -text
+ flash-attention/csrc/composable_kernel/example/ck_tile/14_moe_smoothquant/misc/moe-sm.png filter=lfs diff=lfs merge=lfs -text
+ flash-attention/csrc/composable_kernel/example/ck_tile/15_fused_moe/misc/moe-2.png filter=lfs diff=lfs merge=lfs -text
+ flash-attention/csrc/cutlass/media/images/M128xK4_scalefactor_gmem.png filter=lfs diff=lfs merge=lfs -text
+ flash-attention/csrc/cutlass/media/images/conv2d-fprop-int4.png filter=lfs diff=lfs merge=lfs -text
+ flash-attention/csrc/cutlass/media/images/cute/HMMA.8x8x4.NT.png filter=lfs diff=lfs merge=lfs -text
+ flash-attention/csrc/cutlass/media/images/cute/HMMA.8x8x4.NT_2x2.png filter=lfs diff=lfs merge=lfs -text
+ flash-attention/csrc/cutlass/media/images/cute/HMMA.8x8x4.NT_2x2_32Mx32x4.png filter=lfs diff=lfs merge=lfs -text
+ flash-attention/csrc/cutlass/media/images/cute/HMMA.8x8x4.NT_2x2_32x32x4.png filter=lfs diff=lfs merge=lfs -text
+ flash-attention/csrc/cutlass/media/images/cute/HMMA.8x8x4.NT_Atom.png filter=lfs diff=lfs merge=lfs -text
+ flash-attention/csrc/cutlass/media/images/cute/HMMA.8x8x4.quadpair.AB.png filter=lfs diff=lfs merge=lfs -text
+ flash-attention/csrc/cutlass/media/images/cute/HMMA.8x8x4.quadpair.C.png filter=lfs diff=lfs merge=lfs -text
+ flash-attention/csrc/cutlass/media/images/cute/TiledCopyA.png filter=lfs diff=lfs merge=lfs -text
+ flash-attention/csrc/cutlass/media/images/cute/TiledMmaC.png filter=lfs diff=lfs merge=lfs -text
+ flash-attention/csrc/cutlass/media/images/cute/composition1.png filter=lfs diff=lfs merge=lfs -text
+ flash-attention/csrc/cutlass/media/images/cute/composition2.png filter=lfs diff=lfs merge=lfs -text
+ flash-attention/csrc/cutlass/media/images/cute/divide2.png filter=lfs diff=lfs merge=lfs -text
+ flash-attention/csrc/cutlass/media/images/cute/divide3.png filter=lfs diff=lfs merge=lfs -text
+ flash-attention/csrc/cutlass/media/images/cute/gmma_coremat_cd_fp16.png filter=lfs diff=lfs merge=lfs -text
+ flash-attention/csrc/cutlass/media/images/cute/gmma_wg_n_slice.png filter=lfs diff=lfs merge=lfs -text
+ flash-attention/csrc/cutlass/media/images/cute/logical_divide-and-zipped_divide-2.png filter=lfs diff=lfs merge=lfs -text
+ flash-attention/csrc/cutlass/media/images/cute/logical_divide-and-zipped_divide.png filter=lfs diff=lfs merge=lfs -text
+ flash-attention/csrc/cutlass/media/images/cute/product2d.png filter=lfs diff=lfs merge=lfs -text
+ flash-attention/csrc/cutlass/media/images/cute/productblocked2d.png filter=lfs diff=lfs merge=lfs -text
+ flash-attention/csrc/cutlass/media/images/cute/productraked2d.png filter=lfs diff=lfs merge=lfs -text
+ flash-attention/csrc/cutlass/media/images/cute/slice.png filter=lfs diff=lfs merge=lfs -text
+ flash-attention/csrc/cutlass/media/images/cute/tC_partitioning.png filter=lfs diff=lfs merge=lfs -text
+ flash-attention/csrc/cutlass/media/images/cute/tv_layout.png filter=lfs diff=lfs merge=lfs -text
+ flash-attention/csrc/cutlass/media/images/cutlass-2.8-gemm-performance.png filter=lfs diff=lfs merge=lfs -text
+ flash-attention/csrc/cutlass/media/images/cutlass-2.9-implicit-gemm-performance.png filter=lfs diff=lfs merge=lfs -text
+ flash-attention/csrc/cutlass/media/images/cutlass-3.0-gemm-peak-performance.png filter=lfs diff=lfs merge=lfs -text
+ flash-attention/csrc/cutlass/media/images/cutlass-3.1-gemm-peak-performance.png filter=lfs diff=lfs merge=lfs -text
+ flash-attention/csrc/cutlass/media/images/cutlass-3.5.1-gemm-peak-performance-fp8.png filter=lfs diff=lfs merge=lfs -text
+ flash-attention/csrc/cutlass/media/images/cutlass-3.5.1-gemm-peak-performance.png filter=lfs diff=lfs merge=lfs -text
+ flash-attention/csrc/cutlass/media/images/cutlass-gemm-components.png filter=lfs diff=lfs merge=lfs -text
+ flash-attention/csrc/cutlass/media/images/cutlass-reduction-in-named-iterators.png filter=lfs diff=lfs merge=lfs -text
+ flash-attention/csrc/cutlass/media/images/cutlass-threadblock-mma-pipelined.png filter=lfs diff=lfs merge=lfs -text
+ flash-attention/csrc/cutlass/media/images/cutlass-tile-structure.png filter=lfs diff=lfs merge=lfs -text
+ flash-attention/csrc/cutlass/media/images/cutlass-warp-level-gemm-api-instantiation.png filter=lfs diff=lfs merge=lfs -text
+ flash-attention/csrc/cutlass/media/images/cutlass-warp-thread-tile-structure.png filter=lfs diff=lfs merge=lfs -text
+ flash-attention/csrc/cutlass/media/images/gemm-hierarchy-with-epilogue-no-labels.png filter=lfs diff=lfs merge=lfs -text
+ flash-attention/csrc/cutlass/media/images/gemm-hierarchy-with-epilogue.png filter=lfs diff=lfs merge=lfs -text
+ flash-attention/csrc/cutlass/media/images/gemm-structural-components.png filter=lfs diff=lfs merge=lfs -text
+ flash-attention/csrc/cutlass/media/images/ldmatrix-8x128bx4.png filter=lfs diff=lfs merge=lfs -text
+ flash-attention/csrc/cutlass/media/images/ldmatrix-tensorop-32x32x32.png filter=lfs diff=lfs merge=lfs -text
+ flash-attention/csrc/cutlass/media/images/mma-8x8x32.png filter=lfs diff=lfs merge=lfs -text
+ flash-attention/csrc/cutlass/media/images/non_persistent.png filter=lfs diff=lfs merge=lfs -text
+ flash-attention/csrc/cutlass/media/images/persistent_clc.png filter=lfs diff=lfs merge=lfs -text
+ flash-attention/csrc/cutlass/media/images/persistent_static.png filter=lfs diff=lfs merge=lfs -text
+ flash-attention/csrc/cutlass/media/images/software-pipeline.png filter=lfs diff=lfs merge=lfs -text
+ flash-attention/csrc/cutlass/media/images/tensor-op-permuted-smem-layout-TN-k0.png filter=lfs diff=lfs merge=lfs -text
+ flash-attention/csrc/cutlass/media/images/tensor-op-permuted-smem-layout-TN-k1.png filter=lfs diff=lfs merge=lfs -text
+ flash-attention/csrc/cutlass/media/images/tensor-op-permuted-smem-layout-TN.png filter=lfs diff=lfs merge=lfs -text
+ flash-attention/flash-attention/assets/flash2_a100_fwd_bwd_benchmark.png filter=lfs diff=lfs merge=lfs -text
+ flash-attention/flash-attention/assets/flash2_h100_fwd_bwd_benchmark.png filter=lfs diff=lfs merge=lfs -text
+ flash-attention/flash-attention/assets/flash3_fp16_fwd.png filter=lfs diff=lfs merge=lfs -text
+ flash-attention/flash-attention/assets/flashattention_logo.png filter=lfs diff=lfs merge=lfs -text
+ flash-attention/flash-attention/assets/flashattn_banner.jpg filter=lfs diff=lfs merge=lfs -text
+ flash-attention/flash-attention/assets/flashattn_banner.pdf filter=lfs diff=lfs merge=lfs -text
+ flash-attention/flash-attention/assets/flashattn_speedup.jpg filter=lfs diff=lfs merge=lfs -text
+ flash-attention/flash-attention/assets/flashattn_speedup_3090.jpg filter=lfs diff=lfs merge=lfs -text
+ flash-attention/flash-attention/assets/flashattn_speedup_a100_d128.jpg filter=lfs diff=lfs merge=lfs -text
+ flash-attention/flash-attention/assets/flashattn_speedup_t4.jpg filter=lfs diff=lfs merge=lfs -text
+ flash-attention/flash-attention/assets/flashattn_speedup_t4_fwd.jpg filter=lfs diff=lfs merge=lfs -text
+ flash-attention/flash-attention/assets/gpt2_training_curve.jpg filter=lfs diff=lfs merge=lfs -text
+ flash-attention/flash-attention/assets/gpt2_training_efficiency.jpg filter=lfs diff=lfs merge=lfs -text
+ flash-attention/flash-attention/assets/gpt3_training_curve.jpg filter=lfs diff=lfs merge=lfs -text
+ flash-attention/flash-attention/assets/gpt3_training_efficiency.jpg filter=lfs diff=lfs merge=lfs -text
+ flash-attention/flash-attention/csrc/composable_kernel/docs/data/ck_component.png filter=lfs diff=lfs merge=lfs -text
+ flash-attention/flash-attention/csrc/composable_kernel/docs/data/ck_layer.png filter=lfs diff=lfs merge=lfs -text
+ flash-attention/flash-attention/csrc/composable_kernel/example/ck_tile/14_moe_smoothquant/misc/moe-sm.png filter=lfs diff=lfs merge=lfs -text
+ flash-attention/flash-attention/csrc/composable_kernel/example/ck_tile/15_fused_moe/misc/moe-2.png filter=lfs diff=lfs merge=lfs -text
+ flash-attention/flash-attention/csrc/cutlass/media/images/M128xK4_scalefactor_gmem.png filter=lfs diff=lfs merge=lfs -text
+ flash-attention/flash-attention/csrc/cutlass/media/images/conv2d-fprop-int4.png filter=lfs diff=lfs merge=lfs -text
+ flash-attention/flash-attention/csrc/cutlass/media/images/cute/HMMA.8x8x4.NT.png filter=lfs diff=lfs merge=lfs -text
+ flash-attention/flash-attention/csrc/cutlass/media/images/cute/HMMA.8x8x4.NT_2x2.png filter=lfs diff=lfs merge=lfs -text
+ flash-attention/flash-attention/csrc/cutlass/media/images/cute/HMMA.8x8x4.NT_2x2_32Mx32x4.png filter=lfs diff=lfs merge=lfs -text
+ flash-attention/flash-attention/csrc/cutlass/media/images/cute/HMMA.8x8x4.NT_2x2_32x32x4.png filter=lfs diff=lfs merge=lfs -text
+ flash-attention/flash-attention/csrc/cutlass/media/images/cute/HMMA.8x8x4.NT_Atom.png filter=lfs diff=lfs merge=lfs -text
+ flash-attention/flash-attention/csrc/cutlass/media/images/cute/HMMA.8x8x4.quadpair.AB.png filter=lfs diff=lfs merge=lfs -text
+ flash-attention/flash-attention/csrc/cutlass/media/images/cute/HMMA.8x8x4.quadpair.C.png filter=lfs diff=lfs merge=lfs -text
+ flash-attention/flash-attention/csrc/cutlass/media/images/cute/TiledCopyA.png filter=lfs diff=lfs merge=lfs -text
+ flash-attention/flash-attention/csrc/cutlass/media/images/cute/TiledMmaC.png filter=lfs diff=lfs merge=lfs -text
+ flash-attention/flash-attention/csrc/cutlass/media/images/cute/composition1.png filter=lfs diff=lfs merge=lfs -text
+ flash-attention/flash-attention/csrc/cutlass/media/images/cute/composition2.png filter=lfs diff=lfs merge=lfs -text
+ flash-attention/flash-attention/csrc/cutlass/media/images/cute/divide2.png filter=lfs diff=lfs merge=lfs -text
+ flash-attention/flash-attention/csrc/cutlass/media/images/cute/divide3.png filter=lfs diff=lfs merge=lfs -text
+ flash-attention/flash-attention/csrc/cutlass/media/images/cute/gmma_coremat_cd_fp16.png filter=lfs diff=lfs merge=lfs -text
+ flash-attention/flash-attention/csrc/cutlass/media/images/cute/gmma_wg_n_slice.png filter=lfs diff=lfs merge=lfs -text
+ flash-attention/flash-attention/csrc/cutlass/media/images/cute/logical_divide-and-zipped_divide-2.png filter=lfs diff=lfs merge=lfs -text
+ flash-attention/flash-attention/csrc/cutlass/media/images/cute/logical_divide-and-zipped_divide.png filter=lfs diff=lfs merge=lfs -text
+ flash-attention/flash-attention/csrc/cutlass/media/images/cute/product2d.png filter=lfs diff=lfs merge=lfs -text
+ flash-attention/flash-attention/csrc/cutlass/media/images/cute/productblocked2d.png filter=lfs diff=lfs merge=lfs -text
+ flash-attention/flash-attention/csrc/cutlass/media/images/cute/productraked2d.png filter=lfs diff=lfs merge=lfs -text
+ flash-attention/flash-attention/csrc/cutlass/media/images/cute/slice.png filter=lfs diff=lfs merge=lfs -text
+ flash-attention/flash-attention/csrc/cutlass/media/images/cute/tC_partitioning.png filter=lfs diff=lfs merge=lfs -text
+ flash-attention/flash-attention/csrc/cutlass/media/images/cute/tv_layout.png filter=lfs diff=lfs merge=lfs -text
+ flash-attention/flash-attention/csrc/cutlass/media/images/cutlass-2.8-gemm-performance.png filter=lfs diff=lfs merge=lfs -text
+ flash-attention/flash-attention/csrc/cutlass/media/images/cutlass-2.9-implicit-gemm-performance.png filter=lfs diff=lfs merge=lfs -text
+ flash-attention/flash-attention/csrc/cutlass/media/images/cutlass-3.0-gemm-peak-performance.png filter=lfs diff=lfs merge=lfs -text
+ flash-attention/flash-attention/csrc/cutlass/media/images/cutlass-3.1-gemm-peak-performance.png filter=lfs diff=lfs merge=lfs -text
+ flash-attention/flash-attention/csrc/cutlass/media/images/cutlass-3.5.1-gemm-peak-performance-fp8.png filter=lfs diff=lfs merge=lfs -text
+ flash-attention/flash-attention/csrc/cutlass/media/images/cutlass-3.5.1-gemm-peak-performance.png filter=lfs diff=lfs merge=lfs -text
+ flash-attention/flash-attention/csrc/cutlass/media/images/cutlass-gemm-components.png filter=lfs diff=lfs merge=lfs -text
+ flash-attention/flash-attention/csrc/cutlass/media/images/cutlass-reduction-in-named-iterators.png filter=lfs diff=lfs merge=lfs -text
+ flash-attention/flash-attention/csrc/cutlass/media/images/cutlass-threadblock-mma-pipelined.png filter=lfs diff=lfs merge=lfs -text
+ flash-attention/flash-attention/csrc/cutlass/media/images/cutlass-tile-structure.png filter=lfs diff=lfs merge=lfs -text
+ flash-attention/flash-attention/csrc/cutlass/media/images/cutlass-warp-level-gemm-api-instantiation.png filter=lfs diff=lfs merge=lfs -text
+ flash-attention/flash-attention/csrc/cutlass/media/images/cutlass-warp-thread-tile-structure.png filter=lfs diff=lfs merge=lfs -text
+ flash-attention/flash-attention/csrc/cutlass/media/images/gemm-hierarchy-with-epilogue-no-labels.png filter=lfs diff=lfs merge=lfs -text
+ flash-attention/flash-attention/csrc/cutlass/media/images/gemm-hierarchy-with-epilogue.png filter=lfs diff=lfs merge=lfs -text
+ flash-attention/flash-attention/csrc/cutlass/media/images/gemm-structural-components.png filter=lfs diff=lfs merge=lfs -text
+ flash-attention/flash-attention/csrc/cutlass/media/images/ldmatrix-8x128bx4.png filter=lfs diff=lfs merge=lfs -text
+ flash-attention/flash-attention/csrc/cutlass/media/images/ldmatrix-tensorop-32x32x32.png filter=lfs diff=lfs merge=lfs -text
+ flash-attention/flash-attention/csrc/cutlass/media/images/mma-8x8x32.png filter=lfs diff=lfs merge=lfs -text
+ flash-attention/flash-attention/csrc/cutlass/media/images/non_persistent.png filter=lfs diff=lfs merge=lfs -text
+ flash-attention/flash-attention/csrc/cutlass/media/images/persistent_clc.png filter=lfs diff=lfs merge=lfs -text
+ flash-attention/flash-attention/csrc/cutlass/media/images/persistent_static.png filter=lfs diff=lfs merge=lfs -text
+ flash-attention/flash-attention/csrc/cutlass/media/images/software-pipeline.png filter=lfs diff=lfs merge=lfs -text
+ flash-attention/flash-attention/csrc/cutlass/media/images/tensor-op-permuted-smem-layout-TN-k0.png filter=lfs diff=lfs merge=lfs -text
+ flash-attention/flash-attention/csrc/cutlass/media/images/tensor-op-permuted-smem-layout-TN-k1.png filter=lfs diff=lfs merge=lfs -text
+ flash-attention/flash-attention/csrc/cutlass/media/images/tensor-op-permuted-smem-layout-TN.png filter=lfs diff=lfs merge=lfs -text
+ input_audio.wav filter=lfs diff=lfs merge=lfs -text
+ model/Qwen2.5-Omni-7B/tokenizer.json filter=lfs diff=lfs merge=lfs -text
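Each pattern above routes matching files through Git LFS, so the repository stores lightweight pointer files in place of the binaries themselves. As a hedged sketch (assuming Git LFS is installed locally), entries like these are normally produced with `git lfs track` rather than written by hand:

```bash
# Append a pattern line like the ones above to .gitattributes and stage it
git lfs track "*.pdf"
git add .gitattributes

# List the patterns currently routed through LFS
git lfs track
```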
Dockerfile ADDED
@@ -0,0 +1,48 @@
+ FROM nvidia/cuda:12.3.2-cudnn9-devel-ubuntu22.04
+
+ # Set environment variables
+ ENV PYTHONUNBUFFERED=1 \
+     DEBIAN_FRONTEND=noninteractive \
+     HF_HOME=/app/models \
+     NUMBA_CACHE_DIR=/tmp/numba_cache \
+     TORCH_CUDA_ARCH_LIST=8.0
+
+ # Install system dependencies
+ RUN apt-get update && apt-get install -y --no-install-recommends \
+     python3 \
+     python3-pip \
+     python3-dev \
+     build-essential \
+     git \
+     ffmpeg \
+     libsndfile1 \
+     libcusparse-dev-12-3 \
+     && rm -rf /var/lib/apt/lists/*
+
+ # Install Python build tools
+ RUN pip install --upgrade pip setuptools wheel packaging ninja
+
+ WORKDIR /app
+
+ # Create cache directory
+ RUN mkdir -p /tmp/numba_cache && \
+     chmod 777 /tmp/numba_cache
+
+ # Install PyTorch with CUDA 12.1 first
+ RUN pip install --pre torch torchvision torchaudio \
+     --index-url https://download.pytorch.org/whl/nightly/cu121
+
+ # Copy and install requirements
+ COPY requirements.txt .
+ RUN pip install --no-cache-dir -r requirements.txt
+
+ # Install flash-attn separately with no isolation
+ RUN pip install flash-attn==2.7.4.post1 --no-build-isolation
+
+ # Copy application files
+ COPY server.py .
+ COPY qwen-omni-utils/ ./qwen-omni-utils/
+ COPY model/ ./model/
+
+ EXPOSE 8000
+ CMD ["python3", "server.py"]
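For context, a minimal build-and-run sketch for this image; the tag `qwen-omni-server` is a placeholder, and `--gpus all` assumes the NVIDIA Container Toolkit is installed, since the CUDA base image and flash-attn need a GPU at runtime:

```bash
# Build from the directory containing this Dockerfile
docker build -t qwen-omni-server .

# Run the server, publishing the port declared by EXPOSE 8000
docker run --rm --gpus all -p 8000:8000 qwen-omni-server

# Optional sanity check: torch sees CUDA and flash-attn imports cleanly
docker run --rm --gpus all qwen-omni-server \
  python3 -c "import torch, flash_attn; print(torch.__version__, torch.cuda.is_available())"
```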
LICENSE ADDED
@@ -0,0 +1,201 @@
+ Apache License
+ Version 2.0, January 2004
+ http://www.apache.org/licenses/
+
+ TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION
+
+ 1. Definitions.
+
+ "License" shall mean the terms and conditions for use, reproduction,
+ and distribution as defined by Sections 1 through 9 of this document.
+
+ "Licensor" shall mean the copyright owner or entity authorized by
+ the copyright owner that is granting the License.
+
+ "Legal Entity" shall mean the union of the acting entity and all
+ other entities that control, are controlled by, or are under common
+ control with that entity. For the purposes of this definition,
+ "control" means (i) the power, direct or indirect, to cause the
+ direction or management of such entity, whether by contract or
+ otherwise, or (ii) ownership of fifty percent (50%) or more of the
+ outstanding shares, or (iii) beneficial ownership of such entity.
+
+ "You" (or "Your") shall mean an individual or Legal Entity
+ exercising permissions granted by this License.
+
+ "Source" form shall mean the preferred form for making modifications,
+ including but not limited to software source code, documentation
+ source, and configuration files.
+
+ "Object" form shall mean any form resulting from mechanical
+ transformation or translation of a Source form, including but
+ not limited to compiled object code, generated documentation,
+ and conversions to other media types.
+
+ "Work" shall mean the work of authorship, whether in Source or
+ Object form, made available under the License, as indicated by a
+ copyright notice that is included in or attached to the work
+ (an example is provided in the Appendix below).
+
+ "Derivative Works" shall mean any work, whether in Source or Object
+ form, that is based on (or derived from) the Work and for which the
+ editorial revisions, annotations, elaborations, or other modifications
+ represent, as a whole, an original work of authorship. For the purposes
+ of this License, Derivative Works shall not include works that remain
+ separable from, or merely link (or bind by name) to the interfaces of,
+ the Work and Derivative Works thereof.
+
+ "Contribution" shall mean any work of authorship, including
+ the original version of the Work and any modifications or additions
+ to that Work or Derivative Works thereof, that is intentionally
+ submitted to Licensor for inclusion in the Work by the copyright owner
+ or by an individual or Legal Entity authorized to submit on behalf of
+ the copyright owner. For the purposes of this definition, "submitted"
+ means any form of electronic, verbal, or written communication sent
+ to the Licensor or its representatives, including but not limited to
+ communication on electronic mailing lists, source code control systems,
+ and issue tracking systems that are managed by, or on behalf of, the
+ Licensor for the purpose of discussing and improving the Work, but
+ excluding communication that is conspicuously marked or otherwise
+ designated in writing by the copyright owner as "Not a Contribution."
+
+ "Contributor" shall mean Licensor and any individual or Legal Entity
+ on behalf of whom a Contribution has been received by Licensor and
+ subsequently incorporated within the Work.
+
+ 2. Grant of Copyright License. Subject to the terms and conditions of
+ this License, each Contributor hereby grants to You a perpetual,
+ worldwide, non-exclusive, no-charge, royalty-free, irrevocable
+ copyright license to reproduce, prepare Derivative Works of,
+ publicly display, publicly perform, sublicense, and distribute the
+ Work and such Derivative Works in Source or Object form.
+
+ 3. Grant of Patent License. Subject to the terms and conditions of
+ this License, each Contributor hereby grants to You a perpetual,
+ worldwide, non-exclusive, no-charge, royalty-free, irrevocable
+ (except as stated in this section) patent license to make, have made,
+ use, offer to sell, sell, import, and otherwise transfer the Work,
+ where such license applies only to those patent claims licensable
+ by such Contributor that are necessarily infringed by their
+ Contribution(s) alone or by combination of their Contribution(s)
+ with the Work to which such Contribution(s) was submitted. If You
+ institute patent litigation against any entity (including a
+ cross-claim or counterclaim in a lawsuit) alleging that the Work
+ or a Contribution incorporated within the Work constitutes direct
+ or contributory patent infringement, then any patent licenses
+ granted to You under this License for that Work shall terminate
+ as of the date such litigation is filed.
+
+ 4. Redistribution. You may reproduce and distribute copies of the
+ Work or Derivative Works thereof in any medium, with or without
+ modifications, and in Source or Object form, provided that You
+ meet the following conditions:
+
+ (a) You must give any other recipients of the Work or
+ Derivative Works a copy of this License; and
+
+ (b) You must cause any modified files to carry prominent notices
+ stating that You changed the files; and
+
+ (c) You must retain, in the Source form of any Derivative Works
+ that You distribute, all copyright, patent, trademark, and
+ attribution notices from the Source form of the Work,
+ excluding those notices that do not pertain to any part of
+ the Derivative Works; and
+
+ (d) If the Work includes a "NOTICE" text file as part of its
+ distribution, then any Derivative Works that You distribute must
+ include a readable copy of the attribution notices contained
+ within such NOTICE file, excluding those notices that do not
+ pertain to any part of the Derivative Works, in at least one
+ of the following places: within a NOTICE text file distributed
+ as part of the Derivative Works; within the Source form or
+ documentation, if provided along with the Derivative Works; or,
+ within a display generated by the Derivative Works, if and
+ wherever such third-party notices normally appear. The contents
+ of the NOTICE file are for informational purposes only and
+ do not modify the License. You may add Your own attribution
+ notices within Derivative Works that You distribute, alongside
+ or as an addendum to the NOTICE text from the Work, provided
+ that such additional attribution notices cannot be construed
+ as modifying the License.
+
+ You may add Your own copyright statement to Your modifications and
+ may provide additional or different license terms and conditions
+ for use, reproduction, or distribution of Your modifications, or
+ for any such Derivative Works as a whole, provided Your use,
+ reproduction, and distribution of the Work otherwise complies with
+ the conditions stated in this License.
+
+ 5. Submission of Contributions. Unless You explicitly state otherwise,
+ any Contribution intentionally submitted for inclusion in the Work
+ by You to the Licensor shall be under the terms and conditions of
+ this License, without any additional terms or conditions.
+ Notwithstanding the above, nothing herein shall supersede or modify
+ the terms of any separate license agreement you may have executed
+ with Licensor regarding such Contributions.
+
+ 6. Trademarks. This License does not grant permission to use the trade
+ names, trademarks, service marks, or product names of the Licensor,
+ except as required for reasonable and customary use in describing the
+ origin of the Work and reproducing the content of the NOTICE file.
+
+ 7. Disclaimer of Warranty. Unless required by applicable law or
+ agreed to in writing, Licensor provides the Work (and each
+ Contributor provides its Contributions) on an "AS IS" BASIS,
+ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
+ implied, including, without limitation, any warranties or conditions
+ of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A
+ PARTICULAR PURPOSE. You are solely responsible for determining the
+ appropriateness of using or redistributing the Work and assume any
+ risks associated with Your exercise of permissions under this License.
+
+ 8. Limitation of Liability. In no event and under no legal theory,
+ whether in tort (including negligence), contract, or otherwise,
+ unless required by applicable law (such as deliberate and grossly
+ negligent acts) or agreed to in writing, shall any Contributor be
+ liable to You for damages, including any direct, indirect, special,
+ incidental, or consequential damages of any character arising as a
+ result of this License or out of the use or inability to use the
+ Work (including but not limited to damages for loss of goodwill,
+ work stoppage, computer failure or malfunction, or any and all
+ other commercial damages or losses), even if such Contributor
+ has been advised of the possibility of such damages.
+
+ 9. Accepting Warranty or Additional Liability. While redistributing
+ the Work or Derivative Works thereof, You may choose to offer,
+ and charge a fee for, acceptance of support, warranty, indemnity,
+ or other liability obligations and/or rights consistent with this
+ License. However, in accepting such obligations, You may act only
+ on Your own behalf and on Your sole responsibility, not on behalf
+ of any other Contributor, and only if You agree to indemnify,
+ defend, and hold each Contributor harmless for any liability
+ incurred by, or claims asserted against, such Contributor by reason
+ of your accepting any such warranty or additional liability.
+
+ END OF TERMS AND CONDITIONS
+
+ APPENDIX: How to apply the Apache License to your work.
+
+ To apply the Apache License to your work, attach the following
+ boilerplate notice, with the fields enclosed by brackets "[]"
+ replaced with your own identifying information. (Don't include
+ the brackets!) The text should be enclosed in the appropriate
+ comment syntax for the file format. We also recommend that a
+ file or class name and description of purpose be included on the
+ same "printed page" as the copyright notice for easier
+ identification within third-party archives.
+
+ Copyright 2025 Alibaba Cloud
+
+ Licensed under the Apache License, Version 2.0 (the "License");
+ you may not use this file except in compliance with the License.
+ You may obtain a copy of the License at
+
+ http://www.apache.org/licenses/LICENSE-2.0
+
+ Unless required by applicable law or agreed to in writing, software
+ distributed under the License is distributed on an "AS IS" BASIS,
+ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ See the License for the specific language governing permissions and
+ limitations under the License.
README.md ADDED
@@ -0,0 +1,13 @@
+ ---
+ license: mit
+ tags:
+ - any-to-any
+ - omega
+ - omegalabs
+ - bittensor
+ - agi
+ ---
+
+ This is an Any-to-Any model checkpoint for the OMEGA Labs x Bittensor Any-to-Any subnet.
+
+ Check out the [git repo](https://github.com/omegalabsinc/omegalabs-anytoany-bittensor) and find OMEGA on X: [@omegalabsai](https://x.com/omegalabsai).
assets/Qwen2.5_Omni.pdf ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:e0c9e0042ad20bc0c95cbbfc96f63f4ff1f28727c5b32973e7fd597557b6b15f
+ size 4014433
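The three lines above are a Git LFS pointer, not the PDF itself: the pointer spec version, the SHA-256 of the real content, and its size in bytes (about 4 MB). After cloning, the actual file is materialized with `git lfs pull`; an equivalent pointer can be reproduced from a local copy of the file:

```bash
# Download and check out the real contents behind all LFS pointers
git lfs pull

# Print the pointer (version/oid/size) that LFS stores for a given file
git lfs pointer --file=assets/Qwen2.5_Omni.pdf
```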
cookbooks/=4.41.0 ADDED
@@ -0,0 +1,17 @@
+ Requirement already satisfied: transformers in /home/ubuntu/.venv/lib/python3.10/site-packages (4.51.0.dev0)
+ Requirement already satisfied: filelock in /home/ubuntu/.venv/lib/python3.10/site-packages (from transformers) (3.18.0)
+ Requirement already satisfied: huggingface-hub<1.0,>=0.26.0 in /home/ubuntu/.venv/lib/python3.10/site-packages (from transformers) (0.29.3)
+ Requirement already satisfied: numpy>=1.17 in /home/ubuntu/.venv/lib/python3.10/site-packages (from transformers) (2.1.3)
+ Requirement already satisfied: packaging>=20.0 in /home/ubuntu/.venv/lib/python3.10/site-packages (from transformers) (24.2)
+ Requirement already satisfied: pyyaml>=5.1 in /home/ubuntu/.venv/lib/python3.10/site-packages (from transformers) (6.0.2)
+ Requirement already satisfied: regex!=2019.12.17 in /home/ubuntu/.venv/lib/python3.10/site-packages (from transformers) (2024.11.6)
+ Requirement already satisfied: requests in /home/ubuntu/.venv/lib/python3.10/site-packages (from transformers) (2.32.3)
+ Requirement already satisfied: tokenizers<0.22,>=0.21 in /home/ubuntu/.venv/lib/python3.10/site-packages (from transformers) (0.21.1)
+ Requirement already satisfied: safetensors>=0.4.3 in /home/ubuntu/.venv/lib/python3.10/site-packages (from transformers) (0.5.3)
+ Requirement already satisfied: tqdm>=4.27 in /home/ubuntu/.venv/lib/python3.10/site-packages (from transformers) (4.67.1)
+ Requirement already satisfied: fsspec>=2023.5.0 in /home/ubuntu/.venv/lib/python3.10/site-packages (from huggingface-hub<1.0,>=0.26.0->transformers) (2025.3.0)
+ Requirement already satisfied: typing-extensions>=3.7.4.3 in /home/ubuntu/.venv/lib/python3.10/site-packages (from huggingface-hub<1.0,>=0.26.0->transformers) (4.13.0)
+ Requirement already satisfied: charset-normalizer<4,>=2 in /home/ubuntu/.venv/lib/python3.10/site-packages (from requests->transformers) (3.4.1)
+ Requirement already satisfied: idna<4,>=2.5 in /home/ubuntu/.venv/lib/python3.10/site-packages (from requests->transformers) (3.10)
+ Requirement already satisfied: urllib3<3,>=1.21.1 in /home/ubuntu/.venv/lib/python3.10/site-packages (from requests->transformers) (2.3.0)
+ Requirement already satisfied: certifi>=2017.4.17 in /home/ubuntu/.venv/lib/python3.10/site-packages (from requests->transformers) (2025.1.31)
cookbooks/=4.50.0.dev0 ADDED
@@ -0,0 +1,17 @@
+ Requirement already satisfied: transformers in /home/ubuntu/.venv/lib/python3.10/site-packages (4.50.0.dev0)
+ Requirement already satisfied: filelock in /home/ubuntu/.venv/lib/python3.10/site-packages (from transformers) (3.18.0)
+ Requirement already satisfied: huggingface-hub<1.0,>=0.26.0 in /home/ubuntu/.venv/lib/python3.10/site-packages (from transformers) (0.29.3)
+ Requirement already satisfied: numpy>=1.17 in /home/ubuntu/.venv/lib/python3.10/site-packages (from transformers) (2.1.3)
+ Requirement already satisfied: packaging>=20.0 in /home/ubuntu/.venv/lib/python3.10/site-packages (from transformers) (24.2)
+ Requirement already satisfied: pyyaml>=5.1 in /home/ubuntu/.venv/lib/python3.10/site-packages (from transformers) (6.0.2)
+ Requirement already satisfied: regex!=2019.12.17 in /home/ubuntu/.venv/lib/python3.10/site-packages (from transformers) (2024.11.6)
+ Requirement already satisfied: requests in /home/ubuntu/.venv/lib/python3.10/site-packages (from transformers) (2.32.3)
+ Requirement already satisfied: tokenizers<0.22,>=0.21 in /home/ubuntu/.venv/lib/python3.10/site-packages (from transformers) (0.21.1)
+ Requirement already satisfied: safetensors>=0.4.1 in /home/ubuntu/.venv/lib/python3.10/site-packages (from transformers) (0.5.3)
+ Requirement already satisfied: tqdm>=4.27 in /home/ubuntu/.venv/lib/python3.10/site-packages (from transformers) (4.67.1)
+ Requirement already satisfied: fsspec>=2023.5.0 in /home/ubuntu/.venv/lib/python3.10/site-packages (from huggingface-hub<1.0,>=0.26.0->transformers) (2025.3.0)
+ Requirement already satisfied: typing-extensions>=3.7.4.3 in /home/ubuntu/.venv/lib/python3.10/site-packages (from huggingface-hub<1.0,>=0.26.0->transformers) (4.13.0)
+ Requirement already satisfied: charset-normalizer<4,>=2 in /home/ubuntu/.venv/lib/python3.10/site-packages (from requests->transformers) (3.4.1)
+ Requirement already satisfied: idna<4,>=2.5 in /home/ubuntu/.venv/lib/python3.10/site-packages (from requests->transformers) (3.10)
+ Requirement already satisfied: urllib3<3,>=1.21.1 in /home/ubuntu/.venv/lib/python3.10/site-packages (from requests->transformers) (2.3.0)
+ Requirement already satisfied: certifi>=2017.4.17 in /home/ubuntu/.venv/lib/python3.10/site-packages (from requests->transformers) (2025.1.31)
cookbooks/=4.51.0.dev0 ADDED
@@ -0,0 +1,17 @@
+ Requirement already satisfied: transformers in /home/ubuntu/.venv/lib/python3.10/site-packages (4.50.0.dev0)
+ Requirement already satisfied: filelock in /home/ubuntu/.venv/lib/python3.10/site-packages (from transformers) (3.18.0)
+ Requirement already satisfied: huggingface-hub<1.0,>=0.26.0 in /home/ubuntu/.venv/lib/python3.10/site-packages (from transformers) (0.29.3)
+ Requirement already satisfied: numpy>=1.17 in /home/ubuntu/.venv/lib/python3.10/site-packages (from transformers) (2.1.3)
+ Requirement already satisfied: packaging>=20.0 in /home/ubuntu/.venv/lib/python3.10/site-packages (from transformers) (24.2)
+ Requirement already satisfied: pyyaml>=5.1 in /home/ubuntu/.venv/lib/python3.10/site-packages (from transformers) (6.0.2)
+ Requirement already satisfied: regex!=2019.12.17 in /home/ubuntu/.venv/lib/python3.10/site-packages (from transformers) (2024.11.6)
+ Requirement already satisfied: requests in /home/ubuntu/.venv/lib/python3.10/site-packages (from transformers) (2.32.3)
+ Requirement already satisfied: tokenizers<0.22,>=0.21 in /home/ubuntu/.venv/lib/python3.10/site-packages (from transformers) (0.21.1)
+ Requirement already satisfied: safetensors>=0.4.1 in /home/ubuntu/.venv/lib/python3.10/site-packages (from transformers) (0.5.3)
+ Requirement already satisfied: tqdm>=4.27 in /home/ubuntu/.venv/lib/python3.10/site-packages (from transformers) (4.67.1)
+ Requirement already satisfied: fsspec>=2023.5.0 in /home/ubuntu/.venv/lib/python3.10/site-packages (from huggingface-hub<1.0,>=0.26.0->transformers) (2025.3.0)
+ Requirement already satisfied: typing-extensions>=3.7.4.3 in /home/ubuntu/.venv/lib/python3.10/site-packages (from huggingface-hub<1.0,>=0.26.0->transformers) (4.13.0)
+ Requirement already satisfied: charset-normalizer<4,>=2 in /home/ubuntu/.venv/lib/python3.10/site-packages (from requests->transformers) (3.4.1)
+ Requirement already satisfied: idna<4,>=2.5 in /home/ubuntu/.venv/lib/python3.10/site-packages (from requests->transformers) (3.10)
+ Requirement already satisfied: urllib3<3,>=1.21.1 in /home/ubuntu/.venv/lib/python3.10/site-packages (from requests->transformers) (2.3.0)
+ Requirement already satisfied: certifi>=2017.4.17 in /home/ubuntu/.venv/lib/python3.10/site-packages (from requests->transformers) (2025.1.31)
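The three `cookbooks/=4.41.0`-style files above contain pip console output, which suggests they are accidental shell artifacts rather than intentional content: in an unquoted command, the shell parses `>=4.41.0` as output redirection to a file named `=4.41.0`. A hedged reconstruction of the likely cause, and the quoting that avoids it:

```bash
# Likely origin (assumption): ">" is taken as redirection, creating "=4.41.0"
pip install transformers>=4.41.0

# Correct form: quote the specifier so ">=" reaches pip instead of the shell
pip install "transformers>=4.41.0"
```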
cookbooks/flash-attention/.github/workflows/publish.yml ADDED
@@ -0,0 +1,218 @@
+ # This workflow will:
+ # - Create a new Github release
+ # - Build wheels for supported architectures
+ # - Deploy the wheels to the Github release
+ # - Release the static code to PyPi
+ # For more information see: https://help.github.com/en/actions/language-and-framework-guides/using-python-with-github-actions#publishing-to-package-registries
+
+ name: Build wheels and deploy
+
+ on:
+   create:
+     tags:
+       - v*
+
+ jobs:
+
+   setup_release:
+     name: Create Release
+     runs-on: ubuntu-latest
+     steps:
+       - name: Get the tag version
+         id: extract_branch
+         run: echo ::set-output name=branch::${GITHUB_REF#refs/tags/}
+         shell: bash
+
+       - name: Create Release
+         id: create_release
+         uses: actions/create-release@v1
+         env:
+           GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }}
+         with:
+           tag_name: ${{ steps.extract_branch.outputs.branch }}
+           release_name: ${{ steps.extract_branch.outputs.branch }}
+
+   build_wheels:
+     name: Build Wheel
+     needs: setup_release
+     runs-on: ${{ matrix.os }}
+
+     strategy:
+       fail-fast: false
+       matrix:
+         # Using ubuntu-20.04 instead of 22.04 for more compatibility (glibc). Ideally we'd use the
+         # manylinux docker image, but I haven't figured out how to install CUDA on manylinux.
+         os: [ubuntu-20.04]
+         python-version: ['3.9', '3.10', '3.11', '3.12', '3.13']
+         torch-version: ['2.2.2', '2.3.1', '2.4.0', '2.5.1', '2.6.0']
+         cuda-version: ['12.4.1']
+         # We need separate wheels that either uses C++11 ABI (-D_GLIBCXX_USE_CXX11_ABI) or not.
+         # Pytorch wheels currently don't use it, but nvcr images have Pytorch compiled with C++11 ABI.
+         # Without this we get import error (undefined symbol: _ZN3c105ErrorC2ENS_14SourceLocationESs)
+         # when building without C++11 ABI and using it on nvcr images.
+         cxx11_abi: ['FALSE', 'TRUE']
+         exclude:
+           # see https://github.com/pytorch/pytorch/blob/main/RELEASE.md#release-compatibility-matrix
+           # Pytorch < 2.5 does not support Python 3.13
+           - torch-version: '2.2.2'
+             python-version: '3.13'
+           - torch-version: '2.3.1'
+             python-version: '3.13'
+           - torch-version: '2.4.0'
+             python-version: '3.13'
+
+     steps:
+       - name: Checkout
+         uses: actions/checkout@v4
+
+       - name: Set up Python
+         uses: actions/setup-python@v5
+         with:
+           python-version: ${{ matrix.python-version }}
+
+       - name: Set CUDA and PyTorch versions
+         run: |
+           echo "MATRIX_CUDA_VERSION=$(echo ${{ matrix.cuda-version }} | awk -F \. {'print $1 $2'})" >> $GITHUB_ENV
+           echo "MATRIX_TORCH_VERSION=$(echo ${{ matrix.torch-version }} | awk -F \. {'print $1 "." $2'})" >> $GITHUB_ENV
+           echo "WHEEL_CUDA_VERSION=$(echo ${{ matrix.cuda-version }} | awk -F \. {'print $1'})" >> $GITHUB_ENV
+           echo "MATRIX_PYTHON_VERSION=$(echo ${{ matrix.python-version }} | awk -F \. {'print $1 $2'})" >> $GITHUB_ENV
+
+       - name: Free up disk space
+         if: ${{ runner.os == 'Linux' }}
+         # https://github.com/easimon/maximize-build-space/blob/master/action.yml
+         # https://github.com/easimon/maximize-build-space/tree/test-report
+         run: |
+           sudo rm -rf /usr/share/dotnet
+           sudo rm -rf /opt/ghc
+           sudo rm -rf /opt/hostedtoolcache/CodeQL
+
+       - name: Set up swap space
+         if: runner.os == 'Linux'
+         uses: pierotofy/set-swap-space@v1.0
+         with:
+           swap-size-gb: 10
+
+       - name: Install CUDA ${{ matrix.cuda-version }}
+         if: ${{ matrix.cuda-version != 'cpu' }}
+         uses: Jimver/cuda-toolkit@v0.2.19
+         id: cuda-toolkit
+         with:
+           cuda: ${{ matrix.cuda-version }}
+           linux-local-args: '["--toolkit"]'
+           # default method is "local", and we're hitting some error with caching for CUDA 11.8 and 12.1
+           # method: ${{ (matrix.cuda-version == '11.8.0' || matrix.cuda-version == '12.1.0') && 'network' || 'local' }}
+           method: 'network'
+           sub-packages: '["nvcc"]'
+
+       - name: Install PyTorch ${{ matrix.torch-version }}+cu${{ matrix.cuda-version }}
+         run: |
+           pip install --upgrade pip
+           # For some reason torch 2.2.0 on python 3.12 errors saying no setuptools
+           pip install setuptools==75.8.0
+           # With python 3.13 and torch 2.5.1, unless we update typing-extensions, we get error
+           # AttributeError: attribute '__default__' of 'typing.ParamSpec' objects is not writable
+           pip install typing-extensions==4.12.2
+           # We want to figure out the CUDA version to download pytorch
+           # e.g. we can have system CUDA version being 11.7 but if torch==1.12 then we need to download the wheel from cu116
+           # see https://github.com/pytorch/pytorch/blob/main/RELEASE.md#release-compatibility-matrix
+           # This code is ugly, maybe there's a better way to do this.
+           export TORCH_CUDA_VERSION=$(python -c "from os import environ as env; \
+             minv = {'2.2': 118, '2.3': 118, '2.4': 118, '2.5': 118, '2.6': 118}[env['MATRIX_TORCH_VERSION']]; \
+             maxv = {'2.2': 121, '2.3': 121, '2.4': 124, '2.5': 124, '2.6': 124}[env['MATRIX_TORCH_VERSION']]; \
+             print(minv if int(env['MATRIX_CUDA_VERSION']) < 120 else maxv)" \
+           )
+           if [[ ${{ matrix.torch-version }} == *"dev"* ]]; then
+             # pip install --no-cache-dir --pre torch==${{ matrix.torch-version }} --index-url https://download.pytorch.org/whl/nightly/cu${TORCH_CUDA_VERSION}
+             # Can't use --no-deps because we need cudnn etc.
+             # Hard-coding this version of pytorch-triton for torch 2.6.0.dev20241001
+             pip install jinja2
+             pip install https://download.pytorch.org/whl/nightly/pytorch_triton-3.1.0%2Bcf34004b8a-cp${MATRIX_PYTHON_VERSION}-cp${MATRIX_PYTHON_VERSION}-linux_x86_64.whl
+             pip install --no-cache-dir --pre https://download.pytorch.org/whl/nightly/cu${TORCH_CUDA_VERSION}/torch-${{ matrix.torch-version }}%2Bcu${TORCH_CUDA_VERSION}-cp${MATRIX_PYTHON_VERSION}-cp${MATRIX_PYTHON_VERSION}-linux_x86_64.whl
+           else
+             pip install --no-cache-dir torch==${{ matrix.torch-version }} --index-url https://download.pytorch.org/whl/cu${TORCH_CUDA_VERSION}
+           fi
+           nvcc --version
+           python --version
+           python -c "import torch; print('PyTorch:', torch.__version__)"
+           python -c "import torch; print('CUDA:', torch.version.cuda)"
+           python -c "from torch.utils import cpp_extension; print (cpp_extension.CUDA_HOME)"
+         shell:
+           bash
+
+       - name: Build wheel
+         run: |
+           # We want setuptools >= 49.6.0 otherwise we can't compile the extension if system CUDA version is 11.7 and pytorch cuda version is 11.6
+           # https://github.com/pytorch/pytorch/blob/664058fa83f1d8eede5d66418abff6e20bd76ca8/torch/utils/cpp_extension.py#L810
+           # However this still fails so I'm using a newer version of setuptools
+           pip install setuptools==75.8.0
+           pip install ninja packaging wheel
+           export PATH=/usr/local/nvidia/bin:/usr/local/nvidia/lib64:$PATH
+           export LD_LIBRARY_PATH=/usr/local/nvidia/lib64:/usr/local/cuda/lib64:$LD_LIBRARY_PATH
+           # Limit MAX_JOBS otherwise the github runner goes OOM
+           # nvcc 11.8 can compile with 2 jobs, but nvcc 12.3 goes OOM
+           MAX_JOBS=$([ "$MATRIX_CUDA_VERSION" == "123" ] && echo 1 || echo 2) FLASH_ATTENTION_FORCE_BUILD="TRUE" FLASH_ATTENTION_FORCE_CXX11_ABI=${{ matrix.cxx11_abi}} python setup.py bdist_wheel --dist-dir=dist
+           tmpname=cu${WHEEL_CUDA_VERSION}torch${MATRIX_TORCH_VERSION}cxx11abi${{ matrix.cxx11_abi }}
+           wheel_name=$(ls dist/*whl | xargs -n 1 basename | sed "s/-/+$tmpname-/2")
+           ls dist/*whl |xargs -I {} mv {} dist/${wheel_name}
+           echo "wheel_name=${wheel_name}" >> $GITHUB_ENV
+
+       - name: Log Built Wheels
+         run: |
+           ls dist
+
+       - name: Get the tag version
+         id: extract_branch
+         run: echo ::set-output name=branch::${GITHUB_REF#refs/tags/}
+
+       - name: Get Release with tag
+         id: get_current_release
+         uses: joutvhu/get-release@v1
+         with:
+           tag_name: ${{ steps.extract_branch.outputs.branch }}
+         env:
+           GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }}
+
+       - name: Upload Release Asset
+         id: upload_release_asset
+         uses: actions/upload-release-asset@v1
+         env:
+           GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }}
+         with:
+           upload_url: ${{ steps.get_current_release.outputs.upload_url }}
+           asset_path: ./dist/${{env.wheel_name}}
+           asset_name: ${{env.wheel_name}}
+           asset_content_type: application/*
+
+   publish_package:
+     name: Publish package
+     needs: [build_wheels]
+
+     runs-on: ubuntu-latest
+
+     steps:
+       - uses: actions/checkout@v4
+
+       - uses: actions/setup-python@v5
+         with:
+           python-version: '3.10'
+
+       - name: Install dependencies
+         run: |
+           pip install ninja packaging wheel twine
+           # Install latest setuptools with support for pypi metadata 2.2 (improved compat w/ uv)
+           pip install setuptools==75.8.0
+           # We don't want to download anything CUDA-related here
+           pip install torch --index-url https://download.pytorch.org/whl/cpu
+
+       - name: Build core package
+         env:
+           FLASH_ATTENTION_SKIP_CUDA_BUILD: "TRUE"
+         run: |
+           python setup.py sdist --dist-dir=dist
+
+       - name: Deploy
+         env:
+           TWINE_USERNAME: "__token__"
+           TWINE_PASSWORD: ${{ secrets.PYPI_API_TOKEN }}
+         run: |
+           python -m twine upload dist/*
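One step worth unpacking: the `Build wheel` step renames each wheel so its filename carries the CUDA version, torch version, and C++11 ABI flag as a local version tag. A standalone sketch of that `sed` substitution (the wheel name and tag value here are illustrative):

```bash
# Replace the 2nd "-" in the filename, turning version 2.7.4.post1
# into 2.7.4.post1+cu12torch2.5cxx11abiFALSE
tmpname=cu12torch2.5cxx11abiFALSE
echo "flash_attn-2.7.4.post1-cp310-cp310-linux_x86_64.whl" | sed "s/-/+$tmpname-/2"
# -> flash_attn-2.7.4.post1+cu12torch2.5cxx11abiFALSE-cp310-cp310-linux_x86_64.whl
```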
cookbooks/flash-attention/.gitignore ADDED
@@ -0,0 +1,31 @@
+ *.ncu-rep
+ .DS_store
+
+ # Byte-compiled / optimized / DLL files
+ __pycache__/
+ *.py[cod]
+
+ # C extensions
+ *.so
+
+ # Distribution / packaging
+ bin/
+ build/
+ develop-eggs/
+ dist/
+ eggs/
+ lib/
+ lib64/
+ parts/
+ sdist/
+ var/
+ *.egg-info/
+ .installed.cfg
+ *.egg
+ .eggs/
+
+ # IDE-related
+ .idea/
+
+ # Dev
+ venv
cookbooks/flash-attention/.gitmodules ADDED
@@ -0,0 +1,6 @@
+ [submodule "csrc/cutlass"]
+ 	path = csrc/cutlass
+ 	url = https://github.com/NVIDIA/cutlass.git
+ [submodule "csrc/composable_kernel"]
+ 	path = csrc/composable_kernel
+ 	url = https://github.com/ROCm/composable_kernel.git
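These submodules pin NVIDIA CUTLASS and ROCm composable_kernel, the template libraries the attention kernels compile against, so a source build needs them checked out. The usual way to fetch them after cloning:

```bash
# Fetch the two submodules declared in .gitmodules
git submodule update --init --recursive csrc/cutlass csrc/composable_kernel
```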
cookbooks/flash-attention/AUTHORS ADDED
@@ -0,0 +1 @@
+ Tri Dao, trid@cs.stanford.edu
cookbooks/flash-attention/LICENSE ADDED
@@ -0,0 +1,29 @@
+ BSD 3-Clause License
+
+ Copyright (c) 2022, the respective contributors, as shown by the AUTHORS file.
+ All rights reserved.
+
+ Redistribution and use in source and binary forms, with or without
+ modification, are permitted provided that the following conditions are met:
+
+ * Redistributions of source code must retain the above copyright notice, this
+ list of conditions and the following disclaimer.
+
+ * Redistributions in binary form must reproduce the above copyright notice,
+ this list of conditions and the following disclaimer in the documentation
+ and/or other materials provided with the distribution.
+
+ * Neither the name of the copyright holder nor the names of its
+ contributors may be used to endorse or promote products derived from
+ this software without specific prior written permission.
+
+ THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+ FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+ SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+ CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+ OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
cookbooks/flash-attention/MANIFEST.in ADDED
@@ -0,0 +1,12 @@
+ recursive-include csrc *.cu
+ recursive-include csrc *.h
+ recursive-include csrc *.cuh
+ recursive-include csrc *.cpp
+ recursive-include csrc *.hpp
+ recursive-include csrc *.py
+
+ recursive-include flash_attn *.cu
+ recursive-include flash_attn *.h
+ recursive-include flash_attn *.cuh
+ recursive-include flash_attn *.cpp
+ recursive-include flash_attn *.hpp
cookbooks/flash-attention/Makefile ADDED
@@ -0,0 +1,9 @@
+
+ clean_dist:
+ 	rm -rf dist/*
+
+ create_dist: clean_dist
+ 	python setup.py sdist
+
+ upload_package: create_dist
+ 	twine upload dist/*
cookbooks/flash-attention/README.md ADDED
@@ -0,0 +1,524 @@
+ # FlashAttention
+ This repository provides the official implementation of FlashAttention and
+ FlashAttention-2 from the following papers.
+
+ **FlashAttention: Fast and Memory-Efficient Exact Attention with IO-Awareness**
+ Tri Dao, Daniel Y. Fu, Stefano Ermon, Atri Rudra, Christopher Ré
+ Paper: https://arxiv.org/abs/2205.14135
+ IEEE Spectrum [article](https://spectrum.ieee.org/mlperf-rankings-2022) about our submission to the MLPerf 2.0 benchmark using FlashAttention.
+ ![FlashAttention](assets/flashattn_banner.jpg)
+
+ **FlashAttention-2: Faster Attention with Better Parallelism and Work Partitioning**
+ Tri Dao
+
+ Paper: https://tridao.me/publications/flash2/flash2.pdf
+
+ ![FlashAttention-2](assets/flashattention_logo.png)
+
+
+ ## Usage
+
+ We've been very happy to see FlashAttention being widely adopted in such a short
+ time after its release. This [page](https://github.com/Dao-AILab/flash-attention/blob/main/usage.md)
+ contains a partial list of places where FlashAttention is being used.
+
+ FlashAttention and FlashAttention-2 are free to use and modify (see LICENSE).
+ Please cite and credit FlashAttention if you use it.
+
+
+ ## FlashAttention-3 beta release
+ FlashAttention-3 is optimized for Hopper GPUs (e.g. H100).
+
+ Blogpost: https://tridao.me/blog/2024/flash3/
+
+ Paper: https://tridao.me/publications/flash3/flash3.pdf
+
+ ![FlashAttention-3 speedup on H100 80GB SXM5 with FP16](assets/flash3_fp16_fwd.png)
+
+ This is a beta release for testing / benchmarking before we integrate it with
+ the rest of the repo.
+
+ Currently released:
+ - FP16 / BF16 forward and backward, FP8 forward
+
+ Requirements: H100 / H800 GPU, CUDA >= 12.3.
+
+ We highly recommend CUDA 12.8 for best performance.
+
+ To install:
+ ```sh
+ cd hopper
+ python setup.py install
+ ```
+ To run the test:
+ ```sh
+ export PYTHONPATH=$PWD
+ pytest -q -s test_flash_attn.py
+ ```
+ Once the package is installed, you can import it as follows:
+ ```python
+ import flash_attn_interface
+ flash_attn_interface.flash_attn_func()
+ ```
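+
+ For illustration, a minimal call might look like the sketch below. The shapes and flags are illustrative assumptions, not part of the beta docs; check `hopper/flash_attn_interface.py` for the exact signature (depending on the version, the function may return the output alone or a tuple that also carries the softmax log-sum-exp).
+ ```python
+ import torch
+ import flash_attn_interface
+
+ # Requires an H100 / H800; shapes are (batch, seqlen, nheads, headdim)
+ q, k, v = [torch.randn(2, 1024, 16, 128, device="cuda", dtype=torch.float16)
+            for _ in range(3)]
+ out = flash_attn_interface.flash_attn_func(q, k, v, causal=True)
+ ```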
+
+ ## Installation and features
+ **Requirements:**
+ - CUDA toolkit or ROCm toolkit
+ - PyTorch 2.2 and above.
+ - `packaging` Python package (`pip install packaging`)
+ - `ninja` Python package (`pip install ninja`) *
+ - Linux. Might work for Windows starting v2.3.2 (we've seen a few positive [reports](https://github.com/Dao-AILab/flash-attention/issues/595)) but Windows compilation still requires more testing. If you have ideas on how to set up prebuilt CUDA wheels for Windows, please reach out via GitHub issue.
+
+ \* Make sure that `ninja` is installed and that it works correctly (e.g. `ninja
+ --version` then `echo $?` should return exit code 0). If not (sometimes `ninja
+ --version` then `echo $?` returns a nonzero exit code), uninstall then reinstall
+ `ninja` (`pip uninstall -y ninja && pip install ninja`). Without `ninja`,
+ compiling can take a very long time (2h) since it does not use multiple CPU
+ cores. With `ninja` compiling takes 3-5 minutes on a 64-core machine using the CUDA toolkit.
+
+ **To install:**
+ ```sh
+ pip install flash-attn --no-build-isolation
+ ```
+ Alternatively you can compile from source:
+ ```sh
+ python setup.py install
+ ```
+
+ If your machine has less than 96GB of RAM and lots of CPU cores, `ninja` might
+ run too many parallel compilation jobs that could exhaust the amount of RAM. To
+ limit the number of parallel compilation jobs, you can set the environment
+ variable `MAX_JOBS`:
+ ```sh
+ MAX_JOBS=4 pip install flash-attn --no-build-isolation
+ ```
+
+ **Interface:** `src/flash_attention_interface.py`
+
+ ### NVIDIA CUDA Support
+ **Requirements:**
+ - CUDA 12.0 and above.
+
+ We recommend the
+ [Pytorch](https://catalog.ngc.nvidia.com/orgs/nvidia/containers/pytorch)
+ container from Nvidia, which has all the required tools to install FlashAttention.
+
+ FlashAttention-2 with CUDA currently supports:
+ 1. Ampere, Ada, or Hopper GPUs (e.g., A100, RTX 3090, RTX 4090, H100). Support for Turing
+ GPUs (T4, RTX 2080) is coming soon; please use FlashAttention 1.x for Turing
+ GPUs for now.
+ 2. Datatype fp16 and bf16 (bf16 requires Ampere, Ada, or Hopper GPUs).
+ 3. All head dimensions up to 256. ~~Head dim > 192 backward requires A100/A800 or H100/H800~~. Head dim 256 backward now works on consumer GPUs (if there's no dropout) as of flash-attn 2.5.5.
+
+ ### AMD ROCm Support
+ The ROCm version has two backends: [composable_kernel](https://github.com/ROCm/composable_kernel) (CK), which is the default, and a [Triton](https://github.com/triton-lang/triton) backend. Both provide an implementation of FlashAttention-2.
+
+ **Requirements:**
+ - ROCm 6.0 and above.
+
+ We recommend the
+ [Pytorch](https://hub.docker.com/r/rocm/pytorch)
+ container from ROCm, which has all the required tools to install FlashAttention.
+
+ #### Composable Kernel Backend
+ The FlashAttention-2 ROCm CK backend currently supports:
+ 1. MI200 or MI300 GPUs.
+ 2. Datatype fp16 and bf16
+ 3. Head dimensions up to 256 in both the forward and backward passes.
+
+ #### Triton Backend
+ The Triton implementation of [FlashAttention-2](https://tridao.me/publications/flash2/flash2.pdf) is currently a work in progress.
+
+ It supports AMD's CDNA (MI200, MI300) and RDNA GPUs using fp16, bf16 and fp32 datatypes.
+
+ These features are supported in both the forward and backward passes:
+ 1) Causal masking
+ 2) Variable sequence lengths
+ 3) Arbitrary Q and KV sequence lengths
+ 4) Arbitrary head sizes
+
+ These features are supported in the forward pass for now; we will add them to the backward pass soon:
+ 1) Multi and grouped query attention
+ 2) ALiBi and matrix bias
+
+ These features are in development:
+ 1) Paged Attention
+ 2) Sliding Window
+ 3) Rotary embeddings
+ 4) Dropout
+ 5) Performance Improvements
+
+ #### Getting Started
+ To get started with the Triton backend for AMD, follow the steps below.
+
+ First install the recommended Triton [commit](https://github.com/triton-lang/triton/commit/3ca2f498e98ed7249b82722587c511a5610e00c4).
+
+ ```
+ git clone https://github.com/triton-lang/triton
+ cd triton
+ git checkout 3ca2f498e98ed7249b82722587c511a5610e00c4
+ pip install --verbose -e python
+ ```
+ Then install and test FlashAttention with the flag `FLASH_ATTENTION_TRITON_AMD_ENABLE` set to `"TRUE"`.
+
+ ```
+ export FLASH_ATTENTION_TRITON_AMD_ENABLE="TRUE"
+ cd flash-attention
+ python setup.py install
+ pytest tests/test_flash_attn.py
+ ```
+
+
+ ## How to use FlashAttention
+
+ The main functions implement scaled dot product attention (softmax(Q @ K^T *
+ softmax_scale) @ V):
+ ```python
+ from flash_attn import flash_attn_qkvpacked_func, flash_attn_func
+ ```
+
+ ```python
+ flash_attn_qkvpacked_func(qkv, dropout_p=0.0, softmax_scale=None, causal=False,
+                           window_size=(-1, -1), alibi_slopes=None, deterministic=False):
+ """dropout_p should be set to 0.0 during evaluation
+ If Q, K, V are already stacked into 1 tensor, this function will be faster than
+ calling flash_attn_func on Q, K, V since the backward pass avoids explicit concatenation
+ of the gradients of Q, K, V.
+ If window_size != (-1, -1), implements sliding window local attention. Query at position i
+ will only attend to keys between [i - window_size[0], i + window_size[1]] inclusive.
+ Arguments:
+     qkv: (batch_size, seqlen, 3, nheads, headdim)
+     dropout_p: float. Dropout probability.
+     softmax_scale: float. The scaling of QK^T before applying softmax.
+         Default to 1 / sqrt(headdim).
+     causal: bool. Whether to apply causal attention mask (e.g., for auto-regressive modeling).
+     window_size: (left, right). If not (-1, -1), implements sliding window local attention.
+     alibi_slopes: (nheads,) or (batch_size, nheads), fp32. A bias of (-alibi_slope * |i - j|) is added to
+         the attention score of query i and key j.
+     deterministic: bool. Whether to use the deterministic implementation of the backward pass,
+         which is slightly slower and uses more memory. The forward pass is always deterministic.
+ Return:
+     out: (batch_size, seqlen, nheads, headdim).
+ """
+ ```
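+
+ As a quick illustration of the packed interface, here is a minimal sketch (the shapes follow the docstring above; the batch and sequence sizes are arbitrary choices):
+ ```python
+ import torch
+ from flash_attn import flash_attn_qkvpacked_func
+
+ batch, seqlen, nheads, headdim = 4, 2048, 16, 64
+ qkv = torch.randn(batch, seqlen, 3, nheads, headdim,
+                   device="cuda", dtype=torch.bfloat16, requires_grad=True)
+ out = flash_attn_qkvpacked_func(qkv, dropout_p=0.0, causal=True)
+ out.sum().backward()  # gradients accumulate directly in the packed qkv tensor
+ ```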
+
+ ```python
+ flash_attn_func(q, k, v, dropout_p=0.0, softmax_scale=None, causal=False,
+                 window_size=(-1, -1), alibi_slopes=None, deterministic=False):
+ """dropout_p should be set to 0.0 during evaluation
+ Supports multi-query and grouped-query attention (MQA/GQA) by passing in KV with fewer heads
+ than Q. Note that the number of heads in Q must be divisible by the number of heads in KV.
+ For example, if Q has 6 heads and K, V have 2 heads, heads 0, 1, 2 of Q will attend to head
+ 0 of K, V, and heads 3, 4, 5 of Q will attend to head 1 of K, V.
+ If window_size != (-1, -1), implements sliding window local attention. Query at position i
+ will only attend to keys between
+ [i + seqlen_k - seqlen_q - window_size[0], i + seqlen_k - seqlen_q + window_size[1]] inclusive.
+
+ Arguments:
+     q: (batch_size, seqlen, nheads, headdim)
+     k: (batch_size, seqlen, nheads_k, headdim)
+     v: (batch_size, seqlen, nheads_k, headdim)
+     dropout_p: float. Dropout probability.
+     softmax_scale: float. The scaling of QK^T before applying softmax.
+         Default to 1 / sqrt(headdim).
+     causal: bool. Whether to apply causal attention mask (e.g., for auto-regressive modeling).
+     window_size: (left, right). If not (-1, -1), implements sliding window local attention.
+     alibi_slopes: (nheads,) or (batch_size, nheads), fp32. A bias of
+         (-alibi_slope * |i + seqlen_k - seqlen_q - j|)
+         is added to the attention score of query i and key j.
+     deterministic: bool. Whether to use the deterministic implementation of the backward pass,
+         which is slightly slower and uses more memory. The forward pass is always deterministic.
+ Return:
+     out: (batch_size, seqlen, nheads, headdim).
+ """
+ ```
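+
+ A minimal GQA sketch matching the head-ratio example in the docstring (sizes are illustrative):
+ ```python
+ import torch
+ from flash_attn import flash_attn_func
+
+ batch, seqlen, headdim = 2, 4096, 128
+ q = torch.randn(batch, seqlen, 6, headdim, device="cuda", dtype=torch.float16)
+ # GQA: K and V have 2 heads, so each KV head serves 3 of the 6 query heads
+ k = torch.randn(batch, seqlen, 2, headdim, device="cuda", dtype=torch.float16)
+ v = torch.randn(batch, seqlen, 2, headdim, device="cuda", dtype=torch.float16)
+ out = flash_attn_func(q, k, v, causal=True)  # (batch, seqlen, 6, headdim)
+ ```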
+
+ ```python
+ def flash_attn_with_kvcache(
+     q,
+     k_cache,
+     v_cache,
+     k=None,
+     v=None,
+     rotary_cos=None,
+     rotary_sin=None,
+     cache_seqlens: Optional[Union[(int, torch.Tensor)]] = None,
+     cache_batch_idx: Optional[torch.Tensor] = None,
+     block_table: Optional[torch.Tensor] = None,
+     softmax_scale=None,
+     causal=False,
+     window_size=(-1, -1),  # -1 means infinite context window
+     rotary_interleaved=True,
+     alibi_slopes=None,
+ ):
+ """
+ If k and v are not None, k_cache and v_cache will be updated *inplace* with the new values from
+ k and v. This is useful for incremental decoding: you can pass in the cached keys/values from
+ the previous step, update them with the new keys/values from the current step, and do
+ attention with the updated cache, all in 1 kernel.
+
+ If you pass in k / v, you must make sure that the cache is large enough to hold the new values.
+ For example, the KV cache could be pre-allocated with the max sequence length, and you can use
+ cache_seqlens to keep track of the current sequence lengths of each sequence in the batch.
+
+ Also apply rotary embedding if rotary_cos and rotary_sin are passed in. The key @k will be
+ rotated by rotary_cos and rotary_sin at indices cache_seqlens, cache_seqlens + 1, etc.
+ If causal or local (i.e., window_size != (-1, -1)), the query @q will be rotated by rotary_cos
+ and rotary_sin at indices cache_seqlens, cache_seqlens + 1, etc.
+ If not causal and not local, the query @q will be rotated by rotary_cos and rotary_sin at
+ indices cache_seqlens only (i.e. we consider all tokens in @q to be at position cache_seqlens).
+
+ See tests/test_flash_attn.py::test_flash_attn_kvcache for examples of how to use this function.
+
+ Supports multi-query and grouped-query attention (MQA/GQA) by passing in KV with fewer heads
+ than Q. Note that the number of heads in Q must be divisible by the number of heads in KV.
+ For example, if Q has 6 heads and K, V have 2 heads, heads 0, 1, 2 of Q will attend to head
+ 0 of K, V, and heads 3, 4, 5 of Q will attend to head 1 of K, V.
+
+ If causal=True, the causal mask is aligned to the bottom right corner of the attention matrix.
+ For example, if seqlen_q = 2 and seqlen_k = 5, the causal mask (1 = keep, 0 = masked out) is:
+     1 1 1 1 0
+     1 1 1 1 1
+ If seqlen_q = 5 and seqlen_k = 2, the causal mask is:
+     0 0
+     0 0
+     0 0
+     1 0
+     1 1
+ If the row of the mask is all zero, the output will be zero.
+
+ If window_size != (-1, -1), implements sliding window local attention. Query at position i
+ will only attend to keys between
+ [i + seqlen_k - seqlen_q - window_size[0], i + seqlen_k - seqlen_q + window_size[1]] inclusive.
+
+ Note: Does not support backward pass.
+
+ Arguments:
+     q: (batch_size, seqlen, nheads, headdim)
+     k_cache: (batch_size_cache, seqlen_cache, nheads_k, headdim) if there's no block_table,
+         or (num_blocks, page_block_size, nheads_k, headdim) if there's a block_table (i.e. paged KV cache)
+         page_block_size must be a multiple of 256.
+     v_cache: (batch_size_cache, seqlen_cache, nheads_k, headdim) if there's no block_table,
+         or (num_blocks, page_block_size, nheads_k, headdim) if there's a block_table (i.e. paged KV cache)
+     k [optional]: (batch_size, seqlen_new, nheads_k, headdim). If not None, we concatenate
+         k with k_cache, starting at the indices specified by cache_seqlens.
+     v [optional]: (batch_size, seqlen_new, nheads_k, headdim). Similar to k.
+     rotary_cos [optional]: (seqlen_ro, rotary_dim / 2). If not None, we apply rotary embedding
+         to k and q. Only applicable if k and v are passed in. rotary_dim must be divisible by 16.
+     rotary_sin [optional]: (seqlen_ro, rotary_dim / 2). Similar to rotary_cos.
+     cache_seqlens: int, or (batch_size,), dtype torch.int32. The sequence lengths of the
+         KV cache.
+     block_table [optional]: (batch_size, max_num_blocks_per_seq), dtype torch.int32.
+     cache_batch_idx: (batch_size,), dtype torch.int32. The indices used to index into the KV cache.
+         If None, we assume that the batch indices are [0, 1, 2, ..., batch_size - 1].
+         If the indices are not distinct, and k and v are provided, the values updated in the cache
+         might come from any of the duplicate indices.
+     softmax_scale: float. The scaling of QK^T before applying softmax.
+         Default to 1 / sqrt(headdim).
+     causal: bool. Whether to apply causal attention mask (e.g., for auto-regressive modeling).
+     window_size: (left, right). If not (-1, -1), implements sliding window local attention.
+     rotary_interleaved: bool. Only applicable if rotary_cos and rotary_sin are passed in.
+         If True, rotary embedding will combine dimensions 0 & 1, 2 & 3, etc. If False,
+         rotary embedding will combine dimensions 0 & rotary_dim / 2, 1 & rotary_dim / 2 + 1
+         (i.e. GPT-NeoX style).
+     alibi_slopes: (nheads,) or (batch_size, nheads), fp32. A bias of
+         (-alibi_slope * |i + seqlen_k - seqlen_q - j|)
+         is added to the attention score of query i and key j.
+
+ Return:
+     out: (batch_size, seqlen, nheads, headdim).
+ """
+ ```
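+
+ A condensed decoding sketch based on the docstring above (cache sizes and the starting length of 100 are illustrative assumptions):
+ ```python
+ import torch
+ from flash_attn import flash_attn_with_kvcache
+
+ batch, nheads, headdim, max_seqlen = 2, 16, 128, 4096
+ k_cache = torch.zeros(batch, max_seqlen, nheads, headdim, device="cuda", dtype=torch.float16)
+ v_cache = torch.zeros_like(k_cache)
+ cache_seqlens = torch.full((batch,), 100, dtype=torch.int32, device="cuda")  # tokens already cached
+
+ # One decoding step: a single new token per sequence
+ q = torch.randn(batch, 1, nheads, headdim, device="cuda", dtype=torch.float16)
+ k_new = torch.randn(batch, 1, nheads, headdim, device="cuda", dtype=torch.float16)
+ v_new = torch.randn_like(k_new)
+ out = flash_attn_with_kvcache(q, k_cache, v_cache, k=k_new, v=v_new,
+                               cache_seqlens=cache_seqlens, causal=True)
+ cache_seqlens += 1  # k_cache / v_cache were updated in place at position 100
+ ```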
+
+ To see how these functions are used in a multi-head attention layer (which
+ includes QKV projection and output projection), see the MHA [implementation](https://github.com/Dao-AILab/flash-attention/blob/main/flash_attn/modules/mha.py).
+
+ ## Changelog
+
+ ### 2.0: Complete rewrite, 2x faster
+ Upgrading from FlashAttention (1.x) to FlashAttention-2
+
+ These functions have been renamed:
+ - `flash_attn_unpadded_func` -> `flash_attn_varlen_func`
+ - `flash_attn_unpadded_qkvpacked_func` -> `flash_attn_varlen_qkvpacked_func`
+ - `flash_attn_unpadded_kvpacked_func` -> `flash_attn_varlen_kvpacked_func`
+
+ If the inputs have the same sequence lengths in the same batch, it is simpler
+ and faster to use these functions:
+ ```python
+ flash_attn_qkvpacked_func(qkv, dropout_p=0.0, softmax_scale=None, causal=False)
+ ```
+ ```python
+ flash_attn_func(q, k, v, dropout_p=0.0, softmax_scale=None, causal=False)
+ ```
+ ### 2.1: Change behavior of causal flag
+
+ If seqlen_q != seqlen_k and causal=True, the causal mask is aligned to the
+ bottom right corner of the attention matrix, instead of the top-left corner.
+
+ For example, if seqlen_q = 2 and seqlen_k = 5, the causal mask (1 = keep, 0 =
+ masked out) is:
+ v2.0:
+     1 0 0 0 0
+     1 1 0 0 0
+ v2.1:
+     1 1 1 1 0
+     1 1 1 1 1
+
+ If seqlen_q = 5 and seqlen_k = 2, the causal mask is:
+ v2.0:
+     1 0
+     1 1
+     1 1
+     1 1
+     1 1
+ v2.1:
+     0 0
+     0 0
+     0 0
+     1 0
+     1 1
+ If the row of the mask is all zero, the output will be zero.
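+
+ For reference, the bottom-right-aligned mask can be reproduced in a few lines of plain PyTorch (a sketch for checking semantics; FlashAttention itself never materializes this mask):
+ ```python
+ import torch
+
+ def causal_mask_bottom_right(seqlen_q, seqlen_k):
+     # 1 = keep, 0 = masked out; query i attends to key j iff j - i <= seqlen_k - seqlen_q
+     row = torch.arange(seqlen_q).unsqueeze(-1)
+     col = torch.arange(seqlen_k)
+     return (col - row <= seqlen_k - seqlen_q).int()
+
+ print(causal_mask_bottom_right(2, 5))  # reproduces the v2.1 example above
+ ```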
+
+ ### 2.2: Optimize for inference
+
+ Optimize for inference (iterative decoding) when the query has a very small sequence
+ length (e.g., query sequence length = 1). The bottleneck here is to load the KV
+ cache as fast as possible, so we split the loading across different thread
+ blocks, with a separate kernel to combine results.
+
+ See the function `flash_attn_with_kvcache` with more features for inference
+ (rotary embedding, updating the KV cache inplace).
+
+ Thanks to the xformers team, and in particular Daniel Haziza, for this
+ collaboration.
+
+ ### 2.3: Local (i.e., sliding window) attention
+
+ Implement sliding window attention (i.e., local attention). Thanks to [Mistral
+ AI](https://mistral.ai/) and in particular Timothée Lacroix for this
+ contribution. Sliding window attention was used in the [Mistral 7B](https://mistral.ai/news/announcing-mistral-7b/) model.
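+
+ Sliding window attention is enabled through the `window_size` argument shown in the docstrings above; a minimal sketch (the window size of 1024 is an illustrative choice, not the Mistral setting):
+ ```python
+ import torch
+ from flash_attn import flash_attn_func
+
+ q, k, v = [torch.randn(1, 8192, 8, 64, device="cuda", dtype=torch.float16)
+            for _ in range(3)]
+ # Causal sliding-window attention: each query sees at most the 1024 previous keys
+ out = flash_attn_func(q, k, v, causal=True, window_size=(1024, 0))
+ ```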
+
+ ### 2.4: ALiBi (attention with linear bias), deterministic backward pass
+
+ Implement ALiBi (Press et al., 2021). Thanks to Sanghun Cho from Kakao Brain for this contribution.
+
+ Implement deterministic backward pass. Thanks to engineers from [Meituan](https://www.meituan.com) for this contribution.
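+
+ ALiBi is enabled through the `alibi_slopes` argument. A sketch using the geometric slopes from Press et al., 2021 (the helper below is illustrative and assumes the number of heads is a power of 2):
+ ```python
+ import torch
+ from flash_attn import flash_attn_func
+
+ def alibi_slopes(nheads):
+     # Geometric slopes 2^(-8/n), 2^(-16/n), ..., as in the ALiBi paper
+     return torch.tensor([2 ** (-8 * (i + 1) / nheads) for i in range(nheads)],
+                         device="cuda", dtype=torch.float32)
+
+ q, k, v = [torch.randn(2, 2048, 8, 64, device="cuda", dtype=torch.float16)
+            for _ in range(3)]
+ out = flash_attn_func(q, k, v, causal=True, alibi_slopes=alibi_slopes(8))
+ ```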
+
+ ### 2.5: Paged KV cache
+
+ Support paged KV cache (i.e., [PagedAttention](https://arxiv.org/abs/2309.06180)).
+ Thanks to @beginlner for this contribution.
+
+ ### 2.6: Softcapping
+
+ Support attention with softcapping, as used in the Gemma-2 and Grok models.
+ Thanks to @Narsil and @lucidrains for this contribution.
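+
+ Assuming a flash-attn version >= 2.6, where `flash_attn_func` accepts a `softcap` argument (verify against your installed version), usage looks like this sketch:
+ ```python
+ import torch
+ from flash_attn import flash_attn_func
+
+ q, k, v = [torch.randn(2, 1024, 8, 128, device="cuda", dtype=torch.bfloat16)
+            for _ in range(3)]
+ # Softcapping squashes attention scores via softcap * tanh(scores / softcap)
+ # before the softmax; 50.0 mirrors the value used by Gemma-2.
+ out = flash_attn_func(q, k, v, causal=True, softcap=50.0)
+ ```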
+
+ ### 2.7: Compatibility with torch compile
+
+ Thanks to @ani300 for this contribution.
+
+ ## Performance
+
+ We present the expected speedup (combined forward + backward pass) and memory savings from using FlashAttention against PyTorch standard attention, depending on sequence length, on different GPUs (speedup depends on memory bandwidth; we see more speedup on slower GPU memory).
+
+ We currently have benchmarks for these GPUs:
+ * [A100](#a100)
+ * [H100](#h100)
+ <!-- * [RTX 3090](#rtx-3090) -->
+ <!-- * [T4](#t4) -->
+
+ ### A100
+
+ We display FlashAttention speedup using these parameters:
+ * Head dimension 64 or 128, hidden dimension 2048 (i.e. either 32 or 16 heads).
+ * Sequence length 512, 1k, 2k, 4k, 8k, 16k.
+ * Batch size set to 16k / seqlen.
+
+ #### Speedup
+
+ ![FlashAttention speedup on A100 80GB SXM5 with FP16/BF16](assets/flash2_a100_fwd_bwd_benchmark.png)
+
+ #### Memory
+
+ ![FlashAttention memory](assets/flashattn_memory.jpg)
+
+ We show memory savings in this graph (note that the memory footprint is the same whether you use dropout or masking).
+ Memory savings are proportional to sequence length, since standard attention has memory quadratic in sequence length, whereas FlashAttention has memory linear in sequence length.
+ We see 10X memory savings at sequence length 2K, and 20X at 4K.
+ As a result, FlashAttention can scale to much longer sequence lengths.
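+
+ To make the quadratic-vs-linear claim concrete, a back-of-the-envelope sketch (the batch and head counts are illustrative):
+ ```python
+ # Memory for the materialized (seqlen x seqlen) attention matrix in standard attention (fp16)
+ batch, nheads = 16, 16
+ for seqlen in [512, 2048, 4096]:
+     scores_bytes = batch * nheads * seqlen * seqlen * 2
+     print(f"seqlen={seqlen:5d}: {scores_bytes / 2**30:.2f} GiB for attention scores alone")
+ # FlashAttention never materializes this matrix; its extra memory grows only
+ # linearly in seqlen (the O(seqlen) softmax statistics kept per head).
+ ```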
+
+ ### H100
+
+ ![FlashAttention speedup on H100 SXM5 with FP16/BF16](assets/flash2_h100_fwd_bwd_benchmark.png)
+
+ ## Full model code and training script
+
+ We have released the full GPT model
+ [implementation](https://github.com/Dao-AILab/flash-attention/blob/main/flash_attn/models/gpt.py).
+ We also provide optimized implementations of other layers (e.g., MLP, LayerNorm,
+ cross-entropy loss, rotary embedding). Overall this speeds up training by 3-5x
+ compared to the baseline implementation from Huggingface, reaching up to 225
+ TFLOPs/sec per A100, equivalent to 72% model FLOPs utilization (we don't need
+ any activation checkpointing).
+
+ We also include a training
+ [script](https://github.com/Dao-AILab/flash-attention/tree/main/training) to
+ train GPT2 on Openwebtext and GPT3 on The Pile.
+
+ ## Triton implementation of FlashAttention
+
+ Phil Tillet (OpenAI) has an experimental implementation of FlashAttention in Triton:
+ https://github.com/openai/triton/blob/master/python/tutorials/06-fused-attention.py
+
+ As Triton is a higher-level language than CUDA, it might be easier to understand
+ and experiment with. The notations in the Triton implementation are also closer
+ to what's used in our paper.
+
+ We also have an experimental implementation in Triton that supports attention
+ bias (e.g. ALiBi):
+ https://github.com/Dao-AILab/flash-attention/blob/main/flash_attn/flash_attn_triton.py
+
+
+ ## Tests
+ We test that FlashAttention produces the same output and gradient as a reference
+ implementation, up to some numerical tolerance. In particular, we check that the
+ maximum numerical error of FlashAttention is at most twice the numerical error
+ of a baseline implementation in Pytorch (for different head dimensions, input
+ dtype, sequence length, causal / non-causal).
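+
+ A condensed sketch of that check (the actual tests in `tests/test_flash_attn.py` are more thorough; here PyTorch's SDPA stands in as the reference for brevity):
+ ```python
+ import torch
+ import torch.nn.functional as F
+ from flash_attn import flash_attn_func
+
+ q, k, v = [torch.randn(2, 512, 8, 64, device="cuda", dtype=torch.float16)
+            for _ in range(3)]
+ out = flash_attn_func(q, k, v, causal=True)
+
+ def ref_attn(q, k, v, upcast=True):
+     # Naive attention, optionally in fp32, as the ground truth / error baseline
+     dtype = torch.float32 if upcast else q.dtype
+     q_, k_, v_ = [t.to(dtype).transpose(1, 2) for t in (q, k, v)]
+     o = F.scaled_dot_product_attention(q_, k_, v_, is_causal=True)
+     return o.transpose(1, 2).to(q.dtype)
+
+ out_ref = ref_attn(q, k, v, upcast=True)   # fp32 reference
+ out_pt = ref_attn(q, k, v, upcast=False)   # same naive attention in fp16
+ assert (out - out_ref).abs().max() <= 2 * (out_pt - out_ref).abs().max()
+ ```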
+
+ To run the tests:
+ ```sh
+ pytest -q -s tests/test_flash_attn.py
+ ```
+ For the ROCm composable_kernel backend, run:
+ ```sh
+ pytest tests/test_flash_attn_ck.py
+ ```
+
+ ## When you encounter issues
+
+ This new release of FlashAttention-2 has been tested on several GPT-style
+ models, mostly on A100 GPUs.
+
+ If you encounter bugs, please open a GitHub Issue!
+
+ ## Citation
+ If you use this codebase, or otherwise found our work valuable, please cite:
+ ```
+ @inproceedings{dao2022flashattention,
+   title={Flash{A}ttention: Fast and Memory-Efficient Exact Attention with {IO}-Awareness},
+   author={Dao, Tri and Fu, Daniel Y. and Ermon, Stefano and Rudra, Atri and R{\'e}, Christopher},
+   booktitle={Advances in Neural Information Processing Systems (NeurIPS)},
+   year={2022}
+ }
+ @inproceedings{dao2023flashattention2,
+   title={Flash{A}ttention-2: Faster Attention with Better Parallelism and Work Partitioning},
+   author={Dao, Tri},
+   booktitle={International Conference on Learning Representations (ICLR)},
+   year={2024}
+ }
+ ```
cookbooks/flash-attention/assets/flash2_a100_fwd_bwd_benchmark.png ADDED
cookbooks/flash-attention/assets/flash2_h100_fwd_bwd_benchmark.png ADDED
cookbooks/flash-attention/assets/flash3_fp16_fwd.png ADDED
cookbooks/flash-attention/assets/flashattention_logo.png ADDED
cookbooks/flash-attention/assets/flashattn_banner.jpg ADDED
cookbooks/flash-attention/assets/flashattn_banner.pdf ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:8f4df0222057bbffcd2894fbae18bbfa6304e5d0583d47e44e9ac7a97bfb75ce
+ size 474702
cookbooks/flash-attention/assets/flashattn_memory.jpg ADDED
cookbooks/flash-attention/assets/flashattn_speedup.jpg ADDED
cookbooks/flash-attention/assets/flashattn_speedup_3090.jpg ADDED
cookbooks/flash-attention/assets/flashattn_speedup_a100_d128.jpg ADDED
cookbooks/flash-attention/assets/flashattn_speedup_t4.jpg ADDED
cookbooks/flash-attention/assets/flashattn_speedup_t4_fwd.jpg ADDED
cookbooks/flash-attention/assets/gpt2_training_curve.jpg ADDED
cookbooks/flash-attention/assets/gpt2_training_efficiency.jpg ADDED
cookbooks/flash-attention/assets/gpt3_training_curve.jpg ADDED
cookbooks/flash-attention/assets/gpt3_training_efficiency.jpg ADDED
cookbooks/flash-attention/benchmarks/benchmark_alibi.py ADDED
@@ -0,0 +1,275 @@
+ # Copyright (c) 2024, Sanghun Cho, Tri Dao.
+
+ import math
+ import torch
+ import torch.nn.functional as F
+
+ from einops import rearrange, repeat
+ from flash_attn.layers.rotary import apply_rotary_emb
+
+ from flash_attn.utils.benchmark import benchmark_fwd_bwd
+
+ from flash_attn import flash_attn_func
+
+ try:
+     import xformers.ops as xops
+ except ImportError:
+     xops = None
+
+
+ def generate_cos_sin(seqlen, rotary_dim, device, dtype):
+     assert rotary_dim % 2 == 0
+     angle = torch.rand(seqlen * 2, rotary_dim // 2, device=device) * 2 * math.pi
+     cos = torch.cos(angle).to(dtype=dtype)
+     sin = torch.sin(angle).to(dtype=dtype)
+     return cos, sin
+
+
+ def flash_rotary(q, k, v, cos, sin, causal=False):
+     # corrected by @tridao comments
+     q = apply_rotary_emb(
+         q, cos, sin, seqlen_offsets=0, interleaved=False, inplace=True
+     )
+     k = apply_rotary_emb(
+         k, cos, sin, seqlen_offsets=0, interleaved=False, inplace=True
+     )
+
+     return flash_attn_func(q, k, v, causal=causal)
+
+
+ def attn_bias_from_alibi_slopes(
+     slopes, seqlen_q, seqlen_k, query_padding_mask=None, key_padding_mask=None, causal=False
+ ):
+     batch, nheads = slopes.shape
+     device = slopes.device
+     slopes = rearrange(slopes, "b h -> b h 1 1")
+     if causal:
+         # Causal ALiBi bias only depends on the (negative) distance to the last key
+         return torch.arange(-seqlen_k + 1, 1, device=device, dtype=torch.float32) * slopes
+     else:
+         row_idx = rearrange(torch.arange(seqlen_q, device=device, dtype=torch.long), "s -> s 1")
+         col_idx = torch.arange(seqlen_k, device=device, dtype=torch.long)
+         sk = (
+             seqlen_k
+             if key_padding_mask is None
+             else rearrange(key_padding_mask.sum(-1), "b -> b 1 1 1")
+         )
+         sq = (
+             seqlen_q
+             if query_padding_mask is None
+             else rearrange(query_padding_mask.sum(-1), "b -> b 1 1 1")
+         )
+         relative_pos = torch.abs(row_idx + sk - sq - col_idx)
+         return -slopes * relative_pos.to(dtype=slopes.dtype)
+
+
+ def flops(batch, seqlen, headdim, nheads, causal, mode="fwd"):
+     assert mode in ["fwd", "bwd", "fwd_bwd"]
+     # fwd: QK^T and PV each cost 2 * seqlen^2 * headdim FLOPs per head; causal halves the work
+     f = 4 * batch * seqlen**2 * nheads * headdim // (2 if causal else 1)
+     return f if mode == "fwd" else (2.5 * f if mode == "bwd" else 3.5 * f)
+
+
+ def efficiency(flop, time):
+     return (flop / time / 10**12) if not math.isnan(time) else 0.0
+
+
+ def attention_pytorch(q, k, v, dropout_p=0.0, causal=True, attn_bias=None):
+     """
+     Arguments:
+         q, k, v: (batch_size, seqlen, nheads, head_dim)
+         dropout_p: float
+         attn_bias: (batch_size, nheads, seqlen, seqlen) or (1, nheads, seqlen, seqlen)
+     Output:
+         output: (batch_size, seqlen, nheads, head_dim)
+     """
+     batch_size, seqlen, nheads, d = q.shape
+     q = rearrange(q, 'b t h d -> (b h) t d')
+     k = rearrange(k, 'b s h d -> (b h) d s')
+     softmax_scale = 1.0 / math.sqrt(d)
+     # Preallocate attn_weights for `baddbmm`
+     if attn_bias is not None:
+         scores = rearrange(attn_bias, 'b h t s -> (b h) t s')
+     else:
+         scores = torch.empty(batch_size * nheads, seqlen, seqlen, dtype=q.dtype, device=q.device)
+     scores = rearrange(torch.baddbmm(scores, q, k, beta=1.0, alpha=softmax_scale),
+                        '(b h) t s -> b h t s', h=nheads)
+     if causal:
+         # "triu_tril_cuda_template" not implemented for 'BFloat16'
+         # So we have to construct the mask in float
+         causal_mask = torch.triu(torch.full((seqlen, seqlen), -10000.0, device=scores.device), 1)
+         # TD [2022-09-30]: Adding is faster than masked_fill_ (idk why, just better kernel I guess)
+         scores = scores + causal_mask.to(dtype=scores.dtype)
+     attention = torch.softmax(scores, dim=-1)
+     attention_drop = F.dropout(attention, dropout_p)
+     output = torch.einsum('bhts,bshd->bthd', attention_drop, v)
+     return output.to(dtype=q.dtype)
+
+
+ def time_fwd_bwd(func, *args, **kwargs):
+     time_f, time_b = benchmark_fwd_bwd(func, *args, **kwargs)
+     return time_f[1].mean, time_b[1].mean
+
+
+ repeats = 30
+ device = 'cuda'
+ dtype = torch.float16
+
+ bs_seqlen_vals = [(32, 512), (16, 1024), (8, 2048), (4, 4096), (2, 8192), (1, 16384)]
+ causal_vals = [False, True]
+ headdim_vals = [64, 128]
+ dim = 2048
+ dropout_p = 0.0
+
+ methods = (["fa2_alibi", "torch"]
+            + (["xformers"] if xops is not None else [])
+            + ["sdpa"]
+            + ["fa2_baseline"]
+            + ["fa2_rotary"])
+
+ time_f = {}
+ time_b = {}
+ time_f_b = {}
+ speed_f = {}
+ speed_b = {}
+ speed_f_b = {}
+ for causal in causal_vals:
+     for headdim in headdim_vals:
+         for batch_size, seqlen in bs_seqlen_vals:
+             config = (causal, headdim, batch_size, seqlen)
+             nheads = dim // headdim
+             q, k, v = [torch.randn(batch_size, seqlen, nheads, headdim, device=device, dtype=dtype,
+                                    requires_grad=True) for _ in range(3)]
+             # alibi_slopes = torch.rand(batch_size, nheads, device=device, dtype=torch.float32) * 0.3
+             alibi_slopes = torch.rand(1, nheads, device=device, dtype=torch.float32) * 0.3
+             attn_bias = attn_bias_from_alibi_slopes(alibi_slopes, seqlen, seqlen, causal=causal).to(dtype)
+             attn_bias = repeat(attn_bias, "1 ... -> b ...", b=batch_size)
+             f, b = time_fwd_bwd(
+                 flash_attn_func,
+                 q, k, v,
+                 dropout_p,
+                 causal=causal,
+                 # alibi_slopes=alibi_slopes,
+                 alibi_slopes=None,
+                 repeats=repeats,
+                 verbose=False
+             )
+             time_f[config, "fa2_baseline"] = f
+             time_b[config, "fa2_baseline"] = b
+
+             q = q.detach().requires_grad_(True)
+             k = k.detach().requires_grad_(True)
+             v = v.detach().requires_grad_(True)
+             f, b = time_fwd_bwd(
+                 flash_attn_func,
+                 q, k, v,
+                 dropout_p,
+                 causal=causal,
+                 alibi_slopes=rearrange(alibi_slopes, "1 h -> h"),
+                 # alibi_slopes=None,
+                 repeats=repeats,
+                 verbose=False
+             )
+             time_f[config, "fa2_alibi"] = f
+             time_b[config, "fa2_alibi"] = b
+
+             try:
+                 q = q.detach().requires_grad_(True)
+                 k = k.detach().requires_grad_(True)
+                 v = v.detach().requires_grad_(True)
+                 f, b = time_fwd_bwd(
+                     attention_pytorch,
+                     q, k, v,
+                     dropout_p,
+                     causal=causal,
+                     attn_bias=attn_bias,
+                     repeats=repeats,
+                     verbose=False
+                 )
+             except Exception:  # Skip if OOM
+                 f, b = float('nan'), float('nan')
+             time_f[config, "torch"] = f
+             time_b[config, "torch"] = b
+
+             # F.sdpa doesn't currently (torch 2.1) dispatch to flash-attn but just to be safe
+             with torch.backends.cuda.sdp_kernel(enable_flash=False):
+                 q_pt = q.detach().requires_grad_(True).transpose(1, 2)
+                 k_pt = k.detach().requires_grad_(True).transpose(1, 2)
+                 v_pt = v.detach().requires_grad_(True).transpose(1, 2)
+                 f, b = time_fwd_bwd(
+                     F.scaled_dot_product_attention,
+                     q_pt, k_pt, v_pt,
+                     attn_mask=attn_bias,
+                     dropout_p=dropout_p,
+                     is_causal=causal,
+                     repeats=repeats,
+                     verbose=False
+                 )
+                 time_f[config, "sdpa"] = f
+                 time_b[config, "sdpa"] = b
+
+             if xops is not None:
+                 q = q.detach().requires_grad_(True)
+                 k = k.detach().requires_grad_(True)
+                 v = v.detach().requires_grad_(True)
+                 if causal:
+                     attn_bias_xops = xops.LowerTriangularMask().add_bias(attn_bias.expand(-1, -1, seqlen, -1).to(dtype=q.dtype))
+                     # NotImplementedError: No operator found for `memory_efficient_attention_backward` with inputs:
+                     # `flshattB@v2.3.6` is not supported because:
+                     # attn_bias type is <class 'xformers.ops.fmha.attn_bias.LowerTriangularMaskWithTensorBias'>
+                     # `cutlassB` is not supported because:
+                     # attn_bias type is <class 'xformers.ops.fmha.attn_bias.LowerTriangularMaskWithTensorBias'>
+                     attn_bias_xops = attn_bias_xops.materialize((batch_size, nheads, seqlen, seqlen), dtype=q.dtype, device=device)
+                 else:
+                     attn_bias_xops = attn_bias.to(dtype=q.dtype)
+                 f, b = time_fwd_bwd(
+                     xops.memory_efficient_attention,
+                     q, k, v,
+                     attn_bias_xops,
+                     dropout_p,
+                     repeats=repeats,
+                     verbose=False
+                 )
+                 time_f[config, "xformers"] = f
+                 time_b[config, "xformers"] = b
+
+             q = q.detach().requires_grad_(True)
+             k = k.detach().requires_grad_(True)
+             v = v.detach().requires_grad_(True)
+             cos, sin = generate_cos_sin(seqlen, headdim, device, dtype)
+             f, b = time_fwd_bwd(
+                 flash_rotary,
+                 q, k, v,
+                 cos, sin,
+                 causal,
+                 repeats=repeats,
+                 verbose=False
+             )
+             time_f[config, "fa2_rotary"] = f
+             time_b[config, "fa2_rotary"] = b
+
+             print(f"### causal={causal}, headdim={headdim}, batch_size={batch_size}, seqlen={seqlen} ###")
+             csv_output = ""
+             csv_output += f"{causal},{headdim},{batch_size},{seqlen},"
+             for method in methods:
+                 time_f_b[config, method] = time_f[config, method] + time_b[config, method]
+                 speed_f[config, method] = efficiency(
+                     flops(batch_size, seqlen, headdim, nheads, causal, mode="fwd"),
+                     time_f[config, method]
+                 )
+                 speed_b[config, method] = efficiency(
+                     flops(batch_size, seqlen, headdim, nheads, causal, mode="bwd"),
+                     time_b[config, method]
+                 )
+                 speed_f_b[config, method] = efficiency(
+                     flops(batch_size, seqlen, headdim, nheads, causal, mode="fwd_bwd"),
+                     time_f_b[config, method]
+                 )
+                 print(
+                     f"{method} fwd: {speed_f[config, method]:.2f} TFLOPs/s, "
+                     f"bwd: {speed_b[config, method]:.2f} TFLOPs/s, "
+                     f"fwd + bwd: {speed_f_b[config, method]:.2f} TFLOPs/s"
+                 )
+                 csv_output += f"{speed_f[config, method]:.2f},{speed_b[config, method]:.2f},{speed_f_b[config, method]:.2f},"
+             print(csv_output)
cookbooks/flash-attention/benchmarks/benchmark_causal.py ADDED
@@ -0,0 +1,225 @@
+ import math
+ import torch
+ import torch.nn.functional as F
+
+ from einops import rearrange
+
+ from flash_attn.utils.benchmark import benchmark_forward, benchmark_fwd_bwd, pytorch_profiler
+ from flash_attn.flash_attn_interface import flash_attn_varlen_qkvpacked_func
+ # # from flash_attn.triton.fused_attention import attention as attention
+ # from flash_attn.flash_attn_triton import flash_attn_qkvpacked_func
+ # from flash_attn.flash_attn_triton_og import attention as attention_og
+
+ # from triton.ops.flash_attention import attention as attention_triton
+
+ from flash_attn import flash_attn_qkvpacked_func, flash_attn_kvpacked_func
+
+ try:
+     from flash_attn.fused_softmax import scaled_upper_triang_masked_softmax
+ except ImportError:
+     scaled_upper_triang_masked_softmax = None
+
+
+ def attention_pytorch(qkv, dropout_p=0.0, causal=True):
+     """
+     Arguments:
+         qkv: (batch_size, seqlen, 3, nheads, head_dim)
+         dropout_p: float
+     Output:
+         output: (batch_size, seqlen, nheads, head_dim)
+     """
+     batch_size, seqlen, _, nheads, d = qkv.shape
+     q, k, v = qkv.unbind(dim=2)
+     q = rearrange(q, 'b t h d -> (b h) t d')
+     k = rearrange(k, 'b s h d -> (b h) d s')
+     softmax_scale = 1.0 / math.sqrt(d)
+     # Preallocate attn_weights for `baddbmm`
+     scores = torch.empty(batch_size * nheads, seqlen, seqlen, dtype=qkv.dtype, device=qkv.device)
+     scores = rearrange(torch.baddbmm(scores, q, k, beta=0, alpha=softmax_scale),
+                        '(b h) t s -> b h t s', h=nheads)
+     if causal:
+         # "triu_tril_cuda_template" not implemented for 'BFloat16'
+         # So we have to construct the mask in float
+         causal_mask = torch.triu(torch.full((seqlen, seqlen), -10000.0, device=scores.device), 1)
+         # TD [2022-09-30]: Adding is faster than masked_fill_ (idk why, just better kernel I guess)
+         scores = scores + causal_mask.to(dtype=scores.dtype)
+     attention = torch.softmax(scores, dim=-1)
+     attention_drop = F.dropout(attention, dropout_p)
+     output = torch.einsum('bhts,bshd->bthd', attention_drop, v)
+     return output.to(dtype=qkv.dtype)
+
+
+ def attention_megatron(qkv):
+     """
+     Arguments:
+         qkv: (batch_size, seqlen, 3, nheads, head_dim)
+     Output:
+         output: (batch_size, seqlen, nheads, head_dim)
+     """
+     batch_size, seqlen, _, nheads, d = qkv.shape
+     q, k, v = qkv.unbind(dim=2)
+     q = rearrange(q, 'b t h d -> (b h) t d')
+     k = rearrange(k, 'b s h d -> (b h) d s')
+     softmax_scale = 1.0 / math.sqrt(d)
+     # Preallocate attn_weights for `baddbmm`
+     scores = torch.empty(batch_size * nheads, seqlen, seqlen, dtype=qkv.dtype, device=qkv.device)
+     scores = rearrange(torch.baddbmm(scores, q, k, beta=0, alpha=softmax_scale),
+                        '(b h) t s -> b h t s', h=nheads)
+     attention = scaled_upper_triang_masked_softmax(scores, None, scale=1.0)
+     output = torch.einsum('bhts,bshd->bthd', attention, v)
+     return output.to(dtype=qkv.dtype)
+
+
+ torch.manual_seed(0)
+ repeats = 30
+ batch_size = 8
+ seqlen = 2048
+ nheads = 12
+ headdim = 128
+ # nheads = 24
+ # headdim = 64
+ # batch_size = 64
+ # seqlen = 512
+ # nheads = 8
+ # headdim = 128
+ dropout_p = 0.0
+ causal = True
+ dtype = torch.float16
+ device = 'cuda'
+
+ qkv = torch.randn(batch_size, seqlen, 3, nheads, headdim, device=device, dtype=dtype,
+                   requires_grad=True)
+ cu_seqlens = torch.arange(0, (batch_size + 1) * seqlen, step=seqlen, dtype=torch.int32,
+                           device=qkv.device)
+
+ qkv_unpad = rearrange(qkv, 'b s ... -> (b s) ...').detach().requires_grad_(True)
+ # benchmark_all(flash_attn_varlen_qkvpacked_func, qkv_unpad,
+ #               cu_seqlens, seqlen, dropout_p, causal=causal, repeats=repeats, desc='FlashAttention')
+ # pytorch_profiler(flash_attn_varlen_qkvpacked_func, qkv_unpad,
+ #                  cu_seqlens, seqlen, dropout_p, causal=causal, backward=True)
+ benchmark_forward(flash_attn_qkvpacked_func, qkv, dropout_p, causal=causal, repeats=repeats, desc='Fav2')
+ pytorch_profiler(flash_attn_qkvpacked_func, qkv, dropout_p, causal=causal, backward=False)
+
+ # for dropout_p in [0.1, 0.0]:
+ #     for causal in [False, True]:
+ #         print(f"### {dropout_p = }, {causal = } ###")
+ #         pytorch_profiler(flash_attn_qkvpacked_func, qkv, dropout_p, causal=causal, backward=True)
+
+
+ # nheads_k = 2
+ # q = torch.randn(batch_size, seqlen, nheads, headdim, device=device, dtype=dtype, requires_grad=True)
+ # kv = torch.randn(batch_size, seqlen, 2, nheads_k, headdim, device=device, dtype=dtype,
+ #                  requires_grad=True)
+ # if flash_attn_kvpacked_func is not None:
+ #     benchmark_all(flash_attn_kvpacked_func, q, kv, dropout_p, causal=causal, repeats=repeats, desc='Fav2')
+ #     pytorch_profiler(flash_attn_kvpacked_func, q, kv, dropout_p, causal=causal, backward=True)
+
+ # dropout_p = 0.0
+ # causal = False
+ # benchmark_all(attention_pytorch, qkv, dropout_p, causal=causal,
+ #               repeats=repeats, desc='PyTorch Attention')
+
+ # benchmark_all(flash_attn_qkvpacked_func, qkv, None, causal, repeats=repeats, desc='FlashAttention Triton')
+ # pytorch_profiler(flash_attn_qkvpacked_func, qkv, None, causal, backward=True)
+
+ # q, k, v = [torch.randn(batch_size, nheads, seqlen, headdim, device=device, dtype=dtype,
+ #                        requires_grad=True) for _ in range(3)]
+ # benchmark_all(attention_og, q, k, v, 1.0, repeats=repeats, desc='FlashAttention Triton OG')
+ # # pytorch_profiler(attention, q, k, v, 1.0, backward=True)
+
+ # if scaled_upper_triang_masked_softmax is not None:
+ #     benchmark_all(attention_megatron, qkv, repeats=repeats, desc='Megatron Attention')
+
+ # from src.ops.fftconv import fftconv_func
+
+ # dim = nheads * headdim
+ # u = torch.randn(batch_size, dim, seqlen, device=device, dtype=dtype, requires_grad=True)
+ # k = torch.randn(dim, seqlen, device=device, requires_grad=True)
+ # D = torch.randn(dim, device=device, requires_grad=True)
+ # benchmark_all(fftconv_func, u, k, D, repeats=repeats, desc='FFTConv')
+ # pytorch_profiler(fftconv_func, u, k, D, backward=True)
+ # pytorch_profiler(torch.fft.rfft, u.float())
+
+ flops = 4 * batch_size * seqlen ** 2 * nheads * headdim
+ ideal_a100_time = flops / 312 / 1e9  # 312 TFLOPS peak -> flops / 312e9 gives milliseconds
+ print(f"Ideal A100 fwd time: {ideal_a100_time:.3f}ms, bwd time: {ideal_a100_time * 2.5:.3f}ms")
+ exit(0)  # NB: early exit left in by the author; remove this line to run the sweep below
+
+
+ def time_fwd_bwd(func, *args, **kwargs):
+     time_f, time_b = benchmark_fwd_bwd(func, *args, **kwargs)
+     return time_f[1].mean, time_b[1].mean
+
+ bs_seqlen_vals = [(32, 512), (16, 1024), (8, 2048), (4, 4096), (2, 8192), (1, 16384)]
+ causal_vals = [False, True]
+ headdim_vals = [64, 128]
+ dim = 2048
+ dropout_p = 0.0
+
+ time_f = {}
+ time_b = {}
+ for causal in causal_vals:
+     for headdim in headdim_vals:
+         for batch_size, seqlen in bs_seqlen_vals:
+             nheads = dim // headdim
+             qkv = torch.randn(batch_size, seqlen, 3, nheads, headdim, device=device, dtype=dtype,
+                               requires_grad=True)
+             cu_seqlens = torch.arange(0, (batch_size + 1) * seqlen, step=seqlen, dtype=torch.int32,
+                                       device=qkv.device)
+             qkv_unpad = rearrange(qkv, 'b s ... -> (b s) ...').detach().requires_grad_(True)
+             f, b = time_fwd_bwd(
+                 flash_attn_varlen_qkvpacked_func, qkv_unpad, cu_seqlens, seqlen, dropout_p,
+                 causal=causal, repeats=repeats, verbose=False
+             )
+             time_f[(causal, headdim, batch_size, seqlen), "Flash"] = f
+             time_b[(causal, headdim, batch_size, seqlen), "Flash"] = b
+
+             qkv = qkv.detach().requires_grad_(True)
+             # (was fav2_qkvpacked_func, which is undefined in this file)
+             f, b = time_fwd_bwd(
+                 flash_attn_qkvpacked_func, qkv, dropout_p, causal=causal, repeats=repeats, verbose=False
+             )
+             time_f[(causal, headdim, batch_size, seqlen), "Flash2"] = f
+             time_b[(causal, headdim, batch_size, seqlen), "Flash2"] = b
+
+             # q, k, v = [torch.randn(batch_size, nheads, seqlen, headdim, device=device, dtype=dtype,
+             #                        requires_grad=True) for _ in range(3)]
+             # # Try both values of sequence_parallel and pick the faster one
+             # f, b = time_fwd_bwd(
+             #     attention_triton, q, k, v, causal, headdim**(-0.5),
+             #     False, repeats=repeats, verbose=False
+             # )
+             # _, b0 = time_fwd_bwd(
+             #     attention_triton, q, k, v, causal, headdim**(-0.5),
+             #     True, repeats=repeats, verbose=False
+             # )
+             # time_f[(causal, headdim, batch_size, seqlen), "Triton"] = f
+             # time_b[(causal, headdim, batch_size, seqlen), "Triton"] = min(b, b0)
+
+             if seqlen <= 8 * 1024:
+                 qkv = qkv.detach().requires_grad_(True)
+                 f, b = time_fwd_bwd(
+                     attention_pytorch, qkv, dropout_p, causal=causal, repeats=repeats, verbose=False
+                 )
+             else:
+                 f, b = float('nan'), float('nan')
+             time_f[(causal, headdim, batch_size, seqlen), "Pytorch"] = f
+             time_b[(causal, headdim, batch_size, seqlen), "Pytorch"] = b
+
+             # q, k, v = [torch.randn(batch_size, seqlen, nheads, headdim, device=device, dtype=dtype,
+             #                        requires_grad=True) for _ in range(3)]
+             # import xformers.ops as xops
+             # f, b = time_fwd_bwd(
+             #     xops.memory_efficient_attention, q, k, v,
+             #     attn_bias=xops.LowerTriangularMask() if causal else None,
+             #     op=(xops.fmha.cutlass.FwOp, xops.fmha.cutlass.BwOp)
+             # )
+             # time_f[(causal, headdim, batch_size, seqlen), "xformers"] = f
+             # time_b[(causal, headdim, batch_size, seqlen), "xformers"] = b
+
+
+ import pickle
+ with open('flash2_attn_time_h100.plk', 'wb') as fp:
+     pickle.dump((time_f, time_b), fp, protocol=pickle.HIGHEST_PROTOCOL)
cookbooks/flash-attention/benchmarks/benchmark_flash_attention.py ADDED
@@ -0,0 +1,180 @@
1
+ # Install the newest triton version with
2
+ # pip install "git+https://github.com/openai/triton.git#egg=triton&subdirectory=python"
3
+ import pickle
4
+ import math
5
+ import torch
6
+ import torch.nn as nn
7
+ import torch.nn.functional as F
8
+
9
+ from einops import rearrange, repeat
10
+
11
+ from flash_attn.utils.benchmark import benchmark_all, benchmark_forward, benchmark_backward
12
+ from flash_attn.utils.benchmark import benchmark_fwd_bwd, benchmark_combined
13
+
14
+ from flash_attn import flash_attn_qkvpacked_func
15
+
16
+ try:
17
+ from triton.ops.flash_attention import attention as attention_triton
18
+ except ImportError:
19
+ attention_triton = None
20
+
21
+ try:
22
+ import xformers.ops as xops
23
+ except ImportError:
24
+ xops = None
25
+
26
+
27
+ def flops(batch, seqlen, headdim, nheads, causal, mode="fwd"):
28
+ assert mode in ["fwd", "bwd", "fwd_bwd"]
29
+ f = 4 * batch * seqlen**2 * nheads * headdim // (2 if causal else 1)
30
+ return f if mode == "fwd" else (2.5 * f if mode == "bwd" else 3.5 * f)
31
+
32
+ def efficiency(flop, time):
33
+ return (flop / time / 10**12) if not math.isnan(time) else 0.0
34
+
35
+
36
+ def attention_pytorch(qkv, dropout_p=0.0, causal=True):
37
+ """
38
+ Arguments:
39
+ qkv: (batch_size, seqlen, 3, nheads, head_dim)
40
+ dropout_p: float
41
+ Output:
42
+ output: (batch_size, seqlen, nheads, head_dim)
43
+ """
44
+ batch_size, seqlen, _, nheads, d = qkv.shape
45
+ q, k, v = qkv.unbind(dim=2)
46
+ q = rearrange(q, 'b t h d -> (b h) t d')
47
+ k = rearrange(k, 'b s h d -> (b h) d s')
48
+ softmax_scale = 1.0 / math.sqrt(d)
49
+ # Preallocate attn_weights for `baddbmm`
50
+ scores = torch.empty(batch_size * nheads, seqlen, seqlen, dtype=qkv.dtype, device=qkv.device)
51
+ scores = rearrange(torch.baddbmm(scores, q, k, beta=0, alpha=softmax_scale),
52
+ '(b h) t s -> b h t s', h=nheads)
53
+ if causal:
54
+ # "triu_tril_cuda_template" not implemented for 'BFloat16'
55
+ # So we have to construct the mask in float
56
+ causal_mask = torch.triu(torch.full((seqlen, seqlen), -10000.0, device=scores.device), 1)
57
+ # TD [2022-09-30]: Adding is faster than masked_fill_ (idk why, just better kernel I guess)
58
+ scores = scores + causal_mask.to(dtype=scores.dtype)
59
+ attention = torch.softmax(scores, dim=-1)
60
+ attention_drop = F.dropout(attention, dropout_p)
61
+     output = torch.einsum('bhts,bshd->bthd', attention_drop, v)
+     return output.to(dtype=qkv.dtype)
+
+
+ def time_fwd_bwd(func, *args, **kwargs):
+     time_f, time_b = benchmark_fwd_bwd(func, *args, **kwargs)
+     return time_f[1].mean, time_b[1].mean
+
+
+ repeats = 30
+ device = 'cuda'
+ dtype = torch.float16
+
+ bs_seqlen_vals = [(32, 512), (16, 1024), (8, 2048), (4, 4096), (2, 8192), (1, 16384)]
+ causal_vals = [False, True]
+ headdim_vals = [64, 128]
+ dim = 2048
+ dropout_p = 0.0
+
+ methods = (["Flash2", "Pytorch"]
+            + (["Triton"] if attention_triton is not None else [])
+            + (["xformers.c"] if xops is not None else [])
+            + (["xformers.f"] if xops is not None else []))
+
+ time_f = {}
+ time_b = {}
+ time_f_b = {}
+ speed_f = {}
+ speed_b = {}
+ speed_f_b = {}
+ for causal in causal_vals:
+     for headdim in headdim_vals:
+         for batch_size, seqlen in bs_seqlen_vals:
+             config = (causal, headdim, batch_size, seqlen)
+             nheads = dim // headdim
+             qkv = torch.randn(batch_size, seqlen, 3, nheads, headdim, device=device, dtype=dtype,
+                               requires_grad=True)
+             f, b = time_fwd_bwd(
+                 flash_attn_qkvpacked_func, qkv, dropout_p, causal=causal, repeats=repeats, verbose=False
+             )
+             time_f[config, "Flash2"] = f
+             time_b[config, "Flash2"] = b
+
+             try:
+                 qkv = qkv.detach().requires_grad_(True)
+                 f, b = time_fwd_bwd(
+                     attention_pytorch, qkv, dropout_p, causal=causal, repeats=repeats, verbose=False
+                 )
+             except RuntimeError:  # Skip if OOM
+                 f, b = float('nan'), float('nan')
+             time_f[config, "Pytorch"] = f
+             time_b[config, "Pytorch"] = b
+
+             if attention_triton is not None:
+                 q, k, v = [torch.randn(batch_size, nheads, seqlen, headdim, device=device, dtype=dtype,
+                                        requires_grad=True) for _ in range(3)]
+                 # Try both values of sequence_parallel and pick the faster one
+                 try:
+                     f, b = time_fwd_bwd(
+                         attention_triton, q, k, v, causal, headdim**(-0.5),
+                         False, repeats=repeats, verbose=False
+                     )
+                 except Exception:
+                     f, b = float('nan'), float('inf')
+                 try:
+                     _, b0 = time_fwd_bwd(
+                         attention_triton, q, k, v, causal, headdim**(-0.5),
+                         True, repeats=repeats, verbose=False
+                     )
+                 except Exception:
+                     b0 = float('inf')
+                 time_f[config, "Triton"] = f
+                 time_b[config, "Triton"] = min(b, b0) if min(b, b0) < float('inf') else float('nan')
+
+             if xops is not None:
+                 q, k, v = [torch.randn(batch_size, seqlen, nheads, headdim, device=device, dtype=dtype,
+                                        requires_grad=True) for _ in range(3)]
+                 f, b = time_fwd_bwd(
+                     xops.memory_efficient_attention, q, k, v,
+                     attn_bias=xops.LowerTriangularMask() if causal else None,
+                     op=(xops.fmha.cutlass.FwOp, xops.fmha.cutlass.BwOp)
+                 )
+                 time_f[config, "xformers.c"] = f
+                 time_b[config, "xformers.c"] = b
+
+             if xops is not None:
+                 q, k, v = [torch.randn(batch_size, seqlen, nheads, headdim, device=device, dtype=dtype,
+                                        requires_grad=True) for _ in range(3)]
+                 f, b = time_fwd_bwd(
+                     xops.memory_efficient_attention, q, k, v,
+                     attn_bias=xops.LowerTriangularMask() if causal else None,
+                     op=(xops.fmha.flash.FwOp, xops.fmha.flash.BwOp)
+                 )
+                 time_f[config, "xformers.f"] = f
+                 time_b[config, "xformers.f"] = b
+
+             print(f"### causal={causal}, headdim={headdim}, batch_size={batch_size}, seqlen={seqlen} ###")
+             for method in methods:
+                 time_f_b[config, method] = time_f[config, method] + time_b[config, method]
+                 speed_f[config, method] = efficiency(
+                     flops(batch_size, seqlen, headdim, nheads, causal, mode="fwd"),
+                     time_f[config, method]
+                 )
+                 speed_b[config, method] = efficiency(
+                     flops(batch_size, seqlen, headdim, nheads, causal, mode="bwd"),
+                     time_b[config, method]
+                 )
+                 speed_f_b[config, method] = efficiency(
+                     flops(batch_size, seqlen, headdim, nheads, causal, mode="fwd_bwd"),
+                     time_f_b[config, method]
+                 )
+                 print(
+                     f"{method} fwd: {speed_f[config, method]:.2f} TFLOPs/s, "
+                     f"bwd: {speed_b[config, method]:.2f} TFLOPs/s, "
+                     f"fwd + bwd: {speed_f_b[config, method]:.2f} TFLOPs/s"
+                 )
+
+
+ # with open('flash2_attn_time.plk', 'wb') as fp:
+ #     pickle.dump((speed_f, speed_b, speed_f_b), fp, protocol=pickle.HIGHEST_PROTOCOL)
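The loop above relies on `flops(...)` and `efficiency(...)` helpers defined near the top of this benchmark file, outside this excerpt. As a reading aid, here is a minimal sketch of what they are assumed to compute: attention performs two (seqlen x seqlen) matmuls per head (4 FLOPs per element counting multiply and add over both matmuls), causal masking roughly halves the work, and the backward pass is commonly accounted as 2.5x the forward:

```python
import math

# Sketch (assumption, not the file's exact code) of the helpers used above.
def flops(batch, seqlen, headdim, nheads, causal, mode="fwd"):
    assert mode in ["fwd", "bwd", "fwd_bwd"]
    f = 4 * batch * seqlen**2 * nheads * headdim // (2 if causal else 1)
    return f if mode == "fwd" else (2.5 * f if mode == "bwd" else 3.5 * f)

def efficiency(flop, time):
    # Convert to TFLOPs/s; map NaN timings (e.g. after an OOM skip) to 0.
    return (flop / time / 1e12) if not math.isnan(time) else 0.0
```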
cookbooks/flash-attention/benchmarks/benchmark_gemm.py ADDED
@@ -0,0 +1,47 @@
+ import time
+ import torch
+ import torch.utils.benchmark as benchmark
+
+ from triton.testing import do_bench
+
+ if torch.version.cuda:
+     backendBLAS = "cuBLAS"
+ elif torch.version.hip:
+     backendBLAS = "hipBLAS"
+ else:
+     backendBLAS = "BLAS"  # fallback label so the script doesn't NameError on other backends
+
+ def benchmark_forward(fn, *inputs, repeats=10, desc='', verbose=True, **kwinputs):
+     """Use Pytorch Benchmark on the forward pass of an arbitrary function."""
+     if verbose:
+         print(desc, '- Forward pass')
+     t = benchmark.Timer(
+         stmt='fn(*inputs, **kwinputs)',
+         globals={'fn': fn, 'inputs': inputs, 'kwinputs': kwinputs},
+         num_threads=torch.get_num_threads(),
+     )
+     m = t.timeit(repeats)
+     if verbose:
+         print(m)
+     return t, m
+
+
+ torch.manual_seed(0)
+ repeats = 30
+ dtype = torch.bfloat16
+ device = 'cuda'
+ verbose = False
+ m, n = 8192, 8192
+
+ tflops_matmul = {}
+ tflops_matmul1 = {}
+ for k in [512, 1024, 1536, 2048, 2560, 3072, 3584, 4096, 4608, 5120, 5632, 6144, 6656, 7168, 7680, 8192]:
+     a = torch.randn(m, k, device=device, dtype=dtype)
+     b = torch.randn(n, k, device=device, dtype=dtype).transpose(-1, -2)
+     nFLOPS_matmul = 2 * m * n * k
+     time.sleep(2)  # to reduce power throttling
+     timing = benchmark_forward(torch.matmul, a, b, desc=backendBLAS, verbose=verbose, repeats=repeats)[1]
+     tflops_matmul[k] = nFLOPS_matmul / timing.mean * 1e-12
+     print(f'[torch.utils.benchmark] {backendBLAS}, {m = }, {n = }, {k = }: {timing.mean * 1e3:.3f}ms, {tflops_matmul[k]:.1f} TFLOPS')
+     time.sleep(2)  # to reduce power throttling
+     ms = do_bench(lambda: torch.matmul(a, b), warmup=10, rep=repeats)
+     tflops_matmul1[k] = nFLOPS_matmul / ms * 1e-9
+     print(f'[triton.testing.do_bench] {backendBLAS}, {m = }, {n = }, {k = }: {ms:.3f}ms, {tflops_matmul1[k]:.1f} TFLOPS')
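Both prints report the same quantity through different units: `benchmark_forward` yields seconds, so TFLOPS = FLOPs / t / 1e12, while `do_bench` yields milliseconds, so TFLOPS = FLOPs / ms * 1e-9. A quick sanity check of that arithmetic with illustrative numbers:

```python
nflops = 2 * 8192 * 8192 * 4096   # 2*m*n*k FLOPs for one GEMM, ~5.5e11
t_sec, t_ms = 0.004, 4.0          # the same hypothetical timing in both units
assert abs(nflops / t_sec * 1e-12 - nflops / t_ms * 1e-9) < 1e-9  # both ~137.4 TFLOPS
```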
cookbooks/flash-attention/csrc/flash_attn/flash_api.cpp ADDED
@@ -0,0 +1,1485 @@
+ /******************************************************************************
+  * Copyright (c) 2024, Tri Dao.
+  ******************************************************************************/
+
+ // Include these 2 headers instead of torch/extension.h since we don't need all of the torch headers.
+ #include <torch/python.h>
+ #include <torch/nn/functional.h>
+ #include <c10/cuda/CUDAGuard.h>
+ #include <c10/cuda/CUDAStream.h>
+ #include <ATen/cuda/CUDAGeneratorImpl.h> // For at::Generator and at::PhiloxCudaState
+ #include "philox_unpack.cuh" // For at::cuda::philox::unpack
+
+ #include <cutlass/numeric_types.h>
+
+ #include "namespace_config.h"
+ #include "hardware_info.h"
+ #include "flash.h"
+ #include "static_switch.h"
+
+ #define CHECK_DEVICE(x) TORCH_CHECK(x.is_cuda(), #x " must be on CUDA")
+ #define CHECK_SHAPE(x, ...) TORCH_CHECK(x.sizes() == torch::IntArrayRef({__VA_ARGS__}), #x " must have shape (" #__VA_ARGS__ ")")
+ #define CHECK_CONTIGUOUS(x) TORCH_CHECK(x.is_contiguous(), #x " must be contiguous")
+
+ namespace FLASH_NAMESPACE {
+
+ void set_params_fprop(Flash_fwd_params &params,
+                       // sizes
+                       const size_t b,
+                       const size_t seqlen_q,
+                       const size_t seqlen_k,
+                       const size_t seqlen_q_rounded,
+                       const size_t seqlen_k_rounded,
+                       const size_t h,
+                       const size_t h_k,
+                       const size_t d,
+                       const size_t d_rounded,
+                       // device pointers
+                       const at::Tensor q,
+                       const at::Tensor k,
+                       const at::Tensor v,
+                       at::Tensor out,
+                       void *cu_seqlens_q_d,
+                       void *cu_seqlens_k_d,
+                       void *seqused_k,
+                       void *p_d,
+                       void *softmax_lse_d,
+                       float p_dropout,
+                       float softmax_scale,
+                       int window_size_left,
+                       int window_size_right,
+                       const float softcap,
+                       bool seqlenq_ngroups_swapped=false,
+                       const bool unpadded_lse=false) {
+
+     // Reset the parameters
+     params = {};
+
+     params.is_bf16 = q.dtype() == torch::kBFloat16;
+
+     // Set the pointers and strides.
+     params.q_ptr = q.data_ptr();
+     params.k_ptr = k.data_ptr();
+     params.v_ptr = v.data_ptr();
+     // All strides are in elements, not bytes.
+     params.q_row_stride = q.stride(-3);
+     params.k_row_stride = k.stride(-3);
+     params.v_row_stride = v.stride(-3);
+     params.q_head_stride = q.stride(-2);
+     params.k_head_stride = k.stride(-2);
+     params.v_head_stride = v.stride(-2);
+     params.o_ptr = out.data_ptr();
+     params.o_row_stride = out.stride(-3);
+     params.o_head_stride = out.stride(-2);
+
+     if (cu_seqlens_q_d == nullptr) {
+         params.q_batch_stride = q.stride(0);
+         params.k_batch_stride = k.stride(0);
+         params.v_batch_stride = v.stride(0);
+         params.o_batch_stride = out.stride(0);
+         if (seqlenq_ngroups_swapped) {
+             params.q_batch_stride *= seqlen_q;
+             params.o_batch_stride *= seqlen_q;
+         }
+     }
+
+     params.cu_seqlens_q = static_cast<int *>(cu_seqlens_q_d);
+     params.cu_seqlens_k = static_cast<int *>(cu_seqlens_k_d);
+     params.seqused_k = static_cast<int *>(seqused_k);
+
+     // P = softmax(QK^T)
+     params.p_ptr = p_d;
+
+     // Softmax sum
+     params.softmax_lse_ptr = softmax_lse_d;
+
+     // Set the dimensions.
+     params.b = b;
+     params.h = h;
+     params.h_k = h_k;
+     params.h_h_k_ratio = h / h_k;
+     params.seqlen_q = seqlen_q;
+     params.seqlen_k = seqlen_k;
+     params.seqlen_q_rounded = seqlen_q_rounded;
+     params.seqlen_k_rounded = seqlen_k_rounded;
+     params.d = d;
+     params.d_rounded = d_rounded;
+
+     // Set the different scale values.
+     #ifdef FLASHATTENTION_DISABLE_SOFTCAP
+     TORCH_CHECK(softcap <= 0.0, "This flash attention build does not support softcap.");
+     #endif
+     if (softcap > 0.0) {
+         params.softcap = softmax_scale / softcap;
+         params.scale_softmax = softcap;
+         params.scale_softmax_log2 = softcap * M_LOG2E;
+     } else {
+         // Remove potential NaN
+         params.softcap = 0.0;
+         params.scale_softmax = softmax_scale;
+         params.scale_softmax_log2 = softmax_scale * M_LOG2E;
+     }
+
+     // Set this to probability of keeping an element to simplify things.
+     params.p_dropout = 1.f - p_dropout;
+     // Convert p from float to int so we don't have to convert the random uint to float to compare.
+     // [Minor] We want to round down since when we do the comparison we use <= instead of <
+     // params.p_dropout_in_uint = uint32_t(std::floor(params.p_dropout * 4294967295.0));
+     // params.p_dropout_in_uint16_t = uint16_t(std::floor(params.p_dropout * 65535.0));
+     params.p_dropout_in_uint8_t = uint8_t(std::floor(params.p_dropout * 255.0));
+     params.rp_dropout = 1.f / params.p_dropout;
+     params.scale_softmax_rp_dropout = params.rp_dropout * params.scale_softmax;
+     TORCH_CHECK(p_dropout < 1.f);
+     #ifdef FLASHATTENTION_DISABLE_DROPOUT
+     TORCH_CHECK(p_dropout == 0.0f, "This flash attention build does not support dropout.");
+     #endif
+
+     // Causal is the special case where window_size_right == 0 and window_size_left < 0.
+     // Local is the more general case where window_size_right >= 0 or window_size_left >= 0.
+     params.is_causal = window_size_left < 0 && window_size_right == 0;
+
+     if (window_size_left < 0 && window_size_right >= 0) { window_size_left = seqlen_k; }
+     if (window_size_left >= 0 && window_size_right < 0) { window_size_right = seqlen_k; }
+     params.window_size_left = window_size_left;
+     params.window_size_right = window_size_right;
+
+     #ifdef FLASHATTENTION_DISABLE_LOCAL
+     TORCH_CHECK(params.is_causal || (window_size_left < 0 && window_size_right < 0),
+                 "This flash attention build does not support local attention.");
+     #endif
+
+     params.is_seqlens_k_cumulative = true;
+
+     #ifdef FLASHATTENTION_DISABLE_UNEVEN_K
+     TORCH_CHECK(d == d_rounded, "This flash attention build does not support headdim not being a multiple of 32.");
+     #endif
+
+     params.unpadded_lse = unpadded_lse;
+     params.seqlenq_ngroups_swapped = seqlenq_ngroups_swapped;
+ }
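The softcap branch in `set_params_fprop` folds two scales into the params: the factor applied inside tanh (`params.softcap = softmax_scale / softcap`) and the factor applied outside (`params.scale_softmax = softcap`). A minimal NumPy sketch of the effective score this parameterization implies, assuming the kernel computes `scale_softmax * tanh(qk * params.softcap)` (an illustration, not the kernel code):

```python
import numpy as np

def softcapped_score(qk, softmax_scale, softcap):
    # Near-linear for small scores, saturating at +/- softcap for large ones.
    return softcap * np.tanh(qk * (softmax_scale / softcap))

assert np.isclose(softcapped_score(0.01, 1.0, 30.0), 0.01, atol=1e-4)  # linear regime
assert abs(softcapped_score(1e6, 1.0, 30.0)) <= 30.0                   # saturates
```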
+
+ void set_params_dgrad(Flash_bwd_params &params,
+                       // sizes
+                       const size_t b,
+                       const size_t seqlen_q,
+                       const size_t seqlen_k,
+                       const size_t seqlen_q_rounded,
+                       const size_t seqlen_k_rounded,
+                       const size_t h,
+                       const size_t h_k,
+                       const size_t d,
+                       const size_t d_rounded,
+                       // device pointers
+                       const at::Tensor q,
+                       const at::Tensor k,
+                       const at::Tensor v,
+                       const at::Tensor out,
+                       const at::Tensor dout,
+                       at::Tensor dq,
+                       at::Tensor dk,
+                       at::Tensor dv,
+                       void *cu_seqlens_q_d,
+                       void *cu_seqlens_k_d,
+                       void *dq_accum_d,
+                       void *dk_accum_d,
+                       void *dv_accum_d,
+                       void *softmax_lse_d,
+                       void *dsoftmax_sum_d,
+                       float p_dropout,
+                       float softmax_scale,
+                       int window_size_left,
+                       int window_size_right,
+                       const float softcap,
+                       bool deterministic,
+                       const bool unpadded_lse) {
+
+     set_params_fprop(params,
+                      b, seqlen_q, seqlen_k, seqlen_q_rounded, seqlen_k_rounded, h, h_k, d, d_rounded,
+                      q, k, v, out,
+                      cu_seqlens_q_d,
+                      cu_seqlens_k_d,
+                      nullptr,
+                      nullptr,
+                      softmax_lse_d,
+                      p_dropout,
+                      softmax_scale,
+                      window_size_left,
+                      window_size_right,
+                      softcap,
+                      false, // seqlenq_ngroups_swapped
+                      unpadded_lse);
+
+     // Set the pointers and strides.
+     params.do_ptr = dout.data_ptr();
+     params.do_row_stride = dout.stride(-3);
+     params.do_head_stride = dout.stride(-2);
+     params.dq_ptr = dq.data_ptr();
+     params.dk_ptr = dk.data_ptr();
+     params.dv_ptr = dv.data_ptr();
+     params.dq_row_stride = dq.stride(-3);
+     params.dk_row_stride = dk.stride(-3);
+     params.dv_row_stride = dv.stride(-3);
+     params.dq_head_stride = dq.stride(-2);
+     params.dk_head_stride = dk.stride(-2);
+     params.dv_head_stride = dv.stride(-2);
+
+     if (cu_seqlens_q_d == nullptr) {
+         params.do_batch_stride = dout.stride(0);
+         params.dq_batch_stride = dq.stride(0);
+         params.dk_batch_stride = dk.stride(0);
+         params.dv_batch_stride = dv.stride(0);
+     }
+
+     params.dq_accum_ptr = dq_accum_d;
+     params.dk_accum_ptr = dk_accum_d;
+     params.dv_accum_ptr = dv_accum_d;
+
+     // Softmax sum
+     params.dsoftmax_sum = dsoftmax_sum_d;
+
+     params.deterministic = deterministic;
+ }
+
+ void run_mha_fwd(Flash_fwd_params &params, cudaStream_t stream, bool force_split_kernel=false) {
+     FP16_SWITCH(!params.is_bf16, [&] {
+         HEADDIM_SWITCH(params.d, [&] {
+             BOOL_SWITCH(params.is_causal, Is_causal, [&] {
+                 if (params.num_splits <= 1 && !force_split_kernel) {  // If we don't set it, num_splits == 0
+                     run_mha_fwd_<elem_type, kHeadDim, Is_causal>(params, stream);
+                 } else {
+                     run_mha_fwd_splitkv_dispatch<elem_type, kHeadDim, Is_causal>(params, stream);
+                 }
+             });
+         });
+     });
+ }
+
+ // Find the number of splits that maximizes the occupancy. For example, if we have
+ // batch * n_heads = 48 and we have 108 SMs, having 2 splits (efficiency = 0.89) is
+ // better than having 3 splits (efficiency = 0.67). However, we also don't want too many
+ // splits as that would incur more HBM reads/writes.
+ // So we find the best efficiency, then find the smallest number of splits that gets 85%
+ // of the best efficiency.
+ inline int num_splits_heuristic(int batch_nheads_mblocks, int num_SMs, int num_n_blocks, int max_splits) {
+     // If we have enough to almost fill the SMs, then just use 1 split
+     if (batch_nheads_mblocks >= 0.8f * num_SMs) { return 1; }
+     max_splits = std::min({max_splits, num_SMs, num_n_blocks});
+     float max_efficiency = 0.f;
+     std::vector<float> efficiency;
+     efficiency.reserve(max_splits);
+     auto ceildiv = [](int a, int b) { return (a + b - 1) / b; };
+     // Some splits are not eligible. For example, if we have 64 blocks and choose 11 splits,
+     // we'll have 6 * 10 + 4 blocks. If we choose 12 splits, we'll have 6 * 11 + (-2) blocks
+     // (i.e. it's 11 splits anyway).
+     // So we check if the number of blocks per split is the same as the previous num_splits.
+     auto is_split_eligible = [&ceildiv, &num_n_blocks](int num_splits) {
+         return num_splits == 1 || ceildiv(num_n_blocks, num_splits) != ceildiv(num_n_blocks, num_splits - 1);
+     };
+     for (int num_splits = 1; num_splits <= max_splits; num_splits++) {
+         if (!is_split_eligible(num_splits)) {
+             efficiency.push_back(0.f);
+         } else {
+             float n_waves = float(batch_nheads_mblocks * num_splits) / num_SMs;
+             float eff = n_waves / ceil(n_waves);
+             // printf("num_splits = %d, eff = %f\n", num_splits, eff);
+             if (eff > max_efficiency) { max_efficiency = eff; }
+             efficiency.push_back(eff);
+         }
+     }
+     for (int num_splits = 1; num_splits <= max_splits; num_splits++) {
+         if (!is_split_eligible(num_splits)) { continue; }
+         if (efficiency[num_splits - 1] >= 0.85 * max_efficiency) {
+             // printf("num_splits chosen = %d\n", num_splits);
+             return num_splits;
+         }
+     }
+     return 1;
+ }
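To make the occupancy trade-off concrete, here is a Python mirror of `num_splits_heuristic` (a sketch for illustration; the shipped logic is the C++ above). With 48 m-blocks on 108 SMs it picks 2 splits (wave efficiency ~0.89) rather than 3 (~0.67), matching the comment:

```python
import math

def num_splits_heuristic(batch_nheads_mblocks, num_sms, num_n_blocks, max_splits):
    if batch_nheads_mblocks >= 0.8 * num_sms:   # SMs are nearly full already
        return 1
    max_splits = min(max_splits, num_sms, num_n_blocks)
    ceildiv = lambda a, b: -(-a // b)
    # A split count is redundant if it yields the same blocks-per-split as count - 1.
    eligible = lambda s: s == 1 or ceildiv(num_n_blocks, s) != ceildiv(num_n_blocks, s - 1)
    effs = []
    for s in range(1, max_splits + 1):
        if not eligible(s):
            effs.append(0.0)
            continue
        n_waves = batch_nheads_mblocks * s / num_sms
        effs.append(n_waves / math.ceil(n_waves))  # fraction of the last wave that is full
    best = max(effs)
    # Smallest split count within 85% of the best efficiency wins (fewer HBM reads/writes).
    for s in range(1, max_splits + 1):
        if eligible(s) and effs[s - 1] >= 0.85 * best:
            return s
    return 1

print(num_splits_heuristic(48, 108, 64, 128))  # -> 2
```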
+
+ std::tuple<at::Tensor, at::Tensor> set_params_splitkv(Flash_fwd_params &params, const int batch_size,
+                                                       const int num_heads, const int head_size, const int max_seqlen_k, const int max_seqlen_q,
+                                                       const int head_size_rounded, const float p_dropout,
+                                                       const int num_splits, const int num_sm, struct c10::TensorOptions opts) {
+
+     // This needs to match with run_mha_fwd_splitkv_dispatch
+     const int block_n = head_size <= 64 ? 256 : (head_size <= 128 ? 128 : 64);
+     const int num_n_blocks = (max_seqlen_k + block_n - 1) / block_n;
+     // Technically kBlockM = 64 only for the splitKV kernels, not the standard kernel.
+     // In any case we don't expect seqlen_q to be larger than 64 for inference.
+     const int num_m_blocks = (max_seqlen_q + 64 - 1) / 64;
+     params.num_splits = num_splits;
+     at::Tensor softmax_lse_accum;
+     at::Tensor out_accum;
+
+     if (p_dropout == 0.0f) {  // SplitKV is not implemented for dropout
+         if (num_splits < 1) {
+             // We multiply number of SMs by 2 to hard-code the fact that we're using 128 threads per block.
+             params.num_splits = num_splits_heuristic(batch_size * num_heads * num_m_blocks, num_sm * 2, num_n_blocks, 128);
+         }
+         if (params.num_splits > 1) {
+             softmax_lse_accum = torch::empty({params.num_splits, batch_size, num_heads, max_seqlen_q}, opts.dtype(at::kFloat));
+             out_accum = torch::empty({params.num_splits, batch_size, num_heads, max_seqlen_q, head_size_rounded}, opts.dtype(at::kFloat));
+             params.softmax_lseaccum_ptr = softmax_lse_accum.data_ptr();
+             params.oaccum_ptr = out_accum.data_ptr();
+         }
+         TORCH_CHECK(params.num_splits <= 128, "num_splits > 128 not supported");
+     }
+
+     return std::make_tuple(softmax_lse_accum, out_accum);
+ }
+
+ void set_params_alibi(Flash_fwd_params &params, std::optional<at::Tensor> &alibi_slopes_, int batch_size, int num_heads) {
+     #ifdef FLASHATTENTION_DISABLE_ALIBI
+     TORCH_CHECK(!alibi_slopes_.has_value(), "This flash attention build does not support alibi.");
+     params.alibi_slopes_ptr = nullptr;
+     #else
+     if (alibi_slopes_.has_value()) {
+         auto alibi_slopes = alibi_slopes_.value();
+         TORCH_CHECK(alibi_slopes.dtype() == torch::kFloat32, "ALiBi slopes must have dtype fp32");
+         CHECK_DEVICE(alibi_slopes);
+         TORCH_CHECK(alibi_slopes.stride(-1) == 1, "ALiBi slopes tensor must have contiguous last dimension");
+         TORCH_CHECK(alibi_slopes.sizes() == torch::IntArrayRef({num_heads}) || alibi_slopes.sizes() == torch::IntArrayRef({batch_size, num_heads}));
+         params.alibi_slopes_ptr = alibi_slopes.data_ptr();
+         params.alibi_slopes_batch_stride = alibi_slopes.dim() == 2 ? alibi_slopes.stride(0) : 0;
+     } else {
+         params.alibi_slopes_ptr = nullptr;
+     }
+     #endif
+ }
+
+ std::vector<at::Tensor>
+ mha_fwd(at::Tensor &q,        // batch_size x seqlen_q x num_heads x round_multiple(head_size, 8)
+         const at::Tensor &k,  // batch_size x seqlen_k x num_heads_k x round_multiple(head_size, 8)
+         const at::Tensor &v,  // batch_size x seqlen_k x num_heads_k x round_multiple(head_size, 8)
+         std::optional<at::Tensor> &out_,           // batch_size x seqlen_q x num_heads x round_multiple(head_size, 8)
+         std::optional<at::Tensor> &alibi_slopes_,  // num_heads or batch_size x num_heads
+         const float p_dropout,
+         const float softmax_scale,
+         bool is_causal,
+         int window_size_left,
+         int window_size_right,
+         const float softcap,
+         const bool return_softmax,
+         std::optional<at::Generator> gen_) {
+
+     // Otherwise the kernel will be launched from cuda:0 device
+     at::cuda::CUDAGuard device_guard{q.device()};
+
+     auto [cc_major, cc_minor] = get_compute_capability(get_current_device());
+     bool is_sm8x_min = cc_major >= 8;
+     TORCH_CHECK(is_sm8x_min, "FlashAttention only supports Ampere GPUs or newer.");
+
+     auto q_dtype = q.dtype();
+     TORCH_CHECK(q_dtype == torch::kFloat16 || q_dtype == torch::kBFloat16,
+                 "FlashAttention only supports fp16 and bf16 data type");
+     TORCH_CHECK(k.dtype() == q_dtype, "query and key must have the same dtype");
+     TORCH_CHECK(v.dtype() == q_dtype, "query and value must have the same dtype");
+
+     CHECK_DEVICE(q); CHECK_DEVICE(k); CHECK_DEVICE(v);
+
+     TORCH_CHECK(q.stride(-1) == 1, "Input tensor must have contiguous last dimension");
+     TORCH_CHECK(k.stride(-1) == 1, "Input tensor must have contiguous last dimension");
+     TORCH_CHECK(v.stride(-1) == 1, "Input tensor must have contiguous last dimension");
+
+     const auto sizes = q.sizes();
+
+     const int batch_size = sizes[0];
+     int seqlen_q = sizes[1];
+     int num_heads = sizes[2];
+     const int head_size = sizes[3];
+     const int seqlen_k = k.size(1);
+     const int num_heads_k = k.size(2);
+     TORCH_CHECK(batch_size > 0, "batch size must be positive");
+     TORCH_CHECK(head_size <= 256, "FlashAttention forward only supports head dimension at most 256");
+     TORCH_CHECK(head_size % 8 == 0, "query, key, value, and out_ must have a head_size that is a multiple of 8");
+     TORCH_CHECK(num_heads % num_heads_k == 0, "Number of heads in key/value must divide number of heads in query");
+
+     if (softcap > 0.f) { TORCH_CHECK(p_dropout == 0.f, "Softcapping does not support dropout for now"); }
+
+     if (window_size_left >= seqlen_k) { window_size_left = -1; }
+     if (window_size_right >= seqlen_k) { window_size_right = -1; }
+
+     // causal=true is the same as causal=false in this case
+     if (seqlen_q == 1 && !alibi_slopes_.has_value()) { is_causal = false; }
+     if (is_causal) { window_size_right = 0; }
+
+     // Faster to transpose q from (b, 1, (nheads_kv ngroups), d) to (b, ngroups, nheads_kv, d) in this case
+     // H/t Daniel Haziza
+     const int seqlenq_ngroups_swapped = seqlen_q == 1 && num_heads > num_heads_k && window_size_left < 0 && window_size_right < 0 && p_dropout == 0.f && head_size % 8 == 0 && !alibi_slopes_.has_value();
+     const int ngroups = num_heads / num_heads_k;
+     if (seqlenq_ngroups_swapped) {
+         q = q.reshape({batch_size, num_heads_k, ngroups, head_size}).transpose(1, 2);
+         seqlen_q = ngroups;
+         num_heads = num_heads_k;
+     }
+
+     CHECK_SHAPE(q, batch_size, seqlen_q, num_heads, head_size);
+     CHECK_SHAPE(k, batch_size, seqlen_k, num_heads_k, head_size);
+     CHECK_SHAPE(v, batch_size, seqlen_k, num_heads_k, head_size);
+
+     at::Tensor out;
+     if (out_.has_value()) {
+         out = out_.value();
+         TORCH_CHECK(out.dtype() == q_dtype, "Output must have the same dtype as inputs");
+         CHECK_DEVICE(out);
+         TORCH_CHECK(out.stride(-1) == 1, "Output tensor must have contiguous last dimension");
+         CHECK_SHAPE(out, batch_size, sizes[1], sizes[2], head_size);
+         if (seqlenq_ngroups_swapped) {
+             out = out.reshape({batch_size, num_heads_k, ngroups, head_size}).transpose(1, 2);
+         }
+     } else {
+         out = torch::empty_like(q);
+     }
+
+     auto round_multiple = [](int x, int m) { return (x + m - 1) / m * m; };
+     const int head_size_rounded = head_size <= 192 ? round_multiple(head_size, 32) : 256;
+     const int seqlen_q_rounded = round_multiple(seqlen_q, 128);
+     const int seqlen_k_rounded = round_multiple(seqlen_k, 128);
+
+     auto opts = q.options();
+
+     auto softmax_lse = torch::empty({batch_size, num_heads, seqlen_q}, opts.dtype(at::kFloat));
+     at::Tensor p;
+     // Only return softmax if there's dropout to reduce compilation time
+     if (return_softmax) {
+         TORCH_CHECK(p_dropout > 0.0f, "return_softmax is only supported when p_dropout > 0.0");
+         p = torch::empty({ batch_size, num_heads, seqlen_q_rounded, seqlen_k_rounded }, opts);
+     } else {
+         p = torch::empty({ 0 }, opts);
+     }
+
+     Flash_fwd_params params;
+     set_params_fprop(params,
+                      batch_size,
+                      seqlen_q, seqlen_k,
+                      seqlen_q_rounded, seqlen_k_rounded,
+                      num_heads, num_heads_k,
+                      head_size, head_size_rounded,
+                      q, k, v, out,
+                      /*cu_seqlens_q_d=*/nullptr,
+                      /*cu_seqlens_k_d=*/nullptr,
+                      /*seqused_k=*/nullptr,
+                      return_softmax ? p.data_ptr() : nullptr,
+                      softmax_lse.data_ptr(),
+                      p_dropout,
+                      softmax_scale,
+                      window_size_left,
+                      window_size_right,
+                      softcap
+                      );
+
+     // Keep references to these tensors to extend their lifetime
+     at::Tensor softmax_lse_accum, out_accum;
+     std::tie(softmax_lse_accum, out_accum) = set_params_splitkv(
+         params, batch_size, num_heads, head_size, seqlen_k, seqlen_q,
+         head_size_rounded, p_dropout, /*num_splits*/ 0, get_num_sm(get_current_device()), opts);
+
+     // number of times random will be generated per thread, to offset philox counter in thc random
+     // state
+     // We use a custom RNG that increases the offset by batch_size * nheads * 32.
+     int64_t counter_offset = params.b * params.h * 32;
+     auto options = torch::TensorOptions().dtype(torch::kFloat32).device(torch::kCUDA);
+     auto rng_state = torch::empty({2}, options.dtype(torch::kInt64));
+     // Forward kernel will populate memory with the seed and offset.
+     params.rng_state = reinterpret_cast<uint64_t*>(rng_state.data_ptr());
+
+     if (p_dropout > 0.0) {
+         auto gen = at::get_generator_or_default<at::CUDAGeneratorImpl>(
+             gen_, at::cuda::detail::getDefaultCUDAGenerator());
+         // See Note [Acquire lock when using random generators]
+         std::lock_guard<std::mutex> lock(gen->mutex_);
+         params.philox_args = gen->philox_cuda_state(counter_offset);
+     }
+
+     set_params_alibi(params, alibi_slopes_, batch_size, num_heads);
+
+     if (seqlen_k > 0) {
+         auto stream = at::cuda::getCurrentCUDAStream().stream();
+         run_mha_fwd(params, stream);
+     } else {
+         // If seqlen_k == 0, then we have an empty tensor. We need to set the output to 0.
+         out.zero_();
+         softmax_lse.fill_(std::numeric_limits<float>::infinity());
+     }
+
+     if (seqlenq_ngroups_swapped) {
+         out = out.transpose(1, 2).reshape({batch_size, 1, num_heads_k * seqlen_q, head_size});
+         q = q.transpose(1, 2).reshape({batch_size, 1, num_heads_k * seqlen_q, head_size});
+         softmax_lse = softmax_lse.reshape({batch_size, num_heads_k * seqlen_q, 1});
+     }
+     return {out, softmax_lse, p, rng_state};
+ }
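The `seqlenq_ngroups_swapped` branch is easiest to follow with concrete shapes. A small PyTorch sketch of the decode-time transpose (illustrative sizes; not part of this file):

```python
import torch

# Decode step with GQA: 32 query heads share 8 KV heads, seqlen_q == 1.
b, h, h_k, d = 2, 32, 8, 128
ngroups = h // h_k
q = torch.randn(b, 1, h, d)

# (b, 1, h_k * ngroups, d) -> (b, ngroups, h_k, d): the kernel then runs with
# seqlen_q = ngroups and num_heads = h_k, exposing more query rows per KV head.
q_swapped = q.reshape(b, h_k, ngroups, d).transpose(1, 2)
assert q_swapped.shape == (b, ngroups, h_k, d)

# mha_fwd undoes the swap on the way out, as in the code above.
out = q_swapped.transpose(1, 2).reshape(b, 1, h_k * ngroups, d)
assert out.shape == q.shape
```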
+
+ std::vector<at::Tensor>
+ mha_varlen_fwd(at::Tensor &q,        // total_q x num_heads x head_size, total_q := \sum_{i=0}^{b} s_i
+                const at::Tensor &k,  // total_k x num_heads_k x head_size, total_k := \sum_{i=0}^{b} s_i or num_blocks x page_block_size x num_heads_k x head_size if there's a block_table.
+                const at::Tensor &v,  // total_k x num_heads_k x head_size, total_k := \sum_{i=0}^{b} s_i or num_blocks x page_block_size x num_heads_k x head_size if there's a block_table.
+                std::optional<at::Tensor> &out_,  // total_q x num_heads x head_size, total_q := \sum_{i=0}^{b} s_i
+                const at::Tensor &cu_seqlens_q,   // b+1
+                const at::Tensor &cu_seqlens_k,   // b+1
+                std::optional<at::Tensor> &seqused_k,  // b. If given, only this many elements of each batch element's keys are used.
+                std::optional<const at::Tensor> &leftpad_k_,  // batch_size
+                std::optional<at::Tensor> &block_table_,      // batch_size x max_num_blocks_per_seq
+                std::optional<at::Tensor> &alibi_slopes_,     // num_heads or b x num_heads
+                int max_seqlen_q,
+                const int max_seqlen_k,
+                const float p_dropout,
+                const float softmax_scale,
+                const bool zero_tensors,
+                bool is_causal,
+                int window_size_left,
+                int window_size_right,
+                const float softcap,
+                const bool return_softmax,
+                std::optional<at::Generator> gen_) {
+
+     // Otherwise the kernel will be launched from cuda:0 device
+     at::cuda::CUDAGuard device_guard{q.device()};
+
+     auto [cc_major, cc_minor] = get_compute_capability(get_current_device());
+     bool is_sm8x_min = cc_major >= 8;
+     TORCH_CHECK(is_sm8x_min, "FlashAttention only supports Ampere GPUs or newer.");
+
+     auto q_dtype = q.dtype();
+     TORCH_CHECK(q_dtype == torch::kFloat16 || q_dtype == torch::kBFloat16,
+                 "FlashAttention only supports fp16 and bf16 data type");
+     TORCH_CHECK(k.dtype() == q_dtype, "query and key must have the same dtype");
+     TORCH_CHECK(v.dtype() == q_dtype, "query and value must have the same dtype");
+     TORCH_CHECK(cu_seqlens_q.dtype() == torch::kInt32, "cu_seqlens_q must have dtype int32");
+     TORCH_CHECK(cu_seqlens_k.dtype() == torch::kInt32, "cu_seqlens_k must have dtype int32");
+
+     CHECK_DEVICE(q); CHECK_DEVICE(k); CHECK_DEVICE(v);
+     CHECK_DEVICE(cu_seqlens_q);
+     CHECK_DEVICE(cu_seqlens_k);
+
+     at::Tensor block_table;
+     const bool paged_KV = block_table_.has_value();
+     if (paged_KV) {
+         block_table = block_table_.value();
+         CHECK_DEVICE(block_table);
+         TORCH_CHECK(block_table.dtype() == torch::kInt32, "block_table must have dtype torch.int32");
+         TORCH_CHECK(block_table.stride(-1) == 1, "block_table must have contiguous last dimension");
+     }
+
+     TORCH_CHECK(q.stride(-1) == 1, "Input tensor must have contiguous last dimension");
+     TORCH_CHECK(k.stride(-1) == 1, "Input tensor must have contiguous last dimension");
+     TORCH_CHECK(v.stride(-1) == 1, "Input tensor must have contiguous last dimension");
+     CHECK_CONTIGUOUS(cu_seqlens_q);
+     CHECK_CONTIGUOUS(cu_seqlens_k);
+
+     const auto sizes = q.sizes();
+
+     const int batch_size = cu_seqlens_q.numel() - 1;
+     int num_heads = sizes[1];
+     const int head_size = sizes[2];
+     const int num_heads_k = paged_KV ? k.size(2) : k.size(1);
+
+     if (softcap > 0.f) { TORCH_CHECK(p_dropout == 0.f, "Softcapping does not support dropout for now"); }
+
+     const int max_num_blocks_per_seq = !paged_KV ? 0 : block_table.size(1);
+     const int num_blocks = !paged_KV ? 0 : k.size(0);
+     const int page_block_size = !paged_KV ? 1 : k.size(1);
+     TORCH_CHECK(!paged_KV || page_block_size % 256 == 0, "Paged KV cache block size must be divisible by 256");
+
+     if (max_seqlen_q == 1 && !alibi_slopes_.has_value()) { is_causal = false; }  // causal=true is the same as causal=false in this case
+     if (is_causal) { window_size_right = 0; }
+
+     void *cu_seqlens_q_d = cu_seqlens_q.data_ptr();
+
+     // Faster to transpose q from (b, 1, (nheads_kv ngroups), d) to (b, ngroups, nheads_kv, d) in this case
+     // H/t Daniel Haziza
+     const int seqlenq_ngroups_swapped = max_seqlen_q == 1 && num_heads > num_heads_k && window_size_left < 0 && window_size_right < 0 && p_dropout == 0.f && head_size % 8 == 0 && !alibi_slopes_.has_value();
+     const int ngroups = num_heads / num_heads_k;
+     if (seqlenq_ngroups_swapped) {
+         q = q.reshape({batch_size, num_heads_k, ngroups, head_size}).transpose(1, 2).reshape({batch_size * ngroups, num_heads_k, head_size});
+         max_seqlen_q = ngroups;
+         num_heads = num_heads_k;
+         cu_seqlens_q_d = nullptr;
+     }
+
+     const int total_q = q.sizes()[0];
+
+     TORCH_CHECK(batch_size > 0, "batch size must be positive");
+     TORCH_CHECK(head_size <= 256, "FlashAttention forward only supports head dimension at most 256");
+     TORCH_CHECK(head_size % 8 == 0, "query, key, value, and out_ must have a head_size that is a multiple of 8");
+     TORCH_CHECK(num_heads % num_heads_k == 0, "Number of heads in key/value must divide number of heads in query");
+
+     if (window_size_left >= max_seqlen_k) { window_size_left = -1; }
+     if (window_size_right >= max_seqlen_k) { window_size_right = -1; }
+
+     CHECK_SHAPE(q, total_q, num_heads, head_size);
+     if (!paged_KV) {
+         const int total_k = k.size(0);
+         CHECK_SHAPE(k, total_k, num_heads_k, head_size);
+         CHECK_SHAPE(v, total_k, num_heads_k, head_size);
+     } else {
+         CHECK_SHAPE(k, num_blocks, page_block_size, num_heads_k, head_size);
+         CHECK_SHAPE(v, num_blocks, page_block_size, num_heads_k, head_size);
+         CHECK_SHAPE(block_table, batch_size, max_num_blocks_per_seq);
+     }
+
+     CHECK_SHAPE(cu_seqlens_q, batch_size + 1);
+     CHECK_SHAPE(cu_seqlens_k, batch_size + 1);
+     if (seqused_k.has_value()) {
+         auto seqused_k_ = seqused_k.value();
+         TORCH_CHECK(seqused_k_.dtype() == torch::kInt32, "seqused_k must have dtype int32");
+         TORCH_CHECK(seqused_k_.is_cuda(), "seqused_k must be on CUDA device");
+         TORCH_CHECK(seqused_k_.is_contiguous(), "seqused_k must be contiguous");
+         CHECK_SHAPE(seqused_k_, batch_size);
+     }
+
+     at::Tensor out;
+     if (out_.has_value()) {
+         out = out_.value();
+         TORCH_CHECK(out.dtype() == q_dtype, "Output must have the same dtype as inputs");
+         CHECK_DEVICE(out);
+         TORCH_CHECK(out.stride(-1) == 1, "Output tensor must have contiguous last dimension");
+         CHECK_SHAPE(out, sizes[0], sizes[1], head_size);
+         if (seqlenq_ngroups_swapped) {
+             out = out.reshape({batch_size, num_heads_k, ngroups, head_size}).transpose(1, 2).reshape({batch_size * ngroups, num_heads_k, head_size});
+         }
+     } else {
+         out = torch::empty_like(q);
+     }
+
+     auto round_multiple = [](int x, int m) { return (x + m - 1) / m * m; };
+     const int head_size_rounded = head_size <= 192 ? round_multiple(head_size, 32) : 256;
+     const int seqlen_q_rounded = round_multiple(max_seqlen_q, 128);
+     const int seqlen_k_rounded = round_multiple(max_seqlen_k, 128);
+
+     auto opts = q.options();
+     auto softmax_lse = torch::empty({num_heads, total_q}, opts.dtype(at::kFloat));
+     at::Tensor p;
+     // Only return softmax if there's dropout to reduce compilation time
+     if (return_softmax) {
+         TORCH_CHECK(p_dropout > 0.0f, "return_softmax is only supported when p_dropout > 0.0");
+         p = torch::empty({ batch_size, num_heads, seqlen_q_rounded, seqlen_k_rounded }, opts);
+     } else {
+         p = torch::empty({ 0 }, opts);
+     }
+
+     if (zero_tensors) {
+         out.zero_();
+         softmax_lse.fill_(-std::numeric_limits<float>::infinity());
+         if (return_softmax) { p.zero_(); }
+     }
+
+     Flash_fwd_params params;
+     set_params_fprop(params,
+                      batch_size,
+                      max_seqlen_q, max_seqlen_k,
+                      seqlen_q_rounded, seqlen_k_rounded,
+                      num_heads, num_heads_k,
+                      head_size, head_size_rounded,
+                      q, k, v, out,
+                      cu_seqlens_q_d,
+                      cu_seqlens_k.data_ptr(),
+                      seqused_k.has_value() ? seqused_k.value().data_ptr() : nullptr,
+                      return_softmax ? p.data_ptr() : nullptr,
+                      softmax_lse.data_ptr(),
+                      p_dropout,
+                      softmax_scale,
+                      window_size_left,
+                      window_size_right,
+                      softcap,
+                      seqlenq_ngroups_swapped,
+                      /*unpadded_lse*/true);
+     params.total_q = total_q;
+
+     if (paged_KV) {
+         params.block_table = block_table.data_ptr<int>();
+         params.block_table_batch_stride = block_table.stride(0);
+         params.k_batch_stride = k.stride(0);
+         params.v_batch_stride = v.stride(0);
+     }
+     params.page_block_size = page_block_size;
+     // Keep references to these tensors to extend their lifetime
+     at::Tensor softmax_lse_accum, out_accum;
+     if (seqlenq_ngroups_swapped) {
+         // Only apply split-k for decoding
+         std::tie(softmax_lse_accum, out_accum) =
+             set_params_splitkv(params, batch_size, num_heads, head_size,
+                                max_seqlen_k, max_seqlen_q, head_size_rounded,
+                                p_dropout, /*num_splits*/ 0, get_num_sm(get_current_device()), opts);
+     }
+
+     if (leftpad_k_.has_value()) {
+         auto leftpad_k = leftpad_k_.value();
+         TORCH_CHECK(!paged_KV, "We don't support Paged KV and leftpad_k running at the same time yet");
+         TORCH_CHECK(leftpad_k.dtype() == torch::kInt32, "leftpad_k must have dtype int32");
+         CHECK_DEVICE(leftpad_k);
+         CHECK_CONTIGUOUS(leftpad_k);
+         CHECK_SHAPE(leftpad_k, batch_size);
+         params.leftpad_k = static_cast<int *>(leftpad_k.data_ptr());
+     }
+
+     // number of times random will be generated per thread, to offset philox counter in thc random
+     // state
+     // We use a custom RNG that increases the offset by batch_size * nheads * 32.
+     int64_t counter_offset = params.b * params.h * 32;
+     auto options = torch::TensorOptions().dtype(torch::kFloat32).device(torch::kCUDA);
+     auto rng_state = torch::empty({2}, options.dtype(torch::kInt64));
+     // Forward kernel will populate memory with the seed and offset.
+     params.rng_state = reinterpret_cast<uint64_t*>(rng_state.data_ptr());
+
+     if (p_dropout > 0.0) {
+         auto gen = at::get_generator_or_default<at::CUDAGeneratorImpl>(
+             gen_, at::cuda::detail::getDefaultCUDAGenerator());
+         // See Note [Acquire lock when using random generators]
+         std::lock_guard<std::mutex> lock(gen->mutex_);
+         params.philox_args = gen->philox_cuda_state(counter_offset);
+     }
+
+     set_params_alibi(params, alibi_slopes_, batch_size, num_heads);
+
+     if (max_seqlen_k > 0) {
+         auto stream = at::cuda::getCurrentCUDAStream().stream();
+         run_mha_fwd(params, stream, paged_KV);
+     } else {
+         // If seqlen_k == 0, then we have an empty tensor. We need to set the output to 0.
+         out.zero_();
+         softmax_lse.fill_(std::numeric_limits<float>::infinity());
+     }
+
+     if (seqlenq_ngroups_swapped) {
+         int64_t size_before[] = {batch_size, max_seqlen_q, num_heads_k, head_size};
+         int64_t size_after[] = {batch_size, num_heads_k * max_seqlen_q, head_size};
+         out = out.reshape(size_before).transpose(1, 2).reshape(size_after);
+         q = q.reshape(size_before).transpose(1, 2).reshape(size_after);
+         softmax_lse = softmax_lse.reshape({num_heads * max_seqlen_q, batch_size});
+     }
+
+     return {out, softmax_lse, p, rng_state};
+ }
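`mha_varlen_fwd` consumes sequences packed back-to-back with no padding; `cu_seqlens_q` / `cu_seqlens_k` are exclusive prefix sums with b+1 entries. A minimal sketch of the packing convention (illustrative values):

```python
import torch

seqlens = [5, 3, 7]                                          # batch of 3 sequences
cu_seqlens = torch.tensor([0, 5, 8, 15], dtype=torch.int32)  # b + 1 boundaries
total_q = int(cu_seqlens[-1])                                # 15 tokens in total

nheads, headdim = 8, 64
q = torch.randn(total_q, nheads, headdim)                    # total_q x num_heads x head_size

# Tokens of sequence i occupy rows cu_seqlens[i] : cu_seqlens[i + 1].
q_seq1 = q[cu_seqlens[1]:cu_seqlens[2]]
assert q_seq1.shape[0] == seqlens[1]
```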
+
+ void run_mha_bwd(Flash_bwd_params &params, cudaStream_t stream) {
+     FP16_SWITCH(!params.is_bf16, [&] {
+         HEADDIM_SWITCH(params.d, [&] {
+             BOOL_SWITCH(params.is_causal, Is_causal, [&] {
+                 run_mha_bwd_<elem_type, kHeadDim, Is_causal>(params, stream);
+             });
+         });
+     });
+ }
+
+ std::vector<at::Tensor>
+ mha_bwd(const at::Tensor &dout,  // batch_size x seqlen_q x num_heads x multiple_of(head_size_og, 8)
+         const at::Tensor &q,     // batch_size x seqlen_q x num_heads x head_size
+         const at::Tensor &k,     // batch_size x seqlen_k x num_heads_k x head_size
+         const at::Tensor &v,     // batch_size x seqlen_k x num_heads_k x head_size
+         const at::Tensor &out,   // batch_size x seqlen_q x num_heads x head_size
+         const at::Tensor &softmax_lse,   // b x h x seqlen_q
+         std::optional<at::Tensor> &dq_,  // batch_size x seqlen_q x num_heads x head_size
+         std::optional<at::Tensor> &dk_,  // batch_size x seqlen_k x num_heads_k x head_size
+         std::optional<at::Tensor> &dv_,  // batch_size x seqlen_k x num_heads_k x head_size
+         std::optional<at::Tensor> &alibi_slopes_,  // num_heads or batch_size x num_heads
+         const float p_dropout,  // probability to drop
+         const float softmax_scale,
+         const bool is_causal,
+         int window_size_left,
+         int window_size_right,
+         const float softcap,
+         const bool deterministic,
+         std::optional<at::Generator> gen_,
+         std::optional<at::Tensor> &rng_state) {
+
+     #ifdef FLASHATTENTION_DISABLE_BACKWARD
+     TORCH_CHECK(false, "This flash attention build does not support backward.");
+     #endif
+     if (is_causal) { window_size_right = 0; }
+
+     // Otherwise the kernel will be launched from cuda:0 device
+     at::cuda::CUDAGuard device_guard{q.device()};
+
+     auto [cc_major, cc_minor] = get_compute_capability(get_current_device());
+     bool is_sm8x_min = cc_major >= 8;
+     TORCH_CHECK(is_sm8x_min, "FlashAttention only supports Ampere GPUs or newer.");
+
+     bool is_dropout = p_dropout > 0.0;
+     auto stream = at::cuda::getCurrentCUDAStream().stream();
+
+     auto q_dtype = q.dtype();
+     TORCH_CHECK(q_dtype == torch::kFloat16 || q_dtype == torch::kBFloat16,
+                 "FlashAttention only supports fp16 and bf16 data type");
+     TORCH_CHECK(k.dtype() == q_dtype, "query and key must have the same dtype");
+     TORCH_CHECK(v.dtype() == q_dtype, "query and value must have the same dtype");
+     TORCH_CHECK(out.dtype() == q_dtype, "query and out must have the same dtype");
+     TORCH_CHECK(dout.dtype() == q_dtype, "query and dout must have the same dtype");
+
+     CHECK_DEVICE(q); CHECK_DEVICE(k); CHECK_DEVICE(v);
+     CHECK_DEVICE(out); CHECK_DEVICE(dout); CHECK_DEVICE(softmax_lse);
+
+     TORCH_CHECK(q.stride(-1) == 1, "Input tensor must have contiguous last dimension");
+     TORCH_CHECK(k.stride(-1) == 1, "Input tensor must have contiguous last dimension");
+     TORCH_CHECK(v.stride(-1) == 1, "Input tensor must have contiguous last dimension");
+     TORCH_CHECK(out.stride(-1) == 1, "out tensor must have contiguous last dimension");
+     TORCH_CHECK(dout.stride(-1) == 1, "dout tensor must have contiguous last dimension");
+
+     const auto sizes = q.sizes();
+
+     const int batch_size = sizes[0];
+     const int seqlen_q = sizes[1];
+     const int num_heads = sizes[2];
+     const int head_size = sizes[3];
+     const int seqlen_k = k.size(1);
+     const int num_heads_k = k.size(2);
+     TORCH_CHECK(batch_size > 0, "batch size must be positive");
+     TORCH_CHECK(head_size % 8 == 0, "head_size should be a multiple of 8");
+     TORCH_CHECK(head_size <= 256, "FlashAttention backward only supports head dimension at most 256");
+     TORCH_CHECK(num_heads % num_heads_k == 0, "Number of heads in key/value must divide number of heads in query");
+
+     auto round_multiple = [](int x, int m) { return (x + m - 1) / m * m; };
+     const int head_size_rounded = head_size <= 192 ? round_multiple(head_size, 32) : 256;
+     const int seqlen_q_rounded = round_multiple(seqlen_q, 128);
+     const int seqlen_k_rounded = round_multiple(seqlen_k, 128);
+
+     if (softcap > 0.f) { TORCH_CHECK(p_dropout == 0.f, "Softcapping does not support dropout for now"); }
+
+     if (window_size_left >= seqlen_k) { window_size_left = -1; }
+     if (window_size_right >= seqlen_k) { window_size_right = -1; }
+
+     CHECK_SHAPE(q, batch_size, seqlen_q, num_heads, head_size);
+     CHECK_SHAPE(k, batch_size, seqlen_k, num_heads_k, head_size);
+     CHECK_SHAPE(v, batch_size, seqlen_k, num_heads_k, head_size);
+     CHECK_SHAPE(out, batch_size, seqlen_q, num_heads, head_size);
+     CHECK_SHAPE(dout, batch_size, seqlen_q, num_heads, head_size);
+
+     at::Tensor dq, dk, dv;
+     if (dq_.has_value()) {
+         dq = dq_.value();
+         TORCH_CHECK(dq.dtype() == q_dtype, "dq must have the same dtype as q");
+         CHECK_DEVICE(dq);
+         TORCH_CHECK(dq.stride(-1) == 1, "dq must have contiguous last dimension");
+         CHECK_SHAPE(dq, batch_size, seqlen_q, num_heads, head_size);
+     } else {
+         dq = torch::empty_like(q);
+     }
+     if (dk_.has_value()) {
+         dk = dk_.value();
+         TORCH_CHECK(dk.dtype() == q_dtype, "dk must have the same dtype as q");
+         CHECK_DEVICE(dk);
+         TORCH_CHECK(dk.stride(-1) == 1, "dk must have contiguous last dimension");
+         CHECK_SHAPE(dk, batch_size, seqlen_k, num_heads_k, head_size);
+     } else {
+         dk = torch::empty_like(k);
+     }
+     if (dv_.has_value()) {
+         dv = dv_.value();
+         TORCH_CHECK(dv.dtype() == q_dtype, "dv must have the same dtype as q");
+         CHECK_DEVICE(dv);
+         TORCH_CHECK(dv.stride(-1) == 1, "dv must have contiguous last dimension");
+         CHECK_SHAPE(dv, batch_size, seqlen_k, num_heads_k, head_size);
+     } else {
+         dv = torch::empty_like(v);
+     }
+
+     // bool loop = seqlen_k > blocksize_c;
+     // TODO: change later, for now set to true for simplicity
+     bool loop = true;
+
+     auto opts = q.options();
+     auto softmax_d = torch::empty({batch_size, num_heads, seqlen_q_rounded}, opts.dtype(at::kFloat));
+     at::Tensor dq_accum;
+     at::Tensor dk_accum, dv_accum;
+     if (loop) {
+         if (!deterministic) {
+             dq_accum = torch::empty({batch_size, seqlen_q_rounded, num_heads, head_size_rounded}, opts.dtype(at::kFloat));
+         } else {
+             const int nsplits = (get_num_sm(get_current_device()) + batch_size * num_heads - 1) / (batch_size * num_heads);
+             dq_accum = torch::zeros({nsplits, batch_size, seqlen_q_rounded, num_heads, head_size_rounded}, opts.dtype(at::kFloat));
+         }
+         // dk_accum = torch::empty({batch_size, num_heads_k, seqlen_k_rounded, head_size_rounded}, opts.dtype(at::kFloat));
+         // dv_accum = torch::empty({batch_size, num_heads_k, seqlen_k_rounded, head_size_rounded}, opts.dtype(at::kFloat));
+     }
+
+     at::Tensor dk_expanded, dv_expanded;
+     if (num_heads_k != num_heads) {  // MQA / GQA
+         dk_expanded = torch::empty({batch_size, seqlen_k, num_heads, head_size}, opts);
+         dv_expanded = torch::empty({batch_size, seqlen_k, num_heads, head_size}, opts);
+     } else {
+         dk_expanded = dk;
+         dv_expanded = dv;
+     }
+
+     Flash_bwd_params params;
+
+     set_params_dgrad(params,
+                      batch_size,
+                      seqlen_q, seqlen_k,
+                      seqlen_q_rounded, seqlen_k_rounded,
+                      num_heads, num_heads_k,
+                      head_size, head_size_rounded,
+                      q, k, v, out,
+                      dout, dq, dk_expanded, dv_expanded,
+                      nullptr,
+                      nullptr,
+                      loop ? dq_accum.data_ptr() : nullptr,
+                      // loop ? dk_accum.data_ptr() : nullptr,
+                      // loop ? dv_accum.data_ptr() : nullptr,
+                      nullptr,
+                      nullptr,
+                      softmax_lse.data_ptr(),
+                      softmax_d.data_ptr(),
+                      p_dropout,
+                      softmax_scale,
+                      window_size_left,
+                      window_size_right,
+                      softcap,
+                      deterministic,
+                      /*unpadded_lse*/false);
+     params.dq_accum_split_stride = !deterministic ? 0 : dq_accum.stride(0);
+
+     auto launch = &run_mha_bwd;
+
+     auto gen = at::get_generator_or_default<at::CUDAGeneratorImpl>(
+         gen_, at::cuda::detail::getDefaultCUDAGenerator());
+
+     // We use a custom RNG that increases the offset by batch_size * nheads * 32.
+     int64_t counter_offset = params.b * params.h * 32;
+
+     if (rng_state.has_value()) {
+         params.rng_state = reinterpret_cast<uint64_t*>(rng_state.value().data_ptr());
+     } else if (is_dropout) {
+         // See Note [Acquire lock when using random generators]
+         std::lock_guard<std::mutex> lock(gen->mutex_);
+         params.philox_args = gen->philox_cuda_state(counter_offset);
+         auto seeds = at::cuda::philox::unpack(params.philox_args);
+         params.rng_state[0] = std::get<0>(seeds);
+         params.rng_state[1] = std::get<1>(seeds);
+     }
+
+     set_params_alibi(params, alibi_slopes_, batch_size, num_heads);
+
+     if (seqlen_q > 0) {
+         launch(params, stream);
+     } else {
+         // If seqlen_q == 0, then we have an empty tensor. We need to set the output to 0.
+         dk_expanded.zero_();
+         dv_expanded.zero_();
+         softmax_d.zero_();
+     }
+
+     // For MQA/GQA we need to sum dK and dV across the groups
+     if (num_heads_k != num_heads) {
+         at::sum_out(dk, at::reshape(dk_expanded, {batch_size, seqlen_k, num_heads_k, num_heads / num_heads_k, head_size}), {3});
+         at::sum_out(dv, at::reshape(dv_expanded, {batch_size, seqlen_k, num_heads_k, num_heads / num_heads_k, head_size}), {3});
+     }
+
+     return { dq, dk, dv, softmax_d };
+ }
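The MQA/GQA epilogue above sums the expanded dK/dV over the group of query heads that share each KV head. The `at::sum_out` calls are equivalent to this PyTorch sketch (made-up sizes):

```python
import torch

b, seqlen_k, h, h_k, d = 2, 128, 32, 8, 64
dk_expanded = torch.randn(b, seqlen_k, h, d)  # one gradient slice per query head

# Mirror of at::sum_out(dk, at::reshape(..., {b, seqlen_k, h_k, h/h_k, d}), {3}):
dk = dk_expanded.reshape(b, seqlen_k, h_k, h // h_k, d).sum(dim=3)
assert dk.shape == (b, seqlen_k, h_k, d)
```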
+
+ std::vector<at::Tensor>
+ mha_varlen_bwd(const at::Tensor &dout,  // total_q x num_heads x head_size
+                const at::Tensor &q,     // total_q x num_heads x head_size, total_q := \sum_{i=0}^{b} s_i
+                const at::Tensor &k,     // total_k x num_heads_k x head_size, total_k := \sum_{i=0}^{b} s_i
+                const at::Tensor &v,     // total_k x num_heads_k x head_size, total_k := \sum_{i=0}^{b} s_i
+                const at::Tensor &out,   // total_q x num_heads x head_size
+                const at::Tensor &softmax_lse,   // h x total_q, softmax logsumexp
+                std::optional<at::Tensor> &dq_,  // total_q x num_heads x head_size, total_q := \sum_{i=0}^{b} s_i
+                std::optional<at::Tensor> &dk_,  // total_k x num_heads_k x head_size, total_k := \sum_{i=0}^{b} s_i
+                std::optional<at::Tensor> &dv_,  // total_k x num_heads_k x head_size, total_k := \sum_{i=0}^{b} s_i
+                const at::Tensor &cu_seqlens_q,  // b+1
+                const at::Tensor &cu_seqlens_k,  // b+1
+                std::optional<at::Tensor> &alibi_slopes_,  // num_heads or b x num_heads
+                const int max_seqlen_q,
+                const int max_seqlen_k,  // max sequence length to choose the kernel
+                const float p_dropout,   // probability to drop
+                const float softmax_scale,
+                const bool zero_tensors,
+                const bool is_causal,
+                int window_size_left,
+                int window_size_right,
+                const float softcap,
+                const bool deterministic,
+                std::optional<at::Generator> gen_,
+                std::optional<at::Tensor> &rng_state) {
+
+     #ifdef FLASHATTENTION_DISABLE_BACKWARD
+     TORCH_CHECK(false, "This flash attention build does not support backward.");
+     #endif
+     if (is_causal) { window_size_right = 0; }
+
+     // Otherwise the kernel will be launched from cuda:0 device
+     at::cuda::CUDAGuard device_guard{q.device()};
+
+     auto [cc_major, cc_minor] = get_compute_capability(get_current_device());
+     bool is_sm8x_min = cc_major >= 8;
+     TORCH_CHECK(is_sm8x_min, "FlashAttention only supports Ampere GPUs or newer.");
+
+     bool is_dropout = p_dropout > 0.0;
+     auto stream = at::cuda::getCurrentCUDAStream().stream();
+
+     auto q_dtype = q.dtype();
+     TORCH_CHECK(q_dtype == torch::kFloat16 || q_dtype == torch::kBFloat16,
+                 "FlashAttention only supports fp16 and bf16 data type");
+     TORCH_CHECK(k.dtype() == q_dtype, "query and key must have the same dtype");
+     TORCH_CHECK(v.dtype() == q_dtype, "query and value must have the same dtype");
+     TORCH_CHECK(out.dtype() == q_dtype, "query and out must have the same dtype");
+     TORCH_CHECK(dout.dtype() == q_dtype, "query and dout must have the same dtype");
+     TORCH_CHECK(cu_seqlens_q.dtype() == torch::kInt32, "cu_seqlens_q must have dtype int32");
+     TORCH_CHECK(cu_seqlens_k.dtype() == torch::kInt32, "cu_seqlens_k must have dtype int32");
+
+     CHECK_DEVICE(q); CHECK_DEVICE(k); CHECK_DEVICE(v);
+     CHECK_DEVICE(out); CHECK_DEVICE(dout); CHECK_DEVICE(softmax_lse);
+     CHECK_DEVICE(cu_seqlens_q); CHECK_DEVICE(cu_seqlens_k);
+
+     TORCH_CHECK(q.stride(-1) == 1, "Input tensor must have contiguous last dimension");
+     TORCH_CHECK(k.stride(-1) == 1, "Input tensor must have contiguous last dimension");
+     TORCH_CHECK(v.stride(-1) == 1, "Input tensor must have contiguous last dimension");
+     TORCH_CHECK(out.stride(-1) == 1, "out tensor must have contiguous last dimension");
+     TORCH_CHECK(dout.stride(-1) == 1, "dout tensor must have contiguous last dimension");
+     CHECK_CONTIGUOUS(cu_seqlens_q);
+     CHECK_CONTIGUOUS(cu_seqlens_k);
+
+     const auto sizes = q.sizes();
+
+     const int total_q = sizes[0];
+     const int batch_size = cu_seqlens_q.numel() - 1;
+     const int num_heads = sizes[1];
+     const int head_size = sizes[2];
+     const int total_k = k.size(0);
+     const int num_heads_k = k.size(1);
+     TORCH_CHECK(batch_size > 0, "batch size must be positive");
+     TORCH_CHECK(head_size % 8 == 0, "head_size should be a multiple of 8");
+     TORCH_CHECK(head_size <= 256, "FlashAttention backward only supports head dimension at most 256");
+     TORCH_CHECK(num_heads % num_heads_k == 0, "Number of heads in key/value must divide number of heads in query");
+     if (softcap > 0.f) { TORCH_CHECK(p_dropout == 0.f, "Softcapping does not support dropout for now"); }
+
+     auto round_multiple = [](int x, int m) { return (x + m - 1) / m * m; };
+     const int head_size_rounded = head_size <= 192 ? round_multiple(head_size, 32) : 256;
+     const int seqlen_q_rounded = round_multiple(max_seqlen_q, 128);
+     const int seqlen_k_rounded = round_multiple(max_seqlen_k, 128);
+
+     if (window_size_left >= max_seqlen_k) { window_size_left = -1; }
+     if (window_size_right >= max_seqlen_k) { window_size_right = -1; }
+
+     CHECK_SHAPE(q, total_q, num_heads, head_size);
+     CHECK_SHAPE(k, total_k, num_heads_k, head_size);
+     CHECK_SHAPE(v, total_k, num_heads_k, head_size);
+     CHECK_SHAPE(out, total_q, num_heads, head_size);
+     CHECK_SHAPE(dout, total_q, num_heads, head_size);
+     CHECK_SHAPE(cu_seqlens_q, batch_size + 1);
+     CHECK_SHAPE(cu_seqlens_k, batch_size + 1);
+
+     at::Tensor dq, dk, dv;
+     if (dq_.has_value()) {
+         dq = dq_.value();
+         TORCH_CHECK(dq.dtype() == q_dtype, "dq must have the same dtype as q");
+         CHECK_DEVICE(dq);
+         TORCH_CHECK(dq.stride(-1) == 1, "dq must have contiguous last dimension");
+         CHECK_SHAPE(dq, total_q, num_heads, head_size);
+     } else {
+         dq = torch::empty_like(q);
+     }
+     if (dk_.has_value()) {
+         dk = dk_.value();
+         TORCH_CHECK(dk.dtype() == q_dtype, "dk must have the same dtype as q");
+         CHECK_DEVICE(dk);
+         TORCH_CHECK(dk.stride(-1) == 1, "dk must have contiguous last dimension");
+         CHECK_SHAPE(dk, total_k, num_heads_k, head_size);
+     } else {
+         dk = torch::empty_like(k);
+     }
+     if (dv_.has_value()) {
+         dv = dv_.value();
+         TORCH_CHECK(dv.dtype() == q_dtype, "dv must have the same dtype as q");
+         CHECK_DEVICE(dv);
+         TORCH_CHECK(dv.stride(-1) == 1, "dv must have contiguous last dimension");
+         CHECK_SHAPE(dv, total_k, num_heads_k, head_size);
+     } else {
+         dv = torch::empty_like(v);
+     }
+
+     // bool loop = max_seqlen_k > blocksize_c;
+     // TODO: change later, for now set to true for simplicity
+     bool loop = true;
+
+     auto opts = q.options();
+     auto softmax_d = torch::empty({num_heads, total_q + 128 * batch_size}, opts.dtype(at::kFloat));
+     at::Tensor dq_accum;
+     if (loop) {
+         // We don't want to allocate dq_accum of size (batch, seqlen_q_rounded, num_heads, head_size_rounded)
+         // because that would be too large if there is a very long sequence and the rest of the sequences are short.
+         // Instead, we allocate dq_accum of size (total_q + 128 * batch, num_heads, head_size_rounded).
+         // Note that 128 is the max block size on the seqlen_q dimension.
+         // For dQ, the i-th sequence is stored in indices from cu_seqlens[i] + 128 * i to
+         // cu_seqlens[i + 1] + 128 * i - 1. This ensures that the i-th sequence and (i + 1)-th sequence will
+         // be at least 128 apart. It's ok for us to do atomicAdds up to 128 rows beyond what we're normally
+         // allowed to do. So we won't have to do any bound checking, and performance should stay the same.
+         // Same holds for softmax_d, since LSE is stored in unpadded format.
+         if (!deterministic) {
+             dq_accum = torch::empty({total_q + 128 * batch_size, num_heads, head_size_rounded}, opts.dtype(at::kFloat));
+         } else {
+             const int nsplits = (get_num_sm(get_current_device()) + batch_size * num_heads - 1) / (batch_size * num_heads);
+             dq_accum = torch::zeros({nsplits, total_q + 128 * batch_size, num_heads, head_size_rounded}, opts.dtype(at::kFloat));
+         }
+     }
+
+     at::Tensor dk_expanded, dv_expanded;
+     if (num_heads_k != num_heads) {  // MQA / GQA
+         dk_expanded = torch::empty({total_k, num_heads, head_size}, opts);
+         dv_expanded = torch::empty({total_k, num_heads, head_size}, opts);
+     } else {
+         dk_expanded = dk;
+         dv_expanded = dv;
+     }
+
+     if (zero_tensors) {
+         dq.zero_();
+         dk_expanded.zero_();
+         dv_expanded.zero_();
+         softmax_d.zero_();
+     }
+
+     Flash_bwd_params params;
+
+     set_params_dgrad(params,
+                      batch_size,
+                      max_seqlen_q, max_seqlen_k,
+                      seqlen_q_rounded, seqlen_k_rounded,
+                      num_heads, num_heads_k,
+                      head_size, head_size_rounded,
+                      q, k, v, out,
+                      dout, dq, dk_expanded, dv_expanded,
+                      cu_seqlens_q.data_ptr(),
+                      cu_seqlens_k.data_ptr(),
+                      loop ? dq_accum.data_ptr() : nullptr,
+                      nullptr,
+                      nullptr,
+                      softmax_lse.data_ptr(),
+                      softmax_d.data_ptr(),
+                      p_dropout,
+                      softmax_scale,
+                      window_size_left,
+                      window_size_right,
+                      softcap,
+                      deterministic,
+                      /*unpadded_lse*/true);
+     params.dq_accum_split_stride = !deterministic ? 0 : dq_accum.stride(0);
+     params.total_q = total_q;
+
+     auto launch = &run_mha_bwd;
+
+     auto gen = at::get_generator_or_default<at::CUDAGeneratorImpl>(
+         gen_, at::cuda::detail::getDefaultCUDAGenerator());
+
+     // We use a custom RNG that increases the offset by batch_size * nheads * 32.
+     int64_t counter_offset = params.b * params.h * 32;
+
+     if (rng_state.has_value()) {
+         params.rng_state = reinterpret_cast<uint64_t*>(rng_state.value().data_ptr());
+     } else if (is_dropout) {
+         // See Note [Acquire lock when using random generators]
+         std::lock_guard<std::mutex> lock(gen->mutex_);
+         params.philox_args = gen->philox_cuda_state(counter_offset);
+         auto seeds = at::cuda::philox::unpack(params.philox_args);
+         params.rng_state[0] = std::get<0>(seeds);
+         params.rng_state[1] = std::get<1>(seeds);
+     }
+
+     set_params_alibi(params, alibi_slopes_, batch_size, num_heads);
+
+     if (max_seqlen_q > 0) {
+         launch(params, stream);
+     } else {
+         // If max_seqlen_q == 0, then we have an empty tensor. We need to set the output to 0.
+         dk_expanded.zero_();
+         dv_expanded.zero_();
+         softmax_d.zero_();
+     }
+
+     // For MQA/GQA we need to sum dK and dV across the groups
+     if (num_heads_k != num_heads) {
+         at::sum_out(dk, at::reshape(dk_expanded, {total_k, num_heads_k, num_heads / num_heads_k, head_size}), {2});
+         at::sum_out(dv, at::reshape(dv_expanded, {total_k, num_heads_k, num_heads / num_heads_k, head_size}), {2});
+     }
+
+     return { dq, dk, dv, softmax_d };
+ }
1201
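
In the MQA/GQA branch above, query head h maps to KV head h / ngroups with ngroups = num_heads / num_heads_k, and the at::sum_out calls collapse each group of query-head gradients onto its shared KV head. A plain-loop sketch of the same reduction (hypothetical standalone function, fp32 and row-major layout for simplicity):

    // Sketch: collapse dk_expanded[t][h][d] (h = query head) into dk[t][hk][d]
    // (hk = KV head). Query head h belongs to KV head hk = h / ngroups.
    void reduce_dk_groups(const float* dk_expanded, float* dk,
                          int total_k, int num_heads, int num_heads_k, int head_size) {
        const int ngroups = num_heads / num_heads_k;
        for (int t = 0; t < total_k; ++t)
            for (int hk = 0; hk < num_heads_k; ++hk)
                for (int d = 0; d < head_size; ++d) {
                    float acc = 0.f;
                    for (int g = 0; g < ngroups; ++g) {
                        int h = hk * ngroups + g;  // query heads sharing this KV head
                        acc += dk_expanded[(t * num_heads + h) * head_size + d];
                    }
                    dk[(t * num_heads_k + hk) * head_size + d] = acc;
                }
    }
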
+
+ std::vector<at::Tensor>
+ mha_fwd_kvcache(at::Tensor &q, // batch_size x seqlen_q x num_heads x head_size
+ const at::Tensor &kcache, // batch_size_c x seqlen_k x num_heads_k x head_size, or num_blocks x page_block_size x num_heads_k x head_size if there's a block_table.
+ const at::Tensor &vcache, // batch_size_c x seqlen_k x num_heads_k x head_size, or num_blocks x page_block_size x num_heads_k x head_size if there's a block_table.
+ std::optional<const at::Tensor> &k_, // batch_size x seqlen_knew x num_heads_k x head_size
+ std::optional<const at::Tensor> &v_, // batch_size x seqlen_knew x num_heads_k x head_size
+ std::optional<const at::Tensor> &seqlens_k_, // batch_size
+ std::optional<const at::Tensor> &rotary_cos_, // seqlen_ro x (rotary_dim / 2)
+ std::optional<const at::Tensor> &rotary_sin_, // seqlen_ro x (rotary_dim / 2)
+ std::optional<const at::Tensor> &cache_batch_idx_, // indices to index into the KV cache
+ std::optional<const at::Tensor> &leftpad_k_, // batch_size
+ std::optional<at::Tensor> &block_table_, // batch_size x max_num_blocks_per_seq
+ std::optional<at::Tensor> &alibi_slopes_, // num_heads or batch_size x num_heads
+ std::optional<at::Tensor> &out_, // batch_size x seqlen_q x num_heads x head_size
+ const float softmax_scale,
+ bool is_causal,
+ int window_size_left,
+ int window_size_right,
+ const float softcap,
+ bool is_rotary_interleaved, // if true, rotary combines indices 0 & 1, else indices 0 & rotary_dim / 2
+ int num_splits
+ ) {
+
+ // Otherwise the kernel will be launched from the cuda:0 device.
+ at::cuda::CUDAGuard device_guard{q.device()};
+
+ auto [cc_major, cc_minor] = get_compute_capability(get_current_device());
+ bool is_sm8x_min = cc_major >= 8;
+ TORCH_CHECK(is_sm8x_min, "FlashAttention only supports Ampere GPUs or newer.");
+
+ auto q_dtype = q.dtype();
+ TORCH_CHECK(q_dtype == torch::kFloat16 || q_dtype == torch::kBFloat16,
+ "FlashAttention only supports fp16 and bf16 data types");
+ TORCH_CHECK(kcache.dtype() == q_dtype, "query and key must have the same dtype");
+ TORCH_CHECK(vcache.dtype() == q_dtype, "query and value must have the same dtype");
+
+ CHECK_DEVICE(q); CHECK_DEVICE(kcache); CHECK_DEVICE(vcache);
+
+ TORCH_CHECK(q.stride(-1) == 1, "Input tensor must have contiguous last dimension");
+ TORCH_CHECK(kcache.stride(-1) == 1, "Input tensor must have contiguous last dimension");
+ TORCH_CHECK(vcache.stride(-1) == 1, "Input tensor must have contiguous last dimension");
+
+ at::Tensor block_table;
+ const bool paged_KV = block_table_.has_value();
+ if (paged_KV) {
+ TORCH_CHECK(!cache_batch_idx_.has_value(), "Paged KV cache does not support cache_batch_idx");
+ block_table = block_table_.value();
+ CHECK_DEVICE(block_table);
+ TORCH_CHECK(block_table.dtype() == torch::kInt32, "block_table must have dtype torch.int32");
+ TORCH_CHECK(block_table.stride(-1) == 1, "block_table must have contiguous last dimension");
+ }
+
+ const auto sizes = q.sizes();
+
+ const int batch_size = sizes[0];
+ int seqlen_q = sizes[1];
+ int num_heads = sizes[2];
+ const int head_size_og = sizes[3];
+
+ const int max_num_blocks_per_seq = !paged_KV ? 0 : block_table.size(1);
+ const int num_blocks = !paged_KV ? 0 : kcache.size(0);
+ const int page_block_size = !paged_KV ? 1 : kcache.size(1);
+ TORCH_CHECK(!paged_KV || page_block_size % 256 == 0, "Paged KV cache block size must be divisible by 256");
+ const int seqlen_k = !paged_KV ? kcache.size(1) : max_num_blocks_per_seq * page_block_size;
+ const int num_heads_k = kcache.size(2);
+ const int batch_size_c = !paged_KV ? kcache.size(0) : batch_size;
+ TORCH_CHECK(batch_size > 0, "batch size must be positive");
+ TORCH_CHECK(head_size_og <= 256, "FlashAttention forward only supports head dimension at most 256");
+ TORCH_CHECK(num_heads % num_heads_k == 0, "Number of heads in key/value must divide number of heads in query");
+
+ // causal=true is the same as causal=false in this case
+ if (seqlen_q == 1 && !alibi_slopes_.has_value()) { is_causal = false; }
+ if (is_causal) { window_size_right = 0; }
+
+ // Faster to transpose q from (b, 1, (nheads_kv ngroups), d) to (b, ngroups, nheads_kv, d) in this case.
+ // H/t Daniel Haziza
+ const bool seqlenq_ngroups_swapped = seqlen_q == 1 && num_heads > num_heads_k && window_size_left < 0 && window_size_right < 0 && head_size_og % 8 == 0 && !alibi_slopes_.has_value();
+ if (seqlenq_ngroups_swapped) {
+ const int ngroups = num_heads / num_heads_k;
+ q = q.reshape({batch_size, num_heads_k, ngroups, head_size_og}).transpose(1, 2);
+ seqlen_q = ngroups;
+ num_heads = num_heads_k;
+ }
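
A standalone libtorch check of the decode-time swap above, with assumed sizes (b = 2, nheads_kv = 4, ngroups = 8, d = 128); after the reshape the kernel sees seqlen_q = ngroups and num_heads = nheads_kv, and the output is transposed back at the end of this function:

    // Sketch: verify the (b, 1, nheads_kv * ngroups, d) -> (b, ngroups, nheads_kv, d) swap.
    #include <torch/torch.h>
    int main() {
        int64_t b = 2, nheads_kv = 4, ngroups = 8, d = 128;
        auto q = torch::randn({b, 1, nheads_kv * ngroups, d});
        auto q2 = q.reshape({b, nheads_kv, ngroups, d}).transpose(1, 2);
        // The kernel now runs with 8 query rows per KV head instead of 1.
        TORCH_CHECK(q2.size(1) == ngroups && q2.size(2) == nheads_kv);
    }
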
+
+ if (window_size_left >= seqlen_k) { window_size_left = -1; }
+ if (window_size_right >= seqlen_k) { window_size_right = -1; }
+
+ CHECK_SHAPE(q, batch_size, seqlen_q, num_heads, head_size_og);
+ if (!paged_KV) {
+ CHECK_SHAPE(kcache, batch_size_c, seqlen_k, num_heads_k, head_size_og);
+ CHECK_SHAPE(vcache, batch_size_c, seqlen_k, num_heads_k, head_size_og);
+ } else {
+ CHECK_SHAPE(kcache, num_blocks, page_block_size, num_heads_k, head_size_og);
+ CHECK_SHAPE(vcache, num_blocks, page_block_size, num_heads_k, head_size_og);
+ CHECK_SHAPE(block_table, batch_size, max_num_blocks_per_seq);
+ }
+
+ at::Tensor q_padded, kcache_padded, vcache_padded;
+ if (head_size_og % 8 != 0) {
+ q_padded = torch::nn::functional::pad(q, torch::nn::functional::PadFuncOptions({0, 8 - head_size_og % 8}));
+ kcache_padded = torch::nn::functional::pad(kcache, torch::nn::functional::PadFuncOptions({0, 8 - head_size_og % 8}));
+ vcache_padded = torch::nn::functional::pad(vcache, torch::nn::functional::PadFuncOptions({0, 8 - head_size_og % 8}));
+ } else {
+ q_padded = q;
+ kcache_padded = kcache;
+ vcache_padded = vcache;
+ }
+
+ at::Tensor out;
+ if (out_.has_value()) {
+ out = out_.value();
+ TORCH_CHECK(out.dtype() == q_dtype, "Output must have the same dtype as inputs");
+ CHECK_DEVICE(out);
+ TORCH_CHECK(out.stride(-1) == 1, "Output tensor must have contiguous last dimension");
+ CHECK_SHAPE(out, batch_size, seqlen_q, num_heads, head_size_og);
+ if (head_size_og % 8 != 0) { out = torch::empty_like(q_padded); }
+ } else {
+ out = torch::empty_like(q_padded);
+ }
+
+ auto round_multiple = [](int x, int m) { return (x + m - 1) / m * m; };
+ const int head_size = round_multiple(head_size_og, 8);
+ const int head_size_rounded = head_size <= 192 ? round_multiple(head_size, 32) : 256;
+ const int seqlen_q_rounded = round_multiple(seqlen_q, 128);
+ const int seqlen_k_rounded = round_multiple(seqlen_k, 128);
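
The round_multiple lambda computes ((x + m - 1) / m) * m in integer arithmetic, i.e., it rounds x up to the next multiple of m. A few worked values for the sizes above:

    // round_multiple(x, m) = ((x + m - 1) / m) * m   (integer division)
    // head_size_og = 100 -> head_size = round_multiple(100, 8) = 104
    //                    -> head_size_rounded = round_multiple(104, 32) = 128
    // head_size = 224 (> 192) -> head_size_rounded = 256
    // seqlen_q = 1 (decode)   -> seqlen_q_rounded = 128
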
+
+ auto opts = q.options();
+
+ auto softmax_lse = torch::empty({batch_size, num_heads, seqlen_q}, opts.dtype(at::kFloat));
+
+ Flash_fwd_params params;
+ set_params_fprop(params,
+ batch_size,
+ seqlen_q, seqlen_k,
+ seqlen_q_rounded, seqlen_k_rounded,
+ num_heads, num_heads_k,
+ head_size, head_size_rounded,
+ q_padded, kcache_padded, vcache_padded, out,
+ /*cu_seqlens_q_d=*/nullptr,
+ /*cu_seqlens_k_d=*/nullptr,
+ /*seqused_k=*/nullptr,
+ /*p_ptr=*/nullptr,
+ softmax_lse.data_ptr(),
+ /*p_dropout=*/0.f,
+ softmax_scale,
+ window_size_left,
+ window_size_right,
+ softcap
+ );
+
+ at::Tensor k, v, k_padded, v_padded;
+ if (k_.has_value()) {
+ TORCH_CHECK(v_.has_value(), "If key is supplied, value must also be passed in");
+ TORCH_CHECK(seqlens_k_.has_value(), "If key is supplied, seqlens_k must also be passed in");
+ TORCH_CHECK(seqlen_q <= seqlen_k, "If key is supplied, it must have seqlen <= the seqlen of the KV cache");
+ k = k_.value();
+ v = v_.value();
+ TORCH_CHECK(k.dtype() == q_dtype, "Key must have the same dtype as query");
+ TORCH_CHECK(v.dtype() == q_dtype, "Value must have the same dtype as query");
+ CHECK_DEVICE(k); CHECK_DEVICE(v);
+ TORCH_CHECK(k.stride(-1) == 1, "Key tensor must have contiguous last dimension");
+ TORCH_CHECK(v.stride(-1) == 1, "Value tensor must have contiguous last dimension");
+ int seqlen_knew = k.size(1);
+ CHECK_SHAPE(k, batch_size, seqlen_knew, num_heads_k, head_size_og);
+ CHECK_SHAPE(v, batch_size, seqlen_knew, num_heads_k, head_size_og);
+ if (head_size_og % 8 != 0) {
+ k_padded = torch::nn::functional::pad(k, torch::nn::functional::PadFuncOptions({0, 8 - head_size_og % 8}));
+ v_padded = torch::nn::functional::pad(v, torch::nn::functional::PadFuncOptions({0, 8 - head_size_og % 8}));
+ } else {
+ k_padded = k;
+ v_padded = v;
+ }
+ params.seqlen_knew = seqlen_knew;
+ params.knew_ptr = k_padded.data_ptr();
+ params.vnew_ptr = v_padded.data_ptr();
+ // All strides are in elements, not bytes.
+ params.knew_batch_stride = k_padded.stride(0);
+ params.vnew_batch_stride = v_padded.stride(0);
+ params.knew_row_stride = k_padded.stride(-3);
+ params.vnew_row_stride = v_padded.stride(-3);
+ params.knew_head_stride = k_padded.stride(-2);
+ params.vnew_head_stride = v_padded.stride(-2);
+ }
+
+ if (seqlens_k_.has_value()) {
+ auto seqlens_k = seqlens_k_.value();
+ TORCH_CHECK(seqlens_k.dtype() == torch::kInt32, "seqlens_k must have dtype int32");
+ CHECK_DEVICE(seqlens_k);
+ CHECK_CONTIGUOUS(seqlens_k);
+ CHECK_SHAPE(seqlens_k, batch_size);
+ params.cu_seqlens_k = static_cast<int *>(seqlens_k.data_ptr());
+ }
+ params.is_seqlens_k_cumulative = !(seqlens_k_.has_value());
+ if (leftpad_k_.has_value()) {
+ TORCH_CHECK(!paged_KV, "We don't support paged KV and leftpad_k running at the same time yet");
+ auto leftpad_k = leftpad_k_.value();
+ TORCH_CHECK(leftpad_k.dtype() == torch::kInt32, "leftpad_k must have dtype int32");
+ CHECK_DEVICE(leftpad_k);
+ CHECK_CONTIGUOUS(leftpad_k);
+ CHECK_SHAPE(leftpad_k, batch_size);
+ params.leftpad_k = static_cast<int *>(leftpad_k.data_ptr());
+ }
+
+ if (rotary_cos_.has_value()) {
+ TORCH_CHECK(k_.has_value(), "If rotary cos/sin are provided, new key / value to be appended to KV cache must also be provided");
+ auto rotary_cos = rotary_cos_.value();
+ CHECK_DEVICE(rotary_cos);
+ params.rotary_dim = rotary_cos.size(1) * 2;
+ TORCH_CHECK(params.rotary_dim <= head_size, "rotary_dim must be <= headdim");
+ TORCH_CHECK(params.rotary_dim % 16 == 0, "Only rotary dimensions divisible by 16 are currently supported");
+ const int seqlen_ro = rotary_cos.size(0);
+ TORCH_CHECK(seqlen_ro >= seqlen_k, "cos/sin seqlen must be at least the seqlen of KV cache");
+ CHECK_SHAPE(rotary_cos, seqlen_ro, params.rotary_dim / 2);
+ CHECK_CONTIGUOUS(rotary_cos);
+ TORCH_CHECK(rotary_cos.scalar_type() == q_dtype, "rotary_cos must have the same dtype as query");
+
+ TORCH_CHECK(rotary_sin_.has_value(), "If rotary cos is provided, rotary sin must also be provided");
+ auto rotary_sin = rotary_sin_.value();
+ CHECK_DEVICE(rotary_sin);
+ CHECK_SHAPE(rotary_sin, seqlen_ro, params.rotary_dim / 2);
+ CHECK_CONTIGUOUS(rotary_sin);
+ TORCH_CHECK(rotary_sin.scalar_type() == q_dtype, "rotary_sin must have the same dtype as query");
+ params.rotary_cos_ptr = rotary_cos.data_ptr();
+ params.rotary_sin_ptr = rotary_sin.data_ptr();
+ params.is_rotary_interleaved = is_rotary_interleaved;
+ } else {
+ params.rotary_dim = 0;
+ }
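
As documented in the signature above, is_rotary_interleaved only changes which element indices are paired before rotation. A scalar sketch of both conventions (hypothetical standalone helper; the kernel itself fuses this rotation into the cache append):

    // Sketch: apply rotary embedding to one head vector x[0..rotary_dim),
    // given cos/sin rows of length rotary_dim / 2 for this position.
    void rotary_one_row(float* x, const float* cos_r, const float* sin_r,
                        int rotary_dim, bool interleaved) {
        for (int i = 0; i < rotary_dim / 2; ++i) {
            // interleaved pairs (2i, 2i+1); otherwise pair (i, i + rotary_dim/2)
            int i0 = interleaved ? 2 * i : i;
            int i1 = interleaved ? 2 * i + 1 : i + rotary_dim / 2;
            float x0 = x[i0], x1 = x[i1];
            x[i0] = x0 * cos_r[i] - x1 * sin_r[i];
            x[i1] = x0 * sin_r[i] + x1 * cos_r[i];
        }
    }
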
+
+ if (cache_batch_idx_.has_value()) {
+ auto cache_batch_idx = cache_batch_idx_.value();
+ CHECK_DEVICE(cache_batch_idx);
+ CHECK_CONTIGUOUS(cache_batch_idx);
+ TORCH_CHECK(cache_batch_idx.scalar_type() == torch::kInt32, "cache_batch_idx must have dtype int32");
+ params.cache_batch_idx = reinterpret_cast<int *>(cache_batch_idx.data_ptr());
+ }
+
+ // Keep references to these tensors to extend their lifetime.
+ at::Tensor softmax_lse_accum, out_accum;
+ std::tie(softmax_lse_accum, out_accum) = set_params_splitkv(
+ params, batch_size, num_heads, head_size, seqlen_k, seqlen_q,
+ head_size_rounded, /*dropout*/ 0.f, num_splits, get_num_sm(get_current_device()), opts);
+
+ if (paged_KV) {
+ params.block_table = block_table.data_ptr<int>();
+ params.block_table_batch_stride = block_table.stride(0);
+ }
+ params.page_block_size = page_block_size;
+
+ set_params_alibi(params, alibi_slopes_, batch_size, num_heads);
+
+ auto stream = at::cuda::getCurrentCUDAStream().stream();
+ // Only the split kernel supports appending to the KV cache, indexing into the cache
+ // with cache_batch_idx, or a paged KV cache.
+ run_mha_fwd(params, stream, /*force_split_kernel=*/k_.has_value() || cache_batch_idx_.has_value() || paged_KV);
+
+ if (head_size_og % 8 != 0) {
+ out = out.index({"...", torch::indexing::Slice(torch::indexing::None, head_size_og)});
+ if (out_.has_value()) { out_.value().copy_(out); }
+ if (k_.has_value()) {
+ // It's expensive to copy the KV cache here for the case where the head size is not divisible by 8,
+ // but we don't expect to hit this case in practice. This is just so that the code works for that case.
+ kcache.copy_(kcache_padded.index({"...", torch::indexing::Slice(torch::indexing::None, head_size_og)}));
+ vcache.copy_(vcache_padded.index({"...", torch::indexing::Slice(torch::indexing::None, head_size_og)}));
+ }
+ }
+
+ if (seqlenq_ngroups_swapped) {
+ out = out.transpose(1, 2).reshape({batch_size, 1, num_heads_k * seqlen_q, head_size_og});
+ softmax_lse = softmax_lse.reshape({batch_size, num_heads_k * seqlen_q, 1});
+ }
+ return {out, softmax_lse};
+ }
+ } // namespace FLASH_NAMESPACE
+
+ PYBIND11_MODULE(TORCH_EXTENSION_NAME, m) {
+ m.doc() = "FlashAttention";
+ m.def("fwd", &FLASH_NAMESPACE::mha_fwd, "Forward pass");
+ m.def("varlen_fwd", &FLASH_NAMESPACE::mha_varlen_fwd, "Forward pass (variable length)");
+ m.def("bwd", &FLASH_NAMESPACE::mha_bwd, "Backward pass");
+ m.def("varlen_bwd", &FLASH_NAMESPACE::mha_varlen_bwd, "Backward pass (variable length)");
+ m.def("fwd_kvcache", &FLASH_NAMESPACE::mha_fwd_kvcache, "Forward pass, with KV-cache");
+ }
cookbooks/flash-attention/csrc/flash_attn/src/alibi.h ADDED
@@ -0,0 +1,75 @@
+ #include <cmath>
+
+ #include "namespace_config.h"
+ #include <cute/tensor.hpp>
+
+ #include <cutlass/cutlass.h>
+ #include <cutlass/array.h>
+
+ #include "utils.h"
+
+ namespace FLASH_NAMESPACE {
+
+ using namespace cute;
+
+ ////////////////////////////////////////////////////////////////////////////////////////////////////
+
+ template <bool Is_causal>
+ struct Alibi {
+
+ const float alibi_slope;
+ const int max_seqlen_k, max_seqlen_q;
+
+ __forceinline__ __device__ Alibi(const float alibi_slope, const int max_seqlen_k, const int max_seqlen_q)
+ : alibi_slope(alibi_slope)
+ , max_seqlen_k(max_seqlen_k)
+ , max_seqlen_q(max_seqlen_q) {
+ };
+
+ template <typename Engine, typename Layout>
+ __forceinline__ __device__ void apply_alibi(Tensor<Engine, Layout> &tensor,
+ const int col_idx_offset_,
+ const int row_idx_offset,
+ const int warp_row_stride) {
+ // tensor has shape (nrow=(2, MMA_M), ncol=(2, MMA_N))
+ static_assert(Layout::rank == 2, "Only support 2D Tensor");
+ const int lane_id = threadIdx.x % 32;
+ const int col_idx_offset = col_idx_offset_ + (lane_id % 4) * 2;
+ if constexpr (Is_causal) { // Simpler, we add the same bias vector to all rows
+ #pragma unroll
+ for (int nj = 0; nj < size<1, 1>(tensor); ++nj) {
+ const int col_idx_base = col_idx_offset + nj * 8;
+ #pragma unroll
+ for (int j = 0; j < size<1, 0>(tensor); ++j) {
+ const int col_idx = col_idx_base + j;
+ #pragma unroll
+ for (int mi = 0; mi < size<0>(tensor); ++mi) {
+ tensor(mi, make_coord(j, nj)) += alibi_slope * col_idx;
+ }
+ }
+ }
+ } else { // Bias depends on both row_idx and col_idx
+ #pragma unroll
+ for (int mi = 0; mi < size<0, 1>(tensor); ++mi) {
+ const int row_idx_base = row_idx_offset + mi * warp_row_stride;
+ #pragma unroll
+ for (int i = 0; i < size<0, 0>(tensor); ++i) {
+ const int row_idx = row_idx_base + i * 8;
+ #pragma unroll
+ for (int nj = 0; nj < size<1, 1>(tensor); ++nj) {
+ const int col_idx_base = col_idx_offset + nj * 8;
+ #pragma unroll
+ for (int j = 0; j < size<1, 0>(tensor); ++j) {
+ const int col_idx = col_idx_base + j;
+ tensor(make_coord(i, mi), make_coord(j, nj)) -= alibi_slope * abs(row_idx + max_seqlen_k - max_seqlen_q - col_idx);
+ }
+ }
+ }
+ }
+ }
+ }
+
+ };
+
+ } // namespace FLASH_NAMESPACE
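
Stripped of the MMA tiling, the per-element update above is a slope times a column index (causal) or a slope times an absolute relative distance (non-causal). A scalar reference of the same math (standalone sketch, not part of this header):

    #include <cstdlib>  // std::abs
    // Mirrors apply_alibi above: the causal branch adds slope * col, which is
    // -slope * distance plus a per-row constant that softmax ignores; the
    // non-causal branch subtracts slope * |row + seqlen_k - seqlen_q - col|.
    float alibi_bias(float slope, int row, int col,
                     int seqlen_q, int seqlen_k, bool causal) {
        return causal ? slope * col
                      : -slope * std::abs(row + seqlen_k - seqlen_q - col);
    }
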
cookbooks/flash-attention/csrc/flash_attn/src/block_info.h ADDED
@@ -0,0 +1,49 @@
+ /******************************************************************************
+ * Copyright (c) 2023, Tri Dao.
+ ******************************************************************************/
+
+ #pragma once
+
+ #include "namespace_config.h"
+ namespace FLASH_NAMESPACE {
+
+ ////////////////////////////////////////////////////////////////////////////////////////////////////
+
+ template<bool Varlen=true>
+ struct BlockInfo {
+
+ template<typename Params>
+ __device__ BlockInfo(const Params &params, const int bidb)
+ : sum_s_q(!Varlen || params.cu_seqlens_q == nullptr ? -1 : params.cu_seqlens_q[bidb])
+ , sum_s_k(!Varlen || params.cu_seqlens_k == nullptr || !params.is_seqlens_k_cumulative ? -1 : params.cu_seqlens_k[bidb])
+ , actual_seqlen_q(!Varlen || params.cu_seqlens_q == nullptr ? params.seqlen_q : params.cu_seqlens_q[bidb + 1] - sum_s_q)
+ // If is_seqlens_k_cumulative, then seqlen_k is cu_seqlens_k[bidb + 1] - cu_seqlens_k[bidb].
+ // Otherwise it's cu_seqlens_k[bidb], i.e., we use cu_seqlens_k to store the sequence lengths of K.
+ , leftpad_k(params.leftpad_k == nullptr ? 0 : params.leftpad_k[bidb])
+ , seqlen_k_cache((!Varlen || params.cu_seqlens_k == nullptr ? params.seqlen_k : (params.is_seqlens_k_cumulative ? params.cu_seqlens_k[bidb + 1] - sum_s_k : params.cu_seqlens_k[bidb])) - leftpad_k)
+ , actual_seqlen_k(params.seqused_k ? params.seqused_k[bidb] - leftpad_k : seqlen_k_cache + (params.knew_ptr == nullptr ? 0 : params.seqlen_knew))
+ {
+ }
+
+ template <typename index_t>
+ __forceinline__ __device__ index_t q_offset(const index_t batch_stride, const index_t row_stride, const int bidb) const {
+ return sum_s_q == -1 ? bidb * batch_stride : uint32_t(sum_s_q) * row_stride;
+ }
+
+ template <typename index_t>
+ __forceinline__ __device__ index_t k_offset(const index_t batch_stride, const index_t row_stride, const int bidb) const {
+ return sum_s_k == -1 ? bidb * batch_stride + leftpad_k * row_stride : uint32_t(sum_s_k + leftpad_k) * row_stride;
+ }
+
+ const int sum_s_q;
+ const int sum_s_k;
+ const int actual_seqlen_q;
+ // We have to have seqlen_k_cache declared before actual_seqlen_k, otherwise actual_seqlen_k is set to 0.
+ const int leftpad_k;
+ const int seqlen_k_cache;
+ const int actual_seqlen_k;
+ };
+
+ ////////////////////////////////////////////////////////////////////////////////////////////////////
+
+ } // namespace FLASH_NAMESPACE
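
The two offset helpers above switch between batched and varlen-packed addressing: with fixed-shape tensors (sum_s == -1) the batch index is scaled by the batch stride, while with packed varlen tensors the base row comes from the cu_seqlens prefix sums. A sketch of the same decision for Q (hypothetical standalone form, ignoring leftpad):

    // Sketch: element offset of the first row of batch b in Q.
    // Packed (varlen): all sequences' rows are concatenated, so the base row
    //                  is cu_seqlens_q[b] and the offset is that times row_stride.
    // Batched:         the offset is simply b * batch_stride.
    int64_t q_base_offset(int64_t batch_stride, int64_t row_stride,
                          const int* cu_seqlens_q, int b) {
        return cu_seqlens_q == nullptr ? b * batch_stride
                                       : int64_t(cu_seqlens_q[b]) * row_stride;
    }
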
cookbooks/flash-attention/csrc/flash_attn/src/dropout.h ADDED
@@ -0,0 +1,95 @@
+ /******************************************************************************
+ * Copyright (c) 2024, Tri Dao.
+ ******************************************************************************/
+
+ #pragma once
+
+ #include "namespace_config.h"
+ #include "philox.cuh"
+ #include "utils.h"
+
+ namespace FLASH_NAMESPACE {
+
+ struct Dropout {
+
+ const unsigned long long seed, offset;
+ const uint8_t p_dropout_in_uint8_t;
+
+ __forceinline__ __device__ Dropout(const unsigned long long seed, const unsigned long long offset,
+ const uint8_t p_dropout_in_uint8_t,
+ const int bid, const int hid, const int tid, const int nheads)
+ : seed(seed)
+ , offset(offset + (bid * nheads + hid) * 32 + tid % 32)
+ , p_dropout_in_uint8_t(p_dropout_in_uint8_t) {
+ }
+
+ template <bool encode_dropout_in_sign_bit=false, typename Engine, typename Layout>
+ __forceinline__ __device__ void apply_dropout(Tensor<Engine, Layout> &tensor_,
+ int block_row_start, int block_col_start, int block_row_stride) {
+ // convert shape from (4, MMA_M, MMA_N) to (8, MMA_M, MMA_N / 2)
+ Tensor tensor = make_tensor(tensor_.data(), FLASH_NAMESPACE::convert_layout_acc_dropout(tensor_.layout()));
+ using T = typename Engine::value_type;
+ auto encode_dropout = [](bool keep, T val) {
+ return keep ? val : (encode_dropout_in_sign_bit ? -val : T(0));
+ };
+ static_assert(decltype(size<2>(tensor))::value % 2 == 0);
+ const uint16_t p_dropout_8bit_in_uint16_t = uint16_t(p_dropout_in_uint8_t);
+ const uint32_t p_dropout_8bit_in_uint32_t = (uint32_t(p_dropout_8bit_in_uint16_t) << 16) | uint32_t(p_dropout_8bit_in_uint16_t);
+ // if (cute::thread0()) { printf("threshold2 = 0x%x\n", p_dropout_8bit_in_uint32_t); }
+ #pragma unroll
+ for (int m = 0; m < size<1>(tensor); ++m, block_row_start += block_row_stride) {
+ uint2 rowcol = make_uint2(block_row_start, block_col_start);
+ #pragma unroll
+ for (int n = 0; n < size<2>(tensor) / 2; ++n, ++rowcol.y) {
+ // if (cute::thread(32, 0)) { printf("m = %d, n = %d, row = %d, col = %d\n", m, n, int(rowcol.x), int(rowcol.y));}
+ uint4 random_uint4 = FLASH_NAMESPACE::philox(seed, reinterpret_cast<unsigned long long&>(rowcol), offset);
+ // if (cute::thread0()) { printf("philox = %u, %d, %d, %d\n", random_uint4.x, random_uint4.y, random_uint4.z, random_uint4.w);}
+ uint8_t (&rnd_8)[16] = reinterpret_cast<uint8_t (&)[16]>(random_uint4);
+ // Special implementation for 16-bit types: we duplicate the threshold to the
+ // low and high 16 bits of a 32-bit value, then use the f16x2 comparison instruction
+ // to get a mask. The low 16 bits of the mask will be either 0xffff or 0x0000,
+ // and the high 16 bits will be either 0xffff or 0x0000, depending on whether
+ // the random value is less than the threshold.
+ // We then do a bit-wise AND between the mask and the original value (in 32-bit).
+ // We're exploiting the fact that floating point comparison is equivalent to integer
+ // comparison, since we're comparing unsigned integers whose top 8-bits are zero.
+ if (!encode_dropout_in_sign_bit
+ && (std::is_same<T, cutlass::half_t>::value || std::is_same<T, cutlass::bfloat16_t>::value)) {
+ uint16_t rnd_16[16];
+ #pragma unroll
+ for (int i = 0; i < 16; i++) { rnd_16[i] = uint16_t(rnd_8[i]); }
+ uint32_t (&rnd_32)[8] = reinterpret_cast<uint32_t (&)[8]>(rnd_16);
+ #pragma unroll
+ for (int j = 0; j < 2; j++) {
+ Tensor tensor_uint32 = recast<uint32_t>(tensor(_, m, n * 2 + j));
+ // if (cute::thread0()) { printf("random = 0x%x, 0x%x, 0x%x, 0x%x\n", rnd_32[j * 4 + 0], rnd_32[j * 4 + 1], rnd_32[j * 4 + 2], rnd_32[j * 4 + 3]); }
+ // if (cute::thread0()) { printf("tensor_uint32 = 0x%x, 0x%x, 0x%x, 0x%x\n", tensor_uint32(0), tensor_uint32(1), tensor_uint32(2), tensor_uint32(3)); }
+ #pragma unroll
+ for (int i = 0; i < 4; i++) {
+ uint32_t mask;
+ asm volatile("set.le.u32.f16x2 %0, %1, %2;\n" : "=r"(mask) : "r"(rnd_32[j * 4 + i]), "r"(p_dropout_8bit_in_uint32_t));
+ tensor_uint32(i) &= mask;
+ }
+ // if (cute::thread0()) { printf("tensor_uint32 = 0x%x, 0x%x, 0x%x, 0x%x\n", tensor_uint32(0), tensor_uint32(1), tensor_uint32(2), tensor_uint32(3)); }
+ }
+ } else {
+ #pragma unroll
+ for (int j = 0; j < 2; j++) {
+ #pragma unroll
+ for (int i = 0; i < 8; i++) {
+ tensor(i, m, n * 2 + j) = encode_dropout(rnd_8[j * 8 + i] <= p_dropout_in_uint8_t, tensor(i, m, n * 2 + j));
+ }
+ Tensor tensor_uint32 = recast<uint32_t>(tensor(_, m, n * 2 + j));
+ // if (cute::thread0()) { printf("tensor_uint32 = 0x%x, 0x%x, 0x%x, 0x%x\n", tensor_uint32(0), tensor_uint32(1), tensor_uint32(2), tensor_uint32(3)); }
+ }
+ }
+ // // if ((threadIdx.x == 0) && (blockIdx.x == 0) && (blockIdx.y == 0)) {
+ // // printf("n = %d, ph Philox: %u, %u, %u, %u\n", n, rnd_8.x, rnd_8.y, rnd_8.z, rnd_8.w);
+ // // }
+ }
+ }
+ }
+
+ };
+
+ } // namespace FLASH_NAMESPACE
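
Underneath the f16x2 masking trick, the per-element rule above is simple: keep a value iff its Philox byte is at most the 8-bit threshold, otherwise zero it (or flip its sign when the mask is encoded in the sign bit). A scalar sketch of the same rule (standalone; the exact threshold encoding, assumed here to be floor(255 * p_keep), is set on the host side, and the 1 / p_keep rescale is folded into other scale factors):

    #include <cstdint>
    // Sketch: per-element dropout decision, one random byte per element.
    float apply_dropout_scalar(float val, uint8_t rnd_byte, uint8_t threshold,
                               bool encode_in_sign_bit) {
        bool keep = rnd_byte <= threshold;  // threshold ~ floor(255 * p_keep), an assumption
        return keep ? val : (encode_in_sign_bit ? -val : 0.f);
    }
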
cookbooks/flash-attention/csrc/flash_attn/src/flash.h ADDED
@@ -0,0 +1,194 @@
+ /******************************************************************************
+ * Copyright (c) 2023, Tri Dao.
+ ******************************************************************************/
+
+ #pragma once
+
+ #include "namespace_config.h"
+
+ #include <cuda.h>
+ #include <vector>
+
+ #include <ATen/cuda/CUDAGeneratorImpl.h> // For at::Generator and at::PhiloxCudaState
+
+ namespace FLASH_NAMESPACE {
+ constexpr int TOTAL_DIM = 0;
+ constexpr int H_DIM = 1;
+ constexpr int D_DIM = 2;
+
+ ////////////////////////////////////////////////////////////////////////////////////////////////////
+
+ struct Qkv_params {
+ using index_t = int64_t;
+ // The QKV matrices.
+ void *__restrict__ q_ptr;
+ void *__restrict__ k_ptr;
+ void *__restrict__ v_ptr;
+
+ // The stride between rows of the Q, K and V matrices.
+ index_t q_batch_stride;
+ index_t k_batch_stride;
+ index_t v_batch_stride;
+ index_t q_row_stride;
+ index_t k_row_stride;
+ index_t v_row_stride;
+ index_t q_head_stride;
+ index_t k_head_stride;
+ index_t v_head_stride;
+
+ // The number of heads.
+ int h, h_k;
+ // In the case of multi-query and grouped-query attention (MQA/GQA), nheads_k could be
+ // different from nheads (query).
+ int h_h_k_ratio; // precomputed h / h_k
+ };
+
+ ////////////////////////////////////////////////////////////////////////////////////////////////////
+
+ struct Flash_fwd_params : public Qkv_params {
+
+ // The O matrix (output).
+ void * __restrict__ o_ptr;
+ void * __restrict__ oaccum_ptr;
+
+ // The stride between rows of O.
+ index_t o_batch_stride;
+ index_t o_row_stride;
+ index_t o_head_stride;
+
+ // The pointer to the P matrix.
+ void * __restrict__ p_ptr;
+
+ // The pointer to the softmax sum.
+ void * __restrict__ softmax_lse_ptr;
+ void * __restrict__ softmax_lseaccum_ptr;
+
+ // The dimensions.
+ int b, seqlen_q, seqlen_k, seqlen_knew, d, seqlen_q_rounded, seqlen_k_rounded, d_rounded, rotary_dim, total_q;
+
+ // The scaling factors for the kernel.
+ float scale_softmax;
+ float scale_softmax_log2;
+
+ // Array of length b+1 holding the starting offset of each sequence.
+ int * __restrict__ cu_seqlens_q;
+ int * __restrict__ cu_seqlens_k;
+ int * __restrict__ leftpad_k;
+
+ // If provided, the actual length of each k sequence.
+ int * __restrict__ seqused_k;
+
+ int *__restrict__ blockmask;
+
+ // The K_new and V_new matrices.
+ void * __restrict__ knew_ptr;
+ void * __restrict__ vnew_ptr;
+
+ // The stride between rows of the K_new and V_new matrices.
+ index_t knew_batch_stride;
+ index_t vnew_batch_stride;
+ index_t knew_row_stride;
+ index_t vnew_row_stride;
+ index_t knew_head_stride;
+ index_t vnew_head_stride;
+
+ // The cos and sin matrices for rotary embedding.
+ void * __restrict__ rotary_cos_ptr;
+ void * __restrict__ rotary_sin_ptr;
+
+ // The indices to index into the KV cache.
+ int * __restrict__ cache_batch_idx;
+
+ // Paged KV cache
+ int * __restrict__ block_table;
+ index_t block_table_batch_stride;
+ int page_block_size;
+
+ // The dropout probability (probability of keeping an activation).
+ float p_dropout;
+ // uint32_t p_dropout_in_uint;
+ // uint16_t p_dropout_in_uint16_t;
+ uint8_t p_dropout_in_uint8_t;
+
+ // Scale factor of 1 / (1 - p_dropout).
+ float rp_dropout;
+ float scale_softmax_rp_dropout;
+
+ // Local window size
+ int window_size_left, window_size_right;
+ float softcap;
+
+ // Random state.
+ at::PhiloxCudaState philox_args;
+
+ // Pointer to the RNG seed (idx 0) and offset (idx 1).
+ uint64_t * rng_state;
+
+ bool is_bf16;
+ bool is_causal;
+
+ // If is_seqlens_k_cumulative, then seqlen_k is cu_seqlens_k[bidb + 1] - cu_seqlens_k[bidb].
+ // Otherwise it's cu_seqlens_k[bidb], i.e., we use cu_seqlens_k to store the sequence lengths of K.
+ bool is_seqlens_k_cumulative;
+
+ bool is_rotary_interleaved;
+
+ int num_splits; // For split-KV version
+
+ void * __restrict__ alibi_slopes_ptr;
+ index_t alibi_slopes_batch_stride;
+
+ bool unpadded_lse; // For varlen paths: LSE is in [nheads, total_seqlen_q] format instead of [b, nheads, seqlen_q].
+ bool seqlenq_ngroups_swapped; // q has been transposed from (b, 1, (nheads_kv ngroups), d) to (b, ngroups, nheads_kv, d).
+ };
+
+ ////////////////////////////////////////////////////////////////////////////////////////////////////
+
+ struct Flash_bwd_params : public Flash_fwd_params {
+
+ // The dO and dQKV matrices.
+ void *__restrict__ do_ptr;
+ void *__restrict__ dq_ptr;
+ void *__restrict__ dk_ptr;
+ void *__restrict__ dv_ptr;
+
+ // To accumulate dQ
+ void *__restrict__ dq_accum_ptr;
+ void *__restrict__ dk_accum_ptr;
+ void *__restrict__ dv_accum_ptr;
+
+ // To accumulate dK and dV in case we're splitting the bwd along the seqlen_q dimension:
+ // void *__restrict__ dk_accum_ptr;
+ // void *__restrict__ dv_accum_ptr;
+
+ // The stride between rows of the dO, dQ, dK and dV matrices.
+ // TD [2022-04-16]: We're using 32-bit indexing to save registers.
+ // The code probably won't work for arrays larger than 2GB.
+ index_t do_batch_stride;
+ index_t do_row_stride;
+ index_t do_head_stride;
+ index_t dq_batch_stride;
+ index_t dk_batch_stride;
+ index_t dv_batch_stride;
+ index_t dq_row_stride;
+ index_t dk_row_stride;
+ index_t dv_row_stride;
+ index_t dq_head_stride;
+ index_t dk_head_stride;
+ index_t dv_head_stride;
+
+ // The pointer to the softmax d sum.
+ void *__restrict__ dsoftmax_sum;
+
+ bool deterministic;
+ index_t dq_accum_split_stride;
+ };
+
+ ////////////////////////////////////////////////////////////////////////////////////////////////////
+
+ template<typename T, int Headdim, bool Is_causal> void run_mha_fwd_(Flash_fwd_params &params, cudaStream_t stream);
+ template<typename T, int Headdim, bool Is_causal> void run_mha_fwd_splitkv_dispatch(Flash_fwd_params &params, cudaStream_t stream);
+
+ template<typename T, int Headdim, bool Is_causal> void run_mha_bwd_(Flash_bwd_params &params, cudaStream_t stream);
+
+ } // namespace FLASH_NAMESPACE
cookbooks/flash-attention/csrc/flash_attn/src/flash_bwd_hdim128_bf16_causal_sm80.cu ADDED
@@ -0,0 +1,14 @@
+ // Copyright (c) 2024, Tri Dao.
+ // Splitting the different head dimensions to different files to speed up compilation.
+ // This file is auto-generated. See "generate_kernels.py"
+ #include "namespace_config.h"
+ #include "flash_bwd_launch_template.h"
+
+ namespace FLASH_NAMESPACE {
+
+ template<>
+ void run_mha_bwd_<cutlass::bfloat16_t, 128, true>(Flash_bwd_params &params, cudaStream_t stream) {
+ run_mha_bwd_hdim128<cutlass::bfloat16_t, true>(params, stream);
+ }
+
+ } // namespace FLASH_NAMESPACE
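
Each of these .cu files explicitly instantiates run_mha_bwd_ for one (dtype, head dim, is_causal) combination so the heavy template bodies compile in parallel. At runtime a dispatcher picks the matching instantiation; a simplified sketch of that selection (hypothetical, the real dispatch lives in the launch templates and rounds the head dimension):

    // Sketch: runtime dispatch to the pre-instantiated kernels.
    void run_mha_bwd_sketch(Flash_bwd_params &params, cudaStream_t stream) {
        if (!params.is_bf16 && params.d <= 128 && params.is_causal) {
            run_mha_bwd_<cutlass::half_t, 128, true>(params, stream);
        } else if (params.is_bf16 && params.d <= 128 && params.is_causal) {
            run_mha_bwd_<cutlass::bfloat16_t, 128, true>(params, stream);
        } // ... and so on for every (dtype, headdim, causal) file in this directory.
    }
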
cookbooks/flash-attention/csrc/flash_attn/src/flash_bwd_hdim128_bf16_sm80.cu ADDED
@@ -0,0 +1,14 @@
+ // Copyright (c) 2024, Tri Dao.
+ // Splitting the different head dimensions to different files to speed up compilation.
+ // This file is auto-generated. See "generate_kernels.py"
+ #include "namespace_config.h"
+ #include "flash_bwd_launch_template.h"
+
+ namespace FLASH_NAMESPACE {
+
+ template<>
+ void run_mha_bwd_<cutlass::bfloat16_t, 128, false>(Flash_bwd_params &params, cudaStream_t stream) {
+ run_mha_bwd_hdim128<cutlass::bfloat16_t, false>(params, stream);
+ }
+
+ } // namespace FLASH_NAMESPACE
cookbooks/flash-attention/csrc/flash_attn/src/flash_bwd_hdim128_fp16_causal_sm80.cu ADDED
@@ -0,0 +1,14 @@
+ // Copyright (c) 2024, Tri Dao.
+ // Splitting the different head dimensions to different files to speed up compilation.
+ // This file is auto-generated. See "generate_kernels.py"
+ #include "namespace_config.h"
+ #include "flash_bwd_launch_template.h"
+
+ namespace FLASH_NAMESPACE {
+
+ template<>
+ void run_mha_bwd_<cutlass::half_t, 128, true>(Flash_bwd_params &params, cudaStream_t stream) {
+ run_mha_bwd_hdim128<cutlass::half_t, true>(params, stream);
+ }
+
+ } // namespace FLASH_NAMESPACE
cookbooks/flash-attention/csrc/flash_attn/src/flash_bwd_hdim128_fp16_sm80.cu ADDED
@@ -0,0 +1,14 @@
+ // Copyright (c) 2024, Tri Dao.
+ // Splitting the different head dimensions to different files to speed up compilation.
+ // This file is auto-generated. See "generate_kernels.py"
+ #include "namespace_config.h"
+ #include "flash_bwd_launch_template.h"
+
+ namespace FLASH_NAMESPACE {
+
+ template<>
+ void run_mha_bwd_<cutlass::half_t, 128, false>(Flash_bwd_params &params, cudaStream_t stream) {
+ run_mha_bwd_hdim128<cutlass::half_t, false>(params, stream);
+ }
+
+ } // namespace FLASH_NAMESPACE
cookbooks/flash-attention/csrc/flash_attn/src/flash_bwd_hdim160_bf16_causal_sm80.cu ADDED
@@ -0,0 +1,14 @@
+ // Copyright (c) 2024, Tri Dao.
+ // Splitting the different head dimensions to different files to speed up compilation.
+ // This file is auto-generated. See "generate_kernels.py"
+ #include "namespace_config.h"
+ #include "flash_bwd_launch_template.h"
+
+ namespace FLASH_NAMESPACE {
+
+ template<>
+ void run_mha_bwd_<cutlass::bfloat16_t, 160, true>(Flash_bwd_params &params, cudaStream_t stream) {
+ run_mha_bwd_hdim160<cutlass::bfloat16_t, true>(params, stream);
+ }
+
+ } // namespace FLASH_NAMESPACE
cookbooks/flash-attention/csrc/flash_attn/src/flash_bwd_hdim160_bf16_sm80.cu ADDED
@@ -0,0 +1,14 @@
+ // Copyright (c) 2024, Tri Dao.
+ // Splitting the different head dimensions to different files to speed up compilation.
+ // This file is auto-generated. See "generate_kernels.py"
+ #include "namespace_config.h"
+ #include "flash_bwd_launch_template.h"
+
+ namespace FLASH_NAMESPACE {
+
+ template<>
+ void run_mha_bwd_<cutlass::bfloat16_t, 160, false>(Flash_bwd_params &params, cudaStream_t stream) {
+ run_mha_bwd_hdim160<cutlass::bfloat16_t, false>(params, stream);
+ }
+
+ } // namespace FLASH_NAMESPACE
cookbooks/flash-attention/csrc/flash_attn/src/flash_bwd_hdim160_fp16_causal_sm80.cu ADDED
@@ -0,0 +1,14 @@
+ // Copyright (c) 2024, Tri Dao.
+ // Splitting the different head dimensions to different files to speed up compilation.
+ // This file is auto-generated. See "generate_kernels.py"
+ #include "namespace_config.h"
+ #include "flash_bwd_launch_template.h"
+
+ namespace FLASH_NAMESPACE {
+
+ template<>
+ void run_mha_bwd_<cutlass::half_t, 160, true>(Flash_bwd_params &params, cudaStream_t stream) {
+ run_mha_bwd_hdim160<cutlass::half_t, true>(params, stream);
+ }
+
+ } // namespace FLASH_NAMESPACE
cookbooks/flash-attention/csrc/flash_attn/src/flash_bwd_hdim160_fp16_sm80.cu ADDED
@@ -0,0 +1,14 @@
+ // Copyright (c) 2024, Tri Dao.
+ // Splitting the different head dimensions to different files to speed up compilation.
+ // This file is auto-generated. See "generate_kernels.py"
+ #include "namespace_config.h"
+ #include "flash_bwd_launch_template.h"
+
+ namespace FLASH_NAMESPACE {
+
+ template<>
+ void run_mha_bwd_<cutlass::half_t, 160, false>(Flash_bwd_params &params, cudaStream_t stream) {
+ run_mha_bwd_hdim160<cutlass::half_t, false>(params, stream);
+ }
+
+ } // namespace FLASH_NAMESPACE
cookbooks/flash-attention/csrc/flash_attn/src/flash_bwd_hdim192_bf16_causal_sm80.cu ADDED
@@ -0,0 +1,14 @@
+ // Copyright (c) 2024, Tri Dao.
+ // Splitting the different head dimensions to different files to speed up compilation.
+ // This file is auto-generated. See "generate_kernels.py"
+ #include "namespace_config.h"
+ #include "flash_bwd_launch_template.h"
+
+ namespace FLASH_NAMESPACE {
+
+ template<>
+ void run_mha_bwd_<cutlass::bfloat16_t, 192, true>(Flash_bwd_params &params, cudaStream_t stream) {
+ run_mha_bwd_hdim192<cutlass::bfloat16_t, true>(params, stream);
+ }
+
+ } // namespace FLASH_NAMESPACE