kernels-community
/

flash-attn2

Kernels

Model card Files Files and versions

xet

Community

drbh committed on Mar 27, 2025

Commit

4080f9c

1 Parent(s): b0d3c12

fix: adjust types

Browse files

Files changed (1) hide show

flash_attn/flash_api.cpp +102 -32

flash_attn/flash_api.cpp CHANGED Viewed

@@ -1507,45 +1507,61 @@ mha_fwd(const at::Tensor &q,                               // batch_size x seqle
     float softcap_float = static_cast<float>(softcap);
     int window_size_left_int = static_cast<int>(window_size_left);
     int window_size_right_int = static_cast<int>(window_size_right);
     return FLASH_NAMESPACE::mha_fwd(const_cast<at::Tensor &>(q), k, v, out, alibi_slopes, p_dropout_float, softmax_scale_float, is_causal, window_size_left_int, window_size_right_int, softcap_float, return_softmax, gen);
 }
 std::vector<at::Tensor>
-mha_varlen_fwd(const at::Tensor &q,                        // batch_size x seqlen_q x num_heads x round_multiple(head_size, 8)
-               const at::Tensor &k,                        // batch_size x seqlen_k x num_heads_k x round_multiple(head_size, 8)
-               const at::Tensor &v,                        // batch_size x seqlen_k x num_heads_k x round_multiple(head_size, 8)
-               const c10::optional<torch::Tensor> &out_,   // batch_size x seqlen_q x num_heads x round_multiple(head_size, 8)
-               const at::Tensor &cu_seqlens_q,             // batch_size + 1
-               const at::Tensor &cu_seqlens_k,             // batch_size + 1
                const int64_t max_seqlen_q,
                const int64_t max_seqlen_k,
                const double p_dropout,
                const double softmax_scale,
-               bool is_causal,
                const int64_t window_size_left,
                const int64_t window_size_right,
                const double softcap,
                const bool return_softmax,
-               const c10::optional<at::Generator> gen_) {
     auto gen = gen_.value_or(at::cuda::detail::getDefaultCUDAGenerator());
     // Prepare the optional arguments as non-const references.
     std::optional<at::Tensor> out = out_.has_value() ? std::optional<at::Tensor>(const_cast<at::Tensor &>(out_.value())) : std::nullopt;
     if (!out.has_value()){
         out = torch::empty_like(q);
     }
     // Convert double to float and int64_t to int.
     float p_dropout_float = static_cast<float>(p_dropout);
     float softmax_scale_float = static_cast<float>(softmax_scale);
     float softcap_float = static_cast<float>(softcap);
     int window_size_left_int = static_cast<int>(window_size_left);
     int window_size_right_int = static_cast<int>(window_size_right);
-    return FLASH_NAMESPACE::mha_varlen_fwd(const_cast<at::Tensor &>(q), k, v, out, cu_seqlens_q, cu_seqlens_k, max_seqlen_q, max_seqlen_k, p_dropout_float, softmax_scale_float, is_causal, window_size_left_int, window_size_right_int, softcap_float, return_softmax, gen);
 }
 std::vector<at::Tensor>
@@ -1570,7 +1586,7 @@ mha_bwd(const at::Tensor &dout,                         // batch_size x seqlen_q
         std::optional<at::Tensor> &rng_state) {
     auto gen = gen_.value_or(at::cuda::detail::getDefaultCUDAGenerator());
     // Prepare the optional arguments as non-const references.
     std::optional<at::Tensor> dq = dq_.has_value() ? std::optional<at::Tensor>(const_cast<at::Tensor &>(dq_.value())) : std::nullopt;
     std::optional<at::Tensor> dk = dk_.has_value() ? std::optional<at::Tensor>(const_cast<at::Tensor &>(dk_.value())) : std::nullopt;
@@ -1584,7 +1600,15 @@ mha_bwd(const at::Tensor &dout,                         // batch_size x seqlen_q
     int window_size_left_int = static_cast<int>(window_size_left);
     int window_size_right_int = static_cast<int>(window_size_right);
-    return FLASH_NAMESPACE::mha_bwd(const_cast<at::Tensor &>(dout), q, k, v, out, softmax_lse, dq, dk, dv, alibi_slopes, p_dropout_float, softmax_scale_float, is_causal, window_size_left_int, window_size_right_int, softcap_float, deterministic, gen, rng_state);
 }
@@ -1595,12 +1619,17 @@ mha_varlen_bwd(const at::Tensor &dout,                  // batch_size x seqlen_q
                const at::Tensor &v,                     // batch_size x seqlen_k x num_heads_k x head_size
                const at::Tensor &out,                   // batch_size x seqlen_q x num_heads x head_size
                const at::Tensor &softmax_lse,           // b x h x seqlen_q
                const at::Tensor &cu_seqlens_q,          // batch_size + 1
                const at::Tensor &cu_seqlens_k,          // batch_size + 1
                const int64_t max_seqlen_q,
                const int64_t max_seqlen_k,
                const double p_dropout,
                const double softmax_scale,
                const bool is_causal,
                const int64_t window_size_left,
                const int64_t window_size_right,
@@ -1608,17 +1637,36 @@ mha_varlen_bwd(const at::Tensor &dout,                  // batch_size x seqlen_q
                const bool deterministic,
                std::optional<at::Generator> gen_,
                std::optional<at::Tensor> &rng_state) {
     auto gen = gen_.value_or(at::cuda::detail::getDefaultCUDAGenerator());
     // Convert double to float and int64_t to int.
     float p_dropout_float = static_cast<float>(p_dropout);
     float softmax_scale_float = static_cast<float>(softmax_scale);
     float softcap_float = static_cast<float>(softcap);
     int window_size_left_int = static_cast<int>(window_size_left);
     int window_size_right_int = static_cast<int>(window_size_right);
-    return FLASH_NAMESPACE::mha_varlen_bwd(const_cast<at::Tensor &>(dout), q, k, v, out, softmax_lse, cu_seqlens_q, cu_seqlens_k, max_seqlen_q, max_seqlen_k, p_dropout_float, softmax_scale_float, is_causal, window_size_left_int, window_size_right_int, softcap_float, deterministic, gen, rng_state);
 }
 std::vector<at::Tensor>
@@ -1643,25 +1691,47 @@ mha_fwd_kvcache(const at::Tensor &q,                                    // batch
                 bool is_rotary_interleaved,   // if true, rotary combines indices 0 & 1, else indices 0 & rotary_dim / 2
                 const int64_t num_splits
                 ) {
-    // Prepare the optional arguments as non-const references.
-    std::optional<at::Tensor> k = k_.has_value() ? std::optional<at::Tensor>(const_cast<at::Tensor &>(k_.value())) : std::nullopt;
-    std::optional<at::Tensor> v = v_.has_value() ? std::optional<at::Tensor>(const_cast<at::Tensor &>(v_.value())) : std::nullopt;
-    std::optional<at::Tensor> seqlens_k = seqlens_k_.has_value() ? std::optional<at::Tensor>(const_cast<at::Tensor &>(seqlens_k_.value())) : std::nullopt;
-    std::optional<at::Tensor> rotary_cos = rotary_cos_.has_value() ? std::optional<at::Tensor>(const_cast<at::Tensor &>(rotary_cos_.value())) : std::nullopt;
-    std::optional<at::Tensor> rotary_sin = rotary_sin_.has_value() ? std::optional<at::Tensor>(const_cast<at::Tensor &>(rotary_sin_.value())) : std::nullopt;
-    std::optional<at::Tensor> cache_batch_idx = cache_batch_idx_.has_value() ? std::optional<at::Tensor>(const_cast<at::Tensor &>(cache_batch_idx_.value())) : std::nullopt;
-    std::optional<at::Tensor> leftpad_k = leftpad_k_.has_value() ? std::optional<at::Tensor>(const_cast<at::at::Tensor &>(leftpad_k_.value())) : std::nullopt;
     std::optional<at::Tensor> block_table = block_table_.has_value() ? std::optional<at::Tensor>(const_cast<at::Tensor &>(block_table_.value())) : std::nullopt;
     std::optional<at::Tensor> alibi_slopes = alibi_slopes_.has_value() ? std::optional<at::Tensor>(const_cast<at::Tensor &>(alibi_slopes_.value())) : std::nullopt;
     std::optional<at::Tensor> out = out_.has_value() ? std::optional<at::Tensor>(const_cast<at::Tensor &>(out_.value())) : std::nullopt;
     // Convert double to float and int64_t to int.
     float softmax_scale_float = static_cast<float>(softmax_scale);
     float softcap_float = static_cast<float>(softcap);
     int window_size_left_int = static_cast<int>(window_size_left);
     int window_size_right_int = static_cast<int>(window_size_right);
     int num_splits_int = static_cast<int>(num_splits);
-    return FLASH_NAMESPACE::mha_fwd_kvcache(const_cast<at::Tensor &>(q), kcache, vcache, k, v, seqlens_k, rotary_cos, rotary_sin, cache_batch_idx, leftpad_k, block_table, alibi_slopes, out, softmax_scale_float, is_causal, window_size_left_int, window_size_right_int, softcap_float, is_rotary_interleaved, num_splits_int);
 }

     float softcap_float = static_cast<float>(softcap);
     int window_size_left_int = static_cast<int>(window_size_left);
     int window_size_right_int = static_cast<int>(window_size_right);
     return FLASH_NAMESPACE::mha_fwd(const_cast<at::Tensor &>(q), k, v, out, alibi_slopes, p_dropout_float, softmax_scale_float, is_causal, window_size_left_int, window_size_right_int, softcap_float, return_softmax, gen);
 }
 std::vector<at::Tensor>
+mha_varlen_fwd(at::Tensor &q,  // total_q x num_heads x head_size, total_q := \sum_{i=0}^{b} s_i
+               const at::Tensor &k,  // total_k x num_heads_k x head_size, total_k := \sum_{i=0}^{b} s_i or num_blocks x page_block_size x num_heads_k x head_>
+               const at::Tensor &v,  // total_k x num_heads_k x head_size, total_k := \sum_{i=0}^{b} s_i or num_blocks x page_block_size x num_heads_k x head_>
+               const std::optional<at::Tensor> &out_, // total_q x num_heads x head_size, total_k := \sum_{i=0}^{b} s_i
+               const at::Tensor &cu_seqlens_q,  // b+1
+               const at::Tensor &cu_seqlens_k,  // b+1
+               const std::optional<at::Tensor> &seqused_k_, // b. If given, only this many elements of each batch element's keys are used.
+               const std::optional<const at::Tensor> &leftpad_k_, // batch_size
+               const std::optional<at::Tensor> &block_table_, // batch_size x max_num_blocks_per_seq
+               const std::optional<at::Tensor> &alibi_slopes_, // num_heads or b x num_heads
                const int64_t max_seqlen_q,
                const int64_t max_seqlen_k,
                const double p_dropout,
                const double softmax_scale,
+               const bool zero_tensors,
+               const bool is_causal,
                const int64_t window_size_left,
                const int64_t window_size_right,
                const double softcap,
                const bool return_softmax,
+               const std::optional<at::Generator> gen_) {
     auto gen = gen_.value_or(at::cuda::detail::getDefaultCUDAGenerator());
     // Prepare the optional arguments as non-const references.
     std::optional<at::Tensor> out = out_.has_value() ? std::optional<at::Tensor>(const_cast<at::Tensor &>(out_.value())) : std::nullopt;
+    std::optional<at::Tensor> seqused_k = seqused_k_.has_value() ? std::optional<at::Tensor>(const_cast<at::Tensor &>(seqused_k_.value())) : std::nullopt;
+    std::optional<const at::Tensor> leftpad_k = leftpad_k_.has_value() ? std::optional<const at::Tensor>(leftpad_k_.value()) : std::nullopt;
+    std::optional<at::Tensor> block_table = block_table_.has_value() ? std::optional<at::Tensor>(const_cast<at::Tensor &>(block_table_.value())) : std::nullopt;
+    std::optional<at::Tensor> alibi_slopes = alibi_slopes_.has_value() ? std::optional<at::Tensor>(const_cast<at::Tensor &>(alibi_slopes_.value())) : std::nullopt;
     if (!out.has_value()){
         out = torch::empty_like(q);
     }
     // Convert double to float and int64_t to int.
     float p_dropout_float = static_cast<float>(p_dropout);
     float softmax_scale_float = static_cast<float>(softmax_scale);
     float softcap_float = static_cast<float>(softcap);
+    int max_seqlen_q_int = static_cast<int>(max_seqlen_q);
+    int max_seqlen_k_int = static_cast<int>(max_seqlen_k);
     int window_size_left_int = static_cast<int>(window_size_left);
     int window_size_right_int = static_cast<int>(window_size_right);
+    return FLASH_NAMESPACE::mha_varlen_fwd(
+        const_cast<at::Tensor &>(q), k, v, out,
+        cu_seqlens_q, cu_seqlens_k,
+        seqused_k, leftpad_k, block_table, alibi_slopes,
+        max_seqlen_q_int, max_seqlen_k_int,
+        p_dropout_float, softmax_scale_float,
+        zero_tensors, is_causal,
+        window_size_left_int, window_size_right_int,
+        softcap_float, return_softmax, gen);
 }
 std::vector<at::Tensor>
         std::optional<at::Tensor> &rng_state) {
     auto gen = gen_.value_or(at::cuda::detail::getDefaultCUDAGenerator());
     // Prepare the optional arguments as non-const references.
     std::optional<at::Tensor> dq = dq_.has_value() ? std::optional<at::Tensor>(const_cast<at::Tensor &>(dq_.value())) : std::nullopt;
     std::optional<at::Tensor> dk = dk_.has_value() ? std::optional<at::Tensor>(const_cast<at::Tensor &>(dk_.value())) : std::nullopt;
     int window_size_left_int = static_cast<int>(window_size_left);
     int window_size_right_int = static_cast<int>(window_size_right);
+    return FLASH_NAMESPACE::mha_bwd(
+        const_cast<at::Tensor &>(dout),
+        q, k, v, out, softmax_lse,
+        dq, dk, dv, alibi_slopes,
+        p_dropout_float, softmax_scale_float,
+        is_causal,
+        window_size_left_int, window_size_right_int,
+        softcap_float, deterministic,
+        gen, rng_state);
 }
                const at::Tensor &v,                     // batch_size x seqlen_k x num_heads_k x head_size
                const at::Tensor &out,                   // batch_size x seqlen_q x num_heads x head_size
                const at::Tensor &softmax_lse,           // b x h x seqlen_q
+               const std::optional<at::Tensor> &dq_,    // batch_size x seqlen_q x num_heads x head_size
+               const std::optional<at::Tensor> &dk_,    // batch_size x seqlen_k x num_heads_k x head_size
+               const std::optional<at::Tensor> &dv_,    // batch_size x seqlen_k x num_heads_k x head_size
                const at::Tensor &cu_seqlens_q,          // batch_size + 1
                const at::Tensor &cu_seqlens_k,          // batch_size + 1
+               const std::optional<at::Tensor> &alibi_slopes_, // num_heads or b x num_heads
                const int64_t max_seqlen_q,
                const int64_t max_seqlen_k,
                const double p_dropout,
                const double softmax_scale,
+               const bool zero_tensors,
                const bool is_causal,
                const int64_t window_size_left,
                const int64_t window_size_right,
                const bool deterministic,
                std::optional<at::Generator> gen_,
                std::optional<at::Tensor> &rng_state) {
     auto gen = gen_.value_or(at::cuda::detail::getDefaultCUDAGenerator());
+    // Prepare the optional arguments as non-const references.
+    std::optional<at::Tensor> dq = dq_.has_value() ? std::optional<at::Tensor>(const_cast<at::Tensor &>(dq_.value())) : std::nullopt;
+    std::optional<at::Tensor> dk = dk_.has_value() ? std::optional<at::Tensor>(const_cast<at::Tensor &>(dk_.value())) : std::nullopt;
+    std::optional<at::Tensor> dv = dv_.has_value() ? std::optional<at::Tensor>(const_cast<at::Tensor &>(dv_.value())) : std::nullopt;
+    std::optional<at::Tensor> alibi_slopes = alibi_slopes_.has_value() ? std::optional<at::Tensor>(const_cast<at::Tensor &>(alibi_slopes_.value())) : std::nullopt;
     // Convert double to float and int64_t to int.
     float p_dropout_float = static_cast<float>(p_dropout);
     float softmax_scale_float = static_cast<float>(softmax_scale);
     float softcap_float = static_cast<float>(softcap);
+    int max_seqlen_q_int = static_cast<int>(max_seqlen_q);
+    int max_seqlen_k_int = static_cast<int>(max_seqlen_k);
     int window_size_left_int = static_cast<int>(window_size_left);
     int window_size_right_int = static_cast<int>(window_size_right);
+    return FLASH_NAMESPACE::mha_varlen_bwd(
+        const_cast<at::Tensor &>(dout),
+        q, k, v, out, softmax_lse,
+        dq, dk, dv,
+        cu_seqlens_q, cu_seqlens_k,
+        alibi_slopes,
+        max_seqlen_q_int, max_seqlen_k_int,
+        p_dropout_float, softmax_scale_float,
+        zero_tensors, is_causal,
+        window_size_left_int, window_size_right_int,
+        softcap_float, deterministic,
+        gen, rng_state);
 }
 std::vector<at::Tensor>
                 bool is_rotary_interleaved,   // if true, rotary combines indices 0 & 1, else indices 0 & rotary_dim / 2
                 const int64_t num_splits
                 ) {
+    // Prepare the optional arguments as const references where needed
+    std::optional<const at::Tensor> k = k_.has_value() ? std::optional<const at::Tensor>(k_.value()) : std::nullopt;
+    std::optional<const at::Tensor> v = v_.has_value() ? std::optional<const at::Tensor>(v_.value()) : std::nullopt;
+    std::optional<const at::Tensor> seqlens_k = seqlens_k_.has_value() ? std::optional<const at::Tensor>(seqlens_k_.value()) : std::nullopt;
+    std::optional<const at::Tensor> rotary_cos = rotary_cos_.has_value() ? std::optional<const at::Tensor>(rotary_cos_.value()) : std::nullopt;
+    std::optional<const at::Tensor> rotary_sin = rotary_sin_.has_value() ? std::optional<const at::Tensor>(rotary_sin_.value()) : std::nullopt;
+    std::optional<const at::Tensor> cache_batch_idx = cache_batch_idx_.has_value() ? std::optional<const at::Tensor>(cache_batch_idx_.value()) : std::nullopt;
+    std::optional<const at::Tensor> leftpad_k = leftpad_k_.has_value() ? std::optional<const at::Tensor>(leftpad_k_.value()) : std::nullopt;
+    // For non-const tensors
     std::optional<at::Tensor> block_table = block_table_.has_value() ? std::optional<at::Tensor>(const_cast<at::Tensor &>(block_table_.value())) : std::nullopt;
     std::optional<at::Tensor> alibi_slopes = alibi_slopes_.has_value() ? std::optional<at::Tensor>(const_cast<at::Tensor &>(alibi_slopes_.value())) : std::nullopt;
     std::optional<at::Tensor> out = out_.has_value() ? std::optional<at::Tensor>(const_cast<at::Tensor &>(out_.value())) : std::nullopt;
+    if (!out.has_value()){
+        out = torch::empty_like(q);
+    }
     // Convert double to float and int64_t to int.
     float softmax_scale_float = static_cast<float>(softmax_scale);
     float softcap_float = static_cast<float>(softcap);
     int window_size_left_int = static_cast<int>(window_size_left);
     int window_size_right_int = static_cast<int>(window_size_right);
     int num_splits_int = static_cast<int>(num_splits);
+    return FLASH_NAMESPACE::mha_fwd_kvcache(
+        const_cast<at::Tensor &>(q),
+        kcache, vcache,
+        k, v,
+        seqlens_k,
+        rotary_cos, rotary_sin,
+        cache_batch_idx,
+        leftpad_k,
+        block_table, alibi_slopes,
+        out,
+        softmax_scale_float,
+        is_causal,
+        window_size_left_int, window_size_right_int,
+        softcap_float,
+        is_rotary_interleaved,
+        num_splits_int
+    );
 }