Update triton_flash_blocksparse_attn.py
Browse files
Wrap all applicable kernel-launch sections in ```with torch.cuda.device(q.device.index):``` to support multi-GPU execution.
- triton_flash_blocksparse_attn.py +64 -62
triton_flash_blocksparse_attn.py
CHANGED
|
@@ -992,37 +992,38 @@ def blocksparse_flash_attn_padded_fwd(
|
|
| 992 |
|
| 993 |
grid = (len(q_start_sids), n_heads)
|
| 994 |
|
| 995 |
-
|
| 996 |
-
|
| 997 |
-
|
| 998 |
-
|
| 999 |
-
|
| 1000 |
-
|
| 1001 |
-
|
| 1002 |
-
|
| 1003 |
-
|
| 1004 |
-
|
| 1005 |
-
|
| 1006 |
-
|
| 1007 |
-
|
| 1008 |
-
|
| 1009 |
-
|
| 1010 |
-
|
| 1011 |
-
|
| 1012 |
-
|
| 1013 |
-
|
| 1014 |
-
|
| 1015 |
-
|
| 1016 |
-
|
| 1017 |
-
|
| 1018 |
-
|
| 1019 |
-
|
| 1020 |
-
|
| 1021 |
-
|
| 1022 |
-
|
| 1023 |
-
|
| 1024 |
-
|
| 1025 |
-
|
|
|
|
| 1026 |
|
| 1027 |
return out
|
| 1028 |
|
|
@@ -1094,37 +1095,38 @@ def blocksparse_flash_attn_varlen_fwd(
|
|
| 1094 |
|
| 1095 |
grid = (len(q_start_sids), n_heads)
|
| 1096 |
|
| 1097 |
-
|
| 1098 |
-
|
| 1099 |
-
|
| 1100 |
-
|
| 1101 |
-
|
| 1102 |
-
|
| 1103 |
-
|
| 1104 |
-
|
| 1105 |
-
|
| 1106 |
-
|
| 1107 |
-
|
| 1108 |
-
|
| 1109 |
-
|
| 1110 |
-
|
| 1111 |
-
|
| 1112 |
-
|
| 1113 |
-
|
| 1114 |
-
|
| 1115 |
-
|
| 1116 |
-
|
| 1117 |
-
|
| 1118 |
-
|
| 1119 |
-
|
| 1120 |
-
|
| 1121 |
-
|
| 1122 |
-
|
| 1123 |
-
|
| 1124 |
-
|
| 1125 |
-
|
| 1126 |
-
|
| 1127 |
-
|
|
|
|
| 1128 |
|
| 1129 |
return out
|
| 1130 |
|
|
|
|
| 992 |
|
| 993 |
grid = (len(q_start_sids), n_heads)
|
| 994 |
|
| 995 |
+
with torch.cuda.device(q.device.index):
|
| 996 |
+
_fwd_kernel_batch_inference[grid](
|
| 997 |
+
q, k, v, out,
|
| 998 |
+
sm_scale,
|
| 999 |
+
q_batch_starts,
|
| 1000 |
+
q_batch_ends,
|
| 1001 |
+
k_batch_starts,
|
| 1002 |
+
k_batch_ends,
|
| 1003 |
+
q_batch_ids,
|
| 1004 |
+
q_start_sids,
|
| 1005 |
+
|
| 1006 |
+
*q.stride(),
|
| 1007 |
+
*k.stride(),
|
| 1008 |
+
*v.stride(),
|
| 1009 |
+
*out.stride(),
|
| 1010 |
+
|
| 1011 |
+
layout_crow_indices,
|
| 1012 |
+
layout_col_indices,
|
| 1013 |
+
*layout_crow_indices.stride(),
|
| 1014 |
+
*layout_col_indices.stride(),
|
| 1015 |
+
|
| 1016 |
+
q_k_ratio,
|
| 1017 |
+
HAS_BATCH_DIM = True,
|
| 1018 |
+
D_HEAD = head_size,
|
| 1019 |
+
BLOCK_M = block_size,
|
| 1020 |
+
BLOCK_N = block_size,
|
| 1021 |
+
BLOCK_D = block_d,
|
| 1022 |
+
BLOCK_M_LOADING = 16 if q_len == 1 else block_size, # smaller for decoding
|
| 1023 |
+
EVEN_D = block_d == head_size,
|
| 1024 |
+
num_warps = 1 if q_len == 1 else 4,
|
| 1025 |
+
num_stages = 3
|
| 1026 |
+
)
|
| 1027 |
|
| 1028 |
return out
|
| 1029 |
|
|
|
|
| 1095 |
|
| 1096 |
grid = (len(q_start_sids), n_heads)
|
| 1097 |
|
| 1098 |
+
with torch.cuda.device(q.device.index):
|
| 1099 |
+
_fwd_kernel_batch_inference[grid](
|
| 1100 |
+
q, k, v, out,
|
| 1101 |
+
sm_scale,
|
| 1102 |
+
cu_seqlens_q[:-1],
|
| 1103 |
+
cu_seqlens_q[1:],
|
| 1104 |
+
cu_seqlens_k[:-1],
|
| 1105 |
+
cu_seqlens_k[1:],
|
| 1106 |
+
q_batch_ids,
|
| 1107 |
+
q_start_sids,
|
| 1108 |
+
|
| 1109 |
+
0, *q.stride(),
|
| 1110 |
+
0, *k.stride(),
|
| 1111 |
+
0, *v.stride(),
|
| 1112 |
+
0, *out.stride(),
|
| 1113 |
+
|
| 1114 |
+
layout_crow_indices,
|
| 1115 |
+
layout_col_indices,
|
| 1116 |
+
*layout_crow_indices.stride(),
|
| 1117 |
+
*layout_col_indices.stride(),
|
| 1118 |
+
|
| 1119 |
+
q_k_ratio,
|
| 1120 |
+
HAS_BATCH_DIM = False,
|
| 1121 |
+
D_HEAD = head_size,
|
| 1122 |
+
BLOCK_M = block_size,
|
| 1123 |
+
BLOCK_N = block_size,
|
| 1124 |
+
BLOCK_D = block_d,
|
| 1125 |
+
BLOCK_M_LOADING = 16 if decoding_only else block_size, # smaller for decoding
|
| 1126 |
+
EVEN_D = block_d == head_size,
|
| 1127 |
+
num_warps = 1 if decoding_only else 4,
|
| 1128 |
+
num_stages = 3
|
| 1129 |
+
)
|
| 1130 |
|
| 1131 |
return out
|
| 1132 |
|