Kernels
kernels-bot commited on
Commit
6502310
·
verified ·
1 Parent(s): 1fb71e7

Uploaded using `kernel-builder`.

Browse files
build/torch-cuda/_ops.py CHANGED
@@ -22,7 +22,7 @@ def get_backend() -> str:
22
 
23
  def _find_ops_name() -> str:
24
  kernel_name = "finegrained_fp8"
25
- unique_id = "bbc7e0f"
26
  backend = get_backend()
27
  return f"_{kernel_name}_{backend}_{unique_id}"
28
 
 
22
 
23
  def _find_ops_name() -> str:
24
  kernel_name = "finegrained_fp8"
25
+ unique_id = "7c5619e"
26
  backend = get_backend()
27
  return f"_{kernel_name}_{backend}_{unique_id}"
28
 
build/torch-cuda/batched.py CHANGED
@@ -103,7 +103,13 @@ def w8a8_block_fp8_matmul_batched_kernel(
103
  offs_cm = tl.arange(0, BLOCK_SIZE_M)
104
  offs_cn = pid_n * BLOCK_SIZE_N + tl.arange(0, BLOCK_SIZE_N)
105
  c_ptrs = C + offs_cm[:, None] * 0 + stride_cn * offs_cn[None, :]
106
- tl.store(c_ptrs, c)
 
 
 
 
 
 
107
 
108
 
109
  @triton.autotune(
@@ -181,7 +187,9 @@ def w8a8_tensor_fp8_matmul_batched_kernel(
181
  offs_cm = tl.arange(0, BLOCK_SIZE_M)
182
  offs_cn = pid_n * BLOCK_SIZE_N + tl.arange(0, BLOCK_SIZE_N)
183
  c_ptrs = C + offs_cm[:, None] * 0 + stride_cn * offs_cn[None, :]
184
- tl.store(c_ptrs, c)
 
 
185
 
186
 
187
  @triton_op("finegrained_fp8::w8a8_block_fp8_matmul_batched", mutates_args=())
 
103
  offs_cm = tl.arange(0, BLOCK_SIZE_M)
104
  offs_cn = pid_n * BLOCK_SIZE_N + tl.arange(0, BLOCK_SIZE_N)
105
  c_ptrs = C + offs_cm[:, None] * 0 + stride_cn * offs_cn[None, :]
106
+ # Fake-batch trick aliases all BLOCK_SIZE_M lanes to the same C row, so emitting
107
+ # `tl.store(c_ptrs, c)` issues BLOCK_SIZE_M duplicate-address stores. On NVIDIA
108
+ # WGMMA this is usually benign (last-write-wins of identical bytes), but on Intel
109
+ # XPU the duplicate-address store has hardware-undefined behavior and corrupts the
110
+ # output. Mask so only lane 0 stores — the (M, N) accumulator rows are
111
+ # mathematically identical (same A row × same B), so lane 0 holds the right value.
112
+ tl.store(c_ptrs, c, mask=(offs_cm == 0)[:, None])
113
 
114
 
115
  @triton.autotune(
 
187
  offs_cm = tl.arange(0, BLOCK_SIZE_M)
188
  offs_cn = pid_n * BLOCK_SIZE_N + tl.arange(0, BLOCK_SIZE_N)
189
  c_ptrs = C + offs_cm[:, None] * 0 + stride_cn * offs_cn[None, :]
190
+ # See block-FP8 kernel above: BLOCK_SIZE_M lanes alias the same C row;
191
+ # mask so only lane 0 stores to avoid hardware-undefined duplicate writes on XPU.
192
+ tl.store(c_ptrs, c, mask=(offs_cm == 0)[:, None])
193
 
194
 
195
  @triton_op("finegrained_fp8::w8a8_block_fp8_matmul_batched", mutates_args=())
build/torch-cuda/metadata.json CHANGED
@@ -1,6 +1,6 @@
1
  {
2
  "name": "finegrained-fp8",
3
- "id": "_finegrained_fp8_cuda_bbc7e0f",
4
  "version": 1,
5
  "license": "Apache-2.0",
6
  "python-depends": [],
 
1
  {
2
  "name": "finegrained-fp8",
3
+ "id": "_finegrained_fp8_cuda_7c5619e",
4
  "version": 1,
5
  "license": "Apache-2.0",
6
  "python-depends": [],