Instructions to use kernels-community/finegrained-fp8 with libraries, inference providers, notebooks, and local apps. Follow these links to get started.
- Libraries
- Kernels
How to use kernels-community/finegrained-fp8 with Kernels:
# !pip install kernels from kernels import get_kernel kernel = get_kernel("kernels-community/finegrained-fp8") - Notebooks
- Google Colab
- Kaggle
Uploaded using `kernel-builder`.
Browse files- build/torch-cuda/_ops.py +1 -1
- build/torch-cuda/batched.py +10 -2
- build/torch-cuda/metadata.json +1 -1
build/torch-cuda/_ops.py
CHANGED
|
@@ -22,7 +22,7 @@ def get_backend() -> str:
|
|
| 22 |
|
| 23 |
def _find_ops_name() -> str:
|
| 24 |
kernel_name = "finegrained_fp8"
|
| 25 |
-
unique_id = "
|
| 26 |
backend = get_backend()
|
| 27 |
return f"_{kernel_name}_{backend}_{unique_id}"
|
| 28 |
|
|
|
|
| 22 |
|
| 23 |
def _find_ops_name() -> str:
|
| 24 |
kernel_name = "finegrained_fp8"
|
| 25 |
+
unique_id = "7c5619e"
|
| 26 |
backend = get_backend()
|
| 27 |
return f"_{kernel_name}_{backend}_{unique_id}"
|
| 28 |
|
build/torch-cuda/batched.py
CHANGED
|
@@ -103,7 +103,13 @@ def w8a8_block_fp8_matmul_batched_kernel(
|
|
| 103 |
offs_cm = tl.arange(0, BLOCK_SIZE_M)
|
| 104 |
offs_cn = pid_n * BLOCK_SIZE_N + tl.arange(0, BLOCK_SIZE_N)
|
| 105 |
c_ptrs = C + offs_cm[:, None] * 0 + stride_cn * offs_cn[None, :]
|
| 106 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 107 |
|
| 108 |
|
| 109 |
@triton.autotune(
|
|
@@ -181,7 +187,9 @@ def w8a8_tensor_fp8_matmul_batched_kernel(
|
|
| 181 |
offs_cm = tl.arange(0, BLOCK_SIZE_M)
|
| 182 |
offs_cn = pid_n * BLOCK_SIZE_N + tl.arange(0, BLOCK_SIZE_N)
|
| 183 |
c_ptrs = C + offs_cm[:, None] * 0 + stride_cn * offs_cn[None, :]
|
| 184 |
-
|
|
|
|
|
|
|
| 185 |
|
| 186 |
|
| 187 |
@triton_op("finegrained_fp8::w8a8_block_fp8_matmul_batched", mutates_args=())
|
|
|
|
| 103 |
offs_cm = tl.arange(0, BLOCK_SIZE_M)
|
| 104 |
offs_cn = pid_n * BLOCK_SIZE_N + tl.arange(0, BLOCK_SIZE_N)
|
| 105 |
c_ptrs = C + offs_cm[:, None] * 0 + stride_cn * offs_cn[None, :]
|
| 106 |
+
# Fake-batch trick aliases all BLOCK_SIZE_M lanes to the same C row, so emitting
|
| 107 |
+
# `tl.store(c_ptrs, c)` issues BLOCK_SIZE_M duplicate-address stores. On NVIDIA
|
| 108 |
+
# WGMMA this is usually benign (last-write-wins of identical bytes), but on Intel
|
| 109 |
+
# XPU the duplicate-address store has hardware-undefined behavior and corrupts the
|
| 110 |
+
# output. Mask so only lane 0 stores — the (M, N) accumulator rows are
|
| 111 |
+
# mathematically identical (same A row × same B), so lane 0 holds the right value.
|
| 112 |
+
tl.store(c_ptrs, c, mask=(offs_cm == 0)[:, None])
|
| 113 |
|
| 114 |
|
| 115 |
@triton.autotune(
|
|
|
|
| 187 |
offs_cm = tl.arange(0, BLOCK_SIZE_M)
|
| 188 |
offs_cn = pid_n * BLOCK_SIZE_N + tl.arange(0, BLOCK_SIZE_N)
|
| 189 |
c_ptrs = C + offs_cm[:, None] * 0 + stride_cn * offs_cn[None, :]
|
| 190 |
+
# See block-FP8 kernel above: BLOCK_SIZE_M lanes alias the same C row;
|
| 191 |
+
# mask so only lane 0 stores to avoid hardware-undefined duplicate writes on XPU.
|
| 192 |
+
tl.store(c_ptrs, c, mask=(offs_cm == 0)[:, None])
|
| 193 |
|
| 194 |
|
| 195 |
@triton_op("finegrained_fp8::w8a8_block_fp8_matmul_batched", mutates_args=())
|
build/torch-cuda/metadata.json
CHANGED
|
@@ -1,6 +1,6 @@
|
|
| 1 |
{
|
| 2 |
"name": "finegrained-fp8",
|
| 3 |
-
"id": "
|
| 4 |
"version": 1,
|
| 5 |
"license": "Apache-2.0",
|
| 6 |
"python-depends": [],
|
|
|
|
| 1 |
{
|
| 2 |
"name": "finegrained-fp8",
|
| 3 |
+
"id": "_finegrained_fp8_cuda_7c5619e",
|
| 4 |
"version": 1,
|
| 5 |
"license": "Apache-2.0",
|
| 6 |
"python-depends": [],
|