liangsu9988 committed on
Commit
b9f8d14
·
verified ·
1 Parent(s): 85d7854

Uploaded using `kernel-builder`.

Browse files
benchmarks/benchmark_nvfp4_sf_reshape.py ADDED
@@ -0,0 +1,127 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import torch
2
+
3
+ from kernels.benchmark import Benchmark
4
+
5
+
6
+ LAYOUT_SHAPES = [
7
+ ("rows1_d4096", 1, 4096),
8
+ ("rows2_d4096", 2, 4096),
9
+ ("rows31_d4096", 31, 4096),
10
+ ("rows32_d4096", 32, 4096),
11
+ ("rows33_d4096", 33, 4096),
12
+ ("rows127_d4096", 127, 4096),
13
+ ("rows128_d4096", 128, 4096),
14
+ ("rows129_d4096", 129, 4096),
15
+ ("rows16_d1024", 16, 1024),
16
+ ("rows16_d2048", 16, 2048),
17
+ ("rows16_d8192", 16, 8192),
18
+ ("rows16_d12288", 16, 12288),
19
+ ("rows64_d16384", 64, 16384),
20
+ ]
21
+
22
+
23
def _reference_swizzle(scales: torch.Tensor) -> torch.Tensor:
    """Build the swizzled scale-factor layout on the CPU as a reference.

    The 2-D ``scales`` tensor of shape ``(rows, n_blocks)`` is scattered into
    a flat ``torch.uint8`` buffer made of 512-byte tiles. Each tile covers
    128 rows by 4 scale columns; tiles are ordered row-tile major. Inside a
    tile, the byte for local row ``ri`` and local column ``ci`` is stored at
    ``(ri % 32) * 16 + (ri // 32) * 4 + ci``. Bytes that no input element
    maps to stay zero.

    Args:
        scales: 2-D tensor of scale bytes on any device.

    Returns:
        A flat ``torch.uint8`` tensor of
        ``ceil(rows / 128) * ceil(n_blocks / 4) * 512`` bytes, placed on the
        same device as ``scales``.
    """
    rows, n_blocks = scales.shape
    n_col_super = (n_blocks + 3) // 4
    n_row_super = (rows + 127) // 128
    src = scales.cpu()
    out = torch.zeros(n_row_super * n_col_super * 512, dtype=torch.uint8)

    # Compute every destination offset at once instead of issuing one tensor
    # write per element from a Python double loop.
    row = torch.arange(rows, dtype=torch.int64).unsqueeze(1)
    blk = torch.arange(n_blocks, dtype=torch.int64).unsqueeze(0)
    ri = row % 128
    super_idx = (row // 128) * n_col_super + blk // 4
    inner_off = (ri % 32) * 16 + (ri // 32) * 4 + blk % 4
    dest = (super_idx * 512 + inner_off).reshape(-1)

    # The mapping is one-to-one, so the scatter has no write conflicts.
    # Indexed assignment needs matching dtypes, hence the explicit cast.
    out[dest] = src.reshape(-1).to(torch.uint8)
    return out.to(scales.device)
41
+
42
+
43
class Nvfp4ScaleFactorReshapeBenchmark(Benchmark):
    """Benchmark cases for ``nvfp4_sf_linear_to_swizzled``.

    Each case is a ``setup_<label>`` / ``benchmark_<label>`` /
    ``verify_<label>`` triple. ``setup`` allocates the input and output
    tensors, ``benchmark`` runs the kernel into ``self.out``, and ``verify``
    returns the CPU reference layout.

    NOTE(review): presumably the ``Benchmark`` base class compares the value
    returned by ``verify_*`` against ``self.out`` -- confirm against
    ``kernels.benchmark``.

    NOTE: ``_register_layout_shapes()`` later re-binds the same attribute
    names, since every label written out below also appears in
    ``LAYOUT_SHAPES``.
    """

    seed = 7

    def _setup_shape(self, rows: int, D: int) -> None:
        """Allocate random input scales and a zeroed output buffer."""
        n_blocks = D // 16
        n_col_super = (n_blocks + 3) // 4
        n_row_super = (rows + 127) // 128
        self.scales = torch.randint(
            0,
            256,
            (rows, n_blocks),
            device=self.device,
            dtype=torch.uint8,
        )
        # One 512-byte tile per (128 rows x 4 scale columns) group.
        self.out = torch.zeros(
            n_row_super * n_col_super * 512,
            device=self.device,
            dtype=torch.uint8,
        )

    def _reference(self):
        """Return the reference swizzled layout for the current input."""
        return _reference_swizzle(self.scales)

    # --- rows=1, D=4096 ---
    def setup_rows1_d4096(self):
        self._setup_shape(1, 4096)

    def benchmark_rows1_d4096(self):
        self.kernel.nvfp4_sf_linear_to_swizzled(self.scales, out=self.out)

    def verify_rows1_d4096(self):
        return self._reference()

    # --- rows=16, D=12288 ---
    def setup_rows16_d12288(self):
        self._setup_shape(16, 12288)

    def benchmark_rows16_d12288(self):
        self.kernel.nvfp4_sf_linear_to_swizzled(self.scales, out=self.out)

    def verify_rows16_d12288(self):
        return self._reference()

    # --- rows=64, D=16384 ---
    def setup_rows64_d16384(self):
        self._setup_shape(64, 16384)

    def benchmark_rows64_d16384(self):
        self.kernel.nvfp4_sf_linear_to_swizzled(self.scales, out=self.out)

    def verify_rows64_d16384(self):
        return self._reference()

    # --- rows=128, D=4096 ---
    def setup_rows128_d4096(self):
        self._setup_shape(128, 4096)

    def benchmark_rows128_d4096(self):
        self.kernel.nvfp4_sf_linear_to_swizzled(self.scales, out=self.out)

    def verify_rows128_d4096(self):
        return self._reference()

    # --- rows=129, D=4096 ---
    def setup_rows129_d4096(self):
        self._setup_shape(129, 4096)

    def benchmark_rows129_d4096(self):
        self.kernel.nvfp4_sf_linear_to_swizzled(self.scales, out=self.out)

    def verify_rows129_d4096(self):
        return self._reference()
108
+
109
+
110
def _register_layout_shapes() -> None:
    """Attach one setup/benchmark/verify triple per ``LAYOUT_SHAPES`` entry.

    For every ``(label, rows, D)`` entry, three functions are bound on
    ``Nvfp4ScaleFactorReshapeBenchmark`` as ``setup_<label>``,
    ``benchmark_<label>`` and ``verify_<label>``.
    """
    target = Nvfp4ScaleFactorReshapeBenchmark
    for label, rows, D in LAYOUT_SHAPES:

        # Bind the shape through default arguments so each closure keeps the
        # values of its own loop iteration.
        def setup(self, rows=rows, D=D) -> None:
            self._setup_shape(rows, D)

        def benchmark(self) -> None:
            self.kernel.nvfp4_sf_linear_to_swizzled(self.scales, out=self.out)

        def verify(self):
            return self._reference()

        for prefix, method in (
            ("setup", setup),
            ("benchmark", benchmark),
            ("verify", verify),
        ):
            setattr(target, f"{prefix}_{label}", method)
125
+
126
+
127
+ _register_layout_shapes()
build/torch211-cxx11-cu128-x86_64-linux/__init__.py ADDED
@@ -0,0 +1,55 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """FlashRT NVFP4 layout kernels."""
2
+
3
+ from __future__ import annotations
4
+
5
+ from typing import Optional
6
+
7
+ import torch
8
+
9
+ from ._ops import ops
10
+
11
+
12
def nvfp4_sf_swizzled_bytes(rows: int, D: int) -> int:
    """Return byte count for a CUTLASS Sm1xx NVFP4 swizzled SF buffer.

    The buffer holds one 512-byte tile for every group of up to 128 rows by
    up to 4 scale columns, where the number of scale columns is ``D // 16``.

    Raises:
        ValueError: if ``rows`` is not positive, or ``D`` is not a positive
            multiple of 16.
    """
    if rows <= 0:
        raise ValueError("rows must be positive")
    if D <= 0 or D % 16:
        raise ValueError("D must be positive and divisible by 16")

    scale_cols = D // 16
    # Round both tile counts up so partial tiles are still allocated.
    row_tiles = (rows + 127) // 128
    col_tiles = (scale_cols + 3) // 4
    return row_tiles * col_tiles * 512
23
+
24
+
25
def nvfp4_sf_linear_to_swizzled(
    scales: torch.Tensor,
    *,
    out: Optional[torch.Tensor] = None,
    is_sfb: bool = False,
) -> torch.Tensor:
    """Convert linear NVFP4 scale bytes to CUTLASS Sm1xx swizzled layout.

    ``scales`` must be contiguous CUDA ``torch.uint8`` with shape
    ``(rows, D / 16)``. If ``out`` is omitted, a flat zero-filled
    ``torch.uint8`` tensor with ``nvfp4_sf_swizzled_bytes(rows, D)`` bytes is
    allocated on the device of ``scales``.

    Only the rank of ``scales`` is checked here; the tensors, ``D`` and
    ``is_sfb`` are otherwise passed straight to the compiled op.

    Returns:
        The tensor that was handed to the op as output: ``out`` if given,
        otherwise the newly allocated buffer.

    Raises:
        ValueError: if ``scales`` is not 2-D, or if ``out`` is omitted and
            the shape is rejected by ``nvfp4_sf_swizzled_bytes``.
    """
    if scales.dim() != 2:
        raise ValueError("scales must have shape (rows, D / 16)")

    rows, n_blocks = scales.shape[0], scales.shape[1]
    D = n_blocks * 16

    if out is None:
        n_bytes = nvfp4_sf_swizzled_bytes(rows, D)
        out = torch.zeros((n_bytes,), device=scales.device, dtype=torch.uint8)

    ops.nvfp4_sf_linear_to_swizzled(scales, out, D, is_sfb)
    return out
50
+
51
+
52
+ __all__ = [
53
+ "nvfp4_sf_linear_to_swizzled",
54
+ "nvfp4_sf_swizzled_bytes",
55
+ ]
build/torch211-cxx11-cu128-x86_64-linux/_flashrt_nvfp4_cuda_e9a1fe0.abi3.so ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:695400c280dd6f770e837da25ff054f0caee876890f88aff1ab9fa300aa1daf8
3
+ size 95200
build/torch211-cxx11-cu128-x86_64-linux/_ops.py ADDED
@@ -0,0 +1,9 @@
 
 
 
 
 
 
 
 
 
 
1
+ import torch
2
+ from . import _flashrt_nvfp4_cuda_e9a1fe0
3
+ ops = torch.ops._flashrt_nvfp4_cuda_e9a1fe0
4
+
5
def add_op_namespace_prefix(op_name: str):
    """Return ``op_name`` qualified as ``<namespace>::<op_name>``."""
    namespace = "_flashrt_nvfp4_cuda_e9a1fe0"
    return f"{namespace}::{op_name}"
build/torch211-cxx11-cu128-x86_64-linux/flashrt_nvfp4/__init__.py ADDED
@@ -0,0 +1,26 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import ctypes
2
+ import importlib.util
3
+ import sys
4
+ from pathlib import Path
5
+ from types import ModuleType
6
+
7
+
8
def _import_from_path(file_path: Path) -> ModuleType:
    """Load the Python file at ``file_path`` as a uniquely named module.

    The module is registered in ``sys.modules`` under a name derived from the
    hash of its absolute path, executed, and returned.

    Raises:
        ImportError: if no import spec or loader can be built for the file.
        Exception: whatever the module body raises while executing; the
            partially initialised module is removed from ``sys.modules``
            before the exception propagates.
    """
    # We cannot use the module name as-is, after adding it to `sys.modules`,
    # it would also be used for other imports. So, we make a module name that
    # depends on the path for it to be unique using the hex-encoded hash of
    # the path.
    path_hash = "{:x}".format(ctypes.c_size_t(hash(file_path.absolute())).value)
    module_name = path_hash
    spec = importlib.util.spec_from_file_location(module_name, file_path)
    # A spec without a loader cannot be executed; report it as an import
    # failure rather than an AttributeError on `None`.
    if spec is None or spec.loader is None:
        raise ImportError(f"Cannot load spec for {module_name} from {file_path}")
    module = importlib.util.module_from_spec(spec)
    sys.modules[module_name] = module
    try:
        spec.loader.exec_module(module)
    except BaseException:
        # Do not leave a half-initialised module behind for later lookups.
        if sys.modules.get(module_name) is module:
            del sys.modules[module_name]
        raise
    return module
24
+
25
+
26
+ globals().update(vars(_import_from_path(Path(__file__).parent.parent / "__init__.py")))
build/torch211-cxx11-cu128-x86_64-linux/metadata.json ADDED
@@ -0,0 +1,13 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "name": "flashrt-nvfp4",
3
+ "id": "_flashrt_nvfp4_cuda_e9a1fe0",
4
+ "version": 1,
5
+ "license": "Apache-2.0",
6
+ "python-depends": [],
7
+ "backend": {
8
+ "type": "cuda",
9
+ "archs": [
10
+ "12.0"
11
+ ]
12
+ }
13
+ }
build/torch211-cxx11-cu130-x86_64-linux/__init__.py ADDED
@@ -0,0 +1,55 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """FlashRT NVFP4 layout kernels."""
2
+
3
+ from __future__ import annotations
4
+
5
+ from typing import Optional
6
+
7
+ import torch
8
+
9
+ from ._ops import ops
10
+
11
+
12
def nvfp4_sf_swizzled_bytes(rows: int, D: int) -> int:
    """Return byte count for a CUTLASS Sm1xx NVFP4 swizzled SF buffer.

    The buffer holds one 512-byte tile for every group of up to 128 rows by
    up to 4 scale columns, where the number of scale columns is ``D // 16``.

    Raises:
        ValueError: if ``rows`` is not positive, or ``D`` is not a positive
            multiple of 16.
    """
    if rows <= 0:
        raise ValueError("rows must be positive")
    if D <= 0 or D % 16:
        raise ValueError("D must be positive and divisible by 16")

    scale_cols = D // 16
    # Round both tile counts up so partial tiles are still allocated.
    row_tiles = (rows + 127) // 128
    col_tiles = (scale_cols + 3) // 4
    return row_tiles * col_tiles * 512
23
+
24
+
25
def nvfp4_sf_linear_to_swizzled(
    scales: torch.Tensor,
    *,
    out: Optional[torch.Tensor] = None,
    is_sfb: bool = False,
) -> torch.Tensor:
    """Convert linear NVFP4 scale bytes to CUTLASS Sm1xx swizzled layout.

    ``scales`` must be contiguous CUDA ``torch.uint8`` with shape
    ``(rows, D / 16)``. If ``out`` is omitted, a flat zero-filled
    ``torch.uint8`` tensor with ``nvfp4_sf_swizzled_bytes(rows, D)`` bytes is
    allocated on the device of ``scales``.

    Only the rank of ``scales`` is checked here; the tensors, ``D`` and
    ``is_sfb`` are otherwise passed straight to the compiled op.

    Returns:
        The tensor that was handed to the op as output: ``out`` if given,
        otherwise the newly allocated buffer.

    Raises:
        ValueError: if ``scales`` is not 2-D, or if ``out`` is omitted and
            the shape is rejected by ``nvfp4_sf_swizzled_bytes``.
    """
    if scales.dim() != 2:
        raise ValueError("scales must have shape (rows, D / 16)")

    rows, n_blocks = scales.shape[0], scales.shape[1]
    D = n_blocks * 16

    if out is None:
        n_bytes = nvfp4_sf_swizzled_bytes(rows, D)
        out = torch.zeros((n_bytes,), device=scales.device, dtype=torch.uint8)

    ops.nvfp4_sf_linear_to_swizzled(scales, out, D, is_sfb)
    return out
50
+
51
+
52
+ __all__ = [
53
+ "nvfp4_sf_linear_to_swizzled",
54
+ "nvfp4_sf_swizzled_bytes",
55
+ ]
build/torch211-cxx11-cu130-x86_64-linux/_flashrt_nvfp4_cuda_e9a1fe0.abi3.so ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:a29dd302ad9728a9bea7899650db2a9671cd63298138658f830c6c7776243229
3
+ size 100344
build/torch211-cxx11-cu130-x86_64-linux/_ops.py ADDED
@@ -0,0 +1,9 @@
 
 
 
 
 
 
 
 
 
 
1
+ import torch
2
+ from . import _flashrt_nvfp4_cuda_e9a1fe0
3
+ ops = torch.ops._flashrt_nvfp4_cuda_e9a1fe0
4
+
5
def add_op_namespace_prefix(op_name: str):
    """Return ``op_name`` qualified as ``<namespace>::<op_name>``."""
    namespace = "_flashrt_nvfp4_cuda_e9a1fe0"
    return f"{namespace}::{op_name}"
build/torch211-cxx11-cu130-x86_64-linux/flashrt_nvfp4/__init__.py ADDED
@@ -0,0 +1,26 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import ctypes
2
+ import importlib.util
3
+ import sys
4
+ from pathlib import Path
5
+ from types import ModuleType
6
+
7
+
8
def _import_from_path(file_path: Path) -> ModuleType:
    """Load the Python file at ``file_path`` as a uniquely named module.

    The module is registered in ``sys.modules`` under a name derived from the
    hash of its absolute path, executed, and returned.

    Raises:
        ImportError: if no import spec or loader can be built for the file.
        Exception: whatever the module body raises while executing; the
            partially initialised module is removed from ``sys.modules``
            before the exception propagates.
    """
    # We cannot use the module name as-is, after adding it to `sys.modules`,
    # it would also be used for other imports. So, we make a module name that
    # depends on the path for it to be unique using the hex-encoded hash of
    # the path.
    path_hash = "{:x}".format(ctypes.c_size_t(hash(file_path.absolute())).value)
    module_name = path_hash
    spec = importlib.util.spec_from_file_location(module_name, file_path)
    # A spec without a loader cannot be executed; report it as an import
    # failure rather than an AttributeError on `None`.
    if spec is None or spec.loader is None:
        raise ImportError(f"Cannot load spec for {module_name} from {file_path}")
    module = importlib.util.module_from_spec(spec)
    sys.modules[module_name] = module
    try:
        spec.loader.exec_module(module)
    except BaseException:
        # Do not leave a half-initialised module behind for later lookups.
        if sys.modules.get(module_name) is module:
            del sys.modules[module_name]
        raise
    return module
24
+
25
+
26
+ globals().update(vars(_import_from_path(Path(__file__).parent.parent / "__init__.py")))
build/torch211-cxx11-cu130-x86_64-linux/metadata.json ADDED
@@ -0,0 +1,13 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "name": "flashrt-nvfp4",
3
+ "id": "_flashrt_nvfp4_cuda_e9a1fe0",
4
+ "version": 1,
5
+ "license": "Apache-2.0",
6
+ "python-depends": [],
7
+ "backend": {
8
+ "type": "cuda",
9
+ "archs": [
10
+ "12.0"
11
+ ]
12
+ }
13
+ }
build/torch212-cxx11-cu130-x86_64-linux/__init__.py ADDED
@@ -0,0 +1,55 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """FlashRT NVFP4 layout kernels."""
2
+
3
+ from __future__ import annotations
4
+
5
+ from typing import Optional
6
+
7
+ import torch
8
+
9
+ from ._ops import ops
10
+
11
+
12
def nvfp4_sf_swizzled_bytes(rows: int, D: int) -> int:
    """Return byte count for a CUTLASS Sm1xx NVFP4 swizzled SF buffer.

    The buffer holds one 512-byte tile for every group of up to 128 rows by
    up to 4 scale columns, where the number of scale columns is ``D // 16``.

    Raises:
        ValueError: if ``rows`` is not positive, or ``D`` is not a positive
            multiple of 16.
    """
    if rows <= 0:
        raise ValueError("rows must be positive")
    if D <= 0 or D % 16:
        raise ValueError("D must be positive and divisible by 16")

    scale_cols = D // 16
    # Round both tile counts up so partial tiles are still allocated.
    row_tiles = (rows + 127) // 128
    col_tiles = (scale_cols + 3) // 4
    return row_tiles * col_tiles * 512
23
+
24
+
25
def nvfp4_sf_linear_to_swizzled(
    scales: torch.Tensor,
    *,
    out: Optional[torch.Tensor] = None,
    is_sfb: bool = False,
) -> torch.Tensor:
    """Convert linear NVFP4 scale bytes to CUTLASS Sm1xx swizzled layout.

    ``scales`` must be contiguous CUDA ``torch.uint8`` with shape
    ``(rows, D / 16)``. If ``out`` is omitted, a flat zero-filled
    ``torch.uint8`` tensor with ``nvfp4_sf_swizzled_bytes(rows, D)`` bytes is
    allocated on the device of ``scales``.

    Only the rank of ``scales`` is checked here; the tensors, ``D`` and
    ``is_sfb`` are otherwise passed straight to the compiled op.

    Returns:
        The tensor that was handed to the op as output: ``out`` if given,
        otherwise the newly allocated buffer.

    Raises:
        ValueError: if ``scales`` is not 2-D, or if ``out`` is omitted and
            the shape is rejected by ``nvfp4_sf_swizzled_bytes``.
    """
    if scales.dim() != 2:
        raise ValueError("scales must have shape (rows, D / 16)")

    rows, n_blocks = scales.shape[0], scales.shape[1]
    D = n_blocks * 16

    if out is None:
        n_bytes = nvfp4_sf_swizzled_bytes(rows, D)
        out = torch.zeros((n_bytes,), device=scales.device, dtype=torch.uint8)

    ops.nvfp4_sf_linear_to_swizzled(scales, out, D, is_sfb)
    return out
50
+
51
+
52
+ __all__ = [
53
+ "nvfp4_sf_linear_to_swizzled",
54
+ "nvfp4_sf_swizzled_bytes",
55
+ ]
build/torch212-cxx11-cu130-x86_64-linux/_flashrt_nvfp4_cuda_e9a1fe0.abi3.so ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:26f4c11dc535b3cce288d30dd0df4a6262031286fe39db48149fd25fa8165457
3
+ size 111104
build/torch212-cxx11-cu130-x86_64-linux/_ops.py ADDED
@@ -0,0 +1,9 @@
 
 
 
 
 
 
 
 
 
 
1
+ import torch
2
+ from . import _flashrt_nvfp4_cuda_e9a1fe0
3
+ ops = torch.ops._flashrt_nvfp4_cuda_e9a1fe0
4
+
5
def add_op_namespace_prefix(op_name: str):
    """Return ``op_name`` qualified as ``<namespace>::<op_name>``."""
    namespace = "_flashrt_nvfp4_cuda_e9a1fe0"
    return f"{namespace}::{op_name}"
build/torch212-cxx11-cu130-x86_64-linux/flashrt_nvfp4/__init__.py ADDED
@@ -0,0 +1,26 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import ctypes
2
+ import importlib.util
3
+ import sys
4
+ from pathlib import Path
5
+ from types import ModuleType
6
+
7
+
8
def _import_from_path(file_path: Path) -> ModuleType:
    """Load the Python file at ``file_path`` as a uniquely named module.

    The module is registered in ``sys.modules`` under a name derived from the
    hash of its absolute path, executed, and returned.

    Raises:
        ImportError: if no import spec or loader can be built for the file.
        Exception: whatever the module body raises while executing; the
            partially initialised module is removed from ``sys.modules``
            before the exception propagates.
    """
    # We cannot use the module name as-is, after adding it to `sys.modules`,
    # it would also be used for other imports. So, we make a module name that
    # depends on the path for it to be unique using the hex-encoded hash of
    # the path.
    path_hash = "{:x}".format(ctypes.c_size_t(hash(file_path.absolute())).value)
    module_name = path_hash
    spec = importlib.util.spec_from_file_location(module_name, file_path)
    # A spec without a loader cannot be executed; report it as an import
    # failure rather than an AttributeError on `None`.
    if spec is None or spec.loader is None:
        raise ImportError(f"Cannot load spec for {module_name} from {file_path}")
    module = importlib.util.module_from_spec(spec)
    sys.modules[module_name] = module
    try:
        spec.loader.exec_module(module)
    except BaseException:
        # Do not leave a half-initialised module behind for later lookups.
        if sys.modules.get(module_name) is module:
            del sys.modules[module_name]
        raise
    return module
24
+
25
+
26
+ globals().update(vars(_import_from_path(Path(__file__).parent.parent / "__init__.py")))
build/torch212-cxx11-cu130-x86_64-linux/metadata.json ADDED
@@ -0,0 +1,13 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "name": "flashrt-nvfp4",
3
+ "id": "_flashrt_nvfp4_cuda_e9a1fe0",
4
+ "version": 1,
5
+ "license": "Apache-2.0",
6
+ "python-depends": [],
7
+ "backend": {
8
+ "type": "cuda",
9
+ "archs": [
10
+ "12.0"
11
+ ]
12
+ }
13
+ }
build/torch212-cxx11-cu132-x86_64-linux/__init__.py ADDED
@@ -0,0 +1,55 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """FlashRT NVFP4 layout kernels."""
2
+
3
+ from __future__ import annotations
4
+
5
+ from typing import Optional
6
+
7
+ import torch
8
+
9
+ from ._ops import ops
10
+
11
+
12
def nvfp4_sf_swizzled_bytes(rows: int, D: int) -> int:
    """Return byte count for a CUTLASS Sm1xx NVFP4 swizzled SF buffer.

    The buffer holds one 512-byte tile for every group of up to 128 rows by
    up to 4 scale columns, where the number of scale columns is ``D // 16``.

    Raises:
        ValueError: if ``rows`` is not positive, or ``D`` is not a positive
            multiple of 16.
    """
    if rows <= 0:
        raise ValueError("rows must be positive")
    if D <= 0 or D % 16:
        raise ValueError("D must be positive and divisible by 16")

    scale_cols = D // 16
    # Round both tile counts up so partial tiles are still allocated.
    row_tiles = (rows + 127) // 128
    col_tiles = (scale_cols + 3) // 4
    return row_tiles * col_tiles * 512
23
+
24
+
25
def nvfp4_sf_linear_to_swizzled(
    scales: torch.Tensor,
    *,
    out: Optional[torch.Tensor] = None,
    is_sfb: bool = False,
) -> torch.Tensor:
    """Convert linear NVFP4 scale bytes to CUTLASS Sm1xx swizzled layout.

    ``scales`` must be contiguous CUDA ``torch.uint8`` with shape
    ``(rows, D / 16)``. If ``out`` is omitted, a flat zero-filled
    ``torch.uint8`` tensor with ``nvfp4_sf_swizzled_bytes(rows, D)`` bytes is
    allocated on the device of ``scales``.

    Only the rank of ``scales`` is checked here; the tensors, ``D`` and
    ``is_sfb`` are otherwise passed straight to the compiled op.

    Returns:
        The tensor that was handed to the op as output: ``out`` if given,
        otherwise the newly allocated buffer.

    Raises:
        ValueError: if ``scales`` is not 2-D, or if ``out`` is omitted and
            the shape is rejected by ``nvfp4_sf_swizzled_bytes``.
    """
    if scales.dim() != 2:
        raise ValueError("scales must have shape (rows, D / 16)")

    rows, n_blocks = scales.shape[0], scales.shape[1]
    D = n_blocks * 16

    if out is None:
        n_bytes = nvfp4_sf_swizzled_bytes(rows, D)
        out = torch.zeros((n_bytes,), device=scales.device, dtype=torch.uint8)

    ops.nvfp4_sf_linear_to_swizzled(scales, out, D, is_sfb)
    return out
50
+
51
+
52
+ __all__ = [
53
+ "nvfp4_sf_linear_to_swizzled",
54
+ "nvfp4_sf_swizzled_bytes",
55
+ ]
build/torch212-cxx11-cu132-x86_64-linux/_flashrt_nvfp4_cuda_e9a1fe0.abi3.so ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:23ef46609656ff3387039e83a410b131ab72c2da5c3099df19acea47c6e38d45
3
+ size 111104
build/torch212-cxx11-cu132-x86_64-linux/_ops.py ADDED
@@ -0,0 +1,9 @@
 
 
 
 
 
 
 
 
 
 
1
+ import torch
2
+ from . import _flashrt_nvfp4_cuda_e9a1fe0
3
+ ops = torch.ops._flashrt_nvfp4_cuda_e9a1fe0
4
+
5
def add_op_namespace_prefix(op_name: str):
    """Return ``op_name`` qualified as ``<namespace>::<op_name>``."""
    namespace = "_flashrt_nvfp4_cuda_e9a1fe0"
    return f"{namespace}::{op_name}"
build/torch212-cxx11-cu132-x86_64-linux/flashrt_nvfp4/__init__.py ADDED
@@ -0,0 +1,26 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import ctypes
2
+ import importlib.util
3
+ import sys
4
+ from pathlib import Path
5
+ from types import ModuleType
6
+
7
+
8
def _import_from_path(file_path: Path) -> ModuleType:
    """Load the Python file at ``file_path`` as a uniquely named module.

    The module is registered in ``sys.modules`` under a name derived from the
    hash of its absolute path, executed, and returned.

    Raises:
        ImportError: if no import spec or loader can be built for the file.
        Exception: whatever the module body raises while executing; the
            partially initialised module is removed from ``sys.modules``
            before the exception propagates.
    """
    # We cannot use the module name as-is, after adding it to `sys.modules`,
    # it would also be used for other imports. So, we make a module name that
    # depends on the path for it to be unique using the hex-encoded hash of
    # the path.
    path_hash = "{:x}".format(ctypes.c_size_t(hash(file_path.absolute())).value)
    module_name = path_hash
    spec = importlib.util.spec_from_file_location(module_name, file_path)
    # A spec without a loader cannot be executed; report it as an import
    # failure rather than an AttributeError on `None`.
    if spec is None or spec.loader is None:
        raise ImportError(f"Cannot load spec for {module_name} from {file_path}")
    module = importlib.util.module_from_spec(spec)
    sys.modules[module_name] = module
    try:
        spec.loader.exec_module(module)
    except BaseException:
        # Do not leave a half-initialised module behind for later lookups.
        if sys.modules.get(module_name) is module:
            del sys.modules[module_name]
        raise
    return module
24
+
25
+
26
+ globals().update(vars(_import_from_path(Path(__file__).parent.parent / "__init__.py")))
build/torch212-cxx11-cu132-x86_64-linux/metadata.json ADDED
@@ -0,0 +1,13 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "name": "flashrt-nvfp4",
3
+ "id": "_flashrt_nvfp4_cuda_e9a1fe0",
4
+ "version": 1,
5
+ "license": "Apache-2.0",
6
+ "python-depends": [],
7
+ "backend": {
8
+ "type": "cuda",
9
+ "archs": [
10
+ "12.0"
11
+ ]
12
+ }
13
+ }