medmekk commited on
Commit
c67ae40
·
verified ·
1 Parent(s): 9090aa9

Upload folder using huggingface_hub

Browse files
This view is limited to 50 files because it contains too many changes.   See raw diff
Files changed (50) hide show
  1. .gitattributes +6 -0
  2. README.md +81 -0
  3. build.toml +100 -0
  4. build/torch210-cxx11-cu126-x86_64-linux/__init__.py +289 -0
  5. build/torch210-cxx11-cu126-x86_64-linux/_deep_gemm_099ac3c_dirty.abi3.so +3 -0
  6. build/torch210-cxx11-cu126-x86_64-linux/_ops.py +9 -0
  7. build/torch210-cxx11-cu126-x86_64-linux/deep_gemm/__init__.py +26 -0
  8. build/torch210-cxx11-cu126-x86_64-linux/metadata.json +3 -0
  9. build/torch210-cxx11-cu126-x86_64-linux/testing/__init__.py +4 -0
  10. build/torch210-cxx11-cu126-x86_64-linux/testing/bench.py +137 -0
  11. build/torch210-cxx11-cu126-x86_64-linux/testing/numeric.py +21 -0
  12. build/torch210-cxx11-cu126-x86_64-linux/testing/utils.py +38 -0
  13. build/torch210-cxx11-cu126-x86_64-linux/utils/__init__.py +3 -0
  14. build/torch210-cxx11-cu126-x86_64-linux/utils/layout.py +25 -0
  15. build/torch210-cxx11-cu126-x86_64-linux/utils/math.py +107 -0
  16. build/torch210-cxx11-cu128-x86_64-linux/__init__.py +289 -0
  17. build/torch210-cxx11-cu128-x86_64-linux/_deep_gemm_099ac3c_dirty.abi3.so +3 -0
  18. build/torch210-cxx11-cu128-x86_64-linux/_ops.py +9 -0
  19. build/torch210-cxx11-cu128-x86_64-linux/deep_gemm/__init__.py +26 -0
  20. build/torch210-cxx11-cu128-x86_64-linux/metadata.json +3 -0
  21. build/torch210-cxx11-cu128-x86_64-linux/testing/__init__.py +4 -0
  22. build/torch210-cxx11-cu128-x86_64-linux/testing/bench.py +137 -0
  23. build/torch210-cxx11-cu128-x86_64-linux/testing/numeric.py +21 -0
  24. build/torch210-cxx11-cu128-x86_64-linux/testing/utils.py +38 -0
  25. build/torch210-cxx11-cu128-x86_64-linux/utils/__init__.py +3 -0
  26. build/torch210-cxx11-cu128-x86_64-linux/utils/layout.py +25 -0
  27. build/torch210-cxx11-cu128-x86_64-linux/utils/math.py +107 -0
  28. build/torch210-cxx11-cu130-x86_64-linux/__init__.py +289 -0
  29. build/torch210-cxx11-cu130-x86_64-linux/_deep_gemm_099ac3c_dirty.abi3.so +3 -0
  30. build/torch210-cxx11-cu130-x86_64-linux/_ops.py +9 -0
  31. build/torch210-cxx11-cu130-x86_64-linux/deep_gemm/__init__.py +26 -0
  32. build/torch210-cxx11-cu130-x86_64-linux/metadata.json +3 -0
  33. build/torch210-cxx11-cu130-x86_64-linux/testing/__init__.py +4 -0
  34. build/torch210-cxx11-cu130-x86_64-linux/testing/bench.py +137 -0
  35. build/torch210-cxx11-cu130-x86_64-linux/testing/numeric.py +21 -0
  36. build/torch210-cxx11-cu130-x86_64-linux/testing/utils.py +38 -0
  37. build/torch210-cxx11-cu130-x86_64-linux/utils/__init__.py +3 -0
  38. build/torch210-cxx11-cu130-x86_64-linux/utils/layout.py +25 -0
  39. build/torch210-cxx11-cu130-x86_64-linux/utils/math.py +107 -0
  40. build/torch29-cxx11-cu126-x86_64-linux/__init__.py +289 -0
  41. build/torch29-cxx11-cu126-x86_64-linux/_deep_gemm_099ac3c_dirty.abi3.so +3 -0
  42. build/torch29-cxx11-cu126-x86_64-linux/_ops.py +9 -0
  43. build/torch29-cxx11-cu126-x86_64-linux/deep_gemm/__init__.py +26 -0
  44. build/torch29-cxx11-cu126-x86_64-linux/metadata.json +3 -0
  45. build/torch29-cxx11-cu126-x86_64-linux/testing/__init__.py +4 -0
  46. build/torch29-cxx11-cu126-x86_64-linux/testing/bench.py +137 -0
  47. build/torch29-cxx11-cu126-x86_64-linux/testing/numeric.py +21 -0
  48. build/torch29-cxx11-cu126-x86_64-linux/testing/utils.py +38 -0
  49. build/torch29-cxx11-cu126-x86_64-linux/utils/__init__.py +3 -0
  50. build/torch29-cxx11-cu126-x86_64-linux/utils/layout.py +25 -0
.gitattributes CHANGED
@@ -33,3 +33,9 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
33
  *.zip filter=lfs diff=lfs merge=lfs -text
34
  *.zst filter=lfs diff=lfs merge=lfs -text
35
  *tfevents* filter=lfs diff=lfs merge=lfs -text
 
 
 
 
 
 
 
33
  *.zip filter=lfs diff=lfs merge=lfs -text
34
  *.zst filter=lfs diff=lfs merge=lfs -text
35
  *tfevents* filter=lfs diff=lfs merge=lfs -text
36
+ build/torch210-cxx11-cu126-x86_64-linux/_deep_gemm_099ac3c_dirty.abi3.so filter=lfs diff=lfs merge=lfs -text
37
+ build/torch210-cxx11-cu128-x86_64-linux/_deep_gemm_099ac3c_dirty.abi3.so filter=lfs diff=lfs merge=lfs -text
38
+ build/torch210-cxx11-cu130-x86_64-linux/_deep_gemm_099ac3c_dirty.abi3.so filter=lfs diff=lfs merge=lfs -text
39
+ build/torch29-cxx11-cu126-x86_64-linux/_deep_gemm_099ac3c_dirty.abi3.so filter=lfs diff=lfs merge=lfs -text
40
+ build/torch29-cxx11-cu128-x86_64-linux/_deep_gemm_099ac3c_dirty.abi3.so filter=lfs diff=lfs merge=lfs -text
41
+ build/torch29-cxx11-cu130-x86_64-linux/_deep_gemm_099ac3c_dirty.abi3.so filter=lfs diff=lfs merge=lfs -text
README.md ADDED
@@ -0,0 +1,81 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # DeepGEMM
2
+
3
+ DeepGEMM kernel for the [Hugging Face kernel-builder](https://github.com/huggingface/kernels) infrastructure.
4
+
5
+ This package provides FP8/FP4/BF16 GEMM kernels, einsum, attention, and hyperconnection operations
6
+ from [DeepSeek-AI/DeepGEMM](https://github.com/DeepSeek-AI/DeepGEMM), adapted to the kernels-community
7
+ build structure with torch library bindings.
8
+
9
+ ## Features
10
+
11
+ - **FP8/FP4 GEMMs**: NT, NN, TN, TT variants with M-grouped and K-grouped support
12
+ - **BF16 GEMMs**: NT, NN, TN, TT variants with M-grouped and K-grouped support
13
+ - **cuBLASLt GEMMs**: NT, NN, TN, TT wrappers
14
+ - **Einsum**: bmk,bnk->mn, bhr,hdr->bhd, bhd,hdr->bhr expressions (BF16 and FP8)
15
+ - **Attention**: FP8 MQA logits (regular and paged)
16
+ - **Hyperconnection**: TF32 prenorm GEMM
17
+ - **Layout utilities**: Scaling factor transformations, TMA alignment
18
+
19
+ ## Architecture Support
20
+
21
+ - SM 9.0a (Hopper / H100)
22
+ - SM 10.0a (Blackwell / B200)
23
+
24
+ ## Requirements
25
+
26
+ - CUDA >= 12.1
27
+ - PyTorch >= 2.1
28
+ - CUTLASS 3.9+
29
+ - NVRTC (part of CUDA Toolkit)
30
+
31
+ ## Installation
32
+
33
+ ```bash
34
+ pip install kernels
35
+ ```
36
+
37
+ ```python
38
+ import kernels
39
+ kernels.install("kernels-community/DeepGEMM")
40
+ ```
41
+
42
+ ## Usage
43
+
44
+ ```python
45
+ import deep_gemm
46
+
47
+ # FP8 GEMM: D = A @ B.T
48
+ deep_gemm.fp8_gemm_nt((a_fp8, sfa), (b_fp8, sfb), d)
49
+
50
+ # BF16 GEMM: D = A @ B.T
51
+ deep_gemm.bf16_gemm_nt(a_bf16, b_bf16, d)
52
+
53
+ # cuBLASLt GEMM
54
+ deep_gemm.cublaslt_gemm_nt(a, b, d)
55
+ ```
56
+
57
+ ## JIT Compilation
58
+
59
+ DeepGEMM uses Just-In-Time (JIT) compilation for its CUDA kernels. The kernel
60
+ templates (`.cuh` files in `include/deep_gemm/`) are compiled at runtime using
61
+ NVCC or NVRTC. First invocations may be slower due to compilation; results are
62
+ cached in `~/.deep_gemm/` for subsequent calls.
63
+
64
+ ### CUTLASS Runtime Dependency
65
+
66
+ The JIT-compiled kernels depend on CUTLASS headers (`cute/`, `cutlass/`) at
67
+ runtime. The package will automatically search for CUTLASS in these locations:
68
+
69
+ 1. `DG_CUTLASS_INCLUDE` environment variable (direct path to include dir)
70
+ 2. `CUTLASS_HOME` environment variable (`$CUTLASS_HOME/include`)
71
+ 3. Bundled in the package's `include/` directory
72
+ 4. `CUDA_HOME/include` (some CUDA 12.8+ installs bundle `cute/`)
73
+ 5. `nvidia-cutlass` Python package
74
+
75
+ Set one of these if JIT compilation fails with missing CUTLASS headers:
76
+
77
+ ```bash
78
+ export CUTLASS_HOME=/path/to/cutlass
79
+ # or
80
+ export DG_CUTLASS_INCLUDE=/path/to/cutlass/include
81
+ ```
build.toml ADDED
@@ -0,0 +1,100 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ [general]
2
+ name = "deep_gemm"
3
+ backends = ["cuda"]
4
+
5
+ [general.hub]
6
+ repo-id = "kernels-community/DeepGEMM"
7
+
8
+ [torch]
9
+ src = [
10
+ "torch-ext/torch_binding.cpp",
11
+ "torch-ext/torch_binding.h",
12
+ ]
13
+
14
+ [kernel.deep_gemm]
15
+ backend = "cuda"
16
+ cuda-capabilities = [
17
+ "9.0a",
18
+ "10.0a",
19
+ ]
20
+ cxx-flags = [
21
+ "-std=c++17",
22
+ "-O3",
23
+ "-Wno-psabi",
24
+ "-Wno-deprecated-declarations",
25
+ ]
26
+ depends = [
27
+ "torch",
28
+ "cutlass_3_9",
29
+ ]
30
+ include = [
31
+ ".",
32
+ "csrc",
33
+ "deep_gemm/include",
34
+ "third-party/fmt/include",
35
+ ]
36
+ src = [
37
+ "csrc/deep_gemm_impl.cpp",
38
+ "csrc/apis/attention.hpp",
39
+ "csrc/apis/einsum.hpp",
40
+ "csrc/apis/gemm.hpp",
41
+ "csrc/apis/hyperconnection.hpp",
42
+ "csrc/apis/layout.hpp",
43
+ "csrc/apis/runtime.hpp",
44
+ "csrc/jit/cache.hpp",
45
+ "csrc/jit/compiler.hpp",
46
+ "csrc/jit/device_runtime.hpp",
47
+ "csrc/jit/handle.hpp",
48
+ "csrc/jit/kernel_runtime.hpp",
49
+ "csrc/jit_kernels/heuristics/common.hpp",
50
+ "csrc/jit_kernels/heuristics/sm90.hpp",
51
+ "csrc/jit_kernels/heuristics/sm100.hpp",
52
+ "csrc/jit_kernels/impls/epilogue.hpp",
53
+ "csrc/jit_kernels/impls/runtime_utils.hpp",
54
+ "csrc/jit_kernels/impls/sm90_bf16_gemm.hpp",
55
+ "csrc/jit_kernels/impls/sm90_bmk_bnk_mn.hpp",
56
+ "csrc/jit_kernels/impls/sm90_fp8_gemm_1d1d.hpp",
57
+ "csrc/jit_kernels/impls/sm90_fp8_gemm_1d2d.hpp",
58
+ "csrc/jit_kernels/impls/sm90_tf32_hc_prenorm_gemm.hpp",
59
+ "csrc/jit_kernels/impls/sm100_bf16_gemm.hpp",
60
+ "csrc/jit_kernels/impls/sm100_bmk_bnk_mn.hpp",
61
+ "csrc/jit_kernels/impls/sm100_fp8_gemm_1d1d.hpp",
62
+ "csrc/jit_kernels/impls/sm100_tf32_hc_prenorm_gemm.hpp",
63
+ "csrc/jit_kernels/impls/smxx_clean_logits.hpp",
64
+ "csrc/jit_kernels/impls/smxx_cublaslt.hpp",
65
+ "csrc/jit_kernels/impls/smxx_fp8_mqa_logits.hpp",
66
+ "csrc/jit_kernels/impls/smxx_fp8_paged_mqa_logits.hpp",
67
+ "csrc/jit_kernels/impls/smxx_layout.hpp",
68
+ "csrc/utils/compatibility.hpp",
69
+ "csrc/utils/exception.hpp",
70
+ "csrc/utils/format.hpp",
71
+ "csrc/utils/hash.hpp",
72
+ "csrc/utils/layout.hpp",
73
+ "csrc/utils/lazy_init.hpp",
74
+ "csrc/utils/math.hpp",
75
+ "csrc/utils/system.hpp",
76
+ "deep_gemm/include/deep_gemm/common/cute_tie.cuh",
77
+ "deep_gemm/include/deep_gemm/common/epilogue_utils.cuh",
78
+ "deep_gemm/include/deep_gemm/common/reduction.cuh",
79
+ "deep_gemm/include/deep_gemm/common/scheduler.cuh",
80
+ "deep_gemm/include/deep_gemm/common/sm100_utils.cuh",
81
+ "deep_gemm/include/deep_gemm/common/sm90_utils.cuh",
82
+ "deep_gemm/include/deep_gemm/common/tma_utils.cuh",
83
+ "deep_gemm/include/deep_gemm/common/types.hpp",
84
+ "deep_gemm/include/deep_gemm/common/utils.cuh",
85
+ "deep_gemm/include/deep_gemm/impls/sm90_bf16_gemm.cuh",
86
+ "deep_gemm/include/deep_gemm/impls/sm90_bmk_bnk_mn.cuh",
87
+ "deep_gemm/include/deep_gemm/impls/sm90_fp8_gemm_1d1d.cuh",
88
+ "deep_gemm/include/deep_gemm/impls/sm90_fp8_gemm_1d2d.cuh",
89
+ "deep_gemm/include/deep_gemm/impls/sm90_fp8_mqa_logits.cuh",
90
+ "deep_gemm/include/deep_gemm/impls/sm90_fp8_paged_mqa_logits.cuh",
91
+ "deep_gemm/include/deep_gemm/impls/sm90_tf32_hc_prenorm_gemm.cuh",
92
+ "deep_gemm/include/deep_gemm/impls/sm100_bf16_gemm.cuh",
93
+ "deep_gemm/include/deep_gemm/impls/sm100_bmk_bnk_mn.cuh",
94
+ "deep_gemm/include/deep_gemm/impls/sm100_fp8_gemm_1d1d.cuh",
95
+ "deep_gemm/include/deep_gemm/impls/sm100_fp8_mqa_logits.cuh",
96
+ "deep_gemm/include/deep_gemm/impls/sm100_fp8_paged_mqa_logits.cuh",
97
+ "deep_gemm/include/deep_gemm/impls/sm100_tf32_hc_prenorm_gemm.cuh",
98
+ "deep_gemm/include/deep_gemm/impls/smxx_clean_logits.cuh",
99
+ "deep_gemm/include/deep_gemm/impls/smxx_layout.cuh",
100
+ ]
build/torch210-cxx11-cu126-x86_64-linux/__init__.py ADDED
@@ -0,0 +1,289 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import os
2
+ import subprocess
3
+ import torch
4
+
5
+ from ._ops import ops
6
+
7
+
8
+ def _find_cuda_home():
9
+ cuda_home = os.environ.get('CUDA_HOME') or os.environ.get('CUDA_PATH')
10
+ if cuda_home is None:
11
+ try:
12
+ with open(os.devnull, 'w') as devnull:
13
+ nvcc = subprocess.check_output(
14
+ ['which', 'nvcc'], stderr=devnull
15
+ ).decode().rstrip('\r\n')
16
+ cuda_home = os.path.dirname(os.path.dirname(nvcc))
17
+ except Exception:
18
+ cuda_home = '/usr/local/cuda'
19
+ if not os.path.exists(cuda_home):
20
+ cuda_home = ''
21
+ return cuda_home or ''
22
+
23
+
24
+ def _find_cutlass_include():
25
+ """Find CUTLASS include path for JIT compilation of .cuh templates."""
26
+ # 1. Explicit env var
27
+ cutlass_include = os.environ.get('DG_CUTLASS_INCLUDE')
28
+ if cutlass_include and os.path.isdir(cutlass_include):
29
+ return cutlass_include
30
+
31
+ # 2. CUTLASS_HOME env var
32
+ cutlass_home = os.environ.get('CUTLASS_HOME')
33
+ if cutlass_home:
34
+ p = os.path.join(cutlass_home, 'include')
35
+ if os.path.isdir(os.path.join(p, 'cute')):
36
+ return p
37
+
38
+ # 3. Check in package include/ directory (bundled cute/cutlass headers)
39
+ pkg_include = os.path.join(os.path.dirname(os.path.abspath(__file__)), 'include')
40
+ if os.path.isdir(os.path.join(pkg_include, 'cute')):
41
+ return pkg_include
42
+
43
+ # 4. Check CUDA_HOME/include (some CUDA 12.8+ installs include cute/)
44
+ cuda_home = _find_cuda_home()
45
+ if cuda_home:
46
+ cuda_inc = os.path.join(cuda_home, 'include')
47
+ if os.path.isdir(os.path.join(cuda_inc, 'cute')):
48
+ return cuda_inc
49
+
50
+ # 5. Try to find nvidia-cutlass Python package
51
+ try:
52
+ import cutlass as _cutlass
53
+ cutlass_dir = os.path.dirname(_cutlass.__file__)
54
+ p = os.path.join(cutlass_dir, 'include')
55
+ if os.path.isdir(os.path.join(p, 'cute')):
56
+ return p
57
+ except ImportError:
58
+ pass
59
+
60
+ # Return empty string; C++ side will also check env vars
61
+ return ""
62
+
63
+
64
def set_num_sms(new_num_sms):
    """Thin wrapper forwarding to the native ``set_num_sms`` op."""
    ops.set_num_sms(new_num_sms)


def get_num_sms():
    """Thin wrapper forwarding to the native ``get_num_sms`` op."""
    return ops.get_num_sms()


def set_tc_util(new_tc_util):
    """Thin wrapper forwarding to the native ``set_tc_util`` op."""
    ops.set_tc_util(new_tc_util)


def get_tc_util():
    """Thin wrapper forwarding to the native ``get_tc_util`` op."""
    return ops.get_tc_util()
75
+
76
+
77
# cuBLASLt GEMM wrappers: each forwards straight to the corresponding
# native op; `c` is an optional accumulator/bias operand defaulting to None.
def cublaslt_gemm_nt(a, b, d, c=None):
    """Forward to the native ``cublaslt_gemm_nt`` op."""
    ops.cublaslt_gemm_nt(a, b, d, c)


def cublaslt_gemm_nn(a, b, d, c=None):
    """Forward to the native ``cublaslt_gemm_nn`` op."""
    ops.cublaslt_gemm_nn(a, b, d, c)


def cublaslt_gemm_tn(a, b, d, c=None):
    """Forward to the native ``cublaslt_gemm_tn`` op."""
    ops.cublaslt_gemm_tn(a, b, d, c)


def cublaslt_gemm_tt(a, b, d, c=None):
    """Forward to the native ``cublaslt_gemm_tt`` op."""
    ops.cublaslt_gemm_tt(a, b, d, c)
89
+
90
+
91
# Python-level wrappers over the native ops. Defined inside a try block so
# that import of this module degrades gracefully if something goes wrong
# (the `except Exception: pass` at the end mirrors the original intent).
try:
    def _olist(seq):
        # Convert an optional recipe sequence to a list for the native op,
        # mapping falsy values (None, empty tuple) to None.
        return list(seq) if seq else None

    # --- FP8/FP4 GEMMs ---
    def fp8_fp4_gemm_nt(a, b, d, c=None, recipe=None, recipe_a=None,
                        recipe_b=None, compiled_dims="nk", disable_ue8m0_cast=False):
        ops.fp8_fp4_gemm_nt(a[0], a[1], b[0], b[1], d, c,
                            _olist(recipe), _olist(recipe_a), _olist(recipe_b),
                            compiled_dims, disable_ue8m0_cast)

    def fp8_fp4_gemm_nn(a, b, d, c=None, recipe=None, recipe_a=None,
                        recipe_b=None, compiled_dims="nk", disable_ue8m0_cast=False):
        ops.fp8_fp4_gemm_nn(a[0], a[1], b[0], b[1], d, c,
                            _olist(recipe), _olist(recipe_a), _olist(recipe_b),
                            compiled_dims, disable_ue8m0_cast)

    def fp8_fp4_gemm_tn(a, b, d, c=None, recipe=None, recipe_a=None,
                        recipe_b=None, compiled_dims="mn", disable_ue8m0_cast=False):
        ops.fp8_fp4_gemm_tn(a[0], a[1], b[0], b[1], d, c,
                            _olist(recipe), _olist(recipe_a), _olist(recipe_b),
                            compiled_dims, disable_ue8m0_cast)

    def fp8_fp4_gemm_tt(a, b, d, c=None, recipe=None, recipe_a=None,
                        recipe_b=None, compiled_dims="mn", disable_ue8m0_cast=False):
        ops.fp8_fp4_gemm_tt(a[0], a[1], b[0], b[1], d, c,
                            _olist(recipe), _olist(recipe_a), _olist(recipe_b),
                            compiled_dims, disable_ue8m0_cast)

    # FP8 aliases for the combined FP8/FP4 entry points.
    fp8_gemm_nt = fp8_fp4_gemm_nt
    fp8_gemm_nn = fp8_fp4_gemm_nn
    fp8_gemm_tn = fp8_fp4_gemm_tn
    fp8_gemm_tt = fp8_fp4_gemm_tt

    def m_grouped_fp8_fp4_gemm_nt_contiguous(a, b, d, grouped_layout,
            recipe=None, recipe_a=None, recipe_b=None, compiled_dims="nk",
            disable_ue8m0_cast=False, use_psum_layout=False,
            expected_m_for_psum_layout=None):
        ops.m_grouped_fp8_fp4_gemm_nt_contiguous(
            a[0], a[1], b[0], b[1], d, grouped_layout,
            _olist(recipe), _olist(recipe_a), _olist(recipe_b),
            compiled_dims, disable_ue8m0_cast, use_psum_layout,
            expected_m_for_psum_layout)

    m_grouped_fp8_gemm_nt_contiguous = m_grouped_fp8_fp4_gemm_nt_contiguous

    def m_grouped_fp8_fp4_gemm_nn_contiguous(a, b, d, grouped_layout,
            recipe=None, recipe_a=None, recipe_b=None, compiled_dims="nk",
            disable_ue8m0_cast=False, use_psum_layout=False):
        ops.m_grouped_fp8_fp4_gemm_nn_contiguous(
            a[0], a[1], b[0], b[1], d, grouped_layout,
            _olist(recipe), _olist(recipe_a), _olist(recipe_b),
            compiled_dims, disable_ue8m0_cast, use_psum_layout)

    m_grouped_fp8_gemm_nn_contiguous = m_grouped_fp8_fp4_gemm_nn_contiguous

    def m_grouped_fp8_fp4_gemm_nt_masked(a, b, d, masked_m, expected_m,
            recipe=None, recipe_a=None, recipe_b=None, compiled_dims="nk",
            disable_ue8m0_cast=False):
        ops.m_grouped_fp8_fp4_gemm_nt_masked(
            a[0], a[1], b[0], b[1], d, masked_m, expected_m,
            _olist(recipe), _olist(recipe_a), _olist(recipe_b),
            compiled_dims, disable_ue8m0_cast)

    m_grouped_fp8_gemm_nt_masked = m_grouped_fp8_fp4_gemm_nt_masked

    def k_grouped_fp8_gemm_nt_contiguous(a, b, d, ks, ks_tensor, c=None,
                                         recipe=(1, 1, 128), compiled_dims="mn"):
        ops.k_grouped_fp8_gemm_nt_contiguous(
            a[0], a[1], b[0], b[1], d, ks, ks_tensor, c,
            list(recipe), compiled_dims)

    def k_grouped_fp8_gemm_tn_contiguous(a, b, d, ks, ks_tensor, c=None,
                                         recipe=(1, 1, 128), compiled_dims="mn"):
        ops.k_grouped_fp8_gemm_tn_contiguous(
            a[0], a[1], b[0], b[1], d, ks, ks_tensor, c,
            list(recipe), compiled_dims)

    # --- BF16 GEMMs ---
    def bf16_gemm_nt(a, b, d, c=None, compiled_dims="nk"):
        ops.bf16_gemm_nt(a, b, d, c, compiled_dims)

    def bf16_gemm_nn(a, b, d, c=None, compiled_dims="nk"):
        ops.bf16_gemm_nn(a, b, d, c, compiled_dims)

    def bf16_gemm_tn(a, b, d, c=None, compiled_dims="mn"):
        ops.bf16_gemm_tn(a, b, d, c, compiled_dims)

    def bf16_gemm_tt(a, b, d, c=None, compiled_dims="mn"):
        ops.bf16_gemm_tt(a, b, d, c, compiled_dims)

    def m_grouped_bf16_gemm_nt_contiguous(a, b, d, grouped_layout,
            compiled_dims="nk", use_psum_layout=False,
            expected_m_for_psum_layout=None):
        ops.m_grouped_bf16_gemm_nt_contiguous(
            a, b, d, grouped_layout, compiled_dims,
            use_psum_layout, expected_m_for_psum_layout)

    def m_grouped_bf16_gemm_nn_contiguous(a, b, d, grouped_layout,
            compiled_dims="nk", use_psum_layout=False):
        ops.m_grouped_bf16_gemm_nn_contiguous(
            a, b, d, grouped_layout, compiled_dims, use_psum_layout)

    def m_grouped_bf16_gemm_nt_masked(a, b, d, masked_m, expected_m,
                                      compiled_dims="nk"):
        ops.m_grouped_bf16_gemm_nt_masked(
            a, b, d, masked_m, expected_m, compiled_dims)

    def k_grouped_bf16_gemm_tn_contiguous(a, b, d, ks, ks_tensor,
                                          c=None, compiled_dims="mn"):
        ops.k_grouped_bf16_gemm_tn_contiguous(
            a, b, d, ks, ks_tensor, c, compiled_dims)

    # --- Einsum ---
    def einsum(expr, a, b, d, c=None, use_cublaslt=False):
        ops.einsum(expr, a, b, d, c, use_cublaslt)

    def fp8_einsum(expr, a, b, d, c=None, recipe=(1, 128, 128)):
        ops.fp8_einsum(expr, a[0], a[1], b[0], b[1], d, c, list(recipe))

    # --- Attention ---
    def fp8_gemm_nt_skip_head_mid(a, b, d, head_splits, recipe=None,
                                  compiled_dims="nk", disable_ue8m0_cast=False):
        ops.fp8_gemm_nt_skip_head_mid(
            a[0], a[1], b[0], b[1], d, list(head_splits),
            _olist(recipe), compiled_dims, disable_ue8m0_cast)

    def fp8_mqa_logits(q, kv, weights, cu_seq_len_k_start,
                       cu_seq_len_k_end, clean_logits=True, max_seqlen_k=0):
        return ops.fp8_mqa_logits(
            q, kv[0], kv[1], weights,
            cu_seq_len_k_start, cu_seq_len_k_end,
            clean_logits, max_seqlen_k)

    def get_paged_mqa_logits_metadata(context_lens, block_kv, num_sms):
        return ops.get_paged_mqa_logits_metadata(
            context_lens, block_kv, num_sms)

    def fp8_paged_mqa_logits(q, fused_kv_cache, weights, context_lens,
                             block_table, schedule_meta,
                             max_context_len, clean_logits=False):
        return ops.fp8_paged_mqa_logits(
            q, fused_kv_cache, weights, context_lens,
            block_table, schedule_meta, max_context_len, clean_logits)

    # --- Hyperconnection ---
    def tf32_hc_prenorm_gemm(a, b, d, sqr_sum, num_splits=None):
        ops.tf32_hc_prenorm_gemm(a, b, d, sqr_sum, num_splits)

    # --- Layout ---
    def transform_sf_into_required_layout(sf, mn, k, recipe=None,
            recipe_ab=None, num_groups=None, is_sfa=False,
            disable_ue8m0_cast=False):
        return ops.transform_sf_into_required_layout(
            sf, mn, k, _olist(recipe), _olist(recipe_ab),
            num_groups, is_sfa, disable_ue8m0_cast)

    def get_mk_alignment_for_contiguous_layout():
        return ops.get_mk_alignment_for_contiguous_layout()

    # Legacy aliases
    fp8_m_grouped_gemm_nt_masked = m_grouped_fp8_fp4_gemm_nt_masked
    bf16_m_grouped_gemm_nt_masked = m_grouped_bf16_gemm_nt_masked

except Exception:
    pass
271
+
272
# Re-export the Python-side utility helpers at package level.
from . import utils
from .utils import *

# Testing helpers are importable as `deep_gemm.testing`.
from . import testing

# Initialize the native extension with the package root, CUDA home and
# CUTLASS include paths. Failures are swallowed so the package can still be
# imported where CUDA is unavailable (e.g. in build sandboxes).
try:
    _pkg_root = os.path.dirname(os.path.abspath(__file__))
    ops.init(_pkg_root, _find_cuda_home(), _find_cutlass_include())
except Exception:
    pass

__version__ = '2.3.0'
build/torch210-cxx11-cu126-x86_64-linux/_deep_gemm_099ac3c_dirty.abi3.so ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:be5d0bb69c96d55b15ba62ba83e0743eb80ef4e93198fe59862dc247540f4956
3
+ size 3006712
build/torch210-cxx11-cu126-x86_64-linux/_ops.py ADDED
@@ -0,0 +1,9 @@
 
 
 
 
 
 
 
 
 
 
1
+ import torch
2
+ from . import _deep_gemm_099ac3c_dirty
3
+ ops = torch.ops._deep_gemm_099ac3c_dirty
4
+
5
def add_op_namespace_prefix(op_name: str):
    """Qualify *op_name* with this extension's torch op namespace."""
    return "_deep_gemm_099ac3c_dirty::" + op_name
build/torch210-cxx11-cu126-x86_64-linux/deep_gemm/__init__.py ADDED
@@ -0,0 +1,26 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import ctypes
2
+ import sys
3
+
4
+ import importlib
5
+ from pathlib import Path
6
+ from types import ModuleType
7
+
8
+ def _import_from_path(file_path: Path) -> ModuleType:
9
+ # We cannot use the module name as-is, after adding it to `sys.modules`,
10
+ # it would also be used for other imports. So, we make a module name that
11
+ # depends on the path for it to be unique using the hex-encoded hash of
12
+ # the path.
13
+ path_hash = "{:x}".format(ctypes.c_size_t(hash(file_path.absolute())).value)
14
+ module_name = path_hash
15
+ spec = importlib.util.spec_from_file_location(module_name, file_path)
16
+ if spec is None:
17
+ raise ImportError(f"Cannot load spec for {module_name} from {file_path}")
18
+ module = importlib.util.module_from_spec(spec)
19
+ if module is None:
20
+ raise ImportError(f"Cannot load module {module_name} from spec")
21
+ sys.modules[module_name] = module
22
+ spec.loader.exec_module(module) # type: ignore
23
+ return module
24
+
25
+
26
+ globals().update(vars(_import_from_path(Path(__file__).parent.parent / "__init__.py")))
build/torch210-cxx11-cu126-x86_64-linux/metadata.json ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ {
2
+ "python-depends": []
3
+ }
build/torch210-cxx11-cu126-x86_64-linux/testing/__init__.py ADDED
@@ -0,0 +1,4 @@
 
 
 
 
 
1
+ from . import bench, numeric, utils
2
+ from .bench import *
3
+ from .numeric import *
4
+ from .utils import *
build/torch210-cxx11-cu126-x86_64-linux/testing/bench.py ADDED
@@ -0,0 +1,137 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import os
2
+ import sys
3
+ import torch
4
+
5
+
6
def bench(fn, num_warmups: int = 5, num_tests: int = 10,
          high_precision: bool = False):
    """Time `fn` on the GPU with CUDA events; return mean seconds per call."""
    # Evict L2 by zeroing a 256 MB scratch buffer first.
    torch.cuda.synchronize()
    scratch = torch.empty(int(256e6 // 4), dtype=torch.int, device='cuda')
    scratch.zero_()

    # Warmup runs are not timed.
    for _ in range(num_warmups):
        fn()

    # Optionally queue a large matmul so the timed launches are not
    # dominated by CPU launch overhead.
    if high_precision:
        lhs = torch.randn((8192, 8192), dtype=torch.float, device='cuda')
        rhs = torch.randn((8192, 8192), dtype=torch.float, device='cuda')
        lhs @ rhs

    # Timed section: bracket the calls with CUDA events.
    begin = torch.cuda.Event(enable_timing=True)
    finish = torch.cuda.Event(enable_timing=True)
    begin.record()
    for _ in range(num_tests):
        fn()
    finish.record()
    torch.cuda.synchronize()

    # elapsed_time() reports milliseconds; convert to seconds per call.
    return begin.elapsed_time(finish) / num_tests / 1e3
33
+
34
+
35
class empty_suppress:
    """No-op context manager used when output suppression is disabled."""

    def __enter__(self):
        return self

    def __exit__(self, *_):
        pass
41
+
42
+
43
class suppress_stdout_stderr:
    """Context manager that silences stdout/stderr at the OS fd level.

    Redirects both the Python-level streams and the underlying file
    descriptors to ``/dev/null``, so output written directly to fds 1/2 by
    native code is suppressed too. Everything is restored on exit.
    """

    def __enter__(self):
        # Sinks for the redirected output.
        self.outnull_file = open(os.devnull, 'w')
        self.errnull_file = open(os.devnull, 'w')

        # The fd numbers currently backing stdout/stderr.
        self.old_stdout_fileno_undup = sys.stdout.fileno()
        self.old_stderr_fileno_undup = sys.stderr.fileno()

        # Duplicates of the original fds so they can be restored on exit.
        self.old_stdout_fileno = os.dup(sys.stdout.fileno())
        self.old_stderr_fileno = os.dup(sys.stderr.fileno())

        # The Python-level stream objects, also restored on exit.
        self.old_stdout = sys.stdout
        self.old_stderr = sys.stderr

        # Point the real fds at /dev/null...
        os.dup2(self.outnull_file.fileno(), self.old_stdout_fileno_undup)
        os.dup2(self.errnull_file.fileno(), self.old_stderr_fileno_undup)

        # ...and the Python-level streams as well.
        sys.stdout = self.outnull_file
        sys.stderr = self.errnull_file
        return self

    def __exit__(self, *_):
        # Restore Python-level streams first.
        sys.stdout = self.old_stdout
        sys.stderr = self.old_stderr

        # Restore the original fds, then release the duplicates.
        os.dup2(self.old_stdout_fileno, self.old_stdout_fileno_undup)
        os.dup2(self.old_stderr_fileno, self.old_stderr_fileno_undup)

        os.close(self.old_stdout_fileno)
        os.close(self.old_stderr_fileno)

        self.outnull_file.close()
        self.errnull_file.close()
76
+
77
+
78
def bench_kineto(fn, kernel_names, num_tests: int = 30,
                 suppress_kineto_output: bool = False,
                 trace_path=None, flush_l2: bool = True,
                 with_multiple_kernels: bool = False):
    """Measure per-kernel GPU time for `fn` with the Kineto profiler.

    Args:
        fn: Zero-argument callable that launches the kernel(s) to profile.
        kernel_names: A kernel-name substring, or a tuple of substrings.
        num_tests: Invocations of `fn` per profiling pass.
        suppress_kineto_output: Silence profiler output at the fd level.
        trace_path: Optional path for exporting a Chrome trace (str or None;
            the original annotated this as `str` despite the None default).
        flush_l2: Zero a large buffer before each call to evict L2.
        with_multiple_kernels: Allow a name to match several table rows.

    Returns:
        Average seconds per kernel: a tuple when `kernel_names` is a tuple,
        otherwise a single float. Returns dummy 1s when external NVIDIA
        tools are attached (see below).
    """
    assert isinstance(kernel_names, (str, tuple))
    is_tuple = isinstance(kernel_names, tuple)

    # Skip profiling entirely when external NVIDIA tools are in use:
    # Kineto conflicts with Nsight Systems, Nsight Compute and Compute Sanitizer.
    if int(os.environ.get('DG_USE_NVIDIA_TOOLS', 0)):
        return (1, ) * len(kernel_names) if is_tuple else 1

    # By default, flush L2 with an excessive 8 GB memset to give the GPU some (literal) chill time without full idle
    flush_l2_size = int(8e9 // 4)

    # Run once outside the profiler: some auto-tuning kernels print on first use.
    fn()

    # Profile two scheduler steps (wait=1, active=1); only the second is recorded.
    suppress = suppress_stdout_stderr if suppress_kineto_output else empty_suppress
    with suppress():
        schedule = torch.profiler.schedule(wait=1, warmup=0, active=1, repeat=1)
        profiler = torch.profiler.profile(
            activities=[torch.profiler.ProfilerActivity.CUDA], schedule=schedule)
        with profiler:
            for _ in range(2):
                for _ in range(num_tests):
                    if flush_l2:
                        torch.empty(flush_l2_size, dtype=torch.int, device='cuda').zero_()
                    fn()
                profiler.step()

    # Parse the profiling table.
    prof_lines = profiler.key_averages().table(
        sort_by='cuda_time_total', max_name_column_width=100).split('\n')
    kernel_names = (kernel_names, ) if isinstance(kernel_names, str) else kernel_names
    if not with_multiple_kernels:
        # Each requested name must match at most one table row, otherwise the
        # averages below would mix unrelated kernels.
        for name in kernel_names:
            assert sum(name in line for line in prof_lines) <= 1, \
                f'Errors of the kernel {name} in the profiling table'

    # Save a Chrome trace if requested.
    if trace_path is not None:
        profiler.export_chrome_trace(trace_path)

    # Return average kernel times. The table's last two columns are the total
    # time (e.g. "123.4us") and the call count.
    units = {'ms': 1e3, 'us': 1e6}
    kernel_times = []
    for name in kernel_names:
        total_time = 0
        total_num = 0
        for line in prof_lines:
            if name in line:
                time_str = line.split()[-2]
                num_str = line.split()[-1]
                for unit, scale in units.items():
                    if unit in time_str:
                        total_time += float(time_str.replace(unit, '')) / scale * int(num_str)
                        total_num += int(num_str)
                        break
        # Guard against zero matches to avoid a ZeroDivisionError.
        kernel_times.append(total_time / total_num if total_num > 0 else 0)

    return tuple(kernel_times) if is_tuple else kernel_times[0]
build/torch210-cxx11-cu126-x86_64-linux/testing/numeric.py ADDED
@@ -0,0 +1,21 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import torch
2
+ from typing import Iterable
3
+
4
+
5
def calc_diff(x: torch.Tensor, y: torch.Tensor) -> float:
    """Return a similarity-based difference between two tensors.

    Computes ``1 - 2*sum(x*y) / sum(x*x + y*y)``: 0.0 for identical tensors,
    growing as they diverge.

    Fix: the original returned a Python float (0.0) on the all-zero branch
    but a 0-dim tensor otherwise; both branches now return a plain float.
    """
    x, y = x.double(), y.double()
    denominator = (x * x + y * y).sum()
    if denominator == 0:  # Which means that all elements in x and y are 0
        return 0.0
    sim = 2 * (x * y).sum() / denominator
    return float(1 - sim)
12
+
13
+
14
def count_bytes(*tensors):
    """Total storage bytes of the given tensors.

    Tuples/lists are recursed into; None entries contribute zero.
    """
    def _size(item):
        if isinstance(item, (tuple, list)):
            return count_bytes(*item)
        if item is None:
            return 0
        return item.numel() * item.element_size()

    return sum(_size(t) for t in tensors)
build/torch210-cxx11-cu126-x86_64-linux/testing/utils.py ADDED
@@ -0,0 +1,38 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import functools
2
+ import os
3
+ import torch
4
+ from typing import Callable
5
+
6
def get_arch_major() -> int:
    """Return the major compute capability of the current CUDA device."""
    return torch.cuda.get_device_capability()[0]
9
+
10
+
11
+ def test_filter(condition: Callable):
12
+ def decorator(func):
13
+ @functools.wraps(func)
14
+ def wrapper(*args, **kwargs):
15
+ if condition():
16
+ func(*args, **kwargs)
17
+ else:
18
+ print(f'{func.__name__}:')
19
+ print(f' > Filtered by {condition}')
20
+ print()
21
+ return wrapper
22
+ return decorator
23
+
24
+
25
def ignore_env(name: str, condition: Callable):
    """Decorator: when ``condition()`` is truthy, run the wrapped function
    with environment variable *name* temporarily unset.

    The variable is restored afterwards. Fix: restoration now happens in a
    ``finally`` block, so the variable is no longer leaked (left unset) when
    the wrapped function raises.
    """
    def decorator(func):
        @functools.wraps(func)
        def wrapper(*args, **kwargs):
            if condition():
                saved = os.environ.pop(name, None)
                try:
                    func(*args, **kwargs)
                finally:
                    # Restore even on exception; None means it was not set.
                    if saved is not None:
                        os.environ[name] = saved
            else:
                func(*args, **kwargs)

        return wrapper
    return decorator
build/torch210-cxx11-cu126-x86_64-linux/utils/__init__.py ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ from . import math, layout
2
+ from .layout import *
3
+ from .math import *
build/torch210-cxx11-cu126-x86_64-linux/utils/layout.py ADDED
@@ -0,0 +1,25 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
try:
    from .._ops import ops

    def get_tma_aligned_size(x, element_size):
        # Forwarded to the compiled extension; computes the TMA-aligned size
        # for a dimension of `x` elements of `element_size` bytes.
        return ops.get_tma_aligned_size(x, element_size)

    def get_mn_major_tma_aligned_tensor(sf):
        # Re-lay out a scale-factor tensor so it is MN-major and TMA-aligned.
        return ops.get_mn_major_tma_aligned_tensor(sf)

    def get_mn_major_tma_aligned_packed_ue8m0_tensor(sf):
        # Same as above, but additionally packs scales into UE8M0 format.
        return ops.get_mn_major_tma_aligned_packed_ue8m0_tensor(sf)

    def get_k_grouped_mn_major_tma_aligned_packed_ue8m0_tensor(sf, ks_tensor, ks):
        # K-grouped variant; `ks_tensor`/`ks` describe the per-group K sizes.
        return ops.get_k_grouped_mn_major_tma_aligned_packed_ue8m0_tensor(
            sf, ks_tensor, ks)
except ImportError:
    # NOTE(review): this guard looks ineffective — the unconditional
    # `from .._ops import ops as _ops` below would re-raise the same
    # ImportError anyway. Confirm whether the guard can be removed or the
    # import below should be guarded too.
    pass

from .._ops import ops as _ops

def get_mk_alignment_for_contiguous_layout():
    # Alignment (in rows/elements) required by the contiguous grouped-GEMM
    # layout; value comes from the compiled extension.
    return _ops.get_mk_alignment_for_contiguous_layout()

# Historical aliases kept for backward compatibility.
get_m_alignment_for_contiguous_layout = get_mk_alignment_for_contiguous_layout
get_k_alignment_for_contiguous_layout = get_mk_alignment_for_contiguous_layout
build/torch210-cxx11-cu126-x86_64-linux/utils/math.py ADDED
@@ -0,0 +1,107 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import torch
2
+ from typing import Tuple
3
+
4
+
5
def ceil_div(x: int, y: int) -> int:
    """Integer division of `x` by `y` (y > 0), rounded up."""
    quotient, remainder = divmod(x, y)
    return quotient + (1 if remainder else 0)
7
+
8
+
9
def align(x: int, y: int) -> int:
    """Round `x` up to the nearest multiple of `y` (y > 0)."""
    remainder = x % y
    return x if remainder == 0 else x + y - remainder
11
+
12
+
13
def ceil_to_ue8m0(x: torch.Tensor):
    """Round each |x| up to the nearest power of two (the UE8M0 scale grid)."""
    # An all-zero input would produce log2(0) = -inf; callers clamp first.
    assert x.view(-1).amax().item() > 0
    return torch.exp2(torch.log2(x.abs()).ceil())
16
+
17
+
18
def per_token_cast_to_fp8(x: torch.Tensor, use_ue8m0: bool, gran_k: int = 128) -> Tuple[torch.Tensor, torch.Tensor]:
    """Quantize a 2D tensor to FP8 (e4m3) with one scale per (row, `gran_k`-column group).

    Args:
        x: 2D input tensor; columns are zero-padded up to a multiple of `gran_k`.
        use_ue8m0: round scales up to powers of two (UE8M0 grid).
        gran_k: quantization group size along the last dimension.

    Returns:
        Tuple of (FP8 tensor with the original shape, float scales of shape
        (m, ceil(n / gran_k))).
    """
    assert x.dim() == 2
    m, n = x.shape
    padded_n = align(n, gran_k)
    # Fix: allocate zero-filled memory in one call instead of
    # empty().fill_(0), consistent with per_block_cast_to_fp8 below.
    x_padded = torch.zeros((m, padded_n), dtype=x.dtype, device=x.device)
    x_padded[:, :n] = x
    x_view = x_padded.view(m, -1, gran_k)
    x_amax = x_view.abs().float().amax(dim=2).view(m, -1).clamp(1e-4)
    sf = x_amax / 448.0  # 448 is the largest finite float8_e4m3fn value
    sf = ceil_to_ue8m0(sf) if use_ue8m0 else sf
    return (x_view * (1.0 / sf.unsqueeze(2))).to(torch.float8_e4m3fn).view(m, padded_n)[:, :n].contiguous(), sf
29
+
30
+
31
+ def per_channel_cast_to_fp8(x: torch.Tensor, use_ue8m0: bool, gran_k: int = 128) -> Tuple[torch.Tensor, torch.Tensor]:
32
+ assert x.dim() == 2 and x.size(0) % gran_k == 0
33
+ m, n = x.shape
34
+ x_view = x.view(-1, gran_k, n)
35
+ x_amax = x_view.abs().float().amax(dim=1).view(-1, n).clamp(1e-4)
36
+ sf = x_amax / 448.0
37
+ sf = ceil_to_ue8m0(sf) if use_ue8m0 else sf
38
+ return (x_view * (1.0 / sf.unsqueeze(1))).to(torch.float8_e4m3fn).view(m, n), sf
39
+
40
+
41
def per_block_cast_to_fp8(x: torch.Tensor, use_ue8m0: bool, gran_k: int = 128) -> Tuple[torch.Tensor, torch.Tensor]:
    """Quantize to FP8 (e4m3) with one scale per `gran_k` x `gran_k` block.

    The input is zero-padded in both dimensions up to block multiples;
    returns (FP8 tensor cropped back to the original shape, scales of shape
    (num_row_blocks, num_col_blocks)).
    """
    assert x.dim() == 2
    rows, cols = x.shape
    padded = torch.zeros((align(rows, gran_k), align(cols, gran_k)), dtype=x.dtype, device=x.device)
    padded[:rows, :cols] = x
    blocks = padded.view(-1, gran_k, padded.size(1) // gran_k, gran_k)
    amax = blocks.abs().float().amax(dim=(1, 3), keepdim=True).clamp(1e-4)
    scales = amax / 448.0  # 448 is the largest finite float8_e4m3fn value
    if use_ue8m0:
        scales = ceil_to_ue8m0(scales)
    quantized = (blocks * (1.0 / scales)).to(torch.float8_e4m3fn)
    return quantized.view_as(padded)[:rows, :cols].contiguous(), scales.view(blocks.size(0), blocks.size(2))
52
+
53
+
54
+ def per_custom_dims_cast_to_fp8(x: torch.Tensor, dims: Tuple, use_ue8m0: bool) -> Tuple[torch.Tensor, torch.Tensor]:
55
+ excluded_dims = tuple([i for i in range(x.dim()) if i not in set(dims)])
56
+ x_amax = x.abs().float().amax(dim=excluded_dims, keepdim=True).clamp(1e-4)
57
+ sf = x_amax / 448.0
58
+ sf = ceil_to_ue8m0(sf) if use_ue8m0 else sf
59
+ x_scaled = (x * (1.0 / sf)).to(torch.float8_e4m3fn)
60
+ return x_scaled, sf.squeeze()
61
+
62
+
63
def _quantize_to_fp4_e2m1(x: torch.Tensor) -> torch.Tensor:
    """Map values to 4-bit E2M1 codes (sign in bit 3, magnitude in bits 0-2).

    Magnitudes are clamped to 6.0 and rounded to the nearest representable
    level; negative zero is not encoded (the sign bit stays clear for the
    zero level). Returns uint8 codes in 0..15.
    """
    magnitude = x.abs().clamp_max(6.0)
    # Representable magnitudes {0, 0.5, 1, 1.5, 2, 3, 4, 6}; bucketize by the
    # midpoints between neighbouring levels for round-to-nearest.
    midpoints = torch.tensor([0.25, 0.75, 1.25, 1.75, 2.5, 3.5, 5.0],
                             device=x.device, dtype=magnitude.dtype)
    level = torch.bucketize(magnitude, midpoints)
    codes = level.to(torch.uint8)
    negative = (x < 0) & (level != 0)
    return codes | (negative.to(torch.uint8) << 3)
74
+
75
+
76
def per_token_cast_to_fp4(x: torch.Tensor, use_ue8m0: bool, gran_k: int = 128) -> Tuple[torch.Tensor, torch.Tensor]:
    """Quantize to packed FP4 (E2M1, two codes per byte), one scale per (row, `gran_k`-column group).

    Returns (uint8 tensor of shape (m, n // 2) with even columns in the low
    nibble, scales of shape (m, ceil(n / gran_k))).
    """
    assert x.dim() == 2
    rows, cols = x.shape
    assert cols % 2 == 0
    padded_cols = align(cols, gran_k)
    padded = torch.zeros((rows, padded_cols), dtype=x.dtype, device=x.device)
    padded[:, :cols] = x
    grouped = padded.view(rows, -1, gran_k)
    amax = grouped.abs().float().amax(dim=2).clamp_min(1e-4)
    scales = amax / 6.0  # 6.0 is the largest E2M1 magnitude
    if use_ue8m0:
        scales = ceil_to_ue8m0(scales)
    scaled = grouped * (1.0 / scales.unsqueeze(2))
    # One E2M1 code per element, then pack adjacent column pairs into bytes.
    codes = _quantize_to_fp4_e2m1(scaled).view(rows, padded_cols)
    pairs = codes.view(rows, padded_cols // 2, 2)
    packed = (pairs[:, :, 0] & 0x0F) | ((pairs[:, :, 1] & 0x0F) << 4)
    return packed[:, :cols // 2].contiguous(), scales
92
+
93
+
94
def transpose_packed_fp4(a: torch.Tensor) -> torch.Tensor:
    """Transpose a matrix of packed FP4 codes (two 4-bit codes per uint8 byte).

    Input shape (m, n/2) becomes output shape (n, m/2); `m` must be even so
    the transposed rows can be re-packed in pairs.
    """
    assert a.dtype == torch.uint8
    assert a.dim() == 2
    rows, packed_cols = a.shape
    cols = packed_cols * 2
    assert (rows % 2) == 0
    # Unpack to one code per byte (low nibble = even column).
    unpacked = torch.empty((rows, cols), device=a.device, dtype=torch.uint8)
    unpacked[:, 0::2] = a & 0x0F
    unpacked[:, 1::2] = (a >> 4) & 0x0F
    # Transpose, then pack adjacent pairs along the old row axis back into bytes.
    transposed = unpacked.transpose(0, 1).contiguous().view(cols, rows // 2, 2)
    result = (transposed[:, :, 0] & 0x0F) | ((transposed[:, :, 1] & 0x0F) << 4)
    return result.contiguous()
build/torch210-cxx11-cu128-x86_64-linux/__init__.py ADDED
@@ -0,0 +1,289 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import os
2
+ import subprocess
3
+ import torch
4
+
5
+ from ._ops import ops
6
+
7
+
8
def _find_cuda_home():
    """Locate the CUDA toolkit root directory.

    Resolution order:
      1. ``CUDA_HOME`` / ``CUDA_PATH`` environment variables (returned as-is).
      2. The directory two levels above the ``nvcc`` binary found on PATH.
      3. ``/usr/local/cuda``; auto-detected paths that do not exist are cleared.

    Returns:
        The CUDA home path, or an empty string if none could be found.
    """
    import shutil  # local import keeps this function self-contained

    cuda_home = os.environ.get('CUDA_HOME') or os.environ.get('CUDA_PATH')
    if cuda_home is None:
        # shutil.which replaces the original subprocess call to `which nvcc`:
        # no child process, and it also works where `which` is unavailable.
        nvcc = shutil.which('nvcc')
        if nvcc is not None:
            cuda_home = os.path.dirname(os.path.dirname(nvcc))
        else:
            cuda_home = '/usr/local/cuda'
        if not os.path.exists(cuda_home):
            cuda_home = ''
    return cuda_home or ''
22
+
23
+
24
def _find_cutlass_include():
    """Find CUTLASS include path for JIT compilation of .cuh templates."""
    # 1. Explicit override via DG_CUTLASS_INCLUDE.
    explicit = os.environ.get('DG_CUTLASS_INCLUDE')
    if explicit and os.path.isdir(explicit):
        return explicit

    # 2. CUTLASS_HOME env var (must contain include/cute).
    cutlass_home = os.environ.get('CUTLASS_HOME')
    if cutlass_home:
        candidate = os.path.join(cutlass_home, 'include')
        if os.path.isdir(os.path.join(candidate, 'cute')):
            return candidate

    # 3. Headers bundled inside this package's include/ directory.
    bundled = os.path.join(os.path.dirname(os.path.abspath(__file__)), 'include')
    if os.path.isdir(os.path.join(bundled, 'cute')):
        return bundled

    # 4. CUDA_HOME/include (some CUDA 12.8+ installs ship cute/).
    cuda_home = _find_cuda_home()
    if cuda_home:
        candidate = os.path.join(cuda_home, 'include')
        if os.path.isdir(os.path.join(candidate, 'cute')):
            return candidate

    # 5. The nvidia-cutlass Python package, if installed.
    try:
        import cutlass as _cutlass
    except ImportError:
        pass
    else:
        candidate = os.path.join(os.path.dirname(_cutlass.__file__), 'include')
        if os.path.isdir(os.path.join(candidate, 'cute')):
            return candidate

    # Nothing found; the C++ side also checks environment variables.
    return ""
62
+
63
+
64
def set_num_sms(new_num_sms):
    """Set how many SMs the kernels may occupy (forwarded to the extension)."""
    ops.set_num_sms(new_num_sms)

def get_num_sms():
    """Return the currently configured SM count."""
    return ops.get_num_sms()

def set_tc_util(new_tc_util):
    """Set the tensor-core utilization knob (forwarded to the extension)."""
    ops.set_tc_util(new_tc_util)

def get_tc_util():
    """Return the current tensor-core utilization setting."""
    return ops.get_tc_util()


# cuBLASLt GEMMs
# Thin wrappers computing D from A, B (optionally accumulating C). The
# nt/nn/tn/tt suffix presumably encodes the A/B transposition convention —
# confirm against the compiled extension's schema.
def cublaslt_gemm_nt(a, b, d, c=None):
    ops.cublaslt_gemm_nt(a, b, d, c)

def cublaslt_gemm_nn(a, b, d, c=None):
    ops.cublaslt_gemm_nn(a, b, d, c)

def cublaslt_gemm_tn(a, b, d, c=None):
    ops.cublaslt_gemm_tn(a, b, d, c)

def cublaslt_gemm_tt(a, b, d, c=None):
    ops.cublaslt_gemm_tt(a, b, d, c)
89
+
90
+
91
try:
    # Every wrapper below requires the matching schema to exist in the
    # compiled extension; see the note on the `except` at the bottom.
    # `a` and `b` are (tensor, scale-factor) pairs for the FP8/FP4 entry
    # points; recipe tuples are converted to lists for the C++ side.

    # FP8/FP4 GEMMs
    def fp8_fp4_gemm_nt(a, b, d, c=None, recipe=None, recipe_a=None,
                        recipe_b=None, compiled_dims="nk", disable_ue8m0_cast=False):
        ops.fp8_fp4_gemm_nt(a[0], a[1], b[0], b[1], d, c,
                            list(recipe) if recipe else None,
                            list(recipe_a) if recipe_a else None,
                            list(recipe_b) if recipe_b else None,
                            compiled_dims, disable_ue8m0_cast)

    def fp8_fp4_gemm_nn(a, b, d, c=None, recipe=None, recipe_a=None,
                        recipe_b=None, compiled_dims="nk", disable_ue8m0_cast=False):
        ops.fp8_fp4_gemm_nn(a[0], a[1], b[0], b[1], d, c,
                            list(recipe) if recipe else None,
                            list(recipe_a) if recipe_a else None,
                            list(recipe_b) if recipe_b else None,
                            compiled_dims, disable_ue8m0_cast)

    def fp8_fp4_gemm_tn(a, b, d, c=None, recipe=None, recipe_a=None,
                        recipe_b=None, compiled_dims="mn", disable_ue8m0_cast=False):
        ops.fp8_fp4_gemm_tn(a[0], a[1], b[0], b[1], d, c,
                            list(recipe) if recipe else None,
                            list(recipe_a) if recipe_a else None,
                            list(recipe_b) if recipe_b else None,
                            compiled_dims, disable_ue8m0_cast)

    def fp8_fp4_gemm_tt(a, b, d, c=None, recipe=None, recipe_a=None,
                        recipe_b=None, compiled_dims="mn", disable_ue8m0_cast=False):
        ops.fp8_fp4_gemm_tt(a[0], a[1], b[0], b[1], d, c,
                            list(recipe) if recipe else None,
                            list(recipe_a) if recipe_a else None,
                            list(recipe_b) if recipe_b else None,
                            compiled_dims, disable_ue8m0_cast)

    # The fp8_fp4_* entry points serve both element types; plain fp8 aliases.
    fp8_gemm_nt = fp8_fp4_gemm_nt
    fp8_gemm_nn = fp8_fp4_gemm_nn
    fp8_gemm_tn = fp8_fp4_gemm_tn
    fp8_gemm_tt = fp8_fp4_gemm_tt

    # Grouped GEMMs: `grouped_layout` describes per-group row assignment
    # (contiguous variants); `masked_m` masks rows per group (masked variant).
    def m_grouped_fp8_fp4_gemm_nt_contiguous(a, b, d, grouped_layout,
            recipe=None, recipe_a=None, recipe_b=None, compiled_dims="nk",
            disable_ue8m0_cast=False, use_psum_layout=False,
            expected_m_for_psum_layout=None):
        ops.m_grouped_fp8_fp4_gemm_nt_contiguous(
            a[0], a[1], b[0], b[1], d, grouped_layout,
            list(recipe) if recipe else None,
            list(recipe_a) if recipe_a else None,
            list(recipe_b) if recipe_b else None,
            compiled_dims, disable_ue8m0_cast, use_psum_layout,
            expected_m_for_psum_layout)

    m_grouped_fp8_gemm_nt_contiguous = m_grouped_fp8_fp4_gemm_nt_contiguous

    def m_grouped_fp8_fp4_gemm_nn_contiguous(a, b, d, grouped_layout,
            recipe=None, recipe_a=None, recipe_b=None, compiled_dims="nk",
            disable_ue8m0_cast=False, use_psum_layout=False):
        ops.m_grouped_fp8_fp4_gemm_nn_contiguous(
            a[0], a[1], b[0], b[1], d, grouped_layout,
            list(recipe) if recipe else None,
            list(recipe_a) if recipe_a else None,
            list(recipe_b) if recipe_b else None,
            compiled_dims, disable_ue8m0_cast, use_psum_layout)

    m_grouped_fp8_gemm_nn_contiguous = m_grouped_fp8_fp4_gemm_nn_contiguous

    def m_grouped_fp8_fp4_gemm_nt_masked(a, b, d, masked_m, expected_m,
            recipe=None, recipe_a=None, recipe_b=None, compiled_dims="nk",
            disable_ue8m0_cast=False):
        ops.m_grouped_fp8_fp4_gemm_nt_masked(
            a[0], a[1], b[0], b[1], d, masked_m, expected_m,
            list(recipe) if recipe else None,
            list(recipe_a) if recipe_a else None,
            list(recipe_b) if recipe_b else None,
            compiled_dims, disable_ue8m0_cast)

    m_grouped_fp8_gemm_nt_masked = m_grouped_fp8_fp4_gemm_nt_masked

    def k_grouped_fp8_gemm_nt_contiguous(a, b, d, ks, ks_tensor, c=None,
            recipe=(1, 1, 128), compiled_dims="mn"):
        ops.k_grouped_fp8_gemm_nt_contiguous(
            a[0], a[1], b[0], b[1], d, ks, ks_tensor, c,
            list(recipe), compiled_dims)

    def k_grouped_fp8_gemm_tn_contiguous(a, b, d, ks, ks_tensor, c=None,
            recipe=(1, 1, 128), compiled_dims="mn"):
        ops.k_grouped_fp8_gemm_tn_contiguous(
            a[0], a[1], b[0], b[1], d, ks, ks_tensor, c,
            list(recipe), compiled_dims)

    # BF16 GEMMs
    def bf16_gemm_nt(a, b, d, c=None, compiled_dims="nk"):
        ops.bf16_gemm_nt(a, b, d, c, compiled_dims)

    def bf16_gemm_nn(a, b, d, c=None, compiled_dims="nk"):
        ops.bf16_gemm_nn(a, b, d, c, compiled_dims)

    def bf16_gemm_tn(a, b, d, c=None, compiled_dims="mn"):
        ops.bf16_gemm_tn(a, b, d, c, compiled_dims)

    def bf16_gemm_tt(a, b, d, c=None, compiled_dims="mn"):
        ops.bf16_gemm_tt(a, b, d, c, compiled_dims)

    def m_grouped_bf16_gemm_nt_contiguous(a, b, d, grouped_layout,
            compiled_dims="nk", use_psum_layout=False,
            expected_m_for_psum_layout=None):
        ops.m_grouped_bf16_gemm_nt_contiguous(
            a, b, d, grouped_layout, compiled_dims,
            use_psum_layout, expected_m_for_psum_layout)

    def m_grouped_bf16_gemm_nn_contiguous(a, b, d, grouped_layout,
            compiled_dims="nk", use_psum_layout=False):
        ops.m_grouped_bf16_gemm_nn_contiguous(
            a, b, d, grouped_layout, compiled_dims, use_psum_layout)

    def m_grouped_bf16_gemm_nt_masked(a, b, d, masked_m, expected_m,
            compiled_dims="nk"):
        ops.m_grouped_bf16_gemm_nt_masked(
            a, b, d, masked_m, expected_m, compiled_dims)

    def k_grouped_bf16_gemm_tn_contiguous(a, b, d, ks, ks_tensor,
            c=None, compiled_dims="mn"):
        ops.k_grouped_bf16_gemm_tn_contiguous(
            a, b, d, ks, ks_tensor, c, compiled_dims)

    # Einsum
    def einsum(expr, a, b, d, c=None, use_cublaslt=False):
        ops.einsum(expr, a, b, d, c, use_cublaslt)

    def fp8_einsum(expr, a, b, d, c=None, recipe=(1, 128, 128)):
        ops.fp8_einsum(expr, a[0], a[1], b[0], b[1], d, c, list(recipe))

    # Attention
    def fp8_gemm_nt_skip_head_mid(a, b, d, head_splits, recipe=None,
            compiled_dims="nk", disable_ue8m0_cast=False):
        ops.fp8_gemm_nt_skip_head_mid(
            a[0], a[1], b[0], b[1], d, list(head_splits),
            list(recipe) if recipe else None,
            compiled_dims, disable_ue8m0_cast)

    def fp8_mqa_logits(q, kv, weights, cu_seq_len_k_start,
            cu_seq_len_k_end, clean_logits=True, max_seqlen_k=0):
        return ops.fp8_mqa_logits(
            q, kv[0], kv[1], weights,
            cu_seq_len_k_start, cu_seq_len_k_end,
            clean_logits, max_seqlen_k)

    def get_paged_mqa_logits_metadata(context_lens, block_kv, num_sms):
        return ops.get_paged_mqa_logits_metadata(
            context_lens, block_kv, num_sms)

    def fp8_paged_mqa_logits(q, fused_kv_cache, weights, context_lens,
            block_table, schedule_meta,
            max_context_len, clean_logits=False):
        return ops.fp8_paged_mqa_logits(
            q, fused_kv_cache, weights, context_lens,
            block_table, schedule_meta, max_context_len, clean_logits)

    # Hyperconnection
    def tf32_hc_prenorm_gemm(a, b, d, sqr_sum, num_splits=None):
        ops.tf32_hc_prenorm_gemm(a, b, d, sqr_sum, num_splits)

    # Layout
    def transform_sf_into_required_layout(sf, mn, k, recipe=None,
            recipe_ab=None, num_groups=None, is_sfa=False,
            disable_ue8m0_cast=False):
        return ops.transform_sf_into_required_layout(
            sf, mn, k,
            list(recipe) if recipe else None,
            list(recipe_ab) if recipe_ab else None,
            num_groups, is_sfa, disable_ue8m0_cast)

    def get_mk_alignment_for_contiguous_layout():
        return ops.get_mk_alignment_for_contiguous_layout()

    # Legacy aliases
    fp8_m_grouped_gemm_nt_masked = m_grouped_fp8_fp4_gemm_nt_masked
    bf16_m_grouped_gemm_nt_masked = m_grouped_bf16_gemm_nt_masked

except Exception:
    # NOTE(review): this guard hides *any* failure while the wrappers are
    # being defined — presumably missing schemas in older extension builds,
    # but it would also mask typos or schema mismatches. Consider narrowing
    # to the specific exception type the extension raises.
    pass
271
+
272
+ # Utils
273
+ from . import utils
274
+ from .utils import *
275
+
276
+ # Testing
277
+ from . import testing
278
+
279
# Initialize (gracefully skip if CUDA is not available, e.g. in build sandboxes)
try:
    ops.init(
        os.path.dirname(os.path.abspath(__file__)),  # package root (kernels/includes live here)
        _find_cuda_home(),
        _find_cutlass_include()
    )
except Exception:
    # Best-effort by design: importing the package must not fail on machines
    # without CUDA; any real problem resurfaces at the first kernel call.
    pass

__version__ = '2.3.0'
build/torch210-cxx11-cu128-x86_64-linux/_deep_gemm_099ac3c_dirty.abi3.so ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:8b4ca9c42204f1909adcefc61053c7943c105eadb44a447a1ea9a488e01675df
3
+ size 3078080
build/torch210-cxx11-cu128-x86_64-linux/_ops.py ADDED
@@ -0,0 +1,9 @@
 
 
 
 
 
 
 
 
 
 
1
+ import torch
2
+ from . import _deep_gemm_099ac3c_dirty
3
+ ops = torch.ops._deep_gemm_099ac3c_dirty
4
+
5
def add_op_namespace_prefix(op_name: str):
    """
    Prefix op by namespace.
    """
    return "_deep_gemm_099ac3c_dirty::" + op_name
build/torch210-cxx11-cu128-x86_64-linux/deep_gemm/__init__.py ADDED
@@ -0,0 +1,26 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import ctypes
2
+ import sys
3
+
4
+ import importlib
5
+ from pathlib import Path
6
+ from types import ModuleType
7
+
8
def _import_from_path(file_path: Path) -> ModuleType:
    """Load a Python file as a module under a unique, path-derived name.

    We cannot use the module name as-is: after adding it to ``sys.modules``
    it would also be used for other imports, so the name is the hex-encoded
    hash of the absolute path, which cannot collide with a regular import.

    Raises:
        ImportError: if a spec or module cannot be created for the file.
    """
    # Fix: `import importlib` alone does not guarantee the `importlib.util`
    # submodule attribute is set; import it explicitly to avoid a latent
    # AttributeError when nothing else has imported it first.
    import importlib.util

    path_hash = "{:x}".format(ctypes.c_size_t(hash(file_path.absolute())).value)
    module_name = path_hash
    spec = importlib.util.spec_from_file_location(module_name, file_path)
    if spec is None:
        raise ImportError(f"Cannot load spec for {module_name} from {file_path}")
    module = importlib.util.module_from_spec(spec)
    if module is None:
        raise ImportError(f"Cannot load module {module_name} from spec")
    sys.modules[module_name] = module
    spec.loader.exec_module(module)  # type: ignore
    return module
+
25
+
26
+ globals().update(vars(_import_from_path(Path(__file__).parent.parent / "__init__.py")))
build/torch210-cxx11-cu128-x86_64-linux/metadata.json ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ {
2
+ "python-depends": []
3
+ }
build/torch210-cxx11-cu128-x86_64-linux/testing/__init__.py ADDED
@@ -0,0 +1,4 @@
 
 
 
 
 
1
+ from . import bench, numeric, utils
2
+ from .bench import *
3
+ from .numeric import *
4
+ from .utils import *
build/torch210-cxx11-cu128-x86_64-linux/testing/bench.py ADDED
@@ -0,0 +1,137 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import os
2
+ import sys
3
+ import torch
4
+
5
+
6
def bench(fn, num_warmups: int = 5, num_tests: int = 10,
          high_precision: bool = False):
    """Average wall time of `fn` in seconds, measured with CUDA events.

    Requires a CUDA device. The L2 cache is flushed once with a 256 MB
    memset, `fn` is warmed up `num_warmups` times, then `num_tests` timed
    calls are averaged.

    Args:
        fn: zero-argument callable launching the work to measure.
        num_warmups: untimed calls made before measurement.
        num_tests: timed calls averaged into the result.
        high_precision: enqueue one large matmul before timing so the GPU is
            still busy while the timed kernels are launched, hiding CPU
            launch overhead.

    Returns:
        Average seconds per call (CUDA `elapsed_time` is in ms, hence / 1e3).
    """
    # Flush L2 cache with 256 MB data
    torch.cuda.synchronize()
    cache = torch.empty(int(256e6 // 4), dtype=torch.int, device='cuda')
    cache.zero_()

    # Warmup
    for _ in range(num_warmups):
        fn()

    # Add a large kernel to eliminate the CPU launch overhead
    if high_precision:
        x = torch.randn((8192, 8192), dtype=torch.float, device='cuda')
        y = torch.randn((8192, 8192), dtype=torch.float, device='cuda')
        x @ y  # result discarded; only enqueued to keep the GPU busy

    # Testing
    start_event = torch.cuda.Event(enable_timing=True)
    end_event = torch.cuda.Event(enable_timing=True)
    start_event.record()
    for i in range(num_tests):
        fn()
    end_event.record()
    torch.cuda.synchronize()

    return start_event.elapsed_time(end_event) / num_tests / 1e3
33
+
34
+
35
class empty_suppress:
    """No-op context manager (drop-in stand-in for suppress_stdout_stderr)."""

    def __enter__(self):
        return self

    def __exit__(self, *_):
        return None
41
+
42
+
43
class suppress_stdout_stderr:
    """Context manager silencing both Python-level and fd-level output.

    File descriptors 1/2 are redirected to os.devnull (so prints from C/CUDA
    libraries are muted too) and sys.stdout/sys.stderr are rebound to the
    null files; everything is restored on exit. The statement order below is
    deliberate: fds must be duplicated before they are redirected.
    """

    def __enter__(self):
        self.outnull_file = open(os.devnull, 'w')
        self.errnull_file = open(os.devnull, 'w')

        # The raw fd numbers currently backing stdout/stderr (normally 1, 2).
        self.old_stdout_fileno_undup = sys.stdout.fileno()
        self.old_stderr_fileno_undup = sys.stderr.fileno()

        # Duplicates that keep the original output destinations reachable.
        self.old_stdout_fileno = os.dup(sys.stdout.fileno())
        self.old_stderr_fileno = os.dup(sys.stderr.fileno())

        self.old_stdout = sys.stdout
        self.old_stderr = sys.stderr

        # Point fds 1/2 at /dev/null — covers native-code output.
        os.dup2(self.outnull_file.fileno(), self.old_stdout_fileno_undup)
        os.dup2(self.errnull_file.fileno(), self.old_stderr_fileno_undup)

        # Point the Python-level streams at /dev/null as well.
        sys.stdout = self.outnull_file
        sys.stderr = self.errnull_file
        return self

    def __exit__(self, *_):
        sys.stdout = self.old_stdout
        sys.stderr = self.old_stderr

        # Restore the original fds, then release the duplicates.
        os.dup2(self.old_stdout_fileno, self.old_stdout_fileno_undup)
        os.dup2(self.old_stderr_fileno, self.old_stderr_fileno_undup)

        os.close(self.old_stdout_fileno)
        os.close(self.old_stderr_fileno)

        self.outnull_file.close()
        self.errnull_file.close()
76
+
77
+
78
def bench_kineto(fn, kernel_names, num_tests: int = 30,
                 suppress_kineto_output: bool = False,
                 trace_path: str = None, flush_l2: bool = True,
                 with_multiple_kernels: bool = False):
    """Profile `fn` with the torch (Kineto) profiler; return per-kernel times.

    Args:
        fn: zero-argument callable launching the kernels to profile.
        kernel_names: substring identifying one kernel, or a tuple of them.
        num_tests: invocations of `fn` per profiler cycle.
        suppress_kineto_output: silence profiler console output.
        trace_path: if given, export a Chrome trace to this path.
        flush_l2: memset a large buffer before every call to empty L2.
        with_multiple_kernels: allow one name to match several profiler rows.

    Returns:
        Average seconds per launch — a scalar for a string `kernel_names`, a
        tuple for a tuple. Returns dummy 1s when DG_USE_NVIDIA_TOOLS is set.
    """
    assert isinstance(kernel_names, str) or isinstance(kernel_names, tuple)
    is_tuple = isinstance(kernel_names, tuple)

    # Skip profiling
    # Conflict with Nsight Systems, Nsight Compute and Compute Sanitizer
    if int(os.environ.get('DG_USE_NVIDIA_TOOLS', 0)):
        return (1, ) * len(kernel_names) if is_tuple else 1

    # By default, flush L2 with an excessive 8 GB memset to give the GPU some (literal) chill time without full idle
    flush_l2_size = int(8e9 // 4)

    # For some auto-tuning kernels with prints
    fn()

    # Profile
    suppress = suppress_stdout_stderr if suppress_kineto_output else empty_suppress
    with suppress():
        schedule = torch.profiler.schedule(wait=1, warmup=0, active=1, repeat=1)
        profiler = torch.profiler.profile(activities=[torch.profiler.ProfilerActivity.CUDA], schedule=schedule)
        with profiler:
            # Two cycles: the schedule skips the first (wait=1) and records
            # only the second (active=1).
            for i in range(2):
                for _ in range(num_tests):
                    if flush_l2:
                        torch.empty(flush_l2_size, dtype=torch.int, device='cuda').zero_()
                    fn()
                profiler.step()

    # Parse the profiling table
    prof_lines = profiler.key_averages().table(sort_by='cuda_time_total', max_name_column_width=100).split('\n')
    kernel_names = (kernel_names, ) if isinstance(kernel_names, str) else kernel_names
    if not with_multiple_kernels:
        for name in kernel_names:
            assert sum([name in line for line in prof_lines]) <= 1, f'Errors of the kernel {name} in the profiling table'

    # Save chrome traces
    if trace_path is not None:
        profiler.export_chrome_trace(trace_path)

    # Return average kernel times
    # NOTE(review): this scrapes the human-readable table; the last two
    # columns are taken as total time ("12.3ms" / "45.6us") and call count —
    # brittle if the table format ever changes.
    units = {'ms': 1e3, 'us': 1e6}
    kernel_times = []
    for name in kernel_names:
        total_time = 0
        total_num = 0
        for line in prof_lines:
            if name in line:
                time_str = line.split()[-2]
                num_str = line.split()[-1]
                for unit, scale in units.items():
                    if unit in time_str:
                        total_time += float(time_str.replace(unit, '')) / scale * int(num_str)
                        total_num += int(num_str)
                        break
        kernel_times.append(total_time / total_num if total_num > 0 else 0)

    return tuple(kernel_times) if is_tuple else kernel_times[0]
build/torch210-cxx11-cu128-x86_64-linux/testing/numeric.py ADDED
@@ -0,0 +1,21 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
import torch
from typing import Iterable


def calc_diff(x: torch.Tensor, y: torch.Tensor):
    """Similarity-based difference: 0 for identical tensors, growing as they diverge."""
    x, y = x.double(), y.double()
    denominator = (x * x + y * y).sum()
    if denominator == 0:  # Which means that all elements in x and y are 0
        return 0.0
    return 1 - 2 * (x * y).sum() / denominator


def count_bytes(*tensors):
    """Total storage in bytes; nested tuples/lists flattened, None entries skipped."""
    def _nbytes(item):
        if item is None:
            return 0
        if isinstance(item, (tuple, list)):
            return count_bytes(*item)
        return item.numel() * item.element_size()

    return sum(_nbytes(item) for item in tensors)
build/torch210-cxx11-cu128-x86_64-linux/testing/utils.py ADDED
@@ -0,0 +1,38 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import functools
2
+ import os
3
+ import torch
4
+ from typing import Callable
5
+
6
+ def get_arch_major() -> int:
7
+ major, minor = torch.cuda.get_device_capability()
8
+ return major
9
+
10
+
11
+ def test_filter(condition: Callable):
12
+ def decorator(func):
13
+ @functools.wraps(func)
14
+ def wrapper(*args, **kwargs):
15
+ if condition():
16
+ func(*args, **kwargs)
17
+ else:
18
+ print(f'{func.__name__}:')
19
+ print(f' > Filtered by {condition}')
20
+ print()
21
+ return wrapper
22
+ return decorator
23
+
24
+
25
+ def ignore_env(name: str, condition: Callable):
26
+ def decorator(func):
27
+ @functools.wraps(func)
28
+ def wrapper(*args, **kwargs):
29
+ if condition():
30
+ saved = os.environ.pop(name, None)
31
+ func(*args, **kwargs)
32
+ if saved is not None:
33
+ os.environ[name] = saved
34
+ else:
35
+ func(*args, **kwargs)
36
+
37
+ return wrapper
38
+ return decorator
build/torch210-cxx11-cu128-x86_64-linux/utils/__init__.py ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ from . import math, layout
2
+ from .layout import *
3
+ from .math import *
build/torch210-cxx11-cu128-x86_64-linux/utils/layout.py ADDED
@@ -0,0 +1,25 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
try:
    from .._ops import ops

    def get_tma_aligned_size(x, element_size):
        # Forwarded to the compiled extension; computes the TMA-aligned size
        # for a dimension of `x` elements of `element_size` bytes.
        return ops.get_tma_aligned_size(x, element_size)

    def get_mn_major_tma_aligned_tensor(sf):
        # Re-lay out a scale-factor tensor so it is MN-major and TMA-aligned.
        return ops.get_mn_major_tma_aligned_tensor(sf)

    def get_mn_major_tma_aligned_packed_ue8m0_tensor(sf):
        # Same as above, but additionally packs scales into UE8M0 format.
        return ops.get_mn_major_tma_aligned_packed_ue8m0_tensor(sf)

    def get_k_grouped_mn_major_tma_aligned_packed_ue8m0_tensor(sf, ks_tensor, ks):
        # K-grouped variant; `ks_tensor`/`ks` describe the per-group K sizes.
        return ops.get_k_grouped_mn_major_tma_aligned_packed_ue8m0_tensor(
            sf, ks_tensor, ks)
except ImportError:
    # NOTE(review): this guard looks ineffective — the unconditional
    # `from .._ops import ops as _ops` below would re-raise the same
    # ImportError anyway. Confirm whether the guard can be removed or the
    # import below should be guarded too.
    pass

from .._ops import ops as _ops

def get_mk_alignment_for_contiguous_layout():
    # Alignment (in rows/elements) required by the contiguous grouped-GEMM
    # layout; value comes from the compiled extension.
    return _ops.get_mk_alignment_for_contiguous_layout()

# Historical aliases kept for backward compatibility.
get_m_alignment_for_contiguous_layout = get_mk_alignment_for_contiguous_layout
get_k_alignment_for_contiguous_layout = get_mk_alignment_for_contiguous_layout
build/torch210-cxx11-cu128-x86_64-linux/utils/math.py ADDED
@@ -0,0 +1,107 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import torch
2
+ from typing import Tuple
3
+
4
+
5
+ def ceil_div(x: int, y: int) -> int:
6
+ return (x + y - 1) // y
7
+
8
+
9
+ def align(x: int, y: int) -> int:
10
+ return ceil_div(x, y) * y
11
+
12
+
13
+ def ceil_to_ue8m0(x: torch.Tensor):
14
+ assert x.view(-1).amax().item() > 0
15
+ return torch.pow(2.0, torch.ceil(torch.log2(x.abs())))
16
+
17
+
18
+ def per_token_cast_to_fp8(x: torch.Tensor, use_ue8m0: bool, gran_k: int = 128) -> Tuple[torch.Tensor, torch.Tensor]:
19
+ assert x.dim() == 2
20
+ m, n = x.shape
21
+ padded_n = align(n, gran_k)
22
+ x_padded = torch.empty((m, padded_n), dtype=x.dtype, device=x.device).fill_(0)
23
+ x_padded[:, :n] = x
24
+ x_view = x_padded.view(m, -1, gran_k)
25
+ x_amax = x_view.abs().float().amax(dim=2).view(m, -1).clamp(1e-4)
26
+ sf = x_amax / 448.0
27
+ sf = ceil_to_ue8m0(sf) if use_ue8m0 else sf
28
+ return (x_view * (1.0 / sf.unsqueeze(2))).to(torch.float8_e4m3fn).view(m, padded_n)[:, :n].contiguous(), sf
29
+
30
+
31
+ def per_channel_cast_to_fp8(x: torch.Tensor, use_ue8m0: bool, gran_k: int = 128) -> Tuple[torch.Tensor, torch.Tensor]:
32
+ assert x.dim() == 2 and x.size(0) % gran_k == 0
33
+ m, n = x.shape
34
+ x_view = x.view(-1, gran_k, n)
35
+ x_amax = x_view.abs().float().amax(dim=1).view(-1, n).clamp(1e-4)
36
+ sf = x_amax / 448.0
37
+ sf = ceil_to_ue8m0(sf) if use_ue8m0 else sf
38
+ return (x_view * (1.0 / sf.unsqueeze(1))).to(torch.float8_e4m3fn).view(m, n), sf
39
+
40
+
41
+ def per_block_cast_to_fp8(x: torch.Tensor, use_ue8m0: bool, gran_k: int = 128) -> Tuple[torch.Tensor, torch.Tensor]:
42
+ assert x.dim() == 2
43
+ m, n = x.shape
44
+ x_padded = torch.zeros((align(m, gran_k), align(n, gran_k)), dtype=x.dtype, device=x.device)
45
+ x_padded[:m, :n] = x
46
+ x_view = x_padded.view(-1, gran_k, x_padded.size(1) // gran_k, gran_k)
47
+ x_amax = x_view.abs().float().amax(dim=(1, 3), keepdim=True).clamp(1e-4)
48
+ sf = x_amax / 448.0
49
+ sf = ceil_to_ue8m0(sf) if use_ue8m0 else sf
50
+ x_scaled = (x_view * (1.0 / sf)).to(torch.float8_e4m3fn)
51
+ return x_scaled.view_as(x_padded)[:m, :n].contiguous(), sf.view(x_view.size(0), x_view.size(2))
52
+
53
+
54
+ def per_custom_dims_cast_to_fp8(x: torch.Tensor, dims: Tuple, use_ue8m0: bool) -> Tuple[torch.Tensor, torch.Tensor]:
55
+ excluded_dims = tuple([i for i in range(x.dim()) if i not in set(dims)])
56
+ x_amax = x.abs().float().amax(dim=excluded_dims, keepdim=True).clamp(1e-4)
57
+ sf = x_amax / 448.0
58
+ sf = ceil_to_ue8m0(sf) if use_ue8m0 else sf
59
+ x_scaled = (x * (1.0 / sf)).to(torch.float8_e4m3fn)
60
+ return x_scaled, sf.squeeze()
61
+
62
+
63
+ def _quantize_to_fp4_e2m1(x: torch.Tensor) -> torch.Tensor:
64
+ ax = x.abs().clamp_max(6.0)
65
+ # {0, 0.5, 1, 1.5, 2, 3, 4, 6}
66
+ # midpoints: 0.25, 0.75, 1.25, 1.75, 2.5, 3.5, 5.0
67
+ boundaries = torch.tensor([0.25, 0.75, 1.25, 1.75, 2.5, 3.5, 5.0],
68
+ device=x.device, dtype=ax.dtype)
69
+ idx = torch.bucketize(ax, boundaries)
70
+ code = idx.to(torch.uint8)
71
+ sign = (x < 0) & (idx != 0)
72
+ code = code | (sign.to(torch.uint8) << 3)
73
+ return code # uint8, 0..15
74
+
75
+
76
+ def per_token_cast_to_fp4(x: torch.Tensor, use_ue8m0: bool, gran_k: int = 128) -> Tuple[torch.Tensor, torch.Tensor]:
77
+ assert x.dim() == 2
78
+ m, n = x.shape
79
+ assert n % 2 == 0
80
+ padded_n = align(n, gran_k)
81
+ x_padded = torch.zeros((m, padded_n), dtype=x.dtype, device=x.device)
82
+ x_padded[:, :n] = x
83
+ x_view = x_padded.view(m, -1, gran_k)
84
+ x_amax = x_view.abs().float().amax(dim=2).clamp_min(1e-4)
85
+ sf = x_amax / 6.0
86
+ sf = ceil_to_ue8m0(sf) if use_ue8m0 else sf
87
+ x_scaled = x_view * (1.0 / sf.unsqueeze(2))
88
+ codes = _quantize_to_fp4_e2m1(x_scaled).view(m, padded_n) # uint8, (m, padded_n)
89
+ codes2 = codes.view(m, padded_n // 2, 2)
90
+ packed = (codes2[:, :, 0] & 0x0F) | ((codes2[:, :, 1] & 0x0F) << 4) # uint8
91
+ return packed[:, :n // 2].contiguous(), sf
92
+
93
+
94
+ def transpose_packed_fp4(a: torch.Tensor) -> torch.Tensor:
95
+ assert a.dtype == torch.uint8
96
+ assert a.dim() == 2
97
+ m, n2 = a.shape
98
+ n = n2 * 2
99
+ assert (m % 2) == 0
100
+ lo = a & 0x0F
101
+ hi = (a >> 4) & 0x0F
102
+ codes = torch.empty((m, n), device=a.device, dtype=torch.uint8)
103
+ codes[:, 0::2], codes[:, 1::2] = lo, hi
104
+ codes_t = codes.transpose(0, 1).contiguous()
105
+ codes2 = codes_t.view(n, m // 2, 2)
106
+ out = (codes2[:, :, 0] & 0x0F) | ((codes2[:, :, 1] & 0x0F) << 4)
107
+ return out.contiguous()
build/torch210-cxx11-cu130-x86_64-linux/__init__.py ADDED
@@ -0,0 +1,289 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import os
2
+ import subprocess
3
+ import torch
4
+
5
+ from ._ops import ops
6
+
7
+
8
+ def _find_cuda_home():
9
+ cuda_home = os.environ.get('CUDA_HOME') or os.environ.get('CUDA_PATH')
10
+ if cuda_home is None:
11
+ try:
12
+ with open(os.devnull, 'w') as devnull:
13
+ nvcc = subprocess.check_output(
14
+ ['which', 'nvcc'], stderr=devnull
15
+ ).decode().rstrip('\r\n')
16
+ cuda_home = os.path.dirname(os.path.dirname(nvcc))
17
+ except Exception:
18
+ cuda_home = '/usr/local/cuda'
19
+ if not os.path.exists(cuda_home):
20
+ cuda_home = ''
21
+ return cuda_home or ''
22
+
23
+
24
def _find_cutlass_include():
    """Find CUTLASS include path for JIT compilation of .cuh templates.

    A candidate directory is accepted when it contains a ``cute/``
    subdirectory.  Search order: ``DG_CUTLASS_INCLUDE`` env var,
    ``CUTLASS_HOME`` env var, the bundled package ``include/``,
    ``CUDA_HOME/include``, then the ``nvidia-cutlass`` Python package.
    Returns ``""`` when no candidate is found.
    """
    # 1. Explicit env var
    cutlass_include = os.environ.get('DG_CUTLASS_INCLUDE')
    if cutlass_include and os.path.isdir(cutlass_include):
        return cutlass_include

    # 2. CUTLASS_HOME env var
    cutlass_home = os.environ.get('CUTLASS_HOME')
    if cutlass_home:
        p = os.path.join(cutlass_home, 'include')
        if os.path.isdir(os.path.join(p, 'cute')):
            return p

    # 3. Check in package include/ directory (bundled cute/cutlass headers)
    pkg_include = os.path.join(os.path.dirname(os.path.abspath(__file__)), 'include')
    if os.path.isdir(os.path.join(pkg_include, 'cute')):
        return pkg_include

    # 4. Check CUDA_HOME/include (some CUDA 12.8+ installs include cute/)
    cuda_home = _find_cuda_home()
    if cuda_home:
        cuda_inc = os.path.join(cuda_home, 'include')
        if os.path.isdir(os.path.join(cuda_inc, 'cute')):
            return cuda_inc

    # 5. Try to find nvidia-cutlass Python package
    try:
        import cutlass as _cutlass
        cutlass_dir = os.path.dirname(_cutlass.__file__)
        p = os.path.join(cutlass_dir, 'include')
        if os.path.isdir(os.path.join(p, 'cute')):
            return p
    except ImportError:
        pass

    # Return empty string; C++ side will also check env vars
    return ""
62
+
63
+
64
def set_num_sms(new_num_sms):
    """Set the SM-count knob; forwarded to the compiled extension."""
    ops.set_num_sms(new_num_sms)

def get_num_sms():
    """Return the SM-count knob from the compiled extension."""
    return ops.get_num_sms()

def set_tc_util(new_tc_util):
    """Set the tensor-core utilization knob; forwarded to the compiled extension."""
    ops.set_tc_util(new_tc_util)

def get_tc_util():
    """Return the tensor-core utilization knob from the compiled extension."""
    return ops.get_tc_util()
75
+
76
+
77
# cuBLASLt GEMMs
# Thin forwarders to the compiled cuBLASLt paths.  The nt/nn/tn/tt suffix
# presumably encodes the operand transpose layout — confirm against the C++
# op signatures.  `c` is an optional extra tensor argument (default None).
def cublaslt_gemm_nt(a, b, d, c=None):
    ops.cublaslt_gemm_nt(a, b, d, c)

def cublaslt_gemm_nn(a, b, d, c=None):
    ops.cublaslt_gemm_nn(a, b, d, c)

def cublaslt_gemm_tn(a, b, d, c=None):
    ops.cublaslt_gemm_tn(a, b, d, c)

def cublaslt_gemm_tt(a, b, d, c=None):
    ops.cublaslt_gemm_tt(a, b, d, c)
89
+
90
+
91
# All compiled-op wrappers are defined inside a single `try` so that a
# missing or partially-built extension leaves the module importable (the
# names are then simply absent).  Each FP8/FP4 wrapper unpacks a
# (tensor, scale-factor) pair per operand and normalizes recipe tuples to
# lists before forwarding to the compiled ops.
try:
    # FP8/FP4 GEMMs
    def fp8_fp4_gemm_nt(a, b, d, c=None, recipe=None, recipe_a=None,
                        recipe_b=None, compiled_dims="nk", disable_ue8m0_cast=False):
        ops.fp8_fp4_gemm_nt(a[0], a[1], b[0], b[1], d, c,
                            list(recipe) if recipe else None,
                            list(recipe_a) if recipe_a else None,
                            list(recipe_b) if recipe_b else None,
                            compiled_dims, disable_ue8m0_cast)

    def fp8_fp4_gemm_nn(a, b, d, c=None, recipe=None, recipe_a=None,
                        recipe_b=None, compiled_dims="nk", disable_ue8m0_cast=False):
        ops.fp8_fp4_gemm_nn(a[0], a[1], b[0], b[1], d, c,
                            list(recipe) if recipe else None,
                            list(recipe_a) if recipe_a else None,
                            list(recipe_b) if recipe_b else None,
                            compiled_dims, disable_ue8m0_cast)

    def fp8_fp4_gemm_tn(a, b, d, c=None, recipe=None, recipe_a=None,
                        recipe_b=None, compiled_dims="mn", disable_ue8m0_cast=False):
        ops.fp8_fp4_gemm_tn(a[0], a[1], b[0], b[1], d, c,
                            list(recipe) if recipe else None,
                            list(recipe_a) if recipe_a else None,
                            list(recipe_b) if recipe_b else None,
                            compiled_dims, disable_ue8m0_cast)

    def fp8_fp4_gemm_tt(a, b, d, c=None, recipe=None, recipe_a=None,
                        recipe_b=None, compiled_dims="mn", disable_ue8m0_cast=False):
        ops.fp8_fp4_gemm_tt(a[0], a[1], b[0], b[1], d, c,
                            list(recipe) if recipe else None,
                            list(recipe_a) if recipe_a else None,
                            list(recipe_b) if recipe_b else None,
                            compiled_dims, disable_ue8m0_cast)

    # Backward-compatible aliases (FP8-only names).
    fp8_gemm_nt = fp8_fp4_gemm_nt
    fp8_gemm_nn = fp8_fp4_gemm_nn
    fp8_gemm_tn = fp8_fp4_gemm_tn
    fp8_gemm_tt = fp8_fp4_gemm_tt

    def m_grouped_fp8_fp4_gemm_nt_contiguous(a, b, d, grouped_layout,
            recipe=None, recipe_a=None, recipe_b=None, compiled_dims="nk",
            disable_ue8m0_cast=False, use_psum_layout=False,
            expected_m_for_psum_layout=None):
        ops.m_grouped_fp8_fp4_gemm_nt_contiguous(
            a[0], a[1], b[0], b[1], d, grouped_layout,
            list(recipe) if recipe else None,
            list(recipe_a) if recipe_a else None,
            list(recipe_b) if recipe_b else None,
            compiled_dims, disable_ue8m0_cast, use_psum_layout,
            expected_m_for_psum_layout)

    m_grouped_fp8_gemm_nt_contiguous = m_grouped_fp8_fp4_gemm_nt_contiguous

    def m_grouped_fp8_fp4_gemm_nn_contiguous(a, b, d, grouped_layout,
            recipe=None, recipe_a=None, recipe_b=None, compiled_dims="nk",
            disable_ue8m0_cast=False, use_psum_layout=False):
        ops.m_grouped_fp8_fp4_gemm_nn_contiguous(
            a[0], a[1], b[0], b[1], d, grouped_layout,
            list(recipe) if recipe else None,
            list(recipe_a) if recipe_a else None,
            list(recipe_b) if recipe_b else None,
            compiled_dims, disable_ue8m0_cast, use_psum_layout)

    m_grouped_fp8_gemm_nn_contiguous = m_grouped_fp8_fp4_gemm_nn_contiguous

    def m_grouped_fp8_fp4_gemm_nt_masked(a, b, d, masked_m, expected_m,
            recipe=None, recipe_a=None, recipe_b=None, compiled_dims="nk",
            disable_ue8m0_cast=False):
        ops.m_grouped_fp8_fp4_gemm_nt_masked(
            a[0], a[1], b[0], b[1], d, masked_m, expected_m,
            list(recipe) if recipe else None,
            list(recipe_a) if recipe_a else None,
            list(recipe_b) if recipe_b else None,
            compiled_dims, disable_ue8m0_cast)

    m_grouped_fp8_gemm_nt_masked = m_grouped_fp8_fp4_gemm_nt_masked

    def k_grouped_fp8_gemm_nt_contiguous(a, b, d, ks, ks_tensor, c=None,
            recipe=(1, 1, 128), compiled_dims="mn"):
        ops.k_grouped_fp8_gemm_nt_contiguous(
            a[0], a[1], b[0], b[1], d, ks, ks_tensor, c,
            list(recipe), compiled_dims)

    def k_grouped_fp8_gemm_tn_contiguous(a, b, d, ks, ks_tensor, c=None,
            recipe=(1, 1, 128), compiled_dims="mn"):
        ops.k_grouped_fp8_gemm_tn_contiguous(
            a[0], a[1], b[0], b[1], d, ks, ks_tensor, c,
            list(recipe), compiled_dims)

    # BF16 GEMMs
    def bf16_gemm_nt(a, b, d, c=None, compiled_dims="nk"):
        ops.bf16_gemm_nt(a, b, d, c, compiled_dims)

    def bf16_gemm_nn(a, b, d, c=None, compiled_dims="nk"):
        ops.bf16_gemm_nn(a, b, d, c, compiled_dims)

    def bf16_gemm_tn(a, b, d, c=None, compiled_dims="mn"):
        ops.bf16_gemm_tn(a, b, d, c, compiled_dims)

    def bf16_gemm_tt(a, b, d, c=None, compiled_dims="mn"):
        ops.bf16_gemm_tt(a, b, d, c, compiled_dims)

    def m_grouped_bf16_gemm_nt_contiguous(a, b, d, grouped_layout,
            compiled_dims="nk", use_psum_layout=False,
            expected_m_for_psum_layout=None):
        ops.m_grouped_bf16_gemm_nt_contiguous(
            a, b, d, grouped_layout, compiled_dims,
            use_psum_layout, expected_m_for_psum_layout)

    def m_grouped_bf16_gemm_nn_contiguous(a, b, d, grouped_layout,
            compiled_dims="nk", use_psum_layout=False):
        ops.m_grouped_bf16_gemm_nn_contiguous(
            a, b, d, grouped_layout, compiled_dims, use_psum_layout)

    def m_grouped_bf16_gemm_nt_masked(a, b, d, masked_m, expected_m,
            compiled_dims="nk"):
        ops.m_grouped_bf16_gemm_nt_masked(
            a, b, d, masked_m, expected_m, compiled_dims)

    def k_grouped_bf16_gemm_tn_contiguous(a, b, d, ks, ks_tensor,
            c=None, compiled_dims="mn"):
        ops.k_grouped_bf16_gemm_tn_contiguous(
            a, b, d, ks, ks_tensor, c, compiled_dims)

    # Einsum
    def einsum(expr, a, b, d, c=None, use_cublaslt=False):
        ops.einsum(expr, a, b, d, c, use_cublaslt)

    def fp8_einsum(expr, a, b, d, c=None, recipe=(1, 128, 128)):
        ops.fp8_einsum(expr, a[0], a[1], b[0], b[1], d, c, list(recipe))

    # Attention
    def fp8_gemm_nt_skip_head_mid(a, b, d, head_splits, recipe=None,
            compiled_dims="nk", disable_ue8m0_cast=False):
        ops.fp8_gemm_nt_skip_head_mid(
            a[0], a[1], b[0], b[1], d, list(head_splits),
            list(recipe) if recipe else None,
            compiled_dims, disable_ue8m0_cast)

    def fp8_mqa_logits(q, kv, weights, cu_seq_len_k_start,
            cu_seq_len_k_end, clean_logits=True, max_seqlen_k=0):
        return ops.fp8_mqa_logits(
            q, kv[0], kv[1], weights,
            cu_seq_len_k_start, cu_seq_len_k_end,
            clean_logits, max_seqlen_k)

    def get_paged_mqa_logits_metadata(context_lens, block_kv, num_sms):
        return ops.get_paged_mqa_logits_metadata(
            context_lens, block_kv, num_sms)

    def fp8_paged_mqa_logits(q, fused_kv_cache, weights, context_lens,
            block_table, schedule_meta,
            max_context_len, clean_logits=False):
        return ops.fp8_paged_mqa_logits(
            q, fused_kv_cache, weights, context_lens,
            block_table, schedule_meta, max_context_len, clean_logits)

    # Hyperconnection
    def tf32_hc_prenorm_gemm(a, b, d, sqr_sum, num_splits=None):
        ops.tf32_hc_prenorm_gemm(a, b, d, sqr_sum, num_splits)

    # Layout
    def transform_sf_into_required_layout(sf, mn, k, recipe=None,
            recipe_ab=None, num_groups=None, is_sfa=False,
            disable_ue8m0_cast=False):
        return ops.transform_sf_into_required_layout(
            sf, mn, k,
            list(recipe) if recipe else None,
            list(recipe_ab) if recipe_ab else None,
            num_groups, is_sfa, disable_ue8m0_cast)

    def get_mk_alignment_for_contiguous_layout():
        return ops.get_mk_alignment_for_contiguous_layout()

    # Legacy aliases
    fp8_m_grouped_gemm_nt_masked = m_grouped_fp8_fp4_gemm_nt_masked
    bf16_m_grouped_gemm_nt_masked = m_grouped_bf16_gemm_nt_masked

except Exception:
    # NOTE(review): a broad except here also hides genuine failures (e.g. a
    # typo raising NameError during definition) — confirm narrowing to the
    # expected registration errors is not possible.
    pass
271
+
272
+ # Utils
273
+ from . import utils
274
+ from .utils import *
275
+
276
+ # Testing
277
+ from . import testing
278
+
279
# Initialize (gracefully skip if CUDA is not available, e.g. in build sandboxes)
try:
    ops.init(
        os.path.dirname(os.path.abspath(__file__)),  # package root
        _find_cuda_home(),
        _find_cutlass_include()
    )
except Exception:
    # NOTE(review): broad except — all init failures, not only "no CUDA",
    # are silently swallowed; confirm this is intended.
    pass
288
+
289
+ __version__ = '2.3.0'
build/torch210-cxx11-cu130-x86_64-linux/_deep_gemm_099ac3c_dirty.abi3.so ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:8307e5e24ea3f68435a8251df19977bfd2323e60f761b4c3cd7c5ba7aada4c3f
3
+ size 3078072
build/torch210-cxx11-cu130-x86_64-linux/_ops.py ADDED
@@ -0,0 +1,9 @@
 
 
 
 
 
 
 
 
 
 
1
+ import torch
2
+ from . import _deep_gemm_099ac3c_dirty
3
+ ops = torch.ops._deep_gemm_099ac3c_dirty
4
+
5
def add_op_namespace_prefix(op_name: str) -> str:
    """Qualify *op_name* with this extension's torch op namespace."""
    return "_deep_gemm_099ac3c_dirty::" + op_name
build/torch210-cxx11-cu130-x86_64-linux/deep_gemm/__init__.py ADDED
@@ -0,0 +1,26 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import ctypes
2
+ import sys
3
+
4
+ import importlib
5
+ from pathlib import Path
6
+ from types import ModuleType
7
+
8
+ def _import_from_path(file_path: Path) -> ModuleType:
9
+ # We cannot use the module name as-is, after adding it to `sys.modules`,
10
+ # it would also be used for other imports. So, we make a module name that
11
+ # depends on the path for it to be unique using the hex-encoded hash of
12
+ # the path.
13
+ path_hash = "{:x}".format(ctypes.c_size_t(hash(file_path.absolute())).value)
14
+ module_name = path_hash
15
+ spec = importlib.util.spec_from_file_location(module_name, file_path)
16
+ if spec is None:
17
+ raise ImportError(f"Cannot load spec for {module_name} from {file_path}")
18
+ module = importlib.util.module_from_spec(spec)
19
+ if module is None:
20
+ raise ImportError(f"Cannot load module {module_name} from spec")
21
+ sys.modules[module_name] = module
22
+ spec.loader.exec_module(module) # type: ignore
23
+ return module
24
+
25
+
26
+ globals().update(vars(_import_from_path(Path(__file__).parent.parent / "__init__.py")))
build/torch210-cxx11-cu130-x86_64-linux/metadata.json ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ {
2
+ "python-depends": []
3
+ }
build/torch210-cxx11-cu130-x86_64-linux/testing/__init__.py ADDED
@@ -0,0 +1,4 @@
 
 
 
 
 
1
+ from . import bench, numeric, utils
2
+ from .bench import *
3
+ from .numeric import *
4
+ from .utils import *
build/torch210-cxx11-cu130-x86_64-linux/testing/bench.py ADDED
@@ -0,0 +1,137 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import os
2
+ import sys
3
+ import torch
4
+
5
+
6
def bench(fn, num_warmups: int = 5, num_tests: int = 10,
          high_precision: bool = False):
    """Time ``fn`` on the GPU with CUDA events.

    Returns the mean wall time per call in seconds.  The L2 cache is flushed
    once before warmup so the timed calls do not benefit from earlier runs.
    With ``high_precision`` a large matmul is launched right before timing
    starts so the event recording is not skewed by CPU launch overhead.
    """
    # Flush L2 cache with 256 MB data
    torch.cuda.synchronize()
    cache = torch.empty(int(256e6 // 4), dtype=torch.int, device='cuda')
    cache.zero_()

    # Warmup
    for _ in range(num_warmups):
        fn()

    # Add a large kernel to eliminate the CPU launch overhead
    if high_precision:
        x = torch.randn((8192, 8192), dtype=torch.float, device='cuda')
        y = torch.randn((8192, 8192), dtype=torch.float, device='cuda')
        x @ y

    # Testing
    start_event = torch.cuda.Event(enable_timing=True)
    end_event = torch.cuda.Event(enable_timing=True)
    start_event.record()
    for i in range(num_tests):
        fn()
    end_event.record()
    torch.cuda.synchronize()

    # elapsed_time() reports milliseconds; convert to seconds per call.
    return start_event.elapsed_time(end_event) / num_tests / 1e3
33
+
34
+
35
class empty_suppress:
    """Do-nothing context manager, used when output suppression is disabled."""

    def __enter__(self):
        return self

    def __exit__(self, exc_type, exc_value, traceback):
        # Returning None (falsy) lets any exception propagate.
        return None
41
+
42
+
43
class suppress_stdout_stderr:
    """Silence both Python-level and OS-level stdout/stderr.

    The file descriptors themselves are redirected to /dev/null, which also
    captures output written by C/C++ code (e.g. kineto) that bypasses
    ``sys.stdout``/``sys.stderr``.
    """

    def __enter__(self):
        self.outnull_file = open(os.devnull, 'w')
        self.errnull_file = open(os.devnull, 'w')

        # The live fd numbers (normally 1 and 2).
        self.old_stdout_fileno_undup = sys.stdout.fileno()
        self.old_stderr_fileno_undup = sys.stderr.fileno()

        # Duplicates that keep the real streams reachable for restoration.
        self.old_stdout_fileno = os.dup(sys.stdout.fileno())
        self.old_stderr_fileno = os.dup(sys.stderr.fileno())

        self.old_stdout = sys.stdout
        self.old_stderr = sys.stderr

        # Point the live fds at /dev/null (captures C-level writes) ...
        os.dup2(self.outnull_file.fileno(), self.old_stdout_fileno_undup)
        os.dup2(self.errnull_file.fileno(), self.old_stderr_fileno_undup)

        # ... and redirect the Python-level stream objects too.
        sys.stdout = self.outnull_file
        sys.stderr = self.errnull_file
        return self

    def __exit__(self, *_):
        sys.stdout = self.old_stdout
        sys.stderr = self.old_stderr

        # Restore the original fds, then release the duplicates.
        os.dup2(self.old_stdout_fileno, self.old_stdout_fileno_undup)
        os.dup2(self.old_stderr_fileno, self.old_stderr_fileno_undup)

        os.close(self.old_stdout_fileno)
        os.close(self.old_stderr_fileno)

        self.outnull_file.close()
        self.errnull_file.close()
76
+
77
+
78
def bench_kineto(fn, kernel_names, num_tests: int = 30,
                 suppress_kineto_output: bool = False,
                 trace_path: str = None, flush_l2: bool = True,
                 with_multiple_kernels: bool = False):
    """Measure per-kernel GPU time for ``fn`` via the kineto profiler.

    ``kernel_names`` is a substring (or tuple of substrings) matched against
    rows of the profiler summary table.  Returns the average time per kernel
    launch in seconds — a single value, or a tuple when ``kernel_names`` is
    a tuple.  Under external NVIDIA tools (``DG_USE_NVIDIA_TOOLS``) profiling
    is skipped and dummy value(s) of 1 are returned.
    """
    assert isinstance(kernel_names, str) or isinstance(kernel_names, tuple)
    is_tuple = isinstance(kernel_names, tuple)

    # Skip profiling
    # Conflict with Nsight Systems, Nsight Compute and Compute Sanitizer
    if int(os.environ.get('DG_USE_NVIDIA_TOOLS', 0)):
        return (1, ) * len(kernel_names) if is_tuple else 1

    # By default, flush L2 with an excessive 8 GB memset to give the GPU some (literal) chill time without full idle
    flush_l2_size = int(8e9 // 4)

    # For some auto-tuning kernels with prints
    fn()

    # Profile
    suppress = suppress_stdout_stderr if suppress_kineto_output else empty_suppress
    with suppress():
        schedule = torch.profiler.schedule(wait=1, warmup=0, active=1, repeat=1)
        profiler = torch.profiler.profile(activities=[torch.profiler.ProfilerActivity.CUDA], schedule=schedule)
        with profiler:
            # Two scheduler steps: step 0 is the `wait` phase, step 1 is the
            # one actually recorded (`active=1`).
            for i in range(2):
                for _ in range(num_tests):
                    if flush_l2:
                        torch.empty(flush_l2_size, dtype=torch.int, device='cuda').zero_()
                    fn()
                profiler.step()

    # Parse the profiling table
    prof_lines = profiler.key_averages().table(sort_by='cuda_time_total', max_name_column_width=100).split('\n')
    kernel_names = (kernel_names, ) if isinstance(kernel_names, str) else kernel_names
    if not with_multiple_kernels:
        # Each name must match at most one table row, otherwise the
        # per-kernel attribution below would be ambiguous.
        for name in kernel_names:
            assert sum([name in line for line in prof_lines]) <= 1, f'Errors of the kernel {name} in the profiling table'

    # Save chrome traces
    if trace_path is not None:
        profiler.export_chrome_trace(trace_path)

    # Return average kernel times
    units = {'ms': 1e3, 'us': 1e6}
    kernel_times = []
    for name in kernel_names:
        total_time = 0
        total_num = 0
        for line in prof_lines:
            if name in line:
                # Last two whitespace-separated columns of a table row are
                # the average time (with unit suffix) and the call count.
                time_str = line.split()[-2]
                num_str = line.split()[-1]
                for unit, scale in units.items():
                    if unit in time_str:
                        total_time += float(time_str.replace(unit, '')) / scale * int(num_str)
                        total_num += int(num_str)
                        break
        kernel_times.append(total_time / total_num if total_num > 0 else 0)

    return tuple(kernel_times) if is_tuple else kernel_times[0]
build/torch210-cxx11-cu130-x86_64-linux/testing/numeric.py ADDED
@@ -0,0 +1,21 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import torch
2
+ from typing import Iterable
3
+
4
+
5
def calc_diff(x: torch.Tensor, y: torch.Tensor) -> float:
    """Return a cosine-similarity-based difference between two tensors.

    0.0 means identical direction and magnitude; 2.0 means ``y == -x``.
    Computed in float64 for numerical stability.
    """
    x, y = x.double(), y.double()
    denominator = (x * x + y * y).sum()
    if denominator == 0:  # Which means that all elements in x and y are 0
        return 0.0
    sim = 2 * (x * y).sum() / denominator
    # `.item()` so both branches consistently return a Python float (the
    # original returned a 0-dim tensor here but 0.0 above).
    return (1 - sim).item()
12
+
13
+
14
def count_bytes(*tensors):
    """Total storage size in bytes of all given tensors.

    Tuples/lists are recursed into; ``None`` entries contribute nothing.
    """
    def _nbytes(item):
        # One entry: container -> recurse, None -> 0, tensor -> its bytes.
        if isinstance(item, (tuple, list)):
            return count_bytes(*item)
        if item is None:
            return 0
        return item.numel() * item.element_size()

    return sum(_nbytes(item) for item in tensors)
build/torch210-cxx11-cu130-x86_64-linux/testing/utils.py ADDED
@@ -0,0 +1,38 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import functools
2
+ import os
3
+ import torch
4
+ from typing import Callable
5
+
6
def get_arch_major() -> int:
    """Major CUDA compute capability of the current device."""
    return torch.cuda.get_device_capability()[0]
9
+
10
+
11
+ def test_filter(condition: Callable):
12
+ def decorator(func):
13
+ @functools.wraps(func)
14
+ def wrapper(*args, **kwargs):
15
+ if condition():
16
+ func(*args, **kwargs)
17
+ else:
18
+ print(f'{func.__name__}:')
19
+ print(f' > Filtered by {condition}')
20
+ print()
21
+ return wrapper
22
+ return decorator
23
+
24
+
25
def ignore_env(name: str, condition: Callable):
    """Decorator factory: when ``condition()`` is truthy, temporarily remove
    the environment variable ``name`` while the wrapped function runs.

    The variable is restored via ``finally`` so it survives exceptions from
    the wrapped function (the original implementation leaked the removal
    when the function raised).
    """
    def decorator(func):
        @functools.wraps(func)
        def wrapper(*args, **kwargs):
            if condition():
                saved = os.environ.pop(name, None)
                try:
                    func(*args, **kwargs)
                finally:
                    if saved is not None:
                        os.environ[name] = saved
            else:
                func(*args, **kwargs)

        return wrapper
    return decorator
build/torch210-cxx11-cu130-x86_64-linux/utils/__init__.py ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ from . import math, layout
2
+ from .layout import *
3
+ from .math import *
build/torch210-cxx11-cu130-x86_64-linux/utils/layout.py ADDED
@@ -0,0 +1,25 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ try:
2
+ from .._ops import ops
3
+
4
+ def get_tma_aligned_size(x, element_size):
5
+ return ops.get_tma_aligned_size(x, element_size)
6
+
7
+ def get_mn_major_tma_aligned_tensor(sf):
8
+ return ops.get_mn_major_tma_aligned_tensor(sf)
9
+
10
+ def get_mn_major_tma_aligned_packed_ue8m0_tensor(sf):
11
+ return ops.get_mn_major_tma_aligned_packed_ue8m0_tensor(sf)
12
+
13
+ def get_k_grouped_mn_major_tma_aligned_packed_ue8m0_tensor(sf, ks_tensor, ks):
14
+ return ops.get_k_grouped_mn_major_tma_aligned_packed_ue8m0_tensor(
15
+ sf, ks_tensor, ks)
16
+ except ImportError:
17
+ pass
18
+
19
+ from .._ops import ops as _ops
20
+
21
+ def get_mk_alignment_for_contiguous_layout():
22
+ return _ops.get_mk_alignment_for_contiguous_layout()
23
+
24
+ get_m_alignment_for_contiguous_layout = get_mk_alignment_for_contiguous_layout
25
+ get_k_alignment_for_contiguous_layout = get_mk_alignment_for_contiguous_layout
build/torch210-cxx11-cu130-x86_64-linux/utils/math.py ADDED
@@ -0,0 +1,107 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import torch
2
+ from typing import Tuple
3
+
4
+
5
def ceil_div(x: int, y: int) -> int:
    """Ceiling of ``x / y``; intended for positive ``y`` (relies on floor division)."""
    return (x + y - 1) // y
7
+
8
+
9
def align(x: int, y: int) -> int:
    """Round ``x`` up to the nearest multiple of ``y``."""
    return ceil_div(x, y) * y
11
+
12
+
13
def ceil_to_ue8m0(x: torch.Tensor) -> torch.Tensor:
    """Round each scale up to the nearest power of two (UE8M0 format).

    NOTE(review): the assert only checks the *global* max is positive and
    forces a host sync via `.item()`; zero entries would still yield
    `log2(0) = -inf` — confirm callers always pass strictly positive scales
    (the cast helpers clamp to 1e-4 before calling this).
    """
    assert x.view(-1).amax().item() > 0
    return torch.pow(2.0, torch.ceil(torch.log2(x.abs())))
16
+
17
+
18
def per_token_cast_to_fp8(x: torch.Tensor, use_ue8m0: bool, gran_k: int = 128) -> Tuple[torch.Tensor, torch.Tensor]:
    """Quantize ``x`` row-wise to FP8 (e4m3) with one scale per
    ``gran_k``-wide group of each row.

    Returns ``(x_fp8, sf)``: ``x_fp8`` has the same shape as ``x`` and
    ``sf`` has shape ``(m, ceil(n / gran_k))``.  With ``use_ue8m0`` the
    scales are rounded up to powers of two.
    """
    assert x.dim() == 2
    m, n = x.shape
    padded_n = align(n, gran_k)
    # `zeros` instead of `empty().fill_(0)`: same result, clearer intent.
    x_padded = torch.zeros((m, padded_n), dtype=x.dtype, device=x.device)
    x_padded[:, :n] = x
    x_view = x_padded.view(m, -1, gran_k)
    # Per-group max magnitude; `clamp_min` (consistent with the fp4 helper)
    # keeps the scale away from zero.  `amax(dim=2)` already yields (m, g),
    # so the former redundant `.view(m, -1)` is dropped.
    x_amax = x_view.abs().float().amax(dim=2).clamp_min(1e-4)
    sf = x_amax / 448.0  # 448 is the largest finite e4m3 value
    sf = ceil_to_ue8m0(sf) if use_ue8m0 else sf
    x_fp8 = (x_view * (1.0 / sf.unsqueeze(2))).to(torch.float8_e4m3fn)
    return x_fp8.view(m, padded_n)[:, :n].contiguous(), sf
29
+
30
+
31
+ def per_channel_cast_to_fp8(x: torch.Tensor, use_ue8m0: bool, gran_k: int = 128) -> Tuple[torch.Tensor, torch.Tensor]:
32
+ assert x.dim() == 2 and x.size(0) % gran_k == 0
33
+ m, n = x.shape
34
+ x_view = x.view(-1, gran_k, n)
35
+ x_amax = x_view.abs().float().amax(dim=1).view(-1, n).clamp(1e-4)
36
+ sf = x_amax / 448.0
37
+ sf = ceil_to_ue8m0(sf) if use_ue8m0 else sf
38
+ return (x_view * (1.0 / sf.unsqueeze(1))).to(torch.float8_e4m3fn).view(m, n), sf
39
+
40
+
41
def per_block_cast_to_fp8(x: torch.Tensor, use_ue8m0: bool, gran_k: int = 128) -> Tuple[torch.Tensor, torch.Tensor]:
    """Quantize ``x`` to FP8 (e4m3) with one scale per ``gran_k`` x ``gran_k`` block.

    Returns ``(x_fp8, sf)``: ``x_fp8`` has the shape of ``x`` and ``sf`` has
    shape ``(ceil(m / gran_k), ceil(n / gran_k))``.  With ``use_ue8m0`` the
    scales are rounded up to powers of two.
    """
    assert x.dim() == 2
    m, n = x.shape
    # Pad both dimensions up to multiples of the block size.
    x_padded = torch.zeros((align(m, gran_k), align(n, gran_k)), dtype=x.dtype, device=x.device)
    x_padded[:m, :n] = x
    # (row blocks, gran_k, col blocks, gran_k)
    x_view = x_padded.view(-1, gran_k, x_padded.size(1) // gran_k, gran_k)
    # Per-block max magnitude; the clamp keeps scales away from zero.
    x_amax = x_view.abs().float().amax(dim=(1, 3), keepdim=True).clamp(1e-4)
    sf = x_amax / 448.0  # 448 is the largest finite e4m3 value
    sf = ceil_to_ue8m0(sf) if use_ue8m0 else sf
    x_scaled = (x_view * (1.0 / sf)).to(torch.float8_e4m3fn)
    return x_scaled.view_as(x_padded)[:m, :n].contiguous(), sf.view(x_view.size(0), x_view.size(2))
52
+
53
+
54
+ def per_custom_dims_cast_to_fp8(x: torch.Tensor, dims: Tuple, use_ue8m0: bool) -> Tuple[torch.Tensor, torch.Tensor]:
55
+ excluded_dims = tuple([i for i in range(x.dim()) if i not in set(dims)])
56
+ x_amax = x.abs().float().amax(dim=excluded_dims, keepdim=True).clamp(1e-4)
57
+ sf = x_amax / 448.0
58
+ sf = ceil_to_ue8m0(sf) if use_ue8m0 else sf
59
+ x_scaled = (x * (1.0 / sf)).to(torch.float8_e4m3fn)
60
+ return x_scaled, sf.squeeze()
61
+
62
+
63
def _quantize_to_fp4_e2m1(x: torch.Tensor) -> torch.Tensor:
    """Round each element to the nearest FP4 (e2m1) value and return its
    4-bit code (0..15) as a ``uint8`` tensor.

    Representable magnitudes are {0, 0.5, 1, 1.5, 2, 3, 4, 6}; bit 3 of the
    code is the sign.  Ties at the midpoints round toward the smaller
    magnitude.
    """
    ax = x.abs().clamp_max(6.0)
    # {0, 0.5, 1, 1.5, 2, 3, 4, 6}
    # midpoints: 0.25, 0.75, 1.25, 1.75, 2.5, 3.5, 5.0
    boundaries = torch.tensor([0.25, 0.75, 1.25, 1.75, 2.5, 3.5, 5.0],
                              device=x.device, dtype=ax.dtype)
    idx = torch.bucketize(ax, boundaries)
    code = idx.to(torch.uint8)
    # Sign bit only for non-zero codes: negative zero is never emitted.
    sign = (x < 0) & (idx != 0)
    code = code | (sign.to(torch.uint8) << 3)
    return code  # uint8, 0..15
74
+
75
+
76
def per_token_cast_to_fp4(x: torch.Tensor, use_ue8m0: bool, gran_k: int = 128) -> Tuple[torch.Tensor, torch.Tensor]:
    """Quantize ``x`` row-wise to packed FP4 (e2m1).

    One float scale is computed per ``gran_k``-wide group of each row.
    Returns ``(packed, sf)``: ``packed`` is ``uint8`` of shape ``(m, n // 2)``
    with two 4-bit codes per byte (low nibble = even column); ``sf`` has
    shape ``(m, padded_n // gran_k)``.
    """
    assert x.dim() == 2
    m, n = x.shape
    assert n % 2 == 0
    # Pad columns up to a multiple of the scaling granularity.
    padded_n = align(n, gran_k)
    padded = torch.zeros((m, padded_n), dtype=x.dtype, device=x.device)
    padded[:, :n] = x
    grouped = padded.view(m, -1, gran_k)
    # Per-group max magnitude, kept away from zero.
    amax = grouped.abs().float().amax(dim=2).clamp_min(1e-4)
    sf = amax / 6.0  # 6 is the largest e2m1 magnitude
    if use_ue8m0:
        sf = ceil_to_ue8m0(sf)
    scaled = grouped * (1.0 / sf.unsqueeze(2))
    codes = _quantize_to_fp4_e2m1(scaled).view(m, padded_n)
    # Pack adjacent column pairs into single bytes, low nibble first.
    pairs = codes.view(m, padded_n // 2, 2)
    packed = (pairs[:, :, 0] & 0x0F) | ((pairs[:, :, 1] & 0x0F) << 4)
    return packed[:, :n // 2].contiguous(), sf
92
+
93
+
94
def transpose_packed_fp4(a: torch.Tensor) -> torch.Tensor:
    """Transpose a matrix of packed FP4 codes.

    ``a`` is ``(m, n // 2)`` uint8 with two 4-bit codes per byte (low nibble
    = even column).  Returns the packed transpose, shape ``(n, m // 2)``.
    """
    assert a.dtype == torch.uint8
    assert a.dim() == 2
    m, n2 = a.shape
    n = n2 * 2
    assert (m % 2) == 0
    # Unpack one nibble per byte: low nibbles to even columns.
    lo = a & 0x0F
    hi = (a >> 4) & 0x0F
    codes = torch.empty((m, n), device=a.device, dtype=torch.uint8)
    codes[:, 0::2], codes[:, 1::2] = lo, hi
    # Transpose, then re-pack adjacent pairs along the new row dimension.
    codes_t = codes.transpose(0, 1).contiguous()
    codes2 = codes_t.view(n, m // 2, 2)
    out = (codes2[:, :, 0] & 0x0F) | ((codes2[:, :, 1] & 0x0F) << 4)
    return out.contiguous()
build/torch29-cxx11-cu126-x86_64-linux/__init__.py ADDED
@@ -0,0 +1,289 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import os
2
+ import subprocess
3
+ import torch
4
+
5
+ from ._ops import ops
6
+
7
+
8
+ def _find_cuda_home():
9
+ cuda_home = os.environ.get('CUDA_HOME') or os.environ.get('CUDA_PATH')
10
+ if cuda_home is None:
11
+ try:
12
+ with open(os.devnull, 'w') as devnull:
13
+ nvcc = subprocess.check_output(
14
+ ['which', 'nvcc'], stderr=devnull
15
+ ).decode().rstrip('\r\n')
16
+ cuda_home = os.path.dirname(os.path.dirname(nvcc))
17
+ except Exception:
18
+ cuda_home = '/usr/local/cuda'
19
+ if not os.path.exists(cuda_home):
20
+ cuda_home = ''
21
+ return cuda_home or ''
22
+
23
+
24
def _find_cutlass_include():
    """Find CUTLASS include path for JIT compilation of .cuh templates.

    A candidate directory is accepted when it contains a ``cute/``
    subdirectory.  Search order: ``DG_CUTLASS_INCLUDE`` env var,
    ``CUTLASS_HOME`` env var, the bundled package ``include/``,
    ``CUDA_HOME/include``, then the ``nvidia-cutlass`` Python package.
    Returns ``""`` when no candidate is found.
    """
    # 1. Explicit env var
    cutlass_include = os.environ.get('DG_CUTLASS_INCLUDE')
    if cutlass_include and os.path.isdir(cutlass_include):
        return cutlass_include

    # 2. CUTLASS_HOME env var
    cutlass_home = os.environ.get('CUTLASS_HOME')
    if cutlass_home:
        p = os.path.join(cutlass_home, 'include')
        if os.path.isdir(os.path.join(p, 'cute')):
            return p

    # 3. Check in package include/ directory (bundled cute/cutlass headers)
    pkg_include = os.path.join(os.path.dirname(os.path.abspath(__file__)), 'include')
    if os.path.isdir(os.path.join(pkg_include, 'cute')):
        return pkg_include

    # 4. Check CUDA_HOME/include (some CUDA 12.8+ installs include cute/)
    cuda_home = _find_cuda_home()
    if cuda_home:
        cuda_inc = os.path.join(cuda_home, 'include')
        if os.path.isdir(os.path.join(cuda_inc, 'cute')):
            return cuda_inc

    # 5. Try to find nvidia-cutlass Python package
    try:
        import cutlass as _cutlass
        cutlass_dir = os.path.dirname(_cutlass.__file__)
        p = os.path.join(cutlass_dir, 'include')
        if os.path.isdir(os.path.join(p, 'cute')):
            return p
    except ImportError:
        pass

    # Return empty string; C++ side will also check env vars
    return ""
62
+
63
+
64
def set_num_sms(new_num_sms):
    """Set the SM-count knob; forwarded to the compiled extension."""
    ops.set_num_sms(new_num_sms)

def get_num_sms():
    """Return the SM-count knob from the compiled extension."""
    return ops.get_num_sms()

def set_tc_util(new_tc_util):
    """Set the tensor-core utilization knob; forwarded to the compiled extension."""
    ops.set_tc_util(new_tc_util)

def get_tc_util():
    """Return the tensor-core utilization knob from the compiled extension."""
    return ops.get_tc_util()


# cuBLASLt GEMMs
# Thin forwarders to the compiled cuBLASLt paths.  The nt/nn/tn/tt suffix
# presumably encodes the operand transpose layout — confirm against the C++
# op signatures.  `c` is an optional extra tensor argument (default None).
def cublaslt_gemm_nt(a, b, d, c=None):
    ops.cublaslt_gemm_nt(a, b, d, c)

def cublaslt_gemm_nn(a, b, d, c=None):
    ops.cublaslt_gemm_nn(a, b, d, c)

def cublaslt_gemm_tn(a, b, d, c=None):
    ops.cublaslt_gemm_tn(a, b, d, c)

def cublaslt_gemm_tt(a, b, d, c=None):
    ops.cublaslt_gemm_tt(a, b, d, c)
89
+
90
+
91
+ try:
92
+ # FP8/FP4 GEMMs
93
+ def fp8_fp4_gemm_nt(a, b, d, c=None, recipe=None, recipe_a=None,
94
+ recipe_b=None, compiled_dims="nk", disable_ue8m0_cast=False):
95
+ ops.fp8_fp4_gemm_nt(a[0], a[1], b[0], b[1], d, c,
96
+ list(recipe) if recipe else None,
97
+ list(recipe_a) if recipe_a else None,
98
+ list(recipe_b) if recipe_b else None,
99
+ compiled_dims, disable_ue8m0_cast)
100
+
101
+ def fp8_fp4_gemm_nn(a, b, d, c=None, recipe=None, recipe_a=None,
102
+ recipe_b=None, compiled_dims="nk", disable_ue8m0_cast=False):
103
+ ops.fp8_fp4_gemm_nn(a[0], a[1], b[0], b[1], d, c,
104
+ list(recipe) if recipe else None,
105
+ list(recipe_a) if recipe_a else None,
106
+ list(recipe_b) if recipe_b else None,
107
+ compiled_dims, disable_ue8m0_cast)
108
+
109
+ def fp8_fp4_gemm_tn(a, b, d, c=None, recipe=None, recipe_a=None,
110
+ recipe_b=None, compiled_dims="mn", disable_ue8m0_cast=False):
111
+ ops.fp8_fp4_gemm_tn(a[0], a[1], b[0], b[1], d, c,
112
+ list(recipe) if recipe else None,
113
+ list(recipe_a) if recipe_a else None,
114
+ list(recipe_b) if recipe_b else None,
115
+ compiled_dims, disable_ue8m0_cast)
116
+
117
+ def fp8_fp4_gemm_tt(a, b, d, c=None, recipe=None, recipe_a=None,
118
+ recipe_b=None, compiled_dims="mn", disable_ue8m0_cast=False):
119
+ ops.fp8_fp4_gemm_tt(a[0], a[1], b[0], b[1], d, c,
120
+ list(recipe) if recipe else None,
121
+ list(recipe_a) if recipe_a else None,
122
+ list(recipe_b) if recipe_b else None,
123
+ compiled_dims, disable_ue8m0_cast)
124
+
125
+ fp8_gemm_nt = fp8_fp4_gemm_nt
126
+ fp8_gemm_nn = fp8_fp4_gemm_nn
127
+ fp8_gemm_tn = fp8_fp4_gemm_tn
128
+ fp8_gemm_tt = fp8_fp4_gemm_tt
129
+
130
+ def m_grouped_fp8_fp4_gemm_nt_contiguous(a, b, d, grouped_layout,
131
+ recipe=None, recipe_a=None, recipe_b=None, compiled_dims="nk",
132
+ disable_ue8m0_cast=False, use_psum_layout=False,
133
+ expected_m_for_psum_layout=None):
134
+ ops.m_grouped_fp8_fp4_gemm_nt_contiguous(
135
+ a[0], a[1], b[0], b[1], d, grouped_layout,
136
+ list(recipe) if recipe else None,
137
+ list(recipe_a) if recipe_a else None,
138
+ list(recipe_b) if recipe_b else None,
139
+ compiled_dims, disable_ue8m0_cast, use_psum_layout,
140
+ expected_m_for_psum_layout)
141
+
142
+ m_grouped_fp8_gemm_nt_contiguous = m_grouped_fp8_fp4_gemm_nt_contiguous
143
+
144
def m_grouped_fp8_fp4_gemm_nn_contiguous(a, b, d, grouped_layout,
        recipe=None, recipe_a=None, recipe_b=None, compiled_dims="nk",
        disable_ue8m0_cast=False, use_psum_layout=False):
    """M-grouped FP8/FP4 NN GEMM over contiguously packed groups.

    Same calling convention as the NT variant, minus
    ``expected_m_for_psum_layout``.  ``a``/``b`` are (values, scales)
    pairs; falsy recipes are normalized to None.
    """
    ops.m_grouped_fp8_fp4_gemm_nn_contiguous(
        a[0], a[1], b[0], b[1], d, grouped_layout,
        list(recipe) if recipe else None,
        list(recipe_a) if recipe_a else None,
        list(recipe_b) if recipe_b else None,
        compiled_dims, disable_ue8m0_cast, use_psum_layout)

# FP8-only alias kept for backward compatibility.
m_grouped_fp8_gemm_nn_contiguous = m_grouped_fp8_fp4_gemm_nn_contiguous
155
+
156
def m_grouped_fp8_fp4_gemm_nt_masked(a, b, d, masked_m, expected_m,
        recipe=None, recipe_a=None, recipe_b=None, compiled_dims="nk",
        disable_ue8m0_cast=False):
    """M-grouped FP8/FP4 NT GEMM with per-group masking.

    ``masked_m`` holds per-group valid row counts and ``expected_m`` a
    scheduling hint — presumably; exact semantics are defined by the
    compiled op.  ``a``/``b`` are (values, scales) pairs; falsy recipes
    are normalized to None.
    """
    ops.m_grouped_fp8_fp4_gemm_nt_masked(
        a[0], a[1], b[0], b[1], d, masked_m, expected_m,
        list(recipe) if recipe else None,
        list(recipe_a) if recipe_a else None,
        list(recipe_b) if recipe_b else None,
        compiled_dims, disable_ue8m0_cast)

# FP8-only alias kept for backward compatibility.
m_grouped_fp8_gemm_nt_masked = m_grouped_fp8_fp4_gemm_nt_masked
167
+
168
def k_grouped_fp8_gemm_nt_contiguous(a, b, d, ks, ks_tensor, c=None,
        recipe=(1, 1, 128), compiled_dims="mn"):
    """K-grouped FP8 NT GEMM over contiguously packed K segments.

    ``ks`` and ``ks_tensor`` carry the per-group K sizes (host list and
    device tensor respectively — presumed; confirm against the op).
    Unlike the M-grouped wrappers, ``recipe`` is always converted with
    ``list()`` and has a non-None default, so passing None would raise.
    """
    ops.k_grouped_fp8_gemm_nt_contiguous(
        a[0], a[1], b[0], b[1], d, ks, ks_tensor, c,
        list(recipe), compiled_dims)

def k_grouped_fp8_gemm_tn_contiguous(a, b, d, ks, ks_tensor, c=None,
        recipe=(1, 1, 128), compiled_dims="mn"):
    """K-grouped FP8 TN GEMM; same calling convention as the NT variant."""
    ops.k_grouped_fp8_gemm_tn_contiguous(
        a[0], a[1], b[0], b[1], d, ks, ks_tensor, c,
        list(recipe), compiled_dims)
179
+
180
# BF16 GEMMs
# Plain pass-throughs: BF16 inputs carry no scale factors, so tensors are
# forwarded as-is (no (values, scales) pair unpacking).  Note NT/NN default
# compiled_dims="nk" while TN/TT default "mn", mirroring the FP8 wrappers.
def bf16_gemm_nt(a, b, d, c=None, compiled_dims="nk"):
    """NT-layout BF16 GEMM; writes the result into ``d``."""
    ops.bf16_gemm_nt(a, b, d, c, compiled_dims)

def bf16_gemm_nn(a, b, d, c=None, compiled_dims="nk"):
    """NN-layout BF16 GEMM; writes the result into ``d``."""
    ops.bf16_gemm_nn(a, b, d, c, compiled_dims)

def bf16_gemm_tn(a, b, d, c=None, compiled_dims="mn"):
    """TN-layout BF16 GEMM; writes the result into ``d``."""
    ops.bf16_gemm_tn(a, b, d, c, compiled_dims)

def bf16_gemm_tt(a, b, d, c=None, compiled_dims="mn"):
    """TT-layout BF16 GEMM; writes the result into ``d``."""
    ops.bf16_gemm_tt(a, b, d, c, compiled_dims)
192
+
193
def m_grouped_bf16_gemm_nt_contiguous(a, b, d, grouped_layout,
        compiled_dims="nk", use_psum_layout=False,
        expected_m_for_psum_layout=None):
    """M-grouped BF16 NT GEMM over contiguously packed groups.

    BF16 counterpart of the FP8/FP4 grouped wrapper — no scale-factor
    pairs, tensors are forwarded directly.
    """
    ops.m_grouped_bf16_gemm_nt_contiguous(
        a, b, d, grouped_layout, compiled_dims,
        use_psum_layout, expected_m_for_psum_layout)

def m_grouped_bf16_gemm_nn_contiguous(a, b, d, grouped_layout,
        compiled_dims="nk", use_psum_layout=False):
    """M-grouped BF16 NN GEMM over contiguously packed groups."""
    ops.m_grouped_bf16_gemm_nn_contiguous(
        a, b, d, grouped_layout, compiled_dims, use_psum_layout)

def m_grouped_bf16_gemm_nt_masked(a, b, d, masked_m, expected_m,
        compiled_dims="nk"):
    """M-grouped BF16 NT GEMM with per-group masking (see FP8 variant)."""
    ops.m_grouped_bf16_gemm_nt_masked(
        a, b, d, masked_m, expected_m, compiled_dims)

def k_grouped_bf16_gemm_tn_contiguous(a, b, d, ks, ks_tensor,
        c=None, compiled_dims="mn"):
    """K-grouped BF16 TN GEMM over contiguously packed K segments."""
    ops.k_grouped_bf16_gemm_tn_contiguous(
        a, b, d, ks, ks_tensor, c, compiled_dims)
214
+
215
# Einsum
def einsum(expr, a, b, d, c=None, use_cublaslt=False):
    """Two-operand einsum evaluated by the compiled kernel; result is
    written into ``d``.  ``use_cublaslt`` selects a cuBLASLt backend —
    presumably; confirm against the C++ implementation."""
    ops.einsum(expr, a, b, d, c, use_cublaslt)

def fp8_einsum(expr, a, b, d, c=None, recipe=(1, 128, 128)):
    """FP8 einsum; ``a``/``b`` are (values, scales) pairs and ``recipe``
    is always list-converted (None is not accepted here)."""
    ops.fp8_einsum(expr, a[0], a[1], b[0], b[1], d, c, list(recipe))
221
+
222
# Attention
def fp8_gemm_nt_skip_head_mid(a, b, d, head_splits, recipe=None,
        compiled_dims="nk", disable_ue8m0_cast=False):
    """FP8 NT GEMM that skips the middle section of attention heads;
    ``head_splits`` is list-converted before the call.  No return value —
    the result goes into ``d``."""
    ops.fp8_gemm_nt_skip_head_mid(
        a[0], a[1], b[0], b[1], d, list(head_splits),
        list(recipe) if recipe else None,
        compiled_dims, disable_ue8m0_cast)

def fp8_mqa_logits(q, kv, weights, cu_seq_len_k_start,
        cu_seq_len_k_end, clean_logits=True, max_seqlen_k=0):
    """Compute MQA attention logits from FP8 inputs; ``kv`` is a
    (values, scales) pair.  Returns the logits tensor produced by the
    compiled op."""
    return ops.fp8_mqa_logits(
        q, kv[0], kv[1], weights,
        cu_seq_len_k_start, cu_seq_len_k_end,
        clean_logits, max_seqlen_k)

def get_paged_mqa_logits_metadata(context_lens, block_kv, num_sms):
    """Build the scheduling metadata consumed by fp8_paged_mqa_logits."""
    return ops.get_paged_mqa_logits_metadata(
        context_lens, block_kv, num_sms)

def fp8_paged_mqa_logits(q, fused_kv_cache, weights, context_lens,
        block_table, schedule_meta,
        max_context_len, clean_logits=False):
    """Paged-KV-cache variant of fp8_mqa_logits; ``schedule_meta`` comes
    from get_paged_mqa_logits_metadata.  Note ``clean_logits`` defaults
    to False here, unlike fp8_mqa_logits."""
    return ops.fp8_paged_mqa_logits(
        q, fused_kv_cache, weights, context_lens,
        block_table, schedule_meta, max_context_len, clean_logits)
247
+
248
# Hyperconnection
def tf32_hc_prenorm_gemm(a, b, d, sqr_sum, num_splits=None):
    """TF32 hyperconnection pre-norm GEMM; also accumulates the squared
    sum into ``sqr_sum`` — presumed from the name; exact semantics are
    defined by the compiled op."""
    ops.tf32_hc_prenorm_gemm(a, b, d, sqr_sum, num_splits)
251
+
252
# Layout
def transform_sf_into_required_layout(sf, mn, k, recipe=None,
        recipe_ab=None, num_groups=None, is_sfa=False,
        disable_ue8m0_cast=False):
    """Rearrange a scale-factor tensor ``sf`` into the layout the GEMM
    kernels require for an (mn, k) problem.  ``is_sfa`` distinguishes
    A-side from B-side scale factors — presumed from the name.  Falsy
    recipes are normalized to None."""
    return ops.transform_sf_into_required_layout(
        sf, mn, k,
        list(recipe) if recipe else None,
        list(recipe_ab) if recipe_ab else None,
        num_groups, is_sfa, disable_ue8m0_cast)

def get_mk_alignment_for_contiguous_layout():
    """Alignment requirement for the grouped-contiguous layouts, as
    reported by the compiled extension."""
    return ops.get_mk_alignment_for_contiguous_layout()
264
+
265
# Legacy aliases
# Older call sites used the dtype-first naming scheme; keep both spellings.
fp8_m_grouped_gemm_nt_masked = m_grouped_fp8_fp4_gemm_nt_masked
bf16_m_grouped_gemm_nt_masked = m_grouped_bf16_gemm_nt_masked
268
+
269
+ except Exception:
270
+ pass
271
+
272
+ # Utils
273
+ from . import utils
274
+ from .utils import *
275
+
276
+ # Testing
277
+ from . import testing
278
+
279
# Initialize (gracefully skip if CUDA is not available, e.g. in build sandboxes)
try:
    ops.init(
        os.path.dirname(os.path.abspath(__file__)),
        _find_cuda_home(),
        _find_cutlass_include()
    )
# NOTE(review): this broad `except Exception` is deliberate best-effort
# init, but it also hides genuine configuration errors (bad CUDA_HOME,
# missing CUTLASS headers) — consider logging the exception.
except Exception:
    pass

__version__ = '2.3.0'
build/torch29-cxx11-cu126-x86_64-linux/_deep_gemm_099ac3c_dirty.abi3.so ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:ac9ad7e5f8bcd1642692d50e321db2ee6a668bdc448fa481490e307e2dfb0ffe
3
+ size 2967864
build/torch29-cxx11-cu126-x86_64-linux/_ops.py ADDED
@@ -0,0 +1,9 @@
 
 
 
 
 
 
 
 
 
 
1
+ import torch
2
+ from . import _deep_gemm_099ac3c_dirty
3
+ ops = torch.ops._deep_gemm_099ac3c_dirty
4
+
5
def add_op_namespace_prefix(op_name: str) -> str:
    """Qualify ``op_name`` with this extension's torch op namespace."""
    return "_deep_gemm_099ac3c_dirty::" + op_name
build/torch29-cxx11-cu126-x86_64-linux/deep_gemm/__init__.py ADDED
@@ -0,0 +1,26 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import ctypes
2
+ import sys
3
+
4
+ import importlib
5
+ from pathlib import Path
6
+ from types import ModuleType
7
+
8
+ def _import_from_path(file_path: Path) -> ModuleType:
9
+ # We cannot use the module name as-is, after adding it to `sys.modules`,
10
+ # it would also be used for other imports. So, we make a module name that
11
+ # depends on the path for it to be unique using the hex-encoded hash of
12
+ # the path.
13
+ path_hash = "{:x}".format(ctypes.c_size_t(hash(file_path.absolute())).value)
14
+ module_name = path_hash
15
+ spec = importlib.util.spec_from_file_location(module_name, file_path)
16
+ if spec is None:
17
+ raise ImportError(f"Cannot load spec for {module_name} from {file_path}")
18
+ module = importlib.util.module_from_spec(spec)
19
+ if module is None:
20
+ raise ImportError(f"Cannot load module {module_name} from spec")
21
+ sys.modules[module_name] = module
22
+ spec.loader.exec_module(module) # type: ignore
23
+ return module
24
+
25
+
26
+ globals().update(vars(_import_from_path(Path(__file__).parent.parent / "__init__.py")))
build/torch29-cxx11-cu126-x86_64-linux/metadata.json ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ {
2
+ "python-depends": []
3
+ }
build/torch29-cxx11-cu126-x86_64-linux/testing/__init__.py ADDED
@@ -0,0 +1,4 @@
 
 
 
 
 
1
+ from . import bench, numeric, utils
2
+ from .bench import *
3
+ from .numeric import *
4
+ from .utils import *
build/torch29-cxx11-cu126-x86_64-linux/testing/bench.py ADDED
@@ -0,0 +1,137 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import os
2
+ import sys
3
+ import torch
4
+
5
+
6
def bench(fn, num_warmups: int = 5, num_tests: int = 10,
          high_precision: bool = False):
    """Time ``fn`` on the GPU with CUDA events.

    Returns the mean wall time per call in seconds.  ``high_precision``
    enqueues a large matmul first so the timed launches queue behind it,
    hiding CPU launch overhead from the measurement.
    """
    # Flush L2 cache with 256 MB data
    torch.cuda.synchronize()
    l2_flush = torch.empty(int(256e6 // 4), dtype=torch.int, device='cuda')
    l2_flush.zero_()

    # Untimed warmup runs.
    for _ in range(num_warmups):
        fn()

    # Add a large kernel to eliminate the CPU launch overhead
    if high_precision:
        lhs = torch.randn((8192, 8192), dtype=torch.float, device='cuda')
        rhs = torch.randn((8192, 8192), dtype=torch.float, device='cuda')
        lhs @ rhs

    # Timed runs, bracketed by CUDA events.
    start_event = torch.cuda.Event(enable_timing=True)
    end_event = torch.cuda.Event(enable_timing=True)
    start_event.record()
    for _ in range(num_tests):
        fn()
    end_event.record()
    torch.cuda.synchronize()

    # elapsed_time() reports milliseconds; convert to seconds per call.
    return start_event.elapsed_time(end_event) / num_tests / 1e3
33
+
34
+
35
class empty_suppress:
    """No-op context manager, used where output suppression is optional."""

    def __enter__(self):
        return self

    def __exit__(self, *_):
        pass
41
+
42
+
43
class suppress_stdout_stderr:
    """Silence both Python-level and C-level stdout/stderr.

    Redirects the underlying OS file descriptors (not just the Python
    stream objects) to os.devnull, so output printed from C/C++ code
    (e.g. the kineto profiler) is suppressed as well.  Restores
    everything on exit.
    """

    def __enter__(self):
        self.outnull_file = open(os.devnull, 'w')
        self.errnull_file = open(os.devnull, 'w')

        # The fd numbers currently backing stdout/stderr (usually 1 and 2).
        self.old_stdout_fileno_undup = sys.stdout.fileno()
        self.old_stderr_fileno_undup = sys.stderr.fileno()

        # Duplicate the original fds so they can be restored in __exit__.
        self.old_stdout_fileno = os.dup(sys.stdout.fileno())
        self.old_stderr_fileno = os.dup(sys.stderr.fileno())

        self.old_stdout = sys.stdout
        self.old_stderr = sys.stderr

        # Point the original fd numbers at /dev/null — this is what
        # silences writes coming from native code.
        os.dup2(self.outnull_file.fileno(), self.old_stdout_fileno_undup)
        os.dup2(self.errnull_file.fileno(), self.old_stderr_fileno_undup)

        # Also swap the Python-level stream objects.
        sys.stdout = self.outnull_file
        sys.stderr = self.errnull_file
        return self

    def __exit__(self, *_):
        sys.stdout = self.old_stdout
        sys.stderr = self.old_stderr

        # Restore the original fds, then release the duplicates.
        os.dup2(self.old_stdout_fileno, self.old_stdout_fileno_undup)
        os.dup2(self.old_stderr_fileno, self.old_stderr_fileno_undup)

        os.close(self.old_stdout_fileno)
        os.close(self.old_stderr_fileno)

        self.outnull_file.close()
        self.errnull_file.close()
76
+
77
+
78
def bench_kineto(fn, kernel_names, num_tests: int = 30,
                 suppress_kineto_output: bool = False,
                 trace_path: str = None, flush_l2: bool = True,
                 with_multiple_kernels: bool = False):
    """Measure per-kernel GPU time for ``fn`` via the kineto profiler.

    ``kernel_names`` is a substring (or tuple of substrings) matched
    against profiler table rows.  Returns the average time in seconds
    per matched kernel — a scalar for a string input, a tuple for a
    tuple input.  When DG_USE_NVIDIA_TOOLS is set, profiling is skipped
    and dummy 1s are returned (kineto conflicts with NVIDIA tooling).
    """
    assert isinstance(kernel_names, str) or isinstance(kernel_names, tuple)
    is_tuple = isinstance(kernel_names, tuple)

    # Skip profiling
    # Conflict with Nsight Systems, Nsight Compute and Compute Sanitizer
    if int(os.environ.get('DG_USE_NVIDIA_TOOLS', 0)):
        return (1, ) * len(kernel_names) if is_tuple else 1

    # By default, flush L2 with an excessive 8 GB memset to give the GPU some (literal) chill time without full idle
    flush_l2_size = int(8e9 // 4)

    # For some auto-tuning kernels with prints
    fn()

    # Profile
    suppress = suppress_stdout_stderr if suppress_kineto_output else empty_suppress
    with suppress():
        # wait=1/active=1: the first outer iteration is discarded, only
        # the second is recorded.
        schedule = torch.profiler.schedule(wait=1, warmup=0, active=1, repeat=1)
        profiler = torch.profiler.profile(activities=[torch.profiler.ProfilerActivity.CUDA], schedule=schedule)
        with profiler:
            for i in range(2):
                for _ in range(num_tests):
                    if flush_l2:
                        torch.empty(flush_l2_size, dtype=torch.int, device='cuda').zero_()
                    fn()
                profiler.step()

    # Parse the profiling table
    prof_lines = profiler.key_averages().table(sort_by='cuda_time_total', max_name_column_width=100).split('\n')
    kernel_names = (kernel_names, ) if isinstance(kernel_names, str) else kernel_names
    if not with_multiple_kernels:
        # Each name must match at most one row, otherwise the sum below
        # would silently blend different kernels.
        for name in kernel_names:
            assert sum([name in line for line in prof_lines]) <= 1, f'Errors of the kernel {name} in the profiling table'

    # Save chrome traces
    if trace_path is not None:
        profiler.export_chrome_trace(trace_path)

    # Return average kernel times
    units = {'ms': 1e3, 'us': 1e6}
    kernel_times = []
    for name in kernel_names:
        total_time = 0
        total_num = 0
        for line in prof_lines:
            if name in line:
                # The table's last two columns are avg time and call count.
                time_str = line.split()[-2]
                num_str = line.split()[-1]
                for unit, scale in units.items():
                    if unit in time_str:
                        total_time += float(time_str.replace(unit, '')) / scale * int(num_str)
                        total_num += int(num_str)
                        break
        kernel_times.append(total_time / total_num if total_num > 0 else 0)

    return tuple(kernel_times) if is_tuple else kernel_times[0]
build/torch29-cxx11-cu126-x86_64-linux/testing/numeric.py ADDED
@@ -0,0 +1,21 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import torch
2
+ from typing import Iterable
3
+
4
+
5
def calc_diff(x: torch.Tensor, y: torch.Tensor):
    """Relative difference based on cosine-style similarity.

    Returns 0 when the tensors are identical and approaches 1 as they
    decorrelate.  Computed in float64 for numerical stability; two
    all-zero tensors are treated as identical (0.0).
    """
    a, b = x.double(), y.double()
    denominator = (a * a + b * b).sum()
    # Zero denominator means every element of both inputs is zero.
    if denominator == 0:
        return 0.0
    similarity = 2 * (a * b).sum() / denominator
    return 1 - similarity
12
+
13
+
14
def count_bytes(*tensors):
    """Total storage bytes across the given tensors.

    Recurses into nested lists/tuples; ``None`` entries are skipped.
    """
    total = 0
    for item in tensors:
        if item is None:
            continue
        if isinstance(item, (tuple, list)):
            total += count_bytes(*item)
        else:
            total += item.numel() * item.element_size()
    return total
build/torch29-cxx11-cu126-x86_64-linux/testing/utils.py ADDED
@@ -0,0 +1,38 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import functools
2
+ import os
3
+ import torch
4
+ from typing import Callable
5
+
6
def get_arch_major() -> int:
    """Return the CUDA compute-capability major version of the current device."""
    return torch.cuda.get_device_capability()[0]
9
+
10
+
11
+ def test_filter(condition: Callable):
12
+ def decorator(func):
13
+ @functools.wraps(func)
14
+ def wrapper(*args, **kwargs):
15
+ if condition():
16
+ func(*args, **kwargs)
17
+ else:
18
+ print(f'{func.__name__}:')
19
+ print(f' > Filtered by {condition}')
20
+ print()
21
+ return wrapper
22
+ return decorator
23
+
24
+
25
def ignore_env(name: str, condition: Callable):
    """Decorator factory: when ``condition()`` is true, run the wrapped
    function with environment variable ``name`` temporarily removed.

    Fixes over the original:
      * the variable is restored via ``try/finally``, so it is no longer
        leaked (left removed) when the wrapped function raises;
      * the wrapped function's return value is propagated.
    """
    def decorator(func):
        @functools.wraps(func)
        def wrapper(*args, **kwargs):
            if not condition():
                return func(*args, **kwargs)
            saved = os.environ.pop(name, None)
            try:
                return func(*args, **kwargs)
            finally:
                # Restore only if the variable was actually set before.
                if saved is not None:
                    os.environ[name] = saved
        return wrapper
    return decorator
build/torch29-cxx11-cu126-x86_64-linux/utils/__init__.py ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ from . import math, layout
2
+ from .layout import *
3
+ from .math import *
build/torch29-cxx11-cu126-x86_64-linux/utils/layout.py ADDED
@@ -0,0 +1,25 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
# Guard: in partial/broken builds the compiled extension may be absent;
# in that case the TMA-layout helpers are simply not defined.
try:
    from .._ops import ops

    def get_tma_aligned_size(x, element_size):
        # Pass-through to the compiled helper; returns the TMA-aligned size.
        return ops.get_tma_aligned_size(x, element_size)

    def get_mn_major_tma_aligned_tensor(sf):
        return ops.get_mn_major_tma_aligned_tensor(sf)

    def get_mn_major_tma_aligned_packed_ue8m0_tensor(sf):
        return ops.get_mn_major_tma_aligned_packed_ue8m0_tensor(sf)

    def get_k_grouped_mn_major_tma_aligned_packed_ue8m0_tensor(sf, ks_tensor, ks):
        return ops.get_k_grouped_mn_major_tma_aligned_packed_ue8m0_tensor(
            sf, ks_tensor, ks)
except ImportError:
    pass

# NOTE(review): this second import is NOT guarded — if `.._ops` is missing,
# the try/except above is defeated and importing this module still fails
# right here.  Consider folding it into the guarded block (or dropping the
# guard entirely if the extension is in fact mandatory).
from .._ops import ops as _ops

def get_mk_alignment_for_contiguous_layout():
    # Alignment requirement for the grouped-contiguous layouts.
    return _ops.get_mk_alignment_for_contiguous_layout()

# Historical names kept for callers that predate the unified helper.
get_m_alignment_for_contiguous_layout = get_mk_alignment_for_contiguous_layout
get_k_alignment_for_contiguous_layout = get_mk_alignment_for_contiguous_layout