medmekk committed on Sep 22, 2025

Commit

f622ea1

1 Parent(s): 4e9c226

add 9.0 build

Browse files

Files changed (36) hide show

CMakeLists.txt +213 -0
build.toml +4 -1
build/torch27-cxx11-cu118-x86_64-linux/layer_norm/__init__.py +26 -0
build/torch27-cxx11-cu118-x86_64-linux/layer_norm/_layer_norm_4e9c226_dirty.abi3.so +3 -0
build/torch27-cxx11-cu118-x86_64-linux/layer_norm/_ops.py +9 -0
build/torch27-cxx11-cu118-x86_64-linux/layer_norm/layers.py +49 -0
build/torch27-cxx11-cu126-x86_64-linux/layer_norm/__init__.py +26 -0
build/torch27-cxx11-cu126-x86_64-linux/layer_norm/_layer_norm_4e9c226_dirty.abi3.so +3 -0
build/torch27-cxx11-cu126-x86_64-linux/layer_norm/_ops.py +9 -0
build/torch27-cxx11-cu126-x86_64-linux/layer_norm/layers.py +49 -0
build/torch27-cxx11-cu128-x86_64-linux/layer_norm/__init__.py +26 -0
build/torch27-cxx11-cu128-x86_64-linux/layer_norm/_layer_norm_4e9c226_dirty.abi3.so +3 -0
build/torch27-cxx11-cu128-x86_64-linux/layer_norm/_ops.py +9 -0
build/torch27-cxx11-cu128-x86_64-linux/layer_norm/layers.py +49 -0
build/torch28-cxx11-cu126-x86_64-linux/layer_norm/__init__.py +26 -0
build/torch28-cxx11-cu126-x86_64-linux/layer_norm/_layer_norm_4e9c226_dirty.abi3.so +3 -0
build/torch28-cxx11-cu126-x86_64-linux/layer_norm/_ops.py +9 -0
build/torch28-cxx11-cu126-x86_64-linux/layer_norm/layers.py +49 -0
build/torch28-cxx11-cu128-x86_64-linux/layer_norm/__init__.py +26 -0
build/torch28-cxx11-cu128-x86_64-linux/layer_norm/_layer_norm_4e9c226_dirty.abi3.so +3 -0
build/torch28-cxx11-cu128-x86_64-linux/layer_norm/_ops.py +9 -0
build/torch28-cxx11-cu128-x86_64-linux/layer_norm/layers.py +49 -0
build/torch28-cxx11-cu129-x86_64-linux/layer_norm/__init__.py +26 -0
build/torch28-cxx11-cu129-x86_64-linux/layer_norm/_layer_norm_4e9c226_dirty.abi3.so +3 -0
build/torch28-cxx11-cu129-x86_64-linux/layer_norm/_ops.py +9 -0
build/torch28-cxx11-cu129-x86_64-linux/layer_norm/layers.py +49 -0
cmake/hipify.py +76 -0
cmake/utils.cmake +545 -0
flake.lock +168 -0
pyproject.toml +10 -0
setup.py +138 -0
torch-ext/layer_norm/_layer_norm_711aa42_dirty.abi3.so +3 -0
torch-ext/layer_norm/_ops.py +9 -0
torch-ext/registration.h +30 -0
torch-ext/torch_binding.cpp +146 -9
torch-ext/torch_binding.h +66 -4

CMakeLists.txt ADDED Viewed

	@@ -0,0 +1,213 @@

+# Build configuration for the layer_norm extension module.
+cmake_minimum_required(VERSION 3.26)
+project(layer_norm LANGUAGES CXX)
+# Backend selector. Further down, any value other than "cuda" or "rocm" makes
+# this file return before an extension target is defined.
+set(TARGET_DEVICE "cuda" CACHE STRING "Target device backend for kernel")
+# Writes `set(CMAKE_INSTALL_LOCAL_ONLY TRUE)` into the generated install script
+# for all components.
+# NOTE(review): presumably this keeps `install` from descending into
+# subdirectories -- confirm.
+install(CODE "set(CMAKE_INSTALL_LOCAL_ONLY TRUE)" ALL_COMPONENTS)
+include(FetchContent)
+file(MAKE_DIRECTORY ${FETCHCONTENT_BASE_DIR}) # Ensure the directory exists
+message(STATUS "FetchContent base directory: ${FETCHCONTENT_BASE_DIR}")
+# Architectures this project accepts. CUDA_SUPPORTED_ARCHS is used below to
+# filter the detected CUDA targets; HIP_SUPPORTED_ARCHS is used for ROCm.
+set(CUDA_SUPPORTED_ARCHS "7.0;7.2;7.5;8.0;8.6;8.7;8.9;9.0;10.0;10.1;12.0")
+set(HIP_SUPPORTED_ARCHS "gfx906;gfx908;gfx90a;gfx940;gfx941;gfx942;gfx1030;gfx1100;gfx1101")
+# Project-local helpers. The helper commands called later in this file (for
+# example append_cmake_prefix_path) are not defined here, so they presumably
+# come from this include.
+include(${CMAKE_CURRENT_LIST_DIR}/cmake/utils.cmake)
+# Locate Python: the interpreter plus the Development and
+# Development.SABIModule components.
+if(DEFINED Python_EXECUTABLE)
+  # Allow passing through the interpreter (e.g. from setup.py).
+  find_package(Python COMPONENTS Development Development.SABIModule Interpreter)
+  if(NOT Python_FOUND)
+    # Name the interpreter that was actually requested. The message used to
+    # expand ${EXECUTABLE}, which is never set in this file, so the path in the
+    # error was empty.
+    message(FATAL_ERROR "Unable to find python matching: ${Python_EXECUTABLE}.")
+  endif()
+else()
+  # No interpreter was supplied: let CMake pick one and fail if none is usable.
+  find_package(Python REQUIRED COMPONENTS Development Development.SABIModule Interpreter)
+endif()
+# NOTE(review): append_cmake_prefix_path is a project helper; presumably it
+# asks the "torch" package for torch.utils.cmake_prefix_path and adds it to the
+# prefix path so the find_package(Torch) call below can resolve -- confirm.
+append_cmake_prefix_path("torch" "torch.utils.cmake_prefix_path")
+find_package(Torch REQUIRED)
+# Only the "cuda" and "rocm" backends are handled below; for any other value
+# stop processing this file here.
+if (NOT TARGET_DEVICE STREQUAL "cuda" AND
+    NOT TARGET_DEVICE STREQUAL "rocm")
+    return()
+endif()
+# Default architecture list for kernels. With a CUDA compiler of version 12.8
+# or newer the list runs through 12.0; otherwise it stops at 9.0. In both cases
+# the last entry carries the +PTX suffix.
+# NOTE(review): project() enables only CXX, so CMAKE_CUDA_COMPILER_VERSION is
+# defined here only if something earlier (e.g. loading Torch) enabled CUDA --
+# confirm, otherwise the shorter list is always chosen.
+if(DEFINED CMAKE_CUDA_COMPILER_VERSION AND
+   CMAKE_CUDA_COMPILER_VERSION VERSION_GREATER_EQUAL 12.8)
+ set(CUDA_DEFAULT_KERNEL_ARCHS "7.0;7.2;7.5;8.0;8.6;8.7;8.9;9.0;10.0;10.1;12.0+PTX")
+else()
+  set(CUDA_DEFAULT_KERNEL_ARCHS "7.0;7.2;7.5;8.0;8.6;8.7;8.9;9.0+PTX")
+endif()
+# Select the GPU language. HIP is chosen whenever HIP_FOUND is true; CUDA is
+# chosen only when HIP is absent and CUDA_FOUND is true; with neither the
+# configure step aborts.
+if(HIP_FOUND)
+  set(GPU_LANG "HIP")
+  # Importing torch recognizes and sets up some HIP/ROCm configuration but does
+  # not let cmake recognize .hip files. In order to get cmake to understand the
+  # .hip extension automatically, HIP must be enabled explicitly.
+  enable_language(HIP)
+elseif(CUDA_FOUND)
+  set(GPU_LANG "CUDA")
+else()
+  message(FATAL_ERROR "Can't find CUDA or HIP installation.")
+endif()
+# Per-backend architecture selection, plus a compile definition that marks
+# which backend the sources are being built for.
+if(GPU_LANG STREQUAL "CUDA")
+  clear_cuda_arches(CUDA_ARCH_FLAGS)
+  extract_unique_cuda_archs_ascending(CUDA_ARCHS "${CUDA_ARCH_FLAGS}")
+  message(STATUS "CUDA target architectures: ${CUDA_ARCHS}")
+  # Filter the target architectures by the supported archs
+  # since for some files we will build for all CUDA_ARCHS.
+  cuda_archs_loose_intersection(CUDA_ARCHS "${CUDA_SUPPORTED_ARCHS}" "${CUDA_ARCHS}")
+  message(STATUS "CUDA supported target architectures: ${CUDA_ARCHS}")
+  # GPU_LANG is already known to be CUDA inside this branch, so only the
+  # thread-count setting needs checking.
+  if(NVCC_THREADS)
+    list(APPEND GPU_FLAGS "--threads=${NVCC_THREADS}")
+  endif()
+  add_compile_definitions(CUDA_KERNEL)
+elseif(GPU_LANG STREQUAL "HIP")
+  set(ROCM_ARCHS "${HIP_SUPPORTED_ARCHS}")
+  # TODO: remove this once we can set specific archs per source file set.
+  override_gpu_arches(GPU_ARCHES HIP "${HIP_SUPPORTED_ARCHS}")
+  add_compile_definitions(ROCM_KERNEL)
+else()
+  # Fallback for any other language value: use that language's own
+  # <LANG>_SUPPORTED_ARCHS list.
+  override_gpu_arches(GPU_ARCHES ${GPU_LANG} "${${GPU_LANG}_SUPPORTED_ARCHS}")
+endif()
+# Ask the project helper for Torch's compiler flags for the selected GPU
+# language and append them to GPU_FLAGS.
+get_torch_gpu_compiler_flags(TORCH_GPU_FLAGS ${GPU_LANG})
+list(APPEND GPU_FLAGS ${TORCH_GPU_FLAGS})
+# Torch binding sources; these are the first entries added to SRC, the source
+# list later handed to the extension target.
+set(TORCH_layer_norm_SRC
+  torch-ext/torch_binding.cpp torch-ext/torch_binding.h
+)
+list(APPEND SRC "${TORCH_layer_norm_SRC}")
+# Sources and headers of the layer_norm kernel. The .cu files come in four
+# families (ln_fwd, ln_bwd, ln_parallel_fwd, ln_parallel_bwd), each with one
+# file per numeric suffix from 256 to 8192.
+set(layer_norm_SRC
+  "layer_norm/ln.h"
+"layer_norm/ln_api.cpp"
+"layer_norm/ln_bwd_1024.cu"
+"layer_norm/ln_bwd_1280.cu"
+"layer_norm/ln_bwd_1536.cu"
+"layer_norm/ln_bwd_2048.cu"
+"layer_norm/ln_bwd_256.cu"
+"layer_norm/ln_bwd_2560.cu"
+"layer_norm/ln_bwd_3072.cu"
+"layer_norm/ln_bwd_4096.cu"
+"layer_norm/ln_bwd_512.cu"
+"layer_norm/ln_bwd_5120.cu"
+"layer_norm/ln_bwd_6144.cu"
+"layer_norm/ln_bwd_7168.cu"
+"layer_norm/ln_bwd_768.cu"
+"layer_norm/ln_bwd_8192.cu"
+"layer_norm/ln_bwd_kernels.cuh"
+"layer_norm/ln_fwd_1024.cu"
+"layer_norm/ln_fwd_1280.cu"
+"layer_norm/ln_fwd_1536.cu"
+"layer_norm/ln_fwd_2048.cu"
+"layer_norm/ln_fwd_256.cu"
+"layer_norm/ln_fwd_2560.cu"
+"layer_norm/ln_fwd_3072.cu"
+"layer_norm/ln_fwd_4096.cu"
+"layer_norm/ln_fwd_512.cu"
+"layer_norm/ln_fwd_5120.cu"
+"layer_norm/ln_fwd_6144.cu"
+"layer_norm/ln_fwd_7168.cu"
+"layer_norm/ln_fwd_768.cu"
+"layer_norm/ln_fwd_8192.cu"
+"layer_norm/ln_fwd_kernels.cuh"
+"layer_norm/ln_kernel_traits.h"
+"layer_norm/ln_parallel_bwd_1024.cu"
+"layer_norm/ln_parallel_bwd_1280.cu"
+"layer_norm/ln_parallel_bwd_1536.cu"
+"layer_norm/ln_parallel_bwd_2048.cu"
+"layer_norm/ln_parallel_bwd_256.cu"
+"layer_norm/ln_parallel_bwd_2560.cu"
+"layer_norm/ln_parallel_bwd_3072.cu"
+"layer_norm/ln_parallel_bwd_4096.cu"
+"layer_norm/ln_parallel_bwd_512.cu"
+"layer_norm/ln_parallel_bwd_5120.cu"
+"layer_norm/ln_parallel_bwd_6144.cu"
+"layer_norm/ln_parallel_bwd_7168.cu"
+"layer_norm/ln_parallel_bwd_768.cu"
+"layer_norm/ln_parallel_bwd_8192.cu"
+"layer_norm/ln_parallel_fwd_1024.cu"
+"layer_norm/ln_parallel_fwd_1280.cu"
+"layer_norm/ln_parallel_fwd_1536.cu"
+"layer_norm/ln_parallel_fwd_2048.cu"
+"layer_norm/ln_parallel_fwd_256.cu"
+"layer_norm/ln_parallel_fwd_2560.cu"
+"layer_norm/ln_parallel_fwd_3072.cu"
+"layer_norm/ln_parallel_fwd_4096.cu"
+"layer_norm/ln_parallel_fwd_512.cu"
+"layer_norm/ln_parallel_fwd_5120.cu"
+"layer_norm/ln_parallel_fwd_6144.cu"
+"layer_norm/ln_parallel_fwd_7168.cu"
+"layer_norm/ln_parallel_fwd_768.cu"
+"layer_norm/ln_parallel_fwd_8192.cu"
+"layer_norm/ln_parallel_residual_bwd_kernels.cuh"
+"layer_norm/ln_parallel_residual_fwd_kernels.cuh"
+"layer_norm/ln_utils.cuh"
+"layer_norm/static_switch.h"
+)
+# Give every layer_norm source file the top-level source directory as a
+# per-file include directory.
+# TODO: check if CLion supports this:
+# https://youtrack.jetbrains.com/issue/CPP-16510/CLion-does-not-handle-per-file-include-directories
+# NOTE(review): CMAKE_SOURCE_DIR is the root of the whole build; if this file
+# is ever consumed as a subproject, PROJECT_SOURCE_DIR may be what is wanted --
+# confirm.
+set_source_files_properties(
+  ${layer_norm_SRC}
+  PROPERTIES INCLUDE_DIRECTORIES "${CMAKE_SOURCE_DIR}/.")
+# CUDA builds only: work out the architectures for the layer_norm kernel,
+# attach per-file compile options, and add the kernel files to SRC.
+if(GPU_LANG STREQUAL "CUDA")
+  cuda_archs_loose_intersection(layer_norm_ARCHS "${CUDA_DEFAULT_KERNEL_ARCHS}" "${CUDA_ARCHS}")
+  message(STATUS "Capabilities for kernel layer_norm: ${layer_norm_ARCHS}")
+  set_gencode_flags_for_srcs(SRCS "${layer_norm_SRC}" CUDA_ARCHS "${layer_norm_ARCHS}")
+  foreach(_kernel_file IN LISTS layer_norm_SRC)
+    # .cu files get the CUDA-language options first ...
+    if(_kernel_file MATCHES ".*\\.cu$")
+      set_property(
+        SOURCE ${_kernel_file}
+        APPEND PROPERTY
+        COMPILE_OPTIONS "$<$<COMPILE_LANGUAGE:CUDA>:-O3;-U__CUDA_NO_HALF_OPERATORS__;-U__CUDA_NO_HALF_CONVERSIONS__;-U__CUDA_NO_BFLOAT16_OPERATORS__;-U__CUDA_NO_BFLOAT16_CONVERSIONS__;-U__CUDA_NO_BFLOAT162_OPERATORS__;-U__CUDA_NO_BFLOAT162_CONVERSIONS__;--expt-relaxed-constexpr;--expt-extended-lambda;--use_fast_math>"
+      )
+    endif()
+    # ... and every listed file then gets the C++-language option.
+    set_property(
+      SOURCE ${_kernel_file}
+      APPEND PROPERTY
+      COMPILE_OPTIONS "$<$<COMPILE_LANGUAGE:CXX>:-DFLASHATTENTION_DISABLE_PYBIND>"
+    )
+  endforeach()
+  list(APPEND SRC "${layer_norm_SRC}")
+endif()
+# Define the extension module target through the project helper, passing the
+# accumulated sources (SRC), compile flags (GPU_FLAGS) and architectures
+# (GPU_ARCHES) for the selected GPU language.
+# NOTE(review): USE_SABI 3 and WITH_SOABI are forwarded to the helper;
+# presumably they select the Python stable ABI and the ABI file suffix --
+# confirm in cmake/utils.cmake.
+define_gpu_extension_target(
+  _layer_norm_711aa42_dirty
+  DESTINATION _layer_norm_711aa42_dirty
+  LANGUAGE ${GPU_LANG}
+  SOURCES ${SRC}
+  COMPILE_FLAGS ${GPU_FLAGS}
+  ARCHITECTURES ${GPU_ARCHES}
+  #INCLUDE_DIRECTORIES ${CUTLASS_INCLUDE_DIR}
+  USE_SABI 3
+  WITH_SOABI)
+# Pass -static-libstdc++ to the link step of the extension target only.
+target_link_options(_layer_norm_711aa42_dirty PRIVATE -static-libstdc++)

build.toml CHANGED Viewed

@@ -11,6 +11,9 @@ src = [
 [kernel.layer_norm]
 depends = ["torch"]
 backend = "cuda"
 include = ["."]
 src = [
     "layer_norm/ln.h",
@@ -79,7 +82,7 @@ src = [
     "layer_norm/ln_utils.cuh",
     "layer_norm/static_switch.h"
 ]
-cxx-flags = ["-DFLASHATTENTION_DISABLE_PYBIND"]
 cuda-flags = [
     "-O3",
     "-U__CUDA_NO_HALF_OPERATORS__",

 [kernel.layer_norm]
 depends = ["torch"]
 backend = "cuda"
+cuda-capabilities = [
+    "9.0"
+]
 include = ["."]
 src = [
     "layer_norm/ln.h",
     "layer_norm/ln_utils.cuh",
     "layer_norm/static_switch.h"
 ]
+cxx-flags = ["-DFLASHATTENTION_DISABLE_PYBIND", "-mcmodel=large"]
 cuda-flags = [
     "-O3",
     "-U__CUDA_NO_HALF_OPERATORS__",

build/torch27-cxx11-cu118-x86_64-linux/layer_norm/__init__.py ADDED Viewed

	@@ -0,0 +1,26 @@

+import torch
+import torch.nn as nn
+from ._ops import ops
+from . import layers
+def dropout_add_ln_fwd(input, gamma, beta, rowscale, colscale, x0_subset, z_subset, dropout_p, epsilon, rowscale_const, z_numrows, gen, residual_in_fp32, is_rms_norm):
+    """Pass every argument, unchanged and in the same order, to ``ops.dropout_add_ln_fwd`` and return its result."""
+    return ops.dropout_add_ln_fwd(input, gamma, beta, rowscale, colscale, x0_subset, z_subset, dropout_p, epsilon, rowscale_const, z_numrows, gen, residual_in_fp32, is_rms_norm)
+def dropout_add_ln_bwd(dz, dx, x, mu, rsigma, gamma, rowscale, colscale, x0_subset, z_subset, dropout_p, rowscale_const, x0_numrows, has_residual, is_rms_norm):
+    """Pass every argument, unchanged and in the same order, to ``ops.dropout_add_ln_bwd`` and return its result."""
+    return ops.dropout_add_ln_bwd(dz, dx, x, mu, rsigma, gamma, rowscale, colscale, x0_subset, z_subset, dropout_p, rowscale_const, x0_numrows, has_residual, is_rms_norm)
+def dropout_add_ln_parallel_residual_fwd(input, gamma0, beta0, gamma1, beta1, dropout_p, epsilon, gen, residual_in_fp32, is_rms_norm):
+    """Pass every argument, unchanged and in the same order, to ``ops.dropout_add_ln_parallel_residual_fwd`` and return its result."""
+    return ops.dropout_add_ln_parallel_residual_fwd(input, gamma0, beta0, gamma1, beta1, dropout_p, epsilon, gen, residual_in_fp32, is_rms_norm)
+def dropout_add_ln_parallel_residual_bwd(dz0, dz1, dx, x, mu, rsigma, gamma0, gamma1, dropout_p, has_x1, has_residual, is_rms_norm):
+    """Pass every argument, unchanged and in the same order, to ``ops.dropout_add_ln_parallel_residual_bwd`` and return its result."""
+    return ops.dropout_add_ln_parallel_residual_bwd(dz0, dz1, dx, x, mu, rsigma, gamma0, gamma1, dropout_p, has_x1, has_residual, is_rms_norm)
+__all__ = [
+    "layers",
+    "dropout_add_ln_fwd",
+    "dropout_add_ln_bwd",
+    "dropout_add_ln_parallel_residual_fwd",
+    "dropout_add_ln_parallel_residual_bwd",
+]

build/torch27-cxx11-cu118-x86_64-linux/layer_norm/_layer_norm_4e9c226_dirty.abi3.so ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:34e4a57b8d721c4dafb541a81e161435d25198632e3e4c8e2bc66c17eccc236f
+size 248321384

build/torch27-cxx11-cu118-x86_64-linux/layer_norm/_ops.py ADDED Viewed

	@@ -0,0 +1,9 @@

+import torch
+from . import _layer_norm_4e9c226_dirty
+ops = torch.ops._layer_norm_4e9c226_dirty
+def add_op_namespace_prefix(op_name: str) -> str:
+    """
+    Prefix op by namespace.
+
+    Returns ``op_name`` qualified as ``_layer_norm_4e9c226_dirty::<op_name>``.
+    """
+    return f"_layer_norm_4e9c226_dirty::{op_name}"

build/torch27-cxx11-cu118-x86_64-linux/layer_norm/layers.py ADDED Viewed

	@@ -0,0 +1,49 @@

+import torch
+import torch.nn as nn
+from ._ops import ops
+class LayerNorm(nn.Module):
+    """Layer whose ``forward`` delegates to ``ops.dropout_add_ln_fwd`` with ``is_rms_norm=False``.
+
+    This class defines no ``__init__``; ``weight`` and ``variance_epsilon`` are
+    only declared below, so they have to be set on the instance by other code.
+    """
+    weight: torch.Tensor
+    variance_epsilon: float
+    def forward(self, hidden_states: torch.Tensor) -> torch.Tensor:
+        """Call the op on ``hidden_states`` with ``self.weight`` as gamma and ``self.variance_epsilon`` as epsilon.
+
+        ``beta``, ``rowscale``, ``colscale``, ``x0_subset``, ``z_subset`` and
+        ``gen`` are passed as ``None`` and ``dropout_p`` as 0.
+        """
+        # NOTE(review): the return annotation says Tensor, but the value is
+        # whatever the op returns -- confirm against the op's schema.
+        # NOTE(review): z_numrows is hidden_states.shape[1]; confirm that is the
+        # intended dimension for the expected input layout.
+        return ops.dropout_add_ln_fwd(
+            hidden_states,
+            gamma = self.weight,
+            beta = None,
+            rowscale = None,
+            colscale = None,
+            x0_subset = None,
+            z_subset = None,
+            dropout_p = 0,
+            epsilon = self.variance_epsilon,
+            rowscale_const = 1.0,
+            z_numrows = hidden_states.shape[1],
+            gen = None,
+            residual_in_fp32 = False,
+            is_rms_norm = False,
+        )
+class LlamaRMSNorm(nn.Module):
+    """Layer whose ``forward`` delegates to ``ops.dropout_add_ln_fwd`` with ``is_rms_norm=True``.
+
+    This class defines no ``__init__``; ``weight`` and ``variance_epsilon`` are
+    only declared below, so they have to be set on the instance by other code.
+    """
+    weight: torch.Tensor
+    variance_epsilon: float
+    def forward(self, hidden_states: torch.Tensor) -> torch.Tensor:
+        """Call the op on ``hidden_states`` with ``self.weight`` as gamma and ``self.variance_epsilon`` as epsilon.
+
+        ``beta``, ``rowscale``, ``colscale``, ``x0_subset``, ``z_subset`` and
+        ``gen`` are passed as ``None`` and ``dropout_p`` as 0.
+        """
+        # NOTE(review): the return annotation says Tensor, but the value is
+        # whatever the op returns -- confirm against the op's schema.
+        # NOTE(review): z_numrows is hidden_states.shape[1]; confirm that is the
+        # intended dimension for the expected input layout.
+        return ops.dropout_add_ln_fwd(
+            hidden_states,
+            gamma = self.weight,
+            beta = None,
+            rowscale = None,
+            colscale = None,
+            x0_subset = None,
+            z_subset = None,
+            dropout_p = 0,
+            epsilon = self.variance_epsilon,
+            rowscale_const = 1.0,
+            z_numrows = hidden_states.shape[1],
+            gen = None,
+            residual_in_fp32 = False,
+            is_rms_norm = True,
+        )

build/torch27-cxx11-cu126-x86_64-linux/layer_norm/__init__.py ADDED Viewed

	@@ -0,0 +1,26 @@

+import torch
+import torch.nn as nn
+from ._ops import ops
+from . import layers
+def dropout_add_ln_fwd(input, gamma, beta, rowscale, colscale, x0_subset, z_subset, dropout_p, epsilon, rowscale_const, z_numrows, gen, residual_in_fp32, is_rms_norm):
+    """Pass every argument, unchanged and in the same order, to ``ops.dropout_add_ln_fwd`` and return its result."""
+    return ops.dropout_add_ln_fwd(input, gamma, beta, rowscale, colscale, x0_subset, z_subset, dropout_p, epsilon, rowscale_const, z_numrows, gen, residual_in_fp32, is_rms_norm)
+def dropout_add_ln_bwd(dz, dx, x, mu, rsigma, gamma, rowscale, colscale, x0_subset, z_subset, dropout_p, rowscale_const, x0_numrows, has_residual, is_rms_norm):
+    """Pass every argument, unchanged and in the same order, to ``ops.dropout_add_ln_bwd`` and return its result."""
+    return ops.dropout_add_ln_bwd(dz, dx, x, mu, rsigma, gamma, rowscale, colscale, x0_subset, z_subset, dropout_p, rowscale_const, x0_numrows, has_residual, is_rms_norm)
+def dropout_add_ln_parallel_residual_fwd(input, gamma0, beta0, gamma1, beta1, dropout_p, epsilon, gen, residual_in_fp32, is_rms_norm):
+    """Pass every argument, unchanged and in the same order, to ``ops.dropout_add_ln_parallel_residual_fwd`` and return its result."""
+    return ops.dropout_add_ln_parallel_residual_fwd(input, gamma0, beta0, gamma1, beta1, dropout_p, epsilon, gen, residual_in_fp32, is_rms_norm)
+def dropout_add_ln_parallel_residual_bwd(dz0, dz1, dx, x, mu, rsigma, gamma0, gamma1, dropout_p, has_x1, has_residual, is_rms_norm):
+    """Pass every argument, unchanged and in the same order, to ``ops.dropout_add_ln_parallel_residual_bwd`` and return its result."""
+    return ops.dropout_add_ln_parallel_residual_bwd(dz0, dz1, dx, x, mu, rsigma, gamma0, gamma1, dropout_p, has_x1, has_residual, is_rms_norm)
+__all__ = [
+    "layers",
+    "dropout_add_ln_fwd",
+    "dropout_add_ln_bwd",
+    "dropout_add_ln_parallel_residual_fwd",
+    "dropout_add_ln_parallel_residual_bwd",
+]

build/torch27-cxx11-cu126-x86_64-linux/layer_norm/_layer_norm_4e9c226_dirty.abi3.so ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:f541911e5471865e47faf1641da36bcee3b206aa4993949a3cac966c3b936d27
+size 247115320

build/torch27-cxx11-cu126-x86_64-linux/layer_norm/_ops.py ADDED Viewed

	@@ -0,0 +1,9 @@

+import torch
+from . import _layer_norm_4e9c226_dirty
+ops = torch.ops._layer_norm_4e9c226_dirty
+def add_op_namespace_prefix(op_name: str) -> str:
+    """
+    Prefix op by namespace.
+
+    Returns ``op_name`` qualified as ``_layer_norm_4e9c226_dirty::<op_name>``.
+    """
+    return f"_layer_norm_4e9c226_dirty::{op_name}"

build/torch27-cxx11-cu126-x86_64-linux/layer_norm/layers.py ADDED Viewed

	@@ -0,0 +1,49 @@

+import torch
+import torch.nn as nn
+from ._ops import ops
+class LayerNorm(nn.Module):
+    """Layer whose ``forward`` delegates to ``ops.dropout_add_ln_fwd`` with ``is_rms_norm=False``.
+
+    This class defines no ``__init__``; ``weight`` and ``variance_epsilon`` are
+    only declared below, so they have to be set on the instance by other code.
+    """
+    weight: torch.Tensor
+    variance_epsilon: float
+    def forward(self, hidden_states: torch.Tensor) -> torch.Tensor:
+        """Call the op on ``hidden_states`` with ``self.weight`` as gamma and ``self.variance_epsilon`` as epsilon.
+
+        ``beta``, ``rowscale``, ``colscale``, ``x0_subset``, ``z_subset`` and
+        ``gen`` are passed as ``None`` and ``dropout_p`` as 0.
+        """
+        # NOTE(review): the return annotation says Tensor, but the value is
+        # whatever the op returns -- confirm against the op's schema.
+        # NOTE(review): z_numrows is hidden_states.shape[1]; confirm that is the
+        # intended dimension for the expected input layout.
+        return ops.dropout_add_ln_fwd(
+            hidden_states,
+            gamma = self.weight,
+            beta = None,
+            rowscale = None,
+            colscale = None,
+            x0_subset = None,
+            z_subset = None,
+            dropout_p = 0,
+            epsilon = self.variance_epsilon,
+            rowscale_const = 1.0,
+            z_numrows = hidden_states.shape[1],
+            gen = None,
+            residual_in_fp32 = False,
+            is_rms_norm = False,
+        )
+class LlamaRMSNorm(nn.Module):
+    """Layer whose ``forward`` delegates to ``ops.dropout_add_ln_fwd`` with ``is_rms_norm=True``.
+
+    This class defines no ``__init__``; ``weight`` and ``variance_epsilon`` are
+    only declared below, so they have to be set on the instance by other code.
+    """
+    weight: torch.Tensor
+    variance_epsilon: float
+    def forward(self, hidden_states: torch.Tensor) -> torch.Tensor:
+        """Call the op on ``hidden_states`` with ``self.weight`` as gamma and ``self.variance_epsilon`` as epsilon.
+
+        ``beta``, ``rowscale``, ``colscale``, ``x0_subset``, ``z_subset`` and
+        ``gen`` are passed as ``None`` and ``dropout_p`` as 0.
+        """
+        # NOTE(review): the return annotation says Tensor, but the value is
+        # whatever the op returns -- confirm against the op's schema.
+        # NOTE(review): z_numrows is hidden_states.shape[1]; confirm that is the
+        # intended dimension for the expected input layout.
+        return ops.dropout_add_ln_fwd(
+            hidden_states,
+            gamma = self.weight,
+            beta = None,
+            rowscale = None,
+            colscale = None,
+            x0_subset = None,
+            z_subset = None,
+            dropout_p = 0,
+            epsilon = self.variance_epsilon,
+            rowscale_const = 1.0,
+            z_numrows = hidden_states.shape[1],
+            gen = None,
+            residual_in_fp32 = False,
+            is_rms_norm = True,
+        )

build/torch27-cxx11-cu128-x86_64-linux/layer_norm/__init__.py ADDED Viewed

	@@ -0,0 +1,26 @@

+import torch
+import torch.nn as nn
+from ._ops import ops
+from . import layers
+def dropout_add_ln_fwd(input, gamma, beta, rowscale, colscale, x0_subset, z_subset, dropout_p, epsilon, rowscale_const, z_numrows, gen, residual_in_fp32, is_rms_norm):
+    """Pass every argument, unchanged and in the same order, to ``ops.dropout_add_ln_fwd`` and return its result."""
+    return ops.dropout_add_ln_fwd(input, gamma, beta, rowscale, colscale, x0_subset, z_subset, dropout_p, epsilon, rowscale_const, z_numrows, gen, residual_in_fp32, is_rms_norm)
+def dropout_add_ln_bwd(dz, dx, x, mu, rsigma, gamma, rowscale, colscale, x0_subset, z_subset, dropout_p, rowscale_const, x0_numrows, has_residual, is_rms_norm):
+    """Pass every argument, unchanged and in the same order, to ``ops.dropout_add_ln_bwd`` and return its result."""
+    return ops.dropout_add_ln_bwd(dz, dx, x, mu, rsigma, gamma, rowscale, colscale, x0_subset, z_subset, dropout_p, rowscale_const, x0_numrows, has_residual, is_rms_norm)
+def dropout_add_ln_parallel_residual_fwd(input, gamma0, beta0, gamma1, beta1, dropout_p, epsilon, gen, residual_in_fp32, is_rms_norm):
+    """Pass every argument, unchanged and in the same order, to ``ops.dropout_add_ln_parallel_residual_fwd`` and return its result."""
+    return ops.dropout_add_ln_parallel_residual_fwd(input, gamma0, beta0, gamma1, beta1, dropout_p, epsilon, gen, residual_in_fp32, is_rms_norm)
+def dropout_add_ln_parallel_residual_bwd(dz0, dz1, dx, x, mu, rsigma, gamma0, gamma1, dropout_p, has_x1, has_residual, is_rms_norm):
+    """Pass every argument, unchanged and in the same order, to ``ops.dropout_add_ln_parallel_residual_bwd`` and return its result."""
+    return ops.dropout_add_ln_parallel_residual_bwd(dz0, dz1, dx, x, mu, rsigma, gamma0, gamma1, dropout_p, has_x1, has_residual, is_rms_norm)
+__all__ = [
+    "layers",
+    "dropout_add_ln_fwd",
+    "dropout_add_ln_bwd",
+    "dropout_add_ln_parallel_residual_fwd",
+    "dropout_add_ln_parallel_residual_bwd",
+]

build/torch27-cxx11-cu128-x86_64-linux/layer_norm/_layer_norm_4e9c226_dirty.abi3.so ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:7db683e74d55a1a71dc520a504521af3f08fb07724675d2097ce3d4ab3481e3d
+size 246751936

build/torch27-cxx11-cu128-x86_64-linux/layer_norm/_ops.py ADDED Viewed

	@@ -0,0 +1,9 @@

+import torch
+from . import _layer_norm_4e9c226_dirty
+ops = torch.ops._layer_norm_4e9c226_dirty
+def add_op_namespace_prefix(op_name: str) -> str:
+    """
+    Prefix op by namespace.
+
+    Returns ``op_name`` qualified as ``_layer_norm_4e9c226_dirty::<op_name>``.
+    """
+    return f"_layer_norm_4e9c226_dirty::{op_name}"

build/torch27-cxx11-cu128-x86_64-linux/layer_norm/layers.py ADDED Viewed

	@@ -0,0 +1,49 @@

+import torch
+import torch.nn as nn
+from ._ops import ops
+class LayerNorm(nn.Module):
+    """Layer whose ``forward`` delegates to ``ops.dropout_add_ln_fwd`` with ``is_rms_norm=False``.
+
+    This class defines no ``__init__``; ``weight`` and ``variance_epsilon`` are
+    only declared below, so they have to be set on the instance by other code.
+    """
+    weight: torch.Tensor
+    variance_epsilon: float
+    def forward(self, hidden_states: torch.Tensor) -> torch.Tensor:
+        """Call the op on ``hidden_states`` with ``self.weight`` as gamma and ``self.variance_epsilon`` as epsilon.
+
+        ``beta``, ``rowscale``, ``colscale``, ``x0_subset``, ``z_subset`` and
+        ``gen`` are passed as ``None`` and ``dropout_p`` as 0.
+        """
+        # NOTE(review): the return annotation says Tensor, but the value is
+        # whatever the op returns -- confirm against the op's schema.
+        # NOTE(review): z_numrows is hidden_states.shape[1]; confirm that is the
+        # intended dimension for the expected input layout.
+        return ops.dropout_add_ln_fwd(
+            hidden_states,
+            gamma = self.weight,
+            beta = None,
+            rowscale = None,
+            colscale = None,
+            x0_subset = None,
+            z_subset = None,
+            dropout_p = 0,
+            epsilon = self.variance_epsilon,
+            rowscale_const = 1.0,
+            z_numrows = hidden_states.shape[1],
+            gen = None,
+            residual_in_fp32 = False,
+            is_rms_norm = False,
+        )
+class LlamaRMSNorm(nn.Module):
+    """Layer whose ``forward`` delegates to ``ops.dropout_add_ln_fwd`` with ``is_rms_norm=True``.
+
+    This class defines no ``__init__``; ``weight`` and ``variance_epsilon`` are
+    only declared below, so they have to be set on the instance by other code.
+    """
+    weight: torch.Tensor
+    variance_epsilon: float
+    def forward(self, hidden_states: torch.Tensor) -> torch.Tensor:
+        """Call the op on ``hidden_states`` with ``self.weight`` as gamma and ``self.variance_epsilon`` as epsilon.
+
+        ``beta``, ``rowscale``, ``colscale``, ``x0_subset``, ``z_subset`` and
+        ``gen`` are passed as ``None`` and ``dropout_p`` as 0.
+        """
+        # NOTE(review): the return annotation says Tensor, but the value is
+        # whatever the op returns -- confirm against the op's schema.
+        # NOTE(review): z_numrows is hidden_states.shape[1]; confirm that is the
+        # intended dimension for the expected input layout.
+        return ops.dropout_add_ln_fwd(
+            hidden_states,
+            gamma = self.weight,
+            beta = None,
+            rowscale = None,
+            colscale = None,
+            x0_subset = None,
+            z_subset = None,
+            dropout_p = 0,
+            epsilon = self.variance_epsilon,
+            rowscale_const = 1.0,
+            z_numrows = hidden_states.shape[1],
+            gen = None,
+            residual_in_fp32 = False,
+            is_rms_norm = True,
+        )

build/torch28-cxx11-cu126-x86_64-linux/layer_norm/__init__.py ADDED Viewed

	@@ -0,0 +1,26 @@

+import torch
+import torch.nn as nn
+from ._ops import ops
+from . import layers
+def dropout_add_ln_fwd(input, gamma, beta, rowscale, colscale, x0_subset, z_subset, dropout_p, epsilon, rowscale_const, z_numrows, gen, residual_in_fp32, is_rms_norm):
+    """Pass every argument, unchanged and in the same order, to ``ops.dropout_add_ln_fwd`` and return its result."""
+    return ops.dropout_add_ln_fwd(input, gamma, beta, rowscale, colscale, x0_subset, z_subset, dropout_p, epsilon, rowscale_const, z_numrows, gen, residual_in_fp32, is_rms_norm)
+def dropout_add_ln_bwd(dz, dx, x, mu, rsigma, gamma, rowscale, colscale, x0_subset, z_subset, dropout_p, rowscale_const, x0_numrows, has_residual, is_rms_norm):
+    """Pass every argument, unchanged and in the same order, to ``ops.dropout_add_ln_bwd`` and return its result."""
+    return ops.dropout_add_ln_bwd(dz, dx, x, mu, rsigma, gamma, rowscale, colscale, x0_subset, z_subset, dropout_p, rowscale_const, x0_numrows, has_residual, is_rms_norm)
+def dropout_add_ln_parallel_residual_fwd(input, gamma0, beta0, gamma1, beta1, dropout_p, epsilon, gen, residual_in_fp32, is_rms_norm):
+    """Pass every argument, unchanged and in the same order, to ``ops.dropout_add_ln_parallel_residual_fwd`` and return its result."""
+    return ops.dropout_add_ln_parallel_residual_fwd(input, gamma0, beta0, gamma1, beta1, dropout_p, epsilon, gen, residual_in_fp32, is_rms_norm)
+def dropout_add_ln_parallel_residual_bwd(dz0, dz1, dx, x, mu, rsigma, gamma0, gamma1, dropout_p, has_x1, has_residual, is_rms_norm):
+    """Pass every argument, unchanged and in the same order, to ``ops.dropout_add_ln_parallel_residual_bwd`` and return its result."""
+    return ops.dropout_add_ln_parallel_residual_bwd(dz0, dz1, dx, x, mu, rsigma, gamma0, gamma1, dropout_p, has_x1, has_residual, is_rms_norm)
+__all__ = [
+    "layers",
+    "dropout_add_ln_fwd",
+    "dropout_add_ln_bwd",
+    "dropout_add_ln_parallel_residual_fwd",
+    "dropout_add_ln_parallel_residual_bwd",
+]

build/torch28-cxx11-cu126-x86_64-linux/layer_norm/_layer_norm_4e9c226_dirty.abi3.so ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:5b28a4d7885c08614b479490306561990c4cf6e5958dedd5ce59c2ee10bd0f0a
+size 247115408

build/torch28-cxx11-cu126-x86_64-linux/layer_norm/_ops.py ADDED Viewed

	@@ -0,0 +1,9 @@

+import torch
+from . import _layer_norm_4e9c226_dirty
+ops = torch.ops._layer_norm_4e9c226_dirty
+def add_op_namespace_prefix(op_name: str) -> str:
+    """
+    Prefix op by namespace.
+
+    Returns ``op_name`` qualified as ``_layer_norm_4e9c226_dirty::<op_name>``.
+    """
+    return f"_layer_norm_4e9c226_dirty::{op_name}"

build/torch28-cxx11-cu126-x86_64-linux/layer_norm/layers.py ADDED Viewed

	@@ -0,0 +1,49 @@

+import torch
+import torch.nn as nn
+from ._ops import ops
+class LayerNorm(nn.Module):
+    """Layer whose ``forward`` delegates to ``ops.dropout_add_ln_fwd`` with ``is_rms_norm=False``.
+
+    This class defines no ``__init__``; ``weight`` and ``variance_epsilon`` are
+    only declared below, so they have to be set on the instance by other code.
+    """
+    weight: torch.Tensor
+    variance_epsilon: float
+    def forward(self, hidden_states: torch.Tensor) -> torch.Tensor:
+        """Call the op on ``hidden_states`` with ``self.weight`` as gamma and ``self.variance_epsilon`` as epsilon.
+
+        ``beta``, ``rowscale``, ``colscale``, ``x0_subset``, ``z_subset`` and
+        ``gen`` are passed as ``None`` and ``dropout_p`` as 0.
+        """
+        # NOTE(review): the return annotation says Tensor, but the value is
+        # whatever the op returns -- confirm against the op's schema.
+        # NOTE(review): z_numrows is hidden_states.shape[1]; confirm that is the
+        # intended dimension for the expected input layout.
+        return ops.dropout_add_ln_fwd(
+            hidden_states,
+            gamma = self.weight,
+            beta = None,
+            rowscale = None,
+            colscale = None,
+            x0_subset = None,
+            z_subset = None,
+            dropout_p = 0,
+            epsilon = self.variance_epsilon,
+            rowscale_const = 1.0,
+            z_numrows = hidden_states.shape[1],
+            gen = None,
+            residual_in_fp32 = False,
+            is_rms_norm = False,
+        )
+class LlamaRMSNorm(nn.Module):
+    """Layer whose ``forward`` delegates to ``ops.dropout_add_ln_fwd`` with ``is_rms_norm=True``.
+
+    This class defines no ``__init__``; ``weight`` and ``variance_epsilon`` are
+    only declared below, so they have to be set on the instance by other code.
+    """
+    weight: torch.Tensor
+    variance_epsilon: float
+    def forward(self, hidden_states: torch.Tensor) -> torch.Tensor:
+        """Call the op on ``hidden_states`` with ``self.weight`` as gamma and ``self.variance_epsilon`` as epsilon.
+
+        ``beta``, ``rowscale``, ``colscale``, ``x0_subset``, ``z_subset`` and
+        ``gen`` are passed as ``None`` and ``dropout_p`` as 0.
+        """
+        # NOTE(review): the return annotation says Tensor, but the value is
+        # whatever the op returns -- confirm against the op's schema.
+        # NOTE(review): z_numrows is hidden_states.shape[1]; confirm that is the
+        # intended dimension for the expected input layout.
+        return ops.dropout_add_ln_fwd(
+            hidden_states,
+            gamma = self.weight,
+            beta = None,
+            rowscale = None,
+            colscale = None,
+            x0_subset = None,
+            z_subset = None,
+            dropout_p = 0,
+            epsilon = self.variance_epsilon,
+            rowscale_const = 1.0,
+            z_numrows = hidden_states.shape[1],
+            gen = None,
+            residual_in_fp32 = False,
+            is_rms_norm = True,
+        )

build/torch28-cxx11-cu128-x86_64-linux/layer_norm/__init__.py ADDED Viewed

	@@ -0,0 +1,26 @@

+import torch
+import torch.nn as nn
+from ._ops import ops
+from . import layers
+def dropout_add_ln_fwd(input, gamma, beta, rowscale, colscale, x0_subset, z_subset, dropout_p, epsilon, rowscale_const, z_numrows, gen, residual_in_fp32, is_rms_norm):
+    """Pass every argument, unchanged and in the same order, to ``ops.dropout_add_ln_fwd`` and return its result."""
+    return ops.dropout_add_ln_fwd(input, gamma, beta, rowscale, colscale, x0_subset, z_subset, dropout_p, epsilon, rowscale_const, z_numrows, gen, residual_in_fp32, is_rms_norm)
+def dropout_add_ln_bwd(dz, dx, x, mu, rsigma, gamma, rowscale, colscale, x0_subset, z_subset, dropout_p, rowscale_const, x0_numrows, has_residual, is_rms_norm):
+    """Pass every argument, unchanged and in the same order, to ``ops.dropout_add_ln_bwd`` and return its result."""
+    return ops.dropout_add_ln_bwd(dz, dx, x, mu, rsigma, gamma, rowscale, colscale, x0_subset, z_subset, dropout_p, rowscale_const, x0_numrows, has_residual, is_rms_norm)
+def dropout_add_ln_parallel_residual_fwd(input, gamma0, beta0, gamma1, beta1, dropout_p, epsilon, gen, residual_in_fp32, is_rms_norm):
+    """Pass every argument, unchanged and in the same order, to ``ops.dropout_add_ln_parallel_residual_fwd`` and return its result."""
+    return ops.dropout_add_ln_parallel_residual_fwd(input, gamma0, beta0, gamma1, beta1, dropout_p, epsilon, gen, residual_in_fp32, is_rms_norm)
+def dropout_add_ln_parallel_residual_bwd(dz0, dz1, dx, x, mu, rsigma, gamma0, gamma1, dropout_p, has_x1, has_residual, is_rms_norm):
+    """Pass every argument, unchanged and in the same order, to ``ops.dropout_add_ln_parallel_residual_bwd`` and return its result."""
+    return ops.dropout_add_ln_parallel_residual_bwd(dz0, dz1, dx, x, mu, rsigma, gamma0, gamma1, dropout_p, has_x1, has_residual, is_rms_norm)
+__all__ = [
+    "layers",
+    "dropout_add_ln_fwd",
+    "dropout_add_ln_bwd",
+    "dropout_add_ln_parallel_residual_fwd",
+    "dropout_add_ln_parallel_residual_bwd",
+]

build/torch28-cxx11-cu128-x86_64-linux/layer_norm/_layer_norm_4e9c226_dirty.abi3.so ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:69c897ea7e96a6988909ac3878f74baa2b598b0301a2ee3227f9f1c9804fb64d
+size 246756512

build/torch28-cxx11-cu128-x86_64-linux/layer_norm/_ops.py ADDED Viewed

	@@ -0,0 +1,9 @@

+import torch
+from . import _layer_norm_4e9c226_dirty
+ops = torch.ops._layer_norm_4e9c226_dirty
+def add_op_namespace_prefix(op_name: str) -> str:
+    """
+    Prefix op by namespace.
+
+    Returns ``op_name`` qualified as ``_layer_norm_4e9c226_dirty::<op_name>``.
+    """
+    return f"_layer_norm_4e9c226_dirty::{op_name}"

build/torch28-cxx11-cu128-x86_64-linux/layer_norm/layers.py ADDED Viewed

	@@ -0,0 +1,49 @@

+import torch
+import torch.nn as nn
+from ._ops import ops
+class LayerNorm(nn.Module):
+    """Layer whose ``forward`` delegates to ``ops.dropout_add_ln_fwd`` with ``is_rms_norm=False``.
+
+    This class defines no ``__init__``; ``weight`` and ``variance_epsilon`` are
+    only declared below, so they have to be set on the instance by other code.
+    """
+    weight: torch.Tensor
+    variance_epsilon: float
+    def forward(self, hidden_states: torch.Tensor) -> torch.Tensor:
+        """Call the op on ``hidden_states`` with ``self.weight`` as gamma and ``self.variance_epsilon`` as epsilon.
+
+        ``beta``, ``rowscale``, ``colscale``, ``x0_subset``, ``z_subset`` and
+        ``gen`` are passed as ``None`` and ``dropout_p`` as 0.
+        """
+        # NOTE(review): the return annotation says Tensor, but the value is
+        # whatever the op returns -- confirm against the op's schema.
+        # NOTE(review): z_numrows is hidden_states.shape[1]; confirm that is the
+        # intended dimension for the expected input layout.
+        return ops.dropout_add_ln_fwd(
+            hidden_states,
+            gamma = self.weight,
+            beta = None,
+            rowscale = None,
+            colscale = None,
+            x0_subset = None,
+            z_subset = None,
+            dropout_p = 0,
+            epsilon = self.variance_epsilon,
+            rowscale_const = 1.0,
+            z_numrows = hidden_states.shape[1],
+            gen = None,
+            residual_in_fp32 = False,
+            is_rms_norm = False,
+        )
+class LlamaRMSNorm(nn.Module):
+    """Layer whose ``forward`` delegates to ``ops.dropout_add_ln_fwd`` with ``is_rms_norm=True``.
+
+    This class defines no ``__init__``; ``weight`` and ``variance_epsilon`` are
+    only declared below, so they have to be set on the instance by other code.
+    """
+    weight: torch.Tensor
+    variance_epsilon: float
+    def forward(self, hidden_states: torch.Tensor) -> torch.Tensor:
+        """Call the op on ``hidden_states`` with ``self.weight`` as gamma and ``self.variance_epsilon`` as epsilon.
+
+        ``beta``, ``rowscale``, ``colscale``, ``x0_subset``, ``z_subset`` and
+        ``gen`` are passed as ``None`` and ``dropout_p`` as 0.
+        """
+        # NOTE(review): the return annotation says Tensor, but the value is
+        # whatever the op returns -- confirm against the op's schema.
+        # NOTE(review): z_numrows is hidden_states.shape[1]; confirm that is the
+        # intended dimension for the expected input layout.
+        return ops.dropout_add_ln_fwd(
+            hidden_states,
+            gamma = self.weight,
+            beta = None,
+            rowscale = None,
+            colscale = None,
+            x0_subset = None,
+            z_subset = None,
+            dropout_p = 0,
+            epsilon = self.variance_epsilon,
+            rowscale_const = 1.0,
+            z_numrows = hidden_states.shape[1],
+            gen = None,
+            residual_in_fp32 = False,
+            is_rms_norm = True,
+        )

build/torch28-cxx11-cu129-x86_64-linux/layer_norm/__init__.py ADDED Viewed

	@@ -0,0 +1,26 @@

+import torch
+import torch.nn as nn
+from ._ops import ops
+from . import layers
+def dropout_add_ln_fwd(input, gamma, beta, rowscale, colscale, x0_subset, z_subset, dropout_p, epsilon, rowscale_const, z_numrows, gen, residual_in_fp32, is_rms_norm):
+    """Pass every argument, unchanged and in the same order, to ``ops.dropout_add_ln_fwd`` and return its result."""
+    return ops.dropout_add_ln_fwd(input, gamma, beta, rowscale, colscale, x0_subset, z_subset, dropout_p, epsilon, rowscale_const, z_numrows, gen, residual_in_fp32, is_rms_norm)
+def dropout_add_ln_bwd(dz, dx, x, mu, rsigma, gamma, rowscale, colscale, x0_subset, z_subset, dropout_p, rowscale_const, x0_numrows, has_residual, is_rms_norm):
+    """Pass every argument, unchanged and in the same order, to ``ops.dropout_add_ln_bwd`` and return its result."""
+    return ops.dropout_add_ln_bwd(dz, dx, x, mu, rsigma, gamma, rowscale, colscale, x0_subset, z_subset, dropout_p, rowscale_const, x0_numrows, has_residual, is_rms_norm)
+def dropout_add_ln_parallel_residual_fwd(input, gamma0, beta0, gamma1, beta1, dropout_p, epsilon, gen, residual_in_fp32, is_rms_norm):
+    """Pass every argument, unchanged and in the same order, to ``ops.dropout_add_ln_parallel_residual_fwd`` and return its result."""
+    return ops.dropout_add_ln_parallel_residual_fwd(input, gamma0, beta0, gamma1, beta1, dropout_p, epsilon, gen, residual_in_fp32, is_rms_norm)
+def dropout_add_ln_parallel_residual_bwd(dz0, dz1, dx, x, mu, rsigma, gamma0, gamma1, dropout_p, has_x1, has_residual, is_rms_norm):
+    """Pass every argument, unchanged and in the same order, to ``ops.dropout_add_ln_parallel_residual_bwd`` and return its result."""
+    return ops.dropout_add_ln_parallel_residual_bwd(dz0, dz1, dx, x, mu, rsigma, gamma0, gamma1, dropout_p, has_x1, has_residual, is_rms_norm)
+__all__ = [
+    "layers",
+    "dropout_add_ln_fwd",
+    "dropout_add_ln_bwd",
+    "dropout_add_ln_parallel_residual_fwd",
+    "dropout_add_ln_parallel_residual_bwd",
+]

build/torch28-cxx11-cu129-x86_64-linux/layer_norm/_layer_norm_4e9c226_dirty.abi3.so ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:594fd2ab65b273a4fee370bab7e03cb79cbc9c320eb37364466940a60ef154fa
+size 248443760

build/torch28-cxx11-cu129-x86_64-linux/layer_norm/_ops.py ADDED Viewed

	@@ -0,0 +1,9 @@

+import torch
+from . import _layer_norm_4e9c226_dirty
+ops = torch.ops._layer_norm_4e9c226_dirty
+def add_op_namespace_prefix(op_name: str) -> str:
+    """
+    Prefix op by namespace.
+
+    Returns ``op_name`` qualified as ``_layer_norm_4e9c226_dirty::<op_name>``.
+    """
+    return f"_layer_norm_4e9c226_dirty::{op_name}"

build/torch28-cxx11-cu129-x86_64-linux/layer_norm/layers.py ADDED Viewed

	@@ -0,0 +1,49 @@

+import torch
+import torch.nn as nn
+from ._ops import ops
+class LayerNorm(nn.Module):
+    """Layer whose ``forward`` delegates to ``ops.dropout_add_ln_fwd`` with ``is_rms_norm=False``.
+
+    This class defines no ``__init__``; ``weight`` and ``variance_epsilon`` are
+    only declared below, so they have to be set on the instance by other code.
+    """
+    weight: torch.Tensor
+    variance_epsilon: float
+    def forward(self, hidden_states: torch.Tensor) -> torch.Tensor:
+        """Call the op on ``hidden_states`` with ``self.weight`` as gamma and ``self.variance_epsilon`` as epsilon.
+
+        ``beta``, ``rowscale``, ``colscale``, ``x0_subset``, ``z_subset`` and
+        ``gen`` are passed as ``None`` and ``dropout_p`` as 0.
+        """
+        # NOTE(review): the return annotation says Tensor, but the value is
+        # whatever the op returns -- confirm against the op's schema.
+        # NOTE(review): z_numrows is hidden_states.shape[1]; confirm that is the
+        # intended dimension for the expected input layout.
+        return ops.dropout_add_ln_fwd(
+            hidden_states,
+            gamma = self.weight,
+            beta = None,
+            rowscale = None,
+            colscale = None,
+            x0_subset = None,
+            z_subset = None,
+            dropout_p = 0,
+            epsilon = self.variance_epsilon,
+            rowscale_const = 1.0,
+            z_numrows = hidden_states.shape[1],
+            gen = None,
+            residual_in_fp32 = False,
+            is_rms_norm = False,
+        )
+class LlamaRMSNorm(nn.Module):
+    """Layer whose ``forward`` delegates to ``ops.dropout_add_ln_fwd`` with ``is_rms_norm=True``.
+
+    This class defines no ``__init__``; ``weight`` and ``variance_epsilon`` are
+    only declared below, so they have to be set on the instance by other code.
+    """
+    weight: torch.Tensor
+    variance_epsilon: float
+    def forward(self, hidden_states: torch.Tensor) -> torch.Tensor:
+        """Call the op on ``hidden_states`` with ``self.weight`` as gamma and ``self.variance_epsilon`` as epsilon.
+
+        ``beta``, ``rowscale``, ``colscale``, ``x0_subset``, ``z_subset`` and
+        ``gen`` are passed as ``None`` and ``dropout_p`` as 0.
+        """
+        # NOTE(review): the return annotation says Tensor, but the value is
+        # whatever the op returns -- confirm against the op's schema.
+        # NOTE(review): z_numrows is hidden_states.shape[1]; confirm that is the
+        # intended dimension for the expected input layout.
+        return ops.dropout_add_ln_fwd(
+            hidden_states,
+            gamma = self.weight,
+            beta = None,
+            rowscale = None,
+            colscale = None,
+            x0_subset = None,
+            z_subset = None,
+            dropout_p = 0,
+            epsilon = self.variance_epsilon,
+            rowscale_const = 1.0,
+            z_numrows = hidden_states.shape[1],
+            gen = None,
+            residual_in_fp32 = False,
+            is_rms_norm = True,
+        )

cmake/hipify.py ADDED Viewed

	@@ -0,0 +1,76 @@

+#!/usr/bin/env python3
+# SPDX-License-Identifier: Apache-2.0
+# From vLLM: https://github.com/vllm-project/vllm/blob/main/cmake/hipify.py
+#
+# A command line tool for running pytorch's hipify preprocessor on CUDA
+# source files.
+#
+# See https://github.com/ROCm/hipify_torch
+# and <torch install dir>/utils/hipify/hipify_python.py
+#
+import argparse
+import os
+import shutil
+from torch.utils.hipify.hipify_python import hipify
+if __name__ == '__main__':
+    parser = argparse.ArgumentParser()
+    # Project directory where all the source + include files live.
+    parser.add_argument(
+        "-p",
+        "--project_dir",
+        help="The project directory.",
+    )
+    # Directory where hipified files are written.
+    parser.add_argument(
+        "-o",
+        "--output_dir",
+        help="The output directory.",
+    )
+    # Source files to convert.
+    parser.add_argument("sources",
+                        help="Source files to hipify.",
+                        nargs="*",
+                        default=[])
+    args = parser.parse_args()
+    # Limit include scope to project_dir only
+    includes = [os.path.join(args.project_dir, '*')]
+    # Get absolute path for all source files.
+    extra_files = [os.path.abspath(s) for s in args.sources]
+    # Copy sources from project directory to output directory.
+    # The directory might already exist to hold object files so we ignore that.
+    shutil.copytree(args.project_dir, args.output_dir, dirs_exist_ok=True)
+    hipify_result = hipify(project_directory=args.project_dir,
+                           output_directory=args.output_dir,
+                           header_include_dirs=[],
+                           includes=includes,
+                           extra_files=extra_files,
+                           show_detailed=True,
+                           is_pytorch_extension=True,
+                           hipify_extra_files_only=True)
+    hipified_sources = []
+    for source in args.sources:
+        s_abs = os.path.abspath(source)
+        hipified_s_abs = (hipify_result[s_abs].hipified_path if
+                          (s_abs in hipify_result
+                           and hipify_result[s_abs].hipified_path is not None)
+                          else s_abs)
+        hipified_sources.append(hipified_s_abs)
+    assert (len(hipified_sources) == len(args.sources))
+    # Print hipified source files.
+    print("\n".join(hipified_sources))

cmake/utils.cmake ADDED Viewed

	@@ -0,0 +1,545 @@

+# Vendored from vLLM:
+#
+# https://github.com/vllm-project/vllm/blob/main/cmake/utils.cmake
+#
+# Attempt to find the python package that uses the same python executable as
+# `EXECUTABLE` and is one of the `SUPPORTED_VERSIONS`.
+#
+macro (find_python_from_executable EXECUTABLE SUPPORTED_VERSIONS)
+  # Arguments:
+  #   EXECUTABLE         - path of the python interpreter to match.
+  #   SUPPORTED_VERSIONS - accepted `<major>.<minor>` versions; any extra
+  #                        arguments (ARGN) are treated as further versions.
+  # This is a macro, so `Python_EXECUTABLE`, `_VER`, `_SUPPORTED_VERSIONS_LIST`
+  # and the `find_package(Python)` results are set in the caller's scope.
+  # NOTE(review): macro arguments are substituted textually, so the later
+  # `${EXECUTABLE}` references presumably still expand to the path as passed
+  # by the caller rather than to the REAL_PATH result -- confirm this is
+  # intended.
+  file(REAL_PATH ${EXECUTABLE} EXECUTABLE)
+  set(Python_EXECUTABLE ${EXECUTABLE})
+  find_package(Python COMPONENTS Interpreter Development.Module Development.SABIModule)
+  if (NOT Python_FOUND)
+    message(FATAL_ERROR "Unable to find python matching: ${EXECUTABLE}.")
+  endif()
+  # Only the major.minor pair is compared against the supported versions.
+  set(_VER "${Python_VERSION_MAJOR}.${Python_VERSION_MINOR}")
+  set(_SUPPORTED_VERSIONS_LIST ${SUPPORTED_VERSIONS} ${ARGN})
+  if (NOT _VER IN_LIST _SUPPORTED_VERSIONS_LIST)
+    message(FATAL_ERROR
+      "Python version (${_VER}) is not one of the supported versions: "
+      "${_SUPPORTED_VERSIONS_LIST}.")
+  endif()
+  message(STATUS "Found python matching: ${EXECUTABLE}.")
+endmacro()
+#
+# Run `EXPR` in python.  The standard output of python is stored in `OUT` and
+# has trailing whitespace stripped.  If an error is encountered when running
+# python, a fatal message `ERR_MSG` is issued.
+#
+function (run_python OUT EXPR ERR_MSG)
+  # Invoke the interpreter selected by `Python_EXECUTABLE` on `EXPR`, capturing
+  # its exit status and both output streams.
+  execute_process(
+    COMMAND "${Python_EXECUTABLE}" "-c" "${EXPR}"
+    RESULT_VARIABLE _py_status
+    OUTPUT_VARIABLE _py_stdout
+    ERROR_VARIABLE _py_stderr
+    OUTPUT_STRIP_TRAILING_WHITESPACE)
+  if(_py_status EQUAL 0)
+    # Success: hand the stripped stdout back to the caller.
+    set(${OUT} ${_py_stdout} PARENT_SCOPE)
+  else()
+    # Any other status is fatal; include python's stderr in the report.
+    message(FATAL_ERROR "${ERR_MSG}: ${_py_stderr}")
+  endif()
+endfunction()
+# Run `EXPR` in python after importing `PKG`. Use the result of this to extend
+# `CMAKE_PREFIX_PATH` so the torch cmake configuration can be imported.
+macro (append_cmake_prefix_path PKG EXPR)
+  # PKG  - python package to import before evaluating EXPR.
+  # EXPR - python expression whose printed value is appended to
+  #        `CMAKE_PREFIX_PATH`.
+  # Being a macro, both `CMAKE_PREFIX_PATH` and the helper variable
+  # `_PREFIX_PATH` are modified in the caller's scope. A python failure is
+  # fatal (see `run_python`).
+  run_python(_PREFIX_PATH
+    "import ${PKG}; print(${EXPR})" "Failed to locate ${PKG} path")
+  list(APPEND CMAKE_PREFIX_PATH ${_PREFIX_PATH})
+endmacro()
+#
+# Add a target named `hipify${NAME}` that runs the hipify preprocessor on a set
+# of CUDA source files. The names of the corresponding "hipified" sources are
+# stored in `OUT_SRCS`.
+#
+function (hipify_sources_target OUT_SRCS NAME ORIG_SRCS)
+  # OUT_SRCS  - name of the variable that receives the hipified source list.
+  # NAME      - suffix of the generated `hipify${NAME}` custom target.
+  # ORIG_SRCS - the source list, passed as a single (quoted) list argument.
+  #
+  # Split into C++ and non-C++ (i.e. CUDA) sources.
+  #
+  set(NODUP_SRCS ${ORIG_SRCS})
+  list(REMOVE_DUPLICATES NODUP_SRCS)
+  set(SRCS ${NODUP_SRCS})
+  set(CXX_SRCS ${NODUP_SRCS})
+  list(FILTER SRCS INCLUDE REGEX "\.cu$")
+  list(FILTER CXX_SRCS EXCLUDE REGEX "\.cu$")
+  #
+  # Generate ROCm/HIP source file names from CUDA file names.
+  # Since HIP files are generated code, they will appear in the build area
+  # `CMAKE_CURRENT_BINARY_DIR` directory rather than the original csrc dir.
+  #
+  set(HIP_SRCS)
+  foreach (SRC ${SRCS})
+    # Read the property from the CUDA file before SRC is rewritten below.
+    get_source_file_property(include_dirs "${SRC}" INCLUDE_DIRECTORIES)
+    string(REGEX REPLACE "\.cu$" "\.hip" SRC ${SRC})
+    # Every occurrence of "cuda" in the path is renamed, not just the suffix.
+    string(REGEX REPLACE "cuda" "hip" SRC ${SRC})
+    if(include_dirs)
+      # Copy over include directories from the original CUDA file.
+      # NOTE(review): the property is set on `${SRC}` while the path added to
+      # HIP_SRCS is prefixed with CMAKE_CURRENT_BINARY_DIR -- confirm both
+      # resolve to the same source file.
+      set_source_files_properties(
+        ${SRC}
+        PROPERTIES INCLUDE_DIRECTORIES "${include_dirs}")
+    endif()
+    list(APPEND HIP_SRCS "${CMAKE_CURRENT_BINARY_DIR}/${SRC}")
+  endforeach()
+  # The hipified files are declared as BYPRODUCTS of the custom target.
+  add_custom_target(
+    hipify${NAME}
+    COMMAND "${Python_EXECUTABLE}" ${CMAKE_SOURCE_DIR}/cmake/hipify.py -p ${CMAKE_SOURCE_DIR} -o ${CMAKE_CURRENT_BINARY_DIR} ${SRCS}
+    DEPENDS ${CMAKE_SOURCE_DIR}/cmake/hipify.py ${SRCS}
+    BYPRODUCTS ${HIP_SRCS}
+    COMMENT "Running hipify on ${NAME} extension source files.")
+  # Swap out original extension sources with hipified sources.
+  list(APPEND HIP_SRCS ${CXX_SRCS})
+  set(${OUT_SRCS} ${HIP_SRCS} PARENT_SCOPE)
+endfunction()
+#
+# Get additional GPU compiler flags from torch.
+#
+function (get_torch_gpu_compiler_flags OUT_GPU_FLAGS GPU_LANG)
+  # OUT_GPU_FLAGS - name of the variable that receives the flag list.
+  # GPU_LANG      - "CUDA" or "HIP"; any other value yields an empty result.
+  #
+  # Start from a clean local list so a `GPU_FLAGS` variable inherited from the
+  # caller's scope can never leak into the result.
+  set(GPU_FLAGS)
+  # The language is quoted so an empty or variable-like value is compared as a
+  # plain string.
+  if ("${GPU_LANG}" STREQUAL "CUDA")
+    #
+    # Get common NVCC flags from torch.
+    #
+    run_python(GPU_FLAGS
+      "from torch.utils.cpp_extension import COMMON_NVCC_FLAGS; print(';'.join(COMMON_NVCC_FLAGS))"
+      "Failed to determine torch nvcc compiler flags")
+    if (CUDA_VERSION VERSION_GREATER_EQUAL 11.8)
+      # Enable FP8 and drop torch's defines that disable the half/bfloat16
+      # operators and conversions.
+      list(APPEND GPU_FLAGS "-DENABLE_FP8")
+      list(REMOVE_ITEM GPU_FLAGS
+        "-D__CUDA_NO_HALF_OPERATORS__"
+        "-D__CUDA_NO_HALF_CONVERSIONS__"
+        "-D__CUDA_NO_BFLOAT16_CONVERSIONS__"
+        "-D__CUDA_NO_HALF2_OPERATORS__")
+    endif()
+  elseif("${GPU_LANG}" STREQUAL "HIP")
+    #
+    # Get common HIP/HIPCC flags from torch.
+    #
+    run_python(GPU_FLAGS
+      "import torch.utils.cpp_extension as t; print(';'.join(t.COMMON_HIP_FLAGS + t.COMMON_HIPCC_FLAGS))"
+      "Failed to determine torch hipcc compiler flags")
+    list(APPEND GPU_FLAGS
+      "-DUSE_ROCM"
+      "-DENABLE_FP8"
+      "-U__HIP_NO_HALF_CONVERSIONS__"
+      "-U__HIP_NO_HALF_OPERATORS__"
+      "-fno-gpu-rdc")
+  endif()
+  set(${OUT_GPU_FLAGS} ${GPU_FLAGS} PARENT_SCOPE)
+endfunction()
+# Macro for converting a `gencode` version number to a cmake version number.
+macro(string_to_ver OUT_VER IN_STR)
+  # Inserts a `.` before the last digit of each run of two or more digits in
+  # IN_STR and stores the result in the variable named by OUT_VER
+  # (e.g. "75" -> "7.5", "90a" -> "9.0a").
+  string(REGEX REPLACE "\([0-9]+\)\([0-9]\)" "\\1.\\2" ${OUT_VER} ${IN_STR})
+endmacro()
+#
+# Clear all `-gencode` flags from `CMAKE_CUDA_FLAGS` and store them in
+# `CUDA_ARCH_FLAGS`.
+#
+# Example:
+#   CMAKE_CUDA_FLAGS="-Wall -gencode arch=compute_70,code=sm_70 -gencode arch=compute_75,code=sm_75"
+#   clear_cuda_arches(CUDA_ARCH_FLAGS)
+#   CUDA_ARCH_FLAGS="-gencode arch=compute_70,code=sm_70;-gencode arch=compute_75,code=sm_75"
+#   CMAKE_CUDA_FLAGS="-Wall"
+#
+macro(clear_cuda_arches CUDA_ARCH_FLAGS)
+    # The extracted flags are written to the variable literally named
+    # `CUDA_ARCH_FLAGS` in the caller's scope; this is a macro, and the name is
+    # not written as a `${...}` reference below.
+    #
+    # `CMAKE_CUDA_FLAGS` is quoted in both commands so that an empty value is
+    # still passed as an (empty) input string instead of leaving the command
+    # without an input argument.
+    # Extract all `-gencode` flags from `CMAKE_CUDA_FLAGS`
+    string(REGEX MATCHALL "-gencode arch=[^ ]+" CUDA_ARCH_FLAGS
+      "${CMAKE_CUDA_FLAGS}")
+    # Remove all `-gencode` flags from `CMAKE_CUDA_FLAGS` since they will be modified
+    # and passed back via the `CUDA_ARCHITECTURES` property.
+    string(REGEX REPLACE "-gencode arch=[^ ]+ *" "" CMAKE_CUDA_FLAGS
+      "${CMAKE_CUDA_FLAGS}")
+endmacro()
+#
+# Extract unique CUDA architectures from a list of `-gencode` flags whose
+# compute capability codes have the form `<major><minor>[a]`, convert them to
+# the form `<major>.<minor>[a]`, dedupe them, sort them in ascending order and
+# store them in `OUT_ARCHES`. Entries that contain no `arch=compute_<code>`
+# are ignored.
+#
+# Example:
+#   CUDA_ARCH_FLAGS="-gencode arch=compute_75,code=sm_75;...;-gencode arch=compute_90a,code=sm_90a"
+#   extract_unique_cuda_archs_ascending(OUT_ARCHES "${CUDA_ARCH_FLAGS}")
+#   OUT_ARCHES="7.5;...;9.0a"
+function(extract_unique_cuda_archs_ascending OUT_ARCHES CUDA_ARCH_FLAGS)
+  set(_CUDA_ARCHES)
+  foreach(_ARCH ${CUDA_ARCH_FLAGS})
+    string(REGEX MATCH "arch=compute_\([0-9]+a?\)" _COMPUTE ${_ARCH})
+    if (NOT _COMPUTE)
+      # Not a gencode arch entry: skip it rather than invoking `string_to_ver`
+      # with a missing argument.
+      continue()
+    endif()
+    # Keep only the captured code, e.g. `75` or `90a`.
+    set(_COMPUTE ${CMAKE_MATCH_1})
+    string_to_ver(_COMPUTE_VER ${_COMPUTE})
+    list(APPEND _CUDA_ARCHES ${_COMPUTE_VER})
+  endforeach()
+  list(REMOVE_DUPLICATES _CUDA_ARCHES)
+  list(SORT _CUDA_ARCHES COMPARE NATURAL ORDER ASCENDING)
+  set(${OUT_ARCHES} ${_CUDA_ARCHES} PARENT_SCOPE)
+endfunction()
+#
+# For a specific file set the `-gencode` flag in compile options conditionally
+# for the CUDA language.
+#
+# Example:
+#   set_gencode_flag_for_srcs(
+#     SRCS "foo.cu"
+#     ARCH "compute_75"
+#     CODE "sm_75")
+#   adds: "-gencode arch=compute_75,code=sm_75" to the compile options for
+#    `foo.cu` (only for the CUDA language).
+#
+macro(set_gencode_flag_for_srcs)
+  # Keyword arguments: SRCS <files...>, ARCH <arch>, CODE <code>.
+  # Being a macro, `options`, `oneValueArgs`, `multiValueArgs`, `_FLAG` and the
+  # parsed `arg_*` variables are all set in the caller's scope.
+  set(options)
+  set(oneValueArgs ARCH CODE)
+  set(multiValueArgs SRCS)
+  cmake_parse_arguments(arg "${options}" "${oneValueArgs}"
+                        "${multiValueArgs}" ${ARGN} )
+  # Unquoted, so `_FLAG` is the two-element list `-gencode;arch=...,code=...`.
+  set(_FLAG -gencode arch=${arg_ARCH},code=${arg_CODE})
+  # Append (not replace) so repeated calls accumulate one flag pair per arch.
+  set_property(
+    SOURCE ${arg_SRCS}
+    APPEND PROPERTY
+    COMPILE_OPTIONS "$<$<COMPILE_LANGUAGE:CUDA>:${_FLAG}>"
+  )
+  message(DEBUG "Setting gencode flag for ${arg_SRCS}: ${_FLAG}")
+endmacro(set_gencode_flag_for_srcs)
+#
+# For a list of source files set the `-gencode` flags in the files specific
+#  compile options (specifically for the CUDA language).
+#
+# arguments are:
+#  SRCS: list of source files
+#  CUDA_ARCHS: list of CUDA architectures in the form `<major>.<minor>[letter]`
+#  BUILD_PTX_FOR_ARCH: if set to an architecture version, then PTX code will
+#    also be built for architecture `BUILD_PTX_FOR_ARCH` when the highest
+#    CUDA_ARCH in CUDA_ARCHS is greater than or equal to BUILD_PTX_FOR_ARCH.
+#
+macro(set_gencode_flags_for_srcs)
+  # Keyword arguments: SRCS <files...>, CUDA_ARCHS <archs...> and the optional
+  # BUILD_PTX_FOR_ARCH <arch>.
+  # NOTE(review): this macro and the nested `set_gencode_flag_for_srcs` macro
+  # both parse into the `arg_` prefix in the same scope; the nested calls
+  # appear to re-set only arg_SRCS/arg_ARCH/arg_CODE -- confirm that
+  # arg_CUDA_ARCHS and arg_BUILD_PTX_FOR_ARCH are never clobbered.
+  set(options)
+  set(oneValueArgs BUILD_PTX_FOR_ARCH)
+  set(multiValueArgs SRCS CUDA_ARCHS)
+  cmake_parse_arguments(arg "${options}" "${oneValueArgs}"
+                        "${multiValueArgs}" ${ARGN} )
+  foreach(_ARCH ${arg_CUDA_ARCHS})
+    # handle +PTX suffix: generate both sm and ptx codes if requested
+    string(FIND "${_ARCH}" "+PTX" _HAS_PTX)
+    if(NOT _HAS_PTX EQUAL -1)
+      string(REPLACE "+PTX" "" _BASE_ARCH "${_ARCH}")
+      # `8.0` -> `80`, the spelling used in `compute_80` / `sm_80`.
+      string(REPLACE "." "" _STRIPPED_ARCH "${_BASE_ARCH}")
+      # SASS for the architecture itself ...
+      set_gencode_flag_for_srcs(
+        SRCS ${arg_SRCS}
+        ARCH "compute_${_STRIPPED_ARCH}"
+        CODE "sm_${_STRIPPED_ARCH}")
+      # ... plus PTX (code=compute_XX).
+      set_gencode_flag_for_srcs(
+        SRCS ${arg_SRCS}
+        ARCH "compute_${_STRIPPED_ARCH}"
+        CODE "compute_${_STRIPPED_ARCH}")
+    else()
+      string(REPLACE "." "" _STRIPPED_ARCH "${_ARCH}")
+      set_gencode_flag_for_srcs(
+        SRCS ${arg_SRCS}
+        ARCH "compute_${_STRIPPED_ARCH}"
+        CODE "sm_${_STRIPPED_ARCH}")
+    endif()
+  endforeach()
+  # Optional extra PTX target, added only when the highest requested
+  # architecture is at least BUILD_PTX_FOR_ARCH.
+  if (${arg_BUILD_PTX_FOR_ARCH})
+    list(SORT arg_CUDA_ARCHS COMPARE NATURAL ORDER ASCENDING)
+    # After the ascending sort the last element is the highest architecture.
+    list(GET arg_CUDA_ARCHS -1 _HIGHEST_ARCH)
+    if (_HIGHEST_ARCH VERSION_GREATER_EQUAL ${arg_BUILD_PTX_FOR_ARCH})
+      string(REPLACE "." "" _PTX_ARCH "${arg_BUILD_PTX_FOR_ARCH}")
+      set_gencode_flag_for_srcs(
+        SRCS ${arg_SRCS}
+        ARCH "compute_${_PTX_ARCH}"
+        CODE "compute_${_PTX_ARCH}")
+    endif()
+  endif()
+endmacro()
+#
+# For the given `SRC_CUDA_ARCHS` list of gencode versions in the form
+#  `<major>.<minor>[letter]` compute the "loose intersection" with the
+#  `TGT_CUDA_ARCHS` list of gencodes. We also support the `+PTX` suffix in
+#  `SRC_CUDA_ARCHS` which indicates that the PTX code should be built when there
+#  is a CUDA_ARCH in `TGT_CUDA_ARCHS` that is equal to or larger than the
+#  architecture in `SRC_CUDA_ARCHS`.
+# The loose intersection is defined as:
+#   { max{ x \in src | x <= y } | y \in tgt, { x \in src | x <= y } != {} }
+#  where `<=` is the version comparison operator.
+# In other words, for each version in `TGT_CUDA_ARCHS` find the highest version
+#  in `SRC_CUDA_ARCHS` that is less or equal to the version in `TGT_CUDA_ARCHS`
+#  (and has the same major version, unless it was requested with `+PTX`).
+# We have special handling for x.0a, if x.0a is in `SRC_CUDA_ARCHS` and x.0 is
+#  in `TGT_CUDA_ARCHS` then we should remove x.0a from `SRC_CUDA_ARCHS` and add
+#  x.0a to the result (and remove x.0 from TGT_CUDA_ARCHS).
+# The result is stored in `OUT_CUDA_ARCHS`.
+#
+# Example:
+#   SRC_CUDA_ARCHS="7.5;8.0;8.6;9.0;9.0a"
+#   TGT_CUDA_ARCHS="8.0;8.9;9.0"
+#   cuda_archs_loose_intersection(OUT_CUDA_ARCHS SRC_CUDA_ARCHS TGT_CUDA_ARCHS)
+#   OUT_CUDA_ARCHS="9.0a;8.0;8.6"
+#
+# Example With PTX:
+#   SRC_CUDA_ARCHS="8.0+PTX"
+#   TGT_CUDA_ARCHS="9.0"
+#   cuda_archs_loose_intersection(OUT_CUDA_ARCHS SRC_CUDA_ARCHS TGT_CUDA_ARCHS)
+#   OUT_CUDA_ARCHS="8.0+PTX"
+#
+function(cuda_archs_loose_intersection OUT_CUDA_ARCHS SRC_CUDA_ARCHS TGT_CUDA_ARCHS)
+  # Both lists are received as single (quoted) list arguments; work on local
+  # copies so entries can be removed while matching.
+  set(_SRC_CUDA_ARCHS "${SRC_CUDA_ARCHS}")
+  set(_TGT_CUDA_ARCHS ${TGT_CUDA_ARCHS})
+  # handle +PTX suffix: separate base arch for matching, record PTX requests
+  set(_PTX_ARCHS)
+  foreach(_arch ${_SRC_CUDA_ARCHS})
+    if(_arch MATCHES "\\+PTX$")
+      string(REPLACE "+PTX" "" _base "${_arch}")
+      list(APPEND _PTX_ARCHS "${_base}")
+      # Swap the suffixed entry for its base arch in the working list.
+      list(REMOVE_ITEM _SRC_CUDA_ARCHS "${_arch}")
+      list(APPEND _SRC_CUDA_ARCHS "${_base}")
+    endif()
+  endforeach()
+  list(REMOVE_DUPLICATES _PTX_ARCHS)
+  list(REMOVE_DUPLICATES _SRC_CUDA_ARCHS)
+  # if x.0a is in SRC_CUDA_ARCHS and x.0 is in CUDA_ARCHS then we should
+  # remove x.0a from SRC_CUDA_ARCHS and add x.0a to _CUDA_ARCHS
+  set(_CUDA_ARCHS)
+  foreach(_arch ${_SRC_CUDA_ARCHS})
+    # Architectures ending in `a` (e.g. 9.0a).
+    if(_arch MATCHES "\\a$")
+      # An `a` arch is always dropped from the source list; it reaches the
+      # result only when its base arch is one of the targets.
+      list(REMOVE_ITEM _SRC_CUDA_ARCHS "${_arch}")
+      string(REPLACE "a" "" _base "${_arch}")
+      if ("${_base}" IN_LIST TGT_CUDA_ARCHS)
+        # The base target is consumed, so it is not matched again below.
+        list(REMOVE_ITEM _TGT_CUDA_ARCHS "${_base}")
+        list(APPEND _CUDA_ARCHS "${_arch}")
+      endif()
+    endif()
+  endforeach()
+  # The inner loop below relies on ascending order to break out early.
+  list(SORT _SRC_CUDA_ARCHS COMPARE NATURAL ORDER ASCENDING)
+  # for each ARCH in TGT_CUDA_ARCHS find the highest arch in SRC_CUDA_ARCHS that
+  # is less or equal to ARCH (but has the same major version since SASS binary
+  # compatibility is only forward compatible within the same major version).
+  foreach(_ARCH ${_TGT_CUDA_ARCHS})
+    set(_TMP_ARCH)
+    # Extract the major version of the target arch
+    string(REGEX REPLACE "^([0-9]+)\\..*$" "\\1" TGT_ARCH_MAJOR "${_ARCH}")
+    foreach(_SRC_ARCH ${_SRC_CUDA_ARCHS})
+      # Extract the major version of the source arch
+      string(REGEX REPLACE "^([0-9]+)\\..*$" "\\1" SRC_ARCH_MAJOR "${_SRC_ARCH}")
+      # Check version-less-or-equal, and allow PTX arches to match across majors
+      if (_SRC_ARCH VERSION_LESS_EQUAL _ARCH)
+        if (_SRC_ARCH IN_LIST _PTX_ARCHS OR SRC_ARCH_MAJOR STREQUAL TGT_ARCH_MAJOR)
+          # Later (higher) candidates overwrite earlier ones.
+          set(_TMP_ARCH "${_SRC_ARCH}")
+        endif()
+      else()
+        # If we hit a version greater than the target, we can break
+        break()
+      endif()
+    endforeach()
+    # If we found a matching _TMP_ARCH, append it to _CUDA_ARCHS
+    if (_TMP_ARCH)
+      list(APPEND _CUDA_ARCHS "${_TMP_ARCH}")
+    endif()
+  endforeach()
+  list(REMOVE_DUPLICATES _CUDA_ARCHS)
+  # reapply +PTX suffix to architectures that requested PTX
+  set(_FINAL_ARCHS)
+  foreach(_arch ${_CUDA_ARCHS})
+    if(_arch IN_LIST _PTX_ARCHS)
+      list(APPEND _FINAL_ARCHS "${_arch}+PTX")
+    else()
+      list(APPEND _FINAL_ARCHS "${_arch}")
+    endif()
+  endforeach()
+  set(_CUDA_ARCHS ${_FINAL_ARCHS})
+  # The result is not re-sorted: `a` architectures come first, followed by the
+  # matches in target order.
+  set(${OUT_CUDA_ARCHS} ${_CUDA_ARCHS} PARENT_SCOPE)
+endfunction()
+#
+# For the given `SRC_ROCM_ARCHS` list of architecture versions in the form
+# `<name>` compute the "loose intersection" with the `TGT_ROCM_ARCHS` list.
+# Unlike the CUDA variant, no version ordering is applied here: an
+#  architecture from `SRC_ROCM_ARCHS` is kept only if the same name also
+#  appears in `TGT_ROCM_ARCHS`. The kept names are deduplicated and sorted
+#  in ascending string order.
+# The result is stored in `OUT_ROCM_ARCHS`.
+#
+# Example:
+#   SRC_ROCM_ARCHS="gfx900;gfx906;gfx908;gfx90a"
+#   TGT_ROCM_ARCHS="gfx906;gfx908;gfx1030"
+#   hip_archs_loose_intersection(OUT_ROCM_ARCHS SRC_ROCM_ARCHS TGT_ROCM_ARCHS)
+#   OUT_ROCM_ARCHS="gfx906;gfx908"
+#
+function(hip_archs_loose_intersection OUT_ROCM_ARCHS SRC_ROCM_ARCHS TGT_ROCM_ARCHS)
+  # Work on a deduplicated copy of the source architectures, ordered by plain
+  # string comparison (names look like gfxNNN or gfxNNNx).
+  set(_candidates ${SRC_ROCM_ARCHS})
+  list(REMOVE_DUPLICATES _candidates)
+  list(SORT _candidates COMPARE STRING ORDER ASCENDING)
+  # Keep every candidate that is also listed among the targets.
+  set(_kept)
+  foreach(_candidate ${_candidates})
+    if(NOT _candidate IN_LIST TGT_ROCM_ARCHS)
+      continue()
+    endif()
+    list(APPEND _kept ${_candidate})
+  endforeach()
+  list(REMOVE_DUPLICATES _kept)
+  set(${OUT_ROCM_ARCHS} ${_kept} PARENT_SCOPE)
+endfunction()
+#
+# Override the GPU architectures detected by cmake/torch and filter them by
+# `GPU_SUPPORTED_ARCHES`. Sets the final set of architectures in
+# `GPU_ARCHES`. This only applies to the HIP language since for CUDA we set
+# the architectures on a per file basis.
+#
+# Note: this is defined as a macro, so the variable named by `GPU_ARCHES`
+# (and the helper variable `HIP_ARCHITECTURES`) is set in the caller's scope.
+#
+macro(override_gpu_arches GPU_ARCHES GPU_LANG GPU_SUPPORTED_ARCHES)
+  # GPU_ARCHES           - name of the variable that receives the result.
+  # GPU_LANG             - GPU language; only "HIP" has any effect here.
+  # GPU_SUPPORTED_ARCHES - supported architectures; extra arguments (ARGN)
+  #                        are treated as further supported architectures.
+  set(_GPU_SUPPORTED_ARCHES_LIST ${GPU_SUPPORTED_ARCHES} ${ARGN})
+  message(STATUS "${GPU_LANG} supported arches: ${_GPU_SUPPORTED_ARCHES_LIST}")
+  if (${GPU_LANG} STREQUAL "HIP")
+    #
+    # `GPU_ARCHES` controls the `--offload-arch` flags.
+    #
+    # If PYTORCH_ROCM_ARCH env variable exists, then we take it as a list,
+    # if not, then we use CMAKE_HIP_ARCHITECTURES which was generated by calling
+    # "rocm_agent_enumerator" in "enable_language(HIP)"
+    # (in file Modules/CMakeDetermineHIPCompiler.cmake)
+    #
+    if(DEFINED ENV{PYTORCH_ROCM_ARCH})
+      set(HIP_ARCHITECTURES $ENV{PYTORCH_ROCM_ARCH})
+    else()
+      set(HIP_ARCHITECTURES ${CMAKE_HIP_ARCHITECTURES})
+    endif()
+    #
+    # Find the intersection of the supported + detected architectures to
+    # set the module architecture flags.
+    #
+    # `${GPU_ARCHES}` expands to the caller-supplied variable name, so this
+    # clears that variable before it is refilled.
+    set(${GPU_ARCHES})
+    foreach (_ARCH ${HIP_ARCHITECTURES})
+      if (_ARCH IN_LIST _GPU_SUPPORTED_ARCHES_LIST)
+        list(APPEND ${GPU_ARCHES} ${_ARCH})
+      endif()
+    endforeach()
+    # An empty intersection is a configuration error.
+    if(NOT ${GPU_ARCHES})
+      message(FATAL_ERROR
+        "None of the detected ROCm architectures: ${HIP_ARCHITECTURES} is"
+        " supported. Supported ROCm architectures are: ${_GPU_SUPPORTED_ARCHES_LIST}.")
+    endif()
+  endif()
+endmacro()
+#
+# Define a target named `GPU_MOD_NAME` for a single extension. The
+# arguments are:
+#
+# DESTINATION <dest>         - Module destination directory.
+# LANGUAGE <lang>            - The GPU language for this module, e.g CUDA, HIP,
+#                              etc.
+# SOURCES <sources>          - List of source files relative to CMakeLists.txt
+#                              directory.
+#
+# Optional arguments:
+#
+# ARCHITECTURES <arches>     - A list of target GPU architectures in cmake
+#                              format.
+#                              Refer `CMAKE_CUDA_ARCHITECTURES` documentation
+#                              and `CMAKE_HIP_ARCHITECTURES` for more info.
+#                              ARCHITECTURES will use cmake's defaults if
+#                              not provided.
+# COMPILE_FLAGS <flags>      - Extra compiler flags passed to NVCC/hip.
+# INCLUDE_DIRECTORIES <dirs> - Extra include directories.
+# LIBRARIES <libraries>      - Extra link libraries.
+# WITH_SOABI                 - Generate library with python SOABI suffix name.
+# USE_SABI <version>         - Use python stable api <version>
+#
+# Note: optimization level/debug info is set via cmake build type.
+#
+function (define_gpu_extension_target GPU_MOD_NAME)
+  # Parse the keyword arguments into `GPU_<KEYWORD>` variables: one option
+  # (WITH_SOABI), three single-value keywords and five list keywords.
+  cmake_parse_arguments(PARSE_ARGV 1
+    GPU
+    "WITH_SOABI"
+    "DESTINATION;LANGUAGE;USE_SABI"
+    "SOURCES;ARCHITECTURES;COMPILE_FLAGS;INCLUDE_DIRECTORIES;LIBRARIES")
+  # Add hipify preprocessing step when building with HIP/ROCm.
+  if (GPU_LANGUAGE STREQUAL "HIP")
+    # Replaces GPU_SOURCES with the hipified source list.
+    hipify_sources_target(GPU_SOURCES ${GPU_MOD_NAME} "${GPU_SOURCES}")
+  endif()
+  # Turn the parsed boolean into the literal keyword expected by
+  # Python_add_library (or into nothing).
+  if (GPU_WITH_SOABI)
+    set(GPU_WITH_SOABI WITH_SOABI)
+  else()
+    set(GPU_WITH_SOABI)
+  endif()
+  if (GPU_USE_SABI)
+    Python_add_library(${GPU_MOD_NAME} MODULE USE_SABI ${GPU_USE_SABI} ${GPU_WITH_SOABI} "${GPU_SOURCES}")
+  else()
+    Python_add_library(${GPU_MOD_NAME} MODULE ${GPU_WITH_SOABI} "${GPU_SOURCES}")
+  endif()
+  if (GPU_LANGUAGE STREQUAL "HIP")
+    # Make this target dependent on the hipify preprocessor step.
+    add_dependencies(${GPU_MOD_NAME} hipify${GPU_MOD_NAME})
+  endif()
+  # Without ARCHITECTURES the `<LANG>_ARCHITECTURES` property is left untouched.
+  if (GPU_ARCHITECTURES)
+    set_target_properties(${GPU_MOD_NAME} PROPERTIES
+      ${GPU_LANGUAGE}_ARCHITECTURES "${GPU_ARCHITECTURES}")
+  endif()
+  set_property(TARGET ${GPU_MOD_NAME} PROPERTY CXX_STANDARD 17)
+  # The extra flags apply only to sources compiled as the GPU language.
+  target_compile_options(${GPU_MOD_NAME} PRIVATE
+    $<$<COMPILE_LANGUAGE:${GPU_LANGUAGE}>:${GPU_COMPILE_FLAGS}>)
+  # Defines TORCH_EXTENSION_NAME as the module name for the extension sources.
+  target_compile_definitions(${GPU_MOD_NAME} PRIVATE
+    "-DTORCH_EXTENSION_NAME=${GPU_MOD_NAME}")
+  target_include_directories(${GPU_MOD_NAME} PRIVATE csrc
+    ${GPU_INCLUDE_DIRECTORIES})
+  target_link_libraries(${GPU_MOD_NAME} PRIVATE torch ${GPU_LIBRARIES})
+  # Don't use `TORCH_LIBRARIES` for CUDA since it pulls in a bunch of
+  # dependencies that are not necessary and may not be installed.
+  if (GPU_LANGUAGE STREQUAL "CUDA")
+    target_link_libraries(${GPU_MOD_NAME} PRIVATE CUDA::cudart)
+  else()
+    target_link_libraries(${GPU_MOD_NAME} PRIVATE ${TORCH_LIBRARIES})
+  endif()
+  # Install under a component named after the module.
+  install(TARGETS ${GPU_MOD_NAME} LIBRARY DESTINATION ${GPU_DESTINATION} COMPONENT ${GPU_MOD_NAME})
+endfunction()

flake.lock ADDED Viewed

	@@ -0,0 +1,168 @@

+{
+  "nodes": {
+    "flake-compat": {
+      "locked": {
+        "lastModified": 1747046372,
+        "narHash": "sha256-CIVLLkVgvHYbgI2UpXvIIBJ12HWgX+fjA8Xf8PUmqCY=",
+        "owner": "edolstra",
+        "repo": "flake-compat",
+        "rev": "9100a0f413b0c601e0533d1d94ffd501ce2e7885",
+        "type": "github"
+      },
+      "original": {
+        "owner": "edolstra",
+        "repo": "flake-compat",
+        "type": "github"
+      }
+    },
+    "flake-compat_2": {
+      "locked": {
+        "lastModified": 1747046372,
+        "narHash": "sha256-CIVLLkVgvHYbgI2UpXvIIBJ12HWgX+fjA8Xf8PUmqCY=",
+        "owner": "edolstra",
+        "repo": "flake-compat",
+        "rev": "9100a0f413b0c601e0533d1d94ffd501ce2e7885",
+        "type": "github"
+      },
+      "original": {
+        "owner": "edolstra",
+        "repo": "flake-compat",
+        "type": "github"
+      }
+    },
+    "flake-utils": {
+      "inputs": {
+        "systems": "systems"
+      },
+      "locked": {
+        "lastModified": 1731533236,
+        "narHash": "sha256-l0KFg5HjrsfsO/JpG+r7fRrqm12kzFHyUHqHCVpMMbI=",
+        "owner": "numtide",
+        "repo": "flake-utils",
+        "rev": "11707dc2f618dd54ca8739b309ec4fc024de578b",
+        "type": "github"
+      },
+      "original": {
+        "owner": "numtide",
+        "repo": "flake-utils",
+        "type": "github"
+      }
+    },
+    "flake-utils_2": {
+      "inputs": {
+        "systems": "systems_2"
+      },
+      "locked": {
+        "lastModified": 1731533236,
+        "narHash": "sha256-l0KFg5HjrsfsO/JpG+r7fRrqm12kzFHyUHqHCVpMMbI=",
+        "owner": "numtide",
+        "repo": "flake-utils",
+        "rev": "11707dc2f618dd54ca8739b309ec4fc024de578b",
+        "type": "github"
+      },
+      "original": {
+        "owner": "numtide",
+        "repo": "flake-utils",
+        "type": "github"
+      }
+    },
+    "hf-nix": {
+      "inputs": {
+        "flake-compat": "flake-compat_2",
+        "flake-utils": "flake-utils_2",
+        "nixpkgs": "nixpkgs"
+      },
+      "locked": {
+        "lastModified": 1757675377,
+        "narHash": "sha256-JQKZOI1ZYO4faJnanuoTXziSmqzXe5rEFSGliWDWqWw=",
+        "owner": "huggingface",
+        "repo": "hf-nix",
+        "rev": "faf3354403a7381958d08e826c15fe30f6986a4f",
+        "type": "github"
+      },
+      "original": {
+        "owner": "huggingface",
+        "repo": "hf-nix",
+        "type": "github"
+      }
+    },
+    "kernel-builder": {
+      "inputs": {
+        "flake-compat": "flake-compat",
+        "flake-utils": "flake-utils",
+        "hf-nix": "hf-nix",
+        "nixpkgs": [
+          "kernel-builder",
+          "hf-nix",
+          "nixpkgs"
+        ]
+      },
+      "locked": {
+        "lastModified": 1758103102,
+        "narHash": "sha256-z9E9FxuxuxUztG5DbUcOvKBHvd27gBY9617t9x2QE6M=",
+        "owner": "huggingface",
+        "repo": "kernel-builder",
+        "rev": "94369928dc09ea7753c58495e3e406ac26f6c378",
+        "type": "github"
+      },
+      "original": {
+        "owner": "huggingface",
+        "repo": "kernel-builder",
+        "type": "github"
+      }
+    },
+    "nixpkgs": {
+      "locked": {
+        "lastModified": 1755963616,
+        "narHash": "sha256-6yD0ww/S8n+U2uPYcJZ3DRURP8Kx036GRpR2uPNZroE=",
+        "owner": "nixos",
+        "repo": "nixpkgs",
+        "rev": "73e96df7cff5783f45e21342a75a1540c4eddce4",
+        "type": "github"
+      },
+      "original": {
+        "owner": "nixos",
+        "ref": "nixos-unstable-small",
+        "repo": "nixpkgs",
+        "type": "github"
+      }
+    },
+    "root": {
+      "inputs": {
+        "kernel-builder": "kernel-builder"
+      }
+    },
+    "systems": {
+      "locked": {
+        "lastModified": 1681028828,
+        "narHash": "sha256-Vy1rq5AaRuLzOxct8nz4T6wlgyUR7zLU309k9mBC768=",
+        "owner": "nix-systems",
+        "repo": "default",
+        "rev": "da67096a3b9bf56a91d16901293e51ba5b49a27e",
+        "type": "github"
+      },
+      "original": {
+        "owner": "nix-systems",
+        "repo": "default",
+        "type": "github"
+      }
+    },
+    "systems_2": {
+      "locked": {
+        "lastModified": 1681028828,
+        "narHash": "sha256-Vy1rq5AaRuLzOxct8nz4T6wlgyUR7zLU309k9mBC768=",
+        "owner": "nix-systems",
+        "repo": "default",
+        "rev": "da67096a3b9bf56a91d16901293e51ba5b49a27e",
+        "type": "github"
+      },
+      "original": {
+        "owner": "nix-systems",
+        "repo": "default",
+        "type": "github"
+      }
+    }
+  },
+  "root": "root",
+  "version": 7
+}

pyproject.toml ADDED Viewed

	@@ -0,0 +1,10 @@

+[build-system]
+requires = [
+  "cmake>=3.26",
+  "ninja",
+  "packaging",
+  "setuptools>=61",
+  "torch",
+  "wheel",
+]
+build-backend = "setuptools.build_meta"

setup.py ADDED Viewed

	@@ -0,0 +1,138 @@

+import logging
+import os
+from shutil import which, move
+import subprocess
+import sys
+from pathlib import Path
+from setuptools import Extension, find_packages, setup
+from setuptools.command.build_ext import build_ext
+logger = logging.getLogger(__name__)
+def is_sccache_available() -> bool:
+    return which("sccache") is not None
+def is_ccache_available() -> bool:
+    return which("ccache") is not None
+def is_ninja_available() -> bool:
+    return which("ninja") is not None
+class CMakeExtension(Extension):
+    def __init__(self, name: str, sourcedir: str = "") -> None:
+        super().__init__(name, sources=[], py_limited_api=True)
+        self.sourcedir = os.fspath(Path(sourcedir).resolve())
+class CMakeBuild(build_ext):
+    def build_extension(self, ext: CMakeExtension) -> None:
+        ext_fullpath = Path.cwd() / self.get_ext_fullpath(ext.name)
+        extdir = ext_fullpath.parent.resolve()
+        debug = int(os.environ.get("DEBUG", 0)) if self.debug is None else self.debug
+        cfg = "Debug" if debug else "Release"
+        cmake_generator = os.environ.get("CMAKE_GENERATOR", "")
+        # Set Python_EXECUTABLE instead if you use PYBIND11_FINDPYTHON
+        # EXAMPLE_VERSION_INFO shows you how to pass a value into the C++ code
+        # from Python.
+        cmake_args = [
+            f"-DCMAKE_LIBRARY_OUTPUT_DIRECTORY={extdir}{os.sep}",
+            f"-DPython_EXECUTABLE={sys.executable}",
+            f"-DCMAKE_BUILD_TYPE={cfg}",  # not used on MSVC, but no harm
+        ]
+        build_args = []
+        if "CMAKE_ARGS" in os.environ:
+            cmake_args += [item for item in os.environ["CMAKE_ARGS"].split(" ") if item]
+        if not cmake_generator or cmake_generator == "Ninja":
+            try:
+                import ninja
+                ninja_executable_path = Path(ninja.BIN_DIR) / "ninja"
+                cmake_args += [
+                    "-GNinja",
+                    f"-DCMAKE_MAKE_PROGRAM:FILEPATH={ninja_executable_path}",
+                ]
+            except ImportError:
+                pass
+        if is_sccache_available():
+            cmake_args += [
+                "-DCMAKE_C_COMPILER_LAUNCHER=sccache",
+                "-DCMAKE_CXX_COMPILER_LAUNCHER=sccache",
+                "-DCMAKE_CUDA_COMPILER_LAUNCHER=sccache",
+                "-DCMAKE_HIP_COMPILER_LAUNCHER=sccache",
+            ]
+        elif is_ccache_available():
+            cmake_args += [
+                "-DCMAKE_C_COMPILER_LAUNCHER=ccache",
+                "-DCMAKE_CXX_COMPILER_LAUNCHER=ccache",
+                "-DCMAKE_CUDA_COMPILER_LAUNCHER=ccache",
+                "-DCMAKE_HIP_COMPILER_LAUNCHER=ccache",
+            ]
+        num_jobs = os.getenv("MAX_JOBS", None)
+        if num_jobs is not None:
+            num_jobs = int(num_jobs)
+            logger.info("Using MAX_JOBS=%d as the number of jobs.", num_jobs)
+        else:
+            try:
+                # os.sched_getaffinity() isn't universally available, so fall
+                #  back to os.cpu_count() if we get an error here.
+                num_jobs = len(os.sched_getaffinity(0))
+            except AttributeError:
+                num_jobs = os.cpu_count()
+        nvcc_threads = os.getenv("NVCC_THREADS", None)
+        if nvcc_threads is not None:
+            nvcc_threads = int(nvcc_threads)
+            logger.info(
+                "Using NVCC_THREADS=%d as the number of nvcc threads.", nvcc_threads
+            )
+        else:
+            nvcc_threads = 1
+        num_jobs = max(1, num_jobs // nvcc_threads)
+        build_args += [f"-j{num_jobs}"]
+        if sys.platform == "win32":
+            build_args += ["--config", cfg]
+        if nvcc_threads:
+            cmake_args += ["-DNVCC_THREADS={}".format(nvcc_threads)]
+        build_temp = Path(self.build_temp) / ext.name
+        if not build_temp.exists():
+            build_temp.mkdir(parents=True)
+        subprocess.run(
+            ["cmake", ext.sourcedir, *cmake_args], cwd=build_temp, check=True
+        )
+        subprocess.run(
+            ["cmake", "--build", ".", *build_args], cwd=build_temp, check=True
+        )
+        if sys.platform == "win32":
+            # Move the dylib one folder up for discovery.
+            for filename in os.listdir(extdir / cfg):
+                move(extdir / cfg / filename, extdir / filename)
+# Package definition: a single CMake-built extension module inside the
+# `layer_norm` package, whose python sources are discovered under `torch-ext`.
+setup(
+    name="layer_norm",
+    # The version is just a stub, it's not used by the final build artefact.
+    version="0.1.0",
+    # No sourcedir is given, so CMakeExtension resolves its default ("").
+    ext_modules=[CMakeExtension("layer_norm._layer_norm_711aa42_dirty")],
+    # Route extension builds through the CMake-driven build_ext command.
+    cmdclass={"build_ext": CMakeBuild},
+    packages=find_packages(where="torch-ext", include=["layer_norm*"]),
+    package_dir={"": "torch-ext"},
+    zip_safe=False,
+    install_requires=["torch"],
+    python_requires=">=3.9",
+)

torch-ext/layer_norm/_layer_norm_711aa42_dirty.abi3.so ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:c824a0d2b400f4a89ccf293975ccfedc32733174dad4386a402149c440946674
+size 247782208

torch-ext/layer_norm/_ops.py ADDED Viewed

	@@ -0,0 +1,9 @@

+import torch
+from . import _layer_norm_711aa42_dirty
+ops = torch.ops._layer_norm_711aa42_dirty
+def add_op_namespace_prefix(op_name: str):
+    """
+    Prefix op by namespace.
+    """
+    return f"_layer_norm_711aa42_dirty::{op_name}"

torch-ext/registration.h ADDED Viewed

	@@ -0,0 +1,30 @@

+// Registration macros from vLLM:
+// https://github.com/vllm-project/vllm/blob/main/csrc/core/registration.h
+#pragma once
+#include <Python.h>
+// Two-level token pasting: CONCAT expands its arguments first and then
+// _CONCAT pastes the results together.
+#define _CONCAT(A, B) A##B
+#define CONCAT(A, B) _CONCAT(A, B)
+// Two-level stringification: STRINGIFY expands its argument first and then
+// _STRINGIFY turns the result into a string literal.
+#define _STRINGIFY(A) #A
+#define STRINGIFY(A) _STRINGIFY(A)
+// A version of the TORCH_LIBRARY macro that expands the NAME, i.e. so NAME
+// could be a macro instead of a literal token.
+#define TORCH_LIBRARY_EXPAND(NAME, MODULE) TORCH_LIBRARY(NAME, MODULE)
+// A version of the TORCH_LIBRARY_IMPL macro that expands the NAME, i.e. so NAME
+// could be a macro instead of a literal token.
+#define TORCH_LIBRARY_IMPL_EXPAND(NAME, DEVICE, MODULE) \
+  TORCH_LIBRARY_IMPL(NAME, DEVICE, MODULE)
+// REGISTER_EXTENSION allows the shared library to be loaded and initialized
+// via python's import statement.
+// It defines `PyInit_<NAME>`, which creates a module from a static
+// PyModuleDef named STRINGIFY(NAME) whose remaining listed fields are
+// nullptr, 0 and nullptr.
+#define REGISTER_EXTENSION(NAME)                                               \
+  PyMODINIT_FUNC CONCAT(PyInit_, NAME)() {                                     \
+    static struct PyModuleDef module = {PyModuleDef_HEAD_INIT,                 \
+                                        STRINGIFY(NAME), nullptr, 0, nullptr}; \
+    return PyModule_Create(&module);                                           \
+  }

torch-ext/torch_binding.cpp CHANGED Viewed

@@ -3,15 +3,152 @@
 #include "registration.h"
 #include "torch_binding.h"
 TORCH_LIBRARY_EXPAND(TORCH_EXTENSION_NAME, ops) {
-  ops.def("dropout_add_ln_fwd(Tensor input, Tensor gamma, Tensor beta, Tensor rowscale, Tensor colscale, Tensor x0_subset, Tensor z_subset, float dropout_p, float epsilon, float rowscale_const, int64_t z_numrows, Generator gen, bool residual_in_fp32, bool is_rms_norm) -> Tensor");
-  ops.impl("dropout_add_ln_fwd", torch::kCUDA, &dropout_add_ln_fwd);
-  ops.def("dropout_add_ln_bwd(Tensor dz, Tensor dx, Tensor x, Tensor mu, Tensor rsigma, Tensor gamma, Tensor rowscale, Tensor colscale, Tensor x0_subset, Tensor z_subset, float dropout_p, float rowscale_const, int64_t x0_numrows, bool has_residual, bool is_rms_norm) -> Tensor");
-  ops.impl("dropout_add_ln_bwd", torch::kCUDA, &dropout_add_ln_bwd);
-  ops.def("dropout_add_ln_parallel_residual_fwd(Tensor input, Tensor gamma0, Tensor beta0, Tensor gamma1, Tensor beta1, float dropout_p, float epsilon, Generator gen, bool residual_in_fp32, bool is_rms_norm) -> Tensor");
-  ops.impl("dropout_add_ln_parallel_residual_fwd", torch::kCUDA, &dropout_add_ln_parallel_residual_fwd);
-  ops.def("dropout_add_ln_parallel_residual_bwd(Tensor dz0, Tensor dz1, Tensor dx, Tensor x, Tensor mu, Tensor rsigma, Tensor gamma0, Tensor gamma1, float dropout_p, bool has_x1, bool has_residual, bool is_rms_norm) -> Tensor");
-  ops.impl("dropout_add_ln_parallel_residual_bwd", torch::kCUDA, &dropout_add_ln_parallel_residual_bwd);
 }
-REGISTER_EXTENSION(TORCH_EXTENSION_NAME)

 #include "registration.h"
 #include "torch_binding.h"
+// Converts an optional value, as received by value for a `Tensor?` schema
+// argument, into a c10::optional<const at::Tensor>. An empty input yields an
+// empty result; otherwise the result is constructed from the held value.
+template <typename T>
+static c10::optional<const at::Tensor> as_const_opt(const c10::optional<T>& v) {
+  using ConstOptTensor = c10::optional<const at::Tensor>;
+  if (!v.has_value()) {
+    return ConstOptTensor();
+  }
+  return ConstOptTensor(v.value());
+}
+// Dispatcher-facing wrappers: they take double scalars and by-value optional
+// tensors / generator, narrow the scalars to float and rebind the optional
+// tensors before forwarding to the implementations.
+//
+// Forward wrapper for dropout_add_ln_fwd. The implementation's `residual`
+// argument has no counterpart in this signature, so an empty optional is
+// always forwarded for it.
+static std::vector<at::Tensor> dropout_add_ln_fwd_wrap(
+    const at::Tensor& input,
+    const at::Tensor& gamma,
+    c10::optional<at::Tensor> beta,
+    c10::optional<at::Tensor> rowscale,
+    c10::optional<at::Tensor> colscale,
+    c10::optional<at::Tensor> x0_subset,
+    c10::optional<at::Tensor> z_subset,
+    double dropout_p,
+    double epsilon,
+    double rowscale_const,
+    int64_t z_numrows,
+    c10::optional<at::Generator> gen,
+    bool residual_in_fp32,
+    bool is_rms_norm) {
+  // Named locals: the optionals are handed to the implementation as lvalues.
+  c10::optional<const at::Tensor> no_residual;
+  auto beta_arg = as_const_opt(beta);
+  auto rowscale_arg = as_const_opt(rowscale);
+  auto colscale_arg = as_const_opt(colscale);
+  auto x0_subset_arg = as_const_opt(x0_subset);
+  auto z_subset_arg = as_const_opt(z_subset);
+
+  // Narrow the double scalars to the float values the implementation takes.
+  const float dropout_prob = static_cast<float>(dropout_p);
+  const float eps = static_cast<float>(epsilon);
+  const float rowscale_factor = static_cast<float>(rowscale_const);
+
+  return dropout_add_ln_fwd(
+      input, no_residual, gamma, beta_arg, rowscale_arg, colscale_arg,
+      x0_subset_arg, z_subset_arg, dropout_prob, eps, rowscale_factor,
+      z_numrows, gen, residual_in_fp32, is_rms_norm);
+}
+// Backward wrapper for dropout_add_ln_bwd: rebinds the by-value optional
+// tensors, narrows the double scalars to float and forwards everything to
+// the implementation in its declared argument order.
+static std::vector<at::Tensor> dropout_add_ln_bwd_wrap(
+    const at::Tensor& dz,
+    c10::optional<at::Tensor> dx,
+    const at::Tensor& x,
+    c10::optional<at::Tensor> x0,
+    c10::optional<at::Tensor> dmask,
+    const at::Tensor& mu,
+    const at::Tensor& rsigma,
+    const at::Tensor& gamma,
+    c10::optional<at::Tensor> rowscale,
+    c10::optional<at::Tensor> colscale,
+    c10::optional<at::Tensor> x0_subset,
+    c10::optional<at::Tensor> z_subset,
+    double dropout_p,
+    double rowscale_const,
+    int64_t x0_numrows,
+    bool has_residual,
+    bool is_rms_norm) {
+  // Named locals: the optionals are handed to the implementation as lvalues.
+  auto dx_arg = as_const_opt(dx);
+  auto x0_arg = as_const_opt(x0);
+  auto dmask_arg = as_const_opt(dmask);
+  auto rowscale_arg = as_const_opt(rowscale);
+  auto colscale_arg = as_const_opt(colscale);
+  auto x0_subset_arg = as_const_opt(x0_subset);
+  auto z_subset_arg = as_const_opt(z_subset);
+
+  // Narrow the double scalars to the float values the implementation takes.
+  const float dropout_prob = static_cast<float>(dropout_p);
+  const float rowscale_factor = static_cast<float>(rowscale_const);
+
+  return dropout_add_ln_bwd(
+      dz, dx_arg, x, x0_arg, dmask_arg, mu, rsigma, gamma,
+      rowscale_arg, colscale_arg, x0_subset_arg, z_subset_arg,
+      dropout_prob, rowscale_factor, x0_numrows, has_residual, is_rms_norm);
+}
+// Forward wrapper for dropout_add_ln_parallel_residual_fwd: rebinds the
+// by-value optional tensors, narrows the double scalars to float and
+// forwards everything to the implementation in its declared argument order.
+static std::vector<at::Tensor> dropout_add_ln_parallel_residual_fwd_wrap(
+    const at::Tensor& input,
+    c10::optional<at::Tensor> x1,
+    c10::optional<at::Tensor> residual,
+    const at::Tensor& gamma0,
+    c10::optional<at::Tensor> beta0,
+    c10::optional<at::Tensor> gamma1,
+    c10::optional<at::Tensor> beta1,
+    double dropout_p,
+    double epsilon,
+    c10::optional<at::Generator> gen,
+    bool residual_in_fp32,
+    bool is_rms_norm) {
+  // Named locals: the optionals are handed to the implementation as lvalues.
+  auto x1_arg = as_const_opt(x1);
+  auto residual_arg = as_const_opt(residual);
+  auto beta0_arg = as_const_opt(beta0);
+  auto gamma1_arg = as_const_opt(gamma1);
+  auto beta1_arg = as_const_opt(beta1);
+
+  // Narrow the double scalars to the float values the implementation takes.
+  const float dropout_prob = static_cast<float>(dropout_p);
+  const float eps = static_cast<float>(epsilon);
+
+  return dropout_add_ln_parallel_residual_fwd(
+      input, x1_arg, residual_arg, gamma0, beta0_arg, gamma1_arg, beta1_arg,
+      dropout_prob, eps, gen, residual_in_fp32, is_rms_norm);
+}
+// Backward wrapper for dropout_add_ln_parallel_residual_bwd: rebinds the
+// by-value optional tensors, narrows the double scalar to float and forwards
+// everything to the implementation in its declared argument order.
+static std::vector<at::Tensor> dropout_add_ln_parallel_residual_bwd_wrap(
+    const at::Tensor& dz0,
+    c10::optional<at::Tensor> dz1,
+    c10::optional<at::Tensor> dx,
+    const at::Tensor& x,
+    c10::optional<at::Tensor> dmask0,
+    c10::optional<at::Tensor> dmask1,
+    const at::Tensor& mu,
+    const at::Tensor& rsigma,
+    const at::Tensor& gamma0,
+    c10::optional<at::Tensor> gamma1,
+    double dropout_p,
+    bool has_x1,
+    bool has_residual,
+    bool is_rms_norm) {
+  // Named locals: the optionals are handed to the implementation as lvalues.
+  auto dz1_arg = as_const_opt(dz1);
+  auto dx_arg = as_const_opt(dx);
+  auto dmask0_arg = as_const_opt(dmask0);
+  auto dmask1_arg = as_const_opt(dmask1);
+  auto gamma1_arg = as_const_opt(gamma1);
+
+  // Narrow the double scalar to the float value the implementation takes.
+  const float dropout_prob = static_cast<float>(dropout_p);
+
+  return dropout_add_ln_parallel_residual_bwd(
+      dz0, dz1_arg, dx_arg, x, dmask0_arg, dmask1_arg, mu, rsigma, gamma0,
+      gamma1_arg, dropout_prob, has_x1, has_residual, is_rms_norm);
+}
 TORCH_LIBRARY_EXPAND(TORCH_EXTENSION_NAME, ops) {
+  // Every op returns Tensor[] to match the std::vector<at::Tensor> returned
+  // by the implementations. Each schema is written as adjacent string
+  // literals, which the compiler concatenates into a single string.
+
+  // Fused forward.
+  ops.def(
+      "dropout_add_ln_fwd(Tensor input, Tensor gamma, Tensor? beta, "
+      "Tensor? rowscale, Tensor? colscale, Tensor? x0_subset, "
+      "Tensor? z_subset, float dropout_p, float epsilon, "
+      "float rowscale_const, int z_numrows, Generator? gen, "
+      "bool residual_in_fp32, bool is_rms_norm) -> Tensor[]");
+  ops.impl("dropout_add_ln_fwd", torch::kCUDA, &dropout_add_ln_fwd_wrap);
+
+  // Fused backward.
+  ops.def(
+      "dropout_add_ln_bwd(Tensor dz, Tensor? dx, Tensor x, Tensor? x0, "
+      "Tensor? dmask, Tensor mu, Tensor rsigma, Tensor gamma, "
+      "Tensor? rowscale, Tensor? colscale, Tensor? x0_subset, "
+      "Tensor? z_subset, float dropout_p, float rowscale_const, "
+      "int x0_numrows, bool has_residual, bool is_rms_norm) -> Tensor[]");
+  ops.impl("dropout_add_ln_bwd", torch::kCUDA, &dropout_add_ln_bwd_wrap);
+
+  // Parallel-residual forward.
+  ops.def(
+      "dropout_add_ln_parallel_residual_fwd(Tensor input, Tensor? x1, "
+      "Tensor? residual, Tensor gamma0, Tensor? beta0, Tensor? gamma1, "
+      "Tensor? beta1, float dropout_p, float epsilon, Generator? gen, "
+      "bool residual_in_fp32, bool is_rms_norm) -> Tensor[]");
+  ops.impl("dropout_add_ln_parallel_residual_fwd", torch::kCUDA,
+           &dropout_add_ln_parallel_residual_fwd_wrap);
+
+  // Parallel-residual backward.
+  ops.def(
+      "dropout_add_ln_parallel_residual_bwd(Tensor dz0, Tensor? dz1, "
+      "Tensor? dx, Tensor x, Tensor? dmask0, Tensor? dmask1, Tensor mu, "
+      "Tensor rsigma, Tensor gamma0, Tensor? gamma1, float dropout_p, "
+      "bool has_x1, bool has_residual, bool is_rms_norm) -> Tensor[]");
+  ops.impl("dropout_add_ln_parallel_residual_bwd", torch::kCUDA,
+           &dropout_add_ln_parallel_residual_bwd_wrap);
 }
+// Emit the PyInit_<TORCH_EXTENSION_NAME> entry point so the shared library
+// can be imported from Python.
+REGISTER_EXTENSION(TORCH_EXTENSION_NAME)

torch-ext/torch_binding.h CHANGED Viewed

@@ -2,7 +2,69 @@
 #include <torch/torch.h>
-torch::Tensor dropout_add_ln_fwd(torch::Tensor &input, torch::Tensor &gamma, torch::Tensor &beta, torch::Tensor &rowscale, torch::Tensor &colscale, torch::Tensor &x0_subset, torch::Tensor &z_subset, float dropout_p, float epsilon, float rowscale_const, int64_t z_numrows, torch::Generator &gen, bool residual_in_fp32, bool is_rms_norm);
-torch::Tensor dropout_add_ln_bwd(torch::Tensor &dz, torch::Tensor &dx, torch::Tensor &x, torch::Tensor &mu, torch::Tensor &rsigma, torch::Tensor &gamma, torch::Tensor &rowscale, torch::Tensor &colscale, torch::Tensor &x0_subset, torch::Tensor &z_subset, float dropout_p, float rowscale_const, int64_t x0_numrows, bool has_residual, bool is_rms_norm);
-torch::Tensor dropout_add_ln_parallel_residual_fwd(torch::Tensor &input, torch::Tensor &gamma0, torch::Tensor &beta0, torch::Tensor &gamma1, torch::Tensor &beta1, float dropout_p, float epsilon, torch::Generator &gen, bool residual_in_fp32, bool is_rms_norm);
-torch::Tensor dropout_add_ln_parallel_residual_bwd(torch::Tensor &dz0, torch::Tensor &dz1, torch::Tensor &dx, torch::Tensor &x, torch::Tensor &mu, torch::Tensor &rsigma, torch::Tensor &gamma0, torch::Tensor &gamma1, float dropout_p, bool has_x1, bool has_residual, bool is_rms_norm);

 #include <torch/torch.h>
+// Declarations for implementations defined in layer_norm/ln_api.cpp
+std::vector<at::Tensor> dropout_add_ln_fwd(  // declaration only; returns a list of tensors
+    const at::Tensor &x0,
+    c10::optional<const at::Tensor> &residual,  // optional; non-const lvalue ref, so callers must pass a named optional
+    const at::Tensor &gamma,
+    c10::optional<const at::Tensor> &beta,  // optional (same lvalue-ref rule as residual)
+    c10::optional<const at::Tensor> &rowscale,  // optional
+    c10::optional<const at::Tensor> &colscale,  // optional
+    c10::optional<const at::Tensor> &x0_subset,  // optional
+    c10::optional<const at::Tensor> &z_subset,  // optional
+    const float dropout_p,
+    const float epsilon,
+    const float rowscale_const,
+    const int64_t z_numrows,
+    c10::optional<at::Generator> gen,  // optional generator, taken by value
+    bool residual_in_fp32,
+    bool is_rms_norm);  // NOTE(review): presumably selects RMSNorm over LayerNorm -- confirm in ln_api.cpp
+std::vector<at::Tensor> dropout_add_ln_bwd(  // declaration only; returns a list of tensors
+    const at::Tensor &dz,
+    c10::optional<const at::Tensor> &dx,  // optional; non-const lvalue ref, so callers must pass a named optional
+    const at::Tensor &x,
+    c10::optional<const at::Tensor> &x0,  // optional (same lvalue-ref rule as dx)
+    c10::optional<const at::Tensor> &dmask,  // optional
+    const at::Tensor &mu,
+    const at::Tensor &rsigma,
+    const at::Tensor &gamma,
+    c10::optional<const at::Tensor> &rowscale,  // optional
+    c10::optional<const at::Tensor> &colscale,  // optional
+    c10::optional<const at::Tensor> &x0_subset,  // optional
+    c10::optional<const at::Tensor> &z_subset,  // optional
+    const float dropout_p,
+    const float rowscale_const,
+    const int64_t x0_numrows,
+    const bool has_residual,
+    bool is_rms_norm);  // NOTE(review): presumably selects RMSNorm over LayerNorm -- confirm in the implementation
+std::vector<at::Tensor> dropout_add_ln_parallel_residual_fwd(  // declaration only; returns a list of tensors
+    const at::Tensor &x0,
+    c10::optional<const at::Tensor> &x1,  // optional; non-const lvalue ref, so callers must pass a named optional
+    c10::optional<const at::Tensor> &residual,  // optional (same lvalue-ref rule as x1)
+    const at::Tensor &gamma0,
+    c10::optional<const at::Tensor> &beta0,  // optional
+    c10::optional<const at::Tensor> &gamma1,  // optional
+    c10::optional<const at::Tensor> &beta1,  // optional
+    const float dropout_p,
+    const float epsilon,
+    c10::optional<at::Generator> gen,  // optional generator, taken by value
+    bool residual_in_fp32,
+    bool is_rms_norm);  // NOTE(review): presumably selects RMSNorm over LayerNorm -- confirm in the implementation
+std::vector<at::Tensor> dropout_add_ln_parallel_residual_bwd(  // declaration only; returns a list of tensors
+    const at::Tensor &dz0,
+    c10::optional<const at::Tensor> &dz1,  // optional; non-const lvalue ref, so callers must pass a named optional
+    c10::optional<const at::Tensor> &dx,  // optional (same lvalue-ref rule as dz1)
+    const at::Tensor &x,
+    c10::optional<const at::Tensor> &dmask0,  // optional
+    c10::optional<const at::Tensor> &dmask1,  // optional
+    const at::Tensor &mu,
+    const at::Tensor &rsigma,
+    const at::Tensor &gamma0,
+    c10::optional<const at::Tensor> &gamma1,  // optional
+    const float dropout_p,
+    const bool has_x1,
+    const bool has_residual,
+    bool is_rms_norm);  // NOTE(review): presumably selects RMSNorm over LayerNorm -- confirm in the implementation