medmekk (HF Staff) committed
Commit 854c683 · Parent(s): b9597c9

Torch 2.9 builds

This view is limited to 50 files because the commit contains too many changes; see the raw diff for the full set.

Files changed (50):
  1. .gitignore +1 -0
  2. CMakeLists.txt +0 -213
  3. README.md +1 -20
  4. api.py +0 -800
  5. build/{torch28-cxx11-cu126-x86_64-linux/layer_norm/_layer_norm_f622ea1_dirty.abi3.so → torch27-cxx11-cu118-x86_64-linux/layer_norm/_layer_norm_f3fd6bf.abi3.so} +2 -2
  6. build/torch27-cxx11-cu118-x86_64-linux/layer_norm/_ops.py +3 -3
  7. build/{torch27-cxx11-cu118-x86_64-linux/layer_norm/_layer_norm_f622ea1_dirty.abi3.so → torch27-cxx11-cu126-x86_64-linux/layer_norm/_layer_norm_f3fd6bf.abi3.so} +2 -2
  8. build/torch27-cxx11-cu126-x86_64-linux/layer_norm/_ops.py +3 -3
  9. build/torch27-cxx11-cu128-x86_64-linux/layer_norm/{_layer_norm_f622ea1_dirty.abi3.so → _layer_norm_f3fd6bf.abi3.so} +2 -2
  10. build/torch27-cxx11-cu128-x86_64-linux/layer_norm/_ops.py +3 -3
  11. build/{torch27-cxx11-cu126-x86_64-linux/layer_norm/_layer_norm_f622ea1_dirty.abi3.so → torch28-cxx11-cu126-x86_64-linux/layer_norm/_layer_norm_f3fd6bf.abi3.so} +2 -2
  12. build/torch28-cxx11-cu126-x86_64-linux/layer_norm/_ops.py +3 -3
  13. build/torch28-cxx11-cu128-x86_64-linux/layer_norm/_layer_norm_f3fd6bf.abi3.so +3 -0
  14. build/torch28-cxx11-cu128-x86_64-linux/layer_norm/_layer_norm_f622ea1_dirty.abi3.so +0 -3
  15. build/torch28-cxx11-cu128-x86_64-linux/layer_norm/_ops.py +3 -3
  16. build/torch28-cxx11-cu129-x86_64-linux/layer_norm/_layer_norm_f3fd6bf.abi3.so +3 -0
  17. build/torch28-cxx11-cu129-x86_64-linux/layer_norm/_layer_norm_f622ea1_dirty.abi3.so +0 -3
  18. build/torch28-cxx11-cu129-x86_64-linux/layer_norm/_ops.py +3 -3
  19. {torch-ext → build/torch29-cxx11-cu126-x86_64-linux}/layer_norm/__init__.py +0 -0
  20. build/torch29-cxx11-cu126-x86_64-linux/layer_norm/_layer_norm_f3fd6bf.abi3.so +3 -0
  21. build/torch29-cxx11-cu126-x86_64-linux/layer_norm/_ops.py +9 -0
  22. {torch-ext → build/torch29-cxx11-cu126-x86_64-linux}/layer_norm/layers.py +0 -0
  23. build/torch29-cxx11-cu128-x86_64-linux/layer_norm/__init__.py +26 -0
  24. build/torch29-cxx11-cu128-x86_64-linux/layer_norm/_layer_norm_f3fd6bf.abi3.so +3 -0
  25. build/torch29-cxx11-cu128-x86_64-linux/layer_norm/_ops.py +9 -0
  26. build/torch29-cxx11-cu128-x86_64-linux/layer_norm/layers.py +51 -0
  27. build/torch29-cxx11-cu130-x86_64-linux/layer_norm/__init__.py +26 -0
  28. build/torch29-cxx11-cu130-x86_64-linux/layer_norm/_layer_norm_f3fd6bf.abi3.so +3 -0
  29. build/torch29-cxx11-cu130-x86_64-linux/layer_norm/_ops.py +9 -0
  30. build/torch29-cxx11-cu130-x86_64-linux/layer_norm/layers.py +51 -0
  31. cmake/hipify.py +0 -76
  32. cmake/utils.cmake +0 -545
  33. layer_norm/ln.h +0 -281
  34. layer_norm/ln_api.cpp +0 -828
  35. layer_norm/ln_bwd_1024.cu +0 -15
  36. layer_norm/ln_bwd_1280.cu +0 -15
  37. layer_norm/ln_bwd_1536.cu +0 -15
  38. layer_norm/ln_bwd_2048.cu +0 -15
  39. layer_norm/ln_bwd_256.cu +0 -15
  40. layer_norm/ln_bwd_2560.cu +0 -15
  41. layer_norm/ln_bwd_3072.cu +0 -15
  42. layer_norm/ln_bwd_4096.cu +0 -15
  43. layer_norm/ln_bwd_512.cu +0 -15
  44. layer_norm/ln_bwd_5120.cu +0 -15
  45. layer_norm/ln_bwd_6144.cu +0 -15
  46. layer_norm/ln_bwd_7168.cu +0 -15
  47. layer_norm/ln_bwd_768.cu +0 -15
  48. layer_norm/ln_bwd_8192.cu +0 -15
  49. layer_norm/ln_bwd_kernels.cuh +0 -534
  50. layer_norm/ln_fwd_1024.cu +0 -15
.gitignore CHANGED
@@ -1,2 +1,3 @@
  __pycache__/
+ **/__pycache__/
  *.pyc
CMakeLists.txt DELETED
@@ -1,213 +0,0 @@
- cmake_minimum_required(VERSION 3.26)
- project(layer_norm LANGUAGES CXX)
-
- set(TARGET_DEVICE "cuda" CACHE STRING "Target device backend for kernel")
-
- install(CODE "set(CMAKE_INSTALL_LOCAL_ONLY TRUE)" ALL_COMPONENTS)
-
- include(FetchContent)
- file(MAKE_DIRECTORY ${FETCHCONTENT_BASE_DIR}) # Ensure the directory exists
- message(STATUS "FetchContent base directory: ${FETCHCONTENT_BASE_DIR}")
-
- set(CUDA_SUPPORTED_ARCHS "7.0;7.2;7.5;8.0;8.6;8.7;8.9;9.0;10.0;10.1;12.0")
-
- set(HIP_SUPPORTED_ARCHS "gfx906;gfx908;gfx90a;gfx940;gfx941;gfx942;gfx1030;gfx1100;gfx1101")
-
- include(${CMAKE_CURRENT_LIST_DIR}/cmake/utils.cmake)
-
- if(DEFINED Python_EXECUTABLE)
-   # Allow passing through the interpreter (e.g. from setup.py).
-   find_package(Python COMPONENTS Development Development.SABIModule Interpreter)
-   if (NOT Python_FOUND)
-     message(FATAL_ERROR "Unable to find python matching: ${EXECUTABLE}.")
-   endif()
- else()
-   find_package(Python REQUIRED COMPONENTS Development Development.SABIModule Interpreter)
- endif()
-
- append_cmake_prefix_path("torch" "torch.utils.cmake_prefix_path")
-
- find_package(Torch REQUIRED)
-
- if (NOT TARGET_DEVICE STREQUAL "cuda" AND
-     NOT TARGET_DEVICE STREQUAL "rocm")
-     return()
- endif()
-
- if(DEFINED CMAKE_CUDA_COMPILER_VERSION AND
-    CMAKE_CUDA_COMPILER_VERSION VERSION_GREATER_EQUAL 12.8)
-   set(CUDA_DEFAULT_KERNEL_ARCHS "7.0;7.2;7.5;8.0;8.6;8.7;8.9;9.0;10.0;10.1;12.0+PTX")
- else()
-   set(CUDA_DEFAULT_KERNEL_ARCHS "7.0;7.2;7.5;8.0;8.6;8.7;8.9;9.0+PTX")
- endif()
-
- if (NOT HIP_FOUND AND CUDA_FOUND)
-   set(GPU_LANG "CUDA")
-
-
-
- elseif(HIP_FOUND)
-   set(GPU_LANG "HIP")
-
-   # Importing torch recognizes and sets up some HIP/ROCm configuration but does
-   # not let cmake recognize .hip files. In order to get cmake to understand the
-   # .hip extension automatically, HIP must be enabled explicitly.
-   enable_language(HIP)
- else()
-   message(FATAL_ERROR "Can't find CUDA or HIP installation.")
- endif()
-
- if(GPU_LANG STREQUAL "CUDA")
-   clear_cuda_arches(CUDA_ARCH_FLAGS)
-   extract_unique_cuda_archs_ascending(CUDA_ARCHS "${CUDA_ARCH_FLAGS}")
-   message(STATUS "CUDA target architectures: ${CUDA_ARCHS}")
-   # Filter the target architectures by the supported supported archs
-   # since for some files we will build for all CUDA_ARCHS.
-   cuda_archs_loose_intersection(CUDA_ARCHS "${CUDA_SUPPORTED_ARCHS}" "${CUDA_ARCHS}")
-   message(STATUS "CUDA supported target architectures: ${CUDA_ARCHS}")
-
-   if(NVCC_THREADS AND GPU_LANG STREQUAL "CUDA")
-     list(APPEND GPU_FLAGS "--threads=${NVCC_THREADS}")
-   endif()
-
-   add_compile_definitions(CUDA_KERNEL)
- elseif(GPU_LANG STREQUAL "HIP")
-   set(ROCM_ARCHS "${HIP_SUPPORTED_ARCHS}")
-   # TODO: remove this once we can set specific archs per source file set.
-   override_gpu_arches(GPU_ARCHES
-     ${GPU_LANG}
-     "${${GPU_LANG}_SUPPORTED_ARCHS}")
-
-   add_compile_definitions(ROCM_KERNEL)
- else()
-   override_gpu_arches(GPU_ARCHES
-     ${GPU_LANG}
-     "${${GPU_LANG}_SUPPORTED_ARCHS}")
- endif()
-
- get_torch_gpu_compiler_flags(TORCH_GPU_FLAGS ${GPU_LANG})
- list(APPEND GPU_FLAGS ${TORCH_GPU_FLAGS})
-
- set(TORCH_layer_norm_SRC
-   torch-ext/torch_binding.cpp torch-ext/torch_binding.h
- )
-
-
- list(APPEND SRC "${TORCH_layer_norm_SRC}")
-
-
- set(layer_norm_SRC
-   "layer_norm/ln.h"
-   "layer_norm/ln_api.cpp"
-   "layer_norm/ln_bwd_1024.cu"
-   "layer_norm/ln_bwd_1280.cu"
-   "layer_norm/ln_bwd_1536.cu"
-   "layer_norm/ln_bwd_2048.cu"
-   "layer_norm/ln_bwd_256.cu"
-   "layer_norm/ln_bwd_2560.cu"
-   "layer_norm/ln_bwd_3072.cu"
-   "layer_norm/ln_bwd_4096.cu"
-   "layer_norm/ln_bwd_512.cu"
-   "layer_norm/ln_bwd_5120.cu"
-   "layer_norm/ln_bwd_6144.cu"
-   "layer_norm/ln_bwd_7168.cu"
-   "layer_norm/ln_bwd_768.cu"
-   "layer_norm/ln_bwd_8192.cu"
-   "layer_norm/ln_bwd_kernels.cuh"
-   "layer_norm/ln_fwd_1024.cu"
-   "layer_norm/ln_fwd_1280.cu"
-   "layer_norm/ln_fwd_1536.cu"
-   "layer_norm/ln_fwd_2048.cu"
-   "layer_norm/ln_fwd_256.cu"
-   "layer_norm/ln_fwd_2560.cu"
-   "layer_norm/ln_fwd_3072.cu"
-   "layer_norm/ln_fwd_4096.cu"
-   "layer_norm/ln_fwd_512.cu"
-   "layer_norm/ln_fwd_5120.cu"
-   "layer_norm/ln_fwd_6144.cu"
-   "layer_norm/ln_fwd_7168.cu"
-   "layer_norm/ln_fwd_768.cu"
-   "layer_norm/ln_fwd_8192.cu"
-   "layer_norm/ln_fwd_kernels.cuh"
-   "layer_norm/ln_kernel_traits.h"
-   "layer_norm/ln_parallel_bwd_1024.cu"
-   "layer_norm/ln_parallel_bwd_1280.cu"
-   "layer_norm/ln_parallel_bwd_1536.cu"
-   "layer_norm/ln_parallel_bwd_2048.cu"
-   "layer_norm/ln_parallel_bwd_256.cu"
-   "layer_norm/ln_parallel_bwd_2560.cu"
-   "layer_norm/ln_parallel_bwd_3072.cu"
-   "layer_norm/ln_parallel_bwd_4096.cu"
-   "layer_norm/ln_parallel_bwd_512.cu"
-   "layer_norm/ln_parallel_bwd_5120.cu"
-   "layer_norm/ln_parallel_bwd_6144.cu"
-   "layer_norm/ln_parallel_bwd_7168.cu"
-   "layer_norm/ln_parallel_bwd_768.cu"
-   "layer_norm/ln_parallel_bwd_8192.cu"
-   "layer_norm/ln_parallel_fwd_1024.cu"
-   "layer_norm/ln_parallel_fwd_1280.cu"
-   "layer_norm/ln_parallel_fwd_1536.cu"
-   "layer_norm/ln_parallel_fwd_2048.cu"
-   "layer_norm/ln_parallel_fwd_256.cu"
-   "layer_norm/ln_parallel_fwd_2560.cu"
-   "layer_norm/ln_parallel_fwd_3072.cu"
-   "layer_norm/ln_parallel_fwd_4096.cu"
-   "layer_norm/ln_parallel_fwd_512.cu"
-   "layer_norm/ln_parallel_fwd_5120.cu"
-   "layer_norm/ln_parallel_fwd_6144.cu"
-   "layer_norm/ln_parallel_fwd_7168.cu"
-   "layer_norm/ln_parallel_fwd_768.cu"
-   "layer_norm/ln_parallel_fwd_8192.cu"
-   "layer_norm/ln_parallel_residual_bwd_kernels.cuh"
-   "layer_norm/ln_parallel_residual_fwd_kernels.cuh"
-   "layer_norm/ln_utils.cuh"
-   "layer_norm/static_switch.h"
- )
-
- # TODO: check if CLion support this:
- # https://youtrack.jetbrains.com/issue/CPP-16510/CLion-does-not-handle-per-file-include-directories
- set_source_files_properties(
-   ${layer_norm_SRC}
-   PROPERTIES INCLUDE_DIRECTORIES "${CMAKE_SOURCE_DIR}/.")
-
- if(GPU_LANG STREQUAL "CUDA")
-   cuda_archs_loose_intersection(layer_norm_ARCHS "${CUDA_DEFAULT_KERNEL_ARCHS}" "${CUDA_ARCHS}")
-   message(STATUS "Capabilities for kernel layer_norm: ${layer_norm_ARCHS}")
-   set_gencode_flags_for_srcs(SRCS "${layer_norm_SRC}" CUDA_ARCHS "${layer_norm_ARCHS}")
-
-
-   foreach(_KERNEL_SRC ${layer_norm_SRC})
-     if(_KERNEL_SRC MATCHES ".*\\.cu$")
-       set_property(
-         SOURCE ${_KERNEL_SRC}
-         APPEND PROPERTY
-         COMPILE_OPTIONS "$<$<COMPILE_LANGUAGE:CUDA>:-O3;-U__CUDA_NO_HALF_OPERATORS__;-U__CUDA_NO_HALF_CONVERSIONS__;-U__CUDA_NO_BFLOAT16_OPERATORS__;-U__CUDA_NO_BFLOAT16_CONVERSIONS__;-U__CUDA_NO_BFLOAT162_OPERATORS__;-U__CUDA_NO_BFLOAT162_CONVERSIONS__;--expt-relaxed-constexpr;--expt-extended-lambda;--use_fast_math>"
-       )
-     endif()
-   endforeach()
-
-   foreach(_KERNEL_SRC ${layer_norm_SRC})
-     set_property(
-       SOURCE ${_KERNEL_SRC}
-       APPEND PROPERTY
-       COMPILE_OPTIONS "$<$<COMPILE_LANGUAGE:CXX>:-DFLASHATTENTION_DISABLE_PYBIND>"
-     )
-   endforeach()
-
-   list(APPEND SRC "${layer_norm_SRC}")
- endif()
-
-
- define_gpu_extension_target(
-   _layer_norm_711aa42_dirty
-   DESTINATION _layer_norm_711aa42_dirty
-   LANGUAGE ${GPU_LANG}
-   SOURCES ${SRC}
-   COMPILE_FLAGS ${GPU_FLAGS}
-   ARCHITECTURES ${GPU_ARCHES}
-   #INCLUDE_DIRECTORIES ${CUTLASS_INCLUDE_DIR}
-   USE_SABI 3
-   WITH_SOABI)
-
- target_link_options(_layer_norm_711aa42_dirty PRIVATE -static-libstdc++)
-
README.md CHANGED
@@ -2,23 +2,4 @@
  tags:
  - kernel
  ---
- This CUDA extension implements fused dropout + residual + LayerNorm, building on
- Apex's [FastLayerNorm](https://github.com/NVIDIA/apex/tree/master/apex/contrib/layer_norm).
- Major changes:
- - Add dropout and residual.
- - Make it work for both pre-norm and post-norm architecture.
- - Support more hidden dimensions (all dimensions divisible by 8, up to 8192).
- - Implement RMSNorm as an option.
- - Support layer norm with parallel residual (e.g., GPT-J, GPT-NeoX, PaLM).
-
- If you want to use it for dimensions larger than 8k, please file an issue.
-
- This extension has only been tested on A100s.
-
- ```sh
- cd csrc/layer_norm && pip install .
- ```
-
- As of 2024-01-05, this extension is no longer used in the FlashAttention repo.
- We've instead switched to a Triton-based
- [implementation](https://github.com/Dao-AILab/flash-attention/blob/main/flash_attn/ops/triton/layer_norm.py).
+ This CUDA extension implements fused dropout + residual + LayerNorm from the [flash-attention](https://github.com/Dao-AILab/flash-attention/tree/main/csrc/layer_norm) repo.
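
For orientation, repos with this `build/torchXX-cxx11-cuYY-...` layout are typically consumed through the Hugging Face `kernels` package, which selects the prebuilt artifact matching the local Torch/CUDA combination. A minimal sketch; the repo id below is a placeholder, not something this commit confirms:

```python
# Hypothetical consumer-side sketch; "kernels-community/layer-norm" is a
# placeholder repo id, and shapes/dtypes are illustrative.
import torch
from kernels import get_kernel

layer_norm = get_kernel("kernels-community/layer-norm")  # placeholder id

hidden = 1024
x = torch.randn(8 * 128, hidden, device="cuda", dtype=torch.float16)
gamma = torch.ones(hidden, device="cuda", dtype=torch.float16)

# The loaded module exposes the wrappers added by this commit
# (see the torch29 __init__.py further down), e.g. dropout_add_ln_fwd.
out = layer_norm.dropout_add_ln_fwd(
    x, gamma, None, None, None, None, None,
    0.0, 1e-5, 1.0, x.shape[0], None, False, False,
)
z = out[0]
```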
api.py DELETED
@@ -1,800 +0,0 @@
- # Copyright (c) 2022, Tri Dao.
- # Adapted from https://github.com/NVIDIA/apex/blob/master/apex/contrib/layer_norm/layer_norm.py
-
- import dropout_layer_norm
- import torch
- from torch.nn import init
-
-
- def maybe_align(x, alignment_in_bytes=16):
-     """Assume that x already has last dim divisible by alignment_in_bytes"""
-     # TD [2023-07-04] I'm not 100% sure that clone will align the memory
-     # https://discuss.pytorch.org/t/how-to-ensure-that-tensor-data-ptr-is-aligned-to-16-bytes/183440
-     return x if x.data_ptr() % alignment_in_bytes == 0 else x.clone()
-
-
- def _dropout_add_layer_norm_forward(
-     x0,
-     residual,
-     gamma,
-     beta,
-     rowscale,
-     colscale,
-     dropout_p,
-     epsilon,
-     residual_in_fp32=False,
-     is_rms_norm=False,
- ):
-     """Assume that arguments are contiguous and aligned to 16 bytes"""
-     hidden_size = gamma.numel()
-     x0mat = x0.view((-1, hidden_size))
-     residualmat = residual.view((-1, hidden_size)) if residual is not None else None
-     rowscale = rowscale.view(-1) if rowscale is not None else None
-     zmat, xmat, dmask, mu, rsigma = dropout_layer_norm.dropout_add_ln_fwd(
-         x0mat,
-         residualmat,
-         gamma,
-         beta,
-         rowscale,
-         colscale,
-         None,
-         None,
-         dropout_p,
-         epsilon,
-         1.0,
-         0,
-         None,
-         residual_in_fp32,
-         is_rms_norm,
-     )
-     # dmask is None if dropout_p == 0.0
-     # xmat is None if dropout_p == 0.0 and residual is None and residual_dtype != input_dtype
-     return zmat, xmat if xmat is not None else x0mat, dmask, mu, rsigma
-
-
- def _dropout_add_layer_norm_backward(
-     dz,
-     dx,
-     x,
-     x0,
-     dmask,
-     mu,
-     rsigma,
-     gamma,
-     rowscale,
-     colscale,
-     dropout_p,
-     has_residual,
-     is_rms_norm=False,
- ):
-     """Assume that arguments are contiguous and aligned to 16 bytes
-     dx == None means that it was a post-norm architecture
-     (x = drop(x0) + residual was not returned in the fwd).
-     x0 must not be None if we have colscale.
-     """
-     hidden_size = gamma.numel()
-     xmat = x.view((-1, hidden_size))
-     dzmat = dz.view(xmat.shape)
-     dxmat = dx.view(xmat.shape) if dx is not None else None
-     x0mat = x0.view((-1, hidden_size)) if x0 is not None else None
-     rowscale = rowscale.view(-1) if rowscale is not None else None
-     if colscale is not None:
-         assert x0 is not None, "x0 is required to compute the gradient of colscale"
-     dx0mat, dresidualmat, dgamma, dbeta, _, _, *rest = dropout_layer_norm.dropout_add_ln_bwd(
-         dzmat,
-         dxmat,
-         xmat,
-         x0mat,
-         dmask,
-         mu,
-         rsigma,
-         gamma,
-         rowscale,
-         colscale,
-         None,
-         None,
-         dropout_p,
-         1.0,
-         0,
-         has_residual,
-         is_rms_norm,
-     )
-     # dresidualmat is None if not has_residual
-     if colscale is None:
-         return dx0mat, dresidualmat, dgamma, dbeta
-     else:
-         dcolscale = rest[0]
-         return dx0mat, dresidualmat, dgamma, dbeta, dcolscale
-
-
- def _dropout_add_layer_norm_subset_forward(
-     x0,
-     residual,
-     gamma,
-     beta,
-     colscale,
-     x0_subset,
-     out_subset,
-     dropout_p,
-     epsilon,
-     rowscale_const,
-     out_numrows,
-     residual_in_fp32=False,
-     is_rms_norm=False,
- ):
-     """Assume that arguments are contiguous and aligned to 16 bytes"""
-     hidden_size = gamma.numel()
-     x0mat = x0.view((-1, hidden_size))
-     residualmat = residual.view((-1, hidden_size)) if residual is not None else None
-     x0_subset = x0_subset.view(-1) if x0_subset is not None else None
-     out_subset = out_subset.view(-1) if out_subset is not None else None
-     zmat, xmat, dmask, mu, rsigma = dropout_layer_norm.dropout_add_ln_fwd(
-         x0mat,
-         residualmat,
-         gamma,
-         beta,
-         None,
-         colscale,
-         x0_subset,
-         out_subset,
-         dropout_p,
-         epsilon,
-         rowscale_const,
-         out_numrows,
-         None,
-         residual_in_fp32,
-         is_rms_norm,
-     )
-     # dmask is None if dropout_p == 0.0
-     # xmat is None if dropout_p == 0.0 and residual is None and residual_dtype != input_dtype
-     return zmat, xmat if xmat is not None else x0mat, dmask, mu, rsigma
-
-
- def _dropout_add_layer_norm_subset_backward(
-     dz,
-     dx,
-     x,
-     x0,
-     dmask,
-     mu,
-     rsigma,
-     gamma,
-     colscale,
-     x0_subset,
-     out_subset,
-     dropout_p,
-     rowscale_const,
-     x0_numrows,
-     has_residual,
-     is_rms_norm=False,
- ):
-     """Assume that arguments are contiguous and aligned to 16 bytes
-     dx == None means that it was a post-norm architecture
-     (x = drop(x0) + residual was not returned in the fwd).
-     x0 must not be None if we have colscale.
-     """
-     hidden_size = gamma.numel()
-     xmat = x.view((-1, hidden_size))
-     dzmat = dz.view(-1, hidden_size)
-     dxmat = dx.view(xmat.shape) if dx is not None else None
-     x0mat = x0.view((-1, hidden_size)) if x0 is not None else None
-     x0_subset = x0_subset.view(-1) if x0_subset is not None else None
-     out_subset = out_subset.view(-1) if out_subset is not None else None
-     if colscale is not None:
-         assert x0 is not None, "x0 is required to compute the gradient of colscale"
-     dx0mat, dresidualmat, dgamma, dbeta, _, _, *rest = dropout_layer_norm.dropout_add_ln_bwd(
-         dzmat,
-         dxmat,
-         xmat,
-         x0mat,
-         dmask,
-         mu,
-         rsigma,
-         gamma,
-         None,
-         colscale,
-         x0_subset,
-         out_subset,
-         dropout_p,
-         rowscale_const,
-         x0_numrows,
-         has_residual,
-         is_rms_norm,
-     )
-     # dresidualmat is None if not has_residual
-     if colscale is None:
-         return dx0mat, dresidualmat, dgamma, dbeta
-     else:
-         dcolscale = rest[0]
-         return dx0mat, dresidualmat, dgamma, dbeta, dcolscale
-
-
- def _dropout_add_layer_norm_parallel_residual_forward(
-     x0,
-     x1,
-     residual,
-     gamma0,
-     beta0,
-     gamma1,
-     beta1,
-     dropout_p,
-     epsilon,
-     residual_in_fp32=False,
-     is_rms_norm=False,
- ):
-     """Assume that arguments are contiguous and aligned to 16 bytes"""
-     hidden_size = gamma0.numel()
-     x0mat = x0.view((-1, hidden_size))
-     x1mat = x1.view((-1, hidden_size)) if x1 is not None else None
-     residualmat = residual.view((-1, hidden_size)) if residual is not None else None
-     (
-         z0mat,
-         z1mat,
-         xmat,
-         dmask0,
-         dmask1,
-         mu,
-         rsigma,
-     ) = dropout_layer_norm.dropout_add_ln_parallel_residual_fwd(
-         x0mat,
-         x1mat,
-         residualmat,
-         gamma0,
-         beta0,
-         gamma1,
-         beta1,
-         dropout_p,
-         epsilon,
-         None,
-         residual_in_fp32,
-         is_rms_norm,
-     )
-     # dmask0 and dmask1 are None if dropout_p == 0.0
-     # xmat is None if dropout_p == 0.0 and residual is None and residual_dtype != input_dtype
-     return z0mat, z1mat, xmat if xmat is not None else x0mat, dmask0, dmask1, mu, rsigma
-
-
- def _dropout_add_layer_norm_parallel_residual_backward(
-     dz0,
-     dz1,
-     dx,
-     x,
-     dmask0,
-     dmask1,
-     mu,
-     rsigma,
-     gamma0,
-     gamma1,
-     dropout_p,
-     has_x1,
-     has_residual,
-     is_rms_norm=False,
- ):
-     """Assume that arguments are contiguous and aligned to 16 bytes
-     dx == None means that it was a post-norm architecture
-     (x = drop(x0) + residual was not returned in the fwd).
-     """
-     hidden_size = gamma0.numel()
-     xmat = x.view((-1, hidden_size))
-     dz0mat = dz0.view(xmat.shape)
-     dz1mat = dz1.view(xmat.shape) if dz1 is not None else None
-     dxmat = dx.view(xmat.shape) if dx is not None else None
-     (
-         dx0mat,
-         dx1mat,
-         dresidualmat,
-         dgamma0,
-         dbeta0,
-         dgamma1,
-         dbeta1,
-         *rest,
-     ) = dropout_layer_norm.dropout_add_ln_parallel_residual_bwd(
-         dz0mat,
-         dz1mat,
-         dxmat,
-         xmat,
-         dmask0,
-         dmask1,
-         mu,
-         rsigma,
-         gamma0,
-         gamma1,
-         dropout_p,
-         has_x1,
-         has_residual,
-         is_rms_norm,
-     )
-     # dresidualmat is None if not has_residual
-     return dx0mat, dx1mat, dresidualmat, dgamma0, dbeta0, dgamma1, dbeta1
-
-
- class DropoutAddLayerNormFn(torch.autograd.Function):
-     @staticmethod
-     def forward(
-         ctx,
-         x0,
-         residual,
-         gamma,
-         beta,
-         rowscale,
-         colscale,
-         dropout_p,
-         epsilon,
-         residual_in_fp32=False,
-         prenorm=False,
-         is_rms_norm=False,
-         return_dmask=False,
-     ):
-         x0 = maybe_align(x0.contiguous(), 16)
-         residual = maybe_align(residual.contiguous(), 16) if residual is not None else None
-         gamma = maybe_align(gamma.contiguous(), 16)
-         beta = maybe_align(beta.contiguous(), 16) if beta is not None else None
-         rowscale = maybe_align(rowscale.contiguous(), 16) if rowscale is not None else None
-         colscale = maybe_align(colscale.contiguous(), 16) if colscale is not None else None
-         zmat, xmat, dmask, mu, rsigma = _dropout_add_layer_norm_forward(
-             x0,
-             residual,
-             gamma,
-             beta,
-             rowscale,
-             colscale,
-             dropout_p,
-             epsilon,
-             residual_in_fp32,
-             is_rms_norm,
-         )
-         # Only need to save x0 if we need to compute gradient wrt colscale
-         x0_saved = x0 if colscale is not None else None
-         ctx.save_for_backward(
-             xmat.view(x0.shape), x0_saved, dmask, gamma, mu, rsigma, rowscale, colscale
-         )
-         ctx.prenorm = prenorm
-         ctx.dropout_p = dropout_p
-         ctx.has_residual = residual is not None
-         ctx.is_rms_norm = is_rms_norm
-         ctx.has_beta = beta is not None
-         if not return_dmask:
-             return (
-                 zmat.view(x0.shape) if not prenorm else (zmat.view(x0.shape), xmat.view(x0.shape))
-             )
-         else:
-             dmask = (
-                 dmask.view(x0.shape)
-                 if dropout_p > 0.0
-                 else torch.ones(x0.shape, dtype=torch.uint8, device=x0.device)
-             )
-             ctx.mark_non_differentiable(dmask)
-             return (
-                 (zmat.view(x0.shape), dmask)
-                 if not prenorm
-                 else (zmat.view(x0.shape), xmat.view(x0.shape), dmask)
-             )
-
-     @staticmethod
-     def backward(ctx, dz, *args):
-         # assert dz.is_contiguous()
-         dz = maybe_align(dz.contiguous(), 16)  # this happens!
-         dx = maybe_align(args[0].contiguous(), 16) if ctx.prenorm else None
-         x, x0, dmask, gamma, mu, rsigma, rowscale, colscale = ctx.saved_tensors
-         # x0 is None if colscale is None
-         dropout_p = ctx.dropout_p
-         has_residual = ctx.has_residual
-         dx0mat, dresidualmat, dgamma, dbeta, *rest = _dropout_add_layer_norm_backward(
-             dz,
-             dx,
-             x,
-             x0,
-             dmask,
-             mu,
-             rsigma,
-             gamma,
-             rowscale,
-             colscale,
-             dropout_p,
-             has_residual,
-             ctx.is_rms_norm,
-         )
-         dx0 = dx0mat.view(x.shape)
-         dresidual = dresidualmat.view(x.shape) if dresidualmat is not None else None
-         dcolscale = rest[0] if colscale is not None else None
-         return (
-             dx0,
-             dresidual,
-             dgamma,
-             dbeta if ctx.has_beta else None,
-             None,
-             dcolscale,
-             None,
-             None,
-             None,
-             None,
-             None,
-             None,
-         )
-
-
- class DropoutAddLayerNormSubsetFn(torch.autograd.Function):
-     @staticmethod
-     def forward(
-         ctx,
-         x0,
-         residual,
-         gamma,
-         beta,
-         colscale,
-         x0_subset,
-         out_subset,
-         dropout_p,
-         epsilon,
-         rowscale_const,
-         out_numrows,
-         residual_in_fp32=False,
-         prenorm=False,
-         is_rms_norm=False,
-         return_dmask=False,
-     ):
-         x0 = maybe_align(x0.contiguous(), 16)
-         residual = maybe_align(residual.contiguous(), 16) if residual is not None else None
-         gamma = maybe_align(gamma.contiguous(), 16)
-         beta = maybe_align(beta.contiguous(), 16) if beta is not None else None
-         colscale = maybe_align(colscale.contiguous(), 16) if colscale is not None else None
-         zmat, xmat, dmask, mu, rsigma = _dropout_add_layer_norm_subset_forward(
-             x0,
-             residual,
-             gamma,
-             beta,
-             colscale,
-             x0_subset,
-             out_subset,
-             dropout_p,
-             epsilon,
-             rowscale_const,
-             out_numrows,
-             residual_in_fp32,
-             is_rms_norm,
-         )
-         # Only need to save x0 if we need to compute gradient wrt colscale
-         x0_saved = x0 if colscale is not None else None
-         x_shape = (-1, *x0.shape[1:])
-         ctx.save_for_backward(
-             xmat.view(x_shape), x0_saved, dmask, gamma, mu, rsigma, colscale, x0_subset, out_subset
-         )
-         ctx.prenorm = prenorm
-         ctx.dropout_p = dropout_p
-         ctx.rowscale_const = rowscale_const
-         ctx.x0_numrows = x0.shape[:-1].numel()
-         ctx.has_residual = residual is not None
-         ctx.is_rms_norm = is_rms_norm
-         ctx.has_beta = beta is not None
-         z_shape = (-1, *x0.shape[1:])
-         if not return_dmask:
-             return zmat.view(z_shape) if not prenorm else (zmat.view(z_shape), xmat.view(x0.shape))
-         else:
-             z = zmat.view(z_shape)
-             dmask = (
-                 dmask.view(x0.shape)
-                 if dropout_p > 0.0
-                 else torch.ones(x0.shape, dtype=torch.uint8, device=x0.device)
-             )
-             ctx.mark_non_differentiable(dmask)
-             return (z, dmask) if not prenorm else (z, xmat.view(x_shape), dmask)
-
-     @staticmethod
-     def backward(ctx, dz, *args):
-         # assert dz.is_contiguous()
-         dz = maybe_align(dz.contiguous(), 16)  # this happens!
-         dx = maybe_align(args[0].contiguous(), 16) if ctx.prenorm else None
-         x, x0, dmask, gamma, mu, rsigma, colscale, x0_subset, out_subset = ctx.saved_tensors
-         # x0 is None if colscale is None
-         dropout_p = ctx.dropout_p
-         has_residual = ctx.has_residual
-         dx0mat, dresidualmat, dgamma, dbeta, *rest = _dropout_add_layer_norm_subset_backward(
-             dz,
-             dx,
-             x,
-             x0,
-             dmask,
-             mu,
-             rsigma,
-             gamma,
-             colscale,
-             x0_subset,
-             out_subset,
-             dropout_p,
-             ctx.rowscale_const,
-             ctx.x0_numrows,
-             has_residual,
-             ctx.is_rms_norm,
-         )
-         dx0 = dx0mat.view(-1, *x.shape[1:])
-         dresidual = dresidualmat.view(x.shape) if dresidualmat is not None else None
-         dcolscale = rest[0] if colscale is not None else None
-         return (
-             dx0,
-             dresidual,
-             dgamma,
-             dbeta if ctx.has_beta else None,
-             dcolscale,
-             None,
-             None,
-             None,
-             None,
-             None,
-             None,
-             None,
-             None,
-             None,
-             None,
-         )
-
-
- class DropoutAddLayerNormParallelResidualFn(torch.autograd.Function):
-     @staticmethod
-     def forward(
-         ctx,
-         x0,
-         x1,
-         residual,
-         gamma0,
-         beta0,
-         gamma1,
-         beta1,
-         dropout_p,
-         epsilon,
-         residual_in_fp32=False,
-         prenorm=False,
-         is_rms_norm=False,
-         return_dmask=False,
-     ):
-         x0 = maybe_align(x0.contiguous(), 16)
-         x1 = maybe_align(x1.contiguous(), 16) if x1 is not None else None
-         residual = maybe_align(residual.contiguous(), 16) if residual is not None else None
-         gamma0 = maybe_align(gamma0.contiguous(), 16)
-         beta0 = maybe_align(beta0.contiguous(), 16) if beta0 is not None else None
-         gamma1 = maybe_align(gamma1.contiguous(), 16) if gamma1 is not None else None
-         beta1 = maybe_align(beta1.contiguous(), 16) if beta1 is not None else None
-         (
-             z0mat,
-             z1mat,
-             xmat,
-             dmask0,
-             dmask1,
-             mu,
-             rsigma,
-         ) = _dropout_add_layer_norm_parallel_residual_forward(
-             x0,
-             x1,
-             residual,
-             gamma0,
-             beta0,
-             gamma1,
-             beta1,
-             dropout_p,
-             epsilon,
-             residual_in_fp32,
-             is_rms_norm,
-         )
-         ctx.save_for_backward(xmat.view(x0.shape), dmask0, dmask1, gamma0, gamma1, mu, rsigma)
-         ctx.prenorm = prenorm
-         ctx.dropout_p = dropout_p
-         ctx.has_x1 = x1 is not None
-         ctx.has_residual = residual is not None
-         ctx.is_rms_norm = is_rms_norm
-         ctx.has_beta = beta0 is not None
-         z = (z0mat.view(x0.shape), z1mat.view(x0.shape) if z1mat is not None else None)
-         if not return_dmask:
-             return z if not prenorm else (*z, xmat.view(x0.shape))
-         else:
-             dmask0 = (
-                 dmask0.view(x0.shape)
-                 if dropout_p > 0.0
-                 else torch.ones(x0.shape, dtype=torch.uint8, device=x0.device)
-             )
-             dmask1 = (
-                 dmask1.view(x0.shape)
-                 if dropout_p > 0.0 and x1 is not None
-                 else torch.ones(x0.shape, dtype=torch.uint8, device=x0.device)
-             )
-             ctx.mark_non_differentiable(dmask0)
-             ctx.mark_non_differentiable(dmask1)
-             return (
-                 (*z, dmask0, dmask1) if not prenorm else (*z, xmat.view(x0.shape), dmask0, dmask1)
-             )
-
-     @staticmethod
-     def backward(ctx, dz0, dz1, *args):
-         dz0 = maybe_align(dz0.contiguous(), 16)  # this happens!
-         dz1 = maybe_align(dz1.contiguous(), 16) if dz1 is not None else None
-         dx = maybe_align(args[0].contiguous(), 16) if ctx.prenorm else None
-         x, dmask0, dmask1, gamma0, gamma1, mu, rsigma = ctx.saved_tensors
-         dropout_p = ctx.dropout_p
-         has_x1 = ctx.has_x1
-         has_residual = ctx.has_residual
-         (
-             dx0mat,
-             dx1mat,
-             dresidualmat,
-             dgamma0,
-             dbeta0,
-             dgamma1,
-             dbeta1,
-         ) = _dropout_add_layer_norm_parallel_residual_backward(
-             dz0,
-             dz1,
-             dx,
-             x,
-             dmask0,
-             dmask1,
-             mu,
-             rsigma,
-             gamma0,
-             gamma1,
-             dropout_p,
-             has_x1,
-             has_residual,
-             ctx.is_rms_norm,
-         )
-         dx0 = dx0mat.view(x.shape)
-         dx1 = dx1mat.view(x.shape) if dx1mat is not None else None
-         dresidual = dresidualmat.view(x.shape) if dresidualmat is not None else None
-         return (
-             dx0,
-             dx1,
-             dresidual,
-             dgamma0,
-             dbeta0 if ctx.has_beta else None,
-             dgamma1,
-             dbeta1 if ctx.has_beta else None,
-             None,
-             None,
-             None,
-             None,
-             None,
-             None,
-         )
-
-
- def layer_norm(x, weight, bias, epsilon):
-     return DropoutAddLayerNormFn.apply(x, None, weight, bias, None, None, 0.0, epsilon, False)
-
-
- def dropout_add_layer_norm(
-     x0,
-     residual,
-     weight,
-     bias,
-     dropout_p,
-     epsilon,
-     rowscale=None,
-     layerscale=None,
-     prenorm=False,
-     residual_in_fp32=False,
-     return_dropout_mask=False,
- ):
-     """residual_in_fp32 only has an effect if residual is None.
-     Otherwise residual dtype is residual.dtype.
-     """
-     return DropoutAddLayerNormFn.apply(
-         x0,
-         residual,
-         weight,
-         bias,
-         rowscale,
-         layerscale,
-         dropout_p,
-         epsilon,
-         residual_in_fp32,
-         prenorm,
-         False,
-         return_dropout_mask,
-     )
-
-
- def dropout_add_layer_norm_subset(
-     x0,
-     residual,
-     weight,
-     bias,
-     dropout_p,
-     epsilon,
-     layerscale=None,
-     x0_subset=None,
-     out_subset=None,
-     rowscale_const=1.0,
-     out_numrows=0,
-     prenorm=False,
-     residual_in_fp32=False,
-     return_dropout_mask=False,
- ):
-     """residual_in_fp32 only has an effect if residual is None.
-     Otherwise residual dtype is residual.dtype.
-     """
-     return DropoutAddLayerNormSubsetFn.apply(
-         x0,
-         residual,
-         weight,
-         bias,
-         layerscale,
-         x0_subset,
-         out_subset,
-         dropout_p,
-         epsilon,
-         rowscale_const,
-         out_numrows,
-         residual_in_fp32,
-         prenorm,
-         False,
-         return_dropout_mask,
-     )
-
-
- def dropout_add_layer_norm_parallel_residual(
-     x0,
-     x1,
-     residual,
-     weight0,
-     bias0,
-     weight1,
-     bias1,
-     dropout_p,
-     epsilon,
-     prenorm=False,
-     residual_in_fp32=False,
-     return_dropout_mask=False,
- ):
-     """residual_in_fp32 only has an effect if residual is None.
-     Otherwise residual dtype is residual.dtype.
-     """
-     return DropoutAddLayerNormParallelResidualFn.apply(
-         x0,
-         x1,
-         residual,
-         weight0,
-         bias0,
-         weight1,
-         bias1,
-         dropout_p,
-         epsilon,
-         residual_in_fp32,
-         prenorm,
-         False,
-         return_dropout_mask,
-     )
-
-
- class DropoutAddLayerNorm(torch.nn.Module):
-     def __init__(
-         self,
-         hidden_size,
-         prenorm=False,
-         p=0.0,
-         eps=1e-5,
-         residual_in_fp32=False,
-         device=None,
-         dtype=None,
-     ):
-         factory_kwargs = {"device": device, "dtype": dtype}
-         super().__init__()
-         self.prenorm = prenorm
-         self.p = p
-         self.eps = eps
-         self.residual_in_fp32 = residual_in_fp32
-         self.weight = torch.nn.Parameter(torch.empty(hidden_size, **factory_kwargs))
-         self.bias = torch.nn.Parameter(torch.empty(hidden_size, **factory_kwargs))
-         self.reset_parameters()
-
-     def reset_parameters(self):
-         init.ones_(self.weight)
-         init.zeros_(self.bias)
-
-     def forward(self, x0, residual=None):
-         return dropout_add_layer_norm(
-             x0,
-             residual,
-             self.weight,
-             self.bias,
-             self.p if self.training else 0.0,
-             self.eps,
-             prenorm=self.prenorm,
-             residual_in_fp32=self.residual_in_fp32,
-         )
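
For reference, a minimal sketch of how the deleted module above was called, grounded in its own signatures and docstrings (shapes and hyperparameters are illustrative):

```python
# Sketch only: api.py (above) is removed by this commit.
import torch
from api import DropoutAddLayerNorm, dropout_add_layer_norm

hidden = 1024
x0 = torch.randn(8, 128, hidden, device="cuda", dtype=torch.float16)
residual = torch.randn_like(x0)
weight = torch.ones(hidden, device="cuda", dtype=torch.float16)
bias = torch.zeros(hidden, device="cuda", dtype=torch.float16)

# Functional form: z = LayerNorm(dropout(x0) + residual).
# With prenorm=True the pre-normalization sum x is returned as well.
z, x = dropout_add_layer_norm(
    x0, residual, weight, bias, dropout_p=0.1, epsilon=1e-5, prenorm=True
)

# Module form with the same semantics.
ln = DropoutAddLayerNorm(hidden, p=0.1, device="cuda", dtype=torch.float16)
z = ln(x0, residual)
```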
build/{torch28-cxx11-cu126-x86_64-linux/layer_norm/_layer_norm_f622ea1_dirty.abi3.so → torch27-cxx11-cu118-x86_64-linux/layer_norm/_layer_norm_f3fd6bf.abi3.so} RENAMED
@@ -1,3 +1,3 @@
  version https://git-lfs.github.com/spec/v1
- oid sha256:12b6de6cef24c5ee7a390d91ee2ea7069533e66440cf78ae5df7ae3beff5c1ca
- size 712024936
+ oid sha256:790cd814bbfcaf7ff83b5c68bcb91091a67f34e92b9a2494e2856462e71a3141
+ size 716945944
build/torch27-cxx11-cu118-x86_64-linux/layer_norm/_ops.py CHANGED
@@ -1,9 +1,9 @@
  import torch
- from . import _layer_norm_f622ea1_dirty
- ops = torch.ops._layer_norm_f622ea1_dirty
+ from . import _layer_norm_f3fd6bf
+ ops = torch.ops._layer_norm_f3fd6bf

  def add_op_namespace_prefix(op_name: str):
      """
      Prefix op by namespace.
      """
-     return f"_layer_norm_f622ea1_dirty::{op_name}"
+     return f"_layer_norm_f3fd6bf::{op_name}"
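
The `_ops.py` shim is what binds the versioned shared object into `torch.ops`; the rename from `_layer_norm_f622ea1_dirty` to `_layer_norm_f3fd6bf` just tracks the new build hash. A small sketch of how such a shim is consumed (the op name comes from the bindings elsewhere in this repo):

```python
from layer_norm._ops import add_op_namespace_prefix, ops

# Fully qualified op name, e.g. for torch.library-style lookups:
print(add_op_namespace_prefix("dropout_add_ln_fwd"))
# -> "_layer_norm_f3fd6bf::dropout_add_ln_fwd"

# The native kernel itself is then reachable via the torch.ops namespace:
# ops.dropout_add_ln_fwd(...)
```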
build/{torch27-cxx11-cu118-x86_64-linux/layer_norm/_layer_norm_f622ea1_dirty.abi3.so → torch27-cxx11-cu126-x86_64-linux/layer_norm/_layer_norm_f3fd6bf.abi3.so} RENAMED
@@ -1,3 +1,3 @@
  version https://git-lfs.github.com/spec/v1
- oid sha256:fe0515daaf1bbfd1246d18bd5c1a5cd6f366059090a8b6e402955d06caaa6392
- size 716945976
+ oid sha256:b17984ef79fc9d6427c8efe0a8cc8f1f6e2777f9a8641b86556b7bb2359626ab
+ size 712024816
build/torch27-cxx11-cu126-x86_64-linux/layer_norm/_ops.py CHANGED
@@ -1,9 +1,9 @@
  import torch
- from . import _layer_norm_f622ea1_dirty
- ops = torch.ops._layer_norm_f622ea1_dirty
+ from . import _layer_norm_f3fd6bf
+ ops = torch.ops._layer_norm_f3fd6bf

  def add_op_namespace_prefix(op_name: str):
      """
      Prefix op by namespace.
      """
-     return f"_layer_norm_f622ea1_dirty::{op_name}"
+     return f"_layer_norm_f3fd6bf::{op_name}"
build/torch27-cxx11-cu128-x86_64-linux/layer_norm/{_layer_norm_f622ea1_dirty.abi3.so → _layer_norm_f3fd6bf.abi3.so} RENAMED
@@ -1,3 +1,3 @@
  version https://git-lfs.github.com/spec/v1
- oid sha256:ae0d54be8ee4e3ae33f47f0b27243c9cbd5668ff7756b1dfb5dcd9e2430f5a35
- size 1231333392
+ oid sha256:7629b13b777a390df75374fc60d85311679a56a5bbd9969e138822e5c0fe2b1e
+ size 1231333360
build/torch27-cxx11-cu128-x86_64-linux/layer_norm/_ops.py CHANGED
@@ -1,9 +1,9 @@
  import torch
- from . import _layer_norm_f622ea1_dirty
- ops = torch.ops._layer_norm_f622ea1_dirty
+ from . import _layer_norm_f3fd6bf
+ ops = torch.ops._layer_norm_f3fd6bf

  def add_op_namespace_prefix(op_name: str):
      """
      Prefix op by namespace.
      """
-     return f"_layer_norm_f622ea1_dirty::{op_name}"
+     return f"_layer_norm_f3fd6bf::{op_name}"
build/{torch27-cxx11-cu126-x86_64-linux/layer_norm/_layer_norm_f622ea1_dirty.abi3.so → torch28-cxx11-cu126-x86_64-linux/layer_norm/_layer_norm_f3fd6bf.abi3.so} RENAMED
@@ -1,3 +1,3 @@
  version https://git-lfs.github.com/spec/v1
- oid sha256:04095de2e4bf9cd03f9ec481084d0c9e9e0baa0bab17a0ec9715f22f69bdfd33
- size 712024848
+ oid sha256:d6ffc9d5651e8de6440f2d4f58018a5ded07634582ae03eec5b9edf428f613a6
+ size 712024904
build/torch28-cxx11-cu126-x86_64-linux/layer_norm/_ops.py CHANGED
@@ -1,9 +1,9 @@
  import torch
- from . import _layer_norm_f622ea1_dirty
- ops = torch.ops._layer_norm_f622ea1_dirty
+ from . import _layer_norm_f3fd6bf
+ ops = torch.ops._layer_norm_f3fd6bf

  def add_op_namespace_prefix(op_name: str):
      """
      Prefix op by namespace.
      """
-     return f"_layer_norm_f622ea1_dirty::{op_name}"
+     return f"_layer_norm_f3fd6bf::{op_name}"
build/torch28-cxx11-cu128-x86_64-linux/layer_norm/_layer_norm_f3fd6bf.abi3.so ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:df39795e047e962019cbecbb11f93d8ee1fcfb49ed8326f2edc267bc0d90da08
+ size 1231337936
build/torch28-cxx11-cu128-x86_64-linux/layer_norm/_layer_norm_f622ea1_dirty.abi3.so DELETED
@@ -1,3 +0,0 @@
- version https://git-lfs.github.com/spec/v1
- oid sha256:d51ec6b6da7095cf5fc18493eb4b0b1c20485f01dff4b38370979ea3d0a9dd60
- size 1231337968
build/torch28-cxx11-cu128-x86_64-linux/layer_norm/_ops.py CHANGED
@@ -1,9 +1,9 @@
  import torch
- from . import _layer_norm_f622ea1_dirty
- ops = torch.ops._layer_norm_f622ea1_dirty
+ from . import _layer_norm_f3fd6bf
+ ops = torch.ops._layer_norm_f3fd6bf

  def add_op_namespace_prefix(op_name: str):
      """
      Prefix op by namespace.
      """
-     return f"_layer_norm_f622ea1_dirty::{op_name}"
+     return f"_layer_norm_f3fd6bf::{op_name}"
build/torch28-cxx11-cu129-x86_64-linux/layer_norm/_layer_norm_f3fd6bf.abi3.so ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:dfef6947945f8f126a284c6a8ab861e180a5e628992eeb0b4b7c7914c50a59c2
+ size 1283037344
build/torch28-cxx11-cu129-x86_64-linux/layer_norm/_layer_norm_f622ea1_dirty.abi3.so DELETED
@@ -1,3 +0,0 @@
- version https://git-lfs.github.com/spec/v1
- oid sha256:9080934ece3b5e09db6178b1baa15b8baf9f6873e234a951a2122071e1190fba
- size 1283037376
build/torch28-cxx11-cu129-x86_64-linux/layer_norm/_ops.py CHANGED
@@ -1,9 +1,9 @@
  import torch
- from . import _layer_norm_f622ea1_dirty
- ops = torch.ops._layer_norm_f622ea1_dirty
+ from . import _layer_norm_f3fd6bf
+ ops = torch.ops._layer_norm_f3fd6bf

  def add_op_namespace_prefix(op_name: str):
      """
      Prefix op by namespace.
      """
-     return f"_layer_norm_f622ea1_dirty::{op_name}"
+     return f"_layer_norm_f3fd6bf::{op_name}"
{torch-ext → build/torch29-cxx11-cu126-x86_64-linux}/layer_norm/__init__.py RENAMED
File without changes
build/torch29-cxx11-cu126-x86_64-linux/layer_norm/_layer_norm_f3fd6bf.abi3.so ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:2bdb57c0889ade2fc574156873c1d4b543796f2e8ad6a894be82ee2785459c9b
+ size 712029160
build/torch29-cxx11-cu126-x86_64-linux/layer_norm/_ops.py ADDED
@@ -0,0 +1,9 @@
+ import torch
+ from . import _layer_norm_f3fd6bf
+ ops = torch.ops._layer_norm_f3fd6bf
+
+ def add_op_namespace_prefix(op_name: str):
+     """
+     Prefix op by namespace.
+     """
+     return f"_layer_norm_f3fd6bf::{op_name}"
{torch-ext → build/torch29-cxx11-cu126-x86_64-linux}/layer_norm/layers.py RENAMED
File without changes
build/torch29-cxx11-cu128-x86_64-linux/layer_norm/__init__.py ADDED
@@ -0,0 +1,26 @@
+ import torch
+ import torch.nn as nn
+
+ from ._ops import ops
+
+ from . import layers
+
+ def dropout_add_ln_fwd(input, gamma, beta, rowscale, colscale, x0_subset, z_subset, dropout_p, epsilon, rowscale_const, z_numrows, gen, residual_in_fp32, is_rms_norm):
+     return ops.dropout_add_ln_fwd(input, gamma, beta, rowscale, colscale, x0_subset, z_subset, dropout_p, epsilon, rowscale_const, z_numrows, gen, residual_in_fp32, is_rms_norm)
+
+ def dropout_add_ln_bwd(dz, dx, x, mu, rsigma, gamma, rowscale, colscale, x0_subset, z_subset, dropout_p, rowscale_const, x0_numrows, has_residual, is_rms_norm):
+     return ops.dropout_add_ln_bwd(dz, dx, x, mu, rsigma, gamma, rowscale, colscale, x0_subset, z_subset, dropout_p, rowscale_const, x0_numrows, has_residual, is_rms_norm)
+
+ def dropout_add_ln_parallel_residual_fwd(input, gamma0, beta0, gamma1, beta1, dropout_p, epsilon, gen, residual_in_fp32, is_rms_norm):
+     return ops.dropout_add_ln_parallel_residual_fwd(input, gamma0, beta0, gamma1, beta1, dropout_p, epsilon, gen, residual_in_fp32, is_rms_norm)
+
+ def dropout_add_ln_parallel_residual_bwd(dz0, dz1, dx, x, mu, rsigma, gamma0, gamma1, dropout_p, has_x1, has_residual, is_rms_norm):
+     return ops.dropout_add_ln_parallel_residual_bwd(dz0, dz1, dx, x, mu, rsigma, gamma0, gamma1, dropout_p, has_x1, has_residual, is_rms_norm)
+
+ __all__ = [
+     "layers",
+     "dropout_add_ln_fwd",
+     "dropout_add_ln_bwd",
+     "dropout_add_ln_parallel_residual_fwd",
+     "dropout_add_ln_parallel_residual_bwd",
+ ]
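
A minimal sketch of calling the functional wrapper above; passing `None` for every optional tensor and `dropout_p=0.0` gives a plain RMSNorm. Shapes, the eps value, and the reading of `gen` as an RNG generator are assumptions:

```python
# Sketch, assuming the package is importable as `layer_norm`.
import torch
import layer_norm

hidden = 1024
x = torch.randn(8 * 128, hidden, device="cuda", dtype=torch.float16)
gamma = torch.ones(hidden, device="cuda", dtype=torch.float16)

out = layer_norm.dropout_add_ln_fwd(
    x,           # input, flattened to (rows, hidden)
    gamma,       # scale
    None,        # beta (no bias)
    None, None,  # rowscale, colscale
    None, None,  # x0_subset, z_subset
    0.0,         # dropout_p
    1e-6,        # epsilon
    1.0,         # rowscale_const
    x.shape[0],  # z_numrows
    None,        # gen (assumed: RNG generator)
    False,       # residual_in_fp32
    True,        # is_rms_norm
)
z = out[0]  # normalized output, mirroring the layers.py usage below
```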
build/torch29-cxx11-cu128-x86_64-linux/layer_norm/_layer_norm_f3fd6bf.abi3.so ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:03e6e7ecbf276b306d89607100f78f2ce8b3385a77594676dbf0daabdce26fc7
+ size 1231338080
build/torch29-cxx11-cu128-x86_64-linux/layer_norm/_ops.py ADDED
@@ -0,0 +1,9 @@
+ import torch
+ from . import _layer_norm_f3fd6bf
+ ops = torch.ops._layer_norm_f3fd6bf
+
+ def add_op_namespace_prefix(op_name: str):
+     """
+     Prefix op by namespace.
+     """
+     return f"_layer_norm_f3fd6bf::{op_name}"
build/torch29-cxx11-cu128-x86_64-linux/layer_norm/layers.py ADDED
@@ -0,0 +1,51 @@
+ import torch
+ import torch.nn as nn
+
+ from ._ops import ops
+
+
+ class LayerNorm(nn.Module):
+     weight: torch.Tensor
+     variance_epsilon: float
+
+     def forward(self, hidden_states: torch.Tensor) -> torch.Tensor:
+         output = ops.dropout_add_ln_fwd(
+             hidden_states.view(-1, hidden_states.shape[-1]),
+             gamma = self.weight,
+             beta = None,
+             rowscale = None,
+             colscale = None,
+             x0_subset = None,
+             z_subset = None,
+             dropout_p = 0,
+             epsilon = self.variance_epsilon,
+             rowscale_const = 1.0,
+             z_numrows = hidden_states.shape[1],
+             gen = None,
+             residual_in_fp32 = False,
+             is_rms_norm = False,
+         )
+         return output[0].view(hidden_states.shape)
+
+ class LlamaRMSNorm(nn.Module):
+     weight: torch.Tensor
+     variance_epsilon: float
+
+     def forward(self, hidden_states: torch.Tensor) -> torch.Tensor:
+         output = ops.dropout_add_ln_fwd(
+             hidden_states.view(-1, hidden_states.shape[-1]),
+             gamma = self.weight,
+             beta = None,
+             rowscale = None,
+             colscale = None,
+             x0_subset = None,
+             z_subset = None,
+             dropout_p = 0,
+             epsilon = self.variance_epsilon,
+             rowscale_const = 1.0,
+             z_numrows = hidden_states.shape[1],
+             gen = None,
+             residual_in_fp32 = False,
+             is_rms_norm = True,
+         )
+         return output[0].view(hidden_states.shape)
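
The classes above define no `__init__`; in the kernels layer-swap flow, `weight` and `variance_epsilon` are expected to come from the module being replaced. A hand-wired sketch, assuming attaching the attributes manually is sufficient:

```python
import torch
from layer_norm import layers

norm = layers.LlamaRMSNorm()  # plain nn.Module __init__
norm.weight = torch.nn.Parameter(
    torch.ones(1024, device="cuda", dtype=torch.float16)
)
norm.variance_epsilon = 1e-6

x = torch.randn(2, 128, 1024, device="cuda", dtype=torch.float16)
y = norm(x)  # dispatches to ops.dropout_add_ln_fwd with is_rms_norm=True
```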
build/torch29-cxx11-cu130-x86_64-linux/layer_norm/__init__.py ADDED
@@ -0,0 +1,26 @@
+ import torch
+ import torch.nn as nn
+
+ from ._ops import ops
+
+ from . import layers
+
+ def dropout_add_ln_fwd(input, gamma, beta, rowscale, colscale, x0_subset, z_subset, dropout_p, epsilon, rowscale_const, z_numrows, gen, residual_in_fp32, is_rms_norm):
+     return ops.dropout_add_ln_fwd(input, gamma, beta, rowscale, colscale, x0_subset, z_subset, dropout_p, epsilon, rowscale_const, z_numrows, gen, residual_in_fp32, is_rms_norm)
+
+ def dropout_add_ln_bwd(dz, dx, x, mu, rsigma, gamma, rowscale, colscale, x0_subset, z_subset, dropout_p, rowscale_const, x0_numrows, has_residual, is_rms_norm):
+     return ops.dropout_add_ln_bwd(dz, dx, x, mu, rsigma, gamma, rowscale, colscale, x0_subset, z_subset, dropout_p, rowscale_const, x0_numrows, has_residual, is_rms_norm)
+
+ def dropout_add_ln_parallel_residual_fwd(input, gamma0, beta0, gamma1, beta1, dropout_p, epsilon, gen, residual_in_fp32, is_rms_norm):
+     return ops.dropout_add_ln_parallel_residual_fwd(input, gamma0, beta0, gamma1, beta1, dropout_p, epsilon, gen, residual_in_fp32, is_rms_norm)
+
+ def dropout_add_ln_parallel_residual_bwd(dz0, dz1, dx, x, mu, rsigma, gamma0, gamma1, dropout_p, has_x1, has_residual, is_rms_norm):
+     return ops.dropout_add_ln_parallel_residual_bwd(dz0, dz1, dx, x, mu, rsigma, gamma0, gamma1, dropout_p, has_x1, has_residual, is_rms_norm)
+
+ __all__ = [
+     "layers",
+     "dropout_add_ln_fwd",
+     "dropout_add_ln_bwd",
+     "dropout_add_ln_parallel_residual_fwd",
+     "dropout_add_ln_parallel_residual_bwd",
+ ]
build/torch29-cxx11-cu130-x86_64-linux/layer_norm/_layer_norm_f3fd6bf.abi3.so ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:322e2d8fc69447be95ef7b6e85267e8769f1284419baa606732a77b1980a834d
+ size 1238333264
build/torch29-cxx11-cu130-x86_64-linux/layer_norm/_ops.py ADDED
@@ -0,0 +1,9 @@
+ import torch
+ from . import _layer_norm_f3fd6bf
+ ops = torch.ops._layer_norm_f3fd6bf
+
+ def add_op_namespace_prefix(op_name: str):
+     """
+     Prefix op by namespace.
+     """
+     return f"_layer_norm_f3fd6bf::{op_name}"
build/torch29-cxx11-cu130-x86_64-linux/layer_norm/layers.py ADDED
@@ -0,0 +1,51 @@
+ import torch
+ import torch.nn as nn
+
+ from ._ops import ops
+
+
+ class LayerNorm(nn.Module):
+     weight: torch.Tensor
+     variance_epsilon: float
+
+     def forward(self, hidden_states: torch.Tensor) -> torch.Tensor:
+         output = ops.dropout_add_ln_fwd(
+             hidden_states.view(-1, hidden_states.shape[-1]),
+             gamma = self.weight,
+             beta = None,
+             rowscale = None,
+             colscale = None,
+             x0_subset = None,
+             z_subset = None,
+             dropout_p = 0,
+             epsilon = self.variance_epsilon,
+             rowscale_const = 1.0,
+             z_numrows = hidden_states.shape[1],
+             gen = None,
+             residual_in_fp32 = False,
+             is_rms_norm = False,
+         )
+         return output[0].view(hidden_states.shape)
+
+ class LlamaRMSNorm(nn.Module):
+     weight: torch.Tensor
+     variance_epsilon: float
+
+     def forward(self, hidden_states: torch.Tensor) -> torch.Tensor:
+         output = ops.dropout_add_ln_fwd(
+             hidden_states.view(-1, hidden_states.shape[-1]),
+             gamma = self.weight,
+             beta = None,
+             rowscale = None,
+             colscale = None,
+             x0_subset = None,
+             z_subset = None,
+             dropout_p = 0,
+             epsilon = self.variance_epsilon,
+             rowscale_const = 1.0,
+             z_numrows = hidden_states.shape[1],
+             gen = None,
+             residual_in_fp32 = False,
+             is_rms_norm = True,
+         )
+         return output[0].view(hidden_states.shape)
cmake/hipify.py DELETED
@@ -1,76 +0,0 @@
- #!/usr/bin/env python3
- # SPDX-License-Identifier: Apache-2.0
-
- # From vLLM: https://github.com/vllm-project/vllm/blob/main/cmake/hipify.py
-
- #
- # A command line tool for running pytorch's hipify preprocessor on CUDA
- # source files.
- #
- # See https://github.com/ROCm/hipify_torch
- # and <torch install dir>/utils/hipify/hipify_python.py
- #
-
- import argparse
- import os
- import shutil
-
- from torch.utils.hipify.hipify_python import hipify
-
- if __name__ == '__main__':
-     parser = argparse.ArgumentParser()
-
-     # Project directory where all the source + include files live.
-     parser.add_argument(
-         "-p",
-         "--project_dir",
-         help="The project directory.",
-     )
-
-     # Directory where hipified files are written.
-     parser.add_argument(
-         "-o",
-         "--output_dir",
-         help="The output directory.",
-     )
-
-     # Source files to convert.
-     parser.add_argument("sources",
-                         help="Source files to hipify.",
-                         nargs="*",
-                         default=[])
-
-     args = parser.parse_args()
-
-     # Limit include scope to project_dir only
-     includes = [os.path.join(args.project_dir, '*')]
-
-     # Get absolute path for all source files.
-     extra_files = [os.path.abspath(s) for s in args.sources]
-
-     # Copy sources from project directory to output directory.
-     # The directory might already exist to hold object files so we ignore that.
-     shutil.copytree(args.project_dir, args.output_dir, dirs_exist_ok=True)
-
-     hipify_result = hipify(project_directory=args.project_dir,
-                            output_directory=args.output_dir,
-                            header_include_dirs=[],
-                            includes=includes,
-                            extra_files=extra_files,
-                            show_detailed=True,
-                            is_pytorch_extension=True,
-                            hipify_extra_files_only=True)
-
-     hipified_sources = []
-     for source in args.sources:
-         s_abs = os.path.abspath(source)
-         hipified_s_abs = (hipify_result[s_abs].hipified_path if
-                           (s_abs in hipify_result
-                            and hipify_result[s_abs].hipified_path is not None)
-                           else s_abs)
-         hipified_sources.append(hipified_s_abs)
-
-     assert (len(hipified_sources) == len(args.sources))
-
-     # Print hipified source files.
-     print("\n".join(hipified_sources))
cmake/utils.cmake DELETED
@@ -1,545 +0,0 @@
- # Vendored from vLLM:
- #
- # https://github.com/vllm-project/vllm/blob/main/cmake/utils.cmake
- #
- # Attempt to find the python package that uses the same python executable as
- # `EXECUTABLE` and is one of the `SUPPORTED_VERSIONS`.
- #
- macro (find_python_from_executable EXECUTABLE SUPPORTED_VERSIONS)
-   file(REAL_PATH ${EXECUTABLE} EXECUTABLE)
-   set(Python_EXECUTABLE ${EXECUTABLE})
-   find_package(Python COMPONENTS Interpreter Development.Module Development.SABIModule)
-   if (NOT Python_FOUND)
-     message(FATAL_ERROR "Unable to find python matching: ${EXECUTABLE}.")
-   endif()
-   set(_VER "${Python_VERSION_MAJOR}.${Python_VERSION_MINOR}")
-   set(_SUPPORTED_VERSIONS_LIST ${SUPPORTED_VERSIONS} ${ARGN})
-   if (NOT _VER IN_LIST _SUPPORTED_VERSIONS_LIST)
-     message(FATAL_ERROR
-       "Python version (${_VER}) is not one of the supported versions: "
-       "${_SUPPORTED_VERSIONS_LIST}.")
-   endif()
-   message(STATUS "Found python matching: ${EXECUTABLE}.")
- endmacro()
-
- #
- # Run `EXPR` in python. The standard output of python is stored in `OUT` and
- # has trailing whitespace stripped. If an error is encountered when running
- # python, a fatal message `ERR_MSG` is issued.
- #
- function (run_python OUT EXPR ERR_MSG)
-   execute_process(
-     COMMAND
-     "${Python_EXECUTABLE}" "-c" "${EXPR}"
-     OUTPUT_VARIABLE PYTHON_OUT
-     RESULT_VARIABLE PYTHON_ERROR_CODE
-     ERROR_VARIABLE PYTHON_STDERR
-     OUTPUT_STRIP_TRAILING_WHITESPACE)
-
-   if(NOT PYTHON_ERROR_CODE EQUAL 0)
-     message(FATAL_ERROR "${ERR_MSG}: ${PYTHON_STDERR}")
-   endif()
-   set(${OUT} ${PYTHON_OUT} PARENT_SCOPE)
- endfunction()
-
- # Run `EXPR` in python after importing `PKG`. Use the result of this to extend
- # `CMAKE_PREFIX_PATH` so the torch cmake configuration can be imported.
- macro (append_cmake_prefix_path PKG EXPR)
-   run_python(_PREFIX_PATH
-     "import ${PKG}; print(${EXPR})" "Failed to locate ${PKG} path")
-   list(APPEND CMAKE_PREFIX_PATH ${_PREFIX_PATH})
- endmacro()
-
- #
- # Add a target named `hipify${NAME}` that runs the hipify preprocessor on a set
- # of CUDA source files. The names of the corresponding "hipified" sources are
- # stored in `OUT_SRCS`.
- #
- function (hipify_sources_target OUT_SRCS NAME ORIG_SRCS)
-   #
-   # Split into C++ and non-C++ (i.e. CUDA) sources.
-   #
-   set(NODUP_SRCS ${ORIG_SRCS})
-   list(REMOVE_DUPLICATES NODUP_SRCS)
-   set(SRCS ${NODUP_SRCS})
-   set(CXX_SRCS ${NODUP_SRCS})
-   list(FILTER SRCS INCLUDE REGEX "\.cu$")
-   list(FILTER CXX_SRCS EXCLUDE REGEX "\.cu$")
-
-   #
-   # Generate ROCm/HIP source file names from CUDA file names.
-   # Since HIP files are generated code, they will appear in the build area
-   # `CMAKE_CURRENT_BINARY_DIR` directory rather than the original csrc dir.
-   #
-   set(HIP_SRCS)
-   foreach (SRC ${SRCS})
-     get_source_file_property(include_dirs "${SRC}" INCLUDE_DIRECTORIES)
-     string(REGEX REPLACE "\.cu$" "\.hip" SRC ${SRC})
-     string(REGEX REPLACE "cuda" "hip" SRC ${SRC})
-
-     if(include_dirs)
-       # Copy over include directories from the original CUDA file.
-       set_source_files_properties(
-         ${SRC}
-         PROPERTIES INCLUDE_DIRECTORIES "${include_dirs}")
-     endif()
-
-     list(APPEND HIP_SRCS "${CMAKE_CURRENT_BINARY_DIR}/${SRC}")
-   endforeach()
-
-   add_custom_target(
-     hipify${NAME}
-     COMMAND "${Python_EXECUTABLE}" ${CMAKE_SOURCE_DIR}/cmake/hipify.py -p ${CMAKE_SOURCE_DIR} -o ${CMAKE_CURRENT_BINARY_DIR} ${SRCS}
-     DEPENDS ${CMAKE_SOURCE_DIR}/cmake/hipify.py ${SRCS}
-     BYPRODUCTS ${HIP_SRCS}
-     COMMENT "Running hipify on ${NAME} extension source files.")
-
-   # Swap out original extension sources with hipified sources.
-   list(APPEND HIP_SRCS ${CXX_SRCS})
-   set(${OUT_SRCS} ${HIP_SRCS} PARENT_SCOPE)
- endfunction()
-
- #
- # Get additional GPU compiler flags from torch.
- #
- function (get_torch_gpu_compiler_flags OUT_GPU_FLAGS GPU_LANG)
-   if (${GPU_LANG} STREQUAL "CUDA")
-     #
-     # Get common NVCC flags from torch.
-     #
-     run_python(GPU_FLAGS
-       "from torch.utils.cpp_extension import COMMON_NVCC_FLAGS; print(';'.join(COMMON_NVCC_FLAGS))"
-       "Failed to determine torch nvcc compiler flags")
-
-     if (CUDA_VERSION VERSION_GREATER_EQUAL 11.8)
-       list(APPEND GPU_FLAGS "-DENABLE_FP8")
-       list(REMOVE_ITEM GPU_FLAGS
-         "-D__CUDA_NO_HALF_OPERATORS__"
-         "-D__CUDA_NO_HALF_CONVERSIONS__"
-         "-D__CUDA_NO_BFLOAT16_CONVERSIONS__"
-         "-D__CUDA_NO_HALF2_OPERATORS__")
-     endif()
-
-   elseif(${GPU_LANG} STREQUAL "HIP")
-     #
-     # Get common HIP/HIPCC flags from torch.
-     #
-     run_python(GPU_FLAGS
-       "import torch.utils.cpp_extension as t; print(';'.join(t.COMMON_HIP_FLAGS + t.COMMON_HIPCC_FLAGS))"
-       "Failed to determine torch nvcc compiler flags")
-
-     list(APPEND GPU_FLAGS
-       "-DUSE_ROCM"
-       "-DENABLE_FP8"
-       "-U__HIP_NO_HALF_CONVERSIONS__"
-       "-U__HIP_NO_HALF_OPERATORS__"
-       "-fno-gpu-rdc")
-
-   endif()
-   set(${OUT_GPU_FLAGS} ${GPU_FLAGS} PARENT_SCOPE)
- endfunction()
-
- # Macro for converting a `gencode` version number to a cmake version number.
- macro(string_to_ver OUT_VER IN_STR)
-   string(REGEX REPLACE "\([0-9]+\)\([0-9]\)" "\\1.\\2" ${OUT_VER} ${IN_STR})
- endmacro()
-
- #
- # Clear all `-gencode` flags from `CMAKE_CUDA_FLAGS` and store them in
- # `CUDA_ARCH_FLAGS`.
- #
- # Example:
- #   CMAKE_CUDA_FLAGS="-Wall -gencode arch=compute_70,code=sm_70 -gencode arch=compute_75,code=sm_75"
- #   clear_cuda_arches(CUDA_ARCH_FLAGS)
- #   CUDA_ARCH_FLAGS="-gencode arch=compute_70,code=sm_70;-gencode arch=compute_75,code=sm_75"
- #   CMAKE_CUDA_FLAGS="-Wall"
- #
- macro(clear_cuda_arches CUDA_ARCH_FLAGS)
-   # Extract all `-gencode` flags from `CMAKE_CUDA_FLAGS`
-   string(REGEX MATCHALL "-gencode arch=[^ ]+" CUDA_ARCH_FLAGS
-     ${CMAKE_CUDA_FLAGS})
-
-   # Remove all `-gencode` flags from `CMAKE_CUDA_FLAGS` since they will be modified
-   # and passed back via the `CUDA_ARCHITECTURES` property.
-   string(REGEX REPLACE "-gencode arch=[^ ]+ *" "" CMAKE_CUDA_FLAGS
-     ${CMAKE_CUDA_FLAGS})
- endmacro()
-
- #
- # Extract unique CUDA architectures from a list of compute capabilities codes in
- # the form `<major><minor>[<letter>]`, convert them to the form sort
- # `<major>.<minor>`, dedupes them and then sorts them in ascending order and
- # stores them in `OUT_ARCHES`.
- #
- # Example:
- #   CUDA_ARCH_FLAGS="-gencode arch=compute_75,code=sm_75;...;-gencode arch=compute_90a,code=sm_90a"
- #   extract_unique_cuda_archs_ascending(OUT_ARCHES CUDA_ARCH_FLAGS)
- #   OUT_ARCHES="7.5;...;9.0"
- function(extract_unique_cuda_archs_ascending OUT_ARCHES CUDA_ARCH_FLAGS)
-   set(_CUDA_ARCHES)
-   foreach(_ARCH ${CUDA_ARCH_FLAGS})
-     string(REGEX MATCH "arch=compute_\([0-9]+a?\)" _COMPUTE ${_ARCH})
-     if (_COMPUTE)
-       set(_COMPUTE ${CMAKE_MATCH_1})
-     endif()
-
-     string_to_ver(_COMPUTE_VER ${_COMPUTE})
-     list(APPEND _CUDA_ARCHES ${_COMPUTE_VER})
-   endforeach()
-
-   list(REMOVE_DUPLICATES _CUDA_ARCHES)
-   list(SORT _CUDA_ARCHES COMPARE NATURAL ORDER ASCENDING)
-   set(${OUT_ARCHES} ${_CUDA_ARCHES} PARENT_SCOPE)
- endfunction()
-
- #
- # For a specific file set the `-gencode` flag in compile options conditionally
- # for the CUDA language.
- #
- # Example:
- #   set_gencode_flag_for_srcs(
- #     SRCS "foo.cu"
- #     ARCH "compute_75"
- #     CODE "sm_75")
- #   adds: "-gencode arch=compute_75,code=sm_75" to the compile options for
- #   `foo.cu` (only for the CUDA language).
- #
- macro(set_gencode_flag_for_srcs)
-   set(options)
-   set(oneValueArgs ARCH CODE)
-   set(multiValueArgs SRCS)
-   cmake_parse_arguments(arg "${options}" "${oneValueArgs}"
-                         "${multiValueArgs}" ${ARGN} )
-   set(_FLAG -gencode arch=${arg_ARCH},code=${arg_CODE})
-   set_property(
-     SOURCE ${arg_SRCS}
-     APPEND PROPERTY
-     COMPILE_OPTIONS "$<$<COMPILE_LANGUAGE:CUDA>:${_FLAG}>"
-   )
-
-   message(DEBUG "Setting gencode flag for ${arg_SRCS}: ${_FLAG}")
- endmacro(set_gencode_flag_for_srcs)
-
- #
- # For a list of source files set the `-gencode` flags in the files specific
- # compile options (specifically for the CUDA language).
- #
- # arguments are:
- #  SRCS: list of source files
- #  CUDA_ARCHS: list of CUDA architectures in the form `<major>.<minor>[letter]`
- #  BUILD_PTX_FOR_ARCH: if set to true, then the PTX code will be built
- #   for architecture `BUILD_PTX_FOR_ARCH` if there is a CUDA_ARCH in CUDA_ARCHS
- #   that is larger than BUILD_PTX_FOR_ARCH.
- #
- macro(set_gencode_flags_for_srcs)
-   set(options)
-   set(oneValueArgs BUILD_PTX_FOR_ARCH)
-   set(multiValueArgs SRCS CUDA_ARCHS)
-   cmake_parse_arguments(arg "${options}" "${oneValueArgs}"
-                         "${multiValueArgs}" ${ARGN} )
-
-   foreach(_ARCH ${arg_CUDA_ARCHS})
-     # handle +PTX suffix: generate both sm and ptx codes if requested
-     string(FIND "${_ARCH}" "+PTX" _HAS_PTX)
-     if(NOT _HAS_PTX EQUAL -1)
-       string(REPLACE "+PTX" "" _BASE_ARCH "${_ARCH}")
-       string(REPLACE "." "" _STRIPPED_ARCH "${_BASE_ARCH}")
-       set_gencode_flag_for_srcs(
-         SRCS ${arg_SRCS}
-         ARCH "compute_${_STRIPPED_ARCH}"
-         CODE "sm_${_STRIPPED_ARCH}")
-       set_gencode_flag_for_srcs(
-         SRCS ${arg_SRCS}
-         ARCH "compute_${_STRIPPED_ARCH}"
-         CODE "compute_${_STRIPPED_ARCH}")
-     else()
-       string(REPLACE "." "" _STRIPPED_ARCH "${_ARCH}")
-       set_gencode_flag_for_srcs(
-         SRCS ${arg_SRCS}
-         ARCH "compute_${_STRIPPED_ARCH}"
-         CODE "sm_${_STRIPPED_ARCH}")
-     endif()
-   endforeach()
-
-   if (${arg_BUILD_PTX_FOR_ARCH})
-     list(SORT arg_CUDA_ARCHS COMPARE NATURAL ORDER ASCENDING)
-     list(GET arg_CUDA_ARCHS -1 _HIGHEST_ARCH)
-     if (_HIGHEST_ARCH VERSION_GREATER_EQUAL ${arg_BUILD_PTX_FOR_ARCH})
-       string(REPLACE "." "" _PTX_ARCH "${arg_BUILD_PTX_FOR_ARCH}")
-       set_gencode_flag_for_srcs(
-         SRCS ${arg_SRCS}
-         ARCH "compute_${_PTX_ARCH}"
-         CODE "compute_${_PTX_ARCH}")
-     endif()
-   endif()
- endmacro()
-
- #
- # For the given `SRC_CUDA_ARCHS` list of gencode versions in the form
- # `<major>.<minor>[letter]` compute the "loose intersection" with the
- # `TGT_CUDA_ARCHS` list of gencodes. We also support the `+PTX` suffix in
- # `SRC_CUDA_ARCHS` which indicates that the PTX code should be built when there
- # is a CUDA_ARCH in `TGT_CUDA_ARCHS` that is equal to or larger than the
- # architecture in `SRC_CUDA_ARCHS`.
- # The loose intersection is defined as:
- #   { max{ x \in tgt | x <= y } | y \in src, { x \in tgt | x <= y } != {} }
- # where `<=` is the version comparison operator.
- # In other words, for each version in `TGT_CUDA_ARCHS` find the highest version
- # in `SRC_CUDA_ARCHS` that is less or equal to the version in `TGT_CUDA_ARCHS`.
- # We have special handling for x.0a, if x.0a is in `SRC_CUDA_ARCHS` and x.0 is
- # in `TGT_CUDA_ARCHS` then we should remove x.0a from `SRC_CUDA_ARCHS` and add
- # x.0a to the result (and remove x.0 from TGT_CUDA_ARCHS).
- # The result is stored in `OUT_CUDA_ARCHS`.
- #
- # Example:
- #   SRC_CUDA_ARCHS="7.5;8.0;8.6;9.0;9.0a"
- #   TGT_CUDA_ARCHS="8.0;8.9;9.0"
- #   cuda_archs_loose_intersection(OUT_CUDA_ARCHS SRC_CUDA_ARCHS TGT_CUDA_ARCHS)
- #   OUT_CUDA_ARCHS="8.0;8.6;9.0;9.0a"
- #
- # Example With PTX:
- #   SRC_CUDA_ARCHS="8.0+PTX"
- #   TGT_CUDA_ARCHS="9.0"
- #   cuda_archs_loose_intersection(OUT_CUDA_ARCHS SRC_CUDA_ARCHS TGT_CUDA_ARCHS)
- #   OUT_CUDA_ARCHS="8.0+PTX"
- #
- function(cuda_archs_loose_intersection OUT_CUDA_ARCHS SRC_CUDA_ARCHS TGT_CUDA_ARCHS)
-   set(_SRC_CUDA_ARCHS "${SRC_CUDA_ARCHS}")
-   set(_TGT_CUDA_ARCHS ${TGT_CUDA_ARCHS})
-
-   # handle +PTX suffix: separate base arch for matching, record PTX requests
-   set(_PTX_ARCHS)
-   foreach(_arch ${_SRC_CUDA_ARCHS})
-     if(_arch MATCHES "\\+PTX$")
-       string(REPLACE "+PTX" "" _base "${_arch}")
-       list(APPEND _PTX_ARCHS "${_base}")
-       list(REMOVE_ITEM _SRC_CUDA_ARCHS "${_arch}")
-       list(APPEND _SRC_CUDA_ARCHS "${_base}")
-     endif()
-   endforeach()
-   list(REMOVE_DUPLICATES _PTX_ARCHS)
-   list(REMOVE_DUPLICATES _SRC_CUDA_ARCHS)
-
-   # if x.0a is in SRC_CUDA_ARCHS and x.0 is in CUDA_ARCHS then we should
-   # remove x.0a from SRC_CUDA_ARCHS and add x.0a to _CUDA_ARCHS
-   set(_CUDA_ARCHS)
-   foreach(_arch ${_SRC_CUDA_ARCHS})
-     if(_arch MATCHES "\\a$")
-       list(REMOVE_ITEM _SRC_CUDA_ARCHS "${_arch}")
-       string(REPLACE "a" "" _base "${_arch}")
-       if ("${_base}" IN_LIST TGT_CUDA_ARCHS)
-         list(REMOVE_ITEM _TGT_CUDA_ARCHS "${_base}")
-         list(APPEND _CUDA_ARCHS "${_arch}")
-       endif()
-     endif()
-   endforeach()
-
-   list(SORT _SRC_CUDA_ARCHS COMPARE NATURAL ORDER ASCENDING)
-
-   # for each ARCH in TGT_CUDA_ARCHS find the highest arch in SRC_CUDA_ARCHS that
-   # is less or equal to ARCH (but has the same major version since SASS binary
-   # compatibility is only forward compatible within the same major version).
-   foreach(_ARCH ${_TGT_CUDA_ARCHS})
-     set(_TMP_ARCH)
-     # Extract the major version of the target arch
-     string(REGEX REPLACE "^([0-9]+)\\..*$" "\\1" TGT_ARCH_MAJOR "${_ARCH}")
-     foreach(_SRC_ARCH ${_SRC_CUDA_ARCHS})
-       # Extract the major version of the source arch
-       string(REGEX REPLACE "^([0-9]+)\\..*$" "\\1" SRC_ARCH_MAJOR "${_SRC_ARCH}")
-       # Check version-less-or-equal, and allow PTX arches to match across majors
-       if (_SRC_ARCH VERSION_LESS_EQUAL _ARCH)
-         if (_SRC_ARCH IN_LIST _PTX_ARCHS OR SRC_ARCH_MAJOR STREQUAL TGT_ARCH_MAJOR)
-           set(_TMP_ARCH "${_SRC_ARCH}")
-         endif()
-       else()
-         # If we hit a version greater than the target, we can break
-         break()
-       endif()
-     endforeach()
-
-     # If we found a matching _TMP_ARCH, append it to _CUDA_ARCHS
-     if (_TMP_ARCH)
-       list(APPEND _CUDA_ARCHS "${_TMP_ARCH}")
-     endif()
-   endforeach()
-
-   list(REMOVE_DUPLICATES _CUDA_ARCHS)
-
-   # reapply +PTX suffix to architectures that requested PTX
-   set(_FINAL_ARCHS)
-   foreach(_arch ${_CUDA_ARCHS})
-     if(_arch IN_LIST _PTX_ARCHS)
-       list(APPEND _FINAL_ARCHS "${_arch}+PTX")
-     else()
-       list(APPEND _FINAL_ARCHS "${_arch}")
-     endif()
-   endforeach()
-   set(_CUDA_ARCHS ${_FINAL_ARCHS})
-
-   set(${OUT_CUDA_ARCHS} ${_CUDA_ARCHS} PARENT_SCOPE)
- endfunction()
-
- #
- # For the given `SRC_ROCM_ARCHS` list of architecture versions in the form
- # `<name>` compute the "loose intersection" with the `TGT_ROCM_ARCHS` list.
- # The loose intersection is defined as:
- #   { max{ x \in tgt | x <= y } | y \in src, { x \in tgt | x <= y } != {} }
- # where `<=` is the version comparison operator.
- # In other words, for each version in `TGT_ROCM_ARCHS` find the highest version
- # in `SRC_ROCM_ARCHS` that is less or equal to the version in `TGT_ROCM_ARCHS`.
- # The result is stored in `OUT_ROCM_ARCHS`.
- #
- # Example:
- #   SRC_ROCM_ARCHS="gfx900;gfx906;gfx908;gfx90a"
- #   TGT_ROCM_ARCHS="gfx906;gfx908;gfx1030"
- #   hip_archs_loose_intersection(OUT_ROCM_ARCHS SRC_ROCM_ARCHS TGT_ROCM_ARCHS)
- #   OUT_ROCM_ARCHS="gfx906;gfx908"
- #
- function(hip_archs_loose_intersection OUT_ROCM_ARCHS SRC_ROCM_ARCHS TGT_ROCM_ARCHS)
-   list(REMOVE_DUPLICATES SRC_ROCM_ARCHS)
-
-   # ROCm architectures are typically in format gfxNNN or gfxNNNx where N is a digit
-   # and x is a letter. We can sort them by string comparison which works for this format.
-   list(SORT SRC_ROCM_ARCHS COMPARE STRING ORDER ASCENDING)
-
-   set(_ROCM_ARCHS)
-
-   # Find the intersection of supported architectures
-   foreach(_SRC_ARCH ${SRC_ROCM_ARCHS})
-     if(_SRC_ARCH IN_LIST TGT_ROCM_ARCHS)
-       list(APPEND _ROCM_ARCHS ${_SRC_ARCH})
-     endif()
-   endforeach()
-
-   list(REMOVE_DUPLICATES _ROCM_ARCHS)
-   set(${OUT_ROCM_ARCHS} ${_ROCM_ARCHS} PARENT_SCOPE)
- endfunction()
-
- #
- # Override the GPU architectures detected by cmake/torch and filter them by
- # `GPU_SUPPORTED_ARCHES`. Sets the final set of architectures in
- # `GPU_ARCHES`. This only applies to the HIP language since for CUDA we set
- # the architectures on a per file basis.
- #
- # Note: this is defined as a macro since it updates `CMAKE_CUDA_FLAGS`.
- #
- macro(override_gpu_arches GPU_ARCHES GPU_LANG GPU_SUPPORTED_ARCHES)
-   set(_GPU_SUPPORTED_ARCHES_LIST ${GPU_SUPPORTED_ARCHES} ${ARGN})
-   message(STATUS "${GPU_LANG} supported arches: ${_GPU_SUPPORTED_ARCHES_LIST}")
-
-   if (${GPU_LANG} STREQUAL "HIP")
-     #
-     # `GPU_ARCHES` controls the `--offload-arch` flags.
-     #
-     # If PYTORCH_ROCM_ARCH env variable exists, then we take it as a list,
-     # if not, then we use CMAKE_HIP_ARCHITECTURES which was generated by calling
-     # "rocm_agent_enumerator" in "enable_language(HIP)"
-     # (in file Modules/CMakeDetermineHIPCompiler.cmake)
-     #
-     if(DEFINED ENV{PYTORCH_ROCM_ARCH})
-       set(HIP_ARCHITECTURES $ENV{PYTORCH_ROCM_ARCH})
-     else()
-       set(HIP_ARCHITECTURES ${CMAKE_HIP_ARCHITECTURES})
-     endif()
-     #
-     # Find the intersection of the supported + detected architectures to
-     # set the module architecture flags.
-     #
-     set(${GPU_ARCHES})
-     foreach (_ARCH ${HIP_ARCHITECTURES})
-       if (_ARCH IN_LIST _GPU_SUPPORTED_ARCHES_LIST)
-         list(APPEND ${GPU_ARCHES} ${_ARCH})
-       endif()
-     endforeach()
-
-     if(NOT ${GPU_ARCHES})
-       message(FATAL_ERROR
-         "None of the detected ROCm architectures: ${HIP_ARCHITECTURES} is"
-         " supported. Supported ROCm architectures are: ${_GPU_SUPPORTED_ARCHES_LIST}.")
-     endif()
-   endif()
- endmacro()
-
- #
- # Define a target named `GPU_MOD_NAME` for a single extension. The
- # arguments are:
- #
- # DESTINATION <dest>         - Module destination directory.
- # LANGUAGE <lang>            - The GPU language for this module, e.g CUDA, HIP,
- #                              etc.
- # SOURCES <sources>          - List of source files relative to CMakeLists.txt
- #                              directory.
- #
- # Optional arguments:
- #
- # ARCHITECTURES <arches>     - A list of target GPU architectures in cmake
- #                              format.
- #                              Refer `CMAKE_CUDA_ARCHITECTURES` documentation
- #                              and `CMAKE_HIP_ARCHITECTURES` for more info.
- #                              ARCHITECTURES will use cmake's defaults if
- #                              not provided.
- # COMPILE_FLAGS <flags>      - Extra compiler flags passed to NVCC/hip.
- # INCLUDE_DIRECTORIES <dirs> - Extra include directories.
- # LIBRARIES <libraries>      - Extra link libraries.
- # WITH_SOABI                 - Generate library with python SOABI suffix name.
- # USE_SABI <version>         - Use python stable api <version>
- #
- # Note: optimization level/debug info is set via cmake build type.
- #
- function (define_gpu_extension_target GPU_MOD_NAME)
-   cmake_parse_arguments(PARSE_ARGV 1
-     GPU
-     "WITH_SOABI"
-     "DESTINATION;LANGUAGE;USE_SABI"
-     "SOURCES;ARCHITECTURES;COMPILE_FLAGS;INCLUDE_DIRECTORIES;LIBRARIES")
-
-   # Add hipify preprocessing step when building with HIP/ROCm.
-   if (GPU_LANGUAGE STREQUAL "HIP")
-     hipify_sources_target(GPU_SOURCES ${GPU_MOD_NAME} "${GPU_SOURCES}")
-   endif()
-
-   if (GPU_WITH_SOABI)
-     set(GPU_WITH_SOABI WITH_SOABI)
-   else()
-     set(GPU_WITH_SOABI)
-   endif()
-
-   if (GPU_USE_SABI)
-     Python_add_library(${GPU_MOD_NAME} MODULE USE_SABI ${GPU_USE_SABI} ${GPU_WITH_SOABI} "${GPU_SOURCES}")
-   else()
-     Python_add_library(${GPU_MOD_NAME} MODULE ${GPU_WITH_SOABI} "${GPU_SOURCES}")
-   endif()
-
-   if (GPU_LANGUAGE STREQUAL "HIP")
-     # Make this target dependent on the hipify preprocessor step.
-     add_dependencies(${GPU_MOD_NAME} hipify${GPU_MOD_NAME})
-   endif()
-
-   if (GPU_ARCHITECTURES)
-     set_target_properties(${GPU_MOD_NAME} PROPERTIES
-       ${GPU_LANGUAGE}_ARCHITECTURES "${GPU_ARCHITECTURES}")
-   endif()
-
-   set_property(TARGET ${GPU_MOD_NAME} PROPERTY CXX_STANDARD 17)
-
-   target_compile_options(${GPU_MOD_NAME} PRIVATE
-     $<$<COMPILE_LANGUAGE:${GPU_LANGUAGE}>:${GPU_COMPILE_FLAGS}>)
-
-   target_compile_definitions(${GPU_MOD_NAME} PRIVATE
-     "-DTORCH_EXTENSION_NAME=${GPU_MOD_NAME}")
-
-   target_include_directories(${GPU_MOD_NAME} PRIVATE csrc
-     ${GPU_INCLUDE_DIRECTORIES})
-
-   target_link_libraries(${GPU_MOD_NAME} PRIVATE torch ${GPU_LIBRARIES})
-
-   # Don't use `TORCH_LIBRARIES` for CUDA since it pulls in a bunch of
-   # dependencies that are not necessary and may not be installed.
-   if (GPU_LANGUAGE STREQUAL "CUDA")
-     target_link_libraries(${GPU_MOD_NAME} PRIVATE CUDA::cudart)
-   else()
-     target_link_libraries(${GPU_MOD_NAME} PRIVATE ${TORCH_LIBRARIES})
-   endif()
-
-   install(TARGETS ${GPU_MOD_NAME} LIBRARY DESTINATION ${GPU_DESTINATION} COMPONENT ${GPU_MOD_NAME})
- endfunction()
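
Note: the "loose intersection" computed by `cuda_archs_loose_intersection` above is the subtle part of this file. As a minimal standalone C++ sketch of the same matching rule (sort the source arches ascending, then for each target arch keep the highest source arch that compares less-or-equal and shares its major version), with our own names and with the `+PTX` and `x.0a` special cases omitted for brevity:

#include <algorithm>
#include <iostream>
#include <vector>

struct Arch { int major, minor; };

// Version comparison corresponding to CMake's VERSION_LESS_EQUAL.
static bool operator<=(const Arch& a, const Arch& b) {
    return a.major < b.major || (a.major == b.major && a.minor <= b.minor);
}

// For each target arch, keep max{ s in src | s <= t } when it shares t's major
// version (SASS binary compatibility is only forward within a major version).
std::vector<Arch> loose_intersection(std::vector<Arch> src,
                                     const std::vector<Arch>& tgt) {
    std::sort(src.begin(), src.end(), [](const Arch& a, const Arch& b) {
        return a.major != b.major ? a.major < b.major : a.minor < b.minor;
    });
    std::vector<Arch> out;
    for (const Arch& t : tgt) {
        const Arch* best = nullptr;
        for (const Arch& s : src) {
            if (!(s <= t)) break;               // src is sorted ascending
            if (s.major == t.major) best = &s;  // same-major candidates only
        }
        if (best) out.push_back(*best);         // dedup omitted for brevity
    }
    return out;
}

int main() {
    // Mirrors the example in the comment above (minus 9.0a):
    // src={7.5,8.0,8.6,9.0}, tgt={8.0,8.9,9.0} -> {8.0,8.6,9.0}.
    for (const Arch& a : loose_intersection({{7,5},{8,0},{8,6},{9,0}},
                                            {{8,0},{8,9},{9,0}}))
        std::cout << a.major << "." << a.minor << " ";
    std::cout << "\n";
}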
layer_norm/ln.h DELETED
@@ -1,281 +0,0 @@
- #pragma once
-
- #include <unordered_map>
- #include <cuda_fp16.h>
- #include <cuda_bf16.h>
-
- #ifdef OLD_GENERATOR_PATH
- #include <ATen/CUDAGeneratorImpl.h>
- #else
- #include <ATen/cuda/CUDAGeneratorImpl.h>
- #endif
-
- namespace layer_norm {
-
- ////////////////////////////////////////////////////////////////////////////////////////////////////
-
- template<typename Params>
- struct LaunchParams{
-
-     size_t elts_per_thread;
-     size_t workspace_bytes;
-     size_t barrier_size;
-
-     cudaDeviceProp * props;
-
-     cudaStream_t stream;
-
-     Params params;
-
- };
-
- ////////////////////////////////////////////////////////////////////////////////////////////////////
-
- struct ParamsBase {
-     ParamsBase()
-         : ctas_per_col(0)
-         , rows(0)
-         , cols(0)
-         , x(nullptr)
-         , mu(nullptr)
-         , rs(nullptr)
-         , gamma(nullptr)
-         , gamma1(nullptr)
-         , rowscale(nullptr)
-         , colscale(nullptr)
-         , dropout_keep_p(1.f)
-         , dropout_scale(1.f)
-         , is_rms_norm(false)
-         , workspace(nullptr)
-         , barrier(nullptr)
-     {
-     }
-
-     // For Multi-CTA, number of different CTA groups. Otherwise same as gridDim.x.
-     int ctas_per_col;
-
-     // Input is interpreted as matrix. We normalize across columns.
-     int rows;
-     int cols;
-
-     // Common data pointers.
-     void *x0;
-     void *x1;
-     void *residual;
-     void *x;
-     void *dmask;
-     void *dmask1;
-     void *mu;
-     void *rs;
-     void *gamma;
-     void *gamma1;
-     void *rowscale;
-     void *colscale;
-     void *x0_subset;
-     void *z_subset;
-
-     float inverse_cols;
-
-     float dropout_keep_p;
-     float dropout_scale;
-     float rowscale_const;
-
-     bool is_rms_norm;
-
-     // Multi-CTA workspace in gmem.
-     void *workspace;
-
-     // Multi-CTA sync barriers in gmem.
-     int *barrier;
-
- };
-
- ////////////////////////////////////////////////////////////////////////////////////////////////////
-
- struct FwdParams : public ParamsBase {
-     FwdParams()
-         : ParamsBase()
-         , z(nullptr)
-         , z1(nullptr)
-         , beta(nullptr)
-         , beta1(nullptr)
-         , epsilon(0.f)
-     {
-     }
-
-     // Output of LN FWD.
-     void *z;
-     void *z1;
-     void *beta;
-     void *beta1;
-     float epsilon;
-
-     // Random state.
-     at::PhiloxCudaState philox_args;
- };
-
- ////////////////////////////////////////////////////////////////////////////////////////////////////
-
- struct BwdParams : public ParamsBase {
-     BwdParams()
-         : ParamsBase()
-         , dz(nullptr)
-         , dz1(nullptr)
-         , dx(nullptr)
-         , dbeta_part(nullptr)
-         , dgamma_part(nullptr)
-         , dbeta1_part(nullptr)
-         , dgamma1_part(nullptr)
-         , dcolscale_part(nullptr)
-         , dx0(nullptr)
-         , dx1(nullptr)
-         , dresidual(nullptr)
-         , dbeta(nullptr)
-         , dgamma(nullptr)
-         , dbeta1(nullptr)
-         , dgamma1(nullptr)
-         , dcolscale(nullptr)
-     {
-     }
-
-     // Input: gradient wrt. LN FWD output.
-     void *dz;
-     void *dz1;
-     // Input: gradient wrt residual.
-     void *dx;
-
-     // Workspace for Wgrad pre-reduction.
-     void *dbeta_part;
-     void *dgamma_part;
-     void *dbeta1_part;
-     void *dgamma1_part;
-     void *dcolscale_part;
-
-     // Output: Dgrad.
-     void *dx0;
-     void *dx1;
-     void *dresidual;
-     // Output: Wgrad.
-     void *dbeta;
-     void *dgamma;
-     void *dbeta1;
-     void *dgamma1;
-     void *dcolscale;
-
- };
-
- ////////////////////////////////////////////////////////////////////////////////////////////////////
-
- using FwdFunction = std::function<void(LaunchParams<FwdParams>&, const bool)>;
- using BwdFunction = std::function<void(LaunchParams<BwdParams>&, const bool)>;
- using FunctionKey = uint64_t;
- using FwdRegistry = std::unordered_map<FunctionKey, FwdFunction>;
- using BwdRegistry = std::unordered_map<FunctionKey, BwdFunction>;
-
- extern FwdRegistry FWD_FUNCS, PARALLEL_FWD_FUNCS;
- extern BwdRegistry BWD_FUNCS, PARALLEL_BWD_FUNCS;
-
- ////////////////////////////////////////////////////////////////////////////////////////////////////
-
- using fp32 = float;
- using fp16 = half;
- using bf16 = nv_bfloat16;
-
- ////////////////////////////////////////////////////////////////////////////////////////////////////
-
- template<typename T>
- struct TypeId{};
-
- template<>
- struct TypeId<fp16>{
-     constexpr static uint32_t Value = 0;
- };
-
- template<>
- struct TypeId<bf16>{
-     constexpr static uint32_t Value = 1;
- };
-
- template<>
- struct TypeId<fp32>{
-     constexpr static uint32_t Value = 2;
- };
-
- ////////////////////////////////////////////////////////////////////////////////////////////////////
-
- template<typename T, int S>
- struct Type2Key{
-     constexpr static uint32_t Value = TypeId<T>::Value << S;
- };
-
- ////////////////////////////////////////////////////////////////////////////////////////////////////
-
- template<typename T>
- struct WeightType2Key : public Type2Key<T, 0>{};
-
- template<typename T>
- struct InputType2Key : public Type2Key<T, 2>{};
-
- template<typename T>
- struct ResidualType2Key : public Type2Key<T, 4>{};
-
- template<typename T>
- struct OutputType2Key : public Type2Key<T, 6>{};
-
- template<typename T>
- struct ComputeType2Key : public Type2Key<T, 8>{};
-
- ////////////////////////////////////////////////////////////////////////////////////////////////////
-
- template<typename W, typename I, typename R, typename O, typename C>
- struct Types2Key{
-     constexpr static uint32_t Value = WeightType2Key<W>::Value | InputType2Key<I>::Value | ResidualType2Key<R>::Value | OutputType2Key<O>::Value | ComputeType2Key<C>::Value;
-     constexpr static inline uint64_t get(const uint64_t hidden_size){
-         constexpr uint64_t type_key = Value;
-         return (type_key << 32) | hidden_size;
-     }
- };
-
- ////////////////////////////////////////////////////////////////////////////////////////////////////
-
- template<typename W, typename I, typename R, typename O, typename C, uint64_t HIDDEN_SIZE>
- struct FwdRegistrar{
-     FwdRegistrar(FwdFunction f){
-         uint64_t key = Types2Key<W,I,R,O,C>::get(HIDDEN_SIZE);
-         FWD_FUNCS.insert({ key, f });
-     }
- };
-
- ////////////////////////////////////////////////////////////////////////////////////////////////////
-
- template<typename W, typename I, typename R, typename O, typename C, uint64_t HIDDEN_SIZE>
- struct BwdRegistrar{
-     BwdRegistrar(BwdFunction f){
-         uint64_t key = Types2Key<W,I,R,O,C>::get(HIDDEN_SIZE);
-         BWD_FUNCS.insert({ key, f });
-     }
- };
-
- ////////////////////////////////////////////////////////////////////////////////////////////////////
-
- template<typename W, typename I, typename R, typename O, typename C, uint64_t HIDDEN_SIZE>
- struct FwdParallelRegistrar{
-     FwdParallelRegistrar(FwdFunction f){
-         uint64_t key = Types2Key<W,I,R,O,C>::get(HIDDEN_SIZE);
-         PARALLEL_FWD_FUNCS.insert({ key, f });
-     }
- };
-
- ////////////////////////////////////////////////////////////////////////////////////////////////////
-
- template<typename W, typename I, typename R, typename O, typename C, uint64_t HIDDEN_SIZE>
- struct BwdParallelRegistrar{
-     BwdParallelRegistrar(BwdFunction f){
-         uint64_t key = Types2Key<W,I,R,O,C>::get(HIDDEN_SIZE);
-         PARALLEL_BWD_FUNCS.insert({ key, f });
-     }
- };
-
- ////////////////////////////////////////////////////////////////////////////////////////////////////
-
- } // namespace layer_norm
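
Note: the registry key built by `TypeId`/`Type2Key`/`Types2Key` above packs five 2-bit dtype ids (weight, input, residual, output, compute) into the high word of a 64-bit key and the hidden size into the low word. A small standalone C++ sketch of the same packing for one concrete configuration; the enum and helper names below are ours, not part of the deleted header:

#include <cstdint>
#include <cstdio>

enum DtypeId : uint32_t { FP16 = 0, BF16 = 1, FP32 = 2 };  // matches TypeId<>

// Same layout as Types2Key<W,I,R,O,C>::get(hidden_size): 2 bits per dtype,
// shifted by 0/2/4/6/8, then the whole type key shifted into the high word.
uint64_t make_key(DtypeId w, DtypeId i, DtypeId r, DtypeId o, DtypeId c,
                  uint64_t hidden_size) {
    uint32_t type_key = (w << 0) | (i << 2) | (r << 4) | (o << 6) | (c << 8);
    return (uint64_t(type_key) << 32) | hidden_size;
}

int main() {
    // fp16 weights/input/residual/output, fp32 compute, hidden size 1024:
    // type_key = 0 | 0<<2 | 0<<4 | 0<<6 | 2<<8 = 0x200,
    // key      = (0x200 << 32) | 1024 = 0x0000020000000400.
    uint64_t key = make_key(FP16, FP16, FP16, FP16, FP32, 1024);
    std::printf("key = 0x%016llx\n", (unsigned long long)key);
    return 0;
}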
layer_norm/ln_api.cpp DELETED
@@ -1,828 +0,0 @@
- #include <torch/torch.h>
- #include "ATen/cuda/CUDAContext.h"
- #include <c10/cuda/CUDAGuard.h>
-
- #include "ln.h"
-
- /*
-
- Supported Type combinations:
-
- input    residual   compute   weights   output
- ============================================
- fp32     fp32       fp32      fp32      fp32
- fp16     fp32       fp32      fp32      fp16
- fp16     fp16       fp32      fp32      fp16
- bf16     fp32       fp32      fp32      bf16
- bf16     bf16       fp32      fp32      bf16
- fp16     fp16       fp32      fp16      fp16
- bf16     bf16       fp32      bf16      bf16
-
- Remarks:
- Output type = Input type
- Compute always in FP32
-
- */
-
- namespace layer_norm {
-
- // Create registries and provide runtime versions of config hash functions.
-
- FwdRegistry FWD_FUNCS, PARALLEL_FWD_FUNCS;
- BwdRegistry BWD_FUNCS, PARALLEL_BWD_FUNCS;
-
- ////////////////////////////////////////////////////////////////////////////////////////////////////
-
- uint32_t get_type_id(torch::Dtype dtype){
-     if( dtype == torch::kFloat16 ) {
-         return TypeId<fp16>::Value;
-     } else if( dtype == torch::kBFloat16 ) {
-         return TypeId<bf16>::Value;
-     } else if( dtype == torch::kFloat32 ) {
-         return TypeId<fp32>::Value;
-     } else {
-         TORCH_CHECK(false, "Type not supported: ", dtype);
-     }
- }
-
- ////////////////////////////////////////////////////////////////////////////////////////////////////
-
- uint64_t get_key(torch::Dtype wtype, torch::Dtype itype, torch::Dtype rtype, torch::Dtype otype, torch::Dtype ctype, uint64_t hidden_size) {
-     using namespace layer_norm;
-     uint64_t type_key = get_type_id(wtype) | (get_type_id(itype) << 2) | (get_type_id(rtype) << 4) | (get_type_id(otype) << 6) | (get_type_id(ctype) << 8);
-     uint64_t launcher_key = (type_key << 32) | hidden_size;
-     return launcher_key;
- }
-
- }  // namespace layer_norm
-
- ////////////////////////////////////////////////////////////////////////////////////////////////////
-
- layer_norm::FwdFunction & get_fwd_launcher(torch::Dtype wtype, torch::Dtype itype, torch::Dtype rtype, torch::Dtype otype, torch::Dtype ctype, uint32_t hidden_size) {
-     auto iter = layer_norm::FWD_FUNCS.find(layer_norm::get_key(wtype, itype, rtype, otype, ctype, hidden_size));
-     if( iter != layer_norm::FWD_FUNCS.end() ) {
-         return iter->second;
-     } else {
-         TORCH_CHECK(false, "FWD: Unsupported hidden_size or types: ", hidden_size, wtype, itype, rtype, otype, ctype);
-     }
- }
-
- ////////////////////////////////////////////////////////////////////////////////////////////////////
-
- layer_norm::BwdFunction & get_bwd_launcher(torch::Dtype wtype, torch::Dtype itype, torch::Dtype rtype, torch::Dtype otype, torch::Dtype ctype, uint32_t hidden_size) {
-     auto iter = layer_norm::BWD_FUNCS.find(layer_norm::get_key(wtype, itype, rtype, otype, ctype, hidden_size));
-     if( iter != layer_norm::BWD_FUNCS.end() ) {
-         return iter->second;
-     } else {
-         TORCH_CHECK(false, "BWD: Unsupported hidden_size or types: ", hidden_size, wtype, itype, rtype, otype, ctype);
-     }
- }
-
- ////////////////////////////////////////////////////////////////////////////////////////////////////
-
- layer_norm::FwdFunction & get_parallel_fwd_launcher(torch::Dtype wtype, torch::Dtype itype, torch::Dtype rtype, torch::Dtype otype, torch::Dtype ctype, uint32_t hidden_size) {
-     auto iter = layer_norm::PARALLEL_FWD_FUNCS.find(layer_norm::get_key(wtype, itype, rtype, otype, ctype, hidden_size));
-     if( iter != layer_norm::PARALLEL_FWD_FUNCS.end() ) {
-         return iter->second;
-     } else {
-         TORCH_CHECK(false, "FWD: Unsupported hidden_size or types: ", hidden_size, wtype, itype, rtype, otype, ctype);
-     }
- }
-
- ////////////////////////////////////////////////////////////////////////////////////////////////////
-
- layer_norm::BwdFunction & get_parallel_bwd_launcher(torch::Dtype wtype, torch::Dtype itype, torch::Dtype rtype, torch::Dtype otype, torch::Dtype ctype, uint32_t hidden_size) {
-     auto iter = layer_norm::PARALLEL_BWD_FUNCS.find(layer_norm::get_key(wtype, itype, rtype, otype, ctype, hidden_size));
-     if( iter != layer_norm::PARALLEL_BWD_FUNCS.end() ) {
-         return iter->second;
-     } else {
-         TORCH_CHECK(false, "BWD: Unsupported hidden_size or types: ", hidden_size, wtype, itype, rtype, otype, ctype);
-     }
- }
-
- ////////////////////////////////////////////////////////////////////////////////////////////////////
-
- std::vector<at::Tensor> dropout_add_ln_fwd(const at::Tensor &x0,                            // Input: BxSxhidden_size
-                                            c10::optional<const at::Tensor> &residual_,      // Residual: BxSxhidden_size
-                                            const at::Tensor &gamma,                         // hidden_size
-                                            c10::optional<const at::Tensor> &beta_,          // hidden_size
-                                            c10::optional<const at::Tensor> &rowscale_,      // BxS
-                                            c10::optional<const at::Tensor> &colscale_,      // hidden_size
-                                            c10::optional<const at::Tensor> &x0_subset_,     // BxS
-                                            c10::optional<const at::Tensor> &z_subset_,      // BxS
-                                            const float dropout_p,
-                                            const float epsilon,
-                                            const float rowscale_const,
-                                            const int64_t z_numrows,
-                                            c10::optional<at::Generator> gen_,
-                                            bool residual_in_fp32=false,
-                                            bool is_rms_norm=false
- ) {
-     auto itype = x0.scalar_type();
-     auto rtype = residual_.has_value()
-         ? residual_.value().scalar_type()
-         : (residual_in_fp32 ? torch::kFloat32 : x0.scalar_type());
-     auto wtype = gamma.scalar_type();
-     auto otype = itype;
-     auto ctype = torch::kFloat32;
-     auto mtype = torch::kUInt8;
-
-     TORCH_CHECK(x0.is_cuda());
-     TORCH_CHECK(gamma.is_cuda());
-
-     TORCH_CHECK(x0.is_contiguous());
-     // c10::IntArrayRef does not own the storage, so we need to construct a vector.
-     // Otherwise just constructing IntArrayRef({blah}) will cause uninitialized memory because
-     // blah is then deallocated.
-     std::vector<int64_t> sizes_vec {!x0_subset_.has_value() ? x0.size(0) : x0_subset_.value().size(0), x0.size(1)};
-     auto sizes = c10::IntArrayRef(sizes_vec);
-     TORCH_CHECK(x0.dim() == 2);
-     TORCH_CHECK(sizes.size() == 2);
-
-     const int rows = sizes[0];
-     const int cols = sizes[1];
-     auto hidden_size = gamma.numel();
-     TORCH_CHECK(hidden_size == cols);
-
-     if (beta_.has_value()) {
-         auto beta = beta_.value();
-         TORCH_CHECK(beta.dtype() == wtype);
-         TORCH_CHECK(beta.is_cuda());
-         TORCH_CHECK(beta.is_contiguous());
-         TORCH_CHECK(beta.sizes() == gamma.sizes());
-     }
-
-     if (residual_.has_value()) {
-         auto residual = residual_.value();
-         TORCH_CHECK(residual.is_cuda());
-         TORCH_CHECK(residual.is_contiguous());
-         TORCH_CHECK(residual.sizes() == sizes);
-     }
-
-     if (rowscale_.has_value()) {
-         auto rowscale = rowscale_.value();
-         TORCH_CHECK(rowscale.is_cuda());
-         TORCH_CHECK(rowscale.is_contiguous());
-         TORCH_CHECK(rowscale.sizes() == c10::IntArrayRef{rows});
-         TORCH_CHECK(rowscale.dtype() == itype);
-     }
-
-     if (colscale_.has_value()) {
-         auto colscale = colscale_.value();
-         TORCH_CHECK(colscale.is_cuda());
-         TORCH_CHECK(colscale.is_contiguous());
-         TORCH_CHECK(colscale.sizes() == c10::IntArrayRef{cols});
-         TORCH_CHECK(colscale.dtype() == wtype);
-     }
-
-     if (x0_subset_.has_value()) {
-         auto x0_subset = x0_subset_.value();
-         TORCH_CHECK(x0_subset.is_cuda());
-         TORCH_CHECK(x0_subset.is_contiguous());
-         TORCH_CHECK(x0_subset.sizes() == c10::IntArrayRef{rows});
-         TORCH_CHECK(x0_subset.dtype() == torch::kInt32);
-
-         TORCH_CHECK(z_subset_.has_value());
-         auto z_subset = z_subset_.value();
-         TORCH_CHECK(z_subset.is_cuda());
-         TORCH_CHECK(z_subset.is_contiguous());
-         TORCH_CHECK(z_subset.sizes() == c10::IntArrayRef{rows});
-         TORCH_CHECK(z_subset.dtype() == torch::kInt32);
-     }
-
-     TORCH_CHECK((hidden_size % 8 == 0) && (hidden_size <= 8192));
-     TORCH_CHECK(epsilon >= 0.f);
-
-     // Otherwise the kernel will be launched from cuda:0 device
-     // Cast to char to avoid compiler warning about narrowing
-     at::cuda::CUDAGuard device_guard{(char)x0.get_device()};
-
-     auto opts = x0.options();
-
-     bool save_x = residual_.has_value() || (dropout_p > 0.f) || rowscale_.has_value() || colscale_.has_value() || x0_subset_.has_value() || (itype != rtype);
-     at::Tensor x;
-     if (save_x) { x = torch::empty(sizes, opts.dtype(rtype)); }
-     at::Tensor dmask;
-     if (dropout_p > 0.f) { dmask = torch::empty(x0.sizes(), opts.dtype(mtype)); };
-     auto z = torch::empty(z_subset_.has_value() ? c10::IntArrayRef{z_numrows, cols} : sizes, opts.dtype(otype));
-
-     auto mu = torch::empty({ rows }, opts.dtype(ctype));
-     auto rsigma = torch::empty({ rows }, opts.dtype(ctype));
-
-     layer_norm::LaunchParams<layer_norm::FwdParams> launch_params;
-
-     launch_params.props = at::cuda::getCurrentDeviceProperties();
-     launch_params.stream = at::cuda::getCurrentCUDAStream().stream();
-     TORCH_CHECK(dropout_p < 1.f);
-     launch_params.params.dropout_keep_p = 1.f - dropout_p;
-     launch_params.params.residual = residual_.has_value() ? residual_.value().data_ptr() : nullptr;
-     launch_params.params.rowscale = rowscale_.has_value() ? rowscale_.value().data_ptr() : nullptr;
-     launch_params.params.colscale = colscale_.has_value() ? colscale_.value().data_ptr() : nullptr;
-     launch_params.params.x0_subset = x0_subset_.has_value() ? x0_subset_.value().data_ptr() : nullptr;
-     launch_params.params.z_subset = z_subset_.has_value() ? z_subset_.value().data_ptr() : nullptr;
-
-     auto gen = at::get_generator_or_default<at::CUDAGeneratorImpl>(
-         gen_, at::cuda::detail::getDefaultCUDAGenerator());
-
-     auto round_multiple = [](int x, int m) { return (x + m - 1) / m * m; };
-     const int multiple = hidden_size <= 1536 ? 256 : (hidden_size <= 3072 ? 512 : 1024);
-     // Request the kernel launcher.
-     auto launcher = get_fwd_launcher(wtype, itype, rtype, otype, ctype, round_multiple(hidden_size, multiple));
-
-     // Set the kernel runtime parameters.
-     layer_norm::FwdParams &params = launch_params.params;
-     params.rows = rows;
-     params.cols = cols;
-     params.x0 = x0.data_ptr();
-     params.x = save_x ? x.data_ptr() : nullptr;
-     params.dmask = dropout_p > 0.f ? dmask.data_ptr() : nullptr;
-     params.mu = mu.data_ptr();
-     params.rs = rsigma.data_ptr();
-     params.gamma = gamma.data_ptr();
-     params.beta = beta_.has_value() ? beta_.value().data_ptr() : nullptr;
-     params.z = z.data_ptr();
-     params.epsilon = epsilon;
-     params.dropout_scale = 1.f / (1.f - dropout_p);
-     params.inverse_cols = 1.f / float(params.cols);
-     params.rowscale_const = rowscale_const;
-     params.is_rms_norm = is_rms_norm;
-
-     // Query the kernel-specific launch parameters.
-     launcher(launch_params, true);
-
-     at::Tensor workspace, barrier;
-
-     if (dropout_p > 0.f) {
-         // number of times random will be generated per thread, to offset philox counter in thc random
-         // state
-         int64_t counter_offset = launch_params.elts_per_thread;
-
-         // See Note [Acquire lock when using random generators]
-         {
-             std::lock_guard<std::mutex> lock(gen->mutex_);
-             params.philox_args = gen->philox_cuda_state(counter_offset);
-         }
-     }
-
-     if( launch_params.barrier_size > 0 ) {
-         auto options = x0.options();
-         barrier = torch::zeros(launch_params.barrier_size, options.dtype(torch::kInt32));
-         workspace = torch::empty(launch_params.workspace_bytes, options.dtype(torch::kChar));
-         params.workspace = workspace.data_ptr();
-         params.barrier = barrier.data_ptr<int>();
-     }
-
-     // Launch the kernel.
-     launcher(launch_params, false);
-
-     return { z, x, dmask, mu, rsigma };
- }
-
- ////////////////////////////////////////////////////////////////////////////////////////////////////
-
- std::vector<at::Tensor> dropout_add_ln_bwd(const at::Tensor &dz,                            // BxSxhidden_size
-                                            c10::optional<const at::Tensor> &dx_,            // BxSxhidden_size
-                                            const at::Tensor &x,                             // BxSxhidden_size
-                                            c10::optional<const at::Tensor> &x0_,            // BxSxhidden_size
-                                            c10::optional<const at::Tensor> &dmask_,         // BxSxhidden_size
-                                            const at::Tensor &mu,                            // BxS, FP32!
-                                            const at::Tensor &rsigma,                        // BxS, FP32!
-                                            const at::Tensor &gamma,                         // hidden_size
-                                            c10::optional<const at::Tensor> &rowscale_,      // BxS
-                                            c10::optional<const at::Tensor> &colscale_,      // hidden_size
-                                            c10::optional<const at::Tensor> &x0_subset_,     // BxS
-                                            c10::optional<const at::Tensor> &z_subset_,      // BxS
-                                            const float dropout_p,
-                                            const float rowscale_const,
-                                            const int64_t x0_numrows,
-                                            const bool has_residual,
-                                            bool is_rms_norm=false
- ) {
-
-     auto itype = dz.scalar_type();
-     auto rtype = x.scalar_type();
-     auto wtype = gamma.scalar_type();
-     auto otype = itype;
-     auto ctype = torch::kFloat32;
-     auto mtype = torch::kUInt8;
-
-     if (dropout_p > 0.f) { TORCH_CHECK(dmask_.has_value()); }
-
-     TORCH_CHECK(dz.dtype() == otype);
-     TORCH_CHECK(mu.dtype() == ctype);
-     TORCH_CHECK(rsigma.dtype() == ctype);
-
-     TORCH_CHECK(x.is_cuda());
-     TORCH_CHECK(dz.is_cuda());
-     TORCH_CHECK(mu.is_cuda());
-     TORCH_CHECK(rsigma.is_cuda());
-     TORCH_CHECK(gamma.is_cuda());
-
-     TORCH_CHECK(x.is_contiguous());
-     TORCH_CHECK(dz.is_contiguous());
-
-     auto sizes = x.sizes();
-     TORCH_CHECK(sizes.size() == 2);
-     auto rows = sizes[0];
-     auto cols = sizes[1];
-     TORCH_CHECK(dz.dim() == 2);
-     TORCH_CHECK(dz.size(1) == cols);
-     auto hidden_size = gamma.numel();
-     TORCH_CHECK(hidden_size == cols);
-
-     // c10::IntArrayRef does not own the storage, so we need to construct a vector.
-     // Otherwise just constructing IntArrayRef({blah}) will cause uninitialized memory because
-     // blah is then deallocated.
-     std::vector<int64_t> x0_sizes_vec {!x0_subset_.has_value() ? rows : x0_numrows, cols};
-     auto x0_sizes = c10::IntArrayRef(x0_sizes_vec);
-
-     if (dx_.has_value()) {
-         auto dx = dx_.value();
-         TORCH_CHECK(dx.dtype() == rtype);
-         TORCH_CHECK(dx.is_cuda());
-         TORCH_CHECK(dx.is_contiguous());
-         TORCH_CHECK(dx.sizes() == sizes);
-     }
-
-     if (dmask_.has_value()) {
-         auto dmask = dmask_.value();
-         TORCH_CHECK(dmask.dtype() == mtype);
-         TORCH_CHECK(dmask.is_cuda());
-         TORCH_CHECK(dmask.is_contiguous());
-         TORCH_CHECK(dmask.sizes() == x0_sizes);
-     }
-
-     if (rowscale_.has_value()) {
-         auto rowscale = rowscale_.value();
-         TORCH_CHECK(rowscale.is_cuda());
-         TORCH_CHECK(rowscale.is_contiguous());
-         TORCH_CHECK(rowscale.sizes() == c10::IntArrayRef{rows});
-         TORCH_CHECK(rowscale.dtype() == itype);
-     }
-
-     if (colscale_.has_value()) {
-         auto colscale = colscale_.value();
-         TORCH_CHECK(colscale.is_cuda());
-         TORCH_CHECK(colscale.is_contiguous());
-         TORCH_CHECK(colscale.sizes() == c10::IntArrayRef{cols});
-         TORCH_CHECK(colscale.dtype() == wtype);
-
-         TORCH_CHECK(x0_.has_value());
-         auto x0 = x0_.value();
-         TORCH_CHECK(x0.is_cuda());
-         TORCH_CHECK(x0.is_contiguous());
-         TORCH_CHECK(x0.sizes() == x0_sizes);
-         TORCH_CHECK(x0.dtype() == itype);
-     }
-
-     if (x0_subset_.has_value()) {
-         auto x0_subset = x0_subset_.value();
-         TORCH_CHECK(x0_subset.is_cuda());
-         TORCH_CHECK(x0_subset.is_contiguous());
-         TORCH_CHECK(x0_subset.sizes() == c10::IntArrayRef{rows});
-         TORCH_CHECK(x0_subset.dtype() == torch::kInt32);
-
-         TORCH_CHECK(z_subset_.has_value());
-         auto z_subset = z_subset_.value();
-         TORCH_CHECK(z_subset.is_cuda());
-         TORCH_CHECK(z_subset.is_contiguous());
-         TORCH_CHECK(z_subset.sizes() == c10::IntArrayRef{rows});
-         TORCH_CHECK(z_subset.dtype() == torch::kInt32);
-     }
-
-     TORCH_CHECK((hidden_size % 8 == 0) && (hidden_size <= 8192));
-
-     TORCH_CHECK(mu.numel() == rows);
-     TORCH_CHECK(mu.sizes() == rsigma.sizes());
-
-     TORCH_CHECK(gamma.numel() == cols);
-
-     // Otherwise the kernel will be launched from cuda:0 device
-     // Cast to char to avoid compiler warning about narrowing
-     at::cuda::CUDAGuard device_guard{(char)dz.get_device()};
-
-     auto opts = x.options();
-
-     auto dx0 = torch::empty(x0_sizes, opts.dtype(itype));
-     at::Tensor dresidual;
-     if (has_residual) { dresidual = torch::empty_like(x, opts.dtype(rtype)); }
-     auto dgamma = torch::empty_like(gamma);
-     auto dbeta = torch::empty_like(gamma);
-     at::Tensor dcolscale;
-     if (colscale_.has_value()) {
-         dcolscale = torch::empty_like(colscale_.value());
-     }
-
-     layer_norm::LaunchParams<layer_norm::BwdParams> launch_params;
-     launch_params.stream = at::cuda::getCurrentCUDAStream().stream();
-     launch_params.props = at::cuda::getCurrentDeviceProperties();
-     TORCH_CHECK(dropout_p < 1.f);
-     launch_params.params.dropout_keep_p = 1.f - dropout_p;
-     launch_params.params.dresidual = has_residual ? dresidual.data_ptr() : nullptr;
-     launch_params.params.rowscale = rowscale_.has_value() ? rowscale_.value().data_ptr() : nullptr;
-     launch_params.params.colscale = colscale_.has_value() ? colscale_.value().data_ptr() : nullptr;
-     launch_params.params.x0_subset = x0_subset_.has_value() ? x0_subset_.value().data_ptr() : nullptr;
-     launch_params.params.z_subset = z_subset_.has_value() ? z_subset_.value().data_ptr() : nullptr;
-
-     auto round_multiple = [](int x, int m) { return (x + m - 1) / m * m; };
-     const int multiple = hidden_size <= 1536 ? 256 : (hidden_size <= 3072 ? 512 : 1024);
-     auto launcher = get_bwd_launcher(wtype, itype, rtype, otype, ctype, round_multiple(hidden_size, multiple));
-
-     launcher(launch_params, true);
-
-     auto dgamma_part = torch::empty({ launch_params.params.ctas_per_col, hidden_size }, opts.dtype(ctype));
-     auto dbeta_part = torch::empty({ launch_params.params.ctas_per_col, hidden_size }, opts.dtype(ctype));
-     at::Tensor dcolscale_part;
-     if (colscale_.has_value()) {
-         dcolscale_part = torch::empty({ launch_params.params.ctas_per_col, hidden_size }, opts.dtype(ctype));
-     }
-     at::Tensor workspace, barrier;
-
-     layer_norm::BwdParams &params = launch_params.params;
-     params.rows = rows;
-     params.cols = cols;
-     params.x = x.data_ptr();
-     params.x0 = x0_.has_value() ? x0_.value().data_ptr() : nullptr;
-     params.dmask = dropout_p > 0.f ? dmask_.value().data_ptr() : nullptr;
-     params.mu = mu.data_ptr();
-     params.rs = rsigma.data_ptr();
-     params.gamma = gamma.data_ptr();
-     params.dz = dz.data_ptr();
-     params.dx = dx_.has_value() ? dx_.value().data_ptr() : nullptr;
-     params.dx0 = dx0.data_ptr();
-     params.dbeta = dbeta.data_ptr();
-     params.dgamma = dgamma.data_ptr();
-     params.dcolscale = colscale_.has_value() ? dcolscale.data_ptr() : nullptr;
-     params.dbeta_part = dbeta_part.data_ptr();
-     params.dgamma_part = dgamma_part.data_ptr();
-     params.dcolscale_part = colscale_.has_value() ? dcolscale_part.data_ptr() : nullptr;
-     params.dropout_scale = 1.f / (1.f - dropout_p);
-     params.inverse_cols = 1.f / float(params.cols);
-     params.rowscale_const = rowscale_const;
-     params.is_rms_norm = is_rms_norm;
-
-     if( launch_params.barrier_size > 0 ) {
-         // TODO Any way to avoid this?
-         barrier = torch::zeros(launch_params.barrier_size, opts.dtype(torch::kInt32));
-         workspace = torch::empty(launch_params.workspace_bytes, opts.dtype(torch::kChar));
-         params.workspace = workspace.data_ptr();
-         params.barrier = barrier.data_ptr<int>();
-     }
-
-     launcher(launch_params, false);
-
-     std::vector<at::Tensor> result = { dx0, dresidual, dgamma, dbeta, dgamma_part, dbeta_part };
-     if (colscale_.has_value()) {
-         result.push_back(dcolscale);
-         result.push_back(dcolscale_part);
-     }
-     return result;
- }
-
- ////////////////////////////////////////////////////////////////////////////////////////////////////
-
- std::vector<at::Tensor> dropout_add_ln_parallel_residual_fwd(
-     const at::Tensor &x0,                            // Input: BxSxhidden_size
-     c10::optional<const at::Tensor> &x1_,            // Input: BxSxhidden_size
-     c10::optional<const at::Tensor> &residual_,      // Residual: BxSxhidden_size
-     const at::Tensor &gamma0,                        // hidden_size
-     c10::optional<const at::Tensor> &beta0_,         // hidden_size
-     c10::optional<const at::Tensor> &gamma1_,        // hidden_size
-     c10::optional<const at::Tensor> &beta1_,         // hidden_size
-     const float dropout_p,
-     const float epsilon,
-     c10::optional<at::Generator> gen_,
-     bool residual_in_fp32=false,
-     bool is_rms_norm=false
- ) {
-     auto itype = x0.scalar_type();
-     auto rtype = residual_.has_value()
-         ? residual_.value().scalar_type()
-         : (residual_in_fp32 ? torch::kFloat32 : x0.scalar_type());
-     auto wtype = gamma0.scalar_type();
-     auto otype = itype;
-     auto ctype = torch::kFloat32;
-     auto mtype = torch::kUInt8;
-
-     TORCH_CHECK(x0.is_cuda());
-     TORCH_CHECK(gamma0.is_cuda());
-
-     TORCH_CHECK(x0.is_contiguous());
-     const auto sizes = x0.sizes();
-     TORCH_CHECK(x0.dim() == 2);
-
-     const int rows = sizes[0];
-     const int cols = sizes[1];
-     auto hidden_size = gamma0.numel();
-     TORCH_CHECK(hidden_size == cols);
-
-     if (x1_.has_value()) {
-         auto x1 = x1_.value();
-         TORCH_CHECK(x1.is_cuda());
-         TORCH_CHECK(x1.is_contiguous());
-         TORCH_CHECK(x1.sizes() == sizes);
-     }
-
-     if (residual_.has_value()) {
-         auto residual = residual_.value();
-         TORCH_CHECK(residual.is_cuda());
-         TORCH_CHECK(residual.is_contiguous());
-         TORCH_CHECK(residual.sizes() == sizes);
-     }
-
-     if (beta0_.has_value()) {
-         auto beta0 = beta0_.value();
-         TORCH_CHECK(beta0.dtype() == wtype);
-         TORCH_CHECK(beta0.is_cuda());
-         TORCH_CHECK(beta0.is_contiguous());
-         TORCH_CHECK(beta0.sizes() == gamma0.sizes());
-     }
-
-     if (gamma1_.has_value()) {
-         auto gamma1 = gamma1_.value();
-         TORCH_CHECK(gamma1.dtype() == wtype);
-         TORCH_CHECK(gamma1.is_cuda());
-         TORCH_CHECK(gamma1.is_contiguous());
-         TORCH_CHECK(gamma1.sizes() == gamma0.sizes());
-     }
-
-     if (beta1_.has_value()) {
-         auto beta1 = beta1_.value();
-         TORCH_CHECK(beta1.dtype() == wtype);
-         TORCH_CHECK(beta1.is_cuda());
-         TORCH_CHECK(beta1.is_contiguous());
-         TORCH_CHECK(beta1.sizes() == gamma0.sizes());
-     }
-
-     TORCH_CHECK((hidden_size % 8 == 0) && (hidden_size <= 8192));
-     TORCH_CHECK(epsilon >= 0.f);
-
-     // Otherwise the kernel will be launched from cuda:0 device
-     // Cast to char to avoid compiler warning about narrowing
-     at::cuda::CUDAGuard device_guard{(char)x0.get_device()};
-
-     auto opts = x0.options();
-
-     bool save_x = residual_.has_value() || x1_.has_value() || (dropout_p > 0.f) || (itype != rtype);
-     at::Tensor x;
-     if (save_x) { x = torch::empty(sizes, opts.dtype(rtype)); }
-     at::Tensor dmask0, dmask1;
-     if (dropout_p > 0.f) {
-         dmask0 = torch::empty(x0.sizes(), opts.dtype(mtype));
-         if (x1_.has_value()) { dmask1 = torch::empty(x0.sizes(), opts.dtype(mtype)); }
-     };
-     auto z0 = torch::empty(sizes, opts.dtype(otype));
-     at::Tensor z1;
-     if (gamma1_.has_value()) { z1 = torch::empty(sizes, opts.dtype(otype)); }
-
-     auto mu = torch::empty({ rows }, opts.dtype(ctype));
-     auto rsigma = torch::empty({ rows }, opts.dtype(ctype));
-
-     layer_norm::LaunchParams<layer_norm::FwdParams> launch_params;
-
-     launch_params.props = at::cuda::getCurrentDeviceProperties();
-     launch_params.stream = at::cuda::getCurrentCUDAStream().stream();
-     TORCH_CHECK(dropout_p < 1.f);
-     launch_params.params.dropout_keep_p = 1.f - dropout_p;
-     launch_params.params.residual = residual_.has_value() ? residual_.value().data_ptr() : nullptr;
-
-     auto gen = at::get_generator_or_default<at::CUDAGeneratorImpl>(
-         gen_, at::cuda::detail::getDefaultCUDAGenerator());
-
-     auto round_multiple = [](int x, int m) { return (x + m - 1) / m * m; };
-     const int multiple = hidden_size <= 1536 ? 256 : (hidden_size <= 3072 ? 512 : 1024);
-     // Request the kernel launcher.
-     auto launcher = get_parallel_fwd_launcher(wtype, itype, rtype, otype, ctype, round_multiple(hidden_size, multiple));
-
-     // Set the kernel runtime parameters.
-     layer_norm::FwdParams &params = launch_params.params;
-     params.rows = rows;
-     params.cols = cols;
-     params.x0 = x0.data_ptr();
-     params.x1 = x1_.has_value() ? x1_.value().data_ptr() : nullptr;
-     params.x = save_x ? x.data_ptr() : nullptr;
-     params.dmask = dropout_p > 0.f ? dmask0.data_ptr() : nullptr;
-     params.dmask1 = (dropout_p > 0.f && x1_.has_value()) ? dmask1.data_ptr() : nullptr;
-     params.mu = mu.data_ptr();
-     params.rs = rsigma.data_ptr();
-     params.gamma = gamma0.data_ptr();
-     params.gamma1 = gamma1_.has_value() ? gamma1_.value().data_ptr() : nullptr;
-     params.beta = beta0_.has_value() ? beta0_.value().data_ptr() : nullptr;
-     params.beta1 = beta1_.has_value() ? beta1_.value().data_ptr() : nullptr;
-     params.z = z0.data_ptr();
-     params.z1 = gamma1_.has_value() ? z1.data_ptr() : nullptr;
-     params.epsilon = epsilon;
-     params.dropout_scale = 1.f / (1.f - dropout_p);
-     params.inverse_cols = 1.f / float(params.cols);
-     params.is_rms_norm = is_rms_norm;
-
-     // Query the kernel-specific launch parameters.
-     launcher(launch_params, true);
-
-     at::Tensor workspace, barrier;
-
-     if (dropout_p > 0.f) {
-         // number of times random will be generated per thread, to offset philox counter in thc random
-         // state
-         int64_t counter_offset = 2 * launch_params.elts_per_thread;
-
-         // See Note [Acquire lock when using random generators]
-         {
-             std::lock_guard<std::mutex> lock(gen->mutex_);
-             params.philox_args = gen->philox_cuda_state(counter_offset);
-         }
-     }
-
-     if( launch_params.barrier_size > 0 ) {
-         auto options = x0.options();
-         barrier = torch::zeros(launch_params.barrier_size, options.dtype(torch::kInt32));
-         workspace = torch::empty(launch_params.workspace_bytes, options.dtype(torch::kChar));
-         params.workspace = workspace.data_ptr();
-         params.barrier = barrier.data_ptr<int>();
-     }
-
-     // Launch the kernel.
-     launcher(launch_params, false);
-
-     return { z0, z1, x, dmask0, dmask1, mu, rsigma };
- }
-
- ////////////////////////////////////////////////////////////////////////////////////////////////////
-
- std::vector<at::Tensor> dropout_add_ln_parallel_residual_bwd(
-     const at::Tensor &dz0,                           // BxSxhidden_size
-     c10::optional<const at::Tensor> &dz1_,           // BxSxhidden_size
-     c10::optional<const at::Tensor> &dx_,            // BxSxhidden_size
-     const at::Tensor &x,                             // BxSxhidden_size
-     c10::optional<const at::Tensor> &dmask0_,        // BxSxhidden_size
-     c10::optional<const at::Tensor> &dmask1_,        // BxSxhidden_size
-     const at::Tensor &mu,                            // BxS, FP32!
-     const at::Tensor &rsigma,                        // BxS, FP32!
-     const at::Tensor &gamma0,                        // hidden_size
-     c10::optional<const at::Tensor> &gamma1_,        // hidden_size
-     const float dropout_p,
-     const bool has_x1,
-     const bool has_residual,
-     bool is_rms_norm=false
- ) {
-
-     auto itype = dz0.scalar_type();
-     auto rtype = x.scalar_type();
-     auto wtype = gamma0.scalar_type();
-     auto otype = itype;
-     auto ctype = torch::kFloat32;
-     auto mtype = torch::kUInt8;
-
-     if (dropout_p > 0.f) { TORCH_CHECK(dmask0_.has_value()); }
-
-     TORCH_CHECK(dz0.dtype() == otype);
-     TORCH_CHECK(dz0.dtype() == otype);
-     TORCH_CHECK(mu.dtype() == ctype);
-     TORCH_CHECK(rsigma.dtype() == ctype);
-
-     TORCH_CHECK(x.is_cuda());
-     TORCH_CHECK(dz0.is_cuda());
-     TORCH_CHECK(mu.is_cuda());
-     TORCH_CHECK(rsigma.is_cuda());
-     TORCH_CHECK(gamma0.is_cuda());
-
-     TORCH_CHECK(x.is_contiguous());
-     TORCH_CHECK(dz0.is_contiguous());
-
-     auto sizes = x.sizes();
-     TORCH_CHECK(sizes.size() == 2);
-     auto rows = sizes[0];
-     auto cols = sizes[1];
-     TORCH_CHECK(dz0.dim() == 2);
-     TORCH_CHECK(dz0.size(1) == cols);
-     auto hidden_size = gamma0.numel();
-     TORCH_CHECK(hidden_size == cols);
-
-     if (dz1_.has_value()) {
-         auto dz1 = dz1_.value();
-         TORCH_CHECK(dz1.dtype() == otype);
-         TORCH_CHECK(dz1.is_cuda());
-         TORCH_CHECK(dz1.is_contiguous());
-         TORCH_CHECK(dz1.sizes() == sizes);
-
-         TORCH_CHECK(gamma1_.has_value());
-         auto gamma1 = gamma1_.value();
-         TORCH_CHECK(gamma1.dtype() == wtype);
-         TORCH_CHECK(gamma1.is_cuda());
-         TORCH_CHECK(gamma1.is_contiguous());
-         TORCH_CHECK(gamma1.sizes() == gamma0.sizes());
-     }
-
-     if (dx_.has_value()) {
-         auto dx = dx_.value();
-         TORCH_CHECK(dx.dtype() == rtype);
-         TORCH_CHECK(dx.is_cuda());
-         TORCH_CHECK(dx.is_contiguous());
-         TORCH_CHECK(dx.sizes() == sizes);
-     }
-
-     if (dmask0_.has_value()) {
-         auto dmask0 = dmask0_.value();
-         TORCH_CHECK(dmask0.dtype() == mtype);
-         TORCH_CHECK(dmask0.is_cuda());
-         TORCH_CHECK(dmask0.is_contiguous());
-         TORCH_CHECK(dmask0.sizes() == sizes);
-
-         if (has_x1) {
-             TORCH_CHECK(dmask1_.has_value());
-             auto dmask1 = dmask1_.value();
-             TORCH_CHECK(dmask1.dtype() == mtype);
-             TORCH_CHECK(dmask1.is_cuda());
-             TORCH_CHECK(dmask1.is_contiguous());
-             TORCH_CHECK(dmask1.sizes() == sizes);
-         }
-     }
-
-     TORCH_CHECK((hidden_size % 8 == 0) && (hidden_size <= 8192));
-
-     TORCH_CHECK(mu.numel() == rows);
-     TORCH_CHECK(mu.sizes() == rsigma.sizes());
-
-     // Otherwise the kernel will be launched from cuda:0 device
-     // Cast to char to avoid compiler warning about narrowing
-     at::cuda::CUDAGuard device_guard{(char)dz0.get_device()};
-
-     auto opts = x.options();
-
-     auto dx0 = torch::empty(sizes, opts.dtype(itype));
-     at::Tensor dx1;
-     if (has_x1) { dx1 = torch::empty(sizes, opts.dtype(itype)); }
-     at::Tensor dresidual;
-     if (has_residual) { dresidual = torch::empty_like(x, opts.dtype(rtype)); }
-     auto dgamma0 = torch::empty_like(gamma0);
-     auto dbeta0 = torch::empty_like(gamma0);
-     at::Tensor dgamma1, dbeta1;
-     if (gamma1_.has_value()) {
-         dgamma1 = torch::empty_like(gamma0);
-         dbeta1 = torch::empty_like(gamma0);
-     }
-
-     layer_norm::LaunchParams<layer_norm::BwdParams> launch_params;
-     launch_params.stream = at::cuda::getCurrentCUDAStream().stream();
-     launch_params.props = at::cuda::getCurrentDeviceProperties();
-     TORCH_CHECK(dropout_p < 1.f);
-     launch_params.params.dropout_keep_p = 1.f - dropout_p;
-     launch_params.params.dresidual = has_residual ? dresidual.data_ptr() : nullptr;
-
-     auto round_multiple = [](int x, int m) { return (x + m - 1) / m * m; };
-     const int multiple = hidden_size <= 1536 ? 256 : (hidden_size <= 3072 ? 512 : 1024);
-     auto launcher = get_parallel_bwd_launcher(wtype, itype, rtype, otype, ctype, round_multiple(hidden_size, multiple));
-
-     launcher(launch_params, true);
-
-     auto dgamma0_part = torch::zeros({ launch_params.params.ctas_per_col, hidden_size }, opts.dtype(ctype));
-     auto dbeta0_part = torch::zeros({ launch_params.params.ctas_per_col, hidden_size }, opts.dtype(ctype));
-     at::Tensor dgamma1_part, dbeta1_part;
-     if (gamma1_.has_value()) {
-         dgamma1_part = torch::zeros_like(dgamma0_part);
-         dbeta1_part = torch::zeros_like(dbeta0_part);
-     }
-     at::Tensor workspace, barrier;
-
-     layer_norm::BwdParams &params = launch_params.params;
-     params.rows = rows;
-     params.cols = cols;
-     params.x = x.data_ptr();
-     params.dmask = dropout_p > 0.f ? dmask0_.value().data_ptr() : nullptr;
- params.dmask1 = (dropout_p > 0.f && has_x1) ? dmask1_.value().data_ptr() : nullptr;
793
- params.mu = mu.data_ptr();
794
- params.rs = rsigma.data_ptr();
795
- params.gamma = gamma0.data_ptr();
796
- params.gamma1 = gamma1_.has_value() ? gamma1_.value().data_ptr() : nullptr;
797
- params.dz = dz0.data_ptr();
798
- params.dz1 = dz1_.has_value() ? dz1_.value().data_ptr() : nullptr;
799
- params.dx = dx_.has_value() ? dx_.value().data_ptr() : nullptr;
800
- params.dx0 = dx0.data_ptr();
801
- params.dx1 = has_x1 ? dx1.data_ptr() : nullptr;
802
- params.dbeta = dbeta0.data_ptr();
803
- params.dgamma = dgamma0.data_ptr();
804
- params.dbeta1 = gamma1_.has_value() ? dbeta1.data_ptr() : nullptr;
805
- params.dgamma1 = gamma1_.has_value() ? dgamma1.data_ptr() : nullptr;
806
- params.dbeta_part = dbeta0_part.data_ptr();
807
- params.dgamma_part = dgamma0_part.data_ptr();
808
- params.dbeta1_part = gamma1_.has_value() ? dbeta1_part.data_ptr() : nullptr;
809
- params.dgamma1_part = gamma1_.has_value() ? dgamma1_part.data_ptr() : nullptr;
810
- params.dropout_scale = 1.f / (1.f - dropout_p);
811
- params.inverse_cols = 1.f / float(params.cols);
812
- params.is_rms_norm = is_rms_norm;
813
-
814
- if( launch_params.barrier_size > 0 ) {
815
- // TODO Any way to avoid this?
816
- barrier = torch::zeros(launch_params.barrier_size, opts.dtype(torch::kInt32));
817
- workspace = torch::empty(launch_params.workspace_bytes, opts.dtype(torch::kChar));
818
- params.workspace = workspace.data_ptr();
819
- params.barrier = barrier.data_ptr<int>();
820
- }
821
-
822
- launcher(launch_params, false);
823
-
824
- std::vector<at::Tensor> result = { dx0, dx1, dresidual, dgamma0, dbeta0, dgamma1, dbeta1, dgamma0_part, dbeta0_part, dgamma1_part, dbeta1_part };
825
- return result;
826
- }
827
-
828
- ////////////////////////////////////////////////////////////////////////////////////////////////////
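
Editor's note: the size dispatch shared by these entry points reduces to a small piece of arithmetic. A minimal standalone sketch (the function name is illustrative, not from the deleted source):

#include <cassert>

// Round the hidden size up to the nearest registered kernel size, with
// coarser granularity for wider rows -- the same arithmetic as the
// round_multiple lambda and the `multiple` selection above.
int rounded_hidden_size(int hidden_size) {
    assert(hidden_size % 8 == 0 && hidden_size <= 8192);  // mirrors the TORCH_CHECKs
    const int multiple = hidden_size <= 1536 ? 256 : (hidden_size <= 3072 ? 512 : 1024);
    return (hidden_size + multiple - 1) / multiple * multiple;
}
// e.g. 1000 -> 1024, 1536 -> 1536, 4104 -> 5120; every result corresponds to
// one of the REGISTER_*_LAUNCHER instantiations in the files removed below.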
 
layer_norm/ln_bwd_1024.cu DELETED
@@ -1,15 +0,0 @@
-#include "ln_bwd_kernels.cuh"
-
-// Create backward launch function and register. Macro signature:
-// HIDDEN_SIZE, WTYPE, ITYPE, RTYPE, OTYPE, CTYPE, CTAS_PER_ROW, WARPS_M, WARPS_N, BYTES_PER_LDG, BYTES_PER_LDG_FINAL
-
-REGISTER_BWD_LAUNCHER( 1024, fp32, fp32, fp32, fp32, fp32, 1, 4, 1, 16, 4);
-REGISTER_BWD_LAUNCHER( 1024, fp16, fp32, fp32, fp32, fp32, 1, 4, 1, 16, 4);
-REGISTER_BWD_LAUNCHER( 1024, fp32, fp16, fp32, fp16, fp32, 1, 4, 1, 16, 4);
-REGISTER_BWD_LAUNCHER( 1024, fp16, fp16, fp32, fp16, fp32, 1, 4, 1, 16, 4);
-REGISTER_BWD_LAUNCHER( 1024, fp32, fp16, fp16, fp16, fp32, 1, 4, 1, 16, 4);
-REGISTER_BWD_LAUNCHER( 1024, fp32, bf16, fp32, bf16, fp32, 1, 4, 1, 16, 4);
-REGISTER_BWD_LAUNCHER( 1024, bf16, bf16, fp32, bf16, fp32, 1, 4, 1, 16, 4);
-REGISTER_BWD_LAUNCHER( 1024, fp32, bf16, bf16, bf16, fp32, 1, 4, 1, 16, 4);
-REGISTER_BWD_LAUNCHER( 1024, fp16, fp16, fp16, fp16, fp32, 1, 4, 1, 16, 4);
-REGISTER_BWD_LAUNCHER( 1024, bf16, bf16, bf16, bf16, fp32, 1, 4, 1, 16, 4);
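
Editor's note: the comment above names the mechanism but not the plumbing. A hedged sketch of the registration pattern such macros typically expand to, assuming a function-pointer registry keyed on (dtypes, hidden size); all names below are illustrative, not the actual macro expansion:

#include <functional>
#include <map>
#include <stdexcept>
#include <string>
#include <tuple>

struct LaunchParamsStub { bool configured = false; };  // stand-in for LaunchParams<BwdParams>
using Launcher = std::function<void(LaunchParamsStub&, bool /*configure_params*/)>;
using Key = std::tuple<std::string, std::string, std::string, std::string, std::string, int>;

// Global registry that a host-side get_*_launcher() lookup would consult.
std::map<Key, Launcher>& bwd_registry() {
    static std::map<Key, Launcher> r;
    return r;
}

struct BwdRegistrar {
    BwdRegistrar(Key key, Launcher fn) { bwd_registry().emplace(std::move(key), std::move(fn)); }
};

// One REGISTER_BWD_LAUNCHER line corresponds roughly to one static registrar:
static BwdRegistrar reg_1024_bf16{
    {"bf16", "bf16", "bf16", "bf16", "fp32", 1024},
    [](LaunchParamsStub& lp, bool configure) { lp.configured = configure; }  // would call launch_<...>
};

Launcher get_bwd_launcher_sketch(const Key& key) {
    auto it = bwd_registry().find(key);
    if (it == bwd_registry().end()) throw std::invalid_argument("no kernel registered for this configuration");
    return it->second;
}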
 
layer_norm/ln_bwd_1280.cu DELETED
@@ -1,15 +0,0 @@
-#include "ln_bwd_kernels.cuh"
-
-// Create backward launch function and register. Macro signature:
-// HIDDEN_SIZE, WTYPE, ITYPE, RTYPE, OTYPE, CTYPE, CTAS_PER_ROW, WARPS_M, WARPS_N, BYTES_PER_LDG, BYTES_PER_LDG_FINAL
-
-REGISTER_BWD_LAUNCHER( 1280, fp32, fp32, fp32, fp32, fp32, 1, 4, 1, 16, 4);
-REGISTER_BWD_LAUNCHER( 1280, fp16, fp32, fp32, fp32, fp32, 1, 4, 1, 16, 4);
-REGISTER_BWD_LAUNCHER( 1280, fp32, fp16, fp32, fp16, fp32, 1, 4, 1, 16, 4);
-REGISTER_BWD_LAUNCHER( 1280, fp16, fp16, fp32, fp16, fp32, 1, 4, 1, 16, 4);
-REGISTER_BWD_LAUNCHER( 1280, fp32, fp16, fp16, fp16, fp32, 1, 4, 1, 16, 4);
-REGISTER_BWD_LAUNCHER( 1280, fp32, bf16, fp32, bf16, fp32, 1, 4, 1, 16, 4);
-REGISTER_BWD_LAUNCHER( 1280, bf16, bf16, fp32, bf16, fp32, 1, 4, 1, 16, 4);
-REGISTER_BWD_LAUNCHER( 1280, fp32, bf16, bf16, bf16, fp32, 1, 4, 1, 16, 4);
-REGISTER_BWD_LAUNCHER( 1280, fp16, fp16, fp16, fp16, fp32, 1, 4, 1, 16, 4);
-REGISTER_BWD_LAUNCHER( 1280, bf16, bf16, bf16, bf16, fp32, 1, 4, 1, 16, 4);
 
layer_norm/ln_bwd_1536.cu DELETED
@@ -1,15 +0,0 @@
-#include "ln_bwd_kernels.cuh"
-
-// Create backward launch function and register. Macro signature:
-// HIDDEN_SIZE, WTYPE, ITYPE, RTYPE, OTYPE, CTYPE, CTAS_PER_ROW, WARPS_M, WARPS_N, BYTES_PER_LDG, BYTES_PER_LDG_FINAL
-
-REGISTER_BWD_LAUNCHER( 1536, fp32, fp32, fp32, fp32, fp32, 1, 1, 4, 16, 4);
-REGISTER_BWD_LAUNCHER( 1536, fp16, fp32, fp32, fp32, fp32, 1, 1, 4, 16, 4);
-REGISTER_BWD_LAUNCHER( 1536, fp32, fp16, fp32, fp16, fp32, 1, 1, 4, 8, 4);
-REGISTER_BWD_LAUNCHER( 1536, fp16, fp16, fp32, fp16, fp32, 1, 1, 4, 8, 4);
-REGISTER_BWD_LAUNCHER( 1536, fp32, fp16, fp16, fp16, fp32, 1, 1, 4, 8, 4);
-REGISTER_BWD_LAUNCHER( 1536, fp32, bf16, fp32, bf16, fp32, 1, 1, 4, 8, 4);
-REGISTER_BWD_LAUNCHER( 1536, bf16, bf16, fp32, bf16, fp32, 1, 1, 4, 8, 4);
-REGISTER_BWD_LAUNCHER( 1536, fp32, bf16, bf16, bf16, fp32, 1, 1, 4, 8, 4);
-REGISTER_BWD_LAUNCHER( 1536, fp16, fp16, fp16, fp16, fp32, 1, 1, 4, 8, 4);
-REGISTER_BWD_LAUNCHER( 1536, bf16, bf16, bf16, bf16, fp32, 1, 1, 4, 8, 4);
 
layer_norm/ln_bwd_2048.cu DELETED
@@ -1,15 +0,0 @@
-#include "ln_bwd_kernels.cuh"
-
-// Create backward launch function and register. Macro signature:
-// HIDDEN_SIZE, WTYPE, ITYPE, RTYPE, OTYPE, CTYPE, CTAS_PER_ROW, WARPS_M, WARPS_N, BYTES_PER_LDG, BYTES_PER_LDG_FINAL
-
-REGISTER_BWD_LAUNCHER( 2048, fp32, fp32, fp32, fp32, fp32, 1, 1, 4, 16, 4);
-REGISTER_BWD_LAUNCHER( 2048, fp16, fp32, fp32, fp32, fp32, 1, 1, 4, 16, 4);
-REGISTER_BWD_LAUNCHER( 2048, fp32, fp16, fp32, fp16, fp32, 1, 1, 4, 16, 4);
-REGISTER_BWD_LAUNCHER( 2048, fp16, fp16, fp32, fp16, fp32, 1, 1, 4, 16, 4);
-REGISTER_BWD_LAUNCHER( 2048, fp32, fp16, fp16, fp16, fp32, 1, 1, 4, 16, 4);
-REGISTER_BWD_LAUNCHER( 2048, fp32, bf16, fp32, bf16, fp32, 1, 1, 4, 16, 4);
-REGISTER_BWD_LAUNCHER( 2048, bf16, bf16, fp32, bf16, fp32, 1, 1, 4, 16, 4);
-REGISTER_BWD_LAUNCHER( 2048, fp32, bf16, bf16, bf16, fp32, 1, 1, 4, 16, 4);
-REGISTER_BWD_LAUNCHER( 2048, fp16, fp16, fp16, fp16, fp32, 1, 1, 4, 16, 4);
-REGISTER_BWD_LAUNCHER( 2048, bf16, bf16, bf16, bf16, fp32, 1, 1, 4, 16, 4);
 
layer_norm/ln_bwd_256.cu DELETED
@@ -1,15 +0,0 @@
-#include "ln_bwd_kernels.cuh"
-
-// Create backward launch function and register. Macro signature:
-// HIDDEN_SIZE, WTYPE, ITYPE, RTYPE, OTYPE, CTYPE, CTAS_PER_ROW, WARPS_M, WARPS_N, BYTES_PER_LDG, BYTES_PER_LDG_FINAL
-
-REGISTER_BWD_LAUNCHER(  256, fp32, fp32, fp32, fp32, fp32, 1, 4, 1, 16, 4);
-REGISTER_BWD_LAUNCHER(  256, fp16, fp32, fp32, fp32, fp32, 1, 4, 1, 16, 4);
-REGISTER_BWD_LAUNCHER(  256, fp32, fp16, fp32, fp16, fp32, 1, 4, 1, 16, 4);
-REGISTER_BWD_LAUNCHER(  256, fp16, fp16, fp32, fp16, fp32, 1, 4, 1, 16, 4);
-REGISTER_BWD_LAUNCHER(  256, fp32, fp16, fp16, fp16, fp32, 1, 4, 1, 16, 4);
-REGISTER_BWD_LAUNCHER(  256, fp32, bf16, fp32, bf16, fp32, 1, 4, 1, 16, 4);
-REGISTER_BWD_LAUNCHER(  256, bf16, bf16, fp32, bf16, fp32, 1, 4, 1, 16, 4);
-REGISTER_BWD_LAUNCHER(  256, fp32, bf16, bf16, bf16, fp32, 1, 4, 1, 16, 4);
-REGISTER_BWD_LAUNCHER(  256, fp16, fp16, fp16, fp16, fp32, 1, 4, 1, 16, 4);
-REGISTER_BWD_LAUNCHER(  256, bf16, bf16, bf16, bf16, fp32, 1, 4, 1, 16, 4);
 
layer_norm/ln_bwd_2560.cu DELETED
@@ -1,15 +0,0 @@
-#include "ln_bwd_kernels.cuh"
-
-// Create backward launch function and register. Macro signature:
-// HIDDEN_SIZE, WTYPE, ITYPE, RTYPE, OTYPE, CTYPE, CTAS_PER_ROW, WARPS_M, WARPS_N, BYTES_PER_LDG, BYTES_PER_LDG_FINAL
-
-REGISTER_BWD_LAUNCHER( 2560, fp32, fp32, fp32, fp32, fp32, 1, 1, 4, 16, 4);
-REGISTER_BWD_LAUNCHER( 2560, fp16, fp32, fp32, fp32, fp32, 1, 1, 4, 16, 4);
-REGISTER_BWD_LAUNCHER( 2560, fp32, fp16, fp32, fp16, fp32, 1, 1, 4, 8, 4);
-REGISTER_BWD_LAUNCHER( 2560, fp16, fp16, fp32, fp16, fp32, 1, 1, 4, 8, 4);
-REGISTER_BWD_LAUNCHER( 2560, fp32, fp16, fp16, fp16, fp32, 1, 1, 4, 8, 4);
-REGISTER_BWD_LAUNCHER( 2560, fp32, bf16, fp32, bf16, fp32, 1, 1, 4, 8, 4);
-REGISTER_BWD_LAUNCHER( 2560, bf16, bf16, fp32, bf16, fp32, 1, 1, 4, 8, 4);
-REGISTER_BWD_LAUNCHER( 2560, fp32, bf16, bf16, bf16, fp32, 1, 1, 4, 8, 4);
-REGISTER_BWD_LAUNCHER( 2560, fp16, fp16, fp16, fp16, fp32, 1, 1, 4, 8, 4);
-REGISTER_BWD_LAUNCHER( 2560, bf16, bf16, bf16, bf16, fp32, 1, 1, 4, 8, 4);
 
layer_norm/ln_bwd_3072.cu DELETED
@@ -1,15 +0,0 @@
-#include "ln_bwd_kernels.cuh"
-
-// Create backward launch function and register. Macro signature:
-// HIDDEN_SIZE, WTYPE, ITYPE, RTYPE, OTYPE, CTYPE, CTAS_PER_ROW, WARPS_M, WARPS_N, BYTES_PER_LDG, BYTES_PER_LDG_FINAL
-
-REGISTER_BWD_LAUNCHER( 3072, fp32, fp32, fp32, fp32, fp32, 1, 1, 4, 16, 4);
-REGISTER_BWD_LAUNCHER( 3072, fp16, fp32, fp32, fp32, fp32, 1, 1, 4, 16, 4);
-REGISTER_BWD_LAUNCHER( 3072, fp32, fp16, fp32, fp16, fp32, 1, 1, 4, 16, 4);
-REGISTER_BWD_LAUNCHER( 3072, fp16, fp16, fp32, fp16, fp32, 1, 1, 4, 16, 4);
-REGISTER_BWD_LAUNCHER( 3072, fp32, fp16, fp16, fp16, fp32, 1, 1, 4, 16, 4);
-REGISTER_BWD_LAUNCHER( 3072, fp32, bf16, fp32, bf16, fp32, 1, 1, 4, 16, 4);
-REGISTER_BWD_LAUNCHER( 3072, bf16, bf16, fp32, bf16, fp32, 1, 1, 4, 16, 4);
-REGISTER_BWD_LAUNCHER( 3072, fp32, bf16, bf16, bf16, fp32, 1, 1, 4, 16, 4);
-REGISTER_BWD_LAUNCHER( 3072, fp16, fp16, fp16, fp16, fp32, 1, 1, 4, 16, 4);
-REGISTER_BWD_LAUNCHER( 3072, bf16, bf16, bf16, bf16, fp32, 1, 1, 4, 16, 4);
 
layer_norm/ln_bwd_4096.cu DELETED
@@ -1,15 +0,0 @@
-#include "ln_bwd_kernels.cuh"
-
-// Create backward launch function and register. Macro signature:
-// HIDDEN_SIZE, WTYPE, ITYPE, RTYPE, OTYPE, CTYPE, CTAS_PER_ROW, WARPS_M, WARPS_N, BYTES_PER_LDG, BYTES_PER_LDG_FINAL
-
-REGISTER_BWD_LAUNCHER( 4096, fp32, fp32, fp32, fp32, fp32, 1, 1, 4, 16, 4);
-REGISTER_BWD_LAUNCHER( 4096, fp16, fp32, fp32, fp32, fp32, 1, 1, 4, 16, 4);
-REGISTER_BWD_LAUNCHER( 4096, fp32, fp16, fp32, fp16, fp32, 1, 1, 4, 16, 4);
-REGISTER_BWD_LAUNCHER( 4096, fp16, fp16, fp32, fp16, fp32, 1, 1, 4, 16, 4);
-REGISTER_BWD_LAUNCHER( 4096, fp32, fp16, fp16, fp16, fp32, 1, 1, 4, 16, 4);
-REGISTER_BWD_LAUNCHER( 4096, fp32, bf16, fp32, bf16, fp32, 1, 1, 4, 16, 4);
-REGISTER_BWD_LAUNCHER( 4096, bf16, bf16, fp32, bf16, fp32, 1, 1, 4, 16, 4);
-REGISTER_BWD_LAUNCHER( 4096, fp32, bf16, bf16, bf16, fp32, 1, 1, 4, 16, 4);
-REGISTER_BWD_LAUNCHER( 4096, fp16, fp16, fp16, fp16, fp32, 1, 1, 4, 16, 4);
-REGISTER_BWD_LAUNCHER( 4096, bf16, bf16, bf16, bf16, fp32, 1, 1, 4, 16, 4);
 
layer_norm/ln_bwd_512.cu DELETED
@@ -1,15 +0,0 @@
-#include "ln_bwd_kernels.cuh"
-
-// Create backward launch function and register. Macro signature:
-// HIDDEN_SIZE, WTYPE, ITYPE, RTYPE, OTYPE, CTYPE, CTAS_PER_ROW, WARPS_M, WARPS_N, BYTES_PER_LDG, BYTES_PER_LDG_FINAL
-
-REGISTER_BWD_LAUNCHER(  512, fp32, fp32, fp32, fp32, fp32, 1, 4, 1, 16, 4);
-REGISTER_BWD_LAUNCHER(  512, fp16, fp32, fp32, fp32, fp32, 1, 4, 1, 16, 4);
-REGISTER_BWD_LAUNCHER(  512, fp32, fp16, fp32, fp16, fp32, 1, 4, 1, 16, 4);
-REGISTER_BWD_LAUNCHER(  512, fp16, fp16, fp32, fp16, fp32, 1, 4, 1, 16, 4);
-REGISTER_BWD_LAUNCHER(  512, fp32, fp16, fp16, fp16, fp32, 1, 4, 1, 16, 4);
-REGISTER_BWD_LAUNCHER(  512, fp32, bf16, fp32, bf16, fp32, 1, 4, 1, 16, 4);
-REGISTER_BWD_LAUNCHER(  512, bf16, bf16, fp32, bf16, fp32, 1, 4, 1, 16, 4);
-REGISTER_BWD_LAUNCHER(  512, fp32, bf16, bf16, bf16, fp32, 1, 4, 1, 16, 4);
-REGISTER_BWD_LAUNCHER(  512, fp16, fp16, fp16, fp16, fp32, 1, 4, 1, 16, 4);
-REGISTER_BWD_LAUNCHER(  512, bf16, bf16, bf16, bf16, fp32, 1, 4, 1, 16, 4);
 
layer_norm/ln_bwd_5120.cu DELETED
@@ -1,15 +0,0 @@
-#include "ln_bwd_kernels.cuh"
-
-// Create backward launch function and register. Macro signature:
-// HIDDEN_SIZE, WTYPE, ITYPE, RTYPE, OTYPE, CTYPE, CTAS_PER_ROW, WARPS_M, WARPS_N, BYTES_PER_LDG, BYTES_PER_LDG_FINAL
-
-REGISTER_BWD_LAUNCHER( 5120, fp32, fp32, fp32, fp32, fp32, 1, 1, 4, 16, 4);
-REGISTER_BWD_LAUNCHER( 5120, fp16, fp32, fp32, fp32, fp32, 1, 1, 4, 16, 4);
-REGISTER_BWD_LAUNCHER( 5120, fp32, fp16, fp32, fp16, fp32, 1, 1, 4, 16, 4);
-REGISTER_BWD_LAUNCHER( 5120, fp16, fp16, fp32, fp16, fp32, 1, 1, 4, 16, 4);
-REGISTER_BWD_LAUNCHER( 5120, fp32, fp16, fp16, fp16, fp32, 1, 1, 4, 16, 4);
-REGISTER_BWD_LAUNCHER( 5120, fp32, bf16, fp32, bf16, fp32, 1, 1, 4, 16, 4);
-REGISTER_BWD_LAUNCHER( 5120, bf16, bf16, fp32, bf16, fp32, 1, 1, 4, 16, 4);
-REGISTER_BWD_LAUNCHER( 5120, fp32, bf16, bf16, bf16, fp32, 1, 1, 4, 16, 4);
-REGISTER_BWD_LAUNCHER( 5120, fp16, fp16, fp16, fp16, fp32, 1, 1, 4, 16, 4);
-REGISTER_BWD_LAUNCHER( 5120, bf16, bf16, bf16, bf16, fp32, 1, 1, 4, 16, 4);
 
layer_norm/ln_bwd_6144.cu DELETED
@@ -1,15 +0,0 @@
-#include "ln_bwd_kernels.cuh"
-
-// Create backward launch function and register. Macro signature:
-// HIDDEN_SIZE, WTYPE, ITYPE, RTYPE, OTYPE, CTYPE, CTAS_PER_ROW, WARPS_M, WARPS_N, BYTES_PER_LDG, BYTES_PER_LDG_FINAL
-
-REGISTER_BWD_LAUNCHER( 6144, fp32, fp32, fp32, fp32, fp32, 1, 1, 8, 16, 4);
-REGISTER_BWD_LAUNCHER( 6144, fp16, fp32, fp32, fp32, fp32, 1, 1, 8, 16, 4);
-REGISTER_BWD_LAUNCHER( 6144, fp32, fp16, fp32, fp16, fp32, 1, 1, 8, 16, 4);
-REGISTER_BWD_LAUNCHER( 6144, fp16, fp16, fp32, fp16, fp32, 1, 1, 8, 16, 4);
-REGISTER_BWD_LAUNCHER( 6144, fp32, fp16, fp16, fp16, fp32, 1, 1, 8, 16, 4);
-REGISTER_BWD_LAUNCHER( 6144, fp32, bf16, fp32, bf16, fp32, 1, 1, 8, 16, 4);
-REGISTER_BWD_LAUNCHER( 6144, bf16, bf16, fp32, bf16, fp32, 1, 1, 8, 16, 4);
-REGISTER_BWD_LAUNCHER( 6144, fp32, bf16, bf16, bf16, fp32, 1, 1, 8, 16, 4);
-REGISTER_BWD_LAUNCHER( 6144, fp16, fp16, fp16, fp16, fp32, 1, 1, 8, 16, 4);
-REGISTER_BWD_LAUNCHER( 6144, bf16, bf16, bf16, bf16, fp32, 1, 1, 8, 16, 4);
 
layer_norm/ln_bwd_7168.cu DELETED
@@ -1,15 +0,0 @@
-#include "ln_bwd_kernels.cuh"
-
-// Create backward launch function and register. Macro signature:
-// HIDDEN_SIZE, WTYPE, ITYPE, RTYPE, OTYPE, CTYPE, CTAS_PER_ROW, WARPS_M, WARPS_N, BYTES_PER_LDG, BYTES_PER_LDG_FINAL
-
-REGISTER_BWD_LAUNCHER( 7168, fp32, fp32, fp32, fp32, fp32, 1, 1, 8, 16, 4);
-REGISTER_BWD_LAUNCHER( 7168, fp16, fp32, fp32, fp32, fp32, 1, 1, 8, 16, 4);
-REGISTER_BWD_LAUNCHER( 7168, fp32, fp16, fp32, fp16, fp32, 1, 1, 8, 8, 4);
-REGISTER_BWD_LAUNCHER( 7168, fp16, fp16, fp32, fp16, fp32, 1, 1, 8, 8, 4);
-REGISTER_BWD_LAUNCHER( 7168, fp32, fp16, fp16, fp16, fp32, 1, 1, 8, 8, 4);
-REGISTER_BWD_LAUNCHER( 7168, fp32, bf16, fp32, bf16, fp32, 1, 1, 8, 8, 4);
-REGISTER_BWD_LAUNCHER( 7168, bf16, bf16, fp32, bf16, fp32, 1, 1, 8, 8, 4);
-REGISTER_BWD_LAUNCHER( 7168, fp32, bf16, bf16, bf16, fp32, 1, 1, 8, 8, 4);
-REGISTER_BWD_LAUNCHER( 7168, fp16, fp16, fp16, fp16, fp32, 1, 1, 8, 8, 4);
-REGISTER_BWD_LAUNCHER( 7168, bf16, bf16, bf16, bf16, fp32, 1, 1, 8, 8, 4);
 
layer_norm/ln_bwd_768.cu DELETED
@@ -1,15 +0,0 @@
-#include "ln_bwd_kernels.cuh"
-
-// Create backward launch function and register. Macro signature:
-// HIDDEN_SIZE, WTYPE, ITYPE, RTYPE, OTYPE, CTYPE, CTAS_PER_ROW, WARPS_M, WARPS_N, BYTES_PER_LDG, BYTES_PER_LDG_FINAL
-
-REGISTER_BWD_LAUNCHER(  768, fp32, fp32, fp32, fp32, fp32, 1, 4, 1, 16, 4);
-REGISTER_BWD_LAUNCHER(  768, fp16, fp32, fp32, fp32, fp32, 1, 4, 1, 16, 4);
-REGISTER_BWD_LAUNCHER(  768, fp32, fp16, fp32, fp16, fp32, 1, 4, 1, 16, 4);
-REGISTER_BWD_LAUNCHER(  768, fp16, fp16, fp32, fp16, fp32, 1, 4, 1, 16, 4);
-REGISTER_BWD_LAUNCHER(  768, fp32, fp16, fp16, fp16, fp32, 1, 4, 1, 16, 4);
-REGISTER_BWD_LAUNCHER(  768, fp32, bf16, fp32, bf16, fp32, 1, 4, 1, 16, 4);
-REGISTER_BWD_LAUNCHER(  768, bf16, bf16, fp32, bf16, fp32, 1, 4, 1, 16, 4);
-REGISTER_BWD_LAUNCHER(  768, fp32, bf16, bf16, bf16, fp32, 1, 4, 1, 16, 4);
-REGISTER_BWD_LAUNCHER(  768, fp16, fp16, fp16, fp16, fp32, 1, 4, 1, 16, 4);
-REGISTER_BWD_LAUNCHER(  768, bf16, bf16, bf16, bf16, fp32, 1, 4, 1, 16, 4);
 
layer_norm/ln_bwd_8192.cu DELETED
@@ -1,15 +0,0 @@
-#include "ln_bwd_kernels.cuh"
-
-// Create backward launch function and register. Macro signature:
-// HIDDEN_SIZE, WTYPE, ITYPE, RTYPE, OTYPE, CTYPE, CTAS_PER_ROW, WARPS_M, WARPS_N, BYTES_PER_LDG, BYTES_PER_LDG_FINAL
-
-REGISTER_BWD_LAUNCHER( 8192, fp32, fp32, fp32, fp32, fp32, 1, 1, 8, 16, 4);
-REGISTER_BWD_LAUNCHER( 8192, fp16, fp32, fp32, fp32, fp32, 1, 1, 8, 16, 4);
-REGISTER_BWD_LAUNCHER( 8192, fp32, fp16, fp32, fp16, fp32, 1, 1, 8, 16, 4);
-REGISTER_BWD_LAUNCHER( 8192, fp16, fp16, fp32, fp16, fp32, 1, 1, 8, 16, 4);
-REGISTER_BWD_LAUNCHER( 8192, fp32, fp16, fp16, fp16, fp32, 1, 1, 8, 16, 4);
-REGISTER_BWD_LAUNCHER( 8192, fp32, bf16, fp32, bf16, fp32, 1, 1, 8, 16, 4);
-REGISTER_BWD_LAUNCHER( 8192, bf16, bf16, fp32, bf16, fp32, 1, 1, 8, 16, 4);
-REGISTER_BWD_LAUNCHER( 8192, fp32, bf16, bf16, bf16, fp32, 1, 1, 8, 16, 4);
-REGISTER_BWD_LAUNCHER( 8192, fp16, fp16, fp16, fp16, fp32, 1, 1, 8, 16, 4);
-REGISTER_BWD_LAUNCHER( 8192, bf16, bf16, bf16, bf16, fp32, 1, 1, 8, 16, 4);
 
layer_norm/ln_bwd_kernels.cuh DELETED
@@ -1,534 +0,0 @@
-#pragma once
-
-#include "ln.h"
-#include "ln_utils.cuh"
-#include "ln_kernel_traits.h"
-#include "static_switch.h"
-
-namespace layer_norm {
-
-template<typename Ktraits, bool Is_dropout, bool Has_colscale, bool Has_subset, bool Is_even_cols>
-__global__ __launch_bounds__(Ktraits::THREADS_PER_CTA)
-void ln_bwd_kernel(layer_norm::BwdParams params) {
-
-    enum { ROWS_PER_CTA = Ktraits::ROWS_PER_CTA };
-    enum { WARPS_M = Ktraits::WARPS_M };
-    enum { WARPS_N = Ktraits::WARPS_N };
-    enum { THREADS_PER_ROW = Ktraits::THREADS_PER_ROW };
-    enum { COLS = Ktraits::COLS };
-    enum { BYTES_PER_ROW = Ktraits::BYTES_PER_ROW };
-    enum { LDGS = Ktraits::LDGS };
-    enum { NUM_ELTS = Ktraits::ELTS_PER_LDG };
-    enum { THREADS_PER_WARP = Ktraits::THREADS_PER_WARP };
-    enum { CTAS_PER_ROW = Ktraits::CTAS_PER_ROW };
-
-    using input_t = typename Ktraits::input_t;
-    using compute_t = typename Ktraits::compute_t;
-    using index_t = typename Ktraits::index_t;
-    using mask_t = typename Ktraits::mask_t;
-    using Ivec = typename Ktraits::Ivec;
-    using Rvec = typename Ktraits::Rvec;
-    using Ovec = typename Ktraits::Ovec;
-    using Wvec = typename Ktraits::Wvec;
-    using Cvec = typename Ktraits::Cvec;
-    using Mvec = typename Ktraits::Mvec;
-    using Reducer = typename Ktraits::Reducer;
-    using reduce_t = typename Reducer::Type;
-
-    extern __shared__ char smem_[];
-
-    const bool has_residual = params.dresidual != nullptr;
-    const bool prenorm = params.dx != nullptr;
-
-    const index_t tidx = threadIdx.x;
-    const index_t bidn = blockIdx.x % CTAS_PER_ROW;
-    const index_t bidm = blockIdx.x / CTAS_PER_ROW;
-    const index_t lane = tidx % THREADS_PER_WARP;
-    const index_t warp = tidx / THREADS_PER_WARP;
-    const index_t warp_m = warp / Ktraits::WARPS_N;
-    const index_t warp_n = warp % Ktraits::WARPS_N;
-    const index_t tid_r = warp_n * THREADS_PER_WARP + lane;
-
-    const index_t r = bidm * Ktraits::ROWS_PER_CTA + warp_m;
-    const index_t c = bidn * THREADS_PER_ROW + warp_n * THREADS_PER_WARP + lane;
-
-    static_assert(COLS == THREADS_PER_ROW * LDGS * NUM_ELTS * CTAS_PER_ROW);
-
-    const input_t *rowscale = static_cast<input_t *>(params.rowscale);
-    const index_t *x0_subset = static_cast<index_t *>(params.x0_subset);
-    const index_t *z_subset = static_cast<index_t *>(params.z_subset);
-
-    Cvec dzy_sum[LDGS];
-    Cvec dz_sum[LDGS];
-    Cvec dcolscale_sum[LDGS];
-
-    memset(dzy_sum, 0, sizeof(dzy_sum));
-    memset(dz_sum, 0, sizeof(dz_sum));
-    if (Has_colscale) { memset(dcolscale_sum, 0, sizeof(dcolscale_sum)); }
-
-    compute_t * smem_wgrad = reinterpret_cast<compute_t*>(smem_);
-    char *smem_dgrad = smem_ + Ktraits::SMEM_BYTES_WGRAD;
-
-    Reducer reducer(params, bidm, bidn, warp_m, warp_n, lane, smem_dgrad);
-
-    Sum<reduce_t> sum;
-
-    const index_t num_valid_ldgs =
-        ((params.cols / Ktraits::ELTS_PER_LDG) - 1 - c + Ktraits::VEC_COLS_PER_LDG) / Ktraits::VEC_COLS_PER_LDG;
-
-    Wvec gamma[LDGS];
-    Wvec colscale[LDGS];
-    index_t idx = c;
-    #pragma unroll
-    for( int it = 0; it < LDGS; it++ ) {
-        if (Is_even_cols || (it < num_valid_ldgs)) {
-            gamma[it].load_from(params.gamma, idx);
-            if (Has_colscale) { colscale[it].load_from(params.colscale, idx); }
-            idx += Ktraits::VEC_COLS_PER_LDG;
-        }
-    }
-    // TODO if ROWS_PER_CTA does not divide rows, we might get divergence in the
-    // last blocks with syncthreads!
-    // grid stride over rows
-    #pragma unroll 1
-    for( int row = r; row < params.rows; row += params.ctas_per_col * ROWS_PER_CTA ) {
-        const compute_t mu_r = static_cast<const compute_t *>(params.mu)[row];
-        const compute_t rs_r = static_cast<const compute_t *>(params.rs)[row];
-        const compute_t rowscale_val = !Has_subset ? (params.rowscale == nullptr ? 1.0f : compute_t(rowscale[row])) : params.rowscale_const;
-        const int row_z = !Has_subset ? row + 1 : z_subset[row];
-        const int row_x0 = !Has_subset ? row + 1 : x0_subset[row];
-        const bool load_dz = !Has_subset || row_z > 0;
-        const bool save_dx0 = !Has_subset || row_x0 > 0;
-        Mvec dmask[LDGS];
-        Rvec dx[LDGS];
-        compute_t dy[LDGS * NUM_ELTS];
-        compute_t y[LDGS * NUM_ELTS];
-        compute_t mdy_local = 0.f;
-        compute_t mdyy_local = 0.f;
-        // If dz is not loaded, then dy should be 0 and we don't care about the value of y.
-        if (load_dz) {
-            index_t idx_x = row * params.cols / Ktraits::ELTS_PER_LDG + c;
-            index_t idx_z = !Has_subset ? idx_x : (load_dz ? (row_z - 1) * params.cols / Ktraits::ELTS_PER_LDG + c : 0);
-            index_t idx_x0 = !Has_subset ? idx_x : (save_dx0 ? (row_x0 - 1) * params.cols / Ktraits::ELTS_PER_LDG + c : 0);
-            #pragma unroll
-            for( int it = 0; it < LDGS; it++ ) {
-                if (Is_even_cols || (it < num_valid_ldgs)) {
-                    Rvec x;
-                    Ovec dz;
-                    dz.load_from(params.dz, !Has_subset ? idx_x : idx_z);
-                    if (prenorm) { dx[it].load_from(params.dx, idx_x); }
-                    x.load_from(params.x, idx_x);
-                    if (Is_dropout) { dmask[it].load_from(params.dmask, !Has_subset ? idx_x : idx_x0); }
-                    idx_x += Ktraits::VEC_COLS_PER_LDG;
-                    idx_z += Ktraits::VEC_COLS_PER_LDG;
-                    idx_x0 += Ktraits::VEC_COLS_PER_LDG;
-                    #pragma unroll
-                    for( int jt = 0; jt < NUM_ELTS; jt++ ) {
-                        compute_t x_tmp = x.data.elt[jt];
-                        compute_t y_tmp = rs_r * (x_tmp - (!params.is_rms_norm ? mu_r : 0.f));
-                        compute_t dy_tmp = compute_t(gamma[it].data.elt[jt]) * compute_t(dz.data.elt[jt]);
-                        compute_t dz_tmp = dz.data.elt[jt];
-
-                        mdy_local += dy_tmp;
-                        mdyy_local += dy_tmp * y_tmp;
-
-                        dy[it * NUM_ELTS + jt] = dy_tmp;
-                        y[it * NUM_ELTS + jt] = y_tmp;
-
-                        dzy_sum[it].data.elt[jt] += dz_tmp * y_tmp;
-                        dz_sum[it].data.elt[jt] += dz_tmp;
-                    }
-                }
-            }
-        } else {
-            index_t idx_x = row * params.cols / Ktraits::ELTS_PER_LDG + c;
-            index_t idx_x0 = !Has_subset ? idx_x : (save_dx0 ? (row_x0 - 1) * params.cols / Ktraits::ELTS_PER_LDG + c : 0);
-            #pragma unroll
-            for( int it = 0; it < LDGS; it++ ) {
-                if (Is_even_cols || (it < num_valid_ldgs)) {
-                    if (prenorm) { dx[it].load_from(params.dx, idx_x); }
-                    if (Is_dropout) { dmask[it].load_from(params.dmask, !Has_subset ? idx_x : idx_x0); }
-                    idx_x += Ktraits::VEC_COLS_PER_LDG;
-                    idx_x0 += Ktraits::VEC_COLS_PER_LDG;
-                }
-            }
-        }
-
-        reduce_t result = reducer.allreduce({mdy_local, mdyy_local}, sum);
-        mdy_local = layer_norm::Get<0>::of<reduce_t, compute_t>(result) * params.inverse_cols;
-        mdyy_local = layer_norm::Get<1>::of<reduce_t, compute_t>(result) * params.inverse_cols;
-
-        index_t idx_x = row * params.cols / Ktraits::ELTS_PER_LDG + c;
-        index_t idx_x0 = !Has_subset ? idx_x : (save_dx0 ? (row_x0 - 1) * params.cols / Ktraits::ELTS_PER_LDG + c : 0);
-        #pragma unroll
-        for( int it = 0; it < LDGS; it++ ) {
-            if (Is_even_cols || (it < num_valid_ldgs)) {
-                Ivec dx0;
-                Rvec dresidual;
-                Ivec x0;
-                if (Has_colscale && save_dx0) { x0.load_from(params.x0, !Has_subset ? idx_x : idx_x0); }
-                #pragma unroll
-                for( int jt = 0; jt < NUM_ELTS; jt++ ) {
-                    compute_t dx_tmp_res;
-                    if (load_dz) {
-                        compute_t dy_tmp = dy[it * NUM_ELTS + jt];
-                        compute_t y_tmp = y[it * NUM_ELTS + jt];
-                        compute_t dx_tmp = rs_r * (dy_tmp - (mdyy_local * y_tmp + (!params.is_rms_norm ? mdy_local : 0.f)));
-                        dx_tmp_res = prenorm ? dx_tmp + compute_t(dx[it].data.elt[jt]) : dx_tmp;
-                    } else {
-                        dx_tmp_res = prenorm ? compute_t(dx[it].data.elt[jt]) : 0.f;
-                    }
-                    if (has_residual) { dresidual.data.elt[jt] = dx_tmp_res; }
-                    if (save_dx0) {
-                        compute_t dx0_tmp_res = dx_tmp_res * rowscale_val;
-                        if (Is_dropout) {
-                            dx0_tmp_res *= params.dropout_scale;
-                            if (Has_colscale) {
-                                dcolscale_sum[it].data.elt[jt] += dmask[it].data.elt[jt] ? dx0_tmp_res * compute_t(x0.data.elt[jt]) : 0.f;
-                                dx0.data.elt[jt] = dmask[it].data.elt[jt] ? dx0_tmp_res * compute_t(colscale[it].data.elt[jt]) : 0.f;
-                            } else {
-                                dx0.data.elt[jt] = dmask[it].data.elt[jt] ? dx0_tmp_res : 0.f;
-                            }
-                        } else {
-                            if (Has_colscale) {
-                                dcolscale_sum[it].data.elt[jt] += dx0_tmp_res * compute_t(x0.data.elt[jt]);
-                                dx0.data.elt[jt] = dx0_tmp_res * compute_t(colscale[it].data.elt[jt]);
-                            } else {
-                                dx0.data.elt[jt] = dx0_tmp_res;
-                            }
-                        }
-                    }
-                }
-                if (has_residual) { dresidual.store_to(params.dresidual, idx_x); }
-                if (save_dx0) { dx0.store_to(params.dx0, !Has_subset ? idx_x : idx_x0); }
-                idx_x += Ktraits::VEC_COLS_PER_LDG;
-                idx_x0 += Ktraits::VEC_COLS_PER_LDG;
-            }
-        }
-
-    }  // end: grid stride loop
-
-    if( WARPS_M == 1 ) {
-        idx = r * params.cols / Ktraits::ELTS_PER_LDG + c;
-        #pragma unroll
-        for( int it = 0; it < LDGS; it++ ) {
-            if (Is_even_cols || (it < num_valid_ldgs)) {
-                dz_sum[it].store_to(params.dbeta_part, idx);
-                dzy_sum[it].store_to(params.dgamma_part, idx);
-                if (Has_colscale) { dcolscale_sum[it].store_to(params.dcolscale_part, idx); }
-                idx += Ktraits::VEC_COLS_PER_LDG;
-            }
-        }
-    } else {
-        static_assert(WARPS_M == 1 || Ktraits::CTAS_PER_ROW == 1, "Multiple rows per CTA not supported for Multi-CTA.");
-        // Finalize reduction of part dgamma and dbeta for this CTA
-        // by reducing over the rows held across the WARPS_M warps
-
-        // Assumption: blockSize divides hidden size.
-        enum { NUM_RES = COLS / Ktraits::THREADS_PER_CTA };
-        static_assert(NUM_RES * Ktraits::THREADS_PER_CTA == COLS, "");
-
-        idx = warp_m * Ktraits::VEC_COLS + tid_r;
-        #pragma unroll
-        for( int it = 0; it < LDGS; it++ ) {
-            dz_sum[it].store_to(smem_wgrad, idx);
-            idx += THREADS_PER_ROW;
-        }
-        __syncthreads();
-        compute_t cta_dz_sum[NUM_RES];
-        memset(cta_dz_sum, 0, sizeof(compute_t) * NUM_RES);
-        for( int it = 0; it < ROWS_PER_CTA; it++ ) {
-            for( int jt = 0; jt < NUM_RES; jt++ ) {
-                cta_dz_sum[jt] += smem_wgrad[it * COLS + tidx + jt * Ktraits::THREADS_PER_CTA];
-            }
-        }
-        __syncthreads();
-
-        idx = warp_m * Ktraits::VEC_COLS + tid_r;
-        #pragma unroll
-        for( int it = 0; it < LDGS; it++ ) {
-            dzy_sum[it].store_to(smem_wgrad, idx);
-            idx += THREADS_PER_ROW;
-        }
-        __syncthreads();
-        compute_t cta_dzy_sum[NUM_RES];
-        memset(cta_dzy_sum, 0, sizeof(compute_t) * NUM_RES);
-        for( int it = 0; it < ROWS_PER_CTA; it++ ) {
-            for( int jt = 0; jt < NUM_RES; jt++ ) {
-                cta_dzy_sum[jt] += smem_wgrad[it * COLS + tidx + jt * Ktraits::THREADS_PER_CTA];
-            }
-        }
-
-        compute_t cta_dcolscale_sum[NUM_RES];
-        if (Has_colscale) {
-            __syncthreads();
-            idx = warp_m * Ktraits::VEC_COLS + tid_r;
-            #pragma unroll
-            for( int it = 0; it < LDGS; it++ ) {
-                dcolscale_sum[it].store_to(smem_wgrad, idx);
-                idx += THREADS_PER_ROW;
-            }
-            __syncthreads();
-            memset(cta_dcolscale_sum, 0, sizeof(compute_t) * NUM_RES);
-            for( int it = 0; it < ROWS_PER_CTA; it++ ) {
-                for( int jt = 0; jt < NUM_RES; jt++ ) {
-                    cta_dcolscale_sum[jt] += smem_wgrad[it * COLS + tidx + jt * Ktraits::THREADS_PER_CTA];
-                }
-            }
-        }
-
-        const index_t num_valid_writes
-            = (params.cols - 1 - tidx + Ktraits::THREADS_PER_CTA) / Ktraits::THREADS_PER_CTA;
-        compute_t *dgamma_part = static_cast<compute_t *>(params.dgamma_part) + bidm * params.cols + tidx;
-        compute_t *dbeta_part = static_cast<compute_t *>(params.dbeta_part) + bidm * params.cols + tidx;
-        compute_t *dcolscale_part = Has_colscale ? static_cast<compute_t *>(params.dcolscale_part) + bidm * params.cols + tidx : nullptr;
-        for( int jt = 0; jt < NUM_RES; jt++ ) {
-            if (Is_even_cols || (jt < num_valid_writes)) {
-                *dgamma_part = cta_dzy_sum[jt];
-                dgamma_part += Ktraits::THREADS_PER_CTA;
-                *dbeta_part = cta_dz_sum[jt];
-                dbeta_part += Ktraits::THREADS_PER_CTA;
-                if (Has_colscale) {
-                    *dcolscale_part = cta_dcolscale_sum[jt];
-                    dcolscale_part += Ktraits::THREADS_PER_CTA;
-                }
-            }
-        }
-
-    }
-}
-
-template<typename Kernel_traits, bool Has_colscale, bool Is_even_cols>
-__global__ __launch_bounds__(Kernel_traits::THREADS_PER_CTA)
-void ln_bwd_finalize_kernel(BwdParams params)
-{
-
-    using compute_t = typename Kernel_traits::compute_t;
-    using weight_t = typename Kernel_traits::weight_t;
-    using index_t = typename Kernel_traits::index_t;
-    using Reducer = typename Kernel_traits::Reducer;
-    using reduce_t = typename Reducer::Type;
-
-    Sum<reduce_t> sum;
-    enum { NUM_ELT = Kernel_traits::ELTS_PER_LDG };
-    enum { THREADS_PER_WARP = Kernel_traits::THREADS_PER_WARP };
-
-    __shared__ char smem_[Kernel_traits::SMEM_BYTES_PER_CTA];
-
-    constexpr uint32_t bidm = 0;
-
-    const uint32_t bidn = blockIdx.x;
-    const uint32_t tidx = threadIdx.x;
-    const uint32_t warp = tidx / THREADS_PER_WARP;
-    const uint32_t lane = tidx % THREADS_PER_WARP;
-
-    Reducer reducer(params, bidm, bidn, 0, 0, lane, smem_);
-
-    const uint32_t c = bidn * THREADS_PER_WARP + lane;
-    const uint32_t c_out = bidn * THREADS_PER_WARP / 2 + lane;
-    constexpr uint32_t COL_STRIDE = Kernel_traits::CTAS * THREADS_PER_WARP;
-    for( uint32_t col = c, col_out = c_out; col < Kernel_traits::COLS; col += COL_STRIDE, col_out += COL_STRIDE / 2 ) {
-        // Each thread sums over NUM_ELT columns.
-        Vec<compute_t, NUM_ELT> dbeta_local, dgamma_local, dcolscale_local;
-        memset(&dgamma_local, 0, sizeof(dgamma_local));
-        memset(&dbeta_local, 0, sizeof(dbeta_local));
-        if (Has_colscale) { memset(&dcolscale_local, 0, sizeof(dcolscale_local)); }
-        if (Is_even_cols || col < params.cols) {
-            for( uint32_t row = warp; row < params.ctas_per_col; row += Kernel_traits::ROWS_PER_CTA ) {
-                index_t idx = row * params.cols + col;
-
-                Vec<compute_t, NUM_ELT> dbeta_part, dgamma_part, dcolscale_part;
-                dbeta_part.load_from(params.dbeta_part, idx);
-                dgamma_part.load_from(params.dgamma_part, idx);
-                if (Has_colscale) { dcolscale_part.load_from(params.dcolscale_part, idx); }
-                #pragma unroll
-                for( int it = 0; it < NUM_ELT; it++ ) {
-                    dgamma_local.data.elt[it] += dgamma_part.data.elt[it];
-                    dbeta_local.data.elt[it] += dbeta_part.data.elt[it];
-                    if (Has_colscale) { dcolscale_local.data.elt[it] += dcolscale_part.data.elt[it]; }
-                }
-            }
-        }
-        void * smem_gamma = smem_;
-        void * smem_beta = &smem_[Kernel_traits::SMEM_BYTES_TRANSPOSE];
-        void * smem_colscale = &smem_[2 * Kernel_traits::SMEM_BYTES_TRANSPOSE];
-
-        const int write_row = warp;
-        const int write_col = lane ^ write_row;
-        const int write_idx = write_row * THREADS_PER_WARP + write_col;
-
-        dgamma_local.store_to(smem_gamma, write_idx);
-        dbeta_local.store_to(smem_beta, write_idx);
-        if (Has_colscale) { dcolscale_local.store_to(smem_colscale, write_idx); }
-
-        __syncthreads();
-
-        // It would probably be safe to reuse the first row of smem_beta and smem_gamma
-        void * smem_gamma_out = &smem_[Kernel_traits::NUM_FACTORS * Kernel_traits::SMEM_BYTES_TRANSPOSE];
-        void * smem_beta_out = &smem_[Kernel_traits::NUM_FACTORS * Kernel_traits::SMEM_BYTES_TRANSPOSE + Kernel_traits::SMEM_BYTES_OUTPUT];
-        void * smem_colscale_out = &smem_[Kernel_traits::NUM_FACTORS * Kernel_traits::SMEM_BYTES_TRANSPOSE + 2 * Kernel_traits::SMEM_BYTES_OUTPUT];
-
-        // More than one iter iff ROWS_PER_CTA < 32.
-        for( int w = warp; w < THREADS_PER_WARP; w += Kernel_traits::ROWS_PER_CTA ) {
-            const int read_row = lane;
-            const int read_col = w ^ read_row;
-            const int read_idx = read_row * THREADS_PER_WARP + read_col;
-
-            memset(&dbeta_local, 0, sizeof(dbeta_local));
-            memset(&dgamma_local, 0, sizeof(dgamma_local));
-            if (Has_colscale) { memset(&dcolscale_local, 0, sizeof(dcolscale_local)); }
-
-            // Load beta and gamma transposed
-            if(read_row < Kernel_traits::ROWS_PER_CTA){
-                dbeta_local.load_from(smem_beta, read_idx)
;
-                dgamma_local.load_from(smem_gamma, read_idx);
-                if (Has_colscale) { dcolscale_local.load_from(smem_colscale, read_idx); }
-            }
-
-            // Call reducer on the loaded value(s) and convert.
-            #pragma unroll
-            for( int it = 0; it < NUM_ELT; it++ ) {
-                compute_t b_i = dbeta_local.data.elt[it];
-                compute_t g_i = dgamma_local.data.elt[it];
-                b_i = reducer.allreduce(b_i, sum);
-                g_i = reducer.allreduce(g_i, sum);
-
-                dgamma_local.data.elt[it] = g_i;
-                dbeta_local.data.elt[it] = b_i;
-                if (Has_colscale) {
-                    compute_t cs_i = dcolscale_local.data.elt[it];
-                    cs_i = reducer.allreduce(cs_i, sum);
-                    dcolscale_local.data.elt[it] = cs_i;
-                }
-            }
-
-            // Leader stores the result at the current column.
-            if(lane == 0){
-                dgamma_local.store_to(smem_gamma_out, w);
-                dbeta_local.store_to(smem_beta_out, w);
-                if (Has_colscale) { dcolscale_local.store_to(smem_colscale_out, w); }
-            }
-
-        }
-
-        // All writes done.
-        __syncthreads();
-
-        // Pack and store: 2-wide stores with half the threads.
-        if (Is_even_cols || col_out * 2 < params.cols) {
-            if( warp == Kernel_traits::ROWS_PER_CTA - 1 && lane < THREADS_PER_WARP / 2 ) {
-
-                using src_t = typename TypeToVec2<compute_t>::Type;
-                using dst_t = typename TypeToVec2<weight_t>::Type;
-                Vec<src_t, NUM_ELT> dbeta_vec2, dgamma_vec2, dcolscale_vec2;
-                Vec<dst_t, NUM_ELT> dbeta_out2, dgamma_out2, dcolscale_out2;
-
-                dgamma_vec2.load_from(smem_gamma_out, lane);
-                dbeta_vec2.load_from(smem_beta_out, lane);
-                if (Has_colscale) { dcolscale_vec2.load_from(smem_colscale_out, lane); }
-                #pragma unroll
-                for( int it = 0; it < NUM_ELT; it++ ) {
-                    dgamma_out2.data.elt[it] = Converter<src_t,dst_t>::convert(dgamma_vec2.data.elt[it]);
-                    dbeta_out2.data.elt[it] = Converter<src_t,dst_t>::convert(dbeta_vec2.data.elt[it]);
-                    if (Has_colscale) { dcolscale_out2.data.elt[it] = Converter<src_t,dst_t>::convert(dcolscale_vec2.data.elt[it]); }
-                }
-                dgamma_out2.store_to(params.dgamma, col_out);
-                dbeta_out2.store_to(params.dbeta, col_out);
-                if (Has_colscale) { dcolscale_out2.store_to(params.dcolscale, col_out); }
-            }
-        }
-    }
-}
-}  // namespace layer_norm
-
-using namespace layer_norm;
-
-template<
-    typename weight_t,
-    typename input_t,
-    typename residual_t,
-    typename output_t,
-    typename compute_t,
-    typename index_t,
-    int HIDDEN_SIZE,
-    int CTAS_PER_ROW,
-    int WARPS_M,
-    int WARPS_N,
-    int BYTES_PER_LDG_MAIN,
-    int BYTES_PER_LDG_FINAL
->
-void launch_(LaunchParams<BwdParams> &launch_params, const bool configure_params){
-
-    using Kernel_traits = Kernel_traits<weight_t,
-                                        input_t,
-                                        residual_t,
-                                        output_t,
-                                        compute_t,
-                                        index_t,
-                                        HIDDEN_SIZE,
-                                        CTAS_PER_ROW,
-                                        WARPS_M,
-                                        WARPS_N,
-                                        BYTES_PER_LDG_MAIN
-                                        >;
-    bool is_dropout = launch_params.params.dropout_keep_p < 1.f;
-    bool has_colscale = launch_params.params.colscale != nullptr;
-    bool has_subset = launch_params.params.x0_subset != nullptr;
-    bool is_even_cols = launch_params.params.cols == HIDDEN_SIZE;
-    BOOL_SWITCH(is_dropout, IsDropoutConst, [&] {
-        BOOL_SWITCH(has_colscale, HasColscaleConst, [&] {
-            BOOL_SWITCH(has_subset, HasSubsetConst, [&] {
-                BOOL_SWITCH(is_even_cols, IsEvenColsConst, [&] {
-                    auto kernel = &ln_bwd_kernel<Kernel_traits, IsDropoutConst, HasColscaleConst, HasSubsetConst, IsEvenColsConst>;
-                    if( configure_params ) {
-                        int ctas_per_sm;
-                        CHECK_CUDA(cudaOccupancyMaxActiveBlocksPerMultiprocessor(
-                            &ctas_per_sm, kernel, Kernel_traits::THREADS_PER_CTA, Kernel_traits::SMEM_BYTES));
-                        launch_params.params.ctas_per_col = launch_params.props->multiProcessorCount * ctas_per_sm / Kernel_traits::CTAS_PER_ROW;
-                        launch_params.barrier_size = 0;
-                        launch_params.workspace_bytes = 0;
-                        if(Kernel_traits::CTAS_PER_ROW > 1) {
-                            launch_params.barrier_size = 2 * launch_params.params.ctas_per_col;
-                            launch_params.workspace_bytes = launch_params.params.ctas_per_col
-                                * Kernel_traits::WARPS_M
-                                * Kernel_traits::CTAS_PER_ROW
-                                * sizeof(typename Kernel_traits::reduce_t)
-                                * 2;
-                        }
-                        return;
-                    }
-
-                    if( Kernel_traits::SMEM_BYTES >= 48 * 1024 ) {
-                        CHECK_CUDA(cudaFuncSetAttribute(kernel, cudaFuncAttributeMaxDynamicSharedMemorySize, Kernel_traits::SMEM_BYTES));
-                    }
-                    auto stream = launch_params.stream;
-                    auto ctas_per_col = launch_params.params.ctas_per_col;
-
-                    if( Kernel_traits::CTAS_PER_ROW == 1 ) {
-                        kernel<<<ctas_per_col, Kernel_traits::THREADS_PER_CTA, Kernel_traits::SMEM_BYTES, stream>>>(launch_params.params);
-                    } else {
-                        dim3 grid(Kernel_traits::CTAS_PER_ROW * ctas_per_col);
-                        dim3 block(Kernel_traits::THREADS_PER_CTA);
-                        void *params_ = (void *)&launch_params.params;
-                        cudaLaunchCooperativeKernel((void *)kernel, grid, block, (void **)&params_, Kernel_traits::SMEM_BYTES, stream);
-                    }
-
-                    using Kernel_traits_f = layer_norm::Kernel_traits_finalize<HIDDEN_SIZE,
-                                                                               weight_t,
-                                                                               input_t,
-                                                                               residual_t,
-                                                                               output_t,
-                                                                               compute_t,
-                                                                               index_t,
-                                                                               HasColscaleConst,
-                                                                               32 * 32,  // THREADS_PER_CTA
-                                                                               BYTES_PER_LDG_FINAL>;
-
-                    auto kernel_f = &layer_norm::ln_bwd_finalize_kernel<Kernel_traits_f, HasColscaleConst, IsEvenColsConst>;
-                    kernel_f<<<Kernel_traits_f::CTAS, Kernel_traits_f::THREADS_PER_CTA, 0, stream>>>(launch_params.params);
-                });
-            });
-        });
-    });
-}
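
Editor's note: to make the vectorized kernel above easier to check, here is a scalar reference for the per-row input gradient it computes (dropout, rowscale, colscale, and subset handling omitted). This is an editorial sketch, not part of the deleted source:

#include <cstddef>
#include <vector>

// Scalar reference for the per-row input gradient computed by ln_bwd_kernel:
//   y  = rsigma * (x - mu)                              (mu dropped for RMSNorm)
//   dy = gamma * dz
//   dx = rsigma * (dy - (mean(dy*y) * y + mean(dy)))    (mean(dy) dropped for RMSNorm)
std::vector<float> ln_bwd_row_reference(const std::vector<float>& x,
                                        const std::vector<float>& dz,
                                        const std::vector<float>& gamma,
                                        float mu, float rsigma, bool is_rms_norm) {
    const std::size_t cols = x.size();
    std::vector<float> y(cols), dy(cols), dx(cols);
    float mdy = 0.f, mdyy = 0.f;
    for (std::size_t j = 0; j < cols; ++j) {
        y[j] = rsigma * (x[j] - (is_rms_norm ? 0.f : mu));
        dy[j] = gamma[j] * dz[j];
        mdy += dy[j];
        mdyy += dy[j] * y[j];
    }
    mdy /= float(cols);    // the two quantities produced by the CTA-wide allreduce
    mdyy /= float(cols);
    for (std::size_t j = 0; j < cols; ++j) {
        dx[j] = rsigma * (dy[j] - (mdyy * y[j] + (is_rms_norm ? 0.f : mdy)));
    }
    return dx;
}

The per-column weight gradients accumulated alongside are dbeta_j = sum over rows of dz and dgamma_j = sum over rows of dz * y, which is exactly what dz_sum and dzy_sum stage into dbeta_part/dgamma_part for the finalize kernel to reduce.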
 
layer_norm/ln_fwd_1024.cu DELETED
@@ -1,15 +0,0 @@
-#include "ln_fwd_kernels.cuh"
-
-// Create forward launch function and register. Macro signature:
-// HIDDEN_SIZE, WTYPE, ITYPE, RTYPE, OTYPE, CTYPE, CTAS_PER_ROW, WARPS_M, WARPS_N, BYTES_PER_LDG
-
-REGISTER_FWD_LAUNCHER( 1024, fp32, fp32, fp32, fp32, fp32, 1, 4, 1, 16);
-REGISTER_FWD_LAUNCHER( 1024, fp16, fp32, fp32, fp32, fp32, 1, 4, 1, 16);
-REGISTER_FWD_LAUNCHER( 1024, fp32, fp16, fp32, fp16, fp32, 1, 4, 1, 16);
-REGISTER_FWD_LAUNCHER( 1024, fp16, fp16, fp32, fp16, fp32, 1, 4, 1, 16);
-REGISTER_FWD_LAUNCHER( 1024, fp32, fp16, fp16, fp16, fp32, 1, 4, 1, 16);
-REGISTER_FWD_LAUNCHER( 1024, fp32, bf16, fp32, bf16, fp32, 1, 4, 1, 16);
-REGISTER_FWD_LAUNCHER( 1024, bf16, bf16, fp32, bf16, fp32, 1, 4, 1, 16);
-REGISTER_FWD_LAUNCHER( 1024, fp32, bf16, bf16, bf16, fp32, 1, 4, 1, 16);
-REGISTER_FWD_LAUNCHER( 1024, fp16, fp16, fp16, fp16, fp32, 1, 4, 1, 16);
-REGISTER_FWD_LAUNCHER( 1024, bf16, bf16, bf16, bf16, fp32, 1, 4, 1, 16);
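
Editor's note: for completeness, the forward math these launchers instantiate, as a scalar reference (an editorial sketch under the usual LayerNorm/RMSNorm definitions, not part of the deleted source):

#include <cmath>
#include <cstddef>
#include <vector>

// Scalar reference for one row of the forward pass: compute mu and rsigma in
// fp32 (the CTYPE above), then z = gamma * rsigma * (x - mu) + beta.
// For RMSNorm, mu is dropped and the second moment is taken around zero.
std::vector<float> ln_fwd_row_reference(const std::vector<float>& x,
                                        const std::vector<float>& gamma,
                                        const std::vector<float>& beta,
                                        float epsilon, bool is_rms_norm) {
    const std::size_t cols = x.size();
    float mu = 0.f;
    if (!is_rms_norm) {
        for (float v : x) mu += v;
        mu /= float(cols);
    }
    float var = 0.f;
    for (float v : x) var += (v - mu) * (v - mu);
    var /= float(cols);
    const float rsigma = 1.f / std::sqrt(var + epsilon);  // the saved rsigma, reused in backward
    std::vector<float> z(cols);
    for (std::size_t j = 0; j < cols; ++j) {
        z[j] = gamma[j] * rsigma * (x[j] - mu) + beta[j];
    }
    return z;
}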