diff --git a/.gitattributes b/.gitattributes index a6344aac8c09253b3b630fb776ae94478aa0275b..6f3a44b5d1775df1e9798eaed004c9b5a3a5f7d6 100644 --- a/.gitattributes +++ b/.gitattributes @@ -33,3 +33,77 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text *.zip filter=lfs diff=lfs merge=lfs -text *.zst filter=lfs diff=lfs merge=lfs -text *tfevents* filter=lfs diff=lfs merge=lfs -text +build/torch210-cxx11-cu126-aarch64-linux/_layer_norm_cuda_143103b.abi3.so filter=lfs diff=lfs merge=lfs -text +build/torch210-cxx11-cu128-aarch64-linux/_layer_norm_cuda_143103b.abi3.so filter=lfs diff=lfs merge=lfs -text +build/torch210-cxx11-cu130-aarch64-linux/_layer_norm_cuda_143103b.abi3.so filter=lfs diff=lfs merge=lfs -text +build/torch211-cxx11-cu126-aarch64-linux/_layer_norm_cuda_143103b.abi3.so filter=lfs diff=lfs merge=lfs -text +build/torch211-cxx11-cu128-aarch64-linux/_layer_norm_cuda_143103b.abi3.so filter=lfs diff=lfs merge=lfs -text +build/torch211-cxx11-cu130-aarch64-linux/_layer_norm_cuda_143103b.abi3.so filter=lfs diff=lfs merge=lfs -text +build/torch29-cxx11-cu129-aarch64-linux/_layer_norm_cuda_143103b.abi3.so filter=lfs diff=lfs merge=lfs -text +build/torch210-cxx11-cu126-x86_64-linux/_layer_norm_cuda_143103b.abi3.so filter=lfs diff=lfs merge=lfs -text +build/torch210-cxx11-cu128-x86_64-linux/_layer_norm_cuda_143103b.abi3.so filter=lfs diff=lfs merge=lfs -text +build/torch210-cxx11-cu130-x86_64-linux/_layer_norm_cuda_143103b.abi3.so filter=lfs diff=lfs merge=lfs -text +build/torch211-cxx11-cu126-x86_64-linux/_layer_norm_cuda_143103b.abi3.so filter=lfs diff=lfs merge=lfs -text +build/torch211-cxx11-cu128-x86_64-linux/_layer_norm_cuda_143103b.abi3.so filter=lfs diff=lfs merge=lfs -text +build/torch211-cxx11-cu130-x86_64-linux/_layer_norm_cuda_143103b.abi3.so filter=lfs diff=lfs merge=lfs -text +build/torch29-cxx11-cu129-x86_64-linux/_layer_norm_cuda_143103b.abi3.so filter=lfs diff=lfs merge=lfs -text 
+build/torch210-cxx11-cu126-aarch64-linux/_layer_norm_cuda_d3db3a4.abi3.so filter=lfs diff=lfs merge=lfs -text +build/torch210-cxx11-cu128-aarch64-linux/_layer_norm_cuda_d3db3a4.abi3.so filter=lfs diff=lfs merge=lfs -text +build/torch210-cxx11-cu130-aarch64-linux/_layer_norm_cuda_d3db3a4.abi3.so filter=lfs diff=lfs merge=lfs -text +build/torch211-cxx11-cu126-aarch64-linux/_layer_norm_cuda_d3db3a4.abi3.so filter=lfs diff=lfs merge=lfs -text +build/torch211-cxx11-cu128-aarch64-linux/_layer_norm_cuda_d3db3a4.abi3.so filter=lfs diff=lfs merge=lfs -text +build/torch211-cxx11-cu130-aarch64-linux/_layer_norm_cuda_d3db3a4.abi3.so filter=lfs diff=lfs merge=lfs -text +build/torch29-cxx11-cu129-aarch64-linux/_layer_norm_cuda_d3db3a4.abi3.so filter=lfs diff=lfs merge=lfs -text +build/torch210-cxx11-cu126-x86_64-linux/_layer_norm_cuda_d3db3a4.abi3.so filter=lfs diff=lfs merge=lfs -text +build/torch210-cxx11-cu128-x86_64-linux/_layer_norm_cuda_d3db3a4.abi3.so filter=lfs diff=lfs merge=lfs -text +build/torch210-cxx11-cu130-x86_64-linux/_layer_norm_cuda_d3db3a4.abi3.so filter=lfs diff=lfs merge=lfs -text +build/torch211-cxx11-cu126-x86_64-linux/_layer_norm_cuda_d3db3a4.abi3.so filter=lfs diff=lfs merge=lfs -text +build/torch211-cxx11-cu128-x86_64-linux/_layer_norm_cuda_d3db3a4.abi3.so filter=lfs diff=lfs merge=lfs -text +build/torch211-cxx11-cu130-x86_64-linux/_layer_norm_cuda_d3db3a4.abi3.so filter=lfs diff=lfs merge=lfs -text +build/torch29-cxx11-cu129-x86_64-linux/_layer_norm_cuda_d3db3a4.abi3.so filter=lfs diff=lfs merge=lfs -text +build/torch210-cxx11-cu126-aarch64-linux/_layer_norm_cuda_abf7d7d.abi3.so filter=lfs diff=lfs merge=lfs -text +build/torch210-cxx11-cu128-aarch64-linux/_layer_norm_cuda_abf7d7d.abi3.so filter=lfs diff=lfs merge=lfs -text +build/torch210-cxx11-cu130-aarch64-linux/_layer_norm_cuda_abf7d7d.abi3.so filter=lfs diff=lfs merge=lfs -text +build/torch211-cxx11-cu126-aarch64-linux/_layer_norm_cuda_abf7d7d.abi3.so filter=lfs diff=lfs merge=lfs -text 
+build/torch211-cxx11-cu128-aarch64-linux/_layer_norm_cuda_abf7d7d.abi3.so filter=lfs diff=lfs merge=lfs -text +build/torch211-cxx11-cu130-aarch64-linux/_layer_norm_cuda_abf7d7d.abi3.so filter=lfs diff=lfs merge=lfs -text +build/torch29-cxx11-cu129-aarch64-linux/_layer_norm_cuda_abf7d7d.abi3.so filter=lfs diff=lfs merge=lfs -text +build/torch210-cxx11-cu126-x86_64-linux/_layer_norm_cuda_abf7d7d.abi3.so filter=lfs diff=lfs merge=lfs -text +build/torch210-cxx11-cu128-x86_64-linux/_layer_norm_cuda_abf7d7d.abi3.so filter=lfs diff=lfs merge=lfs -text +build/torch210-cxx11-cu130-x86_64-linux/_layer_norm_cuda_abf7d7d.abi3.so filter=lfs diff=lfs merge=lfs -text +build/torch211-cxx11-cu126-x86_64-linux/_layer_norm_cuda_abf7d7d.abi3.so filter=lfs diff=lfs merge=lfs -text +build/torch211-cxx11-cu128-x86_64-linux/_layer_norm_cuda_abf7d7d.abi3.so filter=lfs diff=lfs merge=lfs -text +build/torch211-cxx11-cu130-x86_64-linux/_layer_norm_cuda_abf7d7d.abi3.so filter=lfs diff=lfs merge=lfs -text +build/torch29-cxx11-cu129-x86_64-linux/_layer_norm_cuda_abf7d7d.abi3.so filter=lfs diff=lfs merge=lfs -text +build/torch210-cxx11-cu126-x86_64-linux/_layer_norm_cuda_86f75d9.abi3.so filter=lfs diff=lfs merge=lfs -text +build/torch210-cxx11-cu128-x86_64-linux/_layer_norm_cuda_86f75d9.abi3.so filter=lfs diff=lfs merge=lfs -text +build/torch210-cxx11-cu130-x86_64-linux/_layer_norm_cuda_86f75d9.abi3.so filter=lfs diff=lfs merge=lfs -text +build/torch211-cxx11-cu126-x86_64-linux/_layer_norm_cuda_86f75d9.abi3.so filter=lfs diff=lfs merge=lfs -text +build/torch211-cxx11-cu128-x86_64-linux/_layer_norm_cuda_86f75d9.abi3.so filter=lfs diff=lfs merge=lfs -text +build/torch211-cxx11-cu130-x86_64-linux/_layer_norm_cuda_86f75d9.abi3.so filter=lfs diff=lfs merge=lfs -text +build/torch210-cxx11-cu126-aarch64-linux/_layer_norm_cuda_86f75d9.abi3.so filter=lfs diff=lfs merge=lfs -text +build/torch210-cxx11-cu128-aarch64-linux/_layer_norm_cuda_86f75d9.abi3.so filter=lfs diff=lfs merge=lfs -text 
+build/torch210-cxx11-cu130-aarch64-linux/_layer_norm_cuda_86f75d9.abi3.so filter=lfs diff=lfs merge=lfs -text +build/torch211-cxx11-cu126-aarch64-linux/_layer_norm_cuda_86f75d9.abi3.so filter=lfs diff=lfs merge=lfs -text +build/torch211-cxx11-cu128-aarch64-linux/_layer_norm_cuda_86f75d9.abi3.so filter=lfs diff=lfs merge=lfs -text +build/torch211-cxx11-cu130-aarch64-linux/_layer_norm_cuda_86f75d9.abi3.so filter=lfs diff=lfs merge=lfs -text +build/torch210-cxx11-cu126-aarch64-linux/_layer_norm_cuda_73ccd0c.abi3.so filter=lfs diff=lfs merge=lfs -text +build/torch210-cxx11-cu128-aarch64-linux/_layer_norm_cuda_73ccd0c.abi3.so filter=lfs diff=lfs merge=lfs -text +build/torch210-cxx11-cu130-aarch64-linux/_layer_norm_cuda_73ccd0c.abi3.so filter=lfs diff=lfs merge=lfs -text +build/torch211-cxx11-cu126-aarch64-linux/_layer_norm_cuda_73ccd0c.abi3.so filter=lfs diff=lfs merge=lfs -text +build/torch211-cxx11-cu128-aarch64-linux/_layer_norm_cuda_73ccd0c.abi3.so filter=lfs diff=lfs merge=lfs -text +build/torch211-cxx11-cu130-aarch64-linux/_layer_norm_cuda_73ccd0c.abi3.so filter=lfs diff=lfs merge=lfs -text +build/torch212-cxx11-cu126-aarch64-linux/_layer_norm_cuda_73ccd0c.abi3.so filter=lfs diff=lfs merge=lfs -text +build/torch212-cxx11-cu130-aarch64-linux/_layer_norm_cuda_73ccd0c.abi3.so filter=lfs diff=lfs merge=lfs -text +build/torch212-cxx11-cu132-aarch64-linux/_layer_norm_cuda_73ccd0c.abi3.so filter=lfs diff=lfs merge=lfs -text +build/torch210-cxx11-cu126-x86_64-linux/_layer_norm_cuda_73ccd0c.abi3.so filter=lfs diff=lfs merge=lfs -text +build/torch210-cxx11-cu128-x86_64-linux/_layer_norm_cuda_73ccd0c.abi3.so filter=lfs diff=lfs merge=lfs -text +build/torch210-cxx11-cu130-x86_64-linux/_layer_norm_cuda_73ccd0c.abi3.so filter=lfs diff=lfs merge=lfs -text +build/torch211-cxx11-cu126-x86_64-linux/_layer_norm_cuda_73ccd0c.abi3.so filter=lfs diff=lfs merge=lfs -text +build/torch211-cxx11-cu128-x86_64-linux/_layer_norm_cuda_73ccd0c.abi3.so filter=lfs diff=lfs merge=lfs -text 
+build/torch211-cxx11-cu130-x86_64-linux/_layer_norm_cuda_73ccd0c.abi3.so filter=lfs diff=lfs merge=lfs -text +build/torch212-cxx11-cu126-x86_64-linux/_layer_norm_cuda_73ccd0c.abi3.so filter=lfs diff=lfs merge=lfs -text +build/torch212-cxx11-cu130-x86_64-linux/_layer_norm_cuda_73ccd0c.abi3.so filter=lfs diff=lfs merge=lfs -text +build/torch212-cxx11-cu132-x86_64-linux/_layer_norm_cuda_73ccd0c.abi3.so filter=lfs diff=lfs merge=lfs -text +build/torch29-cxx11-cu128-aarch64-linux/dropout_layer_norm.cpython-312-aarch64-linux-gnu.so filter=lfs diff=lfs merge=lfs -text +build/torch29-cxx11-cu128-x86_64-linux/dropout_layer_norm.cpython-312-x86_64-linux-gnu.so filter=lfs diff=lfs merge=lfs -text diff --git a/README.md b/README.md new file mode 100644 index 0000000000000000000000000000000000000000..65aa2fc93afcf7af551e686baef8d23f23cd2fa4 --- /dev/null +++ b/README.md @@ -0,0 +1,28 @@ +--- +library_name: kernels +license: bsd-3-clause +--- + +This is the repository card of kernels-community/layer-norm, which has been pushed to the Hub. It was built to be used with the [`kernels` library](https://github.com/huggingface/kernels). This card was automatically generated. + +## How to use + +```python +# make sure `kernels` is installed: `pip install -U kernels` +from kernels import get_kernel + +kernel_module = get_kernel("kernels-community/layer-norm") +dropout_add_ln_fwd = kernel_module.dropout_add_ln_fwd + +dropout_add_ln_fwd(...) +``` + +## Available functions +- `dropout_add_ln_fwd` +- `dropout_add_ln_bwd` +- `dropout_add_ln_parallel_residual_fwd` +- `dropout_add_ln_parallel_residual_bwd` + +## Benchmarks + +A benchmarking script is available for this kernel. Run it with `kernels benchmark kernels-community/layer-norm`. 
diff --git a/benchmarks/benchmark.py b/benchmarks/benchmark.py new file mode 100644 index 0000000000000000000000000000000000000000..d943bf949eafb80e00f77b26fdb620957cab218f --- /dev/null +++ b/benchmarks/benchmark.py @@ -0,0 +1,9 @@ +from kernels.benchmarks import LayerNormBenchmark, RMSNormBenchmark + + +class LayerNorm(LayerNormBenchmark): + pass + + +class RMSNorm(RMSNormBenchmark): + pass diff --git a/build/torch210-cxx11-cu126-aarch64-linux/__init__.py b/build/torch210-cxx11-cu126-aarch64-linux/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..946160f16b9dc91fefeea037fb7ac84fd6afd802 --- /dev/null +++ b/build/torch210-cxx11-cu126-aarch64-linux/__init__.py @@ -0,0 +1,26 @@ +import torch +import torch.nn as nn + +from ._ops import ops + +from . import layers + +def dropout_add_ln_fwd(input, gamma, beta, rowscale, colscale, x0_subset, z_subset, dropout_p, epsilon, rowscale_const, z_numrows, gen, residual_in_fp32, is_rms_norm): + return ops.dropout_add_ln_fwd(input, gamma, beta, rowscale, colscale, x0_subset, z_subset, dropout_p, epsilon, rowscale_const, z_numrows, gen, residual_in_fp32, is_rms_norm) + +def dropout_add_ln_bwd(dz, dx, x, mu, rsigma, gamma, rowscale, colscale, x0_subset, z_subset, dropout_p, rowscale_const, x0_numrows, has_residual, is_rms_norm): + return ops.dropout_add_ln_bwd(dz, dx, x, mu, rsigma, gamma, rowscale, colscale, x0_subset, z_subset, dropout_p, rowscale_const, x0_numrows, has_residual, is_rms_norm) + +def dropout_add_ln_parallel_residual_fwd(input, gamma0, beta0, gamma1, beta1, dropout_p, epsilon, gen, residual_in_fp32, is_rms_norm): + return ops.dropout_add_ln_parallel_residual_fwd(input, gamma0, beta0, gamma1, beta1, dropout_p, epsilon, gen, residual_in_fp32, is_rms_norm) + +def dropout_add_ln_parallel_residual_bwd(dz0, dz1, dx, x, mu, rsigma, gamma0, gamma1, dropout_p, has_x1, has_residual, is_rms_norm): + return ops.dropout_add_ln_parallel_residual_bwd(dz0, dz1, dx, x, mu, rsigma, gamma0, gamma1, 
dropout_p, has_x1, has_residual, is_rms_norm) + +__all__ = [ + "layers", + "dropout_add_ln_fwd", + "dropout_add_ln_bwd", + "dropout_add_ln_parallel_residual_fwd", + "dropout_add_ln_parallel_residual_bwd", +] \ No newline at end of file diff --git a/build/torch210-cxx11-cu126-aarch64-linux/_layer_norm_cuda_73ccd0c.abi3.so b/build/torch210-cxx11-cu126-aarch64-linux/_layer_norm_cuda_73ccd0c.abi3.so new file mode 100644 index 0000000000000000000000000000000000000000..22ed7c8dfe85ee9d915de7bf81b91a2c1f1e77b1 --- /dev/null +++ b/build/torch210-cxx11-cu126-aarch64-linux/_layer_norm_cuda_73ccd0c.abi3.so @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:dc6cbb986c0ac1160fd75d6db0033b133350ede8d51c815cd6c821d7e2c512a1 +size 711710472 diff --git a/build/torch210-cxx11-cu126-aarch64-linux/_ops.py b/build/torch210-cxx11-cu126-aarch64-linux/_ops.py new file mode 100644 index 0000000000000000000000000000000000000000..ac84e05d9b8d22d65b0113f991fac1560deb7503 --- /dev/null +++ b/build/torch210-cxx11-cu126-aarch64-linux/_ops.py @@ -0,0 +1,9 @@ +import torch +from . import _layer_norm_cuda_73ccd0c +ops = torch.ops._layer_norm_cuda_73ccd0c + +def add_op_namespace_prefix(op_name: str): + """ + Prefix op by namespace. + """ + return f"_layer_norm_cuda_73ccd0c::{op_name}" diff --git a/build/torch210-cxx11-cu126-aarch64-linux/layer_norm/__init__.py b/build/torch210-cxx11-cu126-aarch64-linux/layer_norm/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..a9b2672c1cd85b74c1b3ded0fc0b2100e1aeac23 --- /dev/null +++ b/build/torch210-cxx11-cu126-aarch64-linux/layer_norm/__init__.py @@ -0,0 +1,26 @@ +import ctypes +import importlib.util +import sys +from pathlib import Path +from types import ModuleType + + +def _import_from_path(file_path: Path) -> ModuleType: + # We cannot use the module name as-is, after adding it to `sys.modules`, + # it would also be used for other imports. 
So, we make a module name that + # depends on the path for it to be unique using the hex-encoded hash of + # the path. + path_hash = "{:x}".format(ctypes.c_size_t(hash(file_path.absolute())).value) + module_name = path_hash + spec = importlib.util.spec_from_file_location(module_name, file_path) + if spec is None: + raise ImportError(f"Cannot load spec for {module_name} from {file_path}") + module = importlib.util.module_from_spec(spec) + if module is None: + raise ImportError(f"Cannot load module {module_name} from spec") + sys.modules[module_name] = module + spec.loader.exec_module(module) # type: ignore + return module + + +globals().update(vars(_import_from_path(Path(__file__).parent.parent / "__init__.py"))) diff --git a/build/torch210-cxx11-cu126-aarch64-linux/layers.py b/build/torch210-cxx11-cu126-aarch64-linux/layers.py new file mode 100644 index 0000000000000000000000000000000000000000..7ed883f42ead452f8b60f498ec11302c53d3cf74 --- /dev/null +++ b/build/torch210-cxx11-cu126-aarch64-linux/layers.py @@ -0,0 +1,51 @@ +import torch +import torch.nn as nn + +from ._ops import ops + + +class LayerNorm(nn.Module): + weight: torch.Tensor + variance_epsilon: float + + def forward(self, hidden_states: torch.Tensor) -> torch.Tensor: + output = ops.dropout_add_ln_fwd( + hidden_states.view(-1, hidden_states.shape[-1]), + gamma = self.weight, + beta = None, + rowscale = None, + colscale = None, + x0_subset = None, + z_subset = None, + dropout_p = 0, + epsilon = self.variance_epsilon, + rowscale_const = 1.0, + z_numrows = hidden_states.shape[1], + gen = None, + residual_in_fp32 = False, + is_rms_norm = False, + ) + return output[0].view(hidden_states.shape) + +class LlamaRMSNorm(nn.Module): + weight: torch.Tensor + variance_epsilon: float + + def forward(self, hidden_states: torch.Tensor) -> torch.Tensor: + output = ops.dropout_add_ln_fwd( + hidden_states.view(-1, hidden_states.shape[-1]), + gamma = self.weight, + beta = None, + rowscale = None, + colscale = None, + 
x0_subset = None, + z_subset = None, + dropout_p = 0, + epsilon = self.variance_epsilon, + rowscale_const = 1.0, + z_numrows = hidden_states.shape[1], + gen = None, + residual_in_fp32 = False, + is_rms_norm = True, + ) + return output[0].view(hidden_states.shape) \ No newline at end of file diff --git a/build/torch210-cxx11-cu126-aarch64-linux/metadata.json b/build/torch210-cxx11-cu126-aarch64-linux/metadata.json new file mode 100644 index 0000000000000000000000000000000000000000..e16b07bc42edabbfb0c8f5e9a87013306d25ea4f --- /dev/null +++ b/build/torch210-cxx11-cu126-aarch64-linux/metadata.json @@ -0,0 +1,15 @@ +{ + "name": "layer-norm", + "id": "_layer_norm_cuda_73ccd0c", + "version": 1, + "license": "BSD-3-Clause", + "python-depends": [], + "backend": { + "type": "cuda", + "archs": [ + "8.0", + "8.9", + "9.0" + ] + } +} diff --git a/build/torch210-cxx11-cu126-x86_64-linux/__init__.py b/build/torch210-cxx11-cu126-x86_64-linux/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..946160f16b9dc91fefeea037fb7ac84fd6afd802 --- /dev/null +++ b/build/torch210-cxx11-cu126-x86_64-linux/__init__.py @@ -0,0 +1,26 @@ +import torch +import torch.nn as nn + +from ._ops import ops + +from . 
import layers + +def dropout_add_ln_fwd(input, gamma, beta, rowscale, colscale, x0_subset, z_subset, dropout_p, epsilon, rowscale_const, z_numrows, gen, residual_in_fp32, is_rms_norm): + return ops.dropout_add_ln_fwd(input, gamma, beta, rowscale, colscale, x0_subset, z_subset, dropout_p, epsilon, rowscale_const, z_numrows, gen, residual_in_fp32, is_rms_norm) + +def dropout_add_ln_bwd(dz, dx, x, mu, rsigma, gamma, rowscale, colscale, x0_subset, z_subset, dropout_p, rowscale_const, x0_numrows, has_residual, is_rms_norm): + return ops.dropout_add_ln_bwd(dz, dx, x, mu, rsigma, gamma, rowscale, colscale, x0_subset, z_subset, dropout_p, rowscale_const, x0_numrows, has_residual, is_rms_norm) + +def dropout_add_ln_parallel_residual_fwd(input, gamma0, beta0, gamma1, beta1, dropout_p, epsilon, gen, residual_in_fp32, is_rms_norm): + return ops.dropout_add_ln_parallel_residual_fwd(input, gamma0, beta0, gamma1, beta1, dropout_p, epsilon, gen, residual_in_fp32, is_rms_norm) + +def dropout_add_ln_parallel_residual_bwd(dz0, dz1, dx, x, mu, rsigma, gamma0, gamma1, dropout_p, has_x1, has_residual, is_rms_norm): + return ops.dropout_add_ln_parallel_residual_bwd(dz0, dz1, dx, x, mu, rsigma, gamma0, gamma1, dropout_p, has_x1, has_residual, is_rms_norm) + +__all__ = [ + "layers", + "dropout_add_ln_fwd", + "dropout_add_ln_bwd", + "dropout_add_ln_parallel_residual_fwd", + "dropout_add_ln_parallel_residual_bwd", +] \ No newline at end of file diff --git a/build/torch210-cxx11-cu126-x86_64-linux/_layer_norm_cuda_73ccd0c.abi3.so b/build/torch210-cxx11-cu126-x86_64-linux/_layer_norm_cuda_73ccd0c.abi3.so new file mode 100644 index 0000000000000000000000000000000000000000..c8a7a43d72bf15271672a0053c815f210c9522d4 --- /dev/null +++ b/build/torch210-cxx11-cu126-x86_64-linux/_layer_norm_cuda_73ccd0c.abi3.so @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:e1a95a30fff3a4b64414535756ed2d26fc50321c6caeb284a4f1e2e46cfe04dd +size 712093824 diff --git 
a/build/torch210-cxx11-cu126-x86_64-linux/_ops.py b/build/torch210-cxx11-cu126-x86_64-linux/_ops.py new file mode 100644 index 0000000000000000000000000000000000000000..ac84e05d9b8d22d65b0113f991fac1560deb7503 --- /dev/null +++ b/build/torch210-cxx11-cu126-x86_64-linux/_ops.py @@ -0,0 +1,9 @@ +import torch +from . import _layer_norm_cuda_73ccd0c +ops = torch.ops._layer_norm_cuda_73ccd0c + +def add_op_namespace_prefix(op_name: str): + """ + Prefix op by namespace. + """ + return f"_layer_norm_cuda_73ccd0c::{op_name}" diff --git a/build/torch210-cxx11-cu126-x86_64-linux/layer_norm/__init__.py b/build/torch210-cxx11-cu126-x86_64-linux/layer_norm/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..a9b2672c1cd85b74c1b3ded0fc0b2100e1aeac23 --- /dev/null +++ b/build/torch210-cxx11-cu126-x86_64-linux/layer_norm/__init__.py @@ -0,0 +1,26 @@ +import ctypes +import importlib.util +import sys +from pathlib import Path +from types import ModuleType + + +def _import_from_path(file_path: Path) -> ModuleType: + # We cannot use the module name as-is, after adding it to `sys.modules`, + # it would also be used for other imports. So, we make a module name that + # depends on the path for it to be unique using the hex-encoded hash of + # the path. 
+ path_hash = "{:x}".format(ctypes.c_size_t(hash(file_path.absolute())).value) + module_name = path_hash + spec = importlib.util.spec_from_file_location(module_name, file_path) + if spec is None: + raise ImportError(f"Cannot load spec for {module_name} from {file_path}") + module = importlib.util.module_from_spec(spec) + if module is None: + raise ImportError(f"Cannot load module {module_name} from spec") + sys.modules[module_name] = module + spec.loader.exec_module(module) # type: ignore + return module + + +globals().update(vars(_import_from_path(Path(__file__).parent.parent / "__init__.py"))) diff --git a/build/torch210-cxx11-cu126-x86_64-linux/layers.py b/build/torch210-cxx11-cu126-x86_64-linux/layers.py new file mode 100644 index 0000000000000000000000000000000000000000..7ed883f42ead452f8b60f498ec11302c53d3cf74 --- /dev/null +++ b/build/torch210-cxx11-cu126-x86_64-linux/layers.py @@ -0,0 +1,51 @@ +import torch +import torch.nn as nn + +from ._ops import ops + + +class LayerNorm(nn.Module): + weight: torch.Tensor + variance_epsilon: float + + def forward(self, hidden_states: torch.Tensor) -> torch.Tensor: + output = ops.dropout_add_ln_fwd( + hidden_states.view(-1, hidden_states.shape[-1]), + gamma = self.weight, + beta = None, + rowscale = None, + colscale = None, + x0_subset = None, + z_subset = None, + dropout_p = 0, + epsilon = self.variance_epsilon, + rowscale_const = 1.0, + z_numrows = hidden_states.shape[1], + gen = None, + residual_in_fp32 = False, + is_rms_norm = False, + ) + return output[0].view(hidden_states.shape) + +class LlamaRMSNorm(nn.Module): + weight: torch.Tensor + variance_epsilon: float + + def forward(self, hidden_states: torch.Tensor) -> torch.Tensor: + output = ops.dropout_add_ln_fwd( + hidden_states.view(-1, hidden_states.shape[-1]), + gamma = self.weight, + beta = None, + rowscale = None, + colscale = None, + x0_subset = None, + z_subset = None, + dropout_p = 0, + epsilon = self.variance_epsilon, + rowscale_const = 1.0, + z_numrows = 
hidden_states.shape[1], + gen = None, + residual_in_fp32 = False, + is_rms_norm = True, + ) + return output[0].view(hidden_states.shape) \ No newline at end of file diff --git a/build/torch210-cxx11-cu126-x86_64-linux/metadata.json b/build/torch210-cxx11-cu126-x86_64-linux/metadata.json new file mode 100644 index 0000000000000000000000000000000000000000..e16b07bc42edabbfb0c8f5e9a87013306d25ea4f --- /dev/null +++ b/build/torch210-cxx11-cu126-x86_64-linux/metadata.json @@ -0,0 +1,15 @@ +{ + "name": "layer-norm", + "id": "_layer_norm_cuda_73ccd0c", + "version": 1, + "license": "BSD-3-Clause", + "python-depends": [], + "backend": { + "type": "cuda", + "archs": [ + "8.0", + "8.9", + "9.0" + ] + } +} diff --git a/build/torch210-cxx11-cu128-aarch64-linux/__init__.py b/build/torch210-cxx11-cu128-aarch64-linux/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..946160f16b9dc91fefeea037fb7ac84fd6afd802 --- /dev/null +++ b/build/torch210-cxx11-cu128-aarch64-linux/__init__.py @@ -0,0 +1,26 @@ +import torch +import torch.nn as nn + +from ._ops import ops + +from . 
import layers + +def dropout_add_ln_fwd(input, gamma, beta, rowscale, colscale, x0_subset, z_subset, dropout_p, epsilon, rowscale_const, z_numrows, gen, residual_in_fp32, is_rms_norm): + return ops.dropout_add_ln_fwd(input, gamma, beta, rowscale, colscale, x0_subset, z_subset, dropout_p, epsilon, rowscale_const, z_numrows, gen, residual_in_fp32, is_rms_norm) + +def dropout_add_ln_bwd(dz, dx, x, mu, rsigma, gamma, rowscale, colscale, x0_subset, z_subset, dropout_p, rowscale_const, x0_numrows, has_residual, is_rms_norm): + return ops.dropout_add_ln_bwd(dz, dx, x, mu, rsigma, gamma, rowscale, colscale, x0_subset, z_subset, dropout_p, rowscale_const, x0_numrows, has_residual, is_rms_norm) + +def dropout_add_ln_parallel_residual_fwd(input, gamma0, beta0, gamma1, beta1, dropout_p, epsilon, gen, residual_in_fp32, is_rms_norm): + return ops.dropout_add_ln_parallel_residual_fwd(input, gamma0, beta0, gamma1, beta1, dropout_p, epsilon, gen, residual_in_fp32, is_rms_norm) + +def dropout_add_ln_parallel_residual_bwd(dz0, dz1, dx, x, mu, rsigma, gamma0, gamma1, dropout_p, has_x1, has_residual, is_rms_norm): + return ops.dropout_add_ln_parallel_residual_bwd(dz0, dz1, dx, x, mu, rsigma, gamma0, gamma1, dropout_p, has_x1, has_residual, is_rms_norm) + +__all__ = [ + "layers", + "dropout_add_ln_fwd", + "dropout_add_ln_bwd", + "dropout_add_ln_parallel_residual_fwd", + "dropout_add_ln_parallel_residual_bwd", +] \ No newline at end of file diff --git a/build/torch210-cxx11-cu128-aarch64-linux/_layer_norm_cuda_73ccd0c.abi3.so b/build/torch210-cxx11-cu128-aarch64-linux/_layer_norm_cuda_73ccd0c.abi3.so new file mode 100644 index 0000000000000000000000000000000000000000..37480e3d91f02c1fee53939897394a3120e23283 --- /dev/null +++ b/build/torch210-cxx11-cu128-aarch64-linux/_layer_norm_cuda_73ccd0c.abi3.so @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:b32a3daff6960337d42eb4b5484b2fc628e773616e3361bcce21d656d477096d +size 1231083200 diff --git 
a/build/torch210-cxx11-cu128-aarch64-linux/_ops.py b/build/torch210-cxx11-cu128-aarch64-linux/_ops.py new file mode 100644 index 0000000000000000000000000000000000000000..ac84e05d9b8d22d65b0113f991fac1560deb7503 --- /dev/null +++ b/build/torch210-cxx11-cu128-aarch64-linux/_ops.py @@ -0,0 +1,9 @@ +import torch +from . import _layer_norm_cuda_73ccd0c +ops = torch.ops._layer_norm_cuda_73ccd0c + +def add_op_namespace_prefix(op_name: str): + """ + Prefix op by namespace. + """ + return f"_layer_norm_cuda_73ccd0c::{op_name}" diff --git a/build/torch210-cxx11-cu128-aarch64-linux/layer_norm/__init__.py b/build/torch210-cxx11-cu128-aarch64-linux/layer_norm/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..a9b2672c1cd85b74c1b3ded0fc0b2100e1aeac23 --- /dev/null +++ b/build/torch210-cxx11-cu128-aarch64-linux/layer_norm/__init__.py @@ -0,0 +1,26 @@ +import ctypes +import importlib.util +import sys +from pathlib import Path +from types import ModuleType + + +def _import_from_path(file_path: Path) -> ModuleType: + # We cannot use the module name as-is, after adding it to `sys.modules`, + # it would also be used for other imports. So, we make a module name that + # depends on the path for it to be unique using the hex-encoded hash of + # the path. 
+ path_hash = "{:x}".format(ctypes.c_size_t(hash(file_path.absolute())).value) + module_name = path_hash + spec = importlib.util.spec_from_file_location(module_name, file_path) + if spec is None: + raise ImportError(f"Cannot load spec for {module_name} from {file_path}") + module = importlib.util.module_from_spec(spec) + if module is None: + raise ImportError(f"Cannot load module {module_name} from spec") + sys.modules[module_name] = module + spec.loader.exec_module(module) # type: ignore + return module + + +globals().update(vars(_import_from_path(Path(__file__).parent.parent / "__init__.py"))) diff --git a/build/torch210-cxx11-cu128-aarch64-linux/layers.py b/build/torch210-cxx11-cu128-aarch64-linux/layers.py new file mode 100644 index 0000000000000000000000000000000000000000..7ed883f42ead452f8b60f498ec11302c53d3cf74 --- /dev/null +++ b/build/torch210-cxx11-cu128-aarch64-linux/layers.py @@ -0,0 +1,51 @@ +import torch +import torch.nn as nn + +from ._ops import ops + + +class LayerNorm(nn.Module): + weight: torch.Tensor + variance_epsilon: float + + def forward(self, hidden_states: torch.Tensor) -> torch.Tensor: + output = ops.dropout_add_ln_fwd( + hidden_states.view(-1, hidden_states.shape[-1]), + gamma = self.weight, + beta = None, + rowscale = None, + colscale = None, + x0_subset = None, + z_subset = None, + dropout_p = 0, + epsilon = self.variance_epsilon, + rowscale_const = 1.0, + z_numrows = hidden_states.shape[1], + gen = None, + residual_in_fp32 = False, + is_rms_norm = False, + ) + return output[0].view(hidden_states.shape) + +class LlamaRMSNorm(nn.Module): + weight: torch.Tensor + variance_epsilon: float + + def forward(self, hidden_states: torch.Tensor) -> torch.Tensor: + output = ops.dropout_add_ln_fwd( + hidden_states.view(-1, hidden_states.shape[-1]), + gamma = self.weight, + beta = None, + rowscale = None, + colscale = None, + x0_subset = None, + z_subset = None, + dropout_p = 0, + epsilon = self.variance_epsilon, + rowscale_const = 1.0, + z_numrows 
= hidden_states.shape[1], + gen = None, + residual_in_fp32 = False, + is_rms_norm = True, + ) + return output[0].view(hidden_states.shape) \ No newline at end of file diff --git a/build/torch210-cxx11-cu128-aarch64-linux/metadata.json b/build/torch210-cxx11-cu128-aarch64-linux/metadata.json new file mode 100644 index 0000000000000000000000000000000000000000..75fccb89550243db092fbc5d218aa4b497699bed --- /dev/null +++ b/build/torch210-cxx11-cu128-aarch64-linux/metadata.json @@ -0,0 +1,17 @@ +{ + "name": "layer-norm", + "id": "_layer_norm_cuda_73ccd0c", + "version": 1, + "license": "BSD-3-Clause", + "python-depends": [], + "backend": { + "type": "cuda", + "archs": [ + "10.0", + "12.0", + "8.0", + "8.9", + "9.0" + ] + } +} diff --git a/build/torch210-cxx11-cu128-x86_64-linux/__init__.py b/build/torch210-cxx11-cu128-x86_64-linux/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..946160f16b9dc91fefeea037fb7ac84fd6afd802 --- /dev/null +++ b/build/torch210-cxx11-cu128-x86_64-linux/__init__.py @@ -0,0 +1,26 @@ +import torch +import torch.nn as nn + +from ._ops import ops + +from . 
import layers + +def dropout_add_ln_fwd(input, gamma, beta, rowscale, colscale, x0_subset, z_subset, dropout_p, epsilon, rowscale_const, z_numrows, gen, residual_in_fp32, is_rms_norm): + return ops.dropout_add_ln_fwd(input, gamma, beta, rowscale, colscale, x0_subset, z_subset, dropout_p, epsilon, rowscale_const, z_numrows, gen, residual_in_fp32, is_rms_norm) + +def dropout_add_ln_bwd(dz, dx, x, mu, rsigma, gamma, rowscale, colscale, x0_subset, z_subset, dropout_p, rowscale_const, x0_numrows, has_residual, is_rms_norm): + return ops.dropout_add_ln_bwd(dz, dx, x, mu, rsigma, gamma, rowscale, colscale, x0_subset, z_subset, dropout_p, rowscale_const, x0_numrows, has_residual, is_rms_norm) + +def dropout_add_ln_parallel_residual_fwd(input, gamma0, beta0, gamma1, beta1, dropout_p, epsilon, gen, residual_in_fp32, is_rms_norm): + return ops.dropout_add_ln_parallel_residual_fwd(input, gamma0, beta0, gamma1, beta1, dropout_p, epsilon, gen, residual_in_fp32, is_rms_norm) + +def dropout_add_ln_parallel_residual_bwd(dz0, dz1, dx, x, mu, rsigma, gamma0, gamma1, dropout_p, has_x1, has_residual, is_rms_norm): + return ops.dropout_add_ln_parallel_residual_bwd(dz0, dz1, dx, x, mu, rsigma, gamma0, gamma1, dropout_p, has_x1, has_residual, is_rms_norm) + +__all__ = [ + "layers", + "dropout_add_ln_fwd", + "dropout_add_ln_bwd", + "dropout_add_ln_parallel_residual_fwd", + "dropout_add_ln_parallel_residual_bwd", +] \ No newline at end of file diff --git a/build/torch210-cxx11-cu128-x86_64-linux/_layer_norm_cuda_73ccd0c.abi3.so b/build/torch210-cxx11-cu128-x86_64-linux/_layer_norm_cuda_73ccd0c.abi3.so new file mode 100644 index 0000000000000000000000000000000000000000..522ce34f387a0c26ed90b76f88a77cb18a6a786f --- /dev/null +++ b/build/torch210-cxx11-cu128-x86_64-linux/_layer_norm_cuda_73ccd0c.abi3.so @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:f4660dc3f5e3cc4d0531fdebd5d0df2d082b0fad6599a8e63eabcc42b2cedada +size 1231419520 diff --git 
a/build/torch210-cxx11-cu128-x86_64-linux/_ops.py b/build/torch210-cxx11-cu128-x86_64-linux/_ops.py new file mode 100644 index 0000000000000000000000000000000000000000..ac84e05d9b8d22d65b0113f991fac1560deb7503 --- /dev/null +++ b/build/torch210-cxx11-cu128-x86_64-linux/_ops.py @@ -0,0 +1,9 @@ +import torch +from . import _layer_norm_cuda_73ccd0c +ops = torch.ops._layer_norm_cuda_73ccd0c + +def add_op_namespace_prefix(op_name: str): + """ + Prefix op by namespace. + """ + return f"_layer_norm_cuda_73ccd0c::{op_name}" diff --git a/build/torch210-cxx11-cu128-x86_64-linux/layer_norm/__init__.py b/build/torch210-cxx11-cu128-x86_64-linux/layer_norm/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..a9b2672c1cd85b74c1b3ded0fc0b2100e1aeac23 --- /dev/null +++ b/build/torch210-cxx11-cu128-x86_64-linux/layer_norm/__init__.py @@ -0,0 +1,26 @@ +import ctypes +import importlib.util +import sys +from pathlib import Path +from types import ModuleType + + +def _import_from_path(file_path: Path) -> ModuleType: + # We cannot use the module name as-is, after adding it to `sys.modules`, + # it would also be used for other imports. So, we make a module name that + # depends on the path for it to be unique using the hex-encoded hash of + # the path. 
+ path_hash = "{:x}".format(ctypes.c_size_t(hash(file_path.absolute())).value) + module_name = path_hash + spec = importlib.util.spec_from_file_location(module_name, file_path) + if spec is None: + raise ImportError(f"Cannot load spec for {module_name} from {file_path}") + module = importlib.util.module_from_spec(spec) + if module is None: + raise ImportError(f"Cannot load module {module_name} from spec") + sys.modules[module_name] = module + spec.loader.exec_module(module) # type: ignore + return module + + +globals().update(vars(_import_from_path(Path(__file__).parent.parent / "__init__.py"))) diff --git a/build/torch210-cxx11-cu128-x86_64-linux/layers.py b/build/torch210-cxx11-cu128-x86_64-linux/layers.py new file mode 100644 index 0000000000000000000000000000000000000000..7ed883f42ead452f8b60f498ec11302c53d3cf74 --- /dev/null +++ b/build/torch210-cxx11-cu128-x86_64-linux/layers.py @@ -0,0 +1,51 @@ +import torch +import torch.nn as nn + +from ._ops import ops + + +class LayerNorm(nn.Module): + weight: torch.Tensor + variance_epsilon: float + + def forward(self, hidden_states: torch.Tensor) -> torch.Tensor: + output = ops.dropout_add_ln_fwd( + hidden_states.view(-1, hidden_states.shape[-1]), + gamma = self.weight, + beta = None, + rowscale = None, + colscale = None, + x0_subset = None, + z_subset = None, + dropout_p = 0, + epsilon = self.variance_epsilon, + rowscale_const = 1.0, + z_numrows = hidden_states.shape[1], + gen = None, + residual_in_fp32 = False, + is_rms_norm = False, + ) + return output[0].view(hidden_states.shape) + +class LlamaRMSNorm(nn.Module): + weight: torch.Tensor + variance_epsilon: float + + def forward(self, hidden_states: torch.Tensor) -> torch.Tensor: + output = ops.dropout_add_ln_fwd( + hidden_states.view(-1, hidden_states.shape[-1]), + gamma = self.weight, + beta = None, + rowscale = None, + colscale = None, + x0_subset = None, + z_subset = None, + dropout_p = 0, + epsilon = self.variance_epsilon, + rowscale_const = 1.0, + z_numrows = 
hidden_states.shape[1], + gen = None, + residual_in_fp32 = False, + is_rms_norm = True, + ) + return output[0].view(hidden_states.shape) \ No newline at end of file diff --git a/build/torch210-cxx11-cu128-x86_64-linux/metadata.json b/build/torch210-cxx11-cu128-x86_64-linux/metadata.json new file mode 100644 index 0000000000000000000000000000000000000000..75fccb89550243db092fbc5d218aa4b497699bed --- /dev/null +++ b/build/torch210-cxx11-cu128-x86_64-linux/metadata.json @@ -0,0 +1,17 @@ +{ + "name": "layer-norm", + "id": "_layer_norm_cuda_73ccd0c", + "version": 1, + "license": "BSD-3-Clause", + "python-depends": [], + "backend": { + "type": "cuda", + "archs": [ + "10.0", + "12.0", + "8.0", + "8.9", + "9.0" + ] + } +} diff --git a/build/torch210-cxx11-cu130-aarch64-linux/__init__.py b/build/torch210-cxx11-cu130-aarch64-linux/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..946160f16b9dc91fefeea037fb7ac84fd6afd802 --- /dev/null +++ b/build/torch210-cxx11-cu130-aarch64-linux/__init__.py @@ -0,0 +1,26 @@ +import torch +import torch.nn as nn + +from ._ops import ops + +from . 
import layers + +def dropout_add_ln_fwd(input, gamma, beta, rowscale, colscale, x0_subset, z_subset, dropout_p, epsilon, rowscale_const, z_numrows, gen, residual_in_fp32, is_rms_norm): + return ops.dropout_add_ln_fwd(input, gamma, beta, rowscale, colscale, x0_subset, z_subset, dropout_p, epsilon, rowscale_const, z_numrows, gen, residual_in_fp32, is_rms_norm) + +def dropout_add_ln_bwd(dz, dx, x, mu, rsigma, gamma, rowscale, colscale, x0_subset, z_subset, dropout_p, rowscale_const, x0_numrows, has_residual, is_rms_norm): + return ops.dropout_add_ln_bwd(dz, dx, x, mu, rsigma, gamma, rowscale, colscale, x0_subset, z_subset, dropout_p, rowscale_const, x0_numrows, has_residual, is_rms_norm) + +def dropout_add_ln_parallel_residual_fwd(input, gamma0, beta0, gamma1, beta1, dropout_p, epsilon, gen, residual_in_fp32, is_rms_norm): + return ops.dropout_add_ln_parallel_residual_fwd(input, gamma0, beta0, gamma1, beta1, dropout_p, epsilon, gen, residual_in_fp32, is_rms_norm) + +def dropout_add_ln_parallel_residual_bwd(dz0, dz1, dx, x, mu, rsigma, gamma0, gamma1, dropout_p, has_x1, has_residual, is_rms_norm): + return ops.dropout_add_ln_parallel_residual_bwd(dz0, dz1, dx, x, mu, rsigma, gamma0, gamma1, dropout_p, has_x1, has_residual, is_rms_norm) + +__all__ = [ + "layers", + "dropout_add_ln_fwd", + "dropout_add_ln_bwd", + "dropout_add_ln_parallel_residual_fwd", + "dropout_add_ln_parallel_residual_bwd", +] \ No newline at end of file diff --git a/build/torch210-cxx11-cu130-aarch64-linux/_layer_norm_cuda_73ccd0c.abi3.so b/build/torch210-cxx11-cu130-aarch64-linux/_layer_norm_cuda_73ccd0c.abi3.so new file mode 100644 index 0000000000000000000000000000000000000000..0e8093ad578fcf418a995b83b892e465603b4f9c --- /dev/null +++ b/build/torch210-cxx11-cu130-aarch64-linux/_layer_norm_cuda_73ccd0c.abi3.so @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:f1be4d1ef49363641003ad084d112971753c08b1c8ce6755ce073d7e6fce171c +size 1235994200 diff --git 
a/build/torch210-cxx11-cu130-aarch64-linux/_ops.py b/build/torch210-cxx11-cu130-aarch64-linux/_ops.py new file mode 100644 index 0000000000000000000000000000000000000000..ac84e05d9b8d22d65b0113f991fac1560deb7503 --- /dev/null +++ b/build/torch210-cxx11-cu130-aarch64-linux/_ops.py @@ -0,0 +1,9 @@ +import torch +from . import _layer_norm_cuda_73ccd0c +ops = torch.ops._layer_norm_cuda_73ccd0c + +def add_op_namespace_prefix(op_name: str): + """ + Prefix op by namespace. + """ + return f"_layer_norm_cuda_73ccd0c::{op_name}" diff --git a/build/torch210-cxx11-cu130-aarch64-linux/layer_norm/__init__.py b/build/torch210-cxx11-cu130-aarch64-linux/layer_norm/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..a9b2672c1cd85b74c1b3ded0fc0b2100e1aeac23 --- /dev/null +++ b/build/torch210-cxx11-cu130-aarch64-linux/layer_norm/__init__.py @@ -0,0 +1,26 @@ +import ctypes +import importlib.util +import sys +from pathlib import Path +from types import ModuleType + + +def _import_from_path(file_path: Path) -> ModuleType: + # We cannot use the module name as-is, after adding it to `sys.modules`, + # it would also be used for other imports. So, we make a module name that + # depends on the path for it to be unique using the hex-encoded hash of + # the path. 
+ path_hash = "{:x}".format(ctypes.c_size_t(hash(file_path.absolute())).value) + module_name = path_hash + spec = importlib.util.spec_from_file_location(module_name, file_path) + if spec is None: + raise ImportError(f"Cannot load spec for {module_name} from {file_path}") + module = importlib.util.module_from_spec(spec) + if module is None: + raise ImportError(f"Cannot load module {module_name} from spec") + sys.modules[module_name] = module + spec.loader.exec_module(module) # type: ignore + return module + + +globals().update(vars(_import_from_path(Path(__file__).parent.parent / "__init__.py"))) diff --git a/build/torch210-cxx11-cu130-aarch64-linux/layers.py b/build/torch210-cxx11-cu130-aarch64-linux/layers.py new file mode 100644 index 0000000000000000000000000000000000000000..7ed883f42ead452f8b60f498ec11302c53d3cf74 --- /dev/null +++ b/build/torch210-cxx11-cu130-aarch64-linux/layers.py @@ -0,0 +1,51 @@ +import torch +import torch.nn as nn + +from ._ops import ops + + +class LayerNorm(nn.Module): + weight: torch.Tensor + variance_epsilon: float + + def forward(self, hidden_states: torch.Tensor) -> torch.Tensor: + output = ops.dropout_add_ln_fwd( + hidden_states.view(-1, hidden_states.shape[-1]), + gamma = self.weight, + beta = None, + rowscale = None, + colscale = None, + x0_subset = None, + z_subset = None, + dropout_p = 0, + epsilon = self.variance_epsilon, + rowscale_const = 1.0, + z_numrows = hidden_states.shape[1], + gen = None, + residual_in_fp32 = False, + is_rms_norm = False, + ) + return output[0].view(hidden_states.shape) + +class LlamaRMSNorm(nn.Module): + weight: torch.Tensor + variance_epsilon: float + + def forward(self, hidden_states: torch.Tensor) -> torch.Tensor: + output = ops.dropout_add_ln_fwd( + hidden_states.view(-1, hidden_states.shape[-1]), + gamma = self.weight, + beta = None, + rowscale = None, + colscale = None, + x0_subset = None, + z_subset = None, + dropout_p = 0, + epsilon = self.variance_epsilon, + rowscale_const = 1.0, + z_numrows 
= hidden_states.shape[1], + gen = None, + residual_in_fp32 = False, + is_rms_norm = True, + ) + return output[0].view(hidden_states.shape) \ No newline at end of file diff --git a/build/torch210-cxx11-cu130-aarch64-linux/metadata.json b/build/torch210-cxx11-cu130-aarch64-linux/metadata.json new file mode 100644 index 0000000000000000000000000000000000000000..75fccb89550243db092fbc5d218aa4b497699bed --- /dev/null +++ b/build/torch210-cxx11-cu130-aarch64-linux/metadata.json @@ -0,0 +1,17 @@ +{ + "name": "layer-norm", + "id": "_layer_norm_cuda_73ccd0c", + "version": 1, + "license": "BSD-3-Clause", + "python-depends": [], + "backend": { + "type": "cuda", + "archs": [ + "10.0", + "12.0", + "8.0", + "8.9", + "9.0" + ] + } +} diff --git a/build/torch210-cxx11-cu130-x86_64-linux/__init__.py b/build/torch210-cxx11-cu130-x86_64-linux/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..946160f16b9dc91fefeea037fb7ac84fd6afd802 --- /dev/null +++ b/build/torch210-cxx11-cu130-x86_64-linux/__init__.py @@ -0,0 +1,26 @@ +import torch +import torch.nn as nn + +from ._ops import ops + +from . 
import layers + +def dropout_add_ln_fwd(input, gamma, beta, rowscale, colscale, x0_subset, z_subset, dropout_p, epsilon, rowscale_const, z_numrows, gen, residual_in_fp32, is_rms_norm): + return ops.dropout_add_ln_fwd(input, gamma, beta, rowscale, colscale, x0_subset, z_subset, dropout_p, epsilon, rowscale_const, z_numrows, gen, residual_in_fp32, is_rms_norm) + +def dropout_add_ln_bwd(dz, dx, x, mu, rsigma, gamma, rowscale, colscale, x0_subset, z_subset, dropout_p, rowscale_const, x0_numrows, has_residual, is_rms_norm): + return ops.dropout_add_ln_bwd(dz, dx, x, mu, rsigma, gamma, rowscale, colscale, x0_subset, z_subset, dropout_p, rowscale_const, x0_numrows, has_residual, is_rms_norm) + +def dropout_add_ln_parallel_residual_fwd(input, gamma0, beta0, gamma1, beta1, dropout_p, epsilon, gen, residual_in_fp32, is_rms_norm): + return ops.dropout_add_ln_parallel_residual_fwd(input, gamma0, beta0, gamma1, beta1, dropout_p, epsilon, gen, residual_in_fp32, is_rms_norm) + +def dropout_add_ln_parallel_residual_bwd(dz0, dz1, dx, x, mu, rsigma, gamma0, gamma1, dropout_p, has_x1, has_residual, is_rms_norm): + return ops.dropout_add_ln_parallel_residual_bwd(dz0, dz1, dx, x, mu, rsigma, gamma0, gamma1, dropout_p, has_x1, has_residual, is_rms_norm) + +__all__ = [ + "layers", + "dropout_add_ln_fwd", + "dropout_add_ln_bwd", + "dropout_add_ln_parallel_residual_fwd", + "dropout_add_ln_parallel_residual_bwd", +] \ No newline at end of file diff --git a/build/torch210-cxx11-cu130-x86_64-linux/_layer_norm_cuda_73ccd0c.abi3.so b/build/torch210-cxx11-cu130-x86_64-linux/_layer_norm_cuda_73ccd0c.abi3.so new file mode 100644 index 0000000000000000000000000000000000000000..00326bbd2c440f0d8f3a256ffd65b77a222bdfd2 --- /dev/null +++ b/build/torch210-cxx11-cu130-x86_64-linux/_layer_norm_cuda_73ccd0c.abi3.so @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:61244ac2828b69fe5445df5c6564764eb1b1b80c57312c24836b41595aaf4cc1 +size 1238402192 diff --git 
a/build/torch210-cxx11-cu130-x86_64-linux/_ops.py b/build/torch210-cxx11-cu130-x86_64-linux/_ops.py new file mode 100644 index 0000000000000000000000000000000000000000..ac84e05d9b8d22d65b0113f991fac1560deb7503 --- /dev/null +++ b/build/torch210-cxx11-cu130-x86_64-linux/_ops.py @@ -0,0 +1,9 @@ +import torch +from . import _layer_norm_cuda_73ccd0c +ops = torch.ops._layer_norm_cuda_73ccd0c + +def add_op_namespace_prefix(op_name: str): + """ + Prefix op by namespace. + """ + return f"_layer_norm_cuda_73ccd0c::{op_name}" diff --git a/build/torch210-cxx11-cu130-x86_64-linux/layer_norm/__init__.py b/build/torch210-cxx11-cu130-x86_64-linux/layer_norm/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..a9b2672c1cd85b74c1b3ded0fc0b2100e1aeac23 --- /dev/null +++ b/build/torch210-cxx11-cu130-x86_64-linux/layer_norm/__init__.py @@ -0,0 +1,26 @@ +import ctypes +import importlib.util +import sys +from pathlib import Path +from types import ModuleType + + +def _import_from_path(file_path: Path) -> ModuleType: + # We cannot use the module name as-is, after adding it to `sys.modules`, + # it would also be used for other imports. So, we make a module name that + # depends on the path for it to be unique using the hex-encoded hash of + # the path. 
+ path_hash = "{:x}".format(ctypes.c_size_t(hash(file_path.absolute())).value) + module_name = path_hash + spec = importlib.util.spec_from_file_location(module_name, file_path) + if spec is None: + raise ImportError(f"Cannot load spec for {module_name} from {file_path}") + module = importlib.util.module_from_spec(spec) + if module is None: + raise ImportError(f"Cannot load module {module_name} from spec") + sys.modules[module_name] = module + spec.loader.exec_module(module) # type: ignore + return module + + +globals().update(vars(_import_from_path(Path(__file__).parent.parent / "__init__.py"))) diff --git a/build/torch210-cxx11-cu130-x86_64-linux/layers.py b/build/torch210-cxx11-cu130-x86_64-linux/layers.py new file mode 100644 index 0000000000000000000000000000000000000000..7ed883f42ead452f8b60f498ec11302c53d3cf74 --- /dev/null +++ b/build/torch210-cxx11-cu130-x86_64-linux/layers.py @@ -0,0 +1,51 @@ +import torch +import torch.nn as nn + +from ._ops import ops + + +class LayerNorm(nn.Module): + weight: torch.Tensor + variance_epsilon: float + + def forward(self, hidden_states: torch.Tensor) -> torch.Tensor: + output = ops.dropout_add_ln_fwd( + hidden_states.view(-1, hidden_states.shape[-1]), + gamma = self.weight, + beta = None, + rowscale = None, + colscale = None, + x0_subset = None, + z_subset = None, + dropout_p = 0, + epsilon = self.variance_epsilon, + rowscale_const = 1.0, + z_numrows = hidden_states.shape[1], + gen = None, + residual_in_fp32 = False, + is_rms_norm = False, + ) + return output[0].view(hidden_states.shape) + +class LlamaRMSNorm(nn.Module): + weight: torch.Tensor + variance_epsilon: float + + def forward(self, hidden_states: torch.Tensor) -> torch.Tensor: + output = ops.dropout_add_ln_fwd( + hidden_states.view(-1, hidden_states.shape[-1]), + gamma = self.weight, + beta = None, + rowscale = None, + colscale = None, + x0_subset = None, + z_subset = None, + dropout_p = 0, + epsilon = self.variance_epsilon, + rowscale_const = 1.0, + z_numrows = 
hidden_states.shape[1], + gen = None, + residual_in_fp32 = False, + is_rms_norm = True, + ) + return output[0].view(hidden_states.shape) \ No newline at end of file diff --git a/build/torch210-cxx11-cu130-x86_64-linux/metadata.json b/build/torch210-cxx11-cu130-x86_64-linux/metadata.json new file mode 100644 index 0000000000000000000000000000000000000000..75fccb89550243db092fbc5d218aa4b497699bed --- /dev/null +++ b/build/torch210-cxx11-cu130-x86_64-linux/metadata.json @@ -0,0 +1,17 @@ +{ + "name": "layer-norm", + "id": "_layer_norm_cuda_73ccd0c", + "version": 1, + "license": "BSD-3-Clause", + "python-depends": [], + "backend": { + "type": "cuda", + "archs": [ + "10.0", + "12.0", + "8.0", + "8.9", + "9.0" + ] + } +} diff --git a/build/torch211-cxx11-cu126-aarch64-linux/__init__.py b/build/torch211-cxx11-cu126-aarch64-linux/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..946160f16b9dc91fefeea037fb7ac84fd6afd802 --- /dev/null +++ b/build/torch211-cxx11-cu126-aarch64-linux/__init__.py @@ -0,0 +1,26 @@ +import torch +import torch.nn as nn + +from ._ops import ops + +from . 
import layers + +def dropout_add_ln_fwd(input, gamma, beta, rowscale, colscale, x0_subset, z_subset, dropout_p, epsilon, rowscale_const, z_numrows, gen, residual_in_fp32, is_rms_norm): + return ops.dropout_add_ln_fwd(input, gamma, beta, rowscale, colscale, x0_subset, z_subset, dropout_p, epsilon, rowscale_const, z_numrows, gen, residual_in_fp32, is_rms_norm) + +def dropout_add_ln_bwd(dz, dx, x, mu, rsigma, gamma, rowscale, colscale, x0_subset, z_subset, dropout_p, rowscale_const, x0_numrows, has_residual, is_rms_norm): + return ops.dropout_add_ln_bwd(dz, dx, x, mu, rsigma, gamma, rowscale, colscale, x0_subset, z_subset, dropout_p, rowscale_const, x0_numrows, has_residual, is_rms_norm) + +def dropout_add_ln_parallel_residual_fwd(input, gamma0, beta0, gamma1, beta1, dropout_p, epsilon, gen, residual_in_fp32, is_rms_norm): + return ops.dropout_add_ln_parallel_residual_fwd(input, gamma0, beta0, gamma1, beta1, dropout_p, epsilon, gen, residual_in_fp32, is_rms_norm) + +def dropout_add_ln_parallel_residual_bwd(dz0, dz1, dx, x, mu, rsigma, gamma0, gamma1, dropout_p, has_x1, has_residual, is_rms_norm): + return ops.dropout_add_ln_parallel_residual_bwd(dz0, dz1, dx, x, mu, rsigma, gamma0, gamma1, dropout_p, has_x1, has_residual, is_rms_norm) + +__all__ = [ + "layers", + "dropout_add_ln_fwd", + "dropout_add_ln_bwd", + "dropout_add_ln_parallel_residual_fwd", + "dropout_add_ln_parallel_residual_bwd", +] \ No newline at end of file diff --git a/build/torch211-cxx11-cu126-aarch64-linux/_layer_norm_cuda_73ccd0c.abi3.so b/build/torch211-cxx11-cu126-aarch64-linux/_layer_norm_cuda_73ccd0c.abi3.so new file mode 100644 index 0000000000000000000000000000000000000000..1a060127cf4bfd141745ec93bbd0478b4fb73286 --- /dev/null +++ b/build/torch211-cxx11-cu126-aarch64-linux/_layer_norm_cuda_73ccd0c.abi3.so @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:9ebbc069d60e09aea141d01bef3f1a14b81d315d2e944f78b99ee794a370f199 +size 711706784 diff --git 
a/build/torch211-cxx11-cu126-aarch64-linux/_ops.py b/build/torch211-cxx11-cu126-aarch64-linux/_ops.py new file mode 100644 index 0000000000000000000000000000000000000000..ac84e05d9b8d22d65b0113f991fac1560deb7503 --- /dev/null +++ b/build/torch211-cxx11-cu126-aarch64-linux/_ops.py @@ -0,0 +1,9 @@ +import torch +from . import _layer_norm_cuda_73ccd0c +ops = torch.ops._layer_norm_cuda_73ccd0c + +def add_op_namespace_prefix(op_name: str): + """ + Prefix op by namespace. + """ + return f"_layer_norm_cuda_73ccd0c::{op_name}" diff --git a/build/torch211-cxx11-cu126-aarch64-linux/layer_norm/__init__.py b/build/torch211-cxx11-cu126-aarch64-linux/layer_norm/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..a9b2672c1cd85b74c1b3ded0fc0b2100e1aeac23 --- /dev/null +++ b/build/torch211-cxx11-cu126-aarch64-linux/layer_norm/__init__.py @@ -0,0 +1,26 @@ +import ctypes +import importlib.util +import sys +from pathlib import Path +from types import ModuleType + + +def _import_from_path(file_path: Path) -> ModuleType: + # We cannot use the module name as-is, after adding it to `sys.modules`, + # it would also be used for other imports. So, we make a module name that + # depends on the path for it to be unique using the hex-encoded hash of + # the path. 
+ path_hash = "{:x}".format(ctypes.c_size_t(hash(file_path.absolute())).value) + module_name = path_hash + spec = importlib.util.spec_from_file_location(module_name, file_path) + if spec is None: + raise ImportError(f"Cannot load spec for {module_name} from {file_path}") + module = importlib.util.module_from_spec(spec) + if module is None: + raise ImportError(f"Cannot load module {module_name} from spec") + sys.modules[module_name] = module + spec.loader.exec_module(module) # type: ignore + return module + + +globals().update(vars(_import_from_path(Path(__file__).parent.parent / "__init__.py"))) diff --git a/build/torch211-cxx11-cu126-aarch64-linux/layers.py b/build/torch211-cxx11-cu126-aarch64-linux/layers.py new file mode 100644 index 0000000000000000000000000000000000000000..7ed883f42ead452f8b60f498ec11302c53d3cf74 --- /dev/null +++ b/build/torch211-cxx11-cu126-aarch64-linux/layers.py @@ -0,0 +1,51 @@ +import torch +import torch.nn as nn + +from ._ops import ops + + +class LayerNorm(nn.Module): + weight: torch.Tensor + variance_epsilon: float + + def forward(self, hidden_states: torch.Tensor) -> torch.Tensor: + output = ops.dropout_add_ln_fwd( + hidden_states.view(-1, hidden_states.shape[-1]), + gamma = self.weight, + beta = None, + rowscale = None, + colscale = None, + x0_subset = None, + z_subset = None, + dropout_p = 0, + epsilon = self.variance_epsilon, + rowscale_const = 1.0, + z_numrows = hidden_states.shape[1], + gen = None, + residual_in_fp32 = False, + is_rms_norm = False, + ) + return output[0].view(hidden_states.shape) + +class LlamaRMSNorm(nn.Module): + weight: torch.Tensor + variance_epsilon: float + + def forward(self, hidden_states: torch.Tensor) -> torch.Tensor: + output = ops.dropout_add_ln_fwd( + hidden_states.view(-1, hidden_states.shape[-1]), + gamma = self.weight, + beta = None, + rowscale = None, + colscale = None, + x0_subset = None, + z_subset = None, + dropout_p = 0, + epsilon = self.variance_epsilon, + rowscale_const = 1.0, + z_numrows 
= hidden_states.shape[1], + gen = None, + residual_in_fp32 = False, + is_rms_norm = True, + ) + return output[0].view(hidden_states.shape) \ No newline at end of file diff --git a/build/torch211-cxx11-cu126-aarch64-linux/metadata.json b/build/torch211-cxx11-cu126-aarch64-linux/metadata.json new file mode 100644 index 0000000000000000000000000000000000000000..e16b07bc42edabbfb0c8f5e9a87013306d25ea4f --- /dev/null +++ b/build/torch211-cxx11-cu126-aarch64-linux/metadata.json @@ -0,0 +1,15 @@ +{ + "name": "layer-norm", + "id": "_layer_norm_cuda_73ccd0c", + "version": 1, + "license": "BSD-3-Clause", + "python-depends": [], + "backend": { + "type": "cuda", + "archs": [ + "8.0", + "8.9", + "9.0" + ] + } +} diff --git a/build/torch211-cxx11-cu126-x86_64-linux/__init__.py b/build/torch211-cxx11-cu126-x86_64-linux/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..946160f16b9dc91fefeea037fb7ac84fd6afd802 --- /dev/null +++ b/build/torch211-cxx11-cu126-x86_64-linux/__init__.py @@ -0,0 +1,26 @@ +import torch +import torch.nn as nn + +from ._ops import ops + +from . 
import layers + +def dropout_add_ln_fwd(input, gamma, beta, rowscale, colscale, x0_subset, z_subset, dropout_p, epsilon, rowscale_const, z_numrows, gen, residual_in_fp32, is_rms_norm): + return ops.dropout_add_ln_fwd(input, gamma, beta, rowscale, colscale, x0_subset, z_subset, dropout_p, epsilon, rowscale_const, z_numrows, gen, residual_in_fp32, is_rms_norm) + +def dropout_add_ln_bwd(dz, dx, x, mu, rsigma, gamma, rowscale, colscale, x0_subset, z_subset, dropout_p, rowscale_const, x0_numrows, has_residual, is_rms_norm): + return ops.dropout_add_ln_bwd(dz, dx, x, mu, rsigma, gamma, rowscale, colscale, x0_subset, z_subset, dropout_p, rowscale_const, x0_numrows, has_residual, is_rms_norm) + +def dropout_add_ln_parallel_residual_fwd(input, gamma0, beta0, gamma1, beta1, dropout_p, epsilon, gen, residual_in_fp32, is_rms_norm): + return ops.dropout_add_ln_parallel_residual_fwd(input, gamma0, beta0, gamma1, beta1, dropout_p, epsilon, gen, residual_in_fp32, is_rms_norm) + +def dropout_add_ln_parallel_residual_bwd(dz0, dz1, dx, x, mu, rsigma, gamma0, gamma1, dropout_p, has_x1, has_residual, is_rms_norm): + return ops.dropout_add_ln_parallel_residual_bwd(dz0, dz1, dx, x, mu, rsigma, gamma0, gamma1, dropout_p, has_x1, has_residual, is_rms_norm) + +__all__ = [ + "layers", + "dropout_add_ln_fwd", + "dropout_add_ln_bwd", + "dropout_add_ln_parallel_residual_fwd", + "dropout_add_ln_parallel_residual_bwd", +] \ No newline at end of file diff --git a/build/torch211-cxx11-cu126-x86_64-linux/_layer_norm_cuda_73ccd0c.abi3.so b/build/torch211-cxx11-cu126-x86_64-linux/_layer_norm_cuda_73ccd0c.abi3.so new file mode 100644 index 0000000000000000000000000000000000000000..3b570e3a18b2fec03e0a53d608caa4e809e7ea22 --- /dev/null +++ b/build/torch211-cxx11-cu126-x86_64-linux/_layer_norm_cuda_73ccd0c.abi3.so @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:a1586a1b500ccc87796c33107b971e38b73d49257aa935a8568c235021490cb9 +size 712082776 diff --git 
a/build/torch211-cxx11-cu126-x86_64-linux/_ops.py b/build/torch211-cxx11-cu126-x86_64-linux/_ops.py new file mode 100644 index 0000000000000000000000000000000000000000..ac84e05d9b8d22d65b0113f991fac1560deb7503 --- /dev/null +++ b/build/torch211-cxx11-cu126-x86_64-linux/_ops.py @@ -0,0 +1,9 @@ +import torch +from . import _layer_norm_cuda_73ccd0c +ops = torch.ops._layer_norm_cuda_73ccd0c + +def add_op_namespace_prefix(op_name: str): + """ + Prefix op by namespace. + """ + return f"_layer_norm_cuda_73ccd0c::{op_name}" diff --git a/build/torch211-cxx11-cu126-x86_64-linux/layer_norm/__init__.py b/build/torch211-cxx11-cu126-x86_64-linux/layer_norm/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..a9b2672c1cd85b74c1b3ded0fc0b2100e1aeac23 --- /dev/null +++ b/build/torch211-cxx11-cu126-x86_64-linux/layer_norm/__init__.py @@ -0,0 +1,26 @@ +import ctypes +import importlib.util +import sys +from pathlib import Path +from types import ModuleType + + +def _import_from_path(file_path: Path) -> ModuleType: + # We cannot use the module name as-is, after adding it to `sys.modules`, + # it would also be used for other imports. So, we make a module name that + # depends on the path for it to be unique using the hex-encoded hash of + # the path. 
+ path_hash = "{:x}".format(ctypes.c_size_t(hash(file_path.absolute())).value) + module_name = path_hash + spec = importlib.util.spec_from_file_location(module_name, file_path) + if spec is None: + raise ImportError(f"Cannot load spec for {module_name} from {file_path}") + module = importlib.util.module_from_spec(spec) + if module is None: + raise ImportError(f"Cannot load module {module_name} from spec") + sys.modules[module_name] = module + spec.loader.exec_module(module) # type: ignore + return module + + +globals().update(vars(_import_from_path(Path(__file__).parent.parent / "__init__.py"))) diff --git a/build/torch211-cxx11-cu126-x86_64-linux/layers.py b/build/torch211-cxx11-cu126-x86_64-linux/layers.py new file mode 100644 index 0000000000000000000000000000000000000000..7ed883f42ead452f8b60f498ec11302c53d3cf74 --- /dev/null +++ b/build/torch211-cxx11-cu126-x86_64-linux/layers.py @@ -0,0 +1,51 @@ +import torch +import torch.nn as nn + +from ._ops import ops + + +class LayerNorm(nn.Module): + weight: torch.Tensor + variance_epsilon: float + + def forward(self, hidden_states: torch.Tensor) -> torch.Tensor: + output = ops.dropout_add_ln_fwd( + hidden_states.view(-1, hidden_states.shape[-1]), + gamma = self.weight, + beta = None, + rowscale = None, + colscale = None, + x0_subset = None, + z_subset = None, + dropout_p = 0, + epsilon = self.variance_epsilon, + rowscale_const = 1.0, + z_numrows = hidden_states.shape[1], + gen = None, + residual_in_fp32 = False, + is_rms_norm = False, + ) + return output[0].view(hidden_states.shape) + +class LlamaRMSNorm(nn.Module): + weight: torch.Tensor + variance_epsilon: float + + def forward(self, hidden_states: torch.Tensor) -> torch.Tensor: + output = ops.dropout_add_ln_fwd( + hidden_states.view(-1, hidden_states.shape[-1]), + gamma = self.weight, + beta = None, + rowscale = None, + colscale = None, + x0_subset = None, + z_subset = None, + dropout_p = 0, + epsilon = self.variance_epsilon, + rowscale_const = 1.0, + z_numrows = 
hidden_states.shape[1], + gen = None, + residual_in_fp32 = False, + is_rms_norm = True, + ) + return output[0].view(hidden_states.shape) \ No newline at end of file diff --git a/build/torch211-cxx11-cu126-x86_64-linux/metadata.json b/build/torch211-cxx11-cu126-x86_64-linux/metadata.json new file mode 100644 index 0000000000000000000000000000000000000000..e16b07bc42edabbfb0c8f5e9a87013306d25ea4f --- /dev/null +++ b/build/torch211-cxx11-cu126-x86_64-linux/metadata.json @@ -0,0 +1,15 @@ +{ + "name": "layer-norm", + "id": "_layer_norm_cuda_73ccd0c", + "version": 1, + "license": "BSD-3-Clause", + "python-depends": [], + "backend": { + "type": "cuda", + "archs": [ + "8.0", + "8.9", + "9.0" + ] + } +} diff --git a/build/torch211-cxx11-cu128-aarch64-linux/__init__.py b/build/torch211-cxx11-cu128-aarch64-linux/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..946160f16b9dc91fefeea037fb7ac84fd6afd802 --- /dev/null +++ b/build/torch211-cxx11-cu128-aarch64-linux/__init__.py @@ -0,0 +1,26 @@ +import torch +import torch.nn as nn + +from ._ops import ops + +from . 
import layers + +def dropout_add_ln_fwd(input, gamma, beta, rowscale, colscale, x0_subset, z_subset, dropout_p, epsilon, rowscale_const, z_numrows, gen, residual_in_fp32, is_rms_norm): + return ops.dropout_add_ln_fwd(input, gamma, beta, rowscale, colscale, x0_subset, z_subset, dropout_p, epsilon, rowscale_const, z_numrows, gen, residual_in_fp32, is_rms_norm) + +def dropout_add_ln_bwd(dz, dx, x, mu, rsigma, gamma, rowscale, colscale, x0_subset, z_subset, dropout_p, rowscale_const, x0_numrows, has_residual, is_rms_norm): + return ops.dropout_add_ln_bwd(dz, dx, x, mu, rsigma, gamma, rowscale, colscale, x0_subset, z_subset, dropout_p, rowscale_const, x0_numrows, has_residual, is_rms_norm) + +def dropout_add_ln_parallel_residual_fwd(input, gamma0, beta0, gamma1, beta1, dropout_p, epsilon, gen, residual_in_fp32, is_rms_norm): + return ops.dropout_add_ln_parallel_residual_fwd(input, gamma0, beta0, gamma1, beta1, dropout_p, epsilon, gen, residual_in_fp32, is_rms_norm) + +def dropout_add_ln_parallel_residual_bwd(dz0, dz1, dx, x, mu, rsigma, gamma0, gamma1, dropout_p, has_x1, has_residual, is_rms_norm): + return ops.dropout_add_ln_parallel_residual_bwd(dz0, dz1, dx, x, mu, rsigma, gamma0, gamma1, dropout_p, has_x1, has_residual, is_rms_norm) + +__all__ = [ + "layers", + "dropout_add_ln_fwd", + "dropout_add_ln_bwd", + "dropout_add_ln_parallel_residual_fwd", + "dropout_add_ln_parallel_residual_bwd", +] \ No newline at end of file diff --git a/build/torch211-cxx11-cu128-aarch64-linux/_layer_norm_cuda_73ccd0c.abi3.so b/build/torch211-cxx11-cu128-aarch64-linux/_layer_norm_cuda_73ccd0c.abi3.so new file mode 100644 index 0000000000000000000000000000000000000000..c50d3b0b9e94b2b206d59352e12dc5988b5057f4 --- /dev/null +++ b/build/torch211-cxx11-cu128-aarch64-linux/_layer_norm_cuda_73ccd0c.abi3.so @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:b2ca8ce06883b52cc6fc635f080a8a1e025a2cf87a9ad151df8525d2d1842c87 +size 1231079512 diff --git 
a/build/torch211-cxx11-cu128-aarch64-linux/_ops.py b/build/torch211-cxx11-cu128-aarch64-linux/_ops.py new file mode 100644 index 0000000000000000000000000000000000000000..ac84e05d9b8d22d65b0113f991fac1560deb7503 --- /dev/null +++ b/build/torch211-cxx11-cu128-aarch64-linux/_ops.py @@ -0,0 +1,9 @@ +import torch +from . import _layer_norm_cuda_73ccd0c +ops = torch.ops._layer_norm_cuda_73ccd0c + +def add_op_namespace_prefix(op_name: str): + """ + Prefix op by namespace. + """ + return f"_layer_norm_cuda_73ccd0c::{op_name}" diff --git a/build/torch211-cxx11-cu128-aarch64-linux/layer_norm/__init__.py b/build/torch211-cxx11-cu128-aarch64-linux/layer_norm/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..a9b2672c1cd85b74c1b3ded0fc0b2100e1aeac23 --- /dev/null +++ b/build/torch211-cxx11-cu128-aarch64-linux/layer_norm/__init__.py @@ -0,0 +1,26 @@ +import ctypes +import importlib.util +import sys +from pathlib import Path +from types import ModuleType + + +def _import_from_path(file_path: Path) -> ModuleType: + # We cannot use the module name as-is, after adding it to `sys.modules`, + # it would also be used for other imports. So, we make a module name that + # depends on the path for it to be unique using the hex-encoded hash of + # the path. 
+ path_hash = "{:x}".format(ctypes.c_size_t(hash(file_path.absolute())).value) + module_name = path_hash + spec = importlib.util.spec_from_file_location(module_name, file_path) + if spec is None: + raise ImportError(f"Cannot load spec for {module_name} from {file_path}") + module = importlib.util.module_from_spec(spec) + if module is None: + raise ImportError(f"Cannot load module {module_name} from spec") + sys.modules[module_name] = module + spec.loader.exec_module(module) # type: ignore + return module + + +globals().update(vars(_import_from_path(Path(__file__).parent.parent / "__init__.py"))) diff --git a/build/torch211-cxx11-cu128-aarch64-linux/layers.py b/build/torch211-cxx11-cu128-aarch64-linux/layers.py new file mode 100644 index 0000000000000000000000000000000000000000..7ed883f42ead452f8b60f498ec11302c53d3cf74 --- /dev/null +++ b/build/torch211-cxx11-cu128-aarch64-linux/layers.py @@ -0,0 +1,51 @@ +import torch +import torch.nn as nn + +from ._ops import ops + + +class LayerNorm(nn.Module): + weight: torch.Tensor + variance_epsilon: float + + def forward(self, hidden_states: torch.Tensor) -> torch.Tensor: + output = ops.dropout_add_ln_fwd( + hidden_states.view(-1, hidden_states.shape[-1]), + gamma = self.weight, + beta = None, + rowscale = None, + colscale = None, + x0_subset = None, + z_subset = None, + dropout_p = 0, + epsilon = self.variance_epsilon, + rowscale_const = 1.0, + z_numrows = hidden_states.shape[1], + gen = None, + residual_in_fp32 = False, + is_rms_norm = False, + ) + return output[0].view(hidden_states.shape) + +class LlamaRMSNorm(nn.Module): + weight: torch.Tensor + variance_epsilon: float + + def forward(self, hidden_states: torch.Tensor) -> torch.Tensor: + output = ops.dropout_add_ln_fwd( + hidden_states.view(-1, hidden_states.shape[-1]), + gamma = self.weight, + beta = None, + rowscale = None, + colscale = None, + x0_subset = None, + z_subset = None, + dropout_p = 0, + epsilon = self.variance_epsilon, + rowscale_const = 1.0, + z_numrows 
= hidden_states.shape[1], + gen = None, + residual_in_fp32 = False, + is_rms_norm = True, + ) + return output[0].view(hidden_states.shape) \ No newline at end of file diff --git a/build/torch211-cxx11-cu128-aarch64-linux/metadata.json b/build/torch211-cxx11-cu128-aarch64-linux/metadata.json new file mode 100644 index 0000000000000000000000000000000000000000..75fccb89550243db092fbc5d218aa4b497699bed --- /dev/null +++ b/build/torch211-cxx11-cu128-aarch64-linux/metadata.json @@ -0,0 +1,17 @@ +{ + "name": "layer-norm", + "id": "_layer_norm_cuda_73ccd0c", + "version": 1, + "license": "BSD-3-Clause", + "python-depends": [], + "backend": { + "type": "cuda", + "archs": [ + "10.0", + "12.0", + "8.0", + "8.9", + "9.0" + ] + } +} diff --git a/build/torch211-cxx11-cu128-x86_64-linux/__init__.py b/build/torch211-cxx11-cu128-x86_64-linux/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..946160f16b9dc91fefeea037fb7ac84fd6afd802 --- /dev/null +++ b/build/torch211-cxx11-cu128-x86_64-linux/__init__.py @@ -0,0 +1,26 @@ +import torch +import torch.nn as nn + +from ._ops import ops + +from . 
import layers + +def dropout_add_ln_fwd(input, gamma, beta, rowscale, colscale, x0_subset, z_subset, dropout_p, epsilon, rowscale_const, z_numrows, gen, residual_in_fp32, is_rms_norm): + return ops.dropout_add_ln_fwd(input, gamma, beta, rowscale, colscale, x0_subset, z_subset, dropout_p, epsilon, rowscale_const, z_numrows, gen, residual_in_fp32, is_rms_norm) + +def dropout_add_ln_bwd(dz, dx, x, mu, rsigma, gamma, rowscale, colscale, x0_subset, z_subset, dropout_p, rowscale_const, x0_numrows, has_residual, is_rms_norm): + return ops.dropout_add_ln_bwd(dz, dx, x, mu, rsigma, gamma, rowscale, colscale, x0_subset, z_subset, dropout_p, rowscale_const, x0_numrows, has_residual, is_rms_norm) + +def dropout_add_ln_parallel_residual_fwd(input, gamma0, beta0, gamma1, beta1, dropout_p, epsilon, gen, residual_in_fp32, is_rms_norm): + return ops.dropout_add_ln_parallel_residual_fwd(input, gamma0, beta0, gamma1, beta1, dropout_p, epsilon, gen, residual_in_fp32, is_rms_norm) + +def dropout_add_ln_parallel_residual_bwd(dz0, dz1, dx, x, mu, rsigma, gamma0, gamma1, dropout_p, has_x1, has_residual, is_rms_norm): + return ops.dropout_add_ln_parallel_residual_bwd(dz0, dz1, dx, x, mu, rsigma, gamma0, gamma1, dropout_p, has_x1, has_residual, is_rms_norm) + +__all__ = [ + "layers", + "dropout_add_ln_fwd", + "dropout_add_ln_bwd", + "dropout_add_ln_parallel_residual_fwd", + "dropout_add_ln_parallel_residual_bwd", +] \ No newline at end of file diff --git a/build/torch211-cxx11-cu128-x86_64-linux/_layer_norm_cuda_73ccd0c.abi3.so b/build/torch211-cxx11-cu128-x86_64-linux/_layer_norm_cuda_73ccd0c.abi3.so new file mode 100644 index 0000000000000000000000000000000000000000..b6c847b224019ee201ff7ef55e3baba3e3b02087 --- /dev/null +++ b/build/torch211-cxx11-cu128-x86_64-linux/_layer_norm_cuda_73ccd0c.abi3.so @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:be4534f53612397be85ac1dc8075c4dcdb8aba68842de6e909c15ea3ab4cc64d +size 1231408464 diff --git 
a/build/torch211-cxx11-cu128-x86_64-linux/_ops.py b/build/torch211-cxx11-cu128-x86_64-linux/_ops.py new file mode 100644 index 0000000000000000000000000000000000000000..ac84e05d9b8d22d65b0113f991fac1560deb7503 --- /dev/null +++ b/build/torch211-cxx11-cu128-x86_64-linux/_ops.py @@ -0,0 +1,9 @@ +import torch +from . import _layer_norm_cuda_73ccd0c +ops = torch.ops._layer_norm_cuda_73ccd0c + +def add_op_namespace_prefix(op_name: str): + """ + Prefix op by namespace. + """ + return f"_layer_norm_cuda_73ccd0c::{op_name}" diff --git a/build/torch211-cxx11-cu128-x86_64-linux/layer_norm/__init__.py b/build/torch211-cxx11-cu128-x86_64-linux/layer_norm/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..a9b2672c1cd85b74c1b3ded0fc0b2100e1aeac23 --- /dev/null +++ b/build/torch211-cxx11-cu128-x86_64-linux/layer_norm/__init__.py @@ -0,0 +1,26 @@ +import ctypes +import importlib.util +import sys +from pathlib import Path +from types import ModuleType + + +def _import_from_path(file_path: Path) -> ModuleType: + # We cannot use the module name as-is, after adding it to `sys.modules`, + # it would also be used for other imports. So, we make a module name that + # depends on the path for it to be unique using the hex-encoded hash of + # the path. 
+ path_hash = "{:x}".format(ctypes.c_size_t(hash(file_path.absolute())).value) + module_name = path_hash + spec = importlib.util.spec_from_file_location(module_name, file_path) + if spec is None: + raise ImportError(f"Cannot load spec for {module_name} from {file_path}") + module = importlib.util.module_from_spec(spec) + if module is None: + raise ImportError(f"Cannot load module {module_name} from spec") + sys.modules[module_name] = module + spec.loader.exec_module(module) # type: ignore + return module + + +globals().update(vars(_import_from_path(Path(__file__).parent.parent / "__init__.py"))) diff --git a/build/torch211-cxx11-cu128-x86_64-linux/layers.py b/build/torch211-cxx11-cu128-x86_64-linux/layers.py new file mode 100644 index 0000000000000000000000000000000000000000..7ed883f42ead452f8b60f498ec11302c53d3cf74 --- /dev/null +++ b/build/torch211-cxx11-cu128-x86_64-linux/layers.py @@ -0,0 +1,51 @@ +import torch +import torch.nn as nn + +from ._ops import ops + + +class LayerNorm(nn.Module): + weight: torch.Tensor + variance_epsilon: float + + def forward(self, hidden_states: torch.Tensor) -> torch.Tensor: + output = ops.dropout_add_ln_fwd( + hidden_states.view(-1, hidden_states.shape[-1]), + gamma = self.weight, + beta = None, + rowscale = None, + colscale = None, + x0_subset = None, + z_subset = None, + dropout_p = 0, + epsilon = self.variance_epsilon, + rowscale_const = 1.0, + z_numrows = hidden_states.shape[1], + gen = None, + residual_in_fp32 = False, + is_rms_norm = False, + ) + return output[0].view(hidden_states.shape) + +class LlamaRMSNorm(nn.Module): + weight: torch.Tensor + variance_epsilon: float + + def forward(self, hidden_states: torch.Tensor) -> torch.Tensor: + output = ops.dropout_add_ln_fwd( + hidden_states.view(-1, hidden_states.shape[-1]), + gamma = self.weight, + beta = None, + rowscale = None, + colscale = None, + x0_subset = None, + z_subset = None, + dropout_p = 0, + epsilon = self.variance_epsilon, + rowscale_const = 1.0, + z_numrows = 
hidden_states.shape[1], + gen = None, + residual_in_fp32 = False, + is_rms_norm = True, + ) + return output[0].view(hidden_states.shape) \ No newline at end of file diff --git a/build/torch211-cxx11-cu128-x86_64-linux/metadata.json b/build/torch211-cxx11-cu128-x86_64-linux/metadata.json new file mode 100644 index 0000000000000000000000000000000000000000..75fccb89550243db092fbc5d218aa4b497699bed --- /dev/null +++ b/build/torch211-cxx11-cu128-x86_64-linux/metadata.json @@ -0,0 +1,17 @@ +{ + "name": "layer-norm", + "id": "_layer_norm_cuda_73ccd0c", + "version": 1, + "license": "BSD-3-Clause", + "python-depends": [], + "backend": { + "type": "cuda", + "archs": [ + "10.0", + "12.0", + "8.0", + "8.9", + "9.0" + ] + } +} diff --git a/build/torch211-cxx11-cu130-aarch64-linux/__init__.py b/build/torch211-cxx11-cu130-aarch64-linux/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..946160f16b9dc91fefeea037fb7ac84fd6afd802 --- /dev/null +++ b/build/torch211-cxx11-cu130-aarch64-linux/__init__.py @@ -0,0 +1,26 @@ +import torch +import torch.nn as nn + +from ._ops import ops + +from . 
import layers + +def dropout_add_ln_fwd(input, gamma, beta, rowscale, colscale, x0_subset, z_subset, dropout_p, epsilon, rowscale_const, z_numrows, gen, residual_in_fp32, is_rms_norm): + return ops.dropout_add_ln_fwd(input, gamma, beta, rowscale, colscale, x0_subset, z_subset, dropout_p, epsilon, rowscale_const, z_numrows, gen, residual_in_fp32, is_rms_norm) + +def dropout_add_ln_bwd(dz, dx, x, mu, rsigma, gamma, rowscale, colscale, x0_subset, z_subset, dropout_p, rowscale_const, x0_numrows, has_residual, is_rms_norm): + return ops.dropout_add_ln_bwd(dz, dx, x, mu, rsigma, gamma, rowscale, colscale, x0_subset, z_subset, dropout_p, rowscale_const, x0_numrows, has_residual, is_rms_norm) + +def dropout_add_ln_parallel_residual_fwd(input, gamma0, beta0, gamma1, beta1, dropout_p, epsilon, gen, residual_in_fp32, is_rms_norm): + return ops.dropout_add_ln_parallel_residual_fwd(input, gamma0, beta0, gamma1, beta1, dropout_p, epsilon, gen, residual_in_fp32, is_rms_norm) + +def dropout_add_ln_parallel_residual_bwd(dz0, dz1, dx, x, mu, rsigma, gamma0, gamma1, dropout_p, has_x1, has_residual, is_rms_norm): + return ops.dropout_add_ln_parallel_residual_bwd(dz0, dz1, dx, x, mu, rsigma, gamma0, gamma1, dropout_p, has_x1, has_residual, is_rms_norm) + +__all__ = [ + "layers", + "dropout_add_ln_fwd", + "dropout_add_ln_bwd", + "dropout_add_ln_parallel_residual_fwd", + "dropout_add_ln_parallel_residual_bwd", +] \ No newline at end of file diff --git a/build/torch211-cxx11-cu130-aarch64-linux/_layer_norm_cuda_73ccd0c.abi3.so b/build/torch211-cxx11-cu130-aarch64-linux/_layer_norm_cuda_73ccd0c.abi3.so new file mode 100644 index 0000000000000000000000000000000000000000..5050382923073d1f5f2e171e58cd28986724459f --- /dev/null +++ b/build/torch211-cxx11-cu130-aarch64-linux/_layer_norm_cuda_73ccd0c.abi3.so @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:d526240fb5ecde4f704be1bd671e339a0da2f7861af43d1b87535c920d841d81 +size 1235990520 diff --git 
a/build/torch211-cxx11-cu130-aarch64-linux/_ops.py b/build/torch211-cxx11-cu130-aarch64-linux/_ops.py new file mode 100644 index 0000000000000000000000000000000000000000..ac84e05d9b8d22d65b0113f991fac1560deb7503 --- /dev/null +++ b/build/torch211-cxx11-cu130-aarch64-linux/_ops.py @@ -0,0 +1,9 @@ +import torch +from . import _layer_norm_cuda_73ccd0c +ops = torch.ops._layer_norm_cuda_73ccd0c + +def add_op_namespace_prefix(op_name: str): + """ + Prefix op by namespace. + """ + return f"_layer_norm_cuda_73ccd0c::{op_name}" diff --git a/build/torch211-cxx11-cu130-aarch64-linux/layer_norm/__init__.py b/build/torch211-cxx11-cu130-aarch64-linux/layer_norm/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..a9b2672c1cd85b74c1b3ded0fc0b2100e1aeac23 --- /dev/null +++ b/build/torch211-cxx11-cu130-aarch64-linux/layer_norm/__init__.py @@ -0,0 +1,26 @@ +import ctypes +import importlib.util +import sys +from pathlib import Path +from types import ModuleType + + +def _import_from_path(file_path: Path) -> ModuleType: + # We cannot use the module name as-is, after adding it to `sys.modules`, + # it would also be used for other imports. So, we make a module name that + # depends on the path for it to be unique using the hex-encoded hash of + # the path. 
+ path_hash = "{:x}".format(ctypes.c_size_t(hash(file_path.absolute())).value) + module_name = path_hash + spec = importlib.util.spec_from_file_location(module_name, file_path) + if spec is None: + raise ImportError(f"Cannot load spec for {module_name} from {file_path}") + module = importlib.util.module_from_spec(spec) + if module is None: + raise ImportError(f"Cannot load module {module_name} from spec") + sys.modules[module_name] = module + spec.loader.exec_module(module) # type: ignore + return module + + +globals().update(vars(_import_from_path(Path(__file__).parent.parent / "__init__.py"))) diff --git a/build/torch211-cxx11-cu130-aarch64-linux/layers.py b/build/torch211-cxx11-cu130-aarch64-linux/layers.py new file mode 100644 index 0000000000000000000000000000000000000000..7ed883f42ead452f8b60f498ec11302c53d3cf74 --- /dev/null +++ b/build/torch211-cxx11-cu130-aarch64-linux/layers.py @@ -0,0 +1,51 @@ +import torch +import torch.nn as nn + +from ._ops import ops + + +class LayerNorm(nn.Module): + weight: torch.Tensor + variance_epsilon: float + + def forward(self, hidden_states: torch.Tensor) -> torch.Tensor: + output = ops.dropout_add_ln_fwd( + hidden_states.view(-1, hidden_states.shape[-1]), + gamma = self.weight, + beta = None, + rowscale = None, + colscale = None, + x0_subset = None, + z_subset = None, + dropout_p = 0, + epsilon = self.variance_epsilon, + rowscale_const = 1.0, + z_numrows = hidden_states.shape[1], + gen = None, + residual_in_fp32 = False, + is_rms_norm = False, + ) + return output[0].view(hidden_states.shape) + +class LlamaRMSNorm(nn.Module): + weight: torch.Tensor + variance_epsilon: float + + def forward(self, hidden_states: torch.Tensor) -> torch.Tensor: + output = ops.dropout_add_ln_fwd( + hidden_states.view(-1, hidden_states.shape[-1]), + gamma = self.weight, + beta = None, + rowscale = None, + colscale = None, + x0_subset = None, + z_subset = None, + dropout_p = 0, + epsilon = self.variance_epsilon, + rowscale_const = 1.0, + z_numrows 
= hidden_states.shape[1], + gen = None, + residual_in_fp32 = False, + is_rms_norm = True, + ) + return output[0].view(hidden_states.shape) \ No newline at end of file diff --git a/build/torch211-cxx11-cu130-aarch64-linux/metadata.json b/build/torch211-cxx11-cu130-aarch64-linux/metadata.json new file mode 100644 index 0000000000000000000000000000000000000000..75fccb89550243db092fbc5d218aa4b497699bed --- /dev/null +++ b/build/torch211-cxx11-cu130-aarch64-linux/metadata.json @@ -0,0 +1,17 @@ +{ + "name": "layer-norm", + "id": "_layer_norm_cuda_73ccd0c", + "version": 1, + "license": "BSD-3-Clause", + "python-depends": [], + "backend": { + "type": "cuda", + "archs": [ + "10.0", + "12.0", + "8.0", + "8.9", + "9.0" + ] + } +} diff --git a/build/torch211-cxx11-cu130-x86_64-linux/__init__.py b/build/torch211-cxx11-cu130-x86_64-linux/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..946160f16b9dc91fefeea037fb7ac84fd6afd802 --- /dev/null +++ b/build/torch211-cxx11-cu130-x86_64-linux/__init__.py @@ -0,0 +1,26 @@ +import torch +import torch.nn as nn + +from ._ops import ops + +from . 
import layers + +def dropout_add_ln_fwd(input, gamma, beta, rowscale, colscale, x0_subset, z_subset, dropout_p, epsilon, rowscale_const, z_numrows, gen, residual_in_fp32, is_rms_norm): + return ops.dropout_add_ln_fwd(input, gamma, beta, rowscale, colscale, x0_subset, z_subset, dropout_p, epsilon, rowscale_const, z_numrows, gen, residual_in_fp32, is_rms_norm) + +def dropout_add_ln_bwd(dz, dx, x, mu, rsigma, gamma, rowscale, colscale, x0_subset, z_subset, dropout_p, rowscale_const, x0_numrows, has_residual, is_rms_norm): + return ops.dropout_add_ln_bwd(dz, dx, x, mu, rsigma, gamma, rowscale, colscale, x0_subset, z_subset, dropout_p, rowscale_const, x0_numrows, has_residual, is_rms_norm) + +def dropout_add_ln_parallel_residual_fwd(input, gamma0, beta0, gamma1, beta1, dropout_p, epsilon, gen, residual_in_fp32, is_rms_norm): + return ops.dropout_add_ln_parallel_residual_fwd(input, gamma0, beta0, gamma1, beta1, dropout_p, epsilon, gen, residual_in_fp32, is_rms_norm) + +def dropout_add_ln_parallel_residual_bwd(dz0, dz1, dx, x, mu, rsigma, gamma0, gamma1, dropout_p, has_x1, has_residual, is_rms_norm): + return ops.dropout_add_ln_parallel_residual_bwd(dz0, dz1, dx, x, mu, rsigma, gamma0, gamma1, dropout_p, has_x1, has_residual, is_rms_norm) + +__all__ = [ + "layers", + "dropout_add_ln_fwd", + "dropout_add_ln_bwd", + "dropout_add_ln_parallel_residual_fwd", + "dropout_add_ln_parallel_residual_bwd", +] \ No newline at end of file diff --git a/build/torch211-cxx11-cu130-x86_64-linux/_layer_norm_cuda_73ccd0c.abi3.so b/build/torch211-cxx11-cu130-x86_64-linux/_layer_norm_cuda_73ccd0c.abi3.so new file mode 100644 index 0000000000000000000000000000000000000000..2160fa07368fe1f4ae3c3df7a6ccda7d300ba013 --- /dev/null +++ b/build/torch211-cxx11-cu130-x86_64-linux/_layer_norm_cuda_73ccd0c.abi3.so @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:9f30404419b58ef0f8bf5176c0d67a176837b70d853f59d7cc326d056e9ab2ee +size 1238395232 diff --git 
a/build/torch211-cxx11-cu130-x86_64-linux/_ops.py b/build/torch211-cxx11-cu130-x86_64-linux/_ops.py new file mode 100644 index 0000000000000000000000000000000000000000..ac84e05d9b8d22d65b0113f991fac1560deb7503 --- /dev/null +++ b/build/torch211-cxx11-cu130-x86_64-linux/_ops.py @@ -0,0 +1,9 @@ +import torch +from . import _layer_norm_cuda_73ccd0c +ops = torch.ops._layer_norm_cuda_73ccd0c + +def add_op_namespace_prefix(op_name: str): + """ + Prefix op by namespace. + """ + return f"_layer_norm_cuda_73ccd0c::{op_name}" diff --git a/build/torch211-cxx11-cu130-x86_64-linux/layer_norm/__init__.py b/build/torch211-cxx11-cu130-x86_64-linux/layer_norm/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..a9b2672c1cd85b74c1b3ded0fc0b2100e1aeac23 --- /dev/null +++ b/build/torch211-cxx11-cu130-x86_64-linux/layer_norm/__init__.py @@ -0,0 +1,26 @@ +import ctypes +import importlib.util +import sys +from pathlib import Path +from types import ModuleType + + +def _import_from_path(file_path: Path) -> ModuleType: + # We cannot use the module name as-is, after adding it to `sys.modules`, + # it would also be used for other imports. So, we make a module name that + # depends on the path for it to be unique using the hex-encoded hash of + # the path. 
+ path_hash = "{:x}".format(ctypes.c_size_t(hash(file_path.absolute())).value) + module_name = path_hash + spec = importlib.util.spec_from_file_location(module_name, file_path) + if spec is None: + raise ImportError(f"Cannot load spec for {module_name} from {file_path}") + module = importlib.util.module_from_spec(spec) + if module is None: + raise ImportError(f"Cannot load module {module_name} from spec") + sys.modules[module_name] = module + spec.loader.exec_module(module) # type: ignore + return module + + +globals().update(vars(_import_from_path(Path(__file__).parent.parent / "__init__.py"))) diff --git a/build/torch211-cxx11-cu130-x86_64-linux/layers.py b/build/torch211-cxx11-cu130-x86_64-linux/layers.py new file mode 100644 index 0000000000000000000000000000000000000000..7ed883f42ead452f8b60f498ec11302c53d3cf74 --- /dev/null +++ b/build/torch211-cxx11-cu130-x86_64-linux/layers.py @@ -0,0 +1,51 @@ +import torch +import torch.nn as nn + +from ._ops import ops + + +class LayerNorm(nn.Module): + weight: torch.Tensor + variance_epsilon: float + + def forward(self, hidden_states: torch.Tensor) -> torch.Tensor: + output = ops.dropout_add_ln_fwd( + hidden_states.view(-1, hidden_states.shape[-1]), + gamma = self.weight, + beta = None, + rowscale = None, + colscale = None, + x0_subset = None, + z_subset = None, + dropout_p = 0, + epsilon = self.variance_epsilon, + rowscale_const = 1.0, + z_numrows = hidden_states.shape[1], + gen = None, + residual_in_fp32 = False, + is_rms_norm = False, + ) + return output[0].view(hidden_states.shape) + +class LlamaRMSNorm(nn.Module): + weight: torch.Tensor + variance_epsilon: float + + def forward(self, hidden_states: torch.Tensor) -> torch.Tensor: + output = ops.dropout_add_ln_fwd( + hidden_states.view(-1, hidden_states.shape[-1]), + gamma = self.weight, + beta = None, + rowscale = None, + colscale = None, + x0_subset = None, + z_subset = None, + dropout_p = 0, + epsilon = self.variance_epsilon, + rowscale_const = 1.0, + z_numrows = 
hidden_states.shape[1], + gen = None, + residual_in_fp32 = False, + is_rms_norm = True, + ) + return output[0].view(hidden_states.shape) \ No newline at end of file diff --git a/build/torch211-cxx11-cu130-x86_64-linux/metadata.json b/build/torch211-cxx11-cu130-x86_64-linux/metadata.json new file mode 100644 index 0000000000000000000000000000000000000000..75fccb89550243db092fbc5d218aa4b497699bed --- /dev/null +++ b/build/torch211-cxx11-cu130-x86_64-linux/metadata.json @@ -0,0 +1,17 @@ +{ + "name": "layer-norm", + "id": "_layer_norm_cuda_73ccd0c", + "version": 1, + "license": "BSD-3-Clause", + "python-depends": [], + "backend": { + "type": "cuda", + "archs": [ + "10.0", + "12.0", + "8.0", + "8.9", + "9.0" + ] + } +} diff --git a/build/torch212-cxx11-cu126-aarch64-linux/__init__.py b/build/torch212-cxx11-cu126-aarch64-linux/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..946160f16b9dc91fefeea037fb7ac84fd6afd802 --- /dev/null +++ b/build/torch212-cxx11-cu126-aarch64-linux/__init__.py @@ -0,0 +1,26 @@ +import torch +import torch.nn as nn + +from ._ops import ops + +from . 
import layers + +def dropout_add_ln_fwd(input, gamma, beta, rowscale, colscale, x0_subset, z_subset, dropout_p, epsilon, rowscale_const, z_numrows, gen, residual_in_fp32, is_rms_norm): + return ops.dropout_add_ln_fwd(input, gamma, beta, rowscale, colscale, x0_subset, z_subset, dropout_p, epsilon, rowscale_const, z_numrows, gen, residual_in_fp32, is_rms_norm) + +def dropout_add_ln_bwd(dz, dx, x, mu, rsigma, gamma, rowscale, colscale, x0_subset, z_subset, dropout_p, rowscale_const, x0_numrows, has_residual, is_rms_norm): + return ops.dropout_add_ln_bwd(dz, dx, x, mu, rsigma, gamma, rowscale, colscale, x0_subset, z_subset, dropout_p, rowscale_const, x0_numrows, has_residual, is_rms_norm) + +def dropout_add_ln_parallel_residual_fwd(input, gamma0, beta0, gamma1, beta1, dropout_p, epsilon, gen, residual_in_fp32, is_rms_norm): + return ops.dropout_add_ln_parallel_residual_fwd(input, gamma0, beta0, gamma1, beta1, dropout_p, epsilon, gen, residual_in_fp32, is_rms_norm) + +def dropout_add_ln_parallel_residual_bwd(dz0, dz1, dx, x, mu, rsigma, gamma0, gamma1, dropout_p, has_x1, has_residual, is_rms_norm): + return ops.dropout_add_ln_parallel_residual_bwd(dz0, dz1, dx, x, mu, rsigma, gamma0, gamma1, dropout_p, has_x1, has_residual, is_rms_norm) + +__all__ = [ + "layers", + "dropout_add_ln_fwd", + "dropout_add_ln_bwd", + "dropout_add_ln_parallel_residual_fwd", + "dropout_add_ln_parallel_residual_bwd", +] \ No newline at end of file diff --git a/build/torch212-cxx11-cu126-aarch64-linux/_layer_norm_cuda_73ccd0c.abi3.so b/build/torch212-cxx11-cu126-aarch64-linux/_layer_norm_cuda_73ccd0c.abi3.so new file mode 100644 index 0000000000000000000000000000000000000000..2cd9a2db9a694cb40de7bb39ddd389e7658fd166 --- /dev/null +++ b/build/torch212-cxx11-cu126-aarch64-linux/_layer_norm_cuda_73ccd0c.abi3.so @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:4053b1e60f9aa2d0c36871fb49638a2644dee850719716382159abf3749e997e +size 711709952 diff --git 
a/build/torch212-cxx11-cu126-aarch64-linux/_ops.py b/build/torch212-cxx11-cu126-aarch64-linux/_ops.py new file mode 100644 index 0000000000000000000000000000000000000000..ac84e05d9b8d22d65b0113f991fac1560deb7503 --- /dev/null +++ b/build/torch212-cxx11-cu126-aarch64-linux/_ops.py @@ -0,0 +1,9 @@ +import torch +from . import _layer_norm_cuda_73ccd0c +ops = torch.ops._layer_norm_cuda_73ccd0c + +def add_op_namespace_prefix(op_name: str): + """ + Prefix op by namespace. + """ + return f"_layer_norm_cuda_73ccd0c::{op_name}" diff --git a/build/torch212-cxx11-cu126-aarch64-linux/layer_norm/__init__.py b/build/torch212-cxx11-cu126-aarch64-linux/layer_norm/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..a9b2672c1cd85b74c1b3ded0fc0b2100e1aeac23 --- /dev/null +++ b/build/torch212-cxx11-cu126-aarch64-linux/layer_norm/__init__.py @@ -0,0 +1,26 @@ +import ctypes +import importlib.util +import sys +from pathlib import Path +from types import ModuleType + + +def _import_from_path(file_path: Path) -> ModuleType: + # We cannot use the module name as-is, after adding it to `sys.modules`, + # it would also be used for other imports. So, we make a module name that + # depends on the path for it to be unique using the hex-encoded hash of + # the path. 
+ path_hash = "{:x}".format(ctypes.c_size_t(hash(file_path.absolute())).value) + module_name = path_hash + spec = importlib.util.spec_from_file_location(module_name, file_path) + if spec is None: + raise ImportError(f"Cannot load spec for {module_name} from {file_path}") + module = importlib.util.module_from_spec(spec) + if module is None: + raise ImportError(f"Cannot load module {module_name} from spec") + sys.modules[module_name] = module + spec.loader.exec_module(module) # type: ignore + return module + + +globals().update(vars(_import_from_path(Path(__file__).parent.parent / "__init__.py"))) diff --git a/build/torch212-cxx11-cu126-aarch64-linux/layers.py b/build/torch212-cxx11-cu126-aarch64-linux/layers.py new file mode 100644 index 0000000000000000000000000000000000000000..7ed883f42ead452f8b60f498ec11302c53d3cf74 --- /dev/null +++ b/build/torch212-cxx11-cu126-aarch64-linux/layers.py @@ -0,0 +1,51 @@ +import torch +import torch.nn as nn + +from ._ops import ops + + +class LayerNorm(nn.Module): + weight: torch.Tensor + variance_epsilon: float + + def forward(self, hidden_states: torch.Tensor) -> torch.Tensor: + output = ops.dropout_add_ln_fwd( + hidden_states.view(-1, hidden_states.shape[-1]), + gamma = self.weight, + beta = None, + rowscale = None, + colscale = None, + x0_subset = None, + z_subset = None, + dropout_p = 0, + epsilon = self.variance_epsilon, + rowscale_const = 1.0, + z_numrows = hidden_states.shape[1], + gen = None, + residual_in_fp32 = False, + is_rms_norm = False, + ) + return output[0].view(hidden_states.shape) + +class LlamaRMSNorm(nn.Module): + weight: torch.Tensor + variance_epsilon: float + + def forward(self, hidden_states: torch.Tensor) -> torch.Tensor: + output = ops.dropout_add_ln_fwd( + hidden_states.view(-1, hidden_states.shape[-1]), + gamma = self.weight, + beta = None, + rowscale = None, + colscale = None, + x0_subset = None, + z_subset = None, + dropout_p = 0, + epsilon = self.variance_epsilon, + rowscale_const = 1.0, + z_numrows 
= hidden_states.shape[1], + gen = None, + residual_in_fp32 = False, + is_rms_norm = True, + ) + return output[0].view(hidden_states.shape) \ No newline at end of file diff --git a/build/torch212-cxx11-cu126-aarch64-linux/metadata.json b/build/torch212-cxx11-cu126-aarch64-linux/metadata.json new file mode 100644 index 0000000000000000000000000000000000000000..e16b07bc42edabbfb0c8f5e9a87013306d25ea4f --- /dev/null +++ b/build/torch212-cxx11-cu126-aarch64-linux/metadata.json @@ -0,0 +1,15 @@ +{ + "name": "layer-norm", + "id": "_layer_norm_cuda_73ccd0c", + "version": 1, + "license": "BSD-3-Clause", + "python-depends": [], + "backend": { + "type": "cuda", + "archs": [ + "8.0", + "8.9", + "9.0" + ] + } +} diff --git a/build/torch212-cxx11-cu126-x86_64-linux/__init__.py b/build/torch212-cxx11-cu126-x86_64-linux/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..946160f16b9dc91fefeea037fb7ac84fd6afd802 --- /dev/null +++ b/build/torch212-cxx11-cu126-x86_64-linux/__init__.py @@ -0,0 +1,26 @@ +import torch +import torch.nn as nn + +from ._ops import ops + +from . 
import layers + +def dropout_add_ln_fwd(input, gamma, beta, rowscale, colscale, x0_subset, z_subset, dropout_p, epsilon, rowscale_const, z_numrows, gen, residual_in_fp32, is_rms_norm): + return ops.dropout_add_ln_fwd(input, gamma, beta, rowscale, colscale, x0_subset, z_subset, dropout_p, epsilon, rowscale_const, z_numrows, gen, residual_in_fp32, is_rms_norm) + +def dropout_add_ln_bwd(dz, dx, x, mu, rsigma, gamma, rowscale, colscale, x0_subset, z_subset, dropout_p, rowscale_const, x0_numrows, has_residual, is_rms_norm): + return ops.dropout_add_ln_bwd(dz, dx, x, mu, rsigma, gamma, rowscale, colscale, x0_subset, z_subset, dropout_p, rowscale_const, x0_numrows, has_residual, is_rms_norm) + +def dropout_add_ln_parallel_residual_fwd(input, gamma0, beta0, gamma1, beta1, dropout_p, epsilon, gen, residual_in_fp32, is_rms_norm): + return ops.dropout_add_ln_parallel_residual_fwd(input, gamma0, beta0, gamma1, beta1, dropout_p, epsilon, gen, residual_in_fp32, is_rms_norm) + +def dropout_add_ln_parallel_residual_bwd(dz0, dz1, dx, x, mu, rsigma, gamma0, gamma1, dropout_p, has_x1, has_residual, is_rms_norm): + return ops.dropout_add_ln_parallel_residual_bwd(dz0, dz1, dx, x, mu, rsigma, gamma0, gamma1, dropout_p, has_x1, has_residual, is_rms_norm) + +__all__ = [ + "layers", + "dropout_add_ln_fwd", + "dropout_add_ln_bwd", + "dropout_add_ln_parallel_residual_fwd", + "dropout_add_ln_parallel_residual_bwd", +] \ No newline at end of file diff --git a/build/torch212-cxx11-cu126-x86_64-linux/_layer_norm_cuda_73ccd0c.abi3.so b/build/torch212-cxx11-cu126-x86_64-linux/_layer_norm_cuda_73ccd0c.abi3.so new file mode 100644 index 0000000000000000000000000000000000000000..ea0a8d9ca269230bd0ddf426e0e847014ce2c6c8 --- /dev/null +++ b/build/torch212-cxx11-cu126-x86_64-linux/_layer_norm_cuda_73ccd0c.abi3.so @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:c5e4fbafaa6e30159f55100f8fdf7fdcc4a33af09da5345b2327eb01a37b2ad7 +size 712088360 diff --git 
a/build/torch212-cxx11-cu126-x86_64-linux/_ops.py b/build/torch212-cxx11-cu126-x86_64-linux/_ops.py new file mode 100644 index 0000000000000000000000000000000000000000..ac84e05d9b8d22d65b0113f991fac1560deb7503 --- /dev/null +++ b/build/torch212-cxx11-cu126-x86_64-linux/_ops.py @@ -0,0 +1,9 @@ +import torch +from . import _layer_norm_cuda_73ccd0c +ops = torch.ops._layer_norm_cuda_73ccd0c + +def add_op_namespace_prefix(op_name: str): + """ + Prefix op by namespace. + """ + return f"_layer_norm_cuda_73ccd0c::{op_name}" diff --git a/build/torch212-cxx11-cu126-x86_64-linux/layer_norm/__init__.py b/build/torch212-cxx11-cu126-x86_64-linux/layer_norm/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..a9b2672c1cd85b74c1b3ded0fc0b2100e1aeac23 --- /dev/null +++ b/build/torch212-cxx11-cu126-x86_64-linux/layer_norm/__init__.py @@ -0,0 +1,26 @@ +import ctypes +import importlib.util +import sys +from pathlib import Path +from types import ModuleType + + +def _import_from_path(file_path: Path) -> ModuleType: + # We cannot use the module name as-is, after adding it to `sys.modules`, + # it would also be used for other imports. So, we make a module name that + # depends on the path for it to be unique using the hex-encoded hash of + # the path. 
+ path_hash = "{:x}".format(ctypes.c_size_t(hash(file_path.absolute())).value) + module_name = path_hash + spec = importlib.util.spec_from_file_location(module_name, file_path) + if spec is None: + raise ImportError(f"Cannot load spec for {module_name} from {file_path}") + module = importlib.util.module_from_spec(spec) + if module is None: + raise ImportError(f"Cannot load module {module_name} from spec") + sys.modules[module_name] = module + spec.loader.exec_module(module) # type: ignore + return module + + +globals().update(vars(_import_from_path(Path(__file__).parent.parent / "__init__.py"))) diff --git a/build/torch212-cxx11-cu126-x86_64-linux/layers.py b/build/torch212-cxx11-cu126-x86_64-linux/layers.py new file mode 100644 index 0000000000000000000000000000000000000000..7ed883f42ead452f8b60f498ec11302c53d3cf74 --- /dev/null +++ b/build/torch212-cxx11-cu126-x86_64-linux/layers.py @@ -0,0 +1,51 @@ +import torch +import torch.nn as nn + +from ._ops import ops + + +class LayerNorm(nn.Module): + weight: torch.Tensor + variance_epsilon: float + + def forward(self, hidden_states: torch.Tensor) -> torch.Tensor: + output = ops.dropout_add_ln_fwd( + hidden_states.view(-1, hidden_states.shape[-1]), + gamma = self.weight, + beta = None, + rowscale = None, + colscale = None, + x0_subset = None, + z_subset = None, + dropout_p = 0, + epsilon = self.variance_epsilon, + rowscale_const = 1.0, + z_numrows = hidden_states.shape[1], + gen = None, + residual_in_fp32 = False, + is_rms_norm = False, + ) + return output[0].view(hidden_states.shape) + +class LlamaRMSNorm(nn.Module): + weight: torch.Tensor + variance_epsilon: float + + def forward(self, hidden_states: torch.Tensor) -> torch.Tensor: + output = ops.dropout_add_ln_fwd( + hidden_states.view(-1, hidden_states.shape[-1]), + gamma = self.weight, + beta = None, + rowscale = None, + colscale = None, + x0_subset = None, + z_subset = None, + dropout_p = 0, + epsilon = self.variance_epsilon, + rowscale_const = 1.0, + z_numrows = 
hidden_states.shape[1], + gen = None, + residual_in_fp32 = False, + is_rms_norm = True, + ) + return output[0].view(hidden_states.shape) \ No newline at end of file diff --git a/build/torch212-cxx11-cu126-x86_64-linux/metadata.json b/build/torch212-cxx11-cu126-x86_64-linux/metadata.json new file mode 100644 index 0000000000000000000000000000000000000000..e16b07bc42edabbfb0c8f5e9a87013306d25ea4f --- /dev/null +++ b/build/torch212-cxx11-cu126-x86_64-linux/metadata.json @@ -0,0 +1,15 @@ +{ + "name": "layer-norm", + "id": "_layer_norm_cuda_73ccd0c", + "version": 1, + "license": "BSD-3-Clause", + "python-depends": [], + "backend": { + "type": "cuda", + "archs": [ + "8.0", + "8.9", + "9.0" + ] + } +} diff --git a/build/torch212-cxx11-cu130-aarch64-linux/__init__.py b/build/torch212-cxx11-cu130-aarch64-linux/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..946160f16b9dc91fefeea037fb7ac84fd6afd802 --- /dev/null +++ b/build/torch212-cxx11-cu130-aarch64-linux/__init__.py @@ -0,0 +1,26 @@ +import torch +import torch.nn as nn + +from ._ops import ops + +from . 
import layers + +def dropout_add_ln_fwd(input, gamma, beta, rowscale, colscale, x0_subset, z_subset, dropout_p, epsilon, rowscale_const, z_numrows, gen, residual_in_fp32, is_rms_norm): + return ops.dropout_add_ln_fwd(input, gamma, beta, rowscale, colscale, x0_subset, z_subset, dropout_p, epsilon, rowscale_const, z_numrows, gen, residual_in_fp32, is_rms_norm) + +def dropout_add_ln_bwd(dz, dx, x, mu, rsigma, gamma, rowscale, colscale, x0_subset, z_subset, dropout_p, rowscale_const, x0_numrows, has_residual, is_rms_norm): + return ops.dropout_add_ln_bwd(dz, dx, x, mu, rsigma, gamma, rowscale, colscale, x0_subset, z_subset, dropout_p, rowscale_const, x0_numrows, has_residual, is_rms_norm) + +def dropout_add_ln_parallel_residual_fwd(input, gamma0, beta0, gamma1, beta1, dropout_p, epsilon, gen, residual_in_fp32, is_rms_norm): + return ops.dropout_add_ln_parallel_residual_fwd(input, gamma0, beta0, gamma1, beta1, dropout_p, epsilon, gen, residual_in_fp32, is_rms_norm) + +def dropout_add_ln_parallel_residual_bwd(dz0, dz1, dx, x, mu, rsigma, gamma0, gamma1, dropout_p, has_x1, has_residual, is_rms_norm): + return ops.dropout_add_ln_parallel_residual_bwd(dz0, dz1, dx, x, mu, rsigma, gamma0, gamma1, dropout_p, has_x1, has_residual, is_rms_norm) + +__all__ = [ + "layers", + "dropout_add_ln_fwd", + "dropout_add_ln_bwd", + "dropout_add_ln_parallel_residual_fwd", + "dropout_add_ln_parallel_residual_bwd", +] \ No newline at end of file diff --git a/build/torch212-cxx11-cu130-aarch64-linux/_layer_norm_cuda_73ccd0c.abi3.so b/build/torch212-cxx11-cu130-aarch64-linux/_layer_norm_cuda_73ccd0c.abi3.so new file mode 100644 index 0000000000000000000000000000000000000000..fee0bcab5df25fac8d850431b184ec1129a7c196 --- /dev/null +++ b/build/torch212-cxx11-cu130-aarch64-linux/_layer_norm_cuda_73ccd0c.abi3.so @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:df4db4bae71fab28e0e88250091baaad4b1359ce1cf5f74afb4eaaaf1b8feb03 +size 1235993656 diff --git 
a/build/torch212-cxx11-cu130-aarch64-linux/_ops.py b/build/torch212-cxx11-cu130-aarch64-linux/_ops.py new file mode 100644 index 0000000000000000000000000000000000000000..ac84e05d9b8d22d65b0113f991fac1560deb7503 --- /dev/null +++ b/build/torch212-cxx11-cu130-aarch64-linux/_ops.py @@ -0,0 +1,9 @@ +import torch +from . import _layer_norm_cuda_73ccd0c +ops = torch.ops._layer_norm_cuda_73ccd0c + +def add_op_namespace_prefix(op_name: str): + """ + Prefix op by namespace. + """ + return f"_layer_norm_cuda_73ccd0c::{op_name}" diff --git a/build/torch212-cxx11-cu130-aarch64-linux/layer_norm/__init__.py b/build/torch212-cxx11-cu130-aarch64-linux/layer_norm/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..a9b2672c1cd85b74c1b3ded0fc0b2100e1aeac23 --- /dev/null +++ b/build/torch212-cxx11-cu130-aarch64-linux/layer_norm/__init__.py @@ -0,0 +1,26 @@ +import ctypes +import importlib.util +import sys +from pathlib import Path +from types import ModuleType + + +def _import_from_path(file_path: Path) -> ModuleType: + # We cannot use the module name as-is, after adding it to `sys.modules`, + # it would also be used for other imports. So, we make a module name that + # depends on the path for it to be unique using the hex-encoded hash of + # the path. 
+ path_hash = "{:x}".format(ctypes.c_size_t(hash(file_path.absolute())).value) + module_name = path_hash + spec = importlib.util.spec_from_file_location(module_name, file_path) + if spec is None: + raise ImportError(f"Cannot load spec for {module_name} from {file_path}") + module = importlib.util.module_from_spec(spec) + if module is None: + raise ImportError(f"Cannot load module {module_name} from spec") + sys.modules[module_name] = module + spec.loader.exec_module(module) # type: ignore + return module + + +globals().update(vars(_import_from_path(Path(__file__).parent.parent / "__init__.py"))) diff --git a/build/torch212-cxx11-cu130-aarch64-linux/layers.py b/build/torch212-cxx11-cu130-aarch64-linux/layers.py new file mode 100644 index 0000000000000000000000000000000000000000..7ed883f42ead452f8b60f498ec11302c53d3cf74 --- /dev/null +++ b/build/torch212-cxx11-cu130-aarch64-linux/layers.py @@ -0,0 +1,51 @@ +import torch +import torch.nn as nn + +from ._ops import ops + + +class LayerNorm(nn.Module): + weight: torch.Tensor + variance_epsilon: float + + def forward(self, hidden_states: torch.Tensor) -> torch.Tensor: + output = ops.dropout_add_ln_fwd( + hidden_states.view(-1, hidden_states.shape[-1]), + gamma = self.weight, + beta = None, + rowscale = None, + colscale = None, + x0_subset = None, + z_subset = None, + dropout_p = 0, + epsilon = self.variance_epsilon, + rowscale_const = 1.0, + z_numrows = hidden_states.shape[1], + gen = None, + residual_in_fp32 = False, + is_rms_norm = False, + ) + return output[0].view(hidden_states.shape) + +class LlamaRMSNorm(nn.Module): + weight: torch.Tensor + variance_epsilon: float + + def forward(self, hidden_states: torch.Tensor) -> torch.Tensor: + output = ops.dropout_add_ln_fwd( + hidden_states.view(-1, hidden_states.shape[-1]), + gamma = self.weight, + beta = None, + rowscale = None, + colscale = None, + x0_subset = None, + z_subset = None, + dropout_p = 0, + epsilon = self.variance_epsilon, + rowscale_const = 1.0, + z_numrows 
= hidden_states.shape[1], + gen = None, + residual_in_fp32 = False, + is_rms_norm = True, + ) + return output[0].view(hidden_states.shape) \ No newline at end of file diff --git a/build/torch212-cxx11-cu130-aarch64-linux/metadata.json b/build/torch212-cxx11-cu130-aarch64-linux/metadata.json new file mode 100644 index 0000000000000000000000000000000000000000..75fccb89550243db092fbc5d218aa4b497699bed --- /dev/null +++ b/build/torch212-cxx11-cu130-aarch64-linux/metadata.json @@ -0,0 +1,17 @@ +{ + "name": "layer-norm", + "id": "_layer_norm_cuda_73ccd0c", + "version": 1, + "license": "BSD-3-Clause", + "python-depends": [], + "backend": { + "type": "cuda", + "archs": [ + "10.0", + "12.0", + "8.0", + "8.9", + "9.0" + ] + } +} diff --git a/build/torch212-cxx11-cu130-x86_64-linux/__init__.py b/build/torch212-cxx11-cu130-x86_64-linux/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..946160f16b9dc91fefeea037fb7ac84fd6afd802 --- /dev/null +++ b/build/torch212-cxx11-cu130-x86_64-linux/__init__.py @@ -0,0 +1,26 @@ +import torch +import torch.nn as nn + +from ._ops import ops + +from . 
import layers + +def dropout_add_ln_fwd(input, gamma, beta, rowscale, colscale, x0_subset, z_subset, dropout_p, epsilon, rowscale_const, z_numrows, gen, residual_in_fp32, is_rms_norm): + return ops.dropout_add_ln_fwd(input, gamma, beta, rowscale, colscale, x0_subset, z_subset, dropout_p, epsilon, rowscale_const, z_numrows, gen, residual_in_fp32, is_rms_norm) + +def dropout_add_ln_bwd(dz, dx, x, mu, rsigma, gamma, rowscale, colscale, x0_subset, z_subset, dropout_p, rowscale_const, x0_numrows, has_residual, is_rms_norm): + return ops.dropout_add_ln_bwd(dz, dx, x, mu, rsigma, gamma, rowscale, colscale, x0_subset, z_subset, dropout_p, rowscale_const, x0_numrows, has_residual, is_rms_norm) + +def dropout_add_ln_parallel_residual_fwd(input, gamma0, beta0, gamma1, beta1, dropout_p, epsilon, gen, residual_in_fp32, is_rms_norm): + return ops.dropout_add_ln_parallel_residual_fwd(input, gamma0, beta0, gamma1, beta1, dropout_p, epsilon, gen, residual_in_fp32, is_rms_norm) + +def dropout_add_ln_parallel_residual_bwd(dz0, dz1, dx, x, mu, rsigma, gamma0, gamma1, dropout_p, has_x1, has_residual, is_rms_norm): + return ops.dropout_add_ln_parallel_residual_bwd(dz0, dz1, dx, x, mu, rsigma, gamma0, gamma1, dropout_p, has_x1, has_residual, is_rms_norm) + +__all__ = [ + "layers", + "dropout_add_ln_fwd", + "dropout_add_ln_bwd", + "dropout_add_ln_parallel_residual_fwd", + "dropout_add_ln_parallel_residual_bwd", +] \ No newline at end of file diff --git a/build/torch212-cxx11-cu130-x86_64-linux/_layer_norm_cuda_73ccd0c.abi3.so b/build/torch212-cxx11-cu130-x86_64-linux/_layer_norm_cuda_73ccd0c.abi3.so new file mode 100644 index 0000000000000000000000000000000000000000..d9f9dffb6fa623d56811d05189c2ab1da2c615dd --- /dev/null +++ b/build/torch212-cxx11-cu130-x86_64-linux/_layer_norm_cuda_73ccd0c.abi3.so @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:da280ce065bfa6679da65d46168975cbb0a52d90c9d175ac77deb228ca20756a +size 1238392592 diff --git 
a/build/torch212-cxx11-cu130-x86_64-linux/_ops.py b/build/torch212-cxx11-cu130-x86_64-linux/_ops.py new file mode 100644 index 0000000000000000000000000000000000000000..ac84e05d9b8d22d65b0113f991fac1560deb7503 --- /dev/null +++ b/build/torch212-cxx11-cu130-x86_64-linux/_ops.py @@ -0,0 +1,9 @@ +import torch +from . import _layer_norm_cuda_73ccd0c +ops = torch.ops._layer_norm_cuda_73ccd0c + +def add_op_namespace_prefix(op_name: str): + """ + Prefix op by namespace. + """ + return f"_layer_norm_cuda_73ccd0c::{op_name}" diff --git a/build/torch212-cxx11-cu130-x86_64-linux/layer_norm/__init__.py b/build/torch212-cxx11-cu130-x86_64-linux/layer_norm/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..a9b2672c1cd85b74c1b3ded0fc0b2100e1aeac23 --- /dev/null +++ b/build/torch212-cxx11-cu130-x86_64-linux/layer_norm/__init__.py @@ -0,0 +1,26 @@ +import ctypes +import importlib.util +import sys +from pathlib import Path +from types import ModuleType + + +def _import_from_path(file_path: Path) -> ModuleType: + # We cannot use the module name as-is, after adding it to `sys.modules`, + # it would also be used for other imports. So, we make a module name that + # depends on the path for it to be unique using the hex-encoded hash of + # the path. 
+ path_hash = "{:x}".format(ctypes.c_size_t(hash(file_path.absolute())).value) + module_name = path_hash + spec = importlib.util.spec_from_file_location(module_name, file_path) + if spec is None: + raise ImportError(f"Cannot load spec for {module_name} from {file_path}") + module = importlib.util.module_from_spec(spec) + if module is None: + raise ImportError(f"Cannot load module {module_name} from spec") + sys.modules[module_name] = module + spec.loader.exec_module(module) # type: ignore + return module + + +globals().update(vars(_import_from_path(Path(__file__).parent.parent / "__init__.py"))) diff --git a/build/torch212-cxx11-cu130-x86_64-linux/layers.py b/build/torch212-cxx11-cu130-x86_64-linux/layers.py new file mode 100644 index 0000000000000000000000000000000000000000..7ed883f42ead452f8b60f498ec11302c53d3cf74 --- /dev/null +++ b/build/torch212-cxx11-cu130-x86_64-linux/layers.py @@ -0,0 +1,51 @@ +import torch +import torch.nn as nn + +from ._ops import ops + + +class LayerNorm(nn.Module): + weight: torch.Tensor + variance_epsilon: float + + def forward(self, hidden_states: torch.Tensor) -> torch.Tensor: + output = ops.dropout_add_ln_fwd( + hidden_states.view(-1, hidden_states.shape[-1]), + gamma = self.weight, + beta = None, + rowscale = None, + colscale = None, + x0_subset = None, + z_subset = None, + dropout_p = 0, + epsilon = self.variance_epsilon, + rowscale_const = 1.0, + z_numrows = hidden_states.shape[1], + gen = None, + residual_in_fp32 = False, + is_rms_norm = False, + ) + return output[0].view(hidden_states.shape) + +class LlamaRMSNorm(nn.Module): + weight: torch.Tensor + variance_epsilon: float + + def forward(self, hidden_states: torch.Tensor) -> torch.Tensor: + output = ops.dropout_add_ln_fwd( + hidden_states.view(-1, hidden_states.shape[-1]), + gamma = self.weight, + beta = None, + rowscale = None, + colscale = None, + x0_subset = None, + z_subset = None, + dropout_p = 0, + epsilon = self.variance_epsilon, + rowscale_const = 1.0, + z_numrows = 
hidden_states.shape[1], + gen = None, + residual_in_fp32 = False, + is_rms_norm = True, + ) + return output[0].view(hidden_states.shape) \ No newline at end of file diff --git a/build/torch212-cxx11-cu130-x86_64-linux/metadata.json b/build/torch212-cxx11-cu130-x86_64-linux/metadata.json new file mode 100644 index 0000000000000000000000000000000000000000..75fccb89550243db092fbc5d218aa4b497699bed --- /dev/null +++ b/build/torch212-cxx11-cu130-x86_64-linux/metadata.json @@ -0,0 +1,17 @@ +{ + "name": "layer-norm", + "id": "_layer_norm_cuda_73ccd0c", + "version": 1, + "license": "BSD-3-Clause", + "python-depends": [], + "backend": { + "type": "cuda", + "archs": [ + "10.0", + "12.0", + "8.0", + "8.9", + "9.0" + ] + } +} diff --git a/build/torch212-cxx11-cu132-aarch64-linux/__init__.py b/build/torch212-cxx11-cu132-aarch64-linux/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..946160f16b9dc91fefeea037fb7ac84fd6afd802 --- /dev/null +++ b/build/torch212-cxx11-cu132-aarch64-linux/__init__.py @@ -0,0 +1,26 @@ +import torch +import torch.nn as nn + +from ._ops import ops + +from . 
import layers + +def dropout_add_ln_fwd(input, gamma, beta, rowscale, colscale, x0_subset, z_subset, dropout_p, epsilon, rowscale_const, z_numrows, gen, residual_in_fp32, is_rms_norm): + return ops.dropout_add_ln_fwd(input, gamma, beta, rowscale, colscale, x0_subset, z_subset, dropout_p, epsilon, rowscale_const, z_numrows, gen, residual_in_fp32, is_rms_norm) + +def dropout_add_ln_bwd(dz, dx, x, mu, rsigma, gamma, rowscale, colscale, x0_subset, z_subset, dropout_p, rowscale_const, x0_numrows, has_residual, is_rms_norm): + return ops.dropout_add_ln_bwd(dz, dx, x, mu, rsigma, gamma, rowscale, colscale, x0_subset, z_subset, dropout_p, rowscale_const, x0_numrows, has_residual, is_rms_norm) + +def dropout_add_ln_parallel_residual_fwd(input, gamma0, beta0, gamma1, beta1, dropout_p, epsilon, gen, residual_in_fp32, is_rms_norm): + return ops.dropout_add_ln_parallel_residual_fwd(input, gamma0, beta0, gamma1, beta1, dropout_p, epsilon, gen, residual_in_fp32, is_rms_norm) + +def dropout_add_ln_parallel_residual_bwd(dz0, dz1, dx, x, mu, rsigma, gamma0, gamma1, dropout_p, has_x1, has_residual, is_rms_norm): + return ops.dropout_add_ln_parallel_residual_bwd(dz0, dz1, dx, x, mu, rsigma, gamma0, gamma1, dropout_p, has_x1, has_residual, is_rms_norm) + +__all__ = [ + "layers", + "dropout_add_ln_fwd", + "dropout_add_ln_bwd", + "dropout_add_ln_parallel_residual_fwd", + "dropout_add_ln_parallel_residual_bwd", +] \ No newline at end of file diff --git a/build/torch212-cxx11-cu132-aarch64-linux/_layer_norm_cuda_73ccd0c.abi3.so b/build/torch212-cxx11-cu132-aarch64-linux/_layer_norm_cuda_73ccd0c.abi3.so new file mode 100644 index 0000000000000000000000000000000000000000..c1b05b781254aaa11005173a50cfc99850ace6aa --- /dev/null +++ b/build/torch212-cxx11-cu132-aarch64-linux/_layer_norm_cuda_73ccd0c.abi3.so @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:9e3e2f8cb06b921f94b179521d5f773a41fc5817c5667d3a405cf111f04feaf2 +size 1219551712 diff --git 
a/build/torch212-cxx11-cu132-aarch64-linux/_ops.py b/build/torch212-cxx11-cu132-aarch64-linux/_ops.py new file mode 100644 index 0000000000000000000000000000000000000000..ac84e05d9b8d22d65b0113f991fac1560deb7503 --- /dev/null +++ b/build/torch212-cxx11-cu132-aarch64-linux/_ops.py @@ -0,0 +1,9 @@ +import torch +from . import _layer_norm_cuda_73ccd0c +ops = torch.ops._layer_norm_cuda_73ccd0c + +def add_op_namespace_prefix(op_name: str): + """ + Prefix op by namespace. + """ + return f"_layer_norm_cuda_73ccd0c::{op_name}" diff --git a/build/torch212-cxx11-cu132-aarch64-linux/layer_norm/__init__.py b/build/torch212-cxx11-cu132-aarch64-linux/layer_norm/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..a9b2672c1cd85b74c1b3ded0fc0b2100e1aeac23 --- /dev/null +++ b/build/torch212-cxx11-cu132-aarch64-linux/layer_norm/__init__.py @@ -0,0 +1,26 @@ +import ctypes +import importlib.util +import sys +from pathlib import Path +from types import ModuleType + + +def _import_from_path(file_path: Path) -> ModuleType: + # We cannot use the module name as-is, after adding it to `sys.modules`, + # it would also be used for other imports. So, we make a module name that + # depends on the path for it to be unique using the hex-encoded hash of + # the path. 
+ path_hash = "{:x}".format(ctypes.c_size_t(hash(file_path.absolute())).value) + module_name = path_hash + spec = importlib.util.spec_from_file_location(module_name, file_path) + if spec is None: + raise ImportError(f"Cannot load spec for {module_name} from {file_path}") + module = importlib.util.module_from_spec(spec) + if module is None: + raise ImportError(f"Cannot load module {module_name} from spec") + sys.modules[module_name] = module + spec.loader.exec_module(module) # type: ignore + return module + + +globals().update(vars(_import_from_path(Path(__file__).parent.parent / "__init__.py"))) diff --git a/build/torch212-cxx11-cu132-aarch64-linux/layers.py b/build/torch212-cxx11-cu132-aarch64-linux/layers.py new file mode 100644 index 0000000000000000000000000000000000000000..7ed883f42ead452f8b60f498ec11302c53d3cf74 --- /dev/null +++ b/build/torch212-cxx11-cu132-aarch64-linux/layers.py @@ -0,0 +1,51 @@ +import torch +import torch.nn as nn + +from ._ops import ops + + +class LayerNorm(nn.Module): + weight: torch.Tensor + variance_epsilon: float + + def forward(self, hidden_states: torch.Tensor) -> torch.Tensor: + output = ops.dropout_add_ln_fwd( + hidden_states.view(-1, hidden_states.shape[-1]), + gamma = self.weight, + beta = None, + rowscale = None, + colscale = None, + x0_subset = None, + z_subset = None, + dropout_p = 0, + epsilon = self.variance_epsilon, + rowscale_const = 1.0, + z_numrows = hidden_states.shape[1], + gen = None, + residual_in_fp32 = False, + is_rms_norm = False, + ) + return output[0].view(hidden_states.shape) + +class LlamaRMSNorm(nn.Module): + weight: torch.Tensor + variance_epsilon: float + + def forward(self, hidden_states: torch.Tensor) -> torch.Tensor: + output = ops.dropout_add_ln_fwd( + hidden_states.view(-1, hidden_states.shape[-1]), + gamma = self.weight, + beta = None, + rowscale = None, + colscale = None, + x0_subset = None, + z_subset = None, + dropout_p = 0, + epsilon = self.variance_epsilon, + rowscale_const = 1.0, + z_numrows 
= hidden_states.shape[1], + gen = None, + residual_in_fp32 = False, + is_rms_norm = True, + ) + return output[0].view(hidden_states.shape) \ No newline at end of file diff --git a/build/torch212-cxx11-cu132-aarch64-linux/metadata.json b/build/torch212-cxx11-cu132-aarch64-linux/metadata.json new file mode 100644 index 0000000000000000000000000000000000000000..75fccb89550243db092fbc5d218aa4b497699bed --- /dev/null +++ b/build/torch212-cxx11-cu132-aarch64-linux/metadata.json @@ -0,0 +1,17 @@ +{ + "name": "layer-norm", + "id": "_layer_norm_cuda_73ccd0c", + "version": 1, + "license": "BSD-3-Clause", + "python-depends": [], + "backend": { + "type": "cuda", + "archs": [ + "10.0", + "12.0", + "8.0", + "8.9", + "9.0" + ] + } +} diff --git a/build/torch212-cxx11-cu132-x86_64-linux/__init__.py b/build/torch212-cxx11-cu132-x86_64-linux/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..946160f16b9dc91fefeea037fb7ac84fd6afd802 --- /dev/null +++ b/build/torch212-cxx11-cu132-x86_64-linux/__init__.py @@ -0,0 +1,26 @@ +import torch +import torch.nn as nn + +from ._ops import ops + +from . 
import layers + +def dropout_add_ln_fwd(input, gamma, beta, rowscale, colscale, x0_subset, z_subset, dropout_p, epsilon, rowscale_const, z_numrows, gen, residual_in_fp32, is_rms_norm): + return ops.dropout_add_ln_fwd(input, gamma, beta, rowscale, colscale, x0_subset, z_subset, dropout_p, epsilon, rowscale_const, z_numrows, gen, residual_in_fp32, is_rms_norm) + +def dropout_add_ln_bwd(dz, dx, x, mu, rsigma, gamma, rowscale, colscale, x0_subset, z_subset, dropout_p, rowscale_const, x0_numrows, has_residual, is_rms_norm): + return ops.dropout_add_ln_bwd(dz, dx, x, mu, rsigma, gamma, rowscale, colscale, x0_subset, z_subset, dropout_p, rowscale_const, x0_numrows, has_residual, is_rms_norm) + +def dropout_add_ln_parallel_residual_fwd(input, gamma0, beta0, gamma1, beta1, dropout_p, epsilon, gen, residual_in_fp32, is_rms_norm): + return ops.dropout_add_ln_parallel_residual_fwd(input, gamma0, beta0, gamma1, beta1, dropout_p, epsilon, gen, residual_in_fp32, is_rms_norm) + +def dropout_add_ln_parallel_residual_bwd(dz0, dz1, dx, x, mu, rsigma, gamma0, gamma1, dropout_p, has_x1, has_residual, is_rms_norm): + return ops.dropout_add_ln_parallel_residual_bwd(dz0, dz1, dx, x, mu, rsigma, gamma0, gamma1, dropout_p, has_x1, has_residual, is_rms_norm) + +__all__ = [ + "layers", + "dropout_add_ln_fwd", + "dropout_add_ln_bwd", + "dropout_add_ln_parallel_residual_fwd", + "dropout_add_ln_parallel_residual_bwd", +] \ No newline at end of file diff --git a/build/torch212-cxx11-cu132-x86_64-linux/_layer_norm_cuda_73ccd0c.abi3.so b/build/torch212-cxx11-cu132-x86_64-linux/_layer_norm_cuda_73ccd0c.abi3.so new file mode 100644 index 0000000000000000000000000000000000000000..cc51370dadcc1f31c286cb6fea91e19096532579 --- /dev/null +++ b/build/torch212-cxx11-cu132-x86_64-linux/_layer_norm_cuda_73ccd0c.abi3.so @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:16f936b4a001754ef5d21ca0a7d2bfb2fa8a25f713fdd166f8b1a47e524c588d +size 1222008944 diff --git 
a/build/torch212-cxx11-cu132-x86_64-linux/_ops.py b/build/torch212-cxx11-cu132-x86_64-linux/_ops.py new file mode 100644 index 0000000000000000000000000000000000000000..ac84e05d9b8d22d65b0113f991fac1560deb7503 --- /dev/null +++ b/build/torch212-cxx11-cu132-x86_64-linux/_ops.py @@ -0,0 +1,9 @@ +import torch +from . import _layer_norm_cuda_73ccd0c +ops = torch.ops._layer_norm_cuda_73ccd0c + +def add_op_namespace_prefix(op_name: str): + """ + Prefix op by namespace. + """ + return f"_layer_norm_cuda_73ccd0c::{op_name}" diff --git a/build/torch212-cxx11-cu132-x86_64-linux/layer_norm/__init__.py b/build/torch212-cxx11-cu132-x86_64-linux/layer_norm/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..a9b2672c1cd85b74c1b3ded0fc0b2100e1aeac23 --- /dev/null +++ b/build/torch212-cxx11-cu132-x86_64-linux/layer_norm/__init__.py @@ -0,0 +1,26 @@ +import ctypes +import importlib.util +import sys +from pathlib import Path +from types import ModuleType + + +def _import_from_path(file_path: Path) -> ModuleType: + # We cannot use the module name as-is, after adding it to `sys.modules`, + # it would also be used for other imports. So, we make a module name that + # depends on the path for it to be unique using the hex-encoded hash of + # the path. 
+ path_hash = "{:x}".format(ctypes.c_size_t(hash(file_path.absolute())).value) + module_name = path_hash + spec = importlib.util.spec_from_file_location(module_name, file_path) + if spec is None: + raise ImportError(f"Cannot load spec for {module_name} from {file_path}") + module = importlib.util.module_from_spec(spec) + if module is None: + raise ImportError(f"Cannot load module {module_name} from spec") + sys.modules[module_name] = module + spec.loader.exec_module(module) # type: ignore + return module + + +globals().update(vars(_import_from_path(Path(__file__).parent.parent / "__init__.py"))) diff --git a/build/torch212-cxx11-cu132-x86_64-linux/layers.py b/build/torch212-cxx11-cu132-x86_64-linux/layers.py new file mode 100644 index 0000000000000000000000000000000000000000..7ed883f42ead452f8b60f498ec11302c53d3cf74 --- /dev/null +++ b/build/torch212-cxx11-cu132-x86_64-linux/layers.py @@ -0,0 +1,51 @@ +import torch +import torch.nn as nn + +from ._ops import ops + + +class LayerNorm(nn.Module): + weight: torch.Tensor + variance_epsilon: float + + def forward(self, hidden_states: torch.Tensor) -> torch.Tensor: + output = ops.dropout_add_ln_fwd( + hidden_states.view(-1, hidden_states.shape[-1]), + gamma = self.weight, + beta = None, + rowscale = None, + colscale = None, + x0_subset = None, + z_subset = None, + dropout_p = 0, + epsilon = self.variance_epsilon, + rowscale_const = 1.0, + z_numrows = hidden_states.shape[1], + gen = None, + residual_in_fp32 = False, + is_rms_norm = False, + ) + return output[0].view(hidden_states.shape) + +class LlamaRMSNorm(nn.Module): + weight: torch.Tensor + variance_epsilon: float + + def forward(self, hidden_states: torch.Tensor) -> torch.Tensor: + output = ops.dropout_add_ln_fwd( + hidden_states.view(-1, hidden_states.shape[-1]), + gamma = self.weight, + beta = None, + rowscale = None, + colscale = None, + x0_subset = None, + z_subset = None, + dropout_p = 0, + epsilon = self.variance_epsilon, + rowscale_const = 1.0, + z_numrows = 
hidden_states.shape[1], + gen = None, + residual_in_fp32 = False, + is_rms_norm = True, + ) + return output[0].view(hidden_states.shape) \ No newline at end of file diff --git a/build/torch212-cxx11-cu132-x86_64-linux/metadata.json b/build/torch212-cxx11-cu132-x86_64-linux/metadata.json new file mode 100644 index 0000000000000000000000000000000000000000..75fccb89550243db092fbc5d218aa4b497699bed --- /dev/null +++ b/build/torch212-cxx11-cu132-x86_64-linux/metadata.json @@ -0,0 +1,17 @@ +{ + "name": "layer-norm", + "id": "_layer_norm_cuda_73ccd0c", + "version": 1, + "license": "BSD-3-Clause", + "python-depends": [], + "backend": { + "type": "cuda", + "archs": [ + "10.0", + "12.0", + "8.0", + "8.9", + "9.0" + ] + } +} diff --git a/build/torch29-cxx11-cu128-aarch64-linux/__init__.py b/build/torch29-cxx11-cu128-aarch64-linux/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..0be8169e6c0323882915c55d98a444fce3832008 --- /dev/null +++ b/build/torch29-cxx11-cu128-aarch64-linux/__init__.py @@ -0,0 +1,61 @@ +"""torch 2.9 / cu12.8 build variant grafted from the odysseyml flash-attention fork. + +This variant wraps the pre-built ``dropout_layer_norm`` extension from the +``odyssey-fused-kernels`` wheel (tag ``odyssey-v2.8.3-fused-1``, built for +sm_80/90/100/120) because upstream kernels-community/layer-norm only provides +a cu129 build for torch 2.9. + +Unlike the kernels-community builds (which drop the ``residual`` argument), +the wheel extension keeps flash-attn's original signature with ``residual`` +as the second argument. We expose the upstream community signature so both +sources are interchangeable. +""" + +import importlib.util +from pathlib import Path + +# The extension's PyInit_* symbol is derived from the module name passed to +# the loader, so the spec name must exactly equal the .so module name. 
+_so_path = next(Path(__file__).parent.glob("dropout_layer_norm*.so")) +_spec = importlib.util.spec_from_file_location("dropout_layer_norm", _so_path) +_ext = importlib.util.module_from_spec(_spec) +_spec.loader.exec_module(_ext) + + +def dropout_add_ln_fwd( + input, gamma, beta, rowscale, colscale, x0_subset, z_subset, dropout_p, epsilon, + rowscale_const, z_numrows, gen, residual_in_fp32, is_rms_norm, +): + return _ext.dropout_add_ln_fwd( + input, None, gamma, beta, rowscale, colscale, x0_subset, z_subset, dropout_p, + epsilon, rowscale_const, z_numrows, gen, residual_in_fp32, is_rms_norm, + ) + + +def dropout_add_ln_bwd(*args, **kwargs): + # The wheel extension's bwd takes (dz, dx_, x, x0, dmask, ...), the community + # build takes (dz, dx, x, mu, ...). Positional pass-through would silently + # feed the wrong tensors, so refuse until backward is actually needed. + raise NotImplementedError("dropout_add_ln_bwd is not exposed by this build variant (inference only)") + + +def dropout_add_ln_parallel_residual_fwd( + input, gamma0, beta0, gamma1, beta1, dropout_p, epsilon, gen, residual_in_fp32, is_rms_norm, +): + return _ext.dropout_add_ln_parallel_residual_fwd( + input, None, gamma0, beta0, gamma1, beta1, dropout_p, epsilon, gen, residual_in_fp32, is_rms_norm + ) + + +def dropout_add_ln_parallel_residual_bwd(*args, **kwargs): + raise NotImplementedError( + "dropout_add_ln_parallel_residual_bwd is not exposed by this build variant (inference only)" + ) + + +__all__ = [ + "dropout_add_ln_fwd", + "dropout_add_ln_bwd", + "dropout_add_ln_parallel_residual_fwd", + "dropout_add_ln_parallel_residual_bwd", +] diff --git a/build/torch29-cxx11-cu128-aarch64-linux/dropout_layer_norm.cpython-312-aarch64-linux-gnu.so b/build/torch29-cxx11-cu128-aarch64-linux/dropout_layer_norm.cpython-312-aarch64-linux-gnu.so new file mode 100644 index 0000000000000000000000000000000000000000..62edd7481c5862739d771218a271569a471c6cc3 --- /dev/null +++ 
b/build/torch29-cxx11-cu128-aarch64-linux/dropout_layer_norm.cpython-312-aarch64-linux-gnu.so @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:cb431ebaceb7b12f0a5ad50eeca97d1aeaec16d13bbdedf807cae8a5dc18c2cb +size 1258734424 diff --git a/build/torch29-cxx11-cu128-aarch64-linux/layer_norm/__init__.py b/build/torch29-cxx11-cu128-aarch64-linux/layer_norm/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..a9b2672c1cd85b74c1b3ded0fc0b2100e1aeac23 --- /dev/null +++ b/build/torch29-cxx11-cu128-aarch64-linux/layer_norm/__init__.py @@ -0,0 +1,26 @@ +import ctypes +import importlib.util +import sys +from pathlib import Path +from types import ModuleType + + +def _import_from_path(file_path: Path) -> ModuleType: + # We cannot use the module name as-is, after adding it to `sys.modules`, + # it would also be used for other imports. So, we make a module name that + # depends on the path for it to be unique using the hex-encoded hash of + # the path. 
+ path_hash = "{:x}".format(ctypes.c_size_t(hash(file_path.absolute())).value) + module_name = path_hash + spec = importlib.util.spec_from_file_location(module_name, file_path) + if spec is None: + raise ImportError(f"Cannot load spec for {module_name} from {file_path}") + module = importlib.util.module_from_spec(spec) + if module is None: + raise ImportError(f"Cannot load module {module_name} from spec") + sys.modules[module_name] = module + spec.loader.exec_module(module) # type: ignore + return module + + +globals().update(vars(_import_from_path(Path(__file__).parent.parent / "__init__.py"))) diff --git a/build/torch29-cxx11-cu128-aarch64-linux/metadata.json b/build/torch29-cxx11-cu128-aarch64-linux/metadata.json new file mode 100644 index 0000000000000000000000000000000000000000..fbd5000813293fa09c4e8f564d6565075e25f140 --- /dev/null +++ b/build/torch29-cxx11-cu128-aarch64-linux/metadata.json @@ -0,0 +1,14 @@ +{ + "version": 1, + "license": "BSD-3-Clause", + "python-depends": [], + "backend": { + "type": "cuda", + "archs": [ + "8.0", + "9.0", + "10.0", + "12.0" + ] + } +} diff --git a/build/torch29-cxx11-cu128-x86_64-linux/__init__.py b/build/torch29-cxx11-cu128-x86_64-linux/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..0be8169e6c0323882915c55d98a444fce3832008 --- /dev/null +++ b/build/torch29-cxx11-cu128-x86_64-linux/__init__.py @@ -0,0 +1,61 @@ +"""torch 2.9 / cu12.8 build variant grafted from the odysseyml flash-attention fork. + +This variant wraps the pre-built ``dropout_layer_norm`` extension from the +``odyssey-fused-kernels`` wheel (tag ``odyssey-v2.8.3-fused-1``, built for +sm_80/90/100/120) because upstream kernels-community/layer-norm only provides +a cu129 build for torch 2.9. + +Unlike the kernels-community builds (which drop the ``residual`` argument), +the wheel extension keeps flash-attn's original signature with ``residual`` +as the second argument. 
We expose the upstream community signature so both +sources are interchangeable. +""" + +import importlib.util +from pathlib import Path + +# The extension's PyInit_* symbol is derived from the module name passed to +# the loader, so the spec name must exactly equal the .so module name. +_so_path = next(Path(__file__).parent.glob("dropout_layer_norm*.so")) +_spec = importlib.util.spec_from_file_location("dropout_layer_norm", _so_path) +_ext = importlib.util.module_from_spec(_spec) +_spec.loader.exec_module(_ext) + + +def dropout_add_ln_fwd( + input, gamma, beta, rowscale, colscale, x0_subset, z_subset, dropout_p, epsilon, + rowscale_const, z_numrows, gen, residual_in_fp32, is_rms_norm, +): + return _ext.dropout_add_ln_fwd( + input, None, gamma, beta, rowscale, colscale, x0_subset, z_subset, dropout_p, + epsilon, rowscale_const, z_numrows, gen, residual_in_fp32, is_rms_norm, + ) + + +def dropout_add_ln_bwd(*args, **kwargs): + # The wheel extension's bwd takes (dz, dx_, x, x0, dmask, ...), the community + # build takes (dz, dx, x, mu, ...). Positional pass-through would silently + # feed the wrong tensors, so refuse until backward is actually needed. 
+ raise NotImplementedError("dropout_add_ln_bwd is not exposed by this build variant (inference only)") + + +def dropout_add_ln_parallel_residual_fwd( + input, gamma0, beta0, gamma1, beta1, dropout_p, epsilon, gen, residual_in_fp32, is_rms_norm, +): + return _ext.dropout_add_ln_parallel_residual_fwd( + input, None, gamma0, beta0, gamma1, beta1, dropout_p, epsilon, gen, residual_in_fp32, is_rms_norm + ) + + +def dropout_add_ln_parallel_residual_bwd(*args, **kwargs): + raise NotImplementedError( + "dropout_add_ln_parallel_residual_bwd is not exposed by this build variant (inference only)" + ) + + +__all__ = [ + "dropout_add_ln_fwd", + "dropout_add_ln_bwd", + "dropout_add_ln_parallel_residual_fwd", + "dropout_add_ln_parallel_residual_bwd", +] diff --git a/build/torch29-cxx11-cu128-x86_64-linux/__pycache__/__init__.cpython-312.pyc b/build/torch29-cxx11-cu128-x86_64-linux/__pycache__/__init__.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..6b47d7774e67f5a29428547bd5bd76fa1e64200c Binary files /dev/null and b/build/torch29-cxx11-cu128-x86_64-linux/__pycache__/__init__.cpython-312.pyc differ diff --git a/build/torch29-cxx11-cu128-x86_64-linux/dropout_layer_norm.cpython-312-x86_64-linux-gnu.so b/build/torch29-cxx11-cu128-x86_64-linux/dropout_layer_norm.cpython-312-x86_64-linux-gnu.so new file mode 100644 index 0000000000000000000000000000000000000000..1804ddc7b148b1fc8d5e912811369d3c04631bdf --- /dev/null +++ b/build/torch29-cxx11-cu128-x86_64-linux/dropout_layer_norm.cpython-312-x86_64-linux-gnu.so @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:4ae8a09f42c4c44a01120b1e881f9966dada8eb4acdb5263297ff8b21305bbc5 +size 1259466744 diff --git a/build/torch29-cxx11-cu128-x86_64-linux/layer_norm/__init__.py b/build/torch29-cxx11-cu128-x86_64-linux/layer_norm/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..a9b2672c1cd85b74c1b3ded0fc0b2100e1aeac23 --- /dev/null +++ 
b/build/torch29-cxx11-cu128-x86_64-linux/layer_norm/__init__.py @@ -0,0 +1,26 @@ +import ctypes +import importlib.util +import sys +from pathlib import Path +from types import ModuleType + + +def _import_from_path(file_path: Path) -> ModuleType: + # We cannot use the module name as-is, after adding it to `sys.modules`, + # it would also be used for other imports. So, we make a module name that + # depends on the path for it to be unique using the hex-encoded hash of + # the path. + path_hash = "{:x}".format(ctypes.c_size_t(hash(file_path.absolute())).value) + module_name = path_hash + spec = importlib.util.spec_from_file_location(module_name, file_path) + if spec is None: + raise ImportError(f"Cannot load spec for {module_name} from {file_path}") + module = importlib.util.module_from_spec(spec) + if module is None: + raise ImportError(f"Cannot load module {module_name} from spec") + sys.modules[module_name] = module + spec.loader.exec_module(module) # type: ignore + return module + + +globals().update(vars(_import_from_path(Path(__file__).parent.parent / "__init__.py"))) diff --git a/build/torch29-cxx11-cu128-x86_64-linux/layer_norm/__pycache__/__init__.cpython-312.pyc b/build/torch29-cxx11-cu128-x86_64-linux/layer_norm/__pycache__/__init__.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..4d5826be93af360ba9a0aca31de0b439d9b2b524 Binary files /dev/null and b/build/torch29-cxx11-cu128-x86_64-linux/layer_norm/__pycache__/__init__.cpython-312.pyc differ diff --git a/build/torch29-cxx11-cu128-x86_64-linux/metadata.json b/build/torch29-cxx11-cu128-x86_64-linux/metadata.json new file mode 100644 index 0000000000000000000000000000000000000000..fbd5000813293fa09c4e8f564d6565075e25f140 --- /dev/null +++ b/build/torch29-cxx11-cu128-x86_64-linux/metadata.json @@ -0,0 +1,14 @@ +{ + "version": 1, + "license": "BSD-3-Clause", + "python-depends": [], + "backend": { + "type": "cuda", + "archs": [ + "8.0", + "9.0", + "10.0", + "12.0" + ] + } +} 
diff --git a/build/torch29-cxx11-cu129-aarch64-linux/__init__.py b/build/torch29-cxx11-cu129-aarch64-linux/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..946160f16b9dc91fefeea037fb7ac84fd6afd802 --- /dev/null +++ b/build/torch29-cxx11-cu129-aarch64-linux/__init__.py @@ -0,0 +1,26 @@ +import torch +import torch.nn as nn + +from ._ops import ops + +from . import layers + +def dropout_add_ln_fwd(input, gamma, beta, rowscale, colscale, x0_subset, z_subset, dropout_p, epsilon, rowscale_const, z_numrows, gen, residual_in_fp32, is_rms_norm): + return ops.dropout_add_ln_fwd(input, gamma, beta, rowscale, colscale, x0_subset, z_subset, dropout_p, epsilon, rowscale_const, z_numrows, gen, residual_in_fp32, is_rms_norm) + +def dropout_add_ln_bwd(dz, dx, x, mu, rsigma, gamma, rowscale, colscale, x0_subset, z_subset, dropout_p, rowscale_const, x0_numrows, has_residual, is_rms_norm): + return ops.dropout_add_ln_bwd(dz, dx, x, mu, rsigma, gamma, rowscale, colscale, x0_subset, z_subset, dropout_p, rowscale_const, x0_numrows, has_residual, is_rms_norm) + +def dropout_add_ln_parallel_residual_fwd(input, gamma0, beta0, gamma1, beta1, dropout_p, epsilon, gen, residual_in_fp32, is_rms_norm): + return ops.dropout_add_ln_parallel_residual_fwd(input, gamma0, beta0, gamma1, beta1, dropout_p, epsilon, gen, residual_in_fp32, is_rms_norm) + +def dropout_add_ln_parallel_residual_bwd(dz0, dz1, dx, x, mu, rsigma, gamma0, gamma1, dropout_p, has_x1, has_residual, is_rms_norm): + return ops.dropout_add_ln_parallel_residual_bwd(dz0, dz1, dx, x, mu, rsigma, gamma0, gamma1, dropout_p, has_x1, has_residual, is_rms_norm) + +__all__ = [ + "layers", + "dropout_add_ln_fwd", + "dropout_add_ln_bwd", + "dropout_add_ln_parallel_residual_fwd", + "dropout_add_ln_parallel_residual_bwd", +] \ No newline at end of file diff --git a/build/torch29-cxx11-cu129-aarch64-linux/_layer_norm_cuda_abf7d7d.abi3.so b/build/torch29-cxx11-cu129-aarch64-linux/_layer_norm_cuda_abf7d7d.abi3.so new 
file mode 100644 index 0000000000000000000000000000000000000000..aa8c18ea490d34a92ccbbc7403a3290db78d3d19 --- /dev/null +++ b/build/torch29-cxx11-cu129-aarch64-linux/_layer_norm_cuda_abf7d7d.abi3.so @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:b8c9f84d3152ac716ccefc08e081e3c387156bc00b65eb003b1b6e7385bdc15d +size 1282721000 diff --git a/build/torch29-cxx11-cu129-aarch64-linux/_ops.py b/build/torch29-cxx11-cu129-aarch64-linux/_ops.py new file mode 100644 index 0000000000000000000000000000000000000000..f5c1175ef45a68190dd1ce19f12e70b3d32dd9e0 --- /dev/null +++ b/build/torch29-cxx11-cu129-aarch64-linux/_ops.py @@ -0,0 +1,9 @@ +import torch +from . import _layer_norm_cuda_abf7d7d +ops = torch.ops._layer_norm_cuda_abf7d7d + +def add_op_namespace_prefix(op_name: str): + """ + Prefix op by namespace. + """ + return f"_layer_norm_cuda_abf7d7d::{op_name}" diff --git a/build/torch29-cxx11-cu129-aarch64-linux/layer_norm/__init__.py b/build/torch29-cxx11-cu129-aarch64-linux/layer_norm/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..a9b2672c1cd85b74c1b3ded0fc0b2100e1aeac23 --- /dev/null +++ b/build/torch29-cxx11-cu129-aarch64-linux/layer_norm/__init__.py @@ -0,0 +1,26 @@ +import ctypes +import importlib.util +import sys +from pathlib import Path +from types import ModuleType + + +def _import_from_path(file_path: Path) -> ModuleType: + # We cannot use the module name as-is, after adding it to `sys.modules`, + # it would also be used for other imports. So, we make a module name that + # depends on the path for it to be unique using the hex-encoded hash of + # the path. 
+ path_hash = "{:x}".format(ctypes.c_size_t(hash(file_path.absolute())).value) + module_name = path_hash + spec = importlib.util.spec_from_file_location(module_name, file_path) + if spec is None: + raise ImportError(f"Cannot load spec for {module_name} from {file_path}") + module = importlib.util.module_from_spec(spec) + if module is None: + raise ImportError(f"Cannot load module {module_name} from spec") + sys.modules[module_name] = module + spec.loader.exec_module(module) # type: ignore + return module + + +globals().update(vars(_import_from_path(Path(__file__).parent.parent / "__init__.py"))) diff --git a/build/torch29-cxx11-cu129-aarch64-linux/layers.py b/build/torch29-cxx11-cu129-aarch64-linux/layers.py new file mode 100644 index 0000000000000000000000000000000000000000..7ed883f42ead452f8b60f498ec11302c53d3cf74 --- /dev/null +++ b/build/torch29-cxx11-cu129-aarch64-linux/layers.py @@ -0,0 +1,51 @@ +import torch +import torch.nn as nn + +from ._ops import ops + + +class LayerNorm(nn.Module): + weight: torch.Tensor + variance_epsilon: float + + def forward(self, hidden_states: torch.Tensor) -> torch.Tensor: + output = ops.dropout_add_ln_fwd( + hidden_states.view(-1, hidden_states.shape[-1]), + gamma = self.weight, + beta = None, + rowscale = None, + colscale = None, + x0_subset = None, + z_subset = None, + dropout_p = 0, + epsilon = self.variance_epsilon, + rowscale_const = 1.0, + z_numrows = hidden_states.shape[1], + gen = None, + residual_in_fp32 = False, + is_rms_norm = False, + ) + return output[0].view(hidden_states.shape) + +class LlamaRMSNorm(nn.Module): + weight: torch.Tensor + variance_epsilon: float + + def forward(self, hidden_states: torch.Tensor) -> torch.Tensor: + output = ops.dropout_add_ln_fwd( + hidden_states.view(-1, hidden_states.shape[-1]), + gamma = self.weight, + beta = None, + rowscale = None, + colscale = None, + x0_subset = None, + z_subset = None, + dropout_p = 0, + epsilon = self.variance_epsilon, + rowscale_const = 1.0, + z_numrows = 
hidden_states.shape[1], + gen = None, + residual_in_fp32 = False, + is_rms_norm = True, + ) + return output[0].view(hidden_states.shape) \ No newline at end of file diff --git a/build/torch29-cxx11-cu129-aarch64-linux/metadata.json b/build/torch29-cxx11-cu129-aarch64-linux/metadata.json new file mode 100644 index 0000000000000000000000000000000000000000..5bf2743043146a22147fad17302989accb0b505c --- /dev/null +++ b/build/torch29-cxx11-cu129-aarch64-linux/metadata.json @@ -0,0 +1,15 @@ +{ + "version": 1, + "license": "BSD-3-Clause", + "python-depends": [], + "backend": { + "type": "cuda", + "archs": [ + "10.0", + "12.0", + "8.0", + "8.9", + "9.0" + ] + } +} diff --git a/build/torch29-cxx11-cu129-x86_64-linux/__init__.py b/build/torch29-cxx11-cu129-x86_64-linux/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..946160f16b9dc91fefeea037fb7ac84fd6afd802 --- /dev/null +++ b/build/torch29-cxx11-cu129-x86_64-linux/__init__.py @@ -0,0 +1,26 @@ +import torch +import torch.nn as nn + +from ._ops import ops + +from . 
import layers + +def dropout_add_ln_fwd(input, gamma, beta, rowscale, colscale, x0_subset, z_subset, dropout_p, epsilon, rowscale_const, z_numrows, gen, residual_in_fp32, is_rms_norm): + return ops.dropout_add_ln_fwd(input, gamma, beta, rowscale, colscale, x0_subset, z_subset, dropout_p, epsilon, rowscale_const, z_numrows, gen, residual_in_fp32, is_rms_norm) + +def dropout_add_ln_bwd(dz, dx, x, mu, rsigma, gamma, rowscale, colscale, x0_subset, z_subset, dropout_p, rowscale_const, x0_numrows, has_residual, is_rms_norm): + return ops.dropout_add_ln_bwd(dz, dx, x, mu, rsigma, gamma, rowscale, colscale, x0_subset, z_subset, dropout_p, rowscale_const, x0_numrows, has_residual, is_rms_norm) + +def dropout_add_ln_parallel_residual_fwd(input, gamma0, beta0, gamma1, beta1, dropout_p, epsilon, gen, residual_in_fp32, is_rms_norm): + return ops.dropout_add_ln_parallel_residual_fwd(input, gamma0, beta0, gamma1, beta1, dropout_p, epsilon, gen, residual_in_fp32, is_rms_norm) + +def dropout_add_ln_parallel_residual_bwd(dz0, dz1, dx, x, mu, rsigma, gamma0, gamma1, dropout_p, has_x1, has_residual, is_rms_norm): + return ops.dropout_add_ln_parallel_residual_bwd(dz0, dz1, dx, x, mu, rsigma, gamma0, gamma1, dropout_p, has_x1, has_residual, is_rms_norm) + +__all__ = [ + "layers", + "dropout_add_ln_fwd", + "dropout_add_ln_bwd", + "dropout_add_ln_parallel_residual_fwd", + "dropout_add_ln_parallel_residual_bwd", +] \ No newline at end of file diff --git a/build/torch29-cxx11-cu129-x86_64-linux/_layer_norm_cuda_abf7d7d.abi3.so b/build/torch29-cxx11-cu129-x86_64-linux/_layer_norm_cuda_abf7d7d.abi3.so new file mode 100644 index 0000000000000000000000000000000000000000..fe60c2e5e4100af95429279dccb73835beda4062 --- /dev/null +++ b/build/torch29-cxx11-cu129-x86_64-linux/_layer_norm_cuda_abf7d7d.abi3.so @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:91f86328f0302dd6ec2d76a1ba8a70bc9c4daa3d9d3739d650fd124b11fdf49a +size 1283022120 diff --git 
a/build/torch29-cxx11-cu129-x86_64-linux/_ops.py b/build/torch29-cxx11-cu129-x86_64-linux/_ops.py new file mode 100644 index 0000000000000000000000000000000000000000..f5c1175ef45a68190dd1ce19f12e70b3d32dd9e0 --- /dev/null +++ b/build/torch29-cxx11-cu129-x86_64-linux/_ops.py @@ -0,0 +1,9 @@ +import torch +from . import _layer_norm_cuda_abf7d7d +ops = torch.ops._layer_norm_cuda_abf7d7d + +def add_op_namespace_prefix(op_name: str): + """ + Prefix op by namespace. + """ + return f"_layer_norm_cuda_abf7d7d::{op_name}" diff --git a/build/torch29-cxx11-cu129-x86_64-linux/layer_norm/__init__.py b/build/torch29-cxx11-cu129-x86_64-linux/layer_norm/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..a9b2672c1cd85b74c1b3ded0fc0b2100e1aeac23 --- /dev/null +++ b/build/torch29-cxx11-cu129-x86_64-linux/layer_norm/__init__.py @@ -0,0 +1,26 @@ +import ctypes +import importlib.util +import sys +from pathlib import Path +from types import ModuleType + + +def _import_from_path(file_path: Path) -> ModuleType: + # We cannot use the module name as-is, after adding it to `sys.modules`, + # it would also be used for other imports. So, we make a module name that + # depends on the path for it to be unique using the hex-encoded hash of + # the path. 
+ path_hash = "{:x}".format(ctypes.c_size_t(hash(file_path.absolute())).value) + module_name = path_hash + spec = importlib.util.spec_from_file_location(module_name, file_path) + if spec is None: + raise ImportError(f"Cannot load spec for {module_name} from {file_path}") + module = importlib.util.module_from_spec(spec) + if module is None: + raise ImportError(f"Cannot load module {module_name} from spec") + sys.modules[module_name] = module + spec.loader.exec_module(module) # type: ignore + return module + + +globals().update(vars(_import_from_path(Path(__file__).parent.parent / "__init__.py"))) diff --git a/build/torch29-cxx11-cu129-x86_64-linux/layers.py b/build/torch29-cxx11-cu129-x86_64-linux/layers.py new file mode 100644 index 0000000000000000000000000000000000000000..7ed883f42ead452f8b60f498ec11302c53d3cf74 --- /dev/null +++ b/build/torch29-cxx11-cu129-x86_64-linux/layers.py @@ -0,0 +1,51 @@ +import torch +import torch.nn as nn + +from ._ops import ops + + +class LayerNorm(nn.Module): + weight: torch.Tensor + variance_epsilon: float + + def forward(self, hidden_states: torch.Tensor) -> torch.Tensor: + output = ops.dropout_add_ln_fwd( + hidden_states.view(-1, hidden_states.shape[-1]), + gamma = self.weight, + beta = None, + rowscale = None, + colscale = None, + x0_subset = None, + z_subset = None, + dropout_p = 0, + epsilon = self.variance_epsilon, + rowscale_const = 1.0, + z_numrows = hidden_states.shape[1], + gen = None, + residual_in_fp32 = False, + is_rms_norm = False, + ) + return output[0].view(hidden_states.shape) + +class LlamaRMSNorm(nn.Module): + weight: torch.Tensor + variance_epsilon: float + + def forward(self, hidden_states: torch.Tensor) -> torch.Tensor: + output = ops.dropout_add_ln_fwd( + hidden_states.view(-1, hidden_states.shape[-1]), + gamma = self.weight, + beta = None, + rowscale = None, + colscale = None, + x0_subset = None, + z_subset = None, + dropout_p = 0, + epsilon = self.variance_epsilon, + rowscale_const = 1.0, + z_numrows = 
hidden_states.shape[1], + gen = None, + residual_in_fp32 = False, + is_rms_norm = True, + ) + return output[0].view(hidden_states.shape) \ No newline at end of file diff --git a/build/torch29-cxx11-cu129-x86_64-linux/metadata.json b/build/torch29-cxx11-cu129-x86_64-linux/metadata.json new file mode 100644 index 0000000000000000000000000000000000000000..5bf2743043146a22147fad17302989accb0b505c --- /dev/null +++ b/build/torch29-cxx11-cu129-x86_64-linux/metadata.json @@ -0,0 +1,15 @@ +{ + "version": 1, + "license": "BSD-3-Clause", + "python-depends": [], + "backend": { + "type": "cuda", + "archs": [ + "10.0", + "12.0", + "8.0", + "8.9", + "9.0" + ] + } +} diff --git a/media/benches_dark_animation.svg b/media/benches_dark_animation.svg new file mode 100644 index 0000000000000000000000000000000000000000..113c2ffd377659b878db4a0f974c5da04841a1de --- /dev/null +++ b/media/benches_dark_animation.svg @@ -0,0 +1,69 @@ + +kernels-community/layer-norm vs Torch - Relative Speed +PyTorch 2.11.0+cu130 · CPU + +LayerNorm.large +1.00x + + + + + + + +LayerNorm.medium +0.80x + + + + + + + +LayerNorm.small +0.64x + + + + + + + +RMSNorm.large +3.39x + + + + + + + +RMSNorm.medium +2.04x + + + + + + + +RMSNorm.small +1.71x + + + + + + + +Kernel + +Torch (ref) + + + + + + + + \ No newline at end of file diff --git a/media/benches_dark_latency.svg b/media/benches_dark_latency.svg new file mode 100644 index 0000000000000000000000000000000000000000..1d075322c4e958c3d5ab46a8d7cfba046353d7a9 --- /dev/null +++ b/media/benches_dark_latency.svg @@ -0,0 +1,2348 @@ + + + + + + + + 2026-04-18T11:00:27.058608 + image/svg+xml + + + Matplotlib v3.10.8, https://matplotlib.org/ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + 
+ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + diff --git a/media/benches_dark_throughput.svg b/media/benches_dark_throughput.svg new file mode 100644 index 0000000000000000000000000000000000000000..fd6a66b308f1cbf7ec9d08c102bcd470f7ff5eb9 --- /dev/null +++ b/media/benches_dark_throughput.svg @@ -0,0 +1,2566 @@ + + + + + + + + 2026-04-18T11:00:27.246068 + image/svg+xml + + + Matplotlib v3.10.8, https://matplotlib.org/ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + 
+ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + diff --git a/media/benches_light_animation.svg b/media/benches_light_animation.svg new file mode 100644 index 0000000000000000000000000000000000000000..63fbe2789d21e08aabff9dbeea841cf8a67f92c6 --- /dev/null +++ b/media/benches_light_animation.svg @@ -0,0 +1,69 @@ + +kernels-community/layer-norm vs Torch - Relative Speed +PyTorch 2.11.0+cu130 · CPU + +LayerNorm.large +1.00x + + + + + + + +LayerNorm.medium +0.80x + + + + + + + +LayerNorm.small +0.64x + + + + + + + +RMSNorm.large +3.39x + + + + + + + +RMSNorm.medium +2.04x + + + 
+ + + + +RMSNorm.small +1.71x + + + + + + + +Kernel + +Torch (ref) + + + + + + + + \ No newline at end of file diff --git a/media/benches_light_latency.svg b/media/benches_light_latency.svg new file mode 100644 index 0000000000000000000000000000000000000000..672c8b469c2e39608144a5da2444b085110b56c0 --- /dev/null +++ b/media/benches_light_latency.svg @@ -0,0 +1,2348 @@ + + + + + + + + 2026-04-18T11:00:26.296573 + image/svg+xml + + + Matplotlib v3.10.8, https://matplotlib.org/ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + 
+ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + diff --git a/media/benches_light_throughput.svg b/media/benches_light_throughput.svg new file mode 100644 index 0000000000000000000000000000000000000000..c65b9a3da3371922803f3b8ba3a720dcaa34251c --- /dev/null +++ b/media/benches_light_throughput.svg @@ -0,0 +1,2566 @@ + + + + + + + + 2026-04-18T11:00:26.724003 + image/svg+xml + + + Matplotlib v3.10.8, https://matplotlib.org/ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + 
+ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +