diff --git a/.gitattributes b/.gitattributes
index a6344aac8c09253b3b630fb776ae94478aa0275b..6f3a44b5d1775df1e9798eaed004c9b5a3a5f7d6 100644
--- a/.gitattributes
+++ b/.gitattributes
@@ -33,3 +33,77 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
*.zip filter=lfs diff=lfs merge=lfs -text
*.zst filter=lfs diff=lfs merge=lfs -text
*tfevents* filter=lfs diff=lfs merge=lfs -text
+build/torch210-cxx11-cu126-aarch64-linux/_layer_norm_cuda_143103b.abi3.so filter=lfs diff=lfs merge=lfs -text
+build/torch210-cxx11-cu128-aarch64-linux/_layer_norm_cuda_143103b.abi3.so filter=lfs diff=lfs merge=lfs -text
+build/torch210-cxx11-cu130-aarch64-linux/_layer_norm_cuda_143103b.abi3.so filter=lfs diff=lfs merge=lfs -text
+build/torch211-cxx11-cu126-aarch64-linux/_layer_norm_cuda_143103b.abi3.so filter=lfs diff=lfs merge=lfs -text
+build/torch211-cxx11-cu128-aarch64-linux/_layer_norm_cuda_143103b.abi3.so filter=lfs diff=lfs merge=lfs -text
+build/torch211-cxx11-cu130-aarch64-linux/_layer_norm_cuda_143103b.abi3.so filter=lfs diff=lfs merge=lfs -text
+build/torch29-cxx11-cu129-aarch64-linux/_layer_norm_cuda_143103b.abi3.so filter=lfs diff=lfs merge=lfs -text
+build/torch210-cxx11-cu126-x86_64-linux/_layer_norm_cuda_143103b.abi3.so filter=lfs diff=lfs merge=lfs -text
+build/torch210-cxx11-cu128-x86_64-linux/_layer_norm_cuda_143103b.abi3.so filter=lfs diff=lfs merge=lfs -text
+build/torch210-cxx11-cu130-x86_64-linux/_layer_norm_cuda_143103b.abi3.so filter=lfs diff=lfs merge=lfs -text
+build/torch211-cxx11-cu126-x86_64-linux/_layer_norm_cuda_143103b.abi3.so filter=lfs diff=lfs merge=lfs -text
+build/torch211-cxx11-cu128-x86_64-linux/_layer_norm_cuda_143103b.abi3.so filter=lfs diff=lfs merge=lfs -text
+build/torch211-cxx11-cu130-x86_64-linux/_layer_norm_cuda_143103b.abi3.so filter=lfs diff=lfs merge=lfs -text
+build/torch29-cxx11-cu129-x86_64-linux/_layer_norm_cuda_143103b.abi3.so filter=lfs diff=lfs merge=lfs -text
+build/torch210-cxx11-cu126-aarch64-linux/_layer_norm_cuda_d3db3a4.abi3.so filter=lfs diff=lfs merge=lfs -text
+build/torch210-cxx11-cu128-aarch64-linux/_layer_norm_cuda_d3db3a4.abi3.so filter=lfs diff=lfs merge=lfs -text
+build/torch210-cxx11-cu130-aarch64-linux/_layer_norm_cuda_d3db3a4.abi3.so filter=lfs diff=lfs merge=lfs -text
+build/torch211-cxx11-cu126-aarch64-linux/_layer_norm_cuda_d3db3a4.abi3.so filter=lfs diff=lfs merge=lfs -text
+build/torch211-cxx11-cu128-aarch64-linux/_layer_norm_cuda_d3db3a4.abi3.so filter=lfs diff=lfs merge=lfs -text
+build/torch211-cxx11-cu130-aarch64-linux/_layer_norm_cuda_d3db3a4.abi3.so filter=lfs diff=lfs merge=lfs -text
+build/torch29-cxx11-cu129-aarch64-linux/_layer_norm_cuda_d3db3a4.abi3.so filter=lfs diff=lfs merge=lfs -text
+build/torch210-cxx11-cu126-x86_64-linux/_layer_norm_cuda_d3db3a4.abi3.so filter=lfs diff=lfs merge=lfs -text
+build/torch210-cxx11-cu128-x86_64-linux/_layer_norm_cuda_d3db3a4.abi3.so filter=lfs diff=lfs merge=lfs -text
+build/torch210-cxx11-cu130-x86_64-linux/_layer_norm_cuda_d3db3a4.abi3.so filter=lfs diff=lfs merge=lfs -text
+build/torch211-cxx11-cu126-x86_64-linux/_layer_norm_cuda_d3db3a4.abi3.so filter=lfs diff=lfs merge=lfs -text
+build/torch211-cxx11-cu128-x86_64-linux/_layer_norm_cuda_d3db3a4.abi3.so filter=lfs diff=lfs merge=lfs -text
+build/torch211-cxx11-cu130-x86_64-linux/_layer_norm_cuda_d3db3a4.abi3.so filter=lfs diff=lfs merge=lfs -text
+build/torch29-cxx11-cu129-x86_64-linux/_layer_norm_cuda_d3db3a4.abi3.so filter=lfs diff=lfs merge=lfs -text
+build/torch210-cxx11-cu126-aarch64-linux/_layer_norm_cuda_abf7d7d.abi3.so filter=lfs diff=lfs merge=lfs -text
+build/torch210-cxx11-cu128-aarch64-linux/_layer_norm_cuda_abf7d7d.abi3.so filter=lfs diff=lfs merge=lfs -text
+build/torch210-cxx11-cu130-aarch64-linux/_layer_norm_cuda_abf7d7d.abi3.so filter=lfs diff=lfs merge=lfs -text
+build/torch211-cxx11-cu126-aarch64-linux/_layer_norm_cuda_abf7d7d.abi3.so filter=lfs diff=lfs merge=lfs -text
+build/torch211-cxx11-cu128-aarch64-linux/_layer_norm_cuda_abf7d7d.abi3.so filter=lfs diff=lfs merge=lfs -text
+build/torch211-cxx11-cu130-aarch64-linux/_layer_norm_cuda_abf7d7d.abi3.so filter=lfs diff=lfs merge=lfs -text
+build/torch29-cxx11-cu129-aarch64-linux/_layer_norm_cuda_abf7d7d.abi3.so filter=lfs diff=lfs merge=lfs -text
+build/torch210-cxx11-cu126-x86_64-linux/_layer_norm_cuda_abf7d7d.abi3.so filter=lfs diff=lfs merge=lfs -text
+build/torch210-cxx11-cu128-x86_64-linux/_layer_norm_cuda_abf7d7d.abi3.so filter=lfs diff=lfs merge=lfs -text
+build/torch210-cxx11-cu130-x86_64-linux/_layer_norm_cuda_abf7d7d.abi3.so filter=lfs diff=lfs merge=lfs -text
+build/torch211-cxx11-cu126-x86_64-linux/_layer_norm_cuda_abf7d7d.abi3.so filter=lfs diff=lfs merge=lfs -text
+build/torch211-cxx11-cu128-x86_64-linux/_layer_norm_cuda_abf7d7d.abi3.so filter=lfs diff=lfs merge=lfs -text
+build/torch211-cxx11-cu130-x86_64-linux/_layer_norm_cuda_abf7d7d.abi3.so filter=lfs diff=lfs merge=lfs -text
+build/torch29-cxx11-cu129-x86_64-linux/_layer_norm_cuda_abf7d7d.abi3.so filter=lfs diff=lfs merge=lfs -text
+build/torch210-cxx11-cu126-x86_64-linux/_layer_norm_cuda_86f75d9.abi3.so filter=lfs diff=lfs merge=lfs -text
+build/torch210-cxx11-cu128-x86_64-linux/_layer_norm_cuda_86f75d9.abi3.so filter=lfs diff=lfs merge=lfs -text
+build/torch210-cxx11-cu130-x86_64-linux/_layer_norm_cuda_86f75d9.abi3.so filter=lfs diff=lfs merge=lfs -text
+build/torch211-cxx11-cu126-x86_64-linux/_layer_norm_cuda_86f75d9.abi3.so filter=lfs diff=lfs merge=lfs -text
+build/torch211-cxx11-cu128-x86_64-linux/_layer_norm_cuda_86f75d9.abi3.so filter=lfs diff=lfs merge=lfs -text
+build/torch211-cxx11-cu130-x86_64-linux/_layer_norm_cuda_86f75d9.abi3.so filter=lfs diff=lfs merge=lfs -text
+build/torch210-cxx11-cu126-aarch64-linux/_layer_norm_cuda_86f75d9.abi3.so filter=lfs diff=lfs merge=lfs -text
+build/torch210-cxx11-cu128-aarch64-linux/_layer_norm_cuda_86f75d9.abi3.so filter=lfs diff=lfs merge=lfs -text
+build/torch210-cxx11-cu130-aarch64-linux/_layer_norm_cuda_86f75d9.abi3.so filter=lfs diff=lfs merge=lfs -text
+build/torch211-cxx11-cu126-aarch64-linux/_layer_norm_cuda_86f75d9.abi3.so filter=lfs diff=lfs merge=lfs -text
+build/torch211-cxx11-cu128-aarch64-linux/_layer_norm_cuda_86f75d9.abi3.so filter=lfs diff=lfs merge=lfs -text
+build/torch211-cxx11-cu130-aarch64-linux/_layer_norm_cuda_86f75d9.abi3.so filter=lfs diff=lfs merge=lfs -text
+build/torch210-cxx11-cu126-aarch64-linux/_layer_norm_cuda_73ccd0c.abi3.so filter=lfs diff=lfs merge=lfs -text
+build/torch210-cxx11-cu128-aarch64-linux/_layer_norm_cuda_73ccd0c.abi3.so filter=lfs diff=lfs merge=lfs -text
+build/torch210-cxx11-cu130-aarch64-linux/_layer_norm_cuda_73ccd0c.abi3.so filter=lfs diff=lfs merge=lfs -text
+build/torch211-cxx11-cu126-aarch64-linux/_layer_norm_cuda_73ccd0c.abi3.so filter=lfs diff=lfs merge=lfs -text
+build/torch211-cxx11-cu128-aarch64-linux/_layer_norm_cuda_73ccd0c.abi3.so filter=lfs diff=lfs merge=lfs -text
+build/torch211-cxx11-cu130-aarch64-linux/_layer_norm_cuda_73ccd0c.abi3.so filter=lfs diff=lfs merge=lfs -text
+build/torch212-cxx11-cu126-aarch64-linux/_layer_norm_cuda_73ccd0c.abi3.so filter=lfs diff=lfs merge=lfs -text
+build/torch212-cxx11-cu130-aarch64-linux/_layer_norm_cuda_73ccd0c.abi3.so filter=lfs diff=lfs merge=lfs -text
+build/torch212-cxx11-cu132-aarch64-linux/_layer_norm_cuda_73ccd0c.abi3.so filter=lfs diff=lfs merge=lfs -text
+build/torch210-cxx11-cu126-x86_64-linux/_layer_norm_cuda_73ccd0c.abi3.so filter=lfs diff=lfs merge=lfs -text
+build/torch210-cxx11-cu128-x86_64-linux/_layer_norm_cuda_73ccd0c.abi3.so filter=lfs diff=lfs merge=lfs -text
+build/torch210-cxx11-cu130-x86_64-linux/_layer_norm_cuda_73ccd0c.abi3.so filter=lfs diff=lfs merge=lfs -text
+build/torch211-cxx11-cu126-x86_64-linux/_layer_norm_cuda_73ccd0c.abi3.so filter=lfs diff=lfs merge=lfs -text
+build/torch211-cxx11-cu128-x86_64-linux/_layer_norm_cuda_73ccd0c.abi3.so filter=lfs diff=lfs merge=lfs -text
+build/torch211-cxx11-cu130-x86_64-linux/_layer_norm_cuda_73ccd0c.abi3.so filter=lfs diff=lfs merge=lfs -text
+build/torch212-cxx11-cu126-x86_64-linux/_layer_norm_cuda_73ccd0c.abi3.so filter=lfs diff=lfs merge=lfs -text
+build/torch212-cxx11-cu130-x86_64-linux/_layer_norm_cuda_73ccd0c.abi3.so filter=lfs diff=lfs merge=lfs -text
+build/torch212-cxx11-cu132-x86_64-linux/_layer_norm_cuda_73ccd0c.abi3.so filter=lfs diff=lfs merge=lfs -text
+build/torch29-cxx11-cu128-aarch64-linux/dropout_layer_norm.cpython-312-aarch64-linux-gnu.so filter=lfs diff=lfs merge=lfs -text
+build/torch29-cxx11-cu128-x86_64-linux/dropout_layer_norm.cpython-312-x86_64-linux-gnu.so filter=lfs diff=lfs merge=lfs -text
diff --git a/README.md b/README.md
new file mode 100644
index 0000000000000000000000000000000000000000..65aa2fc93afcf7af551e686baef8d23f23cd2fa4
--- /dev/null
+++ b/README.md
@@ -0,0 +1,28 @@
+---
+library_name: kernels
+license: bsd-3-clause
+---
+
+This is the repository card for kernels-community/layer-norm, which has been pushed to the Hub. It is built to be used with the [`kernels` library](https://github.com/huggingface/kernels). This card was automatically generated.
+
+## How to use
+
+```python
+# make sure `kernels` is installed: `pip install -U kernels`
+from kernels import get_kernel
+
+kernel_module = get_kernel("kernels-community/layer-norm")
+dropout_add_ln_fwd = kernel_module.dropout_add_ln_fwd
+
+dropout_add_ln_fwd(...)
+```
+
+## Available functions
+- `dropout_add_ln_fwd`
+- `dropout_add_ln_bwd`
+- `dropout_add_ln_parallel_residual_fwd`
+- `dropout_add_ln_parallel_residual_bwd`
+
+## Benchmarks
+
+A benchmarking script is available for this kernel. Run it with `kernels benchmark kernels-community/layer-norm`.
diff --git a/benchmarks/benchmark.py b/benchmarks/benchmark.py
new file mode 100644
index 0000000000000000000000000000000000000000..d943bf949eafb80e00f77b26fdb620957cab218f
--- /dev/null
+++ b/benchmarks/benchmark.py
@@ -0,0 +1,9 @@
+from kernels.benchmarks import LayerNormBenchmark, RMSNormBenchmark
+
+
+class LayerNorm(LayerNormBenchmark):  # Exposes kernels.benchmarks.LayerNormBenchmark under the name LayerNorm.
+ pass  # nothing is overridden; all behaviour comes from the base class
+
+
+class RMSNorm(RMSNormBenchmark):  # Exposes kernels.benchmarks.RMSNormBenchmark under the name RMSNorm.
+ pass  # nothing is overridden; all behaviour comes from the base class
diff --git a/build/torch210-cxx11-cu126-aarch64-linux/__init__.py b/build/torch210-cxx11-cu126-aarch64-linux/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..946160f16b9dc91fefeea037fb7ac84fd6afd802
--- /dev/null
+++ b/build/torch210-cxx11-cu126-aarch64-linux/__init__.py
@@ -0,0 +1,26 @@
+import torch
+import torch.nn as nn
+
+from ._ops import ops
+
+from . import layers
+
+def dropout_add_ln_fwd(input, gamma, beta, rowscale, colscale, x0_subset, z_subset, dropout_p, epsilon, rowscale_const, z_numrows, gen, residual_in_fp32, is_rms_norm):  # Python-level entry point for the compiled op of the same name.
+ return ops.dropout_add_ln_fwd(input, gamma, beta, rowscale, colscale, x0_subset, z_subset, dropout_p, epsilon, rowscale_const, z_numrows, gen, residual_in_fp32, is_rms_norm)  # every argument is forwarded positionally, in order; the op's result is returned as-is
+
+def dropout_add_ln_bwd(dz, dx, x, mu, rsigma, gamma, rowscale, colscale, x0_subset, z_subset, dropout_p, rowscale_const, x0_numrows, has_residual, is_rms_norm):  # Python-level entry point for the compiled op of the same name.
+ return ops.dropout_add_ln_bwd(dz, dx, x, mu, rsigma, gamma, rowscale, colscale, x0_subset, z_subset, dropout_p, rowscale_const, x0_numrows, has_residual, is_rms_norm)  # every argument is forwarded positionally, in order; the op's result is returned as-is
+
+def dropout_add_ln_parallel_residual_fwd(input, gamma0, beta0, gamma1, beta1, dropout_p, epsilon, gen, residual_in_fp32, is_rms_norm):  # Python-level entry point for the compiled op of the same name.
+ return ops.dropout_add_ln_parallel_residual_fwd(input, gamma0, beta0, gamma1, beta1, dropout_p, epsilon, gen, residual_in_fp32, is_rms_norm)  # every argument is forwarded positionally, in order; the op's result is returned as-is
+
+def dropout_add_ln_parallel_residual_bwd(dz0, dz1, dx, x, mu, rsigma, gamma0, gamma1, dropout_p, has_x1, has_residual, is_rms_norm):  # Python-level entry point for the compiled op of the same name.
+ return ops.dropout_add_ln_parallel_residual_bwd(dz0, dz1, dx, x, mu, rsigma, gamma0, gamma1, dropout_p, has_x1, has_residual, is_rms_norm)  # every argument is forwarded positionally, in order; the op's result is returned as-is
+
+__all__ = [  # names exported by a star-import of this package
+ "layers",  # the submodule imported above, not a function
+ "dropout_add_ln_fwd",  # the remaining entries are the four wrapper functions defined in this file
+ "dropout_add_ln_bwd",
+ "dropout_add_ln_parallel_residual_fwd",
+ "dropout_add_ln_parallel_residual_bwd",
+]
\ No newline at end of file
diff --git a/build/torch210-cxx11-cu126-aarch64-linux/_layer_norm_cuda_73ccd0c.abi3.so b/build/torch210-cxx11-cu126-aarch64-linux/_layer_norm_cuda_73ccd0c.abi3.so
new file mode 100644
index 0000000000000000000000000000000000000000..22ed7c8dfe85ee9d915de7bf81b91a2c1f1e77b1
--- /dev/null
+++ b/build/torch210-cxx11-cu126-aarch64-linux/_layer_norm_cuda_73ccd0c.abi3.so
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:dc6cbb986c0ac1160fd75d6db0033b133350ede8d51c815cd6c821d7e2c512a1
+size 711710472
diff --git a/build/torch210-cxx11-cu126-aarch64-linux/_ops.py b/build/torch210-cxx11-cu126-aarch64-linux/_ops.py
new file mode 100644
index 0000000000000000000000000000000000000000..ac84e05d9b8d22d65b0113f991fac1560deb7503
--- /dev/null
+++ b/build/torch210-cxx11-cu126-aarch64-linux/_ops.py
@@ -0,0 +1,9 @@
+import torch
+from . import _layer_norm_cuda_73ccd0c  # compiled extension shipped next to this file; presumably importing it registers the ops used below — TODO confirm
+ops = torch.ops._layer_norm_cuda_73ccd0c  # op-namespace handle that the package's other modules import as `ops`
+
+def add_op_namespace_prefix(op_name: str):  # builds the qualified op name for this build's namespace
+ """
+ Prefix op by namespace.
+ """
+ return f"_layer_norm_cuda_73ccd0c::{op_name}"  # result has the form "_layer_norm_cuda_73ccd0c::<op_name>"
diff --git a/build/torch210-cxx11-cu126-aarch64-linux/layer_norm/__init__.py b/build/torch210-cxx11-cu126-aarch64-linux/layer_norm/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..a9b2672c1cd85b74c1b3ded0fc0b2100e1aeac23
--- /dev/null
+++ b/build/torch210-cxx11-cu126-aarch64-linux/layer_norm/__init__.py
@@ -0,0 +1,26 @@
+import ctypes
+import importlib.util
+import sys
+from pathlib import Path
+from types import ModuleType
+
+
+def _import_from_path(file_path: Path) -> ModuleType:  # Execute the file at file_path as a module and return that module.
+ # The module gets registered in `sys.modules`, so its name must not clash
+ # with a name that other imports could look up. The name is therefore
+ # derived from the path: the hash of the absolute path, reinterpreted as an
+ # unsigned size_t (so there is no minus sign) and hex-encoded.
+ path_hash = "{:x}".format(ctypes.c_size_t(hash(file_path.absolute())).value)
+ module_name = path_hash
+ spec = importlib.util.spec_from_file_location(module_name, file_path)
+ if spec is None:  # no spec could be built for this path
+ raise ImportError(f"Cannot load spec for {module_name} from {file_path}")
+ module = importlib.util.module_from_spec(spec)
+ if module is None:  # defensive check — NOTE(review): module_from_spec is not documented to return None; confirm this branch is reachable
+ raise ImportError(f"Cannot load module {module_name} from spec")
+ sys.modules[module_name] = module  # registered before the module body runs
+ spec.loader.exec_module(module) # type: ignore
+ return module
+
+
+globals().update(vars(_import_from_path(Path(__file__).parent.parent / "__init__.py")))  # load the __init__.py one directory up and copy every entry of its namespace (dunder attributes included) into this module's globals
diff --git a/build/torch210-cxx11-cu126-aarch64-linux/layers.py b/build/torch210-cxx11-cu126-aarch64-linux/layers.py
new file mode 100644
index 0000000000000000000000000000000000000000..7ed883f42ead452f8b60f498ec11302c53d3cf74
--- /dev/null
+++ b/build/torch210-cxx11-cu126-aarch64-linux/layers.py
@@ -0,0 +1,51 @@
+import torch
+import torch.nn as nn
+
+from ._ops import ops
+
+
+class LayerNorm(nn.Module):  # Forward-only module: calls the compiled dropout_add_ln_fwd op with is_rms_norm=False, dropout_p=0 and no beta.
+ weight: torch.Tensor  # annotation only — nothing in this class assigns it; forward passes it as gamma
+ variance_epsilon: float  # annotation only — nothing in this class assigns it; forward passes it as epsilon
+
+ def forward(self, hidden_states: torch.Tensor) -> torch.Tensor:
+ output = ops.dropout_add_ln_fwd(
+ hidden_states.view(-1, hidden_states.shape[-1]),  # collapse all leading dims: 2-D view of shape (-1, size of last dim)
+ gamma = self.weight,
+ beta = None,
+ rowscale = None,
+ colscale = None,
+ x0_subset = None,
+ z_subset = None,
+ dropout_p = 0,
+ epsilon = self.variance_epsilon,
+ rowscale_const = 1.0,
+ z_numrows = hidden_states.shape[1],  # size of dim 1 of the un-flattened input — NOTE(review): z_subset is None here; confirm the op ignores z_numrows in that case
+ gen = None,
+ residual_in_fp32 = False,
+ is_rms_norm = False,
+ )
+ return output[0].view(hidden_states.shape)  # element 0 of the op's result, viewed back in the input's original shape
+
+class LlamaRMSNorm(nn.Module):  # Forward-only module: calls the compiled dropout_add_ln_fwd op with is_rms_norm=True, dropout_p=0 and no beta.
+ weight: torch.Tensor  # annotation only — nothing in this class assigns it; forward passes it as gamma
+ variance_epsilon: float  # annotation only — nothing in this class assigns it; forward passes it as epsilon
+
+ def forward(self, hidden_states: torch.Tensor) -> torch.Tensor:
+ output = ops.dropout_add_ln_fwd(
+ hidden_states.view(-1, hidden_states.shape[-1]),  # collapse all leading dims: 2-D view of shape (-1, size of last dim)
+ gamma = self.weight,
+ beta = None,
+ rowscale = None,
+ colscale = None,
+ x0_subset = None,
+ z_subset = None,
+ dropout_p = 0,
+ epsilon = self.variance_epsilon,
+ rowscale_const = 1.0,
+ z_numrows = hidden_states.shape[1],  # size of dim 1 of the un-flattened input — NOTE(review): z_subset is None here; confirm the op ignores z_numrows in that case
+ gen = None,
+ residual_in_fp32 = False,
+ is_rms_norm = True,
+ )
+ return output[0].view(hidden_states.shape)  # element 0 of the op's result, viewed back in the input's original shape
\ No newline at end of file
diff --git a/build/torch210-cxx11-cu126-aarch64-linux/metadata.json b/build/torch210-cxx11-cu126-aarch64-linux/metadata.json
new file mode 100644
index 0000000000000000000000000000000000000000..e16b07bc42edabbfb0c8f5e9a87013306d25ea4f
--- /dev/null
+++ b/build/torch210-cxx11-cu126-aarch64-linux/metadata.json
@@ -0,0 +1,15 @@
+{
+ "name": "layer-norm",
+ "id": "_layer_norm_cuda_73ccd0c",
+ "version": 1,
+ "license": "BSD-3-Clause",
+ "python-depends": [],
+ "backend": {
+ "type": "cuda",
+ "archs": [
+ "8.0",
+ "8.9",
+ "9.0"
+ ]
+ }
+}
diff --git a/build/torch210-cxx11-cu126-x86_64-linux/__init__.py b/build/torch210-cxx11-cu126-x86_64-linux/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..946160f16b9dc91fefeea037fb7ac84fd6afd802
--- /dev/null
+++ b/build/torch210-cxx11-cu126-x86_64-linux/__init__.py
@@ -0,0 +1,26 @@
+import torch
+import torch.nn as nn
+
+from ._ops import ops
+
+from . import layers
+
+def dropout_add_ln_fwd(input, gamma, beta, rowscale, colscale, x0_subset, z_subset, dropout_p, epsilon, rowscale_const, z_numrows, gen, residual_in_fp32, is_rms_norm):  # Python-level entry point for the compiled op of the same name.
+ return ops.dropout_add_ln_fwd(input, gamma, beta, rowscale, colscale, x0_subset, z_subset, dropout_p, epsilon, rowscale_const, z_numrows, gen, residual_in_fp32, is_rms_norm)  # every argument is forwarded positionally, in order; the op's result is returned as-is
+
+def dropout_add_ln_bwd(dz, dx, x, mu, rsigma, gamma, rowscale, colscale, x0_subset, z_subset, dropout_p, rowscale_const, x0_numrows, has_residual, is_rms_norm):  # Python-level entry point for the compiled op of the same name.
+ return ops.dropout_add_ln_bwd(dz, dx, x, mu, rsigma, gamma, rowscale, colscale, x0_subset, z_subset, dropout_p, rowscale_const, x0_numrows, has_residual, is_rms_norm)  # every argument is forwarded positionally, in order; the op's result is returned as-is
+
+def dropout_add_ln_parallel_residual_fwd(input, gamma0, beta0, gamma1, beta1, dropout_p, epsilon, gen, residual_in_fp32, is_rms_norm):  # Python-level entry point for the compiled op of the same name.
+ return ops.dropout_add_ln_parallel_residual_fwd(input, gamma0, beta0, gamma1, beta1, dropout_p, epsilon, gen, residual_in_fp32, is_rms_norm)  # every argument is forwarded positionally, in order; the op's result is returned as-is
+
+def dropout_add_ln_parallel_residual_bwd(dz0, dz1, dx, x, mu, rsigma, gamma0, gamma1, dropout_p, has_x1, has_residual, is_rms_norm):  # Python-level entry point for the compiled op of the same name.
+ return ops.dropout_add_ln_parallel_residual_bwd(dz0, dz1, dx, x, mu, rsigma, gamma0, gamma1, dropout_p, has_x1, has_residual, is_rms_norm)  # every argument is forwarded positionally, in order; the op's result is returned as-is
+
+__all__ = [  # names exported by a star-import of this package
+ "layers",  # the submodule imported above, not a function
+ "dropout_add_ln_fwd",  # the remaining entries are the four wrapper functions defined in this file
+ "dropout_add_ln_bwd",
+ "dropout_add_ln_parallel_residual_fwd",
+ "dropout_add_ln_parallel_residual_bwd",
+]
\ No newline at end of file
diff --git a/build/torch210-cxx11-cu126-x86_64-linux/_layer_norm_cuda_73ccd0c.abi3.so b/build/torch210-cxx11-cu126-x86_64-linux/_layer_norm_cuda_73ccd0c.abi3.so
new file mode 100644
index 0000000000000000000000000000000000000000..c8a7a43d72bf15271672a0053c815f210c9522d4
--- /dev/null
+++ b/build/torch210-cxx11-cu126-x86_64-linux/_layer_norm_cuda_73ccd0c.abi3.so
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:e1a95a30fff3a4b64414535756ed2d26fc50321c6caeb284a4f1e2e46cfe04dd
+size 712093824
diff --git a/build/torch210-cxx11-cu126-x86_64-linux/_ops.py b/build/torch210-cxx11-cu126-x86_64-linux/_ops.py
new file mode 100644
index 0000000000000000000000000000000000000000..ac84e05d9b8d22d65b0113f991fac1560deb7503
--- /dev/null
+++ b/build/torch210-cxx11-cu126-x86_64-linux/_ops.py
@@ -0,0 +1,9 @@
+import torch
+from . import _layer_norm_cuda_73ccd0c  # compiled extension shipped next to this file; presumably importing it registers the ops used below — TODO confirm
+ops = torch.ops._layer_norm_cuda_73ccd0c  # op-namespace handle that the package's other modules import as `ops`
+
+def add_op_namespace_prefix(op_name: str):  # builds the qualified op name for this build's namespace
+ """
+ Prefix op by namespace.
+ """
+ return f"_layer_norm_cuda_73ccd0c::{op_name}"  # result has the form "_layer_norm_cuda_73ccd0c::<op_name>"
diff --git a/build/torch210-cxx11-cu126-x86_64-linux/layer_norm/__init__.py b/build/torch210-cxx11-cu126-x86_64-linux/layer_norm/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..a9b2672c1cd85b74c1b3ded0fc0b2100e1aeac23
--- /dev/null
+++ b/build/torch210-cxx11-cu126-x86_64-linux/layer_norm/__init__.py
@@ -0,0 +1,26 @@
+import ctypes
+import importlib.util
+import sys
+from pathlib import Path
+from types import ModuleType
+
+
+def _import_from_path(file_path: Path) -> ModuleType:  # Execute the file at file_path as a module and return that module.
+ # The module gets registered in `sys.modules`, so its name must not clash
+ # with a name that other imports could look up. The name is therefore
+ # derived from the path: the hash of the absolute path, reinterpreted as an
+ # unsigned size_t (so there is no minus sign) and hex-encoded.
+ path_hash = "{:x}".format(ctypes.c_size_t(hash(file_path.absolute())).value)
+ module_name = path_hash
+ spec = importlib.util.spec_from_file_location(module_name, file_path)
+ if spec is None:  # no spec could be built for this path
+ raise ImportError(f"Cannot load spec for {module_name} from {file_path}")
+ module = importlib.util.module_from_spec(spec)
+ if module is None:  # defensive check — NOTE(review): module_from_spec is not documented to return None; confirm this branch is reachable
+ raise ImportError(f"Cannot load module {module_name} from spec")
+ sys.modules[module_name] = module  # registered before the module body runs
+ spec.loader.exec_module(module) # type: ignore
+ return module
+
+
+globals().update(vars(_import_from_path(Path(__file__).parent.parent / "__init__.py")))  # load the __init__.py one directory up and copy every entry of its namespace (dunder attributes included) into this module's globals
diff --git a/build/torch210-cxx11-cu126-x86_64-linux/layers.py b/build/torch210-cxx11-cu126-x86_64-linux/layers.py
new file mode 100644
index 0000000000000000000000000000000000000000..7ed883f42ead452f8b60f498ec11302c53d3cf74
--- /dev/null
+++ b/build/torch210-cxx11-cu126-x86_64-linux/layers.py
@@ -0,0 +1,51 @@
+import torch
+import torch.nn as nn
+
+from ._ops import ops
+
+
+class LayerNorm(nn.Module):  # Forward-only module: calls the compiled dropout_add_ln_fwd op with is_rms_norm=False, dropout_p=0 and no beta.
+ weight: torch.Tensor  # annotation only — nothing in this class assigns it; forward passes it as gamma
+ variance_epsilon: float  # annotation only — nothing in this class assigns it; forward passes it as epsilon
+
+ def forward(self, hidden_states: torch.Tensor) -> torch.Tensor:
+ output = ops.dropout_add_ln_fwd(
+ hidden_states.view(-1, hidden_states.shape[-1]),  # collapse all leading dims: 2-D view of shape (-1, size of last dim)
+ gamma = self.weight,
+ beta = None,
+ rowscale = None,
+ colscale = None,
+ x0_subset = None,
+ z_subset = None,
+ dropout_p = 0,
+ epsilon = self.variance_epsilon,
+ rowscale_const = 1.0,
+ z_numrows = hidden_states.shape[1],  # size of dim 1 of the un-flattened input — NOTE(review): z_subset is None here; confirm the op ignores z_numrows in that case
+ gen = None,
+ residual_in_fp32 = False,
+ is_rms_norm = False,
+ )
+ return output[0].view(hidden_states.shape)  # element 0 of the op's result, viewed back in the input's original shape
+
+class LlamaRMSNorm(nn.Module):  # Forward-only module: calls the compiled dropout_add_ln_fwd op with is_rms_norm=True, dropout_p=0 and no beta.
+ weight: torch.Tensor  # annotation only — nothing in this class assigns it; forward passes it as gamma
+ variance_epsilon: float  # annotation only — nothing in this class assigns it; forward passes it as epsilon
+
+ def forward(self, hidden_states: torch.Tensor) -> torch.Tensor:
+ output = ops.dropout_add_ln_fwd(
+ hidden_states.view(-1, hidden_states.shape[-1]),  # collapse all leading dims: 2-D view of shape (-1, size of last dim)
+ gamma = self.weight,
+ beta = None,
+ rowscale = None,
+ colscale = None,
+ x0_subset = None,
+ z_subset = None,
+ dropout_p = 0,
+ epsilon = self.variance_epsilon,
+ rowscale_const = 1.0,
+ z_numrows = hidden_states.shape[1],  # size of dim 1 of the un-flattened input — NOTE(review): z_subset is None here; confirm the op ignores z_numrows in that case
+ gen = None,
+ residual_in_fp32 = False,
+ is_rms_norm = True,
+ )
+ return output[0].view(hidden_states.shape)  # element 0 of the op's result, viewed back in the input's original shape
\ No newline at end of file
diff --git a/build/torch210-cxx11-cu126-x86_64-linux/metadata.json b/build/torch210-cxx11-cu126-x86_64-linux/metadata.json
new file mode 100644
index 0000000000000000000000000000000000000000..e16b07bc42edabbfb0c8f5e9a87013306d25ea4f
--- /dev/null
+++ b/build/torch210-cxx11-cu126-x86_64-linux/metadata.json
@@ -0,0 +1,15 @@
+{
+ "name": "layer-norm",
+ "id": "_layer_norm_cuda_73ccd0c",
+ "version": 1,
+ "license": "BSD-3-Clause",
+ "python-depends": [],
+ "backend": {
+ "type": "cuda",
+ "archs": [
+ "8.0",
+ "8.9",
+ "9.0"
+ ]
+ }
+}
diff --git a/build/torch210-cxx11-cu128-aarch64-linux/__init__.py b/build/torch210-cxx11-cu128-aarch64-linux/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..946160f16b9dc91fefeea037fb7ac84fd6afd802
--- /dev/null
+++ b/build/torch210-cxx11-cu128-aarch64-linux/__init__.py
@@ -0,0 +1,26 @@
+import torch
+import torch.nn as nn
+
+from ._ops import ops
+
+from . import layers
+
+def dropout_add_ln_fwd(input, gamma, beta, rowscale, colscale, x0_subset, z_subset, dropout_p, epsilon, rowscale_const, z_numrows, gen, residual_in_fp32, is_rms_norm):  # Python-level entry point for the compiled op of the same name.
+ return ops.dropout_add_ln_fwd(input, gamma, beta, rowscale, colscale, x0_subset, z_subset, dropout_p, epsilon, rowscale_const, z_numrows, gen, residual_in_fp32, is_rms_norm)  # every argument is forwarded positionally, in order; the op's result is returned as-is
+
+def dropout_add_ln_bwd(dz, dx, x, mu, rsigma, gamma, rowscale, colscale, x0_subset, z_subset, dropout_p, rowscale_const, x0_numrows, has_residual, is_rms_norm):  # Python-level entry point for the compiled op of the same name.
+ return ops.dropout_add_ln_bwd(dz, dx, x, mu, rsigma, gamma, rowscale, colscale, x0_subset, z_subset, dropout_p, rowscale_const, x0_numrows, has_residual, is_rms_norm)  # every argument is forwarded positionally, in order; the op's result is returned as-is
+
+def dropout_add_ln_parallel_residual_fwd(input, gamma0, beta0, gamma1, beta1, dropout_p, epsilon, gen, residual_in_fp32, is_rms_norm):  # Python-level entry point for the compiled op of the same name.
+ return ops.dropout_add_ln_parallel_residual_fwd(input, gamma0, beta0, gamma1, beta1, dropout_p, epsilon, gen, residual_in_fp32, is_rms_norm)  # every argument is forwarded positionally, in order; the op's result is returned as-is
+
+def dropout_add_ln_parallel_residual_bwd(dz0, dz1, dx, x, mu, rsigma, gamma0, gamma1, dropout_p, has_x1, has_residual, is_rms_norm):  # Python-level entry point for the compiled op of the same name.
+ return ops.dropout_add_ln_parallel_residual_bwd(dz0, dz1, dx, x, mu, rsigma, gamma0, gamma1, dropout_p, has_x1, has_residual, is_rms_norm)  # every argument is forwarded positionally, in order; the op's result is returned as-is
+
+__all__ = [  # names exported by a star-import of this package
+ "layers",  # the submodule imported above, not a function
+ "dropout_add_ln_fwd",  # the remaining entries are the four wrapper functions defined in this file
+ "dropout_add_ln_bwd",
+ "dropout_add_ln_parallel_residual_fwd",
+ "dropout_add_ln_parallel_residual_bwd",
+]
\ No newline at end of file
diff --git a/build/torch210-cxx11-cu128-aarch64-linux/_layer_norm_cuda_73ccd0c.abi3.so b/build/torch210-cxx11-cu128-aarch64-linux/_layer_norm_cuda_73ccd0c.abi3.so
new file mode 100644
index 0000000000000000000000000000000000000000..37480e3d91f02c1fee53939897394a3120e23283
--- /dev/null
+++ b/build/torch210-cxx11-cu128-aarch64-linux/_layer_norm_cuda_73ccd0c.abi3.so
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:b32a3daff6960337d42eb4b5484b2fc628e773616e3361bcce21d656d477096d
+size 1231083200
diff --git a/build/torch210-cxx11-cu128-aarch64-linux/_ops.py b/build/torch210-cxx11-cu128-aarch64-linux/_ops.py
new file mode 100644
index 0000000000000000000000000000000000000000..ac84e05d9b8d22d65b0113f991fac1560deb7503
--- /dev/null
+++ b/build/torch210-cxx11-cu128-aarch64-linux/_ops.py
@@ -0,0 +1,9 @@
+import torch
+from . import _layer_norm_cuda_73ccd0c  # compiled extension shipped next to this file; presumably importing it registers the ops used below — TODO confirm
+ops = torch.ops._layer_norm_cuda_73ccd0c  # op-namespace handle that the package's other modules import as `ops`
+
+def add_op_namespace_prefix(op_name: str):  # builds the qualified op name for this build's namespace
+ """
+ Prefix op by namespace.
+ """
+ return f"_layer_norm_cuda_73ccd0c::{op_name}"  # result has the form "_layer_norm_cuda_73ccd0c::<op_name>"
diff --git a/build/torch210-cxx11-cu128-aarch64-linux/layer_norm/__init__.py b/build/torch210-cxx11-cu128-aarch64-linux/layer_norm/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..a9b2672c1cd85b74c1b3ded0fc0b2100e1aeac23
--- /dev/null
+++ b/build/torch210-cxx11-cu128-aarch64-linux/layer_norm/__init__.py
@@ -0,0 +1,26 @@
+import ctypes
+import importlib.util
+import sys
+from pathlib import Path
+from types import ModuleType
+
+
+def _import_from_path(file_path: Path) -> ModuleType:  # Execute the file at file_path as a module and return that module.
+ # The module gets registered in `sys.modules`, so its name must not clash
+ # with a name that other imports could look up. The name is therefore
+ # derived from the path: the hash of the absolute path, reinterpreted as an
+ # unsigned size_t (so there is no minus sign) and hex-encoded.
+ path_hash = "{:x}".format(ctypes.c_size_t(hash(file_path.absolute())).value)
+ module_name = path_hash
+ spec = importlib.util.spec_from_file_location(module_name, file_path)
+ if spec is None:  # no spec could be built for this path
+ raise ImportError(f"Cannot load spec for {module_name} from {file_path}")
+ module = importlib.util.module_from_spec(spec)
+ if module is None:  # defensive check — NOTE(review): module_from_spec is not documented to return None; confirm this branch is reachable
+ raise ImportError(f"Cannot load module {module_name} from spec")
+ sys.modules[module_name] = module  # registered before the module body runs
+ spec.loader.exec_module(module) # type: ignore
+ return module
+
+
+globals().update(vars(_import_from_path(Path(__file__).parent.parent / "__init__.py")))  # load the __init__.py one directory up and copy every entry of its namespace (dunder attributes included) into this module's globals
diff --git a/build/torch210-cxx11-cu128-aarch64-linux/layers.py b/build/torch210-cxx11-cu128-aarch64-linux/layers.py
new file mode 100644
index 0000000000000000000000000000000000000000..7ed883f42ead452f8b60f498ec11302c53d3cf74
--- /dev/null
+++ b/build/torch210-cxx11-cu128-aarch64-linux/layers.py
@@ -0,0 +1,51 @@
+import torch
+import torch.nn as nn
+
+from ._ops import ops
+
+
+class LayerNorm(nn.Module):  # Forward-only module: calls the compiled dropout_add_ln_fwd op with is_rms_norm=False, dropout_p=0 and no beta.
+ weight: torch.Tensor  # annotation only — nothing in this class assigns it; forward passes it as gamma
+ variance_epsilon: float  # annotation only — nothing in this class assigns it; forward passes it as epsilon
+
+ def forward(self, hidden_states: torch.Tensor) -> torch.Tensor:
+ output = ops.dropout_add_ln_fwd(
+ hidden_states.view(-1, hidden_states.shape[-1]),  # collapse all leading dims: 2-D view of shape (-1, size of last dim)
+ gamma = self.weight,
+ beta = None,
+ rowscale = None,
+ colscale = None,
+ x0_subset = None,
+ z_subset = None,
+ dropout_p = 0,
+ epsilon = self.variance_epsilon,
+ rowscale_const = 1.0,
+ z_numrows = hidden_states.shape[1],  # size of dim 1 of the un-flattened input — NOTE(review): z_subset is None here; confirm the op ignores z_numrows in that case
+ gen = None,
+ residual_in_fp32 = False,
+ is_rms_norm = False,
+ )
+ return output[0].view(hidden_states.shape)  # element 0 of the op's result, viewed back in the input's original shape
+
+class LlamaRMSNorm(nn.Module):  # Forward-only module: calls the compiled dropout_add_ln_fwd op with is_rms_norm=True, dropout_p=0 and no beta.
+ weight: torch.Tensor  # annotation only — nothing in this class assigns it; forward passes it as gamma
+ variance_epsilon: float  # annotation only — nothing in this class assigns it; forward passes it as epsilon
+
+ def forward(self, hidden_states: torch.Tensor) -> torch.Tensor:
+ output = ops.dropout_add_ln_fwd(
+ hidden_states.view(-1, hidden_states.shape[-1]),  # collapse all leading dims: 2-D view of shape (-1, size of last dim)
+ gamma = self.weight,
+ beta = None,
+ rowscale = None,
+ colscale = None,
+ x0_subset = None,
+ z_subset = None,
+ dropout_p = 0,
+ epsilon = self.variance_epsilon,
+ rowscale_const = 1.0,
+ z_numrows = hidden_states.shape[1],  # size of dim 1 of the un-flattened input — NOTE(review): z_subset is None here; confirm the op ignores z_numrows in that case
+ gen = None,
+ residual_in_fp32 = False,
+ is_rms_norm = True,
+ )
+ return output[0].view(hidden_states.shape)  # element 0 of the op's result, viewed back in the input's original shape
\ No newline at end of file
diff --git a/build/torch210-cxx11-cu128-aarch64-linux/metadata.json b/build/torch210-cxx11-cu128-aarch64-linux/metadata.json
new file mode 100644
index 0000000000000000000000000000000000000000..75fccb89550243db092fbc5d218aa4b497699bed
--- /dev/null
+++ b/build/torch210-cxx11-cu128-aarch64-linux/metadata.json
@@ -0,0 +1,17 @@
+{
+ "name": "layer-norm",
+ "id": "_layer_norm_cuda_73ccd0c",
+ "version": 1,
+ "license": "BSD-3-Clause",
+ "python-depends": [],
+ "backend": {
+ "type": "cuda",
+ "archs": [
+ "10.0",
+ "12.0",
+ "8.0",
+ "8.9",
+ "9.0"
+ ]
+ }
+}
diff --git a/build/torch210-cxx11-cu128-x86_64-linux/__init__.py b/build/torch210-cxx11-cu128-x86_64-linux/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..946160f16b9dc91fefeea037fb7ac84fd6afd802
--- /dev/null
+++ b/build/torch210-cxx11-cu128-x86_64-linux/__init__.py
@@ -0,0 +1,26 @@
+import torch
+import torch.nn as nn
+
+from ._ops import ops
+
+from . import layers
+
+def dropout_add_ln_fwd(input, gamma, beta, rowscale, colscale, x0_subset, z_subset, dropout_p, epsilon, rowscale_const, z_numrows, gen, residual_in_fp32, is_rms_norm):  # Python-level entry point for the compiled op of the same name.
+ return ops.dropout_add_ln_fwd(input, gamma, beta, rowscale, colscale, x0_subset, z_subset, dropout_p, epsilon, rowscale_const, z_numrows, gen, residual_in_fp32, is_rms_norm)  # every argument is forwarded positionally, in order; the op's result is returned as-is
+
+def dropout_add_ln_bwd(dz, dx, x, mu, rsigma, gamma, rowscale, colscale, x0_subset, z_subset, dropout_p, rowscale_const, x0_numrows, has_residual, is_rms_norm):  # Python-level entry point for the compiled op of the same name.
+ return ops.dropout_add_ln_bwd(dz, dx, x, mu, rsigma, gamma, rowscale, colscale, x0_subset, z_subset, dropout_p, rowscale_const, x0_numrows, has_residual, is_rms_norm)  # every argument is forwarded positionally, in order; the op's result is returned as-is
+
+def dropout_add_ln_parallel_residual_fwd(input, gamma0, beta0, gamma1, beta1, dropout_p, epsilon, gen, residual_in_fp32, is_rms_norm):  # Python-level entry point for the compiled op of the same name.
+ return ops.dropout_add_ln_parallel_residual_fwd(input, gamma0, beta0, gamma1, beta1, dropout_p, epsilon, gen, residual_in_fp32, is_rms_norm)  # every argument is forwarded positionally, in order; the op's result is returned as-is
+
+def dropout_add_ln_parallel_residual_bwd(dz0, dz1, dx, x, mu, rsigma, gamma0, gamma1, dropout_p, has_x1, has_residual, is_rms_norm):  # Python-level entry point for the compiled op of the same name.
+ return ops.dropout_add_ln_parallel_residual_bwd(dz0, dz1, dx, x, mu, rsigma, gamma0, gamma1, dropout_p, has_x1, has_residual, is_rms_norm)  # every argument is forwarded positionally, in order; the op's result is returned as-is
+
+__all__ = [  # names exported by a star-import of this package
+ "layers",  # the submodule imported above, not a function
+ "dropout_add_ln_fwd",  # the remaining entries are the four wrapper functions defined in this file
+ "dropout_add_ln_bwd",
+ "dropout_add_ln_parallel_residual_fwd",
+ "dropout_add_ln_parallel_residual_bwd",
+]
\ No newline at end of file
diff --git a/build/torch210-cxx11-cu128-x86_64-linux/_layer_norm_cuda_73ccd0c.abi3.so b/build/torch210-cxx11-cu128-x86_64-linux/_layer_norm_cuda_73ccd0c.abi3.so
new file mode 100644
index 0000000000000000000000000000000000000000..522ce34f387a0c26ed90b76f88a77cb18a6a786f
--- /dev/null
+++ b/build/torch210-cxx11-cu128-x86_64-linux/_layer_norm_cuda_73ccd0c.abi3.so
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:f4660dc3f5e3cc4d0531fdebd5d0df2d082b0fad6599a8e63eabcc42b2cedada
+size 1231419520
diff --git a/build/torch210-cxx11-cu128-x86_64-linux/_ops.py b/build/torch210-cxx11-cu128-x86_64-linux/_ops.py
new file mode 100644
index 0000000000000000000000000000000000000000..ac84e05d9b8d22d65b0113f991fac1560deb7503
--- /dev/null
+++ b/build/torch210-cxx11-cu128-x86_64-linux/_ops.py
@@ -0,0 +1,9 @@
+import torch
+from . import _layer_norm_cuda_73ccd0c  # compiled extension shipped next to this file; presumably importing it registers the ops used below — TODO confirm
+ops = torch.ops._layer_norm_cuda_73ccd0c  # op-namespace handle that the package's other modules import as `ops`
+
+def add_op_namespace_prefix(op_name: str):  # builds the qualified op name for this build's namespace
+ """
+ Prefix op by namespace.
+ """
+ return f"_layer_norm_cuda_73ccd0c::{op_name}"  # result has the form "_layer_norm_cuda_73ccd0c::<op_name>"
diff --git a/build/torch210-cxx11-cu128-x86_64-linux/layer_norm/__init__.py b/build/torch210-cxx11-cu128-x86_64-linux/layer_norm/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..a9b2672c1cd85b74c1b3ded0fc0b2100e1aeac23
--- /dev/null
+++ b/build/torch210-cxx11-cu128-x86_64-linux/layer_norm/__init__.py
@@ -0,0 +1,26 @@
+import ctypes
+import importlib.util
+import sys
+from pathlib import Path
+from types import ModuleType
+
+
+def _import_from_path(file_path: Path) -> ModuleType:
+ # We cannot use the module name as-is, after adding it to `sys.modules`,
+ # it would also be used for other imports. So, we make a module name that
+ # depends on the path for it to be unique using the hex-encoded hash of
+ # the path.
+ path_hash = "{:x}".format(ctypes.c_size_t(hash(file_path.absolute())).value)
+ module_name = path_hash
+ spec = importlib.util.spec_from_file_location(module_name, file_path)
+ if spec is None:
+ raise ImportError(f"Cannot load spec for {module_name} from {file_path}")
+ module = importlib.util.module_from_spec(spec)
+ if module is None:
+ raise ImportError(f"Cannot load module {module_name} from spec")
+ sys.modules[module_name] = module
+ spec.loader.exec_module(module) # type: ignore
+ return module
+
+
+globals().update(vars(_import_from_path(Path(__file__).parent.parent / "__init__.py")))
diff --git a/build/torch210-cxx11-cu128-x86_64-linux/layers.py b/build/torch210-cxx11-cu128-x86_64-linux/layers.py
new file mode 100644
index 0000000000000000000000000000000000000000..7ed883f42ead452f8b60f498ec11302c53d3cf74
--- /dev/null
+++ b/build/torch210-cxx11-cu128-x86_64-linux/layers.py
@@ -0,0 +1,51 @@
+import torch
+import torch.nn as nn
+
+from ._ops import ops
+
+
+class LayerNorm(nn.Module):
+ weight: torch.Tensor
+ variance_epsilon: float
+
+ def forward(self, hidden_states: torch.Tensor) -> torch.Tensor:
+ output = ops.dropout_add_ln_fwd(
+ hidden_states.view(-1, hidden_states.shape[-1]),
+ gamma = self.weight,
+ beta = None,
+ rowscale = None,
+ colscale = None,
+ x0_subset = None,
+ z_subset = None,
+ dropout_p = 0,
+ epsilon = self.variance_epsilon,
+ rowscale_const = 1.0,
+ z_numrows = hidden_states.shape[1],
+ gen = None,
+ residual_in_fp32 = False,
+ is_rms_norm = False,
+ )
+ return output[0].view(hidden_states.shape)
+
+class LlamaRMSNorm(nn.Module):
+ weight: torch.Tensor
+ variance_epsilon: float
+
+ def forward(self, hidden_states: torch.Tensor) -> torch.Tensor:
+ output = ops.dropout_add_ln_fwd(
+ hidden_states.view(-1, hidden_states.shape[-1]),
+ gamma = self.weight,
+ beta = None,
+ rowscale = None,
+ colscale = None,
+ x0_subset = None,
+ z_subset = None,
+ dropout_p = 0,
+ epsilon = self.variance_epsilon,
+ rowscale_const = 1.0,
+ z_numrows = hidden_states.shape[1],
+ gen = None,
+ residual_in_fp32 = False,
+ is_rms_norm = True,
+ )
+ return output[0].view(hidden_states.shape)
\ No newline at end of file
diff --git a/build/torch210-cxx11-cu128-x86_64-linux/metadata.json b/build/torch210-cxx11-cu128-x86_64-linux/metadata.json
new file mode 100644
index 0000000000000000000000000000000000000000..75fccb89550243db092fbc5d218aa4b497699bed
--- /dev/null
+++ b/build/torch210-cxx11-cu128-x86_64-linux/metadata.json
@@ -0,0 +1,17 @@
+{
+ "name": "layer-norm",
+ "id": "_layer_norm_cuda_73ccd0c",
+ "version": 1,
+ "license": "BSD-3-Clause",
+ "python-depends": [],
+ "backend": {
+ "type": "cuda",
+ "archs": [
+ "10.0",
+ "12.0",
+ "8.0",
+ "8.9",
+ "9.0"
+ ]
+ }
+}
diff --git a/build/torch210-cxx11-cu130-aarch64-linux/__init__.py b/build/torch210-cxx11-cu130-aarch64-linux/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..946160f16b9dc91fefeea037fb7ac84fd6afd802
--- /dev/null
+++ b/build/torch210-cxx11-cu130-aarch64-linux/__init__.py
@@ -0,0 +1,26 @@
+import torch
+import torch.nn as nn
+
+from ._ops import ops
+
+from . import layers
+
+def dropout_add_ln_fwd(input, gamma, beta, rowscale, colscale, x0_subset, z_subset, dropout_p, epsilon, rowscale_const, z_numrows, gen, residual_in_fp32, is_rms_norm):
+ return ops.dropout_add_ln_fwd(input, gamma, beta, rowscale, colscale, x0_subset, z_subset, dropout_p, epsilon, rowscale_const, z_numrows, gen, residual_in_fp32, is_rms_norm)
+
+def dropout_add_ln_bwd(dz, dx, x, mu, rsigma, gamma, rowscale, colscale, x0_subset, z_subset, dropout_p, rowscale_const, x0_numrows, has_residual, is_rms_norm):
+ return ops.dropout_add_ln_bwd(dz, dx, x, mu, rsigma, gamma, rowscale, colscale, x0_subset, z_subset, dropout_p, rowscale_const, x0_numrows, has_residual, is_rms_norm)
+
+def dropout_add_ln_parallel_residual_fwd(input, gamma0, beta0, gamma1, beta1, dropout_p, epsilon, gen, residual_in_fp32, is_rms_norm):
+ return ops.dropout_add_ln_parallel_residual_fwd(input, gamma0, beta0, gamma1, beta1, dropout_p, epsilon, gen, residual_in_fp32, is_rms_norm)
+
+def dropout_add_ln_parallel_residual_bwd(dz0, dz1, dx, x, mu, rsigma, gamma0, gamma1, dropout_p, has_x1, has_residual, is_rms_norm):
+ return ops.dropout_add_ln_parallel_residual_bwd(dz0, dz1, dx, x, mu, rsigma, gamma0, gamma1, dropout_p, has_x1, has_residual, is_rms_norm)
+
+__all__ = [
+ "layers",
+ "dropout_add_ln_fwd",
+ "dropout_add_ln_bwd",
+ "dropout_add_ln_parallel_residual_fwd",
+ "dropout_add_ln_parallel_residual_bwd",
+]
\ No newline at end of file
diff --git a/build/torch210-cxx11-cu130-aarch64-linux/_layer_norm_cuda_73ccd0c.abi3.so b/build/torch210-cxx11-cu130-aarch64-linux/_layer_norm_cuda_73ccd0c.abi3.so
new file mode 100644
index 0000000000000000000000000000000000000000..0e8093ad578fcf418a995b83b892e465603b4f9c
--- /dev/null
+++ b/build/torch210-cxx11-cu130-aarch64-linux/_layer_norm_cuda_73ccd0c.abi3.so
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:f1be4d1ef49363641003ad084d112971753c08b1c8ce6755ce073d7e6fce171c
+size 1235994200
diff --git a/build/torch210-cxx11-cu130-aarch64-linux/_ops.py b/build/torch210-cxx11-cu130-aarch64-linux/_ops.py
new file mode 100644
index 0000000000000000000000000000000000000000..ac84e05d9b8d22d65b0113f991fac1560deb7503
--- /dev/null
+++ b/build/torch210-cxx11-cu130-aarch64-linux/_ops.py
@@ -0,0 +1,9 @@
+import torch
+from . import _layer_norm_cuda_73ccd0c
+ops = torch.ops._layer_norm_cuda_73ccd0c
+
+def add_op_namespace_prefix(op_name: str):
+ """
+ Prefix op by namespace.
+ """
+ return f"_layer_norm_cuda_73ccd0c::{op_name}"
diff --git a/build/torch210-cxx11-cu130-aarch64-linux/layer_norm/__init__.py b/build/torch210-cxx11-cu130-aarch64-linux/layer_norm/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..a9b2672c1cd85b74c1b3ded0fc0b2100e1aeac23
--- /dev/null
+++ b/build/torch210-cxx11-cu130-aarch64-linux/layer_norm/__init__.py
@@ -0,0 +1,26 @@
+import ctypes
+import importlib.util
+import sys
+from pathlib import Path
+from types import ModuleType
+
+
+def _import_from_path(file_path: Path) -> ModuleType:
+ # We cannot use the module name as-is, after adding it to `sys.modules`,
+ # it would also be used for other imports. So, we make a module name that
+ # depends on the path for it to be unique using the hex-encoded hash of
+ # the path.
+ path_hash = "{:x}".format(ctypes.c_size_t(hash(file_path.absolute())).value)
+ module_name = path_hash
+ spec = importlib.util.spec_from_file_location(module_name, file_path)
+ if spec is None:
+ raise ImportError(f"Cannot load spec for {module_name} from {file_path}")
+ module = importlib.util.module_from_spec(spec)
+ if module is None:
+ raise ImportError(f"Cannot load module {module_name} from spec")
+ sys.modules[module_name] = module
+ spec.loader.exec_module(module) # type: ignore
+ return module
+
+
+globals().update(vars(_import_from_path(Path(__file__).parent.parent / "__init__.py")))
diff --git a/build/torch210-cxx11-cu130-aarch64-linux/layers.py b/build/torch210-cxx11-cu130-aarch64-linux/layers.py
new file mode 100644
index 0000000000000000000000000000000000000000..7ed883f42ead452f8b60f498ec11302c53d3cf74
--- /dev/null
+++ b/build/torch210-cxx11-cu130-aarch64-linux/layers.py
@@ -0,0 +1,51 @@
+import torch
+import torch.nn as nn
+
+from ._ops import ops
+
+
+class LayerNorm(nn.Module):
+ weight: torch.Tensor
+ variance_epsilon: float
+
+ def forward(self, hidden_states: torch.Tensor) -> torch.Tensor:
+ output = ops.dropout_add_ln_fwd(
+ hidden_states.view(-1, hidden_states.shape[-1]),
+ gamma = self.weight,
+ beta = None,
+ rowscale = None,
+ colscale = None,
+ x0_subset = None,
+ z_subset = None,
+ dropout_p = 0,
+ epsilon = self.variance_epsilon,
+ rowscale_const = 1.0,
+ z_numrows = hidden_states.shape[1],
+ gen = None,
+ residual_in_fp32 = False,
+ is_rms_norm = False,
+ )
+ return output[0].view(hidden_states.shape)
+
+class LlamaRMSNorm(nn.Module):
+ weight: torch.Tensor
+ variance_epsilon: float
+
+ def forward(self, hidden_states: torch.Tensor) -> torch.Tensor:
+ output = ops.dropout_add_ln_fwd(
+ hidden_states.view(-1, hidden_states.shape[-1]),
+ gamma = self.weight,
+ beta = None,
+ rowscale = None,
+ colscale = None,
+ x0_subset = None,
+ z_subset = None,
+ dropout_p = 0,
+ epsilon = self.variance_epsilon,
+ rowscale_const = 1.0,
+ z_numrows = hidden_states.shape[1],
+ gen = None,
+ residual_in_fp32 = False,
+ is_rms_norm = True,
+ )
+ return output[0].view(hidden_states.shape)
\ No newline at end of file
diff --git a/build/torch210-cxx11-cu130-aarch64-linux/metadata.json b/build/torch210-cxx11-cu130-aarch64-linux/metadata.json
new file mode 100644
index 0000000000000000000000000000000000000000..75fccb89550243db092fbc5d218aa4b497699bed
--- /dev/null
+++ b/build/torch210-cxx11-cu130-aarch64-linux/metadata.json
@@ -0,0 +1,17 @@
+{
+ "name": "layer-norm",
+ "id": "_layer_norm_cuda_73ccd0c",
+ "version": 1,
+ "license": "BSD-3-Clause",
+ "python-depends": [],
+ "backend": {
+ "type": "cuda",
+ "archs": [
+ "10.0",
+ "12.0",
+ "8.0",
+ "8.9",
+ "9.0"
+ ]
+ }
+}
diff --git a/build/torch210-cxx11-cu130-x86_64-linux/__init__.py b/build/torch210-cxx11-cu130-x86_64-linux/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..946160f16b9dc91fefeea037fb7ac84fd6afd802
--- /dev/null
+++ b/build/torch210-cxx11-cu130-x86_64-linux/__init__.py
@@ -0,0 +1,26 @@
+import torch
+import torch.nn as nn
+
+from ._ops import ops
+
+from . import layers
+
+def dropout_add_ln_fwd(input, gamma, beta, rowscale, colscale, x0_subset, z_subset, dropout_p, epsilon, rowscale_const, z_numrows, gen, residual_in_fp32, is_rms_norm):
+ return ops.dropout_add_ln_fwd(input, gamma, beta, rowscale, colscale, x0_subset, z_subset, dropout_p, epsilon, rowscale_const, z_numrows, gen, residual_in_fp32, is_rms_norm)
+
+def dropout_add_ln_bwd(dz, dx, x, mu, rsigma, gamma, rowscale, colscale, x0_subset, z_subset, dropout_p, rowscale_const, x0_numrows, has_residual, is_rms_norm):
+ return ops.dropout_add_ln_bwd(dz, dx, x, mu, rsigma, gamma, rowscale, colscale, x0_subset, z_subset, dropout_p, rowscale_const, x0_numrows, has_residual, is_rms_norm)
+
+def dropout_add_ln_parallel_residual_fwd(input, gamma0, beta0, gamma1, beta1, dropout_p, epsilon, gen, residual_in_fp32, is_rms_norm):
+ return ops.dropout_add_ln_parallel_residual_fwd(input, gamma0, beta0, gamma1, beta1, dropout_p, epsilon, gen, residual_in_fp32, is_rms_norm)
+
+def dropout_add_ln_parallel_residual_bwd(dz0, dz1, dx, x, mu, rsigma, gamma0, gamma1, dropout_p, has_x1, has_residual, is_rms_norm):
+ return ops.dropout_add_ln_parallel_residual_bwd(dz0, dz1, dx, x, mu, rsigma, gamma0, gamma1, dropout_p, has_x1, has_residual, is_rms_norm)
+
+__all__ = [
+ "layers",
+ "dropout_add_ln_fwd",
+ "dropout_add_ln_bwd",
+ "dropout_add_ln_parallel_residual_fwd",
+ "dropout_add_ln_parallel_residual_bwd",
+]
\ No newline at end of file
diff --git a/build/torch210-cxx11-cu130-x86_64-linux/_layer_norm_cuda_73ccd0c.abi3.so b/build/torch210-cxx11-cu130-x86_64-linux/_layer_norm_cuda_73ccd0c.abi3.so
new file mode 100644
index 0000000000000000000000000000000000000000..00326bbd2c440f0d8f3a256ffd65b77a222bdfd2
--- /dev/null
+++ b/build/torch210-cxx11-cu130-x86_64-linux/_layer_norm_cuda_73ccd0c.abi3.so
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:61244ac2828b69fe5445df5c6564764eb1b1b80c57312c24836b41595aaf4cc1
+size 1238402192
diff --git a/build/torch210-cxx11-cu130-x86_64-linux/_ops.py b/build/torch210-cxx11-cu130-x86_64-linux/_ops.py
new file mode 100644
index 0000000000000000000000000000000000000000..ac84e05d9b8d22d65b0113f991fac1560deb7503
--- /dev/null
+++ b/build/torch210-cxx11-cu130-x86_64-linux/_ops.py
@@ -0,0 +1,9 @@
+import torch
+from . import _layer_norm_cuda_73ccd0c
+ops = torch.ops._layer_norm_cuda_73ccd0c
+
+def add_op_namespace_prefix(op_name: str):
+ """
+ Prefix op by namespace.
+ """
+ return f"_layer_norm_cuda_73ccd0c::{op_name}"
diff --git a/build/torch210-cxx11-cu130-x86_64-linux/layer_norm/__init__.py b/build/torch210-cxx11-cu130-x86_64-linux/layer_norm/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..a9b2672c1cd85b74c1b3ded0fc0b2100e1aeac23
--- /dev/null
+++ b/build/torch210-cxx11-cu130-x86_64-linux/layer_norm/__init__.py
@@ -0,0 +1,26 @@
+import ctypes
+import importlib.util
+import sys
+from pathlib import Path
+from types import ModuleType
+
+
+def _import_from_path(file_path: Path) -> ModuleType:
+ # We cannot use the module name as-is, after adding it to `sys.modules`,
+ # it would also be used for other imports. So, we make a module name that
+ # depends on the path for it to be unique using the hex-encoded hash of
+ # the path.
+ path_hash = "{:x}".format(ctypes.c_size_t(hash(file_path.absolute())).value)
+ module_name = path_hash
+ spec = importlib.util.spec_from_file_location(module_name, file_path)
+ if spec is None:
+ raise ImportError(f"Cannot load spec for {module_name} from {file_path}")
+ module = importlib.util.module_from_spec(spec)
+ if module is None:
+ raise ImportError(f"Cannot load module {module_name} from spec")
+ sys.modules[module_name] = module
+ spec.loader.exec_module(module) # type: ignore
+ return module
+
+
+globals().update(vars(_import_from_path(Path(__file__).parent.parent / "__init__.py")))
diff --git a/build/torch210-cxx11-cu130-x86_64-linux/layers.py b/build/torch210-cxx11-cu130-x86_64-linux/layers.py
new file mode 100644
index 0000000000000000000000000000000000000000..7ed883f42ead452f8b60f498ec11302c53d3cf74
--- /dev/null
+++ b/build/torch210-cxx11-cu130-x86_64-linux/layers.py
@@ -0,0 +1,51 @@
+import torch
+import torch.nn as nn
+
+from ._ops import ops
+
+
+class LayerNorm(nn.Module):
+ weight: torch.Tensor
+ variance_epsilon: float
+
+ def forward(self, hidden_states: torch.Tensor) -> torch.Tensor:
+ output = ops.dropout_add_ln_fwd(
+ hidden_states.view(-1, hidden_states.shape[-1]),
+ gamma = self.weight,
+ beta = None,
+ rowscale = None,
+ colscale = None,
+ x0_subset = None,
+ z_subset = None,
+ dropout_p = 0,
+ epsilon = self.variance_epsilon,
+ rowscale_const = 1.0,
+ z_numrows = hidden_states.shape[1],
+ gen = None,
+ residual_in_fp32 = False,
+ is_rms_norm = False,
+ )
+ return output[0].view(hidden_states.shape)
+
+class LlamaRMSNorm(nn.Module):
+ weight: torch.Tensor
+ variance_epsilon: float
+
+ def forward(self, hidden_states: torch.Tensor) -> torch.Tensor:
+ output = ops.dropout_add_ln_fwd(
+ hidden_states.view(-1, hidden_states.shape[-1]),
+ gamma = self.weight,
+ beta = None,
+ rowscale = None,
+ colscale = None,
+ x0_subset = None,
+ z_subset = None,
+ dropout_p = 0,
+ epsilon = self.variance_epsilon,
+ rowscale_const = 1.0,
+ z_numrows = hidden_states.shape[1],
+ gen = None,
+ residual_in_fp32 = False,
+ is_rms_norm = True,
+ )
+ return output[0].view(hidden_states.shape)
\ No newline at end of file
diff --git a/build/torch210-cxx11-cu130-x86_64-linux/metadata.json b/build/torch210-cxx11-cu130-x86_64-linux/metadata.json
new file mode 100644
index 0000000000000000000000000000000000000000..75fccb89550243db092fbc5d218aa4b497699bed
--- /dev/null
+++ b/build/torch210-cxx11-cu130-x86_64-linux/metadata.json
@@ -0,0 +1,17 @@
+{
+ "name": "layer-norm",
+ "id": "_layer_norm_cuda_73ccd0c",
+ "version": 1,
+ "license": "BSD-3-Clause",
+ "python-depends": [],
+ "backend": {
+ "type": "cuda",
+ "archs": [
+ "10.0",
+ "12.0",
+ "8.0",
+ "8.9",
+ "9.0"
+ ]
+ }
+}
diff --git a/build/torch211-cxx11-cu126-aarch64-linux/__init__.py b/build/torch211-cxx11-cu126-aarch64-linux/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..946160f16b9dc91fefeea037fb7ac84fd6afd802
--- /dev/null
+++ b/build/torch211-cxx11-cu126-aarch64-linux/__init__.py
@@ -0,0 +1,26 @@
+import torch
+import torch.nn as nn
+
+from ._ops import ops
+
+from . import layers
+
+def dropout_add_ln_fwd(input, gamma, beta, rowscale, colscale, x0_subset, z_subset, dropout_p, epsilon, rowscale_const, z_numrows, gen, residual_in_fp32, is_rms_norm):
+ return ops.dropout_add_ln_fwd(input, gamma, beta, rowscale, colscale, x0_subset, z_subset, dropout_p, epsilon, rowscale_const, z_numrows, gen, residual_in_fp32, is_rms_norm)
+
+def dropout_add_ln_bwd(dz, dx, x, mu, rsigma, gamma, rowscale, colscale, x0_subset, z_subset, dropout_p, rowscale_const, x0_numrows, has_residual, is_rms_norm):
+ return ops.dropout_add_ln_bwd(dz, dx, x, mu, rsigma, gamma, rowscale, colscale, x0_subset, z_subset, dropout_p, rowscale_const, x0_numrows, has_residual, is_rms_norm)
+
+def dropout_add_ln_parallel_residual_fwd(input, gamma0, beta0, gamma1, beta1, dropout_p, epsilon, gen, residual_in_fp32, is_rms_norm):
+ return ops.dropout_add_ln_parallel_residual_fwd(input, gamma0, beta0, gamma1, beta1, dropout_p, epsilon, gen, residual_in_fp32, is_rms_norm)
+
+def dropout_add_ln_parallel_residual_bwd(dz0, dz1, dx, x, mu, rsigma, gamma0, gamma1, dropout_p, has_x1, has_residual, is_rms_norm):
+ return ops.dropout_add_ln_parallel_residual_bwd(dz0, dz1, dx, x, mu, rsigma, gamma0, gamma1, dropout_p, has_x1, has_residual, is_rms_norm)
+
+__all__ = [
+ "layers",
+ "dropout_add_ln_fwd",
+ "dropout_add_ln_bwd",
+ "dropout_add_ln_parallel_residual_fwd",
+ "dropout_add_ln_parallel_residual_bwd",
+]
\ No newline at end of file
diff --git a/build/torch211-cxx11-cu126-aarch64-linux/_layer_norm_cuda_73ccd0c.abi3.so b/build/torch211-cxx11-cu126-aarch64-linux/_layer_norm_cuda_73ccd0c.abi3.so
new file mode 100644
index 0000000000000000000000000000000000000000..1a060127cf4bfd141745ec93bbd0478b4fb73286
--- /dev/null
+++ b/build/torch211-cxx11-cu126-aarch64-linux/_layer_norm_cuda_73ccd0c.abi3.so
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:9ebbc069d60e09aea141d01bef3f1a14b81d315d2e944f78b99ee794a370f199
+size 711706784
diff --git a/build/torch211-cxx11-cu126-aarch64-linux/_ops.py b/build/torch211-cxx11-cu126-aarch64-linux/_ops.py
new file mode 100644
index 0000000000000000000000000000000000000000..ac84e05d9b8d22d65b0113f991fac1560deb7503
--- /dev/null
+++ b/build/torch211-cxx11-cu126-aarch64-linux/_ops.py
@@ -0,0 +1,9 @@
+import torch
+from . import _layer_norm_cuda_73ccd0c
+ops = torch.ops._layer_norm_cuda_73ccd0c
+
+def add_op_namespace_prefix(op_name: str):
+ """
+ Prefix op by namespace.
+ """
+ return f"_layer_norm_cuda_73ccd0c::{op_name}"
diff --git a/build/torch211-cxx11-cu126-aarch64-linux/layer_norm/__init__.py b/build/torch211-cxx11-cu126-aarch64-linux/layer_norm/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..a9b2672c1cd85b74c1b3ded0fc0b2100e1aeac23
--- /dev/null
+++ b/build/torch211-cxx11-cu126-aarch64-linux/layer_norm/__init__.py
@@ -0,0 +1,26 @@
+import ctypes
+import importlib.util
+import sys
+from pathlib import Path
+from types import ModuleType
+
+
+def _import_from_path(file_path: Path) -> ModuleType:
+ # We cannot use the module name as-is, after adding it to `sys.modules`,
+ # it would also be used for other imports. So, we make a module name that
+ # depends on the path for it to be unique using the hex-encoded hash of
+ # the path.
+ path_hash = "{:x}".format(ctypes.c_size_t(hash(file_path.absolute())).value)
+ module_name = path_hash
+ spec = importlib.util.spec_from_file_location(module_name, file_path)
+ if spec is None:
+ raise ImportError(f"Cannot load spec for {module_name} from {file_path}")
+ module = importlib.util.module_from_spec(spec)
+ if module is None:
+ raise ImportError(f"Cannot load module {module_name} from spec")
+ sys.modules[module_name] = module
+ spec.loader.exec_module(module) # type: ignore
+ return module
+
+
+globals().update(vars(_import_from_path(Path(__file__).parent.parent / "__init__.py")))
diff --git a/build/torch211-cxx11-cu126-aarch64-linux/layers.py b/build/torch211-cxx11-cu126-aarch64-linux/layers.py
new file mode 100644
index 0000000000000000000000000000000000000000..7ed883f42ead452f8b60f498ec11302c53d3cf74
--- /dev/null
+++ b/build/torch211-cxx11-cu126-aarch64-linux/layers.py
@@ -0,0 +1,51 @@
+import torch
+import torch.nn as nn
+
+from ._ops import ops
+
+
+class LayerNorm(nn.Module):
+ weight: torch.Tensor
+ variance_epsilon: float
+
+ def forward(self, hidden_states: torch.Tensor) -> torch.Tensor:
+ output = ops.dropout_add_ln_fwd(
+ hidden_states.view(-1, hidden_states.shape[-1]),
+ gamma = self.weight,
+ beta = None,
+ rowscale = None,
+ colscale = None,
+ x0_subset = None,
+ z_subset = None,
+ dropout_p = 0,
+ epsilon = self.variance_epsilon,
+ rowscale_const = 1.0,
+ z_numrows = hidden_states.shape[1],
+ gen = None,
+ residual_in_fp32 = False,
+ is_rms_norm = False,
+ )
+ return output[0].view(hidden_states.shape)
+
+class LlamaRMSNorm(nn.Module):
+ weight: torch.Tensor
+ variance_epsilon: float
+
+ def forward(self, hidden_states: torch.Tensor) -> torch.Tensor:
+ output = ops.dropout_add_ln_fwd(
+ hidden_states.view(-1, hidden_states.shape[-1]),
+ gamma = self.weight,
+ beta = None,
+ rowscale = None,
+ colscale = None,
+ x0_subset = None,
+ z_subset = None,
+ dropout_p = 0,
+ epsilon = self.variance_epsilon,
+ rowscale_const = 1.0,
+ z_numrows = hidden_states.shape[1],
+ gen = None,
+ residual_in_fp32 = False,
+ is_rms_norm = True,
+ )
+ return output[0].view(hidden_states.shape)
\ No newline at end of file
diff --git a/build/torch211-cxx11-cu126-aarch64-linux/metadata.json b/build/torch211-cxx11-cu126-aarch64-linux/metadata.json
new file mode 100644
index 0000000000000000000000000000000000000000..e16b07bc42edabbfb0c8f5e9a87013306d25ea4f
--- /dev/null
+++ b/build/torch211-cxx11-cu126-aarch64-linux/metadata.json
@@ -0,0 +1,15 @@
+{
+ "name": "layer-norm",
+ "id": "_layer_norm_cuda_73ccd0c",
+ "version": 1,
+ "license": "BSD-3-Clause",
+ "python-depends": [],
+ "backend": {
+ "type": "cuda",
+ "archs": [
+ "8.0",
+ "8.9",
+ "9.0"
+ ]
+ }
+}
diff --git a/build/torch211-cxx11-cu126-x86_64-linux/__init__.py b/build/torch211-cxx11-cu126-x86_64-linux/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..946160f16b9dc91fefeea037fb7ac84fd6afd802
--- /dev/null
+++ b/build/torch211-cxx11-cu126-x86_64-linux/__init__.py
@@ -0,0 +1,26 @@
+import torch
+import torch.nn as nn
+
+from ._ops import ops
+
+from . import layers
+
+def dropout_add_ln_fwd(input, gamma, beta, rowscale, colscale, x0_subset, z_subset, dropout_p, epsilon, rowscale_const, z_numrows, gen, residual_in_fp32, is_rms_norm):
+ return ops.dropout_add_ln_fwd(input, gamma, beta, rowscale, colscale, x0_subset, z_subset, dropout_p, epsilon, rowscale_const, z_numrows, gen, residual_in_fp32, is_rms_norm)
+
+def dropout_add_ln_bwd(dz, dx, x, mu, rsigma, gamma, rowscale, colscale, x0_subset, z_subset, dropout_p, rowscale_const, x0_numrows, has_residual, is_rms_norm):
+ return ops.dropout_add_ln_bwd(dz, dx, x, mu, rsigma, gamma, rowscale, colscale, x0_subset, z_subset, dropout_p, rowscale_const, x0_numrows, has_residual, is_rms_norm)
+
+def dropout_add_ln_parallel_residual_fwd(input, gamma0, beta0, gamma1, beta1, dropout_p, epsilon, gen, residual_in_fp32, is_rms_norm):
+ return ops.dropout_add_ln_parallel_residual_fwd(input, gamma0, beta0, gamma1, beta1, dropout_p, epsilon, gen, residual_in_fp32, is_rms_norm)
+
+def dropout_add_ln_parallel_residual_bwd(dz0, dz1, dx, x, mu, rsigma, gamma0, gamma1, dropout_p, has_x1, has_residual, is_rms_norm):
+ return ops.dropout_add_ln_parallel_residual_bwd(dz0, dz1, dx, x, mu, rsigma, gamma0, gamma1, dropout_p, has_x1, has_residual, is_rms_norm)
+
+__all__ = [
+ "layers",
+ "dropout_add_ln_fwd",
+ "dropout_add_ln_bwd",
+ "dropout_add_ln_parallel_residual_fwd",
+ "dropout_add_ln_parallel_residual_bwd",
+]
\ No newline at end of file
diff --git a/build/torch211-cxx11-cu126-x86_64-linux/_layer_norm_cuda_73ccd0c.abi3.so b/build/torch211-cxx11-cu126-x86_64-linux/_layer_norm_cuda_73ccd0c.abi3.so
new file mode 100644
index 0000000000000000000000000000000000000000..3b570e3a18b2fec03e0a53d608caa4e809e7ea22
--- /dev/null
+++ b/build/torch211-cxx11-cu126-x86_64-linux/_layer_norm_cuda_73ccd0c.abi3.so
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:a1586a1b500ccc87796c33107b971e38b73d49257aa935a8568c235021490cb9
+size 712082776
diff --git a/build/torch211-cxx11-cu126-x86_64-linux/_ops.py b/build/torch211-cxx11-cu126-x86_64-linux/_ops.py
new file mode 100644
index 0000000000000000000000000000000000000000..ac84e05d9b8d22d65b0113f991fac1560deb7503
--- /dev/null
+++ b/build/torch211-cxx11-cu126-x86_64-linux/_ops.py
@@ -0,0 +1,9 @@
+import torch
+from . import _layer_norm_cuda_73ccd0c
+ops = torch.ops._layer_norm_cuda_73ccd0c
+
+def add_op_namespace_prefix(op_name: str):
+ """
+ Prefix op by namespace.
+ """
+ return f"_layer_norm_cuda_73ccd0c::{op_name}"
diff --git a/build/torch211-cxx11-cu126-x86_64-linux/layer_norm/__init__.py b/build/torch211-cxx11-cu126-x86_64-linux/layer_norm/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..a9b2672c1cd85b74c1b3ded0fc0b2100e1aeac23
--- /dev/null
+++ b/build/torch211-cxx11-cu126-x86_64-linux/layer_norm/__init__.py
@@ -0,0 +1,26 @@
+import ctypes
+import importlib.util
+import sys
+from pathlib import Path
+from types import ModuleType
+
+
+def _import_from_path(file_path: Path) -> ModuleType:
+ # We cannot use the module name as-is, after adding it to `sys.modules`,
+ # it would also be used for other imports. So, we make a module name that
+ # depends on the path for it to be unique using the hex-encoded hash of
+ # the path.
+ path_hash = "{:x}".format(ctypes.c_size_t(hash(file_path.absolute())).value)
+ module_name = path_hash
+ spec = importlib.util.spec_from_file_location(module_name, file_path)
+ if spec is None:
+ raise ImportError(f"Cannot load spec for {module_name} from {file_path}")
+ module = importlib.util.module_from_spec(spec)
+ if module is None:
+ raise ImportError(f"Cannot load module {module_name} from spec")
+ sys.modules[module_name] = module
+ spec.loader.exec_module(module) # type: ignore
+ return module
+
+
+globals().update(vars(_import_from_path(Path(__file__).parent.parent / "__init__.py")))
diff --git a/build/torch211-cxx11-cu126-x86_64-linux/layers.py b/build/torch211-cxx11-cu126-x86_64-linux/layers.py
new file mode 100644
index 0000000000000000000000000000000000000000..7ed883f42ead452f8b60f498ec11302c53d3cf74
--- /dev/null
+++ b/build/torch211-cxx11-cu126-x86_64-linux/layers.py
@@ -0,0 +1,51 @@
+import torch
+import torch.nn as nn
+
+from ._ops import ops
+
+
+class LayerNorm(nn.Module):
+ weight: torch.Tensor
+ variance_epsilon: float
+
+ def forward(self, hidden_states: torch.Tensor) -> torch.Tensor:
+ output = ops.dropout_add_ln_fwd(
+ hidden_states.view(-1, hidden_states.shape[-1]),
+ gamma = self.weight,
+ beta = None,
+ rowscale = None,
+ colscale = None,
+ x0_subset = None,
+ z_subset = None,
+ dropout_p = 0,
+ epsilon = self.variance_epsilon,
+ rowscale_const = 1.0,
+ z_numrows = hidden_states.shape[1],
+ gen = None,
+ residual_in_fp32 = False,
+ is_rms_norm = False,
+ )
+ return output[0].view(hidden_states.shape)
+
+class LlamaRMSNorm(nn.Module):
+ weight: torch.Tensor
+ variance_epsilon: float
+
+ def forward(self, hidden_states: torch.Tensor) -> torch.Tensor:
+ output = ops.dropout_add_ln_fwd(
+ hidden_states.view(-1, hidden_states.shape[-1]),
+ gamma = self.weight,
+ beta = None,
+ rowscale = None,
+ colscale = None,
+ x0_subset = None,
+ z_subset = None,
+ dropout_p = 0,
+ epsilon = self.variance_epsilon,
+ rowscale_const = 1.0,
+ z_numrows = hidden_states.shape[1],
+ gen = None,
+ residual_in_fp32 = False,
+ is_rms_norm = True,
+ )
+ return output[0].view(hidden_states.shape)
\ No newline at end of file
diff --git a/build/torch211-cxx11-cu126-x86_64-linux/metadata.json b/build/torch211-cxx11-cu126-x86_64-linux/metadata.json
new file mode 100644
index 0000000000000000000000000000000000000000..e16b07bc42edabbfb0c8f5e9a87013306d25ea4f
--- /dev/null
+++ b/build/torch211-cxx11-cu126-x86_64-linux/metadata.json
@@ -0,0 +1,15 @@
+{
+ "name": "layer-norm",
+ "id": "_layer_norm_cuda_73ccd0c",
+ "version": 1,
+ "license": "BSD-3-Clause",
+ "python-depends": [],
+ "backend": {
+ "type": "cuda",
+ "archs": [
+ "8.0",
+ "8.9",
+ "9.0"
+ ]
+ }
+}
diff --git a/build/torch211-cxx11-cu128-aarch64-linux/__init__.py b/build/torch211-cxx11-cu128-aarch64-linux/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..946160f16b9dc91fefeea037fb7ac84fd6afd802
--- /dev/null
+++ b/build/torch211-cxx11-cu128-aarch64-linux/__init__.py
@@ -0,0 +1,26 @@
+import torch
+import torch.nn as nn
+
+from ._ops import ops
+
+from . import layers
+
+def dropout_add_ln_fwd(input, gamma, beta, rowscale, colscale, x0_subset, z_subset, dropout_p, epsilon, rowscale_const, z_numrows, gen, residual_in_fp32, is_rms_norm):
+ return ops.dropout_add_ln_fwd(input, gamma, beta, rowscale, colscale, x0_subset, z_subset, dropout_p, epsilon, rowscale_const, z_numrows, gen, residual_in_fp32, is_rms_norm)
+
+def dropout_add_ln_bwd(dz, dx, x, mu, rsigma, gamma, rowscale, colscale, x0_subset, z_subset, dropout_p, rowscale_const, x0_numrows, has_residual, is_rms_norm):
+ return ops.dropout_add_ln_bwd(dz, dx, x, mu, rsigma, gamma, rowscale, colscale, x0_subset, z_subset, dropout_p, rowscale_const, x0_numrows, has_residual, is_rms_norm)
+
+def dropout_add_ln_parallel_residual_fwd(input, gamma0, beta0, gamma1, beta1, dropout_p, epsilon, gen, residual_in_fp32, is_rms_norm):
+ return ops.dropout_add_ln_parallel_residual_fwd(input, gamma0, beta0, gamma1, beta1, dropout_p, epsilon, gen, residual_in_fp32, is_rms_norm)
+
+def dropout_add_ln_parallel_residual_bwd(dz0, dz1, dx, x, mu, rsigma, gamma0, gamma1, dropout_p, has_x1, has_residual, is_rms_norm):
+ return ops.dropout_add_ln_parallel_residual_bwd(dz0, dz1, dx, x, mu, rsigma, gamma0, gamma1, dropout_p, has_x1, has_residual, is_rms_norm)
+
+__all__ = [
+ "layers",
+ "dropout_add_ln_fwd",
+ "dropout_add_ln_bwd",
+ "dropout_add_ln_parallel_residual_fwd",
+ "dropout_add_ln_parallel_residual_bwd",
+]
\ No newline at end of file
diff --git a/build/torch211-cxx11-cu128-aarch64-linux/_layer_norm_cuda_73ccd0c.abi3.so b/build/torch211-cxx11-cu128-aarch64-linux/_layer_norm_cuda_73ccd0c.abi3.so
new file mode 100644
index 0000000000000000000000000000000000000000..c50d3b0b9e94b2b206d59352e12dc5988b5057f4
--- /dev/null
+++ b/build/torch211-cxx11-cu128-aarch64-linux/_layer_norm_cuda_73ccd0c.abi3.so
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:b2ca8ce06883b52cc6fc635f080a8a1e025a2cf87a9ad151df8525d2d1842c87
+size 1231079512
diff --git a/build/torch211-cxx11-cu128-aarch64-linux/_ops.py b/build/torch211-cxx11-cu128-aarch64-linux/_ops.py
new file mode 100644
index 0000000000000000000000000000000000000000..ac84e05d9b8d22d65b0113f991fac1560deb7503
--- /dev/null
+++ b/build/torch211-cxx11-cu128-aarch64-linux/_ops.py
@@ -0,0 +1,9 @@
+import torch
+from . import _layer_norm_cuda_73ccd0c
+ops = torch.ops._layer_norm_cuda_73ccd0c
+
+def add_op_namespace_prefix(op_name: str):
+ """
+ Prefix op by namespace.
+ """
+ return f"_layer_norm_cuda_73ccd0c::{op_name}"
diff --git a/build/torch211-cxx11-cu128-aarch64-linux/layer_norm/__init__.py b/build/torch211-cxx11-cu128-aarch64-linux/layer_norm/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..a9b2672c1cd85b74c1b3ded0fc0b2100e1aeac23
--- /dev/null
+++ b/build/torch211-cxx11-cu128-aarch64-linux/layer_norm/__init__.py
@@ -0,0 +1,26 @@
+import ctypes
+import importlib.util
+import sys
+from pathlib import Path
+from types import ModuleType
+
+
+def _import_from_path(file_path: Path) -> ModuleType:
+ # We cannot use the module name as-is, after adding it to `sys.modules`,
+ # it would also be used for other imports. So, we make a module name that
+ # depends on the path for it to be unique using the hex-encoded hash of
+ # the path.
+ path_hash = "{:x}".format(ctypes.c_size_t(hash(file_path.absolute())).value)
+ module_name = path_hash
+ spec = importlib.util.spec_from_file_location(module_name, file_path)
+ if spec is None:
+ raise ImportError(f"Cannot load spec for {module_name} from {file_path}")
+ module = importlib.util.module_from_spec(spec)
+ if module is None:
+ raise ImportError(f"Cannot load module {module_name} from spec")
+ sys.modules[module_name] = module
+ spec.loader.exec_module(module) # type: ignore
+ return module
+
+
+globals().update(vars(_import_from_path(Path(__file__).parent.parent / "__init__.py")))
diff --git a/build/torch211-cxx11-cu128-aarch64-linux/layers.py b/build/torch211-cxx11-cu128-aarch64-linux/layers.py
new file mode 100644
index 0000000000000000000000000000000000000000..7ed883f42ead452f8b60f498ec11302c53d3cf74
--- /dev/null
+++ b/build/torch211-cxx11-cu128-aarch64-linux/layers.py
@@ -0,0 +1,51 @@
+import torch
+import torch.nn as nn
+
+from ._ops import ops
+
+
+class LayerNorm(nn.Module):
+ weight: torch.Tensor
+ variance_epsilon: float
+
+ def forward(self, hidden_states: torch.Tensor) -> torch.Tensor:
+ output = ops.dropout_add_ln_fwd(
+ hidden_states.view(-1, hidden_states.shape[-1]),
+ gamma = self.weight,
+ beta = None,
+ rowscale = None,
+ colscale = None,
+ x0_subset = None,
+ z_subset = None,
+ dropout_p = 0,
+ epsilon = self.variance_epsilon,
+ rowscale_const = 1.0,
+ z_numrows = hidden_states.shape[1],
+ gen = None,
+ residual_in_fp32 = False,
+ is_rms_norm = False,
+ )
+ return output[0].view(hidden_states.shape)
+
+class LlamaRMSNorm(nn.Module):
+ weight: torch.Tensor
+ variance_epsilon: float
+
+ def forward(self, hidden_states: torch.Tensor) -> torch.Tensor:
+ output = ops.dropout_add_ln_fwd(
+ hidden_states.view(-1, hidden_states.shape[-1]),
+ gamma = self.weight,
+ beta = None,
+ rowscale = None,
+ colscale = None,
+ x0_subset = None,
+ z_subset = None,
+ dropout_p = 0,
+ epsilon = self.variance_epsilon,
+ rowscale_const = 1.0,
+ z_numrows = hidden_states.shape[1],
+ gen = None,
+ residual_in_fp32 = False,
+ is_rms_norm = True,
+ )
+ return output[0].view(hidden_states.shape)
\ No newline at end of file
diff --git a/build/torch211-cxx11-cu128-aarch64-linux/metadata.json b/build/torch211-cxx11-cu128-aarch64-linux/metadata.json
new file mode 100644
index 0000000000000000000000000000000000000000..75fccb89550243db092fbc5d218aa4b497699bed
--- /dev/null
+++ b/build/torch211-cxx11-cu128-aarch64-linux/metadata.json
@@ -0,0 +1,17 @@
+{
+ "name": "layer-norm",
+ "id": "_layer_norm_cuda_73ccd0c",
+ "version": 1,
+ "license": "BSD-3-Clause",
+ "python-depends": [],
+ "backend": {
+ "type": "cuda",
+ "archs": [
+ "10.0",
+ "12.0",
+ "8.0",
+ "8.9",
+ "9.0"
+ ]
+ }
+}
diff --git a/build/torch211-cxx11-cu128-x86_64-linux/__init__.py b/build/torch211-cxx11-cu128-x86_64-linux/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..946160f16b9dc91fefeea037fb7ac84fd6afd802
--- /dev/null
+++ b/build/torch211-cxx11-cu128-x86_64-linux/__init__.py
@@ -0,0 +1,26 @@
+import torch
+import torch.nn as nn
+
+from ._ops import ops
+
+from . import layers
+
+def dropout_add_ln_fwd(input, gamma, beta, rowscale, colscale, x0_subset, z_subset, dropout_p, epsilon, rowscale_const, z_numrows, gen, residual_in_fp32, is_rms_norm):
+ return ops.dropout_add_ln_fwd(input, gamma, beta, rowscale, colscale, x0_subset, z_subset, dropout_p, epsilon, rowscale_const, z_numrows, gen, residual_in_fp32, is_rms_norm)
+
+def dropout_add_ln_bwd(dz, dx, x, mu, rsigma, gamma, rowscale, colscale, x0_subset, z_subset, dropout_p, rowscale_const, x0_numrows, has_residual, is_rms_norm):
+ return ops.dropout_add_ln_bwd(dz, dx, x, mu, rsigma, gamma, rowscale, colscale, x0_subset, z_subset, dropout_p, rowscale_const, x0_numrows, has_residual, is_rms_norm)
+
+def dropout_add_ln_parallel_residual_fwd(input, gamma0, beta0, gamma1, beta1, dropout_p, epsilon, gen, residual_in_fp32, is_rms_norm):
+ return ops.dropout_add_ln_parallel_residual_fwd(input, gamma0, beta0, gamma1, beta1, dropout_p, epsilon, gen, residual_in_fp32, is_rms_norm)
+
+def dropout_add_ln_parallel_residual_bwd(dz0, dz1, dx, x, mu, rsigma, gamma0, gamma1, dropout_p, has_x1, has_residual, is_rms_norm):
+ return ops.dropout_add_ln_parallel_residual_bwd(dz0, dz1, dx, x, mu, rsigma, gamma0, gamma1, dropout_p, has_x1, has_residual, is_rms_norm)
+
+__all__ = [
+ "layers",
+ "dropout_add_ln_fwd",
+ "dropout_add_ln_bwd",
+ "dropout_add_ln_parallel_residual_fwd",
+ "dropout_add_ln_parallel_residual_bwd",
+]
\ No newline at end of file
diff --git a/build/torch211-cxx11-cu128-x86_64-linux/_layer_norm_cuda_73ccd0c.abi3.so b/build/torch211-cxx11-cu128-x86_64-linux/_layer_norm_cuda_73ccd0c.abi3.so
new file mode 100644
index 0000000000000000000000000000000000000000..b6c847b224019ee201ff7ef55e3baba3e3b02087
--- /dev/null
+++ b/build/torch211-cxx11-cu128-x86_64-linux/_layer_norm_cuda_73ccd0c.abi3.so
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:be4534f53612397be85ac1dc8075c4dcdb8aba68842de6e909c15ea3ab4cc64d
+size 1231408464
diff --git a/build/torch211-cxx11-cu128-x86_64-linux/_ops.py b/build/torch211-cxx11-cu128-x86_64-linux/_ops.py
new file mode 100644
index 0000000000000000000000000000000000000000..ac84e05d9b8d22d65b0113f991fac1560deb7503
--- /dev/null
+++ b/build/torch211-cxx11-cu128-x86_64-linux/_ops.py
@@ -0,0 +1,9 @@
+import torch
+from . import _layer_norm_cuda_73ccd0c
+ops = torch.ops._layer_norm_cuda_73ccd0c
+
+def add_op_namespace_prefix(op_name: str):
+ """
+ Prefix op by namespace.
+ """
+ return f"_layer_norm_cuda_73ccd0c::{op_name}"
diff --git a/build/torch211-cxx11-cu128-x86_64-linux/layer_norm/__init__.py b/build/torch211-cxx11-cu128-x86_64-linux/layer_norm/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..a9b2672c1cd85b74c1b3ded0fc0b2100e1aeac23
--- /dev/null
+++ b/build/torch211-cxx11-cu128-x86_64-linux/layer_norm/__init__.py
@@ -0,0 +1,26 @@
+import ctypes
+import importlib.util
+import sys
+from pathlib import Path
+from types import ModuleType
+
+
+def _import_from_path(file_path: Path) -> ModuleType:
+ # We cannot use the module name as-is, after adding it to `sys.modules`,
+ # it would also be used for other imports. So, we make a module name that
+ # depends on the path for it to be unique using the hex-encoded hash of
+ # the path.
+ path_hash = "{:x}".format(ctypes.c_size_t(hash(file_path.absolute())).value)
+ module_name = path_hash
+ spec = importlib.util.spec_from_file_location(module_name, file_path)
+ if spec is None:
+ raise ImportError(f"Cannot load spec for {module_name} from {file_path}")
+ module = importlib.util.module_from_spec(spec)
+ if module is None:
+ raise ImportError(f"Cannot load module {module_name} from spec")
+ sys.modules[module_name] = module
+ spec.loader.exec_module(module) # type: ignore
+ return module
+
+
+globals().update(vars(_import_from_path(Path(__file__).parent.parent / "__init__.py")))
diff --git a/build/torch211-cxx11-cu128-x86_64-linux/layers.py b/build/torch211-cxx11-cu128-x86_64-linux/layers.py
new file mode 100644
index 0000000000000000000000000000000000000000..7ed883f42ead452f8b60f498ec11302c53d3cf74
--- /dev/null
+++ b/build/torch211-cxx11-cu128-x86_64-linux/layers.py
@@ -0,0 +1,51 @@
+import torch
+import torch.nn as nn
+
+from ._ops import ops
+
+
+class LayerNorm(nn.Module):
+ weight: torch.Tensor
+ variance_epsilon: float
+
+ def forward(self, hidden_states: torch.Tensor) -> torch.Tensor:
+ output = ops.dropout_add_ln_fwd(
+ hidden_states.view(-1, hidden_states.shape[-1]),
+ gamma = self.weight,
+ beta = None,
+ rowscale = None,
+ colscale = None,
+ x0_subset = None,
+ z_subset = None,
+ dropout_p = 0,
+ epsilon = self.variance_epsilon,
+ rowscale_const = 1.0,
+ z_numrows = hidden_states.shape[1],
+ gen = None,
+ residual_in_fp32 = False,
+ is_rms_norm = False,
+ )
+ return output[0].view(hidden_states.shape)
+
+class LlamaRMSNorm(nn.Module):
+ weight: torch.Tensor
+ variance_epsilon: float
+
+ def forward(self, hidden_states: torch.Tensor) -> torch.Tensor:
+ output = ops.dropout_add_ln_fwd(
+ hidden_states.view(-1, hidden_states.shape[-1]),
+ gamma = self.weight,
+ beta = None,
+ rowscale = None,
+ colscale = None,
+ x0_subset = None,
+ z_subset = None,
+ dropout_p = 0,
+ epsilon = self.variance_epsilon,
+ rowscale_const = 1.0,
+ z_numrows = hidden_states.shape[1],
+ gen = None,
+ residual_in_fp32 = False,
+ is_rms_norm = True,
+ )
+ return output[0].view(hidden_states.shape)
\ No newline at end of file
diff --git a/build/torch211-cxx11-cu128-x86_64-linux/metadata.json b/build/torch211-cxx11-cu128-x86_64-linux/metadata.json
new file mode 100644
index 0000000000000000000000000000000000000000..75fccb89550243db092fbc5d218aa4b497699bed
--- /dev/null
+++ b/build/torch211-cxx11-cu128-x86_64-linux/metadata.json
@@ -0,0 +1,17 @@
+{
+ "name": "layer-norm",
+ "id": "_layer_norm_cuda_73ccd0c",
+ "version": 1,
+ "license": "BSD-3-Clause",
+ "python-depends": [],
+ "backend": {
+ "type": "cuda",
+ "archs": [
+ "10.0",
+ "12.0",
+ "8.0",
+ "8.9",
+ "9.0"
+ ]
+ }
+}
diff --git a/build/torch211-cxx11-cu130-aarch64-linux/__init__.py b/build/torch211-cxx11-cu130-aarch64-linux/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..946160f16b9dc91fefeea037fb7ac84fd6afd802
--- /dev/null
+++ b/build/torch211-cxx11-cu130-aarch64-linux/__init__.py
@@ -0,0 +1,26 @@
+import torch
+import torch.nn as nn
+
+from ._ops import ops
+
+from . import layers
+
+def dropout_add_ln_fwd(input, gamma, beta, rowscale, colscale, x0_subset, z_subset, dropout_p, epsilon, rowscale_const, z_numrows, gen, residual_in_fp32, is_rms_norm):
+ return ops.dropout_add_ln_fwd(input, gamma, beta, rowscale, colscale, x0_subset, z_subset, dropout_p, epsilon, rowscale_const, z_numrows, gen, residual_in_fp32, is_rms_norm)
+
+def dropout_add_ln_bwd(dz, dx, x, mu, rsigma, gamma, rowscale, colscale, x0_subset, z_subset, dropout_p, rowscale_const, x0_numrows, has_residual, is_rms_norm):
+ return ops.dropout_add_ln_bwd(dz, dx, x, mu, rsigma, gamma, rowscale, colscale, x0_subset, z_subset, dropout_p, rowscale_const, x0_numrows, has_residual, is_rms_norm)
+
+def dropout_add_ln_parallel_residual_fwd(input, gamma0, beta0, gamma1, beta1, dropout_p, epsilon, gen, residual_in_fp32, is_rms_norm):
+ return ops.dropout_add_ln_parallel_residual_fwd(input, gamma0, beta0, gamma1, beta1, dropout_p, epsilon, gen, residual_in_fp32, is_rms_norm)
+
+def dropout_add_ln_parallel_residual_bwd(dz0, dz1, dx, x, mu, rsigma, gamma0, gamma1, dropout_p, has_x1, has_residual, is_rms_norm):
+ return ops.dropout_add_ln_parallel_residual_bwd(dz0, dz1, dx, x, mu, rsigma, gamma0, gamma1, dropout_p, has_x1, has_residual, is_rms_norm)
+
+__all__ = [
+ "layers",
+ "dropout_add_ln_fwd",
+ "dropout_add_ln_bwd",
+ "dropout_add_ln_parallel_residual_fwd",
+ "dropout_add_ln_parallel_residual_bwd",
+]
\ No newline at end of file
diff --git a/build/torch211-cxx11-cu130-aarch64-linux/_layer_norm_cuda_73ccd0c.abi3.so b/build/torch211-cxx11-cu130-aarch64-linux/_layer_norm_cuda_73ccd0c.abi3.so
new file mode 100644
index 0000000000000000000000000000000000000000..5050382923073d1f5f2e171e58cd28986724459f
--- /dev/null
+++ b/build/torch211-cxx11-cu130-aarch64-linux/_layer_norm_cuda_73ccd0c.abi3.so
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:d526240fb5ecde4f704be1bd671e339a0da2f7861af43d1b87535c920d841d81
+size 1235990520
diff --git a/build/torch211-cxx11-cu130-aarch64-linux/_ops.py b/build/torch211-cxx11-cu130-aarch64-linux/_ops.py
new file mode 100644
index 0000000000000000000000000000000000000000..ac84e05d9b8d22d65b0113f991fac1560deb7503
--- /dev/null
+++ b/build/torch211-cxx11-cu130-aarch64-linux/_ops.py
@@ -0,0 +1,9 @@
+import torch
+from . import _layer_norm_cuda_73ccd0c
+ops = torch.ops._layer_norm_cuda_73ccd0c
+
+def add_op_namespace_prefix(op_name: str):
+ """
+ Prefix op by namespace.
+ """
+ return f"_layer_norm_cuda_73ccd0c::{op_name}"
diff --git a/build/torch211-cxx11-cu130-aarch64-linux/layer_norm/__init__.py b/build/torch211-cxx11-cu130-aarch64-linux/layer_norm/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..a9b2672c1cd85b74c1b3ded0fc0b2100e1aeac23
--- /dev/null
+++ b/build/torch211-cxx11-cu130-aarch64-linux/layer_norm/__init__.py
@@ -0,0 +1,26 @@
+import ctypes
+import importlib.util
+import sys
+from pathlib import Path
+from types import ModuleType
+
+
+def _import_from_path(file_path: Path) -> ModuleType:
+ # We cannot use the module name as-is, after adding it to `sys.modules`,
+ # it would also be used for other imports. So, we make a module name that
+ # depends on the path for it to be unique using the hex-encoded hash of
+ # the path.
+ path_hash = "{:x}".format(ctypes.c_size_t(hash(file_path.absolute())).value)
+ module_name = path_hash
+ spec = importlib.util.spec_from_file_location(module_name, file_path)
+ if spec is None:
+ raise ImportError(f"Cannot load spec for {module_name} from {file_path}")
+ module = importlib.util.module_from_spec(spec)
+ if module is None:
+ raise ImportError(f"Cannot load module {module_name} from spec")
+ sys.modules[module_name] = module
+ spec.loader.exec_module(module) # type: ignore
+ return module
+
+
+globals().update(vars(_import_from_path(Path(__file__).parent.parent / "__init__.py")))
diff --git a/build/torch211-cxx11-cu130-aarch64-linux/layers.py b/build/torch211-cxx11-cu130-aarch64-linux/layers.py
new file mode 100644
index 0000000000000000000000000000000000000000..7ed883f42ead452f8b60f498ec11302c53d3cf74
--- /dev/null
+++ b/build/torch211-cxx11-cu130-aarch64-linux/layers.py
@@ -0,0 +1,51 @@
+import torch
+import torch.nn as nn
+
+from ._ops import ops
+
+
+class LayerNorm(nn.Module):
+ weight: torch.Tensor
+ variance_epsilon: float
+
+ def forward(self, hidden_states: torch.Tensor) -> torch.Tensor:
+ output = ops.dropout_add_ln_fwd(
+ hidden_states.view(-1, hidden_states.shape[-1]),
+ gamma = self.weight,
+ beta = None,
+ rowscale = None,
+ colscale = None,
+ x0_subset = None,
+ z_subset = None,
+ dropout_p = 0,
+ epsilon = self.variance_epsilon,
+ rowscale_const = 1.0,
+ z_numrows = hidden_states.shape[1],
+ gen = None,
+ residual_in_fp32 = False,
+ is_rms_norm = False,
+ )
+ return output[0].view(hidden_states.shape)
+
+class LlamaRMSNorm(nn.Module):
+ weight: torch.Tensor
+ variance_epsilon: float
+
+ def forward(self, hidden_states: torch.Tensor) -> torch.Tensor:
+ output = ops.dropout_add_ln_fwd(
+ hidden_states.view(-1, hidden_states.shape[-1]),
+ gamma = self.weight,
+ beta = None,
+ rowscale = None,
+ colscale = None,
+ x0_subset = None,
+ z_subset = None,
+ dropout_p = 0,
+ epsilon = self.variance_epsilon,
+ rowscale_const = 1.0,
+ z_numrows = hidden_states.shape[1],
+ gen = None,
+ residual_in_fp32 = False,
+ is_rms_norm = True,
+ )
+ return output[0].view(hidden_states.shape)
\ No newline at end of file
diff --git a/build/torch211-cxx11-cu130-aarch64-linux/metadata.json b/build/torch211-cxx11-cu130-aarch64-linux/metadata.json
new file mode 100644
index 0000000000000000000000000000000000000000..75fccb89550243db092fbc5d218aa4b497699bed
--- /dev/null
+++ b/build/torch211-cxx11-cu130-aarch64-linux/metadata.json
@@ -0,0 +1,17 @@
+{
+ "name": "layer-norm",
+ "id": "_layer_norm_cuda_73ccd0c",
+ "version": 1,
+ "license": "BSD-3-Clause",
+ "python-depends": [],
+ "backend": {
+ "type": "cuda",
+ "archs": [
+ "10.0",
+ "12.0",
+ "8.0",
+ "8.9",
+ "9.0"
+ ]
+ }
+}
diff --git a/build/torch211-cxx11-cu130-x86_64-linux/__init__.py b/build/torch211-cxx11-cu130-x86_64-linux/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..946160f16b9dc91fefeea037fb7ac84fd6afd802
--- /dev/null
+++ b/build/torch211-cxx11-cu130-x86_64-linux/__init__.py
@@ -0,0 +1,26 @@
+import torch
+import torch.nn as nn
+
+from ._ops import ops
+
+from . import layers
+
+def dropout_add_ln_fwd(input, gamma, beta, rowscale, colscale, x0_subset, z_subset, dropout_p, epsilon, rowscale_const, z_numrows, gen, residual_in_fp32, is_rms_norm):
+ return ops.dropout_add_ln_fwd(input, gamma, beta, rowscale, colscale, x0_subset, z_subset, dropout_p, epsilon, rowscale_const, z_numrows, gen, residual_in_fp32, is_rms_norm)
+
+def dropout_add_ln_bwd(dz, dx, x, mu, rsigma, gamma, rowscale, colscale, x0_subset, z_subset, dropout_p, rowscale_const, x0_numrows, has_residual, is_rms_norm):
+ return ops.dropout_add_ln_bwd(dz, dx, x, mu, rsigma, gamma, rowscale, colscale, x0_subset, z_subset, dropout_p, rowscale_const, x0_numrows, has_residual, is_rms_norm)
+
+def dropout_add_ln_parallel_residual_fwd(input, gamma0, beta0, gamma1, beta1, dropout_p, epsilon, gen, residual_in_fp32, is_rms_norm):
+ return ops.dropout_add_ln_parallel_residual_fwd(input, gamma0, beta0, gamma1, beta1, dropout_p, epsilon, gen, residual_in_fp32, is_rms_norm)
+
+def dropout_add_ln_parallel_residual_bwd(dz0, dz1, dx, x, mu, rsigma, gamma0, gamma1, dropout_p, has_x1, has_residual, is_rms_norm):
+ return ops.dropout_add_ln_parallel_residual_bwd(dz0, dz1, dx, x, mu, rsigma, gamma0, gamma1, dropout_p, has_x1, has_residual, is_rms_norm)
+
+__all__ = [
+ "layers",
+ "dropout_add_ln_fwd",
+ "dropout_add_ln_bwd",
+ "dropout_add_ln_parallel_residual_fwd",
+ "dropout_add_ln_parallel_residual_bwd",
+]
\ No newline at end of file
diff --git a/build/torch211-cxx11-cu130-x86_64-linux/_layer_norm_cuda_73ccd0c.abi3.so b/build/torch211-cxx11-cu130-x86_64-linux/_layer_norm_cuda_73ccd0c.abi3.so
new file mode 100644
index 0000000000000000000000000000000000000000..2160fa07368fe1f4ae3c3df7a6ccda7d300ba013
--- /dev/null
+++ b/build/torch211-cxx11-cu130-x86_64-linux/_layer_norm_cuda_73ccd0c.abi3.so
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:9f30404419b58ef0f8bf5176c0d67a176837b70d853f59d7cc326d056e9ab2ee
+size 1238395232
diff --git a/build/torch211-cxx11-cu130-x86_64-linux/_ops.py b/build/torch211-cxx11-cu130-x86_64-linux/_ops.py
new file mode 100644
index 0000000000000000000000000000000000000000..ac84e05d9b8d22d65b0113f991fac1560deb7503
--- /dev/null
+++ b/build/torch211-cxx11-cu130-x86_64-linux/_ops.py
@@ -0,0 +1,9 @@
+import torch
+from . import _layer_norm_cuda_73ccd0c
+ops = torch.ops._layer_norm_cuda_73ccd0c
+
+def add_op_namespace_prefix(op_name: str):
+ """
+ Prefix op by namespace.
+ """
+ return f"_layer_norm_cuda_73ccd0c::{op_name}"
diff --git a/build/torch211-cxx11-cu130-x86_64-linux/layer_norm/__init__.py b/build/torch211-cxx11-cu130-x86_64-linux/layer_norm/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..a9b2672c1cd85b74c1b3ded0fc0b2100e1aeac23
--- /dev/null
+++ b/build/torch211-cxx11-cu130-x86_64-linux/layer_norm/__init__.py
@@ -0,0 +1,26 @@
+import ctypes
+import importlib.util
+import sys
+from pathlib import Path
+from types import ModuleType
+
+
+def _import_from_path(file_path: Path) -> ModuleType:
+ # We cannot use the module name as-is, after adding it to `sys.modules`,
+ # it would also be used for other imports. So, we make a module name that
+ # depends on the path for it to be unique using the hex-encoded hash of
+ # the path.
+ path_hash = "{:x}".format(ctypes.c_size_t(hash(file_path.absolute())).value)
+ module_name = path_hash
+ spec = importlib.util.spec_from_file_location(module_name, file_path)
+ if spec is None:
+ raise ImportError(f"Cannot load spec for {module_name} from {file_path}")
+ module = importlib.util.module_from_spec(spec)
+ if module is None:
+ raise ImportError(f"Cannot load module {module_name} from spec")
+ sys.modules[module_name] = module
+ spec.loader.exec_module(module) # type: ignore
+ return module
+
+
+globals().update(vars(_import_from_path(Path(__file__).parent.parent / "__init__.py")))
diff --git a/build/torch211-cxx11-cu130-x86_64-linux/layers.py b/build/torch211-cxx11-cu130-x86_64-linux/layers.py
new file mode 100644
index 0000000000000000000000000000000000000000..7ed883f42ead452f8b60f498ec11302c53d3cf74
--- /dev/null
+++ b/build/torch211-cxx11-cu130-x86_64-linux/layers.py
@@ -0,0 +1,51 @@
+import torch
+import torch.nn as nn
+
+from ._ops import ops
+
+
+class LayerNorm(nn.Module):
+ weight: torch.Tensor
+ variance_epsilon: float
+
+ def forward(self, hidden_states: torch.Tensor) -> torch.Tensor:
+ output = ops.dropout_add_ln_fwd(
+ hidden_states.view(-1, hidden_states.shape[-1]),
+ gamma = self.weight,
+ beta = None,
+ rowscale = None,
+ colscale = None,
+ x0_subset = None,
+ z_subset = None,
+ dropout_p = 0,
+ epsilon = self.variance_epsilon,
+ rowscale_const = 1.0,
+ z_numrows = hidden_states.shape[1],
+ gen = None,
+ residual_in_fp32 = False,
+ is_rms_norm = False,
+ )
+ return output[0].view(hidden_states.shape)
+
+class LlamaRMSNorm(nn.Module):
+ weight: torch.Tensor
+ variance_epsilon: float
+
+ def forward(self, hidden_states: torch.Tensor) -> torch.Tensor:
+ output = ops.dropout_add_ln_fwd(
+ hidden_states.view(-1, hidden_states.shape[-1]),
+ gamma = self.weight,
+ beta = None,
+ rowscale = None,
+ colscale = None,
+ x0_subset = None,
+ z_subset = None,
+ dropout_p = 0,
+ epsilon = self.variance_epsilon,
+ rowscale_const = 1.0,
+ z_numrows = hidden_states.shape[1],
+ gen = None,
+ residual_in_fp32 = False,
+ is_rms_norm = True,
+ )
+ return output[0].view(hidden_states.shape)
\ No newline at end of file
diff --git a/build/torch211-cxx11-cu130-x86_64-linux/metadata.json b/build/torch211-cxx11-cu130-x86_64-linux/metadata.json
new file mode 100644
index 0000000000000000000000000000000000000000..75fccb89550243db092fbc5d218aa4b497699bed
--- /dev/null
+++ b/build/torch211-cxx11-cu130-x86_64-linux/metadata.json
@@ -0,0 +1,17 @@
+{
+ "name": "layer-norm",
+ "id": "_layer_norm_cuda_73ccd0c",
+ "version": 1,
+ "license": "BSD-3-Clause",
+ "python-depends": [],
+ "backend": {
+ "type": "cuda",
+ "archs": [
+ "10.0",
+ "12.0",
+ "8.0",
+ "8.9",
+ "9.0"
+ ]
+ }
+}
diff --git a/build/torch212-cxx11-cu126-aarch64-linux/__init__.py b/build/torch212-cxx11-cu126-aarch64-linux/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..946160f16b9dc91fefeea037fb7ac84fd6afd802
--- /dev/null
+++ b/build/torch212-cxx11-cu126-aarch64-linux/__init__.py
@@ -0,0 +1,26 @@
+import torch
+import torch.nn as nn
+
+from ._ops import ops
+
+from . import layers
+
+def dropout_add_ln_fwd(input, gamma, beta, rowscale, colscale, x0_subset, z_subset, dropout_p, epsilon, rowscale_const, z_numrows, gen, residual_in_fp32, is_rms_norm):
+ return ops.dropout_add_ln_fwd(input, gamma, beta, rowscale, colscale, x0_subset, z_subset, dropout_p, epsilon, rowscale_const, z_numrows, gen, residual_in_fp32, is_rms_norm)
+
+def dropout_add_ln_bwd(dz, dx, x, mu, rsigma, gamma, rowscale, colscale, x0_subset, z_subset, dropout_p, rowscale_const, x0_numrows, has_residual, is_rms_norm):
+ return ops.dropout_add_ln_bwd(dz, dx, x, mu, rsigma, gamma, rowscale, colscale, x0_subset, z_subset, dropout_p, rowscale_const, x0_numrows, has_residual, is_rms_norm)
+
+def dropout_add_ln_parallel_residual_fwd(input, gamma0, beta0, gamma1, beta1, dropout_p, epsilon, gen, residual_in_fp32, is_rms_norm):
+ return ops.dropout_add_ln_parallel_residual_fwd(input, gamma0, beta0, gamma1, beta1, dropout_p, epsilon, gen, residual_in_fp32, is_rms_norm)
+
+def dropout_add_ln_parallel_residual_bwd(dz0, dz1, dx, x, mu, rsigma, gamma0, gamma1, dropout_p, has_x1, has_residual, is_rms_norm):
+ return ops.dropout_add_ln_parallel_residual_bwd(dz0, dz1, dx, x, mu, rsigma, gamma0, gamma1, dropout_p, has_x1, has_residual, is_rms_norm)
+
+__all__ = [
+ "layers",
+ "dropout_add_ln_fwd",
+ "dropout_add_ln_bwd",
+ "dropout_add_ln_parallel_residual_fwd",
+ "dropout_add_ln_parallel_residual_bwd",
+]
\ No newline at end of file
diff --git a/build/torch212-cxx11-cu126-aarch64-linux/_layer_norm_cuda_73ccd0c.abi3.so b/build/torch212-cxx11-cu126-aarch64-linux/_layer_norm_cuda_73ccd0c.abi3.so
new file mode 100644
index 0000000000000000000000000000000000000000..2cd9a2db9a694cb40de7bb39ddd389e7658fd166
--- /dev/null
+++ b/build/torch212-cxx11-cu126-aarch64-linux/_layer_norm_cuda_73ccd0c.abi3.so
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:4053b1e60f9aa2d0c36871fb49638a2644dee850719716382159abf3749e997e
+size 711709952
diff --git a/build/torch212-cxx11-cu126-aarch64-linux/_ops.py b/build/torch212-cxx11-cu126-aarch64-linux/_ops.py
new file mode 100644
index 0000000000000000000000000000000000000000..ac84e05d9b8d22d65b0113f991fac1560deb7503
--- /dev/null
+++ b/build/torch212-cxx11-cu126-aarch64-linux/_ops.py
@@ -0,0 +1,9 @@
+import torch
+from . import _layer_norm_cuda_73ccd0c
+ops = torch.ops._layer_norm_cuda_73ccd0c
+
+def add_op_namespace_prefix(op_name: str):
+ """
+ Prefix op by namespace.
+ """
+ return f"_layer_norm_cuda_73ccd0c::{op_name}"
diff --git a/build/torch212-cxx11-cu126-aarch64-linux/layer_norm/__init__.py b/build/torch212-cxx11-cu126-aarch64-linux/layer_norm/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..a9b2672c1cd85b74c1b3ded0fc0b2100e1aeac23
--- /dev/null
+++ b/build/torch212-cxx11-cu126-aarch64-linux/layer_norm/__init__.py
@@ -0,0 +1,26 @@
+import ctypes
+import importlib.util
+import sys
+from pathlib import Path
+from types import ModuleType
+
+
+def _import_from_path(file_path: Path) -> ModuleType:
+ # We cannot use the module name as-is, after adding it to `sys.modules`,
+ # it would also be used for other imports. So, we make a module name that
+ # depends on the path for it to be unique using the hex-encoded hash of
+ # the path.
+ path_hash = "{:x}".format(ctypes.c_size_t(hash(file_path.absolute())).value)
+ module_name = path_hash
+ spec = importlib.util.spec_from_file_location(module_name, file_path)
+ if spec is None:
+ raise ImportError(f"Cannot load spec for {module_name} from {file_path}")
+ module = importlib.util.module_from_spec(spec)
+ if module is None:
+ raise ImportError(f"Cannot load module {module_name} from spec")
+ sys.modules[module_name] = module
+ spec.loader.exec_module(module) # type: ignore
+ return module
+
+
+globals().update(vars(_import_from_path(Path(__file__).parent.parent / "__init__.py")))
diff --git a/build/torch212-cxx11-cu126-aarch64-linux/layers.py b/build/torch212-cxx11-cu126-aarch64-linux/layers.py
new file mode 100644
index 0000000000000000000000000000000000000000..7ed883f42ead452f8b60f498ec11302c53d3cf74
--- /dev/null
+++ b/build/torch212-cxx11-cu126-aarch64-linux/layers.py
@@ -0,0 +1,51 @@
+import torch
+import torch.nn as nn
+
+from ._ops import ops
+
+
+class LayerNorm(nn.Module):
+ weight: torch.Tensor
+ variance_epsilon: float
+
+ def forward(self, hidden_states: torch.Tensor) -> torch.Tensor:
+ output = ops.dropout_add_ln_fwd(
+ hidden_states.view(-1, hidden_states.shape[-1]),
+ gamma = self.weight,
+ beta = None,
+ rowscale = None,
+ colscale = None,
+ x0_subset = None,
+ z_subset = None,
+ dropout_p = 0,
+ epsilon = self.variance_epsilon,
+ rowscale_const = 1.0,
+ z_numrows = hidden_states.shape[1],
+ gen = None,
+ residual_in_fp32 = False,
+ is_rms_norm = False,
+ )
+ return output[0].view(hidden_states.shape)
+
+class LlamaRMSNorm(nn.Module):
+ weight: torch.Tensor
+ variance_epsilon: float
+
+ def forward(self, hidden_states: torch.Tensor) -> torch.Tensor:
+ output = ops.dropout_add_ln_fwd(
+ hidden_states.view(-1, hidden_states.shape[-1]),
+ gamma = self.weight,
+ beta = None,
+ rowscale = None,
+ colscale = None,
+ x0_subset = None,
+ z_subset = None,
+ dropout_p = 0,
+ epsilon = self.variance_epsilon,
+ rowscale_const = 1.0,
+ z_numrows = hidden_states.shape[1],
+ gen = None,
+ residual_in_fp32 = False,
+ is_rms_norm = True,
+ )
+ return output[0].view(hidden_states.shape)
\ No newline at end of file
diff --git a/build/torch212-cxx11-cu126-aarch64-linux/metadata.json b/build/torch212-cxx11-cu126-aarch64-linux/metadata.json
new file mode 100644
index 0000000000000000000000000000000000000000..e16b07bc42edabbfb0c8f5e9a87013306d25ea4f
--- /dev/null
+++ b/build/torch212-cxx11-cu126-aarch64-linux/metadata.json
@@ -0,0 +1,15 @@
+{
+ "name": "layer-norm",
+ "id": "_layer_norm_cuda_73ccd0c",
+ "version": 1,
+ "license": "BSD-3-Clause",
+ "python-depends": [],
+ "backend": {
+ "type": "cuda",
+ "archs": [
+ "8.0",
+ "8.9",
+ "9.0"
+ ]
+ }
+}
diff --git a/build/torch212-cxx11-cu126-x86_64-linux/__init__.py b/build/torch212-cxx11-cu126-x86_64-linux/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..946160f16b9dc91fefeea037fb7ac84fd6afd802
--- /dev/null
+++ b/build/torch212-cxx11-cu126-x86_64-linux/__init__.py
@@ -0,0 +1,26 @@
+import torch
+import torch.nn as nn
+
+from ._ops import ops
+
+from . import layers
+
+def dropout_add_ln_fwd(input, gamma, beta, rowscale, colscale, x0_subset, z_subset, dropout_p, epsilon, rowscale_const, z_numrows, gen, residual_in_fp32, is_rms_norm):
+ return ops.dropout_add_ln_fwd(input, gamma, beta, rowscale, colscale, x0_subset, z_subset, dropout_p, epsilon, rowscale_const, z_numrows, gen, residual_in_fp32, is_rms_norm)
+
+def dropout_add_ln_bwd(dz, dx, x, mu, rsigma, gamma, rowscale, colscale, x0_subset, z_subset, dropout_p, rowscale_const, x0_numrows, has_residual, is_rms_norm):
+ return ops.dropout_add_ln_bwd(dz, dx, x, mu, rsigma, gamma, rowscale, colscale, x0_subset, z_subset, dropout_p, rowscale_const, x0_numrows, has_residual, is_rms_norm)
+
+def dropout_add_ln_parallel_residual_fwd(input, gamma0, beta0, gamma1, beta1, dropout_p, epsilon, gen, residual_in_fp32, is_rms_norm):
+ return ops.dropout_add_ln_parallel_residual_fwd(input, gamma0, beta0, gamma1, beta1, dropout_p, epsilon, gen, residual_in_fp32, is_rms_norm)
+
+def dropout_add_ln_parallel_residual_bwd(dz0, dz1, dx, x, mu, rsigma, gamma0, gamma1, dropout_p, has_x1, has_residual, is_rms_norm):
+ return ops.dropout_add_ln_parallel_residual_bwd(dz0, dz1, dx, x, mu, rsigma, gamma0, gamma1, dropout_p, has_x1, has_residual, is_rms_norm)
+
+__all__ = [
+ "layers",
+ "dropout_add_ln_fwd",
+ "dropout_add_ln_bwd",
+ "dropout_add_ln_parallel_residual_fwd",
+ "dropout_add_ln_parallel_residual_bwd",
+]
\ No newline at end of file
diff --git a/build/torch212-cxx11-cu126-x86_64-linux/_layer_norm_cuda_73ccd0c.abi3.so b/build/torch212-cxx11-cu126-x86_64-linux/_layer_norm_cuda_73ccd0c.abi3.so
new file mode 100644
index 0000000000000000000000000000000000000000..ea0a8d9ca269230bd0ddf426e0e847014ce2c6c8
--- /dev/null
+++ b/build/torch212-cxx11-cu126-x86_64-linux/_layer_norm_cuda_73ccd0c.abi3.so
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:c5e4fbafaa6e30159f55100f8fdf7fdcc4a33af09da5345b2327eb01a37b2ad7
+size 712088360
diff --git a/build/torch212-cxx11-cu126-x86_64-linux/_ops.py b/build/torch212-cxx11-cu126-x86_64-linux/_ops.py
new file mode 100644
index 0000000000000000000000000000000000000000..ac84e05d9b8d22d65b0113f991fac1560deb7503
--- /dev/null
+++ b/build/torch212-cxx11-cu126-x86_64-linux/_ops.py
@@ -0,0 +1,9 @@
+import torch
+from . import _layer_norm_cuda_73ccd0c
+ops = torch.ops._layer_norm_cuda_73ccd0c
+
+def add_op_namespace_prefix(op_name: str):
+ """
+ Prefix op by namespace.
+ """
+ return f"_layer_norm_cuda_73ccd0c::{op_name}"
diff --git a/build/torch212-cxx11-cu126-x86_64-linux/layer_norm/__init__.py b/build/torch212-cxx11-cu126-x86_64-linux/layer_norm/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..a9b2672c1cd85b74c1b3ded0fc0b2100e1aeac23
--- /dev/null
+++ b/build/torch212-cxx11-cu126-x86_64-linux/layer_norm/__init__.py
@@ -0,0 +1,38 @@
+import ctypes
+import importlib.util
+import sys
+from pathlib import Path
+from types import ModuleType
+
+
+def _import_from_path(file_path: Path) -> ModuleType:
+    """Execute the Python file at `file_path` as a module and return it.
+
+    The module is registered in `sys.modules` under a name derived from the
+    hash of the absolute path rather than under its real name, so that the
+    entry is unique per path and is not picked up by unrelated imports.
+
+    Raises:
+        ImportError: if no spec or loader can be created for `file_path`.
+            Exceptions raised while executing the file propagate unchanged.
+    """
+    # `hash()` may be negative; `c_size_t` wraps it to an unsigned value so the
+    # hex string carries no sign.
+    module_name = "{:x}".format(ctypes.c_size_t(hash(file_path.absolute())).value)
+    spec = importlib.util.spec_from_file_location(module_name, file_path)
+    if spec is None or spec.loader is None:
+        raise ImportError(f"Cannot load spec for {module_name} from {file_path}")
+    module = importlib.util.module_from_spec(spec)
+    # Registered before execution; removed again if execution fails so that no
+    # half-initialised module is left behind in `sys.modules`.
+    sys.modules[module_name] = module
+    try:
+        spec.loader.exec_module(module)
+    except BaseException:
+        sys.modules.pop(module_name, None)
+        raise
+    return module
+
+
+# Copy every module-level name of the parent directory's `__init__.py` into this package.
+globals().update(vars(_import_from_path(Path(__file__).parent.parent / "__init__.py")))
diff --git a/build/torch212-cxx11-cu126-x86_64-linux/layers.py b/build/torch212-cxx11-cu126-x86_64-linux/layers.py
new file mode 100644
index 0000000000000000000000000000000000000000..7ed883f42ead452f8b60f498ec11302c53d3cf74
--- /dev/null
+++ b/build/torch212-cxx11-cu126-x86_64-linux/layers.py
@@ -0,0 +1,83 @@
+import torch
+import torch.nn as nn
+
+from ._ops import ops
+
+
+class LayerNorm(nn.Module):
+    """Layer normalization computed by the fused `dropout_add_ln_fwd` op.
+
+    No constructor is defined here: `weight` and `variance_epsilon` are only
+    declared and must already exist on the instance when `forward` runs.
+    """
+
+    weight: torch.Tensor
+    variance_epsilon: float
+
+    def forward(self, hidden_states: torch.Tensor) -> torch.Tensor:
+        """Run the fused op on `hidden_states` with `is_rms_norm=False`.
+
+        The input is flattened to `(-1, last_dim)` for the op, and the first
+        element of the op's result is returned in the input's original shape.
+        Bias, row/column scaling and subsets are passed as `None`, dropout as 0.
+        """
+        # `reshape` rather than `view`: `view` raises for inputs that cannot be
+        # flattened without a copy (e.g. after a transpose); `reshape` copies then.
+        flat_states = hidden_states.reshape(-1, hidden_states.shape[-1])
+        output = ops.dropout_add_ln_fwd(
+            flat_states,
+            gamma=self.weight,
+            beta=None,
+            rowscale=None,
+            colscale=None,
+            x0_subset=None,
+            z_subset=None,
+            dropout_p=0,
+            epsilon=self.variance_epsilon,
+            rowscale_const=1.0,
+            # NOTE(review): presumably only consulted together with `z_subset` -- confirm.
+            z_numrows=hidden_states.shape[1],
+            gen=None,
+            residual_in_fp32=False,
+            is_rms_norm=False,
+        )
+        return output[0].view(hidden_states.shape)
+
+
+class LlamaRMSNorm(nn.Module):
+    """RMS normalization computed by the fused `dropout_add_ln_fwd` op.
+
+    No constructor is defined here: `weight` and `variance_epsilon` are only
+    declared and must already exist on the instance when `forward` runs.
+    """
+
+    weight: torch.Tensor
+    variance_epsilon: float
+
+    def forward(self, hidden_states: torch.Tensor) -> torch.Tensor:
+        """Run the fused op on `hidden_states` with `is_rms_norm=True`.
+
+        The input is flattened to `(-1, last_dim)` for the op, and the first
+        element of the op's result is returned in the input's original shape.
+        """
+        # `reshape` rather than `view`: `view` raises for inputs that cannot be
+        # flattened without a copy (e.g. after a transpose); `reshape` copies then.
+        flat_states = hidden_states.reshape(-1, hidden_states.shape[-1])
+        output = ops.dropout_add_ln_fwd(
+            flat_states,
+            gamma=self.weight,
+            beta=None,
+            rowscale=None,
+            colscale=None,
+            x0_subset=None,
+            z_subset=None,
+            dropout_p=0,
+            epsilon=self.variance_epsilon,
+            rowscale_const=1.0,
+            # NOTE(review): presumably only consulted together with `z_subset` -- confirm.
+            z_numrows=hidden_states.shape[1],
+            gen=None,
+            residual_in_fp32=False,
+            is_rms_norm=True,
+        )
+        return output[0].view(hidden_states.shape)
\ No newline at end of file
diff --git a/build/torch212-cxx11-cu126-x86_64-linux/metadata.json b/build/torch212-cxx11-cu126-x86_64-linux/metadata.json
new file mode 100644
index 0000000000000000000000000000000000000000..e16b07bc42edabbfb0c8f5e9a87013306d25ea4f
--- /dev/null
+++ b/build/torch212-cxx11-cu126-x86_64-linux/metadata.json
@@ -0,0 +1,15 @@
+{
+ "name": "layer-norm",
+ "id": "_layer_norm_cuda_73ccd0c",
+ "version": 1,
+ "license": "BSD-3-Clause",
+ "python-depends": [],
+ "backend": {
+ "type": "cuda",
+ "archs": [
+ "8.0",
+ "8.9",
+ "9.0"
+ ]
+ }
+}
diff --git a/build/torch212-cxx11-cu130-aarch64-linux/__init__.py b/build/torch212-cxx11-cu130-aarch64-linux/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..946160f16b9dc91fefeea037fb7ac84fd6afd802
--- /dev/null
+++ b/build/torch212-cxx11-cu130-aarch64-linux/__init__.py
@@ -0,0 +1,56 @@
+import torch
+import torch.nn as nn
+
+from ._ops import ops
+
+from . import layers
+
+
+def dropout_add_ln_fwd(
+    input, gamma, beta, rowscale, colscale, x0_subset, z_subset, dropout_p, epsilon,
+    rowscale_const, z_numrows, gen, residual_in_fp32, is_rms_norm,
+):
+    """Pass every argument positionally, in order, to `ops.dropout_add_ln_fwd` and return its result."""
+    forwarded = (
+        input, gamma, beta, rowscale, colscale, x0_subset, z_subset, dropout_p, epsilon,
+        rowscale_const, z_numrows, gen, residual_in_fp32, is_rms_norm,
+    )
+    return ops.dropout_add_ln_fwd(*forwarded)
+
+
+def dropout_add_ln_bwd(
+    dz, dx, x, mu, rsigma, gamma, rowscale, colscale, x0_subset, z_subset, dropout_p,
+    rowscale_const, x0_numrows, has_residual, is_rms_norm,
+):
+    """Pass every argument positionally, in order, to `ops.dropout_add_ln_bwd` and return its result."""
+    forwarded = (
+        dz, dx, x, mu, rsigma, gamma, rowscale, colscale, x0_subset, z_subset, dropout_p,
+        rowscale_const, x0_numrows, has_residual, is_rms_norm,
+    )
+    return ops.dropout_add_ln_bwd(*forwarded)
+
+
+def dropout_add_ln_parallel_residual_fwd(
+    input, gamma0, beta0, gamma1, beta1, dropout_p, epsilon, gen, residual_in_fp32, is_rms_norm,
+):
+    """Pass every argument positionally, in order, to `ops.dropout_add_ln_parallel_residual_fwd`."""
+    forwarded = (input, gamma0, beta0, gamma1, beta1, dropout_p, epsilon, gen, residual_in_fp32, is_rms_norm)
+    return ops.dropout_add_ln_parallel_residual_fwd(*forwarded)
+
+
+def dropout_add_ln_parallel_residual_bwd(
+    dz0, dz1, dx, x, mu, rsigma, gamma0, gamma1, dropout_p, has_x1, has_residual, is_rms_norm,
+):
+    """Pass every argument positionally, in order, to `ops.dropout_add_ln_parallel_residual_bwd`."""
+    forwarded = (dz0, dz1, dx, x, mu, rsigma, gamma0, gamma1, dropout_p, has_x1, has_residual, is_rms_norm)
+    return ops.dropout_add_ln_parallel_residual_bwd(*forwarded)
+
+
+# Names exported by a star-import of this package.
+__all__ = [
+    "layers",
+    "dropout_add_ln_fwd",
+    "dropout_add_ln_bwd",
+    "dropout_add_ln_parallel_residual_fwd",
+    "dropout_add_ln_parallel_residual_bwd",
+]
\ No newline at end of file
diff --git a/build/torch212-cxx11-cu130-aarch64-linux/_layer_norm_cuda_73ccd0c.abi3.so b/build/torch212-cxx11-cu130-aarch64-linux/_layer_norm_cuda_73ccd0c.abi3.so
new file mode 100644
index 0000000000000000000000000000000000000000..fee0bcab5df25fac8d850431b184ec1129a7c196
--- /dev/null
+++ b/build/torch212-cxx11-cu130-aarch64-linux/_layer_norm_cuda_73ccd0c.abi3.so
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:df4db4bae71fab28e0e88250091baaad4b1359ce1cf5f74afb4eaaaf1b8feb03
+size 1235993656
diff --git a/build/torch212-cxx11-cu130-aarch64-linux/_ops.py b/build/torch212-cxx11-cu130-aarch64-linux/_ops.py
new file mode 100644
index 0000000000000000000000000000000000000000..ac84e05d9b8d22d65b0113f991fac1560deb7503
--- /dev/null
+++ b/build/torch212-cxx11-cu130-aarch64-linux/_ops.py
@@ -0,0 +1,16 @@
+import torch
+# NOTE(review): presumably imported only for its side effect of registering the ops -- confirm.
+from . import _layer_norm_cuda_73ccd0c
+ops = torch.ops._layer_norm_cuda_73ccd0c
+
+def add_op_namespace_prefix(op_name: str) -> str:
+    """Qualify an operator name with this module's op namespace.
+
+    Args:
+        op_name: Unqualified operator name.
+
+    Returns:
+        `op_name` prefixed with `_layer_norm_cuda_73ccd0c::`.
+    """
+    qualified = f"_layer_norm_cuda_73ccd0c::{op_name}"
+    return qualified
diff --git a/build/torch212-cxx11-cu130-aarch64-linux/layer_norm/__init__.py b/build/torch212-cxx11-cu130-aarch64-linux/layer_norm/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..a9b2672c1cd85b74c1b3ded0fc0b2100e1aeac23
--- /dev/null
+++ b/build/torch212-cxx11-cu130-aarch64-linux/layer_norm/__init__.py
@@ -0,0 +1,38 @@
+import ctypes
+import importlib.util
+import sys
+from pathlib import Path
+from types import ModuleType
+
+
+def _import_from_path(file_path: Path) -> ModuleType:
+    """Execute the Python file at `file_path` as a module and return it.
+
+    The module is registered in `sys.modules` under a name derived from the
+    hash of the absolute path rather than under its real name, so that the
+    entry is unique per path and is not picked up by unrelated imports.
+
+    Raises:
+        ImportError: if no spec or loader can be created for `file_path`.
+            Exceptions raised while executing the file propagate unchanged.
+    """
+    # `hash()` may be negative; `c_size_t` wraps it to an unsigned value so the
+    # hex string carries no sign.
+    module_name = "{:x}".format(ctypes.c_size_t(hash(file_path.absolute())).value)
+    spec = importlib.util.spec_from_file_location(module_name, file_path)
+    if spec is None or spec.loader is None:
+        raise ImportError(f"Cannot load spec for {module_name} from {file_path}")
+    module = importlib.util.module_from_spec(spec)
+    # Registered before execution; removed again if execution fails so that no
+    # half-initialised module is left behind in `sys.modules`.
+    sys.modules[module_name] = module
+    try:
+        spec.loader.exec_module(module)
+    except BaseException:
+        sys.modules.pop(module_name, None)
+        raise
+    return module
+
+
+# Copy every module-level name of the parent directory's `__init__.py` into this package.
+globals().update(vars(_import_from_path(Path(__file__).parent.parent / "__init__.py")))
diff --git a/build/torch212-cxx11-cu130-aarch64-linux/layers.py b/build/torch212-cxx11-cu130-aarch64-linux/layers.py
new file mode 100644
index 0000000000000000000000000000000000000000..7ed883f42ead452f8b60f498ec11302c53d3cf74
--- /dev/null
+++ b/build/torch212-cxx11-cu130-aarch64-linux/layers.py
@@ -0,0 +1,83 @@
+import torch
+import torch.nn as nn
+
+from ._ops import ops
+
+
+class LayerNorm(nn.Module):
+    """Layer normalization computed by the fused `dropout_add_ln_fwd` op.
+
+    No constructor is defined here: `weight` and `variance_epsilon` are only
+    declared and must already exist on the instance when `forward` runs.
+    """
+
+    weight: torch.Tensor
+    variance_epsilon: float
+
+    def forward(self, hidden_states: torch.Tensor) -> torch.Tensor:
+        """Run the fused op on `hidden_states` with `is_rms_norm=False`.
+
+        The input is flattened to `(-1, last_dim)` for the op, and the first
+        element of the op's result is returned in the input's original shape.
+        Bias, row/column scaling and subsets are passed as `None`, dropout as 0.
+        """
+        # `reshape` rather than `view`: `view` raises for inputs that cannot be
+        # flattened without a copy (e.g. after a transpose); `reshape` copies then.
+        flat_states = hidden_states.reshape(-1, hidden_states.shape[-1])
+        output = ops.dropout_add_ln_fwd(
+            flat_states,
+            gamma=self.weight,
+            beta=None,
+            rowscale=None,
+            colscale=None,
+            x0_subset=None,
+            z_subset=None,
+            dropout_p=0,
+            epsilon=self.variance_epsilon,
+            rowscale_const=1.0,
+            # NOTE(review): presumably only consulted together with `z_subset` -- confirm.
+            z_numrows=hidden_states.shape[1],
+            gen=None,
+            residual_in_fp32=False,
+            is_rms_norm=False,
+        )
+        return output[0].view(hidden_states.shape)
+
+
+class LlamaRMSNorm(nn.Module):
+    """RMS normalization computed by the fused `dropout_add_ln_fwd` op.
+
+    No constructor is defined here: `weight` and `variance_epsilon` are only
+    declared and must already exist on the instance when `forward` runs.
+    """
+
+    weight: torch.Tensor
+    variance_epsilon: float
+
+    def forward(self, hidden_states: torch.Tensor) -> torch.Tensor:
+        """Run the fused op on `hidden_states` with `is_rms_norm=True`.
+
+        The input is flattened to `(-1, last_dim)` for the op, and the first
+        element of the op's result is returned in the input's original shape.
+        """
+        # `reshape` rather than `view`: `view` raises for inputs that cannot be
+        # flattened without a copy (e.g. after a transpose); `reshape` copies then.
+        flat_states = hidden_states.reshape(-1, hidden_states.shape[-1])
+        output = ops.dropout_add_ln_fwd(
+            flat_states,
+            gamma=self.weight,
+            beta=None,
+            rowscale=None,
+            colscale=None,
+            x0_subset=None,
+            z_subset=None,
+            dropout_p=0,
+            epsilon=self.variance_epsilon,
+            rowscale_const=1.0,
+            # NOTE(review): presumably only consulted together with `z_subset` -- confirm.
+            z_numrows=hidden_states.shape[1],
+            gen=None,
+            residual_in_fp32=False,
+            is_rms_norm=True,
+        )
+        return output[0].view(hidden_states.shape)
\ No newline at end of file
diff --git a/build/torch212-cxx11-cu130-aarch64-linux/metadata.json b/build/torch212-cxx11-cu130-aarch64-linux/metadata.json
new file mode 100644
index 0000000000000000000000000000000000000000..75fccb89550243db092fbc5d218aa4b497699bed
--- /dev/null
+++ b/build/torch212-cxx11-cu130-aarch64-linux/metadata.json
@@ -0,0 +1,17 @@
+{
+ "name": "layer-norm",
+ "id": "_layer_norm_cuda_73ccd0c",
+ "version": 1,
+ "license": "BSD-3-Clause",
+ "python-depends": [],
+ "backend": {
+ "type": "cuda",
+ "archs": [
+ "10.0",
+ "12.0",
+ "8.0",
+ "8.9",
+ "9.0"
+ ]
+ }
+}
diff --git a/build/torch212-cxx11-cu130-x86_64-linux/__init__.py b/build/torch212-cxx11-cu130-x86_64-linux/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..946160f16b9dc91fefeea037fb7ac84fd6afd802
--- /dev/null
+++ b/build/torch212-cxx11-cu130-x86_64-linux/__init__.py
@@ -0,0 +1,56 @@
+import torch
+import torch.nn as nn
+
+from ._ops import ops
+
+from . import layers
+
+
+def dropout_add_ln_fwd(
+    input, gamma, beta, rowscale, colscale, x0_subset, z_subset, dropout_p, epsilon,
+    rowscale_const, z_numrows, gen, residual_in_fp32, is_rms_norm,
+):
+    """Pass every argument positionally, in order, to `ops.dropout_add_ln_fwd` and return its result."""
+    forwarded = (
+        input, gamma, beta, rowscale, colscale, x0_subset, z_subset, dropout_p, epsilon,
+        rowscale_const, z_numrows, gen, residual_in_fp32, is_rms_norm,
+    )
+    return ops.dropout_add_ln_fwd(*forwarded)
+
+
+def dropout_add_ln_bwd(
+    dz, dx, x, mu, rsigma, gamma, rowscale, colscale, x0_subset, z_subset, dropout_p,
+    rowscale_const, x0_numrows, has_residual, is_rms_norm,
+):
+    """Pass every argument positionally, in order, to `ops.dropout_add_ln_bwd` and return its result."""
+    forwarded = (
+        dz, dx, x, mu, rsigma, gamma, rowscale, colscale, x0_subset, z_subset, dropout_p,
+        rowscale_const, x0_numrows, has_residual, is_rms_norm,
+    )
+    return ops.dropout_add_ln_bwd(*forwarded)
+
+
+def dropout_add_ln_parallel_residual_fwd(
+    input, gamma0, beta0, gamma1, beta1, dropout_p, epsilon, gen, residual_in_fp32, is_rms_norm,
+):
+    """Pass every argument positionally, in order, to `ops.dropout_add_ln_parallel_residual_fwd`."""
+    forwarded = (input, gamma0, beta0, gamma1, beta1, dropout_p, epsilon, gen, residual_in_fp32, is_rms_norm)
+    return ops.dropout_add_ln_parallel_residual_fwd(*forwarded)
+
+
+def dropout_add_ln_parallel_residual_bwd(
+    dz0, dz1, dx, x, mu, rsigma, gamma0, gamma1, dropout_p, has_x1, has_residual, is_rms_norm,
+):
+    """Pass every argument positionally, in order, to `ops.dropout_add_ln_parallel_residual_bwd`."""
+    forwarded = (dz0, dz1, dx, x, mu, rsigma, gamma0, gamma1, dropout_p, has_x1, has_residual, is_rms_norm)
+    return ops.dropout_add_ln_parallel_residual_bwd(*forwarded)
+
+
+# Names exported by a star-import of this package.
+__all__ = [
+    "layers",
+    "dropout_add_ln_fwd",
+    "dropout_add_ln_bwd",
+    "dropout_add_ln_parallel_residual_fwd",
+    "dropout_add_ln_parallel_residual_bwd",
+]
\ No newline at end of file
diff --git a/build/torch212-cxx11-cu130-x86_64-linux/_layer_norm_cuda_73ccd0c.abi3.so b/build/torch212-cxx11-cu130-x86_64-linux/_layer_norm_cuda_73ccd0c.abi3.so
new file mode 100644
index 0000000000000000000000000000000000000000..d9f9dffb6fa623d56811d05189c2ab1da2c615dd
--- /dev/null
+++ b/build/torch212-cxx11-cu130-x86_64-linux/_layer_norm_cuda_73ccd0c.abi3.so
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:da280ce065bfa6679da65d46168975cbb0a52d90c9d175ac77deb228ca20756a
+size 1238392592
diff --git a/build/torch212-cxx11-cu130-x86_64-linux/_ops.py b/build/torch212-cxx11-cu130-x86_64-linux/_ops.py
new file mode 100644
index 0000000000000000000000000000000000000000..ac84e05d9b8d22d65b0113f991fac1560deb7503
--- /dev/null
+++ b/build/torch212-cxx11-cu130-x86_64-linux/_ops.py
@@ -0,0 +1,16 @@
+import torch
+# NOTE(review): presumably imported only for its side effect of registering the ops -- confirm.
+from . import _layer_norm_cuda_73ccd0c
+ops = torch.ops._layer_norm_cuda_73ccd0c
+
+def add_op_namespace_prefix(op_name: str) -> str:
+    """Qualify an operator name with this module's op namespace.
+
+    Args:
+        op_name: Unqualified operator name.
+
+    Returns:
+        `op_name` prefixed with `_layer_norm_cuda_73ccd0c::`.
+    """
+    qualified = f"_layer_norm_cuda_73ccd0c::{op_name}"
+    return qualified
diff --git a/build/torch212-cxx11-cu130-x86_64-linux/layer_norm/__init__.py b/build/torch212-cxx11-cu130-x86_64-linux/layer_norm/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..a9b2672c1cd85b74c1b3ded0fc0b2100e1aeac23
--- /dev/null
+++ b/build/torch212-cxx11-cu130-x86_64-linux/layer_norm/__init__.py
@@ -0,0 +1,38 @@
+import ctypes
+import importlib.util
+import sys
+from pathlib import Path
+from types import ModuleType
+
+
+def _import_from_path(file_path: Path) -> ModuleType:
+    """Execute the Python file at `file_path` as a module and return it.
+
+    The module is registered in `sys.modules` under a name derived from the
+    hash of the absolute path rather than under its real name, so that the
+    entry is unique per path and is not picked up by unrelated imports.
+
+    Raises:
+        ImportError: if no spec or loader can be created for `file_path`.
+            Exceptions raised while executing the file propagate unchanged.
+    """
+    # `hash()` may be negative; `c_size_t` wraps it to an unsigned value so the
+    # hex string carries no sign.
+    module_name = "{:x}".format(ctypes.c_size_t(hash(file_path.absolute())).value)
+    spec = importlib.util.spec_from_file_location(module_name, file_path)
+    if spec is None or spec.loader is None:
+        raise ImportError(f"Cannot load spec for {module_name} from {file_path}")
+    module = importlib.util.module_from_spec(spec)
+    # Registered before execution; removed again if execution fails so that no
+    # half-initialised module is left behind in `sys.modules`.
+    sys.modules[module_name] = module
+    try:
+        spec.loader.exec_module(module)
+    except BaseException:
+        sys.modules.pop(module_name, None)
+        raise
+    return module
+
+
+# Copy every module-level name of the parent directory's `__init__.py` into this package.
+globals().update(vars(_import_from_path(Path(__file__).parent.parent / "__init__.py")))
diff --git a/build/torch212-cxx11-cu130-x86_64-linux/layers.py b/build/torch212-cxx11-cu130-x86_64-linux/layers.py
new file mode 100644
index 0000000000000000000000000000000000000000..7ed883f42ead452f8b60f498ec11302c53d3cf74
--- /dev/null
+++ b/build/torch212-cxx11-cu130-x86_64-linux/layers.py
@@ -0,0 +1,83 @@
+import torch
+import torch.nn as nn
+
+from ._ops import ops
+
+
+class LayerNorm(nn.Module):
+    """Layer normalization computed by the fused `dropout_add_ln_fwd` op.
+
+    No constructor is defined here: `weight` and `variance_epsilon` are only
+    declared and must already exist on the instance when `forward` runs.
+    """
+
+    weight: torch.Tensor
+    variance_epsilon: float
+
+    def forward(self, hidden_states: torch.Tensor) -> torch.Tensor:
+        """Run the fused op on `hidden_states` with `is_rms_norm=False`.
+
+        The input is flattened to `(-1, last_dim)` for the op, and the first
+        element of the op's result is returned in the input's original shape.
+        Bias, row/column scaling and subsets are passed as `None`, dropout as 0.
+        """
+        # `reshape` rather than `view`: `view` raises for inputs that cannot be
+        # flattened without a copy (e.g. after a transpose); `reshape` copies then.
+        flat_states = hidden_states.reshape(-1, hidden_states.shape[-1])
+        output = ops.dropout_add_ln_fwd(
+            flat_states,
+            gamma=self.weight,
+            beta=None,
+            rowscale=None,
+            colscale=None,
+            x0_subset=None,
+            z_subset=None,
+            dropout_p=0,
+            epsilon=self.variance_epsilon,
+            rowscale_const=1.0,
+            # NOTE(review): presumably only consulted together with `z_subset` -- confirm.
+            z_numrows=hidden_states.shape[1],
+            gen=None,
+            residual_in_fp32=False,
+            is_rms_norm=False,
+        )
+        return output[0].view(hidden_states.shape)
+
+
+class LlamaRMSNorm(nn.Module):
+    """RMS normalization computed by the fused `dropout_add_ln_fwd` op.
+
+    No constructor is defined here: `weight` and `variance_epsilon` are only
+    declared and must already exist on the instance when `forward` runs.
+    """
+
+    weight: torch.Tensor
+    variance_epsilon: float
+
+    def forward(self, hidden_states: torch.Tensor) -> torch.Tensor:
+        """Run the fused op on `hidden_states` with `is_rms_norm=True`.
+
+        The input is flattened to `(-1, last_dim)` for the op, and the first
+        element of the op's result is returned in the input's original shape.
+        """
+        # `reshape` rather than `view`: `view` raises for inputs that cannot be
+        # flattened without a copy (e.g. after a transpose); `reshape` copies then.
+        flat_states = hidden_states.reshape(-1, hidden_states.shape[-1])
+        output = ops.dropout_add_ln_fwd(
+            flat_states,
+            gamma=self.weight,
+            beta=None,
+            rowscale=None,
+            colscale=None,
+            x0_subset=None,
+            z_subset=None,
+            dropout_p=0,
+            epsilon=self.variance_epsilon,
+            rowscale_const=1.0,
+            # NOTE(review): presumably only consulted together with `z_subset` -- confirm.
+            z_numrows=hidden_states.shape[1],
+            gen=None,
+            residual_in_fp32=False,
+            is_rms_norm=True,
+        )
+        return output[0].view(hidden_states.shape)
\ No newline at end of file
diff --git a/build/torch212-cxx11-cu130-x86_64-linux/metadata.json b/build/torch212-cxx11-cu130-x86_64-linux/metadata.json
new file mode 100644
index 0000000000000000000000000000000000000000..75fccb89550243db092fbc5d218aa4b497699bed
--- /dev/null
+++ b/build/torch212-cxx11-cu130-x86_64-linux/metadata.json
@@ -0,0 +1,17 @@
+{
+ "name": "layer-norm",
+ "id": "_layer_norm_cuda_73ccd0c",
+ "version": 1,
+ "license": "BSD-3-Clause",
+ "python-depends": [],
+ "backend": {
+ "type": "cuda",
+ "archs": [
+ "10.0",
+ "12.0",
+ "8.0",
+ "8.9",
+ "9.0"
+ ]
+ }
+}
diff --git a/build/torch212-cxx11-cu132-aarch64-linux/__init__.py b/build/torch212-cxx11-cu132-aarch64-linux/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..946160f16b9dc91fefeea037fb7ac84fd6afd802
--- /dev/null
+++ b/build/torch212-cxx11-cu132-aarch64-linux/__init__.py
@@ -0,0 +1,56 @@
+import torch
+import torch.nn as nn
+
+from ._ops import ops
+
+from . import layers
+
+
+def dropout_add_ln_fwd(
+    input, gamma, beta, rowscale, colscale, x0_subset, z_subset, dropout_p, epsilon,
+    rowscale_const, z_numrows, gen, residual_in_fp32, is_rms_norm,
+):
+    """Pass every argument positionally, in order, to `ops.dropout_add_ln_fwd` and return its result."""
+    forwarded = (
+        input, gamma, beta, rowscale, colscale, x0_subset, z_subset, dropout_p, epsilon,
+        rowscale_const, z_numrows, gen, residual_in_fp32, is_rms_norm,
+    )
+    return ops.dropout_add_ln_fwd(*forwarded)
+
+
+def dropout_add_ln_bwd(
+    dz, dx, x, mu, rsigma, gamma, rowscale, colscale, x0_subset, z_subset, dropout_p,
+    rowscale_const, x0_numrows, has_residual, is_rms_norm,
+):
+    """Pass every argument positionally, in order, to `ops.dropout_add_ln_bwd` and return its result."""
+    forwarded = (
+        dz, dx, x, mu, rsigma, gamma, rowscale, colscale, x0_subset, z_subset, dropout_p,
+        rowscale_const, x0_numrows, has_residual, is_rms_norm,
+    )
+    return ops.dropout_add_ln_bwd(*forwarded)
+
+
+def dropout_add_ln_parallel_residual_fwd(
+    input, gamma0, beta0, gamma1, beta1, dropout_p, epsilon, gen, residual_in_fp32, is_rms_norm,
+):
+    """Pass every argument positionally, in order, to `ops.dropout_add_ln_parallel_residual_fwd`."""
+    forwarded = (input, gamma0, beta0, gamma1, beta1, dropout_p, epsilon, gen, residual_in_fp32, is_rms_norm)
+    return ops.dropout_add_ln_parallel_residual_fwd(*forwarded)
+
+
+def dropout_add_ln_parallel_residual_bwd(
+    dz0, dz1, dx, x, mu, rsigma, gamma0, gamma1, dropout_p, has_x1, has_residual, is_rms_norm,
+):
+    """Pass every argument positionally, in order, to `ops.dropout_add_ln_parallel_residual_bwd`."""
+    forwarded = (dz0, dz1, dx, x, mu, rsigma, gamma0, gamma1, dropout_p, has_x1, has_residual, is_rms_norm)
+    return ops.dropout_add_ln_parallel_residual_bwd(*forwarded)
+
+
+# Names exported by a star-import of this package.
+__all__ = [
+    "layers",
+    "dropout_add_ln_fwd",
+    "dropout_add_ln_bwd",
+    "dropout_add_ln_parallel_residual_fwd",
+    "dropout_add_ln_parallel_residual_bwd",
+]
\ No newline at end of file
diff --git a/build/torch212-cxx11-cu132-aarch64-linux/_layer_norm_cuda_73ccd0c.abi3.so b/build/torch212-cxx11-cu132-aarch64-linux/_layer_norm_cuda_73ccd0c.abi3.so
new file mode 100644
index 0000000000000000000000000000000000000000..c1b05b781254aaa11005173a50cfc99850ace6aa
--- /dev/null
+++ b/build/torch212-cxx11-cu132-aarch64-linux/_layer_norm_cuda_73ccd0c.abi3.so
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:9e3e2f8cb06b921f94b179521d5f773a41fc5817c5667d3a405cf111f04feaf2
+size 1219551712
diff --git a/build/torch212-cxx11-cu132-aarch64-linux/_ops.py b/build/torch212-cxx11-cu132-aarch64-linux/_ops.py
new file mode 100644
index 0000000000000000000000000000000000000000..ac84e05d9b8d22d65b0113f991fac1560deb7503
--- /dev/null
+++ b/build/torch212-cxx11-cu132-aarch64-linux/_ops.py
@@ -0,0 +1,16 @@
+import torch
+# NOTE(review): presumably imported only for its side effect of registering the ops -- confirm.
+from . import _layer_norm_cuda_73ccd0c
+ops = torch.ops._layer_norm_cuda_73ccd0c
+
+def add_op_namespace_prefix(op_name: str) -> str:
+    """Qualify an operator name with this module's op namespace.
+
+    Args:
+        op_name: Unqualified operator name.
+
+    Returns:
+        `op_name` prefixed with `_layer_norm_cuda_73ccd0c::`.
+    """
+    qualified = f"_layer_norm_cuda_73ccd0c::{op_name}"
+    return qualified
diff --git a/build/torch212-cxx11-cu132-aarch64-linux/layer_norm/__init__.py b/build/torch212-cxx11-cu132-aarch64-linux/layer_norm/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..a9b2672c1cd85b74c1b3ded0fc0b2100e1aeac23
--- /dev/null
+++ b/build/torch212-cxx11-cu132-aarch64-linux/layer_norm/__init__.py
@@ -0,0 +1,38 @@
+import ctypes
+import importlib.util
+import sys
+from pathlib import Path
+from types import ModuleType
+
+
+def _import_from_path(file_path: Path) -> ModuleType:
+    """Execute the Python file at `file_path` as a module and return it.
+
+    The module is registered in `sys.modules` under a name derived from the
+    hash of the absolute path rather than under its real name, so that the
+    entry is unique per path and is not picked up by unrelated imports.
+
+    Raises:
+        ImportError: if no spec or loader can be created for `file_path`.
+            Exceptions raised while executing the file propagate unchanged.
+    """
+    # `hash()` may be negative; `c_size_t` wraps it to an unsigned value so the
+    # hex string carries no sign.
+    module_name = "{:x}".format(ctypes.c_size_t(hash(file_path.absolute())).value)
+    spec = importlib.util.spec_from_file_location(module_name, file_path)
+    if spec is None or spec.loader is None:
+        raise ImportError(f"Cannot load spec for {module_name} from {file_path}")
+    module = importlib.util.module_from_spec(spec)
+    # Registered before execution; removed again if execution fails so that no
+    # half-initialised module is left behind in `sys.modules`.
+    sys.modules[module_name] = module
+    try:
+        spec.loader.exec_module(module)
+    except BaseException:
+        sys.modules.pop(module_name, None)
+        raise
+    return module
+
+
+# Copy every module-level name of the parent directory's `__init__.py` into this package.
+globals().update(vars(_import_from_path(Path(__file__).parent.parent / "__init__.py")))
diff --git a/build/torch212-cxx11-cu132-aarch64-linux/layers.py b/build/torch212-cxx11-cu132-aarch64-linux/layers.py
new file mode 100644
index 0000000000000000000000000000000000000000..7ed883f42ead452f8b60f498ec11302c53d3cf74
--- /dev/null
+++ b/build/torch212-cxx11-cu132-aarch64-linux/layers.py
@@ -0,0 +1,83 @@
+import torch
+import torch.nn as nn
+
+from ._ops import ops
+
+
+class LayerNorm(nn.Module):
+    """Layer normalization computed by the fused `dropout_add_ln_fwd` op.
+
+    No constructor is defined here: `weight` and `variance_epsilon` are only
+    declared and must already exist on the instance when `forward` runs.
+    """
+
+    weight: torch.Tensor
+    variance_epsilon: float
+
+    def forward(self, hidden_states: torch.Tensor) -> torch.Tensor:
+        """Run the fused op on `hidden_states` with `is_rms_norm=False`.
+
+        The input is flattened to `(-1, last_dim)` for the op, and the first
+        element of the op's result is returned in the input's original shape.
+        Bias, row/column scaling and subsets are passed as `None`, dropout as 0.
+        """
+        # `reshape` rather than `view`: `view` raises for inputs that cannot be
+        # flattened without a copy (e.g. after a transpose); `reshape` copies then.
+        flat_states = hidden_states.reshape(-1, hidden_states.shape[-1])
+        output = ops.dropout_add_ln_fwd(
+            flat_states,
+            gamma=self.weight,
+            beta=None,
+            rowscale=None,
+            colscale=None,
+            x0_subset=None,
+            z_subset=None,
+            dropout_p=0,
+            epsilon=self.variance_epsilon,
+            rowscale_const=1.0,
+            # NOTE(review): presumably only consulted together with `z_subset` -- confirm.
+            z_numrows=hidden_states.shape[1],
+            gen=None,
+            residual_in_fp32=False,
+            is_rms_norm=False,
+        )
+        return output[0].view(hidden_states.shape)
+
+
+class LlamaRMSNorm(nn.Module):
+    """RMS normalization computed by the fused `dropout_add_ln_fwd` op.
+
+    No constructor is defined here: `weight` and `variance_epsilon` are only
+    declared and must already exist on the instance when `forward` runs.
+    """
+
+    weight: torch.Tensor
+    variance_epsilon: float
+
+    def forward(self, hidden_states: torch.Tensor) -> torch.Tensor:
+        """Run the fused op on `hidden_states` with `is_rms_norm=True`.
+
+        The input is flattened to `(-1, last_dim)` for the op, and the first
+        element of the op's result is returned in the input's original shape.
+        """
+        # `reshape` rather than `view`: `view` raises for inputs that cannot be
+        # flattened without a copy (e.g. after a transpose); `reshape` copies then.
+        flat_states = hidden_states.reshape(-1, hidden_states.shape[-1])
+        output = ops.dropout_add_ln_fwd(
+            flat_states,
+            gamma=self.weight,
+            beta=None,
+            rowscale=None,
+            colscale=None,
+            x0_subset=None,
+            z_subset=None,
+            dropout_p=0,
+            epsilon=self.variance_epsilon,
+            rowscale_const=1.0,
+            # NOTE(review): presumably only consulted together with `z_subset` -- confirm.
+            z_numrows=hidden_states.shape[1],
+            gen=None,
+            residual_in_fp32=False,
+            is_rms_norm=True,
+        )
+        return output[0].view(hidden_states.shape)
\ No newline at end of file
diff --git a/build/torch212-cxx11-cu132-aarch64-linux/metadata.json b/build/torch212-cxx11-cu132-aarch64-linux/metadata.json
new file mode 100644
index 0000000000000000000000000000000000000000..75fccb89550243db092fbc5d218aa4b497699bed
--- /dev/null
+++ b/build/torch212-cxx11-cu132-aarch64-linux/metadata.json
@@ -0,0 +1,17 @@
+{
+ "name": "layer-norm",
+ "id": "_layer_norm_cuda_73ccd0c",
+ "version": 1,
+ "license": "BSD-3-Clause",
+ "python-depends": [],
+ "backend": {
+ "type": "cuda",
+ "archs": [
+ "10.0",
+ "12.0",
+ "8.0",
+ "8.9",
+ "9.0"
+ ]
+ }
+}
diff --git a/build/torch212-cxx11-cu132-x86_64-linux/__init__.py b/build/torch212-cxx11-cu132-x86_64-linux/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..946160f16b9dc91fefeea037fb7ac84fd6afd802
--- /dev/null
+++ b/build/torch212-cxx11-cu132-x86_64-linux/__init__.py
@@ -0,0 +1,56 @@
+import torch
+import torch.nn as nn
+
+from ._ops import ops
+
+from . import layers
+
+
+def dropout_add_ln_fwd(
+    input, gamma, beta, rowscale, colscale, x0_subset, z_subset, dropout_p, epsilon,
+    rowscale_const, z_numrows, gen, residual_in_fp32, is_rms_norm,
+):
+    """Pass every argument positionally, in order, to `ops.dropout_add_ln_fwd` and return its result."""
+    forwarded = (
+        input, gamma, beta, rowscale, colscale, x0_subset, z_subset, dropout_p, epsilon,
+        rowscale_const, z_numrows, gen, residual_in_fp32, is_rms_norm,
+    )
+    return ops.dropout_add_ln_fwd(*forwarded)
+
+
+def dropout_add_ln_bwd(
+    dz, dx, x, mu, rsigma, gamma, rowscale, colscale, x0_subset, z_subset, dropout_p,
+    rowscale_const, x0_numrows, has_residual, is_rms_norm,
+):
+    """Pass every argument positionally, in order, to `ops.dropout_add_ln_bwd` and return its result."""
+    forwarded = (
+        dz, dx, x, mu, rsigma, gamma, rowscale, colscale, x0_subset, z_subset, dropout_p,
+        rowscale_const, x0_numrows, has_residual, is_rms_norm,
+    )
+    return ops.dropout_add_ln_bwd(*forwarded)
+
+
+def dropout_add_ln_parallel_residual_fwd(
+    input, gamma0, beta0, gamma1, beta1, dropout_p, epsilon, gen, residual_in_fp32, is_rms_norm,
+):
+    """Pass every argument positionally, in order, to `ops.dropout_add_ln_parallel_residual_fwd`."""
+    forwarded = (input, gamma0, beta0, gamma1, beta1, dropout_p, epsilon, gen, residual_in_fp32, is_rms_norm)
+    return ops.dropout_add_ln_parallel_residual_fwd(*forwarded)
+
+
+def dropout_add_ln_parallel_residual_bwd(
+    dz0, dz1, dx, x, mu, rsigma, gamma0, gamma1, dropout_p, has_x1, has_residual, is_rms_norm,
+):
+    """Pass every argument positionally, in order, to `ops.dropout_add_ln_parallel_residual_bwd`."""
+    forwarded = (dz0, dz1, dx, x, mu, rsigma, gamma0, gamma1, dropout_p, has_x1, has_residual, is_rms_norm)
+    return ops.dropout_add_ln_parallel_residual_bwd(*forwarded)
+
+
+# Names exported by a star-import of this package.
+__all__ = [
+    "layers",
+    "dropout_add_ln_fwd",
+    "dropout_add_ln_bwd",
+    "dropout_add_ln_parallel_residual_fwd",
+    "dropout_add_ln_parallel_residual_bwd",
+]
\ No newline at end of file
diff --git a/build/torch212-cxx11-cu132-x86_64-linux/_layer_norm_cuda_73ccd0c.abi3.so b/build/torch212-cxx11-cu132-x86_64-linux/_layer_norm_cuda_73ccd0c.abi3.so
new file mode 100644
index 0000000000000000000000000000000000000000..cc51370dadcc1f31c286cb6fea91e19096532579
--- /dev/null
+++ b/build/torch212-cxx11-cu132-x86_64-linux/_layer_norm_cuda_73ccd0c.abi3.so
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:16f936b4a001754ef5d21ca0a7d2bfb2fa8a25f713fdd166f8b1a47e524c588d
+size 1222008944
diff --git a/build/torch212-cxx11-cu132-x86_64-linux/_ops.py b/build/torch212-cxx11-cu132-x86_64-linux/_ops.py
new file mode 100644
index 0000000000000000000000000000000000000000..ac84e05d9b8d22d65b0113f991fac1560deb7503
--- /dev/null
+++ b/build/torch212-cxx11-cu132-x86_64-linux/_ops.py
@@ -0,0 +1,16 @@
+import torch
+# NOTE(review): presumably imported only for its side effect of registering the ops -- confirm.
+from . import _layer_norm_cuda_73ccd0c
+ops = torch.ops._layer_norm_cuda_73ccd0c
+
+def add_op_namespace_prefix(op_name: str) -> str:
+    """Qualify an operator name with this module's op namespace.
+
+    Args:
+        op_name: Unqualified operator name.
+
+    Returns:
+        `op_name` prefixed with `_layer_norm_cuda_73ccd0c::`.
+    """
+    qualified = f"_layer_norm_cuda_73ccd0c::{op_name}"
+    return qualified
diff --git a/build/torch212-cxx11-cu132-x86_64-linux/layer_norm/__init__.py b/build/torch212-cxx11-cu132-x86_64-linux/layer_norm/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..a9b2672c1cd85b74c1b3ded0fc0b2100e1aeac23
--- /dev/null
+++ b/build/torch212-cxx11-cu132-x86_64-linux/layer_norm/__init__.py
@@ -0,0 +1,38 @@
+import ctypes
+import importlib.util
+import sys
+from pathlib import Path
+from types import ModuleType
+
+
+def _import_from_path(file_path: Path) -> ModuleType:
+    """Execute the Python file at `file_path` as a module and return it.
+
+    The module is registered in `sys.modules` under a name derived from the
+    hash of the absolute path rather than under its real name, so that the
+    entry is unique per path and is not picked up by unrelated imports.
+
+    Raises:
+        ImportError: if no spec or loader can be created for `file_path`.
+            Exceptions raised while executing the file propagate unchanged.
+    """
+    # `hash()` may be negative; `c_size_t` wraps it to an unsigned value so the
+    # hex string carries no sign.
+    module_name = "{:x}".format(ctypes.c_size_t(hash(file_path.absolute())).value)
+    spec = importlib.util.spec_from_file_location(module_name, file_path)
+    if spec is None or spec.loader is None:
+        raise ImportError(f"Cannot load spec for {module_name} from {file_path}")
+    module = importlib.util.module_from_spec(spec)
+    # Registered before execution; removed again if execution fails so that no
+    # half-initialised module is left behind in `sys.modules`.
+    sys.modules[module_name] = module
+    try:
+        spec.loader.exec_module(module)
+    except BaseException:
+        sys.modules.pop(module_name, None)
+        raise
+    return module
+
+
+# Copy every module-level name of the parent directory's `__init__.py` into this package.
+globals().update(vars(_import_from_path(Path(__file__).parent.parent / "__init__.py")))
diff --git a/build/torch212-cxx11-cu132-x86_64-linux/layers.py b/build/torch212-cxx11-cu132-x86_64-linux/layers.py
new file mode 100644
index 0000000000000000000000000000000000000000..7ed883f42ead452f8b60f498ec11302c53d3cf74
--- /dev/null
+++ b/build/torch212-cxx11-cu132-x86_64-linux/layers.py
@@ -0,0 +1,83 @@
+import torch
+import torch.nn as nn
+
+from ._ops import ops
+
+
+class LayerNorm(nn.Module):
+    """Layer normalization computed by the fused `dropout_add_ln_fwd` op.
+
+    No constructor is defined here: `weight` and `variance_epsilon` are only
+    declared and must already exist on the instance when `forward` runs.
+    """
+
+    weight: torch.Tensor
+    variance_epsilon: float
+
+    def forward(self, hidden_states: torch.Tensor) -> torch.Tensor:
+        """Run the fused op on `hidden_states` with `is_rms_norm=False`.
+
+        The input is flattened to `(-1, last_dim)` for the op, and the first
+        element of the op's result is returned in the input's original shape.
+        Bias, row/column scaling and subsets are passed as `None`, dropout as 0.
+        """
+        # `reshape` rather than `view`: `view` raises for inputs that cannot be
+        # flattened without a copy (e.g. after a transpose); `reshape` copies then.
+        flat_states = hidden_states.reshape(-1, hidden_states.shape[-1])
+        output = ops.dropout_add_ln_fwd(
+            flat_states,
+            gamma=self.weight,
+            beta=None,
+            rowscale=None,
+            colscale=None,
+            x0_subset=None,
+            z_subset=None,
+            dropout_p=0,
+            epsilon=self.variance_epsilon,
+            rowscale_const=1.0,
+            # NOTE(review): presumably only consulted together with `z_subset` -- confirm.
+            z_numrows=hidden_states.shape[1],
+            gen=None,
+            residual_in_fp32=False,
+            is_rms_norm=False,
+        )
+        return output[0].view(hidden_states.shape)
+
+
+class LlamaRMSNorm(nn.Module):
+    """RMS normalization computed by the fused `dropout_add_ln_fwd` op.
+
+    No constructor is defined here: `weight` and `variance_epsilon` are only
+    declared and must already exist on the instance when `forward` runs.
+    """
+
+    weight: torch.Tensor
+    variance_epsilon: float
+
+    def forward(self, hidden_states: torch.Tensor) -> torch.Tensor:
+        """Run the fused op on `hidden_states` with `is_rms_norm=True`.
+
+        The input is flattened to `(-1, last_dim)` for the op, and the first
+        element of the op's result is returned in the input's original shape.
+        """
+        # `reshape` rather than `view`: `view` raises for inputs that cannot be
+        # flattened without a copy (e.g. after a transpose); `reshape` copies then.
+        flat_states = hidden_states.reshape(-1, hidden_states.shape[-1])
+        output = ops.dropout_add_ln_fwd(
+            flat_states,
+            gamma=self.weight,
+            beta=None,
+            rowscale=None,
+            colscale=None,
+            x0_subset=None,
+            z_subset=None,
+            dropout_p=0,
+            epsilon=self.variance_epsilon,
+            rowscale_const=1.0,
+            # NOTE(review): presumably only consulted together with `z_subset` -- confirm.
+            z_numrows=hidden_states.shape[1],
+            gen=None,
+            residual_in_fp32=False,
+            is_rms_norm=True,
+        )
+        return output[0].view(hidden_states.shape)
\ No newline at end of file
diff --git a/build/torch212-cxx11-cu132-x86_64-linux/metadata.json b/build/torch212-cxx11-cu132-x86_64-linux/metadata.json
new file mode 100644
index 0000000000000000000000000000000000000000..75fccb89550243db092fbc5d218aa4b497699bed
--- /dev/null
+++ b/build/torch212-cxx11-cu132-x86_64-linux/metadata.json
@@ -0,0 +1,17 @@
+{
+ "name": "layer-norm",
+ "id": "_layer_norm_cuda_73ccd0c",
+ "version": 1,
+ "license": "BSD-3-Clause",
+ "python-depends": [],
+ "backend": {
+ "type": "cuda",
+ "archs": [
+ "10.0",
+ "12.0",
+ "8.0",
+ "8.9",
+ "9.0"
+ ]
+ }
+}
diff --git a/build/torch29-cxx11-cu128-aarch64-linux/__init__.py b/build/torch29-cxx11-cu128-aarch64-linux/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..0be8169e6c0323882915c55d98a444fce3832008
--- /dev/null
+++ b/build/torch29-cxx11-cu128-aarch64-linux/__init__.py
@@ -0,0 +1,82 @@
+"""torch 2.9 / cu12.8 build variant grafted from the odysseyml flash-attention fork.
+
+This variant wraps the pre-built ``dropout_layer_norm`` extension from the
+``odyssey-fused-kernels`` wheel (tag ``odyssey-v2.8.3-fused-1``, built for
+sm_80/90/100/120) because upstream kernels-community/layer-norm only provides
+a cu129 build for torch 2.9.
+
+Unlike the kernels-community builds (which drop the ``residual`` argument),
+the wheel extension keeps flash-attn's original signature with ``residual``
+as the second argument. We expose the upstream community signature so both
+sources are interchangeable.
+"""
+
+import importlib.util
+from pathlib import Path
+
+# `min(..., default=None)` picks a deterministic match and lets a missing binary
+# surface as ImportError rather than as a bare StopIteration from `next()`.
+_so_path = min(Path(__file__).parent.glob("dropout_layer_norm*.so"), default=None)
+if _so_path is None:
+    raise ImportError(f"No dropout_layer_norm*.so found in {Path(__file__).parent}")
+
+# The extension's PyInit_* symbol is derived from the module name passed to
+# the loader, so the spec name must exactly equal the .so module name.
+_spec = importlib.util.spec_from_file_location("dropout_layer_norm", _so_path)
+if _spec is None or _spec.loader is None:
+    raise ImportError(f"Cannot create an import spec for {_so_path}")
+_ext = importlib.util.module_from_spec(_spec)
+_spec.loader.exec_module(_ext)
+
+
+def dropout_add_ln_fwd(
+    input, gamma, beta, rowscale, colscale, x0_subset, z_subset, dropout_p, epsilon,
+    rowscale_const, z_numrows, gen, residual_in_fp32, is_rms_norm,
+):
+    """Call the extension's forward op, passing `None` for its second (``residual``) argument.
+
+    Every other argument is forwarded positionally in the order received and
+    the extension's return value is returned as is.
+    """
+    return _ext.dropout_add_ln_fwd(
+        input, None, gamma, beta, rowscale, colscale, x0_subset, z_subset, dropout_p,
+        epsilon, rowscale_const, z_numrows, gen, residual_in_fp32, is_rms_norm,
+    )
+
+
+def dropout_add_ln_bwd(*args, **kwargs):
+    """Always raise `NotImplementedError`; backward is not exposed by this variant."""
+    # The wheel extension's bwd takes (dz, dx_, x, x0, dmask, ...), the community
+    # build takes (dz, dx, x, mu, ...). Positional pass-through would silently
+    # feed the wrong tensors, so refuse until backward is actually needed.
+    raise NotImplementedError("dropout_add_ln_bwd is not exposed by this build variant (inference only)")
+
+
+def dropout_add_ln_parallel_residual_fwd(
+    input, gamma0, beta0, gamma1, beta1, dropout_p, epsilon, gen, residual_in_fp32, is_rms_norm,
+):
+    """Call the extension's parallel-residual forward op with `None` as its second argument.
+
+    Every other argument is forwarded positionally in the order received and
+    the extension's return value is returned as is.
+    """
+    # NOTE(review): upstream flash-attn declares two optional tensors (`x1`, `residual`)
+    # before `gamma0`; confirm that the wheel's op really expects only one here.
+    return _ext.dropout_add_ln_parallel_residual_fwd(
+        input, None, gamma0, beta0, gamma1, beta1, dropout_p, epsilon, gen, residual_in_fp32, is_rms_norm
+    )
+
+
+def dropout_add_ln_parallel_residual_bwd(*args, **kwargs):
+    """Always raise `NotImplementedError`; backward is not exposed by this variant."""
+    raise NotImplementedError(
+        "dropout_add_ln_parallel_residual_bwd is not exposed by this build variant (inference only)"
+    )
+
+
+__all__ = [
+    "dropout_add_ln_fwd",
+    "dropout_add_ln_bwd",
+    "dropout_add_ln_parallel_residual_fwd",
+    "dropout_add_ln_parallel_residual_bwd",
+]
diff --git a/build/torch29-cxx11-cu128-aarch64-linux/dropout_layer_norm.cpython-312-aarch64-linux-gnu.so b/build/torch29-cxx11-cu128-aarch64-linux/dropout_layer_norm.cpython-312-aarch64-linux-gnu.so
new file mode 100644
index 0000000000000000000000000000000000000000..62edd7481c5862739d771218a271569a471c6cc3
--- /dev/null
+++ b/build/torch29-cxx11-cu128-aarch64-linux/dropout_layer_norm.cpython-312-aarch64-linux-gnu.so
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:cb431ebaceb7b12f0a5ad50eeca97d1aeaec16d13bbdedf807cae8a5dc18c2cb
+size 1258734424
diff --git a/build/torch29-cxx11-cu128-aarch64-linux/layer_norm/__init__.py b/build/torch29-cxx11-cu128-aarch64-linux/layer_norm/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..a9b2672c1cd85b74c1b3ded0fc0b2100e1aeac23
--- /dev/null
+++ b/build/torch29-cxx11-cu128-aarch64-linux/layer_norm/__init__.py
@@ -0,0 +1,26 @@
+import ctypes
+import importlib.util
+import sys
+from pathlib import Path
+from types import ModuleType
+
+
+def _import_from_path(file_path: Path) -> ModuleType:
+    """Execute the Python file at ``file_path`` as a module and return it.
+
+    The module is registered in ``sys.modules`` under a name derived from the
+    hash of the absolute path rather than under its natural name.
+
+    Args:
+        file_path: Path of the source file to load.
+
+    Returns:
+        The executed module object.
+
+    Raises:
+        ImportError: if no spec or no loader can be created for ``file_path``.
+        Exception: whatever the module's own top-level code raises; in that
+            case ``sys.modules`` is restored to its previous state.
+    """
+    # We cannot use the module name as-is, after adding it to `sys.modules`,
+    # it would also be used for other imports. So, we make a module name that
+    # depends on the path for it to be unique using the hex-encoded hash of
+    # the path.
+    path_hash = "{:x}".format(ctypes.c_size_t(hash(file_path.absolute())).value)
+    module_name = path_hash
+    spec = importlib.util.spec_from_file_location(module_name, file_path)
+    if spec is None or spec.loader is None:
+        raise ImportError(f"Cannot load spec for {module_name} from {file_path}")
+    # module_from_spec() either returns a module or raises, so no None check.
+    module = importlib.util.module_from_spec(spec)
+    absent = object()
+    previous = sys.modules.get(module_name, absent)
+    sys.modules[module_name] = module
+    try:
+        spec.loader.exec_module(module)
+    except BaseException:
+        # Like the regular import system, never leave a half-initialised
+        # module registered after its top-level code failed.
+        if previous is absent:
+            sys.modules.pop(module_name, None)
+        else:
+            sys.modules[module_name] = previous
+        raise
+    return module
+
+
+# Load the `__init__.py` one directory above this package and copy all of its
+# module attributes into this module's namespace.
+globals().update(vars(_import_from_path(Path(__file__).parent.parent / "__init__.py")))
diff --git a/build/torch29-cxx11-cu128-aarch64-linux/metadata.json b/build/torch29-cxx11-cu128-aarch64-linux/metadata.json
new file mode 100644
index 0000000000000000000000000000000000000000..fbd5000813293fa09c4e8f564d6565075e25f140
--- /dev/null
+++ b/build/torch29-cxx11-cu128-aarch64-linux/metadata.json
@@ -0,0 +1,14 @@
+{
+ "version": 1,
+ "license": "BSD-3-Clause",
+ "python-depends": [],
+ "backend": {
+ "type": "cuda",
+ "archs": [
+ "8.0",
+ "9.0",
+ "10.0",
+ "12.0"
+ ]
+ }
+}
diff --git a/build/torch29-cxx11-cu128-x86_64-linux/__init__.py b/build/torch29-cxx11-cu128-x86_64-linux/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..0be8169e6c0323882915c55d98a444fce3832008
--- /dev/null
+++ b/build/torch29-cxx11-cu128-x86_64-linux/__init__.py
@@ -0,0 +1,61 @@
+"""torch 2.9 / cu12.8 build variant grafted from the odysseyml flash-attention fork.
+
+This variant wraps the pre-built ``dropout_layer_norm`` extension from the
+``odyssey-fused-kernels`` wheel (tag ``odyssey-v2.8.3-fused-1``, built for
+sm_80/90/100/120) because upstream kernels-community/layer-norm only provides
+a cu129 build for torch 2.9.
+
+Unlike the kernels-community builds (which drop the ``residual`` argument),
+the wheel extension keeps flash-attn's original signature with ``residual``
+as the second argument. We expose the upstream community signature so both
+sources are interchangeable.
+"""
+
+import importlib.util
+from pathlib import Path
+
+# The extension's PyInit_* symbol is derived from the module name passed to
+# the loader, so the spec name must exactly equal the .so module name.
+# Matches are sorted so the choice is deterministic if several binaries exist.
+_so_path = next(iter(sorted(Path(__file__).parent.glob("dropout_layer_norm*.so"))), None)
+if _so_path is None:
+    # A bare StopIteration escaping an import is hard to diagnose and is not
+    # caught by callers that fall back on ImportError.
+    raise ImportError(f"No dropout_layer_norm*.so extension found in {Path(__file__).parent}")
+_spec = importlib.util.spec_from_file_location("dropout_layer_norm", _so_path)
+if _spec is None or _spec.loader is None:
+    raise ImportError(f"Cannot load spec for dropout_layer_norm from {_so_path}")
+_ext = importlib.util.module_from_spec(_spec)
+_spec.loader.exec_module(_ext)
+
+
+def dropout_add_ln_fwd(
+    input, gamma, beta, rowscale, colscale, x0_subset, z_subset, dropout_p, epsilon,
+    rowscale_const, z_numrows, gen, residual_in_fp32, is_rms_norm,
+):
+    """Community-style forward that calls the wheel extension without a residual.
+
+    Per the module docstring, the wheel extension keeps ``residual`` as its
+    second argument; it is always passed as ``None`` here. Every other
+    argument is forwarded unchanged and in order, and the extension's return
+    value is passed straight back.
+    """
+    residual = None
+    ext_args = (
+        input, residual, gamma, beta, rowscale, colscale, x0_subset, z_subset,
+        dropout_p, epsilon, rowscale_const, z_numrows, gen, residual_in_fp32,
+        is_rms_norm,
+    )
+    return _ext.dropout_add_ln_fwd(*ext_args)
+
+
+def dropout_add_ln_bwd(*args, **kwargs):
+    """Reject every call: this build variant does not expose a backward pass.
+
+    The wheel extension's bwd takes (dz, dx_, x, x0, dmask, ...) while the
+    community build takes (dz, dx, x, mu, ...), so forwarding the arguments
+    positionally would silently feed the wrong tensors. All arguments are
+    accepted and ignored.
+
+    Raises:
+        NotImplementedError: always.
+    """
+    reason = "dropout_add_ln_bwd is not exposed by this build variant (inference only)"
+    raise NotImplementedError(reason)
+
+
+def dropout_add_ln_parallel_residual_fwd(
+    input, gamma0, beta0, gamma1, beta1, dropout_p, epsilon, gen, residual_in_fp32, is_rms_norm,
+):
+    """Call the extension's parallel-residual forward with ``None`` after ``input``.
+
+    The ten arguments are forwarded unchanged and in order; a literal ``None``
+    is inserted as the second positional argument.
+
+    NOTE(review): upstream flash-attn's parallel-residual forward takes both
+    ``x1`` and ``residual`` after ``x0``. Confirm that a single ``None`` lines
+    up with this extension's signature.
+    """
+    ext_args = (
+        input, None, gamma0, beta0, gamma1, beta1,
+        dropout_p, epsilon, gen, residual_in_fp32, is_rms_norm,
+    )
+    return _ext.dropout_add_ln_parallel_residual_fwd(*ext_args)
+
+
+def dropout_add_ln_parallel_residual_bwd(*args, **kwargs):
+    """Reject every call: the parallel-residual backward is not exposed here.
+
+    All arguments are accepted and ignored.
+
+    Raises:
+        NotImplementedError: always.
+    """
+    reason = "dropout_add_ln_parallel_residual_bwd is not exposed by this build variant (inference only)"
+    raise NotImplementedError(reason)
+
+
+# Names exported by this build variant. The two *_bwd entries are exported, but
+# the functions defined above raise NotImplementedError when called.
+__all__ = [
+ "dropout_add_ln_fwd",
+ "dropout_add_ln_bwd",
+ "dropout_add_ln_parallel_residual_fwd",
+ "dropout_add_ln_parallel_residual_bwd",
+]
diff --git a/build/torch29-cxx11-cu128-x86_64-linux/__pycache__/__init__.cpython-312.pyc b/build/torch29-cxx11-cu128-x86_64-linux/__pycache__/__init__.cpython-312.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..6b47d7774e67f5a29428547bd5bd76fa1e64200c
Binary files /dev/null and b/build/torch29-cxx11-cu128-x86_64-linux/__pycache__/__init__.cpython-312.pyc differ
diff --git a/build/torch29-cxx11-cu128-x86_64-linux/dropout_layer_norm.cpython-312-x86_64-linux-gnu.so b/build/torch29-cxx11-cu128-x86_64-linux/dropout_layer_norm.cpython-312-x86_64-linux-gnu.so
new file mode 100644
index 0000000000000000000000000000000000000000..1804ddc7b148b1fc8d5e912811369d3c04631bdf
--- /dev/null
+++ b/build/torch29-cxx11-cu128-x86_64-linux/dropout_layer_norm.cpython-312-x86_64-linux-gnu.so
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:4ae8a09f42c4c44a01120b1e881f9966dada8eb4acdb5263297ff8b21305bbc5
+size 1259466744
diff --git a/build/torch29-cxx11-cu128-x86_64-linux/layer_norm/__init__.py b/build/torch29-cxx11-cu128-x86_64-linux/layer_norm/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..a9b2672c1cd85b74c1b3ded0fc0b2100e1aeac23
--- /dev/null
+++ b/build/torch29-cxx11-cu128-x86_64-linux/layer_norm/__init__.py
@@ -0,0 +1,26 @@
+import ctypes
+import importlib.util
+import sys
+from pathlib import Path
+from types import ModuleType
+
+
+def _import_from_path(file_path: Path) -> ModuleType:
+    """Execute the Python file at ``file_path`` as a module and return it.
+
+    The module is registered in ``sys.modules`` under a name derived from the
+    hash of the absolute path rather than under its natural name.
+
+    Args:
+        file_path: Path of the source file to load.
+
+    Returns:
+        The executed module object.
+
+    Raises:
+        ImportError: if no spec or no loader can be created for ``file_path``.
+        Exception: whatever the module's own top-level code raises; in that
+            case ``sys.modules`` is restored to its previous state.
+    """
+    # We cannot use the module name as-is, after adding it to `sys.modules`,
+    # it would also be used for other imports. So, we make a module name that
+    # depends on the path for it to be unique using the hex-encoded hash of
+    # the path.
+    path_hash = "{:x}".format(ctypes.c_size_t(hash(file_path.absolute())).value)
+    module_name = path_hash
+    spec = importlib.util.spec_from_file_location(module_name, file_path)
+    if spec is None or spec.loader is None:
+        raise ImportError(f"Cannot load spec for {module_name} from {file_path}")
+    # module_from_spec() either returns a module or raises, so no None check.
+    module = importlib.util.module_from_spec(spec)
+    absent = object()
+    previous = sys.modules.get(module_name, absent)
+    sys.modules[module_name] = module
+    try:
+        spec.loader.exec_module(module)
+    except BaseException:
+        # Like the regular import system, never leave a half-initialised
+        # module registered after its top-level code failed.
+        if previous is absent:
+            sys.modules.pop(module_name, None)
+        else:
+            sys.modules[module_name] = previous
+        raise
+    return module
+
+
+# Load the `__init__.py` one directory above this package and copy all of its
+# module attributes into this module's namespace.
+globals().update(vars(_import_from_path(Path(__file__).parent.parent / "__init__.py")))
diff --git a/build/torch29-cxx11-cu128-x86_64-linux/layer_norm/__pycache__/__init__.cpython-312.pyc b/build/torch29-cxx11-cu128-x86_64-linux/layer_norm/__pycache__/__init__.cpython-312.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..4d5826be93af360ba9a0aca31de0b439d9b2b524
Binary files /dev/null and b/build/torch29-cxx11-cu128-x86_64-linux/layer_norm/__pycache__/__init__.cpython-312.pyc differ
diff --git a/build/torch29-cxx11-cu128-x86_64-linux/metadata.json b/build/torch29-cxx11-cu128-x86_64-linux/metadata.json
new file mode 100644
index 0000000000000000000000000000000000000000..fbd5000813293fa09c4e8f564d6565075e25f140
--- /dev/null
+++ b/build/torch29-cxx11-cu128-x86_64-linux/metadata.json
@@ -0,0 +1,14 @@
+{
+ "version": 1,
+ "license": "BSD-3-Clause",
+ "python-depends": [],
+ "backend": {
+ "type": "cuda",
+ "archs": [
+ "8.0",
+ "9.0",
+ "10.0",
+ "12.0"
+ ]
+ }
+}
diff --git a/build/torch29-cxx11-cu129-aarch64-linux/__init__.py b/build/torch29-cxx11-cu129-aarch64-linux/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..946160f16b9dc91fefeea037fb7ac84fd6afd802
--- /dev/null
+++ b/build/torch29-cxx11-cu129-aarch64-linux/__init__.py
@@ -0,0 +1,26 @@
+import torch
+import torch.nn as nn
+
+from ._ops import ops
+
+from . import layers
+
+def dropout_add_ln_fwd(input, gamma, beta, rowscale, colscale, x0_subset, z_subset, dropout_p, epsilon, rowscale_const, z_numrows, gen, residual_in_fp32, is_rms_norm):
+    """Forward all arguments, in order, to the ``dropout_add_ln_fwd`` op and return its result."""
+    op_args = (
+        input, gamma, beta, rowscale, colscale, x0_subset, z_subset,
+        dropout_p, epsilon, rowscale_const, z_numrows, gen,
+        residual_in_fp32, is_rms_norm,
+    )
+    return ops.dropout_add_ln_fwd(*op_args)
+
+def dropout_add_ln_bwd(dz, dx, x, mu, rsigma, gamma, rowscale, colscale, x0_subset, z_subset, dropout_p, rowscale_const, x0_numrows, has_residual, is_rms_norm):
+    """Forward all arguments, in order, to the ``dropout_add_ln_bwd`` op and return its result."""
+    op_args = (
+        dz, dx, x, mu, rsigma, gamma, rowscale, colscale, x0_subset,
+        z_subset, dropout_p, rowscale_const, x0_numrows, has_residual,
+        is_rms_norm,
+    )
+    return ops.dropout_add_ln_bwd(*op_args)
+
+def dropout_add_ln_parallel_residual_fwd(input, gamma0, beta0, gamma1, beta1, dropout_p, epsilon, gen, residual_in_fp32, is_rms_norm):
+    """Forward all arguments, in order, to the ``dropout_add_ln_parallel_residual_fwd`` op."""
+    op_args = (
+        input, gamma0, beta0, gamma1, beta1,
+        dropout_p, epsilon, gen, residual_in_fp32, is_rms_norm,
+    )
+    return ops.dropout_add_ln_parallel_residual_fwd(*op_args)
+
+def dropout_add_ln_parallel_residual_bwd(dz0, dz1, dx, x, mu, rsigma, gamma0, gamma1, dropout_p, has_x1, has_residual, is_rms_norm):
+    """Forward all arguments, in order, to the ``dropout_add_ln_parallel_residual_bwd`` op."""
+    op_args = (
+        dz0, dz1, dx, x, mu, rsigma, gamma0, gamma1,
+        dropout_p, has_x1, has_residual, is_rms_norm,
+    )
+    return ops.dropout_add_ln_parallel_residual_bwd(*op_args)
+
+# Names exported by this package: the `layers` submodule imported above plus the
+# four op wrapper functions.
+__all__ = [
+ "layers",
+ "dropout_add_ln_fwd",
+ "dropout_add_ln_bwd",
+ "dropout_add_ln_parallel_residual_fwd",
+ "dropout_add_ln_parallel_residual_bwd",
+]
\ No newline at end of file
diff --git a/build/torch29-cxx11-cu129-aarch64-linux/_layer_norm_cuda_abf7d7d.abi3.so b/build/torch29-cxx11-cu129-aarch64-linux/_layer_norm_cuda_abf7d7d.abi3.so
new file mode 100644
index 0000000000000000000000000000000000000000..aa8c18ea490d34a92ccbbc7403a3290db78d3d19
--- /dev/null
+++ b/build/torch29-cxx11-cu129-aarch64-linux/_layer_norm_cuda_abf7d7d.abi3.so
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:b8c9f84d3152ac716ccefc08e081e3c387156bc00b65eb003b1b6e7385bdc15d
+size 1282721000
diff --git a/build/torch29-cxx11-cu129-aarch64-linux/_ops.py b/build/torch29-cxx11-cu129-aarch64-linux/_ops.py
new file mode 100644
index 0000000000000000000000000000000000000000..f5c1175ef45a68190dd1ce19f12e70b3d32dd9e0
--- /dev/null
+++ b/build/torch29-cxx11-cu129-aarch64-linux/_ops.py
@@ -0,0 +1,9 @@
+import torch
+# NOTE(review): presumably importing the compiled extension is what registers its
+# operators with torch -- confirm. `ops` is the torch.ops namespace of the same name.
+from . import _layer_norm_cuda_abf7d7d
+ops = torch.ops._layer_norm_cuda_abf7d7d
+
+def add_op_namespace_prefix(op_name: str):
+    """
+    Return ``op_name`` qualified with this build's op namespace.
+
+    The result has the form ``_layer_norm_cuda_abf7d7d::<op_name>``.
+    """
+    qualified = "_layer_norm_cuda_abf7d7d::{}".format(op_name)
+    return qualified
diff --git a/build/torch29-cxx11-cu129-aarch64-linux/layer_norm/__init__.py b/build/torch29-cxx11-cu129-aarch64-linux/layer_norm/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..a9b2672c1cd85b74c1b3ded0fc0b2100e1aeac23
--- /dev/null
+++ b/build/torch29-cxx11-cu129-aarch64-linux/layer_norm/__init__.py
@@ -0,0 +1,26 @@
+import ctypes
+import importlib.util
+import sys
+from pathlib import Path
+from types import ModuleType
+
+
+def _import_from_path(file_path: Path) -> ModuleType:
+    """Execute the Python file at ``file_path`` as a module and return it.
+
+    The module is registered in ``sys.modules`` under a name derived from the
+    hash of the absolute path rather than under its natural name.
+
+    Args:
+        file_path: Path of the source file to load.
+
+    Returns:
+        The executed module object.
+
+    Raises:
+        ImportError: if no spec or no loader can be created for ``file_path``.
+        Exception: whatever the module's own top-level code raises; in that
+            case ``sys.modules`` is restored to its previous state.
+    """
+    # We cannot use the module name as-is, after adding it to `sys.modules`,
+    # it would also be used for other imports. So, we make a module name that
+    # depends on the path for it to be unique using the hex-encoded hash of
+    # the path.
+    path_hash = "{:x}".format(ctypes.c_size_t(hash(file_path.absolute())).value)
+    module_name = path_hash
+    spec = importlib.util.spec_from_file_location(module_name, file_path)
+    if spec is None or spec.loader is None:
+        raise ImportError(f"Cannot load spec for {module_name} from {file_path}")
+    # module_from_spec() either returns a module or raises, so no None check.
+    module = importlib.util.module_from_spec(spec)
+    absent = object()
+    previous = sys.modules.get(module_name, absent)
+    sys.modules[module_name] = module
+    try:
+        spec.loader.exec_module(module)
+    except BaseException:
+        # Like the regular import system, never leave a half-initialised
+        # module registered after its top-level code failed.
+        if previous is absent:
+            sys.modules.pop(module_name, None)
+        else:
+            sys.modules[module_name] = previous
+        raise
+    return module
+
+
+# Load the `__init__.py` one directory above this package and copy all of its
+# module attributes into this module's namespace.
+globals().update(vars(_import_from_path(Path(__file__).parent.parent / "__init__.py")))
diff --git a/build/torch29-cxx11-cu129-aarch64-linux/layers.py b/build/torch29-cxx11-cu129-aarch64-linux/layers.py
new file mode 100644
index 0000000000000000000000000000000000000000..7ed883f42ead452f8b60f498ec11302c53d3cf74
--- /dev/null
+++ b/build/torch29-cxx11-cu129-aarch64-linux/layers.py
@@ -0,0 +1,51 @@
+import torch
+import torch.nn as nn
+
+from ._ops import ops
+
+
+class LayerNorm(nn.Module):
+    """Module whose forward runs the ``dropout_add_ln_fwd`` op with ``is_rms_norm=False``.
+
+    Only ``forward`` is defined here; ``weight`` and ``variance_epsilon`` are
+    declared as annotations and read from the instance at call time.
+    """
+
+    weight: torch.Tensor
+    variance_epsilon: float
+
+    def forward(self, hidden_states: torch.Tensor) -> torch.Tensor:
+        """Apply the op to ``hidden_states`` and return a tensor of the same shape."""
+        original_shape = hidden_states.shape
+        # Collapse all leading dimensions so the op receives a 2-D view.
+        flattened = hidden_states.view(-1, original_shape[-1])
+        results = ops.dropout_add_ln_fwd(
+            flattened,
+            gamma=self.weight,
+            beta=None,
+            rowscale=None,
+            colscale=None,
+            x0_subset=None,
+            z_subset=None,
+            dropout_p=0,
+            epsilon=self.variance_epsilon,
+            rowscale_const=1.0,
+            # NOTE(review): read from dim 1 of the un-flattened input; presumably
+            # only consulted when z_subset is given -- confirm against the op.
+            z_numrows=original_shape[1],
+            gen=None,
+            residual_in_fp32=False,
+            is_rms_norm=False,
+        )
+        # Only the first returned value is used.
+        normalized = results[0]
+        return normalized.view(original_shape)
+
+class LlamaRMSNorm(nn.Module):
+    """Module whose forward runs the ``dropout_add_ln_fwd`` op with ``is_rms_norm=True``.
+
+    Only ``forward`` is defined here; ``weight`` and ``variance_epsilon`` are
+    declared as annotations and read from the instance at call time.
+    """
+
+    weight: torch.Tensor
+    variance_epsilon: float
+
+    def forward(self, hidden_states: torch.Tensor) -> torch.Tensor:
+        """Apply the op to ``hidden_states`` and return a tensor of the same shape."""
+        original_shape = hidden_states.shape
+        # Collapse all leading dimensions so the op receives a 2-D view.
+        flattened = hidden_states.view(-1, original_shape[-1])
+        results = ops.dropout_add_ln_fwd(
+            flattened,
+            gamma=self.weight,
+            beta=None,
+            rowscale=None,
+            colscale=None,
+            x0_subset=None,
+            z_subset=None,
+            dropout_p=0,
+            epsilon=self.variance_epsilon,
+            rowscale_const=1.0,
+            # NOTE(review): read from dim 1 of the un-flattened input; presumably
+            # only consulted when z_subset is given -- confirm against the op.
+            z_numrows=original_shape[1],
+            gen=None,
+            residual_in_fp32=False,
+            is_rms_norm=True,
+        )
+        # Only the first returned value is used.
+        normalized = results[0]
+        return normalized.view(original_shape)
\ No newline at end of file
diff --git a/build/torch29-cxx11-cu129-aarch64-linux/metadata.json b/build/torch29-cxx11-cu129-aarch64-linux/metadata.json
new file mode 100644
index 0000000000000000000000000000000000000000..5bf2743043146a22147fad17302989accb0b505c
--- /dev/null
+++ b/build/torch29-cxx11-cu129-aarch64-linux/metadata.json
@@ -0,0 +1,15 @@
+{
+ "version": 1,
+ "license": "BSD-3-Clause",
+ "python-depends": [],
+ "backend": {
+ "type": "cuda",
+ "archs": [
+ "10.0",
+ "12.0",
+ "8.0",
+ "8.9",
+ "9.0"
+ ]
+ }
+}
diff --git a/build/torch29-cxx11-cu129-x86_64-linux/__init__.py b/build/torch29-cxx11-cu129-x86_64-linux/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..946160f16b9dc91fefeea037fb7ac84fd6afd802
--- /dev/null
+++ b/build/torch29-cxx11-cu129-x86_64-linux/__init__.py
@@ -0,0 +1,26 @@
+import torch
+import torch.nn as nn
+
+from ._ops import ops
+
+from . import layers
+
+def dropout_add_ln_fwd(input, gamma, beta, rowscale, colscale, x0_subset, z_subset, dropout_p, epsilon, rowscale_const, z_numrows, gen, residual_in_fp32, is_rms_norm):
+    """Forward all arguments, in order, to the ``dropout_add_ln_fwd`` op and return its result."""
+    op_args = (
+        input, gamma, beta, rowscale, colscale, x0_subset, z_subset,
+        dropout_p, epsilon, rowscale_const, z_numrows, gen,
+        residual_in_fp32, is_rms_norm,
+    )
+    return ops.dropout_add_ln_fwd(*op_args)
+
+def dropout_add_ln_bwd(dz, dx, x, mu, rsigma, gamma, rowscale, colscale, x0_subset, z_subset, dropout_p, rowscale_const, x0_numrows, has_residual, is_rms_norm):
+    """Forward all arguments, in order, to the ``dropout_add_ln_bwd`` op and return its result."""
+    op_args = (
+        dz, dx, x, mu, rsigma, gamma, rowscale, colscale, x0_subset,
+        z_subset, dropout_p, rowscale_const, x0_numrows, has_residual,
+        is_rms_norm,
+    )
+    return ops.dropout_add_ln_bwd(*op_args)
+
+def dropout_add_ln_parallel_residual_fwd(input, gamma0, beta0, gamma1, beta1, dropout_p, epsilon, gen, residual_in_fp32, is_rms_norm):
+    """Forward all arguments, in order, to the ``dropout_add_ln_parallel_residual_fwd`` op."""
+    op_args = (
+        input, gamma0, beta0, gamma1, beta1,
+        dropout_p, epsilon, gen, residual_in_fp32, is_rms_norm,
+    )
+    return ops.dropout_add_ln_parallel_residual_fwd(*op_args)
+
+def dropout_add_ln_parallel_residual_bwd(dz0, dz1, dx, x, mu, rsigma, gamma0, gamma1, dropout_p, has_x1, has_residual, is_rms_norm):
+    """Forward all arguments, in order, to the ``dropout_add_ln_parallel_residual_bwd`` op."""
+    op_args = (
+        dz0, dz1, dx, x, mu, rsigma, gamma0, gamma1,
+        dropout_p, has_x1, has_residual, is_rms_norm,
+    )
+    return ops.dropout_add_ln_parallel_residual_bwd(*op_args)
+
+# Names exported by this package: the `layers` submodule imported above plus the
+# four op wrapper functions.
+__all__ = [
+ "layers",
+ "dropout_add_ln_fwd",
+ "dropout_add_ln_bwd",
+ "dropout_add_ln_parallel_residual_fwd",
+ "dropout_add_ln_parallel_residual_bwd",
+]
\ No newline at end of file
diff --git a/build/torch29-cxx11-cu129-x86_64-linux/_layer_norm_cuda_abf7d7d.abi3.so b/build/torch29-cxx11-cu129-x86_64-linux/_layer_norm_cuda_abf7d7d.abi3.so
new file mode 100644
index 0000000000000000000000000000000000000000..fe60c2e5e4100af95429279dccb73835beda4062
--- /dev/null
+++ b/build/torch29-cxx11-cu129-x86_64-linux/_layer_norm_cuda_abf7d7d.abi3.so
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:91f86328f0302dd6ec2d76a1ba8a70bc9c4daa3d9d3739d650fd124b11fdf49a
+size 1283022120
diff --git a/build/torch29-cxx11-cu129-x86_64-linux/_ops.py b/build/torch29-cxx11-cu129-x86_64-linux/_ops.py
new file mode 100644
index 0000000000000000000000000000000000000000..f5c1175ef45a68190dd1ce19f12e70b3d32dd9e0
--- /dev/null
+++ b/build/torch29-cxx11-cu129-x86_64-linux/_ops.py
@@ -0,0 +1,9 @@
+import torch
+# NOTE(review): presumably importing the compiled extension is what registers its
+# operators with torch -- confirm. `ops` is the torch.ops namespace of the same name.
+from . import _layer_norm_cuda_abf7d7d
+ops = torch.ops._layer_norm_cuda_abf7d7d
+
+def add_op_namespace_prefix(op_name: str):
+    """
+    Return ``op_name`` qualified with this build's op namespace.
+
+    The result has the form ``_layer_norm_cuda_abf7d7d::<op_name>``.
+    """
+    qualified = "_layer_norm_cuda_abf7d7d::{}".format(op_name)
+    return qualified
diff --git a/build/torch29-cxx11-cu129-x86_64-linux/layer_norm/__init__.py b/build/torch29-cxx11-cu129-x86_64-linux/layer_norm/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..a9b2672c1cd85b74c1b3ded0fc0b2100e1aeac23
--- /dev/null
+++ b/build/torch29-cxx11-cu129-x86_64-linux/layer_norm/__init__.py
@@ -0,0 +1,26 @@
+import ctypes
+import importlib.util
+import sys
+from pathlib import Path
+from types import ModuleType
+
+
+def _import_from_path(file_path: Path) -> ModuleType:
+    """Execute the Python file at ``file_path`` as a module and return it.
+
+    The module is registered in ``sys.modules`` under a name derived from the
+    hash of the absolute path rather than under its natural name.
+
+    Args:
+        file_path: Path of the source file to load.
+
+    Returns:
+        The executed module object.
+
+    Raises:
+        ImportError: if no spec or no loader can be created for ``file_path``.
+        Exception: whatever the module's own top-level code raises; in that
+            case ``sys.modules`` is restored to its previous state.
+    """
+    # We cannot use the module name as-is, after adding it to `sys.modules`,
+    # it would also be used for other imports. So, we make a module name that
+    # depends on the path for it to be unique using the hex-encoded hash of
+    # the path.
+    path_hash = "{:x}".format(ctypes.c_size_t(hash(file_path.absolute())).value)
+    module_name = path_hash
+    spec = importlib.util.spec_from_file_location(module_name, file_path)
+    if spec is None or spec.loader is None:
+        raise ImportError(f"Cannot load spec for {module_name} from {file_path}")
+    # module_from_spec() either returns a module or raises, so no None check.
+    module = importlib.util.module_from_spec(spec)
+    absent = object()
+    previous = sys.modules.get(module_name, absent)
+    sys.modules[module_name] = module
+    try:
+        spec.loader.exec_module(module)
+    except BaseException:
+        # Like the regular import system, never leave a half-initialised
+        # module registered after its top-level code failed.
+        if previous is absent:
+            sys.modules.pop(module_name, None)
+        else:
+            sys.modules[module_name] = previous
+        raise
+    return module
+
+
+# Load the `__init__.py` one directory above this package and copy all of its
+# module attributes into this module's namespace.
+globals().update(vars(_import_from_path(Path(__file__).parent.parent / "__init__.py")))
diff --git a/build/torch29-cxx11-cu129-x86_64-linux/layers.py b/build/torch29-cxx11-cu129-x86_64-linux/layers.py
new file mode 100644
index 0000000000000000000000000000000000000000..7ed883f42ead452f8b60f498ec11302c53d3cf74
--- /dev/null
+++ b/build/torch29-cxx11-cu129-x86_64-linux/layers.py
@@ -0,0 +1,51 @@
+import torch
+import torch.nn as nn
+
+from ._ops import ops
+
+
+class LayerNorm(nn.Module):
+    """Module whose forward runs the ``dropout_add_ln_fwd`` op with ``is_rms_norm=False``.
+
+    Only ``forward`` is defined here; ``weight`` and ``variance_epsilon`` are
+    declared as annotations and read from the instance at call time.
+    """
+
+    weight: torch.Tensor
+    variance_epsilon: float
+
+    def forward(self, hidden_states: torch.Tensor) -> torch.Tensor:
+        """Apply the op to ``hidden_states`` and return a tensor of the same shape."""
+        original_shape = hidden_states.shape
+        # Collapse all leading dimensions so the op receives a 2-D view.
+        flattened = hidden_states.view(-1, original_shape[-1])
+        results = ops.dropout_add_ln_fwd(
+            flattened,
+            gamma=self.weight,
+            beta=None,
+            rowscale=None,
+            colscale=None,
+            x0_subset=None,
+            z_subset=None,
+            dropout_p=0,
+            epsilon=self.variance_epsilon,
+            rowscale_const=1.0,
+            # NOTE(review): read from dim 1 of the un-flattened input; presumably
+            # only consulted when z_subset is given -- confirm against the op.
+            z_numrows=original_shape[1],
+            gen=None,
+            residual_in_fp32=False,
+            is_rms_norm=False,
+        )
+        # Only the first returned value is used.
+        normalized = results[0]
+        return normalized.view(original_shape)
+
+class LlamaRMSNorm(nn.Module):
+    """Module whose forward runs the ``dropout_add_ln_fwd`` op with ``is_rms_norm=True``.
+
+    Only ``forward`` is defined here; ``weight`` and ``variance_epsilon`` are
+    declared as annotations and read from the instance at call time.
+    """
+
+    weight: torch.Tensor
+    variance_epsilon: float
+
+    def forward(self, hidden_states: torch.Tensor) -> torch.Tensor:
+        """Apply the op to ``hidden_states`` and return a tensor of the same shape."""
+        original_shape = hidden_states.shape
+        # Collapse all leading dimensions so the op receives a 2-D view.
+        flattened = hidden_states.view(-1, original_shape[-1])
+        results = ops.dropout_add_ln_fwd(
+            flattened,
+            gamma=self.weight,
+            beta=None,
+            rowscale=None,
+            colscale=None,
+            x0_subset=None,
+            z_subset=None,
+            dropout_p=0,
+            epsilon=self.variance_epsilon,
+            rowscale_const=1.0,
+            # NOTE(review): read from dim 1 of the un-flattened input; presumably
+            # only consulted when z_subset is given -- confirm against the op.
+            z_numrows=original_shape[1],
+            gen=None,
+            residual_in_fp32=False,
+            is_rms_norm=True,
+        )
+        # Only the first returned value is used.
+        normalized = results[0]
+        return normalized.view(original_shape)
\ No newline at end of file
diff --git a/build/torch29-cxx11-cu129-x86_64-linux/metadata.json b/build/torch29-cxx11-cu129-x86_64-linux/metadata.json
new file mode 100644
index 0000000000000000000000000000000000000000..5bf2743043146a22147fad17302989accb0b505c
--- /dev/null
+++ b/build/torch29-cxx11-cu129-x86_64-linux/metadata.json
@@ -0,0 +1,15 @@
+{
+ "version": 1,
+ "license": "BSD-3-Clause",
+ "python-depends": [],
+ "backend": {
+ "type": "cuda",
+ "archs": [
+ "10.0",
+ "12.0",
+ "8.0",
+ "8.9",
+ "9.0"
+ ]
+ }
+}
diff --git a/media/benches_dark_animation.svg b/media/benches_dark_animation.svg
new file mode 100644
index 0000000000000000000000000000000000000000..113c2ffd377659b878db4a0f974c5da04841a1de
--- /dev/null
+++ b/media/benches_dark_animation.svg
@@ -0,0 +1,69 @@
+
\ No newline at end of file
diff --git a/media/benches_dark_latency.svg b/media/benches_dark_latency.svg
new file mode 100644
index 0000000000000000000000000000000000000000..1d075322c4e958c3d5ab46a8d7cfba046353d7a9
--- /dev/null
+++ b/media/benches_dark_latency.svg
@@ -0,0 +1,2348 @@
+
+
+
diff --git a/media/benches_dark_throughput.svg b/media/benches_dark_throughput.svg
new file mode 100644
index 0000000000000000000000000000000000000000..fd6a66b308f1cbf7ec9d08c102bcd470f7ff5eb9
--- /dev/null
+++ b/media/benches_dark_throughput.svg
@@ -0,0 +1,2566 @@
+
+
+
diff --git a/media/benches_light_animation.svg b/media/benches_light_animation.svg
new file mode 100644
index 0000000000000000000000000000000000000000..63fbe2789d21e08aabff9dbeea841cf8a67f92c6
--- /dev/null
+++ b/media/benches_light_animation.svg
@@ -0,0 +1,69 @@
+
\ No newline at end of file
diff --git a/media/benches_light_latency.svg b/media/benches_light_latency.svg
new file mode 100644
index 0000000000000000000000000000000000000000..672c8b469c2e39608144a5da2444b085110b56c0
--- /dev/null
+++ b/media/benches_light_latency.svg
@@ -0,0 +1,2348 @@
+
+
+
diff --git a/media/benches_light_throughput.svg b/media/benches_light_throughput.svg
new file mode 100644
index 0000000000000000000000000000000000000000..c65b9a3da3371922803f3b8ba3a720dcaa34251c
--- /dev/null
+++ b/media/benches_light_throughput.svg
@@ -0,0 +1,2566 @@
+
+
+