diff --git a/build.toml b/build.toml
index abc9bb2e4d9b1c82a576b7e9802504aa67e84d4d..ce1953079d04f1265c38cdbc6080bbecc7705d1c 100644
--- a/build.toml
+++ b/build.toml
@@ -10,7 +10,7 @@ src = [
 
 [kernel.activation]
 backend = "rocm"
-rocm-archs = [ "gfx90a" ]
+rocm-archs = [ "gfx90a", "gfx942" ]
 src = [
   "activation/poly_norm.cu",
   "activation/rms_norm.cu",
@@ -21,3 +21,17 @@ src = [
   "activation/atomic_utils.h",
 ]
 depends = [ "torch" ]
+
+[kernel.activation_cuda]
+backend = "cuda"
+src = [
+  "activation/poly_norm.cu",
+  "activation/rms_norm.cu",
+  "activation/cuda_compat.h",
+  "activation/block_reduce.h",
+  "activation/dispatch_utils.h",
+  "activation/assert_utils.h",
+  "activation/atomic_utils.h",
+]
+depends = ["torch"]
+
diff --git a/build/torch26-cxx11-cu118-x86_64-linux/activation/__init__.py b/build/torch26-cxx11-cu118-x86_64-linux/activation/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..fadce68f2fa0f130463f00a59c3436b822835e24
--- /dev/null
+++ b/build/torch26-cxx11-cu118-x86_64-linux/activation/__init__.py
@@ -0,0 +1,41 @@
+import torch
+
+from . import layers
+from ._ops import ops
+from .poly_norm import PolyNormFunction
+from .rms_norm import RMSNormFunction
+
+
+def poly_norm(
+    x: torch.Tensor,
+    weight: torch.Tensor,
+    bias: torch.Tensor,
+    eps: float = 1e-6,
+) -> torch.Tensor:
+    """Apply ``PolyNormFunction`` to ``x`` and return its result.
+
+    The arguments are forwarded positionally as ``(x, weight, bias, eps)``.
+    """
+    return PolyNormFunction.apply(x, weight, bias, eps)
+
+
+def rms_norm(
+    x: torch.Tensor,
+    weight: torch.Tensor,
+    eps: float = 1e-6,
+) -> torch.Tensor:
+    """Apply ``RMSNormFunction`` to ``x`` and return its result.
+
+    The arguments are forwarded positionally as ``(x, weight, eps)``.
+    """
+    return RMSNormFunction.apply(x, weight, eps)
+
+
+# ``rms_norm`` is listed next to ``poly_norm`` so star-imports expose both
+# functional entry points.
+__all__ = [
+    "poly_norm",
+    "rms_norm",
+    "layers",
+    "ops",
+]
diff --git a/build/torch26-cxx11-cu118-x86_64-linux/activation/_activation_605f22e_dirty.abi3.so b/build/torch26-cxx11-cu118-x86_64-linux/activation/_activation_605f22e_dirty.abi3.so
new file mode 100644
index 0000000000000000000000000000000000000000..67eeec1217118899a04322a83fafa12857ebcff4
--- /dev/null
+++ b/build/torch26-cxx11-cu118-x86_64-linux/activation/_activation_605f22e_dirty.abi3.so
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:b2fdb7378a1c907c3ff3ad0a5134a0a8ce4a464196404436470d7b4eb77ec305
+size 2957296
diff --git a/build/torch26-cxx11-cu118-x86_64-linux/activation/_ops.py b/build/torch26-cxx11-cu118-x86_64-linux/activation/_ops.py
new file mode 100644
index 0000000000000000000000000000000000000000..4f65fa5de7bb717f07527d6846085f48d18b7e1d
--- /dev/null
+++ b/build/torch26-cxx11-cu118-x86_64-linux/activation/_ops.py
@@ -0,0 +1,10 @@
+import torch
+from . import _activation_605f22e_dirty
+ops = torch.ops._activation_605f22e_dirty
+
+def add_op_namespace_prefix(op_name: str):
+    """
+    Return ``op_name`` qualified as ``_activation_605f22e_dirty::<op_name>``.
+    """
+    namespace = "_activation_605f22e_dirty"
+    return f"{namespace}::{op_name}"
\ No newline at end of file
diff --git a/build/torch26-cxx11-cu118-x86_64-linux/activation/layers.py b/build/torch26-cxx11-cu118-x86_64-linux/activation/layers.py
new file mode 100644
index 0000000000000000000000000000000000000000..3824e5de50583f385215cf90adc03aff91653e2e
--- /dev/null
+++ b/build/torch26-cxx11-cu118-x86_64-linux/activation/layers.py
@@ -0,0 +1,61 @@
+import torch
+import torch.nn as nn
+from torch.nn import init
+
+from .poly_norm import PolyNormFunction
+from .rms_norm import RMSNormFunction
+
+
+class PolyNorm(nn.Module):
+    """Module form of ``PolyNormFunction`` with learnable weight and bias.
+
+    Holds a 3-element ``weight`` (initialised to 1/3 each) and a 1-element
+    ``bias`` (initialised to 0), both created with the requested ``dtype``.
+    """
+
+    def __init__(self, eps=1e-6, dtype: torch.dtype = torch.float32):
+        super().__init__()
+        self.weight = torch.nn.Parameter(torch.ones(3, dtype=dtype) / 3)
+        self.bias = torch.nn.Parameter(torch.zeros(1, dtype=dtype))
+        self.eps = eps
+
+    def forward(
+        self,
+        x: torch.Tensor,
+    ):
+        """Run ``PolyNormFunction`` on ``x`` with this module's parameters."""
+        return PolyNormFunction.apply(x, self.weight, self.bias, self.eps)
+
+    def reset_parameters(self) -> None:
+        """
+        Resets parameters based on their initialization used in __init__.
+        """
+        # __init__ starts every weight at 1/3, so restore that value rather
+        # than filling with ones.
+        init.constant_(self.weight, 1 / 3)
+        init.zeros_(self.bias)
+
+
+class RMSNorm(nn.Module):
+    """Module form of ``RMSNormFunction`` with a learnable ``weight``.
+
+    ``weight`` has ``dim`` elements, all initialised to 1.
+    """
+
+    def __init__(self, dim: int, eps=1e-6, dtype: torch.dtype = torch.float32):
+        super().__init__()
+        self.weight = torch.nn.Parameter(torch.ones(dim, dtype=dtype))
+        self.eps = eps
+
+    def forward(
+        self,
+        x: torch.Tensor,
+    ):
+        """Run ``RMSNormFunction`` on ``x`` with this module's parameters."""
+        return RMSNormFunction.apply(x, self.weight, self.eps)
+
+    def reset_parameters(self) -> None:
+        """
+        Resets parameters based on their initialization used in __init__.
+        """
+        init.ones_(self.weight)
diff --git a/build/torch26-cxx11-cu118-x86_64-linux/activation/poly_norm.py b/build/torch26-cxx11-cu118-x86_64-linux/activation/poly_norm.py
new file mode 100644
index 0000000000000000000000000000000000000000..ce14e5cd8078de06d964775e34e1e668df32493e
--- /dev/null
+++ b/build/torch26-cxx11-cu118-x86_64-linux/activation/poly_norm.py
@@ -0,0 +1,52 @@
+import torch
+
+from ._ops import ops
+
+
+class PolyNormFunction(torch.autograd.Function):
+    """Autograd Function backed by the ``poly_norm`` extension ops.
+
+    ``forward``, ``setup_context`` and ``backward`` are all static methods.
+    """
+
+    @staticmethod
+    def forward(input, weight, bias, eps):
+        """Allocate a tensor like ``input`` and have ``ops.poly_norm`` fill it."""
+        result = torch.empty_like(input)
+        ops.poly_norm(result, input, weight, bias, eps)
+        return result
+
+    @staticmethod
+    def setup_context(ctx, inputs, output):
+        """Record what ``backward`` reads later.
+
+        ``inputs`` is the tuple of arguments passed to ``forward``; ``output``
+        is the value ``forward`` returned and is not used here.
+        """
+        x, w, _bias, eps = inputs
+        ctx.eps = eps
+        ctx.save_for_backward(x, w)
+
+    @staticmethod
+    def backward(ctx, output_grad):
+        """Compute gradients for ``input``, ``weight`` and ``bias``.
+
+        ``forward`` has a single output, so a single incoming gradient is
+        received. A gradient buffer is allocated only for the inputs flagged
+        in ``ctx.needs_input_grad``; the others are passed to the op as
+        ``None``. ``eps`` never receives a gradient.
+        """
+        x, w = ctx.saved_tensors
+        wants_x, wants_w, wants_b = ctx.needs_input_grad[:3]
+
+        grad_x = grad_w = grad_b = None
+        if wants_x:
+            grad_x = torch.empty_like(x)
+        if wants_w:
+            grad_w = torch.empty_like(w)
+        if wants_b:
+            grad_b = torch.empty(1, dtype=w.dtype, device=w.device)
+
+        ops.poly_norm_backward(grad_x, grad_w, grad_b, output_grad, x, w, ctx.eps)
+
+        return grad_x, grad_w, grad_b, None
diff --git a/build/torch26-cxx11-cu118-x86_64-linux/activation/rms_norm.py b/build/torch26-cxx11-cu118-x86_64-linux/activation/rms_norm.py
new file mode 100644
index 0000000000000000000000000000000000000000..53df35e855d5d1591bc6fceff50ba81afdb2c873
--- /dev/null
+++ b/build/torch26-cxx11-cu118-x86_64-linux/activation/rms_norm.py
@@ -0,0 +1,50 @@
+import torch
+
+from ._ops import ops
+
+
+class RMSNormFunction(torch.autograd.Function):
+    """Autograd Function backed by the ``rms_norm`` extension ops.
+
+    ``forward``, ``setup_context`` and ``backward`` are all static methods.
+    """
+
+    @staticmethod
+    def forward(input, weight, eps):
+        """Allocate a tensor like ``input`` and have ``ops.rms_norm`` fill it."""
+        result = torch.empty_like(input)
+        ops.rms_norm(result, input, weight, eps)
+        return result
+
+    @staticmethod
+    def setup_context(ctx, inputs, output):
+        """Record what ``backward`` reads later.
+
+        ``inputs`` is the tuple of arguments passed to ``forward``; ``output``
+        is the value ``forward`` returned and is not used here.
+        """
+        x, w, eps = inputs
+        ctx.eps = eps
+        ctx.save_for_backward(x, w)
+
+    @staticmethod
+    def backward(ctx, output_grad):
+        """Compute gradients for ``input`` and ``weight``.
+
+        ``forward`` has a single output, so a single incoming gradient is
+        received. A gradient buffer is allocated only for the inputs flagged
+        in ``ctx.needs_input_grad``; the others are passed to the op as
+        ``None``. ``eps`` never receives a gradient.
+        """
+        x, w = ctx.saved_tensors
+        wants_x, wants_w = ctx.needs_input_grad[:2]
+
+        grad_x = grad_w = None
+        if wants_x:
+            grad_x = torch.empty_like(x)
+        if wants_w:
+            grad_w = torch.empty_like(w)
+
+        ops.rms_norm_backward(grad_x, grad_w, output_grad, x, w, ctx.eps)
+
+        return grad_x, grad_w, None
diff --git a/build/torch26-cxx11-cu124-x86_64-linux/activation/__init__.py b/build/torch26-cxx11-cu124-x86_64-linux/activation/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..fadce68f2fa0f130463f00a59c3436b822835e24
--- /dev/null
+++ b/build/torch26-cxx11-cu124-x86_64-linux/activation/__init__.py
@@ -0,0 +1,41 @@
+import torch
+
+from . import layers
+from ._ops import ops
+from .poly_norm import PolyNormFunction
+from .rms_norm import RMSNormFunction
+
+
+def poly_norm(
+    x: torch.Tensor,
+    weight: torch.Tensor,
+    bias: torch.Tensor,
+    eps: float = 1e-6,
+) -> torch.Tensor:
+    """Apply ``PolyNormFunction`` to ``x`` and return its result.
+
+    The arguments are forwarded positionally as ``(x, weight, bias, eps)``.
+    """
+    return PolyNormFunction.apply(x, weight, bias, eps)
+
+
+def rms_norm(
+    x: torch.Tensor,
+    weight: torch.Tensor,
+    eps: float = 1e-6,
+) -> torch.Tensor:
+    """Apply ``RMSNormFunction`` to ``x`` and return its result.
+
+    The arguments are forwarded positionally as ``(x, weight, eps)``.
+    """
+    return RMSNormFunction.apply(x, weight, eps)
+
+
+# ``rms_norm`` is listed next to ``poly_norm`` so star-imports expose both
+# functional entry points.
+__all__ = [
+    "poly_norm",
+    "rms_norm",
+    "layers",
+    "ops",
+]
diff --git a/build/torch26-cxx11-cu124-x86_64-linux/activation/_activation_605f22e_dirty.abi3.so b/build/torch26-cxx11-cu124-x86_64-linux/activation/_activation_605f22e_dirty.abi3.so
new file mode 100644
index 0000000000000000000000000000000000000000..c7b50b41f57eeef90b1f0a959e07e295ac6d1308
--- /dev/null
+++ b/build/torch26-cxx11-cu124-x86_64-linux/activation/_activation_605f22e_dirty.abi3.so
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:5baac6228e04fbb209cbc90a24702c14f4eb52d2698cea12a766d77412622096
+size 2981880
diff --git a/build/torch26-cxx11-cu124-x86_64-linux/activation/_ops.py b/build/torch26-cxx11-cu124-x86_64-linux/activation/_ops.py
new file mode 100644
index 0000000000000000000000000000000000000000..4f65fa5de7bb717f07527d6846085f48d18b7e1d
--- /dev/null
+++ b/build/torch26-cxx11-cu124-x86_64-linux/activation/_ops.py
@@ -0,0 +1,10 @@
+import torch
+from . import _activation_605f22e_dirty
+ops = torch.ops._activation_605f22e_dirty
+
+def add_op_namespace_prefix(op_name: str):
+    """
+    Return ``op_name`` qualified as ``_activation_605f22e_dirty::<op_name>``.
+    """
+    namespace = "_activation_605f22e_dirty"
+    return f"{namespace}::{op_name}"
\ No newline at end of file
diff --git a/build/torch26-cxx11-cu124-x86_64-linux/activation/layers.py b/build/torch26-cxx11-cu124-x86_64-linux/activation/layers.py
new file mode 100644
index 0000000000000000000000000000000000000000..3824e5de50583f385215cf90adc03aff91653e2e
--- /dev/null
+++ b/build/torch26-cxx11-cu124-x86_64-linux/activation/layers.py
@@ -0,0 +1,61 @@
+import torch
+import torch.nn as nn
+from torch.nn import init
+
+from .poly_norm import PolyNormFunction
+from .rms_norm import RMSNormFunction
+
+
+class PolyNorm(nn.Module):
+    """Module form of ``PolyNormFunction`` with learnable weight and bias.
+
+    Holds a 3-element ``weight`` (initialised to 1/3 each) and a 1-element
+    ``bias`` (initialised to 0), both created with the requested ``dtype``.
+    """
+
+    def __init__(self, eps=1e-6, dtype: torch.dtype = torch.float32):
+        super().__init__()
+        self.weight = torch.nn.Parameter(torch.ones(3, dtype=dtype) / 3)
+        self.bias = torch.nn.Parameter(torch.zeros(1, dtype=dtype))
+        self.eps = eps
+
+    def forward(
+        self,
+        x: torch.Tensor,
+    ):
+        """Run ``PolyNormFunction`` on ``x`` with this module's parameters."""
+        return PolyNormFunction.apply(x, self.weight, self.bias, self.eps)
+
+    def reset_parameters(self) -> None:
+        """
+        Resets parameters based on their initialization used in __init__.
+        """
+        # __init__ starts every weight at 1/3, so restore that value rather
+        # than filling with ones.
+        init.constant_(self.weight, 1 / 3)
+        init.zeros_(self.bias)
+
+
+class RMSNorm(nn.Module):
+    """Module form of ``RMSNormFunction`` with a learnable ``weight``.
+
+    ``weight`` has ``dim`` elements, all initialised to 1.
+    """
+
+    def __init__(self, dim: int, eps=1e-6, dtype: torch.dtype = torch.float32):
+        super().__init__()
+        self.weight = torch.nn.Parameter(torch.ones(dim, dtype=dtype))
+        self.eps = eps
+
+    def forward(
+        self,
+        x: torch.Tensor,
+    ):
+        """Run ``RMSNormFunction`` on ``x`` with this module's parameters."""
+        return RMSNormFunction.apply(x, self.weight, self.eps)
+
+    def reset_parameters(self) -> None:
+        """
+        Resets parameters based on their initialization used in __init__.
+        """
+        init.ones_(self.weight)
diff --git a/build/torch26-cxx11-cu124-x86_64-linux/activation/poly_norm.py b/build/torch26-cxx11-cu124-x86_64-linux/activation/poly_norm.py
new file mode 100644
index 0000000000000000000000000000000000000000..ce14e5cd8078de06d964775e34e1e668df32493e
--- /dev/null
+++ b/build/torch26-cxx11-cu124-x86_64-linux/activation/poly_norm.py
@@ -0,0 +1,52 @@
+import torch
+
+from ._ops import ops
+
+
+class PolyNormFunction(torch.autograd.Function):
+    """Autograd Function backed by the ``poly_norm`` extension ops.
+
+    ``forward``, ``setup_context`` and ``backward`` are all static methods.
+    """
+
+    @staticmethod
+    def forward(input, weight, bias, eps):
+        """Allocate a tensor like ``input`` and have ``ops.poly_norm`` fill it."""
+        result = torch.empty_like(input)
+        ops.poly_norm(result, input, weight, bias, eps)
+        return result
+
+    @staticmethod
+    def setup_context(ctx, inputs, output):
+        """Record what ``backward`` reads later.
+
+        ``inputs`` is the tuple of arguments passed to ``forward``; ``output``
+        is the value ``forward`` returned and is not used here.
+        """
+        x, w, _bias, eps = inputs
+        ctx.eps = eps
+        ctx.save_for_backward(x, w)
+
+    @staticmethod
+    def backward(ctx, output_grad):
+        """Compute gradients for ``input``, ``weight`` and ``bias``.
+
+        ``forward`` has a single output, so a single incoming gradient is
+        received. A gradient buffer is allocated only for the inputs flagged
+        in ``ctx.needs_input_grad``; the others are passed to the op as
+        ``None``. ``eps`` never receives a gradient.
+        """
+        x, w = ctx.saved_tensors
+        wants_x, wants_w, wants_b = ctx.needs_input_grad[:3]
+
+        grad_x = grad_w = grad_b = None
+        if wants_x:
+            grad_x = torch.empty_like(x)
+        if wants_w:
+            grad_w = torch.empty_like(w)
+        if wants_b:
+            grad_b = torch.empty(1, dtype=w.dtype, device=w.device)
+
+        ops.poly_norm_backward(grad_x, grad_w, grad_b, output_grad, x, w, ctx.eps)
+
+        return grad_x, grad_w, grad_b, None
diff --git a/build/torch26-cxx11-cu124-x86_64-linux/activation/rms_norm.py b/build/torch26-cxx11-cu124-x86_64-linux/activation/rms_norm.py
new file mode 100644
index 0000000000000000000000000000000000000000..53df35e855d5d1591bc6fceff50ba81afdb2c873
--- /dev/null
+++ b/build/torch26-cxx11-cu124-x86_64-linux/activation/rms_norm.py
@@ -0,0 +1,50 @@
+import torch
+
+from ._ops import ops
+
+
+class RMSNormFunction(torch.autograd.Function):
+    """Autograd Function backed by the ``rms_norm`` extension ops.
+
+    ``forward``, ``setup_context`` and ``backward`` are all static methods.
+    """
+
+    @staticmethod
+    def forward(input, weight, eps):
+        """Allocate a tensor like ``input`` and have ``ops.rms_norm`` fill it."""
+        result = torch.empty_like(input)
+        ops.rms_norm(result, input, weight, eps)
+        return result
+
+    @staticmethod
+    def setup_context(ctx, inputs, output):
+        """Record what ``backward`` reads later.
+
+        ``inputs`` is the tuple of arguments passed to ``forward``; ``output``
+        is the value ``forward`` returned and is not used here.
+        """
+        x, w, eps = inputs
+        ctx.eps = eps
+        ctx.save_for_backward(x, w)
+
+    @staticmethod
+    def backward(ctx, output_grad):
+        """Compute gradients for ``input`` and ``weight``.
+
+        ``forward`` has a single output, so a single incoming gradient is
+        received. A gradient buffer is allocated only for the inputs flagged
+        in ``ctx.needs_input_grad``; the others are passed to the op as
+        ``None``. ``eps`` never receives a gradient.
+        """
+        x, w = ctx.saved_tensors
+        wants_x, wants_w = ctx.needs_input_grad[:2]
+
+        grad_x = grad_w = None
+        if wants_x:
+            grad_x = torch.empty_like(x)
+        if wants_w:
+            grad_w = torch.empty_like(w)
+
+        ops.rms_norm_backward(grad_x, grad_w, output_grad, x, w, ctx.eps)
+
+        return grad_x, grad_w, None
diff --git a/build/torch26-cxx11-cu126-x86_64-linux/activation/__init__.py b/build/torch26-cxx11-cu126-x86_64-linux/activation/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..fadce68f2fa0f130463f00a59c3436b822835e24
--- /dev/null
+++ b/build/torch26-cxx11-cu126-x86_64-linux/activation/__init__.py
@@ -0,0 +1,41 @@
+import torch
+
+from . import layers
+from ._ops import ops
+from .poly_norm import PolyNormFunction
+from .rms_norm import RMSNormFunction
+
+
+def poly_norm(
+    x: torch.Tensor,
+    weight: torch.Tensor,
+    bias: torch.Tensor,
+    eps: float = 1e-6,
+) -> torch.Tensor:
+    """Apply ``PolyNormFunction`` to ``x`` and return its result.
+
+    The arguments are forwarded positionally as ``(x, weight, bias, eps)``.
+    """
+    return PolyNormFunction.apply(x, weight, bias, eps)
+
+
+def rms_norm(
+    x: torch.Tensor,
+    weight: torch.Tensor,
+    eps: float = 1e-6,
+) -> torch.Tensor:
+    """Apply ``RMSNormFunction`` to ``x`` and return its result.
+
+    The arguments are forwarded positionally as ``(x, weight, eps)``.
+    """
+    return RMSNormFunction.apply(x, weight, eps)
+
+
+# ``rms_norm`` is listed next to ``poly_norm`` so star-imports expose both
+# functional entry points.
+__all__ = [
+    "poly_norm",
+    "rms_norm",
+    "layers",
+    "ops",
+]
diff --git a/build/torch26-cxx11-cu126-x86_64-linux/activation/_activation_605f22e_dirty.abi3.so b/build/torch26-cxx11-cu126-x86_64-linux/activation/_activation_605f22e_dirty.abi3.so
new file mode 100644
index 0000000000000000000000000000000000000000..d30d11725ae8ec2ee197902f1acf9c6eb16c1f08
--- /dev/null
+++ b/build/torch26-cxx11-cu126-x86_64-linux/activation/_activation_605f22e_dirty.abi3.so
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:2d5c0095b931923008435d361c1871e97ff2ef04100e93205f09e65316f307f3
+size 2994704
diff --git a/build/torch26-cxx11-cu126-x86_64-linux/activation/_ops.py b/build/torch26-cxx11-cu126-x86_64-linux/activation/_ops.py
new file mode 100644
index 0000000000000000000000000000000000000000..4f65fa5de7bb717f07527d6846085f48d18b7e1d
--- /dev/null
+++ b/build/torch26-cxx11-cu126-x86_64-linux/activation/_ops.py
@@ -0,0 +1,10 @@
+import torch
+from . import _activation_605f22e_dirty
+ops = torch.ops._activation_605f22e_dirty
+
+def add_op_namespace_prefix(op_name: str):
+    """
+    Return ``op_name`` qualified as ``_activation_605f22e_dirty::<op_name>``.
+    """
+    namespace = "_activation_605f22e_dirty"
+    return f"{namespace}::{op_name}"
\ No newline at end of file
diff --git a/build/torch26-cxx11-cu126-x86_64-linux/activation/layers.py b/build/torch26-cxx11-cu126-x86_64-linux/activation/layers.py
new file mode 100644
index 0000000000000000000000000000000000000000..3824e5de50583f385215cf90adc03aff91653e2e
--- /dev/null
+++ b/build/torch26-cxx11-cu126-x86_64-linux/activation/layers.py
@@ -0,0 +1,61 @@
+import torch
+import torch.nn as nn
+from torch.nn import init
+
+from .poly_norm import PolyNormFunction
+from .rms_norm import RMSNormFunction
+
+
+class PolyNorm(nn.Module):
+    """Module form of ``PolyNormFunction`` with learnable weight and bias.
+
+    Holds a 3-element ``weight`` (initialised to 1/3 each) and a 1-element
+    ``bias`` (initialised to 0), both created with the requested ``dtype``.
+    """
+
+    def __init__(self, eps=1e-6, dtype: torch.dtype = torch.float32):
+        super().__init__()
+        self.weight = torch.nn.Parameter(torch.ones(3, dtype=dtype) / 3)
+        self.bias = torch.nn.Parameter(torch.zeros(1, dtype=dtype))
+        self.eps = eps
+
+    def forward(
+        self,
+        x: torch.Tensor,
+    ):
+        """Run ``PolyNormFunction`` on ``x`` with this module's parameters."""
+        return PolyNormFunction.apply(x, self.weight, self.bias, self.eps)
+
+    def reset_parameters(self) -> None:
+        """
+        Resets parameters based on their initialization used in __init__.
+        """
+        # __init__ starts every weight at 1/3, so restore that value rather
+        # than filling with ones.
+        init.constant_(self.weight, 1 / 3)
+        init.zeros_(self.bias)
+
+
+class RMSNorm(nn.Module):
+    """Module form of ``RMSNormFunction`` with a learnable ``weight``.
+
+    ``weight`` has ``dim`` elements, all initialised to 1.
+    """
+
+    def __init__(self, dim: int, eps=1e-6, dtype: torch.dtype = torch.float32):
+        super().__init__()
+        self.weight = torch.nn.Parameter(torch.ones(dim, dtype=dtype))
+        self.eps = eps
+
+    def forward(
+        self,
+        x: torch.Tensor,
+    ):
+        """Run ``RMSNormFunction`` on ``x`` with this module's parameters."""
+        return RMSNormFunction.apply(x, self.weight, self.eps)
+
+    def reset_parameters(self) -> None:
+        """
+        Resets parameters based on their initialization used in __init__.
+        """
+        init.ones_(self.weight)
diff --git a/build/torch26-cxx11-cu126-x86_64-linux/activation/poly_norm.py b/build/torch26-cxx11-cu126-x86_64-linux/activation/poly_norm.py
new file mode 100644
index 0000000000000000000000000000000000000000..ce14e5cd8078de06d964775e34e1e668df32493e
--- /dev/null
+++ b/build/torch26-cxx11-cu126-x86_64-linux/activation/poly_norm.py
@@ -0,0 +1,52 @@
+import torch
+
+from ._ops import ops
+
+
+class PolyNormFunction(torch.autograd.Function):
+    """Autograd Function backed by the ``poly_norm`` extension ops.
+
+    ``forward``, ``setup_context`` and ``backward`` are all static methods.
+    """
+
+    @staticmethod
+    def forward(input, weight, bias, eps):
+        """Allocate a tensor like ``input`` and have ``ops.poly_norm`` fill it."""
+        result = torch.empty_like(input)
+        ops.poly_norm(result, input, weight, bias, eps)
+        return result
+
+    @staticmethod
+    def setup_context(ctx, inputs, output):
+        """Record what ``backward`` reads later.
+
+        ``inputs`` is the tuple of arguments passed to ``forward``; ``output``
+        is the value ``forward`` returned and is not used here.
+        """
+        x, w, _bias, eps = inputs
+        ctx.eps = eps
+        ctx.save_for_backward(x, w)
+
+    @staticmethod
+    def backward(ctx, output_grad):
+        """Compute gradients for ``input``, ``weight`` and ``bias``.
+
+        ``forward`` has a single output, so a single incoming gradient is
+        received. A gradient buffer is allocated only for the inputs flagged
+        in ``ctx.needs_input_grad``; the others are passed to the op as
+        ``None``. ``eps`` never receives a gradient.
+        """
+        x, w = ctx.saved_tensors
+        wants_x, wants_w, wants_b = ctx.needs_input_grad[:3]
+
+        grad_x = grad_w = grad_b = None
+        if wants_x:
+            grad_x = torch.empty_like(x)
+        if wants_w:
+            grad_w = torch.empty_like(w)
+        if wants_b:
+            grad_b = torch.empty(1, dtype=w.dtype, device=w.device)
+
+        ops.poly_norm_backward(grad_x, grad_w, grad_b, output_grad, x, w, ctx.eps)
+
+        return grad_x, grad_w, grad_b, None
diff --git a/build/torch26-cxx11-cu126-x86_64-linux/activation/rms_norm.py b/build/torch26-cxx11-cu126-x86_64-linux/activation/rms_norm.py
new file mode 100644
index 0000000000000000000000000000000000000000..53df35e855d5d1591bc6fceff50ba81afdb2c873
--- /dev/null
+++ b/build/torch26-cxx11-cu126-x86_64-linux/activation/rms_norm.py
@@ -0,0 +1,50 @@
+import torch
+
+from ._ops import ops
+
+
+class RMSNormFunction(torch.autograd.Function):
+    """Autograd Function backed by the ``rms_norm`` extension ops.
+
+    ``forward``, ``setup_context`` and ``backward`` are all static methods.
+    """
+
+    @staticmethod
+    def forward(input, weight, eps):
+        """Allocate a tensor like ``input`` and have ``ops.rms_norm`` fill it."""
+        result = torch.empty_like(input)
+        ops.rms_norm(result, input, weight, eps)
+        return result
+
+    @staticmethod
+    def setup_context(ctx, inputs, output):
+        """Record what ``backward`` reads later.
+
+        ``inputs`` is the tuple of arguments passed to ``forward``; ``output``
+        is the value ``forward`` returned and is not used here.
+        """
+        x, w, eps = inputs
+        ctx.eps = eps
+        ctx.save_for_backward(x, w)
+
+    @staticmethod
+    def backward(ctx, output_grad):
+        """Compute gradients for ``input`` and ``weight``.
+
+        ``forward`` has a single output, so a single incoming gradient is
+        received. A gradient buffer is allocated only for the inputs flagged
+        in ``ctx.needs_input_grad``; the others are passed to the op as
+        ``None``. ``eps`` never receives a gradient.
+        """
+        x, w = ctx.saved_tensors
+        wants_x, wants_w = ctx.needs_input_grad[:2]
+
+        grad_x = grad_w = None
+        if wants_x:
+            grad_x = torch.empty_like(x)
+        if wants_w:
+            grad_w = torch.empty_like(w)
+
+        ops.rms_norm_backward(grad_x, grad_w, output_grad, x, w, ctx.eps)
+
+        return grad_x, grad_w, None
diff --git a/build/torch26-cxx11-rocm62-x86_64-linux/activation/__init__.py b/build/torch26-cxx11-rocm62-x86_64-linux/activation/__init__.py
old mode 100755
new mode 100644
diff --git a/build/torch26-cxx11-rocm62-x86_64-linux/activation/_activation_605f22e_dirty.abi3.so b/build/torch26-cxx11-rocm62-x86_64-linux/activation/_activation_605f22e_dirty.abi3.so
new file mode 100644
index 0000000000000000000000000000000000000000..21c1a09ee3c8d9b1e90ff323d1d25332a1d0bbe3
--- /dev/null
+++ b/build/torch26-cxx11-rocm62-x86_64-linux/activation/_activation_605f22e_dirty.abi3.so
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:5c29fccf3f62ac3e3b7ff59e898d31ae38f3484bfe762f6767b8bc8cedf1af01
+size 2660632
diff --git a/build/torch26-cxx11-rocm62-x86_64-linux/activation/_activation_f3b99fb_dirty.abi3.so b/build/torch26-cxx11-rocm62-x86_64-linux/activation/_activation_f3b99fb_dirty.abi3.so
deleted file mode 100755
index 684db4e4fa0ee039ca5e8386155b134039672d35..0000000000000000000000000000000000000000
--- a/build/torch26-cxx11-rocm62-x86_64-linux/activation/_activation_f3b99fb_dirty.abi3.so
+++ /dev/null
@@ -1,3 +0,0 @@
-version https://git-lfs.github.com/spec/v1
-oid sha256:7bf9f4d85d15bc4869292e6a293ec53b7658cee61284457ea727c4be435062f7
-size 2656296
diff --git a/build/torch26-cxx11-rocm62-x86_64-linux/activation/_ops.py b/build/torch26-cxx11-rocm62-x86_64-linux/activation/_ops.py
old mode 100755
new mode 100644
index acb0ec4991085b2b1b7ddcfc061eb8c8e27533ca..4f65fa5de7bb717f07527d6846085f48d18b7e1d
--- a/build/torch26-cxx11-rocm62-x86_64-linux/activation/_ops.py
+++ b/build/torch26-cxx11-rocm62-x86_64-linux/activation/_ops.py
@@ -1,9 +1,9 @@
 import torch
-from . import _activation_f3b99fb_dirty
-ops = torch.ops._activation_f3b99fb_dirty
+from . import _activation_605f22e_dirty
+ops = torch.ops._activation_605f22e_dirty
 
 def add_op_namespace_prefix(op_name: str):
     """
     Prefix op by namespace.
     """
-    return f"_activation_f3b99fb_dirty::{op_name}"
\ No newline at end of file
+    return f"_activation_605f22e_dirty::{op_name}"
\ No newline at end of file
diff --git a/build/torch26-cxx11-rocm62-x86_64-linux/activation/layers.py b/build/torch26-cxx11-rocm62-x86_64-linux/activation/layers.py
old mode 100755
new mode 100644
diff --git a/build/torch26-cxx11-rocm62-x86_64-linux/activation/poly_norm.py b/build/torch26-cxx11-rocm62-x86_64-linux/activation/poly_norm.py
old mode 100755
new mode 100644
diff --git a/build/torch26-cxx11-rocm62-x86_64-linux/activation/rms_norm.py b/build/torch26-cxx11-rocm62-x86_64-linux/activation/rms_norm.py
old mode 100755
new mode 100644
diff --git a/build/torch26-cxx98-cu118-x86_64-linux/activation/__init__.py b/build/torch26-cxx98-cu118-x86_64-linux/activation/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..fadce68f2fa0f130463f00a59c3436b822835e24
--- /dev/null
+++ b/build/torch26-cxx98-cu118-x86_64-linux/activation/__init__.py
@@ -0,0 +1,41 @@
+import torch
+
+from . import layers
+from ._ops import ops
+from .poly_norm import PolyNormFunction
+from .rms_norm import RMSNormFunction
+
+
+def poly_norm(
+    x: torch.Tensor,
+    weight: torch.Tensor,
+    bias: torch.Tensor,
+    eps: float = 1e-6,
+) -> torch.Tensor:
+    """Apply ``PolyNormFunction`` to ``x`` and return its result.
+
+    The arguments are forwarded positionally as ``(x, weight, bias, eps)``.
+    """
+    return PolyNormFunction.apply(x, weight, bias, eps)
+
+
+def rms_norm(
+    x: torch.Tensor,
+    weight: torch.Tensor,
+    eps: float = 1e-6,
+) -> torch.Tensor:
+    """Apply ``RMSNormFunction`` to ``x`` and return its result.
+
+    The arguments are forwarded positionally as ``(x, weight, eps)``.
+    """
+    return RMSNormFunction.apply(x, weight, eps)
+
+
+# ``rms_norm`` is listed next to ``poly_norm`` so star-imports expose both
+# functional entry points.
+__all__ = [
+    "poly_norm",
+    "rms_norm",
+    "layers",
+    "ops",
+]
diff --git a/build/torch26-cxx98-cu118-x86_64-linux/activation/_activation_605f22e_dirty.abi3.so b/build/torch26-cxx98-cu118-x86_64-linux/activation/_activation_605f22e_dirty.abi3.so
new file mode 100644
index 0000000000000000000000000000000000000000..5e6e767041018f2157d2a53d7479cf80cbdbcaef
--- /dev/null
+++ b/build/torch26-cxx98-cu118-x86_64-linux/activation/_activation_605f22e_dirty.abi3.so
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:176b8610ed2b9650c68347ec2f1d9e99b653170b4fd4f6f3540731f3fd78e98b
+size 2949936
diff --git a/build/torch26-cxx98-cu118-x86_64-linux/activation/_ops.py b/build/torch26-cxx98-cu118-x86_64-linux/activation/_ops.py
new file mode 100644
index 0000000000000000000000000000000000000000..4f65fa5de7bb717f07527d6846085f48d18b7e1d
--- /dev/null
+++ b/build/torch26-cxx98-cu118-x86_64-linux/activation/_ops.py
@@ -0,0 +1,10 @@
+import torch
+from . import _activation_605f22e_dirty
+ops = torch.ops._activation_605f22e_dirty
+
+def add_op_namespace_prefix(op_name: str):
+    """
+    Return ``op_name`` qualified as ``_activation_605f22e_dirty::<op_name>``.
+    """
+    namespace = "_activation_605f22e_dirty"
+    return f"{namespace}::{op_name}"
\ No newline at end of file
diff --git a/build/torch26-cxx98-cu118-x86_64-linux/activation/layers.py b/build/torch26-cxx98-cu118-x86_64-linux/activation/layers.py
new file mode 100644
index 0000000000000000000000000000000000000000..3824e5de50583f385215cf90adc03aff91653e2e
--- /dev/null
+++ b/build/torch26-cxx98-cu118-x86_64-linux/activation/layers.py
@@ -0,0 +1,61 @@
+import torch
+import torch.nn as nn
+from torch.nn import init
+
+from .poly_norm import PolyNormFunction
+from .rms_norm import RMSNormFunction
+
+
+class PolyNorm(nn.Module):
+    """Module form of ``PolyNormFunction`` with learnable weight and bias.
+
+    Holds a 3-element ``weight`` (initialised to 1/3 each) and a 1-element
+    ``bias`` (initialised to 0), both created with the requested ``dtype``.
+    """
+
+    def __init__(self, eps=1e-6, dtype: torch.dtype = torch.float32):
+        super().__init__()
+        self.weight = torch.nn.Parameter(torch.ones(3, dtype=dtype) / 3)
+        self.bias = torch.nn.Parameter(torch.zeros(1, dtype=dtype))
+        self.eps = eps
+
+    def forward(
+        self,
+        x: torch.Tensor,
+    ):
+        """Run ``PolyNormFunction`` on ``x`` with this module's parameters."""
+        return PolyNormFunction.apply(x, self.weight, self.bias, self.eps)
+
+    def reset_parameters(self) -> None:
+        """
+        Resets parameters based on their initialization used in __init__.
+        """
+        # __init__ starts every weight at 1/3, so restore that value rather
+        # than filling with ones.
+        init.constant_(self.weight, 1 / 3)
+        init.zeros_(self.bias)
+
+
+class RMSNorm(nn.Module):
+    """Module form of ``RMSNormFunction`` with a learnable ``weight``.
+
+    ``weight`` has ``dim`` elements, all initialised to 1.
+    """
+
+    def __init__(self, dim: int, eps=1e-6, dtype: torch.dtype = torch.float32):
+        super().__init__()
+        self.weight = torch.nn.Parameter(torch.ones(dim, dtype=dtype))
+        self.eps = eps
+
+    def forward(
+        self,
+        x: torch.Tensor,
+    ):
+        """Run ``RMSNormFunction`` on ``x`` with this module's parameters."""
+        return RMSNormFunction.apply(x, self.weight, self.eps)
+
+    def reset_parameters(self) -> None:
+        """
+        Resets parameters based on their initialization used in __init__.
+        """
+        init.ones_(self.weight)
diff --git a/build/torch26-cxx98-cu118-x86_64-linux/activation/poly_norm.py b/build/torch26-cxx98-cu118-x86_64-linux/activation/poly_norm.py
new file mode 100644
index 0000000000000000000000000000000000000000..ce14e5cd8078de06d964775e34e1e668df32493e
--- /dev/null
+++ b/build/torch26-cxx98-cu118-x86_64-linux/activation/poly_norm.py
@@ -0,0 +1,52 @@
+import torch
+
+from ._ops import ops
+
+
+class PolyNormFunction(torch.autograd.Function):
+    """Autograd Function backed by the ``poly_norm`` extension ops.
+
+    ``forward``, ``setup_context`` and ``backward`` are all static methods.
+    """
+
+    @staticmethod
+    def forward(input, weight, bias, eps):
+        """Allocate a tensor like ``input`` and have ``ops.poly_norm`` fill it."""
+        result = torch.empty_like(input)
+        ops.poly_norm(result, input, weight, bias, eps)
+        return result
+
+    @staticmethod
+    def setup_context(ctx, inputs, output):
+        """Record what ``backward`` reads later.
+
+        ``inputs`` is the tuple of arguments passed to ``forward``; ``output``
+        is the value ``forward`` returned and is not used here.
+        """
+        x, w, _bias, eps = inputs
+        ctx.eps = eps
+        ctx.save_for_backward(x, w)
+
+    @staticmethod
+    def backward(ctx, output_grad):
+        """Compute gradients for ``input``, ``weight`` and ``bias``.
+
+        ``forward`` has a single output, so a single incoming gradient is
+        received. A gradient buffer is allocated only for the inputs flagged
+        in ``ctx.needs_input_grad``; the others are passed to the op as
+        ``None``. ``eps`` never receives a gradient.
+        """
+        x, w = ctx.saved_tensors
+        wants_x, wants_w, wants_b = ctx.needs_input_grad[:3]
+
+        grad_x = grad_w = grad_b = None
+        if wants_x:
+            grad_x = torch.empty_like(x)
+        if wants_w:
+            grad_w = torch.empty_like(w)
+        if wants_b:
+            grad_b = torch.empty(1, dtype=w.dtype, device=w.device)
+
+        ops.poly_norm_backward(grad_x, grad_w, grad_b, output_grad, x, w, ctx.eps)
+
+        return grad_x, grad_w, grad_b, None
diff --git a/build/torch26-cxx98-cu118-x86_64-linux/activation/rms_norm.py b/build/torch26-cxx98-cu118-x86_64-linux/activation/rms_norm.py
new file mode 100644
index 0000000000000000000000000000000000000000..53df35e855d5d1591bc6fceff50ba81afdb2c873
--- /dev/null
+++ b/build/torch26-cxx98-cu118-x86_64-linux/activation/rms_norm.py
@@ -0,0 +1,50 @@
+import torch
+
+from ._ops import ops
+
+
+class RMSNormFunction(torch.autograd.Function):
+    """Autograd Function backed by the ``rms_norm`` extension ops.
+
+    ``forward``, ``setup_context`` and ``backward`` are all static methods.
+    """
+
+    @staticmethod
+    def forward(input, weight, eps):
+        """Allocate a tensor like ``input`` and have ``ops.rms_norm`` fill it."""
+        result = torch.empty_like(input)
+        ops.rms_norm(result, input, weight, eps)
+        return result
+
+    @staticmethod
+    def setup_context(ctx, inputs, output):
+        """Record what ``backward`` reads later.
+
+        ``inputs`` is the tuple of arguments passed to ``forward``; ``output``
+        is the value ``forward`` returned and is not used here.
+        """
+        x, w, eps = inputs
+        ctx.eps = eps
+        ctx.save_for_backward(x, w)
+
+    @staticmethod
+    def backward(ctx, output_grad):
+        """Compute gradients for ``input`` and ``weight``.
+
+        ``forward`` has a single output, so a single incoming gradient is
+        received. A gradient buffer is allocated only for the inputs flagged
+        in ``ctx.needs_input_grad``; the others are passed to the op as
+        ``None``. ``eps`` never receives a gradient.
+        """
+        x, w = ctx.saved_tensors
+        wants_x, wants_w = ctx.needs_input_grad[:2]
+
+        grad_x = grad_w = None
+        if wants_x:
+            grad_x = torch.empty_like(x)
+        if wants_w:
+            grad_w = torch.empty_like(w)
+
+        ops.rms_norm_backward(grad_x, grad_w, output_grad, x, w, ctx.eps)
+
+        return grad_x, grad_w, None
diff --git a/build/torch26-cxx98-cu124-x86_64-linux/activation/__init__.py b/build/torch26-cxx98-cu124-x86_64-linux/activation/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..fadce68f2fa0f130463f00a59c3436b822835e24
--- /dev/null
+++ b/build/torch26-cxx98-cu124-x86_64-linux/activation/__init__.py
@@ -0,0 +1,41 @@
+import torch
+
+from . import layers
+from ._ops import ops
+from .poly_norm import PolyNormFunction
+from .rms_norm import RMSNormFunction
+
+
+def poly_norm(
+    x: torch.Tensor,
+    weight: torch.Tensor,
+    bias: torch.Tensor,
+    eps: float = 1e-6,
+) -> torch.Tensor:
+    """Apply ``PolyNormFunction`` to ``x`` and return its result.
+
+    The arguments are forwarded positionally as ``(x, weight, bias, eps)``.
+    """
+    return PolyNormFunction.apply(x, weight, bias, eps)
+
+
+def rms_norm(
+    x: torch.Tensor,
+    weight: torch.Tensor,
+    eps: float = 1e-6,
+) -> torch.Tensor:
+    """Apply ``RMSNormFunction`` to ``x`` and return its result.
+
+    The arguments are forwarded positionally as ``(x, weight, eps)``.
+    """
+    return RMSNormFunction.apply(x, weight, eps)
+
+
+# ``rms_norm`` is listed next to ``poly_norm`` so star-imports expose both
+# functional entry points.
+__all__ = [
+    "poly_norm",
+    "rms_norm",
+    "layers",
+    "ops",
+]
diff --git a/build/torch26-cxx98-cu124-x86_64-linux/activation/_activation_605f22e_dirty.abi3.so b/build/torch26-cxx98-cu124-x86_64-linux/activation/_activation_605f22e_dirty.abi3.so
new file mode 100644
index 0000000000000000000000000000000000000000..c65d99af91dd3aaaffe08d2c572cf46f1a671fee
--- /dev/null
+++ b/build/torch26-cxx98-cu124-x86_64-linux/activation/_activation_605f22e_dirty.abi3.so
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:8075bbb5b339e0305d353003eb86a2b6a4d8a468907d821cefbed29e6e439c19
+size 2974640
diff --git a/build/torch26-cxx98-cu124-x86_64-linux/activation/_ops.py b/build/torch26-cxx98-cu124-x86_64-linux/activation/_ops.py
new file mode 100644
index 0000000000000000000000000000000000000000..4f65fa5de7bb717f07527d6846085f48d18b7e1d
--- /dev/null
+++ b/build/torch26-cxx98-cu124-x86_64-linux/activation/_ops.py
@@ -0,0 +1,9 @@
+import torch
+from . import _activation_605f22e_dirty
+ops = torch.ops._activation_605f22e_dirty
+
+def add_op_namespace_prefix(op_name: str):
+    """Return ``op_name`` qualified as ``<namespace>::<op_name>``."""
+    namespace = "_activation_605f22e_dirty"
+    # The namespace matches the extension module name used for this build.
+    return f"{namespace}::{op_name}"
\ No newline at end of file
diff --git a/build/torch26-cxx98-cu124-x86_64-linux/activation/layers.py b/build/torch26-cxx98-cu124-x86_64-linux/activation/layers.py
new file mode 100644
index 0000000000000000000000000000000000000000..3824e5de50583f385215cf90adc03aff91653e2e
--- /dev/null
+++ b/build/torch26-cxx98-cu124-x86_64-linux/activation/layers.py
@@ -0,0 +1,46 @@
+import torch
+import torch.nn as nn
+from torch.nn import init
+
+from .poly_norm import PolyNormFunction
+from .rms_norm import RMSNormFunction
+
+
+class PolyNorm(nn.Module):
+    """Module wrapper around ``PolyNormFunction`` with 3 weights and 1 bias."""
+
+    def __init__(self, eps=1e-6, dtype: torch.dtype = torch.float32):
+        super().__init__()
+        self.weight = torch.nn.Parameter(torch.ones(3, dtype=dtype) / 3)
+        self.bias = torch.nn.Parameter(torch.zeros(1, dtype=dtype))
+        self.eps = eps
+
+    def forward(self, x: torch.Tensor):
+        """Apply ``PolyNormFunction`` to ``x`` using the module's parameters."""
+        return PolyNormFunction.apply(x, self.weight, self.bias, self.eps)
+
+    def reset_parameters(self) -> None:
+        """Restore the values assigned in ``__init__``: weight 1/3, bias 0."""
+        # ``__init__`` creates the weight as ones / 3, so refill with 1/3
+        # rather than 1 to keep the reset consistent with construction.
+        init.constant_(self.weight, 1.0 / 3)
+        init.zeros_(self.bias)
+
+
+class RMSNorm(nn.Module):
+    """Module wrapper around ``RMSNormFunction`` holding a ``dim``-sized weight."""
+
+    def __init__(self, dim: int, eps=1e-6, dtype: torch.dtype = torch.float32):
+        super().__init__()
+        initial_weight = torch.ones(dim, dtype=dtype)
+        self.weight = nn.Parameter(initial_weight)
+        self.eps = eps
+
+    def forward(self, x: torch.Tensor):
+        """Apply ``RMSNormFunction`` to ``x`` with the stored weight and eps."""
+        weight, eps = self.weight, self.eps
+        return RMSNormFunction.apply(x, weight, eps)
+
+    def reset_parameters(self) -> None:
+        """Fill the weight with ones, the value it is given in ``__init__``."""
+        init.ones_(self.weight)
diff --git a/build/torch26-cxx98-cu124-x86_64-linux/activation/poly_norm.py b/build/torch26-cxx98-cu124-x86_64-linux/activation/poly_norm.py
new file mode 100644
index 0000000000000000000000000000000000000000..ce14e5cd8078de06d964775e34e1e668df32493e
--- /dev/null
+++ b/build/torch26-cxx98-cu124-x86_64-linux/activation/poly_norm.py
@@ -0,0 +1,41 @@
+import torch
+
+from ._ops import ops
+
+
+# Inherit from Function
+class PolyNormFunction(torch.autograd.Function):
+    """Autograd function pairing ``ops.poly_norm`` with its backward op."""
+
+    @staticmethod
+    def forward(input, weight, bias, eps):
+        """Allocate a buffer shaped like ``input`` and let the op fill it."""
+        result = torch.empty_like(input)
+        ops.poly_norm(result, input, weight, bias, eps)
+        return result
+
+    @staticmethod
+    def setup_context(ctx, inputs, output):
+        """Stash what ``backward`` reads: input, weight and eps (not bias)."""
+        # ``inputs`` holds the arguments of ``forward`` in call order.
+        x, w, _bias, epsilon = inputs
+        ctx.save_for_backward(x, w)
+        ctx.eps = epsilon
+
+    @staticmethod
+    def backward(ctx, output_grad):
+        """Return grads for (input, weight, bias, eps); eps gets ``None``."""
+        x, w = ctx.saved_tensors
+        wants_x, wants_w, wants_b = ctx.needs_input_grad[:3]
+
+        # A buffer is allocated only for gradients autograd asked for;
+        # the others are handed to the op as ``None``.
+        grad_x = torch.empty_like(x) if wants_x else None
+        grad_w = torch.empty_like(w) if wants_w else None
+        grad_b = None
+        if wants_b:
+            grad_b = torch.empty(1, dtype=w.dtype, device=w.device)
+
+        ops.poly_norm_backward(grad_x, grad_w, grad_b, output_grad, x, w, ctx.eps)
+
+        return grad_x, grad_w, grad_b, None
diff --git a/build/torch26-cxx98-cu124-x86_64-linux/activation/rms_norm.py b/build/torch26-cxx98-cu124-x86_64-linux/activation/rms_norm.py
new file mode 100644
index 0000000000000000000000000000000000000000..53df35e855d5d1591bc6fceff50ba81afdb2c873
--- /dev/null
+++ b/build/torch26-cxx98-cu124-x86_64-linux/activation/rms_norm.py
@@ -0,0 +1,34 @@
+import torch
+
+from ._ops import ops
+
+
+# Inherit from Function
+class RMSNormFunction(torch.autograd.Function):
+    """Autograd function pairing ``ops.rms_norm`` with its backward op."""
+
+    @staticmethod
+    def forward(input, weight, eps):
+        """Allocate a buffer shaped like ``input`` and let the op fill it."""
+        result = torch.empty_like(input)
+        ops.rms_norm(result, input, weight, eps)
+        return result
+
+    @staticmethod
+    def setup_context(ctx, inputs, output):
+        """Stash what ``backward`` reads: the input, the weight and eps."""
+        x, w, epsilon = inputs
+        ctx.save_for_backward(x, w)
+        ctx.eps = epsilon
+
+    @staticmethod
+    def backward(ctx, output_grad):
+        """Return grads for (input, weight, eps); eps always gets ``None``."""
+        x, w = ctx.saved_tensors
+        wants_x, wants_w = ctx.needs_input_grad[:2]
+
+        # Only gradients autograd asked for get a buffer; others stay ``None``.
+        grad_x = torch.empty_like(x) if wants_x else None
+        grad_w = torch.empty_like(w) if wants_w else None
+        ops.rms_norm_backward(grad_x, grad_w, output_grad, x, w, ctx.eps)
+        return grad_x, grad_w, None
diff --git a/build/torch26-cxx98-cu126-x86_64-linux/activation/__init__.py b/build/torch26-cxx98-cu126-x86_64-linux/activation/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..fadce68f2fa0f130463f00a59c3436b822835e24
--- /dev/null
+++ b/build/torch26-cxx98-cu126-x86_64-linux/activation/__init__.py
@@ -0,0 +1,30 @@
+import torch
+
+from . import layers
+from ._ops import ops
+from .poly_norm import PolyNormFunction
+from .rms_norm import RMSNormFunction
+
+
+def poly_norm(
+    x: torch.Tensor, weight: torch.Tensor, bias: torch.Tensor, eps: float = 1e-6
+) -> torch.Tensor:
+    """Run ``PolyNormFunction`` on ``x`` with ``weight``, ``bias`` and ``eps``."""
+    # Return annotation fixed: the value of ``PolyNormFunction.apply`` is
+    # returned to the caller, so this function does not return ``None``.
+    return PolyNormFunction.apply(x, weight, bias, eps)
+
+
+def rms_norm(
+    x: torch.Tensor, weight: torch.Tensor, eps: float = 1e-6
+) -> torch.Tensor:
+    """Run ``RMSNormFunction`` on ``x`` with ``weight`` and ``eps``."""
+    # Annotated as a tensor: the result of ``apply`` is returned, not ``None``.
+    return RMSNormFunction.apply(x, weight, eps)
+
+
+# Public names of this package: the two functional entry points defined
+# above, the ``layers`` submodule and the ``ops`` object imported from
+# ``._ops``. ``rms_norm`` is listed so that ``import *`` exposes it
+# alongside ``poly_norm``.
+__all__ = ["poly_norm", "rms_norm", "layers", "ops"]
diff --git a/build/torch26-cxx98-cu126-x86_64-linux/activation/_activation_605f22e_dirty.abi3.so b/build/torch26-cxx98-cu126-x86_64-linux/activation/_activation_605f22e_dirty.abi3.so
new file mode 100644
index 0000000000000000000000000000000000000000..c8dbf12bc93a1955e673270baa58f88c9189ed2d
--- /dev/null
+++ b/build/torch26-cxx98-cu126-x86_64-linux/activation/_activation_605f22e_dirty.abi3.so
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:614ef2bf9867f65bf8e09d861def1c554d384676aa58dfbfd73bf96241cb7171
+size 2987456
diff --git a/build/torch26-cxx98-cu126-x86_64-linux/activation/_ops.py b/build/torch26-cxx98-cu126-x86_64-linux/activation/_ops.py
new file mode 100644
index 0000000000000000000000000000000000000000..4f65fa5de7bb717f07527d6846085f48d18b7e1d
--- /dev/null
+++ b/build/torch26-cxx98-cu126-x86_64-linux/activation/_ops.py
@@ -0,0 +1,9 @@
+import torch
+from . import _activation_605f22e_dirty
+ops = torch.ops._activation_605f22e_dirty
+
+def add_op_namespace_prefix(op_name: str):
+    """Return ``op_name`` qualified as ``<namespace>::<op_name>``."""
+    namespace = "_activation_605f22e_dirty"
+    # The namespace matches the extension module name used for this build.
+    return f"{namespace}::{op_name}"
\ No newline at end of file
diff --git a/build/torch26-cxx98-cu126-x86_64-linux/activation/layers.py b/build/torch26-cxx98-cu126-x86_64-linux/activation/layers.py
new file mode 100644
index 0000000000000000000000000000000000000000..3824e5de50583f385215cf90adc03aff91653e2e
--- /dev/null
+++ b/build/torch26-cxx98-cu126-x86_64-linux/activation/layers.py
@@ -0,0 +1,46 @@
+import torch
+import torch.nn as nn
+from torch.nn import init
+
+from .poly_norm import PolyNormFunction
+from .rms_norm import RMSNormFunction
+
+
+class PolyNorm(nn.Module):
+    """Module wrapper around ``PolyNormFunction`` with 3 weights and 1 bias."""
+
+    def __init__(self, eps=1e-6, dtype: torch.dtype = torch.float32):
+        super().__init__()
+        self.weight = torch.nn.Parameter(torch.ones(3, dtype=dtype) / 3)
+        self.bias = torch.nn.Parameter(torch.zeros(1, dtype=dtype))
+        self.eps = eps
+
+    def forward(self, x: torch.Tensor):
+        """Apply ``PolyNormFunction`` to ``x`` using the module's parameters."""
+        return PolyNormFunction.apply(x, self.weight, self.bias, self.eps)
+
+    def reset_parameters(self) -> None:
+        """Restore the values assigned in ``__init__``: weight 1/3, bias 0."""
+        # ``__init__`` creates the weight as ones / 3, so refill with 1/3
+        # rather than 1 to keep the reset consistent with construction.
+        init.constant_(self.weight, 1.0 / 3)
+        init.zeros_(self.bias)
+
+
+class RMSNorm(nn.Module):
+    """Module wrapper around ``RMSNormFunction`` holding a ``dim``-sized weight."""
+
+    def __init__(self, dim: int, eps=1e-6, dtype: torch.dtype = torch.float32):
+        super().__init__()
+        initial_weight = torch.ones(dim, dtype=dtype)
+        self.weight = nn.Parameter(initial_weight)
+        self.eps = eps
+
+    def forward(self, x: torch.Tensor):
+        """Apply ``RMSNormFunction`` to ``x`` with the stored weight and eps."""
+        weight, eps = self.weight, self.eps
+        return RMSNormFunction.apply(x, weight, eps)
+
+    def reset_parameters(self) -> None:
+        """Fill the weight with ones, the value it is given in ``__init__``."""
+        init.ones_(self.weight)
diff --git a/build/torch26-cxx98-cu126-x86_64-linux/activation/poly_norm.py b/build/torch26-cxx98-cu126-x86_64-linux/activation/poly_norm.py
new file mode 100644
index 0000000000000000000000000000000000000000..ce14e5cd8078de06d964775e34e1e668df32493e
--- /dev/null
+++ b/build/torch26-cxx98-cu126-x86_64-linux/activation/poly_norm.py
@@ -0,0 +1,41 @@
+import torch
+
+from ._ops import ops
+
+
+# Inherit from Function
+class PolyNormFunction(torch.autograd.Function):
+    """Autograd function pairing ``ops.poly_norm`` with its backward op."""
+
+    @staticmethod
+    def forward(input, weight, bias, eps):
+        """Allocate a buffer shaped like ``input`` and let the op fill it."""
+        result = torch.empty_like(input)
+        ops.poly_norm(result, input, weight, bias, eps)
+        return result
+
+    @staticmethod
+    def setup_context(ctx, inputs, output):
+        """Stash what ``backward`` reads: input, weight and eps (not bias)."""
+        # ``inputs`` holds the arguments of ``forward`` in call order.
+        x, w, _bias, epsilon = inputs
+        ctx.save_for_backward(x, w)
+        ctx.eps = epsilon
+
+    @staticmethod
+    def backward(ctx, output_grad):
+        """Return grads for (input, weight, bias, eps); eps gets ``None``."""
+        x, w = ctx.saved_tensors
+        wants_x, wants_w, wants_b = ctx.needs_input_grad[:3]
+
+        # A buffer is allocated only for gradients autograd asked for;
+        # the others are handed to the op as ``None``.
+        grad_x = torch.empty_like(x) if wants_x else None
+        grad_w = torch.empty_like(w) if wants_w else None
+        grad_b = None
+        if wants_b:
+            grad_b = torch.empty(1, dtype=w.dtype, device=w.device)
+
+        ops.poly_norm_backward(grad_x, grad_w, grad_b, output_grad, x, w, ctx.eps)
+
+        return grad_x, grad_w, grad_b, None
diff --git a/build/torch26-cxx98-cu126-x86_64-linux/activation/rms_norm.py b/build/torch26-cxx98-cu126-x86_64-linux/activation/rms_norm.py
new file mode 100644
index 0000000000000000000000000000000000000000..53df35e855d5d1591bc6fceff50ba81afdb2c873
--- /dev/null
+++ b/build/torch26-cxx98-cu126-x86_64-linux/activation/rms_norm.py
@@ -0,0 +1,34 @@
+import torch
+
+from ._ops import ops
+
+
+# Inherit from Function
+class RMSNormFunction(torch.autograd.Function):
+    """Autograd function pairing ``ops.rms_norm`` with its backward op."""
+
+    @staticmethod
+    def forward(input, weight, eps):
+        """Allocate a buffer shaped like ``input`` and let the op fill it."""
+        result = torch.empty_like(input)
+        ops.rms_norm(result, input, weight, eps)
+        return result
+
+    @staticmethod
+    def setup_context(ctx, inputs, output):
+        """Stash what ``backward`` reads: the input, the weight and eps."""
+        x, w, epsilon = inputs
+        ctx.save_for_backward(x, w)
+        ctx.eps = epsilon
+
+    @staticmethod
+    def backward(ctx, output_grad):
+        """Return grads for (input, weight, eps); eps always gets ``None``."""
+        x, w = ctx.saved_tensors
+        wants_x, wants_w = ctx.needs_input_grad[:2]
+
+        # Only gradients autograd asked for get a buffer; others stay ``None``.
+        grad_x = torch.empty_like(x) if wants_x else None
+        grad_w = torch.empty_like(w) if wants_w else None
+        ops.rms_norm_backward(grad_x, grad_w, output_grad, x, w, ctx.eps)
+        return grad_x, grad_w, None
diff --git a/build/torch27-cxx11-cu118-x86_64-linux/activation/__init__.py b/build/torch27-cxx11-cu118-x86_64-linux/activation/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..fadce68f2fa0f130463f00a59c3436b822835e24
--- /dev/null
+++ b/build/torch27-cxx11-cu118-x86_64-linux/activation/__init__.py
@@ -0,0 +1,30 @@
+import torch
+
+from . import layers
+from ._ops import ops
+from .poly_norm import PolyNormFunction
+from .rms_norm import RMSNormFunction
+
+
+def poly_norm(
+    x: torch.Tensor, weight: torch.Tensor, bias: torch.Tensor, eps: float = 1e-6
+) -> torch.Tensor:
+    """Run ``PolyNormFunction`` on ``x`` with ``weight``, ``bias`` and ``eps``."""
+    # Return annotation fixed: the value of ``PolyNormFunction.apply`` is
+    # returned to the caller, so this function does not return ``None``.
+    return PolyNormFunction.apply(x, weight, bias, eps)
+
+
+def rms_norm(
+    x: torch.Tensor, weight: torch.Tensor, eps: float = 1e-6
+) -> torch.Tensor:
+    """Run ``RMSNormFunction`` on ``x`` with ``weight`` and ``eps``."""
+    # Annotated as a tensor: the result of ``apply`` is returned, not ``None``.
+    return RMSNormFunction.apply(x, weight, eps)
+
+
+# Public names of this package: the two functional entry points defined
+# above, the ``layers`` submodule and the ``ops`` object imported from
+# ``._ops``. ``rms_norm`` is listed so that ``import *`` exposes it
+# alongside ``poly_norm``.
+__all__ = ["poly_norm", "rms_norm", "layers", "ops"]
diff --git a/build/torch27-cxx11-cu118-x86_64-linux/activation/_activation_605f22e_dirty.abi3.so b/build/torch27-cxx11-cu118-x86_64-linux/activation/_activation_605f22e_dirty.abi3.so
new file mode 100644
index 0000000000000000000000000000000000000000..22f58c584491d644d7e0a5f52938366b2bfc014b
--- /dev/null
+++ b/build/torch27-cxx11-cu118-x86_64-linux/activation/_activation_605f22e_dirty.abi3.so
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:654d16d66565185dfd1a6f16e0b24d8fff83e12558c8862c322734e6b52e5cc0
+size 2957448
diff --git a/build/torch27-cxx11-cu118-x86_64-linux/activation/_ops.py b/build/torch27-cxx11-cu118-x86_64-linux/activation/_ops.py
new file mode 100644
index 0000000000000000000000000000000000000000..4f65fa5de7bb717f07527d6846085f48d18b7e1d
--- /dev/null
+++ b/build/torch27-cxx11-cu118-x86_64-linux/activation/_ops.py
@@ -0,0 +1,9 @@
+import torch
+from . import _activation_605f22e_dirty
+ops = torch.ops._activation_605f22e_dirty
+
+def add_op_namespace_prefix(op_name: str):
+    """Return ``op_name`` qualified as ``<namespace>::<op_name>``."""
+    namespace = "_activation_605f22e_dirty"
+    # The namespace matches the extension module name used for this build.
+    return f"{namespace}::{op_name}"
\ No newline at end of file
diff --git a/build/torch27-cxx11-cu118-x86_64-linux/activation/layers.py b/build/torch27-cxx11-cu118-x86_64-linux/activation/layers.py
new file mode 100644
index 0000000000000000000000000000000000000000..3824e5de50583f385215cf90adc03aff91653e2e
--- /dev/null
+++ b/build/torch27-cxx11-cu118-x86_64-linux/activation/layers.py
@@ -0,0 +1,46 @@
+import torch
+import torch.nn as nn
+from torch.nn import init
+
+from .poly_norm import PolyNormFunction
+from .rms_norm import RMSNormFunction
+
+
+class PolyNorm(nn.Module):
+    """Module wrapper around ``PolyNormFunction`` with 3 weights and 1 bias."""
+
+    def __init__(self, eps=1e-6, dtype: torch.dtype = torch.float32):
+        super().__init__()
+        self.weight = torch.nn.Parameter(torch.ones(3, dtype=dtype) / 3)
+        self.bias = torch.nn.Parameter(torch.zeros(1, dtype=dtype))
+        self.eps = eps
+
+    def forward(self, x: torch.Tensor):
+        """Apply ``PolyNormFunction`` to ``x`` using the module's parameters."""
+        return PolyNormFunction.apply(x, self.weight, self.bias, self.eps)
+
+    def reset_parameters(self) -> None:
+        """Restore the values assigned in ``__init__``: weight 1/3, bias 0."""
+        # ``__init__`` creates the weight as ones / 3, so refill with 1/3
+        # rather than 1 to keep the reset consistent with construction.
+        init.constant_(self.weight, 1.0 / 3)
+        init.zeros_(self.bias)
+
+
+class RMSNorm(nn.Module):
+    """Module wrapper around ``RMSNormFunction`` holding a ``dim``-sized weight."""
+
+    def __init__(self, dim: int, eps=1e-6, dtype: torch.dtype = torch.float32):
+        super().__init__()
+        initial_weight = torch.ones(dim, dtype=dtype)
+        self.weight = nn.Parameter(initial_weight)
+        self.eps = eps
+
+    def forward(self, x: torch.Tensor):
+        """Apply ``RMSNormFunction`` to ``x`` with the stored weight and eps."""
+        weight, eps = self.weight, self.eps
+        return RMSNormFunction.apply(x, weight, eps)
+
+    def reset_parameters(self) -> None:
+        """Fill the weight with ones, the value it is given in ``__init__``."""
+        init.ones_(self.weight)
diff --git a/build/torch27-cxx11-cu118-x86_64-linux/activation/poly_norm.py b/build/torch27-cxx11-cu118-x86_64-linux/activation/poly_norm.py
new file mode 100644
index 0000000000000000000000000000000000000000..ce14e5cd8078de06d964775e34e1e668df32493e
--- /dev/null
+++ b/build/torch27-cxx11-cu118-x86_64-linux/activation/poly_norm.py
@@ -0,0 +1,41 @@
+import torch
+
+from ._ops import ops
+
+
+# Inherit from Function
+class PolyNormFunction(torch.autograd.Function):
+    """Autograd function pairing ``ops.poly_norm`` with its backward op."""
+
+    @staticmethod
+    def forward(input, weight, bias, eps):
+        """Allocate a buffer shaped like ``input`` and let the op fill it."""
+        result = torch.empty_like(input)
+        ops.poly_norm(result, input, weight, bias, eps)
+        return result
+
+    @staticmethod
+    def setup_context(ctx, inputs, output):
+        """Stash what ``backward`` reads: input, weight and eps (not bias)."""
+        # ``inputs`` holds the arguments of ``forward`` in call order.
+        x, w, _bias, epsilon = inputs
+        ctx.save_for_backward(x, w)
+        ctx.eps = epsilon
+
+    @staticmethod
+    def backward(ctx, output_grad):
+        """Return grads for (input, weight, bias, eps); eps gets ``None``."""
+        x, w = ctx.saved_tensors
+        wants_x, wants_w, wants_b = ctx.needs_input_grad[:3]
+
+        # A buffer is allocated only for gradients autograd asked for;
+        # the others are handed to the op as ``None``.
+        grad_x = torch.empty_like(x) if wants_x else None
+        grad_w = torch.empty_like(w) if wants_w else None
+        grad_b = None
+        if wants_b:
+            grad_b = torch.empty(1, dtype=w.dtype, device=w.device)
+
+        ops.poly_norm_backward(grad_x, grad_w, grad_b, output_grad, x, w, ctx.eps)
+
+        return grad_x, grad_w, grad_b, None
diff --git a/build/torch27-cxx11-cu118-x86_64-linux/activation/rms_norm.py b/build/torch27-cxx11-cu118-x86_64-linux/activation/rms_norm.py
new file mode 100644
index 0000000000000000000000000000000000000000..53df35e855d5d1591bc6fceff50ba81afdb2c873
--- /dev/null
+++ b/build/torch27-cxx11-cu118-x86_64-linux/activation/rms_norm.py
@@ -0,0 +1,34 @@
+import torch
+
+from ._ops import ops
+
+
+# Inherit from Function
+class RMSNormFunction(torch.autograd.Function):
+    """Autograd function pairing ``ops.rms_norm`` with its backward op."""
+
+    @staticmethod
+    def forward(input, weight, eps):
+        """Allocate a buffer shaped like ``input`` and let the op fill it."""
+        result = torch.empty_like(input)
+        ops.rms_norm(result, input, weight, eps)
+        return result
+
+    @staticmethod
+    def setup_context(ctx, inputs, output):
+        """Stash what ``backward`` reads: the input, the weight and eps."""
+        x, w, epsilon = inputs
+        ctx.save_for_backward(x, w)
+        ctx.eps = epsilon
+
+    @staticmethod
+    def backward(ctx, output_grad):
+        """Return grads for (input, weight, eps); eps always gets ``None``."""
+        x, w = ctx.saved_tensors
+        wants_x, wants_w = ctx.needs_input_grad[:2]
+
+        # Only gradients autograd asked for get a buffer; others stay ``None``.
+        grad_x = torch.empty_like(x) if wants_x else None
+        grad_w = torch.empty_like(w) if wants_w else None
+        ops.rms_norm_backward(grad_x, grad_w, output_grad, x, w, ctx.eps)
+        return grad_x, grad_w, None
diff --git a/build/torch27-cxx11-cu126-x86_64-linux/activation/__init__.py b/build/torch27-cxx11-cu126-x86_64-linux/activation/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..fadce68f2fa0f130463f00a59c3436b822835e24
--- /dev/null
+++ b/build/torch27-cxx11-cu126-x86_64-linux/activation/__init__.py
@@ -0,0 +1,30 @@
+import torch
+
+from . import layers
+from ._ops import ops
+from .poly_norm import PolyNormFunction
+from .rms_norm import RMSNormFunction
+
+
+def poly_norm(
+    x: torch.Tensor, weight: torch.Tensor, bias: torch.Tensor, eps: float = 1e-6
+) -> torch.Tensor:
+    """Run ``PolyNormFunction`` on ``x`` with ``weight``, ``bias`` and ``eps``."""
+    # Return annotation fixed: the value of ``PolyNormFunction.apply`` is
+    # returned to the caller, so this function does not return ``None``.
+    return PolyNormFunction.apply(x, weight, bias, eps)
+
+
+def rms_norm(
+    x: torch.Tensor, weight: torch.Tensor, eps: float = 1e-6
+) -> torch.Tensor:
+    """Run ``RMSNormFunction`` on ``x`` with ``weight`` and ``eps``."""
+    # Annotated as a tensor: the result of ``apply`` is returned, not ``None``.
+    return RMSNormFunction.apply(x, weight, eps)
+
+
+# Public names of this package: the two functional entry points defined
+# above, the ``layers`` submodule and the ``ops`` object imported from
+# ``._ops``. ``rms_norm`` is listed so that ``import *`` exposes it
+# alongside ``poly_norm``.
+__all__ = ["poly_norm", "rms_norm", "layers", "ops"]
diff --git a/build/torch27-cxx11-cu126-x86_64-linux/activation/_activation_605f22e_dirty.abi3.so b/build/torch27-cxx11-cu126-x86_64-linux/activation/_activation_605f22e_dirty.abi3.so
new file mode 100644
index 0000000000000000000000000000000000000000..1d4298b396205534b39df4f6014a0cab01c8ac06
--- /dev/null
+++ b/build/torch27-cxx11-cu126-x86_64-linux/activation/_activation_605f22e_dirty.abi3.so
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:eaf16ec2d17feb812fd485f168fbdd938880122d882edcdfb7fd8efcf3ed77af
+size 2994736
diff --git a/build/torch27-cxx11-cu126-x86_64-linux/activation/_ops.py b/build/torch27-cxx11-cu126-x86_64-linux/activation/_ops.py
new file mode 100644
index 0000000000000000000000000000000000000000..4f65fa5de7bb717f07527d6846085f48d18b7e1d
--- /dev/null
+++ b/build/torch27-cxx11-cu126-x86_64-linux/activation/_ops.py
@@ -0,0 +1,9 @@
+import torch
+from . import _activation_605f22e_dirty
+ops = torch.ops._activation_605f22e_dirty
+
+def add_op_namespace_prefix(op_name: str):
+    """Return ``op_name`` qualified as ``<namespace>::<op_name>``."""
+    namespace = "_activation_605f22e_dirty"
+    # The namespace matches the extension module name used for this build.
+    return f"{namespace}::{op_name}"
\ No newline at end of file
diff --git a/build/torch27-cxx11-cu126-x86_64-linux/activation/layers.py b/build/torch27-cxx11-cu126-x86_64-linux/activation/layers.py
new file mode 100644
index 0000000000000000000000000000000000000000..3824e5de50583f385215cf90adc03aff91653e2e
--- /dev/null
+++ b/build/torch27-cxx11-cu126-x86_64-linux/activation/layers.py
@@ -0,0 +1,46 @@
+import torch
+import torch.nn as nn
+from torch.nn import init
+
+from .poly_norm import PolyNormFunction
+from .rms_norm import RMSNormFunction
+
+
+class PolyNorm(nn.Module):
+    """Module wrapper around ``PolyNormFunction`` with 3 weights and 1 bias."""
+
+    def __init__(self, eps=1e-6, dtype: torch.dtype = torch.float32):
+        super().__init__()
+        self.weight = torch.nn.Parameter(torch.ones(3, dtype=dtype) / 3)
+        self.bias = torch.nn.Parameter(torch.zeros(1, dtype=dtype))
+        self.eps = eps
+
+    def forward(self, x: torch.Tensor):
+        """Apply ``PolyNormFunction`` to ``x`` using the module's parameters."""
+        return PolyNormFunction.apply(x, self.weight, self.bias, self.eps)
+
+    def reset_parameters(self) -> None:
+        """Restore the values assigned in ``__init__``: weight 1/3, bias 0."""
+        # ``__init__`` creates the weight as ones / 3, so refill with 1/3
+        # rather than 1 to keep the reset consistent with construction.
+        init.constant_(self.weight, 1.0 / 3)
+        init.zeros_(self.bias)
+
+
+class RMSNorm(nn.Module):
+    """Module wrapper around ``RMSNormFunction`` holding a ``dim``-sized weight."""
+
+    def __init__(self, dim: int, eps=1e-6, dtype: torch.dtype = torch.float32):
+        super().__init__()
+        initial_weight = torch.ones(dim, dtype=dtype)
+        self.weight = nn.Parameter(initial_weight)
+        self.eps = eps
+
+    def forward(self, x: torch.Tensor):
+        """Apply ``RMSNormFunction`` to ``x`` with the stored weight and eps."""
+        weight, eps = self.weight, self.eps
+        return RMSNormFunction.apply(x, weight, eps)
+
+    def reset_parameters(self) -> None:
+        """Fill the weight with ones, the value it is given in ``__init__``."""
+        init.ones_(self.weight)
diff --git a/build/torch27-cxx11-cu126-x86_64-linux/activation/poly_norm.py b/build/torch27-cxx11-cu126-x86_64-linux/activation/poly_norm.py
new file mode 100644
index 0000000000000000000000000000000000000000..ce14e5cd8078de06d964775e34e1e668df32493e
--- /dev/null
+++ b/build/torch27-cxx11-cu126-x86_64-linux/activation/poly_norm.py
@@ -0,0 +1,41 @@
+import torch
+
+from ._ops import ops
+
+
+# Inherit from Function
+class PolyNormFunction(torch.autograd.Function):
+    """Autograd function pairing ``ops.poly_norm`` with its backward op."""
+
+    @staticmethod
+    def forward(input, weight, bias, eps):
+        """Allocate a buffer shaped like ``input`` and let the op fill it."""
+        result = torch.empty_like(input)
+        ops.poly_norm(result, input, weight, bias, eps)
+        return result
+
+    @staticmethod
+    def setup_context(ctx, inputs, output):
+        """Stash what ``backward`` reads: input, weight and eps (not bias)."""
+        # ``inputs`` holds the arguments of ``forward`` in call order.
+        x, w, _bias, epsilon = inputs
+        ctx.save_for_backward(x, w)
+        ctx.eps = epsilon
+
+    @staticmethod
+    def backward(ctx, output_grad):
+        """Return grads for (input, weight, bias, eps); eps gets ``None``."""
+        x, w = ctx.saved_tensors
+        wants_x, wants_w, wants_b = ctx.needs_input_grad[:3]
+
+        # A buffer is allocated only for gradients autograd asked for;
+        # the others are handed to the op as ``None``.
+        grad_x = torch.empty_like(x) if wants_x else None
+        grad_w = torch.empty_like(w) if wants_w else None
+        grad_b = None
+        if wants_b:
+            grad_b = torch.empty(1, dtype=w.dtype, device=w.device)
+
+        ops.poly_norm_backward(grad_x, grad_w, grad_b, output_grad, x, w, ctx.eps)
+
+        return grad_x, grad_w, grad_b, None
diff --git a/build/torch27-cxx11-cu126-x86_64-linux/activation/rms_norm.py b/build/torch27-cxx11-cu126-x86_64-linux/activation/rms_norm.py
new file mode 100644
index 0000000000000000000000000000000000000000..53df35e855d5d1591bc6fceff50ba81afdb2c873
--- /dev/null
+++ b/build/torch27-cxx11-cu126-x86_64-linux/activation/rms_norm.py
@@ -0,0 +1,34 @@
+import torch
+
+from ._ops import ops
+
+
+# Inherit from Function
+class RMSNormFunction(torch.autograd.Function):
+    """Autograd function pairing ``ops.rms_norm`` with its backward op."""
+
+    @staticmethod
+    def forward(input, weight, eps):
+        """Allocate a buffer shaped like ``input`` and let the op fill it."""
+        result = torch.empty_like(input)
+        ops.rms_norm(result, input, weight, eps)
+        return result
+
+    @staticmethod
+    def setup_context(ctx, inputs, output):
+        """Stash what ``backward`` reads: the input, the weight and eps."""
+        x, w, epsilon = inputs
+        ctx.save_for_backward(x, w)
+        ctx.eps = epsilon
+
+    @staticmethod
+    def backward(ctx, output_grad):
+        """Return grads for (input, weight, eps); eps always gets ``None``."""
+        x, w = ctx.saved_tensors
+        wants_x, wants_w = ctx.needs_input_grad[:2]
+
+        # Only gradients autograd asked for get a buffer; others stay ``None``.
+        grad_x = torch.empty_like(x) if wants_x else None
+        grad_w = torch.empty_like(w) if wants_w else None
+        ops.rms_norm_backward(grad_x, grad_w, output_grad, x, w, ctx.eps)
+        return grad_x, grad_w, None
diff --git a/build/torch27-cxx11-cu128-x86_64-linux/activation/__init__.py b/build/torch27-cxx11-cu128-x86_64-linux/activation/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..fadce68f2fa0f130463f00a59c3436b822835e24
--- /dev/null
+++ b/build/torch27-cxx11-cu128-x86_64-linux/activation/__init__.py
@@ -0,0 +1,30 @@
+import torch
+
+from . import layers
+from ._ops import ops
+from .poly_norm import PolyNormFunction
+from .rms_norm import RMSNormFunction
+
+
+def poly_norm(
+    x: torch.Tensor, weight: torch.Tensor, bias: torch.Tensor, eps: float = 1e-6
+) -> torch.Tensor:
+    """Run ``PolyNormFunction`` on ``x`` with ``weight``, ``bias`` and ``eps``."""
+    # Return annotation fixed: the value of ``PolyNormFunction.apply`` is
+    # returned to the caller, so this function does not return ``None``.
+    return PolyNormFunction.apply(x, weight, bias, eps)
+
+
+def rms_norm(
+    x: torch.Tensor, weight: torch.Tensor, eps: float = 1e-6
+) -> torch.Tensor:
+    """Run ``RMSNormFunction`` on ``x`` with ``weight`` and ``eps``."""
+    # Annotated as a tensor: the result of ``apply`` is returned, not ``None``.
+    return RMSNormFunction.apply(x, weight, eps)
+
+
+# Public names of this package: the two functional entry points defined
+# above, the ``layers`` submodule and the ``ops`` object imported from
+# ``._ops``. ``rms_norm`` is listed so that ``import *`` exposes it
+# alongside ``poly_norm``.
+__all__ = ["poly_norm", "rms_norm", "layers", "ops"]
diff --git a/build/torch27-cxx11-cu128-x86_64-linux/activation/_activation_605f22e_dirty.abi3.so b/build/torch27-cxx11-cu128-x86_64-linux/activation/_activation_605f22e_dirty.abi3.so
new file mode 100644
index 0000000000000000000000000000000000000000..e893ea914b71ecdba19784492df7b62bca7d20d0
--- /dev/null
+++ b/build/torch27-cxx11-cu128-x86_64-linux/activation/_activation_605f22e_dirty.abi3.so
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:4188bd38f2c4d2b19a4a79e2234456fe29c28da064269abc0409de11c725f831
+size 3909704
diff --git a/build/torch27-cxx11-cu128-x86_64-linux/activation/_ops.py b/build/torch27-cxx11-cu128-x86_64-linux/activation/_ops.py
new file mode 100644
index 0000000000000000000000000000000000000000..4f65fa5de7bb717f07527d6846085f48d18b7e1d
--- /dev/null
+++ b/build/torch27-cxx11-cu128-x86_64-linux/activation/_ops.py
@@ -0,0 +1,9 @@
+import torch
+from . import _activation_605f22e_dirty
+ops = torch.ops._activation_605f22e_dirty
+
+def add_op_namespace_prefix(op_name: str):
+    """Return ``op_name`` qualified as ``<namespace>::<op_name>``."""
+    namespace = "_activation_605f22e_dirty"
+    # The namespace matches the extension module name used for this build.
+    return f"{namespace}::{op_name}"
\ No newline at end of file
diff --git a/build/torch27-cxx11-cu128-x86_64-linux/activation/layers.py b/build/torch27-cxx11-cu128-x86_64-linux/activation/layers.py
new file mode 100644
index 0000000000000000000000000000000000000000..3824e5de50583f385215cf90adc03aff91653e2e
--- /dev/null
+++ b/build/torch27-cxx11-cu128-x86_64-linux/activation/layers.py
@@ -0,0 +1,46 @@
+import torch
+import torch.nn as nn
+from torch.nn import init
+
+from .poly_norm import PolyNormFunction
+from .rms_norm import RMSNormFunction
+
+
+class PolyNorm(nn.Module):
+    """Module wrapper around ``PolyNormFunction`` with 3 weights and 1 bias."""
+
+    def __init__(self, eps=1e-6, dtype: torch.dtype = torch.float32):
+        super().__init__()
+        self.weight = torch.nn.Parameter(torch.ones(3, dtype=dtype) / 3)
+        self.bias = torch.nn.Parameter(torch.zeros(1, dtype=dtype))
+        self.eps = eps
+
+    def forward(self, x: torch.Tensor):
+        """Apply ``PolyNormFunction`` to ``x`` using the module's parameters."""
+        return PolyNormFunction.apply(x, self.weight, self.bias, self.eps)
+
+    def reset_parameters(self) -> None:
+        """Restore the values assigned in ``__init__``: weight 1/3, bias 0."""
+        # ``__init__`` creates the weight as ones / 3, so refill with 1/3
+        # rather than 1 to keep the reset consistent with construction.
+        init.constant_(self.weight, 1.0 / 3)
+        init.zeros_(self.bias)
+
+
+class RMSNorm(nn.Module):
+    """Module wrapper around ``RMSNormFunction`` holding a ``dim``-sized weight."""
+
+    def __init__(self, dim: int, eps=1e-6, dtype: torch.dtype = torch.float32):
+        super().__init__()
+        initial_weight = torch.ones(dim, dtype=dtype)
+        self.weight = nn.Parameter(initial_weight)
+        self.eps = eps
+
+    def forward(self, x: torch.Tensor):
+        """Apply ``RMSNormFunction`` to ``x`` with the stored weight and eps."""
+        weight, eps = self.weight, self.eps
+        return RMSNormFunction.apply(x, weight, eps)
+
+    def reset_parameters(self) -> None:
+        """Fill the weight with ones, the value it is given in ``__init__``."""
+        init.ones_(self.weight)
diff --git a/build/torch27-cxx11-cu128-x86_64-linux/activation/poly_norm.py b/build/torch27-cxx11-cu128-x86_64-linux/activation/poly_norm.py
new file mode 100644
index 0000000000000000000000000000000000000000..ce14e5cd8078de06d964775e34e1e668df32493e
--- /dev/null
+++ b/build/torch27-cxx11-cu128-x86_64-linux/activation/poly_norm.py
@@ -0,0 +1,41 @@
+import torch
+
+from ._ops import ops
+
+
+# Inherit from Function
+class PolyNormFunction(torch.autograd.Function):
+    """Autograd function pairing ``ops.poly_norm`` with its backward op."""
+
+    @staticmethod
+    def forward(input, weight, bias, eps):
+        """Allocate a buffer shaped like ``input`` and let the op fill it."""
+        result = torch.empty_like(input)
+        ops.poly_norm(result, input, weight, bias, eps)
+        return result
+
+    @staticmethod
+    def setup_context(ctx, inputs, output):
+        """Stash what ``backward`` reads: input, weight and eps (not bias)."""
+        # ``inputs`` holds the arguments of ``forward`` in call order.
+        x, w, _bias, epsilon = inputs
+        ctx.save_for_backward(x, w)
+        ctx.eps = epsilon
+
+    @staticmethod
+    def backward(ctx, output_grad):
+        """Return grads for (input, weight, bias, eps); eps gets ``None``."""
+        x, w = ctx.saved_tensors
+        wants_x, wants_w, wants_b = ctx.needs_input_grad[:3]
+
+        # A buffer is allocated only for gradients autograd asked for;
+        # the others are handed to the op as ``None``.
+        grad_x = torch.empty_like(x) if wants_x else None
+        grad_w = torch.empty_like(w) if wants_w else None
+        grad_b = None
+        if wants_b:
+            grad_b = torch.empty(1, dtype=w.dtype, device=w.device)
+
+        ops.poly_norm_backward(grad_x, grad_w, grad_b, output_grad, x, w, ctx.eps)
+
+        return grad_x, grad_w, grad_b, None
diff --git a/build/torch27-cxx11-cu128-x86_64-linux/activation/rms_norm.py b/build/torch27-cxx11-cu128-x86_64-linux/activation/rms_norm.py
new file mode 100644
index 0000000000000000000000000000000000000000..53df35e855d5d1591bc6fceff50ba81afdb2c873
--- /dev/null
+++ b/build/torch27-cxx11-cu128-x86_64-linux/activation/rms_norm.py
@@ -0,0 +1,34 @@
+import torch
+
+from ._ops import ops
+
+
+# Custom autograd Function whose forward and backward call the ops from ._ops.
+class RMSNormFunction(torch.autograd.Function):
+    # forward, setup_context, and backward are all declared as @staticmethods.
+    @staticmethod
+    def forward(input, weight, eps):
+        output = torch.empty_like(input)
+        ops.rms_norm(output, input, weight, eps)
+        return output
+
+    @staticmethod
+    # inputs is the tuple of arguments passed to forward(); output is its result.
+    # input and weight are saved as tensors; eps is kept as a ctx attribute.
+    def setup_context(ctx, inputs, output):
+        input, weight, eps = inputs
+        ctx.save_for_backward(input, weight)
+        ctx.eps = eps
+
+    # One forward output, so one incoming gradient; returns a slot per forward arg.
+    @staticmethod
+    def backward(ctx, output_grad):
+        input, weight = ctx.saved_tensors
+        eps = ctx.eps
+
+        input_grad = torch.empty_like(input) if ctx.needs_input_grad[0] else None
+        weight_grad = torch.empty_like(weight) if ctx.needs_input_grad[1] else None
+        # NOTE(review): unneeded buffers are passed as None; confirm the op accepts it.
+        ops.rms_norm_backward(input_grad, weight_grad, output_grad, input, weight, eps)
+
+        return input_grad, weight_grad, None
diff --git a/build/torch27-cxx11-rocm63-x86_64-linux/activation/__init__.py b/build/torch27-cxx11-rocm63-x86_64-linux/activation/__init__.py
old mode 100755
new mode 100644
diff --git a/build/torch27-cxx11-rocm63-x86_64-linux/activation/_activation_605f22e_dirty.abi3.so b/build/torch27-cxx11-rocm63-x86_64-linux/activation/_activation_605f22e_dirty.abi3.so
new file mode 100644
index 0000000000000000000000000000000000000000..d2a82f4c55f0b615552bf5c38e7ee20a868b9f1f
--- /dev/null
+++ b/build/torch27-cxx11-rocm63-x86_64-linux/activation/_activation_605f22e_dirty.abi3.so
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:90da5f30c084a0316266d4886a052721347d40e775f03b6033894de5890331bf
+size 2642880
diff --git a/build/torch27-cxx11-rocm63-x86_64-linux/activation/_activation_f3b99fb_dirty.abi3.so b/build/torch27-cxx11-rocm63-x86_64-linux/activation/_activation_f3b99fb_dirty.abi3.so
deleted file mode 100755
index 53539f8f957fd58af470c22266313642409f6948..0000000000000000000000000000000000000000
--- a/build/torch27-cxx11-rocm63-x86_64-linux/activation/_activation_f3b99fb_dirty.abi3.so
+++ /dev/null
@@ -1,3 +0,0 @@
-version https://git-lfs.github.com/spec/v1
-oid sha256:ad289cf495aa7bcb7318535f2d76a6543bd44827369ec358ff7411e182ce089f
-size 2642736
diff --git a/build/torch27-cxx11-rocm63-x86_64-linux/activation/_ops.py b/build/torch27-cxx11-rocm63-x86_64-linux/activation/_ops.py
old mode 100755
new mode 100644
index acb0ec4991085b2b1b7ddcfc061eb8c8e27533ca..4f65fa5de7bb717f07527d6846085f48d18b7e1d
--- a/build/torch27-cxx11-rocm63-x86_64-linux/activation/_ops.py
+++ b/build/torch27-cxx11-rocm63-x86_64-linux/activation/_ops.py
@@ -1,9 +1,9 @@
 import torch
-from . import _activation_f3b99fb_dirty
-ops = torch.ops._activation_f3b99fb_dirty
+from . import _activation_605f22e_dirty
+ops = torch.ops._activation_605f22e_dirty
 
 def add_op_namespace_prefix(op_name: str):
     """
     Prefix op by namespace.
     """
-    return f"_activation_f3b99fb_dirty::{op_name}"
\ No newline at end of file
+    return f"_activation_605f22e_dirty::{op_name}"
\ No newline at end of file
diff --git a/build/torch27-cxx11-rocm63-x86_64-linux/activation/layers.py b/build/torch27-cxx11-rocm63-x86_64-linux/activation/layers.py
old mode 100755
new mode 100644
diff --git a/build/torch27-cxx11-rocm63-x86_64-linux/activation/poly_norm.py b/build/torch27-cxx11-rocm63-x86_64-linux/activation/poly_norm.py
old mode 100755
new mode 100644
diff --git a/build/torch27-cxx11-rocm63-x86_64-linux/activation/rms_norm.py b/build/torch27-cxx11-rocm63-x86_64-linux/activation/rms_norm.py
old mode 100755
new mode 100644
diff --git a/flake.lock b/flake.lock
deleted file mode 100644
index 4fa3e9a2e465daa852b90bc10e0c14b442b53b12..0000000000000000000000000000000000000000
--- a/flake.lock
+++ /dev/null
@@ -1,168 +0,0 @@
-{
-  "nodes": {
-    "flake-compat": {
-      "locked": {
-        "lastModified": 1747046372,
-        "narHash": "sha256-CIVLLkVgvHYbgI2UpXvIIBJ12HWgX+fjA8Xf8PUmqCY=",
-        "owner": "edolstra",
-        "repo": "flake-compat",
-        "rev": "9100a0f413b0c601e0533d1d94ffd501ce2e7885",
-        "type": "github"
-      },
-      "original": {
-        "owner": "edolstra",
-        "repo": "flake-compat",
-        "type": "github"
-      }
-    },
-    "flake-compat_2": {
-      "locked": {
-        "lastModified": 1733328505,
-        "narHash": "sha256-NeCCThCEP3eCl2l/+27kNNK7QrwZB1IJCrXfrbv5oqU=",
-        "owner": "edolstra",
-        "repo": "flake-compat",
-        "rev": "ff81ac966bb2cae68946d5ed5fc4994f96d0ffec",
-        "type": "github"
-      },
-      "original": {
-        "owner": "edolstra",
-        "repo": "flake-compat",
-        "type": "github"
-      }
-    },
-    "flake-utils": {
-      "inputs": {
-        "systems": "systems"
-      },
-      "locked": {
-        "lastModified": 1731533236,
-        "narHash": "sha256-l0KFg5HjrsfsO/JpG+r7fRrqm12kzFHyUHqHCVpMMbI=",
-        "owner": "numtide",
-        "repo": "flake-utils",
-        "rev": "11707dc2f618dd54ca8739b309ec4fc024de578b",
-        "type": "github"
-      },
-      "original": {
-        "owner": "numtide",
-        "repo": "flake-utils",
-        "type": "github"
-      }
-    },
-    "flake-utils_2": {
-      "inputs": {
-        "systems": "systems_2"
-      },
-      "locked": {
-        "lastModified": 1731533236,
-        "narHash": "sha256-l0KFg5HjrsfsO/JpG+r7fRrqm12kzFHyUHqHCVpMMbI=",
-        "owner": "numtide",
-        "repo": "flake-utils",
-        "rev": "11707dc2f618dd54ca8739b309ec4fc024de578b",
-        "type": "github"
-      },
-      "original": {
-        "owner": "numtide",
-        "repo": "flake-utils",
-        "type": "github"
-      }
-    },
-    "hf-nix": {
-      "inputs": {
-        "flake-compat": "flake-compat_2",
-        "flake-utils": "flake-utils_2",
-        "nixpkgs": "nixpkgs"
-      },
-      "locked": {
-        "lastModified": 1747919133,
-        "narHash": "sha256-VvF1naQOvv7yulQ5/cDiaxkNxlh1Y84QMZnderv1szk=",
-        "owner": "huggingface",
-        "repo": "hf-nix",
-        "rev": "9c71e026d6c7c8588ef85a5f7c77f57d598e038c",
-        "type": "github"
-      },
-      "original": {
-        "owner": "huggingface",
-        "repo": "hf-nix",
-        "type": "github"
-      }
-    },
-    "kernel-builder": {
-      "inputs": {
-        "flake-compat": "flake-compat",
-        "flake-utils": "flake-utils",
-        "hf-nix": "hf-nix",
-        "nixpkgs": [
-          "kernel-builder",
-          "hf-nix",
-          "nixpkgs"
-        ]
-      },
-      "locked": {
-        "lastModified": 1748620233,
-        "narHash": "sha256-VULm9HgGXvo3pyfsPy3SOhoqgkuqbGSaSemvzNUbdIU=",
-        "owner": "huggingface",
-        "repo": "kernel-builder",
-        "rev": "da3340e5b3cbb6086600420f4814b033395788d1",
-        "type": "github"
-      },
-      "original": {
-        "owner": "huggingface",
-        "repo": "kernel-builder",
-        "type": "github"
-      }
-    },
-    "nixpkgs": {
-      "locked": {
-        "lastModified": 1747820358,
-        "narHash": "sha256-fTqsZsUX6M3yeEvgyQvXcbGmT2CaRVyVwsi8eK29Oj4=",
-        "owner": "danieldk",
-        "repo": "nixpkgs",
-        "rev": "d3c1681180717528068082103bf323147de6ab0b",
-        "type": "github"
-      },
-      "original": {
-        "owner": "danieldk",
-        "ref": "cudatoolkit-12.9-kernel-builder",
-        "repo": "nixpkgs",
-        "type": "github"
-      }
-    },
-    "root": {
-      "inputs": {
-        "kernel-builder": "kernel-builder"
-      }
-    },
-    "systems": {
-      "locked": {
-        "lastModified": 1681028828,
-        "narHash": "sha256-Vy1rq5AaRuLzOxct8nz4T6wlgyUR7zLU309k9mBC768=",
-        "owner": "nix-systems",
-        "repo": "default",
-        "rev": "da67096a3b9bf56a91d16901293e51ba5b49a27e",
-        "type": "github"
-      },
-      "original": {
-        "owner": "nix-systems",
-        "repo": "default",
-        "type": "github"
-      }
-    },
-    "systems_2": {
-      "locked": {
-        "lastModified": 1681028828,
-        "narHash": "sha256-Vy1rq5AaRuLzOxct8nz4T6wlgyUR7zLU309k9mBC768=",
-        "owner": "nix-systems",
-        "repo": "default",
-        "rev": "da67096a3b9bf56a91d16901293e51ba5b49a27e",
-        "type": "github"
-      },
-      "original": {
-        "owner": "nix-systems",
-        "repo": "default",
-        "type": "github"
-      }
-    }
-  },
-  "root": "root",
-  "version": 7
-}