diff --git a/build/torch26-cxx11-cu118-x86_64-linux/activation/_activation_be5bedb.abi3.so b/build/torch26-cxx11-cu118-x86_64-linux/activation/_activation_be5bedb.abi3.so deleted file mode 100755 index c1e52a91b4fa56b4ff39c854b33497b094135599..0000000000000000000000000000000000000000 --- a/build/torch26-cxx11-cu118-x86_64-linux/activation/_activation_be5bedb.abi3.so +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:9b6ba32ecc6fc898df3b0cebee85e9afc6881749fe58142280f051ca3332d913 -size 2546864 diff --git a/build/torch26-cxx11-cu118-x86_64-linux/activation/_ops.py b/build/torch26-cxx11-cu118-x86_64-linux/activation/_ops.py deleted file mode 100644 index 0110324ade19f59f705c61d5c21912c958e92e96..0000000000000000000000000000000000000000 --- a/build/torch26-cxx11-cu118-x86_64-linux/activation/_ops.py +++ /dev/null @@ -1,9 +0,0 @@ -import torch -from . import _activation_be5bedb -ops = torch.ops._activation_be5bedb - -def add_op_namespace_prefix(op_name: str): - """ - Prefix op by namespace. - """ - return f"_activation_be5bedb::{op_name}" \ No newline at end of file diff --git a/build/torch26-cxx11-cu118-x86_64-linux/activation/layers.py b/build/torch26-cxx11-cu118-x86_64-linux/activation/layers.py deleted file mode 100644 index 45b31181ffb80509a85d729a7f7ee86fc2cf014a..0000000000000000000000000000000000000000 --- a/build/torch26-cxx11-cu118-x86_64-linux/activation/layers.py +++ /dev/null @@ -1,128 +0,0 @@ -import torch -import torch.nn as nn - -from ._ops import ops - - -class SiluAndMul(nn.Module): - """An activation function for SwiGLU. - - The function computes x -> silu(x[:d]) * x[d:] where d = x.shape[-1] // 2. - - Shapes: - x: (num_tokens, 2 * d) or (batch_size, seq_len, 2 * d) - return: (num_tokens, d) or (batch_size, seq_len, d) - """ - - can_torch_compile: bool = True - - def forward(self, x: torch.Tensor): - d = x.shape[-1] // 2 - output_shape = x.shape[:-1] + (d,) - out = torch.empty(output_shape, dtype=x.dtype, device=x.device) - ops.silu_and_mul(out, x) - return out - - -class MulAndSilu(nn.Module): - """An activation function for SwiGLU. - - The function computes x -> x[:d] * silu(x[d:]) where d = x.shape[-1] // 2. - - Shapes: - x: (num_tokens, 2 * d) or (batch_size, seq_len, 2 * d) - return: (num_tokens, d) or (batch_size, seq_len, d) - """ - - can_torch_compile: bool = True - - def forward(self, x: torch.Tensor) -> torch.Tensor: - d = x.shape[-1] // 2 - output_shape = x.shape[:-1] + (d,) - out = torch.empty(output_shape, dtype=x.dtype, device=x.device) - ops.mul_and_silu(out, x) - return out - - -class GeluAndMul(nn.Module): - """An activation function for GeGLU. - - The function computes x -> GELU(x[:d]) * x[d:] where d = x.shape[-1] // 2. - - Shapes: - x: (batch_size, seq_len, 2 * d) or (num_tokens, 2 * d) - return: (batch_size, seq_len, d) or (num_tokens, d) - """ - - can_torch_compile: bool = True - - def forward(self, x: torch.Tensor): - d = x.shape[-1] // 2 - output_shape = x.shape[:-1] + (d,) - out = torch.empty(output_shape, dtype=x.dtype, device=x.device) - ops.gelu_and_mul(out, x) - return out - - -class GeluTanhAndMul(nn.Module): - can_torch_compile: bool = True - - def forward(self, x: torch.Tensor): - d = x.shape[-1] // 2 - output_shape = x.shape[:-1] + (d,) - out = torch.empty(output_shape, dtype=x.dtype, device=x.device) - ops.gelu_tanh_and_mul(out, x) - return out - - -class FatreluAndMul(nn.Module): - """An activation function for FATReLU. - - The function computes x -> FATReLU(x[:d]) * x[d:] where - d = x.shape[-1] // 2. 
- This is used in openbmb/MiniCPM-S-1B-sft. - - Shapes: - x: (num_tokens, 2 * d) or (batch_size, seq_len, 2 * d) - return: (num_tokens, d) or (batch_size, seq_len, d) - """ - - can_torch_compile: bool = True - - def __init__(self, threshold: float = 0.0): - super().__init__() - self.threshold = threshold - - def forward(self, x: torch.Tensor): - d = x.shape[-1] // 2 - output_shape = x.shape[:-1] + (d,) - out = torch.empty(output_shape, dtype=x.dtype, device=x.device) - ops.fatrelu_and_mul(out, x, self.threshold) - return out - - -class FastGELU(nn.Module): - can_torch_compile: bool = True - - def forward(self, x: torch.Tensor) -> torch.Tensor: - out = torch.empty_like(x) - ops.gelu_fast(out, x) - return out - - -class NewGELU(nn.Module): - can_torch_compile: bool = True - - def forward(self, x: torch.Tensor) -> torch.Tensor: - out = torch.empty_like(x) - ops.gelu_new(out, x) - return out - - -class QuickGELU(nn.Module): - can_torch_compile: bool = True - - def forward(self, x: torch.Tensor) -> torch.Tensor: - out = torch.empty_like(x) - ops.gelu_quick(out, x) - return out diff --git a/build/torch26-cxx11-cu124-x86_64-linux/activation/_activation_be5bedb.abi3.so b/build/torch26-cxx11-cu124-x86_64-linux/activation/_activation_be5bedb.abi3.so deleted file mode 100755 index f45a6ffcf3f11e3b24919496e213a61acb258d2a..0000000000000000000000000000000000000000 --- a/build/torch26-cxx11-cu124-x86_64-linux/activation/_activation_be5bedb.abi3.so +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:331dcb3900d5e47a11d3577cdbac54f15a0b6e14910239293323c1d9e4eb9f49 -size 2616928 diff --git a/build/torch26-cxx11-cu124-x86_64-linux/activation/_ops.py b/build/torch26-cxx11-cu124-x86_64-linux/activation/_ops.py deleted file mode 100644 index 0110324ade19f59f705c61d5c21912c958e92e96..0000000000000000000000000000000000000000 --- a/build/torch26-cxx11-cu124-x86_64-linux/activation/_ops.py +++ /dev/null @@ -1,9 +0,0 @@ -import torch -from . import _activation_be5bedb -ops = torch.ops._activation_be5bedb - -def add_op_namespace_prefix(op_name: str): - """ - Prefix op by namespace. - """ - return f"_activation_be5bedb::{op_name}" \ No newline at end of file diff --git a/build/torch26-cxx11-cu126-x86_64-linux/activation/_activation_be5bedb.abi3.so b/build/torch26-cxx11-cu126-x86_64-linux/activation/_activation_be5bedb.abi3.so deleted file mode 100755 index 12f5777398872e7a3d93ab936e42ade8eeec3213..0000000000000000000000000000000000000000 --- a/build/torch26-cxx11-cu126-x86_64-linux/activation/_activation_be5bedb.abi3.so +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:1ce11492b9675a44afb3b896ed80e425f2a47e29481c4aad9c4a6ac59520f011 -size 2621472 diff --git a/build/torch26-cxx11-cu126-x86_64-linux/activation/_ops.py b/build/torch26-cxx11-cu126-x86_64-linux/activation/_ops.py deleted file mode 100644 index 0110324ade19f59f705c61d5c21912c958e92e96..0000000000000000000000000000000000000000 --- a/build/torch26-cxx11-cu126-x86_64-linux/activation/_ops.py +++ /dev/null @@ -1,9 +0,0 @@ -import torch -from . import _activation_be5bedb -ops = torch.ops._activation_be5bedb - -def add_op_namespace_prefix(op_name: str): - """ - Prefix op by namespace. 
- """ - return f"_activation_be5bedb::{op_name}" \ No newline at end of file diff --git a/build/torch26-cxx98-cu118-x86_64-linux/activation/__init__.py b/build/torch26-cxx98-cu118-x86_64-linux/activation/__init__.py deleted file mode 100644 index 1c4f207354093c6ef83eb5d7f3a5a3b22b95d357..0000000000000000000000000000000000000000 --- a/build/torch26-cxx98-cu118-x86_64-linux/activation/__init__.py +++ /dev/null @@ -1,57 +0,0 @@ -import torch - -from ._ops import ops - -from . import layers - - -def silu_and_mul(out: torch.Tensor, x: torch.Tensor) -> None: - ops.silu_and_mul(out, x) - return out - - -def mul_and_silu(out: torch.Tensor, x: torch.Tensor) -> None: - ops.mul_and_silu(out, x) - return out - - -def gelu_and_mul(out: torch.Tensor, x: torch.Tensor) -> None: - ops.gelu_and_mul(out, x) - return out - - -def gelu_tanh_and_mul(out: torch.Tensor, x: torch.Tensor) -> None: - ops.gelu_tanh_and_mul(out, x) - return out - - -def fatrelu_and_mul(out: torch.Tensor, x: torch.Tensor, threshold: float = 0.0) -> None: - ops.fatrelu_and_mul(out, x, threshold) - return out - - -def gelu_fast(out: torch.Tensor, x: torch.Tensor) -> None: - ops.gelu_fast(out, x) - return out - - -def gelu_new(out: torch.Tensor, x: torch.Tensor) -> None: - ops.gelu_new(out, x) - return out - - -def gelu_quick(out: torch.Tensor, x: torch.Tensor) -> None: - ops.gelu_quick(out, x) - return out - - -__all__ = [ - "silu_and_mul", - "gelu_and_mul", - "gelu_tanh_and_mul", - "fatrelu_and_mul", - "gelu_fast", - "gelu_new", - "gelu_quick", - "layers", -] diff --git a/build/torch26-cxx98-cu118-x86_64-linux/activation/_activation_be5bedb.abi3.so b/build/torch26-cxx98-cu118-x86_64-linux/activation/_activation_be5bedb.abi3.so deleted file mode 100755 index 056de26936949cc36baf3caa9c4212d730da81f7..0000000000000000000000000000000000000000 --- a/build/torch26-cxx98-cu118-x86_64-linux/activation/_activation_be5bedb.abi3.so +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:434bd1ae43b7cbdb10d86b82da9a237ec05ef9d9fb4fc15cdc9096d3d5ed3fa7 -size 2539352 diff --git a/build/torch26-cxx98-cu118-x86_64-linux/activation/_ops.py b/build/torch26-cxx98-cu118-x86_64-linux/activation/_ops.py deleted file mode 100644 index 0110324ade19f59f705c61d5c21912c958e92e96..0000000000000000000000000000000000000000 --- a/build/torch26-cxx98-cu118-x86_64-linux/activation/_ops.py +++ /dev/null @@ -1,9 +0,0 @@ -import torch -from . import _activation_be5bedb -ops = torch.ops._activation_be5bedb - -def add_op_namespace_prefix(op_name: str): - """ - Prefix op by namespace. - """ - return f"_activation_be5bedb::{op_name}" \ No newline at end of file diff --git a/build/torch26-cxx98-cu124-x86_64-linux/activation/__init__.py b/build/torch26-cxx98-cu124-x86_64-linux/activation/__init__.py deleted file mode 100644 index 1c4f207354093c6ef83eb5d7f3a5a3b22b95d357..0000000000000000000000000000000000000000 --- a/build/torch26-cxx98-cu124-x86_64-linux/activation/__init__.py +++ /dev/null @@ -1,57 +0,0 @@ -import torch - -from ._ops import ops - -from . 
import layers - - -def silu_and_mul(out: torch.Tensor, x: torch.Tensor) -> None: - ops.silu_and_mul(out, x) - return out - - -def mul_and_silu(out: torch.Tensor, x: torch.Tensor) -> None: - ops.mul_and_silu(out, x) - return out - - -def gelu_and_mul(out: torch.Tensor, x: torch.Tensor) -> None: - ops.gelu_and_mul(out, x) - return out - - -def gelu_tanh_and_mul(out: torch.Tensor, x: torch.Tensor) -> None: - ops.gelu_tanh_and_mul(out, x) - return out - - -def fatrelu_and_mul(out: torch.Tensor, x: torch.Tensor, threshold: float = 0.0) -> None: - ops.fatrelu_and_mul(out, x, threshold) - return out - - -def gelu_fast(out: torch.Tensor, x: torch.Tensor) -> None: - ops.gelu_fast(out, x) - return out - - -def gelu_new(out: torch.Tensor, x: torch.Tensor) -> None: - ops.gelu_new(out, x) - return out - - -def gelu_quick(out: torch.Tensor, x: torch.Tensor) -> None: - ops.gelu_quick(out, x) - return out - - -__all__ = [ - "silu_and_mul", - "gelu_and_mul", - "gelu_tanh_and_mul", - "fatrelu_and_mul", - "gelu_fast", - "gelu_new", - "gelu_quick", - "layers", -] diff --git a/build/torch26-cxx98-cu124-x86_64-linux/activation/_activation_be5bedb.abi3.so b/build/torch26-cxx98-cu124-x86_64-linux/activation/_activation_be5bedb.abi3.so deleted file mode 100755 index c31190f8f2be87dbb5d5a9c497c68cea2258fded..0000000000000000000000000000000000000000 --- a/build/torch26-cxx98-cu124-x86_64-linux/activation/_activation_be5bedb.abi3.so +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:53ddfb42466bfe01feb98348f5c2d6beefd589aeb3dec4c5c36609e11a6bde4c -size 2605136 diff --git a/build/torch26-cxx98-cu124-x86_64-linux/activation/_ops.py b/build/torch26-cxx98-cu124-x86_64-linux/activation/_ops.py deleted file mode 100644 index 0110324ade19f59f705c61d5c21912c958e92e96..0000000000000000000000000000000000000000 --- a/build/torch26-cxx98-cu124-x86_64-linux/activation/_ops.py +++ /dev/null @@ -1,9 +0,0 @@ -import torch -from . import _activation_be5bedb -ops = torch.ops._activation_be5bedb - -def add_op_namespace_prefix(op_name: str): - """ - Prefix op by namespace. - """ - return f"_activation_be5bedb::{op_name}" \ No newline at end of file diff --git a/build/torch26-cxx98-cu124-x86_64-linux/activation/layers.py b/build/torch26-cxx98-cu124-x86_64-linux/activation/layers.py deleted file mode 100644 index 45b31181ffb80509a85d729a7f7ee86fc2cf014a..0000000000000000000000000000000000000000 --- a/build/torch26-cxx98-cu124-x86_64-linux/activation/layers.py +++ /dev/null @@ -1,128 +0,0 @@ -import torch -import torch.nn as nn - -from ._ops import ops - - -class SiluAndMul(nn.Module): - """An activation function for SwiGLU. - - The function computes x -> silu(x[:d]) * x[d:] where d = x.shape[-1] // 2. - - Shapes: - x: (num_tokens, 2 * d) or (batch_size, seq_len, 2 * d) - return: (num_tokens, d) or (batch_size, seq_len, d) - """ - - can_torch_compile: bool = True - - def forward(self, x: torch.Tensor): - d = x.shape[-1] // 2 - output_shape = x.shape[:-1] + (d,) - out = torch.empty(output_shape, dtype=x.dtype, device=x.device) - ops.silu_and_mul(out, x) - return out - - -class MulAndSilu(nn.Module): - """An activation function for SwiGLU. - - The function computes x -> x[:d] * silu(x[d:]) where d = x.shape[-1] // 2. 
- - Shapes: - x: (num_tokens, 2 * d) or (batch_size, seq_len, 2 * d) - return: (num_tokens, d) or (batch_size, seq_len, d) - """ - - can_torch_compile: bool = True - - def forward(self, x: torch.Tensor) -> torch.Tensor: - d = x.shape[-1] // 2 - output_shape = x.shape[:-1] + (d,) - out = torch.empty(output_shape, dtype=x.dtype, device=x.device) - ops.mul_and_silu(out, x) - return out - - -class GeluAndMul(nn.Module): - """An activation function for GeGLU. - - The function computes x -> GELU(x[:d]) * x[d:] where d = x.shape[-1] // 2. - - Shapes: - x: (batch_size, seq_len, 2 * d) or (num_tokens, 2 * d) - return: (batch_size, seq_len, d) or (num_tokens, d) - """ - - can_torch_compile: bool = True - - def forward(self, x: torch.Tensor): - d = x.shape[-1] // 2 - output_shape = x.shape[:-1] + (d,) - out = torch.empty(output_shape, dtype=x.dtype, device=x.device) - ops.gelu_and_mul(out, x) - return out - - -class GeluTanhAndMul(nn.Module): - can_torch_compile: bool = True - - def forward(self, x: torch.Tensor): - d = x.shape[-1] // 2 - output_shape = x.shape[:-1] + (d,) - out = torch.empty(output_shape, dtype=x.dtype, device=x.device) - ops.gelu_tanh_and_mul(out, x) - return out - - -class FatreluAndMul(nn.Module): - """An activation function for FATReLU. - - The function computes x -> FATReLU(x[:d]) * x[d:] where - d = x.shape[-1] // 2. - This is used in openbmb/MiniCPM-S-1B-sft. - - Shapes: - x: (num_tokens, 2 * d) or (batch_size, seq_len, 2 * d) - return: (num_tokens, d) or (batch_size, seq_len, d) - """ - - can_torch_compile: bool = True - - def __init__(self, threshold: float = 0.0): - super().__init__() - self.threshold = threshold - - def forward(self, x: torch.Tensor): - d = x.shape[-1] // 2 - output_shape = x.shape[:-1] + (d,) - out = torch.empty(output_shape, dtype=x.dtype, device=x.device) - ops.fatrelu_and_mul(out, x, self.threshold) - return out - - -class FastGELU(nn.Module): - can_torch_compile: bool = True - - def forward(self, x: torch.Tensor) -> torch.Tensor: - out = torch.empty_like(x) - ops.gelu_fast(out, x) - return out - - -class NewGELU(nn.Module): - can_torch_compile: bool = True - - def forward(self, x: torch.Tensor) -> torch.Tensor: - out = torch.empty_like(x) - ops.gelu_new(out, x) - return out - - -class QuickGELU(nn.Module): - can_torch_compile: bool = True - - def forward(self, x: torch.Tensor) -> torch.Tensor: - out = torch.empty_like(x) - ops.gelu_quick(out, x) - return out diff --git a/build/torch26-cxx98-cu126-x86_64-linux/activation/__init__.py b/build/torch26-cxx98-cu126-x86_64-linux/activation/__init__.py deleted file mode 100644 index 1c4f207354093c6ef83eb5d7f3a5a3b22b95d357..0000000000000000000000000000000000000000 --- a/build/torch26-cxx98-cu126-x86_64-linux/activation/__init__.py +++ /dev/null @@ -1,57 +0,0 @@ -import torch - -from ._ops import ops - -from . 
import layers - - -def silu_and_mul(out: torch.Tensor, x: torch.Tensor) -> None: - ops.silu_and_mul(out, x) - return out - - -def mul_and_silu(out: torch.Tensor, x: torch.Tensor) -> None: - ops.mul_and_silu(out, x) - return out - - -def gelu_and_mul(out: torch.Tensor, x: torch.Tensor) -> None: - ops.gelu_and_mul(out, x) - return out - - -def gelu_tanh_and_mul(out: torch.Tensor, x: torch.Tensor) -> None: - ops.gelu_tanh_and_mul(out, x) - return out - - -def fatrelu_and_mul(out: torch.Tensor, x: torch.Tensor, threshold: float = 0.0) -> None: - ops.fatrelu_and_mul(out, x, threshold) - return out - - -def gelu_fast(out: torch.Tensor, x: torch.Tensor) -> None: - ops.gelu_fast(out, x) - return out - - -def gelu_new(out: torch.Tensor, x: torch.Tensor) -> None: - ops.gelu_new(out, x) - return out - - -def gelu_quick(out: torch.Tensor, x: torch.Tensor) -> None: - ops.gelu_quick(out, x) - return out - - -__all__ = [ - "silu_and_mul", - "gelu_and_mul", - "gelu_tanh_and_mul", - "fatrelu_and_mul", - "gelu_fast", - "gelu_new", - "gelu_quick", - "layers", -] diff --git a/build/torch26-cxx98-cu126-x86_64-linux/activation/_activation_be5bedb.abi3.so b/build/torch26-cxx98-cu126-x86_64-linux/activation/_activation_be5bedb.abi3.so deleted file mode 100755 index 516f085e9ac787a2454fb78975dbaec25d2a6576..0000000000000000000000000000000000000000 --- a/build/torch26-cxx98-cu126-x86_64-linux/activation/_activation_be5bedb.abi3.so +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:ac7174352dea307231f308c84ca32ee001cdbcefd976de860e76501c52aae591 -size 2613776 diff --git a/build/torch26-cxx98-cu126-x86_64-linux/activation/_ops.py b/build/torch26-cxx98-cu126-x86_64-linux/activation/_ops.py deleted file mode 100644 index 0110324ade19f59f705c61d5c21912c958e92e96..0000000000000000000000000000000000000000 --- a/build/torch26-cxx98-cu126-x86_64-linux/activation/_ops.py +++ /dev/null @@ -1,9 +0,0 @@ -import torch -from . import _activation_be5bedb -ops = torch.ops._activation_be5bedb - -def add_op_namespace_prefix(op_name: str): - """ - Prefix op by namespace. - """ - return f"_activation_be5bedb::{op_name}" \ No newline at end of file diff --git a/build/torch26-cxx98-cu126-x86_64-linux/activation/layers.py b/build/torch26-cxx98-cu126-x86_64-linux/activation/layers.py deleted file mode 100644 index 45b31181ffb80509a85d729a7f7ee86fc2cf014a..0000000000000000000000000000000000000000 --- a/build/torch26-cxx98-cu126-x86_64-linux/activation/layers.py +++ /dev/null @@ -1,128 +0,0 @@ -import torch -import torch.nn as nn - -from ._ops import ops - - -class SiluAndMul(nn.Module): - """An activation function for SwiGLU. - - The function computes x -> silu(x[:d]) * x[d:] where d = x.shape[-1] // 2. - - Shapes: - x: (num_tokens, 2 * d) or (batch_size, seq_len, 2 * d) - return: (num_tokens, d) or (batch_size, seq_len, d) - """ - - can_torch_compile: bool = True - - def forward(self, x: torch.Tensor): - d = x.shape[-1] // 2 - output_shape = x.shape[:-1] + (d,) - out = torch.empty(output_shape, dtype=x.dtype, device=x.device) - ops.silu_and_mul(out, x) - return out - - -class MulAndSilu(nn.Module): - """An activation function for SwiGLU. - - The function computes x -> x[:d] * silu(x[d:]) where d = x.shape[-1] // 2. 
- - Shapes: - x: (num_tokens, 2 * d) or (batch_size, seq_len, 2 * d) - return: (num_tokens, d) or (batch_size, seq_len, d) - """ - - can_torch_compile: bool = True - - def forward(self, x: torch.Tensor) -> torch.Tensor: - d = x.shape[-1] // 2 - output_shape = x.shape[:-1] + (d,) - out = torch.empty(output_shape, dtype=x.dtype, device=x.device) - ops.mul_and_silu(out, x) - return out - - -class GeluAndMul(nn.Module): - """An activation function for GeGLU. - - The function computes x -> GELU(x[:d]) * x[d:] where d = x.shape[-1] // 2. - - Shapes: - x: (batch_size, seq_len, 2 * d) or (num_tokens, 2 * d) - return: (batch_size, seq_len, d) or (num_tokens, d) - """ - - can_torch_compile: bool = True - - def forward(self, x: torch.Tensor): - d = x.shape[-1] // 2 - output_shape = x.shape[:-1] + (d,) - out = torch.empty(output_shape, dtype=x.dtype, device=x.device) - ops.gelu_and_mul(out, x) - return out - - -class GeluTanhAndMul(nn.Module): - can_torch_compile: bool = True - - def forward(self, x: torch.Tensor): - d = x.shape[-1] // 2 - output_shape = x.shape[:-1] + (d,) - out = torch.empty(output_shape, dtype=x.dtype, device=x.device) - ops.gelu_tanh_and_mul(out, x) - return out - - -class FatreluAndMul(nn.Module): - """An activation function for FATReLU. - - The function computes x -> FATReLU(x[:d]) * x[d:] where - d = x.shape[-1] // 2. - This is used in openbmb/MiniCPM-S-1B-sft. - - Shapes: - x: (num_tokens, 2 * d) or (batch_size, seq_len, 2 * d) - return: (num_tokens, d) or (batch_size, seq_len, d) - """ - - can_torch_compile: bool = True - - def __init__(self, threshold: float = 0.0): - super().__init__() - self.threshold = threshold - - def forward(self, x: torch.Tensor): - d = x.shape[-1] // 2 - output_shape = x.shape[:-1] + (d,) - out = torch.empty(output_shape, dtype=x.dtype, device=x.device) - ops.fatrelu_and_mul(out, x, self.threshold) - return out - - -class FastGELU(nn.Module): - can_torch_compile: bool = True - - def forward(self, x: torch.Tensor) -> torch.Tensor: - out = torch.empty_like(x) - ops.gelu_fast(out, x) - return out - - -class NewGELU(nn.Module): - can_torch_compile: bool = True - - def forward(self, x: torch.Tensor) -> torch.Tensor: - out = torch.empty_like(x) - ops.gelu_new(out, x) - return out - - -class QuickGELU(nn.Module): - can_torch_compile: bool = True - - def forward(self, x: torch.Tensor) -> torch.Tensor: - out = torch.empty_like(x) - ops.gelu_quick(out, x) - return out diff --git a/build/torch27-cxx11-cu118-x86_64-linux/activation/__pycache__/__init__.cpython-313.pyc b/build/torch27-cxx11-cu118-x86_64-linux/activation/__pycache__/__init__.cpython-313.pyc index 3a6358b82d007fa92ac419a82b73a371a184992c..bbf3ad846a76e365312ad965559a177976801396 100644 Binary files a/build/torch27-cxx11-cu118-x86_64-linux/activation/__pycache__/__init__.cpython-313.pyc and b/build/torch27-cxx11-cu118-x86_64-linux/activation/__pycache__/__init__.cpython-313.pyc differ diff --git a/build/torch27-cxx11-cu118-x86_64-linux/activation/__pycache__/_ops.cpython-313.pyc b/build/torch27-cxx11-cu118-x86_64-linux/activation/__pycache__/_ops.cpython-313.pyc index aa07da5459427811e64acc67e85be6a1a5d8109d..47765ef8e985a500bbb3e25990387a1f1f15c767 100644 Binary files a/build/torch27-cxx11-cu118-x86_64-linux/activation/__pycache__/_ops.cpython-313.pyc and b/build/torch27-cxx11-cu118-x86_64-linux/activation/__pycache__/_ops.cpython-313.pyc differ diff --git a/build/torch27-cxx11-cu118-x86_64-linux/activation/__pycache__/layers.cpython-313.pyc 
b/build/torch27-cxx11-cu118-x86_64-linux/activation/__pycache__/layers.cpython-313.pyc index 09398aaf4f3214cbf0c6b079dc7c7f6d2c12e109..de62862184381714910c79ecdf8db3ca14f8a753 100644 Binary files a/build/torch27-cxx11-cu118-x86_64-linux/activation/__pycache__/layers.cpython-313.pyc and b/build/torch27-cxx11-cu118-x86_64-linux/activation/__pycache__/layers.cpython-313.pyc differ diff --git a/build/torch27-cxx11-cu118-x86_64-linux/activation/_activation_20250917153858.abi3.so b/build/torch27-cxx11-cu118-x86_64-linux/activation/_activation_20250917153858.abi3.so deleted file mode 100755 index 707666b73feb1d1a677d21840923c0146c316f66..0000000000000000000000000000000000000000 --- a/build/torch27-cxx11-cu118-x86_64-linux/activation/_activation_20250917153858.abi3.so +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:618cdba5f19eabc1f9c1d33e130ef03ab1b11b52f1e7b00b73f2a10d5cf1e62f -size 2773664 diff --git a/build/torch27-cxx11-cu118-x86_64-linux/activation/_activation_beeaae6.abi3.so b/build/torch27-cxx11-cu118-x86_64-linux/activation/_activation_beeaae6.abi3.so new file mode 100755 index 0000000000000000000000000000000000000000..c6c9665f880b574481be0f6464ac7637e732df84 --- /dev/null +++ b/build/torch27-cxx11-cu118-x86_64-linux/activation/_activation_beeaae6.abi3.so @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:ce06ec284ecd4ac5423d3822a60cd9eeb686d0054b38d66567de73e1137b0567 +size 2773632 diff --git a/build/torch27-cxx11-cu118-x86_64-linux/activation/_ops.py b/build/torch27-cxx11-cu118-x86_64-linux/activation/_ops.py index a24764a95a7a5490ca596cd418d5ce2c2591c906..4d722bffa37106dd2bfdb75db14408c7eecefcb0 100644 --- a/build/torch27-cxx11-cu118-x86_64-linux/activation/_ops.py +++ b/build/torch27-cxx11-cu118-x86_64-linux/activation/_ops.py @@ -1,9 +1,9 @@ import torch -from . import _activation_20250917153858 -ops = torch.ops._activation_20250917153858 +from . import _activation_beeaae6 +ops = torch.ops._activation_beeaae6 def add_op_namespace_prefix(op_name: str): """ Prefix op by namespace. 
""" - return f"_activation_20250917153858::{op_name}" \ No newline at end of file + return f"_activation_beeaae6::{op_name}" \ No newline at end of file diff --git a/build/torch27-cxx11-cu126-x86_64-linux/activation/__pycache__/__init__.cpython-313.pyc b/build/torch27-cxx11-cu126-x86_64-linux/activation/__pycache__/__init__.cpython-313.pyc index 0c4d3787b1aeba2c506fc491aaa28cbb5dbf9ac6..29e76b5c619af9b19c5650edcfd4f63c4725d35f 100644 Binary files a/build/torch27-cxx11-cu126-x86_64-linux/activation/__pycache__/__init__.cpython-313.pyc and b/build/torch27-cxx11-cu126-x86_64-linux/activation/__pycache__/__init__.cpython-313.pyc differ diff --git a/build/torch27-cxx11-cu126-x86_64-linux/activation/__pycache__/_ops.cpython-313.pyc b/build/torch27-cxx11-cu126-x86_64-linux/activation/__pycache__/_ops.cpython-313.pyc index 3aed458254d1ebba49b19df3d2984ea7ce30556f..f54053b63e8c2b7598967b6ca9739ecc85d6142a 100644 Binary files a/build/torch27-cxx11-cu126-x86_64-linux/activation/__pycache__/_ops.cpython-313.pyc and b/build/torch27-cxx11-cu126-x86_64-linux/activation/__pycache__/_ops.cpython-313.pyc differ diff --git a/build/torch27-cxx11-cu126-x86_64-linux/activation/__pycache__/layers.cpython-313.pyc b/build/torch27-cxx11-cu126-x86_64-linux/activation/__pycache__/layers.cpython-313.pyc index 4fe6da8188a01106d53124e5bcb3b53d1dc0e509..4d4a3c1172a3a2b4c954199c9762b3251d1c468c 100644 Binary files a/build/torch27-cxx11-cu126-x86_64-linux/activation/__pycache__/layers.cpython-313.pyc and b/build/torch27-cxx11-cu126-x86_64-linux/activation/__pycache__/layers.cpython-313.pyc differ diff --git a/build/torch27-cxx11-cu126-x86_64-linux/activation/_activation_20250917153858.abi3.so b/build/torch27-cxx11-cu126-x86_64-linux/activation/_activation_20250917153858.abi3.so deleted file mode 100755 index b1d622e9f768e1d07dc670ad89deb0de15a8a46a..0000000000000000000000000000000000000000 --- a/build/torch27-cxx11-cu126-x86_64-linux/activation/_activation_20250917153858.abi3.so +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:87ee9280b670b3323378c17d75ee7082f419987a568769fe8479bf509ee6c245 -size 2852232 diff --git a/build/torch27-cxx11-cu126-x86_64-linux/activation/_activation_beeaae6.abi3.so b/build/torch27-cxx11-cu126-x86_64-linux/activation/_activation_beeaae6.abi3.so new file mode 100755 index 0000000000000000000000000000000000000000..e9e9102689a8ddf42f881abedcd19e137f22d5e4 --- /dev/null +++ b/build/torch27-cxx11-cu126-x86_64-linux/activation/_activation_beeaae6.abi3.so @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:a529bd105aca5081398d63329e829b6b159570424cd654d3a9f275ca9a720e82 +size 2852200 diff --git a/build/torch27-cxx11-cu126-x86_64-linux/activation/_ops.py b/build/torch27-cxx11-cu126-x86_64-linux/activation/_ops.py index a24764a95a7a5490ca596cd418d5ce2c2591c906..4d722bffa37106dd2bfdb75db14408c7eecefcb0 100644 --- a/build/torch27-cxx11-cu126-x86_64-linux/activation/_ops.py +++ b/build/torch27-cxx11-cu126-x86_64-linux/activation/_ops.py @@ -1,9 +1,9 @@ import torch -from . import _activation_20250917153858 -ops = torch.ops._activation_20250917153858 +from . import _activation_beeaae6 +ops = torch.ops._activation_beeaae6 def add_op_namespace_prefix(op_name: str): """ Prefix op by namespace. 
""" - return f"_activation_20250917153858::{op_name}" \ No newline at end of file + return f"_activation_beeaae6::{op_name}" \ No newline at end of file diff --git a/build/torch27-cxx11-cu128-x86_64-linux/activation/__pycache__/__init__.cpython-313.pyc b/build/torch27-cxx11-cu128-x86_64-linux/activation/__pycache__/__init__.cpython-313.pyc index 09ba7d3df59ba0e6bb6f28483d8d9d066e736296..364976ff5017b183a827c0dfcda90becfbab0e7c 100644 Binary files a/build/torch27-cxx11-cu128-x86_64-linux/activation/__pycache__/__init__.cpython-313.pyc and b/build/torch27-cxx11-cu128-x86_64-linux/activation/__pycache__/__init__.cpython-313.pyc differ diff --git a/build/torch27-cxx11-cu128-x86_64-linux/activation/__pycache__/_ops.cpython-313.pyc b/build/torch27-cxx11-cu128-x86_64-linux/activation/__pycache__/_ops.cpython-313.pyc index 76b49d8e1d63e6bc3eab559ae97d3dd57281a675..008e1b91db1ae539587989af1a212f9cd38a1ae2 100644 Binary files a/build/torch27-cxx11-cu128-x86_64-linux/activation/__pycache__/_ops.cpython-313.pyc and b/build/torch27-cxx11-cu128-x86_64-linux/activation/__pycache__/_ops.cpython-313.pyc differ diff --git a/build/torch27-cxx11-cu128-x86_64-linux/activation/__pycache__/layers.cpython-313.pyc b/build/torch27-cxx11-cu128-x86_64-linux/activation/__pycache__/layers.cpython-313.pyc index 13146c78d42a18877fe1041ac8469d766158775e..d00f03a5b9a4944132d13ac0986acc2c54e0ca3c 100644 Binary files a/build/torch27-cxx11-cu128-x86_64-linux/activation/__pycache__/layers.cpython-313.pyc and b/build/torch27-cxx11-cu128-x86_64-linux/activation/__pycache__/layers.cpython-313.pyc differ diff --git a/build/torch27-cxx11-cu128-x86_64-linux/activation/_activation_20250917153858.abi3.so b/build/torch27-cxx11-cu128-x86_64-linux/activation/_activation_20250917153858.abi3.so deleted file mode 100755 index 9830157016a530b7cfeac9d15d361a7c2cffeffd..0000000000000000000000000000000000000000 --- a/build/torch27-cxx11-cu128-x86_64-linux/activation/_activation_20250917153858.abi3.so +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:28ca9a3e35c49ae49694d7c6c77f85f3664622cad9c857bf13dfbf3bc144ae1b -size 4127912 diff --git a/build/torch27-cxx11-cu128-x86_64-linux/activation/_activation_beeaae6.abi3.so b/build/torch27-cxx11-cu128-x86_64-linux/activation/_activation_beeaae6.abi3.so new file mode 100755 index 0000000000000000000000000000000000000000..6d8adc0f26f3b10cbc1b441b74bc7f49c0ebdaae --- /dev/null +++ b/build/torch27-cxx11-cu128-x86_64-linux/activation/_activation_beeaae6.abi3.so @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:0f2cffcb6b5b9a49f03a2df46fc2ad36765676edecb468c233e78e1f5e21e206 +size 4127872 diff --git a/build/torch27-cxx11-cu128-x86_64-linux/activation/_ops.py b/build/torch27-cxx11-cu128-x86_64-linux/activation/_ops.py index a24764a95a7a5490ca596cd418d5ce2c2591c906..4d722bffa37106dd2bfdb75db14408c7eecefcb0 100644 --- a/build/torch27-cxx11-cu128-x86_64-linux/activation/_ops.py +++ b/build/torch27-cxx11-cu128-x86_64-linux/activation/_ops.py @@ -1,9 +1,9 @@ import torch -from . import _activation_20250917153858 -ops = torch.ops._activation_20250917153858 +from . import _activation_beeaae6 +ops = torch.ops._activation_beeaae6 def add_op_namespace_prefix(op_name: str): """ Prefix op by namespace. 
""" - return f"_activation_20250917153858::{op_name}" \ No newline at end of file + return f"_activation_beeaae6::{op_name}" \ No newline at end of file diff --git a/build/torch28-cxx11-cu126-x86_64-linux/activation/__pycache__/__init__.cpython-313.pyc b/build/torch28-cxx11-cu126-x86_64-linux/activation/__pycache__/__init__.cpython-313.pyc index 9b1754cfdb6ad5edfe73ae99dcd829df47bbbe92..e8f8e706b1057711ae9e53bf255aa392d9356d5b 100644 Binary files a/build/torch28-cxx11-cu126-x86_64-linux/activation/__pycache__/__init__.cpython-313.pyc and b/build/torch28-cxx11-cu126-x86_64-linux/activation/__pycache__/__init__.cpython-313.pyc differ diff --git a/build/torch28-cxx11-cu126-x86_64-linux/activation/__pycache__/_ops.cpython-313.pyc b/build/torch28-cxx11-cu126-x86_64-linux/activation/__pycache__/_ops.cpython-313.pyc index cb5b93c070c1bc3449aeddfd7bc67f3e73ce0671..ca11e4cda13d6d4f0a9f8a37d7188d53380ddde2 100644 Binary files a/build/torch28-cxx11-cu126-x86_64-linux/activation/__pycache__/_ops.cpython-313.pyc and b/build/torch28-cxx11-cu126-x86_64-linux/activation/__pycache__/_ops.cpython-313.pyc differ diff --git a/build/torch28-cxx11-cu126-x86_64-linux/activation/__pycache__/layers.cpython-313.pyc b/build/torch28-cxx11-cu126-x86_64-linux/activation/__pycache__/layers.cpython-313.pyc index d3c18f3d02cc0af239075a590f1f1232c7bb61f8..e906e10360ab9b669e4add9e39cb9ce133ca04f6 100644 Binary files a/build/torch28-cxx11-cu126-x86_64-linux/activation/__pycache__/layers.cpython-313.pyc and b/build/torch28-cxx11-cu126-x86_64-linux/activation/__pycache__/layers.cpython-313.pyc differ diff --git a/build/torch28-cxx11-cu126-x86_64-linux/activation/_activation_20250917153858.abi3.so b/build/torch28-cxx11-cu126-x86_64-linux/activation/_activation_20250917153858.abi3.so deleted file mode 100755 index 2ffd19a1b43e107e6703a009dfa85619524754b9..0000000000000000000000000000000000000000 --- a/build/torch28-cxx11-cu126-x86_64-linux/activation/_activation_20250917153858.abi3.so +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:0fcd47dd765bba10bb09f65388f6c1b357b117b2611c17bae5bf8214499a9e39 -size 2837224 diff --git a/build/torch28-cxx11-cu126-x86_64-linux/activation/_activation_beeaae6.abi3.so b/build/torch28-cxx11-cu126-x86_64-linux/activation/_activation_beeaae6.abi3.so new file mode 100755 index 0000000000000000000000000000000000000000..7c3397feac6fa683af5617d944ea5e6f5f42bf1b --- /dev/null +++ b/build/torch28-cxx11-cu126-x86_64-linux/activation/_activation_beeaae6.abi3.so @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:972be0b2b7ce4f771028406367437488743dc81d70e6316e7a2694df1422b23d +size 2837192 diff --git a/build/torch28-cxx11-cu126-x86_64-linux/activation/_ops.py b/build/torch28-cxx11-cu126-x86_64-linux/activation/_ops.py index a24764a95a7a5490ca596cd418d5ce2c2591c906..4d722bffa37106dd2bfdb75db14408c7eecefcb0 100644 --- a/build/torch28-cxx11-cu126-x86_64-linux/activation/_ops.py +++ b/build/torch28-cxx11-cu126-x86_64-linux/activation/_ops.py @@ -1,9 +1,9 @@ import torch -from . import _activation_20250917153858 -ops = torch.ops._activation_20250917153858 +from . import _activation_beeaae6 +ops = torch.ops._activation_beeaae6 def add_op_namespace_prefix(op_name: str): """ Prefix op by namespace. 
""" - return f"_activation_20250917153858::{op_name}" \ No newline at end of file + return f"_activation_beeaae6::{op_name}" \ No newline at end of file diff --git a/build/torch28-cxx11-cu128-x86_64-linux/activation/__pycache__/__init__.cpython-313.pyc b/build/torch28-cxx11-cu128-x86_64-linux/activation/__pycache__/__init__.cpython-313.pyc index a7fd63365a953f7804b2a89b5dda50cd506a0fdc..d12dd70b4a1174dc45b09641f8a67395f73f2052 100644 Binary files a/build/torch28-cxx11-cu128-x86_64-linux/activation/__pycache__/__init__.cpython-313.pyc and b/build/torch28-cxx11-cu128-x86_64-linux/activation/__pycache__/__init__.cpython-313.pyc differ diff --git a/build/torch28-cxx11-cu128-x86_64-linux/activation/__pycache__/_ops.cpython-313.pyc b/build/torch28-cxx11-cu128-x86_64-linux/activation/__pycache__/_ops.cpython-313.pyc index fe47bb82e8371e3dba3018517aec31b669970d04..e5ad25122dbe45d007132c05ad491272043aff5a 100644 Binary files a/build/torch28-cxx11-cu128-x86_64-linux/activation/__pycache__/_ops.cpython-313.pyc and b/build/torch28-cxx11-cu128-x86_64-linux/activation/__pycache__/_ops.cpython-313.pyc differ diff --git a/build/torch28-cxx11-cu128-x86_64-linux/activation/__pycache__/layers.cpython-313.pyc b/build/torch28-cxx11-cu128-x86_64-linux/activation/__pycache__/layers.cpython-313.pyc index 232694fed7e1ea130e0cfcb18f219a62a996c206..55353ba18a89c372e3738c44597e1c129e955e3f 100644 Binary files a/build/torch28-cxx11-cu128-x86_64-linux/activation/__pycache__/layers.cpython-313.pyc and b/build/torch28-cxx11-cu128-x86_64-linux/activation/__pycache__/layers.cpython-313.pyc differ diff --git a/build/torch28-cxx11-cu128-x86_64-linux/activation/_activation_20250917153858.abi3.so b/build/torch28-cxx11-cu128-x86_64-linux/activation/_activation_20250917153858.abi3.so deleted file mode 100755 index a86c4c4db41ceacc50bb8a05ab438c747a8ef0ab..0000000000000000000000000000000000000000 --- a/build/torch28-cxx11-cu128-x86_64-linux/activation/_activation_20250917153858.abi3.so +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:0e6d88c71eebabc842f6a566de7cfaf24d3d90a30572eae584a3b51dcb7e838e -size 4117000 diff --git a/build/torch28-cxx11-cu128-x86_64-linux/activation/_activation_beeaae6.abi3.so b/build/torch28-cxx11-cu128-x86_64-linux/activation/_activation_beeaae6.abi3.so new file mode 100755 index 0000000000000000000000000000000000000000..f12d8ce6414b9517c65869fe83bb570a87480d74 --- /dev/null +++ b/build/torch28-cxx11-cu128-x86_64-linux/activation/_activation_beeaae6.abi3.so @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:ec0756eb56dab9c57cc1aa01cfc2301d508fdf11ac4d02d015f7c16dd2246f2f +size 4116960 diff --git a/build/torch28-cxx11-cu128-x86_64-linux/activation/_ops.py b/build/torch28-cxx11-cu128-x86_64-linux/activation/_ops.py index a24764a95a7a5490ca596cd418d5ce2c2591c906..4d722bffa37106dd2bfdb75db14408c7eecefcb0 100644 --- a/build/torch28-cxx11-cu128-x86_64-linux/activation/_ops.py +++ b/build/torch28-cxx11-cu128-x86_64-linux/activation/_ops.py @@ -1,9 +1,9 @@ import torch -from . import _activation_20250917153858 -ops = torch.ops._activation_20250917153858 +from . import _activation_beeaae6 +ops = torch.ops._activation_beeaae6 def add_op_namespace_prefix(op_name: str): """ Prefix op by namespace. 
""" - return f"_activation_20250917153858::{op_name}" \ No newline at end of file + return f"_activation_beeaae6::{op_name}" \ No newline at end of file diff --git a/build/torch28-cxx11-cu129-x86_64-linux/activation/__pycache__/__init__.cpython-313.pyc b/build/torch28-cxx11-cu129-x86_64-linux/activation/__pycache__/__init__.cpython-313.pyc index ed1db9c86882966d57ed36a0ed55bc4b2ca19321..cbbd7d5ff58d32b11600b3114e01c9f049ac553a 100644 Binary files a/build/torch28-cxx11-cu129-x86_64-linux/activation/__pycache__/__init__.cpython-313.pyc and b/build/torch28-cxx11-cu129-x86_64-linux/activation/__pycache__/__init__.cpython-313.pyc differ diff --git a/build/torch28-cxx11-cu129-x86_64-linux/activation/__pycache__/_ops.cpython-313.pyc b/build/torch28-cxx11-cu129-x86_64-linux/activation/__pycache__/_ops.cpython-313.pyc index 5241c54af2fe7946d1a0fd85a475d0d3ca40a4cf..6239d94f12316596571aa36b5f80073c4b3001c4 100644 Binary files a/build/torch28-cxx11-cu129-x86_64-linux/activation/__pycache__/_ops.cpython-313.pyc and b/build/torch28-cxx11-cu129-x86_64-linux/activation/__pycache__/_ops.cpython-313.pyc differ diff --git a/build/torch28-cxx11-cu129-x86_64-linux/activation/__pycache__/layers.cpython-313.pyc b/build/torch28-cxx11-cu129-x86_64-linux/activation/__pycache__/layers.cpython-313.pyc index f6d111cf4f598453f07c754bf3bce7d50cafbff8..7c61641f68aa6668f378809762977aac8344e655 100644 Binary files a/build/torch28-cxx11-cu129-x86_64-linux/activation/__pycache__/layers.cpython-313.pyc and b/build/torch28-cxx11-cu129-x86_64-linux/activation/__pycache__/layers.cpython-313.pyc differ diff --git a/build/torch28-cxx11-cu129-x86_64-linux/activation/_activation_20250917153858.abi3.so b/build/torch28-cxx11-cu129-x86_64-linux/activation/_activation_20250917153858.abi3.so deleted file mode 100755 index 56bc6e0d6cb4f9b4e7260eab9be147746e14bd98..0000000000000000000000000000000000000000 --- a/build/torch28-cxx11-cu129-x86_64-linux/activation/_activation_20250917153858.abi3.so +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:f3c1b86db31b04bd5fe75b0c9d6915ba2766a2456ea9bd1a20f2d75c4b1acf35 -size 4154880 diff --git a/build/torch28-cxx11-cu129-x86_64-linux/activation/_activation_beeaae6.abi3.so b/build/torch28-cxx11-cu129-x86_64-linux/activation/_activation_beeaae6.abi3.so new file mode 100755 index 0000000000000000000000000000000000000000..df6a901f09b0db5c03a0dea245c2500eb9a4b05a --- /dev/null +++ b/build/torch28-cxx11-cu129-x86_64-linux/activation/_activation_beeaae6.abi3.so @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:de346c02f046cbb177556580efc9994632adad1439bb90f451f2f690e326c39c +size 4154840 diff --git a/build/torch28-cxx11-cu129-x86_64-linux/activation/_ops.py b/build/torch28-cxx11-cu129-x86_64-linux/activation/_ops.py index a24764a95a7a5490ca596cd418d5ce2c2591c906..4d722bffa37106dd2bfdb75db14408c7eecefcb0 100644 --- a/build/torch28-cxx11-cu129-x86_64-linux/activation/_ops.py +++ b/build/torch28-cxx11-cu129-x86_64-linux/activation/_ops.py @@ -1,9 +1,9 @@ import torch -from . import _activation_20250917153858 -ops = torch.ops._activation_20250917153858 +from . import _activation_beeaae6 +ops = torch.ops._activation_beeaae6 def add_op_namespace_prefix(op_name: str): """ Prefix op by namespace. 
""" - return f"_activation_20250917153858::{op_name}" \ No newline at end of file + return f"_activation_beeaae6::{op_name}" \ No newline at end of file diff --git a/build/torch26-cxx11-cu118-x86_64-linux/activation/__init__.py b/build/torch29-cxx11-cu126-x86_64-linux/activation/__init__.py similarity index 76% rename from build/torch26-cxx11-cu118-x86_64-linux/activation/__init__.py rename to build/torch29-cxx11-cu126-x86_64-linux/activation/__init__.py index 1c4f207354093c6ef83eb5d7f3a5a3b22b95d357..1a9cd15a0a75f95c5ab956fb05c2a9860f218156 100644 --- a/build/torch26-cxx11-cu118-x86_64-linux/activation/__init__.py +++ b/build/torch29-cxx11-cu126-x86_64-linux/activation/__init__.py @@ -30,6 +30,20 @@ def fatrelu_and_mul(out: torch.Tensor, x: torch.Tensor, threshold: float = 0.0) return out +def gelu(out: torch.Tensor, x: torch.Tensor) -> None: + ops.gelu(out, x) + return out + +def silu(out: torch.Tensor, x: torch.Tensor) -> None: + ops.silu(out, x) + return out + + +def gelu_tanh(out: torch.Tensor, x: torch.Tensor) -> None: + ops.gelu_tanh(out, x) + return out + + def gelu_fast(out: torch.Tensor, x: torch.Tensor) -> None: ops.gelu_fast(out, x) return out @@ -47,11 +61,15 @@ def gelu_quick(out: torch.Tensor, x: torch.Tensor) -> None: __all__ = [ "silu_and_mul", + "mul_and_silu", "gelu_and_mul", "gelu_tanh_and_mul", "fatrelu_and_mul", "gelu_fast", "gelu_new", "gelu_quick", + "gelu_tanh", + "silu", + "gelu", "layers", ] diff --git a/build/torch29-cxx11-cu126-x86_64-linux/activation/__pycache__/__init__.cpython-313.pyc b/build/torch29-cxx11-cu126-x86_64-linux/activation/__pycache__/__init__.cpython-313.pyc new file mode 100644 index 0000000000000000000000000000000000000000..8701dcb62a9afdfff0bf2da0b13995a2f4052dc2 Binary files /dev/null and b/build/torch29-cxx11-cu126-x86_64-linux/activation/__pycache__/__init__.cpython-313.pyc differ diff --git a/build/torch29-cxx11-cu126-x86_64-linux/activation/__pycache__/_ops.cpython-313.pyc b/build/torch29-cxx11-cu126-x86_64-linux/activation/__pycache__/_ops.cpython-313.pyc new file mode 100644 index 0000000000000000000000000000000000000000..a4a13d5d1bf25ab58915502dc566b8de851bc021 Binary files /dev/null and b/build/torch29-cxx11-cu126-x86_64-linux/activation/__pycache__/_ops.cpython-313.pyc differ diff --git a/build/torch29-cxx11-cu126-x86_64-linux/activation/__pycache__/layers.cpython-313.pyc b/build/torch29-cxx11-cu126-x86_64-linux/activation/__pycache__/layers.cpython-313.pyc new file mode 100644 index 0000000000000000000000000000000000000000..3d7a5ecaadd06dac28e818f8290b371c1294f7a4 Binary files /dev/null and b/build/torch29-cxx11-cu126-x86_64-linux/activation/__pycache__/layers.cpython-313.pyc differ diff --git a/build/torch29-cxx11-cu126-x86_64-linux/activation/_activation_beeaae6.abi3.so b/build/torch29-cxx11-cu126-x86_64-linux/activation/_activation_beeaae6.abi3.so new file mode 100755 index 0000000000000000000000000000000000000000..31e749efdff1ee341c214c67049d687123ed5a42 --- /dev/null +++ b/build/torch29-cxx11-cu126-x86_64-linux/activation/_activation_beeaae6.abi3.so @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:59e915bb752b7105f3c2594ababa4480e8de7408257b07f5897f82012377e8c7 +size 2837168 diff --git a/build/torch29-cxx11-cu126-x86_64-linux/activation/_ops.py b/build/torch29-cxx11-cu126-x86_64-linux/activation/_ops.py new file mode 100644 index 0000000000000000000000000000000000000000..4d722bffa37106dd2bfdb75db14408c7eecefcb0 --- /dev/null +++ b/build/torch29-cxx11-cu126-x86_64-linux/activation/_ops.py @@ -0,0 +1,9 @@ 
+import torch +from . import _activation_beeaae6 +ops = torch.ops._activation_beeaae6 + +def add_op_namespace_prefix(op_name: str): + """ + Prefix op by namespace. + """ + return f"_activation_beeaae6::{op_name}" \ No newline at end of file diff --git a/build/torch26-cxx11-cu124-x86_64-linux/activation/layers.py b/build/torch29-cxx11-cu126-x86_64-linux/activation/layers.py similarity index 73% rename from build/torch26-cxx11-cu124-x86_64-linux/activation/layers.py rename to build/torch29-cxx11-cu126-x86_64-linux/activation/layers.py index 45b31181ffb80509a85d729a7f7ee86fc2cf014a..0aec9c95fa75e4d3ff699ce69fc6618798b179c1 100644 --- a/build/torch26-cxx11-cu124-x86_64-linux/activation/layers.py +++ b/build/torch29-cxx11-cu126-x86_64-linux/activation/layers.py @@ -23,6 +23,57 @@ class SiluAndMul(nn.Module): ops.silu_and_mul(out, x) return out +class Silu(nn.Module): + """An activation function for SiLU. + + The function computes x -> silu(x). + + Shapes: + x: (num_tokens, d) or (batch_size, seq_len, d) + return: (num_tokens, d) or (batch_size, seq_len, d) + """ + + can_torch_compile: bool = True + + def forward(self, x: torch.Tensor): + out = torch.empty_like(x) + ops.silu(out, x) + return out + +class Gelu(nn.Module): + """An activation function for GELU. + + The function computes x -> gelu(x). + + Shapes: + x: (num_tokens, d) or (batch_size, seq_len, d) + return: (num_tokens, d) or (batch_size, seq_len, d) + """ + + can_torch_compile: bool = True + + def forward(self, x: torch.Tensor): + out = torch.empty_like(x) + ops.gelu(out, x) + return out + +class GeluTanh(nn.Module): + """An activation function for GELU with `tanh` approximation. + + The function computes x -> gelu_tanh(x). + + Shapes: + x: (num_tokens, d) or (batch_size, seq_len, d) + return: (num_tokens, d) or (batch_size, seq_len, d) + """ + + can_torch_compile: bool = True + + def forward(self, x: torch.Tensor): + out = torch.empty_like(x) + ops.gelu_tanh(out, x) + return out + class MulAndSilu(nn.Module): """An activation function for SwiGLU. 
diff --git a/build/torch26-cxx11-cu124-x86_64-linux/activation/__init__.py b/build/torch29-cxx11-cu128-x86_64-linux/activation/__init__.py similarity index 76% rename from build/torch26-cxx11-cu124-x86_64-linux/activation/__init__.py rename to build/torch29-cxx11-cu128-x86_64-linux/activation/__init__.py index 1c4f207354093c6ef83eb5d7f3a5a3b22b95d357..1a9cd15a0a75f95c5ab956fb05c2a9860f218156 100644 --- a/build/torch26-cxx11-cu124-x86_64-linux/activation/__init__.py +++ b/build/torch29-cxx11-cu128-x86_64-linux/activation/__init__.py @@ -30,6 +30,20 @@ def fatrelu_and_mul(out: torch.Tensor, x: torch.Tensor, threshold: float = 0.0) return out +def gelu(out: torch.Tensor, x: torch.Tensor) -> None: + ops.gelu(out, x) + return out + +def silu(out: torch.Tensor, x: torch.Tensor) -> None: + ops.silu(out, x) + return out + + +def gelu_tanh(out: torch.Tensor, x: torch.Tensor) -> None: + ops.gelu_tanh(out, x) + return out + + def gelu_fast(out: torch.Tensor, x: torch.Tensor) -> None: ops.gelu_fast(out, x) return out @@ -47,11 +61,15 @@ def gelu_quick(out: torch.Tensor, x: torch.Tensor) -> None: __all__ = [ "silu_and_mul", + "mul_and_silu", "gelu_and_mul", "gelu_tanh_and_mul", "fatrelu_and_mul", "gelu_fast", "gelu_new", "gelu_quick", + "gelu_tanh", + "silu", + "gelu", "layers", ] diff --git a/build/torch29-cxx11-cu128-x86_64-linux/activation/__pycache__/__init__.cpython-313.pyc b/build/torch29-cxx11-cu128-x86_64-linux/activation/__pycache__/__init__.cpython-313.pyc new file mode 100644 index 0000000000000000000000000000000000000000..b934f588f1084b4e695f05dd5b505bb9f3b6977a Binary files /dev/null and b/build/torch29-cxx11-cu128-x86_64-linux/activation/__pycache__/__init__.cpython-313.pyc differ diff --git a/build/torch29-cxx11-cu128-x86_64-linux/activation/__pycache__/_ops.cpython-313.pyc b/build/torch29-cxx11-cu128-x86_64-linux/activation/__pycache__/_ops.cpython-313.pyc new file mode 100644 index 0000000000000000000000000000000000000000..65804eed7cc7204dc308abe7c10470bb29e91534 Binary files /dev/null and b/build/torch29-cxx11-cu128-x86_64-linux/activation/__pycache__/_ops.cpython-313.pyc differ diff --git a/build/torch29-cxx11-cu128-x86_64-linux/activation/__pycache__/layers.cpython-313.pyc b/build/torch29-cxx11-cu128-x86_64-linux/activation/__pycache__/layers.cpython-313.pyc new file mode 100644 index 0000000000000000000000000000000000000000..d232f4f0d36e80341e80d079349f68ddc9f5a3cc Binary files /dev/null and b/build/torch29-cxx11-cu128-x86_64-linux/activation/__pycache__/layers.cpython-313.pyc differ diff --git a/build/torch29-cxx11-cu128-x86_64-linux/activation/_activation_beeaae6.abi3.so b/build/torch29-cxx11-cu128-x86_64-linux/activation/_activation_beeaae6.abi3.so new file mode 100755 index 0000000000000000000000000000000000000000..386275e1936b21f67c78effb606db9a1d69f729a --- /dev/null +++ b/build/torch29-cxx11-cu128-x86_64-linux/activation/_activation_beeaae6.abi3.so @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:74494aaff73017fd8103b598f6fc8c92085e3dc0be63bda413f658bb7bbfc9b0 +size 4116936 diff --git a/build/torch29-cxx11-cu128-x86_64-linux/activation/_ops.py b/build/torch29-cxx11-cu128-x86_64-linux/activation/_ops.py new file mode 100644 index 0000000000000000000000000000000000000000..4d722bffa37106dd2bfdb75db14408c7eecefcb0 --- /dev/null +++ b/build/torch29-cxx11-cu128-x86_64-linux/activation/_ops.py @@ -0,0 +1,9 @@ +import torch +from . 
import _activation_beeaae6 +ops = torch.ops._activation_beeaae6 + +def add_op_namespace_prefix(op_name: str): + """ + Prefix op by namespace. + """ + return f"_activation_beeaae6::{op_name}" \ No newline at end of file diff --git a/build/torch26-cxx11-cu126-x86_64-linux/activation/layers.py b/build/torch29-cxx11-cu128-x86_64-linux/activation/layers.py similarity index 73% rename from build/torch26-cxx11-cu126-x86_64-linux/activation/layers.py rename to build/torch29-cxx11-cu128-x86_64-linux/activation/layers.py index 45b31181ffb80509a85d729a7f7ee86fc2cf014a..0aec9c95fa75e4d3ff699ce69fc6618798b179c1 100644 --- a/build/torch26-cxx11-cu126-x86_64-linux/activation/layers.py +++ b/build/torch29-cxx11-cu128-x86_64-linux/activation/layers.py @@ -23,6 +23,57 @@ class SiluAndMul(nn.Module): ops.silu_and_mul(out, x) return out +class Silu(nn.Module): + """An activation function for SiLU. + + The function computes x -> silu(x). + + Shapes: + x: (num_tokens, d) or (batch_size, seq_len, d) + return: (num_tokens, d) or (batch_size, seq_len, d) + """ + + can_torch_compile: bool = True + + def forward(self, x: torch.Tensor): + out = torch.empty_like(x) + ops.silu(out, x) + return out + +class Gelu(nn.Module): + """An activation function for GELU. + + The function computes x -> gelu(x). + + Shapes: + x: (num_tokens, d) or (batch_size, seq_len, d) + return: (num_tokens, d) or (batch_size, seq_len, d) + """ + + can_torch_compile: bool = True + + def forward(self, x: torch.Tensor): + out = torch.empty_like(x) + ops.gelu(out, x) + return out + +class GeluTanh(nn.Module): + """An activation function for GELU with `tanh` approximation. + + The function computes x -> gelu_tanh(x). + + Shapes: + x: (num_tokens, d) or (batch_size, seq_len, d) + return: (num_tokens, d) or (batch_size, seq_len, d) + """ + + can_torch_compile: bool = True + + def forward(self, x: torch.Tensor): + out = torch.empty_like(x) + ops.gelu_tanh(out, x) + return out + class MulAndSilu(nn.Module): """An activation function for SwiGLU. 
diff --git a/build/torch26-cxx11-cu126-x86_64-linux/activation/__init__.py b/build/torch29-cxx11-cu130-x86_64-linux/activation/__init__.py similarity index 76% rename from build/torch26-cxx11-cu126-x86_64-linux/activation/__init__.py rename to build/torch29-cxx11-cu130-x86_64-linux/activation/__init__.py index 1c4f207354093c6ef83eb5d7f3a5a3b22b95d357..1a9cd15a0a75f95c5ab956fb05c2a9860f218156 100644 --- a/build/torch26-cxx11-cu126-x86_64-linux/activation/__init__.py +++ b/build/torch29-cxx11-cu130-x86_64-linux/activation/__init__.py @@ -30,6 +30,20 @@ def fatrelu_and_mul(out: torch.Tensor, x: torch.Tensor, threshold: float = 0.0) return out +def gelu(out: torch.Tensor, x: torch.Tensor) -> None: + ops.gelu(out, x) + return out + +def silu(out: torch.Tensor, x: torch.Tensor) -> None: + ops.silu(out, x) + return out + + +def gelu_tanh(out: torch.Tensor, x: torch.Tensor) -> None: + ops.gelu_tanh(out, x) + return out + + def gelu_fast(out: torch.Tensor, x: torch.Tensor) -> None: ops.gelu_fast(out, x) return out @@ -47,11 +61,15 @@ def gelu_quick(out: torch.Tensor, x: torch.Tensor) -> None: __all__ = [ "silu_and_mul", + "mul_and_silu", "gelu_and_mul", "gelu_tanh_and_mul", "fatrelu_and_mul", "gelu_fast", "gelu_new", "gelu_quick", + "gelu_tanh", + "silu", + "gelu", "layers", ] diff --git a/build/torch29-cxx11-cu130-x86_64-linux/activation/__pycache__/__init__.cpython-313.pyc b/build/torch29-cxx11-cu130-x86_64-linux/activation/__pycache__/__init__.cpython-313.pyc new file mode 100644 index 0000000000000000000000000000000000000000..eac32d49e24d1a8671ffcddff8119d7a14e35f3f Binary files /dev/null and b/build/torch29-cxx11-cu130-x86_64-linux/activation/__pycache__/__init__.cpython-313.pyc differ diff --git a/build/torch29-cxx11-cu130-x86_64-linux/activation/__pycache__/_ops.cpython-313.pyc b/build/torch29-cxx11-cu130-x86_64-linux/activation/__pycache__/_ops.cpython-313.pyc new file mode 100644 index 0000000000000000000000000000000000000000..1f4111d9c64240435bd7d59958c320ea24e2f710 Binary files /dev/null and b/build/torch29-cxx11-cu130-x86_64-linux/activation/__pycache__/_ops.cpython-313.pyc differ diff --git a/build/torch29-cxx11-cu130-x86_64-linux/activation/__pycache__/layers.cpython-313.pyc b/build/torch29-cxx11-cu130-x86_64-linux/activation/__pycache__/layers.cpython-313.pyc new file mode 100644 index 0000000000000000000000000000000000000000..40eec88689ee66667ecd946bb43a0cd137b80d38 Binary files /dev/null and b/build/torch29-cxx11-cu130-x86_64-linux/activation/__pycache__/layers.cpython-313.pyc differ diff --git a/build/torch29-cxx11-cu130-x86_64-linux/activation/_activation_beeaae6.abi3.so b/build/torch29-cxx11-cu130-x86_64-linux/activation/_activation_beeaae6.abi3.so new file mode 100755 index 0000000000000000000000000000000000000000..38e458a1206168d344db213c3c06e3cd873a6834 --- /dev/null +++ b/build/torch29-cxx11-cu130-x86_64-linux/activation/_activation_beeaae6.abi3.so @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:0f5500ae615f8a0abf063368bf22c4d031a2e4a8893817bd3bcaffc321d1622d +size 4019704 diff --git a/build/torch29-cxx11-cu130-x86_64-linux/activation/_ops.py b/build/torch29-cxx11-cu130-x86_64-linux/activation/_ops.py new file mode 100644 index 0000000000000000000000000000000000000000..4d722bffa37106dd2bfdb75db14408c7eecefcb0 --- /dev/null +++ b/build/torch29-cxx11-cu130-x86_64-linux/activation/_ops.py @@ -0,0 +1,9 @@ +import torch +from . 
import _activation_beeaae6 +ops = torch.ops._activation_beeaae6 + +def add_op_namespace_prefix(op_name: str): + """ + Prefix op by namespace. + """ + return f"_activation_beeaae6::{op_name}" \ No newline at end of file diff --git a/build/torch26-cxx98-cu118-x86_64-linux/activation/layers.py b/build/torch29-cxx11-cu130-x86_64-linux/activation/layers.py similarity index 73% rename from build/torch26-cxx98-cu118-x86_64-linux/activation/layers.py rename to build/torch29-cxx11-cu130-x86_64-linux/activation/layers.py index 45b31181ffb80509a85d729a7f7ee86fc2cf014a..0aec9c95fa75e4d3ff699ce69fc6618798b179c1 100644 --- a/build/torch26-cxx98-cu118-x86_64-linux/activation/layers.py +++ b/build/torch29-cxx11-cu130-x86_64-linux/activation/layers.py @@ -23,6 +23,57 @@ class SiluAndMul(nn.Module): ops.silu_and_mul(out, x) return out +class Silu(nn.Module): + """An activation function for SiLU. + + The function computes x -> silu(x). + + Shapes: + x: (num_tokens, d) or (batch_size, seq_len, d) + return: (num_tokens, d) or (batch_size, seq_len, d) + """ + + can_torch_compile: bool = True + + def forward(self, x: torch.Tensor): + out = torch.empty_like(x) + ops.silu(out, x) + return out + +class Gelu(nn.Module): + """An activation function for GELU. + + The function computes x -> gelu(x). + + Shapes: + x: (num_tokens, d) or (batch_size, seq_len, d) + return: (num_tokens, d) or (batch_size, seq_len, d) + """ + + can_torch_compile: bool = True + + def forward(self, x: torch.Tensor): + out = torch.empty_like(x) + ops.gelu(out, x) + return out + +class GeluTanh(nn.Module): + """An activation function for GELU with `tanh` approximation. + + The function computes x -> gelu_tanh(x). + + Shapes: + x: (num_tokens, d) or (batch_size, seq_len, d) + return: (num_tokens, d) or (batch_size, seq_len, d) + """ + + can_torch_compile: bool = True + + def forward(self, x: torch.Tensor): + out = torch.empty_like(x) + ops.gelu_tanh(out, x) + return out + class MulAndSilu(nn.Module): """An activation function for SwiGLU.
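
Taken together, this change drops the stale torch26 build variants, renames the extension for the torch27/torch28 builds from the timestamped _activation_20250917153858 to the content-addressed _activation_beeaae6, and adds new torch29 builds (cu126, cu128, cu130) that expose element-wise silu, gelu, and gelu_tanh ops alongside the existing fused kernels. The sketches below illustrate the resulting Python API. They assume the built `activation` package is importable and, for the CUDA kernels, that a GPU is available; they are usage sketches written against the code shown in this diff, not part of the diff itself.

The new element-wise layers (Silu, Gelu, GeluTanh) mirror their plain-PyTorch counterparts, which makes them easy to sanity-check:

    import torch
    import torch.nn.functional as F

    from activation import layers

    x = torch.randn(8, 1024, device="cuda", dtype=torch.float16)

    # Each layer allocates an output tensor and calls the fused CUDA op.
    silu_out = layers.Silu()(x)
    gelu_out = layers.Gelu()(x)
    gelu_tanh_out = layers.GeluTanh()(x)

    # Compare against the PyTorch reference implementations.
    torch.testing.assert_close(silu_out, F.silu(x))
    torch.testing.assert_close(gelu_out, F.gelu(x))
    torch.testing.assert_close(gelu_tanh_out, F.gelu(x, approximate="tanh"))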
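
The fused *AndMul kernels implement the gated activations used by SwiGLU/GeGLU MLPs: the input's last dimension is split in half, one half is passed through the activation, and the two halves are multiplied. A plain-PyTorch equivalent of the documented semantics (a reference sketch, not the kernel itself):

    import torch
    import torch.nn.functional as F

    def silu_and_mul_ref(x: torch.Tensor) -> torch.Tensor:
        # SiluAndMul: silu(x[..., :d]) * x[..., d:] with d = last dim // 2.
        d = x.shape[-1] // 2
        return F.silu(x[..., :d]) * x[..., d:]

    def mul_and_silu_ref(x: torch.Tensor) -> torch.Tensor:
        # MulAndSilu swaps which half is gated: x[..., :d] * silu(x[..., d:]).
        d = x.shape[-1] // 2
        return x[..., :d] * F.silu(x[..., d:])

    x = torch.randn(4, 16, 2 * 128)
    assert silu_and_mul_ref(x).shape == (4, 16, 128)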
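
FatreluAndMul follows the same split-and-gate pattern but uses FATReLU, a ReLU whose activation threshold can be positive; values at or below the threshold are zeroed. A sketch of the same math in plain PyTorch (my reading of the docstring, with the threshold semantics assumed from the usual FATReLU definition):

    import torch

    def fatrelu_and_mul_ref(x: torch.Tensor, threshold: float = 0.0) -> torch.Tensor:
        d = x.shape[-1] // 2
        gate = x[..., :d]
        # FATReLU: pass values strictly above the threshold, zero the rest.
        gate = torch.where(gate > threshold, gate, torch.zeros_like(gate))
        return gate * x[..., d:]

    # With threshold=0.0 this reduces to an ordinary ReLU gate.
    x = torch.randn(2, 2 * 64)
    assert fatrelu_and_mul_ref(x, threshold=0.05).shape == (2, 64)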
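
Finally, the _ops.py shim is why the extension rename stays a one-file change per build variant: it binds `ops` to the build-specific torch.ops namespace once, and everything else (layers.py, __init__.py) goes through that alias. A sketch of calling the raw ops directly, assuming the torch29 package layout from this diff:

    import torch
    from activation._ops import ops, add_op_namespace_prefix

    x = torch.randn(8, 1024, device="cuda")
    out = torch.empty_like(x)

    # Dispatches to the registered op _activation_beeaae6::gelu_quick.
    ops.gelu_quick(out, x)

    # The helper yields the fully qualified op name, e.g. for registering
    # meta/fake implementations against the same op.
    print(add_op_namespace_prefix("gelu_quick"))  # _activation_beeaae6::gelu_quick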