diff --git a/.gitattributes b/.gitattributes
new file mode 100644
index 0000000000000000000000000000000000000000..c59f6506a65a54819230080499358ef470b07cbe
--- /dev/null
+++ b/.gitattributes
@@ -0,0 +1,48 @@
+*.7z filter=lfs diff=lfs merge=lfs -text
+*.arrow filter=lfs diff=lfs merge=lfs -text
+*.bin filter=lfs diff=lfs merge=lfs -text
+*.bz2 filter=lfs diff=lfs merge=lfs -text
+*.ckpt filter=lfs diff=lfs merge=lfs -text
+*.ftz filter=lfs diff=lfs merge=lfs -text
+*.gz filter=lfs diff=lfs merge=lfs -text
+*.h5 filter=lfs diff=lfs merge=lfs -text
+*.joblib filter=lfs diff=lfs merge=lfs -text
+*.lfs.* filter=lfs diff=lfs merge=lfs -text
+*.mlmodel filter=lfs diff=lfs merge=lfs -text
+*.model filter=lfs diff=lfs merge=lfs -text
+*.msgpack filter=lfs diff=lfs merge=lfs -text
+*.npy filter=lfs diff=lfs merge=lfs -text
+*.npz filter=lfs diff=lfs merge=lfs -text
+*.onnx filter=lfs diff=lfs merge=lfs -text
+*.ot filter=lfs diff=lfs merge=lfs -text
+*.parquet filter=lfs diff=lfs merge=lfs -text
+*.pb filter=lfs diff=lfs merge=lfs -text
+*.pickle filter=lfs diff=lfs merge=lfs -text
+*.pkl filter=lfs diff=lfs merge=lfs -text
+*.pt filter=lfs diff=lfs merge=lfs -text
+*.pth filter=lfs diff=lfs merge=lfs -text
+*.rar filter=lfs diff=lfs merge=lfs -text
+*.safetensors filter=lfs diff=lfs merge=lfs -text
+saved_model/**/* filter=lfs diff=lfs merge=lfs -text
+*.tar.* filter=lfs diff=lfs merge=lfs -text
+*.tar filter=lfs diff=lfs merge=lfs -text
+*.tflite filter=lfs diff=lfs merge=lfs -text
+*.tgz filter=lfs diff=lfs merge=lfs -text
+*.wasm filter=lfs diff=lfs merge=lfs -text
+*.xz filter=lfs diff=lfs merge=lfs -text
+*.zip filter=lfs diff=lfs merge=lfs -text
+*.zst filter=lfs diff=lfs merge=lfs -text
+*tfevents* filter=lfs diff=lfs merge=lfs -text
+*.so filter=lfs diff=lfs merge=lfs -text
+build/torch29-cu130-x86_64-windows/rotary/_rotary_a793e44.pyd filter=lfs diff=lfs merge=lfs -text
+build/torch210-cu128-x86_64-windows/rotary/_rotary_119c830.pyd filter=lfs diff=lfs merge=lfs -text
+build/torch210-cu128-x86_64-windows/rotary/_rotary_cdcfefe.pyd filter=lfs diff=lfs merge=lfs -text
+build/torch29-xpu20252-x86_64-windows/rotary/_rotary_cdcfefe.pyd filter=lfs diff=lfs merge=lfs -text
+build/torch210-cu128-x86_64-windows/rotary/_rotary_dec30e1.pyd filter=lfs diff=lfs merge=lfs -text
+build/torch29-xpu20252-x86_64-windows/rotary/_rotary_dec30e1.pyd filter=lfs diff=lfs merge=lfs -text
+build/torch210-cu128-x86_64-windows/rotary/_rotary_66b961a.pyd filter=lfs diff=lfs merge=lfs -text
+build/torch29-xpu20252-x86_64-windows/rotary/_rotary_66b961a.pyd filter=lfs diff=lfs merge=lfs -text
+build/torch210-cu128-x86_64-windows/rotary/_rotary_9f63cc2.pyd filter=lfs diff=lfs merge=lfs -text
+build/torch210-xpu20253-x86_64-windows/rotary/_rotary_9f63cc2.pyd filter=lfs diff=lfs merge=lfs -text
+build/torch210-cu128-x86_64-windows/_rotary_cuda_07a01e5.pyd filter=lfs diff=lfs merge=lfs -text
+build/torch210-xpu20253-x86_64-windows/_rotary_xpu_07a01e5.pyd filter=lfs diff=lfs merge=lfs -text
diff --git a/README.md b/README.md
new file mode 100644
index 0000000000000000000000000000000000000000..179b56bedf996512ad8327bfa4cc7af907981b79
--- /dev/null
+++ b/README.md
@@ -0,0 +1,14 @@
+---
+license: bsd-3-clause
+tags:
+  - kernels
+---
+
+![Status](https://hubwebhook.dholtz.com/shield?repo=kernels-community/rotary)
+
+## rotary
+
+Rotary embedding kernel from [Flash Attention](https://github.com/Dao-AILab/flash-attention/tree/main/csrc/rotary).
+
+Kernel source: https://github.com/huggingface/kernels-community/tree/main/rotary
+
diff --git a/benchmarks/benchmark.py b/benchmarks/benchmark.py
new file mode 100644
index 0000000000000000000000000000000000000000..cdc40732641641e3a82d30aec2d8643fd7f3f31a
--- /dev/null
+++ b/benchmarks/benchmark.py
@@ -0,0 +1,119 @@
+import torch
+
+from kernels.benchmark import Benchmark
+
+
+def apply_rotary_reference(
+    x1: torch.Tensor, x2: torch.Tensor, cos: torch.Tensor, sin: torch.Tensor, conj: bool
+) -> tuple[torch.Tensor, torch.Tensor]:
+    """Compute the rotary transform of ``(x1, x2)`` with plain tensor arithmetic.
+
+    Args:
+        x1: First half of the values to rotate.
+        x2: Second half of the values to rotate.
+        cos: Cosine factors, combined elementwise with ``x1`` and ``x2``.
+        sin: Sine factors, combined elementwise with ``x1`` and ``x2``.
+        conj: When true, the sign of every ``sin`` term is flipped.
+
+    Returns:
+        The pair ``(out1, out2)`` as newly allocated tensors; the inputs are
+        not modified.
+    """
+    if conj:
+        # Conjugate form: same expressions with the sin terms negated
+        return x1 * cos + x2 * sin, -x1 * sin + x2 * cos
+    return x1 * cos - x2 * sin, x1 * sin + x2 * cos
+
+
+class RotaryBenchmark(Benchmark):
+    """Benchmark the in-place ``apply_rotary`` kernel against a PyTorch reference.
+
+    Two workloads are defined, ``base`` and ``large``. Each has a setup method
+    that allocates random float32 inputs, a ``benchmark_*`` method that runs
+    the kernel, and a ``verify_*`` method that builds the expected result with
+    ``apply_rotary_reference``.
+    """
+
+    # NOTE(review): presumably consumed by the ``Benchmark`` base class to seed
+    # the random generator -- confirm against ``kernels.benchmark``.
+    seed: int = 42
+
+    def _allocate_inputs(
+        self, batch_size: int, seqlen: int, num_heads: int, rotary_dim: int
+    ) -> None:
+        """Create the random inputs and the output buffers for one workload.
+
+        The creation order (x1, x2, cos, sin) decides which random values each
+        tensor receives, so it must stay stable for results to be reproducible
+        with a given seed.
+        """
+        x_shape = (batch_size, seqlen, num_heads, rotary_dim)
+        # Singleton head axis: cos/sin broadcast against x1/x2 in the reference
+        rope_shape = (seqlen, 1, rotary_dim)
+
+        # Query tensor split into rotary parts
+        self.x1 = torch.randn(*x_shape, device=self.device, dtype=torch.float32)
+        self.x2 = torch.randn(*x_shape, device=self.device, dtype=torch.float32)
+
+        # Rotary position embeddings
+        self.cos = torch.randn(*rope_shape, device=self.device, dtype=torch.float32)
+        self.sin = torch.randn(*rope_shape, device=self.device, dtype=torch.float32)
+
+        # Output tensors (in-place, so clone inputs)
+        self.out1 = self.x1.clone()
+        self.out2 = self.x2.clone()
+
+    def _expected_output(self) -> torch.Tensor:
+        """Store the kernel result in ``self.out`` and return the reference result."""
+        ref_out1, ref_out2 = apply_rotary_reference(
+            self.x1, self.x2, self.cos, self.sin, False
+        )
+        # Concatenate for comparison (benchmark compares self.out with returned tensor)
+        self.out = torch.cat([self.out1, self.out2], dim=-1)
+        return torch.cat([ref_out1, ref_out2], dim=-1)
+
+    def setup(self):
+        """Allocate the tensors used by the ``base`` workload."""
+        self._allocate_inputs(batch_size=2, seqlen=128, num_heads=8, rotary_dim=32)
+
+    def benchmark_base(self):
+        """Run the kernel once on the ``base`` workload."""
+        # Reset outputs to input values for in-place operation
+        self.out1.copy_(self.x1)
+        self.out2.copy_(self.x2)
+        self.kernel.apply_rotary(
+            self.out1, self.out2, self.cos, self.sin, self.out1, self.out2, False
+        )
+
+    def verify_base(self) -> torch.Tensor:
+        """Return the expected output for the ``base`` workload."""
+        return self._expected_output()
+
+    def setup_large(self):
+        """Allocate the tensors used by the ``large`` workload."""
+        self._allocate_inputs(batch_size=8, seqlen=512, num_heads=32, rotary_dim=64)
+
+    def benchmark_large(self):
+        """Run the kernel once on the ``large`` workload."""
+        # Reset outputs to input values for in-place operation
+        self.out1.copy_(self.x1)
+        self.out2.copy_(self.x2)
+        self.kernel.apply_rotary(
+            self.out1, self.out2, self.cos, self.sin, self.out1, self.out2, False
+        )
+
+    def verify_large(self) -> torch.Tensor:
+        """Return the expected output for the ``large`` workload."""
+        return self._expected_output()
diff --git a/build/torch210-cu128-x86_64-windows/__init__.py b/build/torch210-cu128-x86_64-windows/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..537713178faffc508bce05bd7d15d96ff6c3bd4c
--- /dev/null
+++ b/build/torch210-cu128-x86_64-windows/__init__.py
@@ -0,0 +1,52 @@
+from typing import Optional, Tuple
+import torch
+
+from ._ops import ops
+
+
+def apply_rotary(
+    x1: torch.Tensor,
+    x2: torch.Tensor,
+    cos: torch.Tensor,
+    sin: torch.Tensor,
+    out1: torch.Tensor,
+    out2: torch.Tensor,
+    conj: bool,
+) -> None:
+    """Forward all arguments, unchanged and in order, to the compiled ``apply_rotary`` op.
+
+    Nothing is returned. The results are presumably written into ``out1`` and
+    ``out2`` by the op (``apply_rotary_transformers`` in this module passes the
+    same views as inputs and outputs and relies on that) -- TODO confirm
+    against the kernel source.
+    """
+    ops.apply_rotary(x1, x2, cos, sin, out1, out2, conj)
+
+
+def apply_rotary_transformers(
+    q: torch.Tensor,
+    k: torch.Tensor,
+    cos: torch.Tensor,
+    sin: torch.Tensor,
+    unsqueeze_dim: int = 1,
+) -> Tuple[torch.Tensor, torch.Tensor]:
+    """
+    Rotate ``q`` and ``k`` with the rotary kernel and return the rotated copies.
+
+    Exposes the kernel behind the same call signature as transformers'
+    ``apply_rotary_pos_emb``. The inputs are cloned first, so ``q`` and ``k``
+    themselves are not written to.
+
+    Args:
+        q: Query tensor; its last dimension is split into two halves that are
+            handed to the kernel as ``x1`` and ``x2``.
+        k: Key tensor, split at the same index as ``q``.
+        cos: Cosine table; a new axis is inserted at ``unsqueeze_dim``.
+        sin: Sine table, handled like ``cos``.
+        unsqueeze_dim: Position of the axis inserted into ``cos`` and ``sin``.
+
+    Returns:
+        The tuple ``(q_rotated, k_rotated)``.
+    """
+    cos = cos.unsqueeze(unsqueeze_dim)
+    sin = sin.unsqueeze(unsqueeze_dim)
+
+    # Rotate clones: the rotated values are expected to land in these copies
+    rotated = (q.clone(), k.clone())
+
+    # Both tensors are split at half of q's last dimension
+    half_dim = q.shape[-1] // 2
+    if cos.shape[-1] != half_dim:
+        # Keep only the leading half_dim entries of cos/sin
+        cos, sin = cos[..., :half_dim], sin[..., :half_dim]
+
+    for tensor in rotated:
+        lower, upper = tensor[..., :half_dim], tensor[..., half_dim:]
+        # The same views serve as inputs and outputs
+        apply_rotary(lower, upper, cos, sin, lower, upper, False)
+    return rotated
+
+
+__all__ = ["apply_rotary", "apply_rotary_transformers"]
diff --git a/build/torch210-cu128-x86_64-windows/_ops.py b/build/torch210-cu128-x86_64-windows/_ops.py
new file mode 100644
index 0000000000000000000000000000000000000000..8dfdfd9a8cba564049603f81d84e8115957ff81c
--- /dev/null
+++ b/build/torch210-cu128-x86_64-windows/_ops.py
@@ -0,0 +1,9 @@
+import torch
+from . import _rotary_cuda_07a01e5
+ops = torch.ops._rotary_cuda_07a01e5
+
+def add_op_namespace_prefix(op_name: str):
+    """
+    Return ``op_name`` qualified with this build's op namespace.
+
+    The result has the form ``"_rotary_cuda_07a01e5::<op_name>"``, using the
+    same namespace name that ``ops`` is bound to in this module.
+    """
+    return f"_rotary_cuda_07a01e5::{op_name}"
diff --git a/build/torch210-cu128-x86_64-windows/_rotary_cuda_07a01e5.pyd b/build/torch210-cu128-x86_64-windows/_rotary_cuda_07a01e5.pyd
new file mode 100644
index 0000000000000000000000000000000000000000..bba61677d0b769029bf21d3ac2fec570d5308319
--- /dev/null
+++ b/build/torch210-cu128-x86_64-windows/_rotary_cuda_07a01e5.pyd
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:fd29928a6e2a3930f4c7ec3bcffc37574981cf59bed97e6a8f3c522fa7ca0dda
+size 10415616
diff --git a/build/torch210-cu128-x86_64-windows/metadata.json b/build/torch210-cu128-x86_64-windows/metadata.json
new file mode 100644
index 0000000000000000000000000000000000000000..f31b329a1dd1a4ca410bfb95958bc1af9b300910
--- /dev/null
+++ b/build/torch210-cu128-x86_64-windows/metadata.json
@@ -0,0 +1,21 @@
+{
+  "version": 1,
+  "license": "BSD-3-Clause",
+  "python-depends": [],
+  "backend": {
+    "type": "cuda",
+    "archs": [
+      "10.0",
+      "10.1",
+      "12.0+PTX",
+      "7.0",
+      "7.2",
+      "7.5",
+      "8.0",
+      "8.6",
+      "8.7",
+      "8.9",
+      "9.0"
+    ]
+  }
+}
diff --git a/build/torch210-cu128-x86_64-windows/rotary/__init__.py b/build/torch210-cu128-x86_64-windows/rotary/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..bc434ef44e63409acb52a8f3fff54a4adc46ed6a
--- /dev/null
+++ b/build/torch210-cu128-x86_64-windows/rotary/__init__.py
@@ -0,0 +1,26 @@
+import ctypes
+import sys
+
+import importlib
+from pathlib import Path
+from types import ModuleType
+
+def _import_from_path(file_path: Path) -> ModuleType:
+    """Load the Python file at ``file_path`` as a module and return it.
+
+    The module is registered in ``sys.modules`` under a name derived from the
+    hash of its absolute path before its code is executed. If executing the
+    file raises, that registration is removed again and the exception
+    propagates.
+
+    Raises:
+        ImportError: if no import spec, or no loader, can be created for
+            ``file_path``.
+    """
+    # Imported here so the helper does not depend on ``importlib.util`` having
+    # been loaded elsewhere: a bare ``import importlib`` does not guarantee
+    # that the ``util`` submodule is available as an attribute.
+    import importlib.util
+
+    # We cannot use the module name as-is, after adding it to `sys.modules`,
+    # it would also be used for other imports. So, we make a module name that
+    # depends on the path for it to be unique using the hex-encoded hash of
+    # the path. `c_size_t` maps a negative hash to an unsigned value, so the
+    # hex string never carries a minus sign.
+    path_hash = "{:x}".format(ctypes.c_size_t(hash(file_path.absolute())).value)
+    module_name = path_hash
+    spec = importlib.util.spec_from_file_location(module_name, file_path)
+    if spec is None or spec.loader is None:
+        raise ImportError(f"Cannot load spec for {module_name} from {file_path}")
+    module = importlib.util.module_from_spec(spec)
+    sys.modules[module_name] = module
+    try:
+        spec.loader.exec_module(module)
+    except BaseException:
+        # Do not leave a half-initialised module registered if its code fails
+        sys.modules.pop(module_name, None)
+        raise
+    return module
+
+
+globals().update(vars(_import_from_path(Path(__file__).parent.parent / "__init__.py")))
diff --git a/build/torch210-cxx11-cu126-aarch64-linux/__init__.py b/build/torch210-cxx11-cu126-aarch64-linux/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..a32e6a58cb685314795328dccadba33e87eaee6f
--- /dev/null
+++ b/build/torch210-cxx11-cu126-aarch64-linux/__init__.py
@@ -0,0 +1,52 @@
+from typing import Optional, Tuple
+import torch
+
+from ._ops import ops
+
+
+def apply_rotary(
+    x1: torch.Tensor,
+    x2: torch.Tensor,
+    cos: torch.Tensor,
+    sin: torch.Tensor,
+    out1: torch.Tensor,
+    out2: torch.Tensor,
+    conj: bool,
+) -> None:
+    """Forward all arguments, unchanged and in order, to the compiled ``apply_rotary`` op.
+
+    Nothing is returned. The results are presumably written into ``out1`` and
+    ``out2`` by the op (``apply_rotary_transformers`` in this module passes the
+    same views as inputs and outputs and relies on that) -- TODO confirm
+    against the kernel source.
+    """
+    ops.apply_rotary(x1, x2, cos, sin, out1, out2, conj)
+
+
+def apply_rotary_transformers(
+    q: torch.Tensor,
+    k: torch.Tensor,
+    cos: torch.Tensor,
+    sin: torch.Tensor,
+    unsqueeze_dim: int = 1,
+) -> Tuple[torch.Tensor, torch.Tensor]:
+    """
+    Rotate ``q`` and ``k`` with the rotary kernel and return the rotated copies.
+
+    Exposes the kernel behind the same call signature as transformers'
+    ``apply_rotary_pos_emb``. The inputs are cloned first, so ``q`` and ``k``
+    themselves are not written to.
+
+    Args:
+        q: Query tensor; its last dimension is split into two halves that are
+            handed to the kernel as ``x1`` and ``x2``.
+        k: Key tensor, split at the same index as ``q``.
+        cos: Cosine table; a new axis is inserted at ``unsqueeze_dim``.
+        sin: Sine table, handled like ``cos``.
+        unsqueeze_dim: Position of the axis inserted into ``cos`` and ``sin``.
+
+    Returns:
+        The tuple ``(q_rotated, k_rotated)``.
+    """
+    cos = cos.unsqueeze(unsqueeze_dim)
+    sin = sin.unsqueeze(unsqueeze_dim)
+
+    # Rotate clones: the rotated values are expected to land in these copies
+    rotated = (q.clone(), k.clone())
+
+    # Both tensors are split at half of q's last dimension
+    half_dim = q.shape[-1] // 2
+    if cos.shape[-1] != half_dim:
+        # Keep only the leading half_dim entries of cos/sin
+        cos, sin = cos[..., :half_dim], sin[..., :half_dim]
+
+    for tensor in rotated:
+        lower, upper = tensor[..., :half_dim], tensor[..., half_dim:]
+        # The same views serve as inputs and outputs
+        apply_rotary(lower, upper, cos, sin, lower, upper, False)
+    return rotated
+
+
+__all__ = ["apply_rotary", "apply_rotary_transformers"]
diff --git a/build/torch210-cxx11-cu126-aarch64-linux/_ops.py b/build/torch210-cxx11-cu126-aarch64-linux/_ops.py
new file mode 100644
index 0000000000000000000000000000000000000000..d09f2f0956a472d57a4bb833d515b40d124f276f
--- /dev/null
+++ b/build/torch210-cxx11-cu126-aarch64-linux/_ops.py
@@ -0,0 +1,9 @@
+import torch
+from . import _rotary_cuda_2022aa6
+ops = torch.ops._rotary_cuda_2022aa6
+
+def add_op_namespace_prefix(op_name: str):
+    """
+    Return ``op_name`` qualified with this build's op namespace.
+
+    The result has the form ``"_rotary_cuda_2022aa6::<op_name>"``, using the
+    same namespace name that ``ops`` is bound to in this module.
+    """
+    return f"_rotary_cuda_2022aa6::{op_name}"
diff --git a/build/torch210-cxx11-cu126-aarch64-linux/_rotary_cuda_2022aa6.abi3.so b/build/torch210-cxx11-cu126-aarch64-linux/_rotary_cuda_2022aa6.abi3.so
new file mode 100644
index 0000000000000000000000000000000000000000..b20dcb75df2a7911675e44cbbda0296e14bd3e6b
--- /dev/null
+++ b/build/torch210-cxx11-cu126-aarch64-linux/_rotary_cuda_2022aa6.abi3.so
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:7863cbd6a156cd3f873e926b2f8861e151d43952a26a989b9ad19753aa6270dc
+size 8282888
diff --git a/build/torch210-cxx11-cu126-aarch64-linux/metadata.json b/build/torch210-cxx11-cu126-aarch64-linux/metadata.json
new file mode 100644
index 0000000000000000000000000000000000000000..0dacb99125f1112a811819ca1ffdde15c8c0faff
--- /dev/null
+++ b/build/torch210-cxx11-cu126-aarch64-linux/metadata.json
@@ -0,0 +1,18 @@
+{
+  "version": 1,
+  "license": "BSD-3-Clause",
+  "python-depends": [],
+  "backend": {
+    "type": "cuda",
+    "archs": [
+      "7.0",
+      "7.2",
+      "7.5",
+      "8.0",
+      "8.6",
+      "8.7",
+      "8.9",
+      "9.0+PTX"
+    ]
+  }
+}
diff --git a/build/torch210-cxx11-cu126-aarch64-linux/rotary/__init__.py b/build/torch210-cxx11-cu126-aarch64-linux/rotary/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..a9b2672c1cd85b74c1b3ded0fc0b2100e1aeac23
--- /dev/null
+++ b/build/torch210-cxx11-cu126-aarch64-linux/rotary/__init__.py
@@ -0,0 +1,26 @@
+import ctypes
+import importlib.util
+import sys
+from pathlib import Path
+from types import ModuleType
+
+
+def _import_from_path(file_path: Path) -> ModuleType:
+    """Load the Python file at ``file_path`` as a module and return it.
+
+    The module is registered in ``sys.modules`` under a name derived from the
+    hash of its absolute path before its code is executed. If executing the
+    file raises, that registration is removed again and the exception
+    propagates.
+
+    Raises:
+        ImportError: if no import spec, or no loader, can be created for
+            ``file_path``.
+    """
+    # We cannot use the module name as-is, after adding it to `sys.modules`,
+    # it would also be used for other imports. So, we make a module name that
+    # depends on the path for it to be unique using the hex-encoded hash of
+    # the path. `c_size_t` maps a negative hash to an unsigned value, so the
+    # hex string never carries a minus sign.
+    path_hash = "{:x}".format(ctypes.c_size_t(hash(file_path.absolute())).value)
+    module_name = path_hash
+    spec = importlib.util.spec_from_file_location(module_name, file_path)
+    if spec is None or spec.loader is None:
+        raise ImportError(f"Cannot load spec for {module_name} from {file_path}")
+    module = importlib.util.module_from_spec(spec)
+    sys.modules[module_name] = module
+    try:
+        spec.loader.exec_module(module)
+    except BaseException:
+        # Do not leave a half-initialised module registered if its code fails
+        sys.modules.pop(module_name, None)
+        raise
+    return module
+
+
+globals().update(vars(_import_from_path(Path(__file__).parent.parent / "__init__.py")))
diff --git a/build/torch210-cxx11-cu126-x86_64-linux/__init__.py b/build/torch210-cxx11-cu126-x86_64-linux/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..a32e6a58cb685314795328dccadba33e87eaee6f
--- /dev/null
+++ b/build/torch210-cxx11-cu126-x86_64-linux/__init__.py
@@ -0,0 +1,52 @@
+from typing import Optional, Tuple
+import torch
+
+from ._ops import ops
+
+
+def apply_rotary(
+    x1: torch.Tensor,
+    x2: torch.Tensor,
+    cos: torch.Tensor,
+    sin: torch.Tensor,
+    out1: torch.Tensor,
+    out2: torch.Tensor,
+    conj: bool,
+) -> None:
+    """Forward all arguments, unchanged and in order, to the compiled ``apply_rotary`` op.
+
+    Nothing is returned. The results are presumably written into ``out1`` and
+    ``out2`` by the op (``apply_rotary_transformers`` in this module passes the
+    same views as inputs and outputs and relies on that) -- TODO confirm
+    against the kernel source.
+    """
+    ops.apply_rotary(x1, x2, cos, sin, out1, out2, conj)
+
+
+def apply_rotary_transformers(
+    q: torch.Tensor,
+    k: torch.Tensor,
+    cos: torch.Tensor,
+    sin: torch.Tensor,
+    unsqueeze_dim: int = 1,
+) -> Tuple[torch.Tensor, torch.Tensor]:
+    """
+    Rotate ``q`` and ``k`` with the rotary kernel and return the rotated copies.
+
+    Exposes the kernel behind the same call signature as transformers'
+    ``apply_rotary_pos_emb``. The inputs are cloned first, so ``q`` and ``k``
+    themselves are not written to.
+
+    Args:
+        q: Query tensor; its last dimension is split into two halves that are
+            handed to the kernel as ``x1`` and ``x2``.
+        k: Key tensor, split at the same index as ``q``.
+        cos: Cosine table; a new axis is inserted at ``unsqueeze_dim``.
+        sin: Sine table, handled like ``cos``.
+        unsqueeze_dim: Position of the axis inserted into ``cos`` and ``sin``.
+
+    Returns:
+        The tuple ``(q_rotated, k_rotated)``.
+    """
+    cos = cos.unsqueeze(unsqueeze_dim)
+    sin = sin.unsqueeze(unsqueeze_dim)
+
+    # Rotate clones: the rotated values are expected to land in these copies
+    rotated = (q.clone(), k.clone())
+
+    # Both tensors are split at half of q's last dimension
+    half_dim = q.shape[-1] // 2
+    if cos.shape[-1] != half_dim:
+        # Keep only the leading half_dim entries of cos/sin
+        cos, sin = cos[..., :half_dim], sin[..., :half_dim]
+
+    for tensor in rotated:
+        lower, upper = tensor[..., :half_dim], tensor[..., half_dim:]
+        # The same views serve as inputs and outputs
+        apply_rotary(lower, upper, cos, sin, lower, upper, False)
+    return rotated
+
+
+__all__ = ["apply_rotary", "apply_rotary_transformers"]
diff --git a/build/torch210-cxx11-cu126-x86_64-linux/_ops.py b/build/torch210-cxx11-cu126-x86_64-linux/_ops.py
new file mode 100644
index 0000000000000000000000000000000000000000..d09f2f0956a472d57a4bb833d515b40d124f276f
--- /dev/null
+++ b/build/torch210-cxx11-cu126-x86_64-linux/_ops.py
@@ -0,0 +1,9 @@
+import torch
+from . import _rotary_cuda_2022aa6
+ops = torch.ops._rotary_cuda_2022aa6
+
+def add_op_namespace_prefix(op_name: str):
+    """
+    Return ``op_name`` qualified with this build's op namespace.
+
+    The result has the form ``"_rotary_cuda_2022aa6::<op_name>"``, using the
+    same namespace name that ``ops`` is bound to in this module.
+    """
+    return f"_rotary_cuda_2022aa6::{op_name}"
diff --git a/build/torch210-cxx11-cu126-x86_64-linux/_rotary_cuda_2022aa6.abi3.so b/build/torch210-cxx11-cu126-x86_64-linux/_rotary_cuda_2022aa6.abi3.so
new file mode 100644
index 0000000000000000000000000000000000000000..eefc0825bd168ffb2beea9f8c061713f91c18fff
--- /dev/null
+++ b/build/torch210-cxx11-cu126-x86_64-linux/_rotary_cuda_2022aa6.abi3.so
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:f2ac4fb2c7bbe3b277ed069761faabce67d1e1f8b3d5708f2d6f0b8b1ccfa873
+size 8200568
diff --git a/build/torch210-cxx11-cu126-x86_64-linux/metadata.json b/build/torch210-cxx11-cu126-x86_64-linux/metadata.json
new file mode 100644
index 0000000000000000000000000000000000000000..0dacb99125f1112a811819ca1ffdde15c8c0faff
--- /dev/null
+++ b/build/torch210-cxx11-cu126-x86_64-linux/metadata.json
@@ -0,0 +1,18 @@
+{
+  "version": 1,
+  "license": "BSD-3-Clause",
+  "python-depends": [],
+  "backend": {
+    "type": "cuda",
+    "archs": [
+      "7.0",
+      "7.2",
+      "7.5",
+      "8.0",
+      "8.6",
+      "8.7",
+      "8.9",
+      "9.0+PTX"
+    ]
+  }
+}
diff --git a/build/torch210-cxx11-cu126-x86_64-linux/rotary/__init__.py b/build/torch210-cxx11-cu126-x86_64-linux/rotary/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..a9b2672c1cd85b74c1b3ded0fc0b2100e1aeac23
--- /dev/null
+++ b/build/torch210-cxx11-cu126-x86_64-linux/rotary/__init__.py
@@ -0,0 +1,26 @@
+import ctypes
+import importlib.util
+import sys
+from pathlib import Path
+from types import ModuleType
+
+
+def _import_from_path(file_path: Path) -> ModuleType:
+    """Load the Python file at ``file_path`` as a module and return it.
+
+    The module is registered in ``sys.modules`` under a name derived from the
+    hash of its absolute path before its code is executed. If executing the
+    file raises, that registration is removed again and the exception
+    propagates.
+
+    Raises:
+        ImportError: if no import spec, or no loader, can be created for
+            ``file_path``.
+    """
+    # We cannot use the module name as-is, after adding it to `sys.modules`,
+    # it would also be used for other imports. So, we make a module name that
+    # depends on the path for it to be unique using the hex-encoded hash of
+    # the path. `c_size_t` maps a negative hash to an unsigned value, so the
+    # hex string never carries a minus sign.
+    path_hash = "{:x}".format(ctypes.c_size_t(hash(file_path.absolute())).value)
+    module_name = path_hash
+    spec = importlib.util.spec_from_file_location(module_name, file_path)
+    if spec is None or spec.loader is None:
+        raise ImportError(f"Cannot load spec for {module_name} from {file_path}")
+    module = importlib.util.module_from_spec(spec)
+    sys.modules[module_name] = module
+    try:
+        spec.loader.exec_module(module)
+    except BaseException:
+        # Do not leave a half-initialised module registered if its code fails
+        sys.modules.pop(module_name, None)
+        raise
+    return module
+
+
+globals().update(vars(_import_from_path(Path(__file__).parent.parent / "__init__.py")))
diff --git a/build/torch210-cxx11-cu128-aarch64-linux/__init__.py b/build/torch210-cxx11-cu128-aarch64-linux/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..a32e6a58cb685314795328dccadba33e87eaee6f
--- /dev/null
+++ b/build/torch210-cxx11-cu128-aarch64-linux/__init__.py
@@ -0,0 +1,52 @@
+from typing import Optional, Tuple
+import torch
+
+from ._ops import ops
+
+
+def apply_rotary(
+    x1: torch.Tensor,
+    x2: torch.Tensor,
+    cos: torch.Tensor,
+    sin: torch.Tensor,
+    out1: torch.Tensor,
+    out2: torch.Tensor,
+    conj: bool,
+) -> None:
+    """Forward all arguments, unchanged and in order, to the compiled ``apply_rotary`` op.
+
+    Nothing is returned. The results are presumably written into ``out1`` and
+    ``out2`` by the op (``apply_rotary_transformers`` in this module passes the
+    same views as inputs and outputs and relies on that) -- TODO confirm
+    against the kernel source.
+    """
+    ops.apply_rotary(x1, x2, cos, sin, out1, out2, conj)
+
+
+def apply_rotary_transformers(
+    q: torch.Tensor,
+    k: torch.Tensor,
+    cos: torch.Tensor,
+    sin: torch.Tensor,
+    unsqueeze_dim: int = 1,
+) -> Tuple[torch.Tensor, torch.Tensor]:
+    """
+    Rotate ``q`` and ``k`` with the rotary kernel and return the rotated copies.
+
+    Exposes the kernel behind the same call signature as transformers'
+    ``apply_rotary_pos_emb``. The inputs are cloned first, so ``q`` and ``k``
+    themselves are not written to.
+
+    Args:
+        q: Query tensor; its last dimension is split into two halves that are
+            handed to the kernel as ``x1`` and ``x2``.
+        k: Key tensor, split at the same index as ``q``.
+        cos: Cosine table; a new axis is inserted at ``unsqueeze_dim``.
+        sin: Sine table, handled like ``cos``.
+        unsqueeze_dim: Position of the axis inserted into ``cos`` and ``sin``.
+
+    Returns:
+        The tuple ``(q_rotated, k_rotated)``.
+    """
+    cos = cos.unsqueeze(unsqueeze_dim)
+    sin = sin.unsqueeze(unsqueeze_dim)
+
+    # Rotate clones: the rotated values are expected to land in these copies
+    rotated = (q.clone(), k.clone())
+
+    # Both tensors are split at half of q's last dimension
+    half_dim = q.shape[-1] // 2
+    if cos.shape[-1] != half_dim:
+        # Keep only the leading half_dim entries of cos/sin
+        cos, sin = cos[..., :half_dim], sin[..., :half_dim]
+
+    for tensor in rotated:
+        lower, upper = tensor[..., :half_dim], tensor[..., half_dim:]
+        # The same views serve as inputs and outputs
+        apply_rotary(lower, upper, cos, sin, lower, upper, False)
+    return rotated
+
+
+__all__ = ["apply_rotary", "apply_rotary_transformers"]
diff --git a/build/torch210-cxx11-cu128-aarch64-linux/_ops.py b/build/torch210-cxx11-cu128-aarch64-linux/_ops.py
new file mode 100644
index 0000000000000000000000000000000000000000..d09f2f0956a472d57a4bb833d515b40d124f276f
--- /dev/null
+++ b/build/torch210-cxx11-cu128-aarch64-linux/_ops.py
@@ -0,0 +1,9 @@
+import torch
+from . import _rotary_cuda_2022aa6
+ops = torch.ops._rotary_cuda_2022aa6
+
+def add_op_namespace_prefix(op_name: str):
+    """
+    Return ``op_name`` qualified with this build's op namespace.
+
+    The result has the form ``"_rotary_cuda_2022aa6::<op_name>"``, using the
+    same namespace name that ``ops`` is bound to in this module.
+    """
+    return f"_rotary_cuda_2022aa6::{op_name}"
diff --git a/build/torch210-cxx11-cu128-aarch64-linux/_rotary_cuda_2022aa6.abi3.so b/build/torch210-cxx11-cu128-aarch64-linux/_rotary_cuda_2022aa6.abi3.so
new file mode 100644
index 0000000000000000000000000000000000000000..4c91a6f04e2d2c6b7b0bb779299c23247896a42e
--- /dev/null
+++ b/build/torch210-cxx11-cu128-aarch64-linux/_rotary_cuda_2022aa6.abi3.so
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:696ff3570b3f6fbc9623e44b53f189bb0be0bc6260d490616b03c58dd5dd2146
+size 12019200
diff --git a/build/torch210-cxx11-cu128-aarch64-linux/metadata.json b/build/torch210-cxx11-cu128-aarch64-linux/metadata.json
new file mode 100644
index 0000000000000000000000000000000000000000..a794c92436c3827ae79b48d55f7ea964afd50f52
--- /dev/null
+++ b/build/torch210-cxx11-cu128-aarch64-linux/metadata.json
@@ -0,0 +1,21 @@
+{
+  "version": 1,
+  "license": "BSD-3-Clause",
+  "python-depends": [],
+  "backend": {
+    "type": "cuda",
+    "archs": [
+      "10.0",
+      "10.1",
+      "12.0+PTX",
+      "7.0",
+      "7.2",
+      "7.5",
+      "8.0",
+      "8.6",
+      "8.7",
+      "8.9",
+      "9.0"
+    ]
+  }
+}
diff --git a/build/torch210-cxx11-cu128-aarch64-linux/rotary/__init__.py b/build/torch210-cxx11-cu128-aarch64-linux/rotary/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..a9b2672c1cd85b74c1b3ded0fc0b2100e1aeac23
--- /dev/null
+++ b/build/torch210-cxx11-cu128-aarch64-linux/rotary/__init__.py
@@ -0,0 +1,26 @@
+import ctypes
+import importlib.util
+import sys
+from pathlib import Path
+from types import ModuleType
+
+
+def _import_from_path(file_path: Path) -> ModuleType:
+    """Load the Python file at ``file_path`` as a module and return it.
+
+    The module is registered in ``sys.modules`` under a name derived from the
+    hash of its absolute path before its code is executed. If executing the
+    file raises, that registration is removed again and the exception
+    propagates.
+
+    Raises:
+        ImportError: if no import spec, or no loader, can be created for
+            ``file_path``.
+    """
+    # We cannot use the module name as-is, after adding it to `sys.modules`,
+    # it would also be used for other imports. So, we make a module name that
+    # depends on the path for it to be unique using the hex-encoded hash of
+    # the path. `c_size_t` maps a negative hash to an unsigned value, so the
+    # hex string never carries a minus sign.
+    path_hash = "{:x}".format(ctypes.c_size_t(hash(file_path.absolute())).value)
+    module_name = path_hash
+    spec = importlib.util.spec_from_file_location(module_name, file_path)
+    if spec is None or spec.loader is None:
+        raise ImportError(f"Cannot load spec for {module_name} from {file_path}")
+    module = importlib.util.module_from_spec(spec)
+    sys.modules[module_name] = module
+    try:
+        spec.loader.exec_module(module)
+    except BaseException:
+        # Do not leave a half-initialised module registered if its code fails
+        sys.modules.pop(module_name, None)
+        raise
+    return module
+
+
+globals().update(vars(_import_from_path(Path(__file__).parent.parent / "__init__.py")))
diff --git a/build/torch210-cxx11-cu128-x86_64-linux/__init__.py b/build/torch210-cxx11-cu128-x86_64-linux/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..a32e6a58cb685314795328dccadba33e87eaee6f
--- /dev/null
+++ b/build/torch210-cxx11-cu128-x86_64-linux/__init__.py
@@ -0,0 +1,52 @@
+from typing import Optional, Tuple
+import torch
+
+from ._ops import ops
+
+
+def apply_rotary(
+    x1: torch.Tensor,
+    x2: torch.Tensor,
+    cos: torch.Tensor,
+    sin: torch.Tensor,
+    out1: torch.Tensor,
+    out2: torch.Tensor,
+    conj: bool,
+) -> None:
+    """Forward all arguments, unchanged and in order, to the compiled ``apply_rotary`` op.
+
+    Nothing is returned. The results are presumably written into ``out1`` and
+    ``out2`` by the op (``apply_rotary_transformers`` in this module passes the
+    same views as inputs and outputs and relies on that) -- TODO confirm
+    against the kernel source.
+    """
+    ops.apply_rotary(x1, x2, cos, sin, out1, out2, conj)
+
+
+def apply_rotary_transformers(
+    q: torch.Tensor,
+    k: torch.Tensor,
+    cos: torch.Tensor,
+    sin: torch.Tensor,
+    unsqueeze_dim: int = 1,
+) -> Tuple[torch.Tensor, torch.Tensor]:
+    """
+    Rotate ``q`` and ``k`` with the rotary kernel and return the rotated copies.
+
+    Exposes the kernel behind the same call signature as transformers'
+    ``apply_rotary_pos_emb``. The inputs are cloned first, so ``q`` and ``k``
+    themselves are not written to.
+
+    Args:
+        q: Query tensor; its last dimension is split into two halves that are
+            handed to the kernel as ``x1`` and ``x2``.
+        k: Key tensor, split at the same index as ``q``.
+        cos: Cosine table; a new axis is inserted at ``unsqueeze_dim``.
+        sin: Sine table, handled like ``cos``.
+        unsqueeze_dim: Position of the axis inserted into ``cos`` and ``sin``.
+
+    Returns:
+        The tuple ``(q_rotated, k_rotated)``.
+    """
+    cos = cos.unsqueeze(unsqueeze_dim)
+    sin = sin.unsqueeze(unsqueeze_dim)
+
+    # Rotate clones: the rotated values are expected to land in these copies
+    rotated = (q.clone(), k.clone())
+
+    # Both tensors are split at half of q's last dimension
+    half_dim = q.shape[-1] // 2
+    if cos.shape[-1] != half_dim:
+        # Keep only the leading half_dim entries of cos/sin
+        cos, sin = cos[..., :half_dim], sin[..., :half_dim]
+
+    for tensor in rotated:
+        lower, upper = tensor[..., :half_dim], tensor[..., half_dim:]
+        # The same views serve as inputs and outputs
+        apply_rotary(lower, upper, cos, sin, lower, upper, False)
+    return rotated
+
+
+__all__ = ["apply_rotary", "apply_rotary_transformers"]
diff --git a/build/torch210-cxx11-cu128-x86_64-linux/_ops.py b/build/torch210-cxx11-cu128-x86_64-linux/_ops.py
new file mode 100644
index 0000000000000000000000000000000000000000..d09f2f0956a472d57a4bb833d515b40d124f276f
--- /dev/null
+++ b/build/torch210-cxx11-cu128-x86_64-linux/_ops.py
@@ -0,0 +1,9 @@
+import torch
+from . import _rotary_cuda_2022aa6
+ops = torch.ops._rotary_cuda_2022aa6
+
+def add_op_namespace_prefix(op_name: str):
+    """
+    Return ``op_name`` qualified with this build's op namespace.
+
+    The result has the form ``"_rotary_cuda_2022aa6::<op_name>"``, using the
+    same namespace name that ``ops`` is bound to in this module.
+    """
+    return f"_rotary_cuda_2022aa6::{op_name}"
diff --git a/build/torch210-cxx11-cu128-x86_64-linux/_rotary_cuda_2022aa6.abi3.so b/build/torch210-cxx11-cu128-x86_64-linux/_rotary_cuda_2022aa6.abi3.so
new file mode 100644
index 0000000000000000000000000000000000000000..e8d0268e1282c0d4dd17a8731e17b59da799e0e6
--- /dev/null
+++ b/build/torch210-cxx11-cu128-x86_64-linux/_rotary_cuda_2022aa6.abi3.so
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:1238e4b57b2f30d5c5f67fc1d64a133de551f9b68b619271ac2a10f948d66b04
+size 11905904
diff --git a/build/torch210-cxx11-cu128-x86_64-linux/metadata.json b/build/torch210-cxx11-cu128-x86_64-linux/metadata.json
new file mode 100644
index 0000000000000000000000000000000000000000..a794c92436c3827ae79b48d55f7ea964afd50f52
--- /dev/null
+++ b/build/torch210-cxx11-cu128-x86_64-linux/metadata.json
@@ -0,0 +1,21 @@
+{
+  "version": 1,
+  "license": "BSD-3-Clause",
+  "python-depends": [],
+  "backend": {
+    "type": "cuda",
+    "archs": [
+      "10.0",
+      "10.1",
+      "12.0+PTX",
+      "7.0",
+      "7.2",
+      "7.5",
+      "8.0",
+      "8.6",
+      "8.7",
+      "8.9",
+      "9.0"
+    ]
+  }
+}
diff --git a/build/torch210-cxx11-cu128-x86_64-linux/rotary/__init__.py b/build/torch210-cxx11-cu128-x86_64-linux/rotary/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..a9b2672c1cd85b74c1b3ded0fc0b2100e1aeac23
--- /dev/null
+++ b/build/torch210-cxx11-cu128-x86_64-linux/rotary/__init__.py
@@ -0,0 +1,26 @@
+import ctypes
+import importlib.util
+import sys
+from pathlib import Path
+from types import ModuleType
+
+
+def _import_from_path(file_path: Path) -> ModuleType:
+    """Load the Python file at ``file_path`` as a module and return it.
+
+    The module is registered in ``sys.modules`` under a name derived from the
+    hash of its absolute path before its code is executed. If executing the
+    file raises, that registration is removed again and the exception
+    propagates.
+
+    Raises:
+        ImportError: if no import spec, or no loader, can be created for
+            ``file_path``.
+    """
+    # We cannot use the module name as-is, after adding it to `sys.modules`,
+    # it would also be used for other imports. So, we make a module name that
+    # depends on the path for it to be unique using the hex-encoded hash of
+    # the path. `c_size_t` maps a negative hash to an unsigned value, so the
+    # hex string never carries a minus sign.
+    path_hash = "{:x}".format(ctypes.c_size_t(hash(file_path.absolute())).value)
+    module_name = path_hash
+    spec = importlib.util.spec_from_file_location(module_name, file_path)
+    if spec is None or spec.loader is None:
+        raise ImportError(f"Cannot load spec for {module_name} from {file_path}")
+    module = importlib.util.module_from_spec(spec)
+    sys.modules[module_name] = module
+    try:
+        spec.loader.exec_module(module)
+    except BaseException:
+        # Do not leave a half-initialised module registered if its code fails
+        sys.modules.pop(module_name, None)
+        raise
+    return module
+
+
+globals().update(vars(_import_from_path(Path(__file__).parent.parent / "__init__.py")))
diff --git a/build/torch210-cxx11-cu130-aarch64-linux/__init__.py b/build/torch210-cxx11-cu130-aarch64-linux/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..a32e6a58cb685314795328dccadba33e87eaee6f
--- /dev/null
+++ b/build/torch210-cxx11-cu130-aarch64-linux/__init__.py
@@ -0,0 +1,52 @@
+from typing import Optional, Tuple
+import torch
+
+from ._ops import ops
+
+
+def apply_rotary(
+    x1: torch.Tensor,
+    x2: torch.Tensor,
+    cos: torch.Tensor,
+    sin: torch.Tensor,
+    out1: torch.Tensor,  # presumably receives the result for x1 (nothing is returned) - TODO confirm
+    out2: torch.Tensor,  # presumably receives the result for x2 (nothing is returned) - TODO confirm
+    conj: bool,  # NOTE(review): looks like it selects the conjugate (inverse) rotation - confirm
+) -> None:
+    ops.apply_rotary(x1, x2, cos, sin, out1, out2, conj)  # thin wrapper: all arguments forwarded in order
+
+
+def apply_rotary_transformers(
+    q: torch.Tensor,
+    k: torch.Tensor,
+    cos: torch.Tensor,
+    sin: torch.Tensor,
+    unsqueeze_dim: int = 1,  # axis at which a size-1 dimension is inserted into cos/sin
+) -> Tuple[torch.Tensor, torch.Tensor]:
+    """
+    Run the rotary kernel on clones of `q` and `k` and return the clones as `(q_rotated, k_rotated)`.
+    Adapts the kernel to the signature of transformers' `apply_rotary_pos_emb`.
+    """
+    cos = cos.unsqueeze(unsqueeze_dim)
+    sin = sin.unsqueeze(unsqueeze_dim)
+
+    q_rotated = q.clone()  # the kernel is given slices of the clones, never `q`/`k` themselves
+    k_rotated = k.clone()
+
+    # Split the last dimension at its midpoint; the halves are slices of the clones
+    half_dim = q.shape[-1] // 2  # computed from q and reused for k below
+    q1 = q_rotated[..., :half_dim]
+    q2 = q_rotated[..., half_dim:]
+    k1 = k_rotated[..., :half_dim]
+    k2 = k_rotated[..., half_dim:]
+    if cos.shape[-1] != half_dim:
+        # Keep only the first half_dim entries of cos/sin along the last dimension
+        cos = cos[..., :half_dim]
+        sin = sin[..., :half_dim]
+
+    apply_rotary(q1, q2, cos, sin, q1, q2, False)  # same slices passed as inputs and outputs; conj=False
+    apply_rotary(k1, k2, cos, sin, k1, k2, False)
+    return q_rotated, k_rotated
+
+
+__all__ = ["apply_rotary", "apply_rotary_transformers"]
diff --git a/build/torch210-cxx11-cu130-aarch64-linux/_ops.py b/build/torch210-cxx11-cu130-aarch64-linux/_ops.py
new file mode 100644
index 0000000000000000000000000000000000000000..d09f2f0956a472d57a4bb833d515b40d124f276f
--- /dev/null
+++ b/build/torch210-cxx11-cu130-aarch64-linux/_ops.py
@@ -0,0 +1,9 @@
+import torch
+from . import _rotary_cuda_2022aa6
+ops = torch.ops._rotary_cuda_2022aa6  # op namespace; presumably registered by the extension import above
+
+def add_op_namespace_prefix(op_name: str):
+    """
+    Return `op_name` qualified with this build's namespace: "_rotary_cuda_2022aa6::<op_name>".
+    """
+    return f"_rotary_cuda_2022aa6::{op_name}"
diff --git a/build/torch210-cxx11-cu130-aarch64-linux/_rotary_cuda_2022aa6.abi3.so b/build/torch210-cxx11-cu130-aarch64-linux/_rotary_cuda_2022aa6.abi3.so
new file mode 100644
index 0000000000000000000000000000000000000000..1a6fec3254f1e21490194c0bc6321977ff41af40
--- /dev/null
+++ b/build/torch210-cxx11-cu130-aarch64-linux/_rotary_cuda_2022aa6.abi3.so
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:686edb81b5ffdc43e88e35995b962aed5d23061c6aa27aff61af910b76cf03bf
+size 10411432
diff --git a/build/torch210-cxx11-cu130-aarch64-linux/metadata.json b/build/torch210-cxx11-cu130-aarch64-linux/metadata.json
new file mode 100644
index 0000000000000000000000000000000000000000..eff725542128e103dfb5df382d74940efff77214
--- /dev/null
+++ b/build/torch210-cxx11-cu130-aarch64-linux/metadata.json
@@ -0,0 +1,19 @@
+{
+  "version": 1,
+  "license": "BSD-3-Clause",
+  "python-depends": [],
+  "backend": {
+    "type": "cuda",
+    "archs": [
+      "10.0",
+      "11.0",
+      "12.0+PTX",
+      "7.5",
+      "8.0",
+      "8.6",
+      "8.7",
+      "8.9",
+      "9.0"
+    ]
+  }
+}
diff --git a/build/torch210-cxx11-cu130-aarch64-linux/rotary/__init__.py b/build/torch210-cxx11-cu130-aarch64-linux/rotary/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..a9b2672c1cd85b74c1b3ded0fc0b2100e1aeac23
--- /dev/null
+++ b/build/torch210-cxx11-cu130-aarch64-linux/rotary/__init__.py
@@ -0,0 +1,26 @@
+import ctypes
+import importlib.util
+import sys
+from pathlib import Path
+from types import ModuleType
+
+
+def _import_from_path(file_path: Path) -> ModuleType:
+    # Run `file_path` as a module under a unique synthetic name: a real name,
+    # once in `sys.modules`, would also satisfy unrelated imports of it. The
+    # name is the hex hash of the absolute path; `c_size_t` wraps a negative
+    # hash to unsigned. The caller at the bottom re-exports the module's names.
+    path_hash = "{:x}".format(ctypes.c_size_t(hash(file_path.absolute())).value)
+    module_name = path_hash
+    spec = importlib.util.spec_from_file_location(module_name, file_path)
+    if spec is None:
+        raise ImportError(f"Cannot load spec for {module_name} from {file_path}")
+    module = importlib.util.module_from_spec(spec)
+    if module is None:
+        raise ImportError(f"Cannot load module {module_name} from spec")
+    sys.modules[module_name] = module  # registered before the module body runs
+    spec.loader.exec_module(module)  # type: ignore
+    return module
+
+
+globals().update(vars(_import_from_path(Path(__file__).parent.parent / "__init__.py")))
diff --git a/build/torch210-cxx11-cu130-x86_64-linux/__init__.py b/build/torch210-cxx11-cu130-x86_64-linux/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..a32e6a58cb685314795328dccadba33e87eaee6f
--- /dev/null
+++ b/build/torch210-cxx11-cu130-x86_64-linux/__init__.py
@@ -0,0 +1,52 @@
+from typing import Optional, Tuple
+import torch
+
+from ._ops import ops
+
+
+def apply_rotary(
+    x1: torch.Tensor,
+    x2: torch.Tensor,
+    cos: torch.Tensor,
+    sin: torch.Tensor,
+    out1: torch.Tensor,  # presumably receives the result for x1 (nothing is returned) - TODO confirm
+    out2: torch.Tensor,  # presumably receives the result for x2 (nothing is returned) - TODO confirm
+    conj: bool,  # NOTE(review): looks like it selects the conjugate (inverse) rotation - confirm
+) -> None:
+    ops.apply_rotary(x1, x2, cos, sin, out1, out2, conj)  # thin wrapper: all arguments forwarded in order
+
+
+def apply_rotary_transformers(
+    q: torch.Tensor,
+    k: torch.Tensor,
+    cos: torch.Tensor,
+    sin: torch.Tensor,
+    unsqueeze_dim: int = 1,  # axis at which a size-1 dimension is inserted into cos/sin
+) -> Tuple[torch.Tensor, torch.Tensor]:
+    """
+    Run the rotary kernel on clones of `q` and `k` and return the clones as `(q_rotated, k_rotated)`.
+    Adapts the kernel to the signature of transformers' `apply_rotary_pos_emb`.
+    """
+    cos = cos.unsqueeze(unsqueeze_dim)
+    sin = sin.unsqueeze(unsqueeze_dim)
+
+    q_rotated = q.clone()  # the kernel is given slices of the clones, never `q`/`k` themselves
+    k_rotated = k.clone()
+
+    # Split the last dimension at its midpoint; the halves are slices of the clones
+    half_dim = q.shape[-1] // 2  # computed from q and reused for k below
+    q1 = q_rotated[..., :half_dim]
+    q2 = q_rotated[..., half_dim:]
+    k1 = k_rotated[..., :half_dim]
+    k2 = k_rotated[..., half_dim:]
+    if cos.shape[-1] != half_dim:
+        # Keep only the first half_dim entries of cos/sin along the last dimension
+        cos = cos[..., :half_dim]
+        sin = sin[..., :half_dim]
+
+    apply_rotary(q1, q2, cos, sin, q1, q2, False)  # same slices passed as inputs and outputs; conj=False
+    apply_rotary(k1, k2, cos, sin, k1, k2, False)
+    return q_rotated, k_rotated
+
+
+__all__ = ["apply_rotary", "apply_rotary_transformers"]
diff --git a/build/torch210-cxx11-cu130-x86_64-linux/_ops.py b/build/torch210-cxx11-cu130-x86_64-linux/_ops.py
new file mode 100644
index 0000000000000000000000000000000000000000..d09f2f0956a472d57a4bb833d515b40d124f276f
--- /dev/null
+++ b/build/torch210-cxx11-cu130-x86_64-linux/_ops.py
@@ -0,0 +1,9 @@
+import torch
+from . import _rotary_cuda_2022aa6
+ops = torch.ops._rotary_cuda_2022aa6  # op namespace; presumably registered by the extension import above
+
+def add_op_namespace_prefix(op_name: str):
+    """
+    Return `op_name` qualified with this build's namespace: "_rotary_cuda_2022aa6::<op_name>".
+    """
+    return f"_rotary_cuda_2022aa6::{op_name}"
diff --git a/build/torch210-cxx11-cu130-x86_64-linux/_rotary_cuda_2022aa6.abi3.so b/build/torch210-cxx11-cu130-x86_64-linux/_rotary_cuda_2022aa6.abi3.so
new file mode 100644
index 0000000000000000000000000000000000000000..07873a50b7d4180acef02c38372e8a4217e72258
--- /dev/null
+++ b/build/torch210-cxx11-cu130-x86_64-linux/_rotary_cuda_2022aa6.abi3.so
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:069004af51893d2f112d58bc00197cf813c5271ef6f9105936b7966bbb44881f
+size 10310752
diff --git a/build/torch210-cxx11-cu130-x86_64-linux/metadata.json b/build/torch210-cxx11-cu130-x86_64-linux/metadata.json
new file mode 100644
index 0000000000000000000000000000000000000000..eff725542128e103dfb5df382d74940efff77214
--- /dev/null
+++ b/build/torch210-cxx11-cu130-x86_64-linux/metadata.json
@@ -0,0 +1,19 @@
+{
+  "version": 1,
+  "license": "BSD-3-Clause",
+  "python-depends": [],
+  "backend": {
+    "type": "cuda",
+    "archs": [
+      "10.0",
+      "11.0",
+      "12.0+PTX",
+      "7.5",
+      "8.0",
+      "8.6",
+      "8.7",
+      "8.9",
+      "9.0"
+    ]
+  }
+}
diff --git a/build/torch210-cxx11-cu130-x86_64-linux/rotary/__init__.py b/build/torch210-cxx11-cu130-x86_64-linux/rotary/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..a9b2672c1cd85b74c1b3ded0fc0b2100e1aeac23
--- /dev/null
+++ b/build/torch210-cxx11-cu130-x86_64-linux/rotary/__init__.py
@@ -0,0 +1,26 @@
+import ctypes
+import importlib.util
+import sys
+from pathlib import Path
+from types import ModuleType
+
+
+def _import_from_path(file_path: Path) -> ModuleType:
+    # Run `file_path` as a module under a unique synthetic name: a real name,
+    # once in `sys.modules`, would also satisfy unrelated imports of it. The
+    # name is the hex hash of the absolute path; `c_size_t` wraps a negative
+    # hash to unsigned. The caller at the bottom re-exports the module's names.
+    path_hash = "{:x}".format(ctypes.c_size_t(hash(file_path.absolute())).value)
+    module_name = path_hash
+    spec = importlib.util.spec_from_file_location(module_name, file_path)
+    if spec is None:
+        raise ImportError(f"Cannot load spec for {module_name} from {file_path}")
+    module = importlib.util.module_from_spec(spec)
+    if module is None:
+        raise ImportError(f"Cannot load module {module_name} from spec")
+    sys.modules[module_name] = module  # registered before the module body runs
+    spec.loader.exec_module(module)  # type: ignore
+    return module
+
+
+globals().update(vars(_import_from_path(Path(__file__).parent.parent / "__init__.py")))
diff --git a/build/torch210-cxx11-xpu20253-x86_64-linux/__init__.py b/build/torch210-cxx11-xpu20253-x86_64-linux/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..a32e6a58cb685314795328dccadba33e87eaee6f
--- /dev/null
+++ b/build/torch210-cxx11-xpu20253-x86_64-linux/__init__.py
@@ -0,0 +1,52 @@
+from typing import Optional, Tuple
+import torch
+
+from ._ops import ops
+
+
+def apply_rotary(
+    x1: torch.Tensor,
+    x2: torch.Tensor,
+    cos: torch.Tensor,
+    sin: torch.Tensor,
+    out1: torch.Tensor,  # presumably receives the result for x1 (nothing is returned) - TODO confirm
+    out2: torch.Tensor,  # presumably receives the result for x2 (nothing is returned) - TODO confirm
+    conj: bool,  # NOTE(review): looks like it selects the conjugate (inverse) rotation - confirm
+) -> None:
+    ops.apply_rotary(x1, x2, cos, sin, out1, out2, conj)  # thin wrapper: all arguments forwarded in order
+
+
+def apply_rotary_transformers(
+    q: torch.Tensor,
+    k: torch.Tensor,
+    cos: torch.Tensor,
+    sin: torch.Tensor,
+    unsqueeze_dim: int = 1,  # axis at which a size-1 dimension is inserted into cos/sin
+) -> Tuple[torch.Tensor, torch.Tensor]:
+    """
+    Run the rotary kernel on clones of `q` and `k` and return the clones as `(q_rotated, k_rotated)`.
+    Adapts the kernel to the signature of transformers' `apply_rotary_pos_emb`.
+    """
+    cos = cos.unsqueeze(unsqueeze_dim)
+    sin = sin.unsqueeze(unsqueeze_dim)
+
+    q_rotated = q.clone()  # the kernel is given slices of the clones, never `q`/`k` themselves
+    k_rotated = k.clone()
+
+    # Split the last dimension at its midpoint; the halves are slices of the clones
+    half_dim = q.shape[-1] // 2  # computed from q and reused for k below
+    q1 = q_rotated[..., :half_dim]
+    q2 = q_rotated[..., half_dim:]
+    k1 = k_rotated[..., :half_dim]
+    k2 = k_rotated[..., half_dim:]
+    if cos.shape[-1] != half_dim:
+        # Keep only the first half_dim entries of cos/sin along the last dimension
+        cos = cos[..., :half_dim]
+        sin = sin[..., :half_dim]
+
+    apply_rotary(q1, q2, cos, sin, q1, q2, False)  # same slices passed as inputs and outputs; conj=False
+    apply_rotary(k1, k2, cos, sin, k1, k2, False)
+    return q_rotated, k_rotated
+
+
+__all__ = ["apply_rotary", "apply_rotary_transformers"]
diff --git a/build/torch210-cxx11-xpu20253-x86_64-linux/_ops.py b/build/torch210-cxx11-xpu20253-x86_64-linux/_ops.py
new file mode 100644
index 0000000000000000000000000000000000000000..0316a95137455eff318a2ed3f70d396c1980d290
--- /dev/null
+++ b/build/torch210-cxx11-xpu20253-x86_64-linux/_ops.py
@@ -0,0 +1,9 @@
+import torch
+from . import _rotary_xpu_2022aa6
+ops = torch.ops._rotary_xpu_2022aa6  # op namespace; presumably registered by the extension import above
+
+def add_op_namespace_prefix(op_name: str):
+    """
+    Return `op_name` qualified with this build's namespace: "_rotary_xpu_2022aa6::<op_name>".
+    """
+    return f"_rotary_xpu_2022aa6::{op_name}"
diff --git a/build/torch210-cxx11-xpu20253-x86_64-linux/_rotary_xpu_2022aa6.abi3.so b/build/torch210-cxx11-xpu20253-x86_64-linux/_rotary_xpu_2022aa6.abi3.so
new file mode 100644
index 0000000000000000000000000000000000000000..bff78b8120c85f92bbcaeec042e0558a97a8c003
--- /dev/null
+++ b/build/torch210-cxx11-xpu20253-x86_64-linux/_rotary_xpu_2022aa6.abi3.so
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:26ce5dd015655bbbccf535f2b7078b184d01831778effd3058fa24256be69111
+size 2301504
diff --git a/build/torch210-cxx11-xpu20253-x86_64-linux/metadata.json b/build/torch210-cxx11-xpu20253-x86_64-linux/metadata.json
new file mode 100644
index 0000000000000000000000000000000000000000..8f032899cf61212add2325c22107252842bd1588
--- /dev/null
+++ b/build/torch210-cxx11-xpu20253-x86_64-linux/metadata.json
@@ -0,0 +1,8 @@
+{
+  "version": 1,
+  "license": "BSD-3-Clause",
+  "python-depends": [],
+  "backend": {
+    "type": "xpu"
+  }
+}
\ No newline at end of file
diff --git a/build/torch210-cxx11-xpu20253-x86_64-linux/rotary/__init__.py b/build/torch210-cxx11-xpu20253-x86_64-linux/rotary/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..a9b2672c1cd85b74c1b3ded0fc0b2100e1aeac23
--- /dev/null
+++ b/build/torch210-cxx11-xpu20253-x86_64-linux/rotary/__init__.py
@@ -0,0 +1,26 @@
+import ctypes
+import importlib.util
+import sys
+from pathlib import Path
+from types import ModuleType
+
+
+def _import_from_path(file_path: Path) -> ModuleType:
+    # Run `file_path` as a module under a unique synthetic name: a real name,
+    # once in `sys.modules`, would also satisfy unrelated imports of it. The
+    # name is the hex hash of the absolute path; `c_size_t` wraps a negative
+    # hash to unsigned. The caller at the bottom re-exports the module's names.
+    path_hash = "{:x}".format(ctypes.c_size_t(hash(file_path.absolute())).value)
+    module_name = path_hash
+    spec = importlib.util.spec_from_file_location(module_name, file_path)
+    if spec is None:
+        raise ImportError(f"Cannot load spec for {module_name} from {file_path}")
+    module = importlib.util.module_from_spec(spec)
+    if module is None:
+        raise ImportError(f"Cannot load module {module_name} from spec")
+    sys.modules[module_name] = module  # registered before the module body runs
+    spec.loader.exec_module(module)  # type: ignore
+    return module
+
+
+globals().update(vars(_import_from_path(Path(__file__).parent.parent / "__init__.py")))
diff --git a/build/torch210-xpu20253-x86_64-windows/__init__.py b/build/torch210-xpu20253-x86_64-windows/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..537713178faffc508bce05bd7d15d96ff6c3bd4c
--- /dev/null
+++ b/build/torch210-xpu20253-x86_64-windows/__init__.py
@@ -0,0 +1,52 @@
+from typing import Optional, Tuple
+import torch
+
+from ._ops import ops
+
+
+def apply_rotary(
+    x1: torch.Tensor,
+    x2: torch.Tensor,
+    cos: torch.Tensor,
+    sin: torch.Tensor,
+    out1: torch.Tensor,  # presumably receives the result for x1 (nothing is returned) - TODO confirm
+    out2: torch.Tensor,  # presumably receives the result for x2 (nothing is returned) - TODO confirm
+    conj: bool,  # NOTE(review): looks like it selects the conjugate (inverse) rotation - confirm
+) -> None:
+    ops.apply_rotary(x1, x2, cos, sin, out1, out2, conj)  # thin wrapper: all arguments forwarded in order
+
+
+def apply_rotary_transformers(
+    q: torch.Tensor,
+    k: torch.Tensor,
+    cos: torch.Tensor,
+    sin: torch.Tensor,
+    unsqueeze_dim: int = 1,  # axis at which a size-1 dimension is inserted into cos/sin
+) -> Tuple[torch.Tensor, torch.Tensor]:
+    """
+    Run the rotary kernel on clones of `q` and `k` and return the clones as `(q_rotated, k_rotated)`.
+    Adapts the kernel to the signature of transformers' `apply_rotary_pos_emb`.
+    """
+    cos = cos.unsqueeze(unsqueeze_dim)
+    sin = sin.unsqueeze(unsqueeze_dim)
+
+    q_rotated = q.clone()  # the kernel is given slices of the clones, never `q`/`k` themselves
+    k_rotated = k.clone()
+
+    # Split the last dimension at its midpoint; the halves are slices of the clones
+    half_dim = q.shape[-1] // 2  # computed from q and reused for k below
+    q1 = q_rotated[..., :half_dim]
+    q2 = q_rotated[..., half_dim:]
+    k1 = k_rotated[..., :half_dim]
+    k2 = k_rotated[..., half_dim:]
+    if cos.shape[-1] != half_dim:
+        # Keep only the first half_dim entries of cos/sin along the last dimension
+        cos = cos[..., :half_dim]
+        sin = sin[..., :half_dim]
+
+    apply_rotary(q1, q2, cos, sin, q1, q2, False)  # same slices passed as inputs and outputs; conj=False
+    apply_rotary(k1, k2, cos, sin, k1, k2, False)
+    return q_rotated, k_rotated
+
+
+__all__ = ["apply_rotary", "apply_rotary_transformers"]
diff --git a/build/torch210-xpu20253-x86_64-windows/_ops.py b/build/torch210-xpu20253-x86_64-windows/_ops.py
new file mode 100644
index 0000000000000000000000000000000000000000..2f89db5464ca634c7664e6b311ca56da25d34b7c
--- /dev/null
+++ b/build/torch210-xpu20253-x86_64-windows/_ops.py
@@ -0,0 +1,9 @@
+import torch
+from . import _rotary_xpu_07a01e5
+ops = torch.ops._rotary_xpu_07a01e5  # op namespace; presumably registered by the extension import above
+
+def add_op_namespace_prefix(op_name: str):
+    """
+    Return `op_name` qualified with this build's namespace: "_rotary_xpu_07a01e5::<op_name>".
+    """
+    return f"_rotary_xpu_07a01e5::{op_name}"
diff --git a/build/torch210-xpu20253-x86_64-windows/_rotary_xpu_07a01e5.pyd b/build/torch210-xpu20253-x86_64-windows/_rotary_xpu_07a01e5.pyd
new file mode 100644
index 0000000000000000000000000000000000000000..27056447e8d9fe208c01d248d5082f9e887ebac7
--- /dev/null
+++ b/build/torch210-xpu20253-x86_64-windows/_rotary_xpu_07a01e5.pyd
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:02d857f2afd55cccc36d439f348ff360bdc7274c0e65660e41a2f8775526dec1
+size 396288
diff --git a/build/torch210-xpu20253-x86_64-windows/metadata.json b/build/torch210-xpu20253-x86_64-windows/metadata.json
new file mode 100644
index 0000000000000000000000000000000000000000..de6520c18deaab0372f91d85948970c48240031c
--- /dev/null
+++ b/build/torch210-xpu20253-x86_64-windows/metadata.json
@@ -0,0 +1,5 @@
+{
+  "version": 1,
+  "license": "BSD-3-Clause",
+  "python-depends": []
+}
\ No newline at end of file
diff --git a/build/torch210-xpu20253-x86_64-windows/rotary/__init__.py b/build/torch210-xpu20253-x86_64-windows/rotary/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..bc434ef44e63409acb52a8f3fff54a4adc46ed6a
--- /dev/null
+++ b/build/torch210-xpu20253-x86_64-windows/rotary/__init__.py
@@ -0,0 +1,26 @@
+import ctypes
+import sys
+
+import importlib
+from pathlib import Path
+from types import ModuleType
+
+def _import_from_path(file_path: Path) -> ModuleType:
+    """Execute `file_path` as a uniquely named module; raise ImportError if no spec/loader exists."""
+    # Plain `import importlib` (file top) does not guarantee that the `util`
+    # submodule is loaded, so bind it here before it is used.
+    import importlib.util
+
+    # A real module name, once in `sys.modules`, would also satisfy unrelated imports,
+    # so use the hex hash of the absolute path (`c_size_t` wraps negatives to unsigned).
+    module_name = "{:x}".format(ctypes.c_size_t(hash(file_path.absolute())).value)
+    spec = importlib.util.spec_from_file_location(module_name, file_path)
+    if spec is None or spec.loader is None:
+        raise ImportError(f"Cannot load spec for {module_name} from {file_path}")
+    module = importlib.util.module_from_spec(spec)  # returns a module or raises; never None
+    sys.modules[module_name] = module  # registered before the module body runs
+    spec.loader.exec_module(module)
+    return module
+
+
+globals().update(vars(_import_from_path(Path(__file__).parent.parent / "__init__.py")))
diff --git a/build/torch211-cxx11-cu126-aarch64-linux/__init__.py b/build/torch211-cxx11-cu126-aarch64-linux/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..a32e6a58cb685314795328dccadba33e87eaee6f
--- /dev/null
+++ b/build/torch211-cxx11-cu126-aarch64-linux/__init__.py
@@ -0,0 +1,52 @@
+from typing import Optional, Tuple
+import torch
+
+from ._ops import ops
+
+
+def apply_rotary(
+    x1: torch.Tensor,
+    x2: torch.Tensor,
+    cos: torch.Tensor,
+    sin: torch.Tensor,
+    out1: torch.Tensor,  # presumably receives the result for x1 (nothing is returned) - TODO confirm
+    out2: torch.Tensor,  # presumably receives the result for x2 (nothing is returned) - TODO confirm
+    conj: bool,  # NOTE(review): looks like it selects the conjugate (inverse) rotation - confirm
+) -> None:
+    ops.apply_rotary(x1, x2, cos, sin, out1, out2, conj)  # thin wrapper: all arguments forwarded in order
+
+
+def apply_rotary_transformers(
+    q: torch.Tensor,
+    k: torch.Tensor,
+    cos: torch.Tensor,
+    sin: torch.Tensor,
+    unsqueeze_dim: int = 1,  # axis at which a size-1 dimension is inserted into cos/sin
+) -> Tuple[torch.Tensor, torch.Tensor]:
+    """
+    Run the rotary kernel on clones of `q` and `k` and return the clones as `(q_rotated, k_rotated)`.
+    Adapts the kernel to the signature of transformers' `apply_rotary_pos_emb`.
+    """
+    cos = cos.unsqueeze(unsqueeze_dim)
+    sin = sin.unsqueeze(unsqueeze_dim)
+
+    q_rotated = q.clone()  # the kernel is given slices of the clones, never `q`/`k` themselves
+    k_rotated = k.clone()
+
+    # Split the last dimension at its midpoint; the halves are slices of the clones
+    half_dim = q.shape[-1] // 2  # computed from q and reused for k below
+    q1 = q_rotated[..., :half_dim]
+    q2 = q_rotated[..., half_dim:]
+    k1 = k_rotated[..., :half_dim]
+    k2 = k_rotated[..., half_dim:]
+    if cos.shape[-1] != half_dim:
+        # Keep only the first half_dim entries of cos/sin along the last dimension
+        cos = cos[..., :half_dim]
+        sin = sin[..., :half_dim]
+
+    apply_rotary(q1, q2, cos, sin, q1, q2, False)  # same slices passed as inputs and outputs; conj=False
+    apply_rotary(k1, k2, cos, sin, k1, k2, False)
+    return q_rotated, k_rotated
+
+
+__all__ = ["apply_rotary", "apply_rotary_transformers"]
diff --git a/build/torch211-cxx11-cu126-aarch64-linux/_ops.py b/build/torch211-cxx11-cu126-aarch64-linux/_ops.py
new file mode 100644
index 0000000000000000000000000000000000000000..d09f2f0956a472d57a4bb833d515b40d124f276f
--- /dev/null
+++ b/build/torch211-cxx11-cu126-aarch64-linux/_ops.py
@@ -0,0 +1,9 @@
+import torch
+from . import _rotary_cuda_2022aa6
+ops = torch.ops._rotary_cuda_2022aa6  # op namespace; presumably registered by the extension import above
+
+def add_op_namespace_prefix(op_name: str):
+    """
+    Return `op_name` qualified with this build's namespace: "_rotary_cuda_2022aa6::<op_name>".
+    """
+    return f"_rotary_cuda_2022aa6::{op_name}"
diff --git a/build/torch211-cxx11-cu126-aarch64-linux/_rotary_cuda_2022aa6.abi3.so b/build/torch211-cxx11-cu126-aarch64-linux/_rotary_cuda_2022aa6.abi3.so
new file mode 100644
index 0000000000000000000000000000000000000000..87a4edbb7b4a50318b77d60394a9588955a35d67
--- /dev/null
+++ b/build/torch211-cxx11-cu126-aarch64-linux/_rotary_cuda_2022aa6.abi3.so
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:ac9808fdc543b02bbf1614464032fbd1fcd9433e4fc8f8f38646c71d66821b98
+size 8279200
diff --git a/build/torch211-cxx11-cu126-aarch64-linux/metadata.json b/build/torch211-cxx11-cu126-aarch64-linux/metadata.json
new file mode 100644
index 0000000000000000000000000000000000000000..0dacb99125f1112a811819ca1ffdde15c8c0faff
--- /dev/null
+++ b/build/torch211-cxx11-cu126-aarch64-linux/metadata.json
@@ -0,0 +1,18 @@
+{
+  "version": 1,
+  "license": "BSD-3-Clause",
+  "python-depends": [],
+  "backend": {
+    "type": "cuda",
+    "archs": [
+      "7.0",
+      "7.2",
+      "7.5",
+      "8.0",
+      "8.6",
+      "8.7",
+      "8.9",
+      "9.0+PTX"
+    ]
+  }
+}
diff --git a/build/torch211-cxx11-cu126-aarch64-linux/rotary/__init__.py b/build/torch211-cxx11-cu126-aarch64-linux/rotary/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..a9b2672c1cd85b74c1b3ded0fc0b2100e1aeac23
--- /dev/null
+++ b/build/torch211-cxx11-cu126-aarch64-linux/rotary/__init__.py
@@ -0,0 +1,26 @@
+import ctypes
+import importlib.util
+import sys
+from pathlib import Path
+from types import ModuleType
+
+
+def _import_from_path(file_path: Path) -> ModuleType:
+    # Run `file_path` as a module under a unique synthetic name: a real name,
+    # once in `sys.modules`, would also satisfy unrelated imports of it. The
+    # name is the hex hash of the absolute path; `c_size_t` wraps a negative
+    # hash to unsigned. The caller at the bottom re-exports the module's names.
+    path_hash = "{:x}".format(ctypes.c_size_t(hash(file_path.absolute())).value)
+    module_name = path_hash
+    spec = importlib.util.spec_from_file_location(module_name, file_path)
+    if spec is None:
+        raise ImportError(f"Cannot load spec for {module_name} from {file_path}")
+    module = importlib.util.module_from_spec(spec)
+    if module is None:
+        raise ImportError(f"Cannot load module {module_name} from spec")
+    sys.modules[module_name] = module  # registered before the module body runs
+    spec.loader.exec_module(module)  # type: ignore
+    return module
+
+
+globals().update(vars(_import_from_path(Path(__file__).parent.parent / "__init__.py")))
diff --git a/build/torch211-cxx11-cu126-x86_64-linux/__init__.py b/build/torch211-cxx11-cu126-x86_64-linux/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..a32e6a58cb685314795328dccadba33e87eaee6f
--- /dev/null
+++ b/build/torch211-cxx11-cu126-x86_64-linux/__init__.py
@@ -0,0 +1,52 @@
+from typing import Optional, Tuple
+import torch
+
+from ._ops import ops
+
+
+def apply_rotary(
+    x1: torch.Tensor,
+    x2: torch.Tensor,
+    cos: torch.Tensor,
+    sin: torch.Tensor,
+    out1: torch.Tensor,  # presumably receives the result for x1 (nothing is returned) - TODO confirm
+    out2: torch.Tensor,  # presumably receives the result for x2 (nothing is returned) - TODO confirm
+    conj: bool,  # NOTE(review): looks like it selects the conjugate (inverse) rotation - confirm
+) -> None:
+    ops.apply_rotary(x1, x2, cos, sin, out1, out2, conj)  # thin wrapper: all arguments forwarded in order
+
+
+def apply_rotary_transformers(
+    q: torch.Tensor,
+    k: torch.Tensor,
+    cos: torch.Tensor,
+    sin: torch.Tensor,
+    unsqueeze_dim: int = 1,  # axis at which a size-1 dimension is inserted into cos/sin
+) -> Tuple[torch.Tensor, torch.Tensor]:
+    """
+    Run the rotary kernel on clones of `q` and `k` and return the clones as `(q_rotated, k_rotated)`.
+    Adapts the kernel to the signature of transformers' `apply_rotary_pos_emb`.
+    """
+    cos = cos.unsqueeze(unsqueeze_dim)
+    sin = sin.unsqueeze(unsqueeze_dim)
+
+    q_rotated = q.clone()  # the kernel is given slices of the clones, never `q`/`k` themselves
+    k_rotated = k.clone()
+
+    # Split the last dimension at its midpoint; the halves are slices of the clones
+    half_dim = q.shape[-1] // 2  # computed from q and reused for k below
+    q1 = q_rotated[..., :half_dim]
+    q2 = q_rotated[..., half_dim:]
+    k1 = k_rotated[..., :half_dim]
+    k2 = k_rotated[..., half_dim:]
+    if cos.shape[-1] != half_dim:
+        # Keep only the first half_dim entries of cos/sin along the last dimension
+        cos = cos[..., :half_dim]
+        sin = sin[..., :half_dim]
+
+    apply_rotary(q1, q2, cos, sin, q1, q2, False)  # same slices passed as inputs and outputs; conj=False
+    apply_rotary(k1, k2, cos, sin, k1, k2, False)
+    return q_rotated, k_rotated
+
+
+__all__ = ["apply_rotary", "apply_rotary_transformers"]
diff --git a/build/torch211-cxx11-cu126-x86_64-linux/_ops.py b/build/torch211-cxx11-cu126-x86_64-linux/_ops.py
new file mode 100644
index 0000000000000000000000000000000000000000..d09f2f0956a472d57a4bb833d515b40d124f276f
--- /dev/null
+++ b/build/torch211-cxx11-cu126-x86_64-linux/_ops.py
@@ -0,0 +1,9 @@
+import torch
+from . import _rotary_cuda_2022aa6
+ops = torch.ops._rotary_cuda_2022aa6  # op namespace; presumably registered by the extension import above
+
+def add_op_namespace_prefix(op_name: str):
+    """
+    Return `op_name` qualified with this build's namespace: "_rotary_cuda_2022aa6::<op_name>".
+    """
+    return f"_rotary_cuda_2022aa6::{op_name}"
diff --git a/build/torch211-cxx11-cu126-x86_64-linux/_rotary_cuda_2022aa6.abi3.so b/build/torch211-cxx11-cu126-x86_64-linux/_rotary_cuda_2022aa6.abi3.so
new file mode 100644
index 0000000000000000000000000000000000000000..fc70cacbdeb5607aa83943b92b0fabdaea18395e
--- /dev/null
+++ b/build/torch211-cxx11-cu126-x86_64-linux/_rotary_cuda_2022aa6.abi3.so
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:bb1cd25cceded1b6b4ef38b25c640e69f41110e5678388e70ea05edf4c7ce061
+size 8193600
diff --git a/build/torch211-cxx11-cu126-x86_64-linux/metadata.json b/build/torch211-cxx11-cu126-x86_64-linux/metadata.json
new file mode 100644
index 0000000000000000000000000000000000000000..0dacb99125f1112a811819ca1ffdde15c8c0faff
--- /dev/null
+++ b/build/torch211-cxx11-cu126-x86_64-linux/metadata.json
@@ -0,0 +1,18 @@
+{
+  "version": 1,
+  "license": "BSD-3-Clause",
+  "python-depends": [],
+  "backend": {
+    "type": "cuda",
+    "archs": [
+      "7.0",
+      "7.2",
+      "7.5",
+      "8.0",
+      "8.6",
+      "8.7",
+      "8.9",
+      "9.0+PTX"
+    ]
+  }
+}
diff --git a/build/torch211-cxx11-cu126-x86_64-linux/rotary/__init__.py b/build/torch211-cxx11-cu126-x86_64-linux/rotary/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..a9b2672c1cd85b74c1b3ded0fc0b2100e1aeac23
--- /dev/null
+++ b/build/torch211-cxx11-cu126-x86_64-linux/rotary/__init__.py
@@ -0,0 +1,26 @@
+import ctypes
+import importlib.util
+import sys
+from pathlib import Path
+from types import ModuleType
+
+
+def _import_from_path(file_path: Path) -> ModuleType:
+    # Run `file_path` as a module under a unique synthetic name: a real name,
+    # once in `sys.modules`, would also satisfy unrelated imports of it. The
+    # name is the hex hash of the absolute path; `c_size_t` wraps a negative
+    # hash to unsigned. The caller at the bottom re-exports the module's names.
+    path_hash = "{:x}".format(ctypes.c_size_t(hash(file_path.absolute())).value)
+    module_name = path_hash
+    spec = importlib.util.spec_from_file_location(module_name, file_path)
+    if spec is None:
+        raise ImportError(f"Cannot load spec for {module_name} from {file_path}")
+    module = importlib.util.module_from_spec(spec)
+    if module is None:
+        raise ImportError(f"Cannot load module {module_name} from spec")
+    sys.modules[module_name] = module  # registered before the module body runs
+    spec.loader.exec_module(module)  # type: ignore
+    return module
+
+
+globals().update(vars(_import_from_path(Path(__file__).parent.parent / "__init__.py")))
diff --git a/build/torch211-cxx11-cu128-aarch64-linux/__init__.py b/build/torch211-cxx11-cu128-aarch64-linux/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..a32e6a58cb685314795328dccadba33e87eaee6f
--- /dev/null
+++ b/build/torch211-cxx11-cu128-aarch64-linux/__init__.py
@@ -0,0 +1,52 @@
+from typing import Optional, Tuple
+import torch
+
+from ._ops import ops
+
+
+def apply_rotary(
+    x1: torch.Tensor,
+    x2: torch.Tensor,
+    cos: torch.Tensor,
+    sin: torch.Tensor,
+    out1: torch.Tensor,  # presumably receives the result for x1 (nothing is returned) - TODO confirm
+    out2: torch.Tensor,  # presumably receives the result for x2 (nothing is returned) - TODO confirm
+    conj: bool,  # NOTE(review): looks like it selects the conjugate (inverse) rotation - confirm
+) -> None:
+    ops.apply_rotary(x1, x2, cos, sin, out1, out2, conj)  # thin wrapper: all arguments forwarded in order
+
+
+def apply_rotary_transformers(
+    q: torch.Tensor,
+    k: torch.Tensor,
+    cos: torch.Tensor,
+    sin: torch.Tensor,
+    unsqueeze_dim: int = 1,  # axis at which a size-1 dimension is inserted into cos/sin
+) -> Tuple[torch.Tensor, torch.Tensor]:
+    """
+    Run the rotary kernel on clones of `q` and `k` and return the clones as `(q_rotated, k_rotated)`.
+    Adapts the kernel to the signature of transformers' `apply_rotary_pos_emb`.
+    """
+    cos = cos.unsqueeze(unsqueeze_dim)
+    sin = sin.unsqueeze(unsqueeze_dim)
+
+    q_rotated = q.clone()  # the kernel is given slices of the clones, never `q`/`k` themselves
+    k_rotated = k.clone()
+
+    # Split the last dimension at its midpoint; the halves are slices of the clones
+    half_dim = q.shape[-1] // 2  # computed from q and reused for k below
+    q1 = q_rotated[..., :half_dim]
+    q2 = q_rotated[..., half_dim:]
+    k1 = k_rotated[..., :half_dim]
+    k2 = k_rotated[..., half_dim:]
+    if cos.shape[-1] != half_dim:
+        # Keep only the first half_dim entries of cos/sin along the last dimension
+        cos = cos[..., :half_dim]
+        sin = sin[..., :half_dim]
+
+    apply_rotary(q1, q2, cos, sin, q1, q2, False)  # same slices passed as inputs and outputs; conj=False
+    apply_rotary(k1, k2, cos, sin, k1, k2, False)
+    return q_rotated, k_rotated
+
+
+__all__ = ["apply_rotary", "apply_rotary_transformers"]
diff --git a/build/torch211-cxx11-cu128-aarch64-linux/_ops.py b/build/torch211-cxx11-cu128-aarch64-linux/_ops.py
new file mode 100644
index 0000000000000000000000000000000000000000..d09f2f0956a472d57a4bb833d515b40d124f276f
--- /dev/null
+++ b/build/torch211-cxx11-cu128-aarch64-linux/_ops.py
@@ -0,0 +1,9 @@
+import torch
+from . import _rotary_cuda_2022aa6
+ops = torch.ops._rotary_cuda_2022aa6  # op namespace; presumably registered by the extension import above
+
+def add_op_namespace_prefix(op_name: str):
+    """
+    Return `op_name` qualified with this build's namespace: "_rotary_cuda_2022aa6::<op_name>".
+    """
+    return f"_rotary_cuda_2022aa6::{op_name}"
diff --git a/build/torch211-cxx11-cu128-aarch64-linux/_rotary_cuda_2022aa6.abi3.so b/build/torch211-cxx11-cu128-aarch64-linux/_rotary_cuda_2022aa6.abi3.so
new file mode 100644
index 0000000000000000000000000000000000000000..173fa9e3d2262b43d5776203b637b6334f76f00a
--- /dev/null
+++ b/build/torch211-cxx11-cu128-aarch64-linux/_rotary_cuda_2022aa6.abi3.so
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:89f7a34ebfb0d84e1f301fac0293edacf6fea4321c7566d759d4e339c7d860fc
+size 12015512
diff --git a/build/torch211-cxx11-cu128-aarch64-linux/metadata.json b/build/torch211-cxx11-cu128-aarch64-linux/metadata.json
new file mode 100644
index 0000000000000000000000000000000000000000..a794c92436c3827ae79b48d55f7ea964afd50f52
--- /dev/null
+++ b/build/torch211-cxx11-cu128-aarch64-linux/metadata.json
@@ -0,0 +1,21 @@
+{
+  "version": 1,
+  "license": "BSD-3-Clause",
+  "python-depends": [],
+  "backend": {
+    "type": "cuda",
+    "archs": [
+      "10.0",
+      "10.1",
+      "12.0+PTX",
+      "7.0",
+      "7.2",
+      "7.5",
+      "8.0",
+      "8.6",
+      "8.7",
+      "8.9",
+      "9.0"
+    ]
+  }
+}
diff --git a/build/torch211-cxx11-cu128-aarch64-linux/rotary/__init__.py b/build/torch211-cxx11-cu128-aarch64-linux/rotary/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..a9b2672c1cd85b74c1b3ded0fc0b2100e1aeac23
--- /dev/null
+++ b/build/torch211-cxx11-cu128-aarch64-linux/rotary/__init__.py
@@ -0,0 +1,26 @@
+import ctypes
+import importlib.util
+import sys
+from pathlib import Path
+from types import ModuleType
+
+
+def _import_from_path(file_path: Path) -> ModuleType:
+    # Load the Python file at `file_path` as a module, execute it and return it (ImportError on failure).
+    # The module is registered in `sys.modules` under a name derived from the path rather than its
+    # own name, so it cannot be picked up by other imports: the name is the hex-encoded hash of the
+    # absolute path (`ctypes.c_size_t` maps a negative hash to an unsigned value before formatting).
+    path_hash = "{:x}".format(ctypes.c_size_t(hash(file_path.absolute())).value)
+    module_name = path_hash
+    spec = importlib.util.spec_from_file_location(module_name, file_path)
+    if spec is None:
+        raise ImportError(f"Cannot load spec for {module_name} from {file_path}")
+    module = importlib.util.module_from_spec(spec)
+    if module is None:
+        raise ImportError(f"Cannot load module {module_name} from spec")
+    sys.modules[module_name] = module  # registered before the module body is executed
+    spec.loader.exec_module(module)  # type: ignore
+    return module
+
+
+globals().update(vars(_import_from_path(Path(__file__).parent.parent / "__init__.py")))  # copy every attribute of the parent directory's __init__.py into this module
diff --git a/build/torch211-cxx11-cu128-x86_64-linux/__init__.py b/build/torch211-cxx11-cu128-x86_64-linux/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..a32e6a58cb685314795328dccadba33e87eaee6f
--- /dev/null
+++ b/build/torch211-cxx11-cu128-x86_64-linux/__init__.py
@@ -0,0 +1,52 @@
+from typing import Optional, Tuple
+import torch
+
+from ._ops import ops
+
+
+def apply_rotary(  # thin wrapper: forwards every argument, unchanged and in order, to `ops.apply_rotary`
+    x1: torch.Tensor,
+    x2: torch.Tensor,
+    cos: torch.Tensor,
+    sin: torch.Tensor,
+    out1: torch.Tensor,  # presumably written in place by the op (nothing is returned) - TODO confirm
+    out2: torch.Tensor,  # presumably written in place by the op (nothing is returned) - TODO confirm
+    conj: bool,  # forwarded as-is; its meaning is defined by the compiled kernel, not visible here
+) -> None:
+    ops.apply_rotary(x1, x2, cos, sin, out1, out2, conj)
+
+
+def apply_rotary_transformers(
+    q: torch.Tensor,
+    k: torch.Tensor,
+    cos: torch.Tensor,
+    sin: torch.Tensor,
+    unsqueeze_dim: int = 1,
+) -> Tuple[torch.Tensor, torch.Tensor]:
+    """
+    Apply the rotary kernel to clones of `q` and `k` and return `(q_rotated, k_rotated)`; `q` and `k` are not modified.
+    Adapts the rotary kernel to the transformers `apply_rotary_pos_emb` signature; `cos`/`sin` get a new axis at `unsqueeze_dim`.
+    """
+    cos = cos.unsqueeze(unsqueeze_dim)
+    sin = sin.unsqueeze(unsqueeze_dim)
+
+    q_rotated = q.clone()  # work on copies so the caller's tensors stay untouched
+    k_rotated = k.clone()
+
+    # Split the last dimension into two halves; each half is handed to the kernel as a separate tensor
+    half_dim = q.shape[-1] // 2  # NOTE(review): taken from q only; assumes k has the same last-dim size - confirm
+    q1 = q_rotated[..., :half_dim]
+    q2 = q_rotated[..., half_dim:]
+    k1 = k_rotated[..., :half_dim]
+    k2 = k_rotated[..., half_dim:]
+    if cos.shape[-1] != half_dim:
+        # Keep only the first half_dim entries of cos/sin along the last dimension
+        cos = cos[..., :half_dim]
+        sin = sin[..., :half_dim]
+
+    apply_rotary(q1, q2, cos, sin, q1, q2, False)  # same slices as input and output: result lands in q_rotated
+    apply_rotary(k1, k2, cos, sin, k1, k2, False)  # likewise for k_rotated
+    return q_rotated, k_rotated
+
+
+__all__ = ["apply_rotary", "apply_rotary_transformers"]
diff --git a/build/torch211-cxx11-cu128-x86_64-linux/_ops.py b/build/torch211-cxx11-cu128-x86_64-linux/_ops.py
new file mode 100644
index 0000000000000000000000000000000000000000..d09f2f0956a472d57a4bb833d515b40d124f276f
--- /dev/null
+++ b/build/torch211-cxx11-cu128-x86_64-linux/_ops.py
@@ -0,0 +1,9 @@
+import torch
+from . import _rotary_cuda_2022aa6
+ops = torch.ops._rotary_cuda_2022aa6  # op namespace; same name as the extension module imported above
+
+def add_op_namespace_prefix(op_name: str) -> str:
+    """
+    Return `op_name` qualified with this build's op namespace, as `<namespace>::<op_name>`.
+    """
+    return f"_rotary_cuda_2022aa6::{op_name}"
diff --git a/build/torch211-cxx11-cu128-x86_64-linux/_rotary_cuda_2022aa6.abi3.so b/build/torch211-cxx11-cu128-x86_64-linux/_rotary_cuda_2022aa6.abi3.so
new file mode 100644
index 0000000000000000000000000000000000000000..3e769bfdb01dfeb5ef6f853a0ee13978e944e6f2
--- /dev/null
+++ b/build/torch211-cxx11-cu128-x86_64-linux/_rotary_cuda_2022aa6.abi3.so
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:8543effa188022e7fd780bf55a705873473ae908867c6fa1465efa72b611cc04
+size 11894840
diff --git a/build/torch211-cxx11-cu128-x86_64-linux/metadata.json b/build/torch211-cxx11-cu128-x86_64-linux/metadata.json
new file mode 100644
index 0000000000000000000000000000000000000000..a794c92436c3827ae79b48d55f7ea964afd50f52
--- /dev/null
+++ b/build/torch211-cxx11-cu128-x86_64-linux/metadata.json
@@ -0,0 +1,21 @@
+{
+  "version": 1,
+  "license": "BSD-3-Clause",
+  "python-depends": [],
+  "backend": {
+    "type": "cuda",
+    "archs": [
+      "10.0",
+      "10.1",
+      "12.0+PTX",
+      "7.0",
+      "7.2",
+      "7.5",
+      "8.0",
+      "8.6",
+      "8.7",
+      "8.9",
+      "9.0"
+    ]
+  }
+}
diff --git a/build/torch211-cxx11-cu128-x86_64-linux/rotary/__init__.py b/build/torch211-cxx11-cu128-x86_64-linux/rotary/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..a9b2672c1cd85b74c1b3ded0fc0b2100e1aeac23
--- /dev/null
+++ b/build/torch211-cxx11-cu128-x86_64-linux/rotary/__init__.py
@@ -0,0 +1,26 @@
+import ctypes
+import importlib.util
+import sys
+from pathlib import Path
+from types import ModuleType
+
+
+def _import_from_path(file_path: Path) -> ModuleType:
+    # Load the Python file at `file_path` as a module, execute it and return it (ImportError on failure).
+    # The module is registered in `sys.modules` under a name derived from the path rather than its
+    # own name, so it cannot be picked up by other imports: the name is the hex-encoded hash of the
+    # absolute path (`ctypes.c_size_t` maps a negative hash to an unsigned value before formatting).
+    path_hash = "{:x}".format(ctypes.c_size_t(hash(file_path.absolute())).value)
+    module_name = path_hash
+    spec = importlib.util.spec_from_file_location(module_name, file_path)
+    if spec is None:
+        raise ImportError(f"Cannot load spec for {module_name} from {file_path}")
+    module = importlib.util.module_from_spec(spec)
+    if module is None:
+        raise ImportError(f"Cannot load module {module_name} from spec")
+    sys.modules[module_name] = module  # registered before the module body is executed
+    spec.loader.exec_module(module)  # type: ignore
+    return module
+
+
+globals().update(vars(_import_from_path(Path(__file__).parent.parent / "__init__.py")))  # copy every attribute of the parent directory's __init__.py into this module
diff --git a/build/torch211-cxx11-cu130-aarch64-linux/__init__.py b/build/torch211-cxx11-cu130-aarch64-linux/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..a32e6a58cb685314795328dccadba33e87eaee6f
--- /dev/null
+++ b/build/torch211-cxx11-cu130-aarch64-linux/__init__.py
@@ -0,0 +1,52 @@
+from typing import Optional, Tuple
+import torch
+
+from ._ops import ops
+
+
+def apply_rotary(  # thin wrapper: forwards every argument, unchanged and in order, to `ops.apply_rotary`
+    x1: torch.Tensor,
+    x2: torch.Tensor,
+    cos: torch.Tensor,
+    sin: torch.Tensor,
+    out1: torch.Tensor,  # presumably written in place by the op (nothing is returned) - TODO confirm
+    out2: torch.Tensor,  # presumably written in place by the op (nothing is returned) - TODO confirm
+    conj: bool,  # forwarded as-is; its meaning is defined by the compiled kernel, not visible here
+) -> None:
+    ops.apply_rotary(x1, x2, cos, sin, out1, out2, conj)
+
+
+def apply_rotary_transformers(
+    q: torch.Tensor,
+    k: torch.Tensor,
+    cos: torch.Tensor,
+    sin: torch.Tensor,
+    unsqueeze_dim: int = 1,
+) -> Tuple[torch.Tensor, torch.Tensor]:
+    """
+    Apply the rotary kernel to clones of `q` and `k` and return `(q_rotated, k_rotated)`; `q` and `k` are not modified.
+    Adapts the rotary kernel to the transformers `apply_rotary_pos_emb` signature; `cos`/`sin` get a new axis at `unsqueeze_dim`.
+    """
+    cos = cos.unsqueeze(unsqueeze_dim)
+    sin = sin.unsqueeze(unsqueeze_dim)
+
+    q_rotated = q.clone()  # work on copies so the caller's tensors stay untouched
+    k_rotated = k.clone()
+
+    # Split the last dimension into two halves; each half is handed to the kernel as a separate tensor
+    half_dim = q.shape[-1] // 2  # NOTE(review): taken from q only; assumes k has the same last-dim size - confirm
+    q1 = q_rotated[..., :half_dim]
+    q2 = q_rotated[..., half_dim:]
+    k1 = k_rotated[..., :half_dim]
+    k2 = k_rotated[..., half_dim:]
+    if cos.shape[-1] != half_dim:
+        # Keep only the first half_dim entries of cos/sin along the last dimension
+        cos = cos[..., :half_dim]
+        sin = sin[..., :half_dim]
+
+    apply_rotary(q1, q2, cos, sin, q1, q2, False)  # same slices as input and output: result lands in q_rotated
+    apply_rotary(k1, k2, cos, sin, k1, k2, False)  # likewise for k_rotated
+    return q_rotated, k_rotated
+
+
+__all__ = ["apply_rotary", "apply_rotary_transformers"]
diff --git a/build/torch211-cxx11-cu130-aarch64-linux/_ops.py b/build/torch211-cxx11-cu130-aarch64-linux/_ops.py
new file mode 100644
index 0000000000000000000000000000000000000000..d09f2f0956a472d57a4bb833d515b40d124f276f
--- /dev/null
+++ b/build/torch211-cxx11-cu130-aarch64-linux/_ops.py
@@ -0,0 +1,9 @@
+import torch
+from . import _rotary_cuda_2022aa6
+ops = torch.ops._rotary_cuda_2022aa6  # op namespace; same name as the extension module imported above
+
+def add_op_namespace_prefix(op_name: str) -> str:
+    """
+    Return `op_name` qualified with this build's op namespace, as `<namespace>::<op_name>`.
+    """
+    return f"_rotary_cuda_2022aa6::{op_name}"
diff --git a/build/torch211-cxx11-cu130-aarch64-linux/_rotary_cuda_2022aa6.abi3.so b/build/torch211-cxx11-cu130-aarch64-linux/_rotary_cuda_2022aa6.abi3.so
new file mode 100644
index 0000000000000000000000000000000000000000..6b2f5fe6c71a8c0693fe74848470281b5a9ee458
--- /dev/null
+++ b/build/torch211-cxx11-cu130-aarch64-linux/_rotary_cuda_2022aa6.abi3.so
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:008992ab92a2e0f4d5a63664706f69115de50db7d86c00f80cf944a85f979ae5
+size 10407744
diff --git a/build/torch211-cxx11-cu130-aarch64-linux/metadata.json b/build/torch211-cxx11-cu130-aarch64-linux/metadata.json
new file mode 100644
index 0000000000000000000000000000000000000000..eff725542128e103dfb5df382d74940efff77214
--- /dev/null
+++ b/build/torch211-cxx11-cu130-aarch64-linux/metadata.json
@@ -0,0 +1,19 @@
+{
+  "version": 1,
+  "license": "BSD-3-Clause",
+  "python-depends": [],
+  "backend": {
+    "type": "cuda",
+    "archs": [
+      "10.0",
+      "11.0",
+      "12.0+PTX",
+      "7.5",
+      "8.0",
+      "8.6",
+      "8.7",
+      "8.9",
+      "9.0"
+    ]
+  }
+}
diff --git a/build/torch211-cxx11-cu130-aarch64-linux/rotary/__init__.py b/build/torch211-cxx11-cu130-aarch64-linux/rotary/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..a9b2672c1cd85b74c1b3ded0fc0b2100e1aeac23
--- /dev/null
+++ b/build/torch211-cxx11-cu130-aarch64-linux/rotary/__init__.py
@@ -0,0 +1,26 @@
+import ctypes
+import importlib.util
+import sys
+from pathlib import Path
+from types import ModuleType
+
+
+def _import_from_path(file_path: Path) -> ModuleType:
+    # Load the Python file at `file_path` as a module, execute it and return it (ImportError on failure).
+    # The module is registered in `sys.modules` under a name derived from the path rather than its
+    # own name, so it cannot be picked up by other imports: the name is the hex-encoded hash of the
+    # absolute path (`ctypes.c_size_t` maps a negative hash to an unsigned value before formatting).
+    path_hash = "{:x}".format(ctypes.c_size_t(hash(file_path.absolute())).value)
+    module_name = path_hash
+    spec = importlib.util.spec_from_file_location(module_name, file_path)
+    if spec is None:
+        raise ImportError(f"Cannot load spec for {module_name} from {file_path}")
+    module = importlib.util.module_from_spec(spec)
+    if module is None:
+        raise ImportError(f"Cannot load module {module_name} from spec")
+    sys.modules[module_name] = module  # registered before the module body is executed
+    spec.loader.exec_module(module)  # type: ignore
+    return module
+
+
+globals().update(vars(_import_from_path(Path(__file__).parent.parent / "__init__.py")))  # copy every attribute of the parent directory's __init__.py into this module
diff --git a/build/torch211-cxx11-cu130-x86_64-linux/__init__.py b/build/torch211-cxx11-cu130-x86_64-linux/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..a32e6a58cb685314795328dccadba33e87eaee6f
--- /dev/null
+++ b/build/torch211-cxx11-cu130-x86_64-linux/__init__.py
@@ -0,0 +1,52 @@
+from typing import Optional, Tuple
+import torch
+
+from ._ops import ops
+
+
+def apply_rotary(  # thin wrapper: forwards every argument, unchanged and in order, to `ops.apply_rotary`
+    x1: torch.Tensor,
+    x2: torch.Tensor,
+    cos: torch.Tensor,
+    sin: torch.Tensor,
+    out1: torch.Tensor,  # presumably written in place by the op (nothing is returned) - TODO confirm
+    out2: torch.Tensor,  # presumably written in place by the op (nothing is returned) - TODO confirm
+    conj: bool,  # forwarded as-is; its meaning is defined by the compiled kernel, not visible here
+) -> None:
+    ops.apply_rotary(x1, x2, cos, sin, out1, out2, conj)
+
+
+def apply_rotary_transformers(
+    q: torch.Tensor,
+    k: torch.Tensor,
+    cos: torch.Tensor,
+    sin: torch.Tensor,
+    unsqueeze_dim: int = 1,
+) -> Tuple[torch.Tensor, torch.Tensor]:
+    """
+    Apply the rotary kernel to clones of `q` and `k` and return `(q_rotated, k_rotated)`; `q` and `k` are not modified.
+    Adapts the rotary kernel to the transformers `apply_rotary_pos_emb` signature; `cos`/`sin` get a new axis at `unsqueeze_dim`.
+    """
+    cos = cos.unsqueeze(unsqueeze_dim)
+    sin = sin.unsqueeze(unsqueeze_dim)
+
+    q_rotated = q.clone()  # work on copies so the caller's tensors stay untouched
+    k_rotated = k.clone()
+
+    # Split the last dimension into two halves; each half is handed to the kernel as a separate tensor
+    half_dim = q.shape[-1] // 2  # NOTE(review): taken from q only; assumes k has the same last-dim size - confirm
+    q1 = q_rotated[..., :half_dim]
+    q2 = q_rotated[..., half_dim:]
+    k1 = k_rotated[..., :half_dim]
+    k2 = k_rotated[..., half_dim:]
+    if cos.shape[-1] != half_dim:
+        # Keep only the first half_dim entries of cos/sin along the last dimension
+        cos = cos[..., :half_dim]
+        sin = sin[..., :half_dim]
+
+    apply_rotary(q1, q2, cos, sin, q1, q2, False)  # same slices as input and output: result lands in q_rotated
+    apply_rotary(k1, k2, cos, sin, k1, k2, False)  # likewise for k_rotated
+    return q_rotated, k_rotated
+
+
+__all__ = ["apply_rotary", "apply_rotary_transformers"]
diff --git a/build/torch211-cxx11-cu130-x86_64-linux/_ops.py b/build/torch211-cxx11-cu130-x86_64-linux/_ops.py
new file mode 100644
index 0000000000000000000000000000000000000000..d09f2f0956a472d57a4bb833d515b40d124f276f
--- /dev/null
+++ b/build/torch211-cxx11-cu130-x86_64-linux/_ops.py
@@ -0,0 +1,9 @@
+import torch
+from . import _rotary_cuda_2022aa6
+ops = torch.ops._rotary_cuda_2022aa6  # op namespace; same name as the extension module imported above
+
+def add_op_namespace_prefix(op_name: str) -> str:
+    """
+    Return `op_name` qualified with this build's op namespace, as `<namespace>::<op_name>`.
+    """
+    return f"_rotary_cuda_2022aa6::{op_name}"
diff --git a/build/torch211-cxx11-cu130-x86_64-linux/_rotary_cuda_2022aa6.abi3.so b/build/torch211-cxx11-cu130-x86_64-linux/_rotary_cuda_2022aa6.abi3.so
new file mode 100644
index 0000000000000000000000000000000000000000..602bc069951380743948af6480f96b9f27168e59
--- /dev/null
+++ b/build/torch211-cxx11-cu130-x86_64-linux/_rotary_cuda_2022aa6.abi3.so
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:795a71adb51bd0405c372522f4f13d60addf89f36d4784f1206273d38261bafd
+size 10303784
diff --git a/build/torch211-cxx11-cu130-x86_64-linux/metadata.json b/build/torch211-cxx11-cu130-x86_64-linux/metadata.json
new file mode 100644
index 0000000000000000000000000000000000000000..eff725542128e103dfb5df382d74940efff77214
--- /dev/null
+++ b/build/torch211-cxx11-cu130-x86_64-linux/metadata.json
@@ -0,0 +1,19 @@
+{
+  "version": 1,
+  "license": "BSD-3-Clause",
+  "python-depends": [],
+  "backend": {
+    "type": "cuda",
+    "archs": [
+      "10.0",
+      "11.0",
+      "12.0+PTX",
+      "7.5",
+      "8.0",
+      "8.6",
+      "8.7",
+      "8.9",
+      "9.0"
+    ]
+  }
+}
diff --git a/build/torch211-cxx11-cu130-x86_64-linux/rotary/__init__.py b/build/torch211-cxx11-cu130-x86_64-linux/rotary/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..a9b2672c1cd85b74c1b3ded0fc0b2100e1aeac23
--- /dev/null
+++ b/build/torch211-cxx11-cu130-x86_64-linux/rotary/__init__.py
@@ -0,0 +1,26 @@
+import ctypes
+import importlib.util
+import sys
+from pathlib import Path
+from types import ModuleType
+
+
+def _import_from_path(file_path: Path) -> ModuleType:
+    # Load the Python file at `file_path` as a module, execute it and return it (ImportError on failure).
+    # The module is registered in `sys.modules` under a name derived from the path rather than its
+    # own name, so it cannot be picked up by other imports: the name is the hex-encoded hash of the
+    # absolute path (`ctypes.c_size_t` maps a negative hash to an unsigned value before formatting).
+    path_hash = "{:x}".format(ctypes.c_size_t(hash(file_path.absolute())).value)
+    module_name = path_hash
+    spec = importlib.util.spec_from_file_location(module_name, file_path)
+    if spec is None:
+        raise ImportError(f"Cannot load spec for {module_name} from {file_path}")
+    module = importlib.util.module_from_spec(spec)
+    if module is None:
+        raise ImportError(f"Cannot load module {module_name} from spec")
+    sys.modules[module_name] = module  # registered before the module body is executed
+    spec.loader.exec_module(module)  # type: ignore
+    return module
+
+
+globals().update(vars(_import_from_path(Path(__file__).parent.parent / "__init__.py")))  # copy every attribute of the parent directory's __init__.py into this module
diff --git a/build/torch211-cxx11-xpu20253-x86_64-linux/__init__.py b/build/torch211-cxx11-xpu20253-x86_64-linux/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..a32e6a58cb685314795328dccadba33e87eaee6f
--- /dev/null
+++ b/build/torch211-cxx11-xpu20253-x86_64-linux/__init__.py
@@ -0,0 +1,52 @@
+from typing import Optional, Tuple
+import torch
+
+from ._ops import ops
+
+
+def apply_rotary(  # thin wrapper: forwards every argument, unchanged and in order, to `ops.apply_rotary`
+    x1: torch.Tensor,
+    x2: torch.Tensor,
+    cos: torch.Tensor,
+    sin: torch.Tensor,
+    out1: torch.Tensor,  # presumably written in place by the op (nothing is returned) - TODO confirm
+    out2: torch.Tensor,  # presumably written in place by the op (nothing is returned) - TODO confirm
+    conj: bool,  # forwarded as-is; its meaning is defined by the compiled kernel, not visible here
+) -> None:
+    ops.apply_rotary(x1, x2, cos, sin, out1, out2, conj)
+
+
+def apply_rotary_transformers(
+    q: torch.Tensor,
+    k: torch.Tensor,
+    cos: torch.Tensor,
+    sin: torch.Tensor,
+    unsqueeze_dim: int = 1,
+) -> Tuple[torch.Tensor, torch.Tensor]:
+    """
+    Apply the rotary kernel to clones of `q` and `k` and return `(q_rotated, k_rotated)`; `q` and `k` are not modified.
+    Adapts the rotary kernel to the transformers `apply_rotary_pos_emb` signature; `cos`/`sin` get a new axis at `unsqueeze_dim`.
+    """
+    cos = cos.unsqueeze(unsqueeze_dim)
+    sin = sin.unsqueeze(unsqueeze_dim)
+
+    q_rotated = q.clone()  # work on copies so the caller's tensors stay untouched
+    k_rotated = k.clone()
+
+    # Split the last dimension into two halves; each half is handed to the kernel as a separate tensor
+    half_dim = q.shape[-1] // 2  # NOTE(review): taken from q only; assumes k has the same last-dim size - confirm
+    q1 = q_rotated[..., :half_dim]
+    q2 = q_rotated[..., half_dim:]
+    k1 = k_rotated[..., :half_dim]
+    k2 = k_rotated[..., half_dim:]
+    if cos.shape[-1] != half_dim:
+        # Keep only the first half_dim entries of cos/sin along the last dimension
+        cos = cos[..., :half_dim]
+        sin = sin[..., :half_dim]
+
+    apply_rotary(q1, q2, cos, sin, q1, q2, False)  # same slices as input and output: result lands in q_rotated
+    apply_rotary(k1, k2, cos, sin, k1, k2, False)  # likewise for k_rotated
+    return q_rotated, k_rotated
+
+
+__all__ = ["apply_rotary", "apply_rotary_transformers"]
diff --git a/build/torch211-cxx11-xpu20253-x86_64-linux/_ops.py b/build/torch211-cxx11-xpu20253-x86_64-linux/_ops.py
new file mode 100644
index 0000000000000000000000000000000000000000..0316a95137455eff318a2ed3f70d396c1980d290
--- /dev/null
+++ b/build/torch211-cxx11-xpu20253-x86_64-linux/_ops.py
@@ -0,0 +1,9 @@
+import torch
+from . import _rotary_xpu_2022aa6
+ops = torch.ops._rotary_xpu_2022aa6  # op namespace; same name as the extension module imported above
+
+def add_op_namespace_prefix(op_name: str) -> str:
+    """
+    Return `op_name` qualified with this build's op namespace, as `<namespace>::<op_name>`.
+    """
+    return f"_rotary_xpu_2022aa6::{op_name}"
diff --git a/build/torch211-cxx11-xpu20253-x86_64-linux/_rotary_xpu_2022aa6.abi3.so b/build/torch211-cxx11-xpu20253-x86_64-linux/_rotary_xpu_2022aa6.abi3.so
new file mode 100644
index 0000000000000000000000000000000000000000..e16d7f4e849125d39825d5c1a844b6e383f7bc8d
--- /dev/null
+++ b/build/torch211-cxx11-xpu20253-x86_64-linux/_rotary_xpu_2022aa6.abi3.so
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:4d3e980c4780de32a6a55f04bc9642e516f7858a6174d2cd3b973d23141c17ce
+size 2301504
diff --git a/build/torch211-cxx11-xpu20253-x86_64-linux/metadata.json b/build/torch211-cxx11-xpu20253-x86_64-linux/metadata.json
new file mode 100644
index 0000000000000000000000000000000000000000..8f032899cf61212add2325c22107252842bd1588
--- /dev/null
+++ b/build/torch211-cxx11-xpu20253-x86_64-linux/metadata.json
@@ -0,0 +1,8 @@
+{
+  "version": 1,
+  "license": "BSD-3-Clause",
+  "python-depends": [],
+  "backend": {
+    "type": "xpu"
+  }
+}
\ No newline at end of file
diff --git a/build/torch211-cxx11-xpu20253-x86_64-linux/rotary/__init__.py b/build/torch211-cxx11-xpu20253-x86_64-linux/rotary/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..a9b2672c1cd85b74c1b3ded0fc0b2100e1aeac23
--- /dev/null
+++ b/build/torch211-cxx11-xpu20253-x86_64-linux/rotary/__init__.py
@@ -0,0 +1,26 @@
+import ctypes
+import importlib.util
+import sys
+from pathlib import Path
+from types import ModuleType
+
+
+def _import_from_path(file_path: Path) -> ModuleType:
+    # Load the Python file at `file_path` as a module, execute it and return it (ImportError on failure).
+    # The module is registered in `sys.modules` under a name derived from the path rather than its
+    # own name, so it cannot be picked up by other imports: the name is the hex-encoded hash of the
+    # absolute path (`ctypes.c_size_t` maps a negative hash to an unsigned value before formatting).
+    path_hash = "{:x}".format(ctypes.c_size_t(hash(file_path.absolute())).value)
+    module_name = path_hash
+    spec = importlib.util.spec_from_file_location(module_name, file_path)
+    if spec is None:
+        raise ImportError(f"Cannot load spec for {module_name} from {file_path}")
+    module = importlib.util.module_from_spec(spec)
+    if module is None:
+        raise ImportError(f"Cannot load module {module_name} from spec")
+    sys.modules[module_name] = module  # registered before the module body is executed
+    spec.loader.exec_module(module)  # type: ignore
+    return module
+
+
+globals().update(vars(_import_from_path(Path(__file__).parent.parent / "__init__.py")))  # copy every attribute of the parent directory's __init__.py into this module
diff --git a/build/torch26-cxx11-cu126-aarch64-linux/rotary/__init__.py b/build/torch26-cxx11-cu126-aarch64-linux/rotary/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..eba8039e210c8b710c5c663ef4e7930757f271be
--- /dev/null
+++ b/build/torch26-cxx11-cu126-aarch64-linux/rotary/__init__.py
@@ -0,0 +1,19 @@
+from typing import Tuple
+import torch
+
+from ._ops import ops
+
+
+def apply_rotary(  # thin wrapper: forwards every argument, unchanged and in order, to `ops.apply_rotary`
+    x1: torch.Tensor,
+    x2: torch.Tensor,
+    cos: torch.Tensor,
+    sin: torch.Tensor,
+    out1: torch.Tensor,  # presumably written in place by the op (nothing is returned) - TODO confirm
+    out2: torch.Tensor,  # presumably written in place by the op (nothing is returned) - TODO confirm
+    conj: bool,  # forwarded as-is; its meaning is defined by the compiled kernel, not visible here
+) -> None:
+    ops.apply_rotary(x1, x2, cos, sin, out1, out2, conj)
+
+
+__all__ = ["apply_rotary"]
diff --git a/build/torch26-cxx11-cu126-aarch64-linux/rotary/_ops.py b/build/torch26-cxx11-cu126-aarch64-linux/rotary/_ops.py
new file mode 100644
index 0000000000000000000000000000000000000000..8d9717343c07cf81e45646b6fc80dddc95d58bdf
--- /dev/null
+++ b/build/torch26-cxx11-cu126-aarch64-linux/rotary/_ops.py
@@ -0,0 +1,9 @@
+import torch
+from . import _rotary_6b8e81d
+ops = torch.ops._rotary_6b8e81d  # op namespace; same name as the extension module imported above
+
+def add_op_namespace_prefix(op_name: str) -> str:
+    """
+    Return `op_name` qualified with this build's op namespace, as `<namespace>::<op_name>`.
+    """
+    return f"_rotary_6b8e81d::{op_name}"
\ No newline at end of file
diff --git a/build/torch26-cxx11-cu126-aarch64-linux/rotary/_rotary_6b8e81d.abi3.so b/build/torch26-cxx11-cu126-aarch64-linux/rotary/_rotary_6b8e81d.abi3.so
new file mode 100755
index 0000000000000000000000000000000000000000..7abd3dca0cbc48d258d764432d3f912e30beb788
--- /dev/null
+++ b/build/torch26-cxx11-cu126-aarch64-linux/rotary/_rotary_6b8e81d.abi3.so
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:91e09bffe6812e5fbf856a01a164bc41c4eb3f49e2102c723c20d695025a34e9
+size 4543712
diff --git a/build/torch26-cxx98-cu126-aarch64-linux/rotary/__init__.py b/build/torch26-cxx98-cu126-aarch64-linux/rotary/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..eba8039e210c8b710c5c663ef4e7930757f271be
--- /dev/null
+++ b/build/torch26-cxx98-cu126-aarch64-linux/rotary/__init__.py
@@ -0,0 +1,19 @@
+from typing import Tuple
+import torch
+
+from ._ops import ops
+
+
+def apply_rotary(  # thin wrapper: forwards every argument, unchanged and in order, to `ops.apply_rotary`
+    x1: torch.Tensor,
+    x2: torch.Tensor,
+    cos: torch.Tensor,
+    sin: torch.Tensor,
+    out1: torch.Tensor,  # presumably written in place by the op (nothing is returned) - TODO confirm
+    out2: torch.Tensor,  # presumably written in place by the op (nothing is returned) - TODO confirm
+    conj: bool,  # forwarded as-is; its meaning is defined by the compiled kernel, not visible here
+) -> None:
+    ops.apply_rotary(x1, x2, cos, sin, out1, out2, conj)
+
+
+__all__ = ["apply_rotary"]
diff --git a/build/torch26-cxx98-cu126-aarch64-linux/rotary/_ops.py b/build/torch26-cxx98-cu126-aarch64-linux/rotary/_ops.py
new file mode 100644
index 0000000000000000000000000000000000000000..8d9717343c07cf81e45646b6fc80dddc95d58bdf
--- /dev/null
+++ b/build/torch26-cxx98-cu126-aarch64-linux/rotary/_ops.py
@@ -0,0 +1,9 @@
+import torch
+from . import _rotary_6b8e81d
+ops = torch.ops._rotary_6b8e81d  # op namespace; same name as the extension module imported above
+
+def add_op_namespace_prefix(op_name: str) -> str:
+    """
+    Return `op_name` qualified with this build's op namespace, as `<namespace>::<op_name>`.
+    """
+    return f"_rotary_6b8e81d::{op_name}"
\ No newline at end of file
diff --git a/build/torch26-cxx98-cu126-aarch64-linux/rotary/_rotary_6b8e81d.abi3.so b/build/torch26-cxx98-cu126-aarch64-linux/rotary/_rotary_6b8e81d.abi3.so
new file mode 100755
index 0000000000000000000000000000000000000000..8fcb995b549848b4d7375e4d41bcd219f4857328
--- /dev/null
+++ b/build/torch26-cxx98-cu126-aarch64-linux/rotary/_rotary_6b8e81d.abi3.so
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:b215a74951fe7e1c8be6a8fb7f54483e0e393958acc4c410b9fca7ce70470e39
+size 4540224
diff --git a/build/torch27-cxx11-cu118-x86_64-linux/rotary/__init__.py b/build/torch27-cxx11-cu118-x86_64-linux/rotary/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..eba8039e210c8b710c5c663ef4e7930757f271be
--- /dev/null
+++ b/build/torch27-cxx11-cu118-x86_64-linux/rotary/__init__.py
@@ -0,0 +1,19 @@
+from typing import Tuple
+import torch
+
+from ._ops import ops
+
+
+def apply_rotary(  # thin wrapper: forwards every argument, unchanged and in order, to `ops.apply_rotary`
+    x1: torch.Tensor,
+    x2: torch.Tensor,
+    cos: torch.Tensor,
+    sin: torch.Tensor,
+    out1: torch.Tensor,  # presumably written in place by the op (nothing is returned) - TODO confirm
+    out2: torch.Tensor,  # presumably written in place by the op (nothing is returned) - TODO confirm
+    conj: bool,  # forwarded as-is; its meaning is defined by the compiled kernel, not visible here
+) -> None:
+    ops.apply_rotary(x1, x2, cos, sin, out1, out2, conj)
+
+
+__all__ = ["apply_rotary"]
diff --git a/build/torch27-cxx11-cu118-x86_64-linux/rotary/__pycache__/__init__.cpython-313.pyc b/build/torch27-cxx11-cu118-x86_64-linux/rotary/__pycache__/__init__.cpython-313.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..973f6b3c5f14b063a77f0feb30beb6749e74e985
Binary files /dev/null and b/build/torch27-cxx11-cu118-x86_64-linux/rotary/__pycache__/__init__.cpython-313.pyc differ
diff --git a/build/torch27-cxx11-cu118-x86_64-linux/rotary/__pycache__/_ops.cpython-313.pyc b/build/torch27-cxx11-cu118-x86_64-linux/rotary/__pycache__/_ops.cpython-313.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..24dc100bca31802e6e17c9293e489129509bea6c
Binary files /dev/null and b/build/torch27-cxx11-cu118-x86_64-linux/rotary/__pycache__/_ops.cpython-313.pyc differ
diff --git a/build/torch27-cxx11-cu118-x86_64-linux/rotary/_ops.py b/build/torch27-cxx11-cu118-x86_64-linux/rotary/_ops.py
new file mode 100644
index 0000000000000000000000000000000000000000..d43408adb5450ff15c2e04cd3311709823d05e29
--- /dev/null
+++ b/build/torch27-cxx11-cu118-x86_64-linux/rotary/_ops.py
@@ -0,0 +1,9 @@
+import torch
+from . import _rotary_98ffc18
+ops = torch.ops._rotary_98ffc18  # op namespace; same name as the extension module imported above
+
+def add_op_namespace_prefix(op_name: str) -> str:
+    """
+    Return `op_name` qualified with this build's op namespace, as `<namespace>::<op_name>`.
+    """
+    return f"_rotary_98ffc18::{op_name}"
\ No newline at end of file
diff --git a/build/torch27-cxx11-cu118-x86_64-linux/rotary/_rotary_98ffc18.abi3.so b/build/torch27-cxx11-cu118-x86_64-linux/rotary/_rotary_98ffc18.abi3.so
new file mode 100755
index 0000000000000000000000000000000000000000..9eac909edbaf49935f07cc1b554e77df437c30e0
--- /dev/null
+++ b/build/torch27-cxx11-cu118-x86_64-linux/rotary/_rotary_98ffc18.abi3.so
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:030e76c6ee0921ae7ada04dfe14fceb8a4454e794ddf9ce68f29a32e7075c9be
+size 6807656
diff --git a/build/torch27-cxx11-cu126-aarch64-linux/rotary/__init__.py b/build/torch27-cxx11-cu126-aarch64-linux/rotary/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..eba8039e210c8b710c5c663ef4e7930757f271be
--- /dev/null
+++ b/build/torch27-cxx11-cu126-aarch64-linux/rotary/__init__.py
@@ -0,0 +1,19 @@
+from typing import Tuple
+import torch
+
+from ._ops import ops
+
+
+def apply_rotary(  # thin wrapper: forwards every argument, unchanged and in order, to `ops.apply_rotary`
+    x1: torch.Tensor,
+    x2: torch.Tensor,
+    cos: torch.Tensor,
+    sin: torch.Tensor,
+    out1: torch.Tensor,  # presumably written in place by the op (nothing is returned) - TODO confirm
+    out2: torch.Tensor,  # presumably written in place by the op (nothing is returned) - TODO confirm
+    conj: bool,  # forwarded as-is; its meaning is defined by the compiled kernel, not visible here
+) -> None:
+    ops.apply_rotary(x1, x2, cos, sin, out1, out2, conj)
+
+
+__all__ = ["apply_rotary"]
diff --git a/build/torch27-cxx11-cu126-aarch64-linux/rotary/_ops.py b/build/torch27-cxx11-cu126-aarch64-linux/rotary/_ops.py
new file mode 100644
index 0000000000000000000000000000000000000000..8d9717343c07cf81e45646b6fc80dddc95d58bdf
--- /dev/null
+++ b/build/torch27-cxx11-cu126-aarch64-linux/rotary/_ops.py
@@ -0,0 +1,9 @@
+import torch
+from . import _rotary_6b8e81d
+ops = torch.ops._rotary_6b8e81d  # op namespace; same name as the extension module imported above
+
+def add_op_namespace_prefix(op_name: str) -> str:
+    """
+    Return `op_name` qualified with this build's op namespace, as `<namespace>::<op_name>`.
+    """
+    return f"_rotary_6b8e81d::{op_name}"
\ No newline at end of file
diff --git a/build/torch27-cxx11-cu126-aarch64-linux/rotary/_rotary_6b8e81d.abi3.so b/build/torch27-cxx11-cu126-aarch64-linux/rotary/_rotary_6b8e81d.abi3.so
new file mode 100755
index 0000000000000000000000000000000000000000..56513966cfae8464c984ce7af405618bee4dfabb
--- /dev/null
+++ b/build/torch27-cxx11-cu126-aarch64-linux/rotary/_rotary_6b8e81d.abi3.so
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:35fab33bc6bf4d4294efc1140427f8ff608a4633d8f6dfc9416547e78fc2dba4
+size 6378944
diff --git a/build/torch27-cxx11-cu126-x86_64-linux/rotary/__init__.py b/build/torch27-cxx11-cu126-x86_64-linux/rotary/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..eba8039e210c8b710c5c663ef4e7930757f271be
--- /dev/null
+++ b/build/torch27-cxx11-cu126-x86_64-linux/rotary/__init__.py
@@ -0,0 +1,19 @@
+from typing import Tuple
+import torch
+
+from ._ops import ops
+
+
+def apply_rotary(  # thin wrapper: forwards every argument, unchanged and in order, to `ops.apply_rotary`
+    x1: torch.Tensor,
+    x2: torch.Tensor,
+    cos: torch.Tensor,
+    sin: torch.Tensor,
+    out1: torch.Tensor,  # presumably written in place by the op (nothing is returned) - TODO confirm
+    out2: torch.Tensor,  # presumably written in place by the op (nothing is returned) - TODO confirm
+    conj: bool,  # forwarded as-is; its meaning is defined by the compiled kernel, not visible here
+) -> None:
+    ops.apply_rotary(x1, x2, cos, sin, out1, out2, conj)
+
+
+__all__ = ["apply_rotary"]
diff --git a/build/torch27-cxx11-cu126-x86_64-linux/rotary/__pycache__/__init__.cpython-313.pyc b/build/torch27-cxx11-cu126-x86_64-linux/rotary/__pycache__/__init__.cpython-313.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..d9b67802d61c31b708beb2b07a6b65187dbd5ae1
Binary files /dev/null and b/build/torch27-cxx11-cu126-x86_64-linux/rotary/__pycache__/__init__.cpython-313.pyc differ
diff --git a/build/torch27-cxx11-cu126-x86_64-linux/rotary/__pycache__/_ops.cpython-313.pyc b/build/torch27-cxx11-cu126-x86_64-linux/rotary/__pycache__/_ops.cpython-313.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..54042216baf24f270b9709ac7bc50bc654fd31f3
Binary files /dev/null and b/build/torch27-cxx11-cu126-x86_64-linux/rotary/__pycache__/_ops.cpython-313.pyc differ
diff --git a/build/torch27-cxx11-cu126-x86_64-linux/rotary/_ops.py b/build/torch27-cxx11-cu126-x86_64-linux/rotary/_ops.py
new file mode 100644
index 0000000000000000000000000000000000000000..d43408adb5450ff15c2e04cd3311709823d05e29
--- /dev/null
+++ b/build/torch27-cxx11-cu126-x86_64-linux/rotary/_ops.py
@@ -0,0 +1,9 @@
+import torch
+from . import _rotary_98ffc18
+ops = torch.ops._rotary_98ffc18  # op namespace; same name as the extension module imported above
+
+def add_op_namespace_prefix(op_name: str) -> str:
+    """
+    Return `op_name` qualified with this build's op namespace, as `<namespace>::<op_name>`.
+    """
+    return f"_rotary_98ffc18::{op_name}"
\ No newline at end of file
diff --git a/build/torch27-cxx11-cu126-x86_64-linux/rotary/_rotary_98ffc18.abi3.so b/build/torch27-cxx11-cu126-x86_64-linux/rotary/_rotary_98ffc18.abi3.so
new file mode 100755
index 0000000000000000000000000000000000000000..923de95a7bd6bbf59a860f4ab35fcc11cb371765
--- /dev/null
+++ b/build/torch27-cxx11-cu126-x86_64-linux/rotary/_rotary_98ffc18.abi3.so
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:a502a5e1b8282bfc625c617eb6b239a2c4277d9198ec0dd162589b61005c8c92
+size 6820496
diff --git a/build/torch27-cxx11-cu128-aarch64-linux/rotary/__init__.py b/build/torch27-cxx11-cu128-aarch64-linux/rotary/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..eba8039e210c8b710c5c663ef4e7930757f271be
--- /dev/null
+++ b/build/torch27-cxx11-cu128-aarch64-linux/rotary/__init__.py
@@ -0,0 +1,19 @@
+from typing import Tuple
+import torch
+
+from ._ops import ops
+
+
+def apply_rotary(  # thin wrapper: forwards every argument, unchanged and in order, to `ops.apply_rotary`
+    x1: torch.Tensor,
+    x2: torch.Tensor,
+    cos: torch.Tensor,
+    sin: torch.Tensor,
+    out1: torch.Tensor,  # presumably written in place by the op (nothing is returned) - TODO confirm
+    out2: torch.Tensor,  # presumably written in place by the op (nothing is returned) - TODO confirm
+    conj: bool,  # forwarded as-is; its meaning is defined by the compiled kernel, not visible here
+) -> None:
+    ops.apply_rotary(x1, x2, cos, sin, out1, out2, conj)
+
+
+__all__ = ["apply_rotary"]
diff --git a/build/torch27-cxx11-cu128-aarch64-linux/rotary/__pycache__/__init__.cpython-313.pyc b/build/torch27-cxx11-cu128-aarch64-linux/rotary/__pycache__/__init__.cpython-313.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..60d5bfab0618d302e241517f1c7ec4ce6f9fb156
Binary files /dev/null and b/build/torch27-cxx11-cu128-aarch64-linux/rotary/__pycache__/__init__.cpython-313.pyc differ
diff --git a/build/torch27-cxx11-cu128-aarch64-linux/rotary/__pycache__/_ops.cpython-313.pyc b/build/torch27-cxx11-cu128-aarch64-linux/rotary/__pycache__/_ops.cpython-313.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..fed6ec5d42bed01465a824b1081d1950022698cc
Binary files /dev/null and b/build/torch27-cxx11-cu128-aarch64-linux/rotary/__pycache__/_ops.cpython-313.pyc differ
diff --git a/build/torch27-cxx11-cu128-aarch64-linux/rotary/_ops.py b/build/torch27-cxx11-cu128-aarch64-linux/rotary/_ops.py
new file mode 100644
index 0000000000000000000000000000000000000000..1a9cec319f8e2b3f08afbe538960d08c34a6b08e
--- /dev/null
+++ b/build/torch27-cxx11-cu128-aarch64-linux/rotary/_ops.py
@@ -0,0 +1,9 @@
+import torch
+from . import _rotary_6abd2a8
+ops = torch.ops._rotary_6abd2a8
+
+def add_op_namespace_prefix(op_name: str):
+    """
+    Return ``op_name`` qualified with this build's op namespace.
+
+    The result has the form ``_rotary_6abd2a8::<op_name>``.
+    """
+    namespace = "_rotary_6abd2a8"
+    return f"{namespace}::{op_name}"
\ No newline at end of file
diff --git a/build/torch27-cxx11-cu128-aarch64-linux/rotary/_rotary_6abd2a8.abi3.so b/build/torch27-cxx11-cu128-aarch64-linux/rotary/_rotary_6abd2a8.abi3.so
new file mode 100644
index 0000000000000000000000000000000000000000..d26d05f076e49a4fbc1848f4c61173854ffce7a7
--- /dev/null
+++ b/build/torch27-cxx11-cu128-aarch64-linux/rotary/_rotary_6abd2a8.abi3.so
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:29909df0009da77fb276a0ffd328200a201b2fd06ec78c457e6c63554f4d3e2d
+size 10639192
diff --git a/build/torch27-cxx11-cu128-x86_64-linux/rotary/__init__.py b/build/torch27-cxx11-cu128-x86_64-linux/rotary/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..eba8039e210c8b710c5c663ef4e7930757f271be
--- /dev/null
+++ b/build/torch27-cxx11-cu128-x86_64-linux/rotary/__init__.py
@@ -0,0 +1,19 @@
+from typing import Tuple
+import torch
+
+from ._ops import ops
+
+
+def apply_rotary(
+    x1: torch.Tensor,
+    x2: torch.Tensor,
+    cos: torch.Tensor,
+    sin: torch.Tensor,
+    out1: torch.Tensor,
+    out2: torch.Tensor,
+    conj: bool,
+):
+    """
+    Call the compiled ``apply_rotary`` op exposed through ``ops``.
+
+    The seven arguments are handed to the op positionally, in the order they
+    are declared here. This wrapper has no return statement, so callers get
+    ``None`` back.
+
+    NOTE(review): presumably the op writes its result into ``out1``/``out2``
+    and ``conj`` selects the conjugate rotation -- confirm against the kernel
+    source, which is not visible from this file.
+    """
+    kernel_args = (x1, x2, cos, sin, out1, out2, conj)
+    ops.apply_rotary(*kernel_args)
+
+
+__all__ = ["apply_rotary"]
diff --git a/build/torch27-cxx11-cu128-x86_64-linux/rotary/__pycache__/__init__.cpython-313.pyc b/build/torch27-cxx11-cu128-x86_64-linux/rotary/__pycache__/__init__.cpython-313.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..52c6bb36a1ac3067d18af5428e95c9c081f5112f
Binary files /dev/null and b/build/torch27-cxx11-cu128-x86_64-linux/rotary/__pycache__/__init__.cpython-313.pyc differ
diff --git a/build/torch27-cxx11-cu128-x86_64-linux/rotary/__pycache__/_ops.cpython-313.pyc b/build/torch27-cxx11-cu128-x86_64-linux/rotary/__pycache__/_ops.cpython-313.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..781e81e09df4dbace6fbfd55b41b182206c546b8
Binary files /dev/null and b/build/torch27-cxx11-cu128-x86_64-linux/rotary/__pycache__/_ops.cpython-313.pyc differ
diff --git a/build/torch27-cxx11-cu128-x86_64-linux/rotary/_ops.py b/build/torch27-cxx11-cu128-x86_64-linux/rotary/_ops.py
new file mode 100644
index 0000000000000000000000000000000000000000..d43408adb5450ff15c2e04cd3311709823d05e29
--- /dev/null
+++ b/build/torch27-cxx11-cu128-x86_64-linux/rotary/_ops.py
@@ -0,0 +1,9 @@
+import torch
+from . import _rotary_98ffc18
+ops = torch.ops._rotary_98ffc18
+
+def add_op_namespace_prefix(op_name: str):
+    """
+    Return ``op_name`` qualified with this build's op namespace.
+
+    The result has the form ``_rotary_98ffc18::<op_name>``.
+    """
+    namespace = "_rotary_98ffc18"
+    return f"{namespace}::{op_name}"
\ No newline at end of file
diff --git a/build/torch27-cxx11-cu128-x86_64-linux/rotary/_rotary_98ffc18.abi3.so b/build/torch27-cxx11-cu128-x86_64-linux/rotary/_rotary_98ffc18.abi3.so
new file mode 100755
index 0000000000000000000000000000000000000000..20b1a12d41e9ce96e265f91c2989cce7219e1f77
--- /dev/null
+++ b/build/torch27-cxx11-cu128-x86_64-linux/rotary/_rotary_98ffc18.abi3.so
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:8acfdd58aac193ab809386077c567e58b3da4481b5bb38af87bd4cdc18e6dd2b
+size 10529816
diff --git a/build/torch27-cxx11-xpu20250-x86_64-linux/rotary/__init__.py b/build/torch27-cxx11-xpu20250-x86_64-linux/rotary/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..eba8039e210c8b710c5c663ef4e7930757f271be
--- /dev/null
+++ b/build/torch27-cxx11-xpu20250-x86_64-linux/rotary/__init__.py
@@ -0,0 +1,19 @@
+from typing import Tuple
+import torch
+
+from ._ops import ops
+
+
+def apply_rotary(
+    x1: torch.Tensor,
+    x2: torch.Tensor,
+    cos: torch.Tensor,
+    sin: torch.Tensor,
+    out1: torch.Tensor,
+    out2: torch.Tensor,
+    conj: bool,
+):
+    """
+    Call the compiled ``apply_rotary`` op exposed through ``ops``.
+
+    The seven arguments are handed to the op positionally, in the order they
+    are declared here. This wrapper has no return statement, so callers get
+    ``None`` back.
+
+    NOTE(review): presumably the op writes its result into ``out1``/``out2``
+    and ``conj`` selects the conjugate rotation -- confirm against the kernel
+    source, which is not visible from this file.
+    """
+    kernel_args = (x1, x2, cos, sin, out1, out2, conj)
+    ops.apply_rotary(*kernel_args)
+
+
+__all__ = ["apply_rotary"]
diff --git a/build/torch27-cxx11-xpu20250-x86_64-linux/rotary/__pycache__/__init__.cpython-313.pyc b/build/torch27-cxx11-xpu20250-x86_64-linux/rotary/__pycache__/__init__.cpython-313.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..969f133617d659b76547c15b0dfa58dfc7e50a20
Binary files /dev/null and b/build/torch27-cxx11-xpu20250-x86_64-linux/rotary/__pycache__/__init__.cpython-313.pyc differ
diff --git a/build/torch27-cxx11-xpu20250-x86_64-linux/rotary/__pycache__/_ops.cpython-313.pyc b/build/torch27-cxx11-xpu20250-x86_64-linux/rotary/__pycache__/_ops.cpython-313.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..5678d770414a30c0cf0d5b03123de2438c00a066
Binary files /dev/null and b/build/torch27-cxx11-xpu20250-x86_64-linux/rotary/__pycache__/_ops.cpython-313.pyc differ
diff --git a/build/torch27-cxx11-xpu20250-x86_64-linux/rotary/_ops.py b/build/torch27-cxx11-xpu20250-x86_64-linux/rotary/_ops.py
new file mode 100644
index 0000000000000000000000000000000000000000..d43408adb5450ff15c2e04cd3311709823d05e29
--- /dev/null
+++ b/build/torch27-cxx11-xpu20250-x86_64-linux/rotary/_ops.py
@@ -0,0 +1,9 @@
+import torch
+from . import _rotary_98ffc18
+ops = torch.ops._rotary_98ffc18
+
+def add_op_namespace_prefix(op_name: str):
+    """
+    Return ``op_name`` qualified with this build's op namespace.
+
+    The result has the form ``_rotary_98ffc18::<op_name>``.
+    """
+    namespace = "_rotary_98ffc18"
+    return f"{namespace}::{op_name}"
\ No newline at end of file
diff --git a/build/torch27-cxx11-xpu20250-x86_64-linux/rotary/_rotary_98ffc18.abi3.so b/build/torch27-cxx11-xpu20250-x86_64-linux/rotary/_rotary_98ffc18.abi3.so
new file mode 100755
index 0000000000000000000000000000000000000000..8173fd02037c1c71668f5f27551da5070912a1dd
--- /dev/null
+++ b/build/torch27-cxx11-xpu20250-x86_64-linux/rotary/_rotary_98ffc18.abi3.so
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:9c6c59c7df2489ffd3154047967b69d5a39788d784e2aa543b64ff192c184792
+size 2337512
diff --git a/build/torch28-cxx11-cu126-aarch64-linux/rotary/__init__.py b/build/torch28-cxx11-cu126-aarch64-linux/rotary/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..eba8039e210c8b710c5c663ef4e7930757f271be
--- /dev/null
+++ b/build/torch28-cxx11-cu126-aarch64-linux/rotary/__init__.py
@@ -0,0 +1,19 @@
+from typing import Tuple
+import torch
+
+from ._ops import ops
+
+
+def apply_rotary(
+    x1: torch.Tensor,
+    x2: torch.Tensor,
+    cos: torch.Tensor,
+    sin: torch.Tensor,
+    out1: torch.Tensor,
+    out2: torch.Tensor,
+    conj: bool,
+):
+    """
+    Call the compiled ``apply_rotary`` op exposed through ``ops``.
+
+    The seven arguments are handed to the op positionally, in the order they
+    are declared here. This wrapper has no return statement, so callers get
+    ``None`` back.
+
+    NOTE(review): presumably the op writes its result into ``out1``/``out2``
+    and ``conj`` selects the conjugate rotation -- confirm against the kernel
+    source, which is not visible from this file.
+    """
+    kernel_args = (x1, x2, cos, sin, out1, out2, conj)
+    ops.apply_rotary(*kernel_args)
+
+
+__all__ = ["apply_rotary"]
diff --git a/build/torch28-cxx11-cu126-aarch64-linux/rotary/__pycache__/__init__.cpython-313.pyc b/build/torch28-cxx11-cu126-aarch64-linux/rotary/__pycache__/__init__.cpython-313.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..73c1a2cac7d35226a8f650ce5aff7fe0020f027d
Binary files /dev/null and b/build/torch28-cxx11-cu126-aarch64-linux/rotary/__pycache__/__init__.cpython-313.pyc differ
diff --git a/build/torch28-cxx11-cu126-aarch64-linux/rotary/__pycache__/_ops.cpython-313.pyc b/build/torch28-cxx11-cu126-aarch64-linux/rotary/__pycache__/_ops.cpython-313.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..677c6b7a644f88c0bfbc029c3640cb441914ed17
Binary files /dev/null and b/build/torch28-cxx11-cu126-aarch64-linux/rotary/__pycache__/_ops.cpython-313.pyc differ
diff --git a/build/torch28-cxx11-cu126-aarch64-linux/rotary/_ops.py b/build/torch28-cxx11-cu126-aarch64-linux/rotary/_ops.py
new file mode 100644
index 0000000000000000000000000000000000000000..9506529a11d919c18f69068b69a3ed69a630bace
--- /dev/null
+++ b/build/torch28-cxx11-cu126-aarch64-linux/rotary/_ops.py
@@ -0,0 +1,9 @@
+import torch
+from . import _rotary_005dcc7_dirty
+ops = torch.ops._rotary_005dcc7_dirty
+
+def add_op_namespace_prefix(op_name: str):
+    """
+    Return ``op_name`` qualified with this build's op namespace.
+
+    The result has the form ``_rotary_005dcc7_dirty::<op_name>``.
+    """
+    namespace = "_rotary_005dcc7_dirty"
+    return f"{namespace}::{op_name}"
\ No newline at end of file
diff --git a/build/torch28-cxx11-cu126-aarch64-linux/rotary/_rotary_005dcc7_dirty.abi3.so b/build/torch28-cxx11-cu126-aarch64-linux/rotary/_rotary_005dcc7_dirty.abi3.so
new file mode 100755
index 0000000000000000000000000000000000000000..cbd5ed429365ebbeba58af0cb62a45e99a5073e3
--- /dev/null
+++ b/build/torch28-cxx11-cu126-aarch64-linux/rotary/_rotary_005dcc7_dirty.abi3.so
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:50a66e336456fb3e5ad528a8e6eef13f2d6ed8289936fb99445bd45a53172750
+size 6380008
diff --git a/build/torch28-cxx11-cu126-x86_64-linux/__init__.py b/build/torch28-cxx11-cu126-x86_64-linux/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..a32e6a58cb685314795328dccadba33e87eaee6f
--- /dev/null
+++ b/build/torch28-cxx11-cu126-x86_64-linux/__init__.py
@@ -0,0 +1,52 @@
+from typing import Optional, Tuple
+import torch
+
+from ._ops import ops
+
+
+def apply_rotary(
+    x1: torch.Tensor,
+    x2: torch.Tensor,
+    cos: torch.Tensor,
+    sin: torch.Tensor,
+    out1: torch.Tensor,
+    out2: torch.Tensor,
+    conj: bool,
+) -> None:
+    """
+    Call the compiled ``apply_rotary`` op exposed through ``ops``.
+
+    The seven arguments are handed to the op positionally, in the order they
+    are declared here. Nothing is returned to the caller.
+
+    NOTE(review): presumably the op writes its result into ``out1``/``out2``
+    and ``conj`` selects the conjugate rotation -- confirm against the kernel
+    source, which is not visible from this file.
+    """
+    kernel_args = (x1, x2, cos, sin, out1, out2, conj)
+    ops.apply_rotary(*kernel_args)
+
+
+def apply_rotary_transformers(
+    q: torch.Tensor,
+    k: torch.Tensor,
+    cos: torch.Tensor,
+    sin: torch.Tensor,
+    unsqueeze_dim: int = 1,
+) -> Tuple[torch.Tensor, torch.Tensor]:
+    """
+    Run the rotary kernel on copies of ``q`` and ``k``.
+
+    Wrapper intended to match the signature of transformers'
+    ``apply_rotary_pos_emb``.
+
+    Args:
+        q: Query tensor; it is cloned and the clone is handed to the kernel.
+        k: Key tensor; cloned in the same way.
+        cos: Cosine table; a new axis is inserted at ``unsqueeze_dim``.
+        sin: Sine table; a new axis is inserted at ``unsqueeze_dim``.
+        unsqueeze_dim: Position of the axis inserted into ``cos`` and ``sin``.
+
+    Returns:
+        The clones of ``q`` and ``k``, in that order, after the kernel calls.
+    """
+    cos = cos.unsqueeze(unsqueeze_dim)
+    sin = sin.unsqueeze(unsqueeze_dim)
+
+    # Only clones are handed to the op; ``q`` and ``k`` themselves never are.
+    rotated = (q.clone(), k.clone())
+
+    # Split point of the last axis; the query's width is used for both tensors.
+    half_dim = q.shape[-1] // 2
+    halves = [(t[..., :half_dim], t[..., half_dim:]) for t in rotated]
+
+    if cos.shape[-1] != half_dim:
+        # Keep only the leading ``half_dim`` entries of both tables.
+        cos, sin = cos[..., :half_dim], sin[..., :half_dim]
+
+    # Each slice of a clone is passed as both input and output.
+    # NOTE(review): presumably the op stores its result in the output
+    # arguments, which is what makes the returned clones rotated -- confirm.
+    for first, second in halves:
+        apply_rotary(first, second, cos, sin, first, second, False)
+    return rotated
+
+
+__all__ = ["apply_rotary", "apply_rotary_transformers"]
diff --git a/build/torch28-cxx11-cu126-x86_64-linux/_ops.py b/build/torch28-cxx11-cu126-x86_64-linux/_ops.py
new file mode 100644
index 0000000000000000000000000000000000000000..2671c5d7b93b8ebd836715b40fea0fd4fbbffc1e
--- /dev/null
+++ b/build/torch28-cxx11-cu126-x86_64-linux/_ops.py
@@ -0,0 +1,9 @@
+import torch
+from . import _rotary_1d9fc74
+ops = torch.ops._rotary_1d9fc74
+
+def add_op_namespace_prefix(op_name: str):
+    """
+    Return ``op_name`` qualified with this build's op namespace.
+
+    The result has the form ``_rotary_1d9fc74::<op_name>``.
+    """
+    namespace = "_rotary_1d9fc74"
+    return f"{namespace}::{op_name}"
\ No newline at end of file
diff --git a/build/torch28-cxx11-cu126-x86_64-linux/_rotary_1d9fc74.abi3.so b/build/torch28-cxx11-cu126-x86_64-linux/_rotary_1d9fc74.abi3.so
new file mode 100644
index 0000000000000000000000000000000000000000..0ce5b4d6b2a5244edf4133c94a15afa2268cb444
--- /dev/null
+++ b/build/torch28-cxx11-cu126-x86_64-linux/_rotary_1d9fc74.abi3.so
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:94d0a2fccecef02464217ce8fcd0029b858d20d38a0df9e4be56da8fe5196c95
+size 8190648
diff --git a/build/torch28-cxx11-cu126-x86_64-linux/metadata.json b/build/torch28-cxx11-cu126-x86_64-linux/metadata.json
new file mode 100644
index 0000000000000000000000000000000000000000..9cf5deed9898dce769f4cc73913d3530b92a0bd8
--- /dev/null
+++ b/build/torch28-cxx11-cu126-x86_64-linux/metadata.json
@@ -0,0 +1,4 @@
+{
+  "version": 1,
+  "python-depends": []
+}
\ No newline at end of file
diff --git a/build/torch28-cxx11-cu126-x86_64-linux/rotary/__init__.py b/build/torch28-cxx11-cu126-x86_64-linux/rotary/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..03dbc1afe1cf156661a2b1b22003cd5f599a0309
--- /dev/null
+++ b/build/torch28-cxx11-cu126-x86_64-linux/rotary/__init__.py
@@ -0,0 +1,26 @@
+import ctypes
+import sys
+
+import importlib
+from pathlib import Path
+from types import ModuleType
+
+def _import_from_path(file_path: Path) -> ModuleType:
+    """
+    Load the Python file at ``file_path`` as a module with a path-derived name.
+
+    The module is registered in ``sys.modules`` under the hex-encoded hash of
+    the absolute path, executed, and returned. If executing the file raises,
+    the registration is removed again and the exception propagates.
+
+    Raises:
+        ImportError: if no import spec, or no loader, can be obtained for
+            ``file_path``.
+    """
+    # ``import importlib`` alone does not guarantee that the ``importlib.util``
+    # submodule has been loaded, so import it explicitly before using it.
+    import importlib.util
+
+    # We cannot use the module name as-is, after adding it to `sys.modules`,
+    # it would also be used for other imports. So, we make a module name that
+    # depends on the path for it to be unique using the hex-encoded hash of
+    # the path.
+    path_hash = "{:x}".format(ctypes.c_size_t(hash(file_path.absolute())).value)
+    module_name = path_hash
+    spec = importlib.util.spec_from_file_location(module_name, file_path)
+    if spec is None or spec.loader is None:
+        raise ImportError(f"Cannot load spec for {module_name} from {file_path}")
+    module = importlib.util.module_from_spec(spec)
+    # Register the module before executing it, as in the importlib recipe for
+    # importing a source file directly.
+    sys.modules[module_name] = module
+    try:
+        spec.loader.exec_module(module)
+    except BaseException:
+        # Do not leave a partially initialised module registered.
+        sys.modules.pop(module_name, None)
+        raise
+    return module
+
+
+# Load the build's top-level ``__init__.py`` (one directory above this package)
+# and copy every entry of its module namespace into this module.
+# NOTE(review): ``vars(...)`` also contains that module's dunder attributes
+# (``__name__``, ``__file__``, ``__spec__``, ...), so those overwrite this
+# module's own values as well -- confirm this is intended.
+globals().update(vars(_import_from_path(Path(__file__).parent.parent / "__init__.py")))
diff --git a/build/torch28-cxx11-cu128-aarch64-linux/rotary/__init__.py b/build/torch28-cxx11-cu128-aarch64-linux/rotary/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..eba8039e210c8b710c5c663ef4e7930757f271be
--- /dev/null
+++ b/build/torch28-cxx11-cu128-aarch64-linux/rotary/__init__.py
@@ -0,0 +1,19 @@
+from typing import Tuple
+import torch
+
+from ._ops import ops
+
+
+def apply_rotary(
+    x1: torch.Tensor,
+    x2: torch.Tensor,
+    cos: torch.Tensor,
+    sin: torch.Tensor,
+    out1: torch.Tensor,
+    out2: torch.Tensor,
+    conj: bool,
+):
+    """
+    Call the compiled ``apply_rotary`` op exposed through ``ops``.
+
+    The seven arguments are handed to the op positionally, in the order they
+    are declared here. This wrapper has no return statement, so callers get
+    ``None`` back.
+
+    NOTE(review): presumably the op writes its result into ``out1``/``out2``
+    and ``conj`` selects the conjugate rotation -- confirm against the kernel
+    source, which is not visible from this file.
+    """
+    kernel_args = (x1, x2, cos, sin, out1, out2, conj)
+    ops.apply_rotary(*kernel_args)
+
+
+__all__ = ["apply_rotary"]
diff --git a/build/torch28-cxx11-cu128-aarch64-linux/rotary/__pycache__/__init__.cpython-313.pyc b/build/torch28-cxx11-cu128-aarch64-linux/rotary/__pycache__/__init__.cpython-313.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..6caf9406147987852b246c94b0370fbe237a919c
Binary files /dev/null and b/build/torch28-cxx11-cu128-aarch64-linux/rotary/__pycache__/__init__.cpython-313.pyc differ
diff --git a/build/torch28-cxx11-cu128-aarch64-linux/rotary/__pycache__/_ops.cpython-313.pyc b/build/torch28-cxx11-cu128-aarch64-linux/rotary/__pycache__/_ops.cpython-313.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..8e61932f2c6d305e84aa29f173031a65312e8e4b
Binary files /dev/null and b/build/torch28-cxx11-cu128-aarch64-linux/rotary/__pycache__/_ops.cpython-313.pyc differ
diff --git a/build/torch28-cxx11-cu128-aarch64-linux/rotary/_ops.py b/build/torch28-cxx11-cu128-aarch64-linux/rotary/_ops.py
new file mode 100644
index 0000000000000000000000000000000000000000..9506529a11d919c18f69068b69a3ed69a630bace
--- /dev/null
+++ b/build/torch28-cxx11-cu128-aarch64-linux/rotary/_ops.py
@@ -0,0 +1,9 @@
+import torch
+from . import _rotary_005dcc7_dirty
+ops = torch.ops._rotary_005dcc7_dirty
+
+def add_op_namespace_prefix(op_name: str):
+    """
+    Return ``op_name`` qualified with this build's op namespace.
+
+    The result has the form ``_rotary_005dcc7_dirty::<op_name>``.
+    """
+    namespace = "_rotary_005dcc7_dirty"
+    return f"{namespace}::{op_name}"
\ No newline at end of file
diff --git a/build/torch28-cxx11-cu128-aarch64-linux/rotary/_rotary_005dcc7_dirty.abi3.so b/build/torch28-cxx11-cu128-aarch64-linux/rotary/_rotary_005dcc7_dirty.abi3.so
new file mode 100755
index 0000000000000000000000000000000000000000..0b8b58a9c83ba9a0795bff49d90a348cd5576d02
--- /dev/null
+++ b/build/torch28-cxx11-cu128-aarch64-linux/rotary/_rotary_005dcc7_dirty.abi3.so
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:1789291b4f77fa26b1ed011d3dc768265cff6fe38bad6559a277c6eddabe7f4a
+size 10247136
diff --git a/build/torch28-cxx11-cu128-x86_64-linux/__init__.py b/build/torch28-cxx11-cu128-x86_64-linux/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..a32e6a58cb685314795328dccadba33e87eaee6f
--- /dev/null
+++ b/build/torch28-cxx11-cu128-x86_64-linux/__init__.py
@@ -0,0 +1,52 @@
+from typing import Optional, Tuple
+import torch
+
+from ._ops import ops
+
+
+def apply_rotary(
+    x1: torch.Tensor,
+    x2: torch.Tensor,
+    cos: torch.Tensor,
+    sin: torch.Tensor,
+    out1: torch.Tensor,
+    out2: torch.Tensor,
+    conj: bool,
+) -> None:
+    """
+    Call the compiled ``apply_rotary`` op exposed through ``ops``.
+
+    The seven arguments are handed to the op positionally, in the order they
+    are declared here. Nothing is returned to the caller.
+
+    NOTE(review): presumably the op writes its result into ``out1``/``out2``
+    and ``conj`` selects the conjugate rotation -- confirm against the kernel
+    source, which is not visible from this file.
+    """
+    kernel_args = (x1, x2, cos, sin, out1, out2, conj)
+    ops.apply_rotary(*kernel_args)
+
+
+def apply_rotary_transformers(
+    q: torch.Tensor,
+    k: torch.Tensor,
+    cos: torch.Tensor,
+    sin: torch.Tensor,
+    unsqueeze_dim: int = 1,
+) -> Tuple[torch.Tensor, torch.Tensor]:
+    """
+    Run the rotary kernel on copies of ``q`` and ``k``.
+
+    Wrapper intended to match the signature of transformers'
+    ``apply_rotary_pos_emb``.
+
+    Args:
+        q: Query tensor; it is cloned and the clone is handed to the kernel.
+        k: Key tensor; cloned in the same way.
+        cos: Cosine table; a new axis is inserted at ``unsqueeze_dim``.
+        sin: Sine table; a new axis is inserted at ``unsqueeze_dim``.
+        unsqueeze_dim: Position of the axis inserted into ``cos`` and ``sin``.
+
+    Returns:
+        The clones of ``q`` and ``k``, in that order, after the kernel calls.
+    """
+    cos = cos.unsqueeze(unsqueeze_dim)
+    sin = sin.unsqueeze(unsqueeze_dim)
+
+    # Only clones are handed to the op; ``q`` and ``k`` themselves never are.
+    rotated = (q.clone(), k.clone())
+
+    # Split point of the last axis; the query's width is used for both tensors.
+    half_dim = q.shape[-1] // 2
+    halves = [(t[..., :half_dim], t[..., half_dim:]) for t in rotated]
+
+    if cos.shape[-1] != half_dim:
+        # Keep only the leading ``half_dim`` entries of both tables.
+        cos, sin = cos[..., :half_dim], sin[..., :half_dim]
+
+    # Each slice of a clone is passed as both input and output.
+    # NOTE(review): presumably the op stores its result in the output
+    # arguments, which is what makes the returned clones rotated -- confirm.
+    for first, second in halves:
+        apply_rotary(first, second, cos, sin, first, second, False)
+    return rotated
+
+
+__all__ = ["apply_rotary", "apply_rotary_transformers"]
diff --git a/build/torch28-cxx11-cu128-x86_64-linux/_ops.py b/build/torch28-cxx11-cu128-x86_64-linux/_ops.py
new file mode 100644
index 0000000000000000000000000000000000000000..2671c5d7b93b8ebd836715b40fea0fd4fbbffc1e
--- /dev/null
+++ b/build/torch28-cxx11-cu128-x86_64-linux/_ops.py
@@ -0,0 +1,9 @@
+import torch
+from . import _rotary_1d9fc74
+ops = torch.ops._rotary_1d9fc74
+
+def add_op_namespace_prefix(op_name: str):
+    """
+    Return ``op_name`` qualified with this build's op namespace.
+
+    The result has the form ``_rotary_1d9fc74::<op_name>``.
+    """
+    namespace = "_rotary_1d9fc74"
+    return f"{namespace}::{op_name}"
\ No newline at end of file
diff --git a/build/torch28-cxx11-cu128-x86_64-linux/_rotary_1d9fc74.abi3.so b/build/torch28-cxx11-cu128-x86_64-linux/_rotary_1d9fc74.abi3.so
new file mode 100644
index 0000000000000000000000000000000000000000..cd324605e809580f530a2a8a0cd5ee1844648f5c
--- /dev/null
+++ b/build/torch28-cxx11-cu128-x86_64-linux/_rotary_1d9fc74.abi3.so
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:95350793e833b22a67c9569eb3936925cc128a13b4e2409ac2d23112519baf1c
+size 11895768
diff --git a/build/torch28-cxx11-cu128-x86_64-linux/metadata.json b/build/torch28-cxx11-cu128-x86_64-linux/metadata.json
new file mode 100644
index 0000000000000000000000000000000000000000..9cf5deed9898dce769f4cc73913d3530b92a0bd8
--- /dev/null
+++ b/build/torch28-cxx11-cu128-x86_64-linux/metadata.json
@@ -0,0 +1,4 @@
+{
+  "version": 1,
+  "python-depends": []
+}
\ No newline at end of file
diff --git a/build/torch28-cxx11-cu128-x86_64-linux/rotary/__init__.py b/build/torch28-cxx11-cu128-x86_64-linux/rotary/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..03dbc1afe1cf156661a2b1b22003cd5f599a0309
--- /dev/null
+++ b/build/torch28-cxx11-cu128-x86_64-linux/rotary/__init__.py
@@ -0,0 +1,26 @@
+import ctypes
+import sys
+
+import importlib
+from pathlib import Path
+from types import ModuleType
+
+def _import_from_path(file_path: Path) -> ModuleType:
+    """
+    Load the Python file at ``file_path`` as a module with a path-derived name.
+
+    The module is registered in ``sys.modules`` under the hex-encoded hash of
+    the absolute path, executed, and returned. If executing the file raises,
+    the registration is removed again and the exception propagates.
+
+    Raises:
+        ImportError: if no import spec, or no loader, can be obtained for
+            ``file_path``.
+    """
+    # ``import importlib`` alone does not guarantee that the ``importlib.util``
+    # submodule has been loaded, so import it explicitly before using it.
+    import importlib.util
+
+    # We cannot use the module name as-is, after adding it to `sys.modules`,
+    # it would also be used for other imports. So, we make a module name that
+    # depends on the path for it to be unique using the hex-encoded hash of
+    # the path.
+    path_hash = "{:x}".format(ctypes.c_size_t(hash(file_path.absolute())).value)
+    module_name = path_hash
+    spec = importlib.util.spec_from_file_location(module_name, file_path)
+    if spec is None or spec.loader is None:
+        raise ImportError(f"Cannot load spec for {module_name} from {file_path}")
+    module = importlib.util.module_from_spec(spec)
+    # Register the module before executing it, as in the importlib recipe for
+    # importing a source file directly.
+    sys.modules[module_name] = module
+    try:
+        spec.loader.exec_module(module)
+    except BaseException:
+        # Do not leave a partially initialised module registered.
+        sys.modules.pop(module_name, None)
+        raise
+    return module
+
+
+# Load the build's top-level ``__init__.py`` (one directory above this package)
+# and copy every entry of its module namespace into this module.
+# NOTE(review): ``vars(...)`` also contains that module's dunder attributes
+# (``__name__``, ``__file__``, ``__spec__``, ...), so those overwrite this
+# module's own values as well -- confirm this is intended.
+globals().update(vars(_import_from_path(Path(__file__).parent.parent / "__init__.py")))
diff --git a/build/torch28-cxx11-cu129-aarch64-linux/rotary/__init__.py b/build/torch28-cxx11-cu129-aarch64-linux/rotary/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..eba8039e210c8b710c5c663ef4e7930757f271be
--- /dev/null
+++ b/build/torch28-cxx11-cu129-aarch64-linux/rotary/__init__.py
@@ -0,0 +1,19 @@
+from typing import Tuple
+import torch
+
+from ._ops import ops
+
+
+def apply_rotary(
+    x1: torch.Tensor,
+    x2: torch.Tensor,
+    cos: torch.Tensor,
+    sin: torch.Tensor,
+    out1: torch.Tensor,
+    out2: torch.Tensor,
+    conj: bool,
+):
+    """
+    Call the compiled ``apply_rotary`` op exposed through ``ops``.
+
+    The seven arguments are handed to the op positionally, in the order they
+    are declared here. This wrapper has no return statement, so callers get
+    ``None`` back.
+
+    NOTE(review): presumably the op writes its result into ``out1``/``out2``
+    and ``conj`` selects the conjugate rotation -- confirm against the kernel
+    source, which is not visible from this file.
+    """
+    kernel_args = (x1, x2, cos, sin, out1, out2, conj)
+    ops.apply_rotary(*kernel_args)
+
+
+__all__ = ["apply_rotary"]
diff --git a/build/torch28-cxx11-cu129-aarch64-linux/rotary/__pycache__/__init__.cpython-313.pyc b/build/torch28-cxx11-cu129-aarch64-linux/rotary/__pycache__/__init__.cpython-313.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..a3f633a77315c00e7ab7ec26434af98ba2813c6b
Binary files /dev/null and b/build/torch28-cxx11-cu129-aarch64-linux/rotary/__pycache__/__init__.cpython-313.pyc differ
diff --git a/build/torch28-cxx11-cu129-aarch64-linux/rotary/__pycache__/_ops.cpython-313.pyc b/build/torch28-cxx11-cu129-aarch64-linux/rotary/__pycache__/_ops.cpython-313.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..7adf912877cb1f1d05201e5643c964bfb03ca111
Binary files /dev/null and b/build/torch28-cxx11-cu129-aarch64-linux/rotary/__pycache__/_ops.cpython-313.pyc differ
diff --git a/build/torch28-cxx11-cu129-aarch64-linux/rotary/_ops.py b/build/torch28-cxx11-cu129-aarch64-linux/rotary/_ops.py
new file mode 100644
index 0000000000000000000000000000000000000000..1a9cec319f8e2b3f08afbe538960d08c34a6b08e
--- /dev/null
+++ b/build/torch28-cxx11-cu129-aarch64-linux/rotary/_ops.py
@@ -0,0 +1,9 @@
+import torch
+from . import _rotary_6abd2a8
+ops = torch.ops._rotary_6abd2a8
+
+def add_op_namespace_prefix(op_name: str):
+    """
+    Return ``op_name`` qualified with this build's op namespace.
+
+    The result has the form ``_rotary_6abd2a8::<op_name>``.
+    """
+    namespace = "_rotary_6abd2a8"
+    return f"{namespace}::{op_name}"
\ No newline at end of file
diff --git a/build/torch28-cxx11-cu129-aarch64-linux/rotary/_rotary_6abd2a8.abi3.so b/build/torch28-cxx11-cu129-aarch64-linux/rotary/_rotary_6abd2a8.abi3.so
new file mode 100644
index 0000000000000000000000000000000000000000..ed2f0d1965c5a08e3a11d2335d0d445a078a072f
--- /dev/null
+++ b/build/torch28-cxx11-cu129-aarch64-linux/rotary/_rotary_6abd2a8.abi3.so
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:9d97b6efbc29c35c6a13325e8e533a8479169a75f5ca915743616cd86344a962
+size 10705440
diff --git a/build/torch28-cxx11-cu129-x86_64-linux/__init__.py b/build/torch28-cxx11-cu129-x86_64-linux/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..a32e6a58cb685314795328dccadba33e87eaee6f
--- /dev/null
+++ b/build/torch28-cxx11-cu129-x86_64-linux/__init__.py
@@ -0,0 +1,52 @@
+from typing import Optional, Tuple
+import torch
+
+from ._ops import ops
+
+
+def apply_rotary(
+    x1: torch.Tensor,
+    x2: torch.Tensor,
+    cos: torch.Tensor,
+    sin: torch.Tensor,
+    out1: torch.Tensor,
+    out2: torch.Tensor,
+    conj: bool,
+) -> None:
+    """
+    Call the compiled ``apply_rotary`` op exposed through ``ops``.
+
+    The seven arguments are handed to the op positionally, in the order they
+    are declared here. Nothing is returned to the caller.
+
+    NOTE(review): presumably the op writes its result into ``out1``/``out2``
+    and ``conj`` selects the conjugate rotation -- confirm against the kernel
+    source, which is not visible from this file.
+    """
+    kernel_args = (x1, x2, cos, sin, out1, out2, conj)
+    ops.apply_rotary(*kernel_args)
+
+
+def apply_rotary_transformers(
+    q: torch.Tensor,
+    k: torch.Tensor,
+    cos: torch.Tensor,
+    sin: torch.Tensor,
+    unsqueeze_dim: int = 1,
+) -> Tuple[torch.Tensor, torch.Tensor]:
+    """
+    Run the rotary kernel on copies of ``q`` and ``k``.
+
+    Wrapper intended to match the signature of transformers'
+    ``apply_rotary_pos_emb``.
+
+    Args:
+        q: Query tensor; it is cloned and the clone is handed to the kernel.
+        k: Key tensor; cloned in the same way.
+        cos: Cosine table; a new axis is inserted at ``unsqueeze_dim``.
+        sin: Sine table; a new axis is inserted at ``unsqueeze_dim``.
+        unsqueeze_dim: Position of the axis inserted into ``cos`` and ``sin``.
+
+    Returns:
+        The clones of ``q`` and ``k``, in that order, after the kernel calls.
+    """
+    cos = cos.unsqueeze(unsqueeze_dim)
+    sin = sin.unsqueeze(unsqueeze_dim)
+
+    # Only clones are handed to the op; ``q`` and ``k`` themselves never are.
+    rotated = (q.clone(), k.clone())
+
+    # Split point of the last axis; the query's width is used for both tensors.
+    half_dim = q.shape[-1] // 2
+    halves = [(t[..., :half_dim], t[..., half_dim:]) for t in rotated]
+
+    if cos.shape[-1] != half_dim:
+        # Keep only the leading ``half_dim`` entries of both tables.
+        cos, sin = cos[..., :half_dim], sin[..., :half_dim]
+
+    # Each slice of a clone is passed as both input and output.
+    # NOTE(review): presumably the op stores its result in the output
+    # arguments, which is what makes the returned clones rotated -- confirm.
+    for first, second in halves:
+        apply_rotary(first, second, cos, sin, first, second, False)
+    return rotated
+
+
+__all__ = ["apply_rotary", "apply_rotary_transformers"]
diff --git a/build/torch28-cxx11-cu129-x86_64-linux/_ops.py b/build/torch28-cxx11-cu129-x86_64-linux/_ops.py
new file mode 100644
index 0000000000000000000000000000000000000000..2671c5d7b93b8ebd836715b40fea0fd4fbbffc1e
--- /dev/null
+++ b/build/torch28-cxx11-cu129-x86_64-linux/_ops.py
@@ -0,0 +1,9 @@
+import torch
+from . import _rotary_1d9fc74
+ops = torch.ops._rotary_1d9fc74
+
+def add_op_namespace_prefix(op_name: str):
+    """
+    Return ``op_name`` qualified with this build's op namespace.
+
+    The result has the form ``_rotary_1d9fc74::<op_name>``.
+    """
+    namespace = "_rotary_1d9fc74"
+    return f"{namespace}::{op_name}"
\ No newline at end of file
diff --git a/build/torch28-cxx11-cu129-x86_64-linux/_rotary_1d9fc74.abi3.so b/build/torch28-cxx11-cu129-x86_64-linux/_rotary_1d9fc74.abi3.so
new file mode 100644
index 0000000000000000000000000000000000000000..9f90c6de8ec076811de9f052543b1bd0ea6cde20
--- /dev/null
+++ b/build/torch28-cxx11-cu129-x86_64-linux/_rotary_1d9fc74.abi3.so
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:15a5dee3e2e653cdb92486fb7c546c5e3a94fd5291351f1847e04f7dcaa6402c
+size 11964056
diff --git a/build/torch28-cxx11-cu129-x86_64-linux/metadata.json b/build/torch28-cxx11-cu129-x86_64-linux/metadata.json
new file mode 100644
index 0000000000000000000000000000000000000000..9cf5deed9898dce769f4cc73913d3530b92a0bd8
--- /dev/null
+++ b/build/torch28-cxx11-cu129-x86_64-linux/metadata.json
@@ -0,0 +1,4 @@
+{
+  "version": 1,
+  "python-depends": []
+}
\ No newline at end of file
diff --git a/build/torch28-cxx11-cu129-x86_64-linux/rotary/__init__.py b/build/torch28-cxx11-cu129-x86_64-linux/rotary/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..03dbc1afe1cf156661a2b1b22003cd5f599a0309
--- /dev/null
+++ b/build/torch28-cxx11-cu129-x86_64-linux/rotary/__init__.py
@@ -0,0 +1,26 @@
+import ctypes
+import sys
+
+import importlib
+from pathlib import Path
+from types import ModuleType
+
+def _import_from_path(file_path: Path) -> ModuleType:
+    """
+    Load the Python file at ``file_path`` as a module with a path-derived name.
+
+    The module is registered in ``sys.modules`` under the hex-encoded hash of
+    the absolute path, executed, and returned. If executing the file raises,
+    the registration is removed again and the exception propagates.
+
+    Raises:
+        ImportError: if no import spec, or no loader, can be obtained for
+            ``file_path``.
+    """
+    # ``import importlib`` alone does not guarantee that the ``importlib.util``
+    # submodule has been loaded, so import it explicitly before using it.
+    import importlib.util
+
+    # We cannot use the module name as-is, after adding it to `sys.modules`,
+    # it would also be used for other imports. So, we make a module name that
+    # depends on the path for it to be unique using the hex-encoded hash of
+    # the path.
+    path_hash = "{:x}".format(ctypes.c_size_t(hash(file_path.absolute())).value)
+    module_name = path_hash
+    spec = importlib.util.spec_from_file_location(module_name, file_path)
+    if spec is None or spec.loader is None:
+        raise ImportError(f"Cannot load spec for {module_name} from {file_path}")
+    module = importlib.util.module_from_spec(spec)
+    # Register the module before executing it, as in the importlib recipe for
+    # importing a source file directly.
+    sys.modules[module_name] = module
+    try:
+        spec.loader.exec_module(module)
+    except BaseException:
+        # Do not leave a partially initialised module registered.
+        sys.modules.pop(module_name, None)
+        raise
+    return module
+
+
+# Load the build's top-level ``__init__.py`` (one directory above this package)
+# and copy every entry of its module namespace into this module.
+# NOTE(review): ``vars(...)`` also contains that module's dunder attributes
+# (``__name__``, ``__file__``, ``__spec__``, ...), so those overwrite this
+# module's own values as well -- confirm this is intended.
+globals().update(vars(_import_from_path(Path(__file__).parent.parent / "__init__.py")))
diff --git a/build/torch28-cxx11-xpu20251-x86_64-linux/__init__.py b/build/torch28-cxx11-xpu20251-x86_64-linux/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..a32e6a58cb685314795328dccadba33e87eaee6f
--- /dev/null
+++ b/build/torch28-cxx11-xpu20251-x86_64-linux/__init__.py
@@ -0,0 +1,52 @@
+from typing import Optional, Tuple
+import torch
+
+from ._ops import ops
+
+
+def apply_rotary(
+    x1: torch.Tensor,
+    x2: torch.Tensor,
+    cos: torch.Tensor,
+    sin: torch.Tensor,
+    out1: torch.Tensor,
+    out2: torch.Tensor,
+    conj: bool,
+) -> None:
+    """
+    Call the compiled ``apply_rotary`` op exposed through ``ops``.
+
+    The seven arguments are handed to the op positionally, in the order they
+    are declared here. Nothing is returned to the caller.
+
+    NOTE(review): presumably the op writes its result into ``out1``/``out2``
+    and ``conj`` selects the conjugate rotation -- confirm against the kernel
+    source, which is not visible from this file.
+    """
+    kernel_args = (x1, x2, cos, sin, out1, out2, conj)
+    ops.apply_rotary(*kernel_args)
+
+
+def apply_rotary_transformers(
+    q: torch.Tensor,
+    k: torch.Tensor,
+    cos: torch.Tensor,
+    sin: torch.Tensor,
+    unsqueeze_dim: int = 1,
+) -> Tuple[torch.Tensor, torch.Tensor]:
+    """
+    Run the rotary kernel on copies of ``q`` and ``k``.
+
+    Wrapper intended to match the signature of transformers'
+    ``apply_rotary_pos_emb``.
+
+    Args:
+        q: Query tensor; it is cloned and the clone is handed to the kernel.
+        k: Key tensor; cloned in the same way.
+        cos: Cosine table; a new axis is inserted at ``unsqueeze_dim``.
+        sin: Sine table; a new axis is inserted at ``unsqueeze_dim``.
+        unsqueeze_dim: Position of the axis inserted into ``cos`` and ``sin``.
+
+    Returns:
+        The clones of ``q`` and ``k``, in that order, after the kernel calls.
+    """
+    cos = cos.unsqueeze(unsqueeze_dim)
+    sin = sin.unsqueeze(unsqueeze_dim)
+
+    # Only clones are handed to the op; ``q`` and ``k`` themselves never are.
+    rotated = (q.clone(), k.clone())
+
+    # Split point of the last axis; the query's width is used for both tensors.
+    half_dim = q.shape[-1] // 2
+    halves = [(t[..., :half_dim], t[..., half_dim:]) for t in rotated]
+
+    if cos.shape[-1] != half_dim:
+        # Keep only the leading ``half_dim`` entries of both tables.
+        cos, sin = cos[..., :half_dim], sin[..., :half_dim]
+
+    # Each slice of a clone is passed as both input and output.
+    # NOTE(review): presumably the op stores its result in the output
+    # arguments, which is what makes the returned clones rotated -- confirm.
+    for first, second in halves:
+        apply_rotary(first, second, cos, sin, first, second, False)
+    return rotated
+
+
+__all__ = ["apply_rotary", "apply_rotary_transformers"]
diff --git a/build/torch28-cxx11-xpu20251-x86_64-linux/_ops.py b/build/torch28-cxx11-xpu20251-x86_64-linux/_ops.py
new file mode 100644
index 0000000000000000000000000000000000000000..2671c5d7b93b8ebd836715b40fea0fd4fbbffc1e
--- /dev/null
+++ b/build/torch28-cxx11-xpu20251-x86_64-linux/_ops.py
@@ -0,0 +1,9 @@
+import torch
+from . import _rotary_1d9fc74
+ops = torch.ops._rotary_1d9fc74
+
+def add_op_namespace_prefix(op_name: str):
+    """
+    Return ``op_name`` qualified with this build's op namespace.
+
+    The result has the form ``_rotary_1d9fc74::<op_name>``.
+    """
+    namespace = "_rotary_1d9fc74"
+    return f"{namespace}::{op_name}"
\ No newline at end of file
diff --git a/build/torch28-cxx11-xpu20251-x86_64-linux/_rotary_1d9fc74.abi3.so b/build/torch28-cxx11-xpu20251-x86_64-linux/_rotary_1d9fc74.abi3.so
new file mode 100644
index 0000000000000000000000000000000000000000..3f6f5a1ae5aebd47b44b8aa60715f2ddfa981fce
--- /dev/null
+++ b/build/torch28-cxx11-xpu20251-x86_64-linux/_rotary_1d9fc74.abi3.so
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:926df52f77f1250de78cdcf777f4c135dfc1a200c3726a4d98fdd1b83f3cbb07
+size 2338640
diff --git a/build/torch28-cxx11-xpu20251-x86_64-linux/metadata.json b/build/torch28-cxx11-xpu20251-x86_64-linux/metadata.json
new file mode 100644
index 0000000000000000000000000000000000000000..9cf5deed9898dce769f4cc73913d3530b92a0bd8
--- /dev/null
+++ b/build/torch28-cxx11-xpu20251-x86_64-linux/metadata.json
@@ -0,0 +1,4 @@
+{
+  "version": 1,
+  "python-depends": []
+}
\ No newline at end of file
diff --git a/build/torch28-cxx11-xpu20251-x86_64-linux/rotary/__init__.py b/build/torch28-cxx11-xpu20251-x86_64-linux/rotary/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..03dbc1afe1cf156661a2b1b22003cd5f599a0309
--- /dev/null
+++ b/build/torch28-cxx11-xpu20251-x86_64-linux/rotary/__init__.py
@@ -0,0 +1,26 @@
+import ctypes
+import sys
+
+import importlib
+from pathlib import Path
+from types import ModuleType
+
+def _import_from_path(file_path: Path) -> ModuleType:
+    """
+    Load the Python file at ``file_path`` as a module with a path-derived name.
+
+    The module is registered in ``sys.modules`` under the hex-encoded hash of
+    the absolute path, executed, and returned. If executing the file raises,
+    the registration is removed again and the exception propagates.
+
+    Raises:
+        ImportError: if no import spec, or no loader, can be obtained for
+            ``file_path``.
+    """
+    # ``import importlib`` alone does not guarantee that the ``importlib.util``
+    # submodule has been loaded, so import it explicitly before using it.
+    import importlib.util
+
+    # We cannot use the module name as-is, after adding it to `sys.modules`,
+    # it would also be used for other imports. So, we make a module name that
+    # depends on the path for it to be unique using the hex-encoded hash of
+    # the path.
+    path_hash = "{:x}".format(ctypes.c_size_t(hash(file_path.absolute())).value)
+    module_name = path_hash
+    spec = importlib.util.spec_from_file_location(module_name, file_path)
+    if spec is None or spec.loader is None:
+        raise ImportError(f"Cannot load spec for {module_name} from {file_path}")
+    module = importlib.util.module_from_spec(spec)
+    # Register the module before executing it, as in the importlib recipe for
+    # importing a source file directly.
+    sys.modules[module_name] = module
+    try:
+        spec.loader.exec_module(module)
+    except BaseException:
+        # Do not leave a partially initialised module registered.
+        sys.modules.pop(module_name, None)
+        raise
+    return module
+
+
+# Load the build's top-level ``__init__.py`` (one directory above this package)
+# and copy every entry of its module namespace into this module.
+# NOTE(review): ``vars(...)`` also contains that module's dunder attributes
+# (``__name__``, ``__file__``, ``__spec__``, ...), so those overwrite this
+# module's own values as well -- confirm this is intended.
+globals().update(vars(_import_from_path(Path(__file__).parent.parent / "__init__.py")))
diff --git a/build/torch29-cu130-x86_64-windows/rotary/__init__.py b/build/torch29-cu130-x86_64-windows/rotary/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..fe210ea3a2e6da63872100ee4bc1b749552b463d
--- /dev/null
+++ b/build/torch29-cu130-x86_64-windows/rotary/__init__.py
@@ -0,0 +1,53 @@
+from typing import Optional, Tuple
+import torch
+
+from ._ops import ops
+
+
+def apply_rotary(
+    x1: torch.Tensor, x2: torch.Tensor,
+    cos: torch.Tensor, sin: torch.Tensor,
+    out1: torch.Tensor, out2: torch.Tensor,
+    conj: bool,
+) -> None:
+    """Call the compiled ``apply_rotary`` op with the arguments passed through as-is."""
+    # Whatever the op returns is discarded; this wrapper itself returns None.
+    op_args = (x1, x2, cos, sin, out1, out2, conj)
+    ops.apply_rotary(*op_args)
+
+
+def apply_rotary_transformers(
+    q: torch.Tensor,
+    k: torch.Tensor,
+    cos: torch.Tensor,
+    sin: torch.Tensor,
+    position_ids: Optional[torch.Tensor] = None,
+    unsqueeze_dim: int = 1,
+) -> Tuple[torch.Tensor, torch.Tensor]:
+    """
+    Apply the rotary kernel to clones of ``q`` and ``k`` and return both clones.
+
+    The signature is meant to line up with transformers' ``apply_rotary_pos_emb``;
+    ``position_ids`` is accepted but never read. ``cos``/``sin`` get a singleton
+    axis inserted at ``unsqueeze_dim`` and are cut down to their first
+    ``q.shape[-1] // 2`` last-axis entries when the last axis has a different size.
+    """
+    cos = cos.unsqueeze(unsqueeze_dim)
+    sin = sin.unsqueeze(unsqueeze_dim)
+    q_out, k_out = q.clone(), k.clone()
+
+    split = q.shape[-1] // 2
+    if cos.shape[-1] != split:
+        # Keep only the leading ``split`` entries of the last axis.
+        cos, sin = cos[..., :split], sin[..., :split]
+
+    for rotated in (q_out, k_out):
+        # Both halves are views into the clone; each is handed to the kernel
+        # as an input and as the matching output.
+        first, second = rotated[..., :split], rotated[..., split:]
+        apply_rotary(first, second, cos, sin, first, second, False)
+
+    return q_out, k_out
+
+
+__all__ = ["apply_rotary", "apply_rotary_transformers"]
diff --git a/build/torch29-cu130-x86_64-windows/rotary/_ops.py b/build/torch29-cu130-x86_64-windows/rotary/_ops.py
new file mode 100644
index 0000000000000000000000000000000000000000..028a253cf4050404f40d8dadac35fc9c509aa98c
--- /dev/null
+++ b/build/torch29-cu130-x86_64-windows/rotary/_ops.py
@@ -0,0 +1,9 @@
+import torch
+from . import _rotary_a793e44
+ops = torch.ops._rotary_a793e44
+
+def add_op_namespace_prefix(op_name: str):
+    """Return ``op_name`` qualified as ``<namespace>::<op_name>`` for this build."""
+    # Same namespace string that `ops` is looked up under in this file.
+    qualified_name = f"_rotary_a793e44::{op_name}"
+    return qualified_name
\ No newline at end of file
diff --git a/build/torch29-cu130-x86_64-windows/rotary/_rotary_a793e44.pyd b/build/torch29-cu130-x86_64-windows/rotary/_rotary_a793e44.pyd
new file mode 100644
index 0000000000000000000000000000000000000000..3174a0d680b4c6f57a4c8d8521bb088c08c1aeb0
--- /dev/null
+++ b/build/torch29-cu130-x86_64-windows/rotary/_rotary_a793e44.pyd
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:606c6eb81894dc8197f73e0e71a5356f56c61c612e5f77ab5c3d7c351eab8d3a
+size 8007680
diff --git a/build/torch29-cxx11-cu126-aarch64-linux/__init__.py b/build/torch29-cxx11-cu126-aarch64-linux/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..a32e6a58cb685314795328dccadba33e87eaee6f
--- /dev/null
+++ b/build/torch29-cxx11-cu126-aarch64-linux/__init__.py
@@ -0,0 +1,52 @@
+from typing import Optional, Tuple
+import torch
+
+from ._ops import ops
+
+
+def apply_rotary(
+    x1: torch.Tensor, x2: torch.Tensor,
+    cos: torch.Tensor, sin: torch.Tensor,
+    out1: torch.Tensor, out2: torch.Tensor,
+    conj: bool,
+) -> None:
+    """Call the compiled ``apply_rotary`` op with the arguments passed through as-is."""
+    # Whatever the op returns is discarded; this wrapper itself returns None.
+    op_args = (x1, x2, cos, sin, out1, out2, conj)
+    ops.apply_rotary(*op_args)
+
+
+def apply_rotary_transformers(
+    q: torch.Tensor,
+    k: torch.Tensor,
+    cos: torch.Tensor,
+    sin: torch.Tensor,
+    unsqueeze_dim: int = 1,
+) -> Tuple[torch.Tensor, torch.Tensor]:
+    """
+    Apply the rotary kernel to clones of ``q`` and ``k`` and return both clones.
+
+    The signature is meant to line up with transformers' ``apply_rotary_pos_emb``.
+    ``cos``/``sin`` get a singleton axis inserted at ``unsqueeze_dim`` and are cut
+    down to their first ``q.shape[-1] // 2`` last-axis entries when the last axis
+    has a different size. ``q`` and ``k`` themselves are never passed to the kernel.
+    """
+    cos = cos.unsqueeze(unsqueeze_dim)
+    sin = sin.unsqueeze(unsqueeze_dim)
+    q_out, k_out = q.clone(), k.clone()
+
+    split = q.shape[-1] // 2
+    if cos.shape[-1] != split:
+        # Keep only the leading ``split`` entries of the last axis.
+        cos, sin = cos[..., :split], sin[..., :split]
+
+    for rotated in (q_out, k_out):
+        # Both halves are views into the clone; each is handed to the kernel
+        # as an input and as the matching output.
+        first, second = rotated[..., :split], rotated[..., split:]
+        apply_rotary(first, second, cos, sin, first, second, False)
+
+    return q_out, k_out
+
+
+__all__ = ["apply_rotary", "apply_rotary_transformers"]
diff --git a/build/torch29-cxx11-cu126-aarch64-linux/_ops.py b/build/torch29-cxx11-cu126-aarch64-linux/_ops.py
new file mode 100644
index 0000000000000000000000000000000000000000..0040359339944bb061af9ca88170e28934477a4d
--- /dev/null
+++ b/build/torch29-cxx11-cu126-aarch64-linux/_ops.py
@@ -0,0 +1,9 @@
+import torch
+from . import _rotary_cuda_4e81b67
+ops = torch.ops._rotary_cuda_4e81b67
+
+def add_op_namespace_prefix(op_name: str):
+    """Return ``op_name`` qualified as ``<namespace>::<op_name>`` for this build."""
+    # Same namespace string that `ops` is looked up under in this file.
+    qualified_name = f"_rotary_cuda_4e81b67::{op_name}"
+    return qualified_name
diff --git a/build/torch29-cxx11-cu126-aarch64-linux/_rotary_cuda_4e81b67.abi3.so b/build/torch29-cxx11-cu126-aarch64-linux/_rotary_cuda_4e81b67.abi3.so
new file mode 100644
index 0000000000000000000000000000000000000000..53c031fd4a92dbde7d393b494c94f87468fcaf84
--- /dev/null
+++ b/build/torch29-cxx11-cu126-aarch64-linux/_rotary_cuda_4e81b67.abi3.so
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:0dba12829865f9420696f7d05da5c56518fbb55d932e8acc1eb271be97ee1acf
+size 8280552
diff --git a/build/torch29-cxx11-cu126-aarch64-linux/metadata.json b/build/torch29-cxx11-cu126-aarch64-linux/metadata.json
new file mode 100644
index 0000000000000000000000000000000000000000..0dacb99125f1112a811819ca1ffdde15c8c0faff
--- /dev/null
+++ b/build/torch29-cxx11-cu126-aarch64-linux/metadata.json
@@ -0,0 +1,18 @@
+{
+  "version": 1,
+  "license": "BSD-3-Clause",
+  "python-depends": [],
+  "backend": {
+    "type": "cuda",
+    "archs": [
+      "7.0",
+      "7.2",
+      "7.5",
+      "8.0",
+      "8.6",
+      "8.7",
+      "8.9",
+      "9.0+PTX"
+    ]
+  }
+}
diff --git a/build/torch29-cxx11-cu126-aarch64-linux/rotary/__init__.py b/build/torch29-cxx11-cu126-aarch64-linux/rotary/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..03dbc1afe1cf156661a2b1b22003cd5f599a0309
--- /dev/null
+++ b/build/torch29-cxx11-cu126-aarch64-linux/rotary/__init__.py
@@ -0,0 +1,26 @@
+import ctypes
+import sys
+
+import importlib
+from pathlib import Path
+from types import ModuleType
+
+def _import_from_path(file_path: Path) -> ModuleType:
+    """Execute the Python file at ``file_path`` and return it as a module.
+
+    It is registered in ``sys.modules`` under the hex-encoded hash of its absolute
+    path so the name is unique. Raises ``ImportError`` if no spec/loader is found.
+    """
+    import importlib.util  # bare `import importlib` may leave `util` unbound
+
+    module_name = "{:x}".format(ctypes.c_size_t(hash(file_path.absolute())).value)
+    spec = importlib.util.spec_from_file_location(module_name, file_path)
+    if spec is None or spec.loader is None:
+        raise ImportError(f"Cannot load spec for {module_name} from {file_path}")
+    module = importlib.util.module_from_spec(spec)
+    sys.modules[module_name] = module
+    spec.loader.exec_module(module)
+    return module
+
+
+globals().update(vars(_import_from_path(Path(__file__).parent.parent / "__init__.py")))  # copy the namespace of the parent directory's __init__.py into this module
diff --git a/build/torch29-cxx11-cu126-x86_64-linux/__init__.py b/build/torch29-cxx11-cu126-x86_64-linux/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..a32e6a58cb685314795328dccadba33e87eaee6f
--- /dev/null
+++ b/build/torch29-cxx11-cu126-x86_64-linux/__init__.py
@@ -0,0 +1,52 @@
+from typing import Optional, Tuple
+import torch
+
+from ._ops import ops
+
+
+def apply_rotary(
+    x1: torch.Tensor, x2: torch.Tensor,
+    cos: torch.Tensor, sin: torch.Tensor,
+    out1: torch.Tensor, out2: torch.Tensor,
+    conj: bool,
+) -> None:
+    """Call the compiled ``apply_rotary`` op with the arguments passed through as-is."""
+    # Whatever the op returns is discarded; this wrapper itself returns None.
+    op_args = (x1, x2, cos, sin, out1, out2, conj)
+    ops.apply_rotary(*op_args)
+
+
+def apply_rotary_transformers(
+    q: torch.Tensor,
+    k: torch.Tensor,
+    cos: torch.Tensor,
+    sin: torch.Tensor,
+    unsqueeze_dim: int = 1,
+) -> Tuple[torch.Tensor, torch.Tensor]:
+    """
+    Apply the rotary kernel to clones of ``q`` and ``k`` and return both clones.
+
+    The signature is meant to line up with transformers' ``apply_rotary_pos_emb``.
+    ``cos``/``sin`` get a singleton axis inserted at ``unsqueeze_dim`` and are cut
+    down to their first ``q.shape[-1] // 2`` last-axis entries when the last axis
+    has a different size. ``q`` and ``k`` themselves are never passed to the kernel.
+    """
+    cos = cos.unsqueeze(unsqueeze_dim)
+    sin = sin.unsqueeze(unsqueeze_dim)
+    q_out, k_out = q.clone(), k.clone()
+
+    split = q.shape[-1] // 2
+    if cos.shape[-1] != split:
+        # Keep only the leading ``split`` entries of the last axis.
+        cos, sin = cos[..., :split], sin[..., :split]
+
+    for rotated in (q_out, k_out):
+        # Both halves are views into the clone; each is handed to the kernel
+        # as an input and as the matching output.
+        first, second = rotated[..., :split], rotated[..., split:]
+        apply_rotary(first, second, cos, sin, first, second, False)
+
+    return q_out, k_out
+
+
+__all__ = ["apply_rotary", "apply_rotary_transformers"]
diff --git a/build/torch29-cxx11-cu126-x86_64-linux/_ops.py b/build/torch29-cxx11-cu126-x86_64-linux/_ops.py
new file mode 100644
index 0000000000000000000000000000000000000000..0040359339944bb061af9ca88170e28934477a4d
--- /dev/null
+++ b/build/torch29-cxx11-cu126-x86_64-linux/_ops.py
@@ -0,0 +1,9 @@
+import torch
+from . import _rotary_cuda_4e81b67
+ops = torch.ops._rotary_cuda_4e81b67
+
+def add_op_namespace_prefix(op_name: str):
+    """Return ``op_name`` qualified as ``<namespace>::<op_name>`` for this build."""
+    # Same namespace string that `ops` is looked up under in this file.
+    qualified_name = f"_rotary_cuda_4e81b67::{op_name}"
+    return qualified_name
diff --git a/build/torch29-cxx11-cu126-x86_64-linux/_rotary_cuda_4e81b67.abi3.so b/build/torch29-cxx11-cu126-x86_64-linux/_rotary_cuda_4e81b67.abi3.so
new file mode 100644
index 0000000000000000000000000000000000000000..11be7d69f8e17bce8b5f4ae96d45f494e1a0dc8b
--- /dev/null
+++ b/build/torch29-cxx11-cu126-x86_64-linux/_rotary_cuda_4e81b67.abi3.so
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:05d9271f2c7650370cafbb527311eafea0ae8a39cfeb8fe12873fbc0a142588a
+size 8190552
diff --git a/build/torch29-cxx11-cu126-x86_64-linux/metadata.json b/build/torch29-cxx11-cu126-x86_64-linux/metadata.json
new file mode 100644
index 0000000000000000000000000000000000000000..0dacb99125f1112a811819ca1ffdde15c8c0faff
--- /dev/null
+++ b/build/torch29-cxx11-cu126-x86_64-linux/metadata.json
@@ -0,0 +1,18 @@
+{
+  "version": 1,
+  "license": "BSD-3-Clause",
+  "python-depends": [],
+  "backend": {
+    "type": "cuda",
+    "archs": [
+      "7.0",
+      "7.2",
+      "7.5",
+      "8.0",
+      "8.6",
+      "8.7",
+      "8.9",
+      "9.0+PTX"
+    ]
+  }
+}
diff --git a/build/torch29-cxx11-cu126-x86_64-linux/rotary/__init__.py b/build/torch29-cxx11-cu126-x86_64-linux/rotary/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..03dbc1afe1cf156661a2b1b22003cd5f599a0309
--- /dev/null
+++ b/build/torch29-cxx11-cu126-x86_64-linux/rotary/__init__.py
@@ -0,0 +1,26 @@
+import ctypes
+import sys
+
+import importlib
+from pathlib import Path
+from types import ModuleType
+
+def _import_from_path(file_path: Path) -> ModuleType:
+    """Execute the Python file at ``file_path`` and return it as a module.
+
+    It is registered in ``sys.modules`` under the hex-encoded hash of its absolute
+    path so the name is unique. Raises ``ImportError`` if no spec/loader is found.
+    """
+    import importlib.util  # bare `import importlib` may leave `util` unbound
+
+    module_name = "{:x}".format(ctypes.c_size_t(hash(file_path.absolute())).value)
+    spec = importlib.util.spec_from_file_location(module_name, file_path)
+    if spec is None or spec.loader is None:
+        raise ImportError(f"Cannot load spec for {module_name} from {file_path}")
+    module = importlib.util.module_from_spec(spec)
+    sys.modules[module_name] = module
+    spec.loader.exec_module(module)
+    return module
+
+
+globals().update(vars(_import_from_path(Path(__file__).parent.parent / "__init__.py")))  # copy the namespace of the parent directory's __init__.py into this module
diff --git a/build/torch29-cxx11-cu128-aarch64-linux/__init__.py b/build/torch29-cxx11-cu128-aarch64-linux/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..a32e6a58cb685314795328dccadba33e87eaee6f
--- /dev/null
+++ b/build/torch29-cxx11-cu128-aarch64-linux/__init__.py
@@ -0,0 +1,52 @@
+from typing import Optional, Tuple
+import torch
+
+from ._ops import ops
+
+
+def apply_rotary(
+    x1: torch.Tensor, x2: torch.Tensor,
+    cos: torch.Tensor, sin: torch.Tensor,
+    out1: torch.Tensor, out2: torch.Tensor,
+    conj: bool,
+) -> None:
+    """Call the compiled ``apply_rotary`` op with the arguments passed through as-is."""
+    # Whatever the op returns is discarded; this wrapper itself returns None.
+    op_args = (x1, x2, cos, sin, out1, out2, conj)
+    ops.apply_rotary(*op_args)
+
+
+def apply_rotary_transformers(
+    q: torch.Tensor,
+    k: torch.Tensor,
+    cos: torch.Tensor,
+    sin: torch.Tensor,
+    unsqueeze_dim: int = 1,
+) -> Tuple[torch.Tensor, torch.Tensor]:
+    """
+    Apply the rotary kernel to clones of ``q`` and ``k`` and return both clones.
+
+    The signature is meant to line up with transformers' ``apply_rotary_pos_emb``.
+    ``cos``/``sin`` get a singleton axis inserted at ``unsqueeze_dim`` and are cut
+    down to their first ``q.shape[-1] // 2`` last-axis entries when the last axis
+    has a different size. ``q`` and ``k`` themselves are never passed to the kernel.
+    """
+    cos = cos.unsqueeze(unsqueeze_dim)
+    sin = sin.unsqueeze(unsqueeze_dim)
+    q_out, k_out = q.clone(), k.clone()
+
+    split = q.shape[-1] // 2
+    if cos.shape[-1] != split:
+        # Keep only the leading ``split`` entries of the last axis.
+        cos, sin = cos[..., :split], sin[..., :split]
+
+    for rotated in (q_out, k_out):
+        # Both halves are views into the clone; each is handed to the kernel
+        # as an input and as the matching output.
+        first, second = rotated[..., :split], rotated[..., split:]
+        apply_rotary(first, second, cos, sin, first, second, False)
+
+    return q_out, k_out
+
+
+__all__ = ["apply_rotary", "apply_rotary_transformers"]
diff --git a/build/torch29-cxx11-cu128-aarch64-linux/_ops.py b/build/torch29-cxx11-cu128-aarch64-linux/_ops.py
new file mode 100644
index 0000000000000000000000000000000000000000..0040359339944bb061af9ca88170e28934477a4d
--- /dev/null
+++ b/build/torch29-cxx11-cu128-aarch64-linux/_ops.py
@@ -0,0 +1,9 @@
+import torch
+from . import _rotary_cuda_4e81b67
+ops = torch.ops._rotary_cuda_4e81b67
+
+def add_op_namespace_prefix(op_name: str):
+    """Return ``op_name`` qualified as ``<namespace>::<op_name>`` for this build."""
+    # Same namespace string that `ops` is looked up under in this file.
+    qualified_name = f"_rotary_cuda_4e81b67::{op_name}"
+    return qualified_name
diff --git a/build/torch29-cxx11-cu128-aarch64-linux/_rotary_cuda_4e81b67.abi3.so b/build/torch29-cxx11-cu128-aarch64-linux/_rotary_cuda_4e81b67.abi3.so
new file mode 100644
index 0000000000000000000000000000000000000000..0b7e1c3230822c71e1281ec7a8bc33f9bf0d871a
--- /dev/null
+++ b/build/torch29-cxx11-cu128-aarch64-linux/_rotary_cuda_4e81b67.abi3.so
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:24ba03d9bdd3fbe264d722c2e066493fc7ab72de5014bf3ee548fc0f86bb30d7
+size 12016752
diff --git a/build/torch29-cxx11-cu128-aarch64-linux/metadata.json b/build/torch29-cxx11-cu128-aarch64-linux/metadata.json
new file mode 100644
index 0000000000000000000000000000000000000000..a794c92436c3827ae79b48d55f7ea964afd50f52
--- /dev/null
+++ b/build/torch29-cxx11-cu128-aarch64-linux/metadata.json
@@ -0,0 +1,21 @@
+{
+  "version": 1,
+  "license": "BSD-3-Clause",
+  "python-depends": [],
+  "backend": {
+    "type": "cuda",
+    "archs": [
+      "10.0",
+      "10.1",
+      "12.0+PTX",
+      "7.0",
+      "7.2",
+      "7.5",
+      "8.0",
+      "8.6",
+      "8.7",
+      "8.9",
+      "9.0"
+    ]
+  }
+}
diff --git a/build/torch29-cxx11-cu128-aarch64-linux/rotary/__init__.py b/build/torch29-cxx11-cu128-aarch64-linux/rotary/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..03dbc1afe1cf156661a2b1b22003cd5f599a0309
--- /dev/null
+++ b/build/torch29-cxx11-cu128-aarch64-linux/rotary/__init__.py
@@ -0,0 +1,26 @@
+import ctypes
+import sys
+
+import importlib
+from pathlib import Path
+from types import ModuleType
+
+def _import_from_path(file_path: Path) -> ModuleType:
+    """Execute the Python file at ``file_path`` and return it as a module.
+
+    It is registered in ``sys.modules`` under the hex-encoded hash of its absolute
+    path so the name is unique. Raises ``ImportError`` if no spec/loader is found.
+    """
+    import importlib.util  # bare `import importlib` may leave `util` unbound
+
+    module_name = "{:x}".format(ctypes.c_size_t(hash(file_path.absolute())).value)
+    spec = importlib.util.spec_from_file_location(module_name, file_path)
+    if spec is None or spec.loader is None:
+        raise ImportError(f"Cannot load spec for {module_name} from {file_path}")
+    module = importlib.util.module_from_spec(spec)
+    sys.modules[module_name] = module
+    spec.loader.exec_module(module)
+    return module
+
+
+globals().update(vars(_import_from_path(Path(__file__).parent.parent / "__init__.py")))  # copy the namespace of the parent directory's __init__.py into this module
diff --git a/build/torch29-cxx11-cu128-x86_64-linux/__init__.py b/build/torch29-cxx11-cu128-x86_64-linux/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..a32e6a58cb685314795328dccadba33e87eaee6f
--- /dev/null
+++ b/build/torch29-cxx11-cu128-x86_64-linux/__init__.py
@@ -0,0 +1,52 @@
+from typing import Optional, Tuple
+import torch
+
+from ._ops import ops
+
+
+def apply_rotary(
+    x1: torch.Tensor, x2: torch.Tensor,
+    cos: torch.Tensor, sin: torch.Tensor,
+    out1: torch.Tensor, out2: torch.Tensor,
+    conj: bool,
+) -> None:
+    """Call the compiled ``apply_rotary`` op with the arguments passed through as-is."""
+    # Whatever the op returns is discarded; this wrapper itself returns None.
+    op_args = (x1, x2, cos, sin, out1, out2, conj)
+    ops.apply_rotary(*op_args)
+
+
+def apply_rotary_transformers(
+    q: torch.Tensor,
+    k: torch.Tensor,
+    cos: torch.Tensor,
+    sin: torch.Tensor,
+    unsqueeze_dim: int = 1,
+) -> Tuple[torch.Tensor, torch.Tensor]:
+    """
+    Apply the rotary kernel to clones of ``q`` and ``k`` and return both clones.
+
+    The signature is meant to line up with transformers' ``apply_rotary_pos_emb``.
+    ``cos``/``sin`` get a singleton axis inserted at ``unsqueeze_dim`` and are cut
+    down to their first ``q.shape[-1] // 2`` last-axis entries when the last axis
+    has a different size. ``q`` and ``k`` themselves are never passed to the kernel.
+    """
+    cos = cos.unsqueeze(unsqueeze_dim)
+    sin = sin.unsqueeze(unsqueeze_dim)
+    q_out, k_out = q.clone(), k.clone()
+
+    split = q.shape[-1] // 2
+    if cos.shape[-1] != split:
+        # Keep only the leading ``split`` entries of the last axis.
+        cos, sin = cos[..., :split], sin[..., :split]
+
+    for rotated in (q_out, k_out):
+        # Both halves are views into the clone; each is handed to the kernel
+        # as an input and as the matching output.
+        first, second = rotated[..., :split], rotated[..., split:]
+        apply_rotary(first, second, cos, sin, first, second, False)
+
+    return q_out, k_out
+
+
+__all__ = ["apply_rotary", "apply_rotary_transformers"]
diff --git a/build/torch29-cxx11-cu128-x86_64-linux/_ops.py b/build/torch29-cxx11-cu128-x86_64-linux/_ops.py
new file mode 100644
index 0000000000000000000000000000000000000000..0040359339944bb061af9ca88170e28934477a4d
--- /dev/null
+++ b/build/torch29-cxx11-cu128-x86_64-linux/_ops.py
@@ -0,0 +1,9 @@
+import torch
+from . import _rotary_cuda_4e81b67
+ops = torch.ops._rotary_cuda_4e81b67
+
+def add_op_namespace_prefix(op_name: str):
+    """Return ``op_name`` qualified as ``<namespace>::<op_name>`` for this build."""
+    # Same namespace string that `ops` is looked up under in this file.
+    qualified_name = f"_rotary_cuda_4e81b67::{op_name}"
+    return qualified_name
diff --git a/build/torch29-cxx11-cu128-x86_64-linux/_rotary_cuda_4e81b67.abi3.so b/build/torch29-cxx11-cu128-x86_64-linux/_rotary_cuda_4e81b67.abi3.so
new file mode 100644
index 0000000000000000000000000000000000000000..7fc5c476a2550473618d47acf930aad93311b087
--- /dev/null
+++ b/build/torch29-cxx11-cu128-x86_64-linux/_rotary_cuda_4e81b67.abi3.so
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:34033788a41442c5ce7fd43688c3efb3357f068b612b2617652037e60765e3a1
+size 11899984
diff --git a/build/torch29-cxx11-cu128-x86_64-linux/metadata.json b/build/torch29-cxx11-cu128-x86_64-linux/metadata.json
new file mode 100644
index 0000000000000000000000000000000000000000..a794c92436c3827ae79b48d55f7ea964afd50f52
--- /dev/null
+++ b/build/torch29-cxx11-cu128-x86_64-linux/metadata.json
@@ -0,0 +1,21 @@
+{
+  "version": 1,
+  "license": "BSD-3-Clause",
+  "python-depends": [],
+  "backend": {
+    "type": "cuda",
+    "archs": [
+      "10.0",
+      "10.1",
+      "12.0+PTX",
+      "7.0",
+      "7.2",
+      "7.5",
+      "8.0",
+      "8.6",
+      "8.7",
+      "8.9",
+      "9.0"
+    ]
+  }
+}
diff --git a/build/torch29-cxx11-cu128-x86_64-linux/rotary/__init__.py b/build/torch29-cxx11-cu128-x86_64-linux/rotary/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..03dbc1afe1cf156661a2b1b22003cd5f599a0309
--- /dev/null
+++ b/build/torch29-cxx11-cu128-x86_64-linux/rotary/__init__.py
@@ -0,0 +1,26 @@
+import ctypes
+import sys
+
+import importlib
+from pathlib import Path
+from types import ModuleType
+
+def _import_from_path(file_path: Path) -> ModuleType:
+    """Execute the Python file at ``file_path`` and return it as a module.
+
+    It is registered in ``sys.modules`` under the hex-encoded hash of its absolute
+    path so the name is unique. Raises ``ImportError`` if no spec/loader is found.
+    """
+    import importlib.util  # bare `import importlib` may leave `util` unbound
+
+    module_name = "{:x}".format(ctypes.c_size_t(hash(file_path.absolute())).value)
+    spec = importlib.util.spec_from_file_location(module_name, file_path)
+    if spec is None or spec.loader is None:
+        raise ImportError(f"Cannot load spec for {module_name} from {file_path}")
+    module = importlib.util.module_from_spec(spec)
+    sys.modules[module_name] = module
+    spec.loader.exec_module(module)
+    return module
+
+
+globals().update(vars(_import_from_path(Path(__file__).parent.parent / "__init__.py")))  # copy the namespace of the parent directory's __init__.py into this module
diff --git a/build/torch29-cxx11-cu129-aarch64-linux/__init__.py b/build/torch29-cxx11-cu129-aarch64-linux/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..a32e6a58cb685314795328dccadba33e87eaee6f
--- /dev/null
+++ b/build/torch29-cxx11-cu129-aarch64-linux/__init__.py
@@ -0,0 +1,52 @@
+from typing import Optional, Tuple
+import torch
+
+from ._ops import ops
+
+
+def apply_rotary(
+    x1: torch.Tensor, x2: torch.Tensor,
+    cos: torch.Tensor, sin: torch.Tensor,
+    out1: torch.Tensor, out2: torch.Tensor,
+    conj: bool,
+) -> None:
+    """Call the compiled ``apply_rotary`` op with the arguments passed through as-is."""
+    # Whatever the op returns is discarded; this wrapper itself returns None.
+    op_args = (x1, x2, cos, sin, out1, out2, conj)
+    ops.apply_rotary(*op_args)
+
+
+def apply_rotary_transformers(
+    q: torch.Tensor,
+    k: torch.Tensor,
+    cos: torch.Tensor,
+    sin: torch.Tensor,
+    unsqueeze_dim: int = 1,
+) -> Tuple[torch.Tensor, torch.Tensor]:
+    """
+    Apply the rotary kernel to clones of ``q`` and ``k`` and return both clones.
+
+    The signature is meant to line up with transformers' ``apply_rotary_pos_emb``.
+    ``cos``/``sin`` get a singleton axis inserted at ``unsqueeze_dim`` and are cut
+    down to their first ``q.shape[-1] // 2`` last-axis entries when the last axis
+    has a different size. ``q`` and ``k`` themselves are never passed to the kernel.
+    """
+    cos = cos.unsqueeze(unsqueeze_dim)
+    sin = sin.unsqueeze(unsqueeze_dim)
+    q_out, k_out = q.clone(), k.clone()
+
+    split = q.shape[-1] // 2
+    if cos.shape[-1] != split:
+        # Keep only the leading ``split`` entries of the last axis.
+        cos, sin = cos[..., :split], sin[..., :split]
+
+    for rotated in (q_out, k_out):
+        # Both halves are views into the clone; each is handed to the kernel
+        # as an input and as the matching output.
+        first, second = rotated[..., :split], rotated[..., split:]
+        apply_rotary(first, second, cos, sin, first, second, False)
+
+    return q_out, k_out
+
+
+__all__ = ["apply_rotary", "apply_rotary_transformers"]
diff --git a/build/torch29-cxx11-cu129-aarch64-linux/_ops.py b/build/torch29-cxx11-cu129-aarch64-linux/_ops.py
new file mode 100644
index 0000000000000000000000000000000000000000..d09f2f0956a472d57a4bb833d515b40d124f276f
--- /dev/null
+++ b/build/torch29-cxx11-cu129-aarch64-linux/_ops.py
@@ -0,0 +1,9 @@
+import torch
+from . import _rotary_cuda_2022aa6
+ops = torch.ops._rotary_cuda_2022aa6
+
+def add_op_namespace_prefix(op_name: str):
+    """Return ``op_name`` qualified as ``<namespace>::<op_name>`` for this build."""
+    # Same namespace string that `ops` is looked up under in this file.
+    qualified_name = f"_rotary_cuda_2022aa6::{op_name}"
+    return qualified_name
diff --git a/build/torch29-cxx11-cu129-aarch64-linux/_rotary_cuda_2022aa6.abi3.so b/build/torch29-cxx11-cu129-aarch64-linux/_rotary_cuda_2022aa6.abi3.so
new file mode 100644
index 0000000000000000000000000000000000000000..79373c7c4f5fe10983232886b6677a3cd1981703
--- /dev/null
+++ b/build/torch29-cxx11-cu129-aarch64-linux/_rotary_cuda_2022aa6.abi3.so
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:7e9bd5da495de58a9054535da6724fe2ac98a81b9746c6a0455195fa5eea42a2
+size 12081848
diff --git a/build/torch29-cxx11-cu129-aarch64-linux/metadata.json b/build/torch29-cxx11-cu129-aarch64-linux/metadata.json
new file mode 100644
index 0000000000000000000000000000000000000000..a794c92436c3827ae79b48d55f7ea964afd50f52
--- /dev/null
+++ b/build/torch29-cxx11-cu129-aarch64-linux/metadata.json
@@ -0,0 +1,21 @@
+{
+  "version": 1,
+  "license": "BSD-3-Clause",
+  "python-depends": [],
+  "backend": {
+    "type": "cuda",
+    "archs": [
+      "10.0",
+      "10.1",
+      "12.0+PTX",
+      "7.0",
+      "7.2",
+      "7.5",
+      "8.0",
+      "8.6",
+      "8.7",
+      "8.9",
+      "9.0"
+    ]
+  }
+}
diff --git a/build/torch29-cxx11-cu129-aarch64-linux/rotary/__init__.py b/build/torch29-cxx11-cu129-aarch64-linux/rotary/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..a9b2672c1cd85b74c1b3ded0fc0b2100e1aeac23
--- /dev/null
+++ b/build/torch29-cxx11-cu129-aarch64-linux/rotary/__init__.py
@@ -0,0 +1,26 @@
+import ctypes
+import importlib.util
+import sys
+from pathlib import Path
+from types import ModuleType
+
+
+def _import_from_path(file_path: Path) -> ModuleType:
+    """Execute the Python file at ``file_path`` and return it as a module.
+
+    It is registered in ``sys.modules`` under the hex-encoded hash of its absolute
+    path so the name is unique. Raises ``ImportError`` if no spec/loader is found.
+    """
+    module_name = "{:x}".format(ctypes.c_size_t(hash(file_path.absolute())).value)
+    spec = importlib.util.spec_from_file_location(module_name, file_path)
+    # A spec without a loader cannot be executed, so treat it like a missing spec.
+    if spec is None or spec.loader is None:
+        raise ImportError(f"Cannot load spec for {module_name} from {file_path}")
+    # module_from_spec() returns a module or raises; it never yields None.
+    module = importlib.util.module_from_spec(spec)
+    sys.modules[module_name] = module
+    spec.loader.exec_module(module)
+    return module
+
+
+globals().update(vars(_import_from_path(Path(__file__).parent.parent / "__init__.py")))  # copy the namespace of the parent directory's __init__.py into this module
diff --git a/build/torch29-cxx11-cu129-x86_64-linux/__init__.py b/build/torch29-cxx11-cu129-x86_64-linux/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..a32e6a58cb685314795328dccadba33e87eaee6f
--- /dev/null
+++ b/build/torch29-cxx11-cu129-x86_64-linux/__init__.py
@@ -0,0 +1,52 @@
+from typing import Optional, Tuple
+import torch
+
+from ._ops import ops
+
+
+def apply_rotary(
+    x1: torch.Tensor, x2: torch.Tensor,
+    cos: torch.Tensor, sin: torch.Tensor,
+    out1: torch.Tensor, out2: torch.Tensor,
+    conj: bool,
+) -> None:
+    """Call the compiled ``apply_rotary`` op with the arguments passed through as-is."""
+    # Whatever the op returns is discarded; this wrapper itself returns None.
+    op_args = (x1, x2, cos, sin, out1, out2, conj)
+    ops.apply_rotary(*op_args)
+
+
+def apply_rotary_transformers(
+    q: torch.Tensor,
+    k: torch.Tensor,
+    cos: torch.Tensor,
+    sin: torch.Tensor,
+    unsqueeze_dim: int = 1,
+) -> Tuple[torch.Tensor, torch.Tensor]:
+    """
+    Apply the rotary kernel to clones of ``q`` and ``k`` and return both clones.
+
+    The signature is meant to line up with transformers' ``apply_rotary_pos_emb``.
+    ``cos``/``sin`` get a singleton axis inserted at ``unsqueeze_dim`` and are cut
+    down to their first ``q.shape[-1] // 2`` last-axis entries when the last axis
+    has a different size. ``q`` and ``k`` themselves are never passed to the kernel.
+    """
+    cos = cos.unsqueeze(unsqueeze_dim)
+    sin = sin.unsqueeze(unsqueeze_dim)
+    q_out, k_out = q.clone(), k.clone()
+
+    split = q.shape[-1] // 2
+    if cos.shape[-1] != split:
+        # Keep only the leading ``split`` entries of the last axis.
+        cos, sin = cos[..., :split], sin[..., :split]
+
+    for rotated in (q_out, k_out):
+        # Both halves are views into the clone; each is handed to the kernel
+        # as an input and as the matching output.
+        first, second = rotated[..., :split], rotated[..., split:]
+        apply_rotary(first, second, cos, sin, first, second, False)
+
+    return q_out, k_out
+
+
+__all__ = ["apply_rotary", "apply_rotary_transformers"]
diff --git a/build/torch29-cxx11-cu129-x86_64-linux/_ops.py b/build/torch29-cxx11-cu129-x86_64-linux/_ops.py
new file mode 100644
index 0000000000000000000000000000000000000000..d09f2f0956a472d57a4bb833d515b40d124f276f
--- /dev/null
+++ b/build/torch29-cxx11-cu129-x86_64-linux/_ops.py
@@ -0,0 +1,9 @@
+import torch
+from . import _rotary_cuda_2022aa6
+ops = torch.ops._rotary_cuda_2022aa6
+
+def add_op_namespace_prefix(op_name: str):
+    """Return ``op_name`` qualified as ``<namespace>::<op_name>`` for this build."""
+    # Same namespace string that `ops` is looked up under in this file.
+    qualified_name = f"_rotary_cuda_2022aa6::{op_name}"
+    return qualified_name
diff --git a/build/torch29-cxx11-cu129-x86_64-linux/_rotary_cuda_2022aa6.abi3.so b/build/torch29-cxx11-cu129-x86_64-linux/_rotary_cuda_2022aa6.abi3.so
new file mode 100644
index 0000000000000000000000000000000000000000..bff12f8743608a39b19b3482c2e3b7c8890f3e76
--- /dev/null
+++ b/build/torch29-cxx11-cu129-x86_64-linux/_rotary_cuda_2022aa6.abi3.so
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:30e6433b2165e9f2c8c2dd84e828b231f51920ba35119d515f5d25606f19e661
+size 11964176
diff --git a/build/torch29-cxx11-cu129-x86_64-linux/metadata.json b/build/torch29-cxx11-cu129-x86_64-linux/metadata.json
new file mode 100644
index 0000000000000000000000000000000000000000..a794c92436c3827ae79b48d55f7ea964afd50f52
--- /dev/null
+++ b/build/torch29-cxx11-cu129-x86_64-linux/metadata.json
@@ -0,0 +1,21 @@
+{
+  "version": 1,
+  "license": "BSD-3-Clause",
+  "python-depends": [],
+  "backend": {
+    "type": "cuda",
+    "archs": [
+      "10.0",
+      "10.1",
+      "12.0+PTX",
+      "7.0",
+      "7.2",
+      "7.5",
+      "8.0",
+      "8.6",
+      "8.7",
+      "8.9",
+      "9.0"
+    ]
+  }
+}
diff --git a/build/torch29-cxx11-cu129-x86_64-linux/rotary/__init__.py b/build/torch29-cxx11-cu129-x86_64-linux/rotary/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..a9b2672c1cd85b74c1b3ded0fc0b2100e1aeac23
--- /dev/null
+++ b/build/torch29-cxx11-cu129-x86_64-linux/rotary/__init__.py
@@ -0,0 +1,26 @@
+import ctypes
+import importlib.util
+import sys
+from pathlib import Path
+from types import ModuleType
+
+
+def _import_from_path(file_path: Path) -> ModuleType:
+    """Execute the Python file at ``file_path`` and return it as a module.
+
+    It is registered in ``sys.modules`` under the hex-encoded hash of its absolute
+    path so the name is unique. Raises ``ImportError`` if no spec/loader is found.
+    """
+    module_name = "{:x}".format(ctypes.c_size_t(hash(file_path.absolute())).value)
+    spec = importlib.util.spec_from_file_location(module_name, file_path)
+    # A spec without a loader cannot be executed, so treat it like a missing spec.
+    if spec is None or spec.loader is None:
+        raise ImportError(f"Cannot load spec for {module_name} from {file_path}")
+    # module_from_spec() returns a module or raises; it never yields None.
+    module = importlib.util.module_from_spec(spec)
+    sys.modules[module_name] = module
+    spec.loader.exec_module(module)
+    return module
+
+
+globals().update(vars(_import_from_path(Path(__file__).parent.parent / "__init__.py")))  # copy the namespace of the parent directory's __init__.py into this module
diff --git a/build/torch29-cxx11-cu130-aarch64-linux/__init__.py b/build/torch29-cxx11-cu130-aarch64-linux/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..a32e6a58cb685314795328dccadba33e87eaee6f
--- /dev/null
+++ b/build/torch29-cxx11-cu130-aarch64-linux/__init__.py
@@ -0,0 +1,52 @@
+from typing import Optional, Tuple
+import torch
+
+from ._ops import ops
+
+
+def apply_rotary(
+    x1: torch.Tensor, x2: torch.Tensor,
+    cos: torch.Tensor, sin: torch.Tensor,
+    out1: torch.Tensor, out2: torch.Tensor,
+    conj: bool,
+) -> None:
+    """Call the compiled ``apply_rotary`` op with the arguments passed through as-is."""
+    # Whatever the op returns is discarded; this wrapper itself returns None.
+    op_args = (x1, x2, cos, sin, out1, out2, conj)
+    ops.apply_rotary(*op_args)
+
+
+def apply_rotary_transformers(
+    q: torch.Tensor,
+    k: torch.Tensor,
+    cos: torch.Tensor,
+    sin: torch.Tensor,
+    unsqueeze_dim: int = 1,
+) -> Tuple[torch.Tensor, torch.Tensor]:
+    """
+    Apply the rotary kernel to clones of ``q`` and ``k`` and return both clones.
+
+    The signature is meant to line up with transformers' ``apply_rotary_pos_emb``.
+    ``cos``/``sin`` get a singleton axis inserted at ``unsqueeze_dim`` and are cut
+    down to their first ``q.shape[-1] // 2`` last-axis entries when the last axis
+    has a different size. ``q`` and ``k`` themselves are never passed to the kernel.
+    """
+    cos = cos.unsqueeze(unsqueeze_dim)
+    sin = sin.unsqueeze(unsqueeze_dim)
+    q_out, k_out = q.clone(), k.clone()
+
+    split = q.shape[-1] // 2
+    if cos.shape[-1] != split:
+        # Keep only the leading ``split`` entries of the last axis.
+        cos, sin = cos[..., :split], sin[..., :split]
+
+    for rotated in (q_out, k_out):
+        # Both halves are views into the clone; each is handed to the kernel
+        # as an input and as the matching output.
+        first, second = rotated[..., :split], rotated[..., split:]
+        apply_rotary(first, second, cos, sin, first, second, False)
+
+    return q_out, k_out
+
+
+__all__ = ["apply_rotary", "apply_rotary_transformers"]
diff --git a/build/torch29-cxx11-cu130-aarch64-linux/_ops.py b/build/torch29-cxx11-cu130-aarch64-linux/_ops.py
new file mode 100644
index 0000000000000000000000000000000000000000..0040359339944bb061af9ca88170e28934477a4d
--- /dev/null
+++ b/build/torch29-cxx11-cu130-aarch64-linux/_ops.py
@@ -0,0 +1,9 @@
+import torch
+from . import _rotary_cuda_4e81b67
+ops = torch.ops._rotary_cuda_4e81b67
+
+def add_op_namespace_prefix(op_name: str):
+    """Return `op_name` prefixed with this build's op namespace and `::`."""
+    # The namespace matches the `torch.ops` attribute bound to `ops` above.
+    namespace = "_rotary_cuda_4e81b67"
+    return f"{namespace}::{op_name}"
diff --git a/build/torch29-cxx11-cu130-aarch64-linux/_rotary_cuda_4e81b67.abi3.so b/build/torch29-cxx11-cu130-aarch64-linux/_rotary_cuda_4e81b67.abi3.so
new file mode 100644
index 0000000000000000000000000000000000000000..0e905f7a4fb4532dcbd9d9855ce9acfa5754c9d7
--- /dev/null
+++ b/build/torch29-cxx11-cu130-aarch64-linux/_rotary_cuda_4e81b67.abi3.so
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:82e09ee619e75dfcc66cabaebc1da15b08b2550449e1e598b16968a0a5b8dff0
+size 10408984
diff --git a/build/torch29-cxx11-cu130-aarch64-linux/metadata.json b/build/torch29-cxx11-cu130-aarch64-linux/metadata.json
new file mode 100644
index 0000000000000000000000000000000000000000..eff725542128e103dfb5df382d74940efff77214
--- /dev/null
+++ b/build/torch29-cxx11-cu130-aarch64-linux/metadata.json
@@ -0,0 +1,19 @@
+{
+  "version": 1,
+  "license": "BSD-3-Clause",
+  "python-depends": [],
+  "backend": {
+    "type": "cuda",
+    "archs": [
+      "10.0",
+      "11.0",
+      "12.0+PTX",
+      "7.5",
+      "8.0",
+      "8.6",
+      "8.7",
+      "8.9",
+      "9.0"
+    ]
+  }
+}
diff --git a/build/torch29-cxx11-cu130-aarch64-linux/rotary/__init__.py b/build/torch29-cxx11-cu130-aarch64-linux/rotary/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..03dbc1afe1cf156661a2b1b22003cd5f599a0309
--- /dev/null
+++ b/build/torch29-cxx11-cu130-aarch64-linux/rotary/__init__.py
@@ -0,0 +1,26 @@
+import ctypes
+import sys
+
+import importlib
+from pathlib import Path
+from types import ModuleType
+
+def _import_from_path(file_path: Path) -> ModuleType:
+    """Execute `file_path` as a module registered under a path-derived name."""
+    # Plain `import importlib` does not guarantee the `util` submodule is loaded.
+    import importlib.util
+
+    # The module is added to `sys.modules`, where a plain name could be picked
+    # up by other imports; use the hex-encoded hash of the absolute path so
+    # the name is unique to this file.
+    module_name = "{:x}".format(ctypes.c_size_t(hash(file_path.absolute())).value)
+    spec = importlib.util.spec_from_file_location(module_name, file_path)
+    if spec is None:
+        raise ImportError(f"Cannot load spec for {module_name} from {file_path}")
+    module = importlib.util.module_from_spec(spec)
+    sys.modules[module_name] = module
+    spec.loader.exec_module(module)  # type: ignore
+    return module
+
+
+globals().update(vars(_import_from_path(Path(__file__).parent.parent / "__init__.py")))  # copy the namespace of ../__init__.py into this module
diff --git a/build/torch29-cxx11-cu130-x86_64-linux/__init__.py b/build/torch29-cxx11-cu130-x86_64-linux/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..a32e6a58cb685314795328dccadba33e87eaee6f
--- /dev/null
+++ b/build/torch29-cxx11-cu130-x86_64-linux/__init__.py
@@ -0,0 +1,52 @@
+from typing import Optional, Tuple
+import torch
+
+from ._ops import ops
+
+
+def apply_rotary(
+    x1: torch.Tensor, x2: torch.Tensor,
+    cos: torch.Tensor, sin: torch.Tensor,
+    out1: torch.Tensor, out2: torch.Tensor,
+    conj: bool,
+) -> None:
+    """
+    Forward all arguments, unchanged and in order, to `ops.apply_rotary`.
+    """
+    ops.apply_rotary(x1, x2, cos, sin, out1, out2, conj)
+
+
+def apply_rotary_transformers(
+    q: torch.Tensor,
+    k: torch.Tensor,
+    cos: torch.Tensor,
+    sin: torch.Tensor,
+    unsqueeze_dim: int = 1,
+) -> Tuple[torch.Tensor, torch.Tensor]:
+    """
+    Run the rotary kernel on clones of `q` and `k` and return the clones.
+
+    The signature follows transformers' `apply_rotary_pos_emb`; `cos` and
+    `sin` are unsqueezed at `unsqueeze_dim` before they reach the kernel.
+    """
+    cos_table = cos.unsqueeze(unsqueeze_dim)
+    sin_table = sin.unsqueeze(unsqueeze_dim)
+
+    # The kernel works on the two halves of the last dimension.
+    half = q.shape[-1] // 2
+    if cos_table.shape[-1] != half:
+        # Keep only the leading `half` entries of the last dimension.
+        cos_table, sin_table = cos_table[..., :half], sin_table[..., :half]
+
+    rotated = []
+    for tensor in (q, k):
+        out = tensor.clone()
+        # Both halves are views into `out` and serve as input and output.
+        first, second = out[..., :half], out[..., half:]
+        apply_rotary(first, second, cos_table, sin_table, first, second, False)
+        rotated.append(out)
+
+    return rotated[0], rotated[1]
+
+
+__all__ = ["apply_rotary", "apply_rotary_transformers"]
diff --git a/build/torch29-cxx11-cu130-x86_64-linux/_ops.py b/build/torch29-cxx11-cu130-x86_64-linux/_ops.py
new file mode 100644
index 0000000000000000000000000000000000000000..0040359339944bb061af9ca88170e28934477a4d
--- /dev/null
+++ b/build/torch29-cxx11-cu130-x86_64-linux/_ops.py
@@ -0,0 +1,9 @@
+import torch
+from . import _rotary_cuda_4e81b67
+ops = torch.ops._rotary_cuda_4e81b67
+
+def add_op_namespace_prefix(op_name: str):
+    """Return `op_name` prefixed with this build's op namespace and `::`."""
+    # The namespace matches the `torch.ops` attribute bound to `ops` above.
+    namespace = "_rotary_cuda_4e81b67"
+    return f"{namespace}::{op_name}"
diff --git a/build/torch29-cxx11-cu130-x86_64-linux/_rotary_cuda_4e81b67.abi3.so b/build/torch29-cxx11-cu130-x86_64-linux/_rotary_cuda_4e81b67.abi3.so
new file mode 100644
index 0000000000000000000000000000000000000000..8a78791edab97f3f479b441c2cae413c84d7df36
--- /dev/null
+++ b/build/torch29-cxx11-cu130-x86_64-linux/_rotary_cuda_4e81b67.abi3.so
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:cf0f41e92c54c3410e06b820e501007aa115ae95808f144de0c1281cfed4da7c
+size 10304832
diff --git a/build/torch29-cxx11-cu130-x86_64-linux/metadata.json b/build/torch29-cxx11-cu130-x86_64-linux/metadata.json
new file mode 100644
index 0000000000000000000000000000000000000000..eff725542128e103dfb5df382d74940efff77214
--- /dev/null
+++ b/build/torch29-cxx11-cu130-x86_64-linux/metadata.json
@@ -0,0 +1,19 @@
+{
+  "version": 1,
+  "license": "BSD-3-Clause",
+  "python-depends": [],
+  "backend": {
+    "type": "cuda",
+    "archs": [
+      "10.0",
+      "11.0",
+      "12.0+PTX",
+      "7.5",
+      "8.0",
+      "8.6",
+      "8.7",
+      "8.9",
+      "9.0"
+    ]
+  }
+}
diff --git a/build/torch29-cxx11-cu130-x86_64-linux/rotary/__init__.py b/build/torch29-cxx11-cu130-x86_64-linux/rotary/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..03dbc1afe1cf156661a2b1b22003cd5f599a0309
--- /dev/null
+++ b/build/torch29-cxx11-cu130-x86_64-linux/rotary/__init__.py
@@ -0,0 +1,26 @@
+import ctypes
+import sys
+
+import importlib
+from pathlib import Path
+from types import ModuleType
+
+def _import_from_path(file_path: Path) -> ModuleType:
+    """Execute `file_path` as a module registered under a path-derived name."""
+    # Plain `import importlib` does not guarantee the `util` submodule is loaded.
+    import importlib.util
+
+    # The module is added to `sys.modules`, where a plain name could be picked
+    # up by other imports; use the hex-encoded hash of the absolute path so
+    # the name is unique to this file.
+    module_name = "{:x}".format(ctypes.c_size_t(hash(file_path.absolute())).value)
+    spec = importlib.util.spec_from_file_location(module_name, file_path)
+    if spec is None:
+        raise ImportError(f"Cannot load spec for {module_name} from {file_path}")
+    module = importlib.util.module_from_spec(spec)
+    sys.modules[module_name] = module
+    spec.loader.exec_module(module)  # type: ignore
+    return module
+
+
+globals().update(vars(_import_from_path(Path(__file__).parent.parent / "__init__.py")))  # copy the namespace of ../__init__.py into this module
diff --git a/build/torch29-cxx11-xpu20252-x86_64-linux/__init__.py b/build/torch29-cxx11-xpu20252-x86_64-linux/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..a32e6a58cb685314795328dccadba33e87eaee6f
--- /dev/null
+++ b/build/torch29-cxx11-xpu20252-x86_64-linux/__init__.py
@@ -0,0 +1,52 @@
+from typing import Optional, Tuple
+import torch
+
+from ._ops import ops
+
+
+def apply_rotary(
+    x1: torch.Tensor, x2: torch.Tensor,
+    cos: torch.Tensor, sin: torch.Tensor,
+    out1: torch.Tensor, out2: torch.Tensor,
+    conj: bool,
+) -> None:
+    """
+    Forward all arguments, unchanged and in order, to `ops.apply_rotary`.
+    """
+    ops.apply_rotary(x1, x2, cos, sin, out1, out2, conj)
+
+
+def apply_rotary_transformers(
+    q: torch.Tensor,
+    k: torch.Tensor,
+    cos: torch.Tensor,
+    sin: torch.Tensor,
+    unsqueeze_dim: int = 1,
+) -> Tuple[torch.Tensor, torch.Tensor]:
+    """
+    Run the rotary kernel on clones of `q` and `k` and return the clones.
+
+    The signature follows transformers' `apply_rotary_pos_emb`; `cos` and
+    `sin` are unsqueezed at `unsqueeze_dim` before they reach the kernel.
+    """
+    cos_table = cos.unsqueeze(unsqueeze_dim)
+    sin_table = sin.unsqueeze(unsqueeze_dim)
+
+    # The kernel works on the two halves of the last dimension.
+    half = q.shape[-1] // 2
+    if cos_table.shape[-1] != half:
+        # Keep only the leading `half` entries of the last dimension.
+        cos_table, sin_table = cos_table[..., :half], sin_table[..., :half]
+
+    rotated = []
+    for tensor in (q, k):
+        out = tensor.clone()
+        # Both halves are views into `out` and serve as input and output.
+        first, second = out[..., :half], out[..., half:]
+        apply_rotary(first, second, cos_table, sin_table, first, second, False)
+        rotated.append(out)
+
+    return rotated[0], rotated[1]
+
+
+__all__ = ["apply_rotary", "apply_rotary_transformers"]
diff --git a/build/torch29-cxx11-xpu20252-x86_64-linux/_ops.py b/build/torch29-cxx11-xpu20252-x86_64-linux/_ops.py
new file mode 100644
index 0000000000000000000000000000000000000000..89c673c2015410bb249176345087b5e291299350
--- /dev/null
+++ b/build/torch29-cxx11-xpu20252-x86_64-linux/_ops.py
@@ -0,0 +1,9 @@
+import torch
+from . import _rotary_xpu_17de4fe
+ops = torch.ops._rotary_xpu_17de4fe
+
+def add_op_namespace_prefix(op_name: str):
+    """Return `op_name` prefixed with this build's op namespace and `::`."""
+    # The namespace matches the `torch.ops` attribute bound to `ops` above.
+    namespace = "_rotary_xpu_17de4fe"
+    return f"{namespace}::{op_name}"
diff --git a/build/torch29-cxx11-xpu20252-x86_64-linux/_rotary_xpu_17de4fe.abi3.so b/build/torch29-cxx11-xpu20252-x86_64-linux/_rotary_xpu_17de4fe.abi3.so
new file mode 100644
index 0000000000000000000000000000000000000000..f3d64122d3c290b60ca632899a4d6d739e06de21
--- /dev/null
+++ b/build/torch29-cxx11-xpu20252-x86_64-linux/_rotary_xpu_17de4fe.abi3.so
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:a60fb18c3e28fb30f341f8fc88a54ffac29be9db41da579b89be1b9ec3576acd
+size 2287136
diff --git a/build/torch29-cxx11-xpu20252-x86_64-linux/metadata.json b/build/torch29-cxx11-xpu20252-x86_64-linux/metadata.json
new file mode 100644
index 0000000000000000000000000000000000000000..8f032899cf61212add2325c22107252842bd1588
--- /dev/null
+++ b/build/torch29-cxx11-xpu20252-x86_64-linux/metadata.json
@@ -0,0 +1,8 @@
+{
+  "version": 1,
+  "license": "BSD-3-Clause",
+  "python-depends": [],
+  "backend": {
+    "type": "xpu"
+  }
+}
\ No newline at end of file
diff --git a/build/torch29-cxx11-xpu20252-x86_64-linux/rotary/__init__.py b/build/torch29-cxx11-xpu20252-x86_64-linux/rotary/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..a9b2672c1cd85b74c1b3ded0fc0b2100e1aeac23
--- /dev/null
+++ b/build/torch29-cxx11-xpu20252-x86_64-linux/rotary/__init__.py
@@ -0,0 +1,26 @@
+import ctypes
+import importlib.util
+import sys
+from pathlib import Path
+from types import ModuleType
+
+
+def _import_from_path(file_path: Path) -> ModuleType:
+    """Load the Python file at `file_path` as a module and return it."""
+    # The module is registered in `sys.modules`, where an ordinary name could
+    # be picked up by other imports. The hex-encoded hash of the absolute
+    # path is used as the name instead, so that it is unique.
+    unique_name = format(ctypes.c_size_t(hash(file_path.absolute())).value, "x")
+    module_spec = importlib.util.spec_from_file_location(unique_name, file_path)
+    if module_spec is None:
+        raise ImportError(f"Cannot load spec for {unique_name} from {file_path}")
+    loaded = importlib.util.module_from_spec(module_spec)
+    if loaded is None:
+        raise ImportError(f"Cannot load module {unique_name} from spec")
+    # The module is already in `sys.modules` while its code runs.
+    sys.modules[unique_name] = loaded
+    module_spec.loader.exec_module(loaded)  # type: ignore
+    return loaded
+
+
+globals().update(vars(_import_from_path(Path(__file__).parent.parent / "__init__.py")))  # copy the namespace of ../__init__.py into this module
diff --git a/build/torch29-xpu20252-x86_64-windows/metadata.json b/build/torch29-xpu20252-x86_64-windows/metadata.json
new file mode 100644
index 0000000000000000000000000000000000000000..9cf5deed9898dce769f4cc73913d3530b92a0bd8
--- /dev/null
+++ b/build/torch29-xpu20252-x86_64-windows/metadata.json
@@ -0,0 +1,4 @@
+{
+  "version": 1,
+  "python-depends": []
+}
\ No newline at end of file
diff --git a/build/torch29-xpu20252-x86_64-windows/rotary/__init__.py b/build/torch29-xpu20252-x86_64-windows/rotary/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..537713178faffc508bce05bd7d15d96ff6c3bd4c
--- /dev/null
+++ b/build/torch29-xpu20252-x86_64-windows/rotary/__init__.py
@@ -0,0 +1,52 @@
+from typing import Optional, Tuple
+import torch
+
+from ._ops import ops
+
+
+def apply_rotary(
+    x1: torch.Tensor, x2: torch.Tensor,
+    cos: torch.Tensor, sin: torch.Tensor,
+    out1: torch.Tensor, out2: torch.Tensor,
+    conj: bool,
+) -> None:
+    """
+    Forward all arguments, unchanged and in order, to `ops.apply_rotary`.
+    """
+    ops.apply_rotary(x1, x2, cos, sin, out1, out2, conj)
+
+
+def apply_rotary_transformers(
+    q: torch.Tensor,
+    k: torch.Tensor,
+    cos: torch.Tensor,
+    sin: torch.Tensor,
+    unsqueeze_dim: int = 1,
+) -> Tuple[torch.Tensor, torch.Tensor]:
+    """
+    Run the rotary kernel on clones of `q` and `k` and return the clones.
+
+    The signature follows transformers' `apply_rotary_pos_emb`; `cos` and
+    `sin` are unsqueezed at `unsqueeze_dim` before they reach the kernel.
+    """
+    cos_table = cos.unsqueeze(unsqueeze_dim)
+    sin_table = sin.unsqueeze(unsqueeze_dim)
+
+    # The kernel works on the two halves of the last dimension.
+    half = q.shape[-1] // 2
+    if cos_table.shape[-1] != half:
+        # Keep only the leading `half` entries of the last dimension.
+        cos_table, sin_table = cos_table[..., :half], sin_table[..., :half]
+
+    rotated = []
+    for tensor in (q, k):
+        out = tensor.clone()
+        # Both halves are views into `out` and serve as input and output.
+        first, second = out[..., :half], out[..., half:]
+        apply_rotary(first, second, cos_table, sin_table, first, second, False)
+        rotated.append(out)
+
+    return rotated[0], rotated[1]
+
+
+__all__ = ["apply_rotary", "apply_rotary_transformers"]
diff --git a/build/torch29-xpu20252-x86_64-windows/rotary/_ops.py b/build/torch29-xpu20252-x86_64-windows/rotary/_ops.py
new file mode 100644
index 0000000000000000000000000000000000000000..6367e8893026e18a84480fe87d722f439ed4fa51
--- /dev/null
+++ b/build/torch29-xpu20252-x86_64-windows/rotary/_ops.py
@@ -0,0 +1,9 @@
+import torch
+from . import _rotary_66b961a
+ops = torch.ops._rotary_66b961a
+
+def add_op_namespace_prefix(op_name: str):
+    """Return `op_name` prefixed with this build's op namespace and `::`."""
+    # The namespace matches the `torch.ops` attribute bound to `ops` above.
+    namespace = "_rotary_66b961a"
+    return f"{namespace}::{op_name}"
\ No newline at end of file
diff --git a/build/torch29-xpu20252-x86_64-windows/rotary/_rotary_66b961a.pyd b/build/torch29-xpu20252-x86_64-windows/rotary/_rotary_66b961a.pyd
new file mode 100644
index 0000000000000000000000000000000000000000..b838aa60a9c8d6e52c4146d5131c3da6854af356
--- /dev/null
+++ b/build/torch29-xpu20252-x86_64-windows/rotary/_rotary_66b961a.pyd
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:4067ad1adb1b5a73202ffc9c78f8c827ff9c273506705670509ae81ffac68484
+size 388096