drbh committed on
Commit
663e348
·
unverified ·
0 Parent(s):

Migrated from kernels-community/rotary

Browse files
This view is limited to 50 files because it contains too many changes.   See raw diff
Files changed (50) hide show
  1. .gitattributes +48 -0
  2. README.md +14 -0
  3. benchmarks/benchmark.py +119 -0
  4. build/torch210-cu128-x86_64-windows/__init__.py +52 -0
  5. build/torch210-cu128-x86_64-windows/_ops.py +9 -0
  6. build/torch210-cu128-x86_64-windows/_rotary_cuda_07a01e5.pyd +3 -0
  7. build/torch210-cu128-x86_64-windows/metadata.json +21 -0
  8. build/torch210-cu128-x86_64-windows/rotary/__init__.py +26 -0
  9. build/torch210-cxx11-cu126-aarch64-linux/__init__.py +52 -0
  10. build/torch210-cxx11-cu126-aarch64-linux/_ops.py +9 -0
  11. build/torch210-cxx11-cu126-aarch64-linux/_rotary_cuda_2022aa6.abi3.so +3 -0
  12. build/torch210-cxx11-cu126-aarch64-linux/metadata.json +18 -0
  13. build/torch210-cxx11-cu126-aarch64-linux/rotary/__init__.py +26 -0
  14. build/torch210-cxx11-cu126-x86_64-linux/__init__.py +52 -0
  15. build/torch210-cxx11-cu126-x86_64-linux/_ops.py +9 -0
  16. build/torch210-cxx11-cu126-x86_64-linux/_rotary_cuda_2022aa6.abi3.so +3 -0
  17. build/torch210-cxx11-cu126-x86_64-linux/metadata.json +18 -0
  18. build/torch210-cxx11-cu126-x86_64-linux/rotary/__init__.py +26 -0
  19. build/torch210-cxx11-cu128-aarch64-linux/__init__.py +52 -0
  20. build/torch210-cxx11-cu128-aarch64-linux/_ops.py +9 -0
  21. build/torch210-cxx11-cu128-aarch64-linux/_rotary_cuda_2022aa6.abi3.so +3 -0
  22. build/torch210-cxx11-cu128-aarch64-linux/metadata.json +21 -0
  23. build/torch210-cxx11-cu128-aarch64-linux/rotary/__init__.py +26 -0
  24. build/torch210-cxx11-cu128-x86_64-linux/__init__.py +52 -0
  25. build/torch210-cxx11-cu128-x86_64-linux/_ops.py +9 -0
  26. build/torch210-cxx11-cu128-x86_64-linux/_rotary_cuda_2022aa6.abi3.so +3 -0
  27. build/torch210-cxx11-cu128-x86_64-linux/metadata.json +21 -0
  28. build/torch210-cxx11-cu128-x86_64-linux/rotary/__init__.py +26 -0
  29. build/torch210-cxx11-cu130-aarch64-linux/__init__.py +52 -0
  30. build/torch210-cxx11-cu130-aarch64-linux/_ops.py +9 -0
  31. build/torch210-cxx11-cu130-aarch64-linux/_rotary_cuda_2022aa6.abi3.so +3 -0
  32. build/torch210-cxx11-cu130-aarch64-linux/metadata.json +19 -0
  33. build/torch210-cxx11-cu130-aarch64-linux/rotary/__init__.py +26 -0
  34. build/torch210-cxx11-cu130-x86_64-linux/__init__.py +52 -0
  35. build/torch210-cxx11-cu130-x86_64-linux/_ops.py +9 -0
  36. build/torch210-cxx11-cu130-x86_64-linux/_rotary_cuda_2022aa6.abi3.so +3 -0
  37. build/torch210-cxx11-cu130-x86_64-linux/metadata.json +19 -0
  38. build/torch210-cxx11-cu130-x86_64-linux/rotary/__init__.py +26 -0
  39. build/torch210-cxx11-xpu20253-x86_64-linux/__init__.py +52 -0
  40. build/torch210-cxx11-xpu20253-x86_64-linux/_ops.py +9 -0
  41. build/torch210-cxx11-xpu20253-x86_64-linux/_rotary_xpu_2022aa6.abi3.so +3 -0
  42. build/torch210-cxx11-xpu20253-x86_64-linux/metadata.json +8 -0
  43. build/torch210-cxx11-xpu20253-x86_64-linux/rotary/__init__.py +26 -0
  44. build/torch210-xpu20253-x86_64-windows/__init__.py +52 -0
  45. build/torch210-xpu20253-x86_64-windows/_ops.py +9 -0
  46. build/torch210-xpu20253-x86_64-windows/_rotary_xpu_07a01e5.pyd +3 -0
  47. build/torch210-xpu20253-x86_64-windows/metadata.json +5 -0
  48. build/torch210-xpu20253-x86_64-windows/rotary/__init__.py +26 -0
  49. build/torch211-cxx11-cu126-aarch64-linux/__init__.py +52 -0
  50. build/torch211-cxx11-cu126-aarch64-linux/_ops.py +9 -0
.gitattributes ADDED
@@ -0,0 +1,48 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ *.7z filter=lfs diff=lfs merge=lfs -text
2
+ *.arrow filter=lfs diff=lfs merge=lfs -text
3
+ *.bin filter=lfs diff=lfs merge=lfs -text
4
+ *.bz2 filter=lfs diff=lfs merge=lfs -text
5
+ *.ckpt filter=lfs diff=lfs merge=lfs -text
6
+ *.ftz filter=lfs diff=lfs merge=lfs -text
7
+ *.gz filter=lfs diff=lfs merge=lfs -text
8
+ *.h5 filter=lfs diff=lfs merge=lfs -text
9
+ *.joblib filter=lfs diff=lfs merge=lfs -text
10
+ *.lfs.* filter=lfs diff=lfs merge=lfs -text
11
+ *.mlmodel filter=lfs diff=lfs merge=lfs -text
12
+ *.model filter=lfs diff=lfs merge=lfs -text
13
+ *.msgpack filter=lfs diff=lfs merge=lfs -text
14
+ *.npy filter=lfs diff=lfs merge=lfs -text
15
+ *.npz filter=lfs diff=lfs merge=lfs -text
16
+ *.onnx filter=lfs diff=lfs merge=lfs -text
17
+ *.ot filter=lfs diff=lfs merge=lfs -text
18
+ *.parquet filter=lfs diff=lfs merge=lfs -text
19
+ *.pb filter=lfs diff=lfs merge=lfs -text
20
+ *.pickle filter=lfs diff=lfs merge=lfs -text
21
+ *.pkl filter=lfs diff=lfs merge=lfs -text
22
+ *.pt filter=lfs diff=lfs merge=lfs -text
23
+ *.pth filter=lfs diff=lfs merge=lfs -text
24
+ *.rar filter=lfs diff=lfs merge=lfs -text
25
+ *.safetensors filter=lfs diff=lfs merge=lfs -text
26
+ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
27
+ *.tar.* filter=lfs diff=lfs merge=lfs -text
28
+ *.tar filter=lfs diff=lfs merge=lfs -text
29
+ *.tflite filter=lfs diff=lfs merge=lfs -text
30
+ *.tgz filter=lfs diff=lfs merge=lfs -text
31
+ *.wasm filter=lfs diff=lfs merge=lfs -text
32
+ *.xz filter=lfs diff=lfs merge=lfs -text
33
+ *.zip filter=lfs diff=lfs merge=lfs -text
34
+ *.zst filter=lfs diff=lfs merge=lfs -text
35
+ *tfevents* filter=lfs diff=lfs merge=lfs -text
36
+ *.so filter=lfs diff=lfs merge=lfs -text
37
+ build/torch29-cu130-x86_64-windows/rotary/_rotary_a793e44.pyd filter=lfs diff=lfs merge=lfs -text
38
+ build/torch210-cu128-x86_64-windows/rotary/_rotary_119c830.pyd filter=lfs diff=lfs merge=lfs -text
39
+ build/torch210-cu128-x86_64-windows/rotary/_rotary_cdcfefe.pyd filter=lfs diff=lfs merge=lfs -text
40
+ build/torch29-xpu20252-x86_64-windows/rotary/_rotary_cdcfefe.pyd filter=lfs diff=lfs merge=lfs -text
41
+ build/torch210-cu128-x86_64-windows/rotary/_rotary_dec30e1.pyd filter=lfs diff=lfs merge=lfs -text
42
+ build/torch29-xpu20252-x86_64-windows/rotary/_rotary_dec30e1.pyd filter=lfs diff=lfs merge=lfs -text
43
+ build/torch210-cu128-x86_64-windows/rotary/_rotary_66b961a.pyd filter=lfs diff=lfs merge=lfs -text
44
+ build/torch29-xpu20252-x86_64-windows/rotary/_rotary_66b961a.pyd filter=lfs diff=lfs merge=lfs -text
45
+ build/torch210-cu128-x86_64-windows/rotary/_rotary_9f63cc2.pyd filter=lfs diff=lfs merge=lfs -text
46
+ build/torch210-xpu20253-x86_64-windows/rotary/_rotary_9f63cc2.pyd filter=lfs diff=lfs merge=lfs -text
47
+ build/torch210-cu128-x86_64-windows/_rotary_cuda_07a01e5.pyd filter=lfs diff=lfs merge=lfs -text
48
+ build/torch210-xpu20253-x86_64-windows/_rotary_xpu_07a01e5.pyd filter=lfs diff=lfs merge=lfs -text
README.md ADDED
@@ -0,0 +1,14 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ ---
2
+ license: bsd-3-clause
3
+ tags:
4
+ - kernels
5
+ ---
6
+
7
+ ![Status](https://hubwebhook.dholtz.com/shield?repo=kernels-community/rotary)
8
+
9
+ ## rotary
10
+
11
+ rotary embedding kernel from [Flash Attention](https://github.com/Dao-AILab/flash-attention/tree/main/csrc/rotary).
12
+
13
+ Kernel source: https://github.com/huggingface/kernels-community/tree/main/rotary
14
+
benchmarks/benchmark.py ADDED
@@ -0,0 +1,119 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import torch
2
+
3
+ from kernels.benchmark import Benchmark
4
+
5
+
6
def apply_rotary_reference(
    x1: torch.Tensor, x2: torch.Tensor, cos: torch.Tensor, sin: torch.Tensor, conj: bool
) -> tuple[torch.Tensor, torch.Tensor]:
    """Pure-PyTorch reference for the rotary rotation of the pair (x1, x2).

    With ``conj`` false the pair is rotated by the angle described by
    ``cos``/``sin``; with ``conj`` true the sign of the ``sin`` terms is
    flipped, i.e. the opposite rotation is applied.

    Returns:
        The tuple ``(out1, out2)`` of newly computed values; the inputs are
        not modified.
    """
    if conj:
        # Conjugate rotation: sin terms change sign.
        return x1 * cos + x2 * sin, -x1 * sin + x2 * cos
    return x1 * cos - x2 * sin, x1 * sin + x2 * cos
16
+
17
+
18
def _allocate_rotary_tensors(bench, batch_size, seqlen, num_heads, rotary_dim):
    """Create inputs, cos/sin tables and output buffers as attributes of ``bench``."""
    # The creation order (x1, x2, cos, sin) is kept unchanged so that the
    # sequence of random draws is the same as before the refactor.
    input_shape = (batch_size, seqlen, num_heads, rotary_dim)
    table_shape = (seqlen, 1, rotary_dim)
    tensor_args = {"device": bench.device, "dtype": torch.float32}

    # Query tensor split into rotary parts
    bench.x1 = torch.randn(*input_shape, **tensor_args)
    bench.x2 = torch.randn(*input_shape, **tensor_args)

    # Rotary position embeddings
    bench.cos = torch.randn(*table_shape, **tensor_args)
    bench.sin = torch.randn(*table_shape, **tensor_args)

    # Output tensors (in-place, so clone inputs)
    bench.out1 = bench.x1.clone()
    bench.out2 = bench.x2.clone()


def _run_rotary_kernel(bench):
    """Reset the output buffers and run the kernel in place on them."""
    # Reset outputs to input values for in-place operation
    bench.out1.copy_(bench.x1)
    bench.out2.copy_(bench.x2)
    bench.kernel.apply_rotary(
        bench.out1, bench.out2, bench.cos, bench.sin, bench.out1, bench.out2, False
    )


def _reference_output(bench):
    """Store the kernel result in ``bench.out`` and return the reference result."""
    ref_out1, ref_out2 = apply_rotary_reference(
        bench.x1, bench.x2, bench.cos, bench.sin, False
    )
    # Concatenate for comparison (benchmark compares self.out with returned tensor)
    bench.out = torch.cat([bench.out1, bench.out2], dim=-1)
    return torch.cat([ref_out1, ref_out2], dim=-1)


class RotaryBenchmark(Benchmark):
    """Benchmark of ``kernel.apply_rotary`` against the PyTorch reference.

    Two workloads are defined, ``base`` and ``large``; each has a
    ``setup*``, ``benchmark_*`` and ``verify_*`` method. The workloads only
    differ in tensor sizes, so the shared logic lives in module-level
    helpers.
    """

    seed: int = 42

    def setup(self):
        """Allocate the small workload (batch 2, seqlen 128, 8 heads, rotary dim 32)."""
        _allocate_rotary_tensors(
            self, batch_size=2, seqlen=128, num_heads=8, rotary_dim=32
        )

    def benchmark_base(self):
        """Run the kernel once on the small workload."""
        _run_rotary_kernel(self)

    def verify_base(self) -> torch.Tensor:
        """Return the reference result; the kernel result is left in ``self.out``."""
        return _reference_output(self)

    def setup_large(self):
        """Allocate the large workload (batch 8, seqlen 512, 32 heads, rotary dim 64)."""
        _allocate_rotary_tensors(
            self, batch_size=8, seqlen=512, num_heads=32, rotary_dim=64
        )

    def benchmark_large(self):
        """Run the kernel once on the large workload."""
        _run_rotary_kernel(self)

    def verify_large(self) -> torch.Tensor:
        """Return the reference result; the kernel result is left in ``self.out``."""
        return _reference_output(self)
build/torch210-cu128-x86_64-windows/__init__.py ADDED
@@ -0,0 +1,52 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from typing import Optional, Tuple
2
+ import torch
3
+
4
+ from ._ops import ops
5
+
6
+
7
def apply_rotary(
    x1: torch.Tensor,
    x2: torch.Tensor,
    cos: torch.Tensor,
    sin: torch.Tensor,
    out1: torch.Tensor,
    out2: torch.Tensor,
    conj: bool,
) -> None:
    """Forward all arguments, unchanged and in order, to the compiled op.

    This wrapper returns ``None`` and discards whatever the op returns.
    NOTE(review): the rotated values are presumably written into
    ``out1``/``out2`` by the op - confirm against the kernel source.
    """
    rotary_op = ops.apply_rotary
    rotary_op(x1, x2, cos, sin, out1, out2, conj)
17
+
18
+
19
def apply_rotary_transformers(
    q: torch.Tensor,
    k: torch.Tensor,
    cos: torch.Tensor,
    sin: torch.Tensor,
    unsqueeze_dim: int = 1,
) -> Tuple[torch.Tensor, torch.Tensor]:
    """
    Rotary kernel implementation wrapper
    Adapts rotary kernel implementation to match transformers apply_rotary_pos_emb signature

    ``q`` and ``k`` are cloned first, so the inputs are left untouched and
    the rotated clones are returned as ``(q_rotated, k_rotated)``.
    """
    cos = cos.unsqueeze(unsqueeze_dim)
    sin = sin.unsqueeze(unsqueeze_dim)

    q_rotated, k_rotated = q.clone(), k.clone()

    # The split point is derived from q and reused for k.
    half = q.shape[-1] // 2
    if cos.shape[-1] != half:
        # Keep only the leading `half` entries of cos/sin.
        cos, sin = cos[..., :half], sin[..., :half]

    for buf in (q_rotated, k_rotated):
        # The two halves are views into the clone, and are passed as both
        # input and output so the clone is rotated in place.
        lower = buf[..., :half]
        upper = buf[..., half:]
        apply_rotary(lower, upper, cos, sin, lower, upper, False)
    return q_rotated, k_rotated
50
+
51
+
52
+ __all__ = ["apply_rotary", "apply_rotary_transformers"]
build/torch210-cu128-x86_64-windows/_ops.py ADDED
@@ -0,0 +1,9 @@
 
 
 
 
 
 
 
 
 
 
1
+ import torch
2
+ from . import _rotary_cuda_07a01e5
3
+ ops = torch.ops._rotary_cuda_07a01e5
4
+
5
def add_op_namespace_prefix(op_name: str):
    """
    Return ``op_name`` qualified with this build's op namespace.
    """
    return "_rotary_cuda_07a01e5::{}".format(op_name)
build/torch210-cu128-x86_64-windows/_rotary_cuda_07a01e5.pyd ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:fd29928a6e2a3930f4c7ec3bcffc37574981cf59bed97e6a8f3c522fa7ca0dda
3
+ size 10415616
build/torch210-cu128-x86_64-windows/metadata.json ADDED
@@ -0,0 +1,21 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "version": 1,
3
+ "license": "BSD-3-Clause",
4
+ "python-depends": [],
5
+ "backend": {
6
+ "type": "cuda",
7
+ "archs": [
8
+ "10.0",
9
+ "10.1",
10
+ "12.0+PTX",
11
+ "7.0",
12
+ "7.2",
13
+ "7.5",
14
+ "8.0",
15
+ "8.6",
16
+ "8.7",
17
+ "8.9",
18
+ "9.0"
19
+ ]
20
+ }
21
+ }
build/torch210-cu128-x86_64-windows/rotary/__init__.py ADDED
@@ -0,0 +1,26 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import ctypes
2
+ import sys
3
+
4
+ import importlib
5
+ from pathlib import Path
6
+ from types import ModuleType
7
+
8
def _import_from_path(file_path: Path) -> ModuleType:
    """Load the Python file at ``file_path`` as a module under a unique name.

    The module is registered in ``sys.modules`` under the hex-encoded hash
    of its absolute path, executed, and returned.

    Raises:
        ImportError: if no import spec or module can be created for the file.
    """
    # This file only does `import importlib`, which does not guarantee that
    # the `importlib.util` submodule is bound; import it explicitly here.
    import importlib.util

    # We cannot use the module name as-is, after adding it to `sys.modules`,
    # it would also be used for other imports. So, we make a module name that
    # depends on the path for it to be unique using the hex-encoded hash of
    # the path.
    path_hash = "{:x}".format(ctypes.c_size_t(hash(file_path.absolute())).value)
    module_name = path_hash
    spec = importlib.util.spec_from_file_location(module_name, file_path)
    if spec is None:
        raise ImportError(f"Cannot load spec for {module_name} from {file_path}")
    module = importlib.util.module_from_spec(spec)
    if module is None:
        raise ImportError(f"Cannot load module {module_name} from spec")
    # Register before executing, so the module is visible while its body runs.
    sys.modules[module_name] = module
    spec.loader.exec_module(module)  # type: ignore
    return module
24
+
25
+
26
+ globals().update(vars(_import_from_path(Path(__file__).parent.parent / "__init__.py")))
build/torch210-cxx11-cu126-aarch64-linux/__init__.py ADDED
@@ -0,0 +1,52 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from typing import Optional, Tuple
2
+ import torch
3
+
4
+ from ._ops import ops
5
+
6
+
7
def apply_rotary(
    x1: torch.Tensor,
    x2: torch.Tensor,
    cos: torch.Tensor,
    sin: torch.Tensor,
    out1: torch.Tensor,
    out2: torch.Tensor,
    conj: bool,
) -> None:
    """Forward all arguments, unchanged and in order, to the compiled op.

    This wrapper returns ``None`` and discards whatever the op returns.
    NOTE(review): the rotated values are presumably written into
    ``out1``/``out2`` by the op - confirm against the kernel source.
    """
    rotary_op = ops.apply_rotary
    rotary_op(x1, x2, cos, sin, out1, out2, conj)
17
+
18
+
19
def apply_rotary_transformers(
    q: torch.Tensor,
    k: torch.Tensor,
    cos: torch.Tensor,
    sin: torch.Tensor,
    unsqueeze_dim: int = 1,
) -> Tuple[torch.Tensor, torch.Tensor]:
    """
    Rotary kernel implementation wrapper
    Adapts rotary kernel implementation to match transformers apply_rotary_pos_emb signature

    ``q`` and ``k`` are cloned first, so the inputs are left untouched and
    the rotated clones are returned as ``(q_rotated, k_rotated)``.
    """
    cos = cos.unsqueeze(unsqueeze_dim)
    sin = sin.unsqueeze(unsqueeze_dim)

    q_rotated, k_rotated = q.clone(), k.clone()

    # The split point is derived from q and reused for k.
    half = q.shape[-1] // 2
    if cos.shape[-1] != half:
        # Keep only the leading `half` entries of cos/sin.
        cos, sin = cos[..., :half], sin[..., :half]

    for buf in (q_rotated, k_rotated):
        # The two halves are views into the clone, and are passed as both
        # input and output so the clone is rotated in place.
        lower = buf[..., :half]
        upper = buf[..., half:]
        apply_rotary(lower, upper, cos, sin, lower, upper, False)
    return q_rotated, k_rotated
50
+
51
+
52
+ __all__ = ["apply_rotary", "apply_rotary_transformers"]
build/torch210-cxx11-cu126-aarch64-linux/_ops.py ADDED
@@ -0,0 +1,9 @@
 
 
 
 
 
 
 
 
 
 
1
+ import torch
2
+ from . import _rotary_cuda_2022aa6
3
+ ops = torch.ops._rotary_cuda_2022aa6
4
+
5
def add_op_namespace_prefix(op_name: str):
    """
    Return ``op_name`` qualified with this build's op namespace.
    """
    return "_rotary_cuda_2022aa6::{}".format(op_name)
build/torch210-cxx11-cu126-aarch64-linux/_rotary_cuda_2022aa6.abi3.so ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:7863cbd6a156cd3f873e926b2f8861e151d43952a26a989b9ad19753aa6270dc
3
+ size 8282888
build/torch210-cxx11-cu126-aarch64-linux/metadata.json ADDED
@@ -0,0 +1,18 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "version": 1,
3
+ "license": "BSD-3-Clause",
4
+ "python-depends": [],
5
+ "backend": {
6
+ "type": "cuda",
7
+ "archs": [
8
+ "7.0",
9
+ "7.2",
10
+ "7.5",
11
+ "8.0",
12
+ "8.6",
13
+ "8.7",
14
+ "8.9",
15
+ "9.0+PTX"
16
+ ]
17
+ }
18
+ }
build/torch210-cxx11-cu126-aarch64-linux/rotary/__init__.py ADDED
@@ -0,0 +1,26 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import ctypes
2
+ import importlib.util
3
+ import sys
4
+ from pathlib import Path
5
+ from types import ModuleType
6
+
7
+
8
def _import_from_path(file_path: Path) -> ModuleType:
    """Load the Python file at ``file_path`` as a module under a unique name.

    The plain module name cannot be used: once placed in ``sys.modules`` it
    would also be picked up by other imports. The name is therefore the
    hex-encoded hash of the file's absolute path.

    Raises:
        ImportError: if no import spec or module can be created for the file.
    """
    unique_name = format(ctypes.c_size_t(hash(file_path.absolute())).value, "x")

    module_spec = importlib.util.spec_from_file_location(unique_name, file_path)
    if module_spec is None:
        raise ImportError(f"Cannot load spec for {unique_name} from {file_path}")

    loaded = importlib.util.module_from_spec(module_spec)
    if loaded is None:
        raise ImportError(f"Cannot load module {unique_name} from spec")

    # Register before executing, so the module is visible while its body runs.
    sys.modules[unique_name] = loaded
    module_spec.loader.exec_module(loaded)  # type: ignore
    return loaded
24
+
25
+
26
+ globals().update(vars(_import_from_path(Path(__file__).parent.parent / "__init__.py")))
build/torch210-cxx11-cu126-x86_64-linux/__init__.py ADDED
@@ -0,0 +1,52 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from typing import Optional, Tuple
2
+ import torch
3
+
4
+ from ._ops import ops
5
+
6
+
7
def apply_rotary(
    x1: torch.Tensor,
    x2: torch.Tensor,
    cos: torch.Tensor,
    sin: torch.Tensor,
    out1: torch.Tensor,
    out2: torch.Tensor,
    conj: bool,
) -> None:
    """Forward all arguments, unchanged and in order, to the compiled op.

    This wrapper returns ``None`` and discards whatever the op returns.
    NOTE(review): the rotated values are presumably written into
    ``out1``/``out2`` by the op - confirm against the kernel source.
    """
    rotary_op = ops.apply_rotary
    rotary_op(x1, x2, cos, sin, out1, out2, conj)
17
+
18
+
19
def apply_rotary_transformers(
    q: torch.Tensor,
    k: torch.Tensor,
    cos: torch.Tensor,
    sin: torch.Tensor,
    unsqueeze_dim: int = 1,
) -> Tuple[torch.Tensor, torch.Tensor]:
    """
    Rotary kernel implementation wrapper
    Adapts rotary kernel implementation to match transformers apply_rotary_pos_emb signature

    ``q`` and ``k`` are cloned first, so the inputs are left untouched and
    the rotated clones are returned as ``(q_rotated, k_rotated)``.
    """
    cos = cos.unsqueeze(unsqueeze_dim)
    sin = sin.unsqueeze(unsqueeze_dim)

    q_rotated, k_rotated = q.clone(), k.clone()

    # The split point is derived from q and reused for k.
    half = q.shape[-1] // 2
    if cos.shape[-1] != half:
        # Keep only the leading `half` entries of cos/sin.
        cos, sin = cos[..., :half], sin[..., :half]

    for buf in (q_rotated, k_rotated):
        # The two halves are views into the clone, and are passed as both
        # input and output so the clone is rotated in place.
        lower = buf[..., :half]
        upper = buf[..., half:]
        apply_rotary(lower, upper, cos, sin, lower, upper, False)
    return q_rotated, k_rotated
50
+
51
+
52
+ __all__ = ["apply_rotary", "apply_rotary_transformers"]
build/torch210-cxx11-cu126-x86_64-linux/_ops.py ADDED
@@ -0,0 +1,9 @@
 
 
 
 
 
 
 
 
 
 
1
+ import torch
2
+ from . import _rotary_cuda_2022aa6
3
+ ops = torch.ops._rotary_cuda_2022aa6
4
+
5
def add_op_namespace_prefix(op_name: str):
    """
    Return ``op_name`` qualified with this build's op namespace.
    """
    return "_rotary_cuda_2022aa6::{}".format(op_name)
build/torch210-cxx11-cu126-x86_64-linux/_rotary_cuda_2022aa6.abi3.so ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:f2ac4fb2c7bbe3b277ed069761faabce67d1e1f8b3d5708f2d6f0b8b1ccfa873
3
+ size 8200568
build/torch210-cxx11-cu126-x86_64-linux/metadata.json ADDED
@@ -0,0 +1,18 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "version": 1,
3
+ "license": "BSD-3-Clause",
4
+ "python-depends": [],
5
+ "backend": {
6
+ "type": "cuda",
7
+ "archs": [
8
+ "7.0",
9
+ "7.2",
10
+ "7.5",
11
+ "8.0",
12
+ "8.6",
13
+ "8.7",
14
+ "8.9",
15
+ "9.0+PTX"
16
+ ]
17
+ }
18
+ }
build/torch210-cxx11-cu126-x86_64-linux/rotary/__init__.py ADDED
@@ -0,0 +1,26 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import ctypes
2
+ import importlib.util
3
+ import sys
4
+ from pathlib import Path
5
+ from types import ModuleType
6
+
7
+
8
def _import_from_path(file_path: Path) -> ModuleType:
    """Load the Python file at ``file_path`` as a module under a unique name.

    The plain module name cannot be used: once placed in ``sys.modules`` it
    would also be picked up by other imports. The name is therefore the
    hex-encoded hash of the file's absolute path.

    Raises:
        ImportError: if no import spec or module can be created for the file.
    """
    unique_name = format(ctypes.c_size_t(hash(file_path.absolute())).value, "x")

    module_spec = importlib.util.spec_from_file_location(unique_name, file_path)
    if module_spec is None:
        raise ImportError(f"Cannot load spec for {unique_name} from {file_path}")

    loaded = importlib.util.module_from_spec(module_spec)
    if loaded is None:
        raise ImportError(f"Cannot load module {unique_name} from spec")

    # Register before executing, so the module is visible while its body runs.
    sys.modules[unique_name] = loaded
    module_spec.loader.exec_module(loaded)  # type: ignore
    return loaded
24
+
25
+
26
+ globals().update(vars(_import_from_path(Path(__file__).parent.parent / "__init__.py")))
build/torch210-cxx11-cu128-aarch64-linux/__init__.py ADDED
@@ -0,0 +1,52 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from typing import Optional, Tuple
2
+ import torch
3
+
4
+ from ._ops import ops
5
+
6
+
7
def apply_rotary(
    x1: torch.Tensor,
    x2: torch.Tensor,
    cos: torch.Tensor,
    sin: torch.Tensor,
    out1: torch.Tensor,
    out2: torch.Tensor,
    conj: bool,
) -> None:
    """Forward all arguments, unchanged and in order, to the compiled op.

    This wrapper returns ``None`` and discards whatever the op returns.
    NOTE(review): the rotated values are presumably written into
    ``out1``/``out2`` by the op - confirm against the kernel source.
    """
    rotary_op = ops.apply_rotary
    rotary_op(x1, x2, cos, sin, out1, out2, conj)
17
+
18
+
19
def apply_rotary_transformers(
    q: torch.Tensor,
    k: torch.Tensor,
    cos: torch.Tensor,
    sin: torch.Tensor,
    unsqueeze_dim: int = 1,
) -> Tuple[torch.Tensor, torch.Tensor]:
    """
    Rotary kernel implementation wrapper
    Adapts rotary kernel implementation to match transformers apply_rotary_pos_emb signature

    ``q`` and ``k`` are cloned first, so the inputs are left untouched and
    the rotated clones are returned as ``(q_rotated, k_rotated)``.
    """
    cos = cos.unsqueeze(unsqueeze_dim)
    sin = sin.unsqueeze(unsqueeze_dim)

    q_rotated, k_rotated = q.clone(), k.clone()

    # The split point is derived from q and reused for k.
    half = q.shape[-1] // 2
    if cos.shape[-1] != half:
        # Keep only the leading `half` entries of cos/sin.
        cos, sin = cos[..., :half], sin[..., :half]

    for buf in (q_rotated, k_rotated):
        # The two halves are views into the clone, and are passed as both
        # input and output so the clone is rotated in place.
        lower = buf[..., :half]
        upper = buf[..., half:]
        apply_rotary(lower, upper, cos, sin, lower, upper, False)
    return q_rotated, k_rotated
50
+
51
+
52
+ __all__ = ["apply_rotary", "apply_rotary_transformers"]
build/torch210-cxx11-cu128-aarch64-linux/_ops.py ADDED
@@ -0,0 +1,9 @@
 
 
 
 
 
 
 
 
 
 
1
+ import torch
2
+ from . import _rotary_cuda_2022aa6
3
+ ops = torch.ops._rotary_cuda_2022aa6
4
+
5
def add_op_namespace_prefix(op_name: str):
    """
    Return ``op_name`` qualified with this build's op namespace.
    """
    return "_rotary_cuda_2022aa6::{}".format(op_name)
build/torch210-cxx11-cu128-aarch64-linux/_rotary_cuda_2022aa6.abi3.so ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:696ff3570b3f6fbc9623e44b53f189bb0be0bc6260d490616b03c58dd5dd2146
3
+ size 12019200
build/torch210-cxx11-cu128-aarch64-linux/metadata.json ADDED
@@ -0,0 +1,21 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "version": 1,
3
+ "license": "BSD-3-Clause",
4
+ "python-depends": [],
5
+ "backend": {
6
+ "type": "cuda",
7
+ "archs": [
8
+ "10.0",
9
+ "10.1",
10
+ "12.0+PTX",
11
+ "7.0",
12
+ "7.2",
13
+ "7.5",
14
+ "8.0",
15
+ "8.6",
16
+ "8.7",
17
+ "8.9",
18
+ "9.0"
19
+ ]
20
+ }
21
+ }
build/torch210-cxx11-cu128-aarch64-linux/rotary/__init__.py ADDED
@@ -0,0 +1,26 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import ctypes
2
+ import importlib.util
3
+ import sys
4
+ from pathlib import Path
5
+ from types import ModuleType
6
+
7
+
8
def _import_from_path(file_path: Path) -> ModuleType:
    """Load the Python file at ``file_path`` as a module under a unique name.

    The plain module name cannot be used: once placed in ``sys.modules`` it
    would also be picked up by other imports. The name is therefore the
    hex-encoded hash of the file's absolute path.

    Raises:
        ImportError: if no import spec or module can be created for the file.
    """
    unique_name = format(ctypes.c_size_t(hash(file_path.absolute())).value, "x")

    module_spec = importlib.util.spec_from_file_location(unique_name, file_path)
    if module_spec is None:
        raise ImportError(f"Cannot load spec for {unique_name} from {file_path}")

    loaded = importlib.util.module_from_spec(module_spec)
    if loaded is None:
        raise ImportError(f"Cannot load module {unique_name} from spec")

    # Register before executing, so the module is visible while its body runs.
    sys.modules[unique_name] = loaded
    module_spec.loader.exec_module(loaded)  # type: ignore
    return loaded
24
+
25
+
26
+ globals().update(vars(_import_from_path(Path(__file__).parent.parent / "__init__.py")))
build/torch210-cxx11-cu128-x86_64-linux/__init__.py ADDED
@@ -0,0 +1,52 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from typing import Optional, Tuple
2
+ import torch
3
+
4
+ from ._ops import ops
5
+
6
+
7
def apply_rotary(
    x1: torch.Tensor,
    x2: torch.Tensor,
    cos: torch.Tensor,
    sin: torch.Tensor,
    out1: torch.Tensor,
    out2: torch.Tensor,
    conj: bool,
) -> None:
    """Forward all arguments, unchanged and in order, to the compiled op.

    This wrapper returns ``None`` and discards whatever the op returns.
    NOTE(review): the rotated values are presumably written into
    ``out1``/``out2`` by the op - confirm against the kernel source.
    """
    rotary_op = ops.apply_rotary
    rotary_op(x1, x2, cos, sin, out1, out2, conj)
17
+
18
+
19
def apply_rotary_transformers(
    q: torch.Tensor,
    k: torch.Tensor,
    cos: torch.Tensor,
    sin: torch.Tensor,
    unsqueeze_dim: int = 1,
) -> Tuple[torch.Tensor, torch.Tensor]:
    """
    Rotary kernel implementation wrapper
    Adapts rotary kernel implementation to match transformers apply_rotary_pos_emb signature

    ``q`` and ``k`` are cloned first, so the inputs are left untouched and
    the rotated clones are returned as ``(q_rotated, k_rotated)``.
    """
    cos = cos.unsqueeze(unsqueeze_dim)
    sin = sin.unsqueeze(unsqueeze_dim)

    q_rotated, k_rotated = q.clone(), k.clone()

    # The split point is derived from q and reused for k.
    half = q.shape[-1] // 2
    if cos.shape[-1] != half:
        # Keep only the leading `half` entries of cos/sin.
        cos, sin = cos[..., :half], sin[..., :half]

    for buf in (q_rotated, k_rotated):
        # The two halves are views into the clone, and are passed as both
        # input and output so the clone is rotated in place.
        lower = buf[..., :half]
        upper = buf[..., half:]
        apply_rotary(lower, upper, cos, sin, lower, upper, False)
    return q_rotated, k_rotated
50
+
51
+
52
+ __all__ = ["apply_rotary", "apply_rotary_transformers"]
build/torch210-cxx11-cu128-x86_64-linux/_ops.py ADDED
@@ -0,0 +1,9 @@
 
 
 
 
 
 
 
 
 
 
1
+ import torch
2
+ from . import _rotary_cuda_2022aa6
3
+ ops = torch.ops._rotary_cuda_2022aa6
4
+
5
def add_op_namespace_prefix(op_name: str):
    """
    Return ``op_name`` qualified with this build's op namespace.
    """
    return "_rotary_cuda_2022aa6::{}".format(op_name)
build/torch210-cxx11-cu128-x86_64-linux/_rotary_cuda_2022aa6.abi3.so ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:1238e4b57b2f30d5c5f67fc1d64a133de551f9b68b619271ac2a10f948d66b04
3
+ size 11905904
build/torch210-cxx11-cu128-x86_64-linux/metadata.json ADDED
@@ -0,0 +1,21 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "version": 1,
3
+ "license": "BSD-3-Clause",
4
+ "python-depends": [],
5
+ "backend": {
6
+ "type": "cuda",
7
+ "archs": [
8
+ "10.0",
9
+ "10.1",
10
+ "12.0+PTX",
11
+ "7.0",
12
+ "7.2",
13
+ "7.5",
14
+ "8.0",
15
+ "8.6",
16
+ "8.7",
17
+ "8.9",
18
+ "9.0"
19
+ ]
20
+ }
21
+ }
build/torch210-cxx11-cu128-x86_64-linux/rotary/__init__.py ADDED
@@ -0,0 +1,26 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import ctypes
2
+ import importlib.util
3
+ import sys
4
+ from pathlib import Path
5
+ from types import ModuleType
6
+
7
+
8
def _import_from_path(file_path: Path) -> ModuleType:
    """Load the Python file at ``file_path`` as a module under a unique name.

    The plain module name cannot be used: once placed in ``sys.modules`` it
    would also be picked up by other imports. The name is therefore the
    hex-encoded hash of the file's absolute path.

    Raises:
        ImportError: if no import spec or module can be created for the file.
    """
    unique_name = format(ctypes.c_size_t(hash(file_path.absolute())).value, "x")

    module_spec = importlib.util.spec_from_file_location(unique_name, file_path)
    if module_spec is None:
        raise ImportError(f"Cannot load spec for {unique_name} from {file_path}")

    loaded = importlib.util.module_from_spec(module_spec)
    if loaded is None:
        raise ImportError(f"Cannot load module {unique_name} from spec")

    # Register before executing, so the module is visible while its body runs.
    sys.modules[unique_name] = loaded
    module_spec.loader.exec_module(loaded)  # type: ignore
    return loaded
24
+
25
+
26
+ globals().update(vars(_import_from_path(Path(__file__).parent.parent / "__init__.py")))
build/torch210-cxx11-cu130-aarch64-linux/__init__.py ADDED
@@ -0,0 +1,52 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from typing import Optional, Tuple
2
+ import torch
3
+
4
+ from ._ops import ops
5
+
6
+
7
def apply_rotary(
    x1: torch.Tensor,
    x2: torch.Tensor,
    cos: torch.Tensor,
    sin: torch.Tensor,
    out1: torch.Tensor,
    out2: torch.Tensor,
    conj: bool,
) -> None:
    """Forward all arguments, unchanged and in order, to the compiled op.

    This wrapper returns ``None`` and discards whatever the op returns.
    NOTE(review): the rotated values are presumably written into
    ``out1``/``out2`` by the op - confirm against the kernel source.
    """
    rotary_op = ops.apply_rotary
    rotary_op(x1, x2, cos, sin, out1, out2, conj)
17
+
18
+
19
def apply_rotary_transformers(
    q: torch.Tensor,
    k: torch.Tensor,
    cos: torch.Tensor,
    sin: torch.Tensor,
    unsqueeze_dim: int = 1,
) -> Tuple[torch.Tensor, torch.Tensor]:
    """
    Rotary kernel implementation wrapper
    Adapts rotary kernel implementation to match transformers apply_rotary_pos_emb signature

    ``q`` and ``k`` are cloned first, so the inputs are left untouched and
    the rotated clones are returned as ``(q_rotated, k_rotated)``.
    """
    cos = cos.unsqueeze(unsqueeze_dim)
    sin = sin.unsqueeze(unsqueeze_dim)

    q_rotated, k_rotated = q.clone(), k.clone()

    # The split point is derived from q and reused for k.
    half = q.shape[-1] // 2
    if cos.shape[-1] != half:
        # Keep only the leading `half` entries of cos/sin.
        cos, sin = cos[..., :half], sin[..., :half]

    for buf in (q_rotated, k_rotated):
        # The two halves are views into the clone, and are passed as both
        # input and output so the clone is rotated in place.
        lower = buf[..., :half]
        upper = buf[..., half:]
        apply_rotary(lower, upper, cos, sin, lower, upper, False)
    return q_rotated, k_rotated
50
+
51
+
52
+ __all__ = ["apply_rotary", "apply_rotary_transformers"]
build/torch210-cxx11-cu130-aarch64-linux/_ops.py ADDED
@@ -0,0 +1,9 @@
 
 
 
 
 
 
 
 
 
 
1
+ import torch
2
+ from . import _rotary_cuda_2022aa6
3
+ ops = torch.ops._rotary_cuda_2022aa6
4
+
5
def add_op_namespace_prefix(op_name: str):
    """
    Return ``op_name`` qualified with this build's op namespace.
    """
    return "_rotary_cuda_2022aa6::{}".format(op_name)
build/torch210-cxx11-cu130-aarch64-linux/_rotary_cuda_2022aa6.abi3.so ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:686edb81b5ffdc43e88e35995b962aed5d23061c6aa27aff61af910b76cf03bf
3
+ size 10411432
build/torch210-cxx11-cu130-aarch64-linux/metadata.json ADDED
@@ -0,0 +1,19 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "version": 1,
3
+ "license": "BSD-3-Clause",
4
+ "python-depends": [],
5
+ "backend": {
6
+ "type": "cuda",
7
+ "archs": [
8
+ "10.0",
9
+ "11.0",
10
+ "12.0+PTX",
11
+ "7.5",
12
+ "8.0",
13
+ "8.6",
14
+ "8.7",
15
+ "8.9",
16
+ "9.0"
17
+ ]
18
+ }
19
+ }
build/torch210-cxx11-cu130-aarch64-linux/rotary/__init__.py ADDED
@@ -0,0 +1,26 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import ctypes
2
+ import importlib.util
3
+ import sys
4
+ from pathlib import Path
5
+ from types import ModuleType
6
+
7
+
8
def _import_from_path(file_path: Path) -> ModuleType:
    """Load the Python file at ``file_path`` as a module and return it.

    The module is registered in ``sys.modules`` under a name derived from
    the hash of the file's absolute path, not under the file's own name.

    Args:
        file_path: Path of the Python source file to execute.

    Returns:
        The executed module object.

    Raises:
        ImportError: If no import spec (or no loader) can be created for
            ``file_path``.
        Exception: Whatever the module's own top-level code raises; in that
            case the partially initialised module is removed from
            ``sys.modules`` before the exception propagates.
    """
    # We cannot use the module name as-is, after adding it to `sys.modules`,
    # it would also be used for other imports. So, we make a module name that
    # depends on the path for it to be unique using the hex-encoded hash of
    # the path.
    path_hash = "{:x}".format(ctypes.c_size_t(hash(file_path.absolute())).value)
    module_name = path_hash
    spec = importlib.util.spec_from_file_location(module_name, file_path)
    # A spec without a loader cannot be executed, so treat it like a missing spec.
    if spec is None or spec.loader is None:
        raise ImportError(f"Cannot load spec for {module_name} from {file_path}")
    module = importlib.util.module_from_spec(spec)
    # Register before executing so the module can be found while it runs.
    sys.modules[module_name] = module
    try:
        spec.loader.exec_module(module)
    except BaseException:
        # Do not leave a half-initialised module behind for later lookups.
        sys.modules.pop(module_name, None)
        raise
    return module
24
+
25
+
26
+ globals().update(vars(_import_from_path(Path(__file__).parent.parent / "__init__.py")))
build/torch210-cxx11-cu130-x86_64-linux/__init__.py ADDED
@@ -0,0 +1,52 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from typing import Optional, Tuple
2
+ import torch
3
+
4
+ from ._ops import ops
5
+
6
+
7
def apply_rotary(
    x1: torch.Tensor,
    x2: torch.Tensor,
    cos: torch.Tensor,
    sin: torch.Tensor,
    out1: torch.Tensor,
    out2: torch.Tensor,
    conj: bool,
) -> None:
    """Call the compiled ``apply_rotary`` op with the given arguments.

    All seven arguments are forwarded positionally and unchanged. This
    wrapper returns nothing.

    NOTE(review): presumably the op writes its results into ``out1`` and
    ``out2`` and ``conj`` selects the conjugate rotation -- confirm against
    the kernel source.
    """
    kernel_args = (x1, x2, cos, sin, out1, out2, conj)
    ops.apply_rotary(*kernel_args)
17
+
18
+
19
def apply_rotary_transformers(
    q: torch.Tensor,
    k: torch.Tensor,
    cos: torch.Tensor,
    sin: torch.Tensor,
    unsqueeze_dim: int = 1,
) -> Tuple[torch.Tensor, torch.Tensor]:
    """
    Rotary kernel wrapper with the transformers ``apply_rotary_pos_emb`` signature.

    ``q`` and ``k`` are left untouched: the kernel is run in place on clones
    of them, and those clones are returned as ``(q_rotated, k_rotated)``.

    ``cos`` and ``sin`` get a new axis inserted at ``unsqueeze_dim`` and, if
    their last axis differs from half of ``q``'s last axis, are sliced to at
    most that many leading entries.
    """
    cos, sin = cos.unsqueeze(unsqueeze_dim), sin.unsqueeze(unsqueeze_dim)

    q_rotated, k_rotated = q.clone(), k.clone()

    # The split point is derived from q and reused for k.
    split = q.shape[-1] // 2

    if cos.shape[-1] != split:
        # Keep only the leading entries so cos/sin line up with one half.
        cos = cos[..., :split]
        sin = sin[..., :split]

    for buffer in (q_rotated, k_rotated):
        # Both halves are views into the clone, so passing them as inputs
        # and outputs makes the kernel update the clone in place.
        first_half = buffer[..., :split]
        second_half = buffer[..., split:]
        apply_rotary(first_half, second_half, cos, sin, first_half, second_half, False)

    return q_rotated, k_rotated
50
+
51
+
52
+ __all__ = ["apply_rotary", "apply_rotary_transformers"]
build/torch210-cxx11-cu130-x86_64-linux/_ops.py ADDED
@@ -0,0 +1,9 @@
 
 
 
 
 
 
 
 
 
 
1
+ import torch
2
+ from . import _rotary_cuda_2022aa6
3
+ ops = torch.ops._rotary_cuda_2022aa6
4
+
5
def add_op_namespace_prefix(op_name: str):
    """
    Return ``op_name`` qualified with this build's op namespace.

    The result has the form ``"<namespace>::<op_name>"``, using the same
    namespace string that this module looks up under ``torch.ops``.
    """
    return "_rotary_cuda_2022aa6::{}".format(op_name)
build/torch210-cxx11-cu130-x86_64-linux/_rotary_cuda_2022aa6.abi3.so ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:069004af51893d2f112d58bc00197cf813c5271ef6f9105936b7966bbb44881f
3
+ size 10310752
build/torch210-cxx11-cu130-x86_64-linux/metadata.json ADDED
@@ -0,0 +1,19 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "version": 1,
3
+ "license": "BSD-3-Clause",
4
+ "python-depends": [],
5
+ "backend": {
6
+ "type": "cuda",
7
+ "archs": [
8
+ "10.0",
9
+ "11.0",
10
+ "12.0+PTX",
11
+ "7.5",
12
+ "8.0",
13
+ "8.6",
14
+ "8.7",
15
+ "8.9",
16
+ "9.0"
17
+ ]
18
+ }
19
+ }
build/torch210-cxx11-cu130-x86_64-linux/rotary/__init__.py ADDED
@@ -0,0 +1,26 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import ctypes
2
+ import importlib.util
3
+ import sys
4
+ from pathlib import Path
5
+ from types import ModuleType
6
+
7
+
8
def _import_from_path(file_path: Path) -> ModuleType:
    """Load the Python file at ``file_path`` as a module and return it.

    The module is registered in ``sys.modules`` under a name derived from
    the hash of the file's absolute path, not under the file's own name.

    Args:
        file_path: Path of the Python source file to execute.

    Returns:
        The executed module object.

    Raises:
        ImportError: If no import spec (or no loader) can be created for
            ``file_path``.
        Exception: Whatever the module's own top-level code raises; in that
            case the partially initialised module is removed from
            ``sys.modules`` before the exception propagates.
    """
    # We cannot use the module name as-is, after adding it to `sys.modules`,
    # it would also be used for other imports. So, we make a module name that
    # depends on the path for it to be unique using the hex-encoded hash of
    # the path.
    path_hash = "{:x}".format(ctypes.c_size_t(hash(file_path.absolute())).value)
    module_name = path_hash
    spec = importlib.util.spec_from_file_location(module_name, file_path)
    # A spec without a loader cannot be executed, so treat it like a missing spec.
    if spec is None or spec.loader is None:
        raise ImportError(f"Cannot load spec for {module_name} from {file_path}")
    module = importlib.util.module_from_spec(spec)
    # Register before executing so the module can be found while it runs.
    sys.modules[module_name] = module
    try:
        spec.loader.exec_module(module)
    except BaseException:
        # Do not leave a half-initialised module behind for later lookups.
        sys.modules.pop(module_name, None)
        raise
    return module
24
+
25
+
26
+ globals().update(vars(_import_from_path(Path(__file__).parent.parent / "__init__.py")))
build/torch210-cxx11-xpu20253-x86_64-linux/__init__.py ADDED
@@ -0,0 +1,52 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from typing import Optional, Tuple
2
+ import torch
3
+
4
+ from ._ops import ops
5
+
6
+
7
def apply_rotary(
    x1: torch.Tensor,
    x2: torch.Tensor,
    cos: torch.Tensor,
    sin: torch.Tensor,
    out1: torch.Tensor,
    out2: torch.Tensor,
    conj: bool,
) -> None:
    """Call the compiled ``apply_rotary`` op with the given arguments.

    All seven arguments are forwarded positionally and unchanged. This
    wrapper returns nothing.

    NOTE(review): presumably the op writes its results into ``out1`` and
    ``out2`` and ``conj`` selects the conjugate rotation -- confirm against
    the kernel source.
    """
    kernel_args = (x1, x2, cos, sin, out1, out2, conj)
    ops.apply_rotary(*kernel_args)
17
+
18
+
19
def apply_rotary_transformers(
    q: torch.Tensor,
    k: torch.Tensor,
    cos: torch.Tensor,
    sin: torch.Tensor,
    unsqueeze_dim: int = 1,
) -> Tuple[torch.Tensor, torch.Tensor]:
    """
    Rotary kernel wrapper with the transformers ``apply_rotary_pos_emb`` signature.

    ``q`` and ``k`` are left untouched: the kernel is run in place on clones
    of them, and those clones are returned as ``(q_rotated, k_rotated)``.

    ``cos`` and ``sin`` get a new axis inserted at ``unsqueeze_dim`` and, if
    their last axis differs from half of ``q``'s last axis, are sliced to at
    most that many leading entries.
    """
    cos, sin = cos.unsqueeze(unsqueeze_dim), sin.unsqueeze(unsqueeze_dim)

    q_rotated, k_rotated = q.clone(), k.clone()

    # The split point is derived from q and reused for k.
    split = q.shape[-1] // 2

    if cos.shape[-1] != split:
        # Keep only the leading entries so cos/sin line up with one half.
        cos = cos[..., :split]
        sin = sin[..., :split]

    for buffer in (q_rotated, k_rotated):
        # Both halves are views into the clone, so passing them as inputs
        # and outputs makes the kernel update the clone in place.
        first_half = buffer[..., :split]
        second_half = buffer[..., split:]
        apply_rotary(first_half, second_half, cos, sin, first_half, second_half, False)

    return q_rotated, k_rotated
50
+
51
+
52
+ __all__ = ["apply_rotary", "apply_rotary_transformers"]
build/torch210-cxx11-xpu20253-x86_64-linux/_ops.py ADDED
@@ -0,0 +1,9 @@
 
 
 
 
 
 
 
 
 
 
1
+ import torch
2
+ from . import _rotary_xpu_2022aa6
3
+ ops = torch.ops._rotary_xpu_2022aa6
4
+
5
def add_op_namespace_prefix(op_name: str):
    """
    Return ``op_name`` qualified with this build's op namespace.

    The result has the form ``"<namespace>::<op_name>"``, using the same
    namespace string that this module looks up under ``torch.ops``.
    """
    return "_rotary_xpu_2022aa6::{}".format(op_name)
build/torch210-cxx11-xpu20253-x86_64-linux/_rotary_xpu_2022aa6.abi3.so ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:26ce5dd015655bbbccf535f2b7078b184d01831778effd3058fa24256be69111
3
+ size 2301504
build/torch210-cxx11-xpu20253-x86_64-linux/metadata.json ADDED
@@ -0,0 +1,8 @@
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "version": 1,
3
+ "license": "BSD-3-Clause",
4
+ "python-depends": [],
5
+ "backend": {
6
+ "type": "xpu"
7
+ }
8
+ }
build/torch210-cxx11-xpu20253-x86_64-linux/rotary/__init__.py ADDED
@@ -0,0 +1,26 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import ctypes
2
+ import importlib.util
3
+ import sys
4
+ from pathlib import Path
5
+ from types import ModuleType
6
+
7
+
8
def _import_from_path(file_path: Path) -> ModuleType:
    """Load the Python file at ``file_path`` as a module and return it.

    The module is registered in ``sys.modules`` under a name derived from
    the hash of the file's absolute path, not under the file's own name.

    Args:
        file_path: Path of the Python source file to execute.

    Returns:
        The executed module object.

    Raises:
        ImportError: If no import spec (or no loader) can be created for
            ``file_path``.
        Exception: Whatever the module's own top-level code raises; in that
            case the partially initialised module is removed from
            ``sys.modules`` before the exception propagates.
    """
    # We cannot use the module name as-is, after adding it to `sys.modules`,
    # it would also be used for other imports. So, we make a module name that
    # depends on the path for it to be unique using the hex-encoded hash of
    # the path.
    path_hash = "{:x}".format(ctypes.c_size_t(hash(file_path.absolute())).value)
    module_name = path_hash
    spec = importlib.util.spec_from_file_location(module_name, file_path)
    # A spec without a loader cannot be executed, so treat it like a missing spec.
    if spec is None or spec.loader is None:
        raise ImportError(f"Cannot load spec for {module_name} from {file_path}")
    module = importlib.util.module_from_spec(spec)
    # Register before executing so the module can be found while it runs.
    sys.modules[module_name] = module
    try:
        spec.loader.exec_module(module)
    except BaseException:
        # Do not leave a half-initialised module behind for later lookups.
        sys.modules.pop(module_name, None)
        raise
    return module
24
+
25
+
26
+ globals().update(vars(_import_from_path(Path(__file__).parent.parent / "__init__.py")))
build/torch210-xpu20253-x86_64-windows/__init__.py ADDED
@@ -0,0 +1,52 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from typing import Optional, Tuple
2
+ import torch
3
+
4
+ from ._ops import ops
5
+
6
+
7
def apply_rotary(
    x1: torch.Tensor,
    x2: torch.Tensor,
    cos: torch.Tensor,
    sin: torch.Tensor,
    out1: torch.Tensor,
    out2: torch.Tensor,
    conj: bool,
) -> None:
    """Call the compiled ``apply_rotary`` op with the given arguments.

    All seven arguments are forwarded positionally and unchanged. This
    wrapper returns nothing.

    NOTE(review): presumably the op writes its results into ``out1`` and
    ``out2`` and ``conj`` selects the conjugate rotation -- confirm against
    the kernel source.
    """
    kernel_args = (x1, x2, cos, sin, out1, out2, conj)
    ops.apply_rotary(*kernel_args)
17
+
18
+
19
def apply_rotary_transformers(
    q: torch.Tensor,
    k: torch.Tensor,
    cos: torch.Tensor,
    sin: torch.Tensor,
    unsqueeze_dim: int = 1,
) -> Tuple[torch.Tensor, torch.Tensor]:
    """
    Rotary kernel wrapper with the transformers ``apply_rotary_pos_emb`` signature.

    ``q`` and ``k`` are left untouched: the kernel is run in place on clones
    of them, and those clones are returned as ``(q_rotated, k_rotated)``.

    ``cos`` and ``sin`` get a new axis inserted at ``unsqueeze_dim`` and, if
    their last axis differs from half of ``q``'s last axis, are sliced to at
    most that many leading entries.
    """
    cos, sin = cos.unsqueeze(unsqueeze_dim), sin.unsqueeze(unsqueeze_dim)

    q_rotated, k_rotated = q.clone(), k.clone()

    # The split point is derived from q and reused for k.
    split = q.shape[-1] // 2

    if cos.shape[-1] != split:
        # Keep only the leading entries so cos/sin line up with one half.
        cos = cos[..., :split]
        sin = sin[..., :split]

    for buffer in (q_rotated, k_rotated):
        # Both halves are views into the clone, so passing them as inputs
        # and outputs makes the kernel update the clone in place.
        first_half = buffer[..., :split]
        second_half = buffer[..., split:]
        apply_rotary(first_half, second_half, cos, sin, first_half, second_half, False)

    return q_rotated, k_rotated
50
+
51
+
52
+ __all__ = ["apply_rotary", "apply_rotary_transformers"]
build/torch210-xpu20253-x86_64-windows/_ops.py ADDED
@@ -0,0 +1,9 @@
 
 
 
 
 
 
 
 
 
 
1
+ import torch
2
+ from . import _rotary_xpu_07a01e5
3
+ ops = torch.ops._rotary_xpu_07a01e5
4
+
5
def add_op_namespace_prefix(op_name: str):
    """
    Return ``op_name`` qualified with this build's op namespace.

    The result has the form ``"<namespace>::<op_name>"``, using the same
    namespace string that this module looks up under ``torch.ops``.
    """
    return "_rotary_xpu_07a01e5::{}".format(op_name)
build/torch210-xpu20253-x86_64-windows/_rotary_xpu_07a01e5.pyd ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:02d857f2afd55cccc36d439f348ff360bdc7274c0e65660e41a2f8775526dec1
3
+ size 396288
build/torch210-xpu20253-x86_64-windows/metadata.json ADDED
@@ -0,0 +1,5 @@
 
 
 
 
 
 
1
+ {
2
+ "version": 1,
3
+ "license": "BSD-3-Clause",
4
+ "python-depends": []
5
+ }
build/torch210-xpu20253-x86_64-windows/rotary/__init__.py ADDED
@@ -0,0 +1,26 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import ctypes
2
+ import sys
3
+
4
+ import importlib
5
+ from pathlib import Path
6
+ from types import ModuleType
7
+
8
def _import_from_path(file_path: Path) -> ModuleType:
    """Load the Python file at ``file_path`` as a module and return it.

    The module is registered in ``sys.modules`` under a name derived from
    the hash of the file's absolute path, not under the file's own name.

    Args:
        file_path: Path of the Python source file to execute.

    Returns:
        The executed module object.

    Raises:
        ImportError: If no import spec (or no loader) can be created for
            ``file_path``.
        Exception: Whatever the module's own top-level code raises; in that
            case the partially initialised module is removed from
            ``sys.modules`` before the exception propagates.
    """
    # This file only does `import importlib`, which does not guarantee that
    # the `importlib.util` submodule is loaded; import it explicitly here.
    import importlib.util

    # We cannot use the module name as-is, after adding it to `sys.modules`,
    # it would also be used for other imports. So, we make a module name that
    # depends on the path for it to be unique using the hex-encoded hash of
    # the path.
    path_hash = "{:x}".format(ctypes.c_size_t(hash(file_path.absolute())).value)
    module_name = path_hash
    spec = importlib.util.spec_from_file_location(module_name, file_path)
    # A spec without a loader cannot be executed, so treat it like a missing spec.
    if spec is None or spec.loader is None:
        raise ImportError(f"Cannot load spec for {module_name} from {file_path}")
    module = importlib.util.module_from_spec(spec)
    # Register before executing so the module can be found while it runs.
    sys.modules[module_name] = module
    try:
        spec.loader.exec_module(module)
    except BaseException:
        # Do not leave a half-initialised module behind for later lookups.
        sys.modules.pop(module_name, None)
        raise
    return module
24
+
25
+
26
+ globals().update(vars(_import_from_path(Path(__file__).parent.parent / "__init__.py")))
build/torch211-cxx11-cu126-aarch64-linux/__init__.py ADDED
@@ -0,0 +1,52 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from typing import Optional, Tuple
2
+ import torch
3
+
4
+ from ._ops import ops
5
+
6
+
7
def apply_rotary(
    x1: torch.Tensor,
    x2: torch.Tensor,
    cos: torch.Tensor,
    sin: torch.Tensor,
    out1: torch.Tensor,
    out2: torch.Tensor,
    conj: bool,
) -> None:
    """Call the compiled ``apply_rotary`` op with the given arguments.

    All seven arguments are forwarded positionally and unchanged. This
    wrapper returns nothing.

    NOTE(review): presumably the op writes its results into ``out1`` and
    ``out2`` and ``conj`` selects the conjugate rotation -- confirm against
    the kernel source.
    """
    kernel_args = (x1, x2, cos, sin, out1, out2, conj)
    ops.apply_rotary(*kernel_args)
17
+
18
+
19
def apply_rotary_transformers(
    q: torch.Tensor,
    k: torch.Tensor,
    cos: torch.Tensor,
    sin: torch.Tensor,
    unsqueeze_dim: int = 1,
) -> Tuple[torch.Tensor, torch.Tensor]:
    """
    Rotary kernel wrapper with the transformers ``apply_rotary_pos_emb`` signature.

    ``q`` and ``k`` are left untouched: the kernel is run in place on clones
    of them, and those clones are returned as ``(q_rotated, k_rotated)``.

    ``cos`` and ``sin`` get a new axis inserted at ``unsqueeze_dim`` and, if
    their last axis differs from half of ``q``'s last axis, are sliced to at
    most that many leading entries.
    """
    cos, sin = cos.unsqueeze(unsqueeze_dim), sin.unsqueeze(unsqueeze_dim)

    q_rotated, k_rotated = q.clone(), k.clone()

    # The split point is derived from q and reused for k.
    split = q.shape[-1] // 2

    if cos.shape[-1] != split:
        # Keep only the leading entries so cos/sin line up with one half.
        cos = cos[..., :split]
        sin = sin[..., :split]

    for buffer in (q_rotated, k_rotated):
        # Both halves are views into the clone, so passing them as inputs
        # and outputs makes the kernel update the clone in place.
        first_half = buffer[..., :split]
        second_half = buffer[..., split:]
        apply_rotary(first_half, second_half, cos, sin, first_half, second_half, False)

    return q_rotated, k_rotated
50
+
51
+
52
+ __all__ = ["apply_rotary", "apply_rotary_transformers"]
build/torch211-cxx11-cu126-aarch64-linux/_ops.py ADDED
@@ -0,0 +1,9 @@
 
 
 
 
 
 
 
 
 
 
1
+ import torch
2
+ from . import _rotary_cuda_2022aa6
3
+ ops = torch.ops._rotary_cuda_2022aa6
4
+
5
def add_op_namespace_prefix(op_name: str):
    """
    Return ``op_name`` qualified with this build's op namespace.

    The result has the form ``"<namespace>::<op_name>"``, using the same
    namespace string that this module looks up under ``torch.ops``.
    """
    return "_rotary_cuda_2022aa6::{}".format(op_name)