kernels-community
/

rwkv

Kernels

Model card Files Files and versions

xet

Community

danieldk HF Staff commited on 14 days ago

Commit

adc211c

verified ·

1 Parent(s): 5f8a57e

Benchmarks uploaded using `kernels`.

Browse files

Files changed (1) hide show

benchmarks/benchmark.py +81 -0

benchmarks/benchmark.py ADDED Viewed

	@@ -0,0 +1,81 @@

+import torch
+from kernels.benchmark import Benchmark
+def rwkv_wkv_reference(
+    w: torch.Tensor, u: torch.Tensor, k: torch.Tensor, v: torch.Tensor
+) -> torch.Tensor:
+    B, T, C = k.shape
+    device = k.device
+    dtype = k.dtype
+    y = torch.zeros(B, T, C, device=device, dtype=dtype)
+    # State: accumulated numerator, denominator, and max exponent
+    aa = torch.zeros(B, C, device=device, dtype=torch.float32)
+    bb = torch.zeros(B, C, device=device, dtype=torch.float32)
+    pp = torch.full((B, C), -1e38, device=device, dtype=torch.float32)
+    w = w.float()
+    u = u.float()
+    for t in range(T):
+        kt = k[:, t, :].float()  # [B, C]
+        vt = v[:, t, :].float()  # [B, C]
+        # Output computation
+        ww = u + kt
+        p = torch.maximum(pp, ww)
+        e1 = torch.exp(pp - p)
+        e2 = torch.exp(ww - p)
+        y[:, t, :] = ((e1 * aa + e2 * vt) / (e1 * bb + e2)).to(dtype)
+        # State update (note: w + pp, not pp - w)
+        ww = w + pp
+        p = torch.maximum(ww, kt)
+        e1 = torch.exp(ww - p)
+        e2 = torch.exp(kt - p)
+        aa = e1 * aa + e2 * vt
+        bb = e1 * bb + e2
+        pp = p
+    return y
+class RwkvBenchmark(Benchmark):
+    seed: int = 42
+    def setup(self):
+        B, T, C = 2, 64, 256
+        self.w = torch.randn(
+            C, device=self.device, dtype=torch.float32
+        ).abs()  # Decay should be positive
+        self.u = torch.randn(C, device=self.device, dtype=torch.float32)
+        self.k = torch.randn(B, T, C, device=self.device, dtype=torch.float32) * 0.1
+        self.v = torch.randn(B, T, C, device=self.device, dtype=torch.float32) * 0.1
+        self.out = torch.zeros(B, T, C, device=self.device, dtype=torch.float32)
+    def benchmark_base(self):
+        self.out.zero_()
+        self.kernel.forward(self.w, self.u, self.k, self.v, self.out)
+    def verify_base(self) -> torch.Tensor:
+        return rwkv_wkv_reference(self.w, self.u, self.k, self.v)
+    def setup_large(self):
+        B, T, C = 8, 256, 512
+        self.w = torch.randn(C, device=self.device, dtype=torch.float32).abs()
+        self.u = torch.randn(C, device=self.device, dtype=torch.float32)
+        self.k = torch.randn(B, T, C, device=self.device, dtype=torch.float32) * 0.1
+        self.v = torch.randn(B, T, C, device=self.device, dtype=torch.float32) * 0.1
+        self.out = torch.zeros(B, T, C, device=self.device, dtype=torch.float32)
+    def benchmark_large(self):
+        self.out.zero_()
+        self.kernel.forward(self.w, self.u, self.k, self.v, self.out)
+    def verify_large(self) -> torch.Tensor:
+        return rwkv_wkv_reference(self.w, self.u, self.k, self.v)