medmekk committed on Oct 1, 2025

Commit

84ec9f0

verified ·

1 Parent(s): 57f64dd

Upload folder using huggingface_hub

Browse files

This view is limited to 50 files because it contains too many changes. See raw diff

Files changed (50) hide show

.gitattributes +6 -0
README.md +3 -0
build.toml +35 -0
build/torch27-cxx11-cu118-x86_64-linux/torch_harmonics_attn/__init__.py +10 -0
build/torch27-cxx11-cu118-x86_64-linux/torch_harmonics_attn/__pycache__/__init__.cpython-313.pyc +0 -0
build/torch27-cxx11-cu118-x86_64-linux/torch_harmonics_attn/__pycache__/_attn_utils.cpython-313.pyc +0 -0
build/torch27-cxx11-cu118-x86_64-linux/torch_harmonics_attn/__pycache__/_ops.cpython-313.pyc +0 -0
build/torch27-cxx11-cu118-x86_64-linux/torch_harmonics_attn/_attn_utils.py +637 -0
build/torch27-cxx11-cu118-x86_64-linux/torch_harmonics_attn/_ops.py +9 -0
build/torch27-cxx11-cu118-x86_64-linux/torch_harmonics_attn/_torch_harmonics_attn_20251001150033.abi3.so +3 -0
build/torch27-cxx11-cu126-x86_64-linux/torch_harmonics_attn/__init__.py +10 -0
build/torch27-cxx11-cu126-x86_64-linux/torch_harmonics_attn/__pycache__/__init__.cpython-313.pyc +0 -0
build/torch27-cxx11-cu126-x86_64-linux/torch_harmonics_attn/__pycache__/_attn_utils.cpython-313.pyc +0 -0
build/torch27-cxx11-cu126-x86_64-linux/torch_harmonics_attn/__pycache__/_ops.cpython-313.pyc +0 -0
build/torch27-cxx11-cu126-x86_64-linux/torch_harmonics_attn/_attn_utils.py +637 -0
build/torch27-cxx11-cu126-x86_64-linux/torch_harmonics_attn/_ops.py +9 -0
build/torch27-cxx11-cu126-x86_64-linux/torch_harmonics_attn/_torch_harmonics_attn_20251001150033.abi3.so +3 -0
build/torch27-cxx11-cu128-x86_64-linux/torch_harmonics_attn/__init__.py +10 -0
build/torch27-cxx11-cu128-x86_64-linux/torch_harmonics_attn/__pycache__/__init__.cpython-313.pyc +0 -0
build/torch27-cxx11-cu128-x86_64-linux/torch_harmonics_attn/__pycache__/_attn_utils.cpython-313.pyc +0 -0
build/torch27-cxx11-cu128-x86_64-linux/torch_harmonics_attn/__pycache__/_ops.cpython-313.pyc +0 -0
build/torch27-cxx11-cu128-x86_64-linux/torch_harmonics_attn/_attn_utils.py +637 -0
build/torch27-cxx11-cu128-x86_64-linux/torch_harmonics_attn/_ops.py +9 -0
build/torch27-cxx11-cu128-x86_64-linux/torch_harmonics_attn/_torch_harmonics_attn_20251001150033.abi3.so +3 -0
build/torch28-cxx11-cu126-x86_64-linux/torch_harmonics_attn/__init__.py +10 -0
build/torch28-cxx11-cu126-x86_64-linux/torch_harmonics_attn/__pycache__/__init__.cpython-313.pyc +0 -0
build/torch28-cxx11-cu126-x86_64-linux/torch_harmonics_attn/__pycache__/_attn_utils.cpython-313.pyc +0 -0
build/torch28-cxx11-cu126-x86_64-linux/torch_harmonics_attn/__pycache__/_ops.cpython-313.pyc +0 -0
build/torch28-cxx11-cu126-x86_64-linux/torch_harmonics_attn/_attn_utils.py +637 -0
build/torch28-cxx11-cu126-x86_64-linux/torch_harmonics_attn/_ops.py +9 -0
build/torch28-cxx11-cu126-x86_64-linux/torch_harmonics_attn/_torch_harmonics_attn_20251001150033.abi3.so +3 -0
build/torch28-cxx11-cu128-x86_64-linux/torch_harmonics_attn/__init__.py +10 -0
build/torch28-cxx11-cu128-x86_64-linux/torch_harmonics_attn/__pycache__/__init__.cpython-313.pyc +0 -0
build/torch28-cxx11-cu128-x86_64-linux/torch_harmonics_attn/__pycache__/_attn_utils.cpython-313.pyc +0 -0
build/torch28-cxx11-cu128-x86_64-linux/torch_harmonics_attn/__pycache__/_ops.cpython-313.pyc +0 -0
build/torch28-cxx11-cu128-x86_64-linux/torch_harmonics_attn/_attn_utils.py +637 -0
build/torch28-cxx11-cu128-x86_64-linux/torch_harmonics_attn/_ops.py +9 -0
build/torch28-cxx11-cu128-x86_64-linux/torch_harmonics_attn/_torch_harmonics_attn_20251001150033.abi3.so +3 -0
build/torch28-cxx11-cu129-x86_64-linux/torch_harmonics_attn/__init__.py +10 -0
build/torch28-cxx11-cu129-x86_64-linux/torch_harmonics_attn/__pycache__/__init__.cpython-313.pyc +0 -0
build/torch28-cxx11-cu129-x86_64-linux/torch_harmonics_attn/__pycache__/_attn_utils.cpython-313.pyc +0 -0
build/torch28-cxx11-cu129-x86_64-linux/torch_harmonics_attn/__pycache__/_ops.cpython-313.pyc +0 -0
build/torch28-cxx11-cu129-x86_64-linux/torch_harmonics_attn/_attn_utils.py +637 -0
build/torch28-cxx11-cu129-x86_64-linux/torch_harmonics_attn/_ops.py +9 -0
build/torch28-cxx11-cu129-x86_64-linux/torch_harmonics_attn/_torch_harmonics_attn_20251001150033.abi3.so +3 -0
flake.nix +13 -0
nix-build.log +0 -0
torch-ext/torch_binding.cpp +14 -0
torch-ext/torch_binding.h +31 -0
torch-ext/torch_harmonics_attn/__init__.py +10 -0

.gitattributes CHANGED Viewed

@@ -33,3 +33,9 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
 *.zip filter=lfs diff=lfs merge=lfs -text
 *.zst filter=lfs diff=lfs merge=lfs -text
 *tfevents* filter=lfs diff=lfs merge=lfs -text

 *.zip filter=lfs diff=lfs merge=lfs -text
 *.zst filter=lfs diff=lfs merge=lfs -text
 *tfevents* filter=lfs diff=lfs merge=lfs -text
+build/torch27-cxx11-cu118-x86_64-linux/torch_harmonics_attn/_torch_harmonics_attn_20251001150033.abi3.so filter=lfs diff=lfs merge=lfs -text
+build/torch27-cxx11-cu126-x86_64-linux/torch_harmonics_attn/_torch_harmonics_attn_20251001150033.abi3.so filter=lfs diff=lfs merge=lfs -text
+build/torch27-cxx11-cu128-x86_64-linux/torch_harmonics_attn/_torch_harmonics_attn_20251001150033.abi3.so filter=lfs diff=lfs merge=lfs -text
+build/torch28-cxx11-cu126-x86_64-linux/torch_harmonics_attn/_torch_harmonics_attn_20251001150033.abi3.so filter=lfs diff=lfs merge=lfs -text
+build/torch28-cxx11-cu128-x86_64-linux/torch_harmonics_attn/_torch_harmonics_attn_20251001150033.abi3.so filter=lfs diff=lfs merge=lfs -text
+build/torch28-cxx11-cu129-x86_64-linux/torch_harmonics_attn/_torch_harmonics_attn_20251001150033.abi3.so filter=lfs diff=lfs merge=lfs -text

README.md ADDED Viewed

	@@ -0,0 +1,3 @@


1	+ ## Torch Harmonics Attn
2	+
3	+ Attention mechanisms for the spherical harmonics basis, using the torch-harmonics package: https://github.com/NVIDIA/torch-harmonics/tree/main/torch_harmonics/attention

build.toml ADDED Viewed

	@@ -0,0 +1,35 @@

+[general]
+name = "torch_harmonics_attn"
+universal = false
+[torch]
+src = [
+  "torch-ext/torch_binding.cpp",
+  "torch-ext/torch_binding.h",
+]
+[kernel.torch_harmonics_attn]
+depends = ["torch"]
+backend = "cuda"
+cuda-capabilities = [
+    "7.5",
+    "8.0",
+    "8.9",
+    "9.0",
+    "10.0",
+]
+src = [
+    "torch_harmonics_attn/attention_cpu_bwd.cpp",
+    "torch_harmonics_attn/attention_cpu_fwd.cpp",
+    "torch_harmonics_attn/attention_cpu.h",
+    "torch_harmonics_attn/attention_cuda_bwd.cu",
+    "torch_harmonics_attn/attention_cuda_fwd.cu",
+    "torch_harmonics_attn/attention_cuda_utils.cu",
+    "torch_harmonics_attn/attention_cuda_utils.cuh",
+    "torch_harmonics_attn/attention_cuda.cuh",
+    "torch_harmonics_attn/attention.h",
+    "torch_harmonics_attn/cudamacro.h"
+]

build/torch27-cxx11-cu118-x86_64-linux/torch_harmonics_attn/__init__.py ADDED Viewed

	@@ -0,0 +1,10 @@

+# Package entry point: re-export selected names from the sibling _attn_utils
+# module. __all__ below lists exactly the six names imported here.
+from ._attn_utils import backward, forward, forward_optimized, backward_optimized, _neighborhood_s2_attention_fwd_torch, _neighborhood_s2_attention_bwd_torch
+__all__ = [
+    "backward",
+    "forward",
+    "forward_optimized",
+    "backward_optimized",
+    "_neighborhood_s2_attention_fwd_torch",
+    "_neighborhood_s2_attention_bwd_torch",
+]

build/torch27-cxx11-cu118-x86_64-linux/torch_harmonics_attn/__pycache__/__init__.cpython-313.pyc ADDED Viewed

Binary file (436 Bytes). View file

build/torch27-cxx11-cu118-x86_64-linux/torch_harmonics_attn/__pycache__/_attn_utils.cpython-313.pyc ADDED Viewed

Binary file (27.2 kB). View file

build/torch27-cxx11-cu118-x86_64-linux/torch_harmonics_attn/__pycache__/_ops.cpython-313.pyc ADDED Viewed

Binary file (570 Bytes). View file

build/torch27-cxx11-cu118-x86_64-linux/torch_harmonics_attn/_attn_utils.py ADDED Viewed

	@@ -0,0 +1,637 @@

+# coding=utf-8
+# SPDX-FileCopyrightText: Copyright (c) 2025 The torch-harmonics Authors. All rights reserved.
+# SPDX-License-Identifier: BSD-3-Clause
+#
+# Redistribution and use in source and binary forms, with or without
+# modification, are permitted provided that the following conditions are met:
+#
+# 1. Redistributions of source code must retain the above copyright notice, this
+# list of conditions and the following disclaimer.
+#
+# 2. Redistributions in binary form must reproduce the above copyright notice,
+# this list of conditions and the following disclaimer in the documentation
+# and/or other materials provided with the distribution.
+#
+# 3. Neither the name of the copyright holder nor the names of its
+# contributors may be used to endorse or promote products derived from
+# this software without specific prior written permission.
+#
+# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+# DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+# SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+#
+from typing import Union, Tuple
+import torch
+import torch.nn.functional as F
+from ._ops import ops
+# Thin wrapper around the compiled op `s2_attention_bwd_dkvq_cuda`: every
+# argument is passed through unchanged and the op's result is returned as-is.
+# backward_optimized unpacks that result as three tensors (dkw, dvw, dqw).
+def backward(kx, vx, qy, dy, quad_weights, psi_col_idx, psi_row_off, nlon_in, nlat_out, nlon_out):
+    return ops.s2_attention_bwd_dkvq_cuda(kx, vx, qy, dy, quad_weights, psi_col_idx, psi_row_off, nlon_in, nlat_out, nlon_out)
+# Thin wrapper around the compiled op `s2_attention_fwd_cuda`: every argument
+# is passed through unchanged and the op's result is returned as-is.
+# forward_optimized treats that result as a single 4-D tensor.
+def forward(kx, vx, qy, quad_weights, psi_col_idx, psi_row_off, nlon_in, nlat_out, nlon_out):
+    return ops.s2_attention_fwd_cuda(kx, vx, qy, quad_weights, psi_col_idx, psi_row_off, nlon_in, nlat_out, nlon_out)
+# Stash on `ctx` everything the backward functions in this file read back.
+# `inputs` must be the 17-element argument tuple in the order
+# (k, v, q, wk, wv, wq, bk, bv, bq, quad_weights, col_idx, row_off,
+#  max_psi_nnz, nh, nlon_in, nlat_out, nlon_out).
+# `output` is accepted but not used here.
+# NOTE(review): the (ctx, inputs, output) signature looks like a
+# torch.autograd.Function.setup_context hook - confirm how it is registered.
+def _setup_context_attention_backward(ctx, inputs, output):
+    k, v, q, wk, wv, wq, bk, bv, bq, quad_weights, col_idx, row_off, max_psi_nnz, nh, nlon_in, nlat_out, nlon_out = inputs
+    # tensors: the order here is the order in which backward_optimized and
+    # _neighborhood_s2_attention_bwd_torch unpack ctx.saved_tensors
+    ctx.save_for_backward(col_idx, row_off, quad_weights, k, v, q, wk, wv, wq, bk, bv, bq)
+    # remaining (non-saved) arguments are kept as plain attributes
+    ctx.nh = nh
+    ctx.max_psi_nnz = max_psi_nnz
+    ctx.nlon_in = nlon_in
+    ctx.nlat_out = nlat_out
+    ctx.nlon_out = nlon_out
+# Shape-only stand-in for the forward op: performs no attention computation.
+# Returns an *uninitialized* tensor of shape
+# (kw.shape[0], vw.shape[1], nlat_out, nlon_out) with kw's dtype and device.
+# qw, quad_weights, col_idx, row_off and nlon_in are not read.
+# NOTE(review): presumably intended as a fake/meta implementation for op
+# registration - it is not referenced elsewhere in this file; confirm usage.
+def forward_default(kw: torch.Tensor, vw: torch.Tensor, qw: torch.Tensor,
+        quad_weights: torch.Tensor, col_idx: torch.Tensor, row_off: torch.Tensor,
+        nlon_in: int, nlat_out: int, nlon_out: int) -> torch.Tensor:
+    out_shape = (kw.shape[0], vw.shape[1], nlat_out, nlon_out)
+    return torch.empty(out_shape, dtype=kw.dtype, device=kw.device)
+# Shape-only stand-in for the backward op: performs no gradient computation.
+# Returns (dk, dv, dq) as *uninitialized* tensors shaped/typed like kw, vw and
+# qw respectively; all other arguments are not read.
+# NOTE(review): presumably intended as a fake/meta implementation for op
+# registration - it is not referenced elsewhere in this file; confirm usage.
+def backward_default(kw: torch.Tensor, vw: torch.Tensor, qw: torch.Tensor, grad_output: torch.Tensor,
+        quad_weights: torch.Tensor, col_idx: torch.Tensor, row_off: torch.Tensor,
+        nlon_in: int, nlat_out: int, nlon_out: int) -> Tuple[torch.Tensor, torch.Tensor, torch.Tensor]:
+    dk = torch.empty_like(kw)
+    dv = torch.empty_like(vw)
+    dq = torch.empty_like(qw)
+    return dk, dv, dq
+# Forward pass through the compiled attention op.
+# Steps: (1) project k, v, q with conv2d using (wk, bk), (wv, bv), (wq, bq);
+# (2) fold the nh heads into the batch dimension; (3) run the op on float32,
+# contiguous tensors; (4) unfold the heads and cast back to the projected dtype.
+# The argument list matches the 17-tuple unpacked by
+# _setup_context_attention_backward. max_psi_nnz is not used in this body.
+def forward_optimized(k: torch.Tensor, v: torch.Tensor, q: torch.Tensor,
+                                            wk: torch.Tensor, wv: torch.Tensor, wq: torch.Tensor,
+                                            bk: Union[torch.Tensor, None], bv: Union[torch.Tensor, None], bq: Union[torch.Tensor, None],
+                                            quad_weights: torch.Tensor, col_idx: torch.Tensor, row_off: torch.Tensor,
+                                            max_psi_nnz: int, nh: int, nlon_in: int, nlat_out: int, nlon_out: int) -> torch.Tensor:
+    # input projections (bias may be None)
+    kw = F.conv2d(k, weight=wk, bias=bk)
+    vw = F.conv2d(v, weight=wv, bias=bv)
+    qw = F.conv2d(q, weight=wq, bias=bq)
+    # reshape, folding num heads into batch dim
+    # (B, H, W are rebound for each tensor; after this block they hold qw's sizes)
+    B, _, H, W = kw.shape
+    kw = kw.reshape(B*nh, -1, H, W)
+    B, _, H, W = vw.shape
+    vw = vw.reshape(B*nh, -1, H, W)
+    B, _, H, W = qw.shape
+    qw = qw.reshape(B*nh, -1, H, W)
+    # convert to float32
+    # remember the projected dtype (taken from kw) so the result can be cast back
+    inp_dtype = kw.dtype
+    kw = kw.to(torch.float32).contiguous()
+    vw = vw.to(torch.float32).contiguous()
+    qw = qw.to(torch.float32).contiguous()
+    output = forward(kw, vw, qw, quad_weights,
+                                                col_idx, row_off,
+                                                nlon_in, nlat_out, nlon_out)
+    # unfold heads: batch goes back to B (taken from qw above), channels absorb nh;
+    # H, W now come from the op output. C is unpacked but unused.
+    _, C, H, W = output.shape
+    output = output.reshape(B, -1, H, W)
+    # convert back precision
+    output = output.to(dtype=inp_dtype)
+    return output
+# Backward pass matching forward_optimized, using the compiled backward op.
+# `ctx` must carry the saved tensors and attributes written by
+# _setup_context_attention_backward, plus `needs_input_grad`.
+# The q/k/v projections are recomputed here rather than saved.
+# Returns a 17-tuple aligned with the forward arguments: gradients for
+# (k, v, q, wk, wv, wq, bk, bv, bq) - each None when not required - followed by
+# eight None entries for the remaining arguments.
+def backward_optimized(ctx, grad_output):
+    col_idx, row_off, quad_weights, k, v, q, wk, wv, wq, bk, bv, bq = ctx.saved_tensors
+    nh = ctx.nh
+    # max_psi_nnz is read but not used further in this function
+    max_psi_nnz = ctx.max_psi_nnz
+    nlon_in = ctx.nlon_in
+    nlat_out = ctx.nlat_out
+    nlon_out = ctx.nlon_out
+    # check if we need the grads at all
+    # (indices follow the forward argument order k, v, q, wk, wv, wq, bk, bv, bq)
+    k_needs_grad = ctx.needs_input_grad[0]
+    v_needs_grad = ctx.needs_input_grad[1]
+    q_needs_grad = ctx.needs_input_grad[2]
+    wk_needs_grad = ctx.needs_input_grad[3]
+    wv_needs_grad = ctx.needs_input_grad[4]
+    wq_needs_grad = ctx.needs_input_grad[5]
+    bk_needs_grad = ctx.needs_input_grad[6]
+    bv_needs_grad = ctx.needs_input_grad[7]
+    bq_needs_grad = ctx.needs_input_grad[8]
+    # recompute the projections exactly as in forward_optimized
+    kw = F.conv2d(k, weight=wk, bias=bk)
+    vw = F.conv2d(v, weight=wv, bias=bv)
+    qw = F.conv2d(q, weight=wq, bias=bq)
+    # reshape, folding num heads into batch dim
+    B, _, H, W = kw.shape
+    kw = kw.reshape(B*nh, -1, H, W)
+    B, _, H, W = vw.shape
+    vw = vw.reshape(B*nh, -1, H, W)
+    B, _, H, W = qw.shape
+    qw = qw.reshape(B*nh, -1, H, W)
+    # B is last rebound from grad_output and is the batch size used for all
+    # un-folding reshapes below
+    B, _, H, W  = grad_output.shape
+    grad_output = grad_output.reshape(B*nh, -1, H, W)
+    # save type and convert to float32
+    kw_dtype = kw.dtype
+    vw_dtype = vw.dtype
+    qw_dtype = qw.dtype
+    kw = kw.to(torch.float32).contiguous()
+    vw = vw.to(torch.float32).contiguous()
+    qw = qw.to(torch.float32).contiguous()
+    grad_output = grad_output.to(torch.float32).contiguous()
+    # a single op call yields all three projected-tensor gradients, regardless
+    # of which of them are actually needed
+    dkw, dvw, dqw = backward(kw, vw, qw, grad_output,
+                                                       quad_weights,
+                                                       col_idx, row_off,
+                                                       nlon_in, nlat_out, nlon_out)
+    # weight grads
+    # each gradient is first un-folded to batch B and cast back to its saved
+    # dtype; the einsum then sums over batch and spatial dims, giving a
+    # (c, f) matrix that is reshaped to the weight's shape
+    # NOTE(review): that reshape only works if the weight has c*f elements,
+    # i.e. presumably 1x1 kernels - confirm
+    _, C, H, W = dkw.shape
+    dkw = dkw.reshape(B, -1, H, W)
+    dkw = dkw.to(dtype=kw_dtype)
+    if wk_needs_grad:
+        dwk = torch.einsum("bchw,bfhw->cf", dkw, k).reshape(*wk.shape).contiguous()
+    else:
+        dwk = None
+    _, C, H, W = dvw.shape
+    dvw = dvw.reshape(B, -1, H, W)
+    dvw = dvw.to(dtype=vw_dtype)
+    if wv_needs_grad:
+        dwv = torch.einsum("bchw,bfhw->cf", dvw, v).reshape(*wv.shape).contiguous()
+    else:
+        dwv = None
+    _, C, H, W = dqw.shape
+    dqw = dqw.reshape(B, -1, H, W)
+    dqw = dqw.to(dtype=qw_dtype)
+    if wq_needs_grad:
+        dwq = torch.einsum("bchw,bfhw->cf", dqw, q).reshape(*wq.shape).contiguous()
+    else:
+        dwq = None
+    # input grads
+    # conv2d with the weight's first two dims swapped maps the projected
+    # gradient back to the input channel dimension
+    if v_needs_grad:
+        dv = torch.nn.functional.conv2d(dvw, weight=wv.permute([1,0,2,3]), bias=None)
+    else:
+        dv = None
+    if k_needs_grad:
+        dk = torch.nn.functional.conv2d(dkw, weight=wk.permute([1,0,2,3]), bias=None)
+    else:
+        dk = None
+    if q_needs_grad:
+        dq = torch.nn.functional.conv2d(dqw, weight=wq.permute([1,0,2,3]), bias=None)
+    else:
+        dq = None
+    # bias grads:
+    # sum over batch and both spatial dims, leaving one value per channel
+    if bv_needs_grad:
+        dbv = torch.sum(dvw, dim=(0,2,3))
+    else:
+        dbv = None
+    if bk_needs_grad:
+        dbk = torch.sum(dkw, dim=(0,2,3))
+    else:
+        dbk = None
+    if bq_needs_grad:
+        dbq = torch.sum(dqw, dim=(0,2,3))
+    else:
+        dbq = None
+    # 9 gradients + 8 None = one entry per forward argument
+    return dk, dv, dq, dwk, dwv, dwq, dbk, dbv, dbq, \
+            None, None, None, None, None, None, None, None
+# torch kernels
+# uses qdotk_max update trick to avoid two loops when computing the softmax
+# see e.g., https://arxiv.org/abs/1805.02867
+# and https://alexdremov.me/understanding-flash-attention-writing-the-algorithm-from-scratch-in-triton/
+#
+# Pure-PyTorch forward kernel written as explicit Python loops over output
+# latitude (ho), output longitude (wo) and the neighbourhood entries (idz).
+# For each output point it accumulates a quadrature-weighted softmax over
+# q.k scores of the neighbouring input points and applies it to vx.
+# Indexing used below: kx/vx are read as [:, :, hi, wip], qy as [:, :, ho, wo],
+# quad_weights as [hi]; the result has shape
+# (qy.shape[0], vx.shape[1], nlat_out, nlon_out) with qy's dtype and device.
+def _neighborhood_s2_attention_fwd_torch(kx: torch.Tensor, vx: torch.Tensor, qy: torch.Tensor,
+                                         quad_weights: torch.Tensor, col_idx: torch.Tensor, row_off: torch.Tensor,
+                                         nlon_in: int, nlat_out: int, nlon_out: int) -> torch.Tensor:
+    # prepare result tensor
+    out_shape = (qy.shape[0], vx.shape[1], nlat_out, nlon_out)
+    y = torch.zeros(out_shape, dtype=qy.dtype, device=qy.device)
+    for ho in range(nlat_out):
+        # row_off[ho] .. row_off[ho+1] delimits the entries of col_idx that
+        # belong to output latitude ho
+        zstart = row_off[ho]
+        zend = row_off[ho+1]
+        for wo in range(nlon_out):
+            # running softmax denominator and running score maximum, one per batch entry
+            # NOTE(review): the running max starts at 0 rather than -inf. The
+            # normalised result is mathematically unaffected by the shift, but
+            # if every score is strongly negative exp() may underflow and
+            # alpha_sum become 0 - confirm whether this is intended.
+            alpha_sum = torch.zeros((y.shape[0],), dtype=y.dtype, device=y.device)
+            qdotk_max = torch.zeros((y.shape[0],), dtype=y.dtype, device=y.device)
+            for idz in range(zstart, zend):
+                nz_col_idx = col_idx[idz]
+                # compute input indices from psi datastructure
+                hi = nz_col_idx // nlon_in
+                # account for output shift and ensure positive index due to circular condition
+                wi = nz_col_idx % nlon_in
+                wip = (wi + wo) % nlon_in
+                # compute correlation & softmax numerator
+                q_ho_wo = qy[:, :, ho, wo]
+                k_hi_wip = kx[:, :, hi, wip]
+                qdotk = torch.sum(q_ho_wo * k_hi_wip, dim=1)
+                # tmp max
+                qdotk_max_tmp = torch.maximum(qdotk_max, qdotk)
+                # alpha sum update
+                # (previous sum is rescaled from the old max to the new max)
+                alpha = torch.exp(qdotk - qdotk_max_tmp) * quad_weights[hi]
+                alpha_sum = alpha + alpha_sum * torch.exp(qdotk_max - qdotk_max_tmp)
+                # update output
+                # (same rescaling applied to the accumulated weighted values)
+                y[:,:,ho,wo] = y[:,:,ho,wo] * torch.exp(qdotk_max - qdotk_max_tmp).unsqueeze(1) + alpha[:, None] * vx[:,:,hi,wip]
+                # define new max
+                qdotk_max = qdotk_max_tmp
+            # normalise by the accumulated softmax denominator
+            y[:,:,ho,wo] = y[:,:,ho,wo] / alpha_sum[:, None]
+    return y
+# Explicit gradient w.r.t. vx: dM/dv
+# provided as a reference for CUDA & other hand-written gradients
+#
+# Three passes over the neighbourhood of each output point (ho, wo):
+#   1. q.k scores, 2. max-shifted, quadrature-weighted exponentials and their
+#   sum, 3. scatter-add of the normalised weight times dy into dvx.
+def _neighborhood_s2_attention_bwd_dv_torch(kx: torch.Tensor, vx: torch.Tensor, qy: torch.Tensor, dy: torch.Tensor,
+                                            quad_weights: torch.Tensor, col_idx: torch.Tensor, row_off: torch.Tensor,
+                                            nlon_in: int, nlat_out: int, nlon_out: int):
+    # shapes:
+    # input
+    # kx: B, C, Hi, Wi
+    # vx: B, Cout, Hi, Wi
+    # qy: B, C, Ho, Wo   (multiplied with kx slices and summed over dim 1)
+    # dy: B, Cout, Ho, Wo (added into dvx slices)
+    # quad_weights: Hi
+    # output
+    # dvx: B, Cout, Hi, Wi
+    dvx = torch.zeros_like(vx)
+    batch_size = dy.shape[0]
+    for ho in range(nlat_out):
+        # get number of nonzeros
+        # (row_off[ho] .. row_off[ho+1] is the col_idx range for latitude ho)
+        zstart = row_off[ho]
+        zend = row_off[ho+1]
+        for wo in range(nlon_out):
+            # per-neighbour buffers, indexed by idz - zstart
+            alpha_nz = torch.zeros((batch_size, zend-zstart), dtype=dy.dtype, device=dy.device)
+            qdotk_nz = torch.zeros((batch_size, zend-zstart), dtype=dy.dtype, device=dy.device)
+            alpha_sum = torch.zeros((batch_size,), dtype=dy.dtype, device=dy.device)
+            # pass 1: q.k score for every neighbour
+            for idz in range(zstart, zend):
+                nz_col_idx = col_idx[idz]
+                # compute input indices from psi datastructure
+                hi = nz_col_idx // nlon_in
+                # account for output shift and ensure positive index due to circular condition
+                wi = nz_col_idx % nlon_in
+                wip = (wi+wo) % nlon_in
+                # compute correlation & softmax numerator
+                q_ho_wo = qy[:, :, ho, wo]
+                k_hi_wi = kx[:, :, hi, wip]
+                qdotk_nz[:,idz-zstart] = torch.sum(q_ho_wo * k_hi_wi, dim=1)
+            # per-batch maximum score, subtracted before exponentiation
+            qdotk_max, _ = torch.max(qdotk_nz, dim=1)
+            # pass 2: weighted exponentials and their sum
+            for idz in range(zstart, zend):
+                nz_col_idx = col_idx[idz]
+                # compute input indices from psi datastructure
+                hi = nz_col_idx // nlon_in
+                # account for output shift and ensure positive index due to circular condition
+                wi = nz_col_idx % nlon_in
+                wip = (wi+wo) % nlon_in
+                alpha_nz[:,idz-zstart] = torch.exp(qdotk_nz[:,idz-zstart] - qdotk_max) * quad_weights[hi]
+                alpha_sum[:] += alpha_nz[:,idz-zstart]
+            # pass 3: accumulate normalised weight * dy at each neighbour's input location
+            for idz in range(zstart, zend):
+                nz_col_idx = col_idx[idz]
+                # compute input indices from psi datastructure
+                hi = nz_col_idx // nlon_in
+                # account for output shift and ensure positive index due to circular condition
+                wi = nz_col_idx % nlon_in
+                wip = (wi+wo) % nlon_in
+                dvx[:,:,hi, wip] += (alpha_nz[:, None, idz-zstart] / alpha_sum[:, None]) * dy[:,:,ho,wo]
+    return dvx
+# Explicit gradient w.r.t. kx: dM/dk
+# provided as a reference for CUDA & other hand-written gradients
+#
+# Three passes over the neighbourhood of each output point (ho, wo):
+#   1. q.k scores, 2. weighted exponentials, their sum and the weighted mean
+#   of dy.v ("integral"), 3. scatter-add of
+#   q * (alpha / alpha_sum) * (dy.v - integral) into dkx.
+def _neighborhood_s2_attention_bwd_dk_torch(kx: torch.Tensor, vx: torch.Tensor, qy: torch.Tensor, dy: torch.Tensor,
+                                            quad_weights: torch.Tensor, col_idx: torch.Tensor, row_off: torch.Tensor,
+                                            nlon_in: int, nlat_out: int, nlon_out: int):
+    # shapes:
+    # input
+    # kx: B, C, Hi, Wi
+    # vx: B, Cout, Hi, Wi
+    # qy: B, C, Ho, Wo
+    # dy: B, Cout, Ho, Wo (multiplied with vx slices and summed over dim 1)
+    # quad_weights: Hi
+    # output
+    # dkx: B, C, Hi, Wi
+    dkx = torch.zeros_like(kx)
+    batch_size = dy.shape[0]
+    for ho in range(nlat_out):
+        # get number of nonzeros
+        # (row_off[ho] .. row_off[ho+1] is the col_idx range for latitude ho)
+        zstart = row_off[ho]
+        zend = row_off[ho+1]
+        for wo in range(nlon_out):
+            # per-neighbour buffers are indexed by idz - zstart
+            qdotk_nz = torch.zeros((batch_size, zend-zstart), dtype=dy.dtype, device=dy.device)
+            integral = torch.zeros((batch_size,), dtype=dy.dtype, device=dy.device)
+            alpha = torch.zeros((batch_size, zend-zstart), dtype=dy.dtype, device=dy.device)
+            alpha_sum = torch.zeros((batch_size,), dtype=dy.dtype, device=dy.device)
+            # pass 1: q.k score for every neighbour
+            for idz in range(zstart, zend):
+                nz_col_idx = col_idx[idz]
+                # compute input indices from psi datastructure
+                hj = nz_col_idx // nlon_in
+                # account for output shift and ensure positive index due to circular condition
+                wj = nz_col_idx % nlon_in
+                wjp = (wj+wo) % nlon_in
+                # compute correlation & softmax numerator
+                q_ho_wo = qy[:, :, ho, wo]
+                k_hj_wjp = kx[:, :, hj, wjp]
+                qdotk_nz[:,idz-zstart] = torch.sum(q_ho_wo * k_hj_wjp, dim=1)
+            # per-batch maximum score, subtracted before exponentiation
+            qdotk_max, _ = torch.max(qdotk_nz, dim=1)
+            # pass 2: weighted exponentials, their sum, and sum of alpha * (dy.v)
+            for idz in range(zstart, zend):
+                nz_col_idx = col_idx[idz]
+                # compute input indices from psi datastructure
+                hj = nz_col_idx // nlon_in
+                # account for output shift and ensure positive index due to circular condition
+                wj = nz_col_idx % nlon_in
+                wjp = (wj+wo) % nlon_in
+                alpha[:, idz-zstart] = torch.exp(qdotk_nz[:,idz-zstart] - qdotk_max) * quad_weights[hj]
+                alpha_sum[:] += alpha[:, idz-zstart]
+                # input dot
+                gdotv = torch.sum(dy[:,:,ho, wo] * vx[:,:,hj, wjp], dim=1)
+                # integral term
+                integral[:] += alpha[:, idz-zstart] * gdotv[:]
+            # normalise: integral becomes the softmax-weighted mean of dy.v
+            integral[:] = integral[:] / alpha_sum[:]
+            # pass 3: scatter the per-neighbour contribution into dkx
+            for idz in range(zstart, zend):
+                nz_col_idx = col_idx[idz]
+                # compute input indices from psi datastructure
+                hi = nz_col_idx // nlon_in
+                # account for output shift and ensure positive index due to circular condition
+                wi = nz_col_idx % nlon_in
+                wip = (wi+wo) % nlon_in
+                # dy.v for this neighbour (recomputed rather than cached from pass 2)
+                gdotv = torch.sum(dy[:,:,ho, wo] * vx[:,:,hi, wip], dim=1)
+                dkx[:,:,hi,wip] += qy[:, :, ho, wo] * (alpha[:, None, idz-zstart] / alpha_sum[:, None]) * (gdotv[:, None] - integral[:, None])
+    return dkx
+# Explicit gradient w.r.t. qy: dM/dq
+# provided as a reference for CUDA & other hand-written gradients
+#
+# Two passes over the neighbourhood of each output point (ho, wo):
+#   1. q.k scores, 2. weighted exponentials plus the running sums
+#   alpha_sum, alpha_k = sum(alpha * k), alpha_vw = sum(alpha * dy.v) and
+#   alpha_kvw = sum(alpha * k * dy.v). The gradient at (ho, wo) is then
+#   (alpha_kvw * alpha_sum - alpha_vw * alpha_k) / alpha_sum**2.
+def _neighborhood_s2_attention_bwd_dq_torch(kx: torch.Tensor, vx: torch.Tensor, qy: torch.Tensor, dy: torch.Tensor,
+                                            quad_weights: torch.Tensor, col_idx: torch.Tensor, row_off: torch.Tensor,
+                                            nlon_in: int, nlat_out: int, nlon_out: int):
+    # shapes:
+    # input
+    # kx: B, C, Hi, Wi
+    # vx: B, Cout, Hi, Wi
+    # qy: B, C, Ho, Wo
+    # dy: B, Cout, Ho, Wo (multiplied with vx slices and summed over dim 1)
+    # quad_weights: Hi
+    # output
+    # dq: B, C, Ho, Wo
+    batch_size = dy.shape[0]
+    channels_in = kx.shape[1]
+    # channels_out is computed but not used below
+    channels_out = vx.shape[1]
+    dqy = torch.zeros_like(qy)
+    for ho in range(nlat_out):
+        # get number of nonzeros
+        # (row_off[ho] .. row_off[ho+1] is the col_idx range for latitude ho)
+        zstart = row_off[ho]
+        zend = row_off[ho+1]
+        for wo in range(nlon_out):
+            # per-neighbour buffers (alpha, qdotk_nz) are indexed by idz - zstart;
+            # alpha_sum2 is allocated but never read or written afterwards
+            alpha = torch.zeros((batch_size, zend-zstart), dtype=dy.dtype, device=dy.device)
+            qdotk_nz = torch.zeros((batch_size, zend-zstart), dtype=dy.dtype, device=dy.device)
+            alpha_k = torch.zeros((batch_size, channels_in), dtype=dy.dtype, device=dy.device)
+            alpha_vw = torch.zeros((batch_size,), dtype=dy.dtype, device=dy.device)
+            alpha_kvw = torch.zeros((batch_size, channels_in), dtype=dy.dtype, device=dy.device)
+            alpha_sum = torch.zeros((batch_size,), dtype=dy.dtype, device=dy.device)
+            alpha_sum2 = torch.zeros((batch_size,), dtype=dy.dtype, device=dy.device)
+            # pass 1: q.k score for every neighbour
+            for idz in range(zstart, zend):
+                nz_col_idx = col_idx[idz]
+                # compute input indices from psi datastructure
+                hi = nz_col_idx // nlon_in
+                # account for output shift and ensure positive index due to circular condition
+                wi = nz_col_idx % nlon_in
+                wip = (wi+wo) % nlon_in
+                # idz_i is assigned here but not used in this loop
+                idz_i = idz-zstart
+                # compute correlation & softmax numerator
+                q_ho_wo = qy[:, :, ho, wo]
+                k_hi_wi = kx[:, :, hi, wip]
+                qdotk_nz[:,idz-zstart] = torch.sum(q_ho_wo * k_hi_wi, dim=1)
+            # per-batch maximum score, subtracted before exponentiation
+            qdotk_max,_ = qdotk_nz.max(dim=1)
+            # pass 2: weighted exponentials and the four running sums
+            for idz in range(zstart, zend):
+                nz_col_idx = col_idx[idz]
+                # compute input indices from psi datastructure
+                hi = nz_col_idx // nlon_in
+                # account for output shift and ensure positive index due to circular condition
+                wi = nz_col_idx % nlon_in
+                wip = (wi+wo) % nlon_in
+                # q_ho_wo is re-read here but not used in this loop
+                q_ho_wo = qy[:, :, ho, wo]
+                k_hi_wi = kx[:, :, hi, wip]
+                idz_i = idz-zstart
+                alpha[:, idz_i] = torch.exp(qdotk_nz[:,idz-zstart] - qdotk_max) * quad_weights[hi]
+                alpha_sum[:] += alpha[:, idz_i]
+                # dy.v for this neighbour
+                gdotv = torch.sum(dy[:,:,ho, wo] * vx[:,:,hi, wip], dim=1)
+                alpha_k[:,:] += alpha[:, None, idz_i] * k_hi_wi
+                alpha_vw[:] += alpha[:, idz_i] * gdotv[:]
+                alpha_kvw[:,:] += alpha[:, None, idz_i] * k_hi_wi * gdotv[:,None]
+            # combine the running sums into the gradient for this output point
+            dqy[:,:,ho,wo] = (alpha_kvw * alpha_sum[:,None] - alpha_vw[:, None] * alpha_k) / (alpha_sum[:,None] * alpha_sum[:,None])
+    return dqy
+# Reference forward pass built on the pure-PyTorch kernel
+# _neighborhood_s2_attention_fwd_torch. Same argument list and same
+# project / fold-heads / unfold-heads structure as forward_optimized.
+# max_psi_nnz is not used in this body.
+# NOTE(review): unlike forward_optimized, the result is not cast back to the
+# projected dtype and the tensors are not made contiguous - confirm intended.
+def _neighborhood_s2_attention_torch(k: torch.Tensor, v: torch.Tensor, q: torch.Tensor,
+                                     wk: torch.Tensor, wv: torch.Tensor, wq: torch.Tensor,
+                                     bk: Union[torch.Tensor, None], bv: Union[torch.Tensor, None], bq: Union[torch.Tensor, None],
+                                     quad_weights: torch.Tensor, col_idx: torch.Tensor, row_off: torch.Tensor,
+                                     max_psi_nnz: int, nh: int, nlon_in: int, nlat_out: int, nlon_out: int) -> torch.Tensor:
+    # input projections (bias may be None)
+    kw = F.conv2d(k, weight=wk, bias=bk)
+    vw = F.conv2d(v, weight=wv, bias=bv)
+    qw = F.conv2d(q, weight=wq, bias=bq)
+    # reshape, folding num heads into batch dim
+    # (B, H, W are rebound for each tensor; after this block they hold qw's sizes)
+    B, _, H, W = kw.shape
+    kw = kw.reshape(B*nh, -1, H, W)
+    B, _, H, W = vw.shape
+    vw = vw.reshape(B*nh, -1, H, W)
+    B, _, H, W = qw.shape
+    qw = qw.reshape(B*nh, -1, H, W)
+    # run the reference kernel in float32
+    kw = kw.to(torch.float32)
+    vw = vw.to(torch.float32)
+    qw = qw.to(torch.float32)
+    output = _neighborhood_s2_attention_fwd_torch(kw, vw, qw, quad_weights,
+                                                  col_idx, row_off,
+                                                  nlon_in, nlat_out, nlon_out)
+    # unfold heads back into the channel dimension; C is unpacked but unused
+    _, C, H, W = output.shape
+    output = output.reshape(B, -1, H, W)
+    return output
+# Reference backward pass built on the three pure-PyTorch gradient kernels
+# (_bwd_dv_torch, _bwd_dk_torch, _bwd_dq_torch).
+# `ctx` must carry the saved tensors and attributes written by
+# _setup_context_attention_backward, plus `needs_input_grad`.
+# Each kernel is only run when the input, weight or bias gradient of its
+# branch is required. Returns the same 17-tuple layout as backward_optimized:
+# gradients for (k, v, q, wk, wv, wq, bk, bv, bq) followed by eight None.
+# NOTE(review): unlike _neighborhood_s2_attention_torch, the projected tensors
+# and grad_output are not converted to float32 here - confirm intended.
+def _neighborhood_s2_attention_bwd_torch(ctx, grad_output):
+    col_idx, row_off, quad_weights, k, v, q, wk, wv, wq, bk, bv, bq = ctx.saved_tensors
+    nh = ctx.nh
+    nlon_in = ctx.nlon_in
+    nlat_out = ctx.nlat_out
+    nlon_out = ctx.nlon_out
+    # check if we need the grads at all
+    # (indices follow the forward argument order k, v, q, wk, wv, wq, bk, bv, bq)
+    k_needs_grad = ctx.needs_input_grad[0]
+    v_needs_grad = ctx.needs_input_grad[1]
+    q_needs_grad = ctx.needs_input_grad[2]
+    wk_needs_grad = ctx.needs_input_grad[3]
+    wv_needs_grad = ctx.needs_input_grad[4]
+    wq_needs_grad = ctx.needs_input_grad[5]
+    bk_needs_grad = ctx.needs_input_grad[6]
+    bv_needs_grad = ctx.needs_input_grad[7]
+    bq_needs_grad = ctx.needs_input_grad[8]
+    # recompute the projections (they are not saved by the forward pass)
+    kw = F.conv2d(k, weight=wk, bias=bk)
+    vw = F.conv2d(v, weight=wv, bias=bv)
+    qw = F.conv2d(q, weight=wq, bias=bq)
+    # reshape, folding num heads into batch dim
+    B, _, H, W = kw.shape
+    kw = kw.reshape(B*nh, -1, H, W)
+    B, _, H, W = vw.shape
+    vw = vw.reshape(B*nh, -1, H, W)
+    B, _, H, W = qw.shape
+    qw = qw.reshape(B*nh, -1, H, W)
+    # B is last rebound from grad_output and is the batch size used for all
+    # un-folding reshapes below
+    B, _, H, W  = grad_output.shape
+    grad_output = grad_output.reshape(B*nh, -1, H, W)
+    # gradient w.r.t. the projected values, un-folded back to batch B
+    if v_needs_grad or wv_needs_grad or bv_needs_grad:
+        dvw = _neighborhood_s2_attention_bwd_dv_torch(kw, vw, qw, grad_output,
+                                                      quad_weights,
+                                                      col_idx, row_off,
+                                                      nlon_in, nlat_out, nlon_out)
+        _, C, H, W = dvw.shape
+        dvw = dvw.reshape(B, -1, H, W)
+    else:
+        dvw = None
+    # gradient w.r.t. the projected keys, un-folded back to batch B
+    if k_needs_grad or wk_needs_grad or bk_needs_grad:
+        dkw = _neighborhood_s2_attention_bwd_dk_torch(kw, vw, qw, grad_output,
+                                                      quad_weights,
+                                                      col_idx, row_off,
+                                                      nlon_in, nlat_out, nlon_out)
+        _, C, H, W = dkw.shape
+        dkw = dkw.reshape(B, -1, H, W)
+    else:
+        dkw = None
+    # gradient w.r.t. the projected queries, un-folded back to batch B
+    if q_needs_grad or wq_needs_grad or bq_needs_grad:
+        dqw = _neighborhood_s2_attention_bwd_dq_torch(kw, vw, qw, grad_output,
+                                                      quad_weights,
+                                                      col_idx, row_off,
+                                                      nlon_in, nlat_out, nlon_out)
+        _, C, H, W = dqw.shape
+        dqw = dqw.reshape(B, -1, H, W)
+    else:
+        dqw = None
+    # input grads
+    # conv2d with the weight's first two dims swapped maps the projected
+    # gradient back to the input channel dimension
+    if v_needs_grad:
+        dv = torch.nn.functional.conv2d(dvw, weight=wv.permute([1,0,2,3]), bias=None)
+    else:
+        dv = None
+    if k_needs_grad:
+        dk = torch.nn.functional.conv2d(dkw, weight=wk.permute([1,0,2,3]), bias=None)
+    else:
+        dk = None
+    if q_needs_grad:
+        dq = torch.nn.functional.conv2d(dqw, weight=wq.permute([1,0,2,3]), bias=None)
+    else:
+        dq = None
+    # weight grads
+    # the einsum sums over batch and spatial dims, giving a (c, f) matrix
+    # that is reshaped to the weight's shape
+    # NOTE(review): that reshape only works if the weight has c*f elements,
+    # i.e. presumably 1x1 kernels - confirm
+    if wv_needs_grad:
+        dwv = torch.einsum("bchw,bfhw->cf", dvw, v).reshape(*wv.shape).contiguous()
+    else:
+        dwv = None
+    if wk_needs_grad:
+        dwk = torch.einsum("bchw,bfhw->cf", dkw, k).reshape(*wk.shape).contiguous()
+    else:
+        dwk = None
+    if wq_needs_grad:
+        dwq = torch.einsum("bchw,bfhw->cf", dqw, q).reshape(*wq.shape).contiguous()
+    else:
+        dwq = None
+    # bias grads:
+    # sum over batch and both spatial dims, leaving one value per channel
+    if bv_needs_grad:
+        dbv = torch.sum(dvw, dim=(0,2,3))
+    else:
+        dbv = None
+    if bk_needs_grad:
+        dbk = torch.sum(dkw, dim=(0,2,3))
+    else:
+        dbk = None
+    if bq_needs_grad:
+        dbq = torch.sum(dqw, dim=(0,2,3))
+    else:
+        dbq = None
+    # 9 gradients + 8 None = one entry per forward argument
+    return dk, dv, dq, dwk, dwv, dwq, dbk, dbv, dbq, \
+            None, None, None, None, None, None, None, None

build/torch27-cxx11-cu118-x86_64-linux/torch_harmonics_attn/_ops.py ADDED Viewed

	@@ -0,0 +1,9 @@

+import torch
+from . import _torch_harmonics_attn_20251001150033
+ops = torch.ops._torch_harmonics_attn_20251001150033
+def add_op_namespace_prefix(op_name: str):
+    """
+    Prefix op by namespace.
+    """
+    return f"_torch_harmonics_attn_20251001150033::{op_name}"

build/torch27-cxx11-cu118-x86_64-linux/torch_harmonics_attn/_torch_harmonics_attn_20251001150033.abi3.so ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:d4e9bb69e777ace94e18326ea2559292b3c0fbb11d68b185c1c4d700767ebf68
+size 27631360

build/torch27-cxx11-cu126-x86_64-linux/torch_harmonics_attn/__init__.py ADDED Viewed

	@@ -0,0 +1,10 @@

+from ._attn_utils import backward, forward, forward_optimized, backward_optimized, _neighborhood_s2_attention_fwd_torch, _neighborhood_s2_attention_bwd_torch
+# Names exported by ``from <package> import *``; every entry is imported
+# from ._attn_utils on the line above.
+__all__ = [
+    "backward",
+    "forward",
+    "forward_optimized",
+    "backward_optimized",
+    "_neighborhood_s2_attention_fwd_torch",
+    "_neighborhood_s2_attention_bwd_torch",
+]

build/torch27-cxx11-cu126-x86_64-linux/torch_harmonics_attn/__pycache__/__init__.cpython-313.pyc ADDED Viewed

Binary file (436 Bytes). View file

build/torch27-cxx11-cu126-x86_64-linux/torch_harmonics_attn/__pycache__/_attn_utils.cpython-313.pyc ADDED Viewed

Binary file (27.2 kB). View file

build/torch27-cxx11-cu126-x86_64-linux/torch_harmonics_attn/__pycache__/_ops.cpython-313.pyc ADDED Viewed

Binary file (570 Bytes). View file

build/torch27-cxx11-cu126-x86_64-linux/torch_harmonics_attn/_attn_utils.py ADDED Viewed

	@@ -0,0 +1,637 @@

+# coding=utf-8
+# SPDX-FileCopyrightText: Copyright (c) 2025 The torch-harmonics Authors. All rights reserved.
+# SPDX-License-Identifier: BSD-3-Clause
+#
+# Redistribution and use in source and binary forms, with or without
+# modification, are permitted provided that the following conditions are met:
+#
+# 1. Redistributions of source code must retain the above copyright notice, this
+# list of conditions and the following disclaimer.
+#
+# 2. Redistributions in binary form must reproduce the above copyright notice,
+# this list of conditions and the following disclaimer in the documentation
+# and/or other materials provided with the distribution.
+#
+# 3. Neither the name of the copyright holder nor the names of its
+# contributors may be used to endorse or promote products derived from
+# this software without specific prior written permission.
+#
+# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+# DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+# SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+#
+from typing import Union, Tuple
+import torch
+import torch.nn.functional as F
+from ._ops import ops
+def backward(kx, vx, qy, dy, quad_weights, psi_col_idx, psi_row_off, nlon_in, nlat_out, nlon_out):
+    return ops.s2_attention_bwd_dkvq_cuda(kx, vx, qy, dy, quad_weights, psi_col_idx, psi_row_off, nlon_in, nlat_out, nlon_out)
+def forward(kx, vx, qy, quad_weights, psi_col_idx, psi_row_off, nlon_in, nlat_out, nlon_out):
+    return ops.s2_attention_fwd_cuda(kx, vx, qy, quad_weights, psi_col_idx, psi_row_off, nlon_in, nlat_out, nlon_out)
+def _setup_context_attention_backward(ctx, inputs, output):
+    k, v, q, wk, wv, wq, bk, bv, bq, quad_weights, col_idx, row_off, max_psi_nnz, nh, nlon_in, nlat_out, nlon_out = inputs
+    ctx.save_for_backward(col_idx, row_off, quad_weights, k, v, q, wk, wv, wq, bk, bv, bq)
+    ctx.nh = nh
+    ctx.max_psi_nnz = max_psi_nnz
+    ctx.nlon_in = nlon_in
+    ctx.nlat_out = nlat_out
+    ctx.nlon_out = nlon_out
+def forward_default(kw: torch.Tensor, vw: torch.Tensor, qw: torch.Tensor,
+        quad_weights: torch.Tensor, col_idx: torch.Tensor, row_off: torch.Tensor,
+        nlon_in: int, nlat_out: int, nlon_out: int) -> torch.Tensor:
+    out_shape = (kw.shape[0], vw.shape[1], nlat_out, nlon_out)
+    return torch.empty(out_shape, dtype=kw.dtype, device=kw.device)
+def backward_default(kw: torch.Tensor, vw: torch.Tensor, qw: torch.Tensor, grad_output: torch.Tensor,
+        quad_weights: torch.Tensor, col_idx: torch.Tensor, row_off: torch.Tensor,
+        nlon_in: int, nlat_out: int, nlon_out: int) -> Tuple[torch.Tensor, torch.Tensor, torch.Tensor]:
+    dk = torch.empty_like(kw)
+    dv = torch.empty_like(vw)
+    dq = torch.empty_like(qw)
+    return dk, dv, dq
+    # forward
+def forward_optimized(k: torch.Tensor, v: torch.Tensor, q: torch.Tensor,
+                                            wk: torch.Tensor, wv: torch.Tensor, wq: torch.Tensor,
+                                            bk: Union[torch.Tensor, None], bv: Union[torch.Tensor, None], bq: Union[torch.Tensor, None],
+                                            quad_weights: torch.Tensor, col_idx: torch.Tensor, row_off: torch.Tensor,
+                                            max_psi_nnz: int, nh: int, nlon_in: int, nlat_out: int, nlon_out: int) -> torch.Tensor:
+    kw = F.conv2d(k, weight=wk, bias=bk)
+    vw = F.conv2d(v, weight=wv, bias=bv)
+    qw = F.conv2d(q, weight=wq, bias=bq)
+    # reshape, folding num heads into batch dim
+    B, _, H, W = kw.shape
+    kw = kw.reshape(B*nh, -1, H, W)
+    B, _, H, W = vw.shape
+    vw = vw.reshape(B*nh, -1, H, W)
+    B, _, H, W = qw.shape
+    qw = qw.reshape(B*nh, -1, H, W)
+    # convert to float32
+    inp_dtype = kw.dtype
+    kw = kw.to(torch.float32).contiguous()
+    vw = vw.to(torch.float32).contiguous()
+    qw = qw.to(torch.float32).contiguous()
+    output = forward(kw, vw, qw, quad_weights,
+                                                col_idx, row_off,
+                                                nlon_in, nlat_out, nlon_out)
+    _, C, H, W = output.shape
+    output = output.reshape(B, -1, H, W)
+    # convert back precision
+    output = output.to(dtype=inp_dtype)
+    return output
+def backward_optimized(ctx, grad_output):
+    col_idx, row_off, quad_weights, k, v, q, wk, wv, wq, bk, bv, bq = ctx.saved_tensors
+    nh = ctx.nh
+    max_psi_nnz = ctx.max_psi_nnz
+    nlon_in = ctx.nlon_in
+    nlat_out = ctx.nlat_out
+    nlon_out = ctx.nlon_out
+    # check if we need the grads at all
+    k_needs_grad = ctx.needs_input_grad[0]
+    v_needs_grad = ctx.needs_input_grad[1]
+    q_needs_grad = ctx.needs_input_grad[2]
+    wk_needs_grad = ctx.needs_input_grad[3]
+    wv_needs_grad = ctx.needs_input_grad[4]
+    wq_needs_grad = ctx.needs_input_grad[5]
+    bk_needs_grad = ctx.needs_input_grad[6]
+    bv_needs_grad = ctx.needs_input_grad[7]
+    bq_needs_grad = ctx.needs_input_grad[8]
+    kw = F.conv2d(k, weight=wk, bias=bk)
+    vw = F.conv2d(v, weight=wv, bias=bv)
+    qw = F.conv2d(q, weight=wq, bias=bq)
+    # reshape, folding num heads into batch dim
+    B, _, H, W = kw.shape
+    kw = kw.reshape(B*nh, -1, H, W)
+    B, _, H, W = vw.shape
+    vw = vw.reshape(B*nh, -1, H, W)
+    B, _, H, W = qw.shape
+    qw = qw.reshape(B*nh, -1, H, W)
+    B, _, H, W  = grad_output.shape
+    grad_output = grad_output.reshape(B*nh, -1, H, W)
+    # save type and convert to float32
+    kw_dtype = kw.dtype
+    vw_dtype = vw.dtype
+    qw_dtype = qw.dtype
+    kw = kw.to(torch.float32).contiguous()
+    vw = vw.to(torch.float32).contiguous()
+    qw = qw.to(torch.float32).contiguous()
+    grad_output = grad_output.to(torch.float32).contiguous()
+    dkw, dvw, dqw = backward(kw, vw, qw, grad_output,
+                                                       quad_weights,
+                                                       col_idx, row_off,
+                                                       nlon_in, nlat_out, nlon_out)
+    # weight grads
+    _, C, H, W = dkw.shape
+    dkw = dkw.reshape(B, -1, H, W)
+    dkw = dkw.to(dtype=kw_dtype)
+    if wk_needs_grad:
+        dwk = torch.einsum("bchw,bfhw->cf", dkw, k).reshape(*wk.shape).contiguous()
+    else:
+        dwk = None
+    _, C, H, W = dvw.shape
+    dvw = dvw.reshape(B, -1, H, W)
+    dvw = dvw.to(dtype=vw_dtype)
+    if wv_needs_grad:
+        dwv = torch.einsum("bchw,bfhw->cf", dvw, v).reshape(*wv.shape).contiguous()
+    else:
+        dwv = None
+    _, C, H, W = dqw.shape
+    dqw = dqw.reshape(B, -1, H, W)
+    dqw = dqw.to(dtype=qw_dtype)
+    if wq_needs_grad:
+        dwq = torch.einsum("bchw,bfhw->cf", dqw, q).reshape(*wq.shape).contiguous()
+    else:
+        dwq = None
+    # input grads
+    if v_needs_grad:
+        dv = torch.nn.functional.conv2d(dvw, weight=wv.permute([1,0,2,3]), bias=None)
+    else:
+        dv = None
+    if k_needs_grad:
+        dk = torch.nn.functional.conv2d(dkw, weight=wk.permute([1,0,2,3]), bias=None)
+    else:
+        dk = None
+    if q_needs_grad:
+        dq = torch.nn.functional.conv2d(dqw, weight=wq.permute([1,0,2,3]), bias=None)
+    else:
+        dq = None
+    # bias grads:
+    if bv_needs_grad:
+        dbv = torch.sum(dvw, dim=(0,2,3))
+    else:
+        dbv = None
+    if bk_needs_grad:
+        dbk = torch.sum(dkw, dim=(0,2,3))
+    else:
+        dbk = None
+    if bq_needs_grad:
+        dbq = torch.sum(dqw, dim=(0,2,3))
+    else:
+        dbq = None
+    return dk, dv, dq, dwk, dwv, dwq, dbk, dbv, dbq, \
+            None, None, None, None, None, None, None, None
+# torch kernels
+# uses qdotk_max update trick to avoid two loops when computing the softmax
+# see e.g., https://arxiv.org/abs/1805.02867
+# and https://alexdremov.me/understanding-flash-attention-writing-the-algorithm-from-scratch-in-triton/
+def _neighborhood_s2_attention_fwd_torch(kx: torch.Tensor, vx: torch.Tensor, qy: torch.Tensor,
+                                         quad_weights: torch.Tensor, col_idx: torch.Tensor, row_off: torch.Tensor,
+                                         nlon_in: int, nlat_out: int, nlon_out: int) -> torch.Tensor:
+    # prepare result tensor
+    out_shape = (qy.shape[0], vx.shape[1], nlat_out, nlon_out)
+    y = torch.zeros(out_shape, dtype=qy.dtype, device=qy.device)
+    for ho in range(nlat_out):
+	    # get number of nonzeros
+        zstart = row_off[ho]
+        zend = row_off[ho+1]
+        for wo in range(nlon_out):
+            alpha_sum = torch.zeros((y.shape[0],), dtype=y.dtype, device=y.device)
+            qdotk_max = torch.zeros((y.shape[0],), dtype=y.dtype, device=y.device)
+            for idz in range(zstart, zend):
+                nz_col_idx = col_idx[idz]
+                # compute input indices from psi datastructure
+                hi = nz_col_idx // nlon_in
+                # account for output shift and ensure positive index due to circular condition
+                wi = nz_col_idx % nlon_in
+                wip = (wi + wo) % nlon_in
+                # compute correlation & softmax numerator
+                q_ho_wo = qy[:, :, ho, wo]
+                k_hi_wip = kx[:, :, hi, wip]
+                qdotk = torch.sum(q_ho_wo * k_hi_wip, dim=1)
+                # tmp max
+                qdotk_max_tmp = torch.maximum(qdotk_max, qdotk)
+                # alpha sum update
+                alpha = torch.exp(qdotk - qdotk_max_tmp) * quad_weights[hi]
+                alpha_sum = alpha + alpha_sum * torch.exp(qdotk_max - qdotk_max_tmp)
+                # update output
+                y[:,:,ho,wo] = y[:,:,ho,wo] * torch.exp(qdotk_max - qdotk_max_tmp).unsqueeze(1) + alpha[:, None] * vx[:,:,hi,wip]
+                # define new max
+                qdotk_max = qdotk_max_tmp
+            y[:,:,ho,wo] = y[:,:,ho,wo] / alpha_sum[:, None]
+    return y
+# Explicit gradient w.r.t. vx: dM/dv
+# provided as a reference for CUDA & other hand-written gradients
+def _neighborhood_s2_attention_bwd_dv_torch(kx: torch.Tensor, vx: torch.Tensor, qy: torch.Tensor, dy: torch.Tensor,
+                                            quad_weights: torch.Tensor, col_idx: torch.Tensor, row_off: torch.Tensor,
+                                            nlon_in: int, nlat_out: int, nlon_out: int):
+    """
+    Pure-torch reference gradient of the neighborhood attention w.r.t. ``vx``.
+
+    For every output point ``(ho, wo)`` the normalized attention weights of
+    its nonzeros are recomputed (three passes: logits, exponentials and their
+    sum, scatter) and ``dy[:, :, ho, wo]`` scaled by each weight is added
+    into ``dvx`` at the corresponding input point.
+
+    Args:
+        kx, vx, qy: keys, values and queries (see shape comments below).
+        dy: gradient w.r.t. the attention output, indexed as ``dy[:, :, ho, wo]``.
+        quad_weights: per-input-latitude weights, indexed by ``hi``.
+        col_idx: flattened input indices ``hi * nlon_in + wi`` of the nonzeros.
+        row_off: offsets into ``col_idx``, one range per output latitude.
+        nlon_in, nlat_out, nlon_out: grid sizes used for index decoding and loops.
+
+    Returns:
+        ``dvx``, a tensor with the shape and dtype of ``vx``.
+    """
+    # shapes:
+    # input
+    # kx: B, C, Hi, Wi
+    # vx: B, Cout, Hi, Wi
+    # qy: B, C, Ho, Wo  (multiplied elementwise with kx slices, so channels match kx)
+    # quad_weights: Hi
+    # output
+    # dvx: B, Cout, Hi, Wi
+    dvx = torch.zeros_like(vx)
+    batch_size = dy.shape[0]
+    for ho in range(nlat_out):
+        # get number of nonzeros
+        zstart = row_off[ho]
+        zend = row_off[ho+1]
+        for wo in range(nlon_out):
+            # per-nonzero scratch buffers for this output point
+            alpha_nz = torch.zeros((batch_size, zend-zstart), dtype=dy.dtype, device=dy.device)
+            qdotk_nz = torch.zeros((batch_size, zend-zstart), dtype=dy.dtype, device=dy.device)
+            alpha_sum = torch.zeros((batch_size,), dtype=dy.dtype, device=dy.device)
+            # pass 1: logits q.k for every nonzero
+            for idz in range(zstart, zend):
+                nz_col_idx = col_idx[idz]
+                # compute input indices from psi datastructure
+                hi = nz_col_idx // nlon_in
+                # account for output shift and ensure positive index due to circular condition
+                wi = nz_col_idx % nlon_in
+                wip = (wi+wo) % nlon_in
+                # compute correlation & softmax numerator
+                q_ho_wo = qy[:, :, ho, wo]
+                k_hi_wi = kx[:, :, hi, wip]
+                qdotk_nz[:,idz-zstart] = torch.sum(q_ho_wo * k_hi_wi, dim=1)
+            # subtract the per-sample maximum before exponentiating
+            qdotk_max, _ = torch.max(qdotk_nz, dim=1)
+            # pass 2: quadrature-weighted exponentials and their sum
+            for idz in range(zstart, zend):
+                nz_col_idx = col_idx[idz]
+                # compute input indices from psi datastructure
+                hi = nz_col_idx // nlon_in
+                # account for output shift and ensure positive index due to circular condition
+                wi = nz_col_idx % nlon_in
+                wip = (wi+wo) % nlon_in
+                alpha_nz[:,idz-zstart] = torch.exp(qdotk_nz[:,idz-zstart] - qdotk_max) * quad_weights[hi]
+                alpha_sum[:] += alpha_nz[:,idz-zstart]
+            # pass 3: scatter dy, scaled by the normalized weight, into dvx
+            for idz in range(zstart, zend):
+                nz_col_idx = col_idx[idz]
+                # compute input indices from psi datastructure
+                hi = nz_col_idx // nlon_in
+                # account for output shift and ensure positive index due to circular condition
+                wi = nz_col_idx % nlon_in
+                wip = (wi+wo) % nlon_in
+                dvx[:,:,hi, wip] += (alpha_nz[:, None, idz-zstart] / alpha_sum[:, None]) * dy[:,:,ho,wo]
+    return dvx
+# Explicit gradient w.r.t. kx: dM/dk
+# provided as a reference for CUDA & other hand-written gradients
+def _neighborhood_s2_attention_bwd_dk_torch(kx: torch.Tensor, vx: torch.Tensor, qy: torch.Tensor, dy: torch.Tensor,
+                                            quad_weights: torch.Tensor, col_idx: torch.Tensor, row_off: torch.Tensor,
+                                            nlon_in: int, nlat_out: int, nlon_out: int):
+    """
+    Pure-torch reference gradient of the neighborhood attention w.r.t. ``kx``.
+
+    For every output point ``(ho, wo)`` the attention weights of its nonzeros
+    are recomputed together with ``integral``, the attention-weighted mean of
+    ``dy . v``. Each nonzero then adds
+    ``q * weight * (dy . v - integral)`` into ``dkx`` at its input point.
+
+    Args:
+        kx, vx, qy: keys, values and queries (see shape comments below).
+        dy: gradient w.r.t. the attention output, indexed as ``dy[:, :, ho, wo]``.
+        quad_weights: per-input-latitude weights, indexed by the input latitude.
+        col_idx: flattened input indices ``h * nlon_in + w`` of the nonzeros.
+        row_off: offsets into ``col_idx``, one range per output latitude.
+        nlon_in, nlat_out, nlon_out: grid sizes used for index decoding and loops.
+
+    Returns:
+        ``dkx``, a tensor with the shape and dtype of ``kx``.
+    """
+    # shapes:
+    # input
+    # kx: B, C, Hi, Wi
+    # vx: B, Cout, Hi, Wi
+    # qy: B, C, Ho, Wo
+    # quad_weights: Hi
+    # output
+    # dkx: B, C, Hi, Wi
+    dkx = torch.zeros_like(kx)
+    batch_size = dy.shape[0]
+    for ho in range(nlat_out):
+        # get number of nonzeros
+        zstart = row_off[ho]
+        zend = row_off[ho+1]
+        for wo in range(nlon_out):
+            # per-nonzero scratch buffers for this output point
+            qdotk_nz = torch.zeros((batch_size, zend-zstart), dtype=dy.dtype, device=dy.device)
+            integral = torch.zeros((batch_size,), dtype=dy.dtype, device=dy.device)
+            alpha = torch.zeros((batch_size, zend-zstart), dtype=dy.dtype, device=dy.device)
+            alpha_sum = torch.zeros((batch_size,), dtype=dy.dtype, device=dy.device)
+            # pass 1: logits q.k for every nonzero
+            for idz in range(zstart, zend):
+                nz_col_idx = col_idx[idz]
+                # compute input indices from psi datastructure
+                hj = nz_col_idx // nlon_in
+                # account for output shift and ensure positive index due to circular condition
+                wj = nz_col_idx % nlon_in
+                wjp = (wj+wo) % nlon_in
+                # compute correlation & softmax numerator
+                q_ho_wo = qy[:, :, ho, wo]
+                k_hj_wjp = kx[:, :, hj, wjp]
+                qdotk_nz[:,idz-zstart] = torch.sum(q_ho_wo * k_hj_wjp, dim=1)
+            # subtract the per-sample maximum before exponentiating
+            qdotk_max, _ = torch.max(qdotk_nz, dim=1)
+            # pass 2: weights, their sum, and the weighted sum of dy.v
+            for idz in range(zstart, zend):
+                nz_col_idx = col_idx[idz]
+                # compute input indices from psi datastructure
+                hj = nz_col_idx // nlon_in
+                # account for output shift and ensure positive index due to circular condition
+                wj = nz_col_idx % nlon_in
+                wjp = (wj+wo) % nlon_in
+                alpha[:, idz-zstart] = torch.exp(qdotk_nz[:,idz-zstart] - qdotk_max) * quad_weights[hj]
+                alpha_sum[:] += alpha[:, idz-zstart]
+                # input dot
+                gdotv = torch.sum(dy[:,:,ho, wo] * vx[:,:,hj, wjp], dim=1)
+                # integral term
+                integral[:] += alpha[:, idz-zstart] * gdotv[:]
+            # normalize: integral becomes the attention-weighted mean of dy.v
+            integral[:] = integral[:] / alpha_sum[:]
+            # pass 3: scatter the contribution of every nonzero into dkx
+            for idz in range(zstart, zend):
+                nz_col_idx = col_idx[idz]
+                # compute input indices from psi datastructure
+                hi = nz_col_idx // nlon_in
+                # account for output shift and ensure positive index due to circular condition
+                wi = nz_col_idx % nlon_in
+                wip = (wi+wo) % nlon_in
+                # dot product of the incoming gradient with the value at this nonzero
+                gdotv = torch.sum(dy[:,:,ho, wo] * vx[:,:,hi, wip], dim=1)
+                dkx[:,:,hi,wip] += qy[:, :, ho, wo] * (alpha[:, None, idz-zstart] / alpha_sum[:, None]) * (gdotv[:, None] - integral[:, None])
+    return dkx
+# Explicit gradient w.r.t. qy: dM/dq
+# provided as a reference for CUDA & other hand-written gradients
+def _neighborhood_s2_attention_bwd_dq_torch(kx: torch.Tensor, vx: torch.Tensor, qy: torch.Tensor, dy: torch.Tensor,
+                                            quad_weights: torch.Tensor, col_idx: torch.Tensor, row_off: torch.Tensor,
+                                            nlon_in: int, nlat_out: int, nlon_out: int):
+    """
+    Pure-torch reference gradient of the neighborhood attention w.r.t. ``qy``.
+
+    For every output point ``(ho, wo)`` four sums over its nonzeros are
+    accumulated from the unnormalized weights ``alpha``: ``alpha_sum``,
+    ``alpha_k`` (weighted keys), ``alpha_vw`` (weighted ``dy . v``) and
+    ``alpha_kvw`` (weighted ``k * (dy . v)``). The gradient is then
+    ``(alpha_kvw * alpha_sum - alpha_vw * alpha_k) / alpha_sum**2``.
+
+    Args:
+        kx, vx, qy: keys, values and queries (see shape comments below).
+        dy: gradient w.r.t. the attention output, indexed as ``dy[:, :, ho, wo]``.
+        quad_weights: per-input-latitude weights, indexed by ``hi``.
+        col_idx: flattened input indices ``hi * nlon_in + wi`` of the nonzeros.
+        row_off: offsets into ``col_idx``, one range per output latitude.
+        nlon_in, nlat_out, nlon_out: grid sizes used for index decoding and loops.
+
+    Returns:
+        ``dqy``, a tensor with the shape and dtype of ``qy``.
+    """
+    # shapes:
+    # input
+    # kx: B, C, Hi, Wi
+    # vx: B, Cout, Hi, Wi
+    # qy: B, C, Ho, Wo
+    # quad_weights: Hi
+    # output
+    # dq: B, C, Ho, Wo
+    batch_size = dy.shape[0]
+    channels_in = kx.shape[1]
+    # NOTE(review): channels_out is not used anywhere below
+    channels_out = vx.shape[1]
+    dqy = torch.zeros_like(qy)
+    for ho in range(nlat_out):
+        # get number of nonzeros
+        zstart = row_off[ho]
+        zend = row_off[ho+1]
+        for wo in range(nlon_out):
+            # per-output-point accumulators
+            alpha = torch.zeros((batch_size, zend-zstart), dtype=dy.dtype, device=dy.device)
+            qdotk_nz = torch.zeros((batch_size, zend-zstart), dtype=dy.dtype, device=dy.device)
+            alpha_k = torch.zeros((batch_size, channels_in), dtype=dy.dtype, device=dy.device)
+            alpha_vw = torch.zeros((batch_size,), dtype=dy.dtype, device=dy.device)
+            alpha_kvw = torch.zeros((batch_size, channels_in), dtype=dy.dtype, device=dy.device)
+            alpha_sum = torch.zeros((batch_size,), dtype=dy.dtype, device=dy.device)
+            # NOTE(review): alpha_sum2 is allocated but never read or written afterwards
+            alpha_sum2 = torch.zeros((batch_size,), dtype=dy.dtype, device=dy.device)
+            # pass 1: logits q.k for every nonzero
+            for idz in range(zstart, zend):
+                nz_col_idx = col_idx[idz]
+                # compute input indices from psi datastructure
+                hi = nz_col_idx // nlon_in
+                # account for output shift and ensure positive index due to circular condition
+                wi = nz_col_idx % nlon_in
+                wip = (wi+wo) % nlon_in
+                idz_i = idz-zstart
+                # compute correlation & softmax numerator
+                q_ho_wo = qy[:, :, ho, wo]
+                k_hi_wi = kx[:, :, hi, wip]
+                qdotk_nz[:,idz-zstart] = torch.sum(q_ho_wo * k_hi_wi, dim=1)
+            # subtract the per-sample maximum before exponentiating
+            qdotk_max,_ = qdotk_nz.max(dim=1)
+            # pass 2: accumulate the alpha-weighted sums
+            for idz in range(zstart, zend):
+                nz_col_idx = col_idx[idz]
+                # compute input indices from psi datastructure
+                hi = nz_col_idx // nlon_in
+                # account for output shift and ensure positive index due to circular condition
+                wi = nz_col_idx % nlon_in
+                wip = (wi+wo) % nlon_in
+                q_ho_wo = qy[:, :, ho, wo]
+                k_hi_wi = kx[:, :, hi, wip]
+                idz_i = idz-zstart
+                alpha[:, idz_i] = torch.exp(qdotk_nz[:,idz-zstart] - qdotk_max) * quad_weights[hi]
+                alpha_sum[:] += alpha[:, idz_i]
+                # dot product of the incoming gradient with the value at this nonzero
+                gdotv = torch.sum(dy[:,:,ho, wo] * vx[:,:,hi, wip], dim=1)
+                alpha_k[:,:] += alpha[:, None, idz_i] * k_hi_wi
+                alpha_vw[:] += alpha[:, idz_i] * gdotv[:]
+                alpha_kvw[:,:] += alpha[:, None, idz_i] * k_hi_wi * gdotv[:,None]
+            # combine the sums into the gradient for this output point
+            dqy[:,:,ho,wo] = (alpha_kvw * alpha_sum[:,None] - alpha_vw[:, None] * alpha_k) / (alpha_sum[:,None] * alpha_sum[:,None])
+    return dqy
+def _neighborhood_s2_attention_torch(k: torch.Tensor, v: torch.Tensor, q: torch.Tensor,
+                                     wk: torch.Tensor, wv: torch.Tensor, wq: torch.Tensor,
+                                     bk: Union[torch.Tensor, None], bv: Union[torch.Tensor, None], bq: Union[torch.Tensor, None],
+                                     quad_weights: torch.Tensor, col_idx: torch.Tensor, row_off: torch.Tensor,
+                                     max_psi_nnz: int, nh: int, nlon_in: int, nlat_out: int, nlon_out: int) -> torch.Tensor:
+    kw = F.conv2d(k, weight=wk, bias=bk)
+    vw = F.conv2d(v, weight=wv, bias=bv)
+    qw = F.conv2d(q, weight=wq, bias=bq)
+    # reshape, folding num heads into batch dim
+    B, _, H, W = kw.shape
+    kw = kw.reshape(B*nh, -1, H, W)
+    B, _, H, W = vw.shape
+    vw = vw.reshape(B*nh, -1, H, W)
+    B, _, H, W = qw.shape
+    qw = qw.reshape(B*nh, -1, H, W)
+    kw = kw.to(torch.float32)
+    vw = vw.to(torch.float32)
+    qw = qw.to(torch.float32)
+    output = _neighborhood_s2_attention_fwd_torch(kw, vw, qw, quad_weights,
+                                                  col_idx, row_off,
+                                                  nlon_in, nlat_out, nlon_out)
+    _, C, H, W = output.shape
+    output = output.reshape(B, -1, H, W)
+    return output
+def _neighborhood_s2_attention_bwd_torch(ctx, grad_output):
+    col_idx, row_off, quad_weights, k, v, q, wk, wv, wq, bk, bv, bq = ctx.saved_tensors
+    nh = ctx.nh
+    nlon_in = ctx.nlon_in
+    nlat_out = ctx.nlat_out
+    nlon_out = ctx.nlon_out
+    # check if we need the grads at all
+    k_needs_grad = ctx.needs_input_grad[0]
+    v_needs_grad = ctx.needs_input_grad[1]
+    q_needs_grad = ctx.needs_input_grad[2]
+    wk_needs_grad = ctx.needs_input_grad[3]
+    wv_needs_grad = ctx.needs_input_grad[4]
+    wq_needs_grad = ctx.needs_input_grad[5]
+    bk_needs_grad = ctx.needs_input_grad[6]
+    bv_needs_grad = ctx.needs_input_grad[7]
+    bq_needs_grad = ctx.needs_input_grad[8]
+    kw = F.conv2d(k, weight=wk, bias=bk)
+    vw = F.conv2d(v, weight=wv, bias=bv)
+    qw = F.conv2d(q, weight=wq, bias=bq)
+    # reshape, folding num heads into batch dim
+    B, _, H, W = kw.shape
+    kw = kw.reshape(B*nh, -1, H, W)
+    B, _, H, W = vw.shape
+    vw = vw.reshape(B*nh, -1, H, W)
+    B, _, H, W = qw.shape
+    qw = qw.reshape(B*nh, -1, H, W)
+    B, _, H, W  = grad_output.shape
+    grad_output = grad_output.reshape(B*nh, -1, H, W)
+    if v_needs_grad or wv_needs_grad or bv_needs_grad:
+        dvw = _neighborhood_s2_attention_bwd_dv_torch(kw, vw, qw, grad_output,
+                                                      quad_weights,
+                                                      col_idx, row_off,
+                                                      nlon_in, nlat_out, nlon_out)
+        _, C, H, W = dvw.shape
+        dvw = dvw.reshape(B, -1, H, W)
+    else:
+        dvw = None
+    if k_needs_grad or wk_needs_grad or bk_needs_grad:
+        dkw = _neighborhood_s2_attention_bwd_dk_torch(kw, vw, qw, grad_output,
+                                                      quad_weights,
+                                                      col_idx, row_off,
+                                                      nlon_in, nlat_out, nlon_out)
+        _, C, H, W = dkw.shape
+        dkw = dkw.reshape(B, -1, H, W)
+    else:
+        dkw = None
+    if q_needs_grad or wq_needs_grad or bq_needs_grad:
+        dqw = _neighborhood_s2_attention_bwd_dq_torch(kw, vw, qw, grad_output,
+                                                      quad_weights,
+                                                      col_idx, row_off,
+                                                      nlon_in, nlat_out, nlon_out)
+        _, C, H, W = dqw.shape
+        dqw = dqw.reshape(B, -1, H, W)
+    else:
+        dqw = None
+    # input grads
+    if v_needs_grad:
+        dv = torch.nn.functional.conv2d(dvw, weight=wv.permute([1,0,2,3]), bias=None)
+    else:
+        dv = None
+    if k_needs_grad:
+        dk = torch.nn.functional.conv2d(dkw, weight=wk.permute([1,0,2,3]), bias=None)
+    else:
+        dk = None
+    if q_needs_grad:
+        dq = torch.nn.functional.conv2d(dqw, weight=wq.permute([1,0,2,3]), bias=None)
+    else:
+        dq = None
+    # weight grads
+    if wv_needs_grad:
+        dwv = torch.einsum("bchw,bfhw->cf", dvw, v).reshape(*wv.shape).contiguous()
+    else:
+        dwv = None
+    if wk_needs_grad:
+        dwk = torch.einsum("bchw,bfhw->cf", dkw, k).reshape(*wk.shape).contiguous()
+    else:
+        dwk = None
+    if wq_needs_grad:
+        dwq = torch.einsum("bchw,bfhw->cf", dqw, q).reshape(*wq.shape).contiguous()
+    else:
+        dwq = None
+    # bias grads:
+    if bv_needs_grad:
+        dbv = torch.sum(dvw, dim=(0,2,3))
+    else:
+        dbv = None
+    if bk_needs_grad:
+        dbk = torch.sum(dkw, dim=(0,2,3))
+    else:
+        dbk = None
+    if bq_needs_grad:
+        dbq = torch.sum(dqw, dim=(0,2,3))
+    else:
+        dbq = None
+    return dk, dv, dq, dwk, dwv, dwq, dbk, dbv, dbq, \
+            None, None, None, None, None, None, None, None

build/torch27-cxx11-cu126-x86_64-linux/torch_harmonics_attn/_ops.py ADDED Viewed

	@@ -0,0 +1,9 @@

+import torch
+from . import _torch_harmonics_attn_20251001150033
+ops = torch.ops._torch_harmonics_attn_20251001150033
+def add_op_namespace_prefix(op_name: str):
+    """
+    Prefix op by namespace.
+    """
+    return f"_torch_harmonics_attn_20251001150033::{op_name}"

build/torch27-cxx11-cu126-x86_64-linux/torch_harmonics_attn/_torch_harmonics_attn_20251001150033.abi3.so ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:a01d03d3f594f42388c5627a59cb8976d3e2fbb5f2adf76c4d5a5dc3f295d35a
+size 27689536

build/torch27-cxx11-cu128-x86_64-linux/torch_harmonics_attn/__init__.py ADDED Viewed

	@@ -0,0 +1,10 @@

+from ._attn_utils import backward, forward, forward_optimized, backward_optimized, _neighborhood_s2_attention_fwd_torch, _neighborhood_s2_attention_bwd_torch
+# Names exported by ``from <package> import *``; every entry is imported
+# from ._attn_utils on the line above.
+__all__ = [
+    "backward",
+    "forward",
+    "forward_optimized",
+    "backward_optimized",
+    "_neighborhood_s2_attention_fwd_torch",
+    "_neighborhood_s2_attention_bwd_torch",
+]

build/torch27-cxx11-cu128-x86_64-linux/torch_harmonics_attn/__pycache__/__init__.cpython-313.pyc ADDED Viewed

Binary file (436 Bytes). View file

build/torch27-cxx11-cu128-x86_64-linux/torch_harmonics_attn/__pycache__/_attn_utils.cpython-313.pyc ADDED Viewed

Binary file (27.2 kB). View file

build/torch27-cxx11-cu128-x86_64-linux/torch_harmonics_attn/__pycache__/_ops.cpython-313.pyc ADDED Viewed

Binary file (570 Bytes). View file

build/torch27-cxx11-cu128-x86_64-linux/torch_harmonics_attn/_attn_utils.py ADDED Viewed

	@@ -0,0 +1,637 @@

+# coding=utf-8
+# SPDX-FileCopyrightText: Copyright (c) 2025 The torch-harmonics Authors. All rights reserved.
+# SPDX-License-Identifier: BSD-3-Clause
+#
+# Redistribution and use in source and binary forms, with or without
+# modification, are permitted provided that the following conditions are met:
+#
+# 1. Redistributions of source code must retain the above copyright notice, this
+# list of conditions and the following disclaimer.
+#
+# 2. Redistributions in binary form must reproduce the above copyright notice,
+# this list of conditions and the following disclaimer in the documentation
+# and/or other materials provided with the distribution.
+#
+# 3. Neither the name of the copyright holder nor the names of its
+# contributors may be used to endorse or promote products derived from
+# this software without specific prior written permission.
+#
+# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+# DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+# SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+#
+from typing import Union, Tuple
+import torch
+import torch.nn.functional as F
+from ._ops import ops
+def backward(kx, vx, qy, dy, quad_weights, psi_col_idx, psi_row_off, nlon_in, nlat_out, nlon_out):
+    return ops.s2_attention_bwd_dkvq_cuda(kx, vx, qy, dy, quad_weights, psi_col_idx, psi_row_off, nlon_in, nlat_out, nlon_out)
+def forward(kx, vx, qy, quad_weights, psi_col_idx, psi_row_off, nlon_in, nlat_out, nlon_out):
+    return ops.s2_attention_fwd_cuda(kx, vx, qy, quad_weights, psi_col_idx, psi_row_off, nlon_in, nlat_out, nlon_out)
+def _setup_context_attention_backward(ctx, inputs, output):
+    k, v, q, wk, wv, wq, bk, bv, bq, quad_weights, col_idx, row_off, max_psi_nnz, nh, nlon_in, nlat_out, nlon_out = inputs
+    ctx.save_for_backward(col_idx, row_off, quad_weights, k, v, q, wk, wv, wq, bk, bv, bq)
+    ctx.nh = nh
+    ctx.max_psi_nnz = max_psi_nnz
+    ctx.nlon_in = nlon_in
+    ctx.nlat_out = nlat_out
+    ctx.nlon_out = nlon_out
+def forward_default(kw: torch.Tensor, vw: torch.Tensor, qw: torch.Tensor,
+        quad_weights: torch.Tensor, col_idx: torch.Tensor, row_off: torch.Tensor,
+        nlon_in: int, nlat_out: int, nlon_out: int) -> torch.Tensor:
+    out_shape = (kw.shape[0], vw.shape[1], nlat_out, nlon_out)
+    return torch.empty(out_shape, dtype=kw.dtype, device=kw.device)
def backward_default(kw: torch.Tensor, vw: torch.Tensor, qw: torch.Tensor, grad_output: torch.Tensor,
        quad_weights: torch.Tensor, col_idx: torch.Tensor, row_off: torch.Tensor,
        nlon_in: int, nlat_out: int, nlon_out: int) -> Tuple[torch.Tensor, torch.Tensor, torch.Tensor]:
    """Allocate uninitialised gradient placeholders for ``kw``, ``vw`` and ``qw``.

    No gradient is computed: each returned tensor only mirrors the shape,
    dtype and device of the corresponding input. Returned as ``(dk, dv, dq)``.
    """
    return tuple(torch.empty_like(projected) for projected in (kw, vw, qw))
+    # forward
def forward_optimized(k: torch.Tensor, v: torch.Tensor, q: torch.Tensor,
                      wk: torch.Tensor, wv: torch.Tensor, wq: torch.Tensor,
                      bk: Union[torch.Tensor, None], bv: Union[torch.Tensor, None], bq: Union[torch.Tensor, None],
                      quad_weights: torch.Tensor, col_idx: torch.Tensor, row_off: torch.Tensor,
                      max_psi_nnz: int, nh: int, nlon_in: int, nlat_out: int, nlon_out: int) -> torch.Tensor:
    """Project k/v/q with ``F.conv2d`` and run the compiled attention forward op.

    The projections are reshaped so that the ``nh`` heads are folded into the
    batch dimension, cast to contiguous float32 for the kernel, and the kernel
    output is reshaped back to the batch size of the projected ``q`` and cast
    to the dtype of the projected ``k``. ``max_psi_nnz`` is accepted but not
    used here.
    """
    def _project(x, weight, bias):
        # conv projection followed by folding the heads into the batch dim;
        # also reports the un-folded batch size
        projected = F.conv2d(x, weight=weight, bias=bias)
        batch, _, height, width = projected.shape
        return projected.reshape(batch * nh, -1, height, width), batch

    kw, _ = _project(k, wk, bk)
    vw, _ = _project(v, wv, bv)
    qw, batch = _project(q, wq, bq)

    # remember the working precision, the kernel itself is fed float32
    inp_dtype = kw.dtype
    kw, vw, qw = (t.to(torch.float32).contiguous() for t in (kw, vw, qw))

    output = forward(kw, vw, qw, quad_weights, col_idx, row_off, nlon_in, nlat_out, nlon_out)

    # unfold the heads again and restore the input precision
    _, _, out_h, out_w = output.shape
    return output.reshape(batch, -1, out_h, out_w).to(dtype=inp_dtype)
def backward_optimized(ctx, grad_output):
    """Backward pass built on the compiled ``backward`` attention op.

    Recomputes the k/v/q projections from the tensors saved on ``ctx``, calls
    the kernel once to obtain the gradients w.r.t. the projections, and turns
    those into gradients for the inputs, projection weights and biases.
    Gradients that ``ctx.needs_input_grad`` does not request are returned as
    ``None``.

    Returns a 17-tuple ``(dk, dv, dq, dwk, dwv, dwq, dbk, dbv, dbq, None * 8)``.

    NOTE(review): the weight-gradient reshape to ``w.shape`` only works for
    1x1 projection kernels -- confirm against the callers.
    """
    col_idx, row_off, quad_weights, k, v, q, wk, wv, wq, bk, bv, bq = ctx.saved_tensors
    nh = ctx.nh
    nlon_in, nlat_out, nlon_out = ctx.nlon_in, ctx.nlat_out, ctx.nlon_out

    # which of the first nine inputs actually need a gradient
    (k_needs_grad, v_needs_grad, q_needs_grad,
     wk_needs_grad, wv_needs_grad, wq_needs_grad,
     bk_needs_grad, bv_needs_grad, bq_needs_grad) = ctx.needs_input_grad[:9]

    def _fold_heads(x):
        # fold the heads into the batch dimension
        batch, _, height, width = x.shape
        return x.reshape(batch * nh, -1, height, width)

    # recompute the projections instead of storing them in forward
    kw = _fold_heads(F.conv2d(k, weight=wk, bias=bk))
    vw = _fold_heads(F.conv2d(v, weight=wv, bias=bv))
    qw = _fold_heads(F.conv2d(q, weight=wq, bias=bq))

    B = grad_output.shape[0]
    grad_output = _fold_heads(grad_output)

    # remember the working precision, the kernel itself is fed float32
    kw_dtype, vw_dtype, qw_dtype = kw.dtype, vw.dtype, qw.dtype
    dkw, dvw, dqw = backward(kw.to(torch.float32).contiguous(),
                             vw.to(torch.float32).contiguous(),
                             qw.to(torch.float32).contiguous(),
                             grad_output.to(torch.float32).contiguous(),
                             quad_weights,
                             col_idx, row_off,
                             nlon_in, nlat_out, nlon_out)

    def _unfold_heads(dproj, dtype):
        # undo the head folding and go back to the projection's precision
        _, _, height, width = dproj.shape
        return dproj.reshape(B, -1, height, width).to(dtype=dtype)

    dkw = _unfold_heads(dkw, kw_dtype)
    dvw = _unfold_heads(dvw, vw_dtype)
    dqw = _unfold_heads(dqw, qw_dtype)

    def _projection_grads(dproj, x, weight, need_x, need_w, need_b):
        # gradients of one conv projection w.r.t. its input, weight and bias
        dx = F.conv2d(dproj, weight=weight.permute([1, 0, 2, 3]), bias=None) if need_x else None
        dw = torch.einsum("bchw,bfhw->cf", dproj, x).reshape(*weight.shape).contiguous() if need_w else None
        db = torch.sum(dproj, dim=(0, 2, 3)) if need_b else None
        return dx, dw, db

    dk, dwk, dbk = _projection_grads(dkw, k, wk, k_needs_grad, wk_needs_grad, bk_needs_grad)
    dv, dwv, dbv = _projection_grads(dvw, v, wv, v_needs_grad, wv_needs_grad, bv_needs_grad)
    dq, dwq, dbq = _projection_grads(dqw, q, wq, q_needs_grad, wq_needs_grad, bq_needs_grad)

    # the eight trailing inputs (quadrature weights, indices, sizes) get no gradient
    return dk, dv, dq, dwk, dwv, dwq, dbk, dbv, dbq, \
            None, None, None, None, None, None, None, None
+# torch kernels
+# uses qdotk_max update trick to avoid two loops when computing the softmax
+# see e.g., https://arxiv.org/abs/1805.02867
+# and https://alexdremov.me/understanding-flash-attention-writing-the-algorithm-from-scratch-in-triton/
+def _neighborhood_s2_attention_fwd_torch(kx: torch.Tensor, vx: torch.Tensor, qy: torch.Tensor,
+                                         quad_weights: torch.Tensor, col_idx: torch.Tensor, row_off: torch.Tensor,
+                                         nlon_in: int, nlat_out: int, nlon_out: int) -> torch.Tensor:
+    # prepare result tensor
+    out_shape = (qy.shape[0], vx.shape[1], nlat_out, nlon_out)
+    y = torch.zeros(out_shape, dtype=qy.dtype, device=qy.device)
+    for ho in range(nlat_out):
+	    # get number of nonzeros
+        zstart = row_off[ho]
+        zend = row_off[ho+1]
+        for wo in range(nlon_out):
+            alpha_sum = torch.zeros((y.shape[0],), dtype=y.dtype, device=y.device)
+            qdotk_max = torch.zeros((y.shape[0],), dtype=y.dtype, device=y.device)
+            for idz in range(zstart, zend):
+                nz_col_idx = col_idx[idz]
+                # compute input indices from psi datastructure
+                hi = nz_col_idx // nlon_in
+                # account for output shift and ensure positive index due to circular condition
+                wi = nz_col_idx % nlon_in
+                wip = (wi + wo) % nlon_in
+                # compute correlation & softmax numerator
+                q_ho_wo = qy[:, :, ho, wo]
+                k_hi_wip = kx[:, :, hi, wip]
+                qdotk = torch.sum(q_ho_wo * k_hi_wip, dim=1)
+                # tmp max
+                qdotk_max_tmp = torch.maximum(qdotk_max, qdotk)
+                # alpha sum update
+                alpha = torch.exp(qdotk - qdotk_max_tmp) * quad_weights[hi]
+                alpha_sum = alpha + alpha_sum * torch.exp(qdotk_max - qdotk_max_tmp)
+                # update output
+                y[:,:,ho,wo] = y[:,:,ho,wo] * torch.exp(qdotk_max - qdotk_max_tmp).unsqueeze(1) + alpha[:, None] * vx[:,:,hi,wip]
+                # define new max
+                qdotk_max = qdotk_max_tmp
+            y[:,:,ho,wo] = y[:,:,ho,wo] / alpha_sum[:, None]
+    return y
+# Explicit gradient w.r.t. vx: dM/dv
+# provided as a reference for CUDA & other hand-written gradients
+def _neighborhood_s2_attention_bwd_dv_torch(kx: torch.Tensor, vx: torch.Tensor, qy: torch.Tensor, dy: torch.Tensor,
+                                            quad_weights: torch.Tensor, col_idx: torch.Tensor, row_off: torch.Tensor,
+                                            nlon_in: int, nlat_out: int, nlon_out: int):
+    # shapes:
+    # input
+    # kx: B, C, Hi, Wi
+    # vx: B, Cout, Hi, Wi
+    # qy: B, Cout, Ho, Wo
+    # quad_weights: Hi
+    # output
+    # dvx: B, Cout, Hi, Wi
+    dvx = torch.zeros_like(vx)
+    batch_size = dy.shape[0]
+    for ho in range(nlat_out):
+        # get number of nonzeros
+        zstart = row_off[ho]
+        zend = row_off[ho+1]
+        for wo in range(nlon_out):
+            alpha_nz = torch.zeros((batch_size, zend-zstart), dtype=dy.dtype, device=dy.device)
+            qdotk_nz = torch.zeros((batch_size, zend-zstart), dtype=dy.dtype, device=dy.device)
+            alpha_sum = torch.zeros((batch_size,), dtype=dy.dtype, device=dy.device)
+            for idz in range(zstart, zend):
+                nz_col_idx = col_idx[idz]
+                # compute input indices from psi datastructure
+                hi = nz_col_idx // nlon_in
+                # account for output shift and ensure positive index due to circular condition
+                wi = nz_col_idx % nlon_in
+                wip = (wi+wo) % nlon_in
+                # compute correlation & softmax numerator
+                q_ho_wo = qy[:, :, ho, wo]
+                k_hi_wi = kx[:, :, hi, wip]
+                qdotk_nz[:,idz-zstart] = torch.sum(q_ho_wo * k_hi_wi, dim=1)
+            qdotk_max, _ = torch.max(qdotk_nz, dim=1)
+            for idz in range(zstart, zend):
+                nz_col_idx = col_idx[idz]
+                # compute input indices from psi datastructure
+                hi = nz_col_idx // nlon_in
+                # account for output shift and ensure positive index due to circular condition
+                wi = nz_col_idx % nlon_in
+                wip = (wi+wo) % nlon_in
+                alpha_nz[:,idz-zstart] = torch.exp(qdotk_nz[:,idz-zstart] - qdotk_max) * quad_weights[hi]
+                alpha_sum[:] += alpha_nz[:,idz-zstart]
+            for idz in range(zstart, zend):
+                nz_col_idx = col_idx[idz]
+                # compute input indices from psi datastructure
+                hi = nz_col_idx // nlon_in
+                # account for output shift and ensure positive index due to circular condition
+                wi = nz_col_idx % nlon_in
+                wip = (wi+wo) % nlon_in
+                dvx[:,:,hi, wip] += (alpha_nz[:, None, idz-zstart] / alpha_sum[:, None]) * dy[:,:,ho,wo]
+    return dvx
+# Explicit gradient w.r.t. kx: dM/dk
+# provided as a reference for CUDA & other hand-written gradients
+def _neighborhood_s2_attention_bwd_dk_torch(kx: torch.Tensor, vx: torch.Tensor, qy: torch.Tensor, dy: torch.Tensor,
+                                            quad_weights: torch.Tensor, col_idx: torch.Tensor, row_off: torch.Tensor,
+                                            nlon_in: int, nlat_out: int, nlon_out: int):
+    # shapes:
+    # input
+    # kx: B, C, Hi, Wi
+    # vx: B, Cout, Hi, Wi
+    # qy: B, C, Ho, Wo
+    # quad_weights: Hi
+    # output
+    # dkx: B, C, Hi, Wi
+    dkx = torch.zeros_like(kx)
+    batch_size = dy.shape[0]
+    for ho in range(nlat_out):
+        # get number of nonzeros
+        zstart = row_off[ho]
+        zend = row_off[ho+1]
+        for wo in range(nlon_out):
+            qdotk_nz = torch.zeros((batch_size, zend-zstart), dtype=dy.dtype, device=dy.device)
+            integral = torch.zeros((batch_size,), dtype=dy.dtype, device=dy.device)
+            alpha = torch.zeros((batch_size, zend-zstart), dtype=dy.dtype, device=dy.device)
+            alpha_sum = torch.zeros((batch_size,), dtype=dy.dtype, device=dy.device)
+            for idz in range(zstart, zend):
+                nz_col_idx = col_idx[idz]
+                # compute input indices from psi datastructure
+                hj = nz_col_idx // nlon_in
+                # account for output shift and ensure positive index due to circular condition
+                wj = nz_col_idx % nlon_in
+                wjp = (wj+wo) % nlon_in
+                # compute correlation & softmax numerator
+                q_ho_wo = qy[:, :, ho, wo]
+                k_hj_wjp = kx[:, :, hj, wjp]
+                qdotk_nz[:,idz-zstart] = torch.sum(q_ho_wo * k_hj_wjp, dim=1)
+            qdotk_max, _ = torch.max(qdotk_nz, dim=1)
+            for idz in range(zstart, zend):
+                nz_col_idx = col_idx[idz]
+                # compute input indices from psi datastructure
+                hj = nz_col_idx // nlon_in
+                # account for output shift and ensure positive index due to circular condition
+                wj = nz_col_idx % nlon_in
+                wjp = (wj+wo) % nlon_in
+                alpha[:, idz-zstart] = torch.exp(qdotk_nz[:,idz-zstart] - qdotk_max) * quad_weights[hj]
+                alpha_sum[:] += alpha[:, idz-zstart]
+                # input dot
+                gdotv = torch.sum(dy[:,:,ho, wo] * vx[:,:,hj, wjp], dim=1)
+                # integral term
+                integral[:] += alpha[:, idz-zstart] * gdotv[:]
+            integral[:] = integral[:] / alpha_sum[:]
+            for idz in range(zstart, zend):
+                nz_col_idx = col_idx[idz]
+                # compute input indices from psi datastructure
+                hi = nz_col_idx // nlon_in
+                # account for output shift and ensure positive index due to circular condition
+                wi = nz_col_idx % nlon_in
+                wip = (wi+wo) % nlon_in
+                # compute correlation & softmax numerator
+                gdotv = torch.sum(dy[:,:,ho, wo] * vx[:,:,hi, wip], dim=1)
+                dkx[:,:,hi,wip] += qy[:, :, ho, wo] * (alpha[:, None, idz-zstart] / alpha_sum[:, None]) * (gdotv[:, None] - integral[:, None])
+    return dkx
+# Explicit gradient w.r.t. qy: dM/dq
+# provided as a reference for CUDA & other hand-written gradients
+def _neighborhood_s2_attention_bwd_dq_torch(kx: torch.Tensor, vx: torch.Tensor, qy: torch.Tensor, dy: torch.Tensor,
+                                            quad_weights: torch.Tensor, col_idx: torch.Tensor, row_off: torch.Tensor,
+                                            nlon_in: int, nlat_out: int, nlon_out: int):
+    # shapes:
+    # input
+    # kx: B, C, Hi, Wi
+    # vx: B, Cout, Hi, Wi
+    # qy: B, C, Ho, Wo
+    # quad_weights: Hi
+    # output
+    # dq: B, C, Ho, Wo
+    batch_size = dy.shape[0]
+    channels_in = kx.shape[1]
+    channels_out = vx.shape[1]
+    dqy = torch.zeros_like(qy)
+    for ho in range(nlat_out):
+        # get number of nonzeros
+        zstart = row_off[ho]
+        zend = row_off[ho+1]
+        for wo in range(nlon_out):
+            alpha = torch.zeros((batch_size, zend-zstart), dtype=dy.dtype, device=dy.device)
+            qdotk_nz = torch.zeros((batch_size, zend-zstart), dtype=dy.dtype, device=dy.device)
+            alpha_k = torch.zeros((batch_size, channels_in), dtype=dy.dtype, device=dy.device)
+            alpha_vw = torch.zeros((batch_size,), dtype=dy.dtype, device=dy.device)
+            alpha_kvw = torch.zeros((batch_size, channels_in), dtype=dy.dtype, device=dy.device)
+            alpha_sum = torch.zeros((batch_size,), dtype=dy.dtype, device=dy.device)
+            alpha_sum2 = torch.zeros((batch_size,), dtype=dy.dtype, device=dy.device)
+            for idz in range(zstart, zend):
+                nz_col_idx = col_idx[idz]
+                # compute input indices from psi datastructure
+                hi = nz_col_idx // nlon_in
+                # account for output shift and ensure positive index due to circular condition
+                wi = nz_col_idx % nlon_in
+                wip = (wi+wo) % nlon_in
+                idz_i = idz-zstart
+                # compute correlation & softmax numerator
+                q_ho_wo = qy[:, :, ho, wo]
+                k_hi_wi = kx[:, :, hi, wip]
+                qdotk_nz[:,idz-zstart] = torch.sum(q_ho_wo * k_hi_wi, dim=1)
+            qdotk_max,_ = qdotk_nz.max(dim=1)
+            for idz in range(zstart, zend):
+                nz_col_idx = col_idx[idz]
+                # compute input indices from psi datastructure
+                hi = nz_col_idx // nlon_in
+                # account for output shift and ensure positive index due to circular condition
+                wi = nz_col_idx % nlon_in
+                wip = (wi+wo) % nlon_in
+                q_ho_wo = qy[:, :, ho, wo]
+                k_hi_wi = kx[:, :, hi, wip]
+                idz_i = idz-zstart
+                alpha[:, idz_i] = torch.exp(qdotk_nz[:,idz-zstart] - qdotk_max) * quad_weights[hi]
+                alpha_sum[:] += alpha[:, idz_i]
+                gdotv = torch.sum(dy[:,:,ho, wo] * vx[:,:,hi, wip], dim=1)
+                alpha_k[:,:] += alpha[:, None, idz_i] * k_hi_wi
+                alpha_vw[:] += alpha[:, idz_i] * gdotv[:]
+                alpha_kvw[:,:] += alpha[:, None, idz_i] * k_hi_wi * gdotv[:,None]
+            dqy[:,:,ho,wo] = (alpha_kvw * alpha_sum[:,None] - alpha_vw[:, None] * alpha_k) / (alpha_sum[:,None] * alpha_sum[:,None])
+    return dqy
def _neighborhood_s2_attention_torch(k: torch.Tensor, v: torch.Tensor, q: torch.Tensor,
                                     wk: torch.Tensor, wv: torch.Tensor, wq: torch.Tensor,
                                     bk: Union[torch.Tensor, None], bv: Union[torch.Tensor, None], bq: Union[torch.Tensor, None],
                                     quad_weights: torch.Tensor, col_idx: torch.Tensor, row_off: torch.Tensor,
                                     max_psi_nnz: int, nh: int, nlon_in: int, nlat_out: int, nlon_out: int) -> torch.Tensor:
    """Project k/v/q with ``F.conv2d`` and run the pure-torch attention forward kernel.

    The projections are reshaped so that the ``nh`` heads are folded into the
    batch dimension and cast to float32 before the kernel call. The kernel
    output is reshaped back to the batch size of the projected ``q`` and is
    returned without a cast back to the input dtype. ``max_psi_nnz`` is
    accepted but not used here.
    """
    def _project(x, weight, bias):
        # conv projection, heads folded into the batch dim, float32 for the kernel;
        # also reports the un-folded batch size
        projected = F.conv2d(x, weight=weight, bias=bias)
        batch, _, height, width = projected.shape
        return projected.reshape(batch * nh, -1, height, width).to(torch.float32), batch

    kw, _ = _project(k, wk, bk)
    vw, _ = _project(v, wv, bv)
    qw, batch = _project(q, wq, bq)

    output = _neighborhood_s2_attention_fwd_torch(kw, vw, qw, quad_weights,
                                                  col_idx, row_off,
                                                  nlon_in, nlat_out, nlon_out)

    # unfold the heads again
    _, _, out_h, out_w = output.shape
    return output.reshape(batch, -1, out_h, out_w)
def _neighborhood_s2_attention_bwd_torch(ctx, grad_output):
    """Backward pass built on the pure-torch reference gradient kernels.

    Recomputes the k/v/q projections from the tensors saved on ``ctx`` and
    calls the dv/dk/dq reference kernels only for those projections for which
    ``ctx.needs_input_grad`` requests an input, weight or bias gradient.
    Gradients that are not requested are returned as ``None``.

    Returns a 17-tuple ``(dk, dv, dq, dwk, dwv, dwq, dbk, dbv, dbq, None * 8)``.
    """
    col_idx, row_off, quad_weights, k, v, q, wk, wv, wq, bk, bv, bq = ctx.saved_tensors
    nh = ctx.nh
    nlon_in, nlat_out, nlon_out = ctx.nlon_in, ctx.nlat_out, ctx.nlon_out

    # which of the first nine inputs actually need a gradient
    needs = ctx.needs_input_grad
    k_needs_grad, v_needs_grad, q_needs_grad = needs[0], needs[1], needs[2]
    wk_needs_grad, wv_needs_grad, wq_needs_grad = needs[3], needs[4], needs[5]
    bk_needs_grad, bv_needs_grad, bq_needs_grad = needs[6], needs[7], needs[8]

    def _fold_heads(x):
        # fold the heads into the batch dimension
        batch, _, height, width = x.shape
        return x.reshape(batch * nh, -1, height, width)

    # recompute the projections instead of storing them in forward
    kw = _fold_heads(F.conv2d(k, weight=wk, bias=bk))
    vw = _fold_heads(F.conv2d(v, weight=wv, bias=bv))
    qw = _fold_heads(F.conv2d(q, weight=wq, bias=bq))

    B = grad_output.shape[0]
    grad_output = _fold_heads(grad_output)

    kernel_args = (kw, vw, qw, grad_output, quad_weights, col_idx, row_off, nlon_in, nlat_out, nlon_out)

    def _projection_grad(kernel, needed):
        # gradient w.r.t. one projection, heads unfolded again; None if unused
        if not needed:
            return None
        dproj = kernel(*kernel_args)
        _, _, height, width = dproj.shape
        return dproj.reshape(B, -1, height, width)

    dvw = _projection_grad(_neighborhood_s2_attention_bwd_dv_torch, v_needs_grad or wv_needs_grad or bv_needs_grad)
    dkw = _projection_grad(_neighborhood_s2_attention_bwd_dk_torch, k_needs_grad or wk_needs_grad or bk_needs_grad)
    dqw = _projection_grad(_neighborhood_s2_attention_bwd_dq_torch, q_needs_grad or wq_needs_grad or bq_needs_grad)

    def _conv_grads(dproj, x, weight, need_x, need_w, need_b):
        # gradients of one conv projection w.r.t. its input, weight and bias
        dx = F.conv2d(dproj, weight=weight.permute([1, 0, 2, 3]), bias=None) if need_x else None
        dw = torch.einsum("bchw,bfhw->cf", dproj, x).reshape(*weight.shape).contiguous() if need_w else None
        db = torch.sum(dproj, dim=(0, 2, 3)) if need_b else None
        return dx, dw, db

    dv, dwv, dbv = _conv_grads(dvw, v, wv, v_needs_grad, wv_needs_grad, bv_needs_grad)
    dk, dwk, dbk = _conv_grads(dkw, k, wk, k_needs_grad, wk_needs_grad, bk_needs_grad)
    dq, dwq, dbq = _conv_grads(dqw, q, wq, q_needs_grad, wq_needs_grad, bq_needs_grad)

    # the eight trailing inputs (quadrature weights, indices, sizes) get no gradient
    return dk, dv, dq, dwk, dwv, dwq, dbk, dbv, dbq, \
            None, None, None, None, None, None, None, None

build/torch27-cxx11-cu128-x86_64-linux/torch_harmonics_attn/_ops.py ADDED Viewed

	@@ -0,0 +1,9 @@

+import torch
+from . import _torch_harmonics_attn_20251001150033
+ops = torch.ops._torch_harmonics_attn_20251001150033
def add_op_namespace_prefix(op_name: str):
    """
    Return ``op_name`` qualified as ``<namespace>::<op_name>``, using this build's op namespace.
    """
    namespace = "_torch_harmonics_attn_20251001150033"
    return "{}::{}".format(namespace, op_name)

build/torch27-cxx11-cu128-x86_64-linux/torch_harmonics_attn/_torch_harmonics_attn_20251001150033.abi3.so ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:fe35cb08c5705c56860da606c3b5480ef7880deaeb42eb0efcd4a37ef1bd70d6
+size 35370448

build/torch28-cxx11-cu126-x86_64-linux/torch_harmonics_attn/__init__.py ADDED Viewed

	@@ -0,0 +1,10 @@

+from ._attn_utils import backward, forward, forward_optimized, backward_optimized, _neighborhood_s2_attention_fwd_torch, _neighborhood_s2_attention_bwd_torch
+__all__ = [
+    "backward",
+    "forward",
+    "forward_optimized",
+    "backward_optimized",
+    "_neighborhood_s2_attention_fwd_torch",
+    "_neighborhood_s2_attention_bwd_torch",
+]

build/torch28-cxx11-cu126-x86_64-linux/torch_harmonics_attn/__pycache__/__init__.cpython-313.pyc ADDED Viewed

Binary file (436 Bytes). View file

build/torch28-cxx11-cu126-x86_64-linux/torch_harmonics_attn/__pycache__/_attn_utils.cpython-313.pyc ADDED Viewed

Binary file (27.2 kB). View file

build/torch28-cxx11-cu126-x86_64-linux/torch_harmonics_attn/__pycache__/_ops.cpython-313.pyc ADDED Viewed

Binary file (570 Bytes). View file

build/torch28-cxx11-cu126-x86_64-linux/torch_harmonics_attn/_attn_utils.py ADDED Viewed

	@@ -0,0 +1,637 @@

+# coding=utf-8
+# SPDX-FileCopyrightText: Copyright (c) 2025 The torch-harmonics Authors. All rights reserved.
+# SPDX-License-Identifier: BSD-3-Clause
+#
+# Redistribution and use in source and binary forms, with or without
+# modification, are permitted provided that the following conditions are met:
+#
+# 1. Redistributions of source code must retain the above copyright notice, this
+# list of conditions and the following disclaimer.
+#
+# 2. Redistributions in binary form must reproduce the above copyright notice,
+# this list of conditions and the following disclaimer in the documentation
+# and/or other materials provided with the distribution.
+#
+# 3. Neither the name of the copyright holder nor the names of its
+# contributors may be used to endorse or promote products derived from
+# this software without specific prior written permission.
+#
+# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+# DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+# SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+#
+from typing import Union, Tuple
+import torch
+import torch.nn.functional as F
+from ._ops import ops
def backward(kx, vx, qy, dy, quad_weights, psi_col_idx, psi_row_off, nlon_in, nlat_out, nlon_out):
    """Thin wrapper around the compiled ``s2_attention_bwd_dkvq_cuda`` op.

    Every argument is forwarded positionally and the op's result is returned
    unchanged (``backward_optimized`` unpacks it as ``(dk, dv, dq)``).
    """
    bwd_op = ops.s2_attention_bwd_dkvq_cuda
    return bwd_op(kx, vx, qy, dy, quad_weights, psi_col_idx, psi_row_off, nlon_in, nlat_out, nlon_out)
def forward(kx, vx, qy, quad_weights, psi_col_idx, psi_row_off, nlon_in, nlat_out, nlon_out):
    """Thin wrapper around the compiled ``s2_attention_fwd_cuda`` op.

    Every argument is forwarded positionally and the op's result is returned
    unchanged.
    """
    fwd_op = ops.s2_attention_fwd_cuda
    return fwd_op(kx, vx, qy, quad_weights, psi_col_idx, psi_row_off, nlon_in, nlat_out, nlon_out)
+def _setup_context_attention_backward(ctx, inputs, output):
+    k, v, q, wk, wv, wq, bk, bv, bq, quad_weights, col_idx, row_off, max_psi_nnz, nh, nlon_in, nlat_out, nlon_out = inputs
+    ctx.save_for_backward(col_idx, row_off, quad_weights, k, v, q, wk, wv, wq, bk, bv, bq)
+    ctx.nh = nh
+    ctx.max_psi_nnz = max_psi_nnz
+    ctx.nlon_in = nlon_in
+    ctx.nlat_out = nlat_out
+    ctx.nlon_out = nlon_out
def forward_default(kw: torch.Tensor, vw: torch.Tensor, qw: torch.Tensor,
        quad_weights: torch.Tensor, col_idx: torch.Tensor, row_off: torch.Tensor,
        nlon_in: int, nlat_out: int, nlon_out: int) -> torch.Tensor:
    """Allocate an uninitialised tensor shaped like the attention output.

    No attention is computed: the result has shape
    ``(kw.shape[0], vw.shape[1], nlat_out, nlon_out)`` and takes its dtype and
    device from ``kw``. The remaining arguments are ignored.
    """
    batch = kw.shape[0]
    channels = vw.shape[1]
    return torch.empty((batch, channels, nlat_out, nlon_out), dtype=kw.dtype, device=kw.device)
def backward_default(kw: torch.Tensor, vw: torch.Tensor, qw: torch.Tensor, grad_output: torch.Tensor,
        quad_weights: torch.Tensor, col_idx: torch.Tensor, row_off: torch.Tensor,
        nlon_in: int, nlat_out: int, nlon_out: int) -> Tuple[torch.Tensor, torch.Tensor, torch.Tensor]:
    """Allocate uninitialised gradient placeholders for ``kw``, ``vw`` and ``qw``.

    No gradient is computed: each returned tensor only mirrors the shape,
    dtype and device of the corresponding input. Returned as ``(dk, dv, dq)``.
    """
    return tuple(torch.empty_like(projected) for projected in (kw, vw, qw))
+    # forward
def forward_optimized(k: torch.Tensor, v: torch.Tensor, q: torch.Tensor,
                      wk: torch.Tensor, wv: torch.Tensor, wq: torch.Tensor,
                      bk: Union[torch.Tensor, None], bv: Union[torch.Tensor, None], bq: Union[torch.Tensor, None],
                      quad_weights: torch.Tensor, col_idx: torch.Tensor, row_off: torch.Tensor,
                      max_psi_nnz: int, nh: int, nlon_in: int, nlat_out: int, nlon_out: int) -> torch.Tensor:
    """Project k/v/q with ``F.conv2d`` and run the compiled attention forward op.

    The projections are reshaped so that the ``nh`` heads are folded into the
    batch dimension, cast to contiguous float32 for the kernel, and the kernel
    output is reshaped back to the batch size of the projected ``q`` and cast
    to the dtype of the projected ``k``. ``max_psi_nnz`` is accepted but not
    used here.
    """
    def _project(x, weight, bias):
        # conv projection followed by folding the heads into the batch dim;
        # also reports the un-folded batch size
        projected = F.conv2d(x, weight=weight, bias=bias)
        batch, _, height, width = projected.shape
        return projected.reshape(batch * nh, -1, height, width), batch

    kw, _ = _project(k, wk, bk)
    vw, _ = _project(v, wv, bv)
    qw, batch = _project(q, wq, bq)

    # remember the working precision, the kernel itself is fed float32
    inp_dtype = kw.dtype
    kw, vw, qw = (t.to(torch.float32).contiguous() for t in (kw, vw, qw))

    output = forward(kw, vw, qw, quad_weights, col_idx, row_off, nlon_in, nlat_out, nlon_out)

    # unfold the heads again and restore the input precision
    _, _, out_h, out_w = output.shape
    return output.reshape(batch, -1, out_h, out_w).to(dtype=inp_dtype)
def backward_optimized(ctx, grad_output):
    """Backward pass built on the compiled ``backward`` attention op.

    Recomputes the k/v/q projections from the tensors saved on ``ctx``, calls
    the kernel once to obtain the gradients w.r.t. the projections, and turns
    those into gradients for the inputs, projection weights and biases.
    Gradients that ``ctx.needs_input_grad`` does not request are returned as
    ``None``.

    Returns a 17-tuple ``(dk, dv, dq, dwk, dwv, dwq, dbk, dbv, dbq, None * 8)``.

    NOTE(review): the weight-gradient reshape to ``w.shape`` only works for
    1x1 projection kernels -- confirm against the callers.
    """
    col_idx, row_off, quad_weights, k, v, q, wk, wv, wq, bk, bv, bq = ctx.saved_tensors
    nh = ctx.nh
    nlon_in, nlat_out, nlon_out = ctx.nlon_in, ctx.nlat_out, ctx.nlon_out

    # which of the first nine inputs actually need a gradient
    (k_needs_grad, v_needs_grad, q_needs_grad,
     wk_needs_grad, wv_needs_grad, wq_needs_grad,
     bk_needs_grad, bv_needs_grad, bq_needs_grad) = ctx.needs_input_grad[:9]

    def _fold_heads(x):
        # fold the heads into the batch dimension
        batch, _, height, width = x.shape
        return x.reshape(batch * nh, -1, height, width)

    # recompute the projections instead of storing them in forward
    kw = _fold_heads(F.conv2d(k, weight=wk, bias=bk))
    vw = _fold_heads(F.conv2d(v, weight=wv, bias=bv))
    qw = _fold_heads(F.conv2d(q, weight=wq, bias=bq))

    B = grad_output.shape[0]
    grad_output = _fold_heads(grad_output)

    # remember the working precision, the kernel itself is fed float32
    kw_dtype, vw_dtype, qw_dtype = kw.dtype, vw.dtype, qw.dtype
    dkw, dvw, dqw = backward(kw.to(torch.float32).contiguous(),
                             vw.to(torch.float32).contiguous(),
                             qw.to(torch.float32).contiguous(),
                             grad_output.to(torch.float32).contiguous(),
                             quad_weights,
                             col_idx, row_off,
                             nlon_in, nlat_out, nlon_out)

    def _unfold_heads(dproj, dtype):
        # undo the head folding and go back to the projection's precision
        _, _, height, width = dproj.shape
        return dproj.reshape(B, -1, height, width).to(dtype=dtype)

    dkw = _unfold_heads(dkw, kw_dtype)
    dvw = _unfold_heads(dvw, vw_dtype)
    dqw = _unfold_heads(dqw, qw_dtype)

    def _projection_grads(dproj, x, weight, need_x, need_w, need_b):
        # gradients of one conv projection w.r.t. its input, weight and bias
        dx = F.conv2d(dproj, weight=weight.permute([1, 0, 2, 3]), bias=None) if need_x else None
        dw = torch.einsum("bchw,bfhw->cf", dproj, x).reshape(*weight.shape).contiguous() if need_w else None
        db = torch.sum(dproj, dim=(0, 2, 3)) if need_b else None
        return dx, dw, db

    dk, dwk, dbk = _projection_grads(dkw, k, wk, k_needs_grad, wk_needs_grad, bk_needs_grad)
    dv, dwv, dbv = _projection_grads(dvw, v, wv, v_needs_grad, wv_needs_grad, bv_needs_grad)
    dq, dwq, dbq = _projection_grads(dqw, q, wq, q_needs_grad, wq_needs_grad, bq_needs_grad)

    # the eight trailing inputs (quadrature weights, indices, sizes) get no gradient
    return dk, dv, dq, dwk, dwv, dwq, dbk, dbv, dbq, \
            None, None, None, None, None, None, None, None
+# torch kernels
+# uses qdotk_max update trick to avoid two loops when computing the softmax
+# see e.g., https://arxiv.org/abs/1805.02867
+# and https://alexdremov.me/understanding-flash-attention-writing-the-algorithm-from-scratch-in-triton/
+def _neighborhood_s2_attention_fwd_torch(kx: torch.Tensor, vx: torch.Tensor, qy: torch.Tensor,
+                                         quad_weights: torch.Tensor, col_idx: torch.Tensor, row_off: torch.Tensor,
+                                         nlon_in: int, nlat_out: int, nlon_out: int) -> torch.Tensor:
+    # prepare result tensor
+    out_shape = (qy.shape[0], vx.shape[1], nlat_out, nlon_out)
+    y = torch.zeros(out_shape, dtype=qy.dtype, device=qy.device)
+    for ho in range(nlat_out):
+	    # get number of nonzeros
+        zstart = row_off[ho]
+        zend = row_off[ho+1]
+        for wo in range(nlon_out):
+            alpha_sum = torch.zeros((y.shape[0],), dtype=y.dtype, device=y.device)
+            qdotk_max = torch.zeros((y.shape[0],), dtype=y.dtype, device=y.device)
+            for idz in range(zstart, zend):
+                nz_col_idx = col_idx[idz]
+                # compute input indices from psi datastructure
+                hi = nz_col_idx // nlon_in
+                # account for output shift and ensure positive index due to circular condition
+                wi = nz_col_idx % nlon_in
+                wip = (wi + wo) % nlon_in
+                # compute correlation & softmax numerator
+                q_ho_wo = qy[:, :, ho, wo]
+                k_hi_wip = kx[:, :, hi, wip]
+                qdotk = torch.sum(q_ho_wo * k_hi_wip, dim=1)
+                # tmp max
+                qdotk_max_tmp = torch.maximum(qdotk_max, qdotk)
+                # alpha sum update
+                alpha = torch.exp(qdotk - qdotk_max_tmp) * quad_weights[hi]
+                alpha_sum = alpha + alpha_sum * torch.exp(qdotk_max - qdotk_max_tmp)
+                # update output
+                y[:,:,ho,wo] = y[:,:,ho,wo] * torch.exp(qdotk_max - qdotk_max_tmp).unsqueeze(1) + alpha[:, None] * vx[:,:,hi,wip]
+                # define new max
+                qdotk_max = qdotk_max_tmp
+            y[:,:,ho,wo] = y[:,:,ho,wo] / alpha_sum[:, None]
+    return y
+# Explicit gradient w.r.t. vx: dM/dv
+# provided as a reference for CUDA & other hand-written gradients
+def _neighborhood_s2_attention_bwd_dv_torch(kx: torch.Tensor, vx: torch.Tensor, qy: torch.Tensor, dy: torch.Tensor,
+                                            quad_weights: torch.Tensor, col_idx: torch.Tensor, row_off: torch.Tensor,
+                                            nlon_in: int, nlat_out: int, nlon_out: int):
+    """
+    Reference gradient of the neighborhood attention output with respect to vx.
+
+    For each output point (ho, wo) the normalized attention weight of every
+    neighbor is recomputed and dy[:, :, ho, wo], scaled by that weight, is
+    accumulated into dvx at the neighbor's (hi, wip) location.
+
+    Returns:
+        dvx, a tensor shaped like vx.
+    """
+    # shapes:
+    # input
+    # kx: B, C, Hi, Wi
+    # vx: B, Cout, Hi, Wi
+    # qy: B, C, Ho, Wo  (multiplied with kx and summed over dim 1 below, so presumably C rather than Cout; confirm)
+    # quad_weights: Hi
+    # output
+    # dvx: B, Cout, Hi, Wi
+    dvx = torch.zeros_like(vx)
+    batch_size = dy.shape[0]
+    for ho in range(nlat_out):
+        # get number of nonzeros
+        zstart = row_off[ho]
+        zend = row_off[ho+1]
+        for wo in range(nlon_out):
+            # per-neighbor softmax numerators and raw logits for this output point
+            alpha_nz = torch.zeros((batch_size, zend-zstart), dtype=dy.dtype, device=dy.device)
+            qdotk_nz = torch.zeros((batch_size, zend-zstart), dtype=dy.dtype, device=dy.device)
+            alpha_sum = torch.zeros((batch_size,), dtype=dy.dtype, device=dy.device)
+            # pass 1: logits q . k for every neighbor
+            for idz in range(zstart, zend):
+                nz_col_idx = col_idx[idz]
+                # compute input indices from psi datastructure
+                hi = nz_col_idx // nlon_in
+                # account for output shift and ensure positive index due to circular condition
+                wi = nz_col_idx % nlon_in
+                wip = (wi+wo) % nlon_in
+                # compute correlation & softmax numerator
+                q_ho_wo = qy[:, :, ho, wo]
+                k_hi_wi = kx[:, :, hi, wip]
+                qdotk_nz[:,idz-zstart] = torch.sum(q_ho_wo * k_hi_wi, dim=1)
+            # maximum logit per batch entry, subtracted before exponentiating
+            qdotk_max, _ = torch.max(qdotk_nz, dim=1)
+            # pass 2: softmax numerators (including quadrature weight) and their sum
+            for idz in range(zstart, zend):
+                nz_col_idx = col_idx[idz]
+                # compute input indices from psi datastructure
+                hi = nz_col_idx // nlon_in
+                # account for output shift and ensure positive index due to circular condition
+                wi = nz_col_idx % nlon_in
+                wip = (wi+wo) % nlon_in
+                alpha_nz[:,idz-zstart] = torch.exp(qdotk_nz[:,idz-zstart] - qdotk_max) * quad_weights[hi]
+                alpha_sum[:] += alpha_nz[:,idz-zstart]
+            # pass 3: scatter the weighted upstream gradient onto each neighbor
+            for idz in range(zstart, zend):
+                nz_col_idx = col_idx[idz]
+                # compute input indices from psi datastructure
+                hi = nz_col_idx // nlon_in
+                # account for output shift and ensure positive index due to circular condition
+                wi = nz_col_idx % nlon_in
+                wip = (wi+wo) % nlon_in
+                dvx[:,:,hi, wip] += (alpha_nz[:, None, idz-zstart] / alpha_sum[:, None]) * dy[:,:,ho,wo]
+    return dvx
+# Explicit gradient w.r.t. kx: dM/dk
+# provided as a reference for CUDA & other hand-written gradients
+def _neighborhood_s2_attention_bwd_dk_torch(kx: torch.Tensor, vx: torch.Tensor, qy: torch.Tensor, dy: torch.Tensor,
+                                            quad_weights: torch.Tensor, col_idx: torch.Tensor, row_off: torch.Tensor,
+                                            nlon_in: int, nlat_out: int, nlon_out: int):
+    """
+    Reference gradient of the neighborhood attention output with respect to kx.
+
+    For each output point (ho, wo) the normalized attention weights are
+    recomputed, the weighted mean of dy . v over all neighbors is formed
+    ("integral"), and each neighbor receives
+    q * weight * (dy . v - integral) accumulated into dkx.
+
+    Returns:
+        dkx, a tensor shaped like kx.
+    """
+    # shapes:
+    # input
+    # kx: B, C, Hi, Wi
+    # vx: B, Cout, Hi, Wi
+    # qy: B, C, Ho, Wo
+    # quad_weights: Hi
+    # output
+    # dkx: B, C, Hi, Wi
+    dkx = torch.zeros_like(kx)
+    batch_size = dy.shape[0]
+    for ho in range(nlat_out):
+        # get number of nonzeros
+        zstart = row_off[ho]
+        zend = row_off[ho+1]
+        for wo in range(nlon_out):
+            # per-output-point scratch buffers: logits, weighted mean of dy . v, softmax numerators, denominator
+            qdotk_nz = torch.zeros((batch_size, zend-zstart), dtype=dy.dtype, device=dy.device)
+            integral = torch.zeros((batch_size,), dtype=dy.dtype, device=dy.device)
+            alpha = torch.zeros((batch_size, zend-zstart), dtype=dy.dtype, device=dy.device)
+            alpha_sum = torch.zeros((batch_size,), dtype=dy.dtype, device=dy.device)
+            # pass 1: logits q . k for every neighbor
+            for idz in range(zstart, zend):
+                nz_col_idx = col_idx[idz]
+                # compute input indices from psi datastructure
+                hj = nz_col_idx // nlon_in
+                # account for output shift and ensure positive index due to circular condition
+                wj = nz_col_idx % nlon_in
+                wjp = (wj+wo) % nlon_in
+                # compute correlation & softmax numerator
+                q_ho_wo = qy[:, :, ho, wo]
+                k_hj_wjp = kx[:, :, hj, wjp]
+                qdotk_nz[:,idz-zstart] = torch.sum(q_ho_wo * k_hj_wjp, dim=1)
+            # maximum logit per batch entry, subtracted before exponentiating
+            qdotk_max, _ = torch.max(qdotk_nz, dim=1)
+            # pass 2: softmax numerators, denominator and the unnormalized integral term
+            for idz in range(zstart, zend):
+                nz_col_idx = col_idx[idz]
+                # compute input indices from psi datastructure
+                hj = nz_col_idx // nlon_in
+                # account for output shift and ensure positive index due to circular condition
+                wj = nz_col_idx % nlon_in
+                wjp = (wj+wo) % nlon_in
+                alpha[:, idz-zstart] = torch.exp(qdotk_nz[:,idz-zstart] - qdotk_max) * quad_weights[hj]
+                alpha_sum[:] += alpha[:, idz-zstart]
+                # input dot
+                gdotv = torch.sum(dy[:,:,ho, wo] * vx[:,:,hj, wjp], dim=1)
+                # integral term
+                integral[:] += alpha[:, idz-zstart] * gdotv[:]
+            # normalize: integral becomes the attention-weighted mean of dy . v
+            integral[:] = integral[:] / alpha_sum[:]
+            # pass 3: accumulate the gradient contribution of every neighbor
+            for idz in range(zstart, zend):
+                nz_col_idx = col_idx[idz]
+                # compute input indices from psi datastructure
+                hi = nz_col_idx // nlon_in
+                # account for output shift and ensure positive index due to circular condition
+                wi = nz_col_idx % nlon_in
+                wip = (wi+wo) % nlon_in
+                # dot product of the upstream gradient with this neighbor's value
+                gdotv = torch.sum(dy[:,:,ho, wo] * vx[:,:,hi, wip], dim=1)
+                dkx[:,:,hi,wip] += qy[:, :, ho, wo] * (alpha[:, None, idz-zstart] / alpha_sum[:, None]) * (gdotv[:, None] - integral[:, None])
+    return dkx
+# Explicit gradient w.r.t. qy: dM/dq
+# provided as a reference for CUDA & other hand-written gradients
+def _neighborhood_s2_attention_bwd_dq_torch(kx: torch.Tensor, vx: torch.Tensor, qy: torch.Tensor, dy: torch.Tensor,
+                                            quad_weights: torch.Tensor, col_idx: torch.Tensor, row_off: torch.Tensor,
+                                            nlon_in: int, nlat_out: int, nlon_out: int):
+    # shapes:
+    # input
+    # kx: B, C, Hi, Wi
+    # vx: B, Cout, Hi, Wi
+    # qy: B, C, Ho, Wo
+    # quad_weights: Hi
+    # output
+    # dq: B, C, Ho, Wo
+    batch_size = dy.shape[0]
+    channels_in = kx.shape[1]
+    channels_out = vx.shape[1]
+    dqy = torch.zeros_like(qy)
+    for ho in range(nlat_out):
+        # get number of nonzeros
+        zstart = row_off[ho]
+        zend = row_off[ho+1]
+        for wo in range(nlon_out):
+            alpha = torch.zeros((batch_size, zend-zstart), dtype=dy.dtype, device=dy.device)
+            qdotk_nz = torch.zeros((batch_size, zend-zstart), dtype=dy.dtype, device=dy.device)
+            alpha_k = torch.zeros((batch_size, channels_in), dtype=dy.dtype, device=dy.device)
+            alpha_vw = torch.zeros((batch_size,), dtype=dy.dtype, device=dy.device)
+            alpha_kvw = torch.zeros((batch_size, channels_in), dtype=dy.dtype, device=dy.device)
+            alpha_sum = torch.zeros((batch_size,), dtype=dy.dtype, device=dy.device)
+            alpha_sum2 = torch.zeros((batch_size,), dtype=dy.dtype, device=dy.device)
+            for idz in range(zstart, zend):
+                nz_col_idx = col_idx[idz]
+                # compute input indices from psi datastructure
+                hi = nz_col_idx // nlon_in
+                # account for output shift and ensure positive index due to circular condition
+                wi = nz_col_idx % nlon_in
+                wip = (wi+wo) % nlon_in
+                idz_i = idz-zstart
+                # compute correlation & softmax numerator
+                q_ho_wo = qy[:, :, ho, wo]
+                k_hi_wi = kx[:, :, hi, wip]
+                qdotk_nz[:,idz-zstart] = torch.sum(q_ho_wo * k_hi_wi, dim=1)
+            qdotk_max,_ = qdotk_nz.max(dim=1)
+            for idz in range(zstart, zend):
+                nz_col_idx = col_idx[idz]
+                # compute input indices from psi datastructure
+                hi = nz_col_idx // nlon_in
+                # account for output shift and ensure positive index due to circular condition
+                wi = nz_col_idx % nlon_in
+                wip = (wi+wo) % nlon_in
+                q_ho_wo = qy[:, :, ho, wo]
+                k_hi_wi = kx[:, :, hi, wip]
+                idz_i = idz-zstart
+                alpha[:, idz_i] = torch.exp(qdotk_nz[:,idz-zstart] - qdotk_max) * quad_weights[hi]
+                alpha_sum[:] += alpha[:, idz_i]
+                gdotv = torch.sum(dy[:,:,ho, wo] * vx[:,:,hi, wip], dim=1)
+                alpha_k[:,:] += alpha[:, None, idz_i] * k_hi_wi
+                alpha_vw[:] += alpha[:, idz_i] * gdotv[:]
+                alpha_kvw[:,:] += alpha[:, None, idz_i] * k_hi_wi * gdotv[:,None]
+            dqy[:,:,ho,wo] = (alpha_kvw * alpha_sum[:,None] - alpha_vw[:, None] * alpha_k) / (alpha_sum[:,None] * alpha_sum[:,None])
+    return dqy
+def _neighborhood_s2_attention_torch(k: torch.Tensor, v: torch.Tensor, q: torch.Tensor,
+                                     wk: torch.Tensor, wv: torch.Tensor, wq: torch.Tensor,
+                                     bk: Union[torch.Tensor, None], bv: Union[torch.Tensor, None], bq: Union[torch.Tensor, None],
+                                     quad_weights: torch.Tensor, col_idx: torch.Tensor, row_off: torch.Tensor,
+                                     max_psi_nnz: int, nh: int, nlon_in: int, nlat_out: int, nlon_out: int) -> torch.Tensor:
+    kw = F.conv2d(k, weight=wk, bias=bk)
+    vw = F.conv2d(v, weight=wv, bias=bv)
+    qw = F.conv2d(q, weight=wq, bias=bq)
+    # reshape, folding num heads into batch dim
+    B, _, H, W = kw.shape
+    kw = kw.reshape(B*nh, -1, H, W)
+    B, _, H, W = vw.shape
+    vw = vw.reshape(B*nh, -1, H, W)
+    B, _, H, W = qw.shape
+    qw = qw.reshape(B*nh, -1, H, W)
+    kw = kw.to(torch.float32)
+    vw = vw.to(torch.float32)
+    qw = qw.to(torch.float32)
+    output = _neighborhood_s2_attention_fwd_torch(kw, vw, qw, quad_weights,
+                                                  col_idx, row_off,
+                                                  nlon_in, nlat_out, nlon_out)
+    _, C, H, W = output.shape
+    output = output.reshape(B, -1, H, W)
+    return output
+def _neighborhood_s2_attention_bwd_torch(ctx, grad_output):
+    col_idx, row_off, quad_weights, k, v, q, wk, wv, wq, bk, bv, bq = ctx.saved_tensors
+    nh = ctx.nh
+    nlon_in = ctx.nlon_in
+    nlat_out = ctx.nlat_out
+    nlon_out = ctx.nlon_out
+    # check if we need the grads at all
+    k_needs_grad = ctx.needs_input_grad[0]
+    v_needs_grad = ctx.needs_input_grad[1]
+    q_needs_grad = ctx.needs_input_grad[2]
+    wk_needs_grad = ctx.needs_input_grad[3]
+    wv_needs_grad = ctx.needs_input_grad[4]
+    wq_needs_grad = ctx.needs_input_grad[5]
+    bk_needs_grad = ctx.needs_input_grad[6]
+    bv_needs_grad = ctx.needs_input_grad[7]
+    bq_needs_grad = ctx.needs_input_grad[8]
+    kw = F.conv2d(k, weight=wk, bias=bk)
+    vw = F.conv2d(v, weight=wv, bias=bv)
+    qw = F.conv2d(q, weight=wq, bias=bq)
+    # reshape, folding num heads into batch dim
+    B, _, H, W = kw.shape
+    kw = kw.reshape(B*nh, -1, H, W)
+    B, _, H, W = vw.shape
+    vw = vw.reshape(B*nh, -1, H, W)
+    B, _, H, W = qw.shape
+    qw = qw.reshape(B*nh, -1, H, W)
+    B, _, H, W  = grad_output.shape
+    grad_output = grad_output.reshape(B*nh, -1, H, W)
+    if v_needs_grad or wv_needs_grad or bv_needs_grad:
+        dvw = _neighborhood_s2_attention_bwd_dv_torch(kw, vw, qw, grad_output,
+                                                      quad_weights,
+                                                      col_idx, row_off,
+                                                      nlon_in, nlat_out, nlon_out)
+        _, C, H, W = dvw.shape
+        dvw = dvw.reshape(B, -1, H, W)
+    else:
+        dvw = None
+    if k_needs_grad or wk_needs_grad or bk_needs_grad:
+        dkw = _neighborhood_s2_attention_bwd_dk_torch(kw, vw, qw, grad_output,
+                                                      quad_weights,
+                                                      col_idx, row_off,
+                                                      nlon_in, nlat_out, nlon_out)
+        _, C, H, W = dkw.shape
+        dkw = dkw.reshape(B, -1, H, W)
+    else:
+        dkw = None
+    if q_needs_grad or wq_needs_grad or bq_needs_grad:
+        dqw = _neighborhood_s2_attention_bwd_dq_torch(kw, vw, qw, grad_output,
+                                                      quad_weights,
+                                                      col_idx, row_off,
+                                                      nlon_in, nlat_out, nlon_out)
+        _, C, H, W = dqw.shape
+        dqw = dqw.reshape(B, -1, H, W)
+    else:
+        dqw = None
+    # input grads
+    if v_needs_grad:
+        dv = torch.nn.functional.conv2d(dvw, weight=wv.permute([1,0,2,3]), bias=None)
+    else:
+        dv = None
+    if k_needs_grad:
+        dk = torch.nn.functional.conv2d(dkw, weight=wk.permute([1,0,2,3]), bias=None)
+    else:
+        dk = None
+    if q_needs_grad:
+        dq = torch.nn.functional.conv2d(dqw, weight=wq.permute([1,0,2,3]), bias=None)
+    else:
+        dq = None
+    # weight grads
+    if wv_needs_grad:
+        dwv = torch.einsum("bchw,bfhw->cf", dvw, v).reshape(*wv.shape).contiguous()
+    else:
+        dwv = None
+    if wk_needs_grad:
+        dwk = torch.einsum("bchw,bfhw->cf", dkw, k).reshape(*wk.shape).contiguous()
+    else:
+        dwk = None
+    if wq_needs_grad:
+        dwq = torch.einsum("bchw,bfhw->cf", dqw, q).reshape(*wq.shape).contiguous()
+    else:
+        dwq = None
+    # bias grads:
+    if bv_needs_grad:
+        dbv = torch.sum(dvw, dim=(0,2,3))
+    else:
+        dbv = None
+    if bk_needs_grad:
+        dbk = torch.sum(dkw, dim=(0,2,3))
+    else:
+        dbk = None
+    if bq_needs_grad:
+        dbq = torch.sum(dqw, dim=(0,2,3))
+    else:
+        dbq = None
+    return dk, dv, dq, dwk, dwv, dwq, dbk, dbv, dbq, \
+            None, None, None, None, None, None, None, None

build/torch28-cxx11-cu126-x86_64-linux/torch_harmonics_attn/_ops.py ADDED Viewed

	@@ -0,0 +1,9 @@

+import torch
+from . import _torch_harmonics_attn_20251001150033
+ops = torch.ops._torch_harmonics_attn_20251001150033
+def add_op_namespace_prefix(op_name: str):
+    """
+    Prefix op by namespace.
+    """
+    return f"_torch_harmonics_attn_20251001150033::{op_name}"

build/torch28-cxx11-cu126-x86_64-linux/torch_harmonics_attn/_torch_harmonics_attn_20251001150033.abi3.so ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:1a1f5426e6d758a776dab4a8ccd4abecbf516f0c53d9884b44746cf5585898af
+size 27627336

build/torch28-cxx11-cu128-x86_64-linux/torch_harmonics_attn/__init__.py ADDED Viewed

	@@ -0,0 +1,10 @@

+from ._attn_utils import backward, forward, forward_optimized, backward_optimized, _neighborhood_s2_attention_fwd_torch, _neighborhood_s2_attention_bwd_torch
+__all__ = [
+    "backward",
+    "forward",
+    "forward_optimized",
+    "backward_optimized",
+    "_neighborhood_s2_attention_fwd_torch",
+    "_neighborhood_s2_attention_bwd_torch",
+]

build/torch28-cxx11-cu128-x86_64-linux/torch_harmonics_attn/__pycache__/__init__.cpython-313.pyc ADDED Viewed

Binary file (436 Bytes). View file

build/torch28-cxx11-cu128-x86_64-linux/torch_harmonics_attn/__pycache__/_attn_utils.cpython-313.pyc ADDED Viewed

Binary file (27.2 kB). View file

build/torch28-cxx11-cu128-x86_64-linux/torch_harmonics_attn/__pycache__/_ops.cpython-313.pyc ADDED Viewed

Binary file (570 Bytes). View file

build/torch28-cxx11-cu128-x86_64-linux/torch_harmonics_attn/_attn_utils.py ADDED Viewed

	@@ -0,0 +1,637 @@

+# coding=utf-8
+# SPDX-FileCopyrightText: Copyright (c) 2025 The torch-harmonics Authors. All rights reserved.
+# SPDX-License-Identifier: BSD-3-Clause
+#
+# Redistribution and use in source and binary forms, with or without
+# modification, are permitted provided that the following conditions are met:
+#
+# 1. Redistributions of source code must retain the above copyright notice, this
+# list of conditions and the following disclaimer.
+#
+# 2. Redistributions in binary form must reproduce the above copyright notice,
+# this list of conditions and the following disclaimer in the documentation
+# and/or other materials provided with the distribution.
+#
+# 3. Neither the name of the copyright holder nor the names of its
+# contributors may be used to endorse or promote products derived from
+# this software without specific prior written permission.
+#
+# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+# DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+# SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+#
+from typing import Union, Tuple
+import torch
+import torch.nn.functional as F
+from ._ops import ops
+def backward(kx, vx, qy, dy, quad_weights, psi_col_idx, psi_row_off, nlon_in, nlat_out, nlon_out):
+    """Pass all arguments, in the given order, to the compiled op `s2_attention_bwd_dkvq_cuda` and return its result unchanged."""
+    return ops.s2_attention_bwd_dkvq_cuda(kx, vx, qy, dy, quad_weights, psi_col_idx, psi_row_off, nlon_in, nlat_out, nlon_out)
+def forward(kx, vx, qy, quad_weights, psi_col_idx, psi_row_off, nlon_in, nlat_out, nlon_out):
+    """Pass all arguments, in the given order, to the compiled op `s2_attention_fwd_cuda` and return its result unchanged."""
+    return ops.s2_attention_fwd_cuda(kx, vx, qy, quad_weights, psi_col_idx, psi_row_off, nlon_in, nlat_out, nlon_out)
+def _setup_context_attention_backward(ctx, inputs, output):
+    k, v, q, wk, wv, wq, bk, bv, bq, quad_weights, col_idx, row_off, max_psi_nnz, nh, nlon_in, nlat_out, nlon_out = inputs
+    ctx.save_for_backward(col_idx, row_off, quad_weights, k, v, q, wk, wv, wq, bk, bv, bq)
+    ctx.nh = nh
+    ctx.max_psi_nnz = max_psi_nnz
+    ctx.nlon_in = nlon_in
+    ctx.nlat_out = nlat_out
+    ctx.nlon_out = nlon_out
+def forward_default(kw: torch.Tensor, vw: torch.Tensor, qw: torch.Tensor,
+        quad_weights: torch.Tensor, col_idx: torch.Tensor, row_off: torch.Tensor,
+        nlon_in: int, nlat_out: int, nlon_out: int) -> torch.Tensor:
+    out_shape = (kw.shape[0], vw.shape[1], nlat_out, nlon_out)
+    return torch.empty(out_shape, dtype=kw.dtype, device=kw.device)
+def backward_default(kw: torch.Tensor, vw: torch.Tensor, qw: torch.Tensor, grad_output: torch.Tensor,
+        quad_weights: torch.Tensor, col_idx: torch.Tensor, row_off: torch.Tensor,
+        nlon_in: int, nlat_out: int, nlon_out: int) -> Tuple[torch.Tensor, torch.Tensor, torch.Tensor]:
+    dk = torch.empty_like(kw)
+    dv = torch.empty_like(vw)
+    dq = torch.empty_like(qw)
+    return dk, dv, dq
+    # forward
+def forward_optimized(k: torch.Tensor, v: torch.Tensor, q: torch.Tensor,
+                                            wk: torch.Tensor, wv: torch.Tensor, wq: torch.Tensor,
+                                            bk: Union[torch.Tensor, None], bv: Union[torch.Tensor, None], bq: Union[torch.Tensor, None],
+                                            quad_weights: torch.Tensor, col_idx: torch.Tensor, row_off: torch.Tensor,
+                                            max_psi_nnz: int, nh: int, nlon_in: int, nlat_out: int, nlon_out: int) -> torch.Tensor:
+    kw = F.conv2d(k, weight=wk, bias=bk)
+    vw = F.conv2d(v, weight=wv, bias=bv)
+    qw = F.conv2d(q, weight=wq, bias=bq)
+    # reshape, folding num heads into batch dim
+    B, _, H, W = kw.shape
+    kw = kw.reshape(B*nh, -1, H, W)
+    B, _, H, W = vw.shape
+    vw = vw.reshape(B*nh, -1, H, W)
+    B, _, H, W = qw.shape
+    qw = qw.reshape(B*nh, -1, H, W)
+    # convert to float32
+    inp_dtype = kw.dtype
+    kw = kw.to(torch.float32).contiguous()
+    vw = vw.to(torch.float32).contiguous()
+    qw = qw.to(torch.float32).contiguous()
+    output = forward(kw, vw, qw, quad_weights,
+                                                col_idx, row_off,
+                                                nlon_in, nlat_out, nlon_out)
+    _, C, H, W = output.shape
+    output = output.reshape(B, -1, H, W)
+    # convert back precision
+    output = output.to(dtype=inp_dtype)
+    return output
+def backward_optimized(ctx, grad_output):
+    col_idx, row_off, quad_weights, k, v, q, wk, wv, wq, bk, bv, bq = ctx.saved_tensors
+    nh = ctx.nh
+    max_psi_nnz = ctx.max_psi_nnz
+    nlon_in = ctx.nlon_in
+    nlat_out = ctx.nlat_out
+    nlon_out = ctx.nlon_out
+    # check if we need the grads at all
+    k_needs_grad = ctx.needs_input_grad[0]
+    v_needs_grad = ctx.needs_input_grad[1]
+    q_needs_grad = ctx.needs_input_grad[2]
+    wk_needs_grad = ctx.needs_input_grad[3]
+    wv_needs_grad = ctx.needs_input_grad[4]
+    wq_needs_grad = ctx.needs_input_grad[5]
+    bk_needs_grad = ctx.needs_input_grad[6]
+    bv_needs_grad = ctx.needs_input_grad[7]
+    bq_needs_grad = ctx.needs_input_grad[8]
+    kw = F.conv2d(k, weight=wk, bias=bk)
+    vw = F.conv2d(v, weight=wv, bias=bv)
+    qw = F.conv2d(q, weight=wq, bias=bq)
+    # reshape, folding num heads into batch dim
+    B, _, H, W = kw.shape
+    kw = kw.reshape(B*nh, -1, H, W)
+    B, _, H, W = vw.shape
+    vw = vw.reshape(B*nh, -1, H, W)
+    B, _, H, W = qw.shape
+    qw = qw.reshape(B*nh, -1, H, W)
+    B, _, H, W  = grad_output.shape
+    grad_output = grad_output.reshape(B*nh, -1, H, W)
+    # save type and convert to float32
+    kw_dtype = kw.dtype
+    vw_dtype = vw.dtype
+    qw_dtype = qw.dtype
+    kw = kw.to(torch.float32).contiguous()
+    vw = vw.to(torch.float32).contiguous()
+    qw = qw.to(torch.float32).contiguous()
+    grad_output = grad_output.to(torch.float32).contiguous()
+    dkw, dvw, dqw = backward(kw, vw, qw, grad_output,
+                                                       quad_weights,
+                                                       col_idx, row_off,
+                                                       nlon_in, nlat_out, nlon_out)
+    # weight grads
+    _, C, H, W = dkw.shape
+    dkw = dkw.reshape(B, -1, H, W)
+    dkw = dkw.to(dtype=kw_dtype)
+    if wk_needs_grad:
+        dwk = torch.einsum("bchw,bfhw->cf", dkw, k).reshape(*wk.shape).contiguous()
+    else:
+        dwk = None
+    _, C, H, W = dvw.shape
+    dvw = dvw.reshape(B, -1, H, W)
+    dvw = dvw.to(dtype=vw_dtype)
+    if wv_needs_grad:
+        dwv = torch.einsum("bchw,bfhw->cf", dvw, v).reshape(*wv.shape).contiguous()
+    else:
+        dwv = None
+    _, C, H, W = dqw.shape
+    dqw = dqw.reshape(B, -1, H, W)
+    dqw = dqw.to(dtype=qw_dtype)
+    if wq_needs_grad:
+        dwq = torch.einsum("bchw,bfhw->cf", dqw, q).reshape(*wq.shape).contiguous()
+    else:
+        dwq = None
+    # input grads
+    if v_needs_grad:
+        dv = torch.nn.functional.conv2d(dvw, weight=wv.permute([1,0,2,3]), bias=None)
+    else:
+        dv = None
+    if k_needs_grad:
+        dk = torch.nn.functional.conv2d(dkw, weight=wk.permute([1,0,2,3]), bias=None)
+    else:
+        dk = None
+    if q_needs_grad:
+        dq = torch.nn.functional.conv2d(dqw, weight=wq.permute([1,0,2,3]), bias=None)
+    else:
+        dq = None
+    # bias grads:
+    if bv_needs_grad:
+        dbv = torch.sum(dvw, dim=(0,2,3))
+    else:
+        dbv = None
+    if bk_needs_grad:
+        dbk = torch.sum(dkw, dim=(0,2,3))
+    else:
+        dbk = None
+    if bq_needs_grad:
+        dbq = torch.sum(dqw, dim=(0,2,3))
+    else:
+        dbq = None
+    return dk, dv, dq, dwk, dwv, dwq, dbk, dbv, dbq, \
+            None, None, None, None, None, None, None, None
+# torch kernels
+# uses qdotk_max update trick to avoid two loops when computing the softmax
+# see e.g., https://arxiv.org/abs/1805.02867
+# and https://alexdremov.me/understanding-flash-attention-writing-the-algorithm-from-scratch-in-triton/
+def _neighborhood_s2_attention_fwd_torch(kx: torch.Tensor, vx: torch.Tensor, qy: torch.Tensor,
+                                         quad_weights: torch.Tensor, col_idx: torch.Tensor, row_off: torch.Tensor,
+                                         nlon_in: int, nlat_out: int, nlon_out: int) -> torch.Tensor:
+    # prepare result tensor
+    out_shape = (qy.shape[0], vx.shape[1], nlat_out, nlon_out)
+    y = torch.zeros(out_shape, dtype=qy.dtype, device=qy.device)
+    for ho in range(nlat_out):
+	    # get number of nonzeros
+        zstart = row_off[ho]
+        zend = row_off[ho+1]
+        for wo in range(nlon_out):
+            alpha_sum = torch.zeros((y.shape[0],), dtype=y.dtype, device=y.device)
+            qdotk_max = torch.zeros((y.shape[0],), dtype=y.dtype, device=y.device)
+            for idz in range(zstart, zend):
+                nz_col_idx = col_idx[idz]
+                # compute input indices from psi datastructure
+                hi = nz_col_idx // nlon_in
+                # account for output shift and ensure positive index due to circular condition
+                wi = nz_col_idx % nlon_in
+                wip = (wi + wo) % nlon_in
+                # compute correlation & softmax numerator
+                q_ho_wo = qy[:, :, ho, wo]
+                k_hi_wip = kx[:, :, hi, wip]
+                qdotk = torch.sum(q_ho_wo * k_hi_wip, dim=1)
+                # tmp max
+                qdotk_max_tmp = torch.maximum(qdotk_max, qdotk)
+                # alpha sum update
+                alpha = torch.exp(qdotk - qdotk_max_tmp) * quad_weights[hi]
+                alpha_sum = alpha + alpha_sum * torch.exp(qdotk_max - qdotk_max_tmp)
+                # update output
+                y[:,:,ho,wo] = y[:,:,ho,wo] * torch.exp(qdotk_max - qdotk_max_tmp).unsqueeze(1) + alpha[:, None] * vx[:,:,hi,wip]
+                # define new max
+                qdotk_max = qdotk_max_tmp
+            y[:,:,ho,wo] = y[:,:,ho,wo] / alpha_sum[:, None]
+    return y
+# Explicit gradient w.r.t. vx: dM/dv
+# provided as a reference for CUDA & other hand-written gradients
+def _neighborhood_s2_attention_bwd_dv_torch(kx: torch.Tensor, vx: torch.Tensor, qy: torch.Tensor, dy: torch.Tensor,
+                                            quad_weights: torch.Tensor, col_idx: torch.Tensor, row_off: torch.Tensor,
+                                            nlon_in: int, nlat_out: int, nlon_out: int):
+    """
+    Reference gradient of the neighborhood attention output with respect to vx.
+
+    For each output point (ho, wo) the normalized attention weight of every
+    neighbor is recomputed and dy[:, :, ho, wo], scaled by that weight, is
+    accumulated into dvx at the neighbor's (hi, wip) location.
+
+    Returns:
+        dvx, a tensor shaped like vx.
+    """
+    # shapes:
+    # input
+    # kx: B, C, Hi, Wi
+    # vx: B, Cout, Hi, Wi
+    # qy: B, C, Ho, Wo  (multiplied with kx and summed over dim 1 below, so presumably C rather than Cout; confirm)
+    # quad_weights: Hi
+    # output
+    # dvx: B, Cout, Hi, Wi
+    dvx = torch.zeros_like(vx)
+    batch_size = dy.shape[0]
+    for ho in range(nlat_out):
+        # get number of nonzeros
+        zstart = row_off[ho]
+        zend = row_off[ho+1]
+        for wo in range(nlon_out):
+            # per-neighbor softmax numerators and raw logits for this output point
+            alpha_nz = torch.zeros((batch_size, zend-zstart), dtype=dy.dtype, device=dy.device)
+            qdotk_nz = torch.zeros((batch_size, zend-zstart), dtype=dy.dtype, device=dy.device)
+            alpha_sum = torch.zeros((batch_size,), dtype=dy.dtype, device=dy.device)
+            # pass 1: logits q . k for every neighbor
+            for idz in range(zstart, zend):
+                nz_col_idx = col_idx[idz]
+                # compute input indices from psi datastructure
+                hi = nz_col_idx // nlon_in
+                # account for output shift and ensure positive index due to circular condition
+                wi = nz_col_idx % nlon_in
+                wip = (wi+wo) % nlon_in
+                # compute correlation & softmax numerator
+                q_ho_wo = qy[:, :, ho, wo]
+                k_hi_wi = kx[:, :, hi, wip]
+                qdotk_nz[:,idz-zstart] = torch.sum(q_ho_wo * k_hi_wi, dim=1)
+            # maximum logit per batch entry, subtracted before exponentiating
+            qdotk_max, _ = torch.max(qdotk_nz, dim=1)
+            # pass 2: softmax numerators (including quadrature weight) and their sum
+            for idz in range(zstart, zend):
+                nz_col_idx = col_idx[idz]
+                # compute input indices from psi datastructure
+                hi = nz_col_idx // nlon_in
+                # account for output shift and ensure positive index due to circular condition
+                wi = nz_col_idx % nlon_in
+                wip = (wi+wo) % nlon_in
+                alpha_nz[:,idz-zstart] = torch.exp(qdotk_nz[:,idz-zstart] - qdotk_max) * quad_weights[hi]
+                alpha_sum[:] += alpha_nz[:,idz-zstart]
+            # pass 3: scatter the weighted upstream gradient onto each neighbor
+            for idz in range(zstart, zend):
+                nz_col_idx = col_idx[idz]
+                # compute input indices from psi datastructure
+                hi = nz_col_idx // nlon_in
+                # account for output shift and ensure positive index due to circular condition
+                wi = nz_col_idx % nlon_in
+                wip = (wi+wo) % nlon_in
+                dvx[:,:,hi, wip] += (alpha_nz[:, None, idz-zstart] / alpha_sum[:, None]) * dy[:,:,ho,wo]
+    return dvx
+# Explicit gradient w.r.t. kx: dM/dk
+# provided as a reference for CUDA & other hand-written gradients
+def _neighborhood_s2_attention_bwd_dk_torch(kx: torch.Tensor, vx: torch.Tensor, qy: torch.Tensor, dy: torch.Tensor,
+                                            quad_weights: torch.Tensor, col_idx: torch.Tensor, row_off: torch.Tensor,
+                                            nlon_in: int, nlat_out: int, nlon_out: int):
+    """Accumulate the gradient of the attention output w.r.t. the keys `kx`.
+
+    For every output point (ho, wo) the neighbors are the entries
+    col_idx[row_off[ho]:row_off[ho+1]] (CSR-style row offsets). Each entry encodes
+    an input latitude (entry // nlon_in) and a longitude (entry % nlon_in) that is
+    shifted by wo and wrapped modulo nlon_in.
+
+    Args:
+        kx, vx, qy: keys, values and queries (see the shape notes below).
+        dy: upstream gradient, indexed as dy[:, :, ho, wo]
+            (presumably B, Cout, Ho, Wo - confirm against the caller).
+        quad_weights: per-input-latitude weights multiplied into the softmax numerator.
+        col_idx, row_off: neighbor lists as described above.
+        nlon_in: number of input longitudes used to decode col_idx.
+        nlat_out, nlon_out: extent of the output grid that is looped over.
+
+    Returns:
+        dkx, a tensor shaped like kx.
+    """
+    # shapes:
+    # input
+    # kx: B, C, Hi, Wi
+    # vx: B, Cout, Hi, Wi
+    # qy: B, C, Ho, Wo
+    # quad_weights: Hi
+    # output
+    # dkx: B, C, Hi, Wi
+    dkx = torch.zeros_like(kx)
+    batch_size = dy.shape[0]
+    for ho in range(nlat_out):
+        # neighbor range of this output latitude
+        zstart = row_off[ho]
+        zend = row_off[ho+1]
+        for wo in range(nlon_out):
+            # per-output-point scratch buffers, one column per neighbor
+            qdotk_nz = torch.zeros((batch_size, zend-zstart), dtype=dy.dtype, device=dy.device)
+            integral = torch.zeros((batch_size,), dtype=dy.dtype, device=dy.device)
+            alpha = torch.zeros((batch_size, zend-zstart), dtype=dy.dtype, device=dy.device)
+            alpha_sum = torch.zeros((batch_size,), dtype=dy.dtype, device=dy.device)
+            # pass 1: raw q.k logits for every neighbor
+            for idz in range(zstart, zend):
+                nz_col_idx = col_idx[idz]
+                # compute input indices from psi datastructure
+                hj = nz_col_idx // nlon_in
+                # account for output shift and ensure positive index due to circular condition
+                wj = nz_col_idx % nlon_in
+                wjp = (wj+wo) % nlon_in
+                # compute correlation & softmax numerator
+                q_ho_wo = qy[:, :, ho, wo]
+                k_hj_wjp = kx[:, :, hj, wjp]
+                qdotk_nz[:,idz-zstart] = torch.sum(q_ho_wo * k_hj_wjp, dim=1)
+            # subtract the per-sample maximum before exponentiating (numerical stability)
+            qdotk_max, _ = torch.max(qdotk_nz, dim=1)
+            # pass 2: unnormalized weights, their sum, and the weighted sum of dy.v
+            for idz in range(zstart, zend):
+                nz_col_idx = col_idx[idz]
+                # compute input indices from psi datastructure
+                hj = nz_col_idx // nlon_in
+                # account for output shift and ensure positive index due to circular condition
+                wj = nz_col_idx % nlon_in
+                wjp = (wj+wo) % nlon_in
+                alpha[:, idz-zstart] = torch.exp(qdotk_nz[:,idz-zstart] - qdotk_max) * quad_weights[hj]
+                alpha_sum[:] += alpha[:, idz-zstart]
+                # input dot
+                gdotv = torch.sum(dy[:,:,ho, wo] * vx[:,:,hj, wjp], dim=1)
+                # integral term
+                integral[:] += alpha[:, idz-zstart] * gdotv[:]
+            # normalize: attention-weighted mean of dy.v over the neighborhood
+            integral[:] = integral[:] / alpha_sum[:]
+            # pass 3: scatter each neighbor's contribution into dkx
+            for idz in range(zstart, zend):
+                nz_col_idx = col_idx[idz]
+                # compute input indices from psi datastructure
+                hi = nz_col_idx // nlon_in
+                # account for output shift and ensure positive index due to circular condition
+                wi = nz_col_idx % nlon_in
+                wip = (wi+wo) % nlon_in
+                # upstream gradient projected onto the value at this neighbor
+                gdotv = torch.sum(dy[:,:,ho, wo] * vx[:,:,hi, wip], dim=1)
+                # q * normalized weight * (dy.v at this neighbor - weighted mean of dy.v)
+                dkx[:,:,hi,wip] += qy[:, :, ho, wo] * (alpha[:, None, idz-zstart] / alpha_sum[:, None]) * (gdotv[:, None] - integral[:, None])
+    return dkx
+# Explicit gradient w.r.t. qy: dM/dq
+# provided as a reference for CUDA & other hand-written gradients
+def _neighborhood_s2_attention_bwd_dq_torch(kx: torch.Tensor, vx: torch.Tensor, qy: torch.Tensor, dy: torch.Tensor,
+                                            quad_weights: torch.Tensor, col_idx: torch.Tensor, row_off: torch.Tensor,
+                                            nlon_in: int, nlat_out: int, nlon_out: int):
+    # shapes:
+    # input
+    # kx: B, C, Hi, Wi
+    # vx: B, Cout, Hi, Wi
+    # qy: B, C, Ho, Wo
+    # quad_weights: Hi
+    # output
+    # dq: B, C, Ho, Wo
+    batch_size = dy.shape[0]
+    channels_in = kx.shape[1]
+    channels_out = vx.shape[1]
+    dqy = torch.zeros_like(qy)
+    for ho in range(nlat_out):
+        # get number of nonzeros
+        zstart = row_off[ho]
+        zend = row_off[ho+1]
+        for wo in range(nlon_out):
+            alpha = torch.zeros((batch_size, zend-zstart), dtype=dy.dtype, device=dy.device)
+            qdotk_nz = torch.zeros((batch_size, zend-zstart), dtype=dy.dtype, device=dy.device)
+            alpha_k = torch.zeros((batch_size, channels_in), dtype=dy.dtype, device=dy.device)
+            alpha_vw = torch.zeros((batch_size,), dtype=dy.dtype, device=dy.device)
+            alpha_kvw = torch.zeros((batch_size, channels_in), dtype=dy.dtype, device=dy.device)
+            alpha_sum = torch.zeros((batch_size,), dtype=dy.dtype, device=dy.device)
+            alpha_sum2 = torch.zeros((batch_size,), dtype=dy.dtype, device=dy.device)
+            for idz in range(zstart, zend):
+                nz_col_idx = col_idx[idz]
+                # compute input indices from psi datastructure
+                hi = nz_col_idx // nlon_in
+                # account for output shift and ensure positive index due to circular condition
+                wi = nz_col_idx % nlon_in
+                wip = (wi+wo) % nlon_in
+                idz_i = idz-zstart
+                # compute correlation & softmax numerator
+                q_ho_wo = qy[:, :, ho, wo]
+                k_hi_wi = kx[:, :, hi, wip]
+                qdotk_nz[:,idz-zstart] = torch.sum(q_ho_wo * k_hi_wi, dim=1)
+            qdotk_max,_ = qdotk_nz.max(dim=1)
+            for idz in range(zstart, zend):
+                nz_col_idx = col_idx[idz]
+                # compute input indices from psi datastructure
+                hi = nz_col_idx // nlon_in
+                # account for output shift and ensure positive index due to circular condition
+                wi = nz_col_idx % nlon_in
+                wip = (wi+wo) % nlon_in
+                q_ho_wo = qy[:, :, ho, wo]
+                k_hi_wi = kx[:, :, hi, wip]
+                idz_i = idz-zstart
+                alpha[:, idz_i] = torch.exp(qdotk_nz[:,idz-zstart] - qdotk_max) * quad_weights[hi]
+                alpha_sum[:] += alpha[:, idz_i]
+                gdotv = torch.sum(dy[:,:,ho, wo] * vx[:,:,hi, wip], dim=1)
+                alpha_k[:,:] += alpha[:, None, idz_i] * k_hi_wi
+                alpha_vw[:] += alpha[:, idz_i] * gdotv[:]
+                alpha_kvw[:,:] += alpha[:, None, idz_i] * k_hi_wi * gdotv[:,None]
+            dqy[:,:,ho,wo] = (alpha_kvw * alpha_sum[:,None] - alpha_vw[:, None] * alpha_k) / (alpha_sum[:,None] * alpha_sum[:,None])
+    return dqy
+def _neighborhood_s2_attention_torch(k: torch.Tensor, v: torch.Tensor, q: torch.Tensor,
+                                     wk: torch.Tensor, wv: torch.Tensor, wq: torch.Tensor,
+                                     bk: Union[torch.Tensor, None], bv: Union[torch.Tensor, None], bq: Union[torch.Tensor, None],
+                                     quad_weights: torch.Tensor, col_idx: torch.Tensor, row_off: torch.Tensor,
+                                     max_psi_nnz: int, nh: int, nlon_in: int, nlat_out: int, nlon_out: int) -> torch.Tensor:
+    kw = F.conv2d(k, weight=wk, bias=bk)
+    vw = F.conv2d(v, weight=wv, bias=bv)
+    qw = F.conv2d(q, weight=wq, bias=bq)
+    # reshape, folding num heads into batch dim
+    B, _, H, W = kw.shape
+    kw = kw.reshape(B*nh, -1, H, W)
+    B, _, H, W = vw.shape
+    vw = vw.reshape(B*nh, -1, H, W)
+    B, _, H, W = qw.shape
+    qw = qw.reshape(B*nh, -1, H, W)
+    kw = kw.to(torch.float32)
+    vw = vw.to(torch.float32)
+    qw = qw.to(torch.float32)
+    output = _neighborhood_s2_attention_fwd_torch(kw, vw, qw, quad_weights,
+                                                  col_idx, row_off,
+                                                  nlon_in, nlat_out, nlon_out)
+    _, C, H, W = output.shape
+    output = output.reshape(B, -1, H, W)
+    return output
+def _neighborhood_s2_attention_bwd_torch(ctx, grad_output):
+    """Backward pass built on the pure-PyTorch reference gradient kernels.
+
+    Recomputes the k/v/q projections from the saved tensors, folds the heads into
+    the batch dimension, evaluates only the reference gradients that are needed
+    according to ctx.needs_input_grad, and chains them through the projections.
+
+    NOTE(review): the weight gradients (einsum + reshape to the weight shape) and
+    the input gradients (conv2d with the first two weight axes swapped) appear to
+    assume 1x1 projection kernels - confirm against the module that builds wk/wv/wq.
+
+    Returns:
+        A 17-tuple: dk, dv, dq, dwk, dwv, dwq, dbk, dbv, dbq followed by eight
+        None placeholders. Entries whose gradient is not required are None.
+    """
+    col_idx, row_off, quad_weights, k, v, q, wk, wv, wq, bk, bv, bq = ctx.saved_tensors
+    nh = ctx.nh
+    nlon_in = ctx.nlon_in
+    nlat_out = ctx.nlat_out
+    nlon_out = ctx.nlon_out
+    # check if we need the grads at all
+    k_needs_grad = ctx.needs_input_grad[0]
+    v_needs_grad = ctx.needs_input_grad[1]
+    q_needs_grad = ctx.needs_input_grad[2]
+    wk_needs_grad = ctx.needs_input_grad[3]
+    wv_needs_grad = ctx.needs_input_grad[4]
+    wq_needs_grad = ctx.needs_input_grad[5]
+    bk_needs_grad = ctx.needs_input_grad[6]
+    bv_needs_grad = ctx.needs_input_grad[7]
+    bq_needs_grad = ctx.needs_input_grad[8]
+    # recompute the projections; they are not saved by the forward pass
+    kw = F.conv2d(k, weight=wk, bias=bk)
+    vw = F.conv2d(v, weight=wv, bias=bv)
+    qw = F.conv2d(q, weight=wq, bias=bq)
+    # reshape, folding num heads into batch dim
+    B, _, H, W = kw.shape
+    kw = kw.reshape(B*nh, -1, H, W)
+    B, _, H, W = vw.shape
+    vw = vw.reshape(B*nh, -1, H, W)
+    B, _, H, W = qw.shape
+    qw = qw.reshape(B*nh, -1, H, W)
+    # from here on B is the (unfolded) batch size of grad_output
+    B, _, H, W  = grad_output.shape
+    grad_output = grad_output.reshape(B*nh, -1, H, W)
+    # gradient w.r.t. the projected values, unfolded back to batch size B
+    if v_needs_grad or wv_needs_grad or bv_needs_grad:
+        dvw = _neighborhood_s2_attention_bwd_dv_torch(kw, vw, qw, grad_output,
+                                                      quad_weights,
+                                                      col_idx, row_off,
+                                                      nlon_in, nlat_out, nlon_out)
+        _, C, H, W = dvw.shape
+        dvw = dvw.reshape(B, -1, H, W)
+    else:
+        dvw = None
+    # gradient w.r.t. the projected keys
+    if k_needs_grad or wk_needs_grad or bk_needs_grad:
+        dkw = _neighborhood_s2_attention_bwd_dk_torch(kw, vw, qw, grad_output,
+                                                      quad_weights,
+                                                      col_idx, row_off,
+                                                      nlon_in, nlat_out, nlon_out)
+        _, C, H, W = dkw.shape
+        dkw = dkw.reshape(B, -1, H, W)
+    else:
+        dkw = None
+    # gradient w.r.t. the projected queries
+    if q_needs_grad or wq_needs_grad or bq_needs_grad:
+        dqw = _neighborhood_s2_attention_bwd_dq_torch(kw, vw, qw, grad_output,
+                                                      quad_weights,
+                                                      col_idx, row_off,
+                                                      nlon_in, nlat_out, nlon_out)
+        _, C, H, W = dqw.shape
+        dqw = dqw.reshape(B, -1, H, W)
+    else:
+        dqw = None
+    # input grads
+    # conv2d with the weight's first two axes swapped maps projected grads back to input channels
+    if v_needs_grad:
+        dv = torch.nn.functional.conv2d(dvw, weight=wv.permute([1,0,2,3]), bias=None)
+    else:
+        dv = None
+    if k_needs_grad:
+        dk = torch.nn.functional.conv2d(dkw, weight=wk.permute([1,0,2,3]), bias=None)
+    else:
+        dk = None
+    if q_needs_grad:
+        dq = torch.nn.functional.conv2d(dqw, weight=wq.permute([1,0,2,3]), bias=None)
+    else:
+        dq = None
+    # weight grads
+    # contract batch and spatial axes, leaving an (out_channels, in_channels) matrix
+    if wv_needs_grad:
+        dwv = torch.einsum("bchw,bfhw->cf", dvw, v).reshape(*wv.shape).contiguous()
+    else:
+        dwv = None
+    if wk_needs_grad:
+        dwk = torch.einsum("bchw,bfhw->cf", dkw, k).reshape(*wk.shape).contiguous()
+    else:
+        dwk = None
+    if wq_needs_grad:
+        dwq = torch.einsum("bchw,bfhw->cf", dqw, q).reshape(*wq.shape).contiguous()
+    else:
+        dwq = None
+    # bias grads:
+    # sum over every axis except the channel axis
+    if bv_needs_grad:
+        dbv = torch.sum(dvw, dim=(0,2,3))
+    else:
+        dbv = None
+    if bk_needs_grad:
+        dbk = torch.sum(dkw, dim=(0,2,3))
+    else:
+        dbk = None
+    if bq_needs_grad:
+        dbq = torch.sum(dqw, dim=(0,2,3))
+    else:
+        dbq = None
+    # nine gradients followed by eight None placeholders (17 entries in total)
+    return dk, dv, dq, dwk, dwv, dwq, dbk, dbv, dbq, \
+            None, None, None, None, None, None, None, None

build/torch28-cxx11-cu128-x86_64-linux/torch_harmonics_attn/_ops.py ADDED Viewed

	@@ -0,0 +1,9 @@

+import torch
+from . import _torch_harmonics_attn_20251001150033
+ops = torch.ops._torch_harmonics_attn_20251001150033
+def add_op_namespace_prefix(op_name: str):
+    """
+    Prefix op by namespace.
+    """
+    return f"_torch_harmonics_attn_20251001150033::{op_name}"

build/torch28-cxx11-cu128-x86_64-linux/torch_harmonics_attn/_torch_harmonics_attn_20251001150033.abi3.so ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:e3f834671fd44bea1d2e3cd23d4f99f5cb61ec7822b028830000b358f70797fe
+size 35321056

build/torch28-cxx11-cu129-x86_64-linux/torch_harmonics_attn/__init__.py ADDED Viewed

	@@ -0,0 +1,10 @@

+from ._attn_utils import backward, forward, forward_optimized, backward_optimized, _neighborhood_s2_attention_fwd_torch, _neighborhood_s2_attention_bwd_torch
+__all__ = [
+    "backward",
+    "forward",
+    "forward_optimized",
+    "backward_optimized",
+    "_neighborhood_s2_attention_fwd_torch",
+    "_neighborhood_s2_attention_bwd_torch",
+]

build/torch28-cxx11-cu129-x86_64-linux/torch_harmonics_attn/__pycache__/__init__.cpython-313.pyc ADDED Viewed

Binary file (436 Bytes). View file

build/torch28-cxx11-cu129-x86_64-linux/torch_harmonics_attn/__pycache__/_attn_utils.cpython-313.pyc ADDED Viewed

Binary file (27.2 kB). View file

build/torch28-cxx11-cu129-x86_64-linux/torch_harmonics_attn/__pycache__/_ops.cpython-313.pyc ADDED Viewed

Binary file (570 Bytes). View file

build/torch28-cxx11-cu129-x86_64-linux/torch_harmonics_attn/_attn_utils.py ADDED Viewed

	@@ -0,0 +1,637 @@

+# coding=utf-8
+# SPDX-FileCopyrightText: Copyright (c) 2025 The torch-harmonics Authors. All rights reserved.
+# SPDX-License-Identifier: BSD-3-Clause
+#
+# Redistribution and use in source and binary forms, with or without
+# modification, are permitted provided that the following conditions are met:
+#
+# 1. Redistributions of source code must retain the above copyright notice, this
+# list of conditions and the following disclaimer.
+#
+# 2. Redistributions in binary form must reproduce the above copyright notice,
+# this list of conditions and the following disclaimer in the documentation
+# and/or other materials provided with the distribution.
+#
+# 3. Neither the name of the copyright holder nor the names of its
+# contributors may be used to endorse or promote products derived from
+# this software without specific prior written permission.
+#
+# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+# DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+# SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+#
+from typing import Union, Tuple
+import torch
+import torch.nn.functional as F
+from ._ops import ops
+def backward(kx, vx, qy, dy, quad_weights, psi_col_idx, psi_row_off, nlon_in, nlat_out, nlon_out):
+    return ops.s2_attention_bwd_dkvq_cuda(kx, vx, qy, dy, quad_weights, psi_col_idx, psi_row_off, nlon_in, nlat_out, nlon_out)
+def forward(kx, vx, qy, quad_weights, psi_col_idx, psi_row_off, nlon_in, nlat_out, nlon_out):
+    return ops.s2_attention_fwd_cuda(kx, vx, qy, quad_weights, psi_col_idx, psi_row_off, nlon_in, nlat_out, nlon_out)
+def _setup_context_attention_backward(ctx, inputs, output):
+    k, v, q, wk, wv, wq, bk, bv, bq, quad_weights, col_idx, row_off, max_psi_nnz, nh, nlon_in, nlat_out, nlon_out = inputs
+    ctx.save_for_backward(col_idx, row_off, quad_weights, k, v, q, wk, wv, wq, bk, bv, bq)
+    ctx.nh = nh
+    ctx.max_psi_nnz = max_psi_nnz
+    ctx.nlon_in = nlon_in
+    ctx.nlat_out = nlat_out
+    ctx.nlon_out = nlon_out
+def forward_default(kw: torch.Tensor, vw: torch.Tensor, qw: torch.Tensor,
+        quad_weights: torch.Tensor, col_idx: torch.Tensor, row_off: torch.Tensor,
+        nlon_in: int, nlat_out: int, nlon_out: int) -> torch.Tensor:
+    out_shape = (kw.shape[0], vw.shape[1], nlat_out, nlon_out)
+    return torch.empty(out_shape, dtype=kw.dtype, device=kw.device)
+def backward_default(kw: torch.Tensor, vw: torch.Tensor, qw: torch.Tensor, grad_output: torch.Tensor,
+        quad_weights: torch.Tensor, col_idx: torch.Tensor, row_off: torch.Tensor,
+        nlon_in: int, nlat_out: int, nlon_out: int) -> Tuple[torch.Tensor, torch.Tensor, torch.Tensor]:
+    dk = torch.empty_like(kw)
+    dv = torch.empty_like(vw)
+    dq = torch.empty_like(qw)
+    return dk, dv, dq
+    # forward
+def forward_optimized(k: torch.Tensor, v: torch.Tensor, q: torch.Tensor,
+                                            wk: torch.Tensor, wv: torch.Tensor, wq: torch.Tensor,
+                                            bk: Union[torch.Tensor, None], bv: Union[torch.Tensor, None], bq: Union[torch.Tensor, None],
+                                            quad_weights: torch.Tensor, col_idx: torch.Tensor, row_off: torch.Tensor,
+                                            max_psi_nnz: int, nh: int, nlon_in: int, nlat_out: int, nlon_out: int) -> torch.Tensor:
+    kw = F.conv2d(k, weight=wk, bias=bk)
+    vw = F.conv2d(v, weight=wv, bias=bv)
+    qw = F.conv2d(q, weight=wq, bias=bq)
+    # reshape, folding num heads into batch dim
+    B, _, H, W = kw.shape
+    kw = kw.reshape(B*nh, -1, H, W)
+    B, _, H, W = vw.shape
+    vw = vw.reshape(B*nh, -1, H, W)
+    B, _, H, W = qw.shape
+    qw = qw.reshape(B*nh, -1, H, W)
+    # convert to float32
+    inp_dtype = kw.dtype
+    kw = kw.to(torch.float32).contiguous()
+    vw = vw.to(torch.float32).contiguous()
+    qw = qw.to(torch.float32).contiguous()
+    output = forward(kw, vw, qw, quad_weights,
+                                                col_idx, row_off,
+                                                nlon_in, nlat_out, nlon_out)
+    _, C, H, W = output.shape
+    output = output.reshape(B, -1, H, W)
+    # convert back precision
+    output = output.to(dtype=inp_dtype)
+    return output
+def backward_optimized(ctx, grad_output):
+    """Backward pass that obtains dk/dv/dq of the projections from the `backward` kernel wrapper.
+
+    Recomputes the k/v/q projections from the saved tensors, folds the heads into
+    the batch dimension, runs the kernel on contiguous float32 copies and chains
+    the results through the projections. The kernel is always invoked; the
+    ctx.needs_input_grad flags only decide which input/weight/bias gradients are
+    derived from its output.
+
+    NOTE(review): the weight gradients (einsum + reshape to the weight shape) and
+    the input gradients (conv2d with the first two weight axes swapped) appear to
+    assume 1x1 projection kernels - confirm against the module that builds wk/wv/wq.
+
+    Returns:
+        A 17-tuple: dk, dv, dq, dwk, dwv, dwq, dbk, dbv, dbq followed by eight
+        None placeholders. Entries whose gradient is not required are None.
+    """
+    col_idx, row_off, quad_weights, k, v, q, wk, wv, wq, bk, bv, bq = ctx.saved_tensors
+    nh = ctx.nh
+    # max_psi_nnz is read from ctx but not used below
+    max_psi_nnz = ctx.max_psi_nnz
+    nlon_in = ctx.nlon_in
+    nlat_out = ctx.nlat_out
+    nlon_out = ctx.nlon_out
+    # check if we need the grads at all
+    k_needs_grad = ctx.needs_input_grad[0]
+    v_needs_grad = ctx.needs_input_grad[1]
+    q_needs_grad = ctx.needs_input_grad[2]
+    wk_needs_grad = ctx.needs_input_grad[3]
+    wv_needs_grad = ctx.needs_input_grad[4]
+    wq_needs_grad = ctx.needs_input_grad[5]
+    bk_needs_grad = ctx.needs_input_grad[6]
+    bv_needs_grad = ctx.needs_input_grad[7]
+    bq_needs_grad = ctx.needs_input_grad[8]
+    # recompute the projections; they are not saved by the forward pass
+    kw = F.conv2d(k, weight=wk, bias=bk)
+    vw = F.conv2d(v, weight=wv, bias=bv)
+    qw = F.conv2d(q, weight=wq, bias=bq)
+    # reshape, folding num heads into batch dim
+    B, _, H, W = kw.shape
+    kw = kw.reshape(B*nh, -1, H, W)
+    B, _, H, W = vw.shape
+    vw = vw.reshape(B*nh, -1, H, W)
+    B, _, H, W = qw.shape
+    qw = qw.reshape(B*nh, -1, H, W)
+    # from here on B is the (unfolded) batch size of grad_output
+    B, _, H, W  = grad_output.shape
+    grad_output = grad_output.reshape(B*nh, -1, H, W)
+    # save type and convert to float32
+    kw_dtype = kw.dtype
+    vw_dtype = vw.dtype
+    qw_dtype = qw.dtype
+    kw = kw.to(torch.float32).contiguous()
+    vw = vw.to(torch.float32).contiguous()
+    qw = qw.to(torch.float32).contiguous()
+    grad_output = grad_output.to(torch.float32).contiguous()
+    # a single kernel call yields the gradients w.r.t. all three projections
+    dkw, dvw, dqw = backward(kw, vw, qw, grad_output,
+                                                       quad_weights,
+                                                       col_idx, row_off,
+                                                       nlon_in, nlat_out, nlon_out)
+    # weight grads
+    # each projected gradient is first unfolded to batch size B and cast back to its saved dtype
+    _, C, H, W = dkw.shape
+    dkw = dkw.reshape(B, -1, H, W)
+    dkw = dkw.to(dtype=kw_dtype)
+    if wk_needs_grad:
+        dwk = torch.einsum("bchw,bfhw->cf", dkw, k).reshape(*wk.shape).contiguous()
+    else:
+        dwk = None
+    _, C, H, W = dvw.shape
+    dvw = dvw.reshape(B, -1, H, W)
+    dvw = dvw.to(dtype=vw_dtype)
+    if wv_needs_grad:
+        dwv = torch.einsum("bchw,bfhw->cf", dvw, v).reshape(*wv.shape).contiguous()
+    else:
+        dwv = None
+    _, C, H, W = dqw.shape
+    dqw = dqw.reshape(B, -1, H, W)
+    dqw = dqw.to(dtype=qw_dtype)
+    if wq_needs_grad:
+        dwq = torch.einsum("bchw,bfhw->cf", dqw, q).reshape(*wq.shape).contiguous()
+    else:
+        dwq = None
+    # input grads
+    # conv2d with the weight's first two axes swapped maps projected grads back to input channels
+    if v_needs_grad:
+        dv = torch.nn.functional.conv2d(dvw, weight=wv.permute([1,0,2,3]), bias=None)
+    else:
+        dv = None
+    if k_needs_grad:
+        dk = torch.nn.functional.conv2d(dkw, weight=wk.permute([1,0,2,3]), bias=None)
+    else:
+        dk = None
+    if q_needs_grad:
+        dq = torch.nn.functional.conv2d(dqw, weight=wq.permute([1,0,2,3]), bias=None)
+    else:
+        dq = None
+    # bias grads:
+    # sum over every axis except the channel axis
+    if bv_needs_grad:
+        dbv = torch.sum(dvw, dim=(0,2,3))
+    else:
+        dbv = None
+    if bk_needs_grad:
+        dbk = torch.sum(dkw, dim=(0,2,3))
+    else:
+        dbk = None
+    if bq_needs_grad:
+        dbq = torch.sum(dqw, dim=(0,2,3))
+    else:
+        dbq = None
+    # nine gradients followed by eight None placeholders (17 entries in total)
+    return dk, dv, dq, dwk, dwv, dwq, dbk, dbv, dbq, \
+            None, None, None, None, None, None, None, None
+# torch kernels
+# uses qdotk_max update trick to avoid two loops when computing the softmax
+# see e.g., https://arxiv.org/abs/1805.02867
+# and https://alexdremov.me/understanding-flash-attention-writing-the-algorithm-from-scratch-in-triton/
+def _neighborhood_s2_attention_fwd_torch(kx: torch.Tensor, vx: torch.Tensor, qy: torch.Tensor,
+                                         quad_weights: torch.Tensor, col_idx: torch.Tensor, row_off: torch.Tensor,
+                                         nlon_in: int, nlat_out: int, nlon_out: int) -> torch.Tensor:
+    # prepare result tensor
+    out_shape = (qy.shape[0], vx.shape[1], nlat_out, nlon_out)
+    y = torch.zeros(out_shape, dtype=qy.dtype, device=qy.device)
+    for ho in range(nlat_out):
+	    # get number of nonzeros
+        zstart = row_off[ho]
+        zend = row_off[ho+1]
+        for wo in range(nlon_out):
+            alpha_sum = torch.zeros((y.shape[0],), dtype=y.dtype, device=y.device)
+            qdotk_max = torch.zeros((y.shape[0],), dtype=y.dtype, device=y.device)
+            for idz in range(zstart, zend):
+                nz_col_idx = col_idx[idz]
+                # compute input indices from psi datastructure
+                hi = nz_col_idx // nlon_in
+                # account for output shift and ensure positive index due to circular condition
+                wi = nz_col_idx % nlon_in
+                wip = (wi + wo) % nlon_in
+                # compute correlation & softmax numerator
+                q_ho_wo = qy[:, :, ho, wo]
+                k_hi_wip = kx[:, :, hi, wip]
+                qdotk = torch.sum(q_ho_wo * k_hi_wip, dim=1)
+                # tmp max
+                qdotk_max_tmp = torch.maximum(qdotk_max, qdotk)
+                # alpha sum update
+                alpha = torch.exp(qdotk - qdotk_max_tmp) * quad_weights[hi]
+                alpha_sum = alpha + alpha_sum * torch.exp(qdotk_max - qdotk_max_tmp)
+                # update output
+                y[:,:,ho,wo] = y[:,:,ho,wo] * torch.exp(qdotk_max - qdotk_max_tmp).unsqueeze(1) + alpha[:, None] * vx[:,:,hi,wip]
+                # define new max
+                qdotk_max = qdotk_max_tmp
+            y[:,:,ho,wo] = y[:,:,ho,wo] / alpha_sum[:, None]
+    return y
+# Explicit gradient w.r.t. vx: dM/dv
+# provided as a reference for CUDA & other hand-written gradients
+def _neighborhood_s2_attention_bwd_dv_torch(kx: torch.Tensor, vx: torch.Tensor, qy: torch.Tensor, dy: torch.Tensor,
+                                            quad_weights: torch.Tensor, col_idx: torch.Tensor, row_off: torch.Tensor,
+                                            nlon_in: int, nlat_out: int, nlon_out: int):
+    """Accumulate the gradient of the attention output w.r.t. the values `vx`.
+
+    For every output point (ho, wo) the neighbors are the entries
+    col_idx[row_off[ho]:row_off[ho+1]]. Each entry encodes an input latitude
+    (entry // nlon_in) and a longitude (entry % nlon_in) that is shifted by wo and
+    wrapped modulo nlon_in. Each neighbor receives dy[:, :, ho, wo] scaled by its
+    normalized attention weight.
+
+    Returns:
+        dvx, a tensor shaped like vx.
+    """
+    # shapes:
+    # input
+    # kx: B, C, Hi, Wi
+    # vx: B, Cout, Hi, Wi
+    # qy: B, C, Ho, Wo
+    # quad_weights: Hi
+    # output
+    # dvx: B, Cout, Hi, Wi
+    dvx = torch.zeros_like(vx)
+    batch_size = dy.shape[0]
+    for ho in range(nlat_out):
+        # neighbor range of this output latitude
+        zstart = row_off[ho]
+        zend = row_off[ho+1]
+        for wo in range(nlon_out):
+            # per-output-point scratch buffers, one column per neighbor
+            alpha_nz = torch.zeros((batch_size, zend-zstart), dtype=dy.dtype, device=dy.device)
+            qdotk_nz = torch.zeros((batch_size, zend-zstart), dtype=dy.dtype, device=dy.device)
+            alpha_sum = torch.zeros((batch_size,), dtype=dy.dtype, device=dy.device)
+            # pass 1: raw q.k logits for every neighbor
+            for idz in range(zstart, zend):
+                nz_col_idx = col_idx[idz]
+                # compute input indices from psi datastructure
+                hi = nz_col_idx // nlon_in
+                # account for output shift and ensure positive index due to circular condition
+                wi = nz_col_idx % nlon_in
+                wip = (wi+wo) % nlon_in
+                # compute correlation & softmax numerator
+                q_ho_wo = qy[:, :, ho, wo]
+                k_hi_wi = kx[:, :, hi, wip]
+                qdotk_nz[:,idz-zstart] = torch.sum(q_ho_wo * k_hi_wi, dim=1)
+            # subtract the per-sample maximum before exponentiating (numerical stability)
+            qdotk_max, _ = torch.max(qdotk_nz, dim=1)
+            # pass 2: unnormalized weights and their sum (wi/wip are computed but not needed here)
+            for idz in range(zstart, zend):
+                nz_col_idx = col_idx[idz]
+                # compute input indices from psi datastructure
+                hi = nz_col_idx // nlon_in
+                # account for output shift and ensure positive index due to circular condition
+                wi = nz_col_idx % nlon_in
+                wip = (wi+wo) % nlon_in
+                alpha_nz[:,idz-zstart] = torch.exp(qdotk_nz[:,idz-zstart] - qdotk_max) * quad_weights[hi]
+                alpha_sum[:] += alpha_nz[:,idz-zstart]
+            # pass 3: scatter dy, scaled by the normalized weight, into each neighbor
+            for idz in range(zstart, zend):
+                nz_col_idx = col_idx[idz]
+                # compute input indices from psi datastructure
+                hi = nz_col_idx // nlon_in
+                # account for output shift and ensure positive index due to circular condition
+                wi = nz_col_idx % nlon_in
+                wip = (wi+wo) % nlon_in
+                dvx[:,:,hi, wip] += (alpha_nz[:, None, idz-zstart] / alpha_sum[:, None]) * dy[:,:,ho,wo]
+    return dvx
+# Explicit gradient w.r.t. kx: dM/dk
+# provided as a reference for CUDA & other hand-written gradients
+def _neighborhood_s2_attention_bwd_dk_torch(kx: torch.Tensor, vx: torch.Tensor, qy: torch.Tensor, dy: torch.Tensor,
+                                            quad_weights: torch.Tensor, col_idx: torch.Tensor, row_off: torch.Tensor,
+                                            nlon_in: int, nlat_out: int, nlon_out: int):
+    """Accumulate the gradient of the attention output w.r.t. the keys `kx`.
+
+    For every output point (ho, wo) the neighbors are the entries
+    col_idx[row_off[ho]:row_off[ho+1]] (CSR-style row offsets). Each entry encodes
+    an input latitude (entry // nlon_in) and a longitude (entry % nlon_in) that is
+    shifted by wo and wrapped modulo nlon_in.
+
+    Args:
+        kx, vx, qy: keys, values and queries (see the shape notes below).
+        dy: upstream gradient, indexed as dy[:, :, ho, wo]
+            (presumably B, Cout, Ho, Wo - confirm against the caller).
+        quad_weights: per-input-latitude weights multiplied into the softmax numerator.
+        col_idx, row_off: neighbor lists as described above.
+        nlon_in: number of input longitudes used to decode col_idx.
+        nlat_out, nlon_out: extent of the output grid that is looped over.
+
+    Returns:
+        dkx, a tensor shaped like kx.
+    """
+    # shapes:
+    # input
+    # kx: B, C, Hi, Wi
+    # vx: B, Cout, Hi, Wi
+    # qy: B, C, Ho, Wo
+    # quad_weights: Hi
+    # output
+    # dkx: B, C, Hi, Wi
+    dkx = torch.zeros_like(kx)
+    batch_size = dy.shape[0]
+    for ho in range(nlat_out):
+        # neighbor range of this output latitude
+        zstart = row_off[ho]
+        zend = row_off[ho+1]
+        for wo in range(nlon_out):
+            # per-output-point scratch buffers, one column per neighbor
+            qdotk_nz = torch.zeros((batch_size, zend-zstart), dtype=dy.dtype, device=dy.device)
+            integral = torch.zeros((batch_size,), dtype=dy.dtype, device=dy.device)
+            alpha = torch.zeros((batch_size, zend-zstart), dtype=dy.dtype, device=dy.device)
+            alpha_sum = torch.zeros((batch_size,), dtype=dy.dtype, device=dy.device)
+            # pass 1: raw q.k logits for every neighbor
+            for idz in range(zstart, zend):
+                nz_col_idx = col_idx[idz]
+                # compute input indices from psi datastructure
+                hj = nz_col_idx // nlon_in
+                # account for output shift and ensure positive index due to circular condition
+                wj = nz_col_idx % nlon_in
+                wjp = (wj+wo) % nlon_in
+                # compute correlation & softmax numerator
+                q_ho_wo = qy[:, :, ho, wo]
+                k_hj_wjp = kx[:, :, hj, wjp]
+                qdotk_nz[:,idz-zstart] = torch.sum(q_ho_wo * k_hj_wjp, dim=1)
+            # subtract the per-sample maximum before exponentiating (numerical stability)
+            qdotk_max, _ = torch.max(qdotk_nz, dim=1)
+            # pass 2: unnormalized weights, their sum, and the weighted sum of dy.v
+            for idz in range(zstart, zend):
+                nz_col_idx = col_idx[idz]
+                # compute input indices from psi datastructure
+                hj = nz_col_idx // nlon_in
+                # account for output shift and ensure positive index due to circular condition
+                wj = nz_col_idx % nlon_in
+                wjp = (wj+wo) % nlon_in
+                alpha[:, idz-zstart] = torch.exp(qdotk_nz[:,idz-zstart] - qdotk_max) * quad_weights[hj]
+                alpha_sum[:] += alpha[:, idz-zstart]
+                # input dot
+                gdotv = torch.sum(dy[:,:,ho, wo] * vx[:,:,hj, wjp], dim=1)
+                # integral term
+                integral[:] += alpha[:, idz-zstart] * gdotv[:]
+            # normalize: attention-weighted mean of dy.v over the neighborhood
+            integral[:] = integral[:] / alpha_sum[:]
+            # pass 3: scatter each neighbor's contribution into dkx
+            for idz in range(zstart, zend):
+                nz_col_idx = col_idx[idz]
+                # compute input indices from psi datastructure
+                hi = nz_col_idx // nlon_in
+                # account for output shift and ensure positive index due to circular condition
+                wi = nz_col_idx % nlon_in
+                wip = (wi+wo) % nlon_in
+                # upstream gradient projected onto the value at this neighbor
+                gdotv = torch.sum(dy[:,:,ho, wo] * vx[:,:,hi, wip], dim=1)
+                # q * normalized weight * (dy.v at this neighbor - weighted mean of dy.v)
+                dkx[:,:,hi,wip] += qy[:, :, ho, wo] * (alpha[:, None, idz-zstart] / alpha_sum[:, None]) * (gdotv[:, None] - integral[:, None])
+    return dkx
+# Explicit gradient w.r.t. qy: dM/dq
+# provided as a reference for CUDA & other hand-written gradients
+def _neighborhood_s2_attention_bwd_dq_torch(kx: torch.Tensor, vx: torch.Tensor, qy: torch.Tensor, dy: torch.Tensor,
+                                            quad_weights: torch.Tensor, col_idx: torch.Tensor, row_off: torch.Tensor,
+                                            nlon_in: int, nlat_out: int, nlon_out: int):
+    # shapes:
+    # input
+    # kx: B, C, Hi, Wi
+    # vx: B, Cout, Hi, Wi
+    # qy: B, C, Ho, Wo
+    # quad_weights: Hi
+    # output
+    # dq: B, C, Ho, Wo
+    batch_size = dy.shape[0]
+    channels_in = kx.shape[1]
+    channels_out = vx.shape[1]
+    dqy = torch.zeros_like(qy)
+    for ho in range(nlat_out):
+        # get number of nonzeros
+        zstart = row_off[ho]
+        zend = row_off[ho+1]
+        for wo in range(nlon_out):
+            alpha = torch.zeros((batch_size, zend-zstart), dtype=dy.dtype, device=dy.device)
+            qdotk_nz = torch.zeros((batch_size, zend-zstart), dtype=dy.dtype, device=dy.device)
+            alpha_k = torch.zeros((batch_size, channels_in), dtype=dy.dtype, device=dy.device)
+            alpha_vw = torch.zeros((batch_size,), dtype=dy.dtype, device=dy.device)
+            alpha_kvw = torch.zeros((batch_size, channels_in), dtype=dy.dtype, device=dy.device)
+            alpha_sum = torch.zeros((batch_size,), dtype=dy.dtype, device=dy.device)
+            alpha_sum2 = torch.zeros((batch_size,), dtype=dy.dtype, device=dy.device)
+            for idz in range(zstart, zend):
+                nz_col_idx = col_idx[idz]
+                # compute input indices from psi datastructure
+                hi = nz_col_idx // nlon_in
+                # account for output shift and ensure positive index due to circular condition
+                wi = nz_col_idx % nlon_in
+                wip = (wi+wo) % nlon_in
+                idz_i = idz-zstart
+                # compute correlation & softmax numerator
+                q_ho_wo = qy[:, :, ho, wo]
+                k_hi_wi = kx[:, :, hi, wip]
+                qdotk_nz[:,idz-zstart] = torch.sum(q_ho_wo * k_hi_wi, dim=1)
+            qdotk_max,_ = qdotk_nz.max(dim=1)
+            for idz in range(zstart, zend):
+                nz_col_idx = col_idx[idz]
+                # compute input indices from psi datastructure
+                hi = nz_col_idx // nlon_in
+                # account for output shift and ensure positive index due to circular condition
+                wi = nz_col_idx % nlon_in
+                wip = (wi+wo) % nlon_in
+                q_ho_wo = qy[:, :, ho, wo]
+                k_hi_wi = kx[:, :, hi, wip]
+                idz_i = idz-zstart
+                alpha[:, idz_i] = torch.exp(qdotk_nz[:,idz-zstart] - qdotk_max) * quad_weights[hi]
+                alpha_sum[:] += alpha[:, idz_i]
+                gdotv = torch.sum(dy[:,:,ho, wo] * vx[:,:,hi, wip], dim=1)
+                alpha_k[:,:] += alpha[:, None, idz_i] * k_hi_wi
+                alpha_vw[:] += alpha[:, idz_i] * gdotv[:]
+                alpha_kvw[:,:] += alpha[:, None, idz_i] * k_hi_wi * gdotv[:,None]
+            dqy[:,:,ho,wo] = (alpha_kvw * alpha_sum[:,None] - alpha_vw[:, None] * alpha_k) / (alpha_sum[:,None] * alpha_sum[:,None])
+    return dqy
+def _neighborhood_s2_attention_torch(k: torch.Tensor, v: torch.Tensor, q: torch.Tensor,
+                                     wk: torch.Tensor, wv: torch.Tensor, wq: torch.Tensor,
+                                     bk: Union[torch.Tensor, None], bv: Union[torch.Tensor, None], bq: Union[torch.Tensor, None],
+                                     quad_weights: torch.Tensor, col_idx: torch.Tensor, row_off: torch.Tensor,
+                                     max_psi_nnz: int, nh: int, nlon_in: int, nlat_out: int, nlon_out: int) -> torch.Tensor:
+    kw = F.conv2d(k, weight=wk, bias=bk)
+    vw = F.conv2d(v, weight=wv, bias=bv)
+    qw = F.conv2d(q, weight=wq, bias=bq)
+    # reshape, folding num heads into batch dim
+    B, _, H, W = kw.shape
+    kw = kw.reshape(B*nh, -1, H, W)
+    B, _, H, W = vw.shape
+    vw = vw.reshape(B*nh, -1, H, W)
+    B, _, H, W = qw.shape
+    qw = qw.reshape(B*nh, -1, H, W)
+    kw = kw.to(torch.float32)
+    vw = vw.to(torch.float32)
+    qw = qw.to(torch.float32)
+    output = _neighborhood_s2_attention_fwd_torch(kw, vw, qw, quad_weights,
+                                                  col_idx, row_off,
+                                                  nlon_in, nlat_out, nlon_out)
+    _, C, H, W = output.shape
+    output = output.reshape(B, -1, H, W)
+    return output
+def _neighborhood_s2_attention_bwd_torch(ctx, grad_output):
+    col_idx, row_off, quad_weights, k, v, q, wk, wv, wq, bk, bv, bq = ctx.saved_tensors
+    nh = ctx.nh
+    nlon_in = ctx.nlon_in
+    nlat_out = ctx.nlat_out
+    nlon_out = ctx.nlon_out
+    # check if we need the grads at all
+    k_needs_grad = ctx.needs_input_grad[0]
+    v_needs_grad = ctx.needs_input_grad[1]
+    q_needs_grad = ctx.needs_input_grad[2]
+    wk_needs_grad = ctx.needs_input_grad[3]
+    wv_needs_grad = ctx.needs_input_grad[4]
+    wq_needs_grad = ctx.needs_input_grad[5]
+    bk_needs_grad = ctx.needs_input_grad[6]
+    bv_needs_grad = ctx.needs_input_grad[7]
+    bq_needs_grad = ctx.needs_input_grad[8]
+    kw = F.conv2d(k, weight=wk, bias=bk)
+    vw = F.conv2d(v, weight=wv, bias=bv)
+    qw = F.conv2d(q, weight=wq, bias=bq)
+    # reshape, folding num heads into batch dim
+    B, _, H, W = kw.shape
+    kw = kw.reshape(B*nh, -1, H, W)
+    B, _, H, W = vw.shape
+    vw = vw.reshape(B*nh, -1, H, W)
+    B, _, H, W = qw.shape
+    qw = qw.reshape(B*nh, -1, H, W)
+    B, _, H, W  = grad_output.shape
+    grad_output = grad_output.reshape(B*nh, -1, H, W)
+    if v_needs_grad or wv_needs_grad or bv_needs_grad:
+        dvw = _neighborhood_s2_attention_bwd_dv_torch(kw, vw, qw, grad_output,
+                                                      quad_weights,
+                                                      col_idx, row_off,
+                                                      nlon_in, nlat_out, nlon_out)
+        _, C, H, W = dvw.shape
+        dvw = dvw.reshape(B, -1, H, W)
+    else:
+        dvw = None
+    if k_needs_grad or wk_needs_grad or bk_needs_grad:
+        dkw = _neighborhood_s2_attention_bwd_dk_torch(kw, vw, qw, grad_output,
+                                                      quad_weights,
+                                                      col_idx, row_off,
+                                                      nlon_in, nlat_out, nlon_out)
+        _, C, H, W = dkw.shape
+        dkw = dkw.reshape(B, -1, H, W)
+    else:
+        dkw = None
+    if q_needs_grad or wq_needs_grad or bq_needs_grad:
+        dqw = _neighborhood_s2_attention_bwd_dq_torch(kw, vw, qw, grad_output,
+                                                      quad_weights,
+                                                      col_idx, row_off,
+                                                      nlon_in, nlat_out, nlon_out)
+        _, C, H, W = dqw.shape
+        dqw = dqw.reshape(B, -1, H, W)
+    else:
+        dqw = None
+    # input grads
+    if v_needs_grad:
+        dv = torch.nn.functional.conv2d(dvw, weight=wv.permute([1,0,2,3]), bias=None)
+    else:
+        dv = None
+    if k_needs_grad:
+        dk = torch.nn.functional.conv2d(dkw, weight=wk.permute([1,0,2,3]), bias=None)
+    else:
+        dk = None
+    if q_needs_grad:
+        dq = torch.nn.functional.conv2d(dqw, weight=wq.permute([1,0,2,3]), bias=None)
+    else:
+        dq = None
+    # weight grads
+    if wv_needs_grad:
+        dwv = torch.einsum("bchw,bfhw->cf", dvw, v).reshape(*wv.shape).contiguous()
+    else:
+        dwv = None
+    if wk_needs_grad:
+        dwk = torch.einsum("bchw,bfhw->cf", dkw, k).reshape(*wk.shape).contiguous()
+    else:
+        dwk = None
+    if wq_needs_grad:
+        dwq = torch.einsum("bchw,bfhw->cf", dqw, q).reshape(*wq.shape).contiguous()
+    else:
+        dwq = None
+    # bias grads:
+    if bv_needs_grad:
+        dbv = torch.sum(dvw, dim=(0,2,3))
+    else:
+        dbv = None
+    if bk_needs_grad:
+        dbk = torch.sum(dkw, dim=(0,2,3))
+    else:
+        dbk = None
+    if bq_needs_grad:
+        dbq = torch.sum(dqw, dim=(0,2,3))
+    else:
+        dbq = None
+    return dk, dv, dq, dwk, dwv, dwq, dbk, dbv, dbq, \
+            None, None, None, None, None, None, None, None

build/torch28-cxx11-cu129-x86_64-linux/torch_harmonics_attn/_ops.py ADDED Viewed

	@@ -0,0 +1,9 @@

+import torch
+from . import _torch_harmonics_attn_20251001150033
+ops = torch.ops._torch_harmonics_attn_20251001150033
+def add_op_namespace_prefix(op_name: str):
+    """
+    Prefix op by namespace.
+    """
+    return f"_torch_harmonics_attn_20251001150033::{op_name}"

build/torch28-cxx11-cu129-x86_64-linux/torch_harmonics_attn/_torch_harmonics_attn_20251001150033.abi3.so ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:e1e4408020fb8b28578efcad9e4f0358b96e643c9e9c18bd5d4e589112d94d84
+size 34089304

flake.nix ADDED Viewed

	@@ -0,0 +1,13 @@

+# Flake whose outputs are produced entirely by kernel-builder's genFlakeOutputs.
+{
+  description = "Flake for Torch kernel extension";
+  # kernel-builder is the only input; it provides lib.genFlakeOutputs used below.
+  inputs = {
+    kernel-builder.url = "github:huggingface/kernel-builder";
+  };
+  outputs = { self, kernel-builder, }:
+    kernel-builder.lib.genFlakeOutputs {
+      # build from the directory that contains this flake
+      path = ./.;
+      # revision identifier: each `or` falls back to the next attribute when
+      # the preceding one is not present on `self`
+      rev = self.shortRev or self.dirtyShortRev or self.lastModifiedDate;
+    };
+}

nix-build.log ADDED Viewed

The diff for this file is too large to render. See raw diff

torch-ext/torch_binding.cpp ADDED Viewed

	@@ -0,0 +1,14 @@

+#include <torch/library.h>
+#include "registration.h"
+#include "torch_binding.h"
+// Operator registration for the extension: every `def` declares an op schema and
+// the following `impl` binds the function of the same name (declared in
+// torch_binding.h) for the CUDA dispatch key.
+TORCH_LIBRARY_EXPAND(TORCH_EXTENSION_NAME, ops) {
+    // backward op: takes kx, vx, qy plus the incoming gradient dy and returns three tensors
+    ops.def("s2_attention_bwd_dkvq_cuda(Tensor kx, Tensor vx, Tensor qy, Tensor dy, Tensor quad_weights, Tensor psi_col_idx, Tensor psi_row_off, int nlon_in, int nlat_out, int nlon_out) -> (Tensor, Tensor, Tensor)");
+    ops.impl("s2_attention_bwd_dkvq_cuda", torch::kCUDA, &s2_attention_bwd_dkvq_cuda);
+    // forward op: same arguments without dy, returns a single tensor
+    ops.def("s2_attention_fwd_cuda(Tensor kx, Tensor vx, Tensor qy, Tensor quad_weights, Tensor psi_col_idx, Tensor psi_row_off, int nlon_in, int nlat_out, int nlon_out) -> Tensor");
+    ops.impl("s2_attention_fwd_cuda", torch::kCUDA, &s2_attention_fwd_cuda);
+}
+// NOTE(review): REGISTER_EXTENSION comes from registration.h, which is not visible
+// here; presumably it emits the module entry point for the extension — confirm.
+REGISTER_EXTENSION(TORCH_EXTENSION_NAME)

torch-ext/torch_binding.h ADDED Viewed

	@@ -0,0 +1,31 @@

+#pragma once
+#include <torch/torch.h>
+#include <cstdint>
+#include <tuple>
+/// @brief Declaration of the backward attention op; registered for the CUDA
+///        dispatch key in torch_binding.cpp.
+///
+/// @param kx, vx, qy    Tensor inputs of the attention op.
+/// @param dy            Additional tensor input that the forward op does not take
+///                      (presumably the gradient of the forward output — confirm).
+/// @param quad_weights  Tensor input (presumably quadrature weights — confirm).
+/// @param psi_col_idx, psi_row_off
+///                      Tensor inputs (names suggest a CSR-style sparse layout —
+///                      verify against the kernel implementation).
+/// @param nlon_in, nlat_out, nlon_out
+///                      Integer size arguments.
+/// @return Tuple of three tensors (the `dkvq` suffix suggests gradients for
+///         k, v and q in that order — TODO confirm in the implementation).
+std::tuple<at::Tensor, at::Tensor, at::Tensor> s2_attention_bwd_dkvq_cuda(
+    at::Tensor kx,
+    at::Tensor vx,
+    at::Tensor qy,
+    at::Tensor dy,
+    at::Tensor quad_weights,
+    at::Tensor psi_col_idx,
+    at::Tensor psi_row_off,
+    int64_t nlon_in,
+    int64_t nlat_out,
+    int64_t nlon_out
+);
+/// @brief Declaration of the forward attention op; registered for the CUDA
+///        dispatch key in torch_binding.cpp.
+///
+/// @param kx, vx, qy    Tensor inputs of the attention op.
+/// @param quad_weights  Tensor input (presumably quadrature weights — confirm).
+/// @param psi_col_idx, psi_row_off
+///                      Tensor inputs (names suggest a CSR-style sparse layout —
+///                      verify against the kernel implementation).
+/// @param nlon_in, nlat_out, nlon_out
+///                      Integer size arguments.
+/// @return A single tensor holding the op's result.
+torch::Tensor s2_attention_fwd_cuda(
+    at::Tensor kx,
+    at::Tensor vx,
+    at::Tensor qy,
+    at::Tensor quad_weights,
+    at::Tensor psi_col_idx,
+    at::Tensor psi_row_off,
+    int64_t nlon_in,
+    int64_t nlat_out,
+    int64_t nlon_out
+);

torch-ext/torch_harmonics_attn/__init__.py ADDED Viewed

	@@ -0,0 +1,10 @@

+from ._attn_utils import backward, forward, forward_optimized, backward_optimized, _neighborhood_s2_attention_fwd_torch, _neighborhood_s2_attention_bwd_torch
+# Names exported by the package; all of them are imported from ._attn_utils above.
+# Note that the last two entries are underscore-prefixed helpers that are
+# exported deliberately alongside the public entry points.
+__all__ = [
+    "backward",
+    "forward",
+    "forward_optimized",
+    "backward_optimized",
+    "_neighborhood_s2_attention_fwd_torch",
+    "_neighborhood_s2_attention_bwd_torch",
+]