|
|
|
|
|
|
|
|
import torch |
|
|
import triton |
|
|
import triton.language as tl |
|
|
|
|
|
@triton.jit |
|
|
def triton_cross_scan( |
|
|
x, |
|
|
y, |
|
|
BC: tl.constexpr, |
|
|
BH: tl.constexpr, |
|
|
BW: tl.constexpr, |
|
|
DC: tl.constexpr, |
|
|
DH: tl.constexpr, |
|
|
DW: tl.constexpr, |
|
|
NH: tl.constexpr, |
|
|
NW: tl.constexpr, |
|
|
): |
|
|
i_hw, i_c, i_b = tl.program_id(0), tl.program_id(1), tl.program_id(2) |
|
|
i_h, i_w = (i_hw // NW), (i_hw % NW) |
|
|
_mask_h = (i_h * BH + tl.arange(0, BH)) < DH |
|
|
_mask_w = (i_w * BW + tl.arange(0, BW)) < DW |
|
|
_mask_hw = _mask_h[:, None] & _mask_w[None, :] |
|
|
_for_C = min(DC - i_c * BC, BC) |
|
|
|
|
|
_tmp0 = i_c * BC * DH * DW |
|
|
_tmp1 = DC * DH * DW |
|
|
_tmp2 = _tmp0 + i_h * BH * DW + tl.arange(0, BH)[:, None] * DW + i_w * BW + tl.arange(0, BW)[None, :] |
|
|
p_x = x + i_b * _tmp1 + _tmp2 |
|
|
p_y1 = y + i_b * 4 * _tmp1 + _tmp2 |
|
|
p_y2 = y + i_b * 4 * _tmp1 + _tmp1 + _tmp0 + i_w * BW * DH + tl.arange(0, BW)[None, :] * DH + i_h * BH + tl.arange(0, BH)[:, None] |
|
|
p_y3 = y + i_b * 4 * _tmp1 + 2 * _tmp1 + _tmp0 + (NH - i_h - 1) * BH * DW + (BH - 1 - tl.arange(0, BH)[:, None]) * DW + (NW - i_w - 1) * BW + (BW - 1 - tl.arange(0, BW)[None, :]) + (DH - NH * BH) * DW + (DW - NW * BW) |
|
|
p_y4 = y + i_b * 4 * _tmp1 + 3 * _tmp1 + _tmp0 + (NW - i_w - 1) * BW * DH + (BW - 1 - tl.arange(0, BW)[None, :]) * DH + (NH - i_h - 1) * BH + (BH - 1 - tl.arange(0, BH)[:, None]) + (DH - NH * BH) + (DW - NW * BW) * DH |
|
|
|
|
|
for idxc in range(_for_C): |
|
|
_idx = idxc * DH * DW |
|
|
_x = tl.load(p_x + _idx, mask=_mask_hw) |
|
|
tl.store(p_y1 + _idx, _x, mask=_mask_hw) |
|
|
tl.store(p_y2 + _idx, _x, mask=_mask_hw) |
|
|
tl.store(p_y3 + _idx, _x, mask=_mask_hw) |
|
|
tl.store(p_y4 + _idx, _x, mask=_mask_hw) |
|
|
|
|
|
@triton.jit |
|
|
def triton_cross_merge( |
|
|
x, |
|
|
y, |
|
|
BC: tl.constexpr, |
|
|
BH: tl.constexpr, |
|
|
BW: tl.constexpr, |
|
|
DC: tl.constexpr, |
|
|
DH: tl.constexpr, |
|
|
DW: tl.constexpr, |
|
|
NH: tl.constexpr, |
|
|
NW: tl.constexpr, |
|
|
): |
|
|
i_hw, i_c, i_b = tl.program_id(0), tl.program_id(1), tl.program_id(2) |
|
|
i_h, i_w = (i_hw // NW), (i_hw % NW) |
|
|
_mask_h = (i_h * BH + tl.arange(0, BH)) < DH |
|
|
_mask_w = (i_w * BW + tl.arange(0, BW)) < DW |
|
|
_mask_hw = _mask_h[:, None] & _mask_w[None, :] |
|
|
_for_C = min(DC - i_c * BC, BC) |
|
|
|
|
|
_tmp0 = i_c * BC * DH * DW |
|
|
_tmp1 = DC * DH * DW |
|
|
_tmp2 = _tmp0 + i_h * BH * DW + tl.arange(0, BH)[:, None] * DW + i_w * BW + tl.arange(0, BW)[None, :] |
|
|
p_x = x + i_b * _tmp1 + _tmp2 |
|
|
p_y1 = y + i_b * 4 * _tmp1 + _tmp2 |
|
|
p_y2 = y + i_b * 4 * _tmp1 + _tmp1 + _tmp0 + i_w * BW * DH + tl.arange(0, BW)[None, :] * DH + i_h * BH + tl.arange(0, BH)[:, None] |
|
|
p_y3 = y + i_b * 4 * _tmp1 + 2 * _tmp1 + _tmp0 + (NH - i_h - 1) * BH * DW + (BH - 1 - tl.arange(0, BH)[:, None]) * DW + (NW - i_w - 1) * BW + (BW - 1 - tl.arange(0, BW)[None, :]) + (DH - NH * BH) * DW + (DW - NW * BW) |
|
|
p_y4 = y + i_b * 4 * _tmp1 + 3 * _tmp1 + _tmp0 + (NW - i_w - 1) * BW * DH + (BW - 1 - tl.arange(0, BW)[None, :]) * DH + (NH - i_h - 1) * BH + (BH - 1 - tl.arange(0, BH)[:, None]) + (DH - NH * BH) + (DW - NW * BW) * DH |
|
|
|
|
|
for idxc in range(_for_C): |
|
|
_idx = idxc * DH * DW |
|
|
_y1 = tl.load(p_y1 + _idx, mask=_mask_hw) |
|
|
_y2 = tl.load(p_y2 + _idx, mask=_mask_hw) |
|
|
_y3 = tl.load(p_y3 + _idx, mask=_mask_hw) |
|
|
_y4 = tl.load(p_y4 + _idx, mask=_mask_hw) |
|
|
tl.store(p_x + _idx, _y1 + _y2 + _y3 + _y4, mask=_mask_hw) |
|
|
|
|
|
@triton.jit |
|
|
def triton_cross_scan_1b1( |
|
|
x, |
|
|
y, |
|
|
BC: tl.constexpr, |
|
|
BH: tl.constexpr, |
|
|
BW: tl.constexpr, |
|
|
DC: tl.constexpr, |
|
|
DH: tl.constexpr, |
|
|
DW: tl.constexpr, |
|
|
NH: tl.constexpr, |
|
|
NW: tl.constexpr, |
|
|
): |
|
|
i_hw, i_c, i_b = tl.program_id(0), tl.program_id(1), tl.program_id(2) |
|
|
i_h, i_w = (i_hw // NW), (i_hw % NW) |
|
|
_mask_h = (i_h * BH + tl.arange(0, BH)) < DH |
|
|
_mask_w = (i_w * BW + tl.arange(0, BW)) < DW |
|
|
_mask_hw = _mask_h[:, None] & _mask_w[None, :] |
|
|
_for_C = min(DC - i_c * BC, BC) |
|
|
|
|
|
_tmp0 = i_c * BC * DH * DW |
|
|
_tmp1 = DC * DH * DW |
|
|
_tmp2 = _tmp0 + i_h * BH * DW + tl.arange(0, BH)[:, None] * DW + i_w * BW + tl.arange(0, BW)[None, :] |
|
|
p_y1 = y + i_b * 4 * _tmp1 + _tmp2 |
|
|
p_y2 = y + i_b * 4 * _tmp1 + _tmp1 + _tmp0 + i_w * BW * DH + tl.arange(0, BW)[None, :] * DH + i_h * BH + tl.arange(0, BH)[:, None] |
|
|
p_y3 = y + i_b * 4 * _tmp1 + 2 * _tmp1 + _tmp0 + (NH - i_h - 1) * BH * DW + (BH - 1 - tl.arange(0, BH)[:, None]) * DW + (NW - i_w - 1) * BW + (BW - 1 - tl.arange(0, BW)[None, :]) + (DH - NH * BH) * DW + (DW - NW * BW) |
|
|
p_y4 = y + i_b * 4 * _tmp1 + 3 * _tmp1 + _tmp0 + (NW - i_w - 1) * BW * DH + (BW - 1 - tl.arange(0, BW)[None, :]) * DH + (NH - i_h - 1) * BH + (BH - 1 - tl.arange(0, BH)[:, None]) + (DH - NH * BH) + (DW - NW * BW) * DH |
|
|
|
|
|
p_x1 = x + i_b * 4 * _tmp1 + _tmp2 |
|
|
p_x2 = p_x1 + _tmp1 |
|
|
p_x3 = p_x2 + _tmp1 |
|
|
p_x4 = p_x3 + _tmp1 |
|
|
for idxc in range(_for_C): |
|
|
_idx = idxc * DH * DW |
|
|
tl.store(p_y1 + _idx, tl.load(p_x1 + _idx, mask=_mask_hw), mask=_mask_hw) |
|
|
tl.store(p_y2 + _idx, tl.load(p_x2 + _idx, mask=_mask_hw), mask=_mask_hw) |
|
|
tl.store(p_y3 + _idx, tl.load(p_x3 + _idx, mask=_mask_hw), mask=_mask_hw) |
|
|
tl.store(p_y4 + _idx, tl.load(p_x4 + _idx, mask=_mask_hw), mask=_mask_hw) |
|
|
|
|
|
@triton.jit |
|
|
def triton_cross_merge_1b1( |
|
|
x, |
|
|
y, |
|
|
BC: tl.constexpr, |
|
|
BH: tl.constexpr, |
|
|
BW: tl.constexpr, |
|
|
DC: tl.constexpr, |
|
|
DH: tl.constexpr, |
|
|
DW: tl.constexpr, |
|
|
NH: tl.constexpr, |
|
|
NW: tl.constexpr, |
|
|
): |
|
|
i_hw, i_c, i_b = tl.program_id(0), tl.program_id(1), tl.program_id(2) |
|
|
i_h, i_w = (i_hw // NW), (i_hw % NW) |
|
|
_mask_h = (i_h * BH + tl.arange(0, BH)) < DH |
|
|
_mask_w = (i_w * BW + tl.arange(0, BW)) < DW |
|
|
_mask_hw = _mask_h[:, None] & _mask_w[None, :] |
|
|
_for_C = min(DC - i_c * BC, BC) |
|
|
|
|
|
_tmp0 = i_c * BC * DH * DW |
|
|
_tmp1 = DC * DH * DW |
|
|
_tmp2 = _tmp0 + i_h * BH * DW + tl.arange(0, BH)[:, None] * DW + i_w * BW + tl.arange(0, BW)[None, :] |
|
|
p_y1 = y + i_b * 4 * _tmp1 + _tmp2 |
|
|
p_y2 = y + i_b * 4 * _tmp1 + _tmp1 + _tmp0 + i_w * BW * DH + tl.arange(0, BW)[None, :] * DH + i_h * BH + tl.arange(0, BH)[:, None] |
|
|
p_y3 = y + i_b * 4 * _tmp1 + 2 * _tmp1 + _tmp0 + (NH - i_h - 1) * BH * DW + (BH - 1 - tl.arange(0, BH)[:, None]) * DW + (NW - i_w - 1) * BW + (BW - 1 - tl.arange(0, BW)[None, :]) + (DH - NH * BH) * DW + (DW - NW * BW) |
|
|
p_y4 = y + i_b * 4 * _tmp1 + 3 * _tmp1 + _tmp0 + (NW - i_w - 1) * BW * DH + (BW - 1 - tl.arange(0, BW)[None, :]) * DH + (NH - i_h - 1) * BH + (BH - 1 - tl.arange(0, BH)[:, None]) + (DH - NH * BH) + (DW - NW * BW) * DH |
|
|
|
|
|
p_x1 = x + i_b * 4 * _tmp1 + _tmp2 |
|
|
p_x2 = p_x1 + _tmp1 |
|
|
p_x3 = p_x2 + _tmp1 |
|
|
p_x4 = p_x3 + _tmp1 |
|
|
for idxc in range(_for_C): |
|
|
_idx = idxc * DH * DW |
|
|
tl.store(p_x1 + _idx, tl.load(p_y1 + _idx), mask=_mask_hw) |
|
|
tl.store(p_x2 + _idx, tl.load(p_y2 + _idx), mask=_mask_hw) |
|
|
tl.store(p_x3 + _idx, tl.load(p_y3 + _idx), mask=_mask_hw) |
|
|
tl.store(p_x4 + _idx, tl.load(p_y4 + _idx), mask=_mask_hw) |
|
|
|
|
|
|
|
|
class CrossScanTriton(torch.autograd.Function): |
|
|
@staticmethod |
|
|
def forward(ctx, x: torch.Tensor): |
|
|
B, C, H, W = x.shape |
|
|
B, C, H, W = int(B), int(C), int(H), int(W) |
|
|
BC, BH, BW = min(triton.next_power_of_2(C), 1), min(triton.next_power_of_2(H), 64), min(triton.next_power_of_2(W), 64) |
|
|
NH, NW, NC = triton.cdiv(H, BH), triton.cdiv(W, BW), triton.cdiv(C, BC) |
|
|
ctx.shape = (B, C, H, W) |
|
|
ctx.triton_shape = (BC, BH, BW, NC, NH, NW) |
|
|
x = x.contiguous() |
|
|
y = x.new_empty((B, 4, C, H, W)) |
|
|
triton_cross_scan[(NH * NW, NC, B)](x, y, BC, BH, BW, C, H, W, NH, NW) |
|
|
return y.view(B, 4, C, -1) |
|
|
|
|
|
@staticmethod |
|
|
def backward(ctx, y: torch.Tensor): |
|
|
|
|
|
B, C, H, W = ctx.shape |
|
|
BC, BH, BW, NC, NH, NW = ctx.triton_shape |
|
|
y = y.contiguous().view(B, 4, C, H, W) |
|
|
x = y.new_empty((B, C, H, W)) |
|
|
triton_cross_merge[(NH * NW, NC, B)](x, y, BC, BH, BW, C, H, W, NH, NW) |
|
|
return x |
|
|
|
|
|
|
|
|
class CrossMergeTriton(torch.autograd.Function): |
|
|
@staticmethod |
|
|
def forward(ctx, y: torch.Tensor): |
|
|
B, K, C, H, W = y.shape |
|
|
B, C, H, W = int(B), int(C), int(H), int(W) |
|
|
BC, BH, BW = min(triton.next_power_of_2(C), 1), min(triton.next_power_of_2(H), 64), min(triton.next_power_of_2(W), 64) |
|
|
NH, NW, NC = triton.cdiv(H, BH), triton.cdiv(W, BW), triton.cdiv(C, BC) |
|
|
ctx.shape = (B, C, H, W) |
|
|
ctx.triton_shape = (BC, BH, BW, NC, NH, NW) |
|
|
y = y.contiguous().view(B, 4, C, H, W) |
|
|
x = y.new_empty((B, C, H, W)) |
|
|
triton_cross_merge[(NH * NW, NC, B)](x, y, BC, BH, BW, C, H, W, NH, NW) |
|
|
return x.view(B, C, -1) |
|
|
|
|
|
@staticmethod |
|
|
def backward(ctx, x: torch.Tensor): |
|
|
|
|
|
B, C, H, W = ctx.shape |
|
|
BC, BH, BW, NC, NH, NW = ctx.triton_shape |
|
|
x = x.contiguous() |
|
|
y = x.new_empty((B, 4, C, H, W)) |
|
|
triton_cross_scan[(NH * NW, NC, B)](x, y, BC, BH, BW, C, H, W, NH, NW) |
|
|
return y |
|
|
|
|
|
|
|
|
class CrossScanTriton1b1(torch.autograd.Function): |
|
|
@staticmethod |
|
|
def forward(ctx, x: torch.Tensor): |
|
|
B, K, C, H, W = x.shape |
|
|
B, C, H, W = int(B), int(C), int(H), int(W) |
|
|
BC, BH, BW = min(triton.next_power_of_2(C), 1), min(triton.next_power_of_2(H), 64), min(triton.next_power_of_2(W), 64) |
|
|
NH, NW, NC = triton.cdiv(H, BH), triton.cdiv(W, BW), triton.cdiv(C, BC) |
|
|
ctx.shape = (B, C, H, W) |
|
|
ctx.triton_shape = (BC, BH, BW, NC, NH, NW) |
|
|
x = x.contiguous() |
|
|
y = x.new_empty((B, 4, C, H, W)) |
|
|
triton_cross_scan_1b1[(NH * NW, NC, B)](x, y, BC, BH, BW, C, H, W, NH, NW) |
|
|
return y.view(B, 4, C, -1) |
|
|
|
|
|
@staticmethod |
|
|
def backward(ctx, y: torch.Tensor): |
|
|
|
|
|
B, C, H, W = ctx.shape |
|
|
BC, BH, BW, NC, NH, NW = ctx.triton_shape |
|
|
y = y.contiguous().view(B, 4, C, H, W) |
|
|
x = y.new_empty((B, 4, C, H, W)) |
|
|
triton_cross_merge_1b1[(NH * NW, NC, B)](x, y, BC, BH, BW, C, H, W, NH, NW) |
|
|
return x |
|
|
|
|
|
|
|
|
|