Tingman committed
Commit 0940df6 · 1 Parent(s): 9d66414

code release


Signed-off-by: tingmany <tmyann@outlook.com>

.gitattributes CHANGED
@@ -33,4 +33,4 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
  *.zip filter=lfs diff=lfs merge=lfs -text
  *.zst filter=lfs diff=lfs merge=lfs -text
  *tfevents* filter=lfs diff=lfs merge=lfs -text
- staircase_q_left.png filter=lfs diff=lfs merge=lfs -text
+ *.png filter=lfs diff=lfs merge=lfs -text
README.md CHANGED
@@ -5,7 +5,7 @@ colorFrom: green
  colorTo: blue
  sdk: gradio
  sdk_version: 5.49.1
- app_file: app.py
+ app_file: gradio_app.py
  pinned: false
  license: gpl-3.0
  ---
dataloader/stereo/transforms.py ADDED
@@ -0,0 +1,82 @@
+ from __future__ import division
+ import torch
+ import numpy as np
+ import cv2
+
+
+ class Compose(object):
+     def __init__(self, transforms):
+         self.transforms = transforms
+
+     def __call__(self, sample):
+         for t in self.transforms:
+             sample = t(sample)
+         return sample
+
+
+ class ToTensor(object):
+     """Convert numpy array to torch tensor"""
+
+     def __init__(self, no_normalize=False):
+         self.no_normalize = no_normalize
+
+     def __call__(self, sample):
+         left = np.transpose(sample['left'], (2, 0, 1))  # [3, H, W]
+         if self.no_normalize:
+             sample['left'] = torch.from_numpy(left)
+         else:
+             sample['left'] = torch.from_numpy(left) / 255.
+         right = np.transpose(sample['right'], (2, 0, 1))
+
+         if self.no_normalize:
+             sample['right'] = torch.from_numpy(right)
+         else:
+             sample['right'] = torch.from_numpy(right) / 255.
+
+         if 'disp' in sample.keys():
+             disp = sample['disp']  # [H, W]
+             sample['disp'] = torch.from_numpy(disp)
+         if 'disp_r' in sample.keys():
+             disp_r = sample['disp_r']  # [H, W]
+             sample['disp_r'] = torch.from_numpy(disp_r)
+
+         if 'valid' in sample.keys():
+             valid = sample['valid']  # [H, W]
+             sample['valid'] = torch.from_numpy(valid)
+
+         return sample
+
+
+ class Resize(object):
+     def __init__(self,
+                  scale_x=1,
+                  scale_y=1,
+                  nearest_interp=True,  # for sparse gt
+                  ):
+         """
+         Resize low-resolution data to high-res for mixed dataset training
+         """
+         self.scale_x = scale_x
+         self.scale_y = scale_y
+         self.nearest_interp = nearest_interp
+
+     def __call__(self, sample):
+         scale_x = self.scale_x
+         scale_y = self.scale_y
+
+         sample['left'] = cv2.resize(sample['left'], None, fx=scale_x, fy=scale_y, interpolation=cv2.INTER_LINEAR)
+         sample['right'] = cv2.resize(sample['right'], None, fx=scale_x, fy=scale_y, interpolation=cv2.INTER_LINEAR)
+
+         if 'disp' in sample.keys():
+             sample['disp'] = cv2.resize(
+                 sample['disp'], None, fx=scale_x, fy=scale_y,
+                 interpolation=cv2.INTER_LINEAR if not self.nearest_interp else cv2.INTER_NEAREST
+             ) * scale_x
+
+         if 'disp_r' in sample.keys():
+             sample['disp_r'] = cv2.resize(
+                 sample['disp_r'], None, fx=scale_x, fy=scale_y,
+                 interpolation=cv2.INTER_LINEAR if not self.nearest_interp else cv2.INTER_NEAREST
+             ) * scale_x
+
+         return sample
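For reference, a minimal usage sketch of these transforms on a stereo sample dict (the random arrays below are illustrative stand-ins for real images, not part of the release):

```python
import numpy as np
from dataloader.stereo import transforms

# Dummy stereo pair and disparity map, H x W x 3 float32 images as the transforms expect.
sample = {
    'left': np.random.rand(240, 320, 3).astype(np.float32) * 255,
    'right': np.random.rand(240, 320, 3).astype(np.float32) * 255,
    'disp': np.random.rand(240, 320).astype(np.float32) * 64,
}

# Upscale 2x (disparity values are rescaled by scale_x), then convert to tensors in [0, 1].
transform = transforms.Compose([
    transforms.Resize(scale_x=2, scale_y=2, nearest_interp=True),
    transforms.ToTensor(no_normalize=False),
])
sample = transform(sample)
print(sample['left'].shape, sample['disp'].shape)  # torch.Size([3, 480, 640]) torch.Size([480, 640])
```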
gradio_app.py ADDED
@@ -0,0 +1,371 @@
1
+ import gradio as gr
2
+ import argparse
3
+ import numpy as np
4
+ import torch
5
+ import torch.nn.functional as F
6
+ import os
7
+ import time
8
+ import spaces
9
+
10
+ from dataloader.stereo import transforms
11
+ from utils.utils import InputPadder, calc_noc_mask
12
+ from huggingface_hub import hf_hub_download
13
+ from models.match_stereo import MatchStereo
14
+
15
+ torch.backends.cudnn.benchmark = True
16
+
17
+ class MatchStereoDemo:
18
+ def __init__(self):
19
+ self.has_cuda = torch.cuda.is_available()
20
+ self.device = torch.device('cuda:0') if self.has_cuda else 'cpu'
21
+ self.model = None
22
+ self.current_variant = None
23
+ self.current_mode = None
24
+ self.current_precision = None
25
+ self.current_mat_impl = None
26
+ self.download_model()
27
+
28
+ def download_model(self):
29
+ REPO_ID = 'Tingman/MatchAttention'
30
+ filename_list = ['matchstereo_tiny_fsd.pth', 'matchstereo_small_fsd.pth', 'matchstereo_base_fsd.pth', 'matchflow_base_sintel.pth']
31
+ if not os.path.exists('./checkpoints/'):
32
+ os.makedirs('./checkpoints/')
33
+ for filename in filename_list:
34
+ local_file = os.path.join('./checkpoints/', filename)
35
+ if not os.path.exists(local_file):
36
+ hf_hub_download(repo_id=REPO_ID, filename=filename, local_dir='./checkpoints/', local_dir_use_symlinks=False)
37
+
38
+ def load_model(self, mode, variant, precision, mat_impl):
39
+ """load model, skip if the model has been loaded"""
40
+ if (self.model is not None and
41
+ self.current_variant == variant and
42
+ self.current_mode == mode and
43
+ self.current_precision == precision and
44
+ self.current_mat_impl == mat_impl):
45
+ return "Model already loaded"
46
+
47
+ # fixed checkpoint path
48
+ checkpoint_base_path = "./checkpoints"
49
+ if mode == 'stereo':
50
+ checkpoint_name = f"match{mode}_{variant}_fsd.pth"
51
+ elif mode == 'flow':
52
+ checkpoint_name = f"match{mode}_{variant}_sintel.pth"
53
+ else:
54
+ raise NotImplementedError
55
+
56
+ checkpoint_path = os.path.join(checkpoint_base_path, checkpoint_name)
57
+
58
+ if not os.path.exists(checkpoint_path):
59
+ return f"Error: Checkpoint not found at {checkpoint_path}"
60
+
61
+ args = argparse.Namespace()
62
+ args.mode = mode
63
+ args.variant = variant
64
+ args.mat_impl = mat_impl
65
+
66
+ if not self.has_cuda:
67
+ precision = "fp32"
68
+ dtypes = {'fp32': torch.float32, 'fp16': torch.float16}
69
+ self.dtype = dtypes[precision]
70
+
71
+ self.model = MatchStereo(args)
72
+
73
+ try:
74
+ checkpoint = torch.load(checkpoint_path, map_location='cpu')
75
+ self.model.load_state_dict(state_dict=checkpoint['model'], strict=False)
76
+ self.model.to(self.device)
77
+ self.model.eval()
78
+ self.model = self.model.to(self.dtype)
79
+
80
+ self._warmup_model()
81
+
82
+ self.current_variant = variant
83
+ self.current_mode = mode
84
+ self.current_precision = precision
85
+ self.current_mat_impl = mat_impl
86
+
87
+ device_info = "GPU" if self.has_cuda else "CPU"
88
+ return f"Successfully loaded {mode} {variant} model on {device_info} (precision: {precision}, mat_impl: {mat_impl})"
89
+ except Exception as e:
90
+ return f"Error loading model: {str(e)}"
91
+
92
+ def _warmup_model(self):
93
+ """warmup the model for accurate time measurement"""
94
+ if self.model is None:
95
+ return
96
+
97
+ dummy_left = torch.randn(1, 3, 512, 512, device=self.device, dtype=self.dtype)
98
+ dummy_right = torch.randn(1, 3, 512, 512, device=self.device, dtype=self.dtype)
99
+
100
+ with torch.no_grad():
101
+ _ = self.model(dummy_left, dummy_right, stereo=(self.current_mode == 'stereo'))
102
+
103
+ def run_frame(self, left, right, stereo, low_res_init=False, factor=2.):
104
+ """single frame inference"""
105
+ if low_res_init:
106
+ left_ds = F.interpolate(left, scale_factor=1/factor, mode='bilinear', align_corners=True)
107
+ right_ds = F.interpolate(right, scale_factor=1/factor, mode='bilinear', align_corners=True)
108
+ padder_ds = InputPadder(left_ds.shape, padding_factor=32)
109
+ left_ds, right_ds = padder_ds.pad(left_ds, right_ds)
110
+
111
+ field_up_ds = self.model(left_ds, right_ds, stereo=stereo)['field_up']
112
+ field_up_ds = padder_ds.unpad(field_up_ds.permute(0, 3, 1, 2).contiguous()).contiguous()
113
+ field_up_init = F.interpolate(field_up_ds, scale_factor=factor/32, mode='bilinear', align_corners=True)*(factor/32)
114
+ field_up_init = field_up_init.permute(0, 2, 3, 1).contiguous()
115
+ results_dict = self.model(left, right, stereo=stereo, init_flow=field_up_init)
116
+ else:
117
+ results_dict = self.model(left, right, stereo=stereo)
118
+
119
+ return results_dict
120
+
121
+ def get_inference_size(self, size_name):
122
+ if size_name == "Original":
123
+ return None
124
+
125
+ def round_to_32(x):
126
+ return (x + 16) // 32 * 32
127
+
128
+ size_presets = {
129
+ "720P": (round_to_32(1280), round_to_32(720)),
130
+ "1080P": (round_to_32(1920), round_to_32(1080)),
131
+ "2K": (round_to_32(2048), round_to_32(1080)),
132
+ "4K UHD": (round_to_32(3840), round_to_32(2160))
133
+ }
134
+
135
+ return size_presets.get(size_name, None)
136
+
137
+ def process_images(self, left_image, right_image, mode, variant,
138
+ low_res_init=False, inference_size_name="Original",
139
+ precision="fp32", mat_impl="pytorch"):
140
+ if not self.has_cuda:
141
+ precision = "fp32"
142
+ mat_impl = "pytorch"
143
+
144
+ load_result = self.load_model(mode, variant, precision, mat_impl)
145
+ if load_result.startswith("Error"):
146
+ return None, None, None, load_result
147
+
148
+ try:
149
+ left = np.array(left_image.convert('RGB')).astype(np.float32)
150
+ right = np.array(right_image.convert('RGB')).astype(np.float32)
151
+
152
+ original_size = left.shape[:2] # (H, W)
153
+
154
+ inference_size = self.get_inference_size(inference_size_name)
155
+
156
+ val_transform_list = [transforms.ToTensor(no_normalize=True)]
157
+ val_transform = transforms.Compose(val_transform_list)
158
+
159
+ sample = {'left': left, 'right': right}
160
+ sample = val_transform(sample)
161
+ left_tensor = sample['left'].to(self.device, dtype=self.dtype).unsqueeze(0)
162
+ right_tensor = sample['right'].to(self.device, dtype=self.dtype).unsqueeze(0)
163
+
164
+ stereo = (mode == 'stereo')
165
+
166
+ ori_size = left_tensor.shape[-2:]
167
+ if inference_size is not None:
168
+ left_tensor = F.interpolate(left_tensor, size=inference_size, mode='bilinear', align_corners=True)
169
+ right_tensor = F.interpolate(right_tensor, size=inference_size, mode='bilinear', align_corners=True)
170
+ padder = None
171
+ else:
172
+ padder = InputPadder(left_tensor.shape, padding_factor=32)
173
+ left_tensor, right_tensor = padder.pad(left_tensor, right_tensor)
174
+
175
+ device_type = "GPU" if self.has_cuda else "CPU"
176
+ actual_size = inference_size if inference_size else ori_size
177
+ status_info = f"Device: {device_type} | Resolution: {actual_size[1]}x{actual_size[0]} | Precision: {precision}"
178
+
179
+ start_time = time.time()
180
+ with torch.no_grad():
181
+ results_dict = self.run_frame(left_tensor, right_tensor, stereo, low_res_init)
182
+ inference_time = (time.time() - start_time) * 1000 # ms
183
+
184
+ field_up = results_dict['field_up'].permute(0, 3, 1, 2).float().contiguous()
185
+
186
+ if padder is not None:
187
+ field_up = padder.unpad(field_up)
188
+ elif inference_size is not None:
189
+ field_up = F.interpolate(field_up, size=ori_size, mode='bilinear', align_corners=True)
190
+ field_up[:, 0] = field_up[:, 0] * (ori_size[1] / float(inference_size[1]))
191
+ field_up[:, 1] = field_up[:, 1] * (ori_size[0] / float(inference_size[0]))
192
+
193
+ noc_mask = calc_noc_mask(field_up.permute(0, 2, 3, 1), A=8)
194
+ noc_mask = noc_mask[0].detach().cpu().numpy()
195
+ noc_mask = np.where(noc_mask, 255, 128).astype(np.uint8)
196
+
197
+ field_up = torch.cat((field_up, torch.zeros_like(field_up[:, :1])), dim=1)
198
+ field_up = field_up.permute(0, 2, 3, 1).contiguous()
199
+ field, field_r = field_up.chunk(2, dim=0)
200
+
201
+ if stereo:
202
+ disparity = (-field[..., 0]).clamp(min=0)
203
+
204
+ disparity_np = disparity[0].detach().cpu().numpy()
205
+ min_val = disparity_np.min()
206
+ max_val = disparity_np.max()
207
+ if max_val - min_val > 1e-6:
208
+ disparity_norm = (disparity_np - min_val) / (max_val - min_val)
209
+ else:
210
+ disparity_norm = np.zeros_like(disparity_np)
211
+ disparity_img = (disparity_norm * 255).astype(np.uint8)
212
+
213
+ return disparity_img, noc_mask, f"Inference time: {inference_time:.2f} ms. (Please re-run to get accurate time.)", status_info
214
+ else:
215
+ flow = field[0].detach().cpu().numpy()
216
+ flow_rgb = self.flow_to_color(flow)
217
+ return flow_rgb, noc_mask, f"Inference time: {inference_time:.2f} ms. (Please re-run to get accurate time.)", status_info
218
+
219
+ except Exception as e:
220
+ device_type = "GPU" if self.has_cuda else "CPU"
221
+ return None, None, f"Error during inference: {str(e)}", f"Device: {device_type} | Error occurred"
222
+
223
+ def flow_to_color(self, flow):
224
+ """visualization of flow"""
225
+ u = flow[..., 0]
226
+ v = flow[..., 1]
227
+
228
+ rad = np.sqrt(u**2 + v**2)
229
+ rad_max = np.max(rad)
230
+ epsilon = 1e-8
231
+
232
+ if rad_max > epsilon:
233
+ u = u / (rad_max + epsilon)
234
+ v = v / (rad_max + epsilon)
235
+
236
+ h, w = u.shape
237
+ hsv = np.zeros((h, w, 3), dtype=np.uint8)
238
+ hsv[..., 1] = 255
239
+
240
+ mag, ang = cv2.cartToPolar(u, v)
241
+ hsv[..., 0] = ang * 180 / np.pi / 2
242
+ hsv[..., 2] = cv2.normalize(mag, None, 0, 255, cv2.NORM_MINMAX)
243
+
244
+ flow_rgb = cv2.cvtColor(hsv, cv2.COLOR_HSV2RGB)
245
+ return flow_rgb
246
+
247
+ demo_model = MatchStereoDemo()
248
+
249
+ # example images
250
+ examples = [
251
+ ["examples/booster_bathroom_left.png", "examples/booster_bathroom_right.png", "stereo", "tiny"],
252
+ ["examples/staircase_q_left.png", "examples/staircase_q_right.png", "stereo", "tiny"],
253
+ ["examples/frame_0031_clean.png", "examples/frame_0032_clean.png", "flow", "base"],
254
+ ]
255
+
256
+ @spaces.GPU
257
+ def process_inference(left_img, right_img, mode, variant,
258
+ low_res_init, inference_size, precision, mat_impl):
259
+ """Gradio function"""
260
+ if left_img is None or right_img is None:
261
+ return None, None, "Please upload both left and right images", "Waiting for input..."
262
+
263
+ try:
264
+ result = demo_model.process_images(
265
+ left_img, right_img, mode, variant,
266
+ low_res_init, inference_size, precision, mat_impl
267
+ )
268
+ return result
269
+ except Exception as e:
270
+ return None, None, f"Error during inference: {str(e)}", f"Error: {str(e)}"
271
+
272
+ def update_variant_choices(mode):
273
+ if mode == "flow":
274
+ return gr.Radio(choices=["base"], value="base")
275
+ else:
276
+ return gr.Radio(choices=["tiny", "small", "base"], value="tiny")
277
+
278
+ # Gradio UI
279
+ with gr.Blocks(title="MatchStereo/MatchFlow Demo") as demo:
280
+ gr.Markdown("# MatchStereo/MatchFlow Demo")
281
+ gr.Markdown("Upload stereo images for disparity estimation or consecutive frames for optical flow estimation.")
282
+
283
+ if not demo_model.has_cuda:
284
+ gr.Markdown("> Note: Running on CPU. Some options (fp16, cuda) are disabled.")
285
+
286
+ with gr.Row():
287
+ with gr.Column():
288
+ left_image = gr.Image(label="Left Image / Frame 1", type="pil")
289
+ right_image = gr.Image(label="Right Image / Frame 2", type="pil")
290
+
291
+ with gr.Row():
292
+ mode = gr.Radio(
293
+ choices=["stereo", "flow"],
294
+ label="Mode",
295
+ value="stereo",
296
+ info="Select stereo for disparity estimation or flow for optical flow"
297
+ )
298
+ variant = gr.Radio(
299
+ choices=["tiny", "small", "base"],
300
+ label="Model Variant",
301
+ value="tiny",
302
+ info="Model size variant"
303
+ )
304
+
305
+ with gr.Row():
306
+ low_res_init = gr.Checkbox(
307
+ label="Low Resolution Init",
308
+ value=False,
309
+ info="Use low-resolution initialization for high-res images (>=2K)"
310
+ )
311
+ inference_size = gr.Dropdown(
312
+ choices=["Original", "720P", "1080P", "2K", "4K UHD"],
313
+ label="Inference Size",
314
+ value="Original",
315
+ info="Rounded to multiples of 32"
316
+ )
317
+
318
+ with gr.Row():
319
+ precision = gr.Radio(
320
+ choices=["fp32", "fp16"],
321
+ label="Precision",
322
+ value="fp32",
323
+ info="Model precision",
324
+ interactive=demo_model.has_cuda
325
+ )
326
+ mat_impl = gr.Radio(
327
+ choices=["cuda", "pytorch"],
328
+ label="MatchAttention Implementation",
329
+ value="cuda",
330
+ info="MatchAttention implementations",
331
+ interactive=demo_model.has_cuda
332
+ )
333
+
334
+ run_btn = gr.Button("Run Inference", variant="primary")
335
+
336
+ with gr.Column():
337
+ output_image = gr.Image(label="Output Result", interactive=False)
338
+ noc_mask = gr.Image(label="NOC Mask", interactive=False)
339
+ time_output = gr.Textbox(label="Inference Time", interactive=False)
340
+ status = gr.Textbox(label="Status Info", interactive=False, lines=2)
341
+
342
+ gr.Markdown("## Examples")
343
+ gr.Examples(
344
+ examples=examples,
345
+ inputs=[left_image, right_image, mode, variant],
346
+ outputs=[output_image, noc_mask, time_output, status],
347
+ fn=process_inference,
348
+ cache_examples=False,
349
+ label="Click any example below to load it"
350
+ )
351
+
352
+ run_btn.click(
353
+ fn=process_inference,
354
+ inputs=[left_image, right_image, mode, variant,
355
+ low_res_init, inference_size, precision, mat_impl],
356
+ outputs=[output_image, noc_mask, time_output, status]
357
+ )
358
+
359
+ mode.change(
360
+ fn=update_variant_choices,
361
+ inputs=[mode],
362
+ outputs=[variant]
363
+ )
364
+
365
+ if __name__ == "__main__":
366
+ try:
367
+ import cv2
368
+ except ImportError:
369
+ print("Please install OpenCV for optical flow visualization: pip install opencv-python")
370
+
371
+ demo.launch()
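The resolution handling in `process_images` uses the standard rule that a flow/disparity field resized to another resolution must also have its x and y components multiplied by the respective width and height ratios. A standalone sketch of that step (the function name and dummy sizes are illustrative only):

```python
import torch
import torch.nn.functional as F

def rescale_field(field: torch.Tensor, out_hw: tuple) -> torch.Tensor:
    """Resize a [B, 2, h, w] flow/disparity field to out_hw=(H, W) and rescale its values."""
    h, w = field.shape[-2:]
    H, W = out_hw
    out = F.interpolate(field, size=(H, W), mode='bilinear', align_corners=True)
    out[:, 0] = out[:, 0] * (W / float(w))  # x-displacements scale with width
    out[:, 1] = out[:, 1] * (H / float(h))  # y-displacements scale with height
    return out

field = torch.randn(1, 2, 544, 960)              # e.g. a prediction at the inference size
print(rescale_field(field, (1080, 1920)).shape)  # torch.Size([1, 2, 1080, 1920])
```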
models/__init__.py ADDED
File without changes
models/attention_blocks.py ADDED
@@ -0,0 +1,210 @@
1
+ import torch
2
+ import torch.nn as nn
3
+
4
+ from timm.models.layers import DropPath
5
+
6
+ from models.convformer import LayerNormWithoutBias
7
+ from models.common import ConvGLU
8
+ from models.mat_pytorch_impl import compute_bilinear_weights, compute_match_attention, compute_bilinear_softmax, attention_aggregate
9
+ from models.match_former_ops import MF_FusedForwardOps
10
+ from utils.utils import bilinear_sample_by_offset, init_coords
11
+
12
+ class MatchAttention(torch.nn.Module):
13
+ r"""MatchAttention: Matching the relative positions
14
+ """
15
+ def __init__(self, args, dim, win_r=[1, 1], num_head=8, head_dim=None, qkv_bias=False,
16
+ attn_drop=0., proj_drop=0., proj_bias=False, cross=False, noc_embed=False, **kargs):
17
+ super().__init__()
18
+
19
+ self.num_head = num_head
20
+ self.cross = cross
21
+ self.noc_embed = noc_embed if not cross else False # only for self attention
22
+
23
+ self.head_dim = dim // num_head if head_dim is None else head_dim
24
+ self.scale = self.head_dim ** -0.5
25
+
26
+ self.attention_dim = self.num_head * self.head_dim
27
+
28
+ self.win_r = win_r
29
+ self.attn_num = (2*win_r[0]+2)*(2*win_r[1]+2)
30
+
31
+ embed_dim = dim + 1 if noc_embed else dim # '1' for noc_mask
32
+ self.q = nn.Linear(embed_dim, self.attention_dim, bias=qkv_bias)
33
+ self.k = nn.Linear(embed_dim, self.attention_dim, bias=qkv_bias)
34
+ self.v = nn.Linear(embed_dim, self.attention_dim, bias=qkv_bias)
35
+ self.attn_drop = nn.Dropout(attn_drop)
36
+ if self.cross:
37
+ self.g = nn.Sequential(nn.Linear(embed_dim, self.attention_dim, bias=qkv_bias), nn.SiLU())
38
+ self.proj = nn.Linear(self.attention_dim + self.num_head*self.attn_num, dim, bias=proj_bias)
39
+ else:
40
+ self.proj = nn.Linear(self.attention_dim, dim, bias=proj_bias)
41
+ self.proj_drop = nn.Dropout(proj_drop)
42
+ self.use_pytorch = (args.mat_impl == 'pytorch')
43
+ self.mf_fused = MF_FusedForwardOps()
44
+
45
+ def clamp_max_offset(self, max_offset, H, W):
46
+ max_offset_x, max_offset_y = max_offset.chunk(2, dim=-1) # to avoid inplace operation
47
+
48
+ # for ONNX support
49
+ min_x = torch.tensor(self.win_r[0], dtype=max_offset.dtype, device=max_offset.device)
50
+ max_x = torch.tensor(W - 1 - self.win_r[0] - 1e-3, dtype=max_offset.dtype, device=max_offset.device)
51
+ min_y = torch.tensor(self.win_r[1], dtype=max_offset.dtype, device=max_offset.device)
52
+ max_y = torch.tensor(H - 1 - self.win_r[1] - 1e-3, dtype=max_offset.dtype, device=max_offset.device)
53
+
54
+ max_offset_x = torch.clamp(max_offset_x, min=min_x, max=max_x)
55
+ max_offset_y = torch.clamp(max_offset_y, min=min_y, max=max_y)
56
+
57
+ ## max_offset_x = max_offset_x.clamp(min=self.win_r[0], max=W-1-self.win_r[0]-1e-3)
58
+ ## max_offset_y = max_offset_y.clamp(min=self.win_r[1], max=H-1-self.win_r[1]-1e-3)
59
+ return torch.cat((max_offset_x, max_offset_y), dim=-1).contiguous()
60
+
61
+ def forward(self, x, max_offset, noc_mask=None): # offset: [B, N, h, 2]
62
+ B, H, W, _ = x.shape
63
+ N = H*W
64
+ assert (2*self.win_r[1] + 2 <= H) and (2*self.win_r[0] + 2 <= W)
65
+ x = x.view(B, N, -1).contiguous()
66
+
67
+ if self.cross:
68
+ ref_, tgt_ = x.chunk(2, dim=0) # split along batch dimension
69
+ ref = torch.cat((ref_, tgt_), dim=0) # order
70
+ tgt = torch.cat((tgt_, ref_), dim=0) # reverse order
71
+ g = self.g(ref)
72
+ else: # self-attn
73
+ if self.noc_embed:
74
+ x = torch.cat((x, noc_mask.view(B, N, -1)), dim=-1).contiguous()
75
+ ref, tgt = x, x
76
+ q, k, v = self.q(ref), self.k(tgt), self.v(tgt)
77
+
78
+ ## non-parameter modules
79
+ max_offset = self.clamp_max_offset(max_offset, H, W)
80
+
81
+ if self.use_pytorch:
82
+ m_id = torch.floor(max_offset).to(torch.int32) # [B, N, h, 2]
83
+ bilinear_weight = compute_bilinear_weights(max_offset)
84
+
85
+ attn, indices_gather = compute_match_attention(q.view(B, N, self.num_head, -1), k.view(B, N, self.num_head, -1), m_id, self.win_r, H, W)
86
+ attn = attn * self.scale
87
+
88
+ attn = compute_bilinear_softmax(attn, bilinear_weight, self.win_r)
89
+ attn = self.attn_drop(attn)
90
+
91
+ x = attention_aggregate(v.view(B, N, self.num_head, -1), attn, indices_gather, self.win_r)
92
+ else:
93
+ x, attn = self.mf_fused(max_offset, q, k, v, H, W, self.win_r, self.attn_num, attn_type='l1_norm', scale=self.scale)
94
+
95
+ if self.cross:
96
+ x = g * x # gate
97
+ attn = attn.view(B, N, -1).contiguous()
98
+ x = torch.cat((x, attn), dim=-1).contiguous()
99
+ x = self.proj(x)
100
+ x = self.proj_drop(x)
101
+ return x.view(B, H, W, -1).contiguous()
102
+
103
+
104
+ class MatchAttentionLayer(nn.Module):
105
+ r"""MatchAttention layer with interleaved self-MatchAttention, cross-MatchAttention, and ConvGLU
106
+ """
107
+
108
+ def __init__(self, args, dim, win_r,
109
+ num_head=8, head_dim=32, mlp=ConvGLU, mlp_ratio=2, field_dim=2,
110
+ norm_layer=nn.LayerNorm, drop=0., drop_path=0.):
111
+ super().__init__()
112
+ self.num_head = num_head
113
+ self.field_dim = field_dim
114
+
115
+ self.match_attention_self = MatchAttention(args, dim + self.field_dim + self.num_head*2, [win_r, win_r], num_head=num_head, head_dim=head_dim, noc_embed=True)
116
+ self.norm0 = norm_layer(dim + self.field_dim + self.num_head*2)
117
+
118
+ self.match_attention_cross = MatchAttention(args, dim + self.field_dim, [win_r, win_r], num_head=num_head, head_dim=head_dim, cross=True)
119
+ self.norm1 = norm_layer(dim + self.field_dim)
120
+
121
+ self.mlp = mlp(dim=dim, mlp_ratio=mlp_ratio, drop=drop)
122
+ self.norm2 = norm_layer(dim)
123
+
124
+ self.field_scale = nn.Parameter(0.1*torch.ones(1, 1, 1, 2))
125
+
126
+ self.drop_path0 = DropPath(drop_path) if drop_path > 0. else nn.Identity()
127
+ self.drop_path1 = DropPath(drop_path) if drop_path > 0. else nn.Identity()
128
+ self.drop_path2 = DropPath(drop_path) if drop_path > 0. else nn.Identity()
129
+
130
+ def consistency_mask(self, field, A=2):
131
+ offset = field + init_coords(field) # [B, H, W, 2]
132
+ field_ref_, field_tgt_ = field.chunk(2, dim=0)
133
+ field_ref = torch.cat((field_ref_, field_tgt_), dim=0) # order
134
+ field_tgt = torch.cat((field_tgt_, field_ref_), dim=0) # reverse order
135
+ field_tgt_to_ref = bilinear_sample_by_offset(field_tgt.permute(0, 3, 1, 2).contiguous(), offset).permute(0, 2, 3, 1).contiguous()
136
+ field_diff = torch.abs(field_ref + field_tgt_to_ref).sum(dim=-1, keepdim=True) # ref and tgt flows have opposite signs, so a consistent pair sums to ~0
137
+ noc_mask = (field_diff < A).to(field_diff.dtype)
138
+ return noc_mask
139
+
140
+ def forward(self, x, self_rpos, field, stereo=True): # self_rpos [B, H, W, h*2], field [B, H, W, 2]
141
+
142
+ field_out = {}
143
+ B, H, W, C = x.shape
144
+
145
+ noc_mask = self.consistency_mask(field.detach())
146
+
147
+ x = torch.cat((x, field*self.field_scale.to(field.dtype), self_rpos), dim=-1).contiguous()
148
+
149
+ coords_0 = init_coords(field).repeat(1, 1, 1, self.num_head)
150
+ self_offset = self_rpos + coords_0
151
+ self_offset = self_offset.view(B, H*W, self.num_head, 2).contiguous()
152
+
153
+ x = x + self.drop_path0(self.match_attention_self(self.norm0(x), self_offset, noc_mask))
154
+
155
+ self_rpos = x[..., -(self.num_head*2):].contiguous() # [B, H, W, h*2]
156
+ x = x[..., :-(self.num_head*2)].contiguous()
157
+
158
+ if stereo: x[..., -1] = 0
159
+ field = x[..., -self.field_dim:].contiguous() / self.field_scale.to(field.dtype)
160
+ field_out['self'] = field.clone()
161
+
162
+ offset = field.repeat(1, 1, 1, self.num_head).contiguous() + coords_0 # [B, H, W, h*2]
163
+ offset = offset.view(B, H*W, self.num_head, 2).contiguous()
164
+
165
+ x = x + self.drop_path1(self.match_attention_cross(self.norm1(x), offset))
166
+
167
+ if stereo: x[..., -1] = 0
168
+ field = x[..., -self.field_dim:].contiguous() / self.field_scale.to(field.dtype)
169
+ field_out['cross'] = field.clone()
170
+
171
+ x = x[..., :-self.field_dim].contiguous() # No field feature in MLP
172
+
173
+ x = x + self.drop_path2(self.mlp(self.norm2(x)))
174
+
175
+ return x, self_rpos, field, field_out
176
+
177
+
178
+ class MatchAttentionBlock(nn.Module):
179
+ r"""MatchAttention block with multiple match-attention layers
180
+ """
181
+
182
+ def __init__(self, args, dim, win_r=2,
183
+ num_layer=6, num_head=8, head_dim=32,
184
+ mlp=ConvGLU, mlp_ratio=2, field_dim=2,
185
+ norm_layer=LayerNormWithoutBias,
186
+ drop=0., dp_rates=[0.]):
187
+
188
+ super().__init__()
189
+ self.num_head = num_head
190
+
191
+ self.layers = nn.ModuleList()
192
+ for i in range(num_layer):
193
+ layer = MatchAttentionLayer(args, dim, win_r=win_r, num_head=num_head, head_dim=head_dim,
194
+ mlp=mlp, mlp_ratio=mlp_ratio, field_dim=field_dim,
195
+ norm_layer=norm_layer, drop=drop, drop_path=dp_rates[i])
196
+ self.layers.append(layer)
197
+
198
+ def forward(self, x, self_rpos, field, stereo=True):
199
+ fields = []
200
+ B, H, W, C = x.shape
201
+ self_rpos = self_rpos.repeat(1, 1, 1, self.num_head) # [B, H, W, 2] -> [B, H, W, h*2]
202
+
203
+ for layer in self.layers:
204
+
205
+ x, self_rpos, field, field_out = layer(x, self_rpos, field, stereo)
206
+ fields.append(field_out)
207
+
208
+ self_rpos = self_rpos.view(B, H, W, self.num_head, 2).mean(dim=-2, keepdim=False)
209
+
210
+ return x, self_rpos, field, fields
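`consistency_mask` applies the usual forward-backward check: a pixel is treated as non-occluded when the field warped back from the other view roughly cancels the reference field. Below is a simplified numpy sketch of the same criterion for the stereo (1-D disparity) case, written only to illustrate the idea; it is not the implementation used above:

```python
import numpy as np

def lr_consistency_mask(disp_left: np.ndarray, disp_right: np.ndarray, thresh: float = 2.0) -> np.ndarray:
    """Boolean [H, W] mask: True where left disparity agrees with the right disparity
    sampled at the matched column (nearest-neighbour lookup)."""
    H, W = disp_left.shape
    xs = np.arange(W)[None, :].repeat(H, axis=0)                          # x coordinate of every left pixel
    matched_x = np.clip(np.round(xs - disp_left).astype(int), 0, W - 1)   # corresponding column in the right view
    disp_right_warped = np.take_along_axis(disp_right, matched_x, axis=1)
    return np.abs(disp_left - disp_right_warped) < thresh

disp_l = np.full((4, 8), 2.0)
disp_r = np.full((4, 8), 2.0)
print(lr_consistency_mask(disp_l, disp_r).all())  # True: perfectly consistent disparities
```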
models/common.py ADDED
@@ -0,0 +1,48 @@
+ import torch
+ import torch.nn as nn
+ import torch.nn.functional as F
+
+ class UpConv(nn.Module):
+     r"""Upsample using transposed conv"""
+
+     def __init__(self, in_channels, out_channels):
+         super().__init__()
+
+         self.up = nn.ConvTranspose2d(in_channels, out_channels, kernel_size=2, stride=2, padding=0, output_padding=0)
+         self.conv = nn.Sequential(
+             nn.Conv2d(out_channels*2, out_channels, kernel_size=1, padding=0),
+             nn.ReLU(inplace=True),
+             nn.Conv2d(out_channels, out_channels, kernel_size=3, padding=1),
+         )
+
+     def forward(self, x1, x2, use_up=True):
+         x1 = x1.permute(0, 3, 1, 2).contiguous()
+         x2 = x2.permute(0, 3, 1, 2).contiguous()
+         if use_up:
+             x1 = self.up(x1)
+         x = torch.cat([x2, x1], dim=1)
+         out = self.conv(x)
+         return out.permute(0, 2, 3, 1).contiguous()  # [B, H, W, C]
+
+ class ConvGLU(nn.Module):
+     '''
+     Convolutional GLU, referenced from TransNeXt
+     '''
+     def __init__(self, dim, mlp_ratio=2, hidden_features=None, out_features=None, act_layer=nn.GELU, drop=0.):
+         super().__init__()
+         in_features = dim
+         out_features = out_features or in_features
+         hidden_features = int(mlp_ratio * in_features)
+         self.fc1 = nn.Linear(in_features, hidden_features * 2)
+         self.dwconv = nn.Conv2d(hidden_features, hidden_features, kernel_size=3, stride=1, padding=1, bias=True, groups=hidden_features)
+         self.act = act_layer()
+         self.fc2 = nn.Linear(hidden_features, out_features)
+         self.drop = nn.Dropout(drop)
+
+     def forward(self, x):  # [B, H, W, C]
+         x, v = self.fc1(x).chunk(2, dim=-1)
+         x = self.act(self.dwconv(x.permute(0, 3, 1, 2)).permute(0, 2, 3, 1).contiguous()) * v
+         x = self.drop(x)
+         x = self.fc2(x)
+         x = self.drop(x)
+         return x
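`ConvGLU` operates on channels-last tensors and preserves the spatial size; a quick shape check (the dummy tensor sizes are arbitrary):

```python
import torch
from models.common import ConvGLU

glu = ConvGLU(dim=64, mlp_ratio=2)
x = torch.randn(2, 32, 48, 64)   # [B, H, W, C], channels last as used throughout the models
print(glu(x).shape)              # torch.Size([2, 32, 48, 64])
```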
models/compile.sh ADDED
@@ -0,0 +1,4 @@
+ #!/bin/bash
+ rm -rf build/ dist/ match_attention.egg-info/ __pycache__
+ python setup.py clean
+ pip install .
models/convformer.py ADDED
@@ -0,0 +1,391 @@
1
+ from functools import partial
2
+ import torch
3
+ import torch.nn as nn
4
+ import torch.nn.functional as F
5
+
6
+ from timm.models.layers import trunc_normal_, DropPath
7
+ from timm.models.registry import register_model
8
+ from timm.models.layers.helpers import to_2tuple
9
+ class LayerNormGeneral(nn.Module):
10
+ r""" General LayerNorm for different situations.
11
+
12
+ Args:
13
+ affine_shape (int, list or tuple): The shape of affine weight and bias.
14
+ Usually the affine_shape=C, but in some implementation, like torch.nn.LayerNorm,
15
+ the affine_shape is the same as normalized_dim by default.
16
+ To adapt to different situations, we offer this argument here.
17
+ normalized_dim (tuple or list): Which dims to compute mean and variance.
18
+ scale (bool): Flag indicates whether to use scale or not.
19
+ bias (bool): Flag indicates whether to use scale or not.
20
+
21
+ We give several examples to show how to specify the arguments.
22
+
23
+ LayerNorm (https://arxiv.org/abs/1607.06450):
24
+ For input shape of (B, *, C) like (B, N, C) or (B, H, W, C),
25
+ affine_shape=C, normalized_dim=(-1, ), scale=True, bias=True;
26
+ For input shape of (B, C, H, W),
27
+ affine_shape=(C, 1, 1), normalized_dim=(1, ), scale=True, bias=True.
28
+
29
+ Modified LayerNorm (https://arxiv.org/abs/2111.11418)
30
+ that is identical to partial(torch.nn.GroupNorm, num_groups=1):
31
+ For input shape of (B, N, C),
32
+ affine_shape=C, normalized_dim=(1, 2), scale=True, bias=True;
33
+ For input shape of (B, H, W, C),
34
+ affine_shape=C, normalized_dim=(1, 2, 3), scale=True, bias=True;
35
+ For input shape of (B, C, H, W),
36
+ affine_shape=(C, 1, 1), normalized_dim=(1, 2, 3), scale=True, bias=True.
37
+
38
+ For the several MetaFormer baselines,
39
+ IdentityFormer, RandFormer and PoolFormerV2 utilize Modified LayerNorm without bias (bias=False);
40
+ ConvFormer and CAFormer utilize LayerNorm without bias (bias=False).
41
+ """
42
+
43
+ def __init__(self, affine_shape=None, normalized_dim=(-1, ), scale=True,
44
+ bias=False, eps=1e-6):
45
+ super().__init__()
46
+ self.normalized_dim = normalized_dim
47
+ self.use_scale = scale
48
+ self.use_bias = bias
49
+ self.weight = nn.Parameter(torch.ones(affine_shape)) if scale else None
50
+ self.bias = nn.Parameter(torch.zeros(affine_shape)) if bias else None
51
+ self.eps = eps
52
+
53
+ def forward(self, x):
54
+ c = x - x.mean(self.normalized_dim, keepdim=True)
55
+ s = c.pow(2).mean(self.normalized_dim, keepdim=True)
56
+ x = c / torch.sqrt(s + self.eps)
57
+ if self.use_scale:
58
+ x = x * self.weight
59
+ if self.use_bias:
60
+ x = x + self.bias
61
+ return x
62
+
63
+
64
+
65
+ def stem(in_chs, out_chs, act_layer=nn.GELU):
66
+ return nn.Sequential(
67
+ nn.Conv2d(in_chs, out_chs // 2, kernel_size=3, stride=2, padding=1),
68
+ ## nn.BatchNorm2d(out_chs // 2),
69
+ nn.InstanceNorm2d(out_chs // 2),
70
+ act_layer(),
71
+ nn.Conv2d(out_chs // 2, out_chs, kernel_size=3, stride=2, padding=1),
72
+ ## nn.BatchNorm2d(out_chs),
73
+ nn.InstanceNorm2d(out_chs),
74
+ act_layer(),
75
+ )
76
+
77
+ class Downsampling(nn.Module):
78
+ """
79
+ Downsampling implemented by a layer of convolution.
80
+ """
81
+
82
+ def __init__(self, in_channels, out_channels,
83
+ kernel_size=3, stride=2, padding=1,
84
+ pre_norm=LayerNormGeneral, post_norm=None, pre_permute=True):
85
+ super().__init__()
86
+ self.pre_norm = pre_norm(in_channels) if pre_norm else nn.Identity()
87
+ self.pre_permute = pre_permute
88
+ self.conv = nn.Conv2d(in_channels, out_channels, kernel_size=kernel_size,
89
+ stride=stride, padding=padding)
90
+ self.post_norm = post_norm(
91
+ out_channels) if post_norm else nn.Identity()
92
+
93
+ def forward(self, x):
94
+ x = self.pre_norm(x)
95
+ if self.pre_permute:
96
+ x = x.permute(0, 3, 1, 2).contiguous() # if take [B, H, W, C] as input, permute it to [B, C, H, W]
97
+ x = self.conv(x)
98
+ x = x.permute(0, 2, 3, 1).contiguous() # [B, C, H, W] -> [B, H, W, C]
99
+ x = self.post_norm(x)
100
+ return x
101
+
102
+
103
+ class Scale(nn.Module):
104
+ """
105
+ Scale vector by element multiplications.
106
+ """
107
+
108
+ def __init__(self, dim, init_value=1.0, trainable=True):
109
+ super().__init__()
110
+ self.scale = nn.Parameter(
111
+ init_value * torch.ones(dim), requires_grad=trainable)
112
+
113
+ def forward(self, x):
114
+ return x * self.scale
115
+
116
+
117
+ class LayerNormWithoutBias(nn.Module):
118
+ """
119
+ Equal to partial(LayerNormGeneral, bias=False) but faster,
120
+ because it directly utilizes the optimized F.layer_norm
121
+ """
122
+
123
+ def __init__(self, normalized_shape, eps=1e-5, **kwargs):
124
+ super().__init__()
125
+ self.eps = eps
126
+ self.bias = None
127
+ if isinstance(normalized_shape, int):
128
+ normalized_shape = (normalized_shape,)
129
+ self.weight = nn.Parameter(torch.ones(normalized_shape))
130
+ self.normalized_shape = normalized_shape
131
+
132
+ def forward(self, x):
133
+ return F.layer_norm(x, self.normalized_shape, weight=self.weight, bias=self.bias, eps=self.eps)
134
+
135
+
136
+ class SepConv(nn.Module):
137
+ r"""
138
+ Inverted separable convolution from MobileNetV2: https://arxiv.org/abs/1801.04381.
139
+ """
140
+
141
+ def __init__(self, dim, expansion_ratio=2,
142
+ act1_layer=nn.GELU, act2_layer=nn.Identity,
143
+ bias=False, kernel_size=3, padding=1,
144
+ **kwargs, ):
145
+ super().__init__()
146
+ med_channels = int(expansion_ratio * dim)
147
+ self.pwconv1 = nn.Linear(dim, med_channels, bias=bias)
148
+ self.act1 = act1_layer()
149
+ self.dwconv = nn.Conv2d(
150
+ med_channels, med_channels, kernel_size=kernel_size,
151
+ padding=padding, groups=med_channels, bias=bias) # depthwise conv
152
+ self.act2 = act2_layer()
153
+ self.pwconv2 = nn.Linear(med_channels, dim, bias=bias)
154
+
155
+ def forward(self, x):
156
+ x = self.pwconv1(x)
157
+ x = self.act1(x)
158
+ x = x.permute(0, 3, 1, 2)
159
+ x = self.dwconv(x)
160
+ x = x.permute(0, 2, 3, 1)
161
+ x = self.act2(x)
162
+ x = self.pwconv2(x)
163
+ return x
164
+
165
+ class Mlp(nn.Module):
166
+ """ MLP as used in MetaFormer models, eg Transformer, MLP-Mixer, PoolFormer, MetaFormer baslines and related networks.
167
+ Mostly copied from timm.
168
+ """
169
+
170
+ def __init__(self, dim, mlp_ratio=4, out_features=None, act_layer=nn.GELU, drop=0., bias=False, **kwargs):
171
+ super().__init__()
172
+ in_features = dim
173
+ out_features = out_features or in_features
174
+ hidden_features = int(mlp_ratio * in_features)
175
+ drop_probs = to_2tuple(drop)
176
+
177
+ self.fc1 = nn.Linear(in_features, hidden_features, bias=bias)
178
+ self.act = act_layer()
179
+ self.drop1 = nn.Dropout(drop_probs[0])
180
+ self.fc2 = nn.Linear(hidden_features, out_features, bias=bias)
181
+ self.drop2 = nn.Dropout(drop_probs[1])
182
+
183
+ def forward(self, x):
184
+ x = self.fc1(x)
185
+ x = self.act(x)
186
+ x = self.drop1(x)
187
+ x = self.fc2(x)
188
+ x = self.drop2(x)
189
+ return x
190
+
191
+
192
+ class MetaFormerBlock(nn.Module):
193
+ """
194
+ Implementation of one MetaFormer block.
195
+ """
196
+
197
+ def __init__(self, dim,
198
+ token_mixer=nn.Identity, mlp=Mlp, mlp_ratio=4,
199
+ norm_layer=nn.LayerNorm, drop=0., drop_path=0.,
200
+ layer_scale_init_value=None, res_scale_init_value=None
201
+ ):
202
+
203
+ super().__init__()
204
+
205
+ self.token_mixer = token_mixer(dim, drop=drop)
206
+ self.drop_path1 = DropPath(drop_path) if drop_path > 0. else nn.Identity()
207
+ self.norm1 = norm_layer(dim)
208
+ self.layer_scale1 = Scale(dim=dim, init_value=layer_scale_init_value) \
209
+ if layer_scale_init_value else nn.Identity()
210
+ self.res_scale1 = Scale(dim=dim, init_value=res_scale_init_value) \
211
+ if res_scale_init_value else nn.Identity()
212
+
213
+ self.norm2 = norm_layer(dim)
214
+ self.mlp = mlp(dim=dim, mlp_ratio=mlp_ratio, drop=drop)
215
+ self.drop_path2 = DropPath(drop_path) if drop_path > 0. else nn.Identity()
216
+ self.layer_scale2 = Scale(dim=dim, init_value=layer_scale_init_value) \
217
+ if layer_scale_init_value else nn.Identity()
218
+ self.res_scale2 = Scale(dim=dim, init_value=res_scale_init_value) \
219
+ if res_scale_init_value else nn.Identity()
220
+
221
+ def forward(self, x):
222
+ x = x + self.drop_path1(self.token_mixer(self.norm1(x)))
223
+ x = x + self.drop_path2(self.mlp(self.norm2(x)))
224
+ return x
225
+
226
+
227
+ class MetaFormer(nn.Module):
228
+ r""" MetaFormer
229
+ A PyTorch impl of : `MetaFormer Baselines for Vision` -
230
+ https://arxiv.org/abs/2210.13452
231
+
232
+ Args:
233
+ in_chans (int): Number of input image channels. Default: 3.
234
+ num_classes (int): Number of classes for classification head. Default: 1000.
235
+ depths (list or tuple): Number of blocks at each stage. Default: [2, 2, 6, 2].
236
+ dims (int): Feature dimension at each stage. Default: [64, 128, 320, 512].
237
+ downsample_layers: (list or tuple): Downsampling layers before each stage.
238
+ token_mixers (list, tuple or token_fcn): Token mixer for each stage. Default: nn.Identity.
239
+ mlps (list, tuple or mlp_fcn): Mlp for each stage. Default: Mlp.
240
+ norm_layers (list, tuple or norm_fcn): Norm layers for each stage. Default: partial(LayerNormGeneral, eps=1e-6, bias=False).
241
+ drop_path_rate (float): Stochastic depth rate. Default: 0.
242
+ layer_scale_init_values (list, tuple, float or None): Init value for Layer Scale. Default: None.
243
+ None means not to use the layer scale. From: https://arxiv.org/abs/2103.17239.
244
+ res_scale_init_values (list, tuple, float or None): Init value for Layer Scale. Default: [None, None, 1.0, 1.0].
245
+ None means not to use the layer scale. From: https://arxiv.org/abs/2110.09456.
246
+ head_fn: classification head. Default: nn.Linear.
247
+ """
248
+
249
+ def __init__(self, in_chans=3, num_classes=1000,
250
+ depths=[2, 2, 6, 2],
251
+ dims=[64, 128, 320, 512],
252
+ downsample_layers=[stem] + [Downsampling]*3,
253
+ token_mixers=nn.Identity,
254
+ mlps=Mlp, mlp_ratio=4,
255
+ norm_layers=partial(LayerNormWithoutBias, eps=1e-6),
256
+ drop_path_rate=0.,
257
+ layer_scale_init_values=None,
258
+ res_scale_init_values=[None, None, 1.0, 1.0],
259
+ head_fn=nn.Linear,
260
+ **kwargs,
261
+ ):
262
+ super().__init__()
263
+ self.num_classes = num_classes
264
+
265
+ if not isinstance(depths, (list, tuple)):
266
+ depths = [depths] # it means the model has only one stage
267
+ if not isinstance(dims, (list, tuple)):
268
+ dims = [dims]
269
+
270
+ self.dims = dims
271
+ self.depths = depths
272
+
273
+ num_stage = len(depths)
274
+ self.num_stage = num_stage
275
+
276
+ down_dims = [in_chans] + dims
277
+ self.downsample_layers = nn.ModuleList([downsample_layers[i](down_dims[i], down_dims[i+1]) for i in range(num_stage)])
278
+
279
+ if not isinstance(token_mixers, (list, tuple)):
280
+ token_mixers = [token_mixers] * num_stage
281
+ self.token_mixers = token_mixers
282
+
283
+ if not isinstance(mlps, (list, tuple)):
284
+ mlps = [mlps] * num_stage
285
+
286
+ if not isinstance(norm_layers, (list, tuple)):
287
+ norm_layers = [norm_layers] * num_stage
288
+
289
+ dp_rates = [x.item() for x in torch.linspace(0, drop_path_rate, sum(depths))]
290
+
291
+ if not isinstance(layer_scale_init_values, (list, tuple)):
292
+ layer_scale_init_values = [layer_scale_init_values] * num_stage
293
+ if not isinstance(res_scale_init_values, (list, tuple)):
294
+ res_scale_init_values = [res_scale_init_values] * num_stage
295
+
296
+ self.stages = nn.ModuleList() # each stage consists of multiple metaformer blocks
297
+ cur = 0
298
+ for i in range(num_stage):
299
+ stage = nn.ModuleList(
300
+ [MetaFormerBlock(dim=dims[i], token_mixer=token_mixers[i],
301
+ mlp=mlps[i], mlp_ratio=mlp_ratio, norm_layer=norm_layers[i],
302
+ drop_path=dp_rates[cur + j],
303
+ layer_scale_init_value=layer_scale_init_values[i],
304
+ res_scale_init_value=res_scale_init_values[i],
305
+ ) for j in range(depths[i])]
306
+ )
307
+ self.stages.append(stage)
308
+ cur += depths[i]
309
+
310
+ self.head = head_fn(dims[-1], num_classes)
311
+
312
+ self.apply(self._init_weights)
313
+
314
+ def _init_weights(self, m):
315
+ if isinstance(m, (nn.Conv2d, nn.Linear)):
316
+ trunc_normal_(m.weight, std=.02)
317
+ if m.bias is not None:
318
+ nn.init.constant_(m.bias, 0)
319
+
320
+
321
+ def forward(self, x):
322
+ outs = []
323
+ for i in range(self.num_stage):
324
+ x = self.downsample_layers[i](x)
325
+ if i==0: x = x.permute(0, 2, 3, 1).contiguous() # [B, C, H, W] -> [B, H, W, C]
326
+ for j in range(self.depths[i]):
327
+ x= self.stages[i][j](x)
328
+ outs.append(x) # [B, H, W, C]
329
+ return outs
330
+
331
+ def convformer(variant='tiny'):
332
+ if variant == 'tiny':
333
+ model = convformer_t()
334
+
335
+ elif variant == 'small':
336
+ model = convformer_s()
337
+
338
+ elif variant == 'base':
339
+ model = convformer_b()
340
+
341
+ elif variant == 'large':
342
+ model = convformer_l()
343
+
344
+ else:
345
+ raise NotImplementedError
346
+
347
+ return model
348
+
349
+ @register_model
350
+ def convformer_t(**kwargs):
351
+ model = MetaFormer(
352
+ depths=[2, 2, 6, 2],
353
+ dims=[32, 64, 128, 160],
354
+ mlps=Mlp, mlp_ratio=2,
355
+ token_mixers=[SepConv, SepConv, SepConv, SepConv],
356
+ head_fn=nn.Linear,
357
+ **kwargs)
358
+ return model
359
+
360
+ @register_model
361
+ def convformer_s(**kwargs):
362
+ model = MetaFormer(
363
+ depths=[2, 2, 6, 2],
364
+ dims=[64, 128, 160, 320],
365
+ mlps=Mlp, mlp_ratio=2,
366
+ token_mixers=[SepConv, SepConv, SepConv, SepConv],
367
+ head_fn=nn.Linear,
368
+ **kwargs)
369
+ return model
370
+
371
+ @register_model
372
+ def convformer_b(**kwargs):
373
+ model = MetaFormer(
374
+ depths=[2, 2, 6, 2],
375
+ dims=[128, 256, 320, 512],
376
+ mlps=Mlp, mlp_ratio=2,
377
+ token_mixers=[SepConv, SepConv, SepConv, SepConv],
378
+ head_fn=nn.Linear,
379
+ **kwargs)
380
+ return model
381
+
382
+ @register_model
383
+ def convformer_l(**kwargs):
384
+ model = MetaFormer(
385
+ depths=[2, 2, 6, 2],
386
+ dims=[256, 384, 512, 768],
387
+ mlps=Mlp, mlp_ratio=2,
388
+ token_mixers=[SepConv, SepConv, SepConv, SepConv],
389
+ head_fn=nn.Linear,
390
+ **kwargs)
391
+ return model
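The backbone returns channels-last features at strides 4, 8, 16 and 32; a small sketch that instantiates the tiny variant and prints the pyramid shapes (the input size is arbitrary but should be divisible by 32):

```python
import torch
from models.convformer import convformer

backbone = convformer('tiny')            # dims = [32, 64, 128, 160]
x = torch.randn(1, 3, 256, 320)
with torch.no_grad():
    feats = backbone(x)
for f in feats:
    print(tuple(f.shape))                # (1, 64, 80, 32) (1, 32, 40, 64) (1, 16, 20, 128) (1, 8, 10, 160)
```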
models/cost_volume.py ADDED
@@ -0,0 +1,179 @@
1
+ from __future__ import print_function
2
+ import torch
3
+ import torch.nn as nn
4
+ import torch.nn.functional as F
5
+ from models.convformer import LayerNormWithoutBias
6
+ from utils.utils import init_coords
7
+
8
+ class GlobalCorrelation(nn.Module):
9
+
10
+ def __init__(self, dim):
11
+ super().__init__()
12
+ self.norm = LayerNormWithoutBias(dim)
13
+ self.q = nn.Linear(dim, dim, bias=False)
14
+ self.k = nn.Linear(dim, dim, bias=False)
15
+ self.scale = dim**-0.5
16
+
17
+ def forward(self, x, stereo=True):
18
+ x = self.norm(x)
19
+ ref, tgt = x.chunk(2, dim=0)
20
+ ref, tgt = self.q(ref), self.k(tgt)
21
+ # global correlation on horizontal direction
22
+ B, H, W, C = ref.shape
23
+
24
+ if stereo:
25
+ correlation = torch.matmul(ref, tgt.transpose(-2, -1))*self.scale # [B, H, W, W]
26
+
27
+ # mask subsequent positions to make disparity positive
28
+ mask = torch.triu(torch.ones((W, W), dtype=ref.dtype, device=ref.device), diagonal=1) # [W, W]
29
+ valid_mask = (mask == 0).unsqueeze(0).unsqueeze(0).repeat(B, H, 1, 1) # [B, H, W, W]
30
+
31
+ mask_ = torch.triu(torch.ones((W, W), dtype=ref.dtype, device=ref.device), diagonal=0) # mask for input order [right, left]
32
+ valid_mask_ = (mask_ != 0).unsqueeze(0).unsqueeze(0).repeat(B, H, 1, 1) # upper right
33
+ valid_mask = torch.cat((valid_mask, valid_mask_), dim=0) # [B*2, H, W, W]
34
+ correlation = torch.cat((correlation, correlation.permute(0, 1, 3, 2)), dim=0) # [B*2, H, W, W]
35
+ B = B*2
36
+
37
+ correlation[~valid_mask] = -1e9 if correlation.dtype == torch.float32 else -1e4
38
+
39
+ # build volume from correlation
40
+ D = W # all-pair correlation
41
+ volume = correlation.new_zeros([B, D, H, W])
42
+ for d in range(D): # most time-consuming
43
+ volume[:B//2, d, :, d:] = correlation[:B//2, :, range(d, W), range(W-d)]
44
+ volume[B//2:, d, :, :(W-d)] = correlation[B//2:, :, range(W-d), range(d, W)]
45
+
46
+ volume = F.softmax(volume, dim=1).to(volume.dtype)
47
+
48
+ volume_clone = volume.clone()
49
+ for d in range(D): # fill out of view # second time-consuming
50
+ volume_clone[:B//2, d, :, :d] = volume[:B//2, d, :, d:d+1] # left
51
+ volume_clone[B//2:, d, :, W-1-d:] = volume[B//2:, d, :, W-1-d:(W-d)] # right
52
+
53
+ flow = local_disparity_estimator(volume_clone)
54
+ return flow, volume_clone
55
+ else:
56
+ init_grid = init_coords(ref) # [B, H, W, 2]
57
+ ref = ref.view(B, -1, C) # [B, H*W, C]
58
+ tgt = tgt.view(B, -1, C) # [B, H*W, C]
59
+
60
+ correlation = torch.matmul(ref, tgt.transpose(-2, -1))*self.scale # [B, H*W, H*W]
61
+ correlation = torch.cat((correlation, correlation.permute(0, 2, 1)), dim=0) # [2*B, H*W, H*W]
62
+ init_grid = init_grid.repeat(2, 1, 1, 1) # [2*B, H, W, 2]
63
+ B = B * 2
64
+
65
+ prob = F.softmax(correlation, dim=-1).to(correlation.dtype) # [B, H*W, H*W]
66
+
67
+ flow = local_flow_estimator(prob, init_grid)
68
+
69
+ return flow, prob.view(B, H, W, H*W)
70
+
71
+ def local_flow_estimator(prob, init_grid, k=5):
72
+ """
73
+ Flow estimator using weighted sum within local window centered at max prob
74
+ Args:
75
+ prob: normalized correlation volume [B, H*W, H*W]
76
+ init_grid: init coordinate grid [B, H, W, 2]
77
+ k: local window size (odd number)
78
+ Returns:
79
+ flow: optical field [B, H, W, 2]
80
+ """
81
+ B, H, W, _ = init_grid.shape
82
+ r = k // 2
83
+ device = prob.device
84
+
85
+ prob_blur = F.avg_pool2d(prob, kernel_size=k, stride=1, padding=r).view(B, H*W, H*W)
86
+
87
+ max_prob, max_idx = torch.max(prob_blur, dim=-1) # [B, H*W]
88
+ max_idx = max_idx.unsqueeze(-1) # [B, H*W, 1]
89
+ target_coords = init_grid # [B, H, W, 2]
90
+ max_y = max_idx // W # [B, H*W, 1]
91
+ max_x = max_idx % W # [B, H*W, 1]
92
+ max_y = torch.clamp(max_y, r, H-1-r)
93
+ max_x = torch.clamp(max_x, r, W-1-r)
94
+
95
+ yy, xx = torch.meshgrid(torch.arange(-r, r+1, device=device), torch.arange(-r, r+1, device=device), indexing='ij')
96
+ offsets_y = yy.reshape(1, 1, k*k, 1) # [1, 1, k*k, 1]
97
+ offsets_x = xx.reshape(1, 1, k*k, 1) # [1, 1, k*k, 1]
98
+ sample_y = max_y.unsqueeze(2) + offsets_y # [B, H*W, k*k, 1]
99
+ sample_x = max_x.unsqueeze(2) + offsets_x # [B, H*W, k*k, 1]
100
+ sample_y = sample_y.long().squeeze(-1) # [B, H*W, k*k]
101
+ sample_x = sample_x.long().squeeze(-1) # [B, H*W, k*k]
102
+
103
+ batch_idx = torch.arange(B, device=device).view(B, 1, 1).expand(-1, H*W, k*k)
104
+ window_coords = target_coords[batch_idx, sample_y, sample_x] # [B, H*W, k*k, 2]
105
+
106
+ window_indices = sample_y * W + sample_x # [B, H*W, k*k]
107
+ window_probs = torch.gather(prob, dim=-1, index=window_indices) # [B, H*W, k*k]
108
+
109
+ mean_prob = 1.0 / (H * W)
110
+ invalid_mask = window_probs < mean_prob
111
+ window_probs[invalid_mask] = 0
112
+
113
+ window_probs_sum = window_probs.sum(dim=-1, keepdim=True).to(window_probs.dtype)
114
+ window_probs_sum = torch.clamp(window_probs_sum, min=torch.finfo(window_probs_sum.dtype).tiny)
115
+ normalized_probs = window_probs / window_probs_sum # [B, H*W, k*k]
116
+ normalized_probs = normalized_probs.unsqueeze(-1) # [B, H*W, k*k, 1]
117
+ correspondence = torch.sum(normalized_probs * window_coords, dim=2).to(normalized_probs.dtype) # [B, H*W, 2]
118
+ correspondence = correspondence.view(B, H, W, 2) # [B, H, W, 2]
119
+ flow = correspondence - init_grid
120
+
121
+ return flow
122
+
123
+ def local_disparity_estimator(cv, k=5):
124
+ """
125
+ Disparity estimator using weighted sum within local window centered at max prob
126
+ Args:
127
+ cv: cost volume [B, D, H, W]
128
+ k: local window size (odd number)
129
+ Returns:
130
+ flow: [B, H, W, 2]
131
+ """
132
+ B, D, H, W = cv.shape
133
+ r = k // 2
134
+ device = cv.device
135
+
136
+ cv_blur = F.avg_pool1d(cv.permute(0, 2, 3, 1).view(B, -1, D), kernel_size=k, stride=1, padding=r).view(B, H, W, D).permute(0, 3, 1, 2)
137
+
138
+ # find max idx in blurred cv
139
+ max_cv, max_idx = torch.max(cv_blur, dim=1) # max_idx: [B, H, W]
140
+ max_idx = max_idx.unsqueeze(1) # [B, 1, H, W]
141
+ max_idx = torch.clamp(max_idx, r, D-1-r) # [B, 1, H, W]
142
+
143
+ offsets = torch.arange(-r, r+1, device=device).view(1, k, 1, 1) # [1, k, 1, 1]
144
+
145
+ sample_idx = max_idx + offsets # [B, k, H, W]
146
+ sample_idx = torch.clamp(sample_idx, 0, D-1)
147
+
148
+ batch_idx = torch.arange(B, device=device).view(B, 1, 1, 1).expand(-1, k, H, W)
149
+ h_idx = torch.arange(H, device=device).view(1, 1, H, 1).expand(B, k, H, W)
150
+ w_idx = torch.arange(W, device=device).view(1, 1, 1, W).expand(B, k, H, W)
151
+
152
+ window_probs = cv[batch_idx, sample_idx, h_idx, w_idx] # [B, k, H, W]
153
+
154
+ mean_prob = 1.0 / D
155
+ invalid_mask = window_probs < mean_prob
156
+ window_probs[invalid_mask] = 0
157
+
158
+ # normalize within local window
159
+ window_probs_sum = window_probs.sum(dim=1, keepdim=True).to(window_probs.dtype) # [B, 1, H, W]
160
+ window_probs_sum = torch.clamp(window_probs_sum, min=torch.finfo(window_probs_sum.dtype).tiny)
161
+ normalized_probs = window_probs / window_probs_sum # [B, k, H, W]
162
+
163
+ window_disp = sample_idx.to(normalized_probs.dtype) # [B, k, H, W]
164
+
165
+ disp = torch.sum(normalized_probs * window_disp, dim=1).to(normalized_probs.dtype).unsqueeze(-1) # [B, H, W, 1]
166
+
167
+ return disp_to_flow(disp, B)
168
+
169
+ def disp_to_flow(disp, B):
170
+ ## disp[:B//2, ...] = -disp[:B//2, ...] # negative left flow
171
+
172
+ ## for onnx support
173
+ batch_indices = torch.arange(B, device=disp.device)
174
+ mask = batch_indices < (B // 2)
175
+
176
+ disp = torch.where(mask.view(B, 1, 1, 1), -disp, disp)
177
+
178
+ flow = torch.cat((disp, torch.zeros_like(disp)), dim=-1).contiguous() # [B, H, W, 2]
179
+ return flow
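`GlobalCorrelation` masks matches that would yield a negative disparity before the softmax. The sketch below shows that masking idea on one row of made-up correlation scores, using a plain soft-argmax expectation as a simpler stand-in for the windowed estimator implemented above:

```python
import torch
import torch.nn.functional as F

W = 6
corr = torch.randn(W, W)                          # corr[x_left, x_right] for a single image row
mask = torch.triu(torch.ones(W, W), diagonal=1)   # x_right > x_left would mean negative disparity
corr = corr.masked_fill(mask.bool(), float('-inf'))

prob = F.softmax(corr, dim=-1)                    # match distribution over valid right-image columns
x_left = torch.arange(W, dtype=torch.float32).unsqueeze(1)
x_right = torch.arange(W, dtype=torch.float32).unsqueeze(0)
disparity = (prob * (x_left - x_right)).sum(dim=-1)
print(disparity)                                  # non-negative soft-argmax disparity per left column
```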
models/mat_pytorch_impl.py ADDED
@@ -0,0 +1,178 @@
1
+ import torch
2
+
3
+ def compute_bilinear_weights(grid):
4
+ """
5
+ Compute bilinear weights for BilinearSoftmax
6
+ Args:
7
+ grid: [..., 2], (x, y)
8
+ Returns:
9
+ weights: [..., 4], [nw, ne, sw, se]
10
+ """
11
+ x = grid[..., 0]
12
+ y = grid[..., 1]
13
+
14
+ x0 = torch.floor(x)
15
+ y0 = torch.floor(y)
16
+
17
+ dx = x - x0
18
+ dy = y - y0
19
+
20
+ nw = (1 - dx) * (1 - dy)
21
+ ne = dx * (1 - dy)
22
+ sw = (1 - dx) * dy
23
+ se = dx * dy
24
+
25
+ weights = torch.stack([nw, ne, sw, se], dim=-1)
26
+
27
+ return weights
28
+
29
+ def compute_match_attention(q, k, m_id, win_r, H, W):
30
+ """
31
+ Args:
32
+ q: [B, N, h, C] # Query tensor
33
+ k: [B, N, h, C] # Key tensor
34
+ m_id: [B, N, h, 2] # Sampling centers, last dim is (x, y)
35
+ win_r: List[int] # Sampling window radii along x and y
36
+ H: int # Height
37
+ W: int # Width
38
+
39
+ Returns:
40
+ output: [B, N, h, M] where M = (2*win_r[0]+2)*(2*win_r[1]+2)
41
+ """
42
+ B, N, h, C = q.shape
43
+ M = (2*win_r[0] + 2)*(2*win_r[1] + 2)
44
+
45
+ dx = torch.arange(-win_r[0], win_r[0] + 2, device=q.device, dtype=torch.long)
46
+ dy = torch.arange(-win_r[1], win_r[1] + 2, device=q.device, dtype=torch.long)
47
+ dy, dx = torch.meshgrid(dy, dx, indexing='ij')
48
+ offsets = torch.stack((dx, dy), dim=-1).reshape(M, 2) # [M, 2]
49
+
50
+ centers = m_id.unsqueeze(3) # [B, N, h, 1, 2]
51
+ offsets = offsets.view(1, 1, 1, M, 2) # [1, 1, 1, M, 2]
52
+ coords = centers + offsets # [B, N, h, M, 2]
53
+
54
+ x_coords = coords[..., 0] # [B, N, h, M]
55
+ y_coords = coords[..., 1] # [B, N, h, M]
56
+
57
+ # Clamp coordinates to valid range
58
+ x_coords = x_coords.clamp(0, W-1)
59
+ y_coords = y_coords.clamp(0, H-1)
60
+
61
+ indices = y_coords * W + x_coords # [B, N, h, M]
62
+
63
+ # [B, N, h, C] -> [B, N, h, M, C]
64
+ k_expanded = k.unsqueeze(3).expand(-1, -1, -1, M, -1)
65
+
66
+ # [B, N, h, M] -> [B, N, h, M, C]
67
+ indices_gather = indices.unsqueeze(-1).expand(-1, -1, -1, -1, C)
68
+
69
+ # [B, N, h, M, C]
70
+ k_sampled = torch.gather(k_expanded, dim=1, index=indices_gather)
71
+
72
+ # [B, N, h, M, C] -> [B, N, h, M]
73
+ # negative L1 norm
74
+ output = -torch.abs(q.unsqueeze(3) - k_sampled).sum(dim=-1)
75
+
76
+ return output, indices_gather
77
+
78
+ def attn_scatter(attn, win_r):
79
+ """
80
+ Scatter the attn to four sub-windows
81
+
82
+ Args:
83
+ attn: [B, N, h, M], M = (2*win_r[0]+2) * (2*win_r[1]+2)
84
+ win_r: window radius
85
+
86
+ Returns:
87
+ attn_sub: [B, N, h, 4, M_sub] attn for four sub-windows
88
+ """
89
+ B, N, h, M = attn.shape
90
+ M_sub = (2*win_r[0] + 1)*(2*win_r[1] + 1)
91
+
92
+ # [B, N, h, H_win, W_win]
93
+ attn_2d = attn.view(B, N, h, 2*win_r[0] + 2, 2*win_r[1] + 2)
94
+
95
+ # nw [0, 0] offset
96
+ win_nw = attn_2d[..., :2*win_r[0]+1, :2*win_r[1]+1]
97
+ # ne [1, 0] offset
98
+ win_ne = attn_2d[..., :2*win_r[0]+1, 1:2*win_r[1]+2]
99
+ # sw [0, 1] offset
100
+ win_sw = attn_2d[..., 1:2*win_r[0]+2, :2*win_r[1]+1]
101
+ # se [1, 1] offset
102
+ win_se = attn_2d[..., 1:2*win_r[0]+2, 1:2*win_r[1]+2]
103
+
104
+ win_nw = win_nw.reshape(B, N, h, M_sub)
105
+ win_ne = win_ne.reshape(B, N, h, M_sub)
106
+ win_sw = win_sw.reshape(B, N, h, M_sub)
107
+ win_se = win_se.reshape(B, N, h, M_sub)
108
+
109
+ attn_sub = torch.stack([win_nw, win_ne, win_sw, win_se], dim=3)
110
+
111
+ return attn_sub
112
+
113
+ def attn_gather(attn_sub, win_r):
114
+ """
115
+ Gather the four attn_sub to attn
116
+
117
+ Args:
118
+ attn_sub: [B, N, h, 4, M_sub]
119
+ win_r: window radius
120
+
121
+ Returns:
122
+ merged_attn: [B, N, h, M]
123
+ """
124
+ B, N, h, _, M_sub = attn_sub.shape
125
+
126
+ merged = torch.zeros(B, N, h, 2*win_r[0] + 2, 2*win_r[1] + 2, device=attn_sub.device, dtype=attn_sub.dtype)
127
+
128
+ # nw [0, 0] offset
129
+ win_nw = attn_sub[:, :, :, 0, :].view(B, N, h, 2*win_r[0]+1, 2*win_r[1]+1)
130
+ merged[..., :2*win_r[0]+1, :2*win_r[1]+1] += win_nw
131
+
132
+ # ne [1, 0] offset
133
+ win_ne = attn_sub[:, :, :, 1, :].view(B, N, h, 2*win_r[0]+1, 2*win_r[1]+1)
134
+ merged[..., :2*win_r[0]+1, 1:2*win_r[1]+2] += win_ne
135
+
136
+ # sw [0, 1] offset
137
+ win_sw = attn_sub[:, :, :, 2, :].view(B, N, h, 2*win_r[0]+1, 2*win_r[1]+1)
138
+ merged[..., 1:2*win_r[0]+2, :2*win_r[1]+1] += win_sw
139
+
140
+ # se [1, 1] offset
141
+ win_se = attn_sub[:, :, :, 3, :].view(B, N, h, 2*win_r[0]+1, 2*win_r[1]+1)
142
+ merged[..., 1:2*win_r[0]+2, 1:2*win_r[1]+2] += win_se
143
+
144
+ merged_attn = merged.view(B, N, h, -1)
145
+
146
+ return merged_attn
147
+
148
+ def compute_bilinear_softmax(attn, bilinear_weight, win_r):
149
+ """
150
+ Bilinear Softmax: attention sampled at a continuous (sub-pixel) position
151
+
152
+ Args:
153
+ attn: [B, N, h, M] attention logits at discrete window positions
+ bilinear_weight: [B, N, h, 4] bilinear weights [nw, ne, sw, se]
154
+ win_r: window radius
155
+
156
+ Returns:
157
+ output: [B, N, h, M] effective attention at the continuous position
158
+ """
159
+ attn_sub = attn_scatter(attn, win_r) # [B, N, h, 4, M_sub]
160
+
161
+ attn_weighted = bilinear_weight.unsqueeze(-1)*attn_sub.softmax(dim=-1)
162
+
163
+ output = attn_gather(attn_weighted, win_r) # [B, N, h, M]
164
+
165
+ return output
166
+
167
+ def attention_aggregate(v, attn, indices_gather, win_r):
168
+ """ Aggregate v [B, N, h, C] over the sampled window with attention weights attn [B, N, h, M] """
169
+ B, N, h, C = v.shape
170
+ M = (2*win_r[0] + 2)*(2*win_r[1] + 2)
171
+
172
+ # [B, N, h, C] -> [B, N, h, M, C]
173
+ v_expanded = v.unsqueeze(3).expand(-1, -1, -1, M, -1)
174
+ v_sampled = torch.gather(v_expanded, dim=1, index=indices_gather)
175
+
176
+ output = (attn.unsqueeze(-1)*v_sampled).sum(dim=3)
177
+
178
+ return output.view(B, N, -1)
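The reference ops above mirror the fused CUDA kernel released further down. Below is a minimal sketch of how they might compose into a single MatchAttention step; the wrapper name and call order are illustrative assumptions, not the repo's actual module (which calls the fused op instead):

```python
import torch

def match_attention_reference(q, k, v, m_offset, win_r, H, W, scale=1.0):
    # q, k, v: [B, N, h, C]; m_offset: [B, N, h, 2] continuous (x, y) sampling centers; N == H * W
    bilinear_weight = compute_bilinear_weights(m_offset)        # [B, N, h, 4], order [nw, ne, sw, se]
    m_id = torch.floor(m_offset).long()                         # integer window centers for gathering
    attn, indices_gather = compute_match_attention(q, k, m_id, win_r, H, W)  # negative L1 logits
    attn = compute_bilinear_softmax(attn * scale, bilinear_weight, win_r)    # softmax per sub-window
    return attention_aggregate(v, attn, indices_gather, win_r)  # [B, N, h * C]
```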
models/match_former_ops.py ADDED
@@ -0,0 +1,121 @@
1
+ import torch
2
+ import torch.nn as nn
3
+ from typing import List, Tuple
4
+
5
+
6
+ @torch.library.custom_op("match_attention::fused_forward_ops", mutates_args={"output", "attn_out"})
7
+ def fused_forward_ops(
8
+ max_offset: torch.Tensor,
9
+ q: torch.Tensor,
10
+ k: torch.Tensor,
11
+ v: torch.Tensor,
12
+ output: torch.Tensor,
13
+ attn_out: torch.Tensor,
14
+ H: int,
15
+ W: int,
16
+ win_r: List[int],
17
+ attn_num: int,
18
+ attn_type: str,
19
+ scale: float
20
+ ) -> None:
21
+ """
22
+ Opaque custom op for fused forward pass that prevents torch.compile tracing.
23
+
24
+ This wrapper ensures that torch.compile treats this as an opaque operation
25
+ and doesn't try to trace into the CUDA kernel internals.
26
+ """
27
+ # Call the original CUDA extension
28
+ try:
29
+ import match_attention
30
+ match_attention.fused_forward(
31
+ max_offset, q, k, v, output, attn_out,
32
+ H, W, win_r, attn_num, attn_type, scale
33
+ )
34
+ except ImportError:
35
+ # Fallback to torch.ops if direct import fails
36
+ torch.ops.match_attention.fused_forward(
37
+ max_offset, q, k, v, output, attn_out,
38
+ H, W, win_r, attn_num, attn_type, scale
39
+ )
40
+
41
+
42
+ @fused_forward_ops.register_fake
43
+ def _(max_offset, q, k, v, output, attn_out, H, W, win_r, attn_num, attn_type, scale):
44
+ """
45
+ Fake implementation for torch.compile that defines tensor shapes and dtypes
46
+ without actually executing the kernel.
47
+ """
48
+ # Validate input shapes
49
+ B, N, C = q.shape
50
+ h = max_offset.size(2)
51
+
52
+ # Ensure output tensors have correct shapes
53
+ torch._check(output.shape == (B, N, C), lambda: f"output shape mismatch: expected {(B, N, C)}, got {output.shape}")
54
+ torch._check(attn_out.shape == (B, N, h, attn_num), lambda: f"attn_out shape mismatch: expected {(B, N, h, attn_num)}, got {attn_out.shape}")
55
+
56
+ # Ensure output tensors have correct dtypes and devices
57
+ torch._check(output.dtype == q.dtype, lambda: f"output dtype mismatch: expected {q.dtype}, got {output.dtype}")
58
+ torch._check(attn_out.dtype == q.dtype, lambda: f"attn_out dtype mismatch: expected {q.dtype}, got {attn_out.dtype}")
59
+ torch._check(output.device == q.device, lambda: f"output device mismatch: expected {q.device}, got {output.device}")
60
+ torch._check(attn_out.device == q.device, lambda: f"attn_out device mismatch: expected {q.device}, got {attn_out.device}")
61
+
62
+ return None
63
+
64
+
65
+ class MF_FusedForwardOps(nn.Module):
66
+ """
67
+ Opaque MatchAttention fused forward, optimized for torch.compile
68
+
69
+ This version uses torch.library.custom_op to create opaque custom operators,
70
+ preventing torch.compile from tracing into CUDA kernel internals.
71
+ """
72
+
73
+ def __init__(self):
74
+ super().__init__()
75
+
76
+ def forward(
77
+ self,
78
+ max_offset: torch.Tensor,
79
+ q: torch.Tensor,
80
+ k: torch.Tensor,
81
+ v: torch.Tensor,
82
+ H: int,
83
+ W: int,
84
+ win_r: List[int],
85
+ attn_num: int,
86
+ attn_type: str = 'l1_norm',
87
+ scale: float = 1.0
88
+ ) -> Tuple[torch.Tensor, torch.Tensor]:
89
+ """
90
+ Fused forward
91
+
92
+ Args:
93
+ max_offset: Offset tensor with shape [B, N, h, 2]
94
+ q: Query tensor with shape [B, N, C]
95
+ k: Key tensor with shape [B, N, C]
96
+ v: Value tensor with shape [B, N, C]
97
+ H: Feature map height
98
+ W: Feature map width
99
+ win_r: Window radius [r_h, r_w]
100
+ attn_num: Number of sampled window positions per head, (2*win_r[0]+2)*(2*win_r[1]+2)
101
+ attn_type: Attention type ('l1_norm' or 'l2_norm')
102
+ scale: Scale factor
103
+
104
+ Returns:
105
+ output: Output features with shape [B, N, C]
106
+ attn_out: Attention weights with shape [B, N, h, attn_num]
107
+ """
108
+ B, N, C = q.shape
109
+ h = max_offset.size(2)
110
+
111
+ # Create output tensors
112
+ output = torch.zeros_like(v)
113
+ attn_out = q.new_zeros([B, N, h, attn_num])
114
+
115
+ # Call opaque custom operator
116
+ fused_forward_ops(
117
+ max_offset, q, k, v, output, attn_out,
118
+ H, W, win_r, attn_num, attn_type, scale
119
+ )
120
+
121
+ return output, attn_out
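A rough usage sketch for the opaque wrapper. It requires the compiled `match_attention` CUDA extension and tensors laid out with N == H * W; all sizes below are illustrative assumptions:

```python
import torch

B, H, W, C, h = 2, 64, 128, 128, 4
N = H * W
win_r = [1, 1]
attn_num = (2 * win_r[0] + 2) * (2 * win_r[1] + 2)    # 16 sampled positions per head

fused = MF_FusedForwardOps()
q = torch.randn(B, N, C, device='cuda')
k = torch.randn(B, N, C, device='cuda')
v = torch.randn(B, N, C, device='cuda')
max_offset = torch.zeros(B, N, h, 2, device='cuda')   # continuous (x, y) sampling centers

output, attn_out = fused(max_offset, q, k, v, H, W, win_r, attn_num, attn_type='l1_norm', scale=1.0)
# output: [B, N, C], attn_out: [B, N, h, attn_num]
```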
models/match_stereo.py ADDED
@@ -0,0 +1,130 @@
1
+ import torch
2
+ import torch.nn as nn
3
+ import torch.nn.functional as F
4
+
5
+ from timm.models.layers import trunc_normal_
6
+ from models.common import UpConv
7
+ from models.convformer import convformer
8
+ from models.attention_blocks import MatchAttentionBlock
9
+ from models.cost_volume import GlobalCorrelation
10
+
11
+ class MatchStereo(nn.Module):
12
+ def __init__(self, args,
13
+ refine_win_rs=[2, 2, 1, 1], # refine window radius at 1/32, 1/16, 1/8, 1/4
14
+ refine_nums=[8, 8, 8, 2],
15
+ num_heads=[4, 4, 4, 4],
16
+ mlp_ratios=[2, 2, 2, 2],
17
+ drop_path=0.):
18
+ super().__init__()
19
+ self.refine_nums = refine_nums
20
+
21
+ self.encoder = convformer(args.variant)
22
+ self.channels = self.encoder.dims[::-1] # resolution low to high
23
+ self.num_heads = num_heads
24
+ self.head_dims = [c//h for c, h in zip(self.channels, self.num_heads)]
25
+
26
+ self.factor = 2
27
+ self.factor_last = 2**(len(self.channels) - len(refine_nums) + 2)
28
+
29
+ self.field_dim = 2 # 2(flow)
30
+
31
+ self.up_decoders = nn.ModuleList()
32
+ self.up_masks = nn.ModuleList()
33
+ for i in range(len(self.channels)):
34
+ if i > 0:
35
+ self.up_decoders.append(UpConv(self.channels[i-1], self.channels[i]))
36
+ self.up_masks.append(
37
+ nn.Sequential(
38
+ nn.Conv2d(self.channels[i-1], self.channels[i-1], 3, padding=1),
39
+ nn.ReLU(inplace=True),
40
+ nn.Conv2d(self.channels[i-1], (self.factor**2)*9, 1, padding=0))
41
+ )
42
+ else:
43
+ self.up_decoders.append(nn.Identity())
44
+ self.up_masks.append(nn.Identity())
45
+
46
+ self.up_masks.append(
47
+ nn.Sequential(
48
+ nn.Conv2d(self.channels[-1], self.channels[-1]*2, 3, padding=1),
49
+ nn.ReLU(inplace=True),
50
+ nn.Conv2d(self.channels[-1]*2, (self.factor_last**2)*9, 1, padding=0)))
51
+
52
+ dp_rates = [x.item() for x in torch.linspace(0, drop_path, sum(refine_nums))]
53
+ # MatchAttention
54
+ self.match_attentions = nn.ModuleList()
55
+ for i in range(len(refine_nums)):
56
+ self.match_attentions.append(
57
+ MatchAttentionBlock(args, self.channels[i], win_r=refine_win_rs[i],
58
+ num_layer=refine_nums[i], num_head=self.num_heads[i], head_dim=self.head_dims[i],
59
+ mlp_ratio=mlp_ratios[i], field_dim=self.field_dim,
60
+ dp_rates=dp_rates[sum(refine_nums[:i]):sum(refine_nums[:i+1])])
61
+ )
62
+
63
+ self.init_correlation_volume = GlobalCorrelation(self.channels[0])
64
+
65
+ self.apply(self._init_weights)
66
+
67
+ def _init_weights(self, m):
68
+ if isinstance(m, (nn.Conv2d, nn.Linear)):
69
+ trunc_normal_(m.weight, std=.02)
70
+ if m.bias is not None:
71
+ nn.init.constant_(m.bias, 0)
72
+
73
+ def upsample_field(self, field, mask, factor):
74
+ ''' Upsample field [H, W, D] -> [factor*H, factor*W, D] using convex combination of 3x3 neighbors '''
75
+ B, H, W, D = field.shape
76
+ field = field.permute(0, 3, 1, 2)
77
+ mask = mask.view(B, 1, 9, factor, factor, H, W)
78
+ mask = torch.softmax(mask, dim=2).to(mask.dtype)
79
+ up_flow = F.unfold(field*factor, [3,3], padding=1)
80
+ up_flow = up_flow.view(B, D, 9, 1, 1, H, W)
81
+
82
+ up_flow = torch.sum(mask * up_flow, dim=2).to(mask.dtype) # [B, D, factor, factor, H, W]
83
+ up_flow = up_flow.permute(0, 4, 2, 5, 3, 1)
84
+ return up_flow.reshape(B, factor*H, factor*W, D).contiguous()
85
+
86
+ def forward(self, img0, img1, stereo=True, init_flow=None):
87
+ ''' Estimate optical flow/disparity between a pair of frames; outputs bi-directional flow/disparity '''
88
+ field_all = []
89
+
90
+ img0 = (2 * (img0 / 255.0) - 1.0).contiguous()
91
+ img1 = (2 * (img1 / 255.0) - 1.0).contiguous()
92
+
93
+ x = torch.cat((img0, img1), dim=0) # cat in batch dim
94
+
95
+ features = self.encoder(x) # [B*2, H, W, C]
96
+ features = features[::-1] # reverse 1/32, 1/16, 1/8, 1/4
97
+
98
+ for i in range(len(features)): # 1/32, 1/16, 1/8, 1/4
99
+ if i==0:
100
+ if init_flow is None:
101
+ init_flow, init_cv = self.init_correlation_volume(features[i], stereo=stereo)
102
+ else:
103
+ init_cv = None
104
+
105
+ field = init_flow.clone() # [B, H, W, 2]
106
+ self_rpos = torch.zeros_like(field)
107
+ else:
108
+ features[i] = self.up_decoders[i](features[i-1], features[i])
109
+ up_mask = self.up_masks[i](features[i-1].permute(0, 3, 1, 2)) # [B, C, H, W]
110
+ self_rpos = self.upsample_field(self_rpos, up_mask, self.factor)
111
+ field = self.upsample_field(field, up_mask, self.factor)
112
+ field_all.append({'self':field})
113
+
114
+ features[i], self_rpos, field, fields = self.match_attentions[i](features[i], self_rpos, field, stereo=stereo)
115
+ field_all.extend(fields)
116
+
117
+ if self.training:
118
+ B = field.shape[0]
119
+ field_up = self.upsample_field(field[:B//2], self.up_masks[-1](features[-1][:B//2].permute(0, 3, 1, 2)), self.factor_last)
120
+ field_up = torch.cat((field_up, field_up), dim=0) # dummy output
121
+ else:
122
+ field_up = self.upsample_field(field, self.up_masks[-1](features[-1].permute(0, 3, 1, 2)), self.factor_last)
123
+
124
+ return {
125
+ 'init_flow': init_flow,
126
+ 'init_cv': init_cv,
127
+ 'field_all': field_all,
128
+ 'field_up': field_up,
129
+ 'self_rpos': self_rpos,
130
+ }
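A heavily hedged inference sketch for MatchStereo. The exact fields required on `args` (and the valid encoder `variant` names) are defined in `models/convformer.py` and the attention blocks, so the `SimpleNamespace` below is only a placeholder:

```python
import torch
from types import SimpleNamespace

args = SimpleNamespace(variant='tiny')             # assumed field name and value
model = MatchStereo(args).cuda().eval()

# Inputs are 0-255 images; H and W should be divisible by 32 (the encoder's coarsest stride).
left = torch.rand(1, 3, 384, 768, device='cuda') * 255
right = torch.rand(1, 3, 384, 768, device='cuda') * 255

with torch.no_grad():
    pred = model(left, right, stereo=True)

field_up = pred['field_up']              # [2*B, H, W, 2] bi-directional field at full resolution
disp_left = field_up[:1, ..., 0].abs()   # x-component of the left view's field (sign convention assumed)
```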
models/setup.py ADDED
@@ -0,0 +1,20 @@
1
+ import torch
2
+ from setuptools import setup
3
+ from torch.utils.cpp_extension import BuildExtension, CUDAExtension, CppExtension
4
+
5
+ setup(
6
+ name='match_attention',
7
+ version='0.7',
8
+ description='Match Attention CUDA Extension for PyTorch',
9
+ author='TingmanYan',
10
+ ext_modules=[
11
+ CUDAExtension('match_attention', [
12
+ 'src/match_former_cuda.cpp',
13
+ 'src/match_former_cuda_kernel.cu',
14
+ 'src/match_former_fused_forward.cu',
15
+ ]),
16
+ ],
17
+ cmdclass={
18
+ 'build_ext': BuildExtension
19
+ }
20
+ )
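The extension would typically be built in place before the Python wrappers above can import it, e.g. `cd models && python setup.py build_ext --inplace` (or `python setup.py install`); this assumes a local CUDA toolkit compatible with the installed PyTorch build.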
models/src/match_former_cuda.cpp ADDED
@@ -0,0 +1,49 @@
1
+ #include <torch/extension.h>
2
+ #include <vector>
3
+ #include <pybind11/pybind11.h>
4
+ #include <pybind11/stl.h>
5
+ #include <string>
6
+ #include <ATen/core/op_registration/op_registration.h>
7
+
8
+ // CUDA declarations
9
+
10
+ void mf_fused_forward_cuda(
11
+ at::Tensor max_offset,
12
+ at::Tensor q,
13
+ at::Tensor k,
14
+ at::Tensor v,
15
+ at::Tensor output,
16
+ at::Tensor attn_out,
17
+ const int H,
18
+ const int W,
19
+ const std::vector<int64_t>& win_r,
20
+ const int attn_num,
21
+ const std::string& attn_type,
22
+ const float scale);
23
+
24
+ void mf_fused_forward(
25
+ at::Tensor max_offset,
26
+ at::Tensor q,
27
+ at::Tensor k,
28
+ at::Tensor v,
29
+ at::Tensor output,
30
+ at::Tensor attn_out,
31
+ const int64_t H,
32
+ const int64_t W,
33
+ const std::vector<int64_t>& win_r,
34
+ const int64_t attn_num,
35
+ const std::string& attn_type,
36
+ const double scale)
37
+ {
38
+ mf_fused_forward_cuda(max_offset, q, k, v, output, attn_out, H, W, win_r, attn_num, attn_type, static_cast<float>(scale));
39
+ }
40
+
41
+ PYBIND11_MODULE(TORCH_EXTENSION_NAME, m)
42
+ {
43
+ m.def("fused_forward", &mf_fused_forward, "Fused forward pass (CUDA)");
44
+ }
45
+
46
+ TORCH_LIBRARY(match_attention, m)
47
+ {
48
+ m.def("fused_forward(Tensor max_offset, Tensor q, Tensor k, Tensor v, Tensor(a!) output, Tensor(b!) attn_out, int H, int W, int[] win_r, int attn_num, str attn_type, float scale) -> ()", &mf_fused_forward);
49
+ }
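Note that the op is deliberately exposed twice: the `PYBIND11_MODULE` block serves the direct `import match_attention` path used in `models/match_former_ops.py`, while the `TORCH_LIBRARY` registration makes the same kernel reachable as `torch.ops.match_attention.fused_forward`, the fallback path when the direct import fails.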
models/src/match_former_cuda_kernel.cu ADDED
@@ -0,0 +1,26 @@
1
+ #include <torch/extension.h>
2
+
3
+ #include <cuda.h>
4
+ #include <cuda_runtime.h>
5
+
6
+ #include <vector>
7
+
8
+ #include "match_former_fused_forward.hpp"
9
+
10
+ // Fused forward function that combines all operations
11
+ void mf_fused_forward_cuda(
12
+ at::Tensor max_offset,
13
+ at::Tensor q,
14
+ at::Tensor k,
15
+ at::Tensor v,
16
+ at::Tensor output,
17
+ at::Tensor attn_out,
18
+ const int H,
19
+ const int W,
20
+ const std::vector<int64_t>& win_r,
21
+ const int attn_num,
22
+ const std::string& attn_type,
23
+ const float scale)
24
+ {
25
+ match_former_fused_forward(max_offset, q, k, v, output, attn_out, H, W, win_r, attn_num, attn_type, scale);
26
+ }
models/src/match_former_fused_forward.cu ADDED
@@ -0,0 +1,628 @@
1
+ #include <torch/extension.h>
2
+ #include <cuda.h>
3
+ #include <cuda_runtime.h>
4
+ #include <vector>
5
+ #include <cassert>
6
+ #include <cfloat>
7
+ #include <cuda_fp16.h>
8
+ #include <cuda_bf16.h>
9
+ #include <ATen/native/cuda/KernelUtils.cuh>
10
+
11
+ // Forward declarations of kernel functions
12
+ template <typename scalar_t>
13
+ __global__ void clip_offset_to_id_k(const scalar_t *const m_offset_d, int *const m_id_d, const int Lh, const int num_heads, const int N, const int H, const int W);
14
+
15
+ template <typename scalar_t>
16
+ __global__ void attn_weight_bilinear_forward_k(const scalar_t* const m_offset_d, scalar_t* const bilinear_weight_d, const int Lh);
17
+
18
+ __global__ void check_max_id_k(int *const m_id_d, const int L, const int N, const int H, const int W, const int num_heads, const int win_x, const int win_y);
19
+
20
+ template <typename scalar_t>
21
+ __global__ void match_attention_l1_norm_forward_k(
22
+ const scalar_t *__restrict__ q_d,
23
+ const scalar_t *__restrict__ k_d,
24
+ scalar_t *__restrict__ attn_d,
25
+ const int *__restrict__ m_id_d,
26
+ const int *__restrict__ offset_d,
27
+ const int L, const int N, const int H, const int W,
28
+ const int C, const int num_heads, const int key_dim,
29
+ const int attn_num, const int attn_numel,
30
+ const bool swap_xy);
31
+
32
+ template <typename scalar_t>
33
+ __global__ void match_attention_dot_product_forward_k(const scalar_t *const q_d, const scalar_t *const k_d, scalar_t *const attn_d, const int *const m_id_d, const int* const offset_d, const int L, const int N, const int H, const int W, const int C, const int num_heads, const int key_dim, const int attn_num, const int attn_numel, const bool swap_xy);
34
+
35
+ template <typename scalar_t>
36
+ __global__ void bilinear_softmax_forward_general_k(scalar_t* const __restrict__ attn_d,
37
+ scalar_t* const __restrict__ attn_out_d,
38
+ scalar_t* const __restrict__ attn_sum_d,
39
+ const scalar_t* const __restrict__ bilinear_weight_d,
40
+ const int* const __restrict__ select_index_d,
41
+ int L, const int num_heads, const int h_attn_num,
42
+ const int attn_num, const int attn_num_sub);
43
+
44
+ template <typename scalar_t>
45
+ __global__ void attention_aggregate_forward_k(
46
+ const scalar_t *__restrict__ v_d,
47
+ scalar_t *__restrict__ out_d,
48
+ const scalar_t *__restrict__ attn_d,
49
+ const int *__restrict__ m_id_d,
50
+ const int* __restrict__ offset_d,
51
+ const int L, const int C, const int num_heads,
52
+ const int key_dim, const int attn_num,
53
+ const bool swap_xy);
54
+
55
+ template <typename scalar_t>
56
+ __global__ void scale_attention_k(scalar_t* attn_d, const scalar_t scale, const int total_size);
57
+
58
+ // Kernel implementations
59
+ template <typename scalar_t>
60
+ __global__ void
61
+ clip_offset_to_id_k(const scalar_t *const m_offset_d, int *const m_id_d, const int Lh, const int num_heads, const int N, const int H, const int W)
62
+ {
63
+ int lh = blockIdx.x * blockDim.x + threadIdx.x;
64
+ if (lh >= Lh)
65
+ return;
66
+
67
+ int l = lh / num_heads;
68
+ int batch_id = l / N;
69
+ int m_x = __float2int_rd(static_cast<float>(m_offset_d[lh*2])); // round to floor
70
+ int m_y = __float2int_rd(static_cast<float>(m_offset_d[lh*2 + 1]));
71
+ if (m_x < 0) m_x = 0;
72
+ if (m_x >= W) m_x = W - 1;
73
+ if (m_y < 0) m_y = 0;
74
+ if (m_y >= H) m_y = H - 1;
75
+ int m_pix_id = m_y * W + m_x;
76
+ int m_id = batch_id * N + m_pix_id;
77
+ m_id_d[lh] = m_id;
78
+ }
79
+
80
+ template <typename scalar_t>
81
+ __global__ void
82
+ attn_weight_bilinear_forward_k(const scalar_t* const m_offset_d, scalar_t* const bilinear_weight_d, const int Lh)
83
+ {
84
+ int lh = blockIdx.x * blockDim.x + threadIdx.x;
85
+ if (lh >= Lh)
86
+ return;
87
+
88
+ float ix = static_cast<float>(m_offset_d[lh*2]);
89
+ float iy = static_cast<float>(m_offset_d[lh*2 + 1]);
90
+ int ix_nw = __float2int_rd(ix);
91
+ int iy_nw = __float2int_rd(iy);
92
+ int ix_ne = ix_nw + 1;
93
+ int iy_ne = iy_nw;
94
+ int ix_sw = ix_nw;
95
+ int iy_sw = iy_nw + 1;
96
+ int ix_se = ix_nw + 1;
97
+ int iy_se = iy_nw + 1;
98
+
99
+ float nw = (ix_se - ix) * (iy_se - iy);
100
+ float ne = (ix - ix_sw) * (iy_sw - iy);
101
+ float sw = (ix_ne - ix) * (iy - iy_ne);
102
+ float se = (ix - ix_nw) * (iy - iy_nw);
103
+ bilinear_weight_d[lh*4] = static_cast<scalar_t>(nw);
104
+ bilinear_weight_d[lh*4 + 1] = static_cast<scalar_t>(ne);
105
+ bilinear_weight_d[lh*4 + 2] = static_cast<scalar_t>(sw);
106
+ bilinear_weight_d[lh*4 + 3] = static_cast<scalar_t>(se); // bilinear_weight of shape [B, N, h, 4]
107
+ }
108
+
109
+ // check if the search window range is out of image coordinates
110
+ __forceinline__ __device__ void
111
+ check_within_image_coordinates(int& l_id, const int& N, const int& H, const int& W, const int& win_x, const int& win_y)
112
+ {
113
+ int pix_id = l_id % N;
114
+ int batch_id = l_id / N;
115
+ int x = pix_id % W;
116
+ int y = pix_id / W;
117
+ if (x - win_x < 0)
118
+ x = win_x;
119
+ if (x + (win_x + 1) >= W)
120
+ x = W - 1 - (win_x + 1);
121
+ if (y - win_y < 0)
122
+ y = win_y;
123
+ if (y + (win_y + 1) >= H)
124
+ y = H - 1 - (win_y + 1);
125
+ pix_id = y * W + x;
126
+ l_id = batch_id * N + pix_id;
127
+ }
128
+
129
+ __global__ void
130
+ check_max_id_k(int *const m_id_d, const int L, const int N, const int H, const int W, const int num_heads, const int win_x, const int win_y)
131
+ {
132
+ int l, h;
133
+ l = blockIdx.x * blockDim.x + threadIdx.x;
134
+ h = blockIdx.y * blockDim.y + threadIdx.y;
135
+ if (l >= L || h >= num_heads)
136
+ return;
137
+
138
+ int m_id = m_id_d[l * num_heads + h];
139
+ check_within_image_coordinates(m_id, N, H, W, win_x, win_y);
140
+ m_id_d[l * num_heads + h] = m_id;
141
+ }
142
+
143
+ template <typename scalar_t>
144
+ __global__ void match_attention_l1_norm_forward_k(
145
+ const scalar_t *__restrict__ q_d,
146
+ const scalar_t *__restrict__ k_d,
147
+ scalar_t *__restrict__ attn_d,
148
+ const int *__restrict__ m_id_d,
149
+ const int *__restrict__ offset_d,
150
+ const int L, const int N, const int H, const int W,
151
+ const int C, const int num_heads, const int key_dim,
152
+ const int attn_num, const int attn_numel,
153
+ const bool swap_xy)
154
+ {
155
+ int l, k;
156
+ if (swap_xy)
157
+ {
158
+ l = blockIdx.x * blockDim.x + threadIdx.x;
159
+ k = blockIdx.y * blockDim.y + threadIdx.y;
160
+ }
161
+ else
162
+ {
163
+ k = blockIdx.x * blockDim.x + threadIdx.x;
164
+ l = blockIdx.y * blockDim.y + threadIdx.y;
165
+ }
166
+ if (l >= L || k >= num_heads*attn_num)
167
+ return;
168
+
169
+ constexpr int vec_size = sizeof(float4) / sizeof(scalar_t);
170
+ const int h = k / attn_num;
171
+ const int attn_id = k % attn_num;
172
+ const int base_id = l*num_heads + h;
173
+ const int base_attn_id = base_id*attn_num;
174
+ const int key_id = m_id_d[base_id] + offset_d[attn_id];
175
+
176
+ const int q_base = l * C;
177
+ const int k_base = key_id * C;
178
+ const int c_start = h * key_dim / vec_size;
179
+ const int c_end = c_start + key_dim / vec_size;
180
+
181
+ const float4* q_val_vec = reinterpret_cast<const float4*>(q_d + q_base);
182
+ const float4* k_val_vec = reinterpret_cast<const float4*>(k_d + k_base);
183
+
184
+ float diff_sum = 0.0f;
185
+
186
+ for (int c = c_start; c < c_end; ++c) {
187
+ float4 q_val_f4 = __ldg(&q_val_vec[c]);
188
+ float4 k_val_f4 = __ldg(&k_val_vec[c]);
189
+
190
+ if (vec_size == 4) { // float32
191
+ diff_sum += fabsf(q_val_f4.x - k_val_f4.x) +
192
+ fabsf(q_val_f4.y - k_val_f4.y) +
193
+ fabsf(q_val_f4.z - k_val_f4.z) +
194
+ fabsf(q_val_f4.w - k_val_f4.w);
195
+ } else { // bf16/fp16 (8 elements)
196
+ if (std::is_same<scalar_t, at::Half>::value) {
197
+ const half2* q_val_h2 = reinterpret_cast<const half2*>(&q_val_f4);
198
+ const half2* k_val_h2 = reinterpret_cast<const half2*>(&k_val_f4);
199
+ #pragma unroll
200
+ for (int i = 0; i < 4; ++i) {
201
+ half2 q_h2 = q_val_h2[i];
202
+ half2 k_h2 = k_val_h2[i];
203
+ half2 diff_h2 = __habs2(__hsub2(q_h2, k_h2));
204
+ diff_sum += __half2float(diff_h2.x) + __half2float(diff_h2.y);
205
+ }
206
+ } else { // bf16
207
+ const __nv_bfloat162* q_val_bf2 = reinterpret_cast<const __nv_bfloat162*>(&q_val_f4);
208
+ const __nv_bfloat162* k_val_bf2 = reinterpret_cast<const __nv_bfloat162*>(&k_val_f4);
209
+ #pragma unroll
210
+ for (int i = 0; i < 4; ++i) {
211
+ __nv_bfloat162 q_bf2 = q_val_bf2[i];
212
+ __nv_bfloat162 k_bf2 = k_val_bf2[i];
213
+ __nv_bfloat162 diff_bf2 = __habs2(__hsub2(q_bf2, k_bf2));
214
+ diff_sum += __bfloat162float(diff_bf2.x) + __bfloat162float(diff_bf2.y);
215
+ }
216
+ }
217
+ }
218
+ }
219
+ attn_d[base_attn_id + attn_id] = static_cast<scalar_t>(-diff_sum);
220
+ }
221
+
222
+ template <typename scalar_t>
223
+ __global__ void
224
+ match_attention_dot_product_forward_k(const scalar_t *const q_d, const scalar_t *const k_d, scalar_t *const attn_d, const int *const m_id_d, const int* const offset_d, const int L, const int N, const int H, const int W, const int C, const int num_heads, const int key_dim, const int attn_num, const int attn_numel, const bool swap_xy)
225
+ {
226
+ int l, k;
227
+ if (swap_xy)
228
+ {
229
+ l = blockIdx.x * blockDim.x + threadIdx.x;
230
+ k = blockIdx.y * blockDim.y + threadIdx.y;
231
+ }
232
+ else
233
+ {
234
+ k = blockIdx.x * blockDim.x + threadIdx.x;
235
+ l = blockIdx.y * blockDim.y + threadIdx.y;
236
+ }
237
+ if (l >= L || k >= num_heads*attn_num)
238
+ return;
239
+
240
+ int h = k / attn_num;
241
+ int attn_id = k % attn_num;
242
+ int base_id = l*num_heads + h;
243
+ int base_attn_id = base_id*attn_num;
244
+ int key_id = m_id_d[base_id] + offset_d[attn_id];
245
+ scalar_t diff_sum = 0;
246
+ for (int c = h * key_dim; c < (h + 1) * key_dim; ++c)
247
+ {
248
+ diff_sum += q_d[l * C + c] * k_d[key_id * C + c];
249
+ }
250
+ attn_d[base_attn_id + attn_id] = diff_sum;
251
+ }
252
+
253
+ template <typename scalar_t>
254
+ __global__ void scale_attention_k(scalar_t* attn_d, const scalar_t scale, const int total_size)
255
+ {
256
+ int idx = blockIdx.x * blockDim.x + threadIdx.x;
257
+ if (idx >= total_size)
258
+ return;
259
+ attn_d[idx] = attn_d[idx] * scale;
260
+ }
261
+
262
+ template <typename T> struct VecType { using Type = T; };
263
+ template <> struct VecType<float> { using Type = float4; };
264
+ template <> struct VecType<__half> { using Type = float2; };
265
+ template <> struct VecType<__nv_bfloat16> { using Type = float2; };
266
+
267
+ template <typename scalar_t>
268
+ __device__ __inline__ typename VecType<scalar_t>::Type load_vec(const scalar_t* addr) {
269
+ return *reinterpret_cast<const typename VecType<scalar_t>::Type*>(addr);
270
+ }
271
+
272
+ template <typename scalar_t>
273
+ __device__ __inline__ void store_vec(scalar_t* addr, typename VecType<scalar_t>::Type val) {
274
+ *reinterpret_cast<typename VecType<scalar_t>::Type*>(addr) = val;
275
+ }
276
+
277
+ template <int WIN_SIZE, typename scalar_t>
278
+ __device__ __forceinline__ void load_window(scalar_t* window, const scalar_t* src) {
279
+ constexpr int VEC_ELEMS = sizeof(typename VecType<scalar_t>::Type) / sizeof(scalar_t);
280
+ constexpr int VEC_COUNT = WIN_SIZE / VEC_ELEMS;
281
+ using vec_t = typename VecType<scalar_t>::Type;
282
+
283
+ #pragma unroll 4
284
+ for (int i = 0; i < VEC_COUNT; ++i) {
285
+ vec_t vec = load_vec<scalar_t>(src + i * VEC_ELEMS);
286
+ store_vec<scalar_t>(window + i * VEC_ELEMS, vec);
287
+ }
288
+ }
289
+
290
+ template <int WIN_SIZE, typename scalar_t>
291
+ __device__ __forceinline__ void store_window(scalar_t* dst, const scalar_t* window) {
292
+ constexpr int VEC_ELEMS = sizeof(typename VecType<scalar_t>::Type) / sizeof(scalar_t);
293
+ constexpr int VEC_COUNT = WIN_SIZE / VEC_ELEMS;
294
+ using vec_t = typename VecType<scalar_t>::Type;
295
+
296
+ #pragma unroll 4
297
+ for (int i = 0; i < VEC_COUNT; ++i) {
298
+ vec_t vec = load_vec<scalar_t>(window + i * VEC_ELEMS);
299
+ store_vec<scalar_t>(dst + i * VEC_ELEMS, vec);
300
+ }
301
+ }
302
+
303
+ template <int WIN_SIZE, int SUB_WIN_SIZE, typename scalar_t>
304
+ __global__ void
305
+ bilinear_softmax_forward_k(scalar_t* const __restrict__ attn_d,
306
+ scalar_t* const __restrict__ attn_out_d,
307
+ scalar_t* const __restrict__ attn_sum_d,
308
+ const scalar_t* const __restrict__ bilinear_weight_d,
309
+ const int* const __restrict__ select_index_d,
310
+ int L, const int num_heads, const int h_attn_num,
311
+ const int attn_num)
312
+ {
313
+ constexpr int VEC_ELEMS = sizeof(typename VecType<scalar_t>::Type) / sizeof(scalar_t);
314
+ static_assert(WIN_SIZE % VEC_ELEMS == 0, "WIN_SIZE must be divisible by vector elements");
315
+ using acc_t = float;
316
+
317
+ int l = blockIdx.x * blockDim.x + threadIdx.x;
318
+ int h = blockIdx.y * blockDim.y + threadIdx.y;
319
+ if (l >= L || h >= num_heads)
320
+ return;
321
+
322
+ const int base_attn_id = l * h_attn_num + h * attn_num;
323
+ const int base_sum_idx = l * (num_heads * 4) + h * 4;
324
+
325
+ scalar_t window[WIN_SIZE];
326
+ load_window<WIN_SIZE>(window, attn_d + base_attn_id);
327
+
328
+ acc_t attn_max = -FLT_MAX;
329
+ #pragma unroll 4
330
+ for (int k = 0; k < WIN_SIZE; ++k) {
331
+ if (static_cast<acc_t>(window[k]) > attn_max) {
332
+ attn_max = static_cast<acc_t>(window[k]);
333
+ }
334
+ }
335
+
336
+ #pragma unroll 4
337
+ for (int k = 0; k < WIN_SIZE; ++k) {
338
+ window[k] = static_cast<scalar_t>(expf(static_cast<acc_t>(window[k]) - attn_max));
339
+ }
340
+
341
+ scalar_t window_out[WIN_SIZE] = {0};
342
+
343
+ for (int b = 0; b < 4; ++b) {
344
+ acc_t block_sum = 0.0f;
345
+ const int* block_idx = select_index_d + b * SUB_WIN_SIZE;
346
+
347
+ #pragma unroll 4
348
+ for (int k = 0; k < SUB_WIN_SIZE; ++k) {
349
+ block_sum += static_cast<acc_t>(window[block_idx[k]]);
350
+ }
351
+ block_sum = fmaxf(block_sum, FLT_EPSILON);
352
+ attn_sum_d[base_sum_idx + b] = static_cast<scalar_t>(block_sum);
353
+
354
+ const scalar_t weight = bilinear_weight_d[base_sum_idx + b];
355
+ const scalar_t scale = static_cast<scalar_t>(static_cast<acc_t>(weight) / block_sum);
356
+
357
+ #pragma unroll 4
358
+ for (int k = 0; k < SUB_WIN_SIZE; ++k) {
359
+ const int idx = block_idx[k];
360
+ window_out[idx] = window_out[idx] + window[idx] * scale;
361
+ }
362
+ }
363
+
364
+ // write back to global memory
365
+ store_window<WIN_SIZE>(attn_out_d + base_attn_id, window_out);
366
+ }
367
+
368
+ template <typename scalar_t>
369
+ __global__ void
370
+ bilinear_softmax_forward_general_k(scalar_t* const __restrict__ attn_d,
371
+ scalar_t* const __restrict__ attn_out_d,
372
+ scalar_t* const __restrict__ attn_sum_d,
373
+ const scalar_t* const __restrict__ bilinear_weight_d,
374
+ const int* const __restrict__ select_index_d,
375
+ int L, const int num_heads, const int h_attn_num,
376
+ const int attn_num, const int attn_num_sub)
377
+ {
378
+ int l, h;
379
+ l = blockIdx.x * blockDim.x + threadIdx.x;
380
+ h = blockIdx.y * blockDim.y + threadIdx.y;
381
+ if (l >= L || h >= num_heads)
382
+ return;
383
+
384
+ scalar_t attn_max = -FLT_MAX;
385
+ int base_attn_id = l * h_attn_num + h * attn_num;
386
+ for (int k = 0; k < attn_num; ++k)
387
+ {
388
+ scalar_t attn_val = attn_d[base_attn_id + k];
389
+ if (attn_val > attn_max) {
390
+ attn_max = attn_val;
391
+ }
392
+ }
393
+ __syncthreads();
394
+
395
+ for (int k = 0; k < attn_num; ++k)
396
+ {
397
+ attn_d[base_attn_id + k] = expf(attn_d[base_attn_id + k] - attn_max);
398
+ }
399
+ __syncthreads();
400
+
401
+ for (int b = 0; b < 4; ++b)
402
+ {
403
+ scalar_t attn_sum = 0;
404
+ for (int k = 0; k < attn_num_sub; ++k)
405
+ {
406
+ attn_sum += attn_d[base_attn_id + select_index_d[b*attn_num_sub + k]];
407
+ }
408
+ attn_sum = fmaxf(attn_sum, FLT_EPSILON);
409
+ attn_sum_d[l*(num_heads*4) + h*4 + b] = attn_sum; // save for backward
410
+
411
+ scalar_t weight = bilinear_weight_d[l*num_heads*4 + h*4 + b];
412
+ for (int k = 0; k < attn_num_sub; ++k)
413
+ {
414
+ int select_index = select_index_d[b*attn_num_sub + k];
415
+ attn_out_d[base_attn_id + select_index] +=
416
+ attn_d[base_attn_id + select_index] / attn_sum * weight; // no write conflict
417
+ }
418
+ }
419
+ }
420
+
421
+ template <typename scalar_t>
422
+ __global__ void attention_aggregate_forward_k(
423
+ const scalar_t *__restrict__ v_d,
424
+ scalar_t *__restrict__ out_d,
425
+ const scalar_t *__restrict__ attn_d,
426
+ const int *__restrict__ m_id_d,
427
+ const int* __restrict__ offset_d,
428
+ const int L, const int C, const int num_heads,
429
+ const int key_dim, const int attn_num,
430
+ const bool swap_xy)
431
+ {
432
+ int c, l;
433
+ if (swap_xy)
434
+ {
435
+ l = blockIdx.x * blockDim.x + threadIdx.x;
436
+ c = blockIdx.y * blockDim.y + threadIdx.y;
437
+ }
438
+ else
439
+ {
440
+ c = blockIdx.x * blockDim.x + threadIdx.x;
441
+ l = blockIdx.y * blockDim.y + threadIdx.y;
442
+ }
443
+ if (l >= L || c >= C)
444
+ return;
445
+
446
+ const int h = c / key_dim;
447
+ const int base_id = l*num_heads + h;
448
+ const int base_attn_id = base_id*attn_num;
449
+ const int m_id = m_id_d[base_id];
450
+ float out_sum = 0;
451
+ for (int k = 0; k < attn_num; ++k)
452
+ {
453
+ int key_id = m_id + offset_d[k];
454
+ out_sum += static_cast<float>(attn_d[base_attn_id + k]) *
455
+ static_cast<float>(v_d[key_id * C + c]);
456
+ }
457
+ out_d[l * C + c] = static_cast<scalar_t>(out_sum);
458
+ }
459
+
460
+ // Main fused forward function
461
+ void match_former_fused_forward(
462
+ at::Tensor max_offset,
463
+ at::Tensor q,
464
+ at::Tensor k,
465
+ at::Tensor v,
466
+ at::Tensor output,
467
+ at::Tensor attn_out,
468
+ const int H,
469
+ const int W,
470
+ const std::vector<int64_t>& win_r,
471
+ const int attn_num,
472
+ const std::string& attn_type,
473
+ const float scale)
474
+ {
475
+ const int B = q.size(0);
476
+ const int N = q.size(1);
477
+ const int C = q.size(2);
478
+ const int h = max_offset.size(2);
479
+ const int key_dim = C / h;
480
+ const int L = B * N;
481
+ const int Lh = L * h;
482
+ const int attn_numel = L * h * attn_num;
483
+ const int win_x = win_r[0];
484
+ const int win_y = win_r[1];
485
+ assert(attn_num == (2*win_r[0]+2)*(2*win_r[1]+2));
486
+ const bool swap_xy_match = (h * attn_num < 32);
487
+ const bool swap_xy_agg = (C < 32);
488
+ const int attn_num_sub = (2*win_r[0] + 1)*(2*win_r[1] + 1);
489
+ const int h_attn_num = h * attn_num;
490
+
491
+ // Create temporary tensors
492
+ auto m_id = at::zeros({B, N, h}, at::TensorOptions().dtype(at::kInt).device(max_offset.device()));
493
+ auto bilinear_weight = at::zeros({B, N, h, 4}, max_offset.options());
494
+ auto attn = at::zeros({B, N, h, attn_num}, q.options());
495
+ auto attn_sum = at::zeros({B, N, h, 4}, q.options());
496
+
497
+ // Create offset array for window
498
+ int *offset_d;
499
+ cudaMalloc(&offset_d, sizeof(int) * attn_num);
500
+ int *offset_h = new int[attn_num];
501
+ int num = 0;
502
+ for (int y = -win_y; y <= (win_y + 1); ++y)
503
+ for (int x = -win_x; x <= (win_x + 1); ++x)
504
+ {
505
+ offset_h[num++] = y * W + x;
506
+ }
507
+ cudaMemcpy(offset_d, offset_h, sizeof(int) * attn_num, cudaMemcpyHostToDevice);
508
+ delete[] offset_h;
509
+
510
+ // Create select_index array for bilinear softmax
511
+ int *select_index_d;
512
+ cudaMalloc(&select_index_d, sizeof(int)*4*attn_num_sub);
513
+ int *select_index_h = new int[4*attn_num_sub];
514
+ int win_W = 2*(win_r[0]+1);
515
+ int delta_x[4] = {0, 1, 0, 1};
516
+ int delta_y[4] = {0, 0, 1, 1};
517
+ num = 0;
518
+ for (int b = 0; b < 4; ++b) {
519
+ int d_x = delta_x[b];
520
+ int d_y = delta_y[b];
521
+ for (int y = d_y; y <= 2*win_r[1] + d_y; ++y)
522
+ for (int x = d_x; x <= 2*win_r[0] + d_x; ++x)
523
+ {
524
+ select_index_h[num++] = y * win_W + x;
525
+ }
526
+ }
527
+ cudaMemcpy(select_index_d, select_index_h, sizeof(int)*attn_num_sub*4, cudaMemcpyHostToDevice);
528
+ delete[] select_index_h;
529
+
530
+ // Step 1: Clip offset to id
531
+ {
532
+ int grid = (Lh + 512 - 1) / 512;
533
+ AT_DISPATCH_FLOATING_TYPES_AND2(at::ScalarType::Half, at::ScalarType::BFloat16, max_offset.scalar_type(), "clip_offset_to_id_k", ([&] {
534
+ clip_offset_to_id_k<scalar_t><<<grid, 512>>>(max_offset.data_ptr<scalar_t>(), m_id.data_ptr<int>(), Lh, h, N, H, W);
535
+ }));
536
+ }
537
+
538
+ // Step 2: Compute bilinear weights
539
+ {
540
+ int grid = (Lh + 512 - 1) / 512;
541
+ AT_DISPATCH_FLOATING_TYPES_AND2(at::ScalarType::Half, at::ScalarType::BFloat16, max_offset.scalar_type(), "attn_weight_bilinear_forward_k", ([&] {
542
+ attn_weight_bilinear_forward_k<scalar_t><<<grid, 512>>>(max_offset.data_ptr<scalar_t>(), bilinear_weight.data_ptr<scalar_t>(), Lh);
543
+ }));
544
+ }
545
+
546
+ // Step 3: Check max id bounds
547
+ {
548
+ dim3 m_blocks(8, 128);
549
+ dim3 grids((L + m_blocks.x - 1) / m_blocks.x, (h + m_blocks.y - 1) / m_blocks.y);
550
+ check_max_id_k<<<grids, m_blocks>>>(m_id.data_ptr<int>(), L, N, H, W, h, win_x, win_y);
551
+ }
552
+
553
+ // Step 4: Compute attention
554
+ {
555
+ dim3 m_blocks(8, 128);
556
+ dim3 grids((h*attn_num + m_blocks.x - 1) / m_blocks.x, (L + m_blocks.y - 1) / m_blocks.y);
557
+ if (swap_xy_match)
558
+ grids = dim3((L + m_blocks.x - 1) / m_blocks.x, (h*attn_num + m_blocks.y - 1) / m_blocks.y);
559
+
560
+ if (attn_type == "dot_product") {
561
+ AT_DISPATCH_FLOATING_TYPES_AND_HALF(q.scalar_type(), "match_attention_dot_product_forward_k", ([&] {
562
+ match_attention_dot_product_forward_k<scalar_t><<<grids, m_blocks>>>(q.data_ptr<scalar_t>(), k.data_ptr<scalar_t>(), attn.data_ptr<scalar_t>(), m_id.data_ptr<int>(), offset_d, L, N, H, W, C, h, key_dim, attn_num, attn_numel, swap_xy_match);
563
+ }));
564
+ } else if (attn_type == "l1_norm") {
565
+ AT_DISPATCH_FLOATING_TYPES_AND2(at::ScalarType::Half, at::ScalarType::BFloat16, q.scalar_type(), "match_attention_l1_norm_forward_k", ([&] {
566
+ match_attention_l1_norm_forward_k<scalar_t><<<grids, m_blocks>>>(q.data_ptr<scalar_t>(), k.data_ptr<scalar_t>(), attn.data_ptr<scalar_t>(), m_id.data_ptr<int>(), offset_d, L, N, H, W, C, h, key_dim, attn_num, attn_numel, swap_xy_match);
567
+ }));
568
+ }
569
+ }
570
+
571
+ // Step 5: Scale attention
572
+ {
573
+ int grid = (attn_numel + 512 - 1) / 512;
574
+ AT_DISPATCH_FLOATING_TYPES_AND2(at::ScalarType::Half, at::ScalarType::BFloat16, q.scalar_type(), "scale_attention_k", ([&] {
575
+ scale_attention_k<scalar_t><<<grid, 512>>>(attn.data_ptr<scalar_t>(), static_cast<scalar_t>(scale), attn_numel);
576
+ }));
577
+ }
578
+
579
+ // Step 6: Bilinear softmax
580
+ {
581
+ dim3 m_blocks = (attn_num == 16) ? dim3(128, 4) : dim3(32, 4);
582
+ dim3 grids((L + m_blocks.x - 1) / m_blocks.x, (h + m_blocks.y - 1) / m_blocks.y);
583
+
584
+ AT_DISPATCH_FLOATING_TYPES_AND2(at::ScalarType::Half, at::ScalarType::BFloat16, attn.scalar_type(), "bilinear_softmax_forward", [&] {
585
+ if (attn_num == 16 && attn_num_sub == 9) {
586
+ bilinear_softmax_forward_k<16, 9><<<grids, m_blocks>>>(
587
+ attn.data_ptr<scalar_t>(),
588
+ attn_out.data_ptr<scalar_t>(),
589
+ attn_sum.data_ptr<scalar_t>(),
590
+ bilinear_weight.data_ptr<scalar_t>(),
591
+ select_index_d, L, h, h_attn_num, attn_num
592
+ );
593
+ } else if (attn_num == 36 && attn_num_sub == 25) {
594
+ bilinear_softmax_forward_k<36, 25><<<grids, m_blocks>>>(
595
+ attn.data_ptr<scalar_t>(),
596
+ attn_out.data_ptr<scalar_t>(),
597
+ attn_sum.data_ptr<scalar_t>(),
598
+ bilinear_weight.data_ptr<scalar_t>(),
599
+ select_index_d, L, h, h_attn_num, attn_num
600
+ );
601
+ } else {
602
+ bilinear_softmax_forward_general_k<<<grids, m_blocks>>>(
603
+ attn.data_ptr<scalar_t>(),
604
+ attn_out.data_ptr<scalar_t>(),
605
+ attn_sum.data_ptr<scalar_t>(),
606
+ bilinear_weight.data_ptr<scalar_t>(),
607
+ select_index_d, L, h, h_attn_num, attn_num, attn_num_sub
608
+ );
609
+ }
610
+ });
611
+ }
612
+
613
+ // Step 7: Attention aggregation
614
+ {
615
+ dim3 m_blocks = (attn_num == 16) ? dim3(8, 128) : dim3(8, 32);
616
+ dim3 grids((C + m_blocks.x - 1) / m_blocks.x, (L + m_blocks.y - 1) / m_blocks.y);
617
+ if (swap_xy_agg)
618
+ grids = dim3((L + m_blocks.x - 1) / m_blocks.x, (C + m_blocks.y - 1) / m_blocks.y);
619
+
620
+ AT_DISPATCH_FLOATING_TYPES_AND2(at::ScalarType::Half, at::ScalarType::BFloat16, v.scalar_type(), "attention_aggregate_forward_k", ([&] {
621
+ attention_aggregate_forward_k<scalar_t><<<grids, m_blocks>>>(v.data_ptr<scalar_t>(), output.data_ptr<scalar_t>(), attn_out.data_ptr<scalar_t>(), m_id.data_ptr<int>(), offset_d, L, C, h, key_dim, attn_num, swap_xy_agg);
622
+ }));
623
+ }
624
+
625
+ // Cleanup
626
+ cudaFree(offset_d);
627
+ cudaFree(select_index_d);
628
+ }
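As the step comments above indicate, the fused forward runs as seven kernel launches: clip the continuous offsets to integer window centers, compute the four bilinear weights, shift centers so the (2r+2)-sized window stays inside the image, evaluate per-position attention logits (negative L1 or dot product), scale them, apply the bilinear softmax over the four overlapping sub-windows, and finally aggregate `v` with the resulting weights.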
models/src/match_former_fused_forward.hpp ADDED
@@ -0,0 +1,22 @@
1
+ #ifndef _MATCH_FORMER_FUSED_FORWARD_HPP_
2
+ #define _MATCH_FORMER_FUSED_FORWARD_HPP_
3
+
4
+ #include <vector>
5
+ #include <string>
6
+
7
+ // Fused forward function that combines all match former operations
8
+ void match_former_fused_forward(
9
+ at::Tensor max_offset,
10
+ at::Tensor q,
11
+ at::Tensor k,
12
+ at::Tensor v,
13
+ at::Tensor output,
14
+ at::Tensor attn_out,
15
+ const int H,
16
+ const int W,
17
+ const std::vector<int64_t>& win_r,
18
+ const int attn_num,
19
+ const std::string& attn_type,
20
+ const float scale);
21
+
22
+ #endif
requirements.txt ADDED
@@ -0,0 +1,14 @@
1
+ imageio==2.9.0
2
+ imageio-ffmpeg==0.4.9
3
+ matplotlib==3.8.4
4
+ opencv-python==4.9.0.80
5
+ pillow==10.2.0
6
+ scikit-image==0.20.0
7
+ scipy==1.9.1
8
+ tensorboard==2.17.0
9
+ setuptools==59.5.0
10
+ psutil==6.0.0
11
+ joblib==1.4.2
12
+ numpy==1.24.4
13
+ tqdm==4.66.2
14
+ timm==0.6.11
utils/file_io.py ADDED
@@ -0,0 +1,37 @@
1
+ from __future__ import absolute_import
2
+ from __future__ import division
3
+ from __future__ import print_function
4
+
5
+ import numpy as np
6
+ import sys
7
+
8
+ def write_pfm(file, image, scale=1):
9
+ file = open(file, 'wb')
10
+
11
+ color = None
12
+
13
+ if image.dtype.name != 'float32':
14
+ raise Exception('Image dtype must be float32.')
15
+
16
+ image = np.flipud(image)
17
+
18
+ if len(image.shape) == 3 and image.shape[2] == 3: # color image
19
+ color = True
20
+ elif len(image.shape) == 2 or len(
21
+ image.shape) == 3 and image.shape[2] == 1: # greyscale
22
+ color = False
23
+ else:
24
+ raise Exception(
25
+ 'Image must have H x W x 3, H x W x 1 or H x W dimensions.')
26
+
27
+ file.write(b'PF\n' if color else b'Pf\n')
28
+ file.write(b'%d %d\n' % (image.shape[1], image.shape[0]))
29
+
30
+ endian = image.dtype.byteorder
31
+
32
+ if endian == '<' or endian == '=' and sys.byteorder == 'little':
33
+ scale = -scale
34
+
35
+ file.write(b'%f\n' % scale)
36
+
37
+ image.tofile(file)
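A minimal usage sketch for `write_pfm` (the file name and array contents are illustrative):

```python
import numpy as np

disp = np.random.rand(480, 640).astype(np.float32)   # H x W float32 disparity map
write_pfm('disp.pfm', disp)                           # writes a greyscale 'Pf' PFM; the scale sign encodes endianness
```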
utils/utils.py ADDED
@@ -0,0 +1,58 @@
1
+ import torch
2
+ import torch.nn.functional as F
3
+ import numpy as np
4
+
5
+
6
+ class InputPadder:
7
+ """ Pads images such that dimensions are divisible by padding_factor """
8
+
9
+ def __init__(self, dims, mode='top_right', padding_factor=32):
10
+ self.ht, self.wd = dims[-2:]
11
+ pad_ht = (((self.ht // padding_factor) + 1) * padding_factor - self.ht) % padding_factor
12
+ pad_wd = (((self.wd // padding_factor) + 1) * padding_factor - self.wd) % padding_factor
13
+ if mode == 'sintel':
14
+ self._pad = [pad_wd // 2, pad_wd - pad_wd // 2, pad_ht // 2, pad_ht - pad_ht // 2]
15
+ elif mode == 'top_right':
16
+ self._pad = [0, pad_wd, pad_ht, 0]
17
+ elif mode == 'bottom_right':
18
+ self._pad = [0, pad_wd, 0, pad_ht]
19
+ else:
20
+ self._pad = [pad_wd // 2, pad_wd - pad_wd // 2, 0, pad_ht]
21
+
22
+ def pad(self, *inputs):
23
+ return [F.pad(x, self._pad, mode='replicate') for x in inputs]
24
+
25
+ def unpad(self, x):
26
+ ht, wd = x.shape[-2:]
27
+ c = [self._pad[2], ht - self._pad[3], self._pad[0], wd - self._pad[1]]
28
+ return x[..., c[0]:c[1], c[2]:c[3]]
29
+
30
+
31
+ def init_coords(ref):
32
+ B, H, W, C = ref.shape
33
+
34
+ coords = torch.meshgrid(torch.arange(H, device=ref.device, dtype=ref.dtype), torch.arange(W, device=ref.device, dtype=ref.dtype), indexing='ij')
35
+ coords = torch.stack(coords[::-1], dim=-1)
36
+ return coords[None].repeat(B, 1, 1, 1).to(ref.device) # [B, H, W, 2]
37
+
38
+
39
+ def bilinear_sample_by_offset(tgt, offset): # tgt [B, _, H, W], offset [B, H, W, 2]
40
+ _, _, H, W = tgt.shape
41
+
42
+ xgrid, ygrid = offset.split([1, 1], dim=-1)
43
+ xgrid = 2*xgrid/(W-1) - 1
44
+ ygrid = 2*ygrid/(H-1) - 1
45
+ grid = torch.cat([xgrid, ygrid], dim=-1)
46
+
47
+ tgt_to_ref = F.grid_sample(tgt, grid, mode='bilinear', align_corners=True)
48
+ return tgt_to_ref
49
+
50
+ def calc_noc_mask(field, A=2):
51
+ offset = field + init_coords(field) # [B, H, W, 2]
52
+ field_ref_, field_tgt_ = field.chunk(2, dim=0)
53
+ field_ref = torch.cat((field_ref_, field_tgt_), dim=0) # order
54
+ field_tgt = torch.cat((field_tgt_, field_ref_), dim=0) # reverse order
55
+ field_tgt_to_ref = bilinear_sample_by_offset(field_tgt.permute(0, 3, 1, 2).contiguous(), offset).permute(0, 2, 3, 1).contiguous()
56
+ field_diff = torch.abs(field_ref + field_tgt_to_ref).sum(dim=-1) # ref and tgt flows have opposite signs
57
+ noc_mask = (field_diff < A).to(field_diff.dtype)
58
+ return noc_mask
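A short sketch of the intended `InputPadder` round trip (input sizes are illustrative):

```python
import torch

left = torch.randn(1, 3, 375, 1242)    # e.g. a KITTI-sized pair
right = torch.randn(1, 3, 375, 1242)

padder = InputPadder(left.shape, mode='top_right', padding_factor=32)
left_pad, right_pad = padder.pad(left, right)           # both padded to 384 x 1248

# ... run the model on the padded pair; `pred_pad` stands in for its output ...
pred_pad = torch.randn(1, 1, left_pad.shape[-2], left_pad.shape[-1])
pred = padder.unpad(pred_pad)                           # cropped back to 375 x 1242
```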