aliensmn commited on
Commit
61029c7
·
verified ·
1 Parent(s): f387762

Mirror from https://github.com/Fannovel16/ComfyUI-Frame-Interpolation

Browse files
This view is limited to 50 files because it contains too many changes.   See raw diff
Files changed (50) hide show
  1. .gitattributes +15 -0
  2. .github/workflows/publish.yml +25 -0
  3. .gitignore +3 -0
  4. All_in_one_v1_3.png +3 -0
  5. LICENSE +21 -0
  6. README.md +194 -0
  7. __init__.py +42 -0
  8. config.yaml +3 -0
  9. demo_frames/anime0.png +3 -0
  10. demo_frames/anime1.png +3 -0
  11. demo_frames/bocchi0.jpg +3 -0
  12. demo_frames/bocchi1.jpg +3 -0
  13. demo_frames/real0.png +3 -0
  14. demo_frames/real1.png +3 -0
  15. demo_frames/rick/00003.png +3 -0
  16. demo_frames/rick/00004.png +3 -0
  17. demo_frames/rick/00005.png +3 -0
  18. demo_frames/violet0.png +3 -0
  19. demo_frames/violet1.png +3 -0
  20. example.png +3 -0
  21. install-taichi.bat +11 -0
  22. install.bat +16 -0
  23. install.py +59 -0
  24. interpolation_schedule.png +3 -0
  25. other_nodes.py +88 -0
  26. pyproject.toml +13 -0
  27. requirements-no-cupy.txt +9 -0
  28. requirements-with-cupy.txt +10 -0
  29. test.py +38 -0
  30. test_vfi_schedule.gif +3 -0
  31. vfi_models/amt/__init__.py +87 -0
  32. vfi_models/amt/amt_arch.py +1590 -0
  33. vfi_models/cain/__init__.py +64 -0
  34. vfi_models/cain/cain_arch.py +74 -0
  35. vfi_models/cain/cain_encdec_arch.py +95 -0
  36. vfi_models/cain/cain_noca_arch.py +73 -0
  37. vfi_models/cain/common.py +361 -0
  38. vfi_models/eisai/__init__.py +84 -0
  39. vfi_models/eisai/eisai_arch.py +2586 -0
  40. vfi_models/film/__init__.py +113 -0
  41. vfi_models/film/film_arch.py +798 -0
  42. vfi_models/flavr/__init__.py +115 -0
  43. vfi_models/flavr/flavr_arch.py +217 -0
  44. vfi_models/flavr/resnet_3D.py +288 -0
  45. vfi_models/gmfss_fortuna/GMFSS_Fortuna.py +24 -0
  46. vfi_models/gmfss_fortuna/GMFSS_Fortuna_arch.py +1850 -0
  47. vfi_models/gmfss_fortuna/GMFSS_Fortuna_union.py +23 -0
  48. vfi_models/gmfss_fortuna/GMFSS_Fortuna_union_arch.py +1857 -0
  49. vfi_models/gmfss_fortuna/__init__.py +143 -0
  50. vfi_models/ifrnet/IFRNet_L_arch.py +293 -0
.gitattributes CHANGED
@@ -33,3 +33,18 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
33
  *.zip filter=lfs diff=lfs merge=lfs -text
34
  *.zst filter=lfs diff=lfs merge=lfs -text
35
  *tfevents* filter=lfs diff=lfs merge=lfs -text
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
33
  *.zip filter=lfs diff=lfs merge=lfs -text
34
  *.zst filter=lfs diff=lfs merge=lfs -text
35
  *tfevents* filter=lfs diff=lfs merge=lfs -text
36
+ All_in_one_v1_3.png filter=lfs diff=lfs merge=lfs -text
37
+ demo_frames/anime0.png filter=lfs diff=lfs merge=lfs -text
38
+ demo_frames/anime1.png filter=lfs diff=lfs merge=lfs -text
39
+ demo_frames/bocchi0.jpg filter=lfs diff=lfs merge=lfs -text
40
+ demo_frames/bocchi1.jpg filter=lfs diff=lfs merge=lfs -text
41
+ demo_frames/real0.png filter=lfs diff=lfs merge=lfs -text
42
+ demo_frames/real1.png filter=lfs diff=lfs merge=lfs -text
43
+ demo_frames/rick/00003.png filter=lfs diff=lfs merge=lfs -text
44
+ demo_frames/rick/00004.png filter=lfs diff=lfs merge=lfs -text
45
+ demo_frames/rick/00005.png filter=lfs diff=lfs merge=lfs -text
46
+ demo_frames/violet0.png filter=lfs diff=lfs merge=lfs -text
47
+ demo_frames/violet1.png filter=lfs diff=lfs merge=lfs -text
48
+ example.png filter=lfs diff=lfs merge=lfs -text
49
+ interpolation_schedule.png filter=lfs diff=lfs merge=lfs -text
50
+ test_vfi_schedule.gif filter=lfs diff=lfs merge=lfs -text
.github/workflows/publish.yml ADDED
@@ -0,0 +1,25 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ name: Publish to Comfy registry
2
+ on:
3
+ workflow_dispatch:
4
+ push:
5
+ branches:
6
+ - main
7
+ paths:
8
+ - "pyproject.toml"
9
+
10
+ permissions:
11
+ issues: write
12
+
13
+ jobs:
14
+ publish-node:
15
+ name: Publish Custom Node to registry
16
+ runs-on: ubuntu-latest
17
+ if: ${{ github.repository_owner == 'Fannovel16' }}
18
+ steps:
19
+ - name: Check out code
20
+ uses: actions/checkout@v4
21
+ - name: Publish Custom Node
22
+ uses: Comfy-Org/publish-node-action@v1
23
+ with:
24
+ ## Add your own personal access token to your Github Repository secrets and reference it here.
25
+ personal_access_token: ${{ secrets.REGISTRY_ACCESS_TOKEN }}
.gitignore ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ ckpts
2
+ __pycache__
3
+ test_result
All_in_one_v1_3.png ADDED

Git LFS Details

  • SHA256: 90735b644e0c35634642b65f2a8041a9a4da380d27b9bcc4d3bbef47869bd92a
  • Pointer size: 132 Bytes
  • Size of remote file: 1.46 MB
LICENSE ADDED
@@ -0,0 +1,21 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ MIT License
2
+
3
+ Copyright (c) 2023 Fannovel16
4
+
5
+ Permission is hereby granted, free of charge, to any person obtaining a copy
6
+ of this software and associated documentation files (the "Software"), to deal
7
+ in the Software without restriction, including without limitation the rights
8
+ to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
9
+ copies of the Software, and to permit persons to whom the Software is
10
+ furnished to do so, subject to the following conditions:
11
+
12
+ The above copyright notice and this permission notice shall be included in all
13
+ copies or substantial portions of the Software.
14
+
15
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16
+ IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17
+ FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18
+ AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19
+ LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20
+ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21
+ SOFTWARE.
README.md ADDED
@@ -0,0 +1,194 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # ComfyUI Frame Interpolation (ComfyUI VFI) (WIP)
2
+
3
+ A custom node set for Video Frame Interpolation in ComfyUI.
4
+ **UPDATE** Memory management is improved. Now this extension takes less RAM and VRAM than before.
5
+
6
+ **UPDATE 2** VFI nodes now accept scheduling multiplier values
7
+
8
+ ![](./interpolation_schedule.png)
9
+ ![](./test_vfi_schedule.gif)
10
+
11
+ ## Nodes
12
+ * KSampler Gradually Adding More Denoise (efficient)
13
+ * GMFSS Fortuna VFI
14
+ * IFRNet VFI
15
+ * IFUnet VFI
16
+ * M2M VFI
17
+ * RIFE VFI (4.0 - 4.9) (Note that option `fast_mode` won't do anything from v4.5+ as `contextnet` is removed)
18
+ * FILM VFI
19
+ * Sepconv VFI
20
+ * AMT VFI
21
+ * Make Interpolation State List
22
+ * STMFNet VFI (requires at least 4 frames, can only do 2x interpolation for now)
23
+ * FLAVR VFI (same conditions as STMFNet)
24
+
25
+ ## Install
26
+ ### ComfyUI Manager
27
+ The incompatibility issue with it is now fixed
28
+
29
+ Follow this guide to install this extension
30
+
31
+ https://github.com/ltdrdata/ComfyUI-Manager#how-to-use
32
+ ### Command-line
33
+ #### Windows
34
+ Run install.bat
35
+
36
+ For Windows users, if you are having trouble with cupy, please run `install.bat` instead of `install-cupy.py` or `python install.py`.
37
+ #### Linux
38
+ Open your shell app and start venv if it is used for ComfyUI. Run:
39
+ ```
40
+ python install.py
41
+ ```
42
+ ## Support for non-CUDA device (experimental)
43
+ If you don't have a NVidia card, you can try `taichi` ops backend powered by [Taichi Lang](https://www.taichi-lang.org/)
44
+
45
+ On Windows, you can install it by running `install.bat` or `pip install taichi` on Linux
46
+
47
+ Then change value of `ops_backend` from `cupy` to `taichi` in `config.yaml`
48
+
49
+ If `NotImplementedError` appears, a VFI node in the workflow isn't supported by taichi
50
+
51
+ ## Usage
52
+ All VFI nodes can be accessed in **category** `ComfyUI-Frame-Interpolation/VFI` if the installation is successful and require a `IMAGE` containing frames (at least 2, or at least 4 for STMF-Net/FLAVR).
53
+
54
+ Regarding STMFNet and FLAVR, if you only have two or three frames, you should use: Load Images -> Other VFI node (FILM is recommended in this case) with `multiplier=4` -> STMFNet VFI/FLAVR VFI
55
+
56
+ `clear_cache_after_n_frames` is used to avoid out-of-memory. Decreasing it makes the chance lower but also increases processing time.
57
+
58
+ It is recommended to use LoadImages (LoadImagesFromDirectory) from [ComfyUI-Advanced-ControlNet](https://github.com/Kosinkadink/ComfyUI-Advanced-ControlNet/) and [ComfyUI-VideoHelperSuite](https://github.com/Kosinkadink/ComfyUI-VideoHelperSuite) along side with this extension.
59
+
60
+ ## Example
61
+ ### Simple workflow
62
+ Workflow metadata isn't embedded
63
+ Download these two images [anime0.png](./demo_frames/anime0.png) and [anime1.png](./demo_frames/anime1.png) and put them into a folder like `E:\test` as in this image.
64
+ ![](./example.png)
65
+
66
+ ### Complex workflow
67
+ It's used in AnimationDiff (can load workflow metadata)
68
+ ![](All_in_one_v1_3.png)
69
+
70
+ ## Credit
71
+ Big thanks to styler00dollar for making [VSGAN-tensorrt-docker](https://github.com/styler00dollar/VSGAN-tensorrt-docker). About 99% of the code of this repo comes from it.
72
+
73
+ Citation for each VFI node:
74
+ ### GMFSS Fortuna
75
+ The All-In-One GMFSS: Dedicated for Anime Video Frame Interpolation
76
+
77
+ https://github.com/98mxr/GMFSS_Fortuna
78
+
79
+ ### IFRNet
80
+ ```bibtex
81
+ @InProceedings{Kong_2022_CVPR,
82
+ author = {Kong, Lingtong and Jiang, Boyuan and Luo, Donghao and Chu, Wenqing and Huang, Xiaoming and Tai, Ying and Wang, Chengjie and Yang, Jie},
83
+ title = {IFRNet: Intermediate Feature Refine Network for Efficient Frame Interpolation},
84
+ booktitle = {Proceedings of the IEEE/CVF Conference on Computer Vision and Pattern Recognition (CVPR)},
85
+ year = {2022}
86
+ }
87
+ ```
88
+
89
+ ### IFUnet
90
+ RIFE with IFUNet, FusionNet and RefineNet
91
+
92
+ https://github.com/98mxr/IFUNet
93
+ ### M2M
94
+ ```bibtex
95
+ @InProceedings{hu2022m2m,
96
+ title={Many-to-many Splatting for Efficient Video Frame Interpolation},
97
+ author={Hu, Ping and Niklaus, Simon and Sclaroff, Stan and Saenko, Kate},
98
+ journal={CVPR},
99
+ year={2022}
100
+ }
101
+ ```
102
+
103
+ ### RIFE
104
+ ```bibtex
105
+ @inproceedings{huang2022rife,
106
+ title={Real-Time Intermediate Flow Estimation for Video Frame Interpolation},
107
+ author={Huang, Zhewei and Zhang, Tianyuan and Heng, Wen and Shi, Boxin and Zhou, Shuchang},
108
+ booktitle={Proceedings of the European Conference on Computer Vision (ECCV)},
109
+ year={2022}
110
+ }
111
+ ```
112
+
113
+ ### FILM
114
+ [Frame interpolation in PyTorch](https://github.com/dajes/frame-interpolation-pytorch)
115
+
116
+ ```bibtex
117
+ @inproceedings{reda2022film,
118
+ title = {FILM: Frame Interpolation for Large Motion},
119
+ author = {Fitsum Reda and Janne Kontkanen and Eric Tabellion and Deqing Sun and Caroline Pantofaru and Brian Curless},
120
+ booktitle = {European Conference on Computer Vision (ECCV)},
121
+ year = {2022}
122
+ }
123
+ ```
124
+
125
+ ```bibtex
126
+ @misc{film-tf,
127
+ title = {Tensorflow 2 Implementation of "FILM: Frame Interpolation for Large Motion"},
128
+ author = {Fitsum Reda and Janne Kontkanen and Eric Tabellion and Deqing Sun and Caroline Pantofaru and Brian Curless},
129
+ year = {2022},
130
+ publisher = {GitHub},
131
+ journal = {GitHub repository},
132
+ howpublished = {\url{https://github.com/google-research/frame-interpolation}}
133
+ }
134
+ ```
135
+
136
+ ### Sepconv
137
+ ```bibtex
138
+ [1] @inproceedings{Niklaus_WACV_2021,
139
+ author = {Simon Niklaus and Long Mai and Oliver Wang},
140
+ title = {Revisiting Adaptive Convolutions for Video Frame Interpolation},
141
+ booktitle = {IEEE Winter Conference on Applications of Computer Vision},
142
+ year = {2021}
143
+ }
144
+ ```
145
+
146
+ ```bibtex
147
+ [2] @inproceedings{Niklaus_ICCV_2017,
148
+ author = {Simon Niklaus and Long Mai and Feng Liu},
149
+ title = {Video Frame Interpolation via Adaptive Separable Convolution},
150
+ booktitle = {IEEE International Conference on Computer Vision},
151
+ year = {2017}
152
+ }
153
+ ```
154
+
155
+ ```bibtex
156
+ [3] @inproceedings{Niklaus_CVPR_2017,
157
+ author = {Simon Niklaus and Long Mai and Feng Liu},
158
+ title = {Video Frame Interpolation via Adaptive Convolution},
159
+ booktitle = {IEEE Conference on Computer Vision and Pattern Recognition},
160
+ year = {2017}
161
+ }
162
+ ```
163
+
164
+ ### AMT
165
+ ```bibtex
166
+ @inproceedings{licvpr23amt,
167
+ title={AMT: All-Pairs Multi-Field Transforms for Efficient Frame Interpolation},
168
+ author={Li, Zhen and Zhu, Zuo-Liang and Han, Ling-Hao and Hou, Qibin and Guo, Chun-Le and Cheng, Ming-Ming},
169
+ booktitle={IEEE Conference on Computer Vision and Pattern Recognition (CVPR)},
170
+ year={2023}
171
+ }
172
+ ```
173
+
174
+ ### ST-MFNet
175
+ ```bibtex
176
+ @InProceedings{Danier_2022_CVPR,
177
+ author = {Danier, Duolikun and Zhang, Fan and Bull, David},
178
+ title = {ST-MFNet: A Spatio-Temporal Multi-Flow Network for Frame Interpolation},
179
+ booktitle = {Proceedings of the IEEE/CVF Conference on Computer Vision and Pattern Recognition (CVPR)},
180
+ month = {June},
181
+ year = {2022},
182
+ pages = {3521-3531}
183
+ }
184
+ ```
185
+
186
+ ### FLAVR
187
+ ```bibtex
188
+ @article{kalluri2021flavr,
189
+ title={FLAVR: Flow-Agnostic Video Representations for Fast Frame Interpolation},
190
+ author={Kalluri, Tarun and Pathak, Deepak and Chandraker, Manmohan and Tran, Du},
191
+ booktitle={arxiv},
192
+ year={2021}
193
+ }
194
+ ```
__init__.py ADDED
@@ -0,0 +1,42 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import os
2
+ import sys
3
+ sys.path.insert(0, os.path.abspath(os.path.dirname(__file__)))
4
+
5
+ from .other_nodes import Gradually_More_Denoise_KSampler
6
+
7
+ #Some models are commented out because the code is not completed
8
+ #from vfi_models.eisai import EISAI_VFI
9
+ from vfi_models.gmfss_fortuna import GMFSS_Fortuna_VFI
10
+ from vfi_models.ifrnet import IFRNet_VFI
11
+ from vfi_models.ifunet import IFUnet_VFI
12
+ from vfi_models.m2m import M2M_VFI
13
+ from vfi_models.rife import RIFE_VFI
14
+ from vfi_models.sepconv import SepconvVFI
15
+ from vfi_models.amt import AMT_VFI
16
+ from vfi_models.film import FILM_VFI
17
+ from vfi_models.stmfnet import STMFNet_VFI
18
+ from vfi_models.flavr import FLAVR_VFI
19
+ from vfi_models.cain import CAIN_VFI
20
+ from vfi_utils import MakeInterpolationStateList, FloatToInt
21
+
22
+ NODE_CLASS_MAPPINGS = {
23
+ "KSampler Gradually Adding More Denoise (efficient)": Gradually_More_Denoise_KSampler,
24
+ # "EISAI VFI": EISAI_VFI,
25
+ "GMFSS Fortuna VFI": GMFSS_Fortuna_VFI,
26
+ "IFRNet VFI": IFRNet_VFI,
27
+ "IFUnet VFI": IFUnet_VFI,
28
+ "M2M VFI": M2M_VFI,
29
+ "RIFE VFI": RIFE_VFI,
30
+ "Sepconv VFI": SepconvVFI,
31
+ "AMT VFI": AMT_VFI,
32
+ "FILM VFI": FILM_VFI,
33
+ "Make Interpolation State List": MakeInterpolationStateList,
34
+ "STMFNet VFI": STMFNet_VFI,
35
+ "FLAVR VFI": FLAVR_VFI,
36
+ "CAIN VFI": CAIN_VFI,
37
+ "VFI FloatToInt": FloatToInt
38
+ }
39
+
40
+ NODE_DISPLAY_NAME_MAPPINGS = {
41
+ "RIFE VFI": "RIFE VFI (recommend rife47 and rife49)"
42
+ }
config.yaml ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ # Please don't delete this file; just edit it when necessary.
2
+ ckpts_path: "./ckpts"
3
+ ops_backend: "cupy" #Either "taichi" or "cupy"
demo_frames/anime0.png ADDED

Git LFS Details

  • SHA256: 734039ac77a89cf8d52fed8989bd4335392a1d246b099979d1c58a145c629ace
  • Pointer size: 131 Bytes
  • Size of remote file: 341 kB
demo_frames/anime1.png ADDED

Git LFS Details

  • SHA256: dd24bdafe9a0cfc82eada33c40962e9977ed5b6711ae6d89bf28b07cbded712a
  • Pointer size: 131 Bytes
  • Size of remote file: 329 kB
demo_frames/bocchi0.jpg ADDED

Git LFS Details

  • SHA256: c607fae213b83d4c15fa10d6939b612f7f2242afd0b8716b203ace51774f6718
  • Pointer size: 131 Bytes
  • Size of remote file: 130 kB
demo_frames/bocchi1.jpg ADDED

Git LFS Details

  • SHA256: f03f067142490d4353d3f5af8bd51b0f9f4bdd3d2094dde6a28f4fec062fbe16
  • Pointer size: 131 Bytes
  • Size of remote file: 140 kB
demo_frames/real0.png ADDED

Git LFS Details

  • SHA256: 4792023ccf17c8231c6eb5ee40de528d515e2f8c419b3949985411a122a4de4f
  • Pointer size: 132 Bytes
  • Size of remote file: 1.23 MB
demo_frames/real1.png ADDED

Git LFS Details

  • SHA256: 37c8e6ec527c81895e5a66ea49cdd18b85045f9fed6fdfb75b45f438649235bf
  • Pointer size: 132 Bytes
  • Size of remote file: 1.21 MB
demo_frames/rick/00003.png ADDED

Git LFS Details

  • SHA256: 98f5dba7557ba55d13f494425d340ca84af8b56e35f929fab5df39e54015e265
  • Pointer size: 131 Bytes
  • Size of remote file: 456 kB
demo_frames/rick/00004.png ADDED

Git LFS Details

  • SHA256: 61bcf7933b192d84870b80910f7f983371c642d5c7100b34e8cc6dbd01cba7e6
  • Pointer size: 131 Bytes
  • Size of remote file: 355 kB
demo_frames/rick/00005.png ADDED

Git LFS Details

  • SHA256: f795d06e93ad4f9c19db578e9378a48b6008cc3df81fb2cd9fbbd5ed91bd8cf7
  • Pointer size: 131 Bytes
  • Size of remote file: 357 kB
demo_frames/violet0.png ADDED

Git LFS Details

  • SHA256: c6844899b551801ee22d4f57993ab66fd4b6fbe00eab916d6b987bdf083eadfe
  • Pointer size: 131 Bytes
  • Size of remote file: 889 kB
demo_frames/violet1.png ADDED

Git LFS Details

  • SHA256: 66ee9a9a486f57eb80ba5d41140eaca4ca46f0d946a3cff93eabb0ee3b1e29d0
  • Pointer size: 131 Bytes
  • Size of remote file: 951 kB
example.png ADDED

Git LFS Details

  • SHA256: 9a5e9310bfba63b109990b326402d42477688682858bc64f146ef546e6662ead
  • Pointer size: 131 Bytes
  • Size of remote file: 182 kB
install-taichi.bat ADDED
@@ -0,0 +1,11 @@
 
 
 
 
 
 
 
 
 
 
 
 
1
+ @echo off
2
+ echo Installing Taichi lang backend...
3
+
4
+ if exist "%python_exec%" (
5
+ %python_exec% -s -m pip install taichi
6
+ ) else (
7
+ echo Installing with system Python
8
+ pip install taichi
9
+ )
10
+
11
+ pause
install.bat ADDED
@@ -0,0 +1,16 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ @echo off
2
+
3
+ set "requirements_txt=%~dp0\requirements-no-cupy.txt"
4
+ set "python_exec=..\..\..\python_embeded\python.exe"
5
+
6
+ echo Installing ComfyUI Frame Interpolation..
7
+
8
+ if exist "%python_exec%" (
9
+ echo Installing with ComfyUI Portable
10
+ %python_exec% -s install.py
11
+ ) else (
12
+ echo Installing with system Python
13
+ python install.py
14
+ )
15
+
16
+ pause
install.py ADDED
@@ -0,0 +1,59 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import os
2
+ from pathlib import Path
3
+ import sys
4
+ import platform
5
+
6
def get_cuda_ver_from_dir(cuda_home):
    """Guess the CUDA toolkit version from the nvrtc-builtins library in *cuda_home*.

    Returns a cupy wheel version suffix ('102', '110', '111', '11x', '12x'),
    or None when the directory is unusable or no version can be determined
    (the caller then falls back to the generic 'cupy-wheel' package).
    """
    # Guard: os.listdir(None) silently lists the current working directory,
    # and a nonexistent path would raise — either way we can't detect CUDA.
    if cuda_home is None or not os.path.isdir(cuda_home):
        return None
    nvrtc = [lib_file for lib_file in os.listdir(cuda_home) if "nvrtc-builtins" in lib_file]
    if len(nvrtc) == 0:
        return None
    name = nvrtc[0]
    # Most specific versions must be checked first: the bare '11' test below
    # would also match '110'/'111' file names.
    if ('102' in name) or ('10.2' in name):
        return '102'
    if ('110' in name) or ('11.0' in name):
        return '110'
    if ('111' in name) or ('11.1' in name):
        return '111'
    if '11' in name:
        return '11x'
    if '12' in name:
        return '12x'
    return None
22
+
23
+ s_param = '-s' if "python_embeded" in sys.executable else ''
24
+
25
def get_cuda_home_path():
    """Locate a usable CUDA runtime directory.

    An explicit CUDA_HOME environment variable always wins. Otherwise fall
    back to the lib directory bundled with the installed torch package,
    provided it actually ships an nvrtc-builtins library; return None when
    neither source is usable.
    """
    env_home = os.environ.get("CUDA_HOME")
    if env_home is not None:
        return env_home
    import torch
    lib_dir = str((Path(torch.__file__).parent / "lib").resolve())
    if not os.path.exists(lib_dir):
        return None
    has_nvrtc = any("nvrtc-builtins" in name for name in os.listdir(lib_dir))
    return lib_dir if has_nvrtc else None
35
+
36
def install_cupy():
    """Ensure a CuPy build matching the local CUDA runtime is installed.

    If CuPy already imports cleanly, nothing is reinstalled. Otherwise any
    stale cupy wheels are removed and the wheel matching the detected CUDA
    version — or the source 'cupy-wheel' fallback — is installed via pip.
    """
    cuda_home = get_cuda_home_path()
    # Export for cupy's build/runtime detection (harmless if already set).
    if cuda_home is not None:
        os.environ["CUDA_HOME"] = cuda_home
        os.environ["CUDA_PATH"] = cuda_home
    try:
        import cupy  # noqa: F401
        print("CuPy is already installed.")
    except ImportError:
        # Was a bare `except:`, which also swallowed KeyboardInterrupt and
        # unrelated errors; only a failed import should trigger a reinstall.
        print("Uninstall cupy if existed...")
        os.system(f'"{sys.executable}" {s_param} -m pip uninstall -y cupy-wheel cupy-cuda102 cupy-cuda110 cupy-cuda111 cupy-cuda11x cupy-cuda12x')
        print("Installing cupy...")
        cuda_ver = get_cuda_ver_from_dir(cuda_home)
        cupy_package = f"cupy-cuda{cuda_ver}" if cuda_ver is not None else "cupy-wheel"
        os.system(f'"{sys.executable}" {s_param} -m pip install {cupy_package}')
51
+
52
+ with open(Path(__file__).parent / "requirements-no-cupy.txt", 'r') as f:
53
+ for package in f.readlines():
54
+ package = package.strip()
55
+ print(f"Installing {package}...")
56
+ os.system(f'"{sys.executable}" {s_param} -m pip install {package}')
57
+
58
+ print("Checking cupy...")
59
+ install_cupy()
interpolation_schedule.png ADDED

Git LFS Details

  • SHA256: c6999ee4a5fd6222b7b05adb8afa4994053bfe8e0f9c6b5cccf25992638b586c
  • Pointer size: 131 Bytes
  • Size of remote file: 378 kB
other_nodes.py ADDED
@@ -0,0 +1,88 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import latent_preview
2
+ import comfy
3
+ import einops
4
+ import torch
5
+
6
+ def common_ksampler(model, seed, steps, cfg, sampler_name, scheduler, positive, negative, latent, denoise=1.0, disable_noise=False, start_step=None, last_step=None, force_full_denoise=False):
7
+ device = comfy.model_management.get_torch_device()
8
+ latent_image = latent["samples"]
9
+
10
+ if disable_noise:
11
+ noise = torch.zeros(latent_image.size(), dtype=latent_image.dtype, layout=latent_image.layout, device="cpu")
12
+ else:
13
+ batch_inds = latent["batch_index"] if "batch_index" in latent else None
14
+ noise = comfy.sample.prepare_noise(latent_image, seed, batch_inds)
15
+
16
+ noise_mask = None
17
+ if "noise_mask" in latent:
18
+ noise_mask = latent["noise_mask"]
19
+
20
+ preview_format = "JPEG"
21
+ if preview_format not in ["JPEG", "PNG"]:
22
+ preview_format = "JPEG"
23
+
24
+ previewer = latent_preview.get_previewer(device, model.model.latent_format)
25
+
26
+ pbar = comfy.utils.ProgressBar(steps)
27
+ def callback(step, x0, x, total_steps):
28
+ preview_bytes = None
29
+ if previewer:
30
+ preview_bytes = previewer.decode_latent_to_preview_image(preview_format, x0)
31
+ pbar.update_absolute(step + 1, total_steps, preview_bytes)
32
+
33
+ samples = comfy.sample.sample(model, noise, steps, cfg, sampler_name, scheduler, positive, negative, latent_image,
34
+ denoise=denoise, disable_noise=disable_noise, start_step=start_step, last_step=last_step,
35
+ force_full_denoise=force_full_denoise, noise_mask=noise_mask, callback=callback, seed=seed)
36
+ out = latent.copy()
37
+ out["samples"] = samples
38
+ return (out, )
39
+
40
class Gradually_More_Denoise_KSampler:
    """Sample each input latent repeatedly with linearly increasing denoise.

    For every latent in the input batch, runs `denoise_increment_steps`
    sampling passes where pass i uses
    `denoise = start_denoise + denoise_increment * i`, then concatenates all
    results into one latent batch (useful for denoise-schedule videos).
    """

    @classmethod
    def INPUT_TYPES(s):
        return {"required":
                    {"model": ("MODEL",),
                     "positive": ("CONDITIONING", ),
                     "negative": ("CONDITIONING", ),
                     "latent_image": ("LATENT", ),

                     "seed": ("INT", {"default": 0, "min": 0, "max": 0xffffffffffffffff}),
                     "steps": ("INT", {"default": 20, "min": 1, "max": 10000}),
                     "cfg": ("FLOAT", {"default": 8.0, "min": 0.0, "max": 100.0}),
                     "sampler_name": (comfy.samplers.KSampler.SAMPLERS, ),
                     "scheduler": (comfy.samplers.KSampler.SCHEDULERS, ),

                     "start_denoise": ("FLOAT", {"default": 0.0, "min": 0.0, "max": 1.0, "step": 0.01}),
                     "denoise_increment": ("FLOAT", {"default": 0.1, "min": 0.0, "max": 1.0, "step": 0.1}),
                     "denoise_increment_steps": ("INT", {"default": 20, "min": 1, "max": 10000})
                     },
                "optional": {"optional_vae": ("VAE",)}
                }

    RETURN_TYPES = ("MODEL", "CONDITIONING", "CONDITIONING", "LATENT", "VAE", )
    RETURN_NAMES = ("MODEL", "CONDITIONING+", "CONDITIONING-", "LATENT", "VAE", )
    OUTPUT_NODE = True
    FUNCTION = "sample"
    CATEGORY = "ComfyUI-Frame-Interpolation/others"

    def sample(self, model, positive, negative, latent_image, optional_vae,
               seed, steps, cfg, sampler_name, scheduler, start_denoise, denoise_increment, denoise_increment_steps):
        """Run the denoise schedule; returns (model, positive, negative, latent, vae).

        Raises ValueError when the strongest scheduled denoise would exceed 1.0.
        """
        # Pass i uses start_denoise + denoise_increment * i for
        # i in range(denoise_increment_steps), so the maximum strength applied
        # is start + increment * (steps - 1). The previous check over-counted
        # by one increment and rejected valid schedules (e.g. 0.1 + 0.1 * 9).
        max_denoise = start_denoise + denoise_increment * (denoise_increment_steps - 1)
        if max_denoise > 1.0:
            raise ValueError(
                f"Max denoise strength can't be over 1.0 (start_denoise={start_denoise}, "
                f"denoise_increment={denoise_increment}, denoise_increment_steps={denoise_increment_steps})")

        copied_latent = latent_image.copy()
        out_samples = []

        for latent_sample in copied_latent["samples"]:
            # Latent's shape is NCHW; re-add the batch dim for the sampler.
            latent = {"samples": einops.rearrange(latent_sample, "c h w -> 1 c h w")}
            gradually_denoising_samples = [
                common_ksampler(
                    model, seed, steps, cfg, sampler_name, scheduler, positive, negative, latent,
                    denoise=start_denoise + denoise_increment * i
                )[0]["samples"]
                for i in range(denoise_increment_steps)
            ]
            out_samples.extend(gradually_denoising_samples)

        copied_latent["samples"] = torch.cat(out_samples, dim=0)
        return (model, positive, negative, copied_latent, optional_vae)
pyproject.toml ADDED
@@ -0,0 +1,13 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ [project]
2
+ name = "comfyui-frame-interpolation"
3
+ description = "A custom node suite for Video Frame Interpolation in ComfyUI"
4
+ version = "1.0.7"
5
+ license = { file = "LICENSE" }
6
+
7
+ [project.urls]
8
+ Repository = "https://github.com/Fannovel16/ComfyUI-Frame-Interpolation"
9
+
10
+ [tool.comfy]
11
+ PublisherId = "fannovel16"
12
+ DisplayName = "ComfyUI-Frame-Interpolation"
13
+ Icon = ""
requirements-no-cupy.txt ADDED
@@ -0,0 +1,9 @@
 
 
 
 
 
 
 
 
 
 
1
+ torch
2
+ numpy
3
+ einops
4
+ opencv-contrib-python
5
+ kornia
6
+ scipy
7
+ Pillow
8
+ torchvision
9
+ tqdm
requirements-with-cupy.txt ADDED
@@ -0,0 +1,10 @@
 
 
 
 
 
 
 
 
 
 
 
1
+ torch
2
+ numpy
3
+ einops
4
+ opencv-contrib-python
5
+ kornia
6
+ scipy
7
+ Pillow
8
+ torchvision
9
+ tqdm
10
+ cupy-wheel
test.py ADDED
@@ -0,0 +1,38 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import os
2
+ import sys
3
+ sys.path.insert(0, os.path.abspath(os.path.dirname(__file__)))
4
+
5
+ import shutil
6
+ import torch
7
+ import torch.nn.functional as F
8
+ import PIL
9
+ import torchvision.transforms.functional as transform
10
+ from vfi_utils import load_file_from_github_release
11
+ from vfi_models import gmfss_fortuna, ifrnet, ifunet, m2m, rife, sepconv, amt, xvfi, cain, flavr
12
+ import numpy as np
13
+
14
+ frame_0 = torch.from_numpy(np.array(PIL.Image.open("demo_frames/anime0.png").convert("RGB")).astype(np.float32) / 255.0).unsqueeze(0)
15
+ frame_1 = torch.from_numpy(np.array(PIL.Image.open("demo_frames/anime1.png").convert("RGB")).astype(np.float32) / 255.0).unsqueeze(0)
16
+
17
+
18
+ if os.path.exists("test_result"):
19
+ shutil.rmtree("test_result")
20
+
21
+ vfi_node_class = gmfss_fortuna.GMFSS_Fortuna_VFI()
22
+ for i, ckpt_name in enumerate(vfi_node_class.INPUT_TYPES()["required"]["ckpt_name"][0][:2]):
23
+ result = vfi_node_class.vfi(ckpt_name, torch.cat([
24
+ frame_0,
25
+ frame_1,
26
+ frame_0,
27
+ frame_1
28
+ ], dim=0).cuda(), multipler=4, batch_size=2)[0]
29
+ print(result.shape)
30
+ print(f"Generated {result.size(0)} frames")
31
+ frames = [PIL.Image.fromarray(np.clip((frame * 255).numpy(), 0, 255).astype(np.uint8)) for frame in result]
32
+ print(result[0].shape)
33
+ os.makedirs(f"test_result/video{i}", exist_ok=True)
34
+ for j, frame in enumerate(frames):
35
+ frame.save(f"test_result/video{i}/{j}.jpg")
36
+ frames[0].save(f"test_result/video{i}.gif", save_all=True, append_images=frames[1:], optimize=True, duration=1/3, loop=0)
37
+ os.startfile(f"test_result{os.path.sep}video{i}.gif")
38
+ #torchvision.io.video.write_video("test.mp4", einops.rearrange(result, "n c h w -> n h w c").cpu(), fps=1)
test_vfi_schedule.gif ADDED

Git LFS Details

  • SHA256: 931fcd4c2cc84b457cbc1b1c3b8745a2bf292ff7dc43d4f733a2c510ad90353d
  • Pointer size: 132 Bytes
  • Size of remote file: 8.41 MB
vfi_models/amt/__init__.py ADDED
@@ -0,0 +1,87 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import pathlib
2
+ import torch
3
+ from torch.utils.data import DataLoader
4
+ import pathlib
5
+ from vfi_utils import load_file_from_direct_url, preprocess_frames, postprocess_frames, generic_frame_loop, InterpolationStateList
6
+ import typing
7
+ from comfy.model_management import get_torch_device
8
+ from .amt_arch import AMT_S, AMT_L, AMT_G, InputPadder
9
+
10
+ #https://github.com/MCG-NKU/AMT/tree/main/cfgs
11
+ CKPT_CONFIGS = {
12
+ "amt-s.pth": {
13
+ "network": AMT_S,
14
+ "params": { "corr_radius": 3, "corr_lvls": 4, "num_flows": 3 }
15
+ },
16
+ "amt-l.pth": {
17
+ "network": AMT_L,
18
+ "params": { "corr_radius": 3, "corr_lvls": 4, "num_flows": 5 }
19
+ },
20
+ "amt-g.pth": {
21
+ "network": AMT_G,
22
+ "params": { "corr_radius": 3, "corr_lvls": 4, "num_flows": 5 }
23
+ },
24
+ "gopro_amt-s.pth": {
25
+ "network": AMT_S,
26
+ "params": { "corr_radius": 3, "corr_lvls": 4, "num_flows": 3 }
27
+ }
28
+ }
29
+
30
+
31
+ MODEL_TYPE = pathlib.Path(__file__).parent.name
32
+
33
class AMT_VFI:
    """ComfyUI node wrapping the AMT family of frame-interpolation networks."""

    @classmethod
    def INPUT_TYPES(s):
        return {
            "required": {
                "ckpt_name": (list(CKPT_CONFIGS.keys()), ),
                "frames": ("IMAGE", ),
                "clear_cache_after_n_frames": ("INT", {"default": 1, "min": 1, "max": 100}),
                "multiplier": ("INT", {"default": 2, "min": 2, "max": 1000})
            },
            "optional": {
                "optional_interpolation_states": ("INTERPOLATION_STATES", )
            }
        }

    RETURN_TYPES = ("IMAGE", )
    FUNCTION = "vfi"
    CATEGORY = "ComfyUI-Frame-Interpolation/VFI"

    def vfi(
        self,
        ckpt_name: typing.AnyStr,
        frames: torch.Tensor,
        clear_cache_after_n_frames: typing.SupportsInt = 1,
        multiplier: typing.SupportsInt = 2,
        optional_interpolation_states: InterpolationStateList = None,
        **kwargs
    ):
        """Interpolate `frames` by `multiplier` using the selected AMT checkpoint."""
        # Fetch (or reuse a cached) checkpoint and build the matching network.
        ckpt_path = load_file_from_direct_url(MODEL_TYPE, f"https://huggingface.co/lalala125/AMT/resolve/main/{ckpt_name}")
        config = CKPT_CONFIGS[ckpt_name]
        net = config["network"](**config["params"])
        net.load_state_dict(torch.load(ckpt_path)["state_dict"])
        net.eval().to(get_torch_device())

        # Pad spatial dims to a multiple of 16 before inference, unpad after.
        frames = preprocess_frames(frames)
        padder = InputPadder(frames.shape, 16)
        frames = padder.pad(frames)

        def return_middle_frame(frame_0, frame_1, timestep, model):
            # Per-sample timestep embedding of shape (N, 1, 1, 1).
            batch = frame_0.shape[0]
            embt = torch.FloatTensor([timestep] * batch).view(batch, 1, 1, 1).to(get_torch_device())
            return model(frame_0, frame_1, embt=embt, scale_factor=1.0, eval=True)["imgt_pred"]

        out = generic_frame_loop(
            type(self).__name__, frames, clear_cache_after_n_frames, multiplier,
            return_middle_frame, net,
            interpolation_states=optional_interpolation_states, dtype=torch.float32)
        return (postprocess_frames(padder.unpad(out)),)
87
+
vfi_models/amt/amt_arch.py ADDED
@@ -0,0 +1,1590 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ https://github.com/MCG-NKU/AMT/blob/main/utils/dist_utils.py
3
+ https://github.com/MCG-NKU/AMT/blob/main/utils/flow_utils.py
4
+ https://github.com/MCG-NKU/AMT/blob/main/utils/utils.py
5
+ https://github.com/MCG-NKU/AMT/blob/main/networks/blocks/feat_enc.py
6
+ https://github.com/MCG-NKU/AMT/blob/main/networks/blocks/ifrnet.py
7
+ https://github.com/MCG-NKU/AMT/blob/main/networks/blocks/multi_flow.py
8
+ https://github.com/MCG-NKU/AMT/blob/main/networks/blocks/raft.py
9
+ https://github.com/MCG-NKU/AMT/blob/main/networks/AMT-S.py
10
+ https://github.com/MCG-NKU/AMT/blob/main/networks/AMT-L.py
11
+ https://github.com/MCG-NKU/AMT/blob/main/networks/AMT-G.py
12
+ """
13
+ # Removed the imageio dependency by removing readImage and writeImage.
14
+ # The model receives image tensors from other ComfyUI nodes, so those helpers are unnecessary.
15
+
16
+ import torch
17
+ import torch.nn as nn
18
+ import numpy as np
19
+ from PIL import ImageFile
20
+ import torch.nn.functional as F
21
+ ImageFile.LOAD_TRUNCATED_IMAGES = True
22
+ import re
23
+ import sys
24
+ import random
25
+
26
def warp(img, flow):
    """Backward-warp `img` with a dense optical-flow field (in pixels) via grid_sample."""
    n, _, h, w = flow.shape
    # Base sampling grid in grid_sample's normalized [-1, 1] coordinates.
    xs = torch.linspace(-1.0, 1.0, w).view(1, 1, 1, w).expand(n, -1, h, -1)
    ys = torch.linspace(-1.0, 1.0, h).view(1, 1, h, 1).expand(n, -1, -1, w)
    base_grid = torch.cat([xs, ys], 1).to(img)
    # Convert the pixel-space flow into the same normalized space.
    norm_flow = torch.cat(
        [flow[:, 0:1, :, :] / ((w - 1.0) / 2.0),
         flow[:, 1:2, :, :] / ((h - 1.0) / 2.0)], 1)
    sample_grid = (base_grid + norm_flow).permute(0, 2, 3, 1)
    return F.grid_sample(input=img, grid=sample_grid, mode='bilinear',
                         padding_mode='border', align_corners=True)
35
+
36
+
37
def make_colorwheel():
    """
    Generates a color wheel for optical flow visualization as presented in:
    Baker et al. "A Database and Evaluation Methodology for Optical Flow" (ICCV, 2007)
    URL: http://vision.middlebury.edu/flow/flowEval-iccv07.pdf
    Code follows the original C++ source code of Daniel Scharstein
    and the Matlab source code of Deqing Sun.
    Returns:
        np.ndarray: (55, 3) color wheel
    """
    # Segment lengths between the six anchor hues.
    segments = [('RY', 15), ('YG', 6), ('GC', 4), ('CB', 11), ('BM', 13), ('MR', 6)]
    total = sum(n for _, n in segments)
    wheel = np.zeros((total, 3))

    col = 0
    for name, n in segments:
        ramp = np.floor(255 * np.arange(n) / n)
        if name == 'RY':
            wheel[col:col + n, 0] = 255
            wheel[col:col + n, 1] = ramp
        elif name == 'YG':
            wheel[col:col + n, 0] = 255 - ramp
            wheel[col:col + n, 1] = 255
        elif name == 'GC':
            wheel[col:col + n, 1] = 255
            wheel[col:col + n, 2] = ramp
        elif name == 'CB':
            wheel[col:col + n, 1] = 255 - ramp
            wheel[col:col + n, 2] = 255
        elif name == 'BM':
            wheel[col:col + n, 2] = 255
            wheel[col:col + n, 0] = ramp
        else:  # 'MR'
            wheel[col:col + n, 2] = 255 - ramp
            wheel[col:col + n, 0] = 255
        col += n
    return wheel
83
+
84
def flow_uv_to_colors(u, v, convert_to_bgr=False):
    """
    Applies the flow color wheel to (possibly clipped) flow components u and v.
    Args:
        u (np.ndarray): Input horizontal flow of shape [H,W]
        v (np.ndarray): Input vertical flow of shape [H,W]
        convert_to_bgr (bool, optional): Convert output image to BGR. Defaults to False.
    Returns:
        np.ndarray: Flow visualization image of shape [H,W,3]
    """
    wheel = make_colorwheel()  # (55, 3)
    ncols = wheel.shape[0]
    magnitude = np.sqrt(np.square(u) + np.square(v))
    angle = np.arctan2(-v, -u) / np.pi
    # Fractional position of each flow angle around the wheel.
    fk = (angle + 1) / 2 * (ncols - 1)
    k0 = np.floor(fk).astype(np.int32)
    k1 = k0 + 1
    k1[k1 == ncols] = 0
    frac = fk - k0

    out = np.zeros((u.shape[0], u.shape[1], 3), np.uint8)
    for ch in range(wheel.shape[1]):
        column = wheel[:, ch]
        blended = (1 - frac) * (column[k0] / 255.0) + frac * (column[k1] / 255.0)
        in_range = magnitude <= 1
        # Saturation scales with magnitude inside the unit circle.
        blended[in_range] = 1 - magnitude[in_range] * (1 - blended[in_range])
        blended[~in_range] = blended[~in_range] * 0.75  # out of range
        # BGR output simply reverses the channel index.
        out[:, :, 2 - ch if convert_to_bgr else ch] = np.floor(255 * blended)
    return out
118
+
119
def flow_to_image(flow_uv, clip_flow=None, convert_to_bgr=False):
    """
    Render a two-channel flow field as a color image.
    Args:
        flow_uv (np.ndarray): Flow UV image of shape [H,W,2]
        clip_flow (float, optional): Clip maximum of flow values. Defaults to None.
        convert_to_bgr (bool, optional): Convert output image to BGR. Defaults to False.
    Returns:
        np.ndarray: Flow visualization image of shape [H,W,3]
    """
    assert flow_uv.ndim == 3, 'input flow must have three dimensions'
    assert flow_uv.shape[2] == 2, 'input flow must have shape [H,W,2]'
    if clip_flow is not None:
        flow_uv = np.clip(flow_uv, 0, clip_flow)
    u, v = flow_uv[:, :, 0], flow_uv[:, :, 1]
    # Normalize by the largest magnitude so colors span the whole wheel.
    largest = np.max(np.sqrt(np.square(u) + np.square(v)))
    eps = 1e-5
    return flow_uv_to_colors(u / (largest + eps), v / (largest + eps), convert_to_bgr)
141
+
142
+
143
+
144
+
145
+
146
+
147
+
148
+
149
+
150
+
151
+
152
class AverageMeter():
    """Tracks the latest value plus a running sum, count, and average."""

    def __init__(self):
        self.reset()

    def reset(self):
        """Clear all statistics back to zero."""
        self.val = 0.
        self.avg = 0.
        self.sum = 0.
        self.count = 0

    def update(self, val, n=1):
        """Record `val` observed `n` times and refresh the running average."""
        self.val = val
        self.count += n
        self.sum += val * n
        self.avg = self.sum / self.count
167
+
168
+
169
class AverageMeterGroups:
    """A name-keyed collection of AverageMeter instances."""

    def __init__(self) -> None:
        self.meter_dict = dict()

    def update(self, dict, n=1):
        """Fold each name -> value pair into its meter, creating meters lazily."""
        for name, val in dict.items():
            meter = self.meter_dict.get(name)
            if meter is None:
                meter = AverageMeter()
                self.meter_dict[name] = meter
            meter.update(val, n)

    def reset(self, name=None):
        """Reset one named meter, or every meter when `name` is None."""
        if name is not None:
            meter = self.meter_dict.get(name)
            if meter is not None:
                meter.reset()
        else:
            for meter in self.meter_dict.values():
                meter.reset()

    def avg(self, name):
        """Return the running average for `name`, or None if unknown."""
        meter = self.meter_dict.get(name)
        return meter.avg if meter is not None else None
192
+
193
+
194
class InputPadder:
    """ Pads images such that dimensions are divisible by divisor """

    def __init__(self, dims, divisor=16):
        self.ht, self.wd = dims[-2:]
        # Amount needed to round each spatial dim up to a multiple of divisor.
        extra_h = (((self.ht // divisor) + 1) * divisor - self.ht) % divisor
        extra_w = (((self.wd // divisor) + 1) * divisor - self.wd) % divisor
        # Split between both sides: [left, right, top, bottom] (F.pad order).
        self._pad = [extra_w // 2, extra_w - extra_w // 2,
                     extra_h // 2, extra_h - extra_h // 2]

    def pad(self, input_tensor):
        """Replicate-pad the tensor up to the divisible size."""
        return F.pad(input_tensor, self._pad, mode='replicate')

    def unpad(self, input_tensor):
        """Crop a padded tensor back to the original spatial size."""
        return self._unpad(input_tensor)

    def _unpad(self, x):
        ht, wd = x.shape[-2:]
        top, bottom = self._pad[2], ht - self._pad[3]
        left, right = self._pad[0], wd - self._pad[1]
        return x[..., top:bottom, left:right]
212
+
213
+
214
def img2tensor(img):
    """Convert an HWC image (extra channels beyond 3 dropped) into a 1xCxHxW tensor scaled by 1/255."""
    rgb = img[:, :, :3] if img.shape[-1] > 3 else img
    return torch.tensor(rgb).permute(2, 0, 1).unsqueeze(0) / 255.0
218
+
219
+
220
def tensor2img(img_t):
    """Convert a 1xCxHxW tensor in [0, 1] back to an HWC uint8 array."""
    arr = img_t.detach().squeeze(0).permute(1, 2, 0).cpu().numpy()
    return (arr * 255.).clip(0, 255).astype(np.uint8)
224
+
225
def seed_all(seed):
    """Seed Python, NumPy, and all torch (incl. CUDA) RNGs for reproducibility."""
    for seed_fn in (random.seed, np.random.seed,
                    torch.manual_seed, torch.cuda.manual_seed_all):
        seed_fn(seed)
230
+
231
+
232
def readPFM(file):
    """
    Read a PFM image.
    Args:
        file: path to a .pfm file.
    Returns:
        (np.ndarray, float): pixel data (HxWx3 for color 'PF', HxW for 'Pf'),
        rows flipped to top-down order, plus the header scale factor.
    Raises:
        Exception: on a bad magic number or malformed header.
    """
    # `with` ensures the handle is closed even on parse errors (the original
    # leaked the file descriptor).
    with open(file, 'rb') as f:
        header = f.readline().rstrip()
        if header.decode("ascii") == 'PF':
            color = True
        elif header.decode("ascii") == 'Pf':
            color = False
        else:
            raise Exception('Not a PFM file.')

        dim_match = re.match(r'^(\d+)\s(\d+)\s$', f.readline().decode("ascii"))
        if dim_match:
            width, height = list(map(int, dim_match.groups()))
        else:
            raise Exception('Malformed PFM header.')

        scale = float(f.readline().decode("ascii").rstrip())
        if scale < 0:
            # A negative scale marks little-endian data.
            endian = '<'
            scale = -scale
        else:
            endian = '>'

        data = np.fromfile(f, endian + 'f')

    shape = (height, width, 3) if color else (height, width)
    data = np.reshape(data, shape)
    data = np.flipud(data)  # PFM stores rows bottom-up
    return data, scale
268
+
269
+
270
def writePFM(file, image, scale=1):
    """
    Write a float32 image in PFM format.
    Args:
        file: destination path.
        image (np.ndarray): HxWx3 (color), HxWx1 or HxW (grayscale) float32 data.
        scale (float): PFM scale; its sign is flipped to mark little-endian data.
    Raises:
        Exception: on wrong dtype or unsupported shape.
    """
    if image.dtype.name != 'float32':
        raise Exception('Image dtype must be float32.')

    image = np.flipud(image)  # PFM stores rows bottom-up

    if len(image.shape) == 3 and image.shape[2] == 3:
        color = True
    elif len(image.shape) == 2 or len(image.shape) == 3 and image.shape[2] == 1:
        color = False
    else:
        raise Exception('Image must have H x W x 3, H x W x 1 or H x W dimensions.')

    with open(file, 'wb') as f:
        # Fixes a precedence bug in the original ("'PF\n' if color else
        # 'Pf\n'.encode()") which wrote an unencoded str for color images.
        f.write(('PF\n' if color else 'Pf\n').encode())
        f.write(b'%d %d\n' % (image.shape[1], image.shape[0]))

        endian = image.dtype.byteorder
        # '=' means native byte order; treat native little-endian as '<'.
        if endian == '<' or endian == '=' and sys.byteorder == 'little':
            scale = -scale
        f.write(b'%f\n' % scale)

        image.tofile(f)
298
+
299
+
300
def readFlow(name):
    """
    Read an optical-flow file: Middlebury .flo ('PIEH' magic) or .pfm.
    Returns:
        np.ndarray: HxWx2 float32 flow field.
    Raises:
        Exception: if the .flo magic header is missing.
    """
    if name.endswith(('.pfm', '.PFM')):
        # PFM-stored flow keeps the two flow channels in the first two planes.
        return readPFM(name)[0][:, :, 0:2]

    # `with` closes the handle on all paths (the original leaked it).
    with open(name, 'rb') as f:
        header = f.read(4)
        if header.decode("utf-8") != 'PIEH':
            raise Exception('Flow file header does not contain PIEH')

        width = np.fromfile(f, np.int32, 1).squeeze()
        height = np.fromfile(f, np.int32, 1).squeeze()
        flow = np.fromfile(f, np.float32, width * height * 2).reshape((height, width, 2))

    return flow.astype(np.float32)
316
+
317
def writeFlow(name, flow):
    """Write an HxWx2 flow field in Middlebury .flo format ('PIEH' magic, width, height, float32 data)."""
    # `with` flushes and closes the handle (the original leaked it).
    with open(name, 'wb') as f:
        f.write('PIEH'.encode('utf-8'))
        np.array([flow.shape[1], flow.shape[0]], dtype=np.int32).tofile(f)
        flow.astype(np.float32).tofile(f)
323
+
324
+
325
def readFloat(name):
    """
    Read a float-file: b'float\\n' keyword, a dimension count, one size per
    line, then raw float32 data.
    Returns:
        np.ndarray: the data; >2-D data is transposed back to H x W x C order.
    Raises:
        Exception: if the 'float' keyword header is missing.
    """
    # `with` closes the handle on all paths (the original leaked it).
    with open(name, 'rb') as f:
        if (f.readline().decode("utf-8")) != 'float\n':
            raise Exception('float file %s did not contain <float> keyword' % name)

        dim = int(f.readline())

        dims = []
        count = 1
        for _ in range(dim):
            d = int(f.readline())
            dims.append(d)
            count *= d

        # Sizes are stored innermost-first; reverse for numpy's row-major order.
        dims = list(reversed(dims))
        data = np.fromfile(f, np.float32, count).reshape(dims)

    if dim > 2:
        # Stored channel-first; undo to H x W x C.
        data = np.transpose(data, (2, 1, 0))
        data = np.transpose(data, (1, 0, 2))
    return data
348
+
349
+
350
def writeFloat(name, data):
    """
    Write an array in the float-file format read by readFloat:
    b'float\\n' keyword, dimension count, sizes, then raw float32 data
    (3-D data is stored channel-first).
    Raises:
        Exception: if data has more than 3 dimensions.
    """
    dim = len(data.shape)
    if dim > 3:
        raise Exception('bad float file dimension: %d' % dim)

    # `with` closes the handle (the original leaked it).
    with open(name, 'wb') as f:
        f.write(('float\n').encode('ascii'))
        f.write(('%d\n' % dim).encode('ascii'))

        if dim == 1:
            f.write(('%d\n' % data.shape[0]).encode('ascii'))
        else:
            f.write(('%d\n' % data.shape[1]).encode('ascii'))
            f.write(('%d\n' % data.shape[0]).encode('ascii'))
            for i in range(2, dim):
                f.write(('%d\n' % data.shape[i]).encode('ascii'))

        data = data.astype(np.float32)
        if dim < 3:
            # Fixes the original, which sent 1-D data through
            # np.transpose(data, (2, 0, 1)) and crashed.
            data.tofile(f)
        else:
            np.transpose(data, (2, 0, 1)).tofile(f)
374
+
375
+
376
def check_dim_and_resize(tensor_list):
    """Ensure all frame tensors share one spatial size; mismatches are resized to the first frame's size."""
    shapes = [t.shape[2:] for t in tensor_list]
    if len(set(shapes)) > 1:
        desired_shape = shapes[0]
        print(f'Inconsistent size of input video frames. All frames will be resized to {desired_shape}')
        tensor_list = [
            torch.nn.functional.interpolate(t, size=tuple(desired_shape), mode='bilinear')
            for t in tensor_list
        ]
    return tensor_list
392
+
393
+
394
+
395
+
396
+
397
+
398
+
399
+
400
+
401
+
402
+
403
class BottleneckBlock(nn.Module):
    """Residual bottleneck (1x1 reduce -> 3x3 -> 1x1 expand) with a configurable norm layer."""

    def __init__(self, in_planes, planes, norm_fn='group', stride=1):
        super(BottleneckBlock, self).__init__()

        self.conv1 = nn.Conv2d(in_planes, planes//4, kernel_size=1, padding=0)
        self.conv2 = nn.Conv2d(planes//4, planes//4, kernel_size=3, padding=1, stride=stride)
        self.conv3 = nn.Conv2d(planes//4, planes, kernel_size=1, padding=0)
        self.relu = nn.ReLU(inplace=True)

        num_groups = planes // 8

        def make_norm(channels):
            # One normalization layer of the requested flavor ('none' -> identity).
            if norm_fn == 'group':
                return nn.GroupNorm(num_groups=num_groups, num_channels=channels)
            if norm_fn == 'batch':
                return nn.BatchNorm2d(channels)
            if norm_fn == 'instance':
                return nn.InstanceNorm2d(channels)
            return nn.Sequential()

        self.norm1 = make_norm(planes//4)
        self.norm2 = make_norm(planes//4)
        self.norm3 = make_norm(planes)

        if stride == 1:
            self.downsample = None
        else:
            # Strided 1x1 shortcut so the skip connection matches the main path.
            self.norm4 = make_norm(planes)
            self.downsample = nn.Sequential(
                nn.Conv2d(in_planes, planes, kernel_size=1, stride=stride), self.norm4)

    def forward(self, x):
        y = self.relu(self.norm1(self.conv1(x)))
        y = self.relu(self.norm2(self.conv2(y)))
        y = self.relu(self.norm3(self.conv3(y)))

        if self.downsample is not None:
            x = self.downsample(x)
        return self.relu(x + y)
460
+
461
+
462
class ResidualBlock(nn.Module):
    """Two 3x3 convs with a residual shortcut; norm layer is configurable."""

    def __init__(self, in_planes, planes, norm_fn='group', stride=1):
        super(ResidualBlock, self).__init__()

        self.conv1 = nn.Conv2d(in_planes, planes, kernel_size=3, padding=1, stride=stride)
        self.conv2 = nn.Conv2d(planes, planes, kernel_size=3, padding=1)
        self.relu = nn.ReLU(inplace=True)

        num_groups = planes // 8

        def make_norm():
            # One normalization layer of the requested flavor ('none' -> identity).
            if norm_fn == 'group':
                return nn.GroupNorm(num_groups=num_groups, num_channels=planes)
            if norm_fn == 'batch':
                return nn.BatchNorm2d(planes)
            if norm_fn == 'instance':
                return nn.InstanceNorm2d(planes)
            return nn.Sequential()

        self.norm1 = make_norm()
        self.norm2 = make_norm()

        if stride == 1:
            self.downsample = None
        else:
            # Strided 1x1 shortcut so the skip connection matches the main path.
            self.norm3 = make_norm()
            self.downsample = nn.Sequential(
                nn.Conv2d(in_planes, planes, kernel_size=1, stride=stride), self.norm3)

    def forward(self, x):
        y = self.relu(self.norm1(self.conv1(x)))
        y = self.relu(self.norm2(self.conv2(y)))

        if self.downsample is not None:
            x = self.downsample(x)
        return self.relu(x + y)
513
+
514
+
515
class SmallEncoder(nn.Module):
    """Small feature encoder: 7x7 stem conv + three bottleneck stages; output at 1/8 resolution."""

    def __init__(self, output_dim=128, norm_fn='batch', dropout=0.0):
        super(SmallEncoder, self).__init__()
        self.norm_fn = norm_fn

        if self.norm_fn == 'group':
            self.norm1 = nn.GroupNorm(num_groups=8, num_channels=32)
        elif self.norm_fn == 'batch':
            self.norm1 = nn.BatchNorm2d(32)
        elif self.norm_fn == 'instance':
            self.norm1 = nn.InstanceNorm2d(32)
        elif self.norm_fn == 'none':
            self.norm1 = nn.Sequential()

        self.conv1 = nn.Conv2d(3, 32, kernel_size=7, stride=2, padding=3)
        self.relu1 = nn.ReLU(inplace=True)

        self.in_planes = 32
        self.layer1 = self._make_layer(32, stride=1)
        self.layer2 = self._make_layer(64, stride=2)
        self.layer3 = self._make_layer(96, stride=2)

        self.dropout = nn.Dropout2d(p=dropout) if dropout > 0 else None

        self.conv2 = nn.Conv2d(96, output_dim, kernel_size=1)

        # Kaiming init for convs, unit-weight/zero-bias for norm layers.
        for m in self.modules():
            if isinstance(m, nn.Conv2d):
                nn.init.kaiming_normal_(m.weight, mode='fan_out', nonlinearity='relu')
            elif isinstance(m, (nn.BatchNorm2d, nn.InstanceNorm2d, nn.GroupNorm)):
                if m.weight is not None:
                    nn.init.constant_(m.weight, 1)
                if m.bias is not None:
                    nn.init.constant_(m.bias, 0)

    def _make_layer(self, dim, stride=1):
        """Two stacked bottleneck blocks; the first may downsample."""
        blocks = (
            BottleneckBlock(self.in_planes, dim, self.norm_fn, stride=stride),
            BottleneckBlock(dim, dim, self.norm_fn, stride=1),
        )
        self.in_planes = dim
        return nn.Sequential(*blocks)

    def forward(self, x):
        # A tuple/list input is concatenated along batch and split again at the end.
        is_list = isinstance(x, (tuple, list))
        if is_list:
            batch_dim = x[0].shape[0]
            x = torch.cat(x, dim=0)

        x = self.relu1(self.norm1(self.conv1(x)))
        x = self.conv2(self.layer3(self.layer2(self.layer1(x))))

        if self.training and self.dropout is not None:
            x = self.dropout(x)

        if is_list:
            x = torch.split(x, [batch_dim, batch_dim], dim=0)
        return x
588
+
589
class BasicEncoder(nn.Module):
    """Feature encoder: 7x7 stem conv + three residual stages; output at 1/8 resolution."""

    def __init__(self, output_dim=128, norm_fn='batch', dropout=0.0):
        super(BasicEncoder, self).__init__()
        self.norm_fn = norm_fn

        if self.norm_fn == 'group':
            self.norm1 = nn.GroupNorm(num_groups=8, num_channels=64)
        elif self.norm_fn == 'batch':
            self.norm1 = nn.BatchNorm2d(64)
        elif self.norm_fn == 'instance':
            self.norm1 = nn.InstanceNorm2d(64)
        elif self.norm_fn == 'none':
            self.norm1 = nn.Sequential()

        self.conv1 = nn.Conv2d(3, 64, kernel_size=7, stride=2, padding=3)
        self.relu1 = nn.ReLU(inplace=True)

        self.in_planes = 64
        self.layer1 = self._make_layer(64, stride=1)
        self.layer2 = self._make_layer(72, stride=2)
        self.layer3 = self._make_layer(128, stride=2)

        # output convolution
        self.conv2 = nn.Conv2d(128, output_dim, kernel_size=1)

        self.dropout = nn.Dropout2d(p=dropout) if dropout > 0 else None

        # Kaiming init for convs, unit-weight/zero-bias for norm layers.
        for m in self.modules():
            if isinstance(m, nn.Conv2d):
                nn.init.kaiming_normal_(m.weight, mode='fan_out', nonlinearity='relu')
            elif isinstance(m, (nn.BatchNorm2d, nn.InstanceNorm2d, nn.GroupNorm)):
                if m.weight is not None:
                    nn.init.constant_(m.weight, 1)
                if m.bias is not None:
                    nn.init.constant_(m.bias, 0)

    def _make_layer(self, dim, stride=1):
        """Two stacked residual blocks; the first may downsample."""
        blocks = (
            ResidualBlock(self.in_planes, dim, self.norm_fn, stride=stride),
            ResidualBlock(dim, dim, self.norm_fn, stride=1),
        )
        self.in_planes = dim
        return nn.Sequential(*blocks)

    def forward(self, x):
        # A tuple/list input is concatenated along batch and split again at the end.
        is_list = isinstance(x, (tuple, list))
        if is_list:
            batch_dim = x[0].shape[0]
            x = torch.cat(x, dim=0)

        x = self.relu1(self.norm1(self.conv1(x)))
        x = self.conv2(self.layer3(self.layer2(self.layer1(x))))

        if self.training and self.dropout is not None:
            x = self.dropout(x)

        if is_list:
            x = torch.split(x, [batch_dim, batch_dim], dim=0)
        return x
664
+
665
class LargeEncoder(nn.Module):
    """Larger feature encoder: stem conv + three residual stages (last one repeated); output at 1/8 resolution."""

    def __init__(self, output_dim=128, norm_fn='batch', dropout=0.0):
        super(LargeEncoder, self).__init__()
        self.norm_fn = norm_fn

        if self.norm_fn == 'group':
            self.norm1 = nn.GroupNorm(num_groups=8, num_channels=64)
        elif self.norm_fn == 'batch':
            self.norm1 = nn.BatchNorm2d(64)
        elif self.norm_fn == 'instance':
            self.norm1 = nn.InstanceNorm2d(64)
        elif self.norm_fn == 'none':
            self.norm1 = nn.Sequential()

        self.conv1 = nn.Conv2d(3, 64, kernel_size=7, stride=2, padding=3)
        self.relu1 = nn.ReLU(inplace=True)

        self.in_planes = 64
        self.layer1 = self._make_layer(64, stride=1)
        self.layer2 = self._make_layer(112, stride=2)
        self.layer3 = self._make_layer(160, stride=2)
        self.layer3_2 = self._make_layer(160, stride=1)

        # output convolution
        self.conv2 = nn.Conv2d(self.in_planes, output_dim, kernel_size=1)

        self.dropout = nn.Dropout2d(p=dropout) if dropout > 0 else None

        # Kaiming init for convs, unit-weight/zero-bias for norm layers.
        for m in self.modules():
            if isinstance(m, nn.Conv2d):
                nn.init.kaiming_normal_(m.weight, mode='fan_out', nonlinearity='relu')
            elif isinstance(m, (nn.BatchNorm2d, nn.InstanceNorm2d, nn.GroupNorm)):
                if m.weight is not None:
                    nn.init.constant_(m.weight, 1)
                if m.bias is not None:
                    nn.init.constant_(m.bias, 0)

    def _make_layer(self, dim, stride=1):
        """Two stacked residual blocks; the first may downsample."""
        blocks = (
            ResidualBlock(self.in_planes, dim, self.norm_fn, stride=stride),
            ResidualBlock(dim, dim, self.norm_fn, stride=1),
        )
        self.in_planes = dim
        return nn.Sequential(*blocks)

    def forward(self, x):
        # A tuple/list input is concatenated along batch and split again at the end.
        is_list = isinstance(x, (tuple, list))
        if is_list:
            batch_dim = x[0].shape[0]
            x = torch.cat(x, dim=0)

        x = self.relu1(self.norm1(self.conv1(x)))
        x = self.conv2(self.layer3_2(self.layer3(self.layer2(self.layer1(x)))))

        if self.training and self.dropout is not None:
            x = self.dropout(x)

        if is_list:
            x = torch.split(x, [batch_dim, batch_dim], dim=0)
        return x
742
+
743
+
744
+
745
+
746
+
747
+
748
+
749
+
750
+
751
+
752
+
753
def resize(x, scale_factor):
    """Bilinearly rescale an NCHW tensor by `scale_factor` (align_corners=False)."""
    return F.interpolate(input=x, scale_factor=scale_factor,
                         mode="bilinear", align_corners=False)
755
+
756
def convrelu(in_channels, out_channels, kernel_size=3, stride=1, padding=1, dilation=1, groups=1, bias=True):
    """Conv2d followed by a channel-wise PReLU, packaged as one Sequential."""
    conv = nn.Conv2d(in_channels, out_channels, kernel_size, stride,
                     padding, dilation, groups, bias=bias)
    return nn.Sequential(conv, nn.PReLU(out_channels))
761
+
762
class ResBlock(nn.Module):
    """Residual block that routes the trailing `side_channels` channels through extra convs before merging."""

    def __init__(self, in_channels, side_channels, bias=True):
        super(ResBlock, self).__init__()
        self.side_channels = side_channels

        def full_conv():
            # 3x3 conv + PReLU over all channels.
            return nn.Sequential(
                nn.Conv2d(in_channels, in_channels, kernel_size=3, stride=1, padding=1, bias=bias),
                nn.PReLU(in_channels)
            )

        def side_conv():
            # 3x3 conv + PReLU over the side channels only.
            return nn.Sequential(
                nn.Conv2d(side_channels, side_channels, kernel_size=3, stride=1, padding=1, bias=bias),
                nn.PReLU(side_channels)
            )

        self.conv1 = full_conv()
        self.conv2 = side_conv()
        self.conv3 = full_conv()
        self.conv4 = side_conv()
        self.conv5 = nn.Conv2d(in_channels, in_channels, kernel_size=3, stride=1, padding=1, bias=bias)
        self.prelu = nn.PReLU(in_channels)

    def forward(self, x):
        out = self.conv1(x)

        # Refine only the trailing side channels, then re-merge with the rest.
        main, side = out[:, :-self.side_channels], out[:, -self.side_channels:]
        out = self.conv3(torch.cat([main, self.conv2(side)], 1))

        main, side = out[:, :-self.side_channels], out[:, -self.side_channels:]
        out = self.conv5(torch.cat([main, self.conv4(side)], 1))

        return self.prelu(x + out)
800
+
801
class Encoder(nn.Module):
    """Pyramid feature extractor: one stride-2 stage per entry in `channels`."""

    def __init__(self, channels, large=False):
        super(Encoder, self).__init__()
        self.channels = channels
        prev_ch = 3
        for idx, ch in enumerate(channels, 1):
            # The 'large' variant uses a 7x7 kernel for the first stage.
            kernel = 7 if large and idx == 1 else 3
            pad = 3 if kernel == 7 else 1
            stage = nn.Sequential(
                convrelu(prev_ch, ch, kernel, 2, pad),
                convrelu(ch, ch, 3, 1, 1)
            )
            self.register_module(f'pyramid{idx}', stage)
            prev_ch = ch

    def forward(self, in_x):
        feats = []
        for idx in range(1, len(self.channels) + 1):
            in_x = getattr(self, f'pyramid{idx}')(in_x)
            feats.append(in_x)
        return feats
823
+
824
class InitDecoder(nn.Module):
    """Coarsest-level decoder: predicts initial bidirectional flows and an intermediate feature map."""

    def __init__(self, in_ch, out_ch, skip_ch) -> None:
        super().__init__()
        self.convblock = nn.Sequential(
            convrelu(in_ch*2+1, in_ch*2),
            ResBlock(in_ch*2, skip_ch),
            nn.ConvTranspose2d(in_ch*2, out_ch+4, 4, 2, 1, bias=True)
        )

    def forward(self, f0, f1, embt):
        h, w = f0.shape[2:]
        # Broadcast the scalar time embedding over the spatial grid.
        timemap = embt.repeat(1, 1, h, w)
        out = self.convblock(torch.cat([f0, f1, timemap], 1))
        # First 4 channels: two 2-channel flows; the rest: features.
        flow0, flow1 = torch.chunk(out[:, :4, ...], 2, 1)
        return flow0, flow1, out[:, 4:, ...]
839
+
840
class IntermediateDecoder(nn.Module):
    """Mid-level decoder: refines and upsamples flows using backward-warped pyramid features."""

    def __init__(self, in_ch, out_ch, skip_ch) -> None:
        super().__init__()
        self.convblock = nn.Sequential(
            convrelu(in_ch*3+4, in_ch*3),
            ResBlock(in_ch*3, skip_ch),
            nn.ConvTranspose2d(in_ch*3, out_ch+4, 4, 2, 1, bias=True)
        )

    def forward(self, ft_, f0, f1, flow0_in, flow1_in):
        # Align both side features to the intermediate frame before fusing.
        warped0 = warp(f0, flow0_in)
        warped1 = warp(f1, flow1_in)
        out = self.convblock(torch.cat([ft_, warped0, warped1, flow0_in, flow1_in], 1))
        flow0, flow1 = torch.chunk(out[:, :4, ...], 2, 1)
        ft_ = out[:, 4:, ...]
        # Residual flow update; factor 2 because the spatial resolution doubled.
        flow0 = flow0 + 2.0 * resize(flow0_in, scale_factor=2.0)
        flow1 = flow1 + 2.0 * resize(flow1_in, scale_factor=2.0)
        return flow0, flow1, ft_
858
+
859
+
860
+
861
+
862
+
863
+
864
+
865
+
866
+
867
+
868
+
869
def multi_flow_combine(comb_block, img0, img1, flow0, flow1,
                       mask=None, img_res=None, mean=None):
    '''
    A parallel implementation of multiple flow field warping
    comb_block: An nn.Seqential object.
    img shape: [b, c, h, w]
    flow shape: [b, 2*num_flows, h, w]
    mask (opt):
        If 'mask' is None, the function conduct a simple average.
    img_res (opt):
        If 'img_res' is None, the function adds zero instead.
    mean (opt):
        If 'mean' is None, the function adds zero instead.
    '''
    b, c, h, w = flow0.shape
    num_flows = c // 2
    # Fold the per-flow dimension into the batch so all flows warp in one call.
    flow0 = flow0.reshape(b, num_flows, 2, h, w).reshape(-1, 2, h, w)
    flow1 = flow1.reshape(b, num_flows, 2, h, w).reshape(-1, 2, h, w)

    mask = mask.reshape(b, num_flows, 1, h, w
                        ).reshape(-1, 1, h, w) if mask is not None else None
    img_res = img_res.reshape(b, num_flows, 3, h, w
                              ).reshape(-1, 3, h, w) if img_res is not None else 0
    # Duplicate each input image once per flow hypothesis.
    img0 = torch.stack([img0] * num_flows, 1).reshape(-1, 3, h, w)
    img1 = torch.stack([img1] * num_flows, 1).reshape(-1, 3, h, w)
    mean = torch.stack([mean] * num_flows, 1).reshape(-1, 1, 1, 1
                                                      ) if mean is not None else 0

    img0_warp = warp(img0, flow0)
    img1_warp = warp(img1, flow1)
    # Mask-blend the two warps, then add back the mean and the image residual.
    # NOTE(review): when mask is None this line raises (None * tensor) despite
    # the docstring's "simple average" claim -- confirm callers always pass one.
    img_warps = mask * img0_warp + (1 - mask) * img1_warp + mean + img_res
    img_warps = img_warps.reshape(b, num_flows, 3, h, w)
    # Average the candidate frames and add a learned combination correction.
    imgt_pred = img_warps.mean(1) + comb_block(img_warps.view(b, -1, h, w))
    return imgt_pred
903
+
904
+
905
class MultiFlowDecoder(nn.Module):
    """Finest-level decoder: predicts `num_flows` flow pairs plus per-flow masks and image residuals."""

    def __init__(self, in_ch, skip_ch, num_flows=3):
        super(MultiFlowDecoder, self).__init__()
        self.num_flows = num_flows
        self.convblock = nn.Sequential(
            convrelu(in_ch*3+4, in_ch*3),
            ResBlock(in_ch*3, skip_ch),
            nn.ConvTranspose2d(in_ch*3, 8*num_flows, 4, 2, 1, bias=True)
        )

    def forward(self, ft_, f0, f1, flow0, flow1):
        n = self.num_flows
        warped0 = warp(f0, flow0)
        warped1 = warp(f1, flow1)
        out = self.convblock(torch.cat([ft_, warped0, warped1, flow0, flow1], 1))
        # Channel layout: 2n + 2n flow deltas, n occlusion masks, 3n image residuals.
        delta_flow0, delta_flow1, mask, img_res = torch.split(out, [2*n, 2*n, n, 3*n], 1)
        mask = torch.sigmoid(mask)

        # Upsampled incoming flow (x2 for the doubled resolution), one copy per flow.
        up0 = 2.0 * resize(flow0, scale_factor=2.0).repeat(1, self.num_flows, 1, 1)
        up1 = 2.0 * resize(flow1, scale_factor=2.0).repeat(1, self.num_flows, 1, 1)
        return delta_flow0 + up0, delta_flow1 + up1, mask, img_res
929
+
930
+
931
+
932
+
933
+
934
+
935
+
936
+
937
+
938
+
939
+
940
def resize(x, scale_factor):
    """Bilinearly rescale a 4-D tensor by ``scale_factor`` (align_corners=False)."""
    return F.interpolate(
        input=x,
        scale_factor=scale_factor,
        mode="bilinear",
        align_corners=False,
    )
942
+
943
+
944
def bilinear_sampler(img, coords, mask=False):
    """Wrapper for grid_sample that takes pixel coordinates.

    img: [N, C, H, W]; coords: [..., 2] with (x, y) in pixel units.
    If ``mask`` is truthy, also return a float mask marking coordinates that
    fall strictly inside the image.
    """
    height, width = img.shape[-2:]
    x_px, y_px = coords.split([1, 1], dim=-1)
    # Normalize pixel coordinates to [-1, 1] as expected by grid_sample
    # (align_corners=True: -1 maps to pixel 0, +1 to pixel W-1 / H-1).
    x_norm = 2 * x_px / (width - 1) - 1
    y_norm = 2 * y_px / (height - 1) - 1

    sample_grid = torch.cat([x_norm, y_norm], dim=-1)
    sampled = F.grid_sample(img, sample_grid, align_corners=True)

    if not mask:
        return sampled

    inside = (x_norm > -1) & (x_norm < 1) & (y_norm > -1) & (y_norm < 1)
    return sampled, inside.float()
959
+
960
+
961
def coords_grid(batch, ht, wd, device):
    """Build a batch of pixel-coordinate grids, shape [batch, 2, ht, wd].

    Channel 0 holds the x (column) coordinate, channel 1 the y (row) coordinate.
    """
    rows = torch.arange(ht, device=device)
    cols = torch.arange(wd, device=device)
    yy, xx = torch.meshgrid(rows, cols, indexing='ij')
    # Stack (x, y) in that order and broadcast over the batch dimension.
    grid = torch.stack((xx, yy), dim=0).float()
    return grid.unsqueeze(0).repeat(batch, 1, 1, 1)
967
+
968
+
969
class SmallUpdateBlock(nn.Module):
    """Correlation-lookup residual update block (small variant, used by AMT-S).

    Given the bidirectional correlation lookup, the current flow pair and the
    context features, predicts residual corrections for both the context
    features and the flows.  When ``scale_factor`` is set, the update runs at
    a reduced resolution and the residuals are resized back up.
    """

    def __init__(self, cdim, hidden_dim, flow_dim, corr_dim, fc_dim,
                 corr_levels=4, radius=3, scale_factor=None):
        super(SmallUpdateBlock, self).__init__()
        # Channels of one direction's correlation lookup across all pyramid levels.
        cor_planes = corr_levels * (2 * radius + 1) **2
        self.scale_factor = scale_factor

        # `2 *` because the correlation input stacks both directions (0->1, 1->0).
        self.convc1 = nn.Conv2d(2 * cor_planes, corr_dim, 1, padding=0)
        self.convf1 = nn.Conv2d(4, flow_dim*2, 7, padding=3)  # 4 = two 2-channel flows
        self.convf2 = nn.Conv2d(flow_dim*2, flow_dim, 3, padding=1)
        self.conv = nn.Conv2d(corr_dim+flow_dim, fc_dim, 3, padding=1)

        # Convolutional fusion of motion features, raw flow and context features.
        self.gru = nn.Sequential(
            nn.Conv2d(fc_dim+4+cdim, hidden_dim, 3, padding=1),
            nn.LeakyReLU(negative_slope=0.1, inplace=True),
            nn.Conv2d(hidden_dim, hidden_dim, 3, padding=1),
        )

        # Head predicting the residual context features.
        self.feat_head = nn.Sequential(
            nn.Conv2d(hidden_dim, hidden_dim, 3, padding=1),
            nn.LeakyReLU(negative_slope=0.1, inplace=True),
            nn.Conv2d(hidden_dim, cdim, 3, padding=1),
        )

        # Head predicting the residual flows (2 channels per direction).
        self.flow_head = nn.Sequential(
            nn.Conv2d(hidden_dim, hidden_dim, 3, padding=1),
            nn.LeakyReLU(negative_slope=0.1, inplace=True),
            nn.Conv2d(hidden_dim, 4, 3, padding=1),
        )

        self.lrelu = nn.LeakyReLU(negative_slope=0.1, inplace=True)

    def forward(self, net, flow, corr):
        """Return ``(delta_net, delta_flow)`` residuals.

        net: context features [b, cdim, h, w]; flow: concatenated flow pair
        [b, 4, h, w]; corr: stacked bidirectional correlation lookup.
        """
        # Optionally downsample the context stream to the lookup resolution.
        net = resize(net, 1 / self.scale_factor
                     ) if self.scale_factor is not None else net
        cor = self.lrelu(self.convc1(corr))
        flo = self.lrelu(self.convf1(flow))
        flo = self.lrelu(self.convf2(flo))
        cor_flo = torch.cat([cor, flo], dim=1)
        inp = self.lrelu(self.conv(cor_flo))
        inp = torch.cat([inp, flow, net], dim=1)

        out = self.gru(inp)
        delta_net = self.feat_head(out)
        delta_flow = self.flow_head(out)

        if self.scale_factor is not None:
            # Upsample residuals back; flow magnitudes scale with resolution.
            delta_net = resize(delta_net, scale_factor=self.scale_factor)
            delta_flow = self.scale_factor * resize(delta_flow, scale_factor=self.scale_factor)

        return delta_net, delta_flow
1020
+
1021
+
1022
class BasicUpdateBlock(nn.Module):
    """Correlation-lookup residual update block (large variant, AMT-L / AMT-G).

    Same role as ``SmallUpdateBlock`` but with a deeper correlation branch
    (two convs) and a configurable number of output flow pairs (``out_num``).
    """

    def __init__(self, cdim, hidden_dim, flow_dim, corr_dim, corr_dim2,
                 fc_dim, corr_levels=4, radius=3, scale_factor=None, out_num=1):
        super(BasicUpdateBlock, self).__init__()
        # Channels of one direction's correlation lookup across all pyramid levels.
        cor_planes = corr_levels * (2 * radius + 1) **2

        self.scale_factor = scale_factor
        # `2 *` because the correlation input stacks both directions (0->1, 1->0).
        self.convc1 = nn.Conv2d(2 * cor_planes, corr_dim, 1, padding=0)
        self.convc2 = nn.Conv2d(corr_dim, corr_dim2, 3, padding=1)
        self.convf1 = nn.Conv2d(4, flow_dim*2, 7, padding=3)  # 4 = two 2-channel flows
        self.convf2 = nn.Conv2d(flow_dim*2, flow_dim, 3, padding=1)
        self.conv = nn.Conv2d(flow_dim+corr_dim2, fc_dim, 3, padding=1)

        # Convolutional fusion of motion features, raw flow and context features.
        self.gru = nn.Sequential(
            nn.Conv2d(fc_dim+4+cdim, hidden_dim, 3, padding=1),
            nn.LeakyReLU(negative_slope=0.1, inplace=True),
            nn.Conv2d(hidden_dim, hidden_dim, 3, padding=1),
        )

        # Head predicting the residual context features.
        self.feat_head = nn.Sequential(
            nn.Conv2d(hidden_dim, hidden_dim, 3, padding=1),
            nn.LeakyReLU(negative_slope=0.1, inplace=True),
            nn.Conv2d(hidden_dim, cdim, 3, padding=1),
        )

        # Head predicting residual flows: 4 channels (two flows) per output pair.
        self.flow_head = nn.Sequential(
            nn.Conv2d(hidden_dim, hidden_dim, 3, padding=1),
            nn.LeakyReLU(negative_slope=0.1, inplace=True),
            nn.Conv2d(hidden_dim, 4*out_num, 3, padding=1),
        )

        self.lrelu = nn.LeakyReLU(negative_slope=0.1, inplace=True)

    def forward(self, net, flow, corr):
        """Return ``(delta_net, delta_flow)`` residuals for features and flows."""
        # Optionally downsample the context stream to the lookup resolution.
        net = resize(net, 1 / self.scale_factor
                     ) if self.scale_factor is not None else net
        cor = self.lrelu(self.convc1(corr))
        cor = self.lrelu(self.convc2(cor))
        flo = self.lrelu(self.convf1(flow))
        flo = self.lrelu(self.convf2(flo))
        cor_flo = torch.cat([cor, flo], dim=1)
        inp = self.lrelu(self.conv(cor_flo))
        inp = torch.cat([inp, flow, net], dim=1)

        out = self.gru(inp)
        delta_net = self.feat_head(out)
        delta_flow = self.flow_head(out)

        if self.scale_factor is not None:
            # Upsample residuals back; flow magnitudes scale with resolution.
            delta_net = resize(delta_net, scale_factor=self.scale_factor)
            delta_flow = self.scale_factor * resize(delta_flow, scale_factor=self.scale_factor)
        return delta_net, delta_flow
1074
+
1075
+
1076
class BidirCorrBlock:
    """Bidirectional all-pairs correlation volume with pyramid lookup.

    Precomputes the 4-D correlation between two feature maps and its transpose
    (covering both the 0->1 and 1->0 directions), average-pools both into a
    pyramid, and supports windowed bilinear lookups around coordinate grids.
    """

    def __init__(self, fmap1, fmap2, num_levels=4, radius=4):
        self.num_levels = num_levels
        self.radius = radius
        self.corr_pyramid = []
        self.corr_pyramid_T = []

        corr = BidirCorrBlock.corr(fmap1, fmap2)
        batch, h1, w1, dim, h2, w2 = corr.shape
        # The transposed volume encodes the reverse-direction correlation.
        corr_T = corr.clone().permute(0, 4, 5, 3, 1, 2)

        corr = corr.reshape(batch*h1*w1, dim, h2, w2)
        corr_T = corr_T.reshape(batch*h2*w2, dim, h1, w1)

        self.corr_pyramid.append(corr)
        self.corr_pyramid_T.append(corr_T)

        # Coarser levels: 2x average pooling over the *target* spatial dims.
        for _ in range(self.num_levels-1):
            corr = F.avg_pool2d(corr, 2, stride=2)
            corr_T = F.avg_pool2d(corr_T, 2, stride=2)
            self.corr_pyramid.append(corr)
            self.corr_pyramid_T.append(corr_T)

    def __call__(self, coords0, coords1):
        """Look up (2r+1)^2 windows around the coords at every pyramid level.

        coords0/coords1: [batch, 2, h1, w1] pixel coordinates for each
        direction.  Returns two tensors of shape
        [batch, num_levels*(2r+1)^2, h1, w1].
        """
        r = self.radius
        coords0 = coords0.permute(0, 2, 3, 1)
        coords1 = coords1.permute(0, 2, 3, 1)
        assert coords0.shape == coords1.shape, f"coords0 shape: [{coords0.shape}] is not equal to [{coords1.shape}]"
        batch, h1, w1, _ = coords0.shape

        out_pyramid = []
        out_pyramid_T = []
        for i in range(self.num_levels):
            corr = self.corr_pyramid[i]
            corr_T = self.corr_pyramid_T[i]

            # (2r+1)x(2r+1) grid of sampling offsets around each centroid.
            dx = torch.linspace(-r, r, 2*r+1, device=coords0.device)
            dy = torch.linspace(-r, r, 2*r+1, device=coords0.device)
            delta = torch.stack(torch.meshgrid(dy, dx, indexing='ij'), axis=-1)
            delta_lvl = delta.view(1, 2*r+1, 2*r+1, 2)

            # Coordinates shrink by a factor of 2 per pyramid level.
            centroid_lvl_0 = coords0.reshape(batch*h1*w1, 1, 1, 2) / 2**i
            centroid_lvl_1 = coords1.reshape(batch*h1*w1, 1, 1, 2) / 2**i
            coords_lvl_0 = centroid_lvl_0 + delta_lvl
            coords_lvl_1 = centroid_lvl_1 + delta_lvl

            corr = bilinear_sampler(corr, coords_lvl_0)
            corr_T = bilinear_sampler(corr_T, coords_lvl_1)
            corr = corr.view(batch, h1, w1, -1)
            corr_T = corr_T.view(batch, h1, w1, -1)
            out_pyramid.append(corr)
            out_pyramid_T.append(corr_T)

        out = torch.cat(out_pyramid, dim=-1)
        out_T = torch.cat(out_pyramid_T, dim=-1)
        return out.permute(0, 3, 1, 2).contiguous().float(), out_T.permute(0, 3, 1, 2).contiguous().float()

    @staticmethod
    def corr(fmap1, fmap2):
        """All-pairs dot-product correlation, scaled by 1/sqrt(dim).

        Returns shape [batch, ht, wd, 1, ht, wd].
        """
        batch, dim, ht, wd = fmap1.shape
        fmap1 = fmap1.view(batch, dim, ht*wd)
        fmap2 = fmap2.view(batch, dim, ht*wd)

        corr = torch.matmul(fmap1.transpose(1,2), fmap2)
        corr = corr.view(batch, ht, wd, 1, ht, wd)
        return corr / torch.sqrt(torch.tensor(dim).float())
1142
+
1143
+
1144
+
1145
+
1146
+
1147
+
1148
+
1149
+
1150
+
1151
+
1152
+
1153
class AMT_S(nn.Module):
    """AMT-S frame-interpolation network (small variant).

    Estimates bilateral flows coarse-to-fine with correlation-lookup residual
    updates (RAFT-style), then fuses ``num_flows`` warped candidates into the
    interpolated frame via ``multi_flow_combine``.
    """

    def __init__(self,
                 corr_radius=3,
                 corr_lvls=4,
                 num_flows=3,
                 channels=[20, 32, 44, 56],
                 skip_channels=20):
        super(AMT_S, self).__init__()
        self.radius = corr_radius
        self.corr_levels = corr_lvls
        self.num_flows = num_flows
        self.channels = channels
        self.skip_channels = skip_channels

        # Feature encoder for the correlation volume; context encoder pyramid.
        self.feat_encoder = SmallEncoder(output_dim=84, norm_fn='instance', dropout=0.)
        self.encoder = Encoder(channels)

        # Decoders run coarse (4) to fine (1); decoder1 emits multi-flow outputs.
        self.decoder4 = InitDecoder(channels[3], channels[2], skip_channels)
        self.decoder3 = IntermediateDecoder(channels[2], channels[1], skip_channels)
        self.decoder2 = IntermediateDecoder(channels[1], channels[0], skip_channels)
        self.decoder1 = MultiFlowDecoder(channels[0], skip_channels, num_flows)

        # Residual update blocks; the scale factor matches each level's resolution.
        self.update4 = self._get_updateblock(44)
        self.update3 = self._get_updateblock(32, 2)
        self.update2 = self._get_updateblock(20, 4)

        # Learns the residual on top of the mean of the warped candidates.
        self.comb_block = nn.Sequential(
            nn.Conv2d(3*num_flows, 6*num_flows, 3, 1, 1),
            nn.PReLU(6*num_flows),
            nn.Conv2d(6*num_flows, 3, 3, 1, 1),
        )

    def _get_updateblock(self, cdim, scale_factor=None):
        # One-line helper so all levels share identical update hyperparameters.
        return SmallUpdateBlock(cdim=cdim, hidden_dim=76, flow_dim=20, corr_dim=64,
                                fc_dim=68, scale_factor=scale_factor,
                                corr_levels=self.corr_levels, radius=self.radius)

    def _corr_scale_lookup(self, corr_fn, coord, flow0, flow1, embt, downsample=1):
        # convert t -> 0 to 0 -> 1 | convert t -> 1 to 1 -> 0
        # based on linear assumption
        t1_scale = 1. / embt
        t0_scale = 1. / (1. - embt)
        if downsample != 1:
            # Bring the flows down to the correlation-volume resolution.
            inv = 1 / downsample
            flow0 = inv * resize(flow0, scale_factor=inv)
            flow1 = inv * resize(flow1, scale_factor=inv)

        corr0, corr1 = corr_fn(coord + flow1 * t1_scale, coord + flow0 * t0_scale)
        corr = torch.cat([corr0, corr1], dim=1)
        flow = torch.cat([flow0, flow1], dim=1)
        return corr, flow

    def forward(self, img0, img1, embt, scale_factor=1.0, eval=False, **kwargs):
        """Interpolate between img0 and img1 at timestep ``embt``.

        embt: interpolation timestep tensor with values in (0, 1)
        (presumably broadcastable to the flow maps — confirm against callers).
        Returns a dict with 'imgt_pred' (and flow/feature predictions when
        ``eval`` is False).
        """
        # Normalize by the joint mean of both frames; added back at the end.
        mean_ = torch.cat([img0, img1], 2).mean(1, keepdim=True).mean(2, keepdim=True).mean(3, keepdim=True)
        img0 = img0 - mean_
        img1 = img1 - mean_
        img0_ = resize(img0, scale_factor) if scale_factor != 1.0 else img0
        img1_ = resize(img1, scale_factor) if scale_factor != 1.0 else img1
        b, _, h, w = img0_.shape
        coord = coords_grid(b, h // 8, w // 8, img0.device)

        fmap0, fmap1 = self.feat_encoder([img0_, img1_])  # [1, 128, H//8, W//8]
        corr_fn = BidirCorrBlock(fmap0, fmap1, radius=self.radius, num_levels=self.corr_levels)

        # f0_1: [1, c0, H//2, W//2] | f0_2: [1, c1, H//4, W//4]
        # f0_3: [1, c2, H//8, W//8] | f0_4: [1, c3, H//16, W//16]
        f0_1, f0_2, f0_3, f0_4 = self.encoder(img0_)
        f1_1, f1_2, f1_3, f1_4 = self.encoder(img1_)

        ######################################### the 4th decoder #########################################
        up_flow0_4, up_flow1_4, ft_3_ = self.decoder4(f0_4, f1_4, embt)
        corr_4, flow_4 = self._corr_scale_lookup(corr_fn, coord,
                                                 up_flow0_4, up_flow1_4,
                                                 embt, downsample=1)

        # residue update with lookup corr
        delta_ft_3_, delta_flow_4 = self.update4(ft_3_, flow_4, corr_4)
        delta_flow0_4, delta_flow1_4 = torch.chunk(delta_flow_4, 2, 1)
        up_flow0_4 = up_flow0_4 + delta_flow0_4
        up_flow1_4 = up_flow1_4 + delta_flow1_4
        ft_3_ = ft_3_ + delta_ft_3_

        ######################################### the 3rd decoder #########################################
        up_flow0_3, up_flow1_3, ft_2_ = self.decoder3(ft_3_, f0_3, f1_3, up_flow0_4, up_flow1_4)
        corr_3, flow_3 = self._corr_scale_lookup(corr_fn,
                                                 coord, up_flow0_3, up_flow1_3,
                                                 embt, downsample=2)

        # residue update with lookup corr
        delta_ft_2_, delta_flow_3 = self.update3(ft_2_, flow_3, corr_3)
        delta_flow0_3, delta_flow1_3 = torch.chunk(delta_flow_3, 2, 1)
        up_flow0_3 = up_flow0_3 + delta_flow0_3
        up_flow1_3 = up_flow1_3 + delta_flow1_3
        ft_2_ = ft_2_ + delta_ft_2_

        ######################################### the 2nd decoder #########################################
        up_flow0_2, up_flow1_2, ft_1_ = self.decoder2(ft_2_, f0_2, f1_2, up_flow0_3, up_flow1_3)
        corr_2, flow_2 = self._corr_scale_lookup(corr_fn,
                                                 coord, up_flow0_2, up_flow1_2,
                                                 embt, downsample=4)

        # residue update with lookup corr
        delta_ft_1_, delta_flow_2 = self.update2(ft_1_, flow_2, corr_2)
        delta_flow0_2, delta_flow1_2 = torch.chunk(delta_flow_2, 2, 1)
        up_flow0_2 = up_flow0_2 + delta_flow0_2
        up_flow1_2 = up_flow1_2 + delta_flow1_2
        ft_1_ = ft_1_ + delta_ft_1_

        ######################################### the 1st decoder #########################################
        up_flow0_1, up_flow1_1, mask, img_res = self.decoder1(ft_1_, f0_1, f1_1, up_flow0_2, up_flow1_2)

        if scale_factor != 1.0:
            # Undo the processing-resolution rescale; flow magnitudes rescale too.
            up_flow0_1 = resize(up_flow0_1, scale_factor=(1.0/scale_factor)) * (1.0/scale_factor)
            up_flow1_1 = resize(up_flow1_1, scale_factor=(1.0/scale_factor)) * (1.0/scale_factor)
            mask = resize(mask, scale_factor=(1.0/scale_factor))
            img_res = resize(img_res, scale_factor=(1.0/scale_factor))

        # Merge multiple predictions
        imgt_pred = multi_flow_combine(self.comb_block, img0, img1, up_flow0_1, up_flow1_1,
                                       mask, img_res, mean_)
        imgt_pred = torch.clamp(imgt_pred, 0, 1)

        if eval:
            return { 'imgt_pred': imgt_pred, }
        else:
            up_flow0_1 = up_flow0_1.reshape(b, self.num_flows, 2, h, w)
            up_flow1_1 = up_flow1_1.reshape(b, self.num_flows, 2, h, w)
            return {
                'imgt_pred': imgt_pred,
                'flow0_pred': [up_flow0_1, up_flow0_2, up_flow0_3, up_flow0_4],
                'flow1_pred': [up_flow1_1, up_flow1_2, up_flow1_3, up_flow1_4],
                'ft_pred': [ft_1_, ft_2_, ft_3_],
            }
1286
+
1287
+
1288
+
1289
+
1290
+
1291
+
1292
+
1293
+
1294
+
1295
+
1296
+
1297
class AMT_L(nn.Module):
    """AMT-L frame-interpolation network (large variant).

    Same coarse-to-fine pipeline as ``AMT_S`` but with wider channels, a
    ``BasicEncoder``/``BasicUpdateBlock`` backbone and 5 candidate flows.
    """

    def __init__(self,
                 corr_radius=3,
                 corr_lvls=4,
                 num_flows=5,
                 channels=[48, 64, 72, 128],
                 skip_channels=48
                 ):
        super(AMT_L, self).__init__()
        self.radius = corr_radius
        self.corr_levels = corr_lvls
        self.num_flows = num_flows

        # Feature encoder for the correlation volume; context encoder pyramid.
        self.feat_encoder = BasicEncoder(output_dim=128, norm_fn='instance', dropout=0.)
        self.encoder = Encoder([48, 64, 72, 128], large=True)

        # Decoders run coarse (4) to fine (1); decoder1 emits multi-flow outputs.
        self.decoder4 = InitDecoder(channels[3], channels[2], skip_channels)
        self.decoder3 = IntermediateDecoder(channels[2], channels[1], skip_channels)
        self.decoder2 = IntermediateDecoder(channels[1], channels[0], skip_channels)
        self.decoder1 = MultiFlowDecoder(channels[0], skip_channels, num_flows)

        # Residual update blocks; the scale factor matches each level's resolution.
        self.update4 = self._get_updateblock(72, None)
        self.update3 = self._get_updateblock(64, 2.0)
        self.update2 = self._get_updateblock(48, 4.0)

        # Learns the residual on top of the mean of the warped candidates.
        self.comb_block = nn.Sequential(
            nn.Conv2d(3*self.num_flows, 6*self.num_flows, 7, 1, 3),
            nn.PReLU(6*self.num_flows),
            nn.Conv2d(6*self.num_flows, 3, 7, 1, 3),
        )

    def _get_updateblock(self, cdim, scale_factor=None):
        # One-line helper so all levels share identical update hyperparameters.
        return BasicUpdateBlock(cdim=cdim, hidden_dim=128, flow_dim=48,
                                corr_dim=256, corr_dim2=160, fc_dim=124,
                                scale_factor=scale_factor, corr_levels=self.corr_levels,
                                radius=self.radius)

    def _corr_scale_lookup(self, corr_fn, coord, flow0, flow1, embt, downsample=1):
        # convert t -> 0 to 0 -> 1 | convert t -> 1 to 1 -> 0
        # based on linear assumption
        t1_scale = 1. / embt
        t0_scale = 1. / (1. - embt)
        if downsample != 1:
            # Bring the flows down to the correlation-volume resolution.
            inv = 1 / downsample
            flow0 = inv * resize(flow0, scale_factor=inv)
            flow1 = inv * resize(flow1, scale_factor=inv)

        corr0, corr1 = corr_fn(coord + flow1 * t1_scale, coord + flow0 * t0_scale)
        corr = torch.cat([corr0, corr1], dim=1)
        flow = torch.cat([flow0, flow1], dim=1)
        return corr, flow

    def forward(self, img0, img1, embt, scale_factor=1.0, eval=False, **kwargs):
        """Interpolate between img0 and img1 at timestep ``embt`` (values in (0, 1)).

        Returns a dict with 'imgt_pred' (and flow/feature predictions when
        ``eval`` is False).
        """
        # Normalize by the joint mean of both frames; added back at the end.
        mean_ = torch.cat([img0, img1], 2).mean(1, keepdim=True).mean(2, keepdim=True).mean(3, keepdim=True)
        img0 = img0 - mean_
        img1 = img1 - mean_
        img0_ = resize(img0, scale_factor) if scale_factor != 1.0 else img0
        img1_ = resize(img1, scale_factor) if scale_factor != 1.0 else img1
        b, _, h, w = img0_.shape
        coord = coords_grid(b, h // 8, w // 8, img0.device)

        fmap0, fmap1 = self.feat_encoder([img0_, img1_])  # [1, 128, H//8, W//8]
        corr_fn = BidirCorrBlock(fmap0, fmap1, radius=self.radius, num_levels=self.corr_levels)

        # f0_1: [1, c0, H//2, W//2] | f0_2: [1, c1, H//4, W//4]
        # f0_3: [1, c2, H//8, W//8] | f0_4: [1, c3, H//16, W//16]
        f0_1, f0_2, f0_3, f0_4 = self.encoder(img0_)
        f1_1, f1_2, f1_3, f1_4 = self.encoder(img1_)

        ######################################### the 4th decoder #########################################
        up_flow0_4, up_flow1_4, ft_3_ = self.decoder4(f0_4, f1_4, embt)
        corr_4, flow_4 = self._corr_scale_lookup(corr_fn, coord,
                                                 up_flow0_4, up_flow1_4,
                                                 embt, downsample=1)

        # residue update with lookup corr
        delta_ft_3_, delta_flow_4 = self.update4(ft_3_, flow_4, corr_4)
        delta_flow0_4, delta_flow1_4 = torch.chunk(delta_flow_4, 2, 1)
        up_flow0_4 = up_flow0_4 + delta_flow0_4
        up_flow1_4 = up_flow1_4 + delta_flow1_4
        ft_3_ = ft_3_ + delta_ft_3_

        ######################################### the 3rd decoder #########################################
        up_flow0_3, up_flow1_3, ft_2_ = self.decoder3(ft_3_, f0_3, f1_3, up_flow0_4, up_flow1_4)
        corr_3, flow_3 = self._corr_scale_lookup(corr_fn,
                                                 coord, up_flow0_3, up_flow1_3,
                                                 embt, downsample=2)

        # residue update with lookup corr
        delta_ft_2_, delta_flow_3 = self.update3(ft_2_, flow_3, corr_3)
        delta_flow0_3, delta_flow1_3 = torch.chunk(delta_flow_3, 2, 1)
        up_flow0_3 = up_flow0_3 + delta_flow0_3
        up_flow1_3 = up_flow1_3 + delta_flow1_3
        ft_2_ = ft_2_ + delta_ft_2_

        ######################################### the 2nd decoder #########################################
        up_flow0_2, up_flow1_2, ft_1_ = self.decoder2(ft_2_, f0_2, f1_2, up_flow0_3, up_flow1_3)
        corr_2, flow_2 = self._corr_scale_lookup(corr_fn,
                                                 coord, up_flow0_2, up_flow1_2,
                                                 embt, downsample=4)

        # residue update with lookup corr
        delta_ft_1_, delta_flow_2 = self.update2(ft_1_, flow_2, corr_2)
        delta_flow0_2, delta_flow1_2 = torch.chunk(delta_flow_2, 2, 1)
        up_flow0_2 = up_flow0_2 + delta_flow0_2
        up_flow1_2 = up_flow1_2 + delta_flow1_2
        ft_1_ = ft_1_ + delta_ft_1_

        ######################################### the 1st decoder #########################################
        up_flow0_1, up_flow1_1, mask, img_res = self.decoder1(ft_1_, f0_1, f1_1, up_flow0_2, up_flow1_2)

        if scale_factor != 1.0:
            # Undo the processing-resolution rescale; flow magnitudes rescale too.
            up_flow0_1 = resize(up_flow0_1, scale_factor=(1.0/scale_factor)) * (1.0/scale_factor)
            up_flow1_1 = resize(up_flow1_1, scale_factor=(1.0/scale_factor)) * (1.0/scale_factor)
            mask = resize(mask, scale_factor=(1.0/scale_factor))
            img_res = resize(img_res, scale_factor=(1.0/scale_factor))

        # Merge multiple predictions
        imgt_pred = multi_flow_combine(self.comb_block, img0, img1, up_flow0_1, up_flow1_1,
                                       mask, img_res, mean_)
        imgt_pred = torch.clamp(imgt_pred, 0, 1)

        if eval:
            return { 'imgt_pred': imgt_pred, }
        else:
            up_flow0_1 = up_flow0_1.reshape(b, self.num_flows, 2, h, w)
            up_flow1_1 = up_flow1_1.reshape(b, self.num_flows, 2, h, w)
            return {
                'imgt_pred': imgt_pred,
                'flow0_pred': [up_flow0_1, up_flow0_2, up_flow0_3, up_flow0_4],
                'flow1_pred': [up_flow1_1, up_flow1_2, up_flow1_3, up_flow1_4],
                'ft_pred': [ft_1_, ft_2_, ft_3_],
            }
1430
+
1431
+
1432
+
1433
+
1434
+
1435
+
1436
+
1437
+
1438
+
1439
+
1440
+
1441
class AMT_G(nn.Module):
    """AMT-G frame-interpolation network (giant variant).

    Like ``AMT_L`` but with even wider channels and an extra full-resolution
    ("high") residual update pass at the 3rd and 2nd levels in addition to the
    downsampled ("low") pass.
    """

    def __init__(self,
                 corr_radius=3,
                 corr_lvls=4,
                 num_flows=5,
                 channels=[84, 96, 112, 128],
                 skip_channels=84):
        super(AMT_G, self).__init__()
        self.radius = corr_radius
        self.corr_levels = corr_lvls
        self.num_flows = num_flows

        # Feature encoder for the correlation volume; context encoder pyramid.
        self.feat_encoder = LargeEncoder(output_dim=128, norm_fn='instance', dropout=0.)
        self.encoder = Encoder(channels, large=True)
        self.decoder4 = InitDecoder(channels[3], channels[2], skip_channels)
        self.decoder3 = IntermediateDecoder(channels[2], channels[1], skip_channels)
        self.decoder2 = IntermediateDecoder(channels[1], channels[0], skip_channels)
        self.decoder1 = MultiFlowDecoder(channels[0], skip_channels, num_flows)

        # "low" blocks run at the downsampled lookup resolution, "high" blocks
        # run a second refinement at the level's native resolution.
        self.update4 = self._get_updateblock(112, None)
        self.update3_low = self._get_updateblock(96, 2.0)
        self.update2_low = self._get_updateblock(84, 4.0)

        self.update3_high = self._get_updateblock(96, None)
        self.update2_high = self._get_updateblock(84, None)

        # Learns the residual on top of the mean of the warped candidates.
        self.comb_block = nn.Sequential(
            nn.Conv2d(3*self.num_flows, 6*self.num_flows, 7, 1, 3),
            nn.PReLU(6*self.num_flows),
            nn.Conv2d(6*self.num_flows, 3, 7, 1, 3),
        )

    def _get_updateblock(self, cdim, scale_factor=None):
        # One-line helper so all levels share identical update hyperparameters.
        return BasicUpdateBlock(cdim=cdim, hidden_dim=192, flow_dim=64,
                                corr_dim=256, corr_dim2=192, fc_dim=188,
                                scale_factor=scale_factor, corr_levels=self.corr_levels,
                                radius=self.radius)

    def _corr_scale_lookup(self, corr_fn, coord, flow0, flow1, embt, downsample=1):
        # convert t -> 0 to 0 -> 1 | convert t -> 1 to 1 -> 0
        # based on linear assumption
        t1_scale = 1. / embt
        t0_scale = 1. / (1. - embt)
        if downsample != 1:
            # Bring the flows down to the correlation-volume resolution.
            inv = 1 / downsample
            flow0 = inv * resize(flow0, scale_factor=inv)
            flow1 = inv * resize(flow1, scale_factor=inv)

        corr0, corr1 = corr_fn(coord + flow1 * t1_scale, coord + flow0 * t0_scale)
        corr = torch.cat([corr0, corr1], dim=1)
        flow = torch.cat([flow0, flow1], dim=1)
        return corr, flow

    def forward(self, img0, img1, embt, scale_factor=1.0, eval=False, **kwargs):
        """Interpolate between img0 and img1 at timestep ``embt`` (values in (0, 1)).

        Returns a dict with 'imgt_pred' (and flow/feature predictions when
        ``eval`` is False).
        """
        # Normalize by the joint mean of both frames; added back at the end.
        mean_ = torch.cat([img0, img1], 2).mean(1, keepdim=True).mean(2, keepdim=True).mean(3, keepdim=True)
        img0 = img0 - mean_
        img1 = img1 - mean_
        img0_ = resize(img0, scale_factor) if scale_factor != 1.0 else img0
        img1_ = resize(img1, scale_factor) if scale_factor != 1.0 else img1
        b, _, h, w = img0_.shape
        coord = coords_grid(b, h // 8, w // 8, img0.device)

        fmap0, fmap1 = self.feat_encoder([img0_, img1_])  # [1, 128, H//8, W//8]
        corr_fn = BidirCorrBlock(fmap0, fmap1, radius=self.radius, num_levels=self.corr_levels)

        # f0_1: [1, c0, H//2, W//2] | f0_2: [1, c1, H//4, W//4]
        # f0_3: [1, c2, H//8, W//8] | f0_4: [1, c3, H//16, W//16]
        f0_1, f0_2, f0_3, f0_4 = self.encoder(img0_)
        f1_1, f1_2, f1_3, f1_4 = self.encoder(img1_)

        ######################################### the 4th decoder #########################################
        up_flow0_4, up_flow1_4, ft_3_ = self.decoder4(f0_4, f1_4, embt)
        corr_4, flow_4 = self._corr_scale_lookup(corr_fn, coord,
                                                 up_flow0_4, up_flow1_4,
                                                 embt, downsample=1)

        # residue update with lookup corr
        delta_ft_3_, delta_flow_4 = self.update4(ft_3_, flow_4, corr_4)
        delta_flow0_4, delta_flow1_4 = torch.chunk(delta_flow_4, 2, 1)
        up_flow0_4 = up_flow0_4 + delta_flow0_4
        up_flow1_4 = up_flow1_4 + delta_flow1_4
        ft_3_ = ft_3_ + delta_ft_3_

        ######################################### the 3rd decoder #########################################
        up_flow0_3, up_flow1_3, ft_2_ = self.decoder3(ft_3_, f0_3, f1_3, up_flow0_4, up_flow1_4)
        corr_3, flow_3 = self._corr_scale_lookup(corr_fn,
                                                 coord, up_flow0_3, up_flow1_3,
                                                 embt, downsample=2)

        # residue update with lookup corr
        delta_ft_2_, delta_flow_3 = self.update3_low(ft_2_, flow_3, corr_3)
        delta_flow0_3, delta_flow1_3 = torch.chunk(delta_flow_3, 2, 1)
        up_flow0_3 = up_flow0_3 + delta_flow0_3
        up_flow1_3 = up_flow1_3 + delta_flow1_3
        ft_2_ = ft_2_ + delta_ft_2_

        # residue update with lookup corr (hr)
        corr_3 = resize(corr_3, scale_factor=2.0)
        up_flow_3 = torch.cat([up_flow0_3, up_flow1_3], dim=1)
        delta_ft_2_, delta_up_flow_3 = self.update3_high(ft_2_, up_flow_3, corr_3)
        ft_2_ += delta_ft_2_
        up_flow0_3 += delta_up_flow_3[:, 0:2]
        up_flow1_3 += delta_up_flow_3[:, 2:4]

        ######################################### the 2nd decoder #########################################
        up_flow0_2, up_flow1_2, ft_1_ = self.decoder2(ft_2_, f0_2, f1_2, up_flow0_3, up_flow1_3)
        corr_2, flow_2 = self._corr_scale_lookup(corr_fn,
                                                 coord, up_flow0_2, up_flow1_2,
                                                 embt, downsample=4)

        # residue update with lookup corr
        delta_ft_1_, delta_flow_2 = self.update2_low(ft_1_, flow_2, corr_2)
        delta_flow0_2, delta_flow1_2 = torch.chunk(delta_flow_2, 2, 1)
        up_flow0_2 = up_flow0_2 + delta_flow0_2
        up_flow1_2 = up_flow1_2 + delta_flow1_2
        ft_1_ = ft_1_ + delta_ft_1_

        # residue update with lookup corr (hr)
        corr_2 = resize(corr_2, scale_factor=4.0)
        up_flow_2 = torch.cat([up_flow0_2, up_flow1_2], dim=1)
        delta_ft_1_, delta_up_flow_2 = self.update2_high(ft_1_, up_flow_2, corr_2)
        ft_1_ += delta_ft_1_
        up_flow0_2 += delta_up_flow_2[:, 0:2]
        up_flow1_2 += delta_up_flow_2[:, 2:4]

        ######################################### the 1st decoder #########################################
        up_flow0_1, up_flow1_1, mask, img_res = self.decoder1(ft_1_, f0_1, f1_1, up_flow0_2, up_flow1_2)

        if scale_factor != 1.0:
            # Undo the processing-resolution rescale; flow magnitudes rescale too.
            up_flow0_1 = resize(up_flow0_1, scale_factor=(1.0/scale_factor)) * (1.0/scale_factor)
            up_flow1_1 = resize(up_flow1_1, scale_factor=(1.0/scale_factor)) * (1.0/scale_factor)
            mask = resize(mask, scale_factor=(1.0/scale_factor))
            img_res = resize(img_res, scale_factor=(1.0/scale_factor))

        # Merge multiple predictions
        imgt_pred = multi_flow_combine(self.comb_block, img0, img1, up_flow0_1, up_flow1_1,
                                       mask, img_res, mean_)
        imgt_pred = torch.clamp(imgt_pred, 0, 1)

        if eval:
            return { 'imgt_pred': imgt_pred, }
        else:
            up_flow0_1 = up_flow0_1.reshape(b, self.num_flows, 2, h, w)
            up_flow1_1 = up_flow1_1.reshape(b, self.num_flows, 2, h, w)
            return {
                'imgt_pred': imgt_pred,
                'flow0_pred': [up_flow0_1, up_flow0_2, up_flow0_3, up_flow0_4],
                'flow1_pred': [up_flow1_1, up_flow1_2, up_flow1_3, up_flow1_4],
                'ft_pred': [ft_1_, ft_2_, ft_3_],
            }
vfi_models/cain/__init__.py ADDED
@@ -0,0 +1,64 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import torch
2
+ from torch.utils.data import DataLoader
3
+ import pathlib
4
+ from vfi_utils import load_file_from_github_release, preprocess_frames, postprocess_frames, generic_frame_loop, InterpolationStateList
5
+ import typing
6
+ from comfy.model_management import get_torch_device
7
+
8
+ MODEL_TYPE = pathlib.Path(__file__).parent.name
9
+ CKPT_NAMES = ["pretrained_cain.pth"]
10
+
11
+
12
class CAIN_VFI:
    """ComfyUI node that interpolates an image batch with the CAIN model."""

    @classmethod
    def INPUT_TYPES(s):
        return {
            "required": {
                "ckpt_name": (CKPT_NAMES, ),
                "frames": ("IMAGE", ),
                "clear_cache_after_n_frames": ("INT", {"default": 10, "min": 1, "max": 1000}),
                "multiplier": ("INT", {"default": 2, "min": 2, "max": 1000})
            },
            "optional": {
                "optional_interpolation_states": ("INTERPOLATION_STATES", )
            }
        }

    RETURN_TYPES = ("IMAGE", )
    FUNCTION = "vfi"
    CATEGORY = "ComfyUI-Frame-Interpolation/VFI"

    def vfi(
        self,
        ckpt_name: typing.AnyStr,
        frames: torch.Tensor,
        clear_cache_after_n_frames: typing.SupportsInt = 1,
        multiplier: typing.SupportsInt = 2,
        optional_interpolation_states: InterpolationStateList = None,
        **kwargs
    ):
        """Interpolate ``frames`` to ``multiplier``x frame count with CAIN.

        Downloads/loads the checkpoint named by ``ckpt_name``, runs the
        generic frame loop (clearing the cache every
        ``clear_cache_after_n_frames`` pairs) and returns a one-tuple with the
        interpolated IMAGE batch.
        """
        from .cain_arch import CAIN
        model_path = load_file_from_github_release(MODEL_TYPE, ckpt_name)
        sd = torch.load(model_path)["state_dict"]
        # Checkpoint keys carry a DataParallel-style 'module.' prefix; strip it.
        sd = {key.replace('module.', ''): value for key, value in sd.items()}


        # NOTE(review): module-level global keeps the last loaded model alive
        # between node executions — confirm this caching is intentional.
        global interpolation_model
        interpolation_model = CAIN(depth=3)
        interpolation_model.load_state_dict(sd)
        interpolation_model.eval().to(get_torch_device())
        del sd

        frames = preprocess_frames(frames)


        def return_middle_frame(frame_0, frame_1, timestep, model):
            # CAIN does some direct modifications to input frame tensors so we need to clone them.
            # CAIN has no timestep input; it always predicts the middle frame
            # (use_timestep=False below).
            return model(frame_0.detach().clone(), frame_1.detach().clone())[0]

        args = [interpolation_model]
        out = postprocess_frames(
            generic_frame_loop(type(self).__name__, frames, clear_cache_after_n_frames, multiplier, return_middle_frame, *args,
                               interpolation_states=optional_interpolation_states, use_timestep=False, dtype=torch.float32)
        )
        return (out,)
vfi_models/cain/cain_arch.py ADDED
@@ -0,0 +1,74 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import math
2
+ import numpy as np
3
+
4
+ import torch
5
+ import torch.nn as nn
6
+
7
+ from .common import *
8
+
9
+
10
class Encoder(nn.Module):
    """CAIN encoder: pixel-shuffles both frames into channels, then fuses them.

    The inverse PixelShuffle trades spatial resolution for channels
    (H,W shrink by 2**depth; channels grow by 4**depth) so the fusion module
    sees a wide, low-resolution representation.
    """

    def __init__(self, in_channels=3, depth=3):
        super(Encoder, self).__init__()

        # Shuffle pixels to expand in channel dimension
        # shuffler_list = [PixelShuffle(0.5) for i in range(depth)]
        # self.shuffler = nn.Sequential(*shuffler_list)
        self.shuffler = PixelShuffle(1 / 2**depth)

        relu = nn.LeakyReLU(0.2, True)

        # FF_RCAN or FF_Resblocks
        self.interpolate = Interpolation(5, 12, in_channels * (4**depth), act=relu)

    def forward(self, x1, x2):
        """
        Encoder: Shuffle-spread --> Feature Fusion --> Return fused features
        """
        feats1 = self.shuffler(x1)
        feats2 = self.shuffler(x2)

        feats = self.interpolate(feats1, feats2)

        return feats
34
+
35
+
36
class Decoder(nn.Module):
    """CAIN decoder: undo the encoder's channel-to-space shuffle.

    PixelShuffle(2**depth) restores the original spatial resolution from the
    fused feature map (channels shrink by 4**depth).
    """

    def __init__(self, depth=3):
        super(Decoder, self).__init__()

        # shuffler_list = [PixelShuffle(2) for i in range(depth)]
        # self.shuffler = nn.Sequential(*shuffler_list)
        self.shuffler = PixelShuffle(2**depth)

    def forward(self, feats):
        """Return the spatially restored output frame."""
        out = self.shuffler(feats)
        return out
47
+
48
+
49
class CAIN(nn.Module):
    """Channel Attention Is All You Need (CAIN) interpolation model.

    Predicts the middle frame between two inputs: subtract per-image means,
    encode/fuse, decode, then add back the averaged mean.
    Returns ``(out, feats)`` — the predicted frame and the fused features.
    """

    def __init__(self, depth=3):
        super(CAIN, self).__init__()

        self.encoder = Encoder(in_channels=3, depth=depth)
        self.decoder = Decoder(depth=depth)

    def forward(self, x1, x2):
        # Remove each frame's mean so the network works on zero-centered data.
        x1, m1 = sub_mean(x1)
        x2, m2 = sub_mean(x2)

        if not self.training:
            # Pad at inference so sizes divide evenly for the pixel shuffles.
            paddingInput, paddingOutput = InOutPaddings(x1)
            x1 = paddingInput(x1)
            x2 = paddingInput(x2)

        feats = self.encoder(x1, x2)
        out = self.decoder(feats)

        if not self.training:
            out = paddingOutput(out)

        # Add back the average of the two input means.
        mi = (m1 + m2) / 2
        out += mi

        return out, feats
vfi_models/cain/cain_encdec_arch.py ADDED
@@ -0,0 +1,95 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import math
2
+ import numpy as np
3
+
4
+ import torch
5
+ import torch.nn as nn
6
+
7
+ from .common import *
8
+ from comfy.model_management import get_torch_device
9
+
10
class Encoder(nn.Module):
    """Strided-conv feature extractor followed by channel-attention fusion."""

    def __init__(self, in_channels=3, depth=3, nf_start=32, norm=False):
        super(Encoder, self).__init__()
        self.device = get_torch_device()

        nf = nf_start
        act = nn.LeakyReLU(negative_slope=0.2, inplace=True)
        # Three stride-2 stages: full resolution -> 1/8, channels nf -> 6*nf.
        self.body = nn.Sequential(
            ConvNorm(in_channels, nf, 7, stride=1, norm=norm),
            act,
            ConvNorm(nf, nf * 2, 5, stride=2, norm=norm),
            act,
            ConvNorm(nf * 2, nf * 4, 5, stride=2, norm=norm),
            act,
            ConvNorm(nf * 4, nf * 6, 5, stride=2, norm=norm),
        )
        self.interpolate = Interpolation(5, 12, nf * 6, reduction=16, act=act)

    def forward(self, x1, x2):
        """Extract features from both frames and return their fusion."""
        return self.interpolate(self.body(x1), self.body(x2))
40
+
41
+
42
class Decoder(nn.Module):
    """Upsampling decoder: three (upconv + resblock) stages plus a 7x7 head."""

    def __init__(self, in_channels=192, out_channels=3, depth=3, norm=False, up_mode='shuffle'):
        super(Decoder, self).__init__()
        self.device = get_torch_device()

        act = nn.LeakyReLU(negative_slope=0.2, inplace=True)
        # Channel plan, e.g. 192 -> 128 -> 64 -> 32 for the default width.
        chans = [in_channels, (in_channels * 2) // 3, in_channels // 3, in_channels // 6]
        self.body = nn.Sequential(
            UpConvNorm(chans[0], chans[1], mode=up_mode, norm=norm),
            ResBlock(chans[1], chans[1], norm=norm, act=act),
            UpConvNorm(chans[1], chans[2], mode=up_mode, norm=norm),
            ResBlock(chans[2], chans[2], norm=norm, act=act),
            UpConvNorm(chans[2], chans[3], mode=up_mode, norm=norm),
            ResBlock(chans[3], chans[3], norm=norm, act=act),
            conv7x7(chans[3], out_channels),
        )

    def forward(self, feats):
        return self.body(feats)
67
+
68
+
69
class CAIN_EncDec(nn.Module):
    """CAIN variant using a conv encoder/decoder instead of pixel shuffles.

    Note: `n_resblocks` is accepted for signature compatibility but unused.
    """

    def __init__(self, depth=3, n_resblocks=3, start_filts=32, up_mode='shuffle'):
        super(CAIN_EncDec, self).__init__()
        self.depth = depth

        self.encoder = Encoder(in_channels=3, depth=depth, norm=False)
        self.decoder = Decoder(in_channels=start_filts * 6, depth=depth,
                               norm=False, up_mode=up_mode)

    def forward(self, x1, x2):
        # Mean-center both inputs; means are restored at the end.
        x1, m1 = sub_mean(x1)
        x2, m2 = sub_mean(x2)

        # Inference-time padding to multiples of 128, cropped afterwards.
        if not self.training:
            paddingInput, paddingOutput = InOutPaddings(x1)
            x1 = paddingInput(x1)
            x2 = paddingInput(x2)

        feats = self.encoder(x1, x2)
        out = self.decoder(feats)

        if not self.training:
            out = paddingOutput(out)

        out += (m1 + m2) / 2
        return out, feats
vfi_models/cain/cain_noca_arch.py ADDED
@@ -0,0 +1,73 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import math
2
+ import numpy as np
3
+
4
+ import torch
5
+ import torch.nn as nn
6
+
7
+ from .common import *
8
+ from comfy.model_management import get_torch_device
9
+
10
class Encoder(nn.Module):
    """Pixel-shuffle spread followed by plain residual fusion (no attention)."""

    def __init__(self, in_channels=3, depth=3):
        super(Encoder, self).__init__()
        self.device = get_torch_device()

        # Fold 2**depth x 2**depth patches into channels in one shot.
        self.shuffler = PixelShuffle(1 / 2**depth)
        # ResBlock-based fusion (the "no channel attention" variant).
        self.interpolate = Interpolation_res(5, 12, in_channels * (4**depth))

    def forward(self, x1, x2):
        return self.interpolate(self.shuffler(x1), self.shuffler(x2))
29
+
30
+
31
class Decoder(nn.Module):
    """Pixel-shuffle fused features back up to image resolution."""

    def __init__(self, depth=3):
        super(Decoder, self).__init__()
        self.device = get_torch_device()
        self.shuffler = PixelShuffle(2**depth)

    def forward(self, feats):
        return self.shuffler(feats)
45
+
46
+
47
class CAIN_NoCA(nn.Module):
    """CAIN ablation without channel attention in the fusion stage."""

    def __init__(self, depth=3):
        super(CAIN_NoCA, self).__init__()
        self.depth = depth

        self.encoder = Encoder(in_channels=3, depth=depth)
        self.decoder = Decoder(depth=depth)

    def forward(self, x1, x2):
        # Mean-center both frames; the averaged means are added back below.
        x1, m1 = sub_mean(x1)
        x2, m2 = sub_mean(x2)

        # Inference-time padding to multiples of 128, cropped afterwards.
        if not self.training:
            paddingInput, paddingOutput = InOutPaddings(x1)
            x1 = paddingInput(x1)
            x2 = paddingInput(x2)

        feats = self.encoder(x1, x2)
        out = self.decoder(feats)

        if not self.training:
            out = paddingOutput(out)

        out += (m1 + m2) / 2
        return out, feats
vfi_models/cain/common.py ADDED
@@ -0,0 +1,361 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import math
2
+
3
+ import torch
4
+ import torch.nn as nn
5
+ import torch.nn.functional as F
6
+
7
def sub_mean(x):
    """Subtract the per-image, per-channel spatial mean from `x`.

    Returns (centered, mean) where `mean` has shape (N, C, 1, 1).

    Bug fix: the original used in-place `x -= mean`, silently mutating the
    caller's tensor (the input frames). The returned values are unchanged;
    only the side effect on the argument is removed.
    """
    mean = x.mean(2, keepdim=True).mean(3, keepdim=True)
    return x - mean, mean
11
+
12
def InOutPaddings(x):
    """Build reflection pads that round (H, W) up to multiples of 128.

    Returns (pad_in, pad_out): `pad_in` grows the tensor symmetrically to
    the next multiple of 128 per dimension; `pad_out` uses negative padding
    to crop a padded-size output back to the original (H, W).
    """
    h, w = x.size(2), x.size(3)
    pad_w = (-w) % 128
    pad_h = (-h) % 128
    left, top = pad_w // 2, pad_h // 2
    right, bottom = pad_w - left, pad_h - top
    pad_in = nn.ReflectionPad2d(padding=[left, right, top, bottom])
    pad_out = nn.ReflectionPad2d(padding=[-left, -right, -top, -bottom])
    return pad_in, pad_out
24
+
25
+
26
class ConvNorm(nn.Module):
    """Reflection-padded same-size convolution with optional normalization.

    `norm` may be False (no norm), 'IN' (InstanceNorm2d) or 'BN' (BatchNorm2d).
    """

    def __init__(self, in_feat, out_feat, kernel_size, stride=1, norm=False):
        super(ConvNorm, self).__init__()

        self.reflection_pad = nn.ReflectionPad2d(kernel_size // 2)
        self.conv = nn.Conv2d(in_feat, out_feat, stride=stride,
                              kernel_size=kernel_size, bias=True)

        if norm == 'IN':
            self.norm = nn.InstanceNorm2d(out_feat, track_running_stats=True)
        elif norm == 'BN':
            self.norm = nn.BatchNorm2d(out_feat)
        else:
            self.norm = norm

    def forward(self, x):
        out = self.conv(self.reflection_pad(x))
        return self.norm(out) if self.norm else out
46
+
47
+
48
class UpConvNorm(nn.Module):
    """2x upsampling block.

    mode 'transpose': 4x4 stride-2 transposed convolution.
    mode 'shuffle':   3x3 conv to 4x channels + pixel shuffle.
    otherwise:        bilinear upsample + 1x1 projection.
    """

    def __init__(self, in_channels, out_channels, mode='transpose', norm=False):
        super(UpConvNorm, self).__init__()

        if mode == 'transpose':
            self.upconv = nn.ConvTranspose2d(
                in_channels, out_channels, kernel_size=4, stride=2, padding=1)
        elif mode == 'shuffle':
            self.upconv = nn.Sequential(
                ConvNorm(in_channels, 4 * out_channels, kernel_size=3, stride=1, norm=norm),
                PixelShuffle(2))
        else:
            # out_channels is always going to be the same as in_channels
            self.upconv = nn.Sequential(
                nn.Upsample(mode='bilinear', scale_factor=2, align_corners=False),
                ConvNorm(in_channels, out_channels, kernel_size=1, stride=1, norm=norm))

    def forward(self, x):
        return self.upconv(x)
67
+
68
+
69
+
70
class meanShift(nn.Module):
    """Frozen 1x1 identity conv whose bias adds/subtracts a fixed RGB mean.

    `sign` selects add (+1) or subtract (-1); `nChannel` may be 1, 3, or 6
    (the 6-channel case applies the same RGB mean to two stacked images).
    """

    def __init__(self, rgbRange, rgbMean, sign, nChannel=3):
        super(meanShift, self).__init__()
        # Per-channel offsets scaled to the pixel value range.
        offsets = [m * rgbRange * float(sign) for m in rgbMean[:3]]

        if nChannel == 1:
            self.shifter = nn.Conv2d(1, 1, kernel_size=1, stride=1, padding=0)
            self.shifter.weight.data = torch.eye(1).view(1, 1, 1, 1)
            self.shifter.bias.data = torch.Tensor(offsets[:1])
        elif nChannel == 3:
            self.shifter = nn.Conv2d(3, 3, kernel_size=1, stride=1, padding=0)
            self.shifter.weight.data = torch.eye(3).view(3, 3, 1, 1)
            self.shifter.bias.data = torch.Tensor(offsets)
        else:
            self.shifter = nn.Conv2d(6, 6, kernel_size=1, stride=1, padding=0)
            self.shifter.weight.data = torch.eye(6).view(6, 6, 1, 1)
            self.shifter.bias.data = torch.Tensor(offsets + offsets)

        # Identity weights plus fixed bias; keep out of the optimizer.
        for params in self.shifter.parameters():
            params.requires_grad = False

    def forward(self, x):
        return self.shifter(x)
103
+
104
+
105
+ """ CONV - (BN) - RELU - CONV - (BN) """
106
class ResBlock(nn.Module):
    """Residual block: conv -> act -> conv, with optional stride-2 downscale.

    `reduction` and `bias` exist only for signature compatibility with RCAB.
    """

    def __init__(self, in_feat, out_feat, kernel_size=3, reduction=False, bias=True,
                 norm=False, act=nn.ReLU(True), downscale=False):
        super(ResBlock, self).__init__()

        self.body = nn.Sequential(
            ConvNorm(in_feat, out_feat, kernel_size=kernel_size,
                     stride=2 if downscale else 1),
            act,
            ConvNorm(out_feat, out_feat, kernel_size=kernel_size, stride=1),
        )
        # 1x1 stride-2 projection so the shortcut matches the main path.
        self.downscale = (nn.Conv2d(in_feat, out_feat, kernel_size=1, stride=2)
                          if downscale else None)

    def forward(self, x):
        shortcut = x if self.downscale is None else self.downscale(x)
        return self.body(x) + shortcut
129
+
130
+
131
+ ## Channel Attention (CA) Layer
132
class CALayer(nn.Module):
    """Squeeze-and-excitation style channel attention.

    Returns both the re-weighted features and the attention map itself.
    """

    def __init__(self, channel, reduction=16):
        super(CALayer, self).__init__()
        # Squeeze each feature map to a single value: (N, C, 1, 1).
        self.avg_pool = nn.AdaptiveAvgPool2d(1)
        # Bottleneck MLP (as 1x1 convs) producing per-channel weights in (0, 1).
        self.conv_du = nn.Sequential(
            nn.Conv2d(channel, channel // reduction, 1, padding=0, bias=True),
            nn.ReLU(inplace=True),
            nn.Conv2d(channel // reduction, channel, 1, padding=0, bias=True),
            nn.Sigmoid(),
        )

    def forward(self, x):
        attention = self.conv_du(self.avg_pool(x))
        return x * attention, attention
149
+
150
+
151
+ ## Residual Channel Attention Block (RCAB)
152
## Residual Channel Attention Block (RCAB)
class RCAB(nn.Module):
    """Residual block whose second conv feeds a channel-attention layer.

    NOTE: `self.body` is an nn.Sequential ending in CALayer, which returns
    a *tuple* (features, attention map); Sequential forwards it unchanged,
    so the unpacking in `forward` depends on CALayer being the last module.
    """
    def __init__(self, in_feat, out_feat, kernel_size, reduction, bias=True,
        norm=False, act=nn.ReLU(True), downscale=False, return_ca=False):
        super(RCAB, self).__init__()

        self.body = nn.Sequential(
            ConvNorm(in_feat, out_feat, kernel_size, stride=2 if downscale else 1, norm=norm),
            act,
            ConvNorm(out_feat, out_feat, kernel_size, stride=1, norm=norm),
            CALayer(out_feat, reduction)
        )
        self.downscale = downscale
        if downscale:
            # 3x3 stride-2 projection so the shortcut matches the main path.
            self.downConv = nn.Conv2d(in_feat, out_feat, kernel_size=3, stride=2, padding=1)
        self.return_ca = return_ca

    def forward(self, x):
        res = x
        # `ca` is the channel-attention map produced by the final CALayer.
        out, ca = self.body(x)
        if self.downscale:
            res = self.downConv(res)
        out += res

        if self.return_ca:
            return out, ca
        else:
            return out
179
+
180
+
181
+ ## Residual Group (RG)
182
class ResidualGroup(nn.Module):
    """Stack of `n_resblocks` blocks plus a trailing conv, with a long skip.

    `Block` is the block factory (e.g. RCAB or ResBlock) — it must accept
    (in_feat, out_feat, kernel_size, reduction, bias=, norm=, act=).
    """

    def __init__(self, Block, n_resblocks, n_feat, kernel_size, reduction, act, norm=False):
        super(ResidualGroup, self).__init__()

        layers = [
            Block(n_feat, n_feat, kernel_size, reduction, bias=True, norm=norm, act=act)
            for _ in range(n_resblocks)
        ]
        layers.append(ConvNorm(n_feat, n_feat, kernel_size, stride=1, norm=norm))
        self.body = nn.Sequential(*layers)

    def forward(self, x):
        # Long residual connection around the whole group.
        return self.body(x) + x
195
+
196
+
197
def pixel_shuffle(input, scale_factor):
    """Pixel (un)shuffle supporting fractional factors.

    scale_factor >= 1 moves channel blocks into space (upscale, matching
    F.pixel_shuffle); a factor below 1 is the exact inverse, folding
    spatial blocks back into channels (downscale).
    """
    b, c, h, w = input.size()

    c_out = int(int(c / scale_factor) / scale_factor)
    h_out = int(h * scale_factor)
    w_out = int(w * scale_factor)

    if scale_factor >= 1:
        r = scale_factor
        x = input.contiguous().view(b, c_out, r, r, h, w)
        x = x.permute(0, 1, 4, 2, 5, 3).contiguous()
    else:
        r = int(1 / scale_factor)
        x = input.contiguous().view(b, c, h_out, r, w_out, r)
        x = x.permute(0, 1, 3, 5, 2, 4).contiguous()

    return x.view(b, c_out, h_out, w_out)
213
+
214
+
215
class PixelShuffle(nn.Module):
    """Module wrapper around `pixel_shuffle`, allowing fractional factors."""

    def __init__(self, scale_factor):
        super(PixelShuffle, self).__init__()
        self.scale_factor = scale_factor

    def forward(self, x):
        return pixel_shuffle(x, self.scale_factor)

    def extra_repr(self):
        # Shown in printed module summaries.
        return 'scale_factor={}'.format(self.scale_factor)
224
+
225
+
226
def conv(in_channels, out_channels, kernel_size,
         stride=1, bias=True, groups=1):
    """Same-padding Conv2d helper for arbitrary odd kernel sizes.

    Bug fix: the `stride` argument was previously accepted but silently
    ignored (the layer was always built with stride=1); it is now honoured.
    """
    return nn.Conv2d(
        in_channels,
        out_channels,
        kernel_size=kernel_size,
        padding=kernel_size // 2,
        stride=stride,
        bias=bias,
        groups=groups)
236
+
237
+
238
def conv1x1(in_channels, out_channels, stride=1, bias=True, groups=1):
    """1x1 convolution (channel projection)."""
    return nn.Conv2d(in_channels, out_channels, kernel_size=1,
                     stride=stride, bias=bias, groups=groups)


def conv3x3(in_channels, out_channels, stride=1,
            padding=1, bias=True, groups=1):
    """3x3 convolution, same-padded by default."""
    return nn.Conv2d(in_channels, out_channels, kernel_size=3,
                     stride=stride, padding=padding, bias=bias, groups=groups)


def conv5x5(in_channels, out_channels, stride=1,
            padding=2, bias=True, groups=1):
    """5x5 convolution, same-padded by default."""
    return nn.Conv2d(in_channels, out_channels, kernel_size=5,
                     stride=stride, padding=padding, bias=bias, groups=groups)


def conv7x7(in_channels, out_channels, stride=1,
            padding=3, bias=True, groups=1):
    """7x7 convolution, same-padded by default."""
    return nn.Conv2d(in_channels, out_channels, kernel_size=7,
                     stride=stride, padding=padding, bias=bias, groups=groups)


def upconv2x2(in_channels, out_channels, mode='shuffle'):
    """2x upsampler: transposed conv, conv3x3 + pixel shuffle, or bilinear + 1x1."""
    if mode == 'transpose':
        return nn.ConvTranspose2d(in_channels, out_channels,
                                  kernel_size=4, stride=2, padding=1)
    if mode == 'shuffle':
        return nn.Sequential(
            conv3x3(in_channels, 4 * out_channels),
            PixelShuffle(2))
    # out_channels is always going to be the same as in_channels
    return nn.Sequential(
        nn.Upsample(mode='bilinear', scale_factor=2, align_corners=False),
        conv1x1(in_channels, out_channels))
297
+
298
+
299
+
300
class Interpolation(nn.Module):
    """RCAN-style fusion head: concat two feature maps, run residual groups.

    head (3x3 conv halving channels) -> n_resgroups of RCAB groups with a
    long skip -> tail (3x3 conv).
    """

    def __init__(self, n_resgroups, n_resblocks, n_feats,
                 reduction=16, act=nn.LeakyReLU(0.2, True), norm=False):
        super(Interpolation, self).__init__()

        # Reduce the concatenated 2*n_feats input back to n_feats.
        self.headConv = conv3x3(n_feats * 2, n_feats)

        self.body = nn.Sequential(*[
            ResidualGroup(
                RCAB,
                n_resblocks=n_resblocks,
                n_feat=n_feats,
                kernel_size=3,
                reduction=reduction,
                act=act,
                norm=norm)
            for _ in range(n_resgroups)
        ])

        self.tailConv = conv3x3(n_feats, n_feats)

    def forward(self, x0, x1):
        fused = self.headConv(torch.cat([x0, x1], dim=1))
        # Long residual connection around all groups.
        return self.tailConv(self.body(fused) + fused)
332
+
333
+
334
class Interpolation_res(nn.Module):
    """Like `Interpolation` but built from plain ResBlocks (no attention)."""

    def __init__(self, n_resgroups, n_resblocks, n_feats,
                 act=nn.LeakyReLU(0.2, True), norm=False):
        super(Interpolation_res, self).__init__()

        # Reduce the concatenated 2*n_feats input back to n_feats.
        self.headConv = conv3x3(n_feats * 2, n_feats)

        self.body = nn.Sequential(*[
            ResidualGroup(ResBlock, n_resblocks=n_resblocks, n_feat=n_feats,
                          kernel_size=3, reduction=0, act=act, norm=norm)
            for _ in range(n_resgroups)
        ])

        self.tailConv = conv3x3(n_feats, n_feats)

    def forward(self, x0, x1):
        fused = self.headConv(torch.cat([x0, x1], dim=1))
        out = self.body(fused)
        # Long residual connection around all groups.
        return self.tailConv(out + fused)
vfi_models/eisai/__init__.py ADDED
@@ -0,0 +1,84 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import pathlib
2
+ from vfi_utils import load_file_from_github_release, preprocess_frames, postprocess_frames, generic_frame_loop, InterpolationStateList
3
+ import typing
4
+ import torch
5
+ import torch.nn as nn
6
+ from comfy.model_management import soft_empty_cache, get_torch_device
7
+
8
# Model-type key (this package's directory name) used to locate checkpoints.
MODEL_TYPE = pathlib.Path(__file__).parent.name
# Checkpoint file names fetched from the project's GitHub releases.
MODEL_FILE_NAMES = {
    "ssl": "eisai_ssl.pt",
    "dtm": "eisai_dtm.pt",
    "raft": "eisai_anime_interp_full.ckpt"
}
14
+
15
class EISAI(nn.Module):
    """Bundle of the three EISAI sub-networks: RAFT flow, SSL, and DTM.

    All checkpoints are downloaded on demand and the sub-models are moved
    to the comfy-managed device in eval mode.
    """

    def __init__(self, model_file_names) -> None:
        # Local import: eisai_arch pulls in heavy deps (cv2, scipy, kornia).
        from .eisai_arch import SoftsplatLite, DTM, RAFT
        super(EISAI, self).__init__()
        # Optical-flow backbone.
        self.raft = RAFT(load_file_from_github_release(MODEL_TYPE, model_file_names["raft"]))
        self.raft.to(get_torch_device()).eval()

        # Softsplat-Lite forward-warping synthesis network.
        self.ssl = SoftsplatLite()
        self.ssl.load_state_dict(torch.load(load_file_from_github_release(MODEL_TYPE, model_file_names["ssl"])))
        self.ssl.to(get_torch_device()).eval()

        # DTM refinement network applied on top of the SSL output.
        self.dtm = DTM()
        self.dtm.load_state_dict(torch.load(load_file_from_github_release(MODEL_TYPE, model_file_names["dtm"])))
        self.dtm.to(get_torch_device()).eval()

    def forward(self, img0, img1, t):
        """Synthesize the frame at time `t` (in [0, 1]) between img0 and img1."""
        with torch.no_grad():
            # Bidirectional flow between the two input frames.
            flow0, _ = self.raft(img0, img1)
            flow1, _ = self.raft(img1, img0)
            x = {
                "images": torch.stack([img0, img1], dim=1),
                "flows": torch.stack([flow0, flow1], dim=1),
            }
            out_ssl, _ = self.ssl(x, t=t, return_more=True)
            # NOTE(review): `_` from the SSL call above is forwarded to DTM —
            # presumably extra SSL context; confirm against eisai_arch.
            out_dtm, _ = self.dtm(x, out_ssl, _, return_more=False)
        # Keep only the RGB channels of the DTM output.
        return out_dtm[:, :3]
41
+
42
class EISAI_VFI:
    """ComfyUI node: video frame interpolation using the EISAI model."""

    @classmethod
    def INPUT_TYPES(s):
        # ComfyUI node-input schema.
        return {
            "required": {
                "ckpt_name": (["eisai"], ),
                "frames": ("IMAGE", ),
                "clear_cache_after_n_frames": ("INT", {"default": 10, "min": 1, "max": 1000}),
                "multiplier": ("INT", {"default": 2, "min": 2, "max": 1000}),
            },
            "optional": {
                "optional_interpolation_states": ("INTERPOLATION_STATES", )
            }
        }

    RETURN_TYPES = ("IMAGE", )
    FUNCTION = "vfi"
    CATEGORY = "ComfyUI-Frame-Interpolation/VFI"

    def vfi(
        self,
        ckpt_name: typing.AnyStr,
        frames: torch.Tensor,
        clear_cache_after_n_frames = 10,
        multiplier: typing.SupportsInt = 2,
        optional_interpolation_states: InterpolationStateList = None,
        **kwargs
    ):
        """Interpolate `frames` by `multiplier`, flushing cache periodically.

        Returns a 1-tuple with the interpolated IMAGE batch (ComfyUI style).
        """
        interpolation_model = EISAI(MODEL_FILE_NAMES)
        interpolation_model.eval().to(get_torch_device())
        frames = preprocess_frames(frames)

        # Invoked once per generated in-between frame by the generic loop.
        def return_middle_frame(frame_0, frame_1, timestep, model):
            return model(frame_0, frame_1, t=timestep)

        # EISAI runs at native resolution; no internal rescale.
        scale = 1

        args = [interpolation_model, scale]
        out = postprocess_frames(
            generic_frame_loop(type(self).__name__, frames, clear_cache_after_n_frames, multiplier, return_middle_frame, *args,
                               interpolation_states=optional_interpolation_states, dtype=torch.float32)
        )
        return (out,)
vfi_models/eisai/eisai_arch.py ADDED
@@ -0,0 +1,2586 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ https://github.com/ShuhongChen/eisai-anime-interpolator/blob/master/_scripts/interpolate.py
3
+ https://github.com/ShuhongChen/eisai-anime-interpolator/blob/master/_train/frame_interpolation/models/ssldtm.py
4
+ https://github.com/ShuhongChen/eisai-anime-interpolator/blob/master/_util/util_v0.py
5
+ https://github.com/ShuhongChen/eisai-anime-interpolator/blob/master/_util/twodee_v0.py
6
+ https://github.com/ShuhongChen/eisai-anime-interpolator/blob/master/_util/pytorch_v0.py
7
+ https://github.com/ShuhongChen/eisai-anime-interpolator/blob/master/_util/distance_transform_v0.py
8
+ https://github.com/ShuhongChen/eisai-anime-interpolator/blob/master/_util/sketchers_v1.py
9
+ https://github.com/ShuhongChen/eisai-anime-interpolator/blob/master/_train/frame_interpolation/helpers/interpolator_v0.py
10
+ https://github.com/ShuhongChen/eisai-anime-interpolator/blob/master/_train/frame_interpolation/helpers/gridnet_v1.py
11
+ https://github.com/ShuhongChen/eisai-anime-interpolator/blob/master/_util/flow_v0.py
12
+ https://github.com/ShuhongChen/eisai-anime-interpolator/blob/master/_util/softsplat_v0.py
13
+ https://github.com/ShuhongChen/eisai-anime-interpolator/blob/master/_train/frame_interpolation/helpers/raft_v1/rfr_new.py
14
+ https://github.com/ShuhongChen/eisai-anime-interpolator/blob/master/_train/frame_interpolation/helpers/raft_v1/extractor.py
15
+ https://github.com/ShuhongChen/eisai-anime-interpolator/blob/master/_train/frame_interpolation/helpers/raft_v1/update.py
16
+ https://github.com/ShuhongChen/eisai-anime-interpolator/blob/master/_train/frame_interpolation/helpers/raft_v1/corr.py
17
+ https://github.com/ShuhongChen/eisai-anime-interpolator/blob/master/_train/frame_interpolation/helpers/raft_v1/utils.py
18
+ """
19
+
20
+ import copy
21
+ import cv2
22
+ import torch.nn.functional as F
23
+ import torchvision.transforms.functional as F
24
+ import gc
25
+ from PIL import Image, ImageFile, ImageFont, ImageDraw
26
+ import inspect
27
+ from scipy import interpolate
28
+ import kornia
29
+ import math
30
+ from argparse import Namespace
31
+ import torch.nn as nn
32
+ import numpy as np
33
+ import os
34
+ from functools import partial
35
+ import pathlib
36
+ import PIL
37
+ import re
38
+ import requests
39
+ from scipy.spatial.transform import Rotation
40
+ import scipy
41
+ import shutil
42
+ import torchvision.transforms as T
43
+ import time
44
+ import torch
45
+ import torchvision as tv
46
+ import zlib
47
+ import numpy as np
48
+ import torch
49
+ import torch.nn as nn
50
+ import torch.nn.functional as F
51
+ from tqdm.auto import tqdm as std_tqdm
52
+ from tqdm.auto import trange as std_trange
53
+ from vfi_models.ops import FunctionSoftsplat, batch_edt
54
+ from comfy.model_management import get_torch_device
55
+
56
device = get_torch_device()  # comfy-managed compute device
autocast = torch.autocast  # alias kept from the upstream EISAI code
tqdm = partial(std_tqdm, dynamic_ncols=True)  # progress bars that resize with the terminal
trange = partial(std_trange, dynamic_ncols=True)
ImageFile.LOAD_TRUNCATED_IMAGES = True  # tolerate truncated images when PIL decodes
61
+
62
+
63
def pixel_ij(x, rounding=True):
    """Coerce a scalar or (i, j) pair into a tuple rounded per `rounding`.

    Numpy arrays are converted to lists first; a scalar is duplicated into
    both positions.
    """
    if isinstance(x, np.ndarray):
        x = x.tolist()
    pair = x if isinstance(x, tuple) or isinstance(x, list) else (x, x)
    return tuple(pixel_rounder(v, rounding) for v in pair)
70
+
71
+
72
def rescale_dry(x, factor):
    """Compute the (h, w) a rescale by `factor` would give, without resizing.

    NOTE(review): for non-tuple/list input this evaluates `I(x).size`, but
    `I` is not defined in this file (it is the upstream twodee image
    wrapper) — confirm that branch is never reached in this repo.
    """
    h, w = x[-2:] if isinstance(x, tuple) or isinstance(x, list) else I(x).size
    return (h * factor, w * factor)
75
+
76
+
77
def pixel_rounder(n, mode):
    """Round `n` per `mode`: True/'round', 'ceil', 'floor'; anything else is a no-op."""
    if mode == True or mode == "round":
        return round(n)
    if mode == "ceil":
        return math.ceil(n)
    if mode == "floor":
        return math.floor(n)
    return n
86
+
87
+
88
def diam(x):
    """Diagonal length (in pixels) of an image-like object or (h, w) shape.

    Accepts a tuple/list whose last two entries are (h, w), an `I` image
    wrapper (its `.size`), or anything with a trailing (h, w) in `.shape`.
    """
    if isinstance(x, tuple) or isinstance(x, list):
        h, w = x[-2:]
    elif isinstance(x, I):
        # NOTE(review): `I` is not defined in this file (upstream twodee
        # wrapper); reaching this branch raises NameError here — confirm
        # it is unused in this repo.
        h, w = x.size
    else:
        h, w = x.shape[-2:]
    return np.sqrt(h**2 + w**2)
96
+
97
+
98
def pixel_logit(x, pixel_margin=1):
    """Map pixel values in [0, 1] to logits, clamped away from 0 and 1.

    Values are squeezed into [margin/255, 1 - margin/255] first so the
    log-odds stay finite at the extremes.
    """
    squeezed = (x * (255 - 2 * pixel_margin) + pixel_margin) / 255
    return torch.log(squeezed / (1 - squeezed))
101
+
102
+
103
class InputPadder:
    """Pads images such that dimensions are divisible by 8."""

    def __init__(self, dims):
        self.ht, self.wd = dims[-2:]
        pad_ht = (((self.ht // 8) + 1) * 8 - self.ht) % 8
        pad_wd = (((self.wd // 8) + 1) * 8 - self.wd) % 8
        # Width padding is split across both sides; height padding goes on
        # the bottom only: [left, right, top, bottom].
        self._pad = [pad_wd // 2, pad_wd - pad_wd // 2, 0, pad_ht]

    def pad(self, *inputs):
        """Replicate-pad every given tensor to the divisible-by-8 size."""
        return [F.pad(t, self._pad, mode="replicate") for t in inputs]

    def unpad(self, x):
        """Crop a padded tensor back to the original (ht, wd)."""
        h, w = x.shape[-2:]
        top, bottom = self._pad[2], h - self._pad[3]
        left, right = self._pad[0], w - self._pad[1]
        return x[..., top:bottom, left:right]
119
+
120
+
121
def forward_interpolate(flow):
    """Forward-warp a (2, H, W) flow field back onto the regular grid.

    Every source pixel is displaced by its own flow vector; the scattered
    flow values are then re-gridded with cubic interpolation (zero fill
    outside the convex hull). Returns a float CPU tensor of the same shape.
    """
    flow_np = flow.detach().cpu().numpy()
    dx, dy = flow_np[0], flow_np[1]

    ht, wd = dx.shape
    grid_x, grid_y = np.meshgrid(np.arange(wd), np.arange(ht))

    # Landing positions of every pixel after applying its flow vector.
    tgt_x = (grid_x + dx).reshape(-1)
    tgt_y = (grid_y + dy).reshape(-1)
    dx = dx.reshape(-1)
    dy = dy.reshape(-1)

    # Keep only samples that land strictly inside the frame.
    inside = (tgt_x > 0) & (tgt_x < wd) & (tgt_y > 0) & (tgt_y < ht)
    tgt_x = tgt_x[inside]
    tgt_y = tgt_y[inside]
    dx = dx[inside]
    dy = dy[inside]

    flow_x = interpolate.griddata((tgt_x, tgt_y), dx, (grid_x, grid_y),
                                  method="cubic", fill_value=0)
    flow_y = interpolate.griddata((tgt_x, tgt_y), dy, (grid_x, grid_y),
                                  method="cubic", fill_value=0)

    return torch.from_numpy(np.stack([flow_x, flow_y], axis=0)).float()
148
+
149
+
150
def bilinear_sampler(img, coords, mode="bilinear", mask=False):
    """Wrapper for grid_sample, uses pixel coordinates.

    `coords` is (N, H_out, W_out, 2) in (x, y) pixel units; it is rescaled
    to the [-1, 1] range grid_sample expects. If `mask` is true, also
    return a float mask of coordinates that fell inside the image.
    (`mode` is accepted for API compatibility but unused, as upstream.)
    """
    H, W = img.shape[-2:]
    xgrid, ygrid = coords.split([1, 1], dim=-1)
    # Normalize pixel coordinates to [-1, 1] (align_corners convention).
    xgrid = 2 * xgrid / (W - 1) - 1
    ygrid = 2 * ygrid / (H - 1) - 1

    sampled = F.grid_sample(img, torch.cat([xgrid, ygrid], dim=-1), align_corners=True)

    if mask:
        inside = (xgrid > -1) & (ygrid > -1) & (xgrid < 1) & (ygrid < 1)
        return sampled, inside.float()
    return sampled
166
+
167
+
168
def coords_grid(batch, ht, wd):
    """Return a (batch, 2, ht, wd) grid of (x, y) pixel coordinates.

    Channel 0 holds x (column index), channel 1 holds y (row index).
    `indexing="ij"` is passed explicitly: it matches torch.meshgrid's
    historical default, so behavior is identical, and it silences the
    deprecation warning emitted by modern torch.
    """
    coords = torch.meshgrid(torch.arange(ht), torch.arange(wd), indexing="ij")
    coords = torch.stack(coords[::-1], dim=0).float()
    return coords[None].repeat(batch, 1, 1, 1)
172
+
173
+
174
def upflow8(flow, mode="bilinear"):
    """Upsample a flow field 8x spatially, scaling its vectors to match."""
    ht, wd = 8 * flow.shape[2], 8 * flow.shape[3]
    return 8 * F.interpolate(flow, size=(ht, wd), mode=mode, align_corners=True)
177
+
178
+
179
class CorrBlock:
    """RAFT-style all-pairs correlation volume with a pooled pyramid.

    The full 4D correlation between two feature maps is computed once in
    __init__; __call__ then samples a (2r+1)x(2r+1) window around the given
    coordinates at every pyramid level and concatenates the results.
    """

    def __init__(self, fmap1, fmap2, num_levels=4, radius=4):
        self.num_levels = num_levels
        self.radius = radius
        self.corr_pyramid = []

        # all pairs correlation
        corr = CorrBlock.corr(fmap1, fmap2)

        # Flatten source positions into the batch dim so each source pixel
        # has its own (h2, w2) correlation map.
        batch, h1, w1, dim, h2, w2 = corr.shape
        corr = corr.reshape(batch * h1 * w1, dim, h2, w2)

        # Level i of the pyramid is the raw volume average-pooled i times (2x each).
        self.corr_pyramid.append(corr)
        for i in range(self.num_levels - 1):
            corr = F.avg_pool2d(corr, 2, stride=2)
            self.corr_pyramid.append(corr)

    def __call__(self, coords):
        # coords: (batch, 2, h1, w1) pixel coordinates into fmap2's space.
        r = self.radius
        coords = coords.permute(0, 2, 3, 1)
        batch, h1, w1, _ = coords.shape

        out_pyramid = []
        for i in range(self.num_levels):
            corr = self.corr_pyramid[i]
            # (2r+1)^2 integer offsets around each lookup centroid.
            dx = torch.linspace(-r, r, 2 * r + 1)
            dy = torch.linspace(-r, r, 2 * r + 1)
            delta = torch.stack(torch.meshgrid(dy, dx), dim=-1).to(coords.device)

            # Centroids are rescaled to the current pyramid level.
            centroid_lvl = coords.reshape(batch * h1 * w1, 1, 1, 2) / 2**i
            delta_lvl = delta.view(1, 2 * r + 1, 2 * r + 1, 2)
            coords_lvl = centroid_lvl + delta_lvl

            corr = bilinear_sampler(corr, coords_lvl)
            corr = corr.view(batch, h1, w1, -1)
            out_pyramid.append(corr)

        # -> (batch, num_levels * (2r+1)^2, h1, w1)
        out = torch.cat(out_pyramid, dim=-1)
        return out.permute(0, 3, 1, 2).contiguous().float()

    @staticmethod
    def corr(fmap1, fmap2):
        # Dot-product correlation of every position in fmap1 against every
        # position in fmap2, scaled by 1/sqrt(dim).
        batch, dim, ht, wd = fmap1.shape
        fmap1 = fmap1.view(batch, dim, ht * wd)
        fmap2 = fmap2.view(batch, dim, ht * wd)

        corr = torch.matmul(fmap1.transpose(1, 2), fmap2)
        corr = corr.view(batch, ht, wd, 1, ht, wd)
        return corr / torch.sqrt(torch.tensor(dim).float())
228
+
229
+
230
class FlowHead(nn.Module):
    """Two-layer conv head mapping a hidden state to a 2-channel flow delta."""

    def __init__(self, input_dim=128, hidden_dim=256):
        super(FlowHead, self).__init__()
        self.conv1 = nn.Conv2d(input_dim, hidden_dim, 3, padding=1)
        self.conv2 = nn.Conv2d(hidden_dim, 2, 3, padding=1)
        self.relu = nn.ReLU(inplace=True)

    def forward(self, x):
        hidden = self.relu(self.conv1(x))
        return self.conv2(hidden)
239
+
240
+
241
class ConvGRU(nn.Module):
    """Convolutional GRU cell operating on 2D feature maps."""

    def __init__(self, hidden_dim=128, input_dim=192 + 128):
        super(ConvGRU, self).__init__()
        self.convz = nn.Conv2d(hidden_dim + input_dim, hidden_dim, 3, padding=1)
        self.convr = nn.Conv2d(hidden_dim + input_dim, hidden_dim, 3, padding=1)
        self.convq = nn.Conv2d(hidden_dim + input_dim, hidden_dim, 3, padding=1)

    def forward(self, h, x):
        """One GRU step: h is the hidden state, x the input features."""
        combined = torch.cat([h, x], dim=1)

        update = torch.sigmoid(self.convz(combined))  # z gate
        reset = torch.sigmoid(self.convr(combined))   # r gate
        candidate = torch.tanh(self.convq(torch.cat([reset * h, x], dim=1)))

        return (1 - update) * h + update * candidate
257
+
258
+
259
class SepConvGRU(nn.Module):
    """GRU cell with separable convolutions: one horizontal (1x5) update
    pass followed by one vertical (5x1) pass."""

    def __init__(self, hidden_dim=128, input_dim=192 + 128):
        super(SepConvGRU, self).__init__()
        ch = hidden_dim + input_dim
        # Horizontal (1x5) gate convolutions.
        self.convz1 = nn.Conv2d(ch, hidden_dim, (1, 5), padding=(0, 2))
        self.convr1 = nn.Conv2d(ch, hidden_dim, (1, 5), padding=(0, 2))
        self.convq1 = nn.Conv2d(ch, hidden_dim, (1, 5), padding=(0, 2))
        # Vertical (5x1) gate convolutions.
        self.convz2 = nn.Conv2d(ch, hidden_dim, (5, 1), padding=(2, 0))
        self.convr2 = nn.Conv2d(ch, hidden_dim, (5, 1), padding=(2, 0))
        self.convq2 = nn.Conv2d(ch, hidden_dim, (5, 1), padding=(2, 0))

    @staticmethod
    def _step(h, x, convz, convr, convq):
        # Single GRU update using the supplied gate convolutions.
        hx = torch.cat([h, x], dim=1)
        z = torch.sigmoid(convz(hx))
        r = torch.sigmoid(convr(hx))
        q = torch.tanh(convq(torch.cat([r * h, x], dim=1)))
        return (1 - z) * h + z * q

    def forward(self, h, x):
        # horizontal pass, then vertical pass
        h = self._step(h, x, self.convz1, self.convr1, self.convq1)
        h = self._step(h, x, self.convz2, self.convr2, self.convq2)
        return h
298
+
299
+
300
class SmallMotionEncoder(nn.Module):
    """Fuses correlation features and current flow into 82-channel motion features."""

    def __init__(self, args):
        super(SmallMotionEncoder, self).__init__()
        cor_planes = args.corr_levels * (2 * args.corr_radius + 1) ** 2
        self.convc1 = nn.Conv2d(cor_planes, 96, 1, padding=0)
        self.convf1 = nn.Conv2d(2, 64, 7, padding=3)
        self.convf2 = nn.Conv2d(64, 32, 3, padding=1)
        self.conv = nn.Conv2d(128, 80, 3, padding=1)

    def forward(self, flow, corr):
        corr_feat = F.relu(self.convc1(corr))
        flow_feat = F.relu(self.convf2(F.relu(self.convf1(flow))))
        fused = F.relu(self.conv(torch.cat([corr_feat, flow_feat], dim=1)))
        # Append the raw flow so later layers keep direct access to it.
        return torch.cat([fused, flow], dim=1)
316
+
317
+
318
class BasicMotionEncoder(nn.Module):
    """Fuses correlation features and current flow into 128-channel motion features."""

    def __init__(self, args):
        super(BasicMotionEncoder, self).__init__()
        cor_planes = args.corr_levels * (2 * args.corr_radius + 1) ** 2
        self.convc1 = nn.Conv2d(cor_planes, 256, 1, padding=0)
        self.convc2 = nn.Conv2d(256, 192, 3, padding=1)
        self.convf1 = nn.Conv2d(2, 128, 7, padding=3)
        self.convf2 = nn.Conv2d(128, 64, 3, padding=1)
        # 126 fused channels + 2 raw flow channels = 128 output channels.
        self.conv = nn.Conv2d(64 + 192, 128 - 2, 3, padding=1)

    def forward(self, flow, corr):
        corr_feat = F.relu(self.convc2(F.relu(self.convc1(corr))))
        flow_feat = F.relu(self.convf2(F.relu(self.convf1(flow))))

        fused = F.relu(self.conv(torch.cat([corr_feat, flow_feat], dim=1)))
        # Append the raw flow so later layers keep direct access to it.
        return torch.cat([fused, flow], dim=1)
337
+
338
+
339
class SmallUpdateBlock(nn.Module):
    """Small RAFT update cell: motion encoder + ConvGRU + flow head."""

    def __init__(self, args, hidden_dim=96):
        super(SmallUpdateBlock, self).__init__()
        self.encoder = SmallMotionEncoder(args)
        self.gru = ConvGRU(hidden_dim=hidden_dim, input_dim=82 + 64)
        self.flow_head = FlowHead(hidden_dim, hidden_dim=128)

    def forward(self, net, inp, corr, flow):
        features = self.encoder(flow, corr)
        net = self.gru(net, torch.cat([inp, features], dim=1))
        delta_flow = self.flow_head(net)
        # The small variant predicts no convex-upsampling mask, hence None.
        return net, None, delta_flow
353
+
354
+
355
class BasicUpdateBlock(nn.Module):
    """Full RAFT update cell: motion encoder + SepConvGRU + flow head,
    plus a predictor for the 8x convex-upsampling mask."""

    def __init__(self, args, hidden_dim=128, input_dim=128):
        super(BasicUpdateBlock, self).__init__()
        self.args = args
        self.encoder = BasicMotionEncoder(args)
        self.gru = SepConvGRU(hidden_dim=hidden_dim, input_dim=128 + hidden_dim)
        self.flow_head = FlowHead(hidden_dim, hidden_dim=256)

        self.mask = nn.Sequential(
            nn.Conv2d(128, 256, 3, padding=1),
            nn.ReLU(inplace=True),
            nn.Conv2d(256, 64 * 9, 1, padding=0),
        )

    def forward(self, net, inp, corr, flow, upsample=True):
        features = self.encoder(flow, corr)
        net = self.gru(net, torch.cat([inp, features], dim=1))
        delta_flow = self.flow_head(net)

        # scale mask to balance gradients
        mask = 0.25 * self.mask(net)
        return net, mask, delta_flow
379
+
380
+
381
class ResidualBlock(nn.Module):
    """Two 3x3 convs with selectable normalization and a strided 1x1 shortcut."""

    def __init__(self, in_planes, planes, norm_fn="group", stride=1):
        super(ResidualBlock, self).__init__()

        self.conv1 = nn.Conv2d(
            in_planes, planes, kernel_size=3, padding=1, stride=stride
        )
        self.conv2 = nn.Conv2d(planes, planes, kernel_size=3, padding=1)
        self.relu = nn.ReLU(inplace=True)

        num_groups = planes // 8

        def make_norm():
            # One fresh norm layer of the requested flavour.
            if norm_fn == "group":
                return nn.GroupNorm(num_groups=num_groups, num_channels=planes)
            if norm_fn == "batch":
                return nn.BatchNorm2d(planes)
            if norm_fn == "instance":
                return nn.InstanceNorm2d(planes)
            if norm_fn == "none":
                return nn.Sequential()

        self.norm1 = make_norm()
        self.norm2 = make_norm()
        # norm3 only exists when the shortcut needs a projection.
        if stride != 1:
            self.norm3 = make_norm()
            self.downsample = nn.Sequential(
                nn.Conv2d(in_planes, planes, kernel_size=1, stride=stride), self.norm3
            )
        else:
            self.downsample = None

    def forward(self, x):
        y = self.relu(self.norm1(self.conv1(x)))
        y = self.relu(self.norm2(self.conv2(y)))
        shortcut = x if self.downsample is None else self.downsample(x)
        return self.relu(shortcut + y)
434
+
435
+
436
class BottleneckBlock(nn.Module):
    """1x1 -> 3x3 -> 1x1 bottleneck with selectable normalization and a
    strided 1x1 shortcut."""

    def __init__(self, in_planes, planes, norm_fn="group", stride=1):
        super(BottleneckBlock, self).__init__()

        self.conv1 = nn.Conv2d(in_planes, planes // 4, kernel_size=1, padding=0)
        self.conv2 = nn.Conv2d(
            planes // 4, planes // 4, kernel_size=3, padding=1, stride=stride
        )
        self.conv3 = nn.Conv2d(planes // 4, planes, kernel_size=1, padding=0)
        self.relu = nn.ReLU(inplace=True)

        num_groups = planes // 8

        def make_norm(channels):
            # One fresh norm layer of the requested flavour for `channels`.
            if norm_fn == "group":
                return nn.GroupNorm(num_groups=num_groups, num_channels=channels)
            if norm_fn == "batch":
                return nn.BatchNorm2d(channels)
            if norm_fn == "instance":
                return nn.InstanceNorm2d(channels)
            if norm_fn == "none":
                return nn.Sequential()

        self.norm1 = make_norm(planes // 4)
        self.norm2 = make_norm(planes // 4)
        self.norm3 = make_norm(planes)
        # norm4 only exists when the shortcut needs a projection.
        if stride != 1:
            self.norm4 = make_norm(planes)
            self.downsample = nn.Sequential(
                nn.Conv2d(in_planes, planes, kernel_size=1, stride=stride), self.norm4
            )
        else:
            self.downsample = None

    def forward(self, x):
        y = self.relu(self.norm1(self.conv1(x)))
        y = self.relu(self.norm2(self.conv2(y)))
        y = self.relu(self.norm3(self.conv3(y)))
        shortcut = x if self.downsample is None else self.downsample(x)
        return self.relu(shortcut + y)
495
+
496
+
497
class BasicEncoder(nn.Module):
    """Feature encoder: 7x7 stem (stride 2), three residual stages
    (strides 1/2/2, i.e. 8x total downsampling) and a 1x1 projection.

    Accepts either a single tensor or a list/tuple of two tensors; in the
    latter case both are run in one batch and the output is split back.
    """

    def __init__(self, output_dim=128, norm_fn="batch", dropout=0.0):
        super(BasicEncoder, self).__init__()
        self.norm_fn = norm_fn

        # Stem normalization (64 channels after conv1).
        if self.norm_fn == "group":
            self.norm1 = nn.GroupNorm(num_groups=8, num_channels=64)
        elif self.norm_fn == "batch":
            self.norm1 = nn.BatchNorm2d(64)
        elif self.norm_fn == "instance":
            self.norm1 = nn.InstanceNorm2d(64)
        elif self.norm_fn == "none":
            self.norm1 = nn.Sequential()

        self.conv1 = nn.Conv2d(3, 64, kernel_size=7, stride=2, padding=3)
        self.relu1 = nn.ReLU(inplace=True)

        self.in_planes = 64
        self.layer1 = self._make_layer(64, stride=1)
        self.layer2 = self._make_layer(96, stride=2)
        self.layer3 = self._make_layer(128, stride=2)

        # output convolution
        self.conv2 = nn.Conv2d(128, output_dim, kernel_size=1)

        self.dropout = nn.Dropout2d(p=dropout) if dropout > 0 else None

        # Kaiming init for convs; unit weight / zero bias for norm layers.
        for m in self.modules():
            if isinstance(m, nn.Conv2d):
                nn.init.kaiming_normal_(m.weight, mode="fan_out", nonlinearity="relu")
            elif isinstance(m, (nn.BatchNorm2d, nn.InstanceNorm2d, nn.GroupNorm)):
                if m.weight is not None:
                    nn.init.constant_(m.weight, 1)
                if m.bias is not None:
                    nn.init.constant_(m.bias, 0)

    def _make_layer(self, dim, stride=1):
        # Two residual blocks; only the first may downsample.
        blocks = (
            ResidualBlock(self.in_planes, dim, self.norm_fn, stride=stride),
            ResidualBlock(dim, dim, self.norm_fn, stride=1),
        )
        self.in_planes = dim
        return nn.Sequential(*blocks)

    def forward(self, x):
        # If the input is a pair, fold it into a single batch.
        is_list = isinstance(x, (tuple, list))
        if is_list:
            batch_dim = x[0].shape[0]
            x = torch.cat(x, dim=0)

        x = self.relu1(self.norm1(self.conv1(x)))
        x = self.layer3(self.layer2(self.layer1(x)))
        x = self.conv2(x)

        if self.training and self.dropout is not None:
            x = self.dropout(x)

        if is_list:
            x = torch.split(x, [batch_dim, batch_dim], dim=0)

        return x
570
+
571
+
572
class BasicEncoder1(nn.Module):
    """Variant of BasicEncoder whose stem takes 2-channel input (e.g. a flow
    field) instead of 3-channel images; otherwise identical.

    NOTE(review): this duplicates BasicEncoder except for conv1's
    in_channels — consider sharing code if either is touched again.
    """

    def __init__(self, output_dim=128, norm_fn="batch", dropout=0.0):
        super(BasicEncoder1, self).__init__()
        self.norm_fn = norm_fn

        # Stem normalization (64 channels after conv1).
        if self.norm_fn == "group":
            self.norm1 = nn.GroupNorm(num_groups=8, num_channels=64)

        elif self.norm_fn == "batch":
            self.norm1 = nn.BatchNorm2d(64)

        elif self.norm_fn == "instance":
            self.norm1 = nn.InstanceNorm2d(64)

        elif self.norm_fn == "none":
            self.norm1 = nn.Sequential()

        # 2 input channels: the only difference from BasicEncoder.
        self.conv1 = nn.Conv2d(2, 64, kernel_size=7, stride=2, padding=3)
        self.relu1 = nn.ReLU(inplace=True)

        self.in_planes = 64
        self.layer1 = self._make_layer(64, stride=1)
        self.layer2 = self._make_layer(96, stride=2)
        self.layer3 = self._make_layer(128, stride=2)

        # output convolution
        self.conv2 = nn.Conv2d(128, output_dim, kernel_size=1)

        self.dropout = None
        if dropout > 0:
            self.dropout = nn.Dropout2d(p=dropout)

        # Kaiming init for convs; unit weight / zero bias for norm layers.
        for m in self.modules():
            if isinstance(m, nn.Conv2d):
                nn.init.kaiming_normal_(m.weight, mode="fan_out", nonlinearity="relu")
            elif isinstance(m, (nn.BatchNorm2d, nn.InstanceNorm2d, nn.GroupNorm)):
                if m.weight is not None:
                    nn.init.constant_(m.weight, 1)
                if m.bias is not None:
                    nn.init.constant_(m.bias, 0)

    def _make_layer(self, dim, stride=1):
        # Two residual blocks; the first carries the stride.
        layer1 = ResidualBlock(self.in_planes, dim, self.norm_fn, stride=stride)
        layer2 = ResidualBlock(dim, dim, self.norm_fn, stride=1)
        layers = (layer1, layer2)

        self.in_planes = dim
        return nn.Sequential(*layers)

    def forward(self, x):
        # if input is list, combine batch dimension
        is_list = isinstance(x, tuple) or isinstance(x, list)
        if is_list:
            batch_dim = x[0].shape[0]
            x = torch.cat(x, dim=0)

        x = self.conv1(x)
        x = self.norm1(x)
        x = self.relu1(x)

        x = self.layer1(x)
        x = self.layer2(x)
        x = self.layer3(x)

        x = self.conv2(x)

        if self.training and self.dropout is not None:
            x = self.dropout(x)

        if is_list:
            x = torch.split(x, [batch_dim, batch_dim], dim=0)

        return x
645
+
646
+
647
class SmallEncoder(nn.Module):
    """Compact feature encoder built from BottleneckBlocks (32/64/96
    channels, 8x total downsampling); same single-or-pair input convention
    as BasicEncoder."""

    def __init__(self, output_dim=128, norm_fn="batch", dropout=0.0):
        super(SmallEncoder, self).__init__()
        self.norm_fn = norm_fn

        # Stem normalization (32 channels after conv1).
        if self.norm_fn == "group":
            self.norm1 = nn.GroupNorm(num_groups=8, num_channels=32)

        elif self.norm_fn == "batch":
            self.norm1 = nn.BatchNorm2d(32)

        elif self.norm_fn == "instance":
            self.norm1 = nn.InstanceNorm2d(32)

        elif self.norm_fn == "none":
            self.norm1 = nn.Sequential()

        self.conv1 = nn.Conv2d(3, 32, kernel_size=7, stride=2, padding=3)
        self.relu1 = nn.ReLU(inplace=True)

        self.in_planes = 32
        self.layer1 = self._make_layer(32, stride=1)
        self.layer2 = self._make_layer(64, stride=2)
        self.layer3 = self._make_layer(96, stride=2)

        self.dropout = None
        if dropout > 0:
            self.dropout = nn.Dropout2d(p=dropout)

        self.conv2 = nn.Conv2d(96, output_dim, kernel_size=1)

        # Kaiming init for convs; unit weight / zero bias for norm layers.
        for m in self.modules():
            if isinstance(m, nn.Conv2d):
                nn.init.kaiming_normal_(m.weight, mode="fan_out", nonlinearity="relu")
            elif isinstance(m, (nn.BatchNorm2d, nn.InstanceNorm2d, nn.GroupNorm)):
                if m.weight is not None:
                    nn.init.constant_(m.weight, 1)
                if m.bias is not None:
                    nn.init.constant_(m.bias, 0)

    def _make_layer(self, dim, stride=1):
        # Two bottleneck blocks; the first carries the stride.
        layer1 = BottleneckBlock(self.in_planes, dim, self.norm_fn, stride=stride)
        layer2 = BottleneckBlock(dim, dim, self.norm_fn, stride=1)
        layers = (layer1, layer2)

        self.in_planes = dim
        return nn.Sequential(*layers)

    def forward(self, x):
        # if input is list, combine batch dimension
        is_list = isinstance(x, tuple) or isinstance(x, list)
        if is_list:
            batch_dim = x[0].shape[0]
            x = torch.cat(x, dim=0)

        x = self.conv1(x)
        x = self.norm1(x)
        x = self.relu1(x)

        x = self.layer1(x)
        x = self.layer2(x)
        x = self.layer3(x)
        x = self.conv2(x)

        if self.training and self.dropout is not None:
            x = self.dropout(x)

        if is_list:
            x = torch.split(x, [batch_dim, batch_dim], dim=0)

        return x
718
+
719
+
720
+ ##################################################
721
+ # RFR is implemented based on RAFT optical flow #
722
+ ##################################################
723
+
724
+
725
def backwarp(img, flow):
    """Backward-warp `img` with a per-pixel displacement field.

    img: (B, C, H, W) tensor.
    flow: (B, 2, H, W) tensor; channel 0 is the horizontal (x) displacement
        and channel 1 the vertical (y) displacement, in pixels.

    Returns the bilinearly sampled image, shape (B, C, H, W).

    Fix: the sampling grids were hard-coded to `.cuda()`, which broke any
    CPU use; they are now created on `img.device` (identical behavior for
    CUDA inputs, and CPU inputs simply work).
    """
    _, _, H, W = img.size()

    u = flow[:, 0, :, :]
    v = flow[:, 1, :, :]

    # Base sampling grid in pixel coordinates.
    gridX, gridY = np.meshgrid(np.arange(W), np.arange(H))

    gridX = torch.tensor(gridX, requires_grad=False, device=img.device)
    gridY = torch.tensor(gridY, requires_grad=False, device=img.device)
    x = gridX.unsqueeze(0).expand_as(u).float() + u
    y = gridY.unsqueeze(0).expand_as(v).float() + v
    # range -1 to 1
    x = 2 * (x / (W - 1) - 0.5)
    y = 2 * (y / (H - 1) - 0.5)
    # stacking X and Y
    grid = torch.stack((x, y), dim=3)
    # Sample pixels using bilinear interpolation.
    imgOut = torch.nn.functional.grid_sample(img, grid, align_corners=True)

    return imgOut
752
+
753
+
754
class ErrorAttention(nn.Module):
    """A three-layer network for predicting mask.

    conv2's 32 output channels are concatenated with the raw input
    (input + 32 = 38 channels) before the final conv3.
    """

    def __init__(self, input, output):
        super(ErrorAttention, self).__init__()
        self.conv1 = nn.Conv2d(input, 32, 5, padding=2)
        self.conv2 = nn.Conv2d(32, 32, 3, padding=1)
        self.conv3 = nn.Conv2d(38, output, 3, padding=1)
        self.prelu1 = nn.PReLU()
        self.prelu2 = nn.PReLU()

    def forward(self, x1):
        feat = self.prelu1(self.conv1(x1))
        feat = self.prelu2(torch.cat([self.conv2(feat), x1], dim=1))
        return self.conv3(feat)
770
+
771
+
772
class RFR(nn.Module):
    """RFR flow network: RAFT-style recurrent refinement with an optional
    attention-based reweighting of an externally supplied initial flow.

    NOTE(review): relies on module-level `autocast` and `device` and on the
    RFR-section `backwarp` defined earlier in this file (which is later
    shadowed by the warping-section alias).
    """

    def __init__(self, args):
        super(RFR, self).__init__()
        self.attention2 = ErrorAttention(6, 1)
        self.hidden_dim = hdim = 128
        self.context_dim = cdim = 128
        # Correlation/dropout settings are forced here regardless of caller input.
        args.corr_levels = 4
        args.corr_radius = 4
        args.dropout = 0
        self.args = args

        # feature network, context network, and update block
        self.fnet = BasicEncoder(output_dim=256, norm_fn="none", dropout=args.dropout)
        # self.cnet = BasicEncoder(output_dim=hdim+cdim, norm_fn='none', dropout=args.dropout)
        self.update_block = BasicUpdateBlock(self.args, hidden_dim=hdim)

    def freeze_bn(self):
        # Put every BatchNorm layer in eval mode (freeze running stats).
        for m in self.modules():
            if isinstance(m, nn.BatchNorm2d):
                m.eval()

    def initialize_flow(self, img):
        """Flow is represented as difference between two coordinate grids flow = coords1 - coords0"""
        N, C, H, W = img.shape
        coords0 = coords_grid(N, H // 8, W // 8).to(img.device)
        coords1 = coords_grid(N, H // 8, W // 8).to(img.device)

        # optical flow computed as difference: flow = coords1 - coords0
        return coords0, coords1

    def upsample_flow(self, flow, mask):
        """Upsample flow field [H/8, W/8, 2] -> [H, W, 2] using convex combination"""
        N, _, H, W = flow.shape
        # 9 mask weights per output pixel, softmax-normalized.
        mask = mask.view(N, 1, 9, 8, 8, H, W)
        mask = torch.softmax(mask, dim=2)

        # Each coarse flow vector (scaled by 8) contributes via its 3x3 neighborhood.
        up_flow = F.unfold(8 * flow, [3, 3], padding=1)
        up_flow = up_flow.view(N, 2, 9, 1, 1, H, W)

        up_flow = torch.sum(mask * up_flow, dim=2)
        up_flow = up_flow.permute(0, 1, 4, 2, 5, 3)
        return up_flow.reshape(N, 2, 8 * H, 8 * W)

    def forward(
        self, image1, image2, iters=12, flow_init=None, upsample=True, test_mode=False
    ):
        H, W = image1.size()[2:4]
        # Round the working resolution down to a multiple of 8.
        H8 = H // 8 * 8
        W8 = W // 8 * 8

        if flow_init is not None:
            # Resize the initial flow to 1/8 working resolution and rescale
            # its displacement magnitudes to match.
            flow_init_resize = F.interpolate(
                flow_init, size=(H8 // 8, W8 // 8), mode="nearest"
            )

            flow_init_resize[:, :1] = (
                flow_init_resize[:, :1].clone() * (W8 // 8 * 1.0) / flow_init.size()[3]
            )
            flow_init_resize[:, 1:] = (
                flow_init_resize[:, 1:].clone() * (H8 // 8 * 1.0) / flow_init.size()[2]
            )

            if not hasattr(self.args, "not_use_rfr_mask") or (
                hasattr(self.args, "not_use_rfr_mask")
                and (not self.args.not_use_rfr_mask)
            ):
                # Downweight initial-flow vectors by a learned confidence
                # computed from the photometric backward-warp error.
                im18 = F.interpolate(image1, size=(H8 // 8, W8 // 8), mode="bilinear")
                im28 = F.interpolate(image2, size=(H8 // 8, W8 // 8), mode="bilinear")

                warp21 = backwarp(im28, flow_init_resize)
                error21 = torch.sum(torch.abs(warp21 - im18), dim=1, keepdim=True)
                # print('errormin', error21.min(), error21.max())
                f12init = (
                    torch.exp(
                        -self.attention2(
                            torch.cat([im18, error21, flow_init_resize], dim=1)
                        )
                        ** 2
                    )
                    * flow_init_resize
                )
            # NOTE(review): when not_use_rfr_mask is True, `error21` is never
            # assigned on this path, so the non-requires_sq_flow return below
            # would raise NameError — confirm intended usage.
        else:
            flow_init_resize = None
            # CUDA-only zero initialization (hard-coded .cuda()).
            flow_init = torch.zeros(
                image1.size()[0], 2, image1.size()[2] // 8, image1.size()[3] // 8
            ).cuda()
            error21 = torch.zeros(
                image1.size()[0], 1, image1.size()[2] // 8, image1.size()[3] // 8
            ).cuda()

            f12_init = flow_init
            # print('None inital flow!')

        image1 = F.interpolate(image1, size=(H8, W8), mode="bilinear")
        image2 = F.interpolate(image2, size=(H8, W8), mode="bilinear")

        f12s, f12, f12_init = self.forward_pred(
            image1, image2, iters, flow_init_resize, upsample, test_mode
        )

        if hasattr(self.args, "requires_sq_flow") and self.args.requires_sq_flow:
            # Return the whole refinement sequence, rescaled to input size.
            for ii in range(len(f12s)):
                f12s[ii] = F.interpolate(f12s[ii], size=(H, W), mode="bilinear")
                f12s[ii][:, :1] = f12s[ii][:, :1].clone() / (1.0 * W8) * W
                f12s[ii][:, 1:] = f12s[ii][:, 1:].clone() / (1.0 * H8) * H
            if self.training:
                return f12s
            else:
                return [f12s[-1]], f12_init
        else:
            # Rescale the final flow back to the original resolution.
            f12[:, :1] = f12[:, :1].clone() / (1.0 * W8) * W
            f12[:, 1:] = f12[:, 1:].clone() / (1.0 * H8) * H

            f12 = F.interpolate(f12, size=(H, W), mode="bilinear")
            # print('wo!!')
            return (
                f12,
                f12_init,
                error21,
            )

    def forward_pred(
        self, image1, image2, iters=12, flow_init=None, upsample=True, test_mode=False
    ):
        """Estimate optical flow between pair of frames"""

        image1 = image1.contiguous()
        image2 = image2.contiguous()

        hdim = self.hidden_dim
        cdim = self.context_dim

        # run the feature network
        with autocast(device.type, enabled=self.args.mixed_precision):
            fmap1, fmap2 = self.fnet([image1, image2])
        fmap1 = fmap1.float()
        fmap2 = fmap2.float()
        corr_fn = CorrBlock(fmap1, fmap2, radius=self.args.corr_radius)

        # run the context network
        # NOTE(review): fnet doubles as the context network here (cnet is
        # commented out in __init__); its 256 channels split into net/inp.
        with autocast(device.type, enabled=self.args.mixed_precision):
            cnet = self.fnet(image1)
            net, inp = torch.split(cnet, [hdim, cdim], dim=1)
            net = torch.tanh(net)
            inp = torch.relu(inp)

        coords0, coords1 = self.initialize_flow(image1)

        if flow_init is not None:
            coords1 = coords1 + flow_init

        flow_predictions = []
        for itr in range(iters):
            coords1 = coords1.detach()
            if itr == 0:
                if flow_init is not None:
                    # NOTE(review): flow_init was already added above, so the
                    # first iteration starts from twice the initial flow —
                    # kept as-is; confirm against the reference implementation.
                    coords1 = coords1 + flow_init
            corr = corr_fn(coords1)  # index correlation volume

            flow = coords1 - coords0
            with autocast(device.type, enabled=self.args.mixed_precision):
                net, up_mask, delta_flow = self.update_block(net, inp, corr, flow)

            # F(t+1) = F(t) + \Delta(t)
            coords1 = coords1 + delta_flow

            # upsample predictions
            if up_mask is None:
                flow_up = upflow8(coords1 - coords0)
            else:
                flow_up = self.upsample_flow(coords1 - coords0, up_mask)

            flow_predictions.append(flow_up)

        return flow_predictions, flow_up, flow_init
947
+
948
+ ####################### WARPING #######################
949
+
950
+
951
# expects batched tensors, considered low-level operation
# img: bs, ch, h, w
# flow: bs, xy (pix displace), h, w
def flow_backwarp(
    img, flow, resample="bilinear", padding_mode="border", align_corners=False
):
    """Backward-warp img by sampling it at positions displaced by flow.

    NOTE(review): flow channel 0 is normalized by shape[-2] (height) and
    channel 1 by shape[-1] (width), then flipped into grid_sample's (x, y)
    order — so the incoming channel order appears to be (vertical,
    horizontal) despite the "xy" note above; confirm against callers.
    """
    # Promote unbatched inputs to 4D.
    if len(img.shape) != 4:
        img = img[None,]
    if len(flow.shape) != 4:
        flow = flow[None,]
    # Displacements scaled into normalized [-1, 1] coordinate units.
    q = (
        2
        * flow
        / torch.tensor(
            [
                flow.shape[-2],
                flow.shape[-1],
            ],
            device=flow.device,
            dtype=torch.float,
        )[None, :, None, None]
    )
    # Add the identity sampling grid.
    q = q + torch.stack(
        torch.meshgrid(
            torch.linspace(-1, 1, flow.shape[-2]),
            torch.linspace(-1, 1, flow.shape[-1]),
        )
    )[
        None,
    ].to(
        flow.device
    )
    if img.dtype != q.dtype:
        img = img.type(q.dtype)

    return nn.functional.grid_sample(
        img,
        q.flip(dims=(1,)).permute(0, 2, 3, 1),
        mode=resample,  # nearest, bicubic, bilinear
        padding_mode=padding_mode,  # border, zeros, reflection
        align_corners=align_corners,
    )
993
+
994
+
995
# NOTE(review): this rebinds `backwarp`, shadowing the RFR-section backwarp
# defined earlier in this file for all code that runs after import time.
backwarp = flow_warp = flow_backwarp
996
+
997
+
998
# mode: sum, avg, lin, softmax
# lin/softmax w/out metric defaults to avg
# must use gpu, move back to cpu if retain_device
# typical metric: -20 * | img0 - backwarp(img1,flow) |
# From Fannovel16: Changed mode params for common ops.
def flow_forewarp(
    img, flow, mode="average", metric=None, mask=False, retain_device=True
):
    """Forward-warp (splat) img along flow via FunctionSoftsplat.

    Inputs are promoted to 4D float32 and moved to CUDA when needed;
    FunctionSoftsplat is defined elsewhere in this file.
    NOTE(review): flow is flipped along dim 1 before splatting, so the
    channel order handed to FunctionSoftsplat is the reverse of the
    caller's — confirm against the softsplat implementation.
    """
    # setup
    # if mode == "sum":
    #     mode = "summation"
    # elif mode == "avg":
    #     mode = "average"
    if mode in ["lin", "linear"]:
        # mode = "linear" if metric is not None else "average"
        mode = "linear" if metric is not None else "avg"
    elif mode in ["sm", "softmax"]:
        # mode = "softmax" if metric is not None else "average"
        mode = "soft" if metric is not None else "avg"
    # Promote unbatched inputs to 4D and force float32 throughout.
    if len(img.shape) != 4:
        img = img[None,]
    if len(flow.shape) != 4:
        flow = flow[None,]
    if metric is not None and len(metric.shape) != 4:
        metric = metric[None,]
    flow = flow.flip(dims=(1,))
    if img.dtype != torch.float32:
        img = img.type(torch.float32)
    if flow.dtype != torch.float32:
        flow = flow.type(torch.float32)
    if metric is not None and metric.dtype != torch.float32:
        metric = metric.type(torch.float32)

    # move to gpu if necessary
    assert img.device == flow.device
    if metric is not None:
        assert img.device == metric.device
    was_cpu = img.device.type == "cpu"
    if was_cpu:
        img = img.to("cuda")
        flow = flow.to("cuda")
        if metric is not None:
            metric = metric.to("cuda")

    # add mask
    if mask:
        # Extra all-ones channel lets the caller recover splat coverage.
        bs, ch, h, w = img.shape
        img = torch.cat(
            [img, torch.ones(bs, 1, h, w, dtype=img.dtype, device=img.device)], dim=1
        )

    # forward, move back to cpu if desired
    ans = FunctionSoftsplat(img, flow, metric, mode)
    if was_cpu and retain_device:
        ans = ans.cpu()
    return ans
1054
+
1055
+
1056
# Short alias for the forward-warp entry point.
forewarp = flow_forewarp
1057
+
1058
+
1059
# resizing utility
def flow_resize(flow, size, mode="nearest", align_corners=False):
    """Resize a (bs, xy, h, w) flow field to `size`, rescaling the
    displacement magnitudes to match the new resolution."""
    target = pixel_ij(size, rounding=True)
    if flow.dtype != torch.float:
        flow = flow.float()
    if len(flow.shape) == 3:
        flow = flow[None,]
    if flow.shape[-2:] == target:
        return flow

    resized = nn.functional.interpolate(
        flow,
        size=target,
        mode=mode,
        align_corners=align_corners if mode != "nearest" else None,
    )
    # Scale each displacement channel by its axis growth factor.
    scale = torch.tensor(
        [new / old for old, new in zip(flow.shape[-2:], target)],
        device=flow.device,
    )
    return resized * scale[None, :, None, None]
1081
+
1082
+
1083
+ ####################### TRADITIONAL #######################
1084
+
1085
# dense
# Classical dense optical-flow backends wrapped to a common signature:
# each lambda takes two cv2 images (a, b) and returns a numpy array whose
# 2-channel flow is moved to the front and given a batch axis via [None,].
# NOTE(review): the createOptFlow_* factory calls below execute at import
# time and require the opencv-contrib build (cv2.optflow / DenseRLOF).
_lucaskanade = lambda a, b: np.moveaxis(
    cv2.optflow.calcOpticalFlowSparseToDense(
        a,
        b,  # grid_step=5, sigma=0.5,
    ),
    2,
    0,
)[
    None,
]
_farneback = lambda a, b: np.moveaxis(
    cv2.calcOpticalFlowFarneback(
        a,
        b,
        None,
        0.6,
        3,
        25,
        7,
        5,
        1.2,
        cv2.OPTFLOW_FARNEBACK_GAUSSIAN,
    ),
    2,
    0,
)[
    None,
]
_dtvl1_ = cv2.optflow.createOptFlow_DualTVL1()
_dtvl1 = lambda a, b: np.moveaxis(
    _dtvl1_.calc(
        a,
        b,
        None,
    ),
    2,
    0,
)[
    None,
]
_simple = lambda a, b: np.moveaxis(
    cv2.optflow.calcOpticalFlowSF(
        a,
        b,
        3,
        5,
        5,
    ),
    2,
    0,
)[
    None,
]
_pca_ = cv2.optflow.createOptFlow_PCAFlow()
_pca = lambda a, b: np.moveaxis(
    _pca_.calc(
        a,
        b,
        None,
    ),
    2,
    0,
)[
    None,
]
_drlof = lambda a, b: np.moveaxis(
    cv2.optflow.calcOpticalFlowDenseRLOF(
        a,
        b,
        None,
    ),
    2,
    0,
)[
    None,
]
_deepflow_ = cv2.optflow.createOptFlow_DeepFlow()
_deepflow = lambda a, b: np.moveaxis(
    _deepflow_.calc(
        a,
        b,
        None,
    ),
    2,
    0,
)[
    None,
]
1174
+
1175
+
1176
def cv2flow(a, b, method="lucaskanade", back=False):
    """Dense optical flow between two project images via classical OpenCV methods.

    a, b: image objects exposing .convert(mode).cv2() (project image type).
    method: one of "lucaskanade", "farneback", "dtvl1", "simple", "pca",
        "drlof", "deepflow".
    back: when True, additionally compute the reverse flow and concatenate
        it along the batch axis.

    Returns a torch tensor of shape (1 or 2, 2, H, W) with the flow
    channels flipped on dim 1.

    Raises:
        ValueError: if `method` is not one of the supported names.
    """
    if method == "lucaskanade":
        f = _lucaskanade
        a = a.convert("L").cv2()
        b = b.convert("L").cv2()
    elif method == "farneback":
        f = _farneback
        a = a.convert("L").cv2()
        b = b.convert("L").cv2()
    elif method == "dtvl1":
        f = _dtvl1
        a = a.convert("L").cv2()
        b = b.convert("L").cv2()
    elif method == "simple":
        f = _simple
        a = a.convert("RGB").cv2()
        b = b.convert("RGB").cv2()
    elif method == "pca":
        f = _pca
        a = a.convert("L").cv2()
        b = b.convert("L").cv2()
    elif method == "drlof":
        f = _drlof
        a = a.convert("RGB").cv2()
        b = b.convert("RGB").cv2()
    elif method == "deepflow":
        f = _deepflow
        a = a.convert("L").cv2()
        b = b.convert("L").cv2()
    else:
        # Was `assert 0`: asserts vanish under `python -O` and carry no
        # message; raise an explicit, informative error instead.
        raise ValueError(f"cv2flow: unknown method {method!r}")
    # NOTE(review): forward flow is computed as f(b, a) — argument order
    # kept from the original; confirm the direction convention.
    ans = f(b, a)
    if back:
        ans = np.concatenate(
            [
                ans,
                f(a, b),
            ]
        )
    return torch.tensor(ans).flip(dims=(1,))
1216
+
1217
+
1218
####################### FLOWNET2 #######################


def flownet2(img_a, img_b, mode="shm", back=False):
    """Query a local FlowNet2 HTTP service (port 8109) for the flow between
    two images, exchanging files through /dev/shm.

    Returns {"response": resp} plus, on HTTP 200, "time" and
    "output": {"flow": tensor} (the flow unpickled from the path the
    service reports).
    """
    # package
    url = f"http://localhost:8109/get-flow"
    if mode == "shm":
        t = time.time()
        fn_a = img_a.save(mkfile(f"/dev/shm/_flownet2/{t}/img_a.png"))
        fn_b = img_b.save(mkfile(f"/dev/shm/_flownet2/{t}/img_b.png"))
    elif mode == "net":
        assert False, "not impl"
        # NOTE(review): dead code below the assert — `img` and `u2d` are not
        # defined in this scope, and fn_a/fn_b would be unbound for this mode.
        q = u2d.img2uri(img.pil("RGB"))
        q.decode()
    resp = requests.get(
        url,
        params={
            "img_a": fn_a,
            "img_b": fn_b,
            "mode": mode,
            "back": back,
            # 'vis': vis,
        },
    )

    # return
    ans = {"response": resp}
    if resp.status_code == 200:
        j = resp.json()
        ans["time"] = j["time"]
        ans["output"] = {
            "flow": torch.tensor(load(j["fn_flow"])),
        }
        # if vis:
        #     ans['output']['vis'] = I(j['fn_vis'])
    if mode == "shm":
        # clean up the scratch directory regardless of response status
        shutil.rmtree(f"/dev/shm/_flownet2/{t}")
    return ans
####################### VISUALIZATION #######################


class Gridnet(nn.Module):
    """Three-scale grid network: `depth` encoder columns followed by
    `depth` decoder columns over feature maps at channel counts
    channels_0/1/2, with per-sample "total dropout" applied to features
    flowing between columns (not into the first one)."""

    def __init__(self, channels_0, channels_1, channels_2, total_dropout_p, depth):
        super().__init__()
        self.channels_0 = ch0 = channels_0
        self.channels_1 = ch1 = channels_1
        self.channels_2 = ch2 = channels_2
        self.total_dropout_p = p = total_dropout_p
        self.depth = depth
        self.encoders = nn.ModuleList(
            [GridnetEncoder(ch0, ch1, ch2) for i in range(self.depth)]
        )
        self.decoders = nn.ModuleList(
            [GridnetDecoder(ch0, ch1, ch2) for i in range(self.depth)]
        )
        self.total_dropout = GridnetTotalDropout(p)
        return

    def forward(self, x):
        # x: list of three feature maps, one per scale
        for e, enc in enumerate(self.encoders):
            # first column reads x directly; `t` only exists from the
            # second iteration on (the conditional avoids evaluating it)
            t = [self.total_dropout(i) for i in t] if e != 0 else x
            t = enc(t)
        for d, dec in enumerate(self.decoders):
            t = [self.total_dropout(i) for i in t]
            t = dec(t)
        return t
class GridnetEncoder(nn.Module):
    """One encoder column: a residual block per scale, with downsampling
    connections feeding each finer scale's output into the next coarser
    scale."""

    def __init__(self, channels_0, channels_1, channels_2):
        super().__init__()
        self.channels_0 = ch0 = channels_0
        self.channels_1 = ch1 = channels_1
        self.channels_2 = ch2 = channels_2
        self.resnet_0 = GridnetResnet(ch0)
        self.resnet_1 = GridnetResnet(ch1)
        self.resnet_2 = GridnetResnet(ch2)
        self.downsample_01 = GridnetDownsample(ch0, ch1)
        self.downsample_12 = GridnetDownsample(ch1, ch2)
        return

    def forward(self, x):
        # x: [fine, mid, coarse]; finer outputs are downsampled and added
        out = [
            None,
        ] * 3
        out[0] = self.resnet_0(x[0])
        out[1] = self.resnet_1(x[1]) + self.downsample_01(out[0])
        out[2] = self.resnet_2(x[2]) + self.downsample_12(out[1])
        return out
class GridnetDecoder(nn.Module):
    """One decoder column: the mirror of GridnetEncoder — a residual block
    per scale, with upsampling connections feeding each coarser scale's
    output into the next finer scale."""

    def __init__(self, channels_0, channels_1, channels_2):
        super().__init__()
        self.channels_0 = ch0 = channels_0
        self.channels_1 = ch1 = channels_1
        self.channels_2 = ch2 = channels_2
        self.resnet_0 = GridnetResnet(ch0)
        self.resnet_1 = GridnetResnet(ch1)
        self.resnet_2 = GridnetResnet(ch2)
        self.upsample_10 = GridnetUpsample(ch1, ch0)
        self.upsample_21 = GridnetUpsample(ch2, ch1)
        return

    def forward(self, x):
        # processed coarse-to-fine; coarser outputs are upsampled and added
        out = [
            None,
        ] * 3
        out[2] = self.resnet_2(x[2])
        out[1] = self.resnet_1(x[1]) + self.upsample_21(out[2])
        out[0] = self.resnet_0(x[0]) + self.upsample_10(out[1])
        return out
class GridnetConverter(nn.Module):
    """Per-scale 1x1-conv adapters: maps a list of feature maps whose
    channel counts are `channels_in` to counts `channels_out`
    (PReLU -> 1x1 conv -> BatchNorm at each scale)."""

    def __init__(self, channels_in, channels_out):
        super().__init__()
        self.channels_in = channels_in
        self.channels_out = channels_out
        adapters = []
        for n_in, n_out in zip(channels_in, channels_out):
            adapters.append(
                nn.Sequential(
                    nn.PReLU(n_in),
                    nn.Conv2d(n_in, n_out, kernel_size=1, padding=0),
                    nn.BatchNorm2d(n_out),
                )
            )
        self.nets = nn.ModuleList(adapters)

    def forward(self, x):
        out = []
        for net, feat in zip(self.nets, x):
            out.append(net(feat))
        return out
class GridnetResnet(nn.Module):
    """Residual block at constant channel count: x + f(x), where f is
    PReLU -> 3x3 conv -> BatchNorm, applied twice."""

    def __init__(self, channels):
        super().__init__()
        self.channels = channels
        stages = []
        for _ in range(2):
            stages.extend(
                [
                    nn.PReLU(channels),
                    nn.Conv2d(channels, channels, kernel_size=3, padding=1),
                    nn.BatchNorm2d(channels),
                ]
            )
        self.net = nn.Sequential(*stages)

    def forward(self, x):
        residual = self.net(x)
        return x + residual
class GridnetDownsample(nn.Module):
    """Halve spatial resolution (stride-2 3x3 conv) and change channel
    count from channels_in to channels_out."""

    def __init__(self, channels_in, channels_out):
        super().__init__()
        self.channels_in = channels_in
        self.channels_out = channels_out
        self.net = nn.Sequential(
            nn.PReLU(channels_in),
            nn.Conv2d(channels_in, channels_in, kernel_size=3, padding=1, stride=2),
            nn.BatchNorm2d(channels_in),
            nn.PReLU(channels_in),
            nn.Conv2d(channels_in, channels_out, kernel_size=3, padding=1),
            nn.BatchNorm2d(channels_out),
        )

    def forward(self, x):
        return self.net(x)
class GridnetUpsample(nn.Module):
    """Double spatial resolution (nearest-neighbor upsample) and change
    channel count from channels_in to channels_out."""

    def __init__(self, channels_in, channels_out):
        super().__init__()
        self.channels_in = channels_in
        self.channels_out = channels_out
        self.net = nn.Sequential(
            nn.Upsample(scale_factor=2, mode="nearest"),
            nn.PReLU(channels_in),
            nn.Conv2d(channels_in, channels_out, kernel_size=3, padding=1),
            nn.BatchNorm2d(channels_out),
            nn.PReLU(channels_out),
            nn.Conv2d(channels_out, channels_out, kernel_size=3, padding=1),
            nn.BatchNorm2d(channels_out),
        )

    def forward(self, x):
        return self.net(x)
class GridnetTotalDropout(nn.Module):
    """Drops entire samples (whole feature maps) with probability p, and
    rescales survivors by 1/(1-p) — inverted dropout per batch item."""

    def __init__(self, p):
        super().__init__()
        assert 0 <= p < 1
        self.p = p
        self.weight = 1 / (1 - p)

    def get_drop(self, x):
        # per-sample keep mask, broadcast over (ch, h, w), scaled by 1/(1-p)
        keep = 1 - (torch.rand(len(x))[:, None, None, None] < self.p).float()
        return keep.to(x.device) * self.weight

    def forward(self, x, force_drop=None):
        # force_drop is True / is False overrides; anything else (incl. the
        # default None) defers to self.training
        if force_drop is True:
            return x * self.get_drop(x)
        if force_drop is False:
            return x
        return x * self.get_drop(x) if self.training else x
class Interpolator(nn.Module):
    """Resize (bs, ch, h, w) or (bs, k, ch, h, w) tensors to a fixed (H, W).

    When `is_flow` is set, flow values are additionally rescaled by the
    per-axis size ratio so vectors stay consistent with the resized grid.
    """

    def __init__(self, size, mode="bilinear"):
        super().__init__()
        self.size = size  # target (H, W) pair
        self.mode = mode
        return

    def forward(self, x, is_flow=False):
        # Early-out when already at the target size.
        # BUGFIX: the original compared x.shape[-2] (an int) against the
        # (H, W) tuple, which never matched, so same-size inputs were
        # always redundantly re-interpolated.
        if tuple(x.shape[-2:]) == tuple(self.size):
            return x
        if len(x.shape) == 4:
            # bs,ch,h,w
            bs, ch, h, w = x.shape
            ans = nn.functional.interpolate(
                x,
                size=self.size,
                mode=self.mode,
                # align_corners must be None for "nearest" mode
                align_corners=(False, None)[self.mode == "nearest"],
            )
            if is_flow:
                # scale flow magnitudes by (new / old) per spatial axis
                ans = (
                    ans
                    * torch.tensor(
                        [b / a for a, b in zip((h, w), self.size)],
                        device=ans.device,
                    )[None, :, None, None]
                )
            return ans
        elif len(x.shape) == 5:
            # bs,k,ch,h,w: merge bs and k, resize, then split back
            bs, k, ch, h, w = x.shape
            return self.forward(
                x.view(bs * k, ch, h, w),
                is_flow=is_flow,
            ).view(bs, k, ch, *self.size)
        else:
            assert 0
###################### CANNY ######################


def canny(img, a=100, b=200):
    # Canny edge map with fixed low/high thresholds; `I` is the project's
    # image wrapper class.
    img = I(img).convert("L")
    return I(cv2.Canny(img.cv2(), a, b))
# https://www.pyimagesearch.com/2015/04/06/zero-parameter-automatic-canny-edge-detection-with-python-and-opencv/
def canny_pis(img, sigma=0.33):
    """Auto-thresholded Canny: thresholds at (1 +/- sigma) * median."""
    # compute the median of the single channel pixel intensities
    img = I(img).convert("L").uint8(ch_last=False)
    v = np.median(img)
    # apply automatic Canny edge detection using the computed median
    lower = int(max(0, (1.0 - sigma) * v))
    upper = int(min(255, (1.0 + sigma) * v))
    edged = cv2.Canny(img[0], lower, upper)
    # return the edged image
    return I(edged)
# https://en.wikipedia.org/wiki/Otsu%27s_method
def canny_otsu(img):
    # Canny with the high threshold selected by Otsu's method; the low
    # threshold is half of it.
    img = I(img).convert("L").uint8(ch_last=False)
    high, _ = cv2.threshold(img[0], 0, 255, cv2.THRESH_BINARY + cv2.THRESH_OTSU)
    low = 0.5 * high
    return I(cv2.Canny(img[0], low, high))
def xdog(img, t=1.0, epsilon=0.04, phi=100, sigma=3, k=1.6):
    """Extended difference-of-gaussians (XDoG) stylization.

    Blurs at sigma and sigma*k, takes the scaled difference, and softens
    below-epsilon regions with a tanh ramp.  Returns a float numpy array.
    """
    img = I(img).convert("L").uint8(ch_last=False)
    grey = np.asarray(img, dtype=np.float32)
    g0 = scipy.ndimage.gaussian_filter(grey, sigma)
    g1 = scipy.ndimage.gaussian_filter(grey, sigma * k)

    # ans = ((1+p) * g0 - p * g1) / 255
    ans = (g0 - t * g1) / 255
    # tanh ramp is only applied where the response is below epsilon
    ans = 1 + np.tanh(phi * (ans - epsilon)) * (ans < epsilon)
    return ans
def dog(img, t=1.0, sigma=1.0, k=1.6, epsilon=0.01, kernel_factor=4, clip=True):
    """Difference-of-gaussians edge response of a single image.

    Returns a numpy array (1, H, W): 0.5 + t*(g1 - g0) - epsilon, clipped
    to [0, 1] unless `clip` is False.  Kernel sizes are odd, at least 3,
    and scale with sigma via kernel_factor.
    """
    img = I(img).convert("L").tensor()[None]
    kern0 = max(2 * int(sigma * kernel_factor) + 1, 3)
    kern1 = max(2 * int(sigma * k * kernel_factor) + 1, 3)
    g0 = kornia.filters.gaussian_blur2d(
        img,
        (kern0, kern0),
        (sigma, sigma),
        border_type="replicate",
    )
    g1 = kornia.filters.gaussian_blur2d(
        img,
        (kern1, kern1),
        (sigma * k, sigma * k),
        border_type="replicate",
    )
    ans = 0.5 + t * (g1 - g0) - epsilon
    ans = ans.clip(0, 1) if clip else ans
    return ans[0].numpy()
# input: (bs,rgb(a),h,w) or (bs,1,h,w)
# returns: (bs,1,h,w)
def batch_dog(img, t=1.0, sigma=1.0, k=1.6, epsilon=0.01, kernel_factor=4, clip=True):
    """Batched difference-of-gaussians: grayscale if needed, blur at sigma
    and sigma*k, return 0.5 + t*(g1 - g0) - epsilon (optionally clipped)."""
    # to grayscale if needed
    bs, ch, h, w = img.shape
    if ch in [3, 4]:
        # drop alpha (if any) before the grayscale conversion
        img = kornia.color.rgb_to_grayscale(img[:, :3])
    else:
        assert ch == 1

    # calculate dog
    kern0 = max(2 * int(sigma * kernel_factor) + 1, 3)
    kern1 = max(2 * int(sigma * k * kernel_factor) + 1, 3)
    g0 = kornia.filters.gaussian_blur2d(
        img,
        (kern0, kern0),
        (sigma, sigma),
        border_type="replicate",
    )
    g1 = kornia.filters.gaussian_blur2d(
        img,
        (kern1, kern1),
        (sigma * k, sigma * k),
        border_type="replicate",
    )
    ans = 0.5 + t * (g1 - g0) - epsilon
    ans = ans.clip(0, 1) if clip else ans
    return ans
############### DERIVED DISTANCES ###############

# input: (bs,h,w) or (bs,1,h,w)
# returns: (bs,)
# normalized s.t. metric is same across proportional image scales


# average of two asymmetric distances
# normalized by diameter and area
def batch_chamfer_distance(gt, pred, block=1024, return_more=False):
    """Symmetric chamfer distance: the mean of the two directed terms."""
    d_gt_to_pred = batch_chamfer_distance_t(gt, pred, block=block)
    d_pred_to_gt = batch_chamfer_distance_p(gt, pred, block=block)
    return (d_gt_to_pred + d_pred_to_gt) / 2
def batch_chamfer_distance_t(gt, pred, block=1024, return_more=False):
    """Directed chamfer term: mean over gt pixels of the EDT of pred,
    normalized by the image diagonal.  `return_more` is accepted but
    unused.  Returns shape (bs,)."""
    assert gt.device == pred.device and gt.shape == pred.shape
    bs, h, w = gt.shape[0], gt.shape[-2], gt.shape[-1]
    dpred = batch_edt(pred, block=block)
    cd = (gt * dpred).float().mean((-2, -1)) / np.sqrt(h**2 + w**2)
    if len(cd.shape) == 2:
        # squeeze a singleton channel axis when input was (bs,1,h,w)
        assert cd.shape[1] == 1
        cd = cd.squeeze(1)
    return cd
def batch_chamfer_distance_p(gt, pred, block=1024, return_more=False):
    """Directed chamfer term: mean over pred pixels of the EDT of gt,
    normalized by the image diagonal (mirror of the _t variant).
    `return_more` is accepted but unused.  Returns shape (bs,)."""
    assert gt.device == pred.device and gt.shape == pred.shape
    bs, h, w = gt.shape[0], gt.shape[-2], gt.shape[-1]
    dgt = batch_edt(gt, block=block)
    cd = (pred * dgt).float().mean((-2, -1)) / np.sqrt(h**2 + w**2)
    if len(cd.shape) == 2:
        # squeeze a singleton channel axis when input was (bs,1,h,w)
        assert cd.shape[1] == 1
        cd = cd.squeeze(1)
    return cd
# normalized by diameter
# always between [0,1]
def batch_hausdorff_distance(gt, pred, block=1024, return_more=False):
    """Symmetric Hausdorff distance between binary maps via their EDTs,
    normalized by the image diagonal.  `return_more` is accepted but
    unused.  Returns shape (bs,)."""
    assert gt.device == pred.device and gt.shape == pred.shape
    bs, h, w = gt.shape[0], gt.shape[-2], gt.shape[-1]
    dgt = batch_edt(gt, block=block)
    dpred = batch_edt(pred, block=block)
    # max over both directions of the farthest on-pixel from the other map
    hd = torch.stack(
        [
            (dgt * pred).amax(dim=(-2, -1)),
            (dpred * gt).amax(dim=(-2, -1)),
        ]
    ).amax(dim=0).float() / np.sqrt(h**2 + w**2)
    if len(hd.shape) == 2:
        # squeeze a singleton channel axis when input was (bs,1,h,w)
        assert hd.shape[1] == 1
        hd = hd.squeeze(1)
    return hd
#################### UTILITIES ####################


def reset_parameters(model):
    """Re-initialize the weights of model's immediate children.

    NOTE: iterates .children() only — parameters of deeper, nested
    submodules are left untouched.
    """
    for child in model.children():
        reset = getattr(child, "reset_parameters", None)
        if reset is not None:
            reset()
    return model
def channel_squeeze(x, dim=1):
    """Merge dimensions dim and dim+1 of x into a single dimension."""
    lead = x.shape[:dim]
    trail = x.shape[dim + 2 :]
    return x.reshape(*lead, -1, *trail)
def channel_unsqueeze(x, shape, dim=1):
    """Split dimension dim of x into the given shape (inverse of
    channel_squeeze)."""
    lead = x.shape[:dim]
    trail = x.shape[dim + 1 :]
    return x.reshape(*lead, *shape, *trail)
def default_collate(items, device=None):
    """Collate samples with torch's default collator, then move any
    tensors in the result to `device` (no-op when device is None)."""
    batch = torch.utils.data.dataloader.default_collate(items)
    return to(dict(batch), device)
def to(x, device):
    """Move x to `device`: dicts have their tensor values moved (other
    values pass through), tensors are moved directly, numpy arrays are
    converted to tensors first.  device=None returns x unchanged."""
    if device is None:
        return x
    if issubclass(x.__class__, dict):
        moved = {}
        for key, val in x.items():
            moved[key] = val.to(device) if isinstance(val, torch.Tensor) else val
        return moved
    if isinstance(x, torch.Tensor):
        return x.to(device)
    if isinstance(x, np.ndarray):
        return torch.tensor(x).to(device)
    assert 0, "data not understood"
################ PARSING ################

from argparse import Namespace

# args: all args
# bargs: base args
# pargs: data processing args
# largs: data loading args
# margs: model args
# targs: training args


# typically used to read dataset filters
def read_filter(fn, cast=None, sort=True, sort_key=None):
    """Read non-empty lines from fn, optionally casting each through
    `cast` and sorting the result with `sort_key`."""
    caster = cast if cast is not None else (lambda value: value)
    entries = [caster(line) for line in read(fn).split("\n") if line != ""]
    return sorted(entries, key=sort_key) if sort else entries
################ FILE MANAGEMENT ################


def mkfile(fn, parents=True, exist_ok=True):
    # Ensure the parent directory of fn exists, then return fn unchanged —
    # convenience for open(mkfile(path), "w")-style usage.  Paths are
    # "/"-separated throughout this module.
    dn = "/".join(fn.split("/")[:-1])
    mkdir(dn, parents=parents, exist_ok=exist_ok)
    return fn
def mkdir(dn, parents=True, exist_ok=True):
    """Create directory dn (with parents) and return it, minus any single
    trailing slash (the root "/" is returned as-is).

    BUGFIX: the original indexed dn[-1] unconditionally, raising
    IndexError for an empty dirname — e.g. via mkfile("name.png") when
    the path has no directory component.
    """
    pathlib.Path(dn).mkdir(parents=parents, exist_ok=exist_ok)
    if dn != "/" and dn.endswith("/"):
        return dn[:-1]
    return dn
def fstrip(fn, return_more=False):
    """Decompose a "/"-separated path into directory, filename, basename,
    and extension.

    Returns just the basename unless `return_more` is set, in which case
    a Namespace with all the parts is returned.  A path with no "/" gets
    directory "."; a filename with no "." gets extension "".
    """
    if "/" in fn:
        dn, fn = fn.rsplit("/", 1)
    else:
        dn = "."
    if "." in fn:
        bn, ext = fn.rsplit(".", 1)
    else:
        bn, ext = fn, ""
    if not return_more:
        return bn
    return Namespace(
        dn=dn,
        fn=fn,
        path=f"{dn}/{fn}",
        bn_path=f"{dn}/{bn}",
        bn=bn,
        ext=ext,
    )
def read(fn, mode="r"):
    """Return the full contents of file fn (text by default; pass "rb"
    for bytes)."""
    with open(fn, mode) as handle:
        data = handle.read()
    return data
def write(text, fn, mode="w"):
    """Write text to fn (creating parent directories first); returns the
    number of characters written."""
    mkfile(fn, parents=True, exist_ok=True)
    with open(fn, mode) as handle:
        count = handle.write(text)
    return count
import pickle


def dump(obj, fn, mode="wb"):
    """Pickle obj to fn, creating parent directories first."""
    mkfile(fn, parents=True, exist_ok=True)
    with open(fn, mode) as handle:
        result = pickle.dump(obj, handle)
    return result
def load(fn, mode="rb"):
    """Unpickle and return the object stored at fn.

    NOTE(review): pickle.load can execute arbitrary code — only use on
    trusted files.
    """
    with open(fn, mode) as handle:
        obj = pickle.load(handle)
    return obj
import json


def jwrite(x, fn, mode="w", indent="\t", sort_keys=False):
    """Serialize x as JSON to fn, creating parent directories first."""
    mkfile(fn, parents=True, exist_ok=True)
    with open(fn, mode) as handle:
        result = json.dump(x, handle, indent=indent, sort_keys=sort_keys)
    return result
def jread(fn, mode="r"):
    """Load and return the JSON document stored at fn."""
    with open(fn, mode) as handle:
        doc = json.load(handle)
    return doc
try:
    import yaml

    def ywrite(x, fn, mode="w", default_flow_style=False):
        """Serialize x as YAML to fn, creating parent directories first."""
        mkfile(fn, parents=True, exist_ok=True)
        with open(fn, mode) as handle:
            return yaml.dump(x, handle, default_flow_style=default_flow_style)

    def yread(fn, mode="r"):
        """Load and return the YAML document at fn (uses safe_load)."""
        with open(fn, mode) as handle:
            return yaml.safe_load(handle)

# yaml is optional: silently skip the helpers if it is missing/broken.
# BUGFIX: narrowed from a bare `except:`, which also swallowed
# KeyboardInterrupt and SystemExit.
except Exception:
    pass
# pyunpack is optional; ignore a missing or broken install.
# BUGFIX: narrowed from a bare `except:` (which caught KeyboardInterrupt too).
try:
    import pyunpack
except Exception:
    pass
# mysql-connector is optional; ignore a missing or broken install.
# BUGFIX: narrowed from a bare `except:` (which caught KeyboardInterrupt too).
try:
    import mysql
    import mysql.connector
except Exception:
    pass
################ MISC ################

# Default sample-image path used elsewhere; falls back to the alternate
# env directory when the first location does not exist.
hakase = "./env/__hakase__.jpg"
if not os.path.isfile(hakase):
    hakase = "./__env__/__hakase__.jpg"
def mem(units="m"):
    """Resident set size of the current process, scaled by the first
    letter of `units` (b/k/m/g/t, case-insensitive; decimal multiples)."""
    divisor = {
        "b": 1,
        "k": 1e3,
        "m": 1e6,
        "g": 1e9,
        "t": 1e12,
    }[units[0].lower()]
    return psProcess(os.getpid()).memory_info().rss / divisor
def chunk(array, length, colwise=True):
    """Split array into consecutive slices.

    colwise=True: pieces of size `length` (last may be shorter).
    colwise=False: approximately `length` pieces of equal size.
    """
    if not colwise:
        return chunk(array, int(math.ceil(len(array) / length)), colwise=True)
    return [array[start : start + length] for start in range(0, len(array), length)]
def classtree(x):
    """Nested class-hierarchy structure for x's MRO (see
    inspect.getclasstree)."""
    mro = inspect.getmro(x)
    return inspect.getclasstree(mro)
################ AESTHETIC ################


class Table:
    """Plain-text table renderer with a small per-cell spec language.

    Cells are (value, spec-string) pairs or "value::spec" strings; the
    spec controls justification (l/r, t/b), delimiter expansion
    (./^/v/</>), subtables (+/-/|), fill (_), and an f-string format
    suffix after ":".  Specs are parsed into 12-tuples by Table._spec
    (see its index comments).  `_spec`, `_put`, and `parse` are helper
    functions used via the class (Table._spec(...)), not via instances.
    """

    def __init__(
        self,
        table,
        delimiter=" ",
        orientation="br",
        double_colon=True,
    ):
        self.delimiter = delimiter
        self.orientation = orientation
        self.t = Table.parse(table, delimiter, orientation, double_colon)
        return

    # rendering
    def __str__(self):
        return self.render()

    def __repr__(self):
        return self.render()

    def render(self):
        # set up empty entry
        empty = ("", Table._spec(self.orientation, transpose=False))

        # calculate table size
        t = copy.deepcopy(self.t)
        totalrows = len(t)
        totalcols = [len(r) for r in t]
        assert min(totalcols) == max(totalcols)
        totalcols = totalcols[0]

        # string-ify
        for i in range(totalrows):
            for j in range(totalcols):
                x, s = t[i][j]
                sp = s[11]
                if sp:
                    # NOTE(review): applies the ":" format suffix by
                    # eval-ing an f-string — cell values/specs must be
                    # trusted input.
                    x = eval(f'f"{{{x}{sp}}}"')
                Table._put((str(x), s), t, (i, j), empty)

        # expand delimiters
        # _repl rewrites a spec for a cell overwritten by delimiter
        # expansion: clears the directional bits and sets the fill bit
        _repl = (
            lambda s: s[:2] + (1, 0, 0, 0, 0) + s[7:10] + (1,) + s[11:]
            if s[2]
            else s[:2] + (0, 0, 0, 0, 0) + s[7:10] + (1,) + s[11:]
        )
        for i, row in enumerate(t):
            for j, (x, s_own) in enumerate(row):
                # expand delim_up(^)
                if s_own[3]:
                    u, v = i, j
                    while 0 <= u:
                        _, s = t[u][v]
                        if (i, j) != (u, v) and (s[2] and not s[10]):
                            break
                        Table._put((x, _repl(s)), t, (u, v), empty)
                        u -= 1

                # expand delim_down(v)
                if s_own[4]:
                    u, v = i, j
                    while u < totalrows:
                        _, s = t[u][v]
                        if (i, j) != (u, v) and (s[2] and not s[10]):
                            break
                        Table._put((x, _repl(s)), t, (u, v), empty)
                        u += 1

                # expand delim_right(>)
                if s_own[5]:
                    u, v = i, j
                    while v < totalcols:
                        _, s = t[u][v]
                        if (i, j) != (u, v) and (s[2] and not s[10]):
                            break
                        Table._put((x, _repl(s)), t, (u, v), empty)
                        v += 1

                # expand delim_left(<)
                if s_own[6]:
                    u, v = i, j
                    while 0 <= v:
                        _, s = t[u][v]
                        if (i, j) != (u, v) and (s[2] and not s[10]):
                            break
                        Table._put((x, _repl(s)), t, (u, v), empty)
                        v -= 1

        # justification calculation
        widths = [
            0,
        ] * totalcols  # j
        heights = [
            0,
        ] * totalrows  # i
        for i, row in enumerate(t):
            for j, (x, s) in enumerate(row):
                # height caclulation
                heights[i] = max(heights[i], x.count("\n"))

                # width calculation; non-delim fillers no contribution
                if s[2] or not s[10]:
                    w = max(len(q) for q in x.split("\n"))
                    widths[j] = max(widths[j], w)
        # no newline ==> height=1
        heights = [h + 1 for h in heights]

        # render table
        rend = []
        roff = 0
        for i, row in enumerate(t):
            for j, (x, s) in enumerate(row):
                w, h = widths[j], heights[i]

                # expand fillers and delimiters
                if s[2] or s[10]:
                    xs = x.split("\n")
                    xw0 = min(len(l) for l in xs)
                    xw1 = max(len(l) for l in xs)
                    xh = len(xs)
                    if (xw0 == xw1 == w) and (xh == h):
                        pass
                    elif xw0 == xw1 == w:
                        # right width, wrong height: repeat first line
                        x = "\n".join(
                            [
                                xs[0],
                            ]
                            * h
                        )
                    elif xh == h:
                        # right height, wrong width: tile each line's
                        # first character
                        x = "\n".join([(l[0] if l else "") * w for l in xs])
                    else:
                        # wrong both ways: tile first char into a w x h block
                        x = x[0] if x else " "
                        x = "\n".join(
                            [
                                x * w,
                            ]
                            * h
                        )

                # justify horizontally
                x = [l.rjust(w) if s[0] else l.ljust(w) for l in x.split("\n")]

                # justify vertically
                plus = [
                    " " * w,
                ] * (h - len(x))
                x = plus + x if not s[1] else x + plus

                # input to table
                for r, xline in enumerate(x):
                    Table._put(xline, rend, (roff + r, j), None)
            roff += h

        # return rendered string
        return "\n".join(["".join(r) for r in rend])

    # parsing
    def _spec(s, transpose=False):
        # Parse a spec string (everything after ":" is kept verbatim as an
        # f-string format suffix) into the 12-tuple documented below.
        if ":" in s:
            i = s.index(":")
            sp = s[i:]
            s = s[:i]
        else:
            sp = ""
        s = s.lower()
        return (
            int("r" in s),  # 0:: 0:left(l) 1:right(r)
            int("t" in s),  # 1:: 0:bottom(b) 1:top(t)
            int(any([i in s for i in [".", "<", ">", "^", "v"]])),  # 2:: delim_here(.)
            int("^" in s if not transpose else "<" in s),  # 3:: delim_up(^)
            int("v" in s if not transpose else ">" in s),  # 4:: delim_down(v)
            int(">" in s if not transpose else "v" in s),  # 5:: delim_right(>)
            int("<" in s if not transpose else "^" in s),  # 6:: delim_left(<)
            int("+" in s),  # 7:: subtable(+)
            int("-" in s if not transpose else "|" in s),  # 8:: subtable_horiz(-)
            int("|" in s if not transpose else "-" in s),  # 9:: subtable_vert(|)
            int("_" in s),  # 10:: fill(_); if delim, overwrite; else fit
            sp,  # 11:: special(:) f-string for numbers
        )

    def _put(obj, t, ij, empty):
        # Place obj at position ij in the nested list t, growing rows and
        # padding short rows with `empty` as needed.
        i, j = ij
        while i >= len(t):
            t.append([])
        while j >= len(t[i]):
            t[i].append(empty)
        t[i][j] = obj
        return

    def parse(
        table,
        delimiter=" ",
        orientation="br",
        double_colon=True,
    ):
        # Normalize raw cell data into a rectangular grid of
        # (string, spec-tuple) pairs with delimiter columns interleaved,
        # and expand any subtable cells in place.
        # disabling transpose
        transpose = False

        # set up empty entry
        empty = ("", Table._spec(orientation, transpose))

        # transpose
        t = []
        for i, row in enumerate(table):
            for j, item in enumerate(row):
                ij = (i, j) if not transpose else (j, i)
                if type(item) == tuple and len(item) == 2 and type(item[1]) == str:
                    item = (item[0], Table._spec(item[1], transpose))
                elif double_colon and type(item) == str and "::" in item:
                    x, s = item.split("::")
                    item = (x, Table._spec(s, transpose))
                else:
                    item = (item, Table._spec(orientation, transpose))
                Table._put(item, t, ij, empty)

        # normalization
        maxcol = 0
        maxrow = len(t)
        for i, row in enumerate(t):
            # take element number into account
            maxcol = max(maxcol, len([i for i in row if not i[1][2]]))

            # take subtables into account
            for j, (x, s) in enumerate(row):
                if s[7]:
                    r = len(x)
                    maxrow = max(maxrow, i + r)
                    c = max(len(q) for q in x)
                    maxcol = max(maxcol, j + c)
                elif s[8]:
                    c = len(x)
                    maxcol = max(maxcol, j + c)
                elif s[9]:
                    r = len(x)
                    maxrow = max(maxrow, i + r)
        totalcols = 2 * maxcol + 1
        totalrows = maxrow
        t += [[]] * (totalrows - len(t))
        newt = []
        delim = (delimiter, Table._spec("._" + orientation, transpose))
        # interleave delimiter columns between content columns
        for i, row in enumerate(t):
            wasd = False
            tcount = 0
            for j in range(totalcols):
                item = t[i][tcount] if tcount < len(t[i]) else empty
                isd = item[1][2]
                if wasd and isd:
                    Table._put(empty, newt, (i, j), empty)
                    wasd = False
                elif wasd and not isd:
                    Table._put(item, newt, (i, j), empty)
                    tcount += 1
                    wasd = False
                elif not wasd and isd:
                    Table._put(item, newt, (i, j), empty)
                    tcount += 1
                    wasd = True
                elif not wasd and not isd:
                    Table._put(delim, newt, (i, j), empty)
                    wasd = True
        t = newt

        # normalization: add dummy last column for delimiter
        for row in t:
            row.append(empty)

        # expand subtables
        delim_cols = [i for i in range(totalcols) if i % 2 == 0]
        while True:
            # find a table
            ij = None
            for i, row in enumerate(t):
                for j, item in enumerate(row):
                    st, s = item
                    if s[7]:
                        ij = i, j, 7, st, s
                        break
                    elif s[8]:
                        ij = i, j, 8, st, s
                        break
                    elif s[9]:
                        ij = i, j, 9, st, s
                        break
                if ij is not None:
                    break
            if ij is None:
                break

            # replace its specs
            i, j, k, st, s = ij
            s = list(s)
            s[7] = s[8] = s[9] = 0
            s = tuple(s)

            # expand it
            if k == 7:  # 2d table
                for x, row in enumerate(st):
                    for y, obj in enumerate(row):
                        a = i + x if not transpose else i + y
                        b = j + 2 * y if not transpose else j + 2 * x
                        Table._put((obj, s), t, (a, b), None)
            if k == 8:  # subtable_horiz
                for y, obj in enumerate(st):
                    Table._put((obj, s), t, (i, j + 2 * y), None)
            if k == 9:  # subtable_vert
                for x, obj in enumerate(st):
                    Table._put((obj, s), t, (i + x, j), None)

        # return, finally
        return t
class Resnet(nn.Module):
    """Residual block: x + f(x), where f is PReLU -> 3x3 conv -> BatchNorm
    repeated twice at a constant channel count."""

    def __init__(self, channels):
        super().__init__()
        self.channels = channels
        stages = []
        for _ in range(2):
            stages.extend(
                [
                    nn.PReLU(channels),
                    nn.Conv2d(channels, channels, kernel_size=3, padding=1),
                    nn.BatchNorm2d(channels),
                ]
            )
        self.net = nn.Sequential(*stages)

    def forward(self, x):
        residual = self.net(x)
        return x + residual
class Synthesizer(nn.Module):
    """Fuse warped images, flows, masks, and features into an RGB frame.

    Everything is resized to `size`, concatenated, run through a small
    conv/resnet stack, and the predicted residual is added (in logit
    space) to the average of the first two input images.
    """

    def __init__(
        self, size, channels_image, channels_flow, channels_mask, channels_feature
    ):
        super().__init__()
        self.size = size
        # image diagonal via `diam` (defined elsewhere in this file),
        # used to normalize flow magnitudes
        self.diam = diam(self.size)
        self.channels_image = cimg = channels_image
        self.channels_flow = cflow = channels_flow
        self.channels_mask = cmask = channels_mask
        self.channels_feature = cfeat = channels_feature
        # cflow // 2 because each 2-channel flow is reduced to a single
        # magnitude channel in forward()
        self.channels = ch = cimg + cflow // 2 + cmask + cfeat
        self.interpolator = Interpolator(self.size, mode="bilinear")
        # ch + 3 input channels: the extra 3 are the averaged image
        # prepended in forward()
        self.net = nn.Sequential(
            nn.Conv2d(ch + 3, 64, kernel_size=1, padding=0),
            Resnet(64),
            nn.Sequential(
                nn.PReLU(64),
                nn.Conv2d(64, 32, kernel_size=3, padding=1),
                nn.BatchNorm2d(32),
            ),
            Resnet(32),
            nn.Sequential(
                nn.PReLU(32),
                nn.Conv2d(32, 16, kernel_size=3, padding=1),
                nn.BatchNorm2d(16),
            ),
            Resnet(16),
            nn.Sequential(
                nn.PReLU(16),
                nn.Conv2d(16, 3, kernel_size=3, padding=1),
            ),
        )
        return

    def forward(self, images, flows, masks, features, return_more=False):
        itp = self.interpolator
        # prepend the mean of the first two images; its logit becomes the
        # base onto which the residual is added
        images = [
            (images[0] + images[1]) / 2,
        ] + images
        logimgs = [itp(pixel_logit(i[:, :3])) for i in images]
        cat = torch.cat(
            [
                *logimgs,
                # flows enter as per-pixel magnitudes, normalized by the
                # image diagonal
                *[itp(f).norm(dim=1, keepdim=True) / self.diam for f in flows],
                *[itp(m) for m in masks],
                *[itp(f) for f in features],
            ],
            dim=1,
        )
        residual = self.net(cat)
        return torch.sigmoid(logimgs[0] + 0.5 * residual), (
            locals() if return_more else None
        )
class FlowZMetric(nn.Module):
    """Per-pixel z (confidence) maps for splatting: the LAB-space backward-
    warping error, negated and scaled by 0.1 so that better matches get
    higher z."""

    def __init__(self):
        super().__init__()
        return

    def forward(self, img0, img1, flow0, flow1, return_more=False):
        # B(i0,f0) = i1
        # B(i1,f1) = i0
        # F(x,f0,z0)
        # F(x,f1,z1)
        img0 = kornia.color.rgb_to_lab(img0[:, :3])
        img1 = kornia.color.rgb_to_lab(img1[:, :3])
        return [
            -0.1 * (img1 - flow_backwarp(img0, flow0)).norm(dim=1, keepdim=True),  # z0
            -0.1 * (img0 - flow_backwarp(img1, flow1)).norm(dim=1, keepdim=True),  # z1
        ], (locals() if return_more else None)
class NEDT(nn.Module):
    """Normalized euclidean distance transform of an image's DoG edges:
    1 - exp(-edt * scale), rising from 0 at an edge toward 1 far from any
    edge.  Computed under no_grad (not differentiable)."""

    def __init__(self):
        super().__init__()
        return

    def forward(
        self,
        img,
        t=2.0,
        sigma_factor=1 / 540,
        k=1.6,
        epsilon=0.01,
        kernel_factor=4,
        exp_factor=540 / 15,
        return_more=False,
    ):
        with torch.no_grad():
            # sigma scales with image height so the edge response is
            # resolution-independent
            dog = batch_dog(
                img,
                t=t,
                sigma=img.shape[-2] * sigma_factor,
                k=k,
                epsilon=epsilon,
                kernel_factor=kernel_factor,
                clip=False,
            )
            # binarize the DoG response, then distance-transform it
            edt = batch_edt((dog > 0.5).float())
            ans = 1 - (-edt * exp_factor / max(edt.shape[-2:])).exp()
        return ans, (locals() if return_more else None)
class HalfWarper(nn.Module):
    """Forward-warp both endpoint frames to an intermediate time t and
    combine them, producing candidate images, the time-scaled flows, and
    (morphologically opened) validity masks."""

    def __init__(self):
        super().__init__()
        # channel counts of the outputs returned by forward()
        self.channels_image = 4 * 3
        self.channels_flow = 2 * 2
        self.channels_mask = 2 * 1
        self.channels = self.channels_image + self.channels_flow + self.channels_mask

    def morph_open(self, x, k):
        # morphological opening with a k x k kernel (k == 0: no-op);
        # used to clean up the splatting validity masks
        if k == 0:
            return x
        else:
            with torch.no_grad():
                return kornia.morphology.opening(x, torch.ones(k, k, device=x.device))

    def forward(self, img0, img1, flow0, flow1, z0, z1, k, t=0.5, return_more=False):
        # forewarps
        # scale each flow to the intermediate time t; `forewarp` is
        # defined elsewhere in this file (mode="sm" with metric z —
        # presumably softmax splatting; TODO confirm)
        flow0_ = (1 - t) * flow0
        flow1_ = t * flow1
        f01 = forewarp(img0, flow1_, mode="sm", metric=z1, mask=True)
        f10 = forewarp(img1, flow0_, mode="sm", metric=z0, mask=True)
        # split warped result into image channels and the trailing mask
        f01i, f01m = f01[:, :-1], self.morph_open(f01[:, -1:], k=k)
        f10i, f10m = f10[:, :-1], self.morph_open(f10[:, -1:], k=k)

        # base guess
        # fill each warp's holes with the other warp's pixels
        base0 = f01m * f01i + (1 - f01m) * f10i
        base1 = f10m * f10i + (1 - f10m) * f01i
        ans = [
            [  # images
                base0,
                base1,
                f01i,
                f10i,
            ],
            [  # flows
                flow0_,
                flow1_,
            ],
            [  # masks
                f01m,
                f10m,
            ],
        ]
        return ans, (locals() if return_more else None)
class ResnetFeatureExtractor(nn.Module):
    """Truncated ResNet-50 feature pyramid (conv1 / layer1 / layer2 outputs).

    The backbone is either a torchvision-pretrained resnet50 or a model loaded
    through the project's ``userving`` inference-serving helper, depending on
    ``inferserve_query[0]``.

    NOTE(review): ``forward`` returns ``locals()`` when ``return_more`` is
    True, so local names here may be relied upon by callers.
    """

    def __init__(self, inferserve_query, size_in=None):
        # inferserve_query: ("torchvision", ...) selects the pretrained
        #     torchvision backbone; anything else is forwarded to
        #     userving.infer_model_load(*inferserve_query).
        # size_in: optional (H, W) of the expected input, used to precompute
        #     the output sizes of each pyramid level.
        super().__init__()
        self.inferserve_query = iq = inferserve_query
        self.size_in = si = size_in
        if iq[0] == "torchvision":
            # use pytorch pretrained resnet50
            self.base_hparams = None
            resnet = tv.models.resnet50(pretrained=True)

            self.resize = T.Resize(256)
            # Standard ImageNet normalization for torchvision models.
            self.resnet_preprocess = T.Normalize(
                mean=[0.485, 0.456, 0.406],
                std=[0.229, 0.224, 0.225],
            )
            self.conv1 = resnet.conv1
            self.bn1 = resnet.bn1
            self.relu = resnet.relu  # 64ch, 128p (assuming 256p input)
            self.maxpool = resnet.maxpool
            self.layer1 = resnet.layer1  # 256ch, 64p
            self.layer2 = resnet.layer2  # 512ch, 32p
        else:
            # Project-served backbone; mirrors the torchvision branch layout.
            base = userving.infer_model_load(*iq).eval()
            self.base_hparams = base.hparams

            self.resize = T.Resize(base.hparams.largs.size)
            self.resnet_preprocess = base.resnet_preprocess
            self.conv1 = base.resnet.conv1
            self.bn1 = base.resnet.bn1
            self.relu = base.resnet.relu  # 64ch, 128p (assuming 256p input)
            self.maxpool = base.resnet.maxpool
            self.layer1 = base.resnet.layer1  # 256ch, 64p
            self.layer2 = base.resnet.layer2  # 512ch, 32p
        if self.size_in is None:
            # Sizes will be captured lazily from the first forward pass.
            self.sizes_out = None
        else:
            # Precompute per-level output sizes: resize target s, then the
            # backbone halves resolution at strides 2 / 4 / 8.
            s = self.resize.size
            self.sizes_out = [
                pixel_ij(
                    rescale_dry(si, (s // 2) / si[0]), rounding="ceil"
                ),  # conv1, 128p
                pixel_ij(
                    rescale_dry(si, (s // 4) / si[0]), rounding="ceil"
                ),  # layer1, 64p
                pixel_ij(
                    rescale_dry(si, (s // 8) / si[0]), rounding="ceil"
                ),  # layer2, 32p
            ]
        # Channel counts of the three returned feature maps.
        self.channels = [
            64,
            256,
            512,
        ]
        return

    def forward(self, x, force_sizes_out=False, return_more=False):
        """Return [conv1, layer1, layer2] feature maps for ``x`` (RGB only).

        Args:
            x: input batch; only the first 3 channels are used.
            force_sizes_out: re-measure ``self.sizes_out`` from this pass.
            return_more: when True, also return ``locals()`` for debugging.
        """
        ans = []
        x = x[:, :3]  # drop any extra channels (e.g. an appended NEDT map)
        x = self.resize(x)
        x = self.resnet_preprocess(x)
        x = self.conv1(x)
        x = self.bn1(x)
        x = self.relu(x)
        ans.append(x)  # conv1
        x = self.maxpool(x)
        x = self.layer1(x)
        ans.append(x)  # layer1
        x = self.layer2(x)
        ans.append(x)  # layer2
        if force_sizes_out or (self.sizes_out is None):
            # Cache the actual spatial sizes of each level (side effect).
            self.sizes_out = [tuple(q.shape[-2:]) for q in ans]
        return ans, (locals() if return_more else None)
2384
+
2385
+
2386
class NetNedt(nn.Module):
    """Small CNN predicting a single-channel map in (0, 1) for the base output.

    Conditions on the base synthesis, its NEDT map, and the half-warped
    images/masks. Operates in logit space (``pixel_logit``) and squashes the
    result back with a sigmoid.

    NOTE(review): ``forward`` returns ``locals()`` when ``return_more`` is
    True, so local names here may be relied upon by callers.
    """

    def __init__(self):
        super().__init__()
        # Input channels: out_base (3) + its NEDT (1) + two half-warped
        # 4-channel images + two 1-channel coverage masks.
        chin = 3 + 1 + 4 + 4 + 1 + 1
        ch = 16
        chout = 1
        self.net = nn.Sequential(
            nn.PReLU(chin),
            nn.Conv2d(chin, ch, kernel_size=3, padding=1),
            nn.BatchNorm2d(ch),
            nn.PReLU(ch),
            nn.Conv2d(ch, ch, kernel_size=3, padding=1),
            nn.BatchNorm2d(ch),
            nn.PReLU(ch),
            nn.Conv2d(ch, chout, kernel_size=3, padding=1),
        )
        return

    def forward(self, out_base, out_base_nedt, hw_imgs, hw_masks, return_more=False):
        """Predict the refined map from the base output and warp by-products."""
        cat = torch.cat(
            [
                out_base,  # 3
                out_base_nedt,  # 1
                hw_imgs[0],  # 4
                hw_imgs[1],  # 4
                hw_masks[0],  # 1
                hw_masks[1],  # 1
            ],
            dim=1,
        )
        # Clip to [0, 1] first so pixel_logit receives valid probabilities.
        log = pixel_logit(cat.clip(0, 1))
        ans = torch.sigmoid(self.net(log))
        return ans, (locals() if return_more else None)
2419
+
2420
+
2421
class NetTail(nn.Module):
    """Residual RGB refinement head.

    Works in logit space and adds a learned correction to the base RGB before
    squashing back through a sigmoid, conditioned on the measured and
    predicted NEDT maps.

    NOTE(review): ``forward`` returns ``locals()`` when ``return_more`` is
    True, so local names here may be relied upon by callers.
    """

    def __init__(self):
        super().__init__()
        # Input channels: base RGB (3) + measured NEDT (1) + predicted NEDT (1).
        chin = 3 + 1 + 1
        ch = 16
        chout = 3
        self.net = nn.Sequential(
            nn.PReLU(chin),
            nn.Conv2d(chin, ch, kernel_size=3, padding=1),
            nn.BatchNorm2d(ch),
            nn.PReLU(ch),
            nn.Conv2d(ch, ch, kernel_size=3, padding=1),
            nn.BatchNorm2d(ch),
            nn.PReLU(ch),
            nn.Conv2d(ch, ch, kernel_size=3, padding=1),
            nn.BatchNorm2d(ch),
            nn.PReLU(ch),
            nn.Conv2d(ch, chout, kernel_size=3, padding=1),
        )
        return

    def forward(self, out_base, out_base_nedt, pred_nedt, return_more=False):
        """Refine ``out_base`` RGB; returns a tensor in (0, 1)."""
        cat = torch.cat(
            [
                out_base,  # 3
                out_base_nedt,  # 1
                pred_nedt,  # 1
            ],
            dim=1,
        )
        log = pixel_logit(cat.clip(0, 1))
        # Residual in logit space: the net only corrects the base RGB logits.
        ans = torch.sigmoid(log[:, :3] + self.net(log))
        return ans, (locals() if return_more else None)
2454
+
2455
+
2456
class SoftsplatLite(nn.Module):
    """EISAI base interpolator: softmax-splatting warp + GridNet synthesis.

    Pipeline: compute z-metrics, half-warp the images (with appended NEDT
    channel), half-warp ResNet features at three scales, fuse the warped
    features through a GridNet, and synthesize the output frame.

    NOTE(review): ``forward`` returns ``locals()`` when ``return_more`` is
    True; ``DTM`` downstream reads ``hw_imgs``/``hw_masks`` from that dict by
    name — do not rename locals here.
    """

    def __init__(self):
        super().__init__()
        # Backbone assumes 540x960 inputs for its precomputed level sizes.
        self.feature_extractor = ResnetFeatureExtractor(
            ("torchvision", "resnet50"),
            (540, 960),
        )
        self.z_metric = FlowZMetric()
        # One bilinear resizer per backbone level, to bring the full-res
        # flows/metrics down to each feature map's size.
        # NOTE: plain list (not nn.ModuleList) — these have no parameters.
        self.flow_downsamplers = [
            Interpolator(s, mode="bilinear") for s in self.feature_extractor.sizes_out
        ]
        self.gridnet_converter = GridnetConverter(
            self.feature_extractor.channels,
            [32, 64, 128],
        )
        self.gridnet = Gridnet(
            *[32, 64, 128],
            total_dropout_p=0.0,
            depth=1,  # equivalent to u-net
        )
        self.nedt = NEDT()
        self.half_warper = HalfWarper()
        self.synthesizer = Synthesizer(
            (540, 960),
            self.half_warper.channels_image,
            self.half_warper.channels_flow,
            self.half_warper.channels_mask,
            self.gridnet.channels_0,
        )
        return

    def forward(self, x, t=0.5, k=5, return_more=False):
        """Interpolate a frame at time ``t``.

        Args:
            x: dict with "images" (endpoint frames along dim 1) and "flows".
            t: interpolation time in [0, 1].
            k: mask-opening kernel size passed to the half-warper.
            return_more: when True, also return ``locals()`` for debugging.
        """
        rm = return_more
        flow0, flow1 = x["flows"].swapaxes(0, 1)
        img0, img1 = x["images"][:, 0], x["images"][:, -1]
        (z0, z1), locs_z = self.z_metric(img0, img1, flow0, flow1, return_more=rm)
        # Append the NEDT (edge distance transform) as a 4th channel.
        img0 = torch.cat([img0, self.nedt(img0)[0]], dim=1)
        img1 = torch.cat([img1, self.nedt(img1)[0]], dim=1)

        # images and flows
        (hw_imgs, hw_flows, hw_masks), locs_hw = self.half_warper(
            img0,
            img1,
            flow0,
            flow1,
            z0,
            z1,
            k,
            t=t,
            return_more=rm,
        )

        # features: warp each backbone level with appropriately resized flows,
        # then average the two directional warps.
        feats0, locs_fe0 = self.feature_extractor(img0, return_more=rm)
        feats1, locs_fe1 = self.feature_extractor(img1, return_more=rm)
        warps = []
        for ft0, ft1, ds in zip(feats0, feats1, self.flow_downsamplers):
            (w, _, _), _ = self.half_warper(
                ft0,
                ft1,
                ds(flow0, 1),
                ds(flow1, 1),
                ds(z0),
                ds(z1),
                k,
                t=t,
            )
            warps.append((w[0] + w[1]) / 2)
        feats = self.gridnet(self.gridnet_converter(warps))

        # synthesis
        pred, locs_synth = self.synthesizer(
            hw_imgs,
            hw_flows,
            hw_masks,
            [
                feats[0],  # finest GridNet level only
            ],
            return_more=rm,
        )
        return pred, (locals() if rm else None)
2537
+
2538
+
2539
class DTM(nn.Module):
    """Distance-transform-guided refinement head over a base interpolation.

    Measures the NEDT of the base output (no grad), predicts a refined NEDT,
    then refines the RGB with a tail network conditioned on both maps.
    """

    def __init__(self):
        super().__init__()
        self.net_nedt = NetNedt()
        self.net_tail = NetTail()
        self.nedt = NEDT()
        return

    def forward(self, x, out_base, locs_base, return_more=False):
        """Refine ``out_base``.

        Args:
            x: accepted for interface symmetry with the base model but unused
               in this method.
            out_base: RGB output of the base model (e.g. SoftsplatLite).
            locs_base: the base model's ``locals()`` dict; "hw_imgs" and
               "hw_masks" are read from it by name.
            return_more: when True, also return ``locals()`` for debugging.

        Returns:
            (refined RGB concatenated with predicted NEDT, locals() or None).
        """
        rm = return_more
        with torch.no_grad():
            # Measured NEDT of the base output is treated as a fixed input.
            out_base_nedt, locs_base_nedt = self.nedt(out_base, return_more=rm)
        hw_imgs, hw_masks = locs_base["hw_imgs"], locs_base["hw_masks"]
        pred_nedt, locs_nedt = self.net_nedt(
            out_base, out_base_nedt, hw_imgs, hw_masks, return_more=rm
        )
        # Detach so the tail's gradients don't flow into the NEDT predictor.
        pred, locs_tail = self.net_tail(
            out_base, out_base_nedt, pred_nedt.clone().detach(), return_more=rm
        )
        return torch.cat([pred, pred_nedt], dim=1), (locals() if rm else None)
2559
+
2560
+
2561
class RAFT(nn.Module):
    """RFR/RAFT optical-flow wrapper initialized from an AnimeInterp checkpoint.

    NOTE(review): ``forward`` returns ``locals()`` when ``return_more`` is
    True, so local names here may be relied upon by callers.
    """

    def __init__(self, path="/workspace/tensorrt/models/anime_interp_full.ckpt"):
        # path: checkpoint to load flow weights from; None skips loading.
        super().__init__()
        self.raft = RFR(
            Namespace(
                small=False,
                mixed_precision=False,
            )
        )
        if path is not None:
            # NOTE(review): torch.load unpickles arbitrary objects — only load
            # trusted checkpoint files.
            sd = torch.load(path)["model_state_dict"]
            # Keep only the flownet weights, stripping the "module.flownet."
            # DataParallel prefix; strict=False tolerates missing extras.
            self.raft.load_state_dict(
                {
                    k[len("module.flownet.") :]: v
                    for k, v in sd.items()
                    if k.startswith("module.flownet.")
                },
                strict=False,
            )
        return

    def forward(self, img0, img1, flow0=None, iters=12, return_more=False):
        """Estimate flow between ``img0`` and ``img1``.

        The flip on dim 1 reverses the two flow channels, converting between
        this module's channel convention and the underlying RFR model's
        (applied to the optional init flow on the way in and to the result on
        the way out). Note the images are passed to RFR in (img1, img0) order.
        """
        if flow0 is not None:
            flow0 = flow0.flip(dims=(1,))
        out = self.raft(img1, img0, iters=iters, flow_init=flow0)
        return out[0].flip(dims=(1,)), (locals() if return_more else None)
vfi_models/film/__init__.py ADDED
@@ -0,0 +1,113 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import torch
2
+ from comfy.model_management import get_torch_device, soft_empty_cache
3
+ import bisect
4
+ import numpy as np
5
+ import typing
6
+ from vfi_utils import InterpolationStateList, load_file_from_github_release, preprocess_frames, postprocess_frames
7
+ import pathlib
8
+ import gc
9
+
10
+ MODEL_TYPE = pathlib.Path(__file__).parent.name
11
+ DEVICE = get_torch_device()
12
def inference(model, img_batch_1, img_batch_2, inter_frames):
    """Generate ``inter_frames`` frames between two images via recursive midpointing.

    Repeatedly picks the pending timestamp closest to the midpoint of its
    surrounding already-placed frames, synthesizes it with ``model``, and
    inserts it in order. Returns all frames (endpoints included), each flipped
    along dim 0 (a no-op for batch-size-1 tensors).
    """
    frames = [
        img_batch_1,
        img_batch_2,
    ]
    placed = [0, inter_frames + 1]
    pending = list(range(1, inter_frames + 1))
    timestamps = torch.linspace(0, 1, inter_frames + 2)

    while pending:
        seg_starts = timestamps[placed[:-1]]
        seg_ends = timestamps[placed[1:]]
        # Relative position of each pending timestamp inside each placed
        # segment; pick the (segment, timestamp) pair closest to the middle.
        rel = (timestamps[None, pending] - seg_starts[:, None]) / (seg_ends[:, None] - seg_starts[:, None])
        flat_best = torch.argmin((rel - .5).abs()).item()
        seg_i, pend_i = np.unravel_index(flat_best, rel.shape)

        left = frames[seg_i].to(DEVICE)
        right = frames[seg_i + 1].to(DEVICE)
        # Fractional time of the new frame within its segment, as a (1, 1)
        # tensor on the same device/dtype as the inputs.
        dt = left.new_full((1, 1), (timestamps[pending[pend_i]] - timestamps[placed[seg_i]])) / (timestamps[placed[seg_i + 1]] - timestamps[placed[seg_i]])

        with torch.no_grad():
            midframe = model(left, right, dt)

        slot = bisect.bisect_left(placed, pending[pend_i])
        placed.insert(slot, pending[pend_i])
        frames.insert(slot, midframe.clamp(0, 1).float())
        pending.pop(pend_i)

    return [frame.flip(0) for frame in frames]
43
+
44
class FILM_VFI:
    """ComfyUI node running FILM (TorchScript) frame interpolation."""

    @classmethod
    def INPUT_TYPES(s):
        return {
            "required": {
                "ckpt_name": (["film_net_fp32.pt"], ),
                "frames": ("IMAGE", ),
                "clear_cache_after_n_frames": ("INT", {"default": 10, "min": 1, "max": 1000}),
                "multiplier": ("INT", {"default": 2, "min": 2, "max": 1000}),
            },
            "optional": {
                "optional_interpolation_states": ("INTERPOLATION_STATES", )
            }
        }
    
    RETURN_TYPES = ("IMAGE", )
    FUNCTION = "vfi"
    CATEGORY = "ComfyUI-Frame-Interpolation/VFI"

    def vfi(
        self,
        ckpt_name: typing.AnyStr,
        frames: torch.Tensor,
        clear_cache_after_n_frames = 10,
        multiplier: typing.SupportsInt = 2,
        optional_interpolation_states: InterpolationStateList = None,
        **kwargs
    ):
        """Interpolate between consecutive frames with the FILM model.

        Args:
            ckpt_name: TorchScript checkpoint to fetch/load.
            frames: input image batch (ComfyUI IMAGE tensor).
            clear_cache_after_n_frames: how many pairs to process between
                CUDA-cache clears (guards against memory overflow).
            multiplier: frames-per-input-frame; an int applies uniformly, a
                sequence gives a per-frame schedule (padded with 2).
            optional_interpolation_states: per-frame skip flags.

        Returns:
            Single-element tuple with the interpolated IMAGE batch.
        """
        interpolation_states = optional_interpolation_states
        model_path = load_file_from_github_release(MODEL_TYPE, ckpt_name)
        model = torch.jit.load(model_path, map_location='cpu')
        model.eval()
        model = model.to(DEVICE)
        dtype = torch.float32

        frames = preprocess_frames(frames)
        number_of_frames_processed_since_last_cleared_cuda_cache = 0
        output_frames = []

        # Normalize `multiplier` to a per-frame schedule.
        if type(multiplier) == int:
            multipliers = [multiplier] * len(frames)
        else:
            multipliers = list(map(int, multiplier))
            multipliers += [2] * (len(frames) - len(multipliers) - 1)
        for frame_itr in range(len(frames) - 1): # Skip the final frame since there are no frames after it
            if interpolation_states is not None and interpolation_states.is_frame_skipped(frame_itr):
                continue
            #Ensure that input frames are in fp32 - the same dtype as model
            frame_0 = frames[frame_itr:frame_itr+1].to(DEVICE).float()
            frame_1 = frames[frame_itr+1:frame_itr+2].to(DEVICE).float()
            # (sic) "relust": interpolated frames for this pair; the last one
            # is dropped since it duplicates the next pair's first frame.
            relust = inference(model, frame_0, frame_1, multipliers[frame_itr] - 1)
            output_frames.extend([frame.detach().cpu().to(dtype=dtype) for frame in relust[:-1]])

            number_of_frames_processed_since_last_cleared_cuda_cache += 1
            # Try to avoid a memory overflow by clearing cuda cache regularly
            if number_of_frames_processed_since_last_cleared_cuda_cache >= clear_cache_after_n_frames:
                print("Comfy-VFI: Clearing cache...", end = ' ')
                soft_empty_cache()
                number_of_frames_processed_since_last_cleared_cuda_cache = 0
                print("Done cache clearing")
            gc.collect()

        output_frames.append(frames[-1:].to(dtype=dtype)) # Append final frame
        output_frames = [frame.cpu() for frame in output_frames] #Ensure all frames are in cpu
        out = torch.cat(output_frames, dim=0)
        # clear cache for courtesy
        print("Comfy-VFI: Final clearing cache...", end = ' ')
        soft_empty_cache()
        print("Done cache clearing")
        return (postprocess_frames(out), )
vfi_models/film/film_arch.py ADDED
@@ -0,0 +1,798 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ https://github.com/dajes/frame-interpolation-pytorch/blob/main/feature_extractor.py
3
+ https://github.com/dajes/frame-interpolation-pytorch/blob/main/fusion.py
4
+ https://github.com/dajes/frame-interpolation-pytorch/blob/main/interpolator.py
5
+ https://github.com/dajes/frame-interpolation-pytorch/blob/main/pyramid_flow_estimator.py
6
+ https://github.com/dajes/frame-interpolation-pytorch/blob/main/util.py
7
+ """
8
+
9
+ """PyTorch layer for extracting image features for the film_net interpolator.
10
+
11
+ The feature extractor implemented here converts an image pyramid into a pyramid
12
+ of deep features. The feature pyramid serves a similar purpose as U-Net
13
+ architecture's encoder, but we use a special cascaded architecture described in
14
+ Multi-view Image Fusion [1].
15
+
16
+ For comprehensiveness, below is a short description of the idea. While the
17
+ description is a bit involved, the cascaded feature pyramid can be used just
18
+ like any image feature pyramid.
19
+
20
+ Why cascaded architeture?
21
+ =========================
22
+ To understand the concept it is worth reviewing a traditional feature pyramid
23
+ first: *A traditional feature pyramid* as in U-net or in many optical flow
24
+ networks is built by alternating between convolutions and pooling, starting
25
+ from the input image.
26
+
27
+ It is well known that early features of such architecture correspond to low
28
+ level concepts such as edges in the image whereas later layers extract
29
+ semantically higher level concepts such as object classes etc. In other words,
30
+ the meaning of the filters in each resolution level is different. For problems
31
+ such as semantic segmentation and many others this is a desirable property.
32
+
33
+ However, the asymmetric features preclude sharing weights across resolution
34
+ levels in the feature extractor itself and in any subsequent neural networks
35
+ that follow. This can be a downside, since optical flow prediction, for
36
+ instance is symmetric across resolution levels. The cascaded feature
37
+ architecture addresses this shortcoming.
38
+
39
+ How is it built?
40
+ ================
41
+ The *cascaded* feature pyramid contains feature vectors that have constant
42
+ length and meaning on each resolution level, except few of the finest ones. The
43
+ advantage of this is that the subsequent optical flow layer can learn
44
+ synergically from many resolutions. This means that coarse level prediction can
45
+ benefit from finer resolution training examples, which can be useful with
46
+ moderately sized datasets to avoid overfitting.
47
+
48
+ The cascaded feature pyramid is built by extracting shallower subtree pyramids,
49
+ each one of them similar to the traditional architecture. Each subtree
50
+ pyramid S_i is extracted starting from each resolution level:
51
+
52
+ image resolution 0 -> S_0
53
+ image resolution 1 -> S_1
54
+ image resolution 2 -> S_2
55
+ ...
56
+
57
+ If we denote the features at level j of subtree i as S_i_j, the cascaded pyramid
58
+ is constructed by concatenating features as follows (assuming subtree depth=3):
59
+
60
+ lvl
61
+ feat_0 = concat( S_0_0 )
62
+ feat_1 = concat( S_1_0 S_0_1 )
63
+ feat_2 = concat( S_2_0 S_1_1 S_0_2 )
64
+ feat_3 = concat( S_3_0 S_2_1 S_1_2 )
65
+ feat_4 = concat( S_4_0 S_3_1 S_2_2 )
66
+ feat_5 = concat( S_5_0 S_4_1 S_3_2 )
67
+ ....
68
+
69
+ In above, all levels except feat_0 and feat_1 have the same number of features
70
+ with similar semantic meaning. This enables training a single optical flow
71
+ predictor module shared by levels 2,3,4,5... . For more details and evaluation
72
+ see [1].
73
+
74
+ [1] Multi-view Image Fusion, Trinidad et al. 2019
75
+ """
76
+ from typing import List
77
+
78
+ import torch
79
+ from torch import nn
80
+ from torch.nn import functional as F
81
+
82
+
83
class SubTreeExtractor(nn.Module):
    """Plain hierarchical feature extractor, shared across pyramid levels.

    Stage i produces ``channels << i`` feature maps; stages are separated by
    2x average pooling, so together they form one "sub-pyramid" of features.
    """

    def __init__(self, in_channels=3, channels=64, n_layers=4):
        super().__init__()
        stages = []
        ch_in = in_channels
        for i in range(n_layers):
            ch_out = channels << i
            stages.append(
                nn.Sequential(
                    conv(ch_in, ch_out, 3),
                    conv(ch_out, ch_out, 3),
                )
            )
            ch_in = ch_out
        self.convs = nn.ModuleList(stages)

    def forward(self, image: torch.Tensor, n: int) -> List[torch.Tensor]:
        """Extract a pyramid of features from ``image``.

        Every stage is run and contributes one entry; the inter-stage average
        pooling is applied only before the first ``n`` levels, so ``n`` caps
        how many distinct resolutions are produced.

        Args:
            image: tensor of shape BATCH x CHANNELS x HEIGHT x WIDTH.
            n: number of pyramid levels to downscale for; at most the
                ``n_layers`` given in ``__init__``.

        Returns:
            Feature maps, finest level first (output of each stage's last
            convolution).
        """
        pyramid = []
        head = image
        for i, stage in enumerate(self.convs):
            head = stage(head)
            pyramid.append(head)
            if i + 1 < n:
                head = F.avg_pool2d(head, kernel_size=2, stride=2)
        return pyramid
122
+
123
+
124
class FeatureExtractor(nn.Module):
    """Cascaded feature-pyramid extractor (Multi-view Image Fusion style).

    One weight-shared SubTreeExtractor is run from every image-pyramid level,
    and the resulting sub-pyramids are concatenated diagonally so that all but
    the finest few cascade levels carry features of identical length and
    meaning.
    """

    def __init__(self, in_channels=3, channels=64, sub_levels=4):
        super().__init__()
        self.extract_sublevels = SubTreeExtractor(in_channels, channels, sub_levels)
        self.sub_levels = sub_levels

    def forward(self, image_pyramid: List[torch.Tensor]) -> List[torch.Tensor]:
        """Build the cascaded feature pyramid.

        Args:
            image_pyramid: images as a list, finest level first.

        Returns:
            One feature tensor per image-pyramid level, finest first.
        """
        n_levels = len(image_pyramid)

        # One sub-pyramid per image level, all sharing the same extractor
        # weights. Depth is capped so no sub-pyramid reaches below the
        # coarsest level of the final cascade.
        sub_pyramids: List[List[torch.Tensor]] = []
        for lvl in range(n_levels):
            depth = min(n_levels - lvl, self.sub_levels)
            sub_pyramids.append(self.extract_sublevels(image_pyramid[lvl], depth))

        # Diagonal concatenation: cascade level `lvl` gathers sub_pyramids
        # [lvl][0], [lvl-1][1], [lvl-2][2], ... (see module docstring layout).
        feature_pyramid: List[torch.Tensor] = []
        for lvl in range(n_levels):
            feats = sub_pyramids[lvl][0]
            for j in range(1, self.sub_levels):
                if j <= lvl:
                    feats = torch.cat([feats, sub_pyramids[lvl - j][j]], dim=1)
            feature_pyramid.append(feats)
        return feature_pyramid
163
+
164
+
165
+
166
+
167
+
168
+
169
+
170
+
171
+
172
+
173
+
174
+ """The final fusion stage for the film_net frame interpolator.
175
+
176
+ The inputs to this module are the warped input images, image features and
177
+ flow fields, all aligned to the target frame (often midway point between the
178
+ two original inputs). The output is the final image. FILM has no explicit
179
+ occlusion handling -- instead using the abovementioned information this module
180
+ automatically decides how to best blend the inputs together to produce content
181
+ in areas where the pixels can only be borrowed from one of the inputs.
182
+
183
+ Similarly, this module also decides on how much to blend in each input in case
184
+ of fractional timestep that is not at the halfway point. For example, if the two
185
+ inputs images are at t=0 and t=1, and we were to synthesize a frame at t=0.1,
186
+ it often makes most sense to favor the first input. However, this is not
187
+ always the case -- in particular in occluded pixels.
188
+
189
+ The architecture of the Fusion module follows U-net [1] architecture's decoder
190
+ side, e.g. each pyramid level consists of concatenation with upsampled coarser
191
+ level output, and two 3x3 convolutions.
192
+
193
+ The upsampling is implemented as 'resize convolution', e.g. nearest neighbor
194
+ upsampling followed by 2x2 convolution as explained in [2]. The classic U-net
195
+ uses max-pooling which has a tendency to create checkerboard artifacts.
196
+
197
+ [1] Ronneberger et al. U-Net: Convolutional Networks for Biomedical Image
198
+ Segmentation, 2015, https://arxiv.org/pdf/1505.04597.pdf
199
+ [2] https://distill.pub/2016/deconv-checkerboard/
200
+ """
201
+ from typing import List
202
+
203
+ import torch
204
+ from torch import nn
205
+ from torch.nn import functional as F
206
+
207
+
208
+ _NUMBER_OF_COLOR_CHANNELS = 3
209
+
210
+
211
def get_channels_at_level(level, filters):
    """Channel count of the aligned (fusion) pyramid at ``level``.

    Each of the two warped inputs contributes its color image, a 2-channel
    flow, and the cascaded features accumulated over the finer levels
    (``filters << i`` for each ``i < level``).
    """
    n_images = 2
    flows = 2
    cascade_features = sum(filters << i for i in range(level))

    return n_images * (cascade_features + _NUMBER_OF_COLOR_CHANNELS + flows)
217
+
218
+
219
class Fusion(nn.Module):
    """The decoder: U-Net-style fusion of the aligned feature pyramid to RGB.

    Upsampling is done as "resize convolution" (nearest-neighbor interpolate
    followed by a 2x2 conv) to avoid checkerboard artifacts.
    """

    def __init__(self, n_layers=4, specialized_layers=3, filters=64):
        """
        Args:
            n_layers: number of decoder pyramid levels.
            specialized_layers: levels below which the filter count keeps
                doubling; coarser levels all reuse the same (max) count.
            filters: base filter count at the finest level.
        """
        super().__init__()

        # The final convolution that outputs RGB:
        self.output_conv = nn.Conv2d(filters, 3, kernel_size=1)

        # Each item 'convs[i]' will contain the list of convolutions to be applied
        # for pyramid level 'i'.
        self.convs = nn.ModuleList()

        # Create the convolutions. Roughly following the feature extractor, we
        # double the number of filters when the resolution halves, but only up to
        # the specialized_levels, after which we use the same number of filters on
        # all levels.
        #
        # We create the convs in fine-to-coarse order, so that the array index
        # for the convs will correspond to our normal indexing (0=finest level).
        # in_channels: tuple = (128, 202, 256, 522, 512, 1162, 1930, 2442)

        in_channels = get_channels_at_level(n_layers, filters)
        increase = 0
        for i in range(n_layers)[::-1]:
            num_filters = (filters << i) if i < specialized_layers else (filters << specialized_layers)
            # Per level: [resize-conv after upsampling, post-concat 3x3, 3x3].
            # NOTE(review): `increase or num_filters` falls back to
            # num_filters only on the first (coarsest) iteration, when
            # increase == 0; afterwards it accounts for the skip-connection
            # channels from get_channels_at_level — confirm before changing.
            convs = nn.ModuleList([
                conv(in_channels, num_filters, size=2, activation=None),
                conv(in_channels + (increase or num_filters), num_filters, size=3),
                conv(num_filters, num_filters, size=3)]
            )
            self.convs.append(convs)
            in_channels = num_filters
            increase = get_channels_at_level(i, filters) - num_filters // 2

    def forward(self, pyramid: List[torch.Tensor]) -> torch.Tensor:
        """Runs the fusion module.

        Args:
            pyramid: The input feature pyramid as list of tensors. Each tensor being
            in (B x H x W x C) format, with finest level tensor first.

        Returns:
            A batch of RGB images.
        Raises:
            ValueError, if len(pyramid) != config.fusion_pyramid_levels as provided in
            the constructor.
        """

        # As a slight difference to a conventional decoder (e.g. U-net), we don't
        # apply any extra convolutions to the coarsest level, but just pass it
        # to finer levels for concatenation. This choice has not been thoroughly
        # evaluated, but is motivated by the educated guess that the fusion part
        # probably does not need large spatial context, because at this point the
        # features are spatially aligned by the preceding warp.
        net = pyramid[-1]

        # Loop starting from the 2nd coarsest level:
        # for i in reversed(range(0, len(pyramid) - 1)):
        for k, layers in enumerate(self.convs):
            # self.convs was built coarse-to-fine, so map its index back onto
            # the pyramid's fine-first indexing.
            i = len(self.convs) - 1 - k
            # Resize the tensor from coarser level to match for concatenation.
            level_size = pyramid[i].shape[2:4]
            net = F.interpolate(net, size=level_size, mode='nearest')
            net = layers[0](net)
            net = torch.cat([pyramid[i], net], dim=1)
            net = layers[1](net)
            net = layers[2](net)
        net = self.output_conv(net)
        return net
293
+
294
+
295
+
296
+
297
+
298
+
299
+
300
+
301
+
302
+
303
+
304
+ """The film_net frame interpolator main model code.
305
+
306
+ Basics
307
+ ======
308
+ The film_net is an end-to-end learned neural frame interpolator implemented as
309
+ a PyTorch model. It has the following inputs and outputs:
310
+
311
+ Inputs:
312
+ x0: image A.
313
+ x1: image B.
314
+ time: desired sub-frame time.
315
+
316
+ Outputs:
317
+ image: the predicted in-between image at the chosen time in range [0, 1].
318
+
319
+ Additional outputs include forward and backward warped image pyramids, flow
320
+ pyramids, etc., that can be visualized for debugging and analysis.
321
+
322
+ Note that many training sets only contain triplets with ground truth at
323
+ time=0.5. If a model has been trained with such training set, it will only work
324
+ well for synthesizing frames at time=0.5. Such models can only generate more
325
+ in-between frames using recursion.
326
+
327
+ Architecture
328
+ ============
329
+ The inference consists of three main stages: 1) feature extraction 2) warping
330
+ 3) fusion. On high-level, the architecture has similarities to Context-aware
331
+ Synthesis for Video Frame Interpolation [1], but the exact architecture is
332
+ closer to Multi-view Image Fusion [2] with some modifications for the frame
333
+ interpolation use-case.
334
+
335
+ Feature extraction stage employs the cascaded multi-scale architecture described
336
+ in [2]. The advantage of this architecture is that coarse level flow prediction
337
+ can be learned from finer resolution image samples. This is especially useful
338
+ to avoid overfitting with moderately sized datasets.
339
+
340
+ The warping stage uses a residual flow prediction idea that is similar to
341
+ PWC-Net [3], Multi-view Image Fusion [2] and many others.
342
+
343
+ The fusion stage is similar to U-Net's decoder where the skip connections are
344
+ connected to warped image and feature pyramids. This is described in [2].
345
+
346
+ Implementation Conventions
347
+ ====================
348
+ Pyramids
349
+ --------
350
+ Throughtout the model, all image and feature pyramids are stored as python lists
351
+ with finest level first followed by downscaled versions obtained by successively
352
+ halving the resolution. The depths of all pyramids are determined by
353
+ options.pyramid_levels. The only exception to this is internal to the feature
354
+ extractor, where smaller feature pyramids are temporarily constructed with depth
355
+ options.sub_levels.
356
+
357
+ Color ranges & gamma
358
+ --------------------
359
+ The model code makes no assumptions on whether the images are in gamma or
360
+ linearized space or what is the range of RGB color values. So a model can be
361
+ trained with different choices. This does not mean that all the choices lead to
362
+ similar results. In practice the model has been proven to work well with RGB
363
+ scale = [0,1] with gamma-space images (i.e. not linearized).
364
+
365
+ [1] Context-aware Synthesis for Video Frame Interpolation, Niklaus and Liu, 2018
366
+ [2] Multi-view Image Fusion, Trinidad et al, 2019
367
+ [3] PWC-Net: CNNs for Optical Flow Using Pyramid, Warping, and Cost Volume
368
+ """
369
+ from typing import Dict, List
370
+
371
+ import torch
372
+ from torch import nn
373
+
374
+
375
+
376
class Interpolator(nn.Module):
    """End-to-end FILM frame interpolator: extract features, warp, fuse.

    Given two frames and a sub-frame time, predicts the in-between image.
    NOTE(review): the forward path substitutes a constant t=0.5 regardless of
    ``batch_dt`` (see mid_time below) — multi-frame interpolation is done by
    recursion, not by varying t.
    """

    def __init__(
        self,
        pyramid_levels=7,
        fusion_pyramid_levels=5,
        specialized_levels=3,
        sub_levels=4,
        filters=64,
        flow_convs=(3, 3, 3, 3),
        flow_filters=(32, 64, 128, 256),
    ):
        super().__init__()
        self.pyramid_levels = pyramid_levels
        self.fusion_pyramid_levels = fusion_pyramid_levels

        self.extract = FeatureExtractor(3, filters, sub_levels)
        self.predict_flow = PyramidFlowEstimator(filters, flow_convs, flow_filters)
        self.fuse = Fusion(sub_levels, specialized_levels, filters)

    def shuffle_images(self, x0, x1):
        # Build one image pyramid per input frame (finest level first).
        return [
            build_image_pyramid(x0, self.pyramid_levels),
            build_image_pyramid(x1, self.pyramid_levels)
        ]

    def debug_forward(self, x0, x1, batch_dt) -> Dict[str, List[torch.Tensor]]:
        """Run the full pipeline, returning intermediates for inspection.

        Returns a dict with the predicted 'image' plus forward/backward
        residual-flow and flow pyramids.
        """
        image_pyramids = self.shuffle_images(x0, x1)

        # Siamese feature pyramids:
        feature_pyramids = [self.extract(image_pyramids[0]), self.extract(image_pyramids[1])]

        # Predict forward flow.
        forward_residual_flow_pyramid = self.predict_flow(feature_pyramids[0], feature_pyramids[1])

        # Predict backward flow.
        backward_residual_flow_pyramid = self.predict_flow(feature_pyramids[1], feature_pyramids[0])

        # Concatenate features and images:

        # Note that we keep up to 'fusion_pyramid_levels' levels as only those
        # are used by the fusion module.

        forward_flow_pyramid = flow_pyramid_synthesis(forward_residual_flow_pyramid)[:self.fusion_pyramid_levels]

        backward_flow_pyramid = flow_pyramid_synthesis(backward_residual_flow_pyramid)[:self.fusion_pyramid_levels]

        # We multiply the flows with t and 1-t to warp to the desired fractional time.
        #
        # Note: In film_net we fix time to be 0.5, and recursively invoke the interpo-
        # lator for multi-frame interpolation. Below, we create a constant tensor of
        # shape [B]. We use the `time` tensor to infer the batch size.
        mid_time = torch.full_like(batch_dt, .5)
        backward_flow = multiply_pyramid(backward_flow_pyramid, mid_time[:, 0])
        forward_flow = multiply_pyramid(forward_flow_pyramid, 1 - mid_time[:, 0])

        pyramids_to_warp = [
            concatenate_pyramids(image_pyramids[0][:self.fusion_pyramid_levels],
                                 feature_pyramids[0][:self.fusion_pyramid_levels]),
            concatenate_pyramids(image_pyramids[1][:self.fusion_pyramid_levels],
                                 feature_pyramids[1][:self.fusion_pyramid_levels])
        ]

        # Warp features and images using the flow. Note that we use backward warping
        # and backward flow is used to read from image 0 and forward flow from
        # image 1.
        forward_warped_pyramid = pyramid_warp(pyramids_to_warp[0], backward_flow)
        backward_warped_pyramid = pyramid_warp(pyramids_to_warp[1], forward_flow)

        # Aligned pyramid layout per level: warped images+features from both
        # directions, then both flows (matches Fusion's expected channels).
        aligned_pyramid = concatenate_pyramids(forward_warped_pyramid,
                                               backward_warped_pyramid)
        aligned_pyramid = concatenate_pyramids(aligned_pyramid, backward_flow)
        aligned_pyramid = concatenate_pyramids(aligned_pyramid, forward_flow)

        return {
            'image': [self.fuse(aligned_pyramid)],
            'forward_residual_flow_pyramid': forward_residual_flow_pyramid,
            'backward_residual_flow_pyramid': backward_residual_flow_pyramid,
            'forward_flow_pyramid': forward_flow_pyramid,
            'backward_flow_pyramid': backward_flow_pyramid,
        }


    def forward(self, x0, x1, batch_dt) -> torch.Tensor:
        # Thin wrapper returning only the predicted frame.
        return self.debug_forward(x0, x1, batch_dt)['image'][0]
460
+
461
+
462
+
463
+
464
+
465
+
466
+
467
+
468
+
469
+
470
+ """PyTorch layer for estimating optical flow by a residual flow pyramid.
471
+
472
+ This approach of estimating optical flow between two images can be traced back
473
+ to [1], but is also used by later neural optical flow computation methods such
474
+ as SpyNet [2] and PWC-Net [3].
475
+
476
+ The basic idea is that the optical flow is first estimated in a coarse
477
+ resolution, then the flow is upsampled to warp the higher resolution image and
478
+ then a residual correction is computed and added to the estimated flow. This
479
+ process is repeated in a pyramid on coarse to fine order to successively
480
+ increase the resolution of both optical flow and the warped image.
481
+
482
+ In here, the optical flow predictor is used as an internal component for the
483
+ film_net frame interpolator, to warp the two input images into the inbetween,
484
+ target frame.
485
+
486
+ [1] F. Glazer, Hierarchical motion detection. PhD thesis, 1987.
487
+ [2] A. Ranjan and M. J. Black, Optical Flow Estimation using a Spatial Pyramid
488
+ Network. 2016
489
+ [3] D. Sun X. Yang, M-Y. Liu and J. Kautz, PWC-Net: CNNs for Optical Flow Using
490
+ Pyramid, Warping, and Cost Volume, 2017
491
+ """
492
+ from typing import List
493
+
494
+ import torch
495
+ from torch import nn
496
+ from torch.nn import functional as F
497
+
498
+
499
+
500
class FlowEstimator(nn.Module):
    """Small receptive-field predictor of flow between two feature maps.

    Used by PyramidFlowEstimator to compute the per-level residual flow.
    A configurable stack of 3x3 convolutions is followed by two 1x1
    convolutions that reduce the features down to the 2-channel flow.

    Args:
        in_channels: channel count of the concatenated feature pair.
        num_convs: number of 3x3 convolutions to apply.
        num_filters: number of filters in each 3x3 convolution.
    """

    def __init__(self, in_channels: int, num_convs: int, num_filters: int):
        super(FlowEstimator, self).__init__()

        self._convs = nn.ModuleList()
        channels = in_channels
        for _ in range(num_convs):
            self._convs.append(conv(in_channels=channels, out_channels=num_filters, size=3))
            channels = num_filters
        self._convs.append(conv(channels, num_filters // 2, size=1))
        channels = num_filters // 2
        # For the final convolution, we want no activation at all to predict the
        # optical flow vector values. We have done extensive testing on explicitly
        # bounding these values using sigmoid, but it turned out that having no
        # activation gives better results.
        self._convs.append(conv(channels, 2, size=1, activation=None))

    def forward(self, features_a: torch.Tensor, features_b: torch.Tensor) -> torch.Tensor:
        """Estimates optical flow between two feature maps.

        Args:
            features_a: per-pixel feature map for image A (B x C x H x W).
            features_b: per-pixel feature map for image B (B x C x H x W).

        Returns:
            A (B x 2 x H x W) tensor with the optical flow from A to B.
        """
        net = torch.cat([features_a, features_b], dim=1)
        for layer in self._convs:
            net = layer(net)
        return net
544
+
545
+
546
class PyramidFlowEstimator(nn.Module):
    """Predicts optical flow by coarse-to-fine refinement.

    The finest pyramid levels each get their own specialized FlowEstimator
    (``self._predictors``); all coarser levels share a single estimator
    (``self._predictor``).
    """

    def __init__(self, filters: int = 64,
                 flow_convs: tuple = (3, 3, 3, 3),
                 flow_filters: tuple = (32, 64, 128, 256)):
        super(PyramidFlowEstimator, self).__init__()

        # Each estimator sees the concatenation of both images' features,
        # hence twice the base filter count at the finest level.
        in_channels = filters << 1
        predictors = []
        for i in range(len(flow_convs)):
            predictors.append(
                FlowEstimator(
                    in_channels=in_channels,
                    num_convs=flow_convs[i],
                    num_filters=flow_filters[i]))
            # Feature width grows at each coarser level of the cascaded
            # feature extractor, so the input channel count grows with it.
            in_channels += filters << (i + 2)
        # Shared estimator used for every level coarser than the
        # specialized ones.
        self._predictor = predictors[-1]
        # Specialized estimators, reversed so index 0 corresponds to the
        # coarsest of the specialized levels (matches forward()'s indexing).
        self._predictors = nn.ModuleList(predictors[:-1][::-1])

    def forward(self, feature_pyramid_a: List[torch.Tensor],
                feature_pyramid_b: List[torch.Tensor]) -> List[torch.Tensor]:
        """Estimates residual flow pyramids between two image pyramids.

        Each image pyramid is represented as a list of tensors in fine-to-coarse
        order. Each individual image is represented as a tensor where each pixel is
        a vector of image features.

        flow_pyramid_synthesis can be used to convert the residual flow
        pyramid returned by this method into a flow pyramid, where each level
        encodes the flow instead of a residual correction.

        Args:
          feature_pyramid_a: image pyramid as a list in fine-to-coarse order
          feature_pyramid_b: image pyramid as a list in fine-to-coarse order

        Returns:
          List of flow tensors, in fine-to-coarse order, each level encoding the
          difference against the bilinearly upsampled version from the coarser
          level. The coarsest flow tensor, e.g. the last element in the array is the
          'DC-term', e.g. not a residual (alternatively you can think of it being a
          residual against zero).
        """
        levels = len(feature_pyramid_a)
        # Coarsest level: direct flow prediction (no upsampled prior to add).
        v = self._predictor(feature_pyramid_a[-1], feature_pyramid_b[-1])
        residuals = [v]
        # Coarse levels beyond the specialized ones: refine with the shared
        # predictor.
        for i in range(levels - 2, len(self._predictors) - 1, -1):
            # Upsamples the flow to match the current pyramid level. Also, scales the
            # magnitude by two to reflect the new size.
            level_size = feature_pyramid_a[i].shape[2:4]
            v = F.interpolate(2 * v, size=level_size, mode='bilinear')
            # Warp feature_pyramid_b[i] image based on the current flow estimate.
            warped = warp(feature_pyramid_b[i], v)
            # Estimate the residual flow between pyramid_a[i] and warped image:
            v_residual = self._predictor(feature_pyramid_a[i], warped)
            residuals.insert(0, v_residual)
            v = v_residual + v

        # Finest levels: same refinement loop, but each level uses its own
        # specialized predictor.
        for k, predictor in enumerate(self._predictors):
            i = len(self._predictors) - 1 - k
            # Upsamples the flow to match the current pyramid level. Also, scales the
            # magnitude by two to reflect the new size.
            level_size = feature_pyramid_a[i].shape[2:4]
            v = F.interpolate(2 * v, size=level_size, mode='bilinear')
            # Warp feature_pyramid_b[i] image based on the current flow estimate.
            warped = warp(feature_pyramid_b[i], v)
            # Estimate the residual flow between pyramid_a[i] and warped image:
            v_residual = predictor(feature_pyramid_a[i], warped)
            residuals.insert(0, v_residual)
            v = v_residual + v
        return residuals
618
+
619
+
620
+
621
+
622
+
623
+
624
+
625
+
626
+
627
+
628
+ """Various utilities used in the film_net frame interpolator model."""
629
+ from typing import List, Optional
630
+
631
+ import cv2
632
+ import numpy as np
633
+ import torch
634
+ from torch import nn
635
+ from torch.nn import functional as F
636
+
637
+
638
def pad_batch(batch, align):
    """Zero-pad an NHWC numpy batch so H and W are multiples of `align`.

    Padding is split as evenly as possible (extra pixel goes to the
    bottom/right).

    Args:
        batch: numpy array of shape (N, H, W, C).
        align: required divisor for the spatial dimensions.

    Returns:
        (padded_batch, crop_region) where crop_region is
        [top, left, bottom, right] locating the original image inside the
        padded batch.
    """
    height, width = batch.shape[1:3]
    pad_h = (-height) % align
    pad_w = (-width) % align
    top, left = pad_h // 2, pad_w // 2

    crop_region = [top, left, height + top, width + left]
    padded = np.pad(
        batch,
        ((0, 0), (top, pad_h - top), (left, pad_w - left), (0, 0)),
        mode='constant')
    return padded, crop_region
647
+
648
+
649
def load_image(path, align=64):
    """Load an image as a float32 RGB batch in [0, 1], padded to `align`.

    Returns:
        (image_batch, crop_region): batch of shape (1, H', W', 3) and the
        crop region of the original image inside it (see pad_batch).
    """
    bgr = cv2.imread(path)
    rgb = cv2.cvtColor(bgr, cv2.COLOR_BGR2RGB)
    image = rgb.astype(np.float32) / np.float32(255)
    return pad_batch(image[np.newaxis, ...], align)
653
+
654
+
655
def build_image_pyramid(image: torch.Tensor, pyramid_levels: int = 3) -> List[torch.Tensor]:
    """Build an image pyramid by repeated 2x average pooling.

    The original image is level 0; each subsequent level halves the spatial
    resolution.

    Args:
        image: the input image batch (B x C x H x W).
        pyramid_levels: number of pyramid levels to produce.

    Returns:
        A list of `pyramid_levels` images, finest first.
    """
    if pyramid_levels <= 0:
        return []
    pyramid = [image]
    for _ in range(pyramid_levels - 1):
        image = F.avg_pool2d(image, 2, 2)
        pyramid.append(image)
    return pyramid
675
+
676
+
677
def warp(image: torch.Tensor, flow: torch.Tensor) -> torch.Tensor:
    """Backward warps the image using the given flow.

    Specifically, the output pixel in batch b, at position x, y will be computed
    as follows:
      (flowed_y, flowed_x) = (y+flow[b, y, x, 1], x+flow[b, y, x, 0])
      output[b, y, x] = bilinear_lookup(image, b, flowed_y, flowed_x)

    Note that the flow vectors are expected as [x, y], e.g. x in position 0 and
    y in position 1.

    NOTE(review): despite the BxHxWxC wording above (kept from the TF
    original), the code below is channel-first: `image` is (B, C, H, W) and
    `flow` is (B, 2, H, W) — it flips dim 1 and permutes to build the
    grid_sample grid. Confirm against callers before relying on the layout
    described in the docstring.

    Args:
      image: An image with shape BxHxWxC.
      flow: A flow with shape BxHxWx2, with the two channels denoting the relative
        offset in order: (dx, dy).
    Returns:
      A warped image.
    """
    # Negate and swap the two flow channels: grid_sample reads *from* the
    # sampled location, and the grid below is assembled in (x, y) order.
    flow = -flow.flip(1)

    dtype = flow.dtype
    device = flow.device

    # warped = tfa_image.dense_image_warp(image, flow)
    # Same as above but with pytorch
    # Endpoints of an align_corners=False identity grid in [-1, 1]:
    # pixel centers sit at +-(1 - 1/size).
    ls1 = 1 - 1 / flow.shape[3]
    ls2 = 1 - 1 / flow.shape[2]

    # Convert pixel-unit offsets into normalized [-1, 1] grid offsets.
    normalized_flow2 = flow.permute(0, 2, 3, 1) / torch.tensor(
        [flow.shape[2] * .5, flow.shape[3] * .5], dtype=dtype, device=device)[None, None, None]
    # Identity sampling grid minus the normalized flow, stacked as (x, y).
    normalized_flow2 = torch.stack([
        torch.linspace(-ls1, ls1, flow.shape[3], dtype=dtype, device=device)[None, None, :] - normalized_flow2[..., 1],
        torch.linspace(-ls2, ls2, flow.shape[2], dtype=dtype, device=device)[None, :, None] - normalized_flow2[..., 0],
    ], dim=3)

    padding_mode = "border"
    if device.type == "mps":
        # https://github.com/pytorch/pytorch/issues/125098
        # MPS lacks 'border' padding; clamping the grid to [-1, 1] with
        # 'zeros' padding approximates it.
        padding_mode = "zeros"
        normalized_flow2 = normalized_flow2.clamp(-1, 1)
    warped = F.grid_sample(
        input=image,
        grid=normalized_flow2,
        mode='bilinear',
        padding_mode=padding_mode,
        align_corners=False,
    )
    return warped.reshape(image.shape)
725
+
726
+
727
def multiply_pyramid(pyramid: List[torch.Tensor],
                     scalar: torch.Tensor) -> List[torch.Tensor]:
    """Scale every level of the pyramid by `scalar`.

    Args:
        pyramid: pyramid of image batches.
        scalar: batch of scalars, broadcast against each level via the
            normal tensor broadcasting rules.

    Returns:
        A new pyramid with every level multiplied by `scalar`.
    """
    return [level * scalar for level in pyramid]
743
+
744
+
745
def flow_pyramid_synthesis(
    residual_pyramid: List[torch.Tensor]) -> List[torch.Tensor]:
    """Accumulate a residual flow pyramid into an absolute flow pyramid.

    The coarsest residual is taken as the flow directly; each finer level
    adds its residual to the upsampled (and magnitude-doubled) coarser flow.
    """
    coarse_to_fine = residual_pyramid[::-1]
    flow = coarse_to_fine[0]
    flow_pyramid: List[torch.Tensor] = [flow]
    for residual_flow in coarse_to_fine[1:]:
        target_size = residual_flow.shape[2:4]
        upsampled = F.interpolate(2 * flow, size=target_size, mode='bilinear')
        flow = residual_flow + upsampled
        flow_pyramid.insert(0, flow)
    return flow_pyramid
756
+
757
+
758
def pyramid_warp(feature_pyramid: List[torch.Tensor],
                 flow_pyramid: List[torch.Tensor]) -> List[torch.Tensor]:
    """Backward-warp each pyramid level with its matching flow field.

    Args:
        feature_pyramid: feature pyramid starting from the finest level.
        flow_pyramid: flow fields, starting from the finest level.

    Returns:
        The backward-warped feature pyramid.
    """
    return [warp(features, flow)
            for features, flow in zip(feature_pyramid, flow_pyramid)]
773
+
774
+
775
def concatenate_pyramids(pyramid1: List[torch.Tensor],
                         pyramid2: List[torch.Tensor]) -> List[torch.Tensor]:
    """Concatenate matching levels of two pyramids along channels (dim 1)."""
    return [torch.cat(pair, dim=1) for pair in zip(pyramid1, pyramid2)]
782
+
783
+
784
def conv(in_channels, out_channels, size, activation: Optional[str] = 'relu'):
    """Create a 'same'-padded Conv2d, optionally followed by LeakyReLU(0.2).

    Since PyTorch has no built-in activation inside Conv2d, the activated
    variant is returned as an nn.Sequential of the conv and LeakyReLU.

    Args:
        in_channels: input channel count.
        out_channels: output channel count.
        size: square kernel size.
        activation: 'relu' (actually LeakyReLU, matching the reference
            implementation) or None for a linear convolution.
    """
    layer = nn.Conv2d(
        in_channels=in_channels,
        out_channels=out_channels,
        kernel_size=size,
        padding='same')
    if activation is None:
        return layer
    assert activation == 'relu'
    return nn.Sequential(layer, nn.LeakyReLU(.2))
vfi_models/flavr/__init__.py ADDED
@@ -0,0 +1,115 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import torch
2
+ from comfy.model_management import get_torch_device, soft_empty_cache
3
+ import numpy as np
4
+ import typing
5
+ from vfi_utils import InterpolationStateList, load_file_from_github_release, preprocess_frames, postprocess_frames, assert_batch_size
6
+ import pathlib
7
+ import warnings
8
+ from .flavr_arch import UNet_3D_3D, InputPadder
9
+ import gc
10
+
11
+ device = get_torch_device()
12
+ NBR_FRAME = 4
13
+
14
def build_flavr(model_path):
    """Load a FLAVR checkpoint and return the model on `device`, in eval mode.

    The checkpoint stores a DataParallel state dict, so the "module." prefix
    is stripped from every key before loading. The interpolation factor
    (n_outputs) is recovered from the final conv layer's weight shape.
    """
    # map_location='cpu' avoids a hard failure when the checkpoint was saved
    # on a CUDA device but this machine is CPU/MPS-only; the model is moved
    # to the target device right after loading anyway.
    sd = torch.load(model_path, map_location='cpu')['state_dict']
    sd = {k.partition("module.")[-1]: v for k, v in sd.items()}

    # Ref: class UNet_3D_3D
    model = UNet_3D_3D("unet_18", n_inputs=NBR_FRAME,
                       n_outputs=sd["outconv.1.weight"].shape[0] // 3,
                       joinType="concat", upmode="transpose")
    model.load_state_dict(sd)
    model.to(device).eval()
    del sd
    return model
24
+
25
+ MODEL_TYPE = pathlib.Path(__file__).parent.name
26
+ CKPT_NAMES = ["FLAVR_2x.pth", "FLAVR_4x.pth", "FLAVR_8x.pth"]
27
+
28
class FLAVR_VFI:
    """ComfyUI node: FLAVR 2x video frame interpolation.

    FLAVR consumes a sliding window of 4 frames and synthesizes the frame
    between the two middle ones, so the input batch must hold at least 4
    frames.
    """
    @classmethod
    def INPUT_TYPES(s):
        return {
            "required": {
                "ckpt_name": (CKPT_NAMES, ),
                "frames": ("IMAGE", ),
                "clear_cache_after_n_frames": ("INT", {"default": 10, "min": 1, "max": 1000}),
                "multiplier": ("INT", {"default": 2, "min": 2, "max": 2}), #TODO: Implement recursively invoking interpolator for multi-frame interpolation
                "duplicate_first_last_frames": ("BOOLEAN", {"default": False})
            },
            "optional": {
                "optional_interpolation_states": ("INTERPOLATION_STATES", )
            }
        }

    RETURN_TYPES = ("IMAGE", )
    FUNCTION = "vfi"
    CATEGORY = "ComfyUI-Frame-Interpolation/VFI"

    #Reference: https://github.com/danier97/ST-MFNet/blob/main/interpolate_yuv.py#L93
    def vfi(
        self,
        ckpt_name: typing.AnyStr,
        frames: torch.Tensor,
        clear_cache_after_n_frames = 10,
        multiplier: typing.SupportsInt = 2,
        duplicate_first_last_frames: bool = False,
        optional_interpolation_states: InterpolationStateList = None,
        **kwargs
    ):
        """Insert one interpolated frame between every adjacent frame pair.

        Returns:
            Single-element tuple holding the interleaved frame batch
            (original frames with synthesized frames in between).
        """
        if multiplier != 2:
            warnings.warn("Currently, FLAVR only supports 2x interpolation. The process will continue but please set multiplier=2 afterward")

        # Bugfix: the error message previously claimed "ST-MFNet" (copy-paste
        # from the ST-MFNet node); this node is FLAVR.
        assert_batch_size(frames, batch_size=4, vfi_name="FLAVR")
        interpolation_states = optional_interpolation_states
        model_path = load_file_from_github_release(MODEL_TYPE, ckpt_name)
        model = build_flavr(model_path)
        frames = preprocess_frames(frames)
        # Pad spatial dims to a multiple of 16 for the encoder's downsampling.
        padder = InputPadder(frames.shape, 16)
        frames = padder.pad(frames)

        number_of_frames_processed_since_last_cleared_cuda_cache = 0
        output_frames = []
        for frame_itr in range(len(frames) - 3):
            #Does skipping frame i+1 make sense in this case?
            if interpolation_states is not None and interpolation_states.is_frame_skipped(frame_itr) and interpolation_states.is_frame_skipped(frame_itr + 1):
                continue

            #Ensure that input frames are in fp32 - the same dtype as model
            frame0, frame1, frame2, frame3 = (
                frames[frame_itr:frame_itr+1].float(),
                frames[frame_itr+1:frame_itr+2].float(),
                frames[frame_itr+2:frame_itr+3].float(),
                frames[frame_itr+3:frame_itr+4].float()
            )
            # FLAVR predicts the frame between frame1 and frame2.
            new_frame = model([frame0.to(device), frame1.to(device), frame2.to(device), frame3.to(device)])[0].detach().cpu()
            number_of_frames_processed_since_last_cleared_cuda_cache += 2

            if frame_itr == 0:
                output_frames.append(frame0)
                if duplicate_first_last_frames:
                    output_frames.append(frame0) # repeat the first frame
                output_frames.append(frame1)
            output_frames.append(new_frame)
            output_frames.append(frame2)
            if frame_itr == len(frames) - 4:
                output_frames.append(frame3)
                if duplicate_first_last_frames:
                    output_frames.append(frame3) # repeat the last frame

            # Try to avoid a memory overflow by clearing cuda cache regularly
            if number_of_frames_processed_since_last_cleared_cuda_cache >= clear_cache_after_n_frames:
                print("Comfy-VFI: Clearing cache...", end = ' ')
                soft_empty_cache()
                number_of_frames_processed_since_last_cleared_cuda_cache = 0
                print("Done cache clearing")
            gc.collect()

        dtype = torch.float32
        output_frames = [frame.cpu().to(dtype=dtype) for frame in output_frames] #Ensure all frames are in cpu
        out = torch.cat(output_frames, dim=0)
        out = padder.unpad(out)
        # clear cache for courtesy
        print("Comfy-VFI: Final clearing cache...", end=' ')
        soft_empty_cache()
        print("Done cache clearing")
        return (postprocess_frames(out), )
vfi_models/flavr/flavr_arch.py ADDED
@@ -0,0 +1,217 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ https://github.com/tarun005/FLAVR/blob/main/model/FLAVR_arch.py
3
+ https://github.com/tarun005/FLAVR/blob/main/model/resnet_3D.py (only SEGating)
4
+ """
5
+ import math
6
+ import numpy as np
7
+ import importlib
8
+
9
+ import torch
10
+ import torch.nn as nn
11
+ import torch.nn.functional as F
12
+
13
class SEGating(nn.Module):
    """Squeeze-and-excitation style gating over 3D feature maps.

    Globally average-pools the input, derives per-channel gates with a
    1x1x1 convolution followed by a sigmoid, and rescales the input
    channel-wise.
    """

    def __init__(self, inplanes, reduction=16):
        super().__init__()
        # `reduction` is kept for interface compatibility; the gate here is
        # a single full-width convolution rather than a bottleneck.
        self.pool = nn.AdaptiveAvgPool3d(1)
        self.attn_layer = nn.Sequential(
            nn.Conv3d(inplanes, inplanes, kernel_size=1, stride=1, bias=True),
            nn.Sigmoid(),
        )

    def forward(self, x):
        gates = self.attn_layer(self.pool(x))
        return x * gates
30
+
31
def joinTensors(X1, X2, type="concat"):
    """Combine two tensors: channel-concat, elementwise add, or pass-through.

    `type` selects the operation: "concat" concatenates along dim 1, "add"
    sums elementwise, and any other value returns X1 unchanged. (The
    parameter name shadows the builtin `type` but is kept because callers
    pass it as a keyword argument.)
    """
    if type == "concat":
        return torch.cat([X1, X2], dim=1)
    if type == "add":
        return X1 + X2
    return X1
39
+
40
+
41
class Conv_2d(nn.Module):
    """2D convolution with an optional trailing BatchNorm2d."""

    def __init__(self, in_ch, out_ch, kernel_size, stride=1, padding=0, bias=False, batchnorm=False):
        super().__init__()
        layers = [nn.Conv2d(in_ch, out_ch, kernel_size=kernel_size,
                            stride=stride, padding=padding, bias=bias)]
        if batchnorm:
            layers.append(nn.BatchNorm2d(out_ch))
        self.conv = nn.Sequential(*layers)

    def forward(self, x):
        return self.conv(x)
56
+
57
class upConv3D(nn.Module):
    """3D 2x spatial upsampling block followed by SEGating.

    Two variants selected by `upmode`: "transpose" uses a single
    ConvTranspose3d; anything else uses trilinear upsampling (spatial only)
    followed by a 1x1x1 convolution. An optional BatchNorm3d is appended.
    """

    def __init__(self, in_ch, out_ch, kernel_size, stride, padding, upmode="transpose", batchnorm=False):
        super().__init__()

        self.upmode = upmode

        if self.upmode == "transpose":
            layers = [
                nn.ConvTranspose3d(in_ch, out_ch, kernel_size=kernel_size,
                                   stride=stride, padding=padding),
                SEGating(out_ch),
            ]
        else:
            layers = [
                nn.Upsample(mode='trilinear', scale_factor=(1, 2, 2), align_corners=False),
                nn.Conv3d(in_ch, out_ch, kernel_size=1, stride=1),
                SEGating(out_ch),
            ]

        if batchnorm:
            layers.append(nn.BatchNorm3d(out_ch))

        self.upconv = nn.Sequential(*layers)

    def forward(self, x):
        return self.upconv(x)
88
+
89
class Conv_3d(nn.Module):
    """3D convolution followed by SEGating, with an optional BatchNorm3d."""

    def __init__(self, in_ch, out_ch, kernel_size, stride=1, padding=0, bias=True, batchnorm=False):
        super().__init__()
        layers = [
            nn.Conv3d(in_ch, out_ch, kernel_size=kernel_size,
                      stride=stride, padding=padding, bias=bias),
            SEGating(out_ch),
        ]
        if batchnorm:
            layers.append(nn.BatchNorm3d(out_ch))
        self.conv = nn.Sequential(*layers)

    def forward(self, x):
        return self.conv(x)
106
+
107
class upConv2D(nn.Module):
    """2D 2x upsampling block.

    `upmode="transpose"` uses a single ConvTranspose2d; anything else uses
    bilinear 2x upsampling followed by a 1x1 convolution. An optional
    BatchNorm2d is appended.
    """

    def __init__(self, in_ch, out_ch, kernel_size, stride, padding, upmode="transpose", batchnorm=False):
        super().__init__()

        self.upmode = upmode

        if self.upmode == "transpose":
            layers = [nn.ConvTranspose2d(in_ch, out_ch, kernel_size=kernel_size,
                                         stride=stride, padding=padding)]
        else:
            layers = [
                nn.Upsample(mode='bilinear', scale_factor=2, align_corners=False),
                nn.Conv2d(in_ch, out_ch, kernel_size=1, stride=1),
            ]

        if batchnorm:
            layers.append(nn.BatchNorm2d(out_ch))

        self.upconv = nn.Sequential(*layers)

    def forward(self, x):
        return self.upconv(x)
132
+
133
+
134
class UNet_3D_3D(nn.Module):
    """3D U-Net used by FLAVR: encodes a clip of frames, decodes n_outputs frames.

    Args:
        block: name of an encoder builder in resnet_3D (e.g. "unet_18").
        n_inputs: number of input frames (stacked along the temporal axis).
        n_outputs: number of frames to synthesize (2x -> 1, 4x -> 3, ...).
        batchnorm: whether to insert batch-norm layers throughout.
        joinType: skip-connection fusion mode, see joinTensors ("concat"/"add").
        upmode: decoder upsampling mode, see upConv3D ("transpose" or other).
    """
    def __init__(self, block , n_inputs, n_outputs, batchnorm=False , joinType="concat" , upmode="transpose"):
        super().__init__()

        # Decoder channel plan, coarse to fine.
        nf = [512 , 256 , 128 , 64]
        out_channels = 3*n_outputs
        self.joinType = joinType
        self.n_outputs = n_outputs

        # "concat" skips double the channel count at each decoder stage.
        growth = 2 if joinType == "concat" else 1
        self.lrelu = nn.LeakyReLU(0.2, True)

        unet_3D = importlib.import_module(".resnet_3D", "vfi_models.flavr")
        # NOTE(review): mutates the sibling module's global `useBias` before
        # building the encoder — multi-output checkpoints expect biased convs.
        if n_outputs > 1:
            unet_3D.useBias = True
        self.encoder = getattr(unet_3D , block)(pretrained=False , bn=batchnorm)

        self.decoder = nn.Sequential(
            Conv_3d(nf[0], nf[1] , kernel_size=3, padding=1, bias=True, batchnorm=batchnorm),
            upConv3D(nf[1]*growth, nf[2], kernel_size=(3,4,4), stride=(1,2,2), padding=(1,1,1) , upmode=upmode, batchnorm=batchnorm),
            upConv3D(nf[2]*growth, nf[3], kernel_size=(3,4,4), stride=(1,2,2), padding=(1,1,1) , upmode=upmode, batchnorm=batchnorm),
            Conv_3d(nf[3]*growth, nf[3] , kernel_size=3, padding=1, bias=True, batchnorm=batchnorm),
            upConv3D(nf[3]*growth , nf[3], kernel_size=(3,4,4), stride=(1,2,2), padding=(1,1,1) , upmode=upmode, batchnorm=batchnorm)
        )

        # Fuses the per-frame features after the temporal axis is unbound.
        self.feature_fuse = Conv_2d(nf[3]*n_inputs , nf[3] , kernel_size=1 , stride=1, batchnorm=batchnorm)

        self.outconv = nn.Sequential(
            nn.ReflectionPad2d(3),
            nn.Conv2d(nf[3], out_channels , kernel_size=7 , stride=1, padding=0)
        )

    def forward(self, images):
        """Synthesize n_outputs frames from a list of input frame batches.

        Args:
            images: list of n_inputs tensors, each (B, 3, H, W).

        Returns:
            List of n_outputs tensors, each (B, 3, H, W).
        """
        # Stack along a new temporal dim: (B, 3, T, H, W).
        images = torch.stack(images , dim=2)

        ## Batch mean normalization works slightly better than global mean normalization, thanks to https://github.com/myungsub/CAIN
        mean_ = images.mean(2, keepdim=True).mean(3, keepdim=True).mean(4,keepdim=True)
        images = images-mean_

        x_0 , x_1 , x_2 , x_3 , x_4 = self.encoder(images)

        # Decoder with skip connections from each encoder stage.
        dx_3 = self.lrelu(self.decoder[0](x_4))
        dx_3 = joinTensors(dx_3 , x_3 , type=self.joinType)

        dx_2 = self.lrelu(self.decoder[1](dx_3))
        dx_2 = joinTensors(dx_2 , x_2 , type=self.joinType)

        dx_1 = self.lrelu(self.decoder[2](dx_2))
        dx_1 = joinTensors(dx_1 , x_1 , type=self.joinType)

        dx_0 = self.lrelu(self.decoder[3](dx_1))
        dx_0 = joinTensors(dx_0 , x_0 , type=self.joinType)

        dx_out = self.lrelu(self.decoder[4](dx_0))
        # Fold the temporal axis into channels for the 2D head.
        dx_out = torch.cat(torch.unbind(dx_out , 2) , 1)

        out = self.lrelu(self.feature_fuse(dx_out))
        out = self.outconv(out)

        # Split the 3*n_outputs channels into per-frame RGB images and
        # restore the subtracted mean.
        out = torch.split(out, dim=1, split_size_or_sections=3)
        mean_ = mean_.squeeze(2)
        out = [o+mean_ for o in out]

        return out
199
+
200
class InputPadder:
    """Pads images so that their spatial dimensions are divisible by `divisor`."""

    def __init__(self, dims, divisor=16):
        self.ht, self.wd = dims[-2:]
        pad_ht = (((self.ht // divisor) + 1) * divisor - self.ht) % divisor
        pad_wd = (((self.wd // divisor) + 1) * divisor - self.wd) % divisor
        # Stored as [left, right, top, bottom] — the order F.pad expects
        # for the last two dimensions.
        self._pad = [pad_wd // 2, pad_wd - pad_wd // 2,
                     pad_ht // 2, pad_ht - pad_ht // 2]

    def pad(self, input_tensor):
        """Replicate-pad the tensor up to the divisible size."""
        return F.pad(input_tensor, self._pad, mode='replicate')

    def unpad(self, input_tensor):
        """Crop a padded tensor back to the original spatial size."""
        return self._unpad(input_tensor)

    def _unpad(self, x):
        left, right, top, bottom = self._pad
        ht, wd = x.shape[-2:]
        return x[..., top:ht - bottom, left:wd - right]
vfi_models/flavr/resnet_3D.py ADDED
@@ -0,0 +1,288 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Modified from https://github.com/pytorch/vision/tree/master/torchvision/models/video
2
+
3
+ import torch
4
+ import torch.nn as nn
5
+
6
+ __all__ = ['unet_18', 'unet_34']
7
+
8
+ useBias = False
9
+
10
class identity(nn.Module):
    """No-op module: returns its input unchanged.

    Accepts (and ignores) arbitrary constructor arguments so it can stand
    in for any other layer, e.g. as a batch-norm replacement.
    """

    def __init__(self, *args, **kwargs):
        super().__init__()

    def forward(self, x):
        return x
18
+
19
class Conv3DSimple(nn.Conv3d):
    """Plain 3x3x3 convolution builder for the 3D ResNet blocks.

    Bias is controlled by the module-level `useBias` flag (read at
    construction time). `midplanes` is unused here; it exists only so the
    signature matches Conv2Plus1D.
    """

    def __init__(self,
                 in_planes,
                 out_planes,
                 midplanes=None,
                 stride=1,
                 padding=1):
        super(Conv3DSimple, self).__init__(
            in_channels=in_planes,
            out_channels=out_planes,
            kernel_size=(3, 3, 3),
            stride=stride,
            padding=padding,
            bias=useBias)

    @staticmethod
    def get_downsample_stride(stride, temporal_stride):
        """Stride tuple for the downsample path.

        Falsy `temporal_stride` falls back to an isotropic stride.
        """
        if temporal_stride:
            return (temporal_stride, stride, stride)
        return (stride, stride, stride)
41
+
42
class BasicStem(nn.Sequential):
    """The default conv-batchnorm-relu stem

    Spatially downsamples by 2 (stride (1, 2, 2)) while keeping the
    temporal resolution.
    """
    def __init__(self):
        # NOTE(review): `batchnorm` is a module-level factory defined
        # elsewhere in this file (presumably BatchNorm3d or the `identity`
        # no-op depending on configuration) — confirm before relying on it.
        super().__init__(
            nn.Conv3d(3, 64, kernel_size=(3, 7, 7), stride=(1, 2, 2),
                      padding=(1, 3, 3), bias=useBias),
            batchnorm(64),
            nn.ReLU(inplace=False))
51
+
52
+
53
class Conv2Plus1D(nn.Sequential):
    """Factorized (2+1)D convolution: spatial 1x3x3 conv, ReLU, temporal 3x1x1 conv.

    `midplanes` sets the width of the intermediate spatial features. A tuple
    `stride` is unpacked as (temporal, spatial, spatial); an int is applied
    to both.
    """

    def __init__(self,
                 in_planes,
                 out_planes,
                 midplanes,
                 stride=1,
                 padding=1):
        # Unpack a (t, s, s) stride tuple; the second spatial component
        # overwrites the first, so only the last value is used spatially.
        if not isinstance(stride , int):
            temporal_stride , stride , stride = stride
        else:
            temporal_stride = stride

        super(Conv2Plus1D, self).__init__(
            nn.Conv3d(in_planes, midplanes, kernel_size=(1, 3, 3),
                      stride=(1, stride, stride), padding=(0, padding, padding),
                      bias=False),
            # batchnorm(midplanes),
            nn.ReLU(inplace=True),
            nn.Conv3d(midplanes, out_planes, kernel_size=(3, 1, 1),
                      stride=(temporal_stride, 1, 1), padding=(padding, 0, 0),
                      bias=False))

    @staticmethod
    def get_downsample_stride(stride , temporal_stride):
        # Falsy temporal_stride falls back to an isotropic stride.
        if temporal_stride:
            return (temporal_stride, stride, stride)
        else:
            return (stride , stride , stride)
82
+
83
class R2Plus1dStem(nn.Sequential):
    """R(2+1)D stem is different than the default one as it uses separated 3D convolution

    A spatial 1x7x7 conv (stride 2 spatially) into 45 channels, then a
    temporal 3x1x1 conv up to 64 channels, each followed by norm + ReLU.
    """
    def __init__(self):
        # NOTE(review): `batchnorm` is a module-level factory defined
        # elsewhere in this file — confirm its definition before relying
        # on normalization behavior here.
        super().__init__(
            nn.Conv3d(3, 45, kernel_size=(1, 7, 7),
                      stride=(1, 2, 2), padding=(0, 3, 3),
                      bias=False),
            batchnorm(45),
            nn.ReLU(inplace=True),
            nn.Conv3d(45, 64, kernel_size=(3, 1, 1),
                      stride=(1, 1, 1), padding=(1, 0, 0),
                      bias=False),
            batchnorm(64),
            nn.ReLU(inplace=True))
98
+
99
+
100
class SEGating(nn.Module):
    """Squeeze-and-excitation style gating over 3D feature maps.

    Globally average-pools the input, derives per-channel gates with a
    1x1x1 convolution + sigmoid, and rescales the input channel-wise.
    (Duplicated in flavr_arch.py.)
    """

    def __init__(self, inplanes, reduction=16):
        super().__init__()
        # `reduction` is kept for interface compatibility; the gate uses a
        # single full-width convolution rather than a bottleneck.
        self.pool = nn.AdaptiveAvgPool3d(1)
        self.attn_layer = nn.Sequential(
            nn.Conv3d(inplanes, inplanes, kernel_size=1, stride=1, bias=True),
            nn.Sigmoid(),
        )

    def forward(self, x):
        gates = self.attn_layer(self.pool(x))
        return x * gates
117
+
118
class BasicBlock(nn.Module):
    """Gated residual block: two convs, SEGating, skip connection, ReLU.

    Args:
        inplanes: input channel count.
        planes: output channel count.
        conv_builder: conv factory (Conv3DSimple or Conv2Plus1D).
        stride: stride for the first conv (and downsample path).
        downsample: optional module projecting the residual to `planes`.
    """

    expansion = 1

    def __init__(self, inplanes, planes, conv_builder, stride=1, downsample=None):
        # Intermediate width used by the (2+1)D factorization so the
        # parameter count matches a full 3D conv.
        midplanes = (inplanes * planes * 3 * 3 * 3) // (inplanes * 3 * 3 + 3 * planes)

        super(BasicBlock, self).__init__()
        # NOTE(review): `batchnorm` is a module-level factory defined
        # elsewhere in this file — confirm before relying on it.
        self.conv1 = nn.Sequential(
            conv_builder(inplanes, planes, midplanes, stride),
            batchnorm(planes),
            nn.ReLU(inplace=True)
        )
        self.conv2 = nn.Sequential(
            conv_builder(planes, planes, midplanes),
            batchnorm(planes)
        )
        self.fg = SEGating(planes) ## Feature Gating
        self.relu = nn.ReLU(inplace=True)
        self.downsample = downsample
        self.stride = stride

    def forward(self, x):
        residual = x
        out = self.conv1(x)
        out = self.conv2(out)
        # Channel-wise gating before the residual add.
        out = self.fg(out)
        if self.downsample is not None:
            residual = self.downsample(x)

        out += residual
        out = self.relu(out)

        return out
152
+
153
class VideoResNet(nn.Module):
    """Generic 3D-ResNet encoder that returns features at every stage.

    Args:
        block (nn.Module): resnet building block class.
        conv_makers (list): one conv-builder factory per stage.
        layers (List[int]): number of blocks per stage.
        stem (callable): factory for the stem module (conv-bn-relu).
        zero_init_residual (bool): zero-init the last BN of Bottleneck blocks.
    """

    def __init__(self, block, conv_makers, layers, stem, zero_init_residual=False):
        super(VideoResNet, self).__init__()
        self.inplanes = 64
        self.stem = stem()

        # Spatial downsampling only in stages 2-3; temporal stride kept at 1.
        self.layer1 = self._make_layer(block, conv_makers[0], 64, layers[0], stride=1)
        self.layer2 = self._make_layer(block, conv_makers[1], 128, layers[1], stride=2, temporal_stride=1)
        self.layer3 = self._make_layer(block, conv_makers[2], 256, layers[2], stride=2, temporal_stride=1)
        self.layer4 = self._make_layer(block, conv_makers[3], 512, layers[3], stride=1, temporal_stride=1)

        self._initialize_weights()

        if zero_init_residual:
            for m in self.modules():
                if isinstance(m, Bottleneck):
                    nn.init.constant_(m.bn3.weight, 0)

    def forward(self, x):
        x_0 = self.stem(x)
        x_1 = self.layer1(x_0)
        x_2 = self.layer2(x_1)
        x_3 = self.layer3(x_2)
        x_4 = self.layer4(x_3)
        # All intermediate features are returned for U-Net style decoders.
        return x_0, x_1, x_2, x_3, x_4

    def _make_layer(self, block, conv_builder, planes, blocks, stride=1, temporal_stride=None):
        """Build one residual stage of `blocks` blocks; the first may downsample."""
        downsample = None
        if stride != 1 or self.inplanes != planes * block.expansion:
            ds_stride = conv_builder.get_downsample_stride(stride, temporal_stride)
            downsample = nn.Sequential(
                nn.Conv3d(self.inplanes, planes * block.expansion,
                          kernel_size=1, stride=ds_stride, bias=False),
                batchnorm(planes * block.expansion),
            )
            stride = ds_stride

        stage = [block(self.inplanes, planes, conv_builder, stride, downsample)]
        self.inplanes = planes * block.expansion
        stage.extend(
            block(self.inplanes, planes, conv_builder) for _ in range(1, blocks)
        )
        return nn.Sequential(*stage)

    def _initialize_weights(self):
        """Kaiming init for convs, standard init for norm/linear layers."""
        for m in self.modules():
            if isinstance(m, nn.Conv3d):
                nn.init.kaiming_normal_(m.weight, mode='fan_out',
                                        nonlinearity='relu')
                if m.bias is not None:
                    nn.init.constant_(m.bias, 0)
            elif isinstance(m, nn.BatchNorm3d):
                nn.init.constant_(m.weight, 1)
                nn.init.constant_(m.bias, 0)
            elif isinstance(m, nn.Linear):
                nn.init.normal_(m.weight, 0, 0.01)
                nn.init.constant_(m.bias, 0)
225
+
226
+
227
def _video_resnet(arch, pretrained=False, progress=True, **kwargs):
    """Instantiate a VideoResNet and optionally load pretrained weights.

    NOTE(review): `load_state_dict_from_url` and `model_urls` must be in
    scope at module level for the pretrained path to work — confirm.
    """
    model = VideoResNet(**kwargs)
    ## TODO: Other 3D resnet models, like S3D, r(2+1)D.
    if pretrained:
        state_dict = load_state_dict_from_url(model_urls[arch], progress=progress)
        model.load_state_dict(state_dict)
    return model
236
+
237
+
238
def unet_18(pretrained=False, bn=False, progress=True, **kwargs):
    """Construct the 18-layer Unet3D encoder as in https://arxiv.org/abs/1711.11248.

    Args:
        pretrained (bool): If True, returns a model pre-trained on Kinetics-400.
        bn (bool): use BatchNorm3d; otherwise the identity norm layer.
        progress (bool): If True, displays a download progress bar.

    Returns:
        nn.Module: R3D-18 encoder.
    """
    # The norm layer is chosen via a module-level global that is consumed by
    # BasicBlock/VideoResNet at construction time.
    global batchnorm
    batchnorm = nn.BatchNorm3d if bn else identity

    return _video_resnet('r3d_18',
                         pretrained, progress,
                         block=BasicBlock,
                         conv_makers=[Conv3DSimple] * 4,
                         layers=[2, 2, 2, 2],
                         stem=BasicStem, **kwargs)
262
+
263
def unet_34(pretrained=False, bn=False, progress=True, **kwargs):
    """Construct the 34-layer Unet3D encoder as in https://arxiv.org/abs/1711.11248.

    Args:
        pretrained (bool): If True, returns a model pre-trained on Kinetics-400.
        bn (bool): use BatchNorm3d; otherwise the identity norm layer.
        progress (bool): If True, displays a download progress bar.

    Returns:
        nn.Module: R3D-34 encoder.
    """
    # Same global norm-layer selection mechanism as unet_18.
    global batchnorm
    batchnorm = nn.BatchNorm3d if bn else identity

    return _video_resnet('r3d_34',
                         pretrained, progress,
                         block=BasicBlock,
                         conv_makers=[Conv3DSimple] * 4,
                         layers=[3, 4, 6, 3],
                         stem=BasicStem, **kwargs)
vfi_models/gmfss_fortuna/GMFSS_Fortuna.py ADDED
@@ -0,0 +1,24 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import itertools
2
+ import numpy as np
3
+ import vapoursynth as vs
4
+ from .GMFSS_Fortuna_arch import Model_inference
5
+ import torch
6
+ import traceback
7
+
8
+
9
class GMFSS_Fortuna:
    """Inference wrapper around the GMFSS_Fortuna frame-interpolation model."""

    def __init__(self):
        # Two input frames; no inter-call caching.
        self.cache = False
        self.amount_input_img = 2

        # Inference-only setup: disable autograd, enable cuDNN autotuning.
        torch.set_grad_enabled(False)
        torch.backends.cudnn.enabled = True
        torch.backends.cudnn.benchmark = True

        self.model = Model_inference()
        self.model.eval()

    def execute(self, I0, I1, timestep):
        """Interpolate between frames I0 and I1 at `timestep`; returns a CPU tensor."""
        with torch.inference_mode():
            return self.model(I0, I1, timestep).cpu()
vfi_models/gmfss_fortuna/GMFSS_Fortuna_arch.py ADDED
@@ -0,0 +1,1850 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ https://github.com/98mxr/GMFSS_Fortuna/blob/b5d0bd544e3f1eee6a059e49c69bcd3124c8343c/model/GMFSS_infer_b.py
3
+ https://github.com/98mxr/GMFSS_Fortuna/blob/b5d0bd544e3f1eee6a059e49c69bcd3124c8343c/model/softsplat.py
4
+ https://github.com/98mxr/GMFSS_Fortuna/blob/b5d0bd544e3f1eee6a059e49c69bcd3124c8343c/model/FusionNet_b.py
5
+ https://github.com/98mxr/GMFSS_Fortuna/blob/b5d0bd544e3f1eee6a059e49c69bcd3124c8343c/model/FeatureNet.py
6
+ https://github.com/98mxr/GMFSS_Fortuna/blob/b5d0bd544e3f1eee6a059e49c69bcd3124c8343c/model/MetricNet.py
7
+ https://github.com/98mxr/GMFSS_Fortuna/blob/b5d0bd544e3f1eee6a059e49c69bcd3124c8343c/model/IFNet_HDv3.py
8
+ https://github.com/98mxr/GMFSS_Fortuna/blob/b5d0bd544e3f1eee6a059e49c69bcd3124c8343c/model/gmflow/gmflow.py
9
+ https://github.com/98mxr/GMFSS_Fortuna/blob/b5d0bd544e3f1eee6a059e49c69bcd3124c8343c/model/gmflow/utils.py
10
+ https://github.com/98mxr/GMFSS_Fortuna/blob/b5d0bd544e3f1eee6a059e49c69bcd3124c8343c/model/gmflow/position.py
11
+ https://github.com/98mxr/GMFSS_Fortuna/blob/b5d0bd544e3f1eee6a059e49c69bcd3124c8343c/model/gmflow/geometry.py
12
+ https://github.com/98mxr/GMFSS_Fortuna/blob/b5d0bd544e3f1eee6a059e49c69bcd3124c8343c/model/gmflow/matching.py
13
+ https://github.com/98mxr/GMFSS_Fortuna/blob/b5d0bd544e3f1eee6a059e49c69bcd3124c8343c/model/gmflow/transformer.py
14
+ https://github.com/98mxr/GMFSS_Fortuna/blob/b5d0bd544e3f1eee6a059e49c69bcd3124c8343c/model/gmflow/backbone.py
15
+ https://github.com/98mxr/GMFSS_Fortuna/blob/b5d0bd544e3f1eee6a059e49c69bcd3124c8343c/model/gmflow/trident_conv.py
16
+ https://github.com/98mxr/GMFSS_Fortuna/blob/b5d0bd544e3f1eee6a059e49c69bcd3124c8343c/model/warplayer.py
17
+ """
18
+
19
+ from torch import nn
20
+ from torch.nn import functional as F
21
+ from torch.nn.modules.utils import _pair
22
+ import numpy as np
23
+ import torch
24
+ import torch.nn as nn
25
+ import torch.nn.functional as F
26
+ import torch
27
+ import math
28
+ from vfi_models.rife.rife_arch import IFNet
29
+ from vfi_models.ops import softsplat
30
+ from comfy.model_management import get_torch_device
31
+
32
+ device = get_torch_device()
33
+ backwarp_tenGrid = {}
34
+
35
+
36
def warp(tenInput, tenFlow):
    """Backward-warp `tenInput` with pixel-space flow `tenFlow` via grid_sample.

    A normalized base grid is built once per (device, size) and memoized in
    the module-level `backwarp_tenGrid` dict.
    """
    key = (str(tenFlow.device), str(tenFlow.size()))
    if key not in backwarp_tenGrid:
        batch, _, height, width = tenFlow.shape
        xs = (
            torch.linspace(-1.0, 1.0, width, device=device)
            .view(1, 1, 1, width)
            .expand(batch, -1, height, -1)
        )
        ys = (
            torch.linspace(-1.0, 1.0, height, device=device)
            .view(1, 1, height, 1)
            .expand(batch, -1, -1, width)
        )
        backwarp_tenGrid[key] = torch.cat([xs, ys], 1).to(device)

    # Convert the pixel-space flow into the [-1, 1] range grid_sample expects.
    norm_flow = torch.cat(
        [
            tenFlow[:, 0:1, :, :] / ((tenInput.shape[3] - 1.0) / 2.0),
            tenFlow[:, 1:2, :, :] / ((tenInput.shape[2] - 1.0) / 2.0),
        ],
        1,
    )

    grid = (backwarp_tenGrid[key] + norm_flow).permute(0, 2, 3, 1)
    return torch.nn.functional.grid_sample(
        input=tenInput,
        grid=grid,
        mode="bilinear",
        padding_mode="border",
        align_corners=True,
    )
+
68
+
69
class MultiScaleTridentConv(nn.Module):
    """Weight-shared convolution applied over several scales (TridentNet-style).

    A single weight tensor is convolved against each input branch with that
    branch's own stride/padding, so branches share parameters while running
    at different resolutions.
    """

    def __init__(
        self,
        in_channels,
        out_channels,
        kernel_size,
        stride=1,
        strides=1,
        paddings=0,
        dilations=1,
        dilation=1,
        groups=1,
        num_branch=1,
        test_branch_idx=-1,
        bias=False,
        norm=None,
        activation=None,
    ):
        super(MultiScaleTridentConv, self).__init__()
        self.in_channels = in_channels
        self.out_channels = out_channels
        self.kernel_size = _pair(kernel_size)
        self.num_branch = num_branch
        self.stride = _pair(stride)
        self.groups = groups
        self.with_bias = bias
        self.dilation = dilation

        # Broadcast scalar per-branch settings to every branch.
        if isinstance(paddings, int):
            paddings = [paddings] * num_branch
        if isinstance(dilations, int):
            dilations = [dilations] * num_branch
        if isinstance(strides, int):
            strides = [strides] * num_branch
        self.paddings = [_pair(p) for p in paddings]
        self.dilations = [_pair(d) for d in dilations]
        self.strides = [_pair(s) for s in strides]
        self.test_branch_idx = test_branch_idx
        self.norm = norm
        self.activation = activation

        assert len({self.num_branch, len(self.paddings), len(self.strides)}) == 1

        self.weight = nn.Parameter(
            torch.Tensor(out_channels, in_channels // groups, *self.kernel_size)
        )
        self.bias = nn.Parameter(torch.Tensor(out_channels)) if bias else None

        nn.init.kaiming_uniform_(self.weight, nonlinearity="relu")
        if self.bias is not None:
            nn.init.constant_(self.bias, 0)

    def forward(self, inputs):
        # During training (or when no test branch is selected) every branch runs.
        run_all = self.training or self.test_branch_idx == -1
        num_branch = self.num_branch if run_all else 1
        assert len(inputs) == num_branch

        if run_all:
            outputs = [
                F.conv2d(inp, self.weight, self.bias, s, p, self.dilation, self.groups)
                for inp, s, p in zip(inputs, self.strides, self.paddings)
            ]
        else:
            # Single selected branch at test time. Upstream behavior kept as-is:
            # this path is only reached when test_branch_idx != -1, so the
            # conditional always falls through to the last stride/padding.
            stride = (
                self.strides[self.test_branch_idx]
                if self.test_branch_idx == -1
                else self.strides[-1]
            )
            padding = (
                self.paddings[self.test_branch_idx]
                if self.test_branch_idx == -1
                else self.paddings[-1]
            )
            outputs = [
                F.conv2d(
                    inputs[0],
                    self.weight,
                    self.bias,
                    stride,
                    padding,
                    self.dilation,
                    self.groups,
                )
            ]

        if self.norm is not None:
            outputs = [self.norm(o) for o in outputs]
        if self.activation is not None:
            outputs = [self.activation(o) for o in outputs]
        return outputs
164
+
165
+
166
class ResidualBlock_class(nn.Module):
    """Standard two-conv residual block with configurable norm/stride/dilation."""

    def __init__(
        self,
        in_planes,
        planes,
        norm_layer=nn.InstanceNorm2d,
        stride=1,
        dilation=1,
    ):
        super(ResidualBlock_class, self).__init__()

        self.conv1 = nn.Conv2d(
            in_planes,
            planes,
            kernel_size=3,
            dilation=dilation,
            padding=dilation,
            stride=stride,
            bias=False,
        )
        self.conv2 = nn.Conv2d(
            planes,
            planes,
            kernel_size=3,
            dilation=dilation,
            padding=dilation,
            bias=False,
        )
        self.relu = nn.ReLU(inplace=True)

        self.norm1 = norm_layer(planes)
        self.norm2 = norm_layer(planes)

        if stride == 1 and in_planes == planes:
            self.downsample = None
        else:
            # 1x1 projection so the shortcut matches the main path's shape.
            self.norm3 = norm_layer(planes)
            self.downsample = nn.Sequential(
                nn.Conv2d(in_planes, planes, kernel_size=1, stride=stride),
                self.norm3,
            )

    def forward(self, x):
        out = self.relu(self.norm1(self.conv1(x)))
        out = self.relu(self.norm2(self.conv2(out)))
        shortcut = x if self.downsample is None else self.downsample(x)
        return self.relu(shortcut + out)
217
+
218
+
219
class CNNEncoder(nn.Module):
    """Residual CNN backbone producing 1/8 (or multi-scale) image features."""

    def __init__(
        self,
        output_dim=128,
        norm_layer=nn.InstanceNorm2d,
        num_output_scales=1,
        **kwargs,
    ):
        super(CNNEncoder, self).__init__()
        self.num_branch = num_output_scales

        feature_dims = [64, 96, 128]

        # Stem: 7x7 stride-2 conv -> norm -> relu (1/2 resolution).
        self.conv1 = nn.Conv2d(
            3, feature_dims[0], kernel_size=7, stride=2, padding=3, bias=False
        )
        self.norm1 = norm_layer(feature_dims[0])
        self.relu1 = nn.ReLU(inplace=True)

        self.in_planes = feature_dims[0]
        self.layer1 = self._make_layer(feature_dims[0], stride=1, norm_layer=norm_layer)  # 1/2
        self.layer2 = self._make_layer(feature_dims[1], stride=2, norm_layer=norm_layer)  # 1/4

        # Highest resolution is 1/8 (single scale) or 1/4 (multi-scale).
        self.layer3 = self._make_layer(
            feature_dims[2],
            stride=2 if num_output_scales == 1 else 1,
            norm_layer=norm_layer,
        )

        self.conv2 = nn.Conv2d(feature_dims[2], output_dim, 1, 1, 0)

        if self.num_branch > 1:
            branch_strides = {4: (1, 2, 4, 8), 3: (1, 2, 4), 2: (1, 2)}.get(self.num_branch)
            if branch_strides is None:
                raise ValueError
            self.trident_conv = MultiScaleTridentConv(
                output_dim,
                output_dim,
                kernel_size=3,
                strides=branch_strides,
                paddings=1,
                num_branch=self.num_branch,
            )

        for m in self.modules():
            if isinstance(m, nn.Conv2d):
                nn.init.kaiming_normal_(m.weight, mode="fan_out", nonlinearity="relu")
            elif isinstance(m, (nn.BatchNorm2d, nn.InstanceNorm2d, nn.GroupNorm)):
                if m.weight is not None:
                    nn.init.constant_(m.weight, 1)
                if m.bias is not None:
                    nn.init.constant_(m.bias, 0)

    def _make_layer(self, dim, stride=1, dilation=1, norm_layer=nn.InstanceNorm2d):
        """Two residual blocks; only the first may downsample."""
        blocks = (
            ResidualBlock_class(
                self.in_planes, dim, norm_layer=norm_layer, stride=stride, dilation=dilation
            ),
            ResidualBlock_class(
                dim, dim, norm_layer=norm_layer, stride=1, dilation=dilation
            ),
        )
        self.in_planes = dim
        return nn.Sequential(*blocks)

    def forward(self, x):
        x = self.relu1(self.norm1(self.conv1(x)))

        x = self.layer1(x)  # 1/2
        x = self.layer2(x)  # 1/4
        x = self.layer3(x)  # 1/8 or 1/4

        x = self.conv2(x)

        if self.num_branch > 1:
            # High-to-low resolution branch outputs.
            return self.trident_conv([x] * self.num_branch)
        return [x]
314
+
315
+
316
def single_head_full_attention(q, k, v):
    """Dense single-head scaled dot-product attention.

    q, k, v: [B, L, C]; returns [B, L, C].
    """
    assert q.dim() == k.dim() == v.dim() == 3

    # Scores over all key positions, softmax-normalized per query.
    logits = torch.matmul(q, k.permute(0, 2, 1)) / (q.size(2) ** 0.5)  # [B, L, L]
    weights = torch.softmax(logits, dim=2)  # [B, L, L]
    return torch.matmul(weights, v)  # [B, L, C]
325
+
326
+
327
def generate_shift_window_attn_mask(
    input_resolution,
    window_size_h,
    window_size_w,
    shift_size_h,
    shift_size_w,
    device=get_torch_device(),
):
    """Build the attention mask for shifted-window (SW-MSA) attention.

    Ref: https://github.com/microsoft/Swin-Transformer/blob/main/models/swin_transformer.py
    NOTE(review): the `device` default is evaluated once at import time;
    callers normally pass the feature tensor's device explicitly.
    """
    h, w = input_resolution
    img_mask = torch.zeros((1, h, w, 1)).to(device)  # 1 H W 1

    # Label the nine regions created by the cyclic shift.
    h_slices = (
        slice(0, -window_size_h),
        slice(-window_size_h, -shift_size_h),
        slice(-shift_size_h, None),
    )
    w_slices = (
        slice(0, -window_size_w),
        slice(-window_size_w, -shift_size_w),
        slice(-shift_size_w, None),
    )
    region = 0
    for hs in h_slices:
        for ws in w_slices:
            img_mask[:, hs, ws, :] = region
            region += 1

    mask_windows = split_feature(
        img_mask, num_splits=input_resolution[-1] // window_size_w, channel_last=True
    )
    mask_windows = mask_windows.view(-1, window_size_h * window_size_w)

    # Tokens from different regions must not attend to each other.
    attn_mask = mask_windows.unsqueeze(1) - mask_windows.unsqueeze(2)
    attn_mask = attn_mask.masked_fill(attn_mask != 0, float(-100.0)).masked_fill(
        attn_mask == 0, float(0.0)
    )
    return attn_mask
366
+
367
+
368
def single_head_split_window_attention(
    q,
    k,
    v,
    num_splits=1,
    with_shift=False,
    h=None,
    w=None,
    attn_mask=None,
):
    """Swin-style (optionally shifted) window attention with a single head.

    Ref: https://github.com/microsoft/Swin-Transformer/blob/main/models/swin_transformer.py
    q, k, v: [B, L, C] with L == h * w; returns [B, L, C].
    """
    assert q.dim() == k.dim() == v.dim() == 3
    assert h is not None and w is not None
    assert q.size(1) == h * w

    b, _, c = q.size()
    b_windows = b * num_splits * num_splits

    win_h = h // num_splits
    win_w = w // num_splits

    # Restore spatial layout before windowing.
    q = q.view(b, h, w, c)  # [B, H, W, C]
    k = k.view(b, h, w, c)
    v = v.view(b, h, w, c)

    scale = c ** 0.5

    if with_shift:
        assert attn_mask is not None  # computed once by the caller
        shift_h = win_h // 2
        shift_w = win_w // 2
        q = torch.roll(q, shifts=(-shift_h, -shift_w), dims=(1, 2))
        k = torch.roll(k, shifts=(-shift_h, -shift_w), dims=(1, 2))
        v = torch.roll(v, shifts=(-shift_h, -shift_w), dims=(1, 2))

    q = split_feature(q, num_splits=num_splits, channel_last=True)  # [B*K*K, H/K, W/K, C]
    k = split_feature(k, num_splits=num_splits, channel_last=True)
    v = split_feature(v, num_splits=num_splits, channel_last=True)

    scores = (
        torch.matmul(
            q.view(b_windows, -1, c), k.view(b_windows, -1, c).permute(0, 2, 1)
        )
        / scale
    )  # [B*K*K, H/K*W/K, H/K*W/K]

    if with_shift:
        scores = scores + attn_mask.repeat(b, 1, 1)

    weights = torch.softmax(scores, dim=-1)
    out = torch.matmul(weights, v.view(b_windows, -1, c))  # [B*K*K, H/K*W/K, C]

    out = merge_splits(
        out.view(b_windows, h // num_splits, w // num_splits, c),
        num_splits=num_splits,
        channel_last=True,
    )  # [B, H, W, C]

    if with_shift:
        # Undo the cyclic shift.
        out = torch.roll(out, shifts=(shift_h, shift_w), dims=(1, 2))

    return out.view(b, -1, c)
438
+
439
+
440
class TransformerLayer(nn.Module):
    """Single attention layer: single-head attention (full or Swin-windowed)
    with a residual connection, followed by an optional FFN applied to the
    concatenated [source, message] features.

    NOTE(review): multi-head (`nhead > 1`) is deliberately unsupported; see
    the comment in `forward`.
    """

    def __init__(
        self,
        d_model=256,
        nhead=1,
        attention_type="swin",
        no_ffn=False,
        ffn_dim_expansion=4,
        with_shift=False,
        **kwargs,
    ):
        super(TransformerLayer, self).__init__()

        self.dim = d_model
        self.nhead = nhead
        self.attention_type = attention_type
        self.no_ffn = no_ffn

        self.with_shift = with_shift

        # multi-head attention
        self.q_proj = nn.Linear(d_model, d_model, bias=False)
        self.k_proj = nn.Linear(d_model, d_model, bias=False)
        self.v_proj = nn.Linear(d_model, d_model, bias=False)

        self.merge = nn.Linear(d_model, d_model, bias=False)

        self.norm1 = nn.LayerNorm(d_model)

        # no ffn after self-attn, with ffn after cross-attn
        if not self.no_ffn:
            in_channels = d_model * 2
            self.mlp = nn.Sequential(
                nn.Linear(in_channels, in_channels * ffn_dim_expansion, bias=False),
                nn.GELU(),
                nn.Linear(in_channels * ffn_dim_expansion, d_model, bias=False),
            )

            self.norm2 = nn.LayerNorm(d_model)

    def forward(
        self,
        source,
        target,
        height=None,
        width=None,
        shifted_window_attn_mask=None,
        attn_num_splits=None,
        **kwargs,
    ):
        # source, target: [B, L, C]
        # Query comes from `source`; key/value come from `target`
        # (self-attention passes the same tensor for both).
        query, key, value = source, target, target

        # single-head attention
        query = self.q_proj(query)  # [B, L, C]
        key = self.k_proj(key)  # [B, L, C]
        value = self.v_proj(value)  # [B, L, C]

        if self.attention_type == "swin" and attn_num_splits > 1:
            if self.nhead > 1:
                # we observe that multihead attention slows down the speed and increases the memory consumption
                # without bringing obvious performance gains and thus the implementation is removed
                raise NotImplementedError
            else:
                message = single_head_split_window_attention(
                    query,
                    key,
                    value,
                    num_splits=attn_num_splits,
                    with_shift=self.with_shift,
                    h=height,
                    w=width,
                    attn_mask=shifted_window_attn_mask,
                )
        else:
            message = single_head_full_attention(query, key, value)  # [B, L, C]

        message = self.merge(message)  # [B, L, C]
        message = self.norm1(message)

        if not self.no_ffn:
            # FFN mixes the original source with the attention message.
            message = self.mlp(torch.cat([source, message], dim=-1))
            message = self.norm2(message)

        # Residual connection.
        return source + message
525
+
526
+
527
class TransformerBlock(nn.Module):
    """self attention + cross attention + FFN"""

    def __init__(
        self,
        d_model=256,
        nhead=1,
        attention_type="swin",
        ffn_dim_expansion=4,
        with_shift=False,
        **kwargs,
    ):
        super(TransformerBlock, self).__init__()

        # Self-attention sublayer has no FFN; the cross-attention one does.
        self.self_attn = TransformerLayer(
            d_model=d_model,
            nhead=nhead,
            attention_type=attention_type,
            no_ffn=True,
            ffn_dim_expansion=ffn_dim_expansion,
            with_shift=with_shift,
        )
        self.cross_attn_ffn = TransformerLayer(
            d_model=d_model,
            nhead=nhead,
            attention_type=attention_type,
            ffn_dim_expansion=ffn_dim_expansion,
            with_shift=with_shift,
        )

    def forward(
        self,
        source,
        target,
        height=None,
        width=None,
        shifted_window_attn_mask=None,
        attn_num_splits=None,
        **kwargs,
    ):
        """source, target: [B, L, C]; returns the updated source."""
        attn_kwargs = dict(
            height=height,
            width=width,
            shifted_window_attn_mask=shifted_window_attn_mask,
            attn_num_splits=attn_num_splits,
        )
        # Self-attention, then cross-attention + FFN.
        source = self.self_attn(source, source, **attn_kwargs)
        return self.cross_attn_ffn(source, target, **attn_kwargs)
591
+
592
+
593
class FeatureTransformer(nn.Module):
    """Stack of TransformerBlocks that jointly refines a pair of feature maps.

    Both attention directions (0->1 and 1->0) are computed in parallel by
    concatenating the pair along the batch dimension.
    """

    def __init__(
        self,
        num_layers=6,
        d_model=128,
        nhead=1,
        attention_type="swin",
        ffn_dim_expansion=4,
        **kwargs,
    ):
        super(FeatureTransformer, self).__init__()

        self.attention_type = attention_type

        self.d_model = d_model
        self.nhead = nhead

        # Swin-style alternation: shifted windows on odd layers only.
        self.layers = nn.ModuleList(
            [
                TransformerBlock(
                    d_model=d_model,
                    nhead=nhead,
                    attention_type=attention_type,
                    ffn_dim_expansion=ffn_dim_expansion,
                    with_shift=True
                    if attention_type == "swin" and i % 2 == 1
                    else False,
                )
                for i in range(num_layers)
            ]
        )

        for p in self.parameters():
            if p.dim() > 1:
                nn.init.xavier_uniform_(p)

    def forward(
        self,
        feature0,
        feature1,
        attn_num_splits=None,
        **kwargs,
    ):
        # feature0/feature1: [B, C, H, W]; returns the refined pair.
        b, c, h, w = feature0.shape
        assert self.d_model == c

        feature0 = feature0.flatten(-2).permute(0, 2, 1)  # [B, H*W, C]
        feature1 = feature1.flatten(-2).permute(0, 2, 1)  # [B, H*W, C]

        if self.attention_type == "swin" and attn_num_splits > 1:
            # global and refine use different number of splits
            window_size_h = h // attn_num_splits
            window_size_w = w // attn_num_splits

            # compute attn mask once
            shifted_window_attn_mask = generate_shift_window_attn_mask(
                input_resolution=(h, w),
                window_size_h=window_size_h,
                window_size_w=window_size_w,
                shift_size_h=window_size_h // 2,
                shift_size_w=window_size_w // 2,
                device=feature0.device,
            )  # [K*K, H/K*W/K, H/K*W/K]
        else:
            shifted_window_attn_mask = None

        # concat feature0 and feature1 in batch dimension to compute in parallel
        concat0 = torch.cat((feature0, feature1), dim=0)  # [2B, H*W, C]
        concat1 = torch.cat((feature1, feature0), dim=0)  # [2B, H*W, C]

        for layer in self.layers:
            concat0 = layer(
                concat0,
                concat1,
                height=h,
                width=w,
                shifted_window_attn_mask=shifted_window_attn_mask,
                attn_num_splits=attn_num_splits,
            )

            # update feature1: the cross-attention target is the other
            # half of the freshly updated batch, with halves swapped.
            concat1 = torch.cat(concat0.chunk(chunks=2, dim=0)[::-1], dim=0)

        feature0, feature1 = concat0.chunk(chunks=2, dim=0)  # [B, H*W, C]

        # reshape back
        feature0 = (
            feature0.view(b, h, w, c).permute(0, 3, 1, 2).contiguous()
        )  # [B, C, H, W]
        feature1 = (
            feature1.view(b, h, w, c).permute(0, 3, 1, 2).contiguous()
        )  # [B, C, H, W]

        return feature0, feature1
687
+
688
+
689
class FeatureFlowAttention(nn.Module):
    """
    flow propagation with self-attention on feature
    query: feature0, key: feature0, value: flow
    """

    def __init__(
        self,
        in_channels,
        **kwargs,
    ):
        super(FeatureFlowAttention, self).__init__()

        self.q_proj = nn.Linear(in_channels, in_channels)
        self.k_proj = nn.Linear(in_channels, in_channels)

        for p in self.parameters():
            if p.dim() > 1:
                nn.init.xavier_uniform_(p)

    def forward(
        self,
        feature0,
        flow,
        local_window_attn=False,
        local_window_radius=1,
        **kwargs,
    ):
        """q, k: feature [B, C, H, W]; v: flow [B, 2, H, W]."""
        if local_window_attn:
            return self.forward_local_window_attn(
                feature0, flow, local_window_radius=local_window_radius
            )

        b, c, h, w = feature0.size()

        tokens = feature0.view(b, c, h * w).permute(0, 2, 1)  # [B, H*W, C]

        # Upstream quirk kept for checkpoint compatibility: the key
        # projection is applied on top of the query projection. Both are
        # linear, so the composition is equivalent to a single matrix and
        # does not affect the model's expressiveness.
        query = self.q_proj(tokens)  # [B, H*W, C]
        key = self.k_proj(query)  # [B, H*W, C]

        value = flow.view(b, flow.size(1), h * w).permute(0, 2, 1)  # [B, H*W, 2]

        logits = torch.matmul(query, key.permute(0, 2, 1)) / (c ** 0.5)  # [B, H*W, H*W]
        weights = torch.softmax(logits, dim=-1)

        out = torch.matmul(weights, value)  # [B, H*W, 2]
        return out.view(b, h, w, value.size(-1)).permute(0, 3, 1, 2)  # [B, 2, H, W]

    def forward_local_window_attn(
        self,
        feature0,
        flow,
        local_window_radius=1,
    ):
        """Same propagation, restricted to a (2R+1)^2 local window per pixel."""
        assert flow.size(1) == 2
        assert local_window_radius > 0

        b, c, h, w = feature0.size()
        kernel_size = 2 * local_window_radius + 1

        queries = self.q_proj(feature0.view(b, c, -1).permute(0, 2, 1)).reshape(
            b * h * w, 1, c
        )  # [B*H*W, 1, C]

        keys_map = (
            self.k_proj(feature0.view(b, c, -1).permute(0, 2, 1))
            .permute(0, 2, 1)
            .reshape(b, c, h, w)
        )

        key_windows = F.unfold(
            keys_map, kernel_size=kernel_size, padding=local_window_radius
        )  # [B, C*(2R+1)^2, H*W]
        key_windows = (
            key_windows.view(b, c, kernel_size ** 2, h, w)
            .permute(0, 3, 4, 1, 2)
            .reshape(b * h * w, c, kernel_size ** 2)
        )  # [B*H*W, C, (2R+1)^2]

        flow_windows = F.unfold(
            flow, kernel_size=kernel_size, padding=local_window_radius
        )  # [B, 2*(2R+1)^2, H*W]
        flow_windows = (
            flow_windows.view(b, 2, kernel_size ** 2, h, w)
            .permute(0, 3, 4, 2, 1)
            .reshape(b * h * w, kernel_size ** 2, 2)
        )  # [B*H*W, (2R+1)^2, 2]

        logits = torch.matmul(queries, key_windows) / (c ** 0.5)  # [B*H*W, 1, (2R+1)^2]
        weights = torch.softmax(logits, dim=-1)

        out = (
            torch.matmul(weights, flow_windows)
            .view(b, h, w, 2)
            .permute(0, 3, 1, 2)
            .contiguous()
        )  # [B, 2, H, W]

        return out
805
+
806
+
807
def global_correlation_softmax(
    feature0,
    feature1,
    pred_bidir_flow=False,
):
    """Dense global matching: correlate every pixel of feature0 with every
    pixel of feature1 and regress flow as the softmax-weighted expectation
    of target coordinates.

    Args:
        feature0, feature1: [B, C, H, W] feature maps.
        pred_bidir_flow: when True, also match in the reverse direction
            (by transposing the correlation) and stack results on batch.

    Returns:
        (flow [B or 2B, 2, H, W], matching probability [B or 2B, H*W, H*W]).
    """
    # global correlation
    b, c, h, w = feature0.shape
    feature0 = feature0.view(b, c, -1).permute(0, 2, 1)  # [B, H*W, C]
    feature1 = feature1.view(b, c, -1)  # [B, C, H*W]

    # scaled dot product between all pairs of positions
    correlation = torch.matmul(feature0, feature1).view(b, h, w, h, w) / (
        c**0.5
    )  # [B, H, W, H, W]

    # flow from softmax
    init_grid = coords_grid(b, h, w).to(correlation.device)  # [B, 2, H, W]
    grid = init_grid.view(b, 2, -1).permute(0, 2, 1)  # [B, H*W, 2]

    correlation = correlation.view(b, h * w, h * w)  # [B, H*W, H*W]

    if pred_bidir_flow:
        # backward matching reuses the same correlation volume transposed
        correlation = torch.cat(
            (correlation, correlation.permute(0, 2, 1)), dim=0
        )  # [2*B, H*W, H*W]
        init_grid = init_grid.repeat(2, 1, 1, 1)  # [2*B, 2, H, W]
        grid = grid.repeat(2, 1, 1)  # [2*B, H*W, 2]
        b = b * 2

    prob = F.softmax(correlation, dim=-1)  # [B, H*W, H*W]

    # expected target coordinate under the matching distribution
    correspondence = (
        torch.matmul(prob, grid).view(b, h, w, 2).permute(0, 3, 1, 2)
    )  # [B, 2, H, W]

    # when predicting bidirectional flow, flow is the concatenation of forward flow and backward flow
    flow = correspondence - init_grid

    return flow, prob
845
+
846
+
847
def local_correlation_softmax(
    feature0,
    feature1,
    local_radius,
    padding_mode="zeros",
):
    """Local matching: correlate each pixel of feature0 with a
    (2R+1)x(2R+1) window of feature1 around the same location, and
    regress flow as the softmax-weighted expectation of the window
    coordinates.

    Args:
        feature0, feature1: [B, C, H, W] feature maps.
        local_radius: R, window radius in pixels.
        padding_mode: grid_sample padding for out-of-image samples.

    Returns:
        (flow [B, 2, H, W], matching probability [B, H*W, (2R+1)^2]).
    """
    b, c, h, w = feature0.size()
    coords_init = coords_grid(b, h, w).to(feature0.device)  # [B, 2, H, W]
    coords = coords_init.view(b, 2, -1).permute(0, 2, 1)  # [B, H*W, 2]

    local_h = 2 * local_radius + 1
    local_w = 2 * local_radius + 1

    # relative offsets of the sampling window
    window_grid = generate_window_grid(
        -local_radius,
        local_radius,
        -local_radius,
        local_radius,
        local_h,
        local_w,
        device=feature0.device,
    )  # [2R+1, 2R+1, 2]
    window_grid = window_grid.reshape(-1, 2).repeat(b, 1, 1, 1)  # [B, 1, (2R+1)^2, 2]
    sample_coords = coords.unsqueeze(-2) + window_grid  # [B, H*W, (2R+1)^2, 2]

    sample_coords_softmax = sample_coords

    # exclude coords that are out of image space
    valid_x = (sample_coords[:, :, :, 0] >= 0) & (
        sample_coords[:, :, :, 0] < w
    )  # [B, H*W, (2R+1)^2]
    valid_y = (sample_coords[:, :, :, 1] >= 0) & (
        sample_coords[:, :, :, 1] < h
    )  # [B, H*W, (2R+1)^2]

    valid = (
        valid_x & valid_y
    )  # [B, H*W, (2R+1)^2], used to mask out invalid values when softmax

    # normalize coordinates to [-1, 1]
    sample_coords_norm = normalize_coords(sample_coords, h, w)  # [-1, 1]
    window_feature = F.grid_sample(
        feature1, sample_coords_norm, padding_mode=padding_mode, align_corners=True
    ).permute(
        0, 2, 1, 3
    )  # [B, H*W, C, (2R+1)^2]
    feature0_view = feature0.permute(0, 2, 3, 1).view(b, h * w, 1, c)  # [B, H*W, 1, C]

    # scaled dot product against each window sample
    corr = torch.matmul(feature0_view, window_feature).view(b, h * w, -1) / (
        c**0.5
    )  # [B, H*W, (2R+1)^2]

    # mask invalid locations
    corr[~valid] = -1e9

    prob = F.softmax(corr, -1)  # [B, H*W, (2R+1)^2]

    # expected target coordinate under the window distribution
    correspondence = (
        torch.matmul(prob.unsqueeze(-2), sample_coords_softmax)
        .squeeze(-2)
        .view(b, h, w, 2)
        .permute(0, 3, 1, 2)
    )  # [B, 2, H, W]

    flow = correspondence - coords_init
    match_prob = prob

    return flow, match_prob
915
+
916
+
917
def coords_grid(b, h, w, homogeneous=False, device=None):
    """Build a batched pixel-coordinate grid in (x, y) channel order.

    Returns [B, 2, H, W] (or [B, 3, H, W] with a ones channel when
    `homogeneous` is set), as float, optionally moved to `device`.
    """
    ys, xs = torch.meshgrid(torch.arange(h), torch.arange(w))  # each [H, W]

    channels = [xs, ys]
    if homogeneous:
        # append a constant 1 channel for homogeneous coordinates
        channels.append(torch.ones_like(xs))

    grid = torch.stack(channels, dim=0).float()  # [2 or 3, H, W]
    grid = grid.unsqueeze(0).repeat(b, 1, 1, 1)  # [B, 2 or 3, H, W]

    if device is not None:
        grid = grid.to(device)

    return grid
934
+
935
+
936
def generate_window_grid(h_min, h_max, w_min, w_max, len_h, len_w, device=None):
    """Return a [len_h, len_w, 2] grid of (x, y) offsets spanning the given
    ranges, with x in [w_min, w_max] and y in [h_min, h_max]."""
    assert device is not None

    xs = torch.linspace(w_min, w_max, len_w, device=device)
    ys = torch.linspace(h_min, h_max, len_h, device=device)
    gx, gy = torch.meshgrid([xs, ys])  # each [len_w, len_h]

    # stack as (x, y) and put rows (h) first
    window = torch.stack((gx, gy), -1).transpose(0, 1).float()  # [H, W, 2]
    return window
948
+
949
+
950
def normalize_coords(coords, h, w):
    """Map pixel-space coords [B, H, W, 2] (x, y order) into [-1, 1]."""
    center = torch.Tensor([(w - 1) / 2.0, (h - 1) / 2.0]).float().to(coords.device)
    return (coords - center) / center
954
+
955
+
956
+ def bilinear_sample(
957
+ img, sample_coords, mode="bilinear", padding_mode="zeros", return_mask=False
958
+ ):
959
+ # img: [B, C, H, W]
960
+ # sample_coords: [B, 2, H, W] in image scale
961
+ if sample_coords.size(1) != 2: # [B, H, W, 2]
962
+ sample_coords = sample_coords.permute(0, 3, 1, 2)
963
+
964
+ b, _, h, w = sample_coords.shape
965
+
966
+ # Normalize to [-1, 1]
967
+ x_grid = 2 * sample_coords[:, 0] / (w - 1) - 1
968
+ y_grid = 2 * sample_coords[:, 1] / (h - 1) - 1
969
+
970
+ grid = torch.stack([x_grid, y_grid], dim=-1) # [B, H, W, 2]
971
+
972
+ img = F.grid_sample(
973
+ img, grid, mode=mode, padding_mode=padding_mode, align_corners=True
974
+ )
975
+
976
+ if return_mask:
977
+ mask = (
978
+ (x_grid >= -1) & (y_grid >= -1) & (x_grid <= 1) & (y_grid <= 1)
979
+ ) # [B, H, W]
980
+
981
+ return img, mask
982
+
983
+ return img
984
+
985
+
986
def flow_warp(feature, flow, mask=False, padding_mode="zeros"):
    """Backward-warp `feature` [B, C, H, W] by `flow` [B, 2, H, W]:
    each output pixel samples the input at (identity grid + flow)."""
    b, c, h, w = feature.size()
    assert flow.size(1) == 2

    sample_grid = coords_grid(b, h, w).to(flow.device) + flow  # [B, 2, H, W]
    return bilinear_sample(
        feature, sample_grid, padding_mode=padding_mode, return_mask=mask
    )
993
+
994
+
995
def forward_backward_consistency_check(fwd_flow, bwd_flow, alpha=0.01, beta=0.5):
    """Occlusion estimation via forward-backward flow consistency.

    Both flows are [B, 2, H, W]. alpha/beta follow UnFlow
    (https://arxiv.org/abs/1711.07837). Returns float occlusion masks
    (fwd_occ, bwd_occ), each [B, H, W], with 1 = inconsistent/occluded.
    """
    assert fwd_flow.dim() == 4 and bwd_flow.dim() == 4
    assert fwd_flow.size(1) == 2 and bwd_flow.size(1) == 2

    # magnitude-dependent tolerance
    mag_sum = torch.norm(fwd_flow, dim=1) + torch.norm(bwd_flow, dim=1)  # [B, H, W]
    tol = alpha * mag_sum + beta

    # a consistent pair satisfies fwd_flow + warp(bwd_flow, fwd_flow) ~ 0
    diff_fwd = torch.norm(fwd_flow + flow_warp(bwd_flow, fwd_flow), dim=1)  # [B, H, W]
    diff_bwd = torch.norm(bwd_flow + flow_warp(fwd_flow, bwd_flow), dim=1)

    fwd_occ = (diff_fwd > tol).float()  # [B, H, W]
    bwd_occ = (diff_bwd > tol).float()
    return fwd_occ, bwd_occ
1014
+
1015
+
1016
+ class PositionEmbeddingSine(nn.Module):
1017
+ """
1018
+ This is a more standard version of the position embedding, very similar to the one
1019
+ used by the Attention is all you need paper, generalized to work on images.
1020
+ """
1021
+
1022
+ def __init__(self, num_pos_feats=64, temperature=10000, normalize=True, scale=None):
1023
+ super().__init__()
1024
+ self.num_pos_feats = num_pos_feats
1025
+ self.temperature = temperature
1026
+ self.normalize = normalize
1027
+ if scale is not None and normalize is False:
1028
+ raise ValueError("normalize should be True if scale is passed")
1029
+ if scale is None:
1030
+ scale = 2 * math.pi
1031
+ self.scale = scale
1032
+
1033
+ def forward(self, x):
1034
+ # x = tensor_list.tensors # [B, C, H, W]
1035
+ # mask = tensor_list.mask # [B, H, W], input with padding, valid as 0
1036
+ b, c, h, w = x.size()
1037
+ mask = torch.ones((b, h, w), device=x.device) # [B, H, W]
1038
+ y_embed = mask.cumsum(1, dtype=torch.float32)
1039
+ x_embed = mask.cumsum(2, dtype=torch.float32)
1040
+ if self.normalize:
1041
+ eps = 1e-6
1042
+ y_embed = y_embed / (y_embed[:, -1:, :] + eps) * self.scale
1043
+ x_embed = x_embed / (x_embed[:, :, -1:] + eps) * self.scale
1044
+
1045
+ dim_t = torch.arange(self.num_pos_feats, dtype=torch.float32, device=x.device)
1046
+ dim_t = self.temperature ** (2 * (dim_t // 2) / self.num_pos_feats)
1047
+
1048
+ pos_x = x_embed[:, :, :, None] / dim_t
1049
+ pos_y = y_embed[:, :, :, None] / dim_t
1050
+ pos_x = torch.stack(
1051
+ (pos_x[:, :, :, 0::2].sin(), pos_x[:, :, :, 1::2].cos()), dim=4
1052
+ ).flatten(3)
1053
+ pos_y = torch.stack(
1054
+ (pos_y[:, :, :, 0::2].sin(), pos_y[:, :, :, 1::2].cos()), dim=4
1055
+ ).flatten(3)
1056
+ pos = torch.cat((pos_y, pos_x), dim=3).permute(0, 3, 1, 2)
1057
+ return pos
1058
+
1059
+
1060
def split_feature(
    feature,
    num_splits=2,
    channel_last=False,
):
    """Tile a feature map into num_splits x num_splits non-overlapping
    windows, stacking the tiles along the batch dimension (row-major
    tile order). Inverse of `merge_splits`."""
    k = num_splits
    if channel_last:  # [B, H, W, C]
        b, h, w, c = feature.size()
        assert h % k == 0 and w % k == 0

        tiles = feature.view(b, k, h // k, k, w // k, c)
        feature = tiles.permute(0, 1, 3, 2, 4, 5).reshape(
            b * k * k, h // k, w // k, c
        )  # [B*K*K, H/K, W/K, C]
    else:  # [B, C, H, W]
        b, c, h, w = feature.size()
        assert h % k == 0 and w % k == 0

        tiles = feature.view(b, c, k, h // k, k, w // k)
        feature = tiles.permute(0, 2, 4, 1, 3, 5).reshape(
            b * k * k, c, h // k, w // k
        )  # [B*K*K, C, H/K, W/K]

    return feature
1093
+
1094
+
1095
def merge_splits(
    splits,
    num_splits=2,
    channel_last=False,
):
    """Reassemble num_splits x num_splits tiles (stacked along batch in
    row-major order) back into a full feature map. Inverse of
    `split_feature`."""
    k = num_splits
    if channel_last:  # [B*K*K, H/K, W/K, C]
        b, h, w, c = splits.size()
        nb = b // k // k

        merge = (
            splits.view(nb, k, k, h, w, c)
            .permute(0, 1, 3, 2, 4, 5)
            .contiguous()
            .view(nb, k * h, k * w, c)
        )  # [B, H, W, C]
    else:  # [B*K*K, C, H/K, W/K]
        b, c, h, w = splits.size()
        nb = b // k // k

        merge = (
            splits.view(nb, k, k, c, h, w)
            .permute(0, 3, 1, 4, 2, 5)
            .contiguous()
            .view(nb, c, k * h, k * w)
        )  # [B, C, H, W]

    return merge
1122
+
1123
+
1124
def normalize_img(img0, img1):
    """Standardize both images with ImageNet channel statistics.

    Returns (img0, img1) with mean subtracted and divided by std,
    broadcast over [B, 3, H, W].
    """
    imagenet_mean = torch.tensor([0.485, 0.456, 0.406]).view(1, 3, 1, 1).to(img1.device)
    imagenet_std = torch.tensor([0.229, 0.224, 0.225]).view(1, 3, 1, 1).to(img1.device)
    return (img0 - imagenet_mean) / imagenet_std, (img1 - imagenet_mean) / imagenet_std
1133
+
1134
+
1135
def feature_add_position(feature0, feature1, attn_splits, feature_channels):
    """Add the same sinusoidal position embedding to both feature maps.

    When attn_splits > 1 the embedding is computed per local window (the
    maps are split, embedded, then merged back) so it matches the windowed
    attention used downstream.
    """
    pos_enc = PositionEmbeddingSine(num_pos_feats=feature_channels // 2)

    if attn_splits <= 1:
        position = pos_enc(feature0)
        return feature0 + position, feature1 + position

    # add position inside each split window
    f0 = split_feature(feature0, num_splits=attn_splits)
    f1 = split_feature(feature1, num_splits=attn_splits)

    position = pos_enc(f0)
    f0 = f0 + position
    f1 = f1 + position

    feature0 = merge_splits(f0, num_splits=attn_splits)
    feature1 = merge_splits(f1, num_splits=attn_splits)
    return feature0, feature1
1156
+
1157
+
1158
class GMFlow(nn.Module):
    """GMFlow optical-flow network (global matching with Transformer features).

    Coarse-to-fine over `num_scales` scales: CNN backbone features are
    enhanced by a Transformer, matched via correlation + softmax (global or
    local depending on the per-scale radius), propagated with flow
    self-attention, and finally convex-upsampled to full resolution.
    """

    def __init__(
        self,
        num_scales=2,
        upsample_factor=4,
        feature_channels=128,
        attention_type="swin",
        num_transformer_layers=6,
        ffn_dim_expansion=4,
        num_head=1,
        **kwargs,
    ):
        super(GMFlow, self).__init__()

        self.num_scales = num_scales
        self.feature_channels = feature_channels
        self.upsample_factor = upsample_factor
        self.attention_type = attention_type
        self.num_transformer_layers = num_transformer_layers

        # CNN backbone
        self.backbone = CNNEncoder(
            output_dim=feature_channels, num_output_scales=num_scales
        )

        # Transformer
        self.transformer = FeatureTransformer(
            num_layers=num_transformer_layers,
            d_model=feature_channels,
            nhead=num_head,
            attention_type=attention_type,
            ffn_dim_expansion=ffn_dim_expansion,
        )

        # flow propagation with self-attn
        self.feature_flow_attn = FeatureFlowAttention(in_channels=feature_channels)

        # convex upsampling: concat feature0 and flow as input
        self.upsampler = nn.Sequential(
            nn.Conv2d(2 + feature_channels, 256, 3, 1, 1),
            nn.ReLU(inplace=True),
            nn.Conv2d(256, upsample_factor**2 * 9, 1, 1, 0),
        )

    def extract_feature(self, img0, img1):
        """Run both images through the backbone as one batch and return two
        per-scale feature lists, ordered low -> high resolution."""
        concat = torch.cat((img0, img1), dim=0)  # [2B, C, H, W]
        features = self.backbone(
            concat
        )  # list of [2B, C, H, W], resolution from high to low

        # reverse: resolution from low to high
        features = features[::-1]

        feature0, feature1 = [], []

        for i in range(len(features)):
            feature = features[i]
            chunks = torch.chunk(feature, 2, 0)  # tuple
            feature0.append(chunks[0])
            feature1.append(chunks[1])

        return feature0, feature1

    def upsample_flow(
        self,
        flow,
        feature,
        bilinear=False,
        upsample_factor=8,
    ):
        """Upsample `flow` either bilinearly (values rescaled by the factor)
        or by RAFT-style convex combination with weights predicted from
        `feature` (uses self.upsample_factor in that branch)."""
        if bilinear:
            up_flow = (
                F.interpolate(
                    flow,
                    scale_factor=upsample_factor,
                    mode="bilinear",
                    align_corners=True,
                )
                * upsample_factor
            )

        else:
            # convex upsampling
            concat = torch.cat((flow, feature), dim=1)

            mask = self.upsampler(concat)
            b, flow_channel, h, w = flow.shape
            mask = mask.view(
                b, 1, 9, self.upsample_factor, self.upsample_factor, h, w
            )  # [B, 1, 9, K, K, H, W]
            mask = torch.softmax(mask, dim=2)

            # each fine pixel = convex combination of the 3x3 coarse
            # neighborhood; flow magnitudes are rescaled by the factor
            up_flow = F.unfold(self.upsample_factor * flow, [3, 3], padding=1)
            up_flow = up_flow.view(
                b, flow_channel, 9, 1, 1, h, w
            )  # [B, 2, 9, 1, 1, H, W]

            up_flow = torch.sum(mask * up_flow, dim=2)  # [B, 2, K, K, H, W]
            up_flow = up_flow.permute(0, 1, 4, 2, 5, 3)  # [B, 2, K, H, K, W]
            up_flow = up_flow.reshape(
                b, flow_channel, self.upsample_factor * h, self.upsample_factor * w
            )  # [B, 2, K*H, K*W]

        return up_flow

    def forward(
        self,
        img0,
        img1,
        attn_splits_list=[2, 8],
        corr_radius_list=[-1, 4],
        prop_radius_list=[-1, 1],
        pred_bidir_flow=False,
        **kwargs,
    ):
        """Estimate flow from img0 to img1.

        The per-scale lists (one entry per scale, coarse to fine) choose the
        attention window split, the correlation radius (-1 = global matching)
        and the propagation radius (-1 = global propagation). With
        `pred_bidir_flow`, forward and backward flow are stacked along the
        batch dimension. The list defaults are never mutated, so sharing them
        across calls is safe. Returns the upsampled flow of the finest scale.
        """
        img0, img1 = normalize_img(img0, img1)  # [B, 3, H, W]

        # resolution low to high
        feature0_list, feature1_list = self.extract_feature(
            img0, img1
        )  # list of features

        flow = None

        assert (
            len(attn_splits_list)
            == len(corr_radius_list)
            == len(prop_radius_list)
            == self.num_scales
        )

        for scale_idx in range(self.num_scales):
            feature0, feature1 = feature0_list[scale_idx], feature1_list[scale_idx]

            if pred_bidir_flow and scale_idx > 0:
                # predicting bidirectional flow with refinement
                feature0, feature1 = torch.cat((feature0, feature1), dim=0), torch.cat(
                    (feature1, feature0), dim=0
                )

            upsample_factor = self.upsample_factor * (
                2 ** (self.num_scales - 1 - scale_idx)
            )

            if scale_idx > 0:
                # carry the coarser flow up one scale (double resolution and magnitude)
                flow = (
                    F.interpolate(
                        flow, scale_factor=2, mode="bilinear", align_corners=True
                    )
                    * 2
                )

            if flow is not None:
                flow = flow.detach()
                # warp feature1 toward feature0 so matching estimates residual flow
                feature1 = flow_warp(feature1, flow)  # [B, C, H, W]

            attn_splits = attn_splits_list[scale_idx]
            corr_radius = corr_radius_list[scale_idx]
            prop_radius = prop_radius_list[scale_idx]

            # add position to features
            feature0, feature1 = feature_add_position(
                feature0, feature1, attn_splits, self.feature_channels
            )

            # Transformer
            feature0, feature1 = self.transformer(
                feature0, feature1, attn_num_splits=attn_splits
            )

            # correlation and softmax
            if corr_radius == -1:  # global matching
                flow_pred = global_correlation_softmax(
                    feature0, feature1, pred_bidir_flow
                )[0]
            else:  # local matching
                flow_pred = local_correlation_softmax(feature0, feature1, corr_radius)[
                    0
                ]

            # flow or residual flow
            flow = flow + flow_pred if flow is not None else flow_pred

            # upsample to the original resolution for supervison
            if (
                self.training
            ):  # only need to upsample intermediate flow predictions at training time
                # NOTE(review): flow_bilinear is assigned but never used or
                # returned here — presumably the training loss collection was
                # stripped for inference; confirm before relying on it.
                flow_bilinear = self.upsample_flow(
                    flow, None, bilinear=True, upsample_factor=upsample_factor
                )

            # flow propagation with self-attn
            if pred_bidir_flow and scale_idx == 0:
                feature0 = torch.cat(
                    (feature0, feature1), dim=0
                )  # [2*B, C, H, W] for propagation
            flow = self.feature_flow_attn(
                feature0,
                flow.detach(),
                local_window_attn=prop_radius > 0,
                local_window_radius=prop_radius,
            )

            # bilinear upsampling at training time except the last one
            if self.training and scale_idx < self.num_scales - 1:
                flow_up = self.upsample_flow(
                    flow, feature0, bilinear=True, upsample_factor=upsample_factor
                )

            if scale_idx == self.num_scales - 1:
                # final convex upsampling; this is the value returned below
                flow_up = self.upsample_flow(flow, feature0)

        return flow_up
1371
+
1372
+
1373
# cache: str(flow.shape) -> normalized identity grid, reused across calls
backwarp_tenGrid = {}


def backwarp(tenIn, tenflow):
    """Backward-warp `tenIn` by `tenflow` using grid_sample.

    The identity grid (in [-1, 1]) is built once per flow shape and cached
    in `backwarp_tenGrid`.
    NOTE(review): the cache key is only the shape string — it ignores dtype
    and device, and the grid is moved to get_torch_device(); this assumes
    all flows live on that single device — confirm for multi-GPU use.
    """
    if str(tenflow.shape) not in backwarp_tenGrid:
        tenHor = (
            torch.linspace(
                start=-1.0,
                end=1.0,
                steps=tenflow.shape[3],
                dtype=tenflow.dtype,
                device=tenflow.device,
            )
            .view(1, 1, 1, -1)
            .repeat(1, 1, tenflow.shape[2], 1)
        )
        tenVer = (
            torch.linspace(
                start=-1.0,
                end=1.0,
                steps=tenflow.shape[2],
                dtype=tenflow.dtype,
                device=tenflow.device,
            )
            .view(1, 1, -1, 1)
            .repeat(1, 1, 1, tenflow.shape[3])
        )

        backwarp_tenGrid[str(tenflow.shape)] = torch.cat([tenHor, tenVer], 1).to(get_torch_device())
    # end

    # scale pixel-space flow into the grid's [-1, 1] coordinate range
    tenflow = torch.cat(
        [
            tenflow[:, 0:1, :, :] / ((tenIn.shape[3] - 1.0) / 2.0),
            tenflow[:, 1:2, :, :] / ((tenIn.shape[2] - 1.0) / 2.0),
        ],
        1,
    )

    return torch.nn.functional.grid_sample(
        input=tenIn,
        grid=(backwarp_tenGrid[str(tenflow.shape)] + tenflow).permute(0, 2, 3, 1),
        mode="bilinear",
        padding_mode="zeros",
        align_corners=True,
    )
1419
+
1420
+
1421
class MetricNet(nn.Module):
    """Predicts two per-pixel splatting metrics (one per flow direction).

    Input evidence: both images (6 ch), negated photometric warping errors
    (2 ch), flows normalized to [-1, 1] range (4 ch), and forward/backward
    occlusion masks (2 ch) — 14 channels total, matching `metric_in`.
    Outputs are bounded to (-10, 10) by tanh * 10.
    """

    def __init__(self):
        super(MetricNet, self).__init__()
        self.metric_in = nn.Conv2d(14, 64, 3, 1, 1)
        # three residual refinement stages (skip added in forward)
        self.metric_net1 = nn.Sequential(nn.PReLU(), nn.Conv2d(64, 64, 3, 1, 1))
        self.metric_net2 = nn.Sequential(nn.PReLU(), nn.Conv2d(64, 64, 3, 1, 1))
        self.metric_net3 = nn.Sequential(nn.PReLU(), nn.Conv2d(64, 64, 3, 1, 1))
        self.metric_out = nn.Sequential(nn.PReLU(), nn.Conv2d(64, 2, 3, 1, 1))

    def forward(self, img0, img1, flow01, flow10):
        # per-pixel photometric error of warping each image with its flow
        metric0 = F.l1_loss(img0, backwarp(img1, flow01), reduction="none").mean(
            [1], True
        )
        metric1 = F.l1_loss(img1, backwarp(img0, flow10), reduction="none").mean(
            [1], True
        )

        fwd_occ, bwd_occ = forward_backward_consistency_check(flow01, flow10)

        # normalize flows by half the image extent (same convention as backwarp)
        flow01 = torch.cat(
            [
                flow01[:, 0:1, :, :] / ((flow01.shape[3] - 1.0) / 2.0),
                flow01[:, 1:2, :, :] / ((flow01.shape[2] - 1.0) / 2.0),
            ],
            1,
        )
        flow10 = torch.cat(
            [
                flow10[:, 0:1, :, :] / ((flow10.shape[3] - 1.0) / 2.0),
                flow10[:, 1:2, :, :] / ((flow10.shape[2] - 1.0) / 2.0),
            ],
            1,
        )

        img = torch.cat((img0, img1), 1)
        metric = torch.cat((-metric0, -metric1), 1)  # negated: lower error => higher metric
        flow = torch.cat((flow01, flow10), 1)
        occ = torch.cat((fwd_occ.unsqueeze(1), bwd_occ.unsqueeze(1)), 1)

        feat = self.metric_in(torch.cat((img, metric, flow, occ), 1))
        feat = self.metric_net1(feat) + feat
        feat = self.metric_net2(feat) + feat
        feat = self.metric_net3(feat) + feat
        metric = self.metric_out(feat)

        metric = torch.tanh(metric) * 10

        # channel 0: metric for img0->img1 splat; channel 1: the reverse
        return metric[:, :1], metric[:, 1:2]
1469
+
1470
+
1471
class FeatureNet(nn.Module):
    """Three-stage feature pyramid extractor.

    Each block halves resolution via a stride-2 conv, yielding features at
    1/2 (64 ch), 1/4 (128 ch) and 1/8 (192 ch) of the input resolution.
    """

    def __init__(self):
        super(FeatureNet, self).__init__()
        self.block1 = nn.Sequential(
            nn.PReLU(),
            nn.Conv2d(3, 64, 3, 2, 1),
            nn.PReLU(),
            nn.Conv2d(64, 64, 3, 1, 1),
        )
        self.block2 = nn.Sequential(
            nn.PReLU(),
            nn.Conv2d(64, 128, 3, 2, 1),
            nn.PReLU(),
            nn.Conv2d(128, 128, 3, 1, 1),
        )
        self.block3 = nn.Sequential(
            nn.PReLU(),
            nn.Conv2d(128, 192, 3, 2, 1),
            nn.PReLU(),
            nn.Conv2d(192, 192, 3, 1, 1),
        )

    def forward(self, x):
        f1 = self.block1(x)
        f2 = self.block2(f1)
        f3 = self.block3(f2)
        return f1, f2, f3
1501
+
1502
+
1503
+ # Residual Block
1504
+ def ResidualBlock(in_channels, out_channels, stride=1):
1505
+ return torch.nn.Sequential(
1506
+ nn.PReLU(),
1507
+ nn.Conv2d(
1508
+ in_channels,
1509
+ out_channels,
1510
+ kernel_size=3,
1511
+ stride=stride,
1512
+ padding=1,
1513
+ bias=True,
1514
+ ),
1515
+ nn.PReLU(),
1516
+ nn.Conv2d(
1517
+ out_channels,
1518
+ out_channels,
1519
+ kernel_size=3,
1520
+ stride=stride,
1521
+ padding=1,
1522
+ bias=True,
1523
+ ),
1524
+ )
1525
+
1526
+
1527
+ # downsample block
1528
+ def DownsampleBlock(in_channels, out_channels, stride=2):
1529
+ return torch.nn.Sequential(
1530
+ nn.PReLU(),
1531
+ nn.Conv2d(
1532
+ in_channels,
1533
+ out_channels,
1534
+ kernel_size=3,
1535
+ stride=stride,
1536
+ padding=1,
1537
+ bias=True,
1538
+ ),
1539
+ nn.PReLU(),
1540
+ nn.Conv2d(
1541
+ out_channels, out_channels, kernel_size=3, stride=1, padding=1, bias=True
1542
+ ),
1543
+ )
1544
+
1545
+
1546
+ # upsample block
1547
+ def UpsampleBlock(in_channels, out_channels, stride=2):
1548
+ return torch.nn.Sequential(
1549
+ nn.PReLU(),
1550
+ nn.ConvTranspose2d(
1551
+ in_channels,
1552
+ out_channels,
1553
+ kernel_size=4,
1554
+ stride=stride,
1555
+ padding=1,
1556
+ bias=True,
1557
+ ),
1558
+ nn.PReLU(),
1559
+ nn.Conv2d(
1560
+ out_channels, out_channels, kernel_size=3, stride=1, padding=1, bias=True
1561
+ ),
1562
+ )
1563
+
1564
+
1565
class PixelShuffleBlcok(nn.Module):
    """2x upsampling head: conv -> PixelShuffle(2) -> conv.

    (Class name typo is kept intentionally — it is part of the public
    interface and of saved checkpoints.)
    """

    def __init__(self, in_feat, num_feat, num_out_ch):
        super(PixelShuffleBlcok, self).__init__()
        self.conv_before_upsample = nn.Sequential(
            nn.Conv2d(in_feat, num_feat, 3, 1, 1), nn.PReLU()
        )
        self.upsample = nn.Sequential(
            nn.Conv2d(num_feat, 4 * num_feat, 3, 1, 1), nn.PixelShuffle(2)
        )
        self.conv_last = nn.Conv2d(num_feat, num_out_ch, 3, 1, 1)

    def forward(self, x):
        feat = self.conv_before_upsample(x)
        return self.conv_last(self.upsample(feat))
1580
+
1581
+
1582
+ # grid network
1583
+ class GridNet(nn.Module):
1584
+ def __init__(
1585
+ self,
1586
+ in_channels=12,
1587
+ in_channels1=128,
1588
+ in_channels2=256,
1589
+ in_channels3=384,
1590
+ out_channels=3,
1591
+ ):
1592
+ super(GridNet, self).__init__()
1593
+
1594
+ self.residual_model_head = ResidualBlock(in_channels, 64)
1595
+ self.residual_model_head1 = ResidualBlock(in_channels1, 64)
1596
+ self.residual_model_head2 = ResidualBlock(in_channels2, 128)
1597
+ self.residual_model_head3 = ResidualBlock(in_channels3, 192)
1598
+
1599
+ self.residual_model_01 = ResidualBlock(64, 64)
1600
+ # self.residual_model_02=ResidualBlock(64, 64)
1601
+ # self.residual_model_03=ResidualBlock(64, 64)
1602
+ self.residual_model_04 = ResidualBlock(64, 64)
1603
+ self.residual_model_05 = ResidualBlock(64, 64)
1604
+ self.residual_model_tail = PixelShuffleBlcok(64, 64, out_channels)
1605
+
1606
+ self.residual_model_11 = ResidualBlock(128, 128)
1607
+ # self.residual_model_12=ResidualBlock(128, 128)
1608
+ # self.residual_model_13=ResidualBlock(128, 128)
1609
+ self.residual_model_14 = ResidualBlock(128, 128)
1610
+ self.residual_model_15 = ResidualBlock(128, 128)
1611
+
1612
+ self.residual_model_21 = ResidualBlock(192, 192)
1613
+ # self.residual_model_22=ResidualBlock(192, 192)
1614
+ # self.residual_model_23=ResidualBlock(192, 192)
1615
+ self.residual_model_24 = ResidualBlock(192, 192)
1616
+ self.residual_model_25 = ResidualBlock(192, 192)
1617
+
1618
+ #
1619
+
1620
+ self.downsample_model_10 = DownsampleBlock(64, 128)
1621
+ self.downsample_model_20 = DownsampleBlock(128, 192)
1622
+
1623
+ self.downsample_model_11 = DownsampleBlock(64, 128)
1624
+ self.downsample_model_21 = DownsampleBlock(128, 192)
1625
+
1626
+ # self.downsample_model_12=DownsampleBlock(64, 128)
1627
+ # self.downsample_model_22=DownsampleBlock(128, 192)
1628
+
1629
+ #
1630
+
1631
+ # self.upsample_model_03=UpsampleBlock(128, 64)
1632
+ # self.upsample_model_13=UpsampleBlock(192, 128)
1633
+
1634
+ self.upsample_model_04 = UpsampleBlock(128, 64)
1635
+ self.upsample_model_14 = UpsampleBlock(192, 128)
1636
+
1637
+ self.upsample_model_05 = UpsampleBlock(128, 64)
1638
+ self.upsample_model_15 = UpsampleBlock(192, 128)
1639
+
1640
+ def forward(self, x, x1, x2, x3):
1641
+ X00 = self.residual_model_head(x) + self.residual_model_head1(
1642
+ x1
1643
+ ) # --- 182 ~ 185
1644
+ # X10 = self.residual_model_head1(x1)
1645
+
1646
+ X01 = self.residual_model_01(X00) + X00 # --- 208 ~ 211 ,AddBackward1213
1647
+
1648
+ X10 = self.downsample_model_10(X00) + self.residual_model_head2(
1649
+ x2
1650
+ ) # --- 186 ~ 189
1651
+ X20 = self.downsample_model_20(X10) + self.residual_model_head3(
1652
+ x3
1653
+ ) # --- 190 ~ 193
1654
+
1655
+ residual_11 = (
1656
+ self.residual_model_11(X10) + X10
1657
+ ) # 201 ~ 204 , sum AddBackward1206
1658
+ downsample_11 = self.downsample_model_11(X01) # 214 ~ 217
1659
+ X11 = residual_11 + downsample_11 # --- AddBackward1218
1660
+
1661
+ residual_21 = (
1662
+ self.residual_model_21(X20) + X20
1663
+ ) # 194 ~ 197 , sum AddBackward1199
1664
+ downsample_21 = self.downsample_model_21(X11) # 219 ~ 222
1665
+ X21 = residual_21 + downsample_21 # AddBackward1223
1666
+
1667
+ X24 = self.residual_model_24(X21) + X21 # --- 224 ~ 227 , AddBackward1229
1668
+ X25 = self.residual_model_25(X24) + X24 # --- 230 ~ 233 , AddBackward1235
1669
+
1670
+ upsample_14 = self.upsample_model_14(X24) # 242 ~ 246
1671
+ residual_14 = self.residual_model_14(X11) + X11 # 248 ~ 251, AddBackward1253
1672
+ X14 = upsample_14 + residual_14 # --- AddBackward1254
1673
+
1674
+ upsample_04 = self.upsample_model_04(X14) # 268 ~ 272
1675
+ residual_04 = self.residual_model_04(X01) + X01 # 274 ~ 277, AddBackward1279
1676
+ X04 = upsample_04 + residual_04 # --- AddBackward1280
1677
+
1678
+ upsample_15 = self.upsample_model_15(X25) # 236 ~ 240
1679
+ residual_15 = self.residual_model_15(X14) + X14 # 255 ~ 258, AddBackward1260
1680
+ X15 = upsample_15 + residual_15 # AddBackward1261
1681
+
1682
+ upsample_05 = self.upsample_model_05(X15) # 262 ~ 266
1683
+ residual_05 = self.residual_model_05(X04) + X04 # 281 ~ 284,AddBackward1286
1684
+ X05 = upsample_05 + residual_05 # AddBackward1287
1685
+
1686
+ X_tail = self.residual_model_tail(X05) # 288 ~ 291
1687
+
1688
+ return X_tail
1689
+ # end
1690
+
1691
class Model:
    """GMFSS Fortuna model bundle: flow (GMFlow) + splat metrics (MetricNet)
    + feature pyramid (FeatureNet) + fusion (GridNet).

    Not an nn.Module — it holds four sub-networks and mirrors eval/to/load
    over them. `reuse` computes everything that depends only on the frame
    pair; `inference` then synthesizes a frame for each timestep.
    """

    def __init__(self):
        self.flownet = GMFlow()
        self.metricnet = MetricNet()
        self.feat_ext = FeatureNet()
        self.fusionnet = GridNet()
        self.version = 3.9

    def eval(self):
        """Put all four sub-networks in eval mode."""
        self.flownet.eval()
        self.metricnet.eval()
        self.feat_ext.eval()
        self.fusionnet.eval()

    def device(self):
        """Move all sub-networks to the module-level `device` global
        (defined elsewhere in this file)."""
        self.flownet.to(device)
        self.metricnet.to(device)
        self.feat_ext.to(device)
        self.fusionnet.to(device)

    def load_model(self, path_dict):
        """Load the four checkpoints from `path_dict` keyed by sub-network.
        NOTE(review): torch.load without map_location — assumes checkpoints
        were saved on a compatible device; confirm."""
        #models/GMFSS_fortuna_flownet.pkl
        self.flownet.load_state_dict(torch.load(path_dict["flownet"]))
        #models/GMFSS_fortuna_metric.pkl
        self.metricnet.load_state_dict(torch.load(path_dict["metricnet"]))
        #models/GMFSS_fortuna_feat.pkl
        self.feat_ext.load_state_dict(torch.load(path_dict["feat_ext"]))
        #models/GMFSS_fortuna_fusionnet.pkl
        self.fusionnet.load_state_dict(torch.load(path_dict["fusionnet"]))

    def reuse(self, img0, img1, scale):
        """Precompute per-pair state: pyramid features at full resolution,
        then bidirectional flow and metrics at half resolution (optionally
        estimating flow at an extra `scale` and resampling back)."""
        feat11, feat12, feat13 = self.feat_ext(img0)
        feat21, feat22, feat23 = self.feat_ext(img1)

        # flow/metric branch works at half resolution
        img0 = F.interpolate(
            img0, scale_factor=0.5, mode="bilinear", align_corners=False
        )
        img1 = F.interpolate(
            img1, scale_factor=0.5, mode="bilinear", align_corners=False
        )

        if scale != 1.0:
            imgf0 = F.interpolate(
                img0, scale_factor=scale, mode="bilinear", align_corners=False
            )
            imgf1 = F.interpolate(
                img1, scale_factor=scale, mode="bilinear", align_corners=False
            )
        else:
            imgf0 = img0
            imgf1 = img1
        # NOTE(review): return_flow is swallowed by **kwargs in
        # GMFlow.forward above — presumably kept for signature parity with
        # the union variant; confirm it is intentionally a no-op here.
        flow01 = self.flownet(imgf0, imgf1, return_flow=True)
        flow10 = self.flownet(imgf1, imgf0, return_flow=True)
        if scale != 1.0:
            # resample flow back to half resolution and rescale magnitudes
            flow01 = (
                F.interpolate(
                    flow01,
                    scale_factor=1.0 / scale,
                    mode="bilinear",
                    align_corners=False,
                )
                / scale
            )
            flow10 = (
                F.interpolate(
                    flow10,
                    scale_factor=1.0 / scale,
                    mode="bilinear",
                    align_corners=False,
                )
                / scale
            )

        metric0, metric1 = self.metricnet(img0, img1, flow01, flow10)

        return (
            flow01,
            flow10,
            metric0,
            metric1,
            feat11,
            feat12,
            feat13,
            feat21,
            feat22,
            feat23,
        )

    def inference(
        self,
        img0,
        img1,
        flow01,
        flow10,
        metric0,
        metric1,
        feat11,
        feat12,
        feat13,
        feat21,
        feat22,
        feat23,
        timestep,
    ):
        """Synthesize the frame at `timestep` in (0, 1) by softmax-splatting
        both images and their feature pyramids toward t, then fusing with
        GridNet. Returns the frame clamped to [0, 1]."""
        # time-scaled flows and metrics toward the target instant
        F1t = timestep * flow01
        F2t = (1 - timestep) * flow10

        Z1t = timestep * metric0
        Z2t = (1 - timestep) * metric1

        # splat the (half-resolution) images; softsplat is imported
        # elsewhere in this file
        img0 = F.interpolate(
            img0, scale_factor=0.5, mode="bilinear", align_corners=False
        )
        I1t = softsplat(img0, F1t, Z1t, strMode="soft")
        img1 = F.interpolate(
            img1, scale_factor=0.5, mode="bilinear", align_corners=False
        )
        I2t = softsplat(img1, F2t, Z2t, strMode="soft")

        # splat pyramid level 1 with the full-size flow/metric
        feat1t1 = softsplat(feat11, F1t, Z1t, strMode="soft")
        feat2t1 = softsplat(feat21, F2t, Z2t, strMode="soft")

        # level 2: flow downscaled 2x (magnitude rescaled accordingly)
        F1td = (
            F.interpolate(F1t, scale_factor=0.5, mode="bilinear", align_corners=False)
            * 0.5
        )
        Z1d = F.interpolate(Z1t, scale_factor=0.5, mode="bilinear", align_corners=False)
        feat1t2 = softsplat(feat12, F1td, Z1d, strMode="soft")
        F2td = (
            F.interpolate(F2t, scale_factor=0.5, mode="bilinear", align_corners=False)
            * 0.5
        )
        Z2d = F.interpolate(Z2t, scale_factor=0.5, mode="bilinear", align_corners=False)
        feat2t2 = softsplat(feat22, F2td, Z2d, strMode="soft")

        # level 3: flow downscaled 4x
        F1tdd = (
            F.interpolate(F1t, scale_factor=0.25, mode="bilinear", align_corners=False)
            * 0.25
        )
        Z1dd = F.interpolate(
            Z1t, scale_factor=0.25, mode="bilinear", align_corners=False
        )
        feat1t3 = softsplat(feat13, F1tdd, Z1dd, strMode="soft")
        F2tdd = (
            F.interpolate(F2t, scale_factor=0.25, mode="bilinear", align_corners=False)
            * 0.25
        )
        Z2dd = F.interpolate(
            Z2t, scale_factor=0.25, mode="bilinear", align_corners=False
        )
        feat2t3 = softsplat(feat23, F2tdd, Z2dd, strMode="soft")

        # fuse images + splatted features into the output frame
        out = self.fusionnet(
            torch.cat([img0, I1t, I2t, img1], dim=1),
            torch.cat([feat1t1, feat2t1], dim=1),
            torch.cat([feat1t2, feat2t2], dim=1),
            torch.cat([feat1t3, feat2t3], dim=1),
        )

        return torch.clamp(out, 0, 1)
vfi_models/gmfss_fortuna/GMFSS_Fortuna_union.py ADDED
@@ -0,0 +1,23 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import itertools
2
+ import numpy as np
3
+ import vapoursynth as vs
4
+ from .GMFSS_Fortuna_union_arch import Model_inference
5
+ import torch
6
+
7
+
8
class GMFSS_Fortuna_union:
    """Inference wrapper around the union GMFSS-Fortuna interpolation model.

    Globally disables gradients and enables cuDNN (with benchmark
    autotuning), then exposes a single ``execute`` entry point that blends
    two frames at a given timestep.
    """

    def __init__(self):
        # Two input frames per call; no inter-call frame caching.
        self.cache = False
        self.amount_input_img = 2

        torch.set_grad_enabled(False)
        torch.backends.cudnn.enabled = True
        torch.backends.cudnn.benchmark = True

        self.model = Model_inference()
        self.model.eval()

    def execute(self, I0, I1, timestep):
        """Return the interpolated frame between I0 and I1 at `timestep` (moved to CPU)."""
        with torch.inference_mode():
            return self.model(I0, I1, timestep).cpu()
vfi_models/gmfss_fortuna/GMFSS_Fortuna_union_arch.py ADDED
@@ -0,0 +1,1857 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ https://github.com/98mxr/GMFSS_Fortuna/blob/b5d0bd544e3f1eee6a059e49c69bcd3124c8343c/model/GMFSS_infer_u.py
3
+ https://github.com/98mxr/GMFSS_Fortuna/blob/b5d0bd544e3f1eee6a059e49c69bcd3124c8343c/model/softsplat.py
4
+ https://github.com/98mxr/GMFSS_Fortuna/blob/b5d0bd544e3f1eee6a059e49c69bcd3124c8343c/model/FusionNet_u.py
5
+ https://github.com/98mxr/GMFSS_Fortuna/blob/b5d0bd544e3f1eee6a059e49c69bcd3124c8343c/model/FeatureNet.py
6
+ https://github.com/98mxr/GMFSS_Fortuna/blob/b5d0bd544e3f1eee6a059e49c69bcd3124c8343c/model/MetricNet.py
7
+ https://github.com/98mxr/GMFSS_Fortuna/blob/b5d0bd544e3f1eee6a059e49c69bcd3124c8343c/model/IFNet_HDv3.py
8
+ https://github.com/98mxr/GMFSS_Fortuna/blob/b5d0bd544e3f1eee6a059e49c69bcd3124c8343c/model/gmflow/gmflow.py
9
+ https://github.com/98mxr/GMFSS_Fortuna/blob/b5d0bd544e3f1eee6a059e49c69bcd3124c8343c/model/gmflow/utils.py
10
+ https://github.com/98mxr/GMFSS_Fortuna/blob/b5d0bd544e3f1eee6a059e49c69bcd3124c8343c/model/gmflow/position.py
11
+ https://github.com/98mxr/GMFSS_Fortuna/blob/b5d0bd544e3f1eee6a059e49c69bcd3124c8343c/model/gmflow/geometry.py
12
+ https://github.com/98mxr/GMFSS_Fortuna/blob/b5d0bd544e3f1eee6a059e49c69bcd3124c8343c/model/gmflow/matching.py
13
+ https://github.com/98mxr/GMFSS_Fortuna/blob/b5d0bd544e3f1eee6a059e49c69bcd3124c8343c/model/gmflow/transformer.py
14
+ https://github.com/98mxr/GMFSS_Fortuna/blob/b5d0bd544e3f1eee6a059e49c69bcd3124c8343c/model/gmflow/backbone.py
15
+ https://github.com/98mxr/GMFSS_Fortuna/blob/b5d0bd544e3f1eee6a059e49c69bcd3124c8343c/model/gmflow/trident_conv.py
16
+ https://github.com/98mxr/GMFSS_Fortuna/blob/b5d0bd544e3f1eee6a059e49c69bcd3124c8343c/model/warplayer.py
17
+ """
18
+
19
+ from torch import nn
20
+ from torch.nn import functional as F
21
+ from torch.nn.modules.utils import _pair
22
+ import torch
23
+ import torch.nn as nn
24
+ import torch.nn.functional as F
25
+ import torch
26
+ import math
27
+ from vfi_models.rife.rife_arch import IFNet
28
+ from vfi_models.ops import softsplat
29
+ from comfy.model_management import get_torch_device
30
+
31
+ device = get_torch_device()
32
+ backwarp_tenGrid = {}
33
+
34
+
35
def warp(tenInput, tenFlow):
    """Backward-warp `tenInput` with the per-pixel displacement field `tenFlow`.

    Args:
        tenInput: [B, C, H, W] tensor to sample from.
        tenFlow:  [B, 2, H, W] flow in pixels (x displacement in channel 0,
                  y displacement in channel 1).

    Returns:
        [B, C, H, W] tensor bilinearly sampled at ``base_grid + flow`` with
        border padding and align_corners=True.
    """
    # Cache the normalized base grid per (device, shape) on the function
    # itself; building it is the only work reusable across calls.
    cache = warp.__dict__.setdefault("_grid_cache", {})
    k = (str(tenFlow.device), str(tenFlow.size()))
    if k not in cache:
        # Fix: build the grid on the *flow's* device. The original created it
        # on a module-level default device even though the cache key was the
        # flow's device, which breaks when tensors live elsewhere.
        tenHorizontal = (
            torch.linspace(-1.0, 1.0, tenFlow.shape[3], device=tenFlow.device)
            .view(1, 1, 1, tenFlow.shape[3])
            .expand(tenFlow.shape[0], -1, tenFlow.shape[2], -1)
        )
        tenVertical = (
            torch.linspace(-1.0, 1.0, tenFlow.shape[2], device=tenFlow.device)
            .view(1, 1, tenFlow.shape[2], 1)
            .expand(tenFlow.shape[0], -1, -1, tenFlow.shape[3])
        )
        cache[k] = torch.cat([tenHorizontal, tenVertical], 1)

    # Convert the pixel-space flow into the [-1, 1] range grid_sample expects.
    tenFlow = torch.cat(
        [
            tenFlow[:, 0:1, :, :] / ((tenInput.shape[3] - 1.0) / 2.0),
            tenFlow[:, 1:2, :, :] / ((tenInput.shape[2] - 1.0) / 2.0),
        ],
        1,
    )

    g = (cache[k] + tenFlow).permute(0, 2, 3, 1)
    return torch.nn.functional.grid_sample(
        input=tenInput,
        grid=g,
        mode="bilinear",
        padding_mode="border",
        align_corners=True,
    )
66
+
67
+
68
class MultiScaleTridentConv(nn.Module):
    """Shared-weight convolution applied in parallel to several branches.

    Every branch uses the same kernel/bias; each branch may have its own
    stride and padding so that inputs at different scales produce aligned
    outputs.
    """

    def __init__(
        self,
        in_channels,
        out_channels,
        kernel_size,
        stride=1,
        strides=1,
        paddings=0,
        dilations=1,
        dilation=1,
        groups=1,
        num_branch=1,
        test_branch_idx=-1,
        bias=False,
        norm=None,
        activation=None,
    ):
        super(MultiScaleTridentConv, self).__init__()
        self.in_channels = in_channels
        self.out_channels = out_channels
        self.kernel_size = _pair(kernel_size)
        self.num_branch = num_branch
        self.stride = _pair(stride)
        self.groups = groups
        self.with_bias = bias
        self.dilation = dilation

        # Broadcast scalar per-branch settings to one entry per branch.
        if isinstance(paddings, int):
            paddings = [paddings] * self.num_branch
        if isinstance(dilations, int):
            dilations = [dilations] * self.num_branch
        if isinstance(strides, int):
            strides = [strides] * self.num_branch
        self.paddings = [_pair(p) for p in paddings]
        self.dilations = [_pair(d) for d in dilations]
        self.strides = [_pair(s) for s in strides]
        self.test_branch_idx = test_branch_idx
        self.norm = norm
        self.activation = activation

        # All per-branch lists must agree with num_branch.
        assert len({self.num_branch, len(self.paddings), len(self.strides)}) == 1

        self.weight = nn.Parameter(
            torch.Tensor(out_channels, in_channels // groups, *self.kernel_size)
        )
        self.bias = nn.Parameter(torch.Tensor(out_channels)) if bias else None

        nn.init.kaiming_uniform_(self.weight, nonlinearity="relu")
        if self.bias is not None:
            nn.init.constant_(self.bias, 0)

    def forward(self, inputs):
        """Apply the shared conv to each branch; `inputs` is a list of tensors."""
        # In training (or when no test branch is chosen) all branches run;
        # otherwise a single branch runs.
        multi_branch = self.training or self.test_branch_idx == -1
        num_branch = self.num_branch if multi_branch else 1
        assert len(inputs) == num_branch

        if multi_branch:
            outputs = [
                F.conv2d(inp, self.weight, self.bias, s, p, self.dilation, self.groups)
                for inp, s, p in zip(inputs, self.strides, self.paddings)
            ]
        else:
            # NOTE(review): this index selection mirrors the upstream code;
            # the condition can only be False in this branch, so the last
            # branch's stride/padding is always the one used.
            outputs = [
                F.conv2d(
                    inputs[0],
                    self.weight,
                    self.bias,
                    self.strides[self.test_branch_idx]
                    if self.test_branch_idx == -1
                    else self.strides[-1],
                    self.paddings[self.test_branch_idx]
                    if self.test_branch_idx == -1
                    else self.paddings[-1],
                    self.dilation,
                    self.groups,
                )
            ]

        if self.norm is not None:
            outputs = [self.norm(o) for o in outputs]
        if self.activation is not None:
            outputs = [self.activation(o) for o in outputs]
        return outputs
163
+
164
+
165
class ResidualBlock_class(nn.Module):
    """Two normalized 3x3 convolutions with a (possibly projected) skip path."""

    def __init__(
        self,
        in_planes,
        planes,
        norm_layer=nn.InstanceNorm2d,
        stride=1,
        dilation=1,
    ):
        super(ResidualBlock_class, self).__init__()

        self.conv1 = nn.Conv2d(
            in_planes,
            planes,
            kernel_size=3,
            dilation=dilation,
            padding=dilation,
            stride=stride,
            bias=False,
        )
        self.conv2 = nn.Conv2d(
            planes,
            planes,
            kernel_size=3,
            dilation=dilation,
            padding=dilation,
            bias=False,
        )
        self.relu = nn.ReLU(inplace=True)

        self.norm1 = norm_layer(planes)
        self.norm2 = norm_layer(planes)

        # A 1x1 projection is needed only when the skip path changes
        # resolution or channel count.
        if stride == 1 and in_planes == planes:
            self.downsample = None
        else:
            self.norm3 = norm_layer(planes)
            self.downsample = nn.Sequential(
                nn.Conv2d(in_planes, planes, kernel_size=1, stride=stride), self.norm3
            )

    def forward(self, x):
        out = self.relu(self.norm1(self.conv1(x)))
        out = self.relu(self.norm2(self.conv2(out)))

        if self.downsample is not None:
            x = self.downsample(x)

        return self.relu(x + out)
216
+
217
+
218
class CNNEncoder(nn.Module):
    """Residual CNN backbone producing 1/8 (or multi-scale 1/4+) features."""

    def __init__(
        self,
        output_dim=128,
        norm_layer=nn.InstanceNorm2d,
        num_output_scales=1,
        **kwargs,
    ):
        super(CNNEncoder, self).__init__()
        self.num_branch = num_output_scales

        feature_dims = [64, 96, 128]

        # Stem: 7x7 stride-2 conv -> 1/2 resolution.
        self.conv1 = nn.Conv2d(
            3, feature_dims[0], kernel_size=7, stride=2, padding=3, bias=False
        )
        self.norm1 = norm_layer(feature_dims[0])
        self.relu1 = nn.ReLU(inplace=True)

        self.in_planes = feature_dims[0]
        self.layer1 = self._make_layer(
            feature_dims[0], stride=1, norm_layer=norm_layer
        )  # 1/2
        self.layer2 = self._make_layer(
            feature_dims[1], stride=2, norm_layer=norm_layer
        )  # 1/4

        # Highest resolution is 1/4 when emitting several scales, else 1/8.
        self.layer3 = self._make_layer(
            feature_dims[2],
            stride=2 if num_output_scales == 1 else 1,
            norm_layer=norm_layer,
        )

        self.conv2 = nn.Conv2d(feature_dims[2], output_dim, 1, 1, 0)

        if self.num_branch > 1:
            strides_by_branch = {4: (1, 2, 4, 8), 3: (1, 2, 4), 2: (1, 2)}
            if self.num_branch not in strides_by_branch:
                raise ValueError

            # Shared conv applied with different strides to get the pyramid.
            self.trident_conv = MultiScaleTridentConv(
                output_dim,
                output_dim,
                kernel_size=3,
                strides=strides_by_branch[self.num_branch],
                paddings=1,
                num_branch=self.num_branch,
            )

        for m in self.modules():
            if isinstance(m, nn.Conv2d):
                nn.init.kaiming_normal_(m.weight, mode="fan_out", nonlinearity="relu")
            elif isinstance(m, (nn.BatchNorm2d, nn.InstanceNorm2d, nn.GroupNorm)):
                if m.weight is not None:
                    nn.init.constant_(m.weight, 1)
                if m.bias is not None:
                    nn.init.constant_(m.bias, 0)

    def _make_layer(self, dim, stride=1, dilation=1, norm_layer=nn.InstanceNorm2d):
        # Two residual blocks; only the first can change stride/width.
        first = ResidualBlock_class(
            self.in_planes, dim, norm_layer=norm_layer, stride=stride, dilation=dilation
        )
        second = ResidualBlock_class(
            dim, dim, norm_layer=norm_layer, stride=1, dilation=dilation
        )
        self.in_planes = dim
        return nn.Sequential(first, second)

    def forward(self, x):
        x = self.relu1(self.norm1(self.conv1(x)))

        x = self.layer1(x)  # 1/2
        x = self.layer2(x)  # 1/4
        x = self.layer3(x)  # 1/8 or 1/4
        x = self.conv2(x)

        if self.num_branch > 1:
            # High-to-low resolution feature list.
            return self.trident_conv([x] * self.num_branch)
        return [x]
313
+
314
+
315
def single_head_full_attention(q, k, v):
    """Dense scaled dot-product attention over full token sequences.

    q, k, v: [B, L, C].  Returns [B, L, C].
    """
    assert q.dim() == k.dim() == v.dim() == 3

    scale = q.size(2) ** 0.5
    attn = torch.softmax(torch.matmul(q, k.permute(0, 2, 1)) / scale, dim=2)  # [B, L, L]
    return torch.matmul(attn, v)  # [B, L, C]
324
+
325
+
326
def generate_shift_window_attn_mask(
    input_resolution,
    window_size_h,
    window_size_w,
    shift_size_h,
    shift_size_w,
    device=get_torch_device(),
):
    """Build the additive attention mask for shifted-window (SW-MSA) attention.

    Ref: https://github.com/microsoft/Swin-Transformer/blob/main/models/swin_transformer.py
    Returns [num_windows, win_h*win_w, win_h*win_w]: 0 within a region,
    -100 across region boundaries (so softmax zeroes those pairs).
    """
    h, w = input_resolution
    img_mask = torch.zeros((1, h, w, 1)).to(device)  # 1 H W 1

    # Label each of the 3x3 shifted regions with a distinct integer id.
    h_slices = (
        slice(0, -window_size_h),
        slice(-window_size_h, -shift_size_h),
        slice(-shift_size_h, None),
    )
    w_slices = (
        slice(0, -window_size_w),
        slice(-window_size_w, -shift_size_w),
        slice(-shift_size_w, None),
    )
    region_id = 0
    for hs in h_slices:
        for ws in w_slices:
            img_mask[:, hs, ws, :] = region_id
            region_id += 1

    mask_windows = split_feature(
        img_mask, num_splits=input_resolution[-1] // window_size_w, channel_last=True
    )
    mask_windows = mask_windows.view(-1, window_size_h * window_size_w)

    # Token pairs from different regions get a large negative bias.
    attn_mask = mask_windows.unsqueeze(1) - mask_windows.unsqueeze(2)
    attn_mask = attn_mask.masked_fill(attn_mask != 0, float(-100.0)).masked_fill(
        attn_mask == 0, float(0.0)
    )
    return attn_mask
365
+
366
+
367
def single_head_split_window_attention(
    q,
    k,
    v,
    num_splits=1,
    with_shift=False,
    h=None,
    w=None,
    attn_mask=None,
):
    """Swin-style windowed attention for a single head.

    q, k, v: [B, L, C] with L == h * w.  The map is split into
    num_splits x num_splits windows and attention is computed inside each,
    optionally after a half-window cyclic shift (requires `attn_mask`).
    Ref: https://github.com/microsoft/Swin-Transformer/blob/main/models/swin_transformer.py
    """
    assert q.dim() == k.dim() == v.dim() == 3
    assert h is not None and w is not None
    assert q.size(1) == h * w

    b, _, c = q.size()
    b_windows = b * num_splits * num_splits

    window_size_h = h // num_splits
    window_size_w = w // num_splits

    q = q.view(b, h, w, c)  # [B, H, W, C]
    k = k.view(b, h, w, c)
    v = v.view(b, h, w, c)

    scale_factor = c**0.5

    if with_shift:
        assert attn_mask is not None  # computed once by the caller
        shift_size_h = window_size_h // 2
        shift_size_w = window_size_w // 2

        q = torch.roll(q, shifts=(-shift_size_h, -shift_size_w), dims=(1, 2))
        k = torch.roll(k, shifts=(-shift_size_h, -shift_size_w), dims=(1, 2))
        v = torch.roll(v, shifts=(-shift_size_h, -shift_size_w), dims=(1, 2))

    # Partition into windows: [B*K*K, H/K, W/K, C].
    q = split_feature(q, num_splits=num_splits, channel_last=True)
    k = split_feature(k, num_splits=num_splits, channel_last=True)
    v = split_feature(v, num_splits=num_splits, channel_last=True)

    scores = (
        torch.matmul(
            q.view(b_windows, -1, c), k.view(b_windows, -1, c).permute(0, 2, 1)
        )
        / scale_factor
    )  # [B*K*K, H/K*W/K, H/K*W/K]

    if with_shift:
        scores += attn_mask.repeat(b, 1, 1)

    attn = torch.softmax(scores, dim=-1)
    out = torch.matmul(attn, v.view(b_windows, -1, c))  # [B*K*K, H/K*W/K, C]

    # Stitch windows back together: [B, H, W, C].
    out = merge_splits(
        out.view(b_windows, h // num_splits, w // num_splits, c),
        num_splits=num_splits,
        channel_last=True,
    )

    if with_shift:
        # Undo the cyclic shift.
        out = torch.roll(out, shifts=(shift_size_h, shift_size_w), dims=(1, 2))

    return out.view(b, -1, c)
437
+
438
+
439
class TransformerLayer(nn.Module):
    """Single-head attention layer with an optional FFN (GMFlow transformer)."""

    def __init__(
        self,
        d_model=256,
        nhead=1,
        attention_type="swin",
        no_ffn=False,
        ffn_dim_expansion=4,
        with_shift=False,
        **kwargs,
    ):
        super(TransformerLayer, self).__init__()

        self.dim = d_model
        self.nhead = nhead
        self.attention_type = attention_type
        self.no_ffn = no_ffn
        self.with_shift = with_shift

        # Q/K/V projections and the output merge, all bias-free.
        self.q_proj = nn.Linear(d_model, d_model, bias=False)
        self.k_proj = nn.Linear(d_model, d_model, bias=False)
        self.v_proj = nn.Linear(d_model, d_model, bias=False)
        self.merge = nn.Linear(d_model, d_model, bias=False)

        self.norm1 = nn.LayerNorm(d_model)

        # Self-attention layers skip the FFN; cross-attention layers keep it.
        if not self.no_ffn:
            in_channels = d_model * 2
            self.mlp = nn.Sequential(
                nn.Linear(in_channels, in_channels * ffn_dim_expansion, bias=False),
                nn.GELU(),
                nn.Linear(in_channels * ffn_dim_expansion, d_model, bias=False),
            )
            self.norm2 = nn.LayerNorm(d_model)

    def forward(
        self,
        source,
        target,
        height=None,
        width=None,
        shifted_window_attn_mask=None,
        attn_num_splits=None,
        **kwargs,
    ):
        """source, target: [B, L, C]; query comes from source, key/value from target."""
        query = self.q_proj(source)  # [B, L, C]
        key = self.k_proj(target)  # [B, L, C]
        value = self.v_proj(target)  # [B, L, C]

        if self.attention_type == "swin" and attn_num_splits > 1:
            if self.nhead > 1:
                # Multi-head was measured (upstream) to be slower and heavier
                # without accuracy gains, so it is intentionally unsupported.
                raise NotImplementedError
            message = single_head_split_window_attention(
                query,
                key,
                value,
                num_splits=attn_num_splits,
                with_shift=self.with_shift,
                h=height,
                w=width,
                attn_mask=shifted_window_attn_mask,
            )
        else:
            message = single_head_full_attention(query, key, value)  # [B, L, C]

        message = self.norm1(self.merge(message))

        if not self.no_ffn:
            message = self.norm2(self.mlp(torch.cat([source, message], dim=-1)))

        # Residual connection.
        return source + message
524
+
525
+
526
class TransformerBlock(nn.Module):
    """Self-attention followed by cross-attention + FFN."""

    def __init__(
        self,
        d_model=256,
        nhead=1,
        attention_type="swin",
        ffn_dim_expansion=4,
        with_shift=False,
        **kwargs,
    ):
        super(TransformerBlock, self).__init__()

        # Self-attention has no FFN; the cross-attention layer carries it.
        self.self_attn = TransformerLayer(
            d_model=d_model,
            nhead=nhead,
            attention_type=attention_type,
            no_ffn=True,
            ffn_dim_expansion=ffn_dim_expansion,
            with_shift=with_shift,
        )
        self.cross_attn_ffn = TransformerLayer(
            d_model=d_model,
            nhead=nhead,
            attention_type=attention_type,
            ffn_dim_expansion=ffn_dim_expansion,
            with_shift=with_shift,
        )

    def forward(
        self,
        source,
        target,
        height=None,
        width=None,
        shifted_window_attn_mask=None,
        attn_num_splits=None,
        **kwargs,
    ):
        """source, target: [B, L, C]; returns the updated source."""
        attn_kwargs = dict(
            height=height,
            width=width,
            shifted_window_attn_mask=shifted_window_attn_mask,
            attn_num_splits=attn_num_splits,
        )

        source = self.self_attn(source, source, **attn_kwargs)
        return self.cross_attn_ffn(source, target, **attn_kwargs)
590
+
591
+
592
class FeatureTransformer(nn.Module):
    """Stack of transformer blocks that jointly refines two feature maps."""

    def __init__(
        self,
        num_layers=6,
        d_model=128,
        nhead=1,
        attention_type="swin",
        ffn_dim_expansion=4,
        **kwargs,
    ):
        super(FeatureTransformer, self).__init__()

        self.attention_type = attention_type
        self.d_model = d_model
        self.nhead = nhead

        # For swin attention, odd layers use the shifted-window variant.
        self.layers = nn.ModuleList(
            [
                TransformerBlock(
                    d_model=d_model,
                    nhead=nhead,
                    attention_type=attention_type,
                    ffn_dim_expansion=ffn_dim_expansion,
                    with_shift=attention_type == "swin" and i % 2 == 1,
                )
                for i in range(num_layers)
            ]
        )

        for p in self.parameters():
            if p.dim() > 1:
                nn.init.xavier_uniform_(p)

    def forward(
        self,
        feature0,
        feature1,
        attn_num_splits=None,
        **kwargs,
    ):
        b, c, h, w = feature0.shape
        assert self.d_model == c

        # Flatten to token sequences: [B, H*W, C].
        feature0 = feature0.flatten(-2).permute(0, 2, 1)
        feature1 = feature1.flatten(-2).permute(0, 2, 1)

        if self.attention_type == "swin" and attn_num_splits > 1:
            # Global and refinement stages use different split counts, so
            # the shifted-window mask is computed once per forward pass.
            window_size_h = h // attn_num_splits
            window_size_w = w // attn_num_splits
            shifted_window_attn_mask = generate_shift_window_attn_mask(
                input_resolution=(h, w),
                window_size_h=window_size_h,
                window_size_w=window_size_w,
                shift_size_h=window_size_h // 2,
                shift_size_w=window_size_w // 2,
                device=feature0.device,
            )  # [K*K, H/K*W/K, H/K*W/K]
        else:
            shifted_window_attn_mask = None

        # Concatenate both directions along the batch so each layer updates
        # the two feature maps in one pass (concat0 attends to concat1).
        concat0 = torch.cat((feature0, feature1), dim=0)  # [2B, H*W, C]
        concat1 = torch.cat((feature1, feature0), dim=0)  # [2B, H*W, C]

        for layer in self.layers:
            concat0 = layer(
                concat0,
                concat1,
                height=h,
                width=w,
                shifted_window_attn_mask=shifted_window_attn_mask,
                attn_num_splits=attn_num_splits,
            )
            # Swap halves so the cross-attention target stays up to date.
            concat1 = torch.cat(concat0.chunk(chunks=2, dim=0)[::-1], dim=0)

        feature0, feature1 = concat0.chunk(chunks=2, dim=0)  # [B, H*W, C]

        # Restore [B, C, H, W].
        feature0 = feature0.view(b, h, w, c).permute(0, 3, 1, 2).contiguous()
        feature1 = feature1.view(b, h, w, c).permute(0, 3, 1, 2).contiguous()

        return feature0, feature1
686
+
687
+
688
class FeatureFlowAttention(nn.Module):
    """Flow propagation with self-attention on features.

    query: feature0, key: feature0, value: flow.
    """

    def __init__(
        self,
        in_channels,
        **kwargs,
    ):
        super(FeatureFlowAttention, self).__init__()

        self.q_proj = nn.Linear(in_channels, in_channels)
        self.k_proj = nn.Linear(in_channels, in_channels)

        for p in self.parameters():
            if p.dim() > 1:
                nn.init.xavier_uniform_(p)

    def forward(
        self,
        feature0,
        flow,
        local_window_attn=False,
        local_window_radius=1,
        **kwargs,
    ):
        """q, k: feature [B, C, H, W]; v: flow [B, 2, H, W] -> [B, 2, H, W]."""
        if local_window_attn:
            return self.forward_local_window_attn(
                feature0, flow, local_window_radius=local_window_radius
            )

        b, c, h, w = feature0.size()

        query = feature0.view(b, c, h * w).permute(0, 2, 1)  # [B, H*W, C]

        # Upstream quirk (kept for checkpoint compatibility): the key is
        # projected from the already-projected query rather than the raw
        # feature.  Both are linear maps, so the composition is equivalent
        # to a single learned projection.
        query = self.q_proj(query)  # [B, H*W, C]
        key = self.k_proj(query)  # [B, H*W, C]

        value = flow.view(b, flow.size(1), h * w).permute(0, 2, 1)  # [B, H*W, 2]

        scores = torch.matmul(query, key.permute(0, 2, 1)) / (c**0.5)  # [B, H*W, H*W]
        prob = torch.softmax(scores, dim=-1)

        out = torch.matmul(prob, value)  # [B, H*W, 2]
        return out.view(b, h, w, value.size(-1)).permute(0, 3, 1, 2)  # [B, 2, H, W]

    def forward_local_window_attn(
        self,
        feature0,
        flow,
        local_window_radius=1,
    ):
        """Same attention restricted to a (2R+1)^2 neighborhood per pixel."""
        assert flow.size(1) == 2
        assert local_window_radius > 0

        b, c, h, w = feature0.size()
        kernel_size = 2 * local_window_radius + 1

        # One query per pixel: [B*H*W, 1, C].
        feature0_reshape = self.q_proj(
            feature0.view(b, c, -1).permute(0, 2, 1)
        ).reshape(b * h * w, 1, c)

        feature0_proj = (
            self.k_proj(feature0.view(b, c, -1).permute(0, 2, 1))
            .permute(0, 2, 1)
            .reshape(b, c, h, w)
        )

        # Per-pixel key neighborhoods: [B*H*W, C, (2R+1)^2].
        feature0_window = F.unfold(
            feature0_proj, kernel_size=kernel_size, padding=local_window_radius
        )
        feature0_window = (
            feature0_window.view(b, c, kernel_size**2, h, w)
            .permute(0, 3, 4, 1, 2)
            .reshape(b * h * w, c, kernel_size**2)
        )

        # Matching flow neighborhoods: [B*H*W, (2R+1)^2, 2].
        flow_window = F.unfold(
            flow, kernel_size=kernel_size, padding=local_window_radius
        )
        flow_window = (
            flow_window.view(b, 2, kernel_size**2, h, w)
            .permute(0, 3, 4, 2, 1)
            .reshape(b * h * w, kernel_size**2, 2)
        )

        scores = torch.matmul(feature0_reshape, feature0_window) / (
            c**0.5
        )  # [B*H*W, 1, (2R+1)^2]
        prob = torch.softmax(scores, dim=-1)

        out = (
            torch.matmul(prob, flow_window)
            .view(b, h, w, 2)
            .permute(0, 3, 1, 2)
            .contiguous()
        )  # [B, 2, H, W]

        return out
804
+
805
+
806
def global_correlation_softmax(
    feature0,
    feature1,
    pred_bidir_flow=False,
):
    """Estimate flow as the soft-argmax over the global correlation volume.

    feature0/feature1: [B, C, H, W].  Returns (flow, prob): flow is
    [B, 2, H, W] (or [2B, 2, H, W] — forward then backward — when
    pred_bidir_flow) and prob is the matching distribution [B, H*W, H*W].
    """
    b, c, h, w = feature0.shape
    feature0 = feature0.view(b, c, -1).permute(0, 2, 1)  # [B, H*W, C]
    feature1 = feature1.view(b, c, -1)  # [B, C, H*W]

    # All-pairs similarity, scaled like dot-product attention.
    correlation = torch.matmul(feature0, feature1).view(b, h, w, h, w) / (c**0.5)

    init_grid = coords_grid(b, h, w).to(correlation.device)  # [B, 2, H, W]
    grid = init_grid.view(b, 2, -1).permute(0, 2, 1)  # [B, H*W, 2]

    correlation = correlation.view(b, h * w, h * w)  # [B, H*W, H*W]

    if pred_bidir_flow:
        # The transposed volume scores the reverse direction; stacking both
        # lets one softmax/matmul produce forward and backward flow at once.
        correlation = torch.cat(
            (correlation, correlation.permute(0, 2, 1)), dim=0
        )  # [2*B, H*W, H*W]
        init_grid = init_grid.repeat(2, 1, 1, 1)  # [2*B, 2, H, W]
        grid = grid.repeat(2, 1, 1)  # [2*B, H*W, 2]
        b = b * 2

    prob = F.softmax(correlation, dim=-1)  # [B, H*W, H*W]

    # Expected target coordinate under the matching distribution.
    correspondence = (
        torch.matmul(prob, grid).view(b, h, w, 2).permute(0, 3, 1, 2)
    )  # [B, 2, H, W]

    flow = correspondence - init_grid
    return flow, prob
844
+
845
+
846
def local_correlation_softmax(
    feature0,
    feature1,
    local_radius,
    padding_mode="zeros",
):
    """Soft-argmax flow restricted to a (2R+1)x(2R+1) local search window."""
    b, c, h, w = feature0.size()
    coords_init = coords_grid(b, h, w).to(feature0.device)  # [B, 2, H, W]
    coords = coords_init.view(b, 2, -1).permute(0, 2, 1)  # [B, H*W, 2]

    local_h = 2 * local_radius + 1
    local_w = 2 * local_radius + 1

    window_grid = generate_window_grid(
        -local_radius,
        local_radius,
        -local_radius,
        local_radius,
        local_h,
        local_w,
        device=feature0.device,
    )  # [2R+1, 2R+1, 2]
    window_grid = window_grid.reshape(-1, 2).repeat(b, 1, 1, 1)  # [B, 1, (2R+1)^2, 2]
    sample_coords = coords.unsqueeze(-2) + window_grid  # [B, H*W, (2R+1)^2, 2]

    sample_coords_softmax = sample_coords

    # Candidates falling outside the image are masked out of the softmax.
    valid_x = (sample_coords[:, :, :, 0] >= 0) & (sample_coords[:, :, :, 0] < w)
    valid_y = (sample_coords[:, :, :, 1] >= 0) & (sample_coords[:, :, :, 1] < h)
    valid = valid_x & valid_y  # [B, H*W, (2R+1)^2]

    # Sample feature1 at the candidate positions (normalized to [-1, 1]).
    sample_coords_norm = normalize_coords(sample_coords, h, w)
    window_feature = F.grid_sample(
        feature1, sample_coords_norm, padding_mode=padding_mode, align_corners=True
    ).permute(0, 2, 1, 3)  # [B, H*W, C, (2R+1)^2]
    feature0_view = feature0.permute(0, 2, 3, 1).view(b, h * w, 1, c)  # [B, H*W, 1, C]

    corr = torch.matmul(feature0_view, window_feature).view(b, h * w, -1) / (
        c**0.5
    )  # [B, H*W, (2R+1)^2]

    # Push invalid candidates to effectively zero probability.
    corr[~valid] = -1e9

    prob = F.softmax(corr, -1)  # [B, H*W, (2R+1)^2]

    # Expected candidate coordinate -> correspondence -> flow.
    correspondence = (
        torch.matmul(prob.unsqueeze(-2), sample_coords_softmax)
        .squeeze(-2)
        .view(b, h, w, 2)
        .permute(0, 3, 1, 2)
    )  # [B, 2, H, W]

    flow = correspondence - coords_init
    match_prob = prob

    return flow, match_prob
914
+
915
+
916
def coords_grid(b, h, w, homogeneous=False, device=None):
    """Return a [B, 2, H, W] pixel-coordinate grid (x then y channels).

    With homogeneous=True a constant-one third channel is appended,
    giving [B, 3, H, W].
    """
    y, x = torch.meshgrid(torch.arange(h), torch.arange(w))  # each [H, W]

    channels = [x, y]
    if homogeneous:
        channels.append(torch.ones_like(x))  # homogeneous coordinate

    grid = torch.stack(channels, dim=0).float()  # [2 or 3, H, W]
    grid = grid[None].repeat(b, 1, 1, 1)  # [B, 2 or 3, H, W]

    if device is not None:
        grid = grid.to(device)

    return grid
933
+
934
+
935
def generate_window_grid(h_min, h_max, w_min, w_max, len_h, len_w, device=None):
    """Create a [len_h, len_w, 2] grid of (x, y) coordinates spanning the
    given window: x ranges over [w_min, w_max], y over [h_min, h_max]."""
    assert device is not None

    xs = torch.linspace(w_min, w_max, len_w, device=device)
    ys = torch.linspace(h_min, h_max, len_h, device=device)
    grid_x, grid_y = torch.meshgrid([xs, ys])  # each [len_w, len_h]

    # Pair up (x, y) then swap to row-major [H, W, 2] layout.
    window = torch.stack((grid_x, grid_y), dim=-1)
    return window.transpose(0, 1).float()
947
+
948
+
949
def normalize_coords(coords, h, w):
    """Map absolute pixel coordinates (last dim = (x, y)) into [-1, 1],
    the convention expected by ``F.grid_sample(align_corners=True)``."""
    cx = (w - 1) / 2.0
    cy = (h - 1) / 2.0
    # Half-extents double as both the shift and the scale.
    center = torch.Tensor([cx, cy]).float().to(coords.device)
    return (coords - center) / center
953
+
954
+
955
def bilinear_sample(
    img, sample_coords, mode="bilinear", padding_mode="zeros", return_mask=False
):
    """Sample ``img`` [B, C, H, W] at absolute pixel ``sample_coords``.

    ``sample_coords`` may be [B, 2, H, W] or [B, H, W, 2]; coordinates are
    in image scale and normalized internally for ``F.grid_sample``.  When
    ``return_mask`` is True, also returns a [B, H, W] bool mask that is
    True wherever the sample location fell inside the image.
    """
    if sample_coords.size(1) != 2:
        # Accept channel-last [B, H, W, 2] layout as well.
        sample_coords = sample_coords.permute(0, 3, 1, 2)

    b, _, h, w = sample_coords.shape

    # Rescale pixel coordinates into [-1, 1] (align_corners convention).
    x_norm = 2 * sample_coords[:, 0] / (w - 1) - 1
    y_norm = 2 * sample_coords[:, 1] / (h - 1) - 1
    grid = torch.stack([x_norm, y_norm], dim=-1)  # [B, H, W, 2]

    sampled = F.grid_sample(
        img, grid, mode=mode, padding_mode=padding_mode, align_corners=True
    )

    if not return_mask:
        return sampled

    # Valid wherever the normalized coordinate stayed inside the image.
    in_bounds = (x_norm >= -1) & (x_norm <= 1) & (y_norm >= -1) & (y_norm <= 1)
    return sampled, in_bounds
983
+
984
+
985
def flow_warp(feature, flow, mask=False, padding_mode="zeros"):
    """Backward-warp ``feature`` [B, C, H, W] along ``flow`` [B, 2, H, W].

    Passing ``mask=True`` additionally returns the in-bounds sample mask
    from ``bilinear_sample``.
    """
    assert flow.size(1) == 2
    b, _, h, w = feature.size()

    # Absolute sampling positions = base pixel grid shifted by the flow.
    sample_pos = coords_grid(b, h, w).to(flow.device) + flow  # [B, 2, H, W]

    return bilinear_sample(
        feature, sample_pos, padding_mode=padding_mode, return_mask=mask
    )
992
+
993
+
994
def forward_backward_consistency_check(fwd_flow, bwd_flow, alpha=0.01, beta=0.5):
    """Estimate occlusion masks from a forward/backward flow pair.

    alpha and beta follow UnFlow (https://arxiv.org/abs/1711.07837).
    Returns ``(fwd_occ, bwd_occ)``: float masks of shape [B, H, W] where
    1.0 marks pixels whose flows are mutually inconsistent (occluded).
    """
    assert fwd_flow.dim() == 4 and bwd_flow.dim() == 4
    assert fwd_flow.size(1) == 2 and bwd_flow.size(1) == 2

    flow_mag = torch.norm(fwd_flow, dim=1) + torch.norm(bwd_flow, dim=1)  # [B, H, W]

    # Bring each flow into the other frame so the pair can be compared.
    warped_bwd_flow = flow_warp(bwd_flow, fwd_flow)  # [B, 2, H, W]
    warped_fwd_flow = flow_warp(fwd_flow, bwd_flow)  # [B, 2, H, W]

    # A consistent pair cancels out; large residuals indicate occlusion.
    diff_fwd = torch.norm(fwd_flow + warped_bwd_flow, dim=1)  # [B, H, W]
    diff_bwd = torch.norm(bwd_flow + warped_fwd_flow, dim=1)

    # Magnitude-dependent tolerance.
    threshold = alpha * flow_mag + beta

    fwd_occ = (diff_fwd > threshold).float()
    bwd_occ = (diff_bwd > threshold).float()

    return fwd_occ, bwd_occ
1013
+
1014
+
1015
class PositionEmbeddingSine(nn.Module):
    """
    This is a more standard version of the position embedding, very similar to the one
    used by the Attention is all you need paper, generalized to work on images.

    Produces a [B, 2*num_pos_feats, H, W] sinusoidal encoding: the first
    half of the channels encodes the y position, the second half the x
    position, each as interleaved sin/cos at geometrically spaced
    frequencies.
    """

    def __init__(self, num_pos_feats=64, temperature=10000, normalize=True, scale=None):
        # num_pos_feats: channels per spatial axis (output has twice as many).
        # temperature: base of the frequency progression.
        # normalize: rescale positions to [0, scale] before encoding.
        super().__init__()
        self.num_pos_feats = num_pos_feats
        self.temperature = temperature
        self.normalize = normalize
        if scale is not None and normalize is False:
            raise ValueError("normalize should be True if scale is passed")
        if scale is None:
            scale = 2 * math.pi
        self.scale = scale

    def forward(self, x):
        # x = tensor_list.tensors # [B, C, H, W]
        # mask = tensor_list.mask # [B, H, W], input with padding, valid as 0
        b, c, h, w = x.size()
        # No padding information here, so every position counts as valid.
        mask = torch.ones((b, h, w), device=x.device)  # [B, H, W]
        # Cumulative sums turn the all-ones mask into 1-based row/col indices.
        y_embed = mask.cumsum(1, dtype=torch.float32)
        x_embed = mask.cumsum(2, dtype=torch.float32)
        if self.normalize:
            eps = 1e-6
            # Divide by the last (largest) index so positions span [0, scale].
            y_embed = y_embed / (y_embed[:, -1:, :] + eps) * self.scale
            x_embed = x_embed / (x_embed[:, :, -1:] + eps) * self.scale

        # Per-channel frequency divisors: temperature^(2*floor(i/2)/d).
        dim_t = torch.arange(self.num_pos_feats, dtype=torch.float32, device=x.device)
        dim_t = self.temperature ** (2 * (dim_t // 2) / self.num_pos_feats)

        pos_x = x_embed[:, :, :, None] / dim_t
        pos_y = y_embed[:, :, :, None] / dim_t
        # Interleave sin on even channels with cos on odd channels.
        pos_x = torch.stack(
            (pos_x[:, :, :, 0::2].sin(), pos_x[:, :, :, 1::2].cos()), dim=4
        ).flatten(3)
        pos_y = torch.stack(
            (pos_y[:, :, :, 0::2].sin(), pos_y[:, :, :, 1::2].cos()), dim=4
        ).flatten(3)
        # y channels first, then x; back to channel-first layout.
        pos = torch.cat((pos_y, pos_x), dim=3).permute(0, 3, 1, 2)
        return pos
1057
+
1058
+
1059
def split_feature(
    feature,
    num_splits=2,
    channel_last=False,
):
    """Partition a feature map into a K x K grid of non-overlapping windows.

    Each window becomes its own batch entry: [B, ..., H, W] maps to
    [B*K*K, ..., H/K, W/K] with K = ``num_splits``.  Layout may be
    channel-last [B, H, W, C] or channel-first [B, C, H, W].
    """
    k = num_splits
    if channel_last:  # [B, H, W, C]
        b, h, w, c = feature.size()
        assert h % k == 0 and w % k == 0

        # Expose the window axes, bring them next to batch, then flatten.
        windows = feature.view(b, k, h // k, k, w // k, c)
        feature = windows.permute(0, 1, 3, 2, 4, 5).reshape(
            b * k * k, h // k, w // k, c
        )  # [B*K*K, H/K, W/K, C]
    else:  # [B, C, H, W]
        b, c, h, w = feature.size()
        assert h % k == 0 and w % k == 0

        windows = feature.view(b, c, k, h // k, k, w // k)
        feature = windows.permute(0, 2, 4, 1, 3, 5).reshape(
            b * k * k, c, h // k, w // k
        )  # [B*K*K, C, H/K, W/K]

    return feature
1092
+
1093
+
1094
def merge_splits(
    splits,
    num_splits=2,
    channel_last=False,
):
    """Inverse of ``split_feature``: stitch a K x K grid of windows back
    into full-resolution maps, [B*K*K, ...] -> [B, ..., K*H, K*W]."""
    k = num_splits
    if channel_last:  # [B*K*K, H/K, W/K, C]
        bkk, h, w, c = splits.size()
        b = bkk // k // k

        tiles = splits.view(b, k, k, h, w, c)
        # Interleave the window axes back between the spatial axes.
        merge = (
            tiles.permute(0, 1, 3, 2, 4, 5)
            .contiguous()
            .view(b, k * h, k * w, c)
        )  # [B, H, W, C]
    else:  # [B*K*K, C, H/K, W/K]
        bkk, c, h, w = splits.size()
        b = bkk // k // k

        tiles = splits.view(b, k, k, c, h, w)
        merge = (
            tiles.permute(0, 3, 1, 4, 2, 5)
            .contiguous()
            .view(b, c, k * h, k * w)
        )  # [B, C, H, W]

    return merge
1121
+
1122
+
1123
def normalize_img(img0, img1):
    """Standardize an image pair with the ImageNet per-channel mean/std.

    NOTE(review): the upstream comment claimed inputs are in [0, 255], but
    the constants are the standard [0, 1]-range ImageNet statistics —
    confirm against callers.
    """
    mean = torch.tensor([0.485, 0.456, 0.406]).view(1, 3, 1, 1).to(img1.device)
    std = torch.tensor([0.229, 0.224, 0.225]).view(1, 3, 1, 1).to(img1.device)
    return (img0 - mean) / std, (img1 - mean) / std
1132
+
1133
+
1134
def feature_add_position(feature0, feature1, attn_splits, feature_channels):
    """Add sinusoidal position encoding to both feature maps.

    When attention runs on split windows (attn_splits > 1) the encoding is
    computed per window, so positions are local to each window; otherwise
    a single global encoding is added.  The same encoding is shared by
    both features (they have identical shapes).
    """
    # Half the channels per axis so the concatenated (y, x) encoding
    # matches feature_channels exactly.
    pos_enc = PositionEmbeddingSine(num_pos_feats=feature_channels // 2)

    if attn_splits > 1:  # add position in splited window
        feature0_splits = split_feature(feature0, num_splits=attn_splits)
        feature1_splits = split_feature(feature1, num_splits=attn_splits)

        position = pos_enc(feature0_splits)

        feature0_splits = feature0_splits + position
        feature1_splits = feature1_splits + position

        feature0 = merge_splits(feature0_splits, num_splits=attn_splits)
        feature1 = merge_splits(feature1_splits, num_splits=attn_splits)
    else:
        position = pos_enc(feature0)

        feature0 = feature0 + position
        feature1 = feature1 + position

    return feature0, feature1
1155
+
1156
+
1157
class GMFlow(nn.Module):
    """GMFlow optical-flow estimator (global matching with transformers).

    Coarse-to-fine pipeline over ``num_scales`` pyramid levels: CNN
    features are enhanced by a transformer, matched via correlation +
    softmax, refined by flow self-attention, and convex-upsampled back to
    input resolution.  Depends on ``CNNEncoder``, ``FeatureTransformer``
    and ``FeatureFlowAttention`` defined elsewhere in this file.
    """

    def __init__(
        self,
        num_scales=2,
        upsample_factor=4,
        feature_channels=128,
        attention_type="swin",
        num_transformer_layers=6,
        ffn_dim_expansion=4,
        num_head=1,
        **kwargs,
    ):
        super(GMFlow, self).__init__()

        self.num_scales = num_scales
        self.feature_channels = feature_channels
        self.upsample_factor = upsample_factor
        self.attention_type = attention_type
        self.num_transformer_layers = num_transformer_layers

        # CNN backbone
        self.backbone = CNNEncoder(
            output_dim=feature_channels, num_output_scales=num_scales
        )

        # Transformer
        self.transformer = FeatureTransformer(
            num_layers=num_transformer_layers,
            d_model=feature_channels,
            nhead=num_head,
            attention_type=attention_type,
            ffn_dim_expansion=ffn_dim_expansion,
        )

        # flow propagation with self-attn
        self.feature_flow_attn = FeatureFlowAttention(in_channels=feature_channels)

        # convex upsampling: concat feature0 and flow as input
        self.upsampler = nn.Sequential(
            nn.Conv2d(2 + feature_channels, 256, 3, 1, 1),
            nn.ReLU(inplace=True),
            nn.Conv2d(256, upsample_factor**2 * 9, 1, 1, 0),
        )

    def extract_feature(self, img0, img1):
        """Run the backbone once on the batched pair and return two
        per-image pyramids ordered from low to high resolution."""
        concat = torch.cat((img0, img1), dim=0)  # [2B, C, H, W]
        features = self.backbone(
            concat
        )  # list of [2B, C, H, W], resolution from high to low

        # reverse: resolution from low to high
        features = features[::-1]

        feature0, feature1 = [], []

        for i in range(len(features)):
            feature = features[i]
            chunks = torch.chunk(feature, 2, 0)  # tuple
            feature0.append(chunks[0])
            feature1.append(chunks[1])

        return feature0, feature1

    def upsample_flow(
        self,
        flow,
        feature,
        bilinear=False,
        upsample_factor=8,
    ):
        """Upsample a flow field, either bilinearly (training supervision)
        or with RAFT-style learned convex combination over 3x3 neighbors.

        Flow values are scaled by the upsampling factor so they remain in
        pixel units at the new resolution.
        """
        if bilinear:
            up_flow = (
                F.interpolate(
                    flow,
                    scale_factor=upsample_factor,
                    mode="bilinear",
                    align_corners=True,
                )
                * upsample_factor
            )

        else:
            # convex upsampling
            concat = torch.cat((flow, feature), dim=1)

            mask = self.upsampler(concat)
            b, flow_channel, h, w = flow.shape
            mask = mask.view(
                b, 1, 9, self.upsample_factor, self.upsample_factor, h, w
            )  # [B, 1, 9, K, K, H, W]
            # Softmax over the 9 neighbors makes the combination convex.
            mask = torch.softmax(mask, dim=2)

            up_flow = F.unfold(self.upsample_factor * flow, [3, 3], padding=1)
            up_flow = up_flow.view(
                b, flow_channel, 9, 1, 1, h, w
            )  # [B, 2, 9, 1, 1, H, W]

            up_flow = torch.sum(mask * up_flow, dim=2)  # [B, 2, K, K, H, W]
            up_flow = up_flow.permute(0, 1, 4, 2, 5, 3)  # [B, 2, K, H, K, W]
            up_flow = up_flow.reshape(
                b, flow_channel, self.upsample_factor * h, self.upsample_factor * w
            )  # [B, 2, K*H, K*W]

        return up_flow

    def forward(
        self,
        img0,
        img1,
        attn_splits_list=[2, 8],
        corr_radius_list=[-1, 4],
        prop_radius_list=[-1, 1],
        pred_bidir_flow=False,
        **kwargs,
    ):
        # NOTE: the list defaults are treated as read-only; one entry per
        # pyramid scale (radius -1 selects global matching at that scale).
        img0, img1 = normalize_img(img0, img1)  # [B, 3, H, W]

        # resolution low to high
        feature0_list, feature1_list = self.extract_feature(
            img0, img1
        )  # list of features

        flow = None

        assert (
            len(attn_splits_list)
            == len(corr_radius_list)
            == len(prop_radius_list)
            == self.num_scales
        )

        for scale_idx in range(self.num_scales):
            feature0, feature1 = feature0_list[scale_idx], feature1_list[scale_idx]

            if pred_bidir_flow and scale_idx > 0:
                # predicting bidirectional flow with refinement:
                # double the batch with the (img1, img0) ordering so both
                # directions are refined in one pass.
                feature0, feature1 = torch.cat((feature0, feature1), dim=0), torch.cat(
                    (feature1, feature0), dim=0
                )

            # Factor needed to reach input resolution from this scale.
            upsample_factor = self.upsample_factor * (
                2 ** (self.num_scales - 1 - scale_idx)
            )

            if scale_idx > 0:
                # Carry the coarser flow up one level (x2 in size and value).
                flow = (
                    F.interpolate(
                        flow, scale_factor=2, mode="bilinear", align_corners=True
                    )
                    * 2
                )

            if flow is not None:
                flow = flow.detach()
                # Pre-warp feature1 so this level only predicts a residual.
                feature1 = flow_warp(feature1, flow)  # [B, C, H, W]

            attn_splits = attn_splits_list[scale_idx]
            corr_radius = corr_radius_list[scale_idx]
            prop_radius = prop_radius_list[scale_idx]

            # add position to features
            feature0, feature1 = feature_add_position(
                feature0, feature1, attn_splits, self.feature_channels
            )

            # Transformer
            feature0, feature1 = self.transformer(
                feature0, feature1, attn_num_splits=attn_splits
            )

            # correlation and softmax
            if corr_radius == -1:  # global matching
                flow_pred = global_correlation_softmax(
                    feature0, feature1, pred_bidir_flow
                )[0]
            else:  # local matching
                flow_pred = local_correlation_softmax(feature0, feature1, corr_radius)[
                    0
                ]

            # flow or residual flow
            flow = flow + flow_pred if flow is not None else flow_pred

            # upsample to the original resolution for supervison
            if (
                self.training
            ):  # only need to upsample intermediate flow predictions at training time
                flow_bilinear = self.upsample_flow(
                    flow, None, bilinear=True, upsample_factor=upsample_factor
                )

            # flow propagation with self-attn
            if pred_bidir_flow and scale_idx == 0:
                feature0 = torch.cat(
                    (feature0, feature1), dim=0
                )  # [2*B, C, H, W] for propagation
            flow = self.feature_flow_attn(
                feature0,
                flow.detach(),
                local_window_attn=prop_radius > 0,
                local_window_radius=prop_radius,
            )

            # bilinear upsampling at training time except the last one
            if self.training and scale_idx < self.num_scales - 1:
                flow_up = self.upsample_flow(
                    flow, feature0, bilinear=True, upsample_factor=upsample_factor
                )

            if scale_idx == self.num_scales - 1:
                flow_up = self.upsample_flow(flow, feature0)

        # Only the final full-resolution flow is returned (intermediate
        # training-time upsamples are discarded here).
        return flow_up
1370
+
1371
+
1372
# Cache of normalized base sampling grids, keyed by str(flow.shape), so a
# grid is built only once per tensor shape.
backwarp_tenGrid = {}


def backwarp(tenIn, tenflow):
    """Backward-warp ``tenIn`` [B, C, H, W] by pixel-space flow ``tenflow``
    [B, 2, H, W] using bilinear grid sampling with zero padding."""
    if str(tenflow.shape) not in backwarp_tenGrid:
        # Base grid in grid_sample's normalized [-1, 1] coordinates.
        tenHor = (
            torch.linspace(
                start=-1.0,
                end=1.0,
                steps=tenflow.shape[3],
                dtype=tenflow.dtype,
                device=tenflow.device,
            )
            .view(1, 1, 1, -1)
            .repeat(1, 1, tenflow.shape[2], 1)
        )
        tenVer = (
            torch.linspace(
                start=-1.0,
                end=1.0,
                steps=tenflow.shape[2],
                dtype=tenflow.dtype,
                device=tenflow.device,
            )
            .view(1, 1, -1, 1)
            .repeat(1, 1, 1, tenflow.shape[3])
        )

        # NOTE(review): the cached grid is moved to get_torch_device()
        # rather than tenflow.device — this assumes all inputs live on the
        # ComfyUI torch device; confirm before reusing elsewhere.
        backwarp_tenGrid[str(tenflow.shape)] = torch.cat([tenHor, tenVer], 1).to(get_torch_device())
    # end

    # Rescale the pixel-unit flow into the same normalized range.
    tenflow = torch.cat(
        [
            tenflow[:, 0:1, :, :] / ((tenIn.shape[3] - 1.0) / 2.0),
            tenflow[:, 1:2, :, :] / ((tenIn.shape[2] - 1.0) / 2.0),
        ],
        1,
    )

    return torch.nn.functional.grid_sample(
        input=tenIn,
        grid=(backwarp_tenGrid[str(tenflow.shape)] + tenflow).permute(0, 2, 3, 1),
        mode="bilinear",
        padding_mode="zeros",
        align_corners=True,
    )
1418
+
1419
+
1420
class MetricNet(nn.Module):
    """Predicts per-pixel splatting metrics (importance weights) for the
    two input frames from photometric error, flow and occlusion cues.

    Input to the trunk is a 14-channel stack: 6 (both RGB frames) +
    2 (negated warp errors) + 4 (normalized bidirectional flow) +
    2 (occlusion masks).  Outputs two single-channel metrics in [-10, 10].
    """

    def __init__(self):
        super(MetricNet, self).__init__()
        self.metric_in = nn.Conv2d(14, 64, 3, 1, 1)
        self.metric_net1 = nn.Sequential(nn.PReLU(), nn.Conv2d(64, 64, 3, 1, 1))
        self.metric_net2 = nn.Sequential(nn.PReLU(), nn.Conv2d(64, 64, 3, 1, 1))
        self.metric_net3 = nn.Sequential(nn.PReLU(), nn.Conv2d(64, 64, 3, 1, 1))
        self.metric_out = nn.Sequential(nn.PReLU(), nn.Conv2d(64, 2, 3, 1, 1))

    def forward(self, img0, img1, flow01, flow10):
        # Photometric consistency of each frame against the warped other
        # frame, averaged over channels ([B, 1, H, W] each).
        metric0 = F.l1_loss(img0, backwarp(img1, flow01), reduction="none").mean(
            [1], True
        )
        metric1 = F.l1_loss(img1, backwarp(img0, flow10), reduction="none").mean(
            [1], True
        )

        fwd_occ, bwd_occ = forward_backward_consistency_check(flow01, flow10)

        # Normalize flows to the [-1, 1] coordinate convention so the
        # network input is resolution independent.
        flow01 = torch.cat(
            [
                flow01[:, 0:1, :, :] / ((flow01.shape[3] - 1.0) / 2.0),
                flow01[:, 1:2, :, :] / ((flow01.shape[2] - 1.0) / 2.0),
            ],
            1,
        )
        flow10 = torch.cat(
            [
                flow10[:, 0:1, :, :] / ((flow10.shape[3] - 1.0) / 2.0),
                flow10[:, 1:2, :, :] / ((flow10.shape[2] - 1.0) / 2.0),
            ],
            1,
        )

        img = torch.cat((img0, img1), 1)
        # Negated so lower warp error means higher importance.
        metric = torch.cat((-metric0, -metric1), 1)
        flow = torch.cat((flow01, flow10), 1)
        occ = torch.cat((fwd_occ.unsqueeze(1), bwd_occ.unsqueeze(1)), 1)

        # Residual trunk over the fused 14-channel input.
        feat = self.metric_in(torch.cat((img, metric, flow, occ), 1))
        feat = self.metric_net1(feat) + feat
        feat = self.metric_net2(feat) + feat
        feat = self.metric_net3(feat) + feat
        metric = self.metric_out(feat)

        # Bound the metrics to a range softsplat handles numerically well.
        metric = torch.tanh(metric) * 10

        return metric[:, :1], metric[:, 1:2]
1468
+
1469
+
1470
class FeatureNet(nn.Module):
    """Three-stage pyramid feature extractor ("the quadratic model").

    Each stage halves the spatial resolution; returns features at 1/2,
    1/4 and 1/8 scale with 64, 128 and 192 channels respectively.
    """

    def __init__(self):
        super(FeatureNet, self).__init__()
        channel_plan = [(3, 64), (64, 128), (128, 192)]
        stages = []
        for c_in, c_out in channel_plan:
            stages.append(
                nn.Sequential(
                    nn.PReLU(),
                    nn.Conv2d(c_in, c_out, 3, 2, 1),
                    nn.PReLU(),
                    nn.Conv2d(c_out, c_out, 3, 1, 1),
                )
            )
        # Same attribute names as the checkpoint expects.
        self.block1, self.block2, self.block3 = stages

    def forward(self, x):
        feat_half = self.block1(x)
        feat_quarter = self.block2(feat_half)
        feat_eighth = self.block3(feat_quarter)

        return feat_half, feat_quarter, feat_eighth
1500
+
1501
+
1502
+ # Residual Block
1503
def ResidualBlock(in_channels, out_channels, stride=1):
    """Two PReLU + 3x3 Conv layers packaged as one Sequential.

    NOTE(review): despite the name there is no skip connection inside —
    callers add the residual themselves.  ``stride`` is applied to BOTH
    convolutions, matching the original definition.
    """
    layers = [
        nn.PReLU(),
        nn.Conv2d(
            in_channels,
            out_channels,
            kernel_size=3,
            stride=stride,
            padding=1,
            bias=True,
        ),
        nn.PReLU(),
        nn.Conv2d(
            out_channels,
            out_channels,
            kernel_size=3,
            stride=stride,
            padding=1,
            bias=True,
        ),
    ]
    return torch.nn.Sequential(*layers)
1524
+
1525
+
1526
+ # downsample block
1527
def DownsampleBlock(in_channels, out_channels, stride=2):
    """Strided PReLU+Conv pair: halves spatial resolution by default, then
    refines with a stride-1 3x3 convolution."""
    layers = [
        nn.PReLU(),
        nn.Conv2d(
            in_channels,
            out_channels,
            kernel_size=3,
            stride=stride,
            padding=1,
            bias=True,
        ),
        nn.PReLU(),
        nn.Conv2d(
            out_channels, out_channels, kernel_size=3, stride=1, padding=1, bias=True
        ),
    ]
    return torch.nn.Sequential(*layers)
1543
+
1544
+
1545
+ # upsample block
1546
def UpsampleBlock(in_channels, out_channels, stride=2):
    """PReLU + 4x4 transposed conv (x2 upsample by default) followed by a
    stride-1 3x3 refinement convolution."""
    layers = [
        nn.PReLU(),
        nn.ConvTranspose2d(
            in_channels,
            out_channels,
            kernel_size=4,
            stride=stride,
            padding=1,
            bias=True,
        ),
        nn.PReLU(),
        nn.Conv2d(
            out_channels, out_channels, kernel_size=3, stride=1, padding=1, bias=True
        ),
    ]
    return torch.nn.Sequential(*layers)
1562
+
1563
+
1564
class PixelShuffleBlcok(nn.Module):
    """2x upsampler: conv + PReLU, pixel-shuffle upsample, final conv.

    (The class-name typo is kept intentionally — checkpoints and callers
    reference this exact name.)
    """

    def __init__(self, in_feat, num_feat, num_out_ch):
        super(PixelShuffleBlcok, self).__init__()
        self.conv_before_upsample = nn.Sequential(
            nn.Conv2d(in_feat, num_feat, 3, 1, 1), nn.PReLU()
        )
        # 4x channels then PixelShuffle(2) trades channels for resolution.
        self.upsample = nn.Sequential(
            nn.Conv2d(num_feat, 4 * num_feat, 3, 1, 1), nn.PixelShuffle(2)
        )
        self.conv_last = nn.Conv2d(num_feat, num_out_ch, 3, 1, 1)

    def forward(self, x):
        pre = self.conv_before_upsample(x)
        return self.conv_last(self.upsample(pre))
1579
+
1580
+
1581
+ # grid network
1582
class GridNet(nn.Module):
    """GridNet-style fusion network over a 3-level feature grid.

    Takes the warped image stack ``x`` plus three pyramid feature maps
    (``x1``..``x3``, progressively lower resolution) and synthesizes the
    final RGB frame.  Information flows right through residual columns,
    down through downsample blocks and up through upsample blocks; the
    exact wiring below mirrors the original trace-derived comments.
    """

    def __init__(
        self,
        in_channels=9,
        in_channels1=128,
        in_channels2=256,
        in_channels3=384,
        out_channels=3,
    ):
        super(GridNet, self).__init__()

        # Entry heads: project each input stream onto its grid row.
        self.residual_model_head0 = ResidualBlock(in_channels, 64)
        self.residual_model_head1 = ResidualBlock(in_channels1, 64)
        self.residual_model_head2 = ResidualBlock(in_channels2, 128)
        self.residual_model_head3 = ResidualBlock(in_channels3, 192)

        # Row 0 (full resolution, 64 channels).
        self.residual_model_01 = ResidualBlock(64, 64)
        # self.residual_model_02=ResidualBlock(64, 64)
        # self.residual_model_03=ResidualBlock(64, 64)
        self.residual_model_04 = ResidualBlock(64, 64)
        self.residual_model_05 = ResidualBlock(64, 64)
        self.residual_model_tail = PixelShuffleBlcok(64, 64, out_channels)

        # Row 1 (half resolution, 128 channels).
        self.residual_model_11 = ResidualBlock(128, 128)
        # self.residual_model_12=ResidualBlock(128, 128)
        # self.residual_model_13=ResidualBlock(128, 128)
        self.residual_model_14 = ResidualBlock(128, 128)
        self.residual_model_15 = ResidualBlock(128, 128)

        # Row 2 (quarter resolution, 192 channels).
        self.residual_model_21 = ResidualBlock(192, 192)
        # self.residual_model_22=ResidualBlock(192, 192)
        # self.residual_model_23=ResidualBlock(192, 192)
        self.residual_model_24 = ResidualBlock(192, 192)
        self.residual_model_25 = ResidualBlock(192, 192)

        #

        # Downsample columns (encoder side of the grid).
        self.downsample_model_10 = DownsampleBlock(64, 128)
        self.downsample_model_20 = DownsampleBlock(128, 192)

        self.downsample_model_11 = DownsampleBlock(64, 128)
        self.downsample_model_21 = DownsampleBlock(128, 192)

        # self.downsample_model_12=DownsampleBlock(64, 128)
        # self.downsample_model_22=DownsampleBlock(128, 192)

        #

        # self.upsample_model_03=UpsampleBlock(128, 64)
        # self.upsample_model_13=UpsampleBlock(192, 128)

        # Upsample columns (decoder side of the grid).
        self.upsample_model_04 = UpsampleBlock(128, 64)
        self.upsample_model_14 = UpsampleBlock(192, 128)

        self.upsample_model_05 = UpsampleBlock(128, 64)
        self.upsample_model_15 = UpsampleBlock(192, 128)

    def forward(self, x, x1, x2, x3):
        # Grid node naming: X<row><col>, row = resolution level (0 finest),
        # col = processing stage.
        X00 = self.residual_model_head0(x) + self.residual_model_head1(
            x1
        )  # --- 182 ~ 185
        # X10 = self.residual_model_head1(x1)

        X01 = self.residual_model_01(X00) + X00  # --- 208 ~ 211 ,AddBackward1213

        X10 = self.downsample_model_10(X00) + self.residual_model_head2(
            x2
        )  # --- 186 ~ 189
        X20 = self.downsample_model_20(X10) + self.residual_model_head3(
            x3
        )  # --- 190 ~ 193

        residual_11 = (
            self.residual_model_11(X10) + X10
        )  # 201 ~ 204 , sum AddBackward1206
        downsample_11 = self.downsample_model_11(X01)  # 214 ~ 217
        X11 = residual_11 + downsample_11  # --- AddBackward1218

        residual_21 = (
            self.residual_model_21(X20) + X20
        )  # 194 ~ 197 , sum AddBackward1199
        downsample_21 = self.downsample_model_21(X11)  # 219 ~ 222
        X21 = residual_21 + downsample_21  # AddBackward1223

        X24 = self.residual_model_24(X21) + X21  # --- 224 ~ 227 , AddBackward1229
        X25 = self.residual_model_25(X24) + X24  # --- 230 ~ 233 , AddBackward1235

        upsample_14 = self.upsample_model_14(X24)  # 242 ~ 246
        residual_14 = self.residual_model_14(X11) + X11  # 248 ~ 251, AddBackward1253
        X14 = upsample_14 + residual_14  # --- AddBackward1254

        upsample_04 = self.upsample_model_04(X14)  # 268 ~ 272
        residual_04 = self.residual_model_04(X01) + X01  # 274 ~ 277, AddBackward1279
        X04 = upsample_04 + residual_04  # --- AddBackward1280

        upsample_15 = self.upsample_model_15(X25)  # 236 ~ 240
        residual_15 = self.residual_model_15(X14) + X14  # 255 ~ 258, AddBackward1260
        X15 = upsample_15 + residual_15  # AddBackward1261

        upsample_05 = self.upsample_model_05(X15)  # 262 ~ 266
        residual_05 = self.residual_model_05(X04) + X04  # 281 ~ 284,AddBackward1286
        X05 = upsample_05 + residual_05  # AddBackward1287

        # Final pixel-shuffle head brings the 64-channel row-0 features
        # back to full resolution RGB.
        X_tail = self.residual_model_tail(X05)  # 288 ~ 291

        return X_tail
    # end
1689
+
1690
+
1691
class Model:
    """GMFSS Fortuna "union" interpolation model (plain container, not an
    nn.Module).

    Combines GMFlow (bidirectional optical flow), RIFE's IFNet (a coarse
    intermediate-frame prior), MetricNet (splatting importance weights),
    FeatureNet (pyramid features) and GridNet (final fusion).  The
    softsplat-based forward warping happens at half input resolution.
    """

    def __init__(self):
        self.flownet = GMFlow()
        self.ifnet = IFNet(arch_ver="4.6")
        self.metricnet = MetricNet()
        self.feat_ext = FeatureNet()
        self.fusionnet = GridNet()
        # Model-format version tag.
        self.version = 3.9

    def eval(self):
        """Put every sub-network into eval mode."""
        self.flownet.eval()
        self.ifnet.eval()
        self.metricnet.eval()
        self.feat_ext.eval()
        self.fusionnet.eval()

    def device(self):
        """Move every sub-network to the target device.

        NOTE(review): relies on a module-level ``device`` defined elsewhere
        in this file — confirm it is set before calling.
        """
        self.flownet.to(device)
        self.ifnet.to(device)
        self.metricnet.to(device)
        self.feat_ext.to(device)
        self.fusionnet.to(device)

    def load_model(self, path_dict):
        """Load all checkpoints from a dict of file paths (see keys below)."""
        #models/rife46.pth
        self.ifnet.load_state_dict(torch.load(path_dict["ifnet"]))
        #models/GMFSS_fortuna_flownet.pkl
        self.flownet.load_state_dict(torch.load(path_dict["flownet"]))
        #models/GMFSS_fortuna_union_metric.pkl
        self.metricnet.load_state_dict(torch.load(path_dict["metricnet"]))
        #models/GMFSS_fortuna_union_feat.pkl
        self.feat_ext.load_state_dict(torch.load(path_dict["feat_ext"]))
        #models/GMFSS_fortuna_union_fusionnet.pkl
        self.fusionnet.load_state_dict(torch.load(path_dict["fusionnet"]))

    def reuse(self, img0, img1, scale):
        """Precompute everything timestep-independent for a frame pair.

        Returns bidirectional flows, metrics and both feature pyramids so
        ``inference`` can be called repeatedly with different timesteps.
        ``scale`` optionally resizes frames for the flow computation only;
        the resulting flow is rescaled back.
        """
        feat11, feat12, feat13 = self.feat_ext(img0)
        feat21, feat22, feat23 = self.feat_ext(img1)

        # Flow/metric run at half resolution to save memory.
        img0 = F.interpolate(
            img0, scale_factor=0.5, mode="bilinear", align_corners=False
        )
        img1 = F.interpolate(
            img1, scale_factor=0.5, mode="bilinear", align_corners=False
        )

        if scale != 1.0:
            imgf0 = F.interpolate(
                img0, scale_factor=scale, mode="bilinear", align_corners=False
            )
            imgf1 = F.interpolate(
                img1, scale_factor=scale, mode="bilinear", align_corners=False
            )
        else:
            imgf0 = img0
            imgf1 = img1
        flow01 = self.flownet(imgf0, imgf1, return_flow=True)
        flow10 = self.flownet(imgf1, imgf0, return_flow=True)
        if scale != 1.0:
            # Undo the scale: both the resolution and the flow magnitude.
            flow01 = (
                F.interpolate(
                    flow01,
                    scale_factor=1.0 / scale,
                    mode="bilinear",
                    align_corners=False,
                )
                / scale
            )
            flow10 = (
                F.interpolate(
                    flow10,
                    scale_factor=1.0 / scale,
                    mode="bilinear",
                    align_corners=False,
                )
                / scale
            )

        metric0, metric1 = self.metricnet(img0, img1, flow01, flow10)

        return (
            flow01,
            flow10,
            metric0,
            metric1,
            feat11,
            feat12,
            feat13,
            feat21,
            feat22,
            feat23,
        )

    def inference(
        self,
        img0,
        img1,
        flow01,
        flow10,
        metric0,
        metric1,
        feat11,
        feat12,
        feat13,
        feat21,
        feat22,
        feat23,
        timestep,
    ):
        """Synthesize the frame at ``timestep`` in (0, 1) from the
        precomputed state returned by ``reuse``.  Returns RGB in [0, 1]."""
        # Scale flows/metrics linearly toward the target time.
        F1t = timestep * flow01
        F2t = (1 - timestep) * flow10

        Z1t = timestep * metric0
        Z2t = (1 - timestep) * metric1

        # Forward-splat both frames to time t (at half resolution).
        img0 = F.interpolate(
            img0, scale_factor=0.5, mode="bilinear", align_corners=False
        )
        I1t = softsplat(img0, F1t, Z1t, strMode="soft")
        img1 = F.interpolate(
            img1, scale_factor=0.5, mode="bilinear", align_corners=False
        )
        I2t = softsplat(img1, F2t, Z2t, strMode="soft")

        # RIFE prior for the same intermediate frame ("union" variant).
        rife = self.ifnet(img0, img1, timestep, scale_list=[8, 4, 2, 1])

        # Splat each pyramid level with correspondingly rescaled flow.
        feat1t1 = softsplat(feat11, F1t, Z1t, strMode="soft")
        feat2t1 = softsplat(feat21, F2t, Z2t, strMode="soft")

        F1td = (
            F.interpolate(F1t, scale_factor=0.5, mode="bilinear", align_corners=False)
            * 0.5
        )
        Z1d = F.interpolate(Z1t, scale_factor=0.5, mode="bilinear", align_corners=False)
        feat1t2 = softsplat(feat12, F1td, Z1d, strMode="soft")
        F2td = (
            F.interpolate(F2t, scale_factor=0.5, mode="bilinear", align_corners=False)
            * 0.5
        )
        Z2d = F.interpolate(Z2t, scale_factor=0.5, mode="bilinear", align_corners=False)
        feat2t2 = softsplat(feat22, F2td, Z2d, strMode="soft")

        F1tdd = (
            F.interpolate(F1t, scale_factor=0.25, mode="bilinear", align_corners=False)
            * 0.25
        )
        Z1dd = F.interpolate(
            Z1t, scale_factor=0.25, mode="bilinear", align_corners=False
        )
        feat1t3 = softsplat(feat13, F1tdd, Z1dd, strMode="soft")
        F2tdd = (
            F.interpolate(F2t, scale_factor=0.25, mode="bilinear", align_corners=False)
            * 0.25
        )
        Z2dd = F.interpolate(
            Z2t, scale_factor=0.25, mode="bilinear", align_corners=False
        )
        feat2t3 = softsplat(feat23, F2tdd, Z2dd, strMode="soft")

        # Fuse warped frames + RIFE prior + warped feature pyramids.
        out = self.fusionnet(
            torch.cat([I1t, rife, I2t], dim=1),
            torch.cat([feat1t1, feat2t1], dim=1),
            torch.cat([feat1t2, feat2t2], dim=1),
            torch.cat([feat1t3, feat2t3], dim=1),
        )

        return torch.clamp(out, 0, 1)
vfi_models/gmfss_fortuna/__init__.py ADDED
@@ -0,0 +1,143 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import pathlib
2
+ from vfi_utils import load_file_from_github_release, preprocess_frames, postprocess_frames, generic_frame_loop, InterpolationStateList
3
+ import typing
4
+ import torch
5
+ import torch.nn as nn
6
+ import torch.nn.functional as F
7
+ from comfy.model_management import get_torch_device
8
+
9
+
10
# Model-type key derived from this package's directory name
# ("gmfss_fortuna"); used as the release tag for checkpoint downloads.
GLOBAL_MODEL_TYPE = pathlib.Path(__file__).parent.name
# Maps each selectable checkpoint bundle to its component weights as
# (release_tag, filename) pairs consumed by load_file_from_github_release.
# The "union" bundle additionally needs RIFE's IFNet weights.
CKPTS_PATH_CONFIG = {
    "GMFSS_fortuna_union": {
        "ifnet": ("rife", "rife46.pth"),
        "flownet": (GLOBAL_MODEL_TYPE, "GMFSS_fortuna_flownet.pkl"),
        "metricnet": (GLOBAL_MODEL_TYPE, "GMFSS_fortuna_union_metric.pkl"),
        "feat_ext": (GLOBAL_MODEL_TYPE, "GMFSS_fortuna_union_feat.pkl"),
        "fusionnet": (GLOBAL_MODEL_TYPE, "GMFSS_fortuna_union_fusionnet.pkl")
    },
    "GMFSS_fortuna": {
        "flownet": (GLOBAL_MODEL_TYPE, "GMFSS_fortuna_flownet.pkl"),
        "metricnet": (GLOBAL_MODEL_TYPE, "GMFSS_fortuna_metric.pkl"),
        "feat_ext": (GLOBAL_MODEL_TYPE, "GMFSS_fortuna_feat.pkl"),
        "fusionnet": (GLOBAL_MODEL_TYPE, "GMFSS_fortuna_fusionnet.pkl")
    }
}
26
+
27
class CommonModelInference(nn.Module):
    """Loads a GMFSS Fortuna (or Fortuna-union) model and exposes a single
    padded forward pass suitable for arbitrary input resolutions."""

    def __init__(self, model_type):
        super(CommonModelInference, self).__init__()
        from .GMFSS_Fortuna_arch import Model as GMFSS
        from .GMFSS_Fortuna_union_arch import Model as GMFSS_Union
        # "union" bundles use the RIFE-assisted architecture.
        self.model = GMFSS_Union() if "union" in model_type else GMFSS()
        self.model.eval()
        self.model.device()
        _model_path_config = CKPTS_PATH_CONFIG[model_type]
        # Download (or reuse cached) checkpoint files, then load them.
        self.model.load_model({
            key: load_file_from_github_release(*_model_path_config[key])
            for key in _model_path_config
        })

    def forward(self, I0, I1, timestep, scale=1.0):
        """Interpolate between frames I0 and I1 at ``timestep`` in (0, 1).

        Frames are padded so H and W are multiples of 64 (adjusted by
        ``scale``) because the flow network downsamples aggressively; the
        output is cropped back to the original size.
        """
        n, c, h, w = I0.shape
        tmp = max(64, int(64 / scale))
        ph = ((h - 1) // tmp + 1) * tmp
        pw = ((w - 1) // tmp + 1) * tmp
        padding = (0, pw - w, 0, ph - h)
        I0 = F.pad(I0, padding)
        I1 = F.pad(I1, padding)
        # Timestep-independent state (flows, metrics, feature pyramids).
        (
            flow01,
            flow10,
            metric0,
            metric1,
            feat11,
            feat12,
            feat13,
            feat21,
            feat22,
            feat23,
        ) = self.model.reuse(I0, I1, scale)

        output = self.model.inference(
            I0,
            I1,
            flow01,
            flow10,
            metric0,
            metric1,
            feat11,
            feat12,
            feat13,
            feat21,
            feat22,
            feat23,
            timestep,
        )
        # Crop the padding back off.
        return output[:, :, :h, :w]
78
+
79
class GMFSS_Fortuna_VFI:
    """ComfyUI node: video frame interpolation with GMFSS Fortuna."""

    @classmethod
    def INPUT_TYPES(s):
        # ComfyUI node-interface declaration.
        return {
            "required": {
                "ckpt_name": (list(CKPTS_PATH_CONFIG.keys()), ),
                "frames": ("IMAGE", ),
                "clear_cache_after_n_frames": ("INT", {"default": 10, "min": 1, "max": 1000}),
                "multiplier": ("INT", {"default": 2, "min": 2, "max": 1000}),
            },
            "optional": {
                "optional_interpolation_states": ("INTERPOLATION_STATES", )
            }
        }

    RETURN_TYPES = ("IMAGE", )
    FUNCTION = "vfi"
    CATEGORY = "ComfyUI-Frame-Interpolation/VFI"

    def vfi(
        self,
        ckpt_name: typing.AnyStr,
        frames: torch.Tensor,
        clear_cache_after_n_frames = 10,
        multiplier: typing.SupportsInt = 2,
        optional_interpolation_states: InterpolationStateList = None,
        **kwargs
    ):
        """
        Perform video frame interpolation using a given checkpoint model.

        Args:
            ckpt_name (str): The name of the checkpoint model to use.
            frames (torch.Tensor): A tensor containing input video frames.
            clear_cache_after_n_frames (int, optional): The number of frames to process before clearing CUDA cache
                to prevent memory overflow. Defaults to 10. Lower numbers are safer but mean more processing time.
                How high you should set it depends on how many input frames there are, input resolution (after upscaling),
                how many times you want to multiply them, and how long you're willing to wait for the process to complete.
            multiplier (int, optional): The multiplier for each input frame. 60 input frames * 2 = 120 output frames. Defaults to 2.

        Returns:
            tuple: A tuple containing the output interpolated frames.

        Note:
            This method interpolates frames in a video sequence using a specified checkpoint model.
            It processes each frame sequentially, generating interpolated frames between them.

            To prevent memory overflow, it clears the CUDA cache after processing a specified number of frames.
        """

        interpolation_model = CommonModelInference(model_type=ckpt_name)
        interpolation_model.eval().to(get_torch_device())
        frames = preprocess_frames(frames)

        # Callback invoked by generic_frame_loop for each in-between frame.
        def return_middle_frame(frame_0, frame_1, timestep, model, scale):
            return model(frame_0, frame_1, timestep, scale)

        # Flow is always computed at native (half-res internal) scale here.
        scale = 1

        args = [interpolation_model, scale]
        out = postprocess_frames(
            generic_frame_loop(type(self).__name__, frames, clear_cache_after_n_frames, multiplier, return_middle_frame, *args,
                               interpolation_states=optional_interpolation_states, dtype=torch.float32)
        )
        return (out,)
vfi_models/ifrnet/IFRNet_L_arch.py ADDED
@@ -0,0 +1,293 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # https://github.com/ltkong218/IFRNet/blob/main/models/IFRNet_L.py
2
+ # https://github.com/ltkong218/IFRNet/blob/main/utils.py
3
+ import torch
4
+ import torch.nn as nn
5
+ import torch.nn.functional as F
6
+ from comfy.model_management import get_torch_device
7
+
8
+
9
def warp(img, flow):
    """Backward-warp ``img`` by a per-pixel displacement field.

    Args:
        img: (B, C, H, W) source tensor to sample from.
        flow: (B, 2, H, W) displacement in pixels; channel 0 is x, channel 1 is y.

    Returns:
        (B, C, H, W) tensor bilinearly sampled from ``img`` at base_grid + flow,
        with out-of-bounds samples clamped to the border.
    """
    batch, _, height, width = flow.shape
    # Normalized identity grid in [-1, 1], shaped (B, 2, H, W): x row, then y row.
    xs = torch.linspace(-1.0, 1.0, width).view(1, 1, 1, width).expand(batch, -1, height, -1)
    ys = torch.linspace(-1.0, 1.0, height).view(1, 1, height, 1).expand(batch, -1, -1, width)
    base_grid = torch.cat((xs, ys), dim=1).to(img)
    # Convert the pixel-space flow into grid_sample's normalized coordinates
    # (align_corners=True convention: half-range is (size - 1) / 2 pixels).
    norm_flow = torch.cat(
        (
            flow[:, 0:1, :, :] / ((width - 1.0) / 2.0),
            flow[:, 1:2, :, :] / ((height - 1.0) / 2.0),
        ),
        dim=1,
    )
    sample_grid = (base_grid + norm_flow).permute(0, 2, 3, 1)
    return F.grid_sample(
        input=img,
        grid=sample_grid,
        mode="bilinear",
        padding_mode="border",
        align_corners=True,
    )
30
+
31
+
32
def get_robust_weight(flow_pred, flow_gt, beta):
    """Per-pixel confidence weight exp(-beta * EPE) for a predicted flow.

    EPE is the end-point error (Euclidean distance between prediction and
    ground truth over the 2 flow channels). The prediction is detached so
    no gradient flows through the weighting.
    """
    end_point_error = torch.sqrt(
        ((flow_pred.detach() - flow_gt) ** 2).sum(dim=1, keepdim=True)
    )
    return torch.exp(-beta * end_point_error)
36
+
37
+
38
def resize(x, scale_factor):
    """Bilinearly rescale a (B, C, H, W) tensor by ``scale_factor``."""
    rescaled = F.interpolate(
        x, mode="bilinear", align_corners=False, scale_factor=scale_factor
    )
    return rescaled
42
+
43
+
44
def convrelu(
    in_channels,
    out_channels,
    kernel_size=3,
    stride=1,
    padding=1,
    dilation=1,
    groups=1,
    bias=True,
):
    """Build a Conv2d followed by a channel-wise PReLU as one Sequential."""
    conv = nn.Conv2d(
        in_channels,
        out_channels,
        kernel_size,
        stride,
        padding,
        dilation,
        groups,
        bias=bias,
    )
    return nn.Sequential(conv, nn.PReLU(out_channels))
67
+
68
+
69
class ResBlock(nn.Module):
    """Residual block with a dedicated "side" channel group.

    The last ``side_channels`` channels receive two extra side-only conv
    stages, written back in place between the full-width stages; the block
    ends with a residual connection and a PReLU.
    """

    def __init__(self, in_channels, side_channels, bias=True):
        super(ResBlock, self).__init__()
        self.side_channels = side_channels
        # NOTE: layer creation order matches the reference implementation so
        # parameter init (RNG consumption) and state_dict keys are identical.
        self.conv1 = self._stage(in_channels, bias)
        self.conv2 = self._stage(side_channels, bias)
        self.conv3 = self._stage(in_channels, bias)
        self.conv4 = self._stage(side_channels, bias)
        self.conv5 = nn.Conv2d(
            in_channels, in_channels, kernel_size=3, stride=1, padding=1, bias=bias
        )
        self.prelu = nn.PReLU(in_channels)

    @staticmethod
    def _stage(channels, bias):
        # 3x3 same-resolution conv followed by a channel-wise PReLU.
        return nn.Sequential(
            nn.Conv2d(channels, channels, kernel_size=3, stride=1, padding=1, bias=bias),
            nn.PReLU(channels),
        )

    def forward(self, x):
        side = self.side_channels
        feat = self.conv1(x)
        # Refine only the trailing side channels, in place.
        feat[:, -side:, :, :] = self.conv2(feat[:, -side:, :, :])
        feat = self.conv3(feat)
        feat[:, -side:, :, :] = self.conv4(feat[:, -side:, :, :])
        return self.prelu(x + self.conv5(feat))
123
+
124
+
125
class Encoder(nn.Module):
    """Four-level shared feature pyramid.

    Each level is two convrelu stages; the first stage halves the spatial
    resolution (stride 2). Channel widths: 64, 96, 144, 192.
    """

    # (in_channels, out_channels, kernel, stride, padding) of each level's
    # downsampling stage; level 1 uses a 7x7 stem, the rest 3x3.
    _LEVELS = ((3, 64, 7, 2, 3), (64, 96, 3, 2, 1), (96, 144, 3, 2, 1), (144, 192, 3, 2, 1))

    def __init__(self):
        super(Encoder, self).__init__()
        for idx, (cin, cout, k, s, p) in enumerate(self._LEVELS, start=1):
            setattr(
                self,
                "pyramid{}".format(idx),
                nn.Sequential(convrelu(cin, cout, k, s, p), convrelu(cout, cout, 3, 1, 1)),
            )

    def forward(self, img):
        features = []
        out = img
        for idx in range(1, 5):
            out = getattr(self, "pyramid{}".format(idx))(out)
            features.append(out)
        # (f1, f2, f3, f4) from finest to coarsest.
        return tuple(features)
147
+
148
+
149
class Decoder4(nn.Module):
    """Coarsest decoder level: fuses both frames' level-4 features plus the
    scalar time embedding, and upsamples 2x via a transposed conv.

    Output has 148 channels: 2 + 2 bilateral flow channels and 144 features
    for the next level.
    """

    def __init__(self):
        super(Decoder4, self).__init__()
        self.convblock = nn.Sequential(
            convrelu(384 + 1, 384),
            ResBlock(384, 64),
            nn.ConvTranspose2d(384, 148, 4, 2, 1, bias=True),
        )

    def forward(self, f0, f1, embt):
        _, _, h, w = f0.shape
        # Broadcast the (B, 1, 1, 1) time embedding over the spatial grid.
        time_map = embt.repeat(1, 1, h, w)
        return self.convblock(torch.cat((f0, f1, time_map), dim=1))
164
+
165
+
166
class Decoder3(nn.Module):
    """Level-3 decoder: refines flows/features at 1/8 scale and upsamples 2x.

    Input: 144 intermediate + 2x144 warped features + 2x2 flows = 436 ch.
    Output: 100 channels (4 flow + 96 features).
    """

    def __init__(self):
        super(Decoder3, self).__init__()
        self.convblock = nn.Sequential(
            convrelu(436, 432),
            ResBlock(432, 64),
            nn.ConvTranspose2d(432, 100, 4, 2, 1, bias=True),
        )

    def forward(self, ft_, f0, f1, up_flow0, up_flow1):
        # Align both frames' features to time t with the current flow estimates.
        warped0 = warp(f0, up_flow0)
        warped1 = warp(f1, up_flow1)
        return self.convblock(torch.cat((ft_, warped0, warped1, up_flow0, up_flow1), dim=1))
181
+
182
+
183
class Decoder2(nn.Module):
    """Level-2 decoder: refines flows/features at 1/4 scale and upsamples 2x.

    Input: 96 intermediate + 2x96 warped features + 2x2 flows = 292 ch.
    Output: 68 channels (4 flow + 64 features).
    """

    def __init__(self):
        super(Decoder2, self).__init__()
        self.convblock = nn.Sequential(
            convrelu(292, 288),
            ResBlock(288, 64),
            nn.ConvTranspose2d(288, 68, 4, 2, 1, bias=True),
        )

    def forward(self, ft_, f0, f1, up_flow0, up_flow1):
        # Align both frames' features to time t with the current flow estimates.
        warped0 = warp(f0, up_flow0)
        warped1 = warp(f1, up_flow1)
        return self.convblock(torch.cat((ft_, warped0, warped1, up_flow0, up_flow1), dim=1))
198
+
199
+
200
class Decoder1(nn.Module):
    """Finest decoder level at 1/2 scale, upsampling 2x to full resolution.

    Input: 64 intermediate + 2x64 warped features + 2x2 flows = 196 ch.
    Output: 8 channels (4 flow + 1 fusion mask + 3 RGB residual).
    """

    def __init__(self):
        super(Decoder1, self).__init__()
        self.convblock = nn.Sequential(
            convrelu(196, 192),
            ResBlock(192, 64),
            nn.ConvTranspose2d(192, 8, 4, 2, 1, bias=True),
        )

    def forward(self, ft_, f0, f1, up_flow0, up_flow1):
        # Align both frames' features to time t with the current flow estimates.
        warped0 = warp(f0, up_flow0)
        warped1 = warp(f1, up_flow1)
        return self.convblock(torch.cat((ft_, warped0, warped1, up_flow0, up_flow1), dim=1))
215
+
216
+
217
class IRFNet_L(nn.Module):
    """IFRNet-L: single-pass intermediate frame synthesis.

    Both input frames are encoded into a shared 4-level feature pyramid,
    then decoded coarse-to-fine. Each decoder level refines a pair of
    bilateral flows (t -> 0 and t -> 1); the finest level also predicts a
    fusion mask and an RGB residual. The output frame is the mask-blended
    pair of backward-warped inputs plus that residual.
    """

    def __init__(self):
        super(IRFNet_L, self).__init__()
        self.encoder = Encoder()
        self.decoder4 = Decoder4()
        self.decoder3 = Decoder3()
        self.decoder2 = Decoder2()
        self.decoder1 = Decoder1()

    def forward(self, img0, img1, scale_factor=1.0, timestep=0.5):
        """Synthesize the frame at ``timestep`` between ``img0`` and ``img1``.

        Args:
            img0: (N, 3, H, W) first frame, values in [0, 1].
            img1: (N, 3, H, W) second frame, same shape/dtype/device as img0.
            scale_factor (float): optional rescale applied before the pyramid;
                flows are resized and rescaled back afterwards.
            timestep (float): temporal position of the output frame in (0, 1).

        Returns:
            (N, 3, H, W) predicted intermediate frame, clamped to [0, 1].
        """
        n, _, h, w = img0.shape

        # Pad H and W up to the next multiple of 64 (four stride-2 encoder
        # levels plus headroom for scale_factor); cropped back at the end.
        ph = ((h - 1) // 64 + 1) * 64
        pw = ((w - 1) // 64 + 1) * 64
        padding = (0, pw - w, 0, ph - h)
        img0 = F.pad(img0, padding)
        img1 = F.pad(img1, padding)

        # Per-sample scalar time embedding (supports batches). Built directly
        # on the inputs' device and dtype: the previous code pinned it to
        # get_torch_device() and only special-cased fp16 via a "HalfTensor"
        # string check, which broke when the inputs lived on another device
        # or used bfloat16 (mixed-dtype torch.cat in Decoder4).
        embt = torch.full(
            (n, 1, 1, 1), float(timestep), dtype=img0.dtype, device=img0.device
        )

        # Zero-center both frames with their joint mean; restored at the merge.
        mean_ = (
            torch.cat([img0, img1], 2)
            .mean(1, keepdim=True)
            .mean(2, keepdim=True)
            .mean(3, keepdim=True)
        )
        img0 = img0 - mean_
        img1 = img1 - mean_

        img0_ = resize(img0, scale_factor=scale_factor)
        img1_ = resize(img1, scale_factor=scale_factor)

        f0_1, f0_2, f0_3, f0_4 = self.encoder(img0_)
        f1_1, f1_2, f1_3, f1_4 = self.encoder(img1_)

        # Coarse-to-fine decoding. Each level emits [flow0, flow1, features];
        # flows carried up from the previous level are doubled because flow
        # magnitudes are in pixels and the resolution doubles per level.
        out4 = self.decoder4(f0_4, f1_4, embt)
        up_flow0_4 = out4[:, 0:2]
        up_flow1_4 = out4[:, 2:4]
        ft_3_ = out4[:, 4:]

        out3 = self.decoder3(ft_3_, f0_3, f1_3, up_flow0_4, up_flow1_4)
        up_flow0_3 = out3[:, 0:2] + 2.0 * resize(up_flow0_4, scale_factor=2.0)
        up_flow1_3 = out3[:, 2:4] + 2.0 * resize(up_flow1_4, scale_factor=2.0)
        ft_2_ = out3[:, 4:]

        out2 = self.decoder2(ft_2_, f0_2, f1_2, up_flow0_3, up_flow1_3)
        up_flow0_2 = out2[:, 0:2] + 2.0 * resize(up_flow0_3, scale_factor=2.0)
        up_flow1_2 = out2[:, 2:4] + 2.0 * resize(up_flow1_3, scale_factor=2.0)
        ft_1_ = out2[:, 4:]

        out1 = self.decoder1(ft_1_, f0_1, f1_1, up_flow0_2, up_flow1_2)
        up_flow0_1 = out1[:, 0:2] + 2.0 * resize(up_flow0_2, scale_factor=2.0)
        up_flow1_1 = out1[:, 2:4] + 2.0 * resize(up_flow1_2, scale_factor=2.0)
        up_mask_1 = torch.sigmoid(out1[:, 4:5])  # fusion mask in [0, 1]
        up_res_1 = out1[:, 5:]  # RGB residual refinement

        # Undo the pre-pyramid scaling: resize the flow fields back and
        # rescale their magnitudes (flow vectors are expressed in pixels).
        up_flow0_1 = resize(up_flow0_1, scale_factor=(1.0 / scale_factor)) * (
            1.0 / scale_factor
        )
        up_flow1_1 = resize(up_flow1_1, scale_factor=(1.0 / scale_factor)) * (
            1.0 / scale_factor
        )
        up_mask_1 = resize(up_mask_1, scale_factor=(1.0 / scale_factor))
        up_res_1 = resize(up_res_1, scale_factor=(1.0 / scale_factor))

        # Blend the two warped frames, restore the mean, add the residual.
        img0_warp = warp(img0, up_flow0_1)
        img1_warp = warp(img1, up_flow1_1)
        imgt_merge = up_mask_1 * img0_warp + (1 - up_mask_1) * img1_warp + mean_
        imgt_pred = torch.clamp(imgt_merge + up_res_1, 0, 1)
        return imgt_pred[:, :, :h, :w]